commit 10a4f0dd780c4f1854ee74287a1eead230f3ba64 Author: lhenry15 Date: Tue Sep 8 01:19:53 2020 -0500 first commit Former-commit-id: 08bc23ba02cffbce3cf63962390a65459a132e48 [formerly 0795edd4834b9b7dc66db8d10d4cbaf42bbf82cb] [formerly b5010b42541add7e2ea2578bf2da537efc457757 [formerly a7ca09c2c34c4fc8b3d8e01fcfa08eeeb2cae99d]] [formerly 615058473a2177ca5b89e9edbb797f4c2a59c7e5 [formerly 743d8dfc6843c4c205051a8ab309fbb2116c895e] [formerly bb0ea98b1e14154ef464e2f7a16738705894e54b [formerly 960a69da74b81ef8093820e003f2d6c59a34974c]]] [formerly 2fa3be52c1b44665bc81a7cc7d4cea4bbf0d91d5 [formerly 2054589f0898627e0a17132fd9d4cc78efc91867] [formerly 3b53730e8a895e803dfdd6ca72bc05e17a4164c1 [formerly 8a2fa8ab7baf6686d21af1f322df46fd58c60e69]] [formerly 87d1e3a07a19d03c7d7c94d93ab4fa9f58dada7c [formerly f331916385a5afac1234854ee8d7f160f34b668f] [formerly 69fb3c78a483343f5071da4f7e2891b83a49dd18 [formerly 386086f05aa9487f65bce2ee54438acbdce57650]]]] Former-commit-id: a00aed8c934a6460c4d9ac902b9a74a3d6864697 [formerly 26fdeca29c2f07916d837883983ca2982056c78e] [formerly 0e3170d41a2f99ecf5c918183d361d4399d793bf [formerly 3c12ad4c88ac5192e0f5606ac0d88dd5bf8602dc]] [formerly d5894f84f2fd2e77a6913efdc5ae388cf1be0495 [formerly ad3e7bc670ff92c992730d29c9d3aa1598d844e8] [formerly 69fb3c78a483343f5071da4f7e2891b83a49dd18]] Former-commit-id: 3c19c9fae64f6106415fbc948a4dc613b9ee12f8 [formerly 467ddc0549c74bb007e8f01773bb6dc9103b417d] [formerly 5fa518345d958e2760e443b366883295de6d991c [formerly 3530e130b9fdb7280f638dbc2e785d2165ba82aa]] Former-commit-id: 9f5d473d42a435ec0d60149939d09be1acc25d92 [formerly be0b25c4ec2cde052a041baf0e11f774a158105d] Former-commit-id: 9eca71cb73ba9edccd70ac06a3b636b8d4093b04 diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..ac9a2f2 --- /dev/null +++ b/.gitignore @@ -0,0 +1,116 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ +tests/.asv + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ +docs/d3m.rst +docs/d3m.*.rst + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mypy +.mypy_cache/ + +# site +public/ + +.idea/ +tmp/ + +*.swp +results.csv +pipeline.yml +pipeline_run.yml +example_pipeline.json +.DS_Store +tmp.txt diff --git a/README.md b/README.md new file mode 100644 index 0000000..2dc5c2f --- /dev/null +++ b/README.md @@ -0,0 +1,143 @@ +# TODS +This is a time-seried outlier detection system. 
+
+## Axolotl
+Running a pre-defined pipeline:
+```
+python examples/build_AutoEncoder_pipeline.py
+python examples/run_predefined_pipeline.py
+```
+
+## Installation
+
+This package works with **Python 3.6** and pip 19+. You need to have the following packages installed on the system (for Debian/Ubuntu):
+```
+sudo apt-get install libssl-dev libcurl4-openssl-dev libyaml-dev build-essential libopenblas-dev libcap-dev ffmpeg
+```
+
+Then run the script `install.sh`. The script will install the d3m core package with:
+```
+cd d3m
+pip3 install -e .
+cd ..
+```
+Then it installs the common primitives (which are used in the running examples):
+```
+cd common-primitives
+pip3 install -e .
+cd ..
+```
+It installs the sklearn wrapper with:
+```
+cd sklearn-wrap
+pip3 install -r requirements.txt
+pip3 install -e .
+cd ..
+```
+Finally, it installs our anomaly primitives with:
+```
+cd anomaly-primitives
+pip3 install -r requirements.txt
+pip3 install -e .
+cd ..
+```
+
+There may be missing dependencies that are not listed above. If you hit any, please try to resolve them yourself.
+
+# Dataset
+Datasets are located in `datasets/anomaly`. `raw_data` contains the raw time series data. `transform.py` is a script that transforms the raw data into D3M format. `template` includes some templates for generating D3M data. If you run `transform.py`, the script will load the raw `kpi` data and create a folder named `kpi` in D3M format.
+
+The generated csv file has the following columns: `d3mIndex`, `timestamp`, `value`, `ground_truth`. In the example kpi dataset there is only one value column; other datasets may have multiple value columns. The goal of the pipeline is to predict `ground_truth` based on `timestamp` and the value column(s).
+
+There is a script to check whether a dataset is in the right format. Run
+```
+python3 datasets/validate.py datasets/anomaly/kpi/
+```
+The expected output is as follows:
+```
+Validating problem '/home/grads/d/daochen/tods/tods/datasets/anomaly/kpi/SCORE/problem_TEST/problemDoc.json'.
+Validating dataset '/home/grads/d/daochen/tods/tods/datasets/anomaly/kpi/SCORE/dataset_TEST/datasetDoc.json'.
+Validating problem '/home/grads/d/daochen/tods/tods/datasets/anomaly/kpi/kpi_problem/problemDoc.json'.
+Validating problem '/home/grads/d/daochen/tods/tods/datasets/anomaly/kpi/TEST/problem_TEST/problemDoc.json'.
+Validating dataset '/home/grads/d/daochen/tods/tods/datasets/anomaly/kpi/TEST/dataset_TEST/datasetDoc.json'.
+Validating dataset '/home/grads/d/daochen/tods/tods/datasets/anomaly/kpi/kpi_dataset/datasetDoc.json'.
+Validating dataset '/home/grads/d/daochen/tods/tods/datasets/anomaly/kpi/TRAIN/dataset_TRAIN/datasetDoc.json'.
+Validating problem '/home/grads/d/daochen/tods/tods/datasets/anomaly/kpi/TRAIN/problem_TRAIN/problemDoc.json'.
+Validating all datasets and problems.
+There are no errors.
+```
+You can also create other datasets with `transform.py`, but for now we focus on this example dataset, since the other datasets follow the same format.
+
+# Example
+In D3M, our goal is to provide a **solution** to a **problem** on a **dataset**. Here, a solution is a pipeline that consists of data processing, classifiers, etc.
+
+Run the example to build the first pipeline with
+```
+python3 examples/build_iforest_pipline.py
+```
+Note that we have not implemented iForest yet; this one is actually a Random Forest. The script generates a file `pipeline.yml`, which describes the pipeline.
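+Under the hood, the build scripts assemble this pipeline description with the d3m `Pipeline`/`PrimitiveStep` API. The sketch below only illustrates the pattern: the concrete primitives and hyperparameters in `examples/build_iforest_pipline.py` differ, the anomaly-detection primitive path is a placeholder, and the final `to_yaml()` call is assumed from the d3m core package.
+```python
+# Minimal sketch of building a D3M pipeline description (pattern only).
+# The detector's python_path below is a placeholder -- substitute whatever
+# `python3 -m d3m index search` lists for your installed anomaly primitive.
+from d3m import index
+from d3m.metadata.base import ArgumentType
+from d3m.metadata.pipeline import Pipeline, PrimitiveStep
+
+pipeline_description = Pipeline()
+pipeline_description.add_input(name='inputs')
+
+# Step 0: convert the input Dataset container into a DataFrame.
+step_0 = PrimitiveStep(primitive=index.get_primitive(
+    'd3m.primitives.data_transformation.dataset_to_dataframe.Common'))
+step_0.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='inputs.0')
+step_0.add_output('produce')
+pipeline_description.add_step(step_0)
+
+# Step 1: the detector itself (placeholder path, see comment above).
+step_1 = PrimitiveStep(primitive=index.get_primitive(
+    'd3m.primitives.anomaly_detection.isolation_forest.Algorithm'))
+step_1.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce')
+step_1.add_output('produce')
+pipeline_description.add_step(step_1)
+
+# Expose the last step's predictions as the pipeline output.
+pipeline_description.add_output(name='output predictions', data_reference='steps.1.produce')
+
+# Serialize the description; to_yaml() is assumed from the d3m core package
+# (to_json_structure() is an alternative if you prefer JSON).
+with open('pipeline.yml', 'w') as f:
+    f.write(pipeline_description.to_yaml())
+```
+Each step consumes the `produce` output of an earlier step through its `data_reference`, and that chain of references is exactly what gets written into `pipeline.yml`.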
We can run the pipeline on the example data in this repo as follows:
+```
+python3 -m d3m runtime fit-produce -p pipeline.yml -r datasets/anomaly/kpi/TRAIN/problem_TRAIN/problemDoc.json -i datasets/anomaly/kpi/TRAIN/dataset_TRAIN/datasetDoc.json -t datasets/anomaly/kpi/TEST/dataset_TEST/datasetDoc.json -o results.csv -O pipeline_run.yml
+```
+Another example, on a subset of the sequences of the Yahoo dataset, is as follows:
+```
+python3 -m d3m runtime fit-produce -p pipeline.yml -r datasets/anomaly/yahoo_sub_5/TRAIN/problem_TRAIN/problemDoc.json -i datasets/anomaly/yahoo_sub_5/TRAIN/dataset_TRAIN/datasetDoc.json -t datasets/anomaly/yahoo_sub_5/TEST/dataset_TEST/datasetDoc.json -o results.csv -O pipeline_run.yml
+```
+The above commands will generate two files, `results.csv` and `pipeline_run.yml`.
+
+# How to add a new primitive
+
+For new primitives, put them in `anomaly-primitives/anomaly_primitives`. There is an example for isolation forest (however, it is essentially a RandomForest even though the name is IsolationForest; more work is needed to turn it into a real IsolationForest).
+
+In addition to adding the new file, you need to register the primitive in `anomaly-primitives/setup.py` and rerun the pip install.
+
+Use the following command to check whether your new primitives are registered:
+```
+python3 -m d3m index search
+```
+
+Test the new primitives:
+```
+python3 examples/build_iforest_pipline.py
+```
+
+# Template for meta-data in primitives
+
+* `__author__`: `DATA Lab at Texas A&M University`
+* `name`: A short, human-readable name for your primitive (a few words).
+* `python_path`: This path should have **5** segments. The first two segments should be `d3m.primitives`. The third segment should be `anomaly_detection`, `data_preprocessing` or `feature_construction` (it should match `primitive_family`). The fourth segment should be your algorithm name, e.g., `isolation_forest`. Note that this name should also be added to [this file](d3m/d3m/metadata/primitive_names.py). The last segment should be one of `Preprocessing`, `Feature`, `Algorithm` (for now).
+* `source`: `name` should be `DATA Lab at Texas A&M University`, `contact` should be `mailto:khlai037@tamu.edu`, and `uris` should contain `https://gitlab.com/lhenry15/tods.git` and the path to your .py file.
+* `algorithms_types`: Name the algorithm type yourself and add it to [here](d3m/d3m/metadata/schemas/v0/definitions.json#L1957). **Then reinstall d3m.** Fill this field with `metadata_base.PrimitiveAlgorithmType.YOUR_NAME`.
+* `primitive_family`: For preprocessing primitives, use `metadata_base.PrimitiveFamily.DATA_PREPROCESSING`. For feature analysis primitives, use `metadata_base.PrimitiveFamily.FEATURE_CONSTRUCTION`. For anomaly detection primitives, use `metadata_base.PrimitiveFamily.ANOMALY_DETECTION`.
+* `id`: Randomly generate one with `import uuid; uuid.uuid4()`.
+* `hyperparameters_to_tune`: Specify which hyperparameters of your primitive can be tuned.
+* `version`: `0.0.1`
+
+Notes:
+
+1. `installation` is not required; we removed it.
+
+2. Try reinstalling everything if it does not work.
+
+3. An example of the fake Isolation Forest is [here](anomaly-primitives/anomaly_primitives/SKIsolationForest.py#L294).
+
+
+## Resources of D3M
+
+If you still have questions, you may refer to the following resources.
+ +Dataset format [https://gitlab.com/datadrivendiscovery/data-supply](https://gitlab.com/datadrivendiscovery/data-supply) + +Instructions for creating primitives [https://docs.datadrivendiscovery.org/v2020.1.9/interfaces.html](https://docs.datadrivendiscovery.org/v2020.1.9/interfaces.html) + +We use a stable version of d3m core package at [https://gitlab.com/datadrivendiscovery/d3m/-/tree/v2020.1.9](https://gitlab.com/datadrivendiscovery/d3m/-/tree/v2020.1.9). + +The documentation is at [https://docs.datadrivendiscovery.org/](https://docs.datadrivendiscovery.org/). + +The core package documentation is at [https://docs.datadrivendiscovery.org/v2020.1.9/index.html](https://docs.datadrivendiscovery.org/v2020.1.9/index.html) + +The common-primitives is v0.8.0 at [https://gitlab.com/datadrivendiscovery/common-primitives/-/tree/v0.8.0/common_primitives](https://gitlab.com/datadrivendiscovery/common-primitives/-/tree/v0.8.0/common_primitives) + +The sklearn-wrap uses dist branch [https://gitlab.com/datadrivendiscovery/sklearn-wrap/-/tree/dist](https://gitlab.com/datadrivendiscovery/sklearn-wrap/-/tree/dist) + +There are other primitives developed by many universities but are not used in this repo. See [https://gitlab.com/datadrivendiscovery/primitives](https://gitlab.com/datadrivendiscovery/primitives) diff --git a/axolotl/.gitignore b/axolotl/.gitignore new file mode 100644 index 0000000..66fb22f --- /dev/null +++ b/axolotl/.gitignore @@ -0,0 +1,108 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ +tests/.asv + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ +docs/d3m.rst +docs/d3m.*.rst + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mypy +.mypy_cache/ + +# site +public/ + +.idea/ +tmp/ diff --git a/axolotl/.gitlab-ci.yml b/axolotl/.gitlab-ci.yml new file mode 100644 index 0000000..d8b359f --- /dev/null +++ b/axolotl/.gitlab-ci.yml @@ -0,0 +1,33 @@ +tests: + image: registry.gitlab.com/axolotl1/axolotl/base:latest + stage: test + tags: + - d3m_runner + services: + - docker:dind + variables: + DOCKER_HOST: tcp://docker:2375 + DOCKER_TLS_CERTDIR: "" + GIT_SUBMODULE_STRATEGY: recursive + script: + - pip3 install -e . 
+ - python3 ./run_tests.py + + +build_base_image: + stage: build + image: registry.gitlab.com/datadrivendiscovery/images/testing:ubuntu-bionic-python36 + tags: + - d3m_runner + services: + - docker:dind + variables: + DOCKER_HOST: tcp://docker:2375 + DOCKER_TLS_CERTDIR: "" + script: + - ./images/build-images.sh base + only: + - devel + + + diff --git a/axolotl/.gitmodules b/axolotl/.gitmodules new file mode 100644 index 0000000..5706087 --- /dev/null +++ b/axolotl/.gitmodules @@ -0,0 +1,3 @@ +[submodule "tests/data"] + path = tests/data + url = https://gitlab.com/datadrivendiscovery/tests-data.git diff --git a/axolotl/LICENSE b/axolotl/LICENSE new file mode 100644 index 0000000..6f75635 --- /dev/null +++ b/axolotl/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. 
The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "{}" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/axolotl/README.md b/axolotl/README.md new file mode 100644 index 0000000..bd59995 --- /dev/null +++ b/axolotl/README.md @@ -0,0 +1,41 @@ +# Axolotl + +This package provides an easy and high level abstraction +of the [D3M](https://gitlab.com/datadrivendiscovery/d3m) API for AutoML. It contains a suit of basic +requirements and building blocks +[primitives](https://gitlab.com/datadrivendiscovery/primitives). + +## Installation + +The package contains two different version of dependencies, +one with GPU support and other that uses CPU. For the installation +we strongly encourage the use of a python 3.6 virtual environment. + +* CPU version. +```bash +pip3 install -e git+https://gitlab.com/axolotl1/axolotl.git@devel#egg=axolotl[cpu] +``` + +* GPU version. +```bash +pip3 install -e git+https://gitlab.com/axolotl1/axolotl.git@devel#egg=axolotl[gpu] +``` + +Note: +For MacOs, pycurls needs to be manually installed: +```bash +PYCURL_SSL_LIBRARY=openssl LDFLAGS="-L/usr/local/opt/openssl/lib" CPPFLAGS="-I/usr/local/opt/openssl/include" pip install --no-cache-dir pycurl==7.43.0.3 +``` + +## Usage +For new users we recommend installing the package and then cloning it via +```bash +git clone --recursive https://gitlab.com/axolotl1/axolotl.git +``` + +Then start jupyter lab via +```bash +jupyter lab +``` +And then open the [examples](https://gitlab.com/axolotl1/axolotl/-/tree/devel/examples) +directory and try to run them. 
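+
+If you prefer to drive a search from your own script rather than the example notebooks, the pattern looks roughly like the sketch below. This is a minimal, hypothetical example: it assumes a `SimpleRunner` backend in `axolotl.backend.simple` and `file://` URIs pointing at a D3M-formatted problem and dataset; the search class and the `search_fit` interface are the ones defined in `axolotl/algorithms`.
+```python
+# Hypothetical usage sketch; backend class and data paths are assumptions.
+import os
+
+from d3m import container
+from d3m.metadata.problem import Problem
+
+from axolotl.algorithms.bayesian_search import BayesianSearch
+from axolotl.backend.simple import SimpleRunner  # assumed backend implementation
+
+# Load a D3M problem and dataset (absolute file:// URIs are expected).
+problem_uri = 'file://' + os.path.abspath('problem_TRAIN/problemDoc.json')
+dataset_uri = 'file://' + os.path.abspath('dataset_TRAIN/datasetDoc.json')
+problem_description = Problem.load(problem_uri)
+dataset = container.Dataset.load(dataset_uri)
+
+# The backend executes pipelines; the search explores candidate pipelines.
+backend = SimpleRunner(random_seed=42)
+search = BayesianSearch(problem_description=problem_description, backend=backend, max_trials=20)
+
+# Search for up to 10 minutes, then fit the best-ranked pipeline on the data.
+fitted_pipeline, pipeline_result = search.search_fit(input_data=[dataset], time_limit=600)
+print(pipeline_result.status)
+```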
\ No newline at end of file diff --git a/axolotl/axolotl/__init__.py b/axolotl/axolotl/__init__.py new file mode 100644 index 0000000..c8d46de --- /dev/null +++ b/axolotl/axolotl/__init__.py @@ -0,0 +1,2 @@ +__version__ = 'devel' +__description__ = 'Automated Machine Learning Framework' \ No newline at end of file diff --git a/axolotl/axolotl/algorithms/__init__.py b/axolotl/axolotl/algorithms/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/axolotl/axolotl/algorithms/autokeras_integration/__init__.py b/axolotl/axolotl/algorithms/autokeras_integration/__init__.py new file mode 100644 index 0000000..96d6e74 --- /dev/null +++ b/axolotl/axolotl/algorithms/autokeras_integration/__init__.py @@ -0,0 +1,82 @@ +from d3m.metadata.pipeline import Pipeline + +from axolotl.algorithms.autokeras_integration.constants import OMIT_LAYERS, step_function +from axolotl.algorithms.autokeras_integration.steps import set_learner, set_prediction, set_data, \ + set_loss + + +def keras2pipeline(keras_model, batch_size=32): + # Creating pipeline + from tensorflow.python.keras.activations import softmax + pipeline_description = Pipeline() + + pipeline_description.add_input(name='inputs') + + set_data(pipeline_description) + set_loss(pipeline_description) + + offset = len(pipeline_description.steps) + + previous_layer_ids = get_previous_layer_ids(keras_model) + + layers = keras_model.layers + + step_id = 0 + layer_to_step_id = {} + + total_layer_num = len(layers) + for i, layer in enumerate(layers): + cls_name = get_layer_class_name(layer) + if cls_name in OMIT_LAYERS: + continue + layer_id = get_layer_id(layer) + if len(previous_layer_ids[layer_id]) > 0: + layer.previous_layer_ids = tuple( + layer_to_step_id[i] + offset for i in previous_layer_ids[layer_id] + ) + else: + layer.previous_layer_ids = [None] + # Since JPL does not support Softmax Layer, we add the workaround to make use of softmax + if i == total_layer_num - 2 and cls_name == 'Dense': + layer.activation = softmax + d3m_step = step_function[cls_name](step_id, layer) + pipeline_description.add_step(d3m_step) + layer_to_step_id[layer_id] = step_id + step_id += 1 + + set_learner(pipeline_description, batch_size) + set_prediction(pipeline_description) + pipeline_description.add_output( + name='output predictions', data_reference=f"steps.{len(pipeline_description.steps) - 1}.produce") + + return pipeline_description + + +def get_previous_layer_ids(keras_model): + from tensorflow.python.util import nest + model = keras_model + layers = model.layers + + previous_layer_ids = {} + for layer in layers: + layer_id = str(id(layer)) + previous_layer_ids[layer_id] = set() + for i, node in enumerate(layer._inbound_nodes): + node_key = layer.name + '_ib-' + str(i) + if node_key in model._network_nodes: + for inbound_layer in nest.flatten(node.inbound_layers): + inbound_cls_name = get_layer_class_name(inbound_layer) + inbound_layer_id = get_layer_id(inbound_layer) + if inbound_cls_name in OMIT_LAYERS: + previous_layer_ids[layer_id].update(previous_layer_ids[inbound_layer_id]) + else: + previous_layer_ids[layer_id].add(inbound_layer_id) + return previous_layer_ids + + +def get_layer_id(layer): + return str(id(layer)) + + +def get_layer_class_name(layer): + return layer.__class__.__name__ \ No newline at end of file diff --git a/axolotl/axolotl/algorithms/autokeras_integration/block.py b/axolotl/axolotl/algorithms/autokeras_integration/block.py new file mode 100644 index 0000000..bc4d12a --- /dev/null +++ 
b/axolotl/axolotl/algorithms/autokeras_integration/block.py @@ -0,0 +1,205 @@ +from d3m import index +from d3m.metadata.pipeline import PrimitiveStep +from d3m.metadata.base import ArgumentType + + +class Block: + def __init__(self, block_id, primitive, previous_layer_id): + self.block_id = block_id + self.primitive = primitive + self.previous_layer_id = previous_layer_id + + def get_step(self): + step = PrimitiveStep(primitive=index.get_primitive(self.primitive)) + if self.previous_layer_id is not None: + step.add_hyperparameter(name='previous_layer', argument_type=ArgumentType.PRIMITIVE, + data=self.previous_layer_id) + return step + + +class Conv(Block): + def __init__(self, filters, kernel_size, strides, padding, block_id, primitive, previous_layer_id): + super(Conv, self).__init__(block_id, primitive, previous_layer_id) + self.filters = filters + self.kernel_size = kernel_size[0] + self.strides = strides[0] + self.padding = 'same' if padding else 'valid' + + def get_step(self): + step = super().get_step() + step.add_hyperparameter(name='filters', argument_type=ArgumentType.VALUE, data=self.filters) + step.add_hyperparameter(name='kernel_size', argument_type=ArgumentType.VALUE, data=self.kernel_size) + step.add_hyperparameter(name='strides', argument_type=ArgumentType.VALUE, data=self.strides) + step.add_hyperparameter(name='padding', argument_type=ArgumentType.VALUE, data=self.padding) + return step + + +class Conv1D(Conv): + def __init__(self, block_id, filters=10, kernel_size=2, strides=1, padding='valid', previous_layer_id=None): + super(Conv1D, self).__init__(filters, kernel_size, strides, padding, block_id, + "d3m.primitives.layer.convolution_1d.KerasWrap", previous_layer_id) + + +class Conv2D(Conv): + def __init__(self, block_id, filters=10, kernel_size=2, strides=1, padding='valid', previous_layer_id=None): + super(Conv2D, self).__init__(filters, kernel_size, strides, padding, block_id, + "d3m.primitives.layer.convolution_2d.KerasWrap", previous_layer_id) + + +class Conv3D(Conv): + def __init__(self, block_id, filters=10, kernel_size=2, strides=1, padding='valid', previous_layer_id=None): + super(Conv3D, self).__init__(filters, kernel_size, strides, padding, block_id, + "d3m.primitives.layer.convolution_3d.KerasWrap", previous_layer_id) + + +class Dense(Block): + def __init__(self, block_id, units=120, activation='linear', previous_layer_id=None): + super(Dense, self).__init__(block_id, "d3m.primitives.layer.dense.KerasWrap", previous_layer_id) + self.units = units + self.activation = activation.__name__.lower() + + def get_step(self): + step = super().get_step() + step.add_hyperparameter(name='units', argument_type=ArgumentType.VALUE, data=self.units) + step.add_hyperparameter(name='activation', argument_type=ArgumentType.VALUE, data=self.activation) + return step + + +class BatchNorm2D(Block): + def __init__(self, block_id, previous_layer_id): + super(BatchNorm2D, self).__init__(block_id, "d3m.primitives.layer.batch_normalization.KerasWrap", + previous_layer_id) + + def get_step(self): + step = super().get_step() + return step + + +class MaxPooling(Block): + def __init__(self, pool_size, strides, padding, block_id, primitive, previous_layer_id): + super(MaxPooling, self).__init__(block_id, primitive, previous_layer_id) + self.pool_size = pool_size + self.strides = strides[0] + self.padding = 'same' if padding else 'valid' + + def get_step(self): + step = super().get_step() + step.add_hyperparameter(name='pool_size', argument_type=ArgumentType.VALUE, data=self.pool_size) + 
step.add_hyperparameter(name='strides', argument_type=ArgumentType.VALUE, data=self.strides) + step.add_hyperparameter(name='padding', argument_type=ArgumentType.VALUE, data=self.padding) + return step + + +class MaxPooling1D(MaxPooling): + def __init__(self, block_id, pool_size=(2, 2), strides=(1, 1), padding='valid', previous_layer_id=None): + super(MaxPooling1D, self).__init__(pool_size, strides, padding, block_id, + "d3m.primitives.layer.max_pooling_1d.KerasWrap", previous_layer_id) + + +class MaxPooling2D(MaxPooling): + def __init__(self, block_id, pool_size=(2, 2), strides=(1, 1), padding='valid', previous_layer_id=None): + super(MaxPooling2D, self).__init__(pool_size, strides, padding, block_id, + "d3m.primitives.layer.max_pooling_2d.KerasWrap", previous_layer_id) + + +class MaxPooling3D(MaxPooling): + def __init__(self, block_id, pool_size=(2, 2), strides=(1, 1), padding='valid', previous_layer_id=None): + super(MaxPooling3D, self).__init__(pool_size, strides, padding, block_id, + "d3m.primitives.layer.max_pooling_3d.KerasWrap", previous_layer_id) + + +class AvgPooling(Block): + def __init__(self, pool_size, strides, padding, block_id, primitive, previous_layer_id): + super(AvgPooling, self).__init__(block_id, primitive, previous_layer_id) + self.pool_size = pool_size[0] + self.strides = strides[0] + self.padding = 'same' if padding else 'valid' + + def get_step(self): + step = super().get_step() + step.add_hyperparameter(name='pool_size', argument_type=ArgumentType.VALUE, data=self.pool_size) + step.add_hyperparameter(name='strides', argument_type=ArgumentType.VALUE, data=self.strides) + step.add_hyperparameter(name='padding', argument_type=ArgumentType.VALUE, data=self.padding) + return step + + +class AvgPooling1D(AvgPooling): + def __init__(self, block_id, pool_size=(2, 2), strides=(1, 1), padding='valid', previous_layer_id=None): + super(AvgPooling1D, self).__init__(pool_size, strides, padding, block_id, + "d3m.primitives.layer.average_pooling_1d.KerasWrap", previous_layer_id) + + +class AvgPooling2D(AvgPooling): + def __init__(self, block_id, pool_size=(2, 2), strides=(1, 1), padding='valid', previous_layer_id=None): + super(AvgPooling2D, self).__init__(pool_size, strides, padding, block_id, + "d3m.primitives.layer.average_pooling_2d.KerasWrap", previous_layer_id) + + +class AvgPooling3D(AvgPooling): + def __init__(self, block_id, pool_size=(2, 2), strides=(1, 1), padding='valid', previous_layer_id=None): + super(AvgPooling3D, self).__init__(pool_size, strides, padding, block_id, + "d3m.primitives.layer.average_pooling_3d.KerasWrap", previous_layer_id) + + +class GlobalAvgPooling2d(Block): + def __init__(self, block_id, data_format='channels_last', previous_layer_id=None): + super(GlobalAvgPooling2d, self).__init__(block_id, "d3m.primitives.layer.global_average_pooling_2d.KerasWrap", + previous_layer_id=previous_layer_id) + self.data_format = data_format + + def get_step(self): + step = super().get_step() + step.add_hyperparameter(name='data_format', argument_type=ArgumentType.VALUE, data=self.data_format) + return step + + +# JPL does not have such primitives, +# class GlobalMaxPooling2d(MaxPooling2D): +# def __init__(self, block_id, input_shape, previous_layer_id): +# kernel_size = input_shape[0] +# super(GlobalMaxPooling2d, self).__init__(block_id, kernel_size, previous_layer_id=previous_layer_id) + + +class Dropout(Block): + def __init__(self, block_id, rate=0.2, previous_layer_id=None): + super(Dropout, self).__init__(block_id, "d3m.primitives.layer.dropout.KerasWrap", 
previous_layer_id) + self.rate = rate + + def get_step(self): + step = super().get_step() + step.add_hyperparameter(name='rate', argument_type=ArgumentType.VALUE, data=self.rate) + return step + + +class Flatten(Block): + def __init__(self, block_id, previous_layer_id): + super(Flatten, self).__init__(block_id, "d3m.primitives.layer.flatten.KerasWrap", previous_layer_id) + + +class Add(Block): + def __init__(self, block_id, previous_layer_ids): + super(Add, self).__init__(block_id, "d3m.primitives.layer.add.KerasWrap", None) + self.previous_layer_ids = previous_layer_ids + + def get_step(self): + step = PrimitiveStep(primitive=index.get_primitive(self.primitive)) + step.add_hyperparameter(name='previous_layers', argument_type=ArgumentType.PRIMITIVE, + data=self.previous_layer_ids) + return step + + +class Concatenate(Block): + def __init__(self, block_id, previous_layer_ids): + super(Concatenate, self).__init__(block_id, "d3m.primitives.layer.concat.KerasWrap", None) + self.previous_layer_ids = previous_layer_ids + + def get_step(self): + step = PrimitiveStep(primitive=index.get_primitive(self.primitive)) + step.add_hyperparameter(name='previous_layers', argument_type=ArgumentType.PRIMITIVE, + data=self.previous_layer_ids) + return step + + +class Null(Block): + def __init__(self, block_id): + super(Null, self).__init__(block_id, "d3m.primitives.layer.null.KerasWrap", None) diff --git a/axolotl/axolotl/algorithms/autokeras_integration/constants.py b/axolotl/axolotl/algorithms/autokeras_integration/constants.py new file mode 100644 index 0000000..0533405 --- /dev/null +++ b/axolotl/axolotl/algorithms/autokeras_integration/constants.py @@ -0,0 +1,23 @@ +from .mapping import * + +step_function = { + 'Dense': fetch_dense_step, + 'Conv1D': fetch_conv1D_step, + 'Conv2D': fetch_conv2D_step, + 'Conv3D': fetch_conv3D_step, + 'BatchNormalization': fetch_batch_norm_step, + 'MaxPooling2D': fetch_maxpool2d_step, + 'Dropout': fetch_dropout_step, + 'AvgPooling2D': fetch_avgpool2d_step, + # 'GlobalMaxPooling2d': JPL does not have such primitives, + 'GlobalAveragePooling2D': fetch_global_avgpooling_step, + 'Flatten': fetch_flatten_step, + 'Add': fetch_add_step, + 'Concatenate': fetch_concatenate_step, + 'Null': fetch_null_step, + # 'Substract': we do not implement +} + +ACTIVATIONS = {'ReLU'} +OMIT_LAYERS = {'InputLayer', 'Normalization', 'ReLU', 'ZeroPadding2D', 'Softmax', 'Activation'} +FORWARD_LAYERS = {'Dense', 'Conv1d', 'Conv2d', 'Conv3d'} diff --git a/axolotl/axolotl/algorithms/autokeras_integration/mapping.py b/axolotl/axolotl/algorithms/autokeras_integration/mapping.py new file mode 100644 index 0000000..f10aa16 --- /dev/null +++ b/axolotl/axolotl/algorithms/autokeras_integration/mapping.py @@ -0,0 +1,122 @@ +from .block import * + + +def fetch_conv1D_step(block_id, layer): + return Conv1D( + block_id, + layer.filters, + layer.kernel_size, + layer.strides, + layer.padding, + layer.previous_layer_ids[0] + ).get_step() + + +def fetch_conv2D_step(block_id, layer): + return Conv2D( + block_id, + layer.filters, + layer.kernel_size, + layer.strides, + layer.padding, + layer.previous_layer_ids[0] + ).get_step() + + +def fetch_conv3D_step(block_id, layer): + return Conv3D( + block_id, + layer.filters, + layer.kernel_size, + layer.strides, + layer.padding, + layer.previous_layer_ids[0] + ).get_step() + + +def fetch_dense_step(block_id, layer): + return Dense( + block_id, + layer.units, + layer.activation, + layer.previous_layer_ids[0] + ).get_step() + + +def fetch_batch_norm_step(block_id, layer): + return 
BatchNorm2D( + block_id, + layer.previous_layer_ids[0] + ).get_step() + + +def fetch_maxpool2d_step(block_id, layer): + return MaxPooling2D( + block_id, + layer.pool_size, + layer.strides, + layer.padding, + layer.previous_layer_ids[0] + ).get_step() + + +def fetch_avgpool2d_step(block_id, layer): + return AvgPooling2D( + block_id, + layer.pool_size, + layer.strides, + layer.padding, + layer.previous_layer_ids[0] + ).get_step() + + +def fetch_dropout_step(block_id, layer): + return Dropout( + block_id, + layer.rate, + layer.previous_layer_ids[0] + ).get_step() + + +# JPL does not have such primitives, +# def fetch_global_maxpooling_step(block_id, layer): +# return GlobalMaxPooling2d( +# block_id, +# layer.input.shape, +# layer.previous_layer_ids[0] +# ).get_step() + + +def fetch_global_avgpooling_step(block_id, layer): + return GlobalAvgPooling2d( + block_id, + layer.data_format, + layer.previous_layer_ids[0] + ).get_step() + + +def fetch_flatten_step(block_id, layer): + return Flatten( + block_id, + layer.previous_layer_ids[0] + ).get_step() + + +def fetch_add_step(block_id, layer): + return Add( + block_id, + layer.previous_layer_ids + ).get_step() + + +def fetch_concatenate_step(block_id, layer): + return Concatenate( + block_id, + layer.previous_layer_ids + ).get_step() + + +def fetch_null_step(block_id): + return Null( + block_id, + ).get_step() diff --git a/axolotl/axolotl/algorithms/autokeras_integration/steps.py b/axolotl/axolotl/algorithms/autokeras_integration/steps.py new file mode 100644 index 0000000..a73f637 --- /dev/null +++ b/axolotl/axolotl/algorithms/autokeras_integration/steps.py @@ -0,0 +1,126 @@ +from d3m.metadata.base import ArgumentType +from d3m.metadata.pipeline import PrimitiveStep + +import d3m.primitives.data_preprocessing.image_reader +import d3m.primitives.data_transformation.denormalize +import d3m.primitives.data_transformation.dataset_to_dataframe +import d3m.primitives.data_transformation.construct_predictions +import d3m.primitives.data_transformation.extract_columns_by_semantic_types +import d3m.primitives.data_transformation.replace_semantic_types + +import d3m.primitives.loss_function.categorical_crossentropy +import d3m.primitives.loss_function.categorical_accuracy + +import d3m.primitives.learner.model +import d3m.primitives.data_wrangling.batching + +LOSS_SETUP_IDX = IP_STEP = OP_STEP = READER_STEP = -1 +BATCH_SIZE = 40 + + +def set_data(pipeline_description): + global IP_STEP, OP_STEP, READER_STEP + + # denormalize + denorm_step_idx = 0 + step = PrimitiveStep( + primitive_description=d3m.primitives.data_transformation.denormalize.Common.metadata.query()) + step.add_argument( + name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='inputs.0') + step.add_output('produce') + pipeline_description.add_step(step) + + # dataset_to_dataframe + dataset_to_dataframe_step_idx = len(pipeline_description.steps) + step = PrimitiveStep( + primitive_description=d3m.primitives.data_transformation.dataset_to_dataframe.Common.metadata.query()) + step.add_argument( + name='inputs', argument_type=ArgumentType.CONTAINER, + data_reference='steps.{}.produce'.format(denorm_step_idx)) + step.add_output('produce') + pipeline_description.add_step(step) + + # extract targets + extract_step_idx = len(pipeline_description.steps) + extract_targets = PrimitiveStep( + d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common.metadata.query()) + extract_targets.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, + 
data_reference='steps.{}.produce'.format(dataset_to_dataframe_step_idx)) + extract_targets.add_hyperparameter(name='semantic_types', argument_type=ArgumentType.VALUE, + data=['https://metadata.datadrivendiscovery.org/types/TrueTarget']) + extract_targets.add_output('produce') + pipeline_description.add_step(extract_targets) + + # replace semantic types + # Need to be used for CIFAR-10 + replace_step_idx = len(pipeline_description.steps) + replace_semantic = PrimitiveStep( + d3m.primitives.data_transformation.replace_semantic_types.Common.metadata.query()) + replace_semantic.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, + data_reference=f'steps.{extract_step_idx}.produce') + replace_semantic.add_hyperparameter(name='to_semantic_types', argument_type=ArgumentType.VALUE, + data=['https://metadata.datadrivendiscovery.org/types/SuggestedTarget', + 'https://metadata.datadrivendiscovery.org/types/TrueTarget']) + replace_semantic.add_hyperparameter(name='from_semantic_types', argument_type=ArgumentType.VALUE, + data=['https://metadata.datadrivendiscovery.org/types/TrueTarget']) + replace_semantic.add_output('produce') + pipeline_description.add_step(replace_semantic) + + # image reader + reader_step_idx = len(pipeline_description.steps) + reader = PrimitiveStep( + primitive_description=d3m.primitives.data_preprocessing.image_reader.Common.metadata.query()) + reader.add_hyperparameter(name='return_result', argument_type=ArgumentType.VALUE, data='new') + pipeline_description.add_step(reader) + + IP_STEP, OP_STEP, READER_STEP = dataset_to_dataframe_step_idx, replace_step_idx, reader_step_idx + + +def set_loss(pipeline_description): + global LOSS_SETUP_IDX + + LOSS_SETUP_IDX = len(pipeline_description.steps) + step = PrimitiveStep( + primitive_description=d3m.primitives.loss_function.categorical_crossentropy.KerasWrap.metadata.query()) + pipeline_description.add_step(step) + + +def set_learner(pipeline_description, batch_size=BATCH_SIZE): + learner_idx = len(pipeline_description.steps) + step = PrimitiveStep(primitive_description=d3m.primitives.learner.model.KerasWrap.metadata.query()) + step.add_hyperparameter(name='loss', argument_type=ArgumentType.PRIMITIVE, data=LOSS_SETUP_IDX) + step.add_hyperparameter(name='model_type', argument_type=ArgumentType.VALUE, data='classification') + step.add_hyperparameter(name='network_last_layer', argument_type=ArgumentType.PRIMITIVE, + data=learner_idx - 1) + step.add_hyperparameter(name='return_result', argument_type=ArgumentType.VALUE, data='replace') + lr = 0.0001 + adam_hypers = d3m.primitives.learner.model.KerasWrap.metadata.get_hyperparams().defaults(path='optimizer.Adam') + adam_hypers = adam_hypers.replace({'lr': lr}) + step.add_hyperparameter(name='optimizer', argument_type=ArgumentType.VALUE, data=adam_hypers) + pipeline_description.add_step(step) + + bz_loader = PrimitiveStep(primitive_description=d3m.primitives.data_wrangling.batching.TAMU.metadata.query()) + bz_loader.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, + data_reference=f'steps.{IP_STEP}.produce') + bz_loader.add_argument(name='outputs', argument_type=ArgumentType.CONTAINER, + data_reference='steps.{}.produce'.format(OP_STEP)) + bz_loader.add_hyperparameter(name='primitive_reader', argument_type=ArgumentType.PRIMITIVE, data=READER_STEP) + bz_loader.add_hyperparameter(name='primitive_learner', argument_type=ArgumentType.PRIMITIVE, data=learner_idx) + bz_loader.add_hyperparameter(name='batch_size', argument_type=ArgumentType.VALUE, data=batch_size) + 
bz_loader.add_hyperparameter(name='sampling_method', argument_type=ArgumentType.VALUE, data='random') + bz_loader.add_output('produce') + + pipeline_description.add_step(bz_loader) + + +def set_prediction(pipeline_description): + pred = PrimitiveStep( + primitive_description=d3m.primitives.data_transformation.construct_predictions.Common.metadata.query()) + pred.add_argument( + name='inputs', argument_type=ArgumentType.CONTAINER, + data_reference=f"steps.{len(pipeline_description.steps) - 1}.produce" + ) + pred.add_argument(name='reference', argument_type=ArgumentType.CONTAINER, + data_reference='steps.{}.produce'.format(IP_STEP)) + pred.add_output('produce') + pipeline_description.add_step(pred) diff --git a/axolotl/axolotl/algorithms/autokeras_search.py b/axolotl/axolotl/algorithms/autokeras_search.py new file mode 100644 index 0000000..faf6c50 --- /dev/null +++ b/axolotl/axolotl/algorithms/autokeras_search.py @@ -0,0 +1,145 @@ +import logging +import numpy as np + +import autokeras as ak +from d3m import exceptions, index, container +from d3m.metadata import base as metadata_base + +from axolotl.algorithms.autokeras_integration import keras2pipeline +from axolotl.algorithms.base import PipelineSearchBase +from axolotl.utils.pipeline import PipelineResult + +logger = logging.getLogger(__name__) + + +class AutoKerasSearch(PipelineSearchBase): + + def __init__(self, problem_description, backend, + max_trials=10000, directory='.', epochs=1, batch_size=32, validation_split=0.2): + super(AutoKerasSearch, self).__init__(problem_description, backend, ranking_function=None) + + self.clf = ak.ImageClassifier(max_trials=max_trials, seed=self.random_seed, directory=directory) + self.tuner = self.clf.tuner + self.epochs = epochs + self.batch_size = batch_size + self.validation_split = validation_split + + def search_fit(self, input_data, time_limit=300, *, expose_values=False): + dataframe = self.get_dataframe(input_data) + y = self.get_y(dataframe) + x = self.get_x(dataframe) + + self.clf.fit(x=x, y=y, epochs=self.epochs, batch_size=self.batch_size, + validation_split=self.validation_split) + keras_model = self.clf.export_model() + best_pipeline = keras2pipeline(keras_model, batch_size=self.batch_size) + + fitted_pipeline_result = self.backend.fit_pipeline( + problem_description=self.problem_description, pipeline=best_pipeline, + input_data=input_data, expose_outputs=expose_values + ) + + if fitted_pipeline_result.error is not None: + logging.error('No solution founded') + pipeline_result = PipelineResult(pipeline=best_pipeline) + pipeline_result.error = RuntimeError("No solution found") + return pipeline_result + + self.best_fitted_pipeline_id = fitted_pipeline_result.fitted_pipeline_id + return fitted_pipeline_result + + def mark_columns(self, dataset): + problem_inputs = self.problem_description['inputs'] + for problem_input in problem_inputs: + for target in problem_input.get('targets', []): + if target['resource_id'] not in dataset: + raise exceptions.NotFoundError( + "Error marking target column: dataset does not contain resource with resource ID '{resource_id}'.".format( + resource_id=target['resource_id'], + ), + ) + if not isinstance(dataset[target['resource_id']], container.DataFrame): + raise TypeError( + "Error marking target column: resource '{resource_id}' is not a DataFrame.".format( + resource_id=target['resource_id'], + ), + ) + if not 0 <= target['column_index'] < dataset[target['resource_id']].shape[1]: + raise ValueError( + "Error marking target column: resource '{resource_id}' 
does not have a column with index '{column_index}'.".format( + resource_id=target['resource_id'], + column_index=target['column_index'], + ), + ) + + dataset.metadata = dataset.metadata.add_semantic_type( + (target['resource_id'], metadata_base.ALL_ELEMENTS, target['column_index']), + 'https://metadata.datadrivendiscovery.org/types/Target', + ) + dataset.metadata = dataset.metadata.add_semantic_type( + (target['resource_id'], metadata_base.ALL_ELEMENTS, target['column_index']), + 'https://metadata.datadrivendiscovery.org/types/TrueTarget', + ) + # If column is marked as a target, it cannot be attribute as well. + # This allows one to define in problem description otherwise attribute columns as targets. + # See: https://gitlab.com/datadrivendiscovery/d3m/issues/265 + dataset.metadata = dataset.metadata.remove_semantic_type( + (target['resource_id'], metadata_base.ALL_ELEMENTS, target['column_index']), + 'https://metadata.datadrivendiscovery.org/types/Attribute', + ) + return dataset + + def get_dataframe(self, input_data): + # denormalize + denormalize = index.get_primitive('d3m.primitives.data_transformation.denormalize.Common') + hyperparams_class = denormalize.metadata.get_hyperparams() + primitive = denormalize(hyperparams=hyperparams_class.defaults()) + dataset = primitive.produce(inputs=input_data[0]).value + + # Add Target column into dataset + dataset = self.mark_columns(dataset) + + # dataset to dataframe + dataset_dataframe = index.get_primitive('d3m.primitives.data_transformation.dataset_to_dataframe.Common') + hyperparams_class = dataset_dataframe.metadata.get_hyperparams() + primitive = dataset_dataframe(hyperparams=hyperparams_class.defaults()) + dataframe = primitive.produce(inputs=dataset).value + + return dataframe + + def get_y(self, dataframe): + # extract targets + get_columns_semantic = index.get_primitive( + 'd3m.primitives.data_transformation.extract_columns_by_semantic_types.Common') + hyperparams_class = get_columns_semantic.metadata.get_hyperparams() + primitive = get_columns_semantic( + hyperparams=hyperparams_class.defaults().replace( + { + 'semantic_types': ( + 'https://metadata.datadrivendiscovery.org/types/TrueTarget', + 'https://metadata.datadrivendiscovery.org/types/Target', + 'https://metadata.datadrivendiscovery.org/types/SuggestedTarget', + 'https://metadata.datadrivendiscovery.org/types/PredictedTarget' + ) + } + ) + ) + targets = primitive.produce(inputs=dataframe).value + y = np.array(targets, dtype=np.int64) + return y + + def get_x(self, dataframe): + # reading images + image_reader = index.get_primitive('d3m.primitives.data_preprocessing.image_reader.Common') + hyperparams_class = image_reader.metadata.get_hyperparams() + primitive = image_reader(hyperparams=hyperparams_class.defaults().replace( + {'return_result': 'replace'}) + ) + columns_to_use = primitive._get_columns(dataframe.metadata) + column_index = columns_to_use[0] + temp = [ + primitive._read_filename(column_index, dataframe.metadata.query((row_index, column_index)), value) + for row_index, value in enumerate(dataframe.iloc[:, column_index]) + ] + x = np.array(temp, dtype=np.float64) + return x diff --git a/axolotl/axolotl/algorithms/base.py b/axolotl/axolotl/algorithms/base.py new file mode 100644 index 0000000..2ccea22 --- /dev/null +++ b/axolotl/axolotl/algorithms/base.py @@ -0,0 +1,241 @@ +import abc +import uuid +import logging +import time +import typing + +from d3m.metadata.problem import Problem +from d3m.metadata.pipeline import Pipeline +from d3m import runtime as 
runtime_module +from d3m import container +from d3m.metadata.base import Context +from d3m import utils as d3m_utils +from d3m.metadata import pipeline_run as pipeline_run_module + +from axolotl.backend.base import RunnerBase +from axolotl.utils.pipeline import PipelineResult +from axolotl.utils.schemas import ContainerType +from axolotl.utils import resources as resources_module + +logger = logging.getLogger(__name__) + + +class PipelineSearchBase: + """ + Base class for pipeline searcher, this class should provide the common interface for pipeline + searchers to be integrated with the system. + + Nothing should be computed or initialized on the constructor, just adding more variables. + Everything else should be computed at start_search. + + Parameters + ---------- + problem_description : Problem + A problem description. + backend : RunnerBase + An instance of a backend class. + primitives_blocklist : typing.Sequence[str] + A list of string with pipeline names to avoid. + ranking_function : typing.Callable + A function that takes as an input a dataframe of scores, and generates a rank, smaller is better + + + Attributes + ---------- + backend : RunnerBase + An instance of a backend class. + random_seed : int + Random seed passed to the constructor. + volumes_dir : str + Path to a directory with static files required by primitives. + scratch_dir : str + Path to a directory to store any temporary files needed during execution. + ranking_function : typing.Callable + A function that takes as an input a dataframe of scores, and generates a rank, smaller is better + problem_description : Problem + A problem description. + primitives_blocklist : typing.Sequence[str] + A list of string with pipeline names to avoid. + + history : typing.Dict[str, PipelineResult] + A list of all the evaluated pipelines with their execution results and performance. + """ + + def __init__(self, + problem_description: Problem, backend: RunnerBase, *, + primitives_blocklist: typing.Sequence[str] = None, ranking_function: typing.Callable = None + ) -> None: + self.search_id = str(uuid.uuid4()) + self.backend = backend + self.random_seed = backend.random_seed + self.volumes_dir = backend.volumes_dir + self.scratch_dir = backend.scratch_dir + self.ranking_function = ranking_function + + self.problem_description: Problem = problem_description + self.primitives_blocklist: typing.Sequence[str] = primitives_blocklist + + self.history: typing.List[PipelineResult] = [] + + # missing typing + self.best_fitted_pipeline_id: str = None + self.input_data: typing.Sequence[ContainerType] = None + + with d3m_utils.silence(): + self.runtime_environment = pipeline_run_module.RuntimeEnvironment() + + def search(self, time_limit: float): + """ + This method executes the whole search, by calling the ``_search`` method multiple times + as long as there is time left and put the results on the history. 
+ + Parameters + ---------- + time_limit : float + Time limit for the search + """ + time_start = time.time() + largest_iteration = 0 + + i = 0 + + while True: + i += 1 + time_left = time_limit - (time.time() - time_start) + + if time_left < 5: + logger.info('-- Time out --') + break + + if time_left - largest_iteration < 5: + logger.info("""-- Time out -- \n Time left {} Next iteration could be over {}""".format(time_left, largest_iteration)) + break + + start_iteration_time = time.time() + results = self._search(time_left=time_left) + self.history += results + current_iteration_time = time.time() - start_iteration_time + + if largest_iteration < current_iteration_time: + largest_iteration = current_iteration_time + + def search_fit(self, input_data: typing.Sequence[ContainerType], time_limit: float = 300, *, + expose_values: bool = False) -> typing.Tuple[runtime_module.Runtime, PipelineResult]: + """ + This method calls search and fit the best ranking pipelines located from the search located on the history. + + Parameters + ---------- + input_data : typing.Sequence[ContainerType] + A list of D3M containers to be use as the pipeline input. + + time_limit : float + The time limit to be use for the search. + + expose_values : bool + A flag that allows the user expose all intermediate result of the pipeline during fitting. + """ + self.input_data = input_data + self.search(time_limit) + + best_pipeline = None + for pipeline_result in self.history: + if pipeline_result.error is None: + if best_pipeline is None: + best_pipeline = pipeline_result + else: + if pipeline_result.rank < best_pipeline.rank: + best_pipeline = pipeline_result + + if best_pipeline is None: + logging.error('No solution founded') + pipeline_result = PipelineResult(fitted_pipeline_id='') + pipeline_result.error = RuntimeError("No solution found") + return _, pipeline_result + + return self.fit(best_pipeline.pipeline, input_data, expose_values) + + def fit(self, pipeline: Pipeline, input_data: typing.Sequence[container.Dataset], + expose_outputs: bool = False) -> typing.Tuple[runtime_module.Runtime, PipelineResult]: + + pipeline_result = PipelineResult(pipeline=pipeline) + + runtime, output, result = runtime_module.fit( + pipeline=pipeline, inputs=input_data, problem_description=self.problem_description, context=Context.TESTING, + hyperparams=None, random_seed=self.random_seed, volumes_dir=self.volumes_dir, + runtime_environment=self.runtime_environment, scratch_dir=self.scratch_dir, expose_produced_outputs=expose_outputs + ) + if result.has_error(): + pipeline_result.status = "ERRORED" + pipeline_result.error = result.error + else: + pipeline_result.status = "COMPLETED" + + pipeline_result.exposed_outputs = result.values + pipeline_result.output = output + + return runtime, pipeline_result + + def produce(self, fitted_pipeline: runtime_module.Runtime, input_data: typing.Sequence[container.Dataset], + expose_outputs: bool = False) -> PipelineResult: + pipeline_result = PipelineResult(fitted_pipeline_id='') + + with d3m_utils.silence(): + output, result = runtime_module.produce( + fitted_pipeline=fitted_pipeline, test_inputs=input_data, + expose_produced_outputs=expose_outputs + ) + + if result.has_error(): + pipeline_result.status = "ERRORED" + pipeline_result.error = result.error + else: + pipeline_result.status = "COMPLETED" + + pipeline_result.exposed_outputs = result.values + pipeline_result.output = output + return pipeline_result + + @abc.abstractmethod + def _search(self, time_left: float) -> 
typing.Sequence[PipelineResult]: + """ + A method where the search is going to be implemented. + The search algorithm should be iteration oriented, each of the call should end + on returning the status of pipelines evaluated. + + Parameters + ---------- + time_left : float + TTime left for the iteration + + Returns + ------- + typing.Sequence[PipelineResult] + A list of pipeline results with the information of the pipeline ran during the iteration. + + """ + + def pretty_print(self, deep: bool = False): + """ + A function that prints everything really nice. + """ + from pprint import pprint + + def simplify_value(input_value): + if isinstance(input_value, Problem): + return input_value.to_simple_structure() + elif isinstance(input_value, Pipeline): + return input_value.to_json_structure() + elif isinstance(input_value, PipelineResult): + return vars(input_value) + elif isinstance(input_value, dict): + new_value = {} + for nested_variable, nested_val in input_value.items(): + new_value[nested_variable] = simplify_value(nested_val) + return new_value + + class_instance = vars(self) + if deep: + class_instance = simplify_value(class_instance) + + pprint(class_instance) diff --git a/axolotl/axolotl/algorithms/bayesian_search.py b/axolotl/axolotl/algorithms/bayesian_search.py new file mode 100644 index 0000000..9b91db1 --- /dev/null +++ b/axolotl/axolotl/algorithms/bayesian_search.py @@ -0,0 +1,27 @@ +import enum + +from axolotl.algorithms.tuners.bayesian_oracle import BayesianOptimizationOracle +from axolotl.algorithms.tuners.tunable_base import TunableBase + + +class BayesianSearch(TunableBase): + def __init__(self, problem_description, backend, primitives_blocklist=None, + max_trials=10000, directory='.', num_initial_points=None, num_eval_trials=None): + super(BayesianSearch, self).__init__(problem_description, backend, + primitives_blocklist=primitives_blocklist, num_eval_trials=num_eval_trials) + self.directory = directory + self.project_name = 'random_search' + + self.objective = self.problem_description['problem']['performance_metrics'][0]['metric'] + if isinstance(self.objective, enum.Enum): + self.objective = self.objective.name + + self.oracle = BayesianOptimizationOracle( + objective=self.objective, + max_trials=max_trials, # pre-defined number, + seed=self.random_seed, # seed + hyperparameters=self.hyperparameters, + num_initial_points=num_initial_points, + ) + self.oracle._set_project_dir( + self.directory, self.project_name, overwrite=True) diff --git a/axolotl/axolotl/algorithms/data_driven_search.py b/axolotl/axolotl/algorithms/data_driven_search.py new file mode 100644 index 0000000..e1687cb --- /dev/null +++ b/axolotl/axolotl/algorithms/data_driven_search.py @@ -0,0 +1,1086 @@ +import copy +import uuid +import numpy + +from d3m.metadata.pipeline import Pipeline, PrimitiveStep, Resolver +from d3m import index +from d3m import runtime as runtime_module +from d3m import utils as d3m_utils + +from axolotl.algorithms.base import PipelineSearchBase +from axolotl.utils import schemas as schemas_utils, pipeline as pipeline_utils +from d3m.metadata.base import ArgumentType, ALL_ELEMENTS +from axolotl.algorithms.dummy import dummy_ranking_function +from axolotl.algorithms.bayesian_search import BayesianSearch +import multiprocessing + +PREP_PRIMITIVES = { + 'Denormalize': 'd3m.primitives.data_transformation.denormalize.Common', + 'DatasetToDataFrame': 'd3m.primitives.data_transformation.dataset_to_dataframe.Common', + 'ColumnParser': 
'd3m.primitives.data_transformation.column_parser.Common', + 'ExtractColumnsBySemanticTypes': 'd3m.primitives.data_transformation.extract_columns_by_semantic_types.Common', + 'Imputer': 'd3m.primitives.data_cleaning.imputer.SKlearn', + 'SimpleProfiler': 'd3m.primitives.schema_discovery.profiler.Common', + 'ReplaceSemanticTypes': 'd3m.primitives.data_transformation.replace_semantic_types.Common', + 'DropColumns': 'd3m.primitives.data_transformation.remove_columns.Common', + 'OneHotMaker': 'd3m.primitives.data_preprocessing.one_hot_encoder.MakerCommon', + 'ExtractColumns': 'd3m.primitives.data_transformation.extract_columns.Common', + 'GeneralHorizontalConcat': 'd3m.primitives.data_transformation.horizontal_concat.TAMU', + 'Imputer': 'd3m.primitives.data_cleaning.imputer.SKlearn', + 'FeatureSelection': 'd3m.primitives.feature_selection.select_fwe.SKlearn', + 'ConstructPredictions': 'd3m.primitives.data_transformation.construct_predictions.Common', + 'OrdinalEncoder': 'd3m.primitives.data_transformation.ordinal_encoder.SKlearn', + 'RobustScale': 'd3m.primitives.data_preprocessing.robust_scaler.SKlearn', + 'TimeSeriesToList': 'd3m.primitives.data_preprocessing.time_series_to_list.DSBOX', + 'TimeSeriesFeaturization': 'd3m.primitives.feature_extraction.random_projection_timeseries_featurization.DSBOX', + 'TextReader': 'd3m.primitives.data_preprocessing.text_reader.Common', + 'TextEncoder': 'd3m.primitives.data_transformation.encoder.DistilTextEncoder', + 'AddSemanticTypes': 'd3m.primitives.data_transformation.add_semantic_types.Common', + 'SemiClassification': 'd3m.primitives.semisupervised_classification.iterative_labeling.AutonBox' +} + +LOADED_PRIMITIVES = {} + +DATA_TYPES = { + 'attribute': 'https://metadata.datadrivendiscovery.org/types/Attribute', + 'target': 'https://metadata.datadrivendiscovery.org/types/Target', + 'suggested_target': 'https://metadata.datadrivendiscovery.org/types/SuggestedTarget', + 'true_target': 'https://metadata.datadrivendiscovery.org/types/TrueTarget', + 'float': 'http://schema.org/Float', + 'int': 'http://schema.org/Integer', + 'unknown_type': 'https://metadata.datadrivendiscovery.org/types/UnknownType', + 'categorical': 'https://metadata.datadrivendiscovery.org/types/CategoricalData', + 'text': 'http://schema.org/Text', + 'bool': 'http://schema.org/Boolean', + 'file': 'https://metadata.datadrivendiscovery.org/types/FileName', + 'time_series': 'https://metadata.datadrivendiscovery.org/types/Timeseries', + 'date': 'http://schema.org/DateTime', + 'time': 'https://metadata.datadrivendiscovery.org/types/Time' +} + +with d3m_utils.silence(): + for key, value in PREP_PRIMITIVES.items(): + LOADED_PRIMITIVES[key] = index.get_primitive(value) + + +def get_semantic_types(input_dataframe): + semantic_types = [] + for i in range(input_dataframe.metadata.query((ALL_ELEMENTS,))['dimension']['length']): + semantic_types.append(input_dataframe.metadata.query((ALL_ELEMENTS, i,))['semantic_types']) + return semantic_types + + +def get_indexes_by_semantic_type(input_dataframe, semantic_type): + semantic_types = get_semantic_types(input_dataframe) + indexes = [] + for i in range(len(semantic_types)): + if semantic_type in semantic_types[i]: + indexes.append(i) + return indexes + + +def get_index_data_to_profile(input_dataframe): + indexes_to_profile = [] + for i in range(input_dataframe.metadata.query((ALL_ELEMENTS,))['dimension']['length']): + if DATA_TYPES['unknown_type'] in input_dataframe.metadata.query((ALL_ELEMENTS, i,))['semantic_types'] and \ + 
input_dataframe.metadata.query((ALL_ELEMENTS, i,))['structural_type'] == str: + indexes_to_profile.append(i) + return indexes_to_profile + + +def run_primitive(primitive, arguments, hyperparams=()): + # TODO add static support for static file + _hyperparams = primitive.metadata.get_hyperparams().defaults() + hp_to_update = {} + for hyperparam in hyperparams: + name, argument_type, data = hyperparam + hp_to_update[name] = data + _hyperparams = _hyperparams.replace(hp_to_update) + primitive_instance = primitive(hyperparams=_hyperparams) + use_set_training_data = pipeline_utils.query_multiple_terms( + primitive.metadata, ['primitive_code', 'instance_methods', 'set_training_data', 'arguments']) + if use_set_training_data is not None and use_set_training_data: + primitive_instance.set_training_data(**arguments) + primitive_instance.fit() + + produce_arguments = pipeline_utils.query_multiple_terms( + primitive.metadata, ['primitive_code', 'instance_methods', 'produce', 'arguments']) + + arguments_keys = list(arguments.keys()) + for argument in arguments_keys: + if argument not in produce_arguments: + print('removing argument', argument) + del arguments[argument] + return primitive_instance.produce(**arguments).value + + +def add_primitive_step_to_pipeline(pipeline, primitive, arguments=(), hyperparams=(), resolver=Resolver()): + step = PrimitiveStep(primitive=primitive, resolver=resolver) + for argument in arguments: + name, argument_type, data_reference = argument + step.add_argument(name=name, argument_type=argument_type, data_reference=data_reference) + for hyperparam in hyperparams: + name, argument_type, data = hyperparam + step.add_hyperparameter(name=name, argument_type=argument_type, data=data) + step.add_output('produce') + pipeline.add_step(step) + + +def fix_arguments(arguments): + _arguments = [] + for name, reference in arguments.items(): + _arguments.append((name, ArgumentType.CONTAINER, reference)) + return _arguments + + +def prepare_arguments(available_data, arguments): + _arguments = {} + for name, reference in arguments.items(): + if isinstance(reference, list): + _arguments[name] = [] + for elem in reference: + _arguments[name].append(available_data[elem]) + else: + _arguments[name] = available_data[reference] + return _arguments + + +def shrink_dataset(dataset, n_rows=10000): + if 'learningData' not in dataset or len(dataset.keys()) > 1 or len(dataset['learningData']) <= n_rows: + return dataset + + print('=' * 100) + print('Shrinking dataset from {} to {}'.format(len(dataset['learningData']), n_rows)) + df = dataset['learningData'].sample(n=n_rows) + df['d3mIndex'] = df['d3mIndex'].apply(lambda x: int(x)) + df = df.sort_values(by=['d3mIndex']) + df['d3mIndex'] = df['d3mIndex'].apply(lambda x: str(x)) + df = df.reset_index(drop=True) + + dataset['learningData'] = df + metadata = dataset.metadata + + metadata = metadata.update(('learningData',), { + 'structural_type': metadata.query(('learningData',))['structural_type'], + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/Table', + 'https://metadata.datadrivendiscovery.org/types/DatasetEntryPoint', + ], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': n_rows, + }, + }) + + dataset.metadata = metadata + return dataset + + +def get_primitives_by_family(family_type): + pass + + +def index_to_operate(input_data, data_type, exclude_targets): + indexes = [] + semantic_types = get_semantic_types(input_data) + for i in 
range(len(semantic_types)): + if data_type in semantic_types[i]: + if DATA_TYPES['target'] in semantic_types[i]: + if not exclude_targets: + indexes.append(i) + else: + indexes.append(i) + return indexes + + +DEFAULT_HYPERPARAMS = { + 'ColumnParser': [ + ('parse_semantic_types', ArgumentType.VALUE, + ['http://schema.org/Integer', 'http://schema.org/Float', + 'https://metadata.datadrivendiscovery.org/types/FloatVector', 'http://schema.org/DateTime'] + ) + ], + 'SimpleProfiler': [ + ('return_result', ArgumentType.VALUE, 'replace'), + ('categorical_max_absolute_distinct_values', ArgumentType.VALUE, None), + ('categorical_max_ratio_distinct_values', ArgumentType.VALUE, 0.20) + ], + 'ReplaceSemanticTypes': [ + ('return_result', ArgumentType.VALUE, 'replace'), + ('from_semantic_types', ArgumentType.VALUE, [DATA_TYPES['unknown_type']]), + ('to_semantic_types', ArgumentType.VALUE, [DATA_TYPES['categorical']]) + ], + 'OneHotMaker': [ + ('return_result', ArgumentType.VALUE, 'replace'), + ('encode_target_columns', ArgumentType.VALUE, True), + ('handle_unseen', ArgumentType.VALUE, 'column'), + ('handle_missing_value', ArgumentType.VALUE, 'column') + ], + "Imputer": [ + ('return_result', ArgumentType.VALUE, 'replace'), + ('use_semantic_types', ArgumentType.VALUE, True), + ], + 'OrdinalEncoder': [ + ('return_result', ArgumentType.VALUE, 'replace'), + ('use_semantic_types', ArgumentType.VALUE, True), + ], + 'RobustScale': [ + ('return_result', ArgumentType.VALUE, 'replace'), + ('use_semantic_types', ArgumentType.VALUE, True), + ] +} + + +class PrimitiveHandler: + def __init__(self, primitive, hyperparams=[], resolver=Resolver()): + self.primitive = primitive + self.hyperparams = hyperparams + self.resolver = resolver + + def add_produce(self, available_data, pipeline, arguments, indexes=[]): + _arguments = fix_arguments(arguments) + + hyperparams = self.hyperparams + if indexes and 'use_columns' in self.primitive.metadata.get_hyperparams().defaults(): + hyperparams = self.hyperparams + [('use_columns', ArgumentType.VALUE, indexes)] + add_primitive_step_to_pipeline(pipeline, self.primitive, _arguments, hyperparams, resolver=self.resolver) + output = run_primitive(self.primitive, prepare_arguments(available_data, arguments), hyperparams) + current_data_ref = 'steps.{}.produce'.format(len(pipeline.steps) - 1) + available_data[current_data_ref] = output + return current_data_ref + + def run_primitive(self, arguments, hyperparams=[], indexes=[]): + _hyperparams = self.hyperparams + if hyperparams: + _hyperparams = self.hyperparams + hyperparams + _hyperparams = _hyperparams if not indexes else _hyperparams + [('use_columns', ArgumentType.VALUE, indexes)] + return run_primitive(self.primitive, arguments, hyperparams) + + +class FileHandler: + + def __init__(self, resolver=Resolver()): + self.use_colummns = True + self.resolver = resolver + self.exclude_targets = None + self.problem_description = None + self.task_description = None + + def add_produce(self, available_data, pipeline, arguments, indexes=[]): + last_valid_data_ref = arguments['inputs'] + origindal_data_ref = last_valid_data_ref + current_data_ref = self.add_output_time_series(available_data, pipeline, arguments, indexes=[]) + + if current_data_ref is not None: + arguments = {'inputs': current_data_ref} + last_valid_data_ref = current_data_ref + + current_data_ref = self.add_output_text(available_data, pipeline, arguments, indexes=[]) + + if current_data_ref is not None: + arguments = {'inputs': current_data_ref} + last_valid_data_ref = 
current_data_ref + + if last_valid_data_ref == origindal_data_ref: + last_valid_data_ref = None + + return True, last_valid_data_ref + + def add_output_time_series(self, available_data, pipeline, arguments, indexes=[]): + initial_ref = arguments['inputs'] + semantic_types = get_semantic_types(available_data[initial_ref]) + indexes_to_remove = [] + for i, _type in enumerate(semantic_types): + if DATA_TYPES['file'] in _type and DATA_TYPES['time_series'] in _type: + indexes_to_remove.append(i) + if not indexes_to_remove: + return + print('File TimeSeriesHandler') + primitive_handler = PrimitiveHandler(LOADED_PRIMITIVES['TimeSeriesToList'], resolver=self.resolver) + current_data_ref = primitive_handler.add_produce(available_data, pipeline, arguments) + + primitive_handler = PrimitiveHandler(LOADED_PRIMITIVES['TimeSeriesFeaturization'], resolver=self.resolver) + current_data_ref_to_concat = primitive_handler.add_produce(available_data, pipeline, {'inputs': current_data_ref}) + + drop_hp = [('columns', ArgumentType.VALUE, indexes_to_remove)] + primitive_handler = PrimitiveHandler(LOADED_PRIMITIVES['DropColumns'], drop_hp, resolver=self.resolver) + current_data_ref = primitive_handler.add_produce(available_data, pipeline, {'inputs': initial_ref}) + + data_refs_to_concat = [current_data_ref, current_data_ref_to_concat] + primitive_handler = PrimitiveHandler(LOADED_PRIMITIVES['GeneralHorizontalConcat'], resolver=self.resolver) + last_data_ref = primitive_handler.add_produce(available_data, pipeline, {'inputs': data_refs_to_concat}) + return last_data_ref + + def add_output_text(self, available_data, pipeline, arguments, indexes=[]): + initial_ref = arguments['inputs'] + semantic_types = get_semantic_types(available_data[initial_ref]) + indexes_to_remove = [] + for i, _type in enumerate(semantic_types): + if DATA_TYPES['file'] in _type and DATA_TYPES['text'] in _type: + indexes_to_remove.append(i) + if not indexes_to_remove: + return + + print('File TextReader Handler') + text_rd_hp = [('return_result', ArgumentType.VALUE, 'replace')] + primitive_handler = PrimitiveHandler(LOADED_PRIMITIVES['TextReader'], text_rd_hp, resolver=self.resolver) + text_data_ref = primitive_handler.add_produce(available_data, pipeline, arguments) + + primitive_handler = PrimitiveHandler(LOADED_PRIMITIVES['ExtractColumnsBySemanticTypes'], + [('semantic_types', ArgumentType.VALUE, [DATA_TYPES['attribute']])], + self.resolver) + attributes_data_ref = primitive_handler.add_produce(available_data, pipeline, {'inputs': text_data_ref}) + + primitive_handler = PrimitiveHandler(LOADED_PRIMITIVES['ExtractColumnsBySemanticTypes'], + [('semantic_types', ArgumentType.VALUE, [DATA_TYPES['target']])], + self.resolver) + target_data_ref = primitive_handler.add_produce(available_data, pipeline, {'inputs': text_data_ref}) + + primitive_handler = PrimitiveHandler(LOADED_PRIMITIVES['TextEncoder'], + [('encoder_type', ArgumentType.VALUE, 'tfidf')], + self.resolver) + current_data_ref = primitive_handler.add_produce( + available_data, pipeline, {'inputs': attributes_data_ref, 'outputs': target_data_ref}) + + no_semantic_types = [] + for i in range(available_data[current_data_ref].metadata.query((ALL_ELEMENTS,))['dimension']['length']): + if 'semantic_types' not in available_data[current_data_ref].metadata.query((ALL_ELEMENTS, i,)) and \ + available_data[current_data_ref].metadata.query((ALL_ELEMENTS, i,))['structural_type'] == numpy.float64: + no_semantic_types.append(i) + + add_semantic_hp = [('columns', ArgumentType.VALUE, 
no_semantic_types), + ('semantic_types', ArgumentType.VALUE, [DATA_TYPES['float'], DATA_TYPES['attribute']])] + primitive_handler = PrimitiveHandler(LOADED_PRIMITIVES['AddSemanticTypes'], add_semantic_hp, resolver=self.resolver) + current_data_ref = primitive_handler.add_produce(available_data, pipeline, {'inputs': current_data_ref}) + + data_refs_to_concat = [target_data_ref, current_data_ref] + primitive_handler = PrimitiveHandler(LOADED_PRIMITIVES['GeneralHorizontalConcat'], resolver=self.resolver) + last_data_ref = primitive_handler.add_produce(available_data, pipeline, {'inputs': data_refs_to_concat}) + + return last_data_ref + + +class CategoricalHandler: + def __init__(self, resolver=Resolver()): + self.use_colummns = True + self.resolver = resolver + self.exclude_targets = None + self.problem_description = None + self.task_description = None + + def _get_criteria(self, input_data, indexes=[]): + index_to_ordinal = [] + index_to_drop = [] + index_to_one_hot = [] + + total_n_values = len(input_data) + for _index in indexes: + n_categories = len(input_data.iloc[:, _index].unique()) + categories_ratio = n_categories/total_n_values + if categories_ratio >= 0.8: + index_to_drop.append(_index) + else: + if n_categories <= 10: + index_to_one_hot.append(_index) + else: + if n_categories <= 100 and not input_data.iloc[:, _index].isnull().values.any(): + index_to_ordinal.append(_index) + else: + index_to_drop.append(_index) + return index_to_ordinal, index_to_one_hot, index_to_drop + + def add_produce(self, available_data, pipeline, arguments, indexes=[]): + index_to_ordinal, index_to_one_hot, index_to_drop = self._get_criteria( + available_data[arguments['inputs']], indexes) + _arguments = fix_arguments(arguments) + current_data_ref = arguments['inputs'] + + index_to_drop += index_to_ordinal + + if index_to_drop: + print('Drop columns', index_to_drop) + drop_hp = [('columns', ArgumentType.VALUE, index_to_drop)] + primitive_handler = PrimitiveHandler(LOADED_PRIMITIVES['DropColumns'], drop_hp, resolver=self.resolver) + current_data_ref = primitive_handler.add_produce(available_data, pipeline, {'inputs': current_data_ref}) + + if index_to_one_hot: + new_indexes = index_to_operate(available_data[current_data_ref], DATA_TYPES['categorical'], self.exclude_targets) + _, index_to_one_hot, _ = self._get_criteria(available_data[current_data_ref], new_indexes) + print('OneHot', index_to_one_hot) + + one_hot_hp = DEFAULT_HYPERPARAMS['OneHotMaker'] + [('use_columns', ArgumentType.VALUE, index_to_one_hot)] + primitive_handler = PrimitiveHandler(LOADED_PRIMITIVES['OneHotMaker'], one_hot_hp, resolver=self.resolver) + current_data_ref = primitive_handler.add_produce(available_data, pipeline, {'inputs': current_data_ref}) + + # if index_to_ordinal: + # new_indexes = index_to_operate(available_data[current_data_ref], DATA_TYPES['categorical'], + # self.exclude_targets) + # index_to_ordinal, _, _ = self._get_criteria(available_data[current_data_ref], new_indexes) + # primitive = LOADED_PRIMITIVES['OrdinalEncoder'] + # ordinal_hp = DEFAULT_HYPERPARAMS['OrdinalEncoder'] + [('use_columns', ArgumentType.VALUE, index_to_ordinal)] + # add_primitive_step_to_pipeline(pipeline, primitive, _arguments, ordinal_hp, resolver=self.resolver) + # output = run_primitive(primitive, prepare_arguments(available_data, arguments), ordinal_hp) + # current_data_ref = 'steps.{}.produce'.format(len(pipeline.steps) - 1) + # available_data[current_data_ref] = output + # arguments = {'inputs': current_data_ref} + # _arguments = 
fix_arguments(arguments) + # + # cat_indexes = get_indexes_by_semantic_type(available_data[current_data_ref], DATA_TYPES['categorical']) + # index_to_fix = [] + # for _index in cat_indexes: + # if available_data[current_data_ref].metadata.query((ALL_ELEMENTS, _index,))['structural_type'] == numpy.float64: + # index_to_fix.append(_index) + # + # if index_to_fix: + # primitive = LOADED_PRIMITIVES['ReplaceSemanticTypes'] + # replace_sem_hp = [ + # ('return_result', ArgumentType.VALUE, 'replace'), + # ('from_semantic_types', ArgumentType.VALUE, [DATA_TYPES['categorical']]), + # ('to_semantic_types', ArgumentType.VALUE, [DATA_TYPES['float']]), + # ('use_columns', ArgumentType.VALUE, index_to_fix) + # ] + # add_primitive_step_to_pipeline(pipeline, primitive, _arguments, replace_sem_hp, resolver=self.resolver) + # output = run_primitive(primitive, prepare_arguments(available_data, arguments), replace_sem_hp) + # current_data_ref = 'steps.{}.produce'.format(len(pipeline.steps) - 1) + # available_data[current_data_ref] = output + return True, current_data_ref + + +class BooleanHandler: + def __init__(self, resolver=Resolver()): + self.use_colummns = True + self.resolver = resolver + self.exclude_targets = None + self.problem_description = None + self.task_description = None + + def add_produce(self, available_data, pipeline, arguments, indexes=[]): + indexes = index_to_operate(available_data[arguments['inputs']], DATA_TYPES['bool'], self.exclude_targets) + if not indexes: + print("Skipping Boolean no columns to operate") + return True, None + + + replace_sem_hp = [ + ('return_result', ArgumentType.VALUE, 'replace'), + ('from_semantic_types', ArgumentType.VALUE, [DATA_TYPES['bool']]), + ('to_semantic_types', ArgumentType.VALUE, [DATA_TYPES['categorical']]), + ('use_columns', ArgumentType.VALUE, indexes) + ] + primitive_handler = PrimitiveHandler( + LOADED_PRIMITIVES['ReplaceSemanticTypes'], replace_sem_hp, resolver=self.resolver) + current_data_ref = primitive_handler.add_produce(available_data, pipeline, arguments) + + one_hot_hp = [ + ('return_result', ArgumentType.VALUE, 'replace'), + ('encode_target_columns', ArgumentType.VALUE, True), + ('handle_missing_value', ArgumentType.VALUE, 'column'), + ('use_columns', ArgumentType.VALUE, indexes) + ] + primitive_handler = PrimitiveHandler( + LOADED_PRIMITIVES['OneHotMaker'], one_hot_hp, resolver=self.resolver) + current_data_ref = primitive_handler.add_produce(available_data, pipeline, {'inputs': current_data_ref}) + return True, current_data_ref + + +class DateHandler: + def __init__(self, resolver=Resolver()): + self.use_colummns = True + self.resolver = resolver + self.exclude_targets = None + self.problem_description = None + self.task_description = None + + def add_produce(self, available_data, pipeline, arguments, indexes=[]): + indexes = [] + semantic_types = get_semantic_types(available_data[arguments['inputs']]) + for i in range(len(semantic_types)): + if DATA_TYPES['date'] in semantic_types[i] and DATA_TYPES['time'] in semantic_types[i]: + if DATA_TYPES['target'] in semantic_types[i]: + if not self.exclude_targets: + indexes.append(i) + else: + indexes.append(i) + + if not indexes: + print("Skipping Boolean no columns to operate") + return True, None + + replace_sem_hp = [ + ('return_result', ArgumentType.VALUE, 'replace'), + ('from_semantic_types', ArgumentType.VALUE, [DATA_TYPES['date'], DATA_TYPES['time']]), + ('to_semantic_types', ArgumentType.VALUE, [DATA_TYPES['float']]), + ('use_columns', ArgumentType.VALUE, indexes) + ] + 
primitive_handler = PrimitiveHandler( + LOADED_PRIMITIVES['ReplaceSemanticTypes'], replace_sem_hp, resolver=self.resolver) + current_data_ref = primitive_handler.add_produce(available_data, pipeline, arguments) + + return True, current_data_ref + + +class TextHandler: + def __init__(self, resolver=Resolver()): + self.use_colummns = True + self.resolver = resolver + self.exclude_targets = None + self.problem_description = None + self.task_description = None + + def add_produce(self, available_data, pipeline, arguments, indexes=[]): + indexes = [] + semantic_types = get_semantic_types(available_data[arguments['inputs']]) + for i in range(len(semantic_types)): + if DATA_TYPES['text'] in semantic_types[i] and not DATA_TYPES['file'] in semantic_types[i]: + if DATA_TYPES['target'] in semantic_types[i]: + if not self.exclude_targets: + indexes.append(i) + else: + indexes.append(i) + + if not indexes: + print("Skipping Text no columns to operate") + return True, None + + print('TextHandler') + primitive_handler = PrimitiveHandler(LOADED_PRIMITIVES['ExtractColumnsBySemanticTypes'], + [('semantic_types', ArgumentType.VALUE, [DATA_TYPES['attribute']])], + self.resolver) + attributes_data_ref = primitive_handler.add_produce(available_data, pipeline, arguments) + + primitive_handler = PrimitiveHandler(LOADED_PRIMITIVES['ExtractColumnsBySemanticTypes'], + [('semantic_types', ArgumentType.VALUE, [DATA_TYPES['target']])], + self.resolver) + target_data_ref = primitive_handler.add_produce(available_data, pipeline, arguments) + + primitive_handler = PrimitiveHandler(LOADED_PRIMITIVES['TextEncoder'], + [('encoder_type', ArgumentType.VALUE, 'tfidf')], + self.resolver) + current_data_ref = primitive_handler.add_produce( + available_data, pipeline, {'inputs': attributes_data_ref, 'outputs': target_data_ref}) + + no_semantic_types = [] + for i in range(available_data[current_data_ref].metadata.query((ALL_ELEMENTS,))['dimension']['length']): + if 'semantic_types' not in available_data[current_data_ref].metadata.query((ALL_ELEMENTS, i,)) and \ + available_data[current_data_ref].metadata.query((ALL_ELEMENTS, i,))[ + 'structural_type'] == numpy.float64: + no_semantic_types.append(i) + + add_semantic_hp = [('columns', ArgumentType.VALUE, no_semantic_types), + ('semantic_types', ArgumentType.VALUE, [DATA_TYPES['float'], DATA_TYPES['attribute']])] + primitive_handler = PrimitiveHandler(LOADED_PRIMITIVES['AddSemanticTypes'], add_semantic_hp, + resolver=self.resolver) + current_data_ref = primitive_handler.add_produce(available_data, pipeline, {'inputs': current_data_ref}) + + data_refs_to_concat = [target_data_ref, current_data_ref] + primitive_handler = PrimitiveHandler(LOADED_PRIMITIVES['GeneralHorizontalConcat'], resolver=self.resolver) + last_data_ref = primitive_handler.add_produce(available_data, pipeline, {'inputs': data_refs_to_concat}) + + return True, last_data_ref + + + +class DataTypesHandler: + def __init__(self, problem_description, task_description, + handlers=None, use_default_handlers=True, exclude_targets=True, resolver=Resolver()): + DEFAULT_DATA_HANDLERS = { + DATA_TYPES['float']: None, + DATA_TYPES['int']: None, + DATA_TYPES['bool']: BooleanHandler(resolver=resolver), + DATA_TYPES['categorical']: CategoricalHandler(resolver=resolver), + DATA_TYPES['date']: DateHandler(resolver=resolver), + DATA_TYPES['file']: FileHandler(resolver=resolver), + DATA_TYPES['text']: TextHandler(resolver=resolver) + } + self.problem_description = problem_description + self.task_description = task_description + 
self.resolver = resolver + self.exclude_targets = exclude_targets + if handlers is None: + self.handlers = DEFAULT_DATA_HANDLERS + else: + if use_default_handlers: + self.handlers = DEFAULT_DATA_HANDLERS + for name, handler in handlers.items(): + self.handlers[name] = handlers + else: + self.handlers = handlers + + def add_produce(self, pipeline, input_dataframe): + data_ref = 'steps.{}.produce'.format(len(pipeline.steps) - 1) + available_data = {data_ref: input_dataframe} + last_data_ref = data_ref + + use_columns = [] + not_use_columns = [] + for handler_name in self.handlers.keys(): + if self.check_use_columns_in_handler(handler_name): + use_columns.append(handler_name) + else: + not_use_columns.append(handler_name) + + last_use_column_handler_index = len(use_columns) - 1 + handler_names = use_columns + not_use_columns + last_use_column_handler_data_ref = None + data_refs_to_concat = [] + + # We execute the handler in order according to whether or not the support use_columns. + for i, handler_name in enumerate(handler_names): + print(i, handler_name) + use_columns, new_data_ref = self.execute_handler(available_data, pipeline, last_data_ref, handler_name) + if new_data_ref is not None: + last_data_ref = new_data_ref + if i == last_use_column_handler_index: + last_use_column_handler_data_ref = last_data_ref + elif i > last_use_column_handler_index: + data_refs_to_concat.append(new_data_ref) + + # we get the columns of the ones that we use by using negation of excluiding types. + # we do this if there are not_use_columns + if not_use_columns: + # get the columns that columns that were not modified or used use_columns + primitive_handler = PrimitiveHandler( + LOADED_PRIMITIVES['ExtractColumnsBySemanticTypes'], + [('semantic_types', ArgumentType.VALUE, not_use_columns), ('negate', ArgumentType.VALUE, True)], + self.resolver) + new_data_ref = primitive_handler.add_produce(available_data, pipeline, {'inputs': last_use_column_handler_data_ref}) + data_refs_to_concat.insert(0, new_data_ref) + + # We concatenate all together + primitive_handler = PrimitiveHandler(LOADED_PRIMITIVES['GeneralHorizontalConcat'], resolver=self.resolver) + last_data_ref = primitive_handler.add_produce(available_data, pipeline, {'inputs': data_refs_to_concat}) + + return available_data[last_data_ref], pipeline + + def check_use_columns_in_handler(self, handler_name): + use_columns = True + if self.handlers[handler_name] is not None: + if isinstance(self.handlers[handler_name], PrimitiveHandler): + use_columns = 'use_columns' in self.handlers[handler_name].primitive.metadata.get_hyperparams().defaults() + else: + use_columns = self.handlers[handler_name].use_colummns + return use_columns + + def execute_handler(self, available_data, pipeline, data_ref, handler_name): + new_data_ref = None + use_columns = False + if self.handlers[handler_name] is not None: + if isinstance(self.handlers[handler_name], PrimitiveHandler): + use_columns, new_data_ref = self._execute_primitive_handler(available_data, pipeline, data_ref, handler_name) + else: + self.handlers[handler_name].exclude_targets = self.exclude_targets + self.handlers[handler_name].problem_description = self.problem_description + self.handlers[handler_name].task_description = self.task_description + indexes = self._index_to_operate(available_data[data_ref], handler_name) + if indexes: + use_columns, new_data_ref = self.handlers[handler_name].add_produce( + available_data, pipeline, {'inputs': data_ref}, indexes) + else: + print('Skipping', handler_name) + return 
use_columns, new_data_ref + + def _index_to_operate(self, input_data, data_type): + indexes = [] + semantic_types = get_semantic_types(input_data) + for i in range(len(semantic_types)): + if data_type in semantic_types[i]: + if DATA_TYPES['target'] in semantic_types[i]: + if not self.exclude_targets: + indexes.append(i) + else: + indexes.append(i) + return indexes + + def _execute_primitive_handler(self, available_data, pipeline, data_ref, handler_name): + use_columns = 'use_columns' in self.handlers[handler_name].primitive.metadata.get_hyperparams().defaults() + indexes = self._index_to_operate(available_data[data_ref],handler_name) + # if no columns to operate, return + if not indexes: + return [], None + + if use_columns: + new_data_ref = self.handlers[handler_name].add_produce( + available_data, pipeline, {'inputs': data_ref}, indexes) + else: + # get the columns with specific semnatic types and then we run the primitive with the inputs + primitive_handler = PrimitiveHandler( + LOADED_PRIMITIVES['ExtractColumnsBySemanticTypes'], [('columns', ArgumentType.VALUE, indexes)], self.resolver) + new_data_ref = primitive_handler.add_produce(available_data, pipeline, {'inputs': data_ref}) + new_data_ref = self.handlers[handler_name].add_produce( + available_data, pipeline, {'inputs': available_data[new_data_ref]}, indexes) + return use_columns, new_data_ref + + +class Preprocessing: + def __init__(self, problem_description, task_description, *, primitives_blocklist=None, resolver=None): + self.problem_description = problem_description + self.task_description = task_description + self.primitives_blocklist = [] if primitives_blocklist is None else primitives_blocklist + self.resolver = Resolver(primitives_blocklist=primitives_blocklist) if resolver is None else resolver + + self.profile_pipeline = None + self.parsed_pipeline = None + self.featurization_pipeline = None + self.imputed_pipeline = None + self.feature_selection_pipeline = None + self.dataframe_data = None + self.dataframe_reference = None + + def get_imputed_pipline(self, input_data, pipeline=None, handler=None): + if pipeline is None: + pipeline = copy.deepcopy(self.featurization_pipeline) + if handler is None: + self.imputed_pipeline = pipeline + return + if not input_data.isnull().values.any(): + print('No Nan Values found') + self.imputed_pipeline = pipeline + return + + current_data_ref = 'steps.{}.produce'.format(len(pipeline.steps) - 1) + available_data = {current_data_ref: input_data} + current_data_ref = handler.add_produce(available_data, pipeline, {'inputs': current_data_ref}) + self.dataframe_data = available_data[current_data_ref] + self.imputed_pipeline = pipeline + + def get_feature_selection_pipeline(self, input_data, pipeline=None, handler=None): + if pipeline is None: + pipeline = copy.deepcopy(self.imputed_pipeline) + if handler is None: + self.feature_selection_pipeline = pipeline + return + current_data_ref = 'steps.{}.produce'.format(len(pipeline.steps) - 1) + available_data = {current_data_ref: input_data} + current_data_ref = handler.add_produce(available_data, pipeline, {'inputs': current_data_ref}) + self.dataframe_data = available_data[current_data_ref] + self.feature_selection_pipeline = pipeline + + def get_data_handler_pipeline(self, input_data, pipeline=None): + if pipeline is None: + pipeline = copy.deepcopy(self.parsed_pipeline) + type_handler = DataTypesHandler(self.problem_description, self.task_description) + self.dataframe_data, self.featurization_pipeline = type_handler.add_produce(pipeline, 
input_data) + + def get_parsed_dataframe(self, input_data, pipeline=None): + if pipeline is None: + pipeline = copy.deepcopy(self.profile_pipeline) + current_data_ref = 'steps.{}.produce'.format(len(pipeline.steps) - 1) + available_data = {current_data_ref: input_data} + primitive_handler = PrimitiveHandler(LOADED_PRIMITIVES['ColumnParser'], DEFAULT_HYPERPARAMS['ColumnParser'], self.resolver) + current_data_ref = primitive_handler.add_produce(available_data, pipeline, {'inputs': current_data_ref}) + self.dataframe_data = available_data[current_data_ref] + self.parsed_pipeline = pipeline + + def get_dataset_to_dataframe_pipeline(self, input_data, pipeline=None): + if pipeline is None: + pipeline = Pipeline() + pipeline.add_input('input_data') + current_data_ref = 'inputs.0' + available_data = {} + + if len(input_data) > 1: + raise ValueError('Search with multiple inputs is not supported yet.') + _input_data, _ = runtime_module.Runtime._mark_columns(self.problem_description.get('inputs', []), input_data[-1]) + available_data[current_data_ref] = _input_data + + # Add denormalize + if len(_input_data.keys()) > 1: + print('There are multiple resources, adding denormalize') + primitive_handler = PrimitiveHandler(LOADED_PRIMITIVES['Denormalize'], resolver=self.resolver) + current_data_ref = primitive_handler.add_produce(available_data, pipeline, {'inputs': current_data_ref}) + + # Add dataset to dataframe + print('Adding dataset to dataframe') + primitive_handler = PrimitiveHandler(LOADED_PRIMITIVES['DatasetToDataFrame'], resolver=self.resolver) + current_data_ref = primitive_handler.add_produce(available_data, pipeline, {'inputs': current_data_ref}) + + # add profiling + index_to_profile = get_index_data_to_profile(available_data[current_data_ref]) + if index_to_profile: + current_data_ref = self.profile_data(available_data, pipeline, current_data_ref, index_to_profile) + + self.dataframe_reference = current_data_ref + self.dataframe_data = available_data[current_data_ref] + self.profile_pipeline = pipeline + + def profile_data(self, available_data, pipeline, data_ref, index_to_profile): + # Thi sfunction helps to abstract the process when the data is profiled. + target_indexes = get_indexes_by_semantic_type(available_data[data_ref], DATA_TYPES['target']) + + primitive_handler = PrimitiveHandler(LOADED_PRIMITIVES['SimpleProfiler'], DEFAULT_HYPERPARAMS['SimpleProfiler'], + self.resolver) + profiled_output = primitive_handler.run_primitive({'inputs': available_data[data_ref]}, + indexes=index_to_profile) + profiles_semantic_types = get_semantic_types(profiled_output) + + # TODO make a list of tasks that has discrete target + # If the task is classification we need to make sure that the targets are categorical, + # otherwise there is a chance that the targets are considered as numerical an wrongly parse. 
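+        # Descriptive sketch of the re-typing step below: for classification tasks, any
+        # target column that the profiler would not tag as CategoricalData is removed from
+        # `index_to_profile` (so it keeps its UnknownType annotation through the profiler
+        # step) and collected in `categorical_indexes`; those columns are then forced to
+        # CategoricalData via ReplaceSemanticTypes, whose default hyperparams here map
+        # UnknownType -> CategoricalData.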
+ categorical_indexes = [] + if self.task_description['task_type'] == 'CLASSIFICATION': + for i in target_indexes: + if DATA_TYPES['categorical'] not in profiles_semantic_types[i]: + index_to_profile.remove(i) + categorical_indexes.append(i) + current_data_ref = primitive_handler.add_produce(available_data, pipeline, {'inputs': data_ref}, + indexes=index_to_profile) + if categorical_indexes: + primitive_handler = PrimitiveHandler(LOADED_PRIMITIVES['ReplaceSemanticTypes'], + DEFAULT_HYPERPARAMS['ReplaceSemanticTypes'], self.resolver) + current_data_ref = primitive_handler.add_produce(available_data, pipeline, {'inputs': current_data_ref}, + indexes=categorical_indexes) + + return current_data_ref + + def generate_preprocessing_by_step(self, input_data=None, feature_selection_handler=None, impute_handler=None): + if self.profile_pipeline is None: + print('=' * 100) + print('profiled pipeline') + self.get_dataset_to_dataframe_pipeline(input_data) + return [] + elif self.parsed_pipeline is None: + print('=' * 100) + print('parsing') + self.get_parsed_dataframe(self.dataframe_data) + self.dataframe_data.metadata.pretty_print() + return [] + elif self.featurization_pipeline is None: + print('=' * 100) + print('feature') + self.get_data_handler_pipeline(self.dataframe_data) + return [] + elif self.imputed_pipeline is None: + print('=' * 100) + print('Imputer') + self.get_imputed_pipline(self.dataframe_data, handler=impute_handler) + return [] + elif self.feature_selection_pipeline is None: + print('=' * 100) + print('selection') + self.get_feature_selection_pipeline(self.dataframe_data, handler=feature_selection_handler) + print(self.dataframe_data) + self.dataframe_data.metadata.pretty_print() + return [] + + +class DataDrivenSearch(PipelineSearchBase): + def __init__(self, problem_description, backend, *, primitives_blocklist=None, + ranking_function=None, hyperparameter_tuner=BayesianSearch, n_workers=1): + super().__init__(problem_description=problem_description, backend=backend, + primitives_blocklist=primitives_blocklist, ranking_function=ranking_function) + if self.ranking_function is None: + self.ranking_function = dummy_ranking_function + + self.task_description = schemas_utils.get_task_description(self.problem_description['problem']['task_keywords']) + self.resolver = Resolver(primitives_blocklist=self.primitives_blocklist) + + print(self.task_description) + print(self.problem_description['problem']) + + self.preprocessing = Preprocessing(self.problem_description, self.task_description, + primitives_blocklist=self.primitives_blocklist) + self.preprocessing_handlers = None + self.max_num_pipelines_to_eval = n_workers + print('max_num_pipelines_to_eval', self.max_num_pipelines_to_eval) + # self.max_num_pipelines_to_eval = 1 + + self.search_started = False + self.total_time = None + self.learner_candidates = None + self.failed_learner = [] + self.successful_learner = [] + # TODO update this to be defined on problem/metrics terms + self.data_preparation_pipeline = schemas_utils.get_splitting_pipeline("TRAINING_DATA") + self.metrics = self.problem_description['problem']['performance_metrics'] + + self.scoring_pipeline = schemas_utils.get_scoring_pipeline() + self.data_preparation_params = schemas_utils.DATA_PREPARATION_PARAMS['no_split'] + + self.tuner_enable = False + self.hyperparameter_tunner_init = False + self.hyperparameter_tunner = hyperparameter_tuner( + self.problem_description, self.backend, primitives_blocklist=self.primitives_blocklist, + max_trials=100000, 
directory=self.backend.scratch_dir) + self.n_pipelines_to_tune = self.max_num_pipelines_to_eval + + def _search(self, time_left): + if self.preprocessing.profile_pipeline is None: + self.preprocessing_handlers = { + 'input_data': self.input_data, + 'impute_handler': PrimitiveHandler(primitive=LOADED_PRIMITIVES['Imputer'], + hyperparams=DEFAULT_HYPERPARAMS['Imputer'], + resolver=self.resolver), + 'feature_selection_handler': PrimitiveHandler(primitive=LOADED_PRIMITIVES['RobustScale'], + hyperparams=DEFAULT_HYPERPARAMS['RobustScale'], + resolver=self.resolver), + } + if self.preprocessing.feature_selection_pipeline is None: + return self.preprocessing.generate_preprocessing_by_step(**self.preprocessing_handlers) + + if self.learner_candidates is None: + self.input_data = [shrink_dataset(self.input_data[0])] + terms_to_block = ['data_augmentation', 'data_preprocessing', 'data_cleaning', + 'data_transformation', 'evaluation', 'feature_construction', + 'feature_extraction', 'layer', 'loss_function', 'metalearning', + 'operator', 'schema_discovery', + 'd3m.primitives.semisupervised_classification.iterative_labeling.AutonBox'] + mapped_task = False + learner_candidates = pipeline_utils.filter_primitives_by_dataframe_input( + pipeline_utils.get_primitive_candidates( + self.task_description['task_type'], self.task_description['data_types'], + self.task_description['semi'], extra_block=terms_to_block) + ) + if not learner_candidates: + mapped_task = True + learner_candidates = pipeline_utils.filter_primitives_by_dataframe_input( + pipeline_utils.get_primitive_candidates( + schemas_utils.get_task_mapping(self.task_description['task_type']), + self.task_description['data_types'], self.task_description['semi'], extra_block=terms_to_block) + ) + if self.task_description['task_type'] != 'CLASSIFICATION' and \ + self.task_description['task_type'] != 'REGRESSION' and \ + learner_candidates and not mapped_task: + learner_candidates = pipeline_utils.filter_primitives_by_dataframe_input( + pipeline_utils.get_primitive_candidates( + schemas_utils.get_task_mapping(self.task_description['task_type']), self.task_description['data_types'], + self.task_description['semi'], extra_block=terms_to_block) + ) + learner_candidates + + self.learner_candidates = list(set([info[0] for info in learner_candidates])) + print(len(self.learner_candidates), self.learner_candidates) + return [] + + if len(self.learner_candidates) > len(self.failed_learner) + len(self.successful_learner): + print('Model Selection') + pipelines_to_eval = [] + for leaner_candidate in self.learner_candidates: + if len(pipelines_to_eval) >= self.max_num_pipelines_to_eval: + break + + if leaner_candidate not in self.failed_learner and leaner_candidate not in self.successful_learner: + pipeline = self.complete_pipeline(self.preprocessing.feature_selection_pipeline, leaner_candidate) + if pipeline is None: + self.failed_learner.append(leaner_candidate) + else: + print('Evaluating', leaner_candidate) + self.successful_learner.append(leaner_candidate) + pipelines_to_eval.append(pipeline) + pipeline_results = self.backend.evaluate_pipelines( + problem_description=self.problem_description, pipelines=pipelines_to_eval, input_data=self.input_data, + metrics=self.metrics, data_preparation_pipeline=self.data_preparation_pipeline, + scoring_pipeline=self.scoring_pipeline, data_preparation_params=self.data_preparation_params) + + return [self.ranking_function(pipeline_result) for pipeline_result in pipeline_results] + + if not self.hyperparameter_tunner_init and 
not self.tuner_enable: + print('init tuner') + self.hyperparameter_tunner_init = True + completed_pipelines = [result for result in self.history if result.status == 'COMPLETED'] + if not completed_pipelines: + print('No pipelines to tune') + return [] + completed_pipelines.sort(key=lambda x: x.rank) + pipeline_candidates = completed_pipelines[:self.n_pipelines_to_tune] + pipeline_candidates = [candidate.pipeline for candidate in pipeline_candidates] + self.hyperparameter_tunner.set_pipeline_candidates(self.input_data, pipeline_candidates) + self.hyperparameter_tunner.init_search_space() + self.hyperparameter_tunner.input_data = self.input_data + self.tuner_enable = True + + if self.hyperparameter_tunner_init and self.tuner_enable: + return self.hyperparameter_tunner._search(time_left) + return [] + + def complete_pipeline(self, pipeline, primitive): + + def add_construct_predictions(_pipeline, _dataframe_ref, _resolver): + _data_ref = 'steps.{}.produce'.format(len(_pipeline.steps) - 1) + _arguments={'inputs': _data_ref, 'reference': _dataframe_ref} + add_primitive_step_to_pipeline( + _pipeline, LOADED_PRIMITIVES['ConstructPredictions'], fix_arguments(_arguments), resolver=_resolver) + _data_ref = 'steps.{}.produce'.format(len(_pipeline.steps) - 1) + _pipeline.add_output(_data_ref, 'output') + + new_pipeline = copy.deepcopy(pipeline) + new_pipeline.id = str(uuid.uuid4()) + new_pipeline.created = Pipeline().created + + data_ref = 'steps.{}.produce'.format(len(new_pipeline.steps) - 1) + + primitive_arguments = pipeline_utils.query_multiple_terms( + primitive.metadata, ['primitive_code', 'arguments']) + + failed = False + + if not self.task_description['semi']: + + # we check if the primitive has use_semantic_types + # if that is the case, it is straight forward to complete the pipeline + try: + if 'use_semantic_types' in primitive.metadata.get_hyperparams().defaults(): + arguments = {'inputs': data_ref} + hyperparams = [('use_semantic_types', ArgumentType.VALUE, True)] + if 'outputs' in primitive_arguments: + arguments['outputs'] = data_ref + if 'return_result' in primitive.metadata.get_hyperparams().defaults(): + hyperparams.append(('return_result', ArgumentType.VALUE, 'replace')) + + add_primitive_step_to_pipeline(new_pipeline, primitive, fix_arguments(arguments), hyperparams, self.resolver) + add_construct_predictions(new_pipeline, self.preprocessing.dataframe_reference, self.resolver) + else: + # Otherwise, we need to get the inputs and outputs via extract columns by semantic_types + # for this case, we are assuming that th interface has inputs and outputs + arguments = {'inputs': data_ref} + attributes_hyperparams = [('semantic_types', ArgumentType.VALUE, [DATA_TYPES['attribute']])] + target_hyperparams = [('semantic_types', ArgumentType.VALUE, [DATA_TYPES['target']])] + add_primitive_step_to_pipeline( + new_pipeline, LOADED_PRIMITIVES['ExtractColumnsBySemanticTypes'], fix_arguments(arguments), + attributes_hyperparams, self.resolver) + attributes_data_ref = 'steps.{}.produce'.format(len(new_pipeline.steps) - 1) + + add_primitive_step_to_pipeline( + new_pipeline, LOADED_PRIMITIVES['ExtractColumnsBySemanticTypes'], fix_arguments(arguments), + target_hyperparams, self.resolver) + targets_data_ref = 'steps.{}.produce'.format(len(new_pipeline.steps) - 1) + + arguments = {'inputs': attributes_data_ref, 'outputs': targets_data_ref} + hyperparams = [] + if 'return_result' in primitive.metadata.get_hyperparams().defaults(): + hyperparams.append(('return_result', ArgumentType.VALUE, 'replace')) + 
add_primitive_step_to_pipeline(new_pipeline, primitive, fix_arguments(arguments), hyperparams, self.resolver) + add_construct_predictions(new_pipeline, self.preprocessing.dataframe_reference, self.resolver) + except Exception as e: + print(e) + failed = True + else: + try: + print('=====task_description semi: {} estimator: {} ====='.format(self.task_description['semi'], primitive)) + arguments = {'inputs': data_ref} + attributes_hyperparams = [('semantic_types', ArgumentType.VALUE, [DATA_TYPES['attribute']])] + target_hyperparams = [('semantic_types', ArgumentType.VALUE, [DATA_TYPES['target']])] + add_primitive_step_to_pipeline( + new_pipeline, LOADED_PRIMITIVES['ExtractColumnsBySemanticTypes'], fix_arguments(arguments), + attributes_hyperparams, self.resolver) + attributes_data_ref = 'steps.{}.produce'.format(len(new_pipeline.steps) - 1) + + add_primitive_step_to_pipeline( + new_pipeline, LOADED_PRIMITIVES['ExtractColumnsBySemanticTypes'], fix_arguments(arguments), + target_hyperparams, self.resolver) + targets_data_ref = 'steps.{}.produce'.format(len(new_pipeline.steps) - 1) + + arguments = {'inputs': attributes_data_ref, 'outputs': targets_data_ref} + hyperparams = [('blackbox', ArgumentType.VALUE, primitive)] + add_primitive_step_to_pipeline(new_pipeline, LOADED_PRIMITIVES['SemiClassification'], + fix_arguments(arguments), hyperparams,self.resolver) + add_construct_predictions(new_pipeline, self.preprocessing.dataframe_reference, self.resolver) + except Exception as e: + print(e) + failed = True + + if failed: + return None + else: + return new_pipeline + + diff --git a/axolotl/axolotl/algorithms/dummy.py b/axolotl/axolotl/algorithms/dummy.py new file mode 100644 index 0000000..cc7d497 --- /dev/null +++ b/axolotl/axolotl/algorithms/dummy.py @@ -0,0 +1,87 @@ +import json +import uuid + +from d3m.metadata.pipeline import Pipeline + +from axolotl.algorithms.base import PipelineSearchBase +from axolotl.utils import schemas as schemas_utils, pipeline as pipeline_utils + + +def dummy_ranking_function(pipeline_result): + if pipeline_result.status == 'COMPLETED': + summarize_performance = schemas_utils.summarize_performance_metrics(pipeline_result.scores) + rank = schemas_utils.compute_rank(summarize_performance) + pipeline_result.rank = rank + return pipeline_result + + +class DummySearch(PipelineSearchBase): + def __init__(self, problem_description, backend, *, primitives_blocklist=None, ranking_function=None): + super().__init__(problem_description=problem_description, backend=backend, + primitives_blocklist=primitives_blocklist, ranking_function=ranking_function) + if self.ranking_function is None: + self.ranking_function = dummy_ranking_function + self.task_description = schemas_utils.get_task_description(self.problem_description['problem']['task_keywords']) + + self.available_pipelines = self._return_pipelines( + self.task_description['task_type'], self.task_description['task_subtype'], self.task_description['data_types']) + + # TODO update this to be defined on problem/metrics terms + self.data_preparation_pipeline = schemas_utils.get_splitting_pipeline("TRAINING_DATA") + self.metrics = self.problem_description['problem']['performance_metrics'] + + self.scoring_pipeline = schemas_utils.get_scoring_pipeline() + self.data_preparation_params = schemas_utils.DATA_PREPARATION_PARAMS['no_split'] + + self.offset = 10 + self.current_pipeline_index = 0 + + def _search(self, time_left): + pipelines_to_eval = self.available_pipelines[self.current_pipeline_index: 
self.current_pipeline_index+self.offset] + self.current_pipeline_index += self.offset + pipeline_results = self.backend.evaluate_pipelines( + problem_description=self.problem_description, pipelines=pipelines_to_eval, input_data=self.input_data, + metrics=self.metrics, data_preparation_pipeline=self.data_preparation_pipeline, + scoring_pipeline=self.scoring_pipeline, data_preparation_params=self.data_preparation_params) + + return [self.ranking_function(pipeline_result) for pipeline_result in pipeline_results] + + def _return_pipelines(self, task_type, task_subtype, data_type): + """ + A function that return predefined pipelines given a task type. + + Returns + ------- + A predefined pipelines if there are pipelines left, also if there is template + returns the new pipeline with the template. + + """ + # TODO incorporate task_subtype and data_type for future problems + with open(schemas_utils.PIPELINES_DB_DIR) as file: + possible_pipelines_dict = json.load(file) + + if task_type not in possible_pipelines_dict: + self.pipeline_left = False + return None + + possible_pipelines_dict = possible_pipelines_dict[task_type] + + if not possible_pipelines_dict: + return [] + + possible_pipelines = [] + for pipeline_dict in possible_pipelines_dict: + try: + pipeline = pipeline_utils.load_pipeline(pipeline_dict) + + # update id + pipeline.id = str(uuid.uuid4()) + + # update time + pipeline.created = Pipeline().created + + possible_pipelines.append(pipeline) + except Exception: + pass + + return possible_pipelines diff --git a/axolotl/axolotl/algorithms/random_search.py b/axolotl/axolotl/algorithms/random_search.py new file mode 100644 index 0000000..ef5a3d0 --- /dev/null +++ b/axolotl/axolotl/algorithms/random_search.py @@ -0,0 +1,27 @@ +import enum + +from axolotl.algorithms.tuners.random_search_oracle import RandomSearchOracle +from axolotl.algorithms.tuners.tunable_base import TunableBase + + +class RandomSearch(TunableBase): + def __init__(self, problem_description, backend, primitives_blocklist=None, + max_trials=10000, directory='.', num_eval_trials=None): + super(RandomSearch, self).__init__(problem_description, backend, + primitives_blocklist=primitives_blocklist, num_eval_trials=num_eval_trials) + self.directory = directory + self.project_name = 'random_search' + + self.objective = self.problem_description['problem']['performance_metrics'][0]['metric'] + if isinstance(self.objective, enum.Enum): + self.objective = self.objective.name + + self.oracle = RandomSearchOracle( + objective=self.objective, + max_trials=max_trials, # pre-defined number, + seed=self.random_seed, # seed + hyperparameters=self.hyperparameters, + ) + self.oracle._set_project_dir( + self.directory, self.project_name, overwrite=True) + diff --git a/axolotl/axolotl/algorithms/tuners/__init__.py b/axolotl/axolotl/algorithms/tuners/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/axolotl/axolotl/algorithms/tuners/bayesian_oracle.py b/axolotl/axolotl/algorithms/tuners/bayesian_oracle.py new file mode 100644 index 0000000..c957eae --- /dev/null +++ b/axolotl/axolotl/algorithms/tuners/bayesian_oracle.py @@ -0,0 +1,198 @@ +import numpy as np + +from scipy import optimize as scipy_optimize +from sklearn import exceptions + +from d3m.metadata import hyperparams +from kerastuner import Objective +from kerastuner.tuners.bayesian import BayesianOptimizationOracle as KerasBayesian +from kerastuner.engine import trial as trial_lib + +from axolotl.algorithms.tuners.hyperparameters import HyperParameters, \ + 
value_to_cumulative_prob, cumulative_prob_to_value +from axolotl.algorithms.tuners.oracle import infer_metric_direction, random_values, patch_invalid_hyperamaeters + + +class BayesianOptimizationOracle(KerasBayesian): + """ + Bayesian optimization oracle. + """ + + def __init__(self, + objective, + max_trials, + num_initial_points=None, + alpha=1e-4, + beta=2.6, + seed=None, + hyperparameters=None, + allow_new_entries=True, + tune_new_entries=True): + direction = infer_metric_direction(objective) + objective = Objective(name=objective, direction=direction) + super(BayesianOptimizationOracle, self).__init__( + objective=objective, + max_trials=max_trials, + num_initial_points=num_initial_points, + alpha=alpha, + beta=beta, + seed=seed, + hyperparameters=hyperparameters, + allow_new_entries=allow_new_entries, + tune_new_entries=tune_new_entries, + ) + self.num_complete_trials = 0 + self.sorted_candidates = [] + + # TODO how to save a trial + def _save_trial(self, trial): + pass + + def get_state(self): + # `self.trials` are saved in their own, Oracle-agnostic files. + # Just save the IDs for ongoing trials, since these are in `trials`. + state = {} + state['ongoing_trials'] = { + tuner_id: trial.trial_id + for tuner_id, trial in self.ongoing_trials.items()} + # Hyperparameters are part of the state because they can be added to + # during the course of the search. + state['hyperparameters'] = str(self.hyperparameters.get_config()) + + state.update({ + 'num_initial_points': self.num_initial_points, + 'alpha': self.alpha, + 'beta': self.beta, + }) + return state + + def _random_values(self): + """Fills the hyperparameter space with random values. + + Returns: + A dictionary mapping parameter names to suggested values. + """ + + values, seed_state = random_values(hyperparameters=self.hyperparameters, + seed_state=self._seed_state, + tried_so_far=self._tried_so_far, + max_collisions=self._max_collisions, + ) + self._seed_state = seed_state + return values + + def _nonfixed_space(self): + return [hp for hp in self.hyperparameters.space + if not isinstance(hp, hyperparams.Constant)] + + def _vector_to_values(self, vector): + hps = HyperParameters() + vector_index = 0 + for hp in self.hyperparameters.space: + hps.merge([hp]) + if isinstance(hp, hyperparams.Constant): + value = hp.get_default() + else: + prob = vector[vector_index] + vector_index += 1 + value = cumulative_prob_to_value(prob, hp) + + if hps.is_active(hp): + hps.values[hp.name] = value + patch_invalid_hyperamaeters(hps) + return hps.values + + def _vectorize_trials(self): + x = [] + y = [] + ongoing_trials = {t for t in self.ongoing_trials.values()} + for trial in self.trials.values(): + # Create a vector representation of each Trial's hyperparameters. + trial_hps = trial.hyperparameters + vector = [] + for hp in self._nonfixed_space(): + # For hyperparameters not present in the trial (either added after + # the trial or inactive in the trial), set to default value. + if trial_hps.is_active(hp): + trial_value = trial_hps.values[hp.name] + else: + trial_value = hp.default + + # Embed an HP value into the continuous space [0, 1]. + prob = value_to_cumulative_prob(trial_value, hp) + vector.append(prob) + + if trial in ongoing_trials: + # "Hallucinate" the results of ongoing trials. This ensures that + # repeat trials are not selected when running distributed. + x_h = np.array(vector).reshape((1, -1)) + y_h_mean, y_h_std = self.gpr.predict(x_h, return_std=True) + # Give a pessimistic estimate of the ongoing trial. 
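+                # The optimization below is framed as minimization (scores for
+                # max-metrics are negated), so mean + std is an upper confidence
+                # bound: a deliberately bad estimate that keeps the oracle from
+                # re-proposing a point whose trial is still running.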
+ score = y_h_mean[0] + y_h_std[0] + elif trial.status == 'COMPLETED': + score = trial.score + # Always frame the optimization as a minimization for scipy.minimize. + if self.objective.direction == 'max': + score = -1*score + else: + continue + + x.append(vector) + y.append(score) + + x = np.array(x) + y = np.array(y) + return x, y + + def _populate_space(self, trial_id): + # Generate enough samples before training Gaussian process. + completed_trials = [t for t in self.trials.values() + if t.status == 'COMPLETED'] + + # Use 3 times the dimensionality of the space as the default number of + # random points. + dimensions = len(self.hyperparameters.space) + num_initial_points = self.num_initial_points or 3 * dimensions + if len(completed_trials) < num_initial_points: + return self._random_populate_space() + + if self.num_complete_trials == len(completed_trials) and len(self.sorted_candidates) > 0: + optimal_x = self.sorted_candidates.pop().x + values = self._vector_to_values(optimal_x) + return {'status': trial_lib.TrialStatus.RUNNING, + 'values': values} + + # track the number of complete trials + self.num_complete_trials = len(completed_trials) + + # Fit a GPR to the completed trials and return the predicted optimum values. + x, y = self._vectorize_trials() + try: + self.gpr.fit(x, y) + except exceptions.ConvergenceWarning: + # If convergence of the GPR fails, create a random trial. + return self._random_populate_space() + + def _upper_confidence_bound(x): + x = x.reshape(1, -1) + mu, sigma = self.gpr.predict(x, return_std=True) + return mu - self.beta * sigma + + num_restarts = 50 + bounds = self._get_hp_bounds() + x_seeds = self._random_state.uniform(bounds[:, 0], bounds[:, 1], + size=(num_restarts, bounds.shape[0])) + candidates = [ + scipy_optimize.minimize(_upper_confidence_bound, + x0=x_try, + bounds=bounds, + method='L-BFGS-B') + for x_try in x_seeds + ] + + self.sorted_candidates = sorted(candidates, key=lambda x: x.fun[0], reverse=True) + optimal_x = self.sorted_candidates.pop().x + + values = self._vector_to_values(optimal_x) + return {'status': trial_lib.TrialStatus.RUNNING, + 'values': values} diff --git a/axolotl/axolotl/algorithms/tuners/custom_hps.py b/axolotl/axolotl/algorithms/tuners/custom_hps.py new file mode 100644 index 0000000..e86fcdd --- /dev/null +++ b/axolotl/axolotl/algorithms/tuners/custom_hps.py @@ -0,0 +1,535 @@ +import sys +from collections import OrderedDict + +from d3m.metadata import hyperparams + +epsilon = sys.float_info.epsilon + +clf_xgboost_config = dict( + n_estimators=hyperparams.UniformInt( + lower=10, + upper=50, + default=20, + description='The number of trees in the forest.', + semantic_types=[ + 'https://metadata.datadrivendiscovery.org/types/TuningParameter', + 'https://metadata.datadrivendiscovery.org/types/ResourcesUseParameter', + ], + ), + n_more_estimators=hyperparams.UniformInt( + lower=10, + upper=50, + default=20, + description='When continuing a fit, it controls how many more trees to add every time.', + semantic_types=[ + 'https://metadata.datadrivendiscovery.org/types/TuningParameter', + 'https://metadata.datadrivendiscovery.org/types/ResourcesUseParameter', + ], + ), + max_depth=hyperparams.UniformInt( + lower=5, + upper=50, + default=30, + lower_inclusive=True, + upper_inclusive=True, + description='The maximum depth of the tree.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + learning_rate=hyperparams.LogUniform( + lower=1e-4, + upper=1e-1, + default=0.05, + lower_inclusive=True, + 
upper_inclusive=True, + description=r'Boosting learning rate (xgb\`s \"eta\")', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + gamma=hyperparams.Constant[float]( + default=0.0, + description='Minimum loss reduction required to make a further partition on a leaf node of the tree', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + min_child_weight = hyperparams.Constant[int]( + default=1, + description='Minimum sum of instance weight (hessian) needed in a child. If the tree partition step results ' + 'in a leaf node with the sum of instance weight less than min_child_weight, then the building ' + 'process will give up further partitioning ', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + # max_delta_step = hyperparams.Union[Union[int, None]]( + # configuration=OrderedDict( + # limit=hyperparams.Bounded[int]( + # lower=1, + # upper=None, + # default=1, + # description='Maximum delta step we allow each leaf output to be.' + # ), + # unlimited=hyperparams.Enumeration[int]( + # values=[0], + # default=0, + # description='No constraint.', + # ), + # ), + # default='unlimited', + # description='Maximum delta step we allow.', + # semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + # ), + subsample=hyperparams.Constant[float]( + default=1.0, + description='Subsample ratio of the training instances,this will prevent overfitting. Subsampling will occur ' + 'once in every boosting iteration.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + colsample_bytree=hyperparams.Constant[float]( + default=1.0, + description='Subsample ratio of columns when constructing each tree. Subsampling will occur once in every ' + 'boosting iteration', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + colsample_bylevel=hyperparams.Constant[float]( + default=1.0, + description='Subsample ratio of columns for each split, in each level. Subsampling will occur each time a new ' + 'split is made', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + reg_alpha=hyperparams.Uniform( + lower=0.1, + upper=1.0, + default=0.5, + lower_inclusive=True, + upper_inclusive=True, + description='L1 regularization term on weights. Increasing this value will make model more conservative.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + reg_lambda=hyperparams.Uniform( + lower=0.1, + upper=1.0, + default=0.5, + lower_inclusive=True, + upper_inclusive=True, + description='L2 regularization term on weights. 
Increasing this value will make model more conservative.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + # scale_pos_weight = hyperparams.Bounded[float]( + # lower=0, + # upper=None, + # default=1, + # description='Control the balance of positive and negative weights, useful for unbalanced classes', + # semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + # ), + base_score=hyperparams.Bounded[float]( + lower=0, + upper=None, + default=0.5, + description='The initial prediction score of all instances, global bias.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), +) + +dfs_single_tab_config = dict( + max_percent_null=hyperparams.Uniform( + lower=0, + upper=1, + default=0.9, + lower_inclusive=True, + upper_inclusive=True, + description='The maximum allowed correlation between any two features returned. A lower value means features will be more uncorrelated', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + +) + +lgbm_clf_config = dict( + n_estimators=hyperparams.UniformInt( + lower=10, + upper=50, + default=20, + description='The number of trees in the forest.', + semantic_types=[ + 'https://metadata.datadrivendiscovery.org/types/TuningParameter', + 'https://metadata.datadrivendiscovery.org/types/ResourcesUseParameter', + ], + ), + n_more_estimators=hyperparams.UniformInt( + lower=10, + upper=50, + default=20, + description='When continuing a fit, it controls how many more trees to add every time.', + semantic_types=[ + 'https://metadata.datadrivendiscovery.org/types/TuningParameter', + 'https://metadata.datadrivendiscovery.org/types/ResourcesUseParameter', + ], + ), + max_depth=hyperparams.UniformInt( + lower=5, + upper=50, + default=30, + lower_inclusive=True, + upper_inclusive=True, + description='The maximum depth of the tree.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + # num_leaves_base=hyperparams.Bounded[float]( + # lower=1, + # upper=2, + # default=2, + # description='Maximum tree leaves for base learners, this value is the base of the formula num_leaves_base^(max_depth)', + # semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + # ), + # subsample_for_bin=hyperparams.Bounded[int]( + # lower=1, + # upper=None, + # default=200000, + # description='number of data that sampled to construct histogram bins', + # semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + # ), + learning_rate=hyperparams.LogUniform( + lower=1e-4, + upper=1e-1, + default=0.05, + lower_inclusive=True, + upper_inclusive=True, + description=r'Boosting learning rate (xgb\`s \"eta\")', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + min_child_weight = hyperparams.Constant[int]( + default=1, + description='Minimum sum of instance weight (hessian) needed in a child. If the tree partition step results ' + 'in a leaf node with the sum of instance weight less than min_child_weight, then the building ' + 'process will give up further partitioning ', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + # min_child_samples=hyperparams.Bounded[int]( + # lower=0, + # upper=None, + # default=20, + # description='minimal number of data in one leaf. 
Can be used to deal with over-fitting', + # semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + # ), + # max_delta_step = hyperparams.Union[Union[int, None]]( + # configuration=OrderedDict( + # limit=hyperparams.Bounded[int]( + # lower=1, + # upper=None, + # default=1, + # description='Maximum delta step we allow each leaf output to be.' + # ), + # unlimited=hyperparams.Enumeration[int]( + # values=[0], + # default=0, + # description='No constraint.', + # ), + # ), + # default='unlimited', + # description='Maximum delta step we allow.', + # semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + # ), + subsample=hyperparams.Constant[float]( + default=1.0, + description='Subsample ratio of the training instances,this will prevent overfitting. Subsampling will occur ' + 'once in every boosting iteration.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + subsample_freq=hyperparams.Bounded[int]( + lower=0, + upper=1, + default=0, + description='frequency for bagging', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + colsample_bytree=hyperparams.Constant[float]( + default=1.0, + description='Subsample ratio of columns when constructing each tree. Subsampling will occur once in every ' + 'boosting iteration', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + min_split_gain=hyperparams.Bounded[float]( + lower=0, + upper=None, + default=0, + description='the minimal gain to perform split', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + reg_alpha=hyperparams.Uniform( + lower=0.1, + upper=1.0, + default=0.5, + lower_inclusive=True, + upper_inclusive=True, + description='L1 regularization term on weights. Increasing this value will make model more conservative.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + reg_lambda=hyperparams.Uniform( + lower=0.1, + upper=1.0, + default=0.5, + lower_inclusive=True, + upper_inclusive=True, + description='L2 regularization term on weights. Increasing this value will make model more conservative.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), +) + +sk_logistic_regression_config = dict( + dual=hyperparams.Constant[bool]( + default=False, + description='Dual or primal formulation. Dual formulation is only implemented for l2 penalty with liblinear solver. 
Prefer dual=False when n_samples > n_features.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ), + penalty=hyperparams.Choice( + choices={ + 'l1': hyperparams.Hyperparams.define( + configuration=OrderedDict({}) + ), + 'l2': hyperparams.Hyperparams.define( + configuration=OrderedDict({}) + ), + 'none': hyperparams.Hyperparams.define( + configuration=OrderedDict({}) + ), + 'elasticnet': hyperparams.Hyperparams.define( + configuration=OrderedDict({ + 'l1_ratio': hyperparams.Union( + configuration=OrderedDict({ + 'float': hyperparams.Uniform( + lower=0, + upper=1, + default=0.001, + lower_inclusive=True, + upper_inclusive=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + # 'l1_ratio must be between 0 and 1; got (l1_ratio=None)' + # 'none': hyperparams.Constant( + # default=None, + # semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + # ) + }), + default='float', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + }) + ) + }, + default='l2', + description='Used to specify the norm used in the penalization. The \'newton-cg\', \'sag\' and \'lbfgs\' solvers support only l2 penalties.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ), + intercept_scaling=hyperparams.Constant[float]( + default=1, + description='Useful only when the solver \'liblinear\' is used and self.fit_intercept is set to True. In this case, x becomes [x, self.intercept_scaling], i.e. a "synthetic" feature with constant value equal to intercept_scaling is appended to the instance vector. The intercept becomes ``intercept_scaling * synthetic_feature_weight``. Note! the synthetic feature weight is subject to l1/l2 regularization as all other features. To lessen the effect of regularization on synthetic feature weight (and therefore on the intercept) intercept_scaling has to be increased.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ), + +) + +sk_decision_tree_clf_config = dict( + min_samples_split=hyperparams.Union( + configuration=OrderedDict({ + 'absolute': hyperparams.Constant[int]( + default=2, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'percent': hyperparams.Bounded[float]( + default=0.25, + lower=0, + upper=1, + lower_inclusive=False, + # upper_inclusive=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='absolute', + description='The minimum number of samples required to split an internal node: - If int, then consider `min_samples_split` as the minimum number. - If float, then `min_samples_split` is a percentage and `ceil(min_samples_split * n_samples)` are the minimum number of samples for each split. .. 
versionchanged:: 0.18 Added float values for percentages.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ), + max_features=hyperparams.Union( + configuration=OrderedDict({ + # max_features must be in (0, n_features] + # 'specified_int': hyperparams.Bounded[int]( + # lower=0, + # upper=None, + # default=0, + # semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + # ), + 'calculated': hyperparams.Enumeration[str]( + values=['auto', 'sqrt', 'log2'], + default='auto', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'none': hyperparams.Constant( + default=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'percent': hyperparams.Bounded[float]( + default=0.25, + lower=0, + upper=1, + lower_inclusive=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='none', + description='The number of features to consider when looking for the best split: - If int, then consider `max_features` features at each split. - If float, then `max_features` is a percentage and `int(max_features * n_features)` features are considered at each split. - If "auto", then `max_features=sqrt(n_features)`. - If "sqrt", then `max_features=sqrt(n_features)`. - If "log2", then `max_features=log2(n_features)`. - If None, then `max_features=n_features`. Note: the search for a split does not stop until at least one valid partition of the node samples is found, even if it requires to effectively inspect more than ``max_features`` features.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ), + # 'max_leaf_nodes 0 must be either None or larger than 1' + max_leaf_nodes=hyperparams.Constant( + default=None, + description='Grow a tree with ``max_leaf_nodes`` in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ), +) + +sk_sgd_clf_config = dict( + validation_fraction=hyperparams.Bounded[float]( + default=0.1, + lower=0, + upper=0.99999999999, + lower_inclusive=False, + # upper_inclusive=False, + description='The proportion of training data to set aside as validation set for early stopping. Must be between 0 and 1. Only used if early_stopping is True.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ), + # eta0 must be > 0 + eta0=hyperparams.Bounded[float]( + lower=0.0, + upper=1.0, + default=0.1, + lower_inclusive=False, + description='The initial learning rate for the \'constant\' or \'invscaling\' schedules. 
The default value is 0.0 as eta0 is not used by the default schedule \'optimal\'.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ), + + +) + +sk_random_forest_clf_config = dict( + max_features=hyperparams.Union( + configuration=OrderedDict({ + # max_features must be in (0, n_features] + # 'specified_int': hyperparams.Bounded[int]( + # lower=0, + # upper=None, + # default=0, + # semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + # ), + 'calculated': hyperparams.Enumeration[str]( + values=['auto', 'sqrt', 'log2'], + default='auto', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'none': hyperparams.Constant( + default=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'percent': hyperparams.Uniform( + default=0.25, + lower=0, + upper=1, + lower_inclusive=True, + upper_inclusive=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='calculated', + description='The number of features to consider when looking for the best split: - If int, then consider `max_features` features at each split. - If float, then `max_features` is a percentage and `int(max_features * n_features)` features are considered at each split. - If "auto", then `max_features=sqrt(n_features)`. - If "sqrt", then `max_features=sqrt(n_features)` (same as "auto"). - If "log2", then `max_features=log2(n_features)`. - If None, then `max_features=n_features`. Note: the search for a split does not stop until at least one valid partition of the node samples is found, even if it requires to effectively inspect more than ``max_features`` features.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ), + max_samples=hyperparams.Union( + configuration=OrderedDict({ + 'absolute': hyperparams.Bounded[int]( + lower=0, + upper=None, + lower_inclusive=False, + default=1, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'percent': hyperparams.Bounded[float]( + default=0.9, + lower=0 + epsilon, + upper=1, + upper_inclusive=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'none': hyperparams.Constant( + default=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='none', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ), +) + +sk_extra_tree_tree_clf_config = dict( + max_features=hyperparams.Union( + configuration=OrderedDict({ + 'calculated': hyperparams.Enumeration[str]( + values=['auto', 'sqrt', 'log2'], + default='auto', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'none': hyperparams.Constant( + default=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'percent': hyperparams.Bounded[float]( + default=0.25, + lower=0, + upper=1, + lower_inclusive=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='calculated', + description='The number of features to consider when looking for the best split: - If int, then consider `max_features` features at each split. - If float, then `max_features` is a percentage and `int(max_features * n_features)` features are considered at each split. - If "auto", then `max_features=sqrt(n_features)`. 
- If "sqrt", then `max_features=sqrt(n_features)`. - If "log2", then `max_features=log2(n_features)`. - If None, then `max_features=n_features`. Note: the search for a split does not stop until at least one valid partition of the node samples is found, even if it requires to effectively inspect more than ``max_features`` features.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ), + max_samples=hyperparams.Union( + configuration=OrderedDict({ + 'absolute': hyperparams.Bounded[int]( + lower=0, + upper=None, + lower_inclusive=False, + default=1, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'percent': hyperparams.Bounded[float]( + default=0.9, + lower=0 + epsilon, + upper=1, + upper_inclusive=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'none': hyperparams.Constant( + default=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='none', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) +) + +# To avoid the issue, https://gitlab.com/TAMU_D3M/d3m_primitives/-/issues/1 +tamu_feature_selection_config = dict( + percentage_selected_features=hyperparams.Uniform( + default=0.5, + upper=1, + lower=0.25, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + description="percentage of features to select, between 0 and 1") +) + +config = { + 'd3m.primitives.classification.xgboost_gbtree.DataFrameCommon': clf_xgboost_config, + 'd3m.primitives.feature_construction.deep_feature_synthesis.SingleTableFeaturization': dfs_single_tab_config, + 'd3m.primitives.classification.light_gbm.DataFrameCommon': lgbm_clf_config, + 'd3m.primitives.classification.logistic_regression.SKlearn': sk_logistic_regression_config, + 'd3m.primitives.classification.decision_tree.SKlearn': sk_decision_tree_clf_config, + 'd3m.primitives.classification.sgd.SKlearn': sk_sgd_clf_config, + 'd3m.primitives.classification.random_forest.SKlearn': sk_random_forest_clf_config, + 'd3m.primitives.classification.extra_trees.SKlearn': sk_extra_tree_tree_clf_config, + 'd3m.primitives.feature_selection.skfeature.TAMU': tamu_feature_selection_config, +} diff --git a/axolotl/axolotl/algorithms/tuners/hyperparameters.py b/axolotl/axolotl/algorithms/tuners/hyperparameters.py new file mode 100644 index 0000000..0da8c26 --- /dev/null +++ b/axolotl/axolotl/algorithms/tuners/hyperparameters.py @@ -0,0 +1,195 @@ +import json +import math +from scipy.stats import norm + +from d3m import utils as d3m_utils +from d3m.metadata import hyperparams +from d3m.metadata.hyperparams import HyperparameterMeta +from kerastuner.engine.hyperparameters import HyperParameters as KerasHyperparams + +PIPELINE_CHOICE = 'pipeline_choice' + + +def GET_CONFIG(param_val): + config = param_val.to_simple_structure() + config['p'] = param_val + if isinstance(param_val, hyperparams.SortedList) or isinstance(param_val, hyperparams.SortedSet): + config['is_configuration'] = param_val.is_configuration + return config + + +class HyperParameters(KerasHyperparams): + def get_config(self): + return { + 'space': [{'class_name': p.__class__.__name__, + 'config': GET_CONFIG(p)} + for p in self.space], + 'values': dict((k, v) for (k, v) in self.values.items()), + } + + def retrieve(self, name, val, parent_name=None, parent_values=None): + """Gets or creates a `HyperParameter`.""" + config = GET_CONFIG(val) + hp = config['p'] + hp.name 
= self._get_name(name) + hp.default = get_val(hp.get_default)() + hp.random_sample = get_val(hp.sample) + hp.conditions = [c for c in self._conditions] + with self._maybe_conditional_scope(parent_name, parent_values): + return self._retrieve(hp) + + def _register(self, hp): + """Registers a `HyperParameter` into this container.""" + self._hps[hp.name].append(hp) + self._space.append(hp) + value = hp.default + if self._conditions_are_active(hp.conditions): + self.values[hp.name] = value + return value + return None + + @classmethod + def from_config(cls, config): + hps = cls() + for p in config['space']: + p = p['config']['p'] + hps._hps[p.name].append(p) + hps._space.append(p) + hps.values = dict((k, v) for (k, v) in config['values'].items()) + return hps + + def copy(self): + return HyperParameters.from_config(self.get_config()) + + def __repr__(self): + return self.to_json() + + def to_json(self): + return json.dumps(self.__dict__, default=serialize) + + def _get_name_parts(self, full_name): + """Splits `full_name` into its scopes and leaf name.""" + str_parts = full_name.split('/') + parts = [] + + for part in str_parts: + if '=' in part: + parent_name, parent_values = part.split('=') + parent_values = parent_values.split(',') + parts.append({'parent_name': parent_name, + 'parent_values': parent_values}) + else: + parts.append(part) + + return parts + + def get_pipeline_id(self): + pipeline_id = self.values[PIPELINE_CHOICE] + return pipeline_id + + def get_name_parts(self, full_name): + step, primitive_name, hp_name = self._get_name_parts(full_name) + return step, primitive_name, hp_name + + +def get_val(func): + def wrapper(*args, **kwargs): + val = func(*args, **kwargs) + return val['choice'] if isinstance(val, dict) and 'choice' in val else val + return wrapper + + +def serialize(obj): + if isinstance(obj, HyperparameterMeta): + return obj.__dict__ + + +def value_to_cumulative_prob(value, hp): + """Convert a hyperparameter value to [0, 1].""" + if isinstance(hp, hyperparams.Constant): + return 0.5 + if isinstance(hp, hyperparams.UniformBool): + # Center the value in its probability bucket. + if value: + return 0.75 + return 0.25 + elif isinstance(hp, (hyperparams.Choice, hyperparams.Enumeration, hyperparams.Union)): + if isinstance(hp, hyperparams.Choice): + choices = hp.choices + index = list(choices.keys()).index(value) + elif isinstance(hp, hyperparams.Union): + choices = hp.configuration.keys() + for index, val_type in enumerate(hp.configuration.values()): + if isinstance(value, val_type.structural_type): + break + else: + choices = hp.values + index = choices.index(value) + ele_prob = 1 / len(choices) + # Center the value in its probability bucket. 
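+        # With N choices the unit interval splits into N equal buckets and
+        # choice i maps to its bucket midpoint (i + 0.5) / N, which keeps the
+        # mapping invertible by cumulative_prob_to_value below.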
+ return (index + 0.5) * ele_prob + elif isinstance(hp, (hyperparams.UniformInt, hyperparams.Uniform, hyperparams.Bounded)): + lower, upper = hp.lower, hp.upper + if lower is None or upper is None: + return 0.5 + return (value - lower) / (upper - lower) + elif isinstance(hp, hyperparams.LogUniform): + lower, upper = hp.lower, hp.upper + if lower is None or upper is None: + return 0.5 + return (math.log(value / lower) / + math.log(upper / lower)) + elif isinstance(hp, (hyperparams.Normal, hyperparams.LogNormal)): + return norm.cdf(value, hp.mu, hp.sigma) + else: + raise ValueError('Unrecognized HyperParameter type: {}'.format(hp)) + + +def cumulative_prob_to_value(prob, hp): + """Convert a value from [0, 1] to a hyperparameter value.""" + if isinstance(hp, hyperparams.Constant): + return hp.get_default() + elif isinstance(hp, hyperparams.UniformBool): + return bool(prob >= 0.5) + elif isinstance(hp, (hyperparams.Choice, hyperparams.Enumeration, hyperparams.Union)): + if isinstance(hp, hyperparams.Choice): + choices = list(hp.choices.keys()) + elif isinstance(hp, hyperparams.Union): + choices = list(hp.configuration.keys()) + else: + choices = hp.values + ele_prob = 1 / len(choices) + index = int(math.floor(prob / ele_prob)) + # Can happen when `prob` is very close to 1. + if index == len(choices): + index = index - 1 + if isinstance(hp, hyperparams.Union): + key = choices[index] + with d3m_utils.silence(): + val = hp.configuration[key].sample() + return val + return choices[index] + elif isinstance(hp, (hyperparams.UniformInt, hyperparams.Uniform, hyperparams.Bounded)): + import sys + epsilon = sys.float_info.epsilon + lower, upper = hp.lower, hp.upper + if lower is None or upper is None: + return hp.get_default() + value = prob * (upper - lower) + lower + if hp.structural_type == int: + return int(value) + if value == lower and not hp.lower_inclusive: + return value + epsilon + if value == upper and not hp.upper_inclusive: + return value - epsilon + return value + elif isinstance(hp, hyperparams.LogUniform): + lower, upper = hp.lower, hp.upper + if lower is None or upper is None: + return hp.get_default() + value = lower * math.pow(upper / lower, prob) + return value + elif isinstance(hp, (hyperparams.Normal, hyperparams.LogNormal)): + return norm.ppf(prob, loc=hp.mu, scale=hp.sigma) + else: + raise ValueError('Unrecognized HyperParameter type: {}'.format(hp)) diff --git a/axolotl/axolotl/algorithms/tuners/oracle.py b/axolotl/axolotl/algorithms/tuners/oracle.py new file mode 100644 index 0000000..7b129b1 --- /dev/null +++ b/axolotl/axolotl/algorithms/tuners/oracle.py @@ -0,0 +1,104 @@ +import os + +import hashlib +import random + +from d3m import utils as d3m_utils +from d3m.metadata import problem as problem_module +from axolotl.algorithms.tuners.hyperparameters import HyperParameters, PIPELINE_CHOICE + +_MAX_METRICS = { + problem_module.PerformanceMetric.ACCURACY, + problem_module.PerformanceMetric.PRECISION, + problem_module.PerformanceMetric.RECALL, + problem_module.PerformanceMetric.F1, + problem_module.PerformanceMetric.F1_MICRO, + problem_module.PerformanceMetric.F1_MACRO, + problem_module.PerformanceMetric.ROC_AUC, + problem_module.PerformanceMetric.JACCARD_SIMILARITY_SCORE, + problem_module.PerformanceMetric.NORMALIZED_MUTUAL_INFORMATION, # not sure + problem_module.PerformanceMetric.OBJECT_DETECTION_AVERAGE_PRECISION, +} +_MAX_METRICS_NAME = {s.name for s in _MAX_METRICS} + + +_MIN_METRICS = { + problem_module.PerformanceMetric.MEAN_ABSOLUTE_ERROR, + 
problem_module.PerformanceMetric.MEAN_SQUARED_ERROR, + problem_module.PerformanceMetric.ROOT_MEAN_SQUARED_ERROR, + problem_module.PerformanceMetric.R_SQUARED, +} +_MIN_METRICS_NAME = {s.name for s in _MIN_METRICS} + + +def infer_metric_direction(metric): + # Handle str input and get canonical object. + if isinstance(metric, str): + metric_name = metric + if metric_name in _MIN_METRICS_NAME: + return 'min' + elif metric_name in _MAX_METRICS_NAME: + return 'max' + + +def random_values(hyperparameters, seed_state, tried_so_far, max_collisions): + collisions = 0 + while 1: + # Generate a set of random values. + hps = HyperParameters() + with d3m_utils.silence(): + for hp in hyperparameters.space: + hps.merge([hp]) + if hps.is_active(hp): # Only active params in `values`. + hps.values[hp.name] = hp.random_sample(seed_state) + seed_state += 1 + # Pick out the invalid hyper-parameters + patch_invalid_hyperamaeters(hps) + + values = hps.values + # Keep trying until the set of values is unique, + # or until we exit due to too many collisions. + values_hash = compute_values_hash(values) + if values_hash in tried_so_far: + collisions += 1 + if collisions > max_collisions: + return None + continue + tried_so_far.add(values_hash) + break + return values, seed_state + + +def compute_values_hash(values): + keys = sorted(values.keys()) + s = ''.join(str(k) + '=' + str(values[k]) for k in keys) + return hashlib.sha256(s.encode('utf-8')).hexdigest()[:32] + + +def patch_invalid_hyperamaeters(hps): + values = hps.values + for full_name in values: + if full_name == PIPELINE_CHOICE: + continue + hp_val = values[full_name] + step, primitive_name, hp_name = hps.get_name_parts(full_name) + if primitive_name == 'd3m.primitives.classification.svc.SKlearn' \ + and hp_name == 'decision_function_shape' and hp_val == 'ovo': + # break_ties must be False if decision-function_shape == 'ovo' + break_ties = os.path.join(step, primitive_name, 'break_ties') + values[break_ties] = False + if primitive_name == 'd3m.primitives.classification.logistic_regression.SKlearn': + # elasticnet' penalty, solver must be'saga' + if hp_name == 'penalty' and hp_val == 'elasticnet': + solver = os.path.join(step, primitive_name, 'solver') + values[solver] = 'saga' + if hp_name == 'solver': + penalty = os.path.join(step, primitive_name, 'penalty') + # liblinear only supports 'ovr' multi_class and [l2, l1] penalty + if hp_val == 'liblinear': + multi_class = os.path.join(step, primitive_name, 'multi_class') + values[multi_class] = 'ovr' + values[penalty] = random.choice(['l2', 'l1']) + # ['lbfgs', 'newton-cg', 'sag'] only support [l2, none] penalty + elif hp_val in ['lbfgs', 'newton-cg', 'sag']: + values[penalty] = random.choice(['l2', 'none']) diff --git a/axolotl/axolotl/algorithms/tuners/random_search_oracle.py b/axolotl/axolotl/algorithms/tuners/random_search_oracle.py new file mode 100644 index 0000000..f446389 --- /dev/null +++ b/axolotl/axolotl/algorithms/tuners/random_search_oracle.py @@ -0,0 +1,66 @@ +from kerastuner import Objective +from kerastuner.engine import trial as trial_lib +from kerastuner.tuners.randomsearch import RandomSearchOracle as KerasRandomSearchOracle + +from axolotl.algorithms.tuners.oracle import infer_metric_direction, random_values + + +class RandomSearchOracle(KerasRandomSearchOracle): + """ + Random search oracle. 
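+    Draws hyper-parameter configurations at random from the registered search
+    space; each sampled configuration is hashed so that previously tried
+    values are not proposed again.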
+ """ + + def __init__(self, + objective, + max_trials, + seed=None, + hyperparameters=None, + allow_new_entries=True, + tune_new_entries=True): + direction = infer_metric_direction(objective) + objective = Objective(name=objective, direction=direction) + super(RandomSearchOracle, self).__init__( + objective=objective, + max_trials=max_trials, + seed=seed, + hyperparameters=hyperparameters, + tune_new_entries=tune_new_entries, + allow_new_entries=allow_new_entries) + + def _populate_space(self, _): + values = self._random_values() + if values is None: + return {'status': trial_lib.TrialStatus.STOPPED, + 'values': None} + return {'status': trial_lib.TrialStatus.RUNNING, + 'values': values} + + def _random_values(self): + """Fills the hyperparameter space with random values. + + Returns: + A dictionary mapping parameter names to suggested values. + """ + + values, seed_state = random_values(hyperparameters=self.hyperparameters, + seed_state=self._seed_state, + tried_so_far=self._tried_so_far, + max_collisions=self._max_collisions, + ) + self._seed_state = seed_state + return values + + def _save_trial(self, trial): + pass + + def get_state(self): + # `self.trials` are saved in their own, Oracle-agnostic files. + # Just save the IDs for ongoing trials, since these are in `trials`. + state = {} + state['ongoing_trials'] = { + tuner_id: trial.trial_id + for tuner_id, trial in self.ongoing_trials.items()} + # Hyperparameters are part of the state because they can be added to + # during the course of the search. + state['hyperparameters'] = str(self.hyperparameters.get_config()) + return state diff --git a/axolotl/axolotl/algorithms/tuners/tunable_base.py b/axolotl/axolotl/algorithms/tuners/tunable_base.py new file mode 100644 index 0000000..1c789e4 --- /dev/null +++ b/axolotl/axolotl/algorithms/tuners/tunable_base.py @@ -0,0 +1,258 @@ +import logging +import multiprocessing + +import os +import uuid +import copy +from typing import Tuple +import re +import numpy as np + +from d3m.metadata import hyperparams +from d3m.metadata.base import ArgumentType +from d3m.metadata.pipeline import Pipeline + +from kerastuner.engine import trial as trial_module + +from axolotl import predefined_pipelines +from axolotl.algorithms.tuners import custom_hps +from axolotl.algorithms.base import PipelineSearchBase +from axolotl.algorithms.dummy import dummy_ranking_function +from axolotl.algorithms.tuners.hyperparameters import HyperParameters, PIPELINE_CHOICE +from axolotl.utils import schemas as schemas_utils + +logger = logging.getLogger(__name__) + + +class TunableBase(PipelineSearchBase): + + def __init__(self, problem_description, backend, + primitives_blocklist=None, ranking_function=None, num_eval_trials=None): + if ranking_function is None: + ranking_function = dummy_ranking_function + if num_eval_trials is None: + num_eval_trials = multiprocessing.cpu_count() + super(TunableBase, self).__init__(problem_description, backend, + primitives_blocklist=primitives_blocklist, ranking_function=ranking_function) + # TODO update this to be defined on problem/metrics terms + self.data_preparation_pipeline = schemas_utils.get_splitting_pipeline("TRAINING_DATA") + self.data_preparation_params = schemas_utils.DATA_PREPARATION_PARAMS['no_split'] + + self.scoring_pipeline = schemas_utils.get_scoring_pipeline() + self.scoring_params = None + + self.metrics = problem_description['problem']['performance_metrics'] + + self.oracle = None + self.tuner_id = 'tuner' + self.hyperparameters = HyperParameters() + 
self.pipeline_candidates = {} + self.num_eval_trials = num_eval_trials + + def set_pipeline_candidates(self, input_data, pipeline_candidates): + if pipeline_candidates is None: + problem = self.problem_description + # ToDo should use fetch(input_data, problem, schemas_utils.PIPELINES_DB_DIR) + for pipeline in predefined_pipelines.fetch_from_file(problem, schemas_utils.PIPELINES_DB_DIR): + self.pipeline_candidates[pipeline.id] = pipeline + elif isinstance(pipeline_candidates, list): + for pipeline in pipeline_candidates: + self.pipeline_candidates[pipeline.id] = pipeline + elif isinstance(pipeline_candidates, dict): + self.pipeline_candidates = pipeline_candidates + else: + raise ValueError('pipeline_candidate should be None, list or dict') + + def init_search_space(self): + pipeline_id = hyperparams.Enumeration[str]( + values=list(self.pipeline_candidates.keys()), + default=list(self.pipeline_candidates.keys())[0], + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + self.hyperparameters.retrieve(PIPELINE_CHOICE, pipeline_id) + for pipeline in self.pipeline_candidates.values(): + self._get_pipeline_search_space(pipeline) + + def _get_pipeline_search_space(self, pipeline): + PREFIX_STEP = 'step' + with self.hyperparameters.conditional_scope(PIPELINE_CHOICE, pipeline.id): + for i, step in enumerate(pipeline.steps): + with self.hyperparameters.name_scope('{}{}'.format(PREFIX_STEP, i)): + primitive = step.primitive + self._get_primitive_search_space(primitive) + + def _get_primitive_search_space(self, primitive): + hyperparameters = primitive.metadata.query()['primitive_code']['hyperparams'] + primitive_python_path = primitive.metadata.query()['python_path'] + name = primitive_python_path + config = primitive.metadata.query()['primitive_code']['class_type_arguments']['Hyperparams'].configuration + custom_config = custom_hps.config.get(primitive_python_path, None) + if not custom_config is None: + config._dict.update(custom_config) + with self.hyperparameters.name_scope(name): + for param_name, param_info in hyperparameters.items(): + if self.is_tunable(param_info['semantic_types']): + param_val = config[param_name] + # SortedSet.to_simple_structure() has bug, so we skip it. 
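+                    # List/Set hyper-parameters are skipped for tuning; all other
+                    # tunable parameters are registered, and Choice parameters also
+                    # register their per-choice sub-parameters under a conditional scope.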
+ if isinstance(param_val, (hyperparams.List, hyperparams.Set)): + continue + self.hyperparameters.retrieve(param_name, param_val) + if isinstance(param_val, hyperparams.Choice): + for choice_name, choice_val in param_val.choices.items(): + with self.hyperparameters.conditional_scope(param_name, choice_name): + for sub_param_name, sub_param_val in choice_val.configuration.items(): + if sub_param_name != 'choice': + self.hyperparameters.retrieve(sub_param_name, sub_param_val) + + def is_tunable(self, semantic_types: Tuple[str, ...]) -> bool: + return any('tuning' in t.lower() for t in semantic_types) + + def search_fit(self, input_data, time_limit=300, *, expose_values=False, pipeline_candidates=None): + self.set_pipeline_candidates(input_data, pipeline_candidates) + self.init_search_space() + return super(TunableBase, self).search_fit(input_data, time_limit, expose_values=expose_values) + + def _search(self, time_left): + trials = self.create_trials(num_trials=self.num_eval_trials) + if len(trials) == 0: + logger.info('Oracle trigger exit') + return [] + results = self.run_trials(trials, input_data=self.input_data) + self.end_trials(trials) + return results + + def run_trials(self, trials, **fit_kwargs): + pipelines = [] + id_2_trials = {} + + for trial in trials: + hp = trial.hyperparameters + try: + pipeline = self.build_pipeline(hp) + id_2_trials[pipeline.id] = trial + pipelines.append(pipeline) + except Exception as e: + logger.error('Current trial is failed. Error: {}'.format(e)) + trial.status = trial_module.TrialStatus.INVALID + + input_data = fit_kwargs.pop('input_data') + + pipeline_results = self.backend.evaluate_pipelines( + problem_description=self.problem_description, + pipelines=pipelines, + input_data=input_data, + metrics=self.metrics, + data_preparation_pipeline=self.data_preparation_pipeline, + scoring_pipeline=self.scoring_pipeline, + data_preparation_params=self.data_preparation_params, + ) + + results = [] + for result in pipeline_results: + trial = id_2_trials[result.pipeline.id] + if result.status == 'ERRORED': + logger.error('Current trial is failed. 
Error: {}'.format(result.error)) + trial.status = trial_module.TrialStatus.INVALID + else: + scores = result.scores + # scores = runtime_module.combine_folds(scores) + summarize_performance = schemas_utils.summarize_performance_metrics(scores) + metrics = self._get_pipeline_metrics(summarize_performance) + self.oracle.update_trial( + trial.trial_id, metrics=metrics + ) + trial.status = trial_module.TrialStatus.COMPLETED + results.append(self.ranking_function(result)) + return results + + def build_pipeline(self, hyperparameters): + """ + hyperparameters example: + { + 'STEP5/d3m.primitives.feature_construction.deep_feature_synthesis.SingleTableFeaturization/max_percent_null: 0, + 'STEP7/d3m.primitives.data_preprocessing.robust_scaler.SKlearn/quantile_range: (2.798121390864261, 14.852664215409096), + } + """ + values = hyperparameters.values + pipeline_id = hyperparameters.get_pipeline_id() + pipeline = copy.deepcopy(self.pipeline_candidates[pipeline_id]) + pipeline.id = str(uuid.uuid4()) + # update time + pipeline.created = Pipeline().created + + skip_hps = set() + # for key in sorted(values.keys()): + for hp in hyperparameters.space: + if hyperparameters.is_active(hp) and hp.name not in skip_hps and hp.name != PIPELINE_CHOICE: + key = hp.name + step, primitive_name, hp_name = hyperparameters.get_name_parts(key) + value = values[key] + step_idx = self.__get_step_idx_by_name(step) + if step_idx is None: + raise KeyError('{} not in the pipeline'.format(primitive_name)) + primitive_step = pipeline.steps[step_idx] + arg_type = ArgumentType.VALUE + # In order to avoid the following error + # Value '0' for hyper-parameter \ + # 'STEP8/d3m.primitives.classification.xgboost_gbtree.DataFrameCommon/max_delta_step' \ + # is not an instance of the structural type: typing.Union[int, NoneType] + # Here is workaround + if isinstance(value, np.int64): + value = int(value) + elif isinstance(value, np.str_): + value = str(value) + elif isinstance(value, np.bool_): + value = bool(value) + if hp_name in primitive_step.hyperparams: + del primitive_step.hyperparams[hp_name] + # Handle Choice + if isinstance(hp, hyperparams.Choice): + choice_cls = hp.choices[value] + _vals = {} + for name in choice_cls.configuration: + if name == 'choice': + _vals[name] = value + else: + _key = os.path.join(step, primitive_name, name) + _vals[name] = values[_key] + skip_hps.add(_key) + value = choice_cls(_vals) + primitive_step.add_hyperparameter(name=hp_name, argument_type=arg_type, + data=value) + return pipeline + + def __get_step_idx_by_name(self, prefix_primitive_name): + regex = r"(?<=STEP)\d+" + match = re.search(regex, prefix_primitive_name, re.IGNORECASE) + if match: + return int(match.group(0)) + return None + + def _get_pipeline_metrics(self, summarize_performance): + metrics = {} + for name, info in summarize_performance.items(): + metrics[name] = info['mean'] + return metrics + + def end_trials(self, trials): + """A hook called after each trial is run. + + # Arguments: + trial: A `Trial` instance. 
+ """ + [self.oracle.end_trial(trial.trial_id, trial.status) for trial in trials] + # self.oracle.update_space(trial.hyperparameters) + + def create_trials(self, num_trials): + trials = [] + for i in range(num_trials): + try: + trial = self.oracle.create_trial('{}_{}'.format(self.tuner_id, i)) + except: + break + + if trial.status == trial_module.TrialStatus.STOPPED: + break + else: + trials.append(trial) + return trials diff --git a/axolotl/axolotl/backend/__init__.py b/axolotl/axolotl/backend/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/axolotl/axolotl/backend/base.py b/axolotl/axolotl/backend/base.py new file mode 100644 index 0000000..e0471d9 --- /dev/null +++ b/axolotl/axolotl/backend/base.py @@ -0,0 +1,313 @@ +import abc +import typing + +from d3m.metadata.problem import Problem, PerformanceMetric +from d3m.metadata.pipeline import Pipeline + +from axolotl.utils.pipeline import PipelineResult +from axolotl.utils.schemas import ContainerType + + +class RunnerBase: + """ + A base class for the pipeline runner backend. + This child from this class must implement ``request_status`` and ``request_results`` which should keep + track of all requests. + + Parameters + ---------- + random_seed : int + Random seed passed to the constructor. + volumes_dir : str + Path to a directory with static files required by primitives. + In the standard directory structure (as obtained running ``python3 -m d3m index download``). + scratch_dir : str + Path to a directory to store any temporary files needed during execution. + + Attributes + ---------- + random_seed : int + Random seed passed to the constructor. + volumes_dir : str + Path to a directory with static files required by primitives. + In the standard directory structure (as obtained running ``python3 -m d3m index download``). + scratch_dir : str + Path to a directory to store any temporary files needed during execution. + """ + def __init__(self, *, random_seed: int = 0, volumes_dir: str = None, scratch_dir: str = None) -> None: + self.random_seed = random_seed + self.volumes_dir = volumes_dir + self.scratch_dir = scratch_dir + + def add_metric(self, name: str, *, best_value: float, worst_value: float, score_class: type, + requires_confidence: bool = False, requires_rank: bool = False): + """ + Method to register a new metric. + + Parameters + ---------- + name : str + Metric name, e.g. ACCURACY. + best_value : float + Value that represents the best e.g. in accuracy 1.0 + worst_value: float + Value that represent the worst e.g. in accuracy 0 + score_class : type + A class that helps computing the score. + requires_confidence : bool + A flag that tells if the scoring function requires a confidence value. + requires_rank : bool + A flag that tell if the scoring function requires the rank of the predictions. + """ + + PerformanceMetric.register_metric(name=name, best_value=best_value, worst_value=worst_value, score_class=score_class, + requires_confidence=requires_confidence, requires_rank=requires_rank) + + @abc.abstractmethod + def get_request(self, request_id: str) -> PipelineResult: + """ + A method that returns the result from the requests + + Parameters + ---------- + request_id : str + Request id of data to retrieve + + Returns + ------- + PipelineResult + A PipelineResult instance that contains the information. 
+ """ + + @abc.abstractmethod + def fit_pipeline_request(self, problem_description: Problem, pipeline: Pipeline, + input_data: typing.Sequence[ContainerType], *, timeout: float = None, + expose_outputs: bool = False) -> str: + """ + A method that submit a fit_pipeline job. + + Parameters + ---------- + problem_description : Problem + A problem description. + pipeline : Pipeline + The pipeline that is going to be fitted. + input_data : typing.Sequence[ContainerType] + A list of D3M containers. + timeout : float + A maximum amount of time that pipelines are going to be executed in seconds. + expose_outputs : bool + A variable that enable exposing every intermediate results based on the input_data + + Returns + ------- + str + A request id. + """ + + def fit_pipeline(self, problem_description: Problem, pipeline: Pipeline, input_data: typing.Sequence[ContainerType], + *, timeout: float = None, expose_outputs: bool = False) -> PipelineResult: + """ + A method that fit a pipeline, save the state and returns a PipelineResult. + + Parameters + ---------- + problem_description : Problem + A problem description. + pipeline : Pipeline + A pipeline that are going to be fitted. + input_data : typing.Sequence[ContainerType] + A list of D3M containers. + timeout : float + A maximum amount of time that pipelines are going to be executed in seconds. + expose_outputs : bool + A variable that enable exposing every intermediate results based on the input_data + + Returns + ------- + PipelineResult + A pipeline result containg the result of fitting the pipeline. + """ + request_id = self.fit_pipeline_request(problem_description=problem_description, pipeline=pipeline, + input_data=input_data, timeout=timeout, + expose_outputs=expose_outputs) + return self.get_request(request_id) + + @abc.abstractmethod + def produce_pipeline_request(self, fitted_pipeline_id: str, input_data: typing.Sequence[ContainerType], *, + timeout: float = None, expose_outputs: bool = False) -> str: + """ + A method that submit a produce pipeline request. + + Parameters + ---------- + fitted_pipeline_id : str + The fitted pipeline if of the fitted pipeline to be use to produce results. + input_data : typing.Sequence[ContainerType] + A list of D3M containers. + timeout : float + A maximum amount of time that pipelines are going to be executed in seconds. + expose_outputs : bool + A variable that enable exposing every intermediate results based on the input_data + + Returns + ------- + str + A request id. + """ + + # @abc.abstractmethod + def produce_pipeline(self, fitted_pipeline_id: str, input_data: typing.Sequence[ContainerType], *, + timeout: float = None, expose_outputs: bool = False) -> PipelineResult: + """ + A method that produce multiple fitted pipelines, save their state and returns a list of PipelineResult + that contain the information of every pipeline run. + + Parameters + ---------- + fitted_pipeline_id : str + A list of fitted pipelines to run with the input_data + input_data : typing.Sequence[ContainerType] + A list of D3M containers. + timeout : float + A maximum amount of time that pipelines are going to be executed in seconds. + expose_outputs : bool + A variable that enable exposing every intermediate results based on the input_data + + Returns + ------- + PipelineResult + A PipelineResult intance containing the information about the produced pipeline. 
+ """ + request_id = self.produce_pipeline_request(fitted_pipeline_id, input_data, timeout=timeout, + expose_outputs=expose_outputs) + return self.get_request(request_id) + + @abc.abstractmethod + def evaluate_pipeline_request( + self, problem_description: Problem, pipeline: Pipeline, + input_data: typing.Sequence[ContainerType], *, metrics: typing.Sequence[typing.Dict], + data_preparation_pipeline: Pipeline = None, scoring_pipeline: Pipeline = None, + data_preparation_params: typing.Dict[str, str] = None, scoring_params: typing.Dict[str, str] = None, + timeout: float = None + ) -> str: + """ + A method that evaluate multiple pipelines, and provides returns the scores and information of the pipelines. + + Parameters + ---------- + problem_description : Problem + A problem description. + pipeline : Pipeline + A list of pipelines that are going to be run. + input_data : typing.Sequence[ContainerType] + A list of D3M containers. + metrics : typing.Sequence[typing.Dict] + A dictionary containing the metrics and their arguments. + data_preparation_pipeline : Pipeline + A pipeline that prepares the data for the pipelines to be evaluated in, e.g. Cross-fold validation + scoring_pipeline : Pipeline + A pipeline that is used to compute the scores of the pipelines. + data_preparation_params : typing.Dict[str, str] + Parameters for the data preparation pipeline + scoring_params: typing.Dict[str, str] + Parameters for the scoring pipeline + timeout : float + A maximum amount of time that pipelines are going to be executed in seconds. + + Returns + ------- + str + A request id + """ + + def evaluate_pipeline( + self, problem_description: Problem, pipeline: Pipeline, + input_data: typing.Sequence[ContainerType], *, metrics: typing.Sequence[typing.Dict], + data_preparation_pipeline: Pipeline = None, scoring_pipeline: Pipeline = None, + data_preparation_params: typing.Dict[str, str] = None, scoring_params: typing.Dict[str, str] = None, + timeout: float = None + ) -> PipelineResult: + """ + A method that evaluate multiple pipelines, and provides returns the scores and information of the pipelines. + + Parameters + ---------- + problem_description : Problem + A problem description. + pipeline : Pipeline + A pipeline that is going to be evaluated. + input_data : typing.Sequence[ContainerType] + A list of D3M containers. + metrics : typing.Sequence[typing.Dict] + A dictionary containing the metrics and their arguments. + data_preparation_pipeline : Pipeline + A pipeline that prepares the data for the pipelines to be evaluated in, e.g. Cross-fold validation + scoring_pipeline : Pipeline + A pipeline that is used to compute the scores of the pipelines. + data_preparation_params : typing.Dict[str, str] + Parameters for the data preparation pipeline + scoring_params: typing.Dict[str, str] + Parameters for the scoring pipeline + timeout : float + A maximum amount of time that pipelines are going to be executed in seconds. + + Returns + ------- + PipelineResult + Result of the evaluation of the pipeline. 
+ """ + request_id = self.evaluate_pipeline_request( + problem_description, pipeline, input_data, metrics=metrics, + data_preparation_pipeline=data_preparation_pipeline, scoring_pipeline=scoring_pipeline, + data_preparation_params=data_preparation_params, scoring_params=scoring_params, timeout=timeout + ) + return self.get_request(request_id) + + def evaluate_pipelines( + self, problem_description: Problem, pipelines: typing.Sequence[Pipeline], + input_data: typing.Sequence[ContainerType], *, metrics: typing.Sequence[typing.Dict], + data_preparation_pipeline: Pipeline = None, scoring_pipeline: Pipeline = None, + data_preparation_params: typing.Dict[str, str] = None, scoring_params: typing.Dict[str, str] = None, + timeout: float = None + ) -> typing.Sequence[PipelineResult]: + """ + A method that evaluate multiple pipelines, and provides returns the scores and information of the pipelines. + + Parameters + ---------- + problem_description : Problem + A problem description. + pipelines : typing.Sequence[str] + A list of pipelines that are going to be run. + input_data : typing.Sequence[ContainerType] + A list of D3M containers. + metrics : typing.Sequence[typing.Dict] + A dictionary containing the metrics and their arguments. + data_preparation_pipeline : Pipeline + A pipeline that prepares the data for the pipelines to be evaluated in, e.g. Cross-fold validation + scoring_pipeline : Pipeline + A pipeline that is used to compute the scores of the pipelines. + data_preparation_params : typing.Dict[str, str] + Parameters for the data preparation pipeline + scoring_params: typing.Dict[str, str] + Parameters for the scoring pipeline + timeout : float + A maximum amount of time that pipelines are going to be executed in seconds. + + Returns + ------- + typing.Sequence[PipelineResult] + A sequence of PipelineResults. 
+ """ + request_ids = [] + for pipeline in pipelines: + request_ids.append( + self.evaluate_pipeline_request( + problem_description, pipeline, input_data, metrics=metrics, + data_preparation_pipeline=data_preparation_pipeline, scoring_pipeline=scoring_pipeline, + data_preparation_params=data_preparation_params, scoring_params=scoring_params, timeout=timeout + ) + ) + + return [self.get_request(request_id) for request_id in request_ids] diff --git a/axolotl/axolotl/backend/ray.py b/axolotl/axolotl/backend/ray.py new file mode 100644 index 0000000..f93e3dd --- /dev/null +++ b/axolotl/axolotl/backend/ray.py @@ -0,0 +1,269 @@ +import ray +import typing +import uuid +import binascii +import hashlib +import time +from ray.util import ActorPool + +from d3m import index as d3m_index +from d3m import utils as d3m_utils +from d3m import runtime as runtime_module +from d3m.metadata.problem import Problem +from d3m.metadata.pipeline import Pipeline +from d3m.metadata.base import Context +from d3m.metadata import pipeline_run as pipeline_run_module +from d3m import container as container_module + +from axolotl.backend.base import RunnerBase +from axolotl.utils.pipeline import PipelineResult, save_pipeline_run, save_exposed_values +from axolotl.utils.schemas import ContainerType +import multiprocessing + + +@ray.remote +class DataHandler: + def __init__(self): + self.data = {} + + def add_data(self, input_data): + if isinstance(input_data, list): + values = [] + for _data in input_data: + if isinstance(_data, container_module.Dataset): + values.append(_data.metadata.query(())['id']) + + data_id = str(hashlib.sha256(str(values).encode('utf8')).hexdigest()) + if data_id not in self.data: + self.data[data_id] = input_data + return data_id + + def get_data(self, data_id): + if data_id in self.data: + return self.data[data_id] + + +@ray.remote +class RayExecutor: + def __init__(self, *, random_seed: int = 0, volumes_dir: str = None, scratch_dir: str = None, store_results=False, + blocklist=()) -> None: + self.random_seed = random_seed + self.volumes_dir = volumes_dir + self.scratch_dir = scratch_dir + self.fitted_pipelines = {} + with d3m_utils.silence(): + d3m_index.load_all(blocklist=blocklist) + self.runtime_environment = pipeline_run_module.RuntimeEnvironment() + self.store_results = store_results + + def fit_pipeline( + self, data_handler, problem_description: Problem, pipeline: Pipeline, + input_data_id: str, *, timeout: float = None, expose_outputs: bool = False + ) -> PipelineResult: + pipeline_result = PipelineResult(pipeline=pipeline) + pipeline_result.status = "RUNNING" + pipeline_result.method_called = "fit" + + request_id = data_handler.get_data.remote(input_data_id) + input_data = ray.get(request_id) + + is_standard_pipeline = False + if len(input_data) == 1 and len(pipeline.outputs) == 1: + is_standard_pipeline = True + + with d3m_utils.silence(): + runtime, output, result = runtime_module.fit( + pipeline=pipeline, inputs=input_data, problem_description=problem_description, context=Context.TESTING, + hyperparams=None, random_seed=self.random_seed, volumes_dir=self.volumes_dir, + scratch_dir=self.scratch_dir, + runtime_environment=self.runtime_environment, is_standard_pipeline=is_standard_pipeline, + expose_produced_outputs=expose_outputs + ) + + if result.has_error(): + pipeline_result.status = "ERRORED" + pipeline_result.error = result.error + else: + pipeline_result.status = "COMPLETED" + fitted_pipeline_id = str(uuid.uuid4()) + + if self.store_results: + pipeline_result.exposed_outputs = 
save_exposed_values(result.values, pipeline.id, self.scratch_dir) + pipeline_result.output = save_exposed_values(output, pipeline.id, self.scratch_dir) + else: + pipeline_result.exposed_outputs = result.values + pipeline_result.output = output + + pipeline_result.fitted_pipeline_id = fitted_pipeline_id + self.fitted_pipelines[fitted_pipeline_id] = runtime + + if self.store_results: + pipeline_result.pipeline_run = save_pipeline_run(result.pipeline_run, self.scratch_dir) + + return pipeline_result + + def produce_pipeline( + self, data_handler, fitted_pipeline_id: str, input_data_id: str, *, + timeout: float = None, expose_outputs: bool = False + ) -> PipelineResult: + + pipeline_result = PipelineResult(fitted_pipeline_id=fitted_pipeline_id) + pipeline_result.status = "RUNNING" + pipeline_result.method_called = "produce" + pipeline_result.fitted_pipeline_id = fitted_pipeline_id + + request_id = data_handler.get_data.remote(input_data_id) + input_data = ray.get(request_id) + + with d3m_utils.silence(): + output, result = runtime_module.produce( + fitted_pipeline=self.fitted_pipelines[fitted_pipeline_id], test_inputs=input_data, + expose_produced_outputs=expose_outputs + ) + + if result.has_error(): + pipeline_result.status = "ERRORED" + pipeline_result.error = result.error + else: + pipeline_result.status = "COMPLETED" + if self.store_results: + pipeline_result.exposed_outputs = save_exposed_values(result.values, fitted_pipeline_id, self.scratch_dir) + pipeline_result.output = save_exposed_values(output, fitted_pipeline_id, self.scratch_dir) + else: + pipeline_result.exposed_outputs = result.values + pipeline_result.output = output + + if self.store_results: + pipeline_result.pipeline_run = save_pipeline_run(result.pipeline_run, self.scratch_dir) + + return pipeline_result + + def evaluate_pipeline( + self, data_handler, problem_description: Problem, pipeline: Pipeline, + input_data_id: str, *, metrics: typing.Sequence[typing.Dict], + data_preparation_pipeline: Pipeline = None, scoring_pipeline: Pipeline = None, + data_preparation_params: typing.Dict[str, str] = None, scoring_params: typing.Dict[str, str] = None, + timeout: float = None + ) -> PipelineResult: + + with d3m_utils.silence(): + pipeline_result = PipelineResult(pipeline=pipeline) + pipeline_result.status = "RUNNING" + pipeline_result.method_called = "evaluate" + + request_id = data_handler.get_data.remote(input_data_id) + input_data = ray.get(request_id) + + with d3m_utils.silence(): + scores, results = runtime_module.evaluate( + pipeline=pipeline, inputs=input_data, data_pipeline=data_preparation_pipeline, + scoring_pipeline=scoring_pipeline, problem_description=problem_description, + data_params=data_preparation_params, metrics=metrics, context=Context.TESTING, + scoring_params=scoring_params, hyperparams=None, random_seed=self.random_seed, + data_random_seed=self.random_seed, scoring_random_seed=self.random_seed, + volumes_dir=self.volumes_dir, scratch_dir=self.scratch_dir, runtime_environment=self.runtime_environment + ) + + if results.has_error(): + pipeline_result.status = "ERRORED" + pipeline_result.error = [result.error for result in results] + else: + pipeline_result.status = "COMPLETED" + pipeline_result.scores = runtime_module.combine_folds(scores) + + if self.store_results: + pipeline_result.pipeline_run = save_pipeline_run(results.pipeline_runs, self.scratch_dir) + return pipeline_result + + def fitted_pipeline_id_exists(self, fitted_pipeline_id): + return fitted_pipeline_id in self.fitted_pipelines + + +class 
RayRunner(RunnerBase): + def __init__(self, *, random_seed: int = 0, volumes_dir: str = None, scratch_dir: str = None, + store_results=False, n_workers=None, blocklist=()) -> None: + if not ray.is_initialized(): + ray.init() + + super().__init__(random_seed=random_seed, volumes_dir=volumes_dir, scratch_dir=scratch_dir) + self.data_handler = DataHandler.remote() + self.ray_executor = RayExecutor.remote(random_seed=random_seed, + volumes_dir=volumes_dir, scratch_dir=scratch_dir, + store_results=store_results,blocklist=blocklist) + + if n_workers is None: + n_workers = multiprocessing.cpu_count() + self.actor_pool = ActorPool([ + RayExecutor.remote(random_seed=random_seed, volumes_dir=volumes_dir, + scratch_dir=scratch_dir, store_results=store_results, + blocklist=blocklist) for _ in range(n_workers)] + ) + + # Wait for primitives to be load on the workers + # time.sleep(len(d3m_index.search()) * 0.15) + + def stop_ray(self): + ray.shutdown() + + def get_request(self, request_id: str): + return ray.get(ray.ObjectID(binascii.unhexlify(request_id))) + + def fit_pipeline_request(self, problem_description: Problem, pipeline: Pipeline, + input_data: typing.Sequence[ContainerType], *, timeout: float = None, + expose_outputs: bool = False) -> str: + + request_id = self.data_handler.add_data.remote(input_data) + input_data_id = ray.get(request_id) + request_id = self.ray_executor.fit_pipeline.remote(self.data_handler, problem_description, pipeline, input_data_id, + timeout=timeout, expose_outputs=expose_outputs) + return request_id.hex() + + def produce_pipeline_request(self, fitted_pipeline_id: str, input_data: typing.Sequence[ContainerType], *, + timeout: float = None, expose_outputs: bool = False) -> str: + request_id = self.data_handler.add_data.remote(input_data) + input_data_id = ray.get(request_id) + request_id = self.ray_executor.produce_pipeline.remote(self.data_handler, fitted_pipeline_id, input_data_id, timeout=timeout, + expose_outputs=expose_outputs) + return request_id.hex() + + def evaluate_pipeline_request( + self, problem_description: Problem, pipeline: Pipeline, + input_data: typing.Sequence[ContainerType], *, metrics: typing.Sequence[typing.Dict], + data_preparation_pipeline: Pipeline = None, scoring_pipeline: Pipeline = None, + data_preparation_params: typing.Dict[str, str] = None, scoring_params: typing.Dict[str, str] = None, + timeout: float = None + ) -> str: + request_id = self.data_handler.add_data.remote(input_data) + input_data_id = ray.get(request_id) + + request_id = self.ray_executor.evaluate_pipeline.remote( + self.data_handler, problem_description, pipeline, input_data_id, metrics=metrics, + data_preparation_pipeline=data_preparation_pipeline, scoring_pipeline=scoring_pipeline, + data_preparation_params=data_preparation_params, scoring_params=scoring_params, timeout=timeout + ) + return request_id.hex() + + def fitted_pipeline_id_exists(self, fitted_pipeline_id): + request_id = self.ray_executor.fitted_pipeline_id_exists.remote(fitted_pipeline_id) + return ray.get(request_id) + + def evaluate_pipelines( + self, problem_description: Problem, pipelines: typing.Sequence[Pipeline], + input_data: typing.Sequence[ContainerType], *, metrics: typing.Sequence[typing.Dict], + data_preparation_pipeline: Pipeline = None, scoring_pipeline: Pipeline = None, + data_preparation_params: typing.Dict[str, str] = None, scoring_params: typing.Dict[str, str] = None, + timeout: float = None + ) -> typing.Sequence[PipelineResult]: + request_id = self.data_handler.add_data.remote(input_data) + 
input_data_id = ray.get(request_id) + + args = [] + for pipeline in pipelines: + args.append({ + 'data_handler': self.data_handler, 'problem_description': problem_description, 'pipeline': pipeline, + 'input_data_id': input_data_id, 'metrics': metrics, 'data_preparation_pipeline': data_preparation_pipeline, + 'scoring_pipeline': scoring_pipeline,'data_preparation_params': data_preparation_params, + 'scoring_params': scoring_params,'timeout': timeout + }) + + return self.actor_pool.map(lambda actor, arg: actor.evaluate_pipeline.remote(**arg), args) diff --git a/axolotl/axolotl/backend/simple.py b/axolotl/axolotl/backend/simple.py new file mode 100644 index 0000000..2d6b9ad --- /dev/null +++ b/axolotl/axolotl/backend/simple.py @@ -0,0 +1,178 @@ +import typing +import uuid + +from d3m import utils as d3m_utils +from d3m import runtime as runtime_module +from d3m.metadata.problem import Problem +from d3m.metadata.pipeline import Pipeline +from d3m.metadata.base import Context +from d3m.metadata import pipeline_run as pipeline_run_module + +from axolotl.backend.base import RunnerBase +from axolotl.utils.pipeline import PipelineResult +from axolotl.utils.schemas import ContainerType + + +class SimpleRunner(RunnerBase): + def __init__(self, *, random_seed: int = 0, volumes_dir: str = None, scratch_dir: str = None) -> None: + super().__init__(random_seed=random_seed, volumes_dir=volumes_dir, scratch_dir=scratch_dir) + self.fitted_pipelines = {} + self.request_results = {} + + with d3m_utils.silence(): + self.runtime_environment = pipeline_run_module.RuntimeEnvironment() + + def get_request(self, request_id: str) -> PipelineResult: + """ + A method that returns the result from the requests + + Parameters + ---------- + request_id : str + Request id of data to retrieve + + Returns + ------- + PipelineResult + A PipelineResult instance that contains the information. + """ + if request_id in self.request_results: + return self.request_results[request_id] + else: + return PipelineResult(fitted_pipeline_id='') + + def fit_pipeline_request(self, problem_description: Problem, pipeline: Pipeline, + input_data: typing.Sequence[ContainerType], *, timeout: float = None, + expose_outputs: bool = False) -> str: + """ + A method that submit a fit_pipeline job. + + Parameters + ---------- + problem_description : Problem + A problem description. + pipeline : Pipeline + The pipeline that is going to be fitted. + input_data : typing.Sequence[ContainerType] + A list of D3M containers. + timeout : float + A maximum amount of time that pipelines are going to be executed in seconds. + expose_outputs : bool + A variable that enable exposing every intermediate results based on the input_data + + Returns + ------- + str + A request id. 
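+
+        Examples
+        --------
+        A minimal sketch (``problem_description``, ``pipeline`` and ``dataset`` are assumed
+        to be loaded by the caller); SimpleRunner executes the job synchronously, so the
+        result is already available once the request id is returned:
+
+            runner = SimpleRunner(random_seed=0)
+            request_id = runner.fit_pipeline_request(problem_description, pipeline, [dataset])
+            result = runner.get_request(request_id)
+            print(result.status, result.fitted_pipeline_id)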
+ """ + request_id = str(uuid.uuid4()) + pipeline_result = PipelineResult(pipeline=pipeline) + pipeline_result.status = "RUNNING" + pipeline_result.method_called = "fit" + + is_standard_pipeline = False + if len(input_data) == 1 and len(pipeline.outputs) == 1: + is_standard_pipeline = True + + runtime, output, result = runtime_module.fit( + pipeline=pipeline, inputs=input_data, problem_description=problem_description, context=Context.TESTING, + hyperparams=None, random_seed=self.random_seed, volumes_dir=self.volumes_dir, + scratch_dir=self.scratch_dir, + runtime_environment=self.runtime_environment, is_standard_pipeline=is_standard_pipeline, + expose_produced_outputs=expose_outputs + ) + + if result.has_error(): + pipeline_result.status = "ERRORED" + pipeline_result.error = result.error + else: + pipeline_result.status = "COMPLETED" + pipeline_result.exposed_outputs = result.values + pipeline_result.output = output + fitted_pipeline_id = str(uuid.uuid4()) + pipeline_result.fitted_pipeline_id = fitted_pipeline_id + self.fitted_pipelines[fitted_pipeline_id] = runtime + + pipeline_result.pipeline_run = result.pipeline_run + self.request_results[request_id] = pipeline_result + + return request_id + + def produce_pipeline_request(self, fitted_pipeline_id: str, input_data: typing.Sequence[ContainerType], *, + timeout: float = None, expose_outputs: bool = False) -> str: + """ + A method that submit a produce pipeline request. + + Parameters + ---------- + fitted_pipeline_id : str + The fitted pipeline if of the fitted pipeline to be use to produce results. + input_data : typing.Sequence[ContainerType] + A list of D3M containers. + timeout : float + A maximum amount of time that pipelines are going to be executed in seconds. + expose_outputs : bool + A variable that enable exposing every intermediate results based on the input_data + + Returns + ------- + str + A request id. 
+ """ + request_id = str(uuid.uuid4()) + + pipeline_result = PipelineResult(fitted_pipeline_id=fitted_pipeline_id) + pipeline_result.status = "RUNNING" + pipeline_result.method_called = "produce" + pipeline_result.fitted_pipeline_id = fitted_pipeline_id + + output, result = runtime_module.produce( + fitted_pipeline=self.fitted_pipelines[fitted_pipeline_id], test_inputs=input_data, + expose_produced_outputs=expose_outputs + ) + + if result.has_error(): + pipeline_result.status = "ERRORED" + pipeline_result.error = result.error + else: + pipeline_result.status = "COMPLETED" + pipeline_result.output = output + pipeline_result.exposed_outputs = result.values + + pipeline_result.pipeline_run = result.pipeline_run + self.request_results[request_id] = pipeline_result + + return request_id + + def evaluate_pipeline_request( + self, problem_description: Problem, pipeline: Pipeline, + input_data: typing.Sequence[ContainerType], *, metrics: typing.Sequence[typing.Dict], + data_preparation_pipeline: Pipeline = None, scoring_pipeline: Pipeline = None, + data_preparation_params: typing.Dict[str, str] = None, scoring_params: typing.Dict[str, str] = None, + timeout: float = None + ) -> str: + request_id = str(uuid.uuid4()) + + pipeline_result = PipelineResult(pipeline=pipeline) + pipeline_result.status = "RUNNING" + pipeline_result.method_called = "evaluate" + + scores, results = runtime_module.evaluate( + pipeline=pipeline, inputs=input_data, data_pipeline=data_preparation_pipeline, + scoring_pipeline=scoring_pipeline, problem_description=problem_description, + data_params=data_preparation_params, metrics=metrics, context=Context.TESTING, + scoring_params=scoring_params, hyperparams=None, random_seed=self.random_seed, + data_random_seed=self.random_seed, scoring_random_seed=self.random_seed, + volumes_dir=self.volumes_dir, scratch_dir=self.scratch_dir, runtime_environment=self.runtime_environment + ) + + if results.has_error(): + pipeline_result.status = "ERRORED" + pipeline_result.error = [result.error for result in results] + else: + pipeline_result.status = "COMPLETED" + pipeline_result.scores = runtime_module.combine_folds(scores) + + self.request_results[request_id] = pipeline_result + return request_id + diff --git a/axolotl/axolotl/d3m_grpc/__init__.py b/axolotl/axolotl/d3m_grpc/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/axolotl/axolotl/d3m_grpc/constants.py b/axolotl/axolotl/d3m_grpc/constants.py new file mode 100644 index 0000000..e1f0f4b --- /dev/null +++ b/axolotl/axolotl/d3m_grpc/constants.py @@ -0,0 +1,127 @@ +import os +import json +import re + +from axolotl.utils.resources import check_directory + + +# A class to wrap envrioment variables under d3m scope. 
+class EnvVars: + # A label what is the setting under which the pod is being run; possible + # values: ta2, ta2ta3; this variable is available only for informative + # purposes but it is not used anymore to change an overall mode of operation + # of TA2 system because now TA2 evaluation will happen through TA2-TA3 API + # as well + D3MRUN = 'run' + PROJECT_ROOT = os.path.join(os.path.dirname(__file__), '../..') + # A location of dataset(s), can contain multiple datasets in arbitrary + # directory structure, read-only + D3MINPUTDIR = '/input_dir' + # A location to problem description to use (should be under D3MINPUTDIR), + # datasets are linked from the problem description using IDs, those datasets + # should exist inside D3MINPUTDIR + D3MPROBLEMPATH = 'problem_path' + # A location of output files, shared by TA2 and TA3 pods (and probably data + # mart) + D3MOUTPUTDIR = os.path.join(PROJECT_ROOT, 'output_dir') + # A local-to-host directory provided; used by memory sharing mechanisms + D3MLOCALDIR = os.path.join(D3MOUTPUTDIR, 'temp', 'plasma') + # A path to the volume with primitives' static files + D3MSTATICDIR = None + # Available CPU units in Kubernetes specification + D3MCPU = 0 + # Available CPU units in Kubernetes specification + D3MRAM = 0 + # Time limit for the search phase (available to the pod), in seconds + D3MTIMEOUT = -1 + + # Plasma socket + PLASMA_SOCKET = '/tmp/plasma' + + # datamart uri DATAMART_URL_NYU + DATAMART_URL_NYU = 'https://datamart.d3m.vida-nyu.org' + + if 'D3MRUN' in os.environ: + D3MRUN = os.environ['D3MRUN'] + if 'D3MINPUTDIR' in os.environ: + D3MINPUTDIR = os.environ['D3MINPUTDIR'] + if 'D3MPROBLEMPATH' in os.environ: + D3MPROBLEMPATH = os.environ['D3MPROBLEMPATH'] + if 'D3MOUTPUTDIR' in os.environ: + D3MOUTPUTDIR = os.environ['D3MOUTPUTDIR'] + if 'D3MLOCALDIR' in os.environ: + D3MLOCALDIR = os.environ['D3MLOCALDIR'] + if 'D3MSTATICDIR' in os.environ: + D3MSTATICDIR = os.environ['D3MSTATICDIR'] + if 'D3MCPU' in os.environ: + D3MCPU = int(float(os.environ['D3MCPU'])) + # if we don't set it or its to low set to 4 + # if D3MCPU < 4: + # D3MCPU = 4 + if 'D3MRAM' in os.environ: + D3MRAM = int(re.search(r'\d+', os.environ['D3MRAM']).group()) + if 'D3MTIMEOUT' in os.environ: + D3MTIMEOUT = os.environ['D3MTIMEOUT'] + if 'PLASMA_SOCKET' in os.environ: + PLASMA_SOCKET = os.environ['PLASMA_SOCKET'] + if 'DATAMART_URL_NYU' in os.environ: + DATAMART_URL_NYU = os.environ['DATAMART_URL_NYU'] + + +# # +class Path: + # Temporary directories. + # A temporary directory for other things. + TEMP_STORAGE_ROOT = os.path.join(EnvVars.D3MOUTPUTDIR, 'temp/') + # A temporary directory to store other stuff between ta2-ta3 + OTHER_OUTPUTS = os.path.join(TEMP_STORAGE_ROOT, 'other_outputs') + # To deprecate after figure out what to do with executables. 
+ TEMP_PROBLEM_DESC = os.path.join(TEMP_STORAGE_ROOT, 'problem_description') + + check_directory(TEMP_STORAGE_ROOT) + check_directory(OTHER_OUTPUTS) + check_directory(TEMP_PROBLEM_DESC) + + +class SearchPath: + + def __init__(self, search_id): + self.base_path = os.path.join(EnvVars.D3MOUTPUTDIR, search_id) + + # A directory with ranked pipelines to be evaluated, named + # .json; these files should have additional field pipeline_rank + self.pipelines_ranked = os.path.join(self.base_path, 'pipelines_ranked') + check_directory(self.pipelines_ranked) + + # A directory with successfully scored pipelines during the search, + # named .json + self.pipelines_scored = os.path.join(self.base_path, 'pipelines_scored') + check_directory(self.pipelines_scored) + # A directory of full pipelines which have not been scored or ranked for any + # reason, named .json + self.pipelines_searched = os.path.join(self.base_path, 'pipelines_searched') + check_directory(self.pipelines_searched) + # A directory with any subpipelines referenced from pipelines in + # pipelines_* directories, named .json + self.subpipelines = os.path.join(self.base_path, 'subpipelines') + check_directory(self.subpipelines) + # A directory with pipeline run records in YAML format, multiple can be + # stored in the same file, named .yml + self.pipeline_runs = os.path.join(self.base_path, 'pipeline_runs') + check_directory(self.pipeline_runs) + # A directory where TA2 system can store any additional datasets to be + # provided during training and testing to their pipelines; each dataset + # should be provided in a sub-directory in a D3M dataset format; all + # datasets here should have an unique ID; in the case that additional + # datasets are provided, TA2 should output also pipeline run documents for + # their ranked pipelines because those pipeline run documents contain + # information how to map these additional inputs to pipeline inputs + self.additional_inputs = os.path.join(self.base_path, 'additional_inputs') + check_directory(self.additional_inputs) + + +# A class that wraps a block list of primitives +# To generate this list is necessary to run modules.utils.primitive_selection +class PrimitivesList: + with open(os.path.join(os.path.dirname(__file__), '..', 'utils', 'resources', 'blocklist.json'), 'r') as file: + BlockList = json.load(file) diff --git a/axolotl/axolotl/d3m_grpc/server.py b/axolotl/axolotl/d3m_grpc/server.py new file mode 100644 index 0000000..7964470 --- /dev/null +++ b/axolotl/axolotl/d3m_grpc/server.py @@ -0,0 +1,854 @@ +import argparse +import json +import logging +import os +import pathlib +import time +import warnings +from concurrent import futures +import ray +import os +import uuid + +import google.protobuf.timestamp_pb2 as p_timestamp +import grpc +from d3m import utils as d3m_utils, index as d3m_index +from d3m.metadata import problem as problem_module +from d3m.metadata.pipeline import Resolver +from d3m import container +from d3m import runtime as runtime_module +from d3m.metadata.base import Context +from ta3ta2_api import core_pb2, core_pb2_grpc, primitive_pb2, value_pb2, utils + +from axolotl.backend.ray import RayRunner +from axolotl.algorithms.dummy import DummySearch, dummy_ranking_function +from axolotl.algorithms.data_driven_search import DataDrivenSearch +from axolotl.utils.pipeline import load_pipeline, save_pipeline +from axolotl.d3m_grpc.constants import SearchPath, EnvVars, PrimitivesList, Path +from axolotl.utils import resources as resources_module, schemas as schemas_utils + +from 
pprint import pprint + + +__version__ = '2020.4.4_pre' +_ONE_DAY_IN_SECONDS = 60 * 60 * 24 + +logger = logging.getLogger(__name__) +AGENT = 'TAMU.10.0_pre' +ALLOWED_VALUE_TYPES = ['RAW', 'DATASET_URI', 'CSV_URI'] +SUPPORTED_EXTENSIONS = [] + + +def available_primitives(): + primitives_info = [] + + with d3m_utils.silence(): + for primitive_path in d3m_index.search(): + if primitive_path in PrimitivesList.BlockList: + continue + + try: + primitive = d3m_index.get_primitive(primitive_path) + primitive_id = primitive.metadata.query()['id'] + version = primitive.metadata.query()['version'] + python_path = primitive.metadata.query()['python_path'] + name = primitive.metadata.query()['name'] + digest = primitive.metadata.query().get('digest', None) + primitive_info = { + 'id': primitive_id, + 'version': version, + 'python_path': python_path, + 'name': name, + 'digest': digest + } + primitives_info.append(primitive_info) + except: + continue + return primitives_info + + +PRIMITIVES_LIST = available_primitives() + + +@ray.remote +class SearchWrappers: + def __init__(self, search_class, problem_description, backend, primitives_blocklist=None, ranking_function=None, n_workers=2): + self.search_algorithm = search_class(problem_description=problem_description, backend=backend, + primitives_blocklist=primitives_blocklist, ranking_function=ranking_function, + n_workers=n_workers) + self._seen_index = 0 + self.has_input_data = False + self.time_left = None + self.active_search = True + self.save_path = SearchPath(self.search_algorithm.search_id) + + def search_request(self, time_left, input_data=None): + time_start = time.time() + if not self.has_input_data: + self.search_algorithm.input_data = input_data + self.time_left = time_left + self.has_input_data = True + + results = self.search_algorithm._search(time_left) + self.search_algorithm.history += results + succeed_pipelines = [] + for result in results: + print('pipeline', result.pipeline.id, result.status) + # save all results in pipelines searched + save_pipeline(result.pipeline, self.save_path.pipelines_searched) + + # save all pipelines_runs + resources_module.copy_file(result.pipeline_run, self.save_path.pipeline_runs) + + # we filter the ones that were completed + if result.status == 'COMPLETED': + # since we were able to score it, we put a copy into the pipelines_scored directory + save_pipeline(result.pipeline, self.save_path.pipelines_scored) + succeed_pipelines.append(result) + + self.time_left -= time.time() - time_start + return succeed_pipelines + + def end_search(self): + self.active_search = False + + def is_search_active(self): + return self.active_search + + def get_search_id(self): + return self.search_algorithm.search_id + + def get_time_left(self): + return self.time_left + + +class Core(core_pb2_grpc.CoreServicer): + """ + A class that works as a server that provides support for the pipeline searches, and provides the interfaces + defined on the TA3-2 API. + + Attributes + ---------- + version: str + A str that represents the version of the Ta3-2 api that is supporting. + user_agents: dict() + A simple dictionary that keep the relation of the different users. + manager: ExecutionManger + Schedules the searches, and all resources related with the search. 
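+
+    Examples
+    --------
+    An illustrative sketch of a TA3 client talking to this servicer once the ``Server``
+    wrapper below is listening on its default insecure port; ``CoreStub`` and
+    ``HelloRequest`` are the stub and message generated by the TA3TA2 API and are
+    assumed here, not defined in this module:
+
+        channel = grpc.insecure_channel('localhost:45042')
+        stub = core_pb2_grpc.CoreStub(channel)
+        print(stub.Hello(core_pb2.HelloRequest()))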
+ """ + + def __init__(self): + logger.info('########## Initializing Service ##########') + self.version = core_pb2.DESCRIPTOR.GetOptions().Extensions[core_pb2.protocol_version] + self.n_workers = EnvVars.D3MCPU + if self.n_workers > 7: + self.n_workers = int(self.n_workers/2) + 1 + print('Server n_workers', self.n_workers) + self.backend = RayRunner(random_seed=0, volumes_dir=EnvVars.D3MSTATICDIR, scratch_dir=Path.TEMP_STORAGE_ROOT, + blocklist=PrimitivesList.BlockList, store_results=True, n_workers=self.n_workers) + self.searches = {} + self.request_mapping = {} + self.solutions = {} + self.problem_descriptions = {} + + # TODO add support for templates + def SearchSolutions(self, request, context): + user_agent = request.user_agent + logger.info('method=SearchSolution, agent=%s', user_agent) + + # Checking version of protocol. + if request.version != self.version: + logger.info(' method=SearchSolution, info=Different api version%s', self.version) + + # Types allowed by client + allowed_value_types = list(request.allowed_value_types) + + if not allowed_value_types: + allowed_value_types = ALLOWED_VALUE_TYPES + + problem_description = utils.decode_problem_description(request.problem) + + # Parsing and storing Pipeline Template (store this to a file instead of passing it) + with d3m_utils.silence(): + template = utils.decode_pipeline_description(pipeline_description=request.template, + resolver=Resolver(primitives_blocklist=PrimitivesList.BlockList)) + + time_bound_search = request.time_bound_search + time_bound_search = time_bound_search * 60 + + input_data = [load_data(utils.decode_value(x)) for x in request.inputs] + + search = SearchWrappers.remote(search_class=DataDrivenSearch, problem_description=problem_description, + backend=self.backend, primitives_blocklist=PrimitivesList.BlockList, + ranking_function=dummy_ranking_function, n_workers=self.n_workers) + + request_id = search.get_search_id.remote() + search_id = ray.get(request_id) + + # print('got search_id') + self.searches[search_id] = search + request_id = self.searches[search_id].search_request.remote(time_left=time_bound_search, input_data=input_data) + + self.request_mapping[search_id] = request_id + self.solutions[search_id] = [] + self.problem_descriptions[search_id] = problem_description + response = core_pb2.SearchSolutionsResponse(search_id=search_id) + return response + + def GetSearchSolutionsResults(self, request, context): + search_id = request.search_id + logger.info('method=GetSearchSolutionsResults, search_id=%s', search_id) + request_id = self.request_mapping[search_id] + + progress_start = p_timestamp.Timestamp() + progress_end = p_timestamp.Timestamp() + + all_ticks = 0 + done_ticks = 0 + + # Yield running so the client know the search is running. 
+ progress = core_pb2.Progress(state='RUNNING', status='Running Search', start=progress_start) + response = core_pb2.GetSearchSolutionsResultsResponse(progress=progress) + yield response + + has_solution = False + + succeed_pipelines = ray.get(request_id) + time_left_id = self.searches[search_id].get_time_left.remote() + time_left = ray.get(time_left_id) + + while True: + start_time = time.time() + + # if no time left we stop + if time_left < 5: + break + + # case if a signal from EndSolution is sent to stop the search + is_active_id = self.searches[search_id].is_search_active.remote() + is_active = ray.get(is_active_id) + + if not is_active: + logger.info('method=GetSearchSolutionsResults, search_id={} message=SearchStopped'.format(search_id)) + break + + for succeed_pipeline in succeed_pipelines: + has_solution = True + logger.info('method=GetSearchSolutionsResults, search_id={} solution_id={}'.format( + search_id,succeed_pipeline.pipeline.id)) + response = core_pb2.GetSearchSolutionsResultsResponse( + progress=progress, + done_ticks=done_ticks, + all_ticks=all_ticks, + solution_id=succeed_pipeline.pipeline.id, + internal_score=1-succeed_pipeline.rank, + scores=[core_pb2.SolutionSearchScore(scores=encode_scores(succeed_pipeline))] + ) + self.solutions[search_id].append(succeed_pipeline.pipeline.id) + yield response + + finished, running = ray.wait([request_id], timeout=1) + + if finished: + succeed_pipelines = ray.get(request_id) + request_id = self.searches[search_id].search_request.remote(time_left=time_left) + else: + succeed_pipelines = [] + + time.sleep(1) + + time_left -= time.time() - start_time + + if has_solution: + progress_state = 'COMPLETED' + progress_status = 'Search completed' + else: + progress_state = 'ERRORED' + progress_status = 'No solution founded' + + logger.info('method=GetSearchSolutionsResults, search_id={}, status={}, message={}'.format( + search_id, progress_state, progress_status) + ) + progress_end.GetCurrentTime() + progress = core_pb2.Progress(state=progress_state, status=progress_status, + start=progress_start, end=progress_end) + response = core_pb2.GetSearchSolutionsResultsResponse(progress=progress, done_ticks=done_ticks, + all_ticks=all_ticks,) + yield response + + def EndSearchSolutions(self, request, context): + search_id = request.search_id + logger.info('method=EndSearchSolutions search_id=%s', search_id) + ray.kill(self.searches[search_id]) + del self.searches[search_id] + response = core_pb2.EndSearchSolutionsResponse() + return response + + def StopSearchSolutions(self, request, context): + search_id = request.search_id + self.searches[search_id].end_search.remote() + logger.info('method=StopSearchSolutions search_id=%s', search_id) + response = core_pb2.StopSearchSolutionsResponse() + return response + + def DescribeSolution(self, request, context): + solution_id = request.solution_id + logger.info('method=DescribeSolution, solution_id=%s', solution_id) + + pipeline, _, _ = self.get_solution_problem(solution_id) + if pipeline is None: + logger.info('method=DescribeSolution, solution_id=%s, error=Solution_id not found', solution_id) + response = core_pb2.DescribeSolutionResponse() + return response + + with d3m_utils.silence(): + pipeline = utils.encode_pipeline_description(pipeline, ALLOWED_VALUE_TYPES, Path.TEMP_STORAGE_ROOT) + + response = core_pb2.DescribeSolutionResponse(pipeline=pipeline) + return response + + def ScoreSolution(self, request, context): + solution_id = request.solution_id + logger.info('method=SocreSolution, 
solution_id=%s', solution_id) + + pipeline, problem_description, _ = self.get_solution_problem(solution_id) + if pipeline is None: + logger.info('method=FitSolution, solution_id=%s, status=ERRORED, error=Solution_id not found', solution_id) + response = core_pb2.ScoreSolutionResponse() + return response + + input_data = [load_data(utils.decode_value(x)) for x in request.inputs] + metrics = [utils.decode_performance_metric(metric) for metric in request.performance_metrics] + scoring_pipeline = schemas_utils.get_scoring_pipeline() + data_preparation_params = decode_scoring_configuration(request.configuration) + data_preparation_pipeline = schemas_utils.get_splitting_pipeline(data_preparation_params['method']) + + request_id = self.backend.evaluate_pipeline_request( + problem_description=problem_description, pipeline=pipeline, input_data=input_data, + metrics=metrics, data_preparation_pipeline=data_preparation_pipeline, + scoring_pipeline=scoring_pipeline, data_preparation_params=data_preparation_params) + + response = core_pb2.ScoreSolutionResponse(request_id=request_id) + return response + + def GetScoreSolutionResults(self, request, context): + request_id = request.request_id + logger.info('method=GetScoreSolutionResults, request_id=%s', request_id) + + progress_start = p_timestamp.Timestamp() + progress_end = p_timestamp.Timestamp() + progress_start.GetCurrentTime() + + progress = core_pb2.Progress(state='RUNNING', status='Running score job', start=progress_start) + response = core_pb2.GetScoreSolutionResultsResponse(progress=progress) + yield response + + pipeline_result = self.backend.get_request(request_id) + progress_end.GetCurrentTime() + + if pipeline_result.error is None: + progress = core_pb2.Progress( + state='COMPLETED', + status='Score job COMPLETED', + start=progress_start, + end=progress_end + ) + + response = core_pb2.GetScoreSolutionResultsResponse( + progress=progress, scores=encode_scores(pipeline_result)) + else: + progress = core_pb2.Progress( + state='ERRORED', + status=str(pipeline_result.error), + start=progress_start, + end=progress_end + ) + + response = core_pb2.GetScoreSolutionResultsResponse(progress=progress) + yield response + return + + def FitSolution(self, request, context): + solution_id = request.solution_id + logger.info('method=FitSolution solution_id=%s', solution_id) + + pipeline, problem_description, _ = self.get_solution_problem(solution_id) + if pipeline is None: + logger.info('method=FitSolution, solution_id=%s, status=ERRORED, error=Solution_id not found', solution_id) + response = core_pb2.FitSolutionResponse() + return response + + input_data = [load_data(utils.decode_value(x)) for x in request.inputs] + + expose_outputs = [expose_output for expose_output in request.expose_outputs] + if expose_outputs: + expose_outputs = True + else: + expose_outputs = False + + request_id = self.backend.fit_pipeline_request( + problem_description=problem_description, pipeline=pipeline, + input_data=input_data, expose_outputs=expose_outputs + ) + + response = core_pb2.FitSolutionResponse(request_id=request_id) + return response + + def GetFitSolutionResults(self, request, context): + request_id = request.request_id + logger.info('method=GetFitSolutionResults request_id=%s', request_id) + + progress_start = p_timestamp.Timestamp() + progress_end = p_timestamp.Timestamp() + progress_start.GetCurrentTime() + + progress = core_pb2.Progress(state='RUNNING', status='Running fit job', start=progress_start) + response = 
core_pb2.GetFitSolutionResultsResponse(progress=progress) + yield response + + pipeline_result = self.backend.get_request(request_id) + progress_end.GetCurrentTime() + + if pipeline_result.error is None: + progress = core_pb2.Progress( + state='COMPLETED', + status='Fit job COMPLETED', + start=progress_start, + end=progress_end + ) + response = core_pb2.GetFitSolutionResultsResponse( + progress=progress, steps=[], exposed_outputs=encode_exposed_values(pipeline_result.exposed_outputs), + fitted_solution_id=pipeline_result.fitted_pipeline_id + ) + else: + progress = core_pb2.Progress( + state='ERRORED', + status=str(pipeline_result.error), + start=progress_start, + end=progress_end + ) + + response = core_pb2.GetFitSolutionResultsResponse(progress=progress) + yield response + return + + def ProduceSolution(self, request, context): + fitted_solution_id = request.fitted_solution_id + logger.info('method=ProduceSolution, fitted_solution_id=%s', fitted_solution_id) + + if not self.backend.fitted_pipeline_id_exists(fitted_solution_id): + logger.info( + 'method=ProduceSolution, fitted_solution_id=%s, status=ERRORED info=No fitted_solution_id found', fitted_solution_id) + response = core_pb2.ProduceSolutionResponse() + return response + + input_data = [load_data(utils.decode_value(x)) for x in request.inputs] + + expose_outputs = [expose_output for expose_output in request.expose_outputs] + if expose_outputs: + expose_outputs = True + else: + expose_outputs = False + + request_id = self.backend.produce_pipeline_request(fitted_pipeline_id=fitted_solution_id, + input_data=input_data, expose_outputs=expose_outputs) + response = core_pb2.ProduceSolutionResponse(request_id=request_id) + return response + + # TODO add expose_outputs to files + def GetProduceSolutionResults(self, request, context): + request_id = request.request_id + logger.info('method=GetProduceSolutionResults, request_id=%s', request_id) + + progress_start = p_timestamp.Timestamp() + progress_end = p_timestamp.Timestamp() + progress_start.GetCurrentTime() + + progress = core_pb2.Progress(state='RUNNING', status='Running produce job', start=progress_start) + response = core_pb2.GetProduceSolutionResultsResponse(progress=progress) + yield response + + pipeline_result = self.backend.get_request(request_id) + progress_end.GetCurrentTime() + + if pipeline_result.error is None: + progress = core_pb2.Progress( + state='COMPLETED', + status='Produce job COMPLETED', + start=progress_start, + end=progress_end + ) + step_progress = [] + + response = core_pb2.GetProduceSolutionResultsResponse( + progress=progress, steps=step_progress, exposed_outputs=encode_exposed_values(pipeline_result.exposed_outputs)) + else: + progress = core_pb2.Progress( + state='ERRORED', + status=str(pipeline_result.error), + start=progress_start, + end=progress_end + ) + + response = core_pb2.GetProduceSolutionResultsResponse(progress=progress) + yield response + return + + def SolutionExport(self, request, context): + solution_id = request.solution_id + rank = request.rank + + try: + pipeline, _, search_id = self.get_solution_problem(solution_id) + except: + pipeline = None + + if pipeline is None: + logger.info('method=SolutionExport, solution_id=%s, status=ERRORED, error=No solution_id found', solution_id) + else: + logger.info('method=SolutionExport solution_id=%s', solution_id) + save_pipeline(pipeline, SearchPath(search_id).pipelines_ranked, rank=rank) + response = core_pb2.SolutionExportResponse() + return response + + # def SaveSolution(self, request, context): + 
# solution_id = request.solution_id + # logger.info('method=SaveSolution solution_id=%s', solution_id) + # + # if solution_id not in self.manager.solutions: + # logger.info('method=SaveSolution, solution_id=%s, error=Solution_id not found', solution_id) + # response = core_pb2.SaveSolutionResponse() + # else: + # solution_uri = self.manager.save_solution(solution_id) + # response = core_pb2.SaveSolutionResponse(solution_uri=solution_uri) + # return response + + # def LoadSolution(self, request, context): + # solution_uri = request.solution_uri + # logger.info('method=LoadSolution solution_uri=%s', solution_uri) + # + # if not os.path.exists(solution_uri): + # logger.info('method=LoadSolution, solution_uri=%s, error=solution_uri not found', solution_uri) + # response = core_pb2.LoadSolutionResponse() + # else: + # solution_id = self.manager.load_solution(solution_uri) + # response = core_pb2.LoadSolutionResponse(solution_id=solution_id) + # return response + + # def SaveFittedSolution(self, request, context): + # fitted_solution_id = request.fitted_solution_id + # logger.info('method=SaveFittedSolution, fitted_solution_id=%s', fitted_solution_id) + # + # if fitted_solution_id not in self.manager.fitted_solutions: + # logger.info('method=SaveFittedSolution, fitted_solution_id=%s, status=ERRORED, ' + # 'info=No fitted_solution_id found', fitted_solution_id) + # response = core_pb2.SaveFittedSolutionResponse() + # else: + # fitted_solution_uri = self.manager.save_fitted_solution(fitted_solution_id) + # response = core_pb2.SaveFittedSolutionResponse(fitted_solution_uri=fitted_solution_uri) + # return response + + # def LoadFittedSolution(self, request, context): + # fitted_solution_uri = request.fitted_solution_uri + # logger.info('method=LoadFittedSolution solution_uri=%s', fitted_solution_uri) + # + # if not os.path.exists(fitted_solution_uri): + # logger.info('method=LoadFittedSolution, solution_uri=%s, error=solution_uri not found', fitted_solution_uri) + # response = core_pb2.LoadFittedSolutionResponse() + # else: + # fitted_solution_id = self.manager.load_fitted_solution(fitted_solution_uri) + # response = core_pb2.LoadFittedSolutionResponse(fitted_solution_id=fitted_solution_id) + # return response + + # def ScorePredictions(self, request, context): + # logger.info('method=ScorePredictions') + # predictions = utils.decode_value(request.predictions) + # score_input = utils.decode_value(request.score_input) + # problem = utils.decode_problem_description(request.problem) + # metrics = [utils.decode_performance_metric(_metric) for _metric in request.metric] + # + # scores, score_result = self.manager.score_predictions(predictions, score_input, problem, metrics) + # if score_result.has_error(): + # logger.info('method=ScorePredictions, error={}', score_result.error) + # response = core_pb2.ScorePredictionsResponse() + # else: + # scores = self.encode_scores(scores) + # response = core_pb2.ScorePredictionsResponse(scores=scores) + # return response + + def DataAvailable(self, request, context): + user_agent = request.user_agent + version = request.version + time_bound = request.time_bound + + logger.info('method=DataAvailable, agent={}, version={}, time_bound={}'.format( + user_agent, version, time_bound)) + response = core_pb2.DataAvailableResponse() + return response + + def SplitData(self, request, context): + input_data = [load_data(utils.decode_value(x)) for x in request.inputs] + scoring_configuration = decode_scoring_configuration(request.scoring_configuration) + problem_description = 
utils.decode_problem_description(request.problem) + data_pipeline = schemas_utils.get_splitting_pipeline(scoring_configuration['method']) + + data_random_seed = 0 + outputs, data_result = runtime_module.prepare_data( + data_pipeline=data_pipeline, problem_description=problem_description, + inputs=input_data, data_params=scoring_configuration, context=Context.TESTING, random_seed=data_random_seed, + volumes_dir=EnvVars.D3MSTATICDIR, scratch_dir=Path.TEMP_STORAGE_ROOT, runtime_environment=None, + ) + + if data_result.has_error(): + logger.info('method=SplitData, error={}', data_result.error) + response = core_pb2.SplitDataResponse() + yield response + return + else: + for i, (train_output, test_output, score_output) in enumerate(zip(*outputs)): + uri_list = [] + for output, tag in ( + (train_output, 'train'), + (test_output, 'test'), + (score_output, 'score'), + ): + path = os.path.join( + Path.TEMP_STORAGE_ROOT, '{}_output_{}'.format(tag, i), 'datasetDoc.json') + uri = get_uri(path) + output.save(uri) + uri_list.append(uri) + # response + response = core_pb2.SplitDataResponse( + train_output=value_pb2.Value(dataset_uri=uri_list[0]), + test_output=value_pb2.Value(dataset_uri=uri_list[1]), + score_output=value_pb2.Value(dataset_uri=uri_list[2]), + ) + yield response + + def ListPrimitives(self, request, context): + logger.info('method=ListPrimitives') + primitives_list = [] + for primitive_info in PRIMITIVES_LIST: + primitives_list.append(primitive_pb2.Primitive(**primitive_info)) + response = core_pb2.ListPrimitivesResponse(primitives=primitives_list) + return response + + def Hello(self, request, context): + logger.info('method=Hello') + user_agent = AGENT + version = core_pb2.DESCRIPTOR.GetOptions().Extensions[core_pb2.protocol_version] + allowed_value_types = ALLOWED_VALUE_TYPES + supported_extensions = SUPPORTED_EXTENSIONS + + response = core_pb2.HelloResponse( + user_agent=user_agent, + version=version, + allowed_value_types=allowed_value_types, + supported_extensions=supported_extensions + ) + return response + + def get_solution_problem(self, solution_id): + describe_search_id = None + for search_id, solution_ids in self.solutions.items(): + if solution_id in solution_ids: + describe_search_id = search_id + break + + if describe_search_id is None: + return None, None, None + + solution_path = os.path.join(SearchPath(describe_search_id).pipelines_scored, '{}.json'.format(solution_id)) + + with d3m_utils.silence(): + pipeline = load_pipeline(solution_path) + + problem_description = self.problem_descriptions[describe_search_id] + return pipeline, problem_description, describe_search_id + + +def encode_exposed_values(exposed_values): + encoded_exposed_values = {} + for name, value in exposed_values.items(): + if '.csv' in value: + encoded_exposed_values[name] = utils.encode_value( + {'type': 'csv_uri', 'value': get_uri(value)}, ALLOWED_VALUE_TYPES, Path.TEMP_STORAGE_ROOT) + elif '.json' in value: + encoded_exposed_values[name] = utils.encode_value( + {'type': 'dataset_uri', 'value': get_uri(value)}, ALLOWED_VALUE_TYPES, Path.TEMP_STORAGE_ROOT) + return encoded_exposed_values + + +def decode_scoring_configuration(scoring_configuration): + """ + Decode a scoring configuration from grpc + + Parameters + ---------- + scoring_configuration: core_pb2.ScoringConfiguration + A grpc ScoringConfiguration message. + + Returns + ------- + configuration: dict + A dictionary with the scoring configuration. 
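+
+    Examples
+    --------
+    Illustrative only, assuming protobuf defaults for the unset fields: a ``K_FOLD``
+    message with ``folds=5`` and ``shuffle=True`` decodes to roughly:
+
+        {
+            'method': 'K_FOLD',
+            'train_score_ratio': '0.0',
+            'stratified': 'false',
+            'shuffle': 'true',
+            'randomSeed': '0',
+            'number_of_folds': '5',
+        }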
+ """ + method = scoring_configuration.method + configuration = { + 'method': method, + 'train_score_ratio': str(scoring_configuration.train_test_ratio), + 'stratified': str(scoring_configuration.stratified).lower(), + 'shuffle': str(scoring_configuration.shuffle).lower(), + 'randomSeed': str(scoring_configuration.random_seed), + } + if method == 'K_FOLD': + configuration['number_of_folds'] = str(scoring_configuration.folds) + return configuration + + +def load_data(data): + if data['type'] == 'dataset_uri': + return container.dataset.get_dataset(data['value']) + + +def get_uri(path): + return pathlib.Path(os.path.abspath(path)).as_uri() + + +def encode_scores(pipeline_result): + """ + Encode a dict of scores to a GRPC message + + Parameters + ---------- + pipeline_result + A pipeline_result instance that contains the scores and rank to be encoded. + + Returns + ------- + score_message: GRPC + A GRPC message + """ + ranking = { + 'metric': 'RANK', + 'value': pipeline_result.rank, + 'randomSeed': 0, + 'fold': 0, + } + + all_scores = pipeline_result.scores.append(ranking, ignore_index=True) + + scores = list() + for score in all_scores.to_dict('index').values(): + score['random_seed'] = score['randomSeed'] + try: + score['metric'] = {'metric': score['metric']} + except: + score['metric'] = {'metric': problem_module.PerformanceMetric[score['metric']]} + + scores.append(utils.encode_score(score, ALLOWED_VALUE_TYPES, Path.TEMP_STORAGE_ROOT)) + return scores + + +def encode_scoring_configuration(scoring_configuration): + """ + Decode a scoring configuration from grpc + + Parameters + ---------- + scoring_configuration: dict + A dictionary with the scoring configuration. + + Returns + ------- + scoring_configuration: core_pb2.ScoringConfiguration + A grpc ScoringConfiguration message. 
+ """ + if scoring_configuration is None: + return core_pb2.ScoringConfiguration() + else: + method = scoring_configuration['method'] + folds = scoring_configuration.get('number_of_folds', None) + if folds is not None: + folds = int(folds) + train_test_ratio = scoring_configuration.get('train_score_ratio', None) + if train_test_ratio is not None: + train_test_ratio = float(train_test_ratio) + shuffle = scoring_configuration.get('shuffle', None) + if shuffle is not None: + shuffle = json.loads(shuffle.lower()) + random_seed = scoring_configuration.get('randomSeed', None) + if random_seed is not None: + random_seed = int(random_seed) + stratified = scoring_configuration.get('stratified', None) + if stratified is not None: + stratified = json.loads(stratified.lower()) + return core_pb2.ScoringConfiguration(method=method, folds=folds, train_test_ratio=train_test_ratio, + shuffle=shuffle, random_seed=random_seed, stratified=stratified) + + +class Server: + def __init__(self, arguments): + self.server = grpc.server(futures.ThreadPoolExecutor(max_workers=10)) + self.core = Core() + + core_pb2_grpc.add_CoreServicer_to_server(self.core, self.server) + self.server.add_insecure_port('[::]:45042') + + def start(self): + self.server.start() + + def stop(self): + self.server.stop(0) + + +def configure_parser(parser, *, skip_arguments=()): + parser.add_argument( + '-o', '--output-path', type=str, default=os.path.join(os.getcwd(), "output/"), + help="path where the outputs would be stored" + ) + parser.add_argument( + '-v', '--verbose', type=bool, default=True, + help="Display detailed log" + ) + + +def main(): + ray.init(webui_host='127.0.0.1') + # Creating parser + parser = argparse.ArgumentParser(description="Starts server from command line") + configure_parser(parser) + arguments = parser.parse_args() + + # Setup logger + verbose_format = '%(asctime)s %(levelname)-8s %(processName)-15s [%(filename)s:%(lineno)d] %(message)s' + concise_format = '%(asctime)s %(levelname)-8s %(message)s' + log_format = verbose_format if arguments.verbose else concise_format + logging.basicConfig(format=log_format, + handlers=[logging.StreamHandler(), + logging.FileHandler('{}/d3m.log'.format(Path.TEMP_STORAGE_ROOT), 'w', 'utf-8')], + datefmt='%m/%d %H:%M:%S') + root_logger = logging.getLogger() + root_logger.setLevel(logging.INFO) + warnings.filterwarnings('ignore') + + server = Server(arguments) + + try: + load_time = time.time() + server.start() + with d3m_utils.silence(): + d3m_index.load_all(blocklist=PrimitivesList.BlockList) + print('Wait for loading workers for', len(d3m_index.search())*0.3) + time.sleep(len(d3m_index.search())*0.3) + # time.sleep(5) + logger.info('---------- Waiting for Requests ----------') + while True: + time.sleep(_ONE_DAY_IN_SECONDS) + except KeyboardInterrupt: + logger.info('############ STOPPING SERVICE ############') + server.stop() + + +if __name__ == '__main__': + main() diff --git a/axolotl/axolotl/predefined_pipelines/__init__.py b/axolotl/axolotl/predefined_pipelines/__init__.py new file mode 100644 index 0000000..d6eba82 --- /dev/null +++ b/axolotl/axolotl/predefined_pipelines/__init__.py @@ -0,0 +1,133 @@ +import json +import os +import uuid + +import copy +from d3m import index +from d3m.metadata.base import ArgumentType +from d3m.metadata.pipeline import PrimitiveStep +from d3m.container import DataFrame +from d3m import utils as d3m_utils + +from axolotl.predefined_pipelines import preprocessor +from axolotl.utils import pipeline as pipeline_utils, schemas as schemas_utils + 
+__all__ = ['fetch', 'fetch_from_file'] + + +def fetch(input_data, problem_description, predefined_path=None): + if predefined_path is None: + root = os.path.join(os.path.dirname(__file__), '../..') + predefined_path = os.path.join(root, 'axolotl', 'utils', 'resources', 'default_pipelines.json') + # ToDo should use yield + pipelines = list() + pipelines_from_file = fetch_from_file(problem_description, path=predefined_path) + pipelines_from_preprocessors = _fetch_from_preprocessors(input_data, problem_description) + for candiate in ( + pipelines_from_file, + pipelines_from_preprocessors, + ): + pipelines.extend(candiate) + return pipelines + + +def fetch_from_file(problem_description, path): + # ToDo should use yield + task_type, task_subtype, data_types, semi = _get_task_description(problem_description) + + pipelines = [] + with open(path) as file: + possible_pipelines = json.load(file) + with d3m_utils.silence(): + for task_type_in_file, pipeline_infos in possible_pipelines.items(): + if task_type_in_file == task_type: + for pipeline_info in pipeline_infos: + pipeline = pipeline_utils.load_pipeline(pipeline_info) + pipelines.append(pipeline) + return pipelines + + +def _fetch_from_preprocessors(input_data, problem_description): + task_type, task_subtype, data_types, semi = _get_task_description(problem_description) + primitive_candidates = pipeline_utils.get_primitive_candidates(task_type, data_types, semi) + + mapped_task_type = schemas_utils.get_task_mapping(task_type) + if mapped_task_type != task_type: + primitive_candidates += pipeline_utils.get_primitive_candidates(mapped_task_type, data_types, semi) + + pipelines = [] + for primitive_info in primitive_candidates: + if not check_primitive_dataframe_input(primitive_info): + continue + pps = preprocessor.get_preprocessor( + input_data=input_data, problem=problem_description, treatment=primitive_info[1] + ) + for pp in pps: + pipeline_description = copy.deepcopy(pp.pipeline_description) + pipeline_description.id = str(uuid.uuid4()) + pipeline = _complete_pipeline( + pipeline_description=pipeline_description, + dataframe_step=pp.dataset_to_dataframe_step, + primitive_info=primitive_info, + attributes=pp.attributes, + targets=pp.targets, + resolver=pp.resolver + ) + pipelines.append(pipeline) + return pipelines + + +def check_primitive_dataframe_input(primitive_info): + primitive, _ = primitive_info + primitive_arguments = primitive.metadata.query()['primitive_code']['arguments'] + if 'inputs' in primitive_arguments and primitive_arguments['inputs']['type'] == DataFrame: + return True + else: + return False + + +def get_primitive(name): + primitive = index.get_primitive(name) + return primitive + + +def _complete_pipeline(pipeline_description, dataframe_step, attributes, targets, resolver, primitive_info): + primitive, specific_primitive = primitive_info + construct_prediction = 'd3m.primitives.data_transformation.construct_predictions.Common' + construct_prediction_primitive = get_primitive(construct_prediction) + + _add_primitive_to_pipeline(pipeline_description, primitive, resolver, attributes, targets) + _add_primitive_to_pipeline(pipeline_description, construct_prediction_primitive, resolver, + dataframe_step=dataframe_step) + # Get the last step for the output + last_step_idx = len(pipeline_description.steps) - 1 + output = pipeline_utils.int_to_step(last_step_idx) + + # Adding output step to the pieline + pipeline_description.add_output(name='Predictions from the input dataset', data_reference=output) + return pipeline_description 
+ + +def _add_primitive_to_pipeline(pipeline_description, primitive, resolver, attributes=None, targets=None, + dataframe_step=None): + step_model = PrimitiveStep(primitive=primitive, resolver=resolver) + + if dataframe_step is None: + step_model.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference=attributes) + step_model.add_argument(name='outputs', argument_type=ArgumentType.CONTAINER, data_reference=targets) + else: + last_step_idx = len(pipeline_description.steps) - 1 + step_model.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, + data_reference=pipeline_utils.int_to_step(last_step_idx)) + step_model.add_argument(name='reference', argument_type=ArgumentType.CONTAINER, data_reference=dataframe_step) + step_model.add_output('produce') + pipeline_description.add_step(step_model) + + +def _get_task_description(problem_description): + task_description = schemas_utils.get_task_description(problem_description['problem']['task_keywords']) + task_type = task_description['task_type'] + task_subtype = task_description['task_subtype'] + data_types = task_description['data_types'] + semi = task_description['semi'] + return task_type, task_subtype, data_types, semi diff --git a/axolotl/axolotl/predefined_pipelines/base_preprocessor.py b/axolotl/axolotl/predefined_pipelines/base_preprocessor.py new file mode 100644 index 0000000..60130b2 --- /dev/null +++ b/axolotl/axolotl/predefined_pipelines/base_preprocessor.py @@ -0,0 +1,278 @@ +import typing + +import abc +from d3m import index +from d3m.metadata.base import Context, ArgumentType +from d3m.metadata.pipeline import Pipeline, Resolver, PrimitiveStep + +from axolotl.utils import pipeline as pipeline_utils + +DEFAULT_OUTPUT = '.' + + +class Preprocessor(abc.ABC): + task: str + treatment: str + expected_data_types: set + unsupported_data_types: set + semi: bool + + def __init__(self, metadata, main_resource, data_types, loaded_primitives, problem=None, start_resource='inputs.0'): + self.metadata = metadata + self.main_resource = main_resource + self.data_types = data_types + self.loaded_primitives = loaded_primitives + self.start_resource = start_resource + self.problem = problem + # Creating pipeline + pipeline_description = Pipeline(context=Context.TESTING) + pipeline_description.add_input(name='inputs') + self.pipeline = pipeline_description + self.d2d_step = None + self.attr_step = None + self.targ_step = None + self._generate_pipeline() + + def __init_subclass__(cls, task: str, treatment: str, expected_data_types: set, **kargs): + cls.task = task + cls.treatment = treatment + cls.expected_data_types = expected_data_types + cls.unsupported_data_types = kargs['unsupported_data_types'] if 'unsupported_data_types' in kargs else None + cls.semi = kargs['semi'] if 'semi' in kargs else False + + @classmethod + def check_task_treatment(cls, task, treatment): + if not cls.task: + return True + if not cls.treatment: + return cls.task == task + return cls.task == task and cls.treatment == treatment + + @classmethod + def check_expected_data_types(cls, data_types): + if not cls.expected_data_types: + return True + return any(data_type in cls.expected_data_types for data_type in data_types) + + @classmethod + def check_unsupported_data_types(cls, data_types): + if not cls.unsupported_data_types: + return True + return not any(data_type in cls.unsupported_data_types for data_type in data_types) + + @property + def pipeline_description(self) -> Pipeline: + return self.pipeline + + @property + def 
dataset_to_dataframe_step(self) -> typing.Optional[str]: + return self.get_output_str(self.d2d_step) if self.d2d_step else None + + @property + def attributes(self) -> typing.Optional[str]: + return self.get_output_str(self.attr_step) if self.attr_step else None + + @property + def targets(self) -> typing.Optional[str]: + return self.get_output_str(self.targ_step) if self.targ_step else None + + @property + def resolver(self) -> Resolver: + return pipeline_utils.BlackListResolver() + + @abc.abstractmethod + def _generate_pipeline(self): + raise NotImplementedError() + + @property + def gpu_budget(self) -> float: + return 0 + + def get_primitive(self, name): + primitive = index.get_primitive(name) + self.download_static_files(primitive) + return primitive + + def common_boilerplate(self): + """ + This boilerplate adds the basic initial steps of the pipeline: an optional denormalize step, a + dataset_to_dataframe step and a simple profiler step. It takes no arguments; everything it needs + is read from the instance attributes. + """ + metadata = self.metadata + main_resource_id = self.main_resource + start_resource = self.start_resource + + # if there is more than one resource we denormalize + if len(metadata.get_elements(())) > 1: + start_resource = self.add_denormalize_step(start_resource, main_resource_id) + + # Finally we convert the dataset to a dataframe. + dtd_step = self.add_dataset_to_dataframe_step(start_resource) + + simple_profiler_step = self.add_primitive_to_pipeline( + primitive=self.loaded_primitives['SimpleProfiler'], + attributes=dtd_step, + hyperparameters=[ + ('categorical_max_ratio_distinct_values', ArgumentType.VALUE, 1), + ('categorical_max_absolute_distinct_values', ArgumentType.VALUE, None) + ] + ) + self.set_d2d_step(simple_profiler_step) + + def tabular_common(self, target_at_column_parser=False): + self.common_boilerplate() + + # Simple preprocessor + attributes, targets = self.base(target_at_column_parser=target_at_column_parser) + + # Adding Imputer + imputer = self.add_imputer(attributes=attributes) + + attributes = self.add_simple_text_handler(imputer, targets) + self.set_attribute_step(attributes) + self.set_target_step(targets) + + def base(self, target_at_column_parser=False, exclude_attr_columns=None): + dataset_dataframe_step_pos = self.d2d_step + + # Step 2: ColumnParser + column_parser_step = self.add_column_parser_step(data_reference=dataset_dataframe_step_pos) + + # Step 3: ExtractAttributes + attributes_step = self.add_extract_col_by_semantic_types_step( + column_parser_step, + ['https://metadata.datadrivendiscovery.org/types/Attribute'], + exclude_attr_columns + ) + target_source = column_parser_step if target_at_column_parser else dataset_dataframe_step_pos + + # Step 4: ExtractTargets + targets_step = self.add_extract_col_by_semantic_types_step( + target_source, + ['https://metadata.datadrivendiscovery.org/types/TrueTarget'] + ) + return attributes_step, targets_step + + def add_imputer(self, attributes): + # SklearnImputer + primitive = self.loaded_primitives['Imputer'] + configuration = \ + primitive.metadata.query()['primitive_code']['class_type_arguments']['Hyperparams'].configuration + hyperparameters = [] + if 'return_result' in configuration: + hyperparameters.append( + ('return_result', ArgumentType.VALUE, 'replace') + ) + if 'use_semantic_types' in configuration: + hyperparameters.append( + ('use_semantic_types', ArgumentType.VALUE, True) + ) + hyperparameters.append( + ('error_on_no_input',
ArgumentType.VALUE, False) + ) + imputer = self.add_primitive_to_pipeline( + primitive=primitive, + attributes=attributes, + hyperparameters=hyperparameters + ) + return imputer + + def add_extract_col_by_semantic_types_step(self, data_reference, target_semantic_types, exclude_columns=None): + if exclude_columns: + hyperparameters = [ + ('exclude_columns', ArgumentType.VALUE, exclude_columns), + ('semantic_types', ArgumentType.VALUE, target_semantic_types) + ] + else: + hyperparameters = [ + ('semantic_types', ArgumentType.VALUE, target_semantic_types) + ] + step = self.add_primitive_to_pipeline( + primitive=self.loaded_primitives['ExtractColumnsBySemanticTypes'], + attributes=data_reference, + hyperparameters=hyperparameters + ) + return step + + def add_denormalize_step(self, start_resource, data): + denormalize_step = self.add_primitive_to_pipeline( + primitive=self.loaded_primitives['Denormalize'], + attributes=start_resource, + hyperparameters=[ + ('starting_resource', ArgumentType.VALUE, data) + ] + ) + return denormalize_step + + def add_dataset_to_dataframe_step(self, start_resource): + d2d_step = self.add_primitive_to_pipeline( + primitive=self.loaded_primitives['DatasetToDataFrame'], + attributes=start_resource + ) + return d2d_step + + def add_column_parser_step(self, data_reference, to_parse=None): + if to_parse: + hyperparameters = [ + ('parse_semantic_types', ArgumentType.VALUE, to_parse) + ] + else: + hyperparameters = [] + column_parser = self.add_primitive_to_pipeline( + primitive=self.loaded_primitives['ColumnParser'], + attributes=data_reference, + hyperparameters=hyperparameters + ) + return column_parser + + def add_simple_text_handler(self, attributes, targets): + text_encoder = self.add_primitive_to_pipeline( + primitive=self.loaded_primitives['TextEncoder'], + attributes=attributes, + hyperparameters=[ + ('encoder_type', ArgumentType.VALUE, 'tfidf') + ], + targets=targets + ) + return text_encoder + + def download_static_files(self, primitive): + primitive_metadata = primitive.metadata.query() + output = DEFAULT_OUTPUT + redownload = False + index.download_files(primitive_metadata, output, redownload) + + def add_primitive_to_pipeline(self, primitive, attributes, hyperparameters=[], targets=None, + produce_collection=False): + inputs_ref = attributes if isinstance(attributes, str) else self.get_output_str(attributes) + step = PrimitiveStep(primitive=primitive, resolver=self.resolver) + step.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference=inputs_ref) + for hyperparam in hyperparameters: + name, argument_type, data = hyperparam + step.add_hyperparameter(name=name, argument_type=argument_type, data=data) + if targets: + outputs_ref = targets if isinstance(targets, str) else self.get_output_str(targets) + step.add_argument(name='outputs', argument_type=ArgumentType.CONTAINER, data_reference=outputs_ref) + step.add_output('produce') + if produce_collection: + step.add_output('produce_collection') + self.pipeline.add_step(step) + return step + + def get_output_str(self, step): + return pipeline_utils.int_to_step(step.index) + + def set_attribute_step(self, attributes): + self.attr_step = attributes + + def set_target_step(self, targets): + self.targ_step = targets + + def set_d2d_step(self, dataset_2_dataframe): + self.d2d_step = dataset_2_dataframe \ No newline at end of file diff --git a/axolotl/axolotl/predefined_pipelines/preprocessor.py b/axolotl/axolotl/predefined_pipelines/preprocessor.py new file mode 100644 index 0000000..907f5d3 
--- /dev/null +++ b/axolotl/axolotl/predefined_pipelines/preprocessor.py @@ -0,0 +1,350 @@ +from d3m import index +from d3m.metadata import base as metadata_base +from d3m.metadata.base import ArgumentType +from d3m.metadata.problem import TaskKeyword + +from axolotl.predefined_pipelines.base_preprocessor import Preprocessor +from axolotl.utils import pipeline as pipeline_utils, schemas as schemas_utils + + +def get_preprocessor(input_data, problem, treatment): + metadata = input_data.metadata + task_description = schemas_utils.get_task_description(problem['problem']['task_keywords']) + task_type = task_description['task_type'] + semi = task_description['semi'] + data_types = task_description['data_types'] + task = pipeline_utils.infer_primitive_family(task_type=task_type, data_types=data_types, is_semi=semi) + main_resource = pipeline_utils.get_tabular_resource_id(dataset=input_data) + + # Loading primitives + primitives = { + 'DatasetToDataFrame': 'd3m.primitives.data_transformation.dataset_to_dataframe.Common', + 'ColumnParser': 'd3m.primitives.data_transformation.column_parser.Common', + 'ExtractColumnsBySemanticTypes': 'd3m.primitives.data_transformation.extract_columns_by_semantic_types.Common', + 'Denormalize': 'd3m.primitives.data_transformation.denormalize.Common', + 'Imputer': 'd3m.primitives.data_cleaning.imputer.SKlearn', + 'SimpleProfiler': 'd3m.primitives.schema_discovery.profiler.Common', + 'TextEncoder': 'd3m.primitives.data_transformation.encoder.DistilTextEncoder', + } + loaded_primitives = dict() + + try: + for primitive_name in primitives.keys(): + loaded_primitives[primitive_name] = index.get_primitive(primitives[primitive_name]) + except Exception as e: + print("Cannot load primitive {}".format(e)) + + candidates = [] + for preprocessor in preprocessors: + if preprocessor.check_task_treatment(task, treatment) \ + and preprocessor.check_expected_data_types(data_types) \ + and preprocessor.check_unsupported_data_types(data_types): + candidates.append(preprocessor(metadata, main_resource, data_types, loaded_primitives, problem)) + if not candidates: + candidates.append(TabularPreprocessor(metadata, main_resource, data_types, loaded_primitives)) + return candidates + + +class TimeSeriesTabularPreprocessor(Preprocessor, task=metadata_base.PrimitiveFamily.TIME_SERIES_CLASSIFICATION.name, + treatment=metadata_base.PrimitiveFamily.CLASSIFICATION.name, + expected_data_types=None, + unsupported_data_types={TaskKeyword.TABULAR, TaskKeyword.RELATIONAL}): + def _generate_pipeline(self): + time_series_featurization_primitive = self.get_primitive( + 'd3m.primitives.feature_extraction.random_projection_timeseries_featurization.DSBOX' + ) + time_series_to_list_primitive = self.get_primitive( + 'd3m.primitives.data_preprocessing.time_series_to_list.DSBOX' + ) + + # denormalize -> dataset_to_df + self.common_boilerplate() + dataset_to_dataframe_step = self.d2d_step + + # timeseries_to_list + timeseries_tolist_step = self.add_primitive_to_pipeline( + primitive=time_series_to_list_primitive, + attributes=dataset_to_dataframe_step, + ) + # timeseries_featurization + timeseries_featurization_step = self.add_primitive_to_pipeline( + primitive=time_series_featurization_primitive, + attributes=timeseries_tolist_step, + ) + # extract_col_by_semantic + attr_step = self.add_extract_col_by_semantic_types_step( + timeseries_featurization_step, + ['https://metadata.datadrivendiscovery.org/types/Attribute'] + ) + # extract_col_by_semantic + targ_step = self.add_extract_col_by_semantic_types_step( + 
dataset_to_dataframe_step, + ['https://metadata.datadrivendiscovery.org/types/TrueTarget'] + ) + self.set_attribute_step(attr_step) + self.set_target_step(targ_step) + + +class TimeSeriesPreprocessor(Preprocessor, task=metadata_base.PrimitiveFamily.TIME_SERIES_CLASSIFICATION.name, + treatment=metadata_base.PrimitiveFamily.TIME_SERIES_CLASSIFICATION.name, + expected_data_types=None, + unsupported_data_types={TaskKeyword.TABULAR, TaskKeyword.RELATIONAL}): + def _generate_pipeline(self): + time_series_formatter_primitive = self.get_primitive( + 'd3m.primitives.data_preprocessing.data_cleaning.DistilTimeSeriesFormatter' + ) + ts_formatter = self.add_primitive_to_pipeline( + primitive=time_series_formatter_primitive, + attributes=self.start_resource + ) + + dtd_step = self.add_dataset_to_dataframe_step(ts_formatter) + dtd_without_ts_format = self.add_dataset_to_dataframe_step(self.start_resource) + + extract_target_step = self.add_extract_col_by_semantic_types_step( + dtd_without_ts_format, + ['https://metadata.datadrivendiscovery.org/types/TrueTarget'] + ) + target_column_parser_step = self.add_column_parser_step( + extract_target_step, + to_parse=[ + "http://schema.org/Boolean", + "http://schema.org/Integer", + "http://schema.org/Float", + "https://metadata.datadrivendiscovery.org/types/FloatVector" + ] + ) + self.set_d2d_step(dtd_without_ts_format) + self.set_attribute_step(dtd_step) + self.set_target_step(target_column_parser_step) + + +class TimeSeriesForecastingTabularPreprocessor(Preprocessor, + task=metadata_base.PrimitiveFamily.TIME_SERIES_FORECASTING.name, + treatment=metadata_base.PrimitiveFamily.TIME_SERIES_FORECASTING.name, + expected_data_types={TaskKeyword.GROUPED.name}): + # TODO: Pipeline will fail for integer target because simple_profiler profiles it as Categorical data, + # not Float or Integer. + def _generate_pipeline(self): + grouping_compose_primitive = self.get_primitive( + 'd3m.primitives.data_transformation.grouping_field_compose.Common' + ) + + self.common_boilerplate() + + # Do not parse categorical data or GroupingCompose will fail. 
+ column_parser = self.add_column_parser_step( + self.d2d_step, [ + "http://schema.org/DateTime", + "http://schema.org/Boolean", + "http://schema.org/Integer", + "http://schema.org/Float", + "https://metadata.datadrivendiscovery.org/types/FloatVector" + ] + ) + + attribute_step = self.add_extract_col_by_semantic_types_step( + column_parser, ['https://metadata.datadrivendiscovery.org/types/Attribute'] + ) + + grouping = self.add_primitive_to_pipeline( + primitive=grouping_compose_primitive, + attributes=attribute_step + ) + + target_step = self.add_extract_col_by_semantic_types_step(column_parser, [ + 'https://metadata.datadrivendiscovery.org/types/TrueTarget' + ]) + self.set_attribute_step(grouping) + self.set_target_step(target_step) + + +class AudioPreprocessor(Preprocessor, task=metadata_base.PrimitiveFamily.DIGITAL_SIGNAL_PROCESSING.name, + treatment=None, + expected_data_types=None): + + def _generate_pipeline(self): + audio_reader_primitive = self.get_primitive( + 'd3m.primitives.data_preprocessing.audio_reader.DistilAudioDatasetLoader' + ) + audio_feature_extraction_primitive = self.get_primitive( + 'd3m.primitives.feature_extraction.audio_transfer.DistilAudioTransfer' + ) + audio_reader = self.add_primitive_to_pipeline( + primitive=audio_reader_primitive, + attributes=self.start_resource, + produce_collection=True + ) + column_parser = self.add_column_parser_step( + data_reference=audio_reader, + to_parse=[ + 'http://schema.org/Boolean', + 'http://schema.org/Integer', + 'http://schema.org/Float', + 'https://metadata.datadrivendiscovery.org/types/FloatVector' + ] + ) + audio_feature = self.add_primitive_to_pipeline( + primitive=audio_feature_extraction_primitive, + attributes='steps.{}.produce_collection'.format(audio_reader.index), + ) + target_step = self.add_extract_col_by_semantic_types_step( + column_parser, + [ + 'https://metadata.datadrivendiscovery.org/types/TrueTarget', + 'https://metadata.datadrivendiscovery.org/types/SuggestedTarget' + ] + ) + self.set_d2d_step(audio_reader) + self.set_attribute_step(audio_feature) + self.set_target_step(target_step) + + +class ImageDataFramePreprocessor(Preprocessor, task=metadata_base.PrimitiveFamily.DIGITAL_IMAGE_PROCESSING.name, + treatment=None, + expected_data_types={TaskKeyword.IMAGE.name}): + def _generate_pipeline(self): + image_reader_primitive = self.get_primitive('d3m.primitives.data_preprocessing.image_reader.Common') + image_feature_extraction_primitive = self.get_primitive( + 'd3m.primitives.feature_extraction.image_transfer.DistilImageTransfer') + + self.common_boilerplate() + dataset_to_dataframe_step = self.d2d_step + + image_reader = self.add_primitive_to_pipeline( + primitive=image_reader_primitive, + attributes=dataset_to_dataframe_step, + hyperparameters=[('return_result', ArgumentType.VALUE, 'replace')] + ) + column_parser = self.add_column_parser_step( + data_reference=image_reader, + to_parse=[ + 'http://schema.org/Boolean', + 'http://schema.org/Integer', + 'http://schema.org/Float', + 'https://metadata.datadrivendiscovery.org/types/FloatVector' + ] + ) + image_feature_extraction = self.add_primitive_to_pipeline( + primitive=image_feature_extraction_primitive, + attributes=column_parser + ) + target_step = self.add_extract_col_by_semantic_types_step( + data_reference=dataset_to_dataframe_step, + target_semantic_types=['https://metadata.datadrivendiscovery.org/types/TrueTarget'], + ) + self.set_attribute_step(image_feature_extraction) + self.set_target_step(target_step) + + +class 
ImageTensorPreprocessor(Preprocessor, task=metadata_base.PrimitiveFamily.DIGITAL_IMAGE_PROCESSING.name, + treatment=None, + expected_data_types={TaskKeyword.IMAGE.name}): + def _generate_pipeline(self): + dataframe_to_tensor_primitive = self.get_primitive( + 'd3m.primitives.data_preprocessing.dataframe_to_tensor.DSBOX' + ) + resnet50_featurizer_primitive = self.get_primitive( + 'd3m.primitives.feature_extraction.resnet50_image_feature.DSBOX' + ) + + self.common_boilerplate() + dataset_to_dataframe_step = self.d2d_step + + dataframe_to_tensor = self.add_primitive_to_pipeline( + primitive=dataframe_to_tensor_primitive, + attributes=dataset_to_dataframe_step, + hyperparameters=[('return_result', ArgumentType.VALUE, 'replace')] + ) + resnet50_featurizer = self.add_primitive_to_pipeline( + primitive=resnet50_featurizer_primitive, + attributes=dataframe_to_tensor, + hyperparameters=[('return_result', ArgumentType.VALUE, 'replace')] + ) + target_step = self.add_extract_col_by_semantic_types_step( + dataset_to_dataframe_step, + ['https://metadata.datadrivendiscovery.org/types/TrueTarget'] + ) + self.set_attribute_step(resnet50_featurizer) + self.set_target_step(target_step) + + +class TabularPreprocessor(Preprocessor, task=None, treatment=None, expected_data_types={TaskKeyword.TABULAR.name}): + def _generate_pipeline(self): + return self.tabular_common() + + +class CollaborativeFilteringPreprocessor(Preprocessor, task=metadata_base.PrimitiveFamily.COLLABORATIVE_FILTERING.name, + treatment=None, + expected_data_types=None): + def _generate_pipeline(self): + return self.tabular_common(target_at_column_parser=True) + + +class TextPreprocessor(Preprocessor, task=None, treatment=None, + expected_data_types={TaskKeyword.TEXT}): + def _generate_pipeline(self): + text_reader_primitive = self.get_primitive('d3m.primitives.data_preprocessing.text_reader.Common') + + self.common_boilerplate() + + # Simple preprocessor + attributes, targets = self.base() + + text_reader_step = self.add_primitive_to_pipeline( + primitive=text_reader_primitive, + attributes=attributes, + hyperparameters=[('return_result', ArgumentType.VALUE, 'replace')] + ) + imputer = self.add_imputer(text_reader_step) + attributes = self.add_simple_text_handler(imputer, targets) + self.set_attribute_step(attributes) + self.set_target_step(targets) + + +class TextSent2VecPreprocessor(Preprocessor, task=None, treatment=None, expected_data_types={TaskKeyword.TEXT.name}): + def _generate_pipeline(self): + sent2_vec_primitive =self.get_primitive('d3m.primitives.feature_extraction.nk_sent2vec.Sent2Vec') + + self.common_boilerplate() + + # Simple preprocessor + attributes, targets = self.base() + + sent2vec = self.add_primitive_to_pipeline( + primitive=sent2_vec_primitive, + attributes=attributes, + ) + + imputer = self.add_imputer(sent2vec) + self.set_attribute_step(imputer) + self.set_target_step(targets) + + +class LupiPreprocessor(Preprocessor, task=None, treatment=None, + expected_data_types={TaskKeyword.LUPI.name}): + def _generate_pipeline(self): + self.common_boilerplate() + + privileged_column_indices = [info['column_index'] for info in self.problem['inputs'][0]['privileged_data']] + attributes, targets = self.base(exclude_attr_columns=privileged_column_indices) + + imputer = self.add_imputer(attributes) + self.set_attribute_step(imputer) + self.set_target_step(targets) + + +preprocessors = [ + # TODO DSBOX installation has error + # TimeSeriesTabularPreprocessor, + TimeSeriesPreprocessor, + TimeSeriesForecastingTabularPreprocessor, + 
AudioPreprocessor, + ImageDataFramePreprocessor, + # TODO DSBOX installation has error + # ImageTensorPreprocessor, + CollaborativeFilteringPreprocessor, + TextSent2VecPreprocessor, + TextPreprocessor, + LupiPreprocessor +] diff --git a/axolotl/axolotl/utils/__init__.py b/axolotl/axolotl/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/axolotl/axolotl/utils/data_problem.py b/axolotl/axolotl/utils/data_problem.py new file mode 100644 index 0000000..96b4f34 --- /dev/null +++ b/axolotl/axolotl/utils/data_problem.py @@ -0,0 +1,340 @@ +import uuid +import numpy +import pandas as pd +from d3m.container import pandas as container_pandas +from d3m.container.dataset import Dataset +from d3m.metadata import base as metadata_base +from d3m.metadata.problem import Problem + +from axolotl.utils.schemas import PROBLEM_DEFINITION + + +def make_unique_columns(data): + """ + Parameters + ---------- + data : pd.DataFrame + A dataframe to fix the column names. + + Returns + ------- + The original dataframe with the column names converted to strings and made unique. + """ + seen_columns_name = {} + column_names = [] + for column in data.columns: + if column in seen_columns_name: + column_name = str(column) + '_' + str(seen_columns_name[column]) + seen_columns_name[column] += 1 + else: + seen_columns_name[column] = 0 + column_name = str(column) + column_names.append(column_name) + data.columns = column_names + return data + + +def get_dataset(input_data, target_index=-2, index_column=-1, semantic_types=None, parse=False): + """ + A function that takes a dataframe as input and generates a D3M dataset. + + Parameters + ---------- + input_data : pd.DataFrame + The dataframe to be converted to d3m Dataset. + target_index : int + The index of the target column; if it is not present, it will be ignored. + index_column : int + The index of the index column; if not provided, the function looks for a d3mIndex column and otherwise generates one. + semantic_types : Sequence[Sequence[str]] + A list of semantic types to be applied. The sequence must have the same length as + the dataframe columns. + parse : bool + A flag that determines whether the dataset will contain parsed columns. By default it is set to False + to keep it compatible with most of the current D3M infrastructure. + + Returns + ------- + A D3M dataset.
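+ + Examples + -------- + Illustrative sketch only (column names and values are made up); the last column is the target:: + + df = pd.DataFrame({'timestamp': [1, 2, 3], 'value': [0.1, 0.2, 0.3], 'label': [0, 0, 1]}) + dataset = get_dataset(df, target_index=2)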
+ """ + data = make_unique_columns(input_data.copy(deep=True)) + if semantic_types is None: + semantic_types = [[] for i in range(len(data.columns))] + for i, _type in enumerate(input_data.dtypes): + if _type == float: + semantic_types[i].append('http://schema.org/Float') + elif _type == int: + semantic_types[i].append('http://schema.org/Integer') + + resources = {} + + if 'd3mIndex' in data.columns: + index_column = list(data.columns).index("d3mIndex") + else: + if index_column == -1: + data.insert(0, 'd3mIndex', range(len(data))) + semantic_types.insert(0, []) + target_index += 1 + index_column = 0 + + data = container_pandas.DataFrame(data) + + # remove this + if not parse: + data = data.astype(str) + metadata = metadata_base.DataMetadata() + + resources['learningData'] = data + + metadata = metadata.update(('learningData',), { + 'structural_type': type(data), + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/Table', + 'https://metadata.datadrivendiscovery.org/types/DatasetEntryPoint', + ], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': len(data), + }, + }) + + metadata = metadata.update(('learningData', metadata_base.ALL_ELEMENTS), { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': len(data.columns), + }, + }) + + for i, column_name in enumerate(data.columns): + if i == index_column: + metadata = metadata.update(('learningData', metadata_base.ALL_ELEMENTS, i), { + 'name': column_name, + 'structural_type': numpy.int64, + 'semantic_types': [ + 'http://schema.org/Integer', + 'https://metadata.datadrivendiscovery.org/types/PrimaryKey', + ], + }) + else: + _structural_type = str + if semantic_types[i]: + _semantic_types = semantic_types[i] + if 'http://schema.org/Float' in _semantic_types: + _structural_type = numpy.float64 + elif 'http://schema.org/Integer' in _semantic_types: + _structural_type = numpy.int64 + else: + _semantic_types = ['https://metadata.datadrivendiscovery.org/types/UnknownType'] + + if not parse: + _structural_type = str + if i == target_index: + _semantic_types += ['https://metadata.datadrivendiscovery.org/types/SuggestedTarget'] + else: + _semantic_types += ['https://metadata.datadrivendiscovery.org/types/Attribute'] + + metadata = metadata.update(('learningData', metadata_base.ALL_ELEMENTS, i), { + 'name': column_name, + 'structural_type': _structural_type, + 'semantic_types': _semantic_types + }) + + dataset_id = str(uuid.uuid4()) + dataset_metadata = { + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': Dataset, + 'id': dataset_id, + 'name': dataset_id, + 'digest': str(uuid.uuid4()), + 'dimension': { + 'name': 'resources', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/DatasetResource'], + 'length': len(resources), + }, + } + + metadata = metadata.update((), dataset_metadata) + + dataset = Dataset(resources, metadata) + return dataset + + +def import_dataframe(data_frame, *, index_column=-1, semantic_types=None): + """ + Function that transforms a dataframe into a dataset. + + data_frame : pd.DataFrame + The input dataframe to be converted to d3m Dataset. + index_column : int + The index of the index column. + semantic_types : Sequence[Sequence[str]] + A list of semantic types to be applied. The sequence must be of the same length of + the dataframe columns. + + Returns + ------- + A D3M dataset. 
+ """ + data = get_dataset(input_data=data_frame, index_column=index_column, semantic_types=semantic_types) + return data + + +def import_input_data(x, y=None, *, target_index=None, index_column=-1, semantic_types=None, parse=False): + """ + Function that takes an np.array or a dataframe and convert them to a D3M dataset. + + x : Union[pd.DataFrame, np.array] + Input features or the features with targets if target index is specified. + y : Union[pd.DataFrame, np.array] + input features or the features with targets if target index is specified. + target_index : int + The index of the target, if index is not present, it will be ignored. + index_column : int + The index of the index target, if not provided it will look for d3m index, if not generate one. + semantic_types : Sequence[Sequence[str]] + A list of semantic types to be applied. The sequence must be of the same length of + the dataframe columns. + parse : + A flag to determine if the dataset will contain parsed columns. By default is set to fault + to make it compatible with most of D3M current infrastructure. + + Returns + ------- + A D3M dataset. + """ + + if y is not None and target_index is not None: + print('Ignoring target index, using y as target') + + _target_index = -1 + if y is not None: + _x = pd.DataFrame(x) + _y = pd.DataFrame(y) + input_data = pd.concat((_x, _y), axis=1) + _target_index = len(_x.columns) + elif target_index is not None: + input_data = x + else: + raise ValueError('Targets (y) or target index should be provide') + + if _target_index != -1: + target_index = _target_index + data = get_dataset(input_data=input_data, target_index=target_index, + index_column=index_column, semantic_types=semantic_types, parse=parse) + + return data + + +def generate_problem_description(dataset, task=None, *, task_keywords=None, performance_metrics=None): + """ + A function that simplifies the generation of a problem description. + + Parameters + ---------- + dataset : Dataset + Dataset to be use for pipeline search. + task : str + A string that represent the problem type, currently only supported: ``binary_classification`` and + ``regression``. + task_keywords : List[TaskKeyword] + A list of TaskKeyword. + performance_metrics: List[PerformanceMetric] + A list of PerformanceMetric. 
+ + Returns + ------- + A Problem + """ + dataset_id = dataset.metadata.query(())['id'] + problem_id = dataset_id + '_problem' + schema = 'https://metadata.datadrivendiscovery.org/schemas/v0/problem.json' + version = '4.0.0' + + target_column_index = None + + for i in range(dataset.metadata.query(('learningData', metadata_base.ALL_ELEMENTS,))['dimension']['length']): + if 'https://metadata.datadrivendiscovery.org/types/SuggestedTarget' in \ + dataset.metadata.query(('learningData', metadata_base.ALL_ELEMENTS, i,))['semantic_types']: + target_column_index = i + break + + if target_column_index is None: + raise ValueError('Input dataframe does not contain targets') + + inputs = { + 'dataset_id': dataset_id, + 'targets': [{ + 'column_index': target_column_index, + 'column_name': dataset.metadata.query(('learningData', metadata_base.ALL_ELEMENTS, i,))['name'], + 'resource_id': 'learningData', + 'target_index': 0 + }] + } + + problem = None + if task is None: + if performance_metrics is not None and task_keywords is not None: + problem = { + 'performance_metrics': performance_metrics, + 'task_keywords': task_keywords + } + else: + if task in PROBLEM_DEFINITION: + problem = PROBLEM_DEFINITION[task] + else: + raise ValueError(task + """ task is not supported in default definitions. + You can define your own task by specifying the task_keywords and performance metrics.""") + + problem_description = { + 'id': problem_id, + 'schema': schema, + 'version': version, + 'inputs': [inputs], + 'problem': problem + } + + return Problem(problem_description) + + +def generate_dataset_problem(x, y=None, task=None, *, target_index=None, index_column=-1, + semantic_types=None, parse=False, task_keywords=None, performance_metrics=None): + """ + Function that takes an np.array or a dataframe and converts them to a D3M dataset and problem. + + x : Union[pd.DataFrame, np.array] + Input features, or the features together with the targets if target_index is specified. + y : Union[pd.DataFrame, np.array] + The targets. + task : str + A string that represents the problem type; currently only ``binary_classification`` and + ``regression`` are supported. + target_index : int + The index of the target column; if it is not present, it will be ignored. + index_column : int + The index of the index column; if not provided, the function looks for a d3mIndex column and otherwise generates one. + semantic_types : Sequence[Sequence[str]] + A list of semantic types to be applied. The sequence must have the same length as + the dataframe columns. + parse : bool + A flag that determines whether the dataset will contain parsed columns. By default it is set to False + to keep it compatible with most of the current D3M infrastructure. + task_keywords : List[TaskKeyword] + A list of TaskKeyword. + performance_metrics: List[PerformanceMetric] + A list of PerformanceMetric. + + Returns + ------- + dataset : Dataset + A D3M dataset. + problem_description : Problem + A D3M problem.
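+ + Examples + -------- + Illustrative sketch only (feature and target values are made up):: + + x = pd.DataFrame({'feature_0': [0.1, 0.2, 0.3], 'feature_1': [1, 2, 3]}) + y = pd.DataFrame({'label': [0, 1, 0]}) + dataset, problem = generate_dataset_problem(x, y, task='binary_classification')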
+ """ + dataset = import_input_data(x, y=y, target_index=target_index, index_column=index_column, + semantic_types=semantic_types, parse=parse) + problem_description = generate_problem_description(dataset=dataset, task=task, task_keywords=task_keywords, + performance_metrics=performance_metrics) + + return dataset, problem_description diff --git a/axolotl/axolotl/utils/pipeline.py b/axolotl/axolotl/utils/pipeline.py new file mode 100644 index 0000000..5a180dd --- /dev/null +++ b/axolotl/axolotl/utils/pipeline.py @@ -0,0 +1,542 @@ +import os +import pprint +import typing +import uuid +import json + +import matplotlib.pyplot as plt +from matplotlib.pyplot import figure +import networkx as nx +import pandas + +import d3m +from d3m import container +from d3m import utils as d3m_utils +from d3m.container import utils as container_utils +from d3m.metadata import base as metadata_base +from d3m.metadata.pipeline import Pipeline, PlaceholderStep, PrimitiveStep, SubpipelineStep, get_pipeline, Resolver +from d3m.metadata.pipeline_run import PipelineRun +from d3m.metadata import problem as problem_module +from d3m.primitive_interfaces import base +from d3m.container.pandas import DataFrame + + +class PipelineResult: + """ + A class that captures the output of multiple operations around the system. + + Parameters + ---------- + pipeline: Pipeline + The pipeline used for the run (fit/score) + fitted_pipeline_id: str + The id of the fitted pipeline used to produce the result. + + Attributes + ---------- + pipeline: Pipeline + Pipeline used for the run (fit/score) + fitted_pipeline_id: str + The id of the fitted pipeline used to produce the result. + status: str + A string representing the status of the run (PENDING, RUNNING, COMPLETED, ERRORED) + error: typing.Union[Exception, typing.List[Exception]] + An error of list of errors occured during the execution of the pipeline or fitted pipeline. + exposed_outputs: typing.Dict[str, typing.Any] + A dictionary containing the name of te exposed output and the value, this could be a string + of the path of the stored output or the object itself. + output: container.DataFrame + A dataframe of the pipeline output, this could be a string if the output is stored. + pipeline_run + A pipeline run, or the path where is stored. + method_called: str + The method that it was called while generating this result. (fit, produce) + scores: pandas.DataFrame + A dataframe containing the scores of the evaluated pipeline. + rank: float + The rank of the pipeline from 0 to 1, where 0 is the best. 
+ """ + def __init__(self, *, pipeline: Pipeline = None, fitted_pipeline_id: str = None): + self.pipeline = pipeline + self.fitted_pipeline_id: str = fitted_pipeline_id + self.status: str = None + self.error: typing.Union[Exception, typing.List[Exception]] = None + self.exposed_outputs: typing.Dict[str, typing.Any] = None + self.output: container.DataFrame = None + self.pipeline_run = None + self.method_called: str = None + self.scores: pandas.DataFrame = None + self.rank: float = None + + def __str__(self): + string_representation = {} + + for name, value in self.__dict__.items(): + if not name.startswith('__') and not callable(name): + if value is not None: + string_representation[name] = str(value) + + return pprint.pformat(string_representation).replace("\\n", "") + + def __repr__(self): + base_string = 'PipelineResult' + if self.pipeline is not None: + base_string += ' pipeline_id:{}'.format(self.pipeline.id) + + if self.fitted_pipeline_id is not None: + base_string += ' fitted_pipeline_id:{}'.format(self.fitted_pipeline_id) + + return base_string + + +class PrimitivesList: + # root = os.path.dirname(__file__) + # black_list = os.path.join(root, 'axolotl', 'utils', 'resources', 'blacklist.json') + with open(os.path.join(os.path.dirname(__file__), 'resources', 'blocklist.json'), 'r') as file: + BlockList = json.load(file) + + +class BlackListResolver(Resolver): + """ + A resolver to resolve primitives and pipelines. + + It resolves primitives from available primitives on the system, + and resolves pipelines from files in pipeline search paths. + + Attributes + ---------- + strict_resolving : bool + If resolved primitive does not fully match specified primitive reference, raise an exception? + pipeline_search_paths : Sequence[str] + A list of paths to directories with pipelines to resolve from. + Their files should be named ``.json`` or ``.yml``. + + Parameters + ---------- + strict_resolving : bool + If resolved primitive does not fully match specified primitive reference, raise an exception? + pipeline_search_paths : Sequence[str] + A list of paths to directories with pipelines to resolve from. + Their files should be named ``.json`` or ``.yml``. + respect_environment_variable : bool + Use also (colon separated) pipeline search paths from ``PIPELINES_PATH`` environment variable? 
+ """ + + def __init__(self, black_list=PrimitivesList.BlockList, *, strict_resolving: bool = False, strict_digest: bool = False, + pipeline_search_paths: typing.Sequence[str] = None, + respect_environment_variable: bool = True, load_all_primitives: bool = True, + primitives_blocklist: typing.Collection[str] = None) -> None: + super().__init__(strict_resolving=strict_resolving, strict_digest=strict_digest, + pipeline_search_paths=pipeline_search_paths, + respect_environment_variable=respect_environment_variable, + load_all_primitives=load_all_primitives, primitives_blocklist=primitives_blocklist) + self.black_list = black_list + if len(black_list) == 0: + self.black_list = None + + def _get_primitive(self, primitive_description: typing.Dict) -> typing.Optional[typing.Type[base.PrimitiveBase]]: + if not self._primitives_loaded: + self._primitives_loaded = True + + d3m.index.load_all(blacklist=self.black_list) + + return d3m.index.get_primitive_by_id(primitive_description['id']) + + +def load_pipeline(pipeline_file: typing.Union[str, typing.Dict]): + """ + Load pipeline from a pipeline URI + + Parameters + ---------- + pipeline_file: Union[str, dict] + The URI pointing to a json file of pipeline or dict of string that is a pipeline + + Returns + ------- + pipeline: Pipeline + An object of Pipeline + + """ + if isinstance(pipeline_file, dict): + try: + with d3m_utils.silence(): + pipeline = Pipeline.from_json_structure(pipeline_file) + except: + pipeline = None + else: + with d3m_utils.silence(): + pipeline = get_pipeline(pipeline_path=pipeline_file, load_all_primitives=False) + return pipeline + + +def save_pipeline(pipeline, path, *, rank=None): + """ + A function that make a copy of an already scored pipeline to scored directory according with specifications. + + Parameters + ---------- + pipeline : Pipeline + A pipeline to be save into the path + path: str + Path where the pipeline will be stored + rank : float + A float that represents the rank of the pipeline. + """ + + pipeline_path = os.path.join(path, '{}.json'.format(pipeline.id)) + + with open(pipeline_path, 'w') as file: + pipeline.to_json(file, indent=2, sort_keys=True, ensure_ascii=False) + + if rank is not None: + rank_path = os.path.join(path, '{}.rank'.format(pipeline.id)) + with open(rank_path, 'w') as file: + file.write('{rank}'.format(rank=rank)) + + +def save_pipeline_run(pipeline_run, path): + """ + A function that make a copy of an already scored pipeline to scored directory according with specifications. + + Parameters + ---------- + pipeline_run : PipelineRun + A pipeline_run to be save into the path + path: str + Path where the pipeline_run will be stored + + Returns + ------- + pipeline_run_path : str + Path where the pipeline_run is stored. + """ + + if pipeline_run is None: + return + + if isinstance(pipeline_run, list): + first = True + pipeline_run_path = os.path.join(path, '{}.yml'.format(pipeline_run[0].pipeline['id'])) + with d3m_utils.silence(): + with open(pipeline_run_path, 'w') as file: + for run in pipeline_run: + run.to_yaml(file, appending=not first) + first = False + else: + pipeline_run_path = os.path.join(path, '{}.yml'.format(pipeline_run.pipeline['id'])) + with d3m_utils.silence(): + with open(pipeline_run_path, 'w') as file: + pipeline_run.to_yaml(file) + + return pipeline_run_path + + +def save_exposed_values(values, output_id, output_dir): + """ + A function to save the exposed values of a PipelineResult. 
+ + Parameters + ---------- + values : Union[dict[str, container], container] + A container to be stored into the path + output_id : str + An id that identify the values. + output_dir : str + The path where the values are going to be store. + + Returns + ------- + A dict of names and stored paths. + + """ + output_paths = {} + output_path = os.path.join(output_dir, output_id) + unique_id = str(uuid.uuid4()) + + def get_file_path(path): + files = [f for f in os.listdir(path) if os.path.isfile(os.path.join(path, f))] + file_path = "" + if 'data.csv' in files: + file_path = os.path.join(path, 'data.csv') + elif 'datasetDoc.json' in files: + file_path = os.path.join(path, 'datasetDoc.json') + return file_path + + if isinstance(values, dict): + for name, value in values.items(): + _output_path = os.path.join(output_path, output_id, unique_id, name) + container_utils.save_container(value, _output_path) + output_paths[name] = get_file_path(_output_path) + else: + _output_path = os.path.join(output_path, output_id, unique_id, 'output') + container_utils.save_container(values, _output_path) + output_paths['output'] = get_file_path(_output_path) + + return output_paths + + +def plot_pipeline(pipeline): + figure(num=None, figsize=(10, 12), dpi=80, facecolor='w', edgecolor='k') + graph, nodes_info = get_pipeline_graph(pipeline) + + the_table = plt.table(cellText=nodes_info, colWidths=[0.05, 0.5], colLabels=['Step', 'Primitive'], loc='right') + the_table.set_fontsize(25) + the_table.scale(2, 1) + pos = nx.kamada_kawai_layout(graph, scale=3) + grafo_labels = nx.get_edge_attributes(graph, 'label') + edges_label = nx.draw_networkx_edge_labels(graph, pos, edge_labels=grafo_labels, font_size=7) + nx.draw(graph, pos=pos, node_size=900, alpha=0.5, font_size=16, edges_label=edges_label, with_labels=True, scale=5) + + +def __get_header(index, step): + if isinstance(step, PrimitiveStep): + header = 'steps.' + str(index) + ' - ' + step.primitive.metadata.query()['python_path'] + elif isinstance(step, PlaceholderStep): + header = 'steps.' + str(index) + ' - ' + 'PlaceHolderStep' + elif isinstance(step, SubpipelineStep): + header = 'steps.' 
+ str(index) + ' - ' + 'SubPipeline' + return header + + +def get_pipeline_graph(pipeline): + graph = nx.DiGraph() + nodes_info = [] + + for i in range(0, len(pipeline.steps)): + nodes_info.append([str(i), pipeline.steps[i].primitive.metadata.query()['python_path']]) + + if isinstance(pipeline.steps[i], PrimitiveStep) or isinstance(pipeline.steps[i], PlaceholderStep): + target = i + graph.add_node(target) + for argument in pipeline.steps[i].arguments.keys(): + data = pipeline.steps[i].arguments[argument]['data'] + if 'input' in data: + source = 'inputs' + else: + index = int(data.split('.')[1]) + source = index + label = argument + '-' + data + graph.add_edge(source, target, label=label) + + for hp in pipeline.steps[i].hyperparams.keys(): + if pipeline.steps[i].hyperparams[hp]['type'] == metadata_base.ArgumentType.PRIMITIVE: + index = pipeline.steps[i].hyperparams[hp]['data'] + source = index + label = 'Step {} hyperparam - {}'.format(i, hp) + graph.add_edge(source, target, label=label) + else: + # TODO add support here for subpipelines + continue + + for i in range(0, len(pipeline.outputs)): + index = int(pipeline.outputs[i]['data'].split('.')[1]) + source = index + label = 'outputs.{}'.format(i) + graph.add_edge(source, 'output', label=label) + + return graph, nodes_info + + +def infer_primitive_family(task_type: str, data_types: typing.Iterable, is_semi: bool = False) -> typing.Optional[str]: + """ + Infer the target primitive family from the task type and data types + + Parameters + ---------- + task_type: str + The task type + data_types: typing.Iterable + The data types + is_semi: bool + Whether it is a semi-supervised problem + + Returns + ------- + str + The primitive family + """ + + #TODO temp solution + if problem_module.TaskKeyword.CLASSIFICATION == task_type and \ + problem_module.TaskKeyword.TIME_SERIES in data_types and \ + problem_module.TaskKeyword.GROUPED in data_types: + return metadata_base.PrimitiveFamily.CLASSIFICATION.name + if problem_module.TaskKeyword.CLASSIFICATION == task_type and \ + problem_module.TaskKeyword.TIME_SERIES in data_types: + return metadata_base.PrimitiveFamily.TIME_SERIES_CLASSIFICATION.name + if problem_module.TaskKeyword.FORECASTING == task_type and problem_module.TaskKeyword.TIME_SERIES in data_types: + return metadata_base.PrimitiveFamily.TIME_SERIES_FORECASTING.name + if problem_module.TaskKeyword.CLASSIFICATION == task_type and is_semi: + return metadata_base.PrimitiveFamily.SEMISUPERVISED_CLASSIFICATION.name + if problem_module.TaskKeyword.IMAGE in data_types: + return metadata_base.PrimitiveFamily.DIGITAL_IMAGE_PROCESSING.name + if problem_module.TaskKeyword.VIDEO in data_types: + return metadata_base.PrimitiveFamily.DIGITAL_SIGNAL_PROCESSING.name + + return task_type + + +def check_black_list(primitive_name: str, extra_block: typing.List=[]) -> bool: + """ + Check if the primitive is in the block list, which comes from `PrimitivesList.BlockList` + + Parameters + ---------- + primitive_name: str + The name of the primitive + + Returns + ------- + bool + + """ + banned_terms = PrimitivesList.BlockList + extra_block + for banned_element in banned_terms: + if banned_element in primitive_name: + return True + return False + + +def get_primitive_candidates(task_type: str, data_types: typing.Iterable, semi: bool, + extra_block: typing.List=[]) -> typing.List: + """ + Get a list of primitive candidates related to the task type, except those primitives in `PrimitivesList.BlockList` + + Parameters + ---------- + task_type: str + The task type + data_types: typing.Iterable + The data types + semi: bool + Is it
semi-supervised problem + + Returns + ------- + list + A list of primitives + """ + specific_task = infer_primitive_family(task_type, data_types, semi) + primitives_path = d3m.index.search() + primitives = list() + for primitive_path in primitives_path: + if check_black_list(primitive_path, extra_block): + continue + try: + with d3m_utils.silence(): + primitive = d3m.index.get_primitive(primitive_path) + primitive_family = primitive.metadata.query()['primitive_family'].name + if primitive_family == task_type: + primitives.append((primitive, task_type)) + elif primitive_family == specific_task: + primitives.append((primitive, specific_task)) + # TODO what exception? + except Exception as e: + continue + return primitives + + +def int_to_step(n_step: int) -> str: + """ + Convert the step number to standard str step format + + Parameters + ---------- + n_step: int + + Returns + ------- + str + str format in "steps..produce" + """ + return 'steps.' + str(n_step) + '.produce' + + +def get_primitives(primitives_dict): + """ + A function that loads and returns a dictionary of primitives + + Parameters + ---------- + primitives_dict: dict[str, str] + A dictionary that contains the alias and the primitives to load. + + Returns + ------- + loaded_primitives_dict: dict[str, str] + A dictionary containing the aliases and the loaded primitives. + """ + loaded_primitives_dict = {} + for primitive_name in primitives_dict.keys(): + loaded_primitives_dict[primitive_name] = d3m.index.get_primitive(primitives_dict[primitive_name]) + return loaded_primitives_dict + + +def get_tabular_resource_id(dataset): + """ + A function that retrieves the main resource id + + Parameters + ---------- + dataset: Dataset + A dataset. + + Returns + ------- + resource_id: str + An id of the main resource. 
+ """ + + resource_id = None + for dataset_resource_id in dataset.keys(): + if dataset.metadata.has_semantic_type((dataset_resource_id,), + 'https://metadata.datadrivendiscovery.org/types/DatasetEntryPoint'): + resource_id = dataset_resource_id + break + + if resource_id is None: + tabular_resource_ids = [dataset_resource_id for dataset_resource_id, dataset_resource in dataset.items() if + isinstance(dataset_resource, container.DataFrame)] + if len(tabular_resource_ids) == 1: + resource_id = tabular_resource_ids[0] + + if resource_id is None: + resource_id = 'learningData' + + return resource_id + + +def query_multiple_terms(metadata, list_queries): + data = metadata.query() + valid_queries = [] + for query in list_queries: + if query in data: + valid_queries.append(query) + data = data[query] + else: + break + if len(valid_queries) == len(list_queries): + return data + + +def filter_primitives_by_dataframe_input(primitive_info): + primitives_dataframe_input = [] + for info in primitive_info: + primitive, task = info + arguments = query_multiple_terms( + primitive.metadata, ['primitive_code', 'class_type_arguments']) + + has_dataframe_arguments = True + for argument, value in arguments.items(): + if argument == 'Params' or argument == 'Hyperparams': + continue + else: + if value != DataFrame: + has_dataframe_arguments = False + break + if has_dataframe_arguments: + primitives_dataframe_input.append(info) + + return primitives_dataframe_input + diff --git a/axolotl/axolotl/utils/resources.py b/axolotl/axolotl/utils/resources.py new file mode 100644 index 0000000..9fba49e --- /dev/null +++ b/axolotl/axolotl/utils/resources.py @@ -0,0 +1,31 @@ +import os +import shutil +import signal +from contextlib import contextmanager + + +class TimeoutException(Exception): + pass + + +@contextmanager +def time_limit(seconds): + def signal_handler(signum, frame): + raise TimeoutException("Timed out!") + signal.signal(signal.SIGALRM, signal_handler) + signal.alarm(seconds) + try: + yield + finally: + signal.alarm(0) + + +def check_directory(dir_name): + dir_name = os.path.abspath(dir_name) + if not os.path.exists(dir_name): + os.makedirs(dir_name) + + +def copy_file(source_path, target_path): + path = os.path.join(target_path, os.path.basename(source_path)) + shutil.copyfile(source_path, path) diff --git a/axolotl/axolotl/utils/resources/blocklist.json b/axolotl/axolotl/utils/resources/blocklist.json new file mode 100644 index 0000000..da6a81e --- /dev/null +++ b/axolotl/axolotl/utils/resources/blocklist.json @@ -0,0 +1,31 @@ +[ + "d3m.primitives.classification.xgboost_dart.Common", + "d3m.primitives.classification.canonical_correlation_forests.UBC", + "d3m.primitives.classification.logistic_regression.UBC", + "d3m.primitives.classification.multilayer_perceptron.UBC", + "d3m.primitives.classification.simple_cnaps.UBC", + "d3m.primitives.clustering.kmeans_clustering.UBC", + "d3m.primitives.dimensionality_reduction.principal_component_analysis.UBC", + "d3m.primitives.feature_extraction.boc.UBC", + "d3m.primitives.feature_extraction.bow.UBC", + "3m.primitives.feature_extraction.googlenet_cnn.UBC", + "d3m.primitives.feature_extraction.convolutional_neural_network.UBC", + "d3m.primitives.schema_discovery.semantic_type.UBC", + "d3m.primitives.regression.linear_regression.UBC", + "d3m.primitives.operator.diagonal_mvn.UBC", + "d3m.primitives.feature_extraction.resnet_cnn.UBC", + "d3m.primitives.feature_extraction.mobilenet_cnn.UBC", + "d3m.primitives.feature_extraction.vggnet_cnn.UBC", + 
"d3m.primitives.regression.canonical_correlation_forests.UBC", + "d3m.primitives.regression.multilayer_perceptron.UBC", + "d3m.primitives.schema_discovery.semantic_type.UBC", + "d3m.primitives.data_transformation.missing_indicator.DistilMissingIndicator", + "d3m.primitives.data_transformation.graph_to_edge_list.DSBOX", + "d3m.primitives.feature_construction.graph_transformer.GCN", + "d3m.primitives.feature_extraction.huber_pca.Cornell", + "d3m.primitives.natural_language_processing.glda.Fastlv", + "d3m.primitives.feature_construction.corex_continuous.DSBOX", + "d3m.primitives.natural_language_processing.glda.Fastlvm", + "d3m.primitives.classification.xgboost_dart.Common", + "d3m.primitives.classification.global_causal_discovery.ClassifierRPI" +] diff --git a/axolotl/axolotl/utils/resources/default_pipelines.json b/axolotl/axolotl/utils/resources/default_pipelines.json new file mode 100644 index 0000000..e20a499 --- /dev/null +++ b/axolotl/axolotl/utils/resources/default_pipelines.json @@ -0,0 +1,64 @@ +{ + "CLASSIFICATION": [ + {"id": "6a520746-108c-45bf-a6d8-c875b5a9d326","schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json","created": "2020-01-16T20:40:25.541426Z","inputs": [{"name": "inputs"}],"outputs": [{"data": "steps.8.produce","name": "output predictions"}],"steps": [{"type": "PRIMITIVE","primitive": {"id": "4b42ce1e-9b98-4a25-b68e-fad13311eb65","version": "0.3.0","python_path": "d3m.primitives.data_transformation.dataset_to_dataframe.Common","name": "Extract a DataFrame from a Dataset"},"arguments": {"inputs": {"type": "CONTAINER","data": "inputs.0"}},"outputs": [{"id": "produce"}]},{"arguments": {"inputs": {"data": "steps.0.produce","type": "CONTAINER"}},"hyperparams": {"categorical_max_ratio_distinct_values": {"type": "VALUE","data": 1},"categorical_max_absolute_distinct_values": {"type": "VALUE","data": {"case": "unlimited","value": null}}},"outputs": [{"id": "produce"}],"primitive": {"digest": "8b12a9aececdc5b7a4d5ef47cd04cda75592fd24f49922776b614d4bbeeb97f1","id": "e193afa1-b45e-4d29-918f-5bb1fa3b88a7","name": "Determine missing semantic types for columns automatically","python_path": "d3m.primitives.schema_discovery.profiler.Common","version": "0.2.0"},"type": "PRIMITIVE"},{"type": "PRIMITIVE","primitive": {"id": "d510cb7a-1782-4f51-b44c-58f0236e47c7","version": "0.5.0","python_path": "d3m.primitives.data_transformation.column_parser.Common","name": "Parses strings into their types"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.1.produce"}},"outputs": [{"id": "produce"}]},{"type": "PRIMITIVE","primitive": {"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1","version": "0.3.0","python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common","name": "Extracts columns by semantic type"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.2.produce"}},"outputs": [{"id": "produce"}],"hyperparams": {"semantic_types": {"type": "VALUE","data": ["https://metadata.datadrivendiscovery.org/types/Attribute"]}}},{"type": "PRIMITIVE","primitive": {"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1","version": "0.3.0","python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common","name": "Extracts columns by semantic type"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.1.produce"}},"outputs": [{"id": "produce"}],"hyperparams": {"semantic_types": {"type": "VALUE","data": ["https://metadata.datadrivendiscovery.org/types/TrueTarget"]}}},{"type": "PRIMITIVE","primitive": {"id": 
"d016df89-de62-3c53-87ed-c06bb6a23cde","version": "2019.6.7","python_path": "d3m.primitives.data_cleaning.imputer.SKlearn","name": "sklearn.impute.SimpleImputer"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.3.produce"}},"outputs": [{"id": "produce"}]},{"type": "PRIMITIVE","primitive": {"id": "f32dcb25-4cd0-4bb9-9408-ade1edfa2b53","version": "0.1.0","python_path": "d3m.primitives.feature_selection.skfeature.TAMU","name": "Feature Selection"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.5.produce"},"outputs": {"type": "CONTAINER","data": "steps.4.produce"}},"outputs": [{"id": "produce"}]},{"type": "PRIMITIVE","primitive": {"id": "1dd82833-5692-39cb-84fb-2455683075f3","version": "2019.6.7","python_path": "d3m.primitives.classification.random_forest.SKlearn","name": "sklearn.ensemble.forest.RandomForestClassifier"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.6.produce"},"outputs": {"type": "CONTAINER","data": "steps.4.produce"}},"outputs": [{"id": "produce"}]},{"type": "PRIMITIVE","primitive": {"id": "8d38b340-f83f-4877-baaa-162f8e551736","version": "0.3.0","python_path": "d3m.primitives.data_transformation.construct_predictions.Common","name": "Construct pipeline predictions output"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.7.produce"},"reference": {"type": "CONTAINER","data": "steps.1.produce"}},"outputs": [{"id": "produce"}]}]}, + {"id": "a6b468a5-4d03-405e-a707-8e377f9ad1c3","schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json","created": "2020-01-16T20:40:25.541426Z","inputs": [{"name": "inputs"}],"outputs": [{"data": "steps.8.produce","name": "output predictions"}],"steps": [{"type": "PRIMITIVE","primitive": {"id": "4b42ce1e-9b98-4a25-b68e-fad13311eb65","version": "0.3.0","python_path": "d3m.primitives.data_transformation.dataset_to_dataframe.Common","name": "Extract a DataFrame from a Dataset"},"arguments": {"inputs": {"type": "CONTAINER","data": "inputs.0"}},"outputs": [{"id": "produce"}]},{"arguments": {"inputs": {"data": "steps.0.produce","type": "CONTAINER"}},"hyperparams": {"categorical_max_ratio_distinct_values": {"type": "VALUE","data": 1},"categorical_max_absolute_distinct_values": {"type": "VALUE","data": {"case": "unlimited","value": null}}},"outputs": [{"id": "produce"}],"primitive": {"digest": "8b12a9aececdc5b7a4d5ef47cd04cda75592fd24f49922776b614d4bbeeb97f1","id": "e193afa1-b45e-4d29-918f-5bb1fa3b88a7","name": "Determine missing semantic types for columns automatically","python_path": "d3m.primitives.schema_discovery.profiler.Common","version": "0.2.0"},"type": "PRIMITIVE"},{"type": "PRIMITIVE","primitive": {"id": "d510cb7a-1782-4f51-b44c-58f0236e47c7","version": "0.5.0","python_path": "d3m.primitives.data_transformation.column_parser.Common","name": "Parses strings into their types"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.1.produce"}},"outputs": [{"id": "produce"}]},{"type": "PRIMITIVE","primitive": {"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1","version": "0.3.0","python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common","name": "Extracts columns by semantic type"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.2.produce"}},"outputs": [{"id": "produce"}],"hyperparams": {"semantic_types": {"type": "VALUE","data": ["https://metadata.datadrivendiscovery.org/types/Attribute"]}}},{"type": "PRIMITIVE","primitive": {"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1","version": "0.3.0","python_path": 
"d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common","name": "Extracts columns by semantic type"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.1.produce"}},"outputs": [{"id": "produce"}],"hyperparams": {"semantic_types": {"type": "VALUE","data": ["https://metadata.datadrivendiscovery.org/types/TrueTarget"]}}},{"type": "PRIMITIVE","primitive": {"id": "d016df89-de62-3c53-87ed-c06bb6a23cde","version": "2019.6.7","python_path": "d3m.primitives.data_cleaning.imputer.SKlearn","name": "sklearn.impute.SimpleImputer"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.3.produce"}},"outputs": [{"id": "produce"}]},{"type": "PRIMITIVE","primitive": {"id": "f32dcb25-4cd0-4bb9-9408-ade1edfa2b53","version": "0.1.0","python_path": "d3m.primitives.feature_selection.skfeature.TAMU","name": "Feature Selection"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.5.produce"},"outputs": {"type": "CONTAINER","data": "steps.4.produce"}},"outputs": [{"id": "produce"}]},{"type": "PRIMITIVE","primitive": {"id": "01d2c086-91bf-3ca5-b023-5139cf239c77","version": "2019.6.7","python_path": "d3m.primitives.classification.gradient_boosting.SKlearn","name": "sklearn.ensemble.gradient_boosting.GradientBoostingClassifier"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.6.produce"},"outputs": {"type": "CONTAINER","data": "steps.4.produce"}},"outputs": [{"id": "produce"}]},{"type": "PRIMITIVE","primitive": {"id": "8d38b340-f83f-4877-baaa-162f8e551736","version": "0.3.0","python_path": "d3m.primitives.data_transformation.construct_predictions.Common","name": "Construct pipeline predictions output"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.7.produce"},"reference": {"type": "CONTAINER","data": "steps.1.produce"}},"outputs": [{"id": "produce"}]}]}, + {"id": "ef1c483a-34fc-4398-a6b3-063b33786972","schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json","created": "2020-01-16T20:40:25.541426Z","inputs": [{"name": "inputs"}],"outputs": [{"data": "steps.8.produce","name": "output predictions"}],"steps": [{"type": "PRIMITIVE","primitive": {"id": "4b42ce1e-9b98-4a25-b68e-fad13311eb65","version": "0.3.0","python_path": "d3m.primitives.data_transformation.dataset_to_dataframe.Common","name": "Extract a DataFrame from a Dataset"},"arguments": {"inputs": {"type": "CONTAINER","data": "inputs.0"}},"outputs": [{"id": "produce"}]},{"arguments": {"inputs": {"data": "steps.0.produce","type": "CONTAINER"}},"hyperparams": {"categorical_max_ratio_distinct_values": {"type": "VALUE","data": 1},"categorical_max_absolute_distinct_values": {"type": "VALUE","data": {"case": "unlimited","value": null}}},"outputs": [{"id": "produce"}],"primitive": {"digest": "8b12a9aececdc5b7a4d5ef47cd04cda75592fd24f49922776b614d4bbeeb97f1","id": "e193afa1-b45e-4d29-918f-5bb1fa3b88a7","name": "Determine missing semantic types for columns automatically","python_path": "d3m.primitives.schema_discovery.profiler.Common","version": "0.2.0"},"type": "PRIMITIVE"},{"type": "PRIMITIVE","primitive": {"id": "d510cb7a-1782-4f51-b44c-58f0236e47c7","version": "0.5.0","python_path": "d3m.primitives.data_transformation.column_parser.Common","name": "Parses strings into their types"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.1.produce"}},"outputs": [{"id": "produce"}]},{"type": "PRIMITIVE","primitive": {"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1","version": "0.3.0","python_path": 
"d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common","name": "Extracts columns by semantic type"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.2.produce"}},"outputs": [{"id": "produce"}],"hyperparams": {"semantic_types": {"type": "VALUE","data": ["https://metadata.datadrivendiscovery.org/types/Attribute"]}}},{"type": "PRIMITIVE","primitive": {"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1","version": "0.3.0","python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common","name": "Extracts columns by semantic type"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.1.produce"}},"outputs": [{"id": "produce"}],"hyperparams": {"semantic_types": {"type": "VALUE","data": ["https://metadata.datadrivendiscovery.org/types/TrueTarget"]}}},{"type": "PRIMITIVE","primitive": {"id": "d016df89-de62-3c53-87ed-c06bb6a23cde","version": "2019.6.7","python_path": "d3m.primitives.data_cleaning.imputer.SKlearn","name": "sklearn.impute.SimpleImputer"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.3.produce"}},"outputs": [{"id": "produce"}]},{"type": "PRIMITIVE","primitive": {"id": "f32dcb25-4cd0-4bb9-9408-ade1edfa2b53","version": "0.1.0","python_path": "d3m.primitives.feature_selection.skfeature.TAMU","name": "Feature Selection"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.5.produce"},"outputs": {"type": "CONTAINER","data": "steps.4.produce"}},"outputs": [{"id": "produce"}]},{"type": "PRIMITIVE","primitive": {"id": "c8a28f02-ef4a-35a8-87f1-cf79980f5c3e","version": "2019.6.7","python_path": "d3m.primitives.classification.extra_trees.SKlearn","name": "sklearn.ensemble.forest.ExtraTreesClassifier"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.6.produce"},"outputs": {"type": "CONTAINER","data": "steps.4.produce"}},"outputs": [{"id": "produce"}]},{"type": "PRIMITIVE","primitive": {"id": "8d38b340-f83f-4877-baaa-162f8e551736","version": "0.3.0","python_path": "d3m.primitives.data_transformation.construct_predictions.Common","name": "Construct pipeline predictions output"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.7.produce"},"reference": {"type": "CONTAINER","data": "steps.1.produce"}},"outputs": [{"id": "produce"}]}]} + ], + "REGRESSION": [ + {"id": "efab70e7-461a-42de-a5d7-9bdd98cc05d8","schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json","created": "2020-01-16T20:40:25.541426Z","inputs": [{"name": "inputs"}],"outputs": [{"data": "steps.8.produce","name": "output predictions"}],"steps": [{"type": "PRIMITIVE","primitive": {"id": "4b42ce1e-9b98-4a25-b68e-fad13311eb65","version": "0.3.0","python_path": "d3m.primitives.data_transformation.dataset_to_dataframe.Common","name": "Extract a DataFrame from a Dataset"},"arguments": {"inputs": {"type": "CONTAINER","data": "inputs.0"}},"outputs": [{"id": "produce"}]},{"arguments": {"inputs": {"data": "steps.0.produce","type": "CONTAINER"}},"hyperparams": {"categorical_max_ratio_distinct_values": {"type": "VALUE","data": 1},"categorical_max_absolute_distinct_values": {"type": "VALUE","data": {"case": "unlimited","value": null}}},"outputs": [{"id": "produce"}],"primitive": {"digest": "8b12a9aececdc5b7a4d5ef47cd04cda75592fd24f49922776b614d4bbeeb97f1","id": "e193afa1-b45e-4d29-918f-5bb1fa3b88a7","name": "Determine missing semantic types for columns automatically","python_path": "d3m.primitives.schema_discovery.profiler.Common","version": "0.2.0"},"type": "PRIMITIVE"},{"type": "PRIMITIVE","primitive": {"id": 
"d510cb7a-1782-4f51-b44c-58f0236e47c7","version": "0.5.0","python_path": "d3m.primitives.data_transformation.column_parser.Common","name": "Parses strings into their types"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.1.produce"}},"outputs": [{"id": "produce"}]},{"type": "PRIMITIVE","primitive": {"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1","version": "0.3.0","python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common","name": "Extracts columns by semantic type"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.2.produce"}},"outputs": [{"id": "produce"}],"hyperparams": {"semantic_types": {"type": "VALUE","data": ["https://metadata.datadrivendiscovery.org/types/Attribute"]}}},{"type": "PRIMITIVE","primitive": {"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1","version": "0.3.0","python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common","name": "Extracts columns by semantic type"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.1.produce"}},"outputs": [{"id": "produce"}],"hyperparams": {"semantic_types": {"type": "VALUE","data": ["https://metadata.datadrivendiscovery.org/types/TrueTarget"]}}},{"type": "PRIMITIVE","primitive": {"id": "d016df89-de62-3c53-87ed-c06bb6a23cde","version": "2019.6.7","python_path": "d3m.primitives.data_cleaning.imputer.SKlearn","name": "sklearn.impute.SimpleImputer"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.3.produce"}},"outputs": [{"id": "produce"}]},{"type": "PRIMITIVE","primitive": {"id": "f32dcb25-4cd0-4bb9-9408-ade1edfa2b53","version": "0.1.0","python_path": "d3m.primitives.feature_selection.skfeature.TAMU","name": "Feature Selection"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.5.produce"},"outputs": {"type": "CONTAINER","data": "steps.4.produce"}},"outputs": [{"id": "produce"}]},{"type": "PRIMITIVE","primitive": {"id": "f0fd7a62-09b5-3abc-93bb-f5f999f7cc80","version": "2019.6.7","python_path": "d3m.primitives.regression.random_forest.SKlearn","name": "sklearn.ensemble.forest.RandomForestRegressor"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.6.produce"},"outputs": {"type": "CONTAINER","data": "steps.4.produce"}},"outputs": [{"id": "produce"}]},{"type": "PRIMITIVE","primitive": {"id": "8d38b340-f83f-4877-baaa-162f8e551736","version": "0.3.0","python_path": "d3m.primitives.data_transformation.construct_predictions.Common","name": "Construct pipeline predictions output"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.7.produce"},"reference": {"type": "CONTAINER","data": "steps.1.produce"}},"outputs": [{"id": "produce"}]}]}, + {"id": "a6b468a5-4d03-405e-a707-8e377f9ad1c3","schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json","created": "2020-01-16T20:40:25.541426Z","inputs": [{"name": "inputs"}],"outputs": [{"data": "steps.8.produce","name": "output predictions"}],"steps": [{"type": "PRIMITIVE","primitive": {"id": "4b42ce1e-9b98-4a25-b68e-fad13311eb65","version": "0.3.0","python_path": "d3m.primitives.data_transformation.dataset_to_dataframe.Common","name": "Extract a DataFrame from a Dataset"},"arguments": {"inputs": {"type": "CONTAINER","data": "inputs.0"}},"outputs": [{"id": "produce"}]},{"arguments": {"inputs": {"data": "steps.0.produce","type": "CONTAINER"}},"hyperparams": {"categorical_max_ratio_distinct_values": {"type": "VALUE","data": 1},"categorical_max_absolute_distinct_values": {"type": "VALUE","data": {"case": "unlimited","value": null}}},"outputs": [{"id": 
"produce"}],"primitive": {"digest": "8b12a9aececdc5b7a4d5ef47cd04cda75592fd24f49922776b614d4bbeeb97f1","id": "e193afa1-b45e-4d29-918f-5bb1fa3b88a7","name": "Determine missing semantic types for columns automatically","python_path": "d3m.primitives.schema_discovery.profiler.Common","version": "0.2.0"},"type": "PRIMITIVE"},{"type": "PRIMITIVE","primitive": {"id": "d510cb7a-1782-4f51-b44c-58f0236e47c7","version": "0.5.0","python_path": "d3m.primitives.data_transformation.column_parser.Common","name": "Parses strings into their types"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.1.produce"}},"outputs": [{"id": "produce"}]},{"type": "PRIMITIVE","primitive": {"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1","version": "0.3.0","python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common","name": "Extracts columns by semantic type"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.2.produce"}},"outputs": [{"id": "produce"}],"hyperparams": {"semantic_types": {"type": "VALUE","data": ["https://metadata.datadrivendiscovery.org/types/Attribute"]}}},{"type": "PRIMITIVE","primitive": {"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1","version": "0.3.0","python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common","name": "Extracts columns by semantic type"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.1.produce"}},"outputs": [{"id": "produce"}],"hyperparams": {"semantic_types": {"type": "VALUE","data": ["https://metadata.datadrivendiscovery.org/types/TrueTarget"]}}},{"type": "PRIMITIVE","primitive": {"id": "d016df89-de62-3c53-87ed-c06bb6a23cde","version": "2019.6.7","python_path": "d3m.primitives.data_cleaning.imputer.SKlearn","name": "sklearn.impute.SimpleImputer"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.3.produce"}},"outputs": [{"id": "produce"}]},{"type": "PRIMITIVE","primitive": {"id": "f32dcb25-4cd0-4bb9-9408-ade1edfa2b53","version": "0.1.0","python_path": "d3m.primitives.feature_selection.skfeature.TAMU","name": "Feature Selection"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.5.produce"},"outputs": {"type": "CONTAINER","data": "steps.4.produce"}},"outputs": [{"id": "produce"}]},{"type": "PRIMITIVE","primitive": {"id": "2a031907-6b2c-3390-b365-921f89c8816a","version": "2019.6.7","python_path": "d3m.primitives.regression.gradient_boosting.SKlearn","name": "sklearn.ensemble.gradient_boosting.GradientBoostingRegressor"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.6.produce"},"outputs": {"type": "CONTAINER","data": "steps.4.produce"}},"outputs": [{"id": "produce"}]},{"type": "PRIMITIVE","primitive": {"id": "8d38b340-f83f-4877-baaa-162f8e551736","version": "0.3.0","python_path": "d3m.primitives.data_transformation.construct_predictions.Common","name": "Construct pipeline predictions output"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.7.produce"},"reference": {"type": "CONTAINER","data": "steps.1.produce"}},"outputs": [{"id": "produce"}]}]}, + {"id": "a6b468a5-4d03-405e-a707-8e377f9ad1c3","schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json","created": "2020-01-16T20:40:25.541426Z","inputs": [{"name": "inputs"}],"outputs": [{"data": "steps.8.produce","name": "output predictions"}],"steps": [{"type": "PRIMITIVE","primitive": {"id": "4b42ce1e-9b98-4a25-b68e-fad13311eb65","version": "0.3.0","python_path": "d3m.primitives.data_transformation.dataset_to_dataframe.Common","name": "Extract a DataFrame from a 
Dataset"},"arguments": {"inputs": {"type": "CONTAINER","data": "inputs.0"}},"outputs": [{"id": "produce"}]},{"arguments": {"inputs": {"data": "steps.0.produce","type": "CONTAINER"}},"hyperparams": {"categorical_max_ratio_distinct_values": {"type": "VALUE","data": 1},"categorical_max_absolute_distinct_values": {"type": "VALUE","data": {"case": "unlimited","value": null}}},"outputs": [{"id": "produce"}],"primitive": {"digest": "8b12a9aececdc5b7a4d5ef47cd04cda75592fd24f49922776b614d4bbeeb97f1","id": "e193afa1-b45e-4d29-918f-5bb1fa3b88a7","name": "Determine missing semantic types for columns automatically","python_path": "d3m.primitives.schema_discovery.profiler.Common","version": "0.2.0"},"type": "PRIMITIVE"},{"type": "PRIMITIVE","primitive": {"id": "d510cb7a-1782-4f51-b44c-58f0236e47c7","version": "0.5.0","python_path": "d3m.primitives.data_transformation.column_parser.Common","name": "Parses strings into their types"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.1.produce"}},"outputs": [{"id": "produce"}]},{"type": "PRIMITIVE","primitive": {"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1","version": "0.3.0","python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common","name": "Extracts columns by semantic type"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.2.produce"}},"outputs": [{"id": "produce"}],"hyperparams": {"semantic_types": {"type": "VALUE","data": ["https://metadata.datadrivendiscovery.org/types/Attribute"]}}},{"type": "PRIMITIVE","primitive": {"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1","version": "0.3.0","python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common","name": "Extracts columns by semantic type"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.1.produce"}},"outputs": [{"id": "produce"}],"hyperparams": {"semantic_types": {"type": "VALUE","data": ["https://metadata.datadrivendiscovery.org/types/TrueTarget"]}}},{"type": "PRIMITIVE","primitive": {"id": "d016df89-de62-3c53-87ed-c06bb6a23cde","version": "2019.6.7","python_path": "d3m.primitives.data_cleaning.imputer.SKlearn","name": "sklearn.impute.SimpleImputer"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.3.produce"}},"outputs": [{"id": "produce"}]},{"type": "PRIMITIVE","primitive": {"id": "f32dcb25-4cd0-4bb9-9408-ade1edfa2b53","version": "0.1.0","python_path": "d3m.primitives.feature_selection.skfeature.TAMU","name": "Feature Selection"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.5.produce"},"outputs": {"type": "CONTAINER","data": "steps.4.produce"}},"outputs": [{"id": "produce"}]},{"type": "PRIMITIVE","primitive": {"id": "35321059-2a1a-31fd-9509-5494efc751c7","version": "2019.6.7","python_path": "d3m.primitives.regression.extra_trees.SKlearn","name": "sklearn.ensemble.forest.ExtraTreesRegressor"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.6.produce"},"outputs": {"type": "CONTAINER","data": "steps.4.produce"}},"outputs": [{"id": "produce"}]},{"type": "PRIMITIVE","primitive": {"id": "8d38b340-f83f-4877-baaa-162f8e551736","version": "0.3.0","python_path": "d3m.primitives.data_transformation.construct_predictions.Common","name": "Construct pipeline predictions output"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.7.produce"},"reference": {"type": "CONTAINER","data": "steps.1.produce"}},"outputs": [{"id": "produce"}]}]} + ], + "CLUSTERING": [], + "LINK_PREDICTION": [ + {"id": "ddc6c7e9-64b4-4f9c-af07-5f27461cb940","schema": 
"https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", "created": "2020-01-21T20:00:00.000000Z", "inputs": [{"name": "inputs"}],"outputs": [{"data": "steps.3.produce","name": "Predictions"}],"steps": [{"type": "PRIMITIVE","primitive": {"id": "cb192a83-63e2-4075-bab9-e6ba1a8365b6","version": "0.1.0","python_path": "d3m.primitives.data_transformation.load_graphs.JHU","name": "Extract a list of Graphs from a Dataset"},"arguments": {"inputs": {"type": "CONTAINER","data": "inputs.0"}},"outputs": [{"id": "produce"}]},{"type": "PRIMITIVE","primitive": {"id": "32fec24f-6861-4a4c-88f3-d4ec2bc1b486","version": "0.1.0","python_path": "d3m.primitives.data_preprocessing.largest_connected_component.JHU","name": "jhu.lcc"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.0.produce"}},"outputs": [{"id": "produce"}]},{"type": "PRIMITIVE","primitive": {"id": "b940ccbd-9e9b-3166-af50-210bfd79251b","version": "0.1.0","python_path": "d3m.primitives.data_transformation.adjacency_spectral_embedding.JHU","name": "jhu.ase"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.1.produce"}},"outputs": [{"id": "produce"}],"hyperparams": {"max_dimension": {"type": "VALUE","data": 5},"use_attributes": {"type": "VALUE","data": true}}},{"type": "PRIMITIVE","primitive": {"id": "c9d5da5d-0520-468e-92df-bd3a85bb4fac","version": "0.1.0","python_path": "d3m.primitives.classification.gaussian_classification.JHU","name": "jhu.gclass"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.2.produce"}},"outputs": [{"id": "produce"}]}]}, + {"id": "12a4b6a8-b2e4-4604-afe5-8196bf55a925","schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", "created": "2020-01-21T20:00:00.000000Z", "inputs": [{"name": "inputs"}],"outputs": [{"data": "steps.3.produce","name": "Predictions"}],"steps": [{"type": "PRIMITIVE","primitive": {"id": "cb192a83-63e2-4075-bab9-e6ba1a8365b6","version": "0.1.0","python_path": "d3m.primitives.data_transformation.load_graphs.JHU","name": "Extract a list of Graphs from a Dataset"},"arguments": {"inputs": {"type": "CONTAINER","data": "inputs.0"}},"outputs": [{"id": "produce"}]},{"type": "PRIMITIVE","primitive": {"id": "32fec24f-6861-4a4c-88f3-d4ec2bc1b486","version": "0.1.0","python_path": "d3m.primitives.data_preprocessing.largest_connected_component.JHU","name": "jhu.lcc"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.0.produce"}},"outputs": [{"id": "produce"}]},{"type": "PRIMITIVE","primitive": {"id": "b940ccbd-9e9b-3166-af50-210bfd79251b","version": "0.1.0","python_path": "d3m.primitives.data_transformation.adjacency_spectral_embedding.JHU","name": "jhu.ase"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.1.produce"}},"outputs": [{"id": "produce"}],"hyperparams": {"max_dimension": {"type": "VALUE","data": 5},"use_attributes": {"type": "VALUE","data": true}}},{"type": "PRIMITIVE","primitive": {"id": "5194ef94-3683-319a-9d8d-5c3fdd09de24","version": "0.1.0","python_path": "d3m.primitives.graph_clustering.gaussian_clustering.JHU","name": "jhu.gclust"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.2.produce"}},"outputs": [{"id": "produce"}],"hyperparams": {"max_clusters": {"type": "VALUE","data": 10}}}]}, + {"id": "6216f2bd-2f23-4dbf-92d0-f3b40aeac150","schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", "created": "2020-01-21T20:00:00.000000Z", "inputs": [{"name": "inputs"}],"outputs": [{"data": "steps.2.produce","name": "Predictions"}],"steps": [{"type": "PRIMITIVE","primitive": {"id": 
"09f2eea8-667c-44b8-a955-6a153ba9ccc3","version": "0.1.0","python_path": "d3m.primitives.link_prediction.data_conversion.JHU","name": "jhu.link_pred_graph_reader"},"arguments": {"inputs": {"type": "CONTAINER","data": "inputs.0"}},"outputs": [{"id": "produce"}]},{"type": "PRIMITIVE","primitive": {"id": "b940ccbd-9e9b-3166-af50-210bfd79251b","version": "0.1.0","python_path": "d3m.primitives.data_transformation.adjacency_spectral_embedding.JHU","name": "jhu.ase"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.0.produce"}},"outputs": [{"id": "produce"}],"hyperparams": {"which_elbow": {"type": "VALUE","data": 1},"max_dimension": {"type": "VALUE","data": 2},"use_attributes": {"type": "VALUE","data": false}}},{"type": "PRIMITIVE","primitive": {"id": "25e97696-b96f-4f5c-8620-b340fe83414d","version": "0.1.0","python_path": "d3m.primitives.link_prediction.rank_classification.JHU","name": "jhu.link_pred_rc"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.1.produce"}},"outputs": [{"id": "produce"}]}]}, + {"id": "0f5d0c4a-2c7f-4a9b-9441-80449c460993","schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", "created": "2020-01-21T20:00:00.000000Z", "inputs": [{"name": "inputs"}],"outputs": [{"data": "steps.1.produce","name": "output"}],"steps": [{"type": "PRIMITIVE","primitive": {"id": "79012210-2463-4f94-9da6-11bdc5a7e6c4","version": "0.1.2","python_path": "d3m.primitives.data_transformation.load_single_graph.DistilSingleGraphLoader","name": "Load single graph and dataframe into a parseable object"},"arguments": {"inputs": {"type": "CONTAINER","data": "inputs.0"}},"outputs": [{"id": "produce"},{"id": "produce_target"}]},{"type": "PRIMITIVE","primitive": {"id": "fc138210-c317-4528-81ae-5eed3a1a0267","version": "0.1.1","python_path": "d3m.primitives.link_prediction.link_prediction.DistilLinkPrediction","name": "LinkPrediction"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.0.produce"},"outputs": {"type": "CONTAINER","data": "steps.0.produce_target"}},"outputs": [{"id": "produce"}],"hyperparams": {"metric": {"type": "VALUE","data": "accuracy"}}}]} + + ], + "VERTEX_NOMINATION": [], + "COMMUNITY_DETECTION": [ + {"id": "bfe17a08-bc94-4f6d-8be1-4758e899a6c6","schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", "created": "2020-01-21T20:00:00.000000Z", "inputs": [{"name": "inputs"}],"outputs": [{"data": "steps.1.produce","name": "output"}],"steps": [{"type": "PRIMITIVE","primitive": {"id": "79012210-2463-4f94-9da6-11bdc5a7e6c4","version": "0.1.2","python_path": "d3m.primitives.data_transformation.load_single_graph.DistilSingleGraphLoader","name": "Load single graph and dataframe into a parseable object"},"arguments": {"inputs": {"type": "CONTAINER","data": "inputs.0"}},"outputs": [{"id": "produce"},{"id": "produce_target"}]},{"type": "PRIMITIVE","primitive": {"id": "064cec55-39dd-45b7-a663-50d3e17e0c42","version": "0.1.1","python_path": "d3m.primitives.community_detection.community_detection.DistilCommunityDetection","name": "CommunityDetection"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.0.produce"},"outputs": {"type": "CONTAINER","data": "steps.0.produce_target"}},"outputs": [{"id": "produce"}],"hyperparams": {"metric": {"type": "VALUE","data": "normalizedMutualInformation"}}}]}, + {"id": "0f6cafc4-5628-47bc-bbf5-8cab3a7c0e95","schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", "created": "2020-01-21T20:00:00.000000Z", "inputs": [{"name": "inputs"}],"outputs": [{"data": 
"steps.3.produce","name": "Predictions"}],"steps": [{"type": "PRIMITIVE","primitive": {"id": "cb192a83-63e2-4075-bab9-e6ba1a8365b6","version": "0.1.0","python_path": "d3m.primitives.data_transformation.load_graphs.JHU","name": "Extract a list of Graphs from a Dataset"},"arguments": {"inputs": {"type": "CONTAINER","data": "inputs.0"}},"outputs": [{"id": "produce"}]},{"type": "PRIMITIVE","primitive": {"id": "32fec24f-6861-4a4c-88f3-d4ec2bc1b486","version": "0.1.0","python_path": "d3m.primitives.data_preprocessing.largest_connected_component.JHU","name": "jhu.lcc"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.0.produce"}},"outputs": [{"id": "produce"}]},{"type": "PRIMITIVE","primitive": {"id": "8fa6178b-84f7-37d8-87e8-4d3a44c86569","version": "0.1.0","python_path": "d3m.primitives.data_transformation.laplacian_spectral_embedding.JHU","name": "jhu.lse"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.1.produce"}},"outputs": [{"id": "produce"}],"hyperparams": {"max_dimension": {"type": "VALUE","data": 5},"use_attributes": {"type": "VALUE","data": true}}},{"type": "PRIMITIVE","primitive": {"id": "5194ef94-3683-319a-9d8d-5c3fdd09de24","version": "0.1.0","python_path": "d3m.primitives.graph_clustering.gaussian_clustering.JHU","name": "jhu.gclust"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.2.produce"}},"outputs": [{"id": "produce"}],"hyperparams": {"max_clusters": {"type": "VALUE","data": 10}}}]}, + {"id": "ffc49730-eb73-423c-ab6c-acb47300fcfc","schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", "created": "2020-01-21T20:00:00.000000Z", "inputs": [{"name": "inputs"}],"outputs": [{"data": "steps.3.produce","name": "Predictions"}],"steps": [{"type": "PRIMITIVE","primitive": {"id": "cb192a83-63e2-4075-bab9-e6ba1a8365b6","version": "0.1.0","python_path": "d3m.primitives.data_transformation.load_graphs.JHU","name": "Extract a list of Graphs from a Dataset"},"arguments": {"inputs": {"type": "CONTAINER","data": "inputs.0"}},"outputs": [{"id": "produce"}]},{"type": "PRIMITIVE","primitive": {"id": "32fec24f-6861-4a4c-88f3-d4ec2bc1b486","version": "0.1.0","python_path": "d3m.primitives.data_preprocessing.largest_connected_component.JHU","name": "jhu.lcc"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.0.produce"}},"outputs": [{"id": "produce"}]},{"type": "PRIMITIVE","primitive": {"id": "8fa6178b-84f7-37d8-87e8-4d3a44c86569","version": "0.1.0","python_path": "d3m.primitives.data_transformation.laplacian_spectral_embedding.JHU","name": "jhu.lse"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.1.produce"}},"outputs": [{"id": "produce"}],"hyperparams": {"max_dimension": {"type": "VALUE","data": 5},"use_attributes": {"type": "VALUE","data": true}}},{"type": "PRIMITIVE","primitive": {"id": "c9d5da5d-0520-468e-92df-bd3a85bb4fac","version": "0.1.0","python_path": "d3m.primitives.classification.gaussian_classification.JHU","name": "jhu.gclass"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.2.produce"}},"outputs": [{"id": "produce"}]}]} + ], + "GRAPH_MATCHING": [ + {"id": "b5dd2766-da63-4526-a29b-e6322c1f9cc8","schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", "created": "2020-01-21T20:00:00.000000Z", "inputs": [{"name": "inputs"}],"outputs": [{"data": "steps.0.produce","name": "Predictions"}],"steps": [{"type": "PRIMITIVE","primitive": {"id": "ff22e721-e4f5-32c9-ab51-b90f32603a56","version": "0.1.0","python_path": "d3m.primitives.graph_matching.seeded_graph_matching.JHU","name": 
"jhu.sgm"},"arguments": {"inputs": {"type": "CONTAINER","data": "inputs.0"}},"outputs": [{"id": "produce"}]}]}, + {"id": "2bf14cda-1edd-4abd-a499-422913c075e6","schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", "created": "2020-01-21T20:00:00.000000Z", "inputs": [{"name": "inputs"}],"outputs": [{"data": "steps.1.produce","name": "output"}],"steps": [{"type": "PRIMITIVE","primitive": {"id": "ae0797506-ea7b-4a7f-a7e4-2f91e2082f05","version": "0.1.2","python_path": "d3m.primitives.data_transformation.load_graphs.DistilGraphLoader","name": "Load graphs into a parseable object"},"arguments": {"inputs": {"type": "CONTAINER","data": "inputs.0"}},"outputs": [{"id": "produce"},{"id": "produce_target"}]},{"type": "PRIMITIVE","primitive": {"id": "8baea8e6-9d3a-46d7-acf1-04fd593dcd37","version": "0.2.0","python_path": "d3m.primitives.graph_matching.seeded_graph_matching.DistilSeededGraphMatcher","name": "SeededGraphMatcher"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.0.produce"},"outputs": {"type": "CONTAINER","data": "steps.0.produce_target"}},"outputs": [{"id": "produce"}],"hyperparams": {"metric": {"type": "VALUE","data": "accuracy"}}}]} + ], + "COLLABORATIVE_FILTERING": [ + {"id": "8c3a2db6-4449-4a7a-9830-1b9cf2b993d6","schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", "created": "2020-01-21T20:00:00.000000Z", "inputs": [{"name": "inputs"}],"outputs": [{"data": "steps.6.produce","name": "output"}],"steps": [{"type": "PRIMITIVE","primitive": {"id": "4b42ce1e-9b98-4a25-b68e-fad13311eb65","version": "0.3.0","python_path": "d3m.primitives.data_transformation.dataset_to_dataframe.Common","name": "Extract a DataFrame from a Dataset"},"arguments": {"inputs": {"type": "CONTAINER","data": "inputs.0"}},"outputs": [{"id": "produce"}]},{"type": "PRIMITIVE","primitive": {"id": "e193afa1-b45e-4d29-918f-5bb1fa3b88a7","version": "0.2.0","python_path": "d3m.primitives.schema_discovery.profiler.Common","name": "Determine missing semantic types for columns automatically"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.0.produce"}},"outputs": [{"id": "produce"}]},{"type": "PRIMITIVE","primitive": {"id": "d510cb7a-1782-4f51-b44c-58f0236e47c7","version": "0.6.0","python_path": "d3m.primitives.data_transformation.column_parser.Common","name": "Parses strings into their types"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.1.produce"}},"outputs": [{"id": "produce"}],"hyperparams": {"parse_semantic_types": {"type": "VALUE","data": ["http://schema.org/Boolean","http://schema.org/Integer","http://schema.org/Float","https://metadata.datadrivendiscovery.org/types/FloatVector"]}}},{"type": "PRIMITIVE","primitive": {"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1","version": "0.3.0","python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common","name": "Extracts columns by semantic type"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.2.produce"}},"outputs": [{"id": "produce"}],"hyperparams": {"semantic_types": {"type": "VALUE","data": ["https://metadata.datadrivendiscovery.org/types/Attribute"]}}},{"type": "PRIMITIVE","primitive": {"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1","version": "0.3.0","python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common","name": "Extracts columns by semantic type"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.2.produce"}},"outputs": [{"id": "produce"}],"hyperparams": {"semantic_types": {"type": "VALUE","data": 
["https://metadata.datadrivendiscovery.org/types/Target","https://metadata.datadrivendiscovery.org/types/TrueTarget"]}}},{"type": "PRIMITIVE","primitive": {"id": "a242314d-7955-483f-aed6-c74cd2b880df","version": "0.1.4","python_path": "d3m.primitives.collaborative_filtering.collaborative_filtering_link_prediction.DistilCollaborativeFiltering","name": "Collaborative filtering"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.3.produce"},"outputs": {"type": "CONTAINER","data": "steps.4.produce"}},"outputs": [{"id": "produce"}]},{"type": "PRIMITIVE","primitive": {"id": "8d38b340-f83f-4877-baaa-162f8e551736","version": "0.3.0","python_path": "d3m.primitives.data_transformation.construct_predictions.Common","name": "Construct pipeline predictions output"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.5.produce"},"reference": {"type": "CONTAINER","data": "steps.2.produce"}},"outputs": [{"id": "produce"}]}]}, + {"id": "15cea2f3-9eef-4a37-8f04-eea2e30f8d68","inputs": [{"name": "inputs"}],"outputs": [{"data": "steps.9.produce","name": "output"}],"schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", "created": "2020-01-21T20:00:00.000000Z", "steps": [{"arguments": {"inputs": {"data": "inputs.0","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "4b42ce1e-9b98-4a25-b68e-fad13311eb65","name": "Extract a DataFrame from a Dataset","python_path": "d3m.primitives.data_transformation.dataset_to_dataframe.Common","version": "0.3.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.0.produce","type": "CONTAINER"}},"hyperparams": {"columns": {"data": [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26],"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "81d7e261-e25b-4721-b091-a31cd46e99ae","name": "Extracts columns","python_path": "d3m.primitives.data_transformation.extract_columns.Common","version": "0.1.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.1.produce","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "196152a7-a873-4676-bbde-95627f4b5306","name": "Preprocessing for categorical columns","python_path": "d3m.primitives.column_parser.preprocess_categorical_columns.Cornell","version": "v0.1.1"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.2.produce","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "d639947e-ece0-3a39-a666-e974acf4521d","name": "sklearn.preprocessing.data.StandardScaler","python_path": "d3m.primitives.data_preprocessing.standard_scaler.SKlearn","version": "2019.6.7"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.0.produce","type": "CONTAINER"}},"hyperparams": {"semantic_types": {"data": ["https://metadata.datadrivendiscovery.org/types/TrueTarget"],"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1","name": "Extracts columns by semantic type","python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common","version": "0.3.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.4.produce","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "26fc8fd3-f6b2-4c65-8afb-edb54ed2a3e4","name": "Label encoder with an unseen category","python_path": "d3m.primitives.data_preprocessing.label_encoder.Common","version": "0.2.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.3.produce","type": "CONTAINER"},"outputs": {"data": "steps.3.produce","type": 
"CONTAINER"}},"hyperparams": {"alpha": {"data": 0.01,"type": "VALUE"},"beta": {"data": 0.01,"type": "VALUE"},"d": {"data": 50,"type": "VALUE"},"maxiter": {"data": 500,"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "e6ee30fa-af68-4bfe-9234-5ca7e7ac8e93","name": "Matrix Completion via Sparse Factorization","python_path": "d3m.primitives.collaborative_filtering.high_rank_imputer.Cornell","version": "v0.1.1"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.6.produce","type": "CONTAINER"},"outputs": {"data": "steps.5.produce","type": "CONTAINER"}},"hyperparams": {"n_estimators": {"data": 50,"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "1b2a32a6-0ec5-3ca0-9386-b8b1f1b831d1","name": "sklearn.ensemble.bagging.BaggingClassifier","python_path": "d3m.primitives.classification.bagging.SKlearn","version": "2019.6.7"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.7.produce","type": "CONTAINER"}},"hyperparams": {"encoder": {"data": 5,"type": "PRIMITIVE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "39ae30f7-39ed-40af-8679-5cf108499605","name": "Label decoder for UnseenLabelEncoderPrimitive","python_path": "d3m.primitives.data_preprocessing.label_decoder.Common","version": "0.1.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.8.produce","type": "CONTAINER"},"reference": {"data": "steps.0.produce","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "8d38b340-f83f-4877-baaa-162f8e551736","name": "Construct pipeline predictions output","python_path": "d3m.primitives.data_transformation.construct_predictions.Common","version": "0.3.0"},"type": "PRIMITIVE"}]}, + {"id": "164f4dfe-fcca-4769-aa10-d0d9f2a72cb3","inputs": [{"name": "inputs"}],"outputs": [{"data": "steps.9.produce","name": "output"}],"schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", "created": "2020-01-21T20:00:00.000000Z", "steps": [{"arguments": {"inputs": {"data": "inputs.0","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "4b42ce1e-9b98-4a25-b68e-fad13311eb65","name": "Extract a DataFrame from a Dataset","python_path": "d3m.primitives.data_transformation.dataset_to_dataframe.Common","version": "0.3.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.0.produce","type": "CONTAINER"}},"hyperparams": {"columns": {"data": [2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100],"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "81d7e261-e25b-4721-b091-a31cd46e99ae","name": "Extracts columns","python_path": "d3m.primitives.data_transformation.extract_columns.Common","version": "0.1.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.1.produce","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "196152a7-a873-4676-bbde-95627f4b5306","name": "Preprocessing for categorical columns","python_path": "d3m.primitives.column_parser.preprocess_categorical_columns.Cornell","version": "v0.1.1"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.2.produce","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "d639947e-ece0-3a39-a666-e974acf4521d","name": "sklearn.preprocessing.data.StandardScaler","python_path": 
"d3m.primitives.data_preprocessing.standard_scaler.SKlearn","version": "2019.6.7"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.3.produce","type": "CONTAINER"},"outputs": {"data": "steps.3.produce","type": "CONTAINER"}},"hyperparams": {"alpha": {"data": 0.1,"type": "VALUE"},"beta": {"data": 0.01,"type": "VALUE"},"d": {"data": 20,"type": "VALUE"},"maxiter": {"data": 1000,"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "e6ee30fa-af68-4bfe-9234-5ca7e7ac8e93","name": "Matrix Completion via Sparse Factorization","python_path": "d3m.primitives.collaborative_filtering.high_rank_imputer.Cornell","version": "v0.1.1"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.0.produce","type": "CONTAINER"}},"hyperparams": {"semantic_types": {"data": ["https://metadata.datadrivendiscovery.org/types/TrueTarget"],"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1","name": "Extracts columns by semantic type","python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common","version": "0.3.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.5.produce","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "26fc8fd3-f6b2-4c65-8afb-edb54ed2a3e4","name": "Label encoder with an unseen category","python_path": "d3m.primitives.data_preprocessing.label_encoder.Common","version": "0.2.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.4.produce","type": "CONTAINER"},"outputs": {"data": "steps.6.produce","type": "CONTAINER"}},"hyperparams": {"C": {"data": 100,"type": "VALUE"},"kernel": {"data": {"choice": "rbf","gamma": {"case": "float","value": 0.02}},"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "0ae7d42d-f765-3348-a28c-57d94880aa6a","name": "sklearn.svm.classes.SVC","python_path": "d3m.primitives.classification.svc.SKlearn","version": "2019.6.7"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.7.produce","type": "CONTAINER"}},"hyperparams": {"encoder": {"data": 6,"type": "PRIMITIVE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "39ae30f7-39ed-40af-8679-5cf108499605","name": "Label decoder for UnseenLabelEncoderPrimitive","python_path": "d3m.primitives.data_preprocessing.label_decoder.Common","version": "0.1.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.8.produce","type": "CONTAINER"},"reference": {"data": "steps.0.produce","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "8d38b340-f83f-4877-baaa-162f8e551736","name": "Construct pipeline predictions output","python_path": "d3m.primitives.data_transformation.construct_predictions.Common","version": "0.3.0"},"type": "PRIMITIVE"}]}, + {"id": "9ea39abe-b164-4eff-918e-c364ce87d167","inputs": [{"name": "inputs"}],"outputs": [{"data": "steps.6.produce","name": "output predictions"}],"schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", "created": "2020-01-21T20:00:00.000000Z", "steps": [{"arguments": {"inputs": {"data": "inputs.0","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "4b42ce1e-9b98-4a25-b68e-fad13311eb65","name": "Extract a DataFrame from a Dataset","python_path": "d3m.primitives.data_transformation.dataset_to_dataframe.Common","version": "0.3.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.0.produce","type": "CONTAINER"}},"hyperparams": {"columns": {"data": [1,2],"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": 
"81d7e261-e25b-4721-b091-a31cd46e99ae","name": "Extracts columns","python_path": "d3m.primitives.data_transformation.extract_columns.Common","version": "0.1.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.1.produce","type": "CONTAINER"}},"hyperparams": {"convert": {"data": true,"type": "VALUE"},"to_type": {"data": {"encoding": "pickle","value": "gANjYnVpbHRpbnMKaW50CnEALg=="},"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "196152a7-a873-4676-bbde-95627f4b5306","name": "Preprocessing for categorical columns","python_path": "d3m.primitives.column_parser.preprocess_categorical_columns.Cornell","version": "v0.1.1"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.0.produce","type": "CONTAINER"}},"hyperparams": {"columns": {"data": [3],"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "81d7e261-e25b-4721-b091-a31cd46e99ae","name": "Extracts columns","python_path": "d3m.primitives.data_transformation.extract_columns.Common","version": "0.1.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.3.produce","type": "CONTAINER"}},"hyperparams": {"convert": {"data": false,"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "196152a7-a873-4676-bbde-95627f4b5306","name": "Preprocessing for categorical columns","python_path": "d3m.primitives.column_parser.preprocess_categorical_columns.Cornell","version": "v0.1.1"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.2.produce","type": "CONTAINER"},"outputs": {"data": "steps.4.produce","type": "CONTAINER"}},"hyperparams": {"alpha": {"data": 1,"type": "VALUE"},"beta": {"data": 1,"type": "VALUE"},"d": {"data": 100,"type": "VALUE"},"maxiter": {"data": 1000,"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "e6ee30fa-af68-4bfe-9234-5ca7e7ac8e93","name": "Matrix Completion via Sparse Factorization","python_path": "d3m.primitives.collaborative_filtering.high_rank_imputer.Cornell","version": "v0.1.1"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.5.produce","type": "CONTAINER"},"reference": {"data": "steps.0.produce","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "8d38b340-f83f-4877-baaa-162f8e551736","name": "Construct pipeline predictions output","python_path": "d3m.primitives.data_transformation.construct_predictions.Common","version": "0.3.0"},"type": "PRIMITIVE"}]}, + {"id": "c4019fda-d205-4f89-9acf-5741e45e601a","inputs": [{"name": "inputs"}],"outputs": [{"data": "steps.9.produce","name": "output"}],"schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", "created": "2020-01-21T20:00:00.000000Z", "steps": [{"arguments": {"inputs": {"data": "inputs.0","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "4b42ce1e-9b98-4a25-b68e-fad13311eb65","name": "Extract a DataFrame from a Dataset","python_path": "d3m.primitives.data_transformation.dataset_to_dataframe.Common","version": "0.3.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.0.produce","type": "CONTAINER"}},"hyperparams": {"columns": {"data": [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89],"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "81d7e261-e25b-4721-b091-a31cd46e99ae","name": "Extracts columns","python_path": 
"d3m.primitives.data_transformation.extract_columns.Common","version": "0.1.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.1.produce","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "196152a7-a873-4676-bbde-95627f4b5306","name": "Preprocessing for categorical columns","python_path": "d3m.primitives.column_parser.preprocess_categorical_columns.Cornell","version": "v0.1.1"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.2.produce","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "d639947e-ece0-3a39-a666-e974acf4521d","name": "sklearn.preprocessing.data.StandardScaler","python_path": "d3m.primitives.data_preprocessing.standard_scaler.SKlearn","version": "2019.6.7"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.0.produce","type": "CONTAINER"}},"hyperparams": {"semantic_types": {"data": ["https://metadata.datadrivendiscovery.org/types/TrueTarget"],"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1","name": "Extracts columns by semantic type","python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common","version": "0.3.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.4.produce","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "26fc8fd3-f6b2-4c65-8afb-edb54ed2a3e4","name": "Label encoder with an unseen category","python_path": "d3m.primitives.data_preprocessing.label_encoder.Common","version": "0.2.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.3.produce","type": "CONTAINER"},"outputs": {"data": "steps.3.produce","type": "CONTAINER"}},"hyperparams": {"alpha": {"data": 0.01,"type": "VALUE"},"beta": {"data": 0.001,"type": "VALUE"},"d": {"data": 90,"type": "VALUE"},"maxiter": {"data": 1000,"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "e6ee30fa-af68-4bfe-9234-5ca7e7ac8e93","name": "Matrix Completion via Sparse Factorization","python_path": "d3m.primitives.collaborative_filtering.high_rank_imputer.Cornell","version": "v0.1.1"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.6.produce","type": "CONTAINER"},"outputs": {"data": "steps.5.produce","type": "CONTAINER"}},"hyperparams": {"C": {"data": 100,"type": "VALUE"},"kernel": {"data": {"choice": "rbf","gamma": {"case": "float","value": 0.01}},"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "0ae7d42d-f765-3348-a28c-57d94880aa6a","name": "sklearn.svm.classes.SVC","python_path": "d3m.primitives.classification.svc.SKlearn","version": "2019.6.7"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.7.produce","type": "CONTAINER"}},"hyperparams": {"encoder": {"data": 5,"type": "PRIMITIVE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "39ae30f7-39ed-40af-8679-5cf108499605","name": "Label decoder for UnseenLabelEncoderPrimitive","python_path": "d3m.primitives.data_preprocessing.label_decoder.Common","version": "0.1.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.8.produce","type": "CONTAINER"},"reference": {"data": "steps.0.produce","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "8d38b340-f83f-4877-baaa-162f8e551736","name": "Construct pipeline predictions output","python_path": "d3m.primitives.data_transformation.construct_predictions.Common","version": "0.3.0"},"type": "PRIMITIVE"}]}, + {"id": "e1a156e9-0e34-4def-b960-5ad5f3a910a1","inputs": [{"name": "inputs"}],"outputs": [{"data": 
"steps.9.produce","name": "output"}],"schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", "created": "2020-01-21T20:00:00.000000Z","steps": [{"arguments": {"inputs": {"data": "inputs.0","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "4b42ce1e-9b98-4a25-b68e-fad13311eb65","name": "Extract a DataFrame from a Dataset","python_path": "d3m.primitives.data_transformation.dataset_to_dataframe.Common","version": "0.3.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.0.produce","type": "CONTAINER"}},"hyperparams": {"columns": {"data": [2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77],"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "81d7e261-e25b-4721-b091-a31cd46e99ae","name": "Extracts columns","python_path": "d3m.primitives.data_transformation.extract_columns.Common","version": "0.1.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.1.produce","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "d510cb7a-1782-4f51-b44c-58f0236e47c7","name": "Parses strings into their types","python_path": "d3m.primitives.data_transformation.column_parser.Common","version": "0.5.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.2.produce","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "196152a7-a873-4676-bbde-95627f4b5306","name": "Preprocessing for categorical columns","python_path": "d3m.primitives.column_parser.preprocess_categorical_columns.Cornell","version": "v0.1.1"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.0.produce","type": "CONTAINER"}},"hyperparams": {"semantic_types": {"data": ["https://metadata.datadrivendiscovery.org/types/TrueTarget"],"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1","name": "Extracts columns by semantic type","python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common","version": "0.3.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.4.produce","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "26fc8fd3-f6b2-4c65-8afb-edb54ed2a3e4","name": "Label encoder with an unseen category","python_path": "d3m.primitives.data_preprocessing.label_encoder.Common","version": "0.2.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.3.produce","type": "CONTAINER"},"outputs": {"data": "steps.3.produce","type": "CONTAINER"}},"hyperparams": {"alpha": {"data": 0.01,"type": "VALUE"},"beta": {"data": 0.001,"type": "VALUE"},"d": {"data": 100,"type": "VALUE"},"maxiter": {"data": 1000,"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "e6ee30fa-af68-4bfe-9234-5ca7e7ac8e93","name": "Matrix Completion via Sparse Factorization","python_path": "d3m.primitives.collaborative_filtering.high_rank_imputer.Cornell","version": "v0.1.1"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.6.produce","type": "CONTAINER"},"outputs": {"data": "steps.5.produce","type": "CONTAINER"}},"hyperparams": {"C": {"data": 100,"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "0ae7d42d-f765-3348-a28c-57d94880aa6a","name": "sklearn.svm.classes.SVC","python_path": "d3m.primitives.classification.svc.SKlearn","version": "2019.6.7"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": 
"steps.7.produce","type": "CONTAINER"}},"hyperparams": {"encoder": {"data": 5,"type": "PRIMITIVE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "39ae30f7-39ed-40af-8679-5cf108499605","name": "Label decoder for UnseenLabelEncoderPrimitive","python_path": "d3m.primitives.data_preprocessing.label_decoder.Common","version": "0.1.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.8.produce","type": "CONTAINER"},"reference": {"data": "steps.0.produce","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "8d38b340-f83f-4877-baaa-162f8e551736","name": "Construct pipeline predictions output","python_path": "d3m.primitives.data_transformation.construct_predictions.Common","version": "0.3.0"},"type": "PRIMITIVE"}]} + ], + "OBJECT_DETECTION": [ + {"id": "f0aeacc2-3147-4a35-ac75-449e3f92f286", "schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", "created": "2020-01-21T20:00:00.000000Z", "inputs": [{"name": "inputs"}], "outputs": [{"data": "steps.2.produce", "name": "output_predictions"}], "steps": [{"type": "PRIMITIVE", "primitive": {"id": "f31f8c1f-d1c5-43e5-a4b2-2ae4a761ef2e", "version": "0.2.0", "python_path": "d3m.primitives.data_transformation.denormalize.Common", "name": "Denormalize datasets"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "inputs.0"}}, "outputs": [{"id": "produce"}]}, {"type": "PRIMITIVE", "primitive": {"id": "4b42ce1e-9b98-4a25-b68e-fad13311eb65", "version": "0.3.0", "python_path": "d3m.primitives.data_transformation.dataset_to_dataframe.Common", "name": "Extract a DataFrame from a Dataset"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.0.produce"}}, "outputs": [{"id": "produce"}], "hyperparams": {"dataframe_resource": {"type": "VALUE", "data": "learningData"}}}, {"type": "PRIMITIVE", "primitive": {"id": "d921be1e-b158-4ab7-abb3-cb1b17f42639", "version": "0.1.0", "python_path": "d3m.primitives.object_detection.retinanet", "name": "retina_net"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.1.produce"}, "outputs": {"type": "CONTAINER", "data": "steps.1.produce"}}, "outputs": [{"id": "produce"}]}]}, + {"id": "dd2d98ed-5d94-4245-a0c9-0861ed7bc177","schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", "created": "2020-01-21T20:00:00.000000Z","inputs": [{"name": "input dataset"}],"outputs": [{"data": "steps.4.produce","name": "predictions of input dataset"}],"steps": [{"type": "PRIMITIVE","primitive": {"id": "f31f8c1f-d1c5-43e5-a4b2-2ae4a761ef2e","version": "0.2.0","python_path": "d3m.primitives.data_transformation.denormalize.Common","name": "Denormalize datasets"},"arguments": {"inputs": {"type": "CONTAINER","data": "inputs.0"}},"outputs": [{"id": "produce"}]},{"type": "PRIMITIVE","primitive": {"id": "4b42ce1e-9b98-4a25-b68e-fad13311eb65","version": "0.3.0","python_path": "d3m.primitives.data_transformation.dataset_to_dataframe.Common","name": "Extract a DataFrame from a Dataset"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.0.produce"}},"outputs": [{"id": "produce"}]},{"type": "PRIMITIVE","primitive": {"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1","version": "0.3.0","python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common","name": "Extracts columns by semantic type"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.1.produce"}},"outputs": [{"id": "produce"}],"hyperparams": {"semantic_types": {"type": "VALUE","data": 
["https://metadata.datadrivendiscovery.org/types/PrimaryMultiKey","https://metadata.datadrivendiscovery.org/types/FileName"]}}},{"type": "PRIMITIVE","primitive": {"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1","version": "0.3.0","python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common","name": "Extracts columns by semantic type"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.1.produce"}},"outputs": [{"id": "produce"}],"hyperparams": {"semantic_types": {"type": "VALUE","data": ["https://metadata.datadrivendiscovery.org/types/TrueTarget"]}}},{"type": "PRIMITIVE","primitive": {"id": "dsbox-featurizer-object-detection-yolo","version": "1.5.3","python_path": "d3m.primitives.feature_extraction.yolo.DSBOX","name": "DSBox Object Detection YOLO"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.2.produce"},"outputs": {"type": "CONTAINER","data": "steps.3.produce"}},"outputs": [{"id": "produce"}],"hyperparams": {"epochs": {"type": "VALUE","data": 200},"use_fitted_weight": {"type": "VALUE","data": false}}}]}, + {"id":"acdb068f-be85-48b1-81cc-e65d7b148d74","schema":"https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", "created": "2020-01-21T20:00:00.000000Z","inputs":[{"name":"input dataset"}],"outputs":[{"data":"steps.4.produce","name":"predictions of input dataset"}],"steps":[{"type":"PRIMITIVE","primitive":{"id":"f31f8c1f-d1c5-43e5-a4b2-2ae4a761ef2e","version":"0.2.0","python_path":"d3m.primitives.data_transformation.denormalize.Common","name":"Denormalize datasets"},"arguments":{"inputs":{"type":"CONTAINER","data":"inputs.0"}},"outputs":[{"id":"produce"}]},{"type":"PRIMITIVE","primitive":{"id":"4b42ce1e-9b98-4a25-b68e-fad13311eb65","version":"0.3.0","python_path":"d3m.primitives.data_transformation.dataset_to_dataframe.Common","name":"Extract a DataFrame from a Dataset"},"arguments":{"inputs":{"type":"CONTAINER","data":"steps.0.produce"}},"outputs":[{"id":"produce"}]},{"type":"PRIMITIVE","primitive":{"id":"4503a4c6-42f7-45a1-a1d4-ed69699cf5e1","version":"0.4.0","python_path":"d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common","name":"Extracts columns by semantic type"},"arguments":{"inputs":{"type":"CONTAINER","data":"steps.1.produce"}},"outputs":[{"id":"produce"}],"hyperparams":{"semantic_types":{"type":"VALUE","data":["https://metadata.datadrivendiscovery.org/types/PrimaryMultiKey","https://metadata.datadrivendiscovery.org/types/FileName"]}}},{"type":"PRIMITIVE","primitive":{"id":"4503a4c6-42f7-45a1-a1d4-ed69699cf5e1","version":"0.4.0","python_path":"d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common","name":"Extracts columns by semantic type"},"arguments":{"inputs":{"type":"CONTAINER","data":"steps.1.produce"}},"outputs":[{"id":"produce"}],"hyperparams":{"semantic_types":{"type":"VALUE","data":["https://metadata.datadrivendiscovery.org/types/TrueTarget"]}}},{"type":"PRIMITIVE","primitive":{"id":"dsbox-featurizer-object-detection-yolo","version":"1.5.3","python_path":"d3m.primitives.feature_extraction.yolo.DSBOX","name":"DSBox Object Detection YOLO"},"arguments":{"inputs":{"type":"CONTAINER","data":"steps.2.produce"},"outputs":{"type":"CONTAINER","data":"steps.3.produce"}},"outputs":[{"id":"produce"}]}]} + ], + "VERTEX_CLASSIFICATION": [ + {"id": "704163cb-eb0d-4771-8258-5e057503a437","schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", "created": "2020-01-21T20:00:00.000000Z","inputs": [{"name": "inputs"}],"outputs": [{"data": 
"steps.1.produce","name": "output"}],"steps": [{"type": "PRIMITIVE","primitive": {"id": "79012210-2463-4f94-9da6-11bdc5a7e6c4","version": "0.1.2","python_path": "d3m.primitives.data_transformation.load_single_graph.DistilSingleGraphLoader","name": "Load single graph and dataframe into a parseable object"},"arguments": {"inputs": {"type": "CONTAINER","data": "inputs.0"}},"outputs": [{"id": "produce"},{"id": "produce_target"}]},{"type": "PRIMITIVE","primitive": {"id": "0130828c-1ac0-47a9-a167-f05bae5a3146","version": "0.1.1","python_path": "d3m.primitives.vertex_nomination.vertex_nomination.DistilVertexNomination","name": "VertexNomination"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.0.produce"},"outputs": {"type": "CONTAINER","data": "steps.0.produce_target"}},"outputs": [{"id": "produce"}],"hyperparams": {"metric": {"type": "VALUE","data": "accuracy"}}}]}, + {"id": "15cea2f3-9eef-4a37-8f04-eea2e30f8d68","inputs": [{"name": "inputs"}],"outputs": [{"data": "steps.9.produce","name": "output"}],"schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", "created": "2020-01-21T20:00:00.000000Z","steps": [{"arguments": {"inputs": {"data": "inputs.0","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "4b42ce1e-9b98-4a25-b68e-fad13311eb65","name": "Extract a DataFrame from a Dataset","python_path": "d3m.primitives.data_transformation.dataset_to_dataframe.Common","version": "0.3.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.0.produce","type": "CONTAINER"}},"hyperparams": {"columns": {"data": [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26],"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "81d7e261-e25b-4721-b091-a31cd46e99ae","name": "Extracts columns","python_path": "d3m.primitives.data_transformation.extract_columns.Common","version": "0.1.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.1.produce","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "196152a7-a873-4676-bbde-95627f4b5306","name": "Preprocessing for categorical columns","python_path": "d3m.primitives.column_parser.preprocess_categorical_columns.Cornell","version": "v0.1.1"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.2.produce","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "d639947e-ece0-3a39-a666-e974acf4521d","name": "sklearn.preprocessing.data.StandardScaler","python_path": "d3m.primitives.data_preprocessing.standard_scaler.SKlearn","version": "2019.6.7"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.0.produce","type": "CONTAINER"}},"hyperparams": {"semantic_types": {"data": ["https://metadata.datadrivendiscovery.org/types/TrueTarget"],"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1","name": "Extracts columns by semantic type","python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common","version": "0.3.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.4.produce","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "26fc8fd3-f6b2-4c65-8afb-edb54ed2a3e4","name": "Label encoder with an unseen category","python_path": "d3m.primitives.data_preprocessing.label_encoder.Common","version": "0.2.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.3.produce","type": "CONTAINER"},"outputs": {"data": "steps.3.produce","type": "CONTAINER"}},"hyperparams": {"alpha": {"data": 0.01,"type": 
"VALUE"},"beta": {"data": 0.01,"type": "VALUE"},"d": {"data": 50,"type": "VALUE"},"maxiter": {"data": 500,"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "e6ee30fa-af68-4bfe-9234-5ca7e7ac8e93","name": "Matrix Completion via Sparse Factorization","python_path": "d3m.primitives.collaborative_filtering.high_rank_imputer.Cornell","version": "v0.1.1"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.6.produce","type": "CONTAINER"},"outputs": {"data": "steps.5.produce","type": "CONTAINER"}},"hyperparams": {"n_estimators": {"data": 50,"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "1b2a32a6-0ec5-3ca0-9386-b8b1f1b831d1","name": "sklearn.ensemble.bagging.BaggingClassifier","python_path": "d3m.primitives.classification.bagging.SKlearn","version": "2019.6.7"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.7.produce","type": "CONTAINER"}},"hyperparams": {"encoder": {"data": 5,"type": "PRIMITIVE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "39ae30f7-39ed-40af-8679-5cf108499605","name": "Label decoder for UnseenLabelEncoderPrimitive","python_path": "d3m.primitives.data_preprocessing.label_decoder.Common","version": "0.1.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.8.produce","type": "CONTAINER"},"reference": {"data": "steps.0.produce","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "8d38b340-f83f-4877-baaa-162f8e551736","name": "Construct pipeline predictions output","python_path": "d3m.primitives.data_transformation.construct_predictions.Common","version": "0.3.0"},"type": "PRIMITIVE"}]}, + {"id": "164f4dfe-fcca-4769-aa10-d0d9f2a72cb3","inputs": [{"name": "inputs"}],"outputs": [{"data": "steps.9.produce","name": "output"}],"schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", "created": "2020-01-21T20:00:00.000000Z","steps": [{"arguments": {"inputs": {"data": "inputs.0","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "4b42ce1e-9b98-4a25-b68e-fad13311eb65","name": "Extract a DataFrame from a Dataset","python_path": "d3m.primitives.data_transformation.dataset_to_dataframe.Common","version": "0.3.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.0.produce","type": "CONTAINER"}},"hyperparams": {"columns": {"data": [2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100],"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "81d7e261-e25b-4721-b091-a31cd46e99ae","name": "Extracts columns","python_path": "d3m.primitives.data_transformation.extract_columns.Common","version": "0.1.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.1.produce","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "196152a7-a873-4676-bbde-95627f4b5306","name": "Preprocessing for categorical columns","python_path": "d3m.primitives.column_parser.preprocess_categorical_columns.Cornell","version": "v0.1.1"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.2.produce","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "d639947e-ece0-3a39-a666-e974acf4521d","name": "sklearn.preprocessing.data.StandardScaler","python_path": "d3m.primitives.data_preprocessing.standard_scaler.SKlearn","version": "2019.6.7"},"type": 
"PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.3.produce","type": "CONTAINER"},"outputs": {"data": "steps.3.produce","type": "CONTAINER"}},"hyperparams": {"alpha": {"data": 0.1,"type": "VALUE"},"beta": {"data": 0.01,"type": "VALUE"},"d": {"data": 20,"type": "VALUE"},"maxiter": {"data": 1000,"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "e6ee30fa-af68-4bfe-9234-5ca7e7ac8e93","name": "Matrix Completion via Sparse Factorization","python_path": "d3m.primitives.collaborative_filtering.high_rank_imputer.Cornell","version": "v0.1.1"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.0.produce","type": "CONTAINER"}},"hyperparams": {"semantic_types": {"data": ["https://metadata.datadrivendiscovery.org/types/TrueTarget"],"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1","name": "Extracts columns by semantic type","python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common","version": "0.3.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.5.produce","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "26fc8fd3-f6b2-4c65-8afb-edb54ed2a3e4","name": "Label encoder with an unseen category","python_path": "d3m.primitives.data_preprocessing.label_encoder.Common","version": "0.2.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.4.produce","type": "CONTAINER"},"outputs": {"data": "steps.6.produce","type": "CONTAINER"}},"hyperparams": {"C": {"data": 100,"type": "VALUE"},"kernel": {"data": {"choice": "rbf","gamma": {"case": "float","value": 0.02}},"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "0ae7d42d-f765-3348-a28c-57d94880aa6a","name": "sklearn.svm.classes.SVC","python_path": "d3m.primitives.classification.svc.SKlearn","version": "2019.6.7"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.7.produce","type": "CONTAINER"}},"hyperparams": {"encoder": {"data": 6,"type": "PRIMITIVE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "39ae30f7-39ed-40af-8679-5cf108499605","name": "Label decoder for UnseenLabelEncoderPrimitive","python_path": "d3m.primitives.data_preprocessing.label_decoder.Common","version": "0.1.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.8.produce","type": "CONTAINER"},"reference": {"data": "steps.0.produce","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "8d38b340-f83f-4877-baaa-162f8e551736","name": "Construct pipeline predictions output","python_path": "d3m.primitives.data_transformation.construct_predictions.Common","version": "0.3.0"},"type": "PRIMITIVE"}]}, + {"id": "9ea39abe-b164-4eff-918e-c364ce87d167","inputs": [{"name": "inputs"}],"outputs": [{"data": "steps.6.produce","name": "output predictions"}],"schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", "created": "2020-01-21T20:00:00.000000Z","steps": [{"arguments": {"inputs": {"data": "inputs.0","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "4b42ce1e-9b98-4a25-b68e-fad13311eb65","name": "Extract a DataFrame from a Dataset","python_path": "d3m.primitives.data_transformation.dataset_to_dataframe.Common","version": "0.3.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.0.produce","type": "CONTAINER"}},"hyperparams": {"columns": {"data": [1,2],"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "81d7e261-e25b-4721-b091-a31cd46e99ae","name": "Extracts columns","python_path": 
"d3m.primitives.data_transformation.extract_columns.Common","version": "0.1.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.1.produce","type": "CONTAINER"}},"hyperparams": {"convert": {"data": true,"type": "VALUE"},"to_type": {"data": {"encoding": "pickle","value": "gANjYnVpbHRpbnMKaW50CnEALg=="},"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "196152a7-a873-4676-bbde-95627f4b5306","name": "Preprocessing for categorical columns","python_path": "d3m.primitives.column_parser.preprocess_categorical_columns.Cornell","version": "v0.1.1"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.0.produce","type": "CONTAINER"}},"hyperparams": {"columns": {"data": [3],"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "81d7e261-e25b-4721-b091-a31cd46e99ae","name": "Extracts columns","python_path": "d3m.primitives.data_transformation.extract_columns.Common","version": "0.1.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.3.produce","type": "CONTAINER"}},"hyperparams": {"convert": {"data": false,"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "196152a7-a873-4676-bbde-95627f4b5306","name": "Preprocessing for categorical columns","python_path": "d3m.primitives.column_parser.preprocess_categorical_columns.Cornell","version": "v0.1.1"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.2.produce","type": "CONTAINER"},"outputs": {"data": "steps.4.produce","type": "CONTAINER"}},"hyperparams": {"alpha": {"data": 1,"type": "VALUE"},"beta": {"data": 1,"type": "VALUE"},"d": {"data": 100,"type": "VALUE"},"maxiter": {"data": 1000,"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "e6ee30fa-af68-4bfe-9234-5ca7e7ac8e93","name": "Matrix Completion via Sparse Factorization","python_path": "d3m.primitives.collaborative_filtering.high_rank_imputer.Cornell","version": "v0.1.1"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.5.produce","type": "CONTAINER"},"reference": {"data": "steps.0.produce","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "8d38b340-f83f-4877-baaa-162f8e551736","name": "Construct pipeline predictions output","python_path": "d3m.primitives.data_transformation.construct_predictions.Common","version": "0.3.0"},"type": "PRIMITIVE"}]}, + {"id": "c4019fda-d205-4f89-9acf-5741e45e601a","inputs": [{"name": "inputs"}],"outputs": [{"data": "steps.9.produce","name": "output"}],"schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", "created": "2020-01-21T20:00:00.000000Z","steps": [{"arguments": {"inputs": {"data": "inputs.0","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "4b42ce1e-9b98-4a25-b68e-fad13311eb65","name": "Extract a DataFrame from a Dataset","python_path": "d3m.primitives.data_transformation.dataset_to_dataframe.Common","version": "0.3.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.0.produce","type": "CONTAINER"}},"hyperparams": {"columns": {"data": [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89],"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "81d7e261-e25b-4721-b091-a31cd46e99ae","name": "Extracts columns","python_path": "d3m.primitives.data_transformation.extract_columns.Common","version": "0.1.0"},"type": "PRIMITIVE"},{"arguments": 
{"inputs": {"data": "steps.1.produce","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "196152a7-a873-4676-bbde-95627f4b5306","name": "Preprocessing for categorical columns","python_path": "d3m.primitives.column_parser.preprocess_categorical_columns.Cornell","version": "v0.1.1"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.2.produce","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "d639947e-ece0-3a39-a666-e974acf4521d","name": "sklearn.preprocessing.data.StandardScaler","python_path": "d3m.primitives.data_preprocessing.standard_scaler.SKlearn","version": "2019.6.7"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.0.produce","type": "CONTAINER"}},"hyperparams": {"semantic_types": {"data": ["https://metadata.datadrivendiscovery.org/types/TrueTarget"],"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1","name": "Extracts columns by semantic type","python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common","version": "0.3.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.4.produce","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "26fc8fd3-f6b2-4c65-8afb-edb54ed2a3e4","name": "Label encoder with an unseen category","python_path": "d3m.primitives.data_preprocessing.label_encoder.Common","version": "0.2.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.3.produce","type": "CONTAINER"},"outputs": {"data": "steps.3.produce","type": "CONTAINER"}},"hyperparams": {"alpha": {"data": 0.01,"type": "VALUE"},"beta": {"data": 0.001,"type": "VALUE"},"d": {"data": 90,"type": "VALUE"},"maxiter": {"data": 1000,"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "e6ee30fa-af68-4bfe-9234-5ca7e7ac8e93","name": "Matrix Completion via Sparse Factorization","python_path": "d3m.primitives.collaborative_filtering.high_rank_imputer.Cornell","version": "v0.1.1"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.6.produce","type": "CONTAINER"},"outputs": {"data": "steps.5.produce","type": "CONTAINER"}},"hyperparams": {"C": {"data": 100,"type": "VALUE"},"kernel": {"data": {"choice": "rbf","gamma": {"case": "float","value": 0.01}},"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "0ae7d42d-f765-3348-a28c-57d94880aa6a","name": "sklearn.svm.classes.SVC","python_path": "d3m.primitives.classification.svc.SKlearn","version": "2019.6.7"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.7.produce","type": "CONTAINER"}},"hyperparams": {"encoder": {"data": 5,"type": "PRIMITIVE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "39ae30f7-39ed-40af-8679-5cf108499605","name": "Label decoder for UnseenLabelEncoderPrimitive","python_path": "d3m.primitives.data_preprocessing.label_decoder.Common","version": "0.1.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.8.produce","type": "CONTAINER"},"reference": {"data": "steps.0.produce","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "8d38b340-f83f-4877-baaa-162f8e551736","name": "Construct pipeline predictions output","python_path": "d3m.primitives.data_transformation.construct_predictions.Common","version": "0.3.0"},"type": "PRIMITIVE"}]}, + {"id": "e1a156e9-0e34-4def-b960-5ad5f3a910a1","inputs": [{"name": "inputs"}],"outputs": [{"data": "steps.9.produce","name": "output"}],"schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", "created": 
"2020-01-21T20:00:00.000000Z","steps": [{"arguments": {"inputs": {"data": "inputs.0","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "4b42ce1e-9b98-4a25-b68e-fad13311eb65","name": "Extract a DataFrame from a Dataset","python_path": "d3m.primitives.data_transformation.dataset_to_dataframe.Common","version": "0.3.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.0.produce","type": "CONTAINER"}},"hyperparams": {"columns": {"data": [2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77],"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "81d7e261-e25b-4721-b091-a31cd46e99ae","name": "Extracts columns","python_path": "d3m.primitives.data_transformation.extract_columns.Common","version": "0.1.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.1.produce","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "d510cb7a-1782-4f51-b44c-58f0236e47c7","name": "Parses strings into their types","python_path": "d3m.primitives.data_transformation.column_parser.Common","version": "0.5.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.2.produce","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "196152a7-a873-4676-bbde-95627f4b5306","name": "Preprocessing for categorical columns","python_path": "d3m.primitives.column_parser.preprocess_categorical_columns.Cornell","version": "v0.1.1"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.0.produce","type": "CONTAINER"}},"hyperparams": {"semantic_types": {"data": ["https://metadata.datadrivendiscovery.org/types/TrueTarget"],"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1","name": "Extracts columns by semantic type","python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common","version": "0.3.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.4.produce","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "26fc8fd3-f6b2-4c65-8afb-edb54ed2a3e4","name": "Label encoder with an unseen category","python_path": "d3m.primitives.data_preprocessing.label_encoder.Common","version": "0.2.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.3.produce","type": "CONTAINER"},"outputs": {"data": "steps.3.produce","type": "CONTAINER"}},"hyperparams": {"alpha": {"data": 0.01,"type": "VALUE"},"beta": {"data": 0.001,"type": "VALUE"},"d": {"data": 100,"type": "VALUE"},"maxiter": {"data": 1000,"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "e6ee30fa-af68-4bfe-9234-5ca7e7ac8e93","name": "Matrix Completion via Sparse Factorization","python_path": "d3m.primitives.collaborative_filtering.high_rank_imputer.Cornell","version": "v0.1.1"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.6.produce","type": "CONTAINER"},"outputs": {"data": "steps.5.produce","type": "CONTAINER"}},"hyperparams": {"C": {"data": 100,"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "0ae7d42d-f765-3348-a28c-57d94880aa6a","name": "sklearn.svm.classes.SVC","python_path": "d3m.primitives.classification.svc.SKlearn","version": "2019.6.7"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.7.produce","type": "CONTAINER"}},"hyperparams": {"encoder": {"data": 5,"type": "PRIMITIVE"}},"outputs": [{"id": 
"produce"}],"primitive": {"id": "39ae30f7-39ed-40af-8679-5cf108499605","name": "Label decoder for UnseenLabelEncoderPrimitive","python_path": "d3m.primitives.data_preprocessing.label_decoder.Common","version": "0.1.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.8.produce","type": "CONTAINER"},"reference": {"data": "steps.0.produce","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "8d38b340-f83f-4877-baaa-162f8e551736","name": "Construct pipeline predictions output","python_path": "d3m.primitives.data_transformation.construct_predictions.Common","version": "0.3.0"},"type": "PRIMITIVE"}]}, + {"id": "0f6cafc4-5628-47bc-bbf5-8cab3a7c0e95","schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", "created": "2020-01-21T20:00:00.000000Z","inputs": [{"name": "inputs"}],"outputs": [{"data": "steps.3.produce","name": "Predictions"}],"steps": [{"type": "PRIMITIVE","primitive": {"id": "cb192a83-63e2-4075-bab9-e6ba1a8365b6","version": "0.1.0","python_path": "d3m.primitives.data_transformation.load_graphs.JHU","name": "Extract a list of Graphs from a Dataset"},"arguments": {"inputs": {"type": "CONTAINER","data": "inputs.0"}},"outputs": [{"id": "produce"}]},{"type": "PRIMITIVE","primitive": {"id": "32fec24f-6861-4a4c-88f3-d4ec2bc1b486","version": "0.1.0","python_path": "d3m.primitives.data_preprocessing.largest_connected_component.JHU","name": "jhu.lcc"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.0.produce"}},"outputs": [{"id": "produce"}]},{"type": "PRIMITIVE","primitive": {"id": "8fa6178b-84f7-37d8-87e8-4d3a44c86569","version": "0.1.0","python_path": "d3m.primitives.data_transformation.laplacian_spectral_embedding.JHU","name": "jhu.lse"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.1.produce"}},"outputs": [{"id": "produce"}],"hyperparams": {"max_dimension": {"type": "VALUE","data": 5},"use_attributes": {"type": "VALUE","data": true}}},{"type": "PRIMITIVE","primitive": {"id": "5194ef94-3683-319a-9d8d-5c3fdd09de24","version": "0.1.0","python_path": "d3m.primitives.graph_clustering.gaussian_clustering.JHU","name": "jhu.gclust"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.2.produce"}},"outputs": [{"id": "produce"}],"hyperparams": {"max_clusters": {"type": "VALUE","data": 10}}}]}, + {"id": "ffc49730-eb73-423c-ab6c-acb47300fcfc","schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", "created": "2020-01-21T20:00:00.000000Z","inputs": [{"name": "inputs"}],"outputs": [{"data": "steps.3.produce","name": "Predictions"}],"steps": [{"type": "PRIMITIVE","primitive": {"id": "cb192a83-63e2-4075-bab9-e6ba1a8365b6","version": "0.1.0","python_path": "d3m.primitives.data_transformation.load_graphs.JHU","name": "Extract a list of Graphs from a Dataset"},"arguments": {"inputs": {"type": "CONTAINER","data": "inputs.0"}},"outputs": [{"id": "produce"}]},{"type": "PRIMITIVE","primitive": {"id": "32fec24f-6861-4a4c-88f3-d4ec2bc1b486","version": "0.1.0","python_path": "d3m.primitives.data_preprocessing.largest_connected_component.JHU","name": "jhu.lcc"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.0.produce"}},"outputs": [{"id": "produce"}]},{"type": "PRIMITIVE","primitive": {"id": "8fa6178b-84f7-37d8-87e8-4d3a44c86569","version": "0.1.0","python_path": "d3m.primitives.data_transformation.laplacian_spectral_embedding.JHU","name": "jhu.lse"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.1.produce"}},"outputs": [{"id": "produce"}],"hyperparams": {"max_dimension": 
{"type": "VALUE","data": 5},"use_attributes": {"type": "VALUE","data": true}}},{"type": "PRIMITIVE","primitive": {"id": "c9d5da5d-0520-468e-92df-bd3a85bb4fac","version": "0.1.0","python_path": "d3m.primitives.classification.gaussian_classification.JHU","name": "jhu.gclass"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.2.produce"}},"outputs": [{"id": "produce"}]}]}, + {"id": "4a2fb696-bf29-410d-934d-c4b17b273938","schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", "created": "2020-01-21T20:00:00.000000Z","inputs": [{"name": "inputs"}],"outputs": [{"data": "steps.1.produce","name": "Results"}],"steps": [{"type": "PRIMITIVE","primitive": {"id": "a22f9bd3-818e-44e9-84a3-9592c5a85408","version": "1.7.8","python_path": "d3m.primitives.data_transformation.vertex_classification_parser.VertexClassificationParser","name": "Vertex Classification Parser"},"arguments": {"inputs": {"type": "CONTAINER","data": "inputs.0"}},"outputs": [{"id": "produce"}]},{"type": "PRIMITIVE","primitive": {"id": "dca25a46-7a5f-48d9-ac9b-d14d4d671b0b","version": "1.7.8","python_path": "d3m.primitives.classification.vertex_nomination.VertexClassification","name": "Vertex Classification"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.0.produce"}},"outputs": [{"id": "produce"}]}]}, + {"id": "2e216966-bd3b-4b53-9933-7ce9a88de6d1","inputs": [{"name": "inputs"}],"outputs": [{"data": "steps.9.produce","name": "output"}],"schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", "created": "2020-01-21T20:00:00.000000Z","steps": [{"arguments": {"inputs": {"data": "inputs.0","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "4b42ce1e-9b98-4a25-b68e-fad13311eb65","name": "Extract a DataFrame from a Dataset","python_path": "d3m.primitives.data_transformation.dataset_to_dataframe.Common","version": "0.3.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.0.produce","type": "CONTAINER"}},"hyperparams": {"columns": {"data": [2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100],"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "81d7e261-e25b-4721-b091-a31cd46e99ae","name": "Extracts columns","python_path": "d3m.primitives.data_transformation.extract_columns.Common","version": "0.1.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.1.produce","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "196152a7-a873-4676-bbde-95627f4b5306","name": "Preprocessing for categorical columns","python_path": "d3m.primitives.column_parser.preprocess_categorical_columns.Cornell","version": "v0.1.1"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.2.produce","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "d639947e-ece0-3a39-a666-e974acf4521d","name": "sklearn.preprocessing.data.StandardScaler","python_path": "d3m.primitives.data_preprocessing.standard_scaler.SKlearn","version": "2019.6.7"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.3.produce","type": "CONTAINER"}},"hyperparams": {"alpha": {"data": 0.1,"type": "VALUE"},"d": {"data": 15,"type": "VALUE"},"epsilon": {"data": 0.1,"type": "VALUE"},"maxiter": {"data": 5000,"type": "VALUE"},"t": {"data": 0.001,"type": "VALUE"}},"outputs": [{"id": 
"produce"}],"primitive": {"id": "7c357e6e-7124-4f2a-8371-8021c8c95cc9","name": "Huber PCA","python_path": "d3m.primitives.feature_extraction.huber_pca.Cornell","version": "v0.1.1"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.0.produce","type": "CONTAINER"}},"hyperparams": {"semantic_types": {"data": ["https://metadata.datadrivendiscovery.org/types/TrueTarget"],"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1","name": "Extracts columns by semantic type","python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common","version": "0.3.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.5.produce","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "26fc8fd3-f6b2-4c65-8afb-edb54ed2a3e4","name": "Label encoder with an unseen category","python_path": "d3m.primitives.data_preprocessing.label_encoder.Common","version": "0.2.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.4.produce","type": "CONTAINER"},"outputs": {"data": "steps.6.produce","type": "CONTAINER"}},"hyperparams": {"C": {"data": 1000,"type": "VALUE"},"kernel": {"data": {"choice": "rbf","gamma": {"case": "float","value": 0.01}},"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "0ae7d42d-f765-3348-a28c-57d94880aa6a","name": "sklearn.svm.classes.SVC","python_path": "d3m.primitives.classification.svc.SKlearn","version": "2019.6.7"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.7.produce","type": "CONTAINER"}},"hyperparams": {"encoder": {"data": 6,"type": "PRIMITIVE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "39ae30f7-39ed-40af-8679-5cf108499605","name": "Label decoder for UnseenLabelEncoderPrimitive","python_path": "d3m.primitives.data_preprocessing.label_decoder.Common","version": "0.1.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.8.produce","type": "CONTAINER"},"reference": {"data": "steps.0.produce","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "8d38b340-f83f-4877-baaa-162f8e551736","name": "Construct pipeline predictions output","python_path": "d3m.primitives.data_transformation.construct_predictions.Common","version": "0.3.0"},"type": "PRIMITIVE"}]}, + {"id": "4f678918-1de5-4db4-8c1c-d7dd0e3b2bec","inputs": [{"name": "inputs"}],"outputs": [{"data": "steps.11.produce","name": "output"}],"schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", "created": "2020-01-21T20:00:00.000000Z","steps": [{"arguments": {"inputs": {"data": "inputs.0","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "4b42ce1e-9b98-4a25-b68e-fad13311eb65","name": "Extract a DataFrame from a Dataset","python_path": "d3m.primitives.data_transformation.dataset_to_dataframe.Common","version": "0.3.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.0.produce","type": "CONTAINER"}},"hyperparams": {"columns": {"data": [2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77],"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "81d7e261-e25b-4721-b091-a31cd46e99ae","name": "Extracts columns","python_path": "d3m.primitives.data_transformation.extract_columns.Common","version": "0.1.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.1.produce","type": "CONTAINER"}},"outputs": 
[{"id": "produce"}],"primitive": {"id": "d510cb7a-1782-4f51-b44c-58f0236e47c7","name": "Parses strings into their types","python_path": "d3m.primitives.data_transformation.column_parser.Common","version": "0.5.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.2.produce","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "196152a7-a873-4676-bbde-95627f4b5306","name": "Preprocessing for categorical columns","python_path": "d3m.primitives.column_parser.preprocess_categorical_columns.Cornell","version": "v0.1.1"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.3.produce","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "d639947e-ece0-3a39-a666-e974acf4521d","name": "sklearn.preprocessing.data.StandardScaler","python_path": "d3m.primitives.data_preprocessing.standard_scaler.SKlearn","version": "2019.6.7"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.4.produce","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "d016df89-de62-3c53-87ed-c06bb6a23cde","name": "sklearn.impute.SimpleImputer","python_path": "d3m.primitives.data_cleaning.imputer.SKlearn","version": "2019.6.7"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.5.produce","type": "CONTAINER"}},"hyperparams": {"alpha": {"data": 0.1,"type": "VALUE"},"d": {"data": 20,"type": "VALUE"},"epsilon": {"data": 1,"type": "VALUE"},"t": {"data": 0.001,"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "7c357e6e-7124-4f2a-8371-8021c8c95cc9","name": "Huber PCA","python_path": "d3m.primitives.feature_extraction.huber_pca.Cornell","version": "v0.1.1"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.0.produce","type": "CONTAINER"}},"hyperparams": {"semantic_types": {"data": ["https://metadata.datadrivendiscovery.org/types/TrueTarget"],"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1","name": "Extracts columns by semantic type","python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common","version": "0.3.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.7.produce","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "26fc8fd3-f6b2-4c65-8afb-edb54ed2a3e4","name": "Label encoder with an unseen category","python_path": "d3m.primitives.data_preprocessing.label_encoder.Common","version": "0.2.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.6.produce","type": "CONTAINER"},"outputs": {"data": "steps.8.produce","type": "CONTAINER"}},"hyperparams": {"C": {"data": 1000,"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "0ae7d42d-f765-3348-a28c-57d94880aa6a","name": "sklearn.svm.classes.SVC","python_path": "d3m.primitives.classification.svc.SKlearn","version": "2019.6.7"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.9.produce","type": "CONTAINER"}},"hyperparams": {"encoder": {"data": 8,"type": "PRIMITIVE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "39ae30f7-39ed-40af-8679-5cf108499605","name": "Label decoder for UnseenLabelEncoderPrimitive","python_path": "d3m.primitives.data_preprocessing.label_decoder.Common","version": "0.1.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.10.produce","type": "CONTAINER"},"reference": {"data": "steps.0.produce","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "8d38b340-f83f-4877-baaa-162f8e551736","name": "Construct pipeline predictions 
output","python_path": "d3m.primitives.data_transformation.construct_predictions.Common","version": "0.3.0"},"type": "PRIMITIVE"}]}, + {"id": "94db5247-7827-468a-81b6-6b709af86d5c","inputs": [{"name": "inputs"}],"outputs": [{"data": "steps.9.produce","name": "output"}],"schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", "created": "2020-01-21T20:00:00.000000Z","steps": [{"arguments": {"inputs": {"data": "inputs.0","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "4b42ce1e-9b98-4a25-b68e-fad13311eb65","name": "Extract a DataFrame from a Dataset","python_path": "d3m.primitives.data_transformation.dataset_to_dataframe.Common","version": "0.3.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.0.produce","type": "CONTAINER"}},"hyperparams": {"columns": {"data": [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89],"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "81d7e261-e25b-4721-b091-a31cd46e99ae","name": "Extracts columns","python_path": "d3m.primitives.data_transformation.extract_columns.Common","version": "0.1.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.1.produce","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "196152a7-a873-4676-bbde-95627f4b5306","name": "Preprocessing for categorical columns","python_path": "d3m.primitives.column_parser.preprocess_categorical_columns.Cornell","version": "v0.1.1"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.2.produce","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "d639947e-ece0-3a39-a666-e974acf4521d","name": "sklearn.preprocessing.data.StandardScaler","python_path": "d3m.primitives.data_preprocessing.standard_scaler.SKlearn","version": "2019.6.7"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.3.produce","type": "CONTAINER"}},"hyperparams": {"alpha": {"data": 0.1,"type": "VALUE"},"d": {"data": 50,"type": "VALUE"},"epsilon": {"data": 0.1,"type": "VALUE"},"maxiter": {"data": 2000,"type": "VALUE"},"t": {"data": 0.001,"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "7c357e6e-7124-4f2a-8371-8021c8c95cc9","name": "Huber PCA","python_path": "d3m.primitives.feature_extraction.huber_pca.Cornell","version": "v0.1.1"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.0.produce","type": "CONTAINER"}},"hyperparams": {"semantic_types": {"data": ["https://metadata.datadrivendiscovery.org/types/TrueTarget"],"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1","name": "Extracts columns by semantic type","python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common","version": "0.3.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.5.produce","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "26fc8fd3-f6b2-4c65-8afb-edb54ed2a3e4","name": "Label encoder with an unseen category","python_path": "d3m.primitives.data_preprocessing.label_encoder.Common","version": "0.2.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.4.produce","type": "CONTAINER"},"outputs": {"data": "steps.6.produce","type": "CONTAINER"}},"hyperparams": {"C": {"data": 1000,"type": "VALUE"},"kernel": {"data": {"choice": "rbf","gamma": {"case": 
"float","value": 0.01}},"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "0ae7d42d-f765-3348-a28c-57d94880aa6a","name": "sklearn.svm.classes.SVC","python_path": "d3m.primitives.classification.svc.SKlearn","version": "2019.6.7"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.7.produce","type": "CONTAINER"}},"hyperparams": {"encoder": {"data": 6,"type": "PRIMITIVE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "39ae30f7-39ed-40af-8679-5cf108499605","name": "Label decoder for UnseenLabelEncoderPrimitive","python_path": "d3m.primitives.data_preprocessing.label_decoder.Common","version": "0.1.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.8.produce","type": "CONTAINER"},"reference": {"data": "steps.0.produce","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "8d38b340-f83f-4877-baaa-162f8e551736","name": "Construct pipeline predictions output","python_path": "d3m.primitives.data_transformation.construct_predictions.Common","version": "0.3.0"},"type": "PRIMITIVE"}]}, + {"id": "7cb3e0eb-2f3e-4756-9c4e-1cc2852c84b9","inputs": [{"name": "inputs"}],"outputs": [{"data": "steps.9.produce","name": "output"}],"schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", "created": "2020-01-21T20:00:00.000000Z","steps": [{"arguments": {"inputs": {"data": "inputs.0","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "4b42ce1e-9b98-4a25-b68e-fad13311eb65","name": "Extract a DataFrame from a Dataset","python_path": "d3m.primitives.data_transformation.dataset_to_dataframe.Common","version": "0.3.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.0.produce","type": "CONTAINER"}},"hyperparams": {"columns": {"data": [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63],"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "81d7e261-e25b-4721-b091-a31cd46e99ae","name": "Extracts columns","python_path": "d3m.primitives.data_transformation.extract_columns.Common","version": "0.1.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.1.produce","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "196152a7-a873-4676-bbde-95627f4b5306","name": "Preprocessing for categorical columns","python_path": "d3m.primitives.column_parser.preprocess_categorical_columns.Cornell","version": "v0.1.1"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.2.produce","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "d639947e-ece0-3a39-a666-e974acf4521d","name": "sklearn.preprocessing.data.StandardScaler","python_path": "d3m.primitives.data_preprocessing.standard_scaler.SKlearn","version": "2019.6.7"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.3.produce","type": "CONTAINER"}},"hyperparams": {"alpha": {"data": 0.1,"type": "VALUE"},"d": {"data": 25,"type": "VALUE"},"epsilon": {"data": 0.01,"type": "VALUE"},"maxiter": {"data": 5000,"type": "VALUE"},"t": {"data": 0.0005,"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "7c357e6e-7124-4f2a-8371-8021c8c95cc9","name": "Huber PCA","python_path": "d3m.primitives.feature_extraction.huber_pca.Cornell","version": "v0.1.1"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.0.produce","type": "CONTAINER"}},"hyperparams": {"semantic_types": {"data": 
["https://metadata.datadrivendiscovery.org/types/TrueTarget"],"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1","name": "Extracts columns by semantic type","python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common","version": "0.3.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.5.produce","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "26fc8fd3-f6b2-4c65-8afb-edb54ed2a3e4","name": "Label encoder with an unseen category","python_path": "d3m.primitives.data_preprocessing.label_encoder.Common","version": "0.2.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.4.produce","type": "CONTAINER"},"outputs": {"data": "steps.6.produce","type": "CONTAINER"}},"hyperparams": {"C": {"data": 5000,"type": "VALUE"},"kernel": {"data": {"choice": "rbf","gamma": {"case": "float","value": 0.1}},"type": "VALUE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "0ae7d42d-f765-3348-a28c-57d94880aa6a","name": "sklearn.svm.classes.SVC","python_path": "d3m.primitives.classification.svc.SKlearn","version": "2019.6.7"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.7.produce","type": "CONTAINER"}},"hyperparams": {"encoder": {"data": 6,"type": "PRIMITIVE"}},"outputs": [{"id": "produce"}],"primitive": {"id": "39ae30f7-39ed-40af-8679-5cf108499605","name": "Label decoder for UnseenLabelEncoderPrimitive","python_path": "d3m.primitives.data_preprocessing.label_decoder.Common","version": "0.1.0"},"type": "PRIMITIVE"},{"arguments": {"inputs": {"data": "steps.8.produce","type": "CONTAINER"},"reference": {"data": "steps.0.produce","type": "CONTAINER"}},"outputs": [{"id": "produce"}],"primitive": {"id": "8d38b340-f83f-4877-baaa-162f8e551736","name": "Construct pipeline predictions output","python_path": "d3m.primitives.data_transformation.construct_predictions.Common","version": "0.3.0"},"type": "PRIMITIVE"}]}, + {"id":"c50643d6-9f82-44fb-ae6e-e40ee96b6899","schema":"https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", "created": "2020-01-21T20:00:00.000000Z","inputs":[{"name":"input dataset"}],"outputs":[{"data":"steps.5.produce","name":"predictions of input dataset"}],"steps":[{"type":"PRIMITIVE","primitive":{"id":"f31f8c1f-d1c5-43e5-a4b2-2ae4a761ef2e","version":"0.2.0","python_path":"d3m.primitives.data_transformation.denormalize.Common","name":"Denormalize datasets"},"arguments":{"inputs":{"type":"CONTAINER","data":"inputs.0"}},"outputs":[{"id":"produce"}],"hyperparams":{"starting_resource":{"type":"VALUE","data":null},"recursive":{"type":"VALUE","data":true},"many_to_many":{"type":"VALUE","data":false},"discard_not_joined_tabular_resources":{"type":"VALUE","data":false}}},{"type":"PRIMITIVE","primitive":{"id":"4b42ce1e-9b98-4a25-b68e-fad13311eb65","version":"0.3.0","python_path":"d3m.primitives.data_transformation.dataset_to_dataframe.Common","name":"Extract a DataFrame from a Dataset"},"arguments":{"inputs":{"type":"CONTAINER","data":"steps.0.produce"}},"outputs":[{"id":"produce"}]},{"type":"PRIMITIVE","primitive":{"id":"4503a4c6-42f7-45a1-a1d4-ed69699cf5e1","version":"0.3.0","python_path":"d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common","name":"Extracts columns by semantic 
type"},"arguments":{"inputs":{"type":"CONTAINER","data":"steps.1.produce"}},"outputs":[{"id":"produce"}],"hyperparams":{"semantic_types":{"type":"VALUE","data":["https://metadata.datadrivendiscovery.org/types/TrueTarget"]}}},{"type":"PRIMITIVE","primitive":{"id":"7d61e488-b5bb-4c79-bad6-f1dc07292bf4","version":"1.0.0","python_path":"d3m.primitives.feature_construction.sdne.DSBOX","name":"SDNE"},"arguments":{"inputs":{"type":"CONTAINER","data":"steps.0.produce"}},"outputs":[{"id":"produce"}],"hyperparams":{"beta":{"type":"VALUE","data":4},"alpha":{"type":"VALUE","data":0.00001},"dimension":{"type":"VALUE","data":128},"epochs":{"type":"VALUE","data":200},"lr":{"type":"VALUE","data":0.0005}}},{"type": "PRIMITIVE","primitive": {"id":"7ddf2fd8-2f7f-4e53-96a7-0d9f5aeecf93","version":"1.5.3","python_path":"d3m.primitives.data_transformation.to_numeric.DSBOX","name":"ISI DSBox To Numeric DataFrame"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.3.produce"}},"outputs":[{"id":"produce"}]},{"type":"PRIMITIVE","primitive":{"id":"1dd82833-5692-39cb-84fb-2455683075f3","version":"2019.6.7","python_path":"d3m.primitives.classification.random_forest.SKlearn","name":"sklearn.ensemble.forest.RandomForestClassifier"},"arguments":{"inputs":{"type":"CONTAINER","data":"steps.4.produce"},"outputs":{"type":"CONTAINER","data":"steps.2.produce"}},"outputs":[{"id":"produce"}],"hyperparams":{"max_depth":{"type":"VALUE","data":{"case":"int","value":30}},"min_samples_leaf":{"type":"VALUE","data":{"case":"absolute","value":2}},"min_samples_split":{"type":"VALUE","data":{"case":"absolute","value":2}},"max_features":{"type":"VALUE","data":{"case":"calculated","value":"sqrt"}},"n_estimators":{"type":"VALUE","data":100},"add_index_columns":{"type":"VALUE","data":true},"use_semantic_types":{"type":"VALUE","data":false},"error_on_no_input":{"type":"VALUE","data":true}}}]}, + {"id":"fc1eee7f-6435-4001-9cf6-6d24330d9b1c","schema":"https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", "created": "2020-01-21T20:00:00.000000Z","inputs":[{"name":"input dataset"}],"outputs":[{"data":"steps.4.produce","name":"predictions of input dataset"}],"steps":[{"type":"PRIMITIVE","primitive":{"id":"f31f8c1f-d1c5-43e5-a4b2-2ae4a761ef2e","version":"0.2.0","python_path":"d3m.primitives.data_transformation.denormalize.Common","name":"Denormalize datasets"},"arguments":{"inputs":{"type":"CONTAINER","data":"inputs.0"}},"outputs":[{"id":"produce"}],"hyperparams":{"starting_resource":{"type":"VALUE","data":null},"recursive":{"type":"VALUE","data":true},"many_to_many":{"type":"VALUE","data":false},"discard_not_joined_tabular_resources":{"type":"VALUE","data":false}}},{"type":"PRIMITIVE","primitive":{"id":"4b42ce1e-9b98-4a25-b68e-fad13311eb65","version":"0.3.0","python_path":"d3m.primitives.data_transformation.dataset_to_dataframe.Common","name":"Extract a DataFrame from a Dataset"},"arguments":{"inputs":{"type":"CONTAINER","data":"steps.0.produce"}},"outputs":[{"id":"produce"}]},{"type":"PRIMITIVE","primitive":{"id":"4503a4c6-42f7-45a1-a1d4-ed69699cf5e1","version":"0.3.0","python_path":"d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common","name":"Extracts columns by semantic 
type"},"arguments":{"inputs":{"type":"CONTAINER","data":"steps.1.produce"}},"outputs":[{"id":"produce"}],"hyperparams":{"semantic_types":{"type":"VALUE","data":["https://metadata.datadrivendiscovery.org/types/TrueTarget"]}}},{"type":"PRIMITIVE","primitive":{"id":"48572851-b86b-4fda-961d-f3f466adb58e","version":"1.0.0","python_path":"d3m.primitives.feature_construction.gcn_mixhop.DSBOX","name":"GCN"},"arguments":{"inputs":{"type":"CONTAINER","data":"steps.0.produce"},"outputs":{"type":"CONTAINER","data":"steps.2.produce"}},"outputs":[{"id":"produce"}],"hyperparams":{"epochs":{"type":"VALUE","data":200},"adjacency_order":{"type":"VALUE","data":3}}},{"type":"PRIMITIVE","primitive":{"id":"1dd82833-5692-39cb-84fb-2455683075f3","version":"2019.6.7","python_path":"d3m.primitives.classification.random_forest.SKlearn","name":"sklearn.ensemble.forest.RandomForestClassifier"},"arguments":{"inputs":{"type":"CONTAINER","data":"steps.3.produce"},"outputs":{"type":"CONTAINER","data":"steps.2.produce"}},"outputs":[{"id":"produce"}],"hyperparams":{"max_depth":{"type":"VALUE","data":{"case":"int","value":30}},"min_samples_leaf":{"type":"VALUE","data":{"case":"absolute","value":2}},"min_samples_split":{"type":"VALUE","data":{"case":"absolute","value":2}},"max_features":{"type":"VALUE","data":{"case":"calculated","value":"sqrt"}},"n_estimators":{"type":"VALUE","data":100},"add_index_columns":{"type":"VALUE","data":true},"use_semantic_types":{"type":"VALUE","data":false},"error_on_no_input":{"type":"VALUE","data":true}}}]}, + {"id": "ddc6c7e9-64b4-4f9c-af07-5f27461cb940","schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", "created": "2020-01-21T20:00:00.000000Z","inputs": [{"name": "inputs"}],"outputs": [{"data": "steps.3.produce","name": "Predictions"}],"steps": [{"type": "PRIMITIVE","primitive": {"id": "cb192a83-63e2-4075-bab9-e6ba1a8365b6","version": "0.1.0","python_path": "d3m.primitives.data_transformation.load_graphs.JHU","name": "Extract a list of Graphs from a Dataset"},"arguments": {"inputs": {"type": "CONTAINER","data": "inputs.0"}},"outputs": [{"id": "produce"}]},{"type": "PRIMITIVE","primitive": {"id": "32fec24f-6861-4a4c-88f3-d4ec2bc1b486","version": "0.1.0","python_path": "d3m.primitives.data_preprocessing.largest_connected_component.JHU","name": "jhu.lcc"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.0.produce"}},"outputs": [{"id": "produce"}]},{"type": "PRIMITIVE","primitive": {"id": "b940ccbd-9e9b-3166-af50-210bfd79251b","version": "0.1.0","python_path": "d3m.primitives.data_transformation.adjacency_spectral_embedding.JHU","name": "jhu.ase"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.1.produce"}},"outputs": [{"id": "produce"}],"hyperparams": {"max_dimension": {"type": "VALUE","data": 5},"use_attributes": {"type": "VALUE","data": true}}},{"type": "PRIMITIVE","primitive": {"id": "c9d5da5d-0520-468e-92df-bd3a85bb4fac","version": "0.1.0","python_path": "d3m.primitives.classification.gaussian_classification.JHU","name": "jhu.gclass"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.2.produce"}},"outputs": [{"id": "produce"}]}]}, + {"id": "12a4b6a8-b2e4-4604-afe5-8196bf55a925","schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", "created": "2020-01-21T20:00:00.000000Z","inputs": [{"name": "inputs"}],"outputs": [{"data": "steps.3.produce","name": "Predictions"}],"steps": [{"type": "PRIMITIVE","primitive": {"id": "cb192a83-63e2-4075-bab9-e6ba1a8365b6","version": "0.1.0","python_path": 
"d3m.primitives.data_transformation.load_graphs.JHU","name": "Extract a list of Graphs from a Dataset"},"arguments": {"inputs": {"type": "CONTAINER","data": "inputs.0"}},"outputs": [{"id": "produce"}]},{"type": "PRIMITIVE","primitive": {"id": "32fec24f-6861-4a4c-88f3-d4ec2bc1b486","version": "0.1.0","python_path": "d3m.primitives.data_preprocessing.largest_connected_component.JHU","name": "jhu.lcc"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.0.produce"}},"outputs": [{"id": "produce"}]},{"type": "PRIMITIVE","primitive": {"id": "b940ccbd-9e9b-3166-af50-210bfd79251b","version": "0.1.0","python_path": "d3m.primitives.data_transformation.adjacency_spectral_embedding.JHU","name": "jhu.ase"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.1.produce"}},"outputs": [{"id": "produce"}],"hyperparams": {"max_dimension": {"type": "VALUE","data": 5},"use_attributes": {"type": "VALUE","data": true}}},{"type": "PRIMITIVE","primitive": {"id": "5194ef94-3683-319a-9d8d-5c3fdd09de24","version": "0.1.0","python_path": "d3m.primitives.graph_clustering.gaussian_clustering.JHU","name": "jhu.gclust"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.2.produce"}},"outputs": [{"id": "produce"}],"hyperparams": {"max_clusters": {"type": "VALUE","data": 10}}}]}, + {"id": "6216f2bd-2f23-4dbf-92d0-f3b40aeac150","schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", "created": "2020-01-21T20:00:00.000000Z","inputs": [{"name": "inputs"}],"outputs": [{"data": "steps.2.produce","name": "Predictions"}],"steps": [{"type": "PRIMITIVE","primitive": {"id": "09f2eea8-667c-44b8-a955-6a153ba9ccc3","version": "0.1.0","python_path": "d3m.primitives.link_prediction.data_conversion.JHU","name": "jhu.link_pred_graph_reader"},"arguments": {"inputs": {"type": "CONTAINER","data": "inputs.0"}},"outputs": [{"id": "produce"}]},{"type": "PRIMITIVE","primitive": {"id": "b940ccbd-9e9b-3166-af50-210bfd79251b","version": "0.1.0","python_path": "d3m.primitives.data_transformation.adjacency_spectral_embedding.JHU","name": "jhu.ase"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.0.produce"}},"outputs": [{"id": "produce"}],"hyperparams": {"which_elbow": {"type": "VALUE","data": 1},"max_dimension": {"type": "VALUE","data": 2},"use_attributes": {"type": "VALUE","data": false}}},{"type": "PRIMITIVE","primitive": {"id": "25e97696-b96f-4f5c-8620-b340fe83414d","version": "0.1.0","python_path": "d3m.primitives.link_prediction.rank_classification.JHU","name": "jhu.link_pred_rc"},"arguments": {"inputs": {"type": "CONTAINER","data": "steps.1.produce"}},"outputs": [{"id": "produce"}]}]} + ], + "FORECASTING": [] +} diff --git a/axolotl/axolotl/utils/resources/scoring_pipeline.yml b/axolotl/axolotl/utils/resources/scoring_pipeline.yml new file mode 100644 index 0000000..e95ecd5 --- /dev/null +++ b/axolotl/axolotl/utils/resources/scoring_pipeline.yml @@ -0,0 +1,31 @@ +id: f596cd77-25f8-4d4c-a350-bb30ab1e58f6 +schema: https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json +source: + name: Mitar +created: "2020-04-18T11:42:44.138742Z" +name: Scoring pipeline +description: |- + A general scoring pipeline. +inputs: + - name: predictions + - name: score dataset +outputs: + - name: scores + data: steps.0.produce +steps: + # Step 0. 
+ - type: PRIMITIVE + primitive: + id: 799802fb-2e11-4ab7-9c5e-dda09eb52a70 + version: 0.5.0 + python_path: d3m.primitives.evaluation.compute_scores.Core + name: Compute scores given the metrics to use + arguments: + inputs: + type: CONTAINER + data: inputs.0 + score_dataset: + type: CONTAINER + data: inputs.1 + outputs: + - id: produce diff --git a/axolotl/axolotl/utils/resources/splitting_pipelines.json b/axolotl/axolotl/utils/resources/splitting_pipelines.json new file mode 100644 index 0000000..d7b0639 --- /dev/null +++ b/axolotl/axolotl/utils/resources/splitting_pipelines.json @@ -0,0 +1,7 @@ +{ + "HOLDOUT_FIXED": {"id": "9c18472e-fff7-4129-93f6-1ab996e82adb", "schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", "created": "2018-10-27T01:30:10.245934Z", "inputs": [{"name": "folds"}, {"name": "full dataset"}], "outputs": [{"data": "steps.0.produce", "name": "train datasets"}, {"data": "steps.2.produce", "name": "test datasets"}, {"data": "steps.1.produce", "name": "score datasets"}], "steps": [{"type": "PRIMITIVE", "primitive": {"id": "1654f000-2178-4520-be4c-a95bc26b8d3a", "version": "0.1.0", "python_path": "d3m.primitives.evaluation.fixed_split_dataset_split.Commmon", "name": "Fixed split tabular dataset splits", "digest": "4ebb8d32da071e84370aa978f0b455a592fb2cc88181d669bcf8081ecd98fa00"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "inputs.0"}, "dataset": {"type": "CONTAINER", "data": "inputs.1"}}, "outputs": [{"id": "produce"}, {"id": "produce_score_data"}]}, {"type": "PRIMITIVE", "primitive": {"id": "744c4090-e2f6-489e-8efc-8b1e051bfad6", "version": "0.2.0", "python_path": "d3m.primitives.evaluation.redact_columns.Common", "name": "Redact columns for evaluation", "digest": "e59c835f0ec9e720525b11e8f1409fd3733b41802d75905851c6a35b43168310"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.0.produce_score_data"}}, "outputs": [{"id": "produce"}], "hyperparams": {"semantic_types": {"type": "VALUE", "data": ["https://metadata.datadrivendiscovery.org/types/PrivilegedData"]}, "add_semantic_types": {"type": "VALUE", "data": ["https://metadata.datadrivendiscovery.org/types/RedactedPrivilegedData", "https://metadata.datadrivendiscovery.org/types/MissingData"]}}}, {"type": "PRIMITIVE", "primitive": {"id": "744c4090-e2f6-489e-8efc-8b1e051bfad6", "version": "0.2.0", "python_path": "d3m.primitives.evaluation.redact_columns.Common", "name": "Redact columns for evaluation", "digest": "e59c835f0ec9e720525b11e8f1409fd3733b41802d75905851c6a35b43168310"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.1.produce"}}, "outputs": [{"id": "produce"}], "hyperparams": {"semantic_types": {"type": "VALUE", "data": ["https://metadata.datadrivendiscovery.org/types/TrueTarget"]}, "add_semantic_types": {"type": "VALUE", "data": ["https://metadata.datadrivendiscovery.org/types/RedactedTarget", "https://metadata.datadrivendiscovery.org/types/MissingData"]}}}], "source": {"name": "Mitar"}, "name": "Fixed split of tabular datasets", "description": "A pipeline which splits a tabular dataset in a way that uses for the test\n(score) split a fixed list of primary index values or row indices of the main\nresource to be used.\n", "digest": "28193e7483794e5bd164c352e02e90090d9cda17abfe542b2393a4ecb58c0bb8"}, + "K_FOLD": {"id": "c8ed65df-aa68-4ee0-bbb5-c5f76a40bcf8", "schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", "created": "2018-07-27T19:39:00.676949Z", "inputs": [{"name": "folds"}, {"name": "full dataset"}], "outputs": [{"data": 
"steps.0.produce", "name": "train datasets"}, {"data": "steps.2.produce", "name": "test datasets"}, {"data": "steps.1.produce", "name": "score datasets"}], "steps": [{"type": "PRIMITIVE", "primitive": {"id": "bfedaf3a-6dd0-4a83-ad83-3a50fe882bf8", "version": "0.1.0", "python_path": "d3m.primitives.evaluation.kfold_dataset_split.Common", "name": "K-fold cross-validation tabular dataset splits", "digest": "8fc8fd388ed30e8e13c0c04880b0dd81051cd15ae7416a962d79b8187be65fbc"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "inputs.0"}, "dataset": {"type": "CONTAINER", "data": "inputs.1"}}, "outputs": [{"id": "produce"}, {"id": "produce_score_data"}]}, {"type": "PRIMITIVE", "primitive": {"id": "744c4090-e2f6-489e-8efc-8b1e051bfad6", "version": "0.2.0", "python_path": "d3m.primitives.evaluation.redact_columns.Common", "name": "Redact columns for evaluation", "digest": "e59c835f0ec9e720525b11e8f1409fd3733b41802d75905851c6a35b43168310"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.0.produce_score_data"}}, "outputs": [{"id": "produce"}], "hyperparams": {"semantic_types": {"type": "VALUE", "data": ["https://metadata.datadrivendiscovery.org/types/PrivilegedData"]}, "add_semantic_types": {"type": "VALUE", "data": ["https://metadata.datadrivendiscovery.org/types/RedactedPrivilegedData", "https://metadata.datadrivendiscovery.org/types/MissingData"]}}}, {"type": "PRIMITIVE", "primitive": {"id": "744c4090-e2f6-489e-8efc-8b1e051bfad6", "version": "0.2.0", "python_path": "d3m.primitives.evaluation.redact_columns.Common", "name": "Redact columns for evaluation", "digest": "e59c835f0ec9e720525b11e8f1409fd3733b41802d75905851c6a35b43168310"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.1.produce"}}, "outputs": [{"id": "produce"}], "hyperparams": {"semantic_types": {"type": "VALUE", "data": ["https://metadata.datadrivendiscovery.org/types/TrueTarget"]}, "add_semantic_types": {"type": "VALUE", "data": ["https://metadata.datadrivendiscovery.org/types/RedactedTarget", "https://metadata.datadrivendiscovery.org/types/MissingData"]}}}], "source": {"name": "Mitar"}, "name": "K-fold split of tabular datasets", "description": "K-fold split of tabular datasets for cross-validation.\n", "digest": "c1546da06d12b4f435973bc335a54ca7486ba51a7067c65e58e397236cecad73"}, + "k-fold-timeseries-split": {"id": "5bed1f23-ac17-4b52-9d06-a5b77a6aea51", "schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", "created": "2019-04-08T16:18:27.250294Z", "inputs": [{"name": "folds"}, {"name": "full dataset"}], "outputs": [{"data": "steps.0.produce", "name": "train datasets"}, {"data": "steps.2.produce", "name": "test datasets"}, {"data": "steps.1.produce", "name": "score datasets"}], "steps": [{"type": "PRIMITIVE", "primitive": {"id": "002f9ad1-46e3-40f4-89ed-eeffbb3a102b", "version": "0.3.0", "python_path": "d3m.primitives.evaluation.kfold_time_series_split.Common", "name": "K-fold cross-validation timeseries dataset splits", "digest": "e06a27b03f9cea879c21e012b031f84c2a7b37193987134481db1117f05e9657"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "inputs.0"}, "dataset": {"type": "CONTAINER", "data": "inputs.1"}}, "outputs": [{"id": "produce"}, {"id": "produce_score_data"}]}, {"type": "PRIMITIVE", "primitive": {"id": "744c4090-e2f6-489e-8efc-8b1e051bfad6", "version": "0.2.0", "python_path": "d3m.primitives.evaluation.redact_columns.Common", "name": "Redact columns for evaluation", "digest": "e59c835f0ec9e720525b11e8f1409fd3733b41802d75905851c6a35b43168310"}, "arguments": 
{"inputs": {"type": "CONTAINER", "data": "steps.0.produce_score_data"}}, "outputs": [{"id": "produce"}], "hyperparams": {"semantic_types": {"type": "VALUE", "data": ["https://metadata.datadrivendiscovery.org/types/PrivilegedData"]}, "add_semantic_types": {"type": "VALUE", "data": ["https://metadata.datadrivendiscovery.org/types/RedactedPrivilegedData", "https://metadata.datadrivendiscovery.org/types/MissingData"]}}}, {"type": "PRIMITIVE", "primitive": {"id": "744c4090-e2f6-489e-8efc-8b1e051bfad6", "version": "0.2.0", "python_path": "d3m.primitives.evaluation.redact_columns.Common", "name": "Redact columns for evaluation", "digest": "e59c835f0ec9e720525b11e8f1409fd3733b41802d75905851c6a35b43168310"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.1.produce"}}, "outputs": [{"id": "produce"}], "hyperparams": {"semantic_types": {"type": "VALUE", "data": ["https://metadata.datadrivendiscovery.org/types/TrueTarget"]}, "add_semantic_types": {"type": "VALUE", "data": ["https://metadata.datadrivendiscovery.org/types/RedactedTarget", "https://metadata.datadrivendiscovery.org/types/MissingData"]}}}], "source": {"name": "Jeffrey Gleason"}, "name": "K-fold split of timeseries datasets", "description": "K-fold split of timeseries datasets for cross-validation.\n", "digest": "33aea0b6bd864a383020eb9d1f64fda193e20bb8690ee516809004d805f9614a"}, + "TRAINING_DATA": {"id": "79ce71bd-db96-494b-a455-14f2e2ac5040", "schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", "created": "2018-10-26T00:48:08.341897Z", "inputs": [{"name": "folds"}, {"name": "full dataset"}], "outputs": [{"data": "steps.0.produce", "name": "train datasets"}, {"data": "steps.2.produce", "name": "test datasets"}, {"data": "steps.1.produce", "name": "score datasets"}], "steps": [{"type": "PRIMITIVE", "primitive": {"id": "48c683ad-da9e-48cf-b3a0-7394dba5e5d2", "version": "0.1.0", "python_path": "d3m.primitives.evaluation.no_split_dataset_split.Common", "name": "No-split tabular dataset splits", "digest": "869d62e577148338d1c732347d6d0bf2119ae9af6b90037fda5044ab0eef01dc"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "inputs.0"}, "dataset": {"type": "CONTAINER", "data": "inputs.1"}}, "outputs": [{"id": "produce"}, {"id": "produce_score_data"}]}, {"type": "PRIMITIVE", "primitive": {"id": "744c4090-e2f6-489e-8efc-8b1e051bfad6", "version": "0.2.0", "python_path": "d3m.primitives.evaluation.redact_columns.Common", "name": "Redact columns for evaluation", "digest": "e59c835f0ec9e720525b11e8f1409fd3733b41802d75905851c6a35b43168310"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.0.produce_score_data"}}, "outputs": [{"id": "produce"}], "hyperparams": {"semantic_types": {"type": "VALUE", "data": ["https://metadata.datadrivendiscovery.org/types/PrivilegedData"]}, "add_semantic_types": {"type": "VALUE", "data": ["https://metadata.datadrivendiscovery.org/types/RedactedPrivilegedData", "https://metadata.datadrivendiscovery.org/types/MissingData"]}}}, {"type": "PRIMITIVE", "primitive": {"id": "744c4090-e2f6-489e-8efc-8b1e051bfad6", "version": "0.2.0", "python_path": "d3m.primitives.evaluation.redact_columns.Common", "name": "Redact columns for evaluation", "digest": "e59c835f0ec9e720525b11e8f1409fd3733b41802d75905851c6a35b43168310"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.1.produce"}}, "outputs": [{"id": "produce"}], "hyperparams": {"semantic_types": {"type": "VALUE", "data": ["https://metadata.datadrivendiscovery.org/types/TrueTarget"]}, "add_semantic_types": {"type": 
"VALUE", "data": ["https://metadata.datadrivendiscovery.org/types/RedactedTarget", "https://metadata.datadrivendiscovery.org/types/MissingData"]}}}], "source": {"name": "Mitar"}, "name": "No split of tabular datasets", "description": "A pipeline which splits a tabular dataset in a way that for all splits it\nproduces the same (full) dataset. It still redacts the test split.\nUseful for unsupervised learning tasks.\n", "digest": "690373622142f12dc078657246b8f2f6c070ebd32720321d786a3f0c653d55cc"}, + "HOLDOUT": {"id": "3c11d171-e2ad-4d26-a034-04f3b062306c", "schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", "created": "2018-07-28T01:24:39.642266Z", "inputs": [{"name": "folds"}, {"name": "full dataset"}], "outputs": [{"data": "steps.0.produce", "name": "train datasets"}, {"data": "steps.2.produce", "name": "test datasets"}, {"data": "steps.1.produce", "name": "score datasets"}], "steps": [{"type": "PRIMITIVE", "primitive": {"id": "3fcc6dc4-6681-4c86-948e-066d14e7d803", "version": "0.1.0", "python_path": "d3m.primitives.evaluation.train_score_dataset_split.Common", "name": "Train-score tabular dataset splits", "digest": "f65655f435f9e703e00f174dae743f93fee5c10aa2016d2398f4d53bee8d5bae"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "inputs.0"}, "dataset": {"type": "CONTAINER", "data": "inputs.1"}}, "outputs": [{"id": "produce"}, {"id": "produce_score_data"}]}, {"type": "PRIMITIVE", "primitive": {"id": "744c4090-e2f6-489e-8efc-8b1e051bfad6", "version": "0.2.0", "python_path": "d3m.primitives.evaluation.redact_columns.Common", "name": "Redact columns for evaluation", "digest": "e59c835f0ec9e720525b11e8f1409fd3733b41802d75905851c6a35b43168310"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.0.produce_score_data"}}, "outputs": [{"id": "produce"}], "hyperparams": {"semantic_types": {"type": "VALUE", "data": ["https://metadata.datadrivendiscovery.org/types/PrivilegedData"]}, "add_semantic_types": {"type": "VALUE", "data": ["https://metadata.datadrivendiscovery.org/types/RedactedPrivilegedData", "https://metadata.datadrivendiscovery.org/types/MissingData"]}}}, {"type": "PRIMITIVE", "primitive": {"id": "744c4090-e2f6-489e-8efc-8b1e051bfad6", "version": "0.2.0", "python_path": "d3m.primitives.evaluation.redact_columns.Common", "name": "Redact columns for evaluation", "digest": "e59c835f0ec9e720525b11e8f1409fd3733b41802d75905851c6a35b43168310"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.1.produce"}}, "outputs": [{"id": "produce"}], "hyperparams": {"semantic_types": {"type": "VALUE", "data": ["https://metadata.datadrivendiscovery.org/types/TrueTarget"]}, "add_semantic_types": {"type": "VALUE", "data": ["https://metadata.datadrivendiscovery.org/types/RedactedTarget", "https://metadata.datadrivendiscovery.org/types/MissingData"]}}}], "source": {"name": "Mitar"}, "name": "Train-test split of tabular datasets", "description": "Train-test split of tabular datasets.\n", "digest": "675ee3e96e9b1bfba41694b6289a889ef6fc96e5477b89c8267871b941e4d78e"} +} \ No newline at end of file diff --git a/axolotl/axolotl/utils/schemas.py b/axolotl/axolotl/utils/schemas.py new file mode 100644 index 0000000..b70187f --- /dev/null +++ b/axolotl/axolotl/utils/schemas.py @@ -0,0 +1,472 @@ +import os +import copy +import json +import typing +import logging +import math +import random +import binascii + +from d3m import container +from d3m.metadata.problem import TaskKeyword, PerformanceMetric +from d3m.metadata.pipeline import Pipeline +from d3m import utils as 
d3m_utils + +from axolotl.utils import pipeline as pipeline_utils + +logger = logging.getLogger(__name__) + + +# ContainerType = typing.Union[container.Dataset, container.DataFrame, container.ndarray, container.List] +ContainerType = container.Dataset + +resource_dir = os.path.dirname(__file__) +SPLITTING_PIPELINES_DIR = os.path.join(resource_dir, 'resources', 'splitting_pipelines.json') +SCORING_PIPELINES_DIR = os.path.join(resource_dir, 'resources', 'scoring_pipeline.yml') +PIPELINES_DB_DIR = os.path.join(resource_dir, 'resources', 'default_pipelines.json') + +TASK_TYPE = { + TaskKeyword.CLASSIFICATION, TaskKeyword.REGRESSION, + TaskKeyword.CLUSTERING, TaskKeyword.LINK_PREDICTION, + TaskKeyword.VERTEX_NOMINATION, TaskKeyword.COMMUNITY_DETECTION, + TaskKeyword.GRAPH_MATCHING, TaskKeyword.COLLABORATIVE_FILTERING, + TaskKeyword.OBJECT_DETECTION, TaskKeyword.VERTEX_CLASSIFICATION, + TaskKeyword.FORECASTING +} + +TASK_SUBTYPES = { + TaskKeyword.MULTIVARIATE, + TaskKeyword.BINARY, + TaskKeyword.NONOVERLAPPING, + TaskKeyword.OVERLAPPING, + TaskKeyword.UNIVARIATE, + TaskKeyword.MULTICLASS, + TaskKeyword.MULTILABEL, +} + +DATA_TYPES = { + TaskKeyword.TIME_SERIES, + TaskKeyword.AUDIO, + TaskKeyword.TABULAR, + TaskKeyword.TEXT, + TaskKeyword.VIDEO, + TaskKeyword.GRAPH, + TaskKeyword.IMAGE, + TaskKeyword.GEOSPATIAL, + TaskKeyword.RELATIONAL, + TaskKeyword.GROUPED, + TaskKeyword.LUPI +} + +CLASSIFICATION_METRICS = [ + {'metric': PerformanceMetric.ACCURACY, 'params': {}}, + {'metric': PerformanceMetric.PRECISION, 'params': {}}, + {'metric': PerformanceMetric.RECALL, 'params': {}}, + {'metric': PerformanceMetric.F1, 'params': {}}, + {'metric': PerformanceMetric.F1_MICRO, 'params': {}}, + {'metric': PerformanceMetric.F1_MACRO, 'params': {}}, + {'metric': PerformanceMetric.ROC_AUC, 'params': {}}, +] + +BINARY_CLASSIFICATION_METRICS = [ + {'metric': PerformanceMetric.ACCURACY, 'params': {}}, +] + +MULTICLASS_CLASSIFICATION_METRICS = [ + {'metric': PerformanceMetric.ACCURACY, 'params': {}}, + {'metric': PerformanceMetric.F1_MICRO, 'params': {}}, + {'metric': PerformanceMetric.F1_MACRO, 'params': {}}, +] + +MULTILABEL_CLASSIFICATION_METRICS = [ + {'metric': PerformanceMetric.ACCURACY, 'params': {}}, +] + +REGRESSION_METRICS = [ + {'metric': PerformanceMetric.MEAN_ABSOLUTE_ERROR, 'params': {}}, + {'metric': PerformanceMetric.MEAN_SQUARED_ERROR, 'params': {}}, + {'metric': PerformanceMetric.ROOT_MEAN_SQUARED_ERROR, 'params': {}}, + {'metric': PerformanceMetric.R_SQUARED, 'params': {}}, +] + +CLUSTERING_METRICS = [ + {'metric': PerformanceMetric.NORMALIZED_MUTUAL_INFORMATION, 'params': {}}, +] + +LINK_PREDICTION_METRICS = [ + {'metric': PerformanceMetric.ACCURACY, 'params': {}}, +] + +VERTEX_NOMINATION_METRICS = [ + {'metric': PerformanceMetric.ACCURACY, 'params': {}}, +] + +COMMUNITY_DETECTION_METRICS = [ + {'metric': PerformanceMetric.NORMALIZED_MUTUAL_INFORMATION, 'params': {}}, +] + +GRAPH_CLUSTERING_METRICS = [] + +GRAPH_MATCHING_METRICS = [ + {'metric': PerformanceMetric.ACCURACY, 'params': {}} +] + +TIME_SERIES_FORECASTING_METRICS = REGRESSION_METRICS + +COLLABORATIVE_FILTERING_METRICS = REGRESSION_METRICS + +OBJECT_DETECTION_METRICS = [ + {'metric': PerformanceMetric.OBJECT_DETECTION_AVERAGE_PRECISION, 'params': {}}, +] + +MULTICLASS_VERTEX_METRICS = MULTICLASS_CLASSIFICATION_METRICS + +SEMI_SUPERVISED_MULTICLASS_CLASSIFICATION_METRICS = MULTICLASS_CLASSIFICATION_METRICS + +SEMI_SUPERVISED_REGRESSION_METRICS = REGRESSION_METRICS + +DATA_PREPARATION_PARAMS = { + 'k_fold_tabular': { + 'method': 'K_FOLD', 
+ 'number_of_folds': '3', + 'stratified': 'false', + 'shuffle': 'true', + 'randomSeed': '42', + }, + + 'holdout': { + 'method': 'HOLDOUT', + 'train_score_ratio': '0.2', + 'shuffle': 'true', + 'stratified': 'true', + 'randomSeed': '42', + }, + + 'no_stratified_holdout': { + 'method': 'HOLDOUT', + 'train_score_ratio': '0.2', + 'shuffle': 'true', + 'stratified': 'false', + 'randomSeed': '42', + }, + + 'no_split': { + 'method': 'TRAINING_DATA', + 'number_of_folds': '1', + 'stratified': 'true', + 'shuffle': 'true', + 'randomSeed': '42', + }, +} + +PROBLEM_DEFINITION = { + 'binary_classification': { + 'performance_metrics': BINARY_CLASSIFICATION_METRICS, + 'task_keywords': [TaskKeyword.CLASSIFICATION, TaskKeyword.BINARY] + }, + 'regression': { + 'performance_metrics': REGRESSION_METRICS, + 'task_keywords': [TaskKeyword.UNIVARIATE, TaskKeyword.REGRESSION] + } + +} + + +def get_task_description(keywords) -> dict: + """ + A function that parse the keywords from the problem and map them to + TaskType, SubTasktype and data type eg. tabular, images, audio, etc + + Parameters + ---------- + keywords: List[d3m.problem.TaskKeyword] + List of keywords that comes from d3m problem description + + Returns + ------- + dict + { + task_type: str + task_subtype: str + data_types: list + semi: bool + } + """ + + task_type = None + task_subtype = None + data_types = [] + semi = False + for keyword in keywords: + if keyword in TASK_TYPE: + task_type = keyword.name + elif keyword in TASK_SUBTYPES: + task_subtype = keyword.name + elif keyword in DATA_TYPES: + data_types.append(keyword.name) + elif keyword.name == TaskKeyword.SEMISUPERVISED: + semi = True + + # if data_types is empty we assume is tabular: + if not data_types: + data_types.append(TaskKeyword.TABULAR) + + return {'task_type': task_type, 'task_subtype': task_subtype, 'data_types': data_types, 'semi': semi} + + +def get_metrics_from_task(task_des, perf_metrics=None): + """ + Provides a dictionary of metrics ready to use for perfromance_metrics + + Parameters + ---------- + task_des: dict + A dictionary describe the task + perf_metrics: dict + A dictionary specifying the needed performance metric parameters + + Returns + ------- + performance_metrics: dict + A dict containing performance metrics. 
+ """ + # For the case thet the user only want to run a full pipeline + task_type = task_des['task_type'] + task_subtype = task_des['task_subtype'] + data_types = task_des['data_types'] + if not task_des: + return None + if TaskKeyword.CLASSIFICATION == task_type or \ + TaskKeyword.VERTEX_CLASSIFICATION == task_type: + if task_des['semi']: + # TODO: Temporary solution to binary semi supervised classification + metrics = SEMI_SUPERVISED_MULTICLASS_CLASSIFICATION_METRICS + elif TaskKeyword.BINARY == task_subtype: + metrics = BINARY_CLASSIFICATION_METRICS + elif TaskKeyword.MULTICLASS == task_subtype: + metrics = MULTICLASS_CLASSIFICATION_METRICS + elif TaskKeyword.MULTILABEL == task_subtype: + metrics = MULTILABEL_CLASSIFICATION_METRICS + else: + metrics = CLASSIFICATION_METRICS + elif TaskKeyword.REGRESSION == task_type: + metrics = REGRESSION_METRICS + elif TaskKeyword.CLUSTERING == task_type: + metrics = CLUSTERING_METRICS + elif TaskKeyword.LINK_PREDICTION == task_type: + metrics = LINK_PREDICTION_METRICS + elif TaskKeyword.VERTEX_NOMINATION == task_type: + metrics = VERTEX_NOMINATION_METRICS + elif TaskKeyword.COMMUNITY_DETECTION == task_type: + metrics = COMMUNITY_DETECTION_METRICS + elif TaskKeyword.GRAPH_MATCHING == task_type: + metrics = GRAPH_MATCHING_METRICS + elif TaskKeyword.TIME_SERIES in data_types and TaskKeyword.FORECASTING: + metrics = TIME_SERIES_FORECASTING_METRICS + elif TaskKeyword.COLLABORATIVE_FILTERING == task_type: + metrics = COLLABORATIVE_FILTERING_METRICS + elif TaskKeyword.OBJECT_DETECTION == task_type: + metrics = OBJECT_DETECTION_METRICS + else: + raise ValueError('Task keywords not supported, keywords: {}'.format(task_des)) + + for i, metric in enumerate(metrics): + for perf_metric in perf_metrics: + if perf_metric['metric'] == metric['metric'] and 'params' in perf_metric: + copy_metric = copy.deepcopy(metric) + copy_metric['params']['pos_label'] = perf_metric['params']['pos_label'] + metrics[i] = copy_metric + logger.info('get_metrics_from_task:metrics: {}'.format(metrics)) + return metrics + + +def get_eval_configuration(task_type: str, data_types: typing.Sequence, semi: bool) -> typing.Dict: + """ + Determines which method of evaluation to use, cross_fold, holdout, etc. + + Parameters + ---------- + task_type: str + task type + data_types: list + data types + semi: bool + is it semi-supervised problem + + Returns + ------- + eval_configuration: dict + A dict that contains the evaluation method to use. + """ + + # for the case of no problem return None. + if not task_type: + return {} + + if semi: + # Splitting semi may get empty ground truth, which can cause error in sklearn metric. + return DATA_PREPARATION_PARAMS['no_split'] + + if TaskKeyword.CLASSIFICATION == task_type: + # These data types tend to take up a lot of time to run, so no k_fold. 
+ if TaskKeyword.AUDIO in data_types or TaskKeyword.VIDEO in data_types \ + or TaskKeyword.IMAGE in data_types: + return DATA_PREPARATION_PARAMS['holdout'] + else: + return DATA_PREPARATION_PARAMS['k_fold_tabular'] + elif TaskKeyword.REGRESSION in data_types: + return DATA_PREPARATION_PARAMS['no_stratified_holdout'] + else: + return DATA_PREPARATION_PARAMS['no_split'] + + +def get_splitting_pipeline(splitting_name: str) -> Pipeline: + with open(SPLITTING_PIPELINES_DIR) as file: + splitting_pipelines = json.load(file) + + if splitting_name in splitting_pipelines: + return pipeline_utils.load_pipeline(splitting_pipelines[splitting_name]) + else: + raise ValueError("{} not supported".format(splitting_name)) + + +def get_scoring_pipeline() -> Pipeline: + with open(SCORING_PIPELINES_DIR, 'r') as pipeline_file: + with d3m_utils.silence(): + pipeline = Pipeline.from_yaml(pipeline_file) + return pipeline + + +def get_pipelines_db(): + with open(PIPELINES_DB_DIR) as file: + pipelines_dict = json.load(file) + return pipelines_dict + + +def get_task_mapping(task: str) -> str: + """ + Map the task in problem_doc to the task types that are currently supported + + Parameters + ---------- + task: str + The task type in problem_doc + + Returns + ------- + str + One of task types that are supported + + """ + mapping = { + 'LINK_PREDICTION': 'CLASSIFICATION', + TaskKeyword.VERTEX_CLASSIFICATION: 'CLASSIFICATION', + 'COMMUNITY_DETECTION': 'CLASSIFICATION', + 'GRAPH_MATCHING': 'CLASSIFICATION', + TaskKeyword.FORECASTING: 'REGRESSION', + 'OBJECT_DETECTION': 'CLASSIFICATION', + 'VERTEX_CLASSIFICATION': 'CLASSIFICATION', + } + if task in mapping: + return mapping[task] + else: + return task + + + +def hex_to_binary(hex_identifier): + return binascii.unhexlify(hex_identifier) + + +def binary_to_hex(identifier): + hex_identifier = binascii.hexlify(identifier) + return hex_identifier.decode() + + +def summarize_performance_metrics(performance_metrics): + """ + A function that averages all the folds if they exist. + + Parameters + ---------- + performance_metrics: dict + A dictionary containing the fold, metrics targets and values from evaluation. + """ + sumarized_performance_metrics = {} + + for metric in performance_metrics.metric.unique(): + mean = performance_metrics[performance_metrics.metric == metric]['value'].mean() + std = performance_metrics[performance_metrics.metric == metric]['value'].std() + if math.isnan(std): + std = 0 + sumarized_performance_metrics[metric] = { + 'mean': mean, + 'std': std, + } + return sumarized_performance_metrics + + +def compute_score(sumarized_performance_metrics): + """ + A function that computes the internal score based on the average normalized metrics. + + Parameters + ---------- + sumarized_performance_metrics: dict + A dictionary containing the summarized version. + """ + score = 0 + + for metric, info in sumarized_performance_metrics.items(): + score += PerformanceMetric[metric].normalize(info['mean']) + + score = score / float(len(sumarized_performance_metrics)) + return score + + +def compute_rank(sumarized_performance_metrics): + """ + A function that computes the rank based on the average normalized metrics. + + Parameters + ---------- + sumarized_performance_metrics: dict + A dictionary containing the summarized version. 
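As an aside, the helpers in `schemas.py` (task-description parsing, metric selection, evaluation configuration, and the splitting/scoring pipeline loaders) are meant to be chained together. The sketch below is illustrative only and is not part of the diff: it assumes the `axolotl` and `d3m` packages installed by this commit are importable, and the keyword list is a made-up example.
```python
from d3m.metadata.problem import TaskKeyword

from axolotl.utils import schemas as schemas_utils

# Keywords as they would appear in a problem description (example values).
keywords = [TaskKeyword.CLASSIFICATION, TaskKeyword.BINARY, TaskKeyword.TABULAR]

task_des = schemas_utils.get_task_description(keywords)
# e.g. {'task_type': 'CLASSIFICATION', 'task_subtype': 'BINARY',
#       'data_types': ['TABULAR'], 'semi': False}

# Pass an empty list when the problem supplies no extra metric parameters.
metrics = schemas_utils.get_metrics_from_task(task_des, perf_metrics=[])

eval_config = schemas_utils.get_eval_configuration(
    task_des['task_type'], task_des['data_types'], task_des['semi'])
# For plain tabular classification this is intended to resolve to the
# 'k_fold_tabular' parameters, whose 'method' key is 'K_FOLD'.

splitting_pipeline = schemas_utils.get_splitting_pipeline(eval_config['method'])
scoring_pipeline = schemas_utils.get_scoring_pipeline()
```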
+ """ + ranks = {} + mean = 0 + for metric, info in sumarized_performance_metrics.items(): + try: + ranks[metric] = PerformanceMetric[metric].normalize(abs(info['mean'] - info['std'])) + except: + ranks[metric] = 0 + mean += ranks[metric] + + mean = mean / len(sumarized_performance_metrics) + # rank = 1 - ranks[min(ranks.keys(), key=(lambda k: ranks[k]))] + random.randint(10, 30)**-6 + rank = 1 - mean + + # We add some randomness on the rank to avoid duplications + noise = 0 + sign = -1 if random.randint(0, 1) == 0 else 1 + range_0 = -9 + range_1 = -5 + if rank < 1e-5: + range_0 = -12 + range_1 = -9 + + for i in range(range_0, range_1): + noise += random.randint(0, 9) * 10 ** i + rank = rank + noise * sign + if rank < 0: + rank *= -1 + return rank + + +def random_rank(): + ranks = 0 + average_number = 5 + for i in range(average_number): + ranks += random.uniform(0, 1) + ranks = ranks/average_number + return ranks diff --git a/axolotl/examples/build_search_algorithm.ipynb b/axolotl/examples/build_search_algorithm.ipynb new file mode 100644 index 0000000..49f2d28 --- /dev/null +++ b/axolotl/examples/build_search_algorithm.ipynb @@ -0,0 +1,284 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Axolotl Build dummy search method example [Binary Classification]." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this example, we are showcasing different components of the system.\n", + "- Loading syntethic data for a binary classification task.\n", + "- Easy use of the backend.\n", + "- Creation of custom rank function as well as a simple search method.\n", + "- Use of simple interface for search.\n", + "- Exploring searched pipelines." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Import multiple utils we will be using" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import uuid\n", + "import random\n", + "import pandas as pd\n", + "from pprint import pprint\n", + "from sklearn.datasets import make_classification\n", + "\n", + "from d3m import container\n", + "from d3m.metadata.pipeline import Pipeline\n", + "\n", + "from axolotl.utils import data_problem\n", + "from axolotl.backend.ray import RayRunner\n", + "from axolotl.algorithms.base import PipelineSearchBase\n", + "from axolotl.utils import pipeline as pipeline_utils, schemas as schemas_utils" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Generate synthetic data and import it to the system" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "x, y = make_classification(n_samples=100, n_features=20)\n", + "dataset, problem_description = data_problem.generate_dataset_problem(x, y, 'binary_classification')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Make an instance of the runner that is in charge of evaluating and running pipelines." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2020-07-11 19:07:52,353\tINFO resource_spec.py:212 -- Starting Ray with 3.56 GiB memory available for workers and up to 1.79 GiB for objects. 
You can adjust these settings with ray.init(memory=, object_store_memory=).\n", + "2020-07-11 19:07:52,793\tINFO services.py:1170 -- View the Ray dashboard at \u001b[1m\u001b[32mlocalhost:8265\u001b[39m\u001b[22m\n" + ] + } + ], + "source": [ + "backend = RayRunner(random_seed=42, volumes_dir=None, n_workers=4)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create a random rank function." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "def random_rank(pipeline_result):\n", + " if pipeline_result.status == 'COMPLETED':\n", + " pipeline_result.rank = random.uniform(0, 1)\n", + " return pipeline_result" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create a predefined Search algorithm that is loading some predefined pipelines previosuly stored." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "class PredefinedSearch(PipelineSearchBase):\n", + " def __init__(self, problem_description, backend, *, primitives_blocklist=None, ranking_function=None):\n", + " super().__init__(problem_description=problem_description, backend=backend,\n", + " primitives_blocklist=primitives_blocklist, ranking_function=ranking_function)\n", + " if self.ranking_function is None:\n", + " self.ranking_function = random_rank\n", + " self.task_description = schemas_utils.get_task_description(self.problem_description['problem']['task_keywords'])\n", + "\n", + " self.available_pipelines = self._return_pipelines(\n", + " self.task_description['task_type'], self.task_description['task_subtype'], self.task_description['data_types'])\n", + "\n", + " # Selection of a data preparation pipeline, we provide some predefine options such as train_test_split, k_fold, etc\n", + " # as well as the user can provide their own.\n", + " self.data_preparation_pipeline = schemas_utils.get_splitting_pipeline(\"K_FOLD\")\n", + " \n", + " # Get the metrics to evaluate the pipelines based on the problem description.\n", + " self.metrics = self.problem_description['problem']['performance_metrics']\n", + "\n", + " # Pipeline to be use for scoring, we recommend using the one provided.\n", + " self.scoring_pipeline = schemas_utils.get_scoring_pipeline()\n", + " \n", + " # Get the parameters for the datapreparation pipeline, such as number of folds, and so on.\n", + " self.data_preparation_params = schemas_utils.DATA_PREPARATION_PARAMS['k_fold_tabular']\n", + "\n", + " self.offset = 10\n", + " self.current_pipeline_index = 0\n", + "\n", + " def _search(self, time_left):\n", + " # Read all the pipelines to be evaluated\n", + " pipelines_to_eval = self.available_pipelines[self.current_pipeline_index: self.current_pipeline_index+self.offset]\n", + " self.current_pipeline_index += self.offset\n", + " \n", + " # Evaluate the pipelines.\n", + " pipeline_results = self.backend.evaluate_pipelines(\n", + " problem_description=self.problem_description, pipelines=pipelines_to_eval, input_data=self.input_data,\n", + " metrics=self.metrics, data_preparation_pipeline=self.data_preparation_pipeline,\n", + " scoring_pipeline=self.scoring_pipeline, data_preparation_params=self.data_preparation_params)\n", + "\n", + " return [self.ranking_function(pipeline_result) for pipeline_result in pipeline_results]\n", + "\n", + " def _return_pipelines(self, task_type, task_subtype, data_type):\n", + " pipeline_candidates = []\n", + " for pipeline_dict in 
schemas_utils.get_pipelines_db()['CLASSIFICATION']:\n", + " pipeline = pipeline_utils.load_pipeline(pipeline_dict)\n", + " pipeline.id = str(uuid.uuid4())\n", + " pipeline.created = Pipeline().created\n", + " pipeline_candidates.append(pipeline)\n", + "\n", + " return pipeline_candidates\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create an instance of the search and fit with the input_data." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "# The method fit search for the best pipeline based on the time butget and fit the best pipeline based on the rank with the input_data.\n", + "search = PredefinedSearch(problem_description=problem_description, backend=backend)\n", + "fitted_pipeline, pipeline_result = search.search_fit(input_data=[dataset], time_limit=30)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Print information about scores of the succeded pipelines." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "----------------------------------------------------\n", + "Pipeline id: dbca8f6a-ad11-4e32-9bc1-b49c554ff224\n", + "Rank: 0.11134933476057562\n", + " metric value normalized randomSeed fold\n", + "0 ACCURACY 0.588235 0.588235 42 0\n", + "1 ACCURACY 0.878788 0.878788 42 1\n", + "2 ACCURACY 0.818182 0.818182 42 2\n" + ] + } + ], + "source": [ + "for pipeline_result in search.history:\n", + " print('-' * 52)\n", + " print('Pipeline id:', pipeline_result.pipeline.id)\n", + " print('Rank:', pipeline_result.rank)\n", + " print(pipeline_result.scores)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Display succeded pipelines" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": 
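The notebook ranks completed pipelines with `random_rank` above. A deterministic alternative is sketched below; it is illustrative only (not part of the notebook) and assumes that `pipeline_result.scores` is the DataFrame shown in the printed output (with a `normalized` column) and that lower rank values are better, matching `compute_rank` in `schemas.py`.
```python
def mean_score_rank(pipeline_result):
    # Rank by 1 - mean normalized fold score, so better pipelines get lower ranks.
    if pipeline_result.status == 'COMPLETED':
        pipeline_result.rank = 1.0 - pipeline_result.scores['normalized'].mean()
    return pipeline_result

# Hypothetical usage, mirroring the cells above:
# search = PredefinedSearch(problem_description=problem_description,
#                           backend=backend, ranking_function=mean_score_rank)
```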
"iVBORw0KGgoAAAANSUhEUgAABSYAAALkCAYAAAAS1et6AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAAMTQAADE0B0s6tTgAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+j8jraAAAgAElEQVR4nOzdeXxU5d3///eZJZnsmUw2spAAgYAgwYCA4gouKIJLK1B3re1dretDLd/6s9X21tb71rZab++7rcXWoihWAQXUgihSkFUIuwiBBEhIQvZ1MjNnzu8PJIUCmkDIJPB6/pUczjnXZ0KYB+c9n+u6DMuyLAEAAAAAAABAF7KFugAAAAAAAAAAZx6CSQAAAAAAAABdjmASAAAAAAAAQJcjmAQAAAAAAADQ5QgmAQAAAAAAAHQ5gkkAAAAAAAAAXY5gEgAAAAAAAECXI5gEAAAAAAAA0OUIJgEAAAAAAAB0OYJJAAAAAAAAAF2OYBIAAAAAAABAlyOYBAAAAAAAANDlCCYBAAAAAAAAdDmCSQAAAAAAAABdjmASAAAAAAAAQJcjmAQAAAAAAADQ5QgmAQAAAAAAAHQ5gkkAAAAAAAAAXY5gEgAAAAAAAECXI5gEAAAAAAAA0OUIJgEAAAAAAAB0OYJJAAAAAAAAAF2OYBIAAAAAAABAlyOYBAAAAAAAANDlCCYBAAAAAAAAdDmCSQAAAAAAAABdjmASAAAAAAAAQJcjmAQAAAAAAADQ5QgmAQAAAAAAAHQ5gkkAAAAAAAAAXY5gEgAAAAAAAECXI5gEAAAAAAAA0OUIJgEAAAAAAAB0OYJJAAAAAAAAAF2OYBIAAAAAAABAlyOYBAAAAAAAANDlCCYBAAAAAAAAdDmCSQAAAAAAAABdjmASAAAAAAAAQJcjmAQAAAAAAADQ5QgmAQAAAAAAAHQ5gkkAAAAAAAAAXY5gEgAAAAAAAECXI5gEAAAAAAAA0OUIJgEAAAAAAAB0OYJJAAAAAAAAAF2OYBIAAAAAAABAlyOYBAAAAAAAANDlCCYBAAAAAAAAdDmCSQAAAAAAAABdjmASAAAAAAAAQJcjmAQAAAAAAADQ5RyhLgAAAOBUCJhBVTb6VF7vVWlti2pbfAqYlhx2Q/ERYUqLj1BKrEuJ0WFy2PmsFgAAAOhqBJMAAOC0UtHg1RfFNVpeWKVGr182GZIhOe02GYZkWZLfDEqWFJSlaJdTY/p5NDzLreQYV6jLBwAAAM4YhmVZVqiLAAAAOFlVja2at6FUG0pqJRnyRIUpMswuwzCOe41lWWr2mapq8kmylJcer4l5afJEh3dZ3QAAAMCZimASAAD0aMGgpTVF1ZpTUCK/GVRqrOuEpmYHzKDK6r1y2m26fli6zs1OkM12/FATAAAAwMkhmAQAAD1Wa8DUrDV7taaoWkkx4YpxOU/6ng1evw40tOrc7ARNOTdT4Q57J1QKAAAA4N8RTAIAgB6pNWBqxopibSqpU2ZCpOyd2N1oBi3tqW7W0PQ43XpeFuEkAAAAcAqwBSUAAOhxgkFLs9bs1aaSOvXu5FBSkuw2Q1kJkdpYUqe31+5VMMjnuAAAAEBnI5gEAAA9zpqiaq0pqlZmQuQpWwfSZjPUOyFSq3cfHAsAAABA5yKYBAAAPUplY6vmFJQoKSa80zsl/53dZigpJlxzCkpU1dh6SscCAAAAzjQEkwAAIOQKCgr01FNPqaio6FvPnb+hVH4z2Ckb3bRHjMspvxnU/I37u2Q8AAAA4ExBMAkAAHqMigavNpTUKjXW1SXjlezcpqKtBUqNdalgX40qGrxdMi4AAABwJnCEugAAAIChQ4dqyJAhstu/effrL4prJBly2Lvms9WSwm1qaapX9lnDJBlaV1yj8UN6dcnYAAAAwOmOjkkAABByNptNDodDhnH8NSMDZlDLC6vkiQrrwsr+xRMVpmWFVQqYwZCMDwAAAJxuDMuyrFAXAQAAzmwFBQWaO3eu7rjjDmVnZ2vJkiVasmSJ7rvvPq1fv14bN25URU2Dvqgwdd7F45SQktZ2bXXZPq1eOEdDzh+ngK9Ve77cqNaWJkVEx6nv2cOV1ndg27ktjfX6bPZryskbqZy8UUfUsGn5xyop3Kbxt90vSfps9l/V0tjwrxMsS3Utfr32m18oNytVe/bs0WeffaaysjK1trYqMjJSvXr10tixY5WSknJqf2AAAADAaYCp3AAAoNuaM2eOwsLCdMEFF2jL3kotn/0Prf90vi664XY5w8KPOHfPtg3y+1qV0X+wbHa7SnZu08Zli2QFLaXnDOrw2ANHXKiv1q2Qv7VFA8+9UJJUVutVQ8CmyspKzZgxQwkJCRozZoxcLpcaGxu1e/duVVZWEkwCAAAA7UAwCQAAuq3o6GhNnTpVhmGoOnK/+o3wqqxgicp2f6XM3LOPOLe5sV4XXHuzXJHRkqTM/kO0fP5Mfbn2n0rt0192e8f+25PSu5+Kt21QMBho67o0q5t1oNmUWblbfr9ft912m6Kiotquueiii07yFQMAAABnDtaYBAAA3daoUaPa1p2sbfHJk5opSWpuqDvq3F59BrSFkpLkCAtT5oAh8vtaVVNeqtbWVtXX159UPU67TXVev8LDD3Zrbt26VcEga04CAAAAJ4KOSQAA0G3Fx8e3fR0wLYW5XJIkX6v3qHOjYt3HPdZQU6Vde8sUFx1xUvUYhuQ3LQ05Z4g2btyoBQsW6OOPP1ZmZqZycnI0ZMgQRUdHf/uNAAAAANAxCQAAui+b7V//VXHYDZ3Iln1WMKidO3cqOjpamb17f+N533ovS3LaDTkcDt122226++67dd5558k0TS1cuFAvvfSSioqKOl4kAAAAcAaiYxIAAPQI8RFh8pvHDw+b6muOOtZYW62KAwfUN2OgBg4aqGAgIEnyH6Pjsrnx6Onh/85vBhXncrZ9n5GRoYyMDElSVVWV/vjHP2rJkiW64447vvVeAAAAwJmOjkkAANAjpMVHSN/QMbl/91fyNje2fR9obdUXyz6RYbNr5AUXyWazyREWpvCISFWXlxxxbe2BMtVVlh91T7vTqYDP968D1sE6mpubjzrX7XYrPDz8mH8GAAAA4Gh0TAIAgB4hJdaloCwdbz53ZHSsVn34jjIHDJFhs2nDyqVqqK3W2OtvUrjrX2tLZg44Wzs3rNIXn8xTcnq2WpoatG/HFsXEe1RfU3nEPeMTU3VgX5G2rf5MsZ4UHWholdvVV0uXLlVhYaEGDBggt9utYDCobdu2qaGhQaNHjz6lPwcAAADgdEEwCQAAeoTE6DBFu5zym8cOJnsPylPA16o9X25U+f5StfgCuvS676nPoLwjzus7ZLj8Pq/27/5K1fv3KsadpGGXXK2SnduOCiazBuWpqb5W+3d/pZ1bNihgBhV55zjl5uaqoaFBmzdvVlNTk5xOpxITE3XDDTdo6NChp+xnAAAAAJxODMs6kWXkAQAAut6Hm/dr4dZy9U6IbDtWXbZPqxfO0dnnX6b0nEEqLyvTjh07lTcsTzExMZ029p7qZl15VorGD+
nVafcEAAAAzmSsMQkAAHqM4VluSZYCx9kEp7qqSl99tUODhwzu1FDy4HiW8rPcnXZPAAAA4ExHMAkAAHqM5BiX8tLjVVZ/9K7ajU2N2rJ1q3IH5srt7twAsazeq2EZbiXHuDr1vgAAAMCZjGASAAD0KBPz0uS029Tg9bcd8/v92rljp/r26aPk5OROHa/B65fTbtM1Q5nCDQAAAHQmgkkAANCjeKLDdf2wdB1oaJUZtBTlTlLSWefrrPxRSs/I6NSxzKClysZWXT8sXZ7o8E69NwAAAHCmY1duAADQ45ybnaDt5Q1atatS1cXb5Xa71adPn04dIxi0tKe6WedmJ+jc7IROvTcAAAAAOiYBAEAPZLMZumFYL/nLd6rZFql+/ftLhtFp9zeDloqrmzU0PU6TR2TKZuu8ewMAAAA4yLAsywp1EQAAAB0RDAb19ttvq66xSeEDL9EXe+uUFBOuGJfzpO/d4PXrQEOrRvZJ0OQRmQp32DuhYgAAAAD/jqncAACgR7EsSwsWLFBVVZXuuusuhYe7NKioWnMKSlTX4ldqrEsOe8cnhQTMoMrqvXLabbplVJbOzU6gUxIAAAA4heiYBAAAPcqnn36qgoICff/731dsbGzb8arGVs3bUKoNJbWSDHmiwhQZZpfxDVO8LctSs89UVZNPkqVhGW5dM7QXG90AAAAAXYBgEgAA9BirV6/Wp59+qrvuuktJSUnHPKeiwat1xTVaVlilJq9fhgzJkJx2mwxDsizJbwYlS7JkKdrl1AU5iTqnd7ySY1xd/IoAAACAMxfBJAAA6BG2bNmi9957T7feeqsyMzO/9fyAGVRlo0/l9V6V1raozuuX37TktBuKczmVFh+hlFiXEqPDTmjqNwAAAICTQzAJAAC6vd27d2vmzJm68cYbNWDAgFCXAwAAAKAT0B4AAAC6tf379+utt97ShAkTCCUBAACA0wjBJAAA6LZqamr0xhtv6KKLLtKwYcNCXQ4AAACATkQwCQAAuqXGxkbNmDFDQ4YM0fnnnx/qcgAAAAB0MoJJAADQ7bS2tuqNN95Qenq6rrzyShmGEeqSAAAAAHQygkkAANCtBAIBzZo1S5GRkbruuusIJQEAAIDTFMEkAADoNizL0ty5c+X1ejV58mTZ7fZQlwQAAADgFCGYBAAA3YJlWfroo4+0f/9+3XzzzQoPDw91SQAAAABOIUeoCwBwcgJmUJWNPpXXe1Va26LaFp8CpiWH3VB8RJjS4iOUEutSYnSYHHY+iwDQfS1fvlxbtmzR97//fUVFRYW6HAAAAACnGMEk0ENVNHj1RXGNlhdWqdHrl02GZEhOu02GIVmW5DeDkiUFZSna5dSYfh4Nz3IrOcYV6vIB4Ajr16/XsmXLdMcdd8jtdoe6HAAAAABdwLAsywp1EQDar6qxVfM2lGpDSa0kQ56oMEWG2b9xcwjLstTsM1XV5JNkKS89XhPz0uSJZpokgNDbvn273n33Xd10003Kzs4OdTkAAAAAugjBJNBDBIOW1hRVa05BifxmUKmxrhOamh0wgyqr98ppt+n6Yek6NztBNhs73gIIjb1792rGjBm6/vrrNWjQoFCXAwAAAKALEUwCPUBrwNSsNXu1pqhaSTHhinE5T/qeDV6/DjS06tzsBE05N1PhDna+BdC1Kioq9Je//EXjxo3TiBEjQl0OAAAAgC7GGpNAN9caMDVjRbE2ldQpyxMleyd1N8a4nIoMc2htcY18gaBuPS+LcBJAl6mrq9Prr7+uUaNGEUoCAAAAZyi26AW6sWDQ0qw1e7WppE69EyI7LZQ8xG4zlJUQqY0ldXp77V4FgzRQAzj1LMtSRUWFBgwYoIsvvjjU5QAAAAAIEaZyA93Yql1Ven1Vcad2Sh6LGbRUXNWkW0ZlaVRfzykbBwAOMU1TNpvtGzfuAgAAAHB6o2MS6KYqG1s1p6BESTHhpzSUlA52TibFhGtOQYmqGltP6VgAIEl2u51QEgAAADjDEUwC3dT8DaXym8FO2eimPWJcTvnNoOZv3N8l4wEAAAAAgDMbm98A3VBFg1cbSmqVFhfRrvPrqypUumu7qsv2qaWpQZIUFetW74Fnq1ef3HZ3JaXGulSwr0ZXNaQqOcZ1wvUDwL8rLCxUv379Ql0GAAAAgG6EYBLohr4orpFkyGFvX1Pz7i3rVLV/r1KycpQ5YIhM01RZ8Q5tXLZI1WX7NOT8y9p1n4PjGVpXXKPxQ3qd+AsAgMPcfffdam5uVt++fXXTTTdpwIABcjj4LwgAAABwpmMqN9DNBMyglhdWyRMV1u5reg/M08XfvVODR1+qzNyzlX3WMI0a/10lpKRr385taqipave9PFFhWlZYpYAZPJHyAeAI69atU0NDg2bOnKkBAwbok08+0ZIlS0JdFgAAAIBugGAS6GYqG31q9PoVGWZv9zXu5F6y24/sPjIMQym9D06bbKxtfzAZGWZXk9evykZfu68BgOMZMmSIcnNztW3bNk2dOlW9e/fWV199pcrKylCXBgAAACDECCaBbqa83iubjE7Zrdbb3ChJCnO1b61K6WCgachQeb33pMcHAMMw1KdPH61evVr19fW6/PLLVVxcrM8//zzUpQEAAAAIMYJJoJsprW2RTj6TlLe5Uft2bFFEdIzcyWkdu9j4ug4A6CDLso743ul06rLLLlNpaanmzZuniooKDR8+XOvXr1cgEAhRlQAAAAC6A4JJoJupbfHJ2c5Nb47FNE35/T5tWPqRAn6fhpw3TjZ7+6eFS5LTblOd13/CNQA4M1mW1dbtvXnz5rbjmZmZuummm5SYmKj/+q//0h//+EdNnDiRDXAAAACAMxxPBEA3EzAtdXQWt2maqqqq0oGKA6qsOiD/gT0yW+o1ePSl8vTK7HANhiH5TevbTwSAwxwKJf/85z9r7ty5+utf/6rExERJUlZWlrKysnTZZZeppaVFCQkJRwSZAAAAAM48dEwC3YzDbshqRyZomqYOVFRoy5Yt+nz55youKlZkZIQiWmsVaK7TwBEXKqP/4BOqwbIkp52wAED7FBcXt329adMmvfLKK3r77bdVWVmpTz/9VJK0YMECrV27VhEREYSSAAAAACTRMQl0O/ERYfKbwWP+WdA0VVVdrQMVB1RVVaVwV7iSk5KUnZ2tyMhIbVy2UNVle5U7fIyyBuWdcA1+M6g4l/OErwdwZlm+fLlee+01xcXF6ZprrtGFF16oadOmKSIiQsXFxWpsbNTo0aPldrvbriGUBAAAAGBY/75KPYCQ2rC3Vq99XqRMT6Skg2FkdXW1Kg58HUaGhSspOUnJSUmKioqSDEOWZWnT8o9VuutLDTjnPPU9e8RJ1bC3qlm3n5+tvMz4znhJAE5TwWBQNtvByRfnn3++SkpK2roni4qKlJ2drR/+8IfKzc3VI488Ikl0SgIAAABoQzAJdDNldV79+sOtirJaVHmgUpVVlQoLC1NyUpKSkpMV/XUYebgv1/xTRdsKFJeYoqyBQ4+6Z4w7UTHuxHaNb1mW9lY3a9r4QUqNc3XKawJw+vL5fJo/f77Cw8M1a9YsX
XjhhfrBD34gn8+np59+WiUlJZo+fXqoywQAAADQDTGVG+gmAoGACgsLtXHzFu3Y2iCXw6701ESdM2yYoqOjjwojD1dffUCSVFdZro3LFh315zl5I9sdTDb7TEW5nEqMDjuxFwLgtHd412NhYaHWrVunK664Qq+88oqmTJmiQCAgh8OhBx98UB6PR9KR3ZUAAAAAINExCYSUaZoqLCzUli1b9OWXXyoiIkKDBw/WAVea1pT61NsT1eU17alu1pVnpWj8kF5dPjaAnuPAgQNasWKFJk2apFWrVunDDz/Utddeq7S0ND366KO69NJLddddd0kilAQAAABwbHRMAiFgWZaKioo0a9YshYeHa/DgwbrtttuUlpYmwzBU0eDVmv3bFDCDcti77mE+YAYlWcrPcn/ruQDObLW1tZo7d67sdrsmTJigXbt26U9/+pMefPBBTZ8+XWFh/+q6JpQEAAAAcCwEk8ApZpqmbDbbEZs9GIahXr166ZZbblF6evpRG0Ekx7iUlx6vTaV1ynBHdlmtZfVeDctwKzmGtSUBHNsHH3ygHTt2yOPx6IknntBvfvMbpaWlKTIyUjabTQ6Hoy2UZKMbAAAAAN+EYBLoZMFgUIZhtD2M2+32Y57ncrmUkZFx3PtMzEvTl+UNavD6FeNynpJaD9fg9ctpt+maoUzhBnBsCxYs0IwZM3TLLbdo1qxZKi0t1UMPPaQHHnhApmnqxRdfVE5OTtv5hJIAAAAAvglrTAKd4HhdQX6/Xy+99JIKCgo0evRo3XvvvR2676pdVXp9VbGyPFGy207dA74ZtLSnukk3j8zSqL6eUzYOgJ7l8LUhA4GAJk2apJycHP3+97+Xz+fT9773PT377LOKjIyUy+WSx+OhSxIAAABAu7HoE9BOhzL8PXv2SDr4kH7IoYdw0zT15ptv6uGHH9ayZcu0a9cutbS06Je//KXWrVunzz//vENjnpudoHOzE7SnulnB4Kn5DCEYtLSnurltLACQDr7HHQollyxZopqaGs2YMUObN2/WO++8o7CwMHk8HlVXVys9PV0ej6etYxwAAAAA2oNgEmiHQx1AJSUlmjp1qiTJ4fjXSgh//OMf1dDQoI0bN2rRokUaOXKkLrjgAr399tuqq6vTX/7yFy1ZskQ7duyQ3+9v97g2m6Ep52ZqaHqciqubZXZyOGkGLRVXN2toepwmj8iU7RR2ZQLoWQ69xz3zzDN69tln9dhjj+nDDz/UjBkz9OSTT+quu+5SeHi48vPz265hkxsAAAAAHcETBHAMlmXJNM227w91AKWnpysxMVHl5eV6/vnntXLlSknS4sWLVVBQoEWLFum8887T9773PUnS0KFDtXHjRo0dO1bbt2/Xd77zHTmdTnVkBYVwh123npelc7PdKq5qUoO3/cHmN2nw+lVc1aRzs9269bwshTuOvRYmgDPL3LlztWzZMknSb37zG33++ef66KOP9Kc//UnvvvuuNm/erAULFmjv3r26//775XQ6O/SBCwAAAAAcQjAJfC0YDCoYDEo6GEQevmmNZVn69a9/rRtvvFEVFRUqKChQQkKCFi5cKEm69tprtWzZMl111VVtD/SSNHz4cE2cOFGLFy/W9ddfr9tvv11+v7/DUx3DHXbdPDJLt4zKUrPP1L6aZgXM4Am9zoAZ1L6aZjX7TN0yKks3jySUBPAvl19+ucaMGaPly5frjjvuUFlZmRYsWKCwsDC98MIL+vnPf66IiAj96Ec/0p133qna2lo5nad+gy4AAAAApx82v8EZKxgMyrKsY+6aXVlZqZdeekm1tbWaOnWqMjIy9Mtf/lIvvfSS3nrrLS1evFjPPfec7rnnHs2ZM0dlZWWaPHmyli5dqqlTp2rgwIHatGmTvv/97+vqq6/W/PnzNWTIEGVnZ5903VWNrZq3oVQbSmolGfJEhSkyzP6NYadlWWr2mapq8kmyNCzDrdkv/n+acu0Effe731VYWNhJ1wXg9LFv3z49+uij+vGPf6yMjAzdf//9mjZtmi688EJVVFQoOTlZkrRlyxYNHjw4xNUCAAAA6KkIJoGvrVmzRn/729/Uq1cvDR8+XLNnz9bVV1+t2bNn64EHHtDjjz+uf/zjH/J6vbrsssu0bNkyXXvttXrkkUfU0NCg++67T5999pkSExM1c+ZMDRw4UBdccMFR43TWjrUVDV6tK67RssIqNXn9MmRIhuS022QYkmVJfjMoWZIlS9Eupy7ISdQ5veOVHOPS5MmTtWPHDl199dW69dZblZuby6YVANp24t66datefvll3X333bIsSw8++KDmzZunmJgY2e32tiUpeN8AAAAAcKIIJnHaKiws1OzZs9WrVy/deOONCg8PbwsFTdPUypUr9cEHH8jpdOr//b//p9/97nfaunWr/vCHP+jZZ5/V1KlTNXjwYP3iF7/QhAkTNG3aNL3xxhtKTU1Vbm6u3nzzTcXGxupnP/uZcnJyNG7cOI0ZM6bLpzQGzKAqG30qr/eqtLZFdV6//KYlp91QnMuptPgIpcS6lBgdJof9X6s3/PKXv9SiRYvk8XgUExOjSy65RDfddJMiIiK6tH4AoXP4ByWHd0L+85//VGVlpbKysvTcc8/p6aefVlxcnBITE0NZLgAAAIDTjOPbTwF6np07d+pXv/qV3G63wsPDNW3aNL3wwgttD+CLFy/WE088oXvvvVeFhYWaNWuWsrKyFBYWpqioKNntdu3YsUODBw+Wz+dTQ0ODHnjgAT388MOqr6/XVVddJYfDoZycHL355pvHrOFQ19Gp5rDblBrnUmqcS3mZ8e2+Ljk5Wenp6dq/f7/sdru+/PJLNTY2EkwCZ5BD74nvvfeeZs+erd/+9reqrq7Wz3/+cz3zzDPKz8/XD37wA1mWRSgJAAAAoNMRTKLHM01TNpvtiOmEffv21XPPPSePxyNJmjJlipqamhQVFSVJysrK0llnnaXrrrtO9fX1mjNnjlwul9xut8rKyvTd735X7733nt544w21tLRo0qRJuvTSS5Wfn69evXrJ4XB8aw1dEUqejMTERDU1NckwDKWnp+upp55q+/kAOL3V1NTI7XZLkjZt2qTHH39cS5cuVXh4uJqamvTpp59KOthROXbs2LavmbYNAAAAoDPZn3rqqadCXQTQEYc2rTn0gHwoEDz8mGEYioyMlCS98847iouL0wUXXHDEVO7S0lKZpqlzzjlHS5cuVV5enrZv3y5JOv/885Wbm6uYmBg98MADys3NlSTFxcXJZrPJsqwjOiL/PZTsCWpqauT3+/XYY4+prKxMKSkpSkpKCnVZAE4xn8+nhQsXqqioSF9++aXOOusslZWVad26dZo3b55ef/119evXT7179z7ifa2nvccBAAAA6P5YYxLd3rd16cyePVvTp0/XOeeco/vvv18pKSlHXPfMM88oLS1Nd955Z9s1fr9ff/vb3/T+++/L7XbL7/frjTfeUEFBgbKzsxUf3/4p0T3V4T/XDRs26OOPP9aPf/xjuVyuEFcG4FQrKirSlClTVF1drR07dmjv3r0qLCzUJZdcogce
eEDDhw/X7bffHuoyAQAAAJzmuvdcU5yRgsGgJB1zx1ev16uPPvpI9913n2bOnKna2loVFBTojjvu0NNPP62UlJS2bkbDMFRaWqq6ujrdeeedampqkiS99dZbqq+v16BBg5SRkaEpU6botddekyQNGzbsiFDydM7tD/+5Dh06VElJSVq8eHEIKwLQVXbt2qWJEydq7NixWrNmjTIzMzV69Gg98cQTKioqIpQEAAAA0CUIJtEtHAoAP/7447bp1IeCsyVLlrQFZm+++aYee+wxjRkzRmVlZfrd736ns88+W6tXr9aDDz6oRYsWyTCMtinWn3zyiebMmaPvfOc7+tGPfqTW1lYNHDhQbrdbSUlJSkxMVN++feVwONoC0cOdKVMXDcPQNddco4KCAu3duzfU5QDoZId/yLJo0SJt2rRJ1113ne69915Nnz5dy5Yt09y5czV58tMoNrwAACAASURBVGS9//77knTM90QAAAAA6EysMYlu4VAAuGnTJv3f//2fMjIyFBERoWeffVYfffSRDMNQQUGBLrjgAq1cuVLPPvusUlJStHDhQv3kJz/R4MGDdfXVV+upp57SJZdcorffflu9evXSunXrlJeXpzvuuEP33nuvHA6HUlNT28JLu92u5ORkud3uMyaEPJ6IiAgZhqF//vOfys/P7/ab9wBoP8Mw1NLSouLiYg0cOFCFhYUqLi5W//791bdvX/30pz9VamqqbrjhBkk6Yg1dAAAAADhVeOpAlzle901zc7MWLVqkDz74QHv37tX8+fP14Ycfyu/3a9euXXrxxRcVHx+v2bNnKyIiQmlpaaqoqFBCQoLS09O1bt06vf/++7r88suVk5OjpKQkXXLJJfJ4PLrvvvv02GOP6ZxzzjlqXLfbrSuuuEJ9+vQ51S+9xzj//PMlSStWrAhxJQA6w+Hvu//85z/1pz/9SeXl5Zo0aZIqKir04YcfKj8/X3PnztVPfvKTtnMJJQEAAAB0BZ48cEoEg0HV1NQccezwB13LsmRZlurq6jR+/HjNmTNH8fHxuueeezRu3Dg9+OCDSkpK0v79+3XDDTfI7/fr97//vfr06SOXy6VHH31Ut9xyi6KiopSbm6vMzEy9+uqreuaZZyRJffr0kdPplCSZpnlarxXZmex2uyZOnKjPPvtM1dXVoS4HwEmwLEs2m00FBQVasmSJTNPUlVdeqbffflsxMTHq27ev1q1bp7KyMiUmJkpi+jYAAACArsVUbnSqQzs9HzhwQHPnzlV+fr4kqaqqSrNnz9Z7772nc845Ry6XS4ZhaOvWrXI6nZo0aVLbmo87d+5USUmJ8vPzVVVVpby8PI0ePVqLFy+W3W5Xbm6uVqxYoV/84heaOHGiXC6XBgwYoKSkpGPWZLPZzvhp2h0RFxen+vp6bdy4UUOHDuVnB/RQh5ZmePLJJ2Wz2bR06VK1trZq2LBh+ulPf6qVK1dq2rRpGjp06BHXAAAAAEBXoWMSnerQQ63D4dD777+vm2++Wfv27dPzzz+vRYsWacKECYqIiFB5eblee+011dXVady4cXr//ff1zjvv6JlnntHkyZM1a9YszZ07VzfeeKNM09TLL7+srKwsDR06VJmZmXI4HBo6dKjsdnuIX/Hp6bLLLlN5ebk2bdoU6lIAdMChjkfLstTc3KyHH35Yo0eP1uOPP67//M//1ObNmzVo0CA9+uijeumllzR69OgQVwwAAADgTOYIdQFon4AZVGWjT+X1XpXWtqi2xaeAaclhNxQfEaa0+AilxLqUGB0mh/3U5s2maR6zC9E0TVVUVGjdunXav3+/7Ha7AoGAYmJilJ6eruTkZFVUVGjnzp0qLCzUihUrNGLECA0cOFBZWVl65513VFRUpP79+2vMmDGqqqpSSkqKHnrooSPGSUtL0zXXXKOqqip5PJ5T+lrPVC6XS1dddZU++OAD9e/fXxEREaEuCUA7HFoyo6ioSH369NErr7yiu+66S1deeaXOO+88ud1u2e12jRkzRhKb3AAAAAAILcNi8b1uraLBqy+Ka7S8sEqNXr9sMiRDctptMgzJsiS/GZQsKShL0S6nxvTzaHiWW8kxrpMe3+/3y+l0yjTNo7oTq6qqVFlZqT59+uivf/2r0tLSNHLkSL322msaNWqUXC6XFi5cqCeeeEK1tbVatGiRoqOj9fzzz2vx4sVt9/nss8/0wgsvKDU1VdOmTVN2dvZRdQSDQVmWRYdkF7IsSzNnzlRMTIwmTZoU6nIAfIPVq1dr5MiRkqTnnntO7733ns4//3zdcsstam1t1Xe/+11NnjxZhYWFmj59utxud4grBgAAAAA6JrutqsZWzdtQqg0ltZIMeaLClJAQ+Y3rf1mWpWafqYVby7Vwa5ny0uM1MS9NnujwDo/f2NioBQsWaNSoUcrOzj4iENyxY4ceeugh2Ww2ud1ujR8/Xr1799aOHTs0YcKEtk1rRo4cqX379kmSYmNjNXjwYH3++eeKjo5WRUWFkpOTJUljxozRxRdffFQNh4ehdPR0PcMwNGHCBP3v//6v8vLylJWVFeqSABxDXV2dGhsbJUm//vWvtXHjRn322Wdau3at/ud//kc///nP9eqrr+rxxx/X3Llz5Xa72z50AgAAAIBQIpjsZoJBS2uKqjWnoER+M6i0uIh2T802DENR4Q5FhTsUMIPaVFqnL8sbdP2wdJ2bnSCbrf2bGkRHR2vy5MlqamrSrFmztGzZMm3dulUvvviiMjMztXz5ctXW1qqurk633Xab7r//ftXX18swDNntdpWUlCg+Pl7h4eG6+uqrde+992rbtm06cOCA/vu//7stlJQOrkcpHQwiDcNoCyHpjgy9+Ph4XXLJJZo3b55+9KMftf1dAeg+4uLiNHbsWK1cuVJr167VF198oeLiYo0aNUrV1dV6+OGHNWPGDN111126/fbb9f7778vlOvmOegAAAAA4WezK3Y20Bky9uXqPPtpSpvhIp5JiXB0KEw9nsxmKjXDKsiyt3FWlqiafBvaKkaMDnYcrVqzQ9OnT9Ytf/EKPP/64JkyYoFdeeUXXXnut1q5dq5ycHPXp00eff/65kpOTVV1drXXr1mn9+vWqr6/XhRdeqEGDBik3N1eXXXaZLrroIl1++eVKTEw8Ts3snt0dpaenq6CgQM3NzcecZg+geygrK1Nqaqq8Xq8WLlyoMWPGKD8/Xzk5OcrMzNSIESOUnp6u/v37h7pUAAAAAJDErtzdRmvA1IwVxfqiuEZZnijFuDpnil2My6ksT5TWFtdoxopitQbMdl/r8XiUk5Oj1NRUJSYmatSoUfJ4PNq6dauGDRumt956S9u3b5fT6dSECRN0ySWXaNOmTbr44ov15JNPKikpSWeddZauuuoqhYcfnE5umqZY1rRnsdlsmjhxopYvX67KyspQlwPgOIYPH66cnBxddtll8vl8+uEPfyifz6f8/Py2c8aOHRvCCgEAAADgSHRMdgPBoKU3V+9Rwd5a9U6IlP0EuySPx2YYinM59VVFo2qbfTo7La5dnYlhYWEqKipSdXW1TNPUeeedp33
79mnDhg2aMmWKfvazn6m4uFgDBgzQRRddpN69e+vaa6/V2WefrcjIyLb7WJbVNh5dkT1TTEyMmpqatH79euXl5fF3CHRDhmGoV69eamhoUHR0tMaPH6/c3NxQlwUAAAAAx0XHZDewpqhaa4qqlZkQecJTt7+NzWaod0KkVu8+ONa/W7lypaZPn37EsejoaMXExMjtduvDDz+UJI0cOVJut1v9+vVT37599de//lUPP/xw2zWWZSkYDB5xH0Ks08PYsWNVXV2tgoKCUJcC4BuMGTNGU6ZM0fjx4yWJLnUAAAAA3RbBZIhVNrZqTkGJkmLCO71T8t/ZbYaSYsI1p6BEVY2tbcdXrlypl19+WZ999plKS0vbHmJtNps8Ho8uvvhi3XHHHTJNUwMHDtTdd98tSUpISNCnn34q6V8PvodvXoPTy6GNjBYuXKimpqZQlwPg32zbtk2macrhcCgpKantOB8OAQAAAOiuDItWipD66/Ld2lRapwx35Lef3En21TRraHq8bj8/W6tWrdKLL74owzBUVFSkAQMG6Pe//71iYmKOe30gEJDD4VBjY6Oio6OPmKqN09+sWbPkdDp1ww03hLoUAF/bvn273n33Xd15553q1atXqMsBAAAAgHZxhLqAM1lFg1cbSmqVFhfRrvNbmhq0Y90K1VcfUGtLk4KmKVdUjBLTe6vP4Hy5IqPbdZ/UWJcK9tXI9vcN+t/fPiufz6eUlBQlJycrPT39mKFkMBhs64R0OA7+2kRHHxyPUPLMctVVV+nll19WYWGh+vXrF+pygDPenj179O677+r6668nlAQAAADQoxBMhtAXxTWSDDns7Zv63NrSLG9Tg5Iz+yg8Mlo2m02NtVUq2bFV5UU7dd41UxUe8e2dlwfHM1Swr07Dhg1TMBiU1+uVaZqqqKhQQ0PDUeEk07NxSGxsrMaNG6cFCxbonnvukdPZOTvIA+i4iooKvfnmm7riiis0aNCgUJcDAAAAAB3CVO4QCZhBPTV/q1wOm6LCTy4fLivaoYKlHyl3+Bj1GZzfrmuaWgPyBoJ66pqz5LDb1NLSourqajU0NKh///6y2+0nVRNOb8FgUK+++qr69OmjcePGhboc4IxUV1en6dOna/jw4br44otDXQ4AAAAAdBhtcCFS2ehTo9evyLCTDwBd0bGSpICv9VvO/JfIMLuavH5VNvokSREREUpPT9fAgQMJJfGtbDabJk6cqJUrV6q8vDzU5QBnnObmZs2YMUO5ubm66KKLQl0OAAAAAJwQgskQKa/3yibjhNZnNM2AfN4WeZsaVVm6R1tXHtwZOzE9u933MAxDhgyV13s7PD4gSSkpKRo5cqTmz58vGq+BruPz+TRz5kwlJyfrqquuYp1fAAAAAD0WwWSIlNa2SCf4LFm2e4c+efvPWvLuX7T24/fk93k19MIr5E7u4KYHxtd1ACfo4osvVkNDg9auXRvqUoAzgmma+vvf/y6Hw6EbbriB9X8BAAAA9Gg80YRIbYtPznZuevPvPGmZSh88UoNGXaq+Q0bI4XB2aBr3IU67TXVe/wnVAEhSWFiYrrnmGn388cdqaGgIdTnAac2yLM2bN08NDQ2aOnWqHA72rwMAAADQsxFMhkjAtHSis+/CXZGqbfYpMT1bA/LP06CRF2vrqs+0b8eWDt3HMCS/yRRcnJycnBz1799fH330UahLAU5rixcvVlFRkW6++Wa5XK5QlwMAAAAAJ41gMkQcdkMnuixffX297Da7oqKiJEkJqRlyRUZr386tHbqPZUlOO2uT4eSNHz9ehYWF+uqrr0JdCnBaWrFihdavX69bb71VMTExoS4HAAAAADoFwWSIxEeEyW8GT+jamtpaxbvjdXjLZdAMKODzdeg+fjOoOJfzhGoADhcdHa3LL79cCxYskK+Dv4cAvtnGjRu1ZMkS3XzzzfJ4PKEuBwAAAAA6DcFkiKTFR0gd7JhsbWmSJNXU1MgdH992fP/ur+Rr9SouMaVjN7S+rgPoBPn5+YqPj1dRUVGoSwFOG4WFhZo3b54mT56stLS0UJcDAAAAAJ2KlfNDJCXWpaAsWZYlo52LTX61boXqqw9of1WdYoefq6LactUeKFP5nkK5IqKUkzey3eNbliVLllJiWacMncMwDN1yyy2hLgM4bZSUlOjtt9/WpEmT1K9fv1CXAwAAAACdjmAyRBKjwxTtcqrZZyoqvH1/DSm9+6qm6oCaq8q0a+MqSYYiomOVPWiYsgfnKzwist3jN/tMRbmcSowOO8FXABzN6WRpAOBENTU1yTRNxcbGqqqqSm+88YYuvfRSnX322aEuDQAAAABOCYLJEHHYbRrTz6OFW8vbHUwmZ/ZVQ2tQKbn5ys3NPanxq5p8uvKsFDnszOYHgO5g3rx5WrVqlX74wx/qH//4h/Lz8zV69OhQlwUAAAAApwzBZAgNz3Jr4dYyBcxguwPCmtpa9e7d+6TGDZhBSZbys9wndR/gm9TU1KimpkZ9+/YNdSnAKRcwg6ps9Km83qvS2hbVtvgUMC057IbiI8KUFh+hlFiXEqPDjvl+b1mW1q5dq3379uk//uM/NGXKFI0bNy4ErwQAAAAAug7BZAglx7iUlx6vTaV1ynB/+zRsv8+npsamIza+ORFl9V4Ny3ArOYb1JdH5TNNUeXm5PB6PFi1apPT0dIWHh4e6LOCUqGjw6oviGi0vrFKj1y+bDMmQnHabDEOyLMlvBiVLCspStMupMf08Gp515HtweXm59u/fL5/Pp/DwcK1Zs0YbN25UXl5eCF8dAAAAAJxaBJMhNjEvTV+WN6jB61eM65vX56utrVVUVJScYSe+LmSD1y+n3aZrhvY64XsAx9Pc3KxVq1Zp8+bNGjNmjCZPnhzqkoBToqqxVfM2lGpDSa0kQ56oMCUkRH7jZmaWZanZZ2rh1nIt3FqmvPR4TcxLkyc6XNu3b9eePXvkdDqVmpoqy7K0Y8cOgkkAAAAApzXDsiwr1EWc6VbtqtLrq4qV5YmS3Xb8h9qvtn8lu92mfjk5JzSOGbS0p7pJN4/M0qi+nhMtFzim0tJSrVu3TkVFRfrb3/4mn8+nt956SwMHDpRpmrLb7aEuEThpwaClNUXVmlNQIr8ZVGqs64TW6g2YQZXVe+W023T9sHS99puf6/PlyzV27FiNHz9eI0aMUEJCwil4BQAAAADQfRBMdgPBoKXXVxVrbXGNshIiZTtOOLlq5Srl9M+Rx9PxUDEYtFRc3axzs926eWTWcccATkRtba3mzZunwsJCff755+rVq5euvvpq/eUvf9F7773HVG6cFloDpmat2as1RdVKign/1i739mjw+nWgoVW2qt2aem5vjTp3uGw2NiUDAAAAcGbg6acbsNkMTTk3U0PT41Rc3SwzeHRW7PV65W31Kj4ursP3N78OJYemx2nyiExCSXS6+Ph4OZ1OzZo1S9ddd51ee+01TZkyRcnJydq7d2+oywNOWmvA1IwVxfqiuEZZnqhOCSUlKcblVJYnSlZiX23xeeQ/xvs/AAAAAJyu6JjsRloDpt5eu1erdx/djbN//3
6VlZXpnHPO6dA9D3XjjOyToMkjMhXuYDotTp21a9dqxIgRampq0iuvvKKKigr96le/UllZWdu6ed+0Bh/QHR3qav+iuEa9v6Gr/WTHoKsdAAAAwJmGjsluJNxh180js3TLqCw1+0ztq2lWwAxKkmpraju0G3fADGpfTbOafaZuGZWlm0dmEUrilAkGD/6ejhgxQoWFhZo2bZpM09TTTz+t559/XqNHj9bq1atlGIZM0wxxtUDHrCmq1pqiamWeolBSOtg53zshUqt3HxwLAAAAAM4EdEx2U0fs+GpJu7Zt0rDBuYp3u497zaEdX6uafJIsDctw65qhveSJZn0/dJ1169Zp586d+s53vqNHHnlEhYWFuu222/SHP/xB7777ruI7ELADoVbZ2KrnF25XZJi906Zvf5MGr1/NPlOPXZHLezcAAACA054j1AXg2DzR4bpjTB9VNHj1ycbd+nJzUHWmUw1VzZIhOe02GYZkWZLfDEqWZMlStMup8YNTdU7veCXHuEL9MnAGys/PV0JCgq644goNHDhQ8+bNkyQtW7ZMTU1NBJPoUeZvKJXfDCrG1TXvpzEup+pa/Jq/cb9uPz+7S8YEAAAAgFAhmOzmkmNcyrbXaeoAh6686iyV13tVWtuiOq9fftOS024ozuVUWnyEUmJdSowOk8PODH2EzquvvqqZM2cqISFBw4YNkyT9+c9/VmNjo8rKylReXq78/PwQVwl8u4oGrzaU1CotLqJd51uWpeJtBdr71Ra1NNYpzBWpXtn9lTNslOyO9ndbpsa6VLCvRlc1pPIBEwAAAIDTGsFkD7Br1y7179dXqXEupca5lJdJxxm6r8jISL3++utKTU3VpZdequLiYu3Zs0dTp07Vpk2btG/fPg0ePFjh4UxTRff2RXGNJKPdH/Z8uWapir/cqJTe/ZR91jA11dWoaFuB6msqNeKya9u98dPB8QytK67R+CG9TvwFAAAAAEA3RzDZzQWDQRUXF+vSSy8NdSlAu0ydOrXt6/nz5ysqKkqbN2/W4sWL5Xa79ZOf/ET79+9XYmKioqKiQlgpcHwBM6jlhVXyRIW16/yGmirt2b5JKb376ZxLrm47HhEdq21rlqq8eKdSs/u3e3xPVJiWFVbpskEpdMEDAAAAOG3xtNPNlZSUyG63KzU1NdSlAB0WERGh+vp6Pfnkk+rfv79uu+02bdy4UbNmzdLSpUslSRUVFaqqqgpxpcCRKht9avT6FRlmb9f5ZUVfybIsZQ3KO+J4xoDBstsdKt21vUPjR4bZ1eT1q7LR16HrAAAAAKAnoWOym9u1a5eys7PbPQUQ6E5sNptiY2OVkpKi9evXq7i4WF6vV/369VNNTY2mTZsmwzB06623yuPxhLpcoE15vVc2Ge1+762rLJdhGIpLTDniuN3uUExCouqqyjs0vmEYMmSovN6r1DjWmQQAAABweqJjspvbvXu3+vbtG+oygBMSDAYlSS+88IIqKytVWFio/Px8XXTRRWpubtbMmTNls9k0ePDgEFcKHKm0tkXqwOdBrS1Ncoa7ZLcf/XmfKzJarS3Nbf8e2s34ug4AAAAAOE3RMdmN+f1+7d27V5MmTQp1KcAJsdlsCgaDCgsL0+OPPy6v1yu73a6XX35Z4eHh+vvf/67S0tK2803TlN3evqmzwKlU2+KTswNrO5qBwBGhpLelRS6XSzIM2b4+HjQDstnat2alJDntNtV5/e0vGgAAAAB6GDomu7E9e/YoOjpabrc71KUAJ8xmO/g2k5SUpMTERD300EPKyMjQT3/6U40ePVoOh0MPP/ywgsGg7Ha7LMsKccWAFDAtdWQFDbvDIdMMSJLKy8u1Zu1a+fwHQ8Xg18dtx+im/CaGIflN/j0AAAAAOH0RTHZju3btUt++fVlfEqeNiIgIZWRkKCIiQpK0evVqrVq1SoFAQI888ogk8fuObsFhN9SRjDw8Ikr+Vq9K9+3VV199pSFDhigs7GB3pLe5UeERkW0hfXtZluS08+8BAAAAwOmLYLIb27Vrl/r06RPqMoBOcWh9vd/+9rfKzc3VJ598or59+6q8vFwvvfSSMjIy2J0b3UZ8RJj8ZvvXhIxLTFFDfb02rf9CZ599dlunu2kG1FBdqVhPcodr8JtBxbmcHb4OAAAAAHoKgsluqqWlRWVlZWx8g9PG4d1iSUlJuueee1T7/7N3n4FRlWkbx/8zkzLpnYRQEkqCoa9SBaSsggioiBQhlN1FRRfFShMWQRFUdNVV4F1RcREFbAhSRBEERKQqRYqQHiAkkEr6zHk/sMmCAlKSzASu36dk5sw510k5ybnnfp4nKws3NzcyMjJ48sknCQoK0lBucQrh/h5wGT+Khrs3mZmZeFtK8ff3L3885dA+bLZSwus1uvwQxn9ziIiIiIiIXKNUmHRS8fHxhISE4O3t7egoIhXKbrcTERHBSy+9xMSJEwkKCiItLY3PP/8c4JzCpIqU4iihvlbsGJf0M5iSksKJU9n8qUM3cjOOsWv9SlJ+3ceBbRs5sH0jQWG1CYuMuqzjG4aBgUGor/VKT0FERERERMTpaVVuJxUfH69h3HJNKuucvOuuu+jcuTP+/v5kZmYyYsQIoqOjCQgI4OTJkzRr1gyTyYRhGJp3UqpcsLcb3lZX8otteLlf+E9lclISSUnJtGjeHC9vbxL3h5Ly6z7SU+Jxs3oSGdOSBi3aXPbPcH6xDS+rK8Hel76Kt4iIiIiISHWjwqSTiouLo3v37o6OIVKp/P39sdvtBAQE8MEHHxAYGMiiRYv45JNP6N69O88884yjI8p1ysVipkODINb8knbBwmRSYiLJKSm0aNEcbx8fAOo1uZF6TW686uOfPF1Mj8ahuFg0sEFERERERK5duuNxQtnZ2WRmZhIREeHoKCKVzmw2k5GRgbu7OzabjX379rF06VKOHj3KkSNHMJlMpKenOzqmXIduiggADEp/uwiOYZCQkEBySgotW7QoL0pWlDPHM7gxIqBC9ysiIiIiIuJsVJh0QvHx8YSHh2O1am4xuT5kZWXx1FNPERYWRlhYGDk5OUyZMoXt27fz4osv8u9//5vs7GxHx5TrTA0fKy1q+XM8p/B/DxoG8QkJHD16lJYtW+JVCfMAH88ppGXtAGr46G+AiIiIiIhc2zSU2wnFxcVpNW65rjRs2JCGDRsSGxtLeno6jz76KIGBgXz00UesW7eO48eP4+Gh1Yml6vVpEc6BtFxyC0vwcXchLi6OtLQTtGzZEk9Pzwo/Xm5hCa4WM72b16zwfYuIiIiIiDgbFSadjGEYxMfHc8899zg6ikiVKFvcZubMmWzcuLG8Y3LEiBHcdtttREdHk5ycTHR0tKOjynUoyNudvi1r8cGPiRxLTuBkRgYtW7bAoxKKkja7QUZeEUPaRBDk7V7h+xcREREREXE2GsrtZDIyMigoKKBOnTqOjiJSJUwmE3b7mTn8OnXqRE5ODsOHD2fcuHE88sgjzJw5k+zsbI4cOQJQvq1IVWkVEYBH3jGOpOXQokXlFCXtdoOkU/m0jgykdWRghe9fRERERETEG
xeDA2NkanTi9+87m4uCA8PBwBAQFMBvG7d+/We8533nkHvXr1goWFBSQSCbOI/tZbb8HKygoSiQSurq5ax5iYmEAul8PGxgbjxo3DihUrMGzYMADAv/71L3h7e2PcuHEwNDREz549cfr0aQBVmaznz58PY2NjWFpaoqCgAOvWrQNQFQu0e/fuTBtubm64ePEi/vjjD3Tu3BkSiQQKhQKurq4QiURwdHTEvn37sGTJEhgbG2Pjxo04ceIETExMGj2eLeFl3x/fRDjEmvqxsLQp2YVlWBsRC0uxoM4FwvS0NCQlJcPFpS94z/0S/ODBAxiKRLB5LkZUQ1SqNUgvKMWyMV1hJmJdudsDRAQOh4O4uDhER0fjnXfegZmZGVJSUjBnzhycOXOGqcPCwtJ+UCqVEIvFKCgogKGhYVuL88pITU2FjY3NG9fv1iQoKAhxcXHYs2dPW4vCwsLC0mocPXoU/v7+ePjwYVuL8toyadIk9OvXD8uWLWP2Xbx4Ed7e3i/VvZ+l/dGY91TdVyxTu0etViM/Px/GxsZtLQrLa4KZSB89rSS4n14AayODF8qLCgsRH/8Ezm85v7AoCVRZmJSWljW53UxlGXpZG7GLku2I6gVHJycnODk5obS0FDExMYiMjMTbb78NtVrNJsBhYWnHpKWltetYjM+TkZEB4M3rd2uiVCpRUlLSLLdEFhYWlr8j5eXlWL9+Pby9vdl7WzN59OgRzpw5g88//1xrDHNycqBWq9lxZWkShYWFDdZhLSZbkaKiIvD5fOzZsweTJ09msnCxsDTE06JyrD/zEAZ8HYj0/5flrrKyErdu3oKFhbm2C3cNMtLTkZ2Tg549eza6vcKyCpSo1PAf2QXGwvrjrrC8nmzatAl3795lgoEvWbLkBfc9FhaW9kF+fj7Mzc0bnc22PcHn86FqZpxlFhYWFhYWFhaWl4+enh4yMzMhkUhqLWcXJlsBtVqNBw8eIDo6Gh4eHrCxsWFdJVmaTOSTp9gZmQS5cQfocDkAER7ExECtVuMtZ2egDp3Kz8vDw4d/wdXNtdby51FrCMnPijGtvxyu9qxlb3tDo9GAy+Xi8ePHTObWsrIyFBUVoXv37nBxcWHqsLCwtA+qXWRSUlLeKJfmtLQ0dOvW7Y3rNwsLy6shMDAQ2dnZ2Lx5c1uL8gILFy6EmZkZAgICmnzsH3/8AT8/v3rjD1paWuLSpUtwcHBoiZjN5q+//kK/fv1QUFDQJu3XhlgsRlRUFBwdHV9qO66urli3bl2DMUWbSkpKClxdXZGQkNBgQqD2Tnh4OH799VecP3++rUV5I1AqlQ2G3mFduVuIUqnErVu3kJKSgoiICOzYsQN79+6FiYkJKisroavLDjFL4+jXUYqHWYW4mZQHudQAGelVrmkuffvWuSgJAPqC/8/evcflfP+PH390XR1EdV1dlUo6OjOEmcNkzIYJOWwoUfPZEGbMNspGNhszm81sWljO5rA5k7EZmUPMyBBmpRQq0kGn66rX7w/f3j8XlbOY1/12c7u5rvfr/X49X+/Xq+vwul4HSwoKCxAlJZjcprOppESQdCWPlh46Wno8moXtpUertMPRxcWFb775hsuXL/Pss8+SmprK1q1b+fHHH+UPJ5L0H2VjY/NEd9AFBwezfPlyo111T5w4gVs5ayiXTt9+0sstSdLjycLCAjMzM1QqFd26dePEiRMUFRXh5eXFlClTHskOwOX54Ycf7vncrl27kpCQoDz28PAgIiKCrl27Ks/l5ubeV3z3q3SWz528tj/KdQ+trKwe+vvNyZMnH8p1GzVqVGn1+jDqaNOmTUyePJl//vkHtVpNw4YNmTVrFi1btrwlv5KSEkJCQoiJiWHHjh1YWlqiVqvlZ4fHiOw1uw+5ubls3bqVlJQU9uzZQ1JSEu3atWPy5Ml8++23slNSuisqlQn9W7pSZCjhwJkLpCck4O3dpMx1JW9kYWGBCSYUFBZiaWlZbrri/+uUbOKiod+zrqhUsnPqv6y4uJhffvmFHTt2KM81a9aMnJwcrK2tKzEySZKk8r3zzjvKLp2SJEmPAwsLCyIiIqhXrx5qtZp9+/bRuXNn4uPjcXFxeaSxCCEoKSmRa4ZLT5WSkhJMTEyUwRVnz54lICCAVatW0blzZwoKCti9e3eZI0ENBgNBQUGcPHmS3bt3P9Sdu+XAtHsn5/LdBysrK0pKSliwYAG1a9fm119/5auvviI+Pp6DBw9WdnjSE8jCVE2/Zk4YLp7G0sEVlcXt1yk1MTGhSpUq5Ofnl5smp0DPucvXaOlhy6A27liYyg8z/2VCCKysrCgoKDD61XXNmjVy7VtJkiRJkqQyxMXF0bJlS6ytrfH19SUzMxMAMzMzuiSIRwAAIABJREFUGjZsiFqtRgiBiYkJer2ec+fOAbBw4UJat27NhAkT0Ol01KxZk02bNrFt2zbq1auHVqvl3XffLTff8PBw+vTpQ2BgINbW1jRq1Ihdu3Ypxzt06EBYWBgdOnSgWrVqxMbGEhwczIQJE4Dro9GcnJz4+uuvcXZ2xs7Ojvnz5/Pnn3/i7e2NRqNh0KBBGAwGo/QA/v7+JCUl0bt3b6ysrJg4cSJw/ftFfHw8sbGx6HQ6o7V8Y2NjsbW1VdY1Xrp0Kc888wxarRYfHx+OHz+upJ05cyaurq5YW1vj5eXFjz/+WOY9KCgo4I033kCn01GnTh2jH9YBFi9eTKNGjZTrfPvttwBkZWXxyiuvkJaWhpWVFVZWVpw8eZKEhAQ6deqEnZ0d9vb2+Pv7K/VZkaysLIYPH07NmjXRaDT4+PiU+R2rqKiIsLAwPD09sbe3JyAgwOj6/v7+ODs7K9c4duyYciw4OJgRI0bQp08frK2tadKkCUeOHFGOe3h4EB0dDVxvG3379uXNN99Eo9FQu3Zto3uTlJTEiy++iLW1Ne3atVPaSVkSExMxMTGhoOD6pqkdOnRg4sSJvPDCC1SrVo0OHTqQlpbGuHHj0Ol0eHl58fvvvyvnd+jQgdDQUNq2bYu1tTWdOnUiOTm5zGsDDBgwgPDw8HLrCCpuOx4eHsyYMYMWLVpQtWpVo9GWf/31F66urnTt2hWVSkXVqlXp2rUrTZo0MSpzYWEhffv2JSEhgd9++63cTskzZ87wyiuvYG9vT61atfjuu++UY4cOHaJt27ZotVqcnJwYMWKE0ZreJiYmfPfdd9SvXx+tVqvciyVLluDp6YmtrS1jx44tM1/p/5Mdk/epb9++fPzxx8yYMQONRsPmzZtp0aIFLVu2VNIUFxdXYoTSk0QIwdbNG3nRzZwxvs3JKyrmfGYehuKSCs+ztLSkoIw3TUNxCecz88grKiawlTsDn5Odkk+D0qWDFy1aRIMGDTh+/DiRkZEcPHiQy5cvG6WRJEl6nERGRqLT6WjatOl9TVWUJEm6G3q9Hj8/P3r37s2VK1d4++23WbJkiVEaHx8fqlSpQps2bWjfvj2tWv3/9d3//PNP3N3dSUtLY/z48bz++utERUURGxvL4cOHiYyMZO/eveXmv2HDBrp06UJmZibvv/8+fn5+Rh
1dCxcu5KuvviI3N5fmzZvfcn5GRgaXLl3i3LlzLFy4kFGjRjFlyhS2bt3KP//8Q0xMDKtWrbrlvBUrVuDm5sbatWvJzc3lk08+MTr+3HPP4eDgwJYtW5Tnli1bxmuvvYaFhQUbN27kgw8+YMWKFVy+fJnAwEB69OhBUVERp06dYtKkSezYsYOcnBz++OOPWzqOSn388cccO3aM+Ph4/vjjD5YtW2Z03N7envXr15Odnc2iRYt47733OHjwIBqNhq1bt1K9enVyc3PJzc2lQYMGCCEYP348qampxMfHc+HCBT788MNy73+poKAgLl26xF9//cWVK1eYPn16meuyh4aGcvjwYQ4cOEBycjLm5uaMGjVKOd65c2dOnTpFWloazz33HP7+/kbnL1++nHfeeYerV6/y4osvMnr06HJj2rRpE926dePKlSuMHDmSIUOGKMf8/f1p2LAh6enpzJkzh6ioqNuW8UbLli3j+++/V3bbbt26NY0aNSI9PZ0RI0YwdOhQo/Q//PADc+bMIT09ndq1axMYGHjbPMqro4raTqnFixezevVqcnJycHBwUJ5v0aIFZ8+eZdSoUWzfvr3MtUj1ej2+vr7k5uayffv2cjddycvLo1OnTvTs2ZMLFy6wZcsWpk+fzvbt2wFQq9XMnDmTjIwMDhw4wK5du/jmm2+MrrFq1Sp2795Nenq68tyOHTv4+++/+fPPP4mKiuK333677b16msmOyftQUlKCubk5ffr0ISMjg2nTprF9+3ZGjx7NkSNHGD58OEII1Go1JSUVdyxJEsCBAwdISUmhb5/etKplz3ud69G4hobUrHySruRxrdBQZodSFcv/P2JSCMG1QgNJV/JIzcqniYuW9zrXo5WXnZy+/ZQo/QBVq1YtFi1axBtvvMHUqVMpKiri7bffruToJEmSyjZ69GhOnz5NWloaX331Fe+//z4//fRTZYclSdJTYN++fVy7do0JEyZgZmZG586d6dKli1GamJgYcnJyWLduHV26dDGaTu3i4kJISAimpqYMHDiQjIwMxowZg0ajwcvLi9atW3P48OFy82/atCmDBg3C1NSUoKAgPD092bx5s3J88ODBeHt7o1KpypyuqlKpmDJlCubm5vTo0QNzc3MCAgJwdnbGwcGBzp07V5h/RQYOHKh0FBYXF7Ny5UqlQ2ru3LmMHz+exo0bo1arGTZsGCYmJuzfvx9TU1OEEPz999/k5+fj7OxMw4YNy8xjxYoVfPDBB1SvXp3q1asro0FLdevWjdq1a2NiYoKPjw9dunRh9+7d5cbs5eVF586dsbCwwN7enrFjxxqNQi3LxYsXWb9+PZGRkTg4OKBWq3n++edvud9CCCIiIpg1axbVq1fH0tKSjz/+mNWrVysDkl5//XVsbGywsLBg0qRJHD9+XBkcANCrVy/atWuHWq1m8ODBFdZNmzZt6N27N2q1mqCgIJKTk8nIyCApKYl9+/bx6aefUqVKFby9vRk4cGCFZbxZcHAw9evXp2rVqvTu3Rtzc3OGDBmCWq0mICCAM2fOGK1LGRgYSPPmzalSpQqfffYZMTExnD9//q7yLFVR2yk1atQovLy8MDMzM5oi7enpSUxMDFeuXGHw4MHY29vTq1cvLl26pKTJzc0lJiaGwMBAqlWrVm4cmzZtwtnZmZCQEMzMzKhXrx5vvvkmK1asAK4vhdW2bVtMTU1xd3dn6NCht7SlCRMmKG2h1JQpU6hWrRpeXl60b9/+nv/+nhayY/I+3PjryR9//IGJiQlfffUVu3bt4rXXXiM1NZWPPvrolrSSVJbz58/z66+/8tprrynTbe2sLAh+3pPQVxrQpaEjBYYSkq/kkXw5j+QreVzMKuBSdgH5WJCSVag8X2gooWsjJ0JfaUBQWw/srJ7undeeVrm5ufz000/s27ePxo0bM3jwYE6dOkVubq7cAEeSpMdO8+bNsbe3x9TUlI4dOzJy5EhWr15d2WFJkvQUSE1NxcXFxeg7m7u7+y3pzM3N8fPzY8uWLWzYsEF5vnRqNKB8jr/5uYo2Hrl5ky93d3dSUlIqjOVGOp0OMzMzo/zuJv+KBAYGsnnzZrKzs9mxYwdVqlTBx8cHuD6F97333kOr1Sr/Lly4QEpKivID+ezZs3F0dMTX15f4+Pgy80hNTTW6BzeXd+vWrbRu3RqdTodWq2Xz5s1kZGSUG/OlS5cYMGAALi4u2NjYKJ3FFUlKSkKj0RiNzCtLeno6eXl5tGnTRilz48aNUalUXLx4keLiYiZMmECtWrWwsbFRynJj/jfXzbVr18rNr6y2lZubS2pqKhqNxmgDF1dX1wpjv921y8ur1I11pNVqsbGxMWqnd6OitlOqonbfsmVLli9fzoULFzhy5AhJSUmMGTNGOW5ra8tPP/3EyJEjWblyZYVx/PXXX0ZxzJgxQ5k6fvr0abp3746TkxM2NjaEhobe0pbKivNB/f09LeTKnPepdJ0RPz8/Xn75Zfr3709ubi579+7FwcEBHx8fhgwZctcvEtLTJT8/n9WrV9OxY8cy20p16yp0fcaZlxo4kpFbxKXsAlKv5pNVoEdfLHAyKyDhVDJBbT1wtKmCvZU5pmrZGf40K11nMi8vj8uXL6PT6QgJCWHYsGFUqVKlssOTJEm6LZVKJZedkCTpkahRowYpKSmUlJQonZNJSUnl7tprMBg4e/bsA8s/KSnplsevvvqq8vhh/qB8u2t7eXnh7e3NmjVr2LlzJwEBAco5bm5uvP/++wQHB5d5br9+/ejXrx95eXlMmDCBN998k5iYmFvS1ahRg6SkJJo2bQoY34/SdQJ/+OEH+vbti5mZGb1791beH8qKPywsjJKSEuLi4rCzs2P9+vUMGzaswnK6ubmRlZVFRkZGhRuk2NvbY2lpyZEjR/Dw8Ljl+JIlS/j555/Zvn07np6eZGdno9VqH/j7WY0aNcjKyjLa2LJ0zceH5cZ6uXr1KtnZ2bi4uCjfLfLy8pT/X7x4kfr16wNl19Ht2k5555WlUaNGDBkyhO+//97o+e7du/Pjjz8yYMAAVCoVr732WplxtG3b1mg9zRuFhITQuHFjli9fjo2NDV9//bUymvJu45TKJ3su7lNpI1y5ciWvvvoqCQkJzJ07FwcHB/bt20enTp2wsrJCr9cDcl036VZCCNatW4eTkxNt2rSpMK2pWoWTpgpNXbW80tiZAS3dGNTanYGtPalemEqTmhqcNFVkp6SkvNa8+uqrpKamMnjwYFxcXHj11VflbnGSJD2WVq1aRU5ODiUlJezZs4c5c+bQu3fvyg5LkqSnQJs2bbC0tGTGjBno9Xp27NihbEBy6NAhdu7cSWFhIUVFRSxYsIB9+/aVu8nIvTh69CjLly/HYDCwZMkSzp49S7du3R7Y9Svi6Oh4207WwMBA5s+fz7p164zWFQwJCWH69OkcPXoUIQS5ubls3LiRnJwcTp06xY4dOygoKMDCwgIrK6tydxPv378/n376Kenp6aSnp/PZZ58px4qKiigsLMTBwQFTU1N++eUXfvnlF6P4MzMzjdbkzMnJoVq1a
mg0GlJTU5k5c+Zt74OTkxM9evRg+PDhZGRkUFxczN69e402OoHrP5oNGzaMsWPHcuHCBQDS0tJYv369kreFhQV2dnbk5+fzwQcf3Dbve+Hm5kbr1q2ZOHEihYWFxMXFsXz58oeSV6lly5Zx5MgRCgoKCA0N5fnnn6dmzZrY29tTs2ZNFi1aRHFxMevXr2ffvn3KeWXVUUVt53b27NlDRESEMqrx3LlzLFu2rMzv0j179mTFihUEBQWVuTxM9+7dSUxMZMGCBRQWFmIwGDh27JiymXFOTg42NjZYW1tz+vRpIiIi7vq+Sbcney8ekAMHDjBnzhzCw8MJCAjgs88+Y+rUqbi5uREdHa28SMjedOlmsbGxXLp0iV69et1z+9Bqtej1ejlEXFKU/tofHBxM48aNqV+/Pk5OTtjZ2ck1byVJeizNmTMHV1dXNBoNw4YNY+rUqQwYMKCyw5Ik6SlgZmbG+vXrWbNmDba2tsyaNYtBgwYB1zfRGDduHPb29jg6OjJv3jzWrFlDs2bN7imvmJgYrKysjJ7r2bMnW7ZswdbWlmnTprF27Vp0Ot19l+tOhIaGMmPGDLRabbkbxPTv359Dhw5Rp04do3Ui/fz8mDx5MkFBQWi1WurUqaNsGlRYWMjEiRNxcHDAzs6O/fv3K506N9+DSZMm0aBBA+rWrUvbtm2NNouxtrZm9uzZBAQEYGtry5IlS+jRo4dyvH79+gQGBlK7dm20Wi0nT55k8uTJxMXFodVq6datG7169bqje7Fo0SI0Gg2NGzfGzs6O0NDQMj83T58+naZNm+Lj44O1tTVt27YlNjYWuL4eqJeXFy4uLjRo0MBoU9wHbfny5cTFxWFvb09ISAgDBw40WhOzUaNGt2wkdD9KdxR3cHDg1KlTRtdesGABc+bMQafTsXnz5tvWUUVtpyyvvPIKn376KXD9u++WLVvw9vamWrVqtGvXjmbNmvHFF1+Uea6fnx/Lly9n8ODBrF271uiYlZUV27dvZ8OGDbi6uuLg4MDQoUPJzs4Gru8sv2rVKqytrfnf//5X5qhL6f6ZCDmE74H766+/MDExITc3l5SUFM6dO4efnx916tSRa01KRkpKSigoKCAnJwdHR8f7utaXX37Jq6++essaNZIE139tfuWVV/j1118rOxRJkh6w7OxsNBoNWVlZ5U47/C86f/48rq6uT125JUn6bwkPDyc+Pp4ff/yxskORnnBvv/02165dY/78+Q/82h06dGDAgAEMHz78gV9b+m+7k8+pcj7fA1RcXIxaraZZs2ZkZ2czadIknnvuOUaNGkVmZiY//PADffr0QafTYTAY5HRKCZVKhaWlpbK48P2wtbXlypUrsmNSKpO5uTkzZsxg/fr12Nra0qxZM6ytrZV1ciVJevKlpKQov/A/DUqn0D1t5ZYk6b8lOzubvLy8e97dWHp6xcXFYW1tjYeHBwcPHmThwoV88803D6UtFRYWkpmZKdupdNfuZHq+HDH5kMTFxTF06FC2bdvG33//TVxcHEIIHBwcePbZZzl58iTPPfdchQvrStLdWL9+PTY2NnTs2LGyQ5EeM7t370YIQcuWLVmyZAknT56katWqfPrpp8oPKpIkPbmuXr2Kk5PTLetgPQ3Mzc0pKiqq7DAkSZIkSZKkclhYWHDx4kW0Wm2Zx+WQvYdACEGTJk0YPHgwQUFB1K9fn5YtW9KuXTsiIyOZMWMG48ePl52S0gNla2tLenp6ZYchPUZKR0NmZ2cTHR1NvXr1+Ouvvxg3bpyyELfslJSkJ59KpaKwsJDk5OSnakpzSkoKDRs2fOrKLUlS+SZPnkxaWhpz586t7FBuMWbMGKpXr05YWNhdn7t3715CQkI4evRouWlq1KjB7t27qV279v2EeVemTZtGZGSk8h70tH2u1Gg0HDx4kLp16z7Q686cOZOzZ88+0nY8bdo0Tp8+TVRUVJnHo6KimDZtGrm5ufzxxx94eno+stikJ1t2djaurq4VLmsoOyYfohEjRmBlZUX9+vWpWbMm27Zt4/z581y+fFl5w5DTKKUHRafTcfr06coOQ3qMlL62PPfcc8yYMYM9e/bQqlUr6tSpQ8+ePSkpKZHr3krSf4iNjc1/ooPup59+YvLkySQkJGBvb8+sWbPo06fPLelKp2//V8otSdL9s7CwwMzMDJVKRbdu3Thx4gRFRUV4eXkxZcoU/Pz8Ki22H3744Z7P7dq1KwkJCcpjDw8PIiIi6Nq1q/Lco94EMzk5mS+//JKEhARq1KhxX9dKTEzE09OT/Px8qlSp8oAifDSsrKzu6z1o4cKFREREsH//fuW5jz766EGEdldK/3bKKoteryc0NJRdu3Y9kM18TExMOHnyJPXr17/va91OTEwMr7zyivL42rVrVK1aVfmetHXrVnx8fPjqq68YO3Ysa9euNdosqbRt1qlTx+i7tl6vx8XFhfT0dKXdBgcH4+TkxPTp041ieJTlfVLJb6QPgYmJibJ71+DBg6lfvz5ffvklycnJfP/99xw4cIC6dety7do1o7SSdD9K15iUpJtVr16dL774giNHjihT/VNTU4mMjGTr1q1cvny5kiOUJEm67rfffmPMmDF8//335OTkcPDgQby9vSs7LEmSnjAWFhZERERw6dIlsrOzmTt3LoGBgaSkpDzyWIQQFBcXP/J8H7Zz585ha2t7352SD4rBYKjsEP6zLl26RH5+Po0bN67sUIC7q2sfHx9yc3PJzc0lIyMDgOPHjyvP+fj4ANdHhOp0OhYuXFjmdUxMTNi7d6/yePPmzeh0unsvhGREdkw+JDeOQjpw4AA7d+5k4sSJyuM33niD119//Za00n9fecu63m8HtU6nIy8v76lcY0y6vZYtW9KvXz+io6N5//33Wb9+PXPnzkWj0WBmZlbZ4UmSJAEwadIkJk2axPPPP49KpaJ69ep4eXlVdliSJD2m4uLiaNmyJdbW1vj6+pKZmQmAmZkZDRs2RK1WKzPU9Ho9586dA66PUmvdujUTJkxAp9NRs2ZNNm3axLZt26hXrx5arZZ333233HzDw8Pp06cPgYGBWFtb06hRI3bt2qUc79ChA2FhYXTo0IFq1aoRGxtLcHAwEyZMAOD333/HycmJr7/+GmdnZ+zs7Jg/fz5//vkn3t7eaDQaBg0apHTAlKYH8Pf3Jykpid69e2NlZaV8xzQxMSE+Pp7Y2Fh0Op3R+ruxsbHY2toq3xOWLl3KM888g1arxcfHh+PHjytpZ86ciaurK9bW1nh5eZW5W3h0dDQvv/wyaWlpWFlZ0bt3bwAOHjxI+/btsbW1pUGDBvz888/KOVu3bqV58+bY2Njg6urKhx9+qBxr27YtAPb29lhZWREdHa3U0Y2cnJz4/fffjepgyJAhaLVaPv/8c4qKiggLC8PT0xN7e3sCAgKUNlEeIQTvvfcejo6O2NjYUL9+fSUPIQRffvkldevWRafT0a1bt3I3f7ld3rGxscq9cXJyYtq0aRw7dozhw4dz8OBBrKyssLKy4tq1a4SHhzNgwACje9e0aVM0Gg2tWrVi3759
yrHg4GBGjBhBnz59sLa2pkmTJhw5cuSuy3qjkpISRo4cSceOHTl69Cj16tVT6qdFixYAnDlzhldeeQV7e3tq1arFd999p5x/6NAh2rZti1arxcnJiREjRihtr7SuW7RogZWVFZGRkUbtu1Tr1q2VTsLStjB+/HiqV6/OyJEj76pubufw4cP8/fffzJs3j82bN5OWlnZLmqCgIKNOy4ULFxIUFHRP+UllENIjsXjxYnHixAlRWFgo3n33XXHo0CExbtw4sWTJksoOTaok8+fPFx9//LEYN26cyMjIeCDXnDZtmkhNTX0g15L+W3bt2iWaNGkiNmzYIJKTk8XBgwdFWFhYZYclSdIDkJWVJQCRlZVV2aHcF4PBIMzMzMT06dNFnTp1hLOzswgODhZXr14tM31ycvJ/otySJN2boqIi4eHhIT755BNRVFQktm3bJiwtLUVQUJCSpl27dsLc3FwA4uWXXxYGg0EIIURUVJQwNTUV3333ndDr9WL27NnC3t5e9O/fX1y9elWcPXtWWFtbiz/++KPMvCdPnizUarVYvHix0Ov1YuHChUKj0YgrV64IIYR44YUXhLOzs/jrr79EcXGxKCgoEEFBQWL8+PFCCCF27twp1Gq1CA0NFYWFhWLDhg3CwsJC9OjRQ6Smpoq0tDTh7u4uli1bpqR3dHRU8nd3dxdbt241igkQJ0+eFEIIUbduXbF27Vrl2OjRo8Wbb74phBBiw4YNwt3dXcTFxQmDwSAiIiKEp6enKCwsFPHx8cLS0lLEx8cLIYRITU0Vx48fL/Me3BxTamqq0Ol0Yu3atcJgMIj9+/cLW1tbceLECSHE9c+iR48eFcXFxSIuLk44OjqK1atXCyGESEhIEIDIz89XrhcVFSVatWpllKejo6PYuXOnUgempqZixYoVori4WOTl5Yl33nlHdOnSRVy6dEnk5eWJoKAgERAQUGb8paKjo0XNmjWV71D//vuvOHv2rBBCiNmzZ4vmzZuLhIQEUVRUJCZOnCief/75Mu95RXknJycLGxsbERUVJQoLC0VWVpbYv39/ueWcPHmy6N+/vxBCiNOnTwtLS0uxadMmodfrxaJFi4RGoxHp6elCCCGCgoKERqMRMTExwmAwiLffflv4+PjcdVlL8ywsLBT9+vUTfn5+Sn3cXD/Xrl0Trq6u4rvvvhNFRUUiPj5euLq6il9++UUIIcThw4fFH3/8IfR6vUhMTBQNGzYUn3/+eZn3TYhb25IQQrRq1UpERUUp90itVosZM2aIoqIikZeXd9u6KUt+fr4AREJCgtHzo0aNEi+99JIQQojatWuLL774QjlWWvZz584Je3t7kZ+fL9LS0oSDg4M4efKk0X258W/8RjeX92lzJ59T5VC9h6x0FNzAgQOZM2cOZ86c4eWXXyYpKYnJkyfL6UlPqWPHjrFw4ULGjh3L8ePHWbJkyQO5rk6nu+2vgtLTqVGjRvTu3ZsePXqQn5+PjY0NYWFhcikJSZIeG5cuXUKv1/Pjjz/y22+/ceLECS5dusSYMWMqOzRJkh5D+/bt49q1a0yYMAEzMzM6d+5Mly5djNLExMSQk5PDunXr6NKli9HmLC4uLoSEhGBqasrAgQPJyMhgzJgxaDQavLy8aN26NYcPHy43/6ZNmzJo0CBMTU0JCgrC09OTzZs3K8cHDx6Mt7c3KpUKCwuLW85XqVRMmTIFc3NzevTogbm5OQEBATg7O+Pg4EDnzp0rzL8iAwcOZNmyZQAUFxezcuVKAgMDAZg7dy7jx4+ncePGqNVqhg0bhomJCfv378fU1BQhBH///Tf5+fk4OzvTsGHDO8pzyZIlvPTSS/Tq1Qu1Wk2rVq3o3bs3q1evBqB9+/Y0adIElUpF48aN8ff3Nxplei9atGjBgAEDUKlUVKlShYiICGbNmkX16tWxtLTk448/ZvXq1RVOpTc3N6egoIDjx4+j1+vx9PRURurPnTuXqVOn4uHhgZmZGeHh4Rw8eJCkpCSjawghKsx76dKl+Pj4EBwcjLm5OTY2NrRq1eqOyrhy5Uq6dOmCr68vpqamylJx69evV9L06tWLdu3aoVarGTx4cLntpqKywvV1Sn19falatSo//fRTuet9btq0CWdnZ0JCQjAzM6NevXq8+eabrFixAoBmzZrRtm1bTE1NcXd3Z+jQofdd19WrV+fdd9/FzMwMS0vLO66b2yksLGT58uXK38fAgQPL3ACoevXqtGnThnXr1rFs2TJ69+79xK2H+jiTHZMPWek0bZVKha+vL6NHj2b+/PlkZWVRXFzMM888IzsGnhI31rOFhQXt27dn8+bNuLi4MGLECH7++WdlIf97JdeZlMpjZ2fHBx98wLJlyxgzZgyffPIJY8eOVaZ6iHKWGJAkSXpUqlatCsCoUaOoWbMmWq2WiRMnsmnTpkqOTJKkx1FqaiouLi5Gy2K5u7vfks7c3Bw/Pz+2bNnChg0blOdvnDpa+vpz83MVbSjj5uZm9Njd3d1oDcuyYrmRTqczWk6natWqd5V/RQIDA9m8eTPZ2dns2LGDKlWqKGvpJSYm8t5776HVapV/Fy5cICUlhVq1arFo0SJmz56No6Mjvr6+xMfH31GeiYmJrF+/3ui6K1eu5MKFC8D15cw6duyIg4MaXIaOAAAgAElEQVQDGo2GuXPnKmv+3asb73F6ejp5eXm0adNGyb9x48aoVCouXrxY7jU6duzIlClTCAsLw8HBgQEDBpCamqqUqX///sr17O3tUalUt0wZvl3eSUlJ97xbekpKCh4eHkbPeXh4GLW1m9vNtWvX7rqscH26+d69e/nwww8r3GE9MTGRv/76y6iuZ8yYodzn06dP0717d5ycnLCxsSE0NPS+69rV1dVo0+A7rZvb2bBhAwUFBfTt2xe4/rfz999/c+jQoVvSBgcHs3DhQhYuXEhwcPAtx83MzNDr9UbPlT6WS2dVTHZMPiJCCLp160ZYWBivvfYawcHBfPjhh5w4cUKuMfkUKN39+MiRI0yePJm6deuSmJjIW2+9xbfffsvUqVM5dOjQfe8qKkdMShVZunQpSUlJ1K9fnw4dOtCpUyflV2z5A4kkSZVNq9Xe8sVDkiSpPDVq1CAlJcXoM0xFo6UMBgNnz559YPnfnFdSUhIuLi7K44f5Wna7a3t5eeHt7c2aNWtYunQpAQEByjlubm7MmTOHq1evKv/y8vLw9/cHoF+/fuzatYuLFy9Sq1Yt3nzzzTuKyc3NjQEDBhhdNzc3l7lz5wIQEBCAr68vSUlJZGVlERISovwwXlZ5rKysyMvLUx7r9fpbBmDceJ69vT2WlpYcOXLEKIaCggKjeinLiBEjiI2NJSEhAYPBwPjx45Uybdiwweh6+fn5yjqJd5q3m5sb//zzT5l5364uXVxcSExMNHouMTHxtmW627ICvPjii0ybNo0XX3zRaBf4m7m5udG2bVujsubk5LBlyxYAQkJCqF27NqdPnyY7O5tp06ZVOAji5roGbulMvvk+3Wnd3E5UVBRFRUXUrl0bJycn2rVrh4mJSZmjJrt3787hw4cpKCi
gTZs2txx3c3O75b79+++/qFQqXF1d7yqup43sEXvEOnXqxGuvvQZc34xi1apV/Pnnn2zevJk9e/YAsoPgv0ilUnH58mXGjx9Ps2bNAJg3bx6DBw9m8uTJXLhwgU8//RS4v5FrcsSkVJbS15Rq1apx9epVRo8ezcaNGyksLFTeJCv6VVSSJOlReeONN5gzZw4XL14kJyeH6dOn07Nnz8oOS5Kkx1CbNm2wtLRkxowZ6PV6duzYQXR0NHB9842dO3dSWFhIUVERCxYsYN++fXTo0OGB5X/06FGWL1+OwWBgyZIlnD17lm7duj2w61fE0dHxtp2sgYGBzJ8/n3Xr1inTVOF6p9H06dM5evQoQghyc3PZuHEjOTk5nDp1ih07dlBQUICFhQVWVlZ3/BkxMDCQrVu3snHjRgwGA0VFRRw4cICTJ08CkJOTg62tLZaWlhw6dIjly5cr5zo4OKBSqYzK1LRpU2Uzn6KiIiZNmlTh92SVSsWwYcMYO3asMkozLS3NaMpzWQ4ePMi+ffsoKiqiatWqVK1aVSlzSEgIEydOVOLKzMxk1apVd533wIED2b17N0uWLEGv15Odnc2BAweA63WZkpJS7gam/fr1Y9u2bWzduhWDwcDSpUuJj4/Hz8+vwnLdbVlLjR49mrFjx9KxY8dbOkRLde/encTERBYsWEBhYSEGg4Fjx45x8OBB4Hpd29jYYG1tzenTp4mIiDA6/+b2W7duXYqLi/n5558xGAx8++23RiNCy3KndVOR1NRUfvnlF1avXs2RI0eUf3PmzGHFihW31Im5uTm//PILa9euLfN6r732Gr/88gubNm3CYDBw+fJlQkND6du3L+bm5ncV29NGdkw+ImX9EjJ48GDCw8NJTEzk4MGDjBw5kvT0dDmC8j/kxk7G3NxcNBoN3bt3B64Ps//ss8+YPn068+bNA66vAXM/v67KEZNSWUpfUzp27EheXh41a9bEycmJzMzMO/4VXJIk6VEICwujXbt2NGzYkFq1amFvb8+sWbMqOyxJkh5DZmZmrF+/njVr1mBra8usWbMYNGgQcH103bhx47C3t8fR0ZF58+axZs0aZYDA3YqJicHKysrouZ49e7JlyxZsbW2ZNm0aa9euRafT3Xe57kRoaCgzZsxAq9Ua7W59o/79+3Po0CHq1KljtE6kn58fkydPJigoCK1WS506dZT17gsLC5k4cSIODg7Y2dmxf/9+pVOprHtwo5o1a7Jlyxa++uorHB0dqVGjBqGhoUrnznfffcdHH32EtbU14eHhymAduP696IMPPqBjx45otVq2bdtGnTp1mDp1Kr6+vri7u+Pq6oq9vX2F92X69Ok0bdoUHx8frK2tadu2LbGxsRWek52dzfDhw7Gzs6NGjRpkZWUxffp0AN566y0GDBhA9+7dsbGxoWnTpmzbtu2u83Z1dSU6OpqIiAgcHByoV6+eshv2iy++iLe3N87Ozmi12lumYdetW5dVq1Yxfvx47Ozs+Prrr9m8efNt70WpV155RRkAU1FZb/T2228zZsyYcjsnrays2L59Oxs2bMDV1RUHBweGDh2qLEs2c+ZMVq1ahbW1Nf/73/+M6hpgypQpvPHGG2i1WubNm4eNjQ0RERG89dZbODo6cvHiRWX37/Lcrm4aNWqkrLNansWLF1OvXj169eqFk5OT8m/IkCGo1WrWrVt3yzne3t40aNCgzOvVr1+fNWvW8NFHH2Fvb0/Tpk2xs7Pj+++/rzAOCUyEXFisUuTk5LB9+3b8/PwYN24cgwYNIiMjg5SUFIYMGUJmZiZqtfq+p/ZKlU8IwaJFi2jfvj07d+7EzMwMPz8/li9fztGjR5U3eyHEfU/5yMrK4quvvuKDDz6QI+CkMh0+fBhXV1dOnz6NwWAgLi6OHj164OHh8UDaoCRJj152djYajYasrKyn6nPD+fPncXV1ferKLUlS5QsPDyc+Pp4ff/yxskORJEl6rN3J51TTRxyT9H+sra357bffSE1N5Y033uDQoUMEBwezbds2PvzwQ1JTUxk/frz8oP0fkJubS3Z2NitXrsTZ2RkTExOGDx9Ofn6+suYKPJh1aGxsbFCr1Vy9ehU7O7v7vp7033Hu3DlOnjzJ1atXWbp0KXB9indcXBwlJSW8/fbbsmNSkiRJkiRJkiRJeqRkx2QlKP3yP336dIKDg/n555+ZOnUqxcXFrF27lqioKH788Ufq1q1b2aFK9+DYsWN4enpiZWXF0qVL6d27N6+//jrR0dGcOXOGbt260bNnT4qLi9HpdBQXFz+w0Y0mJiZotVquXLkiOyYlI+np6ezdu5eSkhLOnz/P/PnzsbGx4cCBA3z++ee8/fbbslNSkp5wKSkpyjSqp0HpOl5PW7klSap82dnZ5OXl3fUOwFLlWLt2LRMmTCjz2KlTpx5xNJL0dMnJybltGjmVu5KUdkbl5ORgbm5OYmIi77zzDj179sTd3R21Wk2nTp1QqVRyFNMTJDMzk6ioKEpKSggICGD16tWcO3eO0NBQbG1teeedd7C2tmby5MmYm5s/lLpdvnw5tWvX5rnnnnug15WebHq9HjMzM3Jzc+ncuTOfffYZKpWKmJgYtFotw4cPr+wQJUm6R1evXsXJyanchfP/y8zNzSkqKqrsMCRJkiRJkqRyWFhYcPHiRbRabZnHZcfkY+DUqVMEBwczZ84cWrRoQUlJCb///jvm5ua0a9eussOT7lJKSgqRkZFotVr8/Pw4fPgwW7Zswd/fn8jISKZMmWK0APWDtnXrVkxMTOjatetDy0N6sq1evZqMjAwyMzOxtbXFz8+PGjVqVHZYkiTdo9K1e5KTk5+qJWBSUlJo2LDhU1duSZLuzOTJk0lLSzNaOulxMWbMGKpXr05YWNhdn7t3715CQkI4evRouWlq1KjB7t27qV279v2E+UD4+vrSp08f/ve//z2U67dq1Yrp06fTsWNHAKZNm0ZkZCSFhYXs37+f1q1bk5CQgIWFxQPN937qUKpcMTExDBkyhDNnzlR2KE+F7Ozs264JLqdyPwbq1atHw4YNcXFx4eDBg4SHh1OvXj1SU1NJTk7G39+/skOU7sKGDRuIj4/HwcGBlStX0r9/f6pWrUpERARBQUEPtVMSru/M/e+//z7UPKQnW+fOnTl79ize3t5ERkYya9YsmjVrRkBAwANdWkCSpEfLxsbmie6gu3m314KCArp168aGDRvKTF86fftJL7ckSQ+HhYUFZmZmqFQqunXrxokTJygqKsLLy4spU6bg5+dXabH98MMP93xu165dSUhIUB57eHgQERFhNCghNzf3vuJ7kNRqNZaWlg/tdfrkyZPK/5OTk/nyyy9JSEhQfnR/EPeirM2O7qcOH5bg4GCcnJzK3Gn7QTAxMeHkyZPUr1//oVz/buzdu5fx48dz7NgxAGrXrs1HH31Et27dSExMxNPTk/z8fKpUqQLAJ598wrfffsv27dupVq0aJiYm8rPDY0RV2QE87YqLiwGYM2cOer2eqKgoiouLqVWrFp988gk7d+6kpKSkkqOU7oQQgnPnzrF27V
pmz57NxIkT0Wg0LFy4kGbNmrFs2TJ69uz50OOwtbXlypUrDz0f6cm1b98+Nm3aRGZmJomJifTt21cZTaBSybcFSZIqR25urvIvKysLR0dH+vXrV9lhSZL0hLOwsCAiIoJLly6RnZ3N3LlzCQwMJCUl5ZHHIoRQvv9JD965c+ewtbWVM4HK8bDbn8FgeGjXvlF2dja+vr688cYbZGRkcOnSJWbNmlVuR+P48eOZN28eMTExNGrU6KHFVVJSgpyQfG/kN9BKVjoyydLSkqtXr1JUVER0dDQ7duzgq6++4uWXXzbqKJAN/fFTWicmJia4u7vTt29ffv31V5ydnWnZsiXx8fGcOnVK+bXmYdPpdGRmZsq2IpXrmWee4eeffyYjI4Pc3Fxat27NSy+9JNezlSTpsREdHU1ubi59+/at7FAkSXpCxMXF0bJlS6ytrfH19SUzMxMAMzMzGjZsiFqtVj7r6PV6zp07B8DChQtp3bo1EyZMQKfTUbNmTTZt2sS2bduoV68eWq2Wd999t9x8w8PD6dOnD4GBgVhbW9OoUSN27dqlHO/QoQNhYWF06NCBatWqERsbS3BwsLIZy++//46TkxNff/01zs7O2NnZMX/+fP7880+8vb3RaDQMGjRI6fQpTQ/g7+9PUlISvXv3xsrKiokTJwLXv5fEx8cTGxuLTqczWos3NjYWW1tbZV3ipUuX8swzz6DVavHx8eH48eNK2pkzZ+Lq6oq1tTVeXl5GIwZvVFJSwhdffEHdunWVe3D48OFb0iUkJNCpUyfs7Oywt7fH399fqaeK8vv333958cUX0Wg02NnZ0b59e+UcDw8PoqOjiY6O5uWXXyYtLQ0rKyt69+5NYmIiJiYmFBQUAJCVlcXw4cOpWbMmGo0GHx8f8vPzAXjnnXdwc3PD2tqa5s2bK3W4adMmPv30U3766SesrKzw8PAAMKpDgEWLFlG/fn20Wi0vvvgi8fHxRm3gww8/pGPHjlhbW/P8888r7a8sBw8epH379tja2tKgQQN+/vlnABITE9HpdMTExADXO+g8PT1ZunQp3333HcuWLePLL7/EyspKuUdltb+tW7fSvHlzbGxscHV15cMPPyw3llJt27YFoEWLFlhZWREZGanc34ULF+Lp6UmTJk0A2LZtG88++yxarZbmzZsr8ZbGXFoHTk5OjBo1Sqmfy5cv4+fnh62tLba2trRq1YqMjIxbYjl9+jR6vZ6goCBMTU2xsLDAx8fnlmXwhBCEhISwbt06YmJiqFWrVpllu3jxIv3798fR0RFXV1fCw8OVwWG3a7MeHh7MmDGDFi1aULVqVS5evIiJiQmRkZHUr18fjUZDYGCgXA/7doT0WPH39xcLFiwQ//zzj/j0009FQkKCmDZtmlixYoUQQoiSkpJKjlAqyy+//CLeeecdMXr0aPH999+L5cuXi4kTJ4oBAwaIr7/++pHGotfrRXh4uMjKynqk+UpPli+//FKMGTNGxMbGCiGEyMjIkK8vkvSEysrKEsB/6nW/b9++YujQoRWmSU5O/s+VW5Kke1NUVCQ8PDzEJ598IoqKisS2bduEpaWlCAoKUtK0a9dOmJubC0C8/PLLwmAwCCGEiIqKEqampuK7774Ter1ezJ49W9jb24v+/fuLq1evirNnzwpra2vxxx9/lJn35MmThVqtFosXLxZ6vV4sXLhQaDQaceXKFSGEEC+88IJwdnYWf/31lyguLhYFBQUiKChIjB8/XgghxM6dO4VarRahoaGisLBQbNiwQVhYWIgePXqI1NRUkZaWJtzd3cWyZcuU9I6Ojkr+7u7uYuvWrUYxAeLkyZNCCCHq1q0r1q5dqxwbPXq0ePPNN4UQQmzYsEG4u7uLuLg4YTAYREREhPD09BSFhYUiPj5eWFpaivj4eCGEEKmpqeL48eNl3oNZs2aJevXqiWPHjomSkhJx+vRpkZiYqJR/7ty5Qgghzp49K7Zt2yYKCgpEenq6eOGFF8TIkSOFEKLC/AYMGCCGDRsmioqKRFFRkdi9e3eZ5b/53iQkJAhA5OfnCyGE8PPzE7169RJpaWnCYDCIPXv2iIKCAiGEEEuXLhXp6elCr9eLWbNmCQcHB3Ht2jWljvv3729U5pvrUKvVin379onCwkIxdepU4eXlpVz7hRdeEG5ubuLYsWOisLBQ9O3bVwwaNKjMe5mamip0Op1Yu3atMBgMYv/+/cLW1lacOHFCCCHEkiVLhLu7u7h69aoIDAwU/v7+ZcZUqqz2t2vXLnH06FFRXFws4uLihKOjo1i9enWZ8dzoxnZ14/3t37+/yMrKEnl5eeLIkSNCp9OJmJgYUVxcLDZu3Ch0Op1IT08XQgjRp08fERQUJLKyskRmZqbo3LmzCAsLE0IIMWHCBNG9e3dx7do1YTAYxKFDh0ROTs4tcWRlZQk7Ozvh7+8vNm7cKNLS0oyOl8b12muvicaNG4uLFy8aHb+xnRQXF4uWLVuKsLAwkZ+fL1JSUoS3t7eIjIwUQlTcZoW43v4aNWokzp49K4qKioRer1deYzIyMsSlS5eEl5eXWLBgwW3v73/VnXxOlSMmHxOlPfILFiygqKgIDw8PevbsSUREBDt37mTx4sUcPXpUjmZ6jIj/G5F49epVJk2aRPfu3encuTMrV65EpVLxwgsv0L59e0aPHm2U/mEzNTXFxsbG6JccSbrZmDFjGDduHC1btkQIgZ2dnXx9kSTpsZCRkcHGjRsZMmRIZYciSdITYt++fVy7do0JEyZgZmZG586d6dKli1GamJgYcnJyWLduHV26dDFaU9vFxYWQkBBMTU0ZOHAgGRkZjBkzBo1Gg5eXF61bty5zBGCppk2bMmjQIExNTQkKCsLT05PNmzcrxwcPHoy3tzcqlarMTVhUKhVTpkzB3NycHj16YG5uTkBAAM7Ozjg4ONC5c+cK86/IwIEDWbZsGXB9GbGVK1cSGBgIwNy5cxk/fjyNGzdGrVYzbNgwTExM2L9/P6ampggh+Pvvv8nPz8fZ2bnctfIjIiKYOnUqzzzzDCYmJtSpUwd3d/db0nl5edG5c2csLCywt7dn7NixysjEivIzNzfnwoULnDt3DjMzM3x8fO76Ply8eJH169cTGRmJg4MDarWa559/XqmPgQMHYm9vj6mpKWPGjEGv1xutX1mRpUuXEhwcTOvWrTE3Nyc0NJT8/Hz27NmjpHn99dd55plnlLotrz6XLFnCSy+9RK9evVCr1bRq1YrevXuzevVqAAIDA2nbti0+Pj7ExMTc0eZON7e/9u3b06RJE1QqFY0bN8bf399olO/dCg8Px8bGBktLS77//nveeOMN2rVrh0qlonv37nh7e7NlyxbS0tLYsGED33zzDTY2Nmi1Wj744ANWrFgBXK/ny5cv888//6BWq5XRmTezsbFh7969VK1alZEjR+Lk5ETHjh35559/jNJFR0fTu3dvHB0dy4390KFDJCcnM3XqVKpUqUKNGjV45513lJgqarOlRo0ahZeXF2ZmZpiaXt/GJSwsDDs7O6pXr46vr+89//0+LWTH5GNCpVIhh
MDS0hJ/f3+2bt1KUFAQjRo1Ytu2bbz00kssXbq0ssOU/k9JSQkmJiacP3+enTt30qxZMzp27Iivry/vv/8+V65c4eWXXyYkJMQo/aMi15mUbueff/655Q2ytPO8dEqLJElSZVi2bBm1a9emVatWlR2KJElPiNTUVFxcXIyWwCqrY8zc3Bw/Pz+2bNlitLFW6dRogKpVq5b5XEWbqLi5uRk9dnd3N1rDsqxYbqTT6TAzMzPK727yr0hgYCCbN28mOzubHTt2UKVKFaVjLzExkffeew+tVqv8u3DhAikpKdSqVYtFixYxe/ZsHB0d8fX1NZqefKOkpKQ72gH80qVLDBgwABcXF2xsbJROYKDC/D7//HNq1KhBx44d8fLyuqfNXZKSktBoNDg4OJR5fObMmTRo0ACNRoNWqyUrK6vMacRlSUlJUaZ4w/Xv9m5ubkZt4E7rMzExkfXr1xvVycqVK7lw4YKSZvjw4Rw7doyhQ4ei0WhuG9/N7e/AgQN07NgRBwcHNBoNc+fOveOy3u76iYmJfPPNN0bx79+/n9TUVBITEykuLsbV1VU51r17d9LS0gB47733aNeuHX379qVGjRq8//776PX6MvOsW7cu8+fP59y5c/z777+Ym5szaNAgozSbN2/m66+/ZubMmeXGnpiYSHp6Ora2tkpMI0eO5NKlS0DFbbas8pd6UH+/TwvZMfkYMTExITU1lS+//JLFixfTpEkTBg0axJUrV7CwsGDAgAGVHaL0f1QqFdnZ2cybNw9bW1tcXV3p168f+fn5/P777yQmJt6S/lEqXWdSksrj6upK8+bNAZROcxMTE7KysggPD5dvnpIkVZqoqChef/31yg5DkqQnSI0aNUhJSTHaNDQpKanc9AaDgbNnzz6w/G/OKykpCRcXF+XxwxygcLtre3l54e3tzZo1a1i6dCkBAQHKOW5ubsyZM4erV68q//Ly8vD39wegX79+7Nq1i4sXL1KrVi3efPPNMvNwc3O7ZbRaWcLCwigpKSEuLo7s7GyWLVtmNKusvPyqV6/O3LlzSU5O5ueff+bzzz/n119/vaP7c2OM5XU2xsTEMG3aNFauXElmZiZXr15Fo9EY7SVQERcXF6PvfyUlJSQnJxu1gbuJc8CAAUZ1kpubq4yMLCgoICQkhCFDhvDFF18Y5VtenDc/HxAQgK+vL0lJSWRlZRESEnJfs/tuvL6bmxvvv/++Ufylo5nd3NwwNTUlLS1NOZaVlaV877CysmLGjBmcOXOGmJgY1q1bx+LFi2+bv7u7O2+99RZ///230fMtW7Zk27ZtTJ06lVmzZpV5rpubGzVr1jSKNzs7W1lr9XZt9ubyS/dGdkw+ZjIzM7G2tmbVqlVkZmby1ltvMX36dNLT0/ntt9/46KOPlGkBcrfuR+/Gez579mx27drFCy+8QGhoKC1atGDcuHGkpKQQHh5eeUEiR0xKt1elShWysrKU0ZGZmZn8+++/XL58mejoaPbu3VvJEUqS9DQ6fPgwx48fv2XUgyRJUkXatGmDpaUlM2bMQK/Xs2PHDqKjo4HrUzV37txJYWEhRUVFLFiwgH379tGhQ4cHlv/Ro0dZvnw5BoOBJUuWcPbsWbp16/bArl8RR0fH23ayBgYGMn/+fNatW6dM4wYICQlh+vTpHD16FCEEubm5bNy4kZycHE6dOsWOHTsoKCjAwsICKysro+nvNxo6dCiTJk3ixIkTCCE4c+ZMmZu75OTkUK1aNTQaDampqUYj2SrKb9WqVSQnJwOg1WpRq9XlxlIeJycnevTowfDhw8nIyKC4uJi9e/dSWFhITk4Opqam2NvbYzAY+OSTT8jOzlbOdXR0JDExsdzv3wMHDmThwoXExsai1+v57LPPMDc3v2UzljsRGBjI1q1b2bhxIwaDgaKiIg4cOKBMK3/33Xdxc3Nj/vz5jBgxgsDAQGWnbUdHR/7999/b5pGTk4OtrS2WlpYcOnSI5cuX31Fsd9LWhg4dSmRkJHv27KGkpIT8/Hx27tzJ+fPncXJywtfXl7ffflvZrDU5OVn5W920aROnT5+mpKQEGxsbzMzMyqzn+Ph4Pv/8c5KSkhBCkJaWxvz582nTps0taVu1akV0dDTh4eF8/fXXtxxv2bIlDg4OfPzxx1y7do2SkhLOnDmjTNeuqM1KD47smHzMNGrUSNn17fvvvyc0NJQePXoQHh6OTqejfv36fPzxx2RmZj7yUXhPu5KSEuWeX7x4kQ8++IDmzZszbtw4AMaPH89nn31GVFQUlpaWyhtEZZAjJqWKlP7Kt3r1ambNmsWePXtYtWoVq1atYuXKlfTv399oOookSdKjEhUVha+vb4XrQUmSJN3MzMyM9evXs2bNGmxtbZk1a5byA4der2fcuHHY29vj6OjIvHnzWLNmDc2aNbunvGJiYm5Z965nz55s2bIFW1tbpk2bxtq1a9HpdPddrjsRGhrKjBkz0Gq15e6u3L9/fw4dOkSdOnWM1on08/Nj8uTJBAUFodVqqVOnDkuWLAGgsLCQiRMn4uDggJ3d/2PvvsOjKvO3gd+TmUzaTCa9h/RIFUILHUGRLk0MCoKCCv50l6zA0lQiKFJ1laWIgkBApUmvovTei4AJSQghlfSeac/7R96ZJZJAAkkmhPtzXVy7mTkzz31OzjlOvvMUR5w6dQrLli0D8OAxGD9+PEaOHIn+/ftDqVRi8ODB5XaSmDFjBq5cuQI7Ozv06dMHAwcOND73sPbOnz+P9u3bw8bGBl26dEF4ePhjFZZXr14NlUqFZs2awdHREVOnToVer0fPnj3Rt29fNGzYED4+PjA3N4e3t7fxdUOHDoW5uTkcHR3LXdm5W7duWLhwId588004Oztj//792LVrV7nziZanSZMmxnlAvby8sHv3bvznP/+Bq6srPDw8MMAS06kAACAASURBVHXqVJSUlGDPnj3YuHEjVq1aBYlEghkzZkCj0WD27NkAgDFjxiA6Ohr29vbo1q1bhe0tWbIEM2fOhFKpREREBIYOHVqpnJ999hneeecd2NnZ4fvvvy93m5YtW2L16tWYNGkSHB0d4ePjg4ULFxqLuqtXr4a5ublxxfmePXsiKioKQOlUU7169YJSqcTzzz+Pl19+2Xgdjxs3DuPGjQMAKJVKnDt3Dh06dIBCoUCLFi2gUCiwevXqcjO1a9cOe/fuxaeffopFixaVeU4qlWLHjh2Ijo5GUFAQ7O3t8dprrxmHzj/snKXqIxG1tSIHVYmhCJaYmIi1a9di8uTJGDFiBL744gvExMTA2toa7dq1gxCCXYdrwf3HefTo0cjLy4OjoyNmzpyJb7/9FrGxsZX+pqk2JCUlITIyEpMnTzZ1FKqDDPeXyMhIzJkzB+Hh4bCxsUFwcDD8/f1r7YM0EVWP3NxcqFQq5OTkwNbW1tRxas3du3fh7e39zO03EdUtERERuHnzJn755RdTRyEiqnMq8zlVVsuZqJIMPfM8PT1x/fp1zJ07F3379kVGRga6d+9u3I5FydphOM5Lly6FSqXCypUrsWjRIkydOhU//PADZs+ejfj4
+EdObF1bnJyc0LJlS+h0uioPc6D6z3B/efHFF3H48OFy5wvilx5ET5/ExMQyQ8/qO0Nvhmdtv4mobsnNzUVhYSHu3r1r6ihERHVOXl7eI7dhj8k6zNCrKT8/H59//jkyMjIwefLkMiueCSHw66+/oqCgACNHjjRh2vrHUJgx/G9JSYlxmP2cOXNgY2ODKVOmoH379hgwYECZ19QFGo2mzOp+ROVRq9WQy+UQQhjvOXXlHCaiysnOzoabmxtKSkpMHaXWyeVyqNVqU8cgIiIiogpYWFggJSUFdnZ25T7PwmQdd/+8hn9XWFiI5cuXQ61WY9euXRg7dizeeOONWk5Yf2VmZhqHtBp+D2q1GrNmzYJGo0GbNm2wdOlSfPLJJ+jatauJ0xI9nh07dsDd3R2tW7c2dRQiekyGITIJCQnP1JDmxMRENG7c+Jnbb6oZM2bMQFpamnHl27okPDwcLi4umDZtWpVfe+LECbz//vu4fPlyhdt4eHjgyJEjZTo/1KaoqCi0adMGOTk5JmnfVGJiYvD2228jJiYG48ePx7///W9TR3pAaGgo5syZ89D5CqluCgoKwsqVK9G5c2csWLAAMTExj3V/e5LXEgGln1MfNfUOh3LXcfcXJbdu3Qq9Xo/BgwcDKF1VNysrC3l5eVi1ahU+//xzDBgwADY2NqaKW2+kp6ejS5cumDt3Lvr37w8zMzNoNBrI5XLMmjULCxYswC+//IJ//vOfLErSU61Dhw6QyWTQarXIzc1FTk4OCgoK0LRp0zrVA5iIHs3W1vapL9Ddvn0bH3zwAU6ePAmZTIZevXph8eLFUCqVD2xrGL5dH/abTM/CwgLm5uYwMzNDnz59cP36dajVavj7++Ozzz4zjo4xhZUrVz72a3v16oW4uDjjz76+vli2bBl69eplfCw/P/+J8j0pwyIqlbmODx06hGHDhiElJeWJ233hhRcwbNgw44IatW3JkiXo2LEjLl26ZJL2/66842FYCbo+KO/cr06rVq3CsmXLcOrUqRp5/6qSSCSwsbGBra0tZs6cWanXlLcPlX1tdYuJicEnn3xiXKnd09MTYWFhmDRpEusd9RCXdX6KDBw4EAMHDsTp06cBlBYtJ02ahJiYGPj6+mLKlCmcT7CamJmZITAwEAsXLsTPP/8MAGWGRU+cOBG9e/fGb7/9hvj4eFPFJHoiaWlpyMzMxPXr1/Hzzz9j7dq1mDp1KmbMmMGiJBGZxHvvvQcHBwckJibir7/+wt27dytcZZaoJlhYWGDZsmVITU1Fbm4uli5dihEjRiAxMbHWswghoNPpar1dKqXX61GTgwvj4uLQrFmzx3qtVqut5jRUl49pXc5WE2JjY9G2bVs4OjriwoULyM3NxZ49e5Ceno5bt26ZOh7VABYmnzIpKSmYPn06YmJiAABnz56Fn58fJBIJgoKCYGlpaeKE9YODgwPGjh2LV155BZs2bcLSpUuRl5eHoqIibNu2DXv27MHo0aPRqFEjk3/LXJ69e/di6tSpmD17Ns6dO2fqOFRHbdmyBRs2bMCiRYuwf/9++Pr64osvvoCFhQUuXrwIADX6gZyI6O/i4uLw+uuvw8rKCvb29hgyZAiuXr1q6lhUD125cgVt2rSBUqlE3759kZWVBaD0i+jGjRtDKpUav6TTaDTGL6JXrVqFdu3aYcqUKXBwcICXlxd27tyJffv24bnnnoOdnZ1xTvLyREREYPDgwRgxYgSUSiWaNGmCw4cPG59/4YUXMG3aNLzwwguwsbHBmTNn8NZbb2HKlCkASnsMurm54ZtvvoG7uzscHR3xww8/4Pz582jRogVUKhXefPNNYyHDsD0AvP7667hz5w4GDRoEhUKB6dOnAyjtWXXz5k2cOXMGDg4OZeZtPXPmDOzt7Y1z2K5duxZNmzaFnZ0dOnfujD///NO47YIFC+Dt7Q2lUgl/f/8KV6kuLi7GO++8AwcHBwQFBeHAgQNlnl+zZg2aNGlifJ/FixcDAHJyctC7d2+kpaVBoVBAoVDgxo0biIuLw4svvghHR0c4OTnh9ddfN/4+KzJ58mQcPXoU4eHhUCgUePPNNwGU9qqbN28eWrVqBWtra6SkpFSY5/7ju2jRIri7u8PFxQXz5883Pn/u3DmEhobC1tYWzs7OGD58OACgS5cuOHjwoLH9s2fPIjc3F2PGjIGbmxu8vLzw0UcfGY/77du3IZFIsGrVKvj5+eH555+v8rmQm5uL/v37w8XFBfb29ujTpw/u3LnzyOOxd+9eAKXzkk+aNAleXl5wdXXFW2+9VWb4vUQiwfLly9GwYUOoVCqMGDHioXMAr1mzBk2bNoVSqURgYKCxncoch8jISPj5+cHe3h7/+te/jO8ZGxuL7t27Q6VSwdHREV26dAHw8HN/yZIlaNiwoXHuO8P1YDBlyhS89dZbxp/PnDmDLl26wN7eHm5ubvjyyy9x9epVjBs3DmfPnjWemwUFBRXuu+E+8tFHH8HOzg4BAQHYsGGD8fm33noL48aNM+bduHEjcnNzMW7cOHh5ecHNzQ0ffvghiouLja/5+uuv4enpCRcXFyxcuLBMexERERg2bNhj7cPfX7tnzx40b94cKpUKoaGhOHnyZJnc//d//4fBgwdDqVTi+eefL9MjuLL3iIiICLRt2xaLFi2Cl5cXgNJz8b///S+aN29u3If27dtDpVKhWbNm2LlzZ5nXDxkyBKNHj4atrS2CgoJw5swZrFmzBj4+PnBycsLXX3/92NtX97VAAAQ9NfR6vRBCiO+++068/PLL4rPPPhMvv/yy+OOPP0ycrH4xHOfLly+L5cuXCyGEGDx4sGjbtq1ISkoyZbRK+f7778WcOXPEpUuXxOnTp0X79u3FjRs3TB2L6qBjx46Ju3fvin/+859i8+bNQgghkpKSxODBg8XGjRtNnI6IKisnJ0cAEDk5OaaO8sT++9//iuHDh4u8vDxx79490bVrVzF//vxyt01ISKg3+021S61WC19fX/HFF18ItVot9u3bJ6ysrMSoUaOM23Tq1EnI5XIBQPTo0UNotVohhBA//vijkMlkYsmSJUKj0Yhvv/1WODk5ibCwMJGdnS1iYmKEUqkUx48fL7ftGTNmCKlUKtasWSM0Go1YtWqVUKlUIjMzUwghRNeuXYW7u7u4ePGi0Ol0ori4WIwaNUpMnjxZCCHEwYMHhVQqFVOnThUlJSVi+/btwsLCQvTv318kJSWJtLQ04ePjI9atW2fc3tXV1di+j4+P2LNnT5lMAIyfFYODg8WWLVuMz/3zn/8U7777rhBCiO3btwsfHx9x5coVodVqxbJly4Sfn58oKSkRN2/eFFZWVuLmzZtCiNLPE3/++We5x2DatGmibdu2IjU1VaSmpop27dqJ+/8s3bVrl4iOjhZ6vV4cOXJEWFlZiTNnzpS7P0IIERMTI/bt2yeKi4uN940PPvig3Lbv17VrV7F06dIyj/n4+IgmTZqImJgYoVarhUajeWQeqVQqJk2aJEpKSsTp06eFubm5iI6OFkII0a5dO/H5558LnU4nioqKxLFjxypsf9S
oUaJnz54iKytLJCcni7Zt24pp06YJIYSIi4sTAERYWJjIyckRhYWFVT4XsrKyxMaNG0VBQYHIy8sTYWFhom/fvo88HobzZcaMGSIkJEQkJiaK7Oxs0b9/f/HGG28YtzVcK+np6SI1NVX4+/uLFStWlHvst2zZIlxdXcXx48eFXq8Xd+7cEdevX6/0cRg5cqTIz88XMTExQqVSid9//10IIcSwYcPE2LFjhVqtFmq1Whw5cqTcfbk/c9euXUVqaqooLCw0Pnb/306TJ0823hsSEhKEra2t+PHHH0VJSYnIyckRp06dEkKU3htCQ0PL3d+/+/HHH4VUKi1zD7K0tBRRUVHGY2BjYyP++OMPodfrRWFhoRg8eLAYNWqUyMnJEVlZWeLll182Hpf9+/cLBwcHcf78eVFUVCTGjh0rpFKpOHjwoPF3FxYW9lj7cP9ro6KihJWVldi5c6fQaDRi9erVQqVSiXv37hlzq1QqcfToUaHVasX48eNF586dhRCiSvcIV1fXCs8dIYTIzMwU9vb2Yvny5UKj0Yh9+/YJa2tr4zk0Y8YMIZfLxbZt24RWqxUTJ04UDRo0EGPHjhWFhYXi1KlTwtzcXNy5c+ext6+ua+FZUJnPqSxMPqUuXLggfv/9d5GVlVXmcZ1OZ6JE9dOnn34qNmzYIF555RUREhIifvnlF+NzhgJmXWH4wDxo0CCxd+9e4+Pjx483/ke5rmWmumHLli3itddeE4sXLxbffPON2Llzp6kjEVEV1KfC5LVr10SrVq2EmZmZACBefvllUVJSUu62LEzS4zp8+LBwdnYu87l54MCBZQqTQghRUlIitm7dKhYsWGB87McffxQ+Pj7GnzMyMgQAcfLkSeNjPXr0EIsWLSq37RkzZoiWLVuWeaxFixYiMjJSCFFaHDIUIQ3+Xpg0NzcXarXa+LxSqRQ///yz8ed3331XTJgwwbh9VQqTn332mXj11VeFEKWfLV1dXcXhw4eFEEL07t1bLFmypMxr/f39xeHDh8WtW7eEpaWl2LRpk7HAUxE/Pz+xfft2489bt24VD+svM3DgQOPvoLzC5N9t3bpVNG3a9KHbCFFxIe7vjz0qj7m5eZn7VLNmzYxf8Hbp0kW8++67IiEh4aHta7VaIZfLxaVLl4zP79mzR3h5eQkh/leQu79gVtVz4e8uXbokFArFI4+H4XwJCAgQW7duNT5348YNIZVKjfsOwFgIE0KIf/zjHxUWiHv27Fnul06VPQ5xcXHG5/v37298r5EjR4pXXnnFWBiuaF8MADz0ehCibGHyyy+/LFPMvV9VC5Pl3YNmzZolhCi95g3FQCGESE1NFTKZTOTm5hofO3LkiPDz8xNCCPH222+Ljz76yPhcVlaWkEgk5RYmq7oP97921qxZYuDAgWWeDw0NFT/88IMx9/330fPnzwsbGxshhKjSPUImkz3we7lfZGSkaNGiRZnHwsLCxMcff2zM3LVr1zI5AIjk5GTjY0FBQWLHjh2PtX11XgvPgsp8TuVQ7qeM+P/DKkNCQtCmTRtjt3agtNv777//juTkZACl86JQ5fx9/h7Dz4ah3MOHD8eFCxcQFhZm3Kauzb9nyDNgwABs3rwZM2fOxDvvvINr164ZhxIIDsulcgwcOBArVqyAnZ0dZDIZioqKnmiifSKix6HT6dCrVy/07dsXBQUFyMnJgaenJ0aMGGHqaFTPJCUlwdPTs8wikz4+Pg9sJ5fLMWDAAOzevRvbt283Pm4YGg0A1tbW5T72sKl+GjRoUOZnHx+fMnNYlpflfg4ODmXmPre2tq5S+w8zYsQI7Nq1C7m5uThw4AAsLS3RuXNnAKXDaCdNmgQ7Ozvjv+TkZCQmJiIgIACrV6/Gt99+C1dXV/Tt27fMcNj7JSUllTkGf9/fPXv2oF27dnBwcICdnR127dqF9PT0CjOnpqZi2LBh8PT0hK2tLYYPH/7Q7R+lqnkcHBwgl8uNP99//FeuXInCwkK0bNkSTZo0qfDzVXp6OtRqNXx9fY2P+fr6Ijk5uczn979nq8q5UFhYiLFjx8LHxwe2trbo3Lkz8vPzy/w9+TCJiYkP5NPpdGUWIqrseXjnzp1yV4Gv7HGoqJ358+fDw8MD3bp1g7+/P+bMmfPI/XrU9VaZ3I+jvHtQRfeB27dvQ6fTwdvb23jt9evXD2lpaQAevKbs7OwqXEzqSfbh7+cAUPr7uT/33383hr9Dq3KPcHR0RFJSUrXmKO+x+8/PqmxfndcClWJh8ilzfzEsMTERFy5cgF6vR0xMDDZu3IhZs2bho48+AlC6gAsLUY+m1+shlUqRn5+P9evXA4BxEaEmTZrg448/xmuvvWbctq4eU8N/2AYNGoQGDRpg06ZNSElJgb29Pe7evVtmG6K/27ZtG7Kzs2FhYYHTp0/j119/Na7EWFfPeSKqX7KysnD37l384x//gKWlJWxtbfH+++9j9+7dpo5G9YyHhwcSExPLfIlvmGuvPFqt1ji/e3X4e1t37tyBp6en8eea/PL7Ue/t7++PFi1aYNOmTVi7di3eeOMN42saNGiA//73v8jOzjb+KywsxOuvvw4AeO2113D48GGkpKQgICAA7777brlteHh4lDkG9///kpISDBkyBOHh4UhNTUV2djb69u1r/CxSXv5p06ZBr9fjypUryM3Nxbp16yr12aWiY3H/44/K8ygBAQFYu3YtUlNTsXjxYowbN67cxTucnJwgl8tx+/Zt42O3b9+Gu7t7mTxPcm4sXLgQ169fx6lTp5Cbm4ujR48CwEOP7f08PT0fyGdmZlamAFNZDRo0eKLjUBEXFxcsXboUCQkJ+PXXXzF//nz8/vvvACr3+wYAGxsbFBYWGn++v9hUUe6HvX9FyrsHVXQfaNCgAWQyGdLS0ozXXk5OjrHY9fdrKjs7G7m5ueW2+yT78PdzACj9/dyf+2Eqe494+eWXsWnTphrL8aSq81qgUqxSPMUaNmyITp064dy5c1i8eDFSU1Oxbt06uLm5Yfbs2QDqXq++ushQrBsxYgTS09PLfNBISEgwftskhICZmVmdP6YFBQW4cuUK9u7di507d2Ljxo145513WFyichnOCx8fHwQEBKBXr16YP39+mQnt6/o5T0T1g5OTE/z9/bFkyRKo1WoUFBRg+fLlxonuiapL+/btYWVlhXnz5kGj0eDAgQPGhTfOnTuHgwcPoqSkBGq1GitWrMDJkyfxwgsvVFv7ly9fxk8//QStVovIyEjExMSgT58+1fb+D+Pq6vrIIuuIESPwww8/YOvWrWV6LL///vuYM2cOLl++DCEE8vPzsWPHDuTl5eGvv/7CgQMHUFxcDAsLCygUCuMX/X8XFhaG2bNn4969e7h37x7mzp1rfE6tVqOkpATOzs6QyWTYv38/9u/fXyZ/VlZWmcVt8vLyYGNjA5VKhaSkJCxYsKDajsWj8jzKmjVrkJaWBolEAjs7O0gkknKPi1QqxbBhwzB16lRkZ2cjNTUVn332mXERmuqQl5cHKysr2NnZISsrC7NmzSrz/KOOx/DhwzFr1iwkJycjNz
cXU6ZMQVhYWJneopX13nvvYeHChTh16hSEELh79y5u3rz5xMdhw4YNSEhIAFDaa1AqlRqPd2V+30DpyMTIyEjodDqcOHEC27ZtK3MMjhw5gsjISGg0GuTm5uL06dPG909MTKx0D9TMzMwy96A9e/Zg6NCh5W7r5uaGvn37Yvz48cjKyoIQAgkJCcb7VlhYGFavXo1Lly6huLgY06ZNq7BDypPsw2uvvYZ9+/Zhz5490Gq1WLt2LW7evIkBAwY8cn+rco+IiIjA6dOnER4ebuw5eefOHYwfPx5XrlxBnz59cPv2baxcuRJarRYHDhzAjh078MYbbzwyR3WozmuBSrEw+RS7c+cOfv/9d6xbtw7p6emYNm0anJ2d4erqig4dOhi3Y0GqfPcfl+PHj6OgoADvvPMOJk2ahNmzZyMuLg4JCQmwt7cH8PQUZ5ycnCCVSuHh4QGg9BucPXv2PDBcnQj433ndqVMn9OzZE1FRUVi0aBF8fHyMKxnyHkJEtWXLli04fPgw3Nzc0KBBAyQlJWHNmjWmjkX1jLm5ObZt24ZNmzbB3t4eX3/9tbHwodFoMGHCBDg5OcHV1RXff/89Nm3ahJCQkMdq6+jRo1AoFGUee+WVV7B7927Y29vjyy+/xJYtW+Dg4PDE+1UZU6dOxbx582BnZ4dPPvmk3G3CwsJw7tw5BAUFoXHjxsbHBwwYgBkzZmDUqFGws7NDUFAQIiMjAZT2LJw+fTqcnZ3h6OiIU6dOYdmyZQAePAaffvopGjVqhODgYHTo0MHY4xIAlEolvv32W7zxxhuwt7dHZGQk+vfvb3y+YcOGGDFiBAIDA2FnZ4cbN25gxowZuHLlCuzs7NCnTx8MHDiwUsdi/Pjx2L59O+zt7TFq1Khyt3lUnkf57bff0KxZMygUCgwdOhRLliyBn59fudsahrg+99xzCAkJQdu2bfHpp59Wuq1HCQ8Ph1qthrOzM0JDQ9GjR48yzz/qeEybNg1du3ZF69atERgYCFtbWyxZsqRSbd+5cwcKhcLYq2/w4MH4/PPPjasgd+vWzbjy/ZMch/Pnz6N9+/awsbFBly5dEB4ebvxSoTLnvqH933//HXZ2dli4cGGZ89Pb2xt79+7FsmXL4OzsjOeeew6HDh0CAHTv3h0tWrSAu7s77OzsHroqNwC0bt0aaWlpcHZ2xnvvvYeVK1fiueeeq3D71atXw9zc3LjiuuFzOwD07NkTU6dORd++fdGgQQMEBATAycmp3Pd5kn0IDg7Ghg0bMHnyZDg6OuKbb77Brl27KmzrflW5R/j7++P06dNITU1F8+bNYWtri549e8LR0RGBgYFwcHDA7t278d1338HR0RHh4eH46aef0KhRo0fmqA5Pci1Q+SSCf3E+tY4fP47du3ejdevWWLJkCYYMGQIrKytkZGSgdevWuH37Nl599VVYW1tDr9dzGO99hBDGgkx8fDx8fHwwb9485Obmol+/fti8eTNcXV0xceJEEyd9PIMHD0Z0dDScnZ3h7OwMa2trrFixgucAPdSJEyfw22+/oUmTJsjLy8OKFStw7NgxU8ciokfIzc2FSqVCTk5OhXNK1Ud3796Ft7f3M7ff9HSLiIjAzZs38csvv5g6ChGZyKpVq7Bs2TKcOnXK1FGIalxlPqfKajkTVaOOHTsiODgYzs7OAEqH8MpkMigUCly+fBkHDx7EiRMnsGzZMhak7nN/UXL69Om4efMmJBIJ5s+fDz8/P1y5cgUnT56s9DCQusSwb7NmzYJKpYKnpyckEgkuXLiAyMhI40TXlflWi54906dPx8KFC9GyZUsApfNO3rhxo9a+fSSiJ1PRfFL1VV5eHoBnb7/p6VZSUmIcPklEz6aioiLodDreB+iZUJnznD0m6xGtVovVq1cDKB120K1bNyxfvhzjx4+HQqEoU5AjYP369fjjjz/w3XffYdOmTfjpp5+wcOFCXLhwAVKpFAMHDnxqj5larcbPP/+MgwcPwsbGBidOnIBEIsGSJUvQsGFD2NnZmToi1SE6nQ5SqRQffPAB3N3d4eHhgUuXLkGtVuPTTz81TgtARHVTcXEx/Pz8ykzQ/6ywsLCo9HxeRERERFT73NzcEBcXB0tLy3KfZ4/JesRQYw4ODkbnzp1x7NgxKJVKLF68GP369UOTJk04pPs+8fHxKCoqQkJCAl599VXk5ORg27ZtCA8PN27ztBUlDYXUY8eO4ffff8fChQvh5OSEU6dO4Y8//kC7du1MHZHqIMN5Hh4ejgMHDsDe3h5vvvkmgoODoVKpTJyOiB7F0tIScXFxUKvVpo5S6/i5hoiIiKhuk8vlFRYlAfaYrDcMBamCggLY2Nhg27ZtOHr0KPr164dr165h//792L59u6lj1gkHDx5Eeno6XnzxRaxatQrx8fHo2LEjNmzYgAEDBlTr6nemkp2djfXr12Ps2LHQaDSIi4vD6dOn68W+Uc0qLi7G7NmzcfnyZRQWFuLll1/GhAkTIJFInrpCPREREREREdVt7DFZTxgKBtbW1gCAK1euYOTIkXj++efh7++Po0ePIikp6Zkfkrl37158+eWXePXVVzFs2DAsXrwYR44cwYYNG9C7d+96Ubg7efIkkpOTUVxcjHnz5iE/Px95eXkQQmD48OHsWUIPtWTJElhbW2POnDlo1KgR+vbti2HDhsHb29vU0YiIiIiIiKieYWGynjEUKE+cOAE/Pz/cvn0bp0+fxgcffABXV1eo1WrI5XLjnHLPAkMvUqB0ouE5c+YgMzMTFhYWCAoKgre3N1xcXLBt2za0adMGzz//vIkTP5lz587B3Nwct2/fhkKhQJ8+faDX67FgwQLExsYiMDDQ1BGpDjIMh7x8+TLee+8944I3fn5+mD9/PtLS0vDSSy/hnXfeMXFSIiIiIiIiqi9YmKxnDMWFRYsWYd68eXBwcEBYWBh27dqFRYsWQSaTYeXKlbCyssLNmzfRsGFDU0euUdeuXUN+fj5KSkpw8uRJ+Pr64quvvoJarcamTZswbdo0yGQyzJw5EwUFBcjIyDB15CcWFhYGFxcXjBkzBoMGDUL79u1RXFwMCwsLJCQkIDAw8Kld1Idq3tChQzF9+nTY2NggMzMTLi4uGDRoENzc3NC1a1dTxyMiIiIiIqJ6hHNM1kOG4mR+fj5sbGwwevRoyOVyfPPNN1i1ahWSk5NRWFiI98QivwAAIABJREFUqKgobNiwARYWFqaOXGPS0tIwb948REZGYv78+Rg5ciTmzp2Lu3fvwsPDA9euXcO6detMHbNGrFu3DuvXr4ePjw8sLCwQEBCAUaNGGYf7E93PUKxOS0vDqlWr0KNHDzRv3hxFRUU4c+YMunXrZuqIREREREREVM+wMFnPxcbG4ssvv8T3338PANiwYQNmz56Nd999Fx988IGJ09Wc+4eqX79+HXPmzEGPHj3QvXt3eHp64uDBg/Dx8YGvry/MzMyg1Wohk9WvDsQlJSUoKipCSkoKEhMTodfrodFojEO7OdckPcyiRYuwf/9+CCHg4uKCBQsWwMHBwdSxiIiIiIiIqB6pX5UYe
oCLiwtiYmJw+PBhHDx4EIcOHcKKFSvg7OyM1NRUuLq6mjpijZBKpdDpdHjxxRexZMkSLF26FCtWrMDOnTuRlpYGiURi7AGm1+vrXVESACwsLHDo0CH85z//gUKhgEqlQuvWrQGARUl6qJUrVyIxMRFr1qyBvb298XFOAUBERERERETVqf5VY8hIp9NBoVBg3rx5GDVqFEJDQ/Hbb79h9+7deP/99/HFF1+gR48epo5ZYxYsWID+/fujcePGOHLkCJo3b46UlBTk5OSUWX27vhbptFotZs6cie3bt8PR0dHUcegpYOhJq9fr4eTkBHt7e+Tk5ODw4cPIzMzEW2+9xd62REREREREVG1YmKzHDEOZW7dujfXr16Np06aYNGkSUlNTsX37dri5udXr1bllMhni4uLw/vvvw8vLC2q1GhMnToRSqQSAel9gkclkaNGiBTIzM1FUVIQbN27g/PnzGDlyJDw8PEwdj+ogw/XQq1cvDB06FJs3b4aDgwMCAwMREhJSZhsiIiIiIiKiJ8XCZD1nGHrZtGlTXL9+HYWFhVizZg0A4NixY5g+fTq+//57BAcHmzhp9TEUWydMmIDDhw+jVatWuHfvHgYMGIBRo0YZC5P1ucBiKLqam5ujZ8+eaNSoEZo0aQJbW1vY2tqaOh7VcV5eXvj3v/+NNm3awMvLC7GxsXB3dzd1LCIiIiIiIqpnuPjNMyQmJgZvv/02Vq1ahV9++QXnzp3D5MmTERoaatzmaexFeP+8dxkZGXB0dIRer0d4eDhGjx4Nd3d3jBgxAhMmTECvXr2eiXnyDPuYk5MDhUIBAMjKysLJkydx7NgxeHl5oXfv3ggMDDRxUqqLDOfP119/ja1bt0KlUqFBgwbo1q0bhgwZUq97WhMREREREVHtYY/JZ4Rer0dAQACmTp2KPn36oGvXrli/fj3Wr1+PX3/9FU5OTpg0adJTV5QEYCwyzpkzBxKJBJMmTcK2bdtgb2+PFi1aAACWL18OPz+/MtvXZ4Z9NDc3x7Jly3DmzBk4OTkhPj4e165dw/bt2+Hi4mLilFRXSSQSnDhxArdu3cLhw4cBAHv37sXSpUsxZMiQZ+IaIiIiIiIioprHwuQzwlBw7N27N+zs7NC+fXu899570Gg0mDlzJsaMGQNHR0eMHj3axEkfz9WrV3HgwAHs27cP165dg7u7Ozp27AigtPeXoSj5rElKSsLevXuxZcsWyGQyaDQaRERE1Kuh+1S9DL0llUolLl68aHy8V69eiI6OBlC/p0EgIiIiIiKi2iONiIiIMHUIqh16vR4SiQTe3t64ceMGoqOjsWjRIqhUKvTr1w/Jyclo3LgxANT54c5arRY3btyAi4sLrl+/jj///BNpaWnIycnBxo0bodPpYGZmBl9f3zq9HzVNpVJBrVajVatW0Gq12LlzJzw8PNCoUSNTR6M6ynC9uLq6oqioCG3btkVmZibOnDmDTp06GedoJSIiIiIiInpS7DH5DLm/l1N2djb27NmDiIgIWFhYQKFQQKFQIDo6GkFBQZBIJHW6OKnT6bBv3z589dVXSEtLw9q1a+Hs7Ax3d3e8/fbb6NevHywsLPDCCy+YOqpJSaVSREdHIykpCYmJiTh//jxSUlIQEhICX19fU8ejOu7DDz/EqlWr8NNPP8HBwQHt2rWDr68vBg4cyHkmiYiIiIiI6IlxPN4zqn379njzzTdx6NAhXLhwAaNGjcLBgwfxww8/YPz48QDq7lyMOp0OFhYWGD16NFJSUmBtbQ2VSoXu3bujUaNGmDt3LoKDgzF27FhTRzUpvV4PAEhMTERaWhq2bt2Kd999FxYWFjh+/HiZbYjKc+bMGaSmpuLdd99Fo0aN8NJLL2HdunUA6u79gYiIiIiIiJ4eLEw+gwzFqAkTJqB79+64ePEigoKCkJqairlz5yI1NRVxcXEmTlkxqVSKe/fu4ZtvvsHGjRvRuXNnTJ06FUVFRdi+fTuee+45fPXVVwBKi5jPKiEEAMDX1xfff/89VCoVJBIJhg0bhoCAAACcK5DKZzh3bGxscP78efTu3Ru7d+/GqVOn4OPjA4DnDhERERERET05/mX5DLq/oKDVarFv3z58+OGHcHNzQ/v27eHv7w83NzcTJizf/b37nJ2dkZCQgE8//RQffvghfH19MWHCBGzZsgUDBw40bv8sDzU17PvEiRMxcOBABAcHw9LSEp07d0br1q1NnI7qMkNvyCZNmiAhIQFyuRzPP/880tPTER4ebuJ0REREREREVF9IhKFrDD2zli9fjuPHj2PlypVYsGABRo0aVScLk0BpD8hffvkFw4cPBwB88sknuH79OiIjIyGEgFwuh7m5eZ2eH9NU7t27h7Fjx+LXX3+FXq9njzd6KMM1FBUVheDgYGRlZcHe3t7UsYiIiIiIiKgeYWWC8N5778HLywsZGRn417/+VeeKkvf3lNRqtfjpp58wZ84cAMCsWbMghMCUKVNgY2PDouRDODg4wNnZGQCH4dKjGa6h4OBgAGBRkoiIiIiIiKode0w+4+p6z7n7i4zz58+Hq6srunbtiqlTp8Lf3x9NmjTBgQMHMHPmTHh6epo4bd2XkJCAvXv3wtLSEkOGDIG1tbWpI1EdptFoIJPJWOgnIiIiIiKiGiGNiIiIMHUIMp26XnAw5Js3bx5iY2MhlUrx119/YfTo0cjOzsbt27fxxhtvoGHDhtDr9XV+f2qbobA7ceJEWFlZQSKR4MKFCzh//jysra0RHBzM40YV+vPPPxEbGwtvb29TRyEiIiIiIqJ6SGbqAGR6hk6zdak4VVJSAgsLCwDAH3/8gatXr2LYsGHo27cvNm7ciKVLl+Kbb74xLvBS13t+mophASAbGxvExMQgLy8P7du3h7u7Ow4ePIi+ffuaOiLVYRqNBtHR0ejQoYOpoxAREREREVE9xErOM04IgaKiIkRHR5s6ipFarca+fftw69YtbN68GUlJSQgNDcXly5dx8eJFdOjQAbdu3UJCQoLxNSxKls9QbG7Tpg22b98OtVqNgIAADBs2DP/6179MnI7qOg8PDyQnJ4MzfhAREREREVFNYI/JZ5xEIsHdu3exdetW/OMf/4CVlZWpI0Eul8PV1RXDhg1DZmYmoqKiIJPJsHz5cqxYsQI5OTkICwuDr6+vqaPWeYaCbd++fdGyZUucPn0aBQUFOH78OLZt24aOHTsiNDQUoaGhJk5KdZGLiwu0Wi0yMzPh6Oho6jhERERERERUz7CbGSEoKAgeHh44dOiQqaMYNWvWDF9//TXatGmD3377DQAwZswYNGnSBEFBQWjVqhVKSkpMnPLpIZFIcOXKFURFRWHLli3Izs7G8ePHERgYiAYNGpg6HtVRUqkUrq6uSEpKMnUUIiIiIiIiqodYmCRIJBL06tUL58+fR1pamkmz6HQ65Ofn45VXXkF6ejqWL1+OzZs348cff8RHH32Epk2bIjAwEJs2bUJ+fr5Jsz5tvv32W3Tu3BkTJ07E//3f/2Ho0KFo2rQp3N3dTR2N6jAPDw8WJomIiIiIiKhGcFVuAgBYW1uj
uLgYFy9eRPPmzWt9IRzDytBarRZWVlbo1KkTFixYAJlMhokTJ+Kvv/6CQqHAq6++imbNmiEwMBBubm61mvFpZViZWyaTwdPTE3fu3MGxY8fQoUMHNG3a1NTxqI4rKChAdHQ0QkJCTB2FiIiIiIiI6hkWJsnI09MTBw8ehL29PZydnWutXcOK2pmZmRgzZgyCgoLQqFEj9OjRA5MmTUJaWho+/PBDtG3b1vgalUpVa/medoYic+PGjXHgwAF89913iI2Nhbm5Oa5fv44WLVpAp9NxASEql5mZGY4cOYKOHTvW+hcWREREREREVL+xEkFGlpaWeOmll7Bv3z5oNJpaaVMIATMzM6Snp+OHH35AdHQ0pk+fjlOnTsHBwQFDhgzhMNInZFhReffu3bh8+TL+/e9/o1GjRujZsyf27NkDACw4UYWcnZ0hhEBGRoapoxAREREREVE9w8IkldGiRQsoFAocP368VtqTSCTQ6XQYNWoULC0tMX36dCiVSoSHh2Pq1KnYt28fvv76awD/K7BR1RiKjv7+/rhz5w4aN26Mn3/+GWvXrkXjxo0BgL0lqUJmZmZwd3fnFwRERERERERU7ViNoDIkEgl69+6N48ePIzs7u1balEql6NSpE/r3748hQ4Zg7NixcHV1xdChQ7F+/XrI5XLjHJT0+AIDA2FnZwcHBweMGTMG3bp1wyeffGLqWPQU4AI4REREREREVBNYmKQHeHl5oUmTJti/f3+ttenv74/169fj7t27yM7Ohr29PRo2bAilUmkc7k1PRi6XY/78+bh69SpcXFxw7do1TJo0CSkpKaaORnUcC5NERERERERUE2SmDkB100svvYRFixYhLi4Ofn5+Nd5eWFgY1q9fj59++gn79+/H/PnzYW1tbVxRmp7MokWLcOLECchkMmi1WjRu3BhKpRLp6emIj4+Hm5sbjzVVyMPDA8nJycaFqoiIiIiIiIiqg0Rw4j6qwIkTJ3Dp0iWMGzeuVosReXl5UCqVLIJUo5iYGNja2mLnzp3Izc3F+PHjAQCLFy9GVlYWPv74Yx5vqpAQAl9++SXGjBkDV1dXU8chIiIiIiKieoJVCKpQaGgodDodzp07V2ttCiGgUCgAcEGW6uTv7w9nZ2eoVCqcO3cOO3bswPz58/Hbb78hJCQEAI83VUwikXA4NxEREREREVU7DuWmCkmlUvTu3RubNm1CkyZNcPv2bWi1WjRv3rzG2tTpdJBIJJBKpTXWxrPIMES7V69eUKvVOHToENq1a4c33ngDnp6eJk5HTwNDYdJQyCYiIiIiIiJ6UuwiRQ8VGBgIlUqFjz76CBEREdW+IE5JSQni4uIAAHfu3MGSJUtQXFxcrW3Q/1hbW6Nt27ZYuHAhQkJCMGvWLEyZMgV6vd7U0aiOY49JIiIiIiIiqm4sTFKF9Ho9tm7diqNHj+LMmTOwsbFBXl5etbZx5coVRERE4Oeff8b69esRGhoKGxubam2DygoLC0NOTg62bduGsLAwREVFISoqytSxqI7z8PBASkoKdDqdqaMQERERERFRPcHCJFWopKQEFy9ehFarRXBwMJKSklBQUFCtbSQmJqK4uBjffvstLl26hICAgGp9f/ofQ6/I5557Dps3b8aNGzfQqlUrNGjQwFiY5FpYVBF7e3uYm5sjLS3N1FGIiIiIiIionpBGREREmDoE1U3m5ubo2LEjrKys8NdffyE+Ph5SqRRDhw41zlkIAFqdHml5JYi5l4+Ld7JwLj4Tl+5k43pyDu5mFqFQXdrDytLcDGZmkjJt7NixA7GxsZBKpZBKpbh48SLatGkDa2vrWt3XZ4EQAhKJBHK5HBcuXEBoaChCQkLwwgsvoGnTpgBQ5vdKdD+JRILY2FhYWlrCw8PD1HGIiIiIiIioHuDiN/RQcrkcAwYMQEhICObOnYtLly5Bp9NBJpMhLa8Y5+OzcDwmA/nFGphBAkgAc6kZJBJACECj0wMC0ENAYWmOjgGOaOVjDxelJYQQuHr1KpKTk+Hl5QVHR0f06NEDKpXK1LtdLxlW3e7Xrx9atWoFd3d3AIBMxtsAVQ7nmSQiIiIiIqLqJBEcu0mVpFarceTIEYS064wdl5NwOTEbgASONnJYy6UP7W0nhEChWoeMAjUAgeaedmjrJsWoYUPg5eWFMWPGoGvXrlAqlbW2P8+qe/fu4dChQxg6dCiA//WkJHqU69ev48iRIxg3bpypoxAREREREVE9wMIkVZpeL3D2dia2XEqERqeHm60lZNKqT1Oq1emRklsM6HWwSbuGj0YOhB17Sdaq1NRU5ObmQqlUws3NzdRx6CmRnZ2Nb7/9FlOnToW5ubmp4xAREREREdFTjmM4qVJKtDqsP5uAs7cz4ay0gNLS8rHfSyY1g5e9NfKKNbjn1Azbr2chrI0CFjJpNSamh3F1dcVXX30FKysrpKSkoHv37ujevTvkcjlsbW1NHY/qKJVKBUtLS6SmpsLLy8vUcYiIiIiIiOgpx1W56ZFKtDpEnozH+fgs+DjaQGlZPT2llJbm8HG0wbn4LESejEeJVlct70sPp9OVHmeNRoPmzZtj2bJlcHFxwfvvv49Ro0bhxIkTJk5IdZVEIuE8k0RERERERFRt2GOSHkqvF1h/NgFXE3PQwMH6gVW1n5TUTAIfB2tcSczBhnMJGN7Wp9rboLKkUilyc3Px119/ISoqCvv27UNMTAycnZ2h0+mQlpZm6ohUh3l6erIwSURERERERNWChUl6qLO3M3H2diZ8HG1qrGBoZiZBAwdrnInLRLCLEqH+jjXSDgF6vR5mZma4desWLC0tYWtriylTpsDJyQkKhcK4HRfEoYp4eHjgxo0bpo5BRERERERE9QALk1Sh9PwSbLmUCGelBaQ13ItRaiaBs9ICWy4lItBFAUeFRY2296wyMyudvaFly5bYvHkzYmJi4OvrC6C0GGn4J5Vyvk8qn4eHB+7duwe1Wg25XG7qOERERERERPQU4xyTVKGdl5Og0emrbU7JR1FamkOj02PnleRaae9ZptfrsWfPHgQEBECr1Rp7SJqZmbEoSQ+lVCqhUCiQkpJi6ihERERERET0lGOPSSpXWl4xLidmw0NlVenXlBQVIvbqWaTdjUNJYQHMLSyhtHdCwzadoVA5VOo93GwtceluFnrnucFF+fgrf9PDmZmZobCwEAAgk5XeBjQaDQuTVCmGBXAaNGhg6ihERERERET0FGNhksp1Pj4LgAQyaeU61RbkZuPs/i2QmEngGdAIljZKaEqKkZuRBk1JcaXbLW1PggvxWejV1P3xwtNDabVaxMTEIDg4GOvXr8eff/6J7OxsxMXFYdq0aWjfvj3nmKSH4srcREREREREVB1YmKQHaHV6HI/JgKNN5eePu3rsN8gtLNG25xDInnDeOUcbOY7FZOClRq6VLoxS5a1fvx579+6FQqFAYWEhXnzxRXh7e+PAgQOIjo5mYZIeycPDA1evXjV1DCIiIiIiInrKsepDD0jPVyO/WANreeWG9GYkJyA7PQWBLUIhk8uh02mh02kfu31ruRQFxRqk56s
f+z2oYoMGDUJkZCQGDx4MFxcXjBw5Et26dUOnTp1w9OhRU8ejp4C7uzsyMjJQUlJi6ihERERERET0FGOPSXpAam4xzCCpdI+59KQ7AACZ3AJn9m5G1r1kCCFga++EoJYd4OzpU6X2JRIJJJAgNbcYbirOM1ndrK2tAQB+fn7466+/8OGHH6KkpASxsbGIiIgA8L/Vu4nKo1AoYGtri+TkZOOq7kRERERERERVxcIkPSApuwiowijewtxsAMClw3ugcnJF8849oSkpRszVc7jwxw60fmkAHN29qxZCUpqjubdd1V5HlRYYGIjly5cjPj4erq6u8Pb2Rl5enqlj0VPCMM8kC5NERERERET0uFiYpAdkF6lhXoW5HbVaDQBAYWuPlt36QV1SAgsLCzi4e+HYtnWIungS7atYmDSXmiGnWFOl11DVHTp0CNeuXUN0dDSysrKQnJyMnTt3wsenar1c6dnDBXCIiIiIiIjoSbEwSQ/Q6gSqsu6JmbR0LkqPgIaQSCS4cvUq5HI5ngsOhr2zO7LuJUOrUUNmXvlFcSQSQKMTVY1OlaTX62FmZgYhBBo3bow333wTCoUC69atQ1JSEnx8fLgADj2Uh4cHLly4AAA8V4iIiIiIiOixsDBJD5BJJRBVqAlaWisAAHKr0rkLQ0JCEBcbh7PnzkFdVAyh11e5MCkEYC5loaOmGIqSr7/+epnHvby8kJWVBQAsNFGFMjIycO/ePZw6dQoFBQXIyMhAeHg4vL2rOGUDERERERERPdNYmKQH2FnJodHpK729yskVCVHXUFKQDwCQyWQICg6Ci4sz9vx0CsV52SjRaFGVZWw0Oj1UluZVTE5VIZFIIITA7t27sWTJEhQVFcHb2xvjxo0DwF5wVD6NRoPZs2fj7t27iImJgU6ng5WVFezt7U0djYiIiIiIiJ4yXHqXHuBhZwVUoceki5cfZDJz3L11HXr9/wqaEr0GdgpLuHj74srlq4iNjYVep6vcm4r/n4NqVHp6On799VfMnTsXu3btwueff46xY8eaOhbVYebm5hg8eDCkUim8vb2hVqsRFBQEhUJh6mhERERERET0lGFhkh7gamsJPQREJcdzyy2tENyqA3Iy0nBm32bE37iMW5dP4+z+LZDJzNGp10C0bBmC7KxsnDt3HtnZ2Q99PyEEBARcbavSx5Ieh1KpxOHDh9G0aVNYWVmhpKQEjRs3hl6vZ29JqlCHDh3Qt29f5OfnIyMjAy1btjR1JCIiIiIiInoKcSg3PcBJIYfC0hyFah1sLCp3ijR47nmYyy1x+/pFRF04DomZFA6unggKaQelvRMAIKRlCBITE3H16lW4uLggICAAMtmD71+o1sHG0hxOisrPSUmPx9LSEhMmTMDYsWORnJyMkpIS/Oc//4GZGb+zoIpJJBIMHToUly5dwv79+xEcHGzqSERERERERPQUkojKdoujZ8qea8nYfz0VDRysq/29i4uLERUVhfz8fAQHBcHJ2bnM83cyC9GzsSt6NXWv9rapLMM8khcuXMDVq1fh4eEBPz8/uLi4wNbWlvNM0kMlJSXhk08+wdKlSyGX84sEIiIiIiIiqhoWJqlcaXnF+HLPDXiorCCT1kDvOSGQmpaGW7duwU5lh6CgQMgtLKDV6ZGUU4SpvRvBRcmh3LXh5s2bWLNmDZydnSGVSnH48GH06NHDuAgOUUW0Oj3S89VIzS1GUnYRsovU0OoEZFIJ7Kzk8LCzgqutJZwU8pq5jxAREREREdFTjUO5qVwuSks097TD1aQceNlXf69JSCRwdXWFg709bsXE4MzZswjw94fO0g4tvOxZlKwFer0eZmZmuHXrFhITEzF79mwAQOPGjfHrr79CrVYjLS0NXl5eJk5KdU1aXjHOx2fheEwG8os1MIMEkADmUjNIJIAQgEanBwSgh4DC0hwdAxzRyofXNhEREREREf0Pe0xShTLySzB//1+wlkuhtDSv0bYyMzJw9WYMzCwsMW94JwR6u9Voe/Q/MTExmD59OoYNGwZLS0vs3r0bOp3OOG/g+PHjTZyQ6oqM/BLsuJyEy4nZACRwtJHDWi596HB/IQQK1TpkFKgBCDT3tEP/5h5wVFjUWm4iIiIiIiKqm1iYpIc6HZuBtafj4eNoA6lZzc01qNML3E7PQ7BZGvJiLqJLly7o0KEDpFJpjbVJpXQ6HT788EMolUqoVCo4ODjA1dUVtra2cHNzQ9OmTU0dkUxMrxc4ezsTWy4lQqPTw83W8rGGZmt1eqTkFsNcaoZBLTzRxtcBZjV4XyEiIiIiIqK6jYVJeii9XmDt6Xici8+Cj4N1jRQR9HqB+MxCtPG1x/C2PkhKSsT27dshkUgwaNAguLmx92RNy8zMhIWFBbKzs5GTk4Pbt2+jZ8+eLAwTSrQ6rD+bgLO3M+GstKiW3tN5xRrcyytBG18HhLXxhoWM5xkREREREdGziHNM0kOZmUkQ1sYbaq0eVxJz0MDBulp7Tur0AncyC/G8pwqvtfaGmZkEXl5eGDt2LI4dO4b8/HzjXIhUc65fv45ly5bBzMwMTk5OkEqlaNSoEfz8/EwdjUyoRKtD5Ml4XE3MqdZe00pLc1jLZTgXnwW1Vo832/uwOElERERERPQMYo9JqpQSrQ4bziXgTFz195pq6+eA11qX32tKCPHQ+evoyRiKvvPmzYNarcbHH39s6khURxh6S5+Pz0KDWuotzWHdREREREREzxZpREREhKlDUN0nMzNDMw8VnBQWuJqYg+xCNazl0scqJGh1eiTlFEEvgLDW3ujVxB3mFcxX9/eiZGFhIWJjY+Hk5PRY+0FlGQq/xcXFOHPmDPR6PRISEnDw4EHcuXMHDRs2ZHH4GXUmLhN7/0xBgxqcX1YikcDW0hzXk3LhpLCAl711jbRDREREREREdROHclOlmZlJEOrviEAXxROvzNvCyx79nnev8sq8crkc8fHxcHBwgLOz85PtEBmHyDdu3BgbNmzApUuXIJPJYG9vbzy+LEo+e9LzS7DlUiKclRY1uugVAEjNJHBWWmDLpUQEuii4WjcREREREdEzhEO56bGl5RXjQnwWjsVkoKBYAwkkgAQwl5pBIgGEADQ6PSAAAQGFpTk6BTohpIEdXJSWVW4vPT0dcXFxCAoKgp2dHQBw/slqlJubi9jYWERFRcHOzg7BwcHw9fU1dSwygVXH43A1KadWezDezSrE8552GNXBt9baJCIiIiIiItNiYZKemFanR3q+Gqm5xUjKLkJOsQYanYC5VAKVpTk87KzgamsJJ4UcsgqGbD+KEAJHjhzBwYMHsWXLFgwbNgxvvvkmvLy8ymzD3n2Pb/v27di2bRs6deqExMREnD59GosWLWJx8hmTlleML/8fe3ceH1V973/8fWbLZE8m+x422SGsYgQrm4CKu1asV1Hb2nqt26+i19qCFYsVqrW49rYu16WiICoKyqKigCA7ASITb3ajAAAgAElEQVS7CUnIvu/JzPn9QZOCoEIImSyv5z+ak3PO93OGySOPeefz/X6XpSs22PeUf17ra2t0aOdm5WcdUm
1VpWwOhwJDwpTUb4gi4pJO6R5NSzz8z5S+LfrDBQAAAACg42EqN86YzWpRdLBT0cFODU4IOStjFBQU6MiRI9q6davi4+MVGRmpK664Qh999JGioqIkHZ1yTAfl6Wt6zZYtW6bLL79cl112mSTptttu0/r165WcnMzr2oVsziiRZJxyKOlubNCGjxeqtqpC8b36KzA0XPV1Ncrat1ubV32gQWMuUmy33j96n6PjGdqSUaLJA2LO7CEAAAAAAB0CwSTaPdM0tXXrVmVmZqqqqkrPPPOM+vTpo4yMDEnSpk2btHbtWv3qV7+Sjw/r052upqbp4OBgHTp0qPn4rbfeKqfzaOcaoWTX0Oj2aO2BIoX5O075moKsb1VVXqo+w8couV9K8/G4Hv20etHLytq765SCSUkK83dozYEiTegb1eLuagAAAABAx0EwiXavpqZGVVVVWrZsmYYMGaI+ffooLS1NR44cUVRUlK688kqFhYVp0aJF+sMf/qAJEyZ4u+QOxWq1SpLuv/9+BQYGqqKiQunp6fLz89OQIUO8XB3aUmFlvSprG+Rynfrako0N9ZIkp5//ccftPk5ZLFZZbaf+a8bPYVVRZZ0KK+sVHcx0bgAAAADo7Agm0e75+fnpqquuktvt1siRIyVJCxYs0NVXX60NGzbI5XJpyZIlWrx4sYqKinTgwAGVlpZq2LBhXq68YwkLC9OHH36oxx9/XOnp6br55pu1aNEizZ49m/U7u4i88lpZZJzWv3VoVJwMw9DerV/JarM3T+U+tGuLTNNUtwGn/nNoGIYMGcorryWYBAAAAIAugGAS7V5TKHbttddKknJyclRSUqLJkyfr3HPP1ezZsyVJpaWl2rFjh3x9fbVt2zYNGzZMhw8fVkJCgiQ1r5cYHR3ttWdpzxoaGvT3v/9dX3zxhS655BI9+eST6tOnj2bOnCm73e7t8tAGckprpNPMn/2DQpRywRSlb/xCmz9d0nzc6RegkZOuUnB41Ond0Dhax9larxYAAAAA0H4QTKLd+273VmxsrJ599ll98cUXKisr0759++Tj46OlS5fqhhtuUFZWli6//HIdPnxYb731lvr06aO6ujqtXbtWTz31lCR28P4u0zRlt9tVVlamuro6ORwOvf7667rmmmtUXV2t4OBgb5eINlBaUy97C9Z2tDud8g8OVXBUvJyBIQrwderb9G3a/OkSjZh4pQJDw079XlaLymobTrsGAAAAAEDHw+4C6LBmzpyp+fPnKysrSx9//LEeeughBQYGqra2VoMHD9bGjRsVHByssLAw/fOf/9TChQtVUFAg6WjYaZpm88YvXV3T6zBp0iQVFxdr0qRJWrFihaZNm0Yo2YU0uk2dbl5fWpinjcsXKzqpp3oMGqnCilpFJvXSyElXy/R4lP716tO6n2FIDW5+LgEAAACgK6BjEh1Sbm6uxo0bp4kTJ2rixInNHZB33XWXXC6Xvv76a+3YsUM/+clPFBkZqR49emj69Omqr6/Xa6+9pr59+2r48OHefox2o2nX7bvvvlv+/v664447dMcdd3i5KrQ1m9XQ6Wb1md/skGmaikrqKYePU+Hh4Tpw8KD69eunkMgYFWZnyOPxnPLO7qYp2a10MwMAAABAV0DHJDqk6Oho/f73v5ckud3u5uO/+c1vVFJSojlz5sjf31/9+/fXggULlJqaqp/+9Kf68ssv9eGHH2r27NmaM2fOcddC8vf/z87KHo+HjtIuJsTXoQa357Suqa+pkiSZnqPX9ejeXUVFRSotLW0+djppZ4Pbo2Ana5oCAAAAQFdAxyQ6PKvV2vz/vXr10tNPP63MzEy53W7V1NRo3bp1+t3vfidJqq6u1nnnnad77rlHu3fvltVqVUNDw3Gbu3g8nqO7A3fBNShramqUm5urbt26nXKHGzqP2BBf6TSzaP8QlwqPHFbOwW/Urf9QOXx8lJyUpN07tqsqP0eBIWGyHPMz+qPMf9cBAAAAAOj0SB7QqXj+3aGVmJiobt26yTRNWSwWvfnmmzpw4IBuvfVWNTY26qOPPlJsbKwkNYeSmzdvVlFRkSwWS3MoWVFR4Z0H8ZIjR47ogw8+8HYZ8JKoIKc8Or21V5P7psju8NHeLeuUtnalDu9JU21RjvZv/FSlJSXqOWTUKd/LNE2ZMhUV5GxJ+QAAAACADoaOSXQq3+3yS05O1pIlS1RQUKCHH35YtbW1CggIUG1trQIDA/W73/1Of/zjH1VaWqqFCxcqPT1dd911l6ZPn65Vq1apoKBA119/vZeepu25XC6VlZXJ7XYf14mKzqWmpkavvPKK7Ha7wsPDFRoaKrvdroDAIAU47aqud8vf59R+PfgGBCl16jQd3LFRxXnZyv12ryxWmxK791K9LUChUfGnXFd1vVv+TrvCAxwtfTQAAAAAQAdCMIlOrSlgi4iI0Isvvqh//etfys/P169+9Sv5+PjoL3/5i0JDQ2W1WvWTn/xEb7zxhpYuXapXXnlFc+fO1euvv958r9PZwKOjCgoKkmEYKisrk8vl8nY5OEscDof27dun7Oxs1dXVqbi4WAUFBZo4caKm/PJ/tHx33ikHk5Lk6x+o/ueNO+H4zrSdOnTokM4555xTuk9RVb0m9YuSzdq5f84AAAAAAEfx6Q+dWlPXX9MU72nTpunuu++Wj4+PJGnkyJEKDQ1VSkqK9u7dq08++UQXX3yxysvLVVlZqZdeekmVlZWSTuzG7IwsFotCQkJUUlLi7VJwFpWXlysiIkIHDhxQVVWVHA6HrrjiCs2ZM0fDkkIlmWo8zU1wTqZHzx7Ky81T5SksiXB0PFNDk0LPeFwAAAAAQMdAxyS6hKZQ0TTN4za1KSoq0ty5cxUUFKRJkybp/fff1/jx47V8+XJt375dISEh+uCDD7R69Wr17dtXP//5z5uv7awdlKGhoSouLlaPHj28XQpamdvt1uuvv67MzExFRkYqPj5eQUFBio+P1wMPPCB/f3/5SxocF6K0nDLFh/qd0Xi+vr6Kj4/Tvv37NSQlRfqBDaVyy2uVEh+qyEDWlwQAAACArqLzpSrAD/juTtthYWGaOXOmSktLtXTpUt1xxx1auHChhgwZopCQEH344Yd64YUXdPvtt2vVqlVau3Zt87VNoWRTN2Zn4XK56JjsBEzzxE1sTNPU8OHDdd999+n2229XamqqwsLCdO+99yo4OLj5vKmDY2W3WlRR23DGdSQmJam2tlZ5+fnfe05FbYPsVosuHRRzxuMBAAAAADoOOibRpZmmKT8/P82dO7c5YJwzZ4769OkjSVq0aJEOHDigxYsXa9KkSdq7d6/CwsL03HPP6frrr1dqamqn65oMDQ1VRkaGt8vAGfpuCC8dXdqgf//+zV/fdNNNslgsioiIOO68sAAfXZkSp9c3ZMjPYZPV8v2djj/GarWqR48eOrD/gMLDwmS1Hf9rx+0xVVhZp5+NTFJYgE+LxwEAAAAAdDydK1EBTpNhGM2dZRaLRRaLRX/+858VHx+v7OxsuVwu7d69Wz169ND777+vxMREvfPOOzp06JDmzp2rhx566
...[remainder of the base64-encoded PNG payload for the pipeline plot omitted]\n",
+       "text/plain": [
+        "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "for result in search.history:\n", + " pipeline_utils.plot_pipeline(result.pipeline)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.5" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/axolotl/examples/load_csv.ipynb b/axolotl/examples/load_csv.ipynb new file mode 100644 index 0000000..0b26179 --- /dev/null +++ b/axolotl/examples/load_csv.ipynb @@ -0,0 +1,424 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Axolotl CSV manipulation [Binary Classification]." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this example, we are showcasing different components of the system.\n", + "- Loading syntethic data for a univariate regression task.\n", + "- Easy use of the backend.\n", + "- Use of simple interface for search predefined method.\n", + "- Exploring searched pipelines." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Import multiple utils we will be using" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2020-07-12 15:23:25,435\tINFO resource_spec.py:212 -- Starting Ray with 4.39 GiB memory available for workers and up to 2.2 GiB for objects. You can adjust these settings with ray.init(memory=, object_store_memory=).\n", + "2020-07-12 15:23:25,965\tINFO services.py:1170 -- View the Ray dashboard at localhost:8265\n" + ] + } + ], + "source": [ + "import os\n", + "from pprint import pprint\n", + "import pandas as pd\n", + "from sklearn.datasets import make_regression\n", + "\n", + "from d3m import container\n", + "from d3m.metadata.pipeline import Pipeline\n", + "\n", + "from axolotl.utils import data_problem, pipeline as pipeline_utils\n", + "from axolotl.backend.ray import RayRunner\n", + "from axolotl.algorithms.random_search import RandomSearch\n", + "\n", + "# init runner\n", + "backend = RayRunner(random_seed=42, volumes_dir=None, n_workers=3)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Load csv file and transform it as dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "table_path = os.path.join('..', 'tests', 'data', 'datasets', 'iris_dataset_1', 'tables', 'learningData.csv')\n", + "df = pd.read_csv(table_path)\n", + "dataset, problem_description = data_problem.generate_dataset_problem(df, task='binary_classification', target_index=5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create an instance of the search and fit with the input_data." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# The method fit search for the best pipeline based on the time butget and fit the best pipeline based on the rank with the input_data.\n", + "search = RandomSearch(problem_description=problem_description, backend=backend)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Current trial is failed. Error: [StepFailedError('Step 7 for pipeline 47ec5c86-46b8-4dee-9562-1e5ebc3d0824 failed.',)]\n", + "Current trial is failed. Error: [StepFailedError('Step 7 for pipeline 64da5190-c2ee-4b8e-abef-697b54cfa32b failed.',)]\n", + "Current trial is failed. Error: [StepFailedError('Step 7 for pipeline 9e03188f-2120-49ac-a087-1e4fb1b29754 failed.',)]\n", + "Current trial is failed. Error: [StepFailedError('Step 7 for pipeline af32bc20-64fa-44a5-ab34-bbe810b671b1 failed.',)]\n", + "Current trial is failed. Error: [StepFailedError('Step 7 for pipeline 5dbc9e87-19be-4cda-ac51-c1d7ea9328c1 failed.',)]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(pid=85426) class_weight presets \"balanced\" or \"balanced_subsample\" are not recommended for warm_start if the fitted data differs from the full dataset. In order to use \"balanced\" weights, use compute_class_weight (\"balanced\", classes, y). In place of y you can use a large enough sample of the full training set target to properly estimate the class frequency distributions. Pass the resulting weights as the class_weight parameter.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Current trial is failed. Error: [StepFailedError('Step 7 for pipeline 918c088e-58dd-4991-8336-deb0b41cb5eb failed.',)]\n", + "Current trial is failed. Error: [StepFailedError('Step 7 for pipeline 41dfec8f-0b07-4f8e-8ff3-cdbb1dab11c7 failed.',)]\n", + "Current trial is failed. Error: [StepFailedError('Step 7 for pipeline d465a878-1ea5-4b72-b8a7-3a4122d1a482 failed.',)]\n", + "Current trial is failed. Error: [StepFailedError('Step 7 for pipeline 8c39e981-f446-4fde-8744-5606c35a7fdf failed.',)]\n", + "Current trial is failed. Error: [StepFailedError('Step 7 for pipeline df127bce-11af-4fae-b8bb-722cb0666484 failed.',)]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(pid=85426) class_weight presets \"balanced\" or \"balanced_subsample\" are not recommended for warm_start if the fitted data differs from the full dataset. In order to use \"balanced\" weights, use compute_class_weight (\"balanced\", classes, y). In place of y you can use a large enough sample of the full training set target to properly estimate the class frequency distributions. Pass the resulting weights as the class_weight parameter.\n", + "(pid=85426) The parameter 'presort' is deprecated and has no effect. It will be removed in v0.24. You can suppress this warning by not passing any value to the 'presort' parameter. We also recommend using HistGradientBoosting models instead.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Current trial is failed. Error: [StepFailedError('Step 7 for pipeline 0985e11e-8db0-4c1c-9f34-3ce8fbc626c1 failed.',)]\n", + "Current trial is failed. Error: [StepFailedError('Step 7 for pipeline 8977a9c0-dd79-4771-9dc1-455586b80947 failed.',)]\n", + "Current trial is failed. 
Error: [StepFailedError('Step 7 for pipeline c0238551-5fbb-41cd-8187-d3d23bc5571d failed.',)]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(pid=85426) class_weight presets \"balanced\" or \"balanced_subsample\" are not recommended for warm_start if the fitted data differs from the full dataset. In order to use \"balanced\" weights, use compute_class_weight (\"balanced\", classes, y). In place of y you can use a large enough sample of the full training set target to properly estimate the class frequency distributions. Pass the resulting weights as the class_weight parameter.\n" + ] + } + ], + "source": [ + "fitted_pipeline, fitted_pipelineine_result = search.search_fit(input_data=[dataset], time_limit=30)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "produce_results = search.produce(fitted_pipeline, [dataset])" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
[HTML rendering of the predictions DataFrame (columns d3mIndex, species; 150 rows × 2 columns) omitted; it duplicates the text/plain output below]\n",
+        "
" + ], + "text/plain": [ + " d3mIndex species\n", + "0 0 Iris-setosa\n", + "1 1 Iris-setosa\n", + "2 2 Iris-setosa\n", + "3 3 Iris-setosa\n", + "4 4 Iris-setosa\n", + ".. ... ...\n", + "145 145 Iris-virginica\n", + "146 146 Iris-virginica\n", + "147 147 Iris-virginica\n", + "148 148 Iris-virginica\n", + "149 149 Iris-virginica\n", + "\n", + "[150 rows x 2 columns]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "produce_results.output" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Print information about scores of the succeded pipelines." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "----------------------------------------------------\n", + "Pipeline id: 676360d8-71ac-401c-b44a-31a810c4e8d3\n", + "Rank: 0.22667216466666668\n", + " metric value normalized randomSeed fold\n", + "0 ACCURACY 0.773333 0.773333 42 0\n", + "----------------------------------------------------\n", + "Pipeline id: 85d44359-0dac-4260-aea8-c78950025c3f\n", + "Rank: 0.33333446433333336\n", + " metric value normalized randomSeed fold\n", + "0 ACCURACY 0.666667 0.666667 42 0\n", + "----------------------------------------------------\n", + "Pipeline id: 3efb07be-28ff-45d8-b1fb-1c49f96b3381\n", + "Rank: 0.6666653826666668\n", + " metric value normalized randomSeed fold\n", + "0 ACCURACY 0.333333 0.333333 42 0\n", + "----------------------------------------------------\n", + "Pipeline id: abd9eb99-a4ba-4210-bb34-c2dec7c3ccfa\n", + "Rank: 0.6666606186666667\n", + " metric value normalized randomSeed fold\n", + "0 ACCURACY 0.333333 0.333333 42 0\n", + "----------------------------------------------------\n", + "Pipeline id: 8948a194-0dfe-4d07-a7c8-d1f5136f68c6\n", + "Rank: 0.21333939733333337\n", + " metric value normalized randomSeed fold\n", + "0 ACCURACY 0.786667 0.786667 42 0\n", + "----------------------------------------------------\n", + "Pipeline id: 22866f54-ba68-49e5-8f84-a2a6aba98253\n", + "Rank: 0.16000235200000004\n", + " metric value normalized randomSeed fold\n", + "0 ACCURACY 0.84 0.84 42 0\n", + "----------------------------------------------------\n", + "Pipeline id: 37a1c72a-9efd-4b0a-9d3d-811d47571b45\n", + "Rank: 0.6666753326666668\n", + " metric value normalized randomSeed fold\n", + "0 ACCURACY 0.333333 0.333333 42 0\n", + "----------------------------------------------------\n", + "Pipeline id: 2d3cae0f-66f6-46e0-9fa5-128bf02b4d7e\n", + "Rank: 0.6666655736666668\n", + " metric value normalized randomSeed fold\n", + "0 ACCURACY 0.333333 0.333333 42 0\n", + "----------------------------------------------------\n", + "Pipeline id: d1e5a59d-be50-42f3-a71b-cf8ba59b3c47\n", + "Rank: 0.08666869166666667\n", + " metric value normalized randomSeed fold\n", + "0 ACCURACY 0.913333 0.913333 42 0\n", + "----------------------------------------------------\n", + "Pipeline id: 35d47611-bded-4669-9803-9d259f686ec1\n", + "Rank: 0.35999672099999996\n", + " metric value normalized randomSeed fold\n", + "0 ACCURACY 0.64 0.64 42 0\n", + "----------------------------------------------------\n", + "Pipeline id: 7398d17f-e91f-4c75-9a95-c9f85763c858\n", + "Rank: 0.6666598006666667\n", + " metric value normalized randomSeed fold\n", + "0 ACCURACY 0.333333 0.333333 42 0\n", + "----------------------------------------------------\n", + "Pipeline id: 5293503b-4cb6-4b8b-bf8e-8b9d981c3b03\n", + "Rank: 0.04666429966666663\n", 
+ " metric value normalized randomSeed fold\n", + "0 ACCURACY 0.953333 0.953333 42 0\n", + "----------------------------------------------------\n", + "Pipeline id: 756e2a15-3315-4aa1-8620-f73ffc69f8a4\n", + "Rank: 0.6666748276666667\n", + " metric value normalized randomSeed fold\n", + "0 ACCURACY 0.333333 0.333333 42 0\n", + "----------------------------------------------------\n", + "Pipeline id: 46633510-6f46-479e-982e-263aaa2e187a\n", + "Rank: 0.17999182400000005\n", + " metric value normalized randomSeed fold\n", + "0 ACCURACY 0.82 0.82 42 0\n", + "----------------------------------------------------\n", + "Pipeline id: 49a750b0-5c86-4ff3-9b2d-c58c6390dd0d\n", + "Rank: 0.6666588986666667\n", + " metric value normalized randomSeed fold\n", + "0 ACCURACY 0.333333 0.333333 42 0\n", + "----------------------------------------------------\n", + "Pipeline id: 84c24452-b2cf-41a2-813c-a135eaeef480\n", + "Rank: 0.36000324699999997\n", + " metric value normalized randomSeed fold\n", + "0 ACCURACY 0.64 0.64 42 0\n", + "----------------------------------------------------\n", + "Pipeline id: 82117b6b-6960-48bb-b1f4-91355acf51d6\n", + "Rank: 0.026667331666666617\n", + " metric value normalized randomSeed fold\n", + "0 ACCURACY 0.973333 0.973333 42 0\n" + ] + } + ], + "source": [ + "for pipeline_result in search.history:\n", + " print('-' * 52)\n", + " print('Pipeline id:', pipeline_result.pipeline.id)\n", + " print('Rank:', pipeline_result.rank)\n", + " print(pipeline_result.scores)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.5" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/axolotl/examples/random_search/oracle.json b/axolotl/examples/random_search/oracle.json new file mode 100644 index 0000000..1e4ae71 --- /dev/null +++ b/axolotl/examples/random_search/oracle.json @@ -0,0 +1 @@ +{"ongoing_trials": {}, "hyperparameters": "{'space': [{'class_name': 'Enumeration', 'config': {'type': d3m.metadata.hyperparams.Enumeration, 'default': '6a520746-108c-45bf-a6d8-c875b5a9d326', 'structural_type': , 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TuningParameter'], 'values': ['6a520746-108c-45bf-a6d8-c875b5a9d326', 'a6b468a5-4d03-405e-a707-8e377f9ad1c3', 'ef1c483a-34fc-4398-a6b3-063b33786972'], 'p': Enumeration(values=['6a520746-108c-45bf-a6d8-c875b5a9d326', 'a6b468a5-4d03-405e-a707-8e377f9ad1c3', 'ef1c483a-34fc-4398-a6b3-063b33786972'], default=6a520746-108c-45bf-a6d8-c875b5a9d326)}}, {'class_name': 'Enumeration', 'config': {'type': d3m.metadata.hyperparams.Enumeration, 'default': 'mean', 'structural_type': , 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TuningParameter'], 'description': 'The imputation strategy. - If \"mean\", then replace missing values using the mean along each column. Can only be used with numeric data. - If \"median\", then replace missing values using the median along each column. Can only be used with numeric data. - If \"most_frequent\", then replace missing using the most frequent value along each column. Can be used with strings or numeric data. 
- If \"constant\", then replace missing values with fill_value. Can be used with strings or numeric data. .. versionadded:: 0.20 strategy=\"constant\" for fixed value imputation.', 'values': ['median', 'most_frequent', 'mean', 'constant'], 'p': Enumeration(values=['median', 'most_frequent', 'mean', 'constant'], default=mean)}}, {'class_name': 'UniformBool', 'config': {'type': d3m.metadata.hyperparams.UniformBool, 'default': False, 'structural_type': , 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TuningParameter'], 'p': UniformBool(default=False)}}, {'class_name': 'Union', 'config': {'type': d3m.metadata.hyperparams.Union, 'default': None, 'structural_type': typing.Union[int, NoneType], 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TuningParameter'], 'description': 'When strategy == \"constant\", fill_value is used to replace all occurrences of missing_values. If left to the default, fill_value will be 0 when imputing numerical data and \"missing_value\" for strings or object data types.', 'configuration': {'int': {'type': d3m.metadata.hyperparams.Hyperparameter, 'default': 0, 'structural_type': , 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TuningParameter']}, 'none': {'type': d3m.metadata.hyperparams.Constant, 'default': None, 'structural_type': , 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TuningParameter']}}, 'p': Union(configuration={int: Hyperparameter(default=0), none: Constant(default=None)}, default=None)}}, {'class_name': 'Enumeration', 'config': {'type': d3m.metadata.hyperparams.Enumeration, 'default': 'SPEC', 'structural_type': , 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TuningParameter'], 'description': 'different method to choose for feature selection', 'values': ['SPEC', 'fisher_score', 'reliefF', 'CIFE', 'f_score', 'chi_square'], 'p': Enumeration(values=['SPEC', 'fisher_score', 'reliefF', 'CIFE', 'f_score', 'chi_square'], default=SPEC)}}, {'class_name': 'Uniform', 'config': {'type': d3m.metadata.hyperparams.Uniform, 'default': 0.5, 'structural_type': , 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TuningParameter'], 'description': 'percentage of features to select, between 0 and 1', 'lower': 0.25, 'upper': 1, 'lower_inclusive': True, 'upper_inclusive': False, 'p': Uniform(lower=0.25, upper=1, q=None, default=0.5, lower_inclusive=True, upper_inclusive=False)}}, {'class_name': 'Bounded', 'config': {'type': d3m.metadata.hyperparams.Bounded, 'default': 10, 'structural_type': , 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TuningParameter'], 'description': 'The number of trees in the forest.', 'lower': 1, 'upper': None, 'lower_inclusive': True, 'upper_inclusive': False, 'p': Bounded(lower=1, upper=None, default=10, lower_inclusive=True, upper_inclusive=False)}}, {'class_name': 'Enumeration', 'config': {'type': d3m.metadata.hyperparams.Enumeration, 'default': 'gini', 'structural_type': , 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TuningParameter'], 'description': 'The function to measure the quality of a split. Supported criteria are \"gini\" for the Gini impurity and \"entropy\" for the information gain. 
Note: this parameter is tree-specific.', 'values': ['gini', 'entropy'], 'p': Enumeration(values=['gini', 'entropy'], default=gini)}}, {'class_name': 'Union', 'config': {'type': d3m.metadata.hyperparams.Union, 'default': 'auto', 'structural_type': typing.Union[str, NoneType, float], 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TuningParameter'], 'description': 'The number of features to consider when looking for the best split: - If int, then consider `max_features` features at each split. - If float, then `max_features` is a percentage and `int(max_features * n_features)` features are considered at each split. - If \"auto\", then `max_features=sqrt(n_features)`. - If \"sqrt\", then `max_features=sqrt(n_features)` (same as \"auto\"). - If \"log2\", then `max_features=log2(n_features)`. - If None, then `max_features=n_features`. Note: the search for a split does not stop until at least one valid partition of the node samples is found, even if it requires to effectively inspect more than ``max_features`` features.', 'configuration': {'calculated': {'type': d3m.metadata.hyperparams.Enumeration, 'default': 'auto', 'structural_type': , 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TuningParameter'], 'values': ['auto', 'sqrt', 'log2']}, 'none': {'type': d3m.metadata.hyperparams.Constant, 'default': None, 'structural_type': , 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TuningParameter']}, 'percent': {'type': d3m.metadata.hyperparams.Uniform, 'default': 0.25, 'structural_type': , 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TuningParameter'], 'lower': 0, 'upper': 1, 'lower_inclusive': True, 'upper_inclusive': False}}, 'p': Union(configuration={calculated: Enumeration(values=['auto', 'sqrt', 'log2'], default=auto), none: Constant(default=None), percent: Uniform(lower=0, upper=1, q=None, default=0.25, lower_inclusive=True, upper_inclusive=False)}, default=auto)}}, {'class_name': 'Union', 'config': {'type': d3m.metadata.hyperparams.Union, 'default': None, 'structural_type': typing.Union[int, NoneType], 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TuningParameter'], 'description': 'The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.', 'configuration': {'int': {'type': d3m.metadata.hyperparams.Bounded, 'default': 10, 'structural_type': , 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TuningParameter'], 'lower': 0, 'upper': None, 'lower_inclusive': True, 'upper_inclusive': False}, 'none': {'type': d3m.metadata.hyperparams.Constant, 'default': None, 'structural_type': , 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TuningParameter']}}, 'p': Union(configuration={int: Bounded(lower=0, upper=None, default=10, lower_inclusive=True, upper_inclusive=False), none: Constant(default=None)}, default=None)}}, {'class_name': 'Union', 'config': {'type': d3m.metadata.hyperparams.Union, 'default': 2, 'structural_type': typing.Union[int, float], 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TuningParameter'], 'description': 'The minimum number of samples required to split an internal node: - If int, then consider `min_samples_split` as the minimum number. - If float, then `min_samples_split` is a percentage and `ceil(min_samples_split * n_samples)` are the minimum number of samples for each split. .. 
versionchanged:: 0.18 Added float values for percentages.', 'configuration': {'absolute': {'type': d3m.metadata.hyperparams.Bounded, 'default': 2, 'structural_type': , 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TuningParameter'], 'lower': 1, 'upper': None, 'lower_inclusive': True, 'upper_inclusive': False}, 'percent': {'type': d3m.metadata.hyperparams.Uniform, 'default': 0.25, 'structural_type': , 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TuningParameter'], 'lower': 0, 'upper': 1, 'lower_inclusive': False, 'upper_inclusive': True}}, 'p': Union(configuration={absolute: Bounded(lower=1, upper=None, default=2, lower_inclusive=True, upper_inclusive=False), percent: Uniform(lower=0, upper=1, q=None, default=0.25, lower_inclusive=False, upper_inclusive=True)}, default=2)}}, {'class_name': 'Union', 'config': {'type': d3m.metadata.hyperparams.Union, 'default': 1, 'structural_type': typing.Union[int, float], 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TuningParameter'], 'description': 'The minimum number of samples required to be at a leaf node: - If int, then consider `min_samples_leaf` as the minimum number. - If float, then `min_samples_leaf` is a percentage and `ceil(min_samples_leaf * n_samples)` are the minimum number of samples for each node. .. versionchanged:: 0.18 Added float values for percentages.', 'configuration': {'absolute': {'type': d3m.metadata.hyperparams.Bounded, 'default': 1, 'structural_type': , 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TuningParameter'], 'lower': 1, 'upper': None, 'lower_inclusive': True, 'upper_inclusive': False}, 'percent': {'type': d3m.metadata.hyperparams.Uniform, 'default': 0.25, 'structural_type': , 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TuningParameter'], 'lower': 0, 'upper': 0.5, 'lower_inclusive': False, 'upper_inclusive': True}}, 'p': Union(configuration={absolute: Bounded(lower=1, upper=None, default=1, lower_inclusive=True, upper_inclusive=False), percent: Uniform(lower=0, upper=0.5, q=None, default=0.25, lower_inclusive=False, upper_inclusive=True)}, default=1)}}, {'class_name': 'Uniform', 'config': {'type': d3m.metadata.hyperparams.Uniform, 'default': 0, 'structural_type': , 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TuningParameter'], 'description': 'The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided.', 'lower': 0, 'upper': 0.5, 'lower_inclusive': True, 'upper_inclusive': True, 'p': Uniform(lower=0, upper=0.5, q=None, default=0, lower_inclusive=True, upper_inclusive=True)}}, {'class_name': 'Union', 'config': {'type': d3m.metadata.hyperparams.Union, 'default': None, 'structural_type': typing.Union[int, NoneType], 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TuningParameter'], 'description': 'Grow trees with ``max_leaf_nodes`` in best-first fashion. Best nodes are defined as relative reduction in impurity. 
If None then unlimited number of leaf nodes.', 'configuration': {'int': {'type': d3m.metadata.hyperparams.Bounded, 'default': 10, 'structural_type': , 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TuningParameter'], 'lower': 0, 'upper': None, 'lower_inclusive': True, 'upper_inclusive': False}, 'none': {'type': d3m.metadata.hyperparams.Constant, 'default': None, 'structural_type': , 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TuningParameter']}}, 'p': Union(configuration={int: Bounded(lower=0, upper=None, default=10, lower_inclusive=True, upper_inclusive=False), none: Constant(default=None)}, default=None)}}, {'class_name': 'Bounded', 'config': {'type': d3m.metadata.hyperparams.Bounded, 'default': 0.0, 'structural_type': , 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TuningParameter'], 'description': 'A node will be split if this split induces a decrease of the impurity greater than or equal to this value. The weighted impurity decrease equation is the following:: N_t / N * (impurity - N_t_R / N_t * right_impurity - N_t_L / N_t * left_impurity) where ``N`` is the total number of samples, ``N_t`` is the number of samples at the current node, ``N_t_L`` is the number of samples in the left child, and ``N_t_R`` is the number of samples in the right child. ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, if ``sample_weight`` is passed. .. versionadded:: 0.19 ', 'lower': 0.0, 'upper': None, 'lower_inclusive': True, 'upper_inclusive': False, 'p': Bounded(lower=0.0, upper=None, default=0.0, lower_inclusive=True, upper_inclusive=False)}}, {'class_name': 'Enumeration', 'config': {'type': d3m.metadata.hyperparams.Enumeration, 'default': 'bootstrap', 'structural_type': , 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TuningParameter'], 'description': 'Whether bootstrap samples are used when building trees. And whether to use out-of-bag samples to estimate the generalization accuracy.', 'values': ['bootstrap', 'bootstrap_with_oob_score', 'disabled'], 'p': Enumeration(values=['bootstrap', 'bootstrap_with_oob_score', 'disabled'], default=bootstrap)}}, {'class_name': 'UniformBool', 'config': {'type': d3m.metadata.hyperparams.UniformBool, 'default': False, 'structural_type': , 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TuningParameter'], 'description': 'When set to ``True``, reuse the solution of the previous call to fit and add more estimators to the ensemble, otherwise, just fit a whole new forest.', 'p': UniformBool(default=False)}}, {'class_name': 'Union', 'config': {'type': d3m.metadata.hyperparams.Union, 'default': None, 'structural_type': typing.Union[str, NoneType], 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TuningParameter'], 'description': '\"balanced_subsample\" or None, optional (default=None) Weights associated with classes in the form ``{class_label: weight}``. If not given, all classes are supposed to have weight one. For multi-output problems, a list of dicts can be provided in the same order as the columns of y. The \"balanced\" mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as ``n_samples / (n_classes * np.bincount(y))`` The \"balanced_subsample\" mode is the same as \"balanced\" except that weights are computed based on the bootstrap sample for every tree grown. For multi-output, the weights of each column of y will be multiplied. 
Note that these weights will be multiplied with sample_weight (passed through the fit method) if sample_weight is specified.', 'configuration': {'str': {'type': d3m.metadata.hyperparams.Enumeration, 'default': 'balanced', 'structural_type': , 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TuningParameter'], 'values': ['balanced', 'balanced_subsample']}, 'none': {'type': d3m.metadata.hyperparams.Constant, 'default': None, 'structural_type': , 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TuningParameter']}}, 'p': Union(configuration={str: Enumeration(values=['balanced', 'balanced_subsample'], default=balanced), none: Constant(default=None)}, default=None)}}, {'class_name': 'Enumeration', 'config': {'type': d3m.metadata.hyperparams.Enumeration, 'default': 'deviance', 'structural_type': , 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TuningParameter'], 'description': \"loss function to be optimized. 'deviance' refers to deviance (= logistic regression) for classification with probabilistic outputs. For loss 'exponential' gradient boosting recovers the AdaBoost algorithm.\", 'values': ['deviance', 'exponential'], 'p': Enumeration(values=['deviance', 'exponential'], default=deviance)}}, {'class_name': 'Bounded', 'config': {'type': d3m.metadata.hyperparams.Bounded, 'default': 0.1, 'structural_type': , 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TuningParameter'], 'description': 'learning rate shrinks the contribution of each tree by `learning_rate`. There is a trade-off between learning_rate and n_estimators.', 'lower': 0, 'upper': None, 'lower_inclusive': True, 'upper_inclusive': False, 'p': Bounded(lower=0, upper=None, default=0.1, lower_inclusive=True, upper_inclusive=False)}}, {'class_name': 'Bounded', 'config': {'type': d3m.metadata.hyperparams.Bounded, 'default': 100, 'structural_type': , 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TuningParameter'], 'description': 'The number of boosting stages to perform. Gradient boosting is fairly robust to over-fitting so a large number usually results in better performance.', 'lower': 1, 'upper': None, 'lower_inclusive': True, 'upper_inclusive': False, 'p': Bounded(lower=1, upper=None, default=100, lower_inclusive=True, upper_inclusive=False)}}, {'class_name': 'Bounded', 'config': {'type': d3m.metadata.hyperparams.Bounded, 'default': 3, 'structural_type': , 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TuningParameter'], 'description': 'maximum depth of the individual regression estimators. The maximum depth limits the number of nodes in the tree. Tune this parameter for best performance; the best value depends on the interaction of the input variables.', 'lower': 0, 'upper': None, 'lower_inclusive': True, 'upper_inclusive': False, 'p': Bounded(lower=0, upper=None, default=3, lower_inclusive=True, upper_inclusive=False)}}, {'class_name': 'Enumeration', 'config': {'type': d3m.metadata.hyperparams.Enumeration, 'default': 'friedman_mse', 'structural_type': , 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TuningParameter'], 'description': 'The function to measure the quality of a split. Supported criteria are \"friedman_mse\" for the mean squared error with improvement score by Friedman, \"mse\" for mean squared error, and \"mae\" for the mean absolute error. The default value of \"friedman_mse\" is generally the best as it can provide a better approximation in some cases. .. 
versionadded:: 0.18', 'values': ['friedman_mse', 'mse', 'mae'], 'p': Enumeration(values=['friedman_mse', 'mse', 'mae'], default=friedman_mse)}}, {'class_name': 'Union', 'config': {'type': d3m.metadata.hyperparams.Union, 'default': 2, 'structural_type': typing.Union[int, float], 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TuningParameter'], 'description': 'The minimum number of samples required to split an internal node: - If int, then consider `min_samples_split` as the minimum number. - If float, then `min_samples_split` is a percentage and `ceil(min_samples_split * n_samples)` are the minimum number of samples for each split. .. versionchanged:: 0.18 Added float values for percentages.', 'configuration': {'absolute': {'type': d3m.metadata.hyperparams.Bounded, 'default': 2, 'structural_type': , 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TuningParameter'], 'lower': 1, 'upper': None, 'lower_inclusive': True, 'upper_inclusive': False}, 'percent': {'type': d3m.metadata.hyperparams.Bounded, 'default': 0.25, 'structural_type': , 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TuningParameter'], 'lower': 0, 'upper': 1, 'lower_inclusive': True, 'upper_inclusive': True}}, 'p': Union(configuration={absolute: Bounded(lower=1, upper=None, default=2, lower_inclusive=True, upper_inclusive=False), percent: Bounded(lower=0, upper=1, default=0.25, lower_inclusive=True, upper_inclusive=True)}, default=2)}}, {'class_name': 'Union', 'config': {'type': d3m.metadata.hyperparams.Union, 'default': 1, 'structural_type': typing.Union[int, float], 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TuningParameter'], 'description': 'The minimum number of samples required to be at a leaf node: - If int, then consider `min_samples_leaf` as the minimum number. - If float, then `min_samples_leaf` is a percentage and `ceil(min_samples_leaf * n_samples)` are the minimum number of samples for each node. .. versionchanged:: 0.18 Added float values for percentages.', 'configuration': {'absolute': {'type': d3m.metadata.hyperparams.Bounded, 'default': 1, 'structural_type': , 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TuningParameter'], 'lower': 1, 'upper': None, 'lower_inclusive': True, 'upper_inclusive': False}, 'percent': {'type': d3m.metadata.hyperparams.Bounded, 'default': 0.25, 'structural_type': , 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TuningParameter'], 'lower': 0, 'upper': 0.5, 'lower_inclusive': True, 'upper_inclusive': True}}, 'p': Union(configuration={absolute: Bounded(lower=1, upper=None, default=1, lower_inclusive=True, upper_inclusive=False), percent: Bounded(lower=0, upper=0.5, default=0.25, lower_inclusive=True, upper_inclusive=True)}, default=1)}}, {'class_name': 'Bounded', 'config': {'type': d3m.metadata.hyperparams.Bounded, 'default': 0, 'structural_type': , 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TuningParameter'], 'description': 'The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. 
Samples have equal weight when sample_weight is not provided.', 'lower': 0, 'upper': 0.5, 'lower_inclusive': True, 'upper_inclusive': True, 'p': Bounded(lower=0, upper=0.5, default=0, lower_inclusive=True, upper_inclusive=True)}}, {'class_name': 'Bounded', 'config': {'type': d3m.metadata.hyperparams.Bounded, 'default': 1.0, 'structural_type': , 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TuningParameter'], 'description': 'The fraction of samples to be used for fitting the individual base learners. If smaller than 1.0 this results in Stochastic Gradient Boosting. `subsample` interacts with the parameter `n_estimators`. Choosing `subsample < 1.0` leads to a reduction of variance and an increase in bias.', 'lower': 0, 'upper': None, 'lower_inclusive': True, 'upper_inclusive': False, 'p': Bounded(lower=0, upper=None, default=1.0, lower_inclusive=True, upper_inclusive=False)}}, {'class_name': 'Union', 'config': {'type': d3m.metadata.hyperparams.Union, 'default': None, 'structural_type': typing.Union[int, str, NoneType, float], 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TuningParameter'], 'description': 'The number of features to consider when looking for the best split: - If int, then consider `max_features` features at each split. - If float, then `max_features` is a percentage and `int(max_features * n_features)` features are considered at each split. - If \"auto\", then `max_features=sqrt(n_features)`. - If \"sqrt\", then `max_features=sqrt(n_features)`. - If \"log2\", then `max_features=log2(n_features)`. - If None, then `max_features=n_features`. Choosing `max_features < n_features` leads to a reduction of variance and an increase in bias. Note: the search for a split does not stop until at least one valid partition of the node samples is found, even if it requires to effectively inspect more than ``max_features`` features.', 'configuration': {'specified_int': {'type': d3m.metadata.hyperparams.Bounded, 'default': 0, 'structural_type': , 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TuningParameter'], 'lower': 0, 'upper': None, 'lower_inclusive': True, 'upper_inclusive': False}, 'calculated': {'type': d3m.metadata.hyperparams.Enumeration, 'default': 'auto', 'structural_type': , 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TuningParameter'], 'values': ['auto', 'sqrt', 'log2']}, 'none': {'type': d3m.metadata.hyperparams.Constant, 'default': None, 'structural_type': , 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TuningParameter']}, 'percent': {'type': d3m.metadata.hyperparams.Bounded, 'default': 0.25, 'structural_type': , 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TuningParameter'], 'lower': 0, 'upper': 1, 'lower_inclusive': True, 'upper_inclusive': True}}, 'p': Union(configuration={specified_int: Bounded(lower=0, upper=None, default=0, lower_inclusive=True, upper_inclusive=False), calculated: Enumeration(values=['auto', 'sqrt', 'log2'], default=auto), none: Constant(default=None), percent: Bounded(lower=0, upper=1, default=0.25, lower_inclusive=True, upper_inclusive=True)}, default=None)}}, {'class_name': 'Union', 'config': {'type': d3m.metadata.hyperparams.Union, 'default': None, 'structural_type': typing.Union[int, NoneType], 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TuningParameter'], 'description': 'Grow trees with ``max_leaf_nodes`` in best-first fashion. Best nodes are defined as relative reduction in impurity. 
If None then unlimited number of leaf nodes.', 'configuration': {'int': {'type': d3m.metadata.hyperparams.Bounded, 'default': 10, 'structural_type': , 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TuningParameter'], 'lower': 0, 'upper': None, 'lower_inclusive': True, 'upper_inclusive': False}, 'none': {'type': d3m.metadata.hyperparams.Constant, 'default': None, 'structural_type': , 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TuningParameter']}}, 'p': Union(configuration={int: Bounded(lower=0, upper=None, default=10, lower_inclusive=True, upper_inclusive=False), none: Constant(default=None)}, default=None)}}, {'class_name': 'Bounded', 'config': {'type': d3m.metadata.hyperparams.Bounded, 'default': 0.0, 'structural_type': , 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TuningParameter'], 'description': 'A node will be split if this split induces a decrease of the impurity greater than or equal to this value. The weighted impurity decrease equation is the following:: N_t / N * (impurity - N_t_R / N_t * right_impurity - N_t_L / N_t * left_impurity) where ``N`` is the total number of samples, ``N_t`` is the number of samples at the current node, ``N_t_L`` is the number of samples in the left child, and ``N_t_R`` is the number of samples in the right child. ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, if ``sample_weight`` is passed. .. versionadded:: 0.19', 'lower': 0.0, 'upper': None, 'lower_inclusive': True, 'upper_inclusive': False, 'p': Bounded(lower=0.0, upper=None, default=0.0, lower_inclusive=True, upper_inclusive=False)}}, {'class_name': 'UniformBool', 'config': {'type': d3m.metadata.hyperparams.UniformBool, 'default': False, 'structural_type': , 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TuningParameter'], 'description': 'When set to ``True``, reuse the solution of the previous call to fit and add more estimators to the ensemble, otherwise, just erase the previous solution.', 'p': UniformBool(default=False)}}, {'class_name': 'Bounded', 'config': {'type': d3m.metadata.hyperparams.Bounded, 'default': 0.1, 'structural_type': , 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TuningParameter'], 'description': 'The proportion of training data to set aside as validation set for early stopping. Must be between 0 and 1. Only used if ``n_iter_no_change`` is set to an integer.', 'lower': 0, 'upper': 1, 'lower_inclusive': True, 'upper_inclusive': True, 'p': Bounded(lower=0, upper=1, default=0.1, lower_inclusive=True, upper_inclusive=True)}}, {'class_name': 'Union', 'config': {'type': d3m.metadata.hyperparams.Union, 'default': None, 'structural_type': typing.Union[int, NoneType], 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TuningParameter'], 'description': '``n_iter_no_change`` is used to decide if early stopping will be used to terminate training when validation score is not improving. By default it is set to None to disable early stopping. 
If set to a number, it will set aside ``validation_fraction`` size of the training data as validation and terminate training when validation score is not improving in all of the previous ``n_iter_no_change`` numbers of iterations.', 'configuration': {'int': {'type': d3m.metadata.hyperparams.Bounded, 'default': 5, 'structural_type': , 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TuningParameter'], 'lower': 0, 'upper': None, 'lower_inclusive': True, 'upper_inclusive': False}, 'none': {'type': d3m.metadata.hyperparams.Constant, 'default': None, 'structural_type': , 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TuningParameter']}}, 'p': Union(configuration={int: Bounded(lower=0, upper=None, default=5, lower_inclusive=True, upper_inclusive=False), none: Constant(default=None)}, default=None)}}, {'class_name': 'Bounded', 'config': {'type': d3m.metadata.hyperparams.Bounded, 'default': 0.0001, 'structural_type': , 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TuningParameter'], 'description': 'Tolerance for the early stopping. When the loss is not improving by at least tol for ``n_iter_no_change`` iterations (if set to a number), the training stops.', 'lower': 0, 'upper': None, 'lower_inclusive': True, 'upper_inclusive': False, 'p': Bounded(lower=0, upper=None, default=0.0001, lower_inclusive=True, upper_inclusive=False)}}, {'class_name': 'Bounded', 'config': {'type': d3m.metadata.hyperparams.Bounded, 'default': 10, 'structural_type': , 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TuningParameter'], 'description': 'The number of trees in the forest.', 'lower': 1, 'upper': None, 'lower_inclusive': True, 'upper_inclusive': False, 'p': Bounded(lower=1, upper=None, default=10, lower_inclusive=True, upper_inclusive=False)}}, {'class_name': 'Enumeration', 'config': {'type': d3m.metadata.hyperparams.Enumeration, 'default': 'gini', 'structural_type': , 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TuningParameter'], 'description': 'The function to measure the quality of a split. Supported criteria are \"gini\" for the Gini impurity and \"entropy\" for the information gain.', 'values': ['gini', 'entropy'], 'p': Enumeration(values=['gini', 'entropy'], default=gini)}}, {'class_name': 'Union', 'config': {'type': d3m.metadata.hyperparams.Union, 'default': None, 'structural_type': typing.Union[int, NoneType], 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TuningParameter'], 'description': 'The maximum depth of the tree. 
If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.', 'configuration': {'int': {'type': d3m.metadata.hyperparams.Bounded, 'default': 10, 'structural_type': , 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TuningParameter'], 'lower': 0, 'upper': None, 'lower_inclusive': True, 'upper_inclusive': False}, 'none': {'type': d3m.metadata.hyperparams.Constant, 'default': None, 'structural_type': , 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TuningParameter']}}, 'p': Union(configuration={int: Bounded(lower=0, upper=None, default=10, lower_inclusive=True, upper_inclusive=False), none: Constant(default=None)}, default=None)}}, {'class_name': 'Union', 'config': {'type': d3m.metadata.hyperparams.Union, 'default': 2, 'structural_type': typing.Union[int, float], 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TuningParameter'], 'description': 'The minimum number of samples required to split an internal node: - If int, then consider `min_samples_split` as the minimum number. - If float, then `min_samples_split` is a percentage and `ceil(min_samples_split * n_samples)` are the minimum number of samples for each split. .. versionchanged:: 0.18 Added float values for percentages.', 'configuration': {'absolute': {'type': d3m.metadata.hyperparams.Bounded, 'default': 2, 'structural_type': , 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TuningParameter'], 'lower': 1, 'upper': None, 'lower_inclusive': True, 'upper_inclusive': False}, 'percent': {'type': d3m.metadata.hyperparams.Bounded, 'default': 0.25, 'structural_type': , 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TuningParameter'], 'lower': 0, 'upper': 1, 'lower_inclusive': True, 'upper_inclusive': True}}, 'p': Union(configuration={absolute: Bounded(lower=1, upper=None, default=2, lower_inclusive=True, upper_inclusive=False), percent: Bounded(lower=0, upper=1, default=0.25, lower_inclusive=True, upper_inclusive=True)}, default=2)}}, {'class_name': 'Union', 'config': {'type': d3m.metadata.hyperparams.Union, 'default': 1, 'structural_type': typing.Union[int, float], 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TuningParameter'], 'description': 'The minimum number of samples required to be at a leaf node: - If int, then consider `min_samples_leaf` as the minimum number. - If float, then `min_samples_leaf` is a percentage and `ceil(min_samples_leaf * n_samples)` are the minimum number of samples for each node. .. 
versionchanged:: 0.18 Added float values for percentages.', 'configuration': {'absolute': {'type': d3m.metadata.hyperparams.Bounded, 'default': 1, 'structural_type': , 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TuningParameter'], 'lower': 1, 'upper': None, 'lower_inclusive': True, 'upper_inclusive': False}, 'percent': {'type': d3m.metadata.hyperparams.Bounded, 'default': 0.25, 'structural_type': , 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TuningParameter'], 'lower': 0, 'upper': 0.5, 'lower_inclusive': True, 'upper_inclusive': True}}, 'p': Union(configuration={absolute: Bounded(lower=1, upper=None, default=1, lower_inclusive=True, upper_inclusive=False), percent: Bounded(lower=0, upper=0.5, default=0.25, lower_inclusive=True, upper_inclusive=True)}, default=1)}}, {'class_name': 'Bounded', 'config': {'type': d3m.metadata.hyperparams.Bounded, 'default': 0, 'structural_type': , 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TuningParameter'], 'description': 'The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided.', 'lower': 0, 'upper': 0.5, 'lower_inclusive': True, 'upper_inclusive': True, 'p': Bounded(lower=0, upper=0.5, default=0, lower_inclusive=True, upper_inclusive=True)}}, {'class_name': 'Union', 'config': {'type': d3m.metadata.hyperparams.Union, 'default': None, 'structural_type': typing.Union[int, NoneType], 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TuningParameter'], 'description': 'Grow trees with ``max_leaf_nodes`` in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes.', 'configuration': {'int': {'type': d3m.metadata.hyperparams.Bounded, 'default': 10, 'structural_type': , 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TuningParameter'], 'lower': 0, 'upper': None, 'lower_inclusive': True, 'upper_inclusive': False}, 'none': {'type': d3m.metadata.hyperparams.Constant, 'default': None, 'structural_type': , 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TuningParameter']}}, 'p': Union(configuration={int: Bounded(lower=0, upper=None, default=10, lower_inclusive=True, upper_inclusive=False), none: Constant(default=None)}, default=None)}}, {'class_name': 'Union', 'config': {'type': d3m.metadata.hyperparams.Union, 'default': 'auto', 'structural_type': typing.Union[str, NoneType, float], 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TuningParameter'], 'description': 'The number of features to consider when looking for the best split: - If int, then consider `max_features` features at each split. - If float, then `max_features` is a percentage and `int(max_features * n_features)` features are considered at each split. - If \"auto\", then `max_features=sqrt(n_features)`. - If \"sqrt\", then `max_features=sqrt(n_features)`. - If \"log2\", then `max_features=log2(n_features)`. - If None, then `max_features=n_features`. 
Note: the search for a split does not stop until at least one valid partition of the node samples is found, even if it requires to effectively inspect more than ``max_features`` features.', 'configuration': {'calculated': {'type': d3m.metadata.hyperparams.Enumeration, 'default': 'auto', 'structural_type': , 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TuningParameter'], 'values': ['auto', 'sqrt', 'log2']}, 'none': {'type': d3m.metadata.hyperparams.Constant, 'default': None, 'structural_type': , 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TuningParameter']}, 'percent': {'type': d3m.metadata.hyperparams.Bounded, 'default': 0.25, 'structural_type': , 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TuningParameter'], 'lower': 0, 'upper': 1, 'lower_inclusive': False, 'upper_inclusive': True}}, 'p': Union(configuration={calculated: Enumeration(values=['auto', 'sqrt', 'log2'], default=auto), none: Constant(default=None), percent: Bounded(lower=0, upper=1, default=0.25, lower_inclusive=False, upper_inclusive=True)}, default=auto)}}, {'class_name': 'Bounded', 'config': {'type': d3m.metadata.hyperparams.Bounded, 'default': 0.0, 'structural_type': , 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TuningParameter'], 'description': 'A node will be split if this split induces a decrease of the impurity greater than or equal to this value. The weighted impurity decrease equation is the following:: N_t / N * (impurity - N_t_R / N_t * right_impurity - N_t_L / N_t * left_impurity) where ``N`` is the total number of samples, ``N_t`` is the number of samples at the current node, ``N_t_L`` is the number of samples in the left child, and ``N_t_R`` is the number of samples in the right child. ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, if ``sample_weight`` is passed. .. versionadded:: 0.19 ', 'lower': 0.0, 'upper': None, 'lower_inclusive': True, 'upper_inclusive': False, 'p': Bounded(lower=0.0, upper=None, default=0.0, lower_inclusive=True, upper_inclusive=False)}}, {'class_name': 'Enumeration', 'config': {'type': d3m.metadata.hyperparams.Enumeration, 'default': 'bootstrap', 'structural_type': , 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TuningParameter'], 'description': 'Whether bootstrap samples are used when building trees. And whether to use out-of-bag samples to estimate the generalization accuracy.', 'values': ['bootstrap', 'bootstrap_with_oob_score', 'disabled'], 'p': Enumeration(values=['bootstrap', 'bootstrap_with_oob_score', 'disabled'], default=bootstrap)}}, {'class_name': 'UniformBool', 'config': {'type': d3m.metadata.hyperparams.UniformBool, 'default': False, 'structural_type': , 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TuningParameter'], 'description': 'When set to ``True``, reuse the solution of the previous call to fit and add more estimators to the ensemble, otherwise, just fit a whole new forest.', 'p': UniformBool(default=False)}}, {'class_name': 'Union', 'config': {'type': d3m.metadata.hyperparams.Union, 'default': None, 'structural_type': typing.Union[str, NoneType], 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TuningParameter'], 'description': 'Weights associated with classes in the form ``{class_label: weight}``. If not given, all classes are supposed to have weight one. For multi-output problems, a list of dicts can be provided in the same order as the columns of y. 
The \"balanced\" mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as ``n_samples / (n_classes * np.bincount(y))`` The \"balanced_subsample\" mode is the same as \"balanced\" except that weights are computed based on the bootstrap sample for every tree grown. For multi-output, the weights of each column of y will be multiplied. Note that these weights will be multiplied with sample_weight (passed through the fit method) if sample_weight is specified.', 'configuration': {'str': {'type': d3m.metadata.hyperparams.Enumeration, 'default': 'balanced', 'structural_type': , 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TuningParameter'], 'values': ['balanced', 'balanced_subsample']}, 'none': {'type': d3m.metadata.hyperparams.Constant, 'default': None, 'structural_type': , 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TuningParameter']}}, 'p': Union(configuration={str: Enumeration(values=['balanced', 'balanced_subsample'], default=balanced), none: Constant(default=None)}, default=None)}}], 'values': {'pipeline_choice': '6a520746-108c-45bf-a6d8-c875b5a9d326', 'step5/d3m.primitives.data_cleaning.imputer.SKlearn/strategy': 'mean', 'step5/d3m.primitives.data_cleaning.imputer.SKlearn/add_indicator': False, 'step5/d3m.primitives.data_cleaning.imputer.SKlearn/fill_value': None, 'step6/d3m.primitives.feature_selection.skfeature.TAMU/selection_method': 'SPEC', 'step6/d3m.primitives.feature_selection.skfeature.TAMU/percentage_selected_features': 0.5, 'step7/d3m.primitives.classification.random_forest.SKlearn/n_estimators': 10, 'step7/d3m.primitives.classification.random_forest.SKlearn/criterion': 'gini', 'step7/d3m.primitives.classification.random_forest.SKlearn/max_features': 'auto', 'step7/d3m.primitives.classification.random_forest.SKlearn/max_depth': None, 'step7/d3m.primitives.classification.random_forest.SKlearn/min_samples_split': 2, 'step7/d3m.primitives.classification.random_forest.SKlearn/min_samples_leaf': 1, 'step7/d3m.primitives.classification.random_forest.SKlearn/min_weight_fraction_leaf': 0, 'step7/d3m.primitives.classification.random_forest.SKlearn/max_leaf_nodes': None, 'step7/d3m.primitives.classification.random_forest.SKlearn/min_impurity_decrease': 0.0, 'step7/d3m.primitives.classification.random_forest.SKlearn/bootstrap': 'bootstrap', 'step7/d3m.primitives.classification.random_forest.SKlearn/warm_start': False, 'step7/d3m.primitives.classification.random_forest.SKlearn/class_weight': None}}"} \ No newline at end of file diff --git a/axolotl/examples/run.py b/axolotl/examples/run.py new file mode 100644 index 0000000..d82c838 --- /dev/null +++ b/axolotl/examples/run.py @@ -0,0 +1,31 @@ +import os +import time +from pprint import pprint +import pandas as pd +from sklearn.datasets import make_regression + +from d3m import container +from d3m.metadata.pipeline import Pipeline + +from axolotl.utils import data_problem, pipeline as pipeline_utils +from axolotl.backend.simple import SimpleRunner +from axolotl.backend.ray import RayRunner +from axolotl.algorithms.random_search import RandomSearch + +# init runner +#backend = RayRunner(random_seed=42, volumes_dir=None, n_workers=3) +backend = SimpleRunner(random_seed=42, volumes_dir=None) +#time.sleep(30) + +table_path = os.path.join('..', 'tests', 'data', 'datasets', 'iris_dataset_1', 'tables', 'learningData.csv') +df = pd.read_csv(table_path) +dataset, problem_description = data_problem.generate_dataset_problem(df, 
task='binary_classification', target_index=5)
+
+# The search_fit method searches for the best pipeline within the given time budget and then fits the best-ranked pipeline on the input_data.
+search = RandomSearch(problem_description=problem_description, backend=backend)
+
+fitted_pipeline, fitted_pipeline_result = search.search_fit(input_data=[dataset], time_limit=30)
+
+produce_results = search.produce(fitted_pipeline, [dataset])
+
+print(produce_results.output)
diff --git a/axolotl/examples/synthetic_data_bayesian_hp_tunning.ipynb.REMOVED.git-id b/axolotl/examples/synthetic_data_bayesian_hp_tunning.ipynb.REMOVED.git-id
new file mode 100644
index 0000000..7b34017
--- /dev/null
+++ b/axolotl/examples/synthetic_data_bayesian_hp_tunning.ipynb.REMOVED.git-id
@@ -0,0 +1 @@
+0b793ea6bbd8536751fb6941cb70e3ff2ed5739b
\ No newline at end of file
diff --git a/axolotl/failed_installation_repos.txt b/axolotl/failed_installation_repos.txt
new file mode 100644
index 0000000..9e5064f
--- /dev/null
+++ b/axolotl/failed_installation_repos.txt
@@ -0,0 +1,11 @@
+Repository Name: dsbox-primitives
+Package URI: git+https://github.com/usc-isi-i2/dsbox-primitives@390595a708a8702cd6b7b388661127fcf63e4605#egg=dsbox-primitives
+Error: "AttributeError: module 'tensorflow' has no attribute 'get_default_graph'"
+
+Repository Name: distil-primitives
+Package URI: git+https://github.com/uncharted-distil/distil-primitives.git@08065c3e867401e444d8e25177c779fcc3ad5af7#egg=distil-primitives
+Error: "Cannot be installed due to hard dependency on tensorflow-gpu"
+
+Repository Name: kf-d3m-primitives
+Package URI: git+https://github.com/kungfuai/d3m-primitives.git@17ca6cd4e9ca00e09e2cf91e1cb9f18562645821#egg=kf-d3m-primitives
+Error: "Cannot be installed due to hard dependency on tensorflow-gpu"
diff --git a/axolotl/images/Devd3mStart.sh b/axolotl/images/Devd3mStart.sh
new file mode 100755
index 0000000..5d119f6
--- /dev/null
+++ b/axolotl/images/Devd3mStart.sh
@@ -0,0 +1,39 @@
+#!/bin/bash
+
+alias python="python3"
+
+# check if we are on a deployment container or not.
+if [ -d "/user_dev" ]; then
+    cd /user_dev
+    echo "Running on deployment"
+else
+    echo "Running on testing"
+fi
+
+
+# check output_dir
+if [[ -z "$D3MOUTPUTDIR" ]]; then
+    D3MOUTPUTDIR="$(pwd)/output_dir"
+    mkdir -p "$D3MOUTPUTDIR"
+else
+    D3MOUTPUTDIR="$D3MOUTPUTDIR"
+fi
+
+# check if time is set, otherwise we use 1 min
+if [[ -z "$D3MTIMEOUT" ]]; then
+    D3MTIMEOUT="60" # 60 seconds (1 min)
+else
+    D3MTIMEOUT="$D3MTIMEOUT"
+fi
+
+# execute d3m server.
+case $D3MRUN in
+    "standalone")
+        echo "Executing TAMU TA2 Standalone"
+        echo "No standalone supported yet"
+        ;;
+    *)
+        echo "Executing TAMU TA2"
+        python3 -m axolotl.d3m_grpc.server
+        ;;
+esac
diff --git a/axolotl/images/axolotl.dockerfile b/axolotl/images/axolotl.dockerfile
new file mode 100644
index 0000000..a20dd7c
--- /dev/null
+++ b/axolotl/images/axolotl.dockerfile
@@ -0,0 +1,13 @@
+FROM registry.gitlab.com/datadrivendiscovery/images/primitives:ubuntu-bionic-python36-v2020.5.18-20200630-050709
+
+RUN pip3 install -e git+https://gitlab.com/axolotl1/axolotl.git@9619a077e1d06a152fa01f0fca7fa0321dcd3d2c#egg=axolotl
+COPY images/Devd3mStart.sh /user_dev/Devd3mStart.sh
+
+RUN chmod a+x /user_dev/Devd3mStart.sh
+
+ENV D3MRUN ta2ta3
+ENV TOKENIZERS_PARALLELISM false
+
+EXPOSE 45042
+
+ENTRYPOINT ["/user_dev/Devd3mStart.sh"]
diff --git a/axolotl/images/base.dockerfile b/axolotl/images/base.dockerfile
new file mode 100755
index 0000000..51e05b8
--- /dev/null
+++ b/axolotl/images/base.dockerfile
@@ -0,0 +1,3 @@
+FROM registry.gitlab.com/datadrivendiscovery/images/primitives:ubuntu-bionic-python36-v2020.5.18-20200630-050709
+
+RUN apt update
\ No newline at end of file
diff --git a/axolotl/images/build-images.sh b/axolotl/images/build-images.sh
new file mode 100755
index 0000000..8b491f3
--- /dev/null
+++ b/axolotl/images/build-images.sh
@@ -0,0 +1,21 @@
+#!/bin/sh -e
+
+docker login -u gitlab-ci-token -p "$CI_JOB_TOKEN" "$CI_REGISTRY"
+
+for IMAGE_NAME in "$@"; do
+    if [ "$IMAGE_NAME" = "base" ]; then
+        echo "Building "$CI_REGISTRY_IMAGE/$IMAGE_NAME":latest"
+        docker build -t "$CI_REGISTRY_IMAGE/$IMAGE_NAME:latest" -f images/base.dockerfile .
+        echo "Pushing "$CI_REGISTRY_IMAGE/$IMAGE_NAME":latest"
+        docker push "$CI_REGISTRY_IMAGE/$IMAGE_NAME:latest"
+        echo "Done"
+    fi
+
+    if [ "$IMAGE_NAME" = "axolotl" ]; then
+        echo "Building "$CI_REGISTRY_IMAGE/$IMAGE_NAME":latest"
+        docker build -t "$CI_REGISTRY_IMAGE/$IMAGE_NAME:latest" -f images/axolotl.dockerfile .
+ echo "Pushing "$CI_REGISTRY_IMAGE/$IMAGE_NAME":latest" + docker push "$CI_REGISTRY_IMAGE/$IMAGE_NAME:latest" + echo "Done" + fi +done diff --git a/axolotl/run_tests.py b/axolotl/run_tests.py new file mode 100755 index 0000000..16c264a --- /dev/null +++ b/axolotl/run_tests.py @@ -0,0 +1,11 @@ +#!/usr/bin/env python3 + +import sys +import unittest + +runner = unittest.TextTestRunner(verbosity=1) + +tests = unittest.TestLoader().discover('tests') + +if not runner.run(tests).wasSuccessful(): + sys.exit(1) diff --git a/axolotl/setup.py b/axolotl/setup.py new file mode 100644 index 0000000..fccf24f --- /dev/null +++ b/axolotl/setup.py @@ -0,0 +1,53 @@ +import os +import os.path +import sys +from setuptools import setup, find_packages +import subprocess + +PACKAGE_NAME = 'axolotl' +MINIMUM_PYTHON_VERSION = 3, 6 + + +def check_python_version(): + """Exit when the Python version is too low.""" + if sys.version_info < MINIMUM_PYTHON_VERSION: + sys.exit("Python {}.{}+ is required.".format(*MINIMUM_PYTHON_VERSION)) + + +def read_package_variable(key): + """Read the value of a variable from the package without importing.""" + module_path = os.path.join(PACKAGE_NAME, '__init__.py') + with open(module_path) as module: + for line in module: + parts = line.strip().split(' ') + if parts and parts[0] == key: + return parts[-1].strip("'") + raise KeyError("'{0}' not found in '{1}'".format(key, module_path)) + + +check_python_version() +version = read_package_variable('__version__') +description = read_package_variable('__description__') +setup( + name=PACKAGE_NAME, + version=version, + description=version, + + packages=find_packages(exclude=['tests*']), + license='Apache-2.0', + classifiers=[ + 'License :: OSI Approved :: Apache Software License', + ], + install_requires=[ + 'd3m', + 'grpcio', + 'grpcio-tools', + 'grpcio-testing', + 'ray', + 'networkx', + ], + extras_require={ + 'cpu': ['tensorflow==2.2.0'], + 'gpu': ['tensorflow-gpu==2.2.0'] + } +) diff --git a/axolotl/tests/__init__.py b/axolotl/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/axolotl/tests/_server_test.py b/axolotl/tests/_server_test.py new file mode 100644 index 0000000..68e5866 --- /dev/null +++ b/axolotl/tests/_server_test.py @@ -0,0 +1,383 @@ +# from __future__ import print_function + +import argparse +import os +import pathlib +from pprint import pprint + +import grpc +from d3m import utils as d3m_utils, runtime as runtime_module +from d3m.metadata import problem as problem_module +from ta3ta2_api import core_pb2, core_pb2_grpc, value_pb2, utils + +from axolotl.utils import pipeline as pipeline_utils +from axolotl.d3m_grpc import constants + +# with d3m_utils.silence(): +# d3m_index.load_all(blocklist=constants.PrimitivesList.BLACK_LIST) + + +# primitives = [ +# 'd3m.primitives.datasets.DatasetToDataFrame', +# 'd3m.primitives.data_transformation.denormalize.Common' +# ] +# +# with d3m_utils.silence(): +# for primitive in primitives: +# d3m_index.get_primitive(primitive) + + +LENGTH = 60 +ALLOWED_VALUE_TYPES = ['DATASET_URI', 'CSV_URI', 'RAW'] +FULL_SPECIFIED_PIPELINE_PATH = 'modules/server/test_full_pipeline.json' +PRE_SPECIFIED_PIPELINE_PATH = 'modules/server/test_placeholder.json' + + +# PRE_SPECIFIED_PIPELINE_PATH = 'modules/server/test_placeholder_pipeline.json' + + +def hello_request(): + request = core_pb2.HelloRequest() + return request + + +def list_primitives_request(): + request = core_pb2.ListPrimitivesRequest() + return request + + +def search_solutions_request(test_paths, 
specified_template=None): + user_agent = "test_agent" + version = core_pb2.DESCRIPTOR.GetOptions().Extensions[core_pb2.protocol_version] + + time_bound = 0.5 + priority = 10 + # allowed_value_types = [value_pb2.ValueType.Value(value) for value in ALLOWED_VALUE_TYPES] + + problem_description = utils.encode_problem_description( + problem_module.Problem.load(test_paths['TRAIN']['problem']) + ) + + template = None + if specified_template == 'FULL': + with d3m_utils.silence(): + pipeline = pipeline_utils.load_pipeline(FULL_SPECIFIED_PIPELINE_PATH) + template = utils.encode_pipeline_description(pipeline, ALLOWED_VALUE_TYPES, constants.Path.TEMP_STORAGE_ROOT) + elif specified_template == 'PRE': # PRE for PREPROCESSING + pipeline = runtime_module.get_pipeline(PRE_SPECIFIED_PIPELINE_PATH, load_all_primitives=False) + template = utils.encode_pipeline_description(pipeline, ALLOWED_VALUE_TYPES, constants.Path.TEMP_STORAGE_ROOT) + + inputs = [ + value_pb2.Value( + dataset_uri=test_paths['TRAIN']['dataset'] + ) + ] + + request = core_pb2.SearchSolutionsRequest( + user_agent=user_agent, + version=version, + time_bound_search=time_bound, + priority=priority, + allowed_value_types=ALLOWED_VALUE_TYPES, + problem=problem_description, + template=template, + inputs=inputs + ) + return request + + +def get_search_solution_results_request(search_id): + request = core_pb2.GetSearchSolutionsResultsRequest(search_id=search_id) + return request + + +def fit_solution_request(solution_id, test_paths): + inputs = [ + value_pb2.Value( + dataset_uri=test_paths['TRAIN']['dataset'] + ) + ] + expose_outputs = ['outputs.0'] + expose_value_types = ['CSV_URI'] + users = [ + core_pb2.SolutionRunUser( + id='test_user', + chosen=True, + reason='just because' + ) + ] + request = core_pb2.FitSolutionRequest( + solution_id=solution_id, + inputs=inputs, + expose_outputs=expose_outputs, + expose_value_types=expose_value_types, + users=users + ) + return request + + +def get_fit_solution_results_request(request_id): + request = core_pb2.GetFitSolutionResultsRequest( + request_id=request_id + ) + return request + + +def produce_solution_request(fitted_solution_id, test_paths): + inputs = [ + value_pb2.Value( + dataset_uri=test_paths['TEST']['dataset'] + ) + ] + expose_outputs = ['outputs.0'] + expose_value_types = ['CSV_URI'] + + users = [ + core_pb2.SolutionRunUser( + id='test_user', + chosen=True, + reason='just because' + ) + ] + + request = core_pb2.ProduceSolutionRequest( + fitted_solution_id=fitted_solution_id, + inputs=inputs, + expose_outputs=expose_outputs, + expose_value_types=expose_value_types, + users=users + ) + return request + + +def get_produce_solution_results_request(request_id): + request = core_pb2.GetProduceSolutionResultsRequest( + request_id=request_id + ) + return request + + +def describe_solution_request(solution_id): + request = core_pb2.DescribeSolutionRequest( + solution_id=solution_id + ) + return request + + +def score_solution_request(solution_id, test_paths): + inputs = [ + value_pb2.Value( + dataset_uri=test_paths['SCORE']['dataset'] + ) + ] + + problem = problem_module.Problem.load(test_paths['SCORE']['problem']) + performance_metrics = [] + for performance_metric in problem['problem'].get('performance_metrics', []): + performance_metrics.append(utils.encode_performance_metric(performance_metric)) + + # TODO add support for more evaluation methods + users = [] + evaluation_method = 'K_FOLD' + configuration = core_pb2.ScoringConfiguration( + method=evaluation_method, + folds=2, + # train_test_ratio + 
shuffle=True, + random_seed=42, + stratified=True, + ) + request = core_pb2.ScoreSolutionRequest( + solution_id=solution_id, + inputs=inputs, + performance_metrics=performance_metrics, + users=users, + configuration=configuration + ) + return request + + +def get_score_solution_request(solution_id): + request = core_pb2.ScoreSolutionRequest( + solution_id=solution_id + ) + return request + + +def solution_export_request(solution_id): + rank = 0.1 + request = core_pb2.SolutionExportRequest( + solution_id=solution_id, + rank=rank + ) + return request + + +def end_search_solutions_request(search_id): + request = core_pb2.EndSearchSolutionsRequest(search_id=search_id) + return request + + +def stop_search_solution_request(search_id): + request = core_pb2.StopSearchSolutionsRequest(search_id=search_id) + return request + + +def run(test_paths, specified_template=None): + channel = grpc.insecure_channel('localhost:45042') + stub = core_pb2_grpc.CoreStub(channel) + + print_name('Hello') + hello_r = stub.Hello(hello_request()) + pprint(hello_r) + + print_name('ListPrimitive') + list_primitives_r = stub.ListPrimitives(list_primitives_request()) + for _primitive in list_primitives_r.primitives: + print_space() + pprint(_primitive) + + print_name('SearchSolution') + search_solutions_r = stub.SearchSolutions(search_solutions_request(test_paths, specified_template)) + search_id = search_solutions_r.search_id + pprint(search_solutions_r) + + print_name('GetSearchSolutionsResults') + solution_id = None + for get_search_solution_r in stub.GetSearchSolutionsResults(get_search_solution_results_request(search_id)): + print_space() + pprint(get_search_solution_r) + if get_search_solution_r.solution_id: + solution_id = get_search_solution_r.solution_id + + print_name('DescribeSolution') + describe_solution_r = stub.DescribeSolution(describe_solution_request(solution_id)) + pprint(describe_solution_r) + + print_name('FitSolution') + fit_solution_r = stub.FitSolution(fit_solution_request(solution_id, test_paths)) + fit_request_id = fit_solution_r.request_id + pprint(fit_solution_r) + + print_name('GetFitSolutionResultsRequest') + fitted_solution_id = None + for get_git_solution_results_r in stub.GetFitSolutionResults(get_fit_solution_results_request(fit_request_id)): + print_space() + pprint(get_git_solution_results_r) + fitted_solution_id = get_git_solution_results_r.fitted_solution_id + + print_name('ProduceSolutionRequest') + produce_solution_r = stub.ProduceSolution(produce_solution_request(fitted_solution_id, test_paths)) + produce_request_id = produce_solution_r.request_id + pprint(produce_solution_r) + + print_name('GetProduceSolutionResultsRequest') + for get_produce_solution_results_r in stub.GetProduceSolutionResults( + get_produce_solution_results_request(produce_request_id)): + print_space() + pprint(get_produce_solution_results_r) + + print_name('ScoreSolution') + score_solution_r = stub.ScoreSolution(score_solution_request(solution_id, test_paths)) + score_request_id = score_solution_r.request_id + + pprint(score_solution_r) + + print_name('GetScoreSolutionResults') + for score_solution_r in stub.GetScoreSolutionResults(get_score_solution_request(score_request_id)): + print_space() + pprint(score_solution_r) + + print_name('SolutionExport') + solution_export_r = stub.SolutionExport(solution_export_request(solution_id)) + pprint(solution_export_r) + + print_name('StopSearchSolutions') + stop_search_solution_r = stub.StopSearchSolutions(stop_search_solution_request(search_id)) + 
pprint(stop_search_solution_r) + + print_name('EndSearchSolutions') + end_search_solutions_r = stub.EndSearchSolutions(end_search_solutions_request(search_id)) + pprint(end_search_solutions_r) + + +def print_name(name): + length = LENGTH + free_space = length - len(name) - 2 + space = int(free_space / 2) + name = '#' + ' ' * space + name + ' ' * space + if free_space % 2 == 0: + name = name + '#' + else: + name = name + ' #' + + print("#" * length) + print(name) + print("#" * length) + + +def print_space(): + print('-' * LENGTH) + + +def configure_parser(parser, *, skip_arguments=()): + parser.add_argument( + '-t', '--test-path', type=str, default="/D3M/internal_d3m/Winter_2018_tamuta2/datasets/26/", + help="path of d3m dataset to test." + ) + + +def get_problem_id(test_path): + problem_description = problem_module.Problem.load(test_path) + print(problem_description) + problem_id = problem_description.get('id', None) + return problem_id + + +def get_paths(test_path): + # Classification Score dataset path is (problem_SCORE, dataset_SCORE) not + # However, regression and other Score dataset path is (problem_TEST, dataset_TEST) + score_problem_relative_path = os.path.join(test_path, 'SCORE/problem_SCORE/problemDoc.json') + score_dataset_relative_path = os.path.join(test_path, 'SCORE/dataset_SCORE/datasetDoc.json') + + if not os.path.exists(score_problem_relative_path) or not os.path.exists(score_dataset_relative_path): + score_problem_relative_path = os.path.join(test_path, 'SCORE/problem_TEST/problemDoc.json') + score_dataset_relative_path = os.path.join(test_path, 'SCORE/dataset_TEST/datasetDoc.json') + + test_paths = { + 'TRAIN': { + 'dataset': os.path.join(test_path, 'TRAIN/dataset_TRAIN/datasetDoc.json'), + 'problem': pathlib.Path( + os.path.abspath(os.path.join(test_path, 'TRAIN/problem_TRAIN/problemDoc.json'))).as_uri() + }, + 'TEST': { + 'dataset': os.path.join(test_path, 'TEST/dataset_TEST/datasetDoc.json'), + 'problem': pathlib.Path( + os.path.abspath(os.path.join(test_path, 'TEST/problem_TEST/problemDoc.json'))).as_uri() + }, + 'SCORE': { + 'dataset': os.path.join(test_path, score_dataset_relative_path), + 'problem': pathlib.Path(os.path.abspath(score_problem_relative_path)).as_uri() + }, + } + return test_paths + + +if __name__ == '__main__': + # Creating parser + parser = argparse.ArgumentParser(description="Test from command line") + configure_parser(parser) + arguments = parser.parse_args() + + # Getting test root path + test_path = arguments.test_path + + # Getting test paths train/test/score + test_paths = get_paths(test_path) + + # Getting problem id + test_id = get_problem_id(test_paths['TEST']['problem']) + + print_name('Starting Test: ' + test_id) + run(test_paths, None) + print_name('Finishing Test: ' + test_id) diff --git a/axolotl/tests/data/.gitignore b/axolotl/tests/data/.gitignore new file mode 100644 index 0000000..94d5afd --- /dev/null +++ b/axolotl/tests/data/.gitignore @@ -0,0 +1,10 @@ +*.pyc +__pycache__ +.DS_Store +.ipynb_checkpoints +.cache +.idea +*.egg-info +.mypy_cache +dist +build diff --git a/axolotl/tests/data/.gitlab-ci.yml b/axolotl/tests/data/.gitlab-ci.yml new file mode 100644 index 0000000..499ae55 --- /dev/null +++ b/axolotl/tests/data/.gitlab-ci.yml @@ -0,0 +1,42 @@ +build_summing_image: + stage: build + + image: docker:stable + + services: + - docker:dind + + before_script: + - docker info + + script: + - docker login -u gitlab-ci-token -p "$CI_JOB_TOKEN" "$CI_REGISTRY" + - docker build --cache-from="$CI_REGISTRY_IMAGE/summing:latest" -t 
"$CI_REGISTRY_IMAGE/summing:latest" docker/summing + - docker push "$CI_REGISTRY_IMAGE/summing:latest" + + only: + - master + +style_check: + stage: build + + image: registry.gitlab.com/datadrivendiscovery/images/testing:ubuntu-bionic-python36 + + script: + - pycodestyle primitives/test_primitives + +type_check: + stage: build + + image: registry.gitlab.com/datadrivendiscovery/images/testing:ubuntu-bionic-python36 + + variables: + DEPENDENCY_REF: devel + + script: + - cd primitives + - git clone https://gitlab.com/datadrivendiscovery/d3m.git + - cd d3m + - git checkout ${DEPENDENCY_REF} + - cd .. + - MYPYPATH=d3m mypy test_primitives diff --git a/axolotl/tests/data/README.md b/axolotl/tests/data/README.md new file mode 100644 index 0000000..9efeea5 --- /dev/null +++ b/axolotl/tests/data/README.md @@ -0,0 +1,10 @@ +# Data used for tests + +This repository contains data used for tests across multiple other repositories. + +## About Data Driven Discovery Program + +DARPA Data Driven Discovery (D3M) Program is researching ways to get machines to build +machine learning pipelines automatically. It is split into three layers: +TA1 (primitives), TA2 (systems which combine primitives automatically into pipelines +and executes them), and TA3 (end-users interfaces). diff --git a/axolotl/tests/data/add.sh b/axolotl/tests/data/add.sh new file mode 100755 index 0000000..fd92052 --- /dev/null +++ b/axolotl/tests/data/add.sh @@ -0,0 +1,20 @@ +#!/bin/bash -e + +# Assumption is that this repository is cloned into "d3m-test-data" directory +# which is a sibling of "d3m-primitives" directory. + +for PRIMITIVE in d3m.primitives.regression.monomial.Test \ + d3m.primitives.operator.increment.Test \ + d3m.primitives.operator.sum.Test \ + d3m.primitives.data_generation.random.Test \ + d3m.primitives.operator.primitive_sum.Test \ + d3m.primitives.operator.null.TransformerTest \ + d3m.primitives.operator.null.UnsupervisedLearnerTest \ + d3m.primitives.classification.random_classifier.Test \ + d3m.primitives.evaluation.compute_scores.Test ; do + echo $PRIMITIVE + python -m d3m primitive describe -i 4 $PRIMITIVE > primitive.json + pushd ../d3m-primitives + ./add.py ../d3m-tests-data/primitive.json + popd +done diff --git a/axolotl/tests/data/datasets/audio_dataset_1/datasetDoc.json b/axolotl/tests/data/datasets/audio_dataset_1/datasetDoc.json new file mode 100644 index 0000000..ff57fa7 --- /dev/null +++ b/axolotl/tests/data/datasets/audio_dataset_1/datasetDoc.json @@ -0,0 +1,82 @@ +{ + "about": { + "datasetID": "audio_dataset_1", + "datasetName": "Audio dataset to be used for tests", + "license": "CC0", + "datasetSchemaVersion": "4.0.0", + "redacted": false, + "datasetVersion": "4.0.0", + "digest": "4eaa4ee8ce18dc066d400d756105aab1ce92895593d09c8be23e08fdd89640e1" + }, + "dataResources": [ + { + "resID": "0", + "resPath": "media/", + "resType": "audio", + "resFormat": { + "audio/mpeg": [ + "mp3" + ] + }, + "isCollection": true + }, + { + "resID": "learningData", + "resPath": "tables/learningData.csv", + "resType": "table", + "resFormat": { + "text/csv": [ + "csv" + ] + }, + "isCollection": false, + "columnsCount": 5, + "columns": [ + { + "colIndex": 0, + "colName": "d3mIndex", + "colType": "integer", + "role": [ + "index" + ] + }, + { + "colIndex": 1, + "colName": "audio_file", + "colType": "string", + "role": [ + "attribute" + ], + "refersTo": { + "resID": "0", + "resObject": "item" + } + }, + { + "colIndex": 2, + "colName": "start", + "colType": "real", + "role": [ + "boundaryIndicator" + ] + }, + { + "colIndex": 
3, + "colName": "end", + "colType": "real", + "role": [ + "boundaryIndicator" + ] + }, + { + "colIndex": 4, + "colName": "class", + "colType": "categorical", + "role": [ + "suggestedTarget" + ] + } + ] + } + ] +} \ No newline at end of file diff --git a/axolotl/tests/data/datasets/audio_dataset_1/media/test_audio.mp3 b/axolotl/tests/data/datasets/audio_dataset_1/media/test_audio.mp3 new file mode 100644 index 0000000..a18ba6c Binary files /dev/null and b/axolotl/tests/data/datasets/audio_dataset_1/media/test_audio.mp3 differ diff --git a/axolotl/tests/data/datasets/audio_dataset_1/tables/learningData.csv b/axolotl/tests/data/datasets/audio_dataset_1/tables/learningData.csv new file mode 100644 index 0000000..c92d98d --- /dev/null +++ b/axolotl/tests/data/datasets/audio_dataset_1/tables/learningData.csv @@ -0,0 +1,2 @@ +d3mIndex,audio_file,start,end,class +0,test_audio.mp3,0.007,0.008,test diff --git a/axolotl/tests/data/datasets/boston_dataset_1/datasetDoc.json b/axolotl/tests/data/datasets/boston_dataset_1/datasetDoc.json new file mode 100644 index 0000000..cd8fae2 --- /dev/null +++ b/axolotl/tests/data/datasets/boston_dataset_1/datasetDoc.json @@ -0,0 +1,164 @@ +{ + "about": { + "datasetID": "boston_dataset_1", + "datasetName": "Boston Dataset", + "description": "The Boston house-price data of Harrison, D. and Rubinfeld, D.L. 'Hedonic prices and the demand for clean air', J. Environ. Economics & Management, vol.5, 81-102, 1978. Used in Belsley, Kuh & Welsch, 'Regression diagnostics ...', Wiley, 1980. N.B. Various transformations are used in the table on pages 244-261 of the latter.", + "datasetVersion": "4.0.0", + "datasetSchemaVersion": "4.0.0", + "sourceURI": "http://lib.stat.cmu.edu/datasets/boston", + "license": "unknown", + "digest": "7797ade70da006a47c32db5dd24be51a6956dbf8d600c4720e53576d32e451e6" + }, + "dataResources": [ + { + "resID": "learningData", + "resPath": "tables/learningData.csv", + "resType": "table", + "resFormat": { + "text/csv": [ + "csv" + ] + }, + "isCollection": false, + "columnsCount": 15, + "columns": [ + { + "colIndex": 0, + "colName": "d3mIndex", + "colType": "integer", + "role": [ + "index" + ] + }, + { + "colIndex": 1, + "colName": "CRIM", + "colDescription": "per capita crime rate by town", + "colType": "real", + "role": [ + "attribute" + ] + }, + { + "colIndex": 2, + "colName": "ZN", + "colDescription": "proportion of residential land zoned for lots over 25,000 sq.ft.", + "colType": "real", + "role": [ + "attribute" + ] + }, + { + "colIndex": 3, + "colName": "INDUS", + "colDescription": "proportion of non-retail business acres per town", + "colType": "real", + "role": [ + "attribute" + ] + }, + { + "colIndex": 4, + "colName": "CHAS", + "colDescription": "Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)", + "colType": "boolean", + "role": [ + "attribute" + ] + }, + { + "colIndex": 5, + "colName": "NOX", + "colDescription": "nitric oxides concentration (parts per 10 million)", + "colType": "real", + "role": [ + "attribute", + "suggestedTarget" + ] + }, + { + "colIndex": 6, + "colName": "RM", + "colDescription": "average number of rooms per dwelling", + "colType": "real", + "role": [ + "attribute" + ] + }, + { + "colIndex": 7, + "colName": "AGE", + "colDescription": "proportion of owner-occupied units built prior to 1940", + "colType": "real", + "role": [ + "attribute" + ] + }, + { + "colIndex": 8, + "colName": "DIS", + "colDescription": "weighted distances to five Boston employment centres", + "colType": "real", + "role": [ + 
"attribute" + ] + }, + { + "colIndex": 9, + "colName": "RAD", + "colDescription": "index of accessibility to radial highways", + "colType": "categorical", + "role": [ + "attribute" + ] + }, + { + "colIndex": 10, + "colName": "TAX", + "colDescription": "full-value property-tax rate per $10,000", + "colType": "integer", + "role": [ + "attribute" + ] + }, + { + "colIndex": 11, + "colName": "PTRATIO", + "colDescription": "pupil-teacher ratio by town", + "colType": "real", + "role": [ + "attribute" + ] + }, + { + "colIndex": 12, + "colName": "B", + "colDescription": "1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town", + "colType": "real", + "role": [ + "attribute" + ] + }, + { + "colIndex": 13, + "colName": "LSTAT", + "colDescription": "% lower status of the population", + "colType": "real", + "role": [ + "attribute" + ] + }, + { + "colIndex": 14, + "colName": "MEDV", + "colDescription": "Median value of owner-occupied homes in $1000's\n", + "colType": "real", + "role": [ + "attribute", + "suggestedTarget" + ] + } + ] + } + ] +} \ No newline at end of file diff --git a/axolotl/tests/data/datasets/boston_dataset_1/tables/learningData.csv b/axolotl/tests/data/datasets/boston_dataset_1/tables/learningData.csv new file mode 100644 index 0000000..30897fc --- /dev/null +++ b/axolotl/tests/data/datasets/boston_dataset_1/tables/learningData.csv @@ -0,0 +1,507 @@ +d3mIndex,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV +0,0.00632,18,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24 +1,0.02731,0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6 +2,0.02729,0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7 +3,0.03237,0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4 +4,0.06905,0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2 +5,0.02985,0,2.18,0,0.458,6.43,58.7,6.0622,3,222,18.7,394.12,5.21,28.7 +6,0.08829,12.5,7.87,0,0.524,6.012,66.6,5.5605,5,311,15.2,395.6,12.43,22.9 +7,0.14455,12.5,7.87,0,0.524,6.172,96.1,5.9505,5,311,15.2,396.9,19.15,27.1 +8,0.21124,12.5,7.87,0,0.524,5.631,100,6.0821,5,311,15.2,386.63,29.93,16.5 +9,0.17004,12.5,7.87,0,0.524,6.004,85.9,6.5921,5,311,15.2,386.71,17.1,18.9 +10,0.22489,12.5,7.87,0,0.524,6.377,94.3,6.3467,5,311,15.2,392.52,20.45,15 +11,0.11747,12.5,7.87,0,0.524,6.009,82.9,6.2267,5,311,15.2,396.9,13.27,18.9 +12,0.09378,12.5,7.87,0,0.524,5.889,39,5.4509,5,311,15.2,390.5,15.71,21.7 +13,0.62976,0,8.14,0,0.538,5.949,61.8,4.7075,4,307,21,396.9,8.26,20.4 +14,0.63796,0,8.14,0,0.538,6.096,84.5,4.4619,4,307,21,380.02,10.26,18.2 +15,0.62739,0,8.14,0,0.538,5.834,56.5,4.4986,4,307,21,395.62,8.47,19.9 +16,1.05393,0,8.14,0,0.538,5.935,29.3,4.4986,4,307,21,386.85,6.58,23.1 +17,0.7842,0,8.14,0,0.538,5.99,81.7,4.2579,4,307,21,386.75,14.67,17.5 +18,0.80271,0,8.14,0,0.538,5.456,36.6,3.7965,4,307,21,288.99,11.69,20.2 +19,0.7258,0,8.14,0,0.538,5.727,69.5,3.7965,4,307,21,390.95,11.28,18.2 +20,1.25179,0,8.14,0,0.538,5.57,98.1,3.7979,4,307,21,376.57,21.02,13.6 +21,0.85204,0,8.14,0,0.538,5.965,89.2,4.0123,4,307,21,392.53,13.83,19.6 +22,1.23247,0,8.14,0,0.538,6.142,91.7,3.9769,4,307,21,396.9,18.72,15.2 +23,0.98843,0,8.14,0,0.538,5.813,100,4.0952,4,307,21,394.54,19.88,14.5 +24,0.75026,0,8.14,0,0.538,5.924,94.1,4.3996,4,307,21,394.33,16.3,15.6 +25,0.84054,0,8.14,0,0.538,5.599,85.7,4.4546,4,307,21,303.42,16.51,13.9 +26,0.67191,0,8.14,0,0.538,5.813,90.3,4.682,4,307,21,376.88,14.81,16.6 +27,0.95577,0,8.14,0,0.538,6.047,88.8,4.4534,4,307,21,306.38,17.28,14.8 +28,0.77299,0,8.14,0,0.538,6.495,94.4,4.4547,4,307,21,387.94,12.8,18.4 
+29,1.00245,0,8.14,0,0.538,6.674,87.3,4.239,4,307,21,380.23,11.98,21 +30,1.13081,0,8.14,0,0.538,5.713,94.1,4.233,4,307,21,360.17,22.6,12.7 +31,1.35472,0,8.14,0,0.538,6.072,100,4.175,4,307,21,376.73,13.04,14.5 +32,1.38799,0,8.14,0,0.538,5.95,82,3.99,4,307,21,232.6,27.71,13.2 +33,1.15172,0,8.14,0,0.538,5.701,95,3.7872,4,307,21,358.77,18.35,13.1 +34,1.61282,0,8.14,0,0.538,6.096,96.9,3.7598,4,307,21,248.31,20.34,13.5 +35,0.06417,0,5.96,0,0.499,5.933,68.2,3.3603,5,279,19.2,396.9,9.68,18.9 +36,0.09744,0,5.96,0,0.499,5.841,61.4,3.3779,5,279,19.2,377.56,11.41,20 +37,0.08014,0,5.96,0,0.499,5.85,41.5,3.9342,5,279,19.2,396.9,8.77,21 +38,0.17505,0,5.96,0,0.499,5.966,30.2,3.8473,5,279,19.2,393.43,10.13,24.7 +39,0.02763,75,2.95,0,0.428,6.595,21.8,5.4011,3,252,18.3,395.63,4.32,30.8 +40,0.03359,75,2.95,0,0.428,7.024,15.8,5.4011,3,252,18.3,395.62,1.98,34.9 +41,0.12744,0,6.91,0,0.448,6.77,2.9,5.7209,3,233,17.9,385.41,4.84,26.6 +42,0.1415,0,6.91,0,0.448,6.169,6.6,5.7209,3,233,17.9,383.37,5.81,25.3 +43,0.15936,0,6.91,0,0.448,6.211,6.5,5.7209,3,233,17.9,394.46,7.44,24.7 +44,0.12269,0,6.91,0,0.448,6.069,40,5.7209,3,233,17.9,389.39,9.55,21.2 +45,0.17142,0,6.91,0,0.448,5.682,33.8,5.1004,3,233,17.9,396.9,10.21,19.3 +46,0.18836,0,6.91,0,0.448,5.786,33.3,5.1004,3,233,17.9,396.9,14.15,20 +47,0.22927,0,6.91,0,0.448,6.03,85.5,5.6894,3,233,17.9,392.74,18.8,16.6 +48,0.25387,0,6.91,0,0.448,5.399,95.3,5.87,3,233,17.9,396.9,30.81,14.4 +49,0.21977,0,6.91,0,0.448,5.602,62,6.0877,3,233,17.9,396.9,16.2,19.4 +50,0.08873,21,5.64,0,0.439,5.963,45.7,6.8147,4,243,16.8,395.56,13.45,19.7 +51,0.04337,21,5.64,0,0.439,6.115,63,6.8147,4,243,16.8,393.97,9.43,20.5 +52,0.0536,21,5.64,0,0.439,6.511,21.1,6.8147,4,243,16.8,396.9,5.28,25 +53,0.04981,21,5.64,0,0.439,5.998,21.4,6.8147,4,243,16.8,396.9,8.43,23.4 +54,0.0136,75,4,0,0.41,5.888,47.6,7.3197,3,469,21.1,396.9,14.8,18.9 +55,0.01311,90,1.22,0,0.403,7.249,21.9,8.6966,5,226,17.9,395.93,4.81,35.4 +56,0.02055,85,0.74,0,0.41,6.383,35.7,9.1876,2,313,17.3,396.9,5.77,24.7 +57,0.01432,100,1.32,0,0.411,6.816,40.5,8.3248,5,256,15.1,392.9,3.95,31.6 +58,0.15445,25,5.13,0,0.453,6.145,29.2,7.8148,8,284,19.7,390.68,6.86,23.3 +59,0.10328,25,5.13,0,0.453,5.927,47.2,6.932,8,284,19.7,396.9,9.22,19.6 +60,0.14932,25,5.13,0,0.453,5.741,66.2,7.2254,8,284,19.7,395.11,13.15,18.7 +61,0.17171,25,5.13,0,0.453,5.966,93.4,6.8185,8,284,19.7,378.08,14.44,16 +62,0.11027,25,5.13,0,0.453,6.456,67.8,7.2255,8,284,19.7,396.9,6.73,22.2 +63,0.1265,25,5.13,0,0.453,6.762,43.4,7.9809,8,284,19.7,395.58,9.5,25 +64,0.01951,17.5,1.38,0,0.4161,7.104,59.5,9.2229,3,216,18.6,393.24,8.05,33 +65,0.03584,80,3.37,0,0.398,6.29,17.8,6.6115,4,337,16.1,396.9,4.67,23.5 +66,0.04379,80,3.37,0,0.398,5.787,31.1,6.6115,4,337,16.1,396.9,10.24,19.4 +67,0.05789,12.5,6.07,0,0.409,5.878,21.4,6.498,4,345,18.9,396.21,8.1,22 +68,0.13554,12.5,6.07,0,0.409,5.594,36.8,6.498,4,345,18.9,396.9,13.09,17.4 +69,0.12816,12.5,6.07,0,0.409,5.885,33,6.498,4,345,18.9,396.9,8.79,20.9 +70,0.08826,0,10.81,0,0.413,6.417,6.6,5.2873,4,305,19.2,383.73,6.72,24.2 +71,0.15876,0,10.81,0,0.413,5.961,17.5,5.2873,4,305,19.2,376.94,9.88,21.7 +72,0.09164,0,10.81,0,0.413,6.065,7.8,5.2873,4,305,19.2,390.91,5.52,22.8 +73,0.19539,0,10.81,0,0.413,6.245,6.2,5.2873,4,305,19.2,377.17,7.54,23.4 +74,0.07896,0,12.83,0,0.437,6.273,6,4.2515,5,398,18.7,394.92,6.78,24.1 +75,0.09512,0,12.83,0,0.437,6.286,45,4.5026,5,398,18.7,383.23,8.94,21.4 +76,0.10153,0,12.83,0,0.437,6.279,74.5,4.0522,5,398,18.7,373.66,11.97,20 +77,0.08707,0,12.83,0,0.437,6.14,45.8,4.0905,5,398,18.7,386.96,10.27,20.8 
+78,0.05646,0,12.83,0,0.437,6.232,53.7,5.0141,5,398,18.7,386.4,12.34,21.2 +79,0.08387,0,12.83,0,0.437,5.874,36.6,4.5026,5,398,18.7,396.06,9.1,20.3 +80,0.04113,25,4.86,0,0.426,6.727,33.5,5.4007,4,281,19,396.9,5.29,28 +81,0.04462,25,4.86,0,0.426,6.619,70.4,5.4007,4,281,19,395.63,7.22,23.9 +82,0.03659,25,4.86,0,0.426,6.302,32.2,5.4007,4,281,19,396.9,6.72,24.8 +83,0.03551,25,4.86,0,0.426,6.167,46.7,5.4007,4,281,19,390.64,7.51,22.9 +84,0.05059,0,4.49,0,0.449,6.389,48,4.7794,3,247,18.5,396.9,9.62,23.9 +85,0.05735,0,4.49,0,0.449,6.63,56.1,4.4377,3,247,18.5,392.3,6.53,26.6 +86,0.05188,0,4.49,0,0.449,6.015,45.1,4.4272,3,247,18.5,395.99,12.86,22.5 +87,0.07151,0,4.49,0,0.449,6.121,56.8,3.7476,3,247,18.5,395.15,8.44,22.2 +88,0.0566,0,3.41,0,0.489,7.007,86.3,3.4217,2,270,17.8,396.9,5.5,23.6 +89,0.05302,0,3.41,0,0.489,7.079,63.1,3.4145,2,270,17.8,396.06,5.7,28.7 +90,0.04684,0,3.41,0,0.489,6.417,66.1,3.0923,2,270,17.8,392.18,8.81,22.6 +91,0.03932,0,3.41,0,0.489,6.405,73.9,3.0921,2,270,17.8,393.55,8.2,22 +92,0.04203,28,15.04,0,0.464,6.442,53.6,3.6659,4,270,18.2,395.01,8.16,22.9 +93,0.02875,28,15.04,0,0.464,6.211,28.9,3.6659,4,270,18.2,396.33,6.21,25 +94,0.04294,28,15.04,0,0.464,6.249,77.3,3.615,4,270,18.2,396.9,10.59,20.6 +95,0.12204,0,2.89,0,0.445,6.625,57.8,3.4952,2,276,18,357.98,6.65,28.4 +96,0.11504,0,2.89,0,0.445,6.163,69.6,3.4952,2,276,18,391.83,11.34,21.4 +97,0.12083,0,2.89,0,0.445,8.069,76,3.4952,2,276,18,396.9,4.21,38.7 +98,0.08187,0,2.89,0,0.445,7.82,36.9,3.4952,2,276,18,393.53,3.57,43.8 +99,0.0686,0,2.89,0,0.445,7.416,62.5,3.4952,2,276,18,396.9,6.19,33.2 +100,0.14866,0,8.56,0,0.52,6.727,79.9,2.7778,5,384,20.9,394.76,9.42,27.5 +101,0.11432,0,8.56,0,0.52,6.781,71.3,2.8561,5,384,20.9,395.58,7.67,26.5 +102,0.22876,0,8.56,0,0.52,6.405,85.4,2.7147,5,384,20.9,70.8,10.63,18.6 +103,0.21161,0,8.56,0,0.52,6.137,87.4,2.7147,5,384,20.9,394.47,13.44,19.3 +104,0.1396,0,8.56,0,0.52,6.167,90,2.421,5,384,20.9,392.69,12.33,20.1 +105,0.13262,0,8.56,0,0.52,5.851,96.7,2.1069,5,384,20.9,394.05,16.47,19.5 +106,0.1712,0,8.56,0,0.52,5.836,91.9,2.211,5,384,20.9,395.67,18.66,19.5 +107,0.13117,0,8.56,0,0.52,6.127,85.2,2.1224,5,384,20.9,387.69,14.09,20.4 +108,0.12802,0,8.56,0,0.52,6.474,97.1,2.4329,5,384,20.9,395.24,12.27,19.8 +109,0.26363,0,8.56,0,0.52,6.229,91.2,2.5451,5,384,20.9,391.23,15.55,19.4 +110,0.10793,0,8.56,0,0.52,6.195,54.4,2.7778,5,384,20.9,393.49,13,21.7 +111,0.10084,0,10.01,0,0.547,6.715,81.6,2.6775,6,432,17.8,395.59,10.16,22.8 +112,0.12329,0,10.01,0,0.547,5.913,92.9,2.3534,6,432,17.8,394.95,16.21,18.8 +113,0.22212,0,10.01,0,0.547,6.092,95.4,2.548,6,432,17.8,396.9,17.09,18.7 +114,0.14231,0,10.01,0,0.547,6.254,84.2,2.2565,6,432,17.8,388.74,10.45,18.5 +115,0.17134,0,10.01,0,0.547,5.928,88.2,2.4631,6,432,17.8,344.91,15.76,18.3 +116,0.13158,0,10.01,0,0.547,6.176,72.5,2.7301,6,432,17.8,393.3,12.04,21.2 +117,0.15098,0,10.01,0,0.547,6.021,82.6,2.7474,6,432,17.8,394.51,10.3,19.2 +118,0.13058,0,10.01,0,0.547,5.872,73.1,2.4775,6,432,17.8,338.63,15.37,20.4 +119,0.14476,0,10.01,0,0.547,5.731,65.2,2.7592,6,432,17.8,391.5,13.61,19.3 +120,0.06899,0,25.65,0,0.581,5.87,69.7,2.2577,2,188,19.1,389.15,14.37,22 +121,0.07165,0,25.65,0,0.581,6.004,84.1,2.1974,2,188,19.1,377.67,14.27,20.3 +122,0.09299,0,25.65,0,0.581,5.961,92.9,2.0869,2,188,19.1,378.09,17.93,20.5 +123,0.15038,0,25.65,0,0.581,5.856,97,1.9444,2,188,19.1,370.31,25.41,17.3 +124,0.09849,0,25.65,0,0.581,5.879,95.8,2.0063,2,188,19.1,379.38,17.58,18.8 +125,0.16902,0,25.65,0,0.581,5.986,88.4,1.9929,2,188,19.1,385.02,14.81,21.4 
+126,0.38735,0,25.65,0,0.581,5.613,95.6,1.7572,2,188,19.1,359.29,27.26,15.7 +127,0.25915,0,21.89,0,0.624,5.693,96,1.7883,4,437,21.2,392.11,17.19,16.2 +128,0.32543,0,21.89,0,0.624,6.431,98.8,1.8125,4,437,21.2,396.9,15.39,18 +129,0.88125,0,21.89,0,0.624,5.637,94.7,1.9799,4,437,21.2,396.9,18.34,14.3 +130,0.34006,0,21.89,0,0.624,6.458,98.9,2.1185,4,437,21.2,395.04,12.6,19.2 +131,1.19294,0,21.89,0,0.624,6.326,97.7,2.271,4,437,21.2,396.9,12.26,19.6 +132,0.59005,0,21.89,0,0.624,6.372,97.9,2.3274,4,437,21.2,385.76,11.12,23 +133,0.32982,0,21.89,0,0.624,5.822,95.4,2.4699,4,437,21.2,388.69,15.03,18.4 +134,0.97617,0,21.89,0,0.624,5.757,98.4,2.346,4,437,21.2,262.76,17.31,15.6 +135,0.55778,0,21.89,0,0.624,6.335,98.2,2.1107,4,437,21.2,394.67,16.96,18.1 +136,0.32264,0,21.89,0,0.624,5.942,93.5,1.9669,4,437,21.2,378.25,16.9,17.4 +137,0.35233,0,21.89,0,0.624,6.454,98.4,1.8498,4,437,21.2,394.08,14.59,17.1 +138,0.2498,0,21.89,0,0.624,5.857,98.2,1.6686,4,437,21.2,392.04,21.32,13.3 +139,0.54452,0,21.89,0,0.624,6.151,97.9,1.6687,4,437,21.2,396.9,18.46,17.8 +140,0.2909,0,21.89,0,0.624,6.174,93.6,1.6119,4,437,21.2,388.08,24.16,14 +141,1.62864,0,21.89,0,0.624,5.019,100,1.4394,4,437,21.2,396.9,34.41,14.4 +142,3.32105,0,19.58,1,0.871,5.403,100,1.3216,5,403,14.7,396.9,26.82,13.4 +143,4.0974,0,19.58,0,0.871,5.468,100,1.4118,5,403,14.7,396.9,26.42,15.6 +144,2.77974,0,19.58,0,0.871,4.903,97.8,1.3459,5,403,14.7,396.9,29.29,11.8 +145,2.37934,0,19.58,0,0.871,6.13,100,1.4191,5,403,14.7,172.91,27.8,13.8 +146,2.15505,0,19.58,0,0.871,5.628,100,1.5166,5,403,14.7,169.27,16.65,15.6 +147,2.36862,0,19.58,0,0.871,4.926,95.7,1.4608,5,403,14.7,391.71,29.53,14.6 +148,2.33099,0,19.58,0,0.871,5.186,93.8,1.5296,5,403,14.7,356.99,28.32,17.8 +149,2.73397,0,19.58,0,0.871,5.597,94.9,1.5257,5,403,14.7,351.85,21.45,15.4 +150,1.6566,0,19.58,0,0.871,6.122,97.3,1.618,5,403,14.7,372.8,14.1,21.5 +151,1.49632,0,19.58,0,0.871,5.404,100,1.5916,5,403,14.7,341.6,13.28,19.6 +152,1.12658,0,19.58,1,0.871,5.012,88,1.6102,5,403,14.7,343.28,12.12,15.3 +153,2.14918,0,19.58,0,0.871,5.709,98.5,1.6232,5,403,14.7,261.95,15.79,19.4 +154,1.41385,0,19.58,1,0.871,6.129,96,1.7494,5,403,14.7,321.02,15.12,17 +155,3.53501,0,19.58,1,0.871,6.152,82.6,1.7455,5,403,14.7,88.01,15.02,15.6 +156,2.44668,0,19.58,0,0.871,5.272,94,1.7364,5,403,14.7,88.63,16.14,13.1 +157,1.22358,0,19.58,0,0.605,6.943,97.4,1.8773,5,403,14.7,363.43,4.59,41.3 +158,1.34284,0,19.58,0,0.605,6.066,100,1.7573,5,403,14.7,353.89,6.43,24.3 +159,1.42502,0,19.58,0,0.871,6.51,100,1.7659,5,403,14.7,364.31,7.39,23.3 +160,1.27346,0,19.58,1,0.605,6.25,92.6,1.7984,5,403,14.7,338.92,5.5,27 +161,1.46336,0,19.58,0,0.605,7.489,90.8,1.9709,5,403,14.7,374.43,1.73,50 +162,1.83377,0,19.58,1,0.605,7.802,98.2,2.0407,5,403,14.7,389.61,1.92,50 +163,1.51902,0,19.58,1,0.605,8.375,93.9,2.162,5,403,14.7,388.45,3.32,50 +164,2.24236,0,19.58,0,0.605,5.854,91.8,2.422,5,403,14.7,395.11,11.64,22.7 +165,2.924,0,19.58,0,0.605,6.101,93,2.2834,5,403,14.7,240.16,9.81,25 +166,2.01019,0,19.58,0,0.605,7.929,96.2,2.0459,5,403,14.7,369.3,3.7,50 +167,1.80028,0,19.58,0,0.605,5.877,79.2,2.4259,5,403,14.7,227.61,12.14,23.8 +168,2.3004,0,19.58,0,0.605,6.319,96.1,2.1,5,403,14.7,297.09,11.1,23.8 +169,2.44953,0,19.58,0,0.605,6.402,95.2,2.2625,5,403,14.7,330.04,11.32,22.3 +170,1.20742,0,19.58,0,0.605,5.875,94.6,2.4259,5,403,14.7,292.29,14.43,17.4 +171,2.3139,0,19.58,0,0.605,5.88,97.3,2.3887,5,403,14.7,348.13,12.03,19.1 +172,0.13914,0,4.05,0,0.51,5.572,88.5,2.5961,5,296,16.6,396.9,14.69,23.1 +173,0.09178,0,4.05,0,0.51,6.416,84.1,2.6463,5,296,16.6,395.5,9.04,23.6 
+174,0.08447,0,4.05,0,0.51,5.859,68.7,2.7019,5,296,16.6,393.23,9.64,22.6 +175,0.06664,0,4.05,0,0.51,6.546,33.1,3.1323,5,296,16.6,390.96,5.33,29.4 +176,0.07022,0,4.05,0,0.51,6.02,47.2,3.5549,5,296,16.6,393.23,10.11,23.2 +177,0.05425,0,4.05,0,0.51,6.315,73.4,3.3175,5,296,16.6,395.6,6.29,24.6 +178,0.06642,0,4.05,0,0.51,6.86,74.4,2.9153,5,296,16.6,391.27,6.92,29.9 +179,0.0578,0,2.46,0,0.488,6.98,58.4,2.829,3,193,17.8,396.9,5.04,37.2 +180,0.06588,0,2.46,0,0.488,7.765,83.3,2.741,3,193,17.8,395.56,7.56,39.8 +181,0.06888,0,2.46,0,0.488,6.144,62.2,2.5979,3,193,17.8,396.9,9.45,36.2 +182,0.09103,0,2.46,0,0.488,7.155,92.2,2.7006,3,193,17.8,394.12,4.82,37.9 +183,0.10008,0,2.46,0,0.488,6.563,95.6,2.847,3,193,17.8,396.9,5.68,32.5 +184,0.08308,0,2.46,0,0.488,5.604,89.8,2.9879,3,193,17.8,391,13.98,26.4 +185,0.06047,0,2.46,0,0.488,6.153,68.8,3.2797,3,193,17.8,387.11,13.15,29.6 +186,0.05602,0,2.46,0,0.488,7.831,53.6,3.1992,3,193,17.8,392.63,4.45,50 +187,0.07875,45,3.44,0,0.437,6.782,41.1,3.7886,5,398,15.2,393.87,6.68,32 +188,0.12579,45,3.44,0,0.437,6.556,29.1,4.5667,5,398,15.2,382.84,4.56,29.8 +189,0.0837,45,3.44,0,0.437,7.185,38.9,4.5667,5,398,15.2,396.9,5.39,34.9 +190,0.09068,45,3.44,0,0.437,6.951,21.5,6.4798,5,398,15.2,377.68,5.1,37 +191,0.06911,45,3.44,0,0.437,6.739,30.8,6.4798,5,398,15.2,389.71,4.69,30.5 +192,0.08664,45,3.44,0,0.437,7.178,26.3,6.4798,5,398,15.2,390.49,2.87,36.4 +193,0.02187,60,2.93,0,0.401,6.8,9.9,6.2196,1,265,15.6,393.37,5.03,31.1 +194,0.01439,60,2.93,0,0.401,6.604,18.8,6.2196,1,265,15.6,376.7,4.38,29.1 +195,0.01381,80,0.46,0,0.422,7.875,32,5.6484,4,255,14.4,394.23,2.97,50 +196,0.04011,80,1.52,0,0.404,7.287,34.1,7.309,2,329,12.6,396.9,4.08,33.3 +197,0.04666,80,1.52,0,0.404,7.107,36.6,7.309,2,329,12.6,354.31,8.61,30.3 +198,0.03768,80,1.52,0,0.404,7.274,38.3,7.309,2,329,12.6,392.2,6.62,34.6 +199,0.0315,95,1.47,0,0.403,6.975,15.3,7.6534,3,402,17,396.9,4.56,34.9 +200,0.01778,95,1.47,0,0.403,7.135,13.9,7.6534,3,402,17,384.3,4.45,32.9 +201,0.03445,82.5,2.03,0,0.415,6.162,38.4,6.27,2,348,14.7,393.77,7.43,24.1 +202,0.02177,82.5,2.03,0,0.415,7.61,15.7,6.27,2,348,14.7,395.38,3.11,42.3 +203,0.0351,95,2.68,0,0.4161,7.853,33.2,5.118,4,224,14.7,392.78,3.81,48.5 +204,0.02009,95,2.68,0,0.4161,8.034,31.9,5.118,4,224,14.7,390.55,2.88,50 +205,0.13642,0,10.59,0,0.489,5.891,22.3,3.9454,4,277,18.6,396.9,10.87,22.6 +206,0.22969,0,10.59,0,0.489,6.326,52.5,4.3549,4,277,18.6,394.87,10.97,24.4 +207,0.25199,0,10.59,0,0.489,5.783,72.7,4.3549,4,277,18.6,389.43,18.06,22.5 +208,0.13587,0,10.59,1,0.489,6.064,59.1,4.2392,4,277,18.6,381.32,14.66,24.4 +209,0.43571,0,10.59,1,0.489,5.344,100,3.875,4,277,18.6,396.9,23.09,20 +210,0.17446,0,10.59,1,0.489,5.96,92.1,3.8771,4,277,18.6,393.25,17.27,21.7 +211,0.37578,0,10.59,1,0.489,5.404,88.6,3.665,4,277,18.6,395.24,23.98,19.3 +212,0.21719,0,10.59,1,0.489,5.807,53.8,3.6526,4,277,18.6,390.94,16.03,22.4 +213,0.14052,0,10.59,0,0.489,6.375,32.3,3.9454,4,277,18.6,385.81,9.38,28.1 +214,0.28955,0,10.59,0,0.489,5.412,9.8,3.5875,4,277,18.6,348.93,29.55,23.7 +215,0.19802,0,10.59,0,0.489,6.182,42.4,3.9454,4,277,18.6,393.63,9.47,25 +216,0.0456,0,13.89,1,0.55,5.888,56,3.1121,5,276,16.4,392.8,13.51,23.3 +217,0.07013,0,13.89,0,0.55,6.642,85.1,3.4211,5,276,16.4,392.78,9.69,28.7 +218,0.11069,0,13.89,1,0.55,5.951,93.8,2.8893,5,276,16.4,396.9,17.92,21.5 +219,0.11425,0,13.89,1,0.55,6.373,92.4,3.3633,5,276,16.4,393.74,10.5,23 +220,0.35809,0,6.2,1,0.507,6.951,88.5,2.8617,8,307,17.4,391.7,9.71,26.7 +221,0.40771,0,6.2,1,0.507,6.164,91.3,3.048,8,307,17.4,395.24,21.46,21.7 
+222,0.62356,0,6.2,1,0.507,6.879,77.7,3.2721,8,307,17.4,390.39,9.93,27.5 +223,0.6147,0,6.2,0,0.507,6.618,80.8,3.2721,8,307,17.4,396.9,7.6,30.1 +224,0.31533,0,6.2,0,0.504,8.266,78.3,2.8944,8,307,17.4,385.05,4.14,44.8 +225,0.52693,0,6.2,0,0.504,8.725,83,2.8944,8,307,17.4,382,4.63,50 +226,0.38214,0,6.2,0,0.504,8.04,86.5,3.2157,8,307,17.4,387.38,3.13,37.6 +227,0.41238,0,6.2,0,0.504,7.163,79.9,3.2157,8,307,17.4,372.08,6.36,31.6 +228,0.29819,0,6.2,0,0.504,7.686,17,3.3751,8,307,17.4,377.51,3.92,46.7 +229,0.44178,0,6.2,0,0.504,6.552,21.4,3.3751,8,307,17.4,380.34,3.76,31.5 +230,0.537,0,6.2,0,0.504,5.981,68.1,3.6715,8,307,17.4,378.35,11.65,24.3 +231,0.46296,0,6.2,0,0.504,7.412,76.9,3.6715,8,307,17.4,376.14,5.25,31.7 +232,0.57529,0,6.2,0,0.507,8.337,73.3,3.8384,8,307,17.4,385.91,2.47,41.7 +233,0.33147,0,6.2,0,0.507,8.247,70.4,3.6519,8,307,17.4,378.95,3.95,48.3 +234,0.44791,0,6.2,1,0.507,6.726,66.5,3.6519,8,307,17.4,360.2,8.05,29 +235,0.33045,0,6.2,0,0.507,6.086,61.5,3.6519,8,307,17.4,376.75,10.88,24 +236,0.52058,0,6.2,1,0.507,6.631,76.5,4.148,8,307,17.4,388.45,9.54,25.1 +237,0.51183,0,6.2,0,0.507,7.358,71.6,4.148,8,307,17.4,390.07,4.73,31.5 +238,0.08244,30,4.93,0,0.428,6.481,18.5,6.1899,6,300,16.6,379.41,6.36,23.7 +239,0.09252,30,4.93,0,0.428,6.606,42.2,6.1899,6,300,16.6,383.78,7.37,23.3 +240,0.11329,30,4.93,0,0.428,6.897,54.3,6.3361,6,300,16.6,391.25,11.38,22 +241,0.10612,30,4.93,0,0.428,6.095,65.1,6.3361,6,300,16.6,394.62,12.4,20.1 +242,0.1029,30,4.93,0,0.428,6.358,52.9,7.0355,6,300,16.6,372.75,11.22,22.2 +243,0.12757,30,4.93,0,0.428,6.393,7.8,7.0355,6,300,16.6,374.71,5.19,23.7 +244,0.20608,22,5.86,0,0.431,5.593,76.5,7.9549,7,330,19.1,372.49,12.5,17.6 +245,0.19133,22,5.86,0,0.431,5.605,70.2,7.9549,7,330,19.1,389.13,18.46,18.5 +246,0.33983,22,5.86,0,0.431,6.108,34.9,8.0555,7,330,19.1,390.18,9.16,24.3 +247,0.19657,22,5.86,0,0.431,6.226,79.2,8.0555,7,330,19.1,376.14,10.15,20.5 +248,0.16439,22,5.86,0,0.431,6.433,49.1,7.8265,7,330,19.1,374.71,9.52,24.5 +249,0.19073,22,5.86,0,0.431,6.718,17.5,7.8265,7,330,19.1,393.74,6.56,26.2 +250,0.1403,22,5.86,0,0.431,6.487,13,7.3967,7,330,19.1,396.28,5.9,24.4 +251,0.21409,22,5.86,0,0.431,6.438,8.9,7.3967,7,330,19.1,377.07,3.59,24.8 +252,0.08221,22,5.86,0,0.431,6.957,6.8,8.9067,7,330,19.1,386.09,3.53,29.6 +253,0.36894,22,5.86,0,0.431,8.259,8.4,8.9067,7,330,19.1,396.9,3.54,42.8 +254,0.04819,80,3.64,0,0.392,6.108,32,9.2203,1,315,16.4,392.89,6.57,21.9 +255,0.03548,80,3.64,0,0.392,5.876,19.1,9.2203,1,315,16.4,395.18,9.25,20.9 +256,0.01538,90,3.75,0,0.394,7.454,34.2,6.3361,3,244,15.9,386.34,3.11,44 +257,0.61154,20,3.97,0,0.647,8.704,86.9,1.801,5,264,13,389.7,5.12,50 +258,0.66351,20,3.97,0,0.647,7.333,100,1.8946,5,264,13,383.29,7.79,36 +259,0.65665,20,3.97,0,0.647,6.842,100,2.0107,5,264,13,391.93,6.9,30.1 +260,0.54011,20,3.97,0,0.647,7.203,81.8,2.1121,5,264,13,392.8,9.59,33.8 +261,0.53412,20,3.97,0,0.647,7.52,89.4,2.1398,5,264,13,388.37,7.26,43.1 +262,0.52014,20,3.97,0,0.647,8.398,91.5,2.2885,5,264,13,386.86,5.91,48.8 +263,0.82526,20,3.97,0,0.647,7.327,94.5,2.0788,5,264,13,393.42,11.25,31 +264,0.55007,20,3.97,0,0.647,7.206,91.6,1.9301,5,264,13,387.89,8.1,36.5 +265,0.76162,20,3.97,0,0.647,5.56,62.8,1.9865,5,264,13,392.4,10.45,22.8 +266,0.7857,20,3.97,0,0.647,7.014,84.6,2.1329,5,264,13,384.07,14.79,30.7 +267,0.57834,20,3.97,0,0.575,8.297,67,2.4216,5,264,13,384.54,7.44,50 +268,0.5405,20,3.97,0,0.575,7.47,52.6,2.872,5,264,13,390.3,3.16,43.5 +269,0.09065,20,6.96,1,0.464,5.92,61.5,3.9175,3,223,18.6,391.34,13.65,20.7 
+270,0.29916,20,6.96,0,0.464,5.856,42.1,4.429,3,223,18.6,388.65,13,21.1 +271,0.16211,20,6.96,0,0.464,6.24,16.3,4.429,3,223,18.6,396.9,6.59,25.2 +272,0.1146,20,6.96,0,0.464,6.538,58.7,3.9175,3,223,18.6,394.96,7.73,24.4 +273,0.22188,20,6.96,1,0.464,7.691,51.8,4.3665,3,223,18.6,390.77,6.58,35.2 +274,0.05644,40,6.41,1,0.447,6.758,32.9,4.0776,4,254,17.6,396.9,3.53,32.4 +275,0.09604,40,6.41,0,0.447,6.854,42.8,4.2673,4,254,17.6,396.9,2.98,32 +276,0.10469,40,6.41,1,0.447,7.267,49,4.7872,4,254,17.6,389.25,6.05,33.2 +277,0.06127,40,6.41,1,0.447,6.826,27.6,4.8628,4,254,17.6,393.45,4.16,33.1 +278,0.07978,40,6.41,0,0.447,6.482,32.1,4.1403,4,254,17.6,396.9,7.19,29.1 +279,0.21038,20,3.33,0,0.4429,6.812,32.2,4.1007,5,216,14.9,396.9,4.85,35.1 +280,0.03578,20,3.33,0,0.4429,7.82,64.5,4.6947,5,216,14.9,387.31,3.76,45.4 +281,0.03705,20,3.33,0,0.4429,6.968,37.2,5.2447,5,216,14.9,392.23,4.59,35.4 +282,0.06129,20,3.33,1,0.4429,7.645,49.7,5.2119,5,216,14.9,377.07,3.01,46 +283,0.01501,90,1.21,1,0.401,7.923,24.8,5.885,1,198,13.6,395.52,3.16,50 +284,0.00906,90,2.97,0,0.4,7.088,20.8,7.3073,1,285,15.3,394.72,7.85,32.2 +285,0.01096,55,2.25,0,0.389,6.453,31.9,7.3073,1,300,15.3,394.72,8.23,22 +286,0.01965,80,1.76,0,0.385,6.23,31.5,9.0892,1,241,18.2,341.6,12.93,20.1 +287,0.03871,52.5,5.32,0,0.405,6.209,31.3,7.3172,6,293,16.6,396.9,7.14,23.2 +288,0.0459,52.5,5.32,0,0.405,6.315,45.6,7.3172,6,293,16.6,396.9,7.6,22.3 +289,0.04297,52.5,5.32,0,0.405,6.565,22.9,7.3172,6,293,16.6,371.72,9.51,24.8 +290,0.03502,80,4.95,0,0.411,6.861,27.9,5.1167,4,245,19.2,396.9,3.33,28.5 +291,0.07886,80,4.95,0,0.411,7.148,27.7,5.1167,4,245,19.2,396.9,3.56,37.3 +292,0.03615,80,4.95,0,0.411,6.63,23.4,5.1167,4,245,19.2,396.9,4.7,27.9 +293,0.08265,0,13.92,0,0.437,6.127,18.4,5.5027,4,289,16,396.9,8.58,23.9 +294,0.08199,0,13.92,0,0.437,6.009,42.3,5.5027,4,289,16,396.9,10.4,21.7 +295,0.12932,0,13.92,0,0.437,6.678,31.1,5.9604,4,289,16,396.9,6.27,28.6 +296,0.05372,0,13.92,0,0.437,6.549,51,5.9604,4,289,16,392.85,7.39,27.1 +297,0.14103,0,13.92,0,0.437,5.79,58,6.32,4,289,16,396.9,15.84,20.3 +298,0.06466,70,2.24,0,0.4,6.345,20.1,7.8278,5,358,14.8,368.24,4.97,22.5 +299,0.05561,70,2.24,0,0.4,7.041,10,7.8278,5,358,14.8,371.58,4.74,29 +300,0.04417,70,2.24,0,0.4,6.871,47.4,7.8278,5,358,14.8,390.86,6.07,24.8 +301,0.03537,34,6.09,0,0.433,6.59,40.4,5.4917,7,329,16.1,395.75,9.5,22 +302,0.09266,34,6.09,0,0.433,6.495,18.4,5.4917,7,329,16.1,383.61,8.67,26.4 +303,0.1,34,6.09,0,0.433,6.982,17.7,5.4917,7,329,16.1,390.43,4.86,33.1 +304,0.05515,33,2.18,0,0.472,7.236,41.1,4.022,7,222,18.4,393.68,6.93,36.1 +305,0.05479,33,2.18,0,0.472,6.616,58.1,3.37,7,222,18.4,393.36,8.93,28.4 +306,0.07503,33,2.18,0,0.472,7.42,71.9,3.0992,7,222,18.4,396.9,6.47,33.4 +307,0.04932,33,2.18,0,0.472,6.849,70.3,3.1827,7,222,18.4,396.9,7.53,28.2 +308,0.49298,0,9.9,0,0.544,6.635,82.5,3.3175,4,304,18.4,396.9,4.54,22.8 +309,0.3494,0,9.9,0,0.544,5.972,76.7,3.1025,4,304,18.4,396.24,9.97,20.3 +310,2.63548,0,9.9,0,0.544,4.973,37.8,2.5194,4,304,18.4,350.45,12.64,16.1 +311,0.79041,0,9.9,0,0.544,6.122,52.8,2.6403,4,304,18.4,396.9,5.98,22.1 +312,0.26169,0,9.9,0,0.544,6.023,90.4,2.834,4,304,18.4,396.3,11.72,19.4 +313,0.26938,0,9.9,0,0.544,6.266,82.8,3.2628,4,304,18.4,393.39,7.9,21.6 +314,0.3692,0,9.9,0,0.544,6.567,87.3,3.6023,4,304,18.4,395.69,9.28,23.8 +315,0.25356,0,9.9,0,0.544,5.705,77.7,3.945,4,304,18.4,396.42,11.5,16.2 +316,0.31827,0,9.9,0,0.544,5.914,83.2,3.9986,4,304,18.4,390.7,18.33,17.8 +317,0.24522,0,9.9,0,0.544,5.782,71.7,4.0317,4,304,18.4,396.9,15.94,19.8 
+318,0.40202,0,9.9,0,0.544,6.382,67.2,3.5325,4,304,18.4,395.21,10.36,23.1 +319,0.47547,0,9.9,0,0.544,6.113,58.8,4.0019,4,304,18.4,396.23,12.73,21 +320,0.1676,0,7.38,0,0.493,6.426,52.3,4.5404,5,287,19.6,396.9,7.2,23.8 +321,0.18159,0,7.38,0,0.493,6.376,54.3,4.5404,5,287,19.6,396.9,6.87,23.1 +322,0.35114,0,7.38,0,0.493,6.041,49.9,4.7211,5,287,19.6,396.9,7.7,20.4 +323,0.28392,0,7.38,0,0.493,5.708,74.3,4.7211,5,287,19.6,391.13,11.74,18.5 +324,0.34109,0,7.38,0,0.493,6.415,40.1,4.7211,5,287,19.6,396.9,6.12,25 +325,0.19186,0,7.38,0,0.493,6.431,14.7,5.4159,5,287,19.6,393.68,5.08,24.6 +326,0.30347,0,7.38,0,0.493,6.312,28.9,5.4159,5,287,19.6,396.9,6.15,23 +327,0.24103,0,7.38,0,0.493,6.083,43.7,5.4159,5,287,19.6,396.9,12.79,22.2 +328,0.06617,0,3.24,0,0.46,5.868,25.8,5.2146,4,430,16.9,382.44,9.97,19.3 +329,0.06724,0,3.24,0,0.46,6.333,17.2,5.2146,4,430,16.9,375.21,7.34,22.6 +330,0.04544,0,3.24,0,0.46,6.144,32.2,5.8736,4,430,16.9,368.57,9.09,19.8 +331,0.05023,35,6.06,0,0.4379,5.706,28.4,6.6407,1,304,16.9,394.02,12.43,17.1 +332,0.03466,35,6.06,0,0.4379,6.031,23.3,6.6407,1,304,16.9,362.25,7.83,19.4 +333,0.05083,0,5.19,0,0.515,6.316,38.1,6.4584,5,224,20.2,389.71,5.68,22.2 +334,0.03738,0,5.19,0,0.515,6.31,38.5,6.4584,5,224,20.2,389.4,6.75,20.7 +335,0.03961,0,5.19,0,0.515,6.037,34.5,5.9853,5,224,20.2,396.9,8.01,21.1 +336,0.03427,0,5.19,0,0.515,5.869,46.3,5.2311,5,224,20.2,396.9,9.8,19.5 +337,0.03041,0,5.19,0,0.515,5.895,59.6,5.615,5,224,20.2,394.81,10.56,18.5 +338,0.03306,0,5.19,0,0.515,6.059,37.3,4.8122,5,224,20.2,396.14,8.51,20.6 +339,0.05497,0,5.19,0,0.515,5.985,45.4,4.8122,5,224,20.2,396.9,9.74,19 +340,0.06151,0,5.19,0,0.515,5.968,58.5,4.8122,5,224,20.2,396.9,9.29,18.7 +341,0.01301,35,1.52,0,0.442,7.241,49.3,7.0379,1,284,15.5,394.74,5.49,32.7 +342,0.02498,0,1.89,0,0.518,6.54,59.7,6.2669,1,422,15.9,389.96,8.65,16.5 +343,0.02543,55,3.78,0,0.484,6.696,56.4,5.7321,5,370,17.6,396.9,7.18,23.9 +344,0.03049,55,3.78,0,0.484,6.874,28.1,6.4654,5,370,17.6,387.97,4.61,31.2 +345,0.03113,0,4.39,0,0.442,6.014,48.5,8.0136,3,352,18.8,385.64,10.53,17.5 +346,0.06162,0,4.39,0,0.442,5.898,52.3,8.0136,3,352,18.8,364.61,12.67,17.2 +347,0.0187,85,4.15,0,0.429,6.516,27.7,8.5353,4,351,17.9,392.43,6.36,23.1 +348,0.01501,80,2.01,0,0.435,6.635,29.7,8.344,4,280,17,390.94,5.99,24.5 +349,0.02899,40,1.25,0,0.429,6.939,34.5,8.7921,1,335,19.7,389.85,5.89,26.6 +350,0.06211,40,1.25,0,0.429,6.49,44.4,8.7921,1,335,19.7,396.9,5.98,22.9 +351,0.0795,60,1.69,0,0.411,6.579,35.9,10.7103,4,411,18.3,370.78,5.49,24.1 +352,0.07244,60,1.69,0,0.411,5.884,18.5,10.7103,4,411,18.3,392.33,7.79,18.6 +353,0.01709,90,2.02,0,0.41,6.728,36.1,12.1265,5,187,17,384.46,4.5,30.1 +354,0.04301,80,1.91,0,0.413,5.663,21.9,10.5857,4,334,22,382.8,8.05,18.2 +355,0.10659,80,1.91,0,0.413,5.936,19.5,10.5857,4,334,22,376.04,5.57,20.6 +356,8.98296,0,18.1,1,0.77,6.212,97.4,2.1222,24,666,20.2,377.73,17.6,17.8 +357,3.8497,0,18.1,1,0.77,6.395,91,2.5052,24,666,20.2,391.34,13.27,21.7 +358,5.20177,0,18.1,1,0.77,6.127,83.4,2.7227,24,666,20.2,395.43,11.48,22.7 +359,4.26131,0,18.1,0,0.77,6.112,81.3,2.5091,24,666,20.2,390.74,12.67,22.6 +360,4.54192,0,18.1,0,0.77,6.398,88,2.5182,24,666,20.2,374.56,7.79,25 +361,3.83684,0,18.1,0,0.77,6.251,91.1,2.2955,24,666,20.2,350.65,14.19,19.9 +362,3.67822,0,18.1,0,0.77,5.362,96.2,2.1036,24,666,20.2,380.79,10.19,20.8 +363,4.22239,0,18.1,1,0.77,5.803,89,1.9047,24,666,20.2,353.04,14.64,16.8 +364,3.47428,0,18.1,1,0.718,8.78,82.9,1.9047,24,666,20.2,354.55,5.29,21.9 +365,4.55587,0,18.1,0,0.718,3.561,87.9,1.6132,24,666,20.2,354.7,7.12,27.5 
+366,3.69695,0,18.1,0,0.718,4.963,91.4,1.7523,24,666,20.2,316.03,14,21.9 +367,13.5222,0,18.1,0,0.631,3.863,100,1.5106,24,666,20.2,131.42,13.33,23.1 +368,4.89822,0,18.1,0,0.631,4.97,100,1.3325,24,666,20.2,375.52,3.26,50 +369,5.66998,0,18.1,1,0.631,6.683,96.8,1.3567,24,666,20.2,375.33,3.73,50 +370,6.53876,0,18.1,1,0.631,7.016,97.5,1.2024,24,666,20.2,392.05,2.96,50 +371,9.2323,0,18.1,0,0.631,6.216,100,1.1691,24,666,20.2,366.15,9.53,50 +372,8.26725,0,18.1,1,0.668,5.875,89.6,1.1296,24,666,20.2,347.88,8.88,50 +373,11.1081,0,18.1,0,0.668,4.906,100,1.1742,24,666,20.2,396.9,34.77,13.8 +374,18.4982,0,18.1,0,0.668,4.138,100,1.137,24,666,20.2,396.9,37.97,13.8 +375,19.6091,0,18.1,0,0.671,7.313,97.9,1.3163,24,666,20.2,396.9,13.44,15 +376,15.288,0,18.1,0,0.671,6.649,93.3,1.3449,24,666,20.2,363.02,23.24,13.9 +377,9.82349,0,18.1,0,0.671,6.794,98.8,1.358,24,666,20.2,396.9,21.24,13.3 +378,23.6482,0,18.1,0,0.671,6.38,96.2,1.3861,24,666,20.2,396.9,23.69,13.1 +379,17.8667,0,18.1,0,0.671,6.223,100,1.3861,24,666,20.2,393.74,21.78,10.2 +380,88.9762,0,18.1,0,0.671,6.968,91.9,1.4165,24,666,20.2,396.9,17.21,10.4 +381,15.8744,0,18.1,0,0.671,6.545,99.1,1.5192,24,666,20.2,396.9,21.08,10.9 +382,9.18702,0,18.1,0,0.7,5.536,100,1.5804,24,666,20.2,396.9,23.6,11.3 +383,7.99248,0,18.1,0,0.7,5.52,100,1.5331,24,666,20.2,396.9,24.56,12.3 +384,20.0849,0,18.1,0,0.7,4.368,91.2,1.4395,24,666,20.2,285.83,30.63,8.8 +385,16.8118,0,18.1,0,0.7,5.277,98.1,1.4261,24,666,20.2,396.9,30.81,7.2 +386,24.3938,0,18.1,0,0.7,4.652,100,1.4672,24,666,20.2,396.9,28.28,10.5 +387,22.5971,0,18.1,0,0.7,5,89.5,1.5184,24,666,20.2,396.9,31.99,7.4 +388,14.3337,0,18.1,0,0.7,4.88,100,1.5895,24,666,20.2,372.92,30.62,10.2 +389,8.15174,0,18.1,0,0.7,5.39,98.9,1.7281,24,666,20.2,396.9,20.85,11.5 +390,6.96215,0,18.1,0,0.7,5.713,97,1.9265,24,666,20.2,394.43,17.11,15.1 +391,5.29305,0,18.1,0,0.7,6.051,82.5,2.1678,24,666,20.2,378.38,18.76,23.2 +392,11.5779,0,18.1,0,0.7,5.036,97,1.77,24,666,20.2,396.9,25.68,9.7 +393,8.64476,0,18.1,0,0.693,6.193,92.6,1.7912,24,666,20.2,396.9,15.17,13.8 +394,13.3598,0,18.1,0,0.693,5.887,94.7,1.7821,24,666,20.2,396.9,16.35,12.7 +395,8.71675,0,18.1,0,0.693,6.471,98.8,1.7257,24,666,20.2,391.98,17.12,13.1 +396,5.87205,0,18.1,0,0.693,6.405,96,1.6768,24,666,20.2,396.9,19.37,12.5 +397,7.67202,0,18.1,0,0.693,5.747,98.9,1.6334,24,666,20.2,393.1,19.92,8.5 +398,38.3518,0,18.1,0,0.693,5.453,100,1.4896,24,666,20.2,396.9,30.59,5 +399,9.91655,0,18.1,0,0.693,5.852,77.8,1.5004,24,666,20.2,338.16,29.97,6.3 +400,25.0461,0,18.1,0,0.693,5.987,100,1.5888,24,666,20.2,396.9,26.77,5.6 +401,14.2362,0,18.1,0,0.693,6.343,100,1.5741,24,666,20.2,396.9,20.32,7.2 +402,9.59571,0,18.1,0,0.693,6.404,100,1.639,24,666,20.2,376.11,20.31,12.1 +403,24.8017,0,18.1,0,0.693,5.349,96,1.7028,24,666,20.2,396.9,19.77,8.3 +404,41.5292,0,18.1,0,0.693,5.531,85.4,1.6074,24,666,20.2,329.46,27.38,8.5 +405,67.9208,0,18.1,0,0.693,5.683,100,1.4254,24,666,20.2,384.97,22.98,5 +406,20.7162,0,18.1,0,0.659,4.138,100,1.1781,24,666,20.2,370.22,23.34,11.9 +407,11.9511,0,18.1,0,0.659,5.608,100,1.2852,24,666,20.2,332.09,12.13,27.9 +408,7.40389,0,18.1,0,0.597,5.617,97.9,1.4547,24,666,20.2,314.64,26.4,17.2 +409,14.4383,0,18.1,0,0.597,6.852,100,1.4655,24,666,20.2,179.36,19.78,27.5 +410,51.1358,0,18.1,0,0.597,5.757,100,1.413,24,666,20.2,2.6,10.11,15 +411,14.0507,0,18.1,0,0.597,6.657,100,1.5275,24,666,20.2,35.05,21.22,17.2 +412,18.811,0,18.1,0,0.597,4.628,100,1.5539,24,666,20.2,28.79,34.37,17.9 +413,28.6558,0,18.1,0,0.597,5.155,100,1.5894,24,666,20.2,210.97,20.08,16.3 
+414,45.7461,0,18.1,0,0.693,4.519,100,1.6582,24,666,20.2,88.27,36.98,7 +415,18.0846,0,18.1,0,0.679,6.434,100,1.8347,24,666,20.2,27.25,29.05,7.2 +416,10.8342,0,18.1,0,0.679,6.782,90.8,1.8195,24,666,20.2,21.57,25.79,7.5 +417,25.9406,0,18.1,0,0.679,5.304,89.1,1.6475,24,666,20.2,127.36,26.64,10.4 +418,73.5341,0,18.1,0,0.679,5.957,100,1.8026,24,666,20.2,16.45,20.62,8.8 +419,11.8123,0,18.1,0,0.718,6.824,76.5,1.794,24,666,20.2,48.45,22.74,8.4 +420,11.0874,0,18.1,0,0.718,6.411,100,1.8589,24,666,20.2,318.75,15.02,16.7 +421,7.02259,0,18.1,0,0.718,6.006,95.3,1.8746,24,666,20.2,319.98,15.7,14.2 +422,12.0482,0,18.1,0,0.614,5.648,87.6,1.9512,24,666,20.2,291.55,14.1,20.8 +423,7.05042,0,18.1,0,0.614,6.103,85.1,2.0218,24,666,20.2,2.52,23.29,13.4 +424,8.79212,0,18.1,0,0.584,5.565,70.6,2.0635,24,666,20.2,3.65,17.16,11.7 +425,15.8603,0,18.1,0,0.679,5.896,95.4,1.9096,24,666,20.2,7.68,24.39,8.3 +426,12.2472,0,18.1,0,0.584,5.837,59.7,1.9976,24,666,20.2,24.65,15.69,10.2 +427,37.6619,0,18.1,0,0.679,6.202,78.7,1.8629,24,666,20.2,18.82,14.52,10.9 +428,7.36711,0,18.1,0,0.679,6.193,78.1,1.9356,24,666,20.2,96.73,21.52,11 +429,9.33889,0,18.1,0,0.679,6.38,95.6,1.9682,24,666,20.2,60.72,24.08,9.5 +430,8.49213,0,18.1,0,0.584,6.348,86.1,2.0527,24,666,20.2,83.45,17.64,14.5 +431,10.0623,0,18.1,0,0.584,6.833,94.3,2.0882,24,666,20.2,81.33,19.69,14.1 +432,6.44405,0,18.1,0,0.584,6.425,74.8,2.2004,24,666,20.2,97.95,12.03,16.1 +433,5.58107,0,18.1,0,0.713,6.436,87.9,2.3158,24,666,20.2,100.19,16.22,14.3 +434,13.9134,0,18.1,0,0.713,6.208,95,2.2222,24,666,20.2,100.63,15.17,11.7 +435,11.1604,0,18.1,0,0.74,6.629,94.6,2.1247,24,666,20.2,109.85,23.27,13.4 +436,14.4208,0,18.1,0,0.74,6.461,93.3,2.0026,24,666,20.2,27.49,18.05,9.6 +437,15.1772,0,18.1,0,0.74,6.152,100,1.9142,24,666,20.2,9.32,26.45,8.7 +438,13.6781,0,18.1,0,0.74,5.935,87.9,1.8206,24,666,20.2,68.95,34.02,8.4 +439,9.39063,0,18.1,0,0.74,5.627,93.9,1.8172,24,666,20.2,396.9,22.88,12.8 +440,22.0511,0,18.1,0,0.74,5.818,92.4,1.8662,24,666,20.2,391.45,22.11,10.5 +441,9.72418,0,18.1,0,0.74,6.406,97.2,2.0651,24,666,20.2,385.96,19.52,17.1 +442,5.66637,0,18.1,0,0.74,6.219,100,2.0048,24,666,20.2,395.69,16.59,18.4 +443,9.96654,0,18.1,0,0.74,6.485,100,1.9784,24,666,20.2,386.73,18.85,15.4 +444,12.8023,0,18.1,0,0.74,5.854,96.6,1.8956,24,666,20.2,240.52,23.79,10.8 +445,10.6718,0,18.1,0,0.74,6.459,94.8,1.9879,24,666,20.2,43.06,23.98,11.8 +446,6.28807,0,18.1,0,0.74,6.341,96.4,2.072,24,666,20.2,318.01,17.79,14.9 +447,9.92485,0,18.1,0,0.74,6.251,96.6,2.198,24,666,20.2,388.52,16.44,12.6 +448,9.32909,0,18.1,0,0.713,6.185,98.7,2.2616,24,666,20.2,396.9,18.13,14.1 +449,7.52601,0,18.1,0,0.713,6.417,98.3,2.185,24,666,20.2,304.21,19.31,13 +450,6.71772,0,18.1,0,0.713,6.749,92.6,2.3236,24,666,20.2,0.32,17.44,13.4 +451,5.44114,0,18.1,0,0.713,6.655,98.2,2.3552,24,666,20.2,355.29,17.73,15.2 +452,5.09017,0,18.1,0,0.713,6.297,91.8,2.3682,24,666,20.2,385.09,17.27,16.1 +453,8.24809,0,18.1,0,0.713,7.393,99.3,2.4527,24,666,20.2,375.87,16.74,17.8 +454,9.51363,0,18.1,0,0.713,6.728,94.1,2.4961,24,666,20.2,6.68,18.71,14.9 +455,4.75237,0,18.1,0,0.713,6.525,86.5,2.4358,24,666,20.2,50.92,18.13,14.1 +456,4.66883,0,18.1,0,0.713,5.976,87.9,2.5806,24,666,20.2,10.48,19.01,12.7 +457,8.20058,0,18.1,0,0.713,5.936,80.3,2.7792,24,666,20.2,3.5,16.94,13.5 +458,7.75223,0,18.1,0,0.713,6.301,83.7,2.7831,24,666,20.2,272.21,16.23,14.9 +459,6.80117,0,18.1,0,0.713,6.081,84.4,2.7175,24,666,20.2,396.9,14.7,20 +460,4.81213,0,18.1,0,0.713,6.701,90,2.5975,24,666,20.2,255.23,16.42,16.4 
+461,3.69311,0,18.1,0,0.713,6.376,88.4,2.5671,24,666,20.2,391.43,14.65,17.7 +462,6.65492,0,18.1,0,0.713,6.317,83,2.7344,24,666,20.2,396.9,13.99,19.5 +463,5.82115,0,18.1,0,0.713,6.513,89.9,2.8016,24,666,20.2,393.82,10.29,20.2 +464,7.83932,0,18.1,0,0.655,6.209,65.4,2.9634,24,666,20.2,396.9,13.22,21.4 +465,3.1636,0,18.1,0,0.655,5.759,48.2,3.0665,24,666,20.2,334.4,14.13,19.9 +466,3.77498,0,18.1,0,0.655,5.952,84.7,2.8715,24,666,20.2,22.01,17.15,19 +467,4.42228,0,18.1,0,0.584,6.003,94.5,2.5403,24,666,20.2,331.29,21.32,19.1 +468,15.5757,0,18.1,0,0.58,5.926,71,2.9084,24,666,20.2,368.74,18.13,19.1 +469,13.0751,0,18.1,0,0.58,5.713,56.7,2.8237,24,666,20.2,396.9,14.76,20.1 +470,4.34879,0,18.1,0,0.58,6.167,84,3.0334,24,666,20.2,396.9,16.29,19.9 +471,4.03841,0,18.1,0,0.532,6.229,90.7,3.0993,24,666,20.2,395.33,12.87,19.6 +472,3.56868,0,18.1,0,0.58,6.437,75,2.8965,24,666,20.2,393.37,14.36,23.2 +473,4.64689,0,18.1,0,0.614,6.98,67.6,2.5329,24,666,20.2,374.68,11.66,29.8 +474,8.05579,0,18.1,0,0.584,5.427,95.4,2.4298,24,666,20.2,352.58,18.14,13.8 +475,6.39312,0,18.1,0,0.584,6.162,97.4,2.206,24,666,20.2,302.76,24.1,13.3 +476,4.87141,0,18.1,0,0.614,6.484,93.6,2.3053,24,666,20.2,396.21,18.68,16.7 +477,15.0234,0,18.1,0,0.614,5.304,97.3,2.1007,24,666,20.2,349.48,24.91,12 +478,10.233,0,18.1,0,0.614,6.185,96.7,2.1705,24,666,20.2,379.7,18.03,14.6 +479,14.3337,0,18.1,0,0.614,6.229,88,1.9512,24,666,20.2,383.32,13.11,21.4 +480,5.82401,0,18.1,0,0.532,6.242,64.7,3.4242,24,666,20.2,396.9,10.74,23 +481,5.70818,0,18.1,0,0.532,6.75,74.9,3.3317,24,666,20.2,393.07,7.74,23.7 +482,5.73116,0,18.1,0,0.532,7.061,77,3.4106,24,666,20.2,395.28,7.01,25 +483,2.81838,0,18.1,0,0.532,5.762,40.3,4.0983,24,666,20.2,392.92,10.42,21.8 +484,2.37857,0,18.1,0,0.583,5.871,41.9,3.724,24,666,20.2,370.73,13.34,20.6 +485,3.67367,0,18.1,0,0.583,6.312,51.9,3.9917,24,666,20.2,388.62,10.58,21.2 +486,5.69175,0,18.1,0,0.583,6.114,79.8,3.5459,24,666,20.2,392.68,14.98,19.1 +487,4.83567,0,18.1,0,0.583,5.905,53.2,3.1523,24,666,20.2,388.22,11.45,20.6 +488,0.15086,0,27.74,0,0.609,5.454,92.7,1.8209,4,711,20.1,395.09,18.06,15.2 +489,0.18337,0,27.74,0,0.609,5.414,98.3,1.7554,4,711,20.1,344.05,23.97,7 +490,0.20746,0,27.74,0,0.609,5.093,98,1.8226,4,711,20.1,318.43,29.68,8.1 +491,0.10574,0,27.74,0,0.609,5.983,98.8,1.8681,4,711,20.1,390.11,18.07,13.6 +492,0.11132,0,27.74,0,0.609,5.983,83.5,2.1099,4,711,20.1,396.9,13.35,20.1 +493,0.17331,0,9.69,0,0.585,5.707,54,2.3817,6,391,19.2,396.9,12.01,21.8 +494,0.27957,0,9.69,0,0.585,5.926,42.6,2.3817,6,391,19.2,396.9,13.59,24.5 +495,0.17899,0,9.69,0,0.585,5.67,28.8,2.7986,6,391,19.2,393.29,17.6,23.1 +496,0.2896,0,9.69,0,0.585,5.39,72.9,2.7986,6,391,19.2,396.9,21.14,19.7 +497,0.26838,0,9.69,0,0.585,5.794,70.6,2.8927,6,391,19.2,396.9,14.1,18.3 +498,0.23912,0,9.69,0,0.585,6.019,65.3,2.4091,6,391,19.2,396.9,12.92,21.2 +499,0.17783,0,9.69,0,0.585,5.569,73.5,2.3999,6,391,19.2,395.77,15.1,17.5 +500,0.22438,0,9.69,0,0.585,6.027,79.7,2.4982,6,391,19.2,396.9,14.33,16.8 +501,0.06263,0,11.93,0,0.573,6.593,69.1,2.4786,1,273,21,391.99,9.67,22.4 +502,0.04527,0,11.93,0,0.573,6.12,76.7,2.2875,1,273,21,396.9,9.08,20.6 +503,0.06076,0,11.93,0,0.573,6.976,91,2.1675,1,273,21,396.9,5.64,23.9 +504,0.10959,0,11.93,0,0.573,6.794,89.3,2.3889,1,273,21,393.45,6.48,22 +505,0.04741,0,11.93,0,0.573,6.03,80.8,2.505,1,273,21,396.9,7.88,11.9 diff --git a/axolotl/tests/data/datasets/database_dataset_1/datasetDoc.json b/axolotl/tests/data/datasets/database_dataset_1/datasetDoc.json new file mode 100644 index 0000000..0d37094 --- /dev/null +++ 
b/axolotl/tests/data/datasets/database_dataset_1/datasetDoc.json @@ -0,0 +1,200 @@ +{ + "about": { + "datasetID": "database_dataset_1", + "datasetName": "A dataset simulating a database dump", + "description": "A synthetic dataset trying to be similar to a database dump, with tables with different relations between them.", + "license": "CC", + "datasetSchemaVersion": "4.0.0", + "redacted": false, + "datasetVersion": "4.0.0", + "digest": "044b0c8724c80f672fb5a6e233b154d3342e4528de9a0245df77066f770c3d8f" + }, + "dataResources": [ + { + "resID": "codes", + "resPath": "tables/codes.csv", + "resType": "table", + "resFormat": { + "text/csv": [ + "csv" + ] + }, + "isCollection": false, + "columnsCount": 3, + "columns": [ + { + "colIndex": 0, + "colName": "code", + "colType": "categorical", + "role": [ + "index" + ] + }, + { + "colIndex": 1, + "colName": "name", + "colType": "string", + "role": [ + "attribute" + ] + }, + { + "colIndex": 2, + "colName": "author", + "colType": "integer", + "role": [ + "attribute" + ], + "refersTo": { + "resID": "authors", + "resObject": { + "columnIndex": 0 + } + } + } + ] + }, + { + "resID": "authors", + "resPath": "tables/authors.csv", + "resType": "table", + "resFormat": { + "text/csv": [ + "csv" + ] + }, + "isCollection": false, + "columnsCount": 2, + "columns": [ + { + "colIndex": 0, + "colName": "id", + "colType": "integer", + "role": [ + "index" + ] + }, + { + "colIndex": 1, + "colName": "name", + "colType": "string", + "role": [ + "attribute" + ] + } + ] + }, + { + "resID": "values", + "resPath": "tables/values.csv", + "resType": "table", + "resFormat": { + "text/csv": [ + "csv" + ] + }, + "isCollection": false, + "columnsCount": 4, + "columns": [ + { + "colIndex": 0, + "colName": "code", + "colType": "categorical", + "role": [ + "attribute" + ], + "refersTo": { + "resID": "codes", + "resObject": { + "columnName": "code" + } + } + }, + { + "colIndex": 1, + "colName": "key", + "colType": "categorical", + "role": [ + "attribute" + ] + }, + { + "colIndex": 2, + "colName": "year", + "colType": "dateTime", + "role": [ + "attribute" + ] + }, + { + "colIndex": 3, + "colName": "value", + "colType": "real", + "role": [ + "attribute" + ] + } + ] + }, + { + "resID": "learningData", + "resPath": "tables/learningData.csv", + "resType": "table", + "resFormat": { + "text/csv": [ + "csv" + ] + }, + "isCollection": false, + "columnsCount": 5, + "columns": [ + { + "colIndex": 0, + "colName": "d3mIndex", + "colType": "integer", + "role": [ + "index" + ] + }, + { + "colIndex": 1, + "colName": "code", + "colType": "categorical", + "role": [ + "attribute" + ], + "refersTo": { + "resID": "codes", + "resObject": { + "columnName": "code" + } + } + }, + { + "colIndex": 2, + "colName": "key", + "colType": "categorical", + "role": [ + "attribute" + ] + }, + { + "colIndex": 3, + "colName": "year", + "colType": "dateTime", + "role": [ + "attribute" + ] + }, + { + "colIndex": 4, + "colName": "value", + "colType": "real", + "role": [ + "suggestedTarget" + ] + } + ] + } + ] +} \ No newline at end of file diff --git a/axolotl/tests/data/datasets/database_dataset_1/tables/authors.csv b/axolotl/tests/data/datasets/database_dataset_1/tables/authors.csv new file mode 100644 index 0000000..47cc42e --- /dev/null +++ b/axolotl/tests/data/datasets/database_dataset_1/tables/authors.csv @@ -0,0 +1,4 @@ +id,name +1,1 name +2,2 name +3,3 name diff --git a/axolotl/tests/data/datasets/database_dataset_1/tables/codes.csv b/axolotl/tests/data/datasets/database_dataset_1/tables/codes.csv new file mode 100644 
index 0000000..41cca23 --- /dev/null +++ b/axolotl/tests/data/datasets/database_dataset_1/tables/codes.csv @@ -0,0 +1,4 @@ +code,name,author +AAA,AAA name,1 +BBB,BBB name,2 +CCC,CCC name, diff --git a/axolotl/tests/data/datasets/database_dataset_1/tables/learningData.csv b/axolotl/tests/data/datasets/database_dataset_1/tables/learningData.csv new file mode 100644 index 0000000..ed41868 --- /dev/null +++ b/axolotl/tests/data/datasets/database_dataset_1/tables/learningData.csv @@ -0,0 +1,46 @@ +d3mIndex,code,key,year,value +1,AAA,aaa,1990,46.8470585590128 +2,BBB,aaa,1990,62.2717948419606 +3,CCC,aaa,1990,67.8237620898007 +4,AAA,aaa,2000,48.2983166483242 +5,BBB,aaa,2000,45.549658223693 +6,CCC,aaa,2000,69.2737110112599 +7,AAA,aaa,2010,67.0300272167932 +8,BBB,aaa,2010,61.2105314015372 +9,CCC,aaa,2010,69.2795439452043 +10,AAA,bbb,1990,41.4118186427085 +11,BBB,bbb,1990,39.5640865244041 +12,CCC,bbb,1990,67.9914975626095 +13,AAA,bbb,2000,65.1345599343405 +14,BBB,bbb,2000,68.7315746449879 +15,CCC,bbb,2000,56.0433735350634 +16,AAA,bbb,2010,54.3830733627441 +17,BBB,bbb,2010,40.2346487255306 +18,CCC,bbb,2010,45.8856701879045 +19,AAA,ccc,1990,39.1645043487628 +20,BBB,ccc,1990,45.648691933945 +21,CCC,ccc,1990,46.7133286046023 +22,AAA,ccc,2000,46.6525032867438 +23,BBB,ccc,2000,46.4240035362404 +24,CCC,ccc,2000,69.89360375709 +25,AAA,ccc,2010,67.3439641718359 +26,BBB,ccc,2010,45.0358709995812 +27,CCC,ccc,2010,53.6699876392329 +28,AAA,ddd,1990,51.7980168708906 +29,BBB,ddd,1990,41.9604584970723 +30,CCC,ddd,1990,40.1696772984881 +31,AAA,ddd,2000,40.1734778285386 +32,BBB,ddd,2000,47.1396106716371 +33,CCC,ddd,2000,52.4998419337555 +34,AAA,ddd,2010,32.328512195122 +35,BBB,ddd,2010,62.2543658536585 +36,CCC,ddd,2010,46.1351219512195 +37,AAA,eee,1990,32.9848292682927 +38,BBB,eee,1990,61.7827317073171 +39,CCC,eee,1990,65.2155365853659 +40,AAA,eee,2000,65.8634634146342 +41,BBB,eee,2000,65.5693658536586 +42,CCC,eee,2000,70.8170731707317 +43,AAA,eee,2010,68.5856097560976 +44,BBB,eee,2010,60.836243902439 +45,CCC,eee,2010,62.7290487804878 diff --git a/axolotl/tests/data/datasets/database_dataset_1/tables/values.csv b/axolotl/tests/data/datasets/database_dataset_1/tables/values.csv new file mode 100644 index 0000000..c54a074 --- /dev/null +++ b/axolotl/tests/data/datasets/database_dataset_1/tables/values.csv @@ -0,0 +1,65 @@ +code,key,year,value +AAA,fff,1980,47.6978880950964 +AAA,ggg,1980,8.5102073882048 +AAA,hhh,1980,28937699 +AAA,iii,1980,31.2853842116054 +BBB,fff,1980,40.017000543436 +BBB,ggg,1980,0.088668203582195 +BBB,hhh,1980,1324191 +BBB,iii,1980,31.5974898513652 +CCC,fff,1980,19.1174351291049 +CCC,ggg,1980,3.93265041799129 +CCC,hhh,1980,40680946 +CCC,iii,1980,44.5079211390026 +DDD,fff,1980,26.3623503812716 +DDD,ggg,1980,1.92536841530696 +DDD,hhh,1980,231871389 +DDD,iii,1980,22.4711322042954 +AAA,fff,1990,47.4553181402273 +AAA,ggg,1990,8.70342011230219 +AAA,hhh,1990,30440944 +AAA,iii,1990,32.0290757076957 +BBB,fff,1990,39.7447929589982 +BBB,ggg,1990,0.208550573514077 +BBB,hhh,1990,1358967 +BBB,iii,1990,31.8190266061615 +CCC,fff,1990,18.1079412466031 +CCC,ggg,1990,4.17195041631032 +CCC,hhh,1990,41697325 +CCC,iii,1990,45.206665319194 +DDD,fff,1990,24.5407077023555 +DDD,ggg,1990,2.16383747049959 +DDD,hhh,1990,239260861 +DDD,iii,1990,23.1221808000131 +AAA,fff,2000,47.1985790784782 +AAA,ggg,2000,7.57819143910044 +AAA,hhh,2000,32023589 +AAA,iii,2000,32.780322721748 +BBB,fff,2000,39.3127951747193 +BBB,ggg,2000,0.017809439002671 +BBB,hhh,2000,1395610 +BBB,iii,2000,32.0578605237552 +CCC,fff,2000,17.2041249395661 
+CCC,ggg,2000,4.85743169997929 +CCC,hhh,2000,42662735 +CCC,iii,2000,45.866564696018 +DDD,fff,2000,36.4059763551675 +DDD,ggg,2000,1.78711972147311 +DDD,hhh,2000,247991158 +DDD,iii,2000,23.651145327486 +AAA,fff,2010,46.9193698023565 +AAA,ggg,2010,8.06572068707991 +AAA,hhh,2010,33702951 +AAA,iii,2010,33.5558808208595 +BBB,fff,2010,38.7256964353179 +BBB,ggg,2010,0 +BBB,hhh,2010,1433393 +BBB,iii,2010,32.3139419105331 +CCC,fff,2010,16.9941914183028 +CCC,ggg,2010,3.60319993005617 +CCC,hhh,2010,43670268 +CCC,iii,2010,46.5340927663649 +DDD,fff,2010,40.2972491778884 +DDD,ggg,2010,1.70584120202376 +DDD,hhh,2010,258930584 +DDD,iii,2010,24.1179584421312 diff --git a/axolotl/tests/data/datasets/database_dataset_2/datasetDoc.json b/axolotl/tests/data/datasets/database_dataset_2/datasetDoc.json new file mode 100644 index 0000000..e026a28 --- /dev/null +++ b/axolotl/tests/data/datasets/database_dataset_2/datasetDoc.json @@ -0,0 +1,196 @@ +{ + "about": { + "datasetSchemaVersion": "4.0.0", + "datasetID": "database_dataset_2", + "datasetName": "Database dataset of type COUNTS_PER_USER", + "description": "Database dataset of type COUNTS_PER_USER, size 100, random seed 0", + "digest": "0eafe8b08646e4c684bb1776fee8af92cc232ba1bd2840ca0c70e7ae5a59d976", + "datasetVersion": "4.0.0" + }, + "dataResources": [ + { + "resID": "users", + "isCollection": false, + "columnsCount": 2, + "resFormat": { + "text/csv": [ + "csv" + ] + }, + "resType": "table", + "resPath": "tables/users.csv", + "columns": [ + { + "colIndex": 0, + "colName": "id", + "role": [ + "index" + ], + "colType": "integer" + }, + { + "colIndex": 1, + "colName": "name", + "role": [ + "attribute" + ], + "colType": "string" + } + ] + }, + { + "resID": "posts", + "isCollection": false, + "columnsCount": 3, + "resFormat": { + "text/csv": [ + "csv" + ] + }, + "resType": "table", + "resPath": "tables/posts.csv", + "columns": [ + { + "colIndex": 0, + "colName": "id", + "role": [ + "index" + ], + "colType": "integer" + }, + { + "colIndex": 1, + "colName": "author_id", + "role": [ + "attribute" + ], + "colType": "integer", + "refersTo": { + "resID": "users", + "resObject": { + "columnIndex": 0 + } + } + }, + { + "colIndex": 2, + "colName": "post", + "role": [ + "attribute" + ], + "colType": "string" + } + ] + }, + { + "resID": "comments", + "isCollection": false, + "columnsCount": 4, + "resFormat": { + "text/csv": [ + "csv" + ] + }, + "resType": "table", + "resPath": "tables/comments.csv", + "columns": [ + { + "colIndex": 0, + "colName": "id", + "role": [ + "index" + ], + "colType": "integer" + }, + { + "colIndex": 1, + "colName": "post_id", + "role": [ + "attribute" + ], + "colType": "integer", + "refersTo": { + "resID": "posts", + "resObject": { + "columnIndex": 0 + } + } + }, + { + "colIndex": 2, + "colName": "author_id", + "role": [ + "attribute" + ], + "colType": "integer", + "refersTo": { + "resID": "users", + "resObject": { + "columnIndex": 0 + } + } + }, + { + "colIndex": 3, + "colName": "comment", + "role": [ + "attribute" + ], + "colType": "string" + } + ] + }, + { + "resID": "learningData", + "isCollection": false, + "columnsCount": 4, + "resFormat": { + "text/csv": [ + "csv" + ] + }, + "resType": "table", + "resPath": "tables/learningData.csv", + "columns": [ + { + "colIndex": 0, + "colName": "d3mIndex", + "role": [ + "index" + ], + "colType": "integer" + }, + { + "colIndex": 1, + "colName": "user_id", + "role": [ + "attribute" + ], + "colType": "integer", + "refersTo": { + "resID": "users", + "resObject": { + "columnIndex": 0 + } + } + }, + { + 
"colIndex": 2, + "colName": "posts_count", + "role": [ + "suggestedTarget" + ], + "colType": "integer" + }, + { + "colIndex": 3, + "colName": "comments_count", + "role": [ + "suggestedTarget" + ], + "colType": "integer" + } + ] + } + ] +} \ No newline at end of file diff --git a/axolotl/tests/data/datasets/database_dataset_2/tables/comments.csv b/axolotl/tests/data/datasets/database_dataset_2/tables/comments.csv new file mode 100644 index 0000000..ab57bf3 --- /dev/null +++ b/axolotl/tests/data/datasets/database_dataset_2/tables/comments.csv @@ -0,0 +1,1001 @@ +id,post_id,author_id,comment +0,198,74,Comment 0 +1,383,3,Comment 1 +2,490,59,Comment 2 +3,471,40,Comment 3 +4,952,3,Comment 4 +5,581,3,Comment 5 +6,680,3,Comment 6 +7,945,45,Comment 7 +8,361,33,Comment 8 +9,327,3,Comment 9 +10,25,8,Comment 10 +11,165,41,Comment 11 +12,205,3,Comment 12 +13,164,8,Comment 13 +14,698,5,Comment 14 +15,455,23,Comment 15 +16,556,20,Comment 16 +17,784,3,Comment 17 +18,198,58,Comment 18 +19,299,3,Comment 19 +20,621,3,Comment 20 +21,337,6,Comment 21 +22,25,83,Comment 22 +23,393,3,Comment 23 +24,857,3,Comment 24 +25,360,93,Comment 25 +26,304,3,Comment 26 +27,985,3,Comment 27 +28,526,49,Comment 28 +29,327,3,Comment 29 +30,74,3,Comment 30 +31,728,3,Comment 31 +32,621,59,Comment 32 +33,870,87,Comment 33 +34,198,3,Comment 34 +35,91,3,Comment 35 +36,657,95,Comment 36 +37,185,36,Comment 37 +38,154,26,Comment 38 +39,297,65,Comment 39 +40,772,3,Comment 40 +41,459,3,Comment 41 +42,25,96,Comment 42 +43,421,67,Comment 43 +44,588,54,Comment 44 +45,458,3,Comment 45 +46,488,11,Comment 46 +47,198,93,Comment 47 +48,828,93,Comment 48 +49,488,1,Comment 49 +50,637,56,Comment 50 +51,968,1,Comment 51 +52,385,96,Comment 52 +53,857,5,Comment 53 +54,4,96,Comment 54 +55,25,3,Comment 55 +56,488,3,Comment 56 +57,293,3,Comment 57 +58,217,50,Comment 58 +59,232,3,Comment 59 +60,297,73,Comment 60 +61,663,35,Comment 61 +62,488,96,Comment 62 +63,732,97,Comment 63 +64,796,3,Comment 64 +65,824,95,Comment 65 +66,361,3,Comment 66 +67,373,3,Comment 67 +68,880,93,Comment 68 +69,545,96,Comment 69 +70,46,3,Comment 70 +71,461,8,Comment 71 +72,327,14,Comment 72 +73,982,72,Comment 73 +74,15,38,Comment 74 +75,494,59,Comment 75 +76,657,13,Comment 76 +77,251,38,Comment 77 +78,950,51,Comment 78 +79,842,52,Comment 79 +80,862,0,Comment 80 +81,22,3,Comment 81 +82,488,3,Comment 82 +83,265,3,Comment 83 +84,828,3,Comment 84 +85,510,3,Comment 85 +86,459,90,Comment 86 +87,91,31,Comment 87 +88,459,47,Comment 88 +89,509,81,Comment 89 +90,934,8,Comment 90 +91,488,0,Comment 91 +92,945,3,Comment 92 +93,938,65,Comment 93 +94,526,84,Comment 94 +95,451,3,Comment 95 +96,424,3,Comment 96 +97,857,92,Comment 97 +98,25,3,Comment 98 +99,90,39,Comment 99 +100,75,3,Comment 100 +101,702,3,Comment 101 +102,308,17,Comment 102 +103,519,91,Comment 103 +104,488,38,Comment 104 +105,327,3,Comment 105 +106,451,45,Comment 106 +107,526,59,Comment 107 +108,911,3,Comment 108 +109,488,28,Comment 109 +110,498,45,Comment 110 +111,26,3,Comment 111 +112,931,96,Comment 112 +113,451,8,Comment 113 +114,749,96,Comment 114 +115,25,3,Comment 115 +116,373,13,Comment 116 +117,219,3,Comment 117 +118,224,1,Comment 118 +119,735,3,Comment 119 +120,504,3,Comment 120 +121,394,8,Comment 121 +122,5,81,Comment 122 +123,943,38,Comment 123 +124,938,8,Comment 124 +125,198,97,Comment 125 +126,25,38,Comment 126 +127,451,3,Comment 127 +128,931,45,Comment 128 +129,857,15,Comment 129 +130,488,79,Comment 130 +131,275,83,Comment 131 +132,304,3,Comment 132 +133,857,30,Comment 133 +134,451,3,Comment 134 +135,120,38,Comment 135 
+136,217,98,Comment 136 +137,232,67,Comment 137 +138,106,3,Comment 138 +139,420,3,Comment 139 +140,864,96,Comment 140 +141,557,3,Comment 141 +142,32,3,Comment 142 +143,4,3,Comment 143 +144,232,13,Comment 144 +145,327,95,Comment 145 +146,719,96,Comment 146 +147,945,3,Comment 147 +148,329,3,Comment 148 +149,590,18,Comment 149 +150,991,1,Comment 150 +151,682,3,Comment 151 +152,516,93,Comment 152 +153,39,3,Comment 153 +154,297,3,Comment 154 +155,861,6,Comment 155 +156,185,50,Comment 156 +157,824,3,Comment 157 +158,600,3,Comment 158 +159,327,3,Comment 159 +160,451,76,Comment 160 +161,463,3,Comment 161 +162,638,30,Comment 162 +163,451,99,Comment 163 +164,120,49,Comment 164 +165,719,3,Comment 165 +166,358,59,Comment 166 +167,938,95,Comment 167 +168,242,3,Comment 168 +169,219,47,Comment 169 +170,304,3,Comment 170 +171,488,83,Comment 171 +172,857,3,Comment 172 +173,154,3,Comment 173 +174,232,23,Comment 174 +175,488,3,Comment 175 +176,411,38,Comment 176 +177,406,27,Comment 177 +178,784,48,Comment 178 +179,875,3,Comment 179 +180,438,24,Comment 180 +181,590,64,Comment 181 +182,749,64,Comment 182 +183,451,47,Comment 183 +184,264,23,Comment 184 +185,519,44,Comment 185 +186,488,3,Comment 186 +187,857,10,Comment 187 +188,25,72,Comment 188 +189,857,37,Comment 189 +190,452,93,Comment 190 +191,18,3,Comment 191 +192,323,3,Comment 192 +193,657,38,Comment 193 +194,451,3,Comment 194 +195,964,59,Comment 195 +196,377,3,Comment 196 +197,302,22,Comment 197 +198,784,64,Comment 198 +199,478,59,Comment 199 +200,584,15,Comment 200 +201,758,96,Comment 201 +202,562,16,Comment 202 +203,376,3,Comment 203 +204,109,3,Comment 204 +205,488,53,Comment 205 +206,857,8,Comment 206 +207,369,6,Comment 207 +208,857,3,Comment 208 +209,451,3,Comment 209 +210,504,72,Comment 210 +211,801,60,Comment 211 +212,488,54,Comment 212 +213,65,3,Comment 213 +214,965,3,Comment 214 +215,217,3,Comment 215 +216,626,11,Comment 216 +217,451,87,Comment 217 +218,435,3,Comment 218 +219,216,3,Comment 219 +220,656,32,Comment 220 +221,89,88,Comment 221 +222,986,52,Comment 222 +223,827,71,Comment 223 +224,452,3,Comment 224 +225,382,3,Comment 225 +226,244,23,Comment 226 +227,451,45,Comment 227 +228,857,95,Comment 228 +229,232,3,Comment 229 +230,451,3,Comment 230 +231,200,20,Comment 231 +232,304,45,Comment 232 +233,166,31,Comment 233 +234,986,48,Comment 234 +235,488,3,Comment 235 +236,665,72,Comment 236 +237,74,3,Comment 237 +238,327,3,Comment 238 +239,857,59,Comment 239 +240,55,3,Comment 240 +241,143,44,Comment 241 +242,504,3,Comment 242 +243,504,3,Comment 243 +244,808,3,Comment 244 +245,896,81,Comment 245 +246,122,8,Comment 246 +247,219,3,Comment 247 +248,383,3,Comment 248 +249,1,1,Comment 249 +250,27,57,Comment 250 +251,198,3,Comment 251 +252,438,37,Comment 252 +253,219,3,Comment 253 +254,857,67,Comment 254 +255,298,3,Comment 255 +256,877,1,Comment 256 +257,952,3,Comment 257 +258,408,90,Comment 258 +259,198,3,Comment 259 +260,258,18,Comment 260 +261,719,3,Comment 261 +262,230,3,Comment 262 +263,327,92,Comment 263 +264,435,59,Comment 264 +265,488,58,Comment 265 +266,993,20,Comment 266 +267,198,66,Comment 267 +268,327,8,Comment 268 +269,219,8,Comment 269 +270,327,14,Comment 270 +271,74,3,Comment 271 +272,327,92,Comment 272 +273,857,3,Comment 273 +274,268,3,Comment 274 +275,327,3,Comment 275 +276,919,65,Comment 276 +277,661,59,Comment 277 +278,451,8,Comment 278 +279,719,3,Comment 279 +280,105,96,Comment 280 +281,421,3,Comment 281 +282,101,70,Comment 282 +283,198,59,Comment 283 +284,121,93,Comment 284 +285,327,59,Comment 285 +286,327,3,Comment 286 
+287,928,3,Comment 287 +288,219,3,Comment 288 +289,431,3,Comment 289 +290,767,48,Comment 290 +291,770,66,Comment 291 +292,692,38,Comment 292 +293,248,3,Comment 293 +294,451,3,Comment 294 +295,165,33,Comment 295 +296,165,8,Comment 296 +297,394,3,Comment 297 +298,677,3,Comment 298 +299,451,23,Comment 299 +300,857,38,Comment 300 +301,797,11,Comment 301 +302,25,96,Comment 302 +303,488,3,Comment 303 +304,488,41,Comment 304 +305,938,86,Comment 305 +306,25,3,Comment 306 +307,451,3,Comment 307 +308,185,58,Comment 308 +309,25,13,Comment 309 +310,488,98,Comment 310 +311,719,35,Comment 311 +312,719,92,Comment 312 +313,25,2,Comment 313 +314,359,3,Comment 314 +315,25,45,Comment 315 +316,217,5,Comment 316 +317,172,67,Comment 317 +318,198,8,Comment 318 +319,307,47,Comment 319 +320,232,31,Comment 320 +321,938,3,Comment 321 +322,327,48,Comment 322 +323,857,47,Comment 323 +324,163,3,Comment 324 +325,712,29,Comment 325 +326,26,3,Comment 326 +327,419,8,Comment 327 +328,4,7,Comment 328 +329,764,3,Comment 329 +330,931,3,Comment 330 +331,25,3,Comment 331 +332,893,3,Comment 332 +333,719,70,Comment 333 +334,327,3,Comment 334 +335,857,8,Comment 335 +336,266,60,Comment 336 +337,360,3,Comment 337 +338,74,3,Comment 338 +339,540,49,Comment 339 +340,25,38,Comment 340 +341,447,30,Comment 341 +342,587,3,Comment 342 +343,784,59,Comment 343 +344,122,3,Comment 344 +345,698,3,Comment 345 +346,645,59,Comment 346 +347,488,3,Comment 347 +348,488,3,Comment 348 +349,66,3,Comment 349 +350,488,3,Comment 350 +351,327,3,Comment 351 +352,856,45,Comment 352 +353,294,3,Comment 353 +354,488,3,Comment 354 +355,383,59,Comment 355 +356,857,87,Comment 356 +357,219,38,Comment 357 +358,40,3,Comment 358 +359,689,23,Comment 359 +360,360,38,Comment 360 +361,488,3,Comment 361 +362,25,67,Comment 362 +363,931,59,Comment 363 +364,857,33,Comment 364 +365,504,3,Comment 365 +366,21,45,Comment 366 +367,327,83,Comment 367 +368,961,3,Comment 368 +369,582,95,Comment 369 +370,137,59,Comment 370 +371,419,3,Comment 371 +372,945,38,Comment 372 +373,452,3,Comment 373 +374,25,3,Comment 374 +375,503,66,Comment 375 +376,226,43,Comment 376 +377,74,38,Comment 377 +378,353,3,Comment 378 +379,488,3,Comment 379 +380,21,77,Comment 380 +381,488,3,Comment 381 +382,451,3,Comment 382 +383,740,3,Comment 383 +384,379,3,Comment 384 +385,483,59,Comment 385 +386,682,38,Comment 386 +387,857,3,Comment 387 +388,327,3,Comment 388 +389,488,31,Comment 389 +390,599,38,Comment 390 +391,25,3,Comment 391 +392,748,37,Comment 392 +393,327,3,Comment 393 +394,559,3,Comment 394 +395,431,3,Comment 395 +396,611,3,Comment 396 +397,657,96,Comment 397 +398,168,96,Comment 398 +399,546,8,Comment 399 +400,828,3,Comment 400 +401,203,95,Comment 401 +402,702,66,Comment 402 +403,512,48,Comment 403 +404,931,8,Comment 404 +405,219,4,Comment 405 +406,122,59,Comment 406 +407,293,20,Comment 407 +408,219,41,Comment 408 +409,702,31,Comment 409 +410,665,47,Comment 410 +411,559,3,Comment 411 +412,198,3,Comment 412 +413,849,3,Comment 413 +414,935,3,Comment 414 +415,451,3,Comment 415 +416,526,18,Comment 416 +417,451,3,Comment 417 +418,242,96,Comment 418 +419,244,20,Comment 419 +420,294,3,Comment 420 +421,168,45,Comment 421 +422,857,3,Comment 422 +423,841,3,Comment 423 +424,419,67,Comment 424 +425,327,3,Comment 425 +426,904,3,Comment 426 +427,198,3,Comment 427 +428,483,49,Comment 428 +429,25,41,Comment 429 +430,168,1,Comment 430 +431,488,3,Comment 431 +432,546,47,Comment 432 +433,488,3,Comment 433 +434,857,33,Comment 434 +435,987,3,Comment 435 +436,712,8,Comment 436 +437,423,74,Comment 437 +438,803,3,Comment 438 
+439,102,45,Comment 439 +440,587,8,Comment 440 +441,451,3,Comment 441 +442,158,3,Comment 442 +443,950,68,Comment 443 +444,305,96,Comment 444 +445,499,69,Comment 445 +446,857,3,Comment 446 +447,451,3,Comment 447 +448,91,67,Comment 448 +449,779,26,Comment 449 +450,327,59,Comment 450 +451,970,59,Comment 451 +452,857,3,Comment 452 +453,833,3,Comment 453 +454,327,12,Comment 454 +455,702,93,Comment 455 +456,979,84,Comment 456 +457,451,7,Comment 457 +458,431,3,Comment 458 +459,18,3,Comment 459 +460,609,21,Comment 460 +461,841,85,Comment 461 +462,217,3,Comment 462 +463,294,3,Comment 463 +464,451,38,Comment 464 +465,32,18,Comment 465 +466,63,3,Comment 466 +467,373,3,Comment 467 +468,219,13,Comment 468 +469,358,59,Comment 469 +470,412,59,Comment 470 +471,954,96,Comment 471 +472,919,3,Comment 472 +473,121,3,Comment 473 +474,857,0,Comment 474 +475,45,67,Comment 475 +476,451,38,Comment 476 +477,142,59,Comment 477 +478,327,10,Comment 478 +479,682,90,Comment 479 +480,382,3,Comment 480 +481,758,3,Comment 481 +482,645,3,Comment 482 +483,918,3,Comment 483 +484,452,3,Comment 484 +485,753,31,Comment 485 +486,297,3,Comment 486 +487,326,38,Comment 487 +488,351,93,Comment 488 +489,360,66,Comment 489 +490,78,3,Comment 490 +491,451,3,Comment 491 +492,74,94,Comment 492 +493,857,98,Comment 493 +494,26,3,Comment 494 +495,408,8,Comment 495 +496,243,3,Comment 496 +497,122,47,Comment 497 +498,725,6,Comment 498 +499,639,60,Comment 499 +500,122,33,Comment 500 +501,510,3,Comment 501 +502,738,3,Comment 502 +503,719,63,Comment 503 +504,68,45,Comment 504 +505,590,38,Comment 505 +506,641,44,Comment 506 +507,849,37,Comment 507 +508,451,38,Comment 508 +509,419,38,Comment 509 +510,944,21,Comment 510 +511,741,57,Comment 511 +512,488,3,Comment 512 +513,304,11,Comment 513 +514,827,3,Comment 514 +515,414,3,Comment 515 +516,69,3,Comment 516 +517,413,3,Comment 517 +518,857,3,Comment 518 +519,408,3,Comment 519 +520,18,3,Comment 520 +521,491,3,Comment 521 +522,993,3,Comment 522 +523,774,3,Comment 523 +524,500,3,Comment 524 +525,861,3,Comment 525 +526,768,50,Comment 526 +527,451,3,Comment 527 +528,488,3,Comment 528 +529,665,36,Comment 529 +530,828,3,Comment 530 +531,25,3,Comment 531 +532,719,3,Comment 532 +533,406,6,Comment 533 +534,545,45,Comment 534 +535,712,38,Comment 535 +536,459,1,Comment 536 +537,128,59,Comment 537 +538,360,3,Comment 538 +539,36,45,Comment 539 +540,590,3,Comment 540 +541,931,3,Comment 541 +542,741,3,Comment 542 +543,334,65,Comment 543 +544,488,3,Comment 544 +545,277,44,Comment 545 +546,459,3,Comment 546 +547,272,3,Comment 547 +548,676,93,Comment 548 +549,219,36,Comment 549 +550,940,82,Comment 550 +551,546,3,Comment 551 +552,26,67,Comment 552 +553,5,93,Comment 553 +554,993,3,Comment 554 +555,198,85,Comment 555 +556,293,3,Comment 556 +557,191,3,Comment 557 +558,881,99,Comment 558 +559,529,94,Comment 559 +560,451,3,Comment 560 +561,485,67,Comment 561 +562,297,3,Comment 562 +563,498,96,Comment 563 +564,371,70,Comment 564 +565,185,85,Comment 565 +566,539,32,Comment 566 +567,327,3,Comment 567 +568,725,3,Comment 568 +569,363,20,Comment 569 +570,451,44,Comment 570 +571,727,53,Comment 571 +572,407,3,Comment 572 +573,945,21,Comment 573 +574,738,8,Comment 574 +575,792,3,Comment 575 +576,931,26,Comment 576 +577,330,3,Comment 577 +578,488,21,Comment 578 +579,596,3,Comment 579 +580,568,84,Comment 580 +581,931,3,Comment 581 +582,249,37,Comment 582 +583,258,3,Comment 583 +584,868,38,Comment 584 +585,745,3,Comment 585 +586,185,95,Comment 586 +587,488,3,Comment 587 +588,682,37,Comment 588 +589,327,3,Comment 589 +590,393,72,Comment 
590 +591,488,61,Comment 591 +592,425,3,Comment 592 +593,696,87,Comment 593 +594,4,3,Comment 594 +595,393,56,Comment 595 +596,305,8,Comment 596 +597,488,67,Comment 597 +598,719,8,Comment 598 +599,36,3,Comment 599 +600,127,3,Comment 600 +601,554,98,Comment 601 +602,676,3,Comment 602 +603,488,3,Comment 603 +604,969,61,Comment 604 +605,451,3,Comment 605 +606,121,3,Comment 606 +607,590,3,Comment 607 +608,488,3,Comment 608 +609,488,99,Comment 609 +610,472,45,Comment 610 +611,905,20,Comment 611 +612,118,61,Comment 612 +613,32,42,Comment 613 +614,122,3,Comment 614 +615,216,3,Comment 615 +616,488,74,Comment 616 +617,495,3,Comment 617 +618,198,8,Comment 618 +619,451,3,Comment 619 +620,440,3,Comment 620 +621,408,47,Comment 621 +622,754,22,Comment 622 +623,431,44,Comment 623 +624,702,3,Comment 624 +625,145,3,Comment 625 +626,451,3,Comment 626 +627,789,6,Comment 627 +628,158,40,Comment 628 +629,423,93,Comment 629 +630,488,42,Comment 630 +631,165,20,Comment 631 +632,702,5,Comment 632 +633,41,3,Comment 633 +634,924,11,Comment 634 +635,428,3,Comment 635 +636,304,33,Comment 636 +637,96,50,Comment 637 +638,388,93,Comment 638 +639,545,3,Comment 639 +640,70,3,Comment 640 +641,282,3,Comment 641 +642,806,78,Comment 642 +643,526,66,Comment 643 +644,191,59,Comment 644 +645,137,3,Comment 645 +646,857,27,Comment 646 +647,245,3,Comment 647 +648,159,3,Comment 648 +649,999,59,Comment 649 +650,291,3,Comment 650 +651,109,17,Comment 651 +652,419,20,Comment 652 +653,393,3,Comment 653 +654,880,14,Comment 654 +655,120,3,Comment 655 +656,66,0,Comment 656 +657,69,78,Comment 657 +658,857,5,Comment 658 +659,408,85,Comment 659 +660,122,15,Comment 660 +661,488,8,Comment 661 +662,458,3,Comment 662 +663,183,3,Comment 663 +664,488,3,Comment 664 +665,308,37,Comment 665 +666,205,58,Comment 666 +667,451,23,Comment 667 +668,258,57,Comment 668 +669,198,3,Comment 669 +670,857,34,Comment 670 +671,124,79,Comment 671 +672,234,3,Comment 672 +673,328,3,Comment 673 +674,231,55,Comment 674 +675,195,64,Comment 675 +676,719,3,Comment 676 +677,304,3,Comment 677 +678,89,38,Comment 678 +679,459,3,Comment 679 +680,110,38,Comment 680 +681,327,15,Comment 681 +682,857,59,Comment 682 +683,327,69,Comment 683 +684,509,73,Comment 684 +685,451,38,Comment 685 +686,121,38,Comment 686 +687,526,8,Comment 687 +688,837,3,Comment 688 +689,69,3,Comment 689 +690,697,3,Comment 690 +691,590,8,Comment 691 +692,855,70,Comment 692 +693,78,36,Comment 693 +694,282,44,Comment 694 +695,598,3,Comment 695 +696,25,3,Comment 696 +697,666,3,Comment 697 +698,841,68,Comment 698 +699,408,3,Comment 699 +700,393,30,Comment 700 +701,232,3,Comment 701 +702,4,3,Comment 702 +703,165,3,Comment 703 +704,964,3,Comment 704 +705,856,3,Comment 705 +706,224,37,Comment 706 +707,940,3,Comment 707 +708,327,59,Comment 708 +709,266,3,Comment 709 +710,122,3,Comment 710 +711,857,44,Comment 711 +712,980,37,Comment 712 +713,304,3,Comment 713 +714,613,8,Comment 714 +715,304,3,Comment 715 +716,78,38,Comment 716 +717,337,3,Comment 717 +718,483,44,Comment 718 +719,105,8,Comment 719 +720,778,3,Comment 720 +721,451,54,Comment 721 +722,200,3,Comment 722 +723,488,3,Comment 723 +724,738,3,Comment 724 +725,304,72,Comment 725 +726,609,14,Comment 726 +727,384,20,Comment 727 +728,941,3,Comment 728 +729,718,3,Comment 729 +730,327,3,Comment 730 +731,21,20,Comment 731 +732,542,45,Comment 732 +733,181,17,Comment 733 +734,103,67,Comment 734 +735,889,3,Comment 735 +736,999,3,Comment 736 +737,226,7,Comment 737 +738,272,3,Comment 738 +739,142,3,Comment 739 +740,419,96,Comment 740 +741,855,71,Comment 741 +742,609,3,Comment 742 
+743,828,3,Comment 743 +744,198,8,Comment 744 +745,665,59,Comment 745 +746,868,3,Comment 746 +747,236,3,Comment 747 +748,590,3,Comment 748 +749,351,38,Comment 749 +750,254,4,Comment 750 +751,950,70,Comment 751 +752,327,3,Comment 752 +753,81,13,Comment 753 +754,329,47,Comment 754 +755,407,24,Comment 755 +756,695,3,Comment 756 +757,931,3,Comment 757 +758,773,22,Comment 758 +759,889,3,Comment 759 +760,431,93,Comment 760 +761,646,22,Comment 761 +762,290,3,Comment 762 +763,26,48,Comment 763 +764,327,3,Comment 764 +765,602,59,Comment 765 +766,232,8,Comment 766 +767,848,3,Comment 767 +768,734,3,Comment 768 +769,174,98,Comment 769 +770,304,3,Comment 770 +771,790,94,Comment 771 +772,216,17,Comment 772 +773,304,17,Comment 773 +774,317,3,Comment 774 +775,749,94,Comment 775 +776,25,50,Comment 776 +777,32,20,Comment 777 +778,488,3,Comment 778 +779,346,3,Comment 779 +780,510,8,Comment 780 +781,224,3,Comment 781 +782,857,28,Comment 782 +783,708,3,Comment 783 +784,26,3,Comment 784 +785,725,18,Comment 785 +786,950,3,Comment 786 +787,917,3,Comment 787 +788,668,3,Comment 788 +789,106,3,Comment 789 +790,488,3,Comment 790 +791,243,3,Comment 791 +792,950,3,Comment 792 +793,644,3,Comment 793 +794,490,3,Comment 794 +795,600,3,Comment 795 +796,394,8,Comment 796 +797,327,67,Comment 797 +798,896,93,Comment 798 +799,304,3,Comment 799 +800,25,3,Comment 800 +801,352,31,Comment 801 +802,734,3,Comment 802 +803,526,3,Comment 803 +804,938,3,Comment 804 +805,81,3,Comment 805 +806,860,3,Comment 806 +807,327,29,Comment 807 +808,431,38,Comment 808 +809,385,97,Comment 809 +810,95,3,Comment 810 +811,217,3,Comment 811 +812,682,94,Comment 812 +813,25,73,Comment 813 +814,120,38,Comment 814 +815,25,68,Comment 815 +816,841,3,Comment 816 +817,501,76,Comment 817 +818,148,74,Comment 818 +819,713,3,Comment 819 +820,945,18,Comment 820 +821,895,93,Comment 821 +822,870,44,Comment 822 +823,4,41,Comment 823 +824,488,29,Comment 824 +825,219,3,Comment 825 +826,488,96,Comment 826 +827,297,8,Comment 827 +828,122,3,Comment 828 +829,403,49,Comment 829 +830,451,37,Comment 830 +831,986,3,Comment 831 +832,25,3,Comment 832 +833,272,3,Comment 833 +834,828,3,Comment 834 +835,545,38,Comment 835 +836,792,18,Comment 836 +837,545,3,Comment 837 +838,703,3,Comment 838 +839,451,3,Comment 839 +840,185,52,Comment 840 +841,763,3,Comment 841 +842,488,3,Comment 842 +843,121,3,Comment 843 +844,757,3,Comment 844 +845,938,76,Comment 845 +846,327,3,Comment 846 +847,261,95,Comment 847 +848,49,3,Comment 848 +849,553,3,Comment 849 +850,938,74,Comment 850 +851,121,7,Comment 851 +852,447,3,Comment 852 +853,74,45,Comment 853 +854,25,3,Comment 854 +855,553,82,Comment 855 +856,857,68,Comment 856 +857,305,3,Comment 857 +858,857,59,Comment 858 +859,96,96,Comment 859 +860,205,14,Comment 860 +861,857,32,Comment 861 +862,451,45,Comment 862 +863,488,3,Comment 863 +864,25,38,Comment 864 +865,117,11,Comment 865 +866,25,3,Comment 866 +867,857,84,Comment 867 +868,120,59,Comment 868 +869,828,93,Comment 869 +870,327,67,Comment 870 +871,747,3,Comment 871 +872,327,37,Comment 872 +873,225,45,Comment 873 +874,69,43,Comment 874 +875,235,3,Comment 875 +876,431,7,Comment 876 +877,775,3,Comment 877 +878,408,3,Comment 878 +879,950,29,Comment 879 +880,460,33,Comment 880 +881,25,3,Comment 881 +882,363,8,Comment 882 +883,590,11,Comment 883 +884,200,2,Comment 884 +885,605,3,Comment 885 +886,451,38,Comment 886 +887,25,98,Comment 887 +888,719,39,Comment 888 +889,488,3,Comment 889 +890,51,66,Comment 890 +891,431,8,Comment 891 +892,245,3,Comment 892 +893,857,37,Comment 893 +894,243,3,Comment 894 
+895,915,5,Comment 895 +896,473,3,Comment 896 +897,297,38,Comment 897 +898,768,8,Comment 898 +899,602,3,Comment 899 +900,361,3,Comment 900 +901,504,17,Comment 901 +902,719,59,Comment 902 +903,828,1,Comment 903 +904,771,38,Comment 904 +905,144,3,Comment 905 +906,792,37,Comment 906 +907,25,67,Comment 907 +908,185,3,Comment 908 +909,504,15,Comment 909 +910,91,59,Comment 910 +911,488,3,Comment 911 +912,286,3,Comment 912 +913,385,85,Comment 913 +914,294,45,Comment 914 +915,28,51,Comment 915 +916,451,69,Comment 916 +917,939,22,Comment 917 +918,492,13,Comment 918 +919,271,27,Comment 919 +920,934,48,Comment 920 +921,332,3,Comment 921 +922,602,3,Comment 922 +923,513,3,Comment 923 +924,931,96,Comment 924 +925,861,3,Comment 925 +926,122,47,Comment 926 +927,55,93,Comment 927 +928,762,3,Comment 928 +929,857,48,Comment 929 +930,451,3,Comment 930 +931,590,3,Comment 931 +932,27,70,Comment 932 +933,198,86,Comment 933 +934,122,3,Comment 934 +935,488,38,Comment 935 +936,180,3,Comment 936 +937,25,59,Comment 937 +938,272,79,Comment 938 +939,574,3,Comment 939 +940,488,40,Comment 940 +941,304,77,Comment 941 +942,802,3,Comment 942 +943,232,70,Comment 943 +944,219,32,Comment 944 +945,488,4,Comment 945 +946,434,20,Comment 946 +947,404,66,Comment 947 +948,124,48,Comment 948 +949,451,17,Comment 949 +950,219,13,Comment 950 +951,337,84,Comment 951 +952,665,3,Comment 952 +953,899,3,Comment 953 +954,719,44,Comment 954 +955,358,46,Comment 955 +956,488,32,Comment 956 +957,684,3,Comment 957 +958,361,3,Comment 958 +959,327,38,Comment 959 +960,120,3,Comment 960 +961,670,17,Comment 961 +962,809,3,Comment 962 +963,296,18,Comment 963 +964,725,3,Comment 964 +965,490,3,Comment 965 +966,725,51,Comment 966 +967,360,3,Comment 967 +968,686,3,Comment 968 +969,360,71,Comment 969 +970,60,3,Comment 970 +971,482,3,Comment 971 +972,411,47,Comment 972 +973,219,3,Comment 973 +974,857,31,Comment 974 +975,327,38,Comment 975 +976,25,96,Comment 976 +977,327,3,Comment 977 +978,382,3,Comment 978 +979,848,93,Comment 979 +980,744,48,Comment 980 +981,185,8,Comment 981 +982,811,51,Comment 982 +983,217,4,Comment 983 +984,312,4,Comment 984 +985,36,3,Comment 985 +986,25,20,Comment 986 +987,581,3,Comment 987 +988,873,38,Comment 988 +989,451,3,Comment 989 +990,824,70,Comment 990 +991,739,59,Comment 991 +992,553,3,Comment 992 +993,959,35,Comment 993 +994,753,47,Comment 994 +995,232,19,Comment 995 +996,732,3,Comment 996 +997,593,18,Comment 997 +998,350,8,Comment 998 +999,36,3,Comment 999 diff --git a/axolotl/tests/data/datasets/database_dataset_2/tables/learningData.csv b/axolotl/tests/data/datasets/database_dataset_2/tables/learningData.csv new file mode 100644 index 0000000..c257772 --- /dev/null +++ b/axolotl/tests/data/datasets/database_dataset_2/tables/learningData.csv @@ -0,0 +1,101 @@ +d3mIndex,user_id,posts_count,comments_count +0,0,7,4 +1,1,8,9 +2,2,9,2 +3,3,5,425 +4,4,2,5 +5,5,8,6 +6,6,4,6 +7,7,16,5 +8,8,48,40 +9,9,6,0 +10,10,15,2 +11,11,2,7 +12,12,3,1 +13,13,26,8 +14,14,2,5 +15,15,6,5 +16,16,4,1 +17,17,10,8 +18,18,13,9 +19,19,6,1 +20,20,83,14 +21,21,11,4 +22,22,9,5 +23,23,10,7 +24,24,4,2 +25,25,6,0 +26,26,1,3 +27,27,34,3 +28,28,6,2 +29,29,6,4 +30,30,3,4 +31,31,9,8 +32,32,8,5 +33,33,8,7 +34,34,2,1 +35,35,1,3 +36,36,5,4 +37,37,12,13 +38,38,38,42 +39,39,4,2 +40,40,1,3 +41,41,6,5 +42,42,13,2 +43,43,5,2 +44,44,1,11 +45,45,4,20 +46,46,4,1 +47,47,3,13 +48,48,5,10 +49,49,4,5 +50,50,7,5 +51,51,4,4 +52,52,118,3 +53,53,2,2 +54,54,3,3 +55,55,1,1 +56,56,10,2 +57,57,1,3 +58,58,2,4 +59,59,2,37 +60,60,2,3 +61,61,2,3 +62,62,5,0 +63,63,1,1 +64,64,2,4 +65,65,4,4 
+66,66,11,8 +67,67,3,15 +68,68,17,4 +69,69,5,3 +70,70,67,8 +71,71,5,3 +72,72,49,6 +73,73,2,3 +74,74,14,5 +75,75,3,0 +76,76,5,3 +77,77,2,2 +78,78,6,2 +79,79,5,3 +80,80,7,0 +81,81,3,3 +82,82,3,2 +83,83,8,4 +84,84,5,5 +85,85,4,5 +86,86,3,2 +87,87,3,4 +88,88,4,1 +89,89,31,0 +90,90,4,3 +91,91,8,1 +92,92,6,4 +93,93,16,19 +94,94,3,5 +95,95,6,9 +96,96,6,23 +97,97,1,3 +98,98,9,6 +99,99,4,3 diff --git a/axolotl/tests/data/datasets/database_dataset_2/tables/posts.csv b/axolotl/tests/data/datasets/database_dataset_2/tables/posts.csv new file mode 100644 index 0000000..0d17d9f --- /dev/null +++ b/axolotl/tests/data/datasets/database_dataset_2/tables/posts.csv @@ -0,0 +1,1001 @@ +id,author_id,post +0,66,Post 0 +1,20,Post 1 +2,70,Post 2 +3,93,Post 3 +4,20,Post 4 +5,52,Post 5 +6,52,Post 6 +7,52,Post 7 +8,20,Post 8 +9,91,Post 9 +10,41,Post 10 +11,72,Post 11 +12,68,Post 12 +13,23,Post 13 +14,72,Post 14 +15,37,Post 15 +16,80,Post 16 +17,52,Post 17 +18,80,Post 18 +19,68,Post 19 +20,70,Post 20 +21,52,Post 21 +22,91,Post 22 +23,57,Post 23 +24,38,Post 24 +25,52,Post 25 +26,2,Post 26 +27,23,Post 27 +28,62,Post 28 +29,22,Post 29 +30,52,Post 30 +31,38,Post 31 +32,13,Post 32 +33,23,Post 33 +34,52,Post 34 +35,52,Post 35 +36,52,Post 36 +37,60,Post 37 +38,59,Post 38 +39,38,Post 39 +40,83,Post 40 +41,31,Post 41 +42,39,Post 42 +43,83,Post 43 +44,72,Post 44 +45,69,Post 45 +46,8,Post 46 +47,88,Post 47 +48,70,Post 48 +49,99,Post 49 +50,13,Post 50 +51,76,Post 51 +52,16,Post 52 +53,52,Post 53 +54,11,Post 54 +55,72,Post 55 +56,72,Post 56 +57,52,Post 57 +58,38,Post 58 +59,8,Post 59 +60,68,Post 60 +61,42,Post 61 +62,70,Post 62 +63,75,Post 63 +64,95,Post 64 +65,74,Post 65 +66,1,Post 66 +67,30,Post 67 +68,70,Post 68 +69,17,Post 69 +70,52,Post 70 +71,7,Post 71 +72,19,Post 72 +73,2,Post 73 +74,72,Post 74 +75,20,Post 75 +76,27,Post 76 +77,89,Post 77 +78,69,Post 78 +79,5,Post 79 +80,16,Post 80 +81,52,Post 81 +82,52,Post 82 +83,20,Post 83 +84,89,Post 84 +85,52,Post 85 +86,52,Post 86 +87,52,Post 87 +88,70,Post 88 +89,25,Post 89 +90,37,Post 90 +91,20,Post 91 +92,18,Post 92 +93,89,Post 93 +94,70,Post 94 +95,50,Post 95 +96,20,Post 96 +97,20,Post 97 +98,7,Post 98 +99,38,Post 99 +100,25,Post 100 +101,68,Post 101 +102,33,Post 102 +103,18,Post 103 +104,3,Post 104 +105,8,Post 105 +106,66,Post 106 +107,42,Post 107 +108,52,Post 108 +109,83,Post 109 +110,98,Post 110 +111,20,Post 111 +112,62,Post 112 +113,20,Post 113 +114,3,Post 114 +115,70,Post 115 +116,27,Post 116 +117,34,Post 117 +118,52,Post 118 +119,72,Post 119 +120,54,Post 120 +121,77,Post 121 +122,20,Post 122 +123,72,Post 123 +124,18,Post 124 +125,91,Post 125 +126,66,Post 126 +127,20,Post 127 +128,89,Post 128 +129,70,Post 129 +130,20,Post 130 +131,20,Post 131 +132,52,Post 132 +133,4,Post 133 +134,20,Post 134 +135,38,Post 135 +136,32,Post 136 +137,44,Post 137 +138,21,Post 138 +139,52,Post 139 +140,75,Post 140 +141,10,Post 141 +142,52,Post 142 +143,13,Post 143 +144,70,Post 144 +145,36,Post 145 +146,52,Post 146 +147,18,Post 147 +148,13,Post 148 +149,49,Post 149 +150,29,Post 150 +151,89,Post 151 +152,70,Post 152 +153,70,Post 153 +154,84,Post 154 +155,8,Post 155 +156,52,Post 156 +157,52,Post 157 +158,93,Post 158 +159,22,Post 159 +160,20,Post 160 +161,8,Post 161 +162,2,Post 162 +163,89,Post 163 +164,64,Post 164 +165,72,Post 165 +166,21,Post 166 +167,52,Post 167 +168,8,Post 168 +169,49,Post 169 +170,96,Post 170 +171,78,Post 171 +172,27,Post 172 +173,93,Post 173 +174,20,Post 174 +175,90,Post 175 +176,89,Post 176 +177,72,Post 177 +178,54,Post 178 +179,78,Post 179 +180,23,Post 180 +181,72,Post 181 
+182,52,Post 182 +183,1,Post 183 +184,27,Post 184 +185,13,Post 185 +186,96,Post 186 +187,47,Post 187 +188,51,Post 188 +189,56,Post 189 +190,31,Post 190 +191,13,Post 191 +192,72,Post 192 +193,19,Post 193 +194,52,Post 194 +195,20,Post 195 +196,8,Post 196 +197,74,Post 197 +198,94,Post 198 +199,92,Post 199 +200,85,Post 200 +201,70,Post 201 +202,27,Post 202 +203,8,Post 203 +204,38,Post 204 +205,20,Post 205 +206,13,Post 206 +207,7,Post 207 +208,70,Post 208 +209,1,Post 209 +210,70,Post 210 +211,13,Post 211 +212,8,Post 212 +213,8,Post 213 +214,65,Post 214 +215,20,Post 215 +216,38,Post 216 +217,52,Post 217 +218,74,Post 218 +219,70,Post 219 +220,20,Post 220 +221,13,Post 221 +222,7,Post 222 +223,23,Post 223 +224,20,Post 224 +225,42,Post 225 +226,66,Post 226 +227,68,Post 227 +228,21,Post 228 +229,33,Post 229 +230,18,Post 230 +231,72,Post 231 +232,7,Post 232 +233,68,Post 233 +234,71,Post 234 +235,71,Post 235 +236,20,Post 236 +237,32,Post 237 +238,52,Post 238 +239,20,Post 239 +240,31,Post 240 +241,19,Post 241 +242,43,Post 242 +243,7,Post 243 +244,72,Post 244 +245,8,Post 245 +246,52,Post 246 +247,25,Post 247 +248,52,Post 248 +249,92,Post 249 +250,58,Post 250 +251,5,Post 251 +252,38,Post 252 +253,52,Post 253 +254,52,Post 254 +255,66,Post 255 +256,21,Post 256 +257,12,Post 257 +258,36,Post 258 +259,91,Post 259 +260,18,Post 260 +261,85,Post 261 +262,52,Post 262 +263,42,Post 263 +264,80,Post 264 +265,43,Post 265 +266,70,Post 266 +267,37,Post 267 +268,85,Post 268 +269,67,Post 269 +270,68,Post 270 +271,27,Post 271 +272,70,Post 272 +273,56,Post 273 +274,20,Post 274 +275,15,Post 275 +276,72,Post 276 +277,92,Post 277 +278,43,Post 278 +279,52,Post 279 +280,74,Post 280 +281,42,Post 281 +282,91,Post 282 +283,52,Post 283 +284,72,Post 284 +285,86,Post 285 +286,72,Post 286 +287,15,Post 287 +288,54,Post 288 +289,37,Post 289 +290,8,Post 290 +291,38,Post 291 +292,20,Post 292 +293,72,Post 293 +294,5,Post 294 +295,92,Post 295 +296,29,Post 296 +297,29,Post 297 +298,2,Post 298 +299,18,Post 299 +300,37,Post 300 +301,89,Post 301 +302,8,Post 302 +303,89,Post 303 +304,76,Post 304 +305,42,Post 305 +306,27,Post 306 +307,20,Post 307 +308,52,Post 308 +309,5,Post 309 +310,2,Post 310 +311,38,Post 311 +312,8,Post 312 +313,20,Post 313 +314,20,Post 314 +315,20,Post 315 +316,13,Post 316 +317,1,Post 317 +318,10,Post 318 +319,52,Post 319 +320,95,Post 320 +321,98,Post 321 +322,38,Post 322 +323,16,Post 323 +324,56,Post 324 +325,50,Post 325 +326,98,Post 326 +327,8,Post 327 +328,72,Post 328 +329,22,Post 329 +330,20,Post 330 +331,62,Post 331 +332,20,Post 332 +333,63,Post 333 +334,52,Post 334 +335,38,Post 335 +336,52,Post 336 +337,21,Post 337 +338,69,Post 338 +339,38,Post 339 +340,30,Post 340 +341,72,Post 341 +342,89,Post 342 +343,7,Post 343 +344,20,Post 344 +345,28,Post 345 +346,72,Post 346 +347,98,Post 347 +348,93,Post 348 +349,85,Post 349 +350,23,Post 350 +351,98,Post 351 +352,20,Post 352 +353,9,Post 353 +354,90,Post 354 +355,20,Post 355 +356,67,Post 356 +357,7,Post 357 +358,70,Post 358 +359,80,Post 359 +360,20,Post 360 +361,33,Post 361 +362,32,Post 362 +363,70,Post 363 +364,20,Post 364 +365,17,Post 365 +366,41,Post 366 +367,24,Post 367 +368,72,Post 368 +369,20,Post 369 +370,52,Post 370 +371,89,Post 371 +372,55,Post 372 +373,76,Post 373 +374,89,Post 374 +375,70,Post 375 +376,68,Post 376 +377,93,Post 377 +378,98,Post 378 +379,42,Post 379 +380,8,Post 380 +381,22,Post 381 +382,13,Post 382 +383,38,Post 383 +384,13,Post 384 +385,52,Post 385 +386,34,Post 386 +387,83,Post 387 +388,93,Post 388 +389,52,Post 389 +390,20,Post 390 +391,52,Post 391 
+392,83,Post 392 +393,38,Post 393 +394,52,Post 394 +395,20,Post 395 +396,42,Post 396 +397,37,Post 397 +398,20,Post 398 +399,52,Post 399 +400,25,Post 400 +401,32,Post 401 +402,52,Post 402 +403,70,Post 403 +404,27,Post 404 +405,89,Post 405 +406,74,Post 406 +407,7,Post 407 +408,20,Post 408 +409,41,Post 409 +410,8,Post 410 +411,28,Post 411 +412,70,Post 412 +413,66,Post 413 +414,52,Post 414 +415,70,Post 415 +416,20,Post 416 +417,27,Post 417 +418,66,Post 418 +419,79,Post 419 +420,52,Post 420 +421,21,Post 421 +422,5,Post 422 +423,70,Post 423 +424,1,Post 424 +425,32,Post 425 +426,52,Post 426 +427,89,Post 427 +428,8,Post 428 +429,38,Post 429 +430,3,Post 430 +431,27,Post 431 +432,52,Post 432 +433,21,Post 433 +434,20,Post 434 +435,10,Post 435 +436,52,Post 436 +437,68,Post 437 +438,65,Post 438 +439,90,Post 439 +440,0,Post 440 +441,58,Post 441 +442,52,Post 442 +443,52,Post 443 +444,93,Post 444 +445,2,Post 445 +446,68,Post 446 +447,72,Post 447 +448,52,Post 448 +449,27,Post 449 +450,72,Post 450 +451,8,Post 451 +452,39,Post 452 +453,52,Post 453 +454,68,Post 454 +455,8,Post 455 +456,20,Post 456 +457,38,Post 457 +458,52,Post 458 +459,81,Post 459 +460,52,Post 460 +461,13,Post 461 +462,96,Post 462 +463,77,Post 463 +464,52,Post 464 +465,89,Post 465 +466,52,Post 466 +467,89,Post 467 +468,72,Post 468 +469,93,Post 469 +470,88,Post 470 +471,6,Post 471 +472,17,Post 472 +473,35,Post 473 +474,91,Post 474 +475,23,Post 475 +476,15,Post 476 +477,81,Post 477 +478,41,Post 478 +479,86,Post 479 +480,15,Post 480 +481,62,Post 481 +482,39,Post 482 +483,8,Post 483 +484,68,Post 484 +485,20,Post 485 +486,6,Post 486 +487,8,Post 487 +488,8,Post 488 +489,86,Post 489 +490,70,Post 490 +491,83,Post 491 +492,65,Post 492 +493,52,Post 493 +494,24,Post 494 +495,99,Post 495 +496,31,Post 496 +497,45,Post 497 +498,33,Post 498 +499,96,Post 499 +500,17,Post 500 +501,27,Post 501 +502,66,Post 502 +503,8,Post 503 +504,52,Post 504 +505,46,Post 505 +506,21,Post 506 +507,20,Post 507 +508,52,Post 508 +509,31,Post 509 +510,42,Post 510 +511,27,Post 511 +512,94,Post 512 +513,13,Post 513 +514,8,Post 514 +515,27,Post 515 +516,52,Post 516 +517,62,Post 517 +518,37,Post 518 +519,99,Post 519 +520,28,Post 520 +521,70,Post 521 +522,56,Post 522 +523,72,Post 523 +524,95,Post 524 +525,82,Post 525 +526,70,Post 526 +527,68,Post 527 +528,27,Post 528 +529,13,Post 529 +530,8,Post 530 +531,20,Post 531 +532,38,Post 532 +533,52,Post 533 +534,70,Post 534 +535,92,Post 535 +536,10,Post 536 +537,9,Post 537 +538,52,Post 538 +539,70,Post 539 +540,72,Post 540 +541,89,Post 541 +542,97,Post 542 +543,37,Post 543 +544,33,Post 544 +545,13,Post 545 +546,66,Post 546 +547,61,Post 547 +548,74,Post 548 +549,8,Post 549 +550,51,Post 550 +551,52,Post 551 +552,20,Post 552 +553,17,Post 553 +554,74,Post 554 +555,8,Post 555 +556,45,Post 556 +557,10,Post 557 +558,42,Post 558 +559,96,Post 559 +560,38,Post 560 +561,74,Post 561 +562,10,Post 562 +563,20,Post 563 +564,38,Post 564 +565,37,Post 565 +566,64,Post 566 +567,27,Post 567 +568,70,Post 568 +569,56,Post 569 +570,37,Post 570 +571,38,Post 571 +572,52,Post 572 +573,8,Post 573 +574,72,Post 574 +575,60,Post 575 +576,70,Post 576 +577,52,Post 577 +578,10,Post 578 +579,38,Post 579 +580,38,Post 580 +581,27,Post 581 +582,5,Post 582 +583,70,Post 583 +584,10,Post 584 +585,52,Post 585 +586,68,Post 586 +587,56,Post 587 +588,92,Post 588 +589,8,Post 589 +590,76,Post 590 +591,5,Post 591 +592,52,Post 592 +593,38,Post 593 +594,52,Post 594 +595,31,Post 595 +596,19,Post 596 +597,2,Post 597 +598,52,Post 598 +599,72,Post 599 +600,32,Post 600 +601,20,Post 601 
+602,8,Post 602 +603,8,Post 603 +604,20,Post 604 +605,8,Post 605 +606,20,Post 606 +607,8,Post 607 +608,8,Post 608 +609,74,Post 609 +610,15,Post 610 +611,52,Post 611 +612,70,Post 612 +613,42,Post 613 +614,13,Post 614 +615,19,Post 615 +616,38,Post 616 +617,52,Post 617 +618,28,Post 618 +619,72,Post 619 +620,70,Post 620 +621,89,Post 621 +622,4,Post 622 +623,83,Post 623 +624,36,Post 624 +625,79,Post 625 +626,67,Post 626 +627,98,Post 627 +628,70,Post 628 +629,31,Post 629 +630,52,Post 630 +631,33,Post 631 +632,31,Post 632 +633,20,Post 633 +634,51,Post 634 +635,66,Post 635 +636,20,Post 636 +637,52,Post 637 +638,10,Post 638 +639,15,Post 639 +640,7,Post 640 +641,94,Post 641 +642,0,Post 642 +643,18,Post 643 +644,52,Post 644 +645,8,Post 645 +646,80,Post 646 +647,70,Post 647 +648,93,Post 648 +649,52,Post 649 +650,23,Post 650 +651,52,Post 651 +652,89,Post 652 +653,52,Post 653 +654,20,Post 654 +655,79,Post 655 +656,32,Post 656 +657,0,Post 657 +658,20,Post 658 +659,27,Post 659 +660,74,Post 660 +661,43,Post 661 +662,40,Post 662 +663,27,Post 663 +664,80,Post 664 +665,89,Post 665 +666,98,Post 666 +667,33,Post 667 +668,93,Post 668 +669,72,Post 669 +670,65,Post 670 +671,20,Post 671 +672,20,Post 672 +673,17,Post 673 +674,89,Post 674 +675,23,Post 675 +676,42,Post 676 +677,50,Post 677 +678,71,Post 678 +679,72,Post 679 +680,13,Post 680 +681,38,Post 681 +682,72,Post 682 +683,72,Post 683 +684,8,Post 684 +685,14,Post 685 +686,24,Post 686 +687,8,Post 687 +688,38,Post 688 +689,9,Post 689 +690,52,Post 690 +691,20,Post 691 +692,52,Post 692 +693,10,Post 693 +694,95,Post 694 +695,89,Post 695 +696,36,Post 696 +697,20,Post 697 +698,20,Post 698 +699,48,Post 699 +700,6,Post 700 +701,56,Post 701 +702,38,Post 702 +703,33,Post 703 +704,72,Post 704 +705,70,Post 705 +706,91,Post 706 +707,28,Post 707 +708,83,Post 708 +709,70,Post 709 +710,29,Post 710 +711,52,Post 711 +712,22,Post 712 +713,78,Post 713 +714,10,Post 714 +715,20,Post 715 +716,18,Post 716 +717,38,Post 717 +718,70,Post 718 +719,52,Post 719 +720,49,Post 720 +721,0,Post 721 +722,38,Post 722 +723,8,Post 723 +724,20,Post 724 +725,89,Post 725 +726,20,Post 726 +727,74,Post 727 +728,72,Post 728 +729,14,Post 729 +730,52,Post 730 +731,10,Post 731 +732,70,Post 732 +733,56,Post 733 +734,72,Post 734 +735,47,Post 735 +736,87,Post 736 +737,7,Post 737 +738,22,Post 738 +739,70,Post 739 +740,38,Post 740 +741,17,Post 741 +742,9,Post 742 +743,72,Post 743 +744,45,Post 744 +745,80,Post 745 +746,70,Post 746 +747,38,Post 747 +748,32,Post 748 +749,52,Post 749 +750,82,Post 750 +751,70,Post 751 +752,0,Post 752 +753,68,Post 753 +754,88,Post 754 +755,70,Post 755 +756,17,Post 756 +757,48,Post 757 +758,13,Post 758 +759,30,Post 759 +760,89,Post 760 +761,89,Post 761 +762,21,Post 762 +763,27,Post 763 +764,52,Post 764 +765,93,Post 765 +766,13,Post 766 +767,20,Post 767 +768,78,Post 768 +769,50,Post 769 +770,84,Post 770 +771,18,Post 771 +772,52,Post 772 +773,27,Post 773 +774,27,Post 774 +775,41,Post 775 +776,38,Post 776 +777,29,Post 777 +778,87,Post 778 +779,70,Post 779 +780,70,Post 780 +781,22,Post 781 +782,52,Post 782 +783,71,Post 783 +784,72,Post 784 +785,27,Post 785 +786,70,Post 786 +787,70,Post 787 +788,13,Post 788 +789,75,Post 789 +790,39,Post 790 +791,49,Post 791 +792,41,Post 792 +793,52,Post 793 +794,52,Post 794 +795,51,Post 795 +796,76,Post 796 +797,53,Post 797 +798,37,Post 798 +799,38,Post 799 +800,72,Post 800 +801,27,Post 801 +802,20,Post 802 +803,8,Post 803 +804,78,Post 804 +805,88,Post 805 +806,10,Post 806 +807,27,Post 807 +808,17,Post 808 +809,10,Post 809 +810,84,Post 810 +811,7,Post 811 
+812,96,Post 812 +813,8,Post 813 +814,74,Post 814 +815,52,Post 815 +816,31,Post 816 +817,27,Post 817 +818,70,Post 818 +819,26,Post 819 +820,61,Post 820 +821,52,Post 821 +822,48,Post 822 +823,84,Post 823 +824,52,Post 824 +825,72,Post 825 +826,70,Post 826 +827,6,Post 827 +828,70,Post 828 +829,20,Post 829 +830,84,Post 830 +831,7,Post 831 +832,27,Post 832 +833,8,Post 833 +834,46,Post 834 +835,72,Post 835 +836,23,Post 836 +837,13,Post 837 +838,27,Post 838 +839,72,Post 839 +840,13,Post 840 +841,20,Post 841 +842,8,Post 842 +843,69,Post 843 +844,36,Post 844 +845,25,Post 845 +846,70,Post 846 +847,27,Post 847 +848,70,Post 848 +849,72,Post 849 +850,20,Post 850 +851,95,Post 851 +852,16,Post 852 +853,22,Post 853 +854,18,Post 854 +855,27,Post 855 +856,47,Post 856 +857,52,Post 857 +858,73,Post 858 +859,82,Post 859 +860,20,Post 860 +861,52,Post 861 +862,10,Post 862 +863,43,Post 863 +864,27,Post 864 +865,27,Post 865 +866,48,Post 866 +867,70,Post 867 +868,8,Post 868 +869,79,Post 869 +870,70,Post 870 +871,17,Post 871 +872,89,Post 872 +873,52,Post 873 +874,99,Post 874 +875,19,Post 875 +876,52,Post 876 +877,22,Post 877 +878,24,Post 878 +879,52,Post 879 +880,89,Post 880 +881,72,Post 881 +882,70,Post 882 +883,52,Post 883 +884,89,Post 884 +885,50,Post 885 +886,78,Post 886 +887,72,Post 887 +888,20,Post 888 +889,70,Post 889 +890,1,Post 890 +891,27,Post 891 +892,20,Post 892 +893,52,Post 893 +894,70,Post 894 +895,8,Post 895 +896,52,Post 896 +897,89,Post 897 +898,20,Post 898 +899,66,Post 899 +900,52,Post 900 +901,1,Post 901 +902,46,Post 902 +903,70,Post 903 +904,7,Post 904 +905,79,Post 905 +906,52,Post 906 +907,5,Post 907 +908,20,Post 908 +909,91,Post 909 +910,52,Post 910 +911,9,Post 911 +912,21,Post 912 +913,42,Post 913 +914,3,Post 914 +915,38,Post 915 +916,50,Post 916 +917,20,Post 917 +918,52,Post 918 +919,70,Post 919 +920,20,Post 920 +921,52,Post 921 +922,56,Post 922 +923,90,Post 923 +924,71,Post 924 +925,72,Post 925 +926,50,Post 926 +927,18,Post 927 +928,98,Post 928 +929,12,Post 929 +930,45,Post 930 +931,8,Post 931 +932,89,Post 932 +933,93,Post 933 +934,70,Post 934 +935,28,Post 935 +936,20,Post 936 +937,20,Post 937 +938,12,Post 938 +939,52,Post 939 +940,13,Post 940 +941,27,Post 941 +942,53,Post 942 +943,70,Post 943 +944,3,Post 944 +945,38,Post 945 +946,59,Post 946 +947,73,Post 947 +948,46,Post 948 +949,93,Post 949 +950,20,Post 950 +951,2,Post 951 +952,48,Post 952 +953,20,Post 953 +954,72,Post 954 +955,20,Post 955 +956,25,Post 956 +957,72,Post 957 +958,70,Post 958 +959,52,Post 959 +960,69,Post 960 +961,38,Post 961 +962,0,Post 962 +963,1,Post 963 +964,52,Post 964 +965,8,Post 965 +966,7,Post 966 +967,93,Post 967 +968,74,Post 968 +969,13,Post 969 +970,0,Post 970 +971,89,Post 971 +972,21,Post 972 +973,18,Post 973 +974,68,Post 974 +975,9,Post 975 +976,20,Post 976 +977,95,Post 977 +978,56,Post 978 +979,52,Post 979 +980,37,Post 980 +981,70,Post 981 +982,13,Post 982 +983,93,Post 983 +984,74,Post 984 +985,52,Post 985 +986,7,Post 986 +987,68,Post 987 +988,87,Post 988 +989,52,Post 989 +990,29,Post 990 +991,11,Post 991 +992,70,Post 992 +993,81,Post 993 +994,8,Post 994 +995,70,Post 995 +996,2,Post 996 +997,93,Post 997 +998,52,Post 998 +999,52,Post 999 diff --git a/axolotl/tests/data/datasets/database_dataset_2/tables/users.csv b/axolotl/tests/data/datasets/database_dataset_2/tables/users.csv new file mode 100644 index 0000000..01f98db --- /dev/null +++ b/axolotl/tests/data/datasets/database_dataset_2/tables/users.csv @@ -0,0 +1,101 @@ +id,name +0,User 0 +1,User 1 +2,User 2 +3,User 3 +4,User 4 +5,User 5 +6,User 6 +7,User 7 
+8,User 8 +9,User 9 +10,User 10 +11,User 11 +12,User 12 +13,User 13 +14,User 14 +15,User 15 +16,User 16 +17,User 17 +18,User 18 +19,User 19 +20,User 20 +21,User 21 +22,User 22 +23,User 23 +24,User 24 +25,User 25 +26,User 26 +27,User 27 +28,User 28 +29,User 29 +30,User 30 +31,User 31 +32,User 32 +33,User 33 +34,User 34 +35,User 35 +36,User 36 +37,User 37 +38,User 38 +39,User 39 +40,User 40 +41,User 41 +42,User 42 +43,User 43 +44,User 44 +45,User 45 +46,User 46 +47,User 47 +48,User 48 +49,User 49 +50,User 50 +51,User 51 +52,User 52 +53,User 53 +54,User 54 +55,User 55 +56,User 56 +57,User 57 +58,User 58 +59,User 59 +60,User 60 +61,User 61 +62,User 62 +63,User 63 +64,User 64 +65,User 65 +66,User 66 +67,User 67 +68,User 68 +69,User 69 +70,User 70 +71,User 71 +72,User 72 +73,User 73 +74,User 74 +75,User 75 +76,User 76 +77,User 77 +78,User 78 +79,User 79 +80,User 80 +81,User 81 +82,User 82 +83,User 83 +84,User 84 +85,User 85 +86,User 86 +87,User 87 +88,User 88 +89,User 89 +90,User 90 +91,User 91 +92,User 92 +93,User 93 +94,User 94 +95,User 95 +96,User 96 +97,User 97 +98,User 98 +99,User 99 diff --git a/axolotl/tests/data/datasets/database_dataset_3/datasetDoc.json b/axolotl/tests/data/datasets/database_dataset_3/datasetDoc.json new file mode 100644 index 0000000..d37f42b --- /dev/null +++ b/axolotl/tests/data/datasets/database_dataset_3/datasetDoc.json @@ -0,0 +1,188 @@ +{ + "about": { + "datasetSchemaVersion": "4.0.0", + "datasetID": "database_dataset_3", + "datasetName": "Database dataset of type COMMENTS_PER_POST", + "description": "Database dataset of type COMMENTS_PER_POST, size 100, random seed 0", + "digest": "7dc0973f7fcb22fe487fe37fdaa8c269589074504c53b9728b5a5ec85e2ebb9b", + "datasetVersion": "4.0.0" + }, + "dataResources": [ + { + "resID": "users", + "isCollection": false, + "columnsCount": 2, + "resFormat": { + "text/csv": [ + "csv" + ] + }, + "resType": "table", + "resPath": "tables/users.csv", + "columns": [ + { + "colIndex": 0, + "colName": "id", + "role": [ + "index" + ], + "colType": "integer" + }, + { + "colIndex": 1, + "colName": "name", + "role": [ + "attribute" + ], + "colType": "string" + } + ] + }, + { + "resID": "posts", + "isCollection": false, + "columnsCount": 3, + "resFormat": { + "text/csv": [ + "csv" + ] + }, + "resType": "table", + "resPath": "tables/posts.csv", + "columns": [ + { + "colIndex": 0, + "colName": "id", + "role": [ + "index" + ], + "colType": "integer" + }, + { + "colIndex": 1, + "colName": "author_id", + "role": [ + "attribute" + ], + "colType": "integer", + "refersTo": { + "resID": "users", + "resObject": { + "columnIndex": 0 + } + } + }, + { + "colIndex": 2, + "colName": "post", + "role": [ + "attribute" + ], + "colType": "string" + } + ] + }, + { + "resID": "comments", + "isCollection": false, + "columnsCount": 4, + "resFormat": { + "text/csv": [ + "csv" + ] + }, + "resType": "table", + "resPath": "tables/comments.csv", + "columns": [ + { + "colIndex": 0, + "colName": "id", + "role": [ + "index" + ], + "colType": "integer" + }, + { + "colIndex": 1, + "colName": "post_id", + "role": [ + "attribute" + ], + "colType": "integer", + "refersTo": { + "resID": "posts", + "resObject": { + "columnIndex": 0 + } + } + }, + { + "colIndex": 2, + "colName": "author_id", + "role": [ + "attribute" + ], + "colType": "integer", + "refersTo": { + "resID": "users", + "resObject": { + "columnIndex": 0 + } + } + }, + { + "colIndex": 3, + "colName": "comment", + "role": [ + "attribute" + ], + "colType": "string" + } + ] + }, + { + "resID": "learningData", + "isCollection": 
false, + "columnsCount": 3, + "resFormat": { + "text/csv": [ + "csv" + ] + }, + "resType": "table", + "resPath": "tables/learningData.csv", + "columns": [ + { + "colIndex": 0, + "colName": "d3mIndex", + "role": [ + "index" + ], + "colType": "integer" + }, + { + "colIndex": 1, + "colName": "post_id", + "role": [ + "attribute" + ], + "colType": "integer", + "refersTo": { + "resID": "posts", + "resObject": { + "columnIndex": 0 + } + } + }, + { + "colIndex": 2, + "colName": "comments_count", + "role": [ + "suggestedTarget" + ], + "colType": "integer" + } + ] + } + ] +} \ No newline at end of file diff --git a/axolotl/tests/data/datasets/database_dataset_3/tables/comments.csv b/axolotl/tests/data/datasets/database_dataset_3/tables/comments.csv new file mode 100644 index 0000000..ab57bf3 --- /dev/null +++ b/axolotl/tests/data/datasets/database_dataset_3/tables/comments.csv @@ -0,0 +1,1001 @@ +id,post_id,author_id,comment +0,198,74,Comment 0 +1,383,3,Comment 1 +2,490,59,Comment 2 +3,471,40,Comment 3 +4,952,3,Comment 4 +5,581,3,Comment 5 +6,680,3,Comment 6 +7,945,45,Comment 7 +8,361,33,Comment 8 +9,327,3,Comment 9 +10,25,8,Comment 10 +11,165,41,Comment 11 +12,205,3,Comment 12 +13,164,8,Comment 13 +14,698,5,Comment 14 +15,455,23,Comment 15 +16,556,20,Comment 16 +17,784,3,Comment 17 +18,198,58,Comment 18 +19,299,3,Comment 19 +20,621,3,Comment 20 +21,337,6,Comment 21 +22,25,83,Comment 22 +23,393,3,Comment 23 +24,857,3,Comment 24 +25,360,93,Comment 25 +26,304,3,Comment 26 +27,985,3,Comment 27 +28,526,49,Comment 28 +29,327,3,Comment 29 +30,74,3,Comment 30 +31,728,3,Comment 31 +32,621,59,Comment 32 +33,870,87,Comment 33 +34,198,3,Comment 34 +35,91,3,Comment 35 +36,657,95,Comment 36 +37,185,36,Comment 37 +38,154,26,Comment 38 +39,297,65,Comment 39 +40,772,3,Comment 40 +41,459,3,Comment 41 +42,25,96,Comment 42 +43,421,67,Comment 43 +44,588,54,Comment 44 +45,458,3,Comment 45 +46,488,11,Comment 46 +47,198,93,Comment 47 +48,828,93,Comment 48 +49,488,1,Comment 49 +50,637,56,Comment 50 +51,968,1,Comment 51 +52,385,96,Comment 52 +53,857,5,Comment 53 +54,4,96,Comment 54 +55,25,3,Comment 55 +56,488,3,Comment 56 +57,293,3,Comment 57 +58,217,50,Comment 58 +59,232,3,Comment 59 +60,297,73,Comment 60 +61,663,35,Comment 61 +62,488,96,Comment 62 +63,732,97,Comment 63 +64,796,3,Comment 64 +65,824,95,Comment 65 +66,361,3,Comment 66 +67,373,3,Comment 67 +68,880,93,Comment 68 +69,545,96,Comment 69 +70,46,3,Comment 70 +71,461,8,Comment 71 +72,327,14,Comment 72 +73,982,72,Comment 73 +74,15,38,Comment 74 +75,494,59,Comment 75 +76,657,13,Comment 76 +77,251,38,Comment 77 +78,950,51,Comment 78 +79,842,52,Comment 79 +80,862,0,Comment 80 +81,22,3,Comment 81 +82,488,3,Comment 82 +83,265,3,Comment 83 +84,828,3,Comment 84 +85,510,3,Comment 85 +86,459,90,Comment 86 +87,91,31,Comment 87 +88,459,47,Comment 88 +89,509,81,Comment 89 +90,934,8,Comment 90 +91,488,0,Comment 91 +92,945,3,Comment 92 +93,938,65,Comment 93 +94,526,84,Comment 94 +95,451,3,Comment 95 +96,424,3,Comment 96 +97,857,92,Comment 97 +98,25,3,Comment 98 +99,90,39,Comment 99 +100,75,3,Comment 100 +101,702,3,Comment 101 +102,308,17,Comment 102 +103,519,91,Comment 103 +104,488,38,Comment 104 +105,327,3,Comment 105 +106,451,45,Comment 106 +107,526,59,Comment 107 +108,911,3,Comment 108 +109,488,28,Comment 109 +110,498,45,Comment 110 +111,26,3,Comment 111 +112,931,96,Comment 112 +113,451,8,Comment 113 +114,749,96,Comment 114 +115,25,3,Comment 115 +116,373,13,Comment 116 +117,219,3,Comment 117 +118,224,1,Comment 118 +119,735,3,Comment 119 +120,504,3,Comment 120 +121,394,8,Comment 
121 +122,5,81,Comment 122 +123,943,38,Comment 123 +124,938,8,Comment 124 +125,198,97,Comment 125 +126,25,38,Comment 126 +127,451,3,Comment 127 +128,931,45,Comment 128 +129,857,15,Comment 129 +130,488,79,Comment 130 +131,275,83,Comment 131 +132,304,3,Comment 132 +133,857,30,Comment 133 +134,451,3,Comment 134 +135,120,38,Comment 135 +136,217,98,Comment 136 +137,232,67,Comment 137 +138,106,3,Comment 138 +139,420,3,Comment 139 +140,864,96,Comment 140 +141,557,3,Comment 141 +142,32,3,Comment 142 +143,4,3,Comment 143 +144,232,13,Comment 144 +145,327,95,Comment 145 +146,719,96,Comment 146 +147,945,3,Comment 147 +148,329,3,Comment 148 +149,590,18,Comment 149 +150,991,1,Comment 150 +151,682,3,Comment 151 +152,516,93,Comment 152 +153,39,3,Comment 153 +154,297,3,Comment 154 +155,861,6,Comment 155 +156,185,50,Comment 156 +157,824,3,Comment 157 +158,600,3,Comment 158 +159,327,3,Comment 159 +160,451,76,Comment 160 +161,463,3,Comment 161 +162,638,30,Comment 162 +163,451,99,Comment 163 +164,120,49,Comment 164 +165,719,3,Comment 165 +166,358,59,Comment 166 +167,938,95,Comment 167 +168,242,3,Comment 168 +169,219,47,Comment 169 +170,304,3,Comment 170 +171,488,83,Comment 171 +172,857,3,Comment 172 +173,154,3,Comment 173 +174,232,23,Comment 174 +175,488,3,Comment 175 +176,411,38,Comment 176 +177,406,27,Comment 177 +178,784,48,Comment 178 +179,875,3,Comment 179 +180,438,24,Comment 180 +181,590,64,Comment 181 +182,749,64,Comment 182 +183,451,47,Comment 183 +184,264,23,Comment 184 +185,519,44,Comment 185 +186,488,3,Comment 186 +187,857,10,Comment 187 +188,25,72,Comment 188 +189,857,37,Comment 189 +190,452,93,Comment 190 +191,18,3,Comment 191 +192,323,3,Comment 192 +193,657,38,Comment 193 +194,451,3,Comment 194 +195,964,59,Comment 195 +196,377,3,Comment 196 +197,302,22,Comment 197 +198,784,64,Comment 198 +199,478,59,Comment 199 +200,584,15,Comment 200 +201,758,96,Comment 201 +202,562,16,Comment 202 +203,376,3,Comment 203 +204,109,3,Comment 204 +205,488,53,Comment 205 +206,857,8,Comment 206 +207,369,6,Comment 207 +208,857,3,Comment 208 +209,451,3,Comment 209 +210,504,72,Comment 210 +211,801,60,Comment 211 +212,488,54,Comment 212 +213,65,3,Comment 213 +214,965,3,Comment 214 +215,217,3,Comment 215 +216,626,11,Comment 216 +217,451,87,Comment 217 +218,435,3,Comment 218 +219,216,3,Comment 219 +220,656,32,Comment 220 +221,89,88,Comment 221 +222,986,52,Comment 222 +223,827,71,Comment 223 +224,452,3,Comment 224 +225,382,3,Comment 225 +226,244,23,Comment 226 +227,451,45,Comment 227 +228,857,95,Comment 228 +229,232,3,Comment 229 +230,451,3,Comment 230 +231,200,20,Comment 231 +232,304,45,Comment 232 +233,166,31,Comment 233 +234,986,48,Comment 234 +235,488,3,Comment 235 +236,665,72,Comment 236 +237,74,3,Comment 237 +238,327,3,Comment 238 +239,857,59,Comment 239 +240,55,3,Comment 240 +241,143,44,Comment 241 +242,504,3,Comment 242 +243,504,3,Comment 243 +244,808,3,Comment 244 +245,896,81,Comment 245 +246,122,8,Comment 246 +247,219,3,Comment 247 +248,383,3,Comment 248 +249,1,1,Comment 249 +250,27,57,Comment 250 +251,198,3,Comment 251 +252,438,37,Comment 252 +253,219,3,Comment 253 +254,857,67,Comment 254 +255,298,3,Comment 255 +256,877,1,Comment 256 +257,952,3,Comment 257 +258,408,90,Comment 258 +259,198,3,Comment 259 +260,258,18,Comment 260 +261,719,3,Comment 261 +262,230,3,Comment 262 +263,327,92,Comment 263 +264,435,59,Comment 264 +265,488,58,Comment 265 +266,993,20,Comment 266 +267,198,66,Comment 267 +268,327,8,Comment 268 +269,219,8,Comment 269 +270,327,14,Comment 270 +271,74,3,Comment 271 +272,327,92,Comment 272 
+273,857,3,Comment 273 +274,268,3,Comment 274 +275,327,3,Comment 275 +276,919,65,Comment 276 +277,661,59,Comment 277 +278,451,8,Comment 278 +279,719,3,Comment 279 +280,105,96,Comment 280 +281,421,3,Comment 281 +282,101,70,Comment 282 +283,198,59,Comment 283 +284,121,93,Comment 284 +285,327,59,Comment 285 +286,327,3,Comment 286 +287,928,3,Comment 287 +288,219,3,Comment 288 +289,431,3,Comment 289 +290,767,48,Comment 290 +291,770,66,Comment 291 +292,692,38,Comment 292 +293,248,3,Comment 293 +294,451,3,Comment 294 +295,165,33,Comment 295 +296,165,8,Comment 296 +297,394,3,Comment 297 +298,677,3,Comment 298 +299,451,23,Comment 299 +300,857,38,Comment 300 +301,797,11,Comment 301 +302,25,96,Comment 302 +303,488,3,Comment 303 +304,488,41,Comment 304 +305,938,86,Comment 305 +306,25,3,Comment 306 +307,451,3,Comment 307 +308,185,58,Comment 308 +309,25,13,Comment 309 +310,488,98,Comment 310 +311,719,35,Comment 311 +312,719,92,Comment 312 +313,25,2,Comment 313 +314,359,3,Comment 314 +315,25,45,Comment 315 +316,217,5,Comment 316 +317,172,67,Comment 317 +318,198,8,Comment 318 +319,307,47,Comment 319 +320,232,31,Comment 320 +321,938,3,Comment 321 +322,327,48,Comment 322 +323,857,47,Comment 323 +324,163,3,Comment 324 +325,712,29,Comment 325 +326,26,3,Comment 326 +327,419,8,Comment 327 +328,4,7,Comment 328 +329,764,3,Comment 329 +330,931,3,Comment 330 +331,25,3,Comment 331 +332,893,3,Comment 332 +333,719,70,Comment 333 +334,327,3,Comment 334 +335,857,8,Comment 335 +336,266,60,Comment 336 +337,360,3,Comment 337 +338,74,3,Comment 338 +339,540,49,Comment 339 +340,25,38,Comment 340 +341,447,30,Comment 341 +342,587,3,Comment 342 +343,784,59,Comment 343 +344,122,3,Comment 344 +345,698,3,Comment 345 +346,645,59,Comment 346 +347,488,3,Comment 347 +348,488,3,Comment 348 +349,66,3,Comment 349 +350,488,3,Comment 350 +351,327,3,Comment 351 +352,856,45,Comment 352 +353,294,3,Comment 353 +354,488,3,Comment 354 +355,383,59,Comment 355 +356,857,87,Comment 356 +357,219,38,Comment 357 +358,40,3,Comment 358 +359,689,23,Comment 359 +360,360,38,Comment 360 +361,488,3,Comment 361 +362,25,67,Comment 362 +363,931,59,Comment 363 +364,857,33,Comment 364 +365,504,3,Comment 365 +366,21,45,Comment 366 +367,327,83,Comment 367 +368,961,3,Comment 368 +369,582,95,Comment 369 +370,137,59,Comment 370 +371,419,3,Comment 371 +372,945,38,Comment 372 +373,452,3,Comment 373 +374,25,3,Comment 374 +375,503,66,Comment 375 +376,226,43,Comment 376 +377,74,38,Comment 377 +378,353,3,Comment 378 +379,488,3,Comment 379 +380,21,77,Comment 380 +381,488,3,Comment 381 +382,451,3,Comment 382 +383,740,3,Comment 383 +384,379,3,Comment 384 +385,483,59,Comment 385 +386,682,38,Comment 386 +387,857,3,Comment 387 +388,327,3,Comment 388 +389,488,31,Comment 389 +390,599,38,Comment 390 +391,25,3,Comment 391 +392,748,37,Comment 392 +393,327,3,Comment 393 +394,559,3,Comment 394 +395,431,3,Comment 395 +396,611,3,Comment 396 +397,657,96,Comment 397 +398,168,96,Comment 398 +399,546,8,Comment 399 +400,828,3,Comment 400 +401,203,95,Comment 401 +402,702,66,Comment 402 +403,512,48,Comment 403 +404,931,8,Comment 404 +405,219,4,Comment 405 +406,122,59,Comment 406 +407,293,20,Comment 407 +408,219,41,Comment 408 +409,702,31,Comment 409 +410,665,47,Comment 410 +411,559,3,Comment 411 +412,198,3,Comment 412 +413,849,3,Comment 413 +414,935,3,Comment 414 +415,451,3,Comment 415 +416,526,18,Comment 416 +417,451,3,Comment 417 +418,242,96,Comment 418 +419,244,20,Comment 419 +420,294,3,Comment 420 +421,168,45,Comment 421 +422,857,3,Comment 422 +423,841,3,Comment 423 +424,419,67,Comment 424 
+425,327,3,Comment 425 +426,904,3,Comment 426 +427,198,3,Comment 427 +428,483,49,Comment 428 +429,25,41,Comment 429 +430,168,1,Comment 430 +431,488,3,Comment 431 +432,546,47,Comment 432 +433,488,3,Comment 433 +434,857,33,Comment 434 +435,987,3,Comment 435 +436,712,8,Comment 436 +437,423,74,Comment 437 +438,803,3,Comment 438 +439,102,45,Comment 439 +440,587,8,Comment 440 +441,451,3,Comment 441 +442,158,3,Comment 442 +443,950,68,Comment 443 +444,305,96,Comment 444 +445,499,69,Comment 445 +446,857,3,Comment 446 +447,451,3,Comment 447 +448,91,67,Comment 448 +449,779,26,Comment 449 +450,327,59,Comment 450 +451,970,59,Comment 451 +452,857,3,Comment 452 +453,833,3,Comment 453 +454,327,12,Comment 454 +455,702,93,Comment 455 +456,979,84,Comment 456 +457,451,7,Comment 457 +458,431,3,Comment 458 +459,18,3,Comment 459 +460,609,21,Comment 460 +461,841,85,Comment 461 +462,217,3,Comment 462 +463,294,3,Comment 463 +464,451,38,Comment 464 +465,32,18,Comment 465 +466,63,3,Comment 466 +467,373,3,Comment 467 +468,219,13,Comment 468 +469,358,59,Comment 469 +470,412,59,Comment 470 +471,954,96,Comment 471 +472,919,3,Comment 472 +473,121,3,Comment 473 +474,857,0,Comment 474 +475,45,67,Comment 475 +476,451,38,Comment 476 +477,142,59,Comment 477 +478,327,10,Comment 478 +479,682,90,Comment 479 +480,382,3,Comment 480 +481,758,3,Comment 481 +482,645,3,Comment 482 +483,918,3,Comment 483 +484,452,3,Comment 484 +485,753,31,Comment 485 +486,297,3,Comment 486 +487,326,38,Comment 487 +488,351,93,Comment 488 +489,360,66,Comment 489 +490,78,3,Comment 490 +491,451,3,Comment 491 +492,74,94,Comment 492 +493,857,98,Comment 493 +494,26,3,Comment 494 +495,408,8,Comment 495 +496,243,3,Comment 496 +497,122,47,Comment 497 +498,725,6,Comment 498 +499,639,60,Comment 499 +500,122,33,Comment 500 +501,510,3,Comment 501 +502,738,3,Comment 502 +503,719,63,Comment 503 +504,68,45,Comment 504 +505,590,38,Comment 505 +506,641,44,Comment 506 +507,849,37,Comment 507 +508,451,38,Comment 508 +509,419,38,Comment 509 +510,944,21,Comment 510 +511,741,57,Comment 511 +512,488,3,Comment 512 +513,304,11,Comment 513 +514,827,3,Comment 514 +515,414,3,Comment 515 +516,69,3,Comment 516 +517,413,3,Comment 517 +518,857,3,Comment 518 +519,408,3,Comment 519 +520,18,3,Comment 520 +521,491,3,Comment 521 +522,993,3,Comment 522 +523,774,3,Comment 523 +524,500,3,Comment 524 +525,861,3,Comment 525 +526,768,50,Comment 526 +527,451,3,Comment 527 +528,488,3,Comment 528 +529,665,36,Comment 529 +530,828,3,Comment 530 +531,25,3,Comment 531 +532,719,3,Comment 532 +533,406,6,Comment 533 +534,545,45,Comment 534 +535,712,38,Comment 535 +536,459,1,Comment 536 +537,128,59,Comment 537 +538,360,3,Comment 538 +539,36,45,Comment 539 +540,590,3,Comment 540 +541,931,3,Comment 541 +542,741,3,Comment 542 +543,334,65,Comment 543 +544,488,3,Comment 544 +545,277,44,Comment 545 +546,459,3,Comment 546 +547,272,3,Comment 547 +548,676,93,Comment 548 +549,219,36,Comment 549 +550,940,82,Comment 550 +551,546,3,Comment 551 +552,26,67,Comment 552 +553,5,93,Comment 553 +554,993,3,Comment 554 +555,198,85,Comment 555 +556,293,3,Comment 556 +557,191,3,Comment 557 +558,881,99,Comment 558 +559,529,94,Comment 559 +560,451,3,Comment 560 +561,485,67,Comment 561 +562,297,3,Comment 562 +563,498,96,Comment 563 +564,371,70,Comment 564 +565,185,85,Comment 565 +566,539,32,Comment 566 +567,327,3,Comment 567 +568,725,3,Comment 568 +569,363,20,Comment 569 +570,451,44,Comment 570 +571,727,53,Comment 571 +572,407,3,Comment 572 +573,945,21,Comment 573 +574,738,8,Comment 574 +575,792,3,Comment 575 +576,931,26,Comment 576 
+577,330,3,Comment 577 +578,488,21,Comment 578 +579,596,3,Comment 579 +580,568,84,Comment 580 +581,931,3,Comment 581 +582,249,37,Comment 582 +583,258,3,Comment 583 +584,868,38,Comment 584 +585,745,3,Comment 585 +586,185,95,Comment 586 +587,488,3,Comment 587 +588,682,37,Comment 588 +589,327,3,Comment 589 +590,393,72,Comment 590 +591,488,61,Comment 591 +592,425,3,Comment 592 +593,696,87,Comment 593 +594,4,3,Comment 594 +595,393,56,Comment 595 +596,305,8,Comment 596 +597,488,67,Comment 597 +598,719,8,Comment 598 +599,36,3,Comment 599 +600,127,3,Comment 600 +601,554,98,Comment 601 +602,676,3,Comment 602 +603,488,3,Comment 603 +604,969,61,Comment 604 +605,451,3,Comment 605 +606,121,3,Comment 606 +607,590,3,Comment 607 +608,488,3,Comment 608 +609,488,99,Comment 609 +610,472,45,Comment 610 +611,905,20,Comment 611 +612,118,61,Comment 612 +613,32,42,Comment 613 +614,122,3,Comment 614 +615,216,3,Comment 615 +616,488,74,Comment 616 +617,495,3,Comment 617 +618,198,8,Comment 618 +619,451,3,Comment 619 +620,440,3,Comment 620 +621,408,47,Comment 621 +622,754,22,Comment 622 +623,431,44,Comment 623 +624,702,3,Comment 624 +625,145,3,Comment 625 +626,451,3,Comment 626 +627,789,6,Comment 627 +628,158,40,Comment 628 +629,423,93,Comment 629 +630,488,42,Comment 630 +631,165,20,Comment 631 +632,702,5,Comment 632 +633,41,3,Comment 633 +634,924,11,Comment 634 +635,428,3,Comment 635 +636,304,33,Comment 636 +637,96,50,Comment 637 +638,388,93,Comment 638 +639,545,3,Comment 639 +640,70,3,Comment 640 +641,282,3,Comment 641 +642,806,78,Comment 642 +643,526,66,Comment 643 +644,191,59,Comment 644 +645,137,3,Comment 645 +646,857,27,Comment 646 +647,245,3,Comment 647 +648,159,3,Comment 648 +649,999,59,Comment 649 +650,291,3,Comment 650 +651,109,17,Comment 651 +652,419,20,Comment 652 +653,393,3,Comment 653 +654,880,14,Comment 654 +655,120,3,Comment 655 +656,66,0,Comment 656 +657,69,78,Comment 657 +658,857,5,Comment 658 +659,408,85,Comment 659 +660,122,15,Comment 660 +661,488,8,Comment 661 +662,458,3,Comment 662 +663,183,3,Comment 663 +664,488,3,Comment 664 +665,308,37,Comment 665 +666,205,58,Comment 666 +667,451,23,Comment 667 +668,258,57,Comment 668 +669,198,3,Comment 669 +670,857,34,Comment 670 +671,124,79,Comment 671 +672,234,3,Comment 672 +673,328,3,Comment 673 +674,231,55,Comment 674 +675,195,64,Comment 675 +676,719,3,Comment 676 +677,304,3,Comment 677 +678,89,38,Comment 678 +679,459,3,Comment 679 +680,110,38,Comment 680 +681,327,15,Comment 681 +682,857,59,Comment 682 +683,327,69,Comment 683 +684,509,73,Comment 684 +685,451,38,Comment 685 +686,121,38,Comment 686 +687,526,8,Comment 687 +688,837,3,Comment 688 +689,69,3,Comment 689 +690,697,3,Comment 690 +691,590,8,Comment 691 +692,855,70,Comment 692 +693,78,36,Comment 693 +694,282,44,Comment 694 +695,598,3,Comment 695 +696,25,3,Comment 696 +697,666,3,Comment 697 +698,841,68,Comment 698 +699,408,3,Comment 699 +700,393,30,Comment 700 +701,232,3,Comment 701 +702,4,3,Comment 702 +703,165,3,Comment 703 +704,964,3,Comment 704 +705,856,3,Comment 705 +706,224,37,Comment 706 +707,940,3,Comment 707 +708,327,59,Comment 708 +709,266,3,Comment 709 +710,122,3,Comment 710 +711,857,44,Comment 711 +712,980,37,Comment 712 +713,304,3,Comment 713 +714,613,8,Comment 714 +715,304,3,Comment 715 +716,78,38,Comment 716 +717,337,3,Comment 717 +718,483,44,Comment 718 +719,105,8,Comment 719 +720,778,3,Comment 720 +721,451,54,Comment 721 +722,200,3,Comment 722 +723,488,3,Comment 723 +724,738,3,Comment 724 +725,304,72,Comment 725 +726,609,14,Comment 726 +727,384,20,Comment 727 +728,941,3,Comment 728 
+729,718,3,Comment 729 +730,327,3,Comment 730 +731,21,20,Comment 731 +732,542,45,Comment 732 +733,181,17,Comment 733 +734,103,67,Comment 734 +735,889,3,Comment 735 +736,999,3,Comment 736 +737,226,7,Comment 737 +738,272,3,Comment 738 +739,142,3,Comment 739 +740,419,96,Comment 740 +741,855,71,Comment 741 +742,609,3,Comment 742 +743,828,3,Comment 743 +744,198,8,Comment 744 +745,665,59,Comment 745 +746,868,3,Comment 746 +747,236,3,Comment 747 +748,590,3,Comment 748 +749,351,38,Comment 749 +750,254,4,Comment 750 +751,950,70,Comment 751 +752,327,3,Comment 752 +753,81,13,Comment 753 +754,329,47,Comment 754 +755,407,24,Comment 755 +756,695,3,Comment 756 +757,931,3,Comment 757 +758,773,22,Comment 758 +759,889,3,Comment 759 +760,431,93,Comment 760 +761,646,22,Comment 761 +762,290,3,Comment 762 +763,26,48,Comment 763 +764,327,3,Comment 764 +765,602,59,Comment 765 +766,232,8,Comment 766 +767,848,3,Comment 767 +768,734,3,Comment 768 +769,174,98,Comment 769 +770,304,3,Comment 770 +771,790,94,Comment 771 +772,216,17,Comment 772 +773,304,17,Comment 773 +774,317,3,Comment 774 +775,749,94,Comment 775 +776,25,50,Comment 776 +777,32,20,Comment 777 +778,488,3,Comment 778 +779,346,3,Comment 779 +780,510,8,Comment 780 +781,224,3,Comment 781 +782,857,28,Comment 782 +783,708,3,Comment 783 +784,26,3,Comment 784 +785,725,18,Comment 785 +786,950,3,Comment 786 +787,917,3,Comment 787 +788,668,3,Comment 788 +789,106,3,Comment 789 +790,488,3,Comment 790 +791,243,3,Comment 791 +792,950,3,Comment 792 +793,644,3,Comment 793 +794,490,3,Comment 794 +795,600,3,Comment 795 +796,394,8,Comment 796 +797,327,67,Comment 797 +798,896,93,Comment 798 +799,304,3,Comment 799 +800,25,3,Comment 800 +801,352,31,Comment 801 +802,734,3,Comment 802 +803,526,3,Comment 803 +804,938,3,Comment 804 +805,81,3,Comment 805 +806,860,3,Comment 806 +807,327,29,Comment 807 +808,431,38,Comment 808 +809,385,97,Comment 809 +810,95,3,Comment 810 +811,217,3,Comment 811 +812,682,94,Comment 812 +813,25,73,Comment 813 +814,120,38,Comment 814 +815,25,68,Comment 815 +816,841,3,Comment 816 +817,501,76,Comment 817 +818,148,74,Comment 818 +819,713,3,Comment 819 +820,945,18,Comment 820 +821,895,93,Comment 821 +822,870,44,Comment 822 +823,4,41,Comment 823 +824,488,29,Comment 824 +825,219,3,Comment 825 +826,488,96,Comment 826 +827,297,8,Comment 827 +828,122,3,Comment 828 +829,403,49,Comment 829 +830,451,37,Comment 830 +831,986,3,Comment 831 +832,25,3,Comment 832 +833,272,3,Comment 833 +834,828,3,Comment 834 +835,545,38,Comment 835 +836,792,18,Comment 836 +837,545,3,Comment 837 +838,703,3,Comment 838 +839,451,3,Comment 839 +840,185,52,Comment 840 +841,763,3,Comment 841 +842,488,3,Comment 842 +843,121,3,Comment 843 +844,757,3,Comment 844 +845,938,76,Comment 845 +846,327,3,Comment 846 +847,261,95,Comment 847 +848,49,3,Comment 848 +849,553,3,Comment 849 +850,938,74,Comment 850 +851,121,7,Comment 851 +852,447,3,Comment 852 +853,74,45,Comment 853 +854,25,3,Comment 854 +855,553,82,Comment 855 +856,857,68,Comment 856 +857,305,3,Comment 857 +858,857,59,Comment 858 +859,96,96,Comment 859 +860,205,14,Comment 860 +861,857,32,Comment 861 +862,451,45,Comment 862 +863,488,3,Comment 863 +864,25,38,Comment 864 +865,117,11,Comment 865 +866,25,3,Comment 866 +867,857,84,Comment 867 +868,120,59,Comment 868 +869,828,93,Comment 869 +870,327,67,Comment 870 +871,747,3,Comment 871 +872,327,37,Comment 872 +873,225,45,Comment 873 +874,69,43,Comment 874 +875,235,3,Comment 875 +876,431,7,Comment 876 +877,775,3,Comment 877 +878,408,3,Comment 878 +879,950,29,Comment 879 +880,460,33,Comment 880 
+881,25,3,Comment 881 +882,363,8,Comment 882 +883,590,11,Comment 883 +884,200,2,Comment 884 +885,605,3,Comment 885 +886,451,38,Comment 886 +887,25,98,Comment 887 +888,719,39,Comment 888 +889,488,3,Comment 889 +890,51,66,Comment 890 +891,431,8,Comment 891 +892,245,3,Comment 892 +893,857,37,Comment 893 +894,243,3,Comment 894 +895,915,5,Comment 895 +896,473,3,Comment 896 +897,297,38,Comment 897 +898,768,8,Comment 898 +899,602,3,Comment 899 +900,361,3,Comment 900 +901,504,17,Comment 901 +902,719,59,Comment 902 +903,828,1,Comment 903 +904,771,38,Comment 904 +905,144,3,Comment 905 +906,792,37,Comment 906 +907,25,67,Comment 907 +908,185,3,Comment 908 +909,504,15,Comment 909 +910,91,59,Comment 910 +911,488,3,Comment 911 +912,286,3,Comment 912 +913,385,85,Comment 913 +914,294,45,Comment 914 +915,28,51,Comment 915 +916,451,69,Comment 916 +917,939,22,Comment 917 +918,492,13,Comment 918 +919,271,27,Comment 919 +920,934,48,Comment 920 +921,332,3,Comment 921 +922,602,3,Comment 922 +923,513,3,Comment 923 +924,931,96,Comment 924 +925,861,3,Comment 925 +926,122,47,Comment 926 +927,55,93,Comment 927 +928,762,3,Comment 928 +929,857,48,Comment 929 +930,451,3,Comment 930 +931,590,3,Comment 931 +932,27,70,Comment 932 +933,198,86,Comment 933 +934,122,3,Comment 934 +935,488,38,Comment 935 +936,180,3,Comment 936 +937,25,59,Comment 937 +938,272,79,Comment 938 +939,574,3,Comment 939 +940,488,40,Comment 940 +941,304,77,Comment 941 +942,802,3,Comment 942 +943,232,70,Comment 943 +944,219,32,Comment 944 +945,488,4,Comment 945 +946,434,20,Comment 946 +947,404,66,Comment 947 +948,124,48,Comment 948 +949,451,17,Comment 949 +950,219,13,Comment 950 +951,337,84,Comment 951 +952,665,3,Comment 952 +953,899,3,Comment 953 +954,719,44,Comment 954 +955,358,46,Comment 955 +956,488,32,Comment 956 +957,684,3,Comment 957 +958,361,3,Comment 958 +959,327,38,Comment 959 +960,120,3,Comment 960 +961,670,17,Comment 961 +962,809,3,Comment 962 +963,296,18,Comment 963 +964,725,3,Comment 964 +965,490,3,Comment 965 +966,725,51,Comment 966 +967,360,3,Comment 967 +968,686,3,Comment 968 +969,360,71,Comment 969 +970,60,3,Comment 970 +971,482,3,Comment 971 +972,411,47,Comment 972 +973,219,3,Comment 973 +974,857,31,Comment 974 +975,327,38,Comment 975 +976,25,96,Comment 976 +977,327,3,Comment 977 +978,382,3,Comment 978 +979,848,93,Comment 979 +980,744,48,Comment 980 +981,185,8,Comment 981 +982,811,51,Comment 982 +983,217,4,Comment 983 +984,312,4,Comment 984 +985,36,3,Comment 985 +986,25,20,Comment 986 +987,581,3,Comment 987 +988,873,38,Comment 988 +989,451,3,Comment 989 +990,824,70,Comment 990 +991,739,59,Comment 991 +992,553,3,Comment 992 +993,959,35,Comment 993 +994,753,47,Comment 994 +995,232,19,Comment 995 +996,732,3,Comment 996 +997,593,18,Comment 997 +998,350,8,Comment 998 +999,36,3,Comment 999 diff --git a/axolotl/tests/data/datasets/database_dataset_3/tables/learningData.csv b/axolotl/tests/data/datasets/database_dataset_3/tables/learningData.csv new file mode 100644 index 0000000..a4d8901 --- /dev/null +++ b/axolotl/tests/data/datasets/database_dataset_3/tables/learningData.csv @@ -0,0 +1,1001 @@ +d3mIndex,post_id,comments_count +0,0,0 +1,1,1 +2,2,0 +3,3,0 +4,4,6 +5,5,2 +6,6,0 +7,7,0 +8,8,0 +9,9,0 +10,10,0 +11,11,0 +12,12,0 +13,13,0 +14,14,0 +15,15,1 +16,16,0 +17,17,0 +18,18,3 +19,19,0 +20,20,0 +21,21,3 +22,22,1 +23,23,0 +24,24,0 +25,25,35 +26,26,6 +27,27,2 +28,28,1 +29,29,0 +30,30,0 +31,31,0 +32,32,4 +33,33,0 +34,34,0 +35,35,0 +36,36,4 +37,37,0 +38,38,0 +39,39,1 +40,40,1 +41,41,1 +42,42,0 +43,43,0 +44,44,0 +45,45,1 +46,46,1 +47,47,0 +48,48,0 
+49,49,1 +50,50,0 +51,51,1 +52,52,0 +53,53,0 +54,54,0 +55,55,2 +56,56,0 +57,57,0 +58,58,0 +59,59,0 +60,60,1 +61,61,0 +62,62,0 +63,63,1 +64,64,0 +65,65,1 +66,66,2 +67,67,0 +68,68,1 +69,69,4 +70,70,1 +71,71,0 +72,72,0 +73,73,0 +74,74,7 +75,75,1 +76,76,0 +77,77,0 +78,78,3 +79,79,0 +80,80,0 +81,81,2 +82,82,0 +83,83,0 +84,84,0 +85,85,0 +86,86,0 +87,87,0 +88,88,0 +89,89,2 +90,90,1 +91,91,4 +92,92,0 +93,93,0 +94,94,0 +95,95,1 +96,96,2 +97,97,0 +98,98,0 +99,99,0 +100,100,0 +101,101,1 +102,102,1 +103,103,1 +104,104,0 +105,105,2 +106,106,2 +107,107,0 +108,108,0 +109,109,2 +110,110,1 +111,111,0 +112,112,0 +113,113,0 +114,114,0 +115,115,0 +116,116,0 +117,117,1 +118,118,1 +119,119,0 +120,120,6 +121,121,6 +122,122,11 +123,123,0 +124,124,2 +125,125,0 +126,126,0 +127,127,1 +128,128,1 +129,129,0 +130,130,0 +131,131,0 +132,132,0 +133,133,0 +134,134,0 +135,135,0 +136,136,0 +137,137,2 +138,138,0 +139,139,0 +140,140,0 +141,141,0 +142,142,2 +143,143,1 +144,144,1 +145,145,1 +146,146,0 +147,147,0 +148,148,1 +149,149,0 +150,150,0 +151,151,0 +152,152,0 +153,153,0 +154,154,2 +155,155,0 +156,156,0 +157,157,0 +158,158,2 +159,159,1 +160,160,0 +161,161,0 +162,162,0 +163,163,1 +164,164,1 +165,165,5 +166,166,1 +167,167,0 +168,168,3 +169,169,0 +170,170,0 +171,171,0 +172,172,1 +173,173,0 +174,174,1 +175,175,0 +176,176,0 +177,177,0 +178,178,0 +179,179,0 +180,180,1 +181,181,1 +182,182,0 +183,183,1 +184,184,0 +185,185,8 +186,186,0 +187,187,0 +188,188,0 +189,189,0 +190,190,0 +191,191,2 +192,192,0 +193,193,0 +194,194,0 +195,195,1 +196,196,0 +197,197,0 +198,198,17 +199,199,0 +200,200,3 +201,201,0 +202,202,0 +203,203,1 +204,204,0 +205,205,3 +206,206,0 +207,207,0 +208,208,0 +209,209,0 +210,210,0 +211,211,0 +212,212,0 +213,213,0 +214,214,0 +215,215,0 +216,216,3 +217,217,7 +218,218,0 +219,219,15 +220,220,0 +221,221,0 +222,222,0 +223,223,0 +224,224,3 +225,225,1 +226,226,2 +227,227,0 +228,228,0 +229,229,0 +230,230,1 +231,231,1 +232,232,10 +233,233,0 +234,234,1 +235,235,1 +236,236,1 +237,237,0 +238,238,0 +239,239,0 +240,240,0 +241,241,0 +242,242,2 +243,243,3 +244,244,2 +245,245,2 +246,246,0 +247,247,0 +248,248,1 +249,249,1 +250,250,0 +251,251,1 +252,252,0 +253,253,0 +254,254,1 +255,255,0 +256,256,0 +257,257,0 +258,258,3 +259,259,0 +260,260,0 +261,261,1 +262,262,0 +263,263,0 +264,264,1 +265,265,1 +266,266,2 +267,267,0 +268,268,1 +269,269,0 +270,270,0 +271,271,1 +272,272,4 +273,273,0 +274,274,0 +275,275,1 +276,276,0 +277,277,1 +278,278,0 +279,279,0 +280,280,0 +281,281,0 +282,282,2 +283,283,0 +284,284,0 +285,285,0 +286,286,1 +287,287,0 +288,288,0 +289,289,0 +290,290,1 +291,291,1 +292,292,0 +293,293,3 +294,294,4 +295,295,0 +296,296,1 +297,297,7 +298,298,1 +299,299,1 +300,300,0 +301,301,0 +302,302,1 +303,303,0 +304,304,14 +305,305,3 +306,306,0 +307,307,1 +308,308,2 +309,309,0 +310,310,0 +311,311,0 +312,312,1 +313,313,0 +314,314,0 +315,315,0 +316,316,0 +317,317,1 +318,318,0 +319,319,0 +320,320,0 +321,321,0 +322,322,0 +323,323,1 +324,324,0 +325,325,0 +326,326,1 +327,327,40 +328,328,1 +329,329,2 +330,330,1 +331,331,0 +332,332,1 +333,333,0 +334,334,1 +335,335,0 +336,336,0 +337,337,3 +338,338,0 +339,339,0 +340,340,0 +341,341,0 +342,342,0 +343,343,0 +344,344,0 +345,345,0 +346,346,1 +347,347,0 +348,348,0 +349,349,0 +350,350,1 +351,351,2 +352,352,1 +353,353,1 +354,354,0 +355,355,0 +356,356,0 +357,357,0 +358,358,3 +359,359,1 +360,360,7 +361,361,4 +362,362,0 +363,363,2 +364,364,0 +365,365,0 +366,366,0 +367,367,0 +368,368,0 +369,369,1 +370,370,0 +371,371,1 +372,372,0 +373,373,3 +374,374,0 +375,375,0 +376,376,1 +377,377,1 +378,378,0 +379,379,1 
+380,380,0 +381,381,0 +382,382,3 +383,383,3 +384,384,1 +385,385,3 +386,386,0 +387,387,0 +388,388,1 +389,389,0 +390,390,0 +391,391,0 +392,392,0 +393,393,5 +394,394,3 +395,395,0 +396,396,0 +397,397,0 +398,398,0 +399,399,0 +400,400,0 +401,401,0 +402,402,0 +403,403,1 +404,404,1 +405,405,0 +406,406,2 +407,407,2 +408,408,7 +409,409,0 +410,410,0 +411,411,2 +412,412,1 +413,413,1 +414,414,1 +415,415,0 +416,416,0 +417,417,0 +418,418,0 +419,419,6 +420,420,1 +421,421,2 +422,422,0 +423,423,2 +424,424,1 +425,425,1 +426,426,0 +427,427,0 +428,428,1 +429,429,0 +430,430,0 +431,431,8 +432,432,0 +433,433,0 +434,434,1 +435,435,2 +436,436,0 +437,437,0 +438,438,2 +439,439,0 +440,440,1 +441,441,0 +442,442,0 +443,443,0 +444,444,0 +445,445,0 +446,446,0 +447,447,2 +448,448,0 +449,449,0 +450,450,0 +451,451,44 +452,452,4 +453,453,0 +454,454,0 +455,455,1 +456,456,0 +457,457,0 +458,458,2 +459,459,6 +460,460,1 +461,461,1 +462,462,0 +463,463,1 +464,464,0 +465,465,0 +466,466,0 +467,467,0 +468,468,0 +469,469,0 +470,470,0 +471,471,1 +472,472,1 +473,473,1 +474,474,0 +475,475,0 +476,476,0 +477,477,0 +478,478,1 +479,479,0 +480,480,0 +481,481,0 +482,482,1 +483,483,3 +484,484,0 +485,485,1 +486,486,0 +487,487,0 +488,488,56 +489,489,0 +490,490,3 +491,491,1 +492,492,1 +493,493,0 +494,494,1 +495,495,1 +496,496,0 +497,497,0 +498,498,2 +499,499,1 +500,500,1 +501,501,1 +502,502,0 +503,503,1 +504,504,7 +505,505,0 +506,506,0 +507,507,0 +508,508,0 +509,509,2 +510,510,3 +511,511,0 +512,512,1 +513,513,1 +514,514,0 +515,515,0 +516,516,1 +517,517,0 +518,518,0 +519,519,2 +520,520,0 +521,521,0 +522,522,0 +523,523,0 +524,524,0 +525,525,0 +526,526,7 +527,527,0 +528,528,0 +529,529,1 +530,530,0 +531,531,0 +532,532,0 +533,533,0 +534,534,0 +535,535,0 +536,536,0 +537,537,0 +538,538,0 +539,539,1 +540,540,1 +541,541,0 +542,542,1 +543,543,0 +544,544,0 +545,545,5 +546,546,3 +547,547,0 +548,548,0 +549,549,0 +550,550,0 +551,551,0 +552,552,0 +553,553,3 +554,554,1 +555,555,0 +556,556,1 +557,557,1 +558,558,0 +559,559,2 +560,560,0 +561,561,0 +562,562,1 +563,563,0 +564,564,0 +565,565,0 +566,566,0 +567,567,0 +568,568,1 +569,569,0 +570,570,0 +571,571,0 +572,572,0 +573,573,0 +574,574,1 +575,575,0 +576,576,0 +577,577,0 +578,578,0 +579,579,0 +580,580,0 +581,581,2 +582,582,1 +583,583,0 +584,584,1 +585,585,0 +586,586,0 +587,587,2 +588,588,1 +589,589,0 +590,590,9 +591,591,0 +592,592,0 +593,593,1 +594,594,0 +595,595,0 +596,596,1 +597,597,0 +598,598,1 +599,599,1 +600,600,2 +601,601,0 +602,602,3 +603,603,0 +604,604,0 +605,605,1 +606,606,0 +607,607,0 +608,608,0 +609,609,3 +610,610,0 +611,611,1 +612,612,0 +613,613,1 +614,614,0 +615,615,0 +616,616,0 +617,617,0 +618,618,0 +619,619,0 +620,620,0 +621,621,2 +622,622,0 +623,623,0 +624,624,0 +625,625,0 +626,626,1 +627,627,0 +628,628,0 +629,629,0 +630,630,0 +631,631,0 +632,632,0 +633,633,0 +634,634,0 +635,635,0 +636,636,0 +637,637,1 +638,638,1 +639,639,1 +640,640,0 +641,641,1 +642,642,0 +643,643,0 +644,644,1 +645,645,2 +646,646,1 +647,647,0 +648,648,0 +649,649,0 +650,650,0 +651,651,0 +652,652,0 +653,653,0 +654,654,0 +655,655,0 +656,656,1 +657,657,4 +658,658,0 +659,659,0 +660,660,0 +661,661,1 +662,662,0 +663,663,1 +664,664,0 +665,665,5 +666,666,1 +667,667,0 +668,668,1 +669,669,0 +670,670,1 +671,671,0 +672,672,0 +673,673,0 +674,674,0 +675,675,0 +676,676,2 +677,677,1 +678,678,0 +679,679,0 +680,680,1 +681,681,0 +682,682,5 +683,683,0 +684,684,1 +685,685,0 +686,686,1 +687,687,0 +688,688,0 +689,689,1 +690,690,0 +691,691,0 +692,692,1 +693,693,0 +694,694,0 +695,695,1 +696,696,1 +697,697,1 +698,698,2 +699,699,0 +700,700,0 +701,701,0 +702,702,6 
+703,703,1 +704,704,0 +705,705,0 +706,706,0 +707,707,0 +708,708,1 +709,709,0 +710,710,0 +711,711,0 +712,712,3 +713,713,1 +714,714,0 +715,715,0 +716,716,0 +717,717,0 +718,718,1 +719,719,14 +720,720,0 +721,721,0 +722,722,0 +723,723,0 +724,724,0 +725,725,5 +726,726,0 +727,727,1 +728,728,1 +729,729,0 +730,730,0 +731,731,0 +732,732,2 +733,733,0 +734,734,2 +735,735,1 +736,736,0 +737,737,0 +738,738,3 +739,739,1 +740,740,1 +741,741,2 +742,742,0 +743,743,0 +744,744,1 +745,745,1 +746,746,0 +747,747,1 +748,748,1 +749,749,3 +750,750,0 +751,751,0 +752,752,0 +753,753,2 +754,754,1 +755,755,0 +756,756,0 +757,757,1 +758,758,2 +759,759,0 +760,760,0 +761,761,0 +762,762,1 +763,763,1 +764,764,1 +765,765,0 +766,766,0 +767,767,1 +768,768,2 +769,769,0 +770,770,1 +771,771,1 +772,772,1 +773,773,1 +774,774,1 +775,775,1 +776,776,0 +777,777,0 +778,778,1 +779,779,1 +780,780,0 +781,781,0 +782,782,0 +783,783,0 +784,784,4 +785,785,0 +786,786,0 +787,787,0 +788,788,0 +789,789,1 +790,790,1 +791,791,0 +792,792,3 +793,793,0 +794,794,0 +795,795,0 +796,796,1 +797,797,1 +798,798,0 +799,799,0 +800,800,0 +801,801,1 +802,802,1 +803,803,1 +804,804,0 +805,805,0 +806,806,1 +807,807,0 +808,808,1 +809,809,1 +810,810,0 +811,811,1 +812,812,0 +813,813,0 +814,814,0 +815,815,0 +816,816,0 +817,817,0 +818,818,0 +819,819,0 +820,820,0 +821,821,0 +822,822,0 +823,823,0 +824,824,3 +825,825,0 +826,826,0 +827,827,2 +828,828,8 +829,829,0 +830,830,0 +831,831,0 +832,832,0 +833,833,1 +834,834,0 +835,835,0 +836,836,0 +837,837,1 +838,838,0 +839,839,0 +840,840,0 +841,841,4 +842,842,1 +843,843,0 +844,844,0 +845,845,0 +846,846,0 +847,847,0 +848,848,2 +849,849,2 +850,850,0 +851,851,0 +852,852,0 +853,853,0 +854,854,0 +855,855,2 +856,856,2 +857,857,40 +858,858,0 +859,859,0 +860,860,1 +861,861,3 +862,862,1 +863,863,0 +864,864,1 +865,865,0 +866,866,0 +867,867,0 +868,868,2 +869,869,0 +870,870,2 +871,871,0 +872,872,0 +873,873,1 +874,874,0 +875,875,1 +876,876,0 +877,877,1 +878,878,0 +879,879,0 +880,880,2 +881,881,1 +882,882,0 +883,883,0 +884,884,0 +885,885,0 +886,886,0 +887,887,0 +888,888,0 +889,889,2 +890,890,0 +891,891,0 +892,892,0 +893,893,1 +894,894,0 +895,895,1 +896,896,2 +897,897,0 +898,898,0 +899,899,1 +900,900,0 +901,901,0 +902,902,0 +903,903,0 +904,904,1 +905,905,1 +906,906,0 +907,907,0 +908,908,0 +909,909,0 +910,910,0 +911,911,1 +912,912,0 +913,913,0 +914,914,0 +915,915,1 +916,916,0 +917,917,1 +918,918,1 +919,919,2 +920,920,0 +921,921,0 +922,922,0 +923,923,0 +924,924,1 +925,925,0 +926,926,0 +927,927,0 +928,928,1 +929,929,0 +930,930,0 +931,931,10 +932,932,0 +933,933,0 +934,934,2 +935,935,1 +936,936,0 +937,937,0 +938,938,8 +939,939,1 +940,940,2 +941,941,1 +942,942,0 +943,943,1 +944,944,1 +945,945,6 +946,946,0 +947,947,0 +948,948,0 +949,949,0 +950,950,6 +951,951,0 +952,952,2 +953,953,0 +954,954,1 +955,955,0 +956,956,0 +957,957,0 +958,958,0 +959,959,1 +960,960,0 +961,961,1 +962,962,0 +963,963,0 +964,964,2 +965,965,1 +966,966,0 +967,967,0 +968,968,1 +969,969,1 +970,970,1 +971,971,0 +972,972,0 +973,973,0 +974,974,0 +975,975,0 +976,976,0 +977,977,0 +978,978,0 +979,979,1 +980,980,1 +981,981,0 +982,982,1 +983,983,0 +984,984,0 +985,985,1 +986,986,3 +987,987,1 +988,988,0 +989,989,0 +990,990,0 +991,991,1 +992,992,0 +993,993,3 +994,994,0 +995,995,0 +996,996,0 +997,997,0 +998,998,0 +999,999,2 diff --git a/axolotl/tests/data/datasets/database_dataset_3/tables/posts.csv b/axolotl/tests/data/datasets/database_dataset_3/tables/posts.csv new file mode 100644 index 0000000..0d17d9f --- /dev/null +++ b/axolotl/tests/data/datasets/database_dataset_3/tables/posts.csv @@ -0,0 
+1,1001 @@ +id,author_id,post +0,66,Post 0 +1,20,Post 1 +2,70,Post 2 +3,93,Post 3 +4,20,Post 4 +5,52,Post 5 +6,52,Post 6 +7,52,Post 7 +8,20,Post 8 +9,91,Post 9 +10,41,Post 10 +11,72,Post 11 +12,68,Post 12 +13,23,Post 13 +14,72,Post 14 +15,37,Post 15 +16,80,Post 16 +17,52,Post 17 +18,80,Post 18 +19,68,Post 19 +20,70,Post 20 +21,52,Post 21 +22,91,Post 22 +23,57,Post 23 +24,38,Post 24 +25,52,Post 25 +26,2,Post 26 +27,23,Post 27 +28,62,Post 28 +29,22,Post 29 +30,52,Post 30 +31,38,Post 31 +32,13,Post 32 +33,23,Post 33 +34,52,Post 34 +35,52,Post 35 +36,52,Post 36 +37,60,Post 37 +38,59,Post 38 +39,38,Post 39 +40,83,Post 40 +41,31,Post 41 +42,39,Post 42 +43,83,Post 43 +44,72,Post 44 +45,69,Post 45 +46,8,Post 46 +47,88,Post 47 +48,70,Post 48 +49,99,Post 49 +50,13,Post 50 +51,76,Post 51 +52,16,Post 52 +53,52,Post 53 +54,11,Post 54 +55,72,Post 55 +56,72,Post 56 +57,52,Post 57 +58,38,Post 58 +59,8,Post 59 +60,68,Post 60 +61,42,Post 61 +62,70,Post 62 +63,75,Post 63 +64,95,Post 64 +65,74,Post 65 +66,1,Post 66 +67,30,Post 67 +68,70,Post 68 +69,17,Post 69 +70,52,Post 70 +71,7,Post 71 +72,19,Post 72 +73,2,Post 73 +74,72,Post 74 +75,20,Post 75 +76,27,Post 76 +77,89,Post 77 +78,69,Post 78 +79,5,Post 79 +80,16,Post 80 +81,52,Post 81 +82,52,Post 82 +83,20,Post 83 +84,89,Post 84 +85,52,Post 85 +86,52,Post 86 +87,52,Post 87 +88,70,Post 88 +89,25,Post 89 +90,37,Post 90 +91,20,Post 91 +92,18,Post 92 +93,89,Post 93 +94,70,Post 94 +95,50,Post 95 +96,20,Post 96 +97,20,Post 97 +98,7,Post 98 +99,38,Post 99 +100,25,Post 100 +101,68,Post 101 +102,33,Post 102 +103,18,Post 103 +104,3,Post 104 +105,8,Post 105 +106,66,Post 106 +107,42,Post 107 +108,52,Post 108 +109,83,Post 109 +110,98,Post 110 +111,20,Post 111 +112,62,Post 112 +113,20,Post 113 +114,3,Post 114 +115,70,Post 115 +116,27,Post 116 +117,34,Post 117 +118,52,Post 118 +119,72,Post 119 +120,54,Post 120 +121,77,Post 121 +122,20,Post 122 +123,72,Post 123 +124,18,Post 124 +125,91,Post 125 +126,66,Post 126 +127,20,Post 127 +128,89,Post 128 +129,70,Post 129 +130,20,Post 130 +131,20,Post 131 +132,52,Post 132 +133,4,Post 133 +134,20,Post 134 +135,38,Post 135 +136,32,Post 136 +137,44,Post 137 +138,21,Post 138 +139,52,Post 139 +140,75,Post 140 +141,10,Post 141 +142,52,Post 142 +143,13,Post 143 +144,70,Post 144 +145,36,Post 145 +146,52,Post 146 +147,18,Post 147 +148,13,Post 148 +149,49,Post 149 +150,29,Post 150 +151,89,Post 151 +152,70,Post 152 +153,70,Post 153 +154,84,Post 154 +155,8,Post 155 +156,52,Post 156 +157,52,Post 157 +158,93,Post 158 +159,22,Post 159 +160,20,Post 160 +161,8,Post 161 +162,2,Post 162 +163,89,Post 163 +164,64,Post 164 +165,72,Post 165 +166,21,Post 166 +167,52,Post 167 +168,8,Post 168 +169,49,Post 169 +170,96,Post 170 +171,78,Post 171 +172,27,Post 172 +173,93,Post 173 +174,20,Post 174 +175,90,Post 175 +176,89,Post 176 +177,72,Post 177 +178,54,Post 178 +179,78,Post 179 +180,23,Post 180 +181,72,Post 181 +182,52,Post 182 +183,1,Post 183 +184,27,Post 184 +185,13,Post 185 +186,96,Post 186 +187,47,Post 187 +188,51,Post 188 +189,56,Post 189 +190,31,Post 190 +191,13,Post 191 +192,72,Post 192 +193,19,Post 193 +194,52,Post 194 +195,20,Post 195 +196,8,Post 196 +197,74,Post 197 +198,94,Post 198 +199,92,Post 199 +200,85,Post 200 +201,70,Post 201 +202,27,Post 202 +203,8,Post 203 +204,38,Post 204 +205,20,Post 205 +206,13,Post 206 +207,7,Post 207 +208,70,Post 208 +209,1,Post 209 +210,70,Post 210 +211,13,Post 211 +212,8,Post 212 +213,8,Post 213 +214,65,Post 214 +215,20,Post 215 +216,38,Post 216 +217,52,Post 217 +218,74,Post 218 +219,70,Post 219 +220,20,Post 220 
+221,13,Post 221 +222,7,Post 222 +223,23,Post 223 +224,20,Post 224 +225,42,Post 225 +226,66,Post 226 +227,68,Post 227 +228,21,Post 228 +229,33,Post 229 +230,18,Post 230 +231,72,Post 231 +232,7,Post 232 +233,68,Post 233 +234,71,Post 234 +235,71,Post 235 +236,20,Post 236 +237,32,Post 237 +238,52,Post 238 +239,20,Post 239 +240,31,Post 240 +241,19,Post 241 +242,43,Post 242 +243,7,Post 243 +244,72,Post 244 +245,8,Post 245 +246,52,Post 246 +247,25,Post 247 +248,52,Post 248 +249,92,Post 249 +250,58,Post 250 +251,5,Post 251 +252,38,Post 252 +253,52,Post 253 +254,52,Post 254 +255,66,Post 255 +256,21,Post 256 +257,12,Post 257 +258,36,Post 258 +259,91,Post 259 +260,18,Post 260 +261,85,Post 261 +262,52,Post 262 +263,42,Post 263 +264,80,Post 264 +265,43,Post 265 +266,70,Post 266 +267,37,Post 267 +268,85,Post 268 +269,67,Post 269 +270,68,Post 270 +271,27,Post 271 +272,70,Post 272 +273,56,Post 273 +274,20,Post 274 +275,15,Post 275 +276,72,Post 276 +277,92,Post 277 +278,43,Post 278 +279,52,Post 279 +280,74,Post 280 +281,42,Post 281 +282,91,Post 282 +283,52,Post 283 +284,72,Post 284 +285,86,Post 285 +286,72,Post 286 +287,15,Post 287 +288,54,Post 288 +289,37,Post 289 +290,8,Post 290 +291,38,Post 291 +292,20,Post 292 +293,72,Post 293 +294,5,Post 294 +295,92,Post 295 +296,29,Post 296 +297,29,Post 297 +298,2,Post 298 +299,18,Post 299 +300,37,Post 300 +301,89,Post 301 +302,8,Post 302 +303,89,Post 303 +304,76,Post 304 +305,42,Post 305 +306,27,Post 306 +307,20,Post 307 +308,52,Post 308 +309,5,Post 309 +310,2,Post 310 +311,38,Post 311 +312,8,Post 312 +313,20,Post 313 +314,20,Post 314 +315,20,Post 315 +316,13,Post 316 +317,1,Post 317 +318,10,Post 318 +319,52,Post 319 +320,95,Post 320 +321,98,Post 321 +322,38,Post 322 +323,16,Post 323 +324,56,Post 324 +325,50,Post 325 +326,98,Post 326 +327,8,Post 327 +328,72,Post 328 +329,22,Post 329 +330,20,Post 330 +331,62,Post 331 +332,20,Post 332 +333,63,Post 333 +334,52,Post 334 +335,38,Post 335 +336,52,Post 336 +337,21,Post 337 +338,69,Post 338 +339,38,Post 339 +340,30,Post 340 +341,72,Post 341 +342,89,Post 342 +343,7,Post 343 +344,20,Post 344 +345,28,Post 345 +346,72,Post 346 +347,98,Post 347 +348,93,Post 348 +349,85,Post 349 +350,23,Post 350 +351,98,Post 351 +352,20,Post 352 +353,9,Post 353 +354,90,Post 354 +355,20,Post 355 +356,67,Post 356 +357,7,Post 357 +358,70,Post 358 +359,80,Post 359 +360,20,Post 360 +361,33,Post 361 +362,32,Post 362 +363,70,Post 363 +364,20,Post 364 +365,17,Post 365 +366,41,Post 366 +367,24,Post 367 +368,72,Post 368 +369,20,Post 369 +370,52,Post 370 +371,89,Post 371 +372,55,Post 372 +373,76,Post 373 +374,89,Post 374 +375,70,Post 375 +376,68,Post 376 +377,93,Post 377 +378,98,Post 378 +379,42,Post 379 +380,8,Post 380 +381,22,Post 381 +382,13,Post 382 +383,38,Post 383 +384,13,Post 384 +385,52,Post 385 +386,34,Post 386 +387,83,Post 387 +388,93,Post 388 +389,52,Post 389 +390,20,Post 390 +391,52,Post 391 +392,83,Post 392 +393,38,Post 393 +394,52,Post 394 +395,20,Post 395 +396,42,Post 396 +397,37,Post 397 +398,20,Post 398 +399,52,Post 399 +400,25,Post 400 +401,32,Post 401 +402,52,Post 402 +403,70,Post 403 +404,27,Post 404 +405,89,Post 405 +406,74,Post 406 +407,7,Post 407 +408,20,Post 408 +409,41,Post 409 +410,8,Post 410 +411,28,Post 411 +412,70,Post 412 +413,66,Post 413 +414,52,Post 414 +415,70,Post 415 +416,20,Post 416 +417,27,Post 417 +418,66,Post 418 +419,79,Post 419 +420,52,Post 420 +421,21,Post 421 +422,5,Post 422 +423,70,Post 423 +424,1,Post 424 +425,32,Post 425 +426,52,Post 426 +427,89,Post 427 +428,8,Post 428 +429,38,Post 429 +430,3,Post 430 
+431,27,Post 431 +432,52,Post 432 +433,21,Post 433 +434,20,Post 434 +435,10,Post 435 +436,52,Post 436 +437,68,Post 437 +438,65,Post 438 +439,90,Post 439 +440,0,Post 440 +441,58,Post 441 +442,52,Post 442 +443,52,Post 443 +444,93,Post 444 +445,2,Post 445 +446,68,Post 446 +447,72,Post 447 +448,52,Post 448 +449,27,Post 449 +450,72,Post 450 +451,8,Post 451 +452,39,Post 452 +453,52,Post 453 +454,68,Post 454 +455,8,Post 455 +456,20,Post 456 +457,38,Post 457 +458,52,Post 458 +459,81,Post 459 +460,52,Post 460 +461,13,Post 461 +462,96,Post 462 +463,77,Post 463 +464,52,Post 464 +465,89,Post 465 +466,52,Post 466 +467,89,Post 467 +468,72,Post 468 +469,93,Post 469 +470,88,Post 470 +471,6,Post 471 +472,17,Post 472 +473,35,Post 473 +474,91,Post 474 +475,23,Post 475 +476,15,Post 476 +477,81,Post 477 +478,41,Post 478 +479,86,Post 479 +480,15,Post 480 +481,62,Post 481 +482,39,Post 482 +483,8,Post 483 +484,68,Post 484 +485,20,Post 485 +486,6,Post 486 +487,8,Post 487 +488,8,Post 488 +489,86,Post 489 +490,70,Post 490 +491,83,Post 491 +492,65,Post 492 +493,52,Post 493 +494,24,Post 494 +495,99,Post 495 +496,31,Post 496 +497,45,Post 497 +498,33,Post 498 +499,96,Post 499 +500,17,Post 500 +501,27,Post 501 +502,66,Post 502 +503,8,Post 503 +504,52,Post 504 +505,46,Post 505 +506,21,Post 506 +507,20,Post 507 +508,52,Post 508 +509,31,Post 509 +510,42,Post 510 +511,27,Post 511 +512,94,Post 512 +513,13,Post 513 +514,8,Post 514 +515,27,Post 515 +516,52,Post 516 +517,62,Post 517 +518,37,Post 518 +519,99,Post 519 +520,28,Post 520 +521,70,Post 521 +522,56,Post 522 +523,72,Post 523 +524,95,Post 524 +525,82,Post 525 +526,70,Post 526 +527,68,Post 527 +528,27,Post 528 +529,13,Post 529 +530,8,Post 530 +531,20,Post 531 +532,38,Post 532 +533,52,Post 533 +534,70,Post 534 +535,92,Post 535 +536,10,Post 536 +537,9,Post 537 +538,52,Post 538 +539,70,Post 539 +540,72,Post 540 +541,89,Post 541 +542,97,Post 542 +543,37,Post 543 +544,33,Post 544 +545,13,Post 545 +546,66,Post 546 +547,61,Post 547 +548,74,Post 548 +549,8,Post 549 +550,51,Post 550 +551,52,Post 551 +552,20,Post 552 +553,17,Post 553 +554,74,Post 554 +555,8,Post 555 +556,45,Post 556 +557,10,Post 557 +558,42,Post 558 +559,96,Post 559 +560,38,Post 560 +561,74,Post 561 +562,10,Post 562 +563,20,Post 563 +564,38,Post 564 +565,37,Post 565 +566,64,Post 566 +567,27,Post 567 +568,70,Post 568 +569,56,Post 569 +570,37,Post 570 +571,38,Post 571 +572,52,Post 572 +573,8,Post 573 +574,72,Post 574 +575,60,Post 575 +576,70,Post 576 +577,52,Post 577 +578,10,Post 578 +579,38,Post 579 +580,38,Post 580 +581,27,Post 581 +582,5,Post 582 +583,70,Post 583 +584,10,Post 584 +585,52,Post 585 +586,68,Post 586 +587,56,Post 587 +588,92,Post 588 +589,8,Post 589 +590,76,Post 590 +591,5,Post 591 +592,52,Post 592 +593,38,Post 593 +594,52,Post 594 +595,31,Post 595 +596,19,Post 596 +597,2,Post 597 +598,52,Post 598 +599,72,Post 599 +600,32,Post 600 +601,20,Post 601 +602,8,Post 602 +603,8,Post 603 +604,20,Post 604 +605,8,Post 605 +606,20,Post 606 +607,8,Post 607 +608,8,Post 608 +609,74,Post 609 +610,15,Post 610 +611,52,Post 611 +612,70,Post 612 +613,42,Post 613 +614,13,Post 614 +615,19,Post 615 +616,38,Post 616 +617,52,Post 617 +618,28,Post 618 +619,72,Post 619 +620,70,Post 620 +621,89,Post 621 +622,4,Post 622 +623,83,Post 623 +624,36,Post 624 +625,79,Post 625 +626,67,Post 626 +627,98,Post 627 +628,70,Post 628 +629,31,Post 629 +630,52,Post 630 +631,33,Post 631 +632,31,Post 632 +633,20,Post 633 +634,51,Post 634 +635,66,Post 635 +636,20,Post 636 +637,52,Post 637 +638,10,Post 638 +639,15,Post 639 +640,7,Post 640 
+641,94,Post 641 +642,0,Post 642 +643,18,Post 643 +644,52,Post 644 +645,8,Post 645 +646,80,Post 646 +647,70,Post 647 +648,93,Post 648 +649,52,Post 649 +650,23,Post 650 +651,52,Post 651 +652,89,Post 652 +653,52,Post 653 +654,20,Post 654 +655,79,Post 655 +656,32,Post 656 +657,0,Post 657 +658,20,Post 658 +659,27,Post 659 +660,74,Post 660 +661,43,Post 661 +662,40,Post 662 +663,27,Post 663 +664,80,Post 664 +665,89,Post 665 +666,98,Post 666 +667,33,Post 667 +668,93,Post 668 +669,72,Post 669 +670,65,Post 670 +671,20,Post 671 +672,20,Post 672 +673,17,Post 673 +674,89,Post 674 +675,23,Post 675 +676,42,Post 676 +677,50,Post 677 +678,71,Post 678 +679,72,Post 679 +680,13,Post 680 +681,38,Post 681 +682,72,Post 682 +683,72,Post 683 +684,8,Post 684 +685,14,Post 685 +686,24,Post 686 +687,8,Post 687 +688,38,Post 688 +689,9,Post 689 +690,52,Post 690 +691,20,Post 691 +692,52,Post 692 +693,10,Post 693 +694,95,Post 694 +695,89,Post 695 +696,36,Post 696 +697,20,Post 697 +698,20,Post 698 +699,48,Post 699 +700,6,Post 700 +701,56,Post 701 +702,38,Post 702 +703,33,Post 703 +704,72,Post 704 +705,70,Post 705 +706,91,Post 706 +707,28,Post 707 +708,83,Post 708 +709,70,Post 709 +710,29,Post 710 +711,52,Post 711 +712,22,Post 712 +713,78,Post 713 +714,10,Post 714 +715,20,Post 715 +716,18,Post 716 +717,38,Post 717 +718,70,Post 718 +719,52,Post 719 +720,49,Post 720 +721,0,Post 721 +722,38,Post 722 +723,8,Post 723 +724,20,Post 724 +725,89,Post 725 +726,20,Post 726 +727,74,Post 727 +728,72,Post 728 +729,14,Post 729 +730,52,Post 730 +731,10,Post 731 +732,70,Post 732 +733,56,Post 733 +734,72,Post 734 +735,47,Post 735 +736,87,Post 736 +737,7,Post 737 +738,22,Post 738 +739,70,Post 739 +740,38,Post 740 +741,17,Post 741 +742,9,Post 742 +743,72,Post 743 +744,45,Post 744 +745,80,Post 745 +746,70,Post 746 +747,38,Post 747 +748,32,Post 748 +749,52,Post 749 +750,82,Post 750 +751,70,Post 751 +752,0,Post 752 +753,68,Post 753 +754,88,Post 754 +755,70,Post 755 +756,17,Post 756 +757,48,Post 757 +758,13,Post 758 +759,30,Post 759 +760,89,Post 760 +761,89,Post 761 +762,21,Post 762 +763,27,Post 763 +764,52,Post 764 +765,93,Post 765 +766,13,Post 766 +767,20,Post 767 +768,78,Post 768 +769,50,Post 769 +770,84,Post 770 +771,18,Post 771 +772,52,Post 772 +773,27,Post 773 +774,27,Post 774 +775,41,Post 775 +776,38,Post 776 +777,29,Post 777 +778,87,Post 778 +779,70,Post 779 +780,70,Post 780 +781,22,Post 781 +782,52,Post 782 +783,71,Post 783 +784,72,Post 784 +785,27,Post 785 +786,70,Post 786 +787,70,Post 787 +788,13,Post 788 +789,75,Post 789 +790,39,Post 790 +791,49,Post 791 +792,41,Post 792 +793,52,Post 793 +794,52,Post 794 +795,51,Post 795 +796,76,Post 796 +797,53,Post 797 +798,37,Post 798 +799,38,Post 799 +800,72,Post 800 +801,27,Post 801 +802,20,Post 802 +803,8,Post 803 +804,78,Post 804 +805,88,Post 805 +806,10,Post 806 +807,27,Post 807 +808,17,Post 808 +809,10,Post 809 +810,84,Post 810 +811,7,Post 811 +812,96,Post 812 +813,8,Post 813 +814,74,Post 814 +815,52,Post 815 +816,31,Post 816 +817,27,Post 817 +818,70,Post 818 +819,26,Post 819 +820,61,Post 820 +821,52,Post 821 +822,48,Post 822 +823,84,Post 823 +824,52,Post 824 +825,72,Post 825 +826,70,Post 826 +827,6,Post 827 +828,70,Post 828 +829,20,Post 829 +830,84,Post 830 +831,7,Post 831 +832,27,Post 832 +833,8,Post 833 +834,46,Post 834 +835,72,Post 835 +836,23,Post 836 +837,13,Post 837 +838,27,Post 838 +839,72,Post 839 +840,13,Post 840 +841,20,Post 841 +842,8,Post 842 +843,69,Post 843 +844,36,Post 844 +845,25,Post 845 +846,70,Post 846 +847,27,Post 847 +848,70,Post 848 +849,72,Post 849 +850,20,Post 850 
+851,95,Post 851 +852,16,Post 852 +853,22,Post 853 +854,18,Post 854 +855,27,Post 855 +856,47,Post 856 +857,52,Post 857 +858,73,Post 858 +859,82,Post 859 +860,20,Post 860 +861,52,Post 861 +862,10,Post 862 +863,43,Post 863 +864,27,Post 864 +865,27,Post 865 +866,48,Post 866 +867,70,Post 867 +868,8,Post 868 +869,79,Post 869 +870,70,Post 870 +871,17,Post 871 +872,89,Post 872 +873,52,Post 873 +874,99,Post 874 +875,19,Post 875 +876,52,Post 876 +877,22,Post 877 +878,24,Post 878 +879,52,Post 879 +880,89,Post 880 +881,72,Post 881 +882,70,Post 882 +883,52,Post 883 +884,89,Post 884 +885,50,Post 885 +886,78,Post 886 +887,72,Post 887 +888,20,Post 888 +889,70,Post 889 +890,1,Post 890 +891,27,Post 891 +892,20,Post 892 +893,52,Post 893 +894,70,Post 894 +895,8,Post 895 +896,52,Post 896 +897,89,Post 897 +898,20,Post 898 +899,66,Post 899 +900,52,Post 900 +901,1,Post 901 +902,46,Post 902 +903,70,Post 903 +904,7,Post 904 +905,79,Post 905 +906,52,Post 906 +907,5,Post 907 +908,20,Post 908 +909,91,Post 909 +910,52,Post 910 +911,9,Post 911 +912,21,Post 912 +913,42,Post 913 +914,3,Post 914 +915,38,Post 915 +916,50,Post 916 +917,20,Post 917 +918,52,Post 918 +919,70,Post 919 +920,20,Post 920 +921,52,Post 921 +922,56,Post 922 +923,90,Post 923 +924,71,Post 924 +925,72,Post 925 +926,50,Post 926 +927,18,Post 927 +928,98,Post 928 +929,12,Post 929 +930,45,Post 930 +931,8,Post 931 +932,89,Post 932 +933,93,Post 933 +934,70,Post 934 +935,28,Post 935 +936,20,Post 936 +937,20,Post 937 +938,12,Post 938 +939,52,Post 939 +940,13,Post 940 +941,27,Post 941 +942,53,Post 942 +943,70,Post 943 +944,3,Post 944 +945,38,Post 945 +946,59,Post 946 +947,73,Post 947 +948,46,Post 948 +949,93,Post 949 +950,20,Post 950 +951,2,Post 951 +952,48,Post 952 +953,20,Post 953 +954,72,Post 954 +955,20,Post 955 +956,25,Post 956 +957,72,Post 957 +958,70,Post 958 +959,52,Post 959 +960,69,Post 960 +961,38,Post 961 +962,0,Post 962 +963,1,Post 963 +964,52,Post 964 +965,8,Post 965 +966,7,Post 966 +967,93,Post 967 +968,74,Post 968 +969,13,Post 969 +970,0,Post 970 +971,89,Post 971 +972,21,Post 972 +973,18,Post 973 +974,68,Post 974 +975,9,Post 975 +976,20,Post 976 +977,95,Post 977 +978,56,Post 978 +979,52,Post 979 +980,37,Post 980 +981,70,Post 981 +982,13,Post 982 +983,93,Post 983 +984,74,Post 984 +985,52,Post 985 +986,7,Post 986 +987,68,Post 987 +988,87,Post 988 +989,52,Post 989 +990,29,Post 990 +991,11,Post 991 +992,70,Post 992 +993,81,Post 993 +994,8,Post 994 +995,70,Post 995 +996,2,Post 996 +997,93,Post 997 +998,52,Post 998 +999,52,Post 999 diff --git a/axolotl/tests/data/datasets/database_dataset_3/tables/users.csv b/axolotl/tests/data/datasets/database_dataset_3/tables/users.csv new file mode 100644 index 0000000..01f98db --- /dev/null +++ b/axolotl/tests/data/datasets/database_dataset_3/tables/users.csv @@ -0,0 +1,101 @@ +id,name +0,User 0 +1,User 1 +2,User 2 +3,User 3 +4,User 4 +5,User 5 +6,User 6 +7,User 7 +8,User 8 +9,User 9 +10,User 10 +11,User 11 +12,User 12 +13,User 13 +14,User 14 +15,User 15 +16,User 16 +17,User 17 +18,User 18 +19,User 19 +20,User 20 +21,User 21 +22,User 22 +23,User 23 +24,User 24 +25,User 25 +26,User 26 +27,User 27 +28,User 28 +29,User 29 +30,User 30 +31,User 31 +32,User 32 +33,User 33 +34,User 34 +35,User 35 +36,User 36 +37,User 37 +38,User 38 +39,User 39 +40,User 40 +41,User 41 +42,User 42 +43,User 43 +44,User 44 +45,User 45 +46,User 46 +47,User 47 +48,User 48 +49,User 49 +50,User 50 +51,User 51 +52,User 52 +53,User 53 +54,User 54 +55,User 55 +56,User 56 +57,User 57 +58,User 58 +59,User 59 +60,User 60 +61,User 61 +62,User 62 
+63,User 63 +64,User 64 +65,User 65 +66,User 66 +67,User 67 +68,User 68 +69,User 69 +70,User 70 +71,User 71 +72,User 72 +73,User 73 +74,User 74 +75,User 75 +76,User 76 +77,User 77 +78,User 78 +79,User 79 +80,User 80 +81,User 81 +82,User 82 +83,User 83 +84,User 84 +85,User 85 +86,User 86 +87,User 87 +88,User 88 +89,User 89 +90,User 90 +91,User 91 +92,User 92 +93,User 93 +94,User 94 +95,User 95 +96,User 96 +97,User 97 +98,User 98 +99,User 99 diff --git a/axolotl/tests/data/datasets/database_dataset_4/datasetDoc.json b/axolotl/tests/data/datasets/database_dataset_4/datasetDoc.json new file mode 100644 index 0000000..6a3e72f --- /dev/null +++ b/axolotl/tests/data/datasets/database_dataset_4/datasetDoc.json @@ -0,0 +1,202 @@ +{ + "about": { + "datasetSchemaVersion": "4.0.0", + "datasetID": "database_dataset_4", + "datasetName": "Database dataset of type HAS_USER_MADE_COMMENT_ON_POST", + "description": "Database dataset of type HAS_USER_MADE_COMMENT_ON_POST, size 100, random seed 0", + "digest": "61fe05fb19ff803c67eedf2fdbb131e3124fadf77abdd650d34ab6068b85c35f", + "datasetVersion": "4.0.0" + }, + "dataResources": [ + { + "resID": "users", + "isCollection": false, + "columnsCount": 2, + "resFormat": { + "text/csv": [ + "csv" + ] + }, + "resType": "table", + "resPath": "tables/users.csv", + "columns": [ + { + "colIndex": 0, + "colName": "id", + "role": [ + "index" + ], + "colType": "integer" + }, + { + "colIndex": 1, + "colName": "name", + "role": [ + "attribute" + ], + "colType": "string" + } + ] + }, + { + "resID": "posts", + "isCollection": false, + "columnsCount": 3, + "resFormat": { + "text/csv": [ + "csv" + ] + }, + "resType": "table", + "resPath": "tables/posts.csv", + "columns": [ + { + "colIndex": 0, + "colName": "id", + "role": [ + "index" + ], + "colType": "integer" + }, + { + "colIndex": 1, + "colName": "author_id", + "role": [ + "attribute" + ], + "colType": "integer", + "refersTo": { + "resID": "users", + "resObject": { + "columnIndex": 0 + } + } + }, + { + "colIndex": 2, + "colName": "post", + "role": [ + "attribute" + ], + "colType": "string" + } + ] + }, + { + "resID": "comments", + "isCollection": false, + "columnsCount": 4, + "resFormat": { + "text/csv": [ + "csv" + ] + }, + "resType": "table", + "resPath": "tables/comments.csv", + "columns": [ + { + "colIndex": 0, + "colName": "id", + "role": [ + "index" + ], + "colType": "integer" + }, + { + "colIndex": 1, + "colName": "post_id", + "role": [ + "attribute" + ], + "colType": "integer", + "refersTo": { + "resID": "posts", + "resObject": { + "columnIndex": 0 + } + } + }, + { + "colIndex": 2, + "colName": "author_id", + "role": [ + "attribute" + ], + "colType": "integer", + "refersTo": { + "resID": "users", + "resObject": { + "columnIndex": 0 + } + } + }, + { + "colIndex": 3, + "colName": "comment", + "role": [ + "attribute" + ], + "colType": "string" + } + ] + }, + { + "resID": "learningData", + "isCollection": false, + "columnsCount": 4, + "resFormat": { + "text/csv": [ + "csv" + ] + }, + "resType": "table", + "resPath": "tables/learningData.csv", + "columns": [ + { + "colIndex": 0, + "colName": "d3mIndex", + "role": [ + "index" + ], + "colType": "integer" + }, + { + "colIndex": 1, + "colName": "user_id", + "role": [ + "attribute" + ], + "colType": "integer", + "refersTo": { + "resID": "users", + "resObject": { + "columnIndex": 0 + } + } + }, + { + "colIndex": 2, + "colName": "post_id", + "role": [ + "attribute" + ], + "colType": "integer", + "refersTo": { + "resID": "posts", + "resObject": { + "columnIndex": 0 + } + } + }, + { + 
"colIndex": 3, + "colName": "made_comment", + "role": [ + "suggestedTarget" + ], + "colType": "boolean" + } + ] + } + ] +} \ No newline at end of file diff --git a/axolotl/tests/data/datasets/database_dataset_4/tables/comments.csv b/axolotl/tests/data/datasets/database_dataset_4/tables/comments.csv new file mode 100644 index 0000000..ab57bf3 --- /dev/null +++ b/axolotl/tests/data/datasets/database_dataset_4/tables/comments.csv @@ -0,0 +1,1001 @@ +id,post_id,author_id,comment +0,198,74,Comment 0 +1,383,3,Comment 1 +2,490,59,Comment 2 +3,471,40,Comment 3 +4,952,3,Comment 4 +5,581,3,Comment 5 +6,680,3,Comment 6 +7,945,45,Comment 7 +8,361,33,Comment 8 +9,327,3,Comment 9 +10,25,8,Comment 10 +11,165,41,Comment 11 +12,205,3,Comment 12 +13,164,8,Comment 13 +14,698,5,Comment 14 +15,455,23,Comment 15 +16,556,20,Comment 16 +17,784,3,Comment 17 +18,198,58,Comment 18 +19,299,3,Comment 19 +20,621,3,Comment 20 +21,337,6,Comment 21 +22,25,83,Comment 22 +23,393,3,Comment 23 +24,857,3,Comment 24 +25,360,93,Comment 25 +26,304,3,Comment 26 +27,985,3,Comment 27 +28,526,49,Comment 28 +29,327,3,Comment 29 +30,74,3,Comment 30 +31,728,3,Comment 31 +32,621,59,Comment 32 +33,870,87,Comment 33 +34,198,3,Comment 34 +35,91,3,Comment 35 +36,657,95,Comment 36 +37,185,36,Comment 37 +38,154,26,Comment 38 +39,297,65,Comment 39 +40,772,3,Comment 40 +41,459,3,Comment 41 +42,25,96,Comment 42 +43,421,67,Comment 43 +44,588,54,Comment 44 +45,458,3,Comment 45 +46,488,11,Comment 46 +47,198,93,Comment 47 +48,828,93,Comment 48 +49,488,1,Comment 49 +50,637,56,Comment 50 +51,968,1,Comment 51 +52,385,96,Comment 52 +53,857,5,Comment 53 +54,4,96,Comment 54 +55,25,3,Comment 55 +56,488,3,Comment 56 +57,293,3,Comment 57 +58,217,50,Comment 58 +59,232,3,Comment 59 +60,297,73,Comment 60 +61,663,35,Comment 61 +62,488,96,Comment 62 +63,732,97,Comment 63 +64,796,3,Comment 64 +65,824,95,Comment 65 +66,361,3,Comment 66 +67,373,3,Comment 67 +68,880,93,Comment 68 +69,545,96,Comment 69 +70,46,3,Comment 70 +71,461,8,Comment 71 +72,327,14,Comment 72 +73,982,72,Comment 73 +74,15,38,Comment 74 +75,494,59,Comment 75 +76,657,13,Comment 76 +77,251,38,Comment 77 +78,950,51,Comment 78 +79,842,52,Comment 79 +80,862,0,Comment 80 +81,22,3,Comment 81 +82,488,3,Comment 82 +83,265,3,Comment 83 +84,828,3,Comment 84 +85,510,3,Comment 85 +86,459,90,Comment 86 +87,91,31,Comment 87 +88,459,47,Comment 88 +89,509,81,Comment 89 +90,934,8,Comment 90 +91,488,0,Comment 91 +92,945,3,Comment 92 +93,938,65,Comment 93 +94,526,84,Comment 94 +95,451,3,Comment 95 +96,424,3,Comment 96 +97,857,92,Comment 97 +98,25,3,Comment 98 +99,90,39,Comment 99 +100,75,3,Comment 100 +101,702,3,Comment 101 +102,308,17,Comment 102 +103,519,91,Comment 103 +104,488,38,Comment 104 +105,327,3,Comment 105 +106,451,45,Comment 106 +107,526,59,Comment 107 +108,911,3,Comment 108 +109,488,28,Comment 109 +110,498,45,Comment 110 +111,26,3,Comment 111 +112,931,96,Comment 112 +113,451,8,Comment 113 +114,749,96,Comment 114 +115,25,3,Comment 115 +116,373,13,Comment 116 +117,219,3,Comment 117 +118,224,1,Comment 118 +119,735,3,Comment 119 +120,504,3,Comment 120 +121,394,8,Comment 121 +122,5,81,Comment 122 +123,943,38,Comment 123 +124,938,8,Comment 124 +125,198,97,Comment 125 +126,25,38,Comment 126 +127,451,3,Comment 127 +128,931,45,Comment 128 +129,857,15,Comment 129 +130,488,79,Comment 130 +131,275,83,Comment 131 +132,304,3,Comment 132 +133,857,30,Comment 133 +134,451,3,Comment 134 +135,120,38,Comment 135 +136,217,98,Comment 136 +137,232,67,Comment 137 +138,106,3,Comment 138 +139,420,3,Comment 139 +140,864,96,Comment 
140 +141,557,3,Comment 141 +142,32,3,Comment 142 +143,4,3,Comment 143 +144,232,13,Comment 144 +145,327,95,Comment 145 +146,719,96,Comment 146 +147,945,3,Comment 147 +148,329,3,Comment 148 +149,590,18,Comment 149 +150,991,1,Comment 150 +151,682,3,Comment 151 +152,516,93,Comment 152 +153,39,3,Comment 153 +154,297,3,Comment 154 +155,861,6,Comment 155 +156,185,50,Comment 156 +157,824,3,Comment 157 +158,600,3,Comment 158 +159,327,3,Comment 159 +160,451,76,Comment 160 +161,463,3,Comment 161 +162,638,30,Comment 162 +163,451,99,Comment 163 +164,120,49,Comment 164 +165,719,3,Comment 165 +166,358,59,Comment 166 +167,938,95,Comment 167 +168,242,3,Comment 168 +169,219,47,Comment 169 +170,304,3,Comment 170 +171,488,83,Comment 171 +172,857,3,Comment 172 +173,154,3,Comment 173 +174,232,23,Comment 174 +175,488,3,Comment 175 +176,411,38,Comment 176 +177,406,27,Comment 177 +178,784,48,Comment 178 +179,875,3,Comment 179 +180,438,24,Comment 180 +181,590,64,Comment 181 +182,749,64,Comment 182 +183,451,47,Comment 183 +184,264,23,Comment 184 +185,519,44,Comment 185 +186,488,3,Comment 186 +187,857,10,Comment 187 +188,25,72,Comment 188 +189,857,37,Comment 189 +190,452,93,Comment 190 +191,18,3,Comment 191 +192,323,3,Comment 192 +193,657,38,Comment 193 +194,451,3,Comment 194 +195,964,59,Comment 195 +196,377,3,Comment 196 +197,302,22,Comment 197 +198,784,64,Comment 198 +199,478,59,Comment 199 +200,584,15,Comment 200 +201,758,96,Comment 201 +202,562,16,Comment 202 +203,376,3,Comment 203 +204,109,3,Comment 204 +205,488,53,Comment 205 +206,857,8,Comment 206 +207,369,6,Comment 207 +208,857,3,Comment 208 +209,451,3,Comment 209 +210,504,72,Comment 210 +211,801,60,Comment 211 +212,488,54,Comment 212 +213,65,3,Comment 213 +214,965,3,Comment 214 +215,217,3,Comment 215 +216,626,11,Comment 216 +217,451,87,Comment 217 +218,435,3,Comment 218 +219,216,3,Comment 219 +220,656,32,Comment 220 +221,89,88,Comment 221 +222,986,52,Comment 222 +223,827,71,Comment 223 +224,452,3,Comment 224 +225,382,3,Comment 225 +226,244,23,Comment 226 +227,451,45,Comment 227 +228,857,95,Comment 228 +229,232,3,Comment 229 +230,451,3,Comment 230 +231,200,20,Comment 231 +232,304,45,Comment 232 +233,166,31,Comment 233 +234,986,48,Comment 234 +235,488,3,Comment 235 +236,665,72,Comment 236 +237,74,3,Comment 237 +238,327,3,Comment 238 +239,857,59,Comment 239 +240,55,3,Comment 240 +241,143,44,Comment 241 +242,504,3,Comment 242 +243,504,3,Comment 243 +244,808,3,Comment 244 +245,896,81,Comment 245 +246,122,8,Comment 246 +247,219,3,Comment 247 +248,383,3,Comment 248 +249,1,1,Comment 249 +250,27,57,Comment 250 +251,198,3,Comment 251 +252,438,37,Comment 252 +253,219,3,Comment 253 +254,857,67,Comment 254 +255,298,3,Comment 255 +256,877,1,Comment 256 +257,952,3,Comment 257 +258,408,90,Comment 258 +259,198,3,Comment 259 +260,258,18,Comment 260 +261,719,3,Comment 261 +262,230,3,Comment 262 +263,327,92,Comment 263 +264,435,59,Comment 264 +265,488,58,Comment 265 +266,993,20,Comment 266 +267,198,66,Comment 267 +268,327,8,Comment 268 +269,219,8,Comment 269 +270,327,14,Comment 270 +271,74,3,Comment 271 +272,327,92,Comment 272 +273,857,3,Comment 273 +274,268,3,Comment 274 +275,327,3,Comment 275 +276,919,65,Comment 276 +277,661,59,Comment 277 +278,451,8,Comment 278 +279,719,3,Comment 279 +280,105,96,Comment 280 +281,421,3,Comment 281 +282,101,70,Comment 282 +283,198,59,Comment 283 +284,121,93,Comment 284 +285,327,59,Comment 285 +286,327,3,Comment 286 +287,928,3,Comment 287 +288,219,3,Comment 288 +289,431,3,Comment 289 +290,767,48,Comment 290 +291,770,66,Comment 291 
+292,692,38,Comment 292 +293,248,3,Comment 293 +294,451,3,Comment 294 +295,165,33,Comment 295 +296,165,8,Comment 296 +297,394,3,Comment 297 +298,677,3,Comment 298 +299,451,23,Comment 299 +300,857,38,Comment 300 +301,797,11,Comment 301 +302,25,96,Comment 302 +303,488,3,Comment 303 +304,488,41,Comment 304 +305,938,86,Comment 305 +306,25,3,Comment 306 +307,451,3,Comment 307 +308,185,58,Comment 308 +309,25,13,Comment 309 +310,488,98,Comment 310 +311,719,35,Comment 311 +312,719,92,Comment 312 +313,25,2,Comment 313 +314,359,3,Comment 314 +315,25,45,Comment 315 +316,217,5,Comment 316 +317,172,67,Comment 317 +318,198,8,Comment 318 +319,307,47,Comment 319 +320,232,31,Comment 320 +321,938,3,Comment 321 +322,327,48,Comment 322 +323,857,47,Comment 323 +324,163,3,Comment 324 +325,712,29,Comment 325 +326,26,3,Comment 326 +327,419,8,Comment 327 +328,4,7,Comment 328 +329,764,3,Comment 329 +330,931,3,Comment 330 +331,25,3,Comment 331 +332,893,3,Comment 332 +333,719,70,Comment 333 +334,327,3,Comment 334 +335,857,8,Comment 335 +336,266,60,Comment 336 +337,360,3,Comment 337 +338,74,3,Comment 338 +339,540,49,Comment 339 +340,25,38,Comment 340 +341,447,30,Comment 341 +342,587,3,Comment 342 +343,784,59,Comment 343 +344,122,3,Comment 344 +345,698,3,Comment 345 +346,645,59,Comment 346 +347,488,3,Comment 347 +348,488,3,Comment 348 +349,66,3,Comment 349 +350,488,3,Comment 350 +351,327,3,Comment 351 +352,856,45,Comment 352 +353,294,3,Comment 353 +354,488,3,Comment 354 +355,383,59,Comment 355 +356,857,87,Comment 356 +357,219,38,Comment 357 +358,40,3,Comment 358 +359,689,23,Comment 359 +360,360,38,Comment 360 +361,488,3,Comment 361 +362,25,67,Comment 362 +363,931,59,Comment 363 +364,857,33,Comment 364 +365,504,3,Comment 365 +366,21,45,Comment 366 +367,327,83,Comment 367 +368,961,3,Comment 368 +369,582,95,Comment 369 +370,137,59,Comment 370 +371,419,3,Comment 371 +372,945,38,Comment 372 +373,452,3,Comment 373 +374,25,3,Comment 374 +375,503,66,Comment 375 +376,226,43,Comment 376 +377,74,38,Comment 377 +378,353,3,Comment 378 +379,488,3,Comment 379 +380,21,77,Comment 380 +381,488,3,Comment 381 +382,451,3,Comment 382 +383,740,3,Comment 383 +384,379,3,Comment 384 +385,483,59,Comment 385 +386,682,38,Comment 386 +387,857,3,Comment 387 +388,327,3,Comment 388 +389,488,31,Comment 389 +390,599,38,Comment 390 +391,25,3,Comment 391 +392,748,37,Comment 392 +393,327,3,Comment 393 +394,559,3,Comment 394 +395,431,3,Comment 395 +396,611,3,Comment 396 +397,657,96,Comment 397 +398,168,96,Comment 398 +399,546,8,Comment 399 +400,828,3,Comment 400 +401,203,95,Comment 401 +402,702,66,Comment 402 +403,512,48,Comment 403 +404,931,8,Comment 404 +405,219,4,Comment 405 +406,122,59,Comment 406 +407,293,20,Comment 407 +408,219,41,Comment 408 +409,702,31,Comment 409 +410,665,47,Comment 410 +411,559,3,Comment 411 +412,198,3,Comment 412 +413,849,3,Comment 413 +414,935,3,Comment 414 +415,451,3,Comment 415 +416,526,18,Comment 416 +417,451,3,Comment 417 +418,242,96,Comment 418 +419,244,20,Comment 419 +420,294,3,Comment 420 +421,168,45,Comment 421 +422,857,3,Comment 422 +423,841,3,Comment 423 +424,419,67,Comment 424 +425,327,3,Comment 425 +426,904,3,Comment 426 +427,198,3,Comment 427 +428,483,49,Comment 428 +429,25,41,Comment 429 +430,168,1,Comment 430 +431,488,3,Comment 431 +432,546,47,Comment 432 +433,488,3,Comment 433 +434,857,33,Comment 434 +435,987,3,Comment 435 +436,712,8,Comment 436 +437,423,74,Comment 437 +438,803,3,Comment 438 +439,102,45,Comment 439 +440,587,8,Comment 440 +441,451,3,Comment 441 +442,158,3,Comment 442 +443,950,68,Comment 443 
+444,305,96,Comment 444 +445,499,69,Comment 445 +446,857,3,Comment 446 +447,451,3,Comment 447 +448,91,67,Comment 448 +449,779,26,Comment 449 +450,327,59,Comment 450 +451,970,59,Comment 451 +452,857,3,Comment 452 +453,833,3,Comment 453 +454,327,12,Comment 454 +455,702,93,Comment 455 +456,979,84,Comment 456 +457,451,7,Comment 457 +458,431,3,Comment 458 +459,18,3,Comment 459 +460,609,21,Comment 460 +461,841,85,Comment 461 +462,217,3,Comment 462 +463,294,3,Comment 463 +464,451,38,Comment 464 +465,32,18,Comment 465 +466,63,3,Comment 466 +467,373,3,Comment 467 +468,219,13,Comment 468 +469,358,59,Comment 469 +470,412,59,Comment 470 +471,954,96,Comment 471 +472,919,3,Comment 472 +473,121,3,Comment 473 +474,857,0,Comment 474 +475,45,67,Comment 475 +476,451,38,Comment 476 +477,142,59,Comment 477 +478,327,10,Comment 478 +479,682,90,Comment 479 +480,382,3,Comment 480 +481,758,3,Comment 481 +482,645,3,Comment 482 +483,918,3,Comment 483 +484,452,3,Comment 484 +485,753,31,Comment 485 +486,297,3,Comment 486 +487,326,38,Comment 487 +488,351,93,Comment 488 +489,360,66,Comment 489 +490,78,3,Comment 490 +491,451,3,Comment 491 +492,74,94,Comment 492 +493,857,98,Comment 493 +494,26,3,Comment 494 +495,408,8,Comment 495 +496,243,3,Comment 496 +497,122,47,Comment 497 +498,725,6,Comment 498 +499,639,60,Comment 499 +500,122,33,Comment 500 +501,510,3,Comment 501 +502,738,3,Comment 502 +503,719,63,Comment 503 +504,68,45,Comment 504 +505,590,38,Comment 505 +506,641,44,Comment 506 +507,849,37,Comment 507 +508,451,38,Comment 508 +509,419,38,Comment 509 +510,944,21,Comment 510 +511,741,57,Comment 511 +512,488,3,Comment 512 +513,304,11,Comment 513 +514,827,3,Comment 514 +515,414,3,Comment 515 +516,69,3,Comment 516 +517,413,3,Comment 517 +518,857,3,Comment 518 +519,408,3,Comment 519 +520,18,3,Comment 520 +521,491,3,Comment 521 +522,993,3,Comment 522 +523,774,3,Comment 523 +524,500,3,Comment 524 +525,861,3,Comment 525 +526,768,50,Comment 526 +527,451,3,Comment 527 +528,488,3,Comment 528 +529,665,36,Comment 529 +530,828,3,Comment 530 +531,25,3,Comment 531 +532,719,3,Comment 532 +533,406,6,Comment 533 +534,545,45,Comment 534 +535,712,38,Comment 535 +536,459,1,Comment 536 +537,128,59,Comment 537 +538,360,3,Comment 538 +539,36,45,Comment 539 +540,590,3,Comment 540 +541,931,3,Comment 541 +542,741,3,Comment 542 +543,334,65,Comment 543 +544,488,3,Comment 544 +545,277,44,Comment 545 +546,459,3,Comment 546 +547,272,3,Comment 547 +548,676,93,Comment 548 +549,219,36,Comment 549 +550,940,82,Comment 550 +551,546,3,Comment 551 +552,26,67,Comment 552 +553,5,93,Comment 553 +554,993,3,Comment 554 +555,198,85,Comment 555 +556,293,3,Comment 556 +557,191,3,Comment 557 +558,881,99,Comment 558 +559,529,94,Comment 559 +560,451,3,Comment 560 +561,485,67,Comment 561 +562,297,3,Comment 562 +563,498,96,Comment 563 +564,371,70,Comment 564 +565,185,85,Comment 565 +566,539,32,Comment 566 +567,327,3,Comment 567 +568,725,3,Comment 568 +569,363,20,Comment 569 +570,451,44,Comment 570 +571,727,53,Comment 571 +572,407,3,Comment 572 +573,945,21,Comment 573 +574,738,8,Comment 574 +575,792,3,Comment 575 +576,931,26,Comment 576 +577,330,3,Comment 577 +578,488,21,Comment 578 +579,596,3,Comment 579 +580,568,84,Comment 580 +581,931,3,Comment 581 +582,249,37,Comment 582 +583,258,3,Comment 583 +584,868,38,Comment 584 +585,745,3,Comment 585 +586,185,95,Comment 586 +587,488,3,Comment 587 +588,682,37,Comment 588 +589,327,3,Comment 589 +590,393,72,Comment 590 +591,488,61,Comment 591 +592,425,3,Comment 592 +593,696,87,Comment 593 +594,4,3,Comment 594 +595,393,56,Comment 595 
+596,305,8,Comment 596 +597,488,67,Comment 597 +598,719,8,Comment 598 +599,36,3,Comment 599 +600,127,3,Comment 600 +601,554,98,Comment 601 +602,676,3,Comment 602 +603,488,3,Comment 603 +604,969,61,Comment 604 +605,451,3,Comment 605 +606,121,3,Comment 606 +607,590,3,Comment 607 +608,488,3,Comment 608 +609,488,99,Comment 609 +610,472,45,Comment 610 +611,905,20,Comment 611 +612,118,61,Comment 612 +613,32,42,Comment 613 +614,122,3,Comment 614 +615,216,3,Comment 615 +616,488,74,Comment 616 +617,495,3,Comment 617 +618,198,8,Comment 618 +619,451,3,Comment 619 +620,440,3,Comment 620 +621,408,47,Comment 621 +622,754,22,Comment 622 +623,431,44,Comment 623 +624,702,3,Comment 624 +625,145,3,Comment 625 +626,451,3,Comment 626 +627,789,6,Comment 627 +628,158,40,Comment 628 +629,423,93,Comment 629 +630,488,42,Comment 630 +631,165,20,Comment 631 +632,702,5,Comment 632 +633,41,3,Comment 633 +634,924,11,Comment 634 +635,428,3,Comment 635 +636,304,33,Comment 636 +637,96,50,Comment 637 +638,388,93,Comment 638 +639,545,3,Comment 639 +640,70,3,Comment 640 +641,282,3,Comment 641 +642,806,78,Comment 642 +643,526,66,Comment 643 +644,191,59,Comment 644 +645,137,3,Comment 645 +646,857,27,Comment 646 +647,245,3,Comment 647 +648,159,3,Comment 648 +649,999,59,Comment 649 +650,291,3,Comment 650 +651,109,17,Comment 651 +652,419,20,Comment 652 +653,393,3,Comment 653 +654,880,14,Comment 654 +655,120,3,Comment 655 +656,66,0,Comment 656 +657,69,78,Comment 657 +658,857,5,Comment 658 +659,408,85,Comment 659 +660,122,15,Comment 660 +661,488,8,Comment 661 +662,458,3,Comment 662 +663,183,3,Comment 663 +664,488,3,Comment 664 +665,308,37,Comment 665 +666,205,58,Comment 666 +667,451,23,Comment 667 +668,258,57,Comment 668 +669,198,3,Comment 669 +670,857,34,Comment 670 +671,124,79,Comment 671 +672,234,3,Comment 672 +673,328,3,Comment 673 +674,231,55,Comment 674 +675,195,64,Comment 675 +676,719,3,Comment 676 +677,304,3,Comment 677 +678,89,38,Comment 678 +679,459,3,Comment 679 +680,110,38,Comment 680 +681,327,15,Comment 681 +682,857,59,Comment 682 +683,327,69,Comment 683 +684,509,73,Comment 684 +685,451,38,Comment 685 +686,121,38,Comment 686 +687,526,8,Comment 687 +688,837,3,Comment 688 +689,69,3,Comment 689 +690,697,3,Comment 690 +691,590,8,Comment 691 +692,855,70,Comment 692 +693,78,36,Comment 693 +694,282,44,Comment 694 +695,598,3,Comment 695 +696,25,3,Comment 696 +697,666,3,Comment 697 +698,841,68,Comment 698 +699,408,3,Comment 699 +700,393,30,Comment 700 +701,232,3,Comment 701 +702,4,3,Comment 702 +703,165,3,Comment 703 +704,964,3,Comment 704 +705,856,3,Comment 705 +706,224,37,Comment 706 +707,940,3,Comment 707 +708,327,59,Comment 708 +709,266,3,Comment 709 +710,122,3,Comment 710 +711,857,44,Comment 711 +712,980,37,Comment 712 +713,304,3,Comment 713 +714,613,8,Comment 714 +715,304,3,Comment 715 +716,78,38,Comment 716 +717,337,3,Comment 717 +718,483,44,Comment 718 +719,105,8,Comment 719 +720,778,3,Comment 720 +721,451,54,Comment 721 +722,200,3,Comment 722 +723,488,3,Comment 723 +724,738,3,Comment 724 +725,304,72,Comment 725 +726,609,14,Comment 726 +727,384,20,Comment 727 +728,941,3,Comment 728 +729,718,3,Comment 729 +730,327,3,Comment 730 +731,21,20,Comment 731 +732,542,45,Comment 732 +733,181,17,Comment 733 +734,103,67,Comment 734 +735,889,3,Comment 735 +736,999,3,Comment 736 +737,226,7,Comment 737 +738,272,3,Comment 738 +739,142,3,Comment 739 +740,419,96,Comment 740 +741,855,71,Comment 741 +742,609,3,Comment 742 +743,828,3,Comment 743 +744,198,8,Comment 744 +745,665,59,Comment 745 +746,868,3,Comment 746 +747,236,3,Comment 747 
+748,590,3,Comment 748 +749,351,38,Comment 749 +750,254,4,Comment 750 +751,950,70,Comment 751 +752,327,3,Comment 752 +753,81,13,Comment 753 +754,329,47,Comment 754 +755,407,24,Comment 755 +756,695,3,Comment 756 +757,931,3,Comment 757 +758,773,22,Comment 758 +759,889,3,Comment 759 +760,431,93,Comment 760 +761,646,22,Comment 761 +762,290,3,Comment 762 +763,26,48,Comment 763 +764,327,3,Comment 764 +765,602,59,Comment 765 +766,232,8,Comment 766 +767,848,3,Comment 767 +768,734,3,Comment 768 +769,174,98,Comment 769 +770,304,3,Comment 770 +771,790,94,Comment 771 +772,216,17,Comment 772 +773,304,17,Comment 773 +774,317,3,Comment 774 +775,749,94,Comment 775 +776,25,50,Comment 776 +777,32,20,Comment 777 +778,488,3,Comment 778 +779,346,3,Comment 779 +780,510,8,Comment 780 +781,224,3,Comment 781 +782,857,28,Comment 782 +783,708,3,Comment 783 +784,26,3,Comment 784 +785,725,18,Comment 785 +786,950,3,Comment 786 +787,917,3,Comment 787 +788,668,3,Comment 788 +789,106,3,Comment 789 +790,488,3,Comment 790 +791,243,3,Comment 791 +792,950,3,Comment 792 +793,644,3,Comment 793 +794,490,3,Comment 794 +795,600,3,Comment 795 +796,394,8,Comment 796 +797,327,67,Comment 797 +798,896,93,Comment 798 +799,304,3,Comment 799 +800,25,3,Comment 800 +801,352,31,Comment 801 +802,734,3,Comment 802 +803,526,3,Comment 803 +804,938,3,Comment 804 +805,81,3,Comment 805 +806,860,3,Comment 806 +807,327,29,Comment 807 +808,431,38,Comment 808 +809,385,97,Comment 809 +810,95,3,Comment 810 +811,217,3,Comment 811 +812,682,94,Comment 812 +813,25,73,Comment 813 +814,120,38,Comment 814 +815,25,68,Comment 815 +816,841,3,Comment 816 +817,501,76,Comment 817 +818,148,74,Comment 818 +819,713,3,Comment 819 +820,945,18,Comment 820 +821,895,93,Comment 821 +822,870,44,Comment 822 +823,4,41,Comment 823 +824,488,29,Comment 824 +825,219,3,Comment 825 +826,488,96,Comment 826 +827,297,8,Comment 827 +828,122,3,Comment 828 +829,403,49,Comment 829 +830,451,37,Comment 830 +831,986,3,Comment 831 +832,25,3,Comment 832 +833,272,3,Comment 833 +834,828,3,Comment 834 +835,545,38,Comment 835 +836,792,18,Comment 836 +837,545,3,Comment 837 +838,703,3,Comment 838 +839,451,3,Comment 839 +840,185,52,Comment 840 +841,763,3,Comment 841 +842,488,3,Comment 842 +843,121,3,Comment 843 +844,757,3,Comment 844 +845,938,76,Comment 845 +846,327,3,Comment 846 +847,261,95,Comment 847 +848,49,3,Comment 848 +849,553,3,Comment 849 +850,938,74,Comment 850 +851,121,7,Comment 851 +852,447,3,Comment 852 +853,74,45,Comment 853 +854,25,3,Comment 854 +855,553,82,Comment 855 +856,857,68,Comment 856 +857,305,3,Comment 857 +858,857,59,Comment 858 +859,96,96,Comment 859 +860,205,14,Comment 860 +861,857,32,Comment 861 +862,451,45,Comment 862 +863,488,3,Comment 863 +864,25,38,Comment 864 +865,117,11,Comment 865 +866,25,3,Comment 866 +867,857,84,Comment 867 +868,120,59,Comment 868 +869,828,93,Comment 869 +870,327,67,Comment 870 +871,747,3,Comment 871 +872,327,37,Comment 872 +873,225,45,Comment 873 +874,69,43,Comment 874 +875,235,3,Comment 875 +876,431,7,Comment 876 +877,775,3,Comment 877 +878,408,3,Comment 878 +879,950,29,Comment 879 +880,460,33,Comment 880 +881,25,3,Comment 881 +882,363,8,Comment 882 +883,590,11,Comment 883 +884,200,2,Comment 884 +885,605,3,Comment 885 +886,451,38,Comment 886 +887,25,98,Comment 887 +888,719,39,Comment 888 +889,488,3,Comment 889 +890,51,66,Comment 890 +891,431,8,Comment 891 +892,245,3,Comment 892 +893,857,37,Comment 893 +894,243,3,Comment 894 +895,915,5,Comment 895 +896,473,3,Comment 896 +897,297,38,Comment 897 +898,768,8,Comment 898 +899,602,3,Comment 899 
+900,361,3,Comment 900 +901,504,17,Comment 901 +902,719,59,Comment 902 +903,828,1,Comment 903 +904,771,38,Comment 904 +905,144,3,Comment 905 +906,792,37,Comment 906 +907,25,67,Comment 907 +908,185,3,Comment 908 +909,504,15,Comment 909 +910,91,59,Comment 910 +911,488,3,Comment 911 +912,286,3,Comment 912 +913,385,85,Comment 913 +914,294,45,Comment 914 +915,28,51,Comment 915 +916,451,69,Comment 916 +917,939,22,Comment 917 +918,492,13,Comment 918 +919,271,27,Comment 919 +920,934,48,Comment 920 +921,332,3,Comment 921 +922,602,3,Comment 922 +923,513,3,Comment 923 +924,931,96,Comment 924 +925,861,3,Comment 925 +926,122,47,Comment 926 +927,55,93,Comment 927 +928,762,3,Comment 928 +929,857,48,Comment 929 +930,451,3,Comment 930 +931,590,3,Comment 931 +932,27,70,Comment 932 +933,198,86,Comment 933 +934,122,3,Comment 934 +935,488,38,Comment 935 +936,180,3,Comment 936 +937,25,59,Comment 937 +938,272,79,Comment 938 +939,574,3,Comment 939 +940,488,40,Comment 940 +941,304,77,Comment 941 +942,802,3,Comment 942 +943,232,70,Comment 943 +944,219,32,Comment 944 +945,488,4,Comment 945 +946,434,20,Comment 946 +947,404,66,Comment 947 +948,124,48,Comment 948 +949,451,17,Comment 949 +950,219,13,Comment 950 +951,337,84,Comment 951 +952,665,3,Comment 952 +953,899,3,Comment 953 +954,719,44,Comment 954 +955,358,46,Comment 955 +956,488,32,Comment 956 +957,684,3,Comment 957 +958,361,3,Comment 958 +959,327,38,Comment 959 +960,120,3,Comment 960 +961,670,17,Comment 961 +962,809,3,Comment 962 +963,296,18,Comment 963 +964,725,3,Comment 964 +965,490,3,Comment 965 +966,725,51,Comment 966 +967,360,3,Comment 967 +968,686,3,Comment 968 +969,360,71,Comment 969 +970,60,3,Comment 970 +971,482,3,Comment 971 +972,411,47,Comment 972 +973,219,3,Comment 973 +974,857,31,Comment 974 +975,327,38,Comment 975 +976,25,96,Comment 976 +977,327,3,Comment 977 +978,382,3,Comment 978 +979,848,93,Comment 979 +980,744,48,Comment 980 +981,185,8,Comment 981 +982,811,51,Comment 982 +983,217,4,Comment 983 +984,312,4,Comment 984 +985,36,3,Comment 985 +986,25,20,Comment 986 +987,581,3,Comment 987 +988,873,38,Comment 988 +989,451,3,Comment 989 +990,824,70,Comment 990 +991,739,59,Comment 991 +992,553,3,Comment 992 +993,959,35,Comment 993 +994,753,47,Comment 994 +995,232,19,Comment 995 +996,732,3,Comment 996 +997,593,18,Comment 997 +998,350,8,Comment 998 +999,36,3,Comment 999 diff --git a/axolotl/tests/data/datasets/database_dataset_4/tables/learningData.csv b/axolotl/tests/data/datasets/database_dataset_4/tables/learningData.csv new file mode 100644 index 0000000..31bf113 --- /dev/null +++ b/axolotl/tests/data/datasets/database_dataset_4/tables/learningData.csv @@ -0,0 +1,201 @@ +d3mIndex,user_id,post_id,made_comment +0,3,490,yes +1,64,897,no +2,64,219,no +3,64,620,no +4,47,546,yes +5,16,562,yes +6,64,551,no +7,29,327,yes +8,3,407,yes +9,64,653,no +10,1,224,yes +11,64,435,no +12,27,271,yes +13,45,168,yes +14,28,857,yes +15,84,526,yes +16,64,817,no +17,64,712,no +18,70,27,yes +19,97,385,yes +20,3,137,yes +21,91,519,yes +22,37,857,yes +23,64,25,no +24,3,719,yes +25,64,928,no +26,38,451,yes +27,64,22,no +28,64,839,no +29,64,872,no +30,3,198,yes +31,64,887,no +32,64,847,no +33,64,909,no +34,33,857,yes +35,64,947,no +36,64,848,no +37,64,723,no +38,64,715,no +39,3,127,yes +40,64,442,no +41,64,250,no +42,56,637,yes +43,59,358,yes +44,11,924,yes +45,64,751,no +46,64,671,no +47,3,327,yes +48,21,488,yes +49,64,196,no +50,64,394,no +51,3,258,yes +52,45,931,yes +53,64,761,no +54,64,999,no +55,3,941,yes +56,45,856,yes +57,65,919,yes +58,64,942,no +59,64,826,no 
+60,64,576,no +61,64,34,no +62,20,556,yes +63,59,358,yes +64,64,500,no +65,95,938,yes +66,64,673,no +67,64,184,no +68,47,329,yes +69,64,764,no +70,64,899,no +71,66,51,yes +72,64,11,no +73,64,132,no +74,64,179,no +75,64,198,no +76,84,568,yes +77,3,828,yes +78,93,895,yes +79,3,763,yes +80,30,638,yes +81,64,963,no +82,64,791,no +83,15,584,yes +84,64,800,no +85,64,359,no +86,98,25,yes +87,3,945,yes +88,64,892,no +89,64,936,no +90,64,319,no +91,47,122,yes +92,8,451,yes +93,64,191,no +94,64,968,no +95,64,822,no +96,64,121,no +97,3,488,yes +98,3,290,yes +99,3,488,yes +100,3,682,yes +101,64,996,no +102,3,305,yes +103,64,949,no +104,64,194,no +105,3,931,yes +106,59,784,yes +107,8,587,yes +108,64,429,no +109,64,940,no +110,64,786,no +111,3,425,yes +112,78,806,yes +113,3,473,yes +114,64,10,no +115,3,833,yes +116,64,120,no +117,97,198,yes +118,44,277,yes +119,64,430,no +120,3,458,yes +121,64,168,no +122,3,447,yes +123,3,775,yes +124,64,127,no +125,3,36,yes +126,96,96,yes +127,72,504,yes +128,64,601,no +129,3,294,yes +130,64,808,no +131,48,744,yes +132,64,343,no +133,64,896,no +134,64,318,no +135,96,758,yes +136,85,185,yes +137,64,532,no +138,3,964,yes +139,64,571,no +140,93,423,yes +141,87,451,yes +142,17,308,yes +143,64,134,no +144,64,28,no +145,3,330,yes +146,3,286,yes +147,64,245,no +148,64,252,no +149,18,792,yes +150,3,452,yes +151,64,506,no +152,64,518,no +153,64,918,no +154,3,735,yes +155,3,600,yes +156,64,293,no +157,30,393,yes +158,3,428,yes +159,64,413,no +160,57,258,yes +161,3,945,yes +162,64,49,no +163,64,813,no +164,64,422,no +165,49,526,yes +166,3,451,yes +167,64,585,no +168,64,411,no +169,45,68,yes +170,59,857,yes +171,38,121,yes +172,64,583,no +173,64,512,no +174,3,713,yes +175,45,304,yes +176,11,488,yes +177,64,490,no +178,3,327,yes +179,64,160,no +180,3,25,yes +181,64,679,no +182,64,39,no +183,64,484,no +184,64,363,no +185,47,857,yes +186,67,857,yes +187,64,926,no +188,64,924,no +189,64,480,no +190,66,198,yes +191,64,755,no +192,3,952,yes +193,13,492,yes +194,64,592,no +195,35,719,yes +196,64,591,no +197,64,664,no +198,64,885,no +199,3,451,yes diff --git a/axolotl/tests/data/datasets/database_dataset_4/tables/posts.csv b/axolotl/tests/data/datasets/database_dataset_4/tables/posts.csv new file mode 100644 index 0000000..0d17d9f --- /dev/null +++ b/axolotl/tests/data/datasets/database_dataset_4/tables/posts.csv @@ -0,0 +1,1001 @@ +id,author_id,post +0,66,Post 0 +1,20,Post 1 +2,70,Post 2 +3,93,Post 3 +4,20,Post 4 +5,52,Post 5 +6,52,Post 6 +7,52,Post 7 +8,20,Post 8 +9,91,Post 9 +10,41,Post 10 +11,72,Post 11 +12,68,Post 12 +13,23,Post 13 +14,72,Post 14 +15,37,Post 15 +16,80,Post 16 +17,52,Post 17 +18,80,Post 18 +19,68,Post 19 +20,70,Post 20 +21,52,Post 21 +22,91,Post 22 +23,57,Post 23 +24,38,Post 24 +25,52,Post 25 +26,2,Post 26 +27,23,Post 27 +28,62,Post 28 +29,22,Post 29 +30,52,Post 30 +31,38,Post 31 +32,13,Post 32 +33,23,Post 33 +34,52,Post 34 +35,52,Post 35 +36,52,Post 36 +37,60,Post 37 +38,59,Post 38 +39,38,Post 39 +40,83,Post 40 +41,31,Post 41 +42,39,Post 42 +43,83,Post 43 +44,72,Post 44 +45,69,Post 45 +46,8,Post 46 +47,88,Post 47 +48,70,Post 48 +49,99,Post 49 +50,13,Post 50 +51,76,Post 51 +52,16,Post 52 +53,52,Post 53 +54,11,Post 54 +55,72,Post 55 +56,72,Post 56 +57,52,Post 57 +58,38,Post 58 +59,8,Post 59 +60,68,Post 60 +61,42,Post 61 +62,70,Post 62 +63,75,Post 63 +64,95,Post 64 +65,74,Post 65 +66,1,Post 66 +67,30,Post 67 +68,70,Post 68 +69,17,Post 69 +70,52,Post 70 +71,7,Post 71 +72,19,Post 72 +73,2,Post 73 +74,72,Post 74 +75,20,Post 75 +76,27,Post 76 +77,89,Post 77 +78,69,Post 78 
+79,5,Post 79 +80,16,Post 80 +81,52,Post 81 +82,52,Post 82 +83,20,Post 83 +84,89,Post 84 +85,52,Post 85 +86,52,Post 86 +87,52,Post 87 +88,70,Post 88 +89,25,Post 89 +90,37,Post 90 +91,20,Post 91 +92,18,Post 92 +93,89,Post 93 +94,70,Post 94 +95,50,Post 95 +96,20,Post 96 +97,20,Post 97 +98,7,Post 98 +99,38,Post 99 +100,25,Post 100 +101,68,Post 101 +102,33,Post 102 +103,18,Post 103 +104,3,Post 104 +105,8,Post 105 +106,66,Post 106 +107,42,Post 107 +108,52,Post 108 +109,83,Post 109 +110,98,Post 110 +111,20,Post 111 +112,62,Post 112 +113,20,Post 113 +114,3,Post 114 +115,70,Post 115 +116,27,Post 116 +117,34,Post 117 +118,52,Post 118 +119,72,Post 119 +120,54,Post 120 +121,77,Post 121 +122,20,Post 122 +123,72,Post 123 +124,18,Post 124 +125,91,Post 125 +126,66,Post 126 +127,20,Post 127 +128,89,Post 128 +129,70,Post 129 +130,20,Post 130 +131,20,Post 131 +132,52,Post 132 +133,4,Post 133 +134,20,Post 134 +135,38,Post 135 +136,32,Post 136 +137,44,Post 137 +138,21,Post 138 +139,52,Post 139 +140,75,Post 140 +141,10,Post 141 +142,52,Post 142 +143,13,Post 143 +144,70,Post 144 +145,36,Post 145 +146,52,Post 146 +147,18,Post 147 +148,13,Post 148 +149,49,Post 149 +150,29,Post 150 +151,89,Post 151 +152,70,Post 152 +153,70,Post 153 +154,84,Post 154 +155,8,Post 155 +156,52,Post 156 +157,52,Post 157 +158,93,Post 158 +159,22,Post 159 +160,20,Post 160 +161,8,Post 161 +162,2,Post 162 +163,89,Post 163 +164,64,Post 164 +165,72,Post 165 +166,21,Post 166 +167,52,Post 167 +168,8,Post 168 +169,49,Post 169 +170,96,Post 170 +171,78,Post 171 +172,27,Post 172 +173,93,Post 173 +174,20,Post 174 +175,90,Post 175 +176,89,Post 176 +177,72,Post 177 +178,54,Post 178 +179,78,Post 179 +180,23,Post 180 +181,72,Post 181 +182,52,Post 182 +183,1,Post 183 +184,27,Post 184 +185,13,Post 185 +186,96,Post 186 +187,47,Post 187 +188,51,Post 188 +189,56,Post 189 +190,31,Post 190 +191,13,Post 191 +192,72,Post 192 +193,19,Post 193 +194,52,Post 194 +195,20,Post 195 +196,8,Post 196 +197,74,Post 197 +198,94,Post 198 +199,92,Post 199 +200,85,Post 200 +201,70,Post 201 +202,27,Post 202 +203,8,Post 203 +204,38,Post 204 +205,20,Post 205 +206,13,Post 206 +207,7,Post 207 +208,70,Post 208 +209,1,Post 209 +210,70,Post 210 +211,13,Post 211 +212,8,Post 212 +213,8,Post 213 +214,65,Post 214 +215,20,Post 215 +216,38,Post 216 +217,52,Post 217 +218,74,Post 218 +219,70,Post 219 +220,20,Post 220 +221,13,Post 221 +222,7,Post 222 +223,23,Post 223 +224,20,Post 224 +225,42,Post 225 +226,66,Post 226 +227,68,Post 227 +228,21,Post 228 +229,33,Post 229 +230,18,Post 230 +231,72,Post 231 +232,7,Post 232 +233,68,Post 233 +234,71,Post 234 +235,71,Post 235 +236,20,Post 236 +237,32,Post 237 +238,52,Post 238 +239,20,Post 239 +240,31,Post 240 +241,19,Post 241 +242,43,Post 242 +243,7,Post 243 +244,72,Post 244 +245,8,Post 245 +246,52,Post 246 +247,25,Post 247 +248,52,Post 248 +249,92,Post 249 +250,58,Post 250 +251,5,Post 251 +252,38,Post 252 +253,52,Post 253 +254,52,Post 254 +255,66,Post 255 +256,21,Post 256 +257,12,Post 257 +258,36,Post 258 +259,91,Post 259 +260,18,Post 260 +261,85,Post 261 +262,52,Post 262 +263,42,Post 263 +264,80,Post 264 +265,43,Post 265 +266,70,Post 266 +267,37,Post 267 +268,85,Post 268 +269,67,Post 269 +270,68,Post 270 +271,27,Post 271 +272,70,Post 272 +273,56,Post 273 +274,20,Post 274 +275,15,Post 275 +276,72,Post 276 +277,92,Post 277 +278,43,Post 278 +279,52,Post 279 +280,74,Post 280 +281,42,Post 281 +282,91,Post 282 +283,52,Post 283 +284,72,Post 284 +285,86,Post 285 +286,72,Post 286 +287,15,Post 287 +288,54,Post 288 +289,37,Post 289 +290,8,Post 290 +291,38,Post 
291 +292,20,Post 292 +293,72,Post 293 +294,5,Post 294 +295,92,Post 295 +296,29,Post 296 +297,29,Post 297 +298,2,Post 298 +299,18,Post 299 +300,37,Post 300 +301,89,Post 301 +302,8,Post 302 +303,89,Post 303 +304,76,Post 304 +305,42,Post 305 +306,27,Post 306 +307,20,Post 307 +308,52,Post 308 +309,5,Post 309 +310,2,Post 310 +311,38,Post 311 +312,8,Post 312 +313,20,Post 313 +314,20,Post 314 +315,20,Post 315 +316,13,Post 316 +317,1,Post 317 +318,10,Post 318 +319,52,Post 319 +320,95,Post 320 +321,98,Post 321 +322,38,Post 322 +323,16,Post 323 +324,56,Post 324 +325,50,Post 325 +326,98,Post 326 +327,8,Post 327 +328,72,Post 328 +329,22,Post 329 +330,20,Post 330 +331,62,Post 331 +332,20,Post 332 +333,63,Post 333 +334,52,Post 334 +335,38,Post 335 +336,52,Post 336 +337,21,Post 337 +338,69,Post 338 +339,38,Post 339 +340,30,Post 340 +341,72,Post 341 +342,89,Post 342 +343,7,Post 343 +344,20,Post 344 +345,28,Post 345 +346,72,Post 346 +347,98,Post 347 +348,93,Post 348 +349,85,Post 349 +350,23,Post 350 +351,98,Post 351 +352,20,Post 352 +353,9,Post 353 +354,90,Post 354 +355,20,Post 355 +356,67,Post 356 +357,7,Post 357 +358,70,Post 358 +359,80,Post 359 +360,20,Post 360 +361,33,Post 361 +362,32,Post 362 +363,70,Post 363 +364,20,Post 364 +365,17,Post 365 +366,41,Post 366 +367,24,Post 367 +368,72,Post 368 +369,20,Post 369 +370,52,Post 370 +371,89,Post 371 +372,55,Post 372 +373,76,Post 373 +374,89,Post 374 +375,70,Post 375 +376,68,Post 376 +377,93,Post 377 +378,98,Post 378 +379,42,Post 379 +380,8,Post 380 +381,22,Post 381 +382,13,Post 382 +383,38,Post 383 +384,13,Post 384 +385,52,Post 385 +386,34,Post 386 +387,83,Post 387 +388,93,Post 388 +389,52,Post 389 +390,20,Post 390 +391,52,Post 391 +392,83,Post 392 +393,38,Post 393 +394,52,Post 394 +395,20,Post 395 +396,42,Post 396 +397,37,Post 397 +398,20,Post 398 +399,52,Post 399 +400,25,Post 400 +401,32,Post 401 +402,52,Post 402 +403,70,Post 403 +404,27,Post 404 +405,89,Post 405 +406,74,Post 406 +407,7,Post 407 +408,20,Post 408 +409,41,Post 409 +410,8,Post 410 +411,28,Post 411 +412,70,Post 412 +413,66,Post 413 +414,52,Post 414 +415,70,Post 415 +416,20,Post 416 +417,27,Post 417 +418,66,Post 418 +419,79,Post 419 +420,52,Post 420 +421,21,Post 421 +422,5,Post 422 +423,70,Post 423 +424,1,Post 424 +425,32,Post 425 +426,52,Post 426 +427,89,Post 427 +428,8,Post 428 +429,38,Post 429 +430,3,Post 430 +431,27,Post 431 +432,52,Post 432 +433,21,Post 433 +434,20,Post 434 +435,10,Post 435 +436,52,Post 436 +437,68,Post 437 +438,65,Post 438 +439,90,Post 439 +440,0,Post 440 +441,58,Post 441 +442,52,Post 442 +443,52,Post 443 +444,93,Post 444 +445,2,Post 445 +446,68,Post 446 +447,72,Post 447 +448,52,Post 448 +449,27,Post 449 +450,72,Post 450 +451,8,Post 451 +452,39,Post 452 +453,52,Post 453 +454,68,Post 454 +455,8,Post 455 +456,20,Post 456 +457,38,Post 457 +458,52,Post 458 +459,81,Post 459 +460,52,Post 460 +461,13,Post 461 +462,96,Post 462 +463,77,Post 463 +464,52,Post 464 +465,89,Post 465 +466,52,Post 466 +467,89,Post 467 +468,72,Post 468 +469,93,Post 469 +470,88,Post 470 +471,6,Post 471 +472,17,Post 472 +473,35,Post 473 +474,91,Post 474 +475,23,Post 475 +476,15,Post 476 +477,81,Post 477 +478,41,Post 478 +479,86,Post 479 +480,15,Post 480 +481,62,Post 481 +482,39,Post 482 +483,8,Post 483 +484,68,Post 484 +485,20,Post 485 +486,6,Post 486 +487,8,Post 487 +488,8,Post 488 +489,86,Post 489 +490,70,Post 490 +491,83,Post 491 +492,65,Post 492 +493,52,Post 493 +494,24,Post 494 +495,99,Post 495 +496,31,Post 496 +497,45,Post 497 +498,33,Post 498 +499,96,Post 499 +500,17,Post 500 +501,27,Post 501 
+502,66,Post 502 +503,8,Post 503 +504,52,Post 504 +505,46,Post 505 +506,21,Post 506 +507,20,Post 507 +508,52,Post 508 +509,31,Post 509 +510,42,Post 510 +511,27,Post 511 +512,94,Post 512 +513,13,Post 513 +514,8,Post 514 +515,27,Post 515 +516,52,Post 516 +517,62,Post 517 +518,37,Post 518 +519,99,Post 519 +520,28,Post 520 +521,70,Post 521 +522,56,Post 522 +523,72,Post 523 +524,95,Post 524 +525,82,Post 525 +526,70,Post 526 +527,68,Post 527 +528,27,Post 528 +529,13,Post 529 +530,8,Post 530 +531,20,Post 531 +532,38,Post 532 +533,52,Post 533 +534,70,Post 534 +535,92,Post 535 +536,10,Post 536 +537,9,Post 537 +538,52,Post 538 +539,70,Post 539 +540,72,Post 540 +541,89,Post 541 +542,97,Post 542 +543,37,Post 543 +544,33,Post 544 +545,13,Post 545 +546,66,Post 546 +547,61,Post 547 +548,74,Post 548 +549,8,Post 549 +550,51,Post 550 +551,52,Post 551 +552,20,Post 552 +553,17,Post 553 +554,74,Post 554 +555,8,Post 555 +556,45,Post 556 +557,10,Post 557 +558,42,Post 558 +559,96,Post 559 +560,38,Post 560 +561,74,Post 561 +562,10,Post 562 +563,20,Post 563 +564,38,Post 564 +565,37,Post 565 +566,64,Post 566 +567,27,Post 567 +568,70,Post 568 +569,56,Post 569 +570,37,Post 570 +571,38,Post 571 +572,52,Post 572 +573,8,Post 573 +574,72,Post 574 +575,60,Post 575 +576,70,Post 576 +577,52,Post 577 +578,10,Post 578 +579,38,Post 579 +580,38,Post 580 +581,27,Post 581 +582,5,Post 582 +583,70,Post 583 +584,10,Post 584 +585,52,Post 585 +586,68,Post 586 +587,56,Post 587 +588,92,Post 588 +589,8,Post 589 +590,76,Post 590 +591,5,Post 591 +592,52,Post 592 +593,38,Post 593 +594,52,Post 594 +595,31,Post 595 +596,19,Post 596 +597,2,Post 597 +598,52,Post 598 +599,72,Post 599 +600,32,Post 600 +601,20,Post 601 +602,8,Post 602 +603,8,Post 603 +604,20,Post 604 +605,8,Post 605 +606,20,Post 606 +607,8,Post 607 +608,8,Post 608 +609,74,Post 609 +610,15,Post 610 +611,52,Post 611 +612,70,Post 612 +613,42,Post 613 +614,13,Post 614 +615,19,Post 615 +616,38,Post 616 +617,52,Post 617 +618,28,Post 618 +619,72,Post 619 +620,70,Post 620 +621,89,Post 621 +622,4,Post 622 +623,83,Post 623 +624,36,Post 624 +625,79,Post 625 +626,67,Post 626 +627,98,Post 627 +628,70,Post 628 +629,31,Post 629 +630,52,Post 630 +631,33,Post 631 +632,31,Post 632 +633,20,Post 633 +634,51,Post 634 +635,66,Post 635 +636,20,Post 636 +637,52,Post 637 +638,10,Post 638 +639,15,Post 639 +640,7,Post 640 +641,94,Post 641 +642,0,Post 642 +643,18,Post 643 +644,52,Post 644 +645,8,Post 645 +646,80,Post 646 +647,70,Post 647 +648,93,Post 648 +649,52,Post 649 +650,23,Post 650 +651,52,Post 651 +652,89,Post 652 +653,52,Post 653 +654,20,Post 654 +655,79,Post 655 +656,32,Post 656 +657,0,Post 657 +658,20,Post 658 +659,27,Post 659 +660,74,Post 660 +661,43,Post 661 +662,40,Post 662 +663,27,Post 663 +664,80,Post 664 +665,89,Post 665 +666,98,Post 666 +667,33,Post 667 +668,93,Post 668 +669,72,Post 669 +670,65,Post 670 +671,20,Post 671 +672,20,Post 672 +673,17,Post 673 +674,89,Post 674 +675,23,Post 675 +676,42,Post 676 +677,50,Post 677 +678,71,Post 678 +679,72,Post 679 +680,13,Post 680 +681,38,Post 681 +682,72,Post 682 +683,72,Post 683 +684,8,Post 684 +685,14,Post 685 +686,24,Post 686 +687,8,Post 687 +688,38,Post 688 +689,9,Post 689 +690,52,Post 690 +691,20,Post 691 +692,52,Post 692 +693,10,Post 693 +694,95,Post 694 +695,89,Post 695 +696,36,Post 696 +697,20,Post 697 +698,20,Post 698 +699,48,Post 699 +700,6,Post 700 +701,56,Post 701 +702,38,Post 702 +703,33,Post 703 +704,72,Post 704 +705,70,Post 705 +706,91,Post 706 +707,28,Post 707 +708,83,Post 708 +709,70,Post 709 +710,29,Post 710 +711,52,Post 711 
+712,22,Post 712 +713,78,Post 713 +714,10,Post 714 +715,20,Post 715 +716,18,Post 716 +717,38,Post 717 +718,70,Post 718 +719,52,Post 719 +720,49,Post 720 +721,0,Post 721 +722,38,Post 722 +723,8,Post 723 +724,20,Post 724 +725,89,Post 725 +726,20,Post 726 +727,74,Post 727 +728,72,Post 728 +729,14,Post 729 +730,52,Post 730 +731,10,Post 731 +732,70,Post 732 +733,56,Post 733 +734,72,Post 734 +735,47,Post 735 +736,87,Post 736 +737,7,Post 737 +738,22,Post 738 +739,70,Post 739 +740,38,Post 740 +741,17,Post 741 +742,9,Post 742 +743,72,Post 743 +744,45,Post 744 +745,80,Post 745 +746,70,Post 746 +747,38,Post 747 +748,32,Post 748 +749,52,Post 749 +750,82,Post 750 +751,70,Post 751 +752,0,Post 752 +753,68,Post 753 +754,88,Post 754 +755,70,Post 755 +756,17,Post 756 +757,48,Post 757 +758,13,Post 758 +759,30,Post 759 +760,89,Post 760 +761,89,Post 761 +762,21,Post 762 +763,27,Post 763 +764,52,Post 764 +765,93,Post 765 +766,13,Post 766 +767,20,Post 767 +768,78,Post 768 +769,50,Post 769 +770,84,Post 770 +771,18,Post 771 +772,52,Post 772 +773,27,Post 773 +774,27,Post 774 +775,41,Post 775 +776,38,Post 776 +777,29,Post 777 +778,87,Post 778 +779,70,Post 779 +780,70,Post 780 +781,22,Post 781 +782,52,Post 782 +783,71,Post 783 +784,72,Post 784 +785,27,Post 785 +786,70,Post 786 +787,70,Post 787 +788,13,Post 788 +789,75,Post 789 +790,39,Post 790 +791,49,Post 791 +792,41,Post 792 +793,52,Post 793 +794,52,Post 794 +795,51,Post 795 +796,76,Post 796 +797,53,Post 797 +798,37,Post 798 +799,38,Post 799 +800,72,Post 800 +801,27,Post 801 +802,20,Post 802 +803,8,Post 803 +804,78,Post 804 +805,88,Post 805 +806,10,Post 806 +807,27,Post 807 +808,17,Post 808 +809,10,Post 809 +810,84,Post 810 +811,7,Post 811 +812,96,Post 812 +813,8,Post 813 +814,74,Post 814 +815,52,Post 815 +816,31,Post 816 +817,27,Post 817 +818,70,Post 818 +819,26,Post 819 +820,61,Post 820 +821,52,Post 821 +822,48,Post 822 +823,84,Post 823 +824,52,Post 824 +825,72,Post 825 +826,70,Post 826 +827,6,Post 827 +828,70,Post 828 +829,20,Post 829 +830,84,Post 830 +831,7,Post 831 +832,27,Post 832 +833,8,Post 833 +834,46,Post 834 +835,72,Post 835 +836,23,Post 836 +837,13,Post 837 +838,27,Post 838 +839,72,Post 839 +840,13,Post 840 +841,20,Post 841 +842,8,Post 842 +843,69,Post 843 +844,36,Post 844 +845,25,Post 845 +846,70,Post 846 +847,27,Post 847 +848,70,Post 848 +849,72,Post 849 +850,20,Post 850 +851,95,Post 851 +852,16,Post 852 +853,22,Post 853 +854,18,Post 854 +855,27,Post 855 +856,47,Post 856 +857,52,Post 857 +858,73,Post 858 +859,82,Post 859 +860,20,Post 860 +861,52,Post 861 +862,10,Post 862 +863,43,Post 863 +864,27,Post 864 +865,27,Post 865 +866,48,Post 866 +867,70,Post 867 +868,8,Post 868 +869,79,Post 869 +870,70,Post 870 +871,17,Post 871 +872,89,Post 872 +873,52,Post 873 +874,99,Post 874 +875,19,Post 875 +876,52,Post 876 +877,22,Post 877 +878,24,Post 878 +879,52,Post 879 +880,89,Post 880 +881,72,Post 881 +882,70,Post 882 +883,52,Post 883 +884,89,Post 884 +885,50,Post 885 +886,78,Post 886 +887,72,Post 887 +888,20,Post 888 +889,70,Post 889 +890,1,Post 890 +891,27,Post 891 +892,20,Post 892 +893,52,Post 893 +894,70,Post 894 +895,8,Post 895 +896,52,Post 896 +897,89,Post 897 +898,20,Post 898 +899,66,Post 899 +900,52,Post 900 +901,1,Post 901 +902,46,Post 902 +903,70,Post 903 +904,7,Post 904 +905,79,Post 905 +906,52,Post 906 +907,5,Post 907 +908,20,Post 908 +909,91,Post 909 +910,52,Post 910 +911,9,Post 911 +912,21,Post 912 +913,42,Post 913 +914,3,Post 914 +915,38,Post 915 +916,50,Post 916 +917,20,Post 917 +918,52,Post 918 +919,70,Post 919 +920,20,Post 920 +921,52,Post 921 
+922,56,Post 922 +923,90,Post 923 +924,71,Post 924 +925,72,Post 925 +926,50,Post 926 +927,18,Post 927 +928,98,Post 928 +929,12,Post 929 +930,45,Post 930 +931,8,Post 931 +932,89,Post 932 +933,93,Post 933 +934,70,Post 934 +935,28,Post 935 +936,20,Post 936 +937,20,Post 937 +938,12,Post 938 +939,52,Post 939 +940,13,Post 940 +941,27,Post 941 +942,53,Post 942 +943,70,Post 943 +944,3,Post 944 +945,38,Post 945 +946,59,Post 946 +947,73,Post 947 +948,46,Post 948 +949,93,Post 949 +950,20,Post 950 +951,2,Post 951 +952,48,Post 952 +953,20,Post 953 +954,72,Post 954 +955,20,Post 955 +956,25,Post 956 +957,72,Post 957 +958,70,Post 958 +959,52,Post 959 +960,69,Post 960 +961,38,Post 961 +962,0,Post 962 +963,1,Post 963 +964,52,Post 964 +965,8,Post 965 +966,7,Post 966 +967,93,Post 967 +968,74,Post 968 +969,13,Post 969 +970,0,Post 970 +971,89,Post 971 +972,21,Post 972 +973,18,Post 973 +974,68,Post 974 +975,9,Post 975 +976,20,Post 976 +977,95,Post 977 +978,56,Post 978 +979,52,Post 979 +980,37,Post 980 +981,70,Post 981 +982,13,Post 982 +983,93,Post 983 +984,74,Post 984 +985,52,Post 985 +986,7,Post 986 +987,68,Post 987 +988,87,Post 988 +989,52,Post 989 +990,29,Post 990 +991,11,Post 991 +992,70,Post 992 +993,81,Post 993 +994,8,Post 994 +995,70,Post 995 +996,2,Post 996 +997,93,Post 997 +998,52,Post 998 +999,52,Post 999 diff --git a/axolotl/tests/data/datasets/database_dataset_4/tables/users.csv b/axolotl/tests/data/datasets/database_dataset_4/tables/users.csv new file mode 100644 index 0000000..01f98db --- /dev/null +++ b/axolotl/tests/data/datasets/database_dataset_4/tables/users.csv @@ -0,0 +1,101 @@ +id,name +0,User 0 +1,User 1 +2,User 2 +3,User 3 +4,User 4 +5,User 5 +6,User 6 +7,User 7 +8,User 8 +9,User 9 +10,User 10 +11,User 11 +12,User 12 +13,User 13 +14,User 14 +15,User 15 +16,User 16 +17,User 17 +18,User 18 +19,User 19 +20,User 20 +21,User 21 +22,User 22 +23,User 23 +24,User 24 +25,User 25 +26,User 26 +27,User 27 +28,User 28 +29,User 29 +30,User 30 +31,User 31 +32,User 32 +33,User 33 +34,User 34 +35,User 35 +36,User 36 +37,User 37 +38,User 38 +39,User 39 +40,User 40 +41,User 41 +42,User 42 +43,User 43 +44,User 44 +45,User 45 +46,User 46 +47,User 47 +48,User 48 +49,User 49 +50,User 50 +51,User 51 +52,User 52 +53,User 53 +54,User 54 +55,User 55 +56,User 56 +57,User 57 +58,User 58 +59,User 59 +60,User 60 +61,User 61 +62,User 62 +63,User 63 +64,User 64 +65,User 65 +66,User 66 +67,User 67 +68,User 68 +69,User 69 +70,User 70 +71,User 71 +72,User 72 +73,User 73 +74,User 74 +75,User 75 +76,User 76 +77,User 77 +78,User 78 +79,User 79 +80,User 80 +81,User 81 +82,User 82 +83,User 83 +84,User 84 +85,User 85 +86,User 86 +87,User 87 +88,User 88 +89,User 89 +90,User 90 +91,User 91 +92,User 92 +93,User 93 +94,User 94 +95,User 95 +96,User 96 +97,User 97 +98,User 98 +99,User 99 diff --git a/axolotl/tests/data/datasets/graph_dataset_1/datasetDoc.json b/axolotl/tests/data/datasets/graph_dataset_1/datasetDoc.json new file mode 100644 index 0000000..a0acd62 --- /dev/null +++ b/axolotl/tests/data/datasets/graph_dataset_1/datasetDoc.json @@ -0,0 +1,68 @@ +{ + "about": { + "datasetID": "graph_dataset_1", + "datasetName": "Test graph dataset in GML format", + "description": "Based on LL1_net_nomination_seed_dataset", + "datasetSchemaVersion": "4.0.0", + "redacted": false, + "datasetVersion": "4.0.0", + "digest": "fcca6f975627035daa978a24e93aa74861aa93bb885eb8d782d24e5f46376f14" + }, + "dataResources": [ + { + "resID": "G1", + "resPath": "graphs/G1.gml", + "resType": "graph", + "resFormat": { + "text/vnd.gml": [ + "gml" + ] + }, + 
"isCollection": false + }, + { + "resID": "learningData", + "resPath": "tables/learningData.csv", + "resType": "table", + "resFormat": { + "text/csv": [ + "csv" + ] + }, + "isCollection": false, + "columnsCount": 3, + "columns": [ + { + "colIndex": 0, + "colName": "d3mIndex", + "colType": "integer", + "role": [ + "index" + ] + }, + { + "colIndex": 1, + "colName": "G1.nodeID", + "colType": "integer", + "role": [ + "attribute" + ], + "refersTo": { + "resID": "G1", + "resObject": { + "nodeAttribute": "nodeID" + } + } + }, + { + "colIndex": 2, + "colName": "classLabel", + "colType": "categorical", + "role": [ + "suggestedTarget" + ] + } + ] + } + ] +} \ No newline at end of file diff --git a/axolotl/tests/data/datasets/graph_dataset_1/graphs/G1.gml b/axolotl/tests/data/datasets/graph_dataset_1/graphs/G1.gml new file mode 100644 index 0000000..1522c17 --- /dev/null +++ b/axolotl/tests/data/datasets/graph_dataset_1/graphs/G1.gml @@ -0,0 +1,98 @@ +graph [ + node [ + id 0 + nodeID 0 + attr1 10.931067281322685 + attr2 -0.6081826070068602 + ] + node [ + id 1 + nodeID 1 + attr1 -7.6385951806129615 + attr2 -0.6245401364066232 + ] + node [ + id 2 + nodeID 2 + attr1 -7.827982471789538 + attr2 -1.6583217791337177 + ] + node [ + id 3 + nodeID 3 + attr1 -13.175150644300572 + attr2 0.059255494425681 + ] + node [ + id 4 + nodeID 4 + attr1 -2.858879300645913 + attr2 0.2877095029910792 + ] + node [ + id 5 + nodeID 5 + attr1 3.1166256979193085 + attr2 -0.9558118873968128 + ] + node [ + id 6 + nodeID 6 + attr1 -2.3460257528493025 + attr2 -0.9912505454192136 + ] + node [ + id 7 + nodeID 7 + attr1 4.279456640630548 + attr2 0.9571850297129592 + ] + node [ + id 8 + nodeID 8 + attr1 -1.3274504027623684 + attr2 0.008863588431931045 + ] + node [ + id 9 + nodeID 9 + attr1 2.9854996729947567 + attr2 -0.6257664276530307 + ] + node [ + id 10 + nodeID 10 + attr1 -8.126081560478179 + attr2 2.830732320647184 + ] + edge [ + source 0 + target 4 + edge_weight 1.0 + ] + edge [ + source 0 + target 5 + edge_weight 1.0 + ] + edge [ + source 1 + target 2 + edge_weight 0.9 + ] + edge [ + source 1 + target 3 + edge_weight 0.6 + ] + edge [ + source 1 + target 10 + edge_weight 1.0 + ] + edge [ + source 5 + target 7 + edge_weight 1.0 + ] +] diff --git a/axolotl/tests/data/datasets/graph_dataset_1/tables/learningData.csv b/axolotl/tests/data/datasets/graph_dataset_1/tables/learningData.csv new file mode 100644 index 0000000..22acdd5 --- /dev/null +++ b/axolotl/tests/data/datasets/graph_dataset_1/tables/learningData.csv @@ -0,0 +1,12 @@ +d3mIndex,G1.nodeID,classLabel +0,0,2 +1,1,0 +2,2,0 +3,3,0 +4,4,1 +5,5,2 +6,6,1 +7,7,2 +8,8,1 +9,9,2 +10,10,0 diff --git a/axolotl/tests/data/datasets/graph_dataset_2/datasetDoc.json b/axolotl/tests/data/datasets/graph_dataset_2/datasetDoc.json new file mode 100644 index 0000000..ac1deec --- /dev/null +++ b/axolotl/tests/data/datasets/graph_dataset_2/datasetDoc.json @@ -0,0 +1,118 @@ +{ + "about": { + "datasetID": "graph_dataset_2", + "datasetName": "Test graph dataset in edgelist format", + "description": "Based on LL1_EDGELIST_net_nomination_seed_dataset", + "datasetSchemaVersion": "4.0.0", + "redacted": false, + "datasetVersion": "4.0.0", + "digest": "ef138e993861a11f1d09a8b3662179eb0661c85a4e38d572be8555e32da712f1" + }, + "dataResources": [ + { + "resID": "learningData", + "resPath": "tables/learningData.csv", + "resType": "table", + "resFormat": { + "text/csv": [ + "csv" + ] + }, + "isCollection": false, + "columnsCount": 5, + "columns": [ + { + "colIndex": 0, + "colName": "d3mIndex", + "colType": 
"integer", + "role": [ + "index" + ] + }, + { + "colIndex": 1, + "colName": "nodeID", + "colType": "integer", + "role": [ + "attribute", + "key" + ] + }, + { + "colIndex": 2, + "colName": "attr1", + "colType": "real", + "role": [ + "attribute" + ] + }, + { + "colIndex": 3, + "colName": "attr2", + "colType": "real", + "role": [ + "attribute" + ] + }, + { + "colIndex": 4, + "colName": "classLabel", + "colType": "categorical", + "role": [ + "suggestedTarget" + ] + } + ] + }, + { + "resID": "edgeList", + "resPath": "tables/edgeList.csv", + "resType": "edgeList", + "resFormat": { + "text/csv": [ + "csv" + ] + }, + "isCollection": false, + "columnsCount": 3, + "columns": [ + { + "colIndex": 0, + "colName": "source", + "colType": "integer", + "role": [ + "attribute" + ], + "refersTo": { + "resID": "learningData", + "resObject": { + "columnName": "nodeID" + } + } + }, + { + "colIndex": 1, + "colName": "target", + "colType": "integer", + "role": [ + "attribute" + ], + "refersTo": { + "resID": "learningData", + "resObject": { + "columnName": "nodeID" + } + } + }, + { + "colIndex": 2, + "colName": "edge_weight", + "colType": "real", + "role": [ + "attribute" + ] + } + ] + } + ] +} \ No newline at end of file diff --git a/axolotl/tests/data/datasets/graph_dataset_2/tables/edgeList.csv b/axolotl/tests/data/datasets/graph_dataset_2/tables/edgeList.csv new file mode 100644 index 0000000..0701187 --- /dev/null +++ b/axolotl/tests/data/datasets/graph_dataset_2/tables/edgeList.csv @@ -0,0 +1,7 @@ +source,target,edge_weight +0,4,1.0 +0,5,1.0 +1,2,0.9 +1,3,0.6 +1,10,1.0 +5,7,1.0 diff --git a/axolotl/tests/data/datasets/graph_dataset_2/tables/learningData.csv b/axolotl/tests/data/datasets/graph_dataset_2/tables/learningData.csv new file mode 100644 index 0000000..4b0ba55 --- /dev/null +++ b/axolotl/tests/data/datasets/graph_dataset_2/tables/learningData.csv @@ -0,0 +1,12 @@ +d3mIndex,nodeID,attr1,attr2,classLabel +0,0,10.931067281322685,-0.6081826070068602,2 +1,1,-7.6385951806129615,-0.6245401364066232,0 +2,2,-7.827982471789538,-1.6583217791337177,0 +3,3,-13.175150644300572,0.059255494425681,0 +4,4,-2.858879300645913,0.2877095029910792,1 +5,5,3.1166256979193085,-0.9558118873968128,2 +6,6,-2.3460257528493025,-0.9912505454192136,1 +7,7,4.279456640630548,0.9571850297129592,2 +8,8,-1.3274504027623684,0.008863588431931045,1 +9,9,2.9854996729947567,-0.6257664276530307,2 +10,10,-8.126081560478179,2.830732320647184,0 diff --git a/axolotl/tests/data/datasets/image_dataset_1/datasetDoc.json b/axolotl/tests/data/datasets/image_dataset_1/datasetDoc.json new file mode 100644 index 0000000..333ce49 --- /dev/null +++ b/axolotl/tests/data/datasets/image_dataset_1/datasetDoc.json @@ -0,0 +1,71 @@ +{ + "about": { + "datasetID": "image_dataset_1", + "datasetName": "Image dataset to be used for tests", + "description": "There are a total of 5 image files, one is a left hand from the handgeometry dataset, two birds from cifar10 dataset and 2 figures from mnist dataset.", + "license": "Creative Commons Attribution-NonCommercial 4.0", + "approximateSize": "24 KB", + "datasetSchemaVersion": "4.0.0", + "redacted": false, + "datasetVersion": "4.0.0", + "digest": "7d8dc3f1f7bd1edce4caf2848f05114cc5c7c8bb0221310bde87965e02a2a927" + }, + "dataResources": [ + { + "resID": "0", + "resPath": "media/", + "resType": "image", + "resFormat": { + "image/jpeg": [ + "jpg" + ], + "image/png": [ + "png" + ] + }, + "isCollection": true + }, + { + "resID": "learningData", + "resPath": "tables/learningData.csv", + "resType": "table", + "resFormat": { + 
"text/csv": [ + "csv" + ] + }, + "isCollection": false, + "columnsCount": 3, + "columns": [ + { + "colIndex": 0, + "colName": "d3mIndex", + "colType": "integer", + "role": [ + "index" + ] + }, + { + "colIndex": 1, + "colName": "image_file", + "colType": "string", + "role": [ + "attribute" + ], + "refersTo": { + "resID": "0", + "resObject": "item" + } + }, + { + "colIndex": 2, + "colName": "class", + "colType": "categorical", + "role": [ + "suggestedTarget" + ] + } + ] + } + ] +} \ No newline at end of file diff --git a/axolotl/tests/data/datasets/image_dataset_1/media/001_HandPhoto_left_01.jpg b/axolotl/tests/data/datasets/image_dataset_1/media/001_HandPhoto_left_01.jpg new file mode 100644 index 0000000..abd4c83 Binary files /dev/null and b/axolotl/tests/data/datasets/image_dataset_1/media/001_HandPhoto_left_01.jpg differ diff --git a/axolotl/tests/data/datasets/image_dataset_1/media/cifar10_bird_1.png b/axolotl/tests/data/datasets/image_dataset_1/media/cifar10_bird_1.png new file mode 100644 index 0000000..3b73644 Binary files /dev/null and b/axolotl/tests/data/datasets/image_dataset_1/media/cifar10_bird_1.png differ diff --git a/axolotl/tests/data/datasets/image_dataset_1/media/cifar10_bird_2.png b/axolotl/tests/data/datasets/image_dataset_1/media/cifar10_bird_2.png new file mode 100644 index 0000000..2b63a56 Binary files /dev/null and b/axolotl/tests/data/datasets/image_dataset_1/media/cifar10_bird_2.png differ diff --git a/axolotl/tests/data/datasets/image_dataset_1/media/mnist_0_2.png b/axolotl/tests/data/datasets/image_dataset_1/media/mnist_0_2.png new file mode 100644 index 0000000..ee5c090 Binary files /dev/null and b/axolotl/tests/data/datasets/image_dataset_1/media/mnist_0_2.png differ diff --git a/axolotl/tests/data/datasets/image_dataset_1/media/mnist_1_1.png b/axolotl/tests/data/datasets/image_dataset_1/media/mnist_1_1.png new file mode 100644 index 0000000..3dbdfd3 Binary files /dev/null and b/axolotl/tests/data/datasets/image_dataset_1/media/mnist_1_1.png differ diff --git a/axolotl/tests/data/datasets/image_dataset_1/tables/learningData.csv b/axolotl/tests/data/datasets/image_dataset_1/tables/learningData.csv new file mode 100644 index 0000000..37658bc --- /dev/null +++ b/axolotl/tests/data/datasets/image_dataset_1/tables/learningData.csv @@ -0,0 +1,6 @@ +d3mIndex,image_file,class +0,mnist_0_2.png,mnist +1,mnist_1_1.png,mnist +2,001_HandPhoto_left_01.jpg,handgeometry +3,cifar10_bird_1.png,cifar +4,cifar10_bird_2.png,cifar diff --git a/axolotl/tests/data/datasets/image_dataset_2/datasetDoc.json b/axolotl/tests/data/datasets/image_dataset_2/datasetDoc.json new file mode 100644 index 0000000..fecdb19 --- /dev/null +++ b/axolotl/tests/data/datasets/image_dataset_2/datasetDoc.json @@ -0,0 +1,66 @@ +{ + "about": { + "datasetID": "image_dataset_2", + "datasetName": "Multiclass image classification", + "description": "Image recognition dataset consisting of 100 28x28 images of 10 handwritten digits. 
Based on 124_120_mnist_dataset.", + "datasetSchemaVersion": "4.0.0", + "redacted": false, + "datasetVersion": "4.0.0", + "digest": "526025607f71120946878fecc3351898ed6f9ef4dbafec1346af3248d52f5010" + }, + "dataResources": [ + { + "resID": "0", + "resPath": "media/", + "resType": "image", + "resFormat": { + "image/png": [ + "png" + ] + }, + "isCollection": true + }, + { + "resID": "learningData", + "resPath": "tables/learningData.csv", + "resType": "table", + "resFormat": { + "text/csv": [ + "csv" + ] + }, + "isCollection": false, + "columnsCount": 3, + "columns": [ + { + "colIndex": 0, + "colName": "d3mIndex", + "colType": "integer", + "role": [ + "index" + ] + }, + { + "colIndex": 1, + "colName": "image", + "colType": "string", + "role": [ + "attribute" + ], + "refersTo": { + "resID": "0", + "resObject": "item" + } + }, + { + "colIndex": 2, + "colName": "label", + "colType": "categorical", + "role": [ + "suggestedTarget" + ] + } + ] + } + ] +} \ No newline at end of file diff --git a/axolotl/tests/data/datasets/image_dataset_2/media/img_00000.png b/axolotl/tests/data/datasets/image_dataset_2/media/img_00000.png new file mode 100644 index 0000000..ce5888b Binary files /dev/null and b/axolotl/tests/data/datasets/image_dataset_2/media/img_00000.png differ diff --git a/axolotl/tests/data/datasets/image_dataset_2/media/img_00001.png b/axolotl/tests/data/datasets/image_dataset_2/media/img_00001.png new file mode 100644 index 0000000..5e05ec1 Binary files /dev/null and b/axolotl/tests/data/datasets/image_dataset_2/media/img_00001.png differ diff --git a/axolotl/tests/data/datasets/image_dataset_2/media/img_00002.png b/axolotl/tests/data/datasets/image_dataset_2/media/img_00002.png new file mode 100644 index 0000000..e43e690 Binary files /dev/null and b/axolotl/tests/data/datasets/image_dataset_2/media/img_00002.png differ diff --git a/axolotl/tests/data/datasets/image_dataset_2/media/img_00003.png b/axolotl/tests/data/datasets/image_dataset_2/media/img_00003.png new file mode 100644 index 0000000..48ba482 Binary files /dev/null and b/axolotl/tests/data/datasets/image_dataset_2/media/img_00003.png differ diff --git a/axolotl/tests/data/datasets/image_dataset_2/media/img_00004.png b/axolotl/tests/data/datasets/image_dataset_2/media/img_00004.png new file mode 100644 index 0000000..bd8a5cb Binary files /dev/null and b/axolotl/tests/data/datasets/image_dataset_2/media/img_00004.png differ diff --git a/axolotl/tests/data/datasets/image_dataset_2/media/img_00005.png b/axolotl/tests/data/datasets/image_dataset_2/media/img_00005.png new file mode 100644 index 0000000..e6bcd61 Binary files /dev/null and b/axolotl/tests/data/datasets/image_dataset_2/media/img_00005.png differ diff --git a/axolotl/tests/data/datasets/image_dataset_2/media/img_00006.png b/axolotl/tests/data/datasets/image_dataset_2/media/img_00006.png new file mode 100644 index 0000000..de53008 Binary files /dev/null and b/axolotl/tests/data/datasets/image_dataset_2/media/img_00006.png differ diff --git a/axolotl/tests/data/datasets/image_dataset_2/media/img_00007.png b/axolotl/tests/data/datasets/image_dataset_2/media/img_00007.png new file mode 100644 index 0000000..534ff5f Binary files /dev/null and b/axolotl/tests/data/datasets/image_dataset_2/media/img_00007.png differ diff --git a/axolotl/tests/data/datasets/image_dataset_2/media/img_00008.png b/axolotl/tests/data/datasets/image_dataset_2/media/img_00008.png new file mode 100644 index 0000000..10e4d08 Binary files /dev/null and 
b/axolotl/tests/data/datasets/image_dataset_2/media/img_00008.png differ diff --git a/axolotl/tests/data/datasets/image_dataset_2/media/img_00009.png b/axolotl/tests/data/datasets/image_dataset_2/media/img_00009.png new file mode 100644 index 0000000..c0ffaec Binary files /dev/null and b/axolotl/tests/data/datasets/image_dataset_2/media/img_00009.png differ diff --git a/axolotl/tests/data/datasets/image_dataset_2/media/img_00010.png b/axolotl/tests/data/datasets/image_dataset_2/media/img_00010.png new file mode 100644 index 0000000..cb711b0 Binary files /dev/null and b/axolotl/tests/data/datasets/image_dataset_2/media/img_00010.png differ diff --git a/axolotl/tests/data/datasets/image_dataset_2/media/img_00011.png b/axolotl/tests/data/datasets/image_dataset_2/media/img_00011.png new file mode 100644 index 0000000..6b0aae9 Binary files /dev/null and b/axolotl/tests/data/datasets/image_dataset_2/media/img_00011.png differ diff --git a/axolotl/tests/data/datasets/image_dataset_2/media/img_00012.png b/axolotl/tests/data/datasets/image_dataset_2/media/img_00012.png new file mode 100644 index 0000000..162c2b2 Binary files /dev/null and b/axolotl/tests/data/datasets/image_dataset_2/media/img_00012.png differ diff --git a/axolotl/tests/data/datasets/image_dataset_2/media/img_00013.png b/axolotl/tests/data/datasets/image_dataset_2/media/img_00013.png new file mode 100644 index 0000000..3ac16af Binary files /dev/null and b/axolotl/tests/data/datasets/image_dataset_2/media/img_00013.png differ diff --git a/axolotl/tests/data/datasets/image_dataset_2/media/img_00014.png b/axolotl/tests/data/datasets/image_dataset_2/media/img_00014.png new file mode 100644 index 0000000..7642af9 Binary files /dev/null and b/axolotl/tests/data/datasets/image_dataset_2/media/img_00014.png differ diff --git a/axolotl/tests/data/datasets/image_dataset_2/media/img_00015.png b/axolotl/tests/data/datasets/image_dataset_2/media/img_00015.png new file mode 100644 index 0000000..df66f30 Binary files /dev/null and b/axolotl/tests/data/datasets/image_dataset_2/media/img_00015.png differ diff --git a/axolotl/tests/data/datasets/image_dataset_2/media/img_00016.png b/axolotl/tests/data/datasets/image_dataset_2/media/img_00016.png new file mode 100644 index 0000000..d1fa66a Binary files /dev/null and b/axolotl/tests/data/datasets/image_dataset_2/media/img_00016.png differ diff --git a/axolotl/tests/data/datasets/image_dataset_2/media/img_00017.png b/axolotl/tests/data/datasets/image_dataset_2/media/img_00017.png new file mode 100644 index 0000000..b4dfbe4 Binary files /dev/null and b/axolotl/tests/data/datasets/image_dataset_2/media/img_00017.png differ diff --git a/axolotl/tests/data/datasets/image_dataset_2/media/img_00018.png b/axolotl/tests/data/datasets/image_dataset_2/media/img_00018.png new file mode 100644 index 0000000..2ae93f0 Binary files /dev/null and b/axolotl/tests/data/datasets/image_dataset_2/media/img_00018.png differ diff --git a/axolotl/tests/data/datasets/image_dataset_2/media/img_00019.png b/axolotl/tests/data/datasets/image_dataset_2/media/img_00019.png new file mode 100644 index 0000000..832098e Binary files /dev/null and b/axolotl/tests/data/datasets/image_dataset_2/media/img_00019.png differ diff --git a/axolotl/tests/data/datasets/image_dataset_2/media/img_00020.png b/axolotl/tests/data/datasets/image_dataset_2/media/img_00020.png new file mode 100644 index 0000000..bfb8f1d Binary files /dev/null and b/axolotl/tests/data/datasets/image_dataset_2/media/img_00020.png differ diff --git 
a/axolotl/tests/data/datasets/image_dataset_2/media/img_00021.png b/axolotl/tests/data/datasets/image_dataset_2/media/img_00021.png new file mode 100644 index 0000000..45573d6 Binary files /dev/null and b/axolotl/tests/data/datasets/image_dataset_2/media/img_00021.png differ diff --git a/axolotl/tests/data/datasets/image_dataset_2/media/img_00022.png b/axolotl/tests/data/datasets/image_dataset_2/media/img_00022.png new file mode 100644 index 0000000..4301a7d Binary files /dev/null and b/axolotl/tests/data/datasets/image_dataset_2/media/img_00022.png differ diff --git a/axolotl/tests/data/datasets/image_dataset_2/media/img_00023.png b/axolotl/tests/data/datasets/image_dataset_2/media/img_00023.png new file mode 100644 index 0000000..ffe4963 Binary files /dev/null and b/axolotl/tests/data/datasets/image_dataset_2/media/img_00023.png differ diff --git a/axolotl/tests/data/datasets/image_dataset_2/media/img_00024.png b/axolotl/tests/data/datasets/image_dataset_2/media/img_00024.png new file mode 100644 index 0000000..65b7f2f Binary files /dev/null and b/axolotl/tests/data/datasets/image_dataset_2/media/img_00024.png differ diff --git a/axolotl/tests/data/datasets/image_dataset_2/media/img_00025.png b/axolotl/tests/data/datasets/image_dataset_2/media/img_00025.png new file mode 100644 index 0000000..30bf46f Binary files /dev/null and b/axolotl/tests/data/datasets/image_dataset_2/media/img_00025.png differ diff --git a/axolotl/tests/data/datasets/image_dataset_2/media/img_00026.png b/axolotl/tests/data/datasets/image_dataset_2/media/img_00026.png new file mode 100644 index 0000000..62a4035 Binary files /dev/null and b/axolotl/tests/data/datasets/image_dataset_2/media/img_00026.png differ diff --git a/axolotl/tests/data/datasets/image_dataset_2/media/img_00027.png b/axolotl/tests/data/datasets/image_dataset_2/media/img_00027.png new file mode 100644 index 0000000..81a6d59 Binary files /dev/null and b/axolotl/tests/data/datasets/image_dataset_2/media/img_00027.png differ diff --git a/axolotl/tests/data/datasets/image_dataset_2/media/img_00028.png b/axolotl/tests/data/datasets/image_dataset_2/media/img_00028.png new file mode 100644 index 0000000..14b3a73 Binary files /dev/null and b/axolotl/tests/data/datasets/image_dataset_2/media/img_00028.png differ diff --git a/axolotl/tests/data/datasets/image_dataset_2/media/img_00029.png b/axolotl/tests/data/datasets/image_dataset_2/media/img_00029.png new file mode 100644 index 0000000..2deb9b7 Binary files /dev/null and b/axolotl/tests/data/datasets/image_dataset_2/media/img_00029.png differ diff --git a/axolotl/tests/data/datasets/image_dataset_2/media/img_00030.png b/axolotl/tests/data/datasets/image_dataset_2/media/img_00030.png new file mode 100644 index 0000000..6fa54e5 Binary files /dev/null and b/axolotl/tests/data/datasets/image_dataset_2/media/img_00030.png differ diff --git a/axolotl/tests/data/datasets/image_dataset_2/media/img_00031.png b/axolotl/tests/data/datasets/image_dataset_2/media/img_00031.png new file mode 100644 index 0000000..1d2a89c Binary files /dev/null and b/axolotl/tests/data/datasets/image_dataset_2/media/img_00031.png differ diff --git a/axolotl/tests/data/datasets/image_dataset_2/media/img_00032.png b/axolotl/tests/data/datasets/image_dataset_2/media/img_00032.png new file mode 100644 index 0000000..0ae3434 Binary files /dev/null and b/axolotl/tests/data/datasets/image_dataset_2/media/img_00032.png differ diff --git a/axolotl/tests/data/datasets/image_dataset_2/media/img_00033.png 
b/axolotl/tests/data/datasets/image_dataset_2/media/img_00033.png new file mode 100644 index 0000000..2de01a1 Binary files /dev/null and b/axolotl/tests/data/datasets/image_dataset_2/media/img_00033.png differ diff --git a/axolotl/tests/data/datasets/image_dataset_2/media/img_00034.png b/axolotl/tests/data/datasets/image_dataset_2/media/img_00034.png new file mode 100644 index 0000000..cb8c826 Binary files /dev/null and b/axolotl/tests/data/datasets/image_dataset_2/media/img_00034.png differ diff --git a/axolotl/tests/data/datasets/image_dataset_2/media/img_00035.png b/axolotl/tests/data/datasets/image_dataset_2/media/img_00035.png new file mode 100644 index 0000000..7e76e15 Binary files /dev/null and b/axolotl/tests/data/datasets/image_dataset_2/media/img_00035.png differ diff --git a/axolotl/tests/data/datasets/image_dataset_2/media/img_00036.png b/axolotl/tests/data/datasets/image_dataset_2/media/img_00036.png new file mode 100644 index 0000000..8361b7a Binary files /dev/null and b/axolotl/tests/data/datasets/image_dataset_2/media/img_00036.png differ diff --git a/axolotl/tests/data/datasets/image_dataset_2/media/img_00037.png b/axolotl/tests/data/datasets/image_dataset_2/media/img_00037.png new file mode 100644 index 0000000..be26c27 Binary files /dev/null and b/axolotl/tests/data/datasets/image_dataset_2/media/img_00037.png differ diff --git a/axolotl/tests/data/datasets/image_dataset_2/media/img_00038.png b/axolotl/tests/data/datasets/image_dataset_2/media/img_00038.png new file mode 100644 index 0000000..c80501c Binary files /dev/null and b/axolotl/tests/data/datasets/image_dataset_2/media/img_00038.png differ diff --git a/axolotl/tests/data/datasets/image_dataset_2/media/img_00039.png b/axolotl/tests/data/datasets/image_dataset_2/media/img_00039.png new file mode 100644 index 0000000..7b4dbf2 Binary files /dev/null and b/axolotl/tests/data/datasets/image_dataset_2/media/img_00039.png differ diff --git a/axolotl/tests/data/datasets/image_dataset_2/media/img_00040.png b/axolotl/tests/data/datasets/image_dataset_2/media/img_00040.png new file mode 100644 index 0000000..6cf9b89 Binary files /dev/null and b/axolotl/tests/data/datasets/image_dataset_2/media/img_00040.png differ diff --git a/axolotl/tests/data/datasets/image_dataset_2/media/img_00041.png b/axolotl/tests/data/datasets/image_dataset_2/media/img_00041.png new file mode 100644 index 0000000..71a025e Binary files /dev/null and b/axolotl/tests/data/datasets/image_dataset_2/media/img_00041.png differ diff --git a/axolotl/tests/data/datasets/image_dataset_2/media/img_00042.png b/axolotl/tests/data/datasets/image_dataset_2/media/img_00042.png new file mode 100644 index 0000000..3a7adf6 Binary files /dev/null and b/axolotl/tests/data/datasets/image_dataset_2/media/img_00042.png differ diff --git a/axolotl/tests/data/datasets/image_dataset_2/media/img_00043.png b/axolotl/tests/data/datasets/image_dataset_2/media/img_00043.png new file mode 100644 index 0000000..14d71a3 Binary files /dev/null and b/axolotl/tests/data/datasets/image_dataset_2/media/img_00043.png differ diff --git a/axolotl/tests/data/datasets/image_dataset_2/media/img_00044.png b/axolotl/tests/data/datasets/image_dataset_2/media/img_00044.png new file mode 100644 index 0000000..2f93cb4 Binary files /dev/null and b/axolotl/tests/data/datasets/image_dataset_2/media/img_00044.png differ diff --git a/axolotl/tests/data/datasets/image_dataset_2/media/img_00045.png b/axolotl/tests/data/datasets/image_dataset_2/media/img_00045.png new file mode 100644 index 0000000..471cf52 
Binary files /dev/null and b/axolotl/tests/data/datasets/image_dataset_2/media/img_00045.png differ diff --git a/axolotl/tests/data/datasets/image_dataset_2/media/img_00046.png b/axolotl/tests/data/datasets/image_dataset_2/media/img_00046.png new file mode 100644 index 0000000..e7b20ee Binary files /dev/null and b/axolotl/tests/data/datasets/image_dataset_2/media/img_00046.png differ diff --git a/axolotl/tests/data/datasets/image_dataset_2/media/img_00047.png b/axolotl/tests/data/datasets/image_dataset_2/media/img_00047.png new file mode 100644 index 0000000..7ab16ad Binary files /dev/null and b/axolotl/tests/data/datasets/image_dataset_2/media/img_00047.png differ diff --git a/axolotl/tests/data/datasets/image_dataset_2/media/img_00048.png b/axolotl/tests/data/datasets/image_dataset_2/media/img_00048.png new file mode 100644 index 0000000..c76d07c Binary files /dev/null and b/axolotl/tests/data/datasets/image_dataset_2/media/img_00048.png differ diff --git a/axolotl/tests/data/datasets/image_dataset_2/media/img_00049.png b/axolotl/tests/data/datasets/image_dataset_2/media/img_00049.png new file mode 100644 index 0000000..ae270f9 Binary files /dev/null and b/axolotl/tests/data/datasets/image_dataset_2/media/img_00049.png differ diff --git a/axolotl/tests/data/datasets/image_dataset_2/media/img_00050.png b/axolotl/tests/data/datasets/image_dataset_2/media/img_00050.png new file mode 100644 index 0000000..bc514fc Binary files /dev/null and b/axolotl/tests/data/datasets/image_dataset_2/media/img_00050.png differ diff --git a/axolotl/tests/data/datasets/image_dataset_2/media/img_00051.png b/axolotl/tests/data/datasets/image_dataset_2/media/img_00051.png new file mode 100644 index 0000000..55252b6 Binary files /dev/null and b/axolotl/tests/data/datasets/image_dataset_2/media/img_00051.png differ diff --git a/axolotl/tests/data/datasets/image_dataset_2/media/img_00052.png b/axolotl/tests/data/datasets/image_dataset_2/media/img_00052.png new file mode 100644 index 0000000..4bbfa5a Binary files /dev/null and b/axolotl/tests/data/datasets/image_dataset_2/media/img_00052.png differ diff --git a/axolotl/tests/data/datasets/image_dataset_2/media/img_00053.png b/axolotl/tests/data/datasets/image_dataset_2/media/img_00053.png new file mode 100644 index 0000000..e0876b3 Binary files /dev/null and b/axolotl/tests/data/datasets/image_dataset_2/media/img_00053.png differ diff --git a/axolotl/tests/data/datasets/image_dataset_2/media/img_00054.png b/axolotl/tests/data/datasets/image_dataset_2/media/img_00054.png new file mode 100644 index 0000000..288a47d Binary files /dev/null and b/axolotl/tests/data/datasets/image_dataset_2/media/img_00054.png differ diff --git a/axolotl/tests/data/datasets/image_dataset_2/media/img_00055.png b/axolotl/tests/data/datasets/image_dataset_2/media/img_00055.png new file mode 100644 index 0000000..66983ce Binary files /dev/null and b/axolotl/tests/data/datasets/image_dataset_2/media/img_00055.png differ diff --git a/axolotl/tests/data/datasets/image_dataset_2/media/img_00056.png b/axolotl/tests/data/datasets/image_dataset_2/media/img_00056.png new file mode 100644 index 0000000..3ef6d46 Binary files /dev/null and b/axolotl/tests/data/datasets/image_dataset_2/media/img_00056.png differ diff --git a/axolotl/tests/data/datasets/image_dataset_2/media/img_00057.png b/axolotl/tests/data/datasets/image_dataset_2/media/img_00057.png new file mode 100644 index 0000000..615240a Binary files /dev/null and b/axolotl/tests/data/datasets/image_dataset_2/media/img_00057.png differ diff --git 
a/axolotl/tests/data/datasets/image_dataset_2/media/img_00058.png b/axolotl/tests/data/datasets/image_dataset_2/media/img_00058.png new file mode 100644 index 0000000..4459f1a Binary files /dev/null and b/axolotl/tests/data/datasets/image_dataset_2/media/img_00058.png differ diff --git a/axolotl/tests/data/datasets/image_dataset_2/media/img_00059.png b/axolotl/tests/data/datasets/image_dataset_2/media/img_00059.png new file mode 100644 index 0000000..35cae9c Binary files /dev/null and b/axolotl/tests/data/datasets/image_dataset_2/media/img_00059.png differ diff --git a/axolotl/tests/data/datasets/image_dataset_2/media/img_00060.png b/axolotl/tests/data/datasets/image_dataset_2/media/img_00060.png new file mode 100644 index 0000000..9253b7b Binary files /dev/null and b/axolotl/tests/data/datasets/image_dataset_2/media/img_00060.png differ diff --git a/axolotl/tests/data/datasets/image_dataset_2/media/img_00061.png b/axolotl/tests/data/datasets/image_dataset_2/media/img_00061.png new file mode 100644 index 0000000..4befbf4 Binary files /dev/null and b/axolotl/tests/data/datasets/image_dataset_2/media/img_00061.png differ diff --git a/axolotl/tests/data/datasets/image_dataset_2/media/img_00062.png b/axolotl/tests/data/datasets/image_dataset_2/media/img_00062.png new file mode 100644 index 0000000..2dcf216 Binary files /dev/null and b/axolotl/tests/data/datasets/image_dataset_2/media/img_00062.png differ diff --git a/axolotl/tests/data/datasets/image_dataset_2/media/img_00063.png b/axolotl/tests/data/datasets/image_dataset_2/media/img_00063.png new file mode 100644 index 0000000..7944c5c Binary files /dev/null and b/axolotl/tests/data/datasets/image_dataset_2/media/img_00063.png differ diff --git a/axolotl/tests/data/datasets/image_dataset_2/media/img_00064.png b/axolotl/tests/data/datasets/image_dataset_2/media/img_00064.png new file mode 100644 index 0000000..80729a5 Binary files /dev/null and b/axolotl/tests/data/datasets/image_dataset_2/media/img_00064.png differ diff --git a/axolotl/tests/data/datasets/image_dataset_2/media/img_00065.png b/axolotl/tests/data/datasets/image_dataset_2/media/img_00065.png new file mode 100644 index 0000000..68099ee Binary files /dev/null and b/axolotl/tests/data/datasets/image_dataset_2/media/img_00065.png differ diff --git a/axolotl/tests/data/datasets/image_dataset_2/media/img_00066.png b/axolotl/tests/data/datasets/image_dataset_2/media/img_00066.png new file mode 100644 index 0000000..ea20530 Binary files /dev/null and b/axolotl/tests/data/datasets/image_dataset_2/media/img_00066.png differ diff --git a/axolotl/tests/data/datasets/image_dataset_2/media/img_00067.png b/axolotl/tests/data/datasets/image_dataset_2/media/img_00067.png new file mode 100644 index 0000000..51f0d78 Binary files /dev/null and b/axolotl/tests/data/datasets/image_dataset_2/media/img_00067.png differ diff --git a/axolotl/tests/data/datasets/image_dataset_2/media/img_00068.png b/axolotl/tests/data/datasets/image_dataset_2/media/img_00068.png new file mode 100644 index 0000000..ec1ad06 Binary files /dev/null and b/axolotl/tests/data/datasets/image_dataset_2/media/img_00068.png differ diff --git a/axolotl/tests/data/datasets/image_dataset_2/media/img_00069.png b/axolotl/tests/data/datasets/image_dataset_2/media/img_00069.png new file mode 100644 index 0000000..7202dbc Binary files /dev/null and b/axolotl/tests/data/datasets/image_dataset_2/media/img_00069.png differ diff --git a/axolotl/tests/data/datasets/image_dataset_2/media/img_00070.png 
b/axolotl/tests/data/datasets/image_dataset_2/media/img_00070.png new file mode 100644 index 0000000..4948dae Binary files /dev/null and b/axolotl/tests/data/datasets/image_dataset_2/media/img_00070.png differ diff --git a/axolotl/tests/data/datasets/image_dataset_2/media/img_00071.png b/axolotl/tests/data/datasets/image_dataset_2/media/img_00071.png new file mode 100644 index 0000000..54a9e89 Binary files /dev/null and b/axolotl/tests/data/datasets/image_dataset_2/media/img_00071.png differ diff --git a/axolotl/tests/data/datasets/image_dataset_2/media/img_00072.png b/axolotl/tests/data/datasets/image_dataset_2/media/img_00072.png new file mode 100644 index 0000000..6313678 Binary files /dev/null and b/axolotl/tests/data/datasets/image_dataset_2/media/img_00072.png differ diff --git a/axolotl/tests/data/datasets/image_dataset_2/media/img_00073.png b/axolotl/tests/data/datasets/image_dataset_2/media/img_00073.png new file mode 100644 index 0000000..c6c52f3 Binary files /dev/null and b/axolotl/tests/data/datasets/image_dataset_2/media/img_00073.png differ diff --git a/axolotl/tests/data/datasets/image_dataset_2/media/img_00074.png b/axolotl/tests/data/datasets/image_dataset_2/media/img_00074.png new file mode 100644 index 0000000..9ea0334 Binary files /dev/null and b/axolotl/tests/data/datasets/image_dataset_2/media/img_00074.png differ diff --git a/axolotl/tests/data/datasets/image_dataset_2/media/img_00075.png b/axolotl/tests/data/datasets/image_dataset_2/media/img_00075.png new file mode 100644 index 0000000..4dfe7e1 Binary files /dev/null and b/axolotl/tests/data/datasets/image_dataset_2/media/img_00075.png differ diff --git a/axolotl/tests/data/datasets/image_dataset_2/media/img_00076.png b/axolotl/tests/data/datasets/image_dataset_2/media/img_00076.png new file mode 100644 index 0000000..0e51f47 Binary files /dev/null and b/axolotl/tests/data/datasets/image_dataset_2/media/img_00076.png differ diff --git a/axolotl/tests/data/datasets/image_dataset_2/media/img_00077.png b/axolotl/tests/data/datasets/image_dataset_2/media/img_00077.png new file mode 100644 index 0000000..e15b895 Binary files /dev/null and b/axolotl/tests/data/datasets/image_dataset_2/media/img_00077.png differ diff --git a/axolotl/tests/data/datasets/image_dataset_2/media/img_00078.png b/axolotl/tests/data/datasets/image_dataset_2/media/img_00078.png new file mode 100644 index 0000000..a4f53ff Binary files /dev/null and b/axolotl/tests/data/datasets/image_dataset_2/media/img_00078.png differ diff --git a/axolotl/tests/data/datasets/image_dataset_2/media/img_00079.png b/axolotl/tests/data/datasets/image_dataset_2/media/img_00079.png new file mode 100644 index 0000000..b71c8d5 Binary files /dev/null and b/axolotl/tests/data/datasets/image_dataset_2/media/img_00079.png differ diff --git a/axolotl/tests/data/datasets/image_dataset_2/media/img_00080.png b/axolotl/tests/data/datasets/image_dataset_2/media/img_00080.png new file mode 100644 index 0000000..91e9a14 Binary files /dev/null and b/axolotl/tests/data/datasets/image_dataset_2/media/img_00080.png differ diff --git a/axolotl/tests/data/datasets/image_dataset_2/media/img_00081.png b/axolotl/tests/data/datasets/image_dataset_2/media/img_00081.png new file mode 100644 index 0000000..935423f Binary files /dev/null and b/axolotl/tests/data/datasets/image_dataset_2/media/img_00081.png differ diff --git a/axolotl/tests/data/datasets/image_dataset_2/media/img_00082.png b/axolotl/tests/data/datasets/image_dataset_2/media/img_00082.png new file mode 100644 index 0000000..bbfc0cb 
Binary files /dev/null and b/axolotl/tests/data/datasets/image_dataset_2/media/img_00082.png differ diff --git a/axolotl/tests/data/datasets/image_dataset_2/media/img_00083.png b/axolotl/tests/data/datasets/image_dataset_2/media/img_00083.png new file mode 100644 index 0000000..aac35fb Binary files /dev/null and b/axolotl/tests/data/datasets/image_dataset_2/media/img_00083.png differ diff --git a/axolotl/tests/data/datasets/image_dataset_2/media/img_00084.png b/axolotl/tests/data/datasets/image_dataset_2/media/img_00084.png new file mode 100644 index 0000000..bf64b3f Binary files /dev/null and b/axolotl/tests/data/datasets/image_dataset_2/media/img_00084.png differ diff --git a/axolotl/tests/data/datasets/image_dataset_2/media/img_00085.png b/axolotl/tests/data/datasets/image_dataset_2/media/img_00085.png new file mode 100644 index 0000000..d9a2f78 Binary files /dev/null and b/axolotl/tests/data/datasets/image_dataset_2/media/img_00085.png differ diff --git a/axolotl/tests/data/datasets/image_dataset_2/media/img_00086.png b/axolotl/tests/data/datasets/image_dataset_2/media/img_00086.png new file mode 100644 index 0000000..c6b99f0 Binary files /dev/null and b/axolotl/tests/data/datasets/image_dataset_2/media/img_00086.png differ diff --git a/axolotl/tests/data/datasets/image_dataset_2/media/img_00087.png b/axolotl/tests/data/datasets/image_dataset_2/media/img_00087.png new file mode 100644 index 0000000..cfb2887 Binary files /dev/null and b/axolotl/tests/data/datasets/image_dataset_2/media/img_00087.png differ diff --git a/axolotl/tests/data/datasets/image_dataset_2/media/img_00088.png b/axolotl/tests/data/datasets/image_dataset_2/media/img_00088.png new file mode 100644 index 0000000..c51a58c Binary files /dev/null and b/axolotl/tests/data/datasets/image_dataset_2/media/img_00088.png differ diff --git a/axolotl/tests/data/datasets/image_dataset_2/media/img_00089.png b/axolotl/tests/data/datasets/image_dataset_2/media/img_00089.png new file mode 100644 index 0000000..d39d1a8 Binary files /dev/null and b/axolotl/tests/data/datasets/image_dataset_2/media/img_00089.png differ diff --git a/axolotl/tests/data/datasets/image_dataset_2/media/img_00090.png b/axolotl/tests/data/datasets/image_dataset_2/media/img_00090.png new file mode 100644 index 0000000..3bb43ff Binary files /dev/null and b/axolotl/tests/data/datasets/image_dataset_2/media/img_00090.png differ diff --git a/axolotl/tests/data/datasets/image_dataset_2/media/img_00091.png b/axolotl/tests/data/datasets/image_dataset_2/media/img_00091.png new file mode 100644 index 0000000..67cdb45 Binary files /dev/null and b/axolotl/tests/data/datasets/image_dataset_2/media/img_00091.png differ diff --git a/axolotl/tests/data/datasets/image_dataset_2/media/img_00092.png b/axolotl/tests/data/datasets/image_dataset_2/media/img_00092.png new file mode 100644 index 0000000..680e9c9 Binary files /dev/null and b/axolotl/tests/data/datasets/image_dataset_2/media/img_00092.png differ diff --git a/axolotl/tests/data/datasets/image_dataset_2/media/img_00093.png b/axolotl/tests/data/datasets/image_dataset_2/media/img_00093.png new file mode 100644 index 0000000..dacf0fe Binary files /dev/null and b/axolotl/tests/data/datasets/image_dataset_2/media/img_00093.png differ diff --git a/axolotl/tests/data/datasets/image_dataset_2/media/img_00094.png b/axolotl/tests/data/datasets/image_dataset_2/media/img_00094.png new file mode 100644 index 0000000..88e1755 Binary files /dev/null and b/axolotl/tests/data/datasets/image_dataset_2/media/img_00094.png differ diff --git 
a/axolotl/tests/data/datasets/image_dataset_2/media/img_00095.png b/axolotl/tests/data/datasets/image_dataset_2/media/img_00095.png new file mode 100644 index 0000000..e02a6d4 Binary files /dev/null and b/axolotl/tests/data/datasets/image_dataset_2/media/img_00095.png differ diff --git a/axolotl/tests/data/datasets/image_dataset_2/media/img_00096.png b/axolotl/tests/data/datasets/image_dataset_2/media/img_00096.png new file mode 100644 index 0000000..2da8e1a Binary files /dev/null and b/axolotl/tests/data/datasets/image_dataset_2/media/img_00096.png differ diff --git a/axolotl/tests/data/datasets/image_dataset_2/media/img_00097.png b/axolotl/tests/data/datasets/image_dataset_2/media/img_00097.png new file mode 100644 index 0000000..d798e15 Binary files /dev/null and b/axolotl/tests/data/datasets/image_dataset_2/media/img_00097.png differ diff --git a/axolotl/tests/data/datasets/image_dataset_2/media/img_00098.png b/axolotl/tests/data/datasets/image_dataset_2/media/img_00098.png new file mode 100644 index 0000000..387f3c9 Binary files /dev/null and b/axolotl/tests/data/datasets/image_dataset_2/media/img_00098.png differ diff --git a/axolotl/tests/data/datasets/image_dataset_2/tables/learningData.csv b/axolotl/tests/data/datasets/image_dataset_2/tables/learningData.csv new file mode 100644 index 0000000..b055477 --- /dev/null +++ b/axolotl/tests/data/datasets/image_dataset_2/tables/learningData.csv @@ -0,0 +1,100 @@ +d3mIndex,image,label +0,img_00000.png,5 +1,img_00001.png,0 +2,img_00002.png,4 +3,img_00003.png,1 +4,img_00004.png,9 +5,img_00005.png,2 +6,img_00006.png,1 +7,img_00007.png,3 +8,img_00008.png,1 +9,img_00009.png,4 +10,img_00010.png,3 +11,img_00011.png,5 +12,img_00012.png,3 +13,img_00013.png,6 +14,img_00014.png,1 +15,img_00015.png,7 +16,img_00016.png,2 +17,img_00017.png,8 +18,img_00018.png,6 +19,img_00019.png,9 +20,img_00020.png,4 +21,img_00021.png,0 +22,img_00022.png,9 +23,img_00023.png,1 +24,img_00024.png,1 +25,img_00025.png,2 +26,img_00026.png,4 +27,img_00027.png,3 +28,img_00028.png,2 +29,img_00029.png,7 +30,img_00030.png,3 +31,img_00031.png,8 +32,img_00032.png,6 +33,img_00033.png,9 +34,img_00034.png,0 +35,img_00035.png,5 +36,img_00036.png,6 +37,img_00037.png,0 +38,img_00038.png,7 +39,img_00039.png,6 +40,img_00040.png,1 +41,img_00041.png,8 +42,img_00042.png,7 +43,img_00043.png,9 +44,img_00044.png,3 +45,img_00045.png,9 +46,img_00046.png,8 +47,img_00047.png,5 +48,img_00048.png,9 +49,img_00049.png,3 +50,img_00050.png,3 +51,img_00051.png,0 +52,img_00052.png,7 +53,img_00053.png,4 +54,img_00054.png,9 +55,img_00055.png,8 +56,img_00056.png,0 +57,img_00057.png,9 +58,img_00058.png,4 +59,img_00059.png,1 +60,img_00060.png,4 +61,img_00061.png,4 +62,img_00062.png,6 +63,img_00063.png,0 +64,img_00064.png,4 +65,img_00065.png,5 +66,img_00066.png,6 +67,img_00067.png,1 +68,img_00068.png,0 +69,img_00069.png,0 +70,img_00070.png,1 +71,img_00071.png,7 +72,img_00072.png,1 +73,img_00073.png,6 +74,img_00074.png,3 +75,img_00075.png,0 +76,img_00076.png,2 +77,img_00077.png,1 +78,img_00078.png,1 +79,img_00079.png,7 +80,img_00080.png,9 +81,img_00081.png,0 +82,img_00082.png,2 +83,img_00083.png,6 +84,img_00084.png,7 +85,img_00085.png,8 +86,img_00086.png,3 +87,img_00087.png,9 +88,img_00088.png,0 +89,img_00089.png,4 +90,img_00090.png,6 +91,img_00091.png,7 +92,img_00092.png,4 +93,img_00093.png,6 +94,img_00094.png,8 +95,img_00095.png,0 +96,img_00096.png,7 +97,img_00097.png,8 +98,img_00098.png,3 diff --git a/axolotl/tests/data/datasets/iris_dataset_1/datasetDoc.json 
b/axolotl/tests/data/datasets/iris_dataset_1/datasetDoc.json new file mode 100644 index 0000000..4ade4c7 --- /dev/null +++ b/axolotl/tests/data/datasets/iris_dataset_1/datasetDoc.json @@ -0,0 +1,76 @@ +{ + "about": { + "datasetID": "iris_dataset_1", + "datasetName": "Iris Dataset", + "humanSubjectsResearch": false, + "license": "CC", + "datasetSchemaVersion": "4.0.0", + "redacted": false, + "digest": "6191a49372f185f530920ffa35a3c4a78034ec47247aa23474537c449d37323b", + "datasetVersion": "4.0.0" + }, + "dataResources": [ + { + "resID": "learningData", + "resPath": "tables/learningData.csv", + "resType": "table", + "resFormat": { + "text/csv": [ + "csv" + ] + }, + "isCollection": false, + "columnsCount": 6, + "columns": [ + { + "colIndex": 0, + "colName": "d3mIndex", + "colType": "integer", + "role": [ + "index" + ] + }, + { + "colIndex": 1, + "colName": "sepalLength", + "colType": "real", + "role": [ + "attribute" + ] + }, + { + "colIndex": 2, + "colName": "sepalWidth", + "colType": "real", + "role": [ + "attribute" + ] + }, + { + "colIndex": 3, + "colName": "petalLength", + "colType": "real", + "role": [ + "attribute" + ] + }, + { + "colIndex": 4, + "colName": "petalWidth", + "colType": "real", + "role": [ + "attribute" + ] + }, + { + "colIndex": 5, + "colName": "species", + "colType": "categorical", + "role": [ + "suggestedTarget" + ] + } + ] + } + ] +} \ No newline at end of file diff --git a/axolotl/tests/data/datasets/iris_dataset_1/tables/learningData.csv b/axolotl/tests/data/datasets/iris_dataset_1/tables/learningData.csv new file mode 100644 index 0000000..bce7479 --- /dev/null +++ b/axolotl/tests/data/datasets/iris_dataset_1/tables/learningData.csv @@ -0,0 +1,151 @@ +d3mIndex,sepalLength,sepalWidth,petalLength,petalWidth,species +0,5.1,3.5,1.4,0.2,Iris-setosa +1,4.9,3,1.4,0.2,Iris-setosa +2,4.7,3.2,1.3,0.2,Iris-setosa +3,4.6,3.1,1.5,0.2,Iris-setosa +4,5,3.6,1.4,0.2,Iris-setosa +5,5.4,3.9,1.7,0.4,Iris-setosa +6,4.6,3.4,1.4,0.3,Iris-setosa +7,5,3.4,1.5,0.2,Iris-setosa +8,4.4,2.9,1.4,0.2,Iris-setosa +9,4.9,3.1,1.5,0.1,Iris-setosa +10,5.4,3.7,1.5,0.2,Iris-setosa +11,4.8,3.4,1.6,0.2,Iris-setosa +12,4.8,3,1.4,0.1,Iris-setosa +13,4.3,3,1.1,0.1,Iris-setosa +14,5.8,4,1.2,0.2,Iris-setosa +15,5.7,4.4,1.5,0.4,Iris-setosa +16,5.4,3.9,1.3,0.4,Iris-setosa +17,5.1,3.5,1.4,0.3,Iris-setosa +18,5.7,3.8,1.7,0.3,Iris-setosa +19,5.1,3.8,1.5,0.3,Iris-setosa +20,5.4,3.4,1.7,0.2,Iris-setosa +21,5.1,3.7,1.5,0.4,Iris-setosa +22,4.6,3.6,1,0.2,Iris-setosa +23,5.1,3.3,1.7,0.5,Iris-setosa +24,4.8,3.4,1.9,0.2,Iris-setosa +25,5,3,1.6,0.2,Iris-setosa +26,5,3.4,1.6,0.4,Iris-setosa +27,5.2,3.5,1.5,0.2,Iris-setosa +28,5.2,3.4,1.4,0.2,Iris-setosa +29,4.7,3.2,1.6,0.2,Iris-setosa +30,4.8,3.1,1.6,0.2,Iris-setosa +31,5.4,3.4,1.5,0.4,Iris-setosa +32,5.2,4.1,1.5,0.1,Iris-setosa +33,5.5,4.2,1.4,0.2,Iris-setosa +34,4.9,3.1,1.5,0.1,Iris-setosa +35,5,3.2,1.2,0.2,Iris-setosa +36,5.5,3.5,1.3,0.2,Iris-setosa +37,4.9,3.1,1.5,0.1,Iris-setosa +38,4.4,3,1.3,0.2,Iris-setosa +39,5.1,3.4,1.5,0.2,Iris-setosa +40,5,3.5,1.3,0.3,Iris-setosa +41,4.5,2.3,1.3,0.3,Iris-setosa +42,4.4,3.2,1.3,0.2,Iris-setosa +43,5,3.5,1.6,0.6,Iris-setosa +44,5.1,3.8,1.9,0.4,Iris-setosa +45,4.8,3,1.4,0.3,Iris-setosa +46,5.1,3.8,1.6,0.2,Iris-setosa +47,4.6,3.2,1.4,0.2,Iris-setosa +48,5.3,3.7,1.5,0.2,Iris-setosa +49,5,3.3,1.4,0.2,Iris-setosa +50,7,3.2,4.7,1.4,Iris-versicolor +51,6.4,3.2,4.5,1.5,Iris-versicolor +52,6.9,3.1,4.9,1.5,Iris-versicolor +53,5.5,2.3,4,1.3,Iris-versicolor +54,6.5,2.8,4.6,1.5,Iris-versicolor +55,5.7,2.8,4.5,1.3,Iris-versicolor 
+56,6.3,3.3,4.7,1.6,Iris-versicolor +57,4.9,2.4,3.3,1,Iris-versicolor +58,6.6,2.9,4.6,1.3,Iris-versicolor +59,5.2,2.7,3.9,1.4,Iris-versicolor +60,5,2,3.5,1,Iris-versicolor +61,5.9,3,4.2,1.5,Iris-versicolor +62,6,2.2,4,1,Iris-versicolor +63,6.1,2.9,4.7,1.4,Iris-versicolor +64,5.6,2.9,3.6,1.3,Iris-versicolor +65,6.7,3.1,4.4,1.4,Iris-versicolor +66,5.6,3,4.5,1.5,Iris-versicolor +67,5.8,2.7,4.1,1,Iris-versicolor +68,6.2,2.2,4.5,1.5,Iris-versicolor +69,5.6,2.5,3.9,1.1,Iris-versicolor +70,5.9,3.2,4.8,1.8,Iris-versicolor +71,6.1,2.8,4,1.3,Iris-versicolor +72,6.3,2.5,4.9,1.5,Iris-versicolor +73,6.1,2.8,4.7,1.2,Iris-versicolor +74,6.4,2.9,4.3,1.3,Iris-versicolor +75,6.6,3,4.4,1.4,Iris-versicolor +76,6.8,2.8,4.8,1.4,Iris-versicolor +77,6.7,3,5,1.7,Iris-versicolor +78,6,2.9,4.5,1.5,Iris-versicolor +79,5.7,2.6,3.5,1,Iris-versicolor +80,5.5,2.4,3.8,1.1,Iris-versicolor +81,5.5,2.4,3.7,1,Iris-versicolor +82,5.8,2.7,3.9,1.2,Iris-versicolor +83,6,2.7,5.1,1.6,Iris-versicolor +84,5.4,3,4.5,1.5,Iris-versicolor +85,6,3.4,4.5,1.6,Iris-versicolor +86,6.7,3.1,4.7,1.5,Iris-versicolor +87,6.3,2.3,4.4,1.3,Iris-versicolor +88,5.6,3,4.1,1.3,Iris-versicolor +89,5.5,2.5,4,1.3,Iris-versicolor +90,5.5,2.6,4.4,1.2,Iris-versicolor +91,6.1,3,4.6,1.4,Iris-versicolor +92,5.8,2.6,4,1.2,Iris-versicolor +93,5,2.3,3.3,1,Iris-versicolor +94,5.6,2.7,4.2,1.3,Iris-versicolor +95,5.7,3,4.2,1.2,Iris-versicolor +96,5.7,2.9,4.2,1.3,Iris-versicolor +97,6.2,2.9,4.3,1.3,Iris-versicolor +98,5.1,2.5,3,1.1,Iris-versicolor +99,5.7,2.8,4.1,1.3,Iris-versicolor +100,6.3,3.3,6,2.5,Iris-virginica +101,5.8,2.7,5.1,1.9,Iris-virginica +102,7.1,3,5.9,2.1,Iris-virginica +103,6.3,2.9,5.6,1.8,Iris-virginica +104,6.5,3,5.8,2.2,Iris-virginica +105,7.6,3,6.6,2.1,Iris-virginica +106,4.9,2.5,4.5,1.7,Iris-virginica +107,7.3,2.9,6.3,1.8,Iris-virginica +108,6.7,2.5,5.8,1.8,Iris-virginica +109,7.2,3.6,6.1,2.5,Iris-virginica +110,6.5,3.2,5.1,2,Iris-virginica +111,6.4,2.7,5.3,1.9,Iris-virginica +112,6.8,3,5.5,2.1,Iris-virginica +113,5.7,2.5,5,2,Iris-virginica +114,5.8,2.8,5.1,2.4,Iris-virginica +115,6.4,3.2,5.3,2.3,Iris-virginica +116,6.5,3,5.5,1.8,Iris-virginica +117,7.7,3.8,6.7,2.2,Iris-virginica +118,7.7,2.6,6.9,2.3,Iris-virginica +119,6,2.2,5,1.5,Iris-virginica +120,6.9,3.2,5.7,2.3,Iris-virginica +121,5.6,2.8,4.9,2,Iris-virginica +122,7.7,2.8,6.7,2,Iris-virginica +123,6.3,2.7,4.9,1.8,Iris-virginica +124,6.7,3.3,5.7,2.1,Iris-virginica +125,7.2,3.2,6,1.8,Iris-virginica +126,6.2,2.8,4.8,1.8,Iris-virginica +127,6.1,3,4.9,1.8,Iris-virginica +128,6.4,2.8,5.6,2.1,Iris-virginica +129,7.2,3,5.8,1.6,Iris-virginica +130,7.4,2.8,6.1,1.9,Iris-virginica +131,7.9,3.8,6.4,2,Iris-virginica +132,6.4,2.8,5.6,2.2,Iris-virginica +133,6.3,2.8,5.1,1.5,Iris-virginica +134,6.1,2.6,5.6,1.4,Iris-virginica +135,7.7,3,6.1,2.3,Iris-virginica +136,6.3,3.4,5.6,2.4,Iris-virginica +137,6.4,3.1,5.5,1.8,Iris-virginica +138,6,3,4.8,1.8,Iris-virginica +139,6.9,3.1,5.4,2.1,Iris-virginica +140,6.7,3.1,5.6,2.4,Iris-virginica +141,6.9,3.1,5.1,2.3,Iris-virginica +142,5.8,2.7,5.1,1.9,Iris-virginica +143,6.8,3.2,5.9,2.3,Iris-virginica +144,6.7,3.3,5.7,2.5,Iris-virginica +145,6.7,3,5.2,2.3,Iris-virginica +146,6.3,2.5,5,1.9,Iris-virginica +147,6.5,3,5.2,2,Iris-virginica +148,6.2,3.4,5.4,2.3,Iris-virginica +149,5.9,3,5.1,1.8,Iris-virginica diff --git a/axolotl/tests/data/datasets/iris_dataset_2/datasetDoc.json b/axolotl/tests/data/datasets/iris_dataset_2/datasetDoc.json new file mode 100644 index 0000000..a2212a1 --- /dev/null +++ b/axolotl/tests/data/datasets/iris_dataset_2/datasetDoc.json @@ -0,0 +1,25 @@ +{ 
+ "about": { + "datasetID": "iris_dataset_2", + "datasetName": "Iris Dataset without metadata and d3mIndex", + "humanSubjectsResearch": false, + "license": "CC", + "datasetSchemaVersion": "4.0.0", + "redacted": false, + "digest": "54421aba799c789ad19b7f244aad168da3f5ea4cdb6bfdc7d0dae473f45898f9", + "datasetVersion": "4.0.0" + }, + "dataResources": [ + { + "resID": "learningData", + "resPath": "tables/learningData.csv", + "resType": "table", + "resFormat": { + "text/csv": [ + "csv" + ] + }, + "isCollection": false + } + ] +} \ No newline at end of file diff --git a/axolotl/tests/data/datasets/iris_dataset_2/tables/learningData.csv b/axolotl/tests/data/datasets/iris_dataset_2/tables/learningData.csv new file mode 100644 index 0000000..4e455dc --- /dev/null +++ b/axolotl/tests/data/datasets/iris_dataset_2/tables/learningData.csv @@ -0,0 +1,151 @@ +sepalLength,sepalWidth,petalLength,petalWidth,species +5.1,3.5,1.4,0.2,Iris-setosa +4.9,3,1.4,0.2,Iris-setosa +4.7,3.2,1.3,0.2,Iris-setosa +4.6,3.1,1.5,0.2,Iris-setosa +5,3.6,1.4,0.2,Iris-setosa +5.4,3.9,1.7,0.4,Iris-setosa +4.6,3.4,1.4,0.3,Iris-setosa +5,3.4,1.5,0.2,Iris-setosa +4.4,2.9,1.4,0.2,Iris-setosa +4.9,3.1,1.5,0.1,Iris-setosa +5.4,3.7,1.5,0.2,Iris-setosa +4.8,3.4,1.6,0.2,Iris-setosa +4.8,3,1.4,0.1,Iris-setosa +4.3,3,1.1,0.1,Iris-setosa +5.8,4,1.2,0.2,Iris-setosa +5.7,4.4,1.5,0.4,Iris-setosa +5.4,3.9,1.3,0.4,Iris-setosa +5.1,3.5,1.4,0.3,Iris-setosa +5.7,3.8,1.7,0.3,Iris-setosa +5.1,3.8,1.5,0.3,Iris-setosa +5.4,3.4,1.7,0.2,Iris-setosa +5.1,3.7,1.5,0.4,Iris-setosa +4.6,3.6,1,0.2,Iris-setosa +5.1,3.3,1.7,0.5,Iris-setosa +4.8,3.4,1.9,0.2,Iris-setosa +5,3,1.6,0.2,Iris-setosa +5,3.4,1.6,0.4,Iris-setosa +5.2,3.5,1.5,0.2,Iris-setosa +5.2,3.4,1.4,0.2,Iris-setosa +4.7,3.2,1.6,0.2,Iris-setosa +4.8,3.1,1.6,0.2,Iris-setosa +5.4,3.4,1.5,0.4,Iris-setosa +5.2,4.1,1.5,0.1,Iris-setosa +5.5,4.2,1.4,0.2,Iris-setosa +4.9,3.1,1.5,0.1,Iris-setosa +5,3.2,1.2,0.2,Iris-setosa +5.5,3.5,1.3,0.2,Iris-setosa +4.9,3.1,1.5,0.1,Iris-setosa +4.4,3,1.3,0.2,Iris-setosa +5.1,3.4,1.5,0.2,Iris-setosa +5,3.5,1.3,0.3,Iris-setosa +4.5,2.3,1.3,0.3,Iris-setosa +4.4,3.2,1.3,0.2,Iris-setosa +5,3.5,1.6,0.6,Iris-setosa +5.1,3.8,1.9,0.4,Iris-setosa +4.8,3,1.4,0.3,Iris-setosa +5.1,3.8,1.6,0.2,Iris-setosa +4.6,3.2,1.4,0.2,Iris-setosa +5.3,3.7,1.5,0.2,Iris-setosa +5,3.3,1.4,0.2,Iris-setosa +7,3.2,4.7,1.4,Iris-versicolor +6.4,3.2,4.5,1.5,Iris-versicolor +6.9,3.1,4.9,1.5,Iris-versicolor +5.5,2.3,4,1.3,Iris-versicolor +6.5,2.8,4.6,1.5,Iris-versicolor +5.7,2.8,4.5,1.3,Iris-versicolor +6.3,3.3,4.7,1.6,Iris-versicolor +4.9,2.4,3.3,1,Iris-versicolor +6.6,2.9,4.6,1.3,Iris-versicolor +5.2,2.7,3.9,1.4,Iris-versicolor +5,2,3.5,1,Iris-versicolor +5.9,3,4.2,1.5,Iris-versicolor +6,2.2,4,1,Iris-versicolor +6.1,2.9,4.7,1.4,Iris-versicolor +5.6,2.9,3.6,1.3,Iris-versicolor +6.7,3.1,4.4,1.4,Iris-versicolor +5.6,3,4.5,1.5,Iris-versicolor +5.8,2.7,4.1,1,Iris-versicolor +6.2,2.2,4.5,1.5,Iris-versicolor +5.6,2.5,3.9,1.1,Iris-versicolor +5.9,3.2,4.8,1.8,Iris-versicolor +6.1,2.8,4,1.3,Iris-versicolor +6.3,2.5,4.9,1.5,Iris-versicolor +6.1,2.8,4.7,1.2,Iris-versicolor +6.4,2.9,4.3,1.3,Iris-versicolor +6.6,3,4.4,1.4,Iris-versicolor +6.8,2.8,4.8,1.4,Iris-versicolor +6.7,3,5,1.7,Iris-versicolor +6,2.9,4.5,1.5,Iris-versicolor +5.7,2.6,3.5,1,Iris-versicolor +5.5,2.4,3.8,1.1,Iris-versicolor +5.5,2.4,3.7,1,Iris-versicolor +5.8,2.7,3.9,1.2,Iris-versicolor +6,2.7,5.1,1.6,Iris-versicolor +5.4,3,4.5,1.5,Iris-versicolor +6,3.4,4.5,1.6,Iris-versicolor +6.7,3.1,4.7,1.5,Iris-versicolor +6.3,2.3,4.4,1.3,Iris-versicolor 
+5.6,3,4.1,1.3,Iris-versicolor +5.5,2.5,4,1.3,Iris-versicolor +5.5,2.6,4.4,1.2,Iris-versicolor +6.1,3,4.6,1.4,Iris-versicolor +5.8,2.6,4,1.2,Iris-versicolor +5,2.3,3.3,1,Iris-versicolor +5.6,2.7,4.2,1.3,Iris-versicolor +5.7,3,4.2,1.2,Iris-versicolor +5.7,2.9,4.2,1.3,Iris-versicolor +6.2,2.9,4.3,1.3,Iris-versicolor +5.1,2.5,3,1.1,Iris-versicolor +5.7,2.8,4.1,1.3,Iris-versicolor +6.3,3.3,6,2.5,Iris-virginica +5.8,2.7,5.1,1.9,Iris-virginica +7.1,3,5.9,2.1,Iris-virginica +6.3,2.9,5.6,1.8,Iris-virginica +6.5,3,5.8,2.2,Iris-virginica +7.6,3,6.6,2.1,Iris-virginica +4.9,2.5,4.5,1.7,Iris-virginica +7.3,2.9,6.3,1.8,Iris-virginica +6.7,2.5,5.8,1.8,Iris-virginica +7.2,3.6,6.1,2.5,Iris-virginica +6.5,3.2,5.1,2,Iris-virginica +6.4,2.7,5.3,1.9,Iris-virginica +6.8,3,5.5,2.1,Iris-virginica +5.7,2.5,5,2,Iris-virginica +5.8,2.8,5.1,2.4,Iris-virginica +6.4,3.2,5.3,2.3,Iris-virginica +6.5,3,5.5,1.8,Iris-virginica +7.7,3.8,6.7,2.2,Iris-virginica +7.7,2.6,6.9,2.3,Iris-virginica +6,2.2,5,1.5,Iris-virginica +6.9,3.2,5.7,2.3,Iris-virginica +5.6,2.8,4.9,2,Iris-virginica +7.7,2.8,6.7,2,Iris-virginica +6.3,2.7,4.9,1.8,Iris-virginica +6.7,3.3,5.7,2.1,Iris-virginica +7.2,3.2,6,1.8,Iris-virginica +6.2,2.8,4.8,1.8,Iris-virginica +6.1,3,4.9,1.8,Iris-virginica +6.4,2.8,5.6,2.1,Iris-virginica +7.2,3,5.8,1.6,Iris-virginica +7.4,2.8,6.1,1.9,Iris-virginica +7.9,3.8,6.4,2,Iris-virginica +6.4,2.8,5.6,2.2,Iris-virginica +6.3,2.8,5.1,1.5,Iris-virginica +6.1,2.6,5.6,1.4,Iris-virginica +7.7,3,6.1,2.3,Iris-virginica +6.3,3.4,5.6,2.4,Iris-virginica +6.4,3.1,5.5,1.8,Iris-virginica +6,3,4.8,1.8,Iris-virginica +6.9,3.1,5.4,2.1,Iris-virginica +6.7,3.1,5.6,2.4,Iris-virginica +6.9,3.1,5.1,2.3,Iris-virginica +5.8,2.7,5.1,1.9,Iris-virginica +6.8,3.2,5.9,2.3,Iris-virginica +6.7,3.3,5.7,2.5,Iris-virginica +6.7,3,5.2,2.3,Iris-virginica +6.3,2.5,5,1.9,Iris-virginica +6.5,3,5.2,2,Iris-virginica +6.2,3.4,5.4,2.3,Iris-virginica +5.9,3,5.1,1.8,Iris-virginica diff --git a/axolotl/tests/data/datasets/iris_dataset_3/datasetDoc.json b/axolotl/tests/data/datasets/iris_dataset_3/datasetDoc.json new file mode 100644 index 0000000..9a44953 --- /dev/null +++ b/axolotl/tests/data/datasets/iris_dataset_3/datasetDoc.json @@ -0,0 +1,36 @@ +{ + "about": { + "datasetID": "iris_dataset_3", + "datasetName": "Iris Dataset with minimal metadata", + "humanSubjectsResearch": false, + "license": "CC", + "datasetSchemaVersion": "4.0.0", + "redacted": false, + "digest": "4a0b43c5e5a76919b42b2066015ba0962512beb8600919dfffa4e2ad604e446d", + "datasetVersion": "4.0.0" + }, + "dataResources": [ + { + "resID": "learningData", + "resPath": "tables/learningData.csv", + "resType": "table", + "resFormat": { + "text/csv": [ + "csv" + ] + }, + "isCollection": false, + "columnsCount": 6, + "columns": [ + { + "colIndex": 0, + "colName": "d3mIndex", + "colType": "integer", + "role": [ + "index" + ] + } + ] + } + ] +} \ No newline at end of file diff --git a/axolotl/tests/data/datasets/iris_dataset_3/tables/learningData.csv b/axolotl/tests/data/datasets/iris_dataset_3/tables/learningData.csv new file mode 100644 index 0000000..bce7479 --- /dev/null +++ b/axolotl/tests/data/datasets/iris_dataset_3/tables/learningData.csv @@ -0,0 +1,151 @@ +d3mIndex,sepalLength,sepalWidth,petalLength,petalWidth,species +0,5.1,3.5,1.4,0.2,Iris-setosa +1,4.9,3,1.4,0.2,Iris-setosa +2,4.7,3.2,1.3,0.2,Iris-setosa +3,4.6,3.1,1.5,0.2,Iris-setosa +4,5,3.6,1.4,0.2,Iris-setosa +5,5.4,3.9,1.7,0.4,Iris-setosa +6,4.6,3.4,1.4,0.3,Iris-setosa +7,5,3.4,1.5,0.2,Iris-setosa +8,4.4,2.9,1.4,0.2,Iris-setosa 
+9,4.9,3.1,1.5,0.1,Iris-setosa +10,5.4,3.7,1.5,0.2,Iris-setosa +11,4.8,3.4,1.6,0.2,Iris-setosa +12,4.8,3,1.4,0.1,Iris-setosa +13,4.3,3,1.1,0.1,Iris-setosa +14,5.8,4,1.2,0.2,Iris-setosa +15,5.7,4.4,1.5,0.4,Iris-setosa +16,5.4,3.9,1.3,0.4,Iris-setosa +17,5.1,3.5,1.4,0.3,Iris-setosa +18,5.7,3.8,1.7,0.3,Iris-setosa +19,5.1,3.8,1.5,0.3,Iris-setosa +20,5.4,3.4,1.7,0.2,Iris-setosa +21,5.1,3.7,1.5,0.4,Iris-setosa +22,4.6,3.6,1,0.2,Iris-setosa +23,5.1,3.3,1.7,0.5,Iris-setosa +24,4.8,3.4,1.9,0.2,Iris-setosa +25,5,3,1.6,0.2,Iris-setosa +26,5,3.4,1.6,0.4,Iris-setosa +27,5.2,3.5,1.5,0.2,Iris-setosa +28,5.2,3.4,1.4,0.2,Iris-setosa +29,4.7,3.2,1.6,0.2,Iris-setosa +30,4.8,3.1,1.6,0.2,Iris-setosa +31,5.4,3.4,1.5,0.4,Iris-setosa +32,5.2,4.1,1.5,0.1,Iris-setosa +33,5.5,4.2,1.4,0.2,Iris-setosa +34,4.9,3.1,1.5,0.1,Iris-setosa +35,5,3.2,1.2,0.2,Iris-setosa +36,5.5,3.5,1.3,0.2,Iris-setosa +37,4.9,3.1,1.5,0.1,Iris-setosa +38,4.4,3,1.3,0.2,Iris-setosa +39,5.1,3.4,1.5,0.2,Iris-setosa +40,5,3.5,1.3,0.3,Iris-setosa +41,4.5,2.3,1.3,0.3,Iris-setosa +42,4.4,3.2,1.3,0.2,Iris-setosa +43,5,3.5,1.6,0.6,Iris-setosa +44,5.1,3.8,1.9,0.4,Iris-setosa +45,4.8,3,1.4,0.3,Iris-setosa +46,5.1,3.8,1.6,0.2,Iris-setosa +47,4.6,3.2,1.4,0.2,Iris-setosa +48,5.3,3.7,1.5,0.2,Iris-setosa +49,5,3.3,1.4,0.2,Iris-setosa +50,7,3.2,4.7,1.4,Iris-versicolor +51,6.4,3.2,4.5,1.5,Iris-versicolor +52,6.9,3.1,4.9,1.5,Iris-versicolor +53,5.5,2.3,4,1.3,Iris-versicolor +54,6.5,2.8,4.6,1.5,Iris-versicolor +55,5.7,2.8,4.5,1.3,Iris-versicolor +56,6.3,3.3,4.7,1.6,Iris-versicolor +57,4.9,2.4,3.3,1,Iris-versicolor +58,6.6,2.9,4.6,1.3,Iris-versicolor +59,5.2,2.7,3.9,1.4,Iris-versicolor +60,5,2,3.5,1,Iris-versicolor +61,5.9,3,4.2,1.5,Iris-versicolor +62,6,2.2,4,1,Iris-versicolor +63,6.1,2.9,4.7,1.4,Iris-versicolor +64,5.6,2.9,3.6,1.3,Iris-versicolor +65,6.7,3.1,4.4,1.4,Iris-versicolor +66,5.6,3,4.5,1.5,Iris-versicolor +67,5.8,2.7,4.1,1,Iris-versicolor +68,6.2,2.2,4.5,1.5,Iris-versicolor +69,5.6,2.5,3.9,1.1,Iris-versicolor +70,5.9,3.2,4.8,1.8,Iris-versicolor +71,6.1,2.8,4,1.3,Iris-versicolor +72,6.3,2.5,4.9,1.5,Iris-versicolor +73,6.1,2.8,4.7,1.2,Iris-versicolor +74,6.4,2.9,4.3,1.3,Iris-versicolor +75,6.6,3,4.4,1.4,Iris-versicolor +76,6.8,2.8,4.8,1.4,Iris-versicolor +77,6.7,3,5,1.7,Iris-versicolor +78,6,2.9,4.5,1.5,Iris-versicolor +79,5.7,2.6,3.5,1,Iris-versicolor +80,5.5,2.4,3.8,1.1,Iris-versicolor +81,5.5,2.4,3.7,1,Iris-versicolor +82,5.8,2.7,3.9,1.2,Iris-versicolor +83,6,2.7,5.1,1.6,Iris-versicolor +84,5.4,3,4.5,1.5,Iris-versicolor +85,6,3.4,4.5,1.6,Iris-versicolor +86,6.7,3.1,4.7,1.5,Iris-versicolor +87,6.3,2.3,4.4,1.3,Iris-versicolor +88,5.6,3,4.1,1.3,Iris-versicolor +89,5.5,2.5,4,1.3,Iris-versicolor +90,5.5,2.6,4.4,1.2,Iris-versicolor +91,6.1,3,4.6,1.4,Iris-versicolor +92,5.8,2.6,4,1.2,Iris-versicolor +93,5,2.3,3.3,1,Iris-versicolor +94,5.6,2.7,4.2,1.3,Iris-versicolor +95,5.7,3,4.2,1.2,Iris-versicolor +96,5.7,2.9,4.2,1.3,Iris-versicolor +97,6.2,2.9,4.3,1.3,Iris-versicolor +98,5.1,2.5,3,1.1,Iris-versicolor +99,5.7,2.8,4.1,1.3,Iris-versicolor +100,6.3,3.3,6,2.5,Iris-virginica +101,5.8,2.7,5.1,1.9,Iris-virginica +102,7.1,3,5.9,2.1,Iris-virginica +103,6.3,2.9,5.6,1.8,Iris-virginica +104,6.5,3,5.8,2.2,Iris-virginica +105,7.6,3,6.6,2.1,Iris-virginica +106,4.9,2.5,4.5,1.7,Iris-virginica +107,7.3,2.9,6.3,1.8,Iris-virginica +108,6.7,2.5,5.8,1.8,Iris-virginica +109,7.2,3.6,6.1,2.5,Iris-virginica +110,6.5,3.2,5.1,2,Iris-virginica +111,6.4,2.7,5.3,1.9,Iris-virginica +112,6.8,3,5.5,2.1,Iris-virginica +113,5.7,2.5,5,2,Iris-virginica +114,5.8,2.8,5.1,2.4,Iris-virginica 
+115,6.4,3.2,5.3,2.3,Iris-virginica +116,6.5,3,5.5,1.8,Iris-virginica +117,7.7,3.8,6.7,2.2,Iris-virginica +118,7.7,2.6,6.9,2.3,Iris-virginica +119,6,2.2,5,1.5,Iris-virginica +120,6.9,3.2,5.7,2.3,Iris-virginica +121,5.6,2.8,4.9,2,Iris-virginica +122,7.7,2.8,6.7,2,Iris-virginica +123,6.3,2.7,4.9,1.8,Iris-virginica +124,6.7,3.3,5.7,2.1,Iris-virginica +125,7.2,3.2,6,1.8,Iris-virginica +126,6.2,2.8,4.8,1.8,Iris-virginica +127,6.1,3,4.9,1.8,Iris-virginica +128,6.4,2.8,5.6,2.1,Iris-virginica +129,7.2,3,5.8,1.6,Iris-virginica +130,7.4,2.8,6.1,1.9,Iris-virginica +131,7.9,3.8,6.4,2,Iris-virginica +132,6.4,2.8,5.6,2.2,Iris-virginica +133,6.3,2.8,5.1,1.5,Iris-virginica +134,6.1,2.6,5.6,1.4,Iris-virginica +135,7.7,3,6.1,2.3,Iris-virginica +136,6.3,3.4,5.6,2.4,Iris-virginica +137,6.4,3.1,5.5,1.8,Iris-virginica +138,6,3,4.8,1.8,Iris-virginica +139,6.9,3.1,5.4,2.1,Iris-virginica +140,6.7,3.1,5.6,2.4,Iris-virginica +141,6.9,3.1,5.1,2.3,Iris-virginica +142,5.8,2.7,5.1,1.9,Iris-virginica +143,6.8,3.2,5.9,2.3,Iris-virginica +144,6.7,3.3,5.7,2.5,Iris-virginica +145,6.7,3,5.2,2.3,Iris-virginica +146,6.3,2.5,5,1.9,Iris-virginica +147,6.5,3,5.2,2,Iris-virginica +148,6.2,3.4,5.4,2.3,Iris-virginica +149,5.9,3,5.1,1.8,Iris-virginica diff --git a/axolotl/tests/data/datasets/multivariate_dataset_1/datasetDoc.json b/axolotl/tests/data/datasets/multivariate_dataset_1/datasetDoc.json new file mode 100644 index 0000000..9c5261e --- /dev/null +++ b/axolotl/tests/data/datasets/multivariate_dataset_1/datasetDoc.json @@ -0,0 +1,93 @@ +{ + "about": { + "datasetID": "multivariate_dataset_1", + "datasetName": "A test dataset based on Synthetic Gaussian Process dataset", + "license": "Unknown", + "datasetSchemaVersion": "4.0.0", + "redacted": false, + "datasetVersion": "4.0.0", + "digest": "882a669722722e907af0c6cd61922e2b8ce92fb6fdcdaf3c1f6ca33dc25dfcd3" + }, + "dataResources": [ + { + "resID": "0", + "resPath": "tables/gp_data_tables/", + "resType": "table", + "resFormat": { + "text/csv": [ + "csv" + ] + }, + "isCollection": true, + "columnsCount": 2, + "columns": [ + { + "colIndex": 0, + "colName": "x", + "colType": "real", + "role": [ + "attribute" + ] + }, + { + "colIndex": 1, + "colName": "y", + "colType": "real", + "role": [ + "attribute" + ] + } + ] + }, + { + "resID": "learningData", + "resPath": "tables/learningData.csv", + "resType": "table", + "resFormat": { + "text/csv": [ + "csv" + ] + }, + "isCollection": false, + "columnsCount": 4, + "columns": [ + { + "colIndex": 0, + "colName": "d3mIndex", + "colType": "integer", + "role": [ + "index" + ] + }, + { + "colIndex": 1, + "colName": "gpDataFile", + "colType": "string", + "role": [ + "attribute" + ], + "refersTo": { + "resID": "0", + "resObject": "item" + } + }, + { + "colIndex": 2, + "colName": "amplitude", + "colType": "real", + "role": [ + "suggestedTarget" + ] + }, + { + "colIndex": 3, + "colName": "lengthscale", + "colType": "real", + "role": [ + "suggestedTarget" + ] + } + ] + } + ] +} \ No newline at end of file diff --git a/axolotl/tests/data/datasets/multivariate_dataset_1/tables/gp_data_tables/train_data_934.csv b/axolotl/tests/data/datasets/multivariate_dataset_1/tables/gp_data_tables/train_data_934.csv new file mode 100644 index 0000000..a49ad9e --- /dev/null +++ b/axolotl/tests/data/datasets/multivariate_dataset_1/tables/gp_data_tables/train_data_934.csv @@ -0,0 +1,1001 @@ +x,y +6.719128702812473,0.7254263258353223 +3.9397005317789495,0.4123862051898459 +6.578095730947303,0.4104041440409367 +8.212554678166263,-0.1359718552358531 
+0.30866297682347366,0.23210751418661554 +1.5200677371945446,0.24075273844178807 +-2.8816304416414833,0.43681868336095336 +3.3341471424871116,0.27559673340258145 +-7.214904997701236,-1.190497487741829 +-4.368582086240345,0.17026054572368698 +-4.1538148419803544,0.1476968216949026 +-7.708946365578563,-1.1974021494835194 +-5.041285459853642,-0.5018028970012427 +-4.680964383461092,-0.224720236244526 +4.5823210388147615,0.6913663509802308 +-6.7739353726743055,-1.0412648221453003 +-0.6552784239033915,-0.008219910700375033 +-9.588337090365737,-1.3689566824574113 +1.364225963324719,0.17749905516642128 +-0.36137016336406447,0.003715075399609316 +9.159517225314943,-0.29628495345010575 +-7.184432996503417,-1.1719295942604528 +8.655138158835705,-0.21968729563935616 +9.02289251063992,-0.21715536916202444 +-4.903703983921343,-0.4175387994977128 +-1.1620097517339367,-0.0638533449395937 +3.9821212631273144,0.5509331053122353 +6.443897992353334,0.5083975437497882 +-9.77449736071619,-1.502987794931701 +3.310715946036766,0.1253069500521266 +1.8569022824397052,0.08242728746079514 +9.602142172251575,-0.5060622466101456 +-0.9708595222905458,-0.05687166245149508 +2.348062467089367,0.013249252852419208 +-7.466269830115273,-1.2164497266068894 +-7.1306813996336595,-1.1547505872041741 +-9.795970362326749,-1.353243198869072 +6.338501869454166,0.6151991689052653 +1.7626261678353927,0.07842583731602534 +-1.6943578535687962,0.1714185266316258 +2.379771923408951,0.19757312784968245 +-9.734560927938478,-1.5213037853799882 +0.8245507607880764,0.21232976520348512 +-1.7098407567849705,0.10355429488997372 +7.959226337513003,0.01434784997314515 +-6.6706465544671465,-1.2018391041181635 +-9.032731962773145,-1.3141326636633979 +2.6605675494458976,-0.0986339265230603 +5.771633056620011,0.6727339476786903 +-6.373947931515978,-1.0800394176563008 +6.63671470636935,0.3787037678623445 +0.7437930003337492,0.3302376430937204 +6.83581819623987,0.48646122946257997 +3.9567786049209492,0.443892845323057 +-8.277756564410431,-1.3045787774937658 +3.9738803604105972,0.566573300426359 +2.809549475391382,0.09130099235217161 +4.5602124342901185,0.6228122645766934 +-6.6961907206496285,-1.1468711994306489 +1.3261344044859626,0.1275590035664855 +9.87725414147318,-0.6203077979227769 +4.463103495907488,0.6552645034175885 +-5.089406678736207,-0.560850208513192 +-6.0838801007978836,-0.9106917639181371 +3.228964035813324,0.2061727175201695 +-8.841414947708152,-1.2167163305519164 +3.8762012488389215,0.6001260841437205 +-3.08970862479792,0.29917486621495293 +-1.9913368200057455,-0.08748734113070039 +-4.529831250838235,-0.12584485811272944 +-2.6160312309961715,0.28682265076466984 +6.413265343700915,0.3294168126026539 +-8.333313693531117,-1.384954293647435 +-7.110936669371828,-1.2001480296653524 +-6.350262420260222,-1.3050233371895579 +2.80491077258278,0.10616815259449675 +-9.25691460067263,-1.3313768476600096 +-1.1498060751226635,0.08276610277916273 +3.0377404055343367,0.13257051086112706 +-3.145539242364283,0.26858257170833444 +-3.224821538690122,0.31505788025397496 +1.885211840809129,0.12270323475717462 +-2.2197767990740362,-0.01532120068682552 +-5.381322156563183,-0.5260207884424628 +3.43177757226584,0.4484477763602329 +-9.16049099004831,-1.2647130146029204 +-4.336854125689156,0.04934673363320628 +-2.1068506853493436,0.16679031465765215 +5.301207904982977,0.8223988681933493 +3.7065455802537617,0.5480945627516061 +0.6375019053550091,0.19014756475003528 +-4.923842417721769,-0.4647128020784528 +-1.3161641221487805,0.1751402089429369 
+-8.712128040807691,-1.0830257012970104 +-4.225560778068909,-0.05643073430288572 +-6.956259544192961,-1.047553014063834 +-5.209868204401298,-0.3735965564495962 +9.172329265826477,-0.35485275396372695 +6.725036753511354,0.5384532795862732 +-9.006859510153355,-1.1978131916683292 +-4.355988169705762,0.06477751979860812 +8.081771861180925,0.020997588148325495 +-6.729072141263259,-1.203054129865091 +4.131860051606493,0.47817168354536954 +-1.517993745893996,0.04275436107684383 +8.877032885366425,-0.22132536025006133 +-3.27751952152755,0.3059998654429673 +-0.3534270554270602,0.2717570483373846 +-2.803518556602316,0.43067826017648725 +-6.178626481527413,-0.9999012074846451 +-9.221168163629692,-1.1676073873868993 +7.467139984524064,0.0537154792229938 +7.890400738406793,0.04025222723967243 +-5.26511500513001,-0.5134141425353972 +-0.2384977599666982,0.12506512169371048 +3.27196228428662,0.0740870751287743 +-5.0173877958579105,-0.33462755585246035 +-6.774252867768897,-1.24615845650114 +7.595972603190884,0.10956320134638405 +5.447347570047846,0.8761814044844383 +-3.609507258333755,0.12115297942671557 +-3.5013896248361664,0.3829459432530207 +-1.02451356375823,0.1565148953014776 +8.434952721216604,-0.11928704739914235 +-4.707321593643666,-0.2406609163902721 +7.116171834343525,0.29096415787952334 +-7.677849662929358,-1.2081998575885875 +0.15546419318985016,0.28390255117717655 +9.058045163702467,-0.39536594682478865 +3.8681212948209165,0.47474719590728887 +1.776298420182627,0.2465838874251075 +-0.7473563102584535,0.16904717412354936 +7.257196724162674,0.22302886782978534 +3.552062604547279,0.19110940435870527 +4.590296818498993,0.5866751933218374 +-8.486886616328174,-1.3435233021333637 +3.346003612976616,0.24603685415537394 +-9.125470895558038,-1.420466713424019 +6.174833673018681,0.6122082539610945 +4.205201539591158,0.6205281349791801 +-6.907185558783997,-1.3348842107171188 +0.3358641657035264,0.39269096079429966 +3.2985277323514595,0.1350780905505871 +8.037758401774056,0.21885230192256871 +0.599174723223058,0.16828365881510138 +0.10125188200353108,0.33967129908806515 +1.5262901360158754,0.20046421463178524 +9.433269364577356,-0.4629709826369814 +-4.90046452916725,-0.28919344359229965 +7.320590681601221,0.3425548610346437 +-2.6273595643208214,0.13332211956918938 +-8.44749597936779,-1.471776533196297 +-9.891331756878614,-1.4880715767571688 +8.339642435965821,-0.1254586182283429 +-2.093873842447156,0.21291034724371485 +9.449816182046472,-0.6102142469065681 +-3.2081535546727924,0.5145581117435095 +7.971162971855552,0.13424851330328288 +-4.3059972903091115,0.14130379492344716 +-4.8000254080756655,-0.14270964427610783 +5.826907638028672,0.8613697264916597 +-4.302819175122924,0.07898316091074768 +-4.977203609413227,-0.3135719871847394 +-7.136680309591274,-1.1401933252467717 +-2.9111010919477525,0.2338582997502252 +1.6720799708737601,0.35176081117182734 +-8.037926540626618,-1.1424301687702343 +5.942171398226623,0.7346029844477142 +-3.3873904391149208,0.20269987712880058 +5.745257666105722,0.7163102976601535 +1.8133846625801695,0.17543342282515278 +9.475920704687034,-0.7622939842411488 +-7.397678604847609,-1.313714906204555 +-4.129590480133896,-0.06771666389662631 +2.808044449736773,0.030236275740611565 +-7.328685849937927,-1.0482354840346708 +-5.073519638662599,-0.4749576484880269 +-3.9543605800221204,0.03936169889786886 +9.702298225454161,-0.6801592382582269 +8.578877335719916,-0.23607917766913097 +-1.4444791890013384,0.10079466621407782 +8.856391862605708,-0.4070442956414401 
+1.5317205928941249,0.24447946018450964 +-9.056535620168251,-1.2620820467072091 +-3.3476745770301304,0.372290320890749 +0.11926559286731475,0.15780874217471796 +6.660201170963802,0.5536423386158931 +-2.1324548209174448,0.11339815627098024 +-3.4648520758661445,0.3762778073954467 +9.72913919467867,-0.5301808459674318 +8.070753596296655,0.05828102145915544 +8.249395718622647,-0.054735889425348586 +2.786863699144204,0.17987725588137093 +7.208700037412704,0.2657182417709026 +-5.935635644986004,-0.8427175866781342 +-8.503400556712293,-1.2803949299140507 +-5.586415718896038,-0.8352577566671029 +-8.819485375498516,-1.3307405193014976 +4.480156764596339,0.5342424962427484 +3.458218172500431,0.3389534405639995 +8.890806087034456,-0.32802923239504983 +-5.749459113665747,-0.8977122968746486 +-0.9240732986705069,0.09573034668622532 +-2.8982457468439944,0.37869435432913395 +-3.170104695968426,0.3850922203112026 +-9.20724655437802,-1.4811120236328057 +7.148838266175389,0.27915130297950824 +-4.792295780410729,-0.3204431095296053 +6.339664851550321,0.6253133324161293 +9.616450416794045,-0.4425078618947185 +7.192865906097353,0.2647034604112535 +-1.3717973187183308,0.24958268966604832 +-5.466086584169094,-0.7993186790546855 +-7.223352191674088,-0.9823289557199761 +-8.495766929740846,-1.3267433845204137 +-8.498565098211529,-1.2686821711866618 +-7.58712097727285,-1.1409552816484776 +-9.447679241849713,-1.438263300505498 +-1.4910462724726017,0.07291022235393488 +-7.90729328923284,-1.2860480703474542 +4.526133558439094,0.5974339893770876 +3.7919451303066727,0.42709248805911404 +-2.3348287921293616,0.19398783815698065 +9.869486368554025,-0.47093409519807655 +8.134608553532615,-0.007533083969252032 +-7.99548501895789,-1.224499533863491 +9.480499638710683,-0.5756723732776537 +-0.9229786323863998,-0.040331942289820774 +6.559655303633506,0.4285030236080119 +5.9915807323755885,0.8415776439939903 +-0.28120853541367197,0.16338132743769107 +-9.684331109189579,-1.5130905429575563 +4.428762853545486,0.5668397712047841 +4.716050395672751,0.5038852488116792 +-0.2915539578284587,0.20820738763529897 +-0.8263378770834287,0.2550729047156488 +1.8271009648586212,0.28615683188232843 +2.466039997720721,0.07055358761812619 +8.108252343674454,0.08118437679403226 +0.0686392865092742,0.14582227478890442 +-5.346166391452595,-0.5226564117337654 +-0.2506853037896928,-0.012191946535495213 +-2.342769046112654,0.11902444702735081 +-3.596412464165688,0.441415390176288 +-1.1908960269421343,0.0678661001333159 +6.554558520428845,0.6129091855277906 +-4.710064664801301,-0.2418082387921169 +-7.835145886519449,-1.1996057263281745 +4.87195327218763,0.689807278264303 +-8.36836551980739,-1.1459923353409902 +-2.45561944974575,0.2622366109382244 +5.779152042791553,0.599431341005703 +2.3031960390455137,0.1206120987264356 +-0.3542201483588947,0.12776641546704232 +8.387089485439155,-0.13124862483013267 +6.208711244870252,0.7162981225632731 +2.307151971911585,0.06029092343713791 +-9.141547697010473,-1.464456260495025 +8.04324293647106,0.039361937176932704 +-4.399376726247976,-0.04778951744659094 +1.9338528885087882,0.1449722201653679 +-2.199561511776169,0.27571838076458566 +2.220722170715863,0.1745933888435809 +0.3479595139650762,0.2164738512404561 +8.017831045546426,-0.051151069076911845 +-1.2738260649314768,0.24935312966539858 +1.9300250663280938,-0.05331755723666251 +0.6219688905146814,0.2968482812506593 +4.890463801852999,0.62215736363176 +-8.569144555601113,-1.1507675709419525 +-9.517608138709551,-1.3627828844449987 +-1.3371425339726173,0.100264276392414 
+-3.0345323926882055,0.5187999219241093 +9.750941129202118,-0.4525448475281939 +5.08199967123657,0.7593104820914117 +-0.2618686766280742,0.11373113048639077 +-5.454815668980695,-0.7385045788181306 +-2.0516285802138294,0.20189544064329243 +-1.304546118175418,-0.004701102027514795 +-0.10173843335672927,0.21682910074847062 +-7.637132602303755,-1.422988871544002 +-4.5308356578046105,-0.04742789225187152 +-9.293549730133073,-1.4067434745159413 +-9.270633136801045,-1.4678621052222705 +4.721895700649483,0.6742198968492028 +7.252106070954699,0.2714075595295284 +-1.7746791983980046,0.22758028647174455 +-9.415818463040747,-1.3852722209849788 +4.839732459055689,0.7320715934988118 +3.1444152960458904,0.20491616602603646 +-0.29285252779337156,-0.03487200196803267 +-0.7019828403850514,0.06759754661077398 +9.005505928222242,-0.30897389044229623 +3.498641006260033,0.18897862208356025 +0.9783845985400941,0.15677900066707057 +-2.8377021406228042,0.3777224484280613 +-4.705002931517225,-0.2504463075998942 +0.9267372704502073,0.2485512179304211 +3.1197757586029122,0.0031244434416334843 +-0.9447088972192486,0.06794712330032276 +-2.902961377933373,0.35318391358404705 +4.175555110422881,0.6227540763995623 +-1.3209364583480578,-0.09932723380719141 +7.139368120756426,0.2496256312209859 +7.753522352424823,0.1752107643124768 +-4.636858951061885,-0.29249711646269944 +5.43161694374537,0.6881197859723635 +-5.9327423750916175,-0.8655637634661685 +1.631720540722462,0.22911409278019368 +5.136923988368473,0.6968221120773908 +-2.292117918711334,0.45988657830624446 +-6.234992568926615,-1.141410915947192 +3.247552954392447,0.24664793387140957 +9.210701180368062,-0.49457166367622035 +-8.857077689010598,-1.5582045316997928 +7.865990668249011,0.03515958728868275 +3.107292206964445,0.175362709768798 +-8.592462628739057,-1.2723337908706547 +3.8122230681721763,0.4458481149535187 +-5.635166284877284,-0.7966320952157969 +1.8371572443806983,0.14669113468063155 +1.4342265720101501,0.21008084532862273 +0.1874500348117678,0.1311834412324373 +-9.563057001470874,-1.4976118450299047 +-8.105082934642137,-1.2596535218333733 +8.142119788701223,-0.019882963309843873 +-5.077087232691113,-0.3373371353081324 +-4.598352332803724,-0.07350003619458419 +-6.246325417491679,-1.1958451226873232 +-7.232751436046358,-1.186707347805177 +6.214818843657049,0.6787417222385768 +-9.75190374552588,-1.5791704099885893 +7.816685179128683,0.08831045837574181 +-1.2417092635362117,0.041873132034784225 +3.7503505018847694,0.5359533451619984 +-1.998423182299348,0.15841260949522704 +-1.0229372695832417,0.2707682858117033 +-0.24396742217263956,0.12124730415235076 +1.8481434033695052,0.1547777003651807 +-7.958710573093075,-1.3735739886091143 +-8.849741370399546,-1.2769348990223062 +6.968388848995658,0.24628434427988594 +6.87803346503744,0.39642478481052423 +-6.810145879708589,-1.2873016345869532 +-8.638405263969076,-1.2874732717541317 +3.788434349443026,0.45102869539132473 +-5.250420624274845,-0.40198068616808186 +9.224158538168425,-0.6544137574824008 +6.035038182240321,0.7706196226383233 +-0.3677414622326669,0.05303960785022489 +-1.4938736569243805,0.2151383889998627 +-1.928838997637694,0.1825760896743267 +-4.688736715278097,-0.23876578505357315 +0.6578329376094381,0.45328093880231135 +-9.60780091442734,-1.238215035959677 +-9.440182498701258,-1.2836029481133928 +8.308409829209605,-0.22605553901517922 +2.451326855053093,0.06672083043433516 +6.699508296256656,0.5321213783153974 +-1.2746534278738544,-0.03373705180261158 +6.123882515066672,0.5237670334649717 
+-1.76018048120131,0.1132530164414624 +-2.7976000063936635,0.34565354428711437 +-3.744245878367005,0.26268606347487594 +-5.670492209417368,-0.7026783899974668 +-1.5545106477446424,0.08589998625306496 +-8.298032039946865,-1.1198282167363023 +-4.020476415626604,0.030036633829929826 +-1.6706636829305221,0.13522293085597586 +3.0252316183588372,0.24253587993914816 +-2.990283857947116,0.19070845893034238 +-2.245512503782585,0.10257450503304508 +-0.2990546000247072,0.27897429097183946 +4.149624217759328,0.6409847230440429 +4.599628820366899,0.7755859232608842 +-4.450510464600397,-0.2539682210352855 +6.599188210403323,0.5860948262674023 +-4.931503728163037,-0.3388112316608811 +-4.331586754831616,-0.03813947232498758 +0.7439310725700565,0.38633950395541106 +2.139734241146032,0.3250495509621416 +-8.758999972086478,-1.2005774449782396 +8.283910048268936,0.08743210215981845 +-2.1035527481750056,0.1996566010194327 +4.686701726463314,0.622054845100881 +5.186787379318618,0.6897199049504043 +-9.152373063906577,-1.361443198579294 +-3.773566850587095,0.257569064365385 +-7.706673001970089,-1.2419342619934621 +-5.912094191658554,-0.7540139981014089 +8.172770406962393,-0.28890596247193995 +-8.006760091116526,-1.1421216634081173 +8.101240794823642,-0.17926195583080617 +7.106726794017074,0.2860880788021363 +-0.2992032068583015,0.13529338692507153 +3.771703676144611,0.5529952338794091 +3.4840001353244787,0.41944385878672874 +2.1998178605995378,0.1531942117358446 +4.382014909436593,0.5530111858869498 +2.7236356963643615,0.1156281302565331 +5.19556584549937,0.6796353298077865 +0.7342218926268593,0.14792010220946178 +-7.253030133363637,-1.1759186082451274 +8.312634499539257,-0.03306089740129099 +7.1969846375374935,0.25376826031296895 +-1.9423359747072944,-0.0430466014806834 +-4.141008364430734,0.12801233214686236 +7.208847166275692,0.35850558291205203 +-5.386890059419219,-0.5064621691114906 +1.643894962974759,0.03110925008744822 +5.077272269555131,0.8899552494133136 +-4.351728975478615,-0.023038639931524742 +8.505776206616527,-0.186172376821375 +3.481979232694661,0.2715902297022405 +2.4742608714186964,0.09612368586472231 +-0.004374049689044313,0.20793058224237443 +-7.079609820241171,-1.2975860339019416 +0.6570006255980267,0.2232730216472936 +-9.6929474328481,-1.2936229446063545 +7.247371856004165,0.3738777854907685 +7.6012306586003895,0.11656391801902719 +-7.458517308181815,-1.1098221538402684 +-2.9330826749457057,0.23615443021005184 +-8.446739515932833,-1.285721956145718 +3.6222536547584423,0.35154749426564735 +1.5713781570269916,0.20290620652848185 +1.4209976486937634,0.16867455006938883 +5.504682553028545,0.6898644181305074 +-9.285662004697215,-1.45112274186696 +-8.798704370849066,-1.3396253500647768 +2.4191436915993947,0.12726279520498196 +5.241418861575141,0.8389472423504162 +4.157346892393544,0.5882004786085601 +-4.939517335333896,-0.3934621754138458 +-6.605025323244625,-1.3053937770401531 +-2.398038573544103,0.34378696105304596 +-4.908387613070806,-0.34947484698971504 +-9.089195516667967,-1.3050127278577524 +7.215585266330358,0.45158788432062114 +-8.674353478257672,-1.0311175299254989 +4.825049478817011,0.6132828546988752 +-2.9666840571037194,0.4265556052689405 +7.812953512069441,0.22503279989677158 +0.9658373030450118,0.22602233371195637 +-2.8228764070106434,0.26408878569281663 +7.499529782411312,0.3293088594304593 +-9.674403495388578,-1.29668198498434 +2.793751063129477,0.20268282429896045 +8.846802431826951,-0.3402427343142532 +0.36743499800583557,0.24774819182514424 +5.904955112153258,0.7971328107578166 
+-4.88807523158394,-0.23944256588136825 +2.90382390144909,0.045328760291246425 +4.1324527442646275,0.3610072993227661 +-7.193791381543857,-1.2387402211773126 +-2.8056935932927414,0.21444768723060992 +2.3701023489394797,0.45030698192898716 +-7.79746126350776,-1.1316351043152886 +-3.880346137109747,0.28446197807493245 +-7.556693486528658,-1.225835436310378 +-8.71707808370505,-1.2603655173978592 +-1.351715020473927,0.002512707344660789 +-5.212686350494273,-0.54331218284787 +0.978474964896705,0.11587188933871086 +-0.2199753901760424,0.15974172541894235 +-7.151550245469708,-1.252656887947495 +6.88513102025042,0.4330930903976776 +-2.785065860349585,0.29095075074253274 +-8.013062475096714,-1.2323848697139999 +-4.561351243723234,-0.11210846167427518 +-6.200502754311795,-0.8835277895952268 +8.282420587941857,0.005141239507252288 +-2.2868718151667404,0.18288922138279 +4.467765966042684,0.6125696864169252 +5.496635761079176,0.7880775759448233 +-1.2388199549173962,-0.0544652680158471 +1.2225622664344904,0.051114151884515624 +7.332194072424819,0.2783077406827885 +1.6234476969844955,0.25546116082219444 +-8.401426839060797,-1.260853588400902 +4.519419526673424,0.7547045024982136 +-8.858553885483943,-1.3125620542596623 +-0.12604211693412637,0.10919507742088594 +7.94155025637906,0.14548578217272135 +4.245726817761684,0.5868573367349881 +-6.429391212540331,-1.1873272348806088 +-2.3285319427611957,0.19899775280554322 +1.955847029162081,0.2148473422699545 +-4.832722843461074,-0.2513868522760756 +-8.791498948735224,-1.4173718142702245 +-5.526551840353008,-0.5288189564278193 +-0.9239252251971628,-0.07296778044114971 +8.280789131740992,0.171748231625182 +-7.237477414522218,-1.0628204003491821 +1.3245809756920757,0.18637325074685263 +-0.81506643288078,0.05004324817207796 +-2.509980429062533,0.37611408901513416 +-4.776715600198891,-0.1528781446665533 +6.312379189367729,0.6576482405022827 +-0.38789812561161874,0.28244022939166147 +3.170442811649128,0.23046238217720189 +-7.030260605005548,-1.0722244356679065 +9.756610104267224,-0.6913248532169785 +9.967810017461023,-0.6292899028453957 +6.3800763309903985,0.7817391698142945 +-6.60656823810224,-1.0411335724081336 +-3.941903033381422,0.14887767600049337 +-3.5847633502989633,0.21664405065593834 +-5.7892999868349015,-0.6401368939237657 +6.781849645350366,0.5157758770216807 +8.70091728694582,-0.2341699517035163 +6.114927391771058,0.4301409937619126 +-1.665714579248041,0.10365591786847461 +6.475555655445259,0.5886317953440796 +7.776430752700697,0.09962069638594032 +-7.505374890843704,-1.3273221378904656 +1.8561680372099065,-0.0029097226006682964 +6.9100736816535,0.12940962117537652 +8.900339446641944,-0.2866487361189882 +-4.366195593516927,-0.0555378094649201 +3.0823907475419747,0.10647470462553184 +-2.3378957403048655,0.31311605585153146 +6.801625379746596,0.3287883127641237 +9.657421944718717,-0.6711426540070242 +2.4048663129039265,0.03008727749147648 +-3.232946247205511,0.34636928124241273 +1.4532408798385266,0.07132671745569052 +-6.097222361392177,-0.9642649337570461 +-1.0576765049967172,-0.10368234392949449 +-6.435174683262712,-1.116652918531066 +-3.9349174993412066,0.04536681428057643 +1.366400027266188,0.21946775502070237 +-4.289846696381105,0.010526093137058441 +-7.803924674362364,-1.256766492284612 +0.3905260190690605,0.17737846614829506 +2.762548280678651,0.07223230703934819 +-9.90571776780795,-1.4941677538121252 +-8.153303506899206,-1.1894498726155671 +7.285004816720878,0.2176975713151203 +-8.276111621519359,-1.3553795296133395 
+9.018238761575876,-0.5013409283790491 +4.3260680019944395,0.6142349186873036 +4.566943354540086,0.7003991710737211 +5.327069395840503,0.8766050191876479 +1.8917828964739858,0.11291479126110941 +-0.8561002272416207,-0.07553697279385949 +6.5218558202929735,0.638175586885144 +-5.528809104876956,-0.6759300581115217 +2.0079910134161456,0.22899779857237523 +2.4499061999684546,0.04378783226709192 +1.6012865517818575,0.1244473791703568 +1.5223680425232278,0.16779186032014276 +-2.279920449865589,0.20808502337150356 +8.480445958548941,-0.24821033318953667 +-3.3892300201669734,0.19606278602901211 +4.559212289438266,0.6023580169576676 +9.570154722175669,-0.5486963362218841 +3.794120319792036,0.25304987130633005 +-8.224847873617762,-1.3087783833744133 +-1.9435040242449997,0.1432535475320735 +1.2461782727080184,0.20383767536646916 +-8.804210785812447,-1.2267748056006962 +-2.037083997747402,0.06751340101540153 +-8.785023731348375,-1.2832258767606988 +2.3502182666045752,-0.035634276527927014 +0.8274859388820843,0.08611681788563805 +1.7309323617614503,0.15299307039795898 +-0.6133008862591538,0.032157314352842824 +0.1455990050439926,0.13285581787400727 +6.601794120135779,0.3964364738423365 +-8.918212044480548,-1.2926775725337802 +-1.1794232600418084,0.04528626089092589 +6.543257963407797,0.5152768237656217 +7.786993296232584,0.003823110545365077 +-1.426607334691731,0.04800799053918621 +-2.56551602159087,0.21052828299638282 +1.3642312294933312,0.16915230392222874 +-0.5179742036671353,0.05351657430808618 +-2.3774261581597944,0.23343817112461607 +1.6219135114641468,0.21971802885969519 +-3.0709459154631387,0.29766797612911117 +-0.10010792032564808,0.17679956267086153 +0.1805809852363538,0.1435385836807498 +4.76747218036083,0.7954968824721304 +9.799649092267625,-0.5072759762943729 +2.118836713649328,0.3394913814964435 +9.627142635331278,-0.7636352949728367 +-8.668228543255449,-1.3345483525545192 +5.3180115149940335,1.0498923460204195 +9.297940389666227,-0.6067759719989622 +6.988319488937833,0.40426070320548485 +2.913063731250256,0.15227020817259382 +1.7154833376305145,0.043884332746228105 +4.132257058025747,0.4167160161067417 +3.2986378548210205,0.22046726289182775 +7.543948959669695,0.10753539209009001 +-5.2189158651902225,-0.5836756713555208 +2.4502701001589027,0.20033894199511476 +-0.4959307537859914,0.12930983816429475 +6.135610659764773,0.6647596091766766 +-7.736624411395398,-1.2232466838255345 +5.190724205635885,0.7695459193962783 +4.00750824478826,0.6625926188378151 +-8.406991665524384,-1.3064081470453162 +8.294058692079457,-0.09415484752699796 +2.2350031740100245,0.09519761635288687 +9.73186644463745,-0.7403748701005683 +-7.126780345132676,-1.2493810594020784 +6.505914650685035,0.6764364595745325 +8.339732093342988,0.0021651865821439575 +-0.6829921076127299,0.10009572435815434 +-2.8995279376930267,0.26934940504327815 +1.642761470292374,0.1405337747820465 +-2.312201238243446,0.3875555040255133 +9.616117696420869,-0.6170792849305862 +6.448569603144637,0.6557127080250862 +-6.865282663406226,-1.0872229107371294 +-7.597981307056489,-1.1466324825026855 +-9.502747504485477,-1.2541071996743467 +-2.429423866623704,0.17897551430932382 +3.124219465664875,0.07978930466644497 +-5.97195513322896,-0.834128665345735 +-1.1500137974552302,0.1084804301868246 +7.543354428730726,0.08915204859482766 +-5.130295726324313,-0.4777862734062472 +-5.907884150747208,-0.7645081808516919 +2.6446732804028983,0.3750527499681839 +9.015820565133726,-0.4285134816413024 +-8.242537103304521,-1.299641148469871 
+-7.345758791484115,-1.2483587732061232 +-4.1847979577568175,0.25895473716844647 +0.7195806061843726,0.09280599041736262 +0.32403992217242106,0.18030245283974652 +-6.926765266407546,-1.1354598330038206 +-9.359483691105286,-1.2096322089629907 +7.039742525345759,0.5489265661433901 +-3.1932558384163023,0.2085973147799065 +8.671565075387274,-0.2373156412604935 +-6.275462656286251,-0.9191686822210798 +-2.849426247818791,0.19994805260229181 +-9.324109877630903,-1.3602326263117512 +-1.8874352080300039,-0.028442231559088232 +0.5900172263845693,0.11166000423234541 +-2.9456335307366555,0.2472299914841298 +-2.8080468776171585,0.31184024602523897 +-0.05037736326451636,-0.02636236679042539 +-6.965416396771694,-1.3510611194302702 +7.434097484116112,0.24719037604177263 +8.641616060767756,-0.22534296824410363 +0.607694746681311,0.15506864239260798 +5.763435777114996,0.6583787302056037 +1.7530961902512843,0.31218884217910786 +0.8022411122302344,0.2827150258489559 +7.845938865829751,0.13959729729996634 +-0.061463891687566274,0.0714017521542739 +9.56268701966087,-0.6115892159845382 +-6.879828639816349,-1.19384343821984 +-4.893505124647897,-0.3531943548080539 +7.605985595869743,0.2589559193970556 +-1.1875166277594573,0.11587002300100388 +-6.261097715586018,-1.0295673825353509 +4.704703424292678,0.8120957938291475 +-9.274781394365764,-1.284278096056895 +9.315597589105757,-0.45607474508630835 +-6.208646029498412,-1.1356723327919944 +-4.039062676987122,0.32417348187766437 +5.494896763524091,0.8328345035296159 +-8.304763401129286,-1.1338503057195146 +-6.893948682309897,-1.3060922754323172 +-5.442032982901374,-0.6062703832565445 +-6.250151073649196,-1.0876765765217213 +-0.9532076852163804,0.03773993845649105 +-4.992499784150631,-0.24158571982120935 +3.4019146399491578,0.4277223884145185 +1.587966240526825,0.10390285942863975 +5.785174490390098,0.7608834428093022 +1.8069950755823516,0.2602805131080964 +9.549786179881238,-0.5212140408061825 +2.6189845072882,0.016140818081909286 +3.813005442892008,0.6663787857116911 +2.2628230469074673,0.21690132253755579 +0.27342080216731013,0.1703488309336215 +-4.768807542659674,-0.17328433269539156 +-9.293368859584067,-1.4152854111885247 +-7.958528419250573,-1.145872056390858 +3.1557979518190433,0.10922290972175241 +-0.20812233286524595,0.22507723241413902 +-5.096513338481143,-0.5659562277659212 +-0.6528675083110933,0.034132388985348316 +-7.275495425962069,-1.1542832532001503 +4.425155354122335,0.6678552967710977 +-1.0806749060023293,-0.1913531252401969 +1.4516224221162677,0.2404915582313554 +2.9034491672383655,-0.0240828746945011 +9.952266512149972,-0.6256345815737426 +-6.874834843300842,-1.0972349725787571 +-4.812114846298456,-0.18027009563060514 +-2.743440956071126,0.1050647466345365 +1.1263261145838932,0.3031035426999383 +2.3331262562577937,0.051593763485314095 +9.261434148805119,-0.35771110704491144 +5.403356852195479,0.7571644466647649 +-6.8567082424327985,-1.080162665126762 +3.1173320411287175,0.19411367856474152 +-1.8802926537312281,0.003933117536899247 +7.114868699551175,0.3712064253063588 +4.693987937841468,0.6172688631112562 +-4.879866201824111,-0.248813157258592 +-0.6532264202136684,0.13570069273591367 +-1.4171506594111136,0.26530806542771684 +2.2553373482914196,0.0480761593677695 +-0.12473822627197961,0.11508212778973442 +-3.7015647493736736,0.36972018614653945 +7.36602541174129,0.28000776135041805 +-0.1762903361421131,0.29379350772470647 +-7.940706303488546,-1.2704850054591155 +-4.710535097262852,-0.06752075661577225 +-6.862838262802846,-1.1771250245814544 
+-1.6458776146441032,0.09265878930631632 +-2.5131788893791334,0.27917801523905134 +0.7826092146268806,0.2525039130273961 +-4.6280408955431795,-0.16881769889718362 +-4.690963975150964,-0.08866477538062272 +-8.983499101355715,-1.3675452367564158 +0.6381944710669742,0.12423960056118215 +-9.66276523554128,-1.4141048616152994 +-1.2842878147788017,0.1689370449466685 +5.3987006015871675,0.6838466841844981 +8.775429625438747,-0.4079129082846451 +8.756936315689682,-0.2240015309589933 +-1.0355712785999138,0.03063969271270162 +6.732676619833935,0.38119306537520276 +7.778291027012653,0.1000915150497456 +-5.340168900894366,-0.6074642563889892 +-0.16353080055118596,0.15006930622253198 +-5.929839830691602,-1.193209133952733 +-1.490307360770604,0.09444041029056827 +4.338795782203109,0.5971579284174712 +8.9436176076352,-0.4467379598532891 +3.680461119662519,0.4122690190924381 +0.5881197924357462,0.12287760002261988 +-6.8761641801603055,-1.2787994064923707 +8.990225431483086,-0.21417708959151308 +5.127424671155779,0.7722497596435484 +6.992119853253015,0.4034396231113522 +-8.60753781796366,-1.3511214587536495 +-3.947982861042863,0.19182768961738197 +3.5778948536507738,0.2804487187038347 +0.6484075553288893,0.3390573616976926 +2.827928046052781,0.08053314317760554 +1.1294515094608713,0.4046046924797975 +0.7195904966360533,0.07108642191103826 +5.520908627640324,0.7177305375269952 +7.659015183212547,0.18766717192997612 +-4.2065893527387175,0.010153477113344744 +-0.251149161209657,0.11587292448757452 +-5.691967284282558,-0.9172948102599165 +-4.5579525985711875,-0.14773230338800117 +0.026247481555806473,0.2335067756395034 +3.3221499227591877,0.427783261153913 +3.5760258665684437,0.3380690010593381 +-7.94546180110121,-1.3595619642029875 +-3.743522988410879,0.29709245846312593 +5.3880216715699,0.8636779124045382 +-6.260046915670436,-0.9582103841642764 +-7.571314377166757,-1.266636099138061 +-0.14125754606522634,0.01565520399066961 +8.68848220996119,-0.23747368830086138 +1.0651246818671787,0.19694357172330085 +1.9224043426992132,0.27349671349281723 +4.325721640441667,0.6769961310523149 +3.0993258226542597,0.1607857815380399 +-3.7203947585639163,0.15101869422775593 +1.5795935945481396,-0.006672578219750147 +3.15816279198617,0.3219848514274188 +-0.38631647129102475,0.0987261114023503 +3.7106062598790324,0.1610251019640464 +-9.372773755056691,-1.4360303182301588 +-5.452585176284547,-0.643470798990013 +2.3256603636814828,-0.03460352338632372 +-1.7353117064358123,0.03612444087298229 +3.806332347122719,0.24951433233715245 +-0.3959980577303668,0.1384093888230309 +-2.0900695583313755,0.14077400487159566 +0.03731728288741465,0.24375443953410886 +5.465313192543997,0.5884304232962548 +-9.033933946725618,-1.1846591573693308 +-0.487674700083236,0.11442258459588689 +7.969587334277995,0.015467356287071275 +8.037056671928301,-0.07272636618766738 +-6.785942831965723,-1.2453454712751342 +-7.278988885402625,-1.124685221720306 +8.107297763012681,0.03247466825785926 +1.9057291952451472,0.29652247780990754 +4.584114982759342,0.7052004797317517 +9.784580421385268,-0.7573985321536234 +3.501398504141506,0.30701756723314644 +0.9381107838295488,0.06245469218573382 +-7.559636058765854,-1.1958930228968092 +-4.702652764363297,-0.1847197831334461 +3.596446695086666,0.3071425653834298 +-2.963888756896069,0.18769923860119506 +-9.400814392353428,-1.3759955467629283 +-0.3751792476751383,-0.019824738402856062 +4.455413941427967,0.8077340351222795 +9.302397449485373,-0.578637343230634 +-2.3740872367931587,0.3608984018587675 
+-1.6307349592997689,0.20480053180495683 +-7.559556390633241,-1.0943892911808286 +-3.5349331256940264,0.2567455819404898 +-7.208239210738711,-1.1660328298552411 +-0.06202930617547864,-0.00817772291444907 +-2.605345642524735,0.19524895634492623 +-9.283602633273901,-1.4365109007517727 +-1.1394151960528642,0.07685535461474605 +-6.960997749875651,-1.2615734397643994 +0.6342780185157935,0.37945474240793614 +-4.25178923299252,-0.01689583353317478 +1.9396931330434732,0.36872699218819793 +-8.975996678114527,-1.44682941075149 +9.327935430915552,-0.5294159825581747 +-5.956272328470575,-0.8946051020714385 +5.7820943719226126,0.6323787474315767 +-9.206885269120168,-1.35463791465672 +8.947519462828417,-0.3375902765865575 +-6.4376689155213995,-1.0223669172539664 +-8.181496847972834,-1.1945194646040265 +6.0489401359951325,0.6100793206971475 +6.648253013372365,0.5677606631548506 +8.794319147003158,-0.3682539906876917 +8.653071806403673,-0.19240202900390235 +9.803819407311387,-0.5144307935756154 +-0.091306784974714,0.17762778007022062 +1.9311902000035488,0.19563836993460984 +4.765681697261531,0.7824094874569044 +-8.111722265909904,-1.2684625985909852 +-7.036488330115208,-1.13448420739705 +-5.298714505175983,-0.5490487236322676 +6.528084317037496,0.41497250841777317 +-9.895643632946943,-1.543424531621194 +-5.909766705409876,-1.1013519475865245 +-8.47520830060428,-1.1739643806818068 +-9.837465950433764,-1.3564577111506517 +8.534373361978204,-0.15884499419620995 +-0.7666342089937537,-0.05709471273225772 +-2.987101996650594,0.29491768343033903 +-3.6271842137498074,0.28901942926870294 +2.223116262132954,0.03866889661721744 +6.765031810928189,0.5516299267169102 +-7.3905949611438295,-0.9934440470590762 +8.340745289641877,0.007529953938009218 +6.393752425694338,0.39846748706514296 +5.962033536636637,0.5710104032628531 +-6.536722364264938,-1.0450417580219187 +-8.898954041132288,-1.4650350962550283 +9.554714134602705,-0.5194657562237402 +-6.054387606558961,-0.9469586797894961 +-5.365683327894075,-0.5839725633060564 +5.3277138578891545,0.5417351240999174 +-7.15859742177721,-1.2500516774841062 +-1.1465322475785378,-0.03354377241726446 +-0.4199069779060629,0.1141932562210176 +-6.73360601513107,-1.129776638163474 +-4.686398419178413,-0.17669562858697435 +7.160454543295764,0.44052492802817367 +3.6646775312320123,0.33003946220008196 +-4.808531612805687,-0.30016591322406944 +-4.03485484171731,0.14775432023721743 +-4.126875774844718,0.1977334627084154 +-6.717553648788677,-1.1073690125806261 +-0.6210378328361301,0.26599545195838004 +9.080295393663633,-0.5664340635595735 +-2.7822798380137836,0.46394203984979165 +-9.82184349149836,-1.5752395592154775 +5.328818616787352,0.7293254613197574 +-6.091532598984904,-1.1096567967677253 +4.743493108463186,0.8671375492343203 +-7.138893033154182,-1.209725196834762 +4.130311730390339,0.4027233844882576 +-0.09690733988923839,0.15001704347473424 +-2.0511898815782854,0.26034631483851384 +-0.595965515736534,0.06596106666165566 +-4.872687984928451,-0.3444400288511092 +3.3605656233790033,0.36609598842036684 +5.510995763880992,0.6639194754843863 +1.6326473849198209,0.36514394037533315 +-3.266903418719327,0.1739310540921429 +6.525057944615607,0.62677419455253 +-7.530752959800555,-1.1670759683918182 +6.118379693407832,0.6510678040755805 +-8.703008716996862,-1.4196986031257712 +1.1970917720400465,0.23313461423036372 +-9.66847908271344,-1.376373798235831 +-9.430002884128783,-1.442672550065016 +2.1583673522959312,0.09064402478265278 +3.8190636653059684,0.4344149635529227 
+7.334253146579638,0.464438233588119 +-9.340003296529176,-1.2362856060431995 +3.5333093379307594,0.3128703389602224 +6.995968909185638,0.6177586966699914 +9.322174218020688,-0.41427372756900854 +-2.1526146353782227,0.16985933321999752 +2.2238647816307338,0.15841936391638964 +-4.8513700883027155,-0.47255109316225863 +-8.65775051268631,-1.1702546253143196 +-5.6724190976239655,-0.6616042240754153 +7.664231644092375,0.26364777455377386 +5.7312054745510785,0.6735686354296818 +5.825606482243323,0.7319178189759505 +-0.6245294849751293,0.007262406391936246 +-5.908840950681423,-1.0145802536369881 +-8.991415905425242,-1.3062068233776685 +-1.7245364800256713,0.0014087461754513642 +2.164654509877071,-0.006511054342998909 +0.9093072575692709,0.181346084317703 +-3.7852802809895003,0.1052349686060067 +-5.692358255012335,-0.7116635137720481 +-2.515106366717475,0.1837644527720409 +3.2232974045748324,0.26429470807382743 +3.413250548688054,-0.0034303997908344264 +9.974869882305693,-0.5317799960471995 +-5.284113012889158,-0.7470513838382684 +9.899865708115037,-0.6279225359296469 +-6.98910328113194,-1.1540789854610904 +-9.400173889741229,-1.5158481478979433 +1.3698400957414272,0.10329896427321383 +-6.785150309715977,-1.1359395584066445 +3.626869889082687,0.4815227872651458 +-4.649212258261244,-0.18111659076165862 +8.843691265068582,-0.3101550812660697 +-2.6145496522159957,0.2327392446488299 +5.628470379312729,0.8927296959437955 +4.005995246612471,0.30809713965918223 +4.964604565013499,0.6451463394969834 +-0.8308981874145651,0.26830473419931167 +7.270395460967257,0.1899730695564384 +2.2325576050072686,-0.009014434434569646 +-1.084222947023088,0.11380787795116755 +4.454908651712444,0.541142914693213 +-5.178045643760925,-0.6128212636511249 +7.861102721252343,0.10703707408535663 +-9.346258583857797,-1.3126321785250483 +-7.893272322534642,-1.2422952283280295 +-2.797716556426653,0.25939138851741717 +0.639760519439605,0.17502788712496728 +-3.2720044511572115,0.2758757599776135 +-0.10084186946479079,0.3245842058922768 +2.3143499152701086,0.15421359634596146 +7.238121147640094,0.3313150175825475 +-3.943434467692533,0.17167079270866448 +-4.249749596327813,-0.04204901172787365 +4.111461280448641,0.6307432414776466 +8.875828821741848,-0.23192777687643473 +9.483924320791346,-0.6255807197304191 +-5.862788720051505,-0.7148662595159928 +-1.3108639129208868,0.22113951084929392 +1.3145395779214866,0.2098719505717769 +6.703957774218004,0.44315240879866813 +3.6009721173795217,0.35045119006807474 +3.7610087957441785,0.1694937884716966 +-5.163936346915637,-0.4166418787801358 +-8.613955044899612,-1.3094925637214914 +7.480365459552143,0.17933756090652692 +-0.7691225083618249,0.08455989992260461 +4.308702984152175,0.6040401772186771 +4.1575562972494104,0.43137300088513564 +1.2457000024288334,0.1826417841129464 +-1.3952516418630658,0.01774645456787252 +-4.58809416096409,-0.050912989445684856 +-0.30916344762725423,-0.04599431486727126 +6.773127193281137,0.41116338372666733 +8.64585861503738,-0.39878099536037426 +8.367176048613935,-0.2185524147472948 +-6.971457107089422,-1.2012156575443358 +-2.5003853062598047,0.2950421626200023 +-8.44186704664282,-1.0721431252901354 +-6.226905978686964,-0.9866709828584307 +8.616890451534797,-0.21478439581637418 +8.91799506663462,-0.22734341043140377 +5.916673597490224,0.8674958400097644 +8.115397481644898,-0.09956435930266733 +-4.273748056094835,-0.10492677981820897 +-9.427235645371574,-1.2299506504914228 +-6.582374892122104,-1.0301046570024497 +1.4122016462913543,0.3028306116419188 
+9.444839552350587,-0.43816337076494255 +-7.402758100619966,-1.2269971656346694 +-9.689481271163384,-1.5464532945341447 +8.389932210230295,-0.09549844607284258 +1.8783247709161799,0.1455601532986987 +6.204240699719158,0.5438114365659449 +-6.158588078115951,-0.9031453762693054 +9.556419695559622,-0.5468785221946348 +8.107210888016386,0.0404189322583961 +1.6866220338237667,0.14192616067932967 +1.1602171252677707,-0.03176736281867337 +-4.358251429847865,-0.014205020581999843 +3.7714267295154396,0.3928223415848966 +-0.53846180810174,0.05985192873907509 +-4.003086366999353,0.15125462136447282 \ No newline at end of file diff --git a/axolotl/tests/data/datasets/multivariate_dataset_1/tables/gp_data_tables/train_data_935.csv b/axolotl/tests/data/datasets/multivariate_dataset_1/tables/gp_data_tables/train_data_935.csv new file mode 100644 index 0000000..4bb1fb8 --- /dev/null +++ b/axolotl/tests/data/datasets/multivariate_dataset_1/tables/gp_data_tables/train_data_935.csv @@ -0,0 +1,1001 @@ +x,y +9.967572757398724,-0.3424373641006067 +-3.3138576135052222,-0.0999335017121406 +9.566273865735617,-0.2877363788531925 +0.8398493063524448,0.0526298643064614 +5.611050457718756,0.2262029077173447 +-5.859241104722162,0.007893088609925835 +-3.0695672782363825,-0.018107071362353074 +-9.878731463225364,0.1883790609056862 +3.681144464684234,-0.17983983155806074 +-6.610459099220193,-0.19436907996559322 +-4.128920751022843,0.04527620089200559 +-0.6966198732248419,-0.13703939522913677 +-3.774846445139271,0.10847788226052335 +-7.905400131921084,-0.20795334812384905 +-6.649281200075742,-0.26305041259654094 +-6.8461633710038825,-0.17158850453581784 +-1.3250224649645048,-0.0270344143550397 +-1.0386443433883485,-0.08266532683863118 +6.341335736941339,-0.07474665201745265 +-2.986966821581096,0.03747773817375469 +7.969877451767594,0.255281797804949 +-3.5920584772928077,-0.0669883110314527 +1.5569549712164044,0.1183446417669443 +7.03255786725454,-0.2116310970249276 +-2.944372362837697,0.0855175859589879 +-4.800986136790133,-0.1145590724479461 +1.2119232112784992,0.12752044802981893 +-4.477786498158171,-0.01233689378935963 +6.429820470858424,-0.028035077153187572 +-0.10977800550519667,0.15359607586372087 +5.067269218466578,0.22391321564624891 +0.804296066161232,-0.003109333886247123 +1.8052232767704002,-0.049564627165645136 +-2.6795705239918948,0.13308330387076395 +-9.320389909754962,0.01951602085238812 +8.888324204993815,0.033161111656012085 +-3.8924599614910034,-0.061950280736664226 +-6.328598393500463,-0.14676194585867086 +-9.920480502253483,-0.02634582049270015 +-3.5270710080252954,-0.09416407407688474 +1.5092928660106786,0.10049570543615688 +7.187892336732666,0.12511108823201325 +4.518279316217928,0.08118532558555572 +-9.778293216498021,0.3443087880703366 +0.06412857600301436,0.03274721390972577 +3.890290112376085,-0.032016311533871314 +8.523397807508015,0.18254427458608413 +5.487674531082941,0.41206613148783744 +1.3856260584773228,0.2527861355296489 +2.283347575449404,-0.16611567182278203 +8.916259849485428,0.10334839629993739 +-7.3518419543811495,-0.27616904050940294 +-4.065731113879426,0.21281886338448633 +-0.7350798350912058,0.02603070238829209 +5.581776551587311,0.1136741394468821 +-9.189633345206808,0.023625069514362863 +-1.613798596295739,-0.15251548486950653 +0.09756019776152058,0.037309003050152105 +-5.3828193418212145,-0.042350719316464266 +-3.501305444668823,-0.13959937715549892 +-6.600855428159797,-0.24093766487684132 +0.06342691056950045,0.08275012153018763 +-4.60363057100048,0.09322257298171885 
+7.187503908267171,0.20483741271492004 +-0.1593447640158807,-0.07186706225497222 +-8.772489329444273,0.049972689623158116 +1.9789859130233936,0.19526606245344147 +4.816728924556388,0.3207346536533496 +6.55061411585784,0.01702565017653633 +5.128506077558601,0.2582903714895716 +-6.496318521805495,-0.3123850396397935 +1.7803733756158824,0.037519343879596975 +9.428663858697668,-0.1621811899873441 +-9.017215318249598,-0.10049523833291388 +9.17475019322261,-0.1517001167217813 +-4.318206221320704,0.13260508331522997 +-6.66256211668431,-0.21579036454218453 +6.26225541783295,0.069686540767961 +-0.446583701494351,0.003392938893627408 +3.1736540580035832,0.006131749601934716 +-6.669095949730796,-0.09979341270068148 +-2.647546448637792,0.2588256202853321 +-6.967342236584862,-0.4309732926370741 +-6.6693921722104434,-0.09110138369926699 +-2.9208811446303518,0.16902274948720786 +2.3553816736228366,0.021921567987682064 +-8.895265498232618,0.013446909659867168 +-2.9256215859241053,-0.07905625372149362 +-2.2449880212195654,0.25214370680313763 +7.118474085569133,0.07629124582548986 +6.378860487939192,0.18251617240113696 +6.171388206999219,0.17299035879817237 +5.863037595688901,0.15236356270727963 +4.607771873932265,0.06259243325857632 +7.516963457078841,0.12611509272622803 +-8.461227154184048,-0.08916067257497265 +6.248883966986863,-0.030373223025217222 +-2.1820101714181344,0.2851304430310505 +0.8754087771477881,0.3501617090252184 +-1.2724291431162484,-0.18388811289553633 +-7.679798207778813,-0.2738822011942298 +-5.803937758607599,0.2785850055344122 +-3.0518072929286038,0.02990659788091902 +1.1755648221346924,0.05480121516688512 +5.597995899119175,0.13646870507717018 +-8.832813567783887,0.09083259902145976 +-9.295635753094423,0.09556230658565246 +-1.1535027513021099,-0.23179111463223429 +-9.085906521100444,-0.010051229356103933 +9.01820332076393,0.15925142260629024 +-9.222001803949617,0.032532993395842535 +8.949540440762508,0.05848976708764514 +-2.087929394426311,0.22195395293312004 +5.494845002692603,0.39377224386032394 +-1.9211863484761658,0.06330380262279733 +-1.0248253902828264,-0.07638566138987163 +1.9240481347319704,-0.23477869595734227 +-6.73116711210049,-0.32761508963594477 +-3.349475970889806,-0.15687002820646506 +-9.871600398267809,-0.2761768069821729 +4.545768562700143,-0.04942206457068034 +2.7125466640267515,0.05439780491848989 +-1.563638121540606,0.1405532421320163 +-6.21780948409257,0.1295692001391636 +-2.7405268580996545,-0.020414873007959577 +-6.377862941398078,-0.042251892033892756 +-0.775972384017587,0.02098075951906543 +6.06038087723971,0.08182658313101204 +-4.997455015434609,-0.076169422689082 +-6.427564288944776,0.039492271646881474 +2.170401601648941,0.21386559198317823 +4.171873784164536,-0.10212866256799377 +2.481492060669641,-0.07723119066360065 +-6.31212249024939,0.009522198766332715 +2.1335982388445363,-0.16166656972973695 +-6.570631141483915,-0.241843568265478 +-8.594000742636176,0.03075326715840985 +9.157816977237054,-0.019554739947899857 +-3.1271870376974675,-0.0388475925412851 +1.7633807920723896,0.08624560118137106 +-0.2045225436738196,0.10859770103032508 +-1.7060714133913315,-0.07078499363166571 +7.940400489713255,0.3700352248621124 +8.179676700607608,0.3469904294068804 +7.225158004196487,0.14140963939532436 +1.0076313503188992,-0.009662080814653057 +3.86464390457046,-0.12496695827670273 +-6.130778132548159,0.25134456883201023 +-5.400333632016331,0.09663002339979399 +7.055468980184028,-0.11579002977769182 +8.006729073559455,0.393338700140781 
+-6.63834854735466,-0.30933066877975574 +6.239797248255172,0.11024364387113292 +-6.520501520060048,-0.035412281225686265 +4.756226282565624,0.09028449831868379 +8.008014139793957,0.5302541856932662 +3.2438592042919328,-0.07864394182488843 +-5.97128265551417,0.04828367995317995 +-5.627108736762363,0.1605472228175642 +4.886319702293314,0.22813725256681897 +-4.385959212871944,0.26534506298494753 +2.2388328015458203,-0.041729265666445824 +-8.542059311933254,-0.10633939761952702 +-8.566165223049609,-0.14975894070840418 +0.09858050067635737,0.30641226834300817 +-0.7364391617765662,-0.07596495490274092 +6.155469936869898,0.1685766921026326 +-6.776174717505793,-0.23023878956345123 +-3.53544700000414,0.007056966863372857 +-3.6757162758681705,-0.031856636031298693 +-4.572692855289793,-0.06143915627890317 +-8.395317337897687,0.10285148559627189 +-1.1278388298971578,-0.13122954901108044 +6.989103402360929,0.10552838030543932 +-0.7805479127598005,-0.04187692967480357 +9.118573434885036,0.09516196900878421 +6.9251803160440915,0.024982431110339723 +-4.949400143531722,-0.06387614932710976 +-6.484995445543532,-0.24817109832912276 +0.1556788648043863,0.19637378114845194 +-8.218836599062026,-0.1846746483031188 +-3.0072023872496167,0.15873408362794772 +-4.563168516773777,-0.10167278243152411 +-0.16634500375275252,0.03698386248408743 +-6.7386818904128,-0.2368603837054333 +2.1443899172583336,-0.14424418762363234 +3.2068324208717502,-0.0023304064285784784 +-2.541651567135048,0.06254132476526278 +-4.565773637473111,-0.28057540141968784 +4.324728911918019,0.055803396653919035 +7.227340616410763,0.029175920752660324 +-2.438585846171058,0.34919617568896993 +8.615312628259744,0.3816638158410448 +-2.755019995588599,-0.036981436940222226 +3.4955639327901,-0.0312992660588731 +5.669473899613013,0.2186432343867549 +3.9502508785381214,-0.07370921691635571 +-0.21257005045207222,-0.005242745038901847 +-8.674860356791072,-0.15387091538899442 +-9.748269169521294,-0.013539204437116988 +2.988107767892543,0.08029973701799915 +5.858100609135128,0.22338379278380072 +-5.023913417664264,-0.09888834891320253 +2.62505086341481,-0.14428720293164693 +-4.442704356490097,-0.2053432679259159 +-1.7873238687405557,0.2598885688469454 +1.7775986295637338,0.22903922879440336 +8.062135011295165,0.3012992703181587 +5.8280570571859425,0.2823115586183091 +7.254391594535016,-0.01626058190912265 +-7.999024580358501,-0.043619676215170655 +1.901830508992873,0.016107480013084163 +7.707700703287949,0.36191660293908756 +6.307253977715818,0.06816183031246367 +0.6800718016111915,-0.04555970261083357 +-1.391436852025251,-0.08127145714960524 +8.233932656374062,0.07211557504616012 +-5.718689406588036,0.16759237018943063 +3.0369019704034232,-0.18307236087577886 +-8.796186244301147,-0.11842361442048319 +-9.043085519193085,-0.031175204405600995 +-4.717655496247204,0.02156138623716146 +7.515175910129655,0.21267428595887083 +1.1188493185648731,0.22976155471367182 +2.1612163232477637,0.04352151553216408 +-8.111025486194976,-0.1377596172391955 +-9.965975152661635,0.04290392262750232 +0.9088380291168097,0.166774453653518 +1.2708644096936528,0.04382924882795476 +-1.5632510449713912,-0.05114323579104346 +6.700156619705808,0.06105209329603154 +-4.41156017076834,-0.06524064891605816 +0.9749945171571603,0.13199579574967307 +7.954722528977346,0.27600450601370025 +-8.830845568788895,0.054676532394275765 +-5.95608345401477,0.1663154544430144 +-6.698206561202574,-0.2960632670545268 +-1.2397924798829951,-0.20601430116166408 +-6.479786479463051,-0.11888903541611322 
+-9.627668078741305,-0.11510073936246284 +0.8117172863570055,0.03772906083088762 +-6.335258545241732,-0.0010144046364390508 +6.14921897728086,0.10139376271964966 +1.6408789807148771,0.0659726029622174 +-6.040642474377948,0.20315186648354355 +3.940732814843404,-0.16880189540672907 +-1.0569687304338515,-0.11629643133731267 +8.164373336028746,0.14646810874523017 +-5.2041801708752224,-0.09181040680074345 +6.143409159393896,0.15717717376020934 +8.785916341139743,0.17330659825464467 +-7.98032989552517,-0.13295815762503976 +-2.3570115398258285,0.40397292545422725 +7.884376028006827,0.24150259231921709 +-6.332774536835907,-0.06705984747697895 +5.19311968375218,0.3738687824539658 +5.766865442758062,0.19655371392826482 +2.3036728488939358,0.07160857771575646 +-9.939469048024854,-0.02754928500914777 +-9.929661203249221,0.15648215058903148 +-6.207376286673841,0.06353520454190818 +-7.431604057539008,-0.13220268839533322 +2.163739029599112,-0.1007171616461818 +-8.524173383138027,-0.18285860685882088 +-4.214270303031293,0.05921335801498795 +-8.602545461314023,-0.01892566560779567 +1.8850643978167891,0.1690370822209037 +9.765821082390072,-0.43132478491938725 +9.420215389343856,-0.12114205939325412 +-2.9194627045091477,0.15993979860550323 +-1.5563838270667354,-0.21255418831278416 +-2.252699141083841,0.27646009916898634 +-6.841458413441196,-0.3563374634254618 +9.84564397059043,-0.4459966827130565 +7.308487312602981,0.009651142030508625 +-8.880578240956435,-0.06380344801773671 +-3.6579546342425395,-0.11502919262442152 +3.6331684541341147,-0.012222658743174311 +4.597784952708931,0.10619383110853822 +-3.6878048619433734,-0.07447225256062509 +5.17554447792962,0.4355079646471063 +6.040784342217581,0.07607341272385285 +-2.9926148815474196,-0.09657448243380655 +-8.821128029050538,0.02753180703717343 +-8.96352806999381,-0.1489388443207462 +-1.0689347533024485,-0.07256706962646747 +-9.348027849699704,0.045835691892519154 +-5.774165788704767,0.17773223332364899 +8.59824788740546,0.021580285993601983 +-0.7370247362353908,0.0685534319779518 +-5.218567520638043,0.061587194956908675 +2.387969078813388,-0.0303712982782024 +-1.09883240870996,-0.1766944361693899 +-3.118277545767722,0.052762471081232526 +-1.9693404952797078,0.22724333271797925 +1.7333599512531528,-0.06880736140954156 +-7.797964061328862,-0.10306733613030548 +-7.5864611760104514,-0.19460253963649338 +3.0652774018771605,-0.03351469942285694 +6.432240708691811,0.08759064918829434 +0.9862774202772648,0.1233264572377217 +1.1678046755965497,0.15332034615975706 +-9.268409971337217,-0.0024389718816908833 +6.465600734398635,0.1769491012728946 +-3.765430763983289,-0.14099626723226766 +3.819430845795786,-0.06758465937426453 +9.808485158219806,-0.3033897515963421 +4.920250614714657,0.39686805743800124 +1.4398931522931946,0.07992553840246275 +8.231383193921612,0.2869045170619167 +5.166037191555217,0.49757723442564494 +-9.261882166777646,-0.02005555264333804 +-1.330997147771713,0.11570587202952277 +-9.949946012662476,0.0029846799976199732 +-4.680787256203258,-0.1569467797176456 +7.660276291410661,0.4655561147906234 +5.636376790859693,0.42677160905619504 +-4.447889744810297,0.05049314857634554 +-7.832959192429897,-0.159247333763493 +-7.017006618177043,-0.16632445251642694 +9.019481264132384,-0.08710686162858011 +9.772891913455975,-0.2788227634936652 +-1.1703319436603863,-0.022496304073455292 +-5.419698251956637,-0.17025982670714496 +-6.7860631086272285,-0.25773818137019955 +8.983195941925683,0.07431509349247806 +1.2270899085052847,0.0814015391514391 
+5.3849829796039295,0.237444489893883 +7.177774178317983,-0.06016648201335437 +-8.110406930755452,-0.096868597126345 +-9.563326402235933,0.05208477985874286 +6.984353572070895,0.0025181562481671844 +2.4353556065452113,-0.12231020365806794 +-4.973758492589608,-0.11026526842513945 +-8.941957031330322,-0.014830455302142128 +7.209769686834115,0.11332929253575792 +1.4902819937373266,0.06290982362285037 +-0.6248582722082041,0.0559162558679135 +-1.1511483822748136,-0.1570679892723558 +4.299543032329609,-0.09358464545504674 +4.519675503102082,0.010487856850798692 +-3.948866017742043,0.01738753963385837 +3.3779841728293913,-0.20520373599081712 +-5.260558348929457,0.1052587727166399 +-8.193817780940023,-0.07651514498300871 +-1.0340995145802232,-0.26150242491568426 +3.5929285744129698,-0.2798555591164874 +-2.045517278442741,0.21760362626655438 +-7.441132781429008,-0.4695348883891566 +-4.483225148698192,-0.010709428666806124 +5.847634935099801,0.24432001466120581 +1.5983386765449747,0.1373591195629101 +-3.1335408200406167,0.01145919859868446 +-7.011008469153225,-0.40332360999396827 +6.482869950481608,0.07383034033910371 +-4.225808112792819,0.1372954883770951 +-8.719033374268918,-0.06652181010565077 +-0.9916671393171583,-0.04386321829687144 +6.05652087618768,0.18157084051250788 +1.7368524105280478,0.20858662475082296 +-6.397610817400236,-0.12198201168457515 +6.364206958311257,-0.040089632393858515 +6.015007589200536,0.18622683963201486 +1.0302137784149998,0.3964519561615205 +-5.235940281065838,-0.18462249483890342 +8.0887898652689,0.35988757299987834 +5.985128517338726,0.25204228199069717 +-4.740071327280839,0.014988498261699096 +-2.7830103705911036,0.10549218244806034 +8.984749132102383,0.055720633914456974 +0.35542795413170936,0.24654563360740678 +8.13481818625517,0.18771730725700536 +-2.2250816606946167,0.20508161440508293 +1.6295814330855674,0.02154662747650353 +3.6482982414121246,0.001972287353981597 +4.992772443118106,0.3651941644682625 +2.210675082518247,0.012738631649441591 +-2.435730873299416,0.23473806444540812 +6.44729238950552,0.07674821560734785 +-9.024475230022155,-0.2751906996568835 +0.4686831641866682,-0.011909552872436446 +4.749044852975693,0.09431203550019482 +-3.4339212873841913,-0.11592581176557745 +-1.264471200486458,-0.1753661082801642 +5.950352852080796,0.21714954707470976 +-7.958631848988208,-0.14489677935439832 +-7.040260923468221,-0.22696117327771348 +-4.084578048983637,0.08155500756435842 +-8.242877720286664,-0.16653053863303113 +8.08304339197299,0.34615724626551525 +5.824206317994074,0.1892680766937805 +1.0965063290670471,0.12045520831141168 +-1.5687836362256427,0.02419502131106933 +2.771187097132937,0.17852047609350447 +1.1994262973931402,-0.008671343444488167 +9.460095412412372,-0.1853093162749306 +6.235981801463208,0.22745114600869665 +6.097053510740341,0.032806917183037757 +2.985043019303855,0.0051289355340997135 +-2.5530298807104135,0.24529598269768038 +6.865063793761156,-0.04262150355806575 +-9.319183194079066,-0.11715267668551207 +5.974354037867702,0.08202727003497697 +-0.03157955785385269,0.0630325675653528 +5.241102926487059,0.17634976936984798 +-9.446544485548728,-0.07459236531317832 +3.4622292901040237,-0.11143698575605515 +9.635763354921842,-0.13911322693058867 +-5.777251134611747,0.09092149357944854 +-9.076648546321561,0.07042313194837609 +-7.7825306204946685,0.05384288721725136 +-2.8830835535400823,-0.022328831845564723 +7.249990216401155,-0.03399505713309503 +6.85434395604772,0.16920937342808315 +-9.059206095609031,-0.06252248808557893 
+-0.35585367604467066,0.07928395678337331 +4.63548235919388,0.05869834449943664 +-7.8199745342237605,-0.17912425589986986 +-6.576642818407009,-0.24745742302096108 +-9.582696598093744,-0.04753491560209016 +-9.781300587662933,0.2076084869264711 +2.400500655329964,0.040144796720587905 +-1.4597829041458112,-0.1317513125270982 +-0.2508493546660482,-0.009250514723206843 +8.29935670015125,0.37499557805902173 +-0.3763180551628018,0.1656061511025229 +-2.432621863733262,0.2789369021058251 +8.711535558860067,0.18595968694819964 +7.3452028301723695,0.2822317697863428 +7.527566754377748,0.17251970423187354 +1.7204049595552071,0.026861036292108273 +-1.1097409256134547,-0.2105047429514083 +0.6164727175787732,0.08272732145657487 +0.4027581056908325,0.09959379655139186 +-3.436095578564453,-0.14186582590692612 +4.279288362743161,-0.11466666138440577 +8.121461952630153,0.4272392650000316 +-5.940242716128786,0.11967609416067629 +-0.4492205785965808,-0.02043561335281098 +8.67635136420656,0.17890134589791215 +-8.309372520024901,-0.16008570568141156 +-8.87153252488855,0.09812827774420349 +-8.28200489973035,-0.28655743876093764 +4.420521314218625,0.04365152397314024 +-7.796475788902243,-0.27905679753640467 +2.1623229224797846,0.11310454564225986 +-5.729679310615513,0.15315123372833486 +7.412779429903221,0.18262731819179168 +1.6137992269290855,0.1865813239769276 +-3.2154878190466856,0.07440195624711991 +5.989614523883887,-0.03616349574335745 +-3.9895605032793253,0.08035897478271606 +-7.168175126293161,-0.320844157842872 +0.1550857123155822,0.08767812137813066 +-1.158995126446051,-0.04346333132979778 +-8.658965794079684,-0.040622921185820435 +2.2784139322806634,-0.012742954387222892 +3.5019241075086565,-0.17118868123668318 +1.6151933477972946,0.19214992832802077 +5.4558828046822185,0.2787930700859517 +8.066454810085489,0.4758890266031382 +8.873562798959696,0.12930009052042227 +7.673767466348372,0.25628502610871196 +4.530740135855066,0.036196573207364746 +-9.2910253721329,-0.019651515907222028 +2.135049588987208,0.027088793530989464 +2.508005256974002,0.08609745622399483 +-3.9424617161667985,0.012804473015576696 +-1.6197198163214956,-0.2384118816534777 +9.697813927981215,-0.17632893557058543 +-8.643999285580833,-0.08027272535776839 +-1.2961671849182554,-0.001789592276684765 +5.101731305491878,0.16853149232005127 +9.465543673930426,-0.21841342571260278 +-5.317989819596164,0.1322541867308756 +-4.802165334189596,-0.02132154142925973 +-5.846317943541429,0.211299701237405 +-8.487877322874592,-0.2103238747901011 +5.818940151094276,0.2567536855382094 +3.020433184175646,-0.12461692016721512 +0.9652823053186133,-0.016231618148236066 +-7.840307331503865,-0.3088150928419844 +8.219014338392533,0.48851014059004916 +-3.44364672492369,0.007544349248675142 +4.583000403712356,0.013205687867744745 +2.502844959695514,0.05551140802185911 +1.2017053429612545,0.19399060639315474 +-7.870577958345454,-0.23962788186560496 +-4.517698637474474,-0.033662484519088064 +-7.148779446881317,-0.17246810117542785 +-4.467898426891384,-0.0018218110797528107 +-9.304049465512058,0.11212194536063644 +-5.623741836134473,0.22534715326924043 +-4.4478405288451395,0.14291662931952592 +-6.550756511737696,-0.13364921501223176 +9.691323933407709,-0.3647427073090243 +8.612623810241704,0.318809248462436 +-4.970290411251245,-0.1433094269888962 +9.48290662328979,-0.13784292710467397 +7.724682697597637,0.3599730723294387 +7.8312414277091555,0.32113776147683715 +8.475077204604396,0.272456156636734 +-0.15144652819599536,0.006792686280146121 
+3.800297672491375,-0.28056922660527106 +-8.547519457002473,0.09217277176629435 +-2.5893245079747462,0.12138786929295015 +-4.19900514762781,0.00487232937216385 +-9.02975112813396,0.00492576134768483 +-8.47885707852624,-0.07694162387535361 +7.471339985047599,0.26405213662178206 +5.362508367853231,0.19131055293395383 +3.0700699084906584,0.029145091767318686 +-9.990034188609815,-0.0482539799340651 +-8.729436724085055,0.1645288498875634 +7.073955846499356,-0.07966320901719091 +1.9297630123031126,0.05664133542985709 +-7.012656754489367,-0.23280640112519152 +8.218595804244213,0.3513107495469035 +3.7338591491297457,-0.04414776615074445 +2.229854753942963,-0.05943410730954944 +6.727180065285413,-0.08418967160648648 +-0.21563249154958797,0.025838319215392015 +4.558854649410385,-0.057788160681777466 +-1.0243575453749223,-0.09309562203274946 +1.5103935976897418,-0.04188682385434017 +-6.742799269771176,-0.4099935892562131 +-9.324003442532913,-0.13055264267493744 +-1.760808362504811,0.11078823816146717 +-6.7602660184739705,-0.18027663357947493 +-8.69018617257208,0.012476002882067114 +-8.919813716478586,-0.12516415935142863 +-3.951059150928526,0.009212974509856843 +1.9692722340699333,0.08286694121293872 +-8.715427564479128,0.22154090006268468 +6.610294151244958,-0.02086536074754777 +-6.0208438275524365,0.02584648786941321 +-8.044664585047382,-0.2554634658719718 +-2.496380629439137,0.07112302458527991 +2.5566490496831396,-0.022726005035434822 +6.8438231413819075,0.07934921355013143 +8.715776079044044,-0.0021202202412892446 +-1.5427612738162377,0.016637599882247243 +-0.016787984605034723,0.021191732718534834 +-3.0932190244072455,0.016850434164575476 +7.608053914716923,0.14095387240952986 +-5.722199106799648,0.08024420077757324 +1.4458710551769158,0.10203299053927116 +-0.87687723611214,-0.11388030009134095 +-9.416746621356712,0.08891448332399765 +1.0179624712929236,0.10072554721965404 +-4.168957870688104,0.12518133772535622 +1.050131744071777,0.048275925765386474 +-8.289427934409737,-0.1067542245976798 +-2.208791817268727,0.10442024550558926 +-2.260874737523979,0.2774482453670973 +4.490419017841219,0.09989935717545367 +-6.257704972615991,0.2826949179051932 +-3.651712763912776,0.12439505369427635 +4.120855548395866,0.022152860205359476 +-6.309072632192908,-0.04897533081578663 +1.6923762320744586,0.14710730894852947 +-6.538547856791354,-0.1468041293813352 +4.4280158990458585,-0.05300841571192097 +1.285793618988098,0.13476416450521295 +-1.05479350149675,-0.2206945874372357 +-6.04462846652443,0.008477108277206297 +1.6928979431557423,0.29015248353103373 +1.0005510336166914,0.04388187196241995 +-5.727732536523815,0.11583289691803735 +4.498637697461284,0.10146777898993035 +4.443855199187068,0.03483015573206535 +-4.697502226738521,-0.01055066652933763 +0.8660544388659108,0.10489036698085323 +4.299165908230179,-0.25204449739863993 +-1.8895330021858001,0.09372478350457289 +7.939659822603539,0.502052084837926 +-7.965439820373096,-0.1961177297524172 +4.223107134044692,-0.21519047290428947 +-6.890583421902051,-0.23908462479648687 +8.185451294611582,0.3962641746477985 +-3.950032774370054,-0.10941023793826984 +9.234207251194757,-0.0011714316309191786 +-0.39846999170530495,0.057660947187921696 +9.699616589727608,-0.40317218388036813 +1.5774689223870766,0.040634854907369955 +-9.833211910415049,-0.01631138093218052 +4.811242187005398,0.08581464017276562 +-5.7466520743515215,0.11418935177851064 +2.5560158602247856,-0.026989161925612015 +4.505062226898904,0.08566028343261392 +7.837946317837567,0.5601782573365814 
+9.774691766598792,-0.32814004882118697 +-6.841004504989287,-0.39480924491745506 +-7.9574762169075655,-0.16076501805801938 +4.825465684093855,0.202076014607758 +-9.550990588790057,0.07651702317533136 +5.955655178256901,0.14582426869852314 +9.662172035664241,-0.1408120431127582 +8.356731212922494,0.28926168875638236 +0.12555170393850545,0.2311469486737177 +-9.584361663852125,0.13605486410346593 +-3.3039474642365807,0.15430662610186718 +-7.50921459501631,-0.08921053693976391 +-7.865377334818796,-0.04248483440552567 +-1.297966647367307,-0.12613635902006293 +-1.3935684629553826,-0.27018540649092376 +0.2654777830836572,0.2502333607866021 +9.676141116403695,-0.17090142063385905 +5.023290370463309,0.1840331503819928 +-0.06809796508844279,0.06743293862051733 +0.9959541969788219,0.014317102122817948 +-6.438430523038434,-0.1941628626156978 +-3.003786215103146,0.09902009870619571 +8.54080771367996,0.28511760015779875 +-5.9120030415529,0.09218339249524041 +-3.1125175354662145,-0.10639335316291981 +8.778452362864371,0.018268749061147282 +6.714559038108313,0.12678166150377387 +-3.5819147407283443,-0.18385603767001657 +2.7069852330653,0.09794275105598882 +0.32164502457071364,0.21194227884793765 +4.211202506570007,0.050281152798099 +-4.373179850796532,-0.1589200078397388 +-2.222301710077268,0.22048878512058107 +3.7417949244668147,-0.2340775863214189 +7.730844655412881,0.4624911567261905 +2.5857816654988457,0.13737122904592658 +-6.7196350426503635,-0.24804874099932425 +1.1334278296985865,0.04935721287262219 +4.390999761198419,0.03888171304012254 +2.0765551197602012,-0.06787364480980951 +-9.726721590861324,0.12013804371110837 +-9.03954521867172,-0.07004180286817496 +-6.632974609591863,-0.09905875696315382 +4.244283981570254,0.011013817992176628 +3.3308987015973646,-0.016815339894146582 +-1.9716696450483475,0.2559759491200163 +1.76620241530558,0.12356297182821785 +-8.664720535625673,0.05649845450374818 +9.05256879229789,-0.004566707618828783 +-3.908918098166105,0.12109718007916083 +-7.980362375768549,-0.3077625593212933 +-3.1651961532806894,-0.099425790953974 +-5.164623681847225,-0.12123478223880618 +-4.114029512507621,-0.01238755361436273 +-6.7099954328433,-0.3171592450773042 +4.15065800562914,0.05283687820824649 +-2.0217098644092206,0.13877186688754406 +8.68931659185131,0.24466216456089257 +7.43879689488184,0.2006719767766608 +5.457721641318809,0.27006938464471003 +7.159427235397011,0.2447176297669265 +1.53809344889034,0.2797460862812289 +7.705782784121617,0.28882634271465146 +-8.003406000743052,-0.1384095370571522 +-4.20281536242539,0.212528833183742 +-2.227092072994905,0.2332299726437581 +5.6183369755497345,0.21725349813220474 +9.311525000450395,-0.1690782576265351 +2.4940162857634363,-0.030379738541914468 +-1.1969286077206132,-0.2393061483753183 +-7.522878623112259,-0.1803633938364558 +0.8509315330069711,-0.024841081736422244 +7.176141748926629,0.08367200783038764 +-8.125031502688486,-0.18495444838179767 +-7.98653833788154,-0.07120366108973383 +-5.4508539871438,0.07522197805332764 +9.081306927172562,0.05547563703361983 +7.073090926303137,-0.05193545007069316 +3.5925034199909907,-0.04041865831384352 +-2.0007044396837377,0.22354746784360244 +-7.773623320542118,-0.2168835972364266 +4.135494474499048,-0.04318716577573437 +1.2547555432544932,-0.02157420880264621 +0.857999203582068,0.11661706193547708 +-4.907551455240341,0.05295067108734239 +2.8938761861562905,0.11146943760769357 +-6.908743063460925,-0.1632549358574393 +2.385726982079199,0.05236667187995775 +4.985659531962391,-0.03619116775287523 
+9.752778871810307,-0.25256991575069476 +3.952658826930575,-0.2153790224787404 +7.423802874764366,0.33197231300826335 +2.8540620843962383,0.08052586003131684 +-7.2660061400615525,-0.37601659786533204 +-0.334381313812564,-0.11252195137652998 +3.833120202793619,-0.19751673283905524 +-3.769092003084795,0.012000009618943888 +-4.056382073500746,0.14060951897808424 +-1.2255967560370795,0.07998649490139825 +-9.873647647822548,0.11059346946301196 +-6.080154942536286,-0.04519145088513227 +-0.9542714103908487,0.1671804155339664 +-0.35027761203019203,-0.057887866503705906 +3.1492002166332296,-0.19846432510248296 +9.647185428577252,-0.36751741412223315 +1.3696896672344891,0.17050514525001734 +1.7203328727230094,0.04585110113559924 +0.3129020717608064,0.04466807336177077 +-5.624345680075873,0.16969937224124287 +-0.9519107780096725,-0.10723981384971817 +3.1303781438859564,-0.02475204809065372 +-4.919207559353026,-0.01874913270502203 +0.9530767243734886,0.23196295755908697 +-8.542188405558356,-0.1244489523911241 +-1.6375146017916329,0.12758625756295142 +-3.157571165550639,0.069594840707787 +-7.748923916636979,-0.2781774362131007 +6.501146508174259,-0.004878013147397789 +-9.664628995285632,-0.015158363405940657 +-1.5772744904541902,0.06159760571218746 +-3.4068031951903244,-0.22296983005216608 +-5.254179085962129,-0.34646717733424304 +-7.925936137061612,-0.25264763300747634 +-0.18112759124232625,-0.03640427247979558 +5.044972847005077,0.21611737631909722 +-1.8958109773521326,0.27056619984931973 +6.873363444855496,-0.0819621051015949 +2.34224329003792,0.03554076058758492 +-9.099209567851755,-0.023644589627075684 +-2.750562177013549,0.20879164143689857 +8.440678249973619,-0.015130658932242746 +8.865640382502761,0.2255657208136668 +2.496001022378751,0.04194257875184233 +-3.5999514492883034,0.04826890504837439 +2.9231168523170896,0.1258438715474015 +-6.4710995671747265,-0.16647288158375348 +-7.627477516552137,-0.25483162887780075 +1.3488310053389974,0.16396152502030478 +-3.7337176219912394,0.11172446181099992 +4.105030701148888,-0.1646522205861969 +-1.6309161863161936,0.13972416423849998 +-9.180855357759555,5.788613495685754E-4 +-8.999350630266267,0.04218304015949157 +5.946941239584618,0.13464727295513806 +1.1769356992262878,0.21035962831374197 +3.329543900289696,-0.16592239658534474 +3.96624157945916,-0.20426069024582316 +8.31044339535503,0.39647534799764433 +-7.344055504495589,-0.30070001891782994 +-1.398702561761418,-0.039293753224549345 +-3.961574434761395,-0.05085398664605014 +-4.455579803776889,-0.13700411079554878 +-0.9716490624207985,-0.0235168100542765 +-1.1697440749176646,-0.16959695911481476 +-4.735485745441906,-0.18291803876943513 +4.491077907840374,0.09598136133882158 +1.55635967154819,0.2246447314280144 +-5.225288200607871,-0.018483570933756008 +-9.499106281246483,0.09083974608552768 +7.117110904958416,0.05761235432928684 +-4.4338461597258,0.02407651412680068 +9.168614476702068,-0.08993978168776778 +-1.697309268935875,0.034468580186338436 +7.6101127582398655,0.3761850217029957 +6.801734444364541,-0.015388515386390245 +-8.214359030928286,-0.26676083992801025 +-2.7332720297533175,-0.11043574902824702 +6.787928883416355,0.014924677283915076 +8.96139190554024,0.002159748645080714 +0.6621279025642846,0.08053322180981862 +-8.807398371367228,-0.1628103554648606 +-7.256113231513378,-0.35687625734178136 +-0.28224635561062783,0.036521755434455974 +0.8680102923997026,0.181596195377826 +3.501226576287873,0.015003485056631738 +-5.555679270603049,0.17684543745540526 +3.0976693996623124,-0.09183126458036435 
+-9.530956996269456,0.012922060188938852 +-1.6100491169792406,0.17952410684030035 +6.054754916798672,0.2523579224077128 +-4.551292348338297,-0.10714927198707208 +4.407125407388953,-0.024752985131657537 +7.913085141381245,0.4329444115952579 +9.618308253431383,-0.0985476255671312 +-6.593994577747765,-0.2375572777032085 +-0.6624796030698512,0.10609960154359567 +-9.778263534687355,-0.030447928664824674 +-6.954751458544806,-0.3850835971690809 +0.7375333088890592,0.045179651304052076 +4.757664370114089,0.11934735563705776 +4.8887626179895705,0.30405015501147725 +4.035138899719355,0.07300400343639128 +-6.077814418665327,-0.041391601794970714 +-5.340409657941279,0.02047785948950948 +-0.32943803851849474,0.1662022242225448 +-4.067277537714764,0.08196528973668306 +-9.471315210031062,0.03351514788588212 +1.55407882373051,0.2009132581416639 +-3.8193381844587333,0.005634929205484189 +5.068553934926317,0.16507483345955043 +5.49309720218902,0.365113895038093 +-7.140499604929321,-0.28053692528248364 +1.1915227528257688,-0.0900212521921279 +5.487261094555716,0.3274581989785552 +2.5536147091820776,0.07998518191826122 +1.2828777965760807,0.17540442938414597 +6.72758495247006,-0.2005845446138496 +-3.1844641399306806,-0.04522882038552849 +0.9071585235668564,-0.07352475179435722 +-8.008582804179266,-0.032419466376572864 +5.441939886794005,0.25145002455004417 +-9.179039007469608,0.11989704726569989 +-1.7178832103506192,0.0890568956395869 +-1.0578615913197975,-0.0027812444437481376 +6.026900076906657,0.18317271338180582 +-2.424622855384357,0.43577727052221144 +6.845889679770284,0.056205176477116134 +4.711637203171787,0.11841468168073604 +9.633830322392573,-0.061623296874005196 +5.849948031316167,0.09588292960893807 +-4.894975170854012,-0.2105536613408157 +-8.739781835323551,-0.1706004290530344 +-2.8729164735197,-0.03715290204281549 +-5.457348420917452,0.12193381597450939 +4.538188342507139,0.048380611505168224 +-9.16801171060158,0.11749673388447582 +-3.808543596236284,-0.18577001755784028 +-5.322163744425147,-0.019300507727643676 +-0.4382605731342615,0.13803816315190218 +-4.93466302775181,-0.11497953873663201 +-6.5819869930622374,-0.07052650098935007 +-8.25574825149161,-0.04655778993978911 +5.529575059626847,0.44022250629488957 +-1.7427302421305093,0.03434848571156877 +9.490115291135108,-0.1516200599171818 +-5.873589890460114,0.0999393826752326 +1.0533669565882242,0.11422313917721726 +3.5106834724153795,-0.3516820895241479 +4.789591349882244,0.17870466051077 +-7.596753525758594,-0.38940593209509444 +6.16426440200958,0.0760685704787107 +-4.384407213610881,0.25265366215350465 +-7.2454066713773635,-0.31687253888295447 +0.38083507572732245,0.036652519898593416 +1.0463725210888075,0.05566181457818181 +-2.709296452154426,0.3379579881799496 +-4.176876009347366,-0.095474997691177 +1.9394445933520323,-0.0063663096993655105 +8.054282496384381,0.27564455334631693 +1.544623150874024,0.19299551183101277 +-9.7042906063438,0.11096481013883525 +-2.7303077337015265,0.22591295590150182 +-9.804208500230885,0.07749108670103705 +-7.005996346634432,-0.3971448222852011 +-2.583980331747515,0.18292598012634073 +5.4464482205758635,0.30695880392838404 +-7.795193034546628,-0.108963395255759 +-8.21531788187357,-0.11319695658635966 +9.236407281203938,0.09850636949474224 +-9.86524650010043,0.05281520750068537 +6.548826258234431,0.08497181995227751 +-2.2067131814737944,0.3297441131980954 +-7.706390073023183,-0.32748739767510604 +8.602325074812505,0.1840028820666264 +-4.604107595199265,-0.021648546491884836 
+-1.9388102838555366,0.03341679389438501 +1.4760025192678263,0.15870003754492296 +-7.020319709458555,-0.18436274316528029 +6.27707558600398,0.12187501333735934 +-1.5821428121779633,-0.006370016334934204 +5.974678740002059,0.11379647358666156 +9.079139512181825,0.03265288272901844 +2.0178771770400594,0.08826641917550801 +2.426432335429101,-0.15644322577933034 +2.7387816131417875,0.008995196875664043 +-8.671003071951574,0.02169834878061746 +5.178033606446606,0.2862819626410465 +-7.680951077457606,-0.22443871046350353 +-3.481909313158869,-0.16666056553997544 +4.9516700660880595,0.4458551092345193 +0.7077128422951482,0.04819679768384999 +1.243290082229521,0.07932998661704155 +-0.7537372230210302,0.01132711709459544 +-9.332061467936228,-0.04436370095055073 +-7.033462561312946,-0.16320198868261834 +5.7439345307105185,0.20772256039762624 +-6.921860145297046,-0.16913194341630505 +5.054342515314447,0.4643100224628224 +6.250099721667674,0.15744936304983287 +-5.879969173812757,0.10824196747395987 +6.865466616904854,-0.20779451547278782 +1.4221033020584617,0.24029608662794213 +4.880760860441647,0.23946882867905192 +5.513507085390144,0.3135203695780516 +-6.859159558045861,-0.15602383697316782 +-1.9537222276255974,0.21801873986396447 +-9.782580656406182,0.03306294732632986 +-1.8253998287916673,0.18996724002935783 +4.4427039949449885,0.09016517884940652 +-8.93755096297907,-0.08555423193166378 +5.7671527033361425,0.409766524274489 +9.903662800840717,-0.39745871185505677 +-2.9677737453388886,0.1638869832002772 +-1.521835137424663,0.11446223618853138 +9.115510094664163,-0.047358310688860594 +1.647324890355101,0.1419782613073069 +7.816442417150533,0.3471454632901434 +2.42650094814179,-0.15973621348713132 +9.192910646831223,-0.115151638943555 +-5.80895715625088,0.08473618542700227 +4.382872811768092,0.010384067927423864 +-1.5424833828735487,-0.14122672797011432 +4.238167907530425,-0.021991618455765992 +3.4515573035483667,0.026246261675813926 +-2.0028465608773023,0.35346781783128056 +-2.7143380969524915,0.25269805125547123 +6.493682698579973,-0.10669138278290767 +8.449147580657375,0.33738941233908143 +9.743607325121989,-0.3682834979067883 +-0.3130946471612148,0.030367052970418914 +-8.468201275123292,0.00250873531439344 +-0.12073933391631186,0.13055066226729067 +-6.453607698759921,-0.0454367735089552 +4.5843015321294125,0.08357504759823704 +-4.42627707944213,-0.19927212322967686 +-3.768890053618712,-0.048509266874921325 +-9.311285765992817,0.003390195716079801 +8.573336276961268,0.16636960033103843 +6.852183069133876,0.046505208094049705 +-4.718422793302594,-0.011946126044343253 +-4.06325977663608,-0.09477773510902907 +9.495900729850206,-0.12490100383873545 +-5.089954768489115,0.07944011110479457 +4.699228075300695,0.20047781384854646 +-9.139052708123518,-0.13994902881656454 +-5.976362712575911,0.11738840740765671 +2.8845294554277108,-0.05627808415965679 +7.990598258773375,0.2572861751077753 +-2.678573033068808,0.3428362969471599 +4.854160137309108,0.26553873998812894 +-3.045209781115661,-0.008265438472530041 +-8.067170218247352,-0.17456075587568987 +-7.654782981880754,-0.23705766581054477 +-1.2753164526534775,-0.08107444391677589 +-6.063179715078121,0.11077853763819928 +-1.5003588032914994,0.029079188776406743 +-9.715773621954312,-0.18844652857554164 +-0.17678129221689076,0.06484999529656212 +8.017925286423079,0.39756472180980595 +6.859804560739814,0.01804464205902743 +6.390837992165031,0.04909921878275541 +2.1188783824690116,-0.05468874193311481 +4.836205044375252,0.1400709692450331 
+5.70026674070753,0.22235909733184317 +8.137303049814065,0.3355931400651527 +0.4770618648500715,0.0855032221979913 +-8.28013265938089,0.07944655963305644 +3.020618452750492,0.09701947078569123 +-3.431086702703663,-0.2989041666936694 +9.837425429770306,-0.36408530320228794 +-3.7171845110197625,0.04322566808456516 +5.965487627641007,0.0975662867208074 +5.3101644247878745,0.2996733694252112 +-3.980914697743385,0.02250305956246245 +9.443122773975222,-0.26705597988935253 +7.570860868116868,0.1388965426568874 +0.5459308619005387,0.25998470763873904 +-5.411459216726557,-0.10724662989308122 +3.0412847971731654,0.10262298998883097 +0.39623838481253415,0.22873574816695852 +-5.285287915808414,-0.014715139345412555 +4.544986656014881,0.011741158281656713 +1.4172179546402432,0.17089843137910582 +6.16513720870013,0.24978827485969635 +-1.3988752611959434,0.07985975109012322 +-0.46850182846723953,-0.06253498426867374 +1.6642014836044794,0.038610279055417904 +-9.675533370874675,-0.0424113187672696 +-0.4908862132006995,0.004552789195724563 +-2.3222057780602245,0.268587361565898 +5.844415146813699,0.18439070743016928 +-6.706669412505101,-0.14499743521756767 +-0.8195016961578805,-0.06748370572768755 +-4.6617155051574155,-0.026373432408507435 +4.264091916888756,-0.04028187250578186 +-6.200973303404061,-0.08426673950651332 +5.459539062684868,0.3521088711179976 +7.062936641663317,0.042266360096154525 +-9.101952337152255,0.16333121297734612 +3.754074053044562,-0.13992186284282018 +-7.69733351141101,-0.13989890321977788 +5.312502793909495,0.23235281615263148 +4.745712376328868,0.1684172604125217 +5.267726749267791,0.28913006532365043 +0.7543552229653994,0.042854978891124176 +-2.8200548572729507,0.11358622383784404 +-6.8559910817571454,-0.163667900395776 +-9.232899288016789,0.13266074663597602 +7.73580123512847,0.4533854400991457 +-3.036623418916289,-0.12535901262339946 +-0.9931522835951831,-0.1052234501627524 +-4.069129330207257,0.0865962435058859 +4.810564126255002,0.1943624633415308 +-9.935854602251958,0.030442872613922613 +-3.397097398912754,0.17099926967124846 +-5.59777030093295,0.11571011815288781 +-0.49981711432213416,0.16683726894684775 +5.078930141550293,0.2562282574843703 +1.3724484385444633,0.011118486970975672 \ No newline at end of file diff --git a/axolotl/tests/data/datasets/multivariate_dataset_1/tables/gp_data_tables/train_data_936.csv b/axolotl/tests/data/datasets/multivariate_dataset_1/tables/gp_data_tables/train_data_936.csv new file mode 100644 index 0000000..68d8fa6 --- /dev/null +++ b/axolotl/tests/data/datasets/multivariate_dataset_1/tables/gp_data_tables/train_data_936.csv @@ -0,0 +1,1001 @@ +x,y +4.655396850273306,0.9111688949207661 +2.8025022064030924,0.677654863944986 +9.168748517435453,0.28427401979447264 +-9.604090890425773,0.4171345161559618 +6.866674131627519,0.019100117686948026 +-7.980669888394569,0.02894572155243194 +-0.06789797849611645,-0.6441326348882185 +3.562850128929998,0.5266302759374062 +1.8235087219010815,0.30278227311620315 +1.8792337907126644,0.1569942244150953 +3.3239292304390755,0.5343739687494122 +-6.313730386152731,0.23042789220758153 +-7.117766394931909,0.07248029998721373 +8.374272411895205,-0.19375469517467636 +-7.50670699063133,0.06187572631308253 +-9.916430126822739,0.2139237182727441 +-3.6932149308453472,0.004461718042346567 +7.162728892837712,0.0015084938826681743 +-0.4004764052630527,-0.593325287568871 +8.814180902049003,0.14293526471790063 +8.71236324335554,-0.025754342437174782 +-1.4701932215675804,-0.6192186328454545 +1.0896687200405495,0.0784336849727568 
+8.619074115172076,-0.04077644496604677 +-9.066384558799726,0.33601800607760574 +4.377450043878461,0.6243228531496846 +9.374803221347854,0.06516435795639866 +9.413479513011392,0.1652274465580198 +-3.1843896977091823,-0.0016206483648007475 +-6.796646071214706,0.1540210788532423 +-4.560727411377794,0.24465671965363778 +2.618854400550323,0.5764493039773048 +2.254998348029442,0.5058546446514653 +9.655769484363969,0.14414085575164412 +-3.7377796536076917,-0.18131359773649514 +-8.669350160891543,0.18962579368803661 +-1.5083142134886884,-0.6372183933716946 +4.885950050055502,0.555608873443792 +1.5057228223449393,0.24391140034519143 +-4.95702223932426,0.4377103274885871 +-3.8523374737691096,0.12287414431793983 +-3.5471335918966407,0.018252188283845623 +8.075818063049152,-0.049994683805096536 +1.2973956242088924,-0.0029357314728974265 +1.8956109423932865,0.3574035404269572 +-1.5014949458787328,-0.5137629446961383 +-4.910465829049052,0.31869174073008316 +-8.690631440378269,0.16741112555375093 +-3.036383354568253,-0.05185972178318342 +1.018010752410344,-0.09760183019066504 +9.496841857457099,0.40728923704122955 +-3.3231679585598073,0.1028535674220469 +4.116134280724086,0.8193924863142267 +-4.027461100178629,0.060912584430299066 +5.334145729974317,0.3234747406278312 +-7.444964376028199,0.2030742786275504 +3.21752388675796,0.6463600272767687 +1.1504088238074228,-0.1970471491545757 +-3.5893648521938597,0.13835076907561208 +7.343423432859545,-0.25673319001622785 +7.059771545760469,-0.25864539429535727 +-0.828191617894035,-0.8434927576890235 +-2.2986574985498325,-0.21044749485131692 +-6.981890278659435,0.07924127968479816 +7.9940246503664945,-0.2664079760649197 +-8.054689015084616,0.348867402183341 +-9.886886955100636,0.2942026203526571 +-5.333125386433446,0.25199592315462216 +5.805851778920278,0.21934554397832273 +5.003111740856788,0.32060246070378523 +-5.799625830934922,0.1435846480575563 +-4.631683110148629,0.2388824253291488 +-6.913434857000151,0.09933225360568329 +-2.5409251731544558,-0.35882199788707114 +-3.828230044017968,0.15250912353555185 +2.058670483795502,0.46121516514374433 +-9.23130264916573,0.19739011876093554 +2.828353254606757,0.6558825658506057 +5.434188883972876,0.4545567302440179 +1.133827771332406,-0.12107923988061153 +-6.562193358584491,0.32391687883760245 +4.7826616407629,0.5972074937722992 +-3.442729295430036,0.18671869601854615 +-2.2224339433013935,-0.14683784184909102 +9.212618772775638,0.3539241839221843 +-8.883648843233068,0.029253646876446826 +7.272940166926212,-0.26043258477623143 +-4.72481400740123,0.36665052543156995 +-7.552843933960255,0.08427132167304605 +3.3859207178091566,0.4911499565955346 +-5.037985172818061,0.030012455188431264 +-0.1948825963177292,-0.8747113410302437 +1.1822344869315913,-0.09218124178169818 +2.0177645128444954,0.36196012415660733 +-7.9956308195622405,0.11295945237386784 +-7.366762038431588,0.028782013371609363 +-0.25901261224140626,-0.6029540475372019 +-7.757029799585894,0.13448484142945946 +-0.1834892490721387,-0.6508051575730682 +2.125462884263767,0.4921819168478583 +-0.943090908360368,-0.5560819298571774 +7.4068058733233935,-0.06071276793788721 +-8.377727204431011,0.1274311377277409 +-8.872599729847792,0.08407004990235939 +1.3744017439263612,0.030792239050587297 +0.646430469817517,-0.38927473167606386 +-8.418922920362814,0.17400480650580855 +-7.694937666696564,0.04948998796696465 +3.746258070745876,0.8183180119633948 +3.816639335873248,0.6959837905759607 +-9.35761158022298,0.41953258577308905 +2.734286437774309,0.4777313959929759 
+-3.2016139011472866,0.10064575345835348 +-0.3143718488702021,-0.7532452544108855 +-9.811497062918274,0.2600171342766851 +4.2032633116280635,0.7116774715414161 +-0.9978461550381432,-0.8004949633172904 +-0.7187875661466805,-0.7505065793263948 +3.1448892604185996,0.699841458987736 +-5.833246592390671,0.23004119876049714 +5.4725028975274945,0.12695233588497162 +-8.09740634231028,0.025528777917058307 +-8.617487804590013,0.25298278448133477 +8.435527352325106,-0.06801030437738706 +-7.333869790536881,0.016748986980675992 +-5.537817735101015,0.3279245532379685 +-6.536730193558462,0.1321023756865306 +-8.378478604876648,0.2984916668822133 +-8.129334265328009,0.14755357070023212 +7.429521128466799,-0.2510027968937405 +-3.4694885191786677,0.02776424942040203 +0.442631281679585,-0.5786491783823657 +0.470687104012498,-0.6843351723523294 +-4.696581739361502,0.2353581963021463 +2.2511596732040613,0.4685817603104322 +8.776798626089342,-0.08779757747226896 +2.3402403921889015,0.5483455169823466 +4.1169685653899215,0.4727538195874245 +-2.556013641199737,-0.16273594041825026 +-7.650374846352701,0.04745863958264062 +-7.471990286782759,0.20044832279144434 +5.6117357172854465,0.1731831684732958 +-3.353788034731679,0.134073184657479 +-8.810481893903223,0.2600442838645516 +4.93476313788455,0.6945306498279724 +2.4942094433448365,0.6511693747576622 +9.222663221610471,0.2975846406751751 +-1.456105751482255,-0.6769407588970104 +8.824175929602266,0.21215514726032275 +2.2774870442581334,0.5089991771751022 +-9.7995294782273,0.21713512314207867 +7.7210068481138885,-0.04659485513005657 +-7.9956972700993925,0.08747302393388798 +-7.417707113786874,0.2301111186642656 +-9.427191703399185,0.3516405800346868 +3.128893966386812,0.5717888080920469 +-9.184246860711774,0.296654803475468 +-6.824208428310499,0.16933597700671316 +-1.7815010244996632,-0.6888346841503012 +-8.553173632320362,0.1385627713402459 +9.603351329679025,0.25387188694508006 +-6.5738393340505255,0.0770883949848667 +7.689281712234074,-0.21846270351565056 +4.289557315317394,0.5363686478004756 +-5.134680328531015,0.3324016971863866 +7.356255439464512,-0.07402053748644996 +8.728958353838536,0.07420963815057513 +-6.960962711040612,0.10148992334473073 +2.994929370845241,0.7172694280637015 +2.248642786327317,0.6145811690975762 +-7.461627912856894,0.1433065690258271 +-8.314087118499867,0.21949962966038028 +-2.139295689395926,-0.34280819431251125 +3.282195183888197,0.6659913157000958 +-7.050657299007711,0.2834385888689319 +1.6149515508760715,0.21805506730854612 +9.855356110168422,0.4558449025046845 +-3.6240823255574828,0.147529828626419 +-5.989659486238423,0.19026467273492273 +7.963509036162798,-0.2686593332104993 +4.955240980442458,0.4561465309387987 +-7.31584079231807,0.09207121742464505 +6.933784054786073,-0.29620803356331016 +9.389529500604297,0.035841651258501944 +3.3478614916687692,0.6805190311756641 +-9.351211799576696,0.23159278342840425 +-6.569371041265303,0.18219439839363835 +-0.5034184010885134,-0.7947422604970578 +-3.2269668820516983,-0.09922459637738579 +-2.996422779754728,-0.1254847969123431 +7.973118036384853,-0.32174435987393013 +1.5234739009499307,0.2214929947636336 +9.938681605648828,0.23434034971425372 +-5.67493645225464,0.35123560644974566 +-3.998816237455518,0.13301383076559697 +-8.706014281195692,0.0066246475963117335 +-8.713359990323507,0.16550197004917344 +1.7515339083201162,0.40151591180422086 +9.612423227659647,0.40456316481657856 +0.1872645247303737,-0.5949137552582491 +4.679586937567532,0.5090965471128822 +1.061807669073339,0.06175111993980098 
+2.5135502962935914,0.4340097312831971 +3.416081493350447,0.7599369727999821 +-9.555491832378493,0.11480187675803319 +8.448126983511504,-0.010473476625789499 +3.3526129530383475,0.5772909309308552 +2.639637378518817,0.780081087884851 +-6.265069898110113,0.2772719024757693 +8.72617668051248,0.012438801634219811 +-9.146961832444212,0.2617604329764968 +2.7462304264347814,0.6543633713583403 +-7.594384962454846,0.17437923797812113 +1.0227962128156989,-0.23437472067875492 +-9.145696670743511,0.2628751920786307 +-8.360431502744568,0.17100794632485905 +6.977118455398145,-0.17804379436672796 +-7.818496822507811,0.049443750254723515 +-7.268370020955671,0.06313587237869056 +-9.392275130391145,0.4167925777681333 +-3.945962799914757,0.0808080104471556 +1.0837481565355809,-0.20228000658405762 +-6.178647618726881,0.22380561995764572 +-1.8361897643116585,-0.47115651799599073 +-3.1227742471140236,-0.22168901483208336 +-8.604142511393146,0.1465647647097281 +5.79803543967499,0.10347481979133451 +6.136548382427108,-0.01719918068064566 +2.2951578888124007,0.4410793353882104 +-9.574069192791864,0.4134365593962814 +7.773664832133108,-0.16102810763887146 +-2.309398689543203,-0.4045290498021911 +-8.04006661692883,0.17821919966284652 +-4.724316174456881,0.3459575667391068 +-6.520517596836367,0.2467759442192315 +8.99239117073553,0.10788519147756574 +-2.093850315867001,-0.26049655147790085 +-6.883794754147164,0.27040444697543164 +6.77942492923937,-0.1321768855919652 +-1.2115907074211751,-0.6541832307383051 +-9.177092575564917,0.1921158017362864 +4.550658040871092,0.6933307000444354 +5.4293270967155305,0.440144126256951 +-3.779433220961806,-0.026582737547881694 +6.203442509143585,0.10838098909887414 +-8.528091077166955,0.13895925600008543 +-3.551332372039049,0.14342673020457794 +3.219332961272503,0.6244773885596149 +-8.898605538537922,0.13728544317663743 +-4.4332873719675,0.2487770445916278 +6.142143041321329,-0.002200236766518829 +5.13740944896783,0.501590308415174 +-9.310814883439523,0.45165485175093323 +-2.318321110436457,-0.19935808784783293 +-1.345748578394069,-0.4589496812660836 +1.3871670604997952,-0.013355634040636985 +-6.035111027088816,0.2358442798807995 +7.655315287278647,-0.2917773994751602 +-0.9974023974565238,-0.7502588539372099 +2.7752701957198234,0.6481328919482889 +6.927574733511097,-0.26147905508534663 +-2.904095559239863,-0.012877862269981472 +-1.5305672954986385,-0.49505329069290993 +-3.8670151852988166,0.021044558718095227 +-7.172056282677835,0.19555414408402538 +4.113833280849001,0.6004756083500495 +-3.4300943047614534,-0.01998021859917112 +7.880123009706913,-0.27159577532670653 +-1.5190122747582038,-0.5367876859397431 +-8.55699983911924,0.3073970451717277 +3.476908415426756,0.7910231912913892 +-6.365927269000381,0.379347906492877 +5.457467952859991,0.46443464951540514 +-4.312978691511699,0.03635721524184851 +-2.8887309748687073,-0.12983214098447635 +-5.8887593673769025,0.27453435765482054 +-8.133234817414152,0.11684505041883114 +-1.1133911030807155,-0.6321793974907861 +-5.187807867176243,0.3828088691055616 +2.229882109978192,0.49837323915737064 +-1.5260668468811964,-0.5512944701703549 +-2.5385921337020316,-0.23129460086800613 +-6.26483032037649,0.38473966522142444 +-7.230188151619154,0.010193032495836168 +7.938805268754944,-0.23945805977272105 +-0.759663041832308,-0.7742033148914373 +-8.925752563475669,0.2666364516336279 +-2.7385028360093777,-0.10694005318780822 +1.983167354291302,0.24653529646801997 +5.492966809705928,0.14281939592788376 +1.008775170540046,-0.23437997974065877 
+5.304536465746734,0.1791404534033331 +3.197828404760759,0.6044727946901793 +1.0084504974282424,-0.2676452927987597 +-4.393318009013161,0.04941790155360365 +-8.714994773570837,0.11726911185134044 +-5.73771733309076,0.4506265254882936 +-3.1281800224165934,0.09359387569185562 +-4.509281359754951,0.21777795220165508 +1.4813729658487453,0.36263146008089814 +0.7486283037152486,-0.3328575411478373 +-9.034019128555762,0.15647894299388942 +-8.169890528560423,0.16061531478629867 +4.88472944285704,0.6430130678351729 +-3.993101968392825,0.2606680765947692 +6.464541883236553,0.07891940880181648 +8.099167389384647,-0.15441306013514516 +8.23854044451975,0.006714946383125281 +1.0957461248252098,-0.02833949734148966 +8.870888597288626,0.015361515882674377 +5.340667011077244,0.3454491694558185 +3.0383277813581837,0.7582704640049518 +4.214111089857893,0.7680163013799222 +-0.4199988551082434,-0.7016453688315047 +6.489773812334762,-0.09786584195246292 +1.0292535345634768,-0.1809202938044735 +2.6593690626323725,0.500320116762441 +-4.428842227830177,0.26649698728841364 +-1.5064477834921597,-0.5576860758303896 +3.912887697380554,0.7742728843630085 +-1.3414970977119456,-0.6751091688446218 +8.893338374118809,0.14330047877874258 +6.244833962250688,-0.1769862809433732 +3.1143550791547487,0.6873240682705565 +4.553513035305009,0.4994747323914838 +-6.388602089478557,0.2380561479061017 +1.5940638492756811,0.2420105682268255 +-4.529196473994901,0.26020671928718114 +-3.684489488907836,0.04933887756375588 +-2.2357838237678034,-0.3987430082371606 +-0.5654068550001732,-0.6858734718176351 +4.002652668354045,0.7868190328750299 +9.831427576697749,0.25712777702248824 +9.654691651912472,0.3136080145472188 +-4.900841395614982,0.27012807756146995 +-8.37433188116119,0.3411557999570912 +0.2672028377637581,-0.8423923353292282 +0.20529757071486543,-0.5122114529642555 +-5.03143047897877,0.1803695573529118 +-4.290533112372197,0.28227212194514406 +5.563089886598309,0.19662711043782455 +6.869786635230355,-0.023557880203304016 +8.056578296409324,-0.24372882563014361 +4.9719660788737094,0.5666097850579919 +-9.334959907445537,0.2556249463098903 +-8.258686944375214,0.10836780456639612 +8.373543964482643,-0.018277545912170373 +2.742346818141726,0.5411946242819874 +5.571635278980832,0.2824046742060194 +7.212703028274031,-0.4158838956743185 +-9.737068093505634,0.2057867151156989 +0.2314200206747472,-0.5629905842292446 +1.9646623372373728,0.4997636548606814 +-2.960677917035404,-0.01885476869090738 +8.310865582003714,0.0021437091846674416 +8.875282477272087,-0.040438889183394985 +-4.944649146715188,0.35898126014706966 +1.0410938265480585,-0.22979428129092144 +-4.344171333611552,0.17062745720496297 +7.830119034805488,-0.1530148888535942 +-7.66566407185524,0.3128035296446928 +9.395766156853476,0.18045386936910324 +-1.7456541780997483,-0.4720451456682948 +0.9551230321592783,-0.26388491732579467 +7.145691704516616,-0.25503490529844186 +-6.927402057532355,0.10760178766538195 +-1.9503156743215797,-0.33374708914251555 +2.727150230007922,0.45391238973160086 +3.421929044710046,0.4906727876902589 +0.9590882505744336,-0.046640352409340974 +7.314802848405186,-0.09079730688063395 +5.040434171709286,0.2628408427637645 +-1.9111673987174527,-0.27157323251703813 +9.318973244602834,0.36813262619015397 +-8.546938666799946,0.08924596220071067 +6.876273523242631,-0.042349151993329476 +0.22252558866306416,-0.7200501472782178 +3.427552930728797,0.8115043300860637 +-5.3755821414860705,0.29203749070759455 +-4.865319629236469,0.23234907614649108 
+5.943732926907801,0.2579274901615912 +-3.9459263004723115,0.23325270747417579 +-0.9945823219867123,-0.7661863179654206 +1.0195996966005758,-0.19922535249838574 +7.919903528528075,-0.3239210272477795 +3.567810461057368,0.6380789365086591 +8.952430989191448,0.014073865003949929 +-8.168801347087474,0.1719582201626413 +1.8020682717151573,0.2706854554261445 +9.829319014631587,0.4786622481690699 +4.603882901514922,0.6282970851326555 +5.093015637660017,0.34763244927676146 +-9.899938686448873,0.24056051403940795 +-1.5051314507414624,-0.6520459556400614 +3.2920027926274287,0.7316844624739135 +7.923601381480282,-0.31891919070744645 +8.73624286116042,0.16036690850064617 +0.8982952409486558,-0.33054152735993286 +-4.570210322391297,0.03914104637567184 +8.856117414443794,0.07407854262102642 +4.9638071988137575,0.45724874556281453 +1.03444968179884,-0.133565815036354 +5.043381692274629,0.5693248722482085 +-1.0894164921122815,-0.6014783396109192 +9.06851397351096,0.21612847210842712 +-6.327763761897355,0.18359941244974748 +-9.845933123094582,0.21096078572810523 +-4.501410790166403,0.16444806141424545 +0.27135506727521275,-0.7700508986695458 +6.95610949612734,-0.28042874106191473 +-4.229393133911458,0.2643215958924383 +0.36763415755576645,-0.5301485045058433 +6.882640147667004,-0.10417055195533839 +-0.34738909455226974,-0.7347967123419299 +2.670270297735055,0.6565733023370321 +4.207302318493888,0.5289253210036371 +1.6328683537989406,0.19997538127736744 +-0.32739270792190744,-0.8842864902483756 +-3.1556074660303146,-0.04342551893220637 +2.822935695290374,0.7337917594616851 +8.476967532464649,-0.22093257724235887 +-4.73567341946428,0.11097492402304321 +0.9582248631372003,-0.20137840112135016 +1.184002524130868,-0.1997279087191609 +7.378617018737476,-0.2507094462852738 +-8.32032423505915,0.22690772034341103 +-2.1631955166136407,-0.155927023351065 +6.705322250560556,-0.19996315082599989 +8.02565803053195,-0.2804482987559469 +8.864405148422474,0.20797768436494757 +1.276055083980978,-0.1242463865756159 +2.086079842092259,0.47287929237642395 +9.54897146718725,0.33251867093024495 +-6.141747167472328,0.22706206370727122 +-1.0514839737713686,-0.6491041904940527 +-9.461910190035304,0.13666004593387457 +4.524349187438862,0.4563773746530895 +2.9062617745140606,0.6595555680952832 +-9.194008758750858,0.46141865393654624 +-6.822296202811785,0.0686960060000429 +9.446026325501442,0.2918240422234501 +6.270621875293116,0.07092624891680202 +2.8185243038323726,0.6426427990885392 +4.767331921116952,0.3414250510902271 +7.494820354490067,-0.18667800332722528 +5.644162966363279,0.17849706669816368 +-1.158896490572939,-0.5900599835239118 +-7.145238016541376,0.07195637496988377 +9.924064209168712,0.49279029779689004 +-0.19007332627298013,-0.8766039662882793 +-3.8998216746924226,-0.0015426139949373252 +9.018076154612547,0.08060585025346215 +5.7710491558907115,0.16184870298006576 +-2.143007772804268,-0.3122267195800081 +6.858780438757669,-0.1797178556075465 +6.166171398136285,0.11929856988804466 +1.0497537777274388,-0.2078499789356173 +8.922252139698806,0.13794538610690252 +-2.860046882222793,-0.042860528464074614 +3.0046691028050216,0.6325980167856883 +1.9310727266927508,0.3940279523581985 +-4.352297585124552,0.1813936146130443 +-2.592662017886189,-0.2600333423724232 +-2.314649878672657,-0.2616369567683242 +7.848964580967319,-0.1311025130165138 +9.254109735543988,0.3887081893248747 +-7.809118913640734,0.1858484940590342 +5.939426935749822,0.005905955159525134 +6.831761618649054,-0.30411110674296155 
+4.14139135686003,0.7935215134904409 +8.647134652455598,0.007333805253564193 +-2.6906283437493035,-0.2309465826245149 +8.560780080037013,-0.02975026512672182 +-7.238743413089842,0.030323449821683404 +4.529913245205801,0.6064838487454484 +9.602520160492588,0.1958523489392761 +-3.0817701268908326,0.03732868295883101 +1.303514755646141,0.04597206271566721 +0.4579135922935862,-0.6311175012026022 +7.012675905803568,-0.12980264172989348 +2.9015967182684577,0.6063274846675812 +-3.8048373791578083,0.3374936453505873 +0.6362580806646623,-0.3588499134459366 +8.167977636477048,-0.29981438892888435 +-4.85758447563275,0.0634331270343804 +-4.411374131571311,0.1700878634634954 +3.22991576843374,0.6816498929737843 +0.22848860702772456,-0.8281470895449141 +-6.367272746078893,0.20931290063733418 +-4.83727922019504,0.2873311859448261 +0.5031852487892685,-0.2826205847376531 +-0.7625062639492377,-0.718140969998959 +-9.422191266999693,0.2731414346463066 +5.974523919126735,0.129054055883492 +-7.565405711959357,0.03063532938262678 +-0.644073805140275,-0.6838836319281281 +-6.183594359307803,0.33073053378435796 +5.589591669192995,0.2894367949020151 +9.04750467201874,-0.10171294186959629 +-8.76498413189394,0.2231224101097273 +2.3613442151817043,0.5131802489035717 +8.256849159709532,6.631771458056046E-4 +6.965254276710748,-0.2110042232129669 +8.776910723743022,-0.024582051658381433 +-3.0665277787065155,-0.061910745955542454 +6.188260637291716,-0.1080107692750226 +4.894753803903328,0.472612191756452 +-0.047876676470085755,-0.7676376240659019 +9.579510709576262,0.29637816025945285 +-2.3552434471949764,-0.2195621061579714 +-7.550805932222132,-0.08513067565877046 +-9.278281957284314,0.37775061241630176 +6.523719067360663,-0.03787073852170882 +9.756874581163075,0.4964800143961354 +3.9775229359709297,0.6705724202879095 +-7.669894036386694,0.01008546512371207 +-2.034806733175154,-0.5298454204344363 +-3.067158153638992,-0.14105890972814458 +7.621594818084688,-0.04170659161060519 +-5.167385775170228,0.3882616007208036 +8.900726381800823,0.03421077709445177 +-4.001074082929804,0.0951785772180618 +-7.194199184736441,0.2622866663612899 +3.0362259883521237,0.6799016165310663 +4.450344761105658,0.4237535090176467 +3.507830573603057,0.5518301878191461 +3.076257235293758,0.5032634227666422 +-6.303555788170026,0.32201189695957466 +-5.217661032293579,0.37780313802603194 +8.872447357127257,-0.15570938505308413 +8.29839520111581,-0.0824472826377673 +-0.6754968024790706,-0.6703967580486019 +-8.000506865283796,0.181364230464434 +9.702268983588414,0.2718872723627099 +4.062510992601656,0.4575142904652101 +-2.5630137597425278,-0.22865284691765514 +-0.8742478827728242,-0.7757355239846189 +3.123595514598243,0.5372823874482416 +-4.576639979265412,0.12340718529843808 +-0.055388739460561,-0.6312672552454446 +-4.193607616537127,-0.017553997788897657 +-0.41745137905679464,-0.5916706393620534 +-1.463513277388513,-0.5268551311098464 +0.28303048508493944,-0.5621245924372982 +2.0157256712745624,0.40442151953056954 +8.45824397607069,-0.18872878446843622 +-0.8667012230315141,-0.7904686786250487 +9.514071703916123,0.3860079050906153 +-0.9389043102265671,-0.5263785505773177 +-4.670460920341419,0.3402533469709284 +-8.537217502870575,0.3549540000353715 +4.750556532231105,0.5177767957545929 +2.734877768885933,0.6591301417829815 +-8.249267884577314,-0.03526241839307365 +8.210911197301456,-0.20514219402232947 +-5.646257841616453,0.3032188046120908 +1.7301120399116332,0.2522881605168701 +3.250635458276605,0.7526234666696988 +6.080618078178386,0.13768195029562255 
+-0.3810300442682788,-0.9050448131056574 +1.9174310362755307,0.4096244414033522 +-2.779405078943608,-0.048765280815742135 +-4.7562029796899274,0.1202550503955331 +-1.8985614694022246,-0.5565217210323355 +4.545292496459229,0.6188385258718259 +0.5426048160203489,-0.5356977354933752 +-2.049150219588629,-0.29006495787341857 +-4.908834776292013,0.18570119347605385 +-7.924786765604569,0.2050986450439456 +-6.81353687666502,-0.01802407530694511 +0.44934896639407995,-0.492195064994351 +7.6782200908121325,-0.2076441037538211 +3.3184662359965738,0.6361929043148475 +6.474847876268734,0.12667242817135027 +-1.7809188334337342,-0.38448856902256295 +-7.308381474236868,0.053051093881926956 +2.1484591982070356,0.6034179860919247 +-6.477142873162478E-4,-0.8283175520038748 +4.786298730724052,0.725577376085643 +-6.680189216286575,0.007929652296537637 +-8.922478370983367,0.1773949340861626 +-9.552412958160957,0.37465554774685694 +-0.607329217382123,-0.8881864535640702 +-4.643431902947723,0.25085070301299894 +7.228188234846215,-0.20639954502396282 +9.363859827492153,0.349134191760371 +1.5083267557915825,0.03060291362990425 +3.990044908208961,0.5692143501352737 +7.086100529447883,-0.13153092722714788 +2.401152020330461,0.49495358015633023 +1.4566784136910158,0.1449404236280365 +9.727173733435535,0.6351946225357561 +-6.797838696724937,0.11722506603774938 +-2.28715791663705,-0.4073813516171666 +5.337099386506651,0.4333468666931479 +2.600668564310542,0.5968823058623041 +-3.916622519220816,0.07144650704832137 +-3.754003617653194,0.04560960358779929 +4.485247262596905,0.48927338765457085 +0.6852082946734015,-0.3259351686277238 +-1.3111427002869682,-0.5774152456618499 +3.2978120892721168,0.7091960740935984 +-4.810512617055679,0.3854652733018496 +9.196837556258757,0.09042313009549166 +3.0515762143234593,0.7428019158163545 +-0.2920271324702881,-0.6341416816579629 +-5.8752403880802895,0.2734409482481158 +-0.551699869534179,-0.8961558602139407 +-6.223890683024905,0.22800505959592793 +4.840467858560512,0.5786199388294887 +-7.88751376255259,0.15508869211206872 +-1.1219908645917442,-0.7792802012462997 +0.4841095248389813,-0.6017147332444713 +6.297401574733171,-0.15371280934040682 +2.237215020229964,0.4802945055793322 +4.270047020730976,0.6206712180341108 +8.677406407477832,-0.014904296068215098 +7.605153477216287,-0.031209339461667818 +9.241980838849521,0.13252131235242826 +5.218246082939967,0.3431729542294738 +-8.80998724159205,0.22949848801226086 +8.939765869832893,0.14638350734638922 +-9.833309258968619,0.328637623811429 +1.1578652137122845,0.006060568175673395 +-6.328950180136147,0.2681683314947417 +6.619127470670598,-0.0760237298754633 +4.463655704127705,0.8196937975686768 +-5.914507756027261,0.2884436603865035 +3.764656055856918,0.6206040658990029 +-4.208371762131371,0.22803828746459154 +-5.458839586407809,0.302727940313536 +9.481875509901183,0.19130748985017063 +1.5364518233838087,0.17551663896381245 +1.856006970781392,0.584335276519435 +1.3163704460911596,0.048252599265838025 +0.9421965243806838,-0.10797108032088198 +-8.718918890550679,0.362241763238875 +-9.572748450613421,0.13986539015389085 +-6.648281271418991,0.15541019896816566 +-5.086559868812742,0.2827849905602954 +6.339183092781152,-0.22973136781861997 +4.450679921475338,0.4133022540057904 +1.460804351425903,0.0759394885188037 +-6.818163998226589,0.13329927613214754 +-5.500191276117118,0.37645213054599724 +-4.168652921774644,0.23283711876864055 +1.3023246227508487,0.18493283869500757 +1.9047725113836087,0.4860563621564681 
+-2.624988415233309,-0.27061685045392786 +7.191738926348247,-0.1647673120715043 +5.937216169841143,0.11901462419346137 +-9.024320770985035,0.1533023886352282 +4.536017624225881,0.5302506165782084 +8.038042703190534,-0.20237119586308722 +3.8682031297156843,0.6829471053908345 +7.692473014299107,-0.13842419525993604 +-7.840279887712782,0.24864029624936257 +8.283182937176758,-0.21381112707649358 +-5.166003221233556,0.2873483730406332 +1.043492255735785,-0.12357962381602125 +-5.100955736402938,0.20065556331903084 +1.625917051074257,0.23787597610213432 +-1.8601786858864067,-0.6083940488622553 +3.7891017824329953,0.6385771805283196 +-1.974532244008147,-0.2443202015791736 +-6.05726314298198,0.32094844648264453 +-8.271921069869745,0.3163152267892099 +-4.2011231420377015,0.07267278588468781 +-2.8220481107358393,-3.824752526096348E-4 +5.795475780076952,0.32482736444567867 +-2.212490622697882,-0.17220243321458203 +5.276418083143209,0.2113948879220016 +-1.5529334528383778,-0.5330473891202362 +-3.4014240738321355,0.09563645991624005 +2.8693160696960303,0.6134079217480931 +6.076600887010276,0.04169657208136286 +4.978961366304446,0.4989600650700524 +0.08350775767408614,-0.748274401994992 +-8.810964277898865,0.3263655044734795 +7.8555633126004665,-0.20050340314493773 +-1.4156629287688958,-0.5175291824204744 +9.813391718136035,0.45866054120732314 +5.604389003575498,0.1418187019358459 +-6.372607602690987,0.18421840561047204 +6.804727914688943,-0.18440661435224565 +-5.216825744808261,0.23254895236950207 +1.077033937561147,-0.13746382952055955 +-2.3257915795931083,-0.1810912438285357 +-9.898169569252964,0.3192535694547537 +9.789959221376172,0.20978295434074531 +-8.328874850031582,0.2598699414352278 +-7.102489761149159,0.027544090041483657 +-4.070893678800651,0.10566807020323077 +7.347402129180946,0.02004009333534662 +6.78637012247811,-0.1249512429617026 +3.687113550796632,0.551290298972788 +-4.44305931247845,0.18005235812045228 +1.7379825021168305,0.05215653348249369 +7.198281150650203,-0.1271386071316924 +0.5433149578545624,-0.5100363394333118 +-5.691919345704285,0.32309377563136543 +-6.611004448670613,0.3253889800866895 +-7.85859794981446,0.09638921539598737 +-2.7194797866424163,-0.15151427980096593 +4.223594605980331,0.5140398250368549 +5.233327878653707,0.41971146204607623 +-4.836962325512864,0.21765979796127577 +-3.3711089191463106,0.14814504077107443 +-2.2742894322988643,-0.20846362756701828 +-6.571102913333316,0.05793454160745795 +-8.443088273275485,0.1654902841958415 +-1.915014301265714,-0.46369098816964044 +8.68603312573622,-0.04221784022198212 +-5.584798011573685,0.1308587162822902 +8.07091159131016,-0.13913004082623917 +-4.408458304751623,0.19542282798651067 +-4.270788598760236,0.2457740973217784 +5.13195616157704,0.378261868449243 +5.18859234692342,0.5508982975018737 +-6.164686107174432,0.3033442096563771 +-2.6246343620956747,-0.17236961184948368 +4.554361292843897,0.4965605998789706 +-9.614386488793745,0.18270277276575658 +0.4211139544048379,-0.5575431443994778 +4.729672560452833,0.6232274613904408 +-8.400509685678482,0.15191329873182347 +9.273771455264669,0.24879263268764676 +1.6605745689589781,0.14117128807183388 +-4.052682127927735,-0.02673868146273356 +9.774743928521573,0.26190572444271454 +-6.7308444217285555,0.037286844309463024 +-5.20579954439099,0.387721378112433 +7.1456931033674564,-0.03304035077287415 +-0.33421934592833935,-0.8361987996799156 +-6.360929044017215,0.10387371690095815 +3.166975776646787,0.5985872132547515 +5.341292729179132,0.4339908914632498 
+-1.7587612198859937,-0.49890963732036553 +-6.811997791203858,0.10508958866135422 +-1.3048043240828946,-0.48413500777695256 +8.790718653279722,0.06456468627863204 +-7.237774373779907,0.023720434776006674 +0.8756904197906845,-0.20242442725126164 +5.19309789359696,0.3160175410749215 +2.8667030365119532,0.5583998744906968 +-2.0062197613863257,-0.4044316387472007 +-6.2189800556880215,0.5446814822301109 +-0.2760203579802898,-0.9019521678904109 +-2.251630108668534,-0.3865336639115587 +3.6750660929628154,0.629950641389949 +6.149506656457238,-0.26230977934865146 +-3.8210348181556912,-0.04181792594179287 +9.18158171853776,-0.004973411046813203 +5.678259926518088,0.5505362800399576 +-0.029448712786898312,-0.524171569347288 +-5.62098325027673,0.29573475133327803 +-0.5256056882697013,-0.5413179970458518 +9.020113818128117,0.19039262742163 +-8.79327092648252,0.42993859066902207 +-2.0186577940039503,-0.6070217642193475 +-0.6447869014505727,-0.809316889484238 +8.903755882288845,0.02533368921601624 +-6.041630443321374,0.262709157558525 +3.8445093308592466,0.6857196365366967 +5.135310339034156,0.4226719829614269 +9.51737226887505,0.23387443128670105 +-9.249470225276959,0.24402865987690292 +-7.103002874248175,0.03393405639165249 +0.28014366109733757,-0.6027175859448177 +-1.6741475634239755,-0.6444177141185015 +-2.2653469149496845,-0.1809882229902839 +2.0416168386625837,0.5455423895929788 +6.4747255848704475,-0.049855344735393714 +-2.2218348789815945,-0.19936807351884558 +-7.725250015105075,0.17259529797708575 +0.678268940097313,-0.4856885456036753 +5.863597181782442,0.027550148020619836 +-1.8409439889916115,-0.3789185754716333 +0.6508019453558447,-0.2699172618864839 +-8.639839122163853,0.22917856589999766 +7.273626163863852,-0.1632024355547107 +-3.375163609104735,-0.14188870558908362 +-4.425854808527019,0.09687389921782291 +8.422091854726908,-0.21483903014790967 +-0.5385416720431948,-0.6551070196362302 +3.3687323457425085,0.7647497211994103 +-4.032824591846653,0.1729517656455366 +-1.1440405313363442,-0.766975866306491 +-2.9553498338518214,-0.1668300019812866 +4.156145696913644,0.44914395550448466 +-8.037100874302595,-0.0130459576365764 +-9.444064853898947,0.3409570729221444 +-5.488832490052879,0.3565249701962215 +-9.396536970681076,0.37417673581062993 +-8.189167541296033,0.1901538116576707 +-4.270768388498606,0.18649515000907918 +3.0980745103990426,0.5260817658767438 +0.2534415585810237,-0.640107868084296 +-0.5837062502176629,-0.74573639647393 +-4.7880331543249355,0.3185788314838959 +-0.9984297438577805,-0.6805987209650513 +-0.45191331072289564,-0.6410730275535287 +3.78954469038328,0.716913143127943 +-2.489533020735259,-0.24334902311378054 +-7.740881116626923,0.1536920295291289 +-2.622487530684774,-0.05605848213128012 +-3.5320937358729902,0.06006760224156728 +3.6413945174174334,0.6471905924999494 +9.196452383894126,0.13399277202066673 +-8.738775819016467,0.2518170742443663 +-2.3101495832641166,-0.21201179971105702 +3.7729961674634045,0.6765203065584827 +-0.45214992274899757,-0.842134710425609 +-0.8006356765275378,-0.8783216402277422 +7.807410438640421,-0.07794268084896723 +-8.355954283340147,0.14690780844993903 +-3.6507759298638565,0.05138820200437481 +-9.324544889418544,0.19480778143564934 +-8.475836779237301,0.197295247242596 +-6.805336938210438,0.23890124787229988 +0.26639758278198755,-0.5282477561383253 +-9.284825135094493,0.26359797408908087 +8.768226495914572,0.022458985845939174 +1.1393483494330994,0.10888632452485693 +3.814028495006614,0.5753966251029623 +3.7373999567970806,0.6992413687696869 
+-3.175350265514187,-0.1695588434943662 +-2.581917397868043,-0.23946845063828567 +3.5225602814520625,0.5480493851454792 +3.456103037486118,0.4686450130085835 +6.2004670382208005,0.0345192249489909 +5.033313029484855,0.6005502956123958 +4.532759757369767,0.6480535079115975 +-4.990556351510179,0.15271528928961275 +-8.394683103247019,0.0928152903034828 +9.788156730035329,0.2208184007404088 +8.922152371929116,0.0920239349966019 +2.7860396803867804,0.6994079809985696 +0.13840353378452797,-0.7301818835969215 +5.420794172854668,0.338269754921429 +6.03741606546449,0.15396634201942794 +1.7164820942174952,0.2717776175650003 +2.8770096100293774,0.609249955907948 +9.480282715233244,0.3975171539427885 +8.250837092124577,-0.12648080206097329 +0.848874779610016,-0.4726748048775812 +-0.052354800824625514,-0.7097400079481038 +-0.24015425686771863,-0.7870639071007655 +0.7279697287317699,-0.11879380260338238 +-8.516099478657647,0.25288989871428774 +5.184419026688403,0.620270177388578 +-7.005279528749341,0.17808418356772354 +2.0270439911732514,0.4188853504701164 +-1.0876195341263983,-0.7392913113814074 +9.367012656101,0.2745815682470051 +-3.707382026615274,-0.011577916396053892 +3.346782701611337,0.5905444592804726 +8.365326353220116,-0.13141272493000955 +-8.509024747488532,0.06309881151337399 +-7.153740132185309,-0.01057858197757934 +2.546182168756843,0.6253109571245019 +-6.934193033875617,0.08828187601446433 +9.162427997021886,0.3269364790350992 +-6.299173524412431,0.1380490974725222 +7.490034615991416,-0.13328914582428425 +6.6371611181465795,-0.2803472148322737 +1.3344011651360113,-0.02682273210083736 +0.7581980159352355,-0.4411602402838285 +9.848247936423146,0.44364826195091506 +-2.2586989952342447,-0.31249924999691187 +9.004425601480953,-0.07934076423614735 +-2.0434763599351236,-0.4423870467986324 +8.490165406151114,0.03319957047061782 +2.95598451850497,0.7943653215394515 +6.02203760232964,0.006954436285952978 +-6.767744274707557,0.014211520900670183 +1.8327501502705914,0.41837903011923266 +1.7646889174483515,0.3446809740151062 +5.386075177096586,0.3585726768418702 +5.8097907565492655,-0.17489244935119347 +1.7147347481203745,0.41399972834050636 +6.7780080642956975,-0.28219529280079697 +-0.9610121936832847,-0.6308029987011319 +-1.7011425371779723,-0.34665306569316356 +-2.327924111059843,-0.327603742816726 +-7.681878652688191,0.09723462855207692 +7.411692810243263,-0.2355125551497533 +3.630198838387342,0.5128850289581803 +-1.3920307743900437,-0.5731948764563521 +-5.339160123216008,0.5544631167966985 +9.37628383737714,0.0877084491607357 +-4.703897133706452,0.37570551442672506 +-0.5856414061586346,-0.7570135634580497 +2.556228074475806,0.666588623693363 +-5.323416548163106,0.3832526766204146 +0.6372987308765019,-0.29822290523587847 +5.687623874625931,0.28680977470496316 +1.8422760037530228,0.3924476121491964 +-9.297154552416721,0.33019615390233137 +-8.844472037608014,0.3602692847207475 +9.517955736192484,0.2189286775583994 +7.916766506174548,-0.2584039515925373 +-8.900665848709139,0.4893390522010309 +3.1626534068183965,0.649431959830519 +-2.1165235393154447,-0.30646902988269953 +-1.6218246925343607,-0.5737986634626584 +4.655362076820673,0.66195870962423 +8.94451080289685,0.2119279691497423 +1.0502390125580376,-0.18081120275559634 +-2.61053366382872,-0.1526709922879315 +4.106241276303656,0.8155979721627196 +-0.8268678849659814,-0.8777368373493545 +-3.355837526052841,0.009645852776610134 +-3.326932700017626,0.1122733826545804 +3.7870999126345635,0.6878862257679931 +9.083842882998539,0.12343191598711088 
+-8.746756936207106,0.1325450446007073 +9.737750651318038,0.3076577278153348 +-1.9525133797036744,-0.39926470964572375 +4.400399439684026,0.45465142860408125 +4.817310788181368,0.4983066483307001 +-0.46374950838855256,-0.7734703230456933 +2.224981182615608,0.3691637926779049 +-8.156256084575936,0.0025607117809368085 +1.4243690164666134,-0.09036610167210546 +3.568141630762782,0.594329664188812 +0.8104761399132911,-0.4038717280083364 +-1.2224653788578088,-0.5856386038792504 +0.45055093507520994,-0.460914455879295 +7.537894849012673,-0.10670824818128535 +-8.747668648855335,0.3044058025556804 +3.4327785303264196,0.5217273040447006 +-5.497623427109421,0.31170118502018745 +-4.873832090329815,0.3973595267328562 +9.467944922021001,0.15579131505087174 +-5.836381565207148,0.24546839462764009 +-1.2545581280736329,-0.578800300886858 +4.23295016764834,0.6285307619227637 +0.1078700352838382,-0.6602075482225359 +-6.677645761675176,-0.0738437145229624 +-5.0033810866840955,0.5035610609480804 +-7.546020246424195,0.06413778339888901 +-8.693175450048145,0.18347220951560558 +3.7772350913692994,0.7797672265216209 +-8.269172780129324,0.13921425410481014 +-7.838194394309276,0.11179072632220984 +-9.356967994651235,0.2932756818383742 +-1.799432091285734,-0.6033899914600211 +4.545315070477782,0.559164979680948 +-8.35370294147975,0.36686186488550365 +-1.5818040059755312,-0.31057198581709056 +-8.816106283927695,0.11098378580748036 +0.6761708228966423,-0.3634997701103687 +1.75288138854234,0.36783396545405345 +3.1239620727707655,0.6470613085586254 +9.922967744950576,0.262127348813028 +-2.497517173060246,-0.034506726585419256 +3.607955199718944,0.6151352793505518 +-5.297861921527144,0.3272644125369263 +-8.075213826178135,0.16179489587663543 +2.5931676218473942,0.6660614306723056 +-5.965951292190326,0.11252104170069943 +-7.481222964562516,0.16671039755446365 +-9.628821362569532,0.43704323001211065 +4.03210766280746,0.6801443048479852 +-0.3504017413368299,-0.7381118418802519 +3.827348623871316,0.5312763484040353 +-9.94765452467859,0.24789746501339838 +3.81631832102681,0.6586665223560184 +-3.7032793857062796,-0.04786464967848911 +-4.208631736092713,0.2548515117629547 +0.11927366192053057,-0.7396770244164305 +5.19941521934221,0.3254620559559147 +-8.050394919356812,0.31860983880949123 +-8.071487372618659,0.24595621894239034 +-0.1630321415751368,-0.7652761113637586 +9.570825046304478,0.38611403461265204 +-1.9458245871847701,-0.47838884120579755 +-2.065964411626102,-0.38701339033079224 +3.6421302680200855,0.7468808263577545 +-5.487355867602538,0.28557313299166426 +6.6037514568431455,-0.28514296285931057 +-8.681317003406544,0.2665379732745869 +-8.606443431556801,0.1918464176420307 +9.097045181995732,0.21683606355283946 +-8.030766918012397,0.003912356118611904 +-8.577661013300407,0.20871136731732878 +-9.529247466221712,0.14299788035375527 +5.969357933113063,-0.009143057475310495 +0.5268653616966859,-0.35079310277872566 +-7.029911898415836,0.15189694836255074 +-4.9311655556929335,0.26550453693786774 +5.817427907474029,-0.036270768925680735 +-4.228814783487195,0.24182433466252395 +-8.24425291347643,0.008815060347471454 +1.278435588259189,-0.01353142691423415 +7.040926861697109,-0.3485633236204747 +-8.385007922184997,0.16480526467011689 +7.053524748341111,-0.22222341541670212 +5.402463433476029,0.1530189052984991 \ No newline at end of file diff --git a/axolotl/tests/data/datasets/multivariate_dataset_1/tables/gp_data_tables/train_data_937.csv 
b/axolotl/tests/data/datasets/multivariate_dataset_1/tables/gp_data_tables/train_data_937.csv new file mode 100644 index 0000000..873fb55 --- /dev/null +++ b/axolotl/tests/data/datasets/multivariate_dataset_1/tables/gp_data_tables/train_data_937.csv @@ -0,0 +1,1001 @@ +x,y +-1.937281788646402,1.101637380872502 +-0.6244343287045062,1.0297740537169362 +-9.807033849606821,0.21314106999579643 +-2.832310026541997,1.311078494528639 +-2.914643554911658,1.301638979096919 +-7.797494155537642,-0.05644221940489691 +4.494515587798137,1.8122183655140542 +2.3208958077593955,1.611531346874751 +-9.256281698518176,0.0035867440913874143 +-5.748557968556444,0.39380383803883745 +6.412206976737277,2.3442063876528914 +7.171635431037661,2.315550886385257 +-5.419734846996347,0.7182427245245431 +-2.039089615685299,0.9040771641625654 +8.74500837755544,1.8208700454937679 +6.051514786145703,2.2019447059732946 +-7.732757488609777,-0.2632333322978994 +5.126548642295017,2.093052724372674 +-4.846063834538935,1.0778740041251254 +-3.611948426749198,1.133695580521365 +-2.114697993689991,1.1530181414618497 +2.6749656513535096,1.6330713326496347 +-9.903573304618458,0.0840064739562549 +3.0075095281375663,1.5087205855062875 +-6.466243374365107,0.17682097285971105 +-1.4808278975683287,1.0195444533897309 +1.9630413837140068,1.4491103291423904 +8.0422807489878,2.0797236459130537 +-2.9363274068017997,1.122046403378034 +-6.417910864511867,0.2628716067627039 +-2.613096975788949,1.0401092504783884 +-7.894400796194226,-0.05566398427068439 +-5.520466454162678,0.6046710728316974 +-1.2228872059312046,1.027278758857411 +-3.062947330427983,0.9696250197273777 +-3.672251573224967,1.2555454598398108 +5.259096268836023,2.236685240543284 +1.1158995725064322,1.4007538446012118 +1.690161791938877,1.4987808384805783 +-7.784717637213451,-0.15020610352452174 +-7.61356414979732,-0.1162195432539242 +-9.147173085448578,0.004293717026882394 +1.13099477943409,1.4186619198953434 +-2.9775984867625342,1.3343792015922618 +7.139759333555173,2.5538011995621526 +-7.856184148368656,-0.13757708249931083 +-7.4421332015792965,-0.10218617940267745 +-9.03407771691655,-0.019911783355737084 +3.8290375157215846,1.5520966233449092 +-1.80367076507296,0.9453654134472077 +-8.218971511666243,-0.1201442405202174 +9.516805131884752,1.5242704640725984 +3.398021726392177,1.5874421057484742 +1.1962179522139849,1.3214945677244112 +-2.0094041223815395,1.2456950120601986 +0.10729363736040831,1.178412070775137 +-3.969502498370425,1.1919739825619289 +-2.301817268998896,1.1478185880438012 +3.8480634579698187,1.4772681266947814 +8.423470532182375,2.034270762161879 +7.4852999836474154,2.2496028524185534 +-2.721121744515864,1.1097296731763624 +-5.107808793839919,0.9878937411267437 +2.3245919180270254,1.4099265602986344 +-1.4245562528216293,0.8789925472697528 +-2.700932369177775,1.1929169772327621 +-1.6785135649483962,1.1235536530620944 +-7.134537267016029,-0.21489659051950613 +-2.413964267849465,0.9870951100048289 +-0.2624238218361352,1.0786865897382294 +0.6500880890139715,1.1611054235179028 +7.646974939646398,2.165838881859609 +-9.480634535097792,-0.08797067757004559 +-0.7187931610459053,1.171486742248554 +6.032009575162217,2.2725277469469938 +0.35388722647246595,1.1047371943225872 +4.959270565634842,2.0994045016959832 +0.23506255579096802,1.3099508294236113 +6.421747569706299,2.417836707287151 +-0.6421567442913165,0.9147904257422951 +1.4981681641201483,1.2962644167997135 +0.5197266372509368,1.3775803055990026 +-7.0423043666949745,-0.017510708945272305 
+-6.760068866110135,-0.06283432557098112 +-4.033863667152126,1.294205050068989 +7.820851048190892,2.2800241037249447 +9.319342042344509,1.444424019087927 +-4.633386722602445,0.8951353642648615 +1.6898900081169455,1.2422717751571677 +-9.983141819013355,-0.13106605987412961 +7.627411552712925,2.223246955536564 +-0.39542996258265184,0.9576580516938338 +2.6100409578197907,1.3613966046212316 +6.937243795731618,2.512104555020451 +4.238721907907275,1.755482776832035 +6.271747002789848,2.2238083853364805 +-4.15993754436753,1.2896196107828126 +-8.852651182894299,-0.1030653961077792 +2.8369086630256963,1.6725955117041726 +-4.222287626246391,1.0963673200627237 +-5.8085348744464795,0.4095697840098224 +7.386606297745146,2.4258906384610026 +3.577510868126783,1.57190463431874 +-8.816594547535672,-0.1813535872974907 +9.287521123335296,1.535484851628432 +-6.051142772635004,0.4086940136908808 +-1.6135323442642058,1.0422215430627786 +4.457736052929979,1.785686872440047 +0.7415589323456331,1.363220951259524 +7.175964152075159,2.174149290655909 +8.27162538613873,1.958101674331741 +-6.506062196045619,0.08096039181347764 +-2.0684649641264525,1.190000128326932 +7.7396066609337275,2.523517797669279 +-8.102424643427003,-0.22117674159187664 +2.3611291664587863,1.5948418283964787 +-4.1638561904459515,1.1907657470170827 +2.3992418267841398,1.6018922093586516 +-3.478263837601663,1.3730204312671381 +-7.094361103406746,-0.14220691782101685 +-5.743541858528701,0.4895427064034443 +-7.070616581625772,-0.2060720538464128 +-9.505161016489655,0.2373218306469333 +5.935332785912584,2.3265520011166734 +0.26116824982734954,1.2403408550524497 +-4.335578784183017,1.0769666791801964 +0.19977205751010274,1.212987374757555 +4.848734021101286,2.1589972140976434 +5.05415341214977,2.1319456800920915 +7.04281092153105,2.4966376893213753 +1.7163291471227282,1.3179725498090256 +-0.6145393452566452,0.9541019350827815 +6.442748987740501,2.5831876494782193 +-7.047497284262949,-0.01738226722855884 +4.606361067764748,2.0552426443518903 +-1.7172920681900639,1.0738180058434474 +-1.463393917476047,1.0377076051323528 +4.202902043183192,1.9757615416009822 +1.7989005080613563,1.441059589867276 +9.639476478644,1.4795884381250688 +-0.8612708660427781,1.0223049713767116 +-2.0219234239084787,1.2839297292006966 +-0.07707577774178631,1.012037816479622 +-7.257856681319809,-0.14553656729622205 +-1.5265250618683481,0.9321473786947355 +1.076275277173778,1.3775360188851125 +9.921399469090886,1.3548783462616572 +-3.924575348360939,1.1160526701273326 +4.427233315899377,1.8122903283145966 +-1.1951134605471125,0.9286324297580312 +0.9406152301055783,1.2820828087659746 +5.892585729435167,2.1671484470620297 +3.246263670437997,1.691592399741329 +-5.59291127409864,0.6430129261591373 +-5.165548443566648,0.41620937891859366 +8.613927167373209,2.013171694806745 +-7.246583741419471,-0.15884595702036658 +-6.527690666348732,0.22316679695638042 +2.733266164963663,1.3367509503310522 +1.7325804104026599,1.3507897839666327 +-7.412401401075265,-0.21696039133848163 +-0.9566660246588654,0.9627049537142292 +-0.40300070088679085,1.1726521385345512 +-7.106913278887017,-0.13499481735893992 +-2.711000258780927,1.1946533652833775 +-0.1741052774777465,1.0610473410434786 +6.763986592855847,2.3986972707993868 +0.8517727503846828,1.2866848052855082 +9.447214574141071,1.3607484391191043 +6.35895938520294,2.3518120818575943 +-1.6118943412774591,1.005289616606621 +-4.032928720753969,1.2637900761036736 +4.262143077668558,1.827665652954508 +-1.4777503131111445,1.1170958275097365 
+1.5093964852956265,1.313233147638234 +2.5927976148370964,1.3445678823341467 +8.777501430392078,1.7885794338644725 +-3.86858306363405,1.0837645543415433 +0.1776633394100191,1.1864053141342608 +-1.2473385842012696,1.0221218596540034 +9.378508218764956,1.658131137031902 +0.530396445414949,1.1107804822993814 +-8.631327212145948,-0.21596817756804657 +-1.630799668252667,0.9555178338803623 +3.4028925805259025,1.59009646083739 +3.6685807030390687,1.781847189744699 +-9.007750084662995,-0.05974511870176556 +0.4796375537487272,1.0567759840261268 +-7.753525641514024,-0.07966472080066939 +7.359417040454189,2.471887596759782 +-6.571547642468673,0.015710535494025513 +-6.895940945779678,-0.23366201031017356 +-9.550559543935652,0.01707018222488011 +-5.724895552503032,0.4477714952148454 +7.212378799912006,2.2881693634072766 +-1.9693482188005529,1.0482972802293016 +-4.1403392066483455,1.1748050908341021 +3.7995595136571185,1.7362595238364154 +-0.7059204155578147,1.1283997420514262 +9.999792860199115,1.328125423977871 +7.064803159692691,2.2464080298445324 +-4.78551655942554,0.8814750915964628 +7.3475566360976705,2.4744601669480297 +-9.2065333455118,0.09369439717642913 +-4.953923262314849,0.9776504977774345 +1.8521593183603677,1.5066089890433674 +-8.80890247471585,-0.16992686028763837 +-9.400578554523563,-0.07094668676598513 +-5.320035862407785,0.7076497796326742 +3.110490009463671,1.3306658670427707 +-5.383495547393675,0.9636233197518238 +-5.039478748575883,0.6857670561611263 +-2.5283686628029005,1.1987457562225237 +4.026186957738652,1.6133467188566506 +2.2553433343852585,1.3688386207341467 +-9.79183489708182,0.16317787856976174 +7.096753791240652,2.4043903255217103 +-5.603544609063388,0.49734731362038986 +-2.21475154708132,1.0852627559181136 +9.454767631646014,1.5825282290248959 +1.788893062361927,1.5410575329097458 +5.233817675474416,2.1207104884206243 +-5.622300723622221,0.581732123616754 +0.339581065824067,1.0248806001015334 +5.256766440687528,2.1615362269388285 +8.314918051047918,2.21730005867438 +-7.597196958786444,-0.16695722333434287 +-2.4156433919289495,1.1055068280562137 +6.783937453516305,2.4061338238491685 +-7.718420756778194,-0.15305635038509188 +2.512969983743702,1.499450068039886 +-9.981677268736327,0.1744821915347641 +0.6869377100983023,1.274858260922344 +-0.016543833409659925,1.2764484068184332 +-8.711713057823786,-0.16365537317539083 +-8.973423444319806,-0.09066230420827363 +1.8372094751859933,1.3744491816395419 +-7.778610735720477,-0.17475255827840755 +-1.1536035277702474,1.0050426607442373 +-3.1347645849832073,1.0912602336365855 +1.3322529523740236,1.4167363142889098 +2.3836868558288455,1.5191508658493884 +1.5244133875421628,1.3534931633560376 +-8.504248904916409,-0.07659034376711639 +-2.3496786978505035,1.13828470673944 +8.605720063412843,1.9446943692661638 +4.818462350719223,2.025879793916354 +6.547718267540894,2.4061376315437646 +2.991155524796536,1.4813459911848892 +8.996007921633726,1.7785756297059063 +4.756478104439115,2.066289220506045 +-5.685569989174146,0.5466217043876751 +3.7213606836515467,1.448502452060234 +9.490519862212189,1.4572956448774175 +4.026942985458186,1.8537815240402293 +8.37836114447068,2.132847171096073 +-7.537488056840217,-0.3425539411477204 +2.622325942544737,1.5308070308764377 +9.458757824612949,1.65384776064821 +4.548612819910346,1.950290480884669 +-8.75733318088253,-0.05089392516788153 +-3.652677829185866,1.2265639438056575 +-2.783861716863076,1.2940426958615188 +1.8730805546326437,1.3713041194425182 +2.346968089223811,1.4666381566587254 
+-3.741236174874057,1.3088123741677729 +1.973414699960534,1.31578820662871 +8.925464873948293,1.8929262669781455 +4.36808859820963,1.8951595779590114 +1.0087678219535903,1.31668700634358 +4.964206591109699,2.025087002686647 +-7.784407520411847,-0.10366956495482835 +-2.7865162950765705,1.0550361567140665 +-0.151295441335213,1.2272342985231537 +-5.094654025095027,0.8179763362247369 +-0.9336647693637978,1.0033641347320474 +-2.070163793293469,1.060845428128491 +0.9879141841157733,1.1531535653520806 +6.1911798616358205,2.3559111667338595 +-6.3446097793845055,0.08829386081257928 +2.0078543464935668,1.1407931749326237 +5.906374429794203,2.2411198810303574 +8.79869489011653,2.068529363923199 +-3.847878389024761,1.4251245300465807 +-0.8312446904379911,0.9361829948564274 +-6.771472515295006,0.001787073349199068 +-0.7618423636905991,1.3464483806038432 +1.3041571841677868,1.32337087277372 +-5.869759712201397,0.4141535663941297 +-6.26883480590481,0.38197991559120137 +8.763230221770357,1.751443043598742 +-5.98991105658965,0.5267541445524856 +6.933365602166841,2.306242764189065 +-4.816354698603217,1.0278602022594163 +-2.2539057798106477,1.0405614093854179 +-2.7961939063016454,1.2876376929870714 +-1.180577452279671,1.0749977107712714 +-0.5269944762635692,1.0173584758153775 +-7.578216895872259,-0.2480328255697929 +-3.792632168723728,1.1776374545886101 +-6.77772047069654,0.20422966751663596 +-2.300048208798815,1.216932638155371 +6.734069772609676,2.442213544787095 +-8.884269174441076,-0.10665897629358735 +-1.757690124289386,0.9717933409242538 +7.0666394720865355,2.2607985524394634 +-9.025515976127028,-0.21205667964315825 +3.2821231969149482,1.6617343509824405 +7.3009006484716785,2.4396225985987923 +9.385379301102077,1.585795360641975 +-0.8221430542311161,0.9789005609328812 +-2.013936204736657,1.0831087022767547 +-0.6080876776179434,1.306536859566411 +-6.333805575257144,0.14631121672350098 +-8.831356840328276,0.07115593109989754 +-2.2454846262967987,1.0997879489145337 +9.546733216439947,1.5831549457071834 +6.531089162864517,2.356543257638176 +-7.724860755677163,-0.08200240540892209 +-3.5061892153691776,1.0366408136268435 +1.8320875474504419,1.3119944755279769 +2.5957743006577476,1.5079045210935729 +8.223656265194904,2.098311391171053 +4.149013079793313,1.7324359302425973 +4.98446814221397,2.0822269464200054 +-9.632552961256803,0.02334404041083578 +1.672787688514962,1.4126988859709209 +-4.921564663212482,0.8690007139541924 +-8.269040582960429,-0.2066829793000278 +-5.756685736438314,0.5232790589997544 +-4.004427999776512,1.099932395182013 +2.618615761241663,1.6165834477382346 +7.101753542360703,2.3573090636392866 +5.373124600321933,2.04442988799907 +5.1757016186779214,2.0201070513246155 +-0.8067924272811808,0.9530514703186221 +-2.5394470393863067,1.1575642327373306 +4.009255454161046,1.6275257957236453 +-1.6709025945318157,1.1018501555684574 +-4.964237539440903,0.7085409878246623 +-1.388405000115407,0.8092902206436655 +8.982411640011357,1.8120223095236616 +-2.450259384553526,1.0695783749344454 +4.238435889137042,1.8255412629812533 +0.7799427274295603,1.0869106449432766 +-1.1763500841506502,1.1195333478119516 +7.536262292649877,2.4286682003367353 +7.293549569873061,2.2474514816848976 +2.3928091513386778,1.4280555569799538 +-6.271811823229116,0.3950735042782381 +-9.619461931164224,0.09990246298278292 +-0.7824429653294551,1.0333454723827507 +2.9511665670918585,1.2932553096255939 +-7.12662738370069,-0.02769724191327881 +3.8147676135551034,1.696332165533097 +3.179019793186111,1.500493796170439 
+7.523963631878679,2.457606432333653 +5.042550800845795,2.056409443043166 +3.9127947750142904,1.5527169867419528 +9.467969533823961,1.7192916898276227 +-4.356532455181096,0.9462619964954309 +6.455729075057754,2.2655168170859734 +-8.409255048960812,0.04366288247716105 +8.746463116635418,1.9165936160695674 +9.639564248770775,1.3612624246260285 +0.11398010755240406,0.9912799398744614 +6.80233130676816,2.4030358140162162 +-8.890354250683256,0.033616133609907686 +-7.8281067954757155,-0.21581349654209786 +-9.374629703281183,0.12683630782668368 +2.6008017619505215,1.4287142535157313 +-7.927364260559461,-0.16119016725443247 +7.552309859176795,2.314582430869968 +8.425402749209454,2.0254819463382243 +1.0439758393441032,1.317816038799841 +1.9841209962347648,1.3713541704830152 +-5.929592319777207,0.6078207736903372 +1.1663543324683534,1.3432886321578656 +1.034897683888234,1.2172501526485013 +-6.219137286956603,0.2264423582330019 +-2.6617141723803646,1.1890010869481946 +-7.146687552218985,-0.18469767511804025 +-2.1576674656846784,1.1368426085258698 +-4.523085143941724,1.3443703395298785 +-6.82567052169221,-0.09140156773462801 +1.8201301674638781,1.565924097477026 +2.417674127742373,1.3778863481114376 +9.925349208592955,1.5810502964816544 +-5.1680563716780625,0.6244531119299759 +0.7219286209038689,1.2011288881957338 +7.140647462716455,2.273192830302519 +1.1762085998027274,1.4102309637998744 +-8.813216073005457,0.011818246494794238 +8.680420582927141,2.0872557848735336 +8.152937681890219,2.2595366791786953 +-0.28945570118256825,1.0108691890409809 +3.496154431664373,1.649575075065515 +9.743532794713449,1.3973913227060628 +-6.607008305477088,-0.051204144224534606 +1.1960996898908949,1.3948208473927908 +-5.07018012119036,0.8348825510396843 +4.572163060593454,1.8051246257372782 +-8.772616126625628,-0.042696275229795855 +2.8468692576840926,1.592831628675091 +5.715826811887217,2.341708216154105 +9.399381130394197,1.5655799648551656 +-2.8346477518945297,1.2238386233617777 +1.9546385889527897,1.2318204332812024 +7.882789500104078,2.2368844080676826 +-3.646138394036282,1.2126188009072583 +8.20222144406095,2.035656429875077 +-8.781516413650266,-0.10189569063885236 +-9.8181617227517,0.03518832958944251 +7.901897531787895,2.2775270308265854 +2.4110361958483484,1.616652698004718 +-1.7364545582484503,1.0116586308756177 +7.477908796128091,2.4777802767522346 +-9.939617657056203,0.13758387579667553 +8.409277938416654,1.9364776595264888 +-5.873204996582476,0.47392242004258506 +-9.409210764283756,-0.022141312493894998 +-6.340021119044693,0.22906362782530376 +-3.0229219158514997,1.2274003994039595 +4.92136112574129,1.995252575075895 +-8.758894305034008,0.004992000269652852 +-4.117384148583469,1.1857178934875996 +9.307801522327608,1.6760526643516234 +3.1098176915588245,1.5411624195023 +-4.299365412449596,1.2996036914941171 +-1.2558237253478577,1.070708870461595 +3.368298455025931,1.6152098250656686 +-6.7771875949262075,0.2849987946792279 +6.211355839375777,2.332219289030618 +8.103650468161568,2.240990069559442 +2.058282542449326,1.2238521716542214 +6.802146170740841,2.2574059135870588 +3.4335493767537173,1.7227611313072837 +7.100257124445623,2.404123048481778 +-0.13021339824034328,1.1476346948325018 +0.1676988606984331,0.9914442601199892 +-7.401417269224805,-0.09627716182324979 +-6.083292373603353,0.3362428049794695 +8.838953560977085,1.9158781398217832 +-2.102174707728599,1.1108863387281773 +8.035038700040275,2.0800831334726313 +9.296710321706936,1.7049433359809345 +-0.4431033859194411,1.1725669717903526 
+5.621950014676344,2.1478769863401572 +-9.313152365232927,0.0724990386984786 +-0.693926353857881,1.0040837225263695 +4.4684780512987,1.9543085789441417 +2.844400319718776,1.2510054792712193 +1.8347166215123867,1.2392100549258815 +2.5328604178737852,1.4567874330908297 +-1.8924162545057008,1.107338521684541 +9.522175249816762,1.5075788191172415 +7.989499660549938,2.3231867207167145 +-7.611810178299642,-0.2339549796401621 +2.9140890452908192,1.47349203749592 +6.103170346695261,2.076257499507763 +4.911359860313542,1.9632522999133195 +-1.5844288120135364,1.0973074983765492 +-5.351276633066252,0.7402376692275475 +4.256492369414456,1.7817068793158262 +6.651220022141339,2.403471601892456 +6.290923092548777,2.2836094182243394 +1.3078490949632702,1.1732616502150488 +-3.013284262105507,1.32658008031514 +-8.592369788851318,-0.04741443005633761 +-2.494393424928503,1.141376869514854 +-4.624543499431756,0.9887302753674624 +3.6573208672641973,1.6010503251710935 +5.057237658499446,1.9695685168457353 +-5.778014270867908,0.5879598792669983 +8.521225085229155,1.9850410389193058 +3.43480012718687,1.5711694334967004 +7.077091329126546,2.258803842903114 +-0.9238291948872579,0.9484405566671124 +8.389510259940426,2.103240191332462 +2.1188269407596887,1.404435780829963 +-7.009074876477995,0.11286717162384063 +1.7709726076032837,1.4455338037574461 +-4.386075227451061,1.1246736764910394 +1.2389728542854748,1.2523255665174933 +0.7303251649745555,1.2851401098870399 +3.904925181158663,1.8698780772284083 +9.95963440671109,1.3558369469687312 +-9.273842447845006,-0.07607174729935207 +5.542045317323172,2.211392723232353 +8.881529684238679,1.74757817014309 +7.46669400893294,2.1612676244289504 +1.8759180469948866,1.4555577508012025 +-4.280913795316463,1.2389723951346845 +-1.5006924923799438,1.070237263884327 +-2.150284814863239,1.2118849938586673 +-7.4281631898079645,-0.20830630577452708 +-3.1800332251371355,1.2357625155316943 +5.438422405527717,2.1113482665986445 +-3.5739819684788228,1.2623937567359806 +3.3219833961549305,1.6572731754660495 +1.7922477291735195,1.3660073429207862 +4.242086265415148,1.9765941689075248 +1.0398573962814073,1.1989630246355398 +-4.699369062548925,1.0176359680756348 +-9.665825763347758,0.02719760970621931 +-1.074606502706219,0.9595147105282116 +-9.596287480914715,0.1559861425204404 +-0.09013427107399252,1.1517102894800162 +7.584497105786866,2.33182885050387 +7.086928001929632,2.3150112086720664 +2.753073109950353,1.5050900825365499 +-2.4518214841315977,1.070484716455881 +8.83799000695602,1.9019041414670368 +0.8016060817476891,1.2835672863556984 +7.777407221282701,2.1484138716075982 +-6.79766150304384,0.05588666121051912 +-1.7613367419710801,1.071059208428882 +0.5663148554727115,1.1636857183987028 +7.500087300732382,2.42349652336894 +-7.0936617680767435,-0.1576219647861975 +-9.79135993088802,0.05121205956439502 +-5.431711203416491,0.5318973873855951 +7.45438367416488,2.357435132495557 +-9.010492786274945,-0.039470483088022805 +-6.73962192076365,0.09988077905390627 +-2.653606332726724,1.1191532225864496 +-7.891722982944241,-0.1040787084726401 +7.159700247373241,2.545963506605633 +-2.0941393653557316,1.052673207668813 +-4.1242609434322475,1.1976399879796817 +5.0955223460717605,2.175908466366162 +4.159673530485097,1.7042094076554617 +0.3235219284164348,1.1941759480861995 +6.676921285402485,2.20238294440396 +7.177704061353044,2.295080275614744 +-0.9537435398213887,1.2293029047080828 +-5.160229479852525,0.8178753324259256 +-1.6934886815650163,0.8383172046181251 +6.795905612504537,2.5170159946415716 
+1.906440125288519,1.397434300922517 +1.502759987554807,1.4446290480434052 +-0.1265485705002689,1.141725481766531 +-1.9687842165255365,0.9833713775956704 +-1.6199801575487562,1.0301211818224736 +3.8011577521365236,1.63974486397546 +4.78584054181594,2.0394231595401813 +-8.184149144863618,-0.34472659926246907 +-2.084486643504779,1.0186571637222137 +-7.564628299817979,-0.04588484910733151 +-2.2655126500001366,1.0395124935856839 +9.867670258487173,1.5303886942279237 +-0.17750076840244766,1.1733998655664633 +-9.96047992487895,-0.04233093562946051 +5.099349229092262,1.8266546939005448 +9.731944191924839,1.3388014746397645 +1.843024870725154,1.3105255231717274 +9.053076232637608,1.8163966810747216 +-3.0006255205271652,1.2096713636167025 +-2.463516450271541,0.9634946226356984 +2.1751682435297237,1.4214314523295746 +8.483720063603371,2.0377928825004124 +-6.5867316004678536,0.1174594989343623 +-2.98169321545322,1.274863972878313 +9.258147826793056,1.5655591203959829 +-5.428072178286518,0.7491061602560966 +4.108185945706149,1.9348212643820422 +-0.13992310150559462,1.0109368351547199 +6.156462719808008,2.408167166039992 +7.418735070886156,2.2649038927835323 +0.036664334701782764,1.0392918785079293 +6.638897805662324,2.27436186530331 +-3.101680390829884,1.1294898232973352 +8.61334404782193,1.8867775345244078 +-5.865450536033755,0.42075485331307944 +-6.656559262649257,0.044287686684401834 +-0.339817502423303,1.0321111620283385 +1.8997707213724944,1.4755314730691238 +-7.456264795819712,-0.07755259188184187 +-9.869312428572037,0.16557437403232078 +-1.1487534596989857,1.121392142367297 +-4.756624715536265,0.9876171852343841 +4.787239718012497,2.131925096574648 +-5.734478284930805,0.5416089183940711 +-1.9626924447002558,0.9781501491150515 +5.299499447188477,2.1558597413464913 +1.9937438412446884,1.3967067546060479 +5.074301929484925,2.085968899897748 +5.84967744800743,2.317378833937415 +-3.12906502350414,1.1828353124284208 +5.0801197922594845,2.0940433884005425 +-5.888251742063289,0.36865826892357345 +-2.418190920569012,1.1301494873685598 +-3.7962936457372987,1.282514470807918 +9.724359329610188,1.4756335608102733 +-4.453461029918424,1.1117307080219636 +5.871380442257541,2.1439141047583194 +-6.613417570737678,0.29561675522865705 +-6.55219503907114,0.2515396391330227 +-2.0190498986459677,0.8720077774367904 +-6.644933570580811,0.03703267679632196 +-5.9821758303116255,0.21692231995372047 +0.5383126231871636,1.2662465370768596 +4.099416619476216,1.8125185033421525 +4.22814739003365,1.8123623556151285 +8.513379659881615,2.0476100682670437 +-1.9067551886220269,1.0947672950482437 +8.280029864585178,2.154609950110912 +-2.002397623152166,1.2293864085387338 +0.6882704487595639,1.2314466484961133 +-7.390658717523885,-0.03824524878831627 +-8.134313991960482,-0.22672904251118164 +-7.9965364038104845,-0.3573795379129463 +-6.568081906425768,0.2109567929722833 +2.8250704573695944,1.441805024714349 +-1.9757784709534398,0.8877349767259061 +9.394414004473571,1.714257211302741 +6.4025518771532575,2.367235768986867 +-2.980006518253573,1.350241553445623 +0.2607376232661842,0.9167717490174541 +4.106723044791259,1.7878246381878382 +0.37748481689646596,1.1287281218823981 +-1.4227294501794177,0.9024277612007403 +-4.830900988526089,0.9924002044212007 +9.92172577824627,1.2506518173192633 +5.706020415884506,2.2410610083422795 +-3.568220123171777,1.2815261800589097 +4.926200257585428,1.9990524658047064 +-0.8286789094717406,1.045412619937775 +5.693663783780236,2.2020687271277324 +0.958682120826996,1.3253922434513943 
+9.951743630127446,1.4479635651024092 +4.956624610527163,2.048675704442639 +2.815810469794515,1.7519557034152147 +5.7010920510988825,2.129677656580936 +8.584105003295246,2.1047485244778947 +2.2425741355191735,1.4388000486353738 +0.7928291365727489,1.3756317342636974 +-8.311105088435227,-0.2203264403524804 +-7.82999710123514,-0.1983277198067337 +9.019452423279752,1.825250496042616 +2.9742870610331806,1.4067997488607196 +2.4268524662308533,1.3156974708929372 +4.36950873453938,1.8754102109048123 +0.18450923735169766,1.2858630277580123 +9.721063400258224,1.4539829820055719 +0.009439510849361454,1.081971199794009 +-2.145429680576232,1.1954675030865516 +1.938893916449235,1.432866506166479 +-0.27686775745242365,1.1390429078856528 +-0.6927253906411757,0.9765631578349632 +5.601909562612754,2.2318517245806855 +9.642775091969575,1.608736405106275 +2.4722311447649137,1.42541074024059 +-4.580705053729908,1.1270100663045868 +6.52632162341666,2.1218920252665456 +-7.801317379899366,-0.0643771134255259 +7.870489992941828,2.1641028413625674 +-5.511633293779887,0.6358194689554623 +-7.06768214303012,-0.05491242023258582 +-4.98510477112645,1.007629038321521 +9.301014399733308,1.5789815503839222 +-7.060453890961899,-0.17732778171130803 +5.972768971336876,2.1579894593246722 +0.2443942601411475,1.2484685619506268 +1.714657013790891,1.4867678273298597 +-5.017187034016315,0.9447766901625161 +-9.006501937262502,-0.10290230769011367 +2.747961769226283,1.6100559448160576 +-7.924485334944119,-0.10919420137907114 +-7.771591717518085,-0.25731083046071845 +3.638731594657889,1.5214619373838387 +0.4224574499897251,1.2680800875716045 +-8.036601827427788,0.013989022475766277 +-1.175137460714888,0.9187052448436736 +9.146169902120466,1.7190365061793054 +-6.797938384056401,-1.4832050640519942E-4 +-4.332110441587172,1.025236130189542 +5.718069682556703,2.380666116331858 +8.860148695783217,1.896104482377675 +-3.7365690998598744,1.063235652542374 +5.7514295304126115,2.2526983174368294 +5.496120938776624,2.192464226218381 +-0.49972255794777354,1.0814653344693577 +3.40396827315502,1.6426093863691598 +-3.0121271082892638,1.3009627596483735 +6.984781531838745,2.3351380642597674 +-0.7510766296026397,1.1484062167898312 +0.7403933977503065,1.315904003952933 +4.663158259017127,1.9648347157428658 +-2.831155089565534,1.2537518953846036 +9.004311417922839,1.755826363785609 +9.94109323582526,1.3822897389803162 +2.8334601790445735,1.4835674100767549 +7.90336213090788,2.343042549510923 +-8.55901678765527,-0.03891214143965453 +1.7603759667708694,1.3330308366988899 +-0.4483477681131909,0.9388561774120867 +0.9221808189981395,1.3137451898678691 +6.352317503360862,2.261409755722757 +-9.712007556600376,0.162949602596525 +0.3668309764803901,1.2371675264386854 +-4.46064186121388,1.1631999668077635 +4.414815034544084,1.7457479620711025 +-3.218047465297147,1.3786692400339222 +9.650973971334238,1.6728634039100156 +-5.787166782161201,0.34990268760077015 +1.1725788163109296,1.412732025599461 +-0.20742096019827905,1.1792088214446879 +0.6468149521281674,1.2563822902783424 +-1.1411538570205249,1.0404039214035241 +9.048330885206106,1.6067061062710721 +3.833414776818147,1.7155865704150606 +-7.505336596854137,-0.07102512098952567 +-3.5317675111069176,1.2306700669460429 +4.5857144683384465,1.8329629707931163 +-0.4020878045999474,1.0261413191525812 +2.422237148319075,1.5325096684081934 +-8.966962132126021,0.007441458962223375 +-7.746040568121862,-0.20494787016217286 +9.258505054913797,1.7334509299254517 +5.068772997790983,2.050927417503677 
+7.379184655635687,2.3337225028991533 +7.57240590884086,2.205772195896116 +0.49961723789173185,1.2703473546688637 +1.8950330761107281,1.3235749137768016 +4.115754009640504,1.8754826553906412 +0.7929912406203865,1.3601711308175708 +3.8767453592366192,1.6289632911168646 +0.7230470244025966,1.1735411583522406 +-8.60383301932302,-0.030354610139906812 +7.656823891632008,2.162182758013317 +-9.706865942544592,0.07920364914945238 +-6.471344335644148,0.10082943930845067 +-2.920613609887992,1.2126672264614002 +6.857251114169767,2.458040255218846 +-6.974101502977877,-0.0838908938587311 +0.924526147384821,1.4206563366665064 +-2.0338815733444804,1.0017617929169034 +-4.233825056949353,1.1342340909887638 +3.868796154254124,1.8016775831060083 +-7.80817845597927,-0.18200688563914091 +7.930914907569972,2.1468441022142226 +-3.3303745476769504,1.2298087559914044 +-1.6718667385005936,0.9507852294735188 +-0.9527786749803866,1.1743392346750685 +-4.7610150839893794,0.8759695674512139 +8.357898808431038,2.1718532129666572 +7.57291346590927,2.318289226767642 +-4.953252476266696,0.7786387949210947 +3.124645073898393,1.686402512950925 +-0.7223860623855671,1.1193091483983595 +-9.972870323956968,0.15538903312969005 +-2.7660711468355577,1.2922657550670362 +-1.5864310180071115,0.8249623108936603 +-5.473523179755659,0.7893305063433416 +-9.144060241006272,0.08087765214914563 +5.291834282958558,2.2301065871515995 +0.09898344062967723,1.0694927386689588 +-3.4708853463671474,1.4544407866220523 +-8.894991129327416,-0.05457006892930559 +-4.58342509064984,0.9477308119416581 +6.46776409575149,2.3099104402064508 +-7.909200668049344,-0.13823574899859847 +5.674296743160658,2.229011043469894 +6.027821363647078,2.243939957746367 +4.8071243693934065,2.056655137168865 +4.224091929543219,1.6910575270434318 +5.183123918488501,2.2582730959206962 +7.187582319826112,2.259670224169189 +-9.46176074217415,-0.06534742944735103 +-0.7816024956687961,0.8727180244581505 +-1.8247597086949128,0.9351144601791744 +-3.2104849882857245,1.3489422601782624 +9.085800931467702,1.8283149187012528 +3.0255155419336077,1.487465460534964 +-4.409148707687972,1.1758665159081074 +-8.693559615176355,-0.07607428301092553 +9.844707466751487,1.4024425314098867 +7.575697241022602,2.2580930891318647 +4.803093645785559,1.9365222885849922 +-4.4402897548729925,0.8506416652606679 +-2.2088585743801876,1.0606171691531936 +1.6143655394773546,1.5375110301974184 +-1.5138823539781843,1.0727454349520549 +-7.503939664139657,-0.17859415609904086 +2.3725097190439204,1.4410060173531873 +-9.347523409285596,-0.04971396380885719 +-9.941408487312152,0.13772734053394592 +-5.989773412674349,0.3291455194436439 +9.412311095941554,1.3782884389648895 +-7.832717734004898,-0.16504842476241943 +-3.667629345163812,1.3803550050264233 +-5.842082952245772,0.41777535510535574 +9.748812765122668,1.539946329740598 +-4.167412214722987,1.1277047859768905 +-2.7747023640622497,1.125334065720562 +2.535605839179422,1.4870406671416148 +4.593605966468633,2.107725698707695 +8.451662587087561,1.890541251852448 +1.0188275175081571,1.21319959711236 +2.3087326837372046,1.4605585898463442 +-8.08946115263165,-0.18248454111282295 +-4.2986632898185775,1.1696828836651576 +6.128896888310752,2.2481642762196463 +1.1657611978967086,1.2057118458904166 +8.89672658900664,1.9806728526688904 +7.72487941479897,2.4964357128409578 +-3.6813101132231907,1.4558854641277257 +-5.534178758980563,0.6699751279341988 +8.572946941700824,2.017353665625102 +6.054795140892125,2.3718913623886784 +-0.021980932537193354,1.1544749114146018 
+-3.8165503349423524,1.253242946493384 +-2.815429453347109,0.9815195816232741 +-3.3686787235211657,1.1282782789311645 +4.49646101614956,1.7070707303408712 +1.1164481050273523,1.2324053604095306 +-1.886993402717195,0.8919343081485778 +9.399635777393797,1.5453053677677069 +9.018042735584054,1.721763553107703 +-1.0090609436099829,0.9912853086317709 +-0.5182129415779269,1.146668466156497 +8.06996195219471,2.1518090346143035 +8.804041610145617,1.8371724763654822 +-4.545432554580495,0.9082347350940103 +7.496459061465445,2.439065154494213 +3.5200810963804803,1.4778650916226117 +-8.262143252200747,0.03424983326926331 +5.152306913433865,2.12070913267714 +4.664288185926754,2.0191572642859446 +-4.941180127728003,0.9820403357368995 +-6.698106809728014,0.21885712216161807 +-3.50785200182369,1.1191206246991734 +7.928973418044948,1.9937233457396268 +-9.211966072959758,0.020958525799344742 +1.6682746890448374,1.381042581110412 +6.719236723122929,2.3165791690053914 +-2.062480267750924,1.1245526591882995 +2.678895870962212,1.357065291189725 +-7.649158972370892,-0.09660642544724868 +-5.037948588620903,0.8232030088601451 +-5.032102981893392,0.956720568690072 +-2.2187190930744904,1.0281568957664384 +-6.516019924983937,-0.031894631830358317 +1.2136063916112363,1.3958755466208848 +0.4946193293076284,1.2724610151578761 +8.784052715101144,2.0370990450791004 +7.108165426443929,2.38561117387143 +-5.071300004255219,0.8044055437494603 +7.9354091590145694,2.0927973740027346 +-3.1751968881234345,1.1434531434165625 +6.76798796377755,2.2868033396873235 +8.72751280961523,1.944705023706391 +-5.112787969740804,0.8527921428532144 +2.9268154483989584,1.5935068943366784 +6.45944348904505,2.554957379173803 +8.934259071424492,1.871810852805611 +0.7741725741123062,1.2253648192074333 +-3.430695974994409,1.3250893800264916 +-0.8795278211807265,1.047312610747004 +-7.71362440693045,-0.2663444606036415 +-8.53142298050399,-0.08561318432378304 +-2.9575783732809353,1.1803875679532352 +-0.9302681724238901,0.9713988644703369 +-4.851726352387814,0.9567635205203172 +1.848481651694831,1.228398548801423 +-2.86406234830531,1.1722610119400048 +-0.236424434797172,1.1715223645986137 +-7.350820306202657,-0.1991301418754393 +9.097825411003658,1.6655307588367687 +7.256519285423035,2.5543542197921902 +9.709477671418888,1.5411134826816735 +-3.417843395076869,1.2216109419095984 +-2.5996265995911916,1.1446491196172837 +-6.693903889389738,-0.12251182764575994 +3.591487428273638,1.652195890004082 +-1.3137305463463367,0.9253945791431506 +9.527915919794413,1.5537415175226776 +-3.4875762642750234,1.1523575411355058 +-3.4495443932949676,1.236060754887468 +4.916018554375361,1.9822641484714696 +5.0456792043668806,1.9661334036165616 +6.033126129338324,2.3316488695019246 +1.1257024191188947,1.2952894696911685 +5.042059385572305,2.0165318994505395 +6.858074306279697,2.2291115051735026 +6.760855186210183,2.2533260493155476 +-1.0802819484249468,1.0691848158380937 +-7.599838699547013,-0.0827739526952388 +-1.2903436669974013,1.1105018832243183 +-4.349449841912957,1.0554692085512938 +8.45395076350411,2.0993991557049605 +-3.919333066426511,1.12546922489839 +-5.527116975920837,0.64072704200074 +4.942122926795802,1.8922942135838146 +3.80502198804416,1.5901257089259395 +3.920934514354833,1.701592535693402 +-1.4545994506833217,1.12495245459509 +4.011415308202384,1.5173268190333606 +0.3540176463768674,1.2159116498004285 +-7.421033393928941,4.960971700019856E-4 +8.314878035825242,1.9585966597974258 +-8.74981309468533,-0.33313573489520465 +7.879486468573368,2.323158393473604 
+2.907799214851319,1.5110584868132126 +7.485113576258584,2.3150976675986747 +-8.583563853513716,-0.011547777507071519 +-0.4516289213622535,1.0714665170349527 +-8.9247631366229,-0.012640073362907865 +-2.499414207352322,1.0919483102675094 +3.8393720436431877,1.7477742386773816 +-4.967527904523422,0.9448138413213059 +-0.12219778678998239,1.0977584859851977 +-0.6570422069753725,1.007245874027586 +2.333471366000044,1.55505682110833 +-4.794263206380439,1.2677168060279889 +-7.665856997453048,-0.11080580198862786 +-6.9728906533524615,-0.019935853263441803 +0.010354176370244161,1.0013407286113123 +-7.911175322966653,-0.15346928911893354 +-3.355873020170108,1.3892280112762962 +4.0178823693890475,1.8898410933442416 +8.895923722601257,1.9446928827859427 +-3.5114951941216295,1.3359373992923684 +9.264590680398772,1.6989042706826414 +0.15079940451125928,1.1148773325877057 +2.4318694077784997,1.359229653019655 +5.973076652128517,2.168975662887019 +7.643005375816777,2.3274816175445965 +4.896776726590071,1.9499771276498583 +4.137800544370975,2.028556753254687 +-4.504966220423498,0.9414413551629535 +0.055794522021868076,1.0353733978564215 +-9.015364473154396,-0.11793739865585792 +8.337484880752069,2.05988392751203 +6.00346991947064,2.2142491374569846 +-5.234972089663113,0.9902622513292391 +-9.970524553129174,-0.01825581065229827 +4.890264942323391,1.909437672839197 +6.702291962520054,2.41264272233689 +-7.340789895211452,-0.09378032200541961 +9.388953358312829,1.8608637932952456 +-9.126127744017971,0.020911129663466355 +-5.015714679053126,0.7944295882452957 +-7.772996716208702,-0.032552986461611505 +-0.6213959899492316,0.8927466634729998 +8.997232834696241,1.876479625378098 +0.6264564421495713,1.316816049491827 +-8.851676562757682,0.06487129475131335 +-6.1478196374675775,0.40400349701937793 +0.6939701077494309,1.1350763091679994 +0.3278234738875785,1.2554900405818445 +-7.1191633532875045,-0.09453132610134053 +-0.7346836299883783,1.1578581542902768 +0.3234088117687417,1.2772488471227799 +-8.515162085030767,-0.17312954887292437 +5.629223030236474,2.057302825653834 +-0.12698673919265957,1.038169053172115 +-2.4615083368727575,1.4231535468589092 +-0.38865205876462383,1.0284440414857257 +-1.7933091573991256,1.0578360224652108 +0.6613814224192147,1.2488695451375154 +-4.43638754519017,1.1905463504252638 +3.636665319536343,1.4075407598277194 +1.8038463275087189,1.449747922955239 +-4.158779827430873,1.3235595023847502 +-2.178754181100761,1.1598976741191058 +6.879274383780477,2.2444494678717497 +-8.904198822683526,0.09523538802778632 +4.580682945988851,2.0421568199254287 +-9.670072449151085,0.25555390146806434 +-5.112914060684993,0.9583499978212889 +6.798917020615009,2.2179030059694296 +8.948455476674905,1.8811219278459086 +0.09053658774476858,1.3395275494955194 +8.738354478430427,1.9704480940827798 +5.619939509532035,2.273047672542259 +-2.8242950456997917,1.137129193430485 +1.2994439241390676,1.377789635230429 +2.44622415725329,1.404573637814922 +-0.05801104439284721,1.0721258014433532 +7.475838758426153,2.448794724037765 +-0.5981380930538283,0.9383490416370313 +-8.604306350142025,-0.054442063090989326 +-8.983218811660837,-0.036921435364286596 +-4.627273781908912,1.0534996426234116 +3.2728732949418986,1.6592461354717463 +-5.082561725342497,0.7957178053582807 +1.8706469899991482,1.347369550738042 +7.4099627421182745,2.305581477884914 +9.653438975177508,1.4321262905366656 +9.633009383009878,1.4027980109715275 +4.99329756649486,1.9163488199125034 +-3.463187596146584,1.1965585065410558 
+-2.2398610022282295,1.022120084739265 +-1.7464870670114063,0.9548518399891983 +-0.3082636630224158,1.0715333116098678 +4.494195165072865,1.890103307510523 +-9.763459474869691,0.11127223462503769 +1.3588173220707702,1.2266915966132395 \ No newline at end of file diff --git a/axolotl/tests/data/datasets/multivariate_dataset_1/tables/gp_data_tables/train_data_938.csv b/axolotl/tests/data/datasets/multivariate_dataset_1/tables/gp_data_tables/train_data_938.csv new file mode 100644 index 0000000..94ad0ff --- /dev/null +++ b/axolotl/tests/data/datasets/multivariate_dataset_1/tables/gp_data_tables/train_data_938.csv @@ -0,0 +1,1001 @@ +x,y +4.181595581405846,0.5033496891137835 +-6.707153559510587,-0.9527517750813709 +9.29635805279121,-1.0954925680081287 +8.47451143314288,-1.1356222434011523 +-8.405036133453283,0.8378442744806778 +-3.615063966756043,0.9002151454722932 +-6.6004240249007315,-1.1217196974389883 +-4.589374987663287,0.8042664711327829 +1.1418457760599043,1.1425619560367402 +9.3434351963991,-0.9844785578528703 +3.4843104598430545,0.17905201880717367 +-1.4563369368059043,-1.0395396716148304 +-0.6050291948933904,-0.5506963773215008 +-2.3295481846436554,0.8001968166682965 +-5.140767494241022,0.14295728524640675 +4.536622149848487,0.5360781785567656 +-2.733892568715359,1.2932045862871087 +-2.659008407919856,1.1052599398636396 +2.8793686908255407,-0.5360981149589117 +3.275976024974794,0.02222624694593643 +-1.2757772497766773,-0.8400434622405746 +-6.643960776025551,-0.9839731953832613 +-1.7562921009668475,-0.9137840704255327 +-4.657583847339799,0.7321707613681706 +8.626333913154195,-1.1516363835356003 +-4.9552909181454865,0.5301088707604813 +-4.484532222880349,0.8203013465958208 +2.7169964365440475,-0.7592757340056312 +-8.114758709519002,0.8557605293470439 +7.505829344774568,-0.7943594715061959 +-0.44645069706307083,-0.4338240416746354 +7.263439513177751,-0.32728045380453524 +-4.025652015957565,0.7774216427824938 +2.098477317349081,0.1891849010552452 +3.587082930485934,0.43759042343809734 +5.3657478179843565,-0.43044170286940386 +-5.954189878277756,-1.2048248200397333 +-2.168524667579707,0.17057190226255087 +1.1172187327129457,0.9665115783782919 +-6.362641939156425,-1.2895105955639663 +-2.6399656142837102,1.2704333844165645 +-3.382900009680645,0.8057720272651164 +-7.6297793278069115,0.5885240461686929 +7.351945645893515,-0.7474341576542017 +8.693956030876556,-1.1914661370807387 +-4.097368240959781,0.7144748457833223 +2.012721736556959,0.20853371136005436 +-0.1659300063177085,0.04036503321406176 +0.32708760202731924,1.1853810798918314 +7.9483211468422965,-1.2445936330295317 +8.383336883691243,-1.3954627044076464 +-0.1088202313454989,0.04072326683014302 +9.211998693505286,-1.240990963241528 +5.8835105265898235,-0.1869591680069762 +7.269445841770814,-0.47764117514753673 +-4.285178667096478,0.7941270388526879 +3.3051255828043047,-0.021461023409793 +-9.674782566528165,-1.1726770972797245 +-3.3433184813621253,0.7748690034063626 +-1.4752908128208873,-0.9838208807635915 +-2.2718009200307865,0.47627942943868906 +9.08400030351553,-0.9664350014119809 +0.9203184775660809,1.1189110645836269 +-8.843079067376426,-0.20980211255950992 +7.9239362839290415,-1.2936675680688559 +-5.026192180691641,0.5497868840496817 +6.903375537619652,0.33660582293443175 +-9.255390755786838,-1.1840871462765028 +5.451679893762993,-0.3204413029774637 +3.555089129573892,0.39144576725166097 +-5.683619001895117,-0.9867149714294026 +-7.9027221467262265,0.8470084174677829 +6.374117257277025,0.4527608013031873 
+6.308263538954307,0.33086979366408875 +-2.112816814242638,0.03402253118740761 +-3.1073108737259503,1.0736270029227082 +-6.868957757574897,-0.676315063270884 +9.643899457357342,-1.262425263706888 +-4.469856582665432,0.8770826085760158 +-2.337175399789149,0.7243061474850349 +-6.843574704951539,-0.6552348983116139 +2.992032264609268,-0.37527556124874495 +-3.5127299364907256,0.6237388285501164 +-9.030263180601704,-0.8629195878767735 +-4.984102986096378,0.4198934502051697 +9.685792674075824,-1.469947066912269 +3.454801385818289,0.16051858731374535 +-4.084681765271041,0.8147348635452623 +0.11374473342117142,0.6866705123970921 +6.466794071123978,0.5551931846502673 +-2.646513767887701,1.0688541729998284 +-5.87699530323027,-1.1742542510643672 +-6.015110018854401,-1.1386942037847088 +-9.209907882238547,-1.0013265124455402 +1.0248497853171692,1.200040214572 +4.80888952359507,0.46654357729414625 +5.520263816682531,-0.5657034135736623 +2.539851794284127,-0.5876464129216583 +-1.1140901757800705,-0.9030326008052085 +-4.9755809427797395,0.3617482194275345 +7.994448172789372,-1.4394735635660831 +-9.868354335956798,-0.7905844471661125 +-5.024230805403555,0.44907471993244197 +-2.0040750619217196,-0.1501386622412281 +6.284373334330869,0.6257498538214369 +3.6337998689986595,0.3711912128140145 +-1.8267549619500087,-0.6397970648367156 +4.498391505791929,0.6738467383425704 +4.345684679824,0.734071325629027 +8.064442193210265,-1.3469812032961113 +-0.6278049475262328,-0.4315911321024848 +-7.131865365260172,-0.15234105717874377 +-5.23866545298568,-0.12480932463786153 +7.6003974993351875,-0.9884801961783518 +-0.023407622841427056,0.40303359230182334 +5.541556403289176,-0.33996644582862046 +-5.187706050411126,0.08543292177796744 +0.7414800530510632,1.2819285841206578 +4.243174846685429,0.6972170326570748 +-4.308622192869382,0.9712896274241156 +6.974970128947685,0.1964604023445691 +-1.6345499809158515,-0.9854681218403599 +1.7237466255062408,0.7345766700491841 +-9.35612425936348,-1.1840657969058648 +2.303398711882023,-0.38180556557344564 +8.848007863347593,-1.0202222204447147 +-4.916385034162879,0.40955195836152836 +2.4184327447219545,-0.4091274024508956 +7.877030991481227,-1.211578710211231 +8.779158043600106,-0.9988492670353476 +7.166358194607491,-0.25208072824575684 +1.8763281986407634,0.2577261217198763 +-2.7699703355837713,1.0397870325100211 +0.32777728240347415,0.9961562882619099 +1.726693719054314,0.5758573410732317 +6.453327305382729,0.45344927049073014 +-6.857320276584274,-0.7091171400412551 +-7.585877757829262,0.6398692946387744 +-4.1958804242881484,0.8488715340979004 +7.071513950255578,-0.05681217542277216 +-0.4139067877482283,-0.3421870110637673 +-3.9997590675081884,0.6429545651136548 +-9.619652884776066,-1.1316026335514524 +1.4926633090166384,0.8191843386318309 +5.0525830051555864,0.28678646924021134 +6.035508260605419,-0.04994420152525554 +-8.972418946664185,-0.5227322484839804 +4.33838205731389,0.8393449913590034 +-5.269300913650955,-0.1334498249576851 +5.399086293571077,-0.29693670580824666 +3.276373172694198,-0.02567428934164478 +2.995734457539388,-0.46901025546119784 +9.906113546503136,-1.0957207618556914 +-2.3734165968922976,0.5974008132693484 +6.422777948411067,0.5508961883727473 +6.092704131576735,0.1657349593068398 +7.5025403760100104,-0.7843105061061629 +-5.465303143356767,-0.6927565533417469 +-4.216160989500208,0.9189640989199391 +-5.122831406239472,0.132594935040577 +1.7366384522240317,0.5558448468612788 +2.5485041482068698,-0.752257138201951 +-3.3939253022652593,0.7862622105453824 
+-6.76416319197557,-0.9169390259465409 +2.7965691035819074,-0.6385188038141645 +1.458755572042815,0.7711563388292503 +-1.6411217227685615,-0.8147223937050022 +-0.7990949543854953,-0.5739246423656171 +-3.7018664093360028,0.3942232399981642 +-2.743082722031156,1.0451669531803012 +5.043946682731018,0.12333893653204658 +3.6950483515163097,0.5490461263206309 +-9.764880509314192,-0.8868930252821446 +7.682301912213837,-1.0010267110415723 +3.7979025097453305,0.6366283690531025 +9.048948331455101,-0.896276181932238 +6.168518805769337,0.19024711834616448 +5.516702363143612,-0.29086564241476215 +8.631020134085144,-1.1789869296411568 +-0.3463883048329599,-0.11939086398795645 +-7.543965710848334,0.46908404613423915 +-0.056114718595598134,0.16616947711873278 +-0.28709972572541176,-0.19811578692158271 +-7.699964802107218,0.6091786686533689 +-7.734582478428763,0.6608204286426794 +7.3186770297013695,-0.39931732875680653 +9.732110824015582,-1.2453120953444476 +-0.09536752544619631,0.30617718516422393 +2.742105390638141,-0.6106688612946304 +3.8588597056328577,0.6454865544403364 +-0.25121799997832817,-0.13475267103015534 +-8.200091842591167,0.9820926970569565 +4.58429168157986,0.5709661925956091 +4.946354380497935,0.014715906293159609 +8.978696508901617,-1.0978169718453463 +2.907264640621832,-0.2880504543416649 +-9.670797274962595,-0.9179461899692837 +9.907658716696378,-1.086574758889599 +-4.19602465495084,0.6991235001591111 +0.4493389560648442,1.261961020645157 +-5.470122388964459,-0.7350503525475244 +8.520540488571239,-1.1353130269501406 +3.185682372660814,-0.029034339681739033 +-3.285710839428418,0.8846215072760594 +-0.37401938424639347,-0.21291545358236005 +3.0521584664393853,-0.4746105628543319 +1.2535685106470815,1.1263564338018868 +-8.147040816195773,1.0028845776983493 +-7.6093610308671025,0.6379409703993382 +1.3893006130782268,0.8966079647727224 +5.587485537904836,-0.3869080296368683 +2.607880392115538,-0.5731492996488153 +-6.4820383963084325,-1.1103859919308827 +-3.759300521324933,0.6265890156416563 +0.4859947665021913,1.0474111547357503 +0.8234041496019664,1.035380520900649 +6.490954289510125,0.5329486771722228 +5.100416720871435,-0.13348881855162198 +-3.9936664600693876,0.8413781485342837 +3.6194328791024866,0.42401233208213956 +-8.22595480387968,1.2161465503306381 +-0.963636819150576,-0.7543518020303656 +9.744312282786034,-1.2791428070122326 +-2.544796601057273,0.9896055715622185 +1.0512259533341393,1.0913023653484881 +-6.236606285256023,-1.313268657912493 +8.850851566410391,-0.9327244086527802 +0.6315678076119333,1.3494264014987545 +5.973441056157052,0.06074607907179904 +6.591431670783222,0.3204336338471634 +-7.230283390459679,-0.020529149182206965 +-0.02827992332257523,0.22286336103254778 +4.8243013681296745,0.25245028149460746 +4.414010525197099,0.649383190655532 +6.812534870458062,0.3614297030716801 +9.951537233096538,-1.323113388331252 +1.9970679096901947,0.14563913585577373 +2.708401303201637,-0.7287214380222282 +1.487001360660635,0.9236438214518486 +-7.413861719635996,0.389583970775699 +-2.643910634904842,1.0916245648911234 +-0.09694216305361181,0.07729821183122952 +5.790879460476678,-0.2607801876449256 +2.944522269622287,-0.492995948348793 +-9.576926132619292,-1.3535034473402723 +6.210698250768605,0.2893446003804347 +3.820957162675409,0.4706566230183232 +-5.9364833612026136,-1.31264441487954 +-5.090175660918369,0.3272543110122371 +-0.6741681426631132,-0.44375821332026577 +3.0397793625249965,-0.4907806204169232 +3.0924312720511793,-0.30322198693108343 +-6.168510073114298,-1.2993939854348353 
+-8.170534050540134,0.9434829810259844 +-6.301047590287886,-1.4484871797541121 +-7.744686374302559,0.820437842503767 +6.95877793112193,0.30572899346262716 +1.2744195263836602,0.957297934699645 +7.883307091595263,-1.184555844150169 +6.799458955167456,0.3284927364764228 +-7.942997401275132,0.7470124136648831 +2.9839019286319868,-0.3572587819387929 +6.169974806342363,0.18982102754620195 +-6.555548801671463,-1.2129201606747644 +5.168980621748279,0.04412755321107051 +-4.938964636620646,0.6842966697956001 +6.981631404751717,-0.036355961912690166 +8.393922434660336,-1.2187657516035573 +3.26067937175595,-0.17435190179621773 +-7.2407434860402295,-0.3123707273179946 +4.372934365033272,0.7659756364428107 +-2.4188259622263746,0.7380636738311165 +3.1606993015117584,-0.2233840169308573 +-5.711999960518298,-0.9757580542771661 +1.969795346546622,0.46589713204072775 +4.446149738154993,0.6801366569371078 +7.463433227248618,-0.5355546971495194 +5.7364288657263085,-0.4241828962790959 +-4.090136637151245,0.833873296404903 +9.831628581795435,-0.9943391348628208 +7.71362993073231,-1.0419210629041689 +6.246315700956258,0.3819921399115402 +4.648792449291829,0.71011884637009 +-8.83331098628404,0.05950367905868567 +-7.463748577728771,0.356322748274389 +2.348192731197889,-0.3030916532289662 +-3.4007016480076264,0.790958387456095 +-1.3140405336856134,-0.8342890173932463 +4.392550512543947,0.7288824710732325 +-0.39451763257767425,-0.2859438941624533 +-1.7214215851503845,-0.7439958397265221 +-0.1527199261522263,0.07558141159846925 +-6.013616714524126,-1.3223449545876327 +-9.910030963232405,-0.6323947231838573 +-8.363008367860814,0.8386901027832318 +-0.2128928358247517,0.0023974883222730517 +9.56575862433774,-1.3292042535520296 +8.704988517903757,-1.0317422357332668 +3.3602223464312875,0.3103672160337119 +-1.3859130538962852,-0.9355543278982773 +-6.049191985538069,-1.2299091219715326 +-1.564514730644131,-0.8344120249744332 +3.1759572698711658,-0.14983428485281258 +2.650295182268092,-0.6808891296663142 +-4.604613044255306,0.8613528093927538 +3.796685970713778,0.7071524710328182 +-9.59148841906854,-1.2160578602003007 +-1.9907597067513105,-0.3299220042775045 +2.995003876893172,-0.5839445858979565 +7.790401078807498,-1.1808322405108456 +2.73675163414536,-0.6146425461591332 +8.776512014820803,-1.260779011928858 +6.673316157297414,0.6311611871659322 +-1.4571887694993393,-0.94148980696898 +6.460302735989316,0.5157709580385859 +1.033182429895696,1.285208394455899 +1.3072690296533906,1.0459069345501435 +-6.102431425764628,-1.2302219469660467 +1.7655578826969354,0.554080982275951 +-4.118467482064743,0.694089564801361 +-6.3455143781297885,-1.168254697912728 +5.207002222599959,-0.12258479133099516 +-8.752316756167922,0.30050320693304233 +1.9121283192182084,0.3100870494607316 +6.178255367334536,0.388555236390176 +-7.784088568152416,0.8312132916607834 +4.338788999675174,0.7119254423425965 +-6.087932476025376,-1.2714016400218338 +6.99017539493954,0.11008337317053671 +1.6276480044566846,0.6839679282246478 +-1.740797018438922,-0.9243393272851288 +-2.1389826613407426,-0.1137363838155683 +1.6808747485681952,0.40696768421105867 +2.065518850464372,0.09217353093986907 +3.143231059512628,-0.2942203396757403 +-3.627826012603297,0.6673100279651757 +-3.8304658991242224,0.6538496307581714 +7.2512107452039345,-0.41063964189513835 +0.452558571107744,0.8906820905117814 +-1.6379729042345925,-0.6514170114497153 +-9.119611234098087,-0.926589326734339 +0.20831310506974177,0.7500021797328734 +-0.6575028126555225,-0.346900818856277 
+-6.6157240399051,-1.1156981343539876 +-0.4863080547963694,-0.4335633617707058 +-8.35807328435877,0.8414307835838135 +-6.081422182289785,-1.3081580225159102 +4.5001753486360165,0.8456193240602863 +-5.112046347273051,0.144529478035545 +0.004169835602985117,0.5202015419131091 +1.5258581732397225,0.7888438013937281 +6.82530334492229,0.3676762838706446 +2.6108140420803356,-0.60049130590743 +-6.160460164984257,-1.2164279619591583 +4.631280804946186,0.5219028127398673 +-8.34062005525022,0.8977937394270099 +-7.197558788234634,-0.21523156667409446 +5.943037307968378,-0.17252127648760773 +-0.7590249920409029,-0.5027640606735719 +-7.810977673575369,0.966041928530056 +-1.6999295537181158,-0.8343654569856097 +-7.051833462665606,-0.3803967864849491 +-2.940883764226223,1.1284101229426997 +-8.888917188517418,-0.21419880745014896 +-2.4010554482296698,0.6997359654771904 +7.597511406828922,-0.8371610434376516 +-8.394411005043306,0.7404564620235644 +-3.8363789255166925,0.6741277798593968 +-2.917657838762451,1.0778353317804719 +6.2559179444617365,0.5239964482190308 +3.967297611630727,0.5638890360077706 +-9.131378388982384,-0.9488388795109715 +-7.816803644817681,0.9063251768149693 +-6.975753028590353,-0.5301651984683432 +-5.051618982994771,0.2722725172307913 +1.623201185989295,0.6266683766256396 +-8.432468159625486,0.7084872753106388 +-4.281046259909447,0.785419850716015 +-6.332265083441876,-1.2744048582129301 +-3.5903642094034627,0.5790647366627611 +8.394766492453552,-1.2245209758417497 +9.504752266169408,-1.2256038453880747 +8.305894601558933,-1.4665636214060744 +-7.911933089314288,0.8483033752856065 +9.816698318933277,-1.2241776476206347 +-3.289895726751375,0.8954340271339188 +0.8733150039637305,1.067054255360149 +4.001929239306087,0.4943327878707124 +-5.962483197349311,-1.2277229309318551 +-6.870898686119634,-0.7895334728977349 +9.440954943694877,-1.2982814998037837 +3.0926169362221856,-0.3362231380873852 +-5.945425522927428,-1.330976033452442 +-4.586445125220919,0.8802069036540292 +5.524253319129349,-0.47234304576292474 +0.16886939424744085,0.5949346462714288 +-1.2539422041806212,-0.8358833607359123 +-3.0078276214048887,1.1628843633709627 +-9.350957422256077,-1.2546541418831327 +2.5272342789832525,-0.596101643100264 +-8.381366833986057,0.8327446650105684 +4.6766453274924515,0.42916752847205136 +3.2664609946610046,-0.04634209826560856 +7.303869945444763,-0.3837118487725626 +-6.107403935498947,-1.1940819363682738 +-3.652371637526315,0.7816477236966723 +6.413462581267165,0.7754178314506837 +-5.607796579287667,-0.8382408986170289 +0.09942529516384724,0.6763765500926415 +4.320053233525165,0.6797724597390807 +-7.056309497357142,-0.24483137120149695 +3.5920420848728796,0.19148304324595802 +1.5725516747485102,0.6900047481917093 +-2.4742320260858897,0.9107704505395886 +-2.4239786073040155,1.0102407144951762 +8.847636487832318,-0.9762727296962262 +8.676641677196713,-1.19687964774216 +-5.444347998615351,-0.5467039118331731 +-4.583510183887478,0.8076953603442292 +-5.1592853469751745,0.22900837804586716 +7.029688569592004,0.03038031895517782 +-3.289794234571261,0.7558130918867741 +2.025280162880762,0.11611925073402188 +3.0967662890677516,-0.16017740387678245 +0.8546014082853048,1.2349483763481743 +1.1088307847635992,1.1759560219148286 +1.9674417766896024,0.34274085532478704 +6.640406271089834,0.5206098566714293 +-4.722065493165605,0.6728800671594749 +8.042612685496042,-1.3004413811612978 +2.8085451953709395,-0.7607032244450957 +0.9374647730740051,1.2123644353133738 +-2.8722704012338656,1.3185282301561243 
+-7.341770111836556,0.029424157672241727 +-0.6514337785910165,-0.5817900940542972 +-5.072923823790845,0.22206310260909617 +-3.738438648906657,0.6905786364555709 +9.486725489210354,-1.3731168092203636 +9.058275066292797,-0.9799139923182938 +1.3466581375722075,1.0591131132507092 +-3.1116773643064555,1.133395834522501 +-2.3769221656801736,0.8030885766216396 +6.47316599978209,0.6642942520839679 +-3.801421004538117,0.4547910982866099 +6.676033030617866,0.6161559557812949 +7.476014144590003,-0.836457363814808 +-3.5769673856210775,0.5529192199602005 +7.073157776912886,-0.02224065483804591 +8.601404238705598,-1.1165729062232115 +-3.611357134206625,0.6777612046556815 +-1.5440194613391967,-1.2554998954826952 +-5.5941148230874695,-0.8580252863922608 +-9.048465350477276,-0.8565896354291997 +7.116030802287142,-0.2144515654818184 +-1.600539200738309,-0.8598312339863183 +-0.6014991994474883,-0.32367087189086374 +9.314775992215914,-1.0966589467671746 +4.298975297546722,0.8504742194131814 +7.067778034111573,0.04797886775798342 +-6.6460436377851195,-0.9769355623838415 +0.1992588429332578,0.5913071731443019 +9.399451462600524,-1.2858631826593963 +-0.7213602845238487,-0.5752216314876246 +2.5404373070031068,-0.6011011401697485 +-9.788017916160076,-0.9499540704739621 +6.984563828029664,0.23853265746154667 +-1.2430035195575648,-1.1359707675547652 +-2.6221558708493475,1.1699134970677738 +-8.776417083174202,0.01767280921682416 +-8.797262530245938,0.03705042620780722 +1.213415903725119,1.0853059991014948 +-9.394288711971676,-1.2551414396626581 +-3.2285769434701943,0.963718490524411 +-7.486729993797091,0.4740870986516179 +-0.7674539492233823,-0.6991001949651653 +6.888118629320491,0.3971677075382506 +-0.21409741711778096,0.04222534478990177 +-4.349420397286536,0.7887274683009854 +-9.600443368620724,-1.1658945986109275 +-6.285040157861089,-1.3781566204951112 +5.392162679425979,-0.4392959893172532 +1.8307468230556445,0.44859583573262335 +6.079974529215989,0.2515326490522626 +-6.3906878289706235,-1.2878304831863399 +4.4145318836952985,0.688183931190885 +-4.36693285274243,0.697812959107212 +-6.243540451459143,-1.3283762472393335 +3.441430894831839,0.13783279482125543 +-6.469053652495198,-1.2707690385307975 +-2.350579886141295,0.511113312972743 +4.640485472306551,0.4757547015897563 +0.14756527052962376,0.5366422907590573 +6.587504624865659,0.4333236821644888 +9.873482450295729,-1.1902303193451647 +9.35858644450861,-1.1760314528289595 +2.9449465911012536,-0.4576775309114664 +-2.6765089327770712,1.22490704901497 +3.8819540368426164,0.5028479936389707 +7.775339011186823,-1.0809409712693807 +-4.267330261925032,0.7928353188611638 +7.434084862218771,-0.6684199857764607 +-6.024657236166079,-1.2712241700946783 +-4.693145284657554,0.8691625361555482 +3.7651419477474017,0.518121784694231 +-0.7776008565049466,-0.7117587042696287 +-7.675894578683767,0.597926084441917 +-1.8156167505660026,-0.6743346100472832 +-6.814725854464886,-0.781968099942633 +0.8910311620195,1.2219024039627038 +2.690004804476946,-0.6653719476988272 +-4.532529175073435,0.8509042532708129 +-1.716420636597661,-0.7558257639299989 +-8.501037861261779,0.7635966897153887 +2.9146955212319132,-0.3853971361959695 +-5.48579334181853,-0.39649953661985426 +1.2528571815306933,0.9131832474040211 +3.814689293751723,0.6951547527399984 +-5.13740536486603,0.1165239487769866 +8.703085168640897,-1.130224042453123 +-0.45839144530473774,-0.24697758193837577 +-9.267261663367549,-1.1591153303237072 +-3.3204298142064066,0.9985928787389067 +2.8798662770812733,-0.5588463345832659 
+-6.711117063811702,-0.9323494255803149 +3.6261710236930877,0.5121871649119067 +-4.494519710974828,0.8927022087894537 +3.9106420408676,0.6029426203087142 +-4.495909768702116,0.9538334435965596 +7.139182947894177,0.0012293026645224803 +-5.401166902311969,-0.35620094455059376 +-9.827325201479425,-0.8416779471786969 +8.465150878931347,-1.1836871713581925 +-8.183423393032783,1.014663657362307 +5.591480843118752,-0.46913756635354636 +-4.776550978095017,0.7252673081387401 +-2.610126011840222,1.1719934087479054 +-7.9628051719103885,1.0225655498893595 +4.538233706760257,0.6824751422288736 +9.173121385397309,-1.1310909202266495 +5.469788880134697,-0.19012326993699316 +7.509984128925545,-0.753635733515353 +-6.691758061331345,-0.9368676314162255 +7.935690226416155,-1.1294736681425737 +-0.3152291174200972,-0.22386701409126697 +-6.679243263635573,-0.8671279777374629 +4.491841497210638,0.5885579433736314 +4.318928874744303,0.6103349714485462 +-4.9878004234483875,0.3449082861552065 +6.0470497376021335,0.10469570947842208 +-2.721421404110846,1.0946060059218161 +-6.96090213837592,-0.6321570603784227 +-6.365762572702608,-1.27009226660975 +5.994159176428656,-0.09788733599950189 +-6.265396150160988,-1.255524644907267 +-6.869988579569405,-0.6768033054770658 +-9.652043639598308,-1.0978385430419617 +-1.1745855554903084,-0.8748578547219261 +6.948086259780268,0.01202886942301995 +6.536305162195664,0.5250104373937031 +1.4934149552792686,0.8703005443844307 +-9.090259962803824,-0.7170494607462587 +5.992632248272418,-0.020877781197774496 +-2.56280586931187,0.9376544893417247 +-2.583642199740419,1.1256583805726021 +5.592035956930195,-0.3983649061008147 +7.880058466256759,-1.2236814688731203 +2.7953221406944766,-0.8248426296870279 +-7.5353781400873245,0.18934171397646166 +-4.113850942714938,0.752419993282082 +7.346815786266351,-0.3747311118497376 +2.952419839538574,-0.5010618817715967 +-1.1952086339514345,-0.9530063427414718 +-7.617677948919387,0.4132644090067679 +-7.456724172078605,0.3153437232597516 +6.489179759690388,0.6090959342258009 +1.555108344359457,0.8897840431750538 +-5.664856326005406,-0.6622217544695521 +-4.299653954354676,0.8124446073079945 +5.502936823224314,-0.44564485446554 +5.323950505140527,-0.29204327125929014 +-7.824990002737601,0.8326379595071931 +-2.051740841088874,-0.008724847368530103 +0.5885531921152332,1.1244158890875928 +7.678094111344143,-1.0516981068173759 +-8.406671420228982,0.7913254151232785 +1.9975637334458476,0.19102897682026665 +-0.009287678905099739,0.46642319809885135 +-0.3943526326309197,-0.33414538043865055 +-2.033514219922994,-0.21318667647999234 +3.1070925080308687,-0.2999869759449118 +2.1112316210173216,-0.08289795823707224 +9.772535569075774,-1.0505709560716812 +-0.0032221905696196984,0.2849035384919475 +0.9797903324825761,1.2319964214090537 +5.649866010007338,-0.3015633575489094 +9.788203737742528,-1.077236687545406 +-7.501867126290271,0.34967108530010066 +4.96875031899811,0.18760786368024357 +-2.469443404553475,0.8054549264806556 +-6.062128926621572,-1.311096408627588 +7.8210055173981194,-1.318985101570813 +-1.4284724685164072,-0.8774504219184855 +-7.318395527438754,-0.013847098598213356 +-9.610149628012813,-1.1535769818814614 +-6.793800996273455,-0.810886016264104 +6.198082168757416,0.4195855994076325 +5.688727674708618,-0.36753416807919465 +-1.621985893751262,-0.9079882715813797 +-6.695612106583869,-1.097827421367056 +-4.744649845747002,0.8246236861224863 +0.2780468709467918,0.9728523298319728 +-9.160335479232575,-1.072435689997801 +2.5053090264605298,-0.500988992382115 
+-3.7729519376242098,0.5525989564752671 +6.650160639511103,0.4101990552557468 +-9.538094987558296,-1.0658185097350514 +8.684187279276863,-1.1024107598982134 +9.000157612625628,-0.9925235746889355 +-9.790769746686976,-1.0164039435731744 +4.643922334925708,0.656653299502657 +-2.5086905351236766,0.9894925637906288 +-6.71531627697589,-1.183329847792611 +-5.7318410523512675,-1.0329088120836327 +-1.812306697661965,-0.7036904809230382 +2.60492628914815,-0.635500533953251 +-5.69844985750847,-0.9245742074564395 +4.031907896819785,0.7729828800606271 +9.24163672317841,-1.2394824994742135 +-5.88743958240582,-1.0896507929819166 +2.5857193204340905,-0.6200944720247028 +-3.831583050167975,0.4862873541109154 +7.730519190391717,-0.9598604838106769 +0.9106477909585742,1.2158151655999956 +-9.895742178726273,-0.8091649122541164 +5.199341607210677,-0.30247011371041876 +-8.210453635975895,1.10461321010706 +-5.513202945312358,-0.7384668559946589 +3.2028056857659455,-0.15084041527570316 +9.403441177581758,-1.2288611762912358 +-2.121990625425227,0.17236796286045308 +1.4110106688524624,0.8436846894040761 +6.939885508667398,0.18863895729945668 +-1.7265983811931243,-0.9118390335499392 +9.64303814009942,-1.235468662012191 +6.254233686327627,0.38547683380720815 +-8.417220442003707,0.8061037818425282 +6.932672681913622,0.44315956509831944 +0.9315174372697754,1.132394743682046 +7.633441838820466,-0.8412655750633671 +7.677430730254989,-1.0905614942714001 +-3.592790631723597,0.5487214045395781 +-6.693468736088853,-1.0484437656418508 +-9.47392539264898,-1.1994398457189372 +3.086282817010897,-0.19658943349017854 +-6.014662790396738,-1.3408520992944233 +-4.243358746316277,0.8620675374337428 +4.403133062599952,0.6585884836266697 +5.167902959977667,-0.017427990246348044 +3.7085615348291827,0.5353521345102339 +4.441330311411615,0.9409944076072014 +-9.387513491737785,-1.171902512558174 +4.1810603958192125,0.5653036407627902 +-3.1847855132588165,1.0601824789054695 +-7.593423117251462,0.32275479558306286 +7.629205714894177,-1.0669573611169356 +4.568299141843891,0.690235987781961 +0.17539205677463343,0.5852073265723065 +-6.963752022686762,-0.7344688801935253 +-0.6568743480623684,-0.5092948101415036 +-2.3906545201879896,0.8423089664992571 +-0.7654409765093995,-0.7043519463451683 +3.966142528659642,0.6881738054694526 +-8.428222778383471,0.7103648135568694 +3.374836140352833,0.11422470910237556 +5.76877933612546,-0.5373796569975807 +-4.41315132507707,0.8812601601113772 +-0.364870067233527,-0.2354942582358946 +-2.225651828571134,0.4536810315034804 +9.792201341376845,-1.2946909960680668 +9.176459160750685,-0.9893828753557525 +3.41246585767502,0.1733695800497905 +2.9693373670316565,-0.4697960796985781 +-0.2662243815255927,-0.1149450861353222 +9.51920108319273,-1.0533980160045087 +-5.291680193018294,-0.09618017686880088 +9.39720441600825,-1.0744811216574524 +0.8457957632751487,1.299094767439693 +-1.002592550414473,-0.685260906937623 +-2.59174777034136,0.9787080824937017 +9.842134144787067,-1.2440395170190592 +9.116116108070514,-1.0718243896225372 +-4.271246325360277,0.9230063317388262 +3.1496025816950417,-0.1521716767928633 +-4.7747549286245174,0.6502004552444023 +-1.4052012230794944,-1.1014340792422181 +-0.2397257243637405,0.004851709533736932 +-9.415535797034678,-1.4437194220711387 +6.733746165880113,0.5672475108123651 +0.795820684929339,1.296113023254252 +-6.207767637078167,-1.3622009401962112 +8.823425504828563,-1.0270773775644584 +8.513215830863729,-1.2418463979638286 +-0.88438660074182,-0.6794541621853699 
+8.204418819024784,-1.363106470317066 +9.737761619254233,-1.3196164802083508 +3.7481861713518643,0.6094884028130299 +-2.6343707528473725,1.252810351996732 +-3.2436592275105585,0.8383251091900531 +-9.882371135957207,-0.6621700889241459 +6.6111906487900365,0.6437188619912587 +-9.036426584503594,-0.6288998537620951 +-0.278505257188538,-0.2694940016428678 +-7.914217243541746,0.8053538166107659 +3.8032044730894388,0.5721034331300667 +7.778603190215673,-0.8837579064789168 +-0.20234069670049237,-0.06909051570881031 +8.857583065907683,-0.971290297715944 +0.2866757482263935,0.7676344640312025 +1.210936806495578,1.0696788226460718 +9.555162793429929,-1.1991372025808085 +9.27759059902354,-1.0599362143279984 +2.207414729583368,0.10281285203776938 +7.890617015254303,-1.1060846394118693 +3.4775500539708215,0.17548902380921527 +3.6988935504716913,0.27909783372910774 +-4.850350914086681,0.7167494119190156 +8.39590322726638,-1.415176704884097 +5.439544321963762,-0.39905761125126593 +-3.897250286884933,0.6127717025134645 +-2.651115795527792,1.173040194081669 +-5.085143189718018,0.11711021500350607 +-1.750677100689387,-0.7801464167508031 +0.41733629047141285,0.9507563568461843 +7.814892390370866,-1.1177172436385103 +-6.6488875277290305,-1.0029891500770403 +-1.7538976567601416,-0.9343500843526953 +-4.481363951288234,0.818268452357857 +-4.308430043382412,1.1485632762033067 +3.3385952444161227,0.0735857142151965 +5.788726349816669,-0.205899139216021 +-5.616498594267885,-0.8540388853041013 +-0.4514477269474426,-0.4603274756393575 +-4.441542546641827,0.9356533673515943 +6.273040106790724,0.3516598440451635 +-9.53803684672042,-1.161849393107251 +9.462152214569606,-1.2679520660734347 +5.6140789700583404,-0.5193729947236423 +-0.8458810416559395,-0.7692562957496961 +-3.050629064370764,1.231047517264307 +-1.3851541620984165,-1.136058567667779 +-1.309743177692333,-1.069921868919807 +4.949023320572623,0.22723172352992071 +3.006205775103719,-0.580269195311387 +2.7349356182893736,-0.5334307665177122 +3.9686622722127614,0.6251027599180404 +-5.447229433469079,-0.6587624459307033 +2.535681775043166,-0.7715088505154927 +-3.5496733681993486,0.6963065559306506 +0.4323884491378216,1.1139573057566077 +4.016269342979712,0.7695836248371957 +-2.9293062644823347,1.0837784750216815 +-6.432463922268362,-1.27883460292241 +-0.8655523116728894,-0.6037669022207041 +1.3798618884507405,0.9065538981901546 +-7.300055118265213,0.06736821764958764 +3.5518459705790484,0.4732337314113121 +-2.779546846294863,1.1780691628761355 +-4.11090210860956,0.8231653480202598 +-5.946672492858953,-1.2231906658502794 +-9.953923610251046,-0.48882214861787043 +-8.28521222276677,1.080055313659425 +8.015398829945092,-1.3657241904722714 +5.2022245148307,-0.06607802274957648 +6.078078871311167,0.11955541963776738 +5.672218958284564,-0.5164912214509881 +8.15020532434767,-1.409109502017853 +-9.532203137706627,-1.3640983393202524 +-0.036745771487227685,0.32455189296913795 +0.9255878472103873,1.4059018803054566 +-1.2046335733607538,-0.9829450745226747 +-9.934506101412033,-0.6990643343827938 +-8.834332765767059,-0.08154539632815702 +-9.521662673843135,-1.2797432907214792 +-3.120207670605204,1.0322786090634675 +9.744136683267172,-1.2664587014813438 +-0.8902343411123681,-0.6587856770048564 +-3.1493292834503617,1.0420097233751053 +-9.69812778592874,-1.0291333032906995 +-4.88184249465014,0.623184191824351 +7.851871581000211,-1.3219751239710162 +-2.442034820739849,1.0421141596504568 +-4.996647899828076,0.3028548420438663 +-7.248758672279703,-0.1132983959670078 
+-6.051380739555118,-1.2499638758230447 +5.369095854491395,-0.37861019160737175 +7.519052308372508,-0.646338317542593 +2.3227295082659793,-0.37733207277586117 +3.5132841246808333,0.1929118280166594 +8.982361681259793,-0.9940933388722171 +6.2063863624512825,0.3545611477208638 +6.969353282745744,0.1157499330233551 +9.038837366701312,-1.1388837935714873 +-1.191804732400632,-0.9584811134606306 +3.1469165750214323,-0.2128140695926345 +-0.4827254867869346,-0.3443988961182711 +-8.24734025879117,0.9167820861284822 +-0.8328299012131701,-0.6558811948716172 +-0.802824508234842,-0.6971445784901957 +2.6706927837325622,-0.6285605518314544 +-2.022023069854089,-0.1489767420131326 +8.02949476403796,-1.3053162277953807 +-8.664236430671423,0.31077774496413846 +2.400105169644928,-0.6143283279094879 +6.205133879169459,0.4184564852012034 +-2.1998777489996395,0.17028916166524788 +0.9604162630072377,1.2783049112404572 +5.540323690714427,-0.2455018784354515 +6.681686442473599,0.46102087557228155 +4.1686426008432065,0.8429186945093108 +-0.42629651531931856,-0.07227906799606779 +1.0497065540669794,1.2567922978719137 +5.346071147681117,-0.266991779367659 +-5.414579088971362,-0.23497619378187348 +-6.1418728895481465,-1.3198960969103304 +-6.460120015753424,-1.2176940488985764 +-8.211285163158564,0.8894645618393662 +-6.661524815557005,-0.8779659021555735 +-8.046503809552426,0.922783307209231 +4.519312179136561,0.6170881376818017 +-7.968345764995316,1.0461236965382859 +-5.886305272260399,-1.0561361512479428 +2.8729425139561737,-0.5755915923560402 +-7.99424469655564,1.0548911087907116 +-8.921236356203469,-0.38490090834920926 +4.065373544988141,0.6016521996128403 +-5.1148084972673615,0.2371326276260519 +-7.333437442465627,0.10315672713261392 +-9.304927590705137,-1.2452218631236784 +6.952753942756043,-0.09101537772695234 +-6.863175289351563,-0.6180878418280324 +2.4263822996895397,-0.5278366278483339 +-2.03093963290911,-0.3282786806786583 +-4.473740287018231,0.9968961587559224 +-4.510792835681276,0.7399315264293613 +-3.612310919031003,0.6325066271831992 +5.969671256076188,-0.08525966155837758 +-1.7595079318726725,-0.7198672153154203 +4.107171955947582,0.6484047336611033 +-1.5385622634846374,-1.1329639807986025 +-9.88221713293946,-0.5138169157645485 +5.4376034790728145,-0.38094349017247503 +2.650292531284766,-0.7224463194817846 +-8.467442135337208,0.5950945352256906 +-9.496517204140794,-1.3563336217594686 +0.04348039869169895,0.5498268227004524 +-8.118160839402357,1.0213060055532628 +4.458311245537239,0.68397497506951 +-8.864523622932026,-0.40558898581790986 +-0.12357574750520328,0.19110653268114913 +7.269456029578393,-0.4842720772263711 +9.529350736974838,-1.3234755087392895 +1.6454674987604676,0.6828189482478751 +-8.572369723848436,0.45078659164664225 +9.281083515560535,-1.1028998559763923 +-7.866138044488942,0.8532618255870619 +-1.9675654110931884,-0.2565670332725001 +-1.4104927803533442,-1.0532504913558773 +-2.9226938595767353,1.3166832112609232 +9.64172879303635,-1.1452745296462057 +-0.5182442875868212,-0.28408325267776197 +2.4743780492837963,-0.6642031771105443 +5.023611854470378,0.2543796421193537 +0.4287118643888643,0.9147979849248574 +-8.871417001410155,-0.13438406525211333 +-3.2284197849395073,0.882604145082638 +-5.219065148777426,0.12479230444471698 +9.485729543452315,-1.1697497715953369 +3.026887226972597,-0.2501828008075821 +7.341851644932653,-0.6633484835013486 +-0.08538613456692801,0.45703074066075244 +9.091443756060237,-1.034532875862857 +-1.9306716336012322,-0.5752457342327443 
+-6.84862624603059,-0.7730719383672371 +4.328951319100289,0.7918088146845591 +-7.032660052807405,-0.6674251686937611 +-3.68334429212338,0.7883936548984318 +5.098123842123577,0.23707800830408152 +2.265111312227383,-0.14473309151025063 +6.643997958567226,0.626365801024356 +0.2574698608793078,0.9827912157434661 +-8.727642232598697,0.2446768366633621 +-8.564794861212444,0.42092273267844516 +-0.26553862019556007,-0.0013320045883965514 +9.05454114481888,-1.1021220992580691 +8.932147804934367,-1.0060995831501915 +5.3035299073010265,-0.207907809598733 +-1.0388532950558051,-0.7745257430394833 +7.07001501541939,-0.0024107016865352535 +3.423691458928207,0.14269106420784813 +0.47356257592360507,1.0272830578449064 +0.17695879539271075,0.6213231493161262 +7.935017519007906,-1.2757963909978023 +1.5456634456638696,0.6993085864459806 +7.925594347743573,-1.1532305761927841 +5.631391127061889,-0.33840220188451275 +1.1979287536529206,1.0460861094638083 +-0.6074651264638842,-0.3729526892176745 +2.335505779258003,-0.3253885097971716 +9.664034048945737,-1.2098854835522665 +6.951759161345912,0.04995911426281438 +1.2231333191827751,0.9857060268282369 +3.7670205612915315,0.4943425100699603 +-7.717782833664097,0.8202271869606766 +-0.14599997465976067,0.052349783476054304 +-9.539667976160256,-1.1717548912979374 +-0.6630996343537277,-0.5403878132592281 +5.53988233253197,-0.3070796128304241 +-5.653285326929861,-0.6514110836387955 +2.554493810977263,-0.5328231898370959 +-9.176426288329935,-0.8171536744370059 +5.788702526867301,-0.22688459315559045 +3.696074192847423,0.37075647163341746 +-3.5009046833929918,0.7598103650798309 +2.6231733582961207,-0.7001597963636716 +6.499879302074546,0.5953487355629372 +-4.855616091902682,0.7101791826294316 +-1.6863079593901773,-0.9373185072794415 +4.912847591231344,0.27573485897029715 +-9.909315795947325,-0.7955288082837022 +-8.300306176758173,0.8869412303707956 +-3.726962401464018,0.6645323926674812 +-6.939884163720915,-0.5500106966967797 +-4.965789317134677,0.4722603126814917 +-1.6928408471873624,-0.9425652104213063 +-3.6856262069588257,0.524673123274881 +5.251180524119977,-0.2089895083371699 +-8.56735539659288,0.5279843154564126 +-2.854828914926335,1.0005010283149 +4.243195514289502,0.6406980235255512 +6.648550686712692,0.5470605723737625 +8.583075272001587,-1.2210261686266237 +-3.0980997041501634,1.1107801178249754 +0.9710515077636082,1.3515075428098442 +8.283019443320654,-1.3434127637204136 +-5.028904494560763,0.2729783237621142 +-2.9221627511298998,0.9949390564486943 +0.5929542684729974,1.0981020868050528 +-7.903551048401872,1.0094928545466808 +3.9263198980555636,0.7830506152944723 +-0.27479434742392783,-0.22124390202574223 +7.912922451292793,-1.291805766560888 +6.94411236714361,0.2011279830085042 +7.19843855293318,-0.33418126746894955 +0.24294626349039916,0.6684015141650211 +0.2607081039157819,0.9632083398528979 +1.2254100248578847,1.2564465928175914 +1.2104788514180207,1.2032691045245436 +-0.8294135212272646,-0.5615860120855498 +-2.2481254638140453,0.44700843485494024 +1.335048925453517,0.9694485559909909 +7.077933904802013,-0.40609015392738446 +7.922777887120517,-1.079292033908994 +-3.2679149223157467,0.8905421153827058 +6.831848014308491,0.1684477729176032 +-2.873820473065955,1.0662957960031678 +9.331819015685138,-1.1136995568854733 +9.900924702013569,-1.103659816018776 +7.001057544181233,-0.02730368732808193 +-0.672081064946406,-0.40661882990110676 +4.950367507891125,0.17664669552420612 +-8.171899447639571,1.0095199546381817 +9.629919849161222,-1.1405796073808467 
+7.054201442588565,0.04067877752246857 +9.991713661105436,-0.8624512082210533 +8.875206924720722,-1.3058049973121724 +-7.751219116874694,0.8286087885119967 +-3.737339338097087,0.523376072558217 +4.191799886028452,0.5641636975525695 +-0.8646912216671065,-0.5561987854585234 +8.918698474899966,-0.8502513364109417 +-2.920368385289027,1.2583821857645314 +9.742770176911293,-1.09300231140871 +-1.4546655714971877,-1.1867928726891912 +-3.849794680896497,0.6818850682173301 +-4.532792150088758,0.8339730451530835 +-6.538756682268342,-1.2269304263357868 +4.5899643792861475,0.5626725279571376 +3.864171281103541,0.6433525316706028 +-9.255289440326866,-0.9566304566573429 +2.3327202362413857,-0.2976893961675822 +-2.6555512356361444,1.0714942590143124 +3.3145675241141426,0.24252062272728256 +2.90940907171612,-0.5526630281393383 +-0.21780609996990918,0.05616083500056228 +-7.801033626587785,0.6495596692347301 +0.9089124008643559,1.1558236354443774 +8.944725034060049,-0.9895773181259723 +0.14576347249492194,0.520024974622018 +-9.369104634796685,-1.0268831664398195 +-5.410227515355267,-0.4347071369827905 \ No newline at end of file diff --git a/axolotl/tests/data/datasets/multivariate_dataset_1/tables/gp_data_tables/train_data_939.csv b/axolotl/tests/data/datasets/multivariate_dataset_1/tables/gp_data_tables/train_data_939.csv new file mode 100644 index 0000000..29ed524 --- /dev/null +++ b/axolotl/tests/data/datasets/multivariate_dataset_1/tables/gp_data_tables/train_data_939.csv @@ -0,0 +1,1001 @@ +x,y +4.460848711407879,1.0981433451338958 +1.4229926127017833,-0.028586978483167182 +2.1841933878085307,-2.4480552906987323 +-1.7872867784234678,-0.21544487642056137 +-8.99431667658165,-2.3541055404542637 +4.670349414139645,1.4405536997541084 +-5.733616820095637,0.44705717030910486 +-7.762973083159292,2.13254461882453 +-7.14143234059382,1.915013540928613 +-0.1645988858943639,1.6651645471557495 +-4.7222218000154115,0.41162624028347206 +3.3054303678738033,0.21429686445894627 +0.24536723764051693,1.2583751495298643 +-5.694074958637131,0.4404596471715378 +0.014829326142869803,1.826627608997529 +7.1457737221956386,1.2565719816510565 +0.9257559555217973,-1.8180487689307452 +-0.9092613975976569,-0.9679497648562182 +2.8197006917879364,-0.7165418531600636 +1.6102489283324495,-0.4035811406229242 +-3.4335499889861953,0.04373697391018949 +3.3085344623603024,0.09557384326115986 +-1.385108933127647,0.126076652913356 +9.069101949845695,1.3471004905066832 +9.72404396029599,1.1737130613807567 +-5.524189864428081,0.9344231648901038 +-9.16402503249548,-2.158138717506348 +4.341649850553084,0.5681718209510055 +1.336648176248616,0.11749583537204478 +-1.4754816286339114,-0.18845810681692385 +5.606915535227541,-0.3779597886270815 +9.06651611591252,1.4890586912789443 +6.856670652082109,0.20162462654563085 +-0.5881815764779432,0.023684910992990083 +4.23740885174913,0.24751153181178476 +3.600472427815662,-0.7654044327140038 +-9.830918015020401,-4.530069648688303 +-8.161076903391962,0.17059908879312347 +-6.033927771145887,-0.9436046195127483 +4.382861323626543,0.848892736690925 +2.157891765702633,-2.1400276803277682 +3.0504144141153677,0.013559593628363314 +9.111873503262402,1.3945469613881265 +2.84988733804596,-0.3190182255535289 +-9.636155815867117,-4.0130638182580896 +-5.82005893697672,-0.009179882755382895 +-1.297165977823859,-0.1683722582218272 +0.6176070801862608,-1.1914935099685533 +4.457458791619494,1.1220519570908842 +2.9443998908279756,-0.2123995126002753 +9.644392883532397,1.3600186537307672 +7.161903597221332,1.2386496825746178 
+0.7670180866731648,-1.8161972271902909 +8.279962532447698,2.252942843632707 +-1.9887584801913647,-0.2094433507634831 +-5.346403809932369,1.0887673687066346 +3.9139383033932913,-1.130740784169581 +2.0812691253343107,-2.0899617941974356 +-8.607509447593277,-2.8796722003765423 +3.4562830531367754,-0.36237818106671243 +-8.722569425770594,-2.9409230737140826 +-6.1516819514371015,-0.738006684247128 +5.853026151143283,-0.5466802378536665 +0.956334971498789,-1.5377174074253726 +0.6466032258158183,-1.4741410404991802 +-5.723201026761009,0.4080437501830505 +-6.398657708094015,0.30433580723455766 +9.2446925276341,1.293856194972086 +-0.7533811989447834,-0.548442850177278 +7.915871222815213,0.9752047198348457 +-6.432556493896655,0.30273045592540054 +7.479798060687838,1.5481407007254127 +4.589699371953589,1.3738665393322238 +3.922190922969193,-0.9666276881801754 +-7.684391429355305,2.339902899394171 +-0.7322638617201847,-0.5117722274942028 +5.34858761674978,-0.3509968264517922 +1.0698423145138047,-0.8654386104478402 +1.392981787676785,-0.022826807752715894 +6.152687450074264,-1.0423531455019535 +4.450586320047471,1.0054266512066963 +3.6695610709948805,-1.1250819659285936 +4.03306204866826,-0.6608799459398709 +3.0655763187325964,0.026767192285682986 +0.5557442294404211,-0.7141867335490899 +8.816844940164493,2.095590548910811 +-2.8775410868664193,-0.7194937752389056 +7.827275342731937,0.9936794991240445 +3.130158568108632,0.10208747510517785 +2.848661640670942,-0.5255662996419718 +4.948501185109983,0.5946346899730839 +3.7655873469405243,-1.1550093892127016 +7.371307802603767,1.4698619278438259 +6.333180428496262,-1.46431804460067 +-5.135904726400282,0.7249227358357887 +2.963946327563147,-0.10071139236801185 +-7.474199099284391,2.3245160665026505 +-3.289121136091669,-0.22437015263559368 +-6.807758687008736,1.3441109595509642 +-5.170420375502097,0.8296113770663479 +1.504113922112058,-0.3799165948111968 +-6.448055384096666,0.46700296777611827 +3.1354095431808067,0.11745370148143544 +5.564883035444446,-0.6529124036282599 +-0.8204118774421421,-0.6527079054026073 +-9.19747995535512,-2.2753277013247595 +5.855002440212473,-0.595438257235246 +1.8827411553272064,-1.6513560579839333 +-8.60725024616373,-2.947750357291558 +7.943828423118351,1.1029985951438748 +-7.421598522184695,1.9461890552420982 +-0.5850474662368264,0.03893576055270116 +0.30918677806795003,0.9964412754172928 +-1.0212644721088573,-0.8040504350890507 +0.6094189184693874,-1.005105822700609 +9.981354609825758,0.4146377162112046 +2.7325802343535126,-0.9009223395959447 +5.445193143236642,-0.6716281551853025 +9.01063431533725,1.2995943192196426 +-5.070234453628446,0.4856530397059674 +-1.2342363539715429,-0.41014222517324883 +9.639102001416688,1.2152558997803562 +2.51906180207262,-1.864097078382964 +-1.627719706735423,-0.1070079030435083 +6.143394780569036,-0.8913961510490035 +-8.079025229164543,1.0489659873877075 +-7.440888281449788,2.001786933142861 +7.81039335860151,1.060858899892242 +2.2499050779517304,-2.4153373465753187 +9.75471367607776,0.9623293988251294 +-4.967353461841446,0.5776828074345544 +-8.046347287191299,1.1761939289839383 +-5.473641470252648,1.1807864636549752 +2.5328273665304795,-1.965045674711837 +8.064660804594798,1.5491832191108101 +0.14780715633142982,1.5857229599794949 +5.936522021905111,-0.7871615371008428 +2.393142017949055,-2.1847450680437497 +-2.685844628208067,-0.3385494751097735 +1.2455600534770603,-0.3450689549443753 +5.940069726806264,-0.5838092359225239 +-2.6484117913339884,-0.3954377248089719 
+7.594904829864539,1.4659745828942485 +-0.08685961942266118,1.9196008894615573 +-9.670380195770804,-3.9823293496042678 +-5.767815195093796,0.06341247184583908 +-5.648399007431566,0.6976451968261194 +1.4891553574423533,-0.21903380761977836 +1.6184608687469968,-0.5552994676815545 +4.730439937442593,1.2906259505204705 +2.1518992730653164,-2.37762483531762 +8.8704520407953,2.0857386068530035 +-8.962910093914122,-2.580097300962211 +-5.184183554831918,0.9276307618896876 +9.529230282231374,1.5479163919785413 +8.453112268622714,2.9648846401572704 +0.2237930606824854,1.468743827803835 +9.116529628950051,1.2118789921808926 +4.193425227993103,-0.002269756497720281 +-5.162223296679498,0.7764075783838722 +8.231765231313428,2.0815281704576765 +0.27958828066087094,1.2025241899150017 +-4.899895358719224,0.4175797591279574 +4.009409481974986,-0.8760396593542384 +0.4133192151478191,0.12139830755677161 +0.016999598419422668,1.865914795964759 +3.9333088838566748,-1.1276298545125636 +-5.659837169995573,0.6033608304758142 +0.4628939390309128,-0.029585930099038427 +9.421088099801132,1.2998551656599862 +1.8894916339910353,-1.4477883246681742 +3.951398618991444,-1.0924134743068263 +3.458568628269201,-0.4238235949321789 +-1.9724598464889063,-0.2004222386032146 +5.31547681969033,-0.6949964050801019 +-6.112224576725676,-0.6966223553746506 +1.9172847637805965,-1.4573146167414963 +9.143197744576035,1.4295561918310775 +3.795874426369497,-1.1356801511588706 +-2.2182476215566598,-0.3489796549953337 +0.697178664330039,-1.5478023113282326 +-2.05019205238337,-0.2642671240325088 +-1.5715869144318937,0.07331543413311278 +0.5508660481207315,-0.7481108268837378 +-4.258923341357303,-0.8284758346617047 +-1.6643679998609073,-0.025386019893244655 +7.196274991945764,1.4291498195077232 +5.260649512432035,-0.2590235178909586 +2.2708934587013947,-2.147863229828826 +-4.224044891512971,-0.9404561885105697 +6.24523000296194,-1.3445051760871645 +7.253327621429069,1.532260779693039 +9.515952246257928,1.4442895085617784 +0.8287673018168853,-2.012032380030773 +-5.911085835263151,-0.5000710280130265 +-9.166199609184016,-2.1036784858831865 +5.17699678785152,-0.226219967071298 +3.36560718890051,-0.07605318549126716 +-4.850711583884921,0.4330726250616318 +-1.3177398980264385,-0.11050839108547679 +0.9207368091058896,-1.5989530176803908 +3.723762083089341,-1.1339010966610719 +0.7916379941163232,-2.0223281156941555 +1.5607247685357795,-0.493162336500339 +3.9973639609851386,-0.8477494705310795 +-0.3238022665676077,1.2109092099772092 +-5.581272556398673,0.7317547433889803 +7.012812770817366,0.6734646639315842 +8.996828952776283,1.6364715315670961 +5.789187565038102,-0.48241785184809305 +-5.321034152433469,1.0276939978320812 +4.126959655493216,-0.4342694766623921 +8.828339122997978,2.0791225227841355 +4.120706506970201,-0.44175203321621376 +1.6418866074472316,-0.6189801160147675 +-2.406127087882206,-0.2123270528199498 +2.858658925515925,-0.5481321466690295 +-5.884013550348005,-0.3334812355142278 +-4.601880612305461,0.28100155342953415 +-6.67400303081228,1.1740842577303758 +-2.1423173865177563,-0.4434196912395215 +-6.041291793237477,-0.5907773577868014 +-6.161264416825468,-0.6692850738165358 +-1.879884870576113,-0.10769729966763718 +-8.172942327985272,0.26157472695554496 +-9.30851775166261,-2.55919973917368 +-7.05847249697079,1.7751218360974155 +-3.748553478134862,-0.7671238907260454 +-3.4487460287733374,-0.08726784463868709 +5.953843964606843,-0.6039584749280389 +6.309771009774337,-1.422876809671561 +-4.89816993250788,0.5407265862844898 
+2.781843844134757,-0.7254017733791631 +2.692595393810482,-1.299451438517627 +7.066029634251038,1.1099027818564693 +5.686041461974414,-0.4534540000196851 +9.845450519987843,0.7220908286375598 +3.778474076411369,-1.106790159428717 +5.254847895585035,-0.39638811261953727 +9.677101254239709,1.1058920706001114 +3.2693943152846128,0.08341690636086786 +8.094228630105405,1.4485847703554624 +-5.863790049929376,-0.33985503115913973 +-6.863177454806287,1.5159335378898904 +-3.3396526533819593,-0.14937453627125752 +-1.0293497936759763,-0.6498834800422508 +-8.70092947244136,-2.6896187446006765 +2.425435484860392,-1.8815878617195523 +6.911284542242274,0.26826557774063375 +-4.548056692992496,0.0837877877807692 +4.024863784021067,-0.7388765123811154 +-5.493297755265303,0.921860139111712 +-0.24332660385524285,1.5695093270382274 +3.9464300772122702,-0.926117453120781 +-5.764479777572188,0.2940180323883951 +-7.569733055232323,2.2090732996899307 +-1.9397232864761182,-0.037868524421514865 +3.861487014266323,-1.2553812772309152 +-3.2555147114174905,-0.25824366589528663 +-0.06560092562817843,1.7213964306146339 +-2.8808822054660648,-0.7638065524667338 +-9.297985964218803,-2.458032315231283 +-6.04976526577094,-0.6879223080179262 +-4.082090572968799,-1.263571820908412 +-2.2787197339786895,-0.40927666085447245 +-5.9846923775922445,-0.7676261564883223 +1.5328466460790366,-0.22434877795278482 +-7.528310739454444,2.0256363861876316 +3.6806203678058718,-0.8518988581128183 +4.620544016011486,1.5262403474137214 +8.85963676763734,1.8329517709054275 +-9.104707941839692,-2.2924316978485764 +9.560638970593285,1.4765832032046626 +6.954980021851487,0.5010256725515745 +4.043637971560452,-0.759390835042588 +7.390667687508227,1.46489382205418 +-4.028811117144478,-1.1607493734995398 +4.986221082822432,0.4231611374725581 +-2.1903177254182933,-0.44876962917350044 +-8.69883452865362,-3.086458841414051 +-2.974402044536721,-0.5192142259621171 +-9.556107716009752,-3.524973247674619 +1.128632644633809,-0.7126174012814005 +6.278481020972207,-1.2889079811218325 +5.020256267702461,0.2905136858496284 +0.235603664246419,1.369054087951635 +5.6206638433400755,-0.43046690816488675 +1.9166381535311992,-1.7828657349725132 +-4.5828640305930755,0.1919913352227747 +9.242613656042863,1.2781170314461345 +9.480767405433326,1.4041680677896358 +-5.122986018299439,0.6391698813499609 +-6.095324476403725,-0.9149509099914342 +2.4930030539591423,-1.7878951841749027 +2.4310679007499747,-2.1316097597405896 +5.091500849770121,-0.013103504836505346 +-0.5125900831961872,0.47102937950864854 +-0.835279002732964,-0.8585466272257734 +-9.284957840804367,-2.3597223699071286 +5.791619948496525,-0.5150698121484957 +3.4844940850058936,-0.5011074472464682 +-5.199395032865969,0.7962973738339314 +-4.594512474184649,0.3253180044015521 +-0.7925909037637968,-0.7293391805429935 +-2.8371335193462914,-0.7460393491529487 +-7.887375959184983,2.1954627107022895 +-1.7963481291288907,-0.13683246466732765 +5.331997356046871,-0.44350310566044715 +-5.733866160767729,0.32967270326475234 +0.2223451478815477,1.4242635914853798 +5.558667541130955,-0.17752265057492295 +0.7501925117711714,-1.8596791542217845 +1.6629609020610623,-0.8448487212537634 +-1.1275383836142705,-0.43806888088218515 +6.301608645699432,-1.2889440068762044 +1.6541128479419207,-0.7133147778955791 +1.7201436160175234,-1.0399773272728907 +3.1077045879506837,0.2645132285243118 +-5.679504469137031,0.4965377332789993 +7.027877642212318,0.917416781582079 +5.376101227774214,-0.5261799482059472 +-9.398298771197943,-2.8207803887835 
+2.269900555475011,-2.4930237856666935 +-7.582701048867166,2.299860426656293 +5.4323286064956875,-0.49725899309257804 +7.607537295693368,1.041663642003274 +-4.501974064732792,0.13670465260527853 +-2.1780960549017525,-0.3199430048037867 +8.24902642345521,2.1806562988171163 +-3.2501242246329514,-0.3759368611834355 +-6.761218866155324,1.2704334603407577 +-9.49548201087174,-3.2137120651484494 +1.1253358521724088,-0.7374393488369151 +2.4249623770910578,-2.0373337632261648 +-2.564089943992243,-0.3620713439059421 +8.92617243122291,1.6648654105450253 +3.061851522366732,0.05664474936427982 +-2.6182063475859567,-0.40121946734510355 +-0.48375233601208123,0.4845286350963821 +-8.41112970994481,-1.9292193384518783 +-3.080482023951463,-0.6551741666852083 +7.777135948619627,1.0231197439491961 +-4.080068558648735,-1.296155275578777 +5.233338812498909,-0.3779145837829375 +-4.723109466600484,0.35563055129312504 +-5.5306768878142165,1.0096146070126402 +7.1476522335510815,1.2413447501196218 +-8.09899394527094,0.8442050864529915 +-4.987813483374154,0.5833523230222675 +-7.143191484518149,2.000134672106597 +-3.342180382042162,0.21069995364195004 +8.910797241807945,1.7995778851121793 +-8.415853676424806,-2.07230396512551 +4.067436997620796,-0.6459804371959293 +-6.507906668810644,0.6250138691769376 +3.7460194010285757,-1.1156318723441385 +-2.1063167668223493,-0.37921495260820526 +-6.3274475602000955,0.15213636842306338 +0.12291702724452591,1.7963118399307418 +-1.1785194532382839,-0.3873027841991049 +4.094265128211063,-0.8984292747555533 +-7.412427166996892,1.936272428090382 +-5.335527218774629,1.1257007104995478 +-3.3379336182444597,0.06949361088736732 +4.8961968237486975,0.8827975746161022 +-7.952916820350925,1.8466587016174016 +-1.4921105557512693,0.05352152741257326 +-4.424613227372003,-0.21285540619884477 +-5.560862434240965,0.8705561991530624 +4.720149534459775,1.4629893962013234 +5.760303757585241,-0.3462294994171844 +1.9916981006417132,-1.9873823320534487 +-8.218544983049185,-0.12740867490921365 +-4.276827276773805,-0.6730392686074225 +9.228303365766607,1.147520808788058 +-4.964580425530546,0.5736556963547694 +-3.196006107049083,-0.42813581661941746 +9.49746885494396,1.339360597309527 +-8.961011498230679,-2.447211707070291 +9.215831380595333,1.4305772679128939 +-8.42796821753236,-2.041457510418286 +3.4304515392485424,-0.2562718298053736 +3.6136634810465966,-0.849908030326507 +-7.584894155501214,2.038931071936428 +-7.0101224316027455,1.7902082061646962 +-2.7994951522909477,-0.566231088744696 +1.0072705102325976,-1.1596447382618467 +-3.9631199685256884,-1.222232082411177 +-5.381201102176142,1.008645812855445 +-3.9565539504612386,-1.4548712385318836 +5.8654947837177485,-0.5503075484199211 +0.5898029385753834,-0.9636087559886345 +-9.497978169663497,-3.183891688364934 +-7.201621910156515,1.9485146922652374 +-2.946611424811611,-0.7913247977592083 +8.286220016713823,2.359826251346333 +5.266502879639159,-0.4105624258277741 +-8.785924493346137,-2.933467536016206 +-1.5592563718906547,0.15554095629237102 +-1.0738033662376711,-0.54918712475378 +5.386831157472397,-0.47915414834029485 +-2.829885620834553,-0.6911490198408435 +-8.281339378353202,-1.07868066393265 +1.9145094044337938,-1.704242084567167 +8.620852241500994,2.8924119880912884 +8.063325021273773,1.461436238319093 +1.8251387100354002,-1.2973624262428498 +-9.8658002508106,-4.562840267712548 +4.36148388772355,0.6970853846757133 +1.0421375923366796,-1.0860537095868503 +5.0814150602671795,0.15077167645427403 +5.872464472481527,-0.32403124152264806 
+2.002822393152597,-2.0447354451378157 +6.995049583286971,0.7168613073248731 +2.8892614162553,-0.3545417342950626 +8.660399249990526,2.708087705272246 +2.4372359042725655,-2.0531989767321748 +1.5892949888148244,-0.5586030684276616 +-0.6924831023657596,-0.44369504298990414 +1.9694693298222496,-1.7820800846989162 +7.384704984445022,1.4030851690869384 +2.216237006625153,-2.265825874017053 +-6.279891319554377,-0.16139451477389147 +-5.012854851629051,0.7470849558730648 +-5.594141456758273,0.7952095602787768 +-3.2584254787222644,-0.08672589600736313 +-6.503475511916719,0.7163235545461455 +-4.03038087709775,-1.2606636296670317 +3.7512241710721295,-1.1126824247830065 +-6.734072811193998,1.0800612412050379 +-3.2232623295464653,-0.1785832742467197 +5.940478268106757,-0.4667016576021017 +7.964945498326266,1.2488038024477652 +-0.10075336383463807,1.6871545228541243 +-8.614936139719688,-2.819482364694529 +-5.374987460213273,1.1252576440714324 +-4.2873129936203505,-0.5984133615662567 +-0.5679543505758389,0.1130386457327979 +-1.260773400387798,-0.46168175770624637 +4.925485749267011,0.9119344061591936 +4.7786114836448945,1.53291404961585 +-0.19406826057062432,1.5501956480296581 +8.173902982215566,1.7114485685533556 +9.947482538609673,0.5062292557054646 +-9.503783274186706,-3.31959134811718 +0.6368994205391623,-1.3847542966798203 +7.530913771169757,1.431320748390796 +6.521299484305602,-1.13738078032121 +-7.2075776467854915,1.8344699432265936 +7.399850330856307,1.5397928671791041 +8.270977207923885,2.203559669481359 +-9.922346749542807,-4.574902519651654 +1.3481731019009615,0.07085751036029206 +7.722415379922687,0.943799599688281 +-1.9276478478052361,-0.27968897906305273 +7.856964305762233,0.8614317114472276 +1.7299579593165113,-1.0790944128069138 +-8.803888054287686,-3.146702044438786 +-2.5347601345556154,-0.44579333604190874 +4.4106875232261125,1.043339384088065 +-0.778724644329909,-0.6854867861620618 +-7.1025862303221885,1.810248084626466 +4.991559633334614,0.540309645276728 +-0.6842862652497637,-0.36924727778187016 +-0.9842155053753121,-0.7454647229013613 +6.256623904404002,-1.341631283189365 +7.676350883068043,1.1634918855942231 +-2.85351825264498,-0.6934956166695266 +7.357182897036285,1.6480688893327244 +-5.266178027098967,0.9450317468959207 +-2.9346329305001095,-0.6547912109317766 +-4.882287822278606,0.4036524503662675 +2.8377597259590903,-0.7579765963594771 +-8.981610661568237,-2.440301117912255 +3.7332645779057176,-1.135100702690476 +-8.952013359990382,-2.475366565434051 +1.1801315490319517,-0.24596432148564146 +8.33250883641746,2.5601486150139983 +-1.9584186397537806,-0.20772518760809233 +-3.8664272694407664,-1.2702951574707386 +-5.2903937591380625,0.9681149303704812 +0.5380835763965752,-0.4185384069724273 +-7.016742497290188,1.7029975448128956 +8.449992261397874,2.8942042896168356 +6.67881354835043,-0.7473186308309174 +-3.007315831930249,-0.8057867092624664 +-1.8814186652238405,-0.11290830293020401 +8.421279813316577,2.9442440024543894 +-3.744318853653965,-0.7482902594410459 +1.221733428253117,-0.17563233300265887 +0.018705381419973577,1.8015604132326017 +-5.7924168753811145,0.04536797695505902 +-1.5237360295138416,0.05398912766791025 +0.22099996307823666,1.4081926149497372 +-7.788057915434364,2.3366373271410614 +7.102063807346273,1.1756573347892099 +-9.782401741032611,-4.392721283367316 +1.6162670277032287,-0.42255141526160367 +1.7213381367660574,-1.0126168589944473 +4.38239886165019,0.9025571768488584 +4.494542445430385,1.3122423432870747 +7.063009667583188,1.0405092626859642 
+7.816708647915015,0.9917831656312203 +8.793993686589651,2.2329417521499617 +2.5510213455992936,-1.7951829675623903 +1.1569291979534526,-0.3651441249675953 +-4.543495930785042,0.11759622706855079 +4.813244356870587,1.2577208040658898 +-1.7961685724132224,-0.05494433841705126 +0.7341762008016328,-1.8305990308097715 +-5.969309703069188,-0.6897038718902597 +-1.347191161872857,-0.06057833551053394 +2.604414263003299,-1.4261856982638572 +5.528197122700291,-0.5229417011298072 +3.787564004313859,-1.1529631672535599 +0.27421010158255044,1.1373167049986628 +-2.81250574611493,-0.5799684649608587 +-6.279051622626351,-0.24554698056237861 +6.518324561232372,-1.2529481555766877 +3.4812389718508148,-0.5304472180128166 +6.470815769741058,-1.2634444535050027 +-0.4744033375238601,0.5236597176879567 +-0.5190501601746398,0.5347233572469596 +2.5042549832865824,-1.813979674910593 +7.504978880128611,1.4724618497161586 +9.779882479111194,1.0498435373905781 +3.1809643093534845,0.31434764171364077 +3.4386193372324403,-0.15438128578592158 +1.613006095351576,-0.4657547912047896 +-6.879304899417783,1.5504555521996113 +4.8879699101654275,1.0567035697563985 +-3.2492072892112667,-0.03663808504332311 +3.669365099374229,-1.0192831021690578 +-9.63785470401497,-4.134538871292696 +-8.500582119136894,-2.543778036384445 +9.62826229949841,1.3645212095998633 +8.522335056376527,3.0612281351602286 +8.742081141996767,2.6789575798420637 +-6.225932482778035,-0.3103817890537519 +-7.451551689413289,2.0312650266885455 +3.27666074887587,-0.0033539867699345915 +-5.1845910603982315,0.9786517349252939 +9.487717352892648,1.3669173105782506 +9.287307526162962,1.1086424894878366 +3.9493850039622775,-1.1200995935398663 +0.14979642847450236,1.6745487076449543 +-3.1650356313208583,-0.5213299828368667 +6.207798523353239,-1.1561723921734466 +5.09470532516119,0.1581182143334963 +8.6459967950668,2.7876660850698287 +-2.2546188712168598,-0.33526548347651275 +1.7179547649439364,-0.8536354431862688 +8.383391255504137,2.6947368162545664 +7.314641702189718,1.416582914533618 +8.188006044036161,2.031267128255174 +4.77517014019937,1.4410885970485765 +0.8800372095370523,-1.7085159802999936 +-4.546858776280587,-0.011344609659005273 +5.477432635223675,-0.4706057512849597 +-7.9514887722211745,1.7456647174150386 +-0.3757625297196743,1.1052747586253842 +5.688318920794675,-0.4286328036768829 +-1.8728782357138485,0.023034556042804494 +-5.5537872572778255,0.8087976040664021 +-1.9851373688661624,-0.1724912703021842 +-9.040292499782439,-2.1389543847086157 +-7.323852466659745,1.9773711681061295 +-2.082425570454451,-0.4174707718602749 +8.813321304041835,2.131481380428336 +2.3380477477632904,-2.237438879101341 +-9.174534759277805,-2.117230459176994 +5.941730825980285,-0.6675758615332782 +8.071374496680864,1.441850783477291 +2.737849023597892,-0.9781739408759725 +-2.605517175830725,-0.3707122356566822 +-0.5617733438326091,0.18251544990956006 +8.56526883899555,2.8185938957180587 +2.5533120167123395,-1.6726363910773465 +5.566468198688712,-0.4815087165244708 +-8.351333634518351,-1.5055239698129712 +-7.363360787225584,2.0280798244450495 +1.7038056010127498,-1.1461586092944427 +8.041110778742748,1.5250654903206549 +-6.160716898328032,-0.6173997067961757 +-7.839555614868555,2.1474831942256687 +8.95823438526374,1.728797286972063 +1.8722567304360318,-1.4640545744141689 +6.063537941575664,-0.7775876398148082 +2.221534721201945,-2.3890181099228203 +-0.6579885413375486,-0.20672678328412122 +-2.041394389244102,-0.32897753246799344 +-5.845140771326465,-0.4024650479166519 
+-1.560964792786228,-0.0027478885517112847 +6.588829218357937,-1.158836342122518 +1.0186025000231993,-1.1550211690946193 +6.168350107822178,-0.9978416893178184 +-3.8952436473865504,-1.1962336607206197 +-3.8404905965950498,-1.2498867420423714 +-2.8863912606238706,-0.6725764562851075 +8.74993862588014,2.4399598978987913 +5.430510726998401,-0.7417991220018408 +2.7984096111072354,-0.7076041551244043 +7.006319583828756,0.8751093487177949 +-8.475619110013074,-2.3481982844985323 +-9.906827279493598,-4.578440691462523 +-3.929636692561132,-1.0327640643828662 +5.393215426212464,-0.5397047985496953 +5.029708266197366,0.47369421431877484 +-1.3627860153417615,0.030184000680089335 +-9.694364853667047,-4.179299719776533 +-7.751461455149035,2.1632261118827265 +-7.3638175151761445,2.151470731165865 +9.781453028673251,1.0278516215230131 +-8.144614055107112,0.3259217916753722 +4.535452252890755,1.361872351104754 +6.763690240752535,-0.4091531824228997 +-6.66225428867368,1.224963511872706 +1.220878732805497,-0.23160345884198294 +7.062234195520097,0.9504095225359714 +2.4917797738507867,-1.8177019978887696 +-9.878776251294124,-4.5391039341003 +3.2172292057895784,-0.018082714753113016 +0.8065075005867826,-1.8903969021475848 +-4.809745733216078,0.3357463287879673 +2.263211201082744,-2.1107252573846607 +-1.8698380190654618,-0.23958034136979506 +-9.780198549661904,-4.260260710349345 +3.9313447053521777,-1.0695083452801708 +2.5211669476272154,-1.9778784399400888 +-6.378048107553639,0.06626574249610792 +4.634765188056665,1.4539688996336326 +-3.146868071383611,-0.36316677820919774 +9.203170647133764,1.2358784488814019 +4.342890000073702,0.8111321232685889 +-1.0252186788814797,-0.6835520734452128 +4.172068896963182,-0.17806941462335477 +-0.5656225189393371,0.010446751609669097 +1.369178518438492,-0.035990573614680776 +9.682946381314565,1.1981937980923383 +-5.389907069011972,1.1886031112300355 +-5.295867708902273,0.9232516959040367 +2.9275160663833018,-0.2957595472993821 +8.973702103918786,1.6116297964463153 +1.49862071858092,-0.01924068384505806 +-4.963848224872729,0.5054598375162523 +8.278049663110902,2.285129611394216 +-9.642329881779613,-3.9808332629475918 +0.035635309262978865,1.7956887693225778 +2.115215244008102,-2.281334194904022 +-6.597842756487715,1.0934239931512724 +-8.094586090644338,0.7698209984454604 +-8.347562475788479,-1.4618452541864235 +4.903276433487228,0.8487045637358093 +0.9277946578928429,-1.5642998118253613 +8.252652624986009,2.282435080390319 +9.118937889904046,1.1346206786136728 +0.9236994323563117,-1.6153891899938493 +-6.004013921949625,-0.7169662386277345 +-8.231905933208381,-0.37608928198378655 +1.333927944867753,0.1291968080077828 +4.2939428724550375,0.6004213679390089 +-2.3884387336127983,-0.33502262817986783 +-8.82117520082231,-2.9478600326277027 +-5.838080094278734,-0.20089346471381891 +-1.8733794018205607,-0.24865480822680644 +0.5329813813162119,-0.6025114197968795 +0.48099417137997946,-0.038691470782888926 +0.877926987811545,-1.5868424255744562 +8.387246662773475,2.6426836652721972 +6.3027949791331075,-1.4133878749424929 +-5.954743180026224,-0.6539023488652719 +0.5530282079234627,-0.8380951551792886 +-9.327337575376376,-2.5999331495245803 +-5.9775203828883825,-0.44418508907252197 +2.577257638445838,-1.748633224475119 +-6.029080097410965,-0.8171281064791488 +-6.226194493812258,-0.2613008056197427 +8.730277537156557,2.388215910097996 +-3.0911892049754424,-0.6375891238290351 +3.6729854079712787,-1.0141198195778656 +7.90794795528512,0.9154735186483222 +-4.577936504793927,0.25590294956366616 
+4.702621876500643,1.3028050561639675 +-9.098016434626874,-1.96335980383013 +-2.9563697962063085,-0.845645690966862 +8.94454725091903,1.6996849867835995 +-8.985034321381189,-2.1488949971670794 +-8.53774534170958,-2.4599106816758667 +1.5669781644605898,-0.3973993409946736 +0.876767135687686,-1.7944351299516492 +-7.258121010460968,1.7859766085712887 +-9.622724894207863,-4.105288747141859 +0.357406058055755,0.6637749334668986 +-2.605644459955339,-0.33742723508409045 +-2.0138135371601695,-0.4488807591769527 +6.583711118961379,-1.1388850590731094 +-8.450785869049156,-2.388679117424311 +0.942163091948359,-1.522068662726008 +6.025944995431868,-0.728334955628572 +-9.117863992998267,-2.3121117278360255 +5.4756354795987185,-0.5757532632868771 +-2.373514854070336,-0.36683340986330903 +-7.978648580046993,1.705220356521207 +-4.9451403071636015,0.6136511958289679 +-0.018241360855387967,1.5624442706712762 +8.213598795603765,1.98443845693259 +5.707047286955685,-0.5550830681420861 +9.274669408681962,1.1571639115960395 +7.156148968659481,1.2707154555501523 +-0.8598218105744238,-0.6529689343695664 +5.149413194053096,-0.045060555954829024 +4.299020824895976,0.42911348366264945 +9.165531335438175,1.144129741715757 +3.5935936443527705,-0.7165906647003782 +-7.793236554928704,2.1821955633139973 +7.896313206241551,1.0746498605232173 +-1.4571828543236354,-0.0031258644888423597 +1.2346752629409163,-0.2546336740160762 +6.753184458663348,-0.4314592557850924 +-6.39178244211001,0.3988492830194189 +7.843026601712574,0.8617663101240718 +-1.7766417161760408,0.013537308975610313 +2.8828230550149936,-0.42868436005001875 +7.109083433594773,1.2992573198045965 +-0.1830835974482916,1.4989778277786325 +0.949767649344313,-1.5188100748627682 +7.384820241484871,1.3125908987320074 +-9.841468212439906,-4.439500291571953 +-2.41262717456221,-0.315837038806211 +0.23433790444443048,1.589196472204022 +5.204099677505933,-0.17362891678814918 +-3.3080097653287144,0.016337638006029803 +-7.206689181074761,2.0078637877176027 +-2.7808193080134953,-0.4202132897358335 +-8.283896652870343,-0.8663409183657858 +-7.011991830098744,1.946993098256366 +3.6910664879824178,-1.0505138827696643 +9.79943162553564,0.9817428152808383 +9.851064363330707,0.9474246663704404 +6.755562555636541,-0.4694654706834503 +-8.515644278213273,-2.578583599442599 +7.18796621212563,1.3685212989834956 +3.0517717806539046,0.2852342881185992 +5.711347926332625,-0.30289194977494577 +0.008987771103111442,1.8513690123482405 +6.522442171817584,-1.229226120933278 +-1.7013484375630483,-0.14703230654441707 +0.03393425349332624,1.8132740611865807 +-6.690999959319836,1.152073991007502 +-0.9729541851741619,-0.7644615981831416 +1.7997213579145033,-1.4833428856525634 +9.613603268588161,1.220454813053007 +-3.8768052586425927,-0.9190726126243887 +1.1701752202770965,-0.57042537964813 +-3.592887308580277,-0.29052524226243653 +2.4335579914995353,-2.0008089576606087 +-4.12477106238477,-1.3608780463522034 +-4.548275822318946,0.13375219126792073 +-4.576464827027053,0.2589091309096775 +1.2643996443256356,-0.12740318599734257 +4.786717799836357,1.0789536694551185 +9.178216177568338,1.2446717749280694 +5.340583263519282,-0.5677865780402297 +8.640780133441215,2.620887469071385 +-3.7661613682837607,-0.8795844323323502 +9.554132973636467,1.3629119332927009 +2.8080207845593503,-0.6218877339798637 +8.817231509403548,2.1162931579218762 +-2.306828926921703,-0.2796921979668485 +-7.2703592886765165,1.9257097906944525 +-6.28379264296019,-0.22608765273756795 +-9.282140998464989,-2.490773704423727 
+-3.1316458141212777,-0.4703387170751245 +9.940504668706431,0.6635651146838514 +-0.772645955531118,-0.5506065201999273 +-2.3304224600877927,-0.3821643571806252 +-5.311770693955115,1.2170605390856846 +2.7445329998103762,-0.9182377912231671 +-8.141779311235648,0.3437387268431037 +9.298745441398097,1.2877192178877939 +5.313784833594104,-0.4790398814529712 +6.584834308854607,-1.2306244197156888 +1.8514982381980118,-1.3637454624440375 +-7.089891369085556,1.9258799849597439 +4.479559709194113,1.2595198577149906 +3.0074846649132563,0.12502681517268477 +-8.168957969011409,-0.014312033888841641 +9.156537974043722,1.333307737384893 +3.4937141526392335,-0.5404260847058511 +-6.906990377800347,1.3604692072614355 +-0.4043504537099665,0.6602793098803491 +7.849110193489391,0.9598007701505878 +-2.7046329879019697,-0.5406657059328973 +-1.327718245999776,0.012764939958821192 +-6.497087987758459,0.7095167891020145 +2.6419323461101385,-1.4059382815294583 +-9.43804211584306,-3.0038842299785515 +-0.6248314593487194,0.043183755809934374 +3.002522224508222,0.07337630964170899 +0.05437973597580914,1.8017525164779205 +1.8124781202134308,-1.252419229765019 +4.835772087543138,1.1708698819112282 +2.883307608004437,-0.46017447329150957 +0.13023781687317992,1.7294141149340596 +5.096040731374533,0.22100115398373735 +8.558764895173457,2.6759316955430785 +3.3719009273940426,0.15930191977391112 +-0.946021494698627,-0.8547084169175425 +-5.1050755221252775,0.5186797114353279 +2.6678512029118018,-1.4001755519659873 +-9.599842043630055,-3.772869673134266 +7.901823262597397,1.1862789300502632 +-6.260739660684934,-0.277666236702148 +2.4735955607774596,-2.0726069339700373 +-0.8031789953609536,-0.7502629236361165 +-7.0493100095688,1.8364901314973832 +0.886874543927263,-1.6822570226430134 +9.775358385773778,1.0438656122067904 +2.717284156117699,-1.0480367193980267 +-5.581461594663253,0.9389252312850936 +3.472485851638263,-0.39403820084988644 +8.68643022199258,2.9120079171236366 +7.518436893214634,1.4153313782190295 +-6.59018782222606,1.0663242061015183 +3.6089325875550533,-0.7247001373632918 +5.603968645008477,-0.48998963714235566 +-5.809525045133812,0.01955520156231133 +0.5908649975879054,-1.1202572352427325 +-8.788681500035999,-2.824370702190215 +5.416388784078636,-0.4973397549070849 +-5.442711296075814,1.1001236543978152 +-0.7510686935468627,-0.42058030947050673 +4.517480670289515,1.290170883333015 +-9.663198608714207,-4.136821051839913 +-5.015256249633442,0.455548773959071 +-1.3501039249143432,-0.1604558295866403 +5.950678557107963,-0.5000441442015564 +-5.086017775224278,0.7214674835612204 +-3.2708133165501874,-0.22134949386232994 +-8.201522562688218,-0.26811845413112045 +4.77695231214955,1.3741118261300669 +-5.645688027174627,0.6389917168575667 +7.820696030550419,1.011959205190863 +6.064685004852998,-0.6352407747714481 +5.871801517312974,-0.46637998635121963 +-3.217347038649208,-0.3758813627155286 +-2.2979058107330186,-0.5441307870679666 +-0.0742291850700525,1.8523447515016211 +3.609104134560375,-0.7603626678312471 +-0.5591737048207222,0.2665692537909398 +-9.065781192463245,-2.188321104666416 +-7.980512851147964,1.5352867874264458 +-1.6532680354473683,0.01592936922797685 +4.058859935136923,-0.47547937493128 +0.6665967923029914,-1.560745355021665 +3.0376015219538743,0.0062788934118135215 +0.03557782677871657,1.741455445641898 +-5.4066729695922255,1.1064634572535612 +-6.423964548917468,0.3450198349621172 +-9.50456247903762,-3.32244577029422 +-8.303422468946785,-1.091320068094794 +-6.409268847339241,0.20318275411047124 
+5.290136918598844,-0.46200294487500043 +7.429617426019451,1.5585716322746233 +0.5526341403161226,-0.7901535762303271 +-4.983656925730671,0.6069122520906226 +-3.211503853661753,-0.2634674988749521 +7.130935345203767,1.1717846540554258 +9.294713736363747,1.124685654004053 +6.488687362720459,-1.4620458068004132 +6.680223004046724,-0.7688460894729809 +7.813447185114256,0.9579722007862417 +-5.413126494127191,1.0249044962296567 +-8.522041409857628,-2.5724088512355996 +3.201973619795901,0.19375811557487496 +9.448818115199114,1.4396168713417603 +-8.494976617330806,-2.6035596807365144 +1.085206372707809,-0.6819330348021395 +4.330779899787419,0.6366041462211575 +6.054059452487862,-0.8936336576241376 +-3.1832333920212403,-0.46451898885617005 +-4.1112202689198405,-1.2511038750605308 +0.09532618258186076,1.9400054964561955 +5.9570093257498655,-0.5532761355926763 +5.9065972694114866,-0.43813151048796595 +3.7467397454153466,-1.1139728943177074 +2.353040886751918,-2.21708197593556 +5.9100155324216,-0.4391030649269656 +-4.34316832872395,-0.4901840902927287 +-4.0505795187227065,-1.1928431159148454 +4.990626021386358,0.5066771829804159 +8.733257976566154,2.293323816939229 +8.414506672739684,2.6672358965836676 +3.0978125898201636,0.010545031038340147 +-8.491758071674113,-2.4782329672863552 +1.845076742904407,-1.5042040076828318 +9.227766562893414,1.1508638994708011 +3.2904340854616443,0.09919855789904251 +4.854855614973918,1.07117346871453 +5.422139658174165,-0.47816788898617035 +-6.775914118777081,1.445729308429493 +-5.596948396035533,0.9351897319375088 +-8.803861169100038,-2.8037377710484193 +-8.513782545439717,-2.516967657160429 +-4.774760619509744,0.32832981289289986 +-0.48869833049126044,0.627816354263971 +-9.604160055910643,-3.6483657172589012 +9.289249427371217,1.1240666370382468 +5.815212918867076,-0.4406752294150707 +-1.0407984553237917,-0.7619659448982379 +5.439088964564851,-0.548537079778326 +-1.5806422618989089,0.067968144607437 +-9.400285912835173,-2.768440618421317 +6.305082028816188,-1.330488265069157 +1.8230155398533974,-1.4384835884105585 +-3.043502454672602,-0.7169689390481595 +4.86689458686444,1.0733552140926126 +-6.973885835611845,1.6823399138522614 +-1.800440204630366,-0.16516713660875612 +-9.70125327272963,-4.215435408617832 +5.165588742648296,-0.0251760585392046 +6.914910012416641,0.37316566481708724 +-3.005166911415662,-0.6769090275467089 +9.930531841027918,0.6614620972259794 +-5.976956131329629,-0.7695935508933803 +-4.713705854825285,0.5743089249021232 +-8.534581878620635,-2.631102620589895 +6.695812500068791,-0.6180131102078447 +-5.798037004074622,-0.09423159233018652 +-3.3257041280987654,-0.1236920343633085 +0.1412927698891675,1.6055060894998499 +-7.012256596753531,1.6381696632235971 +-5.625572084552694,0.6993632720586768 +-4.3183249996885245,-0.4691141844900668 +2.8419735171180616,-0.40176706064918655 +5.220304684043121,-0.3567559529583488 +3.4616528766871912,-0.4392805167269047 +0.7168551945251878,-1.6677235736465925 +-0.49479841131285696,0.6216025487984371 +4.763004899318993,1.4400111632313555 +5.671916867382514,-0.4318212503299607 +-8.982477092135053,-2.6359352638248668 +-6.873522888299007,1.5471052210531084 +-9.215749119653886,-2.292674139890142 +4.526341452361301,1.4889667187262898 +-4.513322154081315,0.03463821031161772 +-8.244852683552093,-0.5615054511959061 +3.9761787874622594,-0.9555947674293955 +0.8308548731417851,-1.971084482475217 +3.805963380746547,-1.070504992976398 +-1.4885864169772134,0.036358099189359255 +7.98228566234561,1.343961514470796 
+2.257674807932841,-2.370726988222142 +-4.96638082139215,0.47182371306468046 +9.73229097257802,1.19089998987018 +3.479624893662536,-0.5302261205589904 +2.340524038582794,-2.187851428700191 +1.6934901749402798,-1.0586443831651455 +3.188023347235895,0.2566066217784692 +-9.541321134067484,-3.5500835459073485 +-9.947199163363717,-4.433374181978795 +0.7516567228668656,-1.7872920357632638 +-8.408801230619375,-2.0456567657872076 +-5.314477907070501,1.0018979107314159 +-1.927504259376743,-0.15735412399410614 +-9.54949557934161,-3.677221656444547 +6.854203969438899,0.10287442579723434 +3.2333855171289945,0.15363433541411134 +-5.491946233962821,0.9107656605862313 +8.717993042405325,2.5263670883232767 +-6.429007072265925,0.4767383243965098 +7.568938825114536,1.1840575677786846 +-0.7571571717418522,-0.5467461532879146 +8.220552383773043,2.1512217791717796 +-3.379311525337556,-0.09539256835608252 +9.434799400092441,1.394405227353394 +0.8282546703652969,-1.807963836416595 +-4.933627927049184,0.547303691076198 +0.4974650893985988,-0.3565582782124466 +4.372018034409017,0.7655300277066643 +-8.688604206987453,-3.121045585886421 +-6.405392265048917,0.2522328027168828 +5.402717339961649,-0.5035400796034697 +-1.9896043831746582,-0.32342911073077296 +9.390814454568975,1.4361861752721419 +6.162315530488512,-1.163191259009428 +9.324081753342497,1.3476562678524073 +9.534619071783013,1.4825795232980061 +-5.932619354885102,-0.4130927948420252 +9.686890939248535,1.2126007968892456 +7.179454118171399,1.2876509683734094 +-7.689327000899545,2.2767865344444984 +4.269750536203848,0.34116995653147414 +-5.336431495703767,0.9211826289206155 +-6.023149952700329,-0.5991926130447186 +0.9210736651836058,-1.4583808447369047 +-7.100464589579264,1.9789591078916215 +-2.231400343851138,-0.4663999679169238 +1.4345855851384037,-0.11958879396683042 +-1.3049287190546757,-0.17100800703232363 \ No newline at end of file diff --git a/axolotl/tests/data/datasets/multivariate_dataset_1/tables/learningData.csv b/axolotl/tests/data/datasets/multivariate_dataset_1/tables/learningData.csv new file mode 100644 index 0000000..438e20a --- /dev/null +++ b/axolotl/tests/data/datasets/multivariate_dataset_1/tables/learningData.csv @@ -0,0 +1,7 @@ +d3mIndex,gpDataFile,amplitude,lengthscale +0,train_data_934.csv,0.6115757969678771,2.2957860332947786 +1,train_data_935.csv,0.026343424234522232,0.6041732289631595 +2,train_data_936.csv,0.15260382863242258,1.6483227666863358 +3,train_data_937.csv,1.1312855843919003,2.70460765772802 +4,train_data_938.csv,1.2752346828569412,0.7611034560553084 +5,train_data_939.csv,2.288503802153945,0.4688916711056519 diff --git a/axolotl/tests/data/datasets/object_dataset_1/datasetDoc.json b/axolotl/tests/data/datasets/object_dataset_1/datasetDoc.json new file mode 100644 index 0000000..a430f29 --- /dev/null +++ b/axolotl/tests/data/datasets/object_dataset_1/datasetDoc.json @@ -0,0 +1,82 @@ +{ + "about": { + "datasetID": "object_dataset_1", + "datasetName": "Images with some fake objects", + "license": "CC0", + "datasetSchemaVersion": "4.0.0", + "redacted": false, + "datasetVersion": "4.0.0", + "digest": "46bd637354f1c71cbc5877c55cf97b0868f83eb2abd3a1a9ae88e6f3836524d1" + }, + "dataResources": [ + { + "resID": "0", + "resPath": "media/", + "resType": "image", + "resFormat": { + "image/png": [ + "png" + ] + }, + "isCollection": true + }, + { + "resID": "learningData", + "resPath": "tables/learningData.csv", + "resType": "table", + "resFormat": { + "text/csv": [ + "csv" + ] + }, + "isCollection": false, + "columnsCount": 4, + 
"columns": [ + { + "colIndex": 0, + "colName": "d3mIndex", + "colType": "integer", + "role": [ + "multiIndex" + ] + }, + { + "colIndex": 1, + "colName": "image", + "colType": "string", + "role": [ + "attribute" + ], + "refersTo": { + "resID": "0", + "resObject": "item" + } + }, + { + "colIndex": 2, + "colName": "color_not_class", + "colType": "categorical", + "role": [ + "suggestedTarget" + ] + }, + { + "colIndex": 3, + "colName": "bounding_polygon_area", + "colType": "realVector", + "role": [ + "suggestedTarget", + "boundaryIndicator", + "boundingPolygon" + ], + "refersTo": { + "resID": "learningData", + "resObject": { + "columnName": "image" + } + } + } + ] + } + ] +} \ No newline at end of file diff --git a/axolotl/tests/data/datasets/object_dataset_1/media/img_00225.png.REMOVED.git-id b/axolotl/tests/data/datasets/object_dataset_1/media/img_00225.png.REMOVED.git-id new file mode 100644 index 0000000..5ea1c76 --- /dev/null +++ b/axolotl/tests/data/datasets/object_dataset_1/media/img_00225.png.REMOVED.git-id @@ -0,0 +1 @@ +00a0cea05b49325c138f42e785e006cb482ef6e7 \ No newline at end of file diff --git a/axolotl/tests/data/datasets/object_dataset_1/media/img_00285.png.REMOVED.git-id b/axolotl/tests/data/datasets/object_dataset_1/media/img_00285.png.REMOVED.git-id new file mode 100644 index 0000000..22f427a --- /dev/null +++ b/axolotl/tests/data/datasets/object_dataset_1/media/img_00285.png.REMOVED.git-id @@ -0,0 +1 @@ +d7d45fd20a81986eca135b620a0f3f45151d6880 \ No newline at end of file diff --git a/axolotl/tests/data/datasets/object_dataset_1/tables/learningData.csv b/axolotl/tests/data/datasets/object_dataset_1/tables/learningData.csv new file mode 100644 index 0000000..770d6ba --- /dev/null +++ b/axolotl/tests/data/datasets/object_dataset_1/tables/learningData.csv @@ -0,0 +1,5 @@ +d3mIndex,image,color_not_class,bounding_polygon_area +0,img_00285.png,red,"480,457,480,529,515,529,515,457" +0,img_00285.png,black,"10,157,10,229,315,229,315,157" +1,img_00225.png,blue,"522,540,522,660,576,660,576,540" +1,img_00225.png,red,"739,460,739,545,768,545,768,460" diff --git a/axolotl/tests/data/datasets/raw_dataset_1/datasetDoc.json b/axolotl/tests/data/datasets/raw_dataset_1/datasetDoc.json new file mode 100644 index 0000000..a52845a --- /dev/null +++ b/axolotl/tests/data/datasets/raw_dataset_1/datasetDoc.json @@ -0,0 +1,23 @@ +{ + "about": { + "datasetID": "raw_dataset_1", + "datasetName": "Raw dataset to be used for tests", + "datasetSchemaVersion": "4.0.0", + "redacted": false, + "digest": "e28468d602c30c7da7643aa78840bcaae68a9abb96b48cc98eb51fb94e6fd3af", + "datasetVersion": "4.0.0" + }, + "dataResources": [ + { + "resID": "0", + "resPath": "raw/", + "resType": "raw", + "resFormat": { + "text/csv": [ + "csv" + ] + }, + "isCollection": true + } + ] +} \ No newline at end of file diff --git a/axolotl/tests/data/datasets/raw_dataset_1/raw/complementaryData.csv b/axolotl/tests/data/datasets/raw_dataset_1/raw/complementaryData.csv new file mode 100644 index 0000000..7280f16 --- /dev/null +++ b/axolotl/tests/data/datasets/raw_dataset_1/raw/complementaryData.csv @@ -0,0 +1,3 @@ +tpep_pickup_datetime,num_pickups +1/1/2017 0:00,53 +1/1/2017 1:00,3 diff --git a/axolotl/tests/data/datasets/score_dataset_1/dataset_TEST/datasetDoc.json b/axolotl/tests/data/datasets/score_dataset_1/dataset_TEST/datasetDoc.json new file mode 100644 index 0000000..aa04d59 --- /dev/null +++ b/axolotl/tests/data/datasets/score_dataset_1/dataset_TEST/datasetDoc.json @@ -0,0 +1,82 @@ +{ + "about": { + "datasetID": 
"object_dataset_1_TEST", + "datasetName": "NULL", + "license": "CC0", + "datasetSchemaVersion": "4.0.0", + "redacted": true, + "datasetVersion": "4.0.0", + "digest": "cc76f9b93e81a7129c24565f258f5b5483714e2063af970fdda067723b2c29cb" + }, + "dataResources": [ + { + "resID": "0", + "resPath": "media/", + "resType": "image", + "resFormat": { + "image/png": [ + "png" + ] + }, + "isCollection": true + }, + { + "resID": "learningData", + "resPath": "tables/learningData.csv", + "resType": "table", + "resFormat": { + "text/csv": [ + "csv" + ] + }, + "isCollection": false, + "columnsCount": 4, + "columns": [ + { + "colIndex": 0, + "colName": "d3mIndex", + "colType": "integer", + "role": [ + "multiIndex" + ] + }, + { + "colIndex": 1, + "colName": "image", + "colType": "string", + "role": [ + "attribute" + ], + "refersTo": { + "resID": "0", + "resObject": "item" + } + }, + { + "colIndex": 2, + "colName": "color_not_class", + "colType": "categorical", + "role": [ + "suggestedTarget" + ] + }, + { + "colIndex": 3, + "colName": "bounding_polygon_area", + "colType": "realVector", + "role": [ + "suggestedTarget", + "boundaryIndicator", + "boundingPolygon" + ], + "refersTo": { + "resID": "learningData", + "resObject": { + "columnName": "image" + } + } + } + ] + } + ] +} \ No newline at end of file diff --git a/axolotl/tests/data/datasets/score_dataset_1/dataset_TEST/media/img_00225.png.REMOVED.git-id b/axolotl/tests/data/datasets/score_dataset_1/dataset_TEST/media/img_00225.png.REMOVED.git-id new file mode 100644 index 0000000..5ea1c76 --- /dev/null +++ b/axolotl/tests/data/datasets/score_dataset_1/dataset_TEST/media/img_00225.png.REMOVED.git-id @@ -0,0 +1 @@ +00a0cea05b49325c138f42e785e006cb482ef6e7 \ No newline at end of file diff --git a/axolotl/tests/data/datasets/score_dataset_1/dataset_TEST/media/img_00285.png.REMOVED.git-id b/axolotl/tests/data/datasets/score_dataset_1/dataset_TEST/media/img_00285.png.REMOVED.git-id new file mode 100644 index 0000000..22f427a --- /dev/null +++ b/axolotl/tests/data/datasets/score_dataset_1/dataset_TEST/media/img_00285.png.REMOVED.git-id @@ -0,0 +1 @@ +d7d45fd20a81986eca135b620a0f3f45151d6880 \ No newline at end of file diff --git a/axolotl/tests/data/datasets/score_dataset_1/dataset_TEST/tables/learningData.csv b/axolotl/tests/data/datasets/score_dataset_1/dataset_TEST/tables/learningData.csv new file mode 100644 index 0000000..e3c63a1 --- /dev/null +++ b/axolotl/tests/data/datasets/score_dataset_1/dataset_TEST/tables/learningData.csv @@ -0,0 +1,5 @@ +d3mIndex,image,color_not_class,bounding_polygon_area +0,img_00285.png,, +0,img_00285.png,, +1,img_00225.png,, +1,img_00225.png,, diff --git a/axolotl/tests/data/datasets/score_dataset_1/targets.csv b/axolotl/tests/data/datasets/score_dataset_1/targets.csv new file mode 100644 index 0000000..09aace3 --- /dev/null +++ b/axolotl/tests/data/datasets/score_dataset_1/targets.csv @@ -0,0 +1,5 @@ +d3mIndex,color_not_class,bounding_polygon_area +0,red,"480,457,480,529,515,529,515,457" +0,black,"10,117,10,329,105,329,105,117" +1,blue,"422,540,422,660,576,660,576,540" +1,red,"739,460,739,545,768,545,768,460" diff --git a/axolotl/tests/data/datasets/text_dataset_1/datasetDoc.json b/axolotl/tests/data/datasets/text_dataset_1/datasetDoc.json new file mode 100644 index 0000000..7936f9b --- /dev/null +++ b/axolotl/tests/data/datasets/text_dataset_1/datasetDoc.json @@ -0,0 +1,98 @@ +{ + "about": { + "datasetID": "text_dataset_1", + "datasetName": "Test text dataset", + "description": "Based on 30_personae_dataset", + 
"datasetSchemaVersion": "4.0.0", + "redacted": false, + "datasetVersion": "4.0.0", + "digest": "93b2d6fda19ce0c64a9fb49f88c3a3c4444318df923ed424c3c7911336dfd34f" + }, + "dataResources": [ + { + "resID": "0", + "resPath": "text/", + "resType": "text", + "resFormat": { + "text/plain": [ + "txt" + ] + }, + "isCollection": true + }, + { + "resID": "learningData", + "resPath": "tables/learningData.csv", + "resType": "table", + "resFormat": { + "text/csv": [ + "csv" + ] + }, + "isCollection": false, + "columnsCount": 7, + "columns": [ + { + "colIndex": 0, + "colName": "d3mIndex", + "colType": "integer", + "role": [ + "index" + ] + }, + { + "colIndex": 1, + "colName": "raw_text_file", + "colType": "string", + "role": [ + "attribute" + ], + "refersTo": { + "resID": "0", + "resObject": "item" + } + }, + { + "colIndex": 2, + "colName": "text_language", + "colType": "categorical", + "role": [ + "attribute" + ] + }, + { + "colIndex": 3, + "colName": "author_gender", + "colType": "categorical", + "role": [ + "attribute" + ] + }, + { + "colIndex": 4, + "colName": "author_language", + "colType": "categorical", + "role": [ + "attribute" + ] + }, + { + "colIndex": 5, + "colName": "author_region", + "colType": "categorical", + "role": [ + "attribute" + ] + }, + { + "colIndex": 6, + "colName": "extrovert", + "colType": "categorical", + "role": [ + "suggestedTarget" + ] + } + ] + } + ] +} \ No newline at end of file diff --git a/axolotl/tests/data/datasets/text_dataset_1/tables/learningData.csv b/axolotl/tests/data/datasets/text_dataset_1/tables/learningData.csv new file mode 100644 index 0000000..3b18560 --- /dev/null +++ b/axolotl/tests/data/datasets/text_dataset_1/tables/learningData.csv @@ -0,0 +1,5 @@ +d3mIndex,raw_text_file,text_language,author_gender,author_language,author_region,extrovert +1,1.txt,Dutch,female,Dutch,OV,1 +2,2.txt,Dutch,female,Dutch,L,1 +3,3.txt,Dutch,female,Dutch,Other,1 +4,4.txt,Dutch,female,Dutch,A,1 diff --git a/axolotl/tests/data/datasets/text_dataset_1/text/1.txt b/axolotl/tests/data/datasets/text_dataset_1/text/1.txt new file mode 100644 index 0000000..4bc0498 --- /dev/null +++ b/axolotl/tests/data/datasets/text_dataset_1/text/1.txt @@ -0,0 +1 @@ +Automatic mechanical self reproduction Het onderwerp van deze documentaire is het kopiëren van een machine . Het gaat hier echter niet om het zomaar willekeurig kopiëren , het gaat om de zelf reproductie van een cel . Die cellen vormen dan een perfect duplicaat van de eerste machine . Van dit dupliceren of vermenigvuldigen zagen we een aantal voorbeelden in de documentaire . Mechanismen werden met elkaar verbonden , paren werden gevormd . Na een x aantal aanvoegingen splitsen de mechanismen zich terug . Dit illustreert dat door modellen samen te voegen en te combineren men replica's van een machine kan vormen . Dit betekent dat fundamentele ( menselijke ) verschijnselen te vervangen vallen . Wat op zich een zeer baanbrekend idee is . Bij het creëren van artificiële intelligentie gaat men op zoek naar de fundamenten van leven en intelligentie . Eerst en vooral vraagt de term artificiële intelligentie een korte toelichting . Onder kunstmatig leven verstaat men door de mens gemaakt leven en niet door de natuur . Het gaat erom een biologisch organisme te doen ontstaan door middel van computers en chemicaliën . Centraal staat hierbij de intentie om het door de mens gemaakte er zo natuurlijk mogelijk te laten uitzien . Dit nabootsen en reproduceren van levenskracht in mechanismen is een aspiratie die men reeds eeuwen lang koestert . 
Zo had men in de zeventiende eeuw een eend gemaakt die er zeer natuurlijk uitzag . Iedereen wist dat er mechanisme principes achter heel dit proces schuil gingen maar ze leek zo echt . Ze was niet uit organische materie opgebouwd maar toch werden uitwerpselen gesimuleerd . Dit punt in het verhaal brengt ons bij John Holland Een machine die zich zelf kan reproduceren dat gelooft toch niemand ! Maar hij stond wel degelijk voor het begin van kunstmatig leven en in het verlengde ervan de zelfreproductie van mechanismen . John van Neumann centreert het verhaal rond genetische en natuurlijke selectieprocessen . Hij gaat aan de slag met kunstmatige simulatiemiddelen om deze processen in kaart te brengen . Uiteindelijk hoopt hij hierin een beter inzicht te verwerven . Maar hoe gaat hij tewerk ? Hij gaat de kenmerken die eigen zijn aan mechanismen reduceren tot een reeks getallen . In tweede instantie gaat hij twee strings ( representaties van reeksen getallen ) op een willekeurige plaats breken en het begin van de eerste string aan het einde van de tweede koppelen . Zo bekomt hij twee nieuwe ketens . Dit proces valt met behulp van een computer heel snel te realiseren . De principes waar hij gebruik van maakt vallen terug te koppelen aan het gebruik van genetische algoritmen . Kopiëren heeft in deze documentaire alles te maken met reproductie . Dat is de leidraad . Ook de recombinatie krijgt een centrale plaats toe bedeeld . Zo vallen tienduizenden generaties ( van ontwikkeling ) in een handomdraai te simuleren . Een belangrijke stap in dit proces van de mens die zijn kennis uitbreidt over kunstmatige intelligentie is het congres in Los Alamos in de jaren tachtig . Wetenschappers van verschillende disciplines werden door de initiatiefnemer met elkaar in contact gebracht . Stel je voor wat hun samenwerking kan beteken . Wij als generatie die al een stap verder zitten horen nu dagelijks de discussies over hoe ethisch verantwoord de onderzoeken in dit veld zijn . Op deze workshop waren maar liefst hondert vijftig mensen aanwezig . Centraal punt vanheel het gebeuren was het filosoferen over nieuwe kunstvormen . Door de vertegenwoordiging van mensen uit verschillende vakgebieden krijg je een uitgebreid gamma aan visies op deze kwestie . Een eenduidig antwoord of ethiek formuleren bleek natuurlijk niet mogelijk . Men stond noch aan het begin van allerlei nieuwe ontwikkelingen . Doyne Farmer , een fysicus , werkachtig in Santa Fé New Mexico , houdt zich bezig met de chaostheorie en de filosofie . De chaostheorie houdt in dat als je aan een deel van de wereld ingrijpt , de gevolgen ervan aan de andere kant kunnen opduiken of hun uitwerking hebben . Nu Doynes filosofie stelt dat leven een spontaan natuurlijk verschijnsel is . Dit zonder ingrijpen van buitenaf , zonder dat er zich uitzonderlijke gebeurtenissen voordoen . Nu dit proces van " leven " is het veld van onderzoek van Doyne . Hij gaat kijken naar het ontstaan van leven vanuit de basis . De cellen hebben de eigenschap dat ze non-lineair zijn . Dit betekent dar ze info opslaan en met elkaar kunnen interageren . Al dit geëxperimenteer op dit vlak brengt ons ook bij een overkoepelende vraag . Wat zijn we als mens , wat betekenen we in deze wereld ? Ook de sociale en politieke gevolgen zijn niet te overzien eens men deze vraag aansnijdt . Het Massachussets Insitute of Technology is de werkplaats van twee belangrijke onderzoekers : nl. Berwick en Brooks . 
Brooks onderzoekt autonome robots die leven in deze wereld , dit betekent dat ze dus interageren met deze wereld . Een van zijn experimenten was een het simuleren van een landing op de maan met een ruimtetuig waar een robot in ondergebracht was . Voor vier dagen lang , de reisduur naar de maan , had men geen contact met de robot . Maar eens daar aangekomen , breekt de robot uit de consule en zet zijn eerste stappen op de maan . Wat een ongelooflijke prestatie ! Een andere robot waar Brooks mee werkt is Cog . Cog is een mensachtige robot . Z'n intelligentie valt te vergelijken met die van een klein kind . Hij heeft ook een heel aantal menselijke kenmerken . Hij kan z'n hoofd bewegen , zijn ogen bewegen met dezelfde snelheid als die van de mens . Z'n brein bestaat uit kleine verwerkingsmechanismen waarmee hij in staat moet zijn de lagere functies van de mens te kunnen vervullen . Zo kan hij bijvoorbeeld voorwerpen volgen met het oog . Ook de oog hand coördinatie vraagt speciale aandacht en is niet zo vanzelfsprekend . Maar wat noch het meest in het oog springt is de zintuiglijke verbeelding van symmetrie die de robot ten toon spreidt . Hij kan de handen alle twee naar voor brengen en tegen elkaar aan sluiten . Hij geeft de indruk evenwicht te bezitten . Op zich lijkt dit een eenvoudige handeling maar als je denkt aan alle verwerkingsmechanismen die hieraan te pas komen , moet je zonder twijfel toegeven dat de robot een geslaagde representatie is van de verwerkingsmechanismen die aan menselijke handelingen ten grondslag liggen . Wat is nu het doel van dit soort onderzoek ? Inzicht verkrijgen in de opbouw van het menselijke brein en het daarmee gepaarde inzicht hoe het werkt ? Deels kan men dit als het onderzoeksveld aanduiden . Maar wat nog belangrijker is , is dat men nu de kans heeft bestaande hypothesen te toetsen . Zo bouwt men mobiele robots die een bepaalde omgeving kunnen bewegen . De complexe opbouw van het menselijk organisme maakt de representatie van de handelingen die mensen stellen in de vorm van robots niet gemakkelijk . Daarom is het idee gerezen eerst insecten te bestuderen en te bekijken hoe deze zich organiseren . Op dit punt in het onderzoek wordt dan de term artificial insects geïntroduceerd . Zo maken we in de documentaire kennis met Hannibal , een robot met zes poten . Per poot heeft Hannibal maar liefst zeventien sensoren ingebouwd gekregen . Hij is dus wat we noemen decentraal georganiseerd . In tegenstelling tot wat we zo graag zouden willen geloven is ook de mens decentraal georganiseerd . Er bestaat niet zo iets als een centrale ik in mensen . Deze mythe is enkel en alleen ontstaan om onszelf te verklaren . Men kan het de mens natuurlijk niet kwalijk nemen dat hij dit fabeltje heeft verzonnen , maar veel waarde en geloog mogen we er niet aan hechten . Zo zijn we in het verhaal van de evolutie terug aanbeland bij Cog , de mensachtige robot . Doel van deze robot is een elementaire representatie te vormen van de menselijke basisfuncties en mogelijkheden . Een belangrijk onderdeel van Cog zijn de ingewerkte geluidsverwerkingssystemen . Uiteindelijk wil men de herkenning en mogelijke verwerking van menselijke stemmen mogelijk maken . Ook zijn er experimenten gedaan naar materiaal dat de menselijke huid kan voorstellen . Een ander en laatste belangrijke onderdeel van Cog zijn de sensoren , die de tastzin mogelijk maken . En deze zorgen ervoor dat Cog in contact staat met de wereld . Sensoren heb je namelijk nodig voor alle bestaansfuncties . 
De onderzoeker Herb Simon dat de complexiteit in de wereld kan ontdekt en verkend worden door systemen die in staat zijn te interageren met deze wereld . Dus door bewust in de wereld te staan en al je zintuigen ten volle te gebruiken kan een deel van de complexiteit verklaard worden door observatie en herleid worden tot eenvoudige processen . Waar zijn we dan aanbeland met ons evolutieverhaal ? We leven in de overtuiging dat de mens uit een complex systeem bestaat . Verandert dit iets aan onze houding tot kunstmatig leven ? Chris Langton heeft dit evolutionair proces trachten te begrijpen . Hij gaat evolutionele algoritmen implementeren in de computer . De wereld kan je dan voorstellen door een veld van vierkantjes met verschillende kleuren eromheen . Naargelang de energietoevoer en implementaties verandert de omringende kleur . Uit deze experimenten worden regels afgeleid . Het uiteindelijke doel is een machine te maken die een kopie van zichzelf genereert &slash; Zo kent de machine dan een exponentiele groei en valt dan uiteen in deeltjes duplicaten van zichzelf . Het vermogen van levende organismen om zichzelf te reproduceren lijkt dus op een menselijk kenmerk en valt te dus tegen alle verwachting in te herleiden tot een simpel systeem . Zo vallen er nog tal van voorbeelden aan te halen van experimenten met evolutionaire processen in een twee dimensionele kunstmatige wereld . Stuart Kaufmann ziet dan weer heil in het bestuderen van reproducerende chemische systemen en tracht hiermee moleculen voor de geneeskunde te ontdekken . Op zich lijkt dit me de interessantste tak van het onderzoek omdat ze zich bezig houdt met het mogelijke ontstaan en bestaan van organismen over heel de kosmos . Het beslaat een breed veld van onderzoek met andere woorden . Kaufmann houdt zich vooral bezig met het ontstaan van aminozuren of proteïnen . Hij gaat deze willekeurig combineren in DNA-stringen en zo het RNA afleiden . Proteïnen maken dus deel uit van complexe systemen . Doorheen de vier miljard jaar van evolutie zijn er verschillende soorten eiwitten en aminozuren ontstaan . Op dit moment zijn er slechts een fractie van alle mogelijk moleculen bekend . Maar door onderzoek en , toegegeven , eerder willekeurig combineren kan men nieuwe moleculen creëren met mogelijk nuttige eigenschappen . Deze nieuwe moleculen kunnen aan de basis liggen van de vorming van nieuwe medicijnen , enzymen en kunnen nog tal van andere functies vervullen . De praktische component van dit onderzoek staat dus niet zo ver van ons bed verwijdert als al die mensachtige of insectachtige robots , die als doel hebben de complex lijkende maar tot eenvoudig herleidbare basisfuncties van de mens te representeren . Wat meer inzicht in het functioneren van het menselijk organisme kan teweegbrengen en een decentralisatie van de mens in het kosmisch veld oplevert , maar verder geen baanbrekende ontdekkingen teweegbrengt . Nu zijn al deze chemische systemen en materialen mogelijk door diversiteit . Nu heeft men rond het dynamisch gedrag van complexe systemen heel wat theorieën opgebouwd . En wat is een betere testplaats dan de computer voor deze theorieën . Men gaat parallelle netwerken van moleculen opbouwen in buisjes of reactievaten . Deze chemische systemen gaan zich dan voortplanten . Ze bouwen zichzelf op en dragen zelf de mogelijkheid in zich informatie over te dragen . 
Maar een belangrijke component van dit onderzoek , namelijk de overdracht van energie en het werkelijk vormen van materie , is door een medium als de computer spijtig genoeg niet mogelijk of realiseerbaar . Moleculen die zich collectief voortplanten maakt deel uit van een nieuwe evolutie . Stel je voor , nieuwe ecosystemen die zich vormen . Als deze nieuw geïmplementeerde moleculen vergif vormen voor het nieuwe ecosysteem gaan ze een verdediging vormen . En als ze voedsel vormen gaan ze elementen als zuurstof en koolstof delen en treedt het fenomeen symbiose op . Dit is een heel interessant proces . Je kan zien hoe deze organismen hun leven vormgeven en wat de praktische uitwerking van deze wisselwerking is . \ No newline at end of file diff --git a/axolotl/tests/data/datasets/text_dataset_1/text/2.txt b/axolotl/tests/data/datasets/text_dataset_1/text/2.txt new file mode 100644 index 0000000..97588b9 --- /dev/null +++ b/axolotl/tests/data/datasets/text_dataset_1/text/2.txt @@ -0,0 +1 @@ +Artificiële Intelligentie Een documentaire uit de jaren '80 De documentaire start met de vraagstelling wat artificiële intelligentie precies is . Er wordt een vergelijking gemaakt met triplex blokken . De basis van leven is voortplanting en dat is dus ook de basisvereiste om te spreken van AI . Een programma dat in staat is zichzelf te reproduceren of samen te gaan met een ander programma kan intelligent zijn . Hierbij speelt de survival of the fittest een grote rol . Het kenmerkende aan Artificiële Intelligentie is dat er iets biologisch geschept wordt door de mens in plaats van door de natuur . Vervolgens wordt er iets biologisch gemaakt door middel van iets niet biologisch . Het is echter niet zo simpel als het lijkt . Een machine die zichzelf reproduceert is verre toekomstmuziek , of onmogelijk . Wat wel kan , is dat een programma met een lijst van kenmerken , die kenmerken laat kruisen en zo tot een nieuwe cijferreeks komt . Door het kopiëren van combineren van gegevens wordt een genetisch algoritme gecreëerd . Lange tijd waren verschillende onderzoekers geïsoleerd bezig met onderzoek naar AI . Een eerste congres , georganiseerd door Dayne Forner , bracht daar verandering in . Het was een groot succes . Er werd niet alleen gesproken over de technische kant van Artificiële Intelligentie , maar ook over de filosofische vragen die dit nieuwe fenomeen met zich meebrengt . Een groot verschil met natuurlijk leven , is dat artificieel leven niet het resultaat is van toeval , maar een beredeneerd proces is . De vraag is of dit ethisch verantwoord is . Sociaal en politiek gezien is AI geen vanzelfsprekendheid . Deze intelligente revolutie neemt ingrijpende veranderingen met zich mee . Door het congres werd duidelijk dat onderzoek naar Artificiële Intelligentie geen werk was van enkele zonderlingen . De natte droom van elke onderzoeker op gebied van Artificiële Intelligentie is een autonome robot . In Amerika is men erin geslaagd een robot te ontwerpen die dezelfde cognitieve capaciteiten heeft van een klein kind . De robot beschikt over 5 zintuigen en kan bewegen . Deze robot is verbonden met een extern ' brein ' dat hem kan besturen . Vooralsnog heeft deze machine geen noemenswaardige acties uitgevoerd , hij bevindt zich nog in een pril stadium . Meer ontwikkeld zijn , ironisch genoeg , artificiële insecten . Zo'n insect is bijvoorbeeld gebruikt bij een maanlandingsimulatie . Het insect kroop uit het ruimtetuig en nam stalen van de gesimuleerde maan . Er was dus interactie met de complexe buitenwereld . 
Artificiële Intelligentie is , net als de natuur , een evolutieproces . Evolutionaire algoritmen reproduceren zichzelf , en op den duur krijgt men willekeurige DNA-slierten die zelf ook nieuwe moleculen maken . Deze evolutie is eindeloos aangezien er oneindig veel verschillende moleculen mogelijk zijn . AI is dus dynamisch en kan evolueren tot complexe systemen , nieuwe ecosystemen en kan de evolutie van onze culturen danig beïnvloeden . Deze verouderde documentaire is een zeer ongelukkige filmkeuze . Voor zover ik begrepen heb , is het de bedoeling van het experiment om af te leiden uit onze schrijfstijl wat voor persoonlijkheid we hebben . Ik zie het nut hier zeker van in en ben benieuwd naar de resultaten . Verder begrijp ik dat de keuze viel op een film met betrekking tot het onderwerp van de colleges , zodat we die dinsdag nuttig bezig waren . Ik stel mij echter serieus vragen bij deze keuze . Ten eerste hebben we de film maar bekeken tot in de helft . De clou van het verhaal is dus niemand duidelijk geworden . Of het dus nuttig was voor onze algemene vorming om deze film te bekijken is mij zeer de vraag . Ten tweede was de documentaire totaal verouderd . Iedereen weet dat technologie razendsnel vooruit gaat , en dat hightech van de jaren '80 nu prehistorisch lijkt . Ik veronderstel dat het de bedoeling was om de basis van AI uit te leggen door middel van deze film . Wat dat betreft moet ik u ten zeerste teleurstellen , want ook dit doel is mislukt . Ik dacht even dat het aan mij lag , maar na wat rondvragen bij mijn medestudenten en studentes bleek dat ik zeker niet de enige ben die er vrij weinig begrepen heeft . Ik kan dus wel stellen dat de film zijn doel volledig gemist heeft . Over het concept Artificiële Intelligentie . Ik ben van mening dat vooruitgang belangrijk is in onze samenleving . Bijgevolg heb ik principieel niks tegen AI . Wel heb ik moeite met de benaming Artificiële Intelligentie , het is pretentieus te zeggen dat een programma of zelfs een robot zelf nadenkt . De acties zijn niet meer dan een voorgeprogrammeerde reactie op een bepaalde impuls . Nu kan men onder behavioristische invloeden wel zeggen dat ook de mens niet meer doet dan reageren op zijn omgeving . Tot op zeker hoogte kan ik mee gaan in die redenering , maar een intelligente mens kan kiezen op welke manier hij reageert op een bepaalde impuls , terwijl een machine slechts op een geprogrammeerde manier reageert . Er is dus geen keuze mogelijk . Zelfs wanneer er verschillende mogelijkheden geprogrammeerd zijn , dan zal het programma reageren volgens bepaalde voorwaarden . Wanneer een programma een nooit geziene situatie tegenkomt zal het niet weten hoe erop te reageren . Dit komt omdat ze niet zelfstandig na kunnen denken . Er is echter een sociaal probleem dat Artificiële intelligentie met zich meebrengt en waarover naar mijn mening te weinig nagedacht wordt . Iedereen gaat er van uit dat AI vooruitgang is , maar men moet zich afvragen ten koste van wat ? Intelligente machines zullen misschien inderdaad ingezet worden in de ruimte , omdat zij in onmenselijke omstandigheden kunnen ' overleven ' . Dit zijn volgens mij slechts uitzonderingen . Ik denk dat intelligente machines veel meer gebruikt zullen worden in het bedrijfsleven , aan de lopende band . Er zijn op dit moment enorm veel mensen die handelingen aan de band uitvoeren die binnen afzienbare tijd uitgevoerd kunnen worden door machines . 
Dit zal waarschijnlijk veel goedkoper zijn voor het bedrijf , en logischer wijze zullen deze machines de mensen vervangen . Er zullen dus een heleboel mensen werkloos worden . Op zich is dat niet zo dramatisch . Er is genoeg sociale zekerheid om deze mensen op te vangen , het grote probleem stelt zich echter pas later in de evolutie . Er zal een steeds grotere groep mensen komen die werkloos zijn . Vacatures zullen er nog genoeg zijn , maar enkel voor beroepen waarbij niet met de handen , maar met het brein gewerkt moet worden . Deze vacatures zullen niet ingevuld worden . Ik voorspel dat op de lange duur een meerderheid van de bevolking werkloos zal zijn . Nogmaals , dit is niet dramatisch voor armoede , maar er zal een heel nieuw probleem opduiken : verveling . Mensen die zich vervelen worden ontevreden . Ik denk dat deze hoge werkloosheid voor enorme sociale onrust zal leiden . De grote angst die Artificiële Intelligentie met zich meebrengt is volgens mij niet grogrond . Pessimisten vrezen ervoor dat de machines op een gegeven moment de macht over de mensen zullen krijgen . Ik denk dat dit zeer onwaarschijnlijk is . De mens is verstandig genoeg om , wanneer experimenten uit de hand dreigen te lopen , deze experimenten te stoppen . Een ' the Matrixachtig ' scenario zie ik nog niet meteen gebeuren . In extremis kan men bij wijze van spreken nog steeds de elektriciteit uit zetten . Ik denk dat ik kan besluiten dat Artificiële Intelligentie een grote stap vooruit is in de evolutie . Men moet hier echter voorzichtig mee omspringen zodat het niet uit de hand loopt . Wat de documentaire zelf betreft ben ik teleurgesteld in de keuze , want ik denk dat er veel interessantere beelden te vinden zijn bij dit toch wel intrigerende onderwerp . \ No newline at end of file diff --git a/axolotl/tests/data/datasets/text_dataset_1/text/3.txt b/axolotl/tests/data/datasets/text_dataset_1/text/3.txt new file mode 100644 index 0000000..082a9f0 --- /dev/null +++ b/axolotl/tests/data/datasets/text_dataset_1/text/3.txt @@ -0,0 +1 @@ +Film " Artificial Intelligence " De film start met een experiment van de wiskundige Von Neumann . Hij construeert bepaalde modellen van triplex die horizontaal ten opzichte van elkaar bewegen door het toevoegen van energie . Door de bewegingen grijpen de modellen in elkaar , waarna ze samengevoegd worden tot een groter model . Op deze manier wordt natuurlijke selectie nagebootst . Men gaat in de wetenschap van de " Artificial Intelligence " op zoek naar de fundamenten van het leven . Een wetenschapper verklaart dat het leven kunstmatig is en door mensen veroorzaakt wordt , de natuur staat hier buiten ( " Life is made bij human-beings , not by nature " ) . De wetenschap heeft als doel een biologisch organisme te creëren om zo de biologische verschijnselen te kunnen verklaren . Als voorbeeld wordt de " Mechanische eend " gegeven . Wetenschapper Holland liet in de jaren '60 de computer dingen creëren . Helaas kon de machine zichzelf niet reproduceren , maar deze was wel in staat problemen oplossen . De PC reproduceerde namelijk kenmerken tot reeksen getallen die met elkaar ook weer gecombineerd konden worden tot nieuwe reeksen getallen . Wanneer twee getallenketens gekruist worden vormt de nieuw samengestelde keten een genetisch algoritme , die gemakkelijk te verwerken is voor de computer . Deze worden ook wel " crossing copies " genoemd die nieuwe generaties kweken . Dit fenomeen werd indertijd verbreid door studenten . Fysicus Farmer geeft een congres over kunstmatig leven . 
Hij is pionier op het gebied van de Chaostheorie . Zijn filosofie luidt dat het leven spontaan en automatisch verloopt . Er vinden geen afzonderlijke gebeurtenissen plaats . Het gaat erom simpele delen te laten reageren , waardoor het leven spontaan geleefd wordt . De complexiteit van het leven ligt in de wereld en niet in de mens . Het beste systeem is de mens zelf . Wetenschapper Langton ziet de wereld als een ruitjespapier : men kan de wereld dus zien als iets dat is onderverdeeld in hokjes &slash; ruitjes . Deze structuur van de wereld kopieert zichzelf voordurend tot dat het op een gegeven moment niet meer mogelijk is . Dit verschijnsel wordt vergeleken met de voorplanting van levende wezens . Het laat de complexiteit van reacties van levende organismen zien . Dit wordt ook wel het zwermsimulatiesysteem genoemd . Wetenschappers Miller en Kaufmann onderzoeken de bouwstenen van levende organismen . Zij trachten complexe systemen van complexe organismen te maken , dit is de wetenschap van de toegepaste nucleaire evolutie . Meneer Brooks van het Artificial Labatory creëert autonome robots . Een voorbeeld is " Hannibal " , een mechanisch wezen met zes poten en 17 sensoren in die " voetjes " . De sensoren vormen het zenuwstelsel van de robot . Brooks plaatst een camera in de robot en laat een testvlucht uitvoeren door een apparaatje waarin Hannibal zich bevindt . Deze laat hij vervolgens landen op een maanlandschap . Cog is een menselijke robot , die bestaat uit een romp en een hoofd met ogen . De armen en benen ontbreken nog , maar hij heeft wel heupen . De bedoeling is dat Cog het menselijke brein krijgt en daardoor ook als een mens kan functioneren . Het menselijke leven wordt op deze manier nagebootst . De nek reageert bijvoorbeeld op datgene dat de ogen waarnemen . Dit moet de robot wel aangeleerd worden . Het brein vertoont dus intelligentie . De armen en handen van Cog zijn nog in de maak . Deze krijgen een plastic huid waarin zich sensoren bevinden om de robot tastzin te geven . Ook reageren de sensoren op temperatuurwisselingen , net als bij een echt mens . De intelligentie bestaat eigenlijk uit een wisselwerking tussen systeem en wereld , benut door moleculen . De wetenschap van de Articificiële Intelligentie heeft behoefte aan dynamisch gedrag van organismen , cellen , moleculen en de bits op chips ( in computers ) . Deze hebben een " chemical life " , zij zorgen met andere woorden zelf voor hun leven . Bepaalde soorten organismen scheppen ook hun eigen wereld , zij zorgen zelf voor het functioneren van hun eigen ecosysteem . De wetenschappers zijn het erover eens dat de aarde een wereldbeschaving kent en zelf zorgt voor de ontplooiing ervan . Men moet eerst de moleculen begrijpen en daarna is zij in staat de maatschappij van de mensen te kunnen doorgronden . Mijn mening tegenover deze film is gematigd positief . Ik sta achter het idee dat er onderzoek gedaan moet worden naar het functioneren van het brein van de mens door middel van artificiële intelligentie . Wel vind ik het een moeilijk te begrijpen materie , men moet een technische studie gedaan hebben om het allemaal te kunnen volgen , en , belangrijker nog , om zelf ook nieuwe theorieën te bedenken en uit te voeren . Interessant vind ik het experiment met de robot Cog , van wie het brein op dezelfde manier zou moeten functioneren als de menselijke hersenen . Ook de tastzin is iets menselijks dat nu ook " beschikbaar " is voor een robot . 
Op deze manier kan wellicht de mens in de toekomst geholpen worden wanneer er in het brein van de mens zelf iets mis is . In plaats van dat er dan geëxperimenteerd moet worden met echte hersenen kan men allerlei dingen juist uitproberen op een nagemaakt brein . Het is goed dat er verschillende wetenschappers in de film aan het woord gelaten worden , die zich allemaal bezighouden met een andere tak van de Artificiële Intelligentie . Zo leren we de verschillende mogelijkheden van deze wetenschap kennen en kunnen we zo bepalen wie met wat bezig is en wat we belangrijk vinden . Ik ben het met de wetenschappers eens dat de mens zelf gedeeltelijk voor zijn ontplooiing zorgt . Maar men mag niet vergeten dat een mens niet alleen door zichzelf , maar ook gevormd wordt door zijn omgeving , door het milieu waarin hij opgroeit en door de mensen met wie hij of zij omgaat . Ik denk dus niet dat de mens er geheel alleen voor staat . Bovendien wordt er in de film beweerd dat men eerst de moleculen moet begrijpen alvorens de maatschappij van de mens te kunnen doorgronden . Ik ben het er niet mee eens , omdat een maatschappij veelal bestaat uit een gevoelswereld : een mens heeft gevoelens en reageert daarop . Daaruit ontstaat een maatschappij waarin ieder mens zich anders gedraagt en ik geloof daarom niet dat men eerst via moleculen ( eigenlijk alles binnen de wetenschap ) kan begrijpen hoe een samenleving in elkaar zit . Het leven verloopt spontaan , dat geloof ik ook . Het leven hangt volgens mij van toevalligheden aan elkaar en de mens heeft weinig invloed op het gebied van ouder worden enz. Er wordt nog gezegd dat de complexiteit van het leven ligt in de wereld zelf en niet in de mens , maar ik ben van mening dat de mens zelf in staat is het leven complex te maken . Het is dus naar mijn mening niet zo dat alleen de wereld om ons heen complex is , maar ook dat de mens een ingewikkeld wezen is en leven leeft op een wel of niet complexe manier , afhankelijk van de persoon zelf . Ik vind het erg jammer , dat de film verre van recent is . Het precieze jaartal van de film is mij onbekend , maar ik zie aan de techniek en de kwaliteit van de film dat deze toch wel uit de jaren '80 moet zijn . Ik denk dat men nu , twintig jaar later , al veel verder is in deze wetenschap van de Artificiële Intelligentie . De robot zal inmiddels wel armen en benen hebben , die bovendien prima functioneren . Ook is men in staat nu heel veel met de computer te kunnen doen , omdat nu eenmaal de programma's sterk verbeterd zijn . Ik vraag mij dus af en ben ook erg benieuwd hoe het met de Artificiële Intelligentie in het jaar 2006 gesteld is . \ No newline at end of file diff --git a/axolotl/tests/data/datasets/text_dataset_1/text/4.txt b/axolotl/tests/data/datasets/text_dataset_1/text/4.txt new file mode 100644 index 0000000..feca7c3 --- /dev/null +++ b/axolotl/tests/data/datasets/text_dataset_1/text/4.txt @@ -0,0 +1 @@ +Bespreking documentaire Artificiële Intelligentie In een ietwat verouderde documentaire krijgen we een aantal ( Amerikaanse ? ) wetenschappers te zien die vertellen over hun zoektocht naar artificieel leven en dus eigenlijk ook naar de fundamenten van leven en intelligentie . Ze hopen ooit zelf een levend organisme te creëren . Anders dan alles wat wij nu het adjectief ' levend ' zouden toekennen , zijn ze op zoek naar leven dat niet door de natuur gemaakt is . 
In navolging van de computer die in de jaren '60 van de vorige eeuw voor het eerst zelf op zoek ging naar oplossingen voor ingewikkelde problemen , wil het heerschap een machine maken die zichzelf kan reproduceren . ' Een computer kan zelf oplossingen bedenken , nu willen we er eentje die zichzelf ook nog eens kan voortplanten ' , zo lijkt hun adagium te luiden . Dit gaat uiteraard in tegen de heersende ideeën die de mogelijkheid van zelfreproductie ontkennen . Edoch , er blijkt meer interesse voor dergelijke projecten dan verwacht . Steeds meer mensen zijn gebiologeerd door de vraag naar artificiële levensvormen en de eerste conferentie voor ' artificial life ' was dan ook een groot succes . In eerste instantie zien we hoe het probleem op technologisch niveau gereduceerd wordt tot een lijst van nummers . Die vormen ketens en via genetische algoritmes worden ze dan gekopieerd en gekruist . Op die manier ontstaan er op korte tijd verschillende generaties en dit is wat onze wetenschappers willen bewerkstelligen . Uiteindelijk draait het namelijk allemaal om de mens zelf . Als we zelf artificieel leven kunnen beginnen , kunnen we van nabij - en vanaf het prille begin - een evolutie bestuderen die dan zou moeten overeenkomen met de evolutie die de mens tot mens gemaakt heeft : Darwins theorie in de praktijk omgezet . Maar meer dan de evolutietheorie hopen onze wetenschappers ook op een ultiem inzicht in de raadselachtige werking van het menselijk brein . Ene Brooks ontwikkelt autonome robots die zich als levende wezens gedragen en toont ons een gesimuleerde maanreis . Zijn creatuur Cog zou ooit het intelligentieniveau van een klein kind moeten bereiken . Hij ontwerpt ook artificiële insecten om zijn hypotheses uit te testen . Zo beschikken we over een gedecentraliseerd zenuwstelsel en blijken we dus niet over één centraal punt te beschikken waarop alles samenkomt . Ons wezen , wie we zijn , bevindt zich dus niet op één aanwijsbare plaats in ons lichaam . We zien robots met onder andere een huid die net als de onze bedekt is met allerlei sensoren . Hij stelt zijn creaturen op die manier in staat tot aanraken maar ook tot het horen en zien van zichzelf en de wereld . Kortom , er worden sensoren ontwikkeld voor elke bestaansfunctie . Het verbaast dan ook niet dat het de wisselwerking met de wereld rondom ons is , die ons maakt tot wie we zijn . De complexiteit blijkt in de wereld en niet in de mens te zitten . Er passeren nog wat wetenschappers de revue die via hun zoektocht naar artificieel leven eigenlijk blijk geven van hun verwondering over de mens als intelligent wezen en diens interactie met de ons omringende wereld . Stanley Miller maakte in de jaren '50 zelf van DNA en RNA willekeurige strengen die geen kopieën vormden van bestaande proteïnen . Hij wilde dus geen leven klonen maar nieuwe reproducerende chemische systemen maken . Als we naar alle bouwstenen kijken , blijkt dat er op dat vlak nog maar een fractie van de mogelijkheden benut is . Er zijn nog ontelbaar veel combinaties van moleculen die we niet uitgetest hebben . Dit kan een interessant te verkennen terrein zijn in de zoektocht naar de mogelijkheden van de creatie van artificieel leven . Wat nu kan op de computer - bits op chips die op elkaar reageren - kunnen we misschien ooit op parallelle wijze doen met cellen en moleculen . Hieraan wordt een mogelijke nieuwe visie op mens en wereld gekoppeld . We zouden zelf ecosystemen kunnen vormen en , zoals eerder vermeld , een nieuwe evolutie bestuderen . 
Maar , dit heeft ook ethische consequenties . Mogelijk kunnen wie via de studie van artificieel leven ideeën opdoen over onze maatschappij . Hoe kunnen we gemeenschappen laten samenleven ? Op naar eeuwige wereldvrede ? Dat was een oppervlakkige weergave van het door ons bekeken deel van de documentaire over artificiële intelligentie en nu is het de bedoeling dat ik mijn mening te berde breng . Allereerst lijkt het me een moeilijke - zo niet onmogelijke - opdracht mijn mening te geven over een onderwerp waarin ik niet thuis ben . Niet voor niets beschouw ik mijn korte inhoud van de uitzending als oppervlakkig ; ik heb geen voeling met het onderwerp en zou dus ook niet tot een coherente en diepgaande beschrijving in staat zijn . Toegegeven , de film was niet moeilijk te volgen en het is altijd prettig geconfronteerd te worden met iets nieuws . Over de vorm kan ik natuurlijk ook wel iets zeggen . Op inhoudelijk vlak vind ik enige schroom van mijnentwege echter niet overbodig . Verder heb ik ook enige scrupules tegenover de test die we hebben moeten afleggen . Iedereen is vertrouwd met Jungs termen ' introvert ' en extravert ' en de uitkomst lijkt me dan ook enigszins arbitrair aangezien de invuller op voorhand kan voorspellen wat de invloed van zijn antwoorden op de uiteindelijke score zal zijn . Maar goed , misschien nemen de onderzoekers ons in het ootje en draait dit experiment helemaal niet rond de voorspelling van zogenaamde karaktereigenschappen van auteurs aan de hand van taalkundige kenmerken van hun teksten . Terug naar onze documentaire . Zoals reeds aangestipt vond ik het niet oninteressant . Ik kan het alleen niet laten mij af te vragen in welk jaar alles opgenomen is en hoe men ondertussen geëvolueerd is . Verouderd leek het me alleszins , zeker in het licht van de technologische vooruitgang die onze tijd kenmerkt . Verder was het helder en toegankelijk gebracht . De interviews waren niet altijd even flitsend . Ik kan me niet van de idee ontdoen dat de meeste geportretteerden aardig in de buurt kwamen van het prototype ' computernerd ' , al is het uiteraard nooit slecht om begeesterde mensen te horen vertellen over hun grote passies . Het doet vreemd aan dat deze figuren - die dag in dag uit eenzaam voor hun scherm of prutsend met mechanische vliegjes in de weer zijn - meestal erg veel interesse blijken te hebben voor de mens ( ofte het intelligente wezen bij uitstek ) . Sterker nog , hun motivatie ligt bij die mens in wie ze een ultiem inzicht willen verwerven . Ik meen daar een zekere paradox in te ontwaren . Ik vraag me af of écht contact met een échte mens niet inzichtelijker kan zijn dan het zoeken naar artificiële levensvormen . Die gedachte is misschien wat te ongenuanceerd en het is zeker niet mijn bedoeling hun onderzoek als nutteloos af te doen . Het lijkt me zelfs belangrijk dat men zich hiermee bezig blijft houden , al was het maar vanuit de ethische overwegingen die men zelf aangaf . Alleen , nogmaals , ik ben absoluut geen kenner ; wat mij betreft had dit zelfs één klucht kunnen zijn , een door mij nog onontdekte film van Monty Python ( maar dan minder grappig ) . Wat ook opvallend is , is dat al deze mannen op zoek waren naar het ultieme , de oorsprong . Ze hoopten inzichten te verwerven in het meest complexe . Ik durf echter te beweren dat , zelfs als men ooit slaagt in het creëren van artificiële levensvormen , er altijd iets zal zijn dat ons ontsnapt . 
De interactie van de mens met de wereld zal altijd iets ongrijpbaars inhouden , het lijkt me een utopie een ultiem inzicht te kunnen verwerven in de menselijke geest , in wie we echt zijn . Of zijn we nu aan het filosoferen ? \ No newline at end of file diff --git a/axolotl/tests/data/datasets/timeseries_dataset_1/datasetDoc.json b/axolotl/tests/data/datasets/timeseries_dataset_1/datasetDoc.json new file mode 100644 index 0000000..09b93e2 --- /dev/null +++ b/axolotl/tests/data/datasets/timeseries_dataset_1/datasetDoc.json @@ -0,0 +1,71 @@ +{ + "about": { + "datasetID": "timeseries_dataset_1", + "datasetName": "A test dataset with timeseries data", + "description": "Historical stock market closing data", + "source": "Kaggle", + "sourceURI": "https://www.kaggle.com/borismarjanovic/price-volume-data-for-all-us-stocks-etfs", + "datasetSchemaVersion": "4.0.0", + "license": "CC0", + "redacted": false, + "digest": "819ececf396b0f2d60198115c8ff2001a8a6804e6cfb188388ad48ea28d43b16", + "datasetVersion": "4.0.0" + }, + "dataResources": [ + { + "resID": "learningData", + "resPath": "tables/learningData.csv", + "resType": "table", + "resFormat": { + "text/csv": [ + "csv" + ] + }, + "isCollection": false, + "columnsCount": 5, + "columns": [ + { + "colIndex": 0, + "colName": "d3mIndex", + "colType": "integer", + "role": [ + "index" + ] + }, + { + "colIndex": 1, + "colName": "Company", + "colType": "categorical", + "role": [ + "attribute" + ] + }, + { + "colIndex": 2, + "colName": "Year", + "colType": "categorical", + "role": [ + "attribute" + ] + }, + { + "colIndex": 3, + "colName": "Date", + "colType": "dateTime", + "role": [ + "attribute", + "timeIndicator" + ] + }, + { + "colIndex": 4, + "colName": "Close", + "colType": "real", + "role": [ + "suggestedTarget" + ] + } + ] + } + ] +} \ No newline at end of file diff --git a/axolotl/tests/data/datasets/timeseries_dataset_1/tables/learningData.csv b/axolotl/tests/data/datasets/timeseries_dataset_1/tables/learningData.csv new file mode 100644 index 0000000..1d9fad5 --- /dev/null +++ b/axolotl/tests/data/datasets/timeseries_dataset_1/tables/learningData.csv @@ -0,0 +1,41 @@ +d3mIndex,Company,Year,Date,Close +50,abbv,2013,2013-11-01,42.653 +51,abbv,2013,2013-11-04,42.48 +52,abbv,2013,2013-11-05,41.965 +53,abbv,2013,2013-11-06,41.861999999999995 +54,abbv,2013,2013-11-07,41.155 +55,abbv,2013,2013-11-08,41.843999999999994 +56,abbv,2013,2013-11-11,42.315 +57,abbv,2013,2013-11-12,41.365 +58,abbv,2013,2013-11-13,41.68899999999999 +59,abbv,2013,2013-11-14,41.853 +60,abbv,2013,2013-11-15,42.201 +61,abbv,2013,2013-11-18,42.576 +62,abbv,2013,2013-11-19,42.22 +63,abbv,2013,2013-11-20,41.773999999999994 +64,abbv,2013,2013-11-21,42.315 +65,abbv,2013,2013-11-22,42.637 +66,abbv,2013,2013-11-25,42.176 +67,abbv,2013,2013-11-26,42.341 +68,abbv,2013,2013-11-27,42.036 +69,abbv,2013,2013-11-29,42.211000000000006 +70,abbv,2013,2013-12-02,42.158 +71,abbv,2013,2013-12-03,43.533 +72,abbv,2013,2013-12-04,43.272 +73,abbv,2013,2013-12-05,43.351000000000006 +74,abbv,2013,2013-12-06,44.735 +75,abbv,2013,2013-12-09,44.615 +76,abbv,2013,2013-12-10,45.426 +77,abbv,2013,2013-12-11,45.886 +78,abbv,2013,2013-12-12,45.643 +79,abbv,2013,2013-12-13,45.625 +80,abbv,2013,2013-12-16,46.498000000000005 +81,abbv,2013,2013-12-17,46.722 +82,abbv,2013,2013-12-18,47.323 +83,abbv,2013,2013-12-19,45.851000000000006 +84,abbv,2013,2013-12-20,45.816 +85,abbv,2013,2013-12-23,45.806999999999995 +86,abbv,2013,2013-12-24,45.617 +87,abbv,2013,2013-12-26,46.166000000000004 +88,abbv,2013,2013-12-27,45.783 
+89,abbv,2013,2013-12-30,46.183 diff --git a/axolotl/tests/data/datasets/timeseries_dataset_2/datasetDoc.json b/axolotl/tests/data/datasets/timeseries_dataset_2/datasetDoc.json new file mode 100644 index 0000000..3d815a2 --- /dev/null +++ b/axolotl/tests/data/datasets/timeseries_dataset_2/datasetDoc.json @@ -0,0 +1,85 @@ +{ + "about": { + "datasetID": "timeseries_dataset_2", + "datasetName": "Test of timeseries classification dataset", + "description": "Based on 66_chlorineConcentration", + "datasetSchemaVersion": "4.0.0", + "redacted": false, + "datasetVersion": "4.0.0", + "digest": "8d574424ecf2ce36d5cadcefcfe967e8fac523de91f9f35588acb2b86daec329" + }, + "dataResources": [ + { + "resID": "0", + "resPath": "timeseries/", + "resType": "timeseries", + "resFormat": { + "text/csv": [ + "csv" + ] + }, + "isCollection": true, + "columnsCount": 2, + "columns": [ + { + "colIndex": 0, + "colName": "time", + "colType": "integer", + "role": [ + "timeIndicator" + ] + }, + { + "colIndex": 1, + "colName": "value", + "colType": "real", + "role": [ + "attribute" + ] + } + ] + }, + { + "resID": "learningData", + "resPath": "tables/learningData.csv", + "resType": "table", + "resFormat": { + "text/csv": [ + "csv" + ] + }, + "isCollection": false, + "columnsCount": 3, + "columns": [ + { + "colIndex": 0, + "colName": "d3mIndex", + "colType": "integer", + "role": [ + "index" + ] + }, + { + "colIndex": 1, + "colName": "timeseries_file", + "colType": "string", + "role": [ + "attribute" + ], + "refersTo": { + "resID": "0", + "resObject": "item" + } + }, + { + "colIndex": 2, + "colName": "label", + "colType": "categorical", + "role": [ + "suggestedTarget" + ] + } + ] + } + ] +} \ No newline at end of file diff --git a/axolotl/tests/data/datasets/timeseries_dataset_2/tables/learningData.csv b/axolotl/tests/data/datasets/timeseries_dataset_2/tables/learningData.csv new file mode 100644 index 0000000..db84456 --- /dev/null +++ b/axolotl/tests/data/datasets/timeseries_dataset_2/tables/learningData.csv @@ -0,0 +1,6 @@ +d3mIndex,timeseries_file,label +0,0000_train_ts.csv,1 +1,0001_train_ts.csv,3 +2,0002_train_ts.csv,1 +3,0003_train_ts.csv,3 +4,0004_train_ts.csv,2 diff --git a/axolotl/tests/data/datasets/timeseries_dataset_2/timeseries/0000_train_ts.csv b/axolotl/tests/data/datasets/timeseries_dataset_2/timeseries/0000_train_ts.csv new file mode 100644 index 0000000..d85f421 --- /dev/null +++ b/axolotl/tests/data/datasets/timeseries_dataset_2/timeseries/0000_train_ts.csv @@ -0,0 +1,167 @@ +time,value +0,2.6173 +1,3.2310000000000003 +2,2.8508 +3,2.7515 +4,2.3457 +5,2.2746 +6,1.9898 +7,1.849 +8,1.4533 +9,1.3171 +10,1.1623 +11,0.9791 +12,0.76748 +13,-0.27063000000000004 +14,1.4944 +15,1.4457 +16,1.2830000000000001 +17,1.1554 +18,0.73141 +19,0.53898 +20,0.26045 +21,0.058329 +22,-0.30429 +23,-0.60378 +24,-0.92625 +25,-1.0126 +26,1.8559 +27,1.5001 +28,1.3893 +29,0.9980899999999999 +30,0.68913 +31,0.35189000000000004 +32,0.078764 +33,-0.41918 +34,-0.74724 +35,-1.0089 +36,-1.0009 +37,-1.0751 +38,0.9516600000000001 +39,1.4072 +40,0.83868 +41,0.91775 +42,-0.18871 +43,-0.76323 +44,-0.57789 +45,-1.0485 +46,-0.68799 +47,-0.94221 +48,-1.0463 +49,-1.0259 +50,0.41773999999999994 +51,1.3372 +52,0.9633 +53,0.65598 +54,-0.14524 +55,-0.58696 +56,-0.87084 +57,-0.89159 +58,-1.0501 +59,-1.0585 +60,-1.0934 +61,-1.015 +62,-1.1281 +63,1.093 +64,0.71907 +65,0.46547 +66,0.030246 +67,-0.3837 +68,-0.6169399999999999 +69,-0.7783100000000001 +70,-0.88265 +71,-1.0328 +72,-0.9803 +73,0.22255 +74,0.029094 +75,-0.056766 +76,-0.26168 +77,-1.0202 
+78,0.5554600000000001 +79,0.013634 +80,-0.44581000000000004 +81,-0.80943 +82,1.3638 +83,1.2515 +84,0.8237200000000001 +85,0.8472 +86,0.43723999999999996 +87,0.093769 +88,-0.39741 +89,-0.522 +90,-0.69016 +91,-1.0744 +92,-1.0935 +93,-1.0839 +94,-0.97987 +95,1.1519 +96,0.7447600000000001 +97,0.010931 +98,-1.0667 +99,-1.0067 +100,-1.0659 +101,-1.0271 +102,0.9205700000000001 +103,0.6635300000000001 +104,0.65109 +105,0.7600899999999999 +106,0.54686 +107,0.26136 +108,-0.76693 +109,-1.016 +110,-1.0511 +111,-1.0899 +112,0.10147 +113,0.3881 +114,0.30463 +115,-0.23206 +116,-1.1049 +117,-1.1303 +118,-0.55779 +119,-0.17225 +120,0.095547 +121,-0.44538 +122,-0.7712600000000001 +123,-0.51544 +124,-0.9514600000000001 +125,-1.03 +126,-1.1066 +127,-1.0669 +128,-0.9079200000000001 +129,-1.1306 +130,-1.0904 +131,0.13903 +132,-1.1024 +133,-1.1326 +134,-1.1344 +135,-0.20390999999999998 +136,-0.20756999999999998 +137,0.23044 +138,0.8553799999999999 +139,1.1254 +140,0.1059 +141,-0.69467 +142,0.66376 +143,-0.18738 +144,-1.0587 +145,-0.59105 +146,-1.0612 +147,0.331 +148,-0.52687 +149,0.25469 +150,0.34862 +151,-0.51362 +152,-0.25538 +153,0.31214000000000003 +154,1.7621 +155,-0.27767 +156,-0.33077 +157,1.307 +158,0.10141 +159,-0.5019600000000001 +160,-0.17143 +161,-0.70724 +162,-0.7883399999999999 +163,-1.0059 +164,-0.79022 +165,-0.8463200000000001 diff --git a/axolotl/tests/data/datasets/timeseries_dataset_2/timeseries/0001_train_ts.csv b/axolotl/tests/data/datasets/timeseries_dataset_2/timeseries/0001_train_ts.csv new file mode 100644 index 0000000..e18c17d --- /dev/null +++ b/axolotl/tests/data/datasets/timeseries_dataset_2/timeseries/0001_train_ts.csv @@ -0,0 +1,167 @@ +time,value +0,1.9823 +1,3.4956 +2,3.1081 +3,2.9702 +4,2.5463 +5,2.4592 +6,2.1733 +7,1.9542 +8,1.355 +9,1.2977 +10,0.98324 +11,0.77696 +12,0.098927 +13,-0.50663 +14,1.528 +15,1.4895 +16,1.3467 +17,1.2313 +18,1.0021 +19,0.7187100000000001 +20,0.48213 +21,0.13885 +22,-0.33235 +23,-0.65793 +24,-0.8736 +25,-0.75132 +26,1.9997 +27,1.6977 +28,1.5681 +29,1.0839 +30,0.56184 +31,-0.028273000000000003 +32,-0.37601 +33,-0.67812 +34,-0.7914399999999999 +35,-0.8059999999999999 +36,-0.75559 +37,-0.9463 +38,1.1798 +39,1.4796 +40,0.9118200000000001 +41,1.0849 +42,0.50515 +43,-0.22955 +44,-0.37004000000000004 +45,-0.85449 +46,-0.56311 +47,-0.71116 +48,-0.8479899999999999 +49,-0.7910699999999999 +50,0.26954 +51,1.3922 +52,1.0935 +53,0.72696 +54,-0.24668 +55,-0.7944100000000001 +56,-0.8673700000000001 +57,-0.8511 +58,-0.8705 +59,-0.84898 +60,-0.8915700000000001 +61,-0.79677 +62,-0.82947 +63,1.2566 +64,0.71404 +65,0.48503999999999997 +66,-0.10758 +67,-0.68355 +68,-0.74587 +69,-0.8713200000000001 +70,-0.92735 +71,-0.84114 +72,-0.87654 +73,0.25838 +74,-0.1559 +75,-0.24135 +76,-0.63272 +77,-0.81109 +78,0.29877 +79,-0.43173 +80,-0.71367 +81,-0.82047 +82,1.4291 +83,1.3075 +84,0.9849700000000001 +85,0.90586 +86,0.027719999999999998 +87,-0.26041 +88,-0.44211999999999996 +89,-0.5905100000000001 +90,-0.74142 +91,-0.83173 +92,-0.8819600000000001 +93,-0.8899799999999999 +94,-0.81484 +95,1.2019 +96,0.52318 +97,-0.089643 +98,-0.91986 +99,-0.8296600000000001 +100,-0.84331 +101,-0.8537399999999999 +102,0.8985 +103,0.25436 +104,-0.3046 +105,0.63276 +106,-0.070212 +107,-0.42841 +108,-0.86419 +109,-0.9148700000000001 +110,-0.8722200000000001 +111,-0.9547 +112,0.49523 +113,-0.38916999999999996 +114,-0.46358 +115,-0.7834800000000001 +116,-0.98756 +117,-1.0025 +118,-0.35483000000000003 +119,-0.55901 +120,-0.59605 +121,-0.91122 +122,-1.0008 +123,-0.9440700000000001 +124,-0.9098200000000001 
+125,-0.96752 +126,-0.8563299999999999 +127,-0.9729399999999999 +128,-0.88623 +129,-0.8785 +130,-1.022 +131,0.19705 +132,-0.81402 +133,-0.99931 +134,-0.92496 +135,-0.62571 +136,-0.4478 +137,-0.28868 +138,0.81553 +139,1.2772 +140,-0.062002 +141,-0.68184 +142,0.72142 +143,-0.40339 +144,-0.89655 +145,-0.62529 +146,-0.8924 +147,0.37454 +148,-0.49625 +149,0.29134 +150,0.37382 +151,-0.49281 +152,-0.26200999999999997 +153,0.38423 +154,2.0327 +155,-0.24527 +156,-0.36593000000000003 +157,1.6178 +158,0.11659000000000001 +159,-0.21259 +160,-0.10132000000000001 +161,-0.46892 +162,-0.57443 +163,-0.84519 +164,-0.58468 +165,-0.64564 diff --git a/axolotl/tests/data/datasets/timeseries_dataset_2/timeseries/0002_train_ts.csv b/axolotl/tests/data/datasets/timeseries_dataset_2/timeseries/0002_train_ts.csv new file mode 100644 index 0000000..af1a0f6 --- /dev/null +++ b/axolotl/tests/data/datasets/timeseries_dataset_2/timeseries/0002_train_ts.csv @@ -0,0 +1,167 @@ +time,value +0,4.8586 +1,4.5758 +2,4.1498 +3,3.9892 +4,3.0488 +5,2.8938 +6,2.2543 +7,2.2126 +8,1.2969 +9,1.1425 +10,1.0597 +11,0.14063 +12,-0.35444000000000003 +13,-0.70572 +14,1.1516 +15,0.9837799999999999 +16,0.6471 +17,0.33396 +18,-0.10039 +19,-0.29231999999999997 +20,-0.70015 +21,-0.75255 +22,-0.39102 +23,-0.084372 +24,-0.37454 +25,-0.53305 +26,1.8829 +27,1.3576 +28,1.2267 +29,0.43056000000000005 +30,0.068672 +31,-0.1271 +32,-0.089653 +33,-0.044433999999999994 +34,-0.081214 +35,-0.21785 +36,-0.68967 +37,-0.767 +38,0.0027614999999999996 +39,0.9681299999999999 +40,0.26405 +41,0.010522 +42,-0.16035 +43,-0.35631999999999997 +44,-0.63384 +45,-0.3864 +46,-0.30669 +47,-0.5795 +48,-0.6926800000000001 +49,-0.45736000000000004 +50,-0.81803 +51,0.78193 +52,0.26509 +53,-0.26216999999999996 +54,-0.71526 +55,-0.34074 +56,-0.42466000000000004 +57,-0.27652 +58,-0.35326 +59,-0.41158 +60,-0.5677 +61,-0.33064 +62,0.30475 +63,0.40873000000000004 +64,-0.23434000000000002 +65,-0.49125 +66,-0.77212 +67,-0.37124 +68,-0.39996 +69,-0.23154 +70,-0.34033 +71,-0.30262 +72,-0.70125 +73,-0.65893 +74,-0.64687 +75,-0.6125 +76,-0.24585 +77,-0.4276 +78,-0.054196 +79,-0.15147 +80,-0.12872999999999998 +81,-0.16357 +82,0.79564 +83,0.61305 +84,-0.054303 +85,-0.08632100000000001 +86,-0.73575 +87,-0.73821 +88,-0.41079 +89,-0.38871999999999995 +90,-0.27960999999999997 +91,-0.43795 +92,-0.56787 +93,-0.50261 +94,-0.72768 +95,0.38936 +96,-0.33752 +97,-0.78262 +98,-0.7526 +99,-0.66086 +100,-0.60836 +101,-0.71277 +102,-0.12215999999999999 +103,-0.5754199999999999 +104,-0.75343 +105,-0.26294 +106,-0.6723600000000001 +107,-0.6574399999999999 +108,-0.27908 +109,-0.77252 +110,-0.72296 +111,-0.7743800000000001 +112,-0.44978999999999997 +113,-0.6368 +114,-0.60634 +115,-0.30333000000000004 +116,-0.79396 +117,-0.7883600000000001 +118,-0.28548 +119,-0.27745 +120,-0.6561600000000001 +121,-0.44916999999999996 +122,-0.3475 +123,-0.28018000000000004 +124,-0.50326 +125,-0.42686999999999997 +126,-0.60644 +127,-0.53396 +128,-0.27691 +129,-0.83825 +130,-0.5946899999999999 +131,-0.8487899999999999 +132,-0.73541 +133,-0.8252799999999999 +134,-0.8324600000000001 +135,-0.83101 +136,-0.59501 +137,-0.42867 +138,1.3102 +139,0.8799100000000001 +140,-0.0632 +141,0.10967 +142,0.20725 +143,-0.19758 +144,-0.70539 +145,-0.11584000000000001 +146,-0.6632100000000001 +147,-0.091676 +148,0.2286 +149,0.11830999999999998 +150,0.22989 +151,0.23324 +152,0.15070999999999998 +153,1.0782 +154,2.0468 +155,0.12240999999999999 +156,0.07021799999999999 +157,1.1612 +158,0.87789 +159,0.38756 +160,0.62425 +161,-0.0047575000000000004 
+162,-0.35352 +163,-0.7303 +164,-0.46326000000000006 +165,-0.51458 diff --git a/axolotl/tests/data/datasets/timeseries_dataset_2/timeseries/0003_train_ts.csv b/axolotl/tests/data/datasets/timeseries_dataset_2/timeseries/0003_train_ts.csv new file mode 100644 index 0000000..ec8409b --- /dev/null +++ b/axolotl/tests/data/datasets/timeseries_dataset_2/timeseries/0003_train_ts.csv @@ -0,0 +1,167 @@ +time,value +0,1.236 +1,3.7163 +2,3.2952 +3,3.1181 +4,2.6496 +5,2.5397 +6,2.1628 +7,1.8094 +8,0.7572399999999999 +9,0.56834 +10,-0.22157 +11,-0.52215 +12,-1.1688 +13,-0.9571799999999999 +14,0.9615799999999999 +15,1.0536 +16,1.0224 +17,1.0149 +18,0.9633799999999999 +19,0.89428 +20,0.76551 +21,0.59334 +22,0.16609000000000002 +23,-0.3031 +24,-0.84449 +25,-0.9038299999999999 +26,1.9254 +27,1.6323 +28,1.5239 +29,1.0874 +30,0.7976 +31,0.40471999999999997 +32,-0.013235 +33,-0.53563 +34,-0.83278 +35,-0.92916 +36,-0.84964 +37,-1.0666 +38,1.1001 +39,1.021 +40,0.62069 +41,0.92768 +42,0.62527 +43,-0.07334 +44,-0.16005999999999998 +45,-0.8034 +46,-0.42658 +47,-0.99103 +48,-1.0043 +49,-0.9260700000000001 +50,0.87384 +51,1.0152 +52,0.8123199999999999 +53,0.7854800000000001 +54,0.07384600000000001 +55,-0.69735 +56,-0.90138 +57,-0.96973 +58,-0.9998 +59,-0.98606 +60,-1.0353 +61,-0.9385399999999999 +62,-1.0972 +63,0.9461700000000001 +64,0.69755 +65,0.59309 +66,0.22095 +67,-0.53742 +68,-0.6631100000000001 +69,-0.8613700000000001 +70,-0.9646899999999999 +71,-1.0123 +72,-0.9758600000000001 +73,0.41891000000000006 +74,0.17314000000000002 +75,0.053424 +76,-0.46629 +77,-0.963 +78,0.62378 +79,-0.1376 +80,-0.58324 +81,-0.89433 +82,1.0381 +83,0.97735 +84,0.79093 +85,0.85362 +86,0.56367 +87,0.1854 +88,-0.27579000000000004 +89,-0.30308 +90,-0.57725 +91,-1.0168 +92,-1.0294 +93,-0.9908600000000001 +94,-0.88141 +95,0.9129200000000001 +96,0.6469 +97,0.06919299999999999 +98,-1.0373 +99,-0.9388799999999999 +100,-0.9823799999999999 +101,-0.9659700000000001 +102,0.82484 +103,0.47025 +104,-0.34156 +105,0.73311 +106,0.30246 +107,-0.081764 +108,-0.6783899999999999 +109,-0.98646 +110,-0.99456 +111,-1.0414 +112,0.611 +113,0.033302 +114,-0.11234000000000001 +115,-0.58287 +116,-1.1023 +117,-1.1077 +118,0.062259 +119,-0.38235 +120,-0.42728999999999995 +121,-0.8333799999999999 +122,-0.8248 +123,-0.81028 +124,-1.065 +125,-0.8055399999999999 +126,-1.0867 +127,-1.0317 +128,-1.0554 +129,-1.0606 +130,-0.9756799999999999 +131,-0.1002 +132,-0.90191 +133,-1.1446 +134,-1.0586 +135,-0.64595 +136,-0.64173 +137,-0.4198 +138,0.40267 +139,1.3179 +140,0.32492 +141,-0.51839 +142,0.8811399999999999 +143,0.061887 +144,-1.0208 +145,-0.46601000000000004 +146,-1.031 +147,0.60511 +148,-0.20646 +149,0.52095 +150,0.50285 +151,-0.21169000000000002 +152,0.046396 +153,1.0739 +154,2.1339 +155,-0.0021662 +156,-0.082932 +157,1.8489 +158,0.7227399999999999 +159,-0.32481 +160,0.24686999999999998 +161,-0.57438 +162,-0.68706 +163,-0.9234899999999999 +164,-0.67801 +165,-0.76225 diff --git a/axolotl/tests/data/datasets/timeseries_dataset_2/timeseries/0004_train_ts.csv b/axolotl/tests/data/datasets/timeseries_dataset_2/timeseries/0004_train_ts.csv new file mode 100644 index 0000000..bb8191e --- /dev/null +++ b/axolotl/tests/data/datasets/timeseries_dataset_2/timeseries/0004_train_ts.csv @@ -0,0 +1,167 @@ +time,value +0,4.1621 +1,4.0772 +2,3.6477 +3,3.5531 +4,3.0601 +5,2.8388 +6,2.553 +7,2.1887 +8,1.5887 +9,1.2968 +10,0.9756600000000001 +11,0.77048 +12,0.10374000000000001 +13,-0.6864899999999999 +14,1.3616 +15,1.2196 +16,0.92115 +17,0.76363 +18,0.38006999999999996 +19,0.08728 
+20,-0.16378 +21,-0.48458999999999997 +22,-0.8103899999999999 +23,-0.6513 +24,-0.56476 +25,-0.63364 +26,1.8541 +27,1.5558 +28,1.2927 +29,0.67728 +30,0.30975 +31,-0.032976 +32,-0.35449 +33,-0.36991 +34,-0.37947 +35,-0.45283 +36,-0.77623 +37,-0.8405299999999999 +38,0.48558999999999997 +39,1.1947 +40,0.54381 +41,0.55455 +42,-0.36416 +43,-0.60013 +44,-0.64847 +45,-0.59737 +46,-0.7189 +47,-0.71638 +48,-0.81893 +49,-0.72714 +50,-0.90655 +51,1.0144 +52,0.68723 +53,0.24664 +54,-0.49659 +55,-0.62345 +56,-0.7178800000000001 +57,-0.51759 +58,-0.56052 +59,-0.64741 +60,-0.78386 +61,-0.58019 +62,0.092265 +63,0.79367 +64,0.36529 +65,0.0030281999999999996 +66,-0.28183 +67,-0.6076699999999999 +68,-0.6735 +69,-0.51994 +70,-0.5429 +71,-0.5239 +72,-0.83519 +73,-0.072636 +74,-0.29081999999999997 +75,-0.41641000000000006 +76,-0.55282 +77,-0.6942699999999999 +78,0.13058 +79,-0.40402 +80,-0.45037 +81,-0.42299 +82,1.022 +83,0.9245 +84,0.5172800000000001 +85,0.47334 +86,0.168 +87,-0.39237 +88,-0.81158 +89,-0.7716 +90,-0.70168 +91,-0.51783 +92,-0.6302 +93,-0.60169 +94,-0.82883 +95,0.81981 +96,0.33216 +97,-0.21643 +98,-0.85967 +99,-0.77095 +100,-0.6786800000000001 +101,-0.81099 +102,0.60083 +103,0.26256 +104,0.26605 +105,0.37041 +106,0.15785 +107,-0.15934 +108,-0.63055 +109,-0.83868 +110,-0.8144100000000001 +111,-0.84034 +112,-0.45428 +113,0.03017 +114,-0.14679 +115,-0.47747 +116,-0.87072 +117,-0.8852200000000001 +118,-0.56418 +119,-0.79096 +120,-0.40602 +121,-0.80889 +122,-0.64132 +123,-0.69466 +124,-0.66477 +125,-0.51452 +126,-0.80548 +127,-0.715 +128,-0.6111300000000001 +129,-0.91214 +130,-0.6806300000000001 +131,-0.8817299999999999 +132,-0.79045 +133,-0.8927799999999999 +134,-0.87331 +135,-0.80257 +136,-0.7348100000000001 +137,-0.21825 +138,1.2362 +139,0.97067 +140,-0.15671 +141,-0.18739 +142,0.29656 +143,-0.46723000000000003 +144,-0.82604 +145,-0.37963 +146,-0.7842600000000001 +147,-0.044379 +148,-0.099467 +149,0.026996 +150,0.23393000000000003 +151,-0.10467 +152,-0.17337 +153,0.52908 +154,2.1264 +155,-0.15232 +156,-0.19658 +157,0.9815799999999999 +158,0.42601000000000006 +159,0.18282 +160,0.2563 +161,-0.1835 +162,-0.51883 +163,-0.80288 +164,-0.60351 +165,-0.65157 diff --git a/axolotl/tests/data/datasets/timeseries_dataset_3/datasetDoc.json b/axolotl/tests/data/datasets/timeseries_dataset_3/datasetDoc.json new file mode 100644 index 0000000..a79187b --- /dev/null +++ b/axolotl/tests/data/datasets/timeseries_dataset_3/datasetDoc.json @@ -0,0 +1,74 @@ +{ + "about": { + "datasetID": "timeseries_dataset_3", + "datasetName": "A test dataset with timeseries data", + "description": "Historical stock market closing data. This dataset is similar to the timeseries_dataset_1. 
The only difference is that the year part of the date column has been removed and to get a full date, the year column must be combined with the date column.", + "source": "Kaggle", + "sourceURI": "https://www.kaggle.com/borismarjanovic/price-volume-data-for-all-us-stocks-etfs", + "datasetSchemaVersion": "4.0.0", + "license": "CC0", + "redacted": false, + "digest": "cca8504382cc80bf4c45c17830cff278c54c38a17b7a9de94c66d43f3df08ad3", + "datasetVersion": "4.0.0" + }, + "dataResources": [ + { + "resID": "learningData", + "resPath": "tables/learningData.csv", + "resType": "table", + "resFormat": { + "text/csv": [ + "csv" + ] + }, + "isCollection": false, + "columnsCount": 5, + "columns": [ + { + "colIndex": 0, + "colName": "d3mIndex", + "colType": "integer", + "role": [ + "index" + ] + }, + { + "colIndex": 1, + "colName": "Company", + "colType": "categorical", + "role": [ + "attribute", + "suggestedGroupingKey" + ] + }, + { + "colIndex": 2, + "colName": "Year", + "colType": "integer", + "role": [ + "attribute", + "timeIndicator", + "suggestedGroupingKey" + ] + }, + { + "colIndex": 3, + "colName": "Date", + "colType": "dateTime", + "role": [ + "attribute", + "timeIndicator" + ] + }, + { + "colIndex": 4, + "colName": "Close", + "colType": "real", + "role": [ + "suggestedTarget" + ] + } + ] + } + ] +} \ No newline at end of file diff --git a/axolotl/tests/data/datasets/timeseries_dataset_3/tables/learningData.csv b/axolotl/tests/data/datasets/timeseries_dataset_3/tables/learningData.csv new file mode 100644 index 0000000..c53600b --- /dev/null +++ b/axolotl/tests/data/datasets/timeseries_dataset_3/tables/learningData.csv @@ -0,0 +1,41 @@ +d3mIndex,Company,Year,Date,Close +50,abbv,2013,11-01,42.653 +51,abbv,2013,11-04,42.48 +52,abbv,2013,11-05,41.965 +53,abbv,2013,11-06,41.861999999999995 +54,abbv,2013,11-07,41.155 +55,abbv,2013,11-08,41.843999999999994 +56,abbv,2013,11-11,42.315 +57,abbv,2013,11-12,41.365 +58,abbv,2013,11-13,41.68899999999999 +59,abbv,2013,11-14,41.853 +60,abbv,2013,11-15,42.201 +61,abbv,2013,11-18,42.576 +62,abbv,2013,11-19,42.22 +63,abbv,2013,11-20,41.773999999999994 +64,abbv,2013,11-21,42.315 +65,abbv,2013,11-22,42.637 +66,abbv,2013,11-25,42.176 +67,abbv,2013,11-26,42.341 +68,abbv,2013,11-27,42.036 +69,abbv,2013,11-29,42.211000000000006 +70,abbv,2013,12-02,42.158 +71,abbv,2013,12-03,43.533 +72,abbv,2013,12-04,43.272 +73,abbv,2013,12-05,43.351000000000006 +74,abbv,2013,12-06,44.735 +75,abbv,2013,12-09,44.615 +76,abbv,2013,12-10,45.426 +77,abbv,2013,12-11,45.886 +78,abbv,2013,12-12,45.643 +79,abbv,2013,12-13,45.625 +80,abbv,2013,12-16,46.498000000000005 +81,abbv,2013,12-17,46.722 +82,abbv,2013,12-18,47.323 +83,abbv,2013,12-19,45.851000000000006 +84,abbv,2013,12-20,45.816 +85,abbv,2013,12-23,45.806999999999995 +86,abbv,2013,12-24,45.617 +87,abbv,2013,12-26,46.166000000000004 +88,abbv,2013,12-27,45.783 +89,abbv,2013,12-30,46.183 diff --git a/axolotl/tests/data/datasets/timeseries_dataset_4/datasetDoc.json b/axolotl/tests/data/datasets/timeseries_dataset_4/datasetDoc.json new file mode 100644 index 0000000..98c9dee --- /dev/null +++ b/axolotl/tests/data/datasets/timeseries_dataset_4/datasetDoc.json @@ -0,0 +1,71 @@ +{ + "about": { + "datasetID": "timeseries_dataset_4", + "datasetName": "A test dataset with unsorted timeseries data", + "description": "Historical stock market closing data, where timestamps are not in chronological order", + "source": "Kaggle", + "sourceURI": "https://www.kaggle.com/borismarjanovic/price-volume-data-for-all-us-stocks-etfs", + "datasetSchemaVersion": 
"4.0.0", + "license": "CC0", + "redacted": false, + "digest": "b72f4c8a758a40264432957fd4795c855644efd231ab80664e7c7b660e08e8b5", + "datasetVersion": "4.0.0" + }, + "dataResources": [ + { + "resID": "learningData", + "resPath": "tables/learningData.csv", + "resType": "table", + "resFormat": { + "text/csv": [ + "csv" + ] + }, + "isCollection": false, + "columnsCount": 5, + "columns": [ + { + "colIndex": 0, + "colName": "d3mIndex", + "colType": "integer", + "role": [ + "index" + ] + }, + { + "colIndex": 1, + "colName": "Company", + "colType": "categorical", + "role": [ + "attribute" + ] + }, + { + "colIndex": 2, + "colName": "Year", + "colType": "categorical", + "role": [ + "attribute" + ] + }, + { + "colIndex": 3, + "colName": "Date", + "colType": "dateTime", + "role": [ + "attribute", + "timeIndicator" + ] + }, + { + "colIndex": 4, + "colName": "Close", + "colType": "real", + "role": [ + "suggestedTarget" + ] + } + ] + } + ] +} \ No newline at end of file diff --git a/axolotl/tests/data/datasets/timeseries_dataset_4/tables/learningData.csv b/axolotl/tests/data/datasets/timeseries_dataset_4/tables/learningData.csv new file mode 100644 index 0000000..e6b78f7 --- /dev/null +++ b/axolotl/tests/data/datasets/timeseries_dataset_4/tables/learningData.csv @@ -0,0 +1,41 @@ +d3mIndex,Company,Year,Date,Close +50,abbv,2013,2013-12-02,42.158 +51,abbv,2013,2013-12-03,43.533 +52,abbv,2013,2013-12-04,43.272 +53,abbv,2013,2013-12-05,43.351000000000006 +54,abbv,2013,2013-12-06,44.735 +55,abbv,2013,2013-12-09,44.615 +56,abbv,2013,2013-12-10,45.426 +57,abbv,2013,2013-12-11,45.886 +58,abbv,2013,2013-12-12,45.643 +59,abbv,2013,2013-12-13,45.625 +60,abbv,2013,2013-12-16,46.498000000000005 +61,abbv,2013,2013-12-17,46.722 +62,abbv,2013,2013-12-18,47.323 +63,abbv,2013,2013-12-19,45.851000000000006 +64,abbv,2013,2013-12-20,45.816 +65,abbv,2013,2013-12-23,45.806999999999995 +66,abbv,2013,2013-12-24,45.617 +67,abbv,2013,2013-12-26,46.166000000000004 +68,abbv,2013,2013-12-27,45.783 +69,abbv,2013,2013-12-30,46.183 +70,abbv,2013,2013-11-01,42.653 +71,abbv,2013,2013-11-04,42.48 +72,abbv,2013,2013-11-05,41.965 +73,abbv,2013,2013-11-06,41.861999999999995 +74,abbv,2013,2013-11-07,41.155 +75,abbv,2013,2013-11-08,41.843999999999994 +76,abbv,2013,2013-11-11,42.315 +77,abbv,2013,2013-11-12,41.365 +78,abbv,2013,2013-11-13,41.68899999999999 +79,abbv,2013,2013-11-14,41.853 +80,abbv,2013,2013-11-15,42.201 +81,abbv,2013,2013-11-18,42.576 +82,abbv,2013,2013-11-19,42.22 +83,abbv,2013,2013-11-20,41.773999999999994 +84,abbv,2013,2013-11-21,42.315 +85,abbv,2013,2013-11-22,42.637 +86,abbv,2013,2013-11-25,42.176 +87,abbv,2013,2013-11-26,42.341 +88,abbv,2013,2013-11-27,42.036 +89,abbv,2013,2013-11-29,42.211000000000006 diff --git a/axolotl/tests/data/datasets/video_dataset_1/datasetDoc.json b/axolotl/tests/data/datasets/video_dataset_1/datasetDoc.json new file mode 100644 index 0000000..edcc544 --- /dev/null +++ b/axolotl/tests/data/datasets/video_dataset_1/datasetDoc.json @@ -0,0 +1,71 @@ +{ + "about": { + "datasetID": "video_dataset_1", + "datasetName": "Test video dataset from HMDB dataset", + "description": "Two videos from the human activity recognition from video dataset.", + "citation": "\n@InProceedings{Kuehne11,\n author= \"Kuehne, H. and Jhuang, H. and Garrote, E. and Poggio, T. 
and Serre, T.\",\n title = \"{HMDB}: a large video database for human motion recognition\",\n booktitle = \"Proceedings of the International Conference on Computer Vision (ICCV)\",\n year = \"2011\",\n}\n", + "license": "No license information provided; citation request", + "source": "SERRE Lab, Brown University", + "sourceURI": "http://serre-lab.clps.brown.edu/resource/hmdb-a-large-human-motion-database/", + "approximateSize": "400 KB", + "datasetSchemaVersion": "4.0.0", + "redacted": false, + "datasetVersion": "4.0.0", + "digest": "a30c93f97309b425746e195a9bfb8e97f03a71c603dd3b733fdb000470db611e" + }, + "dataResources": [ + { + "resID": "0", + "resPath": "media/", + "resType": "video", + "resFormat": { + "video/mp4": [ + "mp4" + ] + }, + "isCollection": true + }, + { + "resID": "learningData", + "resPath": "tables/learningData.csv", + "resType": "table", + "resFormat": { + "text/csv": [ + "csv" + ] + }, + "isCollection": false, + "columnsCount": 3, + "columns": [ + { + "colIndex": 0, + "colName": "d3mIndex", + "colType": "integer", + "role": [ + "index" + ] + }, + { + "colIndex": 1, + "colName": "video_file", + "colType": "string", + "role": [ + "attribute" + ], + "refersTo": { + "resID": "0", + "resObject": "item" + } + }, + { + "colIndex": 2, + "colName": "activity_label", + "colType": "categorical", + "role": [ + "suggestedTarget" + ] + } + ] + } + ] +} \ No newline at end of file diff --git a/axolotl/tests/data/datasets/video_dataset_1/media/April_09_brush_hair_u_nm_np1_ba_goo_0.avi.mp4 b/axolotl/tests/data/datasets/video_dataset_1/media/April_09_brush_hair_u_nm_np1_ba_goo_0.avi.mp4 new file mode 100644 index 0000000..1596262 Binary files /dev/null and b/axolotl/tests/data/datasets/video_dataset_1/media/April_09_brush_hair_u_nm_np1_ba_goo_0.avi.mp4 differ diff --git a/axolotl/tests/data/datasets/video_dataset_1/media/Jessica_and_Gregs_Cartwheel_Competition_cartwheel_f_cm_np1_ba_med_1.avi.mp4 b/axolotl/tests/data/datasets/video_dataset_1/media/Jessica_and_Gregs_Cartwheel_Competition_cartwheel_f_cm_np1_ba_med_1.avi.mp4 new file mode 100644 index 0000000..7bef998 Binary files /dev/null and b/axolotl/tests/data/datasets/video_dataset_1/media/Jessica_and_Gregs_Cartwheel_Competition_cartwheel_f_cm_np1_ba_med_1.avi.mp4 differ diff --git a/axolotl/tests/data/datasets/video_dataset_1/tables/learningData.csv b/axolotl/tests/data/datasets/video_dataset_1/tables/learningData.csv new file mode 100644 index 0000000..3e38c66 --- /dev/null +++ b/axolotl/tests/data/datasets/video_dataset_1/tables/learningData.csv @@ -0,0 +1,3 @@ +d3mIndex,video_file,activity_label +0,Jessica_and_Gregs_Cartwheel_Competition_cartwheel_f_cm_np1_ba_med_1.avi.mp4,cartwheel +1,April_09_brush_hair_u_nm_np1_ba_goo_0.avi.mp4,brush_hair diff --git a/axolotl/tests/data/docker/summing/Dockerfile b/axolotl/tests/data/docker/summing/Dockerfile new file mode 100644 index 0000000..b88d36a --- /dev/null +++ b/axolotl/tests/data/docker/summing/Dockerfile @@ -0,0 +1,17 @@ +FROM registry.gitlab.com/datadrivendiscovery/images/base:ubuntu-bionic-python36 + +EXPOSE 8000/tcp + +RUN apt-get update -q -q && \ + apt-get install --yes --force-yes runit + +COPY ./runsvdir-start /usr/local/sbin/runsvdir-start +COPY ./requirements.txt /requirements.txt +COPY ./code /code +COPY ./etc /etc + +RUN pip3 install -r /requirements.txt && rm -f /requirements.txt + +WORKDIR /code + +ENTRYPOINT ["/usr/local/sbin/runsvdir-start"] diff --git a/axolotl/tests/data/docker/summing/README.md b/axolotl/tests/data/docker/summing/README.md new file mode 100644 index 
0000000..e47a451 --- /dev/null +++ b/axolotl/tests/data/docker/summing/README.md @@ -0,0 +1,3 @@ +A Docker image with a simple HTTP service on port 8000: HTTP POST to it expects +a pickled numpy array or a list (of lists) as payload and sums all elements +together into a scalar. diff --git a/axolotl/tests/data/docker/summing/code/server.py b/axolotl/tests/data/docker/summing/code/server.py new file mode 100755 index 0000000..01a0d1d --- /dev/null +++ b/axolotl/tests/data/docker/summing/code/server.py @@ -0,0 +1,59 @@ +#!/usr/bin/env python3 + +import collections +import logging +import pickle +from http import server + +import pandas + +logger = logging.getLogger(__name__) + + +class HTTPServer(server.HTTPServer): + def handle_error(self, request, client_address): + logger.exception("Exception happened during processing of request from %(client_address)s.", {'client_address': client_address}) + + +class HTTPRequestHandler(server.BaseHTTPRequestHandler): + def do_POST(self): + data = self.rfile.read(int(self.headers['Content-Length'])) + # In the future, we should read here just an ObjectId of data + # in Arrow format in Plasma store and read it from there. + value = pickle.loads(data) + sum = self.sum(value) + result = str(sum).encode('utf-8') + + self.send_response(200) + self.send_header('Content-Length', len(result)) + self.end_headers() + self.wfile.write(result) + + def sum(self, value): + if isinstance(value, pandas.DataFrame): + return sum(self.sum(v) for v in value.itertuples(index=False, name=None)) + if isinstance(value, collections.Iterable): + return sum(self.sum(v) for v in value) + else: + return value + + def log_message(self, message, *args): + logger.info(message, *args) + + +if __name__ == '__main__': + PORT = 8000 + + logging.basicConfig(level=logging.INFO) + + logger.info("Listening on port %(port)s.", {'port': PORT}) + + httpd = HTTPServer(('', PORT), HTTPRequestHandler) + + try: + httpd.serve_forever() + except KeyboardInterrupt: + pass + + httpd.server_close() + logging.info("Server stopped.") diff --git a/axolotl/tests/data/docker/summing/etc/service/summing/run b/axolotl/tests/data/docker/summing/etc/service/summing/run new file mode 100755 index 0000000..5d300c7 --- /dev/null +++ b/axolotl/tests/data/docker/summing/etc/service/summing/run @@ -0,0 +1,4 @@ +#!/bin/bash -e + +cd /code +exec chpst -u nobody:nogroup ./server.py 2>&1 diff --git a/axolotl/tests/data/docker/summing/requirements.txt b/axolotl/tests/data/docker/summing/requirements.txt new file mode 100644 index 0000000..65e7abc --- /dev/null +++ b/axolotl/tests/data/docker/summing/requirements.txt @@ -0,0 +1,2 @@ +pandas==0.21.1 +numpy==1.13.3 diff --git a/axolotl/tests/data/docker/summing/runsvdir-start b/axolotl/tests/data/docker/summing/runsvdir-start new file mode 100755 index 0000000..a8dec8a --- /dev/null +++ b/axolotl/tests/data/docker/summing/runsvdir-start @@ -0,0 +1,5 @@ +#!/bin/sh + +export PATH=/usr/local/bin:/usr/local/sbin:/bin:/sbin:/usr/bin:/usr/sbin:/usr/X11R6/bin + +exec runsvdir -P /etc/service 'log: ...........................................................................................................................................................................................................................................................................................................................................................................................................' 
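The summing service above accepts an HTTP POST whose body is a pickled numpy array or (nested) list and replies with the sum rendered as plain text. As a minimal illustration only (not part of this repository), a client could talk to it as in the sketch below; the address `localhost:8000` is an assumption based on the Dockerfile's `EXPOSE 8000/tcp`, and `urllib.request` is used simply because the server returns a small text body.
```
#!/usr/bin/env python3
# Minimal client sketch for the summing service (illustrative only, not part of the repo).
# Assumes the container is reachable at localhost:8000, matching the Dockerfile's EXPOSE 8000.

import pickle
import urllib.request

# A list of lists, as the README describes; the server sums all elements recursively.
payload = pickle.dumps([[1, 2, 3], [4, 5]])

request = urllib.request.Request(
    'http://localhost:8000/',
    data=payload,
    method='POST',
)

with urllib.request.urlopen(request) as response:
    # The server replies with the sum as UTF-8 text, e.g. b'15'.
    print(response.read().decode('utf-8'))
```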
diff --git a/axolotl/tests/data/generate-database-datasets.py b/axolotl/tests/data/generate-database-datasets.py new file mode 100755 index 0000000..c0e55e2 --- /dev/null +++ b/axolotl/tests/data/generate-database-datasets.py @@ -0,0 +1,403 @@ +#!/usr/bin/env python3 + +import argparse +import enum +import json +import os +import os.path +import sys + +import numpy +import pandas + +from d3m import container + + +class DatasetType(enum.Enum): + COUNTS_PER_USER = 1 + COMMENTS_PER_POST = 2 + HAS_USER_MADE_COMMENT_ON_POST = 3 + + +def pareto_choice(random_state, array, size): + # 80/20 rule. + a = 1.161 + + p = random_state.pareto(a, size=len(array)) + 1 + p /= numpy.sum(p) + + return random_state.choice(array, size=size, replace=True, p=p) + + +def generate_main_resources(random_state, resources, size): + users_count = size + posts_count = size * 10 + comments_count = size * 10 + + user_ids = numpy.array(range(users_count)) + post_ids = numpy.array(range(posts_count)) + comment_ids = numpy.array(range(comments_count)) + + users = container.DataFrame({ + 'id': user_ids, + 'name': [f'User {i}' for i in range(users_count)], + }) + + posts = container.DataFrame({ + 'id': post_ids, + 'author_id': pareto_choice(random_state, user_ids, posts_count), + 'post': [f'Post {i}' for i in range(posts_count)], + }) + + comments = container.DataFrame({ + 'id': comment_ids, + 'post_id': pareto_choice(random_state, post_ids, comments_count), + 'author_id': pareto_choice(random_state, user_ids, comments_count), + 'comment': [f'Comment {i}' for i in range(comments_count)], + }) + + resources.update({'users': users, 'posts': posts, 'comments': comments}) + + +def generate_learning_data_counts_per_user(random_state, resources): + user_ids = resources['users'].loc[:, 'id'] + users_count = len(user_ids) + posts = resources['posts'] + comments = resources['comments'] + + learning_data = container.DataFrame({ + 'd3mIndex': numpy.array(range(users_count)), + 'user_id': user_ids, + 'posts_count': [(posts.loc[:, 'author_id'] == user_id).sum() for user_id in user_ids], + 'comments_count': [(comments.loc[:, 'author_id'] == user_id).sum() for user_id in user_ids], + }) + + resources['learningData'] = learning_data + + +def generate_learning_data_comments_per_post(random_state, resources): + post_ids = resources['posts'].loc[:, 'id'] + posts_count = len(post_ids) + comments = resources['comments'] + + learning_data = container.DataFrame({ + 'd3mIndex': numpy.array(range(posts_count)), + 'post_id': post_ids, + 'comments_count': [(comments.loc[:, 'post_id'] == post_id).sum() for post_id in post_ids], + }) + + resources['learningData'] = learning_data + + +def generate_learning_data_has_user_made_comment_on_post(random_state, resources): + user_ids = resources['users'].loc[:, 'id'] + post_ids = resources['posts'].loc[:, 'id'] + users_count = len(user_ids) + comments = resources['comments'] + + authors_and_posts = comments.loc[:, ['author_id', 'post_id']] + + authors_and_posts_set = set(authors_and_posts.itertuples(index=False, name=None)) + + data = { + 'user_id': [], + 'post_id': [], + 'made_comment': [], + } + + for author_id, post_id in authors_and_posts.sample(n=users_count, random_state=random_state).itertuples(index=False, name=None): + data['user_id'].append(author_id) + data['post_id'].append(post_id) + data['made_comment'].append('yes') + + for user_id in random_state.permutation(user_ids): + for post_id in random_state.permutation(post_ids): + if (user_id, post_id) in authors_and_posts_set: + continue + + 
data['user_id'].append(user_id) + data['post_id'].append(post_id) + data['made_comment'].append('no') + + if len(data['user_id']) == 2 * users_count: + break + + if len(data['user_id']) == 2 * users_count: + break + + assert len(data['user_id']) == 2 * users_count + + data = container.DataFrame(data) + data = data.sample(frac=1.0, random_state=random_state).reset_index(drop=True) + + index = container.DataFrame({ + 'd3mIndex': numpy.array(range(len(data))), + }) + + resources['learningData'] = container.DataFrame(pandas.concat([index, data], axis=1)) + + +def update_metadata_main_resources(dataset, dataset_id, dataset_type, size, random_seed): + dataset.metadata = dataset.metadata.update((), { + 'id': dataset_id, + 'name': f"Database dataset of type {dataset_type}", + 'description': f"Database dataset of type {dataset_type}, size {size}, random seed {random_seed}", + }) + + dataset.metadata = dataset.metadata.update_column(0, { + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/PrimaryKey', 'http://schema.org/Integer'], + }, at=('users',)) + dataset.metadata = dataset.metadata.update_column(1, { + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute', 'http://schema.org/Text'], + }, at=('users',)) + + dataset.metadata = dataset.metadata.update_column(0, { + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/PrimaryKey', 'http://schema.org/Integer'], + }, at=('posts',)) + dataset.metadata = dataset.metadata.update_column(1, { + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute', 'http://schema.org/Integer'], + 'foreign_key': { + 'type': 'COLUMN', + 'resource_id': 'users', + 'column_index': 0, + }, + }, at=('posts',)) + dataset.metadata = dataset.metadata.update_column(2, { + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute', 'http://schema.org/Text'], + }, at=('posts',)) + + dataset.metadata = dataset.metadata.update_column(0, { + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/PrimaryKey', 'http://schema.org/Integer'], + }, at=('comments',)) + dataset.metadata = dataset.metadata.update_column(1, { + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute', 'http://schema.org/Integer'], + 'foreign_key': { + 'type': 'COLUMN', + 'resource_id': 'posts', + 'column_index': 0, + }, + }, at=('comments',)) + dataset.metadata = dataset.metadata.update_column(2, { + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute', 'http://schema.org/Integer'], + 'foreign_key': { + 'type': 'COLUMN', + 'resource_id': 'users', + 'column_index': 0, + }, + }, at=('comments',)) + dataset.metadata = dataset.metadata.update_column(3, { + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute', 'http://schema.org/Text'], + }, at=('comments',)) + + +def update_metadata_counts_per_user(dataset): + dataset.metadata = dataset.metadata.update_column(0, { + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/PrimaryKey', 'http://schema.org/Integer'], + }, at=('learningData',)) + dataset.metadata = dataset.metadata.update_column(1, { + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute', 'http://schema.org/Integer'], + 'foreign_key': { + 'type': 'COLUMN', + 'resource_id': 'users', + 'column_index': 0, + }, + }, at=('learningData',)) + dataset.metadata = dataset.metadata.update_column(2, { + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/SuggestedTarget', 
'http://schema.org/Integer'], + }, at=('learningData',)) + dataset.metadata = dataset.metadata.update_column(3, { + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/SuggestedTarget', 'http://schema.org/Integer'], + }, at=('learningData',)) + + +def update_metadata_comments_per_post(dataset): + dataset.metadata = dataset.metadata.update_column(0, { + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/PrimaryKey', 'http://schema.org/Integer'], + }, at=('learningData',)) + dataset.metadata = dataset.metadata.update_column(1, { + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute', 'http://schema.org/Integer'], + 'foreign_key': { + 'type': 'COLUMN', + 'resource_id': 'posts', + 'column_index': 0, + }, + }, at=('learningData',)) + dataset.metadata = dataset.metadata.update_column(2, { + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/SuggestedTarget', 'http://schema.org/Integer'], + }, at=('learningData',)) + + +def update_metadata_has_user_made_comment_on_post(dataset): + dataset.metadata = dataset.metadata.update_column(0, { + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/PrimaryKey', 'http://schema.org/Integer'], + }, at=('learningData',)) + dataset.metadata = dataset.metadata.update_column(1, { + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute', 'http://schema.org/Integer'], + 'foreign_key': { + 'type': 'COLUMN', + 'resource_id': 'users', + 'column_index': 0, + }, + }, at=('learningData',)) + dataset.metadata = dataset.metadata.update_column(2, { + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute', 'http://schema.org/Integer'], + 'foreign_key': { + 'type': 'COLUMN', + 'resource_id': 'posts', + 'column_index': 0, + }, + }, at=('learningData',)) + dataset.metadata = dataset.metadata.update_column(3, { + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/SuggestedTarget', 'http://schema.org/Boolean'], + }, at=('learningData',)) + + +def handler(arguments): + random_state = numpy.random.RandomState(arguments.random_seed) + + resources = {} + generate_main_resources(random_state, resources, arguments.size) + + if arguments.dataset_type == DatasetType.COUNTS_PER_USER: + generate_learning_data_counts_per_user(random_state, resources) + + elif arguments.dataset_type == DatasetType.COMMENTS_PER_POST: + generate_learning_data_comments_per_post(random_state, resources) + + elif arguments.dataset_type == DatasetType.HAS_USER_MADE_COMMENT_ON_POST: + generate_learning_data_has_user_made_comment_on_post(random_state, resources) + + else: + raise ValueError(f"Unknown dataset type: {arguments.dataset_type.name}") + + dataset = container.Dataset(resources, generate_metadata=True) + update_metadata_main_resources(dataset, arguments.dataset_id, arguments.dataset_type.name, arguments.size, arguments.random_seed) + + if arguments.dataset_type == DatasetType.COUNTS_PER_USER: + update_metadata_counts_per_user(dataset) + + elif arguments.dataset_type == DatasetType.COMMENTS_PER_POST: + update_metadata_comments_per_post(dataset) + + elif arguments.dataset_type == DatasetType.HAS_USER_MADE_COMMENT_ON_POST: + update_metadata_has_user_made_comment_on_post(dataset) + + else: + raise ValueError(f"Unknown dataset type: {arguments.dataset_type.name}") + + dataset_output_uri = 'file://' + os.path.join(os.path.abspath(arguments.output_dir), arguments.dataset_id, 'datasetDoc.json') + + dataset.save(dataset_output_uri) + + 
os.makedirs(os.path.join(os.path.abspath(arguments.output_dir), arguments.problem_id)) + + with open(os.path.join(os.path.abspath(arguments.output_dir), arguments.problem_id, 'problemDoc.json'), 'x', encoding='utf8') as problem_file: + if arguments.dataset_type == DatasetType.COUNTS_PER_USER: + task_keywords = ['regression', 'multivariate'] + metric = { + 'metric': 'rootMeanSquaredError', + } + targets = [ + { + 'targetIndex': 0, + 'resID': 'learningData', + 'colIndex': 2, + 'colName': 'posts_count', + }, + { + 'targetIndex': 1, + 'resID': 'learningData', + 'colIndex': 3, + 'colName': 'comments_count', + }, + ] + elif arguments.dataset_type == DatasetType.COMMENTS_PER_POST: + task_keywords = ['regression', 'univariate'] + metric = { + 'metric': 'rootMeanSquaredError', + } + targets = [ + { + 'targetIndex': 0, + 'resID': 'learningData', + 'colIndex': 2, + 'colName': 'comments_count', + }, + ] + elif arguments.dataset_type == DatasetType.HAS_USER_MADE_COMMENT_ON_POST: + task_keywords = ['classification', 'binary'] + metric = { + 'metric': 'f1', + 'posLabel': 'yes', + } + targets = [ + { + 'targetIndex': 0, + 'resID': 'learningData', + 'colIndex': 3, + 'colName': 'made_comment', + }, + ] + + json.dump({ + 'about': { + 'problemID': arguments.problem_id, + 'problemName': f"Database problem of type {arguments.dataset_type.name}", + 'taskKeywords': task_keywords, + 'problemSchemaVersion': '4.0.0', + }, + 'inputs': { + 'data': [ + { + 'datasetID': arguments.dataset_id, + 'targets': targets, + }, + ], + 'performanceMetrics': [ + metric, + ], + }, + 'expectedOutputs': { + 'predictionsFile': 'predictions.csv', + 'scoresFile': 'scores.csv', + }, + }, problem_file, indent=2) + + +def main(argv): + parser = argparse.ArgumentParser(description="Generate database datasets.") + + parser.add_argument( + '--dataset-type', choices=[dataset_type.name for dataset_type in DatasetType], action='store', required=True, + help="what type of dataset to generate", + ) + parser.add_argument( + '--dataset-id', action='store', required=True, + help="dataset ID to use", + ) + parser.add_argument( + '--problem-id', action='store', required=True, + help="problem ID to use", + ) + parser.add_argument( + '--random-seed', type=int, action='store', default=0, + help="random seed to use", + ) + parser.add_argument( + '--size', type=int, action='store', default=1000, + help="size of dataset to generate", + ) + parser.add_argument( + '--output-dir', action='store', default='.', + help="directory where to store generated dataset and problem, default is current directory", + ) + + arguments = parser.parse_args(argv[1:]) + + arguments.dataset_type = DatasetType[arguments.dataset_type] + + handler(arguments) + + +if __name__ == '__main__': + main(sys.argv) diff --git a/axolotl/tests/data/pipelines/data-preparation-no-split.yml b/axolotl/tests/data/pipelines/data-preparation-no-split.yml new file mode 100644 index 0000000..2a1ff87 --- /dev/null +++ b/axolotl/tests/data/pipelines/data-preparation-no-split.yml @@ -0,0 +1,36 @@ +id: fcaddd7f-39fa-49cc-9f31-c3f326b85557 +schema: https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json +source: + name: Daragh +created: "2019-04-29T04:16:39.642266Z" +context: TESTING +name: Data preparation pipeline - no split version +description: | + Data preparation which just passes all data through unchanged +inputs: + - name: folds + - name: full dataset +outputs: + - name: train datasets + data: steps.0.produce + - name: test datasets + data: steps.0.produce + - name: score datasets + data: 
steps.0.produce +steps: + # Step 0. + - type: PRIMITIVE + primitive: + id: 48c683ad-da9e-48cf-b3a0-7394dba5e5d2 + version: 0.1.0 + python_path: d3m.primitives.evaluation.no_split_dataset_split.Common + name: No-split tabular dataset splits + arguments: + inputs: + type: CONTAINER + data: inputs.0 + dataset: + type: CONTAINER + data: inputs.1 + outputs: + - id: produce diff --git a/axolotl/tests/data/pipelines/data-preparation-train-test-split.yml b/axolotl/tests/data/pipelines/data-preparation-train-test-split.yml new file mode 100644 index 0000000..5ff94d6 --- /dev/null +++ b/axolotl/tests/data/pipelines/data-preparation-train-test-split.yml @@ -0,0 +1,37 @@ +id: 0168fd77-5310-472e-a755-1bb89edcbffd +schema: https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json +source: + name: Mitar +created: "2019-05-01T23:54:43.334702Z" +context: TESTING +name: Data preparation pipeline - train-test split version +description: | + Data preparation which does train-test split but does not redact anything +inputs: + - name: folds + - name: full dataset +outputs: + - name: train datasets + data: steps.0.produce + - name: test datasets + data: steps.0.produce_score_data + - name: score datasets + data: steps.0.produce_score_data +steps: + # Step 0. + - type: PRIMITIVE + primitive: + id: 3fcc6dc4-6681-4c86-948e-066d14e7d803 + version: 0.1.0 + python_path: d3m.primitives.evaluation.train_score_dataset_split.Common + name: Train-score tabular dataset splits + arguments: + inputs: + type: CONTAINER + data: inputs.0 + dataset: + type: CONTAINER + data: inputs.1 + outputs: + - id: produce + - id: produce_score_data diff --git a/axolotl/tests/data/pipelines/fake_compute_score.yml b/axolotl/tests/data/pipelines/fake_compute_score.yml new file mode 100644 index 0000000..d73546f --- /dev/null +++ b/axolotl/tests/data/pipelines/fake_compute_score.yml @@ -0,0 +1,31 @@ +id: de6443dd-de0a-4000-9b4e-383920019571 +schema: https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json +source: + name: Mitar +created: "2018-07-27T18:04:43.650608Z" +name: Fake scoring pipeline +description: | + A fake scoring pipeline calling fake_score primitive. +inputs: + - name: predictions + - name: score dataset +outputs: + - name: scores + data: steps.0.produce +steps: + # Step 0. + - type: PRIMITIVE + primitive: + id: 1c4d5cbd-163c-424d-8be5-0f267641ae34 + version: 0.1.0 + python_path: d3m.primitives.evaluation.compute_scores.Test + name: Generate fake scores for testing + arguments: + inputs: + type: CONTAINER + data: inputs.0 + score_dataset: + type: CONTAINER + data: inputs.1 + outputs: + - id: produce diff --git a/axolotl/tests/data/pipelines/increment-dataframe.yml b/axolotl/tests/data/pipelines/increment-dataframe.yml new file mode 100644 index 0000000..95d366b --- /dev/null +++ b/axolotl/tests/data/pipelines/increment-dataframe.yml @@ -0,0 +1,55 @@ +id: 1b6184c1-49ba-44f8-b02d-90fb41e65e1a +schema: https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json +created: "2018-10-08 02:56:45.277695Z" +description: | + Test pipeline used in simple-ta3. It expects a DataFrame as input. 
+inputs: + - name: dataframe +outputs: + - data: steps.2.produce +steps: + - type: PRIMITIVE + primitive: + id: d7e14b12-abeb-42d8-942f-bdb077b4fd37 + version: 0.1.0 + python_path: d3m.primitives.data_transformation.add_semantic_types.Common + name: Add semantic types to columns + arguments: + inputs: + type: CONTAINER + data: inputs.0 + outputs: + - id: produce + hyperparams: + columns: + type: VALUE + data: + - 0 + semantic_types: + type: VALUE + data: + - http://schema.org/Integer + - type: PRIMITIVE + primitive: + id: d510cb7a-1782-4f51-b44c-58f0236e47c7 + version: 0.6.0 + python_path: d3m.primitives.data_transformation.column_parser.Common + name: Parses strings into their types + arguments: + inputs: + type: CONTAINER + data: steps.0.produce + outputs: + - id: produce + - type: PRIMITIVE + primitive: + id: 5c9d5acf-7754-420f-a49f-90f4d9d0d694 + version: 0.1.0 + python_path: d3m.primitives.operator.increment.Test + name: Increment Values + arguments: + inputs: + type: CONTAINER + data: steps.1.produce + outputs: + - id: produce diff --git a/axolotl/tests/data/pipelines/multi-input-test.json b/axolotl/tests/data/pipelines/multi-input-test.json new file mode 100644 index 0000000..627ff86 --- /dev/null +++ b/axolotl/tests/data/pipelines/multi-input-test.json @@ -0,0 +1,85 @@ +{ + "created": "2019-03-14T03:22:12.490865Z", + "id": "f52d3690-31fb-458c-ad4c-4c2be11f5f36", + "inputs": [ + { + "name": "inputs1" + }, + { + "name": "inputs0" + } + ], + "outputs": [ + { + "data": "steps.2.produce", + "name": "output" + } + ], + "schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", + "steps": [ + { + "type": "PRIMITIVE", + "primitive": { + "id": "4b42ce1e-9b98-4a25-b68e-fad13311eb65", + "name": "Extract a DataFrame from a Dataset", + "python_path": "d3m.primitives.data_transformation.dataset_to_dataframe.Common", + "version": "0.3.0" + }, + "arguments": { + "inputs": { + "data": "inputs.0", + "type": "CONTAINER" + } + }, + "outputs": [ + { + "id": "produce" + } + ] + }, + { + "type": "PRIMITIVE", + "primitive": { + "id": "4b42ce1e-9b98-4a25-b68e-fad13311eb65", + "name": "Extract a DataFrame from a Dataset", + "python_path": "d3m.primitives.data_transformation.dataset_to_dataframe.Common", + "version": "0.3.0" + }, + "arguments": { + "inputs": { + "data": "inputs.1", + "type": "CONTAINER" + } + }, + "outputs": [ + { + "id": "produce" + } + ] + }, + { + "type": "PRIMITIVE", + "primitive": { + "id": "b8d0d982-fc53-4a3f-8a8c-a284fdd45bfd", + "name": "Random Classifier", + "python_path": "d3m.primitives.classification.random_classifier.Test", + "version": "0.1.0" + }, + "arguments": { + "inputs": { + "data": "steps.0.produce", + "type": "CONTAINER" + }, + "outputs": { + "data": "steps.1.produce", + "type": "CONTAINER" + } + }, + "outputs": [ + { + "id": "produce" + } + ] + } + ] +} diff --git a/axolotl/tests/data/pipelines/random-classifier.yml b/axolotl/tests/data/pipelines/random-classifier.yml new file mode 100644 index 0000000..79c866e --- /dev/null +++ b/axolotl/tests/data/pipelines/random-classifier.yml @@ -0,0 +1,58 @@ +id: b1e92676-f3c4-4c10-a014-b33a55217540 +schema: https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json +source: + name: Mitar +created: "2019-05-04T09:42:27.443844Z" +context: TESTING +name: A random classifier pipeline for tests +description: | + A simple pipeline which runs Random classifier on tabular data. + It does not depend on TrueTarget and is useful just for testing. 
+inputs: + - name: input dataset +outputs: + - name: predictions + data: steps.2.produce +steps: + # Step 0. + - type: PRIMITIVE + primitive: + id: 4b42ce1e-9b98-4a25-b68e-fad13311eb65 + version: 0.3.0 + python_path: d3m.primitives.data_transformation.dataset_to_dataframe.Common + name: Extract a DataFrame from a Dataset + arguments: + inputs: + type: CONTAINER + data: inputs.0 + outputs: + - id: produce + # Step 1. + - type: PRIMITIVE + primitive: + id: d510cb7a-1782-4f51-b44c-58f0236e47c7 + version: 0.6.0 + python_path: d3m.primitives.data_transformation.column_parser.Common + name: Parses strings into their types + arguments: + inputs: + type: CONTAINER + data: steps.0.produce + outputs: + - id: produce + # Step 2. + - type: PRIMITIVE + primitive: + id: b8d0d982-fc53-4a3f-8a8c-a284fdd45bfd + version: 0.1.0 + python_path: d3m.primitives.classification.random_classifier.Test + name: Random Classifier + arguments: + inputs: + type: CONTAINER + data: steps.1.produce + outputs: + type: CONTAINER + data: steps.1.produce + outputs: + - id: produce diff --git a/axolotl/tests/data/pipelines/random-forest-classifier.yml b/axolotl/tests/data/pipelines/random-forest-classifier.yml new file mode 100644 index 0000000..cfc3062 --- /dev/null +++ b/axolotl/tests/data/pipelines/random-forest-classifier.yml @@ -0,0 +1,74 @@ +id: 9ae0b7f5-613a-4ca2-975f-83cf9317a03c +schema: https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json +source: + name: Mitar +created: "2018-02-28T09:42:27.443844Z" +context: TESTING +name: A random forest classifier pipeline for tests +description: | + A simple pipeline which runs Random Forest classifier on tabular data. + It does not do imputation so not useful as a general pipeline. +inputs: + - name: input dataset +outputs: + - name: predictions + data: steps.3.produce +steps: + # Step 0. + - type: PRIMITIVE + primitive: + id: 4b42ce1e-9b98-4a25-b68e-fad13311eb65 + version: 0.3.0 + python_path: d3m.primitives.data_transformation.dataset_to_dataframe.Common + name: Extract a DataFrame from a Dataset + arguments: + inputs: + type: CONTAINER + data: inputs.0 + outputs: + - id: produce + # Step 1. + - type: PRIMITIVE + primitive: + id: d510cb7a-1782-4f51-b44c-58f0236e47c7 + version: 0.6.0 + python_path: d3m.primitives.data_transformation.column_parser.Common + name: Parses strings into their types + arguments: + inputs: + type: CONTAINER + data: steps.0.produce + outputs: + - id: produce + # Step 2. + - type: PRIMITIVE + primitive: + id: 37c2b19d-bdab-4a30-ba08-6be49edcc6af + version: 0.4.0 + python_path: d3m.primitives.classification.random_forest.Common + name: Random forest classifier + arguments: + inputs: + type: CONTAINER + data: steps.1.produce + outputs: + type: CONTAINER + data: steps.1.produce + outputs: + - id: produce + # Step 3. 
+ - type: PRIMITIVE + primitive: + id: 8d38b340-f83f-4877-baaa-162f8e551736 + version: 0.3.0 + python_path: d3m.primitives.data_transformation.construct_predictions.Common + name: Construct pipeline predictions output + arguments: + inputs: + type: CONTAINER + data: steps.2.produce + reference: + type: CONTAINER + data: steps.1.produce + outputs: + - id: produce diff --git a/axolotl/tests/data/pipelines/random-sample.yml b/axolotl/tests/data/pipelines/random-sample.yml new file mode 100644 index 0000000..abf0756 --- /dev/null +++ b/axolotl/tests/data/pipelines/random-sample.yml @@ -0,0 +1,32 @@ +id: 30e8be4b-3aec-447b-9c36-f9a37c81c3ed +schema: https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json +created: "2018-07-27T15:40:34.012397Z" +inputs: + - name: indices +outputs: + - data: steps.1.produce +steps: + - type: PRIMITIVE + primitive: + id: df3153a1-4411-47e2-bbc0-9d5e9925ad79 + version: 0.1.0 + python_path: d3m.primitives.data_generation.random.Test + name: Random Samples + arguments: + inputs: + type: CONTAINER + data: inputs.0 + outputs: + - id: produce + - type: PRIMITIVE + primitive: + id: 5c9d5acf-7754-420f-a49f-90f4d9d0d694 + version: 0.1.0 + python_path: d3m.primitives.operator.increment.Test + name: Increment Values + arguments: + inputs: + type: CONTAINER + data: steps.0.produce + outputs: + - id: produce diff --git a/axolotl/tests/data/pipelines/semi-standard-pipeline.json b/axolotl/tests/data/pipelines/semi-standard-pipeline.json new file mode 100644 index 0000000..72af83a --- /dev/null +++ b/axolotl/tests/data/pipelines/semi-standard-pipeline.json @@ -0,0 +1,67 @@ +{ + "created": "2019-03-14T03:22:12.490865Z", + "id": "f52d3690-31fb-458c-ad4c-4c2be11f5f36", + "description": "A non-standard pipeline, which still takes a Dataset input, but returns two outputs.", + "inputs": [ + { + "name": "inputs0" + } + ], + "outputs": [ + { + "data": "steps.1.produce", + "name": "output" + }, + { + "data": "steps.0.produce", + "name": "dataframe-output" + } + ], + "schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", + "steps": [ + { + "type": "PRIMITIVE", + "primitive": { + "id": "4b42ce1e-9b98-4a25-b68e-fad13311eb65", + "name": "Extract a DataFrame from a Dataset", + "python_path": "d3m.primitives.data_transformation.dataset_to_dataframe.Common", + "version": "0.3.0" + }, + "arguments": { + "inputs": { + "data": "inputs.0", + "type": "CONTAINER" + } + }, + "outputs": [ + { + "id": "produce" + } + ] + }, + { + "type": "PRIMITIVE", + "primitive": { + "id": "b8d0d982-fc53-4a3f-8a8c-a284fdd45bfd", + "name": "Random Classifier", + "python_path": "d3m.primitives.classification.random_classifier.Test", + "version": "0.1.0" + }, + "arguments": { + "inputs": { + "data": "steps.0.produce", + "type": "CONTAINER" + }, + "outputs": { + "data": "steps.0.produce", + "type": "CONTAINER" + } + }, + "outputs": [ + { + "id": "produce" + } + ] + } + ] +} diff --git a/axolotl/tests/data/primitives/setup.cfg b/axolotl/tests/data/primitives/setup.cfg new file mode 100644 index 0000000..b6a8bc3 --- /dev/null +++ b/axolotl/tests/data/primitives/setup.cfg @@ -0,0 +1,25 @@ +[pycodestyle] +max-line-length = 200 + +[metadata] +description-file = README.md + +[mypy] +warn_redundant_casts = True +# TODO: Enable back once false positives are fixed. 
+# See: https://github.com/python/mypy/issues/4412 +#warn_unused_ignores = True +warn_unused_configs = True +disallow_untyped_defs = True + +# TODO: Remove once this is fixed: https://github.com/python/mypy/issues/4300 +[mypy-d3m.container.list] +ignore_errors = True + +# TODO: Remove once this is fixed: https://github.com/python/mypy/issues/4300 +[mypy-d3m.metadata.hyperparams] +ignore_errors = True + +# TODO: Remove once this is fixed: https://github.com/python/mypy/pull/4384#issuecomment-354033177 +[mypy-d3m.primitive_interfaces.distance] +ignore_errors = True diff --git a/axolotl/tests/data/primitives/setup.py b/axolotl/tests/data/primitives/setup.py new file mode 100644 index 0000000..8eafc35 --- /dev/null +++ b/axolotl/tests/data/primitives/setup.py @@ -0,0 +1,42 @@ +import os +from setuptools import setup, find_packages + +PACKAGE_NAME = 'test_primitives' + + +def read_package_variable(key): + """Read the value of a variable from the package without importing.""" + module_path = os.path.join(PACKAGE_NAME, '__init__.py') + with open(module_path) as module: + for line in module: + parts = line.strip().split(' ') + if parts and parts[0] == key: + return parts[-1].strip("'") + raise KeyError("'{0}' not found in '{1}'".format(key, module_path)) + + +setup( + name=PACKAGE_NAME, + version=read_package_variable('__version__'), + description='Test primitives', + author=read_package_variable('__author__'), + packages=find_packages(exclude=['contrib', 'docs', 'tests*']), + install_requires=[ + 'd3m', + ], + url='https://gitlab.com/datadrivendiscovery/tests-data', + keywords='d3m_primitive', + entry_points={ + 'd3m.primitives': [ + 'regression.monomial.Test = test_primitives.monomial:MonomialPrimitive', + 'operator.increment.Test = test_primitives.increment:IncrementPrimitive', + 'operator.sum.Test = test_primitives.sum:SumPrimitive', + 'data_generation.random.Test = test_primitives.random:RandomPrimitive', + 'operator.primitive_sum.Test = test_primitives.primitive_sum:PrimitiveSumPrimitive', + 'operator.null.TransformerTest = test_primitives.null:NullTransformerPrimitive', + 'operator.null.UnsupervisedLearnerTest = test_primitives.null:NullUnsupervisedLearnerPrimitive', + 'classification.random_classifier.Test = test_primitives.random_classifier:RandomClassifierPrimitive', + 'evaluation.compute_scores.Test = test_primitives.fake_score:FakeScorePrimitive', + ], + }, +) diff --git a/axolotl/tests/data/primitives/test_primitives/__init__.py b/axolotl/tests/data/primitives/test_primitives/__init__.py new file mode 100644 index 0000000..bce4e13 --- /dev/null +++ b/axolotl/tests/data/primitives/test_primitives/__init__.py @@ -0,0 +1,2 @@ +__author__ = 'Test team' +__version__ = '0.1.0' diff --git a/axolotl/tests/data/primitives/test_primitives/abs_sum.py b/axolotl/tests/data/primitives/test_primitives/abs_sum.py new file mode 100644 index 0000000..0b130dc --- /dev/null +++ b/axolotl/tests/data/primitives/test_primitives/abs_sum.py @@ -0,0 +1,80 @@ +import os.path +import typing + +import numpy as np # type: ignore + +from d3m import container, utils, exceptions +from d3m.metadata import hyperparams, base as metadata_base +from d3m.primitive_interfaces import base, transformer + +from . import __author__, __version__ + +__all__ = ('AbsSumPrimitive',) + + +Inputs = typing.Union[container.ndarray, container.DataFrame, container.List] +Outputs = container.List + + +class Hyperparams(hyperparams.Hyperparams): + """ + No hyper-parameters for this primitive. 
+ """ + + pass + + +class AbsSumPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): + """ + A primitive that sums the absolute value of the elements in a container and returns a list with a single value: the sum. + """ + + metadata: typing.ClassVar[metadata_base.PrimitiveMetadata] = metadata_base.PrimitiveMetadata({ + 'id': '24de67db-aa08-4b66-85b2-b7be97154cf6', + 'version': __version__, + 'name': "Absolute Sum Test Primitive", + 'keywords': ['test primitive'], + 'source': { + 'name': __author__, + 'contact': 'mailto:author@example.com', + 'uris': [ + 'https://gitlab.com/datadrivendiscovery/tests-data/blob/master/primitives/test_primitives/abs_sum.py', + 'https://gitlab.com/datadrivendiscovery/tests-data.git', + ], + }, + 'installation': [{ + 'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/tests-data.git@{git_commit}#egg=test_primitives&subdirectory=primitives'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + }], + 'location_uris': [ + 'https://gitlab.com/datadrivendiscovery/tests-data/raw/{git_commit}/primitives/test_primitives/abs_sum.py'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + ], + 'python_path': 'd3m.primitives.operator.sum.AbsTest', + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.COMPUTER_ALGEBRA, + ], + 'primitive_family': metadata_base.PrimitiveFamily.OPERATOR, + }) + + def __init__(self, *, hyperparams: Hyperparams) -> None: + super().__init__(hyperparams=hyperparams) + + @base.singleton + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + result = np.abs(self._convert_value(inputs)).sum() + outputs = container.List((result,), generate_metadata=True) + return base.CallResult(outputs) + + def _convert_value(self, value: typing.Any) -> typing.Union[np.ndarray, typing.List, typing.Any]: + if isinstance(value, container.ndarray): + return value.view(np.ndarray) + elif isinstance(value, container.List): + return [self._convert_value(v) for v in value] + elif isinstance(value, container.DataFrame): + return value.values + else: + raise exceptions.InvalidArgumentTypeError('Input value must be an instance of `container.ndarray`, `container.List`, or `container.DataFrame.') diff --git a/axolotl/tests/data/primitives/test_primitives/container_hyperparam.py b/axolotl/tests/data/primitives/test_primitives/container_hyperparam.py new file mode 100644 index 0000000..10b84a5 --- /dev/null +++ b/axolotl/tests/data/primitives/test_primitives/container_hyperparam.py @@ -0,0 +1,68 @@ +import os.path +import typing + +import numpy as np # type: ignore + +from d3m import container, utils +from d3m.metadata import hyperparams, base as metadata_base +from d3m.primitive_interfaces import base, transformer + +from . 
import __author__, __version__ + +__all__ = ('ContainerHyperparamPrimitive',) + + +Inputs = container.DataFrame +Outputs = container.DataFrame + + +class Hyperparams(hyperparams.Hyperparams): + dataframe = hyperparams.Hyperparameter[container.DataFrame]( + default=container.DataFrame(0, index=np.arange(10), columns=['Values'], generate_metadata=True), + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + description='The values to be added to input, element-wise' + ) + + +class ContainerHyperparamPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): + """ + A primitive which uses a hyperparam of type container_argument. + """ + + metadata: typing.ClassVar[metadata_base.PrimitiveMetadata] = metadata_base.PrimitiveMetadata({ + 'id': '442b600e-1144-11e9-ab14-d663bd873d93', + 'version': __version__, + 'name': "Container Hyperparam Tester", + 'keywords': ['test primitive'], + 'source': { + 'name': __author__, + 'contact': 'mailto:author@example.com', + 'uris': [ + 'https://gitlab.com/datadrivendiscovery/tests-data/blob/master/primitives/test_primitives/container_hyperparam.py', + 'https://gitlab.com/datadrivendiscovery/tests-data.git', + ], + }, + 'installation': [{ + 'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/tests-data.git@{git_commit}#egg=test_primitives&subdirectory=primitives'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + }], + 'location_uris': [ + 'https://gitlab.com/datadrivendiscovery/tests-data/raw/{git_commit}/primitives/test_primitives/container_hyperparam.py'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + ], + 'python_path': 'd3m.primitives.operator.sum.ContainerHyperparamTest', + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.COMPUTER_ALGEBRA, + ], + 'primitive_family': metadata_base.PrimitiveFamily.OPERATOR, + }) + + def __init__(self, *, hyperparams: Hyperparams) -> None: + super().__init__(hyperparams=hyperparams) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + outputs = inputs + self.hyperparams['dataframe'] + return base.CallResult(outputs) diff --git a/axolotl/tests/data/primitives/test_primitives/data_hyperparam.py b/axolotl/tests/data/primitives/test_primitives/data_hyperparam.py new file mode 100644 index 0000000..dea065d --- /dev/null +++ b/axolotl/tests/data/primitives/test_primitives/data_hyperparam.py @@ -0,0 +1,66 @@ +import os.path +import typing + +from d3m import container, utils +from d3m.metadata import hyperparams, base as metadata_base +from d3m.primitive_interfaces import base, transformer + +from . import __author__, __version__ + +__all__ = ('DataHyperparamPrimitive',) + + +Inputs = container.DataFrame +Outputs = container.DataFrame + + +class Hyperparams(hyperparams.Hyperparams): + value = hyperparams.Hyperparameter[float]( + default=1, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description='The value to be added to input' + ) + + +class DataHyperparamPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): + """ + A primitive that requires a data argument hyperparam. 
+ """ + + metadata: typing.ClassVar[metadata_base.PrimitiveMetadata] = metadata_base.PrimitiveMetadata({ + 'id': '98582315-33f9-4fe9-91a4-5d768a123aa8', + 'version': __version__, + 'name': "Data Hyperparam Test Primitive", + 'keywords': ['test primitive'], + 'source': { + 'name': __author__, + 'contact': 'mailto:author@example.com', + 'uris': [ + 'https://gitlab.com/datadrivendiscovery/tests-data/blob/master/primitives/test_primitives/data_hyperparam.py', + 'https://gitlab.com/datadrivendiscovery/tests-data.git', + ], + }, + 'installation': [{ + 'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/tests-data.git@{git_commit}#egg=test_primitives&subdirectory=primitives'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + }], + 'location_uris': [ + 'https://gitlab.com/datadrivendiscovery/tests-data/raw/{git_commit}/primitives/test_primitives/data_hyperparam.py'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + ], + 'python_path': 'd3m.primitives.operator.sum.DataHyperparamTest', + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.COMPUTER_ALGEBRA, + ], + 'primitive_family': metadata_base.PrimitiveFamily.OPERATOR, + }) + + def __init__(self, *, hyperparams: Hyperparams) -> None: + super().__init__(hyperparams=hyperparams) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + outputs = inputs.add(self.hyperparams['value']) + return base.CallResult(outputs) diff --git a/axolotl/tests/data/primitives/test_primitives/fail.py b/axolotl/tests/data/primitives/test_primitives/fail.py new file mode 100644 index 0000000..10df35b --- /dev/null +++ b/axolotl/tests/data/primitives/test_primitives/fail.py @@ -0,0 +1,106 @@ +import os.path +import typing + +from d3m import container, exceptions, utils +from d3m.metadata import hyperparams, base as metadata_base +from d3m.primitive_interfaces import base, transformer + +from . import __author__, __version__ + +__all__ = ('FailPrimitive',) + + +Inputs = container.DataFrame +Outputs = container.DataFrame + + +class Hyperparams(hyperparams.Hyperparams): + + method_to_fail = hyperparams.Enumeration[str]( + values=['__init__', 'set_training_data', 'fit', 'produce', 'none'], + default='produce', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + description="The name of the method the user wants this primitive to fail on.", + ) + + +class IntentionalError(Exception): + """ + Exception raised for testing purposes. + + Parameters + ---------- + class_name : str + Name of the class where the error occurred. + method_name : str + Name of the method where the error occurred. + """ + + def __init__(self, class_name: str, method_name: str) -> None: + message = f"This is an exception raised by a(n) {class_name} object in the {method_name} method" + super().__init__(message) + + +class FailPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): + """ + A primitive which fails on the requested method (given as hyper-parameter). + + Moreover, primitive does not correctly preserve state so if you pickle + and unpickle it, it does not seen itself as fitted anymore. 
+ """ + + metadata: typing.ClassVar[metadata_base.PrimitiveMetadata] = metadata_base.PrimitiveMetadata({ + 'id': 'd6dfbefa-0fb8-11e9-ab14-d663bd873d93', + 'version': __version__, + 'name': "Failure Tester", + 'keywords': ['test primitive'], + 'source': { + 'name': __author__, + 'contact': 'mailto:author@example.com', + 'uris': [ + 'https://gitlab.com/datadrivendiscovery/tests-data/blob/master/primitives/test_primitives/fail.py', + 'https://gitlab.com/datadrivendiscovery/tests-data.git', + ], + }, + 'installation': [{ + 'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/tests-data.git@{git_commit}#egg=test_primitives&subdirectory=primitives'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + }], + 'location_uris': [ + 'https://gitlab.com/datadrivendiscovery/tests-data/raw/{git_commit}/primitives/test_primitives/fail.py'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + ], + 'python_path': 'd3m.primitives.operator.null.FailTest', + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.IDENTITY_FUNCTION, + ], + 'primitive_family': metadata_base.PrimitiveFamily.OPERATOR, + }) + + def __init__(self, *, hyperparams: Hyperparams) -> None: + super().__init__(hyperparams=hyperparams) + self._conditional_fail('__init__') + self._fitted = False + + def _conditional_fail(self, method_name: str) -> None: + if self.hyperparams['method_to_fail'] == method_name: + raise IntentionalError(self.__class__.__name__, method_name) + + def set_training_data(self) -> None: # type: ignore + self._conditional_fail('set_training_data') + self._fitted = False + super().set_training_data() + + def fit(self, *, timeout: float = None, iterations: int = None) -> base.CallResult[None]: + self._conditional_fail('fit') + self._fitted = True + return super().fit(timeout=timeout, iterations=iterations) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + self._conditional_fail('produce') + if not self._fitted: + raise exceptions.PrimitiveNotFittedError("Primitive is not fitted.") + return base.CallResult(inputs) diff --git a/axolotl/tests/data/primitives/test_primitives/fake_score.py b/axolotl/tests/data/primitives/test_primitives/fake_score.py new file mode 100644 index 0000000..f8b1080 --- /dev/null +++ b/axolotl/tests/data/primitives/test_primitives/fake_score.py @@ -0,0 +1,100 @@ +import os.path +import typing + +from d3m import container, utils +from d3m.metadata import base as metadata_base, hyperparams, problem +from d3m.primitive_interfaces import base, transformer + +from . import __author__, __version__ + +__all__ = ('FakeScorePrimitive',) + +Inputs = container.DataFrame +Outputs = container.DataFrame + + +class Hyperparams(hyperparams.Hyperparams): + pass + + +class FakeScorePrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): + """ + A primitive that takes a DataFrame and returns hard-coded fake accuracy scores. 
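
    A minimal usage sketch (an illustrative addition; the dataset URI and the
    predictions DataFrame below are placeholders, and the module is assumed to be
    importable as ``test_primitives.fake_score``):

        from d3m import container
        from test_primitives.fake_score import FakeScorePrimitive, Hyperparams

        primitive = FakeScorePrimitive(hyperparams=Hyperparams.defaults())
        scores = primitive.produce(
            inputs=container.DataFrame({'predictions': [0, 1]}, generate_metadata=True),
            # Required by the signature, but this fake scorer never inspects it.
            score_dataset=container.Dataset.load('file:///path/to/SCORE/dataset_TEST/datasetDoc.json'),
        ).value
        # 'scores' is always a one-row DataFrame: metric=ACCURACY, value=1.0, normalized=1.0.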
+ """ + + metadata: typing.ClassVar[metadata_base.PrimitiveMetadata] = metadata_base.PrimitiveMetadata( + { + 'id': '1c4d5cbd-163c-424d-8be5-0f267641ae34', + 'version': __version__, + 'name': "Generate fake scores for testing", + 'source': { + 'name': __author__, + 'contact': 'mailto:author@example.com', + 'uris': [ + 'https://gitlab.com/datadrivendiscovery/tests-data/blob/master/primitives/test_primitives/fake_score.py', + 'https://gitlab.com/datadrivendiscovery/tests-data.git', + ], + }, + 'installation': [{ + 'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/tests-data.git@{git_commit}#egg=test_primitives&subdirectory=primitives'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + }], + 'location_uris': [ + 'https://gitlab.com/datadrivendiscovery/tests-data/raw/{git_commit}/primitives/test_primitives/fake_score.py'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + ], + 'python_path': 'd3m.primitives.evaluation.compute_scores.Test', + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.ACCURACY_SCORE, + ], + 'primitive_family': metadata_base.PrimitiveFamily.EVALUATION, + }, + ) + + def produce( # type: ignore + self, *, inputs: Inputs, score_dataset: container.Dataset, timeout: float = None, + iterations: int = None, + ) -> base.CallResult[Outputs]: + outputs: typing.Dict[str, typing.List] = { + 'metric': [problem.PerformanceMetric.ACCURACY.name], + 'value': [1.0], + 'normalized': [1.0], + } + + results = container.DataFrame(data=outputs, columns=list(outputs.keys()), generate_metadata=True) + + results.metadata = results.metadata.add_semantic_type( + (metadata_base.ALL_ELEMENTS, 0), + 'https://metadata.datadrivendiscovery.org/types/PrimaryMultiKey', + ) + results.metadata = results.metadata.add_semantic_type( + (metadata_base.ALL_ELEMENTS, 1), + 'https://metadata.datadrivendiscovery.org/types/Score', + ) + results.metadata = results.metadata.add_semantic_type( + (metadata_base.ALL_ELEMENTS, 2), + 'https://metadata.datadrivendiscovery.org/types/Score', + ) + + return base.CallResult(results) + + def multi_produce( # type: ignore + self, *, produce_methods: typing.Sequence[str], inputs: Inputs, + score_dataset: container.Dataset, timeout: float = None, iterations: int = None, + ) -> base.MultiCallResult: + return self._multi_produce( + produce_methods=produce_methods, timeout=timeout, iterations=iterations, + inputs=inputs, score_dataset=score_dataset, + ) + + def fit_multi_produce( # type: ignore + self, *, produce_methods: typing.Sequence[str], inputs: Inputs, + score_dataset: container.Dataset, timeout: float = None, iterations: int = None + ) -> base.MultiCallResult: + return self._fit_multi_produce( + produce_methods=produce_methods, timeout=timeout, iterations=iterations, + inputs=inputs, score_dataset=score_dataset, + ) diff --git a/axolotl/tests/data/primitives/test_primitives/file_reader.py b/axolotl/tests/data/primitives/test_primitives/file_reader.py new file mode 100644 index 0000000..178ff18 --- /dev/null +++ b/axolotl/tests/data/primitives/test_primitives/file_reader.py @@ -0,0 +1,71 @@ +import os +import typing + +import numpy # type: ignore +import frozendict # type: ignore + +from d3m import container, utils as d3m_utils +from d3m.base import primitives +from d3m.metadata import base as metadata_base + +from . 
import __author__, __version__ + +__all__ = ('DummyImageReaderPrimitive',) + + +class DummyImageReaderPrimitive(primitives.FileReaderPrimitiveBase): + """ + A primitive which pretends to read columns referencing image files, + but returns just the basename of the file path as dummy value of the file, + wrapped inside a 1x1 ndarray. + """ + + _supported_media_types = ( + 'image/jpeg', + 'image/png', + ) + _file_structural_type = container.ndarray + _file_semantic_types = ('http://schema.org/ImageObject',) + + metadata: typing.ClassVar[metadata_base.PrimitiveMetadata] = metadata_base.PrimitiveMetadata( + { + 'id': '4f6e56b6-4ece-444b-9354-5a2b4e575a13', + 'version': __version__, + 'name': 'Dummy image reader', + 'python_path': 'd3m.primitives.data_preprocessing.image_reader.Test', + 'keywords': ['image', 'reader', 'jpg', 'png'], + 'source': { + 'name': __author__, + 'contact': 'mailto:author@example.com', + 'uris': [ + 'https://gitlab.com/datadrivendiscovery/tests-data/blob/master/primitives/test_primitives/file_reader.py', + 'https://gitlab.com/datadrivendiscovery/tests-data.git', + ], + }, + 'installation': [{ + 'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/tests-data.git@{git_commit}#egg=test_primitives&subdirectory=primitives'.format( + git_commit=d3m_utils.current_git_commit(os.path.dirname(__file__)), + ), + }], + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.FILE_MANIPULATION, + ], + 'supported_media_types': _supported_media_types, + 'primitive_family': metadata_base.PrimitiveFamily.DATA_PREPROCESSING, + } + ) + + def _read_fileuri(self, metadata: frozendict.FrozenOrderedDict, fileuri: str) -> container.ndarray: + image_array = container.ndarray(numpy.array([[fileuri.split('/')[-1]]], dtype=object), { + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': container.ndarray, + }, generate_metadata=False) + + image_array.metadata = image_array.metadata.update((), { + 'image_reader_metadata': { + 'foobar': 42, + }, + }) + + return image_array diff --git a/axolotl/tests/data/primitives/test_primitives/increment.py b/axolotl/tests/data/primitives/test_primitives/increment.py new file mode 100644 index 0000000..25a7cf2 --- /dev/null +++ b/axolotl/tests/data/primitives/test_primitives/increment.py @@ -0,0 +1,99 @@ +import os.path +import typing + +from d3m import container, utils +from d3m.metadata import base as metadata_base, hyperparams +from d3m.primitive_interfaces import base, transformer + +from . import __author__, __version__ + +__all__ = ('IncrementPrimitive',) + + +# It is useful to define these names, so that you can reuse it both +# for class type arguments and method signatures. +Inputs = container.DataFrame +Outputs = container.DataFrame + + +class Hyperparams(hyperparams.Hyperparams): + # We can provide a type argument to a Hyperparameter class to signal which + # structural type the Hyperparameter is. If you do not provide it, it is + # automatically detected. + # This is not a tuning parameter but a control parameter which should be decided + # once during pipeline building but then fixed and not changed during hyper-parameter + # tuning. 
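    # For a contrasting example, see RandomPrimitive in random.py in this same package:
    # its 'mu' and 'sigma' hyper-parameters carry both the ControlParameter and the
    # TuningParameter semantic types, so their role depends on how a pipeline uses them.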
+ amount = hyperparams.Hyperparameter[float](default=1, semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter']) + + +class IncrementPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): + # It is important to provide a docstring because this docstring is used as a description of + # a primitive. Some callers might analyze it to determine the nature and purpose of a primitive. + + """ + A primitive which increments each value by a fixed amount, by default 1. + """ + + # This should contain only metadata which cannot be automatically determined from the code. + metadata: typing.ClassVar[metadata_base.PrimitiveMetadata] = metadata_base.PrimitiveMetadata({ + # Simply an UUID generated once and fixed forever. Generated using "uuid.uuid4()". + 'id': '5c9d5acf-7754-420f-a49f-90f4d9d0d694', + 'version': __version__, + 'name': "Increment Values", + # Keywords do not have a controlled vocabulary. Authors can put here whatever they find suitable. + 'keywords': ['test primitive'], + 'source': { + 'name': __author__, + 'contact': 'mailto:author@example.com', + 'uris': [ + # Unstructured URIs. Link to file and link to repo in this case. + 'https://gitlab.com/datadrivendiscovery/tests-data/blob/master/primitives/test_primitives/increment.py', + 'https://gitlab.com/datadrivendiscovery/tests-data.git', + ], + }, + # A list of dependencies in order. These can be Python packages, system packages, or Docker images. + # Of course Python packages can also have their own dependencies, but sometimes it is necessary to + # install a Python package first to be even able to run setup.py of another package. Or you have + # a dependency which is not on PyPi. + 'installation': [{ + 'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/tests-data.git@{git_commit}#egg=test_primitives&subdirectory=primitives'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + }], + # URIs at which one can obtain code for the primitive, if available. + 'location_uris': [ + 'https://gitlab.com/datadrivendiscovery/tests-data/raw/{git_commit}/primitives/test_primitives/increment.py'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + ], + # The same path the primitive is registered with entry points in setup.py. + 'python_path': 'd3m.primitives.operator.increment.Test', + # Choose these from a controlled vocabulary in the schema. If anything is missing which would + # best describe the primitive, make a merge request. + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.COMPUTER_ALGEBRA, + ], + 'primitive_family': metadata_base.PrimitiveFamily.OPERATOR, + # A metafeature about preconditions required for this primitive to operate well. + 'preconditions': [ + # Instead of strings you can also use available Python enumerations. + metadata_base.PrimitivePrecondition.NO_MISSING_VALUES, + metadata_base.PrimitivePrecondition.NO_CATEGORICAL_VALUES, + ] + }) + + def __init__(self, *, hyperparams: Hyperparams) -> None: + super().__init__(hyperparams=hyperparams) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + # If "inputs" is container.DataFrame, then also result is. + outputs = typing.cast(Outputs, inputs + float(self.hyperparams['amount'])) + + # Metadata might not be preserved through operations, so we make sure and update metadata ourselves. 
+ # Because just values changed (but not structure) and the primitive is a transformation, we can reuse + # inputs metadata, but generate new metadata for new value to assure everything is matching. + outputs.metadata = inputs.metadata.generate(outputs) + + # Wrap it into default "CallResult" object: we are not doing any iterations. + return base.CallResult(outputs) diff --git a/axolotl/tests/data/primitives/test_primitives/monomial.py b/axolotl/tests/data/primitives/test_primitives/monomial.py new file mode 100644 index 0000000..3bf7fa6 --- /dev/null +++ b/axolotl/tests/data/primitives/test_primitives/monomial.py @@ -0,0 +1,127 @@ +import os.path +import typing + +from d3m import container, utils +from d3m.metadata import base as metadata_base, hyperparams, params +from d3m.primitive_interfaces import base, supervised_learning + +from . import __author__, __version__ + +__all__ = ('MonomialPrimitive',) + + +# It is useful to define these names, so that you can reuse it both +# for class type arguments and method signatures. +Inputs = container.List +Outputs = container.List + + +class Params(params.Params): + a: float + + +class Hyperparams(hyperparams.Hyperparams): + bias = hyperparams.Hyperparameter(default=0.0, semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']) + + +class MonomialPrimitive(supervised_learning.SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]): + # It is important to provide a docstring because this docstring is used as a description of + # a primitive. Some callers might analyze it to determine the nature and purpose of a primitive. + + """ + A primitive which fits output = a * input. + """ + + # This should contain only metadata which cannot be automatically determined from the code. + metadata: typing.ClassVar[metadata_base.PrimitiveMetadata] = metadata_base.PrimitiveMetadata({ + # Simply an UUID generated once and fixed forever. Generated using "uuid.uuid4()". + 'id': '4a0336ae-63b9-4a42-860e-86c5b64afbdd', + 'version': __version__, + 'name': "Monomial Regressor", + # Keywords do not have a controlled vocabulary. Authors can put here whatever they find suitable. + 'keywords': ['test primitive'], + 'source': { + 'name': __author__, + 'contact': 'mailto:author@example.com', + 'uris': [ + # Unstructured URIs. Link to file and link to repo in this case. + 'https://gitlab.com/datadrivendiscovery/tests-data/blob/master/primitives/test_primitives/monomial.py', + 'https://gitlab.com/datadrivendiscovery/tests-data.git', + ], + }, + # A list of dependencies in order. These can be Python packages, system packages, or Docker images. + # Of course Python packages can also have their own dependencies, but sometimes it is necessary to + # install a Python package first to be even able to run setup.py of another package. Or you have + # a dependency which is not on PyPi. + 'installation': [{ + 'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/tests-data.git@{git_commit}#egg=test_primitives&subdirectory=primitives'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + }], + # URIs at which one can obtain code for the primitive, if available. + 'location_uris': [ + 'https://gitlab.com/datadrivendiscovery/tests-data/raw/{git_commit}/primitives/test_primitives/monomial.py'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + ], + # The same path the primitive is registered with entry points in setup.py. 
+ 'python_path': 'd3m.primitives.regression.monomial.Test', + # Choose these from a controlled vocabulary in the schema. If anything is missing which would + # best describe the primitive, make a merge request. + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.LINEAR_REGRESSION, + ], + 'primitive_family': metadata_base.PrimitiveFamily.REGRESSION, + }) + + # Random seed is not needed, but we need it in tests to test which random seed was passed to a primitive. + def __init__(self, *, hyperparams: Hyperparams, random_seed: int = 0) -> None: + super().__init__(hyperparams=hyperparams, random_seed=random_seed) + + self._a: float = 0 + self._training_inputs: Inputs = None + self._training_outputs: Outputs = None + self._fitted: bool = False + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + if self._a is None: + raise ValueError("Calling produce before fitting.") + + # We compute the result. We use (...) here and not [...] to create a + # generator and not a list which would then just be copied into "List". + result = (self._a * input + self.hyperparams['bias'] for input in inputs) + + # We convert a regular list to container list which supports metadata attribute. + # Even if the structure of outputs is the same as inputs, conceptually, outputs are different, + # they are new data. So we do not reuse metadata from inputs but generate new metadata. + outputs: container.List = container.List(result, generate_metadata=True) + + # Wrap it into default "CallResult" object: we are not doing any iterations. + return base.CallResult(outputs) + + def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None: + self._training_inputs = inputs + self._training_outputs = outputs + self._fitted = False + + def fit(self, *, timeout: float = None, iterations: int = None) -> base.CallResult[None]: + if self._fitted: + return base.CallResult(None) + + if not self._training_inputs or not self._training_inputs: + raise ValueError("Missing training data.") + + quotients = [output / input for output, input in zip(self._training_outputs, self._training_inputs) if input != 0] + self._a = sum(quotients) / len(quotients) + self._fitted = True + + return base.CallResult(None) + + def get_params(self) -> Params: + # You can pass a dict or keyword arguments. + return Params(a=self._a) + + def set_params(self, *, params: Params) -> None: + # Params are just a fancy dict. + self._a = params['a'] diff --git a/axolotl/tests/data/primitives/test_primitives/multi_data_hyperparam.py b/axolotl/tests/data/primitives/test_primitives/multi_data_hyperparam.py new file mode 100644 index 0000000..3bcbc3d --- /dev/null +++ b/axolotl/tests/data/primitives/test_primitives/multi_data_hyperparam.py @@ -0,0 +1,70 @@ +import os.path +import typing + +import numpy as np # type: ignore + +from d3m import container, utils +from d3m.metadata import hyperparams, base as metadata_base +from d3m.primitive_interfaces import base, transformer + +from . 
import __author__, __version__ + +__all__ = ('MultiDataHyperparamPrimitive',) + + +Inputs = container.DataFrame +Outputs = container.DataFrame + + +class Hyperparams(hyperparams.Hyperparams): + values = hyperparams.Hyperparameter[typing.List[np.float64]]( # type: ignore + default=[np.float64(1)], + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description='The values to be added to input' + ) + + +class MultiDataHyperparamPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): + """ + A primitive that requires a data argument hyperparam. + """ + + metadata: typing.ClassVar[metadata_base.PrimitiveMetadata] = metadata_base.PrimitiveMetadata({ + 'id': 'ad8b8a35-9023-4f24-a628-a8f41eb2e3b0', + 'version': __version__, + 'name': "Multi Data Hyperparam Test Primitive", + 'keywords': ['test primitive'], + 'source': { + 'name': __author__, + 'contact': 'mailto:author@example.com', + 'uris': [ + 'https://gitlab.com/datadrivendiscovery/tests-data/blob/master/primitives/test_primitives/multi_data_hyperparam.py', + 'https://gitlab.com/datadrivendiscovery/tests-data.git', + ], + }, + 'installation': [{ + 'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/tests-data.git@{git_commit}#egg=test_primitives&subdirectory=primitives'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + }], + 'location_uris': [ + 'https://gitlab.com/datadrivendiscovery/tests-data/raw/{git_commit}/primitives/test_primitives/multi_data_hyperparam.py'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + ], + 'python_path': 'd3m.primitives.operator.sum.MultiDataHyperparamTest', + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.COMPUTER_ALGEBRA, + ], + 'primitive_family': metadata_base.PrimitiveFamily.OPERATOR, + }) + + def __init__(self, *, hyperparams: Hyperparams) -> None: + super().__init__(hyperparams=hyperparams) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + outputs = inputs + for value in self.hyperparams['values']: + outputs = outputs + value + return base.CallResult(outputs) diff --git a/axolotl/tests/data/primitives/test_primitives/null.py b/axolotl/tests/data/primitives/test_primitives/null.py new file mode 100644 index 0000000..a153a2b --- /dev/null +++ b/axolotl/tests/data/primitives/test_primitives/null.py @@ -0,0 +1,219 @@ +import os.path +import typing + +from d3m import container, utils +from d3m.metadata import base as metadata_base, hyperparams, params +from d3m.primitive_interfaces import base, transformer, unsupervised_learning + +from . import __author__, __version__ + +__all__ = ('NullTransformerPrimitive', 'NullUnsupervisedLearnerPrimitive', 'NullDataFrameUnsupervisedLearnerPrimitive') + +Inputs = container.List +Outputs = container.List + + +class Hyperparams(hyperparams.Hyperparams): + pass + + +class Params(params.Params): + pass + + +class NullTransformerPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): + """ + A primitive which passes through inputs as outputs. + + It does not really care if inputs is list. 
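
    A minimal usage sketch (an illustrative addition, assuming the module is
    importable as ``test_primitives.null``):

        from d3m import container
        from test_primitives.null import NullTransformerPrimitive, Hyperparams

        primitive = NullTransformerPrimitive(hyperparams=Hyperparams.defaults())
        inputs = container.List([1, 2, 3], generate_metadata=True)
        # The produced value is the inputs object itself, unchanged.
        assert primitive.produce(inputs=inputs).value is inputs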
+ """ + + metadata: typing.ClassVar[metadata_base.PrimitiveMetadata] = metadata_base.PrimitiveMetadata({ + 'id': 'e0f83c35-fe3d-4fa6-92cf-f7421408eab5', + 'version': __version__, + 'name': "Produce the same as the input", + 'keywords': ['test primitive'], + 'source': { + 'name': __author__, + 'contact': 'mailto:author@example.com', + 'uris': [ + 'https://gitlab.com/datadrivendiscovery/tests-data/blob/master/primitives/test_primitives/null.py', + 'https://gitlab.com/datadrivendiscovery/tests-data.git', + ], + }, + 'installation': [{ + 'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/tests-data.git@{git_commit}#egg=test_primitives&subdirectory=primitives'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + }], + 'location_uris': [ + 'https://gitlab.com/datadrivendiscovery/tests-data/raw/{git_commit}/primitives/test_primitives/add_primitives.py'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + ], + 'python_path': 'd3m.primitives.operator.null.TransformerTest', + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.IDENTITY_FUNCTION, + ], + 'primitive_family': metadata_base.PrimitiveFamily.OPERATOR, + }) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + return base.CallResult( + value=inputs + ) + + +class NullUnsupervisedLearnerPrimitive(unsupervised_learning.UnsupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]): + """ + A primitive which passes through inputs as outputs. + + It does not really care if inputs is list. + """ + + metadata: typing.ClassVar[metadata_base.PrimitiveMetadata] = metadata_base.PrimitiveMetadata({ + 'id': '5310d7c4-89a0-4dab-8419-3285e650105a', + 'version': __version__, + 'name': "Produce the same as the input", + 'keywords': ['test primitive'], + 'source': { + 'name': __author__, + 'contact': 'mailto:author@example.com', + 'uris': [ + 'https://gitlab.com/datadrivendiscovery/tests-data/blob/master/primitives/test_primitives/null.py', + 'https://gitlab.com/datadrivendiscovery/tests-data.git', + ], + }, + 'installation': [{ + 'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/tests-data.git@{git_commit}#egg=test_primitives&subdirectory=primitives'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + }], + 'location_uris': [ + 'https://gitlab.com/datadrivendiscovery/tests-data/raw/{git_commit}/primitives/test_primitives/add_primitives.py'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + ], + 'python_path': 'd3m.primitives.operator.null.UnsupervisedLearnerTest', + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.IDENTITY_FUNCTION, + ], + 'primitive_family': metadata_base.PrimitiveFamily.OPERATOR, + }) + + def set_training_data(self) -> None: # type: ignore + """ + A noop. + + Parameters + ---------- + """ + + return + + def fit(self, *, timeout: float = None, iterations: int = None) -> base.CallResult[None]: + """ + A noop. + """ + + return base.CallResult(None) + + def get_params(self) -> Params: + """ + A noop. + """ + + return Params() + + def set_params(self, *, params: Params) -> None: + """ + A noop. 
+ """ + + return + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + return base.CallResult( + value=inputs + ) + + +DataframeInputs = container.DataFrame +DataframeOutputs = container.DataFrame + + +class NullDataFrameUnsupervisedLearnerPrimitive(unsupervised_learning.UnsupervisedLearnerPrimitiveBase[DataframeInputs, DataframeOutputs, Params, Hyperparams]): + """ + A primitive which passes through inputs as outputs. + + It does not really care if inputs is a Dataframe. + """ + + metadata: typing.ClassVar[metadata_base.PrimitiveMetadata] = metadata_base.PrimitiveMetadata({ + 'id': '0c063f7b-98d8-4d3c-91df-6a56623b9cc3', + 'version': __version__, + 'name': "Produce the same as the input", + 'keywords': ['test primitive'], + 'source': { + 'name': __author__, + 'contact': 'mailto:author@example.com', + 'uris': [ + 'https://gitlab.com/datadrivendiscovery/tests-data/blob/master/primitives/test_primitives/null.py', + 'https://gitlab.com/datadrivendiscovery/tests-data.git', + ], + }, + 'installation': [{ + 'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/tests-data.git@{git_commit}#egg=test_primitives&subdirectory=primitives'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + }], + 'location_uris': [ + 'https://gitlab.com/datadrivendiscovery/tests-data/raw/{git_commit}/primitives/test_primitives/add_primitives.py'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + ], + 'python_path': 'd3m.primitives.operator.null.DataFrameUnsupervisedLearnerTest', + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.IDENTITY_FUNCTION, + ], + 'primitive_family': metadata_base.PrimitiveFamily.OPERATOR, + }) + + def set_training_data(self) -> None: # type: ignore + """ + A noop. + + Parameters + ---------- + """ + + return + + def fit(self, *, timeout: float = None, iterations: int = None) -> base.CallResult[None]: + """ + A noop. + """ + + return base.CallResult(None) + + def get_params(self) -> Params: + """ + A noop. + """ + + return Params() + + def set_params(self, *, params: Params) -> None: + """ + A noop. + """ + + return + + def produce(self, *, inputs: DataframeInputs, timeout: float = None, iterations: int = None) -> base.CallResult[DataframeOutputs]: + return base.CallResult( + value=inputs + ) diff --git a/axolotl/tests/data/primitives/test_primitives/postgresql.py b/axolotl/tests/data/primitives/test_primitives/postgresql.py new file mode 100644 index 0000000..300de95 --- /dev/null +++ b/axolotl/tests/data/primitives/test_primitives/postgresql.py @@ -0,0 +1,222 @@ +import tempfile +import os +import os.path +import pwd +import re +import shutil +import signal +import subprocess +import time +import typing + +import prctl # type: ignore +import psycopg2 # type: ignore + +from d3m import container, utils +from d3m.metadata import base as metadata_base, hyperparams +from d3m.primitive_interfaces import base, transformer + +from . import __author__, __version__ + +__all__ = ('PostgreSQLPrimitive',) + + +Inputs = container.List +Outputs = container.List + + +class Hyperparams(hyperparams.Hyperparams): + pass + + +class PostgreSQLPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): + """ + A primitive which which uses PostgreSQL to compute a value. 
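
    In brief: the constructor copies the system PostgreSQL 10 configuration into a
    temporary directory, starts a private server reachable only over a local socket,
    and ``produce`` runs ``SELECT 42;`` against it, returning ``container.List([42])``
    regardless of the inputs. Running it therefore requires the system and Python
    packages listed under ``installation`` in the metadata below.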
+ """ + + metadata: typing.ClassVar[metadata_base.PrimitiveMetadata] = metadata_base.PrimitiveMetadata({ + 'id': 'f23ea340-ce22-4b15-b2f3-e63885f192b3', + 'version': __version__, + 'name': "PostgreSQL operator", + 'keywords': ['test primitive'], + 'source': { + 'name': __author__, + 'contact': 'mailto:author@example.com', + 'uris': [ + 'https://gitlab.com/datadrivendiscovery/tests-data/blob/master/primitives/test_primitives/postgresql.py', + 'https://gitlab.com/datadrivendiscovery/tests-data.git', + ], + }, + 'installation': [{ + 'type': metadata_base.PrimitiveInstallationType.UBUNTU, + 'package': 'build-essential', + 'version': '12.4ubuntu1', + }, { + 'type': metadata_base.PrimitiveInstallationType.UBUNTU, + 'package': 'libcap-dev', + 'version': '1:2.25-1.2', + }, { + 'type': metadata_base.PrimitiveInstallationType.UBUNTU, + 'package': 'postgresql-10', + 'version': '10.8-0ubuntu0.18.04.1', + }, { + 'type': metadata_base.PrimitiveInstallationType.UBUNTU, + 'package': 'libpq-dev', + 'version': '10.8-0ubuntu0.18.04.1', + }, { + 'type': metadata_base.PrimitiveInstallationType.PIP, + 'package': 'psycopg2', + 'version': '2.8.2', + }, { + # "python-prctl" requires "build-essential" and "libcap-dev". We list it here instead of + # "setup.py" to not have to list these system dependencies for every test primitive (because + # we cannot assure this primitive annotation gets installed first). + 'type': metadata_base.PrimitiveInstallationType.PIP, + 'package': 'python-prctl', + 'version': '1.7', + }, { + 'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/tests-data.git@{git_commit}#egg=test_primitives&subdirectory=primitives'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + }], + 'location_uris': [ + 'https://gitlab.com/datadrivendiscovery/tests-data/raw/{git_commit}/primitives/test_primitives/postgresql.py'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + ], + 'python_path': 'd3m.primitives.operator.postgresql.Test', + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.COMPUTER_ALGEBRA, + ], + 'primitive_family': metadata_base.PrimitiveFamily.OPERATOR, + }) + + def __init__(self, *, hyperparams: Hyperparams, temporary_directory: str = None) -> None: + super().__init__(hyperparams=hyperparams, temporary_directory=temporary_directory) + + # Initialize it early so that "__del__" has access to these attributes. 
+ self._connection: psycopg2.connection = None + self._process: subprocess.Popen = None + self._postgresql_base: str = None + + self._postgresql_base = tempfile.mkdtemp() + os.chmod(self._postgresql_base, 0o755) + + self._config_dir = os.path.join(self._postgresql_base, 'conf') + self._data_dir = os.path.join(self._postgresql_base, 'data') + self._run_dir = os.path.join(self._postgresql_base, 'run') + self._config_file = os.path.join(self._config_dir, 'postgresql.conf') + + shutil.copytree('/etc/postgresql/10/main', self._config_dir) + shutil.copy('/etc/ssl/certs/ssl-cert-snakeoil.pem', os.path.join(self._config_dir, 'server.pem')) + shutil.copy('/etc/ssl/private/ssl-cert-snakeoil.key', os.path.join(self._config_dir, 'server.key')) + os.chmod(os.path.join(self._config_dir, 'server.key'), 0o600) + + with open(self._config_file, 'r', encoding='utf8') as config_file: + config_file_lines = config_file.readlines() + with open(self._config_file, 'w', encoding='utf8') as config_file: + for line in config_file_lines: + line = re.sub('/etc/ssl/certs/ssl-cert-snakeoil.pem', os.path.join(self._config_dir, 'server.pem'), line) + line = re.sub('/etc/ssl/private/ssl-cert-snakeoil.key', os.path.join(self._config_dir, 'server.key'), line) + line = re.sub('/var/lib/postgresql/10/main', self._data_dir, line) + line = re.sub('/etc/postgresql/10/main/pg_hba.conf', os.path.join(self._config_dir, 'pg_hba.conf'), line) + line = re.sub('/etc/postgresql/10/main/pg_ident.conf', os.path.join(self._config_dir, 'pg_ident.conf'), line) + line = re.sub('/var/run/postgresql/10-main.pid', os.path.join(self._run_dir, '10-main.pid'), line) + line = re.sub('/var/run/postgresql/10-main.pg_stat_tmp', os.path.join(self._run_dir, '10-main.pg_stat_tmp'), line) + line = re.sub('/var/run/postgresql', self._run_dir, line) + config_file.write(line) + + with open(os.path.join(self._config_dir, 'conf.d', 'local.conf'), 'w', encoding='utf8') as config_file: + # We disable TCP access. + config_file.write("listen_addresses = ''\n") + + with open(os.path.join(self._config_dir, 'pg_hba.conf'), 'w', encoding='utf8') as config_file: + config_file.write("local all all trust\n") + + # 700 is required by PostgreSQL. + os.mkdir(self._data_dir, mode=0o700) + os.mkdir(self._run_dir) + os.mkdir(os.path.join(self._run_dir, '10-main.pg_stat_tmp')) + + if os.getuid() == 0: + self._username = 'postgres' + + # We have to run PostgreSQL as non-root user. + shutil.chown(self._data_dir, 'postgres', 'postgres') + shutil.chown(self._run_dir, 'postgres', 'postgres') + shutil.chown(os.path.join(self._run_dir, '10-main.pg_stat_tmp'), 'postgres', 'postgres') + shutil.chown(os.path.join(self._config_dir, 'pg_hba.conf'), 'postgres', 'postgres') + shutil.chown(os.path.join(self._config_dir, 'pg_ident.conf'), 'postgres', 'postgres') + shutil.chown(os.path.join(self._config_dir, 'server.key'), 'postgres', 'postgres') + else: + self._username = pwd.getpwuid(os.getuid())[0] + + self._init_and_start_database() + + @staticmethod + def _process_configure() -> None: + if os.getuid() == 0: + os.setgid(shutil._get_gid('postgres')) # type: ignore + os.setuid(shutil._get_uid('postgres')) # type: ignore + + # Setting "pdeathsig" will make the process be killed if our process dies for any reason. 
+ prctl.set_pdeathsig(signal.SIGTERM) + + def _init_and_start_database(self) -> None: + args = [ + '/usr/lib/postgresql/10/bin/initdb', + '-D', + self._data_dir, + '--locale', + 'en_US.UTF-8', + '--encoding', + 'UTF-8', + ] + + try: + subprocess.run( + args, stdin=subprocess.DEVNULL, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, + encoding='utf8', check=True, preexec_fn=self._process_configure, + ) + except subprocess.CalledProcessError as error: + self.logger.error("Error running initdb: %(stdout)s", {'stdout': error.stdout}) + raise error + + args = [ + '/usr/lib/postgresql/10/bin/postgres', + '-D', + self._data_dir, + '-c', + 'config_file={config_file}'.format(config_file=self._config_file), + ] + + self._process = subprocess.Popen(args, stdin=subprocess.DEVNULL, stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT, encoding='utf8', preexec_fn=self._process_configure) + + # Waits for 2 seconds. + connection_error = None + for i in range(20): + try: + self._connection = psycopg2.connect(dbname=self._username, user=self._username, host=self._run_dir) + break + except psycopg2.OperationalError as error: + connection_error = error + time.sleep(0.1) + else: + raise connection_error + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + with self._connection.cursor() as cursor: + cursor.execute("SELECT 42;") + return base.CallResult(container.List([cursor.fetchone()[0]], generate_metadata=True)) + + def __del__(self) -> None: + if self._connection is not None: + self._connection.close() + self._connection = None + + if self._process is not None and self._process.poll() is None: + self._process.terminate() + + if self._postgresql_base is not None: + shutil.rmtree(self._postgresql_base, ignore_errors=True) diff --git a/axolotl/tests/data/primitives/test_primitives/primitive_hyperparam.py b/axolotl/tests/data/primitives/test_primitives/primitive_hyperparam.py new file mode 100644 index 0000000..732996a --- /dev/null +++ b/axolotl/tests/data/primitives/test_primitives/primitive_hyperparam.py @@ -0,0 +1,76 @@ +import os.path +import typing + +import pandas as pd # type: ignore + +from d3m import container, utils +from d3m.metadata import hyperparams, base as metadata_base +from d3m.primitive_interfaces import base, transformer +from test_primitives.increment import IncrementPrimitive, Hyperparams as IncrementPrimitiveHyperparams + +from . import __author__, __version__ + +__all__ = ('PrimitiveHyperparamPrimitive',) + + +Inputs = container.DataFrame +Outputs = container.DataFrame + + +class Hyperparams(hyperparams.Hyperparams): + primitive = hyperparams.Hyperparameter[base.PrimitiveBase]( + default=IncrementPrimitive(hyperparams=IncrementPrimitiveHyperparams.defaults()), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description='The primitive instance to be passed to PrimitiveHyperparamPrimitive' + ) + + +class PrimitiveHyperparamPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): + """ + A primitive that requires a data argument hyperparam. 
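
    A minimal usage sketch (an illustrative addition, assuming the modules are
    importable under the ``test_primitives`` package):

        from d3m import container
        from test_primitives.primitive_hyperparam import PrimitiveHyperparamPrimitive, Hyperparams

        # The default for the 'primitive' hyper-parameter is an IncrementPrimitive
        # instance constructed with its own default hyper-parameters (amount=1).
        primitive = PrimitiveHyperparamPrimitive(hyperparams=Hyperparams.defaults())
        inputs = container.DataFrame({'a': [1.0, 2.0]}, generate_metadata=True)
        # produce() first runs the inner primitive on the inputs and then adds the
        # first row of that result back onto the inputs.
        outputs = primitive.produce(inputs=inputs).value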
+ """ + + metadata: typing.ClassVar[metadata_base.PrimitiveMetadata] = metadata_base.PrimitiveMetadata({ + 'id': 'bd67f49a-bf10-4251-9774-019add57370b', + 'version': __version__, + 'name': "Primitive Hyperparam Test Primitive", + 'keywords': ['test primitive'], + 'source': { + 'name': __author__, + 'contact': 'mailto:author@example.com', + 'uris': [ + 'https://gitlab.com/datadrivendiscovery/tests-data/blob/master/primitives/test_primitives/primitive_hyperparam.py', + 'https://gitlab.com/datadrivendiscovery/tests-data.git', + ], + }, + 'installation': [{ + 'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/tests-data.git@{git_commit}#egg=test_primitives&subdirectory=primitives'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + }], + 'location_uris': [ + 'https://gitlab.com/datadrivendiscovery/tests-data/raw/{git_commit}/primitives/test_primitives/primitive_hyperparam.py'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + ], + 'python_path': 'd3m.primitives.operator.sum.PrimitiveHyperparamTest', + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.COMPUTER_ALGEBRA, + ], + 'primitive_family': metadata_base.PrimitiveFamily.OPERATOR, + }) + + def __init__(self, *, hyperparams: Hyperparams) -> None: + super().__init__(hyperparams=hyperparams) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + primitive = self.hyperparams['primitive'] + result = primitive.produce(inputs=inputs) + data = result.value + if isinstance(data, pd.DataFrame): + value = data.iloc[0] + else: + value = data[0] + outputs = inputs + value + return base.CallResult(outputs) diff --git a/axolotl/tests/data/primitives/test_primitives/primitive_sum.py b/axolotl/tests/data/primitives/test_primitives/primitive_sum.py new file mode 100644 index 0000000..b17e81b --- /dev/null +++ b/axolotl/tests/data/primitives/test_primitives/primitive_sum.py @@ -0,0 +1,139 @@ +import os.path +import time +import typing + +import numpy # type: ignore + +from d3m import container, exceptions, utils +from d3m.metadata import base as metadata_base, hyperparams +from d3m.primitive_interfaces import base, transformer + +from . import __author__, __version__, null + +__all__ = ('PrimitiveSumPrimitive',) + +Inputs = container.List +Outputs = container.List + + +class Hyperparams(hyperparams.Hyperparams): + # These primitives should already be fitted (or be a transformer) and they should accept + # "List" container type as an input, and return a "List" container type as an output. + # TODO: How to define this in the hyper-parameter definition? + # See: https://gitlab.com/datadrivendiscovery/d3m/issues/210 + primitive_1 = hyperparams.Primitive[base.PrimitiveBase]( + default=null.NullTransformerPrimitive, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + ) + primitive_2 = hyperparams.Primitive[base.PrimitiveBase]( + default=null.NullTransformerPrimitive, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + ) + + +class PrimitiveSumPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): + # It is important to provide a docstring because this docstring is used as a description of + # a primitive. Some callers might analyze it to determine the nature and purpose of a primitive. + + """ + A primitive which element-wise sums the produced results of two other primitives. 
Each of those two primitives + are given inputs (a list of numbers) to this primitive first as their inputs, are expected to return a list + of numbers back, and then those lists are element-wise summed together, to produce the final list. + + This primitive exists just as a demonstration. To sum results you would otherwise just simply + sum the results directly instead of getting an instance of the primitive and call + produce methods on it. But this does allow more complicated ways of interacting with a + primitive and this primitive demonstrates it. + """ + + # This should contain only metadata which cannot be automatically determined from the code. + metadata: typing.ClassVar[metadata_base.PrimitiveMetadata] = metadata_base.PrimitiveMetadata({ + # Simply an UUID generated once and fixed forever. Generated using "uuid.uuid4()". + 'id': '6b061902-5e40-4a7a-9a21-b995dce1b2aa', + 'version': __version__, + 'name': "Sum results of other primitives", + # Keywords do not have a controlled vocabulary. Authors can put here whatever they find suitable. + 'keywords': ['test primitive'], + 'source': { + 'name': __author__, + 'contact': 'mailto:author@example.com', + 'uris': [ + # Unstructured URIs. Link to file and link to repo in this case. + 'https://gitlab.com/datadrivendiscovery/tests-data/blob/master/primitives/test_primitives/primitive_sum.py', + 'https://gitlab.com/datadrivendiscovery/tests-data.git', + ], + }, + # A list of dependencies in order. These can be Python packages, system packages, or Docker images. + # Of course Python packages can also have their own dependencies, but sometimes it is necessary to + # install a Python package first to be even able to run setup.py of another package. Or you have + # a dependency which is not on PyPi. + 'installation': [{ + 'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/tests-data.git@{git_commit}#egg=test_primitives&subdirectory=primitives'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + }], + # URIs at which one can obtain code for the primitive, if available. + 'location_uris': [ + 'https://gitlab.com/datadrivendiscovery/tests-data/raw/{git_commit}/primitives/test_primitives/add_primitives.py'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + ], + # The same path the primitive is registered with entry points in setup.py. + 'python_path': 'd3m.primitives.operator.primitive_sum.Test', + # Choose these from a controlled vocabulary in the schema. If anything is missing which would + # best describe the primitive, make a merge request. + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.COMPUTER_ALGEBRA, + ], + 'primitive_family': metadata_base.PrimitiveFamily.OPERATOR, + # A metafeature about preconditions required for this primitive to operate well. + 'preconditions': [ + # Instead of strings you can also use available Python enumerations. 
+ metadata_base.PrimitivePrecondition.NO_MISSING_VALUES, + metadata_base.PrimitivePrecondition.NO_CATEGORICAL_VALUES, + ] + }) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + primitive_1 = self.hyperparams['primitive_1'] + primitive_2 = self.hyperparams['primitive_2'] + + results = [] + + if primitive_1 is not None: + start = time.perf_counter() + results.append(primitive_1.produce(inputs=inputs, timeout=timeout, iterations=iterations)) + delta = time.perf_counter() - start + + # Decrease the amount of time available to other calls. This delegates responsibility + # of raising a "TimeoutError" exception to produce methods themselves. It also assumes + # that if one passes a negative timeout value to a produce method, it raises a + # "TimeoutError" exception correctly. + if timeout is not None: + timeout -= delta + + if primitive_2 is not None: + results.append(primitive_2.produce(inputs=inputs, timeout=timeout, iterations=iterations)) + + if not results: + raise exceptions.InvalidArgumentValueError("No primitives provided as hyper-parameters.") + + # Even if the structure of outputs is the same as inputs, conceptually, outputs are different, + # they are new data. So we do not reuse metadata from inputs but generate new metadata. + outputs = container.List([sum(x) for x in zip(*[result.value for result in results])], generate_metadata=True) + + # We return the maximum number of iterations done by any produce method we called. + iterations_done = None + for result in results: + if result.iterations_done is not None: + if iterations_done is None: + iterations_done = result.iterations_done + else: + iterations_done = max(iterations_done, result.iterations_done) + + return base.CallResult( + value=outputs, + has_finished=all(result.has_finished for result in results), + iterations_done=iterations_done, + ) diff --git a/axolotl/tests/data/primitives/test_primitives/random.py b/axolotl/tests/data/primitives/test_primitives/random.py new file mode 100644 index 0000000..6767311 --- /dev/null +++ b/axolotl/tests/data/primitives/test_primitives/random.py @@ -0,0 +1,154 @@ +import os.path +import typing + +import numpy # type: ignore + +from d3m import container, utils +from d3m.metadata import base as metadata_base, hyperparams +from d3m.primitive_interfaces import base, generator + +from . import __author__, __version__ + +__all__ = ('RandomPrimitive',) + + +Outputs = container.DataFrame + + +class Hyperparams(hyperparams.Hyperparams): + # These hyper-parameters can be both control or tuning parameter depending on their + # role in a pipeline. So it depends how a pipeline is constructed: with them having + # a fixed value or something which can be tuned. So they have two semantic types. + mu = hyperparams.Hyperparameter[float](default=0.0, semantic_types=[ + 'https://metadata.datadrivendiscovery.org/types/ControlParameter', + 'https://metadata.datadrivendiscovery.org/types/TuningParameter' + ]) + sigma = hyperparams.Hyperparameter[float](default=1.0, semantic_types=[ + 'https://metadata.datadrivendiscovery.org/types/ControlParameter', + 'https://metadata.datadrivendiscovery.org/types/TuningParameter' + ]) + + +class RandomPrimitive(generator.GeneratorPrimitiveBase[Outputs, None, Hyperparams]): + # It is important to provide a docstring because this docstring is used as a description of + # a primitive. Some callers might analyze it to determine the nature and purpose of a primitive. 
+ + """ + A primitive which draws random samples from a normal distribution. + """ + + # This should contain only metadata which cannot be automatically determined from the code. + metadata: typing.ClassVar[metadata_base.PrimitiveMetadata] = metadata_base.PrimitiveMetadata({ + # Simply an UUID generated once and fixed forever. Generated using "uuid.uuid4()". + 'id': 'df3153a1-4411-47e2-bbc0-9d5e9925ad79', + 'version': __version__, + 'name': "Random Samples", + # Keywords do not have a controlled vocabulary. Authors can put here whatever they find suitable. + 'keywords': ['test primitive'], + 'source': { + 'name': __author__, + 'contact': 'mailto:author@example.com', + 'uris': [ + # Unstructured URIs. Link to file and link to repo in this case. + 'https://gitlab.com/datadrivendiscovery/tests-data/blob/master/primitives/test_primitives/random.py', + 'https://gitlab.com/datadrivendiscovery/tests-data.git', + ], + }, + # A list of dependencies in order. These can be Python packages, system packages, or Docker images. + # Of course Python packages can also have their own dependencies, but sometimes it is necessary to + # install a Python package first to be even able to run setup.py of another package. Or you have + # a dependency which is not on PyPi. + 'installation': [{ + 'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/tests-data.git@{git_commit}#egg=test_primitives&subdirectory=primitives'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + }], + # URIs at which one can obtain code for the primitive, if available. + 'location_uris': [ + 'https://gitlab.com/datadrivendiscovery/tests-data/raw/{git_commit}/primitives/test_primitives/random.py'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + ], + # The same path the primitive is registered with entry points in setup.py. + 'python_path': 'd3m.primitives.data_generation.random.Test', + # Choose these from a controlled vocabulary in the schema. If anything is missing which would + # best describe the primitive, make a merge request. + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.MERSENNE_TWISTER, + metadata_base.PrimitiveAlgorithmType.NORMAL_DISTRIBUTION, + ], + 'primitive_family': metadata_base.PrimitiveFamily.DATA_GENERATION, + }) + + # It is not necessary to limit arguments this way, but we use it in tests to test that it is supported. + def __init__(self, *, hyperparams: Hyperparams, random_seed: int = 0) -> None: + super().__init__(hyperparams=hyperparams, random_seed=random_seed) + + def produce(self, *, inputs: container.List, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + # We get as an input a list of non-negative integers, indices into the set of random values. + # For each integer we redraw the number of samples up to that index at which time we return + # the last value, the value for that index. We add one to the index because index can start + # with 0 but we want to draw at least 1 number then. + # TODO: Optimize this if the inputs are a sequence of integers, we could reuse the state. + results = [numpy.random.RandomState(self.random_seed).normal(self.hyperparams['mu'], self.hyperparams['sigma'], i + 1)[-1] for i in inputs] + + # Outputs are different from inputs, so we do not reuse metadata from inputs but create new metadata. + # We convert the list to a container DataFrame which supports metadata attribute. 
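        # For example, with the default hyper-parameters, inputs [0, 1, 2] yield the first
        # three draws of the seeded normal sequence, one value per row of the resulting DataFrame.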
+ outputs = container.DataFrame({'results': results}, generate_metadata=True) + + # Wrap it into default "CallResult" object: we are not doing any iterations. + return base.CallResult(outputs) + + def set_training_data(self) -> None: # type: ignore + """ + A noop. + """ + + return + + def fit(self, *, timeout: float = None, iterations: int = None) -> base.CallResult[None]: + """ + A noop. + """ + + return base.CallResult(None) + + def get_params(self) -> None: + """ + A noop. + """ + + return None + + def set_params(self, *, params: None) -> None: + """ + A noop. + """ + + return + + def fit_multi_produce(self, *, produce_methods: typing.Sequence[str], inputs: container.List, timeout: float = None, iterations: int = None) -> base.MultiCallResult: # type: ignore + """ + A method calling ``fit`` and after that multiple produce methods at once. + + Parameters + ---------- + produce_methods : Sequence[str] + A list of names of produce methods to call. + inputs : List + The inputs given to all produce methods. + timeout : float + A maximum time this primitive should take to both fit the primitive and produce outputs + for all produce methods listed in ``produce_methods`` argument, in seconds. + iterations : int + How many of internal iterations should the primitive do for both fitting and producing + outputs of all produce methods. + + Returns + ------- + MultiCallResult + A dict of values for each produce method wrapped inside ``MultiCallResult``. + """ + + return self._fit_multi_produce(produce_methods=produce_methods, timeout=timeout, iterations=iterations, inputs=inputs) # type: ignore diff --git a/axolotl/tests/data/primitives/test_primitives/random_classifier.py b/axolotl/tests/data/primitives/test_primitives/random_classifier.py new file mode 100644 index 0000000..4db52c4 --- /dev/null +++ b/axolotl/tests/data/primitives/test_primitives/random_classifier.py @@ -0,0 +1,130 @@ +import os +import random +import typing + +from d3m import container, exceptions, utils +from d3m.metadata import base as metadata_base, hyperparams, params +from d3m.primitive_interfaces.base import CallResult, ContinueFitMixin +from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase + +from . import __author__, __version__ + +__all__ = ('RandomClassifierPrimitive',) + +Inputs = container.DataFrame +Outputs = container.DataFrame + + +class Params(params.Params): + classes: typing.Optional[typing.Sequence[typing.Any]] + random_state: typing.Any + + +class Hyperparams(hyperparams.Hyperparams): + pass + + +class RandomClassifierPrimitive(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams], + ContinueFitMixin[Inputs, Outputs, Params, Hyperparams]): + """ + A primitive randomly classify a class. For test purposes. + + It uses the first column of ``outputs`` as a target column. 
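
    A minimal usage sketch (an illustrative addition, assuming the module is
    importable as ``test_primitives.random_classifier``):

        from d3m import container
        from test_primitives.random_classifier import RandomClassifierPrimitive, Hyperparams

        primitive = RandomClassifierPrimitive(hyperparams=Hyperparams.defaults(), random_seed=42)
        train_inputs = container.DataFrame({'feature': [0.1, 0.2, 0.3]}, generate_metadata=True)
        train_outputs = container.DataFrame({'label': ['a', 'b', 'a']}, generate_metadata=True)
        primitive.set_training_data(inputs=train_inputs, outputs=train_outputs)
        primitive.fit()
        # One prediction per input row, drawn at random from the classes seen in fit ('a', 'b'),
        # returned as a DataFrame with a single 'predictions' column.
        predictions = primitive.produce(inputs=train_inputs).value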
+ """ + + metadata: typing.ClassVar[metadata_base.PrimitiveMetadata] = metadata_base.PrimitiveMetadata({ + 'id': 'b8d0d982-fc53-4a3f-8a8c-a284fdd45bfd', + 'version': __version__, + 'name': "Random Classifier", + 'python_path': 'd3m.primitives.classification.random_classifier.Test', + 'installation': [{ + 'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/tests-data.git@{git_commit}#egg=test_primitives&subdirectory=primitives'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + }], + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.BINARY_CLASSIFICATION, + metadata_base.PrimitiveAlgorithmType.MULTICLASS_CLASSIFICATION + ], + 'primitive_family': metadata_base.PrimitiveFamily.CLASSIFICATION, + 'source': { + 'name': __author__, + 'contact': 'mailto:author@example.com', + 'uris': [ + 'https://gitlab.com/datadrivendiscovery/tests-data/blob/master/primitives/test_primitives/random_classifier.py', + 'https://gitlab.com/datadrivendiscovery/tests-data.git', + ], + }, + 'location_uris': [ + 'https://gitlab.com/datadrivendiscovery/tests-data/raw/{git_commit}/primitives/test_primitives/random_classifier.py'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + ], + }) + + def __init__(self, *, hyperparams: Hyperparams, random_seed: int = 0) -> None: + super().__init__(hyperparams=hyperparams, random_seed=random_seed) + + self._random: random.Random = random.Random() + self._random.seed(random_seed) + self._training_outputs: Outputs = None + self._fitted = False + self._classes: typing.List = [] + + def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None: + self._training_outputs = outputs + self._fitted = False + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + if self._fitted: + return CallResult(None) + + if self._training_outputs is None: + raise exceptions.InvalidStateError("Missing training data.") + + self._classes = sorted(self._training_outputs.iloc[:, 0].unique().tolist()) + + self._fitted = True + + return CallResult(None) + + def continue_fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + if self._training_outputs is None: + raise exceptions.InvalidStateError("Missing training data.") + + _classes = self._training_outputs.iloc[:, 0].unique().tolist() + self._classes = sorted(set(self._classes + _classes)) + + self._fitted = True + + return CallResult(None) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + if not self._fitted: + raise exceptions.PrimitiveNotFittedError("Not fitted.") + + k = len(inputs) + predictions = self._random.choices(self._classes, k=k) # type: ignore + + result = container.DataFrame({'predictions': predictions}, generate_metadata=True) + + return CallResult(result) + + def get_params(self) -> Params: + if self._fitted: + return Params( + classes=self._classes, + random_state=self._random.getstate(), + ) + else: + return Params( + classes=None, + random_state=self._random.getstate(), + ) + + def set_params(self, *, params: Params) -> None: + self._classes = params['classes'] + self._random.setstate(params['random_state']) + if self._classes is not None: + self._fitted = True diff --git a/axolotl/tests/data/primitives/test_primitives/sum.py b/axolotl/tests/data/primitives/test_primitives/sum.py new file mode 100644 index 0000000..c9d9096 --- /dev/null +++ 
b/axolotl/tests/data/primitives/test_primitives/sum.py @@ -0,0 +1,151 @@ +import os.path +import pickle +import typing +from http import client + +import numpy # type: ignore + +from d3m import container, utils +from d3m.metadata import base as metadata_base, hyperparams +from d3m.primitive_interfaces import base, transformer + +from . import __author__, __version__ + +__all__ = ('SumPrimitive',) + + +DOCKER_KEY = 'summing' + +# It is useful to define these names, so that you can reuse it both +# for class type arguments and method signatures. +# This is just an example of how to define a more complicated input type, +# which is in fact more restrictive than what the primitive can really handle. +# One could probably just use "typing.Union[typing.Container]" in this case, if accepting +# a wide range of input types. +Inputs = typing.Union[container.ndarray, container.DataFrame, container.List] +Outputs = container.List + + +class Hyperparams(hyperparams.Hyperparams): + """ + No hyper-parameters for this primitive. + """ + + pass + + +class SumPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): + # It is important to provide a docstring because this docstring is used as a description of + # a primitive. Some callers might analyze it to determine the nature and purpose of a primitive. + + """ + A primitive which sums all the values on input into one number. + """ + + # This should contain only metadata which cannot be automatically determined from the code. + metadata: typing.ClassVar[metadata_base.PrimitiveMetadata] = metadata_base.PrimitiveMetadata({ + # Simply an UUID generated once and fixed forever. Generated using "uuid.uuid4()". + 'id': '9c00d42d-382d-4177-a0e7-082da88a29c8', + 'version': __version__, + 'name': "Sum Values", + # Keywords do not have a controlled vocabulary. Authors can put here whatever they find suitable. + 'keywords': ['test primitive'], + 'source': { + 'name': __author__, + 'contact': 'mailto:author@example.com', + 'uris': [ + # Unstructured URIs. Link to file and link to repo in this case. + 'https://gitlab.com/datadrivendiscovery/tests-data/blob/master/primitives/test_primitives/sum.py', + 'https://gitlab.com/datadrivendiscovery/tests-data.git', + ], + }, + # A list of dependencies in order. These can be Python packages, system packages, or Docker images. + # Of course Python packages can also have their own dependencies, but sometimes it is necessary to + # install a Python package first to be even able to run setup.py of another package. Or you have + # a dependency which is not on PyPi. + 'installation': [{ + 'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/tests-data.git@{git_commit}#egg=test_primitives&subdirectory=primitives'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + }, { + 'type': metadata_base.PrimitiveInstallationType.DOCKER, + # A key under which information about a running container will be provided to the primitive. + 'key': DOCKER_KEY, + 'image_name': 'registry.gitlab.com/datadrivendiscovery/tests-data/summing', + # Instead of a label, an exact hash of the image is required. This assures reproducibility. + # You can see digests using "docker images --digests". + 'image_digest': 'sha256:f75e21720e44cfa29d8a8e239b5746c715aa7cf99f9fde7916623fabc30d3364', + }], + # URIs at which one can obtain code for the primitive, if available. 
+ 'location_uris': [ + 'https://gitlab.com/datadrivendiscovery/tests-data/raw/{git_commit}/primitives/test_primitives/sum.py'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + ], + # The same path the primitive is registered with entry points in setup.py. + 'python_path': 'd3m.primitives.operator.sum.Test', + # Choose these from a controlled vocabulary in the schema. If anything is missing which would + # best describe the primitive, make a merge request. + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.COMPUTER_ALGEBRA, + ], + 'primitive_family': metadata_base.PrimitiveFamily.OPERATOR, + # A metafeature about preconditions required for this primitive to operate well. + 'preconditions': [ + # Instead of strings you can also use available Python enumerations. + metadata_base.PrimitivePrecondition.NO_MISSING_VALUES, + metadata_base.PrimitivePrecondition.NO_CATEGORICAL_VALUES, + ] + }) + + def __init__(self, *, hyperparams: Hyperparams, docker_containers: typing.Dict[str, base.DockerContainer] = None) -> None: + super().__init__(hyperparams=hyperparams, docker_containers=docker_containers) + + # We cannot check for expected ports here because during class construction, a mock value is passed which has empty ports dict. + if not self.docker_containers or DOCKER_KEY not in self.docker_containers: + raise ValueError("Docker key '{docker_key}' missing among provided Docker containers.".format(docker_key=DOCKER_KEY)) + + def _convert_value(self, value: typing.Any) -> typing.Union[numpy.ndarray, typing.List, typing.Any]: + # Server does not know about container types, just standard numpy arrays and lists. + if isinstance(value, container.ndarray): + return value.view(numpy.ndarray) + elif isinstance(value, container.List): + return [self._convert_value(v) for v in value] + else: + return value + + @base.singleton + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + # In the future, we should store here data in Arrow format into + # Plasma store and just pass an ObjectId of data over HTTP. + value = self._convert_value(inputs) + data = pickle.dumps(value) + + # TODO: Retry if connection fails. + # This connection can sometimes fail because the service inside a Docker container + # is not yet ready, despite container itself already running. Primitive should retry + # a few times before aborting. + + # Primitive knows the port the container is listening on. + connection = client.HTTPConnection(self.docker_containers[DOCKER_KEY].address, port=self.docker_containers[DOCKER_KEY].ports['8000/tcp']) + # This simple primitive does not keep any state in the Docker container. + # But if your primitive does have to associate requests with a primitive, consider + # using Python's "id(self)" call to get an identifier of a primitive's instance. + self.logger.debug("HTTP request: container=%(container)s", {'container': self.docker_containers[DOCKER_KEY]}, extra={'data': value}) + connection.request('POST', '/', data, { + 'Content-Type': 'multipart/form-data', + }) + response = connection.getresponse() + self.logger.debug("HTTP response: status=%(status)s", {'status': response.status}, extra={'response': response}) + + if response.status != 200: + raise ValueError("Invalid HTTP response status: {status}".format(status=response.status)) + + result = float(response.read()) + + # Outputs are different from inputs, so we do not reuse metadata from inputs but generate new metadata. 
+ outputs = container.List((result,), generate_metadata=True) + + # Wrap it into default "CallResult" object: we are not doing any iterations. + return base.CallResult(outputs) diff --git a/axolotl/tests/data/problems/boston_problem_1/problemDoc.json b/axolotl/tests/data/problems/boston_problem_1/problemDoc.json new file mode 100644 index 0000000..30d7d5d --- /dev/null +++ b/axolotl/tests/data/problems/boston_problem_1/problemDoc.json @@ -0,0 +1,36 @@ +{ + "about": { + "problemID": "boston_problem_1", + "problemName": "Predict median value of a home", + "problemSchemaVersion": "4.0.0", + "problemVersion": "4.0.0", + "taskKeywords": [ + "regression", + "univariate" + ] + }, + "inputs": { + "data": [ + { + "datasetID": "boston_dataset_1", + "targets": [ + { + "targetIndex": 0, + "resID": "learningData", + "colIndex": 14, + "colName": "MEDV" + } + ] + } + ], + "performanceMetrics": [ + { + "metric": "rSquared" + } + ] + }, + "expectedOutputs": { + "predictionsFile": "predictions.csv", + "scoresFile": "scores.csv" + } +} \ No newline at end of file diff --git a/axolotl/tests/data/problems/boston_problem_2/problemDoc.json b/axolotl/tests/data/problems/boston_problem_2/problemDoc.json new file mode 100644 index 0000000..61debd9 --- /dev/null +++ b/axolotl/tests/data/problems/boston_problem_2/problemDoc.json @@ -0,0 +1,36 @@ +{ + "about": { + "problemID": "boston_problem_2", + "problemName": "Predict nitrous oxide level", + "problemSchemaVersion": "4.0.0", + "problemVersion": "4.0.0", + "taskKeywords": [ + "regression", + "univariate" + ] + }, + "inputs": { + "data": [ + { + "datasetID": "boston_dataset_1", + "targets": [ + { + "targetIndex": 0, + "resID": "learningData", + "colIndex": 5, + "colName": "NOX" + } + ] + } + ], + "performanceMetrics": [ + { + "metric": "rSquared" + } + ] + }, + "expectedOutputs": { + "predictionsFile": "predictions.csv", + "scoresFile": "scores.csv" + } +} \ No newline at end of file diff --git a/axolotl/tests/data/problems/database_problem_2/problemDoc.json b/axolotl/tests/data/problems/database_problem_2/problemDoc.json new file mode 100644 index 0000000..05ad512 --- /dev/null +++ b/axolotl/tests/data/problems/database_problem_2/problemDoc.json @@ -0,0 +1,42 @@ +{ + "about": { + "problemID": "database_problem_2", + "problemName": "Database problem of type COUNTS_PER_USER", + "problemSchemaVersion": "4.0.0", + "problemVersion": "4.0.0", + "taskKeywords": [ + "regression", + "multivariate" + ] + }, + "inputs": { + "data": [ + { + "datasetID": "database_dataset_2", + "targets": [ + { + "targetIndex": 0, + "resID": "learningData", + "colIndex": 2, + "colName": "posts_count" + }, + { + "targetIndex": 1, + "resID": "learningData", + "colIndex": 3, + "colName": "comments_count" + } + ] + } + ], + "performanceMetrics": [ + { + "metric": "rootMeanSquaredError" + } + ] + }, + "expectedOutputs": { + "predictionsFile": "predictions.csv", + "scoresFile": "scores.csv" + } +} \ No newline at end of file diff --git a/axolotl/tests/data/problems/database_problem_3/problemDoc.json b/axolotl/tests/data/problems/database_problem_3/problemDoc.json new file mode 100644 index 0000000..1bf5a15 --- /dev/null +++ b/axolotl/tests/data/problems/database_problem_3/problemDoc.json @@ -0,0 +1,36 @@ +{ + "about": { + "problemID": "database_problem_3", + "problemName": "Database problem of type COMMENTS_PER_POST", + "problemSchemaVersion": "4.0.0", + "problemVersion": "4.0.0", + "taskKeywords": [ + "regression", + "univariate" + ] + }, + "inputs": { + "data": [ + { + "datasetID": 
"database_dataset_3", + "targets": [ + { + "targetIndex": 0, + "resID": "learningData", + "colIndex": 2, + "colName": "comments_count" + } + ] + } + ], + "performanceMetrics": [ + { + "metric": "rootMeanSquaredError" + } + ] + }, + "expectedOutputs": { + "predictionsFile": "predictions.csv", + "scoresFile": "scores.csv" + } +} \ No newline at end of file diff --git a/axolotl/tests/data/problems/database_problem_4/problemDoc.json b/axolotl/tests/data/problems/database_problem_4/problemDoc.json new file mode 100644 index 0000000..91cb471 --- /dev/null +++ b/axolotl/tests/data/problems/database_problem_4/problemDoc.json @@ -0,0 +1,37 @@ +{ + "about": { + "problemID": "database_problem_4", + "problemName": "Database problem of type HAS_USER_MADE_COMMENT_ON_POST", + "problemSchemaVersion": "4.0.0", + "problemVersion": "4.0.0", + "taskKeywords": [ + "classification", + "binary" + ] + }, + "inputs": { + "data": [ + { + "datasetID": "database_dataset_4", + "targets": [ + { + "targetIndex": 0, + "resID": "learningData", + "colIndex": 3, + "colName": "made_comment" + } + ] + } + ], + "performanceMetrics": [ + { + "metric": "f1", + "posLabel": "yes" + } + ] + }, + "expectedOutputs": { + "predictionsFile": "predictions.csv", + "scoresFile": "scores.csv" + } +} \ No newline at end of file diff --git a/axolotl/tests/data/problems/image_problem_2/problemDoc.json b/axolotl/tests/data/problems/image_problem_2/problemDoc.json new file mode 100644 index 0000000..f2b94be --- /dev/null +++ b/axolotl/tests/data/problems/image_problem_2/problemDoc.json @@ -0,0 +1,36 @@ +{ + "about": { + "problemID": "image_problem_2", + "problemName": "Multiclass image classification", + "problemDescription": "Multiclass image classification problem. Each image belongs to one of 10 classes. 
Based on 124_120_mnist_problem.", + "problemSchemaVersion": "4.0.0", + "problemVersion": "4.0.0", + "taskKeywords": [ + "classification", + "multiClass" + ] + }, + "inputs": { + "data": [ + { + "datasetID": "image_dataset_2", + "targets": [ + { + "targetIndex": 0, + "resID": "learningData", + "colIndex": 2, + "colName": "label" + } + ] + } + ], + "performanceMetrics": [ + { + "metric": "accuracy" + } + ] + }, + "expectedOutputs": { + "predictionsFile": "predictions.csv" + } +} \ No newline at end of file diff --git a/axolotl/tests/data/problems/iris_problem_1/dataSplits.csv b/axolotl/tests/data/problems/iris_problem_1/dataSplits.csv new file mode 100644 index 0000000..69d033e --- /dev/null +++ b/axolotl/tests/data/problems/iris_problem_1/dataSplits.csv @@ -0,0 +1,151 @@ +d3mIndex,type,repeat,fold +0,TRAIN,0,0 +1,TRAIN,0,0 +2,TEST,0,0 +3,TRAIN,0,0 +4,TEST,0,0 +5,TRAIN,0,0 +6,TRAIN,0,0 +7,TRAIN,0,0 +8,TRAIN,0,0 +9,TRAIN,0,0 +10,TRAIN,0,0 +11,TEST,0,0 +12,TRAIN,0,0 +13,TRAIN,0,0 +14,TRAIN,0,0 +15,TEST,0,0 +16,TRAIN,0,0 +17,TRAIN,0,0 +18,TRAIN,0,0 +19,TRAIN,0,0 +20,TRAIN,0,0 +21,TEST,0,0 +22,TRAIN,0,0 +23,TRAIN,0,0 +24,TEST,0,0 +25,TRAIN,0,0 +26,TEST,0,0 +27,TRAIN,0,0 +28,TEST,0,0 +29,TRAIN,0,0 +30,TRAIN,0,0 +31,TRAIN,0,0 +32,TEST,0,0 +33,TEST,0,0 +34,TRAIN,0,0 +35,TRAIN,0,0 +36,TRAIN,0,0 +37,TRAIN,0,0 +38,TRAIN,0,0 +39,TEST,0,0 +40,TEST,0,0 +41,TRAIN,0,0 +42,TRAIN,0,0 +43,TRAIN,0,0 +44,TEST,0,0 +45,TRAIN,0,0 +46,TRAIN,0,0 +47,TEST,0,0 +48,TRAIN,0,0 +49,TRAIN,0,0 +50,TEST,0,0 +51,TRAIN,0,0 +52,TEST,0,0 +53,TEST,0,0 +54,TEST,0,0 +55,TRAIN,0,0 +56,TRAIN,0,0 +57,TEST,0,0 +58,TRAIN,0,0 +59,TEST,0,0 +60,TRAIN,0,0 +61,TEST,0,0 +62,TRAIN,0,0 +63,TEST,0,0 +64,TRAIN,0,0 +65,TRAIN,0,0 +66,TEST,0,0 +67,TEST,0,0 +68,TRAIN,0,0 +69,TRAIN,0,0 +70,TRAIN,0,0 +71,TRAIN,0,0 +72,TRAIN,0,0 +73,TRAIN,0,0 +74,TRAIN,0,0 +75,TRAIN,0,0 +76,TRAIN,0,0 +77,TRAIN,0,0 +78,TRAIN,0,0 +79,TRAIN,0,0 +80,TEST,0,0 +81,TEST,0,0 +82,TEST,0,0 +83,TRAIN,0,0 +84,TRAIN,0,0 +85,TRAIN,0,0 +86,TRAIN,0,0 +87,TRAIN,0,0 +88,TRAIN,0,0 +89,TRAIN,0,0 +90,TRAIN,0,0 +91,TEST,0,0 +92,TEST,0,0 +93,TRAIN,0,0 +94,TRAIN,0,0 +95,TEST,0,0 +96,TRAIN,0,0 +97,TRAIN,0,0 +98,TRAIN,0,0 +99,TEST,0,0 +100,TRAIN,0,0 +101,TEST,0,0 +102,TEST,0,0 +103,TRAIN,0,0 +104,TRAIN,0,0 +105,TRAIN,0,0 +106,TRAIN,0,0 +107,TRAIN,0,0 +108,TEST,0,0 +109,TEST,0,0 +110,TRAIN,0,0 +111,TRAIN,0,0 +112,TRAIN,0,0 +113,TEST,0,0 +114,TRAIN,0,0 +115,TEST,0,0 +116,TRAIN,0,0 +117,TRAIN,0,0 +118,TRAIN,0,0 +119,TRAIN,0,0 +120,TRAIN,0,0 +121,TEST,0,0 +122,TEST,0,0 +123,TRAIN,0,0 +124,TRAIN,0,0 +125,TEST,0,0 +126,TRAIN,0,0 +127,TEST,0,0 +128,TRAIN,0,0 +129,TEST,0,0 +130,TRAIN,0,0 +131,TRAIN,0,0 +132,TRAIN,0,0 +133,TRAIN,0,0 +134,TEST,0,0 +135,TRAIN,0,0 +136,TRAIN,0,0 +137,TEST,0,0 +138,TRAIN,0,0 +139,TRAIN,0,0 +140,TRAIN,0,0 +141,TRAIN,0,0 +142,TRAIN,0,0 +143,TRAIN,0,0 +144,TEST,0,0 +145,TRAIN,0,0 +146,TRAIN,0,0 +147,TRAIN,0,0 +148,TRAIN,0,0 +149,TRAIN,0,0 diff --git a/axolotl/tests/data/problems/iris_problem_1/problemDoc.json b/axolotl/tests/data/problems/iris_problem_1/problemDoc.json new file mode 100644 index 0000000..7cb357b --- /dev/null +++ b/axolotl/tests/data/problems/iris_problem_1/problemDoc.json @@ -0,0 +1,45 @@ +{ + "about": { + "problemID": "iris_problem_1", + "problemName": "Distinguish Iris flowers", + "problemDescription": "Distinguish Iris flowers of three related species.", + "problemSchemaVersion": "4.0.0", + "problemVersion": "4.0.0", + "taskKeywords": [ + "classification", + "multiClass" + ] + }, + "inputs": { + "data": [ + { + "datasetID": "iris_dataset_1", + "targets": [ 
+ { + "targetIndex": 0, + "resID": "learningData", + "colIndex": 5, + "colName": "species" + } + ] + } + ], + "dataSplits": { + "method": "holdOut", + "testSize": 0.3, + "numFolds": 0, + "stratified": false, + "numRepeats": 0, + "splitsFile": "dataSplits.csv" + }, + "performanceMetrics": [ + { + "metric": "accuracy" + } + ] + }, + "expectedOutputs": { + "predictionsFile": "predictions.csv", + "scoresFile": "scores.csv" + } +} \ No newline at end of file diff --git a/axolotl/tests/data/problems/iris_problem_2/problemDoc.json b/axolotl/tests/data/problems/iris_problem_2/problemDoc.json new file mode 100644 index 0000000..a36588e --- /dev/null +++ b/axolotl/tests/data/problems/iris_problem_2/problemDoc.json @@ -0,0 +1,36 @@ +{ + "about": { + "problemID": "iris_problem_2", + "problemName": "Distinguish Iris flowers", + "problemDescription": "Distinguish Iris flowers of three related species, without datasetID in targets.", + "problemSchemaVersion": "4.0.0", + "problemVersion": "4.0.0", + "taskKeywords": [ + "classification", + "multiClass" + ] + }, + "inputs": { + "data": [ + { + "targets": [ + { + "targetIndex": 0, + "resID": "learningData", + "colIndex": 5, + "colName": "species" + } + ] + } + ], + "performanceMetrics": [ + { + "metric": "accuracy" + } + ] + }, + "expectedOutputs": { + "predictionsFile": "predictions.csv", + "scoresFile": "scores.csv" + } +} diff --git a/axolotl/tests/data/problems/multi_dataset_problem/problemDoc.json b/axolotl/tests/data/problems/multi_dataset_problem/problemDoc.json new file mode 100644 index 0000000..c723d57 --- /dev/null +++ b/axolotl/tests/data/problems/multi_dataset_problem/problemDoc.json @@ -0,0 +1,48 @@ +{ + "about": { + "problemID": "multi_input_problem", + "problemName": "Problem associate with multiple dataset", + "problemDescription": "Distinguish Iris flowers of three related species.", + "problemSchemaVersion": "4.0.0", + "problemVersion": "4.0.0", + "taskKeywords": [ + "classification", + "multiClass" + ] + }, + "inputs": { + "data": [ + { + "datasetID": "iris_dataset_1", + "targets": [ + { + "targetIndex": 0, + "resID": "learningData", + "colIndex": 5, + "colName": "species" + } + ] + }, + { + "datasetID": "boston_dataset_1", + "targets": [ + { + "targetIndex": 0, + "resID": "learningData", + "colIndex": 14, + "colName": "MEDV" + } + ] + } + ], + "performanceMetrics": [ + { + "metric": "accuracy" + } + ] + }, + "expectedOutputs": { + "predictionsFile": "predictions.csv", + "scoresFile": "scores.csv" + } +} \ No newline at end of file diff --git a/axolotl/tests/resources/logistic_regeression.json b/axolotl/tests/resources/logistic_regeression.json new file mode 100644 index 0000000..5f30a0c --- /dev/null +++ b/axolotl/tests/resources/logistic_regeression.json @@ -0,0 +1,146 @@ +{ + "id": "b9cc24a0-30ce-4fe2-adde-77af46987f60", + "schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", + "created": "2020-06-26T02:36:34.125148Z", + "inputs": [ + { + "name": "inputs" + } + ], + "outputs": [ + { + "data": "steps.4.produce", + "name": "output predictions" + } + ], + "steps": [ + { + "type": "PRIMITIVE", + "primitive": { + "id": "4b42ce1e-9b98-4a25-b68e-fad13311eb65", + "version": "0.3.0", + "python_path": "d3m.primitives.data_transformation.dataset_to_dataframe.Common", + "name": "Extract a DataFrame from a Dataset" + }, + "arguments": { + "inputs": { + "type": "CONTAINER", + "data": "inputs.0" + } + }, + "outputs": [ + { + "id": "produce" + } + ] + }, + { + "type": "PRIMITIVE", + "primitive": { + "id": 
"d510cb7a-1782-4f51-b44c-58f0236e47c7", + "version": "0.6.0", + "python_path": "d3m.primitives.data_transformation.column_parser.Common", + "name": "Parses strings into their types" + }, + "arguments": { + "inputs": { + "type": "CONTAINER", + "data": "steps.0.produce" + } + }, + "outputs": [ + { + "id": "produce" + } + ] + }, + { + "type": "PRIMITIVE", + "primitive": { + "id": "d016df89-de62-3c53-87ed-c06bb6a23cde", + "version": "2020.6.24", + "python_path": "d3m.primitives.data_cleaning.imputer.SKlearn", + "name": "sklearn.impute.SimpleImputer" + }, + "arguments": { + "inputs": { + "type": "CONTAINER", + "data": "steps.1.produce" + } + }, + "outputs": [ + { + "id": "produce" + } + ], + "hyperparams": { + "use_semantic_types": { + "type": "VALUE", + "data": true + }, + "return_result": { + "type": "VALUE", + "data": "replace" + } + } + }, + { + "type": "PRIMITIVE", + "primitive": { + "id": "b9c81b40-8ed1-3b23-80cf-0d6fe6863962", + "version": "2020.6.24", + "python_path": "d3m.primitives.classification.logistic_regression.SKlearn", + "name": "sklearn.linear_model.logistic.LogisticRegression" + }, + "arguments": { + "inputs": { + "type": "CONTAINER", + "data": "steps.2.produce" + }, + "outputs": { + "type": "CONTAINER", + "data": "steps.2.produce" + } + }, + "outputs": [ + { + "id": "produce" + } + ], + "hyperparams": { + "use_semantic_types": { + "type": "VALUE", + "data": true + }, + "add_index_columns": { + "type": "VALUE", + "data": true + } + } + }, + { + "type": "PRIMITIVE", + "primitive": { + "id": "8d38b340-f83f-4877-baaa-162f8e551736", + "version": "0.3.0", + "python_path": "d3m.primitives.data_transformation.construct_predictions.Common", + "name": "Construct pipeline predictions output" + }, + "arguments": { + "inputs": { + "type": "CONTAINER", + "data": "steps.3.produce" + }, + "reference": { + "type": "CONTAINER", + "data": "steps.0.produce" + } + }, + "outputs": [ + { + "id": "produce" + } + ] + } + ] +} \ No newline at end of file diff --git a/axolotl/tests/resources/svc_pipeline.json b/axolotl/tests/resources/svc_pipeline.json new file mode 100644 index 0000000..13d532c --- /dev/null +++ b/axolotl/tests/resources/svc_pipeline.json @@ -0,0 +1,146 @@ +{ + "id": "c41cbe88-7caf-45a3-a7e1-77dda65709b5", + "schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", + "created": "2020-06-26T02:36:35.138147Z", + "inputs": [ + { + "name": "inputs" + } + ], + "outputs": [ + { + "data": "steps.4.produce", + "name": "output predictions" + } + ], + "steps": [ + { + "type": "PRIMITIVE", + "primitive": { + "id": "4b42ce1e-9b98-4a25-b68e-fad13311eb65", + "version": "0.3.0", + "python_path": "d3m.primitives.data_transformation.dataset_to_dataframe.Common", + "name": "Extract a DataFrame from a Dataset" + }, + "arguments": { + "inputs": { + "type": "CONTAINER", + "data": "inputs.0" + } + }, + "outputs": [ + { + "id": "produce" + } + ] + }, + { + "type": "PRIMITIVE", + "primitive": { + "id": "d510cb7a-1782-4f51-b44c-58f0236e47c7", + "version": "0.6.0", + "python_path": "d3m.primitives.data_transformation.column_parser.Common", + "name": "Parses strings into their types" + }, + "arguments": { + "inputs": { + "type": "CONTAINER", + "data": "steps.0.produce" + } + }, + "outputs": [ + { + "id": "produce" + } + ] + }, + { + "type": "PRIMITIVE", + "primitive": { + "id": "d016df89-de62-3c53-87ed-c06bb6a23cde", + "version": "2020.6.24", + "python_path": "d3m.primitives.data_cleaning.imputer.SKlearn", + "name": "sklearn.impute.SimpleImputer" + }, + "arguments": { + "inputs": { + 
"type": "CONTAINER", + "data": "steps.1.produce" + } + }, + "outputs": [ + { + "id": "produce" + } + ], + "hyperparams": { + "use_semantic_types": { + "type": "VALUE", + "data": true + }, + "return_result": { + "type": "VALUE", + "data": "replace" + } + } + }, + { + "type": "PRIMITIVE", + "primitive": { + "id": "0ae7d42d-f765-3348-a28c-57d94880aa6a", + "version": "2020.6.24", + "python_path": "d3m.primitives.classification.svc.SKlearn", + "name": "sklearn.svm.classes.SVC" + }, + "arguments": { + "inputs": { + "type": "CONTAINER", + "data": "steps.2.produce" + }, + "outputs": { + "type": "CONTAINER", + "data": "steps.2.produce" + } + }, + "outputs": [ + { + "id": "produce" + } + ], + "hyperparams": { + "use_semantic_types": { + "type": "VALUE", + "data": true + }, + "add_index_columns": { + "type": "VALUE", + "data": true + } + } + }, + { + "type": "PRIMITIVE", + "primitive": { + "id": "8d38b340-f83f-4877-baaa-162f8e551736", + "version": "0.3.0", + "python_path": "d3m.primitives.data_transformation.construct_predictions.Common", + "name": "Construct pipeline predictions output" + }, + "arguments": { + "inputs": { + "type": "CONTAINER", + "data": "steps.3.produce" + }, + "reference": { + "type": "CONTAINER", + "data": "steps.0.produce" + } + }, + "outputs": [ + { + "id": "produce" + } + ] + } + ] +} \ No newline at end of file diff --git a/axolotl/tests/test_algorithms_dummy.py b/axolotl/tests/test_algorithms_dummy.py new file mode 100644 index 0000000..429fac7 --- /dev/null +++ b/axolotl/tests/test_algorithms_dummy.py @@ -0,0 +1,55 @@ +from pathlib import Path +import unittest +import tempfile +import shutil + +from d3m.metadata import problem as problem_module +from d3m import container + +from axolotl.backend.simple import SimpleRunner +from axolotl.algorithms.dummy import DummySearch + + +class SimpleSearch(unittest.TestCase): + def setUp(self): + self.test_dir = tempfile.mkdtemp() + + def tearDown(self): + shutil.rmtree(self.test_dir) + + def test_search_fit_produce(self): + problem_description, dataset = get_data() + + backend = SimpleRunner(random_seed=42, volumes_dir=None, scratch_dir=self.test_dir) + dummy_search = DummySearch(problem_description=problem_description, backend=backend) + + # check if we were able to find and fit + fitted_pipeline, pipeline_result = dummy_search.search_fit(input_data=[dataset], time_limit=100) + self.assertEqual(pipeline_result.error, None) + + # check first history entry + self.assertEqual(dummy_search.history[0].scores.values.tolist()[0], [ + 'ACCURACY', 0.9133333333333333, 0.9133333333333333, 42, 0]) + + # test if we can produce the same training input + pipeline_result = dummy_search.produce(fitted_pipeline, [dataset]) + self.assertEqual(pipeline_result.error, None) + + +def get_data(dataset_name='iris_dataset_1', problem_name='iris_problem_1'): + if problem_name: + problem_doc_path = Path( + Path(__file__).parent.absolute(), 'data', 'problems', problem_name, 'problemDoc.json' + ).as_uri() + problem_description = problem_module.get_problem(problem_doc_path) + else: + problem_description = None + + dataset_doc_path = Path(Path(__file__).parent.absolute(), 'data', 'datasets', + dataset_name, 'datasetDoc.json').as_uri() + iris_dataset = container.dataset.get_dataset(dataset_doc_path) + return problem_description, iris_dataset + + +if __name__ == '__main__': + unittest.main() diff --git a/axolotl/tests/test_autokeras.py b/axolotl/tests/test_autokeras.py new file mode 100644 index 0000000..174cdef --- /dev/null +++ b/axolotl/tests/test_autokeras.py @@ 
-0,0 +1,82 @@ +import pathlib +import shutil +import sys +import unittest + +import os +import tempfile + +from axolotl.algorithms.autokeras_search import AutoKerasSearch +from axolotl.backend.simple import SimpleRunner + +PROJECT_ROOT = os.path.join(os.path.dirname(__file__), '..') +sys.path.insert(0, PROJECT_ROOT) + +from d3m.metadata import problem as problem_module +from d3m import container as container_module + + +class TestAutoKeras(unittest.TestCase): + def setUp(self): + self.test_dir = tempfile.mkdtemp() + self.backend = SimpleRunner(random_seed=42, volumes_dir=None, scratch_dir=self.test_dir) + + + def tearDown(self): + shutil.rmtree(self.test_dir) + + def test_fit(self): + test_data = os.path.join(PROJECT_ROOT, 'tests', 'data') + dataset_name = 'image_dataset_2' + + dataset_path = os.path.join( + test_data, 'datasets', dataset_name, 'datasetDoc.json') + dataset = self.__get_dataset(dataset_path) + + problem_path = os.path.join( + test_data, 'problems', dataset_name.replace('dataset', 'problem'), 'problemDoc.json') + problem = self.__get_problem(problem_path) + + tuner_base = AutoKerasSearch(problem, backend=self.backend, max_trials=1, directory=self.test_dir) + pipeline_result = tuner_base.search_fit(input_data=[dataset], time_limit=1000) + # TODO https://gitlab.com/datadrivendiscovery/jpl-primitives/-/issues/41 + self.assertNotEqual(pipeline_result.error, None) + + def _fit_cifar10(self): + test_data = os.path.join('/data/d3m/datasets/seed_datasets_current') + dataset_name = '124_174_cifar10_MIN_METADATA' + + dataset_path = os.path.join( + test_data, dataset_name, '{}_dataset'.format(dataset_name), 'datasetDoc.json') + dataset = self.__get_dataset(dataset_path) + + problem_path = os.path.join( + test_data, dataset_name, '{}_problem'.format(dataset_name), 'problemDoc.json') + problem = self.__get_problem(problem_path) + + tuner_base = AutoKerasSearch(problem, backend=self.backend, max_trials=1, directory=self.test_dir) + pipeline_result = tuner_base.search_fit(input_data=[dataset], time_limit=1000) + # TODO https://gitlab.com/datadrivendiscovery/jpl-primitives/-/issues/41 + self.assertNotEqual(pipeline_result.error, None) + + def __get_uri(self, path): + return pathlib.Path(os.path.abspath(path)).as_uri() + + def __get_problem(self, problem_path): + problem_uri = self.__get_uri(problem_path) + problem = problem_module.Problem.load(problem_uri) + return problem + + def __get_dataset(self, dataset_path): + dataset_uri = self.__get_uri(dataset_path) + dataset = container_module.dataset.get_dataset(dataset_uri) + return dataset + + +if __name__ == '__main__': + suite = unittest.TestSuite() + for test_case in ( + 'test_fit', + ): + suite.addTest(TestAutoKeras(test_case)) + unittest.TextTestRunner(verbosity=2).run(suite) diff --git a/axolotl/tests/test_backend_ray.py b/axolotl/tests/test_backend_ray.py new file mode 100644 index 0000000..1eeed1a --- /dev/null +++ b/axolotl/tests/test_backend_ray.py @@ -0,0 +1,105 @@ +import ray +import json +from pathlib import Path +import unittest +import tempfile +import shutil + +from d3m.metadata import problem as problem_module +from d3m import container + +from axolotl.backend.ray import RayRunner +from axolotl.utils import schemas as schemas_utils, pipeline as pipeline_utils + + +class SimpleRunnerTestCase(unittest.TestCase): + def setUp(self): + self.test_dir = tempfile.mkdtemp() + + def tearDown(self): + shutil.rmtree(self.test_dir) + + def test_fit_produce_pipelines(self): + pipeline = get_classification_pipeline() + problem_description, 
dataset = get_data() + ray_runner = RayRunner(random_seed=42, volumes_dir=None, scratch_dir=self.test_dir, n_workers=1) + result = ray_runner.fit_pipeline(problem_description=problem_description, + pipeline=pipeline, input_data=[dataset]) + + self.assertEqual(result.status, 'COMPLETED') + + result = ray_runner.produce_pipeline(fitted_pipeline_id=result.fitted_pipeline_id, input_data=[dataset]) + self.assertEqual(result.status, 'COMPLETED') + + def test_evaluate_pipeline(self): + pipeline = get_classification_pipeline() + ray_runner = RayRunner(random_seed=42, volumes_dir=None, scratch_dir=self.test_dir, n_workers=1) + problem_description, dataset = get_data() + data_pipeline = schemas_utils.get_splitting_pipeline("TRAINING_DATA") + scoring_pipeline = schemas_utils.get_scoring_pipeline() + + no_split = schemas_utils.DATA_PREPARATION_PARAMS['no_split'] + + result = ray_runner.evaluate_pipeline( + problem_description=problem_description, pipeline=pipeline, + input_data=[dataset], metrics=schemas_utils.MULTICLASS_CLASSIFICATION_METRICS, + data_preparation_pipeline=data_pipeline, scoring_pipeline=scoring_pipeline, + data_preparation_params=no_split + ) + + self.assertEqual(result.error, None) + self.assertEqual(result.scores.values.tolist(), [ + ['ACCURACY', 0.9133333333333333, 0.9133333333333333, 42, 0], + ['F1_MICRO', 0.9133333333333333, 0.9133333333333333, 42, 0], + ['F1_MACRO', 0.9123688388315397, 0.9123688388315397, 42, 0]] + ) + + def test_evaluate_pipelines(self): + pipeline = get_classification_pipeline() + ray_runner = RayRunner(random_seed=42, volumes_dir=None, scratch_dir=self.test_dir, n_workers=1) + problem_description, dataset = get_data() + data_pipeline = schemas_utils.get_splitting_pipeline("TRAINING_DATA") + scoring_pipeline = schemas_utils.get_scoring_pipeline() + + no_split = schemas_utils.DATA_PREPARATION_PARAMS['no_split'] + + results = ray_runner.evaluate_pipelines( + problem_description=problem_description, pipelines=[pipeline] * 3, + input_data=[dataset], metrics=schemas_utils.MULTICLASS_CLASSIFICATION_METRICS, + data_preparation_pipeline=data_pipeline, scoring_pipeline=scoring_pipeline, + data_preparation_params=no_split + ) + + for result in results: + self.assertEqual(result.error, None) + self.assertEqual(result.status, 'COMPLETED') + + +def get_classification_pipeline(): + with open(schemas_utils.PIPELINES_DB_DIR) as file: + default_pipelines = json.load(file) + + return pipeline_utils.load_pipeline(default_pipelines['CLASSIFICATION'][0]) + + +def get_data(dataset_name='iris_dataset_1', problem_name='iris_problem_1'): + if problem_name: + problem_doc_path = Path( + Path(__file__).parent.absolute(), 'data', 'problems', problem_name, 'problemDoc.json' + ).as_uri() + problem_description = problem_module.get_problem(problem_doc_path) + else: + problem_description = None + + dataset_doc_path = Path(Path(__file__).parent.absolute(), 'data', 'datasets', + dataset_name, 'datasetDoc.json').as_uri() + iris_dataset = container.dataset.get_dataset(dataset_doc_path) + return problem_description, iris_dataset + + +if __name__ == '__main__': + ray.init() + unittest.main() + ray.shutdown() + + diff --git a/axolotl/tests/test_backend_simple.py b/axolotl/tests/test_backend_simple.py new file mode 100644 index 0000000..93abe92 --- /dev/null +++ b/axolotl/tests/test_backend_simple.py @@ -0,0 +1,82 @@ +import json +from pathlib import Path +import unittest +import tempfile +import shutil + +from d3m.metadata import problem as problem_module +from d3m import container + +from 
axolotl.backend.simple import SimpleRunner +from axolotl.utils import schemas as schemas_utils, pipeline as pipeline_utils + + +class SimpleRunnerTestCase(unittest.TestCase): + def setUp(self): + self.test_dir = tempfile.mkdtemp() + + def tearDown(self): + shutil.rmtree(self.test_dir) + + def test_fit_produce_pipelines(self): + pipeline = get_classification_pipeline() + problem_description, dataset = get_data() + simple_runner = SimpleRunner(random_seed=42, volumes_dir=None, scratch_dir=self.test_dir) + result = simple_runner.fit_pipeline(problem_description=problem_description, pipeline=pipeline, + input_data=[dataset]) + self.assertEqual(result.status, 'COMPLETED') + + result = simple_runner.produce_pipeline(fitted_pipeline_id=result.fitted_pipeline_id, input_data=[dataset]) + self.assertEqual(result.status, 'COMPLETED') + + def test_evaluate_pipelines(self): + pipeline = get_classification_pipeline() + simple_runner = SimpleRunner(random_seed=42, volumes_dir=None, scratch_dir=self.test_dir) + problem_description, dataset = get_data() + data_pipeline = schemas_utils.get_splitting_pipeline("TRAINING_DATA") + scoring_pipeline = schemas_utils.get_scoring_pipeline() + + no_split = schemas_utils.DATA_PREPARATION_PARAMS['no_split'] + + result = simple_runner.evaluate_pipeline( + problem_description=problem_description, pipeline=pipeline, + input_data=[dataset], metrics=schemas_utils.MULTICLASS_CLASSIFICATION_METRICS, + data_preparation_pipeline=data_pipeline, scoring_pipeline=scoring_pipeline, + data_preparation_params=no_split + ) + + # result = list(results.values())[0] + self.assertEqual(result.error, None) + self.assertEqual(result.scores.values.tolist(), [ + ['ACCURACY', 0.9133333333333333, 0.9133333333333333, 42, 0], + ['F1_MICRO', 0.9133333333333333, 0.9133333333333333, 42, 0], + ['F1_MACRO', 0.9123688388315397, 0.9123688388315397, 42, 0]] + ) + + +def get_classification_pipeline(): + with open(schemas_utils.PIPELINES_DB_DIR) as file: + default_pipelines = json.load(file) + + return pipeline_utils.load_pipeline(default_pipelines['CLASSIFICATION'][0]) + + +def get_data(dataset_name='iris_dataset_1', problem_name='iris_problem_1'): + if problem_name: + problem_doc_path = Path( + Path(__file__).parent.absolute(), 'data', 'problems', problem_name, 'problemDoc.json' + ).as_uri() + problem_description = problem_module.get_problem(problem_doc_path) + else: + problem_description = None + + dataset_doc_path = Path(Path(__file__).parent.absolute(), 'data', 'datasets', + dataset_name, 'datasetDoc.json').as_uri() + iris_dataset = container.dataset.get_dataset(dataset_doc_path) + return problem_description, iris_dataset + + +if __name__ == '__main__': + unittest.main() + + diff --git a/axolotl/tests/test_bayesian.py b/axolotl/tests/test_bayesian.py new file mode 100644 index 0000000..53bc92a --- /dev/null +++ b/axolotl/tests/test_bayesian.py @@ -0,0 +1,93 @@ +import pathlib + +import ray +import shutil +import sys +import unittest + +import os +import tempfile +from axolotl.backend.ray import RayRunner + +from axolotl.algorithms.bayesian_search import BayesianSearch +from axolotl.backend.simple import SimpleRunner + +PROJECT_ROOT = os.path.join(os.path.dirname(__file__), '..') +sys.path.insert(0, PROJECT_ROOT) + +from d3m.metadata import problem as problem_module +from d3m import container as container_module +from axolotl.utils import pipeline as pipeline_utils + + +class TestBayesianSearch(unittest.TestCase): + def setUp(self): + self.test_data = os.path.join(PROJECT_ROOT, 'tests', 'data') + 
dataset_name = 'iris_dataset_1' + problem = self.__get_problem(dataset_name) + self.problem = problem + self.dataset = self.__get_dataset(dataset_name) + self.test_dir = tempfile.mkdtemp() + backend = SimpleRunner(random_seed=42, volumes_dir=None, scratch_dir=self.test_dir) + self.tuner_base = BayesianSearch(problem, backend=backend, max_trials=10, directory=self.test_dir, + num_initial_points=5) + + def tearDown(self): + shutil.rmtree(self.test_dir) + + def test_fit(self): + _, pipeline_result = self.tuner_base.search_fit(input_data=[self.dataset], time_limit=60) + self.assertEqual(pipeline_result.error, None) + + def test_fit_svc(self): + pipeline_info = os.path.join(os.path.dirname(__file__), 'resources', 'svc_pipeline.json') + pipeline = pipeline_utils.load_pipeline(pipeline_info) + _, pipeline_result = self.tuner_base.search_fit(input_data=[self.dataset], time_limit=60, + pipeline_candidates=[pipeline]) + self.assertEqual(pipeline_result.error, None) + + def test_fit_lr(self): + pipeline_info = os.path.join(os.path.dirname(__file__), 'resources', 'logistic_regeression.json') + pipeline = pipeline_utils.load_pipeline(pipeline_info) + _, pipeline_result = self.tuner_base.search_fit(input_data=[self.dataset], time_limit=60, + pipeline_candidates=[pipeline]) + self.assertEqual(pipeline_result.error, None) + + def test_fit_ray(self): + if not ray.is_initialized(): + ray.init() + backend = RayRunner(random_seed=42, volumes_dir=None, scratch_dir=self.test_dir) + tuner_base = BayesianSearch(self.problem, backend=backend, max_trials=30, directory=self.test_dir, + num_initial_points=5) + _, pipeline_result = tuner_base.search_fit(input_data=[self.dataset], time_limit=100) + self.assertEqual(pipeline_result.error, None) + ray.shutdown() + + def __get_uri(self, path): + return pathlib.Path(os.path.abspath(path)).as_uri() + + def __get_problem(self, dataset_name): + problem_path = os.path.join( + self.test_data, 'problems', dataset_name.replace('dataset', 'problem'), 'problemDoc.json') + problem_uri = self.__get_uri(problem_path) + problem = problem_module.Problem.load(problem_uri) + return problem + + def __get_dataset(self, dataset_name): + dataset_path = os.path.join( + self.test_data, 'datasets', dataset_name, 'datasetDoc.json') + dataset_uri = self.__get_uri(dataset_path) + dataset = container_module.dataset.get_dataset(dataset_uri) + return dataset + + +if __name__ == '__main__': + suite = unittest.TestSuite() + for test_case in ( + 'test_fit', + 'test_fit_ray', + 'test_fit_lr', + 'test_fit_svc', + ): + suite.addTest(TestBayesianSearch(test_case)) + unittest.TextTestRunner(verbosity=2).run(suite) diff --git a/axolotl/tests/test_predefine_pipelines.py b/axolotl/tests/test_predefine_pipelines.py new file mode 100644 index 0000000..5e9991c --- /dev/null +++ b/axolotl/tests/test_predefine_pipelines.py @@ -0,0 +1,85 @@ +import os +import pathlib +import unittest +import sys +PROJECT_ROOT = os.path.join(os.path.dirname(__file__), '..') +sys.path.insert(0, PROJECT_ROOT) + +from d3m.runtime import Runtime +from d3m.metadata import base as metadata_base, problem as problem_module +from d3m import container as container_module + +import axolotl.predefined_pipelines as predefined_pipelines + + +class TestPredefined(unittest.TestCase): + def setUp(self): + self.test_data = os.path.join(PROJECT_ROOT, 'tests', 'data') + + def tearDown(self): + pass + + def test_fetch_from_file(self): + dataset_name = 'iris_dataset_1' + problem = self.__get_problem(dataset_name) + dataset = self.__get_dataset(dataset_name) 
+ predefined_path = os.path.join(PROJECT_ROOT, 'axolotl/utils/resources/default_pipelines.json') + pipelines = predefined_pipelines.fetch_from_file(problem, predefined_path) + self.assertNotEqual(len(pipelines), 0) + result = self.__run_pipeline(pipelines[0], dataset) + result.check_success() + self.assertEqual(result.error, None) + + def test__fetch_from_preprocessors(self): + dataset_name = 'iris_dataset_1' + problem = self.__get_problem(dataset_name) + dataset = self.__get_dataset(dataset_name) + pipelines = predefined_pipelines._fetch_from_preprocessors(dataset, problem) + self.assertNotEqual(len(pipelines), 0) + result = self.__run_pipeline(pipelines[0], dataset) + result.check_success() + self.assertEqual(result.error, None) + + def test_fetch(self): + dataset_name = 'iris_dataset_1' + problem = self.__get_problem(dataset_name) + dataset = self.__get_dataset(dataset_name) + pipelines = predefined_pipelines.fetch(dataset, problem) + self.assertNotEqual(len(pipelines), 0) + result = self.__run_pipeline(pipelines[-1], dataset) + result.check_success() + self.assertEqual(result.error, None) + + def __get_uri(self, path): + return pathlib.Path(os.path.abspath(path)).as_uri() + + def __get_problem(self, dataset_name): + problem_path = os.path.join( + self.test_data, 'problems', dataset_name.replace('dataset', 'problem'), 'problemDoc.json') + problem_uri = self.__get_uri(problem_path) + problem = problem_module.Problem.load(problem_uri) + return problem + + def __get_dataset(self, dataset_name): + dataset_path = os.path.join( + self.test_data, 'datasets', dataset_name, 'datasetDoc.json') + dataset_uri = self.__get_uri(dataset_path) + dataset = container_module.dataset.get_dataset(dataset_uri) + return dataset + + def __run_pipeline(self, pipeline_description, data, volume_dir='/volumes'): + runtime = Runtime(pipeline=pipeline_description, context=metadata_base.Context.TESTING, volumes_dir=volume_dir) + fit_result = runtime.fit([data]) + return fit_result + + +if __name__ == '__main__': + suite = unittest.TestSuite() + for test_case in ( + 'test_fetch_from_file', + 'test__fetch_from_preprocessors', + 'test_fetch', + + ): + suite.addTest(TestPredefined(test_case)) + unittest.TextTestRunner(verbosity=2).run(suite) diff --git a/axolotl/tests/test_preprocessor.py b/axolotl/tests/test_preprocessor.py new file mode 100644 index 0000000..d58110d --- /dev/null +++ b/axolotl/tests/test_preprocessor.py @@ -0,0 +1,246 @@ +import argparse +import pathlib + +import shutil +import sys + +import os +import unittest +from pprint import pprint +PROJECT_ROOT = os.path.join(os.path.dirname(__file__), '..') +sys.path.insert(0, PROJECT_ROOT) + +# from TimeSeriesD3MWrappers.primitives.classification_knn import Kanine +from d3m import container as container_module, index +from d3m.metadata import base as metadata_base, problem as problem_module +from d3m.metadata.base import ArgumentType +from d3m.metadata.pipeline import PrimitiveStep +from d3m.metadata.problem import TaskKeyword +from d3m.runtime import Runtime +from sklearn_wrap.SKLogisticRegression import SKLogisticRegression + +from axolotl.predefined_pipelines import preprocessor +from axolotl.utils import pipeline as pipeline_utils + + +def run_pipeline(pipeline_description, data, volume_dir='/volumes'): + runtime = Runtime(pipeline=pipeline_description, context=metadata_base.Context.TESTING, volumes_dir=volume_dir) + fit_result = runtime.fit([data]) + return fit_result + + +def add_classifier(pipeline_description, dataset_to_dataframe_step, attributes, 
targets): + lr = PrimitiveStep(primitive=SKLogisticRegression) + lr.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, + data_reference=attributes) + lr.add_argument(name='outputs', argument_type=ArgumentType.CONTAINER, + data_reference=targets) + lr.add_output('produce') + pipeline_description.add_step(lr) + + construct_pred = PrimitiveStep( + primitive=index.get_primitive('d3m.primitives.data_transformation.construct_predictions.Common')) + construct_pred.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, + data_reference=pipeline_utils.int_to_step(lr.index)) + construct_pred.add_argument(name='reference', argument_type=ArgumentType.CONTAINER, + data_reference=dataset_to_dataframe_step) + construct_pred.add_output('produce') + pipeline_description.add_step(construct_pred) + # Final Output + pipeline_description.add_output(name='output predictions', + data_reference=pipeline_utils.int_to_step(construct_pred.index)) + + +# def add_time_series_specific_classifier(pipeline_description, attributes, targets): +# k = PrimitiveStep(primitive=Kanine) +# k.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, +# data_reference=attributes) +# k.add_argument(name='outputs', argument_type=ArgumentType.CONTAINER, +# data_reference=targets) +# k.add_output('produce') +# pipeline_description.add_step(k) +# pipeline_description.add_output(name='output predictions', +# data_reference=pipeline_utils.int_to_step(k.index)) +# return k + + +def _remove_volatile(target_pipe, predef_pipe): + target_pipe = target_pipe.to_json_structure() + for step in target_pipe['steps']: + del step['primitive']['digest'] + subset = {k: v for k, v in target_pipe.items() if k in predef_pipe} + return subset + + +class TestPreprocessor(unittest.TestCase): + time_series_data: container_module.Dataset = None + temp_dir: str = os.path.join(os.path.dirname(__file__), 'temp') + + @classmethod + def setUpClass(cls) -> None: + cls.maxDiff = None + cls.test_data = os.path.join(PROJECT_ROOT, 'tests', 'data') + # cls.time_series_data = datasets.get('timeseries_dataset_2') + # cls.tabular_classification_data = datasets.get('iris_dataset_1') + # cls.image_data = datasets.get('image_dataset_1') + # cls.audio_dataset = datasets.get('audio_dataset_1') + + @classmethod + def tearDownClass(cls): + for dir_name in ( + # cls.test_dir + 'solutions', + # cls.test_dir + 'fitted_solutions', + ): + if os.path.exists(dir_name): + shutil.rmtree(dir_name) + + # def test_timeseries_tabular(self): + # pp = preprocessor.get_preprocessor(task=metadata_base.PrimitiveFamily.TIME_SERIES_CLASSIFICATION.name, + # treatment=metadata_base.PrimitiveFamily.CLASSIFICATION.name, + # data_types=[TaskKeyword.TIME_SERIES], semi=False, + # inputs_metadata=self.time_series_data.metadata, problem=None, + # main_resource='learningData')[0] + # add_classifier(pp.pipeline_description, pp.dataset_to_dataframe_step, pp.attributes, pp.targets) + # result = run_pipeline(pp.pipeline_description, self.time_series_data) + # result.check_success() + # self.assertEqual(result.error, None) + # + # def test_timeseries_specific(self): + # pp = preprocessor.get_preprocessor(task=metadata_base.PrimitiveFamily.TIME_SERIES_CLASSIFICATION.name, + # treatment=metadata_base.PrimitiveFamily.TIME_SERIES_CLASSIFICATION.name, + # data_types=[TaskKeyword.TIME_SERIES], semi=False, + # inputs_metadata=self.time_series_data.metadata, problem=None, + # main_resource='learningData')[0] + # + # add_time_series_specific_classifier(pp.pipeline_description, 
pp.attributes, pp.targets) + # result = run_pipeline(pp.pipeline_description, self.time_series_data) + # result.check_success() + # self.assertEqual(result.error, None) + + def test_TabularPreprocessor(self): + dataset_name = 'iris_dataset_1' + problem = self.__get_problem(dataset_name) + dataset = self.__get_dataset(dataset_name) + pp = preprocessor.get_preprocessor( + input_data=dataset, + problem=problem, + treatment=metadata_base.PrimitiveFamily.CLASSIFICATION.name, + )[0] + add_classifier(pp.pipeline_description, pp.dataset_to_dataframe_step, pp.attributes, pp.targets) + result = run_pipeline(pp.pipeline_description, dataset) + # pprint(pp.pipeline_description.to_json_structure()) + result.check_success() + self.assertEqual(result.error, None) + + # def test_image_tensor(self): + # pp = preprocessor.get_preprocessor(task=metadata_base.PrimitiveFamily.DIGITAL_IMAGE_PROCESSING.name, + # treatment=metadata_base.PrimitiveFamily.DIGITAL_IMAGE_PROCESSING.name, + # data_types=[TaskKeyword.IMAGE], semi=False, + # inputs_metadata=self.image_data.metadata, problem=None, + # main_resource='learningData')[0] + # add_classifier(pp.pipeline_description, pp.dataset_to_dataframe_step, pp.attributes, pp.targets) + # # pprint(pp.pipeline_description.to_json_structure()) + # result = run_pipeline(pp.pipeline_description, self.image_data) + # result.check_success() + # self.assertEqual(result.error, None) + + # TODO update static files on the CI + # def test_ImageDataFramePreprocessor(self): + # dataset_name = 'image_dataset_2' + # problem = self.__get_problem(dataset_name) + # dataset = self.__get_dataset(dataset_name) + # problem['problem']['task_keywords'].append(TaskKeyword.IMAGE) + # pp = preprocessor.get_preprocessor( + # input_data=dataset, + # problem=problem, + # treatment=metadata_base.PrimitiveFamily.CLASSIFICATION.name, + # )[0] + # volume = os.path.join(PROJECT_ROOT, 'tests') + # add_classifier(pp.pipeline_description, pp.dataset_to_dataframe_step, pp.attributes, pp.targets) + # # pprint(pp.pipeline_description.to_json_structure()) + # result = run_pipeline(pp.pipeline_description, dataset, volume_dir=volume) + # result.check_success() + # self.assertEqual(result.error, None) + + # TODO need to augment text_dataset_1 + # def test_TextPreprocessor(self): + # dataset_name = 'text_dataset_1' + # # No text_problem_1, so I use iris_problem instead + # problem = self.__get_problem('iris_problem_1') + # problem['problem']['task_keywords'] = [TaskKeyword.CLASSIFICATION, TaskKeyword.TEXT] + # dataset = self.__get_dataset(dataset_name) + # # TextSent2VecPreprocessor, TextPreprocessor + # pp = preprocessor.get_preprocessor( + # input_data=dataset, + # problem=problem, + # treatment=metadata_base.PrimitiveFamily.CLASSIFICATION.name, + # )[-1] + # add_classifier(pp.pipeline_description, pp.dataset_to_dataframe_step, pp.attributes, pp.targets) + # pprint(pp.pipeline_description.to_json_structure()) + # result = run_pipeline(pp.pipeline_description, dataset) + # result.check_success() + # self.assertEqual(result.error, None) + + # def test_timeseries_forecasting_tabular(self): + # dataset = datasets.get('timeseries_dataset_1') + # pp = preprocessor.get_preprocessor(task=metadata_base.PrimitiveFamily.TIME_SERIES_FORECASTING.name, + # treatment=metadata_base.PrimitiveFamily.TIME_SERIES_FORECASTING.name, + # data_types=[TaskKeyword.TIME_SERIES.name, TaskKeyword.TABULAR.name], + # semi=False, inputs_metadata=dataset.metadata, problem=None, + # main_resource='learningData')[0] + # + # 
add_classifier(pp.pipeline_description, pp.dataset_to_dataframe_step, pp.attributes, pp.targets) + # result = run_pipeline(pp.pipeline_description, dataset) + # pprint(pp.pipeline_description.to_json_structure()) + # result.check_success() + # self.assertEqual(result.error, None) + + # TODO need to update tests/data/datasets/audio_dataset_1 + # def test_AudioPreprocessor(self): + # dataset_name = 'audio_dataset_1' + # # No audio_problem_1, so I use iris_problem instead + # problem = self.__get_problem('iris_problem_1') + # problem['problem']['task_keywords'] = [TaskKeyword.AUDIO, TaskKeyword.VIDEO] + # dataset = self.__get_dataset(dataset_name) + # pp = preprocessor.get_preprocessor( + # input_data=dataset, + # problem=problem, + # treatment=metadata_base.PrimitiveFamily.DIGITAL_SIGNAL_PROCESSING.name, + # )[-1] + # volume = os.path.join(PROJECT_ROOT, 'tests') + # add_classifier(pp.pipeline_description, pp.dataset_to_dataframe_step, pp.attributes, pp.targets) + # pprint(pp.pipeline_description.to_json_structure()) + # result = run_pipeline(pp.pipeline_description, dataset, volume_dir=volume) + # result.check_success() + # + # self.assertEqual(result.error, None) + + def __get_uri(self, path): + return pathlib.Path(os.path.abspath(path)).as_uri() + + def __get_problem(self, dataset_name): + problem_path = os.path.join( + self.test_data, 'problems', dataset_name.replace('dataset', 'problem'), 'problemDoc.json') + problem_uri = self.__get_uri(problem_path) + problem = problem_module.Problem.load(problem_uri) + return problem + + def __get_dataset(self, dataset_name): + dataset_path = os.path.join( + self.test_data, 'datasets', dataset_name, 'datasetDoc.json') + dataset_uri = self.__get_uri(dataset_path) + dataset = container_module.dataset.get_dataset(dataset_uri) + return dataset + + +# if __name__ == '__main__': +# suite = unittest.TestSuite() +# for test_case in ( +# # 'test_ImageDataFramePreprocessor', +# 'test_TabularPreprocessor', +# # 'test_AudioPreprocessor', +# # 'test_TextPreprocessor', +# +# ): +# suite.addTest(TestPreprocessor(test_case)) +# unittest.TextTestRunner(verbosity=2).run(suite) diff --git a/axolotl/tests/test_random_search.py b/axolotl/tests/test_random_search.py new file mode 100644 index 0000000..6067d39 --- /dev/null +++ b/axolotl/tests/test_random_search.py @@ -0,0 +1,91 @@ +import pathlib + +import ray +import shutil +import sys +import unittest + +import os +import tempfile +from axolotl.backend.ray import RayRunner + +from axolotl.algorithms.random_search import RandomSearch +from axolotl.backend.simple import SimpleRunner + +PROJECT_ROOT = os.path.join(os.path.dirname(__file__), '..') +sys.path.insert(0, PROJECT_ROOT) + +from d3m.metadata import problem as problem_module +from d3m import container as container_module +from axolotl.utils import pipeline as pipeline_utils + + +class TestRandomSearch(unittest.TestCase): + def setUp(self): + self.test_data = os.path.join(PROJECT_ROOT, 'tests', 'data') + dataset_name = 'iris_dataset_1' + problem = self.__get_problem(dataset_name) + self.problem = problem + self.dataset = self.__get_dataset(dataset_name) + self.test_dir = tempfile.mkdtemp() + backend = SimpleRunner(random_seed=42, volumes_dir=None, scratch_dir=self.test_dir) + self.tuner_base = RandomSearch(problem, backend=backend, max_trials=10, directory=self.test_dir) + + def tearDown(self): + shutil.rmtree(self.test_dir) + + def test_fit(self): + _, pipeline_result = self.tuner_base.search_fit(input_data=[self.dataset], time_limit=60) + 
self.assertEqual(pipeline_result.error, None)
+
+    def test_fit_svc(self):
+        pipeline_info = os.path.join(os.path.dirname(__file__), 'resources', 'svc_pipeline.json')
+        pipeline = pipeline_utils.load_pipeline(pipeline_info)
+        _, pipeline_result = self.tuner_base.search_fit(input_data=[self.dataset], time_limit=60,
+                                                        pipeline_candidates=[pipeline])
+        self.assertEqual(pipeline_result.error, None)
+
+    def test_fit_lr(self):
+        pipeline_info = os.path.join(os.path.dirname(__file__), 'resources', 'logistic_regeression.json')
+        pipeline = pipeline_utils.load_pipeline(pipeline_info)
+        _, pipeline_result = self.tuner_base.search_fit(input_data=[self.dataset], time_limit=60,
+                                                        pipeline_candidates=[pipeline])
+        self.assertEqual(pipeline_result.error, None)
+
+    def test_fit_ray(self):
+        if not ray.is_initialized():
+            ray.init()
+        backend = RayRunner(random_seed=42, volumes_dir=None, scratch_dir=self.test_dir)
+        tuner_base = RandomSearch(self.problem, backend=backend, max_trials=30, directory=self.test_dir)
+        _, pipeline_result = tuner_base.search_fit(input_data=[self.dataset], time_limit=60)
+        self.assertEqual(pipeline_result.error, None)
+        ray.shutdown()
+
+    def __get_uri(self, path):
+        return pathlib.Path(os.path.abspath(path)).as_uri()
+
+    def __get_problem(self, dataset_name):
+        problem_path = os.path.join(
+            self.test_data, 'problems', dataset_name.replace('dataset', 'problem'), 'problemDoc.json')
+        problem_uri = self.__get_uri(problem_path)
+        problem = problem_module.Problem.load(problem_uri)
+        return problem
+
+    def __get_dataset(self, dataset_name):
+        dataset_path = os.path.join(
+            self.test_data, 'datasets', dataset_name, 'datasetDoc.json')
+        dataset_uri = self.__get_uri(dataset_path)
+        dataset = container_module.dataset.get_dataset(dataset_uri)
+        return dataset
+
+
+if __name__ == '__main__':
+    suite = unittest.TestSuite()
+    for test_case in (
+            'test_fit',
+            'test_fit_svc',
+            'test_fit_lr',
+            'test_fit_ray',
+    ):
+        suite.addTest(TestRandomSearch(test_case))
+    unittest.TextTestRunner(verbosity=2).run(suite)
diff --git a/d3m/CODE_STYLE.md b/d3m/CODE_STYLE.md
new file mode 100644
index 0000000..7dc0afa
--- /dev/null
+++ b/d3m/CODE_STYLE.md
@@ -0,0 +1,258 @@
+# Code Style
+
+## Python
+
+**Consistency is the main code style guideline** and if in doubt, try to find similar existing code and style
+your code the same. Our code style is very similar to [PEP8](https://www.python.org/dev/peps/pep-0008/) with a few
+more details.
+
+Indent with 4 spaces. Never with tabs. No trailing whitespace.
+
+**Be verbose**. Always fully spell out any part of the function, class, or variable name.
+
+### Blank lines
+
+Use blank lines to organize long code blocks into units of what they do. Often a block is preceded by a
+comment explaining what the block does.
+
+This will help someone new understand the code more quickly when they read it. You are leaving little hints behind
+about which parts of the code to understand as one unit, one step of your algorithm. Imagine you were writing the code
+to be published in an article, trying to make everything as easy to learn as possible. It's the same
+here, because we assume our teammates are going to use the code after us.
+
+Comments always have one blank line before them, except when they are the first line of an indented block of code.
+
+```python
+for item in items:
+    # No new line above this comment.
+    ...
+
+# New line above this comment.
+...
+```
+
+Do not have multiple (two or more) blank lines beyond what is expected by PEP8.
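+
+For example, a longer block of code might be organized into commented units like this (an illustrative sketch; the function and names are made up and not taken from this repository):
+
+```python
+def summarize_orders(orders, price_table):
+    # Keep only orders whose product has a known price.
+    priced_orders = [order for order in orders if order.product_id in price_table]
+
+    # Compute the total value of the remaining orders.
+    total_value = sum(price_table[order.product_id] * order.quantity for order in priced_orders)
+
+    # Return both the filtered orders and their total value.
+    return priced_orders, total_value
+```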
+ +### Line wrapping + +We **do not wrap lines** except when logically reasonable or when it greatly increases readability +(we still wrap logically and not just at the end of the line). + +We do wrap comments at the 120 characters right margin. If the comment wraps to two lines, balance the lines +so they are both approximately the same length. + +The closing brace/bracket/parenthesis on multi-line constructs should align with the first character of the +line that starts the multi-line construct, as in: + +```python +my_list = [ + 1, 2, 3, + 4, 5, 6, +] +result = some_function_that_takes_arguments( + 'a', 'b', 'c', + 'd', 'e', 'f', +) +``` + +Always include a trailing comma in such cases. + +When defining a function which takes too many arguments to leave all of them in one line, use hanging indentation: + +```python +def some_function_that_takes_arguments( + a, b, c, + d, e, f, +): + return a + b + c + d + e + f +``` + +[Black](https://github.com/python/black) generally formats according to this style so you can use +it to help you. + +### Strings + +Use `'single_quote_strings'` for constant strings and `"double_quote_strings"` for any string shown to the +user (like exception messages, or warning). A general guideline is: if a string might be ever translated to a +different language, use double quotes for it. + +This means all dictionary key names should use single quotes. + +Always use keyword based string formatting. When only simple variable name interpolation is being done, +[f-Strings](https://realpython.com/python-f-strings/) are the preferred format. + +```python +f"Value is '{value}' and message is '{message}'." +``` + +If longer expressions are being computed, then `.format()` should be used, with keywords. + +```python +"This complicated string lists all values: {values}".format( + values=[x.lowercase() for x in values], +) +``` + +Inline values wrap inside messages with `'`. If value is at the end of the message, there is no +need for wrapping and also no need for trailing dot. + +When creating logging statements, use `%`-based format, also with keyword based arguments. + +```python +logger.misc("Step '%(requirement)s'.", {'requirement': requirement}) +``` + +### Logging + +Use [Python logging facility](https://docs.python.org/3/library/logging.html) for all output and never use +`print()` (except when used in CLI commands). Obtain `logger` instance by using `__name__`, at the very +beginning of the module: + +```python +import logging + +logger = logging.getLogger(__name__) +``` + +### Imports + +Imports should be **just modules** divided into multiples sections, in order from more global to more local, separated by empty line: + * core Python packages + * external Python packages + * non-local imports (for example, imports from some other top-level `d3m.` module) + * local relative imports for the current module and sub-modules + +Inside each section, imports should be ordered alphabetically, first based on package name, then on model imported. +Each package should be on its own line, but importing multiple modules from the same package should be in one line. + +Example: + +```python +import os +import time + +import numpy +from sklearn import metrics, model_selection + +from d3m import exceptions +from d3m.metadata import problem + +from . 
import data +``` + +If you are importing multiple modules with the same name from different package, rename more global one with a prefix +of the package: + +```python +from sklearn import metrics as sklearn_metrics + +from d3m import metrics +``` + +### Docstrings + +Every class, method and function has a docstring with description. Docstrings should be split into multiple lines +when needed to improve readability. Docstrings should use the [numpy style](https://numpydoc.readthedocs.io/en/latest/format.html#docstring-standard) +to document arguments, return value and everything else, which means also using [ReST/Sphinx](http://www.sphinx-doc.org/en/stable/rest.html) +syntax for formatting. + +Always separate the docstring from the rest of the code with an empty line, and have `"""` on their own line, even +for one-line docstrings. + +We use a custom [Python metaclasses](https://docs.python.org/3/reference/datamodel.html#metaclasses) for d3m classes which +[automatically inherits or extends docstrings from parent methods](https://github.com/meowklaski/custom_inherit): + +```python +from d3m import utils + + +class MyBaseClass(metaclass=utils.Metaclass): + pass + + +class MyAbstractBaseClass(metaclass=utils.AbstractMetaclass): + pass +``` + +### Comments + +Both standalone one-sentence one-line comments and multi-sentence comments should have grammatically correct punctuation. +For formatting, use [ReST/Sphinx](http://www.sphinx-doc.org/en/stable/rest.html) syntax. + +- When you are explaining what the code will do, end the sentence with a dot. + + ```python + # Calculate total value. + value = quantity * price + ``` +- Short after-the-line comments (which should not be sentences) do not have an ending dot: + + ```python + sleep(10) # seconds + ``` + +- Titles that are separating sections of code are also not a sentence (no dot). + + ```python + ### Vector operations ### + + def dot_product(vector1, vector2): + ... + + def cross_product(vector1, vector2): + ... + + ### Matrix operations ### + + def transform(vector, matrix): + ... + ``` + +If TODO comments cannot be a short one-line with grammatically correct punctuation, then split it into multiple lines in this way: + +```python +# TODO: Short description of a TODO. +# A longer description of what we could potentially do here. Maybe we +# could do X or Y, but Y has this consequences. We should probably +# wait for server rendering feature to be implemented. +# See: https://github.com/example/project/issues/123 +``` + +Try to keep the formatting of the first line exactly as shown above so that it is easier parsed by IDEs. +Including the space after `#` and space after `:`. + +## Code repository + +Commit often and make sure each commit is a rounded change. Do not squash commits, unless that helps making a set of commits +into a clearer change. We leave unsuccessful attempts in the repository because maybe in the future we can come back to them +and use them, maybe in a different context or way. + +For almost all changes to the repository, we make first a feature branch from `devel` branch. We make all necessary changes in +that new branch, potentially make multiple commits. We make a merge request against the `devel` branch for the change +to be reviewed and merged. We should make a merge request even before all changes are finished so that others can comment +and discuss the development. We can continue adding more commits to this branch even after the merge request has been made +and GitLab will update the merge request automatically. 
Until a merge request is finished and is deemed ready to be merged +by its author, merge request's title should be prefixed with `WIP:` so that it is clear that it is not yet meant +to be merged (and thoroughly reviewed). Make sure you include also a change to the [changelog](#changelog) in your merge request. + +### Changelog + +We are maintaining `HISTORY.md` file where we document changes to the project so that +everyone involved can have one location where they can see what has changed and what +they might adapt in their code or the way they are working on the project. + +### Commit messages + +Commit messages should be descriptive and full sentences, with grammatically correct punctuation. +If possible, they should reference relevant tickets (by appending something like `See #123.`) or even close them +(`Fixes #123.`). GitLab recognizes that. If longer commit message is suitable (which is always a good thing), +first one line summary should be made (50 characters is a soft limit), followed by an empty line, followed +by a multi-line message: + + Added artificially lowering of the height in IE. + + In IE there is a problem when rendering when user is located + higher than 2000m. By artificially lowering the height rendering + now works again. + + Fixes #123. diff --git a/d3m/HISTORY.md b/d3m/HISTORY.md new file mode 100644 index 0000000..7989ca6 --- /dev/null +++ b/d3m/HISTORY.md @@ -0,0 +1,1823 @@ +## v2020.5.18 + +### Enhancements + +* Scoring primitive and pipeline now accept new hyper-parameter `all_labels` + which can be used to provide information about all labels possible in a target + column. + [#431](https://gitlab.com/datadrivendiscovery/d3m/-/issues/431) + [!377](https://gitlab.com/datadrivendiscovery/d3m/-/merge_requests/377) +* Added `all_distinct_values` metadata field which can contain all values (labels) + which can be in a column. This is meant to be used on target columns to help + implementing `ContinueFitMixin` in a primitive which might require knowledge + of all possible labels before starting fitting on a subset of data. + [#447](https://gitlab.com/datadrivendiscovery/d3m/-/issues/447) + [!377](https://gitlab.com/datadrivendiscovery/d3m/-/merge_requests/377) +* Reference runtime now does not keep primitive instances in memory anymore + but uses `get_params`/`set_params` to retain and reuse only primitive's parameters. + This makes memory usage lower and allows additional resource releasing when primitive's + object is freed (e.g., releasing GPUs). + [#313](https://gitlab.com/datadrivendiscovery/d3m/-/issues/313) + [!376](https://gitlab.com/datadrivendiscovery/d3m/-/merge_requests/376) +* Added support for version 4.1.0 of D3M dataset schema: + * Added `MONTHS` to column's `time_granularity` metadata. + [!340](https://gitlab.com/datadrivendiscovery/d3m/-/merge_requests/340) + * Added mean reciprocal rank and hits at k metrics. + [!361](https://gitlab.com/datadrivendiscovery/d3m/-/merge_requests/361) + * Added `https://metadata.datadrivendiscovery.org/types/Rank` semantic type + and `rank_for` metadata field. `PerformanceMetric` classes have now + `requires_rank` method. + [!372](https://gitlab.com/datadrivendiscovery/d3m/-/merge_requests/372) + * Added `NESTED` task keyword. + [!372](https://gitlab.com/datadrivendiscovery/d3m/-/merge_requests/372) + * Added `file_columns_count` metadata field and updated `file_columns` metadata field + with additional sub-fields. 
Also renamed sub-field `name` to `column_name` and added + `column_index` sub-fields to `file_columns` metadata. + [!372](https://gitlab.com/datadrivendiscovery/d3m/-/merge_requests/372) + **Backwards incompatible.** +* Moved high-level primitive base classes for file readers and dataset splitting + from common primitives to d3m core package. + [!120](https://gitlab.com/datadrivendiscovery/common-primitives/-/merge_requests/120) + [!339](https://gitlab.com/datadrivendiscovery/d3m/-/merge_requests/339) +* A warning is issued if a primitive uses a global random source + during pipeline execution. Such behavior can make pipeline + execution not reproducible. + [#384](https://gitlab.com/datadrivendiscovery/d3m/-/issues/384) + [!365](https://gitlab.com/datadrivendiscovery/d3m/-/merge_requests/365) +* CLI accepts `--logging-level` argument to configure which logging + messages are printed to the console. + [!360](https://gitlab.com/datadrivendiscovery/d3m/-/merge_requests/360) +* Output to stdout/stderr during pipeline execution is now not suppressed + anymore, which makes it possible to debug pipeline execution using pdb. + Stdout/stderr is at the same time still logged to Python logging. + [#270](https://gitlab.com/datadrivendiscovery/d3m/-/issues/270) + [!360](https://gitlab.com/datadrivendiscovery/d3m/-/merge_requests/360) +* Redirect from stdout to Python logging now operates per lines and + not per write operations, makes logs more readable. + [#168](https://gitlab.com/datadrivendiscovery/d3m/-/issues/168) + [!358](https://gitlab.com/datadrivendiscovery/d3m/-/merge_requests/358) +* Made sure that multi-label metrics work correctly. + [#370](https://gitlab.com/datadrivendiscovery/d3m/-/issues/370) + [!343](https://gitlab.com/datadrivendiscovery/d3m/-/merge_requests/343) +* Implemented ROC AUC metrics. They require predictions to include + confidence for all possible labels. + [#317](https://gitlab.com/datadrivendiscovery/d3m/-/issues/317) + [!318](https://gitlab.com/datadrivendiscovery/d3m/-/merge_requests/318) +* Additional (non-standard) performance metrics can now be registered + using `PerformanceMetric.register_metric` class method. + [#207](https://gitlab.com/datadrivendiscovery/d3m/-/issues/207) + [!348](https://gitlab.com/datadrivendiscovery/d3m/-/merge_requests/348) +* All D3M enumerations can now be extended with additional values + through `register_value` class method. This allows one to add values + to existing standard values (which come from the metadata schema). + Internally, enumeration values are now represented as strings and not + integers anymore. + [#438](https://gitlab.com/datadrivendiscovery/d3m/-/issues/438) + [!348](https://gitlab.com/datadrivendiscovery/d3m/-/merge_requests/348) + **Could be backwards incompatible.** +* Added CLI to validate primitive descriptions for metalearning database + (`python3 -m d3m index validate`). + [!333](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/333) +* Raise an exception during dataset loading if `targets.csv` file does + not combine well with the dataset entry point. + [!330](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/330) + +### Bugfixes + +* CLI now displays correct error messages for invalid arguments to subcommands. + [#409](https://gitlab.com/datadrivendiscovery/d3m/-/issues/409) + [!368](https://gitlab.com/datadrivendiscovery/d3m/-/merge_requests/368) +* Reference runtime does not call `fit` and `produce` + methods in a loop anymore. This mitigates an infinite loop for misbehaving primitives. 
+ [!364](https://gitlab.com/datadrivendiscovery/d3m/-/merge_requests/364) +* During pipeline execution all Python logging is now recorded in the + pipeline run and it does not depend anymore on logging level otherwise + configured during execution. + [!360](https://gitlab.com/datadrivendiscovery/d3m/-/merge_requests/360) +* Default sampling code for hyper-parameters now makes sure to return + values in original types and not numpy ones. + [#440](https://gitlab.com/datadrivendiscovery/d3m/-/issues/440) + [!352](https://gitlab.com/datadrivendiscovery/d3m/-/merge_requests/352) +* We now ensure that file handles opened for CLI commands are flushed + so that data is not lost. + [#436](https://gitlab.com/datadrivendiscovery/d3m/issues/436) + [!335](https://gitlab.com/datadrivendiscovery/d3m/-/merge_requests/335) +* Fixed saving exposed produced outputs for `fit-score` CLI command when + scoring failed. + [!341](https://gitlab.com/datadrivendiscovery/d3m/-/merge_requests/341) +* Made sure `time_granularity` metadata is saved when saving a D3M dataset. + [!340](https://gitlab.com/datadrivendiscovery/d3m/-/merge_requests/340) +* Changed version of GitPython dependency to 3.1.0 to fix older versions + being broken because of its own unconstrained upper dependency. + [!336](https://gitlab.com/datadrivendiscovery/d3m/-/merge_requests/336) +* Fixed how paths are constructed when exposing and saving produced values. + [!336](https://gitlab.com/datadrivendiscovery/d3m/-/merge_requests/336) + +### Other + +* Added guides to the documentation. + [!351](https://gitlab.com/datadrivendiscovery/d3m/-/merge_requests/351) + [!374](https://gitlab.com/datadrivendiscovery/d3m/-/merge_requests/374) +* Removed type annotations from docstrings. Python type annotations are now used instead when rendering documentation. + [#239](https://gitlab.com/datadrivendiscovery/d3m/-/issues/239) + [!371](https://gitlab.com/datadrivendiscovery/d3m/-/merge_requests/371) +* Renamed `blacklist` in `d3m.index.load_all` and `primitives_blacklist` in `d3m.metadata.pipeline.Resolver` + to `blocklist` and `primitives_blocklist`, respectively. + **Backwards incompatible.** +* Removed `https://metadata.datadrivendiscovery.org/types/GPUResourcesUseParameter` + semantic type. Added `can_use_gpus` primitive metadata field to signal that + the primitive can use GPUs if available. + [#448](https://gitlab.com/datadrivendiscovery/d3m/-/issues/448) + [!369](https://gitlab.com/datadrivendiscovery/d3m/-/merge_requests/369) + **Backwards incompatible.** +* Clarified that hyper-parameters using `https://metadata.datadrivendiscovery.org/types/CPUResourcesUseParameter` + should have 1 as default value. + [!369](https://gitlab.com/datadrivendiscovery/d3m/-/merge_requests/369) +* Clarified that it is not necessary to call `fit` before calling + `continue_fit`. +* `index` CLI command has been renamed to `primitive` CLI command. + [#437](https://gitlab.com/datadrivendiscovery/d3m/-/issues/437) + [!363](https://gitlab.com/datadrivendiscovery/d3m/-/merge_requests/363) +* `numpy.matrix` has been removed as an allowed container type, as it + was deprecated by NumPy. + [#230](https://gitlab.com/datadrivendiscovery/d3m/-/issues/230) + [!362](https://gitlab.com/datadrivendiscovery/d3m/-/merge_requests/362) + **Backwards incompatible.** +* CLI has now `--version` command which returns the version of the d3m + core package itself. 
+ [#378](https://gitlab.com/datadrivendiscovery/d3m/-/issues/378) + [!359](https://gitlab.com/datadrivendiscovery/d3m/-/merge_requests/359) +* Upgraded schemas to JSON Schema draft 7, and upgraded Python `jsonschema` + dependency to version 3. + [#392](https://gitlab.com/datadrivendiscovery/d3m/-/issues/392) + [!342](https://gitlab.com/datadrivendiscovery/d3m/-/merge_requests/342) +* Added a Primitive Good Citizen Checklist to documentation, documenting + some best practices when writing a primitive. + [#127](https://gitlab.com/datadrivendiscovery/d3m/-/issues/127) + [!347](https://gitlab.com/datadrivendiscovery/d3m/-/merge_requests/347) + [!355](https://gitlab.com/datadrivendiscovery/d3m/-/merge_requests/355) +* Updated upper bounds of core dependencies to latest available versions. + [!337](https://gitlab.com/datadrivendiscovery/d3m/-/merge_requests/337) +* Added to `algorithm_types`: + * `SAMPLE_SELECTION` + * `SAMPLE_MERGING` + * `MOMENTUM_CONTRAST` + * `CAUSAL_ANALYSIS` + + [!332](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/332) + [!357](https://gitlab.com/datadrivendiscovery/d3m/-/merge_requests/357) + [!373](https://gitlab.com/datadrivendiscovery/d3m/-/merge_requests/373) + +## v2020.1.9 + +### Enhancements + +* Support for D3M datasets with minimal metadata. + [#429](https://gitlab.com/datadrivendiscovery/d3m/issues/429) + [!327](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/327) +* Pipeline runs (and in fact many other input documents) can now be directly used gzipped + in all CLI commands. They have to have filename end with `.gz` for decompression to happen + automatically. + [#420](https://gitlab.com/datadrivendiscovery/d3m/issues/420) + [!317](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/317) +* Made problem descriptions again more readable when converted to JSON. + [!316](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/316) +* Improved YAML handling to encourage faster C implementation. + [#416](https://gitlab.com/datadrivendiscovery/d3m/issues/416) + [!313](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/313) + +### Bugfixes + +* Fixed the error message if all required CLI arguments are not passed to the runtime. + [#411](https://gitlab.com/datadrivendiscovery/d3m/issues/411) + [!319](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/319) +* Removed assumption that all successful pipeline run steps have method calls. + [#422](https://gitlab.com/datadrivendiscovery/d3m/issues/422) + [!320](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/320) +* Fixed "Duplicate problem ID" warnings when multiple problem descriptions + have the same problem ID, but in fact they are the same problem description. + No warning is made in this case anymore. + [#417](https://gitlab.com/datadrivendiscovery/d3m/issues/417) + [!321](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/321) +* Fixed the use of D3M container types in recent versions of Keras and TensorFlow. + [#426](https://gitlab.com/datadrivendiscovery/d3m/issues/426) + [!322](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/322) +* Fixed `validate` CLI commands to work on YAML files. + +### Other + +* Updated upper bounds of core dependencies to latest available versions. + [#427](https://gitlab.com/datadrivendiscovery/d3m/issues/427) + [!325](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/325) +* Refactored default pipeline run parser implementation to make it + easier to provide alternative dataset and problem resolvers. 
+ [!314](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/314) +* Moved out local test primitives into [`tests/data` git submodule](https://gitlab.com/datadrivendiscovery/tests-data). + Now all test primitives are in one place. + [#254](https://gitlab.com/datadrivendiscovery/d3m/issues/254) + [!312](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/312) + +## v2019.11.10 + +* Support for version 4.0.0 of D3M dataset schema has been added. +* D3M core package now supports loading directly datasets from OpenML. +* When saving `Dataset` object to D3M dataset format, metadata is now preserved. +* NetworkX objects are not anymore container types and are not allowed + anymore to be passed as values between primitives. +* "Meta" files are not supported anymore by the runtime. Instead save a + pipeline run with configuration of the run you want, and use the pipeline + run to re-run using that configuration. + +### Enhancements + +* Primitive family `REMOTE_SENSING` has been added. + [!310](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/310) +* Added support for version 4.0.0 of D3M dataset schema: + * There are no more `NODE` and `EDGE` references (used in graph datasets), + but only `NODE_ATTRIBUTE` and `EDGE_ATTRIBUTE`. + * `time_granularity` can now be present on a column. + * `forecasting_horizon` can now be present in a problem description. + * `task_type` and `task_subtype` have been merged into `task_keywords`. + As a consequence, Python `TaskType` and `TaskSubtype` were replaced + with `TaskKeyword`. + + [#401](https://gitlab.com/datadrivendiscovery/d3m/issues/401) + [!310](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/310) + **Backwards incompatible.** + +* Added OpenML dataset loader. Now you can pass an URL to a OpenML dataset + and it will be downloaded and converted to a `Dataset` compatible object, + with including many of available meta-features. Combined with support + for saving datasets, this now allows easy conversion between OpenML + datasets and D3M datasets, e.g., `python3 -m d3m dataset convert -i https://www.openml.org/d/61 -o out/datasetDoc.json`. + [#252](https://gitlab.com/datadrivendiscovery/d3m/issues/252) + [!309](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/309) +* When saving and loading D3M datasets, metadata is now preserved. + [#227](https://gitlab.com/datadrivendiscovery/d3m/issues/227) + [!265](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/265) +* Metadata can now be converted to a JSON compatible structure in a + reversible manner. + [#373](https://gitlab.com/datadrivendiscovery/d3m/issues/373) + [!308](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/308) +* Pipeline run now records if a pipeline was run as a standard pipeline + under `run.is_standard_pipeline` field. + [#396](https://gitlab.com/datadrivendiscovery/d3m/issues/396) + [!249](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/249) +* "meta" files have been replaced with support for rerunning pipeline runs. + Instead of configuring a "meta" file with configuration how to run a + pipeline, simply provide an example pipeline run which demonstrates how + the pipeline was run. Runtime does not have `--meta` argument anymore, + but has now `--input-run` argument instead. + [#202](https://gitlab.com/datadrivendiscovery/d3m/issues/202) + [!249](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/249) + **Backwards incompatible.** +* Changed `LossFunctionMixin` to support multiple loss functions. 
+ [#386](https://gitlab.com/datadrivendiscovery/d3m/issues/386) + [!305](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/305) + **Backwards incompatible.** +* Pipeline equality and hashing functions now have `only_control_hyperparams` + argument which can be set to use only control hyper-parameters when doing + comparisons. + [!289](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/289) +* Pipelines and other YAML files are now recognized with both `.yml` and + `.yaml` file extensions. + [#375](https://gitlab.com/datadrivendiscovery/d3m/issues/375) + [!302](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/302) +* `F1Metric`, `F1MicroMetric`, and `F1MacroMetric` can now operate on + multiple target columns and average scores for all of them. + [#400](https://gitlab.com/datadrivendiscovery/d3m/issues/400) + [!298](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/298) +* Pipelines and pipeline runs can now be serialized with Arrow. + [#381](https://gitlab.com/datadrivendiscovery/d3m/issues/381) + [!290](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/290) +* `describe` CLI commands now accept `--output` argument to control where + their output is saved to. + [!279](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/279) + +### Bugfixes + +* Made exposed outputs be stored even in the case of an exception. + [#380](https://gitlab.com/datadrivendiscovery/d3m/issues/380) + [!304](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/304) +* Fixed `source.from` metadata in datasets and problem descriptions + and its validation for metalearning database. + [#363](https://gitlab.com/datadrivendiscovery/d3m/issues/363) + [!303](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/303) +* Fixed pipeline run references when running the runtime through + evaluation command. + [#395](https://gitlab.com/datadrivendiscovery/d3m/issues/395) + [!294](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/294) +* The core package scoring primitive has been updated to have digest. + This allows the core package scoring pipeline to have it as well. + This changes makes it required for the core package to be installed + in editable mode (`pip3 install -e ...`) when being installed from the + git repository. + [!280](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/280) + **Backwards incompatible.** + +### Other + +* Few top-level runtime functions had some of their arguments moved + to keyword-only arguments: + * `fit`: `problem_description` + * `score`: `scoring_pipeline`, `problem_description`, `metrics`, `predictions_random_seed` + * `prepare_data`: `data_pipeline`, `problem_description`, `data_params` + * `evaluate`: `data_pipeline`, `scoring_pipeline`, `problem_description`, `data_params`, `metrics` + + [#352](https://gitlab.com/datadrivendiscovery/d3m/issues/352) + [!301](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/301) + **Backwards incompatible.** + +* `can_accept` method has been removed from primitive interfaces. + [#334](https://gitlab.com/datadrivendiscovery/d3m/issues/334) + [!300](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/300) + **Backwards incompatible.** +* NetworkX objects are not anymore container types and are not allowed + anymore to be passed as values between primitives. Dataset loader now + does not convert a GML file to a NetworkX object but represents it + as a files collection resource. A primitive should then convert that + resource into a normalized edge-list graph representation. 
+ [#349](https://gitlab.com/datadrivendiscovery/d3m/issues/349) + [!299](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/299) + **Backwards incompatible.** +* `JACCARD_SIMILARITY_SCORE` metric is now a binary metric and requires + `pos_label` parameter. + [!299](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/299) + **Backwards incompatible.** +* Updated core dependencies. Some important packages are now at versions: + * `tensorflow`: 2.0.0 + * `keras`: 2.3.1 + * `torch`: 1.3.0.post2 + * `theano`: 1.0.4 + * `scikit-learn`: 0.21.3 + * `numpy`: 1.17.3 + * `pandas`: 0.25.2 + * `networkx`: 2.4 + * `pyarrow`: 0.15.1 + * `scipy`: 1.3.1 + + [#398](https://gitlab.com/datadrivendiscovery/d3m/issues/398) + [#379](https://gitlab.com/datadrivendiscovery/d3m/issues/379) + [!299](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/299) + +* Primitive family `DIMENSIONALITY_REDUCTION` has been added. + [!284](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/284) +* Added to `algorithm_types`: + * `POLYNOMIAL_REGRESSION` + * `IMAGENET` + * `RETINANET` + + [!306](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/306) + +* `--process-dependency-link` is not anymore suggested to be used when + installing primitives. +* `sample_rate` metadata field inside `dimension` has been renamed to + `sampling_rate` to make it consistent across metadata. This field + should contain a sampling rate used for the described dimension, + when values in the dimension are sampled. + **Backwards incompatible.** + +## v2019.6.7 + +### Enhancements + +* Dataset loading has been optimized for the case when only one file + type exists in a file collection. Metadata is also simplified in this case. + [#314](https://gitlab.com/datadrivendiscovery/d3m/issues/314) + [!277](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/277) +* Support defining unfitted primitives in the pipeline for passing them + to another primitive as a hyper-parameter. Unfitted primitives do not + have any input connected and runtime just creates a primitive instance + but does not fit or produce them. It then passes this primitive instance + to another primitive as a hyper-parameter value. + [!274](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/274) +* When saving datasets, we now use hard-linking of files when possible. + [#368](https://gitlab.com/datadrivendiscovery/d3m/issues/368) + [!271](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/271) + +### Bugfixes + +* Specifying `-E` to the `d3m runtime` CLI now exposes really all outputs + of all steps and not just pipeline outputs. + [#367](https://gitlab.com/datadrivendiscovery/d3m/issues/367) + [!270](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/270) +* Fixed minor issues when loading sklearn example datasets. +* Fixed PyPi metadata of the package. + [!267](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/267) +* When saving D3M dataset, also structural type information is now used to set + column type. + [#339](https://gitlab.com/datadrivendiscovery/d3m/issues/339) + [!255](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/255) +* When saving D3M dataset, update digest of saved dataset to digest of + what has been saved. + [#340](https://gitlab.com/datadrivendiscovery/d3m/issues/340) + [!262](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/262) + +### Other + +* Pipeline's `get_exposable_outputs` method has been renamed to `get_producing_outputs`. 
+ [!270](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/270) +* Updating columns from DataFrame returned from `DataFrame.select_columns` + does not raise a warning anymore. + [!268](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/268) +* Added `scipy==1.2.1` as core dependency. + [!266](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/266) +* Added code style guide to the repository. + [!260](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/260) +* Added to `algorithm_types`: + + * `ITERATIVE_LABELING` + + [!276](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/276) + +## v2019.5.8 + +* This release contains an implementation of `D3MDatasetSaver` so `Dataset` objects + can now be saved using their `save` method into D3M dataset format. +* Additional hyper-parameters classes have been defined and existing improved. + Probably the most useful addition is `List` hyper-parameter which allows + repeated values with order of values (in contrast with `Set`). +* Standard graph representation has been standardized (a nodelist table and an + edge list table) and related semantic types have been added to mark source + and target columns for edges. +* Standard time-series representation has been standardized (a long format) + and related semantic types have been added to identify columns to index + time-series by. +* Feature construction primitive should mark newly constructed attributes + with `https://metadata.datadrivendiscovery.org/types/ConstructedAttribute` + semantic type. +* There are now mixins available to define primitives which can be used to + describe neural networks as pipelines. +* There is now a single command line interface for the core package under + `python3 -m d3m`. + +### Enhancements + +* Runtime now raises an exception if target columns from problem description + could not be found in provided input datasets. + [#281](https://gitlab.com/datadrivendiscovery/d3m/issues/281) + [!155](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/155) +* Core package command line interfaces have been consolidated and revamped + and are now all available under single `python3 -m d3m`. + [#338](https://gitlab.com/datadrivendiscovery/d3m/issues/338) + [!193](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/193) + [!233](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/233) +* Added `--expose-produced-outputs` argument runtime CLI to allow saving + to a directory produced outputs of all primitives from pipeline's run. + Useful for debugging. + [#206](https://gitlab.com/datadrivendiscovery/d3m/issues/206) + [!223](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/223) +* CSVLoader and SklearnExampleLoader dataset loaders now add + `d3mIndex` column if one does not exist already. + [#266](https://gitlab.com/datadrivendiscovery/d3m/issues/266) + [!202](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/202) +* Added `--not-standard-pipeline` argument to `fit`, `produce`, and `fit-produce` + runtime CLI to allow running non-standard pipelines. + [#312](https://gitlab.com/datadrivendiscovery/d3m/issues/312) + [!228](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/228) +* Sampling `Bounded` and base `Hyperparameter` hyper-parameter now issues + a warning that sampling of those hyper-parameters is ill-defined. + [!220](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/220) +* `Bounded` hyper-parameter with both bounds now samples from uniform + distribution. 
+ [!220](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/220) +* Added new hyper-parameter classes: `SortedSet`, `List`, and `SortedList`. + [#236](https://gitlab.com/datadrivendiscovery/d3m/issues/236) + [#292](https://gitlab.com/datadrivendiscovery/d3m/issues/292) + [!219](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/219) +* All bounded hyper-parameter classes now accept additional arguments to + control if bounds are inclusive or exclusive. + [#199](https://gitlab.com/datadrivendiscovery/d3m/issues/199) + [!215](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/215) +* `Dataset` objects can now be saved to D3M dataset format by + calling `save` method on them. + [#31](https://gitlab.com/datadrivendiscovery/d3m/issues/31) + [#344](https://gitlab.com/datadrivendiscovery/d3m/issues/344) + [!96](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/96) + [!217](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/217) + +### Bugfixes + +* Fixed `NormalizeMutualInformationMetric` implementation. + [#357](https://gitlab.com/datadrivendiscovery/d3m/issues/357) + [!257](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/257) +* JSON representation of `Union` hyper-parameter values and other + pickled hyper-parameter values has been changed to assure better + interoperability. + [#359](https://gitlab.com/datadrivendiscovery/d3m/issues/359) + [!256](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/256) + **Backwards incompatible.** +* All d3m schemas are now fully valid according to JSON schema draft v4. + [#79](https://gitlab.com/datadrivendiscovery/d3m/issues/79) + [!233](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/233) +* Fixed an error when saving a fitted pipeline to stdout. + [#353](https://gitlab.com/datadrivendiscovery/d3m/issues/353) + [!250](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/250) +* Hyper-parameters cannot use `NaN` and infinity floating-point values + as their bounds. This assures compatibility with JSON. + [#324](https://gitlab.com/datadrivendiscovery/d3m/issues/324) + [!237](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/237) + **Backwards incompatible.** +* Pipelines are now exported to JSON in strict compliance of the + JSON specification. + [#323](https://gitlab.com/datadrivendiscovery/d3m/issues/323) + [!238](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/238) +* Runtime execution does not fail anymore if predictions cannot be converted + to JSON for pipeline run. A warning is issued instead. + [#347](https://gitlab.com/datadrivendiscovery/d3m/issues/347) + [!227](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/227) +* Better support for running reference runtime without exceptions on non-Linux + operating systems. + [#246](https://gitlab.com/datadrivendiscovery/d3m/issues/246) + [!218](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/218) +* Strict checking of dataset, pipeline and primitive digests against those provided + in metadata are now correctly controlled using `--strict-digest`/`strict_digest` + arguments. + [#346](https://gitlab.com/datadrivendiscovery/d3m/issues/346) + [!213](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/213) +* Fixed error propagation in `evaluate` runtime function, if error + happens during scoring. + [!210](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/210) +* Fixed accessing container DataFrame's `metadata` attribute when + DataFrame also contains a column with the name `metadata`. 
+ [#330](https://gitlab.com/datadrivendiscovery/d3m/issues/330) + [!201](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/201) +* Fixed `.meta` file resolving when `--datasets` runtime argument + is not an absolute path. + [!194](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/194) +* Fixed `get_relations_graph` resolving of column names (used in `Denormalize` + common primitive). + [!196](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/196) + +### Other + +* Other validation functions for metalearning documents. This includes + also CLI to validate. + [#220](https://gitlab.com/datadrivendiscovery/d3m/issues/220) + [!233](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/233) +* Pipeline run schema now requires scoring dataset inputs to be recorded + if a data preparation pipeline has not been used. + [!243](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/243) + **Backwards incompatible.** +* Core package now provides standard scoring primitive and scoring pipeline + which are used by runtime by default. + [#307](https://gitlab.com/datadrivendiscovery/d3m/issues/307) + [!231](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/231) +* Pipeline run can now be generated also for a subset of non-standard + pipelines: those which have all inputs of `Dataset` type. + [!232](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/232) +* Pipeline run now also records a normalized score, if available. + [!230](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/230) +* Pipeline `context` field has been removed from schema and implementation. + [!229](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/229) +* Added `pure_primitive` field to primitive's metadata so that primitives + can mark themselves as not pure (by default all primitives are seen as pure). + [#331](https://gitlab.com/datadrivendiscovery/d3m/issues/331) + [!226](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/226) +* `Metadata` methods `to_json_structure` and `to_simple_structure` has been + modified to not return anymore internal metadata representation but + metadata representation equivalent to what you get from `query` call. + To obtain internal representation use `to_internal_json_structure` + and `to_internal_simple_structure`. + [!225](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/225) + **Backwards incompatible.** +* `NeuralNetworkModuleMixin` and `NeuralNetworkObjectMixin` have been + added to primitive interfaces to support representing neural networks + as pipelines. + [#174](https://gitlab.com/datadrivendiscovery/d3m/issues/174) + [!87](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/87) +* `get_loss_function` has been renamed to `get_loss_metric` in + `LossFunctionMixin`. + [!87](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/87) + **Backwards incompatible.** +* `UniformInt`, `Uniform`, and `LogUniform` hyper-parameter classes now + subclass `Bounded` class. + [!216](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/216) +* Metrics do not have default parameter values anymore, cleaned legacy + parts of code assuming so. 
+ [!212](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/212) +* Added new semantic types: + * `https://metadata.datadrivendiscovery.org/types/EdgeSource` + * `https://metadata.datadrivendiscovery.org/types/DirectedEdgeSource` + * `https://metadata.datadrivendiscovery.org/types/UndirectedEdgeSource` + * `https://metadata.datadrivendiscovery.org/types/SimpleEdgeSource` + * `https://metadata.datadrivendiscovery.org/types/MultiEdgeSource` + * `https://metadata.datadrivendiscovery.org/types/EdgeTarget` + * `https://metadata.datadrivendiscovery.org/types/DirectedEdgeTarget` + * `https://metadata.datadrivendiscovery.org/types/UndirectedEdgeTarget` + * `https://metadata.datadrivendiscovery.org/types/SimpleEdgeTarget` + * `https://metadata.datadrivendiscovery.org/types/MultiEdgeTarget` + * `https://metadata.datadrivendiscovery.org/types/ConstructedAttribute` + * `https://metadata.datadrivendiscovery.org/types/SuggestedGroupingKey` + * `https://metadata.datadrivendiscovery.org/types/GroupingKey` + + [#134](https://gitlab.com/datadrivendiscovery/d3m/issues/134) + [#348](https://gitlab.com/datadrivendiscovery/d3m/issues/348) + [!211](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/211) + [!214](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/214) + +* Updated core dependencies. Some important packages are now at versions: + * `scikit-learn`: 0.20.3 + * `pyarrow`: 0.13.0 + + [!206](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/206) + +* Clarified in primitive interface documentation that if primitive should have been + fitted before calling its produce method, but it has not been, primitive should + raise a ``PrimitiveNotFittedError`` exception. + [!204](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/204) +* Added to `algorithm_types`: + + * `EQUI_JOIN` + * `DATA_RETRIEVAL` + * `DATA_MAPPING` + * `MAP` + * `INFORMATION_THEORETIC_METAFEATURE_EXTRACTION` + * `LANDMARKING_METAFEATURE_EXTRACTION` + * `MODEL_BASED_METAFEATURE_EXTRACTION` + * `STATISTICAL_METAFEATURE_EXTRACTION` + * `VECTORIZATION` + * `BERT` + + [!160](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/160) + [!186](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/186) + [!224](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/224) + [!247](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/247) + +* Primitive family `METAFEATURE_EXTRACTION` has been renamed to `METALEARNING`. + [!160](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/160) + **Backwards incompatible.** + +## v2019.4.4 + +* With this release metadata is not automatically generated anymore when DataFrame or ndarray + is being wrapped into a corresponding container type. Now you have to explicitly set + `generate_metadata` constructor argument to `True` or call `generate` method on metadata + object afterwards. + This has been changed to improve performance of many primitives and operations on + container types which were slowed down because of unnecessary and unexpected + generation of metadata. + This change requires manual inspection of primitive's code to determine what change + is necessary. Some suggestions what to look for: + * `set_for_value` method has been deprecated: generally it can be replaced with `generate` + call, or even removed in some cases: + * `value.metadata = value.metadata.set_for_value(value, generate_metadata=False)` remove. + * `value.metadata = new_metadata.set_for_value(value, generate_metadata=False)` replace with `value.metadata = new_metadata`. 
+ * `value.metadata = new_metadata.set_for_value(value, generate_metadata=True)` replace with `value.metadata = new_metadata.generate(value)`. + * `clear` method has been deprecated: generally you can now instead simply create + a fresh instance of `DataMetadata`, potentially calling `generate` as well: + * `outputs_metadata = inputs_metadata.clear(new_metadata, for_value=outputs, generate_metadata=True)` replace with + `outputs_metadata = metadata_base.DataMetadata(metadata).generate(outputs)`. + * `outputs_metadata = inputs_metadata.clear(for_value=outputs, generate_metadata=False)` replace with + `outputs_metadata = metadata_base.DataMetadata()`. + * Search for all calls to constructors of `container.List`, `container.ndarray`, + `container.Dataset`, `container.DataFrame` container types and explicitly set + `generate_metadata` to `True`. Alternatively, you can also manually update + metadata instead of relying on automatic metadata generation. + * The main idea is that if you are using automatic metadata generation in your primitive, + make sure you generate it only once, just before you return container type from + your primitive. Of course, if you call code which expects metadata from inside your primitive, + you might have to assure or generate metadata before calling that code as well. + +### Enhancements + +* Primitives now get a `temporary_directory` constructor argument pointing + to a directory they can use to store any files for the duration of current pipeline + run phase. The main intent of this temporary directory is to store files referenced + by any ``Dataset`` object your primitive might create and followup primitives in + the pipeline should have access to. To support configuration of the location of these + temporary directories, the reference runtime now has a `--scratch` command line argument + and corresponding `scratch_dir` constructor argument. + [#306](https://gitlab.com/datadrivendiscovery/d3m/issues/306) + [!190](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/190) +* Made sure that number of inputs provided to the runtime has to match the number of inputs a pipeline accepts. + [#301](https://gitlab.com/datadrivendiscovery/d3m/issues/301) + [!183](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/183) +* Supported MIT-LL dataset and problem schemas version 3.3.0. Now all suggested targets and suggested privileged data + columns are now by default also attributes. Runtime makes sure that if any column is marked as problem description's + target it is not marked as an attribute anymore. + [#291](https://gitlab.com/datadrivendiscovery/d3m/issues/291) + [!182](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/182) + **Backwards incompatible.** +* `steps` and `method_calls` made optional in pipeline run schema to allow easier recording of failed pipelines. + [!167](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/167) +* Pipeline run now records also start and end timestamps of pipelines and steps. + [#258](https://gitlab.com/datadrivendiscovery/d3m/issues/258) + [!162](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/162) +* `Metadata` has two new methods to query metadata, `query_field` and `query_field_with_exceptions` + which you can use when you want to query just a field of metadata, and not whole metadata object. + Similarly, `DataMetadata` has a new method `query_column_field`. 
+* `DataMetadata`'s `generate` method has now `compact` argument to control it automatically + generated metadata is compacted (if all elements of a dimension have equal metadata, it is + compacted into `ALL_ELEMENTS` selector segment) or not (default). + There is also a `compact` method available on `Metadata` to compact metadata on demand. +* Automatically generated metadata is not automatically compacted anymore by default + (compacting is when all elements of a dimension have equal metadata, moving that + metadata `ALL_ELEMENTS` selector segment). +* `generate_metadata` argument of container types' constructors has been switched + from default `True` to default `False` to prevent unnecessary and unexpected + generation of metadata, slowing down execution of primitives. Moreover, + `DataMetadata` has now a method `generate` which can be used to explicitly + generate and update metadata given a data value. + Metadata methods `set_for_value` and `clear` have been deprecated and can + be generally replaced with `generate` call, or creating a new metadata + object, or removing the call. + **Backwards incompatible.** + [#143](https://gitlab.com/datadrivendiscovery/d3m/issues/143) + [!180](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/180) +* Loading of datasets with many files has been heavily optimized. + [#164](https://gitlab.com/datadrivendiscovery/d3m/issues/164) + [!136](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/136) + [!178](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/178) + [!](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/179) +* Extended container's `DataFrame.to_csv` method to use by default + metadata column names for CSV header instead of column names of + `DataFrame` itself. + [!158](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/158). +* Problem parsing has been refactored into extendable system similar to how + dataset parsing is done. A simple `d3m.metadata.problem.Problem` class has + been defined to contain a problem description. Default implementation supports + loading of D3M problems. `--problem` command line argument to reference runtime + can now be a path or URI to a problem description. + [#276](https://gitlab.com/datadrivendiscovery/d3m/issues/276) + [!145](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/145) +* Data metadata is not validated anymore at every update, but only when explicitly + validated using the `check` method. This improves metadata performance. + [!144](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/144) + +### Other + +* Top-level runtime functions now also return `Result` (or new `MultiResult`) + objects instead of raising special `PipelineRunError` exception (which has been + removed) and instead of returning just pipeline run (which is available + inside `Result`). + [#297](https://gitlab.com/datadrivendiscovery/d3m/issues/297) + [!192](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/192) + **Backwards incompatible.** +* Metrics have been reimplemented to operate on whole predictions DataFrame. + [#304](https://gitlab.com/datadrivendiscovery/d3m/issues/304) + [#311](https://gitlab.com/datadrivendiscovery/d3m/issues/311) + [!171](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/171) + **Backwards incompatible.** +* Pipeline run implementation has been refactored to be in a single class to + facilitate easier subclassing. 
+ [#255](https://gitlab.com/datadrivendiscovery/d3m/issues/255) + [#305](https://gitlab.com/datadrivendiscovery/d3m/issues/305) + [!164](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/164) +* Added new semantic types: + * `https://metadata.datadrivendiscovery.org/types/PrimaryMultiKey` + * `https://metadata.datadrivendiscovery.org/types/BoundingPolygon` + * `https://metadata.datadrivendiscovery.org/types/UnknownType` +* Removed semantic types: + * `https://metadata.datadrivendiscovery.org/types/BoundingBox` + * `https://metadata.datadrivendiscovery.org/types/BoundingBoxXMin` + * `https://metadata.datadrivendiscovery.org/types/BoundingBoxYMin` + * `https://metadata.datadrivendiscovery.org/types/BoundingBoxXMax` + * `https://metadata.datadrivendiscovery.org/types/BoundingBoxYMax` + + **Backwards incompatible.** +* Added to `primitive_family`: + * `SEMISUPERVISED_CLASSIFICATION` + * `SEMISUPERVISED_REGRESSION` + * `VERTEX_CLASSIFICATION` +* Added to `task_type`: + * `SEMISUPERVISED_CLASSIFICATION` + * `SEMISUPERVISED_REGRESSION` + * `VERTEX_CLASSIFICATION` +* Added to `performance_metric`: + * `HAMMING_LOSS` +* Removed from `performance_metric`: + * `ROOT_MEAN_SQUARED_ERROR_AVG` + + **Backwards incompatible.** +* Added `https://metadata.datadrivendiscovery.org/types/GPUResourcesUseParameter` and + `https://metadata.datadrivendiscovery.org/types/CPUResourcesUseParameter` semantic types for + primitive hyper-parameters which control the use of GPUs and CPUs (cores), respectively. + You can use these semantic types to mark which hyper-parameter defines a range of how many + GPUs or CPUs (cores), respectively, a primitive can and should use. + [#39](https://gitlab.com/datadrivendiscovery/d3m/issues/39) + [!177](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/177) +* Added `get_hyperparams` and `get_volumes` helper methods to `PrimitiveMetadata` + so that it is easier to obtain hyper-parameters definitions class of a primitive. + [#163](https://gitlab.com/datadrivendiscovery/d3m/issues/163) + [!175](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/175) +* Pipeline run schema now records the global seed used by the runtime to run the pipeline. + [!187](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/187) +* Core package scores output now includes also a random seed column. + [#299](https://gitlab.com/datadrivendiscovery/d3m/issues/299) + [!185](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/185) +* Metrics in core packages now take as input whole predictions DataFrame + objects and compute scores over them. So `applicability_to_targets` metric + method has been removed, and also code which handles the list of target + columns metric used to compute the score. This is not needed anymore + because now all columns are always used by all metrics. Moreover, + corresponding `dataset_id` and `targets` fields have been removed from + pipeline run schema. +* Core package now requires pip 19 or later to be installed. + `--process-dependency-links` argument when installing the package is not needed + nor supported anymore. + Primitives should not require use of `--process-dependency-links` to install + them either. Instead use link dependencies as described in + [PEP 508](https://www.python.org/dev/peps/pep-0508/). + [#285](https://gitlab.com/datadrivendiscovery/d3m/issues/285) + [!176](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/176) + **Backwards incompatible.** +* `outputs` field in parsed problem description has been removed. 
+ [#290](https://gitlab.com/datadrivendiscovery/d3m/issues/290) + [!174](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/174) + **Backwards incompatible.** +* `Hyperparameter`'s `value_to_json` and `value_from_json` methods have been + renamed to `value_to_json_structure` and `value_from_json_structure`, respectively. + [#122](https://gitlab.com/datadrivendiscovery/d3m/issues/122) + [#173](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/173) +* Moved utility functions from common primitives package to core package: + + * `copy_metadata` to `Metadata.copy_to` method + * `select_columns` to `DataFrame.select_columns` method + * `select_columns_metadata` to `DataMetadata.select_columns` method + * `list_columns_with_semantic_types` to `DataMetadata.list_columns_with_semantic_types` method + * `list_columns_with_structural_types` to `DataMetadata.list_columns_with_structural_types` method + * `remove_columns` to `DataFrame.remove_columns` method + * `remove_columns_metadata` to `DataMetadata.remove_columns` method + * `append_columns` to `DataFrame.append_columns` method + * `append_columns_metadata` to `DataMetadata.append_columns` method + * `insert_columns` to `DataFrame.insert_columns` method + * `insert_columns_metadata` to `DataMetadata.insert_columns` method + * `replace_columns` to `DataFrame.replace_columns` method + * `replace_columns_metadata` to `DataMetadata.replace_columns` method + * `get_index_columns` to `DataMetadata.get_index_columns` method + * `horizontal_concat` to `DataFrame.horizontal_concat` method + * `horizontal_concat_metadata` to `DataMetadata.horizontal_concat` method + * `get_columns_to_use` to `d3m.base.utils.get_columns_to_use` function + * `combine_columns` to `d3m.base.utils.combine_columns` function + * `combine_columns_metadata` to `d3m.base.utils.combine_columns_metadata` function + * `set_table_metadata` to `DataMetadata.set_table_metadata` method + * `get_column_index_from_column_name` to `DataMetadata.get_column_index_from_column_name` method + * `build_relation_graph` to `Dataset.get_relations_graph` method + * `get_tabular_resource` to `d3m.base.utils.get_tabular_resource` function + * `get_tabular_resource_metadata` to `d3m.base.utils.get_tabular_resource_metadata` function + * `cut_dataset` to `Dataset.select_rows` method + + [#148](https://gitlab.com/datadrivendiscovery/d3m/issues/148) + [!172](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/172) + +* Updated core dependencies. Some important packages are now at versions: + * `pyarrow`: 0.12.1 + + [!156](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/156) + +## v2019.2.18 + +### Bugfixes + +* JSON schema for problem descriptions has been fixed to allow loading + D3M problem descriptions with data augmentation fields. + [#284](https://gitlab.com/datadrivendiscovery/d3m/issues/284) + [!154](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/154) +* Utils now contains representers to encode numpy float and integer numbers + for YAML. Importing `utils` registers them. + [#275](https://gitlab.com/datadrivendiscovery/d3m/issues/275) + [!148](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/148) +* Made sure all JSON files are read with UTF-8 encoding, so that we do + not depend on the encoding of the environment. 
+ [!150](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/150) + [!153](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/153) + +## v2019.2.12 + +### Enhancements + +* Runtime now makes sure that target columns are never marked as attributes. + [#265](https://gitlab.com/datadrivendiscovery/d3m/issues/265) + [!131](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/131) +* When using runtime CLI, pipeline run output is made even in the case of an + exception. Moreover, exception thrown from `Result.check_success` contains + associated pipeline runs in its `pipeline_runs` attribute. + [#245](https://gitlab.com/datadrivendiscovery/d3m/issues/245) + [!120](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/120) +* Made additional relaxations when reading D3M datasets and problem descriptions + to not require required fields which have defaults. + [!128](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/128) +* When loading D3M datasets and problem descriptions, package now just warns + if they have an unsupported schema version and continues to load them. + [#247](https://gitlab.com/datadrivendiscovery/d3m/issues/247) + [!119](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/119) +* Added to `primitive_family`: + + * `NATURAL_LANGUAGE_PROCESSING` + + [!125](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/125) + +### Bugfixes + +* Fixed an unexpected exception when running a pipeline using reference + runtime but not requesting to return output values. + [#260](https://gitlab.com/datadrivendiscovery/d3m/issues/260) + [!127](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/127) +* Fixed infinite recursion loop which happened if Python logging was + configured inside primitive's method call. Moreover, recording of + logging records for pipeline run changed so that it does not modify + the record itself while recording it. + [#250](https://gitlab.com/datadrivendiscovery/d3m/issues/250) + [#123](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/123) +* Correctly populate `volumes` primitive constructor argument. + Before it was not really possible to use primitive static files with + reference runtime. + [!132](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/132) +* Fixed runtime/pipeline run configuration through environment variables. + Now it reads them without throwing an exception. + [#274](https://gitlab.com/datadrivendiscovery/d3m/issues/274) + [!118](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/118) + [!137](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/137) + +## v2019.1.21 + +* Some enumeration classes were moved and renamed: + * `d3m.metadata.pipeline.ArgumentType` to `d3m.metadata.base.ArgumentType` + * `d3m.metadata.pipeline.PipelineContext` to `d3m.metadata.base.Context` + * `d3m.metadata.pipeline.PipelineStep` to `d3m.metadata.base.PipelineStepType` + + **Backwards incompatible.** + +* Added `pipeline_run.json` JSON schema which describes the results of running a + pipeline as described by the `pipeline.json` JSON schema. Also implemented + a reference pipeline run output for reference runtime. + [#165](https://gitlab.com/datadrivendiscovery/d3m/issues/165) + [!59](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/59) +* When computing primitive digests, primitive's ID is included in the + hash so that digest is not the same for all primitives from the same + package. 
+ [#154](https://gitlab.com/datadrivendiscovery/d3m/issues/154) +* When datasets are loaded, digest of their metadata and data can be + computed. To control when this is done, `compute_digest` argument + to `Dataset.load` can now take the following `ComputeDigest` + enumeration values: `ALWAYS`, `ONLY_IF_MISSING` (default), and `NEVER`. + [!75](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/75) +* Added `digest` field to pipeline descriptions. Digest is computed based + on the pipeline document and it helps differentiate between pipelines + with same `id`. When loading a pipeline, if there + is a digest mismatch a warning is issued. You can use + `strict_digest` argument to request an exception instead. + [#190](https://gitlab.com/datadrivendiscovery/d3m/issues/190) + [!75](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/75) +* Added `digest` field to problem description metadata. + This `digest` field is computed based on the problem description document + and it helps differentiate between problem descriptions with same `id`. + [#190](https://gitlab.com/datadrivendiscovery/d3m/issues/190) + [!75](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/75) +* Moved `id`, `version`, `name`, `other_names`, and `description` fields + in problem schema to top-level of the problem description. Moreover, made + `id` required. This aligns it more with the structure of other descriptions we have. + [!75](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/75) + **Backwards incompatible.** +* Pipelines can now provide multiple inputs to the same primitive argument. + In such case runtime wraps those inputs into a `List` container type, and then + passes the list to the primitive. + [#200](https://gitlab.com/datadrivendiscovery/d3m/issues/200) + [!112](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/112) +* Primitives now have a method `fit_multi_produce` which primitive author can + override to implement an optimized version of both fitting and producing a primitive on same data. + The default implementation just calls `set_training_data`, `fit` and produce methods. + If your primitive has non-standard additional arguments in its `produce` method(s) then you + will have to implement `fit_multi_produce` method to accept those additional arguments + as well, similarly to how you have had to do for `multi_produce`. + [#117](https://gitlab.com/datadrivendiscovery/d3m/issues/117) + [!110](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/110) + **Could be backwards incompatible.** +* `source`, `timestamp`, and `check` arguments to all metadata functions and container types' + constructors have been deprecated. You do not have to and should not be providing them anymore. + [#171](https://gitlab.com/datadrivendiscovery/d3m/issues/171) + [#172](https://gitlab.com/datadrivendiscovery/d3m/issues/172) + [#173](https://gitlab.com/datadrivendiscovery/d3m/issues/173) + [!108](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/108) + [!109](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/109) +* Primitive's constructor is not run anymore during importing of primitive's class + which allows one to use constructor to load things and do any resource + allocation/reservation. Constructor is now the preferred place to do so. 
+  [#158](https://gitlab.com/datadrivendiscovery/d3m/issues/158)
+  [!107](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/107)
+* `foreign_key` metadata has been extended with `RESOURCE` type which allows
+  referencing another resource in the same dataset.
+  [#221](https://gitlab.com/datadrivendiscovery/d3m/issues/221)
+  [!105](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/105)
+* Updated supported D3M dataset and problem schema both to version 3.2.0.
+  Problem description parsing supports data augmentation metadata.
+  A new approach for LUPI datasets and problems is now supported,
+  including runtime support.
+  Moreover, if a dataset's resource name is `learningData`, it is marked as a
+  dataset entry point.
+  [#229](https://gitlab.com/datadrivendiscovery/d3m/issues/229)
+  [#225](https://gitlab.com/datadrivendiscovery/d3m/issues/225)
+  [#226](https://gitlab.com/datadrivendiscovery/d3m/issues/226)
+  [!97](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/97)
+* Added support for "raw" datasets.
+  [#217](https://gitlab.com/datadrivendiscovery/d3m/issues/217)
+  [!94](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/94)
+* A warning is issued if a primitive does not provide a description through
+  its docstring.
+  [#167](https://gitlab.com/datadrivendiscovery/d3m/issues/167)
+  [!101](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/101)
+* A warning is now issued if an installable primitive is lacking contact or bug
+  tracker URI metadata.
+  [#178](https://gitlab.com/datadrivendiscovery/d3m/issues/178)
+  [!81](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/81)
+* `Pipeline` class now also has `equals` and `hash` methods which can help
+  determine if two pipelines are equal in the sense of isomorphism.
+  [!53](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/53)
+* `Pipeline` and pipeline step classes now have a `get_all_hyperparams`
+  method to return all hyper-parameters defined for a pipeline and its steps.
+  [#222](https://gitlab.com/datadrivendiscovery/d3m/issues/222)
+  [!104](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/104)
+* Implemented a check for primitive Python paths to assure that they adhere
+  to the new standard requiring them to be of the form `d3m.primitives.primitive_family.primitive_name.kind`
+  (e.g., `d3m.primitives.classification.random_forest.SKLearn`).
+  Currently there is a warning if a primitive has a different Python path,
+  and after January 2019 it will be an error.
+  For the `primitive_name` segment there is a [`primitive_names.py`](./d3m/metadata/primitive_names.py)
+  file containing a list of all allowed primitive names.
+  Everyone is encouraged to help curate this list and suggest improvements (merging, removals, additions)
+  to the values in that list. The initial version was mostly automatically made from an existing list of
+  values used by current primitives.
+ [#3](https://gitlab.com/datadrivendiscovery/d3m/issues/3) + [!67](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/67) +* Added to semantic types: + * `https://metadata.datadrivendiscovery.org/types/TokenizableIntoNumericAndAlphaTokens` + * `https://metadata.datadrivendiscovery.org/types/TokenizableByPunctuation` + * `https://metadata.datadrivendiscovery.org/types/AmericanPhoneNumber` + * `https://metadata.datadrivendiscovery.org/types/UnspecifiedStructure` + * `http://schema.org/email` + * `http://schema.org/URL` + * `http://schema.org/address` + * `http://schema.org/State` + * `http://schema.org/City` + * `http://schema.org/Country` + * `http://schema.org/addressCountry` + * `http://schema.org/postalCode` + * `http://schema.org/latitude` + * `http://schema.org/longitude` + + [!62](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/62) + [!95](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/95) + [!94](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/94) + +* Updated core dependencies. Some important packages are now at versions: + * `scikit-learn`: 0.20.2 + * `numpy`: 1.15.4 + * `pandas`: 0.23.4 + * `networkx`: 2.2 + * `pyarrow`: 0.11.1 + + [#106](https://gitlab.com/datadrivendiscovery/d3m/issues/106) + [#175](https://gitlab.com/datadrivendiscovery/d3m/issues/175) + +* Added to `algorithm_types`: + * `IDENTITY_FUNCTION` + * `DATA_SPLITTING` + * `BREADTH_FIRST_SEARCH` +* Moved a major part of README to Sphinx documentation which is built + and available at [http://docs.datadrivendiscovery.org/](http://docs.datadrivendiscovery.org/). +* Added a `produce_methods` argument to `Primitive` hyper-parameter class + which allows one to limit matching primitives only to those providing all + of the listed produce methods. + [#124](https://gitlab.com/datadrivendiscovery/d3m/issues/124) + [!56](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/56) +* Fixed `sample_multiple` method of the `Hyperparameter` class. + [#157](https://gitlab.com/datadrivendiscovery/d3m/issues/157) + [!50](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/50) +* Fixed pickling of `Choice` hyper-parameter. + [!49](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/49) + [!51](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/51) +* Added `Constant` hyper-parameter class. + [#186](https://gitlab.com/datadrivendiscovery/d3m/issues/186) + [!90](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/90) +* Added `count` to aggregate values in metafeatures. 
+ [!52](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/52) +* Clarified and generalized some metafeatures, mostly renamed so that it can be + used on attributes as well: + * `number_of_classes` to `number_distinct_values` + * `class_entropy` to `entropy_of_values` + * `majority_class_ratio` to `value_probabilities_aggregate.max` + * `minority_class_ratio` to `value_probabilities_aggregate.min` + * `majority_class_size` to `value_counts_aggregate.max` + * `minority_class_size` to `value_counts_aggregate.min` + * `class_probabilities` to `value_probabilities_aggregate` + * `target_values` to `values_aggregate` + * `means_of_attributes` to `mean_of_attributes` + * `standard_deviations_of_attributes` to `standard_deviation_of_attributes` + * `categorical_joint_entropy` to `joint_entropy_of_categorical_attributes` + * `numeric_joint_entropy` to `joint_entropy_of_numeric_attributes` + * `pearson_correlation_of_attributes` to `pearson_correlation_of_numeric_attributes` + * `spearman_correlation_of_attributes` to `spearman_correlation_of_numeric_attributes` + * `canonical_correlation` to `canonical_correlation_of_numeric_attributes` + + [!52](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/52) + +* Added metafeatures: + * `default_accuracy` + * `oner` + * `jrip` + * `naive_bayes_tree` + * `number_of_string_attributes` + * `ratio_of_string_attributes` + * `number_of_other_attributes` + * `ratio_of_other_attributes` + * `attribute_counts_by_structural_type` + * `attribute_ratios_by_structural_type` + * `attribute_counts_by_semantic_type` + * `attribute_ratios_by_semantic_type` + * `value_counts_aggregate` + * `number_distinct_values_of_discrete_attributes` + * `entropy_of_discrete_attributes` + * `joint_entropy_of_discrete_attributes` + * `joint_entropy_of_attributes` + * `mutual_information_of_discrete_attributes` + * `equivalent_number_of_discrete_attributes` + * `discrete_noise_to_signal_ratio` + + [!21](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/21) + [!52](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/52) + +* Added special handling when reading scoring D3M datasets (those with true targets in a separate + file `targets.csv`). When such dataset is detected, the values from the separate file are now + merged into the dataset, and its ID is changed to finish with `SCORE` suffix. Similarly, an + ID of a scoring problem description gets its suffix changed to `SCORE`. + [#176](https://gitlab.com/datadrivendiscovery/d3m/issues/176) +* Organized semantic types and add to some of them parent semantic types to organize/structure + them better. New parent semantic types added: `https://metadata.datadrivendiscovery.org/types/ColumnRole`, + `https://metadata.datadrivendiscovery.org/types/DimensionType`, `https://metadata.datadrivendiscovery.org/types/HyperParameter`. +* Fixed that `dateTime` column type is mapped to `http://schema.org/DateTime` semantic + type and not `https://metadata.datadrivendiscovery.org/types/Time`. + **Backwards incompatible.** +* Updated generated [site for metadata](https://metadata.datadrivendiscovery.org/) and + generate sites describing semantic types. + [#33](https://gitlab.com/datadrivendiscovery/d3m/issues/33) + [#114](https://gitlab.com/datadrivendiscovery/d3m/issues/114) + [!37](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/37) +* Optimized resolving of primitives in `Resolver` to not require loading of + all primitives when loading a pipeline, in the common case. 
+ [#162](https://gitlab.com/datadrivendiscovery/d3m/issues/162) + [!38](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/38) +* Added `NotFoundError`, `AlreadyExistsError`, and `PermissionDeniedError` + exceptions to `d3m.exceptions`. +* `Pipeline`'s `to_json_structure`, `to_json`, and `to_yaml` now have `nest_subpipelines` + argument which allows conversion with nested sub-pipelines instead of them + being only referenced. +* Made sure that Arrow serialization of metadata does not pickle also linked + values (`for_value`). +* Made sure enumerations are picklable. +* `PerformanceMetric` class now has `best_value` and `worst_value` which + return the range of possible values for the metric. Moreover, `normalize` + method normalizes the metric's value to a range between 0 and 1. +* Load D3M dataset qualities only after data is loaded. This fixes + lazy loading of datasets with qualities which was broken before. +* Added `load_all_primitives` argument to the default pipeline `Resolver` + which allows one to control loading of primitives outside of the resolver. +* Added `primitives_blacklist` argument to the default pipeline `Resolver` + which allows one to specify a collection of primitive path prefixes to not + (try to) load. +* Fixed return value of the `fit` method in `TransformerPrimitiveBase`. + It now correctly returns `CallResult` instead of `None`. +* Fixed a typo and renamed `get_primitive_hyparparams` to `get_primitive_hyperparams` + in `PrimitiveStep`. + **Backwards incompatible.** +* Additional methods were added to the `Pipeline` class and step classes, + to support runtime and easier manipulation of pipelines programmatically + (`get_free_hyperparams`, `get_input_data_references`, `has_placeholder`, + `replace_step`, `get_exposable_outputs`). +* Added reference implementation of the runtime. It is available + in the `d3m.runtime` module. This module also has an extensive + command line interface you can access through `python3 -m d3m.runtime`. + [#115](https://gitlab.com/datadrivendiscovery/d3m/issues/115) + [!57](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/57) + [!72](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/72) +* `GeneratorPrimitiveBase` interface has been changed so that `produce` method + accepts a list of non-negative integers as an input instead of a list of `None` values. + This allows for batching and control by the caller which outputs to generate. + Previously outputs would depend on number of calls to `produce` and number of outputs + requested in each call. Now these integers serve as an index into the set of potential + outputs. + **Backwards incompatible.** +* We now try to preserve metadata log in default implementation of `can_accept`. +* Added `sample_rate` field to `dimension` metadata. +* `python3 -m d3m.index download` command now accepts `--prefix` argument to limit the + primitives for which static files are downloaded. Useful for testing. +* Added `check` argument to `DataMetadata`'s `update` and `remove` methods which allows + one to control if selector check against `for_value` should be done or not. When + it is known that selector is valid, not doing the check can speed up those methods. +* Defined metadata field `file_columns` which allows to store known columns metadata for + tables referenced from columns. This is now used by a D3M dataset reader to store known + columns metadata for collections of CSV files. Previously, this metadata was lost despite + being available in Lincoln Labs dataset metadata. 
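+
+For example, the `PerformanceMetric` helpers mentioned above can be used like this
+(a minimal sketch; the import path is an assumption based on the `d3m.metadata.problem`
+module referenced elsewhere in this changelog):
+
+```python
+from d3m.metadata.problem import PerformanceMetric
+
+metric = PerformanceMetric.ACCURACY
+print(metric.best_value(), metric.worst_value())  # range of possible raw metric values
+print(metric.normalize(0.85))                     # metric value mapped into [0, 1]
+```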
+
+## v2018.7.10
+
+* Made sure that `OBJECT_DETECTION_AVERAGE_PRECISION` metric supports operation on
+  a vectorized target column.
+  [#149](https://gitlab.com/datadrivendiscovery/d3m/issues/149)
+* Files in D3M dataset collections are now listed recursively to support datasets
+  with files split into directories.
+  [#146](https://gitlab.com/datadrivendiscovery/d3m/issues/146)
+* When a parameter value for `Params` fails to type check, the name of the parameter is now
+  reported as well.
+  [#135](https://gitlab.com/datadrivendiscovery/d3m/issues/135)
+* `python3 -m d3m.index` now has an additional command `download` which downloads all static
+  files needed by available primitives. Those files are then exposed through the `volumes`
+  constructor argument to primitives by TA2/runtime. Files are stored into an output
+  directory in a standard way where each volume is stored with a file or directory name
+  based on its digest.
+  [#102](https://gitlab.com/datadrivendiscovery/d3m/issues/102)
+* Fixed standard return type of `log_likelihoods`, `log_likelihood`, `losses`, and `loss`
+  primitive methods to support multi-target primitives.
+* Clarified that `can_accept` receives primitive arguments and not just method arguments.
+* Added `https://metadata.datadrivendiscovery.org/types/FilesCollection` for resources which are
+  file collections. Also moved the main semantic type of file collection's values to the column.
+* Fixed conversion of a simple list to a DataFrame.
+* Added `https://metadata.datadrivendiscovery.org/types/Confidence` semantic type for columns
+  representing confidence, and `confidence_for` metadata which can help a confidence column refer
+  to the target column it is the confidence for.
+* Fixed default `can_accept` implementation to return type unwrapped from `CallResult`.
+* Fixed `DataMetadata.remove` to preserve `for_value` value (and allow it to be set through the call).
+* Fixed a case where automatically generated metadata overrode explicitly set existing metadata.
+  [!25](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/25)
+* Fixed `query_with_exceptions` metadata method to correctly return exceptions for
+  deeper selectors.
+* Added to `primitive_family`:
+    * `SCHEMA_DISCOVERY`
+    * `DATA_AUGMENTATION`
+* Added to `algorithm_types`:
+    * `HEURISTIC`
+    * `MARKOV_RANDOM_FIELD`
+    * `LEARNING_USING_PRIVILEGED_INFORMATION`
+    * `APPROXIMATE_DATA_AUGMENTATION`
+* Added `PrimitiveNotFittedError`, `DimensionalityMismatchError`, and `MissingValueError`
+  exceptions to `d3m.exceptions`.
+  [!22](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/22)
+* Fixed setting semantic types for boundary columns.
+  [#126](https://gitlab.com/datadrivendiscovery/d3m/issues/126) [!23](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/23)
+* Added `video/avi` media type to lists of known media types.
+* Fixed a type check which prevented an additional primitive argument from being of `Union` type.
+* Fixed erroneous removal of empty dicts (`{}`) from metadata when empty dicts were
+  explicitly stored in metadata.
+  [#118](https://gitlab.com/datadrivendiscovery/d3m/issues/118)
+* Made sure that conflicting entry points are resolved in a deterministic way.
+* Made sure primitive metadata's `python_path` matches the path under which
+  a primitive is registered under `d3m.primitives`. This also prevents
+  a primitive from being registered twice at different paths in the namespace.
+ [#4](https://gitlab.com/datadrivendiscovery/d3m/issues/4) +* Fixed a bug which prevented registration of primitives at deeper levels + (e.g., `d3m.primitives...`). + [#121](https://gitlab.com/datadrivendiscovery/d3m/issues/121) + +## v2018.6.5 + +* `Metadata` class got additional methods to manipulate metadata: + * `remove(selector)` removes metadata at `selector`. + * `query_with_exceptions(selector)` to return metadata for selectors which + have metadata which differs from that of `ALL_ELEMENTS`. + * `add_semantic_type`, `has_semantic_type`, `remove_semantic_type`, + `get_elements_with_semantic_type` to help with semantic types. + * `query_column`, `update_column`, `remove_column`, `get_columns_with_semantic_type` + to make it easier to work with tabular data. + + [#55](https://gitlab.com/datadrivendiscovery/d3m/issues/55) + [#78](https://gitlab.com/datadrivendiscovery/d3m/issues/78) + +* Container `List` now inherits from a regular Python `list` and not from `typing.List`. + It does not have anymore a type variable. Typing information is stored in `metadata` + anyway (`structural_type`). This simplifies type checking (and improves performance) + and fixes pickling issues. + **Backwards incompatible.** +* `Hyperparams` class' `defaults` method now accepts optional `path` argument which + allows one to fetch defaults from nested hyper-parameters. +* `Hyperparameters` class and its subclasses now have `get_default` method instead + of a property `default`. + **Backwards incompatible.** +* `Hyperparams` class got a new method `replace` which makes it easier to modify + hyper-parameter values. +* `Set` hyper-parameter can now accept also a hyper-parameters configuration as elements + which allows one to define a set of multiple hyper-parameters per each set element. + [#94](https://gitlab.com/datadrivendiscovery/d3m/issues/94) +* Pipeline's `check` method now checks structural types of inputs and outputs and assures + they match. + [!19](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/19) +* `Set` hyper-parameter now uses tuple of unique elements instead of set to represent the set. + This assures that the order of elements is preserved to help with reproducibility when + iterating over a set. + **Backwards incompatible.** + [#109](https://gitlab.com/datadrivendiscovery/d3m/issues/109) +* `Set` hyper-parameter can now be defined without `max_samples` argument to allow a set + without an upper limit on the number of elements. + `min_samples` and `max_samples` arguments to `Set` constructor have been switched as + a consequence, to have a more intuitive order. + Similar changes have been done to `sample_multiple` method of hyper-parameters. + **Backwards incompatible.** + [#110](https://gitlab.com/datadrivendiscovery/d3m/issues/110) +* Core dependencies have been upgraded: `numpy==1.14.3`. `pytypes` is now a released version. +* When converting a numpy array with more than 2 dimensions to a DataFrame, higher dimensions are + automatically converted to nested numpy arrays inside a DataFrame. + [#80](https://gitlab.com/datadrivendiscovery/d3m/issues/80) +* Metadata is now automatically preserved when converting between container types. + [#76](https://gitlab.com/datadrivendiscovery/d3m/issues/76) +* Basic metadata for data values is now automatically generated when using D3M container types. + Value is traversed over its structure and `structural_type` and `dimension` with its `length` + keys are populated. 
Some `semantic_types` are added in simple cases, and `dimension`'s + `name` as well. In some cases analysis of all data to generate metadata can take time, + so you might consider disabling automatic generation by setting `generate_metadata` + to `False` in container's constructor or `set_for_value` calls and then manually populating + necessary metadata. + [#35](https://gitlab.com/datadrivendiscovery/d3m/issues/35) + [!6](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/6) + [!11](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/11) +* When reading D3M datasets, `media_types` metadata now includes proper media types + for the column, and also media type for each particular row (file). +* D3M dataset and problem description parsing has been updated to 3.1.2 version: + * `Dataset` class now supports loading `edgeList` resources. + * `primitive_family` now includes `OBJECT_DETECTION`. + * `task_type` now includes `OBJECT_DETECTION`. + * `performance_metrics` now includes `PRECISION`, `RECALL`, `OBJECT_DETECTION_AVERAGE_PRECISION`. + * `targets` of a problem description now includes `clusters_number`. + * New metadata `boundary_for` can now describe for which other column + a column is a boundary for. + * Support for `realVector`, `json` and `geojson` column types. + * Support for `boundingBox` column role. + * New semantic types: + * `https://metadata.datadrivendiscovery.org/types/EdgeList` + * `https://metadata.datadrivendiscovery.org/types/FloatVector` + * `https://metadata.datadrivendiscovery.org/types/JSON` + * `https://metadata.datadrivendiscovery.org/types/GeoJSON` + * `https://metadata.datadrivendiscovery.org/types/Interval` + * `https://metadata.datadrivendiscovery.org/types/IntervalStart` + * `https://metadata.datadrivendiscovery.org/types/IntervalEnd` + * `https://metadata.datadrivendiscovery.org/types/BoundingBox` + * `https://metadata.datadrivendiscovery.org/types/BoundingBoxXMin` + * `https://metadata.datadrivendiscovery.org/types/BoundingBoxYMin` + * `https://metadata.datadrivendiscovery.org/types/BoundingBoxXMax` + * `https://metadata.datadrivendiscovery.org/types/BoundingBoxYMax` + + [#99](https://gitlab.com/datadrivendiscovery/d3m/issues/99) + [#107](https://gitlab.com/datadrivendiscovery/d3m/issues/107) + +* Unified the naming of attributes/features metafeatures to attributes. + **Backwards incompatible.** + [!13](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/13) +* Unified the naming of categorical/nominal metafeatures to categorical. 
+ **Backwards incompatible.** + [!12](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/12) +* Added more metafeatures: + * `pca` + * `random_tree` + * `decision_stump` + * `naive_bayes` + * `linear_discriminant_analysis` + * `knn_1_neighbor` + * `c45_decision_tree` + * `rep_tree` + * `categorical_joint_entropy` + * `numeric_joint_entropy` + * `number_distinct_values_of_numeric_features` + * `class_probabilities` + * `number_of_features` + * `number_of_instances` + * `canonical_correlation` + * `entropy_of_categorical_features` + * `entropy_of_numeric_features` + * `equivalent_number_of_categorical_features` + * `equivalent_number_of_numeric_features` + * `mutual_information_of_categorical_features` + * `mutual_information_of_numeric_features` + * `categorical_noise_to_signal_ratio` + * `numeric_noise_to_signal_ratio` + + [!10](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/10) + [!14](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/14) + [!17](https://gitlab.com/datadrivendiscovery/d3m/merge_requests/17) + +* Added metafeatures for present values: + * `number_of_instances_with_present_values` + * `ratio_of_instances_with_present_values` + * `number_of_present_values` + * `ratio_of_present_values` + + [#84](https://gitlab.com/datadrivendiscovery/d3m/issues/84) + +* Implemented interface for saving datasets. + [#31](https://gitlab.com/datadrivendiscovery/d3m/issues/31) +* To remove a key in metadata, instead of using `None` value one should now use + special `NO_VALUE` value. + **Backwards incompatible.** +* `None` is now serialized to JSON as `null` instead of string `"None"`. + **Could be backwards incompatible.** +* Unified naming and behavior of methods dealing with JSON and JSON-related + data. Now across the package: + * `to_json_structure` returns a structure with values fully compatible with JSON and serializable with default JSON serializer + * `to_simple_structure` returns a structure similar to JSON, but with values left as Python values + * `to_json` returns serialized value as JSON string + + **Backwards incompatible.** + +* Hyper-parameters are now required to specify at least one + semantic type from: `https://metadata.datadrivendiscovery.org/types/TuningParameter`, + `https://metadata.datadrivendiscovery.org/types/ControlParameter`, + `https://metadata.datadrivendiscovery.org/types/ResourcesUseParameter`, + `https://metadata.datadrivendiscovery.org/types/MetafeatureParameter`. + **Backwards incompatible.** +* Made type strings in primitive annotations deterministic. + [#93](https://gitlab.com/datadrivendiscovery/d3m/issues/93) +* Reimplemented primitives loading code to load primitives lazily. + [#74](https://gitlab.com/datadrivendiscovery/d3m/issues/74) +* `d3m.index` module now has new and modified functions: + * `search` now returns a list of Python paths of all potential + primitives defined through entry points (but does not load them + or checks if entry points are valid) + * `get_primitive` loads and returns a primitive given its Python path + * `get_primitive_by_id` returns a primitive given its ID, but a primitive + has to be loaded beforehand + * `get_loaded_primitives` returns a list of all currently loaded primitives + * `load_all` tries to load all primitives + * `register_primitive` now accepts full Python path instead of just suffix + + **Backwards incompatible.** + [#74](https://gitlab.com/datadrivendiscovery/d3m/issues/74) + +* Defined `model_features` primitive metadata to describe features supported + by an underlying model. 
This is useful to allow easy matching between + problem's subtypes and relevant primitives. + [#88](https://gitlab.com/datadrivendiscovery/d3m/issues/88) +* Made hyper-parameter space of an existing `Hyperparams` subclass immutable. + [#91](https://gitlab.com/datadrivendiscovery/d3m/issues/91) +* `d3m.index describe` command now accept `-s`/`--sort-keys` argument which + makes all keys in the JSON output sorted, making output JSON easier to + diff and compare. +* `can_accept` now gets a `hyperparams` object with hyper-parameters under + which to check a method call. This allows `can_accept` to return a result + based on control hyper-parameters. + **Backwards incompatible.** + [#81](https://gitlab.com/datadrivendiscovery/d3m/issues/81) +* Documented that all docstrings should be made according to + [numpy docstring format](https://numpydoc.readthedocs.io/en/latest/format.html). + [#85](https://gitlab.com/datadrivendiscovery/d3m/issues/85) +* Added to semantic types: + * `https://metadata.datadrivendiscovery.org/types/MissingData` + * `https://metadata.datadrivendiscovery.org/types/InvalidData` + * `https://metadata.datadrivendiscovery.org/types/RedactedTarget` + * `https://metadata.datadrivendiscovery.org/types/RedactedPrivilegedData` +* Added to `primitive_family`: + * `TIME_SERIES_EMBEDDING` +* Added to `algorithm_types`: + * `IVECTOR_EXTRACTION` +* Removed `SparseDataFrame` from standard container types because it is being + deprecated in Pandas. + **Backwards incompatible.** + [#95](https://gitlab.com/datadrivendiscovery/d3m/issues/95) +* Defined `other_names` metadata field for any other names a value might have. +* Optimized primitives loading time. + [#87](https://gitlab.com/datadrivendiscovery/d3m/issues/87) +* Made less pickling of values when hyper-parameter has `Union` structural type. + [#83](https://gitlab.com/datadrivendiscovery/d3m/issues/83) +* `DataMetadata.set_for_value` now first checks new value against the metadata, by default. + **Could be backwards incompatible.** +* Added `NO_NESTED_VALUES` primitive precondition and effect. + This allows primitive to specify if it cannot handle values where a container value + contains nested other values with dimensions. + +## v2018.4.18 + +* Added `pipeline.json` JSON schema to this package. Made `problem.json` JSON schema + describing parsed problem description's schema. There is also a `d3m.metadata.pipeline` + parser for pipelines in this schema and Python object to represent a pipeline. + [#53](https://gitlab.com/datadrivendiscovery/d3m/issues/53) +* Updated README to make it explicit that for tabular data the first dimension + is always rows and the second always columns, even in the case of a DataFrame + container type. + [#54](https://gitlab.com/datadrivendiscovery/d3m/issues/54) +* Made `Dataset` container type return Pandas `DataFrame` instead of numpy `ndarray` + and in generaly suggest to use Pandas `DataFrame` as a default container type. + **Backwards incompatible.** + [#49](https://gitlab.com/datadrivendiscovery/d3m/issues/49) +* Added `UniformBool` hyper-parameter class. +* Renamed `FeaturizationPrimitiveBase` to `FeaturizationLearnerPrimitiveBase`. + **Backwards incompatible.** +* Defined `ClusteringTransformerPrimitiveBase` and renamed `ClusteringPrimitiveBase` + to `ClusteringLearnerPrimitiveBase`. + **Backwards incompatible.** + [#20](https://gitlab.com/datadrivendiscovery/d3m/issues/20) +* Added `inputs_across_samples` decorator to mark which method arguments + are inputs which compute across samples. 
+  [#19](https://gitlab.com/datadrivendiscovery/d3m/issues/19)
+* Converted `SingletonOutputMixin` to a `singleton` decorator. This allows
+  each produce method to be marked separately as a singleton produce method.
+  **Backwards incompatible.**
+  [#17](https://gitlab.com/datadrivendiscovery/d3m/issues/17)
+* `can_accept` can also raise an exception with information about why it cannot accept.
+  [#13](https://gitlab.com/datadrivendiscovery/d3m/issues/13)
+* Added `Primitive` hyper-parameter to describe a primitive or primitives.
+  Additionally, better documented in docstrings how to define hyper-parameters which
+  use primitives for their values and how such primitives-as-values should be passed
+  to primitives as their hyper-parameters.
+  [#51](https://gitlab.com/datadrivendiscovery/d3m/issues/51)
+* Hyper-parameter values can now be converted to and from a JSON-compatible structure
+  using `values_to_json` and `values_from_json` methods. Non-primitive values
+  are pickled and stored as base64 strings.
+  [#67](https://gitlab.com/datadrivendiscovery/d3m/issues/67)
+* Added `Choice` hyper-parameter which allows one to define
+  a combination of hyper-parameters which should exist together.
+  [#28](https://gitlab.com/datadrivendiscovery/d3m/issues/28)
+* Added `Set` hyper-parameter which samples another hyper-parameter multiple times.
+  [#52](https://gitlab.com/datadrivendiscovery/d3m/issues/52)
+* Added `https://metadata.datadrivendiscovery.org/types/MetafeatureParameter`
+  semantic type for hyper-parameters which control which meta-features are
+  computed by the primitive.
+  [#41](https://gitlab.com/datadrivendiscovery/d3m/issues/41)
+* Added `supported_media_types` primitive metadata to describe
+  which media types a primitive knows how to manipulate.
+  [#68](https://gitlab.com/datadrivendiscovery/d3m/issues/68)
+* Renamed metadata property `mime_types` to `media_types`.
+  **Backwards incompatible.**
+* Made pyarrow dependency a package extra. You can depend on it using
+  `d3m[arrow]`.
+  [#66](https://gitlab.com/datadrivendiscovery/d3m/issues/66)
+* Added `multi_produce` method to the primitive interface which allows primitives
+  to optimize calls to multiple produce methods they might have.
+  [#21](https://gitlab.com/datadrivendiscovery/d3m/issues/21)
+* Added `d3m.utils.redirect_to_logging` context manager which can help
+  redirect a primitive's output to stdout and stderr to the primitive's logger.
+  [#65](https://gitlab.com/datadrivendiscovery/d3m/issues/65)
+* Primitives can now have a dependency on static files and directories.
+  One can use `FILE` and `TGZ` entries in a primitive's `installation`
+  metadata to ask the caller to provide paths to those files and/or
+  extracted directories through the new `volumes` constructor argument.
+  [#18](https://gitlab.com/datadrivendiscovery/d3m/issues/18)
+* Core dependencies have been upgraded: `numpy==1.14.2`, `networkx==2.1`.
+* LUPI quality in D3M datasets is now parsed into the
+  `https://metadata.datadrivendiscovery.org/types/SuggestedPrivilegedData`
+  semantic type for a column.
+  [#61](https://gitlab.com/datadrivendiscovery/d3m/issues/61)
+* Support for primitives using Docker containers has been put on hold.
+  We are keeping a way to pass information about running containers to a
+  primitive and defining dependent Docker images in metadata, but currently
+  it is not expected that any runtime running primitives will run
+  Docker containers for a primitive.
+ [#18](https://gitlab.com/datadrivendiscovery/d3m/issues/18) +* Primitives do not have to define all constructor arguments anymore. + This allows them to ignore arguments they do not use, e.g., + `docker_containers`. + On the other side, when creating an instance of a primitive, one + has now to check which arguments the constructor accepts, which is + available in primitive's metadata: + `primitive.metadata.query()['primitive_code'].get('instance_methods', {})['__init__']['arguments']`. + [#63](https://gitlab.com/datadrivendiscovery/d3m/issues/63) +* Information about running primitive's Docker container has changed + from just its address to a `DockerContainer` tuple containing both + the address and a map of all exposed ports. + At the same time, support for Docker has been put on hold so you + do not really have to upgrade for this change anything and can simply + remove the `docker_containers` argument from primitive's constructor. + **Backwards incompatible.** + [#14](https://gitlab.com/datadrivendiscovery/d3m/issues/14) +* Multiple exception classes have been defined in `d3m.exceptions` + module and are now in use. This allows easier and more precise + handling of exceptions. + [#12](https://gitlab.com/datadrivendiscovery/d3m/issues/12) +* Fixed inheritance of `Hyperparams` class. + [#44](https://gitlab.com/datadrivendiscovery/d3m/issues/44) +* Each primitive's class now automatically gets an instance of + [Python's logging](https://docs.python.org/3/library/logging.html) + logger stored into its ``logger`` class attribute. The instance is made + under the name of primitive's ``python_path`` metadata value. Primitives + can use this logger to log information at various levels (debug, warning, + error) and even associate extra data with log record using the ``extra`` + argument to the logger calls. + [#10](https://gitlab.com/datadrivendiscovery/d3m/issues/10) +* Made sure container data types can be serialized with Arrow/Plasma + while retaining their metadata. + [#29](https://gitlab.com/datadrivendiscovery/d3m/issues/29) +* `Scores` in `GradientCompositionalityMixin` replaced with `Gradients`. + `Scores` only makes sense in a probabilistic context. +* Renamed `TIMESERIES_CLASSIFICATION`, `TIMESERIES_FORECASTING`, and + `TIMESERIES_SEGMENTATION` primitives families to + `TIME_SERIES_CLASSIFICATION`, `TIME_SERIES_FORECASTING`, and + `TIME_SERIES_SEGMENTATION`, respectively, to match naming + pattern used elsewhere. + Similarly, renamed `UNIFORM_TIMESERIES_SEGMENTATION` algorithm type + to `UNIFORM_TIME_SERIES_SEGMENTATION`. + Compound words using hyphens are separated, but hyphens for prefixes + are not separated. So "Time-series" and "Root-mean-squared error" + become `TIME_SERIES` and `ROOT_MEAN_SQUARED_ERROR` + but "Non-overlapping" and "Multi-class" are `NONOVERLAPPING` and `MULTICLASS`. + **Backwards incompatible.** +* Updated performance metrics to include `PRECISION_AT_TOP_K` metric. +* Added to problem description parsing support for additional metric + parameters and updated performance metric functions to use them. + [#42](https://gitlab.com/datadrivendiscovery/d3m/issues/42) +* Merged `d3m_metadata`, `primitive_interfaces` and `d3m` repositories + into `d3m` repository. 
This requires the following changes of + imports in existing code: + * `d3m_metadata` to `d3m.metadata` + * `primitive_interfaces` to `d3m.primitive_interfaces` + * `d3m_metadata.container` to `d3m.container` + * `d3m_metadata.metadata` to `d3m.metadata.base` + * `d3m_metadata.metadata.utils` to `d3m.utils` + * `d3m_metadata.metadata.types` to `d3m.types` + + **Backwards incompatible.** + [#11](https://gitlab.com/datadrivendiscovery/d3m/issues/11) + +* Fixed computation of sampled values for `LogUniform` hyper-parameter class. + [#47](https://gitlab.com/datadrivendiscovery/d3m/issues/47) +* When copying or slicing container values, metadata is now copied over + instead of cleared. This makes it easier to propagate metadata. + This also means one should make sure to update the metadata in the + new container value to reflect changes to the value itself. + **Could be backwards incompatible.** +* `DataMetadata` now has `set_for_value` method to make a copy of + metadata and set new `for_value` value. You can use this when you + made a new value and you want to copy over metadata, but you also + want this value to be associated with metadata. This is done by + default for container values. +* Metadata now includes SHA256 digest for primitives and datasets. + It is computed automatically during loading. This should allow one to + track exact version of primitive and datasets used. + `d3m.container.dataset.get_d3m_dataset_digest` is a reference + implementation of computing digest for D3M datasets. + You can set `compute_digest` to `False` to disable this. + You can set `strict_digest` to `True` to raise an exception instead + of a warning if computed digest does not match one in metadata. +* Datasets can be now loaded in "lazy" mode: only metadata is loaded + when creating a `Dataset` object. You can use `is_lazy` method to + check if dataset iz lazy and data has not yet been loaded. You can use + `load_lazy` to load data for a lazy object, making it non-lazy. +* There is now an utility metaclass `d3m.metadata.utils.AbstractMetaclass` + which makes classes which use it automatically inherit docstrings + for methods from the parent. Primitive base class and some other D3M + classes are now using it. +* `d3m.metadata.base.CONTAINER_SCHEMA_VERSION` and + `d3m.metadata.base.DATA_SCHEMA_VERSION` were fixed to point to the + correct URI. +* Many `data_metafeatures` properties in metadata schema had type + `numeric` which does not exist in JSON schema. They were fixed to + `number`. +* Added to a list of known semantic types: + `https://metadata.datadrivendiscovery.org/types/Target`, + `https://metadata.datadrivendiscovery.org/types/PredictedTarget`, + `https://metadata.datadrivendiscovery.org/types/TrueTarget`, + `https://metadata.datadrivendiscovery.org/types/Score`, + `https://metadata.datadrivendiscovery.org/types/DatasetEntryPoint`, + `https://metadata.datadrivendiscovery.org/types/SuggestedPrivilegedData`, + `https://metadata.datadrivendiscovery.org/types/PrivilegedData`. +* Added to `algorithm_types`: `ARRAY_CONCATENATION`, `ARRAY_SLICING`, + `ROBUST_PRINCIPAL_COMPONENT_ANALYSIS`, `SUBSPACE_CLUSTERING`, + `SPECTRAL_CLUSTERING`, `RELATIONAL_ALGEBRA`, `MULTICLASS_CLASSIFICATION`, + `MULTILABEL_CLASSIFICATION`, `OVERLAPPING_CLUSTERING`, `SOFT_CLUSTERING`, + `STRICT_PARTITIONING_CLUSTERING`, `STRICT_PARTITIONING_CLUSTERING_WITH_OUTLIERS`, + `UNIVARIATE_REGRESSION`, `NONOVERLAPPING_COMMUNITY_DETECTION`, + `OVERLAPPING_COMMUNITY_DETECTION`. 
+ +## v2018.1.26 + +* Test primitives updated to have `location_uris` metadata. +* Test primitives updated to have `#egg=` package URI suffix in metadata. +* Primitives (instances of their classes) can now be directly pickled + and unpickled. Internally it uses `get_params` and `set_params` in + default implementation. If you need to preserve additional state consider + extending `__getstate__` and `__setstate__` methods. +* Added `RandomPrimitive` test primitive. +* Bumped `numpy` dependency to `1.14` and `pandas` to `0.22`. +* Added `https://metadata.datadrivendiscovery.org/types/ResourcesUseParameter` as a known URI + for `semantic_types` to help convey which hyper-parameters control the use of resources by the + primitive. + [#41](https://gitlab.com/datadrivendiscovery/metadata/issues/41) +* Fixed use of `numpy` values in `Params` and `Hyperparams`. + [#39](https://gitlab.com/datadrivendiscovery/metadata/issues/39) +* Added `upper_inclusive` argument to `UniformInt`, `Uniform`, and `LogUniform` classes + to signal that the upper bound is inclusive (default is exclusive). + [#38](https://gitlab.com/datadrivendiscovery/metadata/issues/38) +* Made `semantic_types` and `description` keyword-only arguments in hyper-parameter description classes. +* Made all enumeration metadata classes have their instances be equal to their string names. +* Made sure `Hyperparams` subclasses can be pickled and unpickled. +* Improved error messages during metadata validation. +* Documented common metadata for primitives and data in the README. +* Added standard deviation to aggregate metadata values possible. +* Added `NO_JAGGED_VALUES` to `preconditions` and `effects`. +* Added to `algorithm_types`: `AGGREGATE_FUNCTION`, `AUDIO_STREAM_MANIPULATION`, `BACKWARD_DIFFERENCE_CODING`, + `BAYESIAN_LINEAR_REGRESSION`, `CATEGORY_ENCODER`, `CROSS_VALIDATION`, `DISCRETIZATION`, `ENCODE_BINARY`, + `ENCODE_ORDINAL`, `FEATURE_SCALING`, `FORWARD_DIFFERENCE_CODING`, `FREQUENCY_TRANSFORM`, `GAUSSIAN_PROCESS`, + `HASHING`, `HELMERT_CODING`, `HOLDOUT`, `K_FOLD`, `LEAVE_ONE_OUT`, `MERSENNE_TWISTER`, `ORTHOGONAL_POLYNOMIAL_CODING`, + `PASSIVE_AGGRESSIVE`, `PROBABILISTIC_DATA_CLEANING`, `QUADRATIC_DISCRIMINANT_ANALYSIS`, `RECEIVER_OPERATING_CHARACTERISTIC`, + `RELATIONAL_DATA_MINING`, `REVERSE_HELMERT_CODING`, `SEMIDEFINITE_EMBEDDING`, `SIGNAL_ENERGY`, `SOFTMAX_FUNCTION`, + `SPRUCE`, `STOCHASTIC_GRADIENT_DESCENT`, `SUM_CODING`, `TRUNCATED_NORMAL_DISTRIBUTION`, `UNIFORM_DISTRIBUTION`. +* Added to `primitive_family`: `DATA_GENERATION`, `DATA_VALIDATION`, `DATA_WRANGLING`, `VIDEO_PROCESSING`. +* Added `NoneType` to the list of data types allowed inside container types. +* For `PIP` dependencies specified by a `package_uri` git URI, an `#egg=package_name` URI suffix is + now required. + +## v2018.1.5 + +* Made use of the PyPI package official. Documented a requirement for + `--process-dependency-links` argument during installation. + [#27](https://gitlab.com/datadrivendiscovery/metadata/issues/27) +* Arguments `learning_rate` and `weight_decay` in `GradientCompositionalityMixin` renamed to + `fine_tune_learning_rate` and `fine_tune_weight_decay`, respectively. + `learning_rate` is a common hyper-parameter name. 
+  [#41](https://gitlab.com/datadrivendiscovery/primitive-interfaces/issues/41)
+* Added `https://metadata.datadrivendiscovery.org/types/TuningParameter` and
+  `https://metadata.datadrivendiscovery.org/types/ControlParameter` as two known URIs for
+  `semantic_types` to help convey which hyper-parameters are true tuning parameters (should be
+  tuned during the hyper-parameter optimization phase) and which are control parameters (should be
+  determined during the pipeline construction phase and are part of the logic of the pipeline).
+* Made `installation` metadata optional. This allows local-only primitives.
+  You can still register them into the D3M namespace using `d3m.index.register_primitive`.
+* Fixed serialization to JSON of hyper-parameters with the `q` argument.
+* Clarified that a primitive's `PIP` dependency `package` has to be installed with the `--process-dependency-links` argument
+  enabled, and `package_uri` with both `--process-dependency-links` and `--editable`, so that primitives can have access
+  to their git history to generate metadata.
+* Only `git+http` and `git+https` URI schemes are allowed for git repository URIs for `package_uri`.
+* Added to `algorithm_types`: `AUDIO_MIXING`, `CANONICAL_CORRELATION_ANALYSIS`, `DATA_PROFILING`, `DEEP_FEATURE_SYNTHESIS`,
+  `INFORMATION_ENTROPY`, `MFCC_FEATURE_EXTRACTION`, `MULTINOMIAL_NAIVE_BAYES`, `MUTUAL_INFORMATION`, `PARAMETRIC_TRAJECTORY_MODELING`,
+  `SIGNAL_DITHERING`, `SIGNAL_TO_NOISE_RATIO`, `STATISTICAL_MOMENT_ANALYSIS`, `UNIFORM_TIMESERIES_SEGMENTATION`.
+* Added to `primitive_family`: `SIMILARITY_MODELING`, `TIMESERIES_CLASSIFICATION`, `TIMESERIES_SEGMENTATION`.
+
+## v2017.12.27
+
+* Documented `produce` method for `ClusteringPrimitiveBase` and added
+  `ClusteringDistanceMatrixMixin`.
+  [#18](https://gitlab.com/datadrivendiscovery/primitive-interfaces/issues/18)
+* Added `can_accept` class method to the primitive base class and provided its
+  default implementation.
+  [#20](https://gitlab.com/datadrivendiscovery/primitive-interfaces/issues/20)
+* "Distance" primitives now accept an extra argument instead of a tuple.
+* `Params` should now be a subclass of `d3m.metadata.params.Params`, which is a
+  specialized dict instead of a named tuple.
+* Removed `Graph` class. There is no need for it anymore because graphs can be identified
+  by having a NetworkX graph as the input type and through metadata discovery.
+* Added `timeout` and `iterations` arguments to more methods.
+* Added `forward` and `backward` backprop methods to `GradientCompositionalityMixin`
+  to allow end-to-end backpropagation across diverse primitives.
+  [#26](https://gitlab.com/datadrivendiscovery/primitive-interfaces/issues/26)
+* Added `log_likelihoods` method to `ProbabilisticCompositionalityMixin`.
+* Constructor now accepts a `docker_containers` argument with addresses of the
+  primitive's running Docker containers.
+  [#25](https://gitlab.com/datadrivendiscovery/primitive-interfaces/issues/25)
+* Removed `CallMetadata` and `get_call_metadata` and changed so that some methods
+  directly return new but similar `CallResult`.
+  [#27](https://gitlab.com/datadrivendiscovery/primitive-interfaces/issues/27)
+* Documented how extra arguments to standard and extra methods can be defined.
+* Documented that all arguments with the same name in all methods should have the
+  same type. Arguments are per primitive, not per method.
+  [#29](https://gitlab.com/datadrivendiscovery/primitive-interfaces/issues/29)
+* Specified how to define extra "produce" methods which have the same semantics
+  as `produce` but different output types.
+  [#30](https://gitlab.com/datadrivendiscovery/primitive-interfaces/issues/30)
+* Added `SingletonOutputMixin` to signal that a primitive's output contains
+  only one element.
+  [#15](https://gitlab.com/datadrivendiscovery/primitive-interfaces/issues/15)
+* Added `get_loss_primitive` to allow access to the loss primitive
+  being used.
+* Moved `set_training_data` back to the base class.
+  This breaks the Liskov substitution principle.
+  [#19](https://gitlab.com/datadrivendiscovery/primitive-interfaces/issues/19)
+* Renamed `__metadata__` to `metadata` attribute.
+  [#23](https://gitlab.com/datadrivendiscovery/primitive-interfaces/issues/23)
+* `set_random_seed` method has been removed and replaced with a
+  `random_seed` argument to the constructor, which is also exposed as an attribute.
+  [#16](https://gitlab.com/datadrivendiscovery/primitive-interfaces/issues/16)
+* Primitives now have a `hyperparams` attribute which returns the
+  hyper-parameters object passed to the constructor.
+  [#14](https://gitlab.com/datadrivendiscovery/primitive-interfaces/issues/14)
+* `Params` and `Hyperparams` are now required to be picklable and copyable.
+  [#3](https://gitlab.com/datadrivendiscovery/primitive-interfaces/issues/3)
+* Primitives are now parametrized by a `Hyperparams` type variable as well.
+  The constructor now receives hyper-parameters as a single instance argument
+  instead of multiple keyword arguments.
+  [#13](https://gitlab.com/datadrivendiscovery/primitive-interfaces/issues/13)
+* `LossFunctionMixin`'s `get_loss_function` method now returns a value from the
+  problem schema `Metric` enumeration.
+* `LossFunctionMixin` now has `loss` and `losses` methods which allow one
+  to ask a primitive to compute loss for a given set of inputs and outputs using
+  the internal loss function the primitive is using.
+  [#17](https://gitlab.com/datadrivendiscovery/primitive-interfaces/issues/17)
+* Added `Params` class.
+* Removed `Graph` class in favor of NetworkX `Graph` class.
+* Added `Metadata` class with subclasses and documented the use of selectors.
+* Added `Hyperparams` class.
+* Added `Dataset` class.
+* "Sequences" have generally been renamed to "containers". Related code is also now under
+  `d3m.container` and not under `d3m.metadata.sequence` anymore.
+* `__metadata__` attribute was renamed to `metadata`.
+* Package renamed from `d3m_types` to `d3m_metadata`.
+* Added schemas for metadata contexts.
+* Problem schema parsing and Python enumerations were added in the
+  `d3m.metadata.problem` module.
+* A standard set of container and base types has been defined.
+* `d3m.index` command-line tool rewritten to support three commands: `search`, `discover`,
+  and `describe`. See details by running `python -m d3m.index -h`.
+* Package now requires Python 3.6.
+* Repository migrated to gitlab.com and made public.
+
+## v2017.10.10
+
+* Made `d3m.index` module with an API to register primitives into a `d3m.primitives` module
+  and search over it.
+* `d3m.index` is also a command-line tool to list available primitives and automatically
+  generate JSON annotations for primitives.
+* Created `d3m.primitives` module which automatically populates itself with primitives
+  using Python entry points.
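+
+As a rough illustration of the primitive interface described in these notes, constructing
+a primitive instance might look like this (a minimal sketch; it assumes a primitive with the
+example Python path used earlier in this changelog is installed, and uses the `get_primitive`
+and `get_hyperparams` helpers described in newer entries above):
+
+```python
+from d3m import index
+
+# Load a primitive class by its Python path (hypothetical example path).
+primitive_class = index.get_primitive('d3m.primitives.classification.random_forest.SKLearn')
+hyperparams_class = primitive_class.metadata.get_hyperparams()
+
+# Hyper-parameters are passed as a single instance; the random seed is a constructor argument.
+primitive = primitive_class(hyperparams=hyperparams_class.defaults(), random_seed=0)
+print(primitive.hyperparams)
+```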
diff --git a/d3m/HOW_TO_RELEASE.md b/d3m/HOW_TO_RELEASE.md
new file mode 100644
index 0000000..e8bfcae
--- /dev/null
+++ b/d3m/HOW_TO_RELEASE.md
@@ -0,0 +1,35 @@
+# How to release a new version
+
+*A cheat sheet.*
+
+* On `devel` branch:
+  * `git pull` to make sure everything is in sync with remote origin.
+  * Change the version in `d3m/__init__.py` to the new version, e.g., `2019.2.12`.
+  * Change `vNEXT` in `HISTORY.md` to the to-be-released version, with `v` prefix.
+  * Commit with message `Bumping version for release.`
+  * `git push`
+  * Wait for CI to run tests successfully.
+* On `master` branch:
+  * `git pull` to make sure everything is in sync with remote origin.
+  * Merge `devel` into `master` branch: `git merge devel`
+  * `git push`
+  * Wait for CI to run tests successfully.
+  * Release a package to PyPI:
+    * `rm -rf dist/`
+    * `python setup.py sdist`
+    * `twine upload dist/*`
+  * Tag with version prefixed with `v`, e.g., for version `2017.9.20`: `git tag v2017.9.20`
+  * `git push` & `git push --tags`
+* On `devel` branch:
+  * `git merge master` to make sure `devel` is always on top of `master`.
+  * Change the version in `d3m/__init__.py` to `devel`.
+  * Add a new empty `vNEXT` version on top of `HISTORY.md`.
+  * Commit with message `Version bump for development.`
+  * `git push`
+* After a release:
+  * Create a new [`core` and `primitives` Docker images](https://gitlab.com/datadrivendiscovery/images) for the release.
+  * Add the new release to the [primitives index repository](https://gitlab.com/datadrivendiscovery/primitives/blob/master/HOW_TO_MANAGE.md).
+
+If there is a need for a patch version to fix a released version on the same day,
+use a `.postX` suffix, like `2017.9.20.post0`. If more than a day has passed, just
+use the new day's version.
diff --git a/d3m/LICENSE.txt b/d3m/LICENSE.txt
new file mode 100644
index 0000000..261eeb9
--- /dev/null
+++ b/d3m/LICENSE.txt
@@ -0,0 +1,201 @@
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+ + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/d3m/MANIFEST.in b/d3m/MANIFEST.in new file mode 100644 index 0000000..3e677d0 --- /dev/null +++ b/d3m/MANIFEST.in @@ -0,0 +1,2 @@ +include README.md +include LICENSE.txt diff --git a/d3m/README.md b/d3m/README.md new file mode 100644 index 0000000..6030203 --- /dev/null +++ b/d3m/README.md @@ -0,0 +1,56 @@ +# Common code for D3M project + +This package provides a core package for D3M project with common code available. +It contains standard interfaces, reference implementations, and utility implementations. + +## Installation + +This package works with Python 3.6 and pip 19+. 
You need to have the following packages installed on the system (for Debian/Ubuntu): + +* `libssl-dev` +* `libcurl4-openssl-dev` +* `libyaml-dev` + +You can install latest stable version from [PyPI](https://pypi.org/): + +``` +$ pip3 install d3m +``` + +To install latest development version: + +``` +$ pip3 install -e git+https://gitlab.com/datadrivendiscovery/d3m.git@devel#egg=d3m +``` + +When cloning a repository, clone it recursively to get also git submodules: + +``` +$ git clone --recursive https://gitlab.com/datadrivendiscovery/d3m.git +``` + +## Changelog + +See [HISTORY.md](./HISTORY.md) for summary of changes to this package. + +## Documentation + +Documentation for the package is available at [https://docs.datadrivendiscovery.org/](https://docs.datadrivendiscovery.org/). + +## Contributing + +See [CODE_STYLE.md](./CODE_STYLE.md) for our coding style and contribution guide. Please ensure any merge requests you open follow this guide. + +## Repository structure + +`master` branch contains latest stable release of the package. +`devel` branch is a staging branch for the next release. + +Releases are [tagged](https://gitlab.com/datadrivendiscovery/d3m/tags). + +## About Data Driven Discovery Program + +DARPA Data Driven Discovery (D3M) Program is researching ways to get machines to build +machine learning pipelines automatically. It is split into three layers: +TA1 (primitives), TA2 (systems which combine primitives automatically into pipelines +and executes them), and TA3 (end-users interfaces). diff --git a/d3m/d3m/__init__.py b/d3m/d3m/__init__.py new file mode 100644 index 0000000..23a8751 --- /dev/null +++ b/d3m/d3m/__init__.py @@ -0,0 +1,8 @@ +__version__ = '2020.5.18' +__description__ = 'Common code for D3M project' +__author__ = 'DARPA D3M Program' + + +from d3m import namespace + +namespace.setup() diff --git a/d3m/d3m/__main__.py b/d3m/d3m/__main__.py new file mode 100644 index 0000000..c1332ce --- /dev/null +++ b/d3m/d3m/__main__.py @@ -0,0 +1,6 @@ +import sys + +from d3m import cli + + +cli.main(sys.argv) diff --git a/d3m/d3m/base/__init__.py b/d3m/d3m/base/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/d3m/d3m/base/primitives.py b/d3m/d3m/base/primitives.py new file mode 100644 index 0000000..144eec8 --- /dev/null +++ b/d3m/d3m/base/primitives.py @@ -0,0 +1,451 @@ +import abc +import typing +import weakref + +import frozendict # type: ignore +import numpy # type: ignore +import pandas # type: ignore + +from d3m import container, exceptions, types +from d3m.base import utils as base_utils +from d3m.metadata import base as metadata_base, hyperparams, params +from d3m.primitive_interfaces import base, generator, transformer + +__all__ = ( + 'FileReaderPrimitiveBase', + 'DatasetSplitPrimitiveBase', + 'TabularSplitPrimitiveBase', +) + +FileReaderInputs = container.DataFrame +FileReaderOutputs = container.DataFrame + + +class FileReaderHyperparams(hyperparams.Hyperparams): + use_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to operate on. 
If any specified column does not contain filenames for supported media types, it is skipped.", + ) + exclude_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='append', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should columns with read files be appended, should they replace original columns, or should only columns with read files be returned?", + ) + add_index_columns = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + + +class FileReaderPrimitiveBase(transformer.TransformerPrimitiveBase[FileReaderInputs, FileReaderOutputs, FileReaderHyperparams]): + """ + A primitive base class for reading files referenced in columns. + + Primitives using this base class must implement: + + * ``_supported_media_types``: A sequence of supported media types such as ``audio/mpeg``, ``image/jpeg``, etc. + * ``_file_structural_type``: Structural type of the file contents after being read such as ``container.ndarray``, ``container.DataFrame``, etc. + * ``_file_semantic_types``: A sequence of semantic types to be applied to the produced column. + * ``metadata``: Primitive Metadata. + * ``_read_fileuri``: The function which describes how to load each file. This function must load one file at the time. + """ + + _supported_media_types: typing.Sequence[str] = () + _file_structural_type: type = None + # If any of these semantic types already exists on a column, then nothing is done. + # If all are missing, the first one is set. + _file_semantic_types: typing.Sequence[str] = () + + def __init__(self, *, hyperparams: FileReaderHyperparams) -> None: + super().__init__(hyperparams=hyperparams) + + # Because same file can be referenced multiple times in multiple rows, we maintain + # a cache of read files so that we do not have to read same files again and again. + self._cache: weakref.WeakValueDictionary[typing.Tuple[int, str], typing.Any] = weakref.WeakValueDictionary() + + def _can_use_column(self, inputs_metadata: metadata_base.DataMetadata, column_index: int) -> bool: + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + if column_metadata['structural_type'] != str: + return False + + semantic_types = column_metadata.get('semantic_types', []) + media_types = set(column_metadata.get('media_types', [])) + + if 'https://metadata.datadrivendiscovery.org/types/FileName' in semantic_types and media_types <= set(self._supported_media_types): + return True + + return False + + def _get_columns(self, inputs_metadata: metadata_base.DataMetadata) -> typing.List[int]: + def can_use_column(column_index: int) -> bool: + return self._can_use_column(inputs_metadata, column_index) + + columns_to_use, columns_not_to_use = base_utils.get_columns_to_use(inputs_metadata, self.hyperparams['use_columns'], self.hyperparams['exclude_columns'], can_use_column) + + # We are OK if no columns ended up being read. + # "base_utils.combine_columns" will throw an error if it cannot work with this. 
+ + if self.hyperparams['use_columns'] and columns_not_to_use: + self.logger.warning("Not all specified columns contain filenames for supported media types. Skipping columns: %(columns)s", { + 'columns': columns_not_to_use, + }) + + return columns_to_use + + def produce(self, *, inputs: FileReaderInputs, timeout: float = None, iterations: int = None) -> base.CallResult[FileReaderOutputs]: + columns_to_use = self._get_columns(inputs.metadata) + + output_columns = [self._produce_column(inputs, column_index) for column_index in columns_to_use] + + outputs = base_utils.combine_columns(inputs, columns_to_use, output_columns, return_result=self.hyperparams['return_result'], add_index_columns=self.hyperparams['add_index_columns']) + + if self.hyperparams['return_result'] == 'append': + outputs.metadata = self._reassign_boundaries(outputs.metadata, columns_to_use) + + return base.CallResult(outputs) + + @abc.abstractmethod + def _read_fileuri(self, metadata: frozendict.FrozenOrderedDict, fileuri: str) -> typing.Any: + pass + + def _read_filename(self, column_index: int, metadata: frozendict.FrozenOrderedDict, filename: str) -> typing.Any: + # TODO: Support handling multiple "location_base_uris". + # "location_base_uris" should be made so that we can just concat with the filename + # ("location_base_uris" end with "/"). + fileuri = metadata['location_base_uris'][0] + filename + + # We do not use the structure where we check if the key exists in the cache and if not set it and then + # return from the cache outside if clause because we are not sure garbage collection might not remove it + # before we get to return. So we directly ask for a reference and return it, or we obtain the file + # and populate the cache. + file = self._cache.get((column_index, fileuri), None) + if file is not None: + return file + + file = self._read_fileuri(metadata, fileuri) + + # We cache the file based on column index as well, because it could be that file is read differently + # based on column metadata, or that resulting metadata is different for a different column. + # We cache only if we can make a weakref. Many Python built-in types like "str" do not support them. + if type(file).__weakrefoffset__: + self._cache[(column_index, fileuri)] = file + + return file + + def _produce_column(self, inputs: FileReaderInputs, column_index: int) -> FileReaderOutputs: + read_files = [self._read_filename(column_index, inputs.metadata.query((row_index, column_index)), value) for row_index, value in enumerate(inputs.iloc[:, column_index])] + + column = container.DataFrame({inputs.columns[column_index]: read_files}, generate_metadata=False) + + column.metadata = self._produce_column_metadata(inputs.metadata, column_index, read_files) + column.metadata = column.metadata.generate(column, compact=True) + + return column + + def _produce_column_metadata( + self, inputs_metadata: metadata_base.DataMetadata, column_index: int, read_files: typing.Sequence[typing.Any], + ) -> metadata_base.DataMetadata: + column_metadata = inputs_metadata.select_columns([column_index]) + column_metadata = column_metadata.update_column(0, { + 'structural_type': self._file_structural_type, + # Clear metadata useful for filename columns. + 'location_base_uris': metadata_base.NO_VALUE, + 'media_types': metadata_base.NO_VALUE, + }) + + # It is not a filename anymore. 
+ column_metadata = column_metadata.remove_semantic_type((metadata_base.ALL_ELEMENTS, 0), 'https://metadata.datadrivendiscovery.org/types/FileName') + + # At least one semantic type from listed semantic types should be set. + semantic_types = column_metadata.query_column(0).get('semantic_types', []) + if not set(semantic_types) & set(self._file_semantic_types): + # Add the first one. + column_metadata = column_metadata.add_semantic_type((metadata_base.ALL_ELEMENTS, 0), self._file_semantic_types[0]) + + for row_index, file in enumerate(read_files): + # Copy metadata only if we have a container type. + if isinstance(file, types.Container): + column_metadata = file.metadata.copy_to(column_metadata, (), (row_index, 0)) + + column_metadata = column_metadata.compact(['name', 'structural_type', 'media_types', 'location_base_uris', 'semantic_types']) + + return column_metadata + + def _reassign_boundaries(self, inputs_metadata: metadata_base.DataMetadata, columns: typing.List[int]) -> metadata_base.DataMetadata: + """ + Moves metadata about boundaries from the filename column to image object column. + """ + + outputs_metadata = inputs_metadata + columns_length = inputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + for column_index in range(columns_length): + column_metadata = outputs_metadata.query_column(column_index) + + if 'boundary_for' not in column_metadata: + continue + + # TODO: Support also "column_name" boundary metadata. + if 'column_index' not in column_metadata['boundary_for']: + continue + + try: + i = columns.index(column_metadata['boundary_for']['column_index']) + except ValueError: + continue + + outputs_metadata = outputs_metadata.update_column(column_index, { + 'boundary_for': { + # We know that "columns" were appended at the end. + 'column_index': columns_length - len(columns) + i, + } + }) + + return outputs_metadata + + +DatasetSplitInputs = container.List +DatasetSplitOutputs = container.List + + +class DatasetSplitPrimitiveBase(generator.GeneratorPrimitiveBase[DatasetSplitOutputs, base.Params, base.Hyperparams]): + """ + A base class for primitives which fit on a ``Dataset`` object to produce splits of that + ``Dataset`` when producing. There are two produce methods: `produce` and `produce_score_data`. + They take as an input a list of non-negative integers which identify which ``Dataset`` + splits to return. + + This class is parameterized using only by two type variables, + ``Params`` and ``Hyperparams``. + """ + + @abc.abstractmethod + def produce(self, *, inputs: DatasetSplitInputs, timeout: float = None, iterations: int = None) -> base.CallResult[DatasetSplitOutputs]: + """ + For each input integer creates a ``Dataset`` split and produces the training ``Dataset`` object. + This ``Dataset`` object should then be used to fit (train) the pipeline. + """ + + @abc.abstractmethod + def produce_score_data(self, *, inputs: DatasetSplitInputs, timeout: float = None, iterations: int = None) -> base.CallResult[DatasetSplitOutputs]: + """ + For each input integer creates a ``Dataset`` split and produces the scoring ``Dataset`` object. + This ``Dataset`` object should then be used to test the pipeline and score the results. + + Output ``Dataset`` objects do not have targets redacted and are not directly suitable for testing. + """ + + @abc.abstractmethod + def set_training_data(self, *, dataset: container.Dataset) -> None: # type: ignore + """ + Sets training data of this primitive, the ``Dataset`` to split. 
+ + Parameters + ---------- + dataset: + The dataset to split. + """ + + +class TabularSplitPrimitiveParams(params.Params): + dataset: typing.Optional[container.Dataset] + main_resource_id: typing.Optional[str] + splits: typing.Optional[typing.List[typing.Tuple[numpy.ndarray, numpy.ndarray]]] + graph: typing.Optional[typing.Dict[str, typing.List[typing.Tuple[str, bool, int, int, typing.Dict]]]] + + +# TODO: Make clear the assumption that both output container type (List) and output Datasets should have metadata. +# Redaction primitive expects that, while there is officially no reason for Datasets +# to really have metadata: metadata is stored available on the input container type, not +# values inside it. +class TabularSplitPrimitiveBase(DatasetSplitPrimitiveBase[TabularSplitPrimitiveParams, base.Hyperparams]): + """ + A primitive base class for splitting tabular datasets. + + Primitives using this base class must implement: + + * ``_get_splits``: The function which describes how to split the tabular dataset. + """ + + def __init__(self, *, hyperparams: base.Hyperparams, random_seed: int = 0) -> None: + super().__init__(hyperparams=hyperparams, random_seed=random_seed) + + # We need random seed multiple times. So we create our own random state we use everywhere. + self._random_state = numpy.random.RandomState(self.random_seed) + self._fitted: bool = False + self._dataset: container.Dataset = None + self._main_resource_id: str = None + self._splits: typing.List[typing.Tuple[numpy.ndarray, numpy.ndarray]] = None + self._graph: typing.Dict[str, typing.List[typing.Tuple[str, bool, int, int, typing.Dict]]] = None + + def produce(self, *, inputs: DatasetSplitInputs, timeout: float = None, iterations: int = None) -> base.CallResult[DatasetSplitOutputs]: + return self._produce(inputs, True) + + def produce_score_data(self, *, inputs: DatasetSplitInputs, timeout: float = None, iterations: int = None) -> base.CallResult[DatasetSplitOutputs]: + return self._produce(inputs, False) + + def set_training_data(self, *, dataset: container.Dataset) -> None: # type: ignore + main_resource_id, main_resource = base_utils.get_tabular_resource(dataset, None, has_hyperparameter=False) + + self._main_resource_id = main_resource_id + self._dataset = dataset + self._fitted = False + + def fit(self, *, timeout: float = None, iterations: int = None) -> base.CallResult[None]: + """ + This function computes everything in advance, including generating the relation graph. + """ + + if self._dataset is None: + raise exceptions.InvalidStateError('Missing training data.') + + if self._fitted: + return base.CallResult(None) + + targets, target_columns = self._get_target_columns(self._dataset, self._main_resource_id) + attributes = self._get_attribute_columns(self._dataset, self._main_resource_id, target_columns) + + # Get splits' indices. + self._splits = self._get_splits(attributes, targets, self._dataset, self._main_resource_id) + + # Graph is the adjacency representation for the relations graph. Make it not be a "defaultdict". 
+ self._graph = dict(self._dataset.get_relations_graph()) + + self._fitted = True + + return base.CallResult(None) + + def fit_multi_produce(self, *, produce_methods: typing.Sequence[str], inputs: DatasetSplitInputs, # type: ignore + dataset: container.Dataset, timeout: float = None, iterations: int = None) -> base.MultiCallResult: + return self._fit_multi_produce(produce_methods=produce_methods, timeout=timeout, iterations=iterations, inputs=inputs, dataset=dataset) # type: ignore + + @abc.abstractmethod + def _get_splits(self, attributes: pandas.DataFrame, targets: pandas.DataFrame, dataset: container.Dataset, main_resource_id: str) -> typing.List[typing.Tuple[numpy.ndarray, numpy.ndarray]]: + pass + + def _get_target_columns(self, dataset: container.Dataset, main_resource_id: str) -> typing.Tuple[pandas.DataFrame, typing.Sequence[int]]: + target_columns = dataset.metadata.list_columns_with_semantic_types(['https://metadata.datadrivendiscovery.org/types/TrueTarget'], at=(main_resource_id,)) + + # It is OK if there are no target columns. "_get_splits" should raise an exception + # if this is a problem for a given split logic. + + return dataset[main_resource_id].iloc[:, list(target_columns)], target_columns + + def _get_attribute_columns(self, dataset: container.Dataset, main_resource_id: str, target_columns: typing.Sequence[int]) -> pandas.DataFrame: + attribute_columns = dataset.metadata.list_columns_with_semantic_types(['https://metadata.datadrivendiscovery.org/types/Attribute'], at=(main_resource_id,)) + + if not attribute_columns: + # No attribute columns with semantic types, let's use all + # non-target columns as attributes then. + all_columns = list(range(dataset.metadata.query((main_resource_id, metadata_base.ALL_ELEMENTS,))['dimension']['length'])) + attribute_columns = [column_index for column_index in all_columns if column_index not in target_columns] + + if not attribute_columns: + raise ValueError("No attribute columns.") + + return dataset[main_resource_id].iloc[:, list(attribute_columns)] + + def _produce(self, inputs: DatasetSplitInputs, is_train: bool) -> base.CallResult[DatasetSplitOutputs]: + """ + This function splits the fitted Dataset. + + Parameters + ---------- + inputs: + A list of 0-based indices which specify which splits to be used as test split in output. + is_train: + Whether we are producing train or test data. + + Returns + ------- + Returns a list of Datasets. + """ + + if not self._fitted: + raise exceptions.PrimitiveNotFittedError("Primitive not fitted.") + + output_datasets = container.List(generate_metadata=True) + + for index in inputs: + train_indices, test_indices = self._splits[index] + + if is_train: + output_dataset = base_utils.sample_rows( + self._dataset, + self._main_resource_id, + set(train_indices), + self._graph, + delete_recursive=self.hyperparams.get('delete_recursive', False), + ) + else: + output_dataset = base_utils.sample_rows( + self._dataset, + self._main_resource_id, + set(test_indices), + self._graph, + delete_recursive=self.hyperparams.get('delete_recursive', False), + ) + + output_datasets.append(output_dataset) + + output_datasets.metadata = metadata_base.DataMetadata({ + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': container.List, + 'dimension': { + 'length': len(output_datasets), + }, + }) + + # We update metadata based on metadata of each dataset. + # TODO: In the future this might be done automatically by generate_metadata. 
+ # See: https://gitlab.com/datadrivendiscovery/d3m/issues/119 + for index, dataset in enumerate(output_datasets): + output_datasets.metadata = dataset.metadata.copy_to(output_datasets.metadata, (), (index,)) + + return base.CallResult(output_datasets) + + def get_params(self) -> TabularSplitPrimitiveParams: + if not self._fitted: + return TabularSplitPrimitiveParams( + dataset=None, + main_resource_id=None, + splits=None, + graph=None, + ) + + return TabularSplitPrimitiveParams( + dataset=self._dataset, + main_resource_id=self._main_resource_id, + splits=self._splits, + graph=self._graph, + ) + + def set_params(self, *, params: TabularSplitPrimitiveParams) -> None: + self._dataset = params['dataset'] + self._main_resource_id = params['main_resource_id'] + self._splits = params['splits'] + self._graph = params['graph'] + self._fitted = all(param is not None for param in params.values()) + + def __getstate__(self) -> dict: + state = super().__getstate__() + + state['random_state'] = self._random_state + + return state + + def __setstate__(self, state: dict) -> None: + super().__setstate__(state) + + self._random_state = state['random_state'] diff --git a/d3m/d3m/base/utils.py b/d3m/d3m/base/utils.py new file mode 100644 index 0000000..f17b782 --- /dev/null +++ b/d3m/d3m/base/utils.py @@ -0,0 +1,342 @@ +import collections +import copy +import logging +import typing + +from d3m import container, exceptions +from d3m.metadata import base as metadata_base + +logger = logging.getLogger(__name__) + + +def get_columns_to_use( + metadata: metadata_base.DataMetadata, use_columns: typing.Sequence[int], exclude_columns: typing.Sequence[int], + can_use_column: typing.Callable, +) -> typing.Tuple[typing.List[int], typing.List[int]]: + """ + A helper function which computes a list of columns to use and a list of columns to ignore + given ``use_columns``, ``exclude_columns``, and a ``can_use_column`` function which should + return ``True`` when column can be used. + """ + + all_columns = list(use_columns) + + # If "use_columns" is provided, this is our view of which columns exist. + if not all_columns: + # Otherwise, we start with all columns. + all_columns = list(range(metadata.query_field((metadata_base.ALL_ELEMENTS,), 'dimension')['length'])) + + # And remove those in "exclude_columns". + all_columns = [column_index for column_index in all_columns if column_index not in exclude_columns] + + # Now we create a list of columns for which "can_use_column" returns "True", + # but also a list of columns for which it does not. The latter can be used + # to determine if there is an error or warning. For example, when using "use_columns", + # ideally, "columns_not_to_use" should be empty or a warning should be made. + # Or, some primitives might require to operate on all columns, so "columns_not_to_use" + # is empty, an error should be raised. + columns_to_use = [] + columns_not_to_use = [] + for column_index in all_columns: + if can_use_column(column_index): + columns_to_use.append(column_index) + else: + columns_not_to_use.append(column_index) + + return columns_to_use, columns_not_to_use + + +def combine_columns( + inputs: container.DataFrame, column_indices: typing.Sequence[int], columns_list: typing.Sequence[container.DataFrame], *, + return_result: str, add_index_columns: bool, +) -> container.DataFrame: + """ + Method which appends existing columns, replaces them, or creates new result from them, based on + ``return_result`` argument, which can be ``append``, ``replace``, or ``new``. 
+ + ``add_index_columns`` controls if when creating a new result, primary index columns should be added + if they are not already among columns. + + ``inputs`` is a DataFrame for which we are appending on replacing columns, or if we are creating new result, + from where a primary index column can be taken. + + ``column_indices`` controls which columns in ``inputs`` were used to create ``columns_list``, + and which columns should be replaced when replacing them. + + ``columns_list`` is a list of DataFrames representing all together new columns. The reason it is a list is + to make it easier to operate per-column when preparing ``columns_list`` and not have to concat them all + together unnecessarily. + + Top-level metadata in ``columns_list`` is ignored, except when creating new result. + In that case top-level metadata from the first element in the list is used. + + When ``column_indices`` columns are being replaced with ``columns_list``, existing metadata in ``column_indices`` + columns is not preserved but replaced with metadata in ``columns_list``. Ideally, metadata for ``columns_list`` + has been constructed by copying source metadata from ``column_indices`` columns and modifying it as + necessary to adapt it to new columns. But ``columns_list`` also can have completely new metadata, if this + is more reasonable, but it should be understood that in this case when replacing ``column_indices`` + columns, any custom additional metadata on those columns will be lost. + + ``column_indices`` and ``columns_list`` do not have to match in number of columns. Columns are first + replaced in order for matching indices and columns. If then there are more ``column_indices`` than + ``columns_list``, additional ``column_indices`` columns are removed. If there are more ``columns_list`` than + ``column_indices`` columns, then additional ``columns_list`` are inserted after the last replaced column. + + If ``column_indices`` is empty, then the replacing behavior is equivalent to appending. + """ + + if return_result == 'append': + outputs = inputs + for columns in columns_list: + outputs = outputs.append_columns(columns) + + elif return_result == 'replace': + if not column_indices: + return combine_columns(inputs, column_indices, columns_list, return_result='append', add_index_columns=add_index_columns) + + # We copy here and disable copying inside "replace_columns" to copy only once. + # We have to copy because "replace_columns" is modifying data in-place. + outputs = copy.copy(inputs) + + columns_replaced = 0 + for columns in columns_list: + columns_length = columns.shape[1] + if columns_replaced < len(column_indices): + # It is OK if the slice of "column_indices" is shorter than "columns", Only those columns + # listed in the slice will be replaced and others appended after the last replaced column. + outputs = outputs.replace_columns(columns, column_indices[columns_replaced:columns_replaced + columns_length], copy=False) + else: + # We insert the rest of columns after the last columns we replaced. We know that "column_indices" + # is non-empty and that the last item of "column_indices" points ot the last column we replaced + # for those listed in "column_indices". We replaced more columns though, so we have to add the + # difference, and then add 1 to insert after the last column. 
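+                # Worked example (illustrative): with column_indices == [2, 3] and a first DataFrame
+                # of 3 columns already processed, columns_replaced == 3 at this point, so the next
+                # DataFrame is inserted at 3 + (3 - 2) + 1 == 5, i.e., just after the extra column
+                # that was appended at position 4 while replacing columns 2 and 3.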
+ outputs = outputs.insert_columns(columns, column_indices[-1] + (columns_replaced - len(column_indices)) + 1) + columns_replaced += columns_length + + if columns_replaced < len(column_indices): + outputs = outputs.remove_columns(column_indices[columns_replaced:len(column_indices)]) + + elif return_result == 'new': + if not any(columns.shape[1] for columns in columns_list): + raise ValueError("No columns produced.") + + outputs = columns_list[0] + for columns in columns_list[1:]: + outputs = outputs.append_columns(columns) + + if add_index_columns: + inputs_index_columns = inputs.metadata.get_index_columns() + outputs_index_columns = outputs.metadata.get_index_columns() + + if inputs_index_columns and not outputs_index_columns: + # Add index columns at the beginning. + outputs = inputs.select_columns(inputs_index_columns).append_columns(outputs, use_right_metadata=True) + + else: + raise exceptions.InvalidArgumentValueError("\"return_result\" has an invalid value: {return_result}".format(return_result=return_result)) + + return outputs + + +def combine_columns_metadata( + inputs: metadata_base.DataMetadata, column_indices: typing.Sequence[int], columns_list: typing.Sequence[metadata_base.DataMetadata], *, + return_result: str, add_index_columns: bool, +) -> metadata_base.DataMetadata: + """ + Analogous to ``combine_columns`` but operates only on metadata. + """ + + if return_result == 'append': + outputs = inputs + for columns in columns_list: + outputs = outputs.append_columns(columns) + + elif return_result == 'replace': + if not column_indices: + return combine_columns_metadata(inputs, column_indices, columns_list, return_result='append', add_index_columns=add_index_columns) + + outputs = inputs + + columns_replaced = 0 + for columns in columns_list: + columns_length = columns.query_field((metadata_base.ALL_ELEMENTS,), 'dimension')['length'] + if columns_replaced < len(column_indices): + # It is OK if the slice of "column_indices" is shorter than "columns", Only those columns + # listed in the slice will be replaced and others appended after the last replaced column. + outputs = outputs.replace_columns(columns, column_indices[columns_replaced:columns_replaced + columns_length]) + else: + # We insert the rest of columns after the last columns we replaced. We know that "column_indices" + # is non-empty and that the last item of "column_indices" points ot the last column we replaced + # for those listed in "column_indices". We replaced more columns though, so we have to add the + # difference, and then add 1 to insert after the last column. + outputs = outputs.insert_columns(columns, column_indices[-1] + (columns_replaced - len(column_indices)) + 1) + columns_replaced += columns_length + + if columns_replaced < len(column_indices): + outputs = outputs.remove_columns(column_indices[columns_replaced:len(column_indices)]) + + elif return_result == 'new': + if not any(columns_metadata.query_field((metadata_base.ALL_ELEMENTS,), 'dimension')['length'] for columns_metadata in columns_list): + raise ValueError("No columns produced.") + + outputs = columns_list[0] + for columns in columns_list[1:]: + outputs = outputs.append_columns(columns) + + if add_index_columns: + inputs_index_columns = inputs.get_index_columns() + outputs_index_columns = outputs.get_index_columns() + + if inputs_index_columns and not outputs_index_columns: + # Add index columns at the beginning. 
+ outputs = inputs.select_columns(inputs_index_columns).append_columns(outputs, use_right_metadata=True) + + else: + raise exceptions.InvalidArgumentValueError("\"return_result\" has an invalid value: {return_result}".format(return_result=return_result)) + + return outputs + + +def get_tabular_resource( + dataset: container.Dataset, resource_id: typing.Optional[str], *, + pick_entry_point: bool = True, pick_one: bool = True, has_hyperparameter: bool = True, +) -> typing.Tuple[str, container.DataFrame]: + if resource_id is None and pick_entry_point: + for dataset_resource_id in dataset.keys(): + if dataset.metadata.has_semantic_type((dataset_resource_id,), 'https://metadata.datadrivendiscovery.org/types/DatasetEntryPoint'): + resource_id = dataset_resource_id + break + + if resource_id is None and pick_one: + tabular_resource_ids = [dataset_resource_id for dataset_resource_id, dataset_resource in dataset.items() if isinstance(dataset_resource, container.DataFrame)] + if len(tabular_resource_ids) == 1: + resource_id = tabular_resource_ids[0] + + if resource_id is None: + if has_hyperparameter: + if pick_entry_point and pick_one: + raise ValueError("A Dataset with multiple tabular resources without an entry point and no resource specified as a hyper-parameter.") + elif pick_entry_point: + raise ValueError("A Dataset without an entry point and no resource specified as a hyper-parameter.") + elif pick_one: + raise ValueError("A Dataset with multiple tabular resources and no resource specified as a hyper-parameter.") + else: + raise ValueError("No resource specified as a hyper-parameter.") + else: + if pick_entry_point and pick_one: + raise ValueError("A Dataset with multiple tabular resources without an entry point.") + elif pick_entry_point: + raise ValueError("A Dataset without an entry point.") + elif pick_one: + raise ValueError("A Dataset with multiple tabular resources.") + else: + raise ValueError("No resource specified.") + + else: + resource = dataset[resource_id] + + if not isinstance(resource, container.DataFrame): + raise TypeError("The Dataset resource '{resource_id}' is not a DataFrame, but '{type}'.".format( + resource_id=resource_id, + type=type(resource), + )) + + return resource_id, resource + + +def get_tabular_resource_metadata( + dataset: metadata_base.DataMetadata, resource_id: typing.Optional[metadata_base.SelectorSegment], *, + pick_entry_point: bool = True, pick_one: bool = True, +) -> metadata_base.SelectorSegment: + if resource_id is None and pick_entry_point: + # This can be also "ALL_ELEMENTS" and it will work out, but we prefer a direct resource ID, + # if available. So we reverse the list, because the first is "ALL_ELEMENTS" if it exists. + for dataset_resource_id in reversed(dataset.get_elements(())): + if dataset.has_semantic_type((dataset_resource_id,), 'https://metadata.datadrivendiscovery.org/types/DatasetEntryPoint'): + resource_id = dataset_resource_id + break + + if resource_id is None and pick_one: + # This can be also "ALL_ELEMENTS" and it will work out, but we prefer a direct resource ID, + # if available. So we reverse the list, because the first is "ALL_ELEMENTS" if it exists. 
+ tabular_resource_ids = [] + for dataset_resource_id in reversed(dataset.get_elements(())): + dataset_resource_type = dataset.query((dataset_resource_id,)).get('structural_type', None) + + if dataset_resource_type is None: + continue + + if issubclass(dataset_resource_type, container.DataFrame): + tabular_resource_ids.append(dataset_resource_id) + + if len(tabular_resource_ids) == 1: + resource_id = tabular_resource_ids[0] + + if resource_id is None: + if pick_entry_point and pick_one: + raise ValueError("A Dataset with multiple tabular resources without an entry point and no DataFrame resource specified as a hyper-parameter.") + elif pick_entry_point: + raise ValueError("A Dataset without an entry point and no DataFrame resource specified as a hyper-parameter.") + elif pick_one: + raise ValueError("A Dataset with multiple tabular resources and no DataFrame resource specified as a hyper-parameter.") + else: + raise ValueError("No DataFrame resource specified as a hyper-parameter.") + + else: + resource_type = dataset.query((resource_id,))['structural_type'] + + if not issubclass(resource_type, container.DataFrame): + raise TypeError("The Dataset resource '{resource_id}' is not a DataFrame, but '{type}'.".format( + resource_id=resource_id, + type=resource_type, + )) + + return resource_id + + +def sample_rows( + dataset: container.Dataset, main_resource_id: str, main_resource_indices_to_keep: typing.Set[int], + relations_graph: typing.Dict[str, typing.List[typing.Tuple[str, bool, int, int, typing.Dict]]], *, + delete_recursive: bool = False, +) -> container.Dataset: + # We store rows as sets, but later on we sort them when we select rows. + row_indices_to_keep_sets: typing.Dict[str, typing.Set[int]] = collections.defaultdict(set) + row_indices_to_keep_sets[main_resource_id] = main_resource_indices_to_keep + + # If "delete_recursive" is set to "False", we do not populate "row_indices_to_keep_sets" + # with other resources, making "select_rows" simply keep them. + if delete_recursive: + # We sort to be deterministic. + for main_resource_row_index in sorted(row_indices_to_keep_sets[main_resource_id]): + queue = [] + queue.append((main_resource_id, [main_resource_row_index])) + while queue: + current_resource_id, current_row_indices = queue.pop(0) + current_resource = dataset[current_resource_id] + + for edge_resource_id, edge_direction, edge_from_index, edge_to_index, custom_state in relations_graph[current_resource_id]: + # All rows from the main resource we want are already there. + # TODO: What to do if we get a reference to the row in the main resource which is not part of this sample? + # This means that probably the sample is invalid. We should not be generating such samples which do not + # preserve reference loops and their consistency. Otherwise it is not really possible to denormalize + # such Dataset properly: a reference is referencing a row in the main resource which does not exist. + if edge_resource_id == main_resource_id: + continue + + edge_resource = dataset[edge_resource_id] + + to_column_values = edge_resource.iloc[:, edge_to_index] + for from_column_value in current_resource.iloc[current_row_indices, edge_from_index]: + # We assume here that "index" corresponds to the default index with row indices. + rows_with_value = edge_resource.index[to_column_values == from_column_value] + # We sort to be deterministic. 
+ new_rows_list = sorted(set(rows_with_value) - row_indices_to_keep_sets[edge_resource_id]) + row_indices_to_keep_sets[edge_resource_id].update(new_rows_list) + queue.append((edge_resource_id, new_rows_list)) + + # We sort indices to get deterministic outputs from sets (which do not have deterministic order). + # We also do not want to change the row order but keep the original row order. + # Sorting by row indices values assure that. + row_indices_to_keep = {resource_id: sorted(indices) for resource_id, indices in row_indices_to_keep_sets.items()} + + return dataset.select_rows(row_indices_to_keep) diff --git a/d3m/d3m/cli.py b/d3m/d3m/cli.py new file mode 100644 index 0000000..a6d43c9 --- /dev/null +++ b/d3m/d3m/cli.py @@ -0,0 +1,1172 @@ +import argparse +import logging +import typing + +from d3m import exceptions, index, runtime, utils, __version__ +from d3m.container import dataset as dataset_module +from d3m.metadata import base as metadata_base, pipeline as pipeline_module, pipeline_run, problem as problem_module + +logger = logging.getLogger(__name__) + + +def pipeline_run_handler( + arguments: argparse.Namespace, parser: argparse.ArgumentParser, +) -> None: + # Call a handler for the command. + arguments.pipeline_run_handler( + arguments, + ) + + +def pipeline_run_configure_parser(parser: argparse.ArgumentParser, *, skip_arguments: typing.Tuple = ()) -> None: + subparsers = parser.add_subparsers(dest='pipeline_run_command', title='commands') + subparsers.required = True # type: ignore + + validate_parser = subparsers.add_parser( + 'validate', help="validate pipeline runs", + description="Validate pipeline runs for use in metalearning database.", + ) + + if 'list' not in skip_arguments: + validate_parser.add_argument( + '-l', '--list', default=False, action='store_true', + help="print path of pipeline run being validated", + ) + if 'continue' not in skip_arguments: + validate_parser.add_argument( + '-c', '--continue', default=False, action='store_true', + help="continue after pipeline run validation error", + ) + if 'pipeline_runs' not in skip_arguments: + validate_parser.add_argument( + 'pipeline_runs', metavar='PIPELINE_RUN', nargs='+', + help="path to a pipeline run", + ) + validate_parser.set_defaults(pipeline_run_handler=pipeline_run.pipeline_run_handler) + + +def dataset_handler( + arguments: argparse.Namespace, parser: argparse.ArgumentParser, *, + dataset_resolver: typing.Callable = None, +) -> None: + # Call a handler for the command. 
+ arguments.dataset_handler( + arguments, + dataset_resolver=dataset_resolver, + ) + + +def dataset_configure_parser(parser: argparse.ArgumentParser, *, skip_arguments: typing.Tuple = ()) -> None: + subparsers = parser.add_subparsers(dest='dataset_command', title='commands') + subparsers.required = True # type: ignore + + describe_parser = subparsers.add_parser( + 'describe', help="generate JSON description of datasets", + description="Generates JSON descriptions of datasets.", + ) + convert_parser = subparsers.add_parser( + 'convert', help="convert datasets", + description="Converts one dataset to another.", + ) + validate_parser = subparsers.add_parser( + 'validate', help="validate datasets", + description="Validate dataset descriptions for use in metalearning database.", + ) + + if 'list' not in skip_arguments: + describe_parser.add_argument( + '-l', '--list', default=False, action='store_true', + help="print path or URI of dataset being described", + ) + if 'indent' not in skip_arguments: + describe_parser.add_argument( + '-i', '--indent', type=int, default=2, action='store', + help="indent JSON by this much, 0 disables indentation, default 2", + ) + if 'sort_keys' not in skip_arguments: + describe_parser.add_argument( + '-s', '--sort-keys', default=False, action='store_true', + help="sort keys in JSON" + ) + if 'print' not in skip_arguments: + describe_parser.add_argument( + '-p', '--print', default=False, action='store_true', + help="pretty print dataset contents instead of printing JSON description", + ) + if 'metadata' not in skip_arguments: + describe_parser.add_argument( + '-m', '--metadata', default=False, action='store_true', + help="pretty print dataset metadata instead of printing JSON description", + ) + if 'lazy' not in skip_arguments: + describe_parser.add_argument( + '-L', '--lazy', default=False, action='store_true', + help="load dataset lazily", + ) + if 'time' not in skip_arguments: + describe_parser.add_argument( + '-t', '--time', default=False, action='store_true', + help="time dataset loading instead of printing JSON description", + ) + if 'continue' not in skip_arguments: + describe_parser.add_argument( + '-c', '--continue', default=False, action='store_true', + help="continue after dataset loading error", + ) + if 'output' not in skip_arguments: + describe_parser.add_argument( + '-o', '--output', type=utils.FileType('w', encoding='utf8'), default='-', action='store', + help="save output to a file, default stdout", + ) + if 'datasets' not in skip_arguments: + describe_parser.add_argument( + 'datasets', metavar='DATASET', nargs='*', + help="path or URI of a dataset", + ) + describe_parser.set_defaults(dataset_handler=dataset_module.describe_handler) + + if 'input_uri' not in skip_arguments: + convert_parser.add_argument( + '-i', '--input', dest='input_uri', + help="input path or URI of a dataset", + ) + if 'output_uri' not in skip_arguments: + convert_parser.add_argument( + '-o', '--output', dest='output_uri', + help="output path or URI of a dataset", + ) + if 'preserve_metadata' not in skip_arguments: + convert_parser.add_argument( + '--no-metadata', default=True, action='store_false', dest='preserve_metadata', + help="do not preserve metadata", + ) + convert_parser.set_defaults(dataset_handler=dataset_module.convert_handler) + + if 'list' not in skip_arguments: + validate_parser.add_argument( + '-l', '--list', default=False, action='store_true', + help="print path or URI of dataset being validated", + ) + if 'continue' not in skip_arguments: + 
validate_parser.add_argument( + '-c', '--continue', default=False, action='store_true', + help="continue after dataset validation error", + ) + if 'datasets' not in skip_arguments: + validate_parser.add_argument( + 'datasets', metavar='DATASET', nargs='+', + help="path to a dataset description", + ) + validate_parser.set_defaults(dataset_handler=pipeline_run.dataset_handler) + + +def problem_handler( + arguments: argparse.Namespace, parser: argparse.ArgumentParser, *, + problem_resolver: typing.Callable = None, +) -> None: + # Call a handler for the command. + arguments.problem_handler( + arguments, + problem_resolver=problem_resolver, + ) + + +def problem_configure_parser(parser: argparse.ArgumentParser, *, skip_arguments: typing.Tuple = ()) -> None: + subparsers = parser.add_subparsers(dest='problem_command', title='commands') + subparsers.required = True # type: ignore + + describe_parser = subparsers.add_parser( + 'describe', help="generate JSON description of problems", + description="Generates JSON descriptions of problems.", + ) + validate_parser = subparsers.add_parser( + 'validate', help="validate problems", + description="Validate problem descriptions for use in metalearning database.", + ) + + if 'list' not in skip_arguments: + describe_parser.add_argument( + '-l', '--list', default=False, action='store_true', + help="print path or URI of problem being described", + ) + if 'indent' not in skip_arguments: + describe_parser.add_argument( + '-i', '--indent', type=int, default=2, action='store', + help="indent JSON by this much, 0 disables indentation, default 2", + ) + if 'sort_keys' not in skip_arguments: + describe_parser.add_argument( + '-s', '--sort-keys', default=False, action='store_true', + help="sort keys in JSON" + ) + if 'print' not in skip_arguments: + describe_parser.add_argument( + '-p', '--print', default=False, action='store_true', + help="pretty print problem description instead of printing JSON", + ) + if 'continue' not in skip_arguments: + describe_parser.add_argument( + '-c', '--continue', default=False, action='store_true', + help="continue after problem parsing error", + ) + if 'output' not in skip_arguments: + describe_parser.add_argument( + '-o', '--output', type=utils.FileType('w', encoding='utf8'), default='-', action='store', + help="save output to a file, default stdout", + ) + if 'no_print' not in skip_arguments: + describe_parser.add_argument( + '--no-print', default=False, action='store_true', + help="do not print JSON", + ) + if 'problems' not in skip_arguments: + describe_parser.add_argument( + 'problems', metavar='PROBLEM', nargs='+', + help="path or URI to a problem description", + ) + describe_parser.set_defaults(problem_handler=problem_module.describe_handler) + + if 'list' not in skip_arguments: + validate_parser.add_argument( + '-l', '--list', default=False, action='store_true', + help="print path or URI of problem being validated", + ) + if 'continue' not in skip_arguments: + validate_parser.add_argument( + '-c', '--continue', default=False, action='store_true', + help="continue after problem validation error", + ) + if 'problems' not in skip_arguments: + validate_parser.add_argument( + 'problems', metavar='PROBLEM', nargs='+', + help="path to a problem description", + ) + validate_parser.set_defaults(problem_handler=pipeline_run.problem_handler) + + +def primitive_handler(arguments: argparse.Namespace, parser: argparse.ArgumentParser) -> None: + # Call a handler for the command. 
+ arguments.primitive_handler(arguments) + + +def primitive_configure_parser(parser: argparse.ArgumentParser, *, skip_arguments: typing.Tuple = ()) -> None: + subparsers = parser.add_subparsers(dest='primitive_command', title='commands') + subparsers.required = True # type: ignore + + search_parser = subparsers.add_parser( + 'search', help="search locally available primitives", + description="Searches locally available primitives. Lists registered Python paths for primitives installed on the system.", + ) + discover_parser = subparsers.add_parser( + 'discover', help="discover primitives available on PyPi", + description="Discovers primitives available on PyPi. Lists package names containing D3M primitives on PyPi.", + ) + describe_parser = subparsers.add_parser( + 'describe', help="generate JSON description of primitives", + description="Generates JSON descriptions of primitives.", + ) + download_parser = subparsers.add_parser( + 'download', help="download files for primitives' volumes", + description="Downloads static files needed by primitives.", + ) + validate_parser = subparsers.add_parser( + 'validate', help="validate primitive descriptions", + description="Validate primitive descriptions for use in metalearning database.", + ) + + if 'prefix' not in skip_arguments: + search_parser.add_argument( + '-p', '--prefix', action='store', + help="primitive path prefix to limit search results to", + ) + search_parser.set_defaults(primitive_handler=index.search_handler) + + if 'index' not in skip_arguments: + discover_parser.add_argument( + '-i', '--index', default=index.DEFAULT_INDEX, action='store', + help=f"base URL of Python Package Index to use, default {index.DEFAULT_INDEX}", + ) + discover_parser.set_defaults(primitive_handler=index.discover_handler) + + if 'list' not in skip_arguments: + describe_parser.add_argument( + '-l', '--list', default=False, action='store_true', + help="print path or ID of primitive being described", + ) + if 'indent' not in skip_arguments: + describe_parser.add_argument( + '-i', '--indent', type=int, default=2, action='store', + help="indent JSON by this much, 0 disables indentation, default 2", + ) + if 'sort_keys' not in skip_arguments: + describe_parser.add_argument( + '-s', '--sort-keys', default=False, action='store_true', + help="sort keys in JSON" + ) + if 'print' not in skip_arguments: + describe_parser.add_argument( + '-p', '--print', default=False, action='store_true', + help="pretty print primitive description instead of printing JSON", + ) + if 'continue' not in skip_arguments: + describe_parser.add_argument( + '-c', '--continue', default=False, action='store_true', + help="continue after primitive loading error", + ) + if 'output' not in skip_arguments: + describe_parser.add_argument( + '-o', '--output', type=utils.FileType('w', encoding='utf8'), default='-', action='store', + help="save output to a file, default stdout", + ) + if 'primitives' not in skip_arguments: + describe_parser.add_argument( + 'primitives', metavar='PRIMITIVE', nargs='+', + help="primitive path od primitive ID", + ) + describe_parser.set_defaults(primitive_handler=index.describe_handler) + + if 'output' not in skip_arguments: + download_parser.add_argument( + '-o', '--output', default=index.DEFAULT_OUTPUT, action='store', + help="path of a directory to download to, default current directory", + ) + if 'redownload' not in skip_arguments: + download_parser.add_argument( + '-r', '--redownload', default=False, action='store_true', + help="redownload files again, even if they 
already exist", + ) + if 'prefix' not in skip_arguments: + download_parser.add_argument( + '-p', '--prefix', action='store', + help="primitive path prefix to limit download to", + ) + download_parser.set_defaults(primitive_handler=index.download_handler) + + if 'list' not in skip_arguments: + validate_parser.add_argument( + '-l', '--list', default=False, action='store_true', + help="print path of primitive description being validated", + ) + if 'continue' not in skip_arguments: + validate_parser.add_argument( + '-c', '--continue', default=False, action='store_true', + help="continue after primitive description validation error", + ) + if 'primitives' not in skip_arguments: + validate_parser.add_argument( + 'primitives', metavar='PRIMITIVE', nargs='+', + help="path to a primitive description", + ) + validate_parser.set_defaults(primitive_handler=pipeline_run.primitive_handler) + + +def pipeline_handler( + arguments: argparse.Namespace, parser: argparse.ArgumentParser, *, + resolver_class: typing.Type[pipeline_module.Resolver] = None, + no_resolver_class: typing.Type[pipeline_module.Resolver] = None, + pipeline_class: typing.Type[pipeline_module.Pipeline] = None, +) -> None: + # Call a handler for the command. + arguments.pipeline_handler( + arguments, + resolver_class=resolver_class, + no_resolver_class=no_resolver_class, + pipeline_class=pipeline_class, + ) + + +def pipeline_configure_parser(parser: argparse.ArgumentParser, *, skip_arguments: typing.Tuple = ()) -> None: + subparsers = parser.add_subparsers(dest='pipeline_command', title='commands') + subparsers.required = True # type: ignore + + describe_parser = subparsers.add_parser( + 'describe', help="generate JSON description of pipelines", + description="Generates JSON descriptions of pipelines.", + ) + validate_parser = subparsers.add_parser( + 'validate', help="validate pipelines", + description="Validate pipeline descriptions for use in metalearning database.", + ) + + if 'no_resolving' not in skip_arguments: + describe_parser.add_argument( + '-n', '--no-resolving', default=False, action='store_true', + help="do not resolve primitives and pipelines, this prevents checking to be fully done though", + ) + if 'check' not in skip_arguments: + describe_parser.add_argument( + '-C', '--no-check', default=True, action='store_false', dest='check', + help="do not check a pipeline, just parse it", + ) + if 'allow_placeholders' not in skip_arguments: + describe_parser.add_argument( + '-a', '--allow-placeholders', default=False, action='store_true', + help="allow placeholders in a pipeline", + ) + if 'standard_pipeline' not in skip_arguments: + describe_parser.add_argument( + '-t', '--not-standard-pipeline', default=True, action='store_false', dest='standard_pipeline', + help="allow a pipeline to not have standard inputs and outputs", + ) + if 'list' not in skip_arguments: + describe_parser.add_argument( + '-l', '--list', default=False, action='store_true', + help="print path of pipeline being described", + ) + if 'indent' not in skip_arguments: + describe_parser.add_argument( + '-i', '--indent', type=int, default=2, action='store', + help="indent JSON by this much, 0 disables indentation, default 2", + ) + if 'sort_keys' not in skip_arguments: + describe_parser.add_argument( + '-s', '--sort-keys', default=False, action='store_true', + help="sort keys in JSON" + ) + if 'print' not in skip_arguments: + describe_parser.add_argument( + '-p', '--print', default=False, action='store_true', + help="pretty print pipeline description instead of 
printing JSON", + ) + if 'continue' not in skip_arguments: + describe_parser.add_argument( + '-c', '--continue', default=False, action='store_true', + help="continue after pipeline parsing error", + ) + if 'set_source_name' not in skip_arguments: + describe_parser.add_argument( + '--set-source-name', action='store', + help="set pipeline's source name", + ) + if 'output' not in skip_arguments: + describe_parser.add_argument( + '-o', '--output', type=utils.FileType('w', encoding='utf8'), default='-', action='store', + help="save output to a file, default stdout", + ) + if 'pipelines' not in skip_arguments: + describe_parser.add_argument( + 'pipelines', metavar='PIPELINE', nargs='+', + help="path to a pipeline (.json, .yml, or .yaml)", + ) + describe_parser.set_defaults(pipeline_handler=pipeline_module.describe_handler) + + if 'list' not in skip_arguments: + validate_parser.add_argument( + '-l', '--list', default=False, action='store_true', + help="print path of pipeline being validated", + ) + if 'continue' not in skip_arguments: + validate_parser.add_argument( + '-c', '--continue', default=False, action='store_true', + help="continue after pipeline validation error", + ) + if 'pipelines' not in skip_arguments: + validate_parser.add_argument( + 'pipelines', metavar='PIPELINE', nargs='*', + help="path to a pipeline (.json, .yml, or .yaml)", + ) + validate_parser.set_defaults(pipeline_handler=pipeline_run.pipeline_handler) + + +def runtime_handler( + arguments: argparse.Namespace, parser: argparse.ArgumentParser, *, + pipeline_resolver: typing.Callable = None, pipeline_run_parser: typing.Callable = None, + dataset_resolver: typing.Callable = None, problem_resolver: typing.Callable = None, +) -> None: + # Dynamically fetch which subparser was used. + subparser = parser._subparsers._group_actions[0].choices[arguments.runtime_command] # type: ignore + + # TODO: These arguments are required, but this is not visible from the usage line. These arguments are marked as optional there. 
+ if getattr(arguments, 'input_run', None) is None: + manual_config = { + 'fit': [ + ('-i/--input', 'inputs'), ('-p/--pipeline', 'pipeline'), + ], + 'produce': [ + ('-t/--test-input', 'test_inputs'), + ], + 'score': [ + ('-t/--test-input', 'test_inputs'), ('-a/--score-input', 'score_inputs'), + ], + 'fit-produce': [ + ('-i/--input', 'inputs'), ('-t/--test-input', 'test_inputs'), ('-p/--pipeline', 'pipeline'), + ], + 'fit-score': [ + ('-i/--input', 'inputs'), ('-t/--test-input', 'test_inputs'), ('-a/--score-input', 'score_inputs'), + ('-p/--pipeline', 'pipeline'), + ], + 'evaluate': [ + ('-i/--input', 'inputs'), ('-p/--pipeline', 'pipeline'), ('-d/--data-pipeline', 'data_pipeline'), + ], + }.get(arguments.runtime_command, []) + + if any(getattr(arguments, dest, None) is None for (name, dest) in manual_config): + subparser.error( + '{command} requires either -u/--input-run or the following arguments: {manual_arguments}'.format( + command=arguments.runtime_command, + manual_arguments=', '.join( + name for (name, dest) in manual_config + ), + ) + ) + else: + manual_config_with_defaults = [ + ('-i/--input', 'inputs', None), ('-t/--test-input', 'test_inputs', None), ('-a/--score-input', 'score_inputs', None), + ('-r/--problem', 'problem', None), ('-p/--pipeline', 'pipeline', None), ('-d/--data-pipeline', 'data_pipeline', None), + ('-n/--random-seed', 'random_seed', 0), ('-e/--metric', 'metrics', None), ('-Y/--scoring-param', 'scoring_params', None), + ('--scoring-random-seed', 'scoring_random_seed', 0), ('-n/--scoring-pipeline', 'scoring_pipeline', runtime.DEFAULT_SCORING_PIPELINE_PATH), + ('-y/--data-param', 'data_params', None), ('--data-split-file', 'data_split_file', None), ('--data-random-seed', 'data_random_seed', 0), + ('--not-standard-pipeline', 'standard_pipeline', True), + ] + if any(getattr(arguments, dest, None) not in [default, None] for (name, dest, default) in manual_config_with_defaults): + subparser.error( + '-u/--input-run cannot be used with the following arguments: {manual_arguments}'.format( + manual_arguments=', '.join( + name for (name, dest, default) in manual_config_with_defaults if getattr(arguments, dest, None) not in [default, None] + ), + ) + ) + + if not getattr(arguments, 'standard_pipeline', True) and getattr(arguments, 'output', None) is not None: + subparser.error("you cannot save predictions for a non-standard pipeline") + + # Call a handler for the command. 
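+ # The concrete handler (runtime.fit_handler, runtime.produce_handler, runtime.score_handler, ...)
+ # was attached to the namespace by "set_defaults" in "runtime_configure_parser" below.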
+ arguments.runtime_handler( + arguments, + pipeline_resolver=pipeline_resolver, + pipeline_run_parser=pipeline_run_parser, + dataset_resolver=dataset_resolver, + problem_resolver=problem_resolver, + ) + + +def runtime_configure_parser(parser: argparse.ArgumentParser, *, skip_arguments: typing.Tuple = ()) -> None: + if 'random_seed' not in skip_arguments: + parser.add_argument( + '-n', '--random-seed', type=int, default=0, action='store', metavar='SEED', + help="random seed to use", + ) + if 'context' not in skip_arguments: + parser.add_argument( + '-x', '--context', choices=[context.name for context in metadata_base.Context], default=metadata_base.Context.TESTING.name, action='store', + help="in which context to run pipelines, default is TESTING", + ) + if 'volumes_dir' not in skip_arguments: + parser.add_argument( + '-v', '--volumes', action='store', dest='volumes_dir', + help="path to a directory with static files required by primitives, in the standard directory structure (as obtained running \"python3 -m d3m index download\")", + ) + if 'datasets_dir' not in skip_arguments: + parser.add_argument( + '-d', '--datasets', action='store', dest='datasets_dir', + help="path to a directory with datasets (and problem descriptions) to resolve IDs in pipeline run files", + ) + if 'scratch_dir' not in skip_arguments: + parser.add_argument( + '-s', '--scratch', action='store', dest='scratch_dir', + help="path to a directory to store any temporary files needed during execution", + ) + if 'worker_id' not in skip_arguments: + parser.add_argument( + '--worker-id', action='store', + help="globally unique identifier for the machine on which the runtime is running", + ) + + subparsers = parser.add_subparsers(dest='runtime_command', title='commands') + subparsers.required = True # type: ignore + + fit_parser = subparsers.add_parser( + 'fit', help="fit a pipeline", + description="Fits a pipeline on train data, resulting in a fitted pipeline. 
Outputs also produced predictions during fitting on train data.", + ) + produce_parser = subparsers.add_parser( + 'produce', help="produce using a fitted pipeline", + description="Produce predictions on test data given a fitted pipeline.", + ) + score_parser = subparsers.add_parser( + 'score', help="produce using a fitted pipeline and score results", + description="Produce predictions on test data given a fitted pipeline and compute scores.", + ) + fit_produce_parser = subparsers.add_parser( + 'fit-produce', help="fit a pipeline and then produce using it", + description="Fit a pipeline on train data and produce predictions on test data.", + ) + fit_score_parser = subparsers.add_parser( + 'fit-score', help="fit a pipeline, produce using it and score results", + description="Fit a pipeline on train data, then produce predictions on test data and compute scores.", + ) + score_predictions_parser = subparsers.add_parser( + 'score-predictions', help="score a predictions file", + description="Compute scores given a file with predictions.", + ) + evaluate_parser = subparsers.add_parser( + 'evaluate', help="evaluate a pipeline", + description="Run pipeline multiple times using an evaluation approach and compute scores for each run.", + ) + + if 'pipeline' not in skip_arguments: + fit_parser.add_argument( + '-p', '--pipeline', action='store', + help="path to a pipeline file (.json, .yml, or .yaml) or pipeline ID", + ) + if 'problem' not in skip_arguments: + fit_parser.add_argument( + '-r', '--problem', action='store', + help="path or URI to a problem description", + ) + if 'inputs' not in skip_arguments: + fit_parser.add_argument( + '-i', '--input', action='append', metavar='INPUT', dest='inputs', + help="path or URI of an input train dataset", + ) + if 'input_run' not in skip_arguments: + fit_parser.add_argument( + '-u', '--input-run', type=utils.FileType('r', encoding='utf8'), action='store', + help="path to a pipeline run file with configuration, use \"-\" for stdin", + ) + if 'save' not in skip_arguments: + fit_parser.add_argument( + '-s', '--save', type=utils.FileType('wb'), action='store', + help="save fitted pipeline to a file, use \"-\" for stdout", + ) + if 'output' not in skip_arguments: + fit_parser.add_argument( + '-o', '--output', type=utils.FileType('w', encoding='utf8'), action='store', + help="save produced predictions during fitting to a file, use \"-\" for stdout", + ) + if 'output_run' not in skip_arguments: + fit_parser.add_argument( + '-O', '--output-run', type=utils.FileType('w', encoding='utf8'), action='store', + help="save pipeline run document to a YAML file, use \"-\" for stdout", + ) + if 'standard_pipeline' not in skip_arguments: + fit_parser.add_argument( + '--not-standard-pipeline', default=True, action='store_false', dest='standard_pipeline', + help="allow a pipeline to not have standard inputs and outputs", + ) + if 'expose_produced_outputs_dir' not in skip_arguments: + fit_parser.add_argument( + '-E', '--expose-produced-outputs', action='store', dest='expose_produced_outputs_dir', + help="save to a directory produced outputs of all primitives from pipeline's fit run", + ) + fit_parser.set_defaults(runtime_handler=runtime.fit_handler) + + if 'fitted_pipeline' not in skip_arguments: + produce_parser.add_argument( + '-f', '--fitted-pipeline', type=utils.FileType('rb'), action='store', required=True, + help="path to a saved fitted pipeline, use \"-\" for stdin", + ) + if 'test_inputs' not in skip_arguments: + produce_parser.add_argument( + '-t', '--test-input', 
action='append', metavar='INPUT', dest='test_inputs', + help="path or URI of an input test dataset", + ) + if 'input_run' not in skip_arguments: + produce_parser.add_argument( + '-u', '--input-run', type=utils.FileType('r', encoding='utf8'), action='store', + help="path to a pipeline run file with configuration, use \"-\" for stdin", + ) + if 'output' not in skip_arguments: + produce_parser.add_argument( + '-o', '--output', type=utils.FileType('w', encoding='utf8'), action='store', + help="save produced predictions to a file, use \"-\" for stdout", + ) + if 'output_run' not in skip_arguments: + produce_parser.add_argument( + '-O', '--output-run', type=utils.FileType('w', encoding='utf8'), action='store', + help="save pipeline run document to a YAML file, use \"-\" for stdout", + ) + if 'expose_produced_outputs_dir' not in skip_arguments: + produce_parser.add_argument( + '-E', '--expose-produced-outputs', action='store', dest='expose_produced_outputs_dir', + help="save to a directory produced outputs of all primitives from pipeline's produce run", + ) + produce_parser.set_defaults(runtime_handler=runtime.produce_handler) + + if 'fitted_pipeline' not in skip_arguments: + score_parser.add_argument( + '-f', '--fitted-pipeline', type=utils.FileType('rb'), action='store', required=True, + help="path to a saved fitted pipeline, use \"-\" for stdin", + ) + if 'scoring_pipeline' not in skip_arguments: + score_parser.add_argument( + '-n', '--scoring-pipeline', default=runtime.DEFAULT_SCORING_PIPELINE_PATH, action='store', + help="path to a scoring pipeline file (.json, .yml, or .yaml) or pipeline ID, default is standard scoring pipeline", + ) + if 'test_inputs' not in skip_arguments: + score_parser.add_argument( + '-t', '--test-input', action='append', metavar='INPUT', dest='test_inputs', + help="path or URI of an input test dataset", + ) + if 'score_inputs' not in skip_arguments: + score_parser.add_argument( + '-a', '--score-input', action='append', metavar='INPUT', dest='score_inputs', + help="path or URI of an input score dataset", + ) + if 'input_run' not in skip_arguments: + score_parser.add_argument( + '-u', '--input-run', type=utils.FileType('r', encoding='utf8'), action='store', + help="path to a pipeline run file with configuration, use \"-\" for stdin", + ) + if 'metrics' not in skip_arguments: + score_parser.add_argument( + '-e', '--metric', choices=[metric.name for metric in problem_module.PerformanceMetric], + action='append', metavar='METRIC', dest='metrics', + help="metric to use, can be specified multiple times, default from problem description", + ) + if 'scoring_params' not in skip_arguments: + score_parser.add_argument( + '-Y', '--scoring-param', nargs=2, action='append', metavar=('NAME', 'VALUE'), dest='scoring_params', + help="hyper-parameter name and its value for scoring pipeline, can be specified multiple times, value should be JSON-serialized", + ) + if 'output' not in skip_arguments: + score_parser.add_argument( + '-o', '--output', type=utils.FileType('w', encoding='utf8'), action='store', + help="save produced predictions to a file, use \"-\" for stdout", + ) + if 'scores' not in skip_arguments: + score_parser.add_argument( + '-c', '--scores', type=utils.FileType('w', encoding='utf8'), default='-', action='store', + help="save scores to a file, default stdout", + ) + if 'output_run' not in skip_arguments: + score_parser.add_argument( + '-O', '--output-run', type=utils.FileType('w', encoding='utf8'), action='store', + help="save pipeline run document to a YAML file, use 
\"-\" for stdout", + ) + if 'expose_produced_outputs_dir' not in skip_arguments: + score_parser.add_argument( + '-E', '--expose-produced-outputs', action='store', dest='expose_produced_outputs_dir', + help="save to a directory produced outputs of all primitives from pipeline's produce run", + ) + score_parser.set_defaults(runtime_handler=runtime.score_handler) + + if 'pipeline' not in skip_arguments: + fit_produce_parser.add_argument( + '-p', '--pipeline', action='store', + help="path to a pipeline file (.json, .yml, or .yaml) or pipeline ID", + ) + if 'problem' not in skip_arguments: + fit_produce_parser.add_argument( + '-r', '--problem', action='store', + help="path or URI to a problem description", + ) + if 'inputs' not in skip_arguments: + fit_produce_parser.add_argument( + '-i', '--input', action='append', metavar='INPUT', dest='inputs', + help="path or URI of an input train dataset", + ) + if 'test_inputs' not in skip_arguments: + fit_produce_parser.add_argument( + '-t', '--test-input', action='append', metavar='INPUT', dest='test_inputs', + help="path or URI of an input test dataset", + ) + if 'input_run' not in skip_arguments: + fit_produce_parser.add_argument( + '-u', '--input-run', type=utils.FileType('r', encoding='utf8'), action='store', + help="path to a pipeline run file with configuration, use \"-\" for stdin", + ) + if 'save' not in skip_arguments: + fit_produce_parser.add_argument( + '-s', '--save', type=utils.FileType('wb'), action='store', + help="save fitted pipeline to a file, use \"-\" for stdout", + ) + if 'output' not in skip_arguments: + fit_produce_parser.add_argument( + '-o', '--output', type=utils.FileType('w', encoding='utf8'), action='store', + help="save produced predictions to a file, use \"-\" for stdout", + ) + if 'output_run' not in skip_arguments: + fit_produce_parser.add_argument( + '-O', '--output-run', type=utils.FileType('w', encoding='utf8'), action='store', + help="save pipeline run documents to a YAML file, use \"-\" for stdout", + ) + if 'standard_pipeline' not in skip_arguments: + fit_produce_parser.add_argument( + '--not-standard-pipeline', default=True, action='store_false', dest='standard_pipeline', + help="allow a pipeline to not have standard inputs and outputs", + ) + if 'expose_produced_outputs_dir' not in skip_arguments: + fit_produce_parser.add_argument( + '-E', '--expose-produced-outputs', action='store', dest='expose_produced_outputs_dir', + help="save to a directory produced outputs of all primitives from pipeline's produce run", + ) + fit_produce_parser.set_defaults(runtime_handler=runtime.fit_produce_handler) + + if 'pipeline' not in skip_arguments: + fit_score_parser.add_argument( + '-p', '--pipeline', action='store', + help="path to a pipeline file (.json, .yml, or .yaml) or pipeline ID", + ) + if 'scoring_pipeline' not in skip_arguments: + fit_score_parser.add_argument( + '-n', '--scoring-pipeline', default=runtime.DEFAULT_SCORING_PIPELINE_PATH, action='store', + help="path to a scoring pipeline file (.json, .yml, or .yaml) or pipeline ID, default is standard scoring pipeline", + ) + if 'problem' not in skip_arguments: + fit_score_parser.add_argument( + '-r', '--problem', action='store', + help="path or URI to a problem description", + ) + if 'inputs' not in skip_arguments: + fit_score_parser.add_argument( + '-i', '--input', action='append', metavar='INPUT', dest='inputs', + help="path or URI of an input train dataset", + ) + if 'test_inputs' not in skip_arguments: + fit_score_parser.add_argument( + '-t', '--test-input', 
action='append', metavar='INPUT', dest='test_inputs', + help="path or URI of an input test dataset", + ) + if 'score_inputs' not in skip_arguments: + fit_score_parser.add_argument( + '-a', '--score-input', action='append', metavar='INPUT', dest='score_inputs', + help="path or URI of an input score dataset", + ) + if 'input_run' not in skip_arguments: + fit_score_parser.add_argument( + '-u', '--input-run', type=utils.FileType('r', encoding='utf8'), action='store', + help="path to a pipeline run file with configuration, use \"-\" for stdin", + ) + if 'metrics' not in skip_arguments: + fit_score_parser.add_argument( + '-e', '--metric', choices=[metric.name for metric in problem_module.PerformanceMetric], + action='append', metavar='METRIC', dest='metrics', + help="metric to use, can be specified multiple times, default from problem description", + ) + if 'scoring_params' not in skip_arguments: + fit_score_parser.add_argument( + '-Y', '--scoring-param', nargs=2, action='append', metavar=('NAME', 'VALUE'), dest='scoring_params', + help="hyper-parameter name and its value for scoring pipeline, can be specified multiple times, value should be JSON-serialized", + ) + if 'save' not in skip_arguments: + fit_score_parser.add_argument( + '-s', '--save', type=utils.FileType('wb'), action='store', + help="save fitted pipeline to a file, use \"-\" for stdout", + ) + if 'output' not in skip_arguments: + fit_score_parser.add_argument( + '-o', '--output', type=utils.FileType('w', encoding='utf8'), action='store', + help="save produced predictions to a file, use \"-\" for stdout", + ) + if 'scores' not in skip_arguments: + fit_score_parser.add_argument( + '-c', '--scores', type=utils.FileType('w', encoding='utf8'), default='-', action='store', + help="save scores to a file, default stdout", + ) + if 'output_run' not in skip_arguments: + fit_score_parser.add_argument( + '-O', '--output-run', type=utils.FileType('w', encoding='utf8'), action='store', + help="save pipeline run documents to a YAML file, use \"-\" for stdout", + ) + if 'scoring_random_seed' not in skip_arguments: + fit_score_parser.add_argument( + '--scoring-random-seed', type=int, action='store', default=0, + help="random seed to use for scoring", + ) + if 'expose_produced_outputs_dir' not in skip_arguments: + fit_score_parser.add_argument( + '-E', '--expose-produced-outputs', action='store', dest='expose_produced_outputs_dir', + help="save to a directory produced outputs of all primitives from pipeline's produce run", + ) + fit_score_parser.set_defaults(runtime_handler=runtime.fit_score_handler) + + if 'scoring_pipeline' not in skip_arguments: + score_predictions_parser.add_argument( + '-n', '--scoring-pipeline', default=runtime.DEFAULT_SCORING_PIPELINE_PATH, action='store', + help="path to a scoring pipeline file (.json, .yml, or .yaml) or pipeline ID, default is standard scoring pipeline", + ) + if 'problem' not in skip_arguments: + score_predictions_parser.add_argument( + '-r', '--problem', action='store', + help="path or URI to a problem description", + ) + if 'predictions' not in skip_arguments: + score_predictions_parser.add_argument( + '-p', '--predictions', type=utils.FileType('r', encoding='utf8'), action='store', required=True, + help="path to a predictions file, use \"-\" for stdin", + ) + if 'score_inputs' not in skip_arguments: + score_predictions_parser.add_argument( + '-a', '--score-input', action='append', metavar='INPUT', dest='score_inputs', required=True, + help="path or URI of an input score dataset", + ) + if 'metrics' not 
in skip_arguments: + score_predictions_parser.add_argument( + '-e', '--metric', choices=[metric.name for metric in problem_module.PerformanceMetric], + action='append', metavar='METRIC', dest='metrics', + help="metric to use, can be specified multiple times, default from problem description", + ) + if 'scoring_params' not in skip_arguments: + score_predictions_parser.add_argument( + '-Y', '--scoring-param', nargs=2, action='append', metavar=('NAME', 'VALUE'), dest='scoring_params', + help="hyper-parameter name and its value for scoring pipeline, can be specified multiple times, value should be JSON-serialized", + ) + if 'scores' not in skip_arguments: + score_predictions_parser.add_argument( + '-c', '--scores', type=utils.FileType('w', encoding='utf8'), default='-', action='store', + help="save scores to a file, default stdout", + ) + if 'scoring_random_seed' not in skip_arguments: + score_predictions_parser.add_argument( + '--scoring-random-seed', type=int, action='store', default=0, + help="random seed to use for scoring", + ) + if 'predictions_random_seed' not in skip_arguments: + score_predictions_parser.add_argument( + '--predictions-random-seed', type=int, action='store', default=None, + help="random seed used for predictions", + ) + score_predictions_parser.set_defaults(runtime_handler=runtime.score_predictions_handler) + + if 'pipeline' not in skip_arguments: + evaluate_parser.add_argument( + '-p', '--pipeline', action='store', + help="path to a pipeline file (.json, .yml, or .yaml) or pipeline ID" + ) + if 'data_pipeline' not in skip_arguments: + evaluate_parser.add_argument( + '-d', '--data-pipeline', action='store', + help="path to a data preparation pipeline file (.json, .yml, or .yaml) or pipeline ID", + ) + if 'scoring_pipeline' not in skip_arguments: + evaluate_parser.add_argument( + '-n', '--scoring-pipeline', default=runtime.DEFAULT_SCORING_PIPELINE_PATH, action='store', + help="path to a scoring pipeline file (.json, .yml, or .yaml) or pipeline ID, default is standard scoring pipeline", + ) + if 'problem' not in skip_arguments: + evaluate_parser.add_argument( + '-r', '--problem', action='store', + help="path or URI to a problem description", + ) + if 'inputs' not in skip_arguments: + evaluate_parser.add_argument( + '-i', '--input', action='append', metavar='INPUT', dest='inputs', + help="path or URI of an input full dataset", + ) + if 'input_run' not in skip_arguments: + evaluate_parser.add_argument( + '-u', '--input-run', type=utils.FileType('r', encoding='utf8'), action='store', + help="path to a pipeline run file with configuration, use \"-\" for stdin", + ) + if 'data_params' not in skip_arguments: + evaluate_parser.add_argument( + '-y', '--data-param', nargs=2, action='append', metavar=('NAME', 'VALUE'), dest='data_params', + help="hyper-parameter name and its value for data preparation pipeline, can be specified multiple times, value should be JSON-serialized", + ) + if 'data_split_file' not in skip_arguments: + evaluate_parser.add_argument( + '--data-split-file', type=utils.FileType('r', encoding='utf8'), action='store', + help="reads the split file and populates \"primary_index_values\" hyper-parameter for data preparation pipeline with " + "values from the \"d3mIndex\" column corresponding to the test data, use \"-\" for stdin", + ) + if 'metrics' not in skip_arguments: + evaluate_parser.add_argument( + '-e', '--metric', choices=[metric.name for metric in problem_module.PerformanceMetric], action='append', metavar='METRIC', dest='metrics', + help="metric to use, 
can be specified multiple times, default from problem description", + ) + if 'scoring_params' not in skip_arguments: + evaluate_parser.add_argument( + '-Y', '--scoring-param', nargs=2, action='append', metavar=('NAME', 'VALUE'), dest='scoring_params', + help="hyper-parameter name and its value for scoring pipeline, can be specified multiple times, value should be JSON-serialized", + ) + if 'scores' not in skip_arguments: + evaluate_parser.add_argument( + '-c', '--scores', type=utils.FileType('w', encoding='utf8'), default='-', action='store', + help="save scores to a file, default stdout", + ) + if 'output_run' not in skip_arguments: + evaluate_parser.add_argument( + '-O', '--output-run', type=utils.FileType('w', encoding='utf8'), action='store', + help="save pipeline run documents to a YAML file, use \"-\" for stdin", + ) + if 'data_random_seed' not in skip_arguments: + evaluate_parser.add_argument( + '--data-random-seed', type=int, action='store', default=0, + help="random seed to use for data preparation", + ) + if 'scoring_random_seed' not in skip_arguments: + evaluate_parser.add_argument( + '--scoring-random-seed', type=int, action='store', default=0, + help="random seed to use for scoring", + ) + evaluate_parser.set_defaults(runtime_handler=runtime.evaluate_handler) + + +def handler( + arguments: argparse.Namespace, parser: argparse.ArgumentParser, *, + pipeline_resolver: typing.Callable = None, pipeline_run_parser: typing.Callable = None, + dataset_resolver: typing.Callable = None, problem_resolver: typing.Callable = None, + resolver_class: typing.Type[pipeline_module.Resolver] = None, + no_resolver_class: typing.Type[pipeline_module.Resolver] = None, + pipeline_class: typing.Type[pipeline_module.Pipeline] = None, +) -> None: + # Dynamically fetch which subparser was used. + subparser = parser._subparsers._group_actions[0].choices[arguments.d3m_command] # type: ignore + + if arguments.d3m_command == 'primitive': + primitive_handler( + arguments, + subparser, + ) + + elif arguments.d3m_command == 'index': + logger.warning("\"index\" CLI command is deprecated. Use \"primitive\" CLI command instead.") + + primitive_handler( + arguments, + subparser, + ) + + elif arguments.d3m_command == 'pipeline': + pipeline_handler( + arguments, + subparser, + resolver_class=resolver_class, + no_resolver_class=no_resolver_class, + pipeline_class=pipeline_class, + ) + + elif arguments.d3m_command == 'problem': + problem_handler( + arguments, + subparser, + problem_resolver=problem_resolver, + ) + + elif arguments.d3m_command == 'dataset': + dataset_handler( + arguments, + subparser, + dataset_resolver=dataset_resolver, + ) + + elif arguments.d3m_command == 'pipeline-run': + pipeline_run_handler( + arguments, + subparser, + ) + + elif arguments.d3m_command == 'runtime': + runtime_handler( + arguments, + subparser, + pipeline_resolver=pipeline_resolver, + pipeline_run_parser=pipeline_run_parser, + dataset_resolver=dataset_resolver, + problem_resolver=problem_resolver, + ) + + else: + raise exceptions.InvalidStateError("Cannot find a suitable command handler.") + + +# A fixed parser which correctly shows the error message for unknown arguments to the sub-command. +# See: https://gitlab.com/datadrivendiscovery/d3m/-/issues/409 +class _ArgumentParser(argparse.ArgumentParser): + # "parse_known_args" is made to behave exactly like "parse_args". 
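+ # Sub-command parsers are invoked through "parse_known_args", so erroring out on any leftover
+ # arguments here makes typos in sub-command options fail with that sub-command's usage message
+ # instead of being silently ignored.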
+ def parse_known_args(self, args: typing.Sequence[str] = None, namespace: argparse.Namespace = None) -> typing.Tuple[argparse.Namespace, typing.List[str]]: + namespace, argv = super().parse_known_args(args, namespace) + if argv: + msg = argparse._('unrecognized arguments: %s') # type: ignore + self.error(msg % ' '.join(argv)) + return namespace, argv + + +def configure_parser(parser: argparse.ArgumentParser, *, skip_arguments: typing.Tuple = ()) -> None: + if 'pipeline_search_paths' not in skip_arguments: + parser.add_argument( + '-p', '--pipelines-path', action='append', metavar='PATH', dest='pipeline_search_paths', + help="path to a directory with pipelines to resolve from (.json, .yml, or .yaml), " + "can be specified multiple times, has priority over PIPELINES_PATH environment variable", + ) + if 'logging_level' not in skip_arguments: + parser.add_argument( + '-l', '--logging-level', default='info', action='store', + choices=['debug', 'info', 'warning', 'error', 'critical'], + help="logging level to use for the console", + ) + if 'compute_digest' not in skip_arguments: + parser.add_argument( + '--compute-digest', choices=[compute_digest.name for compute_digest in dataset_module.ComputeDigest], + default=dataset_module.ComputeDigest.ONLY_IF_MISSING.name, action='store', + help="when loading datasets, when to compute their digests, default is ONLY_IF_MISSING", + ) + if 'strict_resolving' not in skip_arguments: + parser.add_argument( + '--strict-resolving', default=False, action='store_true', + help="fail resolving if a resolved pipeline, primitive, or dataset, does not fully match specified reference", + ) + if 'strict_digest' not in skip_arguments: + parser.add_argument( + '--strict-digest', default=False, action='store_true', + help="when loading datasets, pipelines, primitives, or problem descriptions, if computed digest does not match the one provided in metadata, raise an exception?" + ) + if 'version' not in skip_arguments: + parser.add_argument( + '-V', '--version', action='version', version=str(__version__), + help="print d3m package version and exit", + ) + + subparsers = parser.add_subparsers(dest='d3m_command', title='commands', parser_class=_ArgumentParser) + subparsers.required = True # type: ignore + + primitive_parser = subparsers.add_parser( + 'primitive', help="describe, validate, explore, and manage primitives", + description="Describe, explore, and manage primitives.", + ) + # Legacy command name. Deprecated. We do not use "aliases" argument to "add_parser" + # because we want this command to be hidden. 
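+ # Registering the same parser object under the legacy name keeps "python3 -m d3m index ..."
+ # working, while the metavar set at the end of this function leaves "index" out of the help output.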
+ subparsers._name_parser_map['index'] = primitive_parser # type: ignore + + primitive_configure_parser(primitive_parser, skip_arguments=skip_arguments) + + pipeline_parser = subparsers.add_parser( + 'pipeline', help="describe and validate pipelines", + description="Describe and validate pipelines.", + ) + + pipeline_configure_parser(pipeline_parser, skip_arguments=skip_arguments) + + problem_parser = subparsers.add_parser( + 'problem', help="describe and validate problems", + description="Describe and validate problems.", + ) + + problem_configure_parser(problem_parser, skip_arguments=skip_arguments) + + dataset_parser = subparsers.add_parser( + 'dataset', help="describe and validate datasets", + description="Describe and validate datasets.", + ) + + dataset_configure_parser(dataset_parser, skip_arguments=skip_arguments) + + pipeline_run_parser = subparsers.add_parser( + 'pipeline-run', help="validate pipeline runs", + description="Validate pipeline runs.", + ) + + pipeline_run_configure_parser(pipeline_run_parser, skip_arguments=skip_arguments) + + runtime_parser = subparsers.add_parser( + 'runtime', help="run D3M pipelines", + description="Run D3M pipelines.", + ) + + runtime_configure_parser(runtime_parser, skip_arguments=skip_arguments) + + # We set metavar at the end, when we know all subparsers. We want + # "index" command to be hidden because it is deprecated. + subparsers.metavar = '{' + ','.join(name for name in subparsers._name_parser_map.keys() if name != 'index') + '}' # type: ignore + + +def main(argv: typing.Sequence) -> None: + parser = argparse.ArgumentParser(prog='d3m', description="Run a D3M core package command.") + configure_parser(parser) + + arguments = parser.parse_args(argv[1:]) + + logging.basicConfig(level=arguments.logging_level.upper()) + + handler(arguments, parser) diff --git a/d3m/d3m/container/__init__.py b/d3m/d3m/container/__init__.py new file mode 100644 index 0000000..38b49ac --- /dev/null +++ b/d3m/d3m/container/__init__.py @@ -0,0 +1,8 @@ +""" +This module provides various container types one can use to pass values between primitives. +""" + +from .dataset import * +from .pandas import * +from .numpy import * +from .list import * diff --git a/d3m/d3m/container/dataset.py b/d3m/d3m/container/dataset.py new file mode 100644 index 0000000..7cdd22c --- /dev/null +++ b/d3m/d3m/container/dataset.py @@ -0,0 +1,3297 @@ +import abc +import argparse +import collections +import datetime +import errno +import filecmp +import hashlib +import io +import itertools +import json +import logging +import math +import os +import os.path +import pprint +import re +import shutil +import sys +import time +import traceback +import typing +from urllib import error as urllib_error, parse as url_parse + +import dateutil.parser # type: ignore +import frozendict # type: ignore +import numpy # type: ignore +import openml # type: ignore +import pandas # type: ignore +from pandas.io import common as pandas_io_common # type: ignore +from sklearn import datasets # type: ignore + +from . 
import pandas as container_pandas
+from d3m import deprecate, exceptions, utils
+from d3m.metadata import base as metadata_base
+
+# See: https://gitlab.com/datadrivendiscovery/d3m/issues/66
+try:
+    from pyarrow import lib as pyarrow_lib  # type: ignore
+except ModuleNotFoundError:
+    pyarrow_lib = None
+
+__all__ = ('Dataset', 'ComputeDigest')
+
+logger = logging.getLogger(__name__)
+
+UNITS = {
+    'B': 1, 'KB': 10**3, 'MB': 10**6, 'GB': 10**9, 'TB': 10**12, 'PB': 10**15,
+    'KiB': 2**10, 'MiB': 2**20, 'GiB': 2**30, 'TiB': 2**40, 'PiB': 2**50,
+}
+SIZE_TO_UNITS = {
+    1: 'B', 3: 'KB', 6: 'MB',
+    9: 'GB', 12: 'TB', 15: 'PB',
+}
+
+D3M_ROLE_CONSTANTS_TO_SEMANTIC_TYPES = {
+    'index': 'https://metadata.datadrivendiscovery.org/types/PrimaryKey',
+    'multiIndex': 'https://metadata.datadrivendiscovery.org/types/PrimaryMultiKey',
+    'key': 'https://metadata.datadrivendiscovery.org/types/UniqueKey',
+    'attribute': 'https://metadata.datadrivendiscovery.org/types/Attribute',
+    'suggestedTarget': 'https://metadata.datadrivendiscovery.org/types/SuggestedTarget',
+    'timeIndicator': 'https://metadata.datadrivendiscovery.org/types/Time',
+    'locationIndicator': 'https://metadata.datadrivendiscovery.org/types/Location',
+    'boundaryIndicator': 'https://metadata.datadrivendiscovery.org/types/Boundary',
+    'interval': 'https://metadata.datadrivendiscovery.org/types/Interval',
+    'instanceWeight': 'https://metadata.datadrivendiscovery.org/types/InstanceWeight',
+    'boundingPolygon': 'https://metadata.datadrivendiscovery.org/types/BoundingPolygon',
+    'suggestedPrivilegedData': 'https://metadata.datadrivendiscovery.org/types/SuggestedPrivilegedData',
+    'suggestedGroupingKey': 'https://metadata.datadrivendiscovery.org/types/SuggestedGroupingKey',
+    'edgeSource': 'https://metadata.datadrivendiscovery.org/types/EdgeSource',
+    'directedEdgeSource': 'https://metadata.datadrivendiscovery.org/types/DirectedEdgeSource',
+    'undirectedEdgeSource': 'https://metadata.datadrivendiscovery.org/types/UndirectedEdgeSource',
+    'simpleEdgeSource': 'https://metadata.datadrivendiscovery.org/types/SimpleEdgeSource',
+    'multiEdgeSource': 'https://metadata.datadrivendiscovery.org/types/MultiEdgeSource',
+    'edgeTarget': 'https://metadata.datadrivendiscovery.org/types/EdgeTarget',
+    'directedEdgeTarget': 'https://metadata.datadrivendiscovery.org/types/DirectedEdgeTarget',
+    'undirectedEdgeTarget': 'https://metadata.datadrivendiscovery.org/types/UndirectedEdgeTarget',
+    'simpleEdgeTarget': 'https://metadata.datadrivendiscovery.org/types/SimpleEdgeTarget',
+    'multiEdgeTarget': 'https://metadata.datadrivendiscovery.org/types/MultiEdgeTarget',
+}
+
+D3M_RESOURCE_TYPE_CONSTANTS_TO_SEMANTIC_TYPES = {
+    # File collections.
+    'image': 'http://schema.org/ImageObject',
+    'video': 'http://schema.org/VideoObject',
+    'audio': 'http://schema.org/AudioObject',
+    'text': 'http://schema.org/Text',
+    'speech': 'https://metadata.datadrivendiscovery.org/types/Speech',
+    'timeseries': 'https://metadata.datadrivendiscovery.org/types/Timeseries',
+    'raw': 'https://metadata.datadrivendiscovery.org/types/UnspecifiedStructure',
+    # Other.
+ 'graph': 'https://metadata.datadrivendiscovery.org/types/Graph', + 'edgeList': 'https://metadata.datadrivendiscovery.org/types/EdgeList', + 'table': 'https://metadata.datadrivendiscovery.org/types/Table', +} + +D3M_COLUMN_TYPE_CONSTANTS_TO_SEMANTIC_TYPES = { + 'boolean': 'http://schema.org/Boolean', + 'integer': 'http://schema.org/Integer', + 'real': 'http://schema.org/Float', + 'string': 'http://schema.org/Text', + 'categorical': 'https://metadata.datadrivendiscovery.org/types/CategoricalData', + 'dateTime': 'http://schema.org/DateTime', + 'realVector': 'https://metadata.datadrivendiscovery.org/types/FloatVector', + 'json': 'https://metadata.datadrivendiscovery.org/types/JSON', + 'geojson': 'https://metadata.datadrivendiscovery.org/types/GeoJSON', + 'unknown': 'https://metadata.datadrivendiscovery.org/types/UnknownType', +} + +SEMANTIC_TYPES_TO_D3M_RESOURCE_TYPES = {v: k for k, v in D3M_RESOURCE_TYPE_CONSTANTS_TO_SEMANTIC_TYPES.items()} +SEMANTIC_TYPES_TO_D3M_ROLES = {v: k for k, v in D3M_ROLE_CONSTANTS_TO_SEMANTIC_TYPES.items()} +SEMANTIC_TYPES_TO_D3M_COLUMN_TYPES = {v: k for k, v in D3M_COLUMN_TYPE_CONSTANTS_TO_SEMANTIC_TYPES.items()} + +D3M_TO_DATASET_FIELDS: typing.Dict[typing.Sequence[str], typing.Tuple[typing.Sequence[str], bool]] = { + ('about', 'datasetID'): (('id',), True), + ('about', 'datasetName'): (('name',), True), + ('about', 'description'): (('description',), False), + ('about', 'datasetVersion'): (('version',), False), + ('about', 'digest'): (('digest',), False), + ('about', 'approximateSize'): (('approximate_stored_size',), False), + ('about', 'citation'): (('source', 'citation'), False), + ('about', 'license'): (('source', 'license'), False), + ('about', 'redacted'): (('source', 'redacted'), False), + ('about', 'source'): (('source', 'name'), False), + ('about', 'citation'): (('source', 'citation'), False), + ('about', 'humanSubjectsResearch'): (('source', 'human_subjects_research'), False), +} + +INTERVAL_SEMANTIC_TYPES = ( + 'https://metadata.datadrivendiscovery.org/types/IntervalStart', + 'https://metadata.datadrivendiscovery.org/types/IntervalEnd', +) + +BOUNDARY_SEMANTIC_TYPES = ( + 'https://metadata.datadrivendiscovery.org/types/Interval', + 'https://metadata.datadrivendiscovery.org/types/BoundingPolygon', +) + INTERVAL_SEMANTIC_TYPES + +# A map between legacy (before v4.0.0) D3M resource formats and media types. +# Now all resource formats are media types. +MEDIA_TYPES = { + 'audio/aiff': 'audio/aiff', + 'audio/flac': 'audio/flac', + 'audio/ogg': 'audio/ogg', + 'audio/wav': 'audio/wav', + 'audio/mpeg': 'audio/mpeg', + 'image/jpeg': 'image/jpeg', + 'image/png': 'image/png', + 'video/mp4': 'video/mp4', + 'video/avi': 'video/avi', + 'text/csv': 'text/csv', + 'text/csv+gzip': 'text/csv+gzip', + 'text/plain': 'text/plain', + # Legacy (before v4.0.0) resource type for GML files. + # In "MEDIA_TYPES_REVERSE" it is not present on purpose. + 'text/gml': 'text/vnd.gml', + 'text/vnd.gml': 'text/vnd.gml', +} +MEDIA_TYPES_REVERSE = {v: k for k, v in MEDIA_TYPES.items()} + +# A legacy (before v4.0.0) map between D3M file extensions and media types. +# Now all datasets include a mapping between resource formats and file extensions. 
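+ # For example, the map below associates ".csv" with "text/csv" and ".csv.gz" with "text/csv+gzip".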
+# Based on: https://gitlab.com/datadrivendiscovery/data-supply/blob/shared/documentation/supportedResourceTypesFormats.json +FILE_EXTENSIONS = { + '.aif': 'audio/aiff', + '.aiff': 'audio/aiff', + '.flac': 'audio/flac', + '.ogg': 'audio/ogg', + '.wav': 'audio/wav', + '.mp3': 'audio/mpeg', + '.jpeg': 'image/jpeg', + '.jpg': 'image/jpeg', + '.png': 'image/png', + '.csv': 'text/csv', + '.csv.gz': 'text/csv+gzip', + '.gml': 'text/vnd.gml', + '.txt': 'text/plain', + '.mp4': 'video/mp4', + '.avi': 'video/avi', +} +FILE_EXTENSIONS_REVERSE: typing.Dict[str, typing.List[str]] = collections.defaultdict(list) +for k, v in FILE_EXTENSIONS.items(): + FILE_EXTENSIONS_REVERSE[v].append(k) + +TIME_GRANULARITIES = { + 'seconds': 'SECONDS', + 'minutes': 'MINUTES', + 'days': 'DAYS', + 'weeks': 'WEEKS', + 'months': 'MONTHS', + 'years': 'YEARS', + 'unspecified': 'UNSPECIFIED', +} +TIME_GRANULARITIES_REVERSE = {v: k for k, v in TIME_GRANULARITIES.items()} + +ALL_D3M_SEMANTIC_TYPES = \ + set(D3M_ROLE_CONSTANTS_TO_SEMANTIC_TYPES.values()) | \ + set(D3M_RESOURCE_TYPE_CONSTANTS_TO_SEMANTIC_TYPES.values()) | \ + set(D3M_COLUMN_TYPE_CONSTANTS_TO_SEMANTIC_TYPES.values()) | \ + set(BOUNDARY_SEMANTIC_TYPES) + +# A map between OpenML qualities and D3M metafeatures. +OPENML_QUALITY_MAP: typing.Dict[str, typing.Tuple[str, typing.Callable]] = { + 'Dimensionality': ('dimensionality', float), + 'NumberOfFeatures': ('number_of_attributes', int), + 'NumberOfInstances': ('number_of_instances', int), + 'NumberOfInstancesWithMissingValues': ('number_of_instances_with_missing_values', int), + 'PercentageOfInstancesWithMissingValues': ('ratio_of_instances_with_missing_values', float), + 'NumberOfMissingValues': ('number_of_missing_values', int), + 'PercentageOfMissingValues': ('ratio_of_missing_values', float), + 'NumberOfNumericFeatures': ('number_of_numeric_attributes', int), + 'PercentageOfNumericFeatures': ('ratio_of_numeric_attributes', float), + 'NumberOfBinaryFeatures': ('number_of_binary_attributes', int), + 'PercentageOfBinaryFeatures': ('ratio_of_binary_attributes', float), + 'NumberOfSymbolicFeatures': ('number_of_categorical_attributes', int), + 'PercentageOfSymbolicFeatures': ('ratio_of_categorical_attributes', float), + 'MeanNoiseToSignalRatio': ('noise_to_signal_ratio', float), + 'EquivalentNumberOfAtts': ('equivalent_number_of_attributes', int), +} + +OPENML_IGNORED_QUALITIES = { + # We use "number_distinct_values" on a target column instead. + 'NumberOfClasses', + # We use "value_counts_aggregate.max" on a target column instead. + 'MajorityClassSize', + # We use "value_probabilities_aggregate.max" on a target column instead. + 'MajorityClassPercentage', + # We use "value_counts_aggregate.min" on a target column instead. + 'MinorityClassSize', + # We use "value_probabilities_aggregate.min" on a target column instead. + 'MinorityClassPercentage', + # We use "entropy_of_values" on a target column instead. + 'ClassEntropy', + # It depends on the order of instances in the dataset, so it is a strange metafeature. + # See: https://github.com/openml/EvaluationEngine/issues/34 + 'AutoCorrelation', + # The following are not computed by code availble through primitives, and we require that. 
+ 'CfsSubsetEval_DecisionStumpAUC', + 'CfsSubsetEval_DecisionStumpErrRate', + 'CfsSubsetEval_DecisionStumpKappa', + 'CfsSubsetEval_NaiveBayesAUC', + 'CfsSubsetEval_NaiveBayesErrRate', + 'CfsSubsetEval_NaiveBayesKappa', + 'CfsSubsetEval_kNN1NAUC', + 'CfsSubsetEval_kNN1NErrRate', + 'CfsSubsetEval_kNN1NKappa', + 'DecisionStumpAUC', + 'DecisionStumpErrRate', + 'DecisionStumpKappa', + 'J48.00001.AUC', + 'J48.00001.ErrRate', + 'J48.00001.Kappa', + 'J48.0001.AUC', + 'J48.0001.ErrRate', + 'J48.0001.Kappa', + 'J48.001.AUC', + 'J48.001.ErrRate', + 'J48.001.Kappa', + 'REPTreeDepth1AUC', + 'REPTreeDepth1ErrRate', + 'REPTreeDepth1Kappa', + 'REPTreeDepth2AUC', + 'REPTreeDepth2ErrRate', + 'REPTreeDepth2Kappa', + 'REPTreeDepth3AUC', + 'REPTreeDepth3ErrRate', + 'REPTreeDepth3Kappa', + 'RandomTreeDepth1AUC', + 'RandomTreeDepth1ErrRate', + 'RandomTreeDepth1Kappa', + 'RandomTreeDepth2AUC', + 'RandomTreeDepth2ErrRate', + 'RandomTreeDepth2Kappa', + 'RandomTreeDepth3AUC', + 'RandomTreeDepth3ErrRate', + 'RandomTreeDepth3Kappa', + 'kNN1NAUC', + 'kNN1NErrRate', + 'kNN1NKappa', + 'NaiveBayesAUC', + 'NaiveBayesErrRate', + 'NaiveBayesKappa', +} + +# A map between OpenML qualities and aggregated D3M metafeatures. +OPENML_QUALITY_AGGREGATE_MAP: typing.Dict[str, typing.Tuple[str, str, typing.Callable]] = { + 'MinAttributeEntropy': ('entropy_of_attributes', 'min', float), + 'MeanAttributeEntropy': ('entropy_of_attributes', 'mean', float), + 'MaxAttributeEntropy': ('entropy_of_attributes', 'max', float), + 'Quartile1AttributeEntropy': ('entropy_of_attributes', 'quartile_1', float), + 'Quartile2AttributeEntropy': ('entropy_of_attributes', 'median', float), + 'Quartile3AttributeEntropy': ('entropy_of_attributes', 'quartile_3', float), + 'MinSkewnessOfNumericAtts': ('skew_of_attributes', 'min', float), + 'MeanSkewnessOfNumericAtts': ('skew_of_attributes', 'mean', float), + 'MaxSkewnessOfNumericAtts': ('skew_of_attributes', 'max', float), + 'Quartile1SkewnessOfNumericAtts': ('skew_of_attributes', 'quartile_1', float), + 'Quartile2SkewnessOfNumericAtts': ('skew_of_attributes', 'median', float), + 'Quartile3SkewnessOfNumericAtts': ('skew_of_attributes', 'quartile_3', float), + 'MinMutualInformation': ('mutual_information_of_attributes', 'min', float), + 'MeanMutualInformation': ('mutual_information_of_attributes', 'mean', float), + 'MaxMutualInformation': ('mutual_information_of_attributes', 'max', float), + 'Quartile1MutualInformation': ('mutual_information_of_attributes', 'quartile_1', float), + 'Quartile2MutualInformation': ('mutual_information_of_attributes', 'median', float), + 'Quartile3MutualInformation': ('mutual_information_of_attributes', 'quartile_3', float), + 'MinMeansOfNumericAtts': ('mean_of_attributes', 'min', float), + 'MaxMeansOfNumericAtts': ('mean_of_attributes', 'max', float), + 'MeanMeansOfNumericAtts': ('mean_of_attributes', 'mean', float), + 'Quartile1MeansOfNumericAtts': ('mean_of_attributes', 'quartile_1', float), + 'Quartile2MeansOfNumericAtts': ('mean_of_attributes', 'median', float), + 'Quartile3MeansOfNumericAtts': ('mean_of_attributes', 'quartile_3', float), + 'MaxStdDevOfNumericAtts': ('standard_deviation_of_attributes', 'max', float), + 'MinStdDevOfNumericAtts': ('standard_deviation_of_attributes', 'min', float), + 'MeanStdDevOfNumericAtts': ('standard_deviation_of_attributes', 'mean', float), + 'Quartile1StdDevOfNumericAtts': ('standard_deviation_of_attributes', 'quartile_1', float), + 'Quartile2StdDevOfNumericAtts': ('standard_deviation_of_attributes', 'median', float), + 
'Quartile3StdDevOfNumericAtts': ('standard_deviation_of_attributes', 'quartile_3', float), + 'MinNominalAttDistinctValues': ('number_distinct_values_of_categorical_attributes', 'min', float), + 'MaxNominalAttDistinctValues': ('number_distinct_values_of_categorical_attributes', 'max', float), + 'MeanNominalAttDistinctValues': ('number_distinct_values_of_categorical_attributes', 'mean', float), + 'StdvNominalAttDistinctValues': ('number_distinct_values_of_categorical_attributes', 'std', float), + 'MinKurtosisOfNumericAtts': ('kurtosis_of_attributes', 'min', float), + 'MaxKurtosisOfNumericAtts': ('kurtosis_of_attributes', 'max', float), + 'MeanKurtosisOfNumericAtts': ('kurtosis_of_attributes', 'mean', float), + 'Quartile1KurtosisOfNumericAtts': ('kurtosis_of_attributes', 'quartile_1', float), + 'Quartile2KurtosisOfNumericAtts': ('kurtosis_of_attributes', 'median', float), + 'Quartile3KurtosisOfNumericAtts': ('kurtosis_of_attributes', 'quartile_3', float), +} + +OPENML_ID_REGEX = re.compile(r'^/d/(\d+)$') + +DEFAULT_DATETIME = datetime.datetime.fromtimestamp(0, tz=datetime.timezone.utc) + +if not ALL_D3M_SEMANTIC_TYPES <= metadata_base.ALL_SEMANTIC_TYPES: + raise ValueError("Not all D3M semantic types are defined in metadata.") + + +class ComputeDigest(utils.Enum): + """ + Enumeration of possible approaches to computing dataset digest. + """ + + NEVER = 'NEVER' + ONLY_IF_MISSING = 'ONLY_IF_MISSING' + ALWAYS = 'ALWAYS' + + +def _add_extension_dot(extension: str) -> str: + if not extension.startswith('.'): + return '.' + extension + return extension + + +def _remove_extension_dot(extension: str) -> str: + if extension.startswith('.'): + return extension[1:] + return extension + + +def parse_size(size_string: str) -> int: + number, unit = [string.strip() for string in size_string.split()] + return int(float(number) * UNITS[unit]) + + +def is_simple_boundary(semantic_types: typing.Tuple[str]) -> bool: + """ + A simple boundary is a column with only "https://metadata.datadrivendiscovery.org/types/Boundary" + semantic type and no other. + """ + + return 'https://metadata.datadrivendiscovery.org/types/Boundary' in semantic_types and not any(boundary_semantic_type in semantic_types for boundary_semantic_type in BOUNDARY_SEMANTIC_TYPES) + + +def update_digest(hash: typing.Any, file_path: str) -> None: + with open(file_path, 'rb') as file: + while True: + # Reading is buffered, so we can read smaller chunks. + chunk = file.read(hash.block_size) + if not chunk: + break + hash.update(chunk) + + +# This exists as a reference implementation for computing a digest of D3M dataset. +# Loader below does an equivalent computation as part of dataset loading process. +def get_d3m_dataset_digest(dataset_doc_path: str) -> str: + hash = hashlib.sha256() + + with open(dataset_doc_path, 'r', encoding='utf8') as dataset_doc_file: + dataset_doc = json.load(dataset_doc_file) + + dataset_path = os.path.dirname(dataset_doc_path) + + for data_resource in dataset_doc['dataResources']: + if data_resource.get('isCollection', False): + collection_path = os.path.join(dataset_path, data_resource['resPath']) + + # We assume that we can just concat "collection_path" with a value in the column. + assert collection_path[-1] == '/' + + for filename in utils.list_files(collection_path): + file_path = os.path.join(collection_path, filename) + + # We include both the filename and the content. 
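+ # Hashing the relative path (resPath plus filename) before the file contents means that
+ # renaming or moving a file changes the dataset digest, not just editing its contents.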
+ hash.update(os.path.join(data_resource['resPath'], filename).encode('utf8')) + update_digest(hash, file_path) + + else: + resource_path = os.path.join(dataset_path, data_resource['resPath']) + + # We include both the filename and the content. + hash.update(data_resource['resPath'].encode('utf8')) + update_digest(hash, resource_path) + + # We remove digest, if it exists in dataset description, before computing the digest over the rest. + dataset_doc['about'].pop('digest', None) + + # We add to hash also the dataset description, with sorted keys. + hash.update(json.dumps(dataset_doc, sort_keys=True).encode('utf8')) + + return hash.hexdigest() + + +class Loader(metaclass=utils.AbstractMetaclass): + """ + A base class for dataset loaders. + """ + + @abc.abstractmethod + def can_load(self, dataset_uri: str) -> bool: + """ + Return ``True`` if this loader can load a dataset from a given URI ``dataset_uri``. + + Parameters + ---------- + dataset_uri: + A URI to load a dataset from. + + Returns + ------- + ``True`` if this loader can load a dataset from ``dataset_uri``. + """ + + @abc.abstractmethod + def load(self, dataset_uri: str, *, dataset_id: str = None, dataset_version: str = None, dataset_name: str = None, lazy: bool = False, + compute_digest: ComputeDigest = ComputeDigest.ONLY_IF_MISSING, strict_digest: bool = False, handle_score_split: bool = True) -> 'Dataset': + """ + Loads the dataset at ``dataset_uri``. + + Parameters + ---------- + dataset_uri: + A URI to load. + dataset_id: + Override dataset ID determined by the loader. + dataset_version: + Override dataset version determined by the loader. + dataset_name: + Override dataset name determined by the loader. + lazy: + If ``True``, load only top-level metadata and not whole dataset. + compute_digest: + Compute a digest over the data? + strict_digest: + If computed digest does not match the one provided in metadata, raise an exception? + handle_score_split: + If a scoring dataset has target values in a separate file, merge them in? + + Returns + ------- + A loaded dataset. + """ + + +class Saver(metaclass=utils.AbstractMetaclass): + """ + A base class for dataset savers. + """ + + @abc.abstractmethod + def can_save(self, dataset_uri: str) -> bool: + """ + Return ``True`` if this saver can save a dataset to a given URI ``dataset_uri``. + + Parameters + ---------- + dataset_uri: + A URI to save a dataset to. + + Returns + ------- + ``True`` if this saver can save a dataset to ``dataset_uri``. + """ + + @abc.abstractmethod + def save(self, dataset: 'Dataset', dataset_uri: str, *, compute_digest: ComputeDigest = ComputeDigest.ALWAYS, preserve_metadata: bool = True) -> None: + """ + Saves the dataset ``dataset`` to ``dataset_uri``. + + Parameters + ---------- + dataset: + A dataset to save. + dataset_uri: + A URI to save to. + compute_digest: + Compute digest over the data when saving? + preserve_metadata: + When saving a dataset, store its metadata as well? + """ + + +class OpenMLDatasetLoader(Loader): + """ + A class for loading OpenML datasets. 
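+ Such datasets are identified by URIs of the form "https://www.openml.org/d/<dataset id>",
+ which is what "can_load" below checks for.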
+ """ + + def can_load(self, dataset_uri: str) -> bool: + try: + parsed_uri = url_parse.urlparse(dataset_uri) + except Exception: + return False + + if parsed_uri.scheme != 'https': + return False + + if 'www.openml.org' != parsed_uri.netloc: + return False + + if OPENML_ID_REGEX.search(parsed_uri.path) is None: + return False + + return True + + def _load_data(self, openml_dataset: openml.OpenMLDataset, resources: typing.Dict, metadata: metadata_base.DataMetadata) -> metadata_base.DataMetadata: + # OpenML package always computes digests when downloading data and checks them, failing if they do not match. + # See: https://github.com/openml/OpenML/issues/1027 + data, _, categorical_indicator, column_names = openml_dataset.get_data(include_row_id=True, include_ignore_attribute=True, dataset_format='dataframe') + + assert data.shape[1] == len(categorical_indicator) + assert data.shape[1] == len(column_names) + assert data.shape[1] == len(openml_dataset.features) + assert set(data.columns) == set(column_names) + + if openml_dataset.ignore_attribute: + if isinstance(openml_dataset.ignore_attribute, str): + ignore_columns = set(openml_dataset.ignore_attribute.split(',')) + else: + ignore_columns = set(openml_dataset.ignore_attribute) + else: + ignore_columns = set() + + assert ignore_columns <= set(column_names) + + if openml_dataset.default_target_attribute: + if isinstance(openml_dataset.default_target_attribute, str): + target_columns = set(openml_dataset.default_target_attribute.split(',')) + else: + target_columns = set(openml_dataset.default_target_attribute) + else: + target_columns = set() + + assert target_columns <= set(column_names) + + openml_column_data_types = {} + for i, column_name in enumerate(column_names): + openml_column_data_types[column_name] = openml_dataset.features[i].data_type + + assert (openml_column_data_types[column_name] == 'nominal' and categorical_indicator[i]) or (openml_column_data_types[column_name] != 'nominal' and not categorical_indicator[i]) + + # For nominal data types we store a list of possible values. + if openml_column_data_types[column_name] == 'nominal': + openml_column_data_types[column_name] = openml_dataset.features[i].nominal_values + + data = self._convert_categorical_columns(data, categorical_indicator) + + if openml_dataset.row_id_attribute: + assert openml_dataset.row_id_attribute in column_names + + row_id_column = openml_dataset.row_id_attribute + else: + assert 'd3mIndex' not in column_names + + # We do not update digest with new data generated here. This is OK because this data is determined by + # original data so original digest still applies. When saving a new digest has to be computed anyway + # because this data will have to be converted to string. 
+ data.insert(0, 'd3mIndex', range(len(data))) + + column_names.insert(0, 'd3mIndex') + categorical_indicator = [False] + list(categorical_indicator) + openml_column_data_types['d3mIndex'] = 'integer' + row_id_column = 'd3mIndex' + + data = container_pandas.DataFrame(data) + + resources['learningData'] = data + metadata = metadata.update((), { + 'dimension': {'length': len(resources)}, + }) + + metadata = metadata.update(('learningData',), { + 'structural_type': type(data), + 'dimension': { + 'length': len(data) + }, + }) + metadata = metadata.update(('learningData', metadata_base.ALL_ELEMENTS), { + 'dimension': { + 'length': len(column_names) + }, + }) + + for column_index, column_name in enumerate(column_names): + column_metadata = { + 'semantic_types': [ + self._semantic_type(openml_column_data_types[column_name]), + ], + 'name': column_name, + } + + if column_name in target_columns: + column_metadata['semantic_types'].append('https://metadata.datadrivendiscovery.org/types/SuggestedTarget') + + if column_name == row_id_column: + column_metadata['semantic_types'].append('https://metadata.datadrivendiscovery.org/types/PrimaryKey') + elif column_name not in ignore_columns: + column_metadata['semantic_types'].append('https://metadata.datadrivendiscovery.org/types/Attribute') + + if utils.is_sequence(openml_column_data_types[column_name]): + # We convert all categorical columns into string columns. + column_metadata['structural_type'] = str + elif openml_column_data_types[column_name] == 'nominal': + raise exceptions.InvalidStateError("Nominal column data type which has not been converted to a list of values.") + elif openml_column_data_types[column_name] in ['string', 'date']: + column_metadata['structural_type'] = str + elif openml_column_data_types[column_name] == 'integer': + column_metadata['structural_type'] = int + else: + column_metadata['structural_type'] = float + + metadata = metadata.update(('learningData', metadata_base.ALL_ELEMENTS, column_index), column_metadata) + + metadata = metadata.set_table_metadata(at=('learningData',)) + + # Adding it here so that the order of semantic types is consistent between saving and loading of datasets. 
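+ # (The single "learningData" table produced for an OpenML dataset is also
+ # marked as the dataset entry point here.)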
+ metadata = metadata.add_semantic_type(('learningData',), 'https://metadata.datadrivendiscovery.org/types/DatasetEntryPoint') + + return metadata + + def _get_dataset_metafeatures(self, openml_dataset: openml.OpenMLDataset) -> typing.Dict: + openml_qualities = openml_dataset.qualities or {} + metafeatures: typing.Dict = {} + + unknown_qualities = set(openml_qualities.keys()) - set(OPENML_QUALITY_MAP.keys()) - set(OPENML_QUALITY_AGGREGATE_MAP.keys()) - OPENML_IGNORED_QUALITIES + if unknown_qualities: + logger.warning("Unknown OpenML qualities in dataset %(dataset_id)s: %(unknown_qualities)s", { + 'dataset_id': openml_dataset.dataset_id, + 'unknown_qualities': sorted(unknown_qualities), + }) + + for quality_key, quality_value in openml_qualities.items(): + if numpy.isnan(quality_value): + continue + + if quality_key in OPENML_IGNORED_QUALITIES: + continue + + if quality_key in OPENML_QUALITY_MAP: + mapped_quality, quality_type = OPENML_QUALITY_MAP[quality_key] + + metafeatures[mapped_quality] = quality_type(quality_value) + + elif quality_key in OPENML_QUALITY_AGGREGATE_MAP: + mapped_quality, aggregate_key, quality_type = OPENML_QUALITY_AGGREGATE_MAP[quality_key] + + if mapped_quality not in metafeatures: + metafeatures[mapped_quality] = {} + + metafeatures[mapped_quality][aggregate_key] = quality_type(quality_value) + + # We warn about unknown qualities above. + + return metafeatures + + def _semantic_type(self, data_type: str) -> str: + if utils.is_sequence(data_type): + if len(data_type) == 2: + return 'http://schema.org/Boolean' + else: + return 'https://metadata.datadrivendiscovery.org/types/CategoricalData' + elif data_type == 'integer': + return 'http://schema.org/Integer' + elif data_type == 'real': + return 'http://schema.org/Float' + elif data_type == 'numeric': + return 'http://schema.org/Float' + elif data_type == 'string': + return 'http://schema.org/Text' + elif data_type == 'date': + return 'http://schema.org/DateTime' + else: + raise exceptions.UnexpectedValueError("Data type '{data_type}' is not supported.".format(data_type=data_type)) + + def _get_dataset_metadata(self, openml_dataset: openml.OpenMLDataset) -> typing.Dict: + """ + Returns OpenML only metadata converted to D3M metadata. It also computes digest using this metadata and expected data digest. 
+ """ + + dataset_metadata: typing.Dict[str, typing.Any] = { + 'id': str(openml_dataset.dataset_id), + } + + if openml_dataset.name: + dataset_metadata['name'] = openml_dataset.name + if openml_dataset.description: + dataset_metadata['description'] = openml_dataset.description + if openml_dataset.version_label: + dataset_metadata['version'] = openml_dataset.version_label + if openml_dataset.tag: + dataset_metadata['keywords'] = openml_dataset.tag + + dataset_source: typing.Dict[str, typing.Any] = { + 'uris': [] + } + + if openml_dataset.creator: + dataset_source['name'] = openml_dataset.creator + if openml_dataset.licence: + dataset_source['license'] = openml_dataset.licence + if openml_dataset.citation: + dataset_source['citation'] = openml_dataset.citation + if openml_dataset.collection_date: + dataset_source['published'] = utils.datetime_for_json(dateutil.parser.parse(openml_dataset.collection_date, default=DEFAULT_DATETIME, fuzzy=True)) + if openml_dataset.openml_url or openml_dataset.url: + dataset_source['uris'].append(openml_dataset.openml_url or openml_dataset.url) + if openml_dataset.original_data_url: + dataset_source['uris'].append(openml_dataset.original_data_url) + if openml_dataset.paper_url: + dataset_source['uris'].append(openml_dataset.paper_url) + + if not dataset_source['uris']: + del dataset_source['uris'] + if dataset_source: + dataset_metadata['source'] = dataset_source + + if not openml_dataset.md5_checksum: + raise exceptions.UnexpectedValueError("OpenML dataset {id} does not have MD5 checksum.".format(id=openml_dataset.dataset_id)) + + dataset_metadata['digest'] = utils.compute_digest(dataset_metadata, openml_dataset.md5_checksum.encode('utf8')) + + return dataset_metadata + + def _convert_categorical_columns(self, data: pandas.DataFrame, categorical_indicator: typing.List[bool]) -> pandas.DataFrame: + """ + Converts categorical DataFrame columns to str columns. In D3M pipelines generally expect categorical + columns to be encoded as strings and only later the pipeline encodes them in some way. + """ + + for column_index, is_categorical in enumerate(categorical_indicator): + if not is_categorical: + continue + + column_name = data.columns[column_index] + + data[column_name] = data[column_name].astype(str) + + return data + + # "strict_digest" and "compute_digest" are ignored because OpenML package always computes digests when downloading data + # and checks them, failing if they do not match. See: https://github.com/openml/OpenML/issues/1027 + # "handle_score_split" is ignored. + def load(self, dataset_uri: str, *, dataset_id: str = None, dataset_version: str = None, dataset_name: str = None, lazy: bool = False, + compute_digest: ComputeDigest = ComputeDigest.ONLY_IF_MISSING, strict_digest: bool = False, handle_score_split: bool = True) -> 'Dataset': + assert self.can_load(dataset_uri) + + parsed_uri = url_parse.urlparse(dataset_uri, allow_fragments=False) + dataset_path_id = OPENML_ID_REGEX.search(parsed_uri.path)[1] + + try: + # We download just metadata first. + openml_dataset = openml.datasets.get_dataset(dataset_path_id, download_data=False) + except openml.exceptions.OpenMLServerException as error: + raise exceptions.DatasetNotFoundError( + "OpenML dataset '{dataset_uri}' cannot be found.".format(dataset_uri=dataset_uri), + ) from error + + # This converts OpenML dataset metadata to D3M dataset metadata. + dataset_metadata = self._get_dataset_metadata(openml_dataset) + + assert dataset_metadata['id'] == dataset_path_id + + # Use overrides if provided. 
Digest is not computed over those changes on purpose. + if dataset_id is not None: + dataset_metadata['id'] = dataset_id + if dataset_version is not None: + dataset_metadata['version'] = dataset_version + if dataset_name is not None: + dataset_metadata['name'] = dataset_name + + # Other standard metadata. + dataset_metadata.update({ + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': Dataset, + 'location_uris': [ + dataset_uri, + ], + 'dimension': { + 'name': 'resources', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/DatasetResource'], + 'length': 0, + }, + }) + + dataset_metafeatures = self._get_dataset_metafeatures(openml_dataset) + if dataset_metafeatures: + # We set metafeatures on the top level even if otherwise in D3M we set metafeatures at the resource level or + # even target column level, but setting them here allows one to access them in the lazy mode (when there are + # no resources yet in the dataset). We also do not include them into a digest because for D3M datasets + # the digest is just about the stored files of the dataset and not any additional metadata added by the loader. + dataset_metadata['data_metafeatures'] = dataset_metafeatures + + resources: typing.Dict = {} + metadata = metadata_base.DataMetadata(dataset_metadata) + + if not lazy: + load_lazy = None + + metadata = self._load_data( + openml_dataset, resources, metadata, + ) + + else: + def load_lazy(dataset: Dataset) -> None: + # "dataset" can be used as "resources", it is a dict of values. + dataset.metadata = self._load_data( + openml_dataset, dataset, dataset.metadata, + ) + + dataset._load_lazy = None + + return Dataset(resources, metadata, load_lazy=load_lazy) + + +class D3MDatasetLoader(Loader): + """ + A class for loading of D3M datasets. + + Loader support only loading from a local file system. + URI should point to the ``datasetDoc.json`` file in the D3M dataset directory. + """ + + SUPPORTED_VERSIONS = {'3.0', '3.1', '3.1.1', '3.1.2', '3.2.0', '3.2.1', '3.3.0', '3.3.1', '4.0.0', '4.1.0'} + + def can_load(self, dataset_uri: str) -> bool: + try: + parsed_uri = url_parse.urlparse(dataset_uri, allow_fragments=False) + except Exception: + return False + + if parsed_uri.scheme != 'file': + return False + + if parsed_uri.netloc not in ['', 'localhost']: + return False + + if not parsed_uri.path.startswith('/'): + return False + + if os.path.basename(parsed_uri.path) != 'datasetDoc.json': + return False + + return True + + def _load_data(self, resources: typing.Dict, metadata: metadata_base.DataMetadata, *, dataset_path: str, dataset_doc: typing.Dict, + dataset_id: typing.Optional[str], dataset_digest: typing.Optional[str], + compute_digest: ComputeDigest, strict_digest: bool, handle_score_split: bool) -> typing.Tuple[metadata_base.DataMetadata, typing.Optional[str]]: + # Allowing "True" for backwards compatibility. 
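+ # Note: when a digest is requested, it is a SHA-256 hash over each resource's
+ # path and file contents plus the dataset description itself (with any
+ # pre-existing "digest" field removed first), as assembled at the end of
+ # this method.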
+ if compute_digest is True or compute_digest == ComputeDigest.ALWAYS or (compute_digest == ComputeDigest.ONLY_IF_MISSING and dataset_digest is None): + hash = hashlib.sha256() + else: + hash = None + + for data_resource in dataset_doc['dataResources']: + if data_resource.get('isCollection', False): + resources[data_resource['resID']], metadata = self._load_collection(dataset_path, data_resource, metadata, hash) + else: + loader = getattr(self, '_load_resource_type_{resource_type}'.format(resource_type=data_resource['resType']), None) + if loader is None: + raise exceptions.NotSupportedError("Resource type '{resource_type}' is not supported.".format(resource_type=data_resource['resType'])) + + resources[data_resource['resID']], metadata = loader(dataset_path, data_resource, metadata, hash) + + # Backwards compatibility. If there is no resource marked as a dataset entry point, + # check if there is any resource with a suitable filename. + for data_resource in dataset_doc['dataResources']: + if metadata.has_semantic_type((data_resource['resID'],), 'https://metadata.datadrivendiscovery.org/types/DatasetEntryPoint'): + break + else: + for data_resource in dataset_doc['dataResources']: + if os.path.splitext(os.path.basename(data_resource['resPath']))[0] == 'learningData': + metadata = metadata.add_semantic_type((data_resource['resID'],), 'https://metadata.datadrivendiscovery.org/types/DatasetEntryPoint') + + # Handle a special case for SCORE dataset splits (those which have "targets.csv" file). + # They are the same as TEST dataset splits, but we present them differently, so that + # SCORE dataset splits have targets as part of data. + # See: https://gitlab.com/datadrivendiscovery/d3m/issues/176 + if handle_score_split and os.path.exists(os.path.join(dataset_path, '..', 'targets.csv')): + self._merge_score_targets(resources, metadata, dataset_path, hash) + + if hash is not None: + # We remove digest, if it exists in dataset description, before computing the digest over the rest. + # We modify "dataset_doc" here, but this is OK, we do not need it there anymore at this point. + dataset_doc['about'].pop('digest', None) + + # We add to hash also the dataset description, with sorted keys. + hash.update(json.dumps(dataset_doc, sort_keys=True).encode('utf8')) + + new_dataset_digest = hash.hexdigest() + + if dataset_digest is not None and dataset_digest != new_dataset_digest: + if strict_digest: + raise exceptions.DigestMismatchError( + "Digest for dataset '{dataset_id}' does not match one from dataset description. Dataset description digest: {dataset_digest}. Computed digest: {new_dataset_digest}.".format( + dataset_id=dataset_id or dataset_doc['about']['datasetID'], + dataset_digest=dataset_digest, + new_dataset_digest=new_dataset_digest, + ) + ) + else: + logger.warning( + "Digest for dataset '%(dataset_id)s' does not match one from dataset description. Dataset description digest: %(dataset_digest)s. 
Computed digest: %(new_dataset_digest)s.", + { + 'dataset_id': dataset_id or dataset_doc['about']['datasetID'], + 'dataset_digest': dataset_digest, + 'new_dataset_digest': new_dataset_digest, + }, + ) + else: + new_dataset_digest = dataset_doc['about'].get('digest', None) + + return metadata, new_dataset_digest + + def load(self, dataset_uri: str, *, dataset_id: str = None, dataset_version: str = None, dataset_name: str = None, lazy: bool = False, + compute_digest: ComputeDigest = ComputeDigest.ONLY_IF_MISSING, strict_digest: bool = False, handle_score_split: bool = True) -> 'Dataset': + assert self.can_load(dataset_uri) + + parsed_uri = url_parse.urlparse(dataset_uri, allow_fragments=False) + + dataset_doc_path = parsed_uri.path + dataset_path = os.path.dirname(dataset_doc_path) + + try: + with open(dataset_doc_path, 'r', encoding='utf8') as dataset_doc_file: + dataset_doc = json.load(dataset_doc_file) + except FileNotFoundError as error: + raise exceptions.DatasetNotFoundError( + "D3M dataset '{dataset_uri}' cannot be found.".format(dataset_uri=dataset_uri), + ) from error + + dataset_schema_version = dataset_doc.get('about', {}).get('datasetSchemaVersion', '3.3.0') + if dataset_schema_version not in self.SUPPORTED_VERSIONS: + logger.warning("Loading a dataset with unsupported schema version '%(version)s'. Supported versions: %(supported_versions)s", { + 'version': dataset_schema_version, + 'supported_versions': self.SUPPORTED_VERSIONS, + }) + + # We do not compute digest here, but we use one from dataset description if it exist. + # This is different from other loaders which compute digest when lazy loading and check + # it after data is finally loaded to make sure data has not changed in meantime. + dataset_digest = dataset_doc['about'].get('digest', None) + + resources: typing.Dict = {} + metadata = metadata_base.DataMetadata() + + metadata = self._load_top_qualities(dataset_doc, metadata) + + if not lazy: + load_lazy = None + + metadata = self._load_data_qualities(dataset_doc, metadata) + + metadata, dataset_digest = self._load_data( + resources, metadata, dataset_path=dataset_path, dataset_doc=dataset_doc, dataset_id=dataset_id, + dataset_digest=dataset_digest, compute_digest=compute_digest, strict_digest=strict_digest, + handle_score_split=handle_score_split, + ) + + else: + def load_lazy(dataset: Dataset) -> None: + nonlocal dataset_digest + + dataset.metadata = self._load_data_qualities(dataset_doc, dataset.metadata) + + # "dataset" can be used as "resources", it is a dict of values. + dataset.metadata, dataset_digest = self._load_data( + dataset, dataset.metadata, dataset_path=dataset_path, dataset_doc=dataset_doc, dataset_id=dataset_id, + dataset_digest=dataset_digest, compute_digest=compute_digest, strict_digest=strict_digest, + handle_score_split=handle_score_split, + ) + + new_metadata = { + 'dimension': {'length': len(dataset)}, + } + + if dataset_digest is not None: + new_metadata['digest'] = dataset_digest + + dataset.metadata = dataset.metadata.update((), new_metadata) + dataset.metadata = dataset.metadata.generate(dataset) + + dataset._load_lazy = None + + document_dataset_id = dataset_doc['about']['datasetID'] + # Handle a special case for SCORE dataset splits (those which have "targets.csv" file). + # They are the same as TEST dataset splits, but we present them differently, so that + # SCORE dataset splits have targets as part of data. Because of this we also update + # corresponding dataset ID. 
+ # See: https://gitlab.com/datadrivendiscovery/d3m/issues/176 + if handle_score_split and os.path.exists(os.path.join(dataset_path, '..', 'targets.csv')) and document_dataset_id.endswith('_TEST'): + document_dataset_id = document_dataset_id[:-5] + '_SCORE' + + dataset_metadata = { + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': Dataset, + 'id': dataset_id or document_dataset_id, + 'name': dataset_name or dataset_doc['about']['datasetName'], + 'dimension': { + 'name': 'resources', + 'length': len(resources), + }, + } + + if dataset_version or dataset_doc['about'].get('datasetVersion', None): + dataset_metadata['version'] = dataset_version or dataset_doc['about']['datasetVersion'] + + if dataset_digest is not None: + dataset_metadata['digest'] = dataset_digest + + if dataset_doc['about'].get('description', None): + dataset_metadata['description'] = dataset_doc['about']['description'] + + if dataset_doc['about'].get('approximateSize', None): + try: + dataset_metadata['approximate_stored_size'] = parse_size(dataset_doc['about']['approximateSize']) + except Exception as error: + raise ValueError("Unable to parse 'approximateSize': {approximate_size}".format(approximate_size=dataset_doc['about']['approximateSize'])) from error + + dataset_source = {} + + if 'redacted' in dataset_doc['about']: + dataset_source['redacted'] = dataset_doc['about']['redacted'] + + # "license" is often an empty string and in that case we do not want + # really to set the field in dataset metadata. + if dataset_doc['about'].get('license', None): + dataset_source['license'] = dataset_doc['about']['license'] + + if 'humanSubjectsResearch' in dataset_doc['about']: + dataset_source['human_subjects_research'] = dataset_doc['about']['humanSubjectsResearch'] + + if dataset_doc['about'].get('source', None): + dataset_source['name'] = dataset_doc['about']['source'] + + if dataset_doc['about'].get('citation', None): + dataset_source['citation'] = dataset_doc['about']['citation'] + + if dataset_doc['about'].get('publicationDate', None): + try: + dataset_source['published'] = utils.datetime_for_json(dateutil.parser.parse(dataset_doc['about']['publicationDate'], default=DEFAULT_DATETIME, fuzzy=True)) + except Exception as error: + raise ValueError("Unable to parse 'publicationDate': {publication_date}".format(publication_date=dataset_doc['about']['publicationDate'])) from error + + if dataset_source: + dataset_metadata['source'] = dataset_source + + metadata = metadata.update((), dataset_metadata) + + # We reconstruct the URI to normalize it. 
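+ # (The local "datasetDoc.json" path is turned back into a normalized
+ # "file://" URI and prepended to any "location_uris" already present in
+ # the metadata.)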
+ location_uri = utils.fix_uri(dataset_doc_path) + location_uris = list(metadata.query(()).get('location_uris', [])) + if location_uri not in location_uris: + location_uris.insert(0, location_uri) + metadata = metadata.update((), {'location_uris': location_uris}) + + if dataset_doc['about'].get('datasetURI', None) and dataset_doc['about']['datasetURI'] not in location_uris: + location_uris.append(dataset_doc['about']['datasetURI']) + metadata = metadata.update((), {'location_uris': location_uris}) + + semantic_types = list(metadata.query(()).get('dimension', {}).get('semantic_types', [])) + if 'https://metadata.datadrivendiscovery.org/types/DatasetResource' not in semantic_types: + semantic_types.append('https://metadata.datadrivendiscovery.org/types/DatasetResource') + metadata = metadata.update((), {'dimension': {'semantic_types': semantic_types}}) + + source_uris = list(metadata.query(()).get('source', {}).get('uris', [])) + if dataset_doc['about'].get('sourceURI', None) and dataset_doc['about']['sourceURI'] not in source_uris: + source_uris.insert(0, dataset_doc['about']['sourceURI']) + metadata = metadata.update((), {'source': {'uris': source_uris}}) + + keywords = list(metadata.query(()).get('keywords', [])) + if dataset_doc['about'].get('applicationDomain', None) and dataset_doc['about']['applicationDomain'] not in keywords: + # Application domain has no vocabulary specified so we map it to keywords. + keywords.append(dataset_doc['about']['applicationDomain']) + metadata.update((), {'keywords': keywords}) + + return Dataset(resources, metadata, load_lazy=load_lazy) + + def _load_top_qualities(self, dataset_doc: typing.Dict, metadata: metadata_base.DataMetadata) -> metadata_base.DataMetadata: + ALL_ELEMENTS_REPR = repr(metadata_base.ALL_ELEMENTS) + + for quality in dataset_doc.get('qualities', []): + restricted_to = quality.get('restrictedTo', {}) + + # D3M metadata stored as D3M qualities. + if quality['qualName'] == 'metadata': + if restricted_to['resID'] == '': + selector: metadata_base.TupleSelector = () + else: + # Here we load only top-level metadata. + continue + + # TODO: Optimize, see: https://gitlab.com/datadrivendiscovery/d3m/issues/408 + metadata = metadata.update(selector, utils.from_reversible_json_structure(quality['qualValue'])) + + return metadata + + def _load_data_qualities(self, dataset_doc: typing.Dict, metadata: metadata_base.DataMetadata) -> metadata_base.DataMetadata: + ALL_ELEMENTS_REPR = repr(metadata_base.ALL_ELEMENTS) + + for quality in dataset_doc.get('qualities', []): + restricted_to = quality.get('restrictedTo', {}) + + # D3M metadata stored as D3M qualities. + if quality['qualName'] == 'metadata': + if restricted_to['resID'] == '': + # Here we load only non top-level metadata. + continue + else: + resource_selector = [metadata_base.ALL_ELEMENTS if segment == ALL_ELEMENTS_REPR else segment for segment in restricted_to['resComponent']['selector']] + selector: metadata_base.TupleSelector = (restricted_to['resID'], *resource_selector) + + # TODO: Optimize, see: https://gitlab.com/datadrivendiscovery/d3m/issues/408 + metadata = metadata.update(selector, utils.from_reversible_json_structure(quality['qualValue'])) + + # An alternative way to describe LUPI datasets using D3M qualities. 
+ # See: https://gitlab.com/datadrivendiscovery/d3m/issues/61 + # https://gitlab.com/datadrivendiscovery/d3m/issues/225 + elif quality['qualName'] == 'privilegedFeature': + if quality['qualValue'] != 'True': + continue + + column_index = restricted_to.get('resComponent', {}).get('columnIndex', None) + if column_index is not None: + metadata = self._add_semantic_type_for_column_index(metadata, restricted_to['resID'], column_index, 'https://metadata.datadrivendiscovery.org/types/SuggestedPrivilegedData') + continue + + column_name = restricted_to.get('resComponent', {}).get('columnName', None) + if column_name is not None: + metadata = self._add_semantic_type_for_column_name(metadata, restricted_to['resID'], column_name, 'https://metadata.datadrivendiscovery.org/types/SuggestedPrivilegedData') + continue + + return metadata + + def _add_semantic_type_for_column_index(self, metadata: metadata_base.DataMetadata, resource_id: str, column_index: int, semantic_type: str) -> metadata_base.DataMetadata: + return metadata.add_semantic_type((resource_id, metadata_base.ALL_ELEMENTS, column_index), semantic_type) + + def _add_semantic_type_for_column_name(self, metadata: metadata_base.DataMetadata, resource_id: str, column_name: str, semantic_type: str) -> metadata_base.DataMetadata: + column_index = metadata.get_column_index_from_column_name(column_name, at=(resource_id,)) + + return self._add_semantic_type_for_column_index(metadata, resource_id, column_index, semantic_type) + + def _load_collection(self, dataset_path: str, data_resource: typing.Dict, metadata: metadata_base.DataMetadata, + hash: typing.Any) -> typing.Tuple[container_pandas.DataFrame, metadata_base.DataMetadata]: + assert data_resource.get('isCollection', False) + + collection_path = os.path.join(dataset_path, data_resource['resPath']) + + media_types_with_extensions = {} + # Legacy (before v4.0.0). We obtain a list of file extensions from the global list of file extensions. + if utils.is_sequence(data_resource['resFormat']): + for format in data_resource['resFormat']: + format_media_type = MEDIA_TYPES[format] + media_types_with_extensions[format_media_type] = [_add_extension_dot(extension) for extension in FILE_EXTENSIONS_REVERSE[format_media_type]] + else: + for format, extensions in data_resource['resFormat'].items(): + # We allow unknown formats, hoping that they are proper media types already. + format_media_type = MEDIA_TYPES.get(format, format) + # We do not really care if file extensions are not on the global list of file extensions. 
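+ # (Each declared format maps to a media type and its allowed file extensions;
+ # a reverse extension-to-media-type map is then built below and used to
+ # classify every file found in the collection.)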
+ media_types_with_extensions[format_media_type] = [_add_extension_dot(extension) for extension in extensions] + + all_media_types_set = set(media_types_with_extensions.keys()) + + reverse_media_types_with_extensions: typing.Dict[str, str] = {} + for media_type, extensions in media_types_with_extensions.items(): + for extension in extensions: + if extension in reverse_media_types_with_extensions: + raise exceptions.InvalidDatasetError("Conflicting file extension '{file_extension}': {media_type1} and {media_type2}".format( + file_extension=extension, + media_type1=reverse_media_types_with_extensions[extension], + media_type2=media_type, + )) + + reverse_media_types_with_extensions[extension] = media_type + + filenames = [] + media_types = [] + + for filename in utils.list_files(collection_path): + file_path = os.path.join(collection_path, filename) + + filename_extension = os.path.splitext(filename)[1] + + filenames.append(filename) + + try: + media_type = reverse_media_types_with_extensions[filename_extension] + except KeyError as error: + raise TypeError("Unable to determine a media type for the file extension of file '{filename}'.".format(filename=filename)) from error + + media_types.append(media_type) + + if hash is not None: + # We include both the filename and the content. + hash.update(os.path.join(data_resource['resPath'], filename).encode('utf8')) + update_digest(hash, file_path) + + data = container_pandas.DataFrame({'filename': filenames}, columns=['filename'], dtype=object) + + metadata = metadata.update((data_resource['resID'],), { + 'structural_type': type(data), + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/Table', + 'https://metadata.datadrivendiscovery.org/types/FilesCollection', + ], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': len(data), + }, + }) + + metadata = metadata.update((data_resource['resID'], metadata_base.ALL_ELEMENTS), { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 1, + }, + }) + + location_base_uri = utils.fix_uri(collection_path) + # We want to make sure you can just concat with the filename. + if not location_base_uri.endswith('/'): + location_base_uri += '/' + + media_types_set = set(media_types) + + extra_media_types = all_media_types_set - media_types_set + if extra_media_types: + logger.warning("File collection '%(resource_id)s' claims more file formats than are used in files. Extraneous formats: %(formats)s", { + 'resource_id': data_resource['resID'], + 'formats': [MEDIA_TYPES_REVERSE.get(format, format) for format in sorted(extra_media_types)], + }) + + # Normalize the list based on real media types used. + all_media_types = sorted(media_types_set) + + column_metadata = { + 'name': 'filename', + 'structural_type': str, + 'location_base_uris': [ + location_base_uri, + ], + # A superset of all media types of files in this collection. 
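+ # (When files of more than one media type are present, each row is
+ # additionally annotated further below with the media type of its own file.)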
+ 'media_types': all_media_types, + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/PrimaryKey', + 'https://metadata.datadrivendiscovery.org/types/FileName', + D3M_RESOURCE_TYPE_CONSTANTS_TO_SEMANTIC_TYPES[data_resource['resType']], + ], + } + + if data_resource.get('columns', None): + columns_metadata = [] + + for column in data_resource['columns']: + columns_metadata.append(self._get_column_metadata(column)) + columns_metadata[-1]['column_index'] = column['colIndex'] + columns_metadata[-1]['column_name'] = column['colName'] + + column_metadata['file_columns'] = columns_metadata + + if data_resource.get('columnsCount', None) is not None: + column_metadata['file_columns_count'] = data_resource['columnsCount'] + + metadata = metadata.update((data_resource['resID'], metadata_base.ALL_ELEMENTS, 0), column_metadata) + + # If there are different rows with different media types, we have to set + # on each row which media type it is being used. + if len(all_media_types) > 1: + # The following modifies metadata for rows directly instead of through metadata methods + # to achieve useful performance because some datasets contain many files which means many + # rows have their "media_types" set. Setting it one by one makes things to slow. + # Here we are taking advantage of quite few assumptions: we are modifying metadata in-place + # because we know it is only us having a reference to it, we directly set metadata for + # rows because we know no other metadata exists for rows, moreover, we also know no other + # metadata exists for rows through any higher ALL_ELEMENTS. + # TODO: Expose this as a general metadata method. + # TODO: Or just optimize, see: https://gitlab.com/datadrivendiscovery/d3m/issues/408 + + resource_metadata_entry = metadata._current_metadata.elements[data_resource['resID']] + resource_row_elements_evolver = resource_metadata_entry.elements.evolver() + resource_row_elements_evolver._reallocate(2 * len(media_types)) + for i, media_type in enumerate(media_types): + column_metadata_entry = metadata_base.MetadataEntry( + metadata=frozendict.FrozenOrderedDict({ + # A media type of this particular file. 
+ 'media_types': (media_type,), + }), + is_empty=False, + ) + + row_metadata_entry = metadata_base.MetadataEntry( + elements=utils.EMPTY_PMAP.set(0, column_metadata_entry), + is_empty=False, + is_elements_empty=False, + ) + + resource_row_elements_evolver.set(i, row_metadata_entry) + + resource_metadata_entry.elements = resource_row_elements_evolver.persistent() + resource_metadata_entry.is_elements_empty = not resource_metadata_entry.elements + resource_metadata_entry.update_is_empty() + + return data, metadata + + def _load_resource_type_table(self, dataset_path: str, data_resource: typing.Dict, metadata: metadata_base.DataMetadata, + hash: typing.Any) -> typing.Tuple[container_pandas.DataFrame, metadata_base.DataMetadata]: + assert not data_resource.get('isCollection', False) + + data = None + column_names = None + data_path = os.path.join(dataset_path, data_resource['resPath']) + + if utils.is_sequence(data_resource['resFormat']) and len(data_resource['resFormat']) == 1: + resource_format = data_resource['resFormat'][0] + elif isinstance(data_resource['resFormat'], typing.Mapping) and len(data_resource['resFormat']) == 1: + resource_format = list(data_resource['resFormat'].keys())[0] + else: + resource_format = None + + if resource_format in ['text/csv', 'text/csv+gzip']: + data = pandas.read_csv( + data_path, + # We do not want to do any conversion of values at this point. + # This should be done by primitives later on. + dtype=str, + # We always expect one row header. + header=0, + # We want empty strings and not NaNs. + na_filter=False, + compression='gzip' if resource_format == 'text/csv+gzip' else None, + encoding='utf8', + low_memory=False, + memory_map=True, + ) + + column_names = list(data.columns) + + if data_resource.get('columnsCount', None) is not None and len(column_names) != data_resource['columnsCount']: + raise ValueError("Mismatch between columns count in data {data_count} and expected count {expected_count}.".format( + data_count=len(column_names), + expected_count=data_resource['columnsCount'], + )) + + if hash is not None: + # We include both the filename and the content. + # TODO: Currently we read the file twice, once for reading and once to compute digest. Could we do it in one pass? Would it make it faster? 
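+ # As for collections above, the digest covers both the resource path and the
+ # raw file bytes, so renaming a resource changes the dataset digest.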
+ hash.update(data_resource['resPath'].encode('utf8')) + update_digest(hash, data_path) + + else: + raise exceptions.NotSupportedError("Resource format '{resource_format}' for table '{resource_path}' is not supported.".format( + resource_format=data_resource['resFormat'], + resource_path=data_resource['resPath'], + )) + + if data is None: + raise FileNotFoundError("Data file for table '{resource_path}' cannot be found.".format( + resource_path=data_resource['resPath'], + )) + + data = container_pandas.DataFrame(data) + + assert column_names is not None + + semantic_types = [D3M_RESOURCE_TYPE_CONSTANTS_TO_SEMANTIC_TYPES[data_resource['resType']]] + + if data_resource['resID'] == 'learningData': + semantic_types.append('https://metadata.datadrivendiscovery.org/types/DatasetEntryPoint') + + metadata = metadata.update((data_resource['resID'],), { + 'structural_type': type(data), + 'semantic_types': semantic_types, + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': len(data), + }, + }) + + metadata = metadata.update((data_resource['resID'], metadata_base.ALL_ELEMENTS), { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': len(column_names), + }, + }) + + for i, column_name in enumerate(column_names): + metadata = metadata.update((data_resource['resID'], metadata_base.ALL_ELEMENTS, i), { + 'name': column_name, + 'structural_type': str, + }) + + metadata_columns = {} + for column in data_resource.get('columns', []): + metadata_columns[column['colIndex']] = column + + for i in range(len(column_names)): + if i in metadata_columns: + if column_names[i] != metadata_columns[i]['colName']: + raise ValueError("Mismatch between column name in data '{data_name}' and column name in metadata '{metadata_name}'.".format( + data_name=column_names[i], + metadata_name=metadata_columns[i]['colName'], + )) + + column_metadata = self._get_column_metadata(metadata_columns[i]) + else: + column_metadata = { + 'semantic_types': [ + D3M_COLUMN_TYPE_CONSTANTS_TO_SEMANTIC_TYPES['unknown'], + ], + } + + if 'https://metadata.datadrivendiscovery.org/types/Boundary' in column_metadata['semantic_types'] and 'boundary_for' not in column_metadata: + # Let's reconstruct for which column this is a boundary: currently + # this seems to be the first non-boundary column before this one. + for column_index in range(i - 1, 0, -1): + column_semantic_types = metadata.query((data_resource['resID'], metadata_base.ALL_ELEMENTS, column_index)).get('semantic_types', ()) + if 'https://metadata.datadrivendiscovery.org/types/Boundary' not in column_semantic_types: + column_metadata['boundary_for'] = { + 'resource_id': data_resource['resID'], + 'column_index': column_index, + } + break + + metadata = metadata.update((data_resource['resID'], metadata_base.ALL_ELEMENTS, i), column_metadata) + + current_boundary_start = None + current_boundary_list: typing.Tuple[str, ...] = None + column_index = 0 + while column_index < len(column_names): + column_semantic_types = metadata.query((data_resource['resID'], metadata_base.ALL_ELEMENTS, column_index)).get('semantic_types', ()) + if is_simple_boundary(column_semantic_types): + # Let's reconstruct which type of a boundary this is. Heuristic is simple. + # If there are two boundary columns next to each other, it is an interval. 
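+ # (Exactly two adjacent boundary columns are annotated with the two interval
+ # semantic types from INTERVAL_SEMANTIC_TYPES, by position; any other run of
+ # boundary columns is skipped without extra annotation.)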
+ if current_boundary_start is None: + assert current_boundary_list is None + + count = 1 + for next_column_index in range(column_index + 1, len(column_names)): + if is_simple_boundary(metadata.query((data_resource['resID'], metadata_base.ALL_ELEMENTS, next_column_index)).get('semantic_types', ())): + count += 1 + else: + break + + if count == 2: + current_boundary_start = column_index + current_boundary_list = INTERVAL_SEMANTIC_TYPES + else: + # Unsupported group of boundary columns, let's skip them all. + column_index += count + continue + + column_semantic_types = column_semantic_types + (current_boundary_list[column_index - current_boundary_start],) + metadata = metadata.update((data_resource['resID'], metadata_base.ALL_ELEMENTS, column_index), { + 'semantic_types': column_semantic_types, + }) + + if column_index - current_boundary_start + 1 == len(current_boundary_list): + current_boundary_start = None + current_boundary_list = None + + column_index += 1 + + return data, metadata + + def _load_resource_type_edgeList(self, dataset_path: str, data_resource: typing.Dict, metadata: metadata_base.DataMetadata, + hash: typing.Any) -> typing.Tuple[container_pandas.DataFrame, metadata_base.DataMetadata]: + assert not data_resource.get('isCollection', False) + + return self._load_resource_type_table(dataset_path, data_resource, metadata, hash) + + def _load_resource_type_graph( + self, dataset_path: str, data_resource: typing.Dict, metadata: metadata_base.DataMetadata, hash: typing.Any, + ) -> typing.Tuple[container_pandas.DataFrame, metadata_base.DataMetadata]: + assert not data_resource.get('isCollection', False) + + data_path = os.path.join(dataset_path, data_resource['resPath']) + collection_path = os.path.dirname(data_path) + filename = os.path.basename(data_path) + filename_extension = os.path.splitext(filename)[1] + + try: + media_type = FILE_EXTENSIONS[filename_extension] + except KeyError as error: + raise TypeError("Unsupported file extension for file '{filename}'.".format(filename=filename)) from error + + if hash is not None: + # We include both the filename and the content. + hash.update(data_resource['resPath'].encode('utf8')) + update_digest(hash, data_path) + + data = container_pandas.DataFrame({'filename': [filename]}, columns=['filename'], dtype=object) + + metadata = metadata.update((data_resource['resID'],), { + 'structural_type': type(data), + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/Table', + 'https://metadata.datadrivendiscovery.org/types/FilesCollection', + ], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': len(data), + }, + }) + + metadata = metadata.update((data_resource['resID'], metadata_base.ALL_ELEMENTS), { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 1, + }, + }) + + location_base_uri = utils.fix_uri(collection_path) + # We want to make sure you can just concat with the filename. 
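+ # (Entries in "location_base_uris" therefore always end with "/", so a row's
+ # filename can be appended directly to obtain the file's full URI.)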
+ if not location_base_uri.endswith('/'): + location_base_uri += '/' + + column_metadata = { + 'name': 'filename', + 'structural_type': str, + 'location_base_uris': [ + location_base_uri, + ], + 'media_types': [media_type], + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/PrimaryKey', + 'https://metadata.datadrivendiscovery.org/types/FileName', + D3M_RESOURCE_TYPE_CONSTANTS_TO_SEMANTIC_TYPES[data_resource['resType']], + ], + } + + metadata = metadata.update((data_resource['resID'], metadata_base.ALL_ELEMENTS, 0), column_metadata) + + return data, metadata + + def _get_column_metadata(self, column: typing.Dict) -> typing.Dict: + semantic_types = [D3M_COLUMN_TYPE_CONSTANTS_TO_SEMANTIC_TYPES[column['colType']]] + + for role in column['role']: + semantic_types.append(D3M_ROLE_CONSTANTS_TO_SEMANTIC_TYPES[role]) + + # Suggested target is an attribute by default. + if 'https://metadata.datadrivendiscovery.org/types/SuggestedTarget' in semantic_types and 'https://metadata.datadrivendiscovery.org/types/Attribute' not in semantic_types: + semantic_types.append('https://metadata.datadrivendiscovery.org/types/Attribute') + + # Suggested privileged data is an attribute by default. + if 'https://metadata.datadrivendiscovery.org/types/SuggestedPrivilegedData' in semantic_types and 'https://metadata.datadrivendiscovery.org/types/Attribute' not in semantic_types: + semantic_types.append('https://metadata.datadrivendiscovery.org/types/Attribute') + + column_metadata: typing.Dict[str, typing.Any] = { + 'semantic_types': semantic_types, + } + + if column.get('colDescription', None): + column_metadata['description'] = column['colDescription'] + + if column.get('refersTo', None): + if isinstance(column['refersTo']['resObject'], str): + if column['refersTo']['resObject'] == 'item': + # We represent collections as a table with one column of filenames. + column_metadata['foreign_key'] = { + 'type': 'COLUMN', + 'resource_id': column['refersTo']['resID'], + 'column_index': 0, + } + # Legacy (before v4.0.0) node reference. + elif column['refersTo']['resObject'] == 'node': + column_metadata['foreign_key'] = { + 'type': 'NODE_ATTRIBUTE', + 'resource_id': column['refersTo']['resID'], + 'node_attribute': 'nodeID', + } + # Legacy (before v4.0.0) edge reference. 
+ elif column['refersTo']['resObject'] == 'edge': + column_metadata['foreign_key'] = { + 'type': 'EDGE_ATTRIBUTE', + 'resource_id': column['refersTo']['resID'], + 'edge_attribute': 'edgeID', + } + else: + raise exceptions.UnexpectedValueError("Unknown \"resObject\" value: {resource_object}".format(resource_object=column['refersTo']['resObject'])) + else: + if 'columnIndex' in column['refersTo']['resObject']: + if 'https://metadata.datadrivendiscovery.org/types/Boundary' in semantic_types: + column_metadata['boundary_for'] = { + 'resource_id': column['refersTo']['resID'], + 'column_index': column['refersTo']['resObject']['columnIndex'], + } + else: + column_metadata['foreign_key'] = { + 'type': 'COLUMN', + 'resource_id': column['refersTo']['resID'], + 'column_index': column['refersTo']['resObject']['columnIndex'], + } + elif 'columnName' in column['refersTo']['resObject']: + if 'https://metadata.datadrivendiscovery.org/types/Boundary' in semantic_types: + column_metadata['boundary_for'] = { + 'resource_id': column['refersTo']['resID'], + 'column_name': column['refersTo']['resObject']['columnName'], + } + else: + column_metadata['foreign_key'] = { + 'type': 'COLUMN', + 'resource_id': column['refersTo']['resID'], + 'column_name': column['refersTo']['resObject']['columnName'], + } + elif 'nodeAttribute' in column['refersTo']['resObject']: + column_metadata['foreign_key'] = { + 'type': 'NODE_ATTRIBUTE', + 'resource_id': column['refersTo']['resID'], + 'node_attribute': column['refersTo']['resObject']['nodeAttribute'], + } + elif 'edgeAttribute' in column['refersTo']['resObject']: + column_metadata['foreign_key'] = { + 'type': 'EDGE_ATTRIBUTE', + 'resource_id': column['refersTo']['resID'], + 'edge_attribute': column['refersTo']['resObject']['edgeAttribute'], + } + else: + raise exceptions.UnexpectedValueError("Unknown \"resObject\" value: {resource_object}".format(resource_object=column['refersTo']['resObject'])) + + if column.get('timeGranularity', None): + # "units" is backwards compatible field name. + # See: https://gitlab.com/datadrivendiscovery/data-supply/issues/215 + unit = column['timeGranularity'].get('unit', column['timeGranularity'].get('units', None)) + column_metadata['time_granularity'] = { + 'value': column['timeGranularity']['value'], + 'unit': TIME_GRANULARITIES[unit], + } + + return column_metadata + + def _merge_score_targets(self, resources: typing.Dict, metadata: metadata_base.DataMetadata, dataset_path: str, hash: typing.Any) -> None: + targets_path = os.path.join(dataset_path, '..', 'targets.csv') + + targets = pandas.read_csv( + targets_path, + # We do not want to do any conversion of values at this point. + # This should be done by primitives later on. + dtype=str, + # We always expect one row header. + header=0, + # We want empty strings and not NaNs. + na_filter=False, + encoding='utf8', + low_memory=False, + memory_map=True, + ) + + for resource_id, resource in resources.items(): + # We assume targets are only in the dataset entry point. + if metadata.has_semantic_type((resource_id,), 'https://metadata.datadrivendiscovery.org/types/DatasetEntryPoint'): + contains_empty_values = {} + for column_name in targets.columns: + if column_name == 'd3mIndex': + continue + + contains_empty_values[column_name] = targets.loc[:, column_name].eq('').any() + + # We first make sure targets match resource in row order. At this stage all values + # are strings, so we can fill simply with empty strings if it happens that index + # values do not match (which in fact should never happen). 
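+ # (reindex() aligns the rows of "targets.csv" with the resource's "d3mIndex"
+ # order; indices missing from the targets file are filled with empty strings,
+ # which the check below then reports as a 'd3mIndex' mismatch.)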
+ reindexed_targets = targets.set_index('d3mIndex').reindex(resource.loc[:, 'd3mIndex'], fill_value='').reset_index() + + for column_name in reindexed_targets.columns: + if column_name == 'd3mIndex': + continue + + # We match columns based on their names. + if column_name in resource.columns: + if not contains_empty_values[column_name] and reindexed_targets.loc[:, column_name].eq('').any(): + raise exceptions.InvalidDatasetError("'d3mIndex' in 'targets.csv' does not match 'd3mIndex' in the resource '{resource_id}'.".format(resource_id=resource_id)) + + resource.loc[:, column_name] = reindexed_targets.loc[:, column_name] + + resources[resource_id] = resource + + +class CSVLoader(Loader): + """ + A class for loading a dataset from a CSV file. + + Loader supports both loading a dataset from a local file system or remote locations. + URI should point to a file with ``.csv`` file extension. + """ + + def can_load(self, dataset_uri: str) -> bool: + try: + parsed_uri = url_parse.urlparse(dataset_uri, allow_fragments=False) + except Exception: + return False + + if parsed_uri.scheme not in pandas_io_common._VALID_URLS: + return False + + if parsed_uri.scheme == 'file': + if parsed_uri.netloc not in ['', 'localhost']: + return False + + if not parsed_uri.path.startswith('/'): + return False + + for extension in ('', '.gz', '.bz2', '.zip', 'xz'): + if parsed_uri.path.endswith('.csv' + extension): + return True + + return False + + def _load_data(self, resources: typing.Dict, metadata: metadata_base.DataMetadata, *, dataset_uri: str, + compute_digest: ComputeDigest) -> typing.Tuple[metadata_base.DataMetadata, int, typing.Optional[str]]: + try: + buffer, compression, should_close = self._get_buffer_and_compression(dataset_uri) + except FileNotFoundError as error: + raise exceptions.DatasetNotFoundError("CSV dataset '{dataset_uri}' cannot be found.".format(dataset_uri=dataset_uri)) from error + except urllib_error.HTTPError as error: + if error.code == 404: + raise exceptions.DatasetNotFoundError("CSV dataset '{dataset_uri}' cannot be found.".format(dataset_uri=dataset_uri)) from error + else: + raise error + except urllib_error.URLError as error: + if isinstance(error.reason, FileNotFoundError): + raise exceptions.DatasetNotFoundError("CSV dataset '{dataset_uri}' cannot be found.".format(dataset_uri=dataset_uri)) from error + else: + raise error + + # CSV files do not have digest, so "ALWAYS" and "ONLY_IF_MISSING" is the same. + # Allowing "True" for backwards compatibility. + if compute_digest is True or compute_digest == ComputeDigest.ALWAYS or compute_digest == ComputeDigest.ONLY_IF_MISSING: + buffer_digest = self._get_digest(buffer) + else: + buffer_digest = None + + buffer_size = len(buffer.getvalue()) + + data = pandas.read_csv( + buffer, + # We do not want to do any conversion of values at this point. + # This should be done by primitives later on. + dtype=str, + # We always expect one row header. + header=0, + # We want empty strings and not NaNs. + na_filter=False, + compression=compression, + encoding='utf8', + low_memory=False, + ) + + if should_close: + try: + buffer.close() + except Exception: + pass + + if 'd3mIndex' not in data.columns: + # We do not update digest with new data generated here. This is OK because this data is determined by + # original data so original digest still applies. When saving a new digest has to be computed anyway + # because this data will have to be converted to string. 
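+ # Note: if the CSV has no "d3mIndex" column, a 0..N-1 integer index is
+ # generated here and annotated below as the table's primary key.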
+ data.insert(0, 'd3mIndex', range(len(data))) + d3m_index_generated = True + else: + d3m_index_generated = False + + data = container_pandas.DataFrame(data) + + resources['learningData'] = data + + metadata = metadata.update(('learningData',), { + 'structural_type': type(data), + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/Table', + 'https://metadata.datadrivendiscovery.org/types/DatasetEntryPoint', + ], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': len(data), + }, + }) + + metadata = metadata.update(('learningData', metadata_base.ALL_ELEMENTS), { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': len(data.columns), + }, + }) + + for i, column_name in enumerate(data.columns): + if i == 0 and d3m_index_generated: + metadata = metadata.update(('learningData', metadata_base.ALL_ELEMENTS, i), { + 'name': column_name, + 'structural_type': numpy.int64, + 'semantic_types': [ + 'http://schema.org/Integer', + 'https://metadata.datadrivendiscovery.org/types/PrimaryKey', + ], + }) + else: + metadata = metadata.update(('learningData', metadata_base.ALL_ELEMENTS, i), { + 'name': column_name, + 'structural_type': str, + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/UnknownType', + ], + }) + + return metadata, buffer_size, buffer_digest + + def _get_buffer_and_compression(self, dataset_uri: str) -> typing.Tuple[io.BytesIO, str, bool]: + if hasattr(pandas_io_common, 'infer_compression'): + infer_compression = pandas_io_common.infer_compression + else: + # Backwards compatibility for Pandas before 1.0.0. + infer_compression = pandas_io_common._infer_compression + compression = infer_compression(dataset_uri, 'infer') + buffer, _, compression, should_close = pandas_io_common.get_filepath_or_buffer(dataset_uri, 'utf8', compression) + + return buffer, compression, should_close + + def _get_digest(self, buffer: io.BytesIO) -> str: + return hashlib.sha256(buffer.getvalue()).hexdigest() + + # "strict_digest" is ignored, there is no metadata to compare digest against. + # "handle_score_split" is ignored as well. + def load(self, dataset_uri: str, *, dataset_id: str = None, dataset_version: str = None, dataset_name: str = None, lazy: bool = False, + compute_digest: ComputeDigest = ComputeDigest.ONLY_IF_MISSING, strict_digest: bool = False, handle_score_split: bool = True) -> 'Dataset': + assert self.can_load(dataset_uri) + + parsed_uri = url_parse.urlparse(dataset_uri, allow_fragments=False) + + # Pandas requires a host for "file" URIs. + if parsed_uri.scheme == 'file' and parsed_uri.netloc == '': + parsed_uri = parsed_uri._replace(netloc='localhost') + dataset_uri = url_parse.urlunparse(parsed_uri) + + dataset_size = None + dataset_digest = None + + resources: typing.Dict = {} + metadata = metadata_base.DataMetadata() + + if not lazy: + load_lazy = None + + metadata, dataset_size, dataset_digest = self._load_data( + resources, metadata, dataset_uri=dataset_uri, compute_digest=compute_digest, + ) + + else: + def load_lazy(dataset: Dataset) -> None: + # "dataset" can be used as "resources", it is a dict of values. 
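+ # (In lazy mode the returned Dataset starts out empty; this callback fills
+ # it in place with the actual data and metadata and then clears itself.)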
+ dataset.metadata, dataset_size, dataset_digest = self._load_data( + dataset, dataset.metadata, dataset_uri=dataset_uri, compute_digest=compute_digest, + ) + + new_metadata = { + 'dimension': {'length': len(dataset)}, + 'stored_size': dataset_size, + } + + if dataset_digest is not None: + new_metadata['digest'] = dataset_digest + + dataset.metadata = dataset.metadata.update((), new_metadata) + dataset.metadata = dataset.metadata.generate(dataset) + + dataset._load_lazy = None + + dataset_metadata = { + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': Dataset, + 'id': dataset_id or dataset_uri, + 'name': dataset_name or os.path.basename(parsed_uri.path), + 'location_uris': [ + dataset_uri, + ], + 'dimension': { + 'name': 'resources', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/DatasetResource'], + 'length': len(resources), + }, + } + + if dataset_version is not None: + dataset_metadata['version'] = dataset_version + + if dataset_size is not None: + dataset_metadata['stored_size'] = dataset_size + + if dataset_digest is not None: + dataset_metadata['digest'] = dataset_digest + + metadata = metadata.update((), dataset_metadata) + + return Dataset(resources, metadata, load_lazy=load_lazy) + + +class SklearnExampleLoader(Loader): + """ + A class for loading example scikit-learn datasets. + + URI should be of the form ``sklearn://``, where names come from + ``sklearn.datasets.load_*`` function names. + """ + + def can_load(self, dataset_uri: str) -> bool: + if dataset_uri.startswith('sklearn://'): + return True + + return False + + def _load_data(self, resources: typing.Dict, metadata: metadata_base.DataMetadata, *, dataset_path: str, + compute_digest: ComputeDigest) -> typing.Tuple[metadata_base.DataMetadata, typing.Optional[str], typing.Optional[str]]: + bunch = self._get_bunch(dataset_path) + + # Sklearn datasets do not have digest, so "ALWAYS" and "ONLY_IF_MISSING" is the same. + # Allowing "True" for backwards compatibility. + if compute_digest is True or compute_digest == ComputeDigest.ALWAYS or compute_digest == ComputeDigest.ONLY_IF_MISSING: + bunch_digest = self._get_digest(bunch) + else: + bunch_digest = None + + bunch_description = bunch.get('DESCR', None) or None + + bunch_data = bunch['data'] + bunch_target = bunch['target'] + + if len(bunch_data.shape) == 1: + bunch_data = bunch_data.reshape((bunch_data.shape[0], 1)) + if len(bunch_target.shape) == 1: + bunch_target = bunch_target.reshape((bunch_target.shape[0], 1)) + + column_names = [] + target_values = None + + if 'feature_names' in bunch: + for feature_name in bunch['feature_names']: + column_names.append(str(feature_name)) + + if 'target_names' in bunch: + if len(bunch['target_names']) == bunch_target.shape[1]: + for target_name in bunch['target_names']: + column_names.append(str(target_name)) + else: + target_values = [str(target_value) for target_value in bunch['target_names']] + + if target_values is not None: + converted_target = numpy.empty(bunch_target.shape, dtype=object) + + for i, row in enumerate(bunch_target): + for j, column in enumerate(row): + converted_target[i, j] = target_values[column] + else: + converted_target = bunch_target + + # Add names for any extra columns. We do not really check for duplicates because Pandas allow columns with the same name. 
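+ # (Columns not covered by "feature_names"/"target_names" are simply named
+ # "column <index>".)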
+ for i in range(len(column_names), bunch_data.shape[1] + converted_target.shape[1]): + column_names.append('column {i}'.format(i=i)) + + data = pandas.concat([pandas.DataFrame(bunch_data), pandas.DataFrame(converted_target)], axis=1) + data.columns = column_names + data = container_pandas.DataFrame(data) + + # We do not update digest with new data generated here. This is OK because this data is determined by + # original data so original digest still applies. When saving a new digest has to be computed anyway + # because this data will have to be converted to string. + data.insert(0, 'd3mIndex', range(len(data))) + + resources['learningData'] = data + + metadata = metadata.update(('learningData',), { + 'structural_type': type(data), + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/Table', + 'https://metadata.datadrivendiscovery.org/types/DatasetEntryPoint', + ], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': len(data), + }, + }) + + metadata = metadata.update(('learningData', metadata_base.ALL_ELEMENTS), { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': len(data.columns), + }, + }) + + metadata = metadata.update(('learningData', metadata_base.ALL_ELEMENTS, 0), { + 'name': 'd3mIndex', + 'structural_type': numpy.int64, + 'semantic_types': [ + 'http://schema.org/Integer', + 'https://metadata.datadrivendiscovery.org/types/PrimaryKey', + ], + }) + + for column_index in range(1, bunch_data.shape[1] + 1): + column_metadata: typing.Dict[str, typing.Any] = { + 'structural_type': bunch_data.dtype.type, + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/UnknownType', + 'https://metadata.datadrivendiscovery.org/types/Attribute', + ], + 'name': data.columns[column_index], + } + + metadata = metadata.update(('learningData', metadata_base.ALL_ELEMENTS, column_index), column_metadata) + + for column_index in range(bunch_data.shape[1] + 1, bunch_data.shape[1] + bunch_target.shape[1] + 1): + if target_values is not None: + if len(target_values) == 2: + column_type = ['http://schema.org/Boolean'] + elif len(target_values) > 2: + column_type = ['https://metadata.datadrivendiscovery.org/types/CategoricalData'] + else: + raise exceptions.InvalidDatasetError("Too few target values in sklearn dataset.") + else: + column_type = ['https://metadata.datadrivendiscovery.org/types/UnknownType'] + + column_metadata = { + 'structural_type': str if target_values is not None else bunch_target.dtype.type, + 'semantic_types': column_type + [ + 'https://metadata.datadrivendiscovery.org/types/SuggestedTarget', + 'https://metadata.datadrivendiscovery.org/types/Attribute', + ], + 'name': data.columns[column_index], + } + + metadata = metadata.update(('learningData', metadata_base.ALL_ELEMENTS, column_index), column_metadata) + + return metadata, bunch_description, bunch_digest + + def _get_digest(self, bunch: typing.Dict) -> str: + hash = hashlib.sha256() + + hash.update(bunch['data'].tobytes()) + hash.update(bunch['target'].tobytes()) + + if 'feature_names' in bunch: + if isinstance(bunch['feature_names'], list): + for feature_name in bunch['feature_names']: + hash.update(feature_name.encode('utf8')) + else: + hash.update(bunch['feature_names'].tobytes()) + + if 'target_names' in bunch: + if isinstance(bunch['target_names'], list): + for target_name in bunch['target_names']: + hash.update(target_name.encode('utf8')) 
+ else: + hash.update(bunch['target_names'].tobytes()) + + if 'DESCR' in bunch: + hash.update(bunch['DESCR'].encode('utf8')) + + return hash.hexdigest() + + def _get_bunch(self, dataset_path: str) -> typing.Dict: + return getattr(datasets, 'load_{dataset_path}'.format(dataset_path=dataset_path))() + + # "strict_digest" is ignored, there is no metadata to compare digest against. + # "handle_score_split is ignored as well. + def load(self, dataset_uri: str, *, dataset_id: str = None, dataset_version: str = None, dataset_name: str = None, lazy: bool = False, + compute_digest: ComputeDigest = ComputeDigest.ONLY_IF_MISSING, strict_digest: bool = False, handle_score_split: bool = True) -> 'Dataset': + assert self.can_load(dataset_uri) + + dataset_path = dataset_uri[len('sklearn://'):] + + if not hasattr(datasets, 'load_{dataset_path}'.format(dataset_path=dataset_path)): + raise exceptions.DatasetNotFoundError("Sklearn dataset '{dataset_uri}' cannot be found.".format(dataset_uri=dataset_uri)) + + dataset_description = None + dataset_digest = None + + resources: typing.Dict = {} + metadata = metadata_base.DataMetadata() + + if not lazy: + load_lazy = None + + metadata, dataset_description, dataset_digest = self._load_data( + resources, metadata, dataset_path=dataset_path, compute_digest=compute_digest, + ) + + else: + def load_lazy(dataset: Dataset) -> None: + # "dataset" can be used as "resources", it is a dict of values. + dataset.metadata, dataset_description, dataset_digest = self._load_data( + dataset, dataset.metadata, dataset_path=dataset_path, compute_digest=compute_digest, + ) + + new_metadata: typing.Dict = { + 'dimension': {'length': len(dataset)}, + } + + if dataset_description is not None: + new_metadata['description'] = dataset_description + + if dataset_digest is not None: + new_metadata['digest'] = dataset_digest + + dataset.metadata = dataset.metadata.update((), new_metadata) + dataset.metadata = dataset.metadata.generate(dataset) + + dataset._load_lazy = None + + dataset_metadata = { + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': Dataset, + 'id': dataset_id or dataset_uri, + 'name': dataset_name or dataset_path, + 'location_uris': [ + dataset_uri, + ], + 'dimension': { + 'name': 'resources', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/DatasetResource'], + 'length': len(resources), + }, + } + + if dataset_version is not None: + dataset_metadata['version'] = dataset_version + + if dataset_description is not None: + dataset_metadata['description'] = dataset_description + + if dataset_digest is not None: + dataset_metadata['digest'] = dataset_digest + + metadata = metadata.update((), dataset_metadata) + + return Dataset(resources, metadata, load_lazy=load_lazy) + + +class D3MDatasetSaver(Saver): + """ + A class for saving of D3M datasets. + + This saver supports only saving to local file system. + URI should point to the ``datasetDoc.json`` file in the D3M dataset directory. 
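+
+    As an illustrative sketch (the output path is arbitrary and must not already
+    exist, because the saver creates the dataset directory itself)::
+
+        dataset = Dataset.load('sklearn://iris')
+        dataset.save('file:///tmp/iris_d3m/datasetDoc.json')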
+ """ + + VERSION = '4.1.0' + + def can_save(self, dataset_uri: str) -> bool: + if not self._is_dataset(dataset_uri): + return False + + if not self._is_local_file(dataset_uri): + return False + + return True + + def _is_dataset(self, uri: str) -> bool: + try: + parsed_uri = url_parse.urlparse(uri, allow_fragments=False) + except Exception: + return False + + if os.path.basename(parsed_uri.path) != 'datasetDoc.json': + return False + + return True + + def _is_local_file(self, uri: str) -> bool: + try: + parsed_uri = url_parse.urlparse(uri, allow_fragments=False) + except Exception: + return False + + if parsed_uri.scheme != 'file': + return False + + if parsed_uri.netloc not in ['', 'localhost']: + return False + + if not parsed_uri.path.startswith('/'): + return False + + return True + + def _get_column_description(self, column_index: int, column_name: str, column_metadata: typing.Dict) -> typing.Dict: + column = { + 'colIndex': column_index, + 'colName': column_name, + 'role': [SEMANTIC_TYPES_TO_D3M_ROLES[x] for x in column_metadata.get('semantic_types', []) if x in SEMANTIC_TYPES_TO_D3M_ROLES] + } + column_type = [SEMANTIC_TYPES_TO_D3M_COLUMN_TYPES[semantic_type] for semantic_type in column_metadata.get('semantic_types', []) if semantic_type in SEMANTIC_TYPES_TO_D3M_COLUMN_TYPES] + + # If column semantic_type is not specified we default to unknown type. + if not column_type: + if 'structural_type' in column_metadata: + if utils.is_int(column_metadata['structural_type']): + column['colType'] = SEMANTIC_TYPES_TO_D3M_COLUMN_TYPES['http://schema.org/Integer'] + elif utils.is_float(column_metadata['structural_type']): + column['colType'] = SEMANTIC_TYPES_TO_D3M_COLUMN_TYPES['http://schema.org/Float'] + elif issubclass(column_metadata['structural_type'], bool): + column['colType'] = SEMANTIC_TYPES_TO_D3M_COLUMN_TYPES['http://schema.org/Boolean'] + else: + column['colType'] = SEMANTIC_TYPES_TO_D3M_COLUMN_TYPES['https://metadata.datadrivendiscovery.org/types/UnknownType'] + else: + column['colType'] = SEMANTIC_TYPES_TO_D3M_COLUMN_TYPES['https://metadata.datadrivendiscovery.org/types/UnknownType'] + elif len(column_type) == 1: + column['colType'] = column_type[0] + else: + raise exceptions.InvalidMetadataError( + "More than one semantic type found for column type: {column_type}".format( + column_type=column_type, + ), + ) + + if column_metadata.get('description', None): + column['colDescription'] = column_metadata['description'] + + return column + + def _get_collection_resource_description(self, dataset: 'Dataset', resource_id: str, resource: typing.Any, dataset_location_base_path: typing.Optional[str]) -> typing.Dict: + if not isinstance(resource, container_pandas.DataFrame): + raise exceptions.InvalidArgumentTypeError("Saving a D3M dataset with a collection resource which is not a DataFrame, but '{structural_type}'.".format( + structural_type=type(resource), + )) + if len(resource.columns) != 1: + raise exceptions.InvalidArgumentTypeError("Saving a D3M dataset with a collection resource with an invalid number of columns: {columns}".format( + columns=len(resource.columns), + )) + if not dataset.metadata.has_semantic_type((resource_id, metadata_base.ALL_ELEMENTS, 0), 'https://metadata.datadrivendiscovery.org/types/FileName'): + raise exceptions.InvalidArgumentTypeError("Saving a D3M dataset with a collection resource with with a column which does not contain filenames.") + + selector = (resource_id, metadata_base.ALL_ELEMENTS, 0) + metadata, exceptions_with_selectors = 
dataset.metadata.query_with_exceptions(selector) + + # We check structural type for all rows in a column, but also if any row has a different structural type. + for structural_type in [metadata['structural_type']] + [metadata['structural_type'] for metadata in exceptions_with_selectors.values() if 'structural_type' in metadata]: + if not issubclass(structural_type, str): + raise exceptions.InvalidArgumentTypeError("Saving a D3M dataset with a collection resource with with a column which does not just string values, but also '{structural_type}'.".format( + structural_type=structural_type, + )) + + # We use "location_base_uris" from all rows. We only support "location_base_uris" + # being the same for all rows, so we have to verify that. + all_location_base_uris_nested = [ + list(metadata.get('location_base_uris', [])) + ] + [ + list(metadata['location_base_uris']) for metadata in exceptions_with_selectors.values() if 'location_base_uris' in metadata + ] + + # Flatten the list of lists, remove duplicates, sort for reproducibility. + all_location_base_uris = sorted({all_location_base_uri for all_location_base_uri in itertools.chain.from_iterable(all_location_base_uris_nested)}) + + local_location_base_uris = [location_base_uri for location_base_uri in all_location_base_uris if self._is_local_file(location_base_uri)] + + if not local_location_base_uris: + raise exceptions.NotSupportedError( + "Saving a D3M dataset with a collection resource without local files is not supported: {all_location_base_uris}".format( + all_location_base_uris=all_location_base_uris, + ), + ) + elif len(local_location_base_uris) > 1: + # When there are multiple base locations in D3M dataset format can lead to conflicts + # where same filename in a column points to different files, but we are storing them + # under the same resource path. We verify that there are no conflicts in "_save_collection". + # Because there is no clear way to determine the best common resource path we use a hard-coded one. + resource_path = 'files/' + elif dataset_location_base_path is None: + # We cannot determine the resource path so we use a hard-coded one. + resource_path = 'files/' + else: + location_base_path = url_parse.urlparse(local_location_base_uris[0], allow_fragments=False).path + + # This is a way to check that "dataset_location_base_path" is a prefix of "location_base_path". + if os.path.commonpath([location_base_path, dataset_location_base_path]) != dataset_location_base_path: + raise exceptions.NotSupportedError( + "Saving a D3M dataset with a collection resource with files location not under the dataset directory.", + ) + + resource_path = location_base_path[len(dataset_location_base_path) + 1:] + + # Just a matter of style. + if not resource_path.endswith('/'): + resource_path += '/' + + resource_formats_set = set() + # "media_types" for "ALL_ELEMENTS" is an union of all rows. + for media_type in metadata.get('media_types', []): + # We allow unknown media types. + resource_formats_set.add(MEDIA_TYPES_REVERSE.get(media_type, media_type)) + + resource_formats = {} + + # An empty collection? Or just a collection resource without metadata? + if not resource_formats_set: + if len(resource): + raise ValueError("A collection resource without media types metadata.") + + # An optimized case, all files in a collection belong to the same resource format. 
+ elif len(resource_formats_set) == 1: + file_extensions_set = set() + for filename in resource.iloc[:, 0]: + root, ext = os.path.splitext(filename) + if not ext: + raise ValueError("A filename without a file extension in a collection resource: {filename}".format(filename=filename)) + ext = _remove_extension_dot(ext) + file_extensions_set.add(ext) + + # Sorting to have reproducibility. + resource_formats[resource_formats_set.pop()] = sorted(file_extensions_set) + + else: + resource_formats_of_sets: typing.Dict[str, typing.Set] = {} + + for row_index, filename in enumerate(resource.iloc[:, 0]): + root, ext = os.path.splitext(filename) + if not ext: + raise ValueError("A filename without a file extension in a collection resource: {filename}".format(filename=filename)) + ext = _remove_extension_dot(ext) + + try: + media_types = dataset.metadata.query((resource_id, row_index, 0))['media_types'] + except KeyError: + raise ValueError("A collection resource without media types metadata for row {row_index}.".format(row_index=row_index)) from None + + if len(media_types) != 1: + raise ValueError("Medata should have only one media type per row in a collection resource, at row {row_index}: {media_types}".format(row_index=row_index, media_types=media_types)) + + # We allow unknown media types. + resource_format = MEDIA_TYPES_REVERSE.get(media_types[0], media_types[0]) + + if resource_format not in resource_formats_of_sets: + resource_formats_of_sets[resource_format] = set() + + resource_formats_of_sets[resource_format].add(ext) + + for resource_format, file_extensions in resource_formats_of_sets.items(): + # Sorting to have reproducibility. + resource_formats[resource_format] = sorted(file_extensions) + + resource_type = [SEMANTIC_TYPES_TO_D3M_RESOURCE_TYPES[semantic_type] for semantic_type in metadata.get('semantic_types', []) if semantic_type in SEMANTIC_TYPES_TO_D3M_RESOURCE_TYPES] + + if len(resource_type) != 1: + raise exceptions.InvalidMetadataError( + "Not exactly one semantic type found for resource type: {resource_type}".format( + resource_type=resource_type, + ), + ) + + resource_description = { + 'resID': resource_id, + 'isCollection': True, + 'resFormat': resource_formats, + 'resType': resource_type[0], + 'resPath': resource_path, + } + + columns = self._get_columns_description(dataset, resource_id, resource) + + if columns: + resource_description['columns'] = columns + + if 'file_columns_count' in metadata: + resource_description['columnsCount'] = metadata['file_columns_count'] + + return resource_description + + # We do not use "dataset_location_base_path" but we keep it for all "_get_*_resource_description" methods to have the same signature. 
+ def _get_dataframe_resource_description(self, dataset: 'Dataset', resource_id: str, resource: typing.Any, dataset_location_base_path: typing.Optional[str]) -> typing.Dict: + if dataset.metadata.has_semantic_type((resource_id,), 'https://metadata.datadrivendiscovery.org/types/EdgeList'): + res_type = 'edgeList' + else: + res_type = 'table' + + resource_description = { + 'resID': resource_id, + 'isCollection': False, + 'resFormat': {'text/csv': ['csv']}, + 'resType': res_type, + 'columnsCount': len(resource.columns), + } + + if dataset.metadata.has_semantic_type((resource_id,), 'https://metadata.datadrivendiscovery.org/types/DatasetEntryPoint'): + if resource_id != 'learningData': + logger.error("Saving a dataset with a dataset entry point with resource ID not equal to 'learningData', but '%(resource_id)s'.", {'resource_id': resource_id}) + resource_description['resPath'] = 'tables/learningData.csv' + else: + resource_description['resPath'] = 'tables/{resource_id}.csv'.format(resource_id=resource_id) + + columns = self._get_columns_description(dataset, resource_id, resource) + + if columns: + resource_description['columns'] = columns + + return resource_description + + # TODO: Make it easier to subclass to support other resource types. + def _get_resource_description(self, dataset: 'Dataset', resource_id: str, resource: typing.Any, dataset_location_base_path: typing.Optional[str]) -> typing.Dict: + if dataset.metadata.has_semantic_type((resource_id,), 'https://metadata.datadrivendiscovery.org/types/FilesCollection'): + return self._get_collection_resource_description(dataset, resource_id, resource, dataset_location_base_path) + + elif isinstance(resource, container_pandas.DataFrame): + return self._get_dataframe_resource_description(dataset, resource_id, resource, dataset_location_base_path) + + else: + raise exceptions.NotSupportedError("Saving a D3M dataset with a resource with structural type '{structural_type}' is not supported.".format(structural_type=type(resource))) + + def _get_columns_description(self, dataset: 'Dataset', resource_id: str, resource: typing.Any) -> typing.List[typing.Dict]: + columns = [] + + # Traverse file columns in collections. + if dataset.metadata.has_semantic_type((resource_id,), 'https://metadata.datadrivendiscovery.org/types/FilesCollection'): + # We know there is only one column here. This has been verified in "_get_collection_resource_description". + column_metadata = dataset.metadata.query((resource_id, metadata_base.ALL_ELEMENTS, 0)) + for file_column_metadata in column_metadata.get('file_columns', []): + columns.append(self._get_column_description(file_column_metadata['column_index'], file_column_metadata['column_name'], file_column_metadata)) + + # Traverse columns in a DataFrame. 
+ elif isinstance(resource, container_pandas.DataFrame): + number_of_columns = len(resource.columns) + for column_index in range(number_of_columns): + column_selector = (resource_id, metadata_base.ALL_ELEMENTS, column_index) + column_metadata = dataset.metadata.query(column_selector) + + column = self._get_column_description(column_index, column_metadata['name'], column_metadata) + + if 'boundary_for' in column_metadata and 'foreign_key' in column_metadata: + raise exceptions.NotSupportedError("Both boundary and foreign key are not supported.") + + elif 'foreign_key' in column_metadata: + if column_metadata['foreign_key']['type'] == 'COLUMN': + refers_to = { + 'resID': column_metadata['foreign_key']['resource_id'], + 'resObject': {}, + } + + if 'column_name' in column_metadata['foreign_key']: + refers_to['resObject'] = { + 'columnName': column_metadata['foreign_key']['column_name'], + } + referring_column_index = dataset.metadata.get_column_index_from_column_name( + column_metadata['foreign_key']['column_name'], + at=(column_metadata['foreign_key']['resource_id'],), + ) + elif 'column_index' in column_metadata['foreign_key']: + refers_to['resObject'] = { + 'columnIndex': column_metadata['foreign_key']['column_index'], + } + referring_column_index = column_metadata['foreign_key']['column_index'] + else: + raise exceptions.InvalidMetadataError(f"'foreign_key' is missing a column reference, in metadata of column {column_index} of resource '{resource_id}'.") + + # A special case to handle a reference to a file collection. + if dataset.metadata.has_semantic_type( + (column_metadata['foreign_key']['resource_id'],), + 'https://metadata.datadrivendiscovery.org/types/FilesCollection', + ) and dataset.metadata.has_semantic_type( + (column_metadata['foreign_key']['resource_id'], metadata_base.ALL_ELEMENTS, referring_column_index), + 'https://metadata.datadrivendiscovery.org/types/FileName', + ): + refers_to['resObject'] = 'item' + + column['refersTo'] = refers_to + + elif column_metadata['foreign_key']['type'] == 'NODE_ATTRIBUTE': + column['refersTo'] = { + 'resID': column_metadata['foreign_key']['resource_id'], + 'resObject': { + 'nodeAttribute': column_metadata['foreign_key']['node_attribute'], + }, + } + + elif column_metadata['foreign_key']['type'] == 'EDGE_ATTRIBUTE': + column['refersTo'] = { + 'resID': column_metadata['foreign_key']['resource_id'], + 'resObject': { + 'edgeAttribute': column_metadata['foreign_key']['edge_attribute'], + }, + } + + elif 'boundary_for' in column_metadata: + refers_to = { + # "resource_id" is optional in our metadata and it + # means the reference is local to the resource. 
+ 'resID': column_metadata['boundary_for'].get('resource_id', resource_id), + 'resObject': {}, + } + + if 'column_name' in column_metadata['boundary_for']: + refers_to['resObject'] = { + 'columnName': column_metadata['boundary_for']['column_name'], + } + elif 'column_index' in column_metadata['boundary_for']: + refers_to['resObject'] = { + 'columnIndex': column_metadata['boundary_for']['column_index'], + } + else: + raise exceptions.InvalidMetadataError(f"'boundary_for' is missing a column reference, in metadata of column {column_index} of resource '{resource_id}'.") + + column['refersTo'] = refers_to + + if 'time_granularity' in column_metadata: + try: + column['timeGranularity'] = { + 'value': column_metadata['time_granularity']['value'], + 'unit': TIME_GRANULARITIES_REVERSE[column_metadata['time_granularity']['unit']], + } + except KeyError as error: + raise exceptions.InvalidMetadataError(f"'time_granularity' is invalid, in metadata of column {column_index} of resource '{resource_id}'.") from error + + columns.append(column) + + return columns + + def _get_dataset_description(self, dataset: 'Dataset') -> typing.Dict: + dataset_description: typing.Dict[str, typing.Any] = { + 'about': { + 'datasetSchemaVersion': self.VERSION, + }, + } + + dataset_root_metadata = dataset.metadata.query(()) + + for d3m_path, (dataset_path, required) in D3M_TO_DATASET_FIELDS.items(): + value = utils.get_dict_path(dataset_root_metadata, dataset_path) + if value is not None: + utils.set_dict_path(dataset_description, d3m_path, value) + elif required: + raise exceptions.InvalidMetadataError(f"Dataset metadata field '{'.'.join(dataset_path)}' is required when saving.") + + for x in [dataset_root_metadata.get('stored_size', None), dataset_description['about'].get('approximateSize', None)]: + if x is not None: + exponent = int((math.log10(x) // 3) * 3) + try: + unit = SIZE_TO_UNITS[exponent] + except KeyError as error: + raise KeyError("Unit string for '{exponent}' not found in lookup dictionary {SIZE_TO_UNITS}.".format(exponent=exponent, SIZE_TO_UNITS=SIZE_TO_UNITS)) from error + dataset_description['about']['approximateSize'] = str(x // (10 ** exponent)) + ' ' + unit + break + + # We are only using the first URI due to design of D3M dataset format. Remaining URIs should be stored in qualities. + if dataset_root_metadata.get('source', {}).get('uris', []): + dataset_description['about']['sourceURI'] = dataset_root_metadata['source']['uris'][0] + + dataset_location_uris = [location_uri for location_uri in dataset_root_metadata.get('location_uris', []) if self._is_local_file(location_uri)] + + if dataset_location_uris: + # If there are multiple local URIs, we pick the first. + dataset_location_base_path = os.path.dirname(url_parse.urlparse(dataset_location_uris[0], allow_fragments=False).path) + else: + dataset_location_base_path = None + + data_resources = [] + + for resource_id, resource in dataset.items(): + resource_description = self._get_resource_description(dataset, resource_id, resource, dataset_location_base_path) + + data_resources.append(resource_description) + + dataset_description['dataResources'] = data_resources + + return dataset_description + + def _generate_metadata_qualities(self, dataset: 'Dataset') -> typing.List: + # We start with canonical metadata. + metadata_to_save = dataset._canonical_metadata(dataset.metadata) + + # We remove digest. 
+ metadata_to_save = metadata_to_save.update((), {'digest': metadata_base.NO_VALUE}) + + for resource_id, resource in dataset.items(): + if isinstance(resource, container_pandas.DataFrame): + # All columns in the DataFrame will be saved as strings, so we have to update + # metadata first to reflect that, before we save metadata. + metadata_to_save = metadata_to_save.update((resource_id, metadata_base.ALL_ELEMENTS, metadata_base.ALL_ELEMENTS), {'structural_type': str}) + + qualities = [] + for metadata_entry in metadata_to_save.to_internal_json_structure(): + restricted_to = { + 'resID': metadata_entry['selector'][0] if metadata_entry['selector'] else '', + } + + if metadata_entry['selector']: + restricted_to['resComponent'] = { + 'selector': metadata_entry['selector'][1:], + } + + qualities.append({ + 'qualName': 'metadata', + 'qualValue': metadata_entry['metadata'], + 'qualValueType': 'dict', + 'restrictedTo': restricted_to, + }) + + return qualities + + def save(self, dataset: 'Dataset', dataset_uri: str, *, compute_digest: ComputeDigest = ComputeDigest.ALWAYS, preserve_metadata: bool = True) -> None: + assert self.can_save(dataset_uri) + + dataset_description = self._get_dataset_description(dataset) + + if preserve_metadata: + dataset_description['qualities'] = self._generate_metadata_qualities(dataset) + + dataset_path = os.path.dirname(url_parse.urlparse(dataset_uri, allow_fragments=False).path) + os.makedirs(dataset_path, 0o755, exist_ok=False) + + dataset_description_path = os.path.join(dataset_path, 'datasetDoc.json') + + # We use "x" mode to make sure file does not already exist. + with open(dataset_description_path, 'x', encoding='utf8') as f: + json.dump(dataset_description, f, indent=2, allow_nan=False) + + for resource_description in dataset_description['dataResources']: + resource_id = resource_description['resID'] + resource = dataset[resource_id] + + self._save_resource(dataset, dataset_uri, dataset_path, resource_description, resource_id, resource) + + # We calculate digest of the new dataset and write it into datasetDoc.json + dataset_description['about']['digest'] = get_d3m_dataset_digest(dataset_description_path) + with open(dataset_description_path, 'w', encoding='utf8') as f: + json.dump(dataset_description, f, indent=2, allow_nan=False) + + # TODO: Make it easier to subclass to support non-local "location_base_uris". + def _save_collection(self, dataset: 'Dataset', dataset_uri: str, dataset_path: str, resource_description: typing.Dict, resource_id: str, resource: typing.Any) -> None: + # Here we can assume collection resource is a DataFrame which contains exactly one + # column containing filenames. This has been verified in "_get_collection_resource_description". + assert isinstance(resource, container_pandas.DataFrame), type(resource) + assert len(resource.columns) == 1, resource.columns + + already_copied: typing.Set[typing.Tuple[str, str]] = set() + linking_warning_issued = False + + for row_index, filename in enumerate(resource.iloc[:, 0]): + # "location_base_uris" is required for collections. + location_base_uris = dataset.metadata.query((resource_id, row_index, 0))['location_base_uris'] + + local_location_base_uris = [location_base_uri for location_base_uri in location_base_uris if self._is_local_file(location_base_uri)] + + # We verified in "_get_collection_resource_description" that there is only one local URI. 
+ assert len(local_location_base_uris) == 1, local_location_base_uris + local_location_base_uri = local_location_base_uris[0] + + # "location_base_uris" should be made so that we can just concat with the filename + # ("location_base_uris" end with "/"). + source_uri = local_location_base_uri + filename + source_path = url_parse.urlparse(source_uri, allow_fragments=False).path + + destination_path = os.path.join(dataset_path, resource_description['resPath'], filename) + + # Multiple rows can point to the same file, so we do not have to copy them multiple times. + if (source_path, destination_path) in already_copied: + continue + + os.makedirs(os.path.dirname(destination_path), 0o755, exist_ok=True) + + linked = False + + try: + os.link(source_path, destination_path) + linked = True + + except FileExistsError as error: + # If existing file is the same, then this is OK. Multiple rows can point to the same file. + if os.path.samefile(source_path, destination_path): + linked = True + elif filecmp.cmp(source_path, destination_path, shallow=False): + linked = True + # But otherwise we raise an exception. + else: + raise exceptions.AlreadyExistsError( + "Destination file '{destination_path}' already exists with different content than '{source_path}' has.".format( + destination_path=destination_path, + source_path=source_path, + ), + ) from error + + except OSError as error: + # OSError: [Errno 18] Invalid cross-device link + if error.errno == errno.EXDEV: + pass + else: + raise error + + # If we can't make a hard-link we try to copy the file. + if not linked: + if not linking_warning_issued: + linking_warning_issued = True + logger.warning("Saving dataset to '%(dataset_uri)s' cannot use hard-linking.", {'dataset_uri': dataset_uri}) + + try: + with open(source_path, 'rb') as source_file: + with open(destination_path, 'xb') as destination_file: + shutil.copyfileobj(source_file, destination_file) + + except FileExistsError as error: + # If existing file is the same, then this is OK. Multiple rows can point to the same file. + if os.path.samefile(source_path, destination_path): + pass + elif filecmp.cmp(source_path, destination_path, shallow=False): + pass + # But otherwise we raise an exception. + else: + raise exceptions.AlreadyExistsError( + "Destination file '{destination_path}' already exists with different content than '{source_path}' has.".format( + destination_path=destination_path, + source_path=source_path, + ), + ) from error + + already_copied.add((source_path, destination_path)) + + # TODO: Make it easier to subclass to support other column types. + def _save_dataframe(self, dataset: 'Dataset', dataset_path: str, resource_description: typing.Dict, resource_id: str, resource: typing.Any) -> None: + destination_path = os.path.join(dataset_path, resource_description['resPath']) + # A subset of "simple_data_types". + # TODO: Support additional types. + # Dicts we can try to convert to "json" column type. Lists of floats we can convert to "realVector". + # We could also probably support boolean values. + supported_column_structural_types = (str, float, int, numpy.integer, numpy.float64, numpy.bool_, type(None)) + + # We verify if structural types of columns are supported. 
+ for column_index in range(dataset.metadata.query((resource_id, metadata_base.ALL_ELEMENTS))['dimension']['length']): + selector = (resource_id, metadata_base.ALL_ELEMENTS, column_index) + metadata, exceptions_with_selectors = dataset.metadata.query_with_exceptions(selector) + + # We check structural type for all rows in a column, but also if any row has a different structural type. + for structural_type in [metadata['structural_type']] + [metadata['structural_type'] for metadata in exceptions_with_selectors.values() if 'structural_type' in metadata]: + if not issubclass(structural_type, supported_column_structural_types): + raise exceptions.NotSupportedError("Saving a D3M dataset with a column with structural type '{structural_type}' is not supported.".format(structural_type=structural_type)) + + os.makedirs(os.path.dirname(destination_path), 0o755, exist_ok=True) + + # We use "x" mode to make sure file does not already exist. + resource.to_csv(destination_path, mode='x', encoding='utf8') + + # TODO: Make it easier to subclass to support other resource types. + def _save_resource(self, dataset: 'Dataset', dataset_uri: str, dataset_path: str, resource_description: typing.Dict, resource_id: str, resource: typing.Any) -> None: + if resource_description.get('isCollection', False): + self._save_collection(dataset, dataset_uri, dataset_path, resource_description, resource_id, resource) + + elif isinstance(resource, container_pandas.DataFrame): + self._save_dataframe(dataset, dataset_path, resource_description, resource_id, resource) + + else: + raise exceptions.NotSupportedError("Saving a D3M dataset with a resource with structural type '{structural_type}' is not supported.".format(structural_type=type(resource))) + + +D = typing.TypeVar('D', bound='Dataset') + + +# TODO: It should be probably immutable. +class Dataset(dict): + """ + A class representing a dataset. + + Internally, it is a dictionary containing multiple resources (e.g., tables). + + Parameters + ---------- + resources: + A map from resource IDs to resources. + metadata: + Metadata associated with the ``data``. + load_lazy: + If constructing a lazy dataset, calling this function will read all the + data and convert the dataset to a non-lazy one. + generate_metadata: bool + Automatically generate and update the metadata. + check: + DEPRECATED: argument ignored. + source: + DEPRECATED: argument ignored. + timestamp: + DEPRECATED: argument ignored. + """ + + metadata: metadata_base.DataMetadata = None + loaders: typing.List[Loader] = [ + D3MDatasetLoader(), + CSVLoader(), + SklearnExampleLoader(), + OpenMLDatasetLoader(), + ] + savers: typing.List[Saver] = [ + D3MDatasetSaver(), + ] + + @deprecate.arguments('source', 'timestamp', 'check', message="argument ignored") + def __init__(self, resources: typing.Mapping, metadata: metadata_base.DataMetadata = None, *, + load_lazy: typing.Callable[['Dataset'], None] = None, generate_metadata: bool = False, + check: bool = True, source: typing.Any = None, timestamp: datetime.datetime = None) -> None: + super().__init__(resources) + + if isinstance(resources, Dataset) and metadata is None: + # We made a copy, so we do not have to generate metadata. + self.metadata = resources.metadata + elif metadata is not None: + # We were provided metadata, so we do not have to generate metadata. 
+ self.metadata = metadata + else: + self.metadata = metadata_base.DataMetadata() + if generate_metadata: + self.metadata = self.metadata.generate(self) + + self._load_lazy = load_lazy + + @classmethod + def load(cls, dataset_uri: str, *, dataset_id: str = None, dataset_version: str = None, dataset_name: str = None, lazy: bool = False, + compute_digest: ComputeDigest = ComputeDigest.ONLY_IF_MISSING, strict_digest: bool = False, handle_score_split: bool = True) -> 'Dataset': + """ + Tries to load dataset from ``dataset_uri`` using all registered dataset loaders. + + Parameters + ---------- + dataset_uri: + A URI to load. + dataset_id: + Override dataset ID determined by the loader. + dataset_version: + Override dataset version determined by the loader. + dataset_name: + Override dataset name determined by the loader. + lazy: + If ``True``, load only top-level metadata and not whole dataset. + compute_digest: + Compute a digest over the data? + strict_digest: + If computed digest does not match the one provided in metadata, raise an exception? + handle_score_split: + If a scoring dataset has target values in a separate file, merge them in? + + Returns + ------- + A loaded dataset. + """ + + for loader in cls.loaders: + if loader.can_load(dataset_uri): + return loader.load( + dataset_uri, dataset_id=dataset_id, dataset_version=dataset_version, dataset_name=dataset_name, + lazy=lazy, compute_digest=compute_digest, strict_digest=strict_digest, handle_score_split=handle_score_split, + ) + + raise exceptions.DatasetUriNotSupportedError( + "No known loader could load dataset from '{dataset_uri}'.".format(dataset_uri=dataset_uri), + ) + + def save(self, dataset_uri: str, *, compute_digest: ComputeDigest = ComputeDigest.ALWAYS, preserve_metadata: bool = True) -> None: + """ + Tries to save dataset to ``dataset_uri`` using all registered dataset savers. + + Parameters + ---------- + dataset_uri: + A URI to save to. + compute_digest: + Compute digest over the data when saving? + preserve_metadata: + When saving a dataset, store its metadata as well? + """ + + for saver in self.savers: + if saver.can_save(dataset_uri): + saver.save(self, dataset_uri, compute_digest=compute_digest, preserve_metadata=preserve_metadata) + return + + raise exceptions.DatasetUriNotSupportedError("No known saver could save dataset to '{dataset_uri}'.".format(dataset_uri=dataset_uri)) + + def is_lazy(self) -> bool: + """ + Return whether this dataset instance is lazy and not all data has been loaded. + + Returns + ------- + ``True`` if this dataset instance is lazy. + """ + + return self._load_lazy is not None + + def load_lazy(self) -> None: + """ + Read all the data and convert the dataset to a non-lazy one. + """ + + if self._load_lazy is not None: + self._load_lazy(self) + + # TODO: Allow one to specify priority which would then insert loader at a different place and not at the end? + @classmethod + def register_loader(cls, loader: Loader) -> None: + """ + Registers a new dataset loader. + + Parameters + ---------- + loader: + An instance of the loader class implementing a new loader. + """ + + cls.loaders.append(loader) + + # TODO: Allow one to specify priority which would then insert saver at a different place and not at the end? + @classmethod + def register_saver(cls, saver: Saver) -> None: + """ + Registers a new dataset saver. + + Parameters + ---------- + saver: + An instance of the saver class implementing a new saver. 
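+
+        For example (``MyDatasetSaver`` is a hypothetical ``Saver`` subclass, shown
+        only for illustration)::
+
+            Dataset.register_saver(MyDatasetSaver())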
+ """ + + cls.savers.append(saver) + + def __repr__(self) -> str: + return self.__str__() + + def _get_description_keys(self) -> typing.Sequence[str]: + return 'id', 'name', 'location_uris' + + def __str__(self) -> str: + metadata = self.metadata.query(()) + + return '{class_name}({description})'.format( + class_name=type(self).__name__, + description=', '.join('{key}=\'{value}\''.format(key=key, value=metadata[key]) for key in self._get_description_keys() if key in metadata), + ) + + def copy(self: D) -> D: + # Metadata is copied from provided iterable. + return type(self)(resources=self, load_lazy=self._load_lazy) + + def __copy__(self: D) -> D: + return self.copy() + + def select_rows(self: D, row_indices_to_keep: typing.Mapping[str, typing.Sequence[int]]) -> D: + """ + Generate a new Dataset from the row indices for DataFrames. + + Parameters + ---------- + row_indices_to_keep: + This is a dict where key is resource ID and value is a sequence of row indices to keep. + If a resource ID is missing, the whole related resource is kept. + + Returns + ------- + Returns a new Dataset. + """ + + resources = {} + metadata = self.metadata + + for resource_id, resource in self.items(): + # We keep any resource which is missing from "row_indices_to_keep". + if resource_id not in row_indices_to_keep: + resources[resource_id] = resource + else: + if not isinstance(resource, container_pandas.DataFrame): + raise exceptions.InvalidArgumentTypeError("Only DataFrame resources can have rows selected, not '{type}'.".format(type=type(resource))) + + row_indices = sorted(row_indices_to_keep[resource_id]) + resources[resource_id] = self[resource_id].iloc[row_indices, :].reset_index(drop=True) + + # TODO: Expose this as a general metadata method. + # In that case this has to be done recursively over all nested ALL_ELEMENTS. + # Here we are operating at resource level so we have to iterate only over first + # ALL_ELEMENTS and resource's element itself. + + # Change the metadata. Update the number of rows in the split. + # This makes a copy so that we can modify metadata in-place. + metadata = metadata.update( + (resource_id,), + { + 'dimension': { + 'length': len(row_indices), + }, + }, + ) + + # Remove all rows not in this split and reorder those which are. + for element_metadata_entry in [ + metadata._current_metadata.all_elements, + metadata._current_metadata.elements[resource_id], + ]: + if element_metadata_entry is None: + continue + + elements = element_metadata_entry.elements + new_elements_evolver = utils.EMPTY_PMAP.evolver() + for i, row_index in enumerate(row_indices): + if row_index in elements: + new_elements_evolver.set(i, elements[row_index]) + element_metadata_entry.elements = new_elements_evolver.persistent() + element_metadata_entry.is_elements_empty = not element_metadata_entry.elements + element_metadata_entry.update_is_empty() + + return type(self)(resources, metadata) + + def get_relations_graph(self) -> typing.Dict[str, typing.List[typing.Tuple[str, bool, int, int, typing.Dict]]]: + """ + Builds the relations graph for the dataset. + + Each key in the output corresponds to a resource/table. The value under a key is the list of + edges this table has. The edge is represented by a tuple of four elements. For example, + if the edge is ``(resource_id, True, index_1, index_2, custom_state)``, it + means that there is a foreign key that points to table ``resource_id``. Specifically, + ``index_1`` column in the current table points to ``index_2`` column in the table ``resource_id``. 
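+
+        As an illustrative sketch, if column ``1`` of resource ``learningData`` had a
+        ``COLUMN`` foreign key referencing column ``0`` of a hypothetical resource
+        ``codes``, the returned graph would contain both directions of that edge::
+
+            {
+                'learningData': [('codes', True, 1, 0, {})],
+                'codes': [('learningData', False, 0, 1, {})],
+            }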
+ + ``custom_state`` is an empty dict when returned from this method, but allows users + of this graph to store custom state there. + + Returns + ------- + Dict[str, List[Tuple[str, bool, int, int, Dict]]] + Returns the relation graph in adjacency representation. + """ + + graph: typing.Dict[str, typing.List[typing.Tuple[str, bool, int, int, typing.Dict]]] = collections.defaultdict(list) + + for resource_id in self.keys(): + if not issubclass(self.metadata.query((resource_id,))['structural_type'], container_pandas.DataFrame): + continue + + columns_length = self.metadata.query((resource_id, metadata_base.ALL_ELEMENTS,))['dimension']['length'] + for index in range(columns_length): + column_metadata = self.metadata.query((resource_id, metadata_base.ALL_ELEMENTS, index)) + + if 'foreign_key' not in column_metadata: + continue + + if column_metadata['foreign_key']['type'] != 'COLUMN': + continue + + foreign_resource_id = column_metadata['foreign_key']['resource_id'] + + # "COLUMN" foreign keys should not point to non-DataFrame resources. + assert isinstance(self[foreign_resource_id], container_pandas.DataFrame), type(self[foreign_resource_id]) + + if 'column_index' in column_metadata['foreign_key']: + foreign_index = column_metadata['foreign_key']['column_index'] + elif 'column_name' in column_metadata['foreign_key']: + foreign_index = self.metadata.get_column_index_from_column_name(column_metadata['foreign_key']['column_name'], at=(foreign_resource_id,)) + else: + raise exceptions.UnexpectedValueError("Invalid foreign key: {foreign_key}".format(foreign_key=column_metadata['foreign_key'])) + + # "True" and "False" implies forward and backward relationships, respectively. + graph[resource_id].append((foreign_resource_id, True, index, foreign_index, {})) + graph[foreign_resource_id].append((resource_id, False, foreign_index, index, {})) + + return graph + + def get_column_references_by_column_index(self) -> typing.Dict[str, typing.Dict[metadata_base.ColumnReference, typing.List[metadata_base.ColumnReference]]]: + references: typing.Dict[str, typing.Dict[metadata_base.ColumnReference, typing.List[metadata_base.ColumnReference]]] = { + 'confidence_for': {}, + 'rank_for': {}, + 'boundary_for': {}, + 'foreign_key': {}, + } + + for resource_id, resource in self.items(): + if not isinstance(resource, container_pandas.DataFrame): + continue + + resource_references = self.metadata.get_column_references_by_column_index(resource_id, at=(resource_id,)) + + references['confidence_for'].update(resource_references['confidence_for']) + references['rank_for'].update(resource_references['rank_for']) + references['boundary_for'].update(resource_references['boundary_for']) + references['foreign_key'].update(resource_references['foreign_key']) + + return references + + @classmethod + def _canonical_dataset_description(cls, dataset_description: typing.Dict, *, set_no_value: bool = False) -> typing.Dict: + """ + Currently, this is just removing any local URIs the description might have. + """ + + # Making a copy. + dataset_description = dict(dataset_description) + + utils.filter_local_location_uris(dataset_description, empty_value=metadata_base.NO_VALUE if set_no_value else None) + + return dataset_description + + def to_json_structure(self, *, canonical: bool = False) -> typing.Dict: + """ + Returns only a top-level dataset description. 
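+
+        An illustrative sketch of the returned structure (exact fields and values
+        depend on the dataset; with ``canonical=True`` any local location URIs are
+        removed)::
+
+            {
+                'id': 'iris_dataset_1',
+                'name': 'Iris Dataset',
+                'digest': '...',
+                'dimension': {'name': 'resources', 'length': 1, ...},
+                ...
+            }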
+ """ + + # Using "to_json_structure" and not "to_internal_json_structure" because + # it is not indented that this would be parsed back directly, but just used + # to know where to find the dataset. + dataset_description = utils.to_json_structure(self.metadata.query(())) + + if canonical: + dataset_description = self._canonical_dataset_description(dataset_description) + + metadata_base.CONTAINER_SCHEMA_VALIDATOR.validate(dataset_description) + + return dataset_description + + @classmethod + def _canonical_metadata(cls, metadata: metadata_base.DataMetadata) -> metadata_base.DataMetadata: + """ + Currently, this is just removing any local URIs the metadata might have. + """ + + metadata = metadata.update((), cls._canonical_dataset_description(metadata.query(()), set_no_value=True)) + + metadata = cls._canonical_metadata_traverse(metadata, metadata, []) + + return metadata + + @classmethod + def _canonical_metadata_traverse(cls, metadata: metadata_base.DataMetadata, output_metadata: metadata_base.DataMetadata, selector: metadata_base.ListSelector) -> metadata_base.DataMetadata: + # "ALL_ELEMENTS" is always first, if it exists, which works in our favor here. + elements = metadata.get_elements(selector) + + for element in elements: + new_selector = selector + [element] + new_metadata = dict(metadata._query(new_selector, metadata._current_metadata, 0)) + utils.filter_local_location_uris(new_metadata, empty_value=metadata_base.NO_VALUE) + output_metadata = output_metadata.update(new_selector, new_metadata) + + output_metadata = cls._canonical_metadata_traverse(metadata, output_metadata, new_selector) + + return output_metadata + + +def dataset_serializer(obj: Dataset) -> dict: + data = { + 'metadata': obj.metadata, + 'dataset': dict(obj), + } + + if type(obj) is not Dataset: + data['type'] = type(obj) + + return data + + +def dataset_deserializer(data: dict) -> Dataset: + dataset = data.get('type', Dataset)(data['dataset'], data['metadata']) + return dataset + + +if pyarrow_lib is not None: + pyarrow_lib._default_serialization_context.register_type( + Dataset, 'd3m.dataset', + custom_serializer=dataset_serializer, + custom_deserializer=dataset_deserializer, + ) + + +def get_dataset( + dataset_uri: str, *, compute_digest: ComputeDigest = ComputeDigest.ONLY_IF_MISSING, + strict_digest: bool = False, lazy: bool = False, + datasets_dir: str = None, handle_score_split: bool = True, +) -> Dataset: + if datasets_dir is not None: + datasets, problem_descriptions = utils.get_datasets_and_problems(datasets_dir, handle_score_split) + + if dataset_uri in datasets: + dataset_uri = datasets[dataset_uri] + + dataset_uri = utils.fix_uri(dataset_uri) + + return Dataset.load(dataset_uri, compute_digest=compute_digest, strict_digest=strict_digest, lazy=lazy) + + +def describe_handler(arguments: argparse.Namespace, *, dataset_resolver: typing.Callable = None) -> None: + if dataset_resolver is None: + dataset_resolver = get_dataset + + output_stream = getattr(arguments, 'output', sys.stdout) + + has_errored = False + + for dataset_path in arguments.datasets: + if getattr(arguments, 'list', False): + print(dataset_path, file=output_stream) + + try: + start_timestamp = time.perf_counter() + dataset = dataset_resolver( + dataset_path, + compute_digest=ComputeDigest[getattr(arguments, 'compute_digest', ComputeDigest.ONLY_IF_MISSING.name)], + strict_digest=getattr(arguments, 'strict_digest', False), + lazy=getattr(arguments, 'lazy', False), + ) + end_timestamp = time.perf_counter() + except Exception as error: + if 
getattr(arguments, 'continue', False): + traceback.print_exc(file=output_stream) + print(f"Error loading dataset: {dataset_path}", file=output_stream) + has_errored = True + continue + else: + raise Exception(f"Error loading dataset: {dataset_path}") from error + + try: + if getattr(arguments, 'print', False) or getattr(arguments, 'metadata', False) or getattr(arguments, 'time', False): + if getattr(arguments, 'print', False): + pprint.pprint(dataset, stream=output_stream) + if getattr(arguments, 'metadata', False): + dataset.metadata.pretty_print(handle=output_stream) + if getattr(arguments, 'time', False): + print(f"Time: {(end_timestamp - start_timestamp):.3f}s", file=output_stream) + else: + dataset_description = dataset.to_json_structure(canonical=True) + + json.dump( + dataset_description, + output_stream, + indent=(getattr(arguments, 'indent', 2) or None), + sort_keys=getattr(arguments, 'sort_keys', False), + allow_nan=False, + ) # type: ignore + output_stream.write('\n') + except Exception as error: + if getattr(arguments, 'continue', False): + traceback.print_exc(file=output_stream) + print(f"Error describing dataset: {dataset_path}", file=output_stream) + has_errored = True + continue + else: + raise Exception(f"Error describing dataset: {dataset_path}") from error + + if has_errored: + sys.exit(1) + + +def convert_handler(arguments: argparse.Namespace, *, dataset_resolver: typing.Callable = None) -> None: + if dataset_resolver is None: + dataset_resolver = get_dataset + + try: + dataset = dataset_resolver( + arguments.input_uri, + compute_digest=ComputeDigest[getattr(arguments, 'compute_digest', ComputeDigest.ONLY_IF_MISSING.name)], + strict_digest=getattr(arguments, 'strict_digest', False), + ) + except Exception as error: + raise Exception(f"Error loading dataset '{arguments.input_uri}'.") from error + + output_uri = utils.fix_uri(arguments.output_uri) + + try: + dataset.save(output_uri, preserve_metadata=getattr(arguments, 'preserve_metadata', True)) + except Exception as error: + raise Exception(f"Error saving dataset '{arguments.input_uri}' to '{output_uri}'.") from error + + +def main(argv: typing.Sequence) -> None: + raise exceptions.NotSupportedError("This CLI has been removed. Use \"python3 -m d3m dataset describe\" instead.") + + +if __name__ == '__main__': + main(sys.argv) diff --git a/d3m/d3m/container/list.py b/d3m/d3m/container/list.py new file mode 100644 index 0000000..56591ba --- /dev/null +++ b/d3m/d3m/container/list.py @@ -0,0 +1,170 @@ +import datetime +import typing + +import numpy # type: ignore +import pandas # type: ignore + +from d3m import deprecate +from d3m.metadata import base as metadata_base + +# See: https://gitlab.com/datadrivendiscovery/d3m/issues/66 +try: + from pyarrow import lib as pyarrow_lib # type: ignore +except ModuleNotFoundError: + pyarrow_lib = None + +__all__ = ('List',) + +L = typing.TypeVar('L', bound='List') + + +class List(list): + """ + Extended Python standard `list` with the ``metadata`` attribute. + + You should use only standard data and container types as its elements. + + Metadata attribute is immutable, so if you ``update`` it, you should reassign it back:: + + l.metadata = l.metadata.update(...) + + `List` is mutable, but this can introduce issues during runtime if a primitive + modifies its inputs directly. Callers of primitives are encouraged + to make it immutable to assure such behavior is detected/prevented, + and primitives should copy inputs to a mutable `List` before modifying it. 
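+
+    A minimal construction sketch (the values here are arbitrary)::
+
+        l = List([1.1, 2.2, 3.3], generate_metadata=True)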
+ + Parameters + ---------- + iterable: + Optional initial values for the list. + metadata: + Optional initial metadata for the top-level of the list, or top-level metadata to be updated + if ``iterable`` is another instance of this list class. + generate_metadata: + Automatically generate and update the metadata. + check: + DEPRECATED: argument ignored. + source: + DEPRECATED: argument ignored. + timestamp: + DEPRECATED: argument ignored. + + Attributes + ---------- + metadata: + Metadata associated with the list. + """ + + metadata: metadata_base.DataMetadata + + @deprecate.arguments('source', 'timestamp', 'check', message="argument ignored") + def __init__(self, iterable: typing.Iterable = (), metadata: typing.Dict[str, typing.Any] = None, *, + generate_metadata: bool = False, check: bool = True, source: typing.Any = None, + timestamp: datetime.datetime = None) -> None: + if isinstance(iterable, pandas.DataFrame): + super().__init__(type(self)(row) for row in iterable.itertuples(index=False, name=None)) + else: + if isinstance(iterable, numpy.matrix): + # One cannot iterate over a matrix segment by segment. You always get back + # a matrix (2D structure) and not an array of rows or columns. By converting + # it to an array such iteration segment by segment works. + iterable = numpy.array(iterable) + super().__init__(iterable) + + from d3m import types + + if isinstance(iterable, types.Container): + if isinstance(iterable, List): + # We made a copy, so we do not have to generate metadata. + self.metadata: metadata_base.DataMetadata = iterable.metadata + else: + self.metadata: metadata_base.DataMetadata = iterable.metadata + if generate_metadata: + self.metadata = self.metadata.generate(self) + + if metadata is not None: + self.metadata: metadata_base.DataMetadata = self.metadata.update((), metadata) + else: + self.metadata: metadata_base.DataMetadata = metadata_base.DataMetadata(metadata) + if generate_metadata: + self.metadata = self.metadata.generate(self) + + def copy(self: L) -> L: + # Metadata is copied from provided iterable. + return type(self)(iterable=self) + + @typing.overload # type: ignore + def __getitem__(self, i: int) -> typing.Any: + ... + + def __getitem__(self: L, s: slice) -> L: # type: ignore + if isinstance(s, slice): + lst = type(self)(iterable=super().__getitem__(s)) + # TODO: We could do a slice in metadata as well? + # Update dimensions. Slice per-element metadata. + lst.metadata = self.metadata + return lst + else: + return super().__getitem__(s) + + def __add__(self: L, x: typing.List) -> L: + lst = type(self)(iterable=super().__add__(x)) + # TODO: We could do add in metadata as well? + # Update dimensions. Maybe x is List and has metadata. + # What to do if both have conflicting ALL_ELEMENTS metadata? + lst.metadata = self.metadata + return lst + + def __iadd__(self: L, x: typing.Iterable) -> L: + super().__iadd__(x) + # TODO: We could do add in metadata as well? + # Update dimensions. Maybe x is List and has metadata. + # What to do if both have conflicting ALL_ELEMENTS metadata? + return self + + def __mul__(self: L, n: int) -> L: + lst = type(self)(iterable=super().__mul__(n)) + # TODO: We could do multiply in metadata as well? + # Update dimensions. Multiplicate per-element metadata. + lst.metadata = self.metadata + return lst + + def __rmul__(self: L, n: int) -> L: + lst = type(self)(iterable=super().__rmul__(n)) + # TODO: We could do multiply in metadata as well? + # Update dimensions. Multiplicate per-element metadata. 
+ lst.metadata = self.metadata + return lst + + def __setstate__(self, state: dict) -> None: + self.__dict__ = state + + def __reduce__(self) -> typing.Tuple[typing.Callable, typing.Tuple, dict]: + reduced = super().__reduce__() + return reduced + + +def list_serializer(obj: List) -> dict: + data = { + 'metadata': obj.metadata, + 'list': list(obj), + } + + if type(obj) is not List: + data['type'] = type(obj) + + return data + + +def list_deserializer(data: dict) -> List: + data_list = data.get('type', List)(data['list']) + data_list.metadata = data['metadata'] + return data_list + + +if pyarrow_lib is not None: + pyarrow_lib._default_serialization_context.register_type( + List, 'd3m.list', + custom_serializer=list_serializer, + custom_deserializer=list_deserializer, + ) diff --git a/d3m/d3m/container/numpy.py b/d3m/d3m/container/numpy.py new file mode 100644 index 0000000..bf75f77 --- /dev/null +++ b/d3m/d3m/container/numpy.py @@ -0,0 +1,128 @@ +import datetime +import typing + +import numpy # type: ignore + +from d3m import deprecate +from d3m.metadata import base as metadata_base + +# See: https://gitlab.com/datadrivendiscovery/d3m/issues/66 +try: + from pyarrow import lib as pyarrow_lib # type: ignore +except ModuleNotFoundError: + pyarrow_lib = None + +__all__ = ('ndarray',) + +# This implementation is based on these guidelines: +# https://docs.scipy.org/doc/numpy-1.13.0/user/basics.subclassing.html + +N = typing.TypeVar('N', bound='ndarray') + + +# TODO: We could implement also __array_ufunc__ and adapt metadata as well after in-place changes to data? +class ndarray(numpy.ndarray): + """ + Extended `numpy.ndarray` with the ``metadata`` attribute. + + Parameters + ---------- + input_array: + Anything array-like to create an instance from. Including lists and standard numpy arrays. + metadata: + Optional initial metadata for the top-level of the array, or top-level metadata to be updated + if ``input_array`` is another instance of this array class. + generate_metadata: + Automatically generate and update the metadata. + check: + DEPRECATED: argument ignored. + source: + DEPRECATED: argument ignored. + timestamp: + DEPRECATED: argument ignored. + + Attributes + ---------- + metadata: + Metadata associated with the array. + """ + + metadata: metadata_base.DataMetadata + + @deprecate.arguments('source', 'timestamp', 'check', message="argument ignored") + def __new__(cls: typing.Type[N], input_array: typing.Sequence, metadata: typing.Dict[str, typing.Any] = None, *, + generate_metadata: bool = False, check: bool = True, source: typing.Any = None, timestamp: datetime.datetime = None) -> N: + array = numpy.asarray(input_array).view(cls) + + # Importing here to prevent import cycle. + from d3m import types + + if isinstance(input_array, types.Container): + if isinstance(input_array, ndarray): + # We made a copy, so we do not have to generate metadata. + array.metadata = input_array.metadata # type: ignore + else: + array.metadata = input_array.metadata + if generate_metadata: + array.metadata = array.metadata.generate(array) + + if metadata is not None: + array.metadata = array.metadata.update((), metadata) + else: + array.metadata = metadata_base.DataMetadata(metadata) + if generate_metadata: + array.metadata = array.metadata.generate(array) + + return array + + def __array_finalize__(self, obj: typing.Any) -> None: + # If metadata attribute already exists. 
+ if hasattr(self, 'metadata'): + return + + if obj is not None and isinstance(obj, ndarray) and hasattr(obj, 'metadata'): + # TODO: We could adapt (if this is after a slice) metadata instead of just copying? + self.metadata: metadata_base.DataMetadata = obj.metadata + else: + self.metadata = metadata_base.DataMetadata() + + def __reduce__(self) -> typing.Tuple: + reduced = list(super().__reduce__()) + + reduced[2] = { + 'numpy': reduced[2], + 'metadata': self.metadata, + } + + return tuple(reduced) + + def __setstate__(self, state: dict) -> None: + super().__setstate__(state['numpy']) + + self.metadata = state['metadata'] + + +def ndarray_serializer(obj: ndarray) -> dict: + data = { + 'metadata': obj.metadata, + 'numpy': obj.view(numpy.ndarray), + } + + if type(obj) is not ndarray: + data['type'] = type(obj) + + return data + + +def ndarray_deserializer(data: dict) -> ndarray: + array = data['numpy'].view(data.get('type', ndarray)) + array.metadata = data['metadata'] + return array + + +if pyarrow_lib is not None: + pyarrow_lib._default_serialization_context.register_type( + ndarray, 'd3m.ndarray', + custom_serializer=ndarray_serializer, + custom_deserializer=ndarray_deserializer, + ) diff --git a/d3m/d3m/container/pandas.py b/d3m/d3m/container/pandas.py new file mode 100644 index 0000000..e36eff7 --- /dev/null +++ b/d3m/d3m/container/pandas.py @@ -0,0 +1,495 @@ +import copy as copy_module +import datetime +import logging +import typing + +import numpy # type: ignore +import pandas # type: ignore +from pandas.core.dtypes import common as pandas_common # type: ignore + +from . import list as container_list +from d3m import deprecate, exceptions +from d3m.metadata import base as metadata_base + +# See: https://gitlab.com/datadrivendiscovery/d3m/issues/66 +try: + from pyarrow import lib as pyarrow_lib # type: ignore +except ModuleNotFoundError: + pyarrow_lib = None + +__all__ = ('DataFrame',) + +logger = logging.getLogger(__name__) + +# This implementation is based on these guidelines: +# https://pandas.pydata.org/pandas-docs/stable/internals.html#subclassing-pandas-data-structures + +D = typing.TypeVar('D', bound='DataFrame') + +Data = typing.Union[typing.Sequence, typing.Mapping] + + +# We have to convert our container "List" to regular list because Pandas do not accept list +# subclasses. See: https://github.com/pandas-dev/pandas/issues/21226 +def convert_lists(data: Data = None) -> typing.Optional[Data]: + if isinstance(data, list) and len(data): + if isinstance(data, container_list.List): + data = list(data) + if isinstance(data, list) and isinstance(data[0], container_list.List): + data = [list(row) for row in data] + + return data + + +def convert_ndarray(data: Data = None) -> typing.Optional[Data]: + """ + If ndarray has more than 2 dimensions, deeper dimensions are converted to stand-alone numpy arrays. + """ + + if isinstance(data, numpy.ndarray) and len(data.shape) > 2: + outer_array = numpy.ndarray(shape=(data.shape[0], data.shape[1]), dtype=numpy.object) + for i in range(data.shape[0]): + for j in range(data.shape[1]): + # This retains the type, so if "data" is a container "ndarray", then also "data[i, j]" is. + outer_array[i, j] = data[i, j] + + return outer_array + + return data + + +class DataFrame(pandas.DataFrame): + """ + Extended `pandas.DataFrame` with the ``metadata`` attribute. + + Parameters + ---------- + data: + Anything array-like to create an instance from. 
+ metadata: + Optional initial metadata for the top-level of the data frame, or top-level metadata to be updated + if ``data`` is another instance of this data frame class. + index: + Index to use for resulting frame. + columns: + Column labels to use for resulting frame. + dtype: + Data type to force. + copy: + Copy data from inputs. + generate_metadata: + Automatically generate and update the metadata. + check: + DEPRECATED: argument ignored. + source: + DEPRECATED: argument ignored. + timestamp: + DEPRECATED: argument ignored. + + Attributes + ---------- + metadata: + Metadata associated with the data frame. + """ + + metadata: metadata_base.DataMetadata + + # Reversed properties. + _metadata = ['metadata'] + + @property + def _constructor(self) -> type: + return DataFrame + + @deprecate.arguments('source', 'timestamp', 'check', message="argument ignored") + def __init__(self, data: Data = None, metadata: typing.Dict[str, typing.Any] = None, index: typing.Union[pandas.Index, Data] = None, + columns: typing.Union[pandas.Index, Data] = None, dtype: typing.Union[numpy.dtype, str, pandas_common.ExtensionDtype] = None, copy: bool = False, *, + generate_metadata: bool = False, check: bool = True, source: typing.Any = None, timestamp: datetime.datetime = None) -> None: + # If not a constructor call to this exact class, then a child constructor + # is responsible to call a pandas constructor. + if type(self) is DataFrame: + pandas.DataFrame.__init__(self, data=convert_ndarray(convert_lists(data)), index=index, columns=columns, dtype=dtype, copy=copy) + + # Importing here to prevent import cycle. + from d3m import types + + if isinstance(data, types.Container): # type: ignore + if isinstance(data, DataFrame): + # We made a copy, so we do not have to generate metadata. + self.metadata: metadata_base.DataMetadata = data.metadata + else: + self.metadata: metadata_base.DataMetadata = data.metadata + if generate_metadata: + self.metadata = self.metadata.generate(self) + + if metadata is not None: + self.metadata: metadata_base.DataMetadata = self.metadata.update((), metadata) + else: + self.metadata: metadata_base.DataMetadata = metadata_base.DataMetadata(metadata) + if generate_metadata: + self.metadata = self.metadata.generate(self) + + def __finalize__(self: D, other: typing.Any, method: str = None, **kwargs: typing.Any) -> D: + self = super().__finalize__(other, method, **kwargs) + + # Merge operation: using metadata of the left object. + if method == 'merge': + obj = other.left + # Concat operation: using metadata of the first object. + elif method == 'concat': + obj = other.objs[0] + else: + obj = other + + if isinstance(obj, DataFrame): + # TODO: We could adapt (if this is after a slice) metadata instead of just copying? + self.metadata: metadata_base.DataMetadata = obj.metadata + # "metadata" attribute should already be set in "__init__", + # but if we got here without it, let's set it now. 
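Because `__finalize__` above copies the `metadata` attribute from the source object, results of ordinary pandas operations should carry it over as well; a rough sketch, with no guarantees across all pandas versions:
```
from d3m import container

df = container.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]}, generate_metadata=True)

# "head" goes through pandas machinery that calls "__finalize__", so the
# result should still be a container DataFrame carrying the same metadata
# (metadata is copied, not adapted to the slice, as the TODO above notes).
subset = df.head(2)
print(type(subset), subset.metadata.query(()))
```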
+ elif not hasattr(self, 'metadata'): + self.metadata: metadata_base.DataMetadata = metadata_base.DataMetadata() + + return self + + def __getstate__(self) -> dict: + state = super().__getstate__() + + state['metadata'] = self.metadata + + return state + + def __setstate__(self, state: dict) -> None: + super().__setstate__(state) + + self.metadata = state['metadata'] + + def to_csv(self, path_or_buf: typing.Union[typing.IO[typing.Any], str] = None, sep: str = ',', na_rep: str = '', + float_format: str = None, columns: typing.Sequence = None, header: typing.Union[bool, typing.Sequence[str]] = True, + index: bool = False, **kwargs: typing.Any) -> typing.Optional[str]: + """ + Extends `pandas.DataFrame` to provide better default method for writing DataFrames to CSV files. + If ``header`` argument is not explicitly provided column names are derived from metadata of the DataFrame. + By default DataFrame indices are not written. + + See Also + -------- + `pandas.DataFrame.to_csv `_ + + Parameters + ---------- + path_or_buf: + File path or object, if None is provided the result is returned as a string. + sep: + String of length 1. Field delimiter for the output file. + na_rep: + Missing data representation. + float_format: + Format string for floating point numbers. + columns: + Columns to write. + header: + Write out the column names. If a list of strings is given it is assumed to be aliases for the column names. + index: + Write row names (index). + kwargs: + Other arguments. + """ + + if header is True: + header = [] + for column_index in range(len(self.columns)): + # We use column name from the DataFrame if metadata does not have it. This allows a bit more compatibility. + header.append(self.metadata.query_column(column_index).get('name', self.columns[column_index])) + + result = super().to_csv(path_or_buf=path_or_buf, sep=sep, na_rep=na_rep, float_format=float_format, columns=columns, header=header, index=index, **kwargs) + + # Make sure handles are flushed so that no data is lost when used with CLI file handles. + # CLI file handles are generally used outside of a context manager which would otherwise + # handle that. + # See: https://gitlab.com/datadrivendiscovery/d3m/issues/436 + if hasattr(path_or_buf, 'flush') and not getattr(path_or_buf, 'closed', False): + typing.cast(typing.IO, path_or_buf).flush() + + return result + + def select_columns(self: D, columns: typing.Sequence[metadata_base.SimpleSelectorSegment], *, allow_empty_columns: bool = False) -> D: + """ + Returns a new DataFrame with data and metadata only for given ``columns``. + Moreover, columns are renumbered based on the position in ``columns`` list. + Top-level metadata stays unchanged, except for updating the length of the columns dimension to + the number of columns. + + So if the ``columns`` is ``[3, 6, 5]`` then output DataFrame will have three columns, ``[0, 1, 2]``, + mapping data and metadata for columns ``3`` to ``0``, ``6`` to ``1`` and ``5`` to ``2``. + + This allows also duplication of columns. + """ + + if not columns and not allow_empty_columns: + raise exceptions.InvalidArgumentValueError("No columns selected.") + + output = self.iloc[:, list(columns)] + + # We want to make sure it is a true copy. 
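The `to_csv` override above derives the CSV header from column metadata when `header` is left as `True` and omits the row index by default; a small sketch using only methods that appear elsewhere in this commit (`update_column`, `query_column`):
```
from d3m import container

df = container.DataFrame({'a': [1, 2], 'b': [3.0, 4.0]}, generate_metadata=True)

# Rename column 0 in metadata only; the CSV header then shows the metadata
# name for column 0 and falls back to the pandas column name for column 1.
df.metadata = df.metadata.update_column(0, {'name': 'first_value'})

print(df.to_csv())
```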
+ if output._is_view: + output = output.copy() + else: + output._set_is_copy(copy=False) + + output.metadata = self.metadata.select_columns(columns, allow_empty_columns=allow_empty_columns) + + return output + + def remove_columns(self: D, column_indices: typing.Sequence[int]) -> D: + """ + Removes columns from the DataFrame and returns one without them, together with all + metadata for columns removed as well. + + It throws an exception if no columns would be left after removing columns. + """ + + # We are not using "drop" because we are dropping by the column index (to support columns with same name). + + columns = list(range(self.shape[1])) + + if not columns: + raise ValueError("No columns to remove.") + + for column_index in column_indices: + columns.remove(column_index) + + if not columns: + raise ValueError("Removing columns would have removed the last column.") + + output = self.iloc[:, list(columns)] + + # We want to make sure it is a true copy. + if output._is_view: + output = output.copy() + else: + output._set_is_copy(copy=False) + + output.metadata = self.metadata.select_columns(columns) + + return output + + def append_columns(self: D, right: 'DataFrame', *, use_right_metadata: bool = False) -> D: + """ + Appends all columns from ``right`` to the right of this DataFrame, together with all metadata + of columns. + + Metadata at the top-level of ``right`` DataFrame is ignored, not merged, except if ``use_right_metadata`` + is set, in which case top-level metadata of this DataFrame is ignored and one from ``right`` is + used instead. + """ + + outputs = pandas.concat([self, right], axis=1) + outputs.metadata = self.metadata + + outputs.metadata = outputs.metadata.append_columns(right.metadata, use_right_metadata=use_right_metadata) + + return outputs + + def insert_columns(self: D, columns: 'DataFrame', at_column_index: int) -> D: + """ + Inserts all columns from ``columns`` before ``at_column_index`` column in this DataFrame, + pushing all existing columns to the right. + + E.g., ``at_column_index == 0`` means inserting ``columns`` at the beginning of this DataFrame. + + Top-level metadata of ``columns`` is ignored. + """ + + columns_length = self.shape[1] + + if at_column_index < 0: + raise exceptions.InvalidArgumentValueError("\"at_column_index\" is smaller than 0.") + if at_column_index > columns_length: + raise exceptions.InvalidArgumentValueError("\"at_column_index\" is larger than the range of existing columns.") + + if at_column_index == 0: + return columns.append_columns(self, use_right_metadata=True) + + if at_column_index == columns_length: + return self.append_columns(columns) + + # TODO: This could probably be optimized without all the slicing and joining. + + before = self.select_columns(list(range(0, at_column_index))) + after = self.select_columns(list(range(at_column_index, columns_length))) + + return before.append_columns(columns).append_columns(after) + + def _replace_column(self: D, column_index: int, columns: 'DataFrame', columns_column_index: int) -> D: + # We do not use "self.iloc[:, column_index] = columns.iloc[:, columns_column_index]" + # but use the following as a workaround. + # See: https://github.com/pandas-dev/pandas/issues/22036 + # "self.iloc[:, [column_index]] = columns.iloc[:, [columns_column_index]]" does not work either. 
+ # See: https://github.com/pandas-dev/pandas/issues/22046 + output = pandas.concat([self.iloc[:, 0:column_index], columns.iloc[:, [columns_column_index]], self.iloc[:, column_index + 1:]], axis=1) + output.metadata = output.metadata._replace_column(column_index, columns.metadata, columns_column_index) + return output + + def replace_columns(self: D, columns: 'DataFrame', column_indices: typing.Sequence[int], *, copy: bool = True) -> D: + """ + Replaces columns listed in ``column_indices`` with ``columns``, in order, in this DataFrame. + + ``column_indices`` and ``columns`` do not have to match in number of columns. Columns are first + replaced in order for matching indices and columns. If then there are more ``column_indices`` than + ``columns``, additional ``column_indices`` columns are removed. If there are more ``columns`` than + ``column_indices`` columns, then additional ``columns`` are inserted after the last replaced column. + + If ``column_indices`` is empty, then the behavior is equivalent to calling ``append_columns``. + + Top-level metadata of ``columns`` is ignored. + """ + + # TODO: This could probably be optimized without all the slicing and joining. + + if not column_indices: + return self.append_columns(columns) + + if copy: + # We have to copy because "_replace" is modifying data in-place. + outputs = copy_module.copy(self) + else: + outputs = self + + columns_length = columns.shape[1] + columns_to_remove = [] + i = 0 + + # This loop will run always at least once, so "column_index" will be set. + while i < len(column_indices): + column_index = column_indices[i] + + if i < columns_length: + outputs = outputs._replace_column(column_index, columns, i) + else: + # If there are more column indices than columns in "columns", we + # select additional columns for removal. + columns_to_remove.append(column_index) + + i += 1 + + # When there are less column indices than columns in "columns", we insert the rest after + # the last replaced column. + if i < columns_length: + columns = columns.select_columns(list(range(i, columns_length))) + # "column_index" points to the last place we inserted a column, so "+ 1" points after it. + outputs = outputs.insert_columns(columns, column_index + 1) + + # We remove columns at the end so that we do not break and column index used before. + # When removing columns, column indices shift. + if columns_to_remove: + outputs = outputs.remove_columns(columns_to_remove) + + return outputs + + def _sort_right_indices(self: 'DataFrame', right: D, indices: typing.Sequence[int], right_indices: typing.Sequence[int]) -> D: + # We try to handle different cases. + + # We do not do anything special. We assume both indices are the same. + if len(indices) == 1 and len(right_indices) == 1: + # TODO: Handle the case when not all index values exist and "reindex" fills values in: we should fill with NA relevant to the column type. + return right.set_index(right.iloc[:, right_indices[0]]).reindex(self.iloc[:, indices[0]]).reset_index(drop=True) + + index_names = [self.metadata.query_column(index).get('name', None) for index in indices] + right_index_names = [right.metadata.query_column(right_index).get('name', None) for right_index in right_indices] + + index_series = [self.iloc[:, index] for index in indices] + right_index_series = [right.iloc[:, right_index] for right_index in right_indices] + + # Number match, names match, order match, things look good. 
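To illustrate the `replace_columns` semantics described below it, a small sketch; column and value names are made up, and the expected output is only what the code above suggests, not verified behavior:
```
from d3m import container

left = container.DataFrame({'a': [1, 2], 'b': [3, 4], 'c': [5, 6]}, generate_metadata=True)
replacement = container.DataFrame({'x': [7, 8]}, generate_metadata=True)

# Replace column 1 of "left" with the single column of "replacement"; since
# the number of indices matches the number of replacement columns, nothing
# is additionally removed or inserted.
result = left.replace_columns(replacement, [1])
print(result.columns.tolist())  # expected: ['a', 'x', 'c']
```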
+ if index_names == right_index_names: + # We know the length is larger than 1 because otherwise the first case would match. + assert len(indices) > 1 + assert len(indices) == len(right_indices) + + # TODO: Handle the case when not all index values exist and "reindex" fills values in: we should fill with NA relevant to the column type. + return right.set_index(right_index_series).reindex(index_series).reset_index(drop=True) + + sorted_index_names = sorted(index_names) + sorted_right_index_names = sorted(right_index_names) + + # Number and names match, but not the order. + if sorted_index_names == sorted_right_index_names: + # We know the length is larger than 1 because otherwise the first case would match. + assert len(indices) > 1 + assert len(indices) == len(right_indices) + + # We sort index series to be in the sorted order based on index names. + index_series = [s for _, s in sorted(zip(index_names, index_series), key=lambda pair: pair[0])] + right_index_series = [s for _, s in sorted(zip(right_index_names, right_index_series), key=lambda pair: pair[0])] + + # TODO: Handle the case when not all index values exist and "reindex" fills values in: we should fill with NA relevant to the column type. + return right.set_index(right_index_series).reindex(index_series).reset_index(drop=True) + + if len(index_series) == len(right_index_series): + # We know the length is larger than 1 because otherwise the first case would match. + assert len(indices) > 1 + + logger.warning("Primary indices both on left and right not have same names, but they do match in number.") + + # TODO: Handle the case when not all index values exist and "reindex" fills values in: we should fill with NA relevant to the column type. + return right.set_index(right_index_series).reindex(index_series).reset_index(drop=True) + + # It might be that there are duplicate columns on either or even both sides, + # but that should be resolved by adding a primitive to remove duplicate columns first. + raise ValueError("Left and right primary indices do not match in number.") + + def horizontal_concat(self: D, right: D, *, use_index: bool = True, remove_second_index: bool = True, use_right_metadata: bool = False) -> D: + """ + Similar to ``append_columns``, but it respects primary index columns, by default. + + It has some heuristics how it tries to match up primary index columns in the case that there are + multiple of them, but generally it aligns samples by all primary index columns. + + It is required that both inputs have the same number of samples. + """ + + self.metadata._check_same_number_of_samples(right.metadata) + + left_indices = self.metadata.get_index_columns() + right_indices = right.metadata.get_index_columns() + + if left_indices and right_indices: + if use_index: + old_right_metadata = right.metadata + right = self._sort_right_indices(right, left_indices, right_indices) + # TODO: Reorder metadata rows as well. + # This should be relatively easy because we can just modify + # "right.metadata._current_metadata.metadata" map. + right.metadata = old_right_metadata + + # Removing second primary key columns. 
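Since `horizontal_concat` (defined just below) aligns rows through primary index columns found via metadata, a rough usage sketch follows. It assumes the standard `PrimaryKey` semantic type is what `get_index_columns` looks for, which is not shown in this file, so treat it as an assumption rather than documented behavior.
```
from d3m import container
from d3m.metadata import base as metadata_base

left = container.DataFrame({'d3mIndex': [0, 1, 2], 'value': [1.0, 2.0, 3.0]}, generate_metadata=True)
right = container.DataFrame({'d3mIndex': [2, 0, 1], 'score': [0.3, 0.1, 0.2]}, generate_metadata=True)

# Mark "d3mIndex" as the primary index on both sides so that the index
# columns can be discovered from metadata (semantic type assumed).
for frame in (left, right):
    frame.metadata = frame.metadata.add_semantic_type(
        (metadata_base.ALL_ELEMENTS, 0),
        'https://metadata.datadrivendiscovery.org/types/PrimaryKey',
    )

# Rows of "right" are re-aligned to the order of "left" by the index column,
# and the second index column is dropped from the result by default.
combined = left.horizontal_concat(right)
print(combined.columns.tolist())
```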
+ if remove_second_index: + right = right.remove_columns(right_indices) + + return self.append_columns(right, use_right_metadata=use_right_metadata) + + +def dataframe_serializer(obj: DataFrame) -> dict: + data = { + 'metadata': obj.metadata, + 'pandas': pandas.DataFrame(obj), + } + + if type(obj) is not DataFrame: + data['type'] = type(obj) + + return data + + +def dataframe_deserializer(data: dict) -> DataFrame: + df = data.get('type', DataFrame)(data['pandas']) + df.metadata = data['metadata'] + return df + + +if pyarrow_lib is not None: + pyarrow_lib._default_serialization_context.register_type( + DataFrame, 'd3m.dataframe', + custom_serializer=dataframe_serializer, + custom_deserializer=dataframe_deserializer, + ) diff --git a/d3m/d3m/container/utils.py b/d3m/d3m/container/utils.py new file mode 100644 index 0000000..989ec59 --- /dev/null +++ b/d3m/d3m/container/utils.py @@ -0,0 +1,50 @@ +import uuid +import os +import json +import typing + +from d3m import container as container_module, exceptions, utils +from d3m.container import dataset as dataset_module + + +def save_container(container: typing.Any, output_dir: str) -> None: + # Saving data. + if isinstance(container, container_module.Dataset): + dataset_root_metadata = container.metadata.query(()) + + missing_metadata: typing.Dict = {} + for d3m_path, (dataset_path, required) in dataset_module.D3M_TO_DATASET_FIELDS.items(): + if not required: + continue + + if utils.get_dict_path(dataset_root_metadata, dataset_path) is None: + # TODO: Use some better value instead of this random value? + utils.set_dict_path(missing_metadata, dataset_path, str(uuid.uuid4())) + + if missing_metadata: + container = container.copy() + container.metadata = container.metadata.update((), missing_metadata) + + # Dataset saver creates any missing directories. + dataset_uri = 'file://{dataset_path}'.format(dataset_path=os.path.abspath(os.path.join(output_dir, 'datasetDoc.json'))) + container.save(dataset_uri) + else: + # We do not want to override anything. + os.makedirs(output_dir, exist_ok=False) + dataframe_path = os.path.join(output_dir, 'data.csv') + + if isinstance(container, container_module.DataFrame): + container.to_csv(dataframe_path) + elif isinstance(container, (container_module.List, container_module.ndarray)): + container = container_module.DataFrame(container) + container.to_csv(dataframe_path) + else: + raise exceptions.NotSupportedError("Value with type '{value_type}' cannot be saved as a container type.".format(value_type=type(container))) + + # Saving metadata. This is just for debugging purposes, so we are + # using "to_json_structure" and not "to_internal_json_structure". 
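For the `save_container` helper being defined here, a minimal usage sketch; the output directory name is illustrative, and the directory must not exist yet because the non-Dataset branch creates it with `exist_ok=False`.
```
from d3m import container
from d3m.container import utils as container_utils

df = container.DataFrame({'d3mIndex': [0, 1], 'value': [1.0, 2.0]}, generate_metadata=True)

# Writes "data.csv" plus a "metadata.json" dump of the container's metadata
# into a fresh directory; it refuses to overwrite an existing directory.
container_utils.save_container(df, 'example_output')
```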
+ input_metadata = container.metadata.to_json_structure() + metadata_path = os.path.join(output_dir, 'metadata.json') + + with open(metadata_path, 'w') as outfile: + json.dump(input_metadata, outfile, indent=2, sort_keys=True, allow_nan=False) diff --git a/d3m/d3m/contrib/__init__.py b/d3m/d3m/contrib/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/d3m/d3m/contrib/pipelines/f596cd77-25f8-4d4c-a350-bb30ab1e58f6.yml b/d3m/d3m/contrib/pipelines/f596cd77-25f8-4d4c-a350-bb30ab1e58f6.yml new file mode 100644 index 0000000..e95ecd5 --- /dev/null +++ b/d3m/d3m/contrib/pipelines/f596cd77-25f8-4d4c-a350-bb30ab1e58f6.yml @@ -0,0 +1,31 @@ +id: f596cd77-25f8-4d4c-a350-bb30ab1e58f6 +schema: https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json +source: + name: Mitar +created: "2020-04-18T11:42:44.138742Z" +name: Scoring pipeline +description: |- + A general scoring pipeline. +inputs: + - name: predictions + - name: score dataset +outputs: + - name: scores + data: steps.0.produce +steps: + # Step 0. + - type: PRIMITIVE + primitive: + id: 799802fb-2e11-4ab7-9c5e-dda09eb52a70 + version: 0.5.0 + python_path: d3m.primitives.evaluation.compute_scores.Core + name: Compute scores given the metrics to use + arguments: + inputs: + type: CONTAINER + data: inputs.0 + score_dataset: + type: CONTAINER + data: inputs.1 + outputs: + - id: produce diff --git a/d3m/d3m/contrib/primitives/__init__.py b/d3m/d3m/contrib/primitives/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/d3m/d3m/contrib/primitives/compute_scores.py b/d3m/d3m/contrib/primitives/compute_scores.py new file mode 100644 index 0000000..e229769 --- /dev/null +++ b/d3m/d3m/contrib/primitives/compute_scores.py @@ -0,0 +1,369 @@ +import inspect +import os.path +import typing + +import pandas # type: ignore + +import d3m +from d3m import container, exceptions, metrics, utils as d3m_utils +from d3m.base import utils as base_utils +from d3m.metadata import base as metadata_base, hyperparams, problem +from d3m.primitive_interfaces import base, transformer + +__all__ = ('ComputeScoresPrimitive',) + +# Primitives needs an installation section so that digest is computed and available for the primitive. +if d3m.__version__[0].isdigit(): + installation = [{ + 'type': metadata_base.PrimitiveInstallationType.PIP, + 'package': 'd3m', + 'version': d3m.__version__, + }] +else: + installation = [{ + 'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/d3m.git@{git_commit}#egg=d3m'.format( + git_commit=d3m_utils.current_git_commit(os.path.dirname(__file__)), + ), + }] + +Inputs = container.DataFrame +Outputs = container.DataFrame + + +class MetricsHyperparams(hyperparams.Hyperparams, set_names=False): + metric = hyperparams.Enumeration( + values=[metric.name for metric in problem.PerformanceMetric], + # Default is ignored. + # TODO: Remove default. See: https://gitlab.com/datadrivendiscovery/d3m/issues/141 + default='ACCURACY', + ) + pos_label = hyperparams.Hyperparameter[typing.Union[str, None]](None) + k = hyperparams.Hyperparameter[typing.Union[int, None]](None) + + +class AllLabelsHyperparams(hyperparams.Hyperparams, set_names=False): + # Default is ignored. + # TODO: Remove default. See: https://gitlab.com/datadrivendiscovery/d3m/issues/141 + column_name = hyperparams.Hyperparameter[str]('') + labels = hyperparams.Set( + # Default is ignored. + # TODO: Remove default. 
See: https://gitlab.com/datadrivendiscovery/d3m/issues/141 + elements=hyperparams.Hyperparameter[str](''), + default=(), + ) + + +class Hyperparams(hyperparams.Hyperparams): + metrics = hyperparams.Set( + elements=MetricsHyperparams, + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of metrics to compute.", + ) + all_labels = hyperparams.Set( + elements=AllLabelsHyperparams, + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="All labels available in a dataset, per target column. When provided for a target column, it overrides all labels from metadata or data for that target column.", + ) + add_normalized_scores = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Add additional column with normalized scores?" + ) + + +class ComputeScoresPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): + """ + A primitive that takes a DataFrame with predictions and a scoring Dataset (test split with + target values present), and computes scores for given metrics and outputs them as a DataFrame. + + It searches only the dataset entry point resource for target columns + (which should be marked with ``https://metadata.datadrivendiscovery.org/types/TrueTarget`` + semantic type) in the scoring Dataset. + + Primitive does not align rows between truth DataFrame and predictions DataFrame, it + is expected that metric code does that if necessary. Similarly, it does not align + columns order either. + + It uses metadata to construct the truth DataFrame and renames the index column to match + the standard names ``d3mIndex``. It encodes any float vectors as strings. + + For predictions DataFrame it expects that it is already structured correctly with correct + column names and it leaves to metric code to validate that truth DataFrame and predictions + DataFrame match. It does not use or expect metadata on predictions DataFrame. Predictions + DataFrame should already have float vectors encoded as strings. 
+ """ + + metadata: typing.ClassVar[metadata_base.PrimitiveMetadata] = metadata_base.PrimitiveMetadata( + { + 'id': '799802fb-2e11-4ab7-9c5e-dda09eb52a70', + 'version': '0.5.0', + 'name': "Compute scores given the metrics to use", + 'python_path': 'd3m.primitives.evaluation.compute_scores.Core', + 'source': { + 'name': d3m.__author__, + 'contact': 'mailto:mitar.d3m@tnode.com', + 'uris': [ + 'https://gitlab.com/datadrivendiscovery/d3m/blob/master/d3m/contrib/primitives/compute_scores.py', + 'https://gitlab.com/datadrivendiscovery/d3m.git', + ], + }, + 'installation': installation, + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.ACCURACY_SCORE, + metadata_base.PrimitiveAlgorithmType.F1_SCORE, + ], + 'primitive_family': metadata_base.PrimitiveFamily.EVALUATION, + }, + ) + + def produce( # type: ignore + self, *, inputs: Inputs, score_dataset: container.Dataset, timeout: float = None, + iterations: int = None, + ) -> base.CallResult[Outputs]: + if not self.hyperparams['metrics']: + raise ValueError("\"metrics\" hyper-parameter cannot be empty.") + + truth, all_labels = self._get_truth(score_dataset) + predictions = self._get_predictions(inputs) + + for target_column in self.hyperparams['all_labels']: + all_labels[target_column['column_name']] = list(target_column['labels']) + + outputs: typing.Dict[str, typing.List] = { + 'metric': [], + 'value': [], + } + + if self.hyperparams['add_normalized_scores']: + outputs['normalized'] = [] + + for metric_configuration in self.hyperparams['metrics']: + metric = problem.PerformanceMetric[metric_configuration['metric']] + metric_class = metric.get_class() + + params = {} + + if 'all_labels' in inspect.signature(metric_class).parameters and all_labels: + params['all_labels'] = all_labels + + for param_name, param_value in metric_configuration.items(): + if param_name == 'metric': + continue + if param_value is None: + continue + params[param_name] = param_value + + if metric.requires_confidence() and metrics.CONFIDENCE_COLUMN not in predictions.columns: + raise exceptions.InvalidArgumentValueError( + f"Metric {metric.name} requires confidence column in predictions, but it is not available.", + ) + if metric.requires_rank() and metrics.RANK_COLUMN not in predictions.columns: + raise exceptions.InvalidArgumentValueError( + f"Metric {metric.name} requires rank column in predictions, but it is not available.", + ) + + score = metric_class(**params).score(truth, predictions) + + outputs['metric'].append(metric.name) + outputs['value'].append(score) + + if self.hyperparams['add_normalized_scores']: + outputs['normalized'].append(metric.normalize(score)) + + # Dictionary key order is preserved in Python 3.6+ which makes column order as we want it. + results = container.DataFrame(data=outputs, columns=list(outputs.keys()), generate_metadata=True) + + # Not really necessary, but it does not hurt. In theory somebody could list same metric multiple times + # (maybe with different params), so we use "PrimaryMultiKey" here. 
+ results.metadata = results.metadata.add_semantic_type( + (metadata_base.ALL_ELEMENTS, 0), + 'https://metadata.datadrivendiscovery.org/types/PrimaryMultiKey', + ) + results.metadata = results.metadata.add_semantic_type( + (metadata_base.ALL_ELEMENTS, 1), + 'https://metadata.datadrivendiscovery.org/types/Score', + ) + if self.hyperparams['add_normalized_scores']: + results.metadata = results.metadata.add_semantic_type( + (metadata_base.ALL_ELEMENTS, 2), + 'https://metadata.datadrivendiscovery.org/types/Score', + ) + + return base.CallResult(results) + + def multi_produce( # type: ignore + self, *, produce_methods: typing.Sequence[str], inputs: Inputs, + score_dataset: container.Dataset, timeout: float = None, iterations: int = None, + ) -> base.MultiCallResult: + return self._multi_produce( + produce_methods=produce_methods, timeout=timeout, iterations=iterations, + inputs=inputs, score_dataset=score_dataset, + ) + + def fit_multi_produce( # type: ignore + self, *, produce_methods: typing.Sequence[str], inputs: Inputs, + score_dataset: container.Dataset, timeout: float = None, iterations: int = None + ) -> base.MultiCallResult: + return self._fit_multi_produce( + produce_methods=produce_methods, timeout=timeout, iterations=iterations, + inputs=inputs, score_dataset=score_dataset, + ) + + # TODO: Instead of extracting true targets only from the dataset entry point, first denormalize and then extract true targets. + def _get_truth(self, score_dataset: container.Dataset) -> typing.Tuple[pandas.DataFrame, typing.Dict[str, typing.Any]]: + """ + Extracts true targets from the Dataset's entry point, or the only tabular resource. + It requires that there is only one primary index column, which it makes the first + column, named ``d3mIndex``. Then true target columns follow. + + We return a regular Pandas DataFrame with column names matching those in the metadata, + and a dict mapping target columns to all label values in those columns, if available in metadata. + We convert all columns to strings to match what would be loaded from ``predictions.csv`` file. + It encodes any float vectors as strings. + """ + + main_resource_id, main_resource = base_utils.get_tabular_resource(score_dataset, None, has_hyperparameter=False) + + # We first copy before modifying in-place. + main_resource = container.DataFrame(main_resource, copy=True) + main_resource = self._encode_columns(main_resource) + + dataframe = self._to_dataframe(main_resource) + + indices = list(score_dataset.metadata.get_index_columns(at=(main_resource_id,))) + targets = list(score_dataset.metadata.list_columns_with_semantic_types( + ['https://metadata.datadrivendiscovery.org/types/TrueTarget'], + at=(main_resource_id,), + )) + + if not indices: + raise exceptions.InvalidArgumentValueError("No primary index column.") + elif len(indices) > 1: + raise exceptions.InvalidArgumentValueError("More than one primary index column.") + if not targets: + raise ValueError("No true target columns.") + + dataframe = dataframe.iloc[:, indices + targets] + + dataframe = dataframe.rename({dataframe.columns[0]: metrics.INDEX_COLUMN}) + + if metrics.CONFIDENCE_COLUMN in dataframe.columns[1:]: + raise ValueError("True target column cannot be named \"confidence\". It is a reserved name.") + if metrics.RANK_COLUMN in dataframe.columns[1:]: + raise ValueError("True target column cannot be named \"rank\". It is a reserved name.") + if metrics.INDEX_COLUMN in dataframe.columns[1:]: + raise ValueError("True target column cannot be named \"d3mIndex\". 
It is a reserved name.") + + if d3m_utils.has_duplicates(dataframe.columns): + duplicate_names = list(dataframe.columns) + for name in set(dataframe.columns): + duplicate_names.remove(name) + raise exceptions.InvalidArgumentValueError( + "True target columns have duplicate names: {duplicate_names}".format( + duplicate_names=sorted(set(duplicate_names)), + ), + ) + + all_labels = {} + + for target_column_name, main_resource_column_index in zip(dataframe.columns[1:], targets): + try: + column_labels = score_dataset.metadata.query_column_field(main_resource_column_index, 'all_distinct_values', at=(main_resource_id,)) + except KeyError: + continue + + all_labels[target_column_name] = [str(label) for label in column_labels] + + return dataframe, all_labels + + def _get_predictions(self, inputs: Inputs) -> pandas.DataFrame: + """ + It requires that predictions already have the right structure (one ``d3mIndex`` + column, at most one ``confidence`` column, at most one ``rank`` column, + no duplicate column names). + + We return a regular Pandas DataFrame with column names matching those in the metadata. + We convert all columns to strings to match what would be loaded from ``predictions.csv`` file. + Predictions DataFrame should already have float vectors encoded as strings. + """ + + dataframe = self._to_dataframe(inputs) + + if metrics.INDEX_COLUMN not in dataframe.columns: + raise exceptions.InvalidArgumentValueError("No primary index column.") + + if d3m_utils.has_duplicates(dataframe.columns): + duplicate_names = list(dataframe.columns) + for name in set(dataframe.columns): + duplicate_names.remove(name) + raise exceptions.InvalidArgumentValueError( + "Predicted target columns have duplicate names: {duplicate_names}".format( + duplicate_names=sorted(set(duplicate_names)), + ), + ) + + return dataframe + + def _to_dataframe(self, inputs: container.DataFrame) -> pandas.DataFrame: + # We have to copy, otherwise setting "columns" modifies original DataFrame as well. + dataframe = pandas.DataFrame(inputs, copy=True) + + column_names = [] + for column_index in range(len(inputs.columns)): + column_names.append(inputs.metadata.query_column(column_index).get('name', inputs.columns[column_index])) + + # Make sure column names are correct. + dataframe.columns = column_names + + # Convert all columns to string. + return dataframe.astype(str) + + @classmethod + def _encode_columns(cls, inputs: Outputs) -> Outputs: + """ + Encode numpy arrays of numbers into float vectors. 
+ """ + + outputs = inputs + target_columns = outputs.metadata.list_columns_with_semantic_types( + ('https://metadata.datadrivendiscovery.org/types/PredictedTarget',), + ) + + for column_index in target_columns: + structural_type = outputs.metadata.query_column(column_index).get('structural_type', None) + + if structural_type is None: + continue + + if not issubclass(structural_type, container.ndarray): + continue + + new_column = [] + all_strings = True + for value in outputs.iloc[:, column_index]: + assert isinstance(value, container.ndarray) + + if value.ndim == 1: + new_column.append(','.join(str(v) for v in value)) + else: + all_strings = False + break + + if not all_strings: + continue + + outputs_metadata = outputs.metadata + outputs.iloc[:, column_index] = new_column + outputs.metadata = outputs_metadata.update_column(column_index, { + 'structural_type': str, + 'dimension': metadata_base.NO_VALUE, + }) + outputs.metadata = outputs.metadata.remove( + (metadata_base.ALL_ELEMENTS, column_index, metadata_base.ALL_ELEMENTS), + recursive=True, + ) + + return outputs diff --git a/d3m/d3m/deprecate.py b/d3m/d3m/deprecate.py new file mode 100644 index 0000000..375dbfb --- /dev/null +++ b/d3m/d3m/deprecate.py @@ -0,0 +1,143 @@ +import functools +import logging +import sys +import typing + +logger = logging.getLogger(__name__) + + +class Context(typing.NamedTuple): + function: typing.Optional[str] + argument: typing.Optional[str] + filename: str + module: str + lineno: int + + +def function(message: str = None) -> typing.Callable: + """ + A decorator which issues a warning if a wrapped function is called. + """ + + def decorator(f: typing.Callable) -> typing.Callable: + already_warned: typing.Set[Context] = set() + + @functools.wraps(f) + def wrapper(*args: typing.Any, **kwargs: typing.Any) -> typing.Any: + frame = sys._getframe(1) + try: + while frame: + # If function has multiple decorators, skip decorators as callers and find the real caller. + if frame.f_code.co_filename != __file__: + break + + frame = frame.f_back + + if not frame: + if message is None: + logger.warning( + "Calling a deprecated function '%(function)s'.", + { + 'function': f.__name__, + }, + ) + else: + logger.warning( + "Calling a deprecated function '%(function)s': %(message)s", + { + 'function': f.__name__, + 'message': message, + }, + ) + return f(*args, **kwargs) + + context = Context(f.__name__, None, frame.f_code.co_filename, frame.f_globals.get('__name__', None), frame.f_lineno) + + finally: + del frame + + if context in already_warned: + return f(*args, **kwargs) + already_warned.add(context) + + if message is None: + logger.warning("%(module)s: Calling a deprecated function '%(function)s' in '%(filename)s' at line %(lineno)s.", context._asdict()) + else: + logger.warning("%(module)s: Calling a deprecated function '%(function)s' in '%(filename)s' at line %(lineno)s: %(message)s", dict(context._asdict(), message=message)) + + return f(*args, **kwargs) + + return wrapper + + return decorator + + +def arguments(*deprecated_arguments: str, message: str = None) -> typing.Callable: + """ + A decorator which issues a warning if any of the ``deprecated_arguments`` is being + passed to the wrapped function. 
+ """ + + def decorator(f: typing.Callable) -> typing.Callable: + already_warned: typing.Set[Context] = set() + + @functools.wraps(f) + def wrapper(*args: typing.Any, **kwargs: typing.Any) -> typing.Any: + for argument in deprecated_arguments: + if argument in kwargs: + frame = sys._getframe(1) + try: + while frame: + # If function has multiple decorators, skip decorators as callers and find the real caller. + if frame.f_code.co_filename != __file__: + break + + frame = frame.f_back + + if not frame: + if message is None: + logger.warning( + "Providing a deprecated argument '%(argument)s' to '%(function)s' function.", + { + 'argument': argument, + 'function': f.__name__, + }, + ) + else: + logger.warning( + "Providing a deprecated argument '%(argument)s' to '%(function)s' function: %(message)s", + { + 'argument': argument, + 'function': f.__name__, + 'message': message, + }, + ) + break + + context = Context(f.__name__, argument, frame.f_code.co_filename, frame.f_globals.get('__name__', None), frame.f_lineno) + + finally: + del frame + + if context in already_warned: + break + already_warned.add(context) + + if message is None: + logger.warning( + "%(module)s: Providing a deprecated argument '%(argument)s' to '%(function)s' function in '%(filename)s' at line %(lineno)s.", + context._asdict(), + ) + else: + logger.warning( + "%(module)s: Providing a deprecated argument '%(argument)s' to '%(function)s' function in '%(filename)s' at line %(lineno)s: %(message)s", + dict(context._asdict(), message=message), + ) + + break + + return f(*args, **kwargs) + + return wrapper + + return decorator diff --git a/d3m/d3m/environment_variables.py b/d3m/d3m/environment_variables.py new file mode 100644 index 0000000..f586667 --- /dev/null +++ b/d3m/d3m/environment_variables.py @@ -0,0 +1,22 @@ +# Environment variables describing runtime environment. +# From inside Docker container it is not really possible to obtain +# information about the Docker image used for the container. This +# is why we use environment variable to pass this information in. +# See descriptions of "base_docker_image" and "docker_image" metadata. +D3M_BASE_IMAGE_NAME = 'D3M_BASE_IMAGE_NAME' +D3M_BASE_IMAGE_DIGEST = 'D3M_BASE_IMAGE_DIGEST' +D3M_IMAGE_NAME = 'D3M_IMAGE_NAME' +D3M_IMAGE_DIGEST = 'D3M_IMAGE_DIGEST' + +# Limits on CPU and memory compute resources available to the runtime +# can be communicated also through environment variables because it is +# not always easy to determine them from inside limited environment +# that not all resources visible are also available. +# Should be in Kubernetes units or equivalent. +# See: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/#meaning-of-cpu +# https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/#meaning-of-memory +D3M_CPU = 'D3MCPU' +D3M_RAM = 'D3MRAM' + +# Used by pipeline resolver to configure where to search for files with pipelines. +PIPELINES_PATH = 'PIPELINES_PATH' diff --git a/d3m/d3m/exceptions.py b/d3m/d3m/exceptions.py new file mode 100644 index 0000000..65bd006 --- /dev/null +++ b/d3m/d3m/exceptions.py @@ -0,0 +1,187 @@ + +class NotSupportedError(RuntimeError): + """ + Functionality is not supported. + """ + + +class NotSupportedVersionError(RuntimeError): + """ + This version is not supported. + """ + + +class InvalidArgumentValueError(ValueError): + """ + Provided argument to the function is invalid in value. 
+ """ + + +class InvalidReturnValueError(ValueError): + """ + Returned value from the function is invalid. + """ + + +class InvalidArgumentTypeError(TypeError): + """ + Provided argument to the function is invalid in type. + """ + + +class InvalidReturnTypeError(TypeError): + """ + Type of the returned value from the function is invalid. + """ + + +class NotFoundError(ValueError): + """ + Something requested could not be found. + """ + + +class AlreadyExistsError(ValueError): + """ + Something which should not exist already exists. + """ + + +class MismatchError(ValueError): + """ + A value does not match expected value. + """ + + +class MissingValueError(ValueError): + """ + The required value has not been provided. + """ + + +class DigestMismatchError(MismatchError): + """ + A digest does not match the expect digest. + """ + + +class DimensionalityMismatchError(MismatchError): + """ + Dimensionality mismatch occurs in array computations. + """ + + +class UnexpectedValueError(ValueError): + """ + Value occurred not in a fixed list of possible or supported values, + e.g., during parsing of data with expected schema. + """ + + +class UnexpectedTypeError(TypeError): + """ + Type occurred not in a fixed list of possible or supported types, + e.g., during parsing of data with expected schema. + """ + + +class DatasetUriNotSupportedError(NotSupportedError): + """ + Provided dataset URI is not supported. + """ + + +class ProblemUriNotSupportedError(NotSupportedError): + """ + Provided problem URI is not supported. + """ + + +class DatasetNotFoundError(FileNotFoundError, NotFoundError): + """ + Provided dataset URI cannot be resolved to a dataset. + """ + + +class ProblemNotFoundError(FileNotFoundError, NotFoundError): + """ + Provided problem URI cannot be resolved to a problem. + """ + + +class InvalidStateError(AssertionError): + """ + Program ended up in an invalid or unexpected state, or a state does not match the current code path. + """ + + +class InvalidMetadataError(ValueError): + """ + Metadata is invalid. + """ + + +class InvalidPrimitiveCodeError(ValueError): + """ + Primitive does not match standard API. + """ + + +class ColumnNameError(KeyError): + """ + Table column with name not found. + """ + + +class InvalidPipelineError(ValueError): + """ + Pipeline is invalid. + """ + + +class InvalidPipelineRunError(ValueError): + """ + Pipeline run is invalid. + """ + + +class InvalidProblemError(ValueError): + """ + Problem description is invalid. + """ + + +class InvalidDatasetError(ValueError): + """ + Dataset is invalid. + """ + + +class PrimitiveNotFittedError(InvalidStateError): + """ + The primitive has not been fitted. + """ + + +class PermissionDeniedError(RuntimeError): + """ + No permissions to do or access something. + """ + + +class StepFailedError(RuntimeError): + """ + Running a pipeline step failed. + """ + + +class SamplingError(ArithmeticError): + """ + Error during sampling. + """ + + +class SamplingNotPossibleError(SamplingError): + """ + Sampling is not possible. 
+ """ diff --git a/d3m/d3m/index.py b/d3m/d3m/index.py new file mode 100644 index 0000000..8f948fa --- /dev/null +++ b/d3m/d3m/index.py @@ -0,0 +1,538 @@ +import argparse +import contextlib +import json +import hashlib +import importlib +import importlib.abc +import importlib.machinery +import inspect +import logging +import os.path +import pprint +import subprocess +import shutil +import sys +import time +import traceback +import typing +from xmlrpc import client as xmlrpc # type: ignore + +import frozendict # type: ignore +import pycurl # type: ignore + +from d3m import exceptions, namespace, utils +from d3m.primitive_interfaces import base + +__all__ = ('search', 'get_primitive', 'get_primitive_by_id', 'get_loaded_primitives', 'load_all', 'register_primitive', 'discover') + +logger = logging.getLogger(__name__) + +DEFAULT_INDEX = 'https://pypi.org/pypi' +DEFAULT_OUTPUT = '.' + + +class _SENTINEL_TYPE: + __slots__ = () + + def __repr__(self) -> str: + return '_SENTINEL' + + +_SENTINEL = _SENTINEL_TYPE() + +_loaded_primitives: typing.Set[typing.Type[base.PrimitiveBase]] = set() + + +def search(*, primitive_path_prefix: str = None) -> typing.Sequence[str]: + """ + Returns a list of primitive paths (Python paths under ``d3m.primitives`` namespace) + for all known (discoverable through entry points) primitives, or limited by the + ``primitive_path_prefix`` search argument. + + Not all returned primitive paths are not necessary loadable and it is not necessary that + they are all really pointing to primitive classes, because this method does not try to + load them yet to determine any of that. + + Parameters + ---------- + primitive_path_prefix: + Optionally limit returned primitive paths only to those whose path start with ``primitive_name_prefix``. + + Returns + ------- + A list of primitive paths. + """ + + if primitive_path_prefix is None: + primitive_path_prefix = '' + + results = [] + + for entry_point in namespace.entry_points(): + primitive_path = 'd3m.primitives.{entry_point_name}'.format( + entry_point_name=entry_point.name, + ) + + if primitive_path.startswith(primitive_path_prefix): + results.append(primitive_path) + + # We also go over all loaded primitives to also search over any primitives directly + # registered using "register_primitive" and not through an entry point. + for primitive in get_loaded_primitives(): + primitive_path = primitive.metadata.query()['python_path'] + + if primitive_path in results: + continue + + if primitive_path.startswith(primitive_path_prefix): + results.append(primitive_path) + + return sorted(results) + + +def get_primitive(primitive_path: str) -> typing.Type[base.PrimitiveBase]: + """ + Loads (if not already) a primitive class and returns it. + + Parameters + ---------- + primitive_path: + A Python path under ``d3m.primitives`` namespace of a primitive. + + Returns + ------- + A primitive class. + """ + + if not primitive_path: + raise exceptions.InvalidArgumentValueError("Primitive path is required.") + + if not primitive_path.startswith('d3m.primitives.'): + raise exceptions.InvalidArgumentValueError("Primitive path does not start with \"d3m.primitives\".") + + path, name = primitive_path.rsplit('.', 1) + + module = importlib.import_module(path) + + return getattr(module, name) + + +def get_primitive_by_id(primitive_id: str) -> typing.Type[base.PrimitiveBase]: + """ + Returns a primitive class based on its ID from all currently loaded primitives. + + Parameters + ---------- + primitive_id: + An ID of a primitive. 
+ + Returns + ------- + A primitive class. + """ + + for primitive in get_loaded_primitives(): + if primitive.metadata.query()['id'] == primitive_id: + return primitive + + raise exceptions.InvalidArgumentValueError("Unable to get primitive '{primitive_id}'.".format(primitive_id=primitive_id)) + + +def get_loaded_primitives() -> typing.Sequence[typing.Type[base.PrimitiveBase]]: + """ + Returns a list of all currently loaded primitives. + + Returns + ------- + A list of all currently loaded primitives. + """ + + return list(_loaded_primitives) + + +def load_all(blocklist: typing.Collection[str] = None) -> None: + """ + Loads all primitives available and populates ``d3m.primitives`` namespace with them. + + If a primitive cannot be loaded, an error is logged, but loading of other primitives + continue. + + Parameters + ---------- + blocklist: + A collection of primitive path prefixes to not (try to) load. + """ + + if blocklist is None: + blocklist = [] + + for primitive_path in search(): + if any(primitive_path.startswith(blocklist_prefix) for blocklist_prefix in blocklist): + continue + + try: + get_primitive(primitive_path) + except Exception: + logger.exception("Could not load the primitive: %(primitive_path)s", {'primitive_path': primitive_path}) + + +# TODO: "primitive_path" is not really necessary because it could just be extracted from primitive's metadata. +# We do not allow them to be different anyway. +def register_primitive(primitive_path: str, primitive: typing.Type[base.PrimitiveBase]) -> None: + """ + Registers a primitive under ``d3m.primitives`` namespace. + + This is useful to register primitives not necessary installed on the system + or which are generated at runtime. It is also useful for testing purposes. + + ``primitive_path`` has to start with ``d3m.primitives``. + + Parameters + ---------- + primitive_path: + A primitive path to register a primitive under. + primitive: + A primitive class to register. + """ + + if not primitive_path: + raise exceptions.InvalidArgumentValueError("Path under which to register a primitive is required.") + + if not primitive_path.startswith('d3m.primitives.'): + raise exceptions.InvalidArgumentValueError("Path under which to register a primitive does not start with \"d3m.primitives\".") + + if not inspect.isclass(primitive): + raise exceptions.InvalidArgumentTypeError("Primitive to register has to be a class.") + + if not issubclass(primitive, base.PrimitiveBase): + raise exceptions.InvalidArgumentTypeError("Primitive to register is not a subclass of PrimitiveBase.") + + if primitive.metadata.query()['python_path'] != primitive_path: + raise exceptions.InvalidArgumentValueError("Primitive's \"python_path\" in metadata does not match the path under which to register it: {python_path} vs. {primitive_path}".format( + python_path=primitive.metadata.query()['python_path'], + primitive_path=primitive_path, + )) + + modules_path, name = primitive_path.rsplit('.', 1) + # We remove "d3m.primitives" from the list of modules. + modules = modules_path.split('.')[2:] + + if 'd3m.primitives' not in sys.modules: + import d3m.primitives # type: ignore + + # Create any modules which do not yet exist. + current_path = 'd3m.primitives' + for module_name in modules: + module_path = current_path + '.' + module_name + + if module_path not in sys.modules: + try: + importlib.import_module(module_path) + except ModuleNotFoundError: + # This can happen if this module is not listed in any of entry points. 
But we want to allow + # registering primitives also outside of existing entry points, so we create a module here. + + # Because we just could not load the module, we know that if the attribute exists, + # it has to be something else, which we do not want to clobber. + if hasattr(sys.modules[current_path], module_name): + raise ValueError("'{module_path}' is already defined.".format(module_path)) + + module_spec = importlib.machinery.ModuleSpec(module_path, namespace.Loader(), is_package=True) + module = importlib.util.module_from_spec(module_spec) + module_spec.loader.exec_module(module) + + sys.modules[module_path] = module + setattr(sys.modules[current_path], module_name, module) + + current_path = module_path + + if hasattr(sys.modules[current_path], name): + existing_value = getattr(sys.modules[current_path], name) + # Registering twice the same primitive is a noop. + if existing_value is primitive: + return + + # Maybe we are just registering this primitive. But if not... + if existing_value is not _SENTINEL: + raise ValueError("'{module}.{name}' is already defined as '{existing_value}'.".format(module=current_path, name=name, existing_value=existing_value)) + + setattr(sys.modules[current_path], name, primitive) + _loaded_primitives.add(primitive) + + +def discover(index: str = 'https://pypi.org/pypi') -> typing.Tuple[str, ...]: + """ + Returns package names from PyPi which provide D3M primitives. + + This is determined by them having a ``d3m_primitive`` among package keywords. + + Parameters + ---------- + index: + Base URL of Python Package Index to use. + + Returns + ------- + A list of package names. + """ + + client = xmlrpc.ServerProxy(index) + hits = client.search({'keywords': 'd3m_primitive'}) + return tuple(sorted({package['name'] for package in hits})) + + +def download_files(primitive_metadata: frozendict.FrozenOrderedDict, output: str, redownload: bool) -> None: + last_progress_call = None + + def curl_progress(download_total: int, downloaded: int, upload_total: int, uploaded: int) -> None: + nonlocal last_progress_call + + # Output at most once every 10 seconds. + now = time.time() + if last_progress_call is None or now - last_progress_call > 10: + last_progress_call = now + + print("Downloaded {downloaded}/{download_total} B".format( + downloaded=downloaded, + download_total=download_total, + ), flush=True) + + for installation_entry in primitive_metadata.get('installation', []): + if installation_entry['type'] not in ['FILE', 'TGZ']: + continue + + # We store into files based on digest. In this way we deduplicate same + # files used by multiple primitives. 
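As a sketch of how the index functions defined above are used together: `search` lists primitive paths known through entry points or earlier `register_primitive` calls, and registration itself requires a `PrimitiveBase` subclass with a matching `python_path`. The prefix and the commented-out class below are only examples.
```
from d3m import index

# List known primitive paths under an example prefix; prints nothing if no
# matching primitives are installed or registered.
for primitive_path in index.search(primitive_path_prefix='d3m.primitives.evaluation'):
    print(primitive_path)

# Registering a custom primitive (class assumed to exist, to subclass
# PrimitiveBase, and to carry this exact "python_path" in its metadata):
# index.register_primitive('d3m.primitives.operator.my_op.MyProject', MyPrimitive)
```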
+ output_path = os.path.join(output, installation_entry['file_digest']) + + if installation_entry['type'] == 'FILE': + if os.path.isfile(output_path) and not redownload: + print("File for volume {type}/{key} for primitive {python_path} ({primitive_id}) already exists, skipping: {file_uri}".format( + python_path=primitive_metadata['python_path'], + primitive_id=primitive_metadata['id'], + type=installation_entry['type'], + key=installation_entry['key'], + file_uri=installation_entry['file_uri'], + ), flush=True) + continue + elif installation_entry['type'] == 'TGZ': + if os.path.isdir(output_path) and not redownload: + print("Directory for volume {type}/{key} for primitive {python_path} ({primitive_id}) already exists, skipping: {file_uri}".format( + python_path=primitive_metadata['python_path'], + primitive_id=primitive_metadata['id'], + type=installation_entry['type'], + key=installation_entry['key'], + file_uri=installation_entry['file_uri'], + ), flush=True) + continue + + # Cleanup. + if os.path.isdir(output_path): + shutil.rmtree(output_path) + elif os.path.exists(output_path): + os.remove(output_path) + + print("Downloading file for volume {type}/{key} for primitive {python_path} ({primitive_id}): {file_uri}".format( + python_path=primitive_metadata['python_path'], + primitive_id=primitive_metadata['id'], + type=installation_entry['type'], + key=installation_entry['key'], + file_uri=installation_entry['file_uri'], + ), flush=True) + + output_file_obj: typing.BinaryIO = None + output_tar_process = None + + try: + if installation_entry['type'] == 'FILE': + output_file_obj = open(output_path, 'wb') + elif installation_entry['type'] == 'TGZ': + os.makedirs(output_path, mode=0o755, exist_ok=True) + output_tar_process = subprocess.Popen(['tar', '-xz', '-C', output_path], stdin=subprocess.PIPE) + output_file_obj = typing.cast(typing.BinaryIO, output_tar_process.stdin) + + hash = hashlib.sha256() + downloaded = 0 + start = time.time() + + def write(data: bytes) -> None: + nonlocal hash + nonlocal downloaded + + hash.update(data) + downloaded += len(data) + + output_file_obj.write(data) + + while True: + try: + with contextlib.closing(pycurl.Curl()) as curl: + curl.setopt(curl.URL, installation_entry['file_uri']) + curl.setopt(curl.WRITEFUNCTION, write) + curl.setopt(curl.NOPROGRESS, False) + curl.setopt(curl.FOLLOWLOCATION, True) + curl.setopt(getattr(curl, 'XFERINFOFUNCTION', curl.PROGRESSFUNCTION), curl_progress) + curl.setopt(curl.LOW_SPEED_LIMIT, 30 * 1024) + curl.setopt(curl.LOW_SPEED_TIME, 30) + curl.setopt(curl.RESUME_FROM, downloaded) + + curl.perform() + break + + except pycurl.error as error: + if error.args[0] == pycurl.E_OPERATION_TIMEDOUT: + # If timeout, retry/resume. + print("Timeout. Retrying.", flush=True) + else: + raise + + end = time.time() + + print("Downloaded {downloaded} B in {seconds} second(s).".format( + downloaded=downloaded, + seconds=end - start, + ), flush=True) + + if output_tar_process is not None: + # Close the input to the process to signal that we are done. + output_file_obj.close() + output_file_obj = None + + # Wait for 60 seconds to finish writing everything out. + if output_tar_process.wait(60) != 0: + raise subprocess.CalledProcessError(output_tar_process.returncode, output_tar_process.args) + output_tar_process = None + + if installation_entry['file_digest'] != hash.hexdigest(): + raise ValueError("Digest for downloaded file does not match one from metadata. Metadata digest: {metadata_digest}. 
Computed digest: {computed_digest}.".format( + metadata_digest=installation_entry['file_digest'], + computed_digest=hash.hexdigest(), + )) + + except Exception: + # Cleanup. + if output_tar_process is not None: + try: + output_tar_process.kill() + output_tar_process.wait() + output_file_obj = None + except Exception: + # We ignore errors cleaning up. + pass + if os.path.isdir(output_path): + shutil.rmtree(output_path) + elif os.path.exists(output_path): + os.remove(output_path) + + raise + + finally: + if output_file_obj is not None: + output_file_obj.close() + + +# TODO: Add more ways to search for primitives (by name, keywords, etc.). +# TODO: Allow displaying results with more than just a primitive path. +def search_handler(arguments: argparse.Namespace) -> None: + for primitive_path in search(primitive_path_prefix=getattr(arguments, 'prefix', None)): + print(primitive_path) + + +def discover_handler(arguments: argparse.Namespace) -> None: + for package_name in discover(index=getattr(arguments, 'index', DEFAULT_INDEX)): + print(package_name) + + +def describe_handler(arguments: argparse.Namespace) -> None: + output_stream = getattr(arguments, 'output', sys.stdout) + + has_errored = False + + for primitive_path in arguments.primitives: + if getattr(arguments, 'list', False): + print(primitive_path, file=output_stream) + + try: + try: + primitive = get_primitive(primitive_path) + except Exception: + primitive = None + + if primitive is None: + load_all() + primitive = get_primitive_by_id(primitive_path) + except Exception as error: + if getattr(arguments, 'continue', False): + traceback.print_exc(file=output_stream) + print(f"Error loading primitive: {primitive_path}", file=output_stream) + has_errored = True + continue + else: + raise Exception(f"Error loading primitive: {primitive_path}") from error + + try: + # Using "to_json_structure" and not "to_internal_json_structure" because + # it is not indented that this would be parsed back directly, but just used + # to know where to find the primitive (using "installation" section). 
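Programmatically, the same lookup that `describe_handler` performs for each command-line argument can be sketched as follows; it assumes the contrib scoring primitive added earlier in this commit is exposed through an entry point so that its Python path resolves.
```
from d3m import index

# Load a primitive by its Python path and dump its metadata, mirroring what
# "describe_handler" does for each path given on the command line.
primitive = index.get_primitive('d3m.primitives.evaluation.compute_scores.Core')
description = primitive.metadata.to_json_structure()
print(description['id'], description['python_path'])
```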
+ primitive_description = primitive.metadata.to_json_structure() + + if getattr(arguments, 'print', False): + pprint.pprint(primitive_description, stream=output_stream) + + else: + json.dump( + primitive_description, + output_stream, + indent=(getattr(arguments, 'indent', 2) or None), + sort_keys=getattr(arguments, 'sort_keys', False), + allow_nan=False, + ) # type: ignore + output_stream.write('\n') + except Exception as error: + if getattr(arguments, 'continue', False): + traceback.print_exc(file=output_stream) + print(f"Error describing primitive: {primitive_path}", file=output_stream) + has_errored = True + continue + else: + raise Exception(f"Error describing primitive: {primitive_path}") from error + + if has_errored: + sys.exit(1) + + +def download_handler(arguments: argparse.Namespace) -> None: + for primitive_path in search(primitive_path_prefix=getattr(arguments, 'prefix', None)): + try: + primitive_class = get_primitive(primitive_path) + except Exception: + logger.exception("Could not load the primitive: %(primitive_path)s", {'primitive_path': primitive_path}) + continue + + try: + download_files(primitive_class.metadata.query(), getattr(arguments, 'output', DEFAULT_OUTPUT), getattr(arguments, 'redownload', False)) + except Exception: + logger.exception("Error downloading files for: %(primitive_path)s", {'primitive_path': primitive_path}) + + +def main(argv: typing.Sequence) -> None: + # We have to disable importing while type checking because it makes + # an import cycle in mypy which makes many typing errors. + if not typing.TYPE_CHECKING: + # Importing here to prevent import cycle. + from d3m import cli + + logging.basicConfig() + + logger.warning("This CLI is deprecated. Use \"python3 -m d3m index\" instead.") + + parser = argparse.ArgumentParser(description="Explore D3M primitives.") + cli.primitive_configure_parser(parser) + + arguments = parser.parse_args(argv[1:]) + + cli.primitive_handler(arguments, parser) + + +if __name__ == '__main__': + main(sys.argv) diff --git a/d3m/d3m/metadata/__init__.py b/d3m/d3m/metadata/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/d3m/d3m/metadata/base.py b/d3m/d3m/metadata/base.py new file mode 100644 index 0000000..3861f37 --- /dev/null +++ b/d3m/d3m/metadata/base.py @@ -0,0 +1,4034 @@ +import collections +import copy +import datetime +import functools +import json +import logging +import inspect +import itertools +import operator +import os.path +import pickle +import re +import sys +import types +import typing +from urllib import parse as url_parse + +import frozendict # type: ignore +import jsonschema # type: ignore +import numpy # type: ignore +import pandas # type: ignore +from pytypes import type_util # type: ignore + +import d3m +from . 
import hyperparams as hyperparams_module, primitive_names +from d3m import deprecate, exceptions, utils + +# See: https://gitlab.com/datadrivendiscovery/d3m/issues/66 +try: + from pyarrow import lib as pyarrow_lib # type: ignore +except ModuleNotFoundError: + pyarrow_lib = None + +__all__ = ( + 'ALL_ELEMENTS', 'NO_VALUE', 'DataMetadata', 'PrimitiveMetadata', 'CONTAINER_SCHEMA_VERSION', + 'DATA_SCHEMA_VERSION', 'PRIMITIVE_SCHEMA_VERSION', 'PrimitiveMethodKind', + 'PrimitiveArgumentKind', 'PrimitiveInstallationType', 'PrimitiveAlgorithmType', + 'PrimitiveFamily', 'PrimitivePrecondition', 'PrimitiveEffect', 'ForeignKeyType', 'Context', + 'PipelineRunPhase', 'PipelineStepType', 'PipelineRunStatusState', 'ArgumentType', +) + +logger = logging.getLogger(__name__) + + +def _return_all_elements() -> 'ALL_ELEMENTS_TYPE': + return ALL_ELEMENTS + + +@functools.total_ordering +class ALL_ELEMENTS_TYPE: + __slots__ = () + + def __repr__(self) -> str: + return '__ALL_ELEMENTS__' + + def __lt__(self, other: typing.Any) -> bool: + # "ALL_ELEMENTS" is smaller than anything else, and equal to itself. + # "ALL_ELEMENTS" is a singleton, so is equal only if referentially equal + # (which is a default implementation of "__eq__"). + return self != other + + def __deepcopy__(self, memo: typing.Dict) -> 'ALL_ELEMENTS_TYPE': + return ALL_ELEMENTS + + def __copy__(self) -> 'ALL_ELEMENTS_TYPE': + return ALL_ELEMENTS + + def __reduce__(self) -> typing.Tuple[typing.Callable, typing.Tuple]: + return _return_all_elements, () + + +def _return_no_value() -> 'NO_VALUE_TYPE': + return NO_VALUE + + +class NO_VALUE_TYPE: + __slots__ = () + + def __repr__(self) -> str: + return '__NO_VALUE__' + + def __deepcopy__(self, memo: typing.Dict) -> 'NO_VALUE_TYPE': + return NO_VALUE + + def __copy__(self) -> 'NO_VALUE_TYPE': + return NO_VALUE + + def __reduce__(self) -> typing.Tuple[typing.Callable, typing.Tuple]: + return _return_no_value, () + + +ALL_ELEMENTS = ALL_ELEMENTS_TYPE() +NO_VALUE = NO_VALUE_TYPE() + +COMMIT_HASH_REGEX = re.compile(r'^[0-9a-f]{40}$') + +ARGUMENT_NAME_REGEX = re.compile(r'^[A-Za-z][A-Za-z_0-9]*$') + +CONTAINER_SCHEMA_VERSION = 'https://metadata.datadrivendiscovery.org/schemas/v0/container.json' +DATA_SCHEMA_VERSION = 'https://metadata.datadrivendiscovery.org/schemas/v0/data.json' +PRIMITIVE_SCHEMA_VERSION = 'https://metadata.datadrivendiscovery.org/schemas/v0/primitive.json' + +SCHEMAS_PATH = os.path.join(os.path.dirname(__file__), 'schemas', 'v0') + +# A map of all known schemas from their URIs to loaded JSONs. Not validated. +SCHEMAS = {} +for schema_uri in [ + CONTAINER_SCHEMA_VERSION, + DATA_SCHEMA_VERSION, + 'https://metadata.datadrivendiscovery.org/schemas/v0/definitions.json', + 'https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json', + 'https://metadata.datadrivendiscovery.org/schemas/v0/pipeline_run.json', + PRIMITIVE_SCHEMA_VERSION, + 'https://metadata.datadrivendiscovery.org/schemas/v0/problem.json', +]: + schema_filename = os.path.basename(schema_uri) + with open(os.path.join(SCHEMAS_PATH, schema_filename), 'r', encoding='utf8') as schema_file: + SCHEMAS[schema_uri] = json.load(schema_file) + +# We validate schemas using unmodified validator. 
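+# (Editorial note: "check_schema" below only asserts that each schema is itself a valid Draft 7 JSON Schema;
+# validators for instances are built afterwards via "utils.load_schema_validators".)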
+for schema_json in SCHEMAS.values(): + jsonschema.Draft7Validator.check_schema(schema_json) + +DEFINITIONS_JSON = SCHEMAS['https://metadata.datadrivendiscovery.org/schemas/v0/definitions.json'] + +CONTAINER_SCHEMA_VALIDATOR, DATA_SCHEMA_VALIDATOR, PRIMITIVE_SCHEMA_VALIDATOR = utils.load_schema_validators(SCHEMAS, ('container.json', 'data.json', 'primitive.json')) + +HYPERPARAMETER_REQUIRED_SEMANTIC_TYPES = { + 'https://metadata.datadrivendiscovery.org/types/TuningParameter', + 'https://metadata.datadrivendiscovery.org/types/ControlParameter', + 'https://metadata.datadrivendiscovery.org/types/ResourcesUseParameter', + 'https://metadata.datadrivendiscovery.org/types/MetafeatureParameter', +} + +TABULAR_SEMANTIC_TYPES = { + 'https://metadata.datadrivendiscovery.org/types/Table', + 'https://metadata.datadrivendiscovery.org/types/TabularRow', + 'https://metadata.datadrivendiscovery.org/types/TabularColumn', +} + +ALL_SEMANTIC_TYPES = set(utils._get_names(DEFINITIONS_JSON, 'definitions.semantic_types.items.anyOf[*].enum[*]')) + +# A list of all fields which is being generated by "_generate_metadata" method. +ALL_GENERATED_FIELDS = [ + 'schema', + 'structural_type', + 'semantic_types', + 'dimension', + 'name', +] + +PrimitiveMethodKind = utils.create_enum_from_json_schema_enum( + 'PrimitiveMethodKind', DEFINITIONS_JSON, + 'definitions.primitive_code.properties.instance_methods.additionalProperties.properties.kind.oneOf[*].enum[*]', + module=__name__, +) +PrimitiveArgumentKind = utils.create_enum_from_json_schema_enum( + 'PrimitiveArgumentKind', DEFINITIONS_JSON, + 'definitions.primitive_code.properties.arguments.additionalProperties.properties.kind.oneOf[*].enum[*]', + module=__name__, +) +PrimitiveInstallationType = utils.create_enum_from_json_schema_enum( + 'PrimitiveInstallationType', DEFINITIONS_JSON, + [ + 'definitions.installation.items.oneOf[*].properties.type.enum[*]', + 'definitions.installation.items.oneOf[*].allOf[*].properties.type.enum[*]' + ], + module=__name__, +) +PrimitiveAlgorithmType = utils.create_enum_from_json_schema_enum( + 'PrimitiveAlgorithmType', DEFINITIONS_JSON, + 'definitions.algorithm_types.items.oneOf[*].enum[*]', + module=__name__, +) +PrimitiveFamily = utils.create_enum_from_json_schema_enum( + 'PrimitiveFamily', DEFINITIONS_JSON, + 'definitions.primitive_family.oneOf[*].enum[*]', + module=__name__, +) +PrimitivePrecondition = utils.create_enum_from_json_schema_enum( + 'PrimitivePrecondition', DEFINITIONS_JSON, + 'definitions.preconditions.items.oneOf[*].enum[*]', + module=__name__, +) +PrimitiveEffect = utils.create_enum_from_json_schema_enum( + 'PrimitiveEffect', DEFINITIONS_JSON, + 'definitions.effects.items.oneOf[*].enum[*]', + module=__name__, +) +ForeignKeyType = utils.create_enum_from_json_schema_enum( + 'ForeignKeyType', DEFINITIONS_JSON, + 'definitions.foreign_key.oneOf[*].properties.type.enum[*]', + module=__name__, +) +Context = utils.create_enum_from_json_schema_enum( + 'Context', DEFINITIONS_JSON, + 'definitions.context.oneOf[*].enum[*]', + module=__name__, +) +PipelineRunPhase = utils.create_enum_from_json_schema_enum( + 'PipelineRunPhase', DEFINITIONS_JSON, + 'definitions.pipeline_run.properties.phase.anyOf[*].enum[*]', + module=__name__, +) +PipelineStepType = utils.create_enum_from_json_schema_enum( + 'PipelineStepType', DEFINITIONS_JSON, + 'definitions.pipeline_steps.items.oneOf[*].properties.type.enum[*]', + module=__name__, +) +PipelineRunStatusState = utils.create_enum_from_json_schema_enum( + 'StatusState', DEFINITIONS_JSON, + 
'definitions.status.properties.state.enum[*]', + module=__name__, +) +# Enumeration of argument and hyper-parameter types to a primitive in a step. +ArgumentType = utils.create_enum_from_json_schema_enum( + 'ArgumentType', DEFINITIONS_JSON, + 'definitions[container_argument,container_arguments,primitive_argument,primitive_arguments,data_argument,data_arguments,value_argument].properties.type.enum[*]', + module=__name__, +) + +M = typing.TypeVar('M', bound='MetadataEntry') +T = typing.TypeVar('T', bound='Metadata') +D = typing.TypeVar('D', bound='DataMetadata') +P = typing.TypeVar('P', bound='PrimitiveMetadata') +SimpleSelectorSegment = typing.Union[int, str] +SelectorSegment = typing.Union[SimpleSelectorSegment, ALL_ELEMENTS_TYPE] +ListSelector = typing.List[SelectorSegment] +TupleSelector = typing.Tuple[SelectorSegment, ...] +# A list or tuple of integers, strings, or ALL_ELEMENTS. +Selector = typing.Union[ListSelector, TupleSelector] + +# We register additional immutable values. We are doing it this way to overcome issues with import cycles. +if ALL_ELEMENTS not in utils.additional_immutable_values: + utils.additional_immutable_values += (ALL_ELEMENTS,) +if NO_VALUE not in utils.additional_immutable_values: + utils.additional_immutable_values += (NO_VALUE,) + + +class ColumnReference(typing.NamedTuple): + resource_id: str + column_index: int + + +class MetadataEntry: + __slots__ = ('elements', 'all_elements', 'metadata', 'is_empty', 'is_elements_empty') + + def __init__( + self, elements: utils.PMap = utils.EMPTY_PMAP, all_elements: 'MetadataEntry' = None, + metadata: frozendict.FrozenOrderedDict = frozendict.FrozenOrderedDict(), is_empty: bool = True, + is_elements_empty: bool = True, + ) -> None: + self.elements = elements + self.all_elements = all_elements + self.metadata = metadata + self.is_empty = is_empty + self.is_elements_empty = is_elements_empty + + def copy(self: M) -> M: + return type(self)(self.elements, self.all_elements, self.metadata, self.is_empty, self.is_elements_empty) + + def __copy__(self: M) -> M: + return self.copy() + + def update_is_empty(self) -> None: + self.is_empty = not self.metadata and self.is_elements_empty and self.all_elements is None + + +class Metadata: + """ + A basic class to be used as a value for `metadata` attribute + on values passed between primitives. + + Instances are immutable. + + Parameters + ---------- + metadata: + Optional initial metadata for the top-level of the value. + source: + DEPRECATED: argument ignored. + timestamp: + DEPRECATED: argument ignored. + """ + + @deprecate.arguments('source', 'timestamp', message="argument ignored") + def __init__(self, metadata: typing.Dict[str, typing.Any] = None, *, source: typing.Any = None, timestamp: datetime.datetime = None) -> None: + self._current_metadata = MetadataEntry() + + self._hash: int = None + + if metadata is not None: + self._update_in_place((), metadata, self._current_metadata) + + @deprecate.arguments('source', 'timestamp', message="argument ignored") + def update(self: T, selector: Selector, metadata: typing.Dict[str, typing.Any], *, source: typing.Any = None, timestamp: datetime.datetime = None) -> T: + """ + Updates metadata with new ``metadata`` for data pointed to with ``selector``. + + If value of any field is ``NO_VALUE``, that field is deleted. + + It returns a copy of this metadata object with new metadata applied. + + Parameters + ---------- + selector: + A selector pointing to data. + metadata: + A map of fields and values with metadata. 
+ source: + DEPRECATED: argument ignored. + timestamp: + DEPRECATED: argument ignored. + + Returns + ------- + Updated metadata. + """ + + cls = type(self) + + new_metadata = cls() + + new_metadata._update_in_place(selector, metadata, self._current_metadata) + + return new_metadata + + @deprecate.arguments('source', 'timestamp', message="argument ignored") + def remove(self: T, selector: Selector, *, recursive: bool = False, strict_all_elements: bool = False, + source: typing.Any = None, timestamp: datetime.datetime = None) -> T: + """ + Removes all metadata at ``selector``. + + Parameters + ---------- + selector: + A selector to remove metadata at. + recursive: + Should remove also all metadata under the ``selector``? + strict_all_elements: + If ``True``, then when removing ``ALL_ELEMENTS`` entry, do not remove also metadata for all elements it matches. + source: + DEPRECATED: argument ignored. + timestamp: + DEPRECATED: argument ignored. + + Returns + ------- + Updated metadata. + """ + + cls = type(self) + + new_metadata = cls() + + new_metadata._remove_in_place(selector, recursive, strict_all_elements, self._current_metadata) + + return new_metadata + + @deprecate.function(message="create a DataMetadata instance explicitly instead") + @deprecate.arguments('source', 'timestamp', message="argument ignored") + def clear(self: T, metadata: typing.Dict[str, typing.Any] = None, *, source: typing.Any = None, timestamp: datetime.datetime = None) -> T: + """ + DEPRECATED: create a Metadata instance explicitly instead. + + Creates and returns a new (clear) metadata object. + + Parameters + ---------- + metadata: + Optional new initial metadata for the top-level of the value. + source: + DEPRECATED: argument ignored. + timestamp: + DEPRECATED: argument ignored. + + Returns + ------- + New metadata object. + """ + + return type(self)(metadata) + + def _update_in_place(self, selector: Selector, metadata: typing.Dict[str, typing.Any], + parent_current_metadata: MetadataEntry) -> None: + """ + This method exist only for internal purposes and you should never ever call this to update metadata from outside. + """ + + self.check_selector(selector) + + # If metadata is already an instance of frozen dict, we just check that it is immutable. + if isinstance(metadata, frozendict.FrozenOrderedDict): + utils.check_immutable(metadata) + else: + metadata = utils.make_immutable_copy(metadata) + + if not isinstance(metadata, frozendict.FrozenOrderedDict): + raise exceptions.InvalidArgumentTypeError("Metadata should be a dict.") + + self._current_metadata = self._update(selector, parent_current_metadata, metadata) + + def _remove_in_place(self, selector: Selector, recursive: bool, strict_all_elements: bool, + parent_current_metadata: MetadataEntry) -> None: + """ + This method exist only for internal purposes and you should never ever call this to remove metadata from outside. + """ + + self.check_selector(selector) + + self._current_metadata = self._remove(selector, recursive, strict_all_elements, parent_current_metadata) + + # TODO: Allow querying only a subset of metadata (not the whole dict). + # TODO: Maybe cache results? LRU? + def query(self, selector: Selector, *, ignore_all_elements: bool = False, remove_no_value: bool = True) -> frozendict.FrozenOrderedDict: + """ + Returns metadata for data pointed to with ``selector``. + + When querying using ``ALL_ELEMENTS`` means only metadata which has been set using ALL_ELEMENTS + is returned. 
+ + Parameters + ---------- + selector: + A selector to query metadata for. + ignore_all_elements: + By default, metadata from ALL_ELEMENTS is merged with metadata for an element itself. + By setting this argument to ``True``, this is disabled and just metadata from an element is returned. + remove_no_value: + By default all ``NO_VALUE`` values are removed. If set to ``False``, they are not removed. + + Returns + ------- + Metadata at a given selector. + """ + + self.check_selector(selector) + + metadata = self._query(selector, self._current_metadata, 0 if ignore_all_elements else None) + + if remove_no_value: + return self._remove_no_value(metadata) + else: + return metadata + + def query_with_exceptions(self, selector: Selector, *, remove_no_value: bool = True) -> typing.Tuple[frozendict.FrozenOrderedDict, typing.Dict[TupleSelector, frozendict.FrozenOrderedDict]]: + """ + In addition to returning metadata for data pointed to with ``selector``, this method for every ``ALL_ELEMENTS`` + selector segment also returns a map between selectors and metadata for all elements which have metadata + which differs from that of ``ALL_ELEMENTS``. + + Parameters + ---------- + selector: + A selector to query metadata for. + remove_no_value: + By default all ``NO_VALUE`` values are removed. If set to ``False``, they are not removed. + + Returns + ------- + A tuple of metadata at a given selector and a dict of exceptions. + """ + + self.check_selector(selector) + + metadata = self._query(selector, self._current_metadata, None) + if remove_no_value: + metadata = self._remove_no_value(metadata) + + exceptions = self._query_exceptions(selector, self._current_metadata) + + exceptions_with_selectors = {} + for exception_selector in exceptions: + exception_metadata = self._query(exception_selector, self._current_metadata, None) + if remove_no_value: + exception_metadata = self._remove_no_value(exception_metadata) + + if exception_metadata and exception_metadata != metadata: + exceptions_with_selectors[exception_selector] = exception_metadata + + return metadata, exceptions_with_selectors + + def query_field(self, selector: Selector, field: str, *, strict_all_elements: bool = True) -> typing.Any: + """ + Queries metadata for data pointed to with ``selector`` and returns only the + ``field`` of that metadata. Raises `KeyError` exception if metadata or field + is not set. + + ``field`` represents only top-level fields in metadata. + + Parameters + ---------- + selector: + A selector to query metadata for. + field: + A field name to query. + strict_all_elements: + If set, the method does not just return ``field`` value of the metadata + under ``selector``, but checks that the value really holds for all + elements matching the ``selector``, without exception. This is helpful + also if metadata is not compacted and ``field`` value is the same + across all elements, but ``ALL_ELEMENTS`` metadata does not contain + that field. + + Returns + ------- + A value of ``field`` of metadata at ``selector``. + """ + + if not strict_all_elements: + return self.query(selector)[field] + + metadata, exceptions_with_selectors = self.query_with_exceptions(selector) + + # We have a candidate which potentially holds for all elements. + if field in metadata: + value = metadata[field] + + for exception_metadata in exceptions_with_selectors.values(): + # Is there an exception for this field? We care only if field exists, + # then it has to match in the value. 
But if field does not exist, + # value from "metadata" will be used anyway, so that is OK. + if field in exception_metadata and exception_metadata[field] != value: + raise KeyError("Field '{field}' is not the same across all elements.".format(field=field)) + + return value + + # If selector is without "ALL_ELEMENTS" then field is simply not set. + if ALL_ELEMENTS not in selector: + assert not exceptions_with_selectors + raise KeyError("Field '{field}' is not set.".format(field=field)) + + # Field might be set on all elements, but metadata is no compacted, + # check if field is the same across all metadata exceptions. + # TODO: Check that metadata exceptions cover whole dimension. + # When field is not set for ALL_ELEMENTS, we have to traverse all potential elements, + # not just those which have metadata set, but any which could have it set. We can do + # that if dimension length is set, we can enumerate all elements and check that they + # contain equal field value. But easier it is to just check that dimension length + # matches the number of metadata exceptions. Then we know we have checked all elements + # which can exist on data. And if any element is missing (does not have metadata set), + # it does not have field set anyway, which means it does not match field value of other + # elements. This dimension length comparison can work even in the case when dimension + # is not enumerable (e.g., a dict). Checking dimension lengths becomes tricky when + # multiple ALL_ELEMENTS are present in the selector though, and especially if data + # is jagged (does not have same size sub-dimensions for all elements). An issue is + # also that dimensions defined for DataMetadata and not Metadata. + + # Can raise KeyError. + first_exception_selector, first_exception_metadata = exceptions_with_selectors.popitem() + + # Can raise KeyError. + value = first_exception_metadata[field] + + for exception_metadata in exceptions_with_selectors.values(): + # We require that "field" both exist in all exception metadata and has the same value + # as all other fields (which we check by checking against the first exception metadata). + if field not in exception_metadata or exception_metadata[field] != value: + raise KeyError("Field '{field}' is not the same across all elements.".format(field=field)) + + return value + + def query_field_with_exceptions(self, selector: Selector, field: str) -> typing.Tuple[typing.Any, typing.Dict[TupleSelector, typing.Any]]: + """ + In addition to returning ``field`` of metadata for data pointed to with ``selector``, + this method for every ``ALL_ELEMENTS`` selector segment also returns a map between + selectors and field values for all elements which have field which differs from that + of ``ALL_ELEMENTS``. + + If ``field`` does not exist under ``selector``, ``NO_VALUE`` is returned instead, + and all exceptions are required to contain ``field``. + + ``field`` represents only top-level fields in metadata. + + Parameters + ---------- + selector: + A selector to query metadata for. + field: + A field name to query. + + Returns + ------- + A tuple of value at a given selector and field and a dict of exceptions. + """ + + metadata, exceptions_with_selectors = self.query_with_exceptions(selector) + + if field in metadata: + # If "field" exist in "metadata", we return only those exceptions which contain "field" which + # differs from that in "metadata". Only they are real "exceptions" for this "selector" and "field". 
+ return metadata[field], { + exception_selector: exception_metadata[field] for exception_selector, exception_metadata in exceptions_with_selectors.items() + if field in exception_metadata and exception_metadata[field] != metadata[field] + } + + # If selector is without "ALL_ELEMENTS" then field is simply not set. + if ALL_ELEMENTS not in selector: + assert not exceptions_with_selectors + raise KeyError("Field '{field}' is not set.".format(field=field)) + + field_exceptions = {} + + for exception_selector, exception_metadata in exceptions_with_selectors.items(): + if field not in exception_metadata: + raise KeyError("Field '{field}' is not set.".format(field=field)) + + field_exceptions[exception_selector] = exception_metadata[field] + + return NO_VALUE, field_exceptions + + def _query(self, selector: Selector, metadata_entry: typing.Optional[MetadataEntry], ignore_all_elements: typing.Optional[int]) -> frozendict.FrozenOrderedDict: + if metadata_entry is None: + return frozendict.FrozenOrderedDict() + if len(selector) == 0: + return metadata_entry.metadata + + segment, selector_rest = selector[0], selector[1:] + + if ignore_all_elements is not None: + new_ignore_all_elements = ignore_all_elements - 1 + else: + new_ignore_all_elements = None + + all_elements_metadata = self._query(selector_rest, metadata_entry.all_elements, new_ignore_all_elements) + if segment is ALL_ELEMENTS: + metadata = all_elements_metadata + elif segment in metadata_entry.elements: + segment = typing.cast(SimpleSelectorSegment, segment) + metadata = self._query(selector_rest, metadata_entry.elements[segment], new_ignore_all_elements) + if ignore_all_elements is None or ignore_all_elements > 0: + metadata = self._merge_metadata(all_elements_metadata, metadata) + elif ignore_all_elements is not None and ignore_all_elements <= 0: + metadata = frozendict.FrozenOrderedDict() + else: + metadata = all_elements_metadata + + return metadata + + def _query_exceptions(self, selector: Selector, metadata_entry: typing.Optional[MetadataEntry]) -> typing.Sequence[TupleSelector]: + if metadata_entry is None: + return [] + if len(selector) == 0: + return [] + + segment, selector_rest = selector[0], selector[1:] + + exceptions: typing.List[TupleSelector] = [] + if segment is ALL_ELEMENTS: + if selector_rest: + for exception_selector in self._query_exceptions(selector_rest, metadata_entry.all_elements): + exceptions.append((segment,) + exception_selector) + + for element_segment, element_metadata_entry in metadata_entry.elements.items(): + if selector_rest: + for exception_selector in self._query_exceptions(selector_rest, element_metadata_entry): + exceptions.append((typing.cast(SelectorSegment, element_segment),) + exception_selector) + else: + if element_metadata_entry.metadata: + exceptions.append((element_segment,)) + elif segment in metadata_entry.elements: + element_metadata_entry = metadata_entry.elements[typing.cast(SimpleSelectorSegment, segment)] + if selector_rest: + for exception_selector in self._query_exceptions(selector_rest, element_metadata_entry): + exceptions.append((segment,) + exception_selector) + elif element_metadata_entry.metadata: + exceptions.append((segment,)) + + return exceptions + + def _remove(self, selector: Selector, recursive: bool, strict_all_elements: bool, + metadata_entry: typing.Optional[MetadataEntry]) -> MetadataEntry: + if metadata_entry is None: + new_metadata_entry = MetadataEntry() + else: + new_metadata_entry = metadata_entry.copy() + + if len(selector) == 0: + new_metadata_entry.metadata 
= frozendict.FrozenOrderedDict() + if recursive: + new_metadata_entry.all_elements = None + new_metadata_entry.elements = utils.EMPTY_PMAP + new_metadata_entry.is_elements_empty = True + new_metadata_entry.is_empty = True + else: + new_metadata_entry.update_is_empty() + return new_metadata_entry + + segment, selector_rest = selector[0], selector[1:] + + if segment is ALL_ELEMENTS: + new_metadata_entry.all_elements = self._remove(selector_rest, recursive, strict_all_elements, new_metadata_entry.all_elements) + if new_metadata_entry.all_elements.is_empty: + new_metadata_entry.all_elements = None + new_metadata_entry.update_is_empty() + + if not strict_all_elements and new_metadata_entry.elements: + new_elements_evolver = new_metadata_entry.elements.evolver() + for element_segment, element_metadata_entry in new_metadata_entry.elements.items(): + new_element_metadata_entry = self._remove(selector_rest, recursive, strict_all_elements, element_metadata_entry) + if new_element_metadata_entry.is_empty: + new_elements_evolver.remove(element_segment) + else: + new_elements_evolver.set(element_segment, new_element_metadata_entry) + new_metadata_entry.elements = new_elements_evolver.persistent() + new_metadata_entry.is_elements_empty = not new_metadata_entry.elements + new_metadata_entry.update_is_empty() + + else: + segment = typing.cast(SimpleSelectorSegment, segment) + if segment in new_metadata_entry.elements: + new_element_metadata_entry = self._remove(selector_rest, recursive, strict_all_elements, new_metadata_entry.elements[segment]) + if new_element_metadata_entry.is_empty: + new_metadata_entry.elements = new_metadata_entry.elements.remove(segment) + else: + new_metadata_entry.elements = new_metadata_entry.elements.set(segment, new_element_metadata_entry) + new_metadata_entry.is_elements_empty = not new_metadata_entry.elements + new_metadata_entry.update_is_empty() + + return new_metadata_entry + + def _update(self, selector: Selector, metadata_entry: typing.Optional[MetadataEntry], + metadata: frozendict.FrozenOrderedDict) -> MetadataEntry: + if metadata_entry is None: + new_metadata_entry = MetadataEntry() + else: + new_metadata_entry = metadata_entry.copy() + + if len(selector) == 0: + # One would think that we could remove "NO_VALUE" values during merging, but we have to + # keep them to know which values we have to remove when merging with all elements metadata. + new_metadata_entry.metadata = self._merge_metadata(new_metadata_entry.metadata, metadata) + new_metadata_entry.update_is_empty() + return new_metadata_entry + + segment, selector_rest = selector[0], selector[1:] + + if segment is ALL_ELEMENTS: + new_metadata_entry.all_elements = self._update(selector_rest, new_metadata_entry.all_elements, metadata) + if new_metadata_entry.all_elements.is_empty: + new_metadata_entry.all_elements = None + new_metadata_entry.update_is_empty() + + if new_metadata_entry.elements: + # Fields on direct elements have precedence over fields on ALL_ELEMENTS, but we want the last + # call to update to take precedence. So all fields found in metadata just set on ALL_ELEMENTS + # are removed from all metadata on direct elements. 
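+                # (Editorial note: for example, if this update sets {"name": "foo"} on ALL_ELEMENTS, a previously set
+                # element-level {"name": "bar"} is pruned below, so the newly set ALL_ELEMENTS value takes effect on query.)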
+ new_elements_evolver = new_metadata_entry.elements.evolver() + for element_segment, element_metadata_entry in new_metadata_entry.elements.items(): + new_element_metadata_entry = self._prune(selector_rest, element_metadata_entry, metadata) + if new_element_metadata_entry is None or new_element_metadata_entry.is_empty: + new_elements_evolver.remove(element_segment) + else: + new_elements_evolver.set(element_segment, new_element_metadata_entry) + new_metadata_entry.elements = new_elements_evolver.persistent() + new_metadata_entry.is_elements_empty = not new_metadata_entry.elements + new_metadata_entry.update_is_empty() + + else: + segment = typing.cast(SimpleSelectorSegment, segment) + new_element_metadata_entry = self._update(selector_rest, new_metadata_entry.elements.get(segment, None), metadata) + if new_element_metadata_entry.is_empty: + new_metadata_entry.elements = new_metadata_entry.elements.discard(segment) + else: + new_metadata_entry.elements = new_metadata_entry.elements.set(segment, new_element_metadata_entry) + new_metadata_entry.is_elements_empty = not new_metadata_entry.elements + new_metadata_entry.update_is_empty() + + return new_metadata_entry + + def _merge_metadata(self, metadata1: frozendict.FrozenOrderedDict, metadata2: frozendict.FrozenOrderedDict) -> frozendict.FrozenOrderedDict: + """ + Merges all fields from ``metadata2`` on top of ``metadata1``, recursively. + + Only dicts are merged recursively, arrays are not. + """ + + # Copy so that we can mutate. + metadata = collections.OrderedDict(metadata1) + + for name, value in metadata2.items(): + if name in metadata: + if isinstance(metadata[name], frozendict.FrozenOrderedDict) and isinstance(value, frozendict.FrozenOrderedDict): + merged_value = self._merge_metadata(metadata[name], value) + # If value is an empty dict, but before merging it was not, we just remove the whole field. + if metadata[name] and not merged_value: + del metadata[name] + else: + metadata[name] = merged_value + else: + metadata[name] = value + else: + metadata[name] = value + + return frozendict.FrozenOrderedDict(metadata) + + def _merge_metadata_entries(self, metadata_entry1: MetadataEntry, metadata_entry2: MetadataEntry) -> MetadataEntry: + """ + Merges ``metadata_entry2`` on top of ``metadata_entry1``, recursively, and + returns a new metadata entry. + """ + + output_metadata_entry = MetadataEntry() + + # Merging elements. + new_elements_evolver = metadata_entry1.elements.evolver() + for element_segment, element_metadata_entry in metadata_entry2.elements.items(): + if element_segment not in new_elements_evolver: + new_elements_evolver.set(element_segment, element_metadata_entry) + else: + new_elements_evolver.set( + element_segment, self._merge_metadata_entries(new_elements_evolver[element_segment], element_metadata_entry), + ) + output_metadata_entry.elements = new_elements_evolver.persistent() + output_metadata_entry.is_elements_empty = not output_metadata_entry.elements + + # Merging "ALL_ELEMENTS". 
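+        # (Editorial note: if both entries carry "ALL_ELEMENTS" metadata it is merged recursively; otherwise
+        # whichever side defines it is kept as-is.)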
+ if metadata_entry1.all_elements is not None and metadata_entry2.all_elements is not None: + output_metadata_entry.all_elements = self._merge_metadata_entries(metadata_entry1.all_elements, metadata_entry2.all_elements) + elif metadata_entry1.all_elements is not None: + output_metadata_entry.all_elements = metadata_entry1.all_elements + elif metadata_entry2.all_elements is not None: + output_metadata_entry.all_elements = metadata_entry2.all_elements + + # Merging metadata: + output_metadata_entry.metadata = self._merge_metadata(metadata_entry1.metadata, metadata_entry2.metadata) + + output_metadata_entry.update_is_empty() + + return output_metadata_entry + + def _remove_no_value(self, metadata: frozendict.FrozenOrderedDict) -> frozendict.FrozenOrderedDict: + # Copy so that we can mutate. + metadata = collections.OrderedDict(metadata) + + # We iterate over a list so that we can change dict while iterating. + for name, value in list(metadata.items()): + if value is NO_VALUE: + del metadata[name] + elif isinstance(value, frozendict.FrozenOrderedDict): + new_value = self._remove_no_value(value) + # If value is an empty dict, but before removing "NO_VALUE" it was not, we just remove the whole field. + if metadata[name] and not new_value: + del metadata[name] + else: + metadata[name] = new_value + + return frozendict.FrozenOrderedDict(metadata) + + def _prune(self, selector: Selector, metadata_entry: typing.Optional[MetadataEntry], metadata: frozendict.FrozenOrderedDict) -> typing.Optional[MetadataEntry]: + if metadata_entry is None: + return metadata_entry + + new_metadata_entry = metadata_entry.copy() + + if len(selector) == 0: + new_metadata_entry.metadata = self._prune_metadata(new_metadata_entry.metadata, metadata) + new_metadata_entry.update_is_empty() + return new_metadata_entry + + segment, selector_rest = selector[0], selector[1:] + + if segment is ALL_ELEMENTS: + new_metadata_entry.all_elements = self._prune(selector_rest, new_metadata_entry.all_elements, metadata) + if new_metadata_entry.all_elements is not None and new_metadata_entry.all_elements.is_empty: + new_metadata_entry.all_elements = None + new_metadata_entry.update_is_empty() + + if new_metadata_entry.elements: + new_elements_evolver = new_metadata_entry.elements.evolver() + for element_segment, element_metadata_entry in new_metadata_entry.elements.items(): + new_element_metadata_entry = self._prune(selector_rest, element_metadata_entry, metadata) + if new_element_metadata_entry is None or new_element_metadata_entry.is_empty: + new_elements_evolver.remove(element_segment) + else: + new_elements_evolver.set(element_segment, new_element_metadata_entry) + new_metadata_entry.elements = new_elements_evolver.persistent() + new_metadata_entry.is_elements_empty = not new_metadata_entry.elements + new_metadata_entry.update_is_empty() + + elif segment in new_metadata_entry.elements: + segment = typing.cast(SimpleSelectorSegment, segment) + new_element_metadata_entry = self._prune(selector_rest, new_metadata_entry.elements[segment], metadata) + if new_element_metadata_entry is None or new_element_metadata_entry.is_empty: + new_metadata_entry.elements = new_metadata_entry.elements.remove(segment) + else: + new_metadata_entry.elements = new_metadata_entry.elements.set(segment, new_element_metadata_entry) + new_metadata_entry.is_elements_empty = not new_metadata_entry.elements + new_metadata_entry.update_is_empty() + + return new_metadata_entry + + def _prune_metadata(self, metadata1: frozendict.FrozenOrderedDict, metadata2: 
frozendict.FrozenOrderedDict) -> frozendict.FrozenOrderedDict: + """ + Removes all fields which are found in ``metadata2`` from ``metadata1``, recursively. + + Values of ``metadata2`` do not matter, except if they are a dict, in which case + removal is done recursively. + """ + + # Copy so that we can mutate. + metadata = collections.OrderedDict(metadata1) + + for name, value in metadata2.items(): + if name not in metadata: + continue + + if isinstance(metadata[name], frozendict.FrozenOrderedDict) and isinstance(value, frozendict.FrozenOrderedDict): + pruned_value = self._prune_metadata(metadata[name], value) + # If value is an empty dict, but before pruning it was not, we just remove the whole field. + if metadata[name] and not pruned_value: + del metadata[name] + else: + metadata[name] = pruned_value + else: + del metadata[name] + + return frozendict.FrozenOrderedDict(metadata) + + def compact(self: T, fields_to_compact: typing.Sequence[str]) -> T: + """ + Compact metadata and return it. Produces equivalent but compact + metadata where equal metadata for all elements in a dimension are compacted + into ``ALL_ELEMENTS`` selector segment. + + Parameters + ---------- + fields_to_compact: + Which fields to compact in the metadata. + + Returns + ------- + Compacted metadata. + """ + + metadata_dict: typing.Dict[TupleSelector, typing.Dict] = collections.OrderedDict() + + for metadata_description in self.to_internal_simple_structure(): + metadata_dict[tuple(metadata_description['selector'])] = metadata_description['metadata'] + + metadata_dict = self._compact_metadata(metadata_dict, fields_to_compact) + + new_metadata = copy.copy(self) + + for selector, metadata in metadata_dict.items(): + metadata = utils.make_immutable_copy(metadata) + + if not isinstance(metadata, frozendict.FrozenOrderedDict): + raise exceptions.InvalidArgumentTypeError("Metadata should be a dict.") + + new_metadata._current_metadata = new_metadata._update(selector, new_metadata._current_metadata, metadata) + + return new_metadata + + # TODO: During compacting, we could also create an Union type of all structural types in elements and set it on "ALL_ELEMENTS". + @classmethod + def _compact_metadata(cls: typing.Type[T], metadata_dict: typing.Dict[TupleSelector, typing.Dict], fields_to_compact: typing.Sequence[str]) -> typing.Dict[TupleSelector, typing.Dict]: + """ + Compacts only top-level fields (if their values are all equal) listed in ``fields_to_compact``. + + Only top-level fields listed in ``fields_to_compact`` will be compacted. The reason for ``fields_to_compact`` + is that it is an optimization, so that we do not have to first go over all metadata to detect which all + fields are there. When used by ``_generate``, ``_generate_metadata`` is producing a fixed set of fields which + works in our advantage. + + We prefer to compact segments at the beginning of the selector over the segments later on. + + Parameters + ---------- + metadata_dict: + A dict where field is selector and value is the metadata dict under this selector. + fields_to_compact: + Which fields to compact in the metadata. + + Returns + ------- + Compacted metadata representation in the form of a dict where fields are selectors. + """ + + # We rely on the fact that dicts preserve order in Python 3.6+ and do not use + # "OrderedDict" here for simplicity (we do not compare by equality dicts here to care + # about order of fields in equality check). 
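+        # (Editorial note: for example, if selectors (ALL_ELEMENTS, 0), (ALL_ELEMENTS, 1), ... all share the same
+        # value for a field, they can collapse into a single (ALL_ELEMENTS, ALL_ELEMENTS) entry.)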
+ results: typing.Dict[TupleSelector, typing.Dict] = collections.defaultdict(dict) + + # Key is the length of selectors and the value is a list of selectors of the same length. + selector_lengths: typing.Dict[int, typing.List[TupleSelector]] = collections.defaultdict(list) + for selector in metadata_dict.keys(): + selector_lengths[len(selector)].append(selector) + + for length, selectors in sorted(selector_lengths.items(), key=operator.itemgetter(0)): + update_selectors: typing.Dict[TupleSelector, typing.List] = collections.defaultdict(list) + + for field in fields_to_compact: + values_to_selectors: typing.Dict[typing.Any, typing.List[TupleSelector]] = collections.defaultdict(list) + for selector in selectors: + if field in metadata_dict[selector]: + values_to_selectors[metadata_dict[selector][field]].append(selector) + + for value in values_to_selectors.keys(): + compacted_selectors = cls._get_compacted_selectors(values_to_selectors[value], selectors) + + for selector in compacted_selectors: + update_selectors[selector].append({field: value}) + + for selector, items in sorted(update_selectors.items(), key=operator.itemgetter(0)): + for item in items: + results[selector].update(item) + + return collections.OrderedDict(results) + + @classmethod + def _get_compacted_selectors(cls, selectors_to_compact: typing.List[TupleSelector], total_selectors: typing.List[TupleSelector]) -> typing.List[TupleSelector]: + """ + This function returns a compacted representation of ``selectors_to_compact``. + + Parameters + ---------- + selectors_to_compact: + A list of selectors to be compacted which have the same value under a certain field. + total_selectors: + All possible selectors of a certain length. + + Returns + ------- + A list of compacted selectors. + """ + + input_selectors = copy.copy(selectors_to_compact) + input_selectors_set = set(input_selectors) + output_selectors = selectors_to_compact + + length_of_selector = len(input_selectors[0]) + + other_selectors_set = set(total_selectors) - input_selectors_set + + for other_selector in sorted(other_selectors_set): + if cls._selector_overlap(other_selector, input_selectors_set): + other_selectors_set.remove(other_selector) + + for i in range(length_of_selector): + all_segments = {selector[i] for selector in total_selectors} + for index, selector_tuple in enumerate(output_selectors): + can_collapse = True + + for segment in all_segments: + test_selector = list(selector_tuple) + test_selector[i] = segment + if cls._selector_overlap(test_selector, other_selectors_set): + can_collapse = False + + if can_collapse: + selector_list = list(selector_tuple) + selector_list[i] = ALL_ELEMENTS + output_selectors[index] = tuple(selector_list) + + output_selectors = sorted(set(output_selectors)) + + output_selectors = cls._greedy_prune_selector(output_selectors, input_selectors) + + return output_selectors + + @classmethod + def _selector_overlap(cls, test_selector: Selector, selectors_set: typing.Set[TupleSelector]) -> bool: + """ + This function checks if ``test_selector`` overlaps with selectors ``selectors_set``. + + Parameters + ---------- + test_selector: + The input selector. + selectors_set: + A set of selectors. + + Returns + ------- + Whether the selector ``test_selector`` overlaps with any selector in ``selectors_set``. 
+ """ + + for selector in selectors_set: + assert len(selector) == len(test_selector) + + is_same = True + for i in range(len(test_selector)): + if test_selector[i] is ALL_ELEMENTS: + continue + if selector[i] is not ALL_ELEMENTS: + if test_selector[i] != selector[i]: + is_same = False + + if is_same: + return True + + return False + + @classmethod + def _selector_contained(cls, selector_1: Selector, selector_2: Selector) -> bool: + """ + This function checks if ``selector_1`` is contained in ``selector_2``. + + Returns + ------- + Whether ``selector_1`` is contained in ``selector_2``. + + Notes + ----- + This function is different from `_selector_overlap` which checks if two selectors overlap. + """ + + for i in range(len(selector_1)): + if selector_1[i] is ALL_ELEMENTS: + if selector_2[i] is not ALL_ELEMENTS: + return False + continue + if selector_2[i] is not ALL_ELEMENTS: + if selector_1[i] != selector_2[i]: + return False + + return True + + @classmethod + def _greedy_prune_selector(cls, compacted_selectors: typing.List[TupleSelector], selectors_to_compact: typing.List[TupleSelector]) -> typing.List[TupleSelector]: + """ + This method implements a greedy algorithm to remove unnecessary selectors from ``compacted_selectors``. + + Parameters + ---------- + compacted_selectors: + This is an already compacted list of selectors which we get from ``selectors_to_compact``. + selectors_to_compact: + This is the list of original selectors with the same value under a certain field. + + Returns + ------- + The list of selectors where unnecessary selectors have been removed from ``compacted_selectors``. + """ + + # Maps from each selector in "compacted_selectors" to selectors which it covers in "selectors_to_compact". + contained_selectors: typing.Dict[TupleSelector, typing.List[TupleSelector]] = collections.defaultdict(list) + selector_count_mask: typing.Dict[TupleSelector, int] = collections.defaultdict(int) + + # Compute for each selector in "selectors_to_compact" how many selectors in "compacted_selectors" cover them. + # Also builds the "contained_selectors". + for compact_selector in compacted_selectors: + for selector in selectors_to_compact: + if cls._selector_contained(selector, compact_selector): + selector_count_mask[selector] += 1 + contained_selectors[compact_selector].append(selector) + + continue_flag = True + while continue_flag: + continue_flag = False + for compact_selector in compacted_selectors: + remove_flag = True + for selector in contained_selectors[compact_selector]: + if selector_count_mask[selector] == 1: + remove_flag = False + if remove_flag: + continue_flag = True + redundant_selector = compact_selector + if continue_flag: + compacted_selectors.remove(redundant_selector) + for selector in contained_selectors[redundant_selector]: + selector_count_mask[selector] -= 1 + + return compacted_selectors + + @classmethod + def check_selector(cls, selector: Selector) -> None: + """ + Checks that a given ``selector`` is a valid selector. If ``selector`` is invalid it raises an exception. + + It checks that it is a tuple or a list and currently we require that all segments of a selector + are strings, integers, or a special value ``ALL_ELEMENTS``. + + Parameters + ---------- + selector: + Selector to check. 
+ """ + + if not isinstance(selector, (tuple, list)): + raise exceptions.InvalidArgumentTypeError("Selector is not a tuple or a list.") + + for i, segment in enumerate(selector): + if not isinstance(segment, (str, int)) and segment is not ALL_ELEMENTS: + raise exceptions.InvalidArgumentTypeError( + "'{segment}' at {path} is not a str, int, or ALL_ELEMENTS.".format( + segment=segment, + path=list(selector[0:i + 1]), + ), + ) + + def __hash__(self) -> int: + if self._hash is None: + self._hash = hash(self._current_metadata) + + return self._hash + + def __eq__(self, other): # type: ignore + if not isinstance(other, Metadata): + return NotImplemented + + return self._current_metadata == other._current_metadata + + def get_elements(self, selector: Selector) -> typing.Sequence[SelectorSegment]: + """ + Returns a list of element names which exists under a selector, if any. + + Parameters + ---------- + selector: + A selector to return elements under. + + Returns + ------- + List of element names. + """ + + self.check_selector(selector) + + return self._get_elements(selector, self._current_metadata) + + def _get_elements(self, selector: Selector, metadata_entry: typing.Optional[MetadataEntry]) -> typing.Sequence[SelectorSegment]: + if metadata_entry is None: + return [] + if len(selector) == 0: + if metadata_entry.all_elements is not None: + all_elements: ListSelector = [ALL_ELEMENTS] + else: + all_elements = [] + return all_elements + list(metadata_entry.elements.keys()) + + segment, selector_rest = selector[0], selector[1:] + + all_elements_elements = self._get_elements(selector_rest, metadata_entry.all_elements) + if segment is ALL_ELEMENTS: + elements = all_elements_elements + elif segment in metadata_entry.elements: + segment = typing.cast(SimpleSelectorSegment, segment) + elements = self._get_elements(selector_rest, metadata_entry.elements[segment]) + elements = sorted(set(typing.cast(typing.List, all_elements_elements) + typing.cast(typing.List, elements))) + else: + elements = all_elements_elements + + return elements + + def to_internal_json_structure(self) -> typing.Sequence[typing.Dict]: + """ + Converts metadata to a JSON-compatible structure. + + The structure exposes how metadata is stored internally (metadata for ``ALL_ELEMENTS`` + separate from metadata for individual elements) and can change in the future. + This method exist for debugging purposes and to allow serialization of metadata. + Use `to_json_structure` method if you want to access semantically valid + representation of metadata. + + Returns + ------- + A JSON-compatible list of dicts. + """ + + ALL_ELEMENTS_REPR = repr(ALL_ELEMENTS) + + return [ + { + 'selector': [ALL_ELEMENTS_REPR if segment is ALL_ELEMENTS else segment for segment in entry['selector']], + 'metadata': utils.to_reversible_json_structure(entry['metadata']), + } + for entry in self.to_internal_simple_structure() + ] + + def to_internal_simple_structure(self) -> typing.Sequence[typing.Dict]: + """ + Converts metadata to a simple structure, similar to JSON, but with values + left as Python values. + + The structure exposes how metadata is stored internally (metadata for ``ALL_ELEMENTS`` + separate from metadata for individual elements) and can change in the future. + This method exist for debugging purposes and to allow serialization of metadata. + Use `to_simple_structure` method if you want to access semantically valid + representation of metadata. + + Returns + ------- + A list of dicts. 
+ """ + + return self._to_internal_simple_structure([], self._current_metadata) + + @classmethod + def from_internal_json_structure(cls: typing.Type[T], json_structure: typing.Iterable[typing.Dict]) -> T: + """ + Constructs metadata object back from an internal JSON-compatible structure. + as made by ``to_internal_json_structure``. + + Parameters + ---------- + json_structure: + Iterable of the structure. + + Returns + ------- + Constructed metadata object. + """ + + ALL_ELEMENTS_REPR = repr(ALL_ELEMENTS) + + return cls.from_internal_simple_structure( + { + 'selector': [ALL_ELEMENTS if segment == ALL_ELEMENTS_REPR else segment for segment in entry['selector']], + 'metadata': utils.from_reversible_json_structure(entry['metadata']), + } for entry in json_structure + ) + + @classmethod + def from_internal_simple_structure(cls: typing.Type[T], structure: typing.Iterable[typing.Dict]) -> T: + """ + Constructs metadata object back from an internal simple structure, + as made by ``to_internal_simple_structure``. + + Parameters + ---------- + structure: + Iterable of the structure. + + Returns + ------- + Constructed metadata object. + """ + + metadata = cls() + + # TODO: Optimize, see: https://gitlab.com/datadrivendiscovery/d3m/issues/408 + for entry in structure: + metadata = metadata.update(entry['selector'], entry['metadata']) + + return metadata + + def _to_internal_simple_structure(self, selector: Selector, metadata_entry: typing.Optional[MetadataEntry]) -> typing.List[typing.Dict]: + output = [] + + selector = typing.cast(ListSelector, selector) + + if metadata_entry.metadata: + output.append({ + 'selector': list(selector), + 'metadata': metadata_entry.metadata, + }) + + if metadata_entry.all_elements is not None: + output += self._to_internal_simple_structure(selector + [ALL_ELEMENTS], metadata_entry.all_elements) + + for element_segment, element_metadata_entry in metadata_entry.elements.items(): + output += self._to_internal_simple_structure(selector + [element_segment], element_metadata_entry) + + return output + + def to_json_structure(self) -> typing.Sequence[typing.Dict]: + """ + Converts metadata to a JSON-compatible structure. + + The output matches the output one obtain by using `query` method and is a + semantically valid representation of metadata, but it does not matches + how metadata is stored internally. To obtain that, you can use + `to_internal_json_structure` method. + + It does not make a JSON structure which can then be parsed back to + reconstruct original metadata object. To obtain that, you can use + `to_internal_json_structure` method. + + Returns + ------- + A JSON-compatible list of dicts. + """ + + return utils.to_json_structure(self.to_simple_structure()) + + def to_simple_structure(self) -> typing.Sequence[typing.Dict]: + """ + Converts metadata to a simple structure, similar to JSON, but with values + left as Python values. + + The output matches the output one obtain by using `query` method and is a + semantically valid representation of metadata, but it does not matches + how metadata is stored internally. To obtain that, you can use + `to_internal_simple_structure` method. + + It does not make a structure which can then be converted back to + reconstruct original metadata object. To obtain that, you can use + `to_internal_simple_structure` method. + + Returns + ------- + A list of dicts. 
+ """ + + return self._to_simple_structure([]) + + def _to_simple_structure(self, selector: Selector) -> typing.List[typing.Dict]: + output = [] + + selector = typing.cast(ListSelector, selector) + + if 'selector' in inspect.signature(self.query).parameters: + query = self.query + else: + def query(selector: Selector, *, ignore_all_elements: bool = False, remove_no_value: bool = True) -> frozendict.FrozenOrderedDict: + return self.query() # type: ignore + + metadata = query(selector=selector) + if metadata: + output.append({ + 'selector': list(selector), + 'metadata': metadata, + }) + + elements = self.get_elements(selector) + + for element in elements: + output += self._to_simple_structure(selector + [element]) + + return output + + def pretty_print(self, selector: Selector = None, handle: typing.IO[typing.Any] = None, _level: int = 0) -> None: + """ + Pretty-prints metadata to ``handle``, or `sys.stdout` if not specified. + + The output matches the output one obtain by using `query` method and is a + semantically valid representation of metadata, but it does not matches + how metadata is stored internally. To obtain that, you can use + `to_internal_json_structure` and `to_internal_simple_structure` methods. + + Parameters + ---------- + selector: + A selector to start pretty-printing at. + handle: + A handle to pretty-print to. Default is `sys.stdout`. + """ + + if selector is None: + selector = [] + + if handle is None: + handle = sys.stdout + + self.check_selector(selector) + + selector = list(selector) + + if 'selector' in inspect.signature(self.query).parameters: + query = self.query + else: + def query(selector: Selector, *, ignore_all_elements: bool = False, remove_no_value: bool = True) -> frozendict.FrozenOrderedDict: + return self.query() # type: ignore + + indent = ' ' * _level + + handle.write('{indent}Selector:\n{indent} {selector}\n'.format(indent=indent, selector=tuple(selector))) + + handle.write('{indent}Metadata:\n'.format(indent=indent)) + for line in json.dumps(utils.to_json_structure(query(selector=selector)), indent=1, allow_nan=False).splitlines(): + handle.write('{indent} {line}\n'.format(indent=indent, line=line)) + + elements = self.get_elements(selector) + + if not elements: + return + + if ALL_ELEMENTS in elements: + handle.write('{indent}All elements:\n'.format(indent=indent)) + self.pretty_print(selector + [ALL_ELEMENTS], handle=handle, _level=_level + 1) + + first_element = True + for element in elements: + if element is ALL_ELEMENTS: + continue + + if first_element: + handle.write('{indent}Elements:\n'.format(indent=indent)) + first_element = False + + self.pretty_print(selector + [element], handle=handle, _level=_level + 1) + + def _copy_elements_metadata(self, target_metadata: T, from_selector: ListSelector, + to_selector: ListSelector, selector: ListSelector, ignore_all_elements: bool) -> T: + # "ALL_ELEMENTS" is always first, if it exists, which works in our favor here. + # We are copying metadata for both "ALL_ELEMENTS" and elements themselves, so + # we do not have to merge metadata together for elements themselves. 
+ elements = self.get_elements(from_selector + selector) + + for element in elements: + new_selector = selector + [element] + metadata = self._query(from_selector + new_selector, self._current_metadata, 0 if ignore_all_elements else len(from_selector)) + target_metadata = target_metadata.update(to_selector + new_selector, metadata) + target_metadata = self._copy_elements_metadata(target_metadata, from_selector, to_selector, new_selector, ignore_all_elements) + + return target_metadata + + def copy_to(self, target_metadata: T, from_selector: Selector, + to_selector: Selector = (), *, ignore_all_elements: bool = False) -> T: + """ + Recursively copies metadata to ``target_metadata``, starting at the + ``from_selector`` and to a selector starting at ``to_selector``. + """ + + metadata = self._query(from_selector, self._current_metadata, 0 if ignore_all_elements else len(from_selector)) + + # Do not copy top-level "schema" field to a lower level. + if from_selector == () and to_selector != () and 'schema' in metadata: + # Copy so that we can mutate. + metadata_dict = collections.OrderedDict(metadata) + del metadata_dict['schema'] + metadata = frozendict.FrozenOrderedDict(metadata_dict) + + target_metadata = target_metadata.update(to_selector, metadata) + + return self._copy_elements_metadata(target_metadata, list(from_selector), list(to_selector), [], ignore_all_elements) + + +class DataMetadata(Metadata): + """ + A class for metadata for data values. + + It checks all updates against container and data schemas. Note that as such empty (just created) metadata object + does not validate against schemas. Consider setting required fields manually or use `generate` method as a + helper to do so. + + It has additional helper methods for operating on metadata of tabular data. + + Parameters + ---------- + metadata: + Optional initial metadata for the top-level of the value. + for_value: + Optional value to automatically generate metadata for. DEPRECATED: use explicit generate method call instead. + generate_metadata: bool + Automatically generate metadata from ``for_value`` and update the metadata accordingly. + DEPRECATED: use explicit generate method call instead. + check: + DEPRECATED: argument ignored. + source: + DEPRECATED: argument ignored. + timestamp: + DEPRECATED: argument ignored. + """ + + @deprecate.arguments('for_value', 'generate_metadata', message="use explicit generate method call instead") + @deprecate.arguments('source', 'timestamp', 'check', message="argument ignored") + def __init__(self, metadata: typing.Dict[str, typing.Any] = None, for_value: typing.Any = None, *, + generate_metadata: bool = True, check: bool = True, source: typing.Any = None, timestamp: datetime.datetime = None) -> None: + super().__init__(metadata=metadata) + + if for_value is not None and generate_metadata: + self._generate(for_value) + + @deprecate.arguments('source', 'timestamp', 'check', 'for_value', message="argument ignored") + def update(self: D, selector: Selector, metadata: typing.Dict[str, typing.Any], *, for_value: typing.Any = None, + check: bool = True, source: typing.Any = None, timestamp: datetime.datetime = None) -> D: + """ + Updates metadata with new ``metadata`` for data pointed to with ``selector``. + + If value of any field is ``NO_VALUE``, that field is deleted. + + It returns a copy of this metadata object with new metadata applied. + + Parameters + ---------- + selector: + A selector pointing to data. + metadata: + A map of fields and values with metadata. 
+ for_value: + DEPRECATED: argument ignored. + check: + DEPRECATED: argument ignored. + source: + DEPRECATED: argument ignored. + timestamp: + DEPRECATED: argument ignored. + + Returns + ------- + Updated metadata. + """ + + return super().update(selector=selector, metadata=metadata) + + @deprecate.arguments('source', 'timestamp', 'check', 'for_value', message="argument ignored") + def remove(self: D, selector: Selector, *, recursive: bool = False, strict_all_elements: bool = False, + for_value: typing.Any = None, check: bool = True, source: typing.Any = None, timestamp: datetime.datetime = None) -> D: + """ + Removes all metadata at ``selector``. + + Parameters + ---------- + selector: + A selector to remove metadata at. + recursive: + Should remove also all metadata under the ``selector``? + strict_all_elements: + If ``True``, then when removing ``ALL_ELEMENTS`` entry, do not remove also metadata for all elements it matches. + for_value: + DEPRECATED: argument ignored. + check: + DEPRECATED: argument ignored. + source: + DEPRECATED: argument ignored. + timestamp: + DEPRECATED: argument ignored. + + Returns + ------- + Updated metadata. + """ + + return super().remove(selector=selector, recursive=recursive, strict_all_elements=strict_all_elements) + + @deprecate.function(message="use generate method instead") + @deprecate.arguments('source', 'timestamp', 'check', message="argument ignored") + def set_for_value(self: D, for_value: typing.Any = None, *, generate_metadata: bool = True, check: bool = True, + source: typing.Any = None, timestamp: datetime.datetime = None) -> D: + """ + DEPRECATED: use ``generate`` method instead. + + If ``generate_metadata`` is set, generate metadata from ``for_value`` and update the metadata accordingly. + + Parameters + ---------- + for_value: + Value to automatically generate metadata for. + generate_metadata: bool + Automatically generate metadata from ``for_value`` and update the metadata accordingly. + check: + DEPRECATED: argument ignored. + source: + DEPRECATED: argument ignored. + timestamp: + DEPRECATED: argument ignored. + + Returns + ------- + Metadata object updated with automatically generated metadata. + """ + + if for_value is not None and generate_metadata: + return self.generate(for_value) + else: + return self + + def generate(self: D, value: typing.Any = None, *, compact: bool = False) -> D: + """ + Metadata about structure of data (dimensions) and structural types is + generated for the ``value``, and existing metadata is updated accordingly. + + Parameters + ---------- + value: + Value to automatically generate metadata for. + compact: + Compact automatically generated metadata. Produces equivalent but compact + metadata where equal metadata for all elements in a dimension are compacted + into ``ALL_ELEMENTS`` selector segment. + + Returns + ------- + Metadata object updated with automatically generated metadata. + """ + + new_metadata = copy.copy(self) + + new_metadata._generate(value, compact) + + return new_metadata + + def _generate(self, value: typing.Any = None, compact: bool = False) -> None: + # Importing here to prevent import cycle. And to not import it many times inside "_generate_metadata". 
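A short, untested sketch of the functional update/remove flow described above (each call returns a new metadata object, and setting a field to ``NO_VALUE`` deletes it). Module paths are assumed as before:
```
from d3m import container
from d3m.metadata import base as metadata_base

df = container.DataFrame({'a': [1, 2, 3]})
metadata = metadata_base.DataMetadata().generate(df)

# Attach a field to column 0 (the original metadata object is left untouched).
metadata = metadata.update((metadata_base.ALL_ELEMENTS, 0), {'description': 'an example column'})

# Setting a field to NO_VALUE deletes it again.
metadata = metadata.update((metadata_base.ALL_ELEMENTS, 0), {'description': metadata_base.NO_VALUE})

# Remove everything stored at (and, with recursive=True, under) a selector.
metadata = metadata.remove((metadata_base.ALL_ELEMENTS, 0), recursive=True)
```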
+ from d3m import container, types as d3m_types + + if value is None: + raise exceptions.InvalidArgumentValueError("\"value\" argument cannot be None.") + + generated_metadata_dict = self._generate_metadata(container, d3m_types, value, (), True) + + if compact: + # We make all metadata immutable so that it is hashable, which is required for the "_compact_generated_metadata". + for selector, metadata in generated_metadata_dict.items(): + generated_metadata_dict[selector] = utils.make_immutable_copy(metadata) + + # Because we generated all metadata we know that we can compact it. + # If some metadata holds for all elements we know that we can move it to "ALL_ELEMENTS". + generated_metadata_dict = self._compact_metadata(generated_metadata_dict, ALL_GENERATED_FIELDS) + + self._update_with_generated_metadata(generated_metadata_dict) + + # TODO: Also remove metadata for columns/rows which do not exist anymore. + # See: https://gitlab.com/datadrivendiscovery/d3m/issues/336 + + # TODO: Should we handle inheritance between semantic types here? + def has_semantic_type(self, selector: Selector, semantic_type: str) -> bool: + try: + return semantic_type in self.query_field(selector, 'semantic_types') + except KeyError: + return False + + @deprecate.arguments('source', 'timestamp', message="argument ignored") + def remove_semantic_type(self: D, selector: Selector, semantic_type: str, *, source: typing.Any = None, timestamp: datetime.datetime = None) -> D: + try: + semantic_types = self.query_field(selector, 'semantic_types') + except KeyError: + return self + if not semantic_types: + return self + new_semantic_types = tuple(st for st in semantic_types if st != semantic_type) + if new_semantic_types == semantic_types: + return self + return self.update(selector, {'semantic_types': new_semantic_types}) + + @deprecate.arguments('source', 'timestamp', message="argument ignored") + def add_semantic_type(self: D, selector: Selector, semantic_type: str, *, source: typing.Any = None, timestamp: datetime.datetime = None) -> D: + try: + semantic_types = self.query_field(selector, 'semantic_types') + except KeyError: + semantic_types = () + if semantic_type in semantic_types: + return self + semantic_types += (semantic_type,) + return self.update(selector, {'semantic_types': semantic_types}) + + # TODO: This does not look too efficient. Optimize? + def get_elements_with_semantic_type(self, selector: Selector, semantic_type: str) -> typing.Sequence[SelectorSegment]: + all_elements = self.get_elements(selector) + + return [element for element in all_elements if self.has_semantic_type(list(selector) + [element], semantic_type)] + + def query_column(self, column_index: int, *, at: Selector = (), ignore_all_elements: bool = False) -> frozendict.FrozenOrderedDict: + """ + Returns column metadata. + + This assumes that column metadata is stored under ``(ALL_ELEMENTS, column_index)``, at + optionally ``at`` selector, which might not necessary hold if metadata is not compacted. + Consider using `query_column_field`. + + Parameters + ---------- + column_index: + Column index to use. + at: + Selector at which to assume tabular metadata. + ignore_all_elements: + By default, metadata from ALL_ELEMENTS is merged with metadata for an element itself. + By setting this argument to ``True``, this is disabled and just metadata from an element is returned. + + Returns + ------- + Metadata of a given column. 
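The semantic-type helpers defined above can be exercised like this (untested sketch; the `Attribute` URI is a commonly used D3M semantic type and is an assumption here, as is the `d3m.metadata.base` module path):
```
from d3m import container
from d3m.metadata import base as metadata_base

metadata = metadata_base.DataMetadata().generate(container.DataFrame({'a': [1, 2, 3]}))

selector = (metadata_base.ALL_ELEMENTS, 0)
semantic_type = 'https://metadata.datadrivendiscovery.org/types/Attribute'

metadata = metadata.add_semantic_type(selector, semantic_type)
assert metadata.has_semantic_type(selector, semantic_type)

metadata = metadata.remove_semantic_type(selector, semantic_type)
assert not metadata.has_semantic_type(selector, semantic_type)
```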
+ """ + + return self.query(list(at) + [ALL_ELEMENTS, column_index], ignore_all_elements=ignore_all_elements) + + def query_column_field(self, column_index: int, field: str, *, at: Selector = (), strict_all_elements: bool = True) -> typing.Any: + """ + Returns ``field`` value of column metadata. Raises `KeyError` exception if metadata or field + is not set. + + ``field`` represents only top-level fields in metadata. + + Parameters + ---------- + column_index: + Column index to use. + field: + A field name to query. + at: + Selector at which to assume tabular metadata. + strict_all_elements: + If set, the method does not just return ``field`` value of column metadata, + but checks that the value really holds for all rows matching the ``selector``, + without exception. This is helpful also if metadata is not compacted and + ``field`` value is the same across all rows, but ``ALL_ELEMENTS`` metadata + does not contain that field. + + Returns + ------- + A value of ``field`` of a given column. + """ + + return self.query_field(list(at) + [ALL_ELEMENTS, column_index], field, strict_all_elements=strict_all_elements) + + @deprecate.arguments('source', 'timestamp', message="argument ignored") + def update_column(self: D, column_index: int, metadata: typing.Dict[str, typing.Any], *, at: Selector = (), source: typing.Any = None, timestamp: datetime.datetime = None) -> D: + """ + Updates column metadata with new ``metadata`` for column identified by ``column_index``. + + This stores column metadata under ``(ALL_ELEMENTS, column_index)``, at optionally ``at`` selector. + + Parameters + ---------- + column_index: + Column index to update. + metadata: + A map of fields and values with metadata. + at: + Selector at which to assume tabular metadata. + source: + DEPRECATED: argument ignored. + timestamp: + DEPRECATED: argument ignored. + + Returns + ------- + Updated column metadata. + """ + + return self.update(list(at) + [ALL_ELEMENTS, column_index], metadata) + + @deprecate.arguments('source', 'timestamp', 'for_value', message="argument ignored") + def remove_column(self: D, column_index: int, *, at: Selector = (), recursive: bool = False, strict_all_elements: bool = False, + for_value: typing.Any = None, source: typing.Any = None, timestamp: datetime.datetime = None) -> D: + """ + Removes all column metadata for column ``column_index``. + + This removes column metadata under ``(ALL_ELEMENTS, column_index)``, at optionally ``at`` selector. + It does not move to the left metadata for columns after the removed column. + If you want that, use ``remove_columns``. + + Parameters + ---------- + column_index: + Column index to remove. + at: + Selector at which to assume tabular metadata. + recursive: + Should remove also all metadata under the ``selector``? + strict_all_elements: + If ``True``, then when removing ``ALL_ELEMENTS`` entry, do not remove also metadata for all elements it matches. + for_value: + DEPRECATED: argument ignored. + source: + DEPRECATED: argument ignored. + timestamp: + DEPRECATED: argument ignored. + + Returns + ------- + Updated metadata. 
+ """ + + return self.remove( + list(at) + [ALL_ELEMENTS, column_index], recursive=recursive, strict_all_elements=strict_all_elements, + ) + + def get_columns_with_semantic_type(self, semantic_type: str, *, at: Selector = ()) -> typing.Sequence[SelectorSegment]: + return self.get_elements_with_semantic_type(list(at) + [ALL_ELEMENTS], semantic_type) + + def list_columns_with_semantic_types(self, semantic_types: typing.Sequence[str], *, at: Selector = ()) -> typing.Sequence[int]: + """ + This is similar to ``get_columns_with_semantic_type``, but it returns all column indices + for a dimension instead of ``ALL_ELEMENTS`` element. + + Moreover, it operates on a list of semantic types, where a column is returned + if it matches any semantic type on the list. + """ + + columns = [] + + for element in self.get_elements(list(at) + [ALL_ELEMENTS]): + try: + metadata_semantic_types = self.query_field(list(at) + [ALL_ELEMENTS, element], 'semantic_types') + except KeyError: + metadata_semantic_types = () + + # TODO: Should we handle inheritance between semantic types here? + if any(semantic_type in metadata_semantic_types for semantic_type in semantic_types): + if element is ALL_ELEMENTS: + try: + dimension = self.query_field(list(at) + [ALL_ELEMENTS], 'dimension') + except KeyError: + dimension = {} + return list(range(dimension.get('length', 0))) + else: + columns.append(typing.cast(int, element)) + + return columns + + def list_columns_with_structural_types( + self, structural_types: typing.Union[typing.Callable, typing.Sequence[typing.Union[str, type]]], *, + at: Selector = (), + ) -> typing.Sequence[int]: + """ + Returns a list of columns matching any of the structural types listed in + ``structural_types``. Matching allows subclasses of those types. ``structural_types`` can also be + a function to call to check a structural type. + """ + + columns = [] + + if callable(structural_types): + predicate = structural_types + else: + def predicate(typ: type) -> bool: + return any(utils.matches_structural_type(typ, structural_type) for structural_type in typing.cast(typing.Sequence[typing.Union[str, type]], structural_types)) + + for element in self.get_elements(list(at) + [ALL_ELEMENTS]): + try: + metadata_structural_type = self.query_field(list(at) + [ALL_ELEMENTS, element], 'structural_type') + except KeyError: + continue + + if predicate(metadata_structural_type): + if element is ALL_ELEMENTS: + try: + dimension = self.query_field(list(at) + [ALL_ELEMENTS], 'dimension') + except KeyError: + dimension = {} + return list(range(dimension.get('length', 0))) + else: + columns.append(typing.cast(int, element)) + + return columns + + def _merge_generated_metadata(self, old_metadata: frozendict.FrozenOrderedDict, metadata: frozendict.FrozenOrderedDict) -> frozendict.FrozenOrderedDict: + # Copy so that we can mutate. + new_metadata = collections.OrderedDict(metadata) + + # Use generated "name" only if "name" does not already exist. + # This holds even if existing "name" is "NO_VALUE". + if 'name' in new_metadata and 'name' in old_metadata: + del new_metadata['name'] + + if 'name' in new_metadata.get('dimension', {}) and 'name' in old_metadata.get('dimension', {}): + # Copy so that we can mutate. 
+ new_metadata['dimension'] = collections.OrderedDict(new_metadata['dimension']) + del new_metadata['dimension']['name'] + new_metadata['dimension'] = frozendict.FrozenOrderedDict(new_metadata['dimension']) + + if 'semantic_types' in new_metadata: + semantic_types = list(old_metadata.get('semantic_types', [])) + for semantic_type in new_metadata['semantic_types']: + if semantic_type not in semantic_types: + # Only one tabular semantic type can exist at a time. + if semantic_type in TABULAR_SEMANTIC_TYPES: + semantic_types = [st for st in semantic_types if st not in TABULAR_SEMANTIC_TYPES] + semantic_types.append(semantic_type) + new_metadata['semantic_types'] = tuple(semantic_types) + + if 'semantic_types' in new_metadata.get('dimension', {}): + semantic_types = list(old_metadata.get('dimension', {}).get('semantic_types', [])) + for semantic_type in new_metadata['dimension']['semantic_types']: + if semantic_type not in semantic_types: + # Only one tabular semantic type can exist at a time. + if semantic_type in TABULAR_SEMANTIC_TYPES: + semantic_types = [st for st in semantic_types if st not in TABULAR_SEMANTIC_TYPES] + semantic_types.append(semantic_type) + # Copy so that we can mutate. + new_metadata['dimension'] = collections.OrderedDict(new_metadata['dimension']) + new_metadata['dimension']['semantic_types'] = tuple(semantic_types) + new_metadata['dimension'] = frozendict.FrozenOrderedDict(new_metadata['dimension']) + + # If structural type was not generated now, but it exists before, we have to remove it. + # Here we just delete it from "old_metadata" so that it is not re-set back, while + # we really handle it in "_update_with_generated_metadata". + if 'structural_type' not in new_metadata and 'structural_type' in old_metadata: + # Copy so that we can mutate. + old_metadata_dict = collections.OrderedDict(old_metadata) + del old_metadata_dict['structural_type'] + old_metadata = frozendict.FrozenOrderedDict(old_metadata_dict) + + return self._merge_metadata(old_metadata, frozendict.FrozenOrderedDict(new_metadata)) + + def _diff_generated_metadata(self, element_metadata: frozendict.FrozenOrderedDict, metadata: frozendict.FrozenOrderedDict) -> frozendict.FrozenOrderedDict: + """ + When preparing updates for automatically generated metadata we want to make sure we do not override any metadata + directly set on elements with metadata on ``ALL_ELEMENTS``. In this method we compute which metadata to update + after the automatically generated metadata is set for ``ALL_ELEMENTS`` to restore the metadata directly set + on elements. + """ + + # Copy so that we can mutate. + new_element_metadata = collections.OrderedDict(element_metadata) + + # No need to set name if it is equal to metadata on "ALL_ELEMENTS". + if 'name' in new_element_metadata and 'name' in metadata and new_element_metadata['name'] == metadata['name']: + del new_element_metadata['name'] + + # No need to set name if it is equal to metadata on "ALL_ELEMENTS". + if 'name' in new_element_metadata.get('dimension', {}) and 'name' in metadata.get('dimension', {}) and new_element_metadata['dimension']['name'] == metadata['dimension']['name']: + # Copy so that we can mutate. 
+ new_element_metadata['dimension'] = collections.OrderedDict(new_element_metadata['dimension']) + del new_element_metadata['dimension']['name'] + new_element_metadata['dimension'] = frozendict.FrozenOrderedDict(new_element_metadata['dimension']) + + if 'semantic_types' in new_element_metadata and 'semantic_types' in metadata: + # No need to merge semantic types if they are equal to metadata on "ALL_ELEMENTS". + if set(new_element_metadata['semantic_types']) == set(metadata['semantic_types']): + del new_element_metadata['semantic_types'] + else: + semantic_types = list(new_element_metadata['semantic_types']) + for semantic_type in metadata['semantic_types']: + if semantic_type not in semantic_types: + # Only one tabular semantic type can exist at a time. + if semantic_type in TABULAR_SEMANTIC_TYPES: + semantic_types = [st for st in semantic_types if st not in TABULAR_SEMANTIC_TYPES] + semantic_types.append(semantic_type) + new_element_metadata['semantic_types'] = tuple(semantic_types) + + if 'semantic_types' in new_element_metadata.get('dimension', {}) and 'semantic_types' in metadata.get('dimension', {}): + # No need to merge semantic types if they are equal to metadata on "ALL_ELEMENTS". + if set(new_element_metadata['dimension']['semantic_types']) == set(metadata['dimension']['semantic_types']): + new_element_metadata['dimension'] = collections.OrderedDict(new_element_metadata['dimension']) + del new_element_metadata['dimension']['semantic_types'] + new_element_metadata['dimension'] = frozendict.FrozenOrderedDict(new_element_metadata['dimension']) + else: + semantic_types = list(new_element_metadata['dimension']['semantic_types']) + for semantic_type in metadata['dimension']['semantic_types']: + if semantic_type not in semantic_types: + # Only one tabular semantic type can exist at a time. + if semantic_type in TABULAR_SEMANTIC_TYPES: + semantic_types = [st for st in semantic_types if st not in TABULAR_SEMANTIC_TYPES] + semantic_types.append(semantic_type) + # Copy so that we can mutate. + new_element_metadata['dimension'] = collections.OrderedDict(new_element_metadata['dimension']) + new_element_metadata['dimension']['semantic_types'] = tuple(semantic_types) + new_element_metadata['dimension'] = frozendict.FrozenOrderedDict(new_element_metadata['dimension']) + + # Structural type is always set or removed by generated metadata, so it should not be directly set on elements. + if 'structural_type' in new_element_metadata: + del new_element_metadata['structural_type'] + + for generated_field in ALL_GENERATED_FIELDS: + # We already processed these. + if generated_field in {'name', 'dimension', 'semantic_types', 'structural_type'}: + continue + + # No need to set this field if it is equal to metadata on "ALL_ELEMENTS". + if generated_field in new_element_metadata and generated_field in metadata and new_element_metadata[generated_field] == metadata[generated_field]: + del new_element_metadata[generated_field] + + # We iterate over a list so that we can change dict while iterating. + for field in list(new_element_metadata.keys()): + # We already processed these. + if field in ALL_GENERATED_FIELDS: + continue + + # Other fields are never generated, so they are never overridden, so no need to set them again. + del new_element_metadata[field] + + if 'dimension' in new_element_metadata: + # Copy so that we can mutate. 
+ new_element_metadata['dimension'] = collections.OrderedDict(new_element_metadata['dimension']) + + # Length is always set by generated metadata, so it should not be directly set on elements. + if 'length' in new_element_metadata['dimension']: + del new_element_metadata['dimension']['length'] + + # We iterate over a list so that we can change dict while iterating. + for field in list(new_element_metadata['dimension'].keys()): + # We already processed these. + if field in {'name', 'semantic_types'}: + continue + + # Other fields are never generated, so they are never overridden, so no need to set them again. + del new_element_metadata['dimension'][field] + + new_element_metadata['dimension'] = frozendict.FrozenOrderedDict(new_element_metadata['dimension']) + + # If dimension ended up empty, remove it. + if not new_element_metadata['dimension']: + del new_element_metadata['dimension'] + + return frozendict.FrozenOrderedDict(new_element_metadata) + + @classmethod + def _generate_metadata(cls: typing.Type[D], container: types.ModuleType, d3m_types: types.ModuleType, value: typing.Any, + selector: TupleSelector, is_root: bool = False) -> typing.Dict[TupleSelector, typing.Dict]: + """ + Returned metadata should be additionally compacted before use. + + We make sure that the first element of the returned dict is the entry which corresponds to the ``selector``. + + Important: Any top-level field set by this method should be listed in ``ALL_GENERATED_KEYS``. + """ + + generated_metadata: dict = {} + + if is_root: + generated_metadata['schema'] = CONTAINER_SCHEMA_VERSION + + # We use a simple type here, not "utils.get_type" because it is faster and also because we anyway + # traverse the data structure ourselves and store nested typing information ourselves into metadata. + generated_metadata['structural_type'] = type(value) + + # TODO: Traverse structure also for Graph objects. + # Fast path. We first check if the value is of a simple data type. + if isinstance(value, d3m_types.simple_data_types): # type: ignore + # We just store structural type of the value (already present in "generated_metadata"). + return collections.OrderedDict([(selector, generated_metadata)]) + + if isinstance(value, container.List): # type: ignore + generated_metadata['dimension'] = { + 'length': len(value), + } + + metadata_dict = collections.OrderedDict([(selector, generated_metadata)]) + + metadata_dict_list: typing.List[typing.Dict[TupleSelector, typing.Dict]] = [] + for v in value: + # We recurse with selector set to "()"so that it is easier to compare results for equality. + metadata_dict_list.append(cls._generate_metadata(container, d3m_types, v, ())) + + if metadata_dict_list: + # Equality of "OrderedDict" also checks for the equality in order of fields. + if all(element_dict == metadata_dict_list[0] for element_dict in metadata_dict_list): + selector_all_elements = selector + (ALL_ELEMENTS,) + + # All elements are equal, so we use the first element. + for element_selector, element_metadata in metadata_dict_list[0].items(): + # We recursed with selector set to "()" so we have to adapt the real selector now. + new_selector = selector_all_elements + element_selector + assert new_selector not in metadata_dict + metadata_dict[new_selector] = element_metadata + + else: + for element_index, element_dict in enumerate(metadata_dict_list): + for element_selector, element_metadata in element_dict.items(): + # We recursed with selector set to "()" so we have to adapt the real selector now. 
+ new_selector = selector + (element_index,) + element_selector + assert new_selector not in metadata_dict + metadata_dict[new_selector] = element_metadata + + return metadata_dict + + if isinstance(value, container.Dataset): # type: ignore + generated_metadata['dimension'] = { + 'name': 'resources', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/DatasetResource'], + 'length': len(value), + } + + metadata_dict = collections.OrderedDict([(selector, generated_metadata)]) + + for k, v in value.items(): + if not isinstance(k, str): + raise TypeError("Dataset resource ID has to be a string, not: {k_type}".format(k_type=type(k))) + metadata_dict.update(cls._generate_metadata(container, d3m_types, v, selector + (k,))) + + # It is unlikely that metadata is equal across dataset resources, so we do not try to compact metadata here. + + return metadata_dict + + if isinstance(value, container.DataFrame): # type: ignore + if len(value.shape) != 2: + raise ValueError("Only two-dimensional DataFrames are supported, at {selector}.".format(selector=selector)) + + generated_metadata['semantic_types'] = ['https://metadata.datadrivendiscovery.org/types/Table'] + + generated_metadata['dimension'] = { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': value.shape[0], + } + + metadata_dict = collections.OrderedDict([(selector, generated_metadata)]) + + # Reusing the variable for next dimension. + generated_metadata = { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': value.shape[1], + }, + } + + selector_all_rows = selector + (ALL_ELEMENTS,) + metadata_dict[selector_all_rows] = generated_metadata + + for column_index, dtype in enumerate(value.dtypes): + column_metadata = {} + + # Only if a column name is a string. DataFrame can have a sequence/numbers for column names + # but those are generally automatically generated so we do not use them as column names here. + if isinstance(value.columns[column_index], str): + # We set the name first, so that recursive calls to "_generate_metadata" can potentially + # override it. "_generate_metadata" does not do it for now, but it could do it in the future. + # Generated names to not override names if they already exists in metadata, which is + # handled in the "_update_with_generated_metadata" method. + column_metadata['name'] = value.columns[column_index] + + selector_all_rows_column = selector_all_rows + (column_index,) + + # Values are objects. This could be something as simple as a Python string, or a whole other container value nested. + if dtype.kind == 'O': + metadata_column_dict_list: typing.List[typing.Dict[TupleSelector, dict]] = [] + for row_index, cell_value in enumerate(value.iloc[:, column_index]): + # We recurse with selector set to "()"so that it is easier to compare results for equality. + metadata_column_dict_list.append(cls._generate_metadata(container, d3m_types, cell_value, ())) + + if metadata_column_dict_list: + # Equality of "OrderedDict" also checks for the equality in order of fields. + if all(row_dict == metadata_column_dict_list[0] for row_dict in metadata_column_dict_list): + # All rows are equal, so we use the first row. + for row_selector, row_metadata in metadata_column_dict_list[0].items(): + # We recursed with selector set to "()" so we have to adapt the real selector now. 
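To see what the DataFrame branch above actually produces, a small untested probe (module paths assumed as before):
```
from d3m import container
from d3m.metadata import base as metadata_base

df = container.DataFrame({'a': [1, 2], 'b': ['x', 'y']})
metadata = metadata_base.DataMetadata().generate(df)

# Top level: structural type, Table semantic type, and a "rows" dimension of length 2.
print(metadata.query(()))
# All rows: a "columns" dimension of length 2.
print(metadata.query((metadata_base.ALL_ELEMENTS,)))
# Column 0: its name taken from the DataFrame and its structural type taken from the dtype.
print(metadata.query((metadata_base.ALL_ELEMENTS, 0)))
```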
+ new_selector = selector_all_rows_column + row_selector + if new_selector == selector_all_rows_column: + row_metadata.update(column_metadata) + assert new_selector not in metadata_dict + metadata_dict[new_selector] = row_metadata + + else: + metadata_dict[selector_all_rows_column] = column_metadata + + for row_index, row_dict in enumerate(metadata_column_dict_list): + for row_selector, row_metadata in row_dict.items(): + # We recursed with selector set to "()" so we have to adapt the real selector now. + new_selector = selector + (row_index, column_index) + row_selector + assert new_selector not in metadata_dict + metadata_dict[new_selector] = row_metadata + + else: + metadata_dict[selector_all_rows_column] = column_metadata + + else: + # DataFrame is trying to be smart and returns sometimes Python types instead + # of numpy types when retrieving values from it. On the other hand, dtypes are + # generally numpy types. So there can be discrepancy between recorded structural + # type in metadata and what you get for some operations out of a DataFrame. + # See: https://github.com/pandas-dev/pandas/issues/20791 + # https://github.com/pandas-dev/pandas/issues/13468 + column_metadata['structural_type'] = dtype.type + metadata_dict[selector_all_rows_column] = column_metadata + + return metadata_dict + + if isinstance(value, container.ndarray): # type: ignore + if not value.shape: + raise ValueError("Zero-dimensional arrays are not supported, at {selector}.".format(selector=selector)) + + metadata_dict = collections.OrderedDict() + + for dimension_index, dimension_length in enumerate(value.shape): + generated_metadata['dimension'] = { + 'length': dimension_length, + } + + if len(value.shape) == 2: + if dimension_index == 0: + generated_metadata['semantic_types'] = ['https://metadata.datadrivendiscovery.org/types/Table'] + generated_metadata['dimension']['name'] = 'rows' + generated_metadata['dimension']['semantic_types'] = ['https://metadata.datadrivendiscovery.org/types/TabularRow'] + elif dimension_index == 1: + generated_metadata['dimension']['name'] = 'columns' + generated_metadata['dimension']['semantic_types'] = ['https://metadata.datadrivendiscovery.org/types/TabularColumn'] + + metadata_dict[selector + (ALL_ELEMENTS,) * dimension_index] = generated_metadata + + # Reusing the variable for next dimension. + generated_metadata = {} + + if value.dtype.kind == 'O': + metadata_cell_dict_list: typing.List[typing.Dict[TupleSelector, typing.Dict]] = [] + metadata_cell_indices: typing.List[typing.Tuple] = [] + + iterator = numpy.nditer(value, flags=['multi_index', 'refs_ok']) + while not iterator.finished: + # We recurse with selector set to "()"so that it is easier to compare results for equality. + metadata_cell_dict_list.append(cls._generate_metadata(container, d3m_types, iterator.value.item(), ())) + metadata_cell_indices.append(tuple(iterator.multi_index)) + iterator.iternext() + + if metadata_cell_dict_list: + # Equality of "OrderedDict" also checks for the equality in order of fields. + if all(cell_dict == metadata_cell_dict_list[0] for cell_dict in metadata_cell_dict_list): + selector_all_cells = selector + (ALL_ELEMENTS,) * len(value.shape) + + # All cells are equal, so we use the first cell. + for cell_selector, cell_metadata in metadata_cell_dict_list[0].items(): + # We recursed with selector set to "()" so we have to adapt the real selector now. 
+ new_selector = selector_all_cells + cell_selector + assert new_selector not in metadata_dict + metadata_dict[new_selector] = cell_metadata + + else: + for cell_index, cell_dict in zip(metadata_cell_indices, metadata_cell_dict_list): + for cell_selector, cell_metadata in cell_dict.items(): + # We recursed with selector set to "()" so we have to adapt the real selector now. + new_selector = selector + cell_index + cell_selector + assert new_selector not in metadata_dict + metadata_dict[new_selector] = cell_metadata + + else: + metadata_dict[selector + (ALL_ELEMENTS,) * len(value.shape)] = {'structural_type': value.dtype.type} + + return metadata_dict + + # We went through all container types and none matched. + if is_root: + assert not isinstance(value, d3m_types.Container), type(value) # type: ignore + raise TypeError("Value is not of a container type, but '{type}'.".format(type=type(value))) + + # A special case for dicts, for which we traverse the structure. + if isinstance(value, dict): + generated_metadata['dimension'] = { + 'length': len(value), + } + + metadata_dict = collections.OrderedDict([(selector, generated_metadata)]) + + metadata_dict_list = [] + metadata_indices: typing.List[typing.Tuple] = [] + for k, v in value.items(): + if not isinstance(k, (str, int)): + raise TypeError("Dict key has to be a string or an integer, not: {k_type}".format(k_type=type(k))) + # We recurse with selector set to "()"so that it is easier to compare results for equality. + metadata_dict_list.append(cls._generate_metadata(container, d3m_types, v, ())) + metadata_indices.append(k) + + if metadata_dict_list: + # Equality of "OrderedDict" also checks for the equality in order of fields. + if all(element_dict == metadata_dict_list[0] for element_dict in metadata_dict_list): + selector_all_elements = selector + (ALL_ELEMENTS,) + + # All elements are equal, so we use the first element. + for element_selector, element_metadata in metadata_dict_list[0].items(): + # We recursed with selector set to "()" so we have to adapt the real selector now. + new_selector = selector_all_elements + element_selector + assert new_selector not in metadata_dict + metadata_dict[new_selector] = element_metadata + + else: + for element_index, element_dict in zip(metadata_indices, metadata_dict_list): + for element_selector, element_metadata in element_dict.items(): + # We recursed with selector set to "()" so we have to adapt the real selector now. + new_selector = selector + (element_index,) + element_selector + assert new_selector not in metadata_dict + metadata_dict[new_selector] = element_metadata + + return metadata_dict + + # We checked for all simple data types, container types, and a dict. Nothing else is left. + assert not isinstance(value, d3m_types.Data) # type: ignore + raise TypeError("Value is not of a data type, but '{type}'.".format(type=type(value))) + + def _update_with_generated_metadata(self, generated_metadata_dict: typing.Dict[TupleSelector, dict]) -> None: + """ + This method works well really just with generated metadata. It has some assumptions what ``generated_metadata_dict`` + contains and how to merge things (merge semantic types, do not override names, clear unset structural types). + """ + + # We first preprocess given updates. We have to specially merge some fields and respect overrides + # on direct elements. 
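The ndarray branch behaves analogously, with one dimension per axis. A hedged sketch, assuming `container.ndarray` can wrap an existing numpy array:
```
import numpy

from d3m import container
from d3m.metadata import base as metadata_base

array = container.ndarray(numpy.zeros((3, 4)))
metadata = metadata_base.DataMetadata().generate(array)

print(metadata.query(())['dimension']['length'])                             # 3 (rows)
print(metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'])  # 4 (columns)
# For a non-object dtype, per-cell metadata is just the structural type.
print(metadata.query((metadata_base.ALL_ELEMENTS, metadata_base.ALL_ELEMENTS)))
```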
+ updates: typing.List[typing.Tuple[TupleSelector, dict]] = [] + for selector, metadata in generated_metadata_dict.items(): + existing_metadata, metadata_exceptions = self.query_with_exceptions(selector, remove_no_value=False) + + # If structural type was not generated now, but it exists before, we have to remove it. In "_merge_generated_metadata" we make sure + # it is not re-set back, and here we add an update at the beginning which removes it. The reason why it is at the beginning is that + # it could be that the reason why there is no "structural_type" in "metadata" is because it was moved to metadata for corresponding + # "ALL_ELEMENTS". So, the order is then: we remove it through direct selector, then maye "ALL_ELEMENTS" selector re-sets it back, + # and merged metadata does not re-set it, because we made sure about that in "_merge_generated_metadata". + if 'structural_type' not in metadata and 'structural_type' in existing_metadata: + updates.insert(0, (selector, {'structural_type': NO_VALUE})) + + metadata = self._merge_generated_metadata(existing_metadata, metadata) + + updates.append((selector, metadata)) + + for exception_selector, exception_metadata in metadata_exceptions.items(): + diff_metadata = self._diff_generated_metadata(exception_metadata, metadata) + + if diff_metadata: + updates.append((exception_selector, diff_metadata)) + + for selector, metadata in updates: + metadata = utils.make_immutable_copy(metadata) + + if not isinstance(metadata, frozendict.FrozenOrderedDict): + raise exceptions.InvalidArgumentTypeError("Metadata should be a dict.") + + self._current_metadata = self._update(selector, self._current_metadata, metadata) + + @deprecate.function(message="create a DataMetadata instance explicitly instead") + @deprecate.arguments('source', 'timestamp', 'check', message="argument ignored") + def clear(self: D, metadata: typing.Dict[str, typing.Any] = None, *, for_value: typing.Any = None, + generate_metadata: bool = True, check: bool = True, source: typing.Any = None, timestamp: datetime.datetime = None) -> D: + """ + DEPRECATED: create a DataMetadata instance explicitly instead. + + Creates and returns a new (clear) metadata object. + + Parameters + ---------- + metadata: + Optional new initial metadata for the top-level of the value. + for_value: + Optional value to automatically generate metadata for. + generate_metadata: bool + Automatically generate metadata from ``for_value`` and update the metadata accordingly. + check: + DEPRECATED: argument ignored. + source: + DEPRECATED: argument ignored. + timestamp: + DEPRECATED: argument ignored. + + Returns + ------- + New metadata object. + """ + + # We call wrapped parent method directly so that there are no double warnings. + new_metadata = super().clear.__wrapped__(self, metadata=metadata) + + if for_value is not None and generate_metadata: + new_metadata._generate(for_value) + + return new_metadata + + # TODO: Check if structural types match the real type of a value. + def check(self, value: typing.Any) -> None: + """ + Checks that all metadata has a corresponding data in ``value`` and that every + metadata value is valid according to schema. If not it raises an exception. + + Parameters + ---------- + value: + Value to check against. 
+ """ + + self._check_value(self._current_metadata, value, []) + self._check_metadata([]) + + @classmethod + def _check_value(cls, metadata_entry: MetadataEntry, value: typing.Any, path: typing.List[SimpleSelectorSegment]) -> None: + if metadata_entry.all_elements is not None: + try: + # We should be able to at least compute length at this dimension + # (to signal that it is a sequence or a map). + len(value) + except Exception as error: + raise ValueError("ALL_ELEMENTS set but dimension missing at {path}.".format(path=path)) from error + + if isinstance(value, numpy.matrix): + # One cannot iterate over a matrix segment by segment. You always get back + # a matrix (2D structure) and not an array of rows or columns. By converting + # it to an array such iteration segment by segment works. + value = numpy.array(value) + + if isinstance(value, pandas.DataFrame): + for element_segment, element_metadata_entry in metadata_entry.elements.items(): + try: + # Fetch a row as a list. + element_value = [value.iloc[element_segment, k] for k in range(len(value.columns))] + except Exception as error: + raise ValueError("'{element_segment}' at {path} cannot be resolved.".format(element_segment=element_segment, path=path)) from error + + cls._check_value(element_metadata_entry, element_value, path + [element_segment]) + + else: + for element_segment, element_metadata_entry in metadata_entry.elements.items(): + try: + element_value = value[element_segment] + except Exception as error: + raise ValueError("'{element_segment}' at {path} cannot be resolved.".format(element_segment=element_segment, path=path)) from error + + cls._check_value(element_metadata_entry, element_value, path + [element_segment]) + + def _check_metadata(self, selector: ListSelector) -> None: + metadata = self.query(selector) + + if selector: + DATA_SCHEMA_VALIDATOR.validate(metadata) + else: + CONTAINER_SCHEMA_VALIDATOR.validate(metadata) + + for element in self.get_elements(selector): + self._check_metadata(selector + [element]) + + @classmethod + @deprecate.arguments('for_value', message="argument ignored") + def check_selector(cls, selector: Selector, for_value: typing.Any = None) -> None: + """ + Checks that a given ``selector`` is a valid selector. If ``selector`` is invalid it raises an exception. + + It checks that it is a tuple or a list and currently we require that all segments of a selector + are strings, integers, or a special value ``ALL_ELEMENTS``. + + Parameters + ---------- + selector: + Selector to check. + for_value: + DEPRECATED: argument ignored. 
+ """ + + super().check_selector(selector=selector) + + def get_column_index_from_column_name(self, column_name: str, *, at: Selector = ()) -> int: + column_indices = [] + + for column_index in range(self.query_field(list(at) + [ALL_ELEMENTS], 'dimension')['length']): + try: + if self.query_field(list(at) + [ALL_ELEMENTS, column_index], 'name') == column_name: + column_indices.append(column_index) + except KeyError: + pass + + if len(column_indices) > 1: + raise KeyError( + "Cannot resolve column name '{column_name}' at '{at}' because of duplicate column names".format( + column_name=column_name, + at=at, + ), + ) + elif column_indices: + return column_indices[0] + else: + raise KeyError( + "Cannot resolve column name '{column_name}' at '{at}' because column could not be found.".format( + column_name=column_name, + at=at, + ), + ) + + def select_columns(self: D, columns: typing.Sequence[SimpleSelectorSegment], *, allow_empty_columns: bool = False) -> D: + """ + Returns a new metadata object with metadata only for given ``columns``. + Moreover, columns are renumbered based on the position in ``columns`` list. + Top-level metadata stays unchanged, except for updating the length of the columns dimension to + the number of columns. + + So if the ``columns`` is ``[3, 6, 5]`` then output metadata will have three columns, ``[0, 1, 2]``, + mapping metadata for columns ``3`` to ``0``, ``6`` to ``1`` and ``5`` to ``2``. + + This allows also duplication of columns. + """ + + if not columns and not allow_empty_columns: + raise exceptions.InvalidArgumentValueError("No columns selected.") + + # This makes a copy so that we can modify metadata in-place. + outputs_metadata = self.update( + (ALL_ELEMENTS,), + { + 'dimension': { + 'length': len(columns), + }, + }, + ) + + for element_metadata_entry in itertools.chain( + [outputs_metadata._current_metadata.all_elements], + outputs_metadata._current_metadata.elements.values(), + ): + if element_metadata_entry is None: + continue + + elements = element_metadata_entry.elements + new_elements_evolver = utils.EMPTY_PMAP.evolver() + for i, column_index in enumerate(columns): + if column_index in elements: + # If "column_index" is really numeric, we re-enumerate it. + if isinstance(column_index, int): + new_elements_evolver.set(i, elements[column_index]) + else: + new_elements_evolver.set(column_index, elements[column_index]) + element_metadata_entry.elements = new_elements_evolver.persistent() + element_metadata_entry.is_elements_empty = not element_metadata_entry.elements + element_metadata_entry.update_is_empty() + + # TODO: Update boundary columns and "confidence for" references. + + return outputs_metadata + + def remove_columns(self: D, column_indices: typing.Sequence[int]) -> D: + """ + Removes columns from metadata. + + It moves to the left metadata for columns after removed columns. + If you do not want that, use ``remove_column``. + + It throws an exception if no columns would be left after removing columns. + """ + + columns = list(range(self.query_field((ALL_ELEMENTS,), 'dimension')['length'])) + + if not columns: + raise ValueError("No columns to remove.") + + for column_index in column_indices: + columns.remove(column_index) + + if not columns: + raise ValueError("Removing columns would have removed the last column.") + + # TODO: Update boundary columns and "confidence for" references. 
+ + return self.select_columns(columns) + + def append_columns(self: D, right: D, *, use_right_metadata: bool = False) -> D: + """ + Appends metadata for all columns from ``right`` to the right of this metadata. + + Top-level metadata of ``right`` is ignored, not merged, except if ``use_right_metadata`` + is set, in which case top-level metadata of this metadata is ignored and one from ``right`` is + used instead. + """ + + left_length = self.query_field((ALL_ELEMENTS,), 'dimension')['length'] + right_length = right.query_field((ALL_ELEMENTS,), 'dimension')['length'] + + if not use_right_metadata: + outputs_metadata = self + + for column_index in range(right_length): + # To go over "ALL_ELEMENTS" and all rows. + for element in right.get_elements(()): + outputs_metadata = right.copy_to(outputs_metadata, [element, ALL_ELEMENTS], [element, left_length + column_index], ignore_all_elements=True) + outputs_metadata = right.copy_to(outputs_metadata, [element, column_index], [element, left_length + column_index], ignore_all_elements=True) + + else: + # This makes a copy so that we can modify metadata in-place. + outputs_metadata = right.update( + (ALL_ELEMENTS,), + {}, + ) + + # Move columns and make space for left metadata to be prepended. + # We iterate over a list so that we can change dict while iterating. + for element_metadata_entry in itertools.chain( + [outputs_metadata._current_metadata.all_elements], + outputs_metadata._current_metadata.elements.values(), + ): + if element_metadata_entry is None: + continue + + new_elements_evolver = element_metadata_entry.elements.evolver() + for element, metadata in element_metadata_entry.elements.items(reverse=True): + new_elements_evolver.remove(element) + new_elements_evolver.set(element + left_length, metadata) + element_metadata_entry.elements = new_elements_evolver.persistent() + element_metadata_entry.is_elements_empty = not element_metadata_entry.elements + element_metadata_entry.update_is_empty() + + for column_index in range(left_length): + # To go over "ALL_ELEMENTS" and all rows. + for element in right.get_elements(()): + outputs_metadata = self.copy_to(outputs_metadata, [element, ALL_ELEMENTS], [element, column_index], ignore_all_elements=True) + outputs_metadata = self.copy_to(outputs_metadata, [element, column_index], [element, column_index], ignore_all_elements=True) + + outputs_metadata = outputs_metadata.update((ALL_ELEMENTS,), {'dimension': {'length': left_length + right_length}}) + + # TODO: Update boundary columns and "confidence for" references. + + return outputs_metadata + + def insert_columns(self: D, columns: D, at_column_index: int) -> D: + """ + Inserts metadata for all columns from ``columns`` before ``at_column_index`` column in this metadata, + pushing all existing columns to the right. + + E.g., ``at_column_index == 0`` means inserting ``columns`` at the beginning of this metadata. + + Top-level metadata of ``columns`` is ignored. 
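`append_columns` above glues two tables' metadata side by side. A minimal, untested sketch:
```
from d3m import container
from d3m.metadata import base as metadata_base

left = metadata_base.DataMetadata().generate(container.DataFrame({'a': [1], 'b': [2]}))
right = metadata_base.DataMetadata().generate(container.DataFrame({'c': [3]}))

combined = left.append_columns(right)
print(combined.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'])  # 3
print(combined.query_column(2)['name'])                                      # 'c'
```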
+ """ + + columns_length = columns.query_field((ALL_ELEMENTS,), 'dimension')['length'] + + if at_column_index < 0: + raise exceptions.InvalidArgumentValueError("\"at_column_index\" is smaller than 0.") + if at_column_index > columns_length: + raise exceptions.InvalidArgumentValueError("\"at_column_index\" is larger than the range of existing columns.") + + if at_column_index == 0: + return columns.append_columns(self, use_right_metadata=True) + + if at_column_index == columns_length: + return self.append_columns(columns) + + # TODO: This could probably be optimized without all the slicing and joining. + + before = self.select_columns(list(range(0, at_column_index))) + after = self.select_columns(list(range(at_column_index, columns_length))) + + # TODO: Update boundary columns and "confidence for" references. + + return before.append_columns(columns).append_columns(after) + + def _replace_column(self: D, column_index: int, columns: 'DataMetadata', columns_column_index: int) -> D: + outputs_metadata = self.remove_column(column_index) + + # To go over "ALL_ELEMENTS" and all rows. + for element in columns.get_elements(()): + outputs_metadata = columns.copy_to(outputs_metadata, [element, ALL_ELEMENTS], [element, column_index], ignore_all_elements=True) + outputs_metadata = columns.copy_to(outputs_metadata, [element, columns_column_index], [element, column_index], ignore_all_elements=True) + + return outputs_metadata + + def replace_columns(self: D, columns: D, column_indices: typing.Sequence[int]) -> D: + """ + Replaces columns listed in ``column_indices`` with ``columns``, in order, in this metadata. + + ``column_indices`` and ``columns`` do not have to match in number of columns. Columns are first + replaced in order for matching indices and columns. If then there are more ``column_indices`` than + ``columns``, additional ``column_indices`` columns are removed. If there are more ``columns`` than + ``column_indices`` columns, then additional ``columns`` are inserted after the last replaced column. + + If ``column_indices`` is empty, then the behavior is equivalent to calling ``append_columns``. + + Top-level metadata of ``columns`` is ignored. + """ + + # TODO: This could probably be optimized without all the slicing and joining. + + if not column_indices: + return self.append_columns(columns) + + outputs = self + columns_length = columns.query_field((ALL_ELEMENTS,), 'dimension')['length'] + columns_to_remove = [] + i = 0 + + # This loop will run always at least once, so "column_index" will be set. + while i < len(column_indices): + column_index = column_indices[i] + + if i < columns_length: + outputs = outputs._replace_column(column_index, columns, i) + else: + # If there are more column indices than columns in "columns", we + # select additional columns for removal. + columns_to_remove.append(column_index) + + i += 1 + + # When there are less column indices than columns in "columns", we insert the rest after + # the last replaced column. + if i < columns_length: + columns = columns.select_columns(list(range(i, columns_length))) + # "column_index" points to the last place we inserted a column, so "+ 1" points after it. + outputs = outputs.insert_columns(columns, column_index + 1) + + # We remove columns at the end so that we do not break and column index used before. + # When removing columns, column indices shift. + if columns_to_remove: + outputs = outputs.remove_columns(columns_to_remove) + + # TODO: Update boundary columns and "confidence for" references. 
+ + return outputs + + def _check_same_number_of_samples(self, metadata: 'DataMetadata') -> None: + if self.query_field((), 'dimension')['length'] != metadata.query_field((), 'dimension')['length']: + raise ValueError("Data does not match in the number of samples.") + + def get_index_columns(self, *, at: Selector = ()) -> typing.Sequence[int]: + """ + Returns column indices of the primary index columns. + + It makes sure ``d3mIndex`` is always first listed. + """ + + index_columns = self.list_columns_with_semantic_types(('https://metadata.datadrivendiscovery.org/types/PrimaryKey', 'https://metadata.datadrivendiscovery.org/types/PrimaryMultiKey'), at=at) + + def d3m_index_first(index_column: int) -> int: + try: + if self.query_field((ALL_ELEMENTS, index_column), 'name') == 'd3mIndex': + return -1 + except KeyError: + pass + + return 0 + + return sorted(index_columns, key=d3m_index_first) + + def horizontal_concat(self: D, right: D, *, use_index: bool = True, remove_second_index: bool = True, use_right_metadata: bool = False) -> D: + """ + Similar to ``append_columns``, but it respects primary index columns, by default. + + It is required that both inputs have the same number of samples. + """ + + self._check_same_number_of_samples(right) + + left_indices = self.get_index_columns() + right_indices = right.get_index_columns() + + if left_indices and right_indices: + if use_index: + # TODO: Reorder metadata rows as well. + # We cannot really do this without data? + pass + + # Removing second primary key column. + if remove_second_index: + right = right.remove_columns(right_indices) + + # TODO: Update boundary columns and "confidence for" references. + + return self.append_columns(right, use_right_metadata=use_right_metadata) + + def set_table_metadata(self: D, *, at: Selector = ()) -> D: + at = list(at) + + outputs_metadata = self + + try: + dimension = self.query_field(at + [ALL_ELEMENTS], 'dimension') + except KeyError: + dimension = None + + # If input is at least 2D, then we set table metadata. 
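`horizontal_concat` is `append_columns` plus primary-key handling. A hedged sketch (the `PrimaryKey` URI is an assumption, and it is added manually here because `generate` does not infer it):
```
from d3m import container
from d3m.metadata import base as metadata_base

left = metadata_base.DataMetadata().generate(container.DataFrame({'d3mIndex': [0, 1], 'a': [1, 2]}))
right = metadata_base.DataMetadata().generate(container.DataFrame({'d3mIndex': [0, 1], 'b': [3, 4]}))

key_type = 'https://metadata.datadrivendiscovery.org/types/PrimaryKey'
left = left.add_semantic_type((metadata_base.ALL_ELEMENTS, 0), key_type)
right = right.add_semantic_type((metadata_base.ALL_ELEMENTS, 0), key_type)

print(left.get_index_columns())  # [0], with "d3mIndex" listed first

# The second index column is dropped by default, so three columns remain.
combined = left.horizontal_concat(right)
print(combined.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'])  # 3
```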
+ if dimension is not None: + metadata = outputs_metadata.query(at) + + semantic_types = list(metadata.get('semantic_types', [])) + if 'https://metadata.datadrivendiscovery.org/types/Table' not in semantic_types: + semantic_types.append('https://metadata.datadrivendiscovery.org/types/Table') + + dimension_semantic_types = list(metadata.get('dimension', {}).get('semantic_types', [])) + if 'https://metadata.datadrivendiscovery.org/types/TabularRow' not in dimension_semantic_types: + dimension_semantic_types.append('https://metadata.datadrivendiscovery.org/types/TabularRow') + dimension_semantic_types = [semantic_type for semantic_type in dimension_semantic_types if semantic_type not in {'https://metadata.datadrivendiscovery.org/types/TabularColumn'}] + + outputs_metadata = outputs_metadata.update(at, { + 'dimension': { + 'name': 'rows', + 'semantic_types': dimension_semantic_types, + }, + 'semantic_types': semantic_types, + }) + + metadata = outputs_metadata.query(at + [ALL_ELEMENTS]) + + dimension_semantic_types = list(metadata.get('dimension', {}).get('semantic_types', [])) + if 'https://metadata.datadrivendiscovery.org/types/TabularColumn' not in dimension_semantic_types: + dimension_semantic_types.append('https://metadata.datadrivendiscovery.org/types/TabularColumn') + dimension_semantic_types = [semantic_type for semantic_type in dimension_semantic_types if semantic_type not in {'https://metadata.datadrivendiscovery.org/types/TabularRow'}] + + new_metadata: typing.Dict = { + 'dimension': { + 'name': 'columns', + 'semantic_types': dimension_semantic_types, + }, + } + + if 'semantic_types' in metadata: + new_metadata['semantic_types'] = [semantic_type for semantic_type in metadata['semantic_types'] if semantic_type not in {'https://metadata.datadrivendiscovery.org/types/Table'}] + if not new_metadata['semantic_types']: + new_metadata['semantic_types'] = NO_VALUE + + outputs_metadata = outputs_metadata.update(at + [ALL_ELEMENTS], new_metadata) + + selector: ListSelector = at + [ALL_ELEMENTS, ALL_ELEMENTS] + while True: + try: + dimension = self.query_field(selector, 'dimension') + except KeyError: + break + + metadata = outputs_metadata.query(selector) + + new_metadata = {} + + if 'semantic_types' in metadata: + new_metadata['semantic_types'] = [semantic_type for semantic_type in metadata['semantic_types'] if semantic_type not in {'https://metadata.datadrivendiscovery.org/types/Table'}] + if not new_metadata['semantic_types']: + new_metadata['semantic_types'] = NO_VALUE + + if 'semantic_types' in dimension: + new_metadata['dimension'] = {} + + dimension_semantic_types = list(dimension['semantic_types']) + if 'https://metadata.datadrivendiscovery.org/types/TabularColumn' in dimension_semantic_types and dimension.get('name', None) == 'columns': + new_metadata['dimension']['name'] = NO_VALUE + if 'https://metadata.datadrivendiscovery.org/types/TabularRow' in dimension_semantic_types and dimension.get('name', None) == 'rows': + new_metadata['dimension']['name'] = NO_VALUE + + dimension_semantic_types = [ + semantic_type for semantic_type in dimension_semantic_types + if semantic_type not in {'https://metadata.datadrivendiscovery.org/types/TabularColumn', 'https://metadata.datadrivendiscovery.org/types/TabularRow'} + ] + new_metadata['dimension']['semantic_types'] = dimension_semantic_types + if not new_metadata['dimension']['semantic_types']: + new_metadata['dimension']['semantic_types'] = NO_VALUE + + if new_metadata: + outputs_metadata = outputs_metadata.update(selector, new_metadata) + + 
selector.append(ALL_ELEMENTS) + + return outputs_metadata + + def get_column_references_by_column_index(self, current_resource_id: str, *, at: Selector = ()) -> typing.Dict[str, typing.Dict[ColumnReference, typing.List[ColumnReference]]]: + references: typing.Dict[str, typing.Dict[ColumnReference, typing.List[ColumnReference]]] = { + 'confidence_for': {}, + 'rank_for': {}, + 'boundary_for': {}, + 'foreign_key': {}, + } + + for column_index in range(self.query_field(list(at) + [ALL_ELEMENTS], 'dimension')['length']): + column_metadata = self.query_column(column_index, at=at) + + column_reference = ColumnReference(current_resource_id, column_index) + + if 'confidence_for' in column_metadata and 'column_indices' in column_metadata['confidence_for']: + reference_resource_id = column_metadata['confidence_for'].get('resource_id', current_resource_id) + + references['confidence_for'][column_reference] = [ + ColumnReference(reference_resource_id, reference_column_index) + for reference_column_index in column_metadata['confidence_for']['column_indices'] + ] + + if 'rank_for' in column_metadata and 'column_indices' in column_metadata['rank_for']: + reference_resource_id = column_metadata['rank_for'].get('resource_id', current_resource_id) + + references['rank_for'][column_reference] = [ + ColumnReference(reference_resource_id, reference_column_index) + for reference_column_index in column_metadata['rank_for']['column_indices'] + ] + + if 'boundary_for' in column_metadata and 'column_index' in column_metadata['boundary_for']: + reference_resource_id = column_metadata['boundary_for'].get('resource_id', current_resource_id) + + references['boundary_for'][column_reference] = [ + ColumnReference(reference_resource_id, column_metadata['boundary_for']['column_index']), + ] + + if 'foreign_key' in column_metadata and column_metadata['foreign_key']['type'] == 'COLUMN' and 'column_index' in column_metadata['foreign_key']: + reference_resource_id = column_metadata['foreign_key']['resource_id'] + + references['foreign_key'][column_reference] = [ + ColumnReference(reference_resource_id, column_metadata['foreign_key']['column_index']), + ] + + return references + + +class PrimitiveMetadata(Metadata): + """ + A class for metadata for primitives. + + It checks all updates against primitive schema. Note that as such empty (just created) metadata object + does not validate against the schema. If an instance is set on a primitive class, primitive's metaclass + logic will automatically link metadata object with the primitive class and generate required metadata. + """ + + def __init__(self, metadata: typing.Dict[str, typing.Any] = None) -> None: + super().__init__(metadata=metadata) + + # We do not do validation here because provided metadata on its own is + # probably not sufficient for validation to pass. Validation happens + # inside "contribute_to_class" method instead. + + # Importing here to prevent import cycle. + from d3m.primitive_interfaces import base + + self.primitive: typing.Type[base.PrimitiveBase] = None + + # Not adhering to Liskov substitution principle: we do not have "selector" argument. 
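`get_column_references_by_column_index` above collects `confidence_for`, `rank_for`, `boundary_for`, and `foreign_key` entries into `ColumnReference` pairs. A hedged sketch, assuming a `foreign_key` entry in the usual D3M column-metadata format and using `learningData` and `codes` as placeholder resource IDs:
```
from d3m import container
from d3m.metadata import base as metadata_base

md = metadata_base.DataMetadata().generate(container.DataFrame({'a': [1], 'b': [2]}))
md = md.update_column(1, {'foreign_key': {'type': 'COLUMN', 'resource_id': 'codes', 'column_index': 0}})

references = md.get_column_references_by_column_index('learningData')
# Roughly: {ColumnReference('learningData', 1): [ColumnReference('codes', 0)]}
print(references['foreign_key'])
```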
+ @deprecate.arguments('source', 'timestamp', message="argument ignored") + def update(self: P, metadata: typing.Dict[str, typing.Any], *, source: typing.Any = None, timestamp: datetime.datetime = None) -> P: # type: ignore + new_metadata = super().update(selector=(), metadata=metadata) + + self._validate(new_metadata.query()) + + return new_metadata + + @deprecate.function(message="create a PrimitiveMetadata instance explicitly instead") + @deprecate.arguments('source', 'timestamp', message="argument ignored") + def clear(self: P, metadata: typing.Dict[str, typing.Any] = None, *, source: typing.Any = None, timestamp: datetime.datetime = None) -> P: + return super().clear(metadata=metadata) + + # Not adhering to Liskov substitution principle: we do not have "selector" argument. + def query(self) -> frozendict.FrozenOrderedDict: # type: ignore + return super().query(selector=()) + + # "primitive" should be of PrimitiveBase here, but we do not want to introduce a + # cyclic dependency. We validate the type at runtime in the method. + def contribute_to_class(self: P, primitive: typing.Any) -> None: + # Importing here to prevent import cycle. + from d3m.primitive_interfaces import base + + if self.primitive is not None: + raise exceptions.InvalidStateError("Primitive is already set to '{primitive}'.".format(primitive=self.primitive)) + + if not issubclass(primitive, base.PrimitiveBase): + raise exceptions.InvalidArgumentTypeError("Primitive argument is not a subclass of 'PrimitiveBase' class.") + + self.primitive = primitive + + self._generate_and_update() + + @classmethod + def _validate_contact_information(cls, metadata: typing.Dict) -> None: + # See https://gitlab.com/datadrivendiscovery/d3m/issues/178 for motivation for this check. + + # If it is a locally registered/used primitive, we do not validate contact information. + if 'installation' not in metadata: + return + + if 'source' not in metadata: + logger.warning( + "%(python_path)s: No \"source\" field in the primitive metadata. Metadata should contain contact information and bug reporting URI.", + { + 'python_path': metadata['python_path'], + }, + ) + return + + if not metadata['source'].get('contact', None): + logger.warning( + "%(python_path)s: Contact information such as the email address of the author " + "(e.g., \"mailto:author@example.com\") should be specified in primitive metadata in its \"source.contact\" field.", + { + 'python_path': metadata['python_path'], + }, + ) + + # If the list is empty, it is also false. + if not metadata['source'].get('uris', None): + logger.warning( + "%(python_path)s: A bug reporting URI should be specified in primitive metadata in its \"source.uris\" field.", + { + 'python_path': metadata['python_path'], + }, + ) + + # Make sure a primitive provides a description (through docstring). Because we use special metaclass + # which inherits description from a base class, we have to check the description itself. + # See: https://gitlab.com/datadrivendiscovery/d3m/issues/167 + @classmethod + def _validate_description(cls, metadata: typing.Dict) -> None: + # Importing here to prevent import cycle. 
+ from d3m.primitive_interfaces import base + + if 'description' not in metadata or not metadata['description'] or metadata['description'].startswith(base.DEFAULT_DESCRIPTION): + logger.warning( + "%(python_path)s: Primitive is not providing a description through its docstring.", + { + 'python_path': metadata['python_path'], + }, + ) + + # Checks that the primitive's Python path complies with namespace requirements. + # See: https://gitlab.com/datadrivendiscovery/d3m/issues/3 + @classmethod + def _validate_namespace_compliance(cls, python_path: str, primitive_family: typing.Union[PrimitiveFamily, str]) -> None: # type: ignore + segments = python_path.split('.') + + if len(segments) != 5: + logger.warning( + "%(python_path)s: Primitive's Python path does not adhere to d3m.primitives namespace specification. " + "Reason: must have 5 segments.", + { + 'python_path': python_path, + }, + ) + else: + if segments[0] != 'd3m' or segments[1] != 'primitives': + logger.warning( + "%(python_path)s: Primitive's Python path does not adhere to d3m.primitives namespace specification. " + "Reason: must start with \"d3m.primitives\".", + { + 'python_path': python_path, + }, + ) + + family = segments[2] + name = segments[3] + kind = segments[4] + + # "primitive_family" could also already be a string. + if isinstance(primitive_family, str): + primitive_family_name = primitive_family + else: + primitive_family_name = primitive_family.name + + if family != primitive_family_name.lower(): # type: ignore + logger.warning( + "%(python_path)s: Primitive's Python path does not adhere to d3m.primitives namespace specification. " + "Reason: primitive family segment must match primitive's primitive family.", + { + 'python_path': python_path, + }, + ) + + if name not in primitive_names.PRIMITIVE_NAMES: + logger.warning( + "%(python_path)s: Primitive's Python path does not adhere to d3m.primitives namespace specification. " + "Reason: must have a known primitive name segment.", + { + 'python_path': python_path, + }, + ) + + if not kind[0].isupper(): + logger.warning( + "%(python_path)s: Primitive's Python path does not adhere to d3m.primitives namespace specification. " + "Reason: primitive kind segment must start with upper case.", + { + 'python_path': python_path, + }, + ) + + @classmethod + def _validate(cls, metadata: typing.Dict) -> None: + PRIMITIVE_SCHEMA_VALIDATOR.validate(metadata) + + cls._validate_installation(metadata) + cls._validate_volumes(metadata) + cls._validate_docker_containers(metadata) + cls._validate_hyperparams_to_tune(metadata) + cls._validate_optional_constructor_arguments(metadata) + cls._validate_namespace_compliance(metadata['python_path'], metadata['primitive_family']) + cls._validate_contact_information(metadata) + cls._validate_description(metadata) + + def _generate_and_update(self) -> None: + generated_metadata = self._generate_metadata_for_primitive() + + self._update_in_place((), generated_metadata, self._current_metadata) + + self._validate(self.query()) + + @classmethod + def _validate_installation(cls, metadata: typing.Dict) -> None: + for entry in metadata.get('installation', []): + # We can check simply equality because metadata enumerations are equal to strings as well, + # and "entry['type']" can be both a string or an enumeration instance. + if entry['type'] != PrimitiveInstallationType.PIP: + continue + + if 'package' in entry: + if '/' in entry['package']: + raise exceptions.InvalidMetadataError("Invalid package name '{package_name}'. 
If you want to use an URI pointing to a package, use 'package_uri' instead.".format( + package_name=entry['package'], + )) + + continue + + if 'package_uri' not in entry: + continue + + if entry['package_uri'].startswith('git+git@'): + # "git+git@git.myproject.org:MyProject" format cannot be parsed with urlparse. + raise exceptions.InvalidMetadataError("Only git+http and git+https URI schemes are allowed.") + + parsed_uri = url_parse.urlparse(entry['package_uri']) + + # It is not a git pip URI. For now we then do not validate it. + if not parsed_uri.scheme.startswith('git'): + continue + + if parsed_uri.scheme not in ['git+http', 'git+https']: + raise exceptions.InvalidMetadataError("Only git+http and git+https URI schemes are allowed.") + + if '@' not in parsed_uri.path: + raise exceptions.InvalidMetadataError("Package URI does not include a commit hash: {package_uri}".format(package_uri=entry['package_uri'])) + + path, commit_hash = parsed_uri.path.rsplit('@', 1) + + if not COMMIT_HASH_REGEX.match(commit_hash): + raise exceptions.InvalidMetadataError("Package URI does not include a commit hash: {package_uri}".format(package_uri=entry['package_uri'])) + + if not parsed_uri.fragment: + raise exceptions.InvalidMetadataError("Package URI does not include a '#egg=package_name' URI suffix.") + + parsed_fragment = url_parse.parse_qs(parsed_uri.fragment, strict_parsing=True) + + if 'egg' not in parsed_fragment: + raise exceptions.InvalidMetadataError("Package URI does not include a '#egg=package_name' URI suffix.") + + @classmethod + def _validate_optional_constructor_arguments(cls, metadata: typing.Dict) -> None: + installation = metadata.get('installation', []) + + containers = [entry for entry in installation if entry.get('type', None) == PrimitiveInstallationType.DOCKER] + if containers and 'docker_containers' not in metadata['primitive_code'].get('instance_methods', {})['__init__']['arguments']: + raise exceptions.InvalidPrimitiveCodeError("Primitive defines a Docker container dependency but does not accept 'docker_containers' argument to the constructor.") + + volumes = cls._get_volumes(metadata) + if volumes and 'volumes' not in metadata['primitive_code'].get('instance_methods', {})['__init__']['arguments']: + raise exceptions.InvalidPrimitiveCodeError("Primitive defines a volume dependency but does not accept 'volumes' argument to the constructor.") + + @classmethod + def _validate_hyperparams_to_tune(cls, metadata: typing.Dict) -> None: + hyperparams = metadata['primitive_code'].get('hyperparams', {}) + + for name in metadata.get('hyperparams_to_tune', []): + if name not in hyperparams: + raise exceptions.InvalidMetadataError("Hyper-parameter in 'hyperparams_to_tune' metadata does not exist: {name}".format(name=name)) + + def _generate_metadata_for_primitive(self) -> typing.Dict[str, typing.Any]: + # Importing here to prevent import cycle. 
+ from d3m.primitive_interfaces import base + + type_arguments = self._get_type_arguments() + class_attributes = self._get_class_attributes() + hyperparams_class = typing.cast(typing.Type[hyperparams_module.Hyperparams], type_arguments[base.Hyperparams]) + arguments, instance_methods = self._get_arguments_and_methods(hyperparams_class, type_arguments) + self._validate_constructor(instance_methods) + self._validate_multi_produce(instance_methods) + self._validate_fit_multi_produce(instance_methods) + hyperparams = self._get_hyperparams(hyperparams_class) + class_methods = self._get_class_methods(type_arguments) + instance_attributes = self._get_instance_attributes() + params = self._get_params(type_arguments) + + # Sanity check. + hyperparams_keys = set(hyperparams.keys()) + # We can check simply equality because metadata enumerations are equal to strings as well, + # and "argument['kind']" can be both a string or an enumeration instance. + non_hyperparameter_arguments_keys = {name for name, argument in arguments.items() if argument['kind'] != PrimitiveArgumentKind.HYPERPARAMETER} + overlapping_keys = hyperparams_keys & non_hyperparameter_arguments_keys + if len(overlapping_keys): + raise exceptions.InvalidPrimitiveCodeError("Hyper-paramater names are overlapping with non-hyperparameter argument names: {overlapping_keys}".format(overlapping_keys=overlapping_keys)) + + primitive_code = { + # We have to convert parameters to their names because JSON schema supports only strings for keys. + 'class_type_arguments': {parameter.__name__: argument for parameter, argument in type_arguments.items()}, + 'interfaces_version': d3m.__version__, + 'interfaces': self._get_interfaces(), + 'hyperparams': hyperparams, + 'arguments': arguments, + 'class_methods': class_methods, + 'instance_methods': instance_methods, + 'class_attributes': class_attributes, + 'instance_attributes': instance_attributes, + } + + if params is not None: + primitive_code['params'] = params + + result = { + 'schema': PRIMITIVE_SCHEMA_VERSION, + 'original_python_path': '{module}.{class_name}'.format( + module=self.primitive.__module__, + class_name=self.primitive.__name__, + ), + 'primitive_code': primitive_code, + 'structural_type': self.primitive, + } + + description = inspect.cleandoc(getattr(self.primitive, '__doc__', None) or '') or None + if description is not None: + result['description'] = description + + digest = self._compute_primitive_digest() + if digest is not None: + result['digest'] = digest + + return result + + def _compute_primitive_digest(self) -> typing.Optional[str]: + primitive_metadata = self.query() + + # We use installation metadata for digest because it uniquely identifies the content of the primitive. + # TODO: Some primitives install extra code/data from their setup.py during installation. Could we capture that with digest as well? + installation = primitive_metadata.get('installation', None) + + if not installation: + return None + + # We use "to_json_structure" here and not "to_reversible_json_structure" + # because pickled values might not be deterministic. + to_digest = utils.to_json_structure({ + # We include primitive ID as well, so that different primitives + # from the same package do not have the same digest. + 'id': primitive_metadata['id'], + 'installation': installation, + }) + + return utils.compute_digest(to_digest) + + # Using typing.TypeVar in type signature does not really work, so we are using type instead. 
+ # See: https://github.com/python/typing/issues/520 + def _get_type_arguments(self) -> typing.Dict[type, type]: + # Importing here to prevent import cycle. + from d3m.primitive_interfaces import base + + # This call also catches if type parameter has been overridden with a new type variable. + # This means that we for free get to make sure type parameters from the base class stay + # as they are expected to be. It also fetches them recursively, so one cannot hide a + # type parameter (but can fix it to a fixed type instead of leaving it open for a + # subclass to choose it). + type_arguments = utils.get_type_arguments(self.primitive, unique_names=True) + + for parameter, argument in type_arguments.items(): + # Params type argument is optional and can be set to None. + if parameter == base.Params and issubclass(argument, type(None)): + continue + + if not utils.is_subclass(argument, parameter): + raise exceptions.InvalidPrimitiveCodeError("Type parameter '{name}' has type '{type}' and not an expected type: {expected}".format( + name=parameter.__name__, type=argument, expected=parameter.__bound__, # type: ignore + )) + + return type_arguments + + def _resolve_type(self, obj: type, type_arguments: typing.Dict[type, type]) -> type: + if obj in type_arguments: + return type_arguments[obj] + else: + return obj + + def _get_interfaces(self) -> typing.Tuple[str, ...]: + mro = [parent for parent in inspect.getmro(self.primitive) if parent.__module__.startswith('d3m.primitive_interfaces.')] + + interfaces: typing.List[str] = [] + for parent in mro: + interface = utils.get_full_name(parent) + # Remove package name. + interface = '.'.join(interface.split('.')[2:]) + if interface not in interfaces: + interfaces.append(interface) + + if not len(interfaces): + raise exceptions.InvalidPrimitiveCodeError("The primitive does not implement a standard interface.") + + return tuple(interfaces) + + # Using typing.TypeVar in type signature does not really work, so we are using type instead. + # See: https://github.com/python/typing/issues/520 + def _get_params(self, type_arguments: typing.Dict[type, type]) -> typing.Optional[typing.Dict[str, type]]: + # Importing here to prevent import cycle. + from d3m.primitive_interfaces import base + + params = type_arguments.get(base.Params, type(None)) + + if issubclass(params, type(None)): + return None + + return params.__params_items__ # type: ignore + + def _get_hyperparams(self, hyperparams_class: 'typing.Type[hyperparams_module.Hyperparams]') -> typing.Dict[str, typing.Dict]: + # We check this here and not during hyper-parameter construction itself because + # we want to require this only once it is used with a primitive. Hyper-parameters + # might be used and constructed in other settings as well. 
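+ # For example (illustrative, assuming "from d3m.metadata import hyperparams"), a
+ # hyper-parameter used by a primitive is expected to carry one of the required
+ # semantic types, such as:
+ #
+ #     n_estimators = hyperparams.UniformInt(
+ #         lower=1, upper=1000, default=10,
+ #         semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
+ #     )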
+ for hyperparameter_name, hyperparameter in hyperparams_class.configuration.items(): + if not set(hyperparameter.semantic_types) & HYPERPARAMETER_REQUIRED_SEMANTIC_TYPES: + raise exceptions.InvalidPrimitiveCodeError( + "Hyper-parameter '{hyperparameter_name}' does not contain any of required semantic types: {required}".format( + hyperparameter_name=hyperparameter_name, + required=sorted(HYPERPARAMETER_REQUIRED_SEMANTIC_TYPES), + ), + ) + + return hyperparams_class.to_simple_structure() + + def _get_class_attributes(self) -> typing.Dict[str, type]: + result = {} + + for attribute_name, attribute in inspect.getmembers(self.primitive): + if attribute_name.startswith('_'): + continue + + if utils.is_class_method_on_class(attribute) or utils.is_instance_method_on_class(attribute): + continue + + result[attribute_name] = type(attribute) + + result_keys = set(result.keys()) + expected_result_keys = set(EXPECTED_CLASS_ATTRIBUTES.keys()) + + missing = expected_result_keys - result_keys + if len(missing): + raise exceptions.InvalidPrimitiveCodeError("Not all expected public class attributes exist: {missing}".format(missing=missing)) + + extra = result_keys - expected_result_keys + if len(extra): + raise exceptions.InvalidPrimitiveCodeError("Additional unexpected public class attributes exist, consider making them private by prefixing them with '_': {extra}".format(extra=extra)) + + for attribute_name, attribute in result.items(): + if not utils.is_subclass(attribute, EXPECTED_CLASS_ATTRIBUTES[attribute_name]): + raise exceptions.InvalidPrimitiveCodeError("Class attribute '{attribute_name}' does not have an expected type.".format(attribute_name=attribute_name)) + + return result + + # Using typing.TypeVar in type signature does not really work, so we are using type instead. + # See: https://github.com/python/typing/issues/520 + def _get_arguments_and_methods( + self, hyperparams_class: 'typing.Type[hyperparams_module.Hyperparams]', type_arguments: typing.Dict[type, type], + ) -> typing.Tuple[typing.Dict[str, typing.Dict], typing.Dict[str, typing.Dict]]: + # Importing here to prevent import cycle. + from d3m.primitive_interfaces import base + from d3m import types as types_module + + arguments: typing.Dict[str, typing.Dict] = {} + methods: typing.Dict[str, typing.Dict] = {} + + for method_name, method in inspect.getmembers(self.primitive): + if method_name.startswith('_') and method_name != '__init__': + continue + + if not utils.is_instance_method_on_class(method): + continue + + # To make get_type_hints find method's module while the primitive's + # module is still being defined (and this method was indirectly called + # from primitive's metaclass). 
+ method.im_class = self.primitive + + type_hints = utils.get_type_hints(method) + + if not type_hints: + raise exceptions.InvalidPrimitiveCodeError("Cannot get types for method '{method_name}'.".format(method_name=method_name)) + + if 'return' not in type_hints: + raise exceptions.InvalidPrimitiveCodeError("Method '{method_name}' is missing a type for the return value.".format(method_name=method_name)) + + if method_name.startswith('produce_') or method_name == 'produce': + method_kind = PrimitiveMethodKind.PRODUCE + + if getattr(method, '__singleton__', False): + singleton_produce_method = True + else: + singleton_produce_method = False + + method_inputs_across_samples = getattr(method, '__inputs_across_samples__', ()) + elif method_name.startswith('produce'): + raise exceptions.InvalidPrimitiveCodeError("Produce method should start with 'produce_' and not be '{method_name}'.".format(method_name=method_name)) + else: + method_kind = PrimitiveMethodKind.OTHER + + singleton_produce_method = None + method_inputs_across_samples = None + + if hasattr(method, '__singleton__'): + raise exceptions.InvalidPrimitiveCodeError("Only produce methods can be set as singleton or not: {method_name}.".format(method_name=method_name)) + if hasattr(method, '__inputs_across_samples__'): + raise exceptions.InvalidPrimitiveCodeError("Only arguments of produce methods can be set to compute accross samples or not: {method_name}.".format(method_name=method_name)) + + method_arguments = [] + + # We skip the first argument (self). + for argument_name, argument in list(inspect.signature(method).parameters.items())[1:]: + if argument.kind != inspect.Parameter.KEYWORD_ONLY: + raise exceptions.InvalidPrimitiveCodeError("Method '{method_name}' has a non-keyword argument '{argument_name}'.".format(method_name=method_name, argument_name=argument_name)) + + has_default = argument.default is not inspect.Parameter.empty + + if argument_name.startswith('_'): + if not has_default: + raise exceptions.InvalidPrimitiveCodeError("Method '{method_name}' has a non-optional private argument '{argument_name}'.".format( + method_name=method_name, argument_name=argument_name, + )) + + continue + + if not ARGUMENT_NAME_REGEX.match(argument_name): + raise exceptions.InvalidPrimitiveCodeError("Method '{method_name}' has an argument with an invalid name '{argument_name}'.".format( + method_name=method_name, argument_name=argument_name + )) + + if argument_name not in type_hints: + raise exceptions.InvalidPrimitiveCodeError("Method '{method_name}' is missing a type for argument '{argument_name}'.".format(method_name=method_name, argument_name=argument_name)) + + argument_type = self._resolve_type(type_hints[argument_name], type_arguments) + + standard_argument_description = typing.cast( + typing.Dict, + STANDARD_RUNTIME_ARGUMENTS.get(argument_name, None) or STANDARD_PIPELINE_ARGUMENTS.get(argument_name, None), + ) + if standard_argument_description is not None: + try: + expected_type = self._get_argument_type(standard_argument_description, type_arguments) + except KeyError: + raise exceptions.InvalidPrimitiveCodeError( + "Method '{method_name}' has an argument '{argument_name}' for which an expected type cannot be determined. Is a type parameter missing?".format( + method_name=method_name, argument_name=argument_name, + ) + ) + + # Types have to match here exactly. This is what class type arguments are for. 
+ if argument_type != expected_type: + raise exceptions.InvalidPrimitiveCodeError( + "Method '{method_name}' has an argument '{argument_name}' with type '{argument_type}' and not an expected type: {expected_type}".format( + method_name=method_name, argument_name=argument_name, + argument_type=argument_type, expected_type=expected_type, + ) + ) + + if 'default' in standard_argument_description: + if not has_default: + raise exceptions.InvalidPrimitiveCodeError( + "Method '{method_name}' has an argument '{argument_name}' which does not have a default value, but it should.".format( + method_name=method_name, argument_name=argument_name, + ) + ) + + if argument.default != standard_argument_description['default']: + raise exceptions.InvalidPrimitiveCodeError( + "Method '{method_name}' has an argument '{argument_name}' with a different default value: {argument_default} != {expected_default}.".format( + method_name=method_name, argument_name=argument_name, + argument_default=argument.default, expected_default=standard_argument_description['default'], + ) + ) + + else: + if has_default: + raise exceptions.InvalidPrimitiveCodeError("Method '{method_name}' has an argument '{argument_name}' which has a default value, but it should not.".format( + method_name=method_name, argument_name=argument_name, + )) + + if argument_name in STANDARD_RUNTIME_ARGUMENTS: + argument_kind = PrimitiveArgumentKind.RUNTIME + else: + assert argument_name in STANDARD_PIPELINE_ARGUMENTS, "argument_name not in STANDARD_PIPELINE_ARGUMENTS" + argument_kind = PrimitiveArgumentKind.PIPELINE + + # Constructor cannot have additional non-private custom arguments. + elif method_name == '__init__': + raise exceptions.InvalidPrimitiveCodeError( + "Constructor cannot have non-private custom arguments, but it has an argument '{argument_name}'.".format( + argument_name=argument_name, + ) + ) + + elif argument_name in hyperparams_class.configuration: + # Types have to match here exactly. + if argument_type != hyperparams_class.configuration[argument_name].structural_type: + raise exceptions.InvalidPrimitiveCodeError("Method '{method_name}' has an argument '{argument_name}' overriding a hyper-parameter with a different type: {argument_type} != {hyperparameter_type}.".format( # noqa + method_name=method_name, argument_name=argument_name, + argument_type=argument_type, hyperparameter_type=hyperparams_class.configuration[argument_name].structural_type, + )) + + # Arguments overriding a hyper-parameter should not have a default value and caller should pass a value in. + if has_default: + raise exceptions.InvalidPrimitiveCodeError( + "Method '{method_name}' has an argument '{argument_name}' overriding a hyper-parameter which has a default value, but it should not.".format( + method_name=method_name, argument_name=argument_name, + ) + ) + + argument_kind = PrimitiveArgumentKind.HYPERPARAMETER + + else: + # Any other argument should be something the rest of the pipeline can provide: + # a container value, data value, or another primitive. + expected_types: typing.Tuple[type, ...] 
= types_module.Container + types_module.Data + (base.PrimitiveBase,) + + if not utils.is_subclass(argument_type, typing.Union[expected_types]): + raise exceptions.InvalidPrimitiveCodeError( + "Method '{method_name}' has an argument '{argument_name}' with type '{argument_type}' and not an expected type: {expected_types}".format( + method_name=method_name, argument_name=argument_name, + argument_type=argument_type, expected_types=expected_types + ) + ) + + # It should not have a default. Otherwise it is easy to satisfy the argument + # (just never connect anything to it in the pipeline). + if has_default: + raise exceptions.InvalidPrimitiveCodeError("Method '{method_name}' has an argument '{argument_name}' which has a default value, but it should not.".format( + method_name=method_name, argument_name=argument_name, + )) + + argument_kind = PrimitiveArgumentKind.PIPELINE + + method_arguments.append(argument_name) + + if argument_name in arguments: + if argument_type != arguments[argument_name]['type']: + raise exceptions.InvalidPrimitiveCodeError("Method '{method_name}' has an argument '{argument_name}' which does not match a type of a previous argument with the same name: {argument_type} != {previous_type}".format( # noqa + method_name=method_name, argument_name=argument_name, + argument_type=argument_type, previous_type=arguments[argument_name]['type'], + )) + + # This should hold because it depends only on the argument name. + assert argument_kind == arguments[argument_name]['kind'], "argument_kind mismatch" + + if has_default: + if 'default' not in arguments[argument_name]: + raise exceptions.InvalidPrimitiveCodeError("Method '{method_name}' has an argument '{argument_name}' which has a default value, but a previous argument with the same name did not have a default value.".format( # noqa + method_name=method_name, argument_name=argument_name, + )) + elif argument.default != arguments[argument_name]['default']: + raise exceptions.InvalidPrimitiveCodeError("Method '{method_name}' has an argument '{argument_name}' which does not have the same default value as a previous argument with the same name: {argument_default} != {previous_default}".format( # noqa + method_name=method_name, argument_name=argument_name, + argument_default=argument.default, + previous_default=arguments[argument_name]['default'], + )) + else: + if 'default' in arguments[argument_name]: + raise exceptions.InvalidPrimitiveCodeError("Method '{method_name}' has an argument '{argument_name}' which does not have a default value, but a previous argument with the same name had a default value.".format( # noqa + method_name=method_name, argument_name=argument_name, + )) + + else: + arguments[argument_name] = { + 'type': argument_type, + 'kind': argument_kind, + } + + if has_default: + arguments[argument_name]['default'] = argument.default + + methods[method_name] = { + 'kind': method_kind, + 'arguments': method_arguments, + 'returns': self._resolve_type(type_hints['return'], type_arguments), + } + + if singleton_produce_method is not None: + methods[method_name]['singleton'] = singleton_produce_method + + if method_inputs_across_samples is not None: + for method_input in method_inputs_across_samples: + if method_input not in method_arguments: + raise exceptions.InvalidPrimitiveCodeError("Method '{method_name}' has an argument '{method_input}' set as computing across samples, but it does not exist.".format( + method_name=method_name, method_input=method_input, + )) + + if arguments[method_input]['kind'] != 
PrimitiveArgumentKind.PIPELINE: + raise exceptions.InvalidPrimitiveCodeError("Method '{method_name}' has an argument '{method_input}' set as computing across samples, but it is not a PIPELINE argument.".format( + method_name=method_name, method_input=method_input, + )) + + methods[method_name]['inputs_across_samples'] = method_inputs_across_samples + + description = inspect.cleandoc(getattr(method, '__doc__', None) or '') or None + if description is not None: + methods[method_name]['description'] = description + + return arguments, methods + + # Using typing.TypeVar in type signature does not really work, so we are using type instead. + # See: https://github.com/python/typing/issues/520 + def _get_argument_type(self, argument_description: typing.Dict[str, typing.Any], type_arguments: typing.Dict[type, type]) -> type: + if 'get_type' in argument_description: + return argument_description['get_type'](type_arguments) + else: + return argument_description['type'] + + # Using typing.TypeVar in type signature does not really work, so we are using type instead. + # See: https://github.com/python/typing/issues/520 + def _get_class_methods(self, type_arguments: typing.Dict[type, type]) -> typing.Dict[str, typing.Dict]: + methods: typing.Dict[str, typing.Dict] = {} + + for method_name, method in inspect.getmembers(self.primitive): + if method_name.startswith('_'): + continue + + if not utils.is_class_method_on_class(method): + continue + + type_hints = utils.get_type_hints(method) + + if not type_hints: + raise exceptions.InvalidPrimitiveCodeError("Cannot get types for method '{method_name}'.".format(method_name=method_name)) + + if 'return' not in type_hints: + raise exceptions.InvalidPrimitiveCodeError("Method '{method_name}' is missing a type for the return value.".format(method_name=method_name)) + + method_arguments = {} + + for argument_name, argument in inspect.signature(method).parameters.items(): + if argument.kind != inspect.Parameter.KEYWORD_ONLY: + raise exceptions.InvalidPrimitiveCodeError("Method '{method_name}' has a non-keyword argument '{argument_name}'.".format(method_name=method_name, argument_name=argument_name)) + + has_default = argument.default is not inspect.Parameter.empty + + if argument_name.startswith('_'): + if not has_default: + raise exceptions.InvalidPrimitiveCodeError("Method '{method_name}' has a non-optional private argument '{argument_name}'.".format( + method_name=method_name, argument_name=argument_name, + )) + + continue + + if argument_name not in type_hints: + raise exceptions.InvalidPrimitiveCodeError("Method '{method_name}' is missing a type for argument '{argument_name}'.".format(method_name=method_name, argument_name=argument_name)) + + argument_type = self._resolve_type(type_hints[argument_name], type_arguments) + + argument_description = { + 'type': argument_type, + } + + if has_default: + argument_description['default'] = argument.default + + method_arguments[argument_name] = argument_description + + methods[method_name] = { + 'arguments': method_arguments, + 'returns': self._resolve_type(type_hints['return'], type_arguments), + } + + description = inspect.cleandoc(getattr(method, '__doc__', None) or '') or None + if description is not None: + methods[method_name]['description'] = description + + return methods + + @classmethod + def _validate_docker_containers(cls, metadata: typing.Dict) -> None: + installation = metadata.get('installation', []) + + containers: typing.List[str] = [] + + for entry in installation: + # We can check simply equality 
because metadata enumerations are equal to strings as well, + # and "entry['type']" can be both a string or an enumeration instance. + if entry.get('type', None) != PrimitiveInstallationType.DOCKER: + continue + + key = entry.get('key', None) + if key: + containers.append(key) + + containers_set = set(containers) + if len(containers_set) != len(containers): + for key in containers_set: + containers.remove(key) + raise exceptions.InvalidMetadataError("Same Docker image key reused across multiple installation entries: {extra_keys}".format(extra_keys=containers)) + + @classmethod + def _validate_volumes(cls, metadata: typing.Dict) -> None: + volumes: typing.List[str] = [] + + for entry in cls._get_volumes(metadata): + volumes.append(entry['key']) + + volumes_set = set(volumes) + if len(volumes_set) != len(volumes): + for key in volumes_set: + volumes.remove(key) + raise exceptions.InvalidMetadataError("Same volume key reused across multiple installation entries: {extra_keys}".format(extra_keys=volumes)) + + def _validate_constructor(self, instance_methods: typing.Dict[str, typing.Dict]) -> None: + if '__init__' not in instance_methods: + raise exceptions.InvalidPrimitiveCodeError("Constructor is missing.") + + if 'hyperparams' not in instance_methods['__init__']['arguments']: + raise exceptions.InvalidPrimitiveCodeError("Constructor's argument 'hyperparams' is required.") + + def _validate_multi_produce(self, instance_methods: typing.Dict[str, typing.Dict]) -> None: + if 'produce' not in instance_methods: + raise exceptions.InvalidPrimitiveCodeError("'produce' method is missing.") + + if 'multi_produce' not in instance_methods: + raise exceptions.InvalidPrimitiveCodeError("'multi_produce' method is missing.") + + # Initialize with runtime arguments. + expected_arguments = {'produce_methods', 'timeout', 'iterations'} + for method_name, method in instance_methods.items(): + if method['kind'] != PrimitiveMethodKind.PRODUCE: + continue + + if 'produce_methods' in method['arguments']: + raise exceptions.InvalidPrimitiveCodeError("Produce method cannot use 'produce_methods' argument: {method_name}".format(method_name=method_name)) + + expected_arguments.update(method['arguments']) + + arguments = set(instance_methods['multi_produce']['arguments']) + + missing = expected_arguments - arguments + if len(missing): + raise exceptions.InvalidPrimitiveCodeError( + "'multi_produce' method arguments have to be an union of all arguments of all produce methods, but it does not accept all expected arguments: {missing}".format( + missing=missing, + ) + ) + + extra = arguments - expected_arguments + if len(extra): + raise exceptions.InvalidPrimitiveCodeError( + "'multi_produce' method arguments have to be an union of all arguments of all produce methods, but it accepts unexpected arguments: {extra}".format( + extra=extra, + ) + ) + + def _validate_fit_multi_produce(self, instance_methods: typing.Dict[str, typing.Dict]) -> None: + if 'set_training_data' not in instance_methods: + raise exceptions.InvalidPrimitiveCodeError("'set_training_data' method is missing.") + + if 'produce' not in instance_methods: + raise exceptions.InvalidPrimitiveCodeError("'produce' method is missing.") + + if 'fit_multi_produce' not in instance_methods: + raise exceptions.InvalidPrimitiveCodeError("'fit_multi_produce' method is missing.") + + # Initialize with runtime arguments. 
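+ # For a typical supervised primitive this union works out to (illustrative):
+ # {'inputs', 'outputs', 'timeout', 'iterations', 'produce_methods'}, i.e. the
+ # arguments of 'set_training_data' and of all produce methods, plus the
+ # runtime-only arguments initialized below.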
+ expected_arguments = {'produce_methods', 'timeout', 'iterations'} + for method_name, method in instance_methods.items(): + if method['kind'] == PrimitiveMethodKind.PRODUCE: + if 'produce_methods' in method['arguments']: + raise exceptions.InvalidPrimitiveCodeError("Produce method cannot use 'produce_methods' argument: {method_name}".format(method_name=method_name)) + + expected_arguments.update(method['arguments']) + + elif method_name == 'set_training_data': + if 'produce_methods' in method['arguments']: + raise exceptions.InvalidPrimitiveCodeError("'set_training_data' method cannot use 'produce_methods' argument: {method_name}".format(method_name=method_name)) + + expected_arguments.update(method['arguments']) + + arguments = set(instance_methods['fit_multi_produce']['arguments']) + + missing = expected_arguments - arguments + if len(missing): + raise exceptions.InvalidPrimitiveCodeError( + "'fit_multi_produce' method arguments have to be an union of all arguments of 'set_training_data' method and all produce methods, " + "but it does not accept all expected arguments: {missing}".format( + missing=missing, + ) + ) + + extra = arguments - expected_arguments + if len(extra): + raise exceptions.InvalidPrimitiveCodeError( + "'fit_multi_produce' method arguments have to be an union of all arguments of 'set_training_data' method and all produce methods, but it accepts unexpected arguments: {extra}".format( + extra=extra, + ) + ) + + # In the past we have validated instance attributes by creating an instance of the primitive and observe + # which instance attributes were created in a constructor. This was potentially resource intensive because + # primitives use constructor to initialize resources they use. Moreover, it did not detect attributes + # added outside the constructor (even if such practice is bad, it does happen). We could maybe do some + # static analysis instead, but it could also miss attributes, or have false positives. So, instead, we + # just document standard instance attributes and this is it. + # See: https://gitlab.com/datadrivendiscovery/d3m/issues/158 + def _get_instance_attributes(self) -> typing.Dict[str, type]: + # Importing here to prevent import cycle. + from d3m.primitive_interfaces import base + + # Primitive instance attributes are standardized and fixed. + return { + 'hyperparams': hyperparams_module.Hyperparams, + 'random_seed': int, + 'docker_containers': typing.Dict[str, base.DockerContainer], + 'volumes': typing.Dict[str, str], + 'temporary_directory': typing.Optional[str], + } + + def get_hyperparams(self) -> 'hyperparams_module.Hyperparams': + return self.query()['primitive_code']['class_type_arguments']['Hyperparams'] + + def get_volumes(self) -> typing.Sequence[typing.Dict]: + return self._get_volumes(self.query()) + + @classmethod + def _get_volumes(cls, metadata: typing.Dict) -> typing.Sequence[typing.Dict]: + # We can check simply equality because metadata enumerations are equal to strings as well, + # and "entry['type']" can be both a string or an enumeration instance. + return [ + entry for entry in metadata.get('installation', []) + if entry.get('key', None) and entry.get('file_digest', None) and entry.get('type', None) in [PrimitiveInstallationType.FILE, PrimitiveInstallationType.TGZ] + ] + + # Not adhering to Liskov substitution principle: we are not returning a list. 
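+ # Typical (illustrative) read access once metadata is linked to a primitive class,
+ # where "MyPrimitive" stands for any concrete primitive class:
+ #
+ #     MyPrimitive.metadata.query()['primitive_code']['hyperparams']
+ #     MyPrimitive.metadata.get_hyperparams().defaults()
+ #     MyPrimitive.metadata.to_json_structure()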
+ def to_internal_json_structure(self) -> typing.Dict: # type: ignore + return utils.to_reversible_json_structure(self.to_internal_simple_structure()) + + # Not adhering to Liskov substitution principle: we are not returning a list. + def to_internal_simple_structure(self) -> typing.Dict: # type: ignore + return super().to_internal_simple_structure()[0]['metadata'] + + # Not adhering to Liskov substitution principle: we are not returning a list. + def to_json_structure(self) -> typing.Dict: # type: ignore + return utils.to_json_structure(self.to_simple_structure()) + + # Not adhering to Liskov substitution principle: we are not returning a list. + def to_simple_structure(self) -> typing.Dict: # type: ignore + return super().to_simple_structure()[0]['metadata'] + + +EXPECTED_CLASS_ATTRIBUTES = { + 'metadata': PrimitiveMetadata, + 'logger': logging.Logger, +} + + +def _get_inputs(type_arguments: typing.Dict[type, type]) -> type: + # Importing here to prevent import cycle. + from d3m.primitive_interfaces import base + + return type_arguments[base.Inputs] + + +def _get_outputs(type_arguments: typing.Dict[type, type]) -> type: + # Importing here to prevent import cycle. + from d3m.primitive_interfaces import base + + return type_arguments[base.Outputs] + + +def _get_input_labels(type_arguments: typing.Dict[type, type]) -> type: + # Importing here to prevent import cycle. + from d3m.primitive_interfaces import distance + + return type_arguments[distance.InputLabels] + + +# Arguments which can be fulfilled by other primitives in a pipeline. +STANDARD_PIPELINE_ARGUMENTS = { + 'inputs': { + 'get_type': _get_inputs, + }, + 'outputs': { + 'get_type': _get_outputs, + }, + 'input_labels': { + 'get_type': _get_input_labels, + }, +} + + +def _get_hyperparams(type_arguments: typing.Dict[type, type]) -> type: + # Importing here to prevent import cycle. + from d3m.primitive_interfaces import base + + return type_arguments[base.Hyperparams] + + +def _get_docker_containers(type_arguments: typing.Dict[type, type]) -> type: + # Importing here to prevent import cycle. + from d3m.primitive_interfaces import base + + return typing.Optional[typing.Dict[str, base.DockerContainer]] + + +def _get_params(type_arguments: typing.Dict[type, type]) -> type: + # Importing here to prevent import cycle. + from d3m.primitive_interfaces import base + + return type_arguments[base.Params] + + +def _get_gradient_outputs(type_arguments: typing.Dict[type, type]) -> type: + # Importing here to prevent import cycle. + from d3m.primitive_interfaces import base + + return base.Gradients[type_arguments[base.Outputs]] # type: ignore + + +def _get_module(type_arguments: typing.Dict[type, type]) -> type: + # Importing here to prevent import cycle. + from d3m.primitive_interfaces import base + + return type_arguments[base.Module] + + +# Arguments which are meaningful only for a runtime executing a pipeline. 
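+ # For example (illustrative), in a standard produce method "inputs" is a pipeline
+ # argument while "timeout" and "iterations" are runtime arguments:
+ #
+ #     def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
+ #         ...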
+STANDARD_RUNTIME_ARGUMENTS = { + 'hyperparams': { + 'get_type': _get_hyperparams, + }, + 'random_seed': { + 'type': int, + 'default': 0, + }, + 'docker_containers': { + 'get_type': _get_docker_containers, + 'default': None, + }, + 'volumes': { + 'type': typing.Optional[typing.Dict[str, str]], + 'default': None, + }, + 'temporary_directory': { + 'type': typing.Optional[str], + 'default': None, + }, + 'timeout': { + 'type': typing.Optional[float], + 'default': None, + }, + 'iterations': { + 'type': typing.Optional[int], + 'default': None, + }, + 'produce_methods': { + 'type': typing.Sequence[str], + }, + 'params': { + 'get_type': _get_params, + }, + 'num_samples': { + 'type': int, + 'default': 1, + }, + 'gradient_outputs': { + 'get_type': _get_gradient_outputs, + }, + 'fine_tune': { + 'type': bool, + 'default': False, + }, + 'fine_tune_learning_rate': { + 'type': float, + 'default': 0.00001, + }, + 'fine_tune_weight_decay': { + 'type': float, + 'default': 0.00001, + }, + 'temperature': { + 'type': float, + 'default': 0, + }, + 'input_module': { + 'get_type': _get_module, + }, + 'module': { + 'get_type': _get_module, + }, +} + + +def metadata_serializer(obj: Metadata) -> dict: + data = { + 'metadata': pickle.dumps(obj), + } + + return data + + +def metadata_deserializer(data: dict) -> Metadata: + metadata = pickle.loads(data['metadata']) + + return metadata + + +if pyarrow_lib is not None: + pyarrow_lib._default_serialization_context.register_type( + Metadata, 'd3m.metadata', + custom_serializer=metadata_serializer, + custom_deserializer=metadata_deserializer, + ) diff --git a/d3m/d3m/metadata/hyperparams.py b/d3m/d3m/metadata/hyperparams.py new file mode 100644 index 0000000..afe5391 --- /dev/null +++ b/d3m/d3m/metadata/hyperparams.py @@ -0,0 +1,3370 @@ +import abc +import base64 +import collections +import copy +import functools +import importlib +import inspect +import logging +import numbers +import operator +import pickle +import re +import types +import typing + +import frozendict # type: ignore +import numpy # type: ignore +import typing_inspect # type: ignore +from pytypes import type_util # type: ignore +from scipy import special as scipy_special # type: ignore +from sklearn.utils import validation as sklearn_validation # type: ignore + +from . import base +from d3m import deprecate, exceptions, utils + +__all__ = ( + 'Hyperparameter', 'Primitive', 'Constant', 'Bounded', 'Enumeration', 'UniformBool', 'UniformInt', + 'Uniform', 'LogUniform', 'Normal', 'LogNormal', 'Union', 'Choice', 'Set', 'SortedSet', 'List', + 'SortedList', 'Hyperparams', +) + +logger = logging.getLogger(__name__) + +RandomState = typing.Union[numbers.Integral, numpy.integer, numpy.random.RandomState] + +T = typing.TypeVar('T') +S = typing.TypeVar('S', bound=typing.Sequence) + +# We want to make sure we do not support dots because they are used to delimit nested hyper-parameters. 
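+ # For example, "learning_rate" and "alpha0" are valid hyper-parameter names, while
+ # "0alpha", "learning-rate", and "outer.inner" are not (dots are reserved for
+ # addressing nested hyper-parameters).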
+HYPERPARAMETER_NAME_REGEX = re.compile(r'^[A-Za-z][A-Za-z_0-9]*$') + + +def _get_structural_type_argument(obj: typing.Any, type_var: typing.Any) -> type: + cls = typing_inspect.get_generic_type(obj) + + return utils.get_type_arguments(cls)[type_var] + + +def check_sample_size(obj: 'typing.Union[Hyperparameter, Hyperparams]', min_samples: int, max_samples: typing.Optional[int], with_replacement: bool) -> typing.Tuple[int, int]: + if with_replacement: + all_max_samples = None + else: + all_max_samples = obj.get_max_samples() + + if not isinstance(min_samples, int): + raise exceptions.InvalidArgumentTypeError("'min_samples' argument is not an int.") + if min_samples < 0: + raise exceptions.InvalidArgumentValueError("'min_samples' cannot be smaller than 0.") + if max_samples is not None: + if not isinstance(max_samples, int): + raise exceptions.InvalidArgumentTypeError("'max_samples' argument is not an int.") + if min_samples > max_samples: + raise exceptions.InvalidArgumentValueError("'min_samples' cannot be larger than 'max_samples'.") + if all_max_samples is not None and max_samples > all_max_samples: + raise exceptions.InvalidArgumentValueError("'max_samples' cannot be larger than {max_samples}.".format(max_samples=all_max_samples)) + else: + if all_max_samples is not None: + max_samples = all_max_samples + else: + raise exceptions.InvalidArgumentValueError("'max_samples' argument is required.") + + return min_samples, max_samples + + +# A special Python method which is stored efficiently +# when pickled. See PEP 307 for more details. +def __newobj__(cls: type, *args: typing.Any) -> typing.Any: + return cls.__new__(cls, *args) + + +def _is_defined_at_global_scope(cls: type) -> bool: + class_name = getattr(cls, '__name__', None) + class_module = inspect.getmodule(cls) + return class_name is not None and class_module is not None and getattr(class_module, class_name, None) is cls + + +def _recreate_hyperparams_class(base_cls: 'typing.Type[Hyperparams]', define_args_list: typing.Sequence[typing.Dict[str, typing.Any]]) -> typing.Any: + # We first have to recreate the class from the base class. + cls = base_cls + for args in define_args_list: + cls = cls.define(**args) + # And then we create a new instance of the object. + return cls.__new__(cls) + + +def _encode_generic_type(structural_type: type) -> typing.Union[type, typing.Dict]: + args = typing_inspect.get_last_args(structural_type) + + if not args: + return structural_type + + return { + 'origin': typing_inspect.get_origin(structural_type), + 'args': [_encode_generic_type(arg) for arg in args] + } + + +def _decode_generic_type(description: typing.Union[type, typing.Dict]) -> type: + if not isinstance(description, dict): + return description + + return description['origin'][tuple(_decode_generic_type(arg) for arg in description['args'])] + + +class HyperparameterMeta(utils.AbstractMetaclass, typing.GenericMeta): + pass + + +class Hyperparameter(typing.Generic[T], metaclass=HyperparameterMeta): + """ + A base class for hyper-parameter descriptions. + + A base hyper-parameter does not give any information about the space of the hyper-parameter, + besides a default value. + + Type variable ``T`` is optional and if not provided an attempt to automatically infer + it from ``default`` will be made. Attribute ``structural_type`` exposes this type. + + There is a special case when values are primitives. 
In this case type variable ``T`` and + ``structural_type`` should always be a primitive base class, but valid values used in + hyper-parameters can be both primitive instances (of that base class or its subclasses) + and primitive classes (that base class itself or its subclasses). Primitive instances + allow one to specify a primitive much more precisely: values of their hyper-parameters, + or even an already fitted primitive. + + This means that TA2 should take care and check if values it is planning to use for + this hyper-parameter are a primitive class or a primitive instance. It should make sure + that it always passes only a primitive instance to the primitive which has a hyper-parameter + expecting primitive(s). Even if the value is already a primitive instance, it must not + pass it directly, but should make a copy of the primitive instance with same hyper-parameters + and params. Primitive instances part of hyper-parameter definitions should be seen + as immutable and as a template for primitives to pass and not to directly use. + + TA2 is in the best position to create such instances during pipeline run as it has all + necessary information to construct primitive instances (and can control a random seed, + or example). Moreover, it is also more reasonable for TA2 to handle the life-cycle of + a primitive and do any additional processing of primitives. TA2 can create such a primitive + outside of the pipeline, or as part of the pipeline and pass it as a hyper-parameter + value to the primitive. The latter approach allows pipeline to describe how is the primitive + fitted and use data from the pipeline itself for fitting, before the primitive is passed on + as a hyper-parameter value to another primitive. + + Attributes + ---------- + name: + A name of this hyper-parameter in the configuration of all hyper-parameters. + structural_type: + A Python type of this hyper-parameter. All values of the hyper-parameter, including the default value, + should be of this type. + semantic_types: + A list of URIs providing semantic meaning of the hyper-parameter. This can help express how + the hyper-parameter is being used, e.g., as a learning rate or as kernel parameter. + description: + An optional natural language description of the hyper-parameter. + """ + + name: str + structural_type: typing.Type + semantic_types: typing.Sequence[str] + description: str + + def __init__(self, default: T, *, semantic_types: typing.Sequence[str] = None, description: str = None) -> None: + if semantic_types is None: + semantic_types = () + + self.name: str = None + self.semantic_types = semantic_types + self.description = description + + self._default = default + + # If subclass has not already set it. + if not hasattr(self, 'structural_type'): + structural_type = _get_structural_type_argument(self, T) # type: ignore + + if structural_type == typing.Any: + structural_type = self.infer_type(self._default) + + self.structural_type = structural_type + + self.validate_default() + + def contribute_to_class(self, name: str) -> None: + if self.name is not None and self.name != name: + raise exceptions.InvalidStateError("Name is already set to '{name}', cannot set to '{new_name}'.".format(name=self.name, new_name=name)) + + self.name = name + + def get_default(self, path: str = None) -> typing.Any: + """ + Returns a default value of a hyper-parameter. + + Remember to never modify it in-place it is a mutable value. 
Moreover, if it is + an instance of a primitive, also copy the instance before you use it to not + change its internal state. + + Parameters + ---------- + path: + An optional path to get defaults for nested hyper-parameters, if a hyper-parameter + has nested hyper-parameters. It can contain ``.`` to represent a path through + nested hyper-parameters. + + Returns + ------- + A default value. + """ + + if path is not None: + raise KeyError("Invalid path '{path}'.".format(path=path)) + + return self._default + + def check_type(self, value: typing.Any, cls: type) -> bool: + """ + Check that the type of ``value`` matches given ``cls``. + + There is a special case if ``value`` is a primitive class, in that case it is checked + that ``value`` is a subclass of ``cls``. + + Parameters + ---------- + value: + Value to check type for. + cls: + Type to check type against. + + Returns + ------- + ``True`` if ``value`` is an instance of ``cls``, or if ``value`` is a primitive + class, if it is a subclass of ``cls``. + """ + + # Importing here to prevent import cycle. + from d3m.primitive_interfaces import base as primitive_interfaces_base + + def get_type(obj: typing.Any) -> type: + if utils.is_type(obj) and issubclass(obj, primitive_interfaces_base.PrimitiveBase): + return obj + else: + return type(obj) + + value_type = type_util.deep_type(value, get_type=get_type) + + return utils.is_subclass(value_type, cls) + + def infer_type(self, value: typing.Any) -> type: + """ + Infers a structural type of ``value``. + + There is a special case if ``value`` is a primitive class, in that case it is returned + as is. + + Parameters + ---------- + value: + Value to infer a type for. + + Returns + ------- + Type of ``value``, or ``value`` itself if ``value`` is a primitive class. + """ + + # Importing here to prevent import cycle. + from d3m.primitive_interfaces import base as primitive_interfaces_base + + if utils.is_type(value) and issubclass(value, primitive_interfaces_base.PrimitiveBase): + return value + else: + return utils.get_type(value) + + def validate(self, value: T) -> None: + """ + Validates that a given ``value`` belongs to the space of the hyper-parameter. + + If not, it throws an exception. + + Parameters + ---------- + value: + Value to validate. + """ + + if not self.check_type(value, self.structural_type): + raise exceptions.InvalidArgumentTypeError("Value '{value}' {for_name}is not an instance of the structural type: {structural_type}".format( + value=value, for_name=self._for_name(), structural_type=self.structural_type, + )) + + def validate_default(self) -> None: + """ + Validates that a default value belongs to the space of the hyper-parameter. + + If not, it throws an exception. + """ + + self.validate(self._default) + + def _validate_finite_float(self, value: typing.Any) -> None: + """ + If ``value`` is a floating-point value, it validates that it is + a finite number (no infinity, no ``NaN``). + + If not, it throws an exception. + + Parameters + ---------- + value: + Value to validate. + """ + + if utils.is_float(type(value)) and not numpy.isfinite(value): + raise exceptions.InvalidArgumentValueError("A floating-point value {for_name}must be finite.".format(for_name=self._for_name())) + + def _for_name(self) -> str: + if getattr(self, 'name', None) is None: + return "" + else: + return "for hyper-parameter '{name}' ".format(name=self.name) + + def sample(self, random_state: RandomState = None) -> T: + """ + Samples a random value from the hyper-parameter search space. 
+ + For the base class it always returns a ``default`` value because the space + is unknown. + + Parameters + ---------- + random_state: + A random seed or state to be used when sampling. + + Returns + ------- + A sampled value. + """ + + sklearn_validation.check_random_state(random_state) + + utils.log_once(logger, logging.WARNING, "Sampling a hyper-parameter '%(name)s' without known space. Using a default value.", {'name': self.name}, stack_info=True) + + return self.get_default() + + # Should not be called at the module importing time because it can trigger loading + # of all primitives in the "Primitive" hyper-parameter, which can lead to an import cycle. + def get_max_samples(self) -> typing.Optional[int]: + """ + Returns a maximum number of samples that can be returned at once using `sample_multiple`, + when ``with_replacement`` is ``False``. + + Returns + ------- + A maximum number of samples that can be returned at once. Or ``None`` if there is no limit. + """ + + return 1 + + def _check_sample_size(self, min_samples: int, max_samples: typing.Optional[int], with_replacement: bool) -> typing.Tuple[int, int]: + return check_sample_size(self, min_samples, max_samples, with_replacement) + + def sample_multiple(self, min_samples: int = 0, max_samples: int = None, random_state: RandomState = None, *, with_replacement: bool = False) -> typing.Sequence[T]: + """ + Samples multiple random values from the hyper-parameter search space. At least ``min_samples`` + of them, and at most ``max_samples``. + + For the base class it always returns only a ``default`` value because the space + is unknown. + + Parameters + ---------- + min_samples: + A minimum number of samples to return. + max_samples: + A maximum number of samples to return. + random_state: + A random seed or state to be used when sampling. + with_replacement: + Are we sampling with replacement or without? + + Returns + ------- + A set (represented as a tuple) of multiple sampled values. + """ + + min_samples, max_samples = self._check_sample_size(min_samples, max_samples, with_replacement) + + random_state = sklearn_validation.check_random_state(random_state) + + utils.log_once(logger, logging.WARNING, "Sampling a hyper-parameter '%(name)s' without known space. Using a default value.", {'name': self.name}, stack_info=True) + + if with_replacement: + size = random_state.randint(min_samples, max_samples + 1) + + return (self.get_default(),) * size + + else: + if min_samples > 0: + assert min_samples == 1, min_samples + assert max_samples == 1, max_samples + return (self.get_default(),) + elif max_samples < 1: + assert min_samples == 0, min_samples + assert max_samples == 0, max_samples + return () + else: + assert min_samples == 0, min_samples + assert max_samples == 1, max_samples + return typing.cast(typing.Sequence[T], () if random_state.rand() >= 0.5 else (self.get_default(),)) + + def __repr__(self) -> str: + return '{class_name}(default={default})'.format( + class_name=type(self).__name__, + default=self.get_default(), + ) + + def to_simple_structure(self) -> typing.Dict: + """ + Converts the hyper-parameter to a simple structure, similar to JSON, but with values + left as Python values. + + Returns + ------- + A dict. 
+ """ + + structure = { + 'type': type(self), + 'default': self.get_default(), + 'structural_type': self.structural_type, + 'semantic_types': list(self.semantic_types), + } + + if self.description is not None: + structure['description'] = self.description + + return structure + + @deprecate.function(message="use value_to_json_structure method instead") + def value_to_json(self, value: T) -> typing.Any: + return self.value_to_json_structure(value) + + def value_to_json_structure(self, value: T) -> typing.Any: + """ + Converts a value of this hyper-parameter to a JSON-compatible value. + + Parameters + ---------- + value: + Value to convert. + + Returns + ------- + A JSON-compatible value. + """ + + self.validate(value) + + if utils.is_subclass(self.structural_type, typing.Union[str, int, float, bool, type(None)]): + if utils.is_float(type(value)) and not numpy.isfinite(value): + return { + 'encoding': 'pickle', + 'value': base64.b64encode(pickle.dumps(value)).decode('utf8'), + } + else: + return value + elif utils.is_subclass(self.structural_type, numpy.bool_): + return bool(value) + elif utils.is_subclass(self.structural_type, numpy.integer): + return int(value) + elif utils.is_subclass(self.structural_type, typing.Union[numpy.float32, numpy.float64]): + value = float(value) + if not numpy.isfinite(value): + return { + 'encoding': 'pickle', + 'value': base64.b64encode(pickle.dumps(value)).decode('utf8'), + } + else: + return value + else: + return { + 'encoding': 'pickle', + 'value': base64.b64encode(pickle.dumps(value)).decode('utf8'), + } + + @deprecate.function(message="use value_from_json_structure method instead") + def value_from_json(self, json: typing.Any) -> T: + return self.value_from_json_structure(json) + + def value_from_json_structure(self, json: typing.Any) -> T: + """ + Converts a JSON-compatible value to a value of this hyper-parameter. + + Parameters + ---------- + json: + A JSON-compatible value. + + Returns + ------- + Converted value. + """ + + if isinstance(json, dict): + if json.get('encoding', None) != 'pickle': + raise exceptions.NotSupportedError(f"Not supported hyper-parameter value encoding: {json.get('encoding', None)}") + if 'value' not in json: + raise exceptions.MissingValueError(f"'value' field is missing in encoded hyper-parameter value.") + + # TODO: Limit the types of values being able to load to prevent arbitrary code execution by a malicious pickle. + value = pickle.loads(base64.b64decode(json['value'].encode('utf8'))) + elif utils.is_subclass(self.structural_type, typing.Union[str, int, bool, type(None)]): + # Handle a special case when value was parsed from JSON as float, but we expect an int. + # If "json" is not really an integer then we set "value" to a float and leave + # to "validate" to raise an exception. + if isinstance(json, float) and json.is_integer(): + value = int(json) + else: + value = json + elif utils.is_subclass(self.structural_type, typing.Union[str, float, bool, type(None)]): + # Handle a special case when value was parsed from JSON as int, but we expect a float. + if isinstance(json, int): + value = float(json) + else: + value = json + elif utils.is_subclass(self.structural_type, typing.Union[str, int, float, bool, type(None)]): + # If both int and float are accepted we assume the user of the value knows how to + # differentiate between values or that precise numerical type does not matter. + value = json + else: + # Backwards compatibility. A string representing a pickle. 
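A minimal sketch of the JSON conversion logic above, under the same `d3m.metadata.hyperparams` import assumption: values whose structural type is JSON-compatible pass through unchanged, while non-finite floats (and values of non-JSON types) fall back to a base64-encoded pickle.

```python
from d3m.metadata import hyperparams

threshold = hyperparams.Hyperparameter[float](0.5)

print(threshold.value_to_json_structure(0.25))       # 0.25, stored as a plain JSON number
encoded = threshold.value_to_json_structure(float('inf'))
print(encoded['encoding'])                           # 'pickle' (non-finite floats are not valid JSON numbers)
print(threshold.value_from_json_structure(encoded))  # inf, recovered on the way back
```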
+ logger.warning("Converting hyper-parameter '%(name)s' from a deprecated JSON structure.", {'name': self.name}) + + # TODO: Limit the types of values being able to load to prevent arbitrary code execution by a malicious pickle. + value = pickle.loads(base64.b64decode(json.encode('utf8'))) + + self.validate(value) + + return value + + def traverse(self) -> 'typing.Iterator[Hyperparameter]': + """ + Traverse over all child hyper-parameters of this hyper-parameter. + + Yields + ------ + Hyperparamater + The next child hyper-parameter of this hyper-parameter. + """ + + # Empty generator by default. + yield from () # type: ignore + + def transform_value(self, value: T, transform: typing.Callable, index: int = 0) -> T: + """ + Transforms the value belonging to this hyper-parameter to a new value by + calling ``transform`` on it. If the hyper-parameter has child + hyper-parameters, it deconstructs the value, calls ``transform_value`` + recursively, and constructs the new value back. + + Parameters + ---------- + value: + A value to transform. + transform: + A function which receives as arguments: a hyper-parameter instance, + the value, and a sequence index of iterating over a structure, and + should return a new transformed value. It is called only for leaf + hyper-parameters (those without child hyper-parameters). + index: + A sequence index which should be passed to ``transform``. + Used when iterating over a structure by the parent. + It should be deterministic. + + Returns + ------- + A transformed value. + """ + + return transform(self, value, index) + + def can_accept_value_type(self, structural_type: typing.Union[type, typing.List[type]]) -> bool: + """ + Returns ``True`` if a hyper-parameter can accept a value of type ``structural_type``. + + Parameters + ---------- + structural_type: + A structural type. Can be a type or a list of types. + + Returns + ------- + If value of given type can be accepted by this hyper-parameter. + """ + + if structural_type is typing.Any: + return True + elif isinstance(structural_type, typing.List): + # Default implementation does not support a list of types. This is used for "Set" hyper-parameter. + return False + else: + return utils.is_subclass(structural_type, self.structural_type) + + # TODO: Remove once using Python 3.7 exclusively. + def __getstate__(self) -> dict: + state = dict(self.__dict__) + # Subclasses of generic classes cannot be pickled in Python 3.6, but instances of + # them can, because during runtime information about generic classes is removed. + # Pickling of hyper-parameter instances thus generally work without problems + # but if they are an instance of the a subclass of a generic class, a reference + # to that class is stored into "__orig_class__" which cannot be pickled. + # Because we do not really need it after we extracted "structural_type", + # we remove it here when pickling. + # See: https://gitlab.com/datadrivendiscovery/d3m/issues/155 + if '__orig_class__' in state: + del state['__orig_class__'] + + if 'structural_type' in state: + # A workaround for structural type being a generic class. + state['structural_type'] = _encode_generic_type(state['structural_type']) + + return state + + def __setstate__(self, state: dict) -> None: + if 'structural_type' in state: + state['structural_type'] = _decode_generic_type(state['structural_type']) + + self.__dict__ = state + + +class Primitive(Hyperparameter[T]): + """ + A hyper-parameter describing a primitive or primitives. 
+ + Matching primitives are determined based on their structural type (a matching primitive + has to be an instance or a subclass of the structural type), their primitive's family + (a matching primitive's family has to be among those listed in the hyper-parameter), + their algorithm types (a matching primitive has to implement at least one of the + listed in the hyper-parameter), and produce methods provided (a matching primitive + has to provide all of the listed in the hyper-parameter). + + Remember that valid values of a hyper-parameter which has primitive values are both + primitive instances and primitive classes, but the structural type is always just a + primitive base class. Hyper-parameter values being passed to a primitive which has + a hyper-parameter expecting primitive(s) should always be primitive instances. + + The default sampling method returns always classes (or a default value, which can be a + primitive instance), but alternative implementations could sample across instances + (and for example across also primitive's hyper-parameters). + + Attributes + ---------- + primitive_families: + A list of primitive families a matching primitive should be part of. + algorithm_types: + A list of algorithm types a matching primitive should implement at least one. + produce_methods: + A list of produce methods a matching primitive should provide all. + """ + + primitive_families: 'typing.Sequence[base.PrimitiveFamily]' + algorithm_types: 'typing.Sequence[base.PrimitiveAlgorithmType]' + produce_methods: typing.Sequence[str] + + def __init__(self, default: typing.Type[T], primitive_families: 'typing.Sequence[base.PrimitiveFamily]' = None, # type: ignore + algorithm_types: 'typing.Sequence[base.PrimitiveAlgorithmType]' = None, produce_methods: typing.Sequence[str] = None, *, # type: ignore + semantic_types: typing.Sequence[str] = None, description: str = None) -> None: + if primitive_families is None: + primitive_families = () + if algorithm_types is None: + algorithm_types = () + if produce_methods is None: + produce_methods = () + + # Convert any strings to enums. + self.primitive_families: typing.Tuple[base.PrimitiveFamily, ...] = tuple(base.PrimitiveFamily[primitive_family] for primitive_family in primitive_families) # type: ignore + self.algorithm_types: typing.Tuple[base.PrimitiveAlgorithmType, ...] = tuple(base.PrimitiveAlgorithmType[algorithm_type] for algorithm_type in algorithm_types) # type: ignore + self.produce_methods = tuple(produce_methods) + + for primitive_family in self.primitive_families: # type: ignore + if primitive_family not in list(base.PrimitiveFamily): + raise exceptions.InvalidArgumentValueError("Unknown primitive family '{primitive_family}'.".format(primitive_family=primitive_family)) + for algorithm_type in self.algorithm_types: # type: ignore + if algorithm_type not in list(base.PrimitiveAlgorithmType): + raise exceptions.InvalidArgumentValueError("Unknown algorithm type '{algorithm_type}'.".format(algorithm_type=algorithm_type)) + for produce_method in self.produce_methods: + if produce_method != 'produce' and not produce_method.startswith('produce_'): + raise exceptions.InvalidArgumentValueError("Invalid produce method name '{produce_method}'.".format(produce_method=produce_method)) + + self.matching_primitives: typing.Sequence[typing.Union[T, typing.Type[T]]] = None + + # Used for sampling. + # See: https://github.com/numpy/numpy/issues/15935 + self._choices: numpy.ndarray = None + + # Default value is checked by parent class calling "validate". 
+ + super().__init__(default, semantic_types=semantic_types, description=description) # type: ignore + + # "all_primitives" is not "Sequence[Type[PrimitiveBase]]" to not introduce an import cycle. + def populate_primitives(self, all_primitives: typing.Sequence[type] = None) -> None: + """ + Populate a list of matching primitives. + + Called automatically when needed using `d3m.index` primitives. If this is not desired, + this method should be called using a list of primitive classes to find matching + primitives among. + + Parameters + ---------- + all_primitives: + An alternative list of all primitive classes to find matching primitives among. + """ + + if all_primitives is None: + # Importing here to prevent import cycle. + from d3m import index + + index.load_all() + all_primitives = index.get_loaded_primitives() # type: ignore + + matching_primitives = [] + for primitive in all_primitives: + try: + self.validate(primitive) + matching_primitives.append(primitive) + except (exceptions.InvalidArgumentTypeError, exceptions.InvalidArgumentValueError): + pass + + default = self.get_default() + + if utils.is_type(default): + if default not in matching_primitives: + matching_primitives.append(default) # type: ignore + else: + if type(default) not in matching_primitives: + matching_primitives.append(default) # type: ignore + else: + matching_primitives[matching_primitives.index(type(default))] = default # type: ignore + + self.matching_primitives = matching_primitives + self._choices = numpy.array(matching_primitives, dtype=object) + + def validate(self, value: typing.Union[T, typing.Type[T]]) -> None: + # Importing here to prevent import cycle. + from d3m.primitive_interfaces import base as primitive_interfaces_base + + super().validate(typing.cast(T, value)) + + if utils.is_type(value): + primitive_class = typing.cast(typing.Type[primitive_interfaces_base.PrimitiveBase], value) + + # Additional check that we really have a primitive. + if not utils.is_subclass(primitive_class, primitive_interfaces_base.PrimitiveBase): + raise exceptions.InvalidArgumentTypeError("Value '{value}' {for_name}is not a subclass of 'PrimitiveBase' class.".format( + value=value, for_name=self._for_name(), + )) + else: + primitive_class = typing.cast(typing.Type[primitive_interfaces_base.PrimitiveBase], type(value)) + + # Additional check that we really have a primitive. 
+ if not utils.is_subclass(primitive_class, primitive_interfaces_base.PrimitiveBase): + raise exceptions.InvalidArgumentTypeError("Value '{value}' {for_name}is not an instance of 'PrimitiveBase' class.".format( + value=value, for_name=self._for_name(), + )) + + primitive_family = primitive_class.metadata.query()['primitive_family'] + if self.primitive_families and primitive_family not in self.primitive_families: + raise exceptions.InvalidArgumentValueError( + "Primitive '{value}' {for_name}has primitive family '{primitive_family}' and not any of: {primitive_families}".format( + value=value, for_name=self._for_name(), + primitive_family=primitive_family, primitive_families=self.primitive_families, + ) + ) + + algorithm_types = primitive_class.metadata.query()['algorithm_types'] + if self.algorithm_types and set(algorithm_types).isdisjoint(set(self.algorithm_types)): + raise exceptions.InvalidArgumentValueError( + "Primitive '{value}' {for_name}has algorithm types '{primitive_algorithm_types}' and not any of: {algorithm_types}".format( + value=value, for_name=self._for_name(), + primitive_algorithm_types=algorithm_types, algorithm_types=self.algorithm_types, + ) + ) + + produce_methods = { + method_name for method_name, method_description + in primitive_class.metadata.query()['primitive_code']['instance_methods'].items() + if method_description['kind'] == base.PrimitiveMethodKind.PRODUCE + } + if not set(self.produce_methods) <= produce_methods: + raise exceptions.InvalidArgumentValueError( + "Primitive '{value}' {for_name}has produce methods '{primitive_produce_methods}' and not all of: {produce_methods}".format( + value=value, for_name=self._for_name(), + primitive_produce_methods=produce_methods, produce_methods=self.produce_methods, + ) + ) + + def sample(self, random_state: RandomState = None) -> typing.Union[T, typing.Type[T]]: # type: ignore + """ + Samples a random value from the hyper-parameter search space. + + Returns a random primitive from primitives available through `d3m.index`, by default, + or those given to a manual call of `populate_primitives`. + + Parameters + ---------- + random_state: + A random seed or state to be used when sampling. + + Returns + ------- + A sampled value. + """ + + random_state = sklearn_validation.check_random_state(random_state) + + if self.matching_primitives is None: + self.populate_primitives() + + return random_state.choice(self._choices) + + def get_max_samples(self) -> typing.Optional[int]: + if self.matching_primitives is None: + self.populate_primitives() + + return len(self.matching_primitives) + + def sample_multiple( # type: ignore + self, min_samples: int = 0, max_samples: int = None, random_state: RandomState = None, *, with_replacement: bool = False, + ) -> typing.Sequence[typing.Union[T, typing.Type[T]]]: + """ + Samples multiple random values from the hyper-parameter search space. At least ``min_samples`` + of them, and at most ``max_samples``. + + It samples primitives available through `d3m.index`, by default, + or those given to a manual call of `populate_primitives`. + + Parameters + ---------- + min_samples: + A minimum number of samples to return. + max_samples: + A maximum number of samples to return. + random_state: + A random seed or state to be used when sampling. + with_replacement: + Are we sampling with replacement or without? + + Returns + ------- + A set (represented as a tuple) of multiple sampled values. 
+ """ + + min_samples, max_samples = self._check_sample_size(min_samples, max_samples, with_replacement) + + random_state = sklearn_validation.check_random_state(random_state) + + if self.matching_primitives is None: + self.populate_primitives() + + size = random_state.randint(min_samples, max_samples + 1) + + return tuple(random_state.choice(self._choices, size, replace=with_replacement)) + + def __repr__(self) -> str: + return '{class_name}(default={default}, primitive_families={primitive_families}, algorithm_types={algorithm_types})'.format( + class_name=type(self).__name__, + default=self.get_default(), + primitive_families=[primitive_family.name for primitive_family in self.primitive_families], # type: ignore + algorithm_types=[algorithm_type.name for algorithm_type in self.algorithm_types], # type: ignore + produce_methods=list(self.produce_methods), + ) + + @functools.lru_cache() + def to_simple_structure(self) -> typing.Dict: # type: ignore + structure = super().to_simple_structure() + structure.update({ + 'primitive_families': list(self.primitive_families), + 'algorithm_types': list(self.algorithm_types), + 'produce_methods': list(self.produce_methods), + }) + return structure + + @deprecate.function(message="use value_to_json_structure method instead") + def value_to_json(self, value: typing.Union[T, typing.Type[T]]) -> typing.Any: + return self.value_to_json_structure(value) + + def value_to_json_structure(self, value: typing.Union[T, typing.Type[T]]) -> typing.Any: + self.validate(value) + + if utils.is_type(value): + return {'class': value.metadata.query()['python_path']} # type: ignore + else: + return {'instance': base64.b64encode(pickle.dumps(value)).decode('utf8')} + + @deprecate.function(message="use value_from_json_structure method instead") + def value_from_json(self, json: typing.Any) -> typing.Union[T, typing.Type[T]]: # type: ignore + return self.value_from_json_structure(json) + + def value_from_json_structure(self, json: typing.Any) -> typing.Union[T, typing.Type[T]]: # type: ignore + if 'class' in json: + module_path, name = json['class'].rsplit('.', 1) + module = importlib.import_module(module_path) + value = getattr(module, name) + else: + # TODO: Limit the types of values being able to load to prevent arbitrary code execution by a malicious pickle. + value = pickle.loads(base64.b64decode(json['instance'].encode('utf8'))) + + self.validate(value) + + return value + + def can_accept_value_type(self, structural_type: typing.Union[type, typing.List[type]]) -> bool: + if structural_type is typing.Any: + return True + elif not super().can_accept_value_type(structural_type): + return False + + try: + # We now know that it is a primitive class and we can check other constraints. + self.validate(typing.cast(typing.Type[T], structural_type)) + return True + except Exception: + return False + + +class Constant(Hyperparameter[T]): + """ + A constant hyper-parameter that represents a constant default value. + + Type variable ``T`` is optional and if not provided an attempt to + automatically infer it from ``default`` will be made. + """ + + def validate(self, value: T) -> None: + super().validate(value) + + default = self.get_default() + if value != default: + raise exceptions.InvalidArgumentValueError("Value '{value}' {for_name}is not the constant default value '{default}'.".format(value=value, for_name=self._for_name(), default=default)) + + def sample(self, random_state: RandomState = None) -> T: + """ + Samples a random value from the hyper-parameter search space. 
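A small usage sketch for `Constant` (the hyper-parameter and value names are illustrative): only the default value passes validation, and sampling always returns it.

```python
from d3m.metadata import hyperparams

algorithm = hyperparams.Constant('auto')

algorithm.validate('auto')                 # passes
try:
    algorithm.validate('exact')
except Exception as error:
    print(error)                           # ... is not the constant default value 'auto'
print(algorithm.sample(random_state=0))    # always 'auto'
```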
+ + Parameters + ---------- + random_state: + A random seed or state to be used when sampling. + + Returns + ------- + A sampled value. + """ + + sklearn_validation.check_random_state(random_state) + + return self.get_default() + + def sample_multiple(self, min_samples: int = 0, max_samples: int = None, random_state: RandomState = None, *, with_replacement: bool = False) -> typing.Sequence[T]: + """ + Samples multiple random values from the hyper-parameter search space. At least ``min_samples`` + of them, and at most ``max_samples``. + + For the base class it always returns only a ``default`` value because the space + is unknown. + + Parameters + ---------- + min_samples: + A minimum number of samples to return. + max_samples: + A maximum number of samples to return. + random_state: + A random seed or state to be used when sampling. + with_replacement: + Are we sampling with replacement or without? + + Returns + ------- + A set (represented as a tuple) of multiple sampled values. + """ + + min_samples, max_samples = self._check_sample_size(min_samples, max_samples, with_replacement) + + random_state = sklearn_validation.check_random_state(random_state) + + if with_replacement: + size = random_state.randint(min_samples, max_samples + 1) + + return (self.get_default(),) * size + + else: + if min_samples > 0: + assert min_samples == 1, min_samples + assert max_samples == 1, max_samples + return (self.get_default(),) + elif max_samples < 1: + assert min_samples == 0, min_samples + assert max_samples == 0, max_samples + return () + else: + assert min_samples == 0, min_samples + assert max_samples == 1, max_samples + return typing.cast(typing.Sequence[T], () if random_state.rand() >= 0.5 else (self.get_default(),)) + + +class Bounded(Hyperparameter[T]): + """ + A bounded hyper-parameter with lower and upper bounds, but no other + information about the distribution of the space of the hyper-parameter, + besides a default value. + + Both lower and upper bounds are inclusive by default. Each bound can be + also ``None`` to signal that the hyper-parameter is unbounded for that bound. + Both bounds cannot be ``None`` because then this is the same as + ``Hyperparameter`` class, so you can use that one directly. + + Type variable ``T`` is optional and if not provided an attempt to + automatically infer it from bounds and ``default`` will be made. + + Attributes + ---------- + lower: + A lower bound. + lower_inclusive: + Is the lower bound inclusive? + upper: + An upper bound. + upper_inclusive: + Is the upper bound inclusive? + """ + + lower: typing.Any + lower_inclusive: bool + upper: typing.Any + upper_inclusive: bool + + def __init__(self, lower: T, upper: T, default: T, *, lower_inclusive: bool = True, upper_inclusive: bool = True, semantic_types: typing.Sequence[str] = None, description: str = None) -> None: + self.lower = lower + self.upper = upper + self.lower_inclusive = lower_inclusive + self.upper_inclusive = upper_inclusive + + if self.lower is None and self.upper is None: + raise exceptions.InvalidArgumentValueError("Lower and upper bounds cannot both be None.") + + self._validate_finite_float(self.lower) + self._validate_finite_float(self.upper) + + if self.lower is None: + self.lower_inclusive = False + if self.upper is None: + self.upper_inclusive = False + + self._lower_compare, self._upper_compare, self._lower_interval, self._upper_interval = self._get_operators(self.lower_inclusive, self.upper_inclusive) + + # If subclass has not already set it. 
+ if not hasattr(self, 'structural_type'): + structural_type = _get_structural_type_argument(self, T) # type: ignore + + if structural_type == typing.Any: + structural_types = list(self.infer_type(value) for value in [self.lower, self.upper, default] if value is not None) + type_util.simplify_for_Union(structural_types) + structural_type = typing.Union[tuple(structural_types)] # type: ignore + + self.structural_type = structural_type + + if self.lower is None or self.upper is None: + maybe_optional_structural_type = typing.cast(type, typing.Optional[self.structural_type]) # type: ignore + else: + maybe_optional_structural_type = self.structural_type + + if not self.check_type(self.lower, maybe_optional_structural_type): + raise exceptions.InvalidArgumentTypeError( + "Lower bound '{lower}' is not an instance of the structural type: {structural_type}".format( + lower=self.lower, structural_type=self.structural_type, + ) + ) + + if not self.check_type(self.upper, maybe_optional_structural_type): + raise exceptions.InvalidArgumentTypeError( + "Upper bound '{upper}' is not an instance of the structural type: {structural_type}".format( + upper=self.upper, structural_type=self.structural_type, + )) + + if self.lower is not None and self.upper is not None: + if not (self._lower_compare(self.lower, self.upper) and self._upper_compare(self.lower, self.upper)): + raise exceptions.InvalidArgumentValueError( + "Lower bound '{lower}' is not smaller than upper bound '{upper}'.".format( + lower=self.lower, upper=self.upper, + ) + ) + + self._initialize_effective_bounds() + + # Default value is checked to be inside bounds by parent class calling "validate". + + super().__init__(default, semantic_types=semantic_types, description=description) + + @classmethod + def _get_operators(cls, lower_inclusive: bool, upper_inclusive: bool) -> typing.Tuple[typing.Callable, typing.Callable, str, str]: + if lower_inclusive: + lower_compare = operator.le + lower_interval = '[' + else: + lower_compare = operator.lt + lower_interval = '(' + + if upper_inclusive: + upper_compare = operator.le + upper_interval = ']' + else: + upper_compare = operator.lt + upper_interval = ')' + + return lower_compare, upper_compare, lower_interval, upper_interval + + def _initialize_effective_bounds_float(self) -> None: + if self.lower_inclusive: + self._effective_lower = self.lower + else: + self._effective_lower = numpy.nextafter(self.lower, self.lower + 1) + + if self.upper_inclusive: + self._effective_upper = numpy.nextafter(self.upper, self.upper + 1) + else: + self._effective_upper = self.upper + + def _initialize_effective_bounds_int(self) -> None: + if self.lower_inclusive: + self._effective_lower = self.lower + else: + self._effective_lower = self.lower + 1 + + if self.upper_inclusive: + self._effective_upper = self.upper + 1 + else: + self._effective_upper = self.upper + + def _initialize_effective_bounds(self) -> None: + # If subclass has not already set it. 
+ if getattr(self, '_effective_lower', None) is None or getattr(self, '_effective_upper', None) is None: + if self.lower is None or self.upper is None: + self._effective_lower = None + self._effective_upper = None + self._is_int = False + self._is_float = False + elif utils.is_int(type(self.lower)) and utils.is_int(type(self.upper)): + self._initialize_effective_bounds_int() + self._is_int = True + self._is_float = False + elif utils.is_float(type(self.lower)) and utils.is_float(type(self.upper)): + self._initialize_effective_bounds_float() + self._is_int = False + self._is_float = True + else: + self._effective_lower = None + self._effective_upper = None + self._is_int = False + self._is_float = False + + if self._effective_lower is not None and self._effective_upper is not None and not (self._effective_lower < self._effective_upper): + raise exceptions.InvalidArgumentValueError( + "Effective lower bound '{lower}' is not smaller than upper bound '{upper}'.".format( + lower=self.lower, upper=self.upper, + ) + ) + + def validate(self, value: T) -> None: + super().validate(value) + + # This my throw an exception if value is not comparable, but this is on purpose. + if self.lower is None: + if not (value is None or self._upper_compare(value, self.upper)): # type: ignore + raise exceptions.InvalidArgumentValueError( + "Value '{value}' {for_name}is outside of range {lower_interval}{lower}, {upper}{upper_interval}.".format( + value=value, for_name=self._for_name(), lower_interval=self._lower_interval, + lower=self.lower, upper=self.upper, upper_interval=self._upper_interval, + ), + ) + elif self.upper is None: + if not (value is None or self._lower_compare(self.lower, value)): # type: ignore + raise exceptions.InvalidArgumentValueError( + "Value '{value}' {for_name}is outside of range {lower_interval}{lower}, {upper}{upper_interval}.".format( + value=value, for_name=self._for_name(), lower_interval=self._lower_interval, + lower=self.lower, upper=self.upper, upper_interval=self._upper_interval, + ), + ) + else: + if not (self._lower_compare(self.lower, value) and self._upper_compare(value, self.upper)): # type: ignore + raise exceptions.InvalidArgumentValueError( + "Value '{value}' {for_name}is outside of range {lower_interval}{lower}, {upper}{upper_interval}.".format( + value=value, for_name=self._for_name(), lower_interval=self._lower_interval, + lower=self.lower, upper=self.upper, upper_interval=self._upper_interval, + ), + ) + + def validate_default(self) -> None: + if self.lower is None or self.upper is None: + maybe_optional_structural_type = typing.cast(type, typing.Optional[self.structural_type]) # type: ignore + else: + maybe_optional_structural_type = self.structural_type + + structural_type = self.structural_type + try: + self.structural_type = maybe_optional_structural_type + super().validate_default() + finally: + self.structural_type = structural_type + + def sample(self, random_state: RandomState = None) -> T: + """ + Samples a random value from the hyper-parameter search space. + + If it is bounded on both sides, it tries to sample from uniform distribution, + otherwise returns a ``default`` value. + + Parameters + ---------- + random_state: + A random seed or state to be used when sampling. + + Returns + ------- + A sampled value. 
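A short sketch of `Bounded` validation (names are illustrative): the lower bound is inclusive by default, the upper bound can be made exclusive, and passing `None` for one bound leaves that side open.

```python
from d3m.metadata import hyperparams

contamination = hyperparams.Bounded(0.0, 0.5, 0.1, upper_inclusive=False)

contamination.validate(0.0)                # passes, lower bound is inclusive
try:
    contamination.validate(0.5)
except Exception as error:
    print(error)                           # outside of range [0.0, 0.5)

window_size = hyperparams.Bounded(1, None, 10)   # bounded below only
window_size.validate(10_000)                     # passes
```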
+ """ + + random_state = sklearn_validation.check_random_state(random_state) + + if getattr(self, '_is_int', False) or getattr(self, '_is_float', False): + utils.log_once( + logger, logging.WARNING, + "Sampling a bounded hyper-parameter '%(name)s' without known distribution. Sampling from a uniform distribution.", + {'name': self.name}, + stack_info=True, + ) + + if getattr(self, '_is_int', False): + return self.structural_type(random_state.randint(self._effective_lower, self._effective_upper)) + else: + return self.structural_type(random_state.uniform(self._effective_lower, self._effective_upper)) + + elif self.lower is not None and self.upper is not None: + utils.log_once( + logger, + logging.WARNING, + "Sampling a bounded hyper-parameter '%(name)s' with unsupported bounds. Using a default value.", + {'name': self.name}, + stack_info=True, + ) + + return self.get_default() + + else: + utils.log_once( + logger, + logging.WARNING, + "Sampling a semi-bounded hyper-parameter '%(name)s'. Using a default value.", + {'name': self.name}, stack_info=True, + ) + + return self.get_default() + + def get_max_samples(self) -> typing.Optional[int]: + if getattr(self, '_is_int', False): + return self._effective_upper - self._effective_lower + + elif getattr(self, '_is_float', False): + return None + + else: + return 1 + + def sample_multiple(self, min_samples: int = 0, max_samples: int = None, random_state: RandomState = None, *, with_replacement: bool = False) -> typing.Sequence[T]: + """ + Samples multiple random values from the hyper-parameter search space. At least ``min_samples`` + of them, and at most ``max_samples``. + + Parameters + ---------- + min_samples: + A minimum number of samples to return. + max_samples: + A maximum number of samples to return. + random_state: + A random seed or state to be used when sampling. + with_replacement: + Are we sampling with replacement or without? + + Returns + ------- + A set (represented as a tuple) of multiple sampled values. + """ + + min_samples, max_samples = self._check_sample_size(min_samples, max_samples, with_replacement) + + random_state = sklearn_validation.check_random_state(random_state) + + size = random_state.randint(min_samples, max_samples + 1) + + if with_replacement: + sample_list: list = [self.sample(random_state) for i in range(size)] + else: + sample_set: set = set() + sample_list = [] + while len(sample_list) != size: + value = self.sample(random_state) + if value not in sample_set: + sample_set.add(value) + sample_list.append(value) + + return tuple(sample_list) + + def __repr__(self) -> str: + return '{class_name}(lower={lower}, upper={upper}, default={default}, lower_inclusive={lower_inclusive}, upper_inclusive={upper_inclusive})'.format( + class_name=type(self).__name__, + lower=self.lower, + upper=self.upper, + default=self.get_default(), + lower_inclusive=self.lower_inclusive, + upper_inclusive=self.upper_inclusive, + ) + + def to_simple_structure(self) -> typing.Dict: + structure = super().to_simple_structure() + structure.update({ + 'lower': self.lower, + 'upper': self.upper, + 'lower_inclusive': self.lower_inclusive, + 'upper_inclusive': self.upper_inclusive, + }) + return structure + + +class Enumeration(Hyperparameter[T]): + """ + An enumeration hyper-parameter with a value drawn uniformly from a list of values. + + If ``None`` is a valid choice, it should be listed among ``values``. + + Type variable ``T`` is optional and if not provided an attempt to + automatically infer it from ``values`` will be made. 
+ + Attributes + ---------- + values: + A list of choice values. + """ + + values: typing.Sequence[typing.Any] + + def __init__(self, values: typing.Sequence[T], default: T, *, semantic_types: typing.Sequence[str] = None, description: str = None) -> None: + self.values = values + + # Used for sampling. + # See: https://github.com/numpy/numpy/issues/15935 + self._choices = numpy.array(list(self.values), dtype=object) + + # If subclass has not already set it. + if not hasattr(self, 'structural_type'): + structural_type = _get_structural_type_argument(self, T) # type: ignore + + if structural_type == typing.Any: + structural_types = list(self.infer_type(value) for value in self.values) + type_util.simplify_for_Union(structural_types) + structural_type = typing.Union[tuple(structural_types)] # type: ignore + + self.structural_type = structural_type + + for value in self.values: + if not self.check_type(value, self.structural_type): + raise exceptions.InvalidArgumentTypeError("Value '{value}' is not an instance of the structural type: {structural_type}".format(value=value, structural_type=self.structural_type)) + + # This also raises an exception if there is a "1.0" and "1" value in the list, so a float and + # and int of equal value. This is important because when storing as JSON floats can be converted + # to ints it they are integers. So we could not know which enumeration value it represents. + if utils.has_duplicates(self.values): + raise exceptions.InvalidArgumentValueError("Values '{values}' contain duplicates.".format(values=self.values)) + + self._has_nan = any(utils.is_float(type(value)) and numpy.isnan(value) for value in self.values) + + # Default value is checked to be among values by parent class calling "validate". + + super().__init__(default, semantic_types=semantic_types, description=description) + + def validate(self, value: T) -> None: + # We have to specially handle NaN because it is not equal to any value. + if value not in self.values and not (self._has_nan and utils.is_float(type(value)) and numpy.isnan(value)): + raise exceptions.InvalidArgumentValueError("Value '{value}' {for_name}is not among values.".format(value=value, for_name=self._for_name())) + + def sample(self, random_state: RandomState = None) -> T: + """ + Samples a random value from the hyper-parameter search space. + + It samples a value from ``values``. + + Parameters + ---------- + random_state: + A random seed or state to be used when sampling. + + Returns + ------- + A sampled value. + """ + + random_state = sklearn_validation.check_random_state(random_state) + + return random_state.choice(self._choices) + + def get_max_samples(self) -> typing.Optional[int]: + return len(self.values) + + def sample_multiple(self, min_samples: int = 0, max_samples: int = None, random_state: RandomState = None, *, with_replacement: bool = False) -> typing.Sequence[T]: + """ + Samples multiple random values from the hyper-parameter search space. At least ``min_samples`` + of them, and at most ``max_samples``. + + It samples values from ``values``. + + Parameters + ---------- + min_samples: + A minimum number of samples to return. + max_samples: + A maximum number of samples to return. + random_state: + A random seed or state to be used when sampling. + with_replacement: + Are we sampling with replacement or without? + + Returns + ------- + A set (represented as a tuple) of multiple sampled values. 
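A brief `Enumeration` sketch (the choices are illustrative): validation accepts only listed values and sampling draws uniformly from them.

```python
from d3m.metadata import hyperparams

kernel = hyperparams.Enumeration(['linear', 'poly', 'rbf'], 'rbf')

kernel.validate('poly')                # passes
print(kernel.get_max_samples())        # 3
print(kernel.sample(random_state=0))   # one of 'linear', 'poly', 'rbf', uniformly
try:
    kernel.validate('sigmoid')
except Exception as error:
    print(error)                       # ... is not among values.
```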
+ """ + + min_samples, max_samples = self._check_sample_size(min_samples, max_samples, with_replacement) + + random_state = sklearn_validation.check_random_state(random_state) + + size = random_state.randint(min_samples, max_samples + 1) + + return tuple(random_state.choice(self._choices, size, replace=with_replacement)) + + def __repr__(self) -> str: + return '{class_name}(values={values}, default={default})'.format( + class_name=type(self).__name__, + values=self.values, + default=self.get_default(), + ) + + def to_simple_structure(self) -> typing.Dict: + structure = super().to_simple_structure() + structure.update({ + 'values': list(self.values), + }) + return structure + + +class UniformBool(Enumeration[bool]): + """ + A bool hyper-parameter with a value drawn uniformly from ``{True, False}``. + """ + + def __init__(self, default: bool, *, semantic_types: typing.Sequence[str] = None, description: str = None) -> None: + super().__init__([True, False], default, semantic_types=semantic_types, description=description) + + def __repr__(self) -> str: + return '{class_name}(default={default})'.format( + class_name=type(self).__name__, + default=self.get_default(), + ) + + def to_simple_structure(self) -> typing.Dict: + structure = super().to_simple_structure() + del structure['values'] + return structure + + +class UniformInt(Bounded[int]): + """ + An int hyper-parameter with a value drawn uniformly from ``[lower, upper)``, + by default. + + Attributes + ---------- + lower: + A lower bound. + lower_inclusive: + Is the lower bound inclusive? + upper: + An upper bound. + upper_inclusive: + Is the upper bound inclusive? + """ + + lower: int + lower_inclusive: bool + upper: int + upper_inclusive: bool + + def __init__( + self, lower: int, upper: int, default: int, *, lower_inclusive: bool = True, upper_inclusive: bool = False, + semantic_types: typing.Sequence[str] = None, description: str = None, + ) -> None: + # Just to make sure because parent class allow None values. + if lower is None or upper is None: + raise exceptions.InvalidArgumentValueError("Bounds cannot be None.") + + # Default value is checked to be inside bounds by parent class calling "validate". + + super().__init__(lower, upper, default, lower_inclusive=lower_inclusive, upper_inclusive=upper_inclusive, semantic_types=semantic_types, description=description) + + def _initialize_effective_bounds(self) -> None: + self._initialize_effective_bounds_int() + + super()._initialize_effective_bounds() + + def sample(self, random_state: RandomState = None) -> int: + """ + Samples a random value from the hyper-parameter search space. + + Parameters + ---------- + random_state: + A random seed or state to be used when sampling. + + Returns + ------- + A sampled value. + """ + + random_state = sklearn_validation.check_random_state(random_state) + + return self.structural_type(random_state.randint(self._effective_lower, self._effective_upper)) + + def get_max_samples(self) -> typing.Optional[int]: + return self._effective_upper - self._effective_lower + + +class Uniform(Bounded[float]): + """ + A float hyper-parameter with a value drawn uniformly from ``[lower, upper)``, + by default. + + If ``q`` is provided, then the value is drawn according to ``round(uniform(lower, upper) / q) * q``. + + Attributes + ---------- + lower: + A lower bound. + upper: + An upper bound. + q: + An optional quantization factor. + lower_inclusive: + Is the lower bound inclusive? + upper_inclusive: + Is the upper bound inclusive? 
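A sketch of the two discrete uniform variants above (names are illustrative); note that `UniformInt` samples from the half-open interval `[lower, upper)` by default.

```python
from d3m.metadata import hyperparams

use_bias = hyperparams.UniformBool(True)
n_estimators = hyperparams.UniformInt(10, 1000, 100)   # drawn from [10, 1000) by default

print(use_bias.sample(random_state=0))       # True or False, uniformly
print(n_estimators.sample(random_state=0))   # an int in [10, 1000)
print(n_estimators.get_max_samples())        # 990 distinct values
```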
+ """ + + lower: float + upper: float + q: float + lower_inclusive: bool + upper_inclusive: bool + + def __init__( + self, lower: float, upper: float, default: float, q: float = None, *, lower_inclusive: bool = True, upper_inclusive: bool = False, + semantic_types: typing.Sequence[str] = None, description: str = None, + ) -> None: + # Just to make sure because parent class allow None values. + if lower is None or upper is None: + raise exceptions.InvalidArgumentValueError("Bounds cannot be None.") + + self.q = q + + # Default value is checked to be inside bounds by parent class calling "validate". + + super().__init__(lower, upper, default, lower_inclusive=lower_inclusive, upper_inclusive=upper_inclusive, semantic_types=semantic_types, description=description) + + def _initialize_effective_bounds(self) -> None: + self._initialize_effective_bounds_float() + + super()._initialize_effective_bounds() + + def sample(self, random_state: RandomState = None) -> float: + """ + Samples a random value from the hyper-parameter search space. + + Parameters + ---------- + random_state: + A random seed or state to be used when sampling. + + Returns + ------- + A sampled value. + """ + + random_state = sklearn_validation.check_random_state(random_state) + + value = random_state.uniform(self._effective_lower, self._effective_upper) + + if self.q is None: + return self.structural_type(value) + else: + return self.structural_type(numpy.round(value / self.q) * self.q) + + def get_max_samples(self) -> typing.Optional[int]: + return None + + def __repr__(self) -> str: + return '{class_name}(lower={lower}, upper={upper}, q={q}, default={default}, lower_inclusive={lower_inclusive}, upper_inclusive={upper_inclusive})'.format( + class_name=type(self).__name__, + lower=self.lower, + upper=self.upper, + q=self.q, + default=self.get_default(), + lower_inclusive=self.lower_inclusive, + upper_inclusive=self.upper_inclusive, + ) + + def to_simple_structure(self) -> typing.Dict: + structure = super().to_simple_structure() + + structure.update({ + 'lower': self.lower, + 'upper': self.upper, + 'lower_inclusive': self.lower_inclusive, + 'upper_inclusive': self.upper_inclusive, + }) + + if self.q is not None: + structure['q'] = self.q + + return structure + + +class LogUniform(Bounded[float]): + """ + A float hyper-parameter with a value drawn from ``[lower, upper)``, by default, + according to ``exp(uniform(log(lower), log(upper)))`` + so that the logarithm of the value is uniformly distributed. + + If ``q`` is provided, then the value is drawn according to ``round(exp(uniform(log(lower), log(upper))) / q) * q``. + + Attributes + ---------- + lower: + A lower bound. + upper: + An upper bound. + q: + An optional quantization factor. + lower_inclusive: + Is the lower bound inclusive? + upper_inclusive: + Is the upper bound inclusive? + """ + + lower: float + upper: float + q: float + lower_inclusive: bool + upper_inclusive: bool + + def __init__( + self, lower: float, upper: float, default: float, q: float = None, *, lower_inclusive: bool = True, upper_inclusive: bool = False, + semantic_types: typing.Sequence[str] = None, description: str = None, + ) -> None: + # Just to make sure because parent class allow None values. + if lower is None or upper is None: + raise exceptions.InvalidArgumentValueError("Bounds cannot be None.") + + self.q = q + + # Default value is checked to be inside bounds by parent class calling "validate". 
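A sketch of the continuous uniform variants (names are illustrative): `Uniform` draws from `[lower, upper)`, optionally quantized with `q`, and `LogUniform` (defined just below) draws so that the logarithm of the value is uniformly distributed.

```python
from d3m.metadata import hyperparams

dropout = hyperparams.Uniform(0.0, 1.0, 0.5)               # uniform over [0.0, 1.0)
momentum = hyperparams.Uniform(0.5, 1.0, 0.9, q=0.05)      # quantized to multiples of 0.05
learning_rate = hyperparams.LogUniform(1e-5, 1e-1, 1e-3)   # log of the value is uniform

print(dropout.sample(random_state=0))
print(momentum.sample(random_state=0))        # (approximately) a multiple of 0.05
print(learning_rate.sample(random_state=0))   # always within [1e-5, 1e-1)
```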
+ + super().__init__(lower, upper, default, lower_inclusive=lower_inclusive, upper_inclusive=upper_inclusive, semantic_types=semantic_types, description=description) + + def _initialize_effective_bounds(self) -> None: + self._initialize_effective_bounds_float() + + super()._initialize_effective_bounds() + + def sample(self, random_state: RandomState = None) -> float: + """ + Samples a random value from the hyper-parameter search space. + + Parameters + ---------- + random_state: + A random seed or state to be used when sampling. + + Returns + ------- + A sampled value. + """ + + random_state = sklearn_validation.check_random_state(random_state) + + value = numpy.exp(random_state.uniform(numpy.log(self._effective_lower), numpy.log(self._effective_upper))) + + if self.q is None: + return self.structural_type(value) + else: + return self.structural_type(numpy.round(value / self.q) * self.q) + + def get_max_samples(self) -> typing.Optional[int]: + return None + + def __repr__(self) -> str: + return '{class_name}(lower={lower}, upper={upper}, q={q}, default={default}, lower_inclusive={lower_inclusive}, upper_inclusive={upper_inclusive})'.format( + class_name=type(self).__name__, + lower=self.lower, + upper=self.upper, + q=self.q, + default=self.get_default(), + lower_inclusive=self.lower_inclusive, + upper_inclusive=self.upper_inclusive, + ) + + def to_simple_structure(self) -> typing.Dict: + structure = super().to_simple_structure() + + structure.update({ + 'lower': self.lower, + 'upper': self.upper, + 'lower_inclusive': self.lower_inclusive, + 'upper_inclusive': self.upper_inclusive, + }) + + if self.q is not None: + structure['q'] = self.q + + return structure + + +class Normal(Hyperparameter[float]): + """ + A float hyper-parameter with a value drawn normally distributed according to ``mu`` and ``sigma``. + + If ``q`` is provided, then the value is drawn according to ``round(normal(mu, sigma) / q) * q``. + + Attributes + ---------- + mu: + A mean of normal distribution. + sigma: + A standard deviation of normal distribution. + q: + An optional quantization factor. + """ + + mu: float + sigma: float + q: float + + def __init__(self, mu: float, sigma: float, default: float, q: float = None, *, semantic_types: typing.Sequence[str] = None, description: str = None) -> None: + self.mu = mu + self.sigma = sigma + self.q = q + + self._validate_finite_float(self.mu) + self._validate_finite_float(self.sigma) + self._validate_finite_float(self.q) + + super().__init__(default, semantic_types=semantic_types, description=description) + + def sample(self, random_state: RandomState = None) -> float: + """ + Samples a random value from the hyper-parameter search space. + + Parameters + ---------- + random_state: + A random seed or state to be used when sampling. + + Returns + ------- + A sampled value. + """ + + random_state = sklearn_validation.check_random_state(random_state) + + value = random_state.normal(self.mu, self.sigma) + + if self.q is None: + return self.structural_type(value) + else: + return self.structural_type(numpy.round(value / self.q) * self.q) + + def get_max_samples(self) -> typing.Optional[int]: + return None + + def sample_multiple(self, min_samples: int = 0, max_samples: int = None, random_state: RandomState = None, *, with_replacement: bool = False) -> typing.Sequence[T]: + """ + Samples multiple random values from the hyper-parameter search space. At least ``min_samples`` + of them, and at most ``max_samples``. 
+ + Parameters + ---------- + min_samples: + A minimum number of samples to return. + max_samples: + A maximum number of samples to return. + random_state: + A random seed or state to be used when sampling. + + Returns + ------- + A set (represented as a tuple) of multiple sampled values. + """ + + min_samples, max_samples = self._check_sample_size(min_samples, max_samples, with_replacement) + + random_state = sklearn_validation.check_random_state(random_state) + + size = random_state.randint(min_samples, max_samples + 1) + + if with_replacement: + sample_list: list = [self.sample(random_state) for i in range(size)] + else: + sample_set: set = set() + sample_list = [] + while len(sample_list) != size: + value = self.sample(random_state) + if value not in sample_set: + sample_set.add(value) + sample_list.append(value) + + return tuple(sample_list) + + def __repr__(self) -> str: + return '{class_name}(mu={mu}, sigma={sigma}, q={q}, default={default})'.format( + class_name=type(self).__name__, + mu=self.mu, + sigma=self.sigma, + q=self.q, + default=self.get_default(), + ) + + def to_simple_structure(self) -> typing.Dict: + structure = super().to_simple_structure() + + structure.update({ + 'mu': self.mu, + 'sigma': self.sigma, + }) + + if self.q is not None: + structure['q'] = self.q + + return structure + + +class LogNormal(Hyperparameter[float]): + """ + A float hyper-parameter with a value drawn according to ``exp(normal(mu, sigma))`` so that the logarithm of the value is + normally distributed. + + If ``q`` is provided, then the value is drawn according to ``round(exp(normal(mu, sigma)) / q) * q``. + + Attributes + ---------- + mu: + A mean of normal distribution. + sigma: + A standard deviation of normal distribution. + q: + An optional quantization factor. + """ + + mu: float + sigma: float + q: float + + def __init__(self, mu: float, sigma: float, default: float, q: float = None, *, semantic_types: typing.Sequence[str] = None, description: str = None) -> None: + self.mu = mu + self.sigma = sigma + self.q = q + + self._validate_finite_float(self.mu) + self._validate_finite_float(self.sigma) + self._validate_finite_float(self.q) + + super().__init__(default, semantic_types=semantic_types, description=description) + + def sample(self, random_state: RandomState = None) -> float: + """ + Samples a random value from the hyper-parameter search space. + + Parameters + ---------- + random_state: + A random seed or state to be used when sampling. + + Returns + ------- + A sampled value. + """ + + random_state = sklearn_validation.check_random_state(random_state) + + value = numpy.exp(random_state.normal(self.mu, self.sigma)) + + if self.q is None: + return self.structural_type(value) + else: + return self.structural_type(numpy.round(value / self.q) * self.q) + + def get_max_samples(self) -> typing.Optional[int]: + return None + + def sample_multiple(self, min_samples: int = 0, max_samples: int = None, random_state: RandomState = None, *, with_replacement: bool = False) -> typing.Sequence[T]: + """ + Samples multiple random values from the hyper-parameter search space. At least ``min_samples`` + of them, and at most ``max_samples``. + + Parameters + ---------- + min_samples: + A minimum number of samples to return. + max_samples: + A maximum number of samples to return. + random_state: + A random seed or state to be used when sampling. + with_replacement: + Are we sampling with replacement or without? + + Returns + ------- + A set (represented as a tuple) of multiple sampled values. 
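A sketch contrasting the two unbounded distributions (names are illustrative): `Normal` samples from a Gaussian, `LogNormal` exponentiates a Gaussian draw, and neither reports a finite number of possible samples.

```python
from d3m.metadata import hyperparams

init_scale = hyperparams.Normal(mu=0.0, sigma=0.1, default=0.05)
weight_decay = hyperparams.LogNormal(mu=-5.0, sigma=1.0, default=0.01)

print(init_scale.sample(random_state=0))     # a float drawn from N(0, 0.1); the space is unbounded
print(weight_decay.sample(random_state=0))   # exp of a N(-5, 1) draw, so always positive
print(init_scale.get_max_samples())          # None, infinitely many values are possible
```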
+ """ + + min_samples, max_samples = self._check_sample_size(min_samples, max_samples, with_replacement) + + random_state = sklearn_validation.check_random_state(random_state) + + size = random_state.randint(min_samples, max_samples + 1) + + if with_replacement: + sample_list: list = [self.sample(random_state) for i in range(size)] + else: + sample_set: set = set() + sample_list = [] + while len(sample_list) != size: + value = self.sample(random_state) + if value not in sample_set: + sample_set.add(value) + sample_list.append(value) + + return tuple(sample_list) + + def __repr__(self) -> str: + return '{class_name}(mu={mu}, sigma={sigma}, q={q}, default={default})'.format( + class_name=type(self).__name__, + mu=self.mu, + sigma=self.sigma, + q=self.q, + default=self.get_default(), + ) + + def to_simple_structure(self) -> typing.Dict: + structure = super().to_simple_structure() + + structure.update({ + 'mu': self.mu, + 'sigma': self.sigma, + }) + + if self.q is not None: + structure['q'] = self.q + + return structure + + +class Union(Hyperparameter[T]): + """ + A union hyper-parameter which combines multiple other hyper-parameters. + + This is useful when a hyper-parameter has multiple modalities and each modality + can be described with a different hyper-parameter. + + No relation or probability distribution between modalities is prescribed, but + default sampling implementation assumes uniform distribution of modalities. + + Type variable ``T`` does not have to be specified because the structural type + can be automatically inferred as a union of all hyper-parameters in configuration. + + This is similar to `Choice` hyper-parameter that it combines hyper-parameters, but + `Union` combines individual hyper-parameters, while `Choice` combines configurations + of multiple hyper-parameters. + + Attributes + ---------- + configuration: + A configuration of hyper-parameters to combine into one. It is important + that configuration uses an ordered dict so that order is reproducible + (default dict has unspecified order). + """ + + configuration: frozendict.FrozenOrderedDict + + def __init__(self, configuration: 'collections.OrderedDict[str, Hyperparameter]', default: str, *, semantic_types: typing.Sequence[str] = None, + description: str = None) -> None: + if default not in configuration: + raise exceptions.InvalidArgumentValueError("Default value '{default}' is not in configuration.".format(default=default)) + + self.default_hyperparameter = configuration[default] + self.configuration = frozendict.FrozenOrderedDict(configuration) + + # Used for sampling. + # See: https://github.com/numpy/numpy/issues/15935 + self._choices = numpy.array(list(self.configuration.values()), dtype=object) + + for name, hyperparameter in self.configuration.items(): + if not isinstance(name, str): + raise exceptions.InvalidArgumentTypeError("Hyper-parameter name is not a string: {name}".format(name=name)) + if not isinstance(hyperparameter, Hyperparameter): + raise exceptions.InvalidArgumentTypeError("Hyper-parameter description is not an instance of the Hyperparameter class: {name}".format(name=name)) + + # If subclass has not already set it. 
+ if not hasattr(self, 'structural_type'): + structural_type = _get_structural_type_argument(self, T) # type: ignore + + if structural_type == typing.Any: + structural_type = typing.Union[tuple(hyperparameter.structural_type for hyperparameter in self.configuration.values())] # type: ignore + + self.structural_type = structural_type + + for name, hyperparameter in self.configuration.items(): + if not utils.is_subclass(hyperparameter.structural_type, self.structural_type): + raise exceptions.InvalidArgumentTypeError( + "Hyper-parameter '{name}' is not a subclass of the structural type: {structural_type}".format( + name=name, structural_type=self.structural_type, + ) + ) + + super().__init__(self.configuration[default].get_default(), semantic_types=semantic_types, description=description) + + def contribute_to_class(self, name: str) -> None: + super().contribute_to_class(name) + + for hyperparameter_name, hyperparameter in self.configuration.items(): + hyperparameter.contribute_to_class('{name}.{hyperparameter_name}'.format(name=self.name, hyperparameter_name=hyperparameter_name)) + + def validate(self, value: T) -> None: + # Check that value belongs to the structural type. + super().validate(value) + + for name, hyperparameter in self.configuration.items(): + try: + hyperparameter.validate(value) + # Value validated with at least one hyper-parameter, we can return. + return + except Exception: + pass + + raise exceptions.InvalidArgumentValueError("Value '{value}' {for_name}has not validated with any of configured hyper-parameters.".format(value=value, for_name=self._for_name())) + + def value_to_json_structure(self, value: T) -> typing.Any: + # We could first call "self.validate" and then once more traverse configuration, + # but we instead re-implement validation like it is implemented in "self.validate", + # but also convert the value once we find configuration which passes validation. + + # Check that value belongs to the structural type. + super().validate(value) + + for name, hyperparameter in self.configuration.items(): + try: + hyperparameter.validate(value) + # Value validated with this hyper-parameter. + return { + 'case': name, + 'value': hyperparameter.value_to_json_structure(value), + } + except Exception: + pass + + raise exceptions.InvalidArgumentValueError("Value '{value}' {for_name}has not validated with any of configured hyper-parameters.".format(value=value, for_name=self._for_name())) + + def value_from_json_structure(self, json: typing.Any) -> T: + if isinstance(json, dict): + value = self.configuration[json['case']].value_from_json_structure(json['value']) + + # No need to traverse configuration again, configuration's + # "value_from_json_structure" already validated the value. + # We just check that value belongs to the structural type. + super().validate(value) + + else: + # Backwards compatibility. We just take value as-is and hope JSON encoding has + # not changed the type from float to int in a way that it breaks the primitive. + logger.warning("Converting union hyper-parameter '%(name)s' from a deprecated JSON structure. It might be converted badly.", {'name': self.name}) + + value = super().value_to_json_structure(json) + + return value + + def sample(self, random_state: RandomState = None) -> T: + """ + Samples a random value from the hyper-parameter search space. + + It first chooses a hyper-parameter from its configuration and then + samples it. + + Parameters + ---------- + random_state: + A random seed or state to be used when sampling. 
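A sketch of a typical `Union`, assuming the same import (the 'fixed'/'auto' modalities are illustrative): an explicit integer and a ``None`` marker combined into one hyper-parameter, with the JSON form recording which case matched.

```python
import collections

from d3m.metadata import hyperparams

n_clusters = hyperparams.Union(
    configuration=collections.OrderedDict([
        ('fixed', hyperparams.UniformInt(2, 20, 8)),   # an explicit number of clusters
        ('auto', hyperparams.Constant(None)),          # let the primitive decide
    ]),
    default='fixed',
)

n_clusters.validate(5)                          # validated by the 'fixed' case
n_clusters.validate(None)                       # validated by the 'auto' case
print(n_clusters.value_to_json_structure(5))    # {'case': 'fixed', 'value': 5}
print(n_clusters.value_from_json_structure({'case': 'auto', 'value': None}))  # None
```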
+ + Returns + ------- + A sampled value. + """ + + random_state = sklearn_validation.check_random_state(random_state) + + hyperparameter = random_state.choice(self._choices) + + return hyperparameter.sample(random_state) + + @functools.lru_cache() + def get_max_samples(self) -> typing.Optional[int]: # type: ignore + all_max_samples = 0 + for hyperparameter in self.configuration.values(): + hyperparameter_max_samples = hyperparameter.get_max_samples() + if hyperparameter_max_samples is None: + return None + else: + # TODO: Assumption here is that values between hyper-parameters are independent. What when they are not? + # For example, union of UniformInt(0, 10) and UniformInt(5, 15) does not have 20 samples, but only 15 possible. + all_max_samples += hyperparameter_max_samples + + return all_max_samples + + def sample_multiple(self, min_samples: int = 0, max_samples: int = None, random_state: RandomState = None, *, with_replacement: bool = False) -> typing.Sequence[T]: + """ + Samples multiple random values from the hyper-parameter search space. At least ``min_samples`` + of them, and at most ``max_samples``. + + Parameters + ---------- + min_samples: + A minimum number of samples to return. + max_samples: + A maximum number of samples to return. + random_state: + A random seed or state to be used when sampling. + with_replacement: + Are we sampling with replacement or without? + + Returns + ------- + A set (represented as a tuple) of multiple sampled values. + """ + + min_samples, max_samples = self._check_sample_size(min_samples, max_samples, with_replacement) + + random_state = sklearn_validation.check_random_state(random_state) + + size = random_state.randint(min_samples, max_samples + 1) + + if with_replacement: + sample_list: list = [self.sample(random_state) for i in range(size)] + else: + sample_set: set = set() + sample_list = [] + while len(sample_list) != size: + value = self.sample(random_state) + if value not in sample_set: + sample_set.add(value) + sample_list.append(value) + + return tuple(sample_list) + + @functools.lru_cache() + def __repr__(self) -> str: # type: ignore + return '{class_name}(configuration={{{configuration}}}, default={default})'.format( + class_name=type(self).__name__, + configuration=', '.join('{name}: {hyperparameter}'.format(name=name, hyperparameter=hyperparameter) for name, hyperparameter in self.configuration.items()), + default=self.get_default(), + ) + + @functools.lru_cache() + def to_simple_structure(self) -> typing.Dict: # type: ignore + structure = super().to_simple_structure() + structure.update({ + 'configuration': {name: hyperparameter.to_simple_structure() for name, hyperparameter in self.configuration.items()} + }) + return structure + + def traverse(self) -> 'typing.Iterator[Hyperparameter]': + yield from super().traverse() + + for hyperparameter in self.configuration.values(): + yield hyperparameter + yield from hyperparameter.traverse() + + +class Choice(Hyperparameter[typing.Dict]): + """ + A hyper-parameter which combines multiple hyper-parameter configurations into one + hyper-parameter. + + This is useful when a combination of hyper-parameters should exists together. + Then such combinations can be made each into one choice. + + No relation or probability distribution between choices is prescribed. + + This is similar to `Union` hyper-parameter that it combines hyper-parameters, but + `Choice` combines configurations of multiple hyper-parameters, while `Union` combines + individual hyper-parameters. 
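For instance, a `Union` over two plain hyper-parameters might be declared as below (a minimal sketch assuming the usual `d3m.metadata.hyperparams` import; the name and values are illustrative only). A corresponding `Choice` sketch follows further below.
```
import collections

from d3m.metadata import hyperparams

# A hyper-parameter whose value is either a fixed integer or the string 'auto'
# (the name and the values are made up for illustration).
n_components = hyperparams.Union(
    configuration=collections.OrderedDict(
        fixed=hyperparams.Hyperparameter[int](10),
        auto=hyperparams.Hyperparameter[str]('auto'),
    ),
    default='fixed',
)

n_components.validate(25)       # validates against the 'fixed' modality
n_components.validate('auto')   # validates against the 'auto' modality

# JSON serialization records which modality matched.
print(n_components.value_to_json_structure('auto'))
# {'case': 'auto', 'value': 'auto'}
```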
+ + Attributes + ---------- + choices: + A map between choices and their classes defining their hyper-parameters configuration. + """ + + choices: frozendict.frozendict + + def __init__(self, choices: 'typing.Dict[str, typing.Type[Hyperparams]]', default: str, *, semantic_types: typing.Sequence[str] = None, + description: str = None) -> None: + if default not in choices: + raise exceptions.InvalidArgumentValueError("Default value '{default}' is not among choices.".format(default=default)) + + choices = copy.copy(choices) + + for choice, hyperparams in choices.items(): + if not isinstance(choice, str): + raise exceptions.InvalidArgumentTypeError("Choice is not a string: {choice}".format(choice=choice)) + if not issubclass(hyperparams, Hyperparams): + raise exceptions.InvalidArgumentTypeError("Hyper-parameters space is not a subclass of 'Hyperparams' class: {choice}".format(choice=choice)) + if 'choice' in hyperparams.configuration: + raise ValueError("Hyper-parameters space contains a reserved hyper-paramater name 'choice': {choice}".format(choice=choice)) + + configuration = collections.OrderedDict(hyperparams.configuration) + configuration['choice'] = Hyperparameter[str](choice, semantic_types=['https://metadata.datadrivendiscovery.org/types/ChoiceParameter']) + + # We make a copy/subclass adding "choice" hyper-parameter. We add a name suffix to differentiate it from the parent class. + choices[choice] = hyperparams.define(configuration, class_name='{name}WithChoice'.format(name=hyperparams.__name__), module_name=hyperparams.__module__) + + self.default_hyperparams = choices[default] + self.choices = frozendict.frozendict(choices) + + # Used for sampling. + # See: https://github.com/numpy/numpy/issues/15935 + self._choices = numpy.array(list(self.choices.keys()), dtype=object) + + # Copy defaults and add "choice". + defaults = self.choices[default](self.choices[default].defaults(), choice=default) + + # If subclass has not already set it. + if not hasattr(self, 'structural_type'): + # Choices do not really have a free type argument, so this is probably the same as "dict". + self.structural_type = _get_structural_type_argument(self, T) + + super().__init__(defaults, semantic_types=semantic_types, description=description) + + # We go over all hyper-parameter configurations and set their names. This means that names should not already + # be set. This is by default so if "Hyperparams.define" is used, but if one defines a custom class, + # you have to define it like "class MyHyperparams(Hyperparams, set_names=False): ..." + def contribute_to_class(self, name: str) -> None: + super().contribute_to_class(name) + + for choice, hyperparams in self.choices.items(): + for hyperparameter_name, hyperparameter in hyperparams.configuration.items(): + hyperparameter.contribute_to_class('{name}.{choice}.{hyperparameter_name}'.format(name=self.name, choice=choice, hyperparameter_name=hyperparameter_name)) + + def get_default(self, path: str = None) -> typing.Any: + if path is None: + return super().get_default(path) + + if '.' not in path: + return self.choices[path].defaults() + else: + segment, rest = path.split('.', 1) + return self.choices[segment].defaults(rest) + + def validate(self, value: dict) -> None: + # Check that value belongs to the structural type, a dict. 
+ super().validate(value) + + if 'choice' not in value: + raise exceptions.InvalidArgumentValueError("'choice' is missing in '{value}' {for_name}.".format(value=value, for_name=self._for_name())) + + self.choices[value['choice']].validate(value) + + def sample(self, random_state: RandomState = None) -> dict: + """ + Samples a random value from the hyper-parameter search space. + + It first chooses a hyper-parameters configuration from available choices and then + samples it. + + Parameters + ---------- + random_state: + A random seed or state to be used when sampling. + + Returns + ------- + A sampled value. + """ + + random_state = sklearn_validation.check_random_state(random_state) + + choice = random_state.choice(self._choices) + + sample = self.choices[choice].sample(random_state) + + # The "choice" hyper-parameter should be sampled to its choice value. + assert choice == sample['choice'], sample + + return sample + + @functools.lru_cache() + def get_max_samples(self) -> typing.Optional[int]: # type: ignore + all_max_samples = 0 + for hyperparams in self.choices.values(): + hyperparams_max_samples = hyperparams.get_max_samples() + if hyperparams_max_samples is None: + return None + else: + all_max_samples += hyperparams_max_samples + return all_max_samples + + def sample_multiple(self, min_samples: int = 0, max_samples: int = None, random_state: RandomState = None, *, with_replacement: bool = False) -> typing.Sequence[T]: + """ + Samples multiple random values from the hyper-parameter search space. At least ``min_samples`` + of them, and at most ``max_samples``. + + Parameters + ---------- + min_samples: + A minimum number of samples to return. + max_samples: + A maximum number of samples to return. + random_state: + A random seed or state to be used when sampling. + with_replacement: + Are we sampling with replacement or without? + + Returns + ------- + A set (represented as a tuple) of multiple sampled values. 
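To make the `Choice` machinery concrete, a hypothetical sketch (the configuration names and values are made up; `Hyperparams.define`, used here to build the per-choice configurations, is defined later in this file):
```
import collections

from d3m.metadata import hyperparams

# Two alternative hyper-parameter configurations (illustrative names and values).
NoNormalization = hyperparams.Hyperparams.define(collections.OrderedDict())
Scaling = hyperparams.Hyperparams.define(collections.OrderedDict(
    factor=hyperparams.Hyperparameter[float](1.0),
))

normalization = hyperparams.Choice(
    choices={'none': NoNormalization, 'scale': Scaling},
    default='none',
)

# Values are dicts which always carry the reserved 'choice' key.
print(dict(normalization.get_default()))          # {'choice': 'none'}
print(dict(normalization.get_default('scale')))   # {'factor': 1.0, 'choice': 'scale'}

normalization.validate({'choice': 'scale', 'factor': 2.5})
```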
+ """ + + min_samples, max_samples = self._check_sample_size(min_samples, max_samples, with_replacement) + + random_state = sklearn_validation.check_random_state(random_state) + + size = random_state.randint(min_samples, max_samples + 1) + + if with_replacement: + sample_list: list = [self.sample(random_state) for i in range(size)] + else: + sample_set: set = set() + sample_list = [] + while len(sample_list) != size: + value = self.sample(random_state) + if value not in sample_set: + sample_set.add(value) + sample_list.append(value) + + return tuple(sample_list) + + @functools.lru_cache() + def __repr__(self) -> str: # type: ignore + return '{class_name}(choices={{{choices}}}, default={default})'.format( + class_name=type(self).__name__, + choices=', '.join('{choice}: {hyperparams}'.format(choice=choice, hyperparams=hyperparams) for choice, hyperparams in self.choices.items()), + default=self.get_default(), + ) + + @functools.lru_cache() + def to_simple_structure(self) -> typing.Dict: # type: ignore + structure = super().to_simple_structure() + structure.update({ + 'choices': {choice: hyperparams.to_simple_structure() for choice, hyperparams in self.choices.items()} + }) + return structure + + @deprecate.function(message="use value_to_json_structure method instead") + def value_to_json(self, value: dict) -> typing.Any: + return self.value_to_json_structure(value) + + def value_to_json_structure(self, value: dict) -> typing.Any: + self.validate(value) + + return self.choices[value['choice']](value).values_to_json_structure() + + @deprecate.function(message="use value_from_json_structure method instead") + def value_from_json(self, json: typing.Any) -> dict: + return self.value_from_json_structure(json) + + def value_from_json_structure(self, json: typing.Any) -> dict: + value = self.choices[json['choice']].values_from_json_structure(json) + + self.validate(value) + + return value + + def traverse(self) -> 'typing.Iterator[Hyperparameter]': + yield from super().traverse() + + for hyperparams in self.choices.values(): + yield from hyperparams.traverse() + + def transform_value(self, value: dict, transform: typing.Callable, index: int = 0) -> dict: + if 'choice' not in value: + raise exceptions.InvalidArgumentValueError("'choice' is missing in '{value}' {for_name}.".format(value=value, for_name=self._for_name())) + + return self.choices[value['choice']].transform_value(value, transform, index + sorted(self.choices.keys()).index(value['choice'])) + + +# TODO: "elements" hyper-parameter still needs a default. Can we get rid of that somehow? It is not used. +# Maybe we should require that just top-level hyper-parameter instances need defaults, but not all. +class _Sequence(Hyperparameter[S]): + """ + Abstract class. Do not use directly. + + Attributes + ---------- + elements: + A hyper-parameter or hyper-parameters configuration of set elements. + min_size: + A minimal number of elements in the set. + max_size: + A maximal number of elements in the set. Can be ``None`` for no limit. + is_configuration: + Is ``elements`` a hyper-parameter or hyper-parameters configuration? 
+ """ + + elements: 'typing.Union[Hyperparameter, typing.Type[Hyperparams]]' + min_size: int + max_size: int + is_configuration: bool + + def __init__( + self, elements: 'typing.Union[Hyperparameter, typing.Type[Hyperparams]]', default: S, min_size: int = 0, max_size: int = None, *, + semantic_types: typing.Sequence[str] = None, description: str = None, + ) -> None: + self.elements = elements + self.min_size = min_size + self.max_size = max_size + self.is_configuration = utils.is_type(self.elements) and issubclass(typing.cast(type, self.elements), Hyperparams) + + if not isinstance(self.elements, Hyperparameter) and not self.is_configuration: + raise exceptions.InvalidArgumentTypeError("'elements' argument is not an instance of the Hyperparameter class or a subclass of the Hyperparams class.") + + if not isinstance(self.min_size, int): + raise exceptions.InvalidArgumentTypeError("'min_size' argument is not an int.") + if self.min_size < 0: + raise exceptions.InvalidArgumentValueError("'min_size' cannot be smaller than 0.") + if self.max_size is not None: + if not isinstance(self.max_size, int): + raise exceptions.InvalidArgumentTypeError("'max_size' argument is not an int.") + if self.min_size > self.max_size: + raise exceptions.InvalidArgumentValueError("'min_size' cannot be larger than 'max_size'.") + + # If subclass has not already set it. + if not hasattr(self, 'structural_type'): + structural_type = _get_structural_type_argument(self, S) # type: ignore + + if structural_type == typing.Any: + if self.is_configuration: + structural_type = typing.Sequence[self.elements] # type: ignore + else: + structural_type = typing.Sequence[elements.structural_type] # type: ignore + + self.structural_type = structural_type + + if not utils.is_subclass(self.structural_type, typing.Sequence): + raise exceptions.InvalidArgumentTypeError("Structural type is not a subclass of a sequence.") + + elements_type = utils.get_type_arguments(self.structural_type)[typing.T_co] # type: ignore + if self.is_configuration: + if elements_type is not self.elements: + raise exceptions.InvalidArgumentTypeError("Structural type does not match hyper-parameters configuration type.") + else: + if elements_type is not elements.structural_type: + raise exceptions.InvalidArgumentTypeError("Structural type does not match elements hyper-parameter's structural type.") + + # Default value is checked by parent class calling "validate". + + super().__init__(default, semantic_types=semantic_types, description=description) + + # We go over the hyper-parameters configuration and set their names. This means that names should not already + # be set. This is by default so if "Hyperparams.define" is used, but if one defines a custom class, + # you have to define it like "class MyHyperparams(Hyperparams, set_names=False): ..." + def contribute_to_class(self, name: str) -> None: + super().contribute_to_class(name) + + if self.is_configuration: + for hyperparameter_name, hyperparameter in typing.cast(typing.Type[Hyperparams], self.elements).configuration.items(): + hyperparameter.contribute_to_class('{name}.{hyperparameter_name}'.format(name=self.name, hyperparameter_name=hyperparameter_name)) + else: + self.elements.contribute_to_class('{name}.elements'.format(name=self.name)) + + def get_default(self, path: str = None) -> typing.Any: + # If "path" is "None" we want to return what was set as a default for this hyper-parameter + # which might be different than hyper-parameters configuration defaults. 
+ if path is None or not self.is_configuration: + return super().get_default(path) + else: + return typing.cast(Hyperparams, self.elements).defaults(path) + + def validate(self, value: S) -> None: + # Check that value belongs to the structural type. + super().validate(value) + + cast_value = typing.cast(typing.Sequence, value) + + for v in cast_value: + self.elements.validate(v) + + if not self.min_size <= len(cast_value): + raise exceptions.InvalidArgumentValueError("Value '{value}' {for_name}has less than {min_size} elements.".format(value=value, for_name=self._for_name(), min_size=self.min_size)) + if self.max_size is not None and not len(cast_value) <= self.max_size: + raise exceptions.InvalidArgumentValueError("Value '{value}' {for_name}has more than {max_size} elements.".format(value=value, for_name=self._for_name(), max_size=self.max_size)) + + @abc.abstractmethod + def sample(self, random_state: RandomState = None) -> S: + pass + + @abc.abstractmethod + def get_max_samples(self) -> typing.Optional[int]: + pass + + @abc.abstractmethod + def sample_multiple(self, min_samples: int = 0, max_samples: int = None, random_state: RandomState = None, *, with_replacement: bool = False) -> typing.Sequence[S]: + pass + + def __repr__(self) -> str: + return '{class_name}(elements={elements}, default={default}, min_size={min_size}, max_size={max_size})'.format( + class_name=type(self).__name__, + elements=self.elements, + default=self.get_default(), + min_size=self.min_size, + max_size=self.max_size, + ) + + @functools.lru_cache() + def to_simple_structure(self) -> typing.Dict: # type: ignore + structure = super().to_simple_structure() + structure.update({ + 'elements': self.elements.to_simple_structure(), + 'is_configuration': self.is_configuration, + 'min_size': self.min_size, + }) + + if self.max_size is not None: + structure['max_size'] = self.max_size + + return structure + + @deprecate.function(message="use value_to_json_structure method instead") + def value_to_json(self, value: S) -> typing.Any: + return self.value_to_json_structure(value) + + def value_to_json_structure(self, value: S) -> typing.Any: + self.validate(value) + + if self.is_configuration: + return [typing.cast(typing.Type[Hyperparams], self.elements)(v).values_to_json_structure() for v in typing.cast(typing.Sequence, value)] + else: + return [self.elements.value_to_json_structure(v) for v in typing.cast(typing.Sequence, value)] + + @deprecate.function(message="use value_from_json_structure method instead") + def value_from_json(self, json: typing.Any) -> S: + return self.value_from_json_structure(json) + + def value_from_json_structure(self, json: typing.Any) -> S: + if self.is_configuration: + value = typing.cast(S, tuple(typing.cast(typing.Type[Hyperparams], self.elements).values_from_json_structure(j) for j in json)) + else: + value = typing.cast(S, tuple(self.elements.value_from_json_structure(j) for j in json)) + + self.validate(value) + + return value + + def traverse(self) -> 'typing.Iterator[Hyperparameter]': + yield from super().traverse() + + if self.is_configuration: + yield from self.elements.traverse() + else: + yield self.elements + + def transform_value(self, value: S, transform: typing.Callable, index: int = 0) -> S: + cast_value = typing.cast(typing.Sequence, value) + + # We assume here that we can make a new instance of the sequence-type used + # for "value" by providing an iterator of new values to its constructor. + # This works for tuples which we are using by default to represent a set. 
+ return type(value)(self.elements.transform_value(v, transform, index + i) for i, v in enumerate(cast_value)) # type: ignore + + def can_accept_value_type(self, structural_type: typing.Union[type, typing.List[type]]) -> bool: + if not isinstance(structural_type, typing.List): + # For parent method to return "False" because for "Set" hyper-parameter it has to be a list of types. + return super().can_accept_value_type(structural_type) + + if not self.min_size <= len(structural_type): + return False + if self.max_size is not None and not len(structural_type) <= self.max_size: + return False + + for st in structural_type: + if not self.elements.can_accept_value_type(st): + return False + + return True + + +class Set(_Sequence[S]): + """ + A set hyper-parameter which samples without replacement multiple times another hyper-parameter or hyper-parameters configuration. + + This is useful when a primitive is interested in more than one value of a hyper-parameter or hyper-parameters configuration. + + Values are represented as tuples of unique elements. The order of elements does not matter (two different orders of same + elements represent the same value), but order is meaningful and preserved to assure reproducibility. + + Type variable ``S`` does not have to be specified because the structural type + is a set from provided elements. + """ + + def validate(self, value: S) -> None: + super().validate(value) + + cast_value = typing.cast(typing.Sequence, value) + + if utils.has_duplicates(cast_value): + raise exceptions.InvalidArgumentValueError("Value '{value}' {for_name}has duplicate elements.".format(value=value, for_name=self._for_name())) + + def sample(self, random_state: RandomState = None) -> S: + """ + Samples a random value from the hyper-parameter search space. + + It first randomly chooses the size of the resulting sampled set + and then samples this number of unique elements. + + Parameters + ---------- + random_state: + A random seed or state to be used when sampling. + + Returns + ------- + A sampled value. + """ + + elements_max_samples = self.elements.get_max_samples() + if elements_max_samples is not None and elements_max_samples < self.min_size: + utils.log_once( + logger, + logging.WARNING, + "Elements hyper-parameter for hyper-parameter '%(name)s' cannot provide enough samples " + "(maximum %(elements_max_samples)s) to sample a set of at least %(min_size)s elements. Using a default value.", + {'name': self.name, 'elements_max_samples': elements_max_samples, 'min_size': self.min_size}, + stack_info=True, + ) + + return self.get_default() + + return self.elements.sample_multiple(min_samples=self.min_size, max_samples=self.max_size, random_state=random_state, with_replacement=False) # type: ignore + + @functools.lru_cache() + def get_max_samples(self) -> typing.Optional[int]: # type: ignore + max_samples = self.elements.get_max_samples() + if max_samples is None: + return None + elif max_samples < self.min_size: + # Theoretically this would be 0, but we sample with default value in this case. 
+ return 1 + elif self.max_size is None: + return 2 ** max_samples - sum(scipy_special.comb(max_samples, j, exact=True) for j in range(self.min_size)) + else: + return sum(scipy_special.comb(max_samples, k, exact=True) for k in range(self.min_size, self.max_size + 1)) + + def sample_multiple(self, min_samples: int = 0, max_samples: int = None, random_state: RandomState = None, *, with_replacement: bool = False) -> typing.Sequence[S]: + """ + Samples multiple random values from the hyper-parameter search space. At least ``min_samples`` + of them, and at most ``max_samples``. + + Parameters + ---------- + min_samples: + A minimum number of samples to return. + max_samples: + A maximum number of samples to return. + random_state: + A random seed or state to be used when sampling. + with_replacement: + Are we sampling with replacement or without? + + Returns + ------- + A set (represented as a tuple) of multiple sampled values. + """ + + min_samples, max_samples = self._check_sample_size(min_samples, max_samples, with_replacement) + + random_state = sklearn_validation.check_random_state(random_state) + + size = random_state.randint(min_samples, max_samples + 1) + + if with_replacement: + sample_list: list = [self.sample(random_state) for i in range(size)] + else: + sample_set: set = set() + sample_list = [] + while len(sample_list) != size: + value = self.sample(random_state) + value_set: frozenset = frozenset(value) + if value_set not in sample_set: + sample_set.add(value_set) + sample_list.append(value) + + return tuple(sample_list) + + +class SortedSet(Set[S]): + """ + Similar to `Set` hyper-parameter, but elements of values are required to be sorted from smallest to largest, by default. + + Hyper-parameters configuration as elements is not supported. + + Attributes + ---------- + ascending: + Are values required to be sorted from smallest to largest (``True``) or the opposite (``False``). + """ + + ascending: bool + + def __init__( + self, elements: Hyperparameter, default: S, min_size: int = 0, max_size: int = None, *, + ascending: bool = True, semantic_types: typing.Sequence[str] = None, description: str = None, + ) -> None: + self.ascending = ascending + + if self.ascending: + self._compare = operator.lt + else: + self._compare = operator.gt + + super().__init__(elements, default, min_size, max_size, semantic_types=semantic_types, description=description) + + if self.is_configuration: + raise exceptions.NotSupportedError("Hyper-parameters configuration as elements is not supported.") + + def validate(self, value: S) -> None: + super().validate(value) + + if not all(self._compare(a, b) for a, b in zip(value, value[1:])): # type: ignore + raise exceptions.InvalidArgumentValueError("Value '{value}' {for_name}is not sorted.".format(value=value, for_name=self._for_name())) + + def sample(self, random_state: RandomState = None) -> S: + values = super().sample(random_state) + return type(values)(sorted(values, reverse=not self.ascending)) + + def to_simple_structure(self) -> typing.Dict: # type: ignore + structure = super().to_simple_structure() + structure['ascending'] = self.ascending + del structure['is_configuration'] + return structure + + +class List(_Sequence[S]): + """ + A list hyper-parameter which samples with replacement multiple times another hyper-parameter or hyper-parameters configuration. + + This is useful when a primitive is interested in more than one value of a hyper-parameter or hyper-parameters configuration. + + Values are represented as tuples of elements. 
The order of elements matters and is preserved but is not prescribed. + + Type variable ``S`` does not have to be specified because the structural type + is a set from provided elements. + """ + + def sample(self, random_state: RandomState = None) -> S: + """ + Samples a random value from the hyper-parameter search space. + + It first randomly chooses the size of the resulting sampled list + and then samples this number of elements. + + Parameters + ---------- + random_state: + A random seed or state to be used when sampling. + + Returns + ------- + A sampled value. + """ + + if self.max_size is None: + utils.log_once( + logger, + logging.WARNING, + "Sampling an unlimited list hyper-parameter '%(name)s'. Using a default value.", + {'name': self.name}, + stack_info=True, + ) + + return self.get_default() + + return self.elements.sample_multiple(min_samples=self.min_size, max_samples=self.max_size, random_state=random_state, with_replacement=True) # type: ignore + + @functools.lru_cache() + def get_max_samples(self) -> typing.Optional[int]: # type: ignore + max_samples = self.elements.get_max_samples() + if max_samples is None: + return None + elif self.max_size is None: + # Theoretically this would be "None", but we sample with default value in this case. + return 1 + # Equal to: sum(max_samples ** k for k in range(self.min_size, self.max_size + 1)) + else: + if max_samples == 0: + return 0 + elif max_samples == 1: + return self.max_size - self.min_size + 1 + else: + return (max_samples ** self.min_size) * (max_samples ** (self.max_size - self.min_size + 1) - 1) / (max_samples - 1) + + def sample_multiple(self, min_samples: int = 0, max_samples: int = None, random_state: RandomState = None, *, with_replacement: bool = False) -> typing.Sequence[S]: + """ + Samples multiple random values from the hyper-parameter search space. At least ``min_samples`` + of them, and at most ``max_samples``. + + Parameters + ---------- + min_samples: + A minimum number of samples to return. + max_samples: + A maximum number of samples to return. + random_state: + A random seed or state to be used when sampling. + with_replacement: + Are we sampling with replacement or without? + + Returns + ------- + A list (represented as a tuple) of multiple sampled values. + """ + + min_samples, max_samples = self._check_sample_size(min_samples, max_samples, with_replacement) + + random_state = sklearn_validation.check_random_state(random_state) + + size = random_state.randint(min_samples, max_samples + 1) + + if with_replacement: + sample_list: list = [self.sample(random_state) for i in range(size)] + else: + sample_set: set = set() + sample_list = [] + while len(sample_list) != size: + value = self.sample(random_state) + if value not in sample_set: + sample_set.add(value) + sample_list.append(value) + + return tuple(sample_list) + + +class SortedList(List[S]): + """ + Similar to `List` hyper-parameter, but elements of values are required to be sorted from smallest to largest, by default. + + Hyper-parameters configuration as elements is not supported. + + Attributes + ---------- + ascending: + Are values required to be sorted from smallest to largest (``True``) or the opposite (``False``). 
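A small sketch contrasting `Set` and `List` (assuming `UniformInt`, which the comments above reference, takes `(lower, upper, default)`; all names and bounds here are illustrative):
```
from d3m.metadata import hyperparams

# A set of between 1 and 3 distinct integers drawn from [0, 10).
features = hyperparams.Set(
    elements=hyperparams.UniformInt(0, 10, 0),
    default=(0,),
    min_size=1,
    max_size=3,
)

print(features.sample(random_state=42))   # a tuple of 1-3 unique integers
features.validate((1, 5, 9))              # passes
# features.validate((1, 1))               # would raise: duplicate elements

# A List samples the same elements *with* replacement, so repeats are allowed.
layers = hyperparams.List(
    elements=hyperparams.UniformInt(16, 256, 64),
    default=(64, 64),
    min_size=1,
    max_size=4,
)
print(layers.sample(random_state=0))      # order preserved, repeats possible
```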
+ """ + + ascending: bool + + def __init__( + self, elements: Hyperparameter, default: S, min_size: int = 0, max_size: int = None, *, + ascending: bool = True, semantic_types: typing.Sequence[str] = None, description: str = None, + ) -> None: + self.ascending = ascending + + if self.ascending: + self._compare = operator.le + else: + self._compare = operator.ge + + super().__init__(elements, default, min_size, max_size, semantic_types=semantic_types, description=description) + + if self.is_configuration: + raise exceptions.NotSupportedError("Hyper-parameters configuration as elements is not supported.") + + def validate(self, value: S) -> None: + super().validate(value) + + if not all(self._compare(a, b) for a, b in zip(value, value[1:])): # type: ignore + raise exceptions.InvalidArgumentValueError("Value '{value}' {for_name}is not sorted.".format(value=value, for_name=self._for_name())) + + def sample(self, random_state: RandomState = None) -> S: + values = super().sample(random_state) + return type(values)(sorted(values, reverse=not self.ascending)) + + @functools.lru_cache() + def get_max_samples(self) -> typing.Optional[int]: # type: ignore + max_samples = self.elements.get_max_samples() + if max_samples is None: + return None + elif self.max_size is None: + return None + else: + return sum(scipy_special.comb(max_samples + k - 1, k, exact=True) for k in range(self.min_size, self.max_size + 1)) + + def to_simple_structure(self) -> typing.Dict: # type: ignore + structure = super().to_simple_structure() + structure['ascending'] = self.ascending + del structure['is_configuration'] + return structure + + +class HyperparamsMeta(utils.AbstractMetaclass): + """ + A metaclass which provides the hyper-parameter description its name. + """ + + def __new__(mcls, class_name, bases, namespace, set_names=True, **kwargs): # type: ignore + # This should run only on subclasses of the "Hyperparams" class. + if bases != (dict,): + # Hyper-parameters configuration should be deterministic, so order matters. + configuration = collections.OrderedDict() + + # Create a (mutable) copy and don't modify the input argument. + namespace = collections.OrderedDict(namespace) + + # We traverse parent classes in order to keep hyper-parameters configuration deterministic. + for parent_class in bases: + # Using "isinstance" and not "issubclass" because we are comparing against a metaclass. + if isinstance(parent_class, mcls): + configuration.update(parent_class.configuration) + + for name, value in namespace.items(): + if name.startswith('_'): + continue + + if isinstance(value, Hyperparameter): + if name in base.STANDARD_PIPELINE_ARGUMENTS or name in base.STANDARD_RUNTIME_ARGUMENTS: + raise ValueError("Hyper-parameter name '{name}' is reserved because it is used as an argument in primitive interfaces.".format( + name=name, + )) + + if not HYPERPARAMETER_NAME_REGEX.match(name): + raise ValueError("Hyper-parameter name '{name}' contains invalid characters.".format( + name=name, + )) + + if set_names: + value.contribute_to_class(name) + + configuration[name] = value + + if isinstance(value, tuple) and len(value) == 1 and isinstance(value[0], Hyperparameter): + logger.warning("Probably invalid definition of a hyper-parameter. Hyper-parameter should be defined as class attribute without a trailing comma.", stack_info=True) + + for name in configuration.keys(): + # "name" might came from a parent class, but if not, then remove it + # from the namespace of the class we are creating. 
+ if name in namespace: + del namespace[name] + + namespace['configuration'] = frozendict.FrozenOrderedDict(configuration) + + return super().__new__(mcls, class_name, bases, namespace, **kwargs) + + def __repr__(self): # type: ignore + return ''.format( + module=self.__module__, + class_name=self.__name__, + configuration=', '.join('{name}: {hyperparameter}'.format(name=name, hyperparameter=hyperparameter) for name, hyperparameter in self.configuration.items()), + ) + + def __setattr__(self, key, value): # type: ignore + if key == 'configuration': + raise AttributeError("Hyper-parameters configuration is immutable.") + + super().__setattr__(key, value) + + +H = typing.TypeVar('H', bound='Hyperparams') + + +class Hyperparams(dict, metaclass=HyperparamsMeta): + """ + A base class to be subclassed and used as a type for ``Hyperparams`` + type argument in primitive interfaces. An instance of this subclass + is passed as a ``hyperparams`` argument to primitive's constructor. + + You should subclass the class and configure class attributes to + hyper-parameters you want. They will be extracted out and put into + the ``configuration`` attribute. They have to be an instance of the + `Hyperparameter` class for this to happen. + + You can define additional methods and attributes on the class. + Prefix them with `_` to not conflict with future standard ones. + + When creating an instance of the class, all hyper-parameters have + to be provided. Default values have to be explicitly passed. + + Attributes + ---------- + configuration: + A hyper-parameters configuration. + """ + + configuration: typing.ClassVar[frozendict.FrozenOrderedDict] = frozendict.FrozenOrderedDict() + + def __init__(self, *args: typing.Any, **kwargs: typing.Any) -> None: + values = dict(*args, **kwargs) + + self.validate(values) + + super().__init__(values) + + self._hash: int = None + + @classmethod + def sample(cls: typing.Type[H], random_state: RandomState = None) -> H: + """ + Returns a hyper-parameters sample with all values sampled from their hyper-parameter configurations. + + Parameters + ---------- + random_state: + A random seed or state to be used when sampling. + + Returns + ------- + An instance of hyper-parameters. + """ + random_state = sklearn_validation.check_random_state(random_state) + + values = {} + + for name, hyperparameter in cls.configuration.items(): + values[name] = hyperparameter.sample(random_state) + + return cls(values) + + @classmethod + def get_max_samples(cls) -> typing.Optional[int]: + hyperparams_max_samples = 1 + for hyperparameter in cls.configuration.values(): + hyperparameter_max_samples = hyperparameter.get_max_samples() + if hyperparameter_max_samples is None: + return None + else: + # TODO: Assumption here is that hyper-parameters are independent. What when we will support dependencies? 
+ # See: https://gitlab.com/datadrivendiscovery/d3m/issues/46 + hyperparams_max_samples *= hyperparameter_max_samples + return hyperparams_max_samples + + @classmethod + def _check_sample_size(cls, min_samples: int, max_samples: typing.Optional[int], with_replacement: bool) -> typing.Tuple[int, int]: + return check_sample_size(cls, min_samples, max_samples, with_replacement) + + @classmethod + def sample_multiple(cls: typing.Type[H], min_samples: int = 0, max_samples: int = None, random_state: RandomState = None, *, with_replacement: bool = False) -> typing.Sequence[H]: + min_samples, max_samples = cls._check_sample_size(min_samples, max_samples, with_replacement) + + random_state = sklearn_validation.check_random_state(random_state) + + size = random_state.randint(min_samples, max_samples + 1) + + if with_replacement: + sample_list: list = [cls.sample(random_state) for i in range(size)] + else: + sample_set: set = set() + sample_list = [] + while len(sample_list) != size: + value = cls.sample(random_state) + if value not in sample_set: + sample_set.add(value) + sample_list.append(value) + + return tuple(sample_list) + + @classmethod + def defaults(cls: typing.Type[H], path: str = None) -> typing.Any: + """ + Returns a hyper-parameters sample with all values set to defaults. + + Parameters + ---------- + path: + An optional path to get defaults for. It can contain ``.`` to represent + a path through nested hyper-parameters. + + Returns + ------- + An instance of hyper-parameters or a default value of a hyper-parameter under ``path``. + """ + + if path is None: + values = {} + + for name, hyperparameter in cls.configuration.items(): + values[name] = hyperparameter.get_default() + + return cls(values) + + else: + if '.' not in path: + return cls.configuration[path].get_default() + else: + segment, rest = path.split('.', 1) + return cls.configuration[segment].get_default(rest) + + @classmethod + def validate(cls, values: dict) -> None: + configuration_keys = set(cls.configuration.keys()) + values_keys = set(values.keys()) + + missing = configuration_keys - values_keys + if len(missing): + raise exceptions.InvalidArgumentValueError("Not all hyper-parameters are specified: {missing}".format(missing=missing)) + + extra = values_keys - configuration_keys + if len(extra): + raise exceptions.InvalidArgumentValueError("Additional hyper-parameters are specified: {extra}".format(extra=extra)) + + for name, value in values.items(): + cls.configuration[name].validate(value) + + @classmethod + @functools.lru_cache() + def to_simple_structure(cls) -> typing.Dict: + """ + Converts the hyper-parameters configuration to a simple structure, similar to JSON, but with values + left as Python values. + + Returns + ------- + A dict. + """ + + return {name: hyperparameter.to_simple_structure() for name, hyperparameter in cls.configuration.items()} + + @classmethod + def define(cls: typing.Type[H], configuration: 'collections.OrderedDict[str, Hyperparameter]', *, + class_name: str = None, module_name: str = None, set_names: bool = False) -> typing.Type[H]: + """ + Define dynamically a subclass of this class using ``configuration`` and optional + ``class_name`` and ``module_name``. + + This is equivalent of defining a class statically in Python. ``configuration`` is what + you would otherwise provide through class attributes. + + Parameters + ---------- + configuration: + A hyper-parameters configuration. + class_name: + Class name of the subclass. + module_name: + Module name of the subclass. 
+ set_names: + Should all hyper-parameters defined have their names set. By default ``False``. + This is different from when defining a static subclass, where the default is ``True`` + and names are set by the default. + + Returns + ------- + A subclass itself. + """ + + # Create a (mutable) copy and don't modify the input argument. + namespace: typing.Dict[str, typing.Any] = collections.OrderedDict(configuration) + + if class_name is None: + # We want automatically generated class names to be unique. + class_name = '{name}{id}'.format(name=cls.__name__, id=id(configuration)) + + if module_name is None: + frame = inspect.currentframe() + if frame is not None and frame.f_back is not None: + module_name = frame.f_back.f_globals['__name__'] + + if module_name is not None: + namespace['__module__'] = module_name + + return types.new_class(class_name, (cls,), {'set_names': set_names}, lambda ns: ns.update(namespace)) + + def values_to_json_structure(self) -> typing.Dict[str, typing.Dict]: + """ + Converts hyper-parameter values to a JSON-compatible structure. + + Returns + ------- + A JSON-compatible dict. + """ + + return {name: self.configuration[name].value_to_json_structure(value) for name, value in self.items()} + + @classmethod + def values_from_json_structure(cls: typing.Type[H], json: typing.Dict[str, typing.Dict]) -> H: + """ + Converts given JSON-compatible structure to an instance of this class with values + from the structure. + + Parameters + ---------- + json: + A JSON-compatible dict. + + Returns + ------- + An instance of this class with values from ``json`` argument. + """ + + return cls({name: cls.configuration[name].value_from_json_structure(value) for name, value in json.items()}) + + @classmethod + def traverse(cls) -> typing.Iterator[Hyperparameter]: + """ + Traverse over all hyper-parameters used in this hyper-parameters configuration. + + Yields + ------ + Hyperparamater + The next hyper-parameter used in this hyper-parameters configuration. + """ + + for hyperparameter in cls.configuration.values(): + yield hyperparameter + yield from hyperparameter.traverse() + + @classmethod + def transform_value(cls: typing.Type[H], values: dict, transform: typing.Callable, index: int = 0) -> H: + transformed_values = {} + for i, name in enumerate(sorted(values.keys())): + transformed_values[name] = cls.configuration[name].transform_value(values[name], transform, index + i) + + return cls(transformed_values) + + @classmethod + def can_accept_value_type(cls, structural_type: typing.Union[type, typing.List[type]]) -> bool: + if structural_type is typing.Any: + return True + elif isinstance(structural_type, typing.List): + # We do not support a list of types. This is used for "Set" hyper-parameter. + return False + else: + return utils.is_subclass(structural_type, cls) + + def replace(self: H, values: typing.Dict[str, typing.Any]) -> H: + """ + Creates a copy of hyper-parameters with values replaced with values from ``values``. + + This is equivalent of doing ``Hyperparams(hyperparams, **values)``. + + Parameters + ---------- + values: + Map between keys and values to replace. + + Returns + ------- + A copy of the object with replaced values. 
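Putting the `Hyperparams` API together, a hypothetical configuration might be defined and used like this (names and defaults are illustrative; both the static form and the dynamic `define` form are sketched):
```
import collections

from d3m.metadata import hyperparams

# Static definition: hyper-parameters are class attributes and are collected into
# the ``configuration`` attribute by the metaclass.
class MyHyperparams(hyperparams.Hyperparams):
    alpha = hyperparams.Hyperparameter[float](0.1)
    max_depth = hyperparams.Hyperparameter[int](10)

defaults = MyHyperparams.defaults()
print(dict(defaults))                      # {'alpha': 0.1, 'max_depth': 10}

tuned = defaults.replace({'max_depth': 50})
print(tuned.values_to_json_structure())    # {'alpha': 0.1, 'max_depth': 50}

# The same configuration defined dynamically, e.g. inside a wrapper generator.
MyHyperparams2 = hyperparams.Hyperparams.define(collections.OrderedDict(
    alpha=hyperparams.Hyperparameter[float](0.1),
    max_depth=hyperparams.Hyperparameter[int](10),
))
assert dict(MyHyperparams2.defaults()) == dict(defaults)
```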
+ """ + + return type(self)(self, **values) + + def __setitem__(self, key, value): # type: ignore + raise TypeError("Hyper-parameters are immutable.") + + def __delitem__(self, key): # type: ignore + raise TypeError("Hyper-parameters are immutable.") + + def clear(self): # type: ignore + raise TypeError("Hyper-parameters are immutable.") + + def pop(self, key, default=None): # type: ignore + raise TypeError("Hyper-parameters are immutable.") + + def popitem(self): # type: ignore + raise TypeError("Hyper-parameters are immutable.") + + def setdefault(self, key, default=None): # type: ignore + raise TypeError("Hyper-parameters are immutable.") + + def update(self, *args, **kwargs): # type: ignore + raise TypeError("Hyper-parameters are immutable.") + + def __repr__(self) -> str: + return '{class_name}({super})'.format(class_name=type(self).__name__, super=super().__repr__()) + + def __getstate__(self) -> dict: + return dict(self) + + def __setstate__(self, state: dict) -> None: + self.__init__(state) # type: ignore + + # In the past, we had to implement our own __reduce__ method because dict is otherwise pickled + # using a built-in implementation which does not call "__getstate__". But now we use it also + # to handle the case of classes defined using "define". + def __reduce__(self) -> typing.Tuple[typing.Callable, typing.Tuple, dict]: + # If class has been defined at the global scope of a module, we can use regular pickling approach. + if _is_defined_at_global_scope(self.__class__): + return __newobj__, (self.__class__,), self.__getstate__() + + base_cls = None + define_args_list: typing.List[typing.Dict[str, typing.Any]] = [] + for cls in inspect.getmro(self.__class__): + if _is_defined_at_global_scope(cls): + base_cls = cls + break + + if not issubclass(cls, Hyperparams): + raise pickle.PickleError("Class is not a subclass of \"Hyperparams\" class.") + + if set(cls.__dict__.keys()) - DEFAULT_HYPERPARAMS_CLASS_ATTRIBUTES: + raise pickle.PickleError("A class with custom attributes not defined at a global scope.") + + cls = typing.cast(typing.Type[Hyperparams], cls) + + define_args_list.insert(0, { + 'configuration': cls.configuration, + 'class_name': getattr(cls, '__name__', None), + 'module_name': getattr(cls, '__module__', None), + }) + + if base_cls is None: + raise pickle.PickleError("Cannot find a base class defined at a global scope.") + + if not issubclass(base_cls, Hyperparams): + raise pickle.PickleError("Found base class is not a subclass of \"Hyperparams\" class.") + + return _recreate_hyperparams_class, (base_cls, define_args_list), self.__getstate__() + + # It is immutable, so hash can be defined. + def __hash__(self) -> int: + if self._hash is None: + h = 0 + for key, value in self.items(): + h ^= hash((key, value)) + self._hash = h + return self._hash + + +# This is defined here so that we compute it only once. 
+DEFAULT_HYPERPARAMS_CLASS_ATTRIBUTES = set(Hyperparams.define(collections.OrderedDict()).__dict__.keys()) diff --git a/d3m/d3m/metadata/params.py b/d3m/d3m/metadata/params.py new file mode 100644 index 0000000..5af2bce --- /dev/null +++ b/d3m/d3m/metadata/params.py @@ -0,0 +1,138 @@ +import typing + +from d3m import exceptions, utils + + +class ParamsMeta(utils.AbstractMetaclass): + def __new__(mcls, class_name, bases, namespace, **kwargs): # type: ignore + for name, value in namespace.items(): + if name.startswith('_'): + continue + + if utils.is_class_method_on_class(value) or utils.is_instance_method_on_class(value): + continue + + raise TypeError("Only methods and attribute type annotations can be defined on Params class, not '{name}'.".format(name=name)) + + class_params_items = {} + class_annotations = namespace.get('__annotations__', {}) + + for name, value in class_annotations.items(): + value = typing._type_check(value, "Each annotation must be a type.") + + if name in namespace: + # Just update the annotation. + class_annotations[name] = value + else: + # Extract annotation out. + class_params_items[name] = value + + for name in class_params_items.keys(): + del class_annotations[name] + + # Set back updated annotations. + namespace['__annotations__'] = class_annotations + + params_items = {} + + for base in reversed(bases): + params_items.update(base.__dict__.get('__params_items__', {})) + + params_items.update(class_params_items) + + namespace['__params_items__'] = params_items + + return super().__new__(mcls, class_name, bases, namespace, **kwargs) + + +class Params(dict, metaclass=ParamsMeta): + """ + A base class to be subclassed and used as a type for ``Params`` type + argument in primitive interfaces. An instance of this subclass should + be returned from primitive's ``get_params`` method, and accepted in + ``set_params``. + + You should subclass the class and set type annotations on class attributes + for params available in the class. + + When creating an instance of the class, all parameters have to be provided. 
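A minimal sketch of such a subclass (the parameter names and types are illustrative only):
```
import typing

from d3m.metadata import params

# Parameters a fitted primitive might want to expose.
class Params(params.Params):
    components: typing.Optional[int]
    fitted: bool

p = Params(components=3, fitted=True)
p['fitted'] = False                 # assignments are type-checked against the annotation
# Params(fitted=True)               # would raise: parameter 'components' is missing
# p['extra'] = 1                    # would raise: additional parameter is specified
```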
+ """ + + def __init__(self, other: typing.Dict[str, typing.Any] = None, **values: typing.Any) -> None: + if other is None: + other = {} + + values = dict(other, **values) + + params_keys = set(self.__params_items__.keys()) # type: ignore + values_keys = set(values.keys()) + + missing = params_keys - values_keys + if len(missing): + raise exceptions.InvalidArgumentValueError("Not all parameters are specified: {missing}".format(missing=missing)) + + extra = values_keys - params_keys + if len(extra): + raise exceptions.InvalidArgumentValueError("Additional parameters are specified: {extra}".format(extra=extra)) + + for name, value in values.items(): + value_type = self.__params_items__[name] # type: ignore + if not utils.is_instance(value, value_type): + raise exceptions.InvalidArgumentTypeError("Value '{value}' for parameter '{name}' is not an instance of the type: {value_type}".format(value=value, name=name, value_type=value_type)) + + super().__init__(values) + + def __setitem__(self, key, value): # type: ignore + if key not in self.__params_items__: + raise ValueError("Additional parameter is specified: {key}".format(key=key)) + + value_type = self.__params_items__[key] + if not utils.is_instance(value, value_type): + raise TypeError("Value '{value}' for parameter '{name}' is not an instance of the type: {value_type}".format(value=value, name=key, value_type=value_type)) + + return super().__setitem__(key, value) + + def __delitem__(self, key): # type: ignore + raise AttributeError("You cannot delete parameters.") + + def clear(self): # type: ignore + raise AttributeError("You cannot delete parameters.") + + def pop(self, key, default=None): # type: ignore + raise AttributeError("You cannot delete parameters.") + + def popitem(self): # type: ignore + raise AttributeError("You cannot delete parameters.") + + def setdefault(self, key, default=None): # type: ignore + if key not in self.__params_items__: + raise ValueError("Additional parameter is specified: {key}".format(key=key)) + + default_type = self.__params_items__[key] + if not utils.is_instance(default, default_type): + raise TypeError("Value '{value}' for parameter '{name}' is not an instance of the type: {value_type}".format(value=default, name=key, value_type=default_type)) + + return super().setdefault(key, default) + + def update(self, other: typing.Dict[str, typing.Any] = None, **values: typing.Any) -> None: # type: ignore + if other is None: + other = {} + + values = dict(other, **values) + + params_keys = set(self.__params_items__.keys()) # type: ignore + values_keys = set(values.keys()) + + extra = values_keys - params_keys + if len(extra): + raise ValueError("Additional parameters are specified: {extra}".format(extra=extra)) + + for name, value in values.items(): + value_type = self.__params_items__[name] # type: ignore + if not utils.is_instance(value, value_type): + raise TypeError("Value '{value}' for parameter '{name}' is not an instance of the type: {value_type}".format(value=value, name=name, value_type=value_type)) + + super().update(values) + + def __repr__(self) -> str: + return '{class_name}({super})'.format(class_name=type(self).__name__, super=super().__repr__()) diff --git a/d3m/d3m/metadata/pipeline.py b/d3m/d3m/metadata/pipeline.py new file mode 100644 index 0000000..78cc286 --- /dev/null +++ b/d3m/d3m/metadata/pipeline.py @@ -0,0 +1,2970 @@ +import abc +import argparse +import collections +import copy +import datetime +import json +import logging +import os +import os.path +import pprint +import sys +import 
traceback +import typing +import uuid + +import dateparser # type: ignore + +from d3m import container, deprecate, environment_variables, exceptions, index, utils +from d3m.primitive_interfaces import base +from . import base as metadata_base, hyperparams as hyperparams_module + +# See: https://gitlab.com/datadrivendiscovery/d3m/issues/66 +try: + from pyarrow import lib as pyarrow_lib # type: ignore +except ModuleNotFoundError: + pyarrow_lib = None + +__all__ = ( + 'Pipeline', 'Resolver', 'NoResolver', 'PrimitiveStep', 'SubpipelineStep', 'PlaceholderStep', +) + +logger = logging.getLogger(__name__) + +# Comma because we unpack the list of validators returned from "load_schema_validators". +PIPELINE_SCHEMA_VALIDATOR, = utils.load_schema_validators(metadata_base.SCHEMAS, ('pipeline.json',)) + +PIPELINE_SCHEMA_VERSION = 'https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json' + +CONTROL_HYPERPARAMETER_SEMANTIC_TYPE = 'https://metadata.datadrivendiscovery.org/types/ControlParameter' + + +class TypeInfo(typing.NamedTuple): + structural_type: type + singleton: typing.Optional[bool] + + +class Resolver: + """ + A resolver to resolve primitives and pipelines. + + It resolves primitives from available primitives on the system, + and resolves pipelines from files in pipeline search paths. + + Attributes + ---------- + strict_resolving: + If resolved pipeline or primitive does not fully match specified primitive reference, raise an exception? + strict_digest: + When loading pipelines or primitives, if computed digest does not match the one provided in metadata, raise an exception? + pipeline_search_paths: + A list of paths to directories with pipelines to resolve from. + Their files should be named ``.json``, ``.yml``, or ``.yaml``. + + Parameters + ---------- + strict_resolving: + If resolved pipeline or primitive does not fully match specified primitive reference, raise an exception? + strict_digest: + When loading pipelines or primitives, if computed digest does not match the one provided in metadata, raise an exception? + pipeline_search_paths: + A list of paths to directories with pipelines to resolve from. + Their files should be named ``.json``, ``.yml``, or ``.yaml``. + respect_environment_variable: + Use also (colon separated) pipeline search paths from ``PIPELINES_PATH`` environment variable? + load_all_primitives: + Load all primitives before attempting to resolve them. If ``False`` any primitive used in a + pipeline has to be loaded before calling the resolver. + primitives_blocklist: + A collection of primitive path prefixes to not (try to) load. 
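As a usage sketch (the search directory and pipeline ID below are hypothetical), a resolver might be constructed and asked for a pipeline by ID:
```
from d3m.metadata import pipeline as pipeline_module

# Illustrative only: the directory and the pipeline ID are placeholders.
resolver = pipeline_module.Resolver(
    strict_digest=True,
    pipeline_search_paths=['pipelines/'],
    load_all_primitives=False,
)

# Looks for pipelines/<pipeline id>.json (or .yml/.yaml, possibly gzipped) in the
# search paths and loads it; raises InvalidArgumentValueError if nothing is found.
pipeline = resolver.get_pipeline({'id': '6a4fca12-0000-0000-0000-000000000000'})
```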
+ """ + + strict_resolving: bool + strict_digest: bool + pipeline_search_paths: typing.Sequence[str] + + def __init__(self, *, strict_resolving: bool = False, strict_digest: bool = False, + pipeline_search_paths: typing.Sequence[str] = None, + respect_environment_variable: bool = True, load_all_primitives: bool = True, + primitives_blocklist: typing.Collection[str] = None) -> None: + self.strict_resolving = strict_resolving + self.strict_digest = strict_digest + self.primitives_blocklist = primitives_blocklist + + if pipeline_search_paths is None: + self.pipeline_search_paths: typing.List[str] = [] + else: + self.pipeline_search_paths = typing.cast(typing.List[str], pipeline_search_paths) + + if respect_environment_variable: + self.pipeline_search_paths += [path for path in os.environ.get(environment_variables.PIPELINES_PATH, '').split(':') if path] + + self._load_all_primitives = load_all_primitives + self._primitives_loaded = False + self._get_primitive_failed: typing.Set[str] = set() + + def get_primitive(self, primitive_description: typing.Dict) -> typing.Optional[typing.Type[base.PrimitiveBase]]: + primitive = self._get_primitive(primitive_description) + + # This class always resolves a primitive, or throws an exception, but subclasses might return "None". + if primitive is not None: + self._check_primitive(primitive_description, primitive) + + return primitive + + @classmethod + def get_pipeline_class(cls) -> 'typing.Type[Pipeline]': + return Pipeline + + def get_pipeline(self, pipeline_description: typing.Dict) -> 'typing.Optional[Pipeline]': + pipeline = self._get_pipeline(pipeline_description) + + # This class always resolves a pipeline, or throws an exception, but subclasses might return "None". + if pipeline is not None: + self._check_pipeline(pipeline_description, pipeline) + + return pipeline + + def _get_pipeline(self, pipeline_description: typing.Dict) -> 'typing.Optional[Pipeline]': + # If more than just "id" and "digest" is in the pipeline description, + # then we assume it is a full pipeline description. Digest is optional. 
+ if set(pipeline_description.keys()) - {'digest'} > {'id'}: + return self._from_structure(pipeline_description) + else: + return self._from_file(pipeline_description) + + def _from_structure(self, pipeline_description: typing.Dict) -> 'typing.Optional[Pipeline]': + return self.get_pipeline_class().from_json_structure(pipeline_description, resolver=self, strict_digest=self.strict_digest) + + def _from_file(self, pipeline_description: typing.Dict) -> 'typing.Optional[Pipeline]': + for path in self.pipeline_search_paths: + for extension in ['json', 'json.gz']: + pipeline_path = os.path.join(path, '{pipeline_id}.{extension}'.format(pipeline_id=pipeline_description['id'], extension=extension)) + try: + with utils.open(pipeline_path, 'r', encoding='utf8') as pipeline_file: + return self.get_pipeline_class().from_json(pipeline_file, resolver=self, strict_digest=self.strict_digest) + except FileNotFoundError: + pass + + for extension in ['yml', 'yaml', 'yml.gz', 'yaml.gz']: + pipeline_path = os.path.join(path, '{pipeline_id}.{extension}'.format(pipeline_id=pipeline_description['id'], extension=extension)) + try: + with utils.open(pipeline_path, 'r', encoding='utf8') as pipeline_file: + return self.get_pipeline_class().from_yaml(pipeline_file, resolver=self, strict_digest=self.strict_digest) + except FileNotFoundError: + pass + + raise exceptions.InvalidArgumentValueError("Unable to get pipeline '{pipeline_id}'.".format(pipeline_id=pipeline_description['id'])) + + def _get_primitive_by_path(self, primitive_description: typing.Dict) -> typing.Optional[typing.Type[base.PrimitiveBase]]: + if primitive_description['python_path'] in self._get_primitive_failed: + return None + + try: + # We first try to directly load the primitive using its Python path. + primitive = index.get_primitive(primitive_description['python_path']) + except Exception: + # We make sure we attempt to directly load the primitive only once. Otherwise error messages + # during loading could be printed out again and again, every time we try to get this primitive. + self._get_primitive_failed.add(primitive_description['python_path']) + primitive = None + + # Then we check that the loaded primitive matches the requested primitive ID. + # This way we can load primitive's without having to load all primitives in + # the common case, when the Python path of the primitive has not changed. + if primitive is not None and primitive.metadata.query()['id'] == primitive_description['id']: + return primitive + + return None + + def _load_primitives(self) -> None: + if not self._load_all_primitives: + return + + if self._primitives_loaded: + return + self._primitives_loaded = True + + # We attempt to load all primitives only once. Otherwise error messages for failed primitives + # during loading could be printed out again and again. 
+ index.load_all(blocklist=self.primitives_blocklist) + + def _get_primitive(self, primitive_description: typing.Dict) -> typing.Optional[typing.Type[base.PrimitiveBase]]: + if not self._primitives_loaded: + primitive = self._get_primitive_by_path(primitive_description) + + if primitive is not None: + return primitive + + self._load_primitives() + + return index.get_primitive_by_id(primitive_description['id']) + + def _check_primitive(self, primitive_description: typing.Dict, primitive: typing.Type[base.PrimitiveBase]) -> None: + primitive_metadata = primitive.metadata.query() + + if primitive_metadata['version'] != primitive_description['version']: + if self.strict_resolving: + raise exceptions.MismatchError( + "Version for primitive '{primitive_id}' does not match the one specified in the primitive description. " + "Primitive description version: '{primitive_version}'. Resolved primitive version: '{resolved_primitive_version}'.".format( + primitive_id=primitive_metadata['id'], + primitive_version=primitive_description['version'], + resolved_primitive_version=primitive_metadata['version'], + ) + ) + else: + logger.warning( + "Version for primitive '%(primitive_id)s' does not match the one specified in the primitive description. " + "Primitive description version: '%(primitive_version)s'. Resolved primitive version: '%(resolved_primitive_version)s'.", + { + 'primitive_id': primitive_metadata['id'], + 'primitive_version': primitive_description['version'], + 'resolved_primitive_version': primitive_metadata['version'], + }, + ) + + if primitive_metadata['python_path'] != primitive_description['python_path']: + if self.strict_resolving: + raise exceptions.MismatchError( + "Python path for primitive '{primitive_id}' does not match the one specified in the primitive description. " + "Primitive description Python path: '{primitive_python_path}'. Resolved primitive Python path: '{resolved_primitive_python_path}'.".format( + primitive_id=primitive_metadata['id'], + primitive_python_path=primitive_description['python_path'], + resolved_primitive_python_path=primitive_metadata['python_path'], + ) + ) + else: + logger.warning( + "Python path for primitive '%(primitive_id)s' does not match the one specified in the primitive description. " + "Primitive description Python path: '%(primitive_python_path)s'. Resolved primitive Python path: '%(resolved_primitive_python_path)s'.", + { + 'primitive_id': primitive_metadata['id'], + 'primitive_python_path': primitive_description['python_path'], + 'resolved_primitive_python_path': primitive_metadata['python_path'], + }, + ) + + if primitive_metadata['name'] != primitive_description['name']: + if self.strict_resolving: + raise exceptions.MismatchError( + "Name for primitive '{primitive_id}' does not match the one specified in the primitive description. " + "Primitive description name: '{primitive_name}'. Resolved primitive name: '{resolved_primitive_name}'.".format( + primitive_id=primitive_metadata['id'], + primitive_name=primitive_description['name'], + resolved_primitive_name=primitive_metadata['name'], + ) + ) + else: + logger.warning( + "Name for primitive '%(primitive_id)s' does not match the one specified in the primitive description. " + "Primitive description name: '%(primitive_name)s'. 
Resolved primitive name: '%(resolved_primitive_name)s'.", + { + 'primitive_id': primitive_metadata['id'], + 'primitive_name': primitive_description['name'], + 'resolved_primitive_name': primitive_metadata['name'], + }, + ) + + if 'digest' in primitive_description: + assert primitive_description['digest'] is not None + + if primitive_metadata.get('digest', None) != primitive_description['digest']: + if self.strict_digest: + raise exceptions.DigestMismatchError( + "Digest for primitive '{primitive_id}' does not match the one specified in the primitive description. " + "Primitive description digest: {primitive_digest}. Resolved primitive digest: {resolved_primitive_digest}.".format( + primitive_id=primitive_metadata['id'], + primitive_digest=primitive_description['digest'], + resolved_primitive_digest=primitive_metadata.get('digest', None), + ) + ) + else: + logger.warning( + "Digest for primitive '%(primitive_id)s' does not match the one specified in the primitive description. " + "Primitive description digest: %(primitive_digest)s. Resolved primitive digest: %(resolved_primitive_digest)s.", + { + 'primitive_id': primitive_metadata['id'], + 'primitive_digest': primitive_description['digest'], + 'resolved_primitive_digest': primitive_metadata.get('digest', None), + }, + ) + + def _check_pipeline(self, pipeline_description: typing.Dict, pipeline: 'Pipeline') -> None: + # This can happen if the file has a filename for one pipeline ID but the contents have another pipeline ID. + if pipeline.id != pipeline_description['id']: + if self.strict_resolving: + raise exceptions.MismatchError( + "ID of pipeline '{resolved_pipeline_id}' does not match the one specified in the pipeline description. " + "Pipeline description ID: '{pipeline_id}'. Resolved pipeline ID: '{resolved_pipeline_id}'.".format( + pipeline_id=pipeline_description['id'], + resolved_pipeline_id=pipeline.id, + ) + ) + else: + logger.warning( + "ID of pipeline '%(resolved_pipeline_id)s' does not match the one specified in the pipeline description. " + "Pipeline description ID: '%(pipeline_id)s'. Resolved pipeline ID: '%(resolved_pipeline_id)s'.", + { + 'pipeline_id': pipeline_description['id'], + 'resolved_pipeline_id': pipeline.id, + }, + ) + + if 'digest' in pipeline_description: + assert pipeline_description['digest'] is not None + + pipeline_digest = pipeline.get_digest() + + if pipeline_digest != pipeline_description['digest']: + if self.strict_digest: + raise exceptions.DigestMismatchError( + "Digest for pipeline '{pipeline_id}' does not match the one specified in the pipeline description. " + "Pipeline description digest: {pipeline_digest}. Resolved pipeline digest: {resolved_pipeline_digest}.".format( + pipeline_id=pipeline.id, + pipeline_digest=pipeline_description['digest'], + resolved_pipeline_digest=pipeline_digest, + ) + ) + else: + logger.warning( + "Digest for pipeline '%(pipeline_id)s' does not match the one specified in the pipeline description. " + "Pipeline description digest: %(pipeline_digest)s. Resolved pipeline digest: %(resolved_pipeline_digest)s.", + { + 'pipeline_id': pipeline.id, + 'pipeline_digest': pipeline_description['digest'], + 'resolved_pipeline_digest': pipeline_digest, + }, + ) + + +class NoResolver(Resolver): + """ + A resolver which never resolves anything. 
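+
+ Primitive and pipeline descriptions are left unresolved and stored on steps as-is,
+ which is useful when only the structure of a pipeline is needed.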
+ """ + + def _get_primitive(self, primitive_description: typing.Dict) -> typing.Optional[typing.Type[base.PrimitiveBase]]: + return None + + def _get_pipeline(self, pipeline_description: typing.Dict) -> 'typing.Optional[Pipeline]': + return None + + +S = typing.TypeVar('S', bound='StepBase') + + +class StepBase(metaclass=utils.AbstractMetaclass): + """ + Class representing one step in pipeline's execution. + + Attributes + ---------- + index: + An index of the step among steps in the pipeline. + resolver: + Resolver to use. + + Parameters + ---------- + resolver: + Resolver to use. + """ + + index: int + resolver: Resolver + + def __init__(self, *, resolver: typing.Optional[Resolver] = None) -> None: + self.resolver = self.get_resolver(resolver) + + self.index: int = None + + @classmethod + def get_resolver(cls, resolver: typing.Optional[Resolver]) -> Resolver: + if resolver is None: + return Resolver() + else: + return resolver + + @classmethod + @abc.abstractmethod + def get_step_type(cls) -> metadata_base.PipelineStepType: + pass + + def check_add(self, existing_steps: 'typing.Sequence[StepBase]', available_data_references: typing.AbstractSet[str]) -> None: + """ + Checks if a step can be added given existing steps and available + data references to provide to the step. It also checks if the + state of a step is suitable to be added at this point. + + Raises an exception if check fails. + + Parameters + ---------- + existing_steps: + Steps already in the pipeline. + available_data_references: + A set of available data references. + """ + + def set_index(self, index: int) -> None: + if self.index is not None: + raise exceptions.InvalidArgumentValueError("Index already set to {index}.".format(index=self.index)) + + self.index = index + + @abc.abstractmethod + def get_free_hyperparams(self) -> typing.Union[typing.Dict, typing.Sequence]: + """ + Returns step's hyper-parameters which have not been fixed by the pipeline. + + Returns + ------- + Hyper-parameters configuration for free hyper-parameters, or a list of those. + """ + + @abc.abstractmethod + def get_all_hyperparams(self) -> typing.Union[typing.Dict, typing.Sequence]: + """ + Returns step's hyper-parameters. + + Returns + ------- + Hyper-parameters configuration for all hyper-parameters, or a list of those. + """ + + @abc.abstractmethod + def get_input_data_references(self) -> typing.AbstractSet[str]: + pass + + @abc.abstractmethod + def get_output_data_references(self) -> typing.AbstractSet[str]: + pass + + @classmethod + @abc.abstractmethod + def from_json_structure(cls: typing.Type[S], step_description: typing.Dict, *, resolver: Resolver = None) -> S: + pass + + @abc.abstractmethod + def to_json_structure(self) -> typing.Dict: + pass + + +SP = typing.TypeVar('SP', bound='PrimitiveStep') + + +class PrimitiveStep(StepBase): + """ + Class representing a primitive execution step in pipeline's execution. + + Attributes + ---------- + primitive_description: + A description of the primitive specified for this step. Available if ``primitive`` could not be resolved. + primitive: + A primitive class associated with this step. + outputs: + A list of method names providing outputs for this step. + hyperparams: + A map of of fixed hyper-parameters to their values which are set + as part of a pipeline and should not be tuned during hyper-parameter tuning. + arguments: + A map between argument name and its description. Description contains + a data reference of an output of a prior step (or a pipeline input). 
+ users: + Users associated with the primitive. + + Parameters + ---------- + primitive_description: + A description of the primitive specified for this step. Allowed only if ``primitive`` is not provided. + primitive: + A primitive class associated with this step. If not provided, resolved using ``resolver`` from ``primitive_description``. + """ + + primitive_description: typing.Dict + primitive: typing.Type[base.PrimitiveBase] + outputs: typing.List[str] + hyperparams: typing.Dict[str, typing.Dict] + arguments: typing.Dict[str, typing.Dict] + users: typing.List[typing.Dict] + + def __init__(self, primitive_description: typing.Dict = None, *, primitive: typing.Type[base.PrimitiveBase] = None, resolver: typing.Optional[Resolver] = None) -> None: + super().__init__(resolver=resolver) + + if primitive is None: + if primitive_description is None: + raise exceptions.InvalidArgumentValueError("\"primitive_description\" and \"primitive\" arguments are both None.") + + primitive = self.resolver.get_primitive(primitive_description) + elif primitive_description is not None: + raise exceptions.InvalidArgumentValueError("\"primitive_description\" and \"primitive\" arguments cannot be both provided.") + + if primitive is None: + # If still "None" it means resolver returned "None". + # We just store provided primitive description. + self.primitive_description = primitive_description + self.primitive = None + else: + self.primitive_description = None + self.primitive = primitive + + self.outputs: typing.List[str] = [] + self.hyperparams: typing.Dict[str, typing.Dict] = {} + self.arguments: typing.Dict[str, typing.Dict] = {} + self.users: typing.List[typing.Dict] = [] + + @classmethod + def get_step_type(cls) -> metadata_base.PipelineStepType: + return metadata_base.PipelineStepType.PRIMITIVE + + def add_argument(self, name: str, argument_type: typing.Any, data_reference: typing.Union[str, typing.Sequence[str]]) -> None: + """ + Associate a data reference to an argument of this step (and underlying primitive). + + Parameters + ---------- + name: + Argument name. + argument_type: + Argument type. + data_reference: + Data reference or a list of data references associated with this argument. 
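+ For example, ``inputs.0`` or ``steps.0.produce``.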
+ """ + + if name in self.arguments: + raise exceptions.InvalidArgumentValueError("Argument with name '{name}' already exists.".format(name=name)) + + if argument_type not in [metadata_base.ArgumentType.CONTAINER, metadata_base.ArgumentType.DATA]: + raise exceptions.InvalidArgumentValueError("Invalid argument type: {argument_type}".format(argument_type=argument_type)) + + if not isinstance(data_reference, str) and not utils.is_instance(data_reference, typing.Sequence[str]): + raise exceptions.InvalidArgumentTypeError("Data reference is not a string or a list of strings.".format(name=name)) + + if self.primitive is not None: + argument_metadata = self.primitive.metadata.query()['primitive_code'].get('arguments', {}).get(name, None) + + if argument_metadata is None: + raise exceptions.InvalidArgumentValueError( + "Unknown argument name '{name}' for primitive {primitive}.".format( + name=name, + primitive=self.primitive, + ), + ) + + if argument_metadata['kind'] != metadata_base.PrimitiveArgumentKind.PIPELINE: + raise exceptions.InvalidArgumentValueError( + "Pipelines can provide only pipeline arguments, '{name}' is of kind {kind}.".format( + name=name, + kind=argument_metadata['kind'], + ), + ) + + self.arguments[name] = { + 'type': argument_type, + 'data': data_reference, + } + + def add_output(self, output_id: str) -> None: + """ + Define an output from this step. + + Underlying primitive can have multiple produce methods but not all have to be + defined as outputs of the step. + + Parameters + ---------- + output_id: + A name of the method producing this output. + """ + + if output_id in self.outputs: + raise exceptions.InvalidArgumentValueError("Output with ID '{output_id}' already exists.".format(output_id=output_id)) + + if self.primitive is not None: + method_metadata = self.primitive.metadata.query()['primitive_code'].get('instance_methods', {}).get(output_id, None) + + if method_metadata is None: + raise exceptions.InvalidArgumentValueError( + "Unknown output ID '{output_id}' for primitive {primitive}.".format( + output_id=output_id, + primitive=self.primitive, + ), + ) + + if method_metadata['kind'] != metadata_base.PrimitiveMethodKind.PRODUCE: + raise exceptions.InvalidArgumentValueError( + "Primitives can output only from produce methods, '{output_id}' is of kind {kind}.".format( + output_id=output_id, + kind=method_metadata['kind'], + ), + ) + + self.outputs.append(output_id) + + def add_hyperparameter(self, name: str, argument_type: typing.Any, data: typing.Any) -> None: + """ + Associate a value for a hyper-parameter of this step (and underlying primitive). + + Parameters + ---------- + name: + Hyper-parameter name. + argument_type: + Argument type. + data: + Data reference associated with this hyper-parameter, or list of data references, or value itself. 
+ """ + + if name in self.hyperparams: + raise exceptions.InvalidArgumentValueError("Hyper-parameter with name '{name}' already exists.".format(name=name)) + + if self.primitive is not None: + hyperparams = self.get_primitive_hyperparams() + + if name not in hyperparams.configuration: + raise exceptions.InvalidArgumentValueError( + "Unknown hyper-parameter name '{name}' for primitive {primitive}.".format( + name=name, + primitive=self.primitive, + ), + ) + + if argument_type == metadata_base.ArgumentType.VALUE: + hyperparams.configuration[name].validate(data) + + if argument_type in [metadata_base.ArgumentType.DATA, metadata_base.ArgumentType.PRIMITIVE]: + if utils.is_sequence(data): + if not len(data): + raise exceptions.InvalidArgumentValueError("An empty list of hyper-paramater values.") + + self.hyperparams[name] = { + 'type': argument_type, + 'data': data, + } + + def add_user(self, user_description: typing.Dict) -> None: + """ + Add a description of user to a list of users associated with the primitive. + + Parameters + ---------- + user_description: + User description. + """ + + if 'id' not in user_description: + raise exceptions.InvalidArgumentValueError("User description is missing user ID.") + + self.users.append(user_description) + + def check_add(self, existing_steps: typing.Sequence[StepBase], available_data_references: typing.AbstractSet[str]) -> None: + # Order of steps can be arbitrary during execution (given that inputs for a step are available), but we still + # want some partial order during construction. We want that arguments can already be satisfied by existing steps. + for argument_description in self.arguments.values(): + if utils.is_sequence(argument_description['data']): + data_references = argument_description['data'] + else: + data_references = typing.cast(typing.Sequence, [argument_description['data']]) + for data_reference in data_references: + if not isinstance(data_reference, str): + raise exceptions.InvalidArgumentTypeError("Argument data reference '{data_reference}' is not a string.".format(data_reference=data_reference)) + elif data_reference not in available_data_references: + raise exceptions.InvalidPipelineError("Argument data reference '{data_reference}' is not among available data references.".format( + data_reference=data_reference, + )) + + for hyperparameter_description in self.hyperparams.values(): + if hyperparameter_description['type'] == metadata_base.ArgumentType.DATA: + if utils.is_sequence(hyperparameter_description['data']): + data_references = hyperparameter_description['data'] + else: + data_references = typing.cast(typing.Sequence, [hyperparameter_description['data']]) + for data_reference in data_references: + if not isinstance(data_reference, str): + raise exceptions.InvalidArgumentTypeError("Hyper-parameter data reference '{data_reference}' is not a string.".format(data_reference=data_reference)) + elif data_reference not in available_data_references: + raise exceptions.InvalidPipelineError("Hyper-parameter data reference '{data_reference}' is not among available data references.".format( + data_reference=data_reference, + )) + elif hyperparameter_description['type'] == metadata_base.ArgumentType.PRIMITIVE: + if utils.is_sequence(hyperparameter_description['data']): + primitive_references = hyperparameter_description['data'] + else: + primitive_references = typing.cast(typing.Sequence, [hyperparameter_description['data']]) + for primitive_reference in primitive_references: + if not isinstance(primitive_reference, int): + raise 
exceptions.InvalidArgumentTypeError("Primitive reference '{primitive_reference}' is not an integer.".format(primitive_reference=primitive_reference)) + elif not 0 <= primitive_reference < len(existing_steps): + raise exceptions.InvalidPipelineError("Invalid primitive reference in a step: {primitive}".format(primitive=primitive_reference)) + elif not isinstance(existing_steps[primitive_reference], PrimitiveStep): + raise exceptions.InvalidArgumentTypeError("Primitive reference '{primitive_reference}' is not referencing a primitive step.".format(primitive_reference=primitive_reference)) + elif hyperparameter_description['type'] == metadata_base.ArgumentType.CONTAINER: + if not isinstance(hyperparameter_description['data'], str): + raise exceptions.InvalidArgumentTypeError("Hyper-parameter data reference '{data_reference}' is not a string.".format( + data_reference=hyperparameter_description['data'], + )) + elif hyperparameter_description['data'] not in available_data_references: + raise exceptions.InvalidPipelineError("Hyper-parameter data reference '{data_reference}' is not among available data references.".format( + data_reference=hyperparameter_description['data'], + )) + elif hyperparameter_description['type'] == metadata_base.ArgumentType.VALUE: + # "VALUE" hyper-parameter value has already been checked in "add_hyperparameter". + pass + else: + raise exceptions.UnexpectedValueError("Unknown hyper-parameter type: {hyperparameter_type}".format(hyperparameter_type=hyperparameter_description['type'])) + + # We do this check only if primitive has any arguments or outputs defined. + # Otherwise it can be used as a unfitted primitive value for a hyper-parameter to another primitive. + if self.primitive is not None and (self.arguments or self.outputs): + primitive_arguments = self.primitive.metadata.query()['primitive_code'].get('arguments', {}) + required_arguments_set = { + argument_name for argument_name, argument in primitive_arguments.items() if 'default' not in argument and argument['kind'] == metadata_base.PrimitiveArgumentKind.PIPELINE + } + + arguments_set = set(self.arguments.keys()) + + missing_arguments_set = required_arguments_set - arguments_set + if len(missing_arguments_set): + raise exceptions.InvalidArgumentValueError( + "Not all required arguments are provided for the primitive: {missing_arguments_set}".format( + missing_arguments_set=missing_arguments_set, + ) + ) + + def get_primitive_hyperparams(self) -> hyperparams_module.Hyperparams: + if self.primitive is None: + raise exceptions.InvalidStateError("Primitive has not been resolved.") + + return self.primitive.metadata.get_hyperparams() + + def get_free_hyperparams(self) -> typing.Dict: + free_hyperparams = collections.OrderedDict(self.get_primitive_hyperparams().configuration) + + for hyperparam in self.hyperparams: + del free_hyperparams[hyperparam] + + return free_hyperparams + + def get_all_hyperparams(self) -> typing.Dict: + return collections.OrderedDict(self.get_primitive_hyperparams().configuration) + + def get_input_data_references(self) -> typing.AbstractSet[str]: + data_references = set() + + for argument_description in self.arguments.values(): + if utils.is_sequence(argument_description['data']): + for data_reference in argument_description['data']: + data_references.add(data_reference) + else: + data_references.add(argument_description['data']) + + for hyperparameter_description in self.hyperparams.values(): + if hyperparameter_description['type'] == metadata_base.ArgumentType.VALUE: + continue + + if 
hyperparameter_description['type'] == metadata_base.ArgumentType.PRIMITIVE: + continue + + if utils.is_sequence(hyperparameter_description['data']): + for data_reference in hyperparameter_description['data']: + data_references.add(data_reference) + else: + data_references.add(hyperparameter_description['data']) + + return data_references + + def get_output_data_references(self) -> typing.AbstractSet[str]: + data_references = set() + + for output_id in self.outputs: + data_references.add('steps.{i}.{output_id}'.format(i=self.index, output_id=output_id)) + + return data_references + + @classmethod + def from_json_structure(cls: typing.Type[SP], step_description: typing.Dict, *, resolver: typing.Optional[Resolver] = None) -> SP: + step = cls(step_description['primitive'], resolver=resolver) + + for argument_name, argument_description in step_description.get('arguments', {}).items(): + argument_type = metadata_base.ArgumentType[argument_description['type']] + step.add_argument(argument_name, argument_type, argument_description['data']) + + for output_description in step_description.get('outputs', []): + step.add_output(output_description['id']) + + for hyperparameter_name, hyperparameter_description in step_description.get('hyperparams', {}).items(): + argument_type = metadata_base.ArgumentType[hyperparameter_description['type']] + + # If "primitive" is not available, we do not parse the value and we leave it in its JSON form. + if argument_type == metadata_base.ArgumentType.VALUE and step.primitive is not None: + hyperparams = step.get_primitive_hyperparams() + + if hyperparameter_name not in hyperparams.configuration: + raise exceptions.InvalidArgumentValueError( + "Unknown hyper-parameter name '{name}' for primitive {primitive}.".format( + name=hyperparameter_name, + primitive=step.primitive, + ), + ) + + data = hyperparams.configuration[hyperparameter_name].value_from_json_structure(hyperparameter_description['data']) + + else: + data = hyperparameter_description['data'] + + step.add_hyperparameter(hyperparameter_name, argument_type, data) + + for user_description in step_description.get('users', []): + step.add_user(user_description) + + return step + + def _output_to_json_structure(self, output_id: str) -> typing.Dict: + return {'id': output_id} + + def _hyperparameter_to_json_structure(self, hyperparameter_name: str) -> typing.Dict: + hyperparameter_description = copy.copy(self.hyperparams[hyperparameter_name]) + + hyperparameter_description['type'] = hyperparameter_description['type'].name + + # If "primitive" is not available, we have the value already in its JSON form. 
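+ # Otherwise we serialize it with the hyper-parameter's own "value_to_json_structure".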
+ if hyperparameter_description['type'] == metadata_base.ArgumentType.VALUE and self.primitive is not None: + hyperparams = self.get_primitive_hyperparams() + + if hyperparameter_name not in hyperparams.configuration: + raise exceptions.InvalidArgumentValueError( + "Unknown hyper-parameter name '{name}' for primitive {primitive}.".format( + name=hyperparameter_name, + primitive=self.primitive, + ), + ) + + hyperparameter_description['data'] = hyperparams.configuration[hyperparameter_name].value_to_json_structure(hyperparameter_description['data']) + + return hyperparameter_description + + def _argument_to_json_structure(self, argument_name: str) -> typing.Dict: + argument_description = copy.copy(self.arguments[argument_name]) + + argument_description['type'] = argument_description['type'].name + + return argument_description + + def to_json_structure(self) -> typing.Dict: + if self.primitive is None: + primitive_description = self.primitive_description + else: + primitive_metadata = self.primitive.metadata.query() + primitive_description = { + 'id': primitive_metadata['id'], + 'version': primitive_metadata['version'], + 'python_path': primitive_metadata['python_path'], + 'name': primitive_metadata['name'], + } + + if 'digest' in primitive_metadata: + primitive_description['digest'] = primitive_metadata['digest'] + + step_description = { + 'type': self.get_step_type().name, + 'primitive': primitive_description, + } + + if self.arguments: + step_description['arguments'] = {argument_name: self._argument_to_json_structure(argument_name) for argument_name in self.arguments.keys()} + + if self.outputs: + step_description['outputs'] = [self._output_to_json_structure(output_id) for output_id in self.outputs] + + if self.hyperparams: + hyperparams = {} + + for hyperparameter_name in self.hyperparams.keys(): + hyperparams[hyperparameter_name] = self._hyperparameter_to_json_structure(hyperparameter_name) + + step_description['hyperparams'] = hyperparams + + if self.users: + step_description['users'] = self.users + + return step_description + + def get_primitive_id(self) -> str: + if self.primitive is not None: + return self.primitive.metadata.query()['id'] + else: + return self.primitive_description['id'] + + +SS = typing.TypeVar('SS', bound='SubpipelineStep') + + +class SubpipelineStep(StepBase): + def __init__(self, pipeline_description: typing.Dict = None, *, pipeline: 'Pipeline' = None, resolver: typing.Optional[Resolver] = None) -> None: + super().__init__(resolver=resolver) + + if pipeline is None: + if pipeline_description is None: + raise exceptions.InvalidArgumentValueError("\"pipeline_description\" and \"pipeline\" arguments are both None.") + + pipeline = self.resolver.get_pipeline(pipeline_description) + elif pipeline_description is not None: + raise exceptions.InvalidArgumentValueError("\"pipeline_description\" and \"pipeline\" arguments cannot be both provided.") + + if pipeline is None: + # If still "None" it means resolver returned "None". + # We just store provided pipeline description. 
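+ # (This happens, for example, when a "NoResolver" is used.)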
+ self.pipeline_description = pipeline_description + self.pipeline = None + else: + self.pipeline_description = None + self.pipeline = pipeline + + self.inputs: typing.List[str] = [] + self.outputs: typing.List[typing.Optional[str]] = [] + + @classmethod + def get_step_type(cls) -> metadata_base.PipelineStepType: + return metadata_base.PipelineStepType.SUBPIPELINE + + def add_input(self, data_reference: str) -> None: + if self.pipeline is not None: + if len(self.inputs) == len(self.pipeline.inputs): + raise exceptions.InvalidArgumentValueError("All pipeline's inputs are already provided.") + + self.inputs.append(data_reference) + + def add_output(self, output_id: typing.Optional[str]) -> None: + """ + Define an output from this step. + + Underlying pipeline can have multiple outputs but not all have to be + defined as outputs of the step. They can be skipped using ``None``. + + Parameters + ---------- + output_id: + ID to be used in the data reference, mapping pipeline's outputs in order. + If ``None`` this pipeline's output is ignored and not mapped to a data reference. + """ + + if output_id is not None: + if output_id in self.outputs: + raise exceptions.InvalidArgumentValueError("Output with ID '{output_id}' already exists.".format(output_id=output_id)) + + if self.pipeline is not None: + if len(self.outputs) == len(self.pipeline.outputs): + raise exceptions.InvalidArgumentValueError("All pipeline's outputs are already mapped.") + + self.outputs.append(output_id) + + def check_add(self, existing_steps: 'typing.Sequence[StepBase]', available_data_references: typing.AbstractSet[str]) -> None: + # Order of steps can be arbitrary during execution (given that inputs for a step are available), but we still + # want some partial order during construction. We want that arguments can already be satisfied by existing steps. + for data_reference in self.inputs: + if not isinstance(data_reference, str): + raise exceptions.InvalidArgumentTypeError("Input data reference '{data_reference}' is not a string.".format(data_reference=data_reference)) + elif data_reference not in available_data_references: + raise exceptions.InvalidPipelineError("Input data reference '{data_reference}' is not among available data references.".format(data_reference=data_reference)) + + # TODO: Check that all inputs are satisfied? 
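+
+ # A typical construction looks something like this (sketch; the names are illustrative):
+ #
+ #   step = SubpipelineStep(pipeline=sub_pipeline)
+ #   step.add_input('steps.0.produce')  # maps, in order, to the sub-pipeline's "inputs.0"
+ #   step.add_output('predictions')     # exposed as the data reference "steps.<index>.predictions"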
+ + def get_free_hyperparams(self) -> typing.Sequence: + if self.pipeline is None: + raise exceptions.InvalidStateError("Pipeline has not been resolved.") + + return self.pipeline.get_free_hyperparams() + + def get_all_hyperparams(self) -> typing.Sequence: + if self.pipeline is None: + raise exceptions.InvalidStateError("Pipeline has not been resolved.") + + return self.pipeline.get_all_hyperparams() + + def get_input_data_references(self) -> typing.AbstractSet[str]: + return set(self.inputs) + + def get_output_data_references(self) -> typing.AbstractSet[str]: + data_references = set() + + for output_id in self.outputs: + if output_id is not None: + data_references.add('steps.{i}.{output_id}'.format(i=self.index, output_id=output_id)) + + return data_references + + @classmethod + def from_json_structure(cls: typing.Type[SS], step_description: typing.Dict, *, resolver: Resolver = None) -> SS: + step = cls(step_description['pipeline'], resolver=resolver) + + for input_description in step_description['inputs']: + step.add_input(input_description['data']) + + for output_description in step_description['outputs']: + step.add_output(output_description.get('id', None)) + + return step + + def _input_to_json_structure(self, data_reference: str) -> typing.Dict: + return {'data': data_reference} + + def _output_to_json_structure(self, output_id: typing.Optional[str]) -> typing.Dict: + if output_id is None: + return {} + else: + return {'id': output_id} + + def to_json_structure(self, *, nest_subpipelines: bool = False) -> typing.Dict: + if nest_subpipelines: + if self.pipeline is None: + raise exceptions.InvalidStateError("Pipeline has not been resolved.") + + pipeline_description = self.pipeline._to_json_structure(nest_subpipelines=True) + elif self.pipeline is None: + pipeline_description = self.pipeline_description + else: + pipeline_description = { + 'id': self.pipeline.id, + 'digest': self.pipeline.get_digest(), + } + + step_description = { + 'type': self.get_step_type().name, + 'pipeline': pipeline_description, + 'inputs': [self._input_to_json_structure(data_reference) for data_reference in self.inputs], + 'outputs': [self._output_to_json_structure(output_id) for output_id in self.outputs], + } + + return step_description + + def get_pipeline_id(self) -> str: + if self.pipeline is not None: + return self.pipeline.id + else: + return self.pipeline_description['id'] + + +SL = typing.TypeVar('SL', bound='PlaceholderStep') + + +class PlaceholderStep(StepBase): + def __init__(self, resolver: Resolver = None) -> None: + super().__init__(resolver=resolver) + + self.inputs: typing.List[str] = [] + self.outputs: typing.List[str] = [] + + @classmethod + def get_step_type(cls) -> metadata_base.PipelineStepType: + return metadata_base.PipelineStepType.PLACEHOLDER + + def add_input(self, data_reference: str) -> None: + self.inputs.append(data_reference) + + def add_output(self, output_id: str) -> None: + if output_id in self.outputs: + raise exceptions.InvalidArgumentValueError("Output with ID '{output_id}' already exists.".format(output_id=output_id)) + + self.outputs.append(output_id) + + def check_add(self, existing_steps: 'typing.Sequence[StepBase]', available_data_references: typing.AbstractSet[str]) -> None: + # Order of steps can be arbitrary during execution (given that inputs for a step are available), but we still + # want some partial order during construction. We want that arguments can already be satisfied by existing steps. 
+ for data_reference in self.inputs: + if not isinstance(data_reference, str): + raise exceptions.InvalidArgumentTypeError("Input data reference '{data_reference}' is not a string.".format(data_reference=data_reference)) + elif data_reference not in available_data_references: + raise exceptions.InvalidArgumentValueError("Input data reference '{data_reference}' is not among available data references.".format(data_reference=data_reference)) + + def get_free_hyperparams(self) -> typing.Sequence: + return [] + + def get_all_hyperparams(self) -> typing.Sequence: + return [] + + def get_input_data_references(self) -> typing.AbstractSet[str]: + return set(self.inputs) + + def get_output_data_references(self) -> typing.AbstractSet[str]: + data_references = set() + + for output_id in self.outputs: + data_references.add('steps.{i}.{output_id}'.format(i=self.index, output_id=output_id)) + + return data_references + + @classmethod + def from_json_structure(cls: typing.Type[SL], step_description: typing.Dict, *, resolver: Resolver = None) -> SL: + step = cls(resolver=resolver) + + for input_description in step_description['inputs']: + step.add_input(input_description['data']) + + for output_description in step_description['outputs']: + step.add_output(output_description['id']) + + return step + + def _input_to_json_structure(self, data_reference: str) -> typing.Dict: + return {'data': data_reference} + + def _output_to_json_structure(self, output_id: str) -> typing.Dict: + return {'id': output_id} + + def to_json_structure(self) -> typing.Dict: + step_description = { + 'type': self.get_step_type().name, + 'inputs': [self._input_to_json_structure(data_reference) for data_reference in self.inputs], + 'outputs': [self._output_to_json_structure(output_id) for output_id in self.outputs], + } + + return step_description + + +P = typing.TypeVar('P', bound='Pipeline') + + +class Pipeline: + """ + Class representing a pipeline. + + Attributes + ---------- + id: + A unique ID to identify this pipeline. + created: + Timestamp of pipeline creation in UTC timezone. + source: + Description of source. + name: + Name of the pipeline. + description: + Description of the pipeline. + users: + Users associated with the pipeline. + inputs: + A sequence of input descriptions which provide names for pipeline inputs. + outputs: + A sequence of output descriptions which provide data references for pipeline outputs. + steps: + A sequence of steps defining this pipeline. + + Parameters + ---------- + pipeline_id: + Optional ID for the pipeline. If not provided, it is automatically generated. + context: + DEPRECATED: argument ignored. + created: + Optional timestamp of pipeline creation in UTC timezone. If not provided, the current time will be used. + source: + Description of source. Optional. + name: + Name of the pipeline. Optional. + description: + Description of the pipeline. Optional. 
+ """ + + id: str + created: datetime.datetime + source: typing.Dict + name: str + description: str + users: typing.List[typing.Dict] + inputs: typing.List[typing.Dict] + outputs: typing.List[typing.Dict] + steps: typing.List[StepBase] + + @deprecate.arguments('context', message="argument ignored") + def __init__( + self, pipeline_id: str = None, *, context: metadata_base.Context = None, + created: datetime.datetime = None, source: typing.Dict = None, name: str = None, + description: str = None + ) -> None: + if pipeline_id is None: + pipeline_id = str(uuid.uuid4()) + + if created is None: + created = datetime.datetime.now(datetime.timezone.utc) + elif created.tzinfo is None or created.tzinfo.utcoffset(created) is None: + raise exceptions.InvalidArgumentValueError("'created' timestamp is missing timezone information.") + else: + # Convert to UTC timezone and set "tzinfo" to "datetime.timezone.utc". + created = created.astimezone(datetime.timezone.utc) + + self.id = pipeline_id + self.created = created + self.source = source + self.name = name + self.description = description + + self.inputs: typing.List[typing.Dict] = [] + self.outputs: typing.List[typing.Dict] = [] + self.steps: typing.List[StepBase] = [] + self.users: typing.List[typing.Dict] = [] + + def add_input(self, name: str = None) -> str: + """ + Add an input to the pipeline. + + Parameters + ---------- + name: + Optional human friendly name for the input. + + Returns + ------- + Data reference for the input added. + """ + + input_description = {} + + if name is not None: + input_description['name'] = name + + self.inputs.append(input_description) + + return 'inputs.{i}'.format(i=len(self.inputs) - 1) + + def add_output(self, data_reference: str, name: str = None) -> str: + """ + Add an output to the pipeline. + + Parameters + ---------- + data_reference: + Data reference to use as an output. + name: + Optional human friendly name for the output. + + Returns + ------- + Data reference for the output added. + """ + + if data_reference not in self.get_available_data_references(): + raise exceptions.InvalidArgumentValueError("Invalid data reference '{data_reference}'.".format(data_reference=data_reference)) + + output_description = { + 'data': data_reference, + } + + if name is not None: + output_description['name'] = name + + self.outputs.append(output_description) + + return 'outputs.{i}'.format(i=len(self.outputs) - 1) + + def add_step(self, step: StepBase) -> None: + """ + Add a step to the sequence of steps in the pipeline. + + Parameters + ---------- + step: + A step to add. + """ + + if not isinstance(step, StepBase): + raise exceptions.InvalidArgumentTypeError("Step is not an instance of StepBase.") + + step.set_index(len(self.steps)) + + try: + step.check_add(self.steps, self.get_available_data_references()) + except Exception as error: + raise exceptions.InvalidArgumentValueError("Cannot add step {step_index}.".format(step_index=step.index)) from error + + self.steps.append(step) + + def replace_step(self, index: int, replacement_step: StepBase) -> None: + """ + Replace an existing step (generally a placeholder) with a new step + (generally a subpipeline). It makes sure that all inputs are available + at that point in the pipeline, and all outputs needed later from this + step stay available after replacement. + + If the old pipeline (one before the step being replaced) has already been + made public under some ID, make sure that new pipeline (one with replaced + step) has a new different ID before making it public. 
+ + Parameters + ---------- + index: + Index of the step to replace. + replacement_step: + A new step. + """ + + # TODO: Handle the case when there is a primitive reference to this step (which is a primitive step in such case). + # If we are replacing it with a sub-pipeline or placeholder, we should fail. + + if not 0 <= index < len(self.steps): + raise exceptions.InvalidArgumentValueError("Step index does not point to an existing step.") + + if not isinstance(replacement_step, StepBase): + raise exceptions.InvalidArgumentTypeError("Step is not an instance of StepBase.") + + replacement_step.set_index(index) + + try: + replacement_step.check_add(self.steps[0:index], self.get_available_data_references(index)) + except Exception as error: + raise exceptions.InvalidArgumentValueError("Cannot replace step {step_index}.".format(step_index=index)) from error + + # Which inputs are needed later on? + later_input_data_references: typing.Set[str] = set() + for step in self.steps[index + 1:]: + later_input_data_references.update(step.get_input_data_references()) + + # Compute which data references needed later are contributed by existing step? + used_output_data_references = self.steps[index].get_output_data_references() & later_input_data_references + + # A replacement step has to contribute at least those data references as well. + if not replacement_step.get_output_data_references() >= used_output_data_references: + raise exceptions.InvalidArgumentValueError("Cannot replace step {step_index}. Replacement step is not providing needed outputs: {missing_outputs}".format( + step_index=index, + missing_outputs=sorted(used_output_data_references - replacement_step.get_output_data_references()), + )) + + self.steps[index] = replacement_step + + def add_user(self, user_description: typing.Dict) -> None: + """ + Add a description of user to a list of users associated with the pipeline. + + Parameters + ---------- + user_description: + User description. + """ + + if 'id' not in user_description: + raise exceptions.InvalidArgumentValueError("User description is missing user ID.") + + self.users.append(user_description) + + def get_free_hyperparams(self) -> typing.Sequence: + """ + Returns pipeline's hyper-parameters which have not been fixed by the pipeline as + a list of free hyper-parameters for each step, in order of steps. + + Returns + ------- + A list of hyper-parameters configuration for free hyper-parameters for each step. + """ + + return [step.get_free_hyperparams() for step in self.steps] + + def get_all_hyperparams(self) -> typing.Sequence: + """ + Returns pipeline's hyper-parameters as a list of hyper-parameters + for each step, in order of steps. + + Returns + ------- + A list of hyper-parameters configuration for all hyper-parameters for each step. + """ + + return [step.get_all_hyperparams() for step in self.steps] + + def has_placeholder(self) -> bool: + """ + Returns ``True`` if the pipeline has a placeholder step, in the pipeline itself, or any subpipeline. + + Returns + ------- + ``True`` if the pipeline has a placeholder step. 
+ """ + + for step in self.steps: + if isinstance(step, PlaceholderStep): + return True + elif isinstance(step, SubpipelineStep): + if step.pipeline is None: + raise exceptions.InvalidStateError("Pipeline has not been resolved.") + elif step.pipeline.has_placeholder(): + return True + + return False + + def get_available_data_references(self, for_step: int = None) -> typing.AbstractSet[str]: + """ + Returns a set of data references provided by existing steps (and pipeline inputs). + + Those data references can be used by consequent steps as their inputs. + + Attributes + ---------- + for_step: + Instead of using all existing steps, use only steps until ``for_step`` step. + + Returns + ------- + A set of data references. + """ + + data_references = set() + + for i, input_description in enumerate(self.inputs): + data_references.add('inputs.{i}'.format(i=i)) + + for step in self.steps[0:for_step]: + output_data_references = step.get_output_data_references() + + existing_data_references = data_references & output_data_references + if existing_data_references: + raise exceptions.InvalidPipelineError("Steps have overlapping output data references: {existing_data_references}".format(existing_data_references=existing_data_references)) + + data_references.update(output_data_references) + + return data_references + + @deprecate.function(message="use get_producing_outputs method instead") + def get_exposable_outputs(self) -> typing.AbstractSet[str]: + return self.get_producing_outputs() + + def get_producing_outputs(self) -> typing.AbstractSet[str]: + """ + Returns a set of recursive data references of all values produced by the pipeline + during its run. + + This represents outputs of each step of the pipeline, the outputs of the pipeline + itself, but also exposable outputs of any sub-pipeline. The latter are prefixed with + the step prefix, e.g., ``steps.1.steps.4.produce`` is ``steps.4.produce`` output + of a sub-pipeline step with index 1. + + Outputs of sub-pipelines are represented twice, as an output of the step and + as an output of the sub-pipeline. This is done because not all outputs of a sub-pipeline + are necessary exposed as an output of a step because they might not be used in + the outer pipeline, but the sub-pipeline still defines them. + + A primitive might have additional produce methods which could be called but they + are not listed among step's outputs. Data references related to those produce + methods are not returned. + + Returns + ------- + A set of recursive data references. 
+ """ + + exposable_outputs: typing.Set[str] = set() + + for step_index, step in enumerate(self.steps): + output_data_references = set(step.get_output_data_references()) + + if isinstance(step, SubpipelineStep): + for exposable_output in step.pipeline.get_producing_outputs(): + output_data_references.add('steps.{step_index}.{exposable_output}'.format( + step_index=step_index, + exposable_output=exposable_output, + )) + + existing_data_references = exposable_outputs & output_data_references + if existing_data_references: + raise exceptions.InvalidPipelineError("Steps have overlapping exposable data references: {existing_data_references}".format(existing_data_references=existing_data_references)) + + exposable_outputs.update(output_data_references) + + for i, output_description in enumerate(self.outputs): + exposable_outputs.add('outputs.{i}'.format(i=i)) + + return exposable_outputs + + def check(self, *, allow_placeholders: bool = False, standard_pipeline: bool = True, input_types: typing.Dict[str, type] = None) -> None: + """ + Check if the pipeline is a valid pipeline. + + It supports checking against non-resolved primitives and pipelines, but in that case + checking will be very limited. Make sure you used a strict resolver to assure + full checking of this pipeline and any sub-pipelines. + + Raises an exception if check fails. + + Parameters + ---------- + allow_placeholders: + Do we allow placeholders in a pipeline? + standard_pipeline: + Check it as a standard pipeline (inputs are Dataset objects, output is a DataFrame)? + input_types: + A map of types available as inputs. If provided, overrides ``standard_pipeline``. + """ + + self._check(allow_placeholders, standard_pipeline, input_types) + + def _check(self, allow_placeholders: bool, standard_pipeline: bool, input_types: typing.Optional[typing.Dict[str, type]]) -> typing.Sequence[TypeInfo]: + # Generating JSON also checks it against the pipeline schema. + # We do not set "nest_subpipelines" because recursive checks are done + # by this method's recursive call (when sub-pipelines are resolved). + self.to_json_structure() + + # Map between available data references and their types. + environment: typing.Dict[str, TypeInfo] = {} + + # Inputs are never singleton. + if input_types is not None: + if len(self.inputs) != len(input_types): + raise exceptions.InvalidPipelineError("Pipeline '{pipeline_id}' accepts {inputs} input(s), but {input_types} provided.".format( + pipeline_id=self.id, + inputs=len(self.inputs), + input_types=len(input_types), + )) + + for data_reference, structural_type in input_types.items(): + environment[data_reference] = TypeInfo(structural_type, False) + elif standard_pipeline: + for i, input_description in enumerate(self.inputs): + environment['inputs.{i}'.format(i=i)] = TypeInfo(container.Dataset, False) + else: + for i, input_description in enumerate(self.inputs): + # We do not really know what the inputs are. + environment['inputs.{i}'.format(i=i)] = TypeInfo(typing.Any, False) # type: ignore + + for step_index, step in enumerate(self.steps): + assert step_index == step.index + + if isinstance(step, PlaceholderStep): + if not allow_placeholders: + raise exceptions.InvalidPipelineError("Step {step_index} of pipeline '{pipeline_id}' is a placeholder but there should be no placeholders.".format( + step_index=step_index, + pipeline_id=self.id, + )) + + for data_reference in step.inputs: + # This is checked already during pipeline construction in "check_add". 
+ assert data_reference in environment + + for data_reference in step.get_output_data_references(): + # This is checked already during pipeline construction in "add_output". + assert data_reference not in environment + + # We cannot really know a type of the placeholder output given current pipeline description. + environment[data_reference] = TypeInfo(typing.Any, None) # type: ignore + + elif isinstance(step, SubpipelineStep): + subpipeline_input_types: typing.Dict[str, type] = {} + for i, data_reference in enumerate(step.inputs): + # This is checked already during pipeline construction in "check_add". + assert data_reference in environment + + input_data_reference = 'inputs.{i}'.format(i=i) + + assert input_data_reference not in subpipeline_input_types + subpipeline_input_types[input_data_reference] = environment[data_reference].structural_type + + # Resolving is optional. Of course full checking is not really possible without resolving. + if step.pipeline is not None: + outputs_types = step.pipeline._check(allow_placeholders, False, subpipeline_input_types) + + for i, output_id in enumerate(step.outputs): + if output_id is not None: + output_data_reference = 'steps.{i}.{output_id}'.format(i=step.index, output_id=output_id) + + # This is checked already during pipeline construction in "add_output". + assert output_data_reference not in environment + + if step.pipeline is not None: + environment[output_data_reference] = outputs_types[i] + else: + # We cannot really know a type of the output without resolving. + environment[output_data_reference] = TypeInfo(typing.Any, None) # type: ignore + + elif isinstance(step, PrimitiveStep): + if step.primitive is not None: + primitive_metadata = step.primitive.metadata.query() + primitive_methods = primitive_metadata['primitive_code'].get('instance_methods', {}) + primitive_arguments = primitive_metadata['primitive_code'].get('arguments', {}) + + for argument_name, argument_description in step.arguments.items(): + # This is checked already during pipeline construction in "check_add". + if utils.is_sequence(argument_description['data']): + for data_reference in argument_description['data']: + assert data_reference in environment + else: + assert argument_description['data'] in environment + + if step.primitive is not None: + # This is checked already during pipeline construction in "add_argument". + assert argument_name in primitive_arguments + + if argument_description['type'] == metadata_base.ArgumentType.DATA: + type_info = environment[argument_description['data']] + + # The error is only if it is exactly "False". If it is "None", we do not know and we do not want any false positives. + if type_info.singleton == False: # noqa + raise exceptions.InvalidPipelineError( + "Argument '{argument_name}' of step {step_index} of pipeline '{pipeline_id}' is singleton data, but available data reference is not.".format( + argument_name=argument_name, + step_index=step_index, + pipeline_id=self.id, + ), + ) + + # We cannot really check if types match because we do not know + # the type of elements from just container structural type. 
+ elif step.primitive is not None: + assert argument_description['type'] == metadata_base.ArgumentType.CONTAINER, argument_description['type'] + + if utils.is_sequence(argument_description['data']): + if not utils.is_subclass(primitive_arguments[argument_name]['type'], container.List): + raise exceptions.InvalidPipelineError( + "Argument '{argument_name}' of step {step_index} of pipeline '{pipeline_id}' should have type 'List' to support getting a list of values, " + "but it has type '{argument_type}'.".format( + argument_name=argument_name, + step_index=step_index, + pipeline_id=self.id, + argument_type=primitive_arguments[argument_name]['type'], + ), + ) + + else: + type_info = environment[argument_description['data']] + + if type_info.structural_type is typing.Any or primitive_arguments[argument_name]['type'] is typing.Any: + # No type information. + pass + elif not utils.is_subclass(type_info.structural_type, primitive_arguments[argument_name]['type']): + raise exceptions.InvalidPipelineError( + "Argument '{argument_name}' of step {step_index} of pipeline '{pipeline_id}' has type '{argument_type}', but it is getting a type '{input_type}'.".format( + argument_name=argument_name, + step_index=step_index, + pipeline_id=self.id, + argument_type=primitive_arguments[argument_name]['type'], + input_type=type_info.structural_type, + ), + ) + + if step.primitive is not None: + hyperparams = step.get_primitive_hyperparams() + + for hyperparameter_name, hyperparameter_description in step.hyperparams.items(): + # This is checked already during pipeline construction in "add_hyperparameter". + assert hyperparameter_name in hyperparams.configuration + + if hyperparameter_description['type'] == metadata_base.ArgumentType.DATA: + if utils.is_sequence(hyperparameter_description['data']): + data_references = hyperparameter_description['data'] + else: + data_references = typing.cast(typing.Sequence, [hyperparameter_description['data']]) + + for data_reference in data_references: + # This is checked already during pipeline construction in "check_add". + assert data_reference in environment + + if not isinstance(data_reference, str): + raise exceptions.InvalidArgumentTypeError("Hyper-parameter data reference '{data_reference}' is not a string.".format(data_reference=data_reference)) + + type_info = environment[data_reference] + + # The error is only if it is exactly "False". If it is "None", we do not know and we do not want any false positives. + if type_info.singleton == False: # noqa + raise exceptions.InvalidPipelineError( + "Hyper-parameter '{hyperparameter_name}' of step {step_index} of pipeline '{pipeline_id}' is singleton data, " + "but available data reference '{data_reference}' is not.".format( + hyperparameter_name=hyperparameter_name, + step_index=step_index, + pipeline_id=self.id, + data_reference=data_reference, + ), + ) + + # We cannot really check if types match because we do not know + # the type of elements from just container structural type. + + elif hyperparameter_description['type'] == metadata_base.ArgumentType.PRIMITIVE: + if utils.is_sequence(hyperparameter_description['data']): + primitive_references = hyperparameter_description['data'] + else: + primitive_references = typing.cast(typing.Sequence, [hyperparameter_description['data']]) + + primitives = [] + for primitive_reference in primitive_references: + # This is checked already during pipeline construction in "check_add". 
+ assert 0 <= primitive_reference < step_index + + primitive_step = self.steps[primitive_reference] + + if not isinstance(primitive_step, PrimitiveStep): + raise exceptions.InvalidPipelineError( + "Hyper-parameter '{hyperparameter_name}' of step {step_index} of pipeline '{pipeline_id}' " + "does not point to a primitive step (step {primitive_reference}).".format( + hyperparameter_name=hyperparameter_name, + step_index=step_index, + pipeline_id=self.id, + primitive_reference=primitive_reference, + ), + ) + + if primitive_step.primitive is None: + primitives.append(typing.Any) + else: + primitives.append(primitive_step.primitive) + + if utils.is_sequence(hyperparameter_description['data']): + if not hyperparams.configuration[hyperparameter_name].can_accept_value_type(primitives): + raise exceptions.InvalidPipelineError( + "Hyper-parameter '{hyperparameter_name}' of step {step_index} of pipeline '{pipeline_id}' cannot accept primitives {primitives}.".format( + hyperparameter_name=hyperparameter_name, + step_index=step_index, + pipeline_id=self.id, + primitives=primitives, + ), + ) + else: + assert len(primitives) == 1 + + if not hyperparams.configuration[hyperparameter_name].can_accept_value_type(primitives[0]): + raise exceptions.InvalidPipelineError( + "Hyper-parameter '{hyperparameter_name}' of step {step_index} of pipeline '{pipeline_id}' cannot accept a primitive '{primitive}'.".format( + hyperparameter_name=hyperparameter_name, + step_index=step_index, + pipeline_id=self.id, + primitive=primitives[0], + ), + ) + + elif hyperparameter_description['type'] == metadata_base.ArgumentType.CONTAINER: + # This is checked already during pipeline construction in "check_add". + assert hyperparameter_description['data'] in environment + + type_info = environment[hyperparameter_description['data']] + + if not hyperparams.configuration[hyperparameter_name].can_accept_value_type(type_info.structural_type): + raise exceptions.InvalidPipelineError( + "Hyper-parameter '{hyperparameter_name}' of step {step_index} of pipeline '{pipeline_id}' cannot accept a value of type '{input_type}'.".format( + hyperparameter_name=hyperparameter_name, + step_index=step_index, + pipeline_id=self.id, + input_type=type_info.structural_type, + ), + ) + + elif hyperparameter_description['type'] == metadata_base.ArgumentType.VALUE: + # "VALUE" hyper-parameter value has already been checked in "add_hyperparameter". + pass + + else: + raise exceptions.UnexpectedValueError("Unknown hyper-parameter type: {hyperparameter_type}".format(hyperparameter_type=hyperparameter_description['type'])) + + for output_id in step.outputs: + output_data_reference = 'steps.{i}.{output_id}'.format(i=step.index, output_id=output_id) + + assert output_data_reference not in environment + + if step.primitive is not None: + # This is checked already during pipeline construction in "add_output". + assert output_id in primitive_methods + + method_description = primitive_methods[output_id] + + produce_type = method_description['returns'] + + # This should be checked by some other part of the code (like primitive validation). + assert issubclass(produce_type, base.CallResult), produce_type + + output_type = utils.get_type_arguments(produce_type)[base.T] # type: ignore + + environment[output_data_reference] = TypeInfo(output_type, method_description.get('singleton', False)) + else: + # We cannot really know a type of the output without resolving. 
+ environment[output_data_reference] = TypeInfo(typing.Any, None) # type: ignore + + else: + raise exceptions.UnexpectedValueError("Unknown step type: {step_type}".format(step_type=type(step))) + + outputs_types = [] + for output_description in self.outputs: + # This is checked already during pipeline construction in "add_output". + assert output_description['data'] in environment, output_description['data'] + + outputs_types.append(environment[output_description['data']]) + + return outputs_types + + @classmethod + def from_yaml(cls: typing.Type[P], string_or_file: typing.Union[str, typing.IO[typing.Any]], *, resolver: typing.Optional[Resolver] = None, + strict_digest: bool = False) -> P: + description = utils.yaml_load(string_or_file) + + return cls.from_json_structure(description, resolver=resolver, strict_digest=strict_digest) + + @classmethod + def from_json(cls: typing.Type[P], string_or_file: typing.Union[str, typing.IO[typing.Any]], *, resolver: typing.Optional[Resolver] = None, + strict_digest: bool = False) -> P: + if isinstance(string_or_file, str): + description = json.loads(string_or_file) + else: + description = json.load(string_or_file) + + return cls.from_json_structure(description, resolver=resolver, strict_digest=strict_digest) + + @classmethod + def _get_step_class(cls, step_type: typing.Any) -> StepBase: + if step_type == metadata_base.PipelineStepType.PRIMITIVE: + return PrimitiveStep + elif step_type == metadata_base.PipelineStepType.SUBPIPELINE: + return SubpipelineStep + elif step_type == metadata_base.PipelineStepType.PLACEHOLDER: + return PlaceholderStep + else: + raise exceptions.InvalidArgumentValueError("Invalid step type '{step_type}'.".format(step_type=step_type)) + + @classmethod + def _get_source(cls, pipeline_description: typing.Dict) -> typing.Optional[typing.Dict]: + return pipeline_description.get('source', None) + + @classmethod + def _canonical_pipeline_description(cls, pipeline_description: typing.Dict) -> typing.Dict: + """ + Before we compute digest of the pipeline description, we have to convert it to a + canonical structure. + + Currently, this is just removing any sub-pipelines the description might have nested. + """ + + pipeline_description = copy.deepcopy(pipeline_description) + + for step_description in pipeline_description['steps']: + if step_description['type'] == metadata_base.PipelineStepType.SUBPIPELINE: + new_description = { + 'id': step_description['pipeline']['id'], + } + if 'digest' in step_description['pipeline']: + new_description['digest'] = step_description['pipeline']['digest'] + step_description['pipeline'] = new_description + + # Not really part of pipeline schema, but used in evaluation. Digest should + # not be computed using it, if it was passed in. We also do not want to store + # it in metalearning database as part of the pipeline document so that we are + # not storing same pipeline multiple times, just with different rank values. 
+ if 'pipeline_rank' in pipeline_description: + del pipeline_description['pipeline_rank'] + + return pipeline_description + + @classmethod + def from_json_structure(cls: typing.Type[P], pipeline_description: typing.Dict, *, resolver: typing.Optional[Resolver] = None, + strict_digest: bool = False) -> P: + PIPELINE_SCHEMA_VALIDATOR.validate(pipeline_description) + + if 'digest' in pipeline_description: + digest = utils.compute_digest(cls._canonical_pipeline_description(pipeline_description)) + + if digest != pipeline_description['digest']: + if strict_digest: + raise exceptions.DigestMismatchError( + "Digest for pipeline '{pipeline_id}' does not match a computed one. Provided digest: {pipeline_digest}. Computed digest: {new_pipeline_digest}.".format( + pipeline_id=pipeline_description['id'], + pipeline_digest=pipeline_description['digest'], + new_pipeline_digest=digest, + ) + ) + else: + logger.warning( + "Digest for pipeline '%(pipeline_id)s' does not match a computed one. Provided digest: %(pipeline_digest)s. Computed digest: %(new_pipeline_digest)s.", + { + 'pipeline_id': pipeline_description['id'], + 'pipeline_digest': pipeline_description['digest'], + 'new_pipeline_digest': digest, + }, + ) + + # If no timezone information is provided, we assume UTC. If there is timezone information, + # we convert timestamp to UTC in the constructor of "Pipeline". + created = dateparser.parse(pipeline_description['created'], settings={'TIMEZONE': 'UTC'}) + source = cls._get_source(pipeline_description) + + pipeline = cls( + pipeline_id=pipeline_description['id'], created=created, source=source, + name=pipeline_description.get('name', None), description=pipeline_description.get('description', None) + ) + + for input_description in pipeline_description['inputs']: + pipeline.add_input(input_description.get('name', None)) + + for step_description in pipeline_description['steps']: + step = cls._get_step_class(step_description['type']).from_json_structure(step_description, resolver=resolver) + pipeline.add_step(step) + + for output_description in pipeline_description['outputs']: + pipeline.add_output(output_description['data'], output_description.get('name', None)) + + for user_description in pipeline_description.get('users', []): + pipeline.add_user(user_description) + + return pipeline + + def _inputs_to_json_structure(self) -> typing.Sequence[typing.Dict]: + return self.inputs + + def _outputs_to_json_structure(self) -> typing.Sequence[typing.Dict]: + return self.outputs + + def _source_to_json_structure(self) -> typing.Optional[typing.Dict]: + return self.source + + def _users_to_json_structure(self) -> typing.Optional[typing.Sequence[typing.Dict]]: + # Returns "None" if an empty list. + return self.users or None + + def _to_json_structure(self, *, nest_subpipelines: bool = False) -> typing.Dict: + # Timestamp should already be in UTC and in particular "tzinfo" should be "datetime.timezone.utc". + assert self.created.tzinfo == datetime.timezone.utc, self.created + # We remove timezone information before formatting to not have "+00:00" added and + # we then manually add "Z" instead (which has equivalent meaning). 
+ created = self.created.replace(tzinfo=None).isoformat('T') + 'Z' + + pipeline_description: typing.Dict = { + 'id': self.id, + 'schema': PIPELINE_SCHEMA_VERSION, + 'created': created, + 'inputs': self._inputs_to_json_structure(), + 'outputs': self._outputs_to_json_structure(), + 'steps': [], + } + + source = self._source_to_json_structure() + if source is not None: + pipeline_description['source'] = source + + users = self._users_to_json_structure() + if users is not None: + pipeline_description['users'] = users + + if self.name is not None: + pipeline_description['name'] = self.name + if self.description is not None: + pipeline_description['description'] = self.description + + for step in self.steps: + if isinstance(step, SubpipelineStep): + pipeline_description['steps'].append(step.to_json_structure(nest_subpipelines=nest_subpipelines)) + else: + pipeline_description['steps'].append(step.to_json_structure()) + + pipeline_description['digest'] = utils.compute_digest(self._canonical_pipeline_description(pipeline_description)) + + return pipeline_description + + def to_json_structure(self, *, nest_subpipelines: bool = False, canonical: bool = False) -> typing.Dict: + if canonical: + nest_subpipelines = False + + pipeline_description = self._to_json_structure(nest_subpipelines=nest_subpipelines) + + if canonical: + pipeline_description = self._canonical_pipeline_description(pipeline_description) + + PIPELINE_SCHEMA_VALIDATOR.validate(pipeline_description) + + return pipeline_description + + def to_json(self, file: typing.IO[typing.Any] = None, *, nest_subpipelines: bool = False, canonical: bool = False, **kwargs: typing.Any) -> typing.Optional[str]: + obj = self.to_json_structure(nest_subpipelines=nest_subpipelines, canonical=canonical) + + if 'allow_nan' not in kwargs: + kwargs['allow_nan'] = False + + if file is None: + return json.dumps(obj, **kwargs) + else: + json.dump(obj, file, **kwargs) + return None + + def to_yaml(self, file: typing.IO[typing.Any] = None, *, nest_subpipelines: bool = False, canonical: bool = False, **kwargs: typing.Any) -> typing.Optional[str]: + obj = self.to_json_structure(nest_subpipelines=nest_subpipelines, canonical=canonical) + + return utils.yaml_dump(obj, stream=file, **kwargs) + + def equals(self, pipeline: P, *, strict_order: bool = False, only_control_hyperparams: bool = False) -> bool: + """ + Check if the two pipelines are equal in the sense of isomorphism. + + Parameters + ---------- + pipeline: + A pipeline instance. + strict_order: + If true, we will treat inputs of `Set` hyperparameters as a list, and the order of primitives are determined by their step indices. + Otherwise we will try to sort contents of `Set` hyperparameters so the orders of their contents are not important, + and we will try topological sorting to determine the order of nodes. + only_control_hyperparams: + If true, equality checks will not happen for any hyperparameters that are not of the ``ControlParameter`` semantic type, i.e. + there will be no checks for hyperparameters that are specific to the hyperparameter optimization phase, and not part of the + logic of the pipeline. + + Notes + ----- + This algorithm checks if the two pipelines are equal in the sense of isomorphism by solving a graph isomorphism + problem. The general graph isomorphism problem is known to be neither P nor NP-complete. However, + our pipelines are DAGs so we could have an algorithm to check its isomorphism in polynomial time. 
+ + The complexity of this algorithm is around :math:`O((V + E)logV)`, where :math:`V` is the number of steps in the + pipeline and :math:`E` is the number of output references. It tries to assign unique orders to all nodes layer + by layer greedily followed by a topological sort using DFS. Then we can get a unique, hashable & comparable + tuple representing the structure of the pipeline. It is also a unique representation of the equivalence class of + a pipeline in the sense of isomorphism. + """ + + # TODO: We could cache the representation once the pipeline is freezed. + return \ + PipelineHasher(self, strict_order, only_control_hyperparams).unique_equivalence_class_repr() == \ + PipelineHasher(pipeline, strict_order, only_control_hyperparams).unique_equivalence_class_repr() + + def hash(self, *, strict_order: bool = False, only_control_hyperparams: bool = False) -> int: + """ + Get the hash value of a pipeline. It simply hashes the unique representation of the equivalence class of + a pipeline in the sense of isomorphism. + + strict_order: + If true, we will treat inputs of `Set` hyperparameters as a list, and the order of primitives are determined by their step indices. + Otherwise we will try to sort contents of `Set` hyperparameters so the orders of their contents are not important, + and we will try topological sorting to determine the order of nodes. + only_control_hyperparams: + If true, equality checks will not happen for any hyperparameters that are not of the ``ControlParameter`` semantic type, i.e. + there will be no checks for hyperparameters that are specific to the hyperparameter optimization phase, and not part of the + logic of the pipeline. + """ + + # TODO: We could cache the hash once the pipeline is freezed. + return hash(PipelineHasher(self, strict_order, only_control_hyperparams)) + + def get_digest(self) -> str: + return self._to_json_structure(nest_subpipelines=False)['digest'] + + +# There are several forms of input indices. +# 1. Named arguments. They are typically strings or tuple-wrapped strings. +# 2. Pipeline outputs. They are integers. +# 3. Value-type & container-type hyperparameters. They are strings. +# 4. Data-type hyperparameters. They are tuples like (name, type) or (name, type, index). +# 5. Primitive-type hyperparameters. They are strings or tuples like (name, index). +InputIndex = typing.Union[int, str, typing.Tuple[str], typing.Tuple[str, str], typing.Tuple[str, int], typing.Tuple[str, str, int]] +OutputIndex = int +Edge = typing.NamedTuple('Edge', [('input_index', InputIndex), ('output_index', OutputIndex)]) +PD = typing.TypeVar('PD', bound='PipelineDAG') + + +class OrderedNode(metaclass=utils.AbstractMetaclass): + """This class represents a node in a DAG. + + Parameters + ---------- + name: + The name of this node. + topological_order: + The topological order of this node in the DAG. + inputs_ref: + The inputs containing unresolved reference strings or list of indices. + + Attributes + ---------- + name: + The name of this node. + topological_order: + The topological order of a node in a DAG. + global_order: + The global order of a node in a DAG. + inputs: + The inputs of the node. They serve as the edges in a DAG. + children: + The descendants of this node. 
+ """ + + name: str + topological_order: int + global_order: int + inputs: typing.Dict + children: typing.Dict + + def __init__(self, name: str, topological_order: int = 0, inputs_ref: typing.Optional[typing.Union[typing.Dict[InputIndex, str], typing.List[str]]] = None) -> None: + self.name = name + self.topological_order: int = topological_order + + if inputs_ref is None: + inputs_ref = collections.OrderedDict() + elif isinstance(inputs_ref, list): + inputs_ref = collections.OrderedDict(enumerate(inputs_ref)) + self._inputs_ref = inputs_ref + + self.global_order: typing.Optional[int] = None + self.inputs: typing.Dict[InputIndex, typing.Tuple['OrderedNode', int]] = collections.OrderedDict() + self.children: typing.DefaultDict['OrderedNode', typing.Set[InputIndex]] = collections.defaultdict(set) + self._frozen = False + self._unique_equivalence_class_repr: typing.Optional[typing.Tuple] = None + + @property + def inputs_count(self) -> int: + """ + Returns the count of inputs. + """ + return len(self._inputs_ref) + + def outputs(self) -> typing.DefaultDict[OutputIndex, typing.Set[typing.Tuple['OrderedNode', InputIndex]]]: + reverse_dict: typing.DefaultDict[OutputIndex, typing.Set[typing.Tuple[OrderedNode, InputIndex]]] = collections.defaultdict(set) + for node, input_indices in self.children.items(): + for input_index in input_indices: + output_index = node.inputs[input_index][1] + reverse_dict[output_index].add((node, input_index)) + return reverse_dict + + @property + def frozen(self) -> bool: + """ + If a node is frozen, its representation can be cached. + + Returns + ------- + The frozen state of the node. + """ + + return self._frozen + + @frozen.setter + def frozen(self, value: bool) -> None: + assert isinstance(value, bool) + self._frozen = value + if not value: + # Force cleanup. + self._unique_equivalence_class_repr = None + + def add_child(self, node: 'OrderedNode', edge: Edge) -> None: + """ + Add a child node. + + Parameters + ---------- + node: + The child node. + edge: + The edge connects parent node and child node. + """ + + self.children[node].add(edge.input_index) + node.inputs[edge.input_index] = (self, edge.output_index) + + def remove_child(self, child: 'OrderedNode', input_index: typing.Optional[InputIndex]) -> None: + """ + Remove a child node. + + Parameters + ---------- + child: + The child node. + input_index: + The related input index of the child node. If it is None, all edges between the child ndoe and the parent node will be removed. + """ + + if input_index is None: + for input_index in self.children[child]: + del child.inputs[input_index] + del self.children[child] + else: + edges = self.children[child] + edges.remove(input_index) + del child.inputs[input_index] + if not edges: + del self.children[child] + + def change_input(self, input_index: InputIndex, new_parent: 'OrderedNode', new_input_index: typing.Optional[InputIndex] = None, new_output_index: typing.Optional[OutputIndex] = None) -> None: + """ + Change the input of the node. + + Parameters + ---------- + input_index: + The input index we want to change. + new_parent: + The new parent of the node. + new_input_index: + The new input index. If it is None, the original index will be kept. + new_output_index: + The new output index. If it is None, the original index will be kept. 
+ """ + + parent, output_index = self.inputs[input_index] + parent.remove_child(self, input_index) + if new_output_index is None: + new_output_index = output_index + if new_input_index is None: + new_input_index = input_index + else: + del self.inputs[input_index] + new_parent.add_child(self, Edge(input_index=new_input_index, output_index=new_output_index)) + + def join(self, node_with_inputs: 'OrderedNode') -> None: + """ + Join by the edges of the nodes. + + Two nodes can be joined only if the output indices of node A (`self` here) match the input indices of node B (`node_with_inputs` here). + The join operation needs two nodes: A and B. Suppose A's children are {A+} and B's parents are {B-}. + + It removes all edges between B and {B-} & between A and {A+}, then creating new edges to connect {B-} and {A+}. + + Parameters + ---------- + node_with_inputs: + The node which provides inputs. + + Notes + ----- + The function is named ``join`` because it is similar to "join" of SQL since they both concatenate items by their common indices. + """ + + outputs = self.outputs() + # Set & dict size will be changed during iteration. Use a list to fix them. + for input_index, (parent, parent_output_index) in list(node_with_inputs.inputs.items()): + assert isinstance(input_index, int) + for child, child_input in outputs[input_index]: + child.change_input(child_input, parent, new_output_index=parent_output_index) + parent.remove_child(node_with_inputs, input_index) + + @abc.abstractmethod + def reference_name(self) -> int: + """ + The name to reference itself. + """ + + @abc.abstractmethod + def output_reference_names(self) -> typing.List[str]: + """ + The names for other nodes to refer its outputs. + """ + + def resolve_input_references(self, nodes_outputs_reverse_dict: typing.Dict[str, typing.Tuple['OrderedNode', OutputIndex]]) -> None: + """ + Resolve input references with a lookup dict. + """ + + for input_index, ref in self._inputs_ref.items(): + parent, output_index = nodes_outputs_reverse_dict[ref] + parent.add_child(self, Edge(input_index=input_index, output_index=output_index)) + + def _unique_ordered_inputs(self) -> typing.Tuple: + input_orders = [(name, parent.global_order, output_index) for name, (parent, output_index) in self.inputs.items()] + input_orders.sort() + return tuple(input_orders) + + def unique_equivalence_class_repr(self) -> typing.Tuple: + """ + Get the unique representation of the equivalence class of the node in the sense of isomorphism. + """ + + if not self.frozen or self._unique_equivalence_class_repr is None: + repr_tuple = (self.name, self._unique_ordered_inputs(), self.topological_order) + if self.frozen: + self._unique_equivalence_class_repr = repr_tuple + else: + self._unique_equivalence_class_repr = None + return repr_tuple + + return self._unique_equivalence_class_repr + + +class InputsNode(OrderedNode): + """This class represents the inputs of a pipeline. This node is unique in a pipeline. + + Parameters + ---------- + pipeline_inputs: + Inputs of the pipeline. It is a list contains description dicts of inputs. Their order matters. + They will not be resolved as data reference strings, so we use `pipeline_inputs` as its name instead of `inputs_ref` which will be resolved. + """ + def __init__(self, pipeline_inputs: typing.List[typing.Dict]) -> None: + super().__init__('Inputs') + + self.pipeline_inputs = copy.deepcopy(pipeline_inputs) + self.global_order = 0 + + @property + def inputs_count(self) -> int: + """ + Return the count of inputs. 
+ """ + return len(self.pipeline_inputs) + + def reference_name(self) -> int: + """ + We specify that the input node has index -1. + """ + + return -1 + + def output_reference_names(self) -> typing.List[str]: + """ + The names for other nodes to refer its outputs. + """ + + return ['inputs.{i}'.format(i=i) for i in range(self.inputs_count)] + + def unique_equivalence_class_repr(self) -> typing.Tuple: + """ + Get the unique representation of the equivalence class of the node in the sense of isomorphism. + """ + + return self.name, self.inputs_count + + +class OutputsNode(OrderedNode): + """This class represents the outputs of a pipeline. This node is unique in a pipeline. + + Parameters + ---------- + pipeline_outputs: + Outputs of a pipeline. It is a list contains description dicts of outputs. Their order matters. + """ + def __init__(self, pipeline_outputs: typing.List[typing.Dict]) -> None: + super().__init__('Outputs', inputs_ref=[v['data'] for v in pipeline_outputs]) + + self.outputs_count = len(pipeline_outputs) + + def reference_name(self) -> int: + """ + We specify that the output node has index -2. + """ + + return -2 + + def output_reference_names(self) -> typing.List[str]: + """ + The names for other nodes to refer its outputs. + """ + + return [] + + +class PrimitiveNode(OrderedNode): + """ + This class represents a primitive step in a DAG. + + Attributes + ---------- + index: + The index of this step in the pipeline. + primitive_step: + The PrimitiveStep instance. + _steps_ref: + Raw inputs info contains step reference indices. + steps: + Steps used by this node as parameters or hyperparameters. + values: + Inputs contains simple value. + strict_order: + If true, we will treat inputs of `Set` hyperparameters as a list. + Otherwise we will try to sort their contents so the orders of their contents are not important. + only_control_hyperparams: + If true, hyperparameters that are not of the `ControlParameter` semantic type. will not be included + in the node's representation. + """ + + index: int + primitive_step: PrimitiveStep + _steps_ref: typing.Dict + steps: typing.Dict + values: typing.Dict + strict_order: bool + only_control_hyperparams: bool + + def __init__(self, primitive: PrimitiveStep, *, strict_order: bool, only_control_hyperparams: bool) -> None: + # We wraps argument names with a tuple to unify sorting. + super().__init__(primitive.get_primitive_id(), inputs_ref={(k,): v['data'] for k, v in primitive.arguments.items()}) + + self.index: int = primitive.index + self.primitive_step = primitive + self.strict_order = strict_order + self.only_control_hyperparams = only_control_hyperparams + + self._outputs: typing.List[str] = primitive.outputs.copy() + self._steps_ref: typing.Dict[InputIndex, int] = collections.OrderedDict() + self.steps: typing.Dict[InputIndex, OrderedNode] = collections.OrderedDict() + self.values: typing.Dict[str, typing.Any] = collections.OrderedDict() + + if self.primitive_step.primitive is not None: + hyperparameters = self.primitive_step.get_primitive_hyperparams().configuration + else: + hyperparameters = None + + # Resolve hyper-parameters. For sequential hyperparameters, we consider their order matters. 
+ for name, hyperparameter_description in primitive.hyperparams.items(): + if only_control_hyperparams and hyperparameters is not None and CONTROL_HYPERPARAMETER_SEMANTIC_TYPE not in hyperparameters[name].semantic_types: + continue + is_set = isinstance(hyperparameters[name], hyperparams_module.Set) if hyperparameters is not None else False + if hyperparameter_description['type'] == metadata_base.ArgumentType.DATA: + if utils.is_sequence(hyperparameter_description['data']): + data_references: typing.List[str] = typing.cast(typing.List[str], hyperparameter_description['data']) + if is_set and not strict_order: + data_references = sorted(data_references) + for i, data_reference in enumerate(data_references): + self._inputs_ref[name, metadata_base.ArgumentType.DATA.name, i] = data_reference + else: + self._inputs_ref[name, metadata_base.ArgumentType.DATA.name] = hyperparameter_description['data'] + elif hyperparameter_description['type'] == metadata_base.ArgumentType.PRIMITIVE: + if utils.is_sequence(hyperparameter_description['data']): + primitive_references: typing.List[int] = typing.cast(typing.List[int], hyperparameter_description['data']) + if is_set and not strict_order: + primitive_references = sorted(primitive_references) + for i, primitive_reference in enumerate(primitive_references): + self._steps_ref[name, i] = primitive_reference + else: + self._steps_ref[name] = hyperparameter_description['data'] + elif hyperparameter_description['type'] == metadata_base.ArgumentType.CONTAINER: + self._inputs_ref[name, metadata_base.ArgumentType.CONTAINER.name] = hyperparameter_description['data'] + elif hyperparameter_description['type'] == metadata_base.ArgumentType.VALUE: + data = hyperparameter_description['data'] + if is_set and not strict_order: + assert isinstance(data, list) + # encode the value + simple_data = self._serialize_hyperparamter_value(name, data, True) + assert utils.is_sequence(simple_data) + data = [x for _, x in sorted(zip(simple_data, data), key=lambda pair: pair[0])] + self.values[name] = data + else: + raise exceptions.UnexpectedValueError("Unknown hyper-parameter type: {hyperparameter_type}".format(hyperparameter_type=hyperparameter_description['type'])) + + def reference_name(self) -> int: + return self.index + + def output_reference_names(self) -> typing.List[str]: + """ + The names for other nodes to refer its outputs. + """ + + return ['steps.{i}.{output_id}'.format(i=self.index, output_id=output_id) for output_id in self._outputs] + + def resolve_step_references(self, nodes_reverse_dict: typing.Dict[int, OrderedNode]) -> None: + """ + Resolve step references with a lookup dict. 
+ """ + + for input_index, ref in self._steps_ref.items(): + self.steps[input_index] = nodes_reverse_dict[ref] + + def _serialize_hyperparamter_value(self, name: str, data: typing.Any, is_sequence: bool) -> typing.Any: + if self.primitive_step.primitive is not None: + configuration = self.primitive_step.get_primitive_hyperparams().configuration + if name not in configuration: + raise exceptions.InvalidArgumentValueError( + "Unknown hyper-parameter name '{name}' for primitive {primitive}.".format( + name=name, + primitive=self.primitive_step.primitive, + ), + ) + hyperparameter = configuration[name] + else: + hyperparameter = hyperparams_module.Hyperparameter[type(data)](data) # type: ignore + + serialized = hyperparameter.value_to_json_structure(data) + + if is_sequence: + return [json.dumps(s, sort_keys=True) for s in serialized] + else: + return json.dumps(serialized, sort_keys=True) + + def _unique_serialized_values(self) -> typing.Tuple: + values = [(name, self._serialize_hyperparamter_value(name, data, False)) for name, data in self.values.items()] + # Sort by value names. + values.sort() + return tuple(values) + + def _unique_step_references(self) -> typing.Tuple: + steps_orders = [(name, node.global_order) for name, node in self.steps.items()] + steps_orders.sort() + return tuple(steps_orders) + + def unique_equivalence_class_repr(self) -> typing.Tuple: + """ + Get the unique representation of the equivalence class of the node in the sense of isomorphism. + """ + + if not self.frozen or self._unique_equivalence_class_repr is None: + repr_tuple = (self.name, self._unique_ordered_inputs(), self._unique_step_references(), self._unique_serialized_values(), self.topological_order) + if self.frozen: + self._unique_equivalence_class_repr = repr_tuple + else: + self._unique_equivalence_class_repr = None + return repr_tuple + + return self._unique_equivalence_class_repr + + +class PlaceholderNode(OrderedNode): + """ + This class represents a placeholder step in a DAG. + + Attributes + ---------- + index: + The index of this step in the pipeline. + """ + + index: int + + def __init__(self, placeholder: PlaceholderStep) -> None: + super().__init__(PlaceholderStep.__name__, inputs_ref=placeholder.inputs.copy()) + self.index: int = placeholder.index + self._outputs: typing.List[str] = placeholder.outputs.copy() + + def reference_name(self) -> int: + return self.index + + def output_reference_names(self) -> typing.List[str]: + """ + The names for other nodes to refer its outputs. + """ + + return ['steps.{i}.{output_id}'.format(i=self.index, output_id=output_id) for output_id in self._outputs] + + +class SubpipelineNode(OrderedNode): + """ + This class represents a subpipeline step in a DAG. + + If this sub-pipeline has been resolved, then its graph is expected to be merged into its parent graph; + otherwise `unique_equivalence_class_repr()` is called to get a unique representation according to its ID. + + Parameters + ---------- + subpipeline: + A subpipeline instance. + + Attributes + ---------- + index: + The index of this step in the pipeline. + pipeline_id: + The pipeline ID of subpipeline. + pipeline: + The sub-pipeline instance. If the sub-pipeline hasn't been resolved, it should be `None`. + strict_order: + If true, we will treat inputs of `Set` hyperparameters as a list. + Otherwise we will try to sort their contents so the orders of their contents are not important. 
+ only_control_hyperparams: + If true, hyperparameters that are not of the ``ControlParameter`` semantic type will not be included + in the graph representation of this subpipeline's primitive steps. + """ + + index: int + pipeline_id: str + pipeline: typing.Optional[Pipeline] + strict_order: bool + only_control_hyperparams: bool + + def __init__(self, subpipeline: SubpipelineStep, *, strict_order: bool, only_control_hyperparams: bool) -> None: + super().__init__(SubpipelineStep.__name__, inputs_ref=subpipeline.inputs.copy()) + self.strict_order = strict_order + self.only_control_hyperparams = only_control_hyperparams + self.index: int = subpipeline.index + + assert subpipeline.outputs is not None + + self._outputs: typing.List[str] = subpipeline.outputs.copy() + self.pipeline_id: str = subpipeline.get_pipeline_id() + self.pipeline: typing.Optional[Pipeline] = subpipeline.pipeline + + def graph(self) -> typing.Optional['PipelineDAG']: + """ + Get the graph of the pipeline inside. + + Returns + ------- + If this node has been resolved, return the graph; return None otherwise. + """ + + if self.pipeline is not None: + return PipelineDAG(self.pipeline, strict_order=self.strict_order, only_control_hyperparams=self.only_control_hyperparams) + return None + + def reference_name(self) -> int: + return self.index + + def output_reference_names(self) -> typing.List[str]: + """ + The names for other nodes to refer its outputs. + """ + + # Do not export null output_id. + return ['steps.{i}.{output_id}'.format(i=self.index, output_id=output_id) for output_id in self._outputs if output_id is not None] + + def unique_equivalence_class_repr(self) -> typing.Tuple: + """ + Get the unique representation of the equivalence class of the node in the sense of isomorphism. + + This is only used when the sub-pipeline hasn't been resolved. Otherwise, its graph should be used. + """ + return super().unique_equivalence_class_repr() + (self.pipeline_id,) + + +class PipelineDAG: + """ + Directed acyclic graph builder for a pipeline. + + It has an input node as the head of the DAG and an output node as the tail. + + Attributes + ---------- + pipeline: + The associated pipeline instance. + step_nodes: + These nodes belong to the steps of the pipeline, ordered by their index (including the extra inputs node & outputs node). + It will be changed if we try to expand this graph. + nodes: + A set of **all** nodes in the graph. + It will be changed if we try to expand this graph. + strict_order: + If true, we will treat inputs of `Set` hyperparameters as a list. + Otherwise we will try to sort their contents so the orders of their contents are not important. + only_control_hyperparams: + If true, hyperparameters that are not of the ``ControlParameter`` semantic type will not be included + in the graph representation of this pipeline's primitive steps. 
+ """ + + pipeline: Pipeline + step_nodes: typing.List[OrderedNode] + nodes: typing.Set[OrderedNode] + strict_order: bool + only_control_hyperparams: bool + + def __init__(self, pipeline: Pipeline, *, strict_order: bool, only_control_hyperparams: bool) -> None: + self.pipeline = pipeline + self.strict_order = strict_order + self.only_control_hyperparams = only_control_hyperparams + + self.step_nodes: typing.List[OrderedNode] = [] + self._nodes_reverse_dict: typing.Dict[int, OrderedNode] = {} + self._nodes_outputs_reverse_dict: typing.Dict[str, typing.Tuple[OrderedNode, OutputIndex]] = {} + + self.inputs_node = InputsNode(pipeline.inputs) + self.outputs_node = OutputsNode(pipeline.outputs) + + self.step_nodes.append(self.inputs_node) + self.step_nodes.extend(self._convert_step_to_node(step) for step in pipeline.steps) + self.step_nodes.append(self.outputs_node) + + self.nodes: typing.Set[OrderedNode] = set(self.step_nodes) + + # Build reversed mappings. + for node in self.step_nodes: + self._update_references(node) + + # Build the DAG. + for node in self.step_nodes: + self._resolve_references(node) + + def _convert_step_to_node(self, step: StepBase) -> OrderedNode: + node: OrderedNode + if isinstance(step, PrimitiveStep): + node = PrimitiveNode(step, strict_order=self.strict_order, only_control_hyperparams=self.only_control_hyperparams) + elif isinstance(step, PlaceholderStep): + node = PlaceholderNode(step) + elif isinstance(step, SubpipelineStep): + node = SubpipelineNode(step, strict_order=self.strict_order, only_control_hyperparams=self.only_control_hyperparams) + else: + # New type of steps should be added here. + raise NotImplementedError("Step type={t} is not supported.".format(t=type(step))) + return node + + def _update_references(self, node: OrderedNode) -> None: + for output_index, output_id in enumerate(node.output_reference_names()): + self._nodes_outputs_reverse_dict[output_id] = (node, output_index) + self._nodes_reverse_dict[node.reference_name()] = node + + def _resolve_references(self, node: OrderedNode) -> None: + node.resolve_input_references(self._nodes_outputs_reverse_dict) + if isinstance(node, PrimitiveNode): + node.resolve_step_references(self._nodes_reverse_dict) + + def body_nodes(self) -> typing.Set[OrderedNode]: + """ + Return all nodes expect the inputs node and outputs node in the graph. + """ + + return self.nodes - {self.inputs_node, self.outputs_node} + + def expand_node(self, node: OrderedNode, graph: PD) -> None: + """ + Replace a node with a graph. + """ + + assert node in self.nodes + + # Update node records. + loc = self.step_nodes.index(node) + self.step_nodes = self.step_nodes[:loc] + graph.step_nodes[1:-1] + self.step_nodes[loc + 1:] + self.nodes.remove(node) + self.nodes.update(graph.body_nodes()) + + # Join nodes. + graph.inputs_node.join(node) + node.join(graph.outputs_node) + + def expand_subpipelines(self, recursive: bool = True) -> None: + """ + Extract all nodes inside a subpipeline's graph and integrate them into this graph. + + Parameters + ---------- + recursive: + If true, we will expand subpipelines of all depth (that is, subpipelines of subpipelines). + """ + + # Pick up subpipeline nodes into a list because expanding nodes will change the graph. 
+ subpipelines: typing.List[SubpipelineNode] = [node for node in self.nodes if isinstance(node, SubpipelineNode)] + for subpipeline_node in subpipelines: + subgraph: typing.Optional[PipelineDAG] = subpipeline_node.graph() + if subgraph is not None: + if recursive: + subgraph.expand_subpipelines(recursive=recursive) + self.expand_node(subpipeline_node, subgraph) + + +class PipelineHasher: + """ + Hash helper for pipelines. + + This algorithm checks if the two pipelines are equal in the sense of isomorphism by solving a graph isomorphism + problem. The general graph isomorphism problem is known to be neither P nor NP-complete. However, + our pipelines are DAGs so we could have an algorithm to check its isomorphism in polynomial time. + + The complexity of this algorithm is around :math:`O((V + E)logV)`, where :math:`V` is the number of steps in the + pipeline and :math:`E` is the number of output references. + + The algorithm follows these steps: + + 1. Construct a DAG from the given pipeline. A directed edge is pointed from A to B if A depends on B directly. + 2. Perform topological sort on the DAG using DFS. Nodes with same topological order are put into the same layer. + 3. Using a greedy algorithm to get 'global' orders of nodes. + It sorts the nodes in the same layer by making use of the global order of nodes they depend on. + 4. Get a unique, hashable & comparable tuple representing the structure of the pipeline according to the global order of nodes. + It also provides a unique representation of the equivalence class of a pipeline in the sense of isomorphism. + + And about supporting new steps, one should extend PipelineDAG._convert_step_to_node`. + + Attributes + ---------- + pipeline: + The associated pipeline instance. + graph: + The graph representation of the pipeline. + strict_order: + If true, we will treat inputs of `Set` hyperparameters as a list, and the order of primitives are determined by their step indices. + Otherwise we will try to sort contents of `Set` hyperparameters so the orders of their contents are not important, + and we will try topological sorting to determine the order of nodes. + """ + + pipeline: Pipeline + graph: PipelineDAG + strict_order: bool + + def __init__(self, pipeline: Pipeline, strict_order: bool = False, only_control_hyperparams: bool = False) -> None: + self.pipeline = pipeline + self.strict_order = strict_order + self.graph = PipelineDAG(pipeline, strict_order=strict_order, only_control_hyperparams=only_control_hyperparams) + self.graph.expand_subpipelines(recursive=True) + + self._hash: typing.Optional[int] = None + self._representation: typing.Optional[typing.Tuple] = None + self._layers: typing.List[typing.List[OrderedNode]] = [[self.graph.inputs_node]] + + self._unordered_nodes: typing.Set[OrderedNode] = set() + + def _dfs_topological_ordering(self, node: OrderedNode) -> OrderedNode: + for parent, output_index in node.inputs.values(): + if parent in self._unordered_nodes: + self._dfs_topological_ordering(parent) + node.topological_order = max(node.topological_order, parent.topological_order + 1) + + self._unordered_nodes.remove(node) + + # Classify it into layers. + while len(self._layers) < node.topological_order + 1: + self._layers.append([]) + self._layers[node.topological_order].append(node) + + return node + + def _global_ordering(self) -> None: + global_order = -1 + for layer in self._layers: + for node in layer: + node.frozen = True # Enable cache so we can be much faster in comparison. 
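+            # Greedy global ordering (step 3 of the algorithm): within each layer, sort nodes by
+            # their equivalence-class representation; nodes with identical representations share
+            # the same global order, so the result is symmetric for isomorphic pipelines.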
+ layer.sort(key=lambda x: x.unique_equivalence_class_repr()) + last = None + for j, node in enumerate(layer): + # Keep symmetric. Nodes with same local_order should have same global_order. + if node.unique_equivalence_class_repr() != last: + global_order += 1 + last = node.unique_equivalence_class_repr() + node.global_order = global_order + + def unique_equivalence_class_repr(self) -> typing.Tuple: + """ + Get the unique representation of the equivalence class of the pipeline in the sense of isomorphism. + """ + + if self._representation is None: + if self.strict_order: + for i, node in enumerate(self.graph.step_nodes): + node.topological_order = i + node.global_order = i + self._representation = tuple(node.unique_equivalence_class_repr() for node in self.graph.step_nodes) + else: + self._unordered_nodes = self.graph.nodes.copy() + self._unordered_nodes.remove(self.graph.inputs_node) + # Perform topological sort. + while self._unordered_nodes: + node = next(iter(self._unordered_nodes)) # Retrieve an item without deleting it. + self._dfs_topological_ordering(node) + + self._global_ordering() + self._representation = tuple(node.unique_equivalence_class_repr() for layer in self._layers for node in layer) + + return self._representation + + def __hash__(self) -> int: + if self._hash is None: + self._hash = hash(self.unique_equivalence_class_repr()) + return self._hash + + +def get_pipeline( + pipeline_path: str, *, strict_resolving: bool = False, strict_digest: bool = False, + pipeline_search_paths: typing.Sequence[str] = None, respect_environment_variable: bool = True, load_all_primitives: bool = True, + resolver_class: typing.Type[Resolver] = Resolver, pipeline_class: typing.Type[Pipeline] = Pipeline, +) -> Pipeline: + resolver = resolver_class( + strict_resolving=strict_resolving, strict_digest=strict_digest, pipeline_search_paths=pipeline_search_paths, + respect_environment_variable=respect_environment_variable, load_all_primitives=load_all_primitives, + ) + + if os.path.exists(pipeline_path): + with utils.open(pipeline_path, 'r', encoding='utf8') as pipeline_file: + if pipeline_path.endswith('.yml') or pipeline_path.endswith('.yaml'): + return pipeline_class.from_yaml(pipeline_file, resolver=resolver, strict_digest=strict_digest) + elif pipeline_path.endswith('.json'): + return pipeline_class.from_json(pipeline_file, resolver=resolver, strict_digest=strict_digest) + else: + raise ValueError("Unknown file extension.") + else: + return resolver.get_pipeline({'id': pipeline_path}) + + +def describe_handler( + arguments: argparse.Namespace, *, resolver_class: typing.Type[Resolver] = None, + no_resolver_class: typing.Type[Resolver] = None, pipeline_class: typing.Type[Pipeline] = None, +) -> None: + if resolver_class is None: + resolver_class = Resolver + if no_resolver_class is None: + no_resolver_class = NoResolver + if pipeline_class is None: + pipeline_class = Pipeline + + if getattr(arguments, 'no_resolving', False): + resolver: Resolver = no_resolver_class() + else: + resolver = resolver_class( + strict_resolving=getattr(arguments, 'strict_resolving', False), + strict_digest=getattr(arguments, 'strict_digest', False), + pipeline_search_paths=getattr(arguments, 'pipeline_search_paths', []), + ) + + output_stream = getattr(arguments, 'output', sys.stdout) + + has_errored = False + + for pipeline_path in arguments.pipelines: + if getattr(arguments, 'list', False): + print(pipeline_path, file=output_stream) + + try: + with utils.open(pipeline_path, 'r', encoding='utf8') as pipeline_file: + 
if pipeline_path.endswith('.yml') or pipeline_path.endswith('.yaml') or pipeline_path.endswith('.yml.gz') or pipeline_path.endswith('.yaml.gz'): + pipeline = pipeline_class.from_yaml( + pipeline_file, + resolver=resolver, + strict_digest=getattr(arguments, 'strict_digest', False), + ) + elif pipeline_path.endswith('.json') or pipeline_path.endswith('.json.gz'): + pipeline = pipeline_class.from_json( + pipeline_file, + resolver=resolver, + strict_digest=getattr(arguments, 'strict_digest', False), + ) + else: + raise ValueError("Unknown file extension.") + except Exception as error: + if getattr(arguments, 'continue', False): + traceback.print_exc(file=output_stream) + print(f"Error parsing pipeline: {pipeline_path}", file=output_stream) + has_errored = True + continue + else: + raise Exception(f"Error parsing pipeline: {pipeline_path}") from error + + if getattr(arguments, 'check', True): + try: + pipeline.check( + allow_placeholders=getattr(arguments, 'allow_placeholders', False), + standard_pipeline=getattr(arguments, 'standard_pipeline', True), + ) + except Exception as error: + if getattr(arguments, 'continue', False): + traceback.print_exc(file=output_stream) + print(f"Error checking pipeline: {pipeline_path}", file=output_stream) + has_errored = True + continue + else: + raise Exception("Error checking pipeline: {pipeline_path}".format(pipeline_path=pipeline_path)) from error + + try: + if getattr(arguments, 'set_source_name', None) is not None: + if pipeline.source is None: + pipeline.source = {} + if arguments.set_source_name: + pipeline.source['name'] = arguments.set_source_name + elif 'name' in pipeline.source: + del pipeline.source['name'] + if not pipeline.source: + pipeline.source = None + + pipeline_description = pipeline.to_json_structure(canonical=True) + + if getattr(arguments, 'print', False): + pprint.pprint(pipeline_description, stream=output_stream) + else: + json.dump( + pipeline_description, + output_stream, + indent=(getattr(arguments, 'indent', 2) or None), + sort_keys=getattr(arguments, 'sort_keys', False), + allow_nan=False, + ) # type: ignore + output_stream.write('\n') + except Exception as error: + if getattr(arguments, 'continue', False): + traceback.print_exc(file=output_stream) + print(f"Error describing pipeline: {pipeline_path}", file=output_stream) + has_errored = True + continue + else: + raise Exception(f"Error describing pipeline: {pipeline_path}") from error + + if has_errored: + sys.exit(1) + + +if pyarrow_lib is not None: + pyarrow_lib._default_serialization_context.register_type( + Pipeline, 'd3m.pipeline', pickle=True, + ) + + +def main(argv: typing.Sequence) -> None: + raise exceptions.NotSupportedError("This CLI has been removed. 
Use \"python3 -m d3m pipeline describe\" instead.") + + +if __name__ == '__main__': + main(sys.argv) diff --git a/d3m/d3m/metadata/pipeline_run.py b/d3m/d3m/metadata/pipeline_run.py new file mode 100644 index 0000000..3262c29 --- /dev/null +++ b/d3m/d3m/metadata/pipeline_run.py @@ -0,0 +1,1683 @@ +import argparse +import collections +import copy +import datetime +import enum +import json +import logging +import os.path +import re +import sys +import traceback +import typing +import uuid +import yaml + +import dateparser # type: ignore +import git # type: ignore +import GPUtil # type: ignore + +import d3m +from d3m import container, environment_variables, exceptions, utils, types +from d3m.metadata import base as metadata_base, hyperparams as hyperparams_module, pipeline as pipeline_module, problem +from d3m.primitive_interfaces import base + +# See: https://gitlab.com/datadrivendiscovery/d3m/issues/66 +try: + from pyarrow import lib as pyarrow_lib # type: ignore +except ModuleNotFoundError: + pyarrow_lib = None + +__all__ = ('PipelineRun', 'User', 'RuntimeEnvironment') + +logger = logging.getLogger(__name__) + +DOCKER_MAC_ADDRESS_MASK = 0x0242ac110000 +PROC_INFO_RE = re.compile(r'^([^:]+?)\s*:\s*(.*)$') +PROC_MEMORY_PATH = '/proc/meminfo' +PROC_CPU_PATH = '/proc/cpuinfo' +PROC_CPU_MODEL_NAME_KEY = 'model name' +PROC_CPU_PHYSICAL_ID_KEY = 'physical id' +PROC_CPU_CORES_KEY = 'cpu cores' +PROC_TOTAL_MEMORY_KEY = 'MemTotal' +CGROUP_MEMORY_LIMIT_PATH = '/sys/fs/cgroup/memory/memory.limit_in_bytes' +CGROUP_CPU_SHARES_PATH = '/sys/fs/cgroup/cpu/cpu.shares' +CGROUP_CPU_CFS_PERIOD_US_PATH = '/sys/fs/cgroup/cpu/cpu.cfs_period_us' +CGROUP_CPU_CFS_QUOTA_US_PATH = '/sys/fs/cgroup/cpu/cpu.cfs_quota_us' + +WORKER_ID_NAMESPACE = uuid.UUID('2e4b9ab7-2207-4975-892b-0e01bf95babf') + +# Comma because we unpack the list of validators returned from "load_schema_validators". 
+PIPELINE_RUN_SCHEMA_VALIDATOR, = utils.load_schema_validators(metadata_base.SCHEMAS, ('pipeline_run.json',)) + +PIPELINE_RUN_SCHEMA_VERSION = 'https://metadata.datadrivendiscovery.org/schemas/v0/pipeline_run.json' + + +class User(dict): + def __init__(self, id_: str, chosen: bool = False, rationale: str = None) -> None: + super().__init__() + + self['id'] = id_ + self['chosen'] = chosen + + if rationale is not None: + self['rationale'] = rationale + + @classmethod + def _yaml_representer(cls, dumper: yaml.Dumper, data: typing.Any) -> typing.Any: + return dumper.represent_dict(data) + + +utils.yaml_add_representer(User, User._yaml_representer) + + +class PipelineRunStep: + def __init__( + self, step_type: metadata_base.PipelineStepType, start: str, environment: typing.Dict[str, typing.Any] = None + ) -> None: + self.type = step_type + self.status: typing.Dict[str, typing.Any] = {} + self.start: str = start + self.end: str = None + self.environment = environment + + def to_json_structure(self) -> typing.Dict: + if self.start is None: + raise exceptions.InvalidStateError("Start timestamp not set.") + + if self.end is None: + raise exceptions.InvalidStateError("End timestamp not set.") + + if 'state' not in self.status: + raise exceptions.InvalidStateError("Status not set.") + + json_structure = { + 'type': self.type.name, + 'status': self.status, + 'start': self.start, + 'end': self.end + } + + if self.environment is not None: + json_structure['environment'] = self.environment + + return json_structure + + def set_successful(self, message: str = None) -> None: + self.status['state'] = metadata_base.PipelineRunStatusState.SUCCESS.name + if message is not None and message: + self.status['message'] = message + + def set_failed(self, message: str = None) -> None: + self.status['state'] = metadata_base.PipelineRunStatusState.FAILURE.name + if message is not None and message: + self.status['message'] = message + + def set_end_timestamp(self) -> None: + self.end = utils.datetime_for_json(datetime.datetime.now(datetime.timezone.utc)) + + +class PipelineRunPrimitiveStep(PipelineRunStep): + def __init__( + self, step: pipeline_module.PrimitiveStep, start: str, environment: typing.Dict[str, typing.Any] = None, + ) -> None: + super().__init__( + step_type=metadata_base.PipelineStepType.PRIMITIVE, + start=start, + environment=environment + ) + + self.hyperparams: hyperparams_module.Hyperparams = None + self.pipeline_hyperparams: typing.Set[str] = None + self.random_seed: typing.Optional[int] = None + self.method_calls: typing.List[typing.Dict[str, typing.Any]] = [] + self.arguments = step.arguments + + def to_json_structure(self) -> typing.Dict: + json_structure = super().to_json_structure() + + # Validate that the Method calls are finished, and they have status. 
+ for method_call in self.method_calls: + if 'end' not in method_call: + raise exceptions.InvalidStateError("End timestamp not set.") + if 'status' not in method_call: + raise exceptions.InvalidStateError("Status not set.") + + if self.method_calls: + json_structure['method_calls'] = self.method_calls + + if self.random_seed is not None: + json_structure['random_seed'] = self.random_seed + + hyperparams_json_structure = self._hyperparams_to_json_structure() + if hyperparams_json_structure is not None: + json_structure['hyperparams'] = hyperparams_json_structure + + return json_structure + + def _hyperparams_to_json_structure(self) -> typing.Optional[typing.Dict]: + if self.hyperparams is None: + return None + + hyperparams_json = {} + + for hyperparameter_name, value in self.hyperparams.items(): + if hyperparameter_name in self.pipeline_hyperparams: + continue + + hyperparams_json[hyperparameter_name] = { + 'type': metadata_base.ArgumentType.VALUE.name, + 'data': self.hyperparams.configuration[hyperparameter_name].value_to_json_structure(value), + } + + if hyperparams_json: + return hyperparams_json + else: + return None + + def add_method_call( + self, method_name: str, *, runtime_arguments: typing.Dict = None, + environment: typing.Dict[str, typing.Any] = None + ) -> int: + """ + Returns + ------- + The id of the method call. + """ + + if runtime_arguments is None: + runtime_arguments = {} + else: + # We convert everything directly to json structure. + def recurse(item: typing.Any) -> typing.Any: + if isinstance(item, enum.Enum): + return item.name + elif not isinstance(item, typing.Dict): + return item + else: + _json_structure = {} + for key, value in item.items(): + _json_structure[key] = recurse(value) + return _json_structure + + runtime_arguments = recurse(runtime_arguments) + + if method_name == '__init__' and runtime_arguments: + raise exceptions.InvalidArgumentValueError( + f'MethodCall with method `__init__` cannot have arguments. ' + f'Hyper-parameters are the arguments to `__init__`.' + ) + + method_call: typing.Dict[str, typing.Any] = { + 'name': method_name, + } + + if runtime_arguments: + method_call['arguments'] = runtime_arguments + + # we store everything as json structure. + if environment is not None: + method_call['environment'] = environment + + self.method_calls.append(method_call) + return len(self.method_calls) - 1 + + def set_method_call_start_timestamp(self, method_call_id: int) -> None: + self.method_calls[method_call_id]['start'] = utils.datetime_for_json(datetime.datetime.now()) + + def set_method_call_end_timestamp(self, method_call_id: int) -> None: + if 'start' not in self.method_calls[method_call_id]: + raise exceptions.InvalidStateError("Start timestamp not set.") + self.method_calls[method_call_id]['end'] = utils.datetime_for_json(datetime.datetime.now()) + + def set_method_call_result_metadata(self, method_call_id: int, result: typing.Union[base.CallResult, base.MultiCallResult]) -> None: + metadata = None + if isinstance(result, base.CallResult): + if result.value is not None and isinstance(result.value, types.Container): + metadata = { + # TODO: Should we use "to_internal_json_structure" here? + 'value': result.value.metadata.to_json_structure() + } + elif isinstance(result, base.MultiCallResult): + metadata = { + # TODO: Should we use "to_internal_json_structure" here? 
+ produce_method_name: value.metadata.to_json_structure() + for produce_method_name, value in result.values.items() + if value is not None and isinstance(value, types.Container) + } + + # check if metadata is empty + if metadata is not None: + for key, value in metadata.items(): + if value is not None: + self.method_calls[method_call_id]['metadata'] = metadata + break + + def set_method_call_successful(self, method_call_id: int, message: str = None) -> None: + self.method_calls[method_call_id]['status'] = { + 'state': metadata_base.PipelineRunStatusState.SUCCESS.name, + } + if message is not None and message: + self.method_calls[method_call_id]['status']['message'] = message + + def set_method_call_failed(self, method_call_id: int, message: str = None) -> None: + self.method_calls[method_call_id]['status'] = { + 'state': metadata_base.PipelineRunStatusState.FAILURE.name, + } + if message is not None and message: + self.method_calls[method_call_id]['status']['message'] = message + + def get_method_call_logging_callback(self, method_call_id: int) -> typing.Callable: + if 'logging' not in self.method_calls[method_call_id]: + self.method_calls[method_call_id]['logging'] = [] + return self.method_calls[method_call_id]['logging'].append + + +class PipelineRunSubpipelineStep(PipelineRunStep): + def __init__(self, start: str, random_seed: int, environment: typing.Dict[str, typing.Any] = None) -> None: + super().__init__( + step_type=metadata_base.PipelineStepType.SUBPIPELINE, + start=start, + environment=environment, + ) + + self.random_seed = random_seed + self.steps: typing.List[typing.Dict] = [] + + def to_json_structure(self) -> typing.Dict: + json_structure = super().to_json_structure() + json_structure['random_seed'] = self.random_seed + if self.steps: + json_structure['steps'] = self.steps + return json_structure + + def add_step(self, step: typing.Dict) -> None: + self.steps.append(step) + + +class PipelineRun: + STEPS = 'steps' + METHOD_CALLS = 'method_calls' + + def __init__( + self, pipeline: pipeline_module.Pipeline, problem_description: problem.Problem = None, *, + phase: metadata_base.PipelineRunPhase, context: metadata_base.Context, + environment: typing.Dict[str, typing.Any], random_seed: int, previous_pipeline_run: 'PipelineRun' = None, + is_standard_pipeline: bool = False, users: typing.Sequence[User] = None, + ) -> None: + self.schema = PIPELINE_RUN_SCHEMA_VERSION + + self.pipeline = { + 'id': pipeline.id, + 'digest': pipeline.get_digest(), + } + + self.datasets: typing.List[typing.Dict[str, typing.Any]] = [] + + self.problem: typing.Dict[str, typing.Any] = None + if problem_description is not None: + self._set_problem(problem_description) + + self.steps: typing.List[PipelineRunStep] = [] + self.status: typing.Dict[str, typing.Any] = {} + self.start: str = None + self.end: str = None + + self.run: typing.Dict[str, typing.Any] = { + 'phase': phase.name, + 'is_standard_pipeline': is_standard_pipeline, + } + self.context = context + self.previous_pipeline_run = previous_pipeline_run + + if users is None: + self.users: typing.List[User] = [] + else: + self.users = list(users) + + self.environment = environment + self.random_seed = random_seed + self.is_standard_pipeline = is_standard_pipeline + + self._components: typing.Dict[str, typing.Any] = {} + self._step_start_timestamps: typing.Dict[int, str] = {} + + def _to_json_structure(self) -> typing.Dict: + if self.start is None: + raise exceptions.InvalidStateError("Start timestamp not set.") + + if self.end is None: + raise 
exceptions.InvalidStateError("End timestamp not set.") + + if 'state' not in self.status: + raise exceptions.InvalidStateError("Status not set.") + + # Scoring datasets are set only when scoring is used without data preparation. + if 'scoring' in self.run: + if 'data_preparation' in self.run: + if 'datasets' in self.run['scoring']: + raise exceptions.InvalidStateError( + "Scoring datasets must not be provided when scoring is used with data preparation pipeline.", + ) + elif 'datasets' not in self.run['scoring']: + raise exceptions.InvalidStateError( + "Scoring datasets must be provided when scoring is used without data preparation pipeline.", + ) + + json_structure = { + 'schema': self.schema, + 'pipeline': self.pipeline, + 'datasets': self.datasets, + 'status': self.status, + 'start': self.start, + 'end': self.end, + 'run': self.run, + 'environment': self.environment, + 'random_seed': self.random_seed, + } + + if self.steps: + json_structure['steps'] = [step.to_json_structure() for step in self.steps] + + if self.previous_pipeline_run is not None: + json_structure['previous_pipeline_run'] = { + 'id': self.previous_pipeline_run.get_id() + } + + if self.context is not None: + json_structure['context'] = self.context.name + + if self.problem is not None: + json_structure['problem'] = self.problem + + if self.users: + json_structure['users'] = self.users + + json_structure['id'] = utils.compute_hash_id(json_structure) + + return json_structure + + def to_json_structure(self) -> typing.Dict: + # We raise exception here instead of waiting for schema validation to fails to provide a more helpful error message. + # See: https://gitlab.com/datadrivendiscovery/d3m/issues/355 + if not self.is_standard_pipeline and not self.datasets: + raise exceptions.InvalidStateError("Pipeline run for a non-standard pipeline cannot be converted to a JSON structure.") + + # TODO: Remove "utils.to_json_structure" once sure that "_to_json_structure" really returns a JSON structure. 
+ json_structure = utils.to_json_structure(self._to_json_structure()) + + PIPELINE_RUN_SCHEMA_VALIDATOR.validate(json_structure) + + return json_structure + + def to_yaml(self, file: typing.IO[typing.Any], *, appending: bool = False, **kwargs: typing.Any) -> typing.Optional[str]: + obj = self.to_json_structure() + + if appending and 'explicit_start' not in kwargs: + kwargs['explicit_start'] = True + + return utils.yaml_dump(obj, stream=file, **kwargs) + + def add_input_dataset(self, dataset: container.Dataset) -> None: + metadata = dataset.metadata.query(()) + self.datasets.append({ + 'id': metadata['id'], + 'digest': metadata['digest'], + }) + + def add_primitive_step(self, step: pipeline_module.PrimitiveStep) -> int: + if not isinstance(step, pipeline_module.PrimitiveStep): + raise exceptions.InvalidArgumentTypeError('step must be of type PrimitiveStep, not {}'.format(type(step))) + self.steps.append( + PipelineRunPrimitiveStep(step, self._step_start_timestamps[len(self.steps)]) + ) + return len(self.steps) - 1 + + def _get_primitive_step(self, primitive_step_id: int) -> PipelineRunPrimitiveStep: + if primitive_step_id >= len(self.steps): + raise exceptions.InvalidArgumentValueError('There does not exist a step with id {}'.format(primitive_step_id)) + + primitive_step = self.steps[primitive_step_id] + if not isinstance(primitive_step, PipelineRunPrimitiveStep): + raise exceptions.InvalidArgumentValueError('Step id {} does not refer to a PipelineRunPrimitiveStep'.format(primitive_step_id)) + + return primitive_step + + def set_primitive_step_hyperparams( + self, primitive_step_id: int, + hyperparams: hyperparams_module.Hyperparams, + pipeline_hyperparams: typing.Dict[str, typing.Dict], + ) -> None: + primitive_step = self._get_primitive_step(primitive_step_id) + primitive_step.hyperparams = hyperparams + primitive_step.pipeline_hyperparams = set(pipeline_hyperparams.keys()) + + def set_primitive_step_random_seed(self, primitive_step_id: int, random_seed: int) -> None: + primitive_step = self._get_primitive_step(primitive_step_id) + primitive_step.random_seed = random_seed + + def add_subpipeline_step(self, subpipeline_run: 'PipelineRun') -> int: + pipeline_run_subpipeline_step = PipelineRunSubpipelineStep( + self._step_start_timestamps[len(self.steps)], subpipeline_run.random_seed + ) + + for step_id, step in enumerate(subpipeline_run.steps): + step_json = step.to_json_structure() + pipeline_run_subpipeline_step.add_step(step_json) + state = step_json['status']['state'] + message = step_json['status'].get('message', None) + if state == metadata_base.PipelineRunStatusState.SUCCESS.name: + pipeline_run_subpipeline_step.set_successful(message) + elif state == metadata_base.PipelineRunStatusState.FAILURE.name: + message = 'Failed on subpipeline step {}:\n{}'.format(step_id, message) + pipeline_run_subpipeline_step.set_failed(message) + if message is not None and message: + self.status['message'] = message + else: + raise exceptions.UnexpectedValueError('unknown subpipeline status state: {}'.format(state)) + + self.steps.append(pipeline_run_subpipeline_step) + + return len(self.steps) - 1 + + def add_method_call_to_primitive_step( + self, primitive_step_id: int, method_name: str, *, + runtime_arguments: typing.Dict = None, environment: typing.Dict[str, typing.Any] = None + ) -> typing.Tuple[int, int]: + if runtime_arguments is None: + runtime_arguments = {} + + # TODO allow runtime arguments not specified in pipeline? 
+ primitive_step = self._get_primitive_step(primitive_step_id) + method_call_id = primitive_step.add_method_call( + method_name, runtime_arguments=runtime_arguments, environment=environment + ) + return (primitive_step_id, method_call_id) + + def get_method_call_logging_callback( + self, step_and_method_call_id: typing.Tuple[int, int] + ) -> typing.Callable: + step_id, method_call_id = step_and_method_call_id + primitive_step = self._get_primitive_step(step_id) + return primitive_step.get_method_call_logging_callback(method_call_id) + + def run_started(self) -> None: + self.start = utils.datetime_for_json(datetime.datetime.now(datetime.timezone.utc)) + + def _set_end_timestamp(self) -> None: + self.end = utils.datetime_for_json(datetime.datetime.now(datetime.timezone.utc)) + + def step_started(self, step_id: int) -> None: + self._step_start_timestamps[step_id] = utils.datetime_for_json(datetime.datetime.now(datetime.timezone.utc)) + + def method_call_started(self, step_and_method_call_id: typing.Tuple[int, int]) -> None: + step_id, method_call_id = step_and_method_call_id + primitive_step = self._get_primitive_step(step_id) + primitive_step.set_method_call_start_timestamp(method_call_id) + + def set_method_call_result_metadata( + self, step_and_method_call_id: typing.Tuple[int, int], + result: typing.Union[base.CallResult, base.MultiCallResult] + ) -> None: + step_id, method_call_id = step_and_method_call_id + primitive_step = self._get_primitive_step(step_id) + primitive_step.set_method_call_result_metadata(method_call_id, result) + + def run_successful(self, message: str = None) -> None: + self._set_end_timestamp() + self.status['state'] = metadata_base.PipelineRunStatusState.SUCCESS.name + if message is not None and message: + self.status['message'] = message + + def step_successful(self, step_id: int, message: str = None) -> None: + if step_id >= len(self.steps): + raise exceptions.InvalidArgumentValueError('There does not exist a step with id {}'.format(step_id)) + self.steps[step_id].set_end_timestamp() + self.steps[step_id].set_successful(message) + + def method_call_successful(self, step_and_method_call_id: typing.Tuple[int, int], message: str = None) -> None: + step_id, method_call_id = step_and_method_call_id + primitive_step = self._get_primitive_step(step_id) + primitive_step.set_method_call_end_timestamp(method_call_id) + primitive_step.set_method_call_successful(method_call_id, message) + + def run_failed(self, message: str = None) -> None: + self._set_end_timestamp() + self.status['state'] = metadata_base.PipelineRunStatusState.FAILURE.name + if message is not None and message: + self.status['message'] = message + + def step_failed(self, step_id: int, message: str = None) -> None: + if step_id >= len(self.steps): + return + self.steps[step_id].set_end_timestamp() + self.steps[step_id].set_failed(message) + + def method_call_failed(self, step_and_method_call_id: typing.Tuple[int, int], message: str = None) -> None: + step_id, method_call_id = step_and_method_call_id + if step_id >= len(self.steps): + return + primitive_step = self._get_primitive_step(step_id) + primitive_step.set_method_call_end_timestamp(method_call_id) + primitive_step.set_method_call_failed(method_call_id, message) + + def is_failed(self) -> bool: + return self.status['state'] == metadata_base.PipelineRunStatusState.FAILURE.name + + def _set_problem(self, problem_description: problem.Problem) -> None: + self.problem = { + 'id': problem_description['id'], + 'digest': problem_description.get_digest(), + } + 
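+ # The methods below record auxiliary information about the run: the
+ # cross-validation fold group, the data preparation and scoring pipeline runs,
+ # and the resulting scores and predictions.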
+ def set_fold_group(self, fold_group_id: uuid.UUID, fold: int) -> None: + self.run['fold_group'] = { + 'id': str(fold_group_id), + 'fold': fold, + } + + def set_data_preparation_pipeline_run( + self, data_preparation_pipeline_run: 'PipelineRun' + ) -> None: + if data_preparation_pipeline_run.start is None: + raise exceptions.InvalidArgumentValueError("Data preparation pipeline start timestamp argument not provided.") + + if data_preparation_pipeline_run.end is None: + raise exceptions.InvalidArgumentValueError("Data preparation pipeline end timestamp argument not provided.") + + self.run['data_preparation'] = { + 'pipeline': data_preparation_pipeline_run.pipeline, + 'steps': [step.to_json_structure() for step in data_preparation_pipeline_run.steps], + 'status': data_preparation_pipeline_run.status, + 'start': data_preparation_pipeline_run.start, + 'end': data_preparation_pipeline_run.end, + 'random_seed': data_preparation_pipeline_run.random_seed, + } + + if data_preparation_pipeline_run.is_failed(): + message = 'Data preparation pipeline failed:\n{}'.format( + data_preparation_pipeline_run.status['message'] + ) + self.status['state'] = metadata_base.PipelineRunStatusState.FAILURE.name + if message is not None and message: + self.status['message'] = message + + def set_scoring_pipeline_run( + self, scoring_pipeline_run: 'PipelineRun', scoring_datasets: typing.Sequence[typing.Any] = None, + ) -> None: + if scoring_pipeline_run.start is None: + raise exceptions.InvalidArgumentValueError("Scoring pipeline start timestamp argument not provided.") + + if scoring_pipeline_run.end is None: + raise exceptions.InvalidArgumentValueError("Scoring pipeline end timestamp argument not provided.") + + self.run['scoring'] = { + 'pipeline': scoring_pipeline_run.pipeline, + 'steps': [step.to_json_structure() for step in scoring_pipeline_run.steps], + 'status': scoring_pipeline_run.status, + 'start': scoring_pipeline_run.start, + 'end': scoring_pipeline_run.end, + 'random_seed': scoring_pipeline_run.random_seed, + } + + if scoring_datasets: + self.run['scoring']['datasets'] = [] + for dataset in scoring_datasets: + metadata = dataset.metadata.query(()) + self.run['scoring']['datasets'].append({ + 'id': metadata['id'], + 'digest': metadata['digest'], + }) + + if scoring_pipeline_run.is_failed(): + message = 'Scoring pipeline failed:\n{}'.format( + scoring_pipeline_run.status['message'] + ) + self.status['state'] = metadata_base.PipelineRunStatusState.FAILURE.name + if message is not None and message: + self.status['message'] = message + + def set_scores( + self, scores: container.DataFrame, metrics: typing.Sequence[typing.Dict], + ) -> None: + if not self.is_standard_pipeline: + raise exceptions.InvalidStateError("Setting scores for non-standard pipelines is not allowed.") + + json_scores = [] + + if 'normalized' in scores.columns: + columns = ['metric', 'value', 'normalized'] + else: + columns = ['metric', 'value'] + + for row in scores.loc[:, columns].itertuples(index=False, name=None): + metric, value = row[:2] + + json_scores.append( + { + # TODO: Why is "deepcopy" needed here? 
+ 'metric': copy.deepcopy(self._get_metric_description(metric, metrics)), + 'value': float(value), + }, + ) + + if len(row) == 3: + json_scores[-1]['normalized'] = float(row[2]) + + if not json_scores: + return + + if 'results' not in self.run: + self.run['results'] = {} + + if 'scores' not in self.run['results']: + self.run['results']['scores'] = json_scores + else: + raise exceptions.InvalidStateError("Scores already set for pipeline run.") + + def _get_metric_description(self, metric: str, performance_metrics: typing.Sequence[typing.Dict]) -> typing.Dict: + """ + Returns a metric description from a list of them, given metric. + + Parameters + ---------- + metric: + A metric name. + performance_metrics: + A list of performance metric descriptions requested for scoring. + + Returns + ------- + A metric description. + """ + + for performance_metric in performance_metrics: + if performance_metric['metric'] == metric: + metric_description = { + 'metric': performance_metric['metric'].name, + } + + if performance_metric.get('params', {}): + metric_description['params'] = performance_metric['params'] + + return metric_description + + return { + 'metric': metric, + } + + def set_predictions(self, predictions: container.DataFrame) -> None: + if not self.is_standard_pipeline: + raise exceptions.InvalidStateError("Setting predictions for non-standard pipelines is not allowed.") + + if not isinstance(predictions, container.DataFrame): + logger.warning("Unable to set predictions for pipeline run because predictions are not a DataFrame.") + return + + try: + json_predictions: typing.Dict[str, typing.List] = { + 'header': [], + 'values': [], + } + + column_names = [] + for column_index in range(len(predictions.columns)): + # We use column name from the DataFrame is metadata does not have it. This allows a bit more compatibility. + column_names.append(predictions.metadata.query_column(column_index).get('name', predictions.columns[column_index])) + + # "tolist" converts values to Python values and does not keep them as numpy.float64 or other special types. + json_predictions['values'].append(utils.to_json_structure(predictions.iloc[:, column_index].tolist())) + + json_predictions['header'] += column_names + + except Exception as error: + logger.warning("Unable to convert predictions to JSON structure for pipeline run.", exc_info=error) + return + + if 'results' not in self.run: + self.run['results'] = {} + + if 'predictions' not in self.run['results']: + self.run['results']['predictions'] = json_predictions + else: + raise exceptions.InvalidStateError("Predictions already set for pipeline run.") + + def get_id(self) -> str: + return self._to_json_structure()['id'] + + @classmethod + def json_structure_equals(cls, pipeline_run1: typing.Dict, pipeline_run2: typing.Dict) -> bool: + """ + Checks whether two pipeline runs in a JSON structure are equal. + This ignores the pipeline run id and all timestamps. 
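+ The execution environment and any captured logging output are ignored as well.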
+ """ + + if not isinstance(pipeline_run1, collections.Mapping) or not isinstance(pipeline_run2, collections.Mapping): + raise exceptions.InvalidArgumentTypeError("Pipeline run arguments must be dicts.") + + return utils.json_structure_equals(pipeline_run1, pipeline_run2, {'id', 'start', 'end', 'environment', 'logging'}) + + +class RuntimeEnvironment(dict): + def __init__( + self, *, + worker_id: str = None, + cpu_resources: typing.Dict[str, typing.Any] = None, + memory_resources: typing.Dict[str, typing.Any] = None, + gpu_resources: typing.Dict[str, typing.Any] = None, + reference_benchmarks: typing.Sequence[str] = None, + reference_engine_version: str = None, + engine_version: str = None, + base_docker_image: typing.Dict[str, str] = None, + docker_image: typing.Dict[str, str] = None, + ) -> None: + """ + Create an instance of the runtime environment description in which a pipeline is run. + + All values stored in an instance should be JSON compatible. + + Parameters + ---------- + worker_id: + A globally unique identifier for the machine on which the runtime is running. + The idea is that multiple runs on the same system can be grouped together. + If not provided, `uuid.getnode()` is used to obtain an identifier. + cpu_resources: + A description of the CPU resources available in this environment. + memory_resources: + A description of the memory resources available in this environment. + gpu_resources: + A description of the GPU resources available in this environment. + reference_benchmarks: + A list of ids of standard and optional additional benchmarks which were run in the same or + equivalent RuntimeEnvironment. The timing characteristics of these benchmarks can be + expected to be the same as anything timed in this RuntimeEnvironment. + reference_engine_version: + A git commit hash or version number for the reference engine used. If subclassing the + reference engine, list it here. + engine_version: + A git commit hash or version number for the engine used. This is primarily useful for the + author. If using the reference engine directly, list its git commit hash or version number + here as well as in the reference_engine_version. + base_docker_image: + If the engine was run in a public or known docker container, specify the base docker image + description here. + docker_image: + If the engine was run in a public or known docker container, specify the actual docker + image description here. This is primarily useful for the author. 
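+
+ For example, a description with an explicit worker identifier, leaving the
+ remaining values to be detected automatically, could be constructed as
+ `RuntimeEnvironment(worker_id='example-worker')` (a minimal sketch; the
+ identifier shown is illustrative).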
+ """ + + super().__init__() + + if worker_id is None: + worker_id = self._get_worker_id() + self['worker_id'] = worker_id + + resources = {} + if cpu_resources is None: + cpu_resources = self._get_cpu_resources() + if cpu_resources is not None: + resources['cpu'] = cpu_resources + if memory_resources is None: + memory_resources = self._get_memory_resources() + if memory_resources is not None: + resources['memory'] = memory_resources + if gpu_resources is None: + gpu_resources = self._get_gpu_resources() + if gpu_resources is not None: + resources['gpu'] = gpu_resources + + if resources: + self['resources'] = resources + + if reference_benchmarks is not None: + self['reference_benchmarks'] = reference_benchmarks + + if reference_engine_version is None: + reference_engine_version = self._get_reference_engine_version() + self['reference_engine_version'] = reference_engine_version + + if engine_version is None: + engine_version = self['reference_engine_version'] + self['engine_version'] = engine_version + + if base_docker_image is None: + base_docker_image = self._get_docker_image( + environment_variables.D3M_BASE_IMAGE_NAME, + environment_variables.D3M_BASE_IMAGE_DIGEST, + ) + if base_docker_image is not None: + self['base_docker_image'] = base_docker_image + + if docker_image is None: + docker_image = self._get_docker_image( + environment_variables.D3M_IMAGE_NAME, + environment_variables.D3M_IMAGE_DIGEST, + ) + if docker_image is not None: + self['docker_image'] = docker_image + + # Here we assume that all values stored in "self" are JSON compatible. + self['id'] = utils.compute_hash_id(self) + + @classmethod + def _get_reference_engine_version(cls) -> str: + try: + # Get the git commit hash of the d3m repository. + path = os.path.abspath(d3m.__file__).rsplit('d3m', 1)[0] + return utils.current_git_commit( + path=path, search_parent_directories=False, + ) + except git.exc.InvalidGitRepositoryError: + return d3m.__version__ + + @classmethod + def _get_worker_id(cls) -> str: + """ + Compute the worker id. + """ + + mac_address = uuid.getnode() + + if mac_address >> 16 == DOCKER_MAC_ADDRESS_MASK >> 16: + # Docker generates MAC addresses in the range 02:42:ac:11:00:00 to 02:42:ac:11:ff:ff + # if one is not provided in the configuration + logger.warning( + "'worker_id' was generated using the MAC address inside Docker " + "container and is not a reliable compute resource identifier." + ) + elif (mac_address >> 40) % 2 == 1: + # uuid.getnode docs state: + # If all attempts to obtain the hardware address fail, we choose a + # random 48-bit number with its eighth bit set to 1 as recommended + # in RFC 4122. + logger.warning( + "'worker_id' was generated using a random number because the " + "MAC address could not be determined." + ) + + return str(uuid.uuid5(WORKER_ID_NAMESPACE, json.dumps(mac_address, sort_keys=True))) + + @classmethod + def _get_docker_image(cls, image_name_env_var: str, image_digest_env_var: str) -> typing.Optional[typing.Dict]: + """ + Returns the docker image description. 
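+ Returns None if neither environment variable provides a non-empty value.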
+ """ + + docker_image = {} + + if image_name_env_var not in os.environ: + logger.warning('Docker image environment variable not set: %(variable_name)s', { + 'variable_name': image_name_env_var, + }) + elif os.environ[image_name_env_var]: + docker_image['image_name'] = os.environ[image_name_env_var] + + if image_digest_env_var not in os.environ: + logger.warning('Docker image environment variable not set: %(variable_name)s', { + 'variable_name': image_digest_env_var, + }) + elif os.environ[image_digest_env_var]: + docker_image['image_digest'] = os.environ[image_digest_env_var] + + if docker_image: + return docker_image + else: + return None + + @classmethod + def _get_configured(cls, environment_variable: str) -> typing.Optional[str]: + if environment_variable not in os.environ: + logger.warning('Configuration environment variable not set: %(variable_name)s', { + 'variable_name': environment_variable, + }) + return None + elif os.environ[environment_variable]: + return os.environ[environment_variable] + else: + return None + + # TODO: Split into more methods. + @classmethod + def _get_cpu_resources(cls) -> typing.Optional[typing.Dict[str, typing.Any]]: + cpu_resource: typing.Dict[str, typing.Any] = {} + + cpu_info: typing.Sequence[typing.Dict[str, str]] = [] + try: + cpu_info = cls._read_info_file(PROC_CPU_PATH) + except Exception as error: + logger.warning( + "Failed to get CPU information from '%(proc_cpu_path)s': %(error)s", + { + 'proc_cpu_path': PROC_CPU_PATH, + 'error': error, + }, + ) + + # devices + if cpu_info: + cpu_resource['devices'] = [ + { + 'name': cpu[PROC_CPU_MODEL_NAME_KEY], + } + for cpu in cpu_info + ] + + # physical_present + if cpu_info: + physical_ids: typing.Set[str] = set() + physical_present = 0 + for cpu in cpu_info: + physical_id = cpu[PROC_CPU_PHYSICAL_ID_KEY] + if physical_id in physical_ids: + continue + physical_ids.add(physical_id) + physical_present += int(cpu[PROC_CPU_CORES_KEY]) + cpu_resource['physical_present'] = physical_present + + # logical_present + if cpu_info: + cpu_resource['logical_present'] = len(cpu_info) + + # configured_available + configured_available = cls._get_configured( + environment_variables.D3M_CPU, + ) + if configured_available is not None: + cpu_resource['configured_available'] = configured_available + + # constraints + constraints = {} + try: + with open(CGROUP_CPU_SHARES_PATH, 'r', encoding='ascii') as file: + cpu_shares = int(file.read().strip()) + if cpu_shares < 1e5: + constraints['cpu_shares'] = cpu_shares + except Exception as error: + logger.warning( + "Failed to get CPU information from '%(cgroup_cpu_shares_path)s': %(error)s", + { + 'cgroup_cpu_shares_path': CGROUP_CPU_SHARES_PATH, + 'error': error, + }, + ) + try: + with open(CGROUP_CPU_CFS_PERIOD_US_PATH, 'r', encoding='ascii') as file: + cfs_period_us = int(file.read().strip()) + constraints['cfs_period_us'] = cfs_period_us + except Exception as error: + logger.warning( + "Failed to get CPU information from '%(cgroup_cpu_cfs_period_us_path)s': %(error)s", + { + 'cgroup_cpu_cfs_period_us_path': CGROUP_CPU_CFS_PERIOD_US_PATH, + 'error': error, + }, + ) + try: + with open(CGROUP_CPU_CFS_QUOTA_US_PATH, 'r', encoding='ascii') as file: + cfs_quota_us = int(file.read().strip()) + if cfs_quota_us >= 0: + constraints['cfs_quota_us'] = cfs_quota_us + except Exception as error: + logger.warning( + "Failed to get CPU information from '%(cgroup_cpu_cfs_quota_us_path)s': %(error)s", + { + 'cgroup_cpu_cfs_quota_us_path': CGROUP_CPU_CFS_QUOTA_US_PATH, + 'error': error, + }, + ) + + 
if 'cfs_period_us' in constraints and 'cfs_quota_us' not in constraints: + del constraints['cfs_period_us'] + + if constraints: + cpu_resource['constraints'] = constraints + + if cpu_resource: + return cpu_resource + else: + return None + + @classmethod + def _read_info_file(cls, path: str) -> typing.Sequence[typing.Dict[str, str]]: + info: typing.List[typing.Dict[str, str]] = [{}] + + with open(path, 'r', encoding='ascii') as file: + for line in file: + line = line.strip() + if not line: + info.append({}) + continue + + match = PROC_INFO_RE.match(line) + if match is None: + raise ValueError("Error parsing.") + + key, value = match.groups() + info[-1][key] = value + + if not info[-1]: + del info[-1] + + return info + + # TODO: Split into more methods. + # TODO: Get memory devices. Consider lshw. + @classmethod + def _get_memory_resources(cls) -> typing.Optional[typing.Dict[str, typing.Any]]: + memory_resource: typing.Dict[str, typing.Any] = {} + + # total_memory (bytes) + try: + memory_info = cls._read_info_file(PROC_MEMORY_PATH)[0] + total_memory_kb = int(memory_info[PROC_TOTAL_MEMORY_KEY].split()[0]) + memory_resource['total_memory'] = total_memory_kb * 1024 + except Exception as error: + logger.warning( + "Failed to get memory information from '%(proc_memory_path)s': %(error)s", + { + 'proc_memory_path': PROC_MEMORY_PATH, + 'error': error, + }, + ) + + # configured_memory + configured_memory = cls._get_configured( + environment_variables.D3M_RAM, + ) + if configured_memory is not None: + memory_resource['configured_memory'] = configured_memory + + # constraints + constraints = {} + try: + with open(CGROUP_MEMORY_LIMIT_PATH, 'r', encoding='ascii') as file: + memory_limit = int(file.read().strip()) + if memory_limit < (sys.maxsize // 4096) * 4096: + constraints['memory_limit'] = memory_limit + except FileNotFoundError: + pass + except Exception as error: + logger.warning( + "Failed to get memory information from '%(cgroup_memory_limit_path)s': %(error)s", + { + 'cgroup_memory_limit_path': CGROUP_MEMORY_LIMIT_PATH, + 'error': error, + }, + ) + + if constraints: + memory_resource['constraints'] = constraints + + if memory_resource: + return memory_resource + else: + return None + + # TODO: Split into more methods. + # TODO: Get GPU constraints. + # TODO: Get GPU memory limit configuration. 
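+ # GPU devices are queried via GPUtil; GPUtil reports memory sizes in MiB, so the
+ # values below are converted to bytes by multiplying with 2**20.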
+ @classmethod + def _get_gpu_resources(cls) -> typing.Optional[typing.Dict[str, typing.Any]]: + gpu_resource: typing.Dict[str, typing.Any] = {} + + gpus: typing.List[GPUtil.GPU] = [] + try: + gpus = GPUtil.getGPUs() + except Exception as error: + logger.warning( + "Failed to get GPU information: %(error)s", + { + 'error': error, + }, + ) + + # devices + if gpus: + gpu_resource['devices'] = [ + { + 'name': gpu.name, + 'memory': int(gpu.memoryTotal) * 2**20, + } + for gpu in gpus + ] + + # total_memory (bytes) + if gpus: + total_memory_mib = sum(gpu.memoryTotal for gpu in gpus) + gpu_resource['total_memory'] = int(total_memory_mib) * 2**20 + + if gpu_resource: + return gpu_resource + else: + return None + + @classmethod + def _yaml_representer(cls, dumper: yaml.Dumper, data: typing.Any) -> typing.Any: + return dumper.represent_dict(data) + + +utils.yaml_add_representer(RuntimeEnvironment, RuntimeEnvironment._yaml_representer) + + +def _validate_pipeline_run_random_seeds(pipeline_run: typing.Dict) -> None: + if 'random_seed' not in pipeline_run: + raise exceptions.InvalidPipelineRunError("Pipeline run is missing a random seed.") + + if 'run' in pipeline_run: + if 'data_preparation' in pipeline_run['run'] and 'random_seed' not in pipeline_run['run']['data_preparation']: + raise exceptions.InvalidPipelineRunError("Data preparation pipeline run is missing a random seed.") + + if 'scoring' in pipeline_run['run'] and 'random_seed' not in pipeline_run['run']['scoring']: + raise exceptions.InvalidPipelineRunError("Scoring pipeline run is missing a random seed.") + + for step in pipeline_run.get('steps', []): + if step['type'] == 'SUBPIPELINE': + _validate_pipeline_run_random_seeds(step) + + +def _validate_pipeline_run_timestamps(pipeline_run: typing.Dict, parent_start: datetime.datetime = None, parent_end: datetime.datetime = None) -> None: + if 'start' not in pipeline_run: + raise exceptions.InvalidPipelineRunError("Pipeline run is missing a start timestamp.") + if 'end' not in pipeline_run: + raise exceptions.InvalidPipelineRunError("Pipeline run is missing an end timestamp.") + + start = dateparser.parse(pipeline_run['start'], settings={'TIMEZONE': 'UTC'}) + end = dateparser.parse(pipeline_run['end'], settings={'TIMEZONE': 'UTC'}) + + if start >= end: + raise exceptions.InvalidPipelineRunError("Pipeline run contains a start timestamp which occurs after the corresponding end timestamp.") + + if parent_start is not None and parent_end is not None: + if start <= parent_start or parent_end <= start: + raise exceptions.InvalidPipelineRunError("Pipeline run contains a start timestamp which occurs outside the parent timestamp range.") + + if end <= parent_start or parent_end <= end: + raise exceptions.InvalidPipelineRunError("Pipeline run contains an end timestamp which occurs outside the parent timestamp range.") + + for step in pipeline_run.get('steps', []): + for method_call in pipeline_run.get('method_calls', []): + _validate_pipeline_run_timestamps(method_call, start, end) + + _validate_pipeline_run_timestamps(step, start, end) + + if 'run' in pipeline_run: + if 'data_preparation' in pipeline_run['run']: + _validate_pipeline_run_timestamps(pipeline_run['run']['data_preparation']) + + if 'scoring' in pipeline_run['run']: + _validate_pipeline_run_timestamps(pipeline_run['run']['scoring']) + + +def _validate_success_step(step: typing.Dict) -> None: + if step['type'] == metadata_base.PipelineStepType.PRIMITIVE: + for method_call in step.get('method_calls', []): + if method_call['status']['state'] != 
metadata_base.PipelineRunStatusState.SUCCESS: + raise exceptions.InvalidPipelineRunError( + "Step with '{expected_status}' status has a method call with '{status}' status".format( + expected_status=metadata_base.PipelineRunStatusState.SUCCESS, + status=method_call['status']['state'], + ), + ) + elif step['type'] == metadata_base.PipelineStepType.SUBPIPELINE: + _recurse_success(step) + else: + raise exceptions.UnexpectedValueError("Invalid pipeline run step type: {step_type}".format(step_type=step['type'])) + + +def _validate_failure_step(step: typing.Dict) -> None: + if step['type'] == metadata_base.PipelineStepType.PRIMITIVE: + found_a_method_call_failure = False + for method_call in step.get('method_calls', []): + if found_a_method_call_failure: + raise exceptions.InvalidPipelineRunError( + "There exists a method call after a method call with '{status}' status.".format( + status=metadata_base.PipelineRunStatusState.FAILURE, + ), + ) + if method_call['status']['state'] == metadata_base.PipelineRunStatusState.FAILURE: + found_a_method_call_failure = True + elif step['type'] == metadata_base.PipelineStepType.SUBPIPELINE: + _recurse_failure(step) + else: + raise exceptions.UnexpectedValueError("Invalid pipeline run step type: {step_type}".format(step_type=step['type'])) + + +def _recurse_success(json_structure: typing.Dict) -> None: + if 'steps' not in json_structure: + raise exceptions.InvalidPipelineRunError("Successful pipeline run with missing steps.") + + for step in json_structure['steps']: + if step['status']['state'] != metadata_base.PipelineRunStatusState.SUCCESS: + raise exceptions.InvalidPipelineRunError( + "Pipeline run with '{expected_status}' status has a step with '{status}' status".format( + expected_status=metadata_base.PipelineRunStatusState.SUCCESS, + status=step['status']['state'], + ), + ) + + _validate_success_step(step) + + +def _recurse_failure(json_structure: typing.Dict) -> None: + found_a_step_failure = False + for step in json_structure.get('steps', []): + if found_a_step_failure: + raise exceptions.InvalidPipelineRunError( + "There exists a step after a step with '{status}' status.".format( + status=metadata_base.PipelineRunStatusState.FAILURE, + ), + ) + + if step['status']['state'] == metadata_base.PipelineRunStatusState.SUCCESS: + _validate_success_step(step) + elif step['status']['state'] == metadata_base.PipelineRunStatusState.FAILURE: + found_a_step_failure = True + _validate_failure_step(step) + + +def _validate_pipeline_run_status_consistency(pipeline_run: typing.Dict) -> None: + """ + Verifies that the success or failure states of pipeline run components are consistent with each other. + Any failure state should be propagated upwards to all parents in the pipeline run. The runtime should + "short-circuit", meaning any failure state in the pipeline run should be the final component. 
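+ An exception is raised if an inconsistency is found.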
+ """ + + state = pipeline_run['status']['state'] + if state == metadata_base.PipelineRunStatusState.SUCCESS: + _recurse_success(pipeline_run) + elif state == metadata_base.PipelineRunStatusState.FAILURE: + _recurse_failure(pipeline_run) + else: + raise exceptions.UnexpectedValueError("Invalid pipeline run state: {state}".format(state=state)) + + +def _get_pipeline_run_references(pipeline_run: typing.Dict) -> typing.List[typing.Dict]: + pipeline_run_references: typing.List[typing.Dict] = [] + + pipeline_run_references += pipeline_run.get('environment', {}).get('reference_benchmarks', []) + + for step in pipeline_run.get('steps', []): + pipeline_run_references += step.get('environment', {}).get('reference_benchmarks', []) + + for method_call in step.get('method_calls', []): + pipeline_run_references += method_call.get('environment', {}).get('reference_benchmarks', []) + + return pipeline_run_references + + +def validate_pipeline_run(pipeline_run: typing.Dict) -> None: + """ + Validates that the pipeline run is valid for the purpose of insertion in the metalearning database. + If not, an exception is raised. + + Generally, metalearning database has additional requirements not captured by JSON schema. + + Parameters + ---------- + pipeline_run: + Pipeline run document. + """ + + PIPELINE_RUN_SCHEMA_VALIDATOR.validate(pipeline_run) + + if pipeline_run['schema'] != PIPELINE_RUN_SCHEMA_VERSION: + raise exceptions.InvalidPipelineRunError( + "Schema field is not '{expected_schema}', but '{actual_schema}'.".format( + expected_schema=pipeline_module.PIPELINE_SCHEMA_VERSION, + actual_schema=pipeline_run['schema'], + ), + ) + + computed_id = utils.compute_hash_id(pipeline_run) + + if pipeline_run['id'] != computed_id: + raise exceptions.InvalidPipelineRunError( + "ID field is not '{computed_id}', but '{actual_id}'.".format( + computed_id=computed_id, + actual_id=pipeline_run['id'], + ), + ) + + for dataset in list(pipeline_run['datasets']) + list(pipeline_run['run'].get('scoring', {}).get('datasets', [])): + if set(dataset.keys()) != {'id', 'digest'}: + raise exceptions.InvalidPipelineRunError("Invalid dataset reference: {dataset}".format(dataset=dataset)) + + pipelines = [pipeline_run['pipeline']] + if 'data_preparation' in pipeline_run['run']: + pipelines.append(pipeline_run['run']['data_preparation']['pipeline']) + if 'scoring' in pipeline_run['run']: + pipelines.append(pipeline_run['run']['scoring']['pipeline']) + + for pipeline in pipelines: + if set(pipeline.keys()) != {'id', 'digest'}: + raise exceptions.InvalidPipelineRunError("Invalid pipeline reference: {pipeline}".format(pipeline=pipeline)) + + if 'problem' in pipeline_run and set(pipeline_run['problem'].keys()) != {'id', 'digest'}: + raise exceptions.InvalidPipelineRunError("Invalid problem reference: {problem}".format(problem=pipeline_run['problem'])) + + referenced_pipeline_runs = [] + if 'previous_pipeline_run' in pipeline_run: + referenced_pipeline_runs.append(pipeline_run['previous_pipeline_run']) + referenced_pipeline_runs += _get_pipeline_run_references(pipeline_run) + if 'scoring' in pipeline_run['run']: + referenced_pipeline_runs += _get_pipeline_run_references(pipeline_run['run']['scoring']) + if 'data_preparation' in pipeline_run['run']: + referenced_pipeline_runs += _get_pipeline_run_references(pipeline_run['run']['data_preparation']) + + for referenced_pipeline_run in referenced_pipeline_runs: + if set(referenced_pipeline_run.keys()) != {'id'}: + raise exceptions.InvalidPipelineRunError("Invalid pipeline run reference: 
{pipeline_run}".format(pipeline_run=referenced_pipeline_run)) + + _validate_pipeline_run_status_consistency(pipeline_run) + _validate_pipeline_run_timestamps(pipeline_run) + _validate_pipeline_run_random_seeds(pipeline_run) + + +def validate_pipeline(pipeline_description: typing.Dict) -> None: + """ + Validates that the pipeline is valid for the purpose of insertion in the metalearning database. + If not, an exception is raised. + + Generally, metalearning database has additional requirements not captured by JSON schema. + + Parameters + ---------- + pipeline_description: + Pipeline.. + """ + + # Also validates against the schema. It validates top-level "digest" field if it exists. + pipeline = pipeline_module.Pipeline.from_json_structure(pipeline_description, resolver=pipeline_module.NoResolver(strict_digest=True), strict_digest=True) + + if pipeline_description['schema'] != pipeline_module.PIPELINE_SCHEMA_VERSION: + raise exceptions.InvalidPipelineError( + "Schema field is not '{expected_schema}', but '{actual_schema}'.".format( + expected_schema=pipeline_module.PIPELINE_SCHEMA_VERSION, + actual_schema=pipeline_description['schema'], + ), + ) + + # If there is "digest" field we know that it has matched the pipeline. + if 'digest' not in pipeline_description: + raise exceptions.InvalidPipelineError("Digest field is required.") + + # Also validates that there are no nested sub-pipelines. + if pipeline_description != pipeline._canonical_pipeline_description(pipeline_description): + raise exceptions.InvalidPipelineError("Pipeline description is not in canonical structure.") + + # We allow non-standard pipelines but require that all inputs are "Dataset" objects. + input_types = {'inputs.{i}'.format(i=i): container.Dataset for i in range(len(pipeline.inputs))} + pipeline.check(allow_placeholders=False, standard_pipeline=False, input_types=input_types) + + for step in pipeline.steps: + if isinstance(step, pipeline_module.SubpipelineStep): + # We are using "NoResolver", so we have "pipeline_description" available. + if 'digest' not in step.pipeline_description: + raise exceptions.InvalidPipelineError("Digest field in steps is required.") + elif isinstance(step, pipeline_module.PrimitiveStep): + # We are using "NoResolver", so we have "primitive_description" available. + if 'digest' not in step.primitive_description: + # A special case to handle a v2019.6.7 version of the core package where compute scores primitive + # did not have a digest because it was lacking "installation" section in metadata. + # See: https://gitlab.com/datadrivendiscovery/d3m/merge_requests/280 + if step.primitive_description['id'] == '799802fb-2e11-4ab7-9c5e-dda09eb52a70' and step.primitive_description['version'] == '0.3.0': + continue + raise exceptions.InvalidPipelineError("Digest field in steps is required.") + else: + raise exceptions.InvalidPipelineError("Unknown step type: {type}".format(type=type(step))) + + +def validate_problem(problem_description_json_structure: typing.Dict) -> None: + """ + Validates that the problem description is valid for the purpose of insertion in the metalearning database. + If not, an exception is raised. + + Generally, metalearning database has additional requirements not captured by JSON schema. + + Parameters + ---------- + problem_description_json_structure: + Problem description as JSON structure. 
+ """ + + if 'digest' not in problem_description_json_structure: + raise exceptions.InvalidProblemError("Digest field is required.") + + # Also validates against the schema and checks the digest. + problem_description = problem.Problem.from_json_structure(problem_description_json_structure, strict_digest=True) + + if problem_description['schema'] != problem.PROBLEM_SCHEMA_VERSION: + raise exceptions.InvalidProblemError( + "Schema field is not '{expected_schema}', but '{actual_schema}'.".format( + expected_schema=problem.PROBLEM_SCHEMA_VERSION, + actual_schema=problem_description['schema'], + ), + ) + + canonical_problem_description = problem_description._canonical_problem_description(problem_description) + + if problem_description != canonical_problem_description: + raise exceptions.InvalidProblemError("Problem description is not in canonical structure.") + + if problem_description.get('source', {}).get('from', {}).get('type', None) == 'REDACTED': + problem_reference = problem_description['source']['from'].get('problem', {}) + if set(problem_reference.keys()) != {'id', 'digest'}: + raise exceptions.InvalidProblemError("Invalid problem description reference for \"source.from.problem\": {problem}".format(problem=problem_reference)) + + +def validate_dataset(dataset_description: typing.Dict) -> None: + """ + Validates that the dataset description is valid for the purpose of insertion in the metalearning database. + If not, an exception is raised. + + Generally, metalearning database has additional requirements not captured by JSON schema. + + Parameters + ---------- + dataset_description: + Dataset description. + """ + + metadata_base.CONTAINER_SCHEMA_VALIDATOR.validate(dataset_description) + + if dataset_description['schema'] != metadata_base.CONTAINER_SCHEMA_VERSION: + raise exceptions.InvalidDatasetError( + "Schema field is not '{expected_schema}', but '{actual_schema}'.".format( + expected_schema=metadata_base.CONTAINER_SCHEMA_VERSION, + actual_schema=dataset_description['schema'], + ), + ) + + if 'id' not in dataset_description: + raise exceptions.InvalidDatasetError("ID field is required.") + + if 'digest' not in dataset_description: + raise exceptions.InvalidDatasetError("Digest field is required.") + + # Also validates that there are no nested sub-pipelines. + if dataset_description != container.Dataset._canonical_dataset_description(dataset_description): + raise exceptions.InvalidDatasetError("Dataset description is not in canonical structure.") + + if dataset_description['structural_type'] != 'd3m.container.dataset.Dataset': + raise exceptions.InvalidDatasetError("Structural type is not 'd3m.container.dataset.Dataset', but '{type}'.".format(type=dataset_description['structural_type'])) + + if dataset_description.get('source', {}).get('from', {}).get('type', None) == 'REDACTED': + dataset_reference = dataset_description['source']['from'].get('dataset', {}) + if set(dataset_reference.keys()) != {'id', 'digest'}: + raise exceptions.InvalidDatasetError("Invalid dataset reference for \"source.from.dataset\": {dataset}".format(dataset=dataset_reference)) + + +def validate_primitive(primitive_json_structure: typing.Dict) -> None: + """ + Validates that the primitive description is valid for the purpose of insertion in the metalearning database. + If not, an exception is raised. + + Generally, metalearning database has additional requirements not captured by JSON schema. + + Parameters + ---------- + primitive_json_structure: + Primitive description as JSON structure. 
+ """ + + if 'digest' not in primitive_json_structure: + raise exceptions.InvalidProblemError("Digest field is required.") + + metadata_base.PrimitiveMetadata._validate(primitive_json_structure) + + +def pipeline_run_handler(arguments: argparse.Namespace) -> None: + has_errored = False + + for pipeline_run_path in arguments.pipeline_runs: + if getattr(arguments, 'list', False): + print(pipeline_run_path) + + try: + with utils.open(pipeline_run_path, 'r', encoding='utf8') as pipeline_run_file: + if pipeline_run_path.endswith('.yml') or pipeline_run_path.endswith('.yaml') or pipeline_run_path.endswith('.yml.gz') or pipeline_run_path.endswith('.yaml.gz'): + pipeline_runs: typing.Iterator[typing.Dict] = utils.yaml_load_all(pipeline_run_file) + else: + pipeline_runs = typing.cast(typing.Iterator[typing.Dict], [json.load(pipeline_run_file)]) + + # It has to be inside context manager because YAML loader returns a lazy iterator + # which requires an open file while iterating. + for pipeline_run in pipeline_runs: + validate_pipeline_run(pipeline_run) + except Exception as error: + if getattr(arguments, 'continue', False): + traceback.print_exc(file=sys.stdout) + print(f"Error validating a pipeline run: {pipeline_run_path}") + has_errored = True + continue + else: + raise Exception(f"Error validating a pipeline run: {pipeline_run_path}") from error + + if has_errored: + sys.exit(1) + + +def pipeline_handler( + arguments: argparse.Namespace, *, resolver_class: typing.Type[pipeline_module.Resolver] = None, + no_resolver_class: typing.Type[pipeline_module.Resolver] = None, pipeline_class: typing.Type[pipeline_module.Pipeline] = None, +) -> None: + has_errored = False + + for pipeline_path in arguments.pipelines: + if getattr(arguments, 'list', False): + print(pipeline_path) + + try: + with utils.open(pipeline_path, 'r', encoding='utf8') as pipeline_file: + if pipeline_path.endswith('.yml') or pipeline_path.endswith('.yaml') or pipeline_path.endswith('.yml.gz') or pipeline_path.endswith('.yaml.gz'): + pipelines: typing.Iterator[typing.Dict] = utils.yaml_load_all(pipeline_file) + else: + pipelines = typing.cast(typing.Iterator[typing.Dict], [json.load(pipeline_file)]) + + for pipeline in pipelines: + validate_pipeline(pipeline) + except Exception as error: + if getattr(arguments, 'continue', False): + traceback.print_exc(file=sys.stdout) + print(f"Error validating a pipeline: {pipeline_path}") + has_errored = True + continue + else: + raise Exception(f"Error validating a pipeline: {pipeline_path}") from error + + if has_errored: + sys.exit(1) + + +def problem_handler(arguments: argparse.Namespace, *, problem_resolver: typing.Callable = None) -> None: + has_errored = False + + for problem_path in arguments.problems: + if getattr(arguments, 'list', False): + print(problem_path) + + try: + with utils.open(problem_path, 'r', encoding='utf8') as problem_file: + if problem_path.endswith('.yml') or problem_path.endswith('.yaml') or problem_path.endswith('.yml.gz') or problem_path.endswith('.yaml.gz'): + problems: typing.Iterator[typing.Dict] = utils.yaml_load_all(problem_file) + else: + problems = typing.cast(typing.Iterator[typing.Dict], [json.load(problem_file)]) + + for problem in problems: + validate_problem(problem) + except Exception as error: + if getattr(arguments, 'continue', False): + traceback.print_exc(file=sys.stdout) + print(f"Error validating a problem: {problem_path}") + has_errored = True + continue + else: + raise Exception(f"Error validating a problem: {problem_path}") from error + + if 
has_errored: + sys.exit(1) + + +def dataset_handler(arguments: argparse.Namespace, *, dataset_resolver: typing.Callable = None) -> None: + has_errored = False + + for dataset_path in arguments.datasets: + if getattr(arguments, 'list', False): + print(dataset_path) + + try: + with utils.open(dataset_path, 'r', encoding='utf8') as dataset_file: + if dataset_path.endswith('.yml') or dataset_path.endswith('.yaml'): + datasets: typing.Iterator[typing.Dict] = utils.yaml_load_all(dataset_file) + else: + datasets = typing.cast(typing.Iterator[typing.Dict], [json.load(dataset_file)]) + + for dataset in datasets: + validate_dataset(dataset) + except Exception as error: + if getattr(arguments, 'continue', False): + traceback.print_exc(file=sys.stdout) + print(f"Error validating a dataset: {dataset_path}") + has_errored = True + continue + else: + raise Exception(f"Error validating a dataset: {dataset_path}") from error + + if has_errored: + sys.exit(1) + + +def primitive_handler(arguments: argparse.Namespace) -> None: + has_errored = False + + for primitive_path in arguments.primitives: + if getattr(arguments, 'list', False): + print(primitive_path) + + try: + with utils.open(primitive_path, 'r', encoding='utf8') as primitive_file: + if primitive_path.endswith('.yml') or primitive_path.endswith('.yaml'): + primitives: typing.Iterator[typing.Dict] = utils.yaml_load_all(primitive_file) + else: + primitives = typing.cast(typing.Iterator[typing.Dict], [json.load(primitive_file)]) + + for primitive in primitives: + validate_primitive(primitive) + except Exception as error: + if getattr(arguments, 'continue', False): + traceback.print_exc(file=sys.stdout) + print(f"Error validating a primitive description: {primitive_path}") + has_errored = True + continue + else: + raise Exception(f"Error validating a primitive description: {primitive_path}") from error + + if has_errored: + sys.exit(1) + + +if pyarrow_lib is not None: + pyarrow_lib._default_serialization_context.register_type( + PipelineRun, 'd3m.pipeline_run', pickle=True, + ) diff --git a/d3m/d3m/metadata/primitive_names.py b/d3m/d3m/metadata/primitive_names.py new file mode 100644 index 0000000..5379a0f --- /dev/null +++ b/d3m/d3m/metadata/primitive_names.py @@ -0,0 +1,392 @@ +# Primitive Python paths (Python paths under which primitives registers themselves) have to adhere to namespace rules. +# Those rules describe that the Python path consists of multiple segments, one of them being "primitive name". Those +# names should be a general name to describe the logic of a primitive with the idea that multiple implementations +# of the same logic share the same name. This file contains a list of known and allowed primitive names. +# Names should be descriptive and something which can help people understand what the primitive is about. +# You can assume general understanding of data science concepts and names. +# +# Everyone is encouraged to help currate this list and suggest improvements (merging, removals, additions) +# of values in that list by submitting a merge request. We are not strict about names here, the main purpose of +# this list is to encourage collaboration and primitive name reuse when that is reasonable. Please check the list +# first when deciding on a Python path of your primitive and see if it can fit well under an existing name. 
+# +# On Linux, you can sort the list by running: +# +# grep "^ *'" d3m/metadata/primitive_names.py | env LC_COLLATE=C sort -u +# +# See: https://gitlab.com/datadrivendiscovery/d3m/issues/3 + +PRIMITIVE_NAMES = [ + 'categorical_to_binary', + 'discrete_cosine_transform', + 'fast_fourier_transform', + 'holt_smoothing', + 'holt_winters_exponential_smoothing', + 'mean_average_transform', + 'non_negative_matrix_factorization', + 'pyod_cof', + 'simple_exponential_smoothing', + 'time_stamp_validation', + 'time_series_moving_average', + 'time_series_seasonality_trend_decomposition', + 'variational_auto_encoder', + 'ada_boost', + 'adaptive_simultaneous_markov_blanket', + 'add', + 'add_semantic_types', + 'adjacency_spectral_embedding', + 'ape', + 'ard', + 'arima', + 'audio_featurization', + 'audio_reader', + 'audio_slicer', + 'audio_transfer', + 'average_pooling_1d', + 'average_pooling_2d', + 'average_pooling_3d', + 'bagging', + 'batch_normalization', # To be used with "layer" primitive family. + 'bayesian_logistic_regression', # To be used with "classification" primitive family. + 'bernoulli_naive_bayes', + 'bert_classifier', + 'binarizer', + 'binary_crossentropy', + 'binary_encoder', + 'cast_to_type', + 'categorical_accuracy', + 'categorical_crossentropy', + 'categorical_hinge', + 'categorical_imputer', + 'channel_averager', + 'clean_augmentation', + 'cleaning_featurizer', + 'cluster', + 'cluster_curve_fitting_kmeans', + 'column_fold', + 'column_map', + 'column_parser', + 'column_type_profiler', + 'compute_scores', + 'concat', + 'conditioner', + 'construct_predictions', + 'convolution_1d', + 'convolution_2d', + 'convolution_3d', + 'convolutional_neural_net', + 'corex_continuous', + 'corex_supervised', + 'corex_text', + 'cosine_proximity', + 'count_vectorizer', + 'cover_tree', + 'croc', + 'csv_reader', + 'cut_audio', + 'data_conversion', + 'dataframe_to_list', + 'dataframe_to_list_of_list', + 'dataframe_to_list_of_ndarray', + 'dataframe_to_ndarray', + 'dataframe_to_tensor', + 'datamart_augmentation', + 'datamart_download', + 'dataset_map', + 'dataset_text_reader', + 'dataset_to_dataframe', + 'dataset_sample', + 'datetime_field_compose', + 'datetime_range_filter', + 'decision_tree', + 'deep_feature_synthesis', + 'deep_markov_bernoulli_forecaster', + 'deep_markov_categorical_forecaster', + 'deep_markov_gaussian_forecaster', + 'denormalize', + 'dense', + 'diagonal_mvn', + 'dict_vectorizer', + 'dimension_selection', + 'discriminative_structured_classifier', + 'do_nothing', + 'do_nothing_for_dataset', + 'doc_2_vec', + 'dropout', + 'dummy', + 'echo_ib', + 'echo_linear', + 'edge_list_to_graph', + 'ekss', + 'elastic_net', + 'encoder', + 'enrich_dates', + 'ensemble_forest', + 'ensemble_voting', + 'extra_trees', + 'extract_columns', + 'extract_columns_by_semantic_types', + 'extract_columns_by_structural_types', + 'fast_ica', + 'fast_lad', + 'feature_agglomeration', + 'feed_forward_neural_net', + 'fixed_split_dataset_split', + 'flatten', + 'forward', + 'gaussian', # To be used with "classification" or "clustering" primitive family. 
+ 'gaussian_naive_bayes', + 'gaussian_process', + 'gaussian_random_projection', + 'gcn_mixhop', + 'general_relational_dataset', + 'generative_structured_classifier', + 'generic_univariate_select', + 'geocoding', + 'glda', + 'global_average_pooling_1d', + 'global_average_pooling_2d', + 'global_average_pooling_3d', + 'global_causal_discovery', + 'global_structure_imputer', + 'gmm', + 'go_dec', + 'goturn', + 'gradient_boosting', + 'graph_node_splitter', + 'graph_to_edge_list', + 'graph_transformer', + 'grasta', + 'grasta_masked', + 'greedy_imputation', + 'grouping_field_compose', + 'grouse', + 'hdbscan', + 'hdp', + 'hinge', + 'horizontal_concat', + 'i3d', + 'i_vector_extractor', + 'ibex', + 'identity_parentchildren_markov_blanket', + 'image_reader', + 'image_transfer', + 'image_transfer_learning_transformer', + 'imputer', + 'inceptionV3_image_feature', + 'increment', + 'iqr_scaler', + 'iterative_labeling', + 'iterative_regression_imputation', + 'joint_mutual_information', + 'k_means', + 'k_neighbors', + 'kernel_pca', + 'kernel_ridge', + 'kfold_dataset_split', + 'kfold_time_series_split', + 'kss', + 'kullback_leibler_divergence', + 'l1_low_rank', + 'label_decoder', + 'label_encoder', + 'labler', + 'laplacian_spectral_embedding', + 'largest_connected_component', + 'lars', + 'lasso', + 'lasso_cv', + 'lda', + 'light_gbm', + 'linear', + 'linear_discriminant_analysis', + 'linear_svc', + 'linear_svr', + 'link_prediction', # To be used with "collaborative_filtering" or "graph_matching" primitive family. + 'list_to_dataframe', + 'list_to_ndarray', + 'load_edgelist', + 'load_graphs', + 'load_single_graph', + 'local_structure_imputer', + 'log_mel_spectrogram', + 'logcosh', + 'logistic_regression', # To be used with "classification" primitive family. + 'loss', + 'lstm', + 'lupi_svm', + 'max_abs_scaler', + 'max_pooling_1d', + 'max_pooling_2d', + 'max_pooling_3d', + 'mean_absolute_error', + 'mean_absolute_percentage_error', + 'mean_baseline', + 'mean_imputation', + 'mean_squared_error', + 'mean_squared_logarithmic_error', + 'merge_partial_predictions', + 'metafeature_extractor', + 'mice_imputation', + 'min_max_scaler', + 'missing_indicator', + 'mlp', + 'model', + 'monomial', + 'multinomial_naive_bayes', + 'multitable_featurization', + 'mutual_info', # To be used with "classification" or "regression" primitive family. + 'naive_bayes', + 'ndarray_to_dataframe', + 'ndarray_to_list', + 'nearest_centroid', + 'nk_sent2vec', + 'no_split_dataset_split', + 'non_parametric', # To be used with "clustering" primitive family. + 'normalize_column_references', + 'normalize_graphs', + 'normalizer', + 'null', + 'number_of_clusters', + 'numeric_range_filter', + 'nystroem', + 'one_hot_encoder', + 'ordinal_encoder', + 'out_of_sample_adjacency_spectral_embedding', + 'out_of_sample_laplacian_spectral_embedding', + 'output_dataframe', + 'owl', # To be used with "regression" primitive family. + 'parser', # To be used with "collaborative_filtering", "graph_matching", "vertex_nomination", or "community_detection" primitive family. 
+ 'pass_to_ranks', + 'passive_aggressive', + 'pca', + 'pca_features', + 'pcp_ialm', + 'poisson', + 'polynomial_features', + 'primitive_sum', + 'profiler', + 'huber_pca', + 'low_rank_imputer', + 'high_rank_imputer', + 'quadratic_discriminant_analysis', + 'quantile_transformer', + 'ragged_dataset_reader', + 'random', + 'random_classifier', + 'random_forest', + 'random_projection_time_series_featurization', + 'random_sampling_imputer', + 'random_trees_embedding', + 'rank', # To be used with "classification" primitive family. + 'ravel', + 'rbf_sampler', + 'recommender_system', # To be used with "collaborative_filtering" primitive family. + 'redact_columns', + 'regex_filter', + 'relational_time_series', + 'remote_sensing_pretrained', + 'remove_columns', + 'remove_duplicate_columns', + 'remove_semantic_types', + 'rename_duplicate_name', + 'replace_semantic_types', + 'replace_singletons', + 'resnet50_image_feature', + 'resnext101_kinetics_video_features', + 'retina_net', + 'reverse', + 'rfd', + 'rfe', + 'rffeatures', + 'rfm_precondition_ed_gaussian_krr', + 'rfm_precondition_ed_polynomial_krr', + 'ridge', + 'rnn_time_series', + 'robust_scaler', + 'rpca_lbd', + 'score_based_markov_blanket', + 'sdne', + 'search', + 'search_hybrid', + 'search_hybrid_numeric', + 'search_numeric', + 'seeded', # To be used with "graph_matching" primitive family. + 'seeded_graph_matching', # To be used with "vertex_nomination" primitive family. + 'segment_curve_fitter', + 'select_fwe', + 'select_percentile', + 'sequence_to_bag_of_tokens', + 'sgd', + 'shapelet_learning', + 'signal_dither', + 'signal_framer', + 'signal_mfcc', + 'simon', + 'simple_imputer', + 'simultaneous_markov_blanket', + 'sparse_categorical_crossentropy', + 'sparse_pca', + 'sparse_random_projection', + 'spectral', # To be used with "vertex_nomination" primitive family. + 'spectral_graph', # To be used with "clustering" primitive family. + 'splitter', + 'squared_hinge', + 'ssc_admm', + 'ssc_cvx', + 'ssc_omp', + 'stack_ndarray_column', + 'stacking', # To be used with "operator" primitive family. + 'standard_scaler', + 'string_imputer', + 'structured', # To be used with "classification" primitive family. + 'subtract', + 'sum', + 'svc', + 'svr', + 't_distributed_stochastic_neighbor_embedding', + 'tabular_extractor', + 'targets_reader', + 'tensor_machines_binary', # To be used with "classification" primitive family. 
+ 'tensor_machines_regularized_least_squares', + 'term_filter', + 'text_classifier', + 'text_encoder', + 'text_reader', + 'text_summarization', + 'text_to_vocabulary', + 'text_tokenizer', + 'tfidf_vectorizer', + 'time_series_forecasting', + 'time_series_formatter', + 'time_series_neighbours', + 'time_series_reshaper', + 'time_series_to_list', + 'to_numeric', + 'topic_vectorizer', + 'train_score_dataset_split', + 'trecs', + 'tree_augmented_naive_bayes', + 'trim_regressor', + 'truncated_svd', + 'unary_encoder', + 'unfold', + 'unicorn', + 'uniform_segmentation', + 'update_semantic_types', + 'variance_threshold', + 'vector_autoregression', + 'vertical_concatenate', + 'vgg16', + 'vgg16_image_feature', + 'video_reader', + 'voter', + 'voting', + 'wikifier', + 'word_2_vec', + 'word_embedding_builder', + 'xgboost_dart', + 'xgboost_gbtree', + 'yolo', + 'zero_count', +] diff --git a/d3m/d3m/metadata/problem.py b/d3m/d3m/metadata/problem.py new file mode 100644 index 0000000..520d5f3 --- /dev/null +++ b/d3m/d3m/metadata/problem.py @@ -0,0 +1,1039 @@ +import abc +import argparse +import copy +import functools +import json +import logging +import math +import os.path +import pprint +import sys +import traceback +import typing +from urllib import parse as url_parse + +from . import base +from d3m import deprecate, exceptions, utils + +# See: https://gitlab.com/datadrivendiscovery/d3m/issues/66 +try: + from pyarrow import lib as pyarrow_lib # type: ignore +except ModuleNotFoundError: + pyarrow_lib = None + +__all__ = ('TaskKeyword', 'PerformanceMetric', 'Problem') + +logger = logging.getLogger(__name__) + +# Comma because we unpack the list of validators returned from "load_schema_validators". +PROBLEM_SCHEMA_VALIDATOR, = utils.load_schema_validators(base.SCHEMAS, ('problem.json',)) + +PROBLEM_SCHEMA_VERSION = 'https://metadata.datadrivendiscovery.org/schemas/v0/problem.json' + + +def sigmoid(x: float) -> float: + """ + Numerically stable scaled logistic function. + + Maps all values ``x`` to [0, 1]. Values between -1000 and 1000 are + mapped reasonably far from 0 and 1, after which the function + converges to bounds. + + Parameters + ---------- + x: + Input. + + Returns + ------- + Output. + """ + + scale = 1 / 1000 + + if x >= 0: + ex = math.exp(scale * -x) + return 1 / (1 + ex) + else: + ex = math.exp(scale * x) + return ex / (1 + ex) + + +class TaskKeywordBase: + _d3m_map: typing.Dict[str, 'TaskKeywordBase'] = {} + + @classmethod + def get_map(cls) -> dict: + """ + Returns the map between D3M problem description JSON string and enum values. + + Returns + ------- + The map. + """ + + return cls._d3m_map + + @classmethod + def parse(cls, name: str) -> 'TaskKeywordBase': + """ + Converts D3M problem description JSON string into enum value. + + Parameters + ---------- + name: + D3M problem description JSON string. + + Returns + ------- + Enum value. + """ + + return cls.get_map()[name] + + def unparse(self) -> str: + """ + Converts enum value to D3M problem description JSON string. + + Returns + ------- + D3M problem description JSON string. 
+ """ + + for key, value in self.get_map().items(): + if self == value: + return key + + raise exceptions.InvalidStateError("Cannot convert {self}.".format(self=self)) + + def __ge__(self, other: typing.Any) -> bool: + if self.__class__ is other.__class__: + return list(self.__class__.__members__.keys()).index(self.value) >= list(other.__class__.__members__.keys()).index(other.value) # type: ignore + return NotImplemented + + def __gt__(self, other: typing.Any) -> bool: + if self.__class__ is other.__class__: + return list(self.__class__.__members__.keys()).index(self.value) > list(other.__class__.__members__.keys()).index(other.value) # type: ignore + return NotImplemented + + def __le__(self, other: typing.Any) -> bool: + if self.__class__ is other.__class__: + return list(self.__class__.__members__.keys()).index(self.value) <= list(other.__class__.__members__.keys()).index(other.value) # type: ignore + return NotImplemented + + def __lt__(self, other: typing.Any) -> bool: + if self.__class__ is other.__class__: + return list(self.__class__.__members__.keys()).index(self.value) < list(other.__class__.__members__.keys()).index(other.value) # type: ignore + return NotImplemented + + +TaskKeyword = utils.create_enum_from_json_schema_enum( + 'TaskKeyword', base.DEFINITIONS_JSON, + 'definitions.problem.properties.task_keywords.items.oneOf[*].enum[*]', + module=__name__, base_class=TaskKeywordBase, +) + +TaskKeyword._d3m_map.update({ + 'classification': TaskKeyword.CLASSIFICATION, # type: ignore + 'regression': TaskKeyword.REGRESSION, # type: ignore + 'clustering': TaskKeyword.CLUSTERING, # type: ignore + 'linkPrediction': TaskKeyword.LINK_PREDICTION, # type: ignore + 'vertexNomination': TaskKeyword.VERTEX_NOMINATION, # type: ignore + 'vertexClassification': TaskKeyword.VERTEX_CLASSIFICATION, # type: ignore + 'communityDetection': TaskKeyword.COMMUNITY_DETECTION, # type: ignore + 'graphMatching': TaskKeyword.GRAPH_MATCHING, # type: ignore + 'forecasting': TaskKeyword.FORECASTING, # type: ignore + 'collaborativeFiltering': TaskKeyword.COLLABORATIVE_FILTERING, # type: ignore + 'objectDetection': TaskKeyword.OBJECT_DETECTION, # type: ignore + 'semiSupervised': TaskKeyword.SEMISUPERVISED, # type: ignore + 'binary': TaskKeyword.BINARY, # type: ignore + 'multiClass': TaskKeyword.MULTICLASS, # type: ignore + 'multiLabel': TaskKeyword.MULTILABEL, # type: ignore + 'univariate': TaskKeyword.UNIVARIATE, # type: ignore + 'multivariate': TaskKeyword.MULTIVARIATE, # type: ignore + 'overlapping': TaskKeyword.OVERLAPPING, # type: ignore + 'nonOverlapping': TaskKeyword.NONOVERLAPPING, # type: ignore + 'tabular': TaskKeyword.TABULAR, # type: ignore + 'relational': TaskKeyword.RELATIONAL, # type: ignore + 'nested': TaskKeyword.NESTED, # type: ignore + 'image': TaskKeyword.IMAGE, # type: ignore + 'audio': TaskKeyword.AUDIO, # type: ignore + 'video': TaskKeyword.VIDEO, # type: ignore + 'speech': TaskKeyword.SPEECH, # type: ignore + 'text': TaskKeyword.TEXT, # type: ignore + 'graph': TaskKeyword.GRAPH, # type: ignore + 'multiGraph': TaskKeyword.MULTIGRAPH, # type: ignore + 'timeSeries': TaskKeyword.TIME_SERIES, # type: ignore + 'grouped': TaskKeyword.GROUPED, # type: ignore + 'geospatial': TaskKeyword.GEOSPATIAL, # type: ignore + 'remoteSensing': TaskKeyword.REMOTE_SENSING, # type: ignore + 'lupi': TaskKeyword.LUPI, # type: ignore + 'missingMetadata': TaskKeyword.MISSING_METADATA, # type: ignore +}) + + +class PerformanceMetricBase: + _d3m_map: typing.ClassVar[typing.Dict[str, 'PerformanceMetricBase']] = {} + 
_requires_confidence_set: typing.ClassVar[typing.Set['PerformanceMetricBase']] = set() + _requires_rank_set: typing.ClassVar[typing.Set['PerformanceMetricBase']] = set() + _best_value_map: typing.ClassVar[typing.Dict['PerformanceMetricBase', float]] = {} + _worst_value_map: typing.ClassVar[typing.Dict['PerformanceMetricBase', float]] = {} + _additional_score_class_map: typing.ClassVar[typing.Dict['PerformanceMetricBase', type]] = {} + + @classmethod + def get_map(cls) -> dict: + """ + Returns the map between D3M problem description JSON string and enum values. + + Returns + ------- + The map. + """ + + return cls._d3m_map + + @classmethod + def parse(cls, name: str) -> 'PerformanceMetricBase': + """ + Converts D3M problem description JSON string into enum value. + + Parameters + ---------- + name: + D3M problem description JSON string. + + Returns + ------- + Enum value. + """ + + return cls.get_map()[name] + + def unparse(self) -> str: + """ + Converts enum value to D3M problem description JSON string. + + Returns + ------- + D3M problem description JSON string. + """ + + for key, value in self.get_map().items(): + if self == value: + return key + + raise exceptions.InvalidStateError("Cannot convert {self}.".format(self=self)) + + def requires_confidence(self) -> bool: + """ + Returns ``True`` if this metric requires confidence column. + + Returns + ------- + ``True`` if this metric requires confidence column. + """ + + return self in self._requires_confidence_set + + def requires_rank(self) -> bool: + """ + Returns ``True`` if this metric requires rank column. + + Returns + ------- + ``True`` if this metric requires rank column. + """ + + return self in self._requires_rank_set + + def best_value(self) -> float: + """ + The best possible value of the metric. + + Returns + ------- + The best possible value of the metric. + """ + + return self._best_value_map[self] + + def worst_value(self) -> float: + """ + The worst possible value of the metric. + + Returns + ------- + The worst possible value of the metric. + """ + + return self._worst_value_map[self] + + def normalize(self, value: float) -> float: + """ + Normalize the ``value`` for this metric so that it is between 0 and 1, + inclusive, where 1 is the best score and 0 is the worst. + + Parameters + ---------- + value: + Value of this metric to normalize. + + Returns + ------- + A normalized metric. 
+ """ + + worst_value = self.worst_value() + best_value = self.best_value() + + return self._normalize(worst_value, best_value, value) + + @classmethod + def _normalize(cls, worst_value: float, best_value: float, value: float) -> float: + assert worst_value <= value <= best_value or worst_value >= value >= best_value, (worst_value, value, best_value) + + if math.isinf(best_value) and math.isinf(worst_value): + value = sigmoid(value) + if best_value > worst_value: # "best_value" == inf, "worst_value" == -inf + best_value = 1.0 + worst_value = 0.0 + else: # "best_value" == -inf, "worst_value" == inf + best_value = 0.0 + worst_value = 1.0 + elif math.isinf(best_value): + value = sigmoid(value - worst_value) + if best_value > worst_value: # "best_value" == inf + best_value = 1.0 + worst_value = 0.5 + else: # "best_value" == -inf + best_value = 0.0 + worst_value = 0.5 + elif math.isinf(worst_value): + value = sigmoid(best_value - value) + if best_value > worst_value: # "worst_value" == -inf + best_value = 0.5 + worst_value = 1.0 + else: # "worst_value" == inf + best_value = 0.5 + worst_value = 0.0 + + return (value - worst_value) / (best_value - worst_value) + + def get_class(self) -> typing.Any: + """ + Returns a class suitable for computing this metric. + """ + + # Importing here to prevent import cycle. + from d3m import metrics + + if self in metrics.class_map: + return metrics.class_map[self] # type: ignore + + if self in self._additional_score_class_map: + return self._additional_score_class_map[self] # type: ignore + + raise exceptions.NotSupportedError("Computing metric {metric} is not supported.".format(metric=self)) + + @classmethod + def register_metric(cls, name: str, *, best_value: float, worst_value: float, score_class: type, requires_confidence: bool = False, requires_rank: bool = False) -> None: + cls.register_value(name, name) # type: ignore + cls._best_value_map[cls[name]] = best_value # type: ignore + cls._worst_value_map[cls[name]] = worst_value # type: ignore + cls._additional_score_class_map[cls[name]] = score_class # type: ignore + + if requires_confidence: + PerformanceMetric._requires_confidence_set.add(cls[name]) # type: ignore + + if requires_rank: + PerformanceMetric._requires_rank_set.add(cls[name]) # type: ignore + + +PerformanceMetric = utils.create_enum_from_json_schema_enum( + 'PerformanceMetric', base.DEFINITIONS_JSON, + 'definitions.performance_metric.oneOf[*].properties.metric.enum[*]', + module=__name__, base_class=PerformanceMetricBase, +) + +PerformanceMetric._d3m_map.update({ + 'accuracy': PerformanceMetric.ACCURACY, # type: ignore + 'precision': PerformanceMetric.PRECISION, # type: ignore + 'recall': PerformanceMetric.RECALL, # type: ignore + 'f1': PerformanceMetric.F1, # type: ignore + 'f1Micro': PerformanceMetric.F1_MICRO, # type: ignore + 'f1Macro': PerformanceMetric.F1_MACRO, # type: ignore + 'rocAuc': PerformanceMetric.ROC_AUC, # type: ignore + 'rocAucMicro': PerformanceMetric.ROC_AUC_MICRO, # type: ignore + 'rocAucMacro': PerformanceMetric.ROC_AUC_MACRO, # type: ignore + 'meanSquaredError': PerformanceMetric.MEAN_SQUARED_ERROR, # type: ignore + 'rootMeanSquaredError': PerformanceMetric.ROOT_MEAN_SQUARED_ERROR, # type: ignore + 'meanAbsoluteError': PerformanceMetric.MEAN_ABSOLUTE_ERROR, # type: ignore + 'rSquared': PerformanceMetric.R_SQUARED, # type: ignore + 'normalizedMutualInformation': PerformanceMetric.NORMALIZED_MUTUAL_INFORMATION, # type: ignore + 'jaccardSimilarityScore': PerformanceMetric.JACCARD_SIMILARITY_SCORE, # type: ignore + 
'precisionAtTopK': PerformanceMetric.PRECISION_AT_TOP_K, # type: ignore + 'objectDetectionAP': PerformanceMetric.OBJECT_DETECTION_AVERAGE_PRECISION, # type: ignore + 'hammingLoss': PerformanceMetric.HAMMING_LOSS, # type: ignore + 'meanReciprocalRank': PerformanceMetric.MEAN_RECIPROCAL_RANK, # type: ignore + 'hitsAtK': PerformanceMetric.HITS_AT_K, # type: ignore +}) +PerformanceMetric._requires_confidence_set.update({ + PerformanceMetric.ROC_AUC, + PerformanceMetric.ROC_AUC_MICRO, + PerformanceMetric.ROC_AUC_MACRO, + PerformanceMetric.OBJECT_DETECTION_AVERAGE_PRECISION, +}) +PerformanceMetric._requires_rank_set.update({ + PerformanceMetric.MEAN_RECIPROCAL_RANK, + PerformanceMetric.HITS_AT_K, +}) +PerformanceMetric._best_value_map.update({ + PerformanceMetric.ACCURACY: 1.0, # type: ignore + PerformanceMetric.PRECISION: 1.0, # type: ignore + PerformanceMetric.RECALL: 1.0, # type: ignore + PerformanceMetric.F1: 1.0, # type: ignore + PerformanceMetric.F1_MICRO: 1.0, # type: ignore + PerformanceMetric.F1_MACRO: 1.0, # type: ignore + PerformanceMetric.ROC_AUC: 1.0, # type: ignore + PerformanceMetric.ROC_AUC_MICRO: 1.0, # type: ignore + PerformanceMetric.ROC_AUC_MACRO: 1.0, # type: ignore + PerformanceMetric.MEAN_SQUARED_ERROR: 0.0, # type: ignore + PerformanceMetric.ROOT_MEAN_SQUARED_ERROR: 0.0, # type: ignore + PerformanceMetric.MEAN_ABSOLUTE_ERROR: 0.0, # type: ignore + PerformanceMetric.R_SQUARED: 1.0, # type: ignore + PerformanceMetric.NORMALIZED_MUTUAL_INFORMATION: 1.0, # type: ignore + PerformanceMetric.JACCARD_SIMILARITY_SCORE: 1.0, # type: ignore + PerformanceMetric.PRECISION_AT_TOP_K: 1.0, # type: ignore + PerformanceMetric.OBJECT_DETECTION_AVERAGE_PRECISION: 1.0, # type: ignore + PerformanceMetric.HAMMING_LOSS: 0.0, # type: ignore + PerformanceMetric.MEAN_RECIPROCAL_RANK: 1.0, # type: ignore + PerformanceMetric.HITS_AT_K: 1.0, # type: ignore +}) +PerformanceMetric._worst_value_map.update({ + PerformanceMetric.ACCURACY: 0.0, # type: ignore + PerformanceMetric.PRECISION: 0.0, # type: ignore + PerformanceMetric.RECALL: 0.0, # type: ignore + PerformanceMetric.F1: 0.0, # type: ignore + PerformanceMetric.F1_MICRO: 0.0, # type: ignore + PerformanceMetric.F1_MACRO: 0.0, # type: ignore + PerformanceMetric.ROC_AUC: 0.0, # type: ignore + PerformanceMetric.ROC_AUC_MICRO: 0.0, # type: ignore + PerformanceMetric.ROC_AUC_MACRO: 0.0, # type: ignore + PerformanceMetric.MEAN_SQUARED_ERROR: float('inf'), # type: ignore + PerformanceMetric.ROOT_MEAN_SQUARED_ERROR: float('inf'), # type: ignore + PerformanceMetric.MEAN_ABSOLUTE_ERROR: float('inf'), # type: ignore + PerformanceMetric.R_SQUARED: float('-inf'), # type: ignore + PerformanceMetric.NORMALIZED_MUTUAL_INFORMATION: 0.0, # type: ignore + PerformanceMetric.JACCARD_SIMILARITY_SCORE: 0.0, # type: ignore + PerformanceMetric.PRECISION_AT_TOP_K: 0.0, # type: ignore + PerformanceMetric.OBJECT_DETECTION_AVERAGE_PRECISION: 0.0, # type: ignore + PerformanceMetric.HAMMING_LOSS: 1.0, # type: ignore + PerformanceMetric.MEAN_RECIPROCAL_RANK: 0.0, # type: ignore + PerformanceMetric.HITS_AT_K: 0.0, # type: ignore +}) + +# Here are all legacy (before v4.0.0) task types and task subtypes mapped to task keywords. 
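The legacy task-type maps follow below. First, a minimal sketch of how the metric helpers defined above behave, using the best/worst value maps just populated; this assumes the d3m core package from this commit is importable and is not part of the diff:
```
from d3m.metadata import problem

# A bounded metric: worst 0.0, best 1.0, so normalize() is a plain rescale.
accuracy = problem.PerformanceMetric.parse('accuracy')
print(accuracy.best_value(), accuracy.worst_value())  # 1.0 0.0
print(accuracy.normalize(0.75))                       # 0.75

# An unbounded metric: worst value is +inf, so normalize() first passes the
# value through the scaled sigmoid above; smaller errors map closer to 1.
mse = problem.PerformanceMetric.parse('meanSquaredError')
print(mse.normalize(0.0) > mse.normalize(100.0))      # True
```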
+TASK_TYPE_TO_KEYWORDS_MAP: typing.Dict[typing.Optional[str], typing.List] = { + None: [], + 'classification': [TaskKeyword.CLASSIFICATION], # type: ignore + 'regression': [TaskKeyword.REGRESSION], # type: ignore + 'clustering': [TaskKeyword.CLUSTERING], # type: ignore + 'linkPrediction': [TaskKeyword.LINK_PREDICTION], # type: ignore + 'vertexClassification': [TaskKeyword.VERTEX_CLASSIFICATION], # type: ignore + 'vertexNomination': [TaskKeyword.VERTEX_NOMINATION], # type: ignore + 'communityDetection': [TaskKeyword.COMMUNITY_DETECTION], # type: ignore + 'graphMatching': [TaskKeyword.GRAPH_MATCHING], # type: ignore + 'timeSeriesForecasting': [TaskKeyword.TIME_SERIES, TaskKeyword.FORECASTING], # type: ignore + 'collaborativeFiltering': [TaskKeyword.COLLABORATIVE_FILTERING], # type: ignore + 'objectDetection': [TaskKeyword.OBJECT_DETECTION], # type: ignore + 'semiSupervisedClassification': [TaskKeyword.SEMISUPERVISED, TaskKeyword.CLASSIFICATION], # type: ignore + 'semiSupervisedRegression': [TaskKeyword.SEMISUPERVISED, TaskKeyword.REGRESSION], # type: ignore + 'binary': [TaskKeyword.BINARY], # type: ignore + 'multiClass': [TaskKeyword.MULTICLASS], # type: ignore + 'multiLabel': [TaskKeyword.MULTILABEL], # type: ignore + 'univariate': [TaskKeyword.UNIVARIATE], # type: ignore + 'multivariate': [TaskKeyword.MULTIVARIATE], # type: ignore + 'overlapping': [TaskKeyword.OVERLAPPING], # type: ignore + 'nonOverlapping': [TaskKeyword.NONOVERLAPPING], # type: ignore +} +JSON_TASK_TYPE_TO_KEYWORDS_MAP: typing.Dict[typing.Optional[str], typing.List] = { + None: [], + 'CLASSIFICATION': [TaskKeyword.CLASSIFICATION], # type: ignore + 'REGRESSION': [TaskKeyword.REGRESSION], # type: ignore + 'CLUSTERING': [TaskKeyword.CLUSTERING], # type: ignore + 'LINK_PREDICTION': [TaskKeyword.LINK_PREDICTION], # type: ignore + 'VERTEX_CLASSIFICATION': [TaskKeyword.VERTEX_CLASSIFICATION], # type: ignore + 'VERTEX_NOMINATION': [TaskKeyword.VERTEX_NOMINATION], # type: ignore + 'COMMUNITY_DETECTION': [TaskKeyword.COMMUNITY_DETECTION], # type: ignore + 'GRAPH_MATCHING': [TaskKeyword.GRAPH_MATCHING], # type: ignore + 'TIME_SERIES_FORECASTING': [TaskKeyword.TIME_SERIES, TaskKeyword.FORECASTING], # type: ignore + 'COLLABORATIVE_FILTERING': [TaskKeyword.COLLABORATIVE_FILTERING], # type: ignore + 'OBJECT_DETECTION': [TaskKeyword.OBJECT_DETECTION], # type: ignore + 'SEMISUPERVISED_CLASSIFICATION': [TaskKeyword.SEMISUPERVISED, TaskKeyword.CLASSIFICATION], # type: ignore + 'SEMISUPERVISED_REGRESSION': [TaskKeyword.SEMISUPERVISED, TaskKeyword.REGRESSION], # type: ignore + 'BINARY': [TaskKeyword.BINARY], # type: ignore + 'MULTICLASS': [TaskKeyword.MULTICLASS], # type: ignore + 'MULTILABEL': [TaskKeyword.MULTILABEL], # type: ignore + 'UNIVARIATE': [TaskKeyword.UNIVARIATE], # type: ignore + 'MULTIVARIATE': [TaskKeyword.MULTIVARIATE], # type: ignore + 'OVERLAPPING': [TaskKeyword.OVERLAPPING], # type: ignore + 'NONOVERLAPPING': [TaskKeyword.NONOVERLAPPING], # type: ignore +} + + +class Loader(metaclass=utils.AbstractMetaclass): + """ + A base class for problem loaders. + """ + + @abc.abstractmethod + def can_load(self, problem_uri: str) -> bool: + """ + Return ``True`` if this loader can load a problem from a given URI ``problem_uri``. + + Parameters + ---------- + problem_uri: + A URI to load a problem from. + + Returns + ------- + ``True`` if this loader can load a problem from ``problem_uri``. 
+ """ + + @abc.abstractmethod + def load(self, problem_uri: str, *, problem_id: str = None, problem_version: str = None, + problem_name: str = None, strict_digest: bool = False, handle_score_split: bool = True) -> 'Problem': + """ + Loads the problem at ``problem_uri``. + + Parameters + ---------- + problem_uri: + A URI to load. + problem_id: + Override problem ID determined by the loader. + problem_version: + Override problem version determined by the loader. + problem_name: + Override problem name determined by the loader. + strict_digest: + If computed digest does not match the one provided in metadata, raise an exception? + handle_score_split: + Rename a scoring problem to not have the same name as testing problem + and update dataset references. + + Returns + ------- + A loaded problem. + """ + + @classmethod + def get_problem_class(cls) -> 'typing.Type[Problem]': + return Problem + + +class D3MProblemLoader(Loader): + """ + A class for loading of D3M problems. + + Loader support only loading from a local file system. + URI should point to the ``problemDoc.json`` file in the D3M problem directory. + """ + + SUPPORTED_VERSIONS = {'3.0', '3.1', '3.1.1', '3.1.2', '3.2.0', '3.2.1', '3.3.0', '3.3.1', '4.0.0', '4.1.0'} + + def can_load(self, dataset_uri: str) -> bool: + try: + parsed_uri = url_parse.urlparse(dataset_uri, allow_fragments=False) + except Exception: + return False + + if parsed_uri.scheme != 'file': + return False + + if parsed_uri.netloc not in ['', 'localhost']: + return False + + if not parsed_uri.path.startswith('/'): + return False + + if os.path.basename(parsed_uri.path) != 'problemDoc.json': + return False + + return True + + # "strict_digest" is not used because there is no digest in D3M problem descriptions. + def load(self, problem_uri: str, *, problem_id: str = None, problem_version: str = None, + problem_name: str = None, strict_digest: bool = False, handle_score_split: bool = True) -> 'Problem': + assert self.can_load(problem_uri) + + parsed_uri = url_parse.urlparse(problem_uri, allow_fragments=False) + + problem_doc_path = parsed_uri.path + + try: + with open(problem_doc_path, 'r', encoding='utf8') as problem_doc_file: + problem_doc = json.load(problem_doc_file) + except FileNotFoundError as error: + raise exceptions.ProblemNotFoundError( + "D3M problem '{problem_uri}' cannot be found.".format(problem_uri=problem_uri), + ) from error + + problem_schema_version = problem_doc.get('about', {}).get('problemSchemaVersion', '3.3.0') + if problem_schema_version not in self.SUPPORTED_VERSIONS: + logger.warning("Loading a problem with unsupported schema version '%(version)s'. Supported versions: %(supported_versions)s", { + 'version': problem_schema_version, + 'supported_versions': self.SUPPORTED_VERSIONS, + }) + + # To be compatible with problem descriptions which do not adhere to the schema and have only one entry for data. 
+ if not isinstance(problem_doc['inputs']['data'], list): + problem_doc['inputs']['data'] = [problem_doc['inputs']['data']] + + performance_metrics = [] + for performance_metric in problem_doc['inputs']['performanceMetrics']: + params = {} + + if 'posLabel' in performance_metric: + params['pos_label'] = performance_metric['posLabel'] + + if 'K' in performance_metric: + params['k'] = performance_metric['K'] + + performance_metrics.append({ + 'metric': PerformanceMetric.parse(performance_metric['metric']), + }) + + if params: + performance_metrics[-1]['params'] = params + + inputs = [] + for data in problem_doc['inputs']['data']: + targets = [] + for target in data['targets']: + targets.append({ + 'target_index': target['targetIndex'], + 'resource_id': target['resID'], + 'column_index': target['colIndex'], + 'column_name': target['colName'], + }) + + if 'numClusters' in target: + targets[-1]['clusters_number'] = target['numClusters'] + + privileged_data_columns = [] + for privileged_data in data.get('privilegedData', []): + privileged_data_columns.append({ + 'privileged_data_index': privileged_data['privilegedDataIndex'], + 'resource_id': privileged_data['resID'], + 'column_index': privileged_data['colIndex'], + 'column_name': privileged_data['colName'], + }) + + problem_input = { + 'dataset_id': data['datasetID'], + } + + if targets: + problem_input['targets'] = targets + + if privileged_data_columns: + problem_input['privileged_data'] = privileged_data_columns + + if data.get('forecastingHorizon', {}).get('horizonValue', None): + problem_input['forecasting_horizon'] = { + 'resource_id': data['forecastingHorizon']['resID'], + 'column_index': data['forecastingHorizon']['colIndex'], + 'column_name': data['forecastingHorizon']['colName'], + 'horizon_value': data['forecastingHorizon']['horizonValue'], + } + + inputs.append(problem_input) + + document_problem_id = problem_doc['about']['problemID'] + # Handle a special case for SCORE dataset splits (those which have "targets.csv" file). + # They are the same as TEST dataset splits, but we present them differently, so that + # SCORE dataset splits have targets as part of data. Because of this we also update + # corresponding problem ID. + # See: https://gitlab.com/datadrivendiscovery/d3m/issues/176 + if handle_score_split and os.path.exists(os.path.join(os.path.dirname(problem_doc_path), '..', 'targets.csv')) and document_problem_id.endswith('_TEST'): + document_problem_id = document_problem_id[:-5] + '_SCORE' + + # Also update dataset references. + for data in problem_doc.get('inputs', {}).get('data', []): + if data['datasetID'].endswith('_TEST'): + data['datasetID'] = data['datasetID'][:-5] + '_SCORE' + + # "dataSplits" is not exposed as a problem description. One should provide splitting + # configuration to a splitting pipeline instead. Similarly, "outputs" are not exposed either. + description = { + 'schema': PROBLEM_SCHEMA_VERSION, + 'id': problem_id or document_problem_id, + 'version': problem_version or problem_doc['about'].get('problemVersion', '1.0'), + 'name': problem_name or problem_doc['about']['problemName'], + 'location_uris': [ + # We reconstruct the URI to normalize it. + utils.fix_uri(problem_doc_path), + ], + 'problem': {}, + } + + task_keywords: typing.List = [] + + # Legacy (before v4.0.0). 
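The rest of `load()` (continued below) maps legacy task types to keywords and assembles the problem description dictionary. A minimal end-to-end sketch of the intended usage through the `Problem` class defined later in this module; the path is hypothetical and the snippet is illustrative, not part of the diff:
```
from d3m.metadata.problem import Problem

# Hypothetical local path; any D3M problemDoc.json would do.
problem = Problem.load('file:///datasets/anomaly/kpi/TRAIN/problem_TRAIN/problemDoc.json')

print(problem['id'])
# Present when the problem document declares task keywords / performance metrics.
print(problem['problem']['task_keywords'])
print(problem['problem']['performance_metrics'])
```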
+ task_keywords += TASK_TYPE_TO_KEYWORDS_MAP[problem_doc['about'].get('taskType', None)] + task_keywords += TASK_TYPE_TO_KEYWORDS_MAP[problem_doc['about'].get('taskSubType', None)] + + if problem_doc['about'].get('taskKeywords', []): + for task_keyword in problem_doc['about']['taskKeywords']: + task_keywords.append(TaskKeyword.parse(task_keyword)) + + if task_keywords: + description['problem']['task_keywords'] = sorted(set(task_keywords)) # type: ignore + + if performance_metrics: + description['problem']['performance_metrics'] = performance_metrics # type: ignore + + if problem_doc['about'].get('problemDescription', None): + description['description'] = problem_doc['about']['problemDescription'] # type: ignore + + if problem_doc['about'].get('problemURI', None): + typing.cast(typing.List[str], description['location_uris']).append(problem_doc['about']['problemURI']) + + if inputs: + description['inputs'] = inputs # type: ignore + + if 'dataAugmentation' in problem_doc: + description['data_augmentation'] = problem_doc['dataAugmentation'] + + # We do not want empty objects. + if not description['problem']: + del description['problem'] + + problem_class = self.get_problem_class() + + return problem_class(description) + + +P = typing.TypeVar('P', bound='Problem') + + +# TODO: It should be probably immutable. +class Problem(dict): + """ + A class representing a problem. + """ + + def __init__(self, problem_description: typing.Dict = None, *, strict_digest: bool = False) -> None: + super().__init__(problem_description) + + PROBLEM_SCHEMA_VALIDATOR.validate(self) + + if 'digest' in self: + digest = self.get_digest() + + if digest != self['digest']: + if strict_digest: + raise exceptions.DigestMismatchError( + "Digest for problem description '{problem_id}' does not match a computed one. Provided digest: {problem_digest}. Computed digest: {new_problem_digest}.".format( + problem_id=self['id'], + problem_digest=self['digest'], + new_problem_digest=digest, + ) + ) + else: + logger.warning( + "Digest for problem description '%(problem_id)s' does not match a computed one. Provided digest: %(problem_digest)s. Computed digest: %(new_problem_digest)s.", + { + 'problem_id': self['id'], + 'problem_digest': self['digest'], + 'new_problem_digest': digest, + }, + ) + + # We do not want it to be stored in the object because it can become + # obsolete. Use "get_digest" to get the current digest. + del self['digest'] + + loaders: typing.List[Loader] = [ + D3MProblemLoader(), + ] + + @classmethod + def load(cls, problem_uri: str, *, problem_id: str = None, problem_version: str = None, + problem_name: str = None, strict_digest: bool = False, handle_score_split: bool = True) -> 'Problem': + """ + Tries to load problem from ``problem_uri`` using all registered problem loaders. + + Parameters + ---------- + problem_uri: + A URI to load. + problem_id: + Override problem ID determined by the loader. + problem_version: + Override problem version determined by the loader. + problem_name: + Override problem name determined by the loader. + strict_digest: + If computed digest does not match the one provided in metadata, raise an exception? + handle_score_split: + Rename a scoring problem to not have the same name as testing problem + and update dataset references. + + Returns + ------- + A loaded problem. 
+ """ + + for loader in cls.loaders: + if loader.can_load(problem_uri): + return loader.load( + problem_uri, problem_id=problem_id, problem_version=problem_version, + problem_name=problem_name, strict_digest=strict_digest, + handle_score_split=handle_score_split, + ) + + raise exceptions.ProblemUriNotSupportedError( + "No known loader could load problem from '{problem_uri}'.".format(problem_uri=problem_uri) + ) + + # TODO: Allow one to specify priority which would then insert loader at a different place and not at the end? + @classmethod + def register_loader(cls, loader: Loader) -> None: + """ + Registers a new problem loader. + + Parameters + ---------- + loader: + An instance of the loader class implementing a new loader. + """ + + cls.loaders.append(loader) + + def __repr__(self) -> str: + return self.__str__() + + def _get_description_keys(self) -> typing.Sequence[str]: + return 'id', 'name', 'location_uris' + + def __str__(self) -> str: + return '{class_name}({description})'.format( + class_name=type(self).__name__, + description=', '.join('{key}=\'{value}\''.format(key=key, value=self[key]) for key in self._get_description_keys() if key in self), + ) + + def copy(self: P) -> P: + return copy.deepcopy(self) + + @classmethod + def _canonical_problem_description(cls: typing.Type[P], problem_description: typing.Dict) -> P: + """ + Before we compute digest of the problem description, we have to convert it to a + canonical structure. + + Currently, this is just removing any local URIs the description might have. + """ + + # Making a copy. + problem_description = dict(problem_description) + + utils.filter_local_location_uris(problem_description) + + if 'digest' in problem_description: + del problem_description['digest'] + + return cls(problem_description) + + def get_digest(self) -> str: + # We use "to_json_structure" here and not "to_reversible_json_structure" + # because pickled values might not be deterministic. + return utils.compute_digest(utils.to_json_structure(self._to_simple_structure(canonical=True))) + + def _to_simple_structure(self, *, canonical: bool = False) -> typing.Dict: + problem_description = self + + if canonical: + problem_description = self._canonical_problem_description(self) + + return dict(problem_description) + + def to_simple_structure(self, *, canonical: bool = False) -> typing.Dict: + problem_description = self._to_simple_structure(canonical=canonical) + + problem_description['digest'] = self.get_digest() + + return problem_description + + @classmethod + def from_simple_structure(cls: typing.Type[P], structure: typing.Dict, *, strict_digest: bool = False) -> P: + return cls(structure, strict_digest=strict_digest) + + def to_json_structure(self, *, canonical: bool = False) -> typing.Dict: + """ + For standard enumerations we map them to strings. Non-standard problem + description fields we convert in a reversible manner. 
+ """ + + PROBLEM_SCHEMA_VALIDATOR.validate(self) + + simple_structure = copy.deepcopy(self.to_simple_structure(canonical=canonical)) + + if simple_structure.get('problem', {}).get('task_keywords', []): + simple_structure['problem']['task_keywords'] = [task_keyword.name for task_keyword in simple_structure['problem']['task_keywords']] + if simple_structure.get('problem', {}).get('performance_metrics', []): + for metric in simple_structure['problem']['performance_metrics']: + metric['metric'] = metric['metric'].name + + return utils.to_reversible_json_structure(simple_structure) + + @classmethod + def from_json_structure(cls: typing.Type[P], structure: typing.Dict, *, strict_digest: bool = False) -> P: + """ + For standard enumerations we map them from strings. For non-standard problem + description fields we used a reversible conversion. + """ + + simple_structure = utils.from_reversible_json_structure(structure) + + # Legacy (before v4.0.0). + legacy_task_keywords: typing.List[TaskKeyword] = [] # type: ignore + legacy_task_keywords += JSON_TASK_TYPE_TO_KEYWORDS_MAP[simple_structure.get('problem', {}).get('task_type', None)] + legacy_task_keywords += JSON_TASK_TYPE_TO_KEYWORDS_MAP[simple_structure.get('problem', {}).get('task_subtype', None)] + + if legacy_task_keywords: + # We know "problem" field exists. + simple_structure['problem']['task_keywords'] = simple_structure['problem'].get('task_keywords', []) + legacy_task_keywords + + if simple_structure.get('problem', {}).get('task_keywords', []): + mapped_task_keywords = [] + for task_keyword in simple_structure['problem']['task_keywords']: + if isinstance(task_keyword, str): + mapped_task_keywords.append(TaskKeyword[task_keyword]) + else: + mapped_task_keywords.append(task_keyword) + simple_structure['problem']['task_keywords'] = mapped_task_keywords + if simple_structure.get('problem', {}).get('performance_metrics', []): + for metric in simple_structure['problem']['performance_metrics']: + if isinstance(metric['metric'], str): + metric['metric'] = PerformanceMetric[metric['metric']] + + return cls.from_simple_structure(simple_structure, strict_digest=strict_digest) + + +@deprecate.function(message="use Problem.load class method instead") +def parse_problem_description(problem_doc_path: str) -> Problem: + """ + Parses problem description according to ``problem.json`` metadata schema. + + It converts constants to enumerations when suitable. + + Parameters + ---------- + problem_doc_path: + File path to the problem description (``problemDoc.json``). + + Returns + ------- + A parsed problem. 
+ """ + + return Problem.load(problem_uri=utils.fix_uri(problem_doc_path)) + + +def problem_serializer(obj: Problem) -> dict: + data: typing.Dict = { + 'problem': dict(obj), + } + + if type(obj) is not Problem: + data['type'] = type(obj) + + return data + + +def problem_deserializer(data: dict) -> Problem: + problem = data.get('type', Problem)(data['problem']) + return problem + + +if pyarrow_lib is not None: + pyarrow_lib._default_serialization_context.register_type( + Problem, 'd3m.problem', + custom_serializer=problem_serializer, + custom_deserializer=problem_deserializer, + ) + + +def get_problem(problem_uri: str, *, strict_digest: bool = False, datasets_dir: str = None, handle_score_split: bool = True) -> Problem: + if datasets_dir is not None: + datasets, problem_descriptions = utils.get_datasets_and_problems(datasets_dir, handle_score_split) + + if problem_uri in problem_descriptions: + problem_uri = problem_descriptions[problem_uri] + + problem_uri = utils.fix_uri(problem_uri) + + return Problem.load(problem_uri, strict_digest=strict_digest) + + +def describe_handler( + arguments: argparse.Namespace, *, problem_resolver: typing.Callable = None, +) -> None: + if problem_resolver is None: + problem_resolver = get_problem + + output_stream = getattr(arguments, 'output', sys.stdout) + + has_errored = False + + for problem_path in arguments.problems: + if getattr(arguments, 'list', False): + print(problem_path, file=output_stream) + + try: + problem = problem_resolver(problem_path, strict_digest=getattr(arguments, 'strict_digest', False)) + except Exception as error: + if getattr(arguments, 'continue', False): + traceback.print_exc(file=output_stream) + print(f"Error parsing problem: {problem_path}", file=output_stream) + has_errored = True + continue + else: + raise Exception(f"Error parsing problem: {problem_path}") from error + + try: + problem_description = problem.to_json_structure(canonical=True) + + if getattr(arguments, 'print', False): + pprint.pprint(problem_description, stream=output_stream) + elif not getattr(arguments, 'no_print', False): + json.dump( + problem_description, + output_stream, + indent=(getattr(arguments, 'indent', 2) or None), + sort_keys=getattr(arguments, 'sort_keys', False), + allow_nan=False, + ) # type: ignore + output_stream.write('\n') + except Exception as error: + if getattr(arguments, 'continue', False): + traceback.print_exc(file=output_stream) + print(f"Error describing problem: {problem_path}", file=output_stream) + has_errored = True + continue + else: + raise Exception(f"Error describing problem: {problem_path}") from error + + if has_errored: + sys.exit(1) + + +def main(argv: typing.Sequence) -> None: + raise exceptions.NotSupportedError("This CLI has been removed. 
Use \"python3 -m d3m problem describe\" instead.") + + +if __name__ == '__main__': + main(sys.argv) diff --git a/d3m/d3m/metadata/schemas/v0/container.json b/d3m/d3m/metadata/schemas/v0/container.json new file mode 100644 index 0000000..f0a8852 --- /dev/null +++ b/d3m/d3m/metadata/schemas/v0/container.json @@ -0,0 +1,62 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema", + "id": "https://metadata.datadrivendiscovery.org/schemas/v0/container.json", + "title": "Container metadata", + "description": "Schema for metadata for the container (value passed between primitives).", + "type": "object", + "properties": { + "schema": { + "$ref": "https://metadata.datadrivendiscovery.org/schemas/v0/definitions.json#/definitions/schema" + }, + "id": { + "$ref": "https://metadata.datadrivendiscovery.org/schemas/v0/definitions.json#/definitions/id" + }, + "version": { + "$ref": "https://metadata.datadrivendiscovery.org/schemas/v0/definitions.json#/definitions/version" + }, + "digest": { + "$ref": "https://metadata.datadrivendiscovery.org/schemas/v0/definitions.json#/definitions/digest" + }, + "name": { + "$ref": "https://metadata.datadrivendiscovery.org/schemas/v0/definitions.json#/definitions/name" + }, + "other_names": { + "$ref": "https://metadata.datadrivendiscovery.org/schemas/v0/definitions.json#/definitions/other_names" + }, + "description": { + "$ref": "https://metadata.datadrivendiscovery.org/schemas/v0/definitions.json#/definitions/description" + }, + "keywords": { + "$ref": "https://metadata.datadrivendiscovery.org/schemas/v0/definitions.json#/definitions/keywords" + }, + "source": { + "$ref": "https://metadata.datadrivendiscovery.org/schemas/v0/definitions.json#/definitions/source" + }, + "structural_type": { + "$ref": "https://metadata.datadrivendiscovery.org/schemas/v0/definitions.json#/definitions/structural_type" + }, + "stored_size": { + "$ref": "https://metadata.datadrivendiscovery.org/schemas/v0/definitions.json#/definitions/stored_size" + }, + "approximate_stored_size": { + "$ref": "https://metadata.datadrivendiscovery.org/schemas/v0/definitions.json#/definitions/approximate_stored_size" + }, + "semantic_types": { + "$ref": "https://metadata.datadrivendiscovery.org/schemas/v0/definitions.json#/definitions/semantic_types" + }, + "dimension": { + "$ref": "https://metadata.datadrivendiscovery.org/schemas/v0/definitions.json#/definitions/dimension" + }, + "location_uris": { + "$ref": "https://metadata.datadrivendiscovery.org/schemas/v0/definitions.json#/definitions/location_uris" + }, + "data_metafeatures": { + "$ref": "https://metadata.datadrivendiscovery.org/schemas/v0/definitions.json#/definitions/data_metafeatures" + } + }, + "required": [ + "schema", + "structural_type" + ], + "additionalProperties": true +} diff --git a/d3m/d3m/metadata/schemas/v0/data.json b/d3m/d3m/metadata/schemas/v0/data.json new file mode 100644 index 0000000..d6e8ffe --- /dev/null +++ b/d3m/d3m/metadata/schemas/v0/data.json @@ -0,0 +1,64 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema", + "id": "https://metadata.datadrivendiscovery.org/schemas/v0/datum.json", + "title": "Data metadata", + "description": "Schema for metadata for data itself (e.g., cells).", + "type": "object", + "properties": { + "name": { + "$ref": "https://metadata.datadrivendiscovery.org/schemas/v0/definitions.json#/definitions/name" + }, + "other_names": { + "$ref": "https://metadata.datadrivendiscovery.org/schemas/v0/definitions.json#/definitions/other_names" + }, + "description": { + "$ref": 
"https://metadata.datadrivendiscovery.org/schemas/v0/definitions.json#/definitions/description" + }, + "keywords": { + "$ref": "https://metadata.datadrivendiscovery.org/schemas/v0/definitions.json#/definitions/keywords" + }, + "source": { + "$ref": "https://metadata.datadrivendiscovery.org/schemas/v0/definitions.json#/definitions/source" + }, + "structural_type": { + "$ref": "https://metadata.datadrivendiscovery.org/schemas/v0/definitions.json#/definitions/structural_type" + }, + "media_types": { + "$ref": "https://metadata.datadrivendiscovery.org/schemas/v0/definitions.json#/definitions/media_types" + }, + "sampling_rate": { + "$ref": "https://metadata.datadrivendiscovery.org/schemas/v0/definitions.json#/definitions/sampling_rate" + }, + "stored_size": { + "$ref": "https://metadata.datadrivendiscovery.org/schemas/v0/definitions.json#/definitions/stored_size" + }, + "semantic_types": { + "$ref": "https://metadata.datadrivendiscovery.org/schemas/v0/definitions.json#/definitions/semantic_types" + }, + "dimension": { + "$ref": "https://metadata.datadrivendiscovery.org/schemas/v0/definitions.json#/definitions/dimension" + }, + "location_base_uris": { + "$ref": "https://metadata.datadrivendiscovery.org/schemas/v0/definitions.json#/definitions/location_base_uris" + }, + "file_columns": { + "$ref": "https://metadata.datadrivendiscovery.org/schemas/v0/definitions.json#/definitions/file_columns" + }, + "file_columns_count": { + "$ref": "https://metadata.datadrivendiscovery.org/schemas/v0/definitions.json#/definitions/file_columns_count" + }, + "foreign_key": { + "$ref": "https://metadata.datadrivendiscovery.org/schemas/v0/definitions.json#/definitions/foreign_key" + }, + "boundary_for": { + "$ref": "https://metadata.datadrivendiscovery.org/schemas/v0/definitions.json#/definitions/boundary_for" + }, + "data_metafeatures": { + "$ref": "https://metadata.datadrivendiscovery.org/schemas/v0/definitions.json#/definitions/data_metafeatures" + }, + "all_distinct_values": { + "$ref": "https://metadata.datadrivendiscovery.org/schemas/v0/definitions.json#/definitions/all_distinct_values" + } + }, + "additionalProperties": true +} diff --git a/d3m/d3m/metadata/schemas/v0/definitions.json b/d3m/d3m/metadata/schemas/v0/definitions.json new file mode 100644 index 0000000..ae536fc --- /dev/null +++ b/d3m/d3m/metadata/schemas/v0/definitions.json @@ -0,0 +1,4415 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema", + "id": "https://metadata.datadrivendiscovery.org/schemas/v0/definitions.json", + "definitions": { + "id": { + "type": "string", + "description": "A static id. It should never change for a given value, even if the value itself is changing. For example, all versions of the same primitive should have the same id. If possible, it should be a UUID generated in any way, but if there is an existing id available, it can be reused." + }, + "hash_id": { + "type": "string", + "description": "An UUIDv5 id computed by using UUID namespace \"8614b2cc-89ef-498e-9254-833233b3959b\" and JSON-serialized contents of the document without the \"id\" field for UUID name." + }, + "version": { + "type": "string", + "description": "A string representing a version. Versions can be PEP 440 version strings or a SHA256 hexadecimal digest of value's content, if applicable. In the former case they are compared according to PEP 440 rules." + }, + "digest": { + "type": "string", + "description": "A SHA256 hexadecimal digest of value's content. For datasets is digest over all files. 
For primitives it is a digest of its \"id\" and \"installation\" metadata. For other JSON-compatible structures, it is generally a digest of the canonical JSON-serialization of the structure, without the \"digest\" field itself.", + "pattern": "^[a-fA-F0-9]{64}$" + }, + "schema": { + "type": "string", + "description": "A URI representing a metadata.datadrivendiscovery.org schema and version to which metadata conforms.", + "format": "uri" + }, + "description": { + "type": "string", + "description": "A natural language description in an unspecified language." + }, + "name": { + "type": "string", + "description": "A human readable name in an unspecified language or format." + }, + "other_names": { + "type": "array", + "description": "Any other names associated with the value.", + "items": { + "$ref": "#/definitions/name" + }, + "minItems": 1 + }, + "python_path": { + "type": "string", + "description": "A fully-qualified Python path to primitive's class under the \"d3m.primitives\" namespace.", + "pattern": "^d3m\\.primitives\\." + }, + "original_python_path": { + "type": "string", + "description": "A fully-qualified Python path to primitive's class inside installable package and not one under the \"d3m.primitives\" namespace." + }, + "dimension": { + "type": "object", + "description": "Metadata for the dimension (e.g., rows and columns).", + "properties": { + "name": { + "$ref": "#/definitions/name" + }, + "description": { + "$ref": "#/definitions/description" + }, + "semantic_types": { + "$ref": "#/definitions/semantic_types" + }, + "length": { + "type": "integer", + "description": "Number of elements in a given dimension (number of samples, number of columns, etc.)." + }, + "sampling_rate": { + "allOf": [{"$ref": "#/definitions/sampling_rate"}], + "description": "If values in the dimension are sampled, this value represents the sampling rate in seconds." + } + }, + "required": [ + "length" + ], + "additionalProperties": true + }, + "data_metafeatures": { + "type": "object", + "description": "Some data metafeatures can apply both at the container (dataset) or internal data levels (resource, table, column). In any case they apply and hold for the whole underlying structure. For example, if \"number_distinct_values\" is set at a dataset level, it means that all columns in the dataset have this number of distinct values. If it is set only for a target column, then only that column has this number of distinct values, classes.", + "properties": { + "number_of_attributes": { + "type": "integer", + "description": "The number of attributes in the data." + }, + "number_of_instances": { + "type": "integer", + "description": "The number of instances in the data." + }, + "dimensionality": { + "type": "number", + "description": "Number of attributes divided by the number of instances." + }, + "number_of_numeric_attributes": { + "type": "integer", + "description": "Number of numeric attributes, which are not also categorical." + }, + "ratio_of_numeric_attributes": { + "type": "number", + "description": "Ratio of number of numeric attributes to total number of attributes." + }, + "number_of_string_attributes": { + "type": "integer", + "description": "Number of string attributes, which are not also categorical." + }, + "ratio_of_string_attributes": { + "type": "number", + "description": "Ratio of number of string attributes to total number of attributes." + }, + "number_of_categorical_attributes": { + "type": "integer", + "description": "Number of categorical attributes." 
+ }, + "ratio_of_categorical_attributes": { + "type": "number", + "description": "Ratio of number of categorical attributes to total number of attributes." + }, + "number_of_other_attributes": { + "type": "integer", + "description": "Number of other (not numeric, not string, and not categorical) attributes." + }, + "ratio_of_other_attributes": { + "type": "number", + "description": "Ratio of number of other attributes to total number of attributes." + }, + "number_of_discrete_attributes": { + "type": "integer", + "description": "Number of discrete attributes. A discrete attribute is a numeric attribute with only integer values." + }, + "ratio_of_discrete_attributes": { + "type": "number", + "description": "Ratio of number of discrete attributes to total number of attributes. A discrete attribute is a numeric attribute with only integer values." + }, + "number_of_binary_attributes": { + "type": "integer", + "description": "Number of binary attributes. A binary attribute is a discrete attribute with exactly two values." + }, + "ratio_of_binary_attributes": { + "type": "number", + "description": "Ratio of number of binary attributes to total number of attributes. A binary attribute is a discrete attribute with exactly two values." + }, + "attribute_counts_by_structural_type": { + "type": "object", + "description": "A map between structural types as string and a count of attributes with that structural type.", + "additionalProperties": { + "type": "integer" + } + }, + "attribute_ratios_by_structural_type": { + "type": "object", + "description": "A map between structural types as string and a ratio of attributes with that structural type to all attributes.", + "additionalProperties": { + "type": "number" + } + }, + "attribute_counts_by_semantic_type": { + "type": "object", + "description": "A map between semantic types and a count of attributes with that semantic type. Attributes can have multiple semantic types.", + "additionalProperties": { + "type": "integer" + } + }, + "attribute_ratios_by_semantic_type": { + "type": "object", + "description": "A map between semantic types as string and a ratio of attributes with that semantic type to all attributes. Attributes can have multiple semantic types.", + "additionalProperties": { + "type": "number" + } + }, + "number_distinct_values": { + "type": "integer", + "description": "The number of distinct non-missing values for categorical or discrete values." + }, + "entropy_of_values": { + "type": "number", + "description": "The entropy of non-missing values. If values are not categorical or discrete, they are binned into \"number of all values\" ^ 1/3 bins." + }, + "value_counts_aggregate": { + "allOf": [{"$ref": "#/definitions/aggregate"}], + "description": "Aggregate statistics of occurrence counts of non-missing values. If values are not categorical or discrete, they are binned into \"number of all values\" ^ 1/3 bins." + }, + "value_probabilities_aggregate": { + "allOf": [{"$ref": "#/definitions/aggregate"}], + "description": "Aggregate statistics of probabilities of non-missing values. Probability of a value is defined as \"an occurrence count of a non-missing value\" / \"number of all non-missing values\". If values are not categorical or discrete, they are binned into \"number of all values\" ^ 1/3 bins." + }, + "values_aggregate": { + "allOf": [{"$ref": "#/definitions/aggregate"}], + "description": "Aggregate statistics of numeric non-missing values." 
+ }, + "number_distinct_values_of_categorical_attributes": { + "allOf": [{"$ref": "#/definitions/aggregate"}], + "description": "Aggregate statistics about the number of distinct non-missing values in each categorical attributes." + }, + "number_distinct_values_of_numeric_attributes": { + "allOf": [{"$ref": "#/definitions/aggregate"}], + "description": "Aggregate statistics about the number of distinct non-missing values in each numeric attributes." + }, + "number_distinct_values_of_discrete_attributes": { + "allOf": [{"$ref": "#/definitions/aggregate"}], + "description": "Aggregate statistics about the number of distinct non-missing values in each discrete attributes." + }, + "mean_of_attributes": { + "allOf": [{"$ref": "#/definitions/aggregate"}], + "description": "Aggregate statistics about the mean of numeric attributes." + }, + "standard_deviation_of_attributes": { + "allOf": [{"$ref": "#/definitions/aggregate"}], + "description": "Aggregate statistics about the standard deviation of numeric attributes." + }, + "kurtosis_of_attributes": { + "allOf": [{"$ref": "#/definitions/aggregate"}], + "description": "Aggregate statistics about the kurtosis of numeric attributes." + }, + "skew_of_attributes": { + "allOf": [{"$ref": "#/definitions/aggregate"}], + "description": "Aggregate statistics about the skew of numeric attributes." + }, + "entropy_of_categorical_attributes": { + "allOf": [{"$ref": "#/definitions/aggregate"}], + "description": "Aggregate statistics about the entropy of categorical attributes." + }, + "entropy_of_numeric_attributes": { + "allOf": [{"$ref": "#/definitions/aggregate"}], + "description": "Aggregate statistics about the entropy of numeric attributes." + }, + "entropy_of_discrete_attributes": { + "allOf": [{"$ref": "#/definitions/aggregate"}], + "description": "Aggregate statistics about the entropy of discrete attributes." + }, + "entropy_of_attributes": { + "allOf": [{"$ref": "#/definitions/aggregate"}], + "description": "Aggregate statistics about the entropy of all (categorical and numeric) attributes." + }, + "joint_entropy_of_categorical_attributes": { + "allOf": [{"$ref": "#/definitions/aggregate"}], + "description": "Aggregate statistics about the joint entropy of every categorical attribute with a given target." + }, + "joint_entropy_of_numeric_attributes": { + "allOf": [{"$ref": "#/definitions/aggregate"}], + "description": "Aggregate statistics about the joint entropy of every numeric attribute with a given target." + }, + "joint_entropy_of_discrete_attributes": { + "allOf": [{"$ref": "#/definitions/aggregate"}], + "description": "Aggregate statistics about the joint entropy of every discrete attribute with a given target." + }, + "joint_entropy_of_attributes": { + "allOf": [{"$ref": "#/definitions/aggregate"}], + "description": "Aggregate statistics about the joint entropy of every (categorical and numeric) attribute with a given target." + }, + "mutual_information_of_categorical_attributes": { + "allOf": [{"$ref": "#/definitions/aggregate"}], + "description": "Aggregate statistics about the mutual information of every categorical attribute with a given target." + }, + "mutual_information_of_numeric_attributes": { + "allOf": [{"$ref": "#/definitions/aggregate"}], + "description": "Aggregate statistics about the mutual information of every numeric attribute with a given target." 
+ }, + "mutual_information_of_discrete_attributes": { + "allOf": [{"$ref": "#/definitions/aggregate"}], + "description": "Aggregate statistics about the mutual information of every discrete attribute with a given target." + }, + "mutual_information_of_attributes": { + "allOf": [{"$ref": "#/definitions/aggregate"}], + "description": "Aggregate statistics about the mutual information of every (categorical and numeric) attribute with a given target." + }, + "pearson_correlation_of_numeric_attributes": { + "allOf": [{"$ref": "#/definitions/aggregate"}], + "description": "Aggregate statistics about the pearson correlation between all pairs of numeric attributes. If set on a target column, it represents aggregate statistics about the pearson correlation of every numeric attribute with that target." + }, + "spearman_correlation_of_numeric_attributes": { + "allOf": [{"$ref": "#/definitions/aggregate"}], + "description": "Aggregate statistics about the spearman correlation between all pairs of numeric attributes. If set on a target column, it represents aggregate statistics about the spearman correlation of every numeric attribute with that target." + }, + "canonical_correlation_of_numeric_attributes": { + "allOf": [{"$ref": "#/definitions/aggregate"}], + "description": "Aggregate statistics about the canonical correlation between all pairs of numeric attributes. If set on a target column, it represents aggregate statistics about the canonical correlation of every numeric attribute with that target." + }, + "equivalent_number_of_categorical_attributes": { + "type": "number", + "description": "Number of categorical attributes needed to optimally describe the target (under the assumption of independence among attributes). Equals target's \"entropy_of_values\" divided by \"mutual_information_of_categorical_attributes.mean\"." + }, + "equivalent_number_of_numeric_attributes": { + "type": "number", + "description": "Number of numeric attributes needed to optimally describe the target (under the assumption of independence among attributes). Equals target's \"entropy_of_values\" divided by \"mutual_information_of_numeric_attributes.mean\"." + }, + "equivalent_number_of_discrete_attributes": { + "type": "number", + "description": "Number of discrete attributes needed to optimally describe the target (under the assumption of independence among attributes). Equals target's \"entropy_of_values\" divided by \"mutual_information_of_discrete_attributes.mean\"." + }, + "equivalent_number_of_attributes": { + "type": "number", + "description": "Number of all (categorical and numeric) attributes needed to optimally describe the target (under the assumption of independence among attributes). Equals target's \"entropy_of_values\" divided by \"mutual_information_of_attributes.mean\"." + }, + "categorical_noise_to_signal_ratio": { + "type": "number", + "description": "An estimate of the amount of irrelevant information in the categorical attributes regarding the target. Equals (\"entropy_of_categorical_attributes.mean\" - \"mutual_information_of_categorical_attributes.mean\") divided by \"mutual_information_of_categorical_attributes.mean\"." + }, + "numeric_noise_to_signal_ratio": { + "type": "number", + "description": "An estimate of the amount of irrelevant information in the numeric attributes regarding the target. Equals (\"entropy_of_numeric_attributes.mean\" - \"mutual_information_of_numeric_attributes.mean\") divided by \"mutual_information_of_numeric_attributes.mean\"." 
+ }, + "discrete_noise_to_signal_ratio": { + "type": "number", + "description": "An estimate of the amount of irrelevant information in the discrete attributes regarding the target. Equals (\"entropy_of_discrete_attributes.mean\" - \"mutual_information_of_discrete_attributes.mean\") divided by \"mutual_information_of_discrete_attributes.mean\"." + }, + "noise_to_signal_ratio": { + "type": "number", + "description": "An estimate of the amount of irrelevant information in all (categorical and numeric) attributes regarding the target. Equals (\"entropy_of_attributes.mean\" - \"mutual_information_of_attributes.mean\") divided by \"mutual_information_of_attributes.mean\"." + }, + "number_of_missing_values": { + "type": "integer", + "description": "Number of missing values." + }, + "ratio_of_missing_values": { + "type": "number", + "description": "Ratio of number of missing values to number of all values." + }, + "number_of_present_values": { + "type": "integer", + "description": "Number of present values." + }, + "ratio_of_present_values": { + "type": "number", + "description": "Ratio of number of present values to number of all values." + }, + "number_of_numeric_values": { + "type": "integer", + "description": "Number of values that are strictly integers or floats. The value NaN is not counted." + }, + "ratio_of_numeric_values": { + "type": "number", + "description": "Ratio of number of values that are strictly integers or floats to number of all values. The value NaN is not counted." + }, + "number_of_positive_numeric_values": { + "type": "integer", + "description": "Number of positive values." + }, + "ratio_of_positive_numeric_values": { + "type": "number", + "description": "Ratio of number of positive values to number of all values." + }, + "number_of_negative_numeric_values": { + "type": "integer", + "description": "Number of negative values." + }, + "ratio_of_negative_numeric_values": { + "type": "number", + "description": "Ratio of number of negative values to number of all values." + }, + "number_of_numeric_values_equal_0": { + "type": "integer", + "description": "Number of 0 or 0.0 values." + }, + "ratio_of_numeric_values_equal_0": { + "type": "number", + "description": "Ratio of number of 0 or 0.0 values to number of all values." + }, + "number_of_numeric_values_equal_1": { + "type": "integer", + "description": "Number of 1 or 1.0 values." + }, + "ratio_of_numeric_values_equal_1": { + "type": "number", + "description": "Ratio of number of 1 or 1.0 values to number of all values." + }, + "number_of_numeric_values_equal_-1": { + "type": "integer", + "description": "Number of -1 and -1.0." + }, + "ratio_of_numeric_values_equal_-1": { + "type": "number", + "description": "Ratio of number of -1 and -1.0 to number of all values." + }, + "number_of_outlier_numeric_values": { + "allOf": [{"$ref": "#/definitions/outliers"}], + "description": "Outliers of numeric values." + }, + "number_of_instances_with_missing_values": { + "type": "integer", + "description": "Number of instances with missing values in one or more attributes." + }, + "ratio_of_instances_with_missing_values": { + "type": "number", + "description": "Ratio of number of instances with missing values in one or more attributes to number of all instances." + }, + "number_of_instances_with_present_values": { + "type": "integer", + "description": "Number of instances with present values in one or more attributes." 
+ },
+ "ratio_of_instances_with_present_values": {
+ "type": "number",
+ "description": "Ratio of number of instances with present values in one or more attributes to number of all instances."
+ },
+ "natural_language_of_attribute": {
+ "type": "array",
+ "description": "Natural language detection that contains pairs of language code and count.",
+ "items": {
+ "type": "object",
+ "properties": {
+ "code": {
+ "type": "string",
+ "description": "ISO 639-1 language code, e.g., \"en\", \"es\", \"zh\"."
+ },
+ "count": {
+ "type": "integer",
+ "description": "Number of values in an attribute with the given language code."
+ }
+ },
+ "required": [
+ "code",
+ "count"
+ ],
+ "additionalProperties": true
+ },
+ "minItems": 1
+ },
+ "length_of_string_values": {
+ "allOf": [{"$ref": "#/definitions/aggregate"}],
+ "description": "Aggregate statistics about the length of string values."
+ },
+ "token_count_in_string_values": {
+ "allOf": [{"$ref": "#/definitions/aggregate"}],
+ "description": "Aggregate statistics about the number of tokens per string value. Tokens are split by the space character."
+ },
+ "numeric_char_density": {
+ "allOf": [{"$ref": "#/definitions/aggregate"}],
+ "description": "Aggregate statistics about numeric character density of string values. Density is defined to be the number of characters that satisfy \"isdigit\" divided by the number of characters in the string."
+ },
+ "number_of_values_containing_numeric_char": {
+ "type": "integer",
+ "description": "Number of string values that contain at least one numeric character."
+ },
+ "ratio_of_values_containing_numeric_char": {
+ "type": "number",
+ "description": "Ratio of number of string values that contain at least one numeric character to number of all string values."
+ },
+ "number_of_tokens": {
+ "type": "integer",
+ "description": "Number of tokens in all string values. Tokens are split by the space character."
+ },
+ "number_of_tokens_containing_numeric_char": {
+ "type": "integer",
+ "description": "Number of tokens in all string values that contain at least one numeric character."
+ },
+ "ratio_of_tokens_containing_numeric_char": {
+ "type": "number",
+ "description": "Ratio of number of tokens in all string values that contain at least one numeric character to number of tokens in all string values."
+ },
+ "number_of_tokens_split_by_punctuation": {
+ "type": "integer",
+ "description": "Number of tokens in all string values. Tokens are split by \"string.punctuation\"."
+ },
+ "number_of_tokens_split_by_punctuation_containing_numeric_char": {
+ "type": "integer",
+ "description": "Number of tokens in all string values that contain at least one numeric character. Tokens are split by \"string.punctuation\"."
+ },
+ "ratio_of_tokens_split_by_punctuation_containing_numeric_char": {
+ "type": "number",
+ "description": "Ratio of number of tokens in all string values that contain at least one numeric character to number of tokens in all string values split by punctuation."
+ },
+ "number_of_values_with_leading_spaces": {
+ "type": "integer",
+ "description": "Number of string values with leading whitespaces."
+ },
+ "ratio_of_values_with_leading_spaces": {
+ "type": "number",
+ "description": "Ratio of number of string values with leading whitespaces to number of all string values."
+ },
+ "number_of_values_with_trailing_spaces": {
+ "type": "integer",
+ "description": "Number of string values with trailing whitespaces."
+ },
+ "ratio_of_values_with_trailing_spaces": {
+ "type": "number",
+ "description": "Ratio of number of string values with trailing whitespaces to number of all string values."
+ },
+ "number_of_distinct_values": {
+ "type": "integer",
+ "description": "Number of distinct values. Missing values are ignored."
+ },
+ "ratio_of_distinct_values": {
+ "type": "number",
+ "description": "Ratio of number of distinct values to number of all values. Missing values are ignored."
+ },
+ "number_of_distinct_tokens": {
+ "type": "integer",
+ "description": "Number of distinct tokens in all string values. Tokens are split by the space character. Missing values are ignored."
+ },
+ "ratio_of_distinct_tokens": {
+ "type": "number",
+ "description": "Ratio of number of distinct tokens in all string values to number of tokens in all string values. Tokens are split by the space character. Missing values are ignored."
+ },
+ "number_of_distinct_tokens_split_by_punctuation": {
+ "type": "integer",
+ "description": "Number of distinct tokens in all string values. Tokens are split by \"string.punctuation\". Missing values are ignored."
+ },
+ "ratio_of_distinct_tokens_split_by_punctuation": {
+ "type": "number",
+ "description": "Ratio of number of distinct tokens in all string values to number of tokens in all string values. Tokens are split by \"string.punctuation\". Missing values are ignored."
+ },
+ "most_common_tokens": {
+ "type": "array",
+ "description": "Most common tokens and their counts and ratio. Tokens are split by the space character.",
+ "items": {
+ "type": "object",
+ "properties": {
+ "token": {
+ "type": "string",
+ "description": "Token string value."
+ },
+ "count": {
+ "type": "integer",
+ "description": "Number of occurrences of this token in all string values."
+ },
+ "ratio": {
+ "type": "number",
+ "description": "Ratio of number of occurrences of this token in all string values to number of tokens in all string values."
+ }
+ },
+ "required": [
+ "token",
+ "count"
+ ],
+ "additionalProperties": true
+ },
+ "minItems": 1
+ },
+ "most_common_alphanumeric_tokens": {
+ "type": "array",
+ "description": "Most common alphanumeric tokens and their counts and ratio. A token is alphanumeric if \"isalnum\" returns \"True\". Tokens are split by the space character.",
+ "items": {
+ "type": "object",
+ "properties": {
+ "token": {
+ "type": "string",
+ "description": "Token string value."
+ },
+ "count": {
+ "type": "integer",
+ "description": "Number of occurrences of this token in all string values."
+ },
+ "ratio": {
+ "type": "number",
+ "description": "Ratio of number of occurrences of this token in all string values to number of tokens in all string values."
+ }
+ },
+ "required": [
+ "token",
+ "count"
+ ],
+ "additionalProperties": true
+ },
+ "minItems": 1
+ },
+ "most_common_numeric_tokens": {
+ "type": "array",
+ "description": "Most common numeric tokens and their counts and ratio. Tokens are split by the space character.",
+ "items": {
+ "type": "object",
+ "properties": {
+ "token": {
+ "type": "string",
+ "description": "Token string value."
+ },
+ "count": {
+ "type": "integer",
+ "description": "Number of occurrences of this token in all string values."
+ },
+ "ratio": {
+ "type": "number",
+ "description": "Ratio of number of occurrences of this token in all string values to number of tokens in all string values."
+ }
+ },
+ "required": [
+ "token",
+ "count"
+ ],
+ "additionalProperties": true
+ },
+ "minItems": 1
+ },
+ "most_common_tokens_split_by_punctuation": {
+ "type": "array",
+ "description": "Most common tokens and their counts and ratio. Tokens are split by \"string.punctuation\".",
+ "items": {
+ "type": "object",
+ "properties": {
+ "token": {
+ "type": "string",
+ "description": "Token string value."
+ },
+ "count": {
+ "type": "integer",
+ "description": "Number of occurrences of this token in all string values."
+ },
+ "ratio": {
+ "type": "number",
+ "description": "Ratio of number of occurrences of this token in all string values to number of tokens in all string values."
+ }
+ },
+ "required": [
+ "token",
+ "count"
+ ],
+ "additionalProperties": true
+ },
+ "minItems": 1
+ },
+ "most_common_punctuations": {
+ "type": "array",
+ "description": "The most common punctuations and their counts. Punctuations are defined by \"string.punctuation\".",
+ "items": {
+ "type": "object",
+ "properties": {
+ "punctuation": {
+ "type": "string",
+ "description": "Punctuation string value."
+ },
+ "count": {
+ "type": "integer",
+ "description": "Number of occurrences of this punctuation in all string values."
+ },
+ "ratio": {
+ "type": "number",
+ "description": "Ratio of number of occurrences of this punctuation in all string values to number of characters in all string values."
+ },
+ "punctuation_density_aggregate": {
+ "allOf": [{"$ref": "#/definitions/aggregate"}],
+ "description": "Aggregate statistics about punctuation density of string values for this punctuation. Punctuation density is the ratio of number of occurrences of this punctuation in the value to the number of characters in the value."
+ },
+ "punctuation_density_outliers": {
+ "allOf": [{"$ref": "#/definitions/outliers"}],
+ "description": "Outliers of punctuation density of string values for this punctuation. Punctuation density is the ratio of number of occurrences of this punctuation in the value to the number of characters in the value."
+ }
+ },
+ "required": [
+ "punctuation",
+ "count"
+ ],
+ "additionalProperties": true
+ },
+ "minItems": 1
+ },
+ "most_common_raw_values": {
+ "type": "array",
+ "description": "Most common values and their counts and ratio.",
+ "items": {
+ "type": "object",
+ "properties": {
+ "value": {
+ "type": "string",
+ "description": "Value in its raw string format."
+ },
+ "count": {
+ "type": "integer",
+ "description": "Number of occurrences of this value in all values."
+ },
+ "ratio": {
+ "type": "number",
+ "description": "Ratio of number of occurrences of this value in all values to number of all values."
+ }
+ },
+ "required": [
+ "value",
+ "count"
+ ],
+ "additionalProperties": true
+ },
+ "minItems": 1
+ },
+ "default_accuracy": {
+ "type": "number",
+ "description": "The predictive accuracy obtained by always predicting the majority class."
+ },
+ "pca": {
+ "type": "object",
+ "description": "The results of principal component analysis on the data using default hyper-parameters.",
+ "properties": {
+ "explained_variance_ratio_component_1": {
+ "type": "number",
+ "description": "The explained variance ratio of component 1."
+ },
+ "explained_variance_ratio_component_2": {
+ "type": "number",
+ "description": "The explained variance ratio of component 2."
+ },
+ "explained_variance_ratio_component_3": {
+ "type": "number",
+ "description": "The explained variance ratio of component 3."
+ },
+ "eigenvalue_component_1": {
+ "type": "number",
+ "description": "The eigenvalue for component 1."
+ },
+ "eigenvalue_component_2": {
+ "type": "number",
+ "description": "The eigenvalue for component 2."
+ },
+ "eigenvalue_component_3": {
+ "type": "number",
+ "description": "The eigenvalue for component 3."
+ },
+ "determinant_of_covariance": {
+ "type": "number",
+ "description": "The determinant of the covariance matrix."
+ },
+ "primitive": {
+ "allOf": [{"$ref": "#/definitions/primitive_reference"}],
+ "description": "A primitive used to compute these metafeatures."
+ },
+ "random_seed": {
+ "type": "integer",
+ "description": "Random seed used, if a primitive accepts a random seed."
+ }
+ },
+ "required": [
+ "primitive"
+ ],
+ "additionalProperties": true
+ },
+ "oner": {
+ "type": "object",
+ "description": "The results of training Weka's OneR algorithm (or equivalent implementation) on the data using default hyper-parameters.",
+ "properties": {
+ "accuracy": {
+ "type": "number",
+ "description": "The predictive accuracy. Determines how much information is contained in the most predictive attribute."
+ },
+ "primitive": {
+ "allOf": [{"$ref": "#/definitions/primitive_reference"}],
+ "description": "A primitive used to compute these metafeatures."
+ },
+ "random_seed": {
+ "type": "integer",
+ "description": "Random seed used, if a primitive accepts a random seed."
+ }
+ },
+ "required": [
+ "primitive"
+ ],
+ "additionalProperties": true
+ },
+ "random_tree": {
+ "type": "object",
+ "description": "The results of training decision trees of various depths with random splits and other hyper-parameters set to defaults.",
+ "properties": {
+ "depth_1_error_rate": {
+ "type": "number",
+ "description": "The error rate resulting from training a depth 1 decision tree with a random split."
+ },
+ "depth_1_kappa": {
+ "type": "number",
+ "description": "The kappa resulting from training a depth 1 decision tree with a random split."
+ },
+ "depth_1_auc": {
+ "type": "number",
+ "description": "The auc resulting from training a depth 1 decision tree with a random split."
+ },
+ "depth_2_error_rate": {
+ "type": "number",
+ "description": "The error rate resulting from training a depth 2 decision tree with a random split."
+ },
+ "depth_2_kappa": {
+ "type": "number",
+ "description": "The kappa resulting from training a depth 2 decision tree with a random split."
+ },
+ "depth_2_auc": {
+ "type": "number",
+ "description": "The auc resulting from training a depth 2 decision tree with a random split."
+ },
+ "depth_3_error_rate": {
+ "type": "number",
+ "description": "The error rate resulting from training a depth 3 decision tree with a random split."
+ },
+ "depth_3_kappa": {
+ "type": "number",
+ "description": "The kappa resulting from training a depth 3 decision tree with a random split."
+ },
+ "depth_3_auc": {
+ "type": "number",
+ "description": "The auc resulting from training a depth 3 decision tree with a random split."
+ },
+ "primitive": {
+ "allOf": [{"$ref": "#/definitions/primitive_reference"}],
+ "description": "A primitive used to compute these metafeatures."
+ },
+ "random_seed": {
+ "type": "integer",
+ "description": "Random seed used, if a primitive accepts a random seed."
+ } + }, + "required": [ + "primitive" + ], + "additionalProperties": true + }, + "decision_stump": { + "type": "object", + "description": "The results of training a depth 1 decision tree on the data with the best split based on entropy and other hyper-parameters set to defaults.", + "properties": { + "error_rate": { + "type": "number", + "description": "The error rate resulting from training a depth 1 decision tree with the best split based on entropy." + }, + "kappa": { + "type": "number", + "description": "The kappa resulting from training a depth 1 decision tree with the best split based on entropy." + }, + "auc": { + "type": "number", + "description": "The auc resulting from training a depth 1 decision tree with the best split based on entropy." + }, + "primitive": { + "allOf": [{"$ref": "#/definitions/primitive_reference"}], + "description": "A primitive used to compute these metafeatures." + }, + "random_seed": { + "type": "integer", + "description": "Random seed used, if a primitive accepts a random seed." + } + }, + "required": [ + "primitive" + ], + "additionalProperties": true + }, + "naive_bayes": { + "type": "object", + "description": "The results of training a naive bayes classifier on the data using default hyper-parameters.", + "properties": { + "error_rate": { + "type": "number", + "description": "The error rate resulting from training a naive bayes classifier on the data." + }, + "kappa": { + "type": "number", + "description": "The kappa resulting from training a naive bayes classifier on the data." + }, + "auc": { + "type": "number", + "description": "The auc resulting from training a naive bayes classifier on the data." + }, + "primitive": { + "allOf": [{"$ref": "#/definitions/primitive_reference"}], + "description": "A primitive used to compute these metafeatures." + }, + "random_seed": { + "type": "integer", + "description": "Random seed used, if a primitive accepts a random seed." + } + }, + "required": [ + "primitive" + ], + "additionalProperties": true + }, + "linear_discriminant_analysis": { + "type": "object", + "description": "The results of doing linear discriminant analysis classification on the data using default hyper-parameters.", + "properties": { + "error_rate": { + "type": "number", + "description": "The error rate resulting from doing linear discriminant analysis classification on the data." + }, + "kappa": { + "type": "number", + "description": "The kappa resulting from doing linear discriminant analysis classification on the data." + }, + "auc": { + "type": "number", + "description": "The auc resulting from doing linear discriminant analysis classification on the data." + }, + "primitive": { + "allOf": [{"$ref": "#/definitions/primitive_reference"}], + "description": "A primitive used to compute these metafeatures." + }, + "random_seed": { + "type": "integer", + "description": "Random seed used, if a primitive accepts a random seed." + } + }, + "required": [ + "primitive" + ], + "additionalProperties": true + }, + "knn_1_neighbor": { + "type": "object", + "description": "The results of training a knn classifier on the data with k=1 and other hyper-parameters set to defaults.", + "properties": { + "error_rate": { + "type": "number", + "description": "The error rate resulting from training a knn classifier on the data with k=1." + }, + "kappa": { + "type": "number", + "description": "The kappa resulting from training a knn classifier on the data with k=1." 
+ }, + "auc": { + "type": "number", + "description": "The auc resulting from training a knn classifier on the data with k=1." + }, + "primitive": { + "allOf": [{"$ref": "#/definitions/primitive_reference"}], + "description": "A primitive used to compute these metafeatures." + }, + "random_seed": { + "type": "integer", + "description": "Random seed used, if a primitive accepts a random seed." + } + }, + "required": [ + "primitive" + ], + "additionalProperties": true + }, + "c45_decision_tree": { + "type": "object", + "description": "The results of training a C4.5 decision tree (or equivalent implementation) on the data using default hyper-parameters.", + "properties": { + "error_rate": { + "type": "number", + "description": "The error rate resulting from training a C4.5 decision tree on the data." + }, + "kappa": { + "type": "number", + "description": "The kappa resulting from training a C4.5 decision tree on the data." + }, + "auc": { + "type": "number", + "description": "The auc resulting from training a C4.5 decision tree on the data." + }, + "primitive": { + "allOf": [{"$ref": "#/definitions/primitive_reference"}], + "description": "A primitive used to compute these metafeatures." + }, + "random_seed": { + "type": "integer", + "description": "Random seed used, if a primitive accepts a random seed." + } + }, + "required": [ + "primitive" + ], + "additionalProperties": true + }, + "rep_tree": { + "type": "object", + "description": "The results of training a decision tree using reduced-error pruning (implementation equivalent to Weka's REPTree) on the data using default hyper-parameters.", + "properties": { + "depth_1_error_rate": { + "type": "number", + "description": "The error rate resulting from training a decision tree using reduced-error pruning on the data with tree depth 1." + }, + "depth_1_kappa": { + "type": "number", + "description": "The kappa resulting from training a decision tree using reduced-error pruning on the data with tree depth 1." + }, + "depth_1_auc": { + "type": "number", + "description": "The auc resulting from training a decision tree using reduced-error pruning on the data with tree depth 1." + }, + "depth_2_error_rate": { + "type": "number", + "description": "The error rate resulting from training a decision tree using reduced-error pruning on the data with tree depth 2." + }, + "depth_2_kappa": { + "type": "number", + "description": "The kappa resulting from training a decision tree using reduced-error pruning on the data with tree depth 2." + }, + "depth_2_auc": { + "type": "number", + "description": "The auc resulting from training a decision tree using reduced-error pruning on the data with tree depth 2." + }, + "depth_3_error_rate": { + "type": "number", + "description": "The error rate resulting from training a decision tree using reduced-error pruning on the data with tree depth 3." + }, + "depth_3_kappa": { + "type": "number", + "description": "The kappa resulting from training a decision tree using reduced-error pruning on the data with tree depth 3." + }, + "depth_3_auc": { + "type": "number", + "description": "The auc resulting from training a decision tree using reduced-error pruning on the data with tree depth 3." + }, + "primitive": { + "allOf": [{"$ref": "#/definitions/primitive_reference"}], + "description": "A primitive used to compute these metafeatures." + }, + "random_seed": { + "type": "integer", + "description": "Random seed used, if a primitive accepts a random seed." 
+ } + }, + "required": [ + "primitive" + ], + "additionalProperties": true + }, + "jrip": { + "type": "object", + "description": "The results of training a propositional rule learner (implementation equivalent to Weka's JRip), Repeated Incremental Pruning to Produce Error Reduction (RIPPER), which was proposed by William W. Cohen as an optimized version of IREP.", + "properties": { + "error_rate": { + "type": "number", + "description": "The error rate resulting from training a propositional rule learner." + }, + "kappa": { + "type": "number", + "description": "The kappa rate resulting from training a propositional rule learner." + }, + "auc": { + "type": "number", + "description": "The auc resulting from training a propositional rule learner." + }, + "primitive": { + "allOf": [{"$ref": "#/definitions/primitive_reference"}], + "description": "A primitive used to compute these metafeatures." + }, + "random_seed": { + "type": "integer", + "description": "Random seed used, if a primitive accepts a random seed." + } + }, + "required": [ + "primitive" + ], + "additionalProperties": true + }, + "naive_bayes_tree": { + "type": "object", + "description": "A decision tree with naive bayes classifiers at the leaves.", + "properties": { + "error_rate": { + "type": "number", + "description": "The error rate resulting from training with the naive bayes tree algorithm." + }, + "kappa": { + "type": "number", + "description": "The kappa rate resulting from training with the naive bayes tree algorithm." + }, + "auc": { + "type": "number", + "description": "The auc resulting from training with the naive bayes tree algorithm." + }, + "primitive": { + "allOf": [{"$ref": "#/definitions/primitive_reference"}], + "description": "A primitive used to compute these metafeatures." + }, + "random_seed": { + "type": "integer", + "description": "Random seed used, if a primitive accepts a random seed." + } + }, + "required": [ + "primitive" + ], + "additionalProperties": true + } + }, + "additionalProperties": true + }, + "docker_image": { + "description": "A reference to a docker image, including a name and a digest.", + "type": "object", + "properties": { + "image_name": { + "type": "string", + "description": "Docker image name including a label, and optionally prefixed with a registry." + }, + "image_digest": { + "type": "string", + "description": "Docker image digest.", + "pattern": "^sha256:[a-fA-F0-9]{64}$" + } + }, + "required": [ + "image_name", + "image_digest" + ], + "additionalProperties": true + }, + "installation": { + "type": "array", + "description": "Installation instructions for a primitive. Everything listed has to be installed, in order listed, for a primitive to work.", + "items": { + "type": "object", + "oneOf": [ + { + "properties": { + "type": { + "type": "string", + "enum": ["PIP"], + "description": "A Python package." + }, + "package": { + "type": "string", + "description": "Python package name." + }, + "version": { + "allOf": [{"$ref": "#/definitions/version"}], + "description": "Exact version string." + }, + "registry": { + "type": "string" + } + }, + "required": [ + "package", + "type", + "version" + ] + }, + { + "properties": { + "type": { + "type": "string", + "enum": ["PIP"], + "description": "A Python package. It should be installed with pip's \"--editable\" argument enabled." + }, + "package_uri": { + "type": "string", + "description": "Python package's canonical URI for installation with an exact version of the package, ideally git commit hash. 
If it is a git URI, \"#egg=package_name\" URI suffix is required."
+ }
+ },
+ "required": [
+ "package_uri",
+ "type"
+ ]
+ },
+ {
+ "allOf": [
+ {
+ "properties": {
+ "type": {
+ "type": "string",
+ "enum": ["DOCKER"],
+ "description": "A Docker image."
+ },
+ "key": {
+ "type": "string",
+ "description": "When this Docker image runs, its address should be exposed to the primitive under this key."
+ }
+ },
+ "required": [
+ "type",
+ "key"
+ ]
+ },
+ {
+ "$ref": "#/definitions/docker_image"
+ }
+ ]
+ },
+ {
+ "properties": {
+ "type": {
+ "type": "string",
+ "enum": ["UBUNTU"],
+ "description": "A system package."
+ },
+ "package": {
+ "type": "string",
+ "description": "Ubuntu package name."
+ },
+ "version": {
+ "type": "string",
+ "description": "Exact version string. While the version is required, it is not required to install exactly this version of the package with a primitive, because generally it is hard to get hold of an old version to install (old packages get removed or moved to an archive). Knowing which version the primitive's author used can help with debugging, to understand why a primitive is misbehaving."
+ }
+ },
+ "required": [
+ "package",
+ "type",
+ "version"
+ ]
+ },
+ {
+ "properties": {
+ "type": {
+ "type": "string",
+ "enum": ["FILE"],
+ "description": "A file to be downloaded and then provided as a volume to the primitive during its run. Download should be equivalent to the example: \"curl https://example.com/file > /path/to/volume_file\"."
+ },
+ "key": {
+ "type": "string",
+ "description": "A downloaded file path should be exposed to the primitive under this key."
+ },
+ "file_uri": {
+ "type": "string",
+ "description": "Where to download the file from.",
+ "format": "uri"
+ },
+ "file_digest": {
+ "type": "string",
+ "description": "A SHA256 hexadecimal digest of the file.",
+ "pattern": "^[a-fA-F0-9]{64}$"
+ }
+ },
+ "required": [
+ "key",
+ "type",
+ "file_uri",
+ "file_digest"
+ ]
+ },
+ {
+ "properties": {
+ "type": {
+ "type": "string",
+ "enum": ["TGZ"],
+ "description": "A gzipped tar file to be downloaded, extracted to a directory, which is then provided as a volume to the primitive during its run. Extraction should be equivalent to the example: \"curl https://example.com/file.tgz | tar -xz -C /path/to/volume_dir\"."
+ },
+ "key": {
+ "type": "string",
+ "description": "An extracted directory path should be exposed to the primitive under this key."
+ }, + "file_uri": { + "type": "string", + "description": "Where to download the file from.", + "format": "uri" + }, + "file_digest": { + "type": "string", + "description": "A SHA256 hexadecimal digest of the file.", + "pattern": "^[a-fA-F0-9]{64}$" + } + }, + "required": [ + "key", + "type", + "file_uri", + "file_digest" + ] + } + ], + "additionalProperties": true + }, + "minItems": 1 + }, + "primitive_code": { + "type": "object", + "description": "Metadata describing the primitive's code.", + "properties": { + "class_type_arguments": { + "type": "object", + "description": "A map between type variables in primitive interfaces and their specified types for this primitive.", + "additionalProperties": { + "$ref": "#/definitions/structural_type" + } + }, + "interfaces_version": { + "description": "Version of d3m package in use by the primitive.", + "allOf": [{"$ref": "#/definitions/version"}] + }, + "interfaces": { + "type": "array", + "description": "A list of Python primitive interface classes used by the primitive in method resolution order.", + "items": { + "type": "string" + }, + "minItems": 1 + }, + "params": { + "type": "object", + "description": "A map between primitive's parameter names and their types.", + "additionalProperties": { + "$ref": "#/definitions/structural_type" + } + }, + "hyperparams": { + "$ref": "#/definitions/hyperparams_configuration" + }, + "arguments": { + "type": "object", + "description": "A map describing all arguments which the primitive as a whole accepts, mapping the name of the argument to its description.", + "additionalProperties": { + "type": "object", + "properties": { + "type": { + "$ref": "#/definitions/structural_type" + }, + "kind": { + "type": "string", + "oneOf": [ + {"enum": ["RUNTIME"], "description": "Arguments which are meaningful only for a runtime executing a pipeline."}, + {"enum": ["PIPELINE"], "description": "Arguments which can be fulfilled by other primitives in a pipeline."}, + {"enum": ["HYPERPARAMETER"], "description": "Arguments which are overriding a hyper-parameter value for a method call."} + ] + }, + "default": { + "allOf": [{"$ref": "#/definitions/python_value"}], + "description": "A default value. Omitted if an argument has no default value." + } + }, + "required": [ + "type", + "kind" + ], + "additionalProperties": true + } + }, + "class_methods": { + "type": "object", + "description": "A map between primitive's class method names and their descriptions.", + "additionalProperties": { + "type": "object", + "properties": { + "description": { + "$ref": "#/definitions/description" + }, + "arguments": { + "type": "object", + "additionalProperties": { + "type": "object", + "properties": { + "type": { + "$ref": "#/definitions/structural_type" + }, + "default": { + "allOf": [{"$ref": "#/definitions/python_value"}], + "description": "A default value. Omitted if an argument has no default value." 
+ } + }, + "additionalProperties": true, + "required": [ + "type" + ] + } + }, + "returns": { + "$ref": "#/definitions/structural_type" + } + }, + "required": [ + "returns" + ], + "additionalProperties": true + } + }, + "instance_methods": { + "type": "object", + "description": "A map between primitive's instance method names and their descriptions.", + "additionalProperties": { + "type": "object", + "properties": { + "kind": { + "type": "string", + "oneOf": [ + {"enum": ["PRODUCE"], "description": "Methods which outputs can be inputs to another primitive."}, + {"enum": ["OTHER"], "description": "Methods used by the runtime."} + ] + }, + "description": { + "$ref": "#/definitions/description" + }, + "arguments": { + "type": "array", + "description": "A list of argument names this method accepts. Their description can be found in primitive's \"arguments\" map.", + "items": { + "type": "string" + } + }, + "returns": { + "$ref": "#/definitions/structural_type" + }, + "singleton": { + "type": "boolean", + "description": "Is a produce method a singleton produce method?" + }, + "inputs_across_samples": { + "type": "array", + "description": "List of inputs a produce method uses across samples and not sample by sample.", + "items": { + "type": "string" + } + } + }, + "required": [ + "kind", + "arguments", + "returns" + ], + "additionalProperties": true + } + }, + "class_attributes": { + "type": "object", + "description": "A map between primitive's class attribute names and their types.", + "additionalProperties": { + "$ref": "#/definitions/structural_type" + } + }, + "instance_attributes": { + "type": "object", + "description": "A map between primitive's instance attribute names and their types.", + "additionalProperties": { + "$ref": "#/definitions/structural_type" + } + } + }, + "required": [ + "class_type_arguments", + "interfaces_version", + "interfaces" + ], + "additionalProperties": true + }, + "hyperparams_configuration": { + "type": "object", + "description": "A map describing the hyper-parameter configuration of the primitive, mapping the name of the hyper-parameter to its description.", + "additionalProperties": { + "$ref": "#/definitions/hyperparameter" + } + }, + "hyperparameter": { + "type": "object", + "description": "Description of a hyper-parameter.", + "properties": { + "type": { + "allOf": [{"$ref": "#/definitions/python_type"}], + "description": "A Python type of the hyper-parameter description itself." + }, + "default": { + "allOf": [{"$ref": "#/definitions/python_value"}], + "description": "A default value." 
+ }, + "structural_type": { + "$ref": "#/definitions/structural_type" + }, + "semantic_types": { + "$ref": "#/definitions/semantic_types" + }, + "description": { + "$ref": "#/definitions/description" + }, + "lower": { + "$ref": "#/definitions/python_value" + }, + "upper": { + "$ref": "#/definitions/python_value" + }, + "upper_inclusive": { + "type": "boolean" + }, + "q": { + "type": "number" + }, + "mu": { + "type": "number" + }, + "sigma": { + "type": "number" + }, + "values": { + "type": "array", + "items": { + "$ref": "#/definitions/python_value" + } + }, + "configuration": { + "type": "object", + "additionalProperties": { + "$ref": "#/definitions/hyperparameter" + } + }, + "primitive_families": { + "type": "array", + "items": { + "type": "string" + } + }, + "algorithm_types": { + "type": "array", + "items": { + "type": "string" + } + }, + "choices": { + "type": "object", + "additionalProperties": { + "$ref": "#/definitions/hyperparams_configuration" + } + }, + "elements": { + "anyOf": [ + { + "$ref": "#/definitions/hyperparameter" + }, + { + "type": "object", + "additionalProperties": { + "$ref": "#/definitions/hyperparameter" + } + } + ] + }, + "is_configuration": { + "type": "boolean" + }, + "min_size": { + "type": "integer" + }, + "max_size": { + "type": "integer" + } + }, + "required": [ + "type", + "default", + "structural_type", + "semantic_types" + ], + "additionalProperties": true + }, + "structural_type": { + "$ref": "#/definitions/python_type" + }, + "media_types": { + "type": "array", + "description": "Media type of the value in its extended form defining encoding, e.g., \"text/plain; charset=utf-8\".", + "items": { + "type": "string" + }, + "minItems": 1 + }, + "sampling_rate": { + "type": "number", + "description": "Sampling rate (frequency) is the number of samples per second." + }, + "time_granularity": { + "type": "object", + "properties": { + "value": { + "type": "number" + }, + "unit": { + "enum": [ + "SECONDS", + "MINUTES", + "DAYS", + "WEEKS", + "MONTHS", + "YEARS", + "UNSPECIFIED" + ] + } + }, + "required": [ + "value", + "unit" + ], + "additionalProperties": true + }, + "stored_size": { + "type": "integer", + "description": "Size in bytes when or if stored to disk." + }, + "approximate_stored_size": { + "type": "integer", + "description": "Approximate size in bytes when or if stored to disk." + }, + "semantic_types": { + "type": "array", + "description": "A list of canonical URIs defining semantic types. 
Some commonly used URIs are listed as possible values here, but you can use any URI representing a semantic type.", + "items": { + "anyOf": [ + {"enum": ["http://schema.org/ImageObject"], "description": "Value is an image."}, + {"enum": ["http://schema.org/VideoObject"], "description": "Value is a video."}, + {"enum": ["http://schema.org/AudioObject"], "description": "Value is an audio clip."}, + {"enum": ["http://schema.org/Text"], "description": "Value is text/string."}, + {"enum": ["https://metadata.datadrivendiscovery.org/types/Speech"], "description": "Value is an audio clip of human speech."}, + {"enum": ["https://metadata.datadrivendiscovery.org/types/Graph"], "description": "Value is a graph structure or a node list of a graph structure."}, + {"enum": ["https://metadata.datadrivendiscovery.org/types/EdgeList"], "description": "Value is an edge list of a graph structure."}, + {"enum": ["https://metadata.datadrivendiscovery.org/types/Table"], "description": "Value is tabular data."}, + {"enum": ["https://metadata.datadrivendiscovery.org/types/Timeseries"], "description": "Value is time-series data."}, + {"enum": ["https://metadata.datadrivendiscovery.org/types/UnspecifiedStructure"], "description": "Value has unspecified structure."}, + {"enum": ["http://schema.org/Boolean"], "description": "Value represents a boolean."}, + {"enum": ["http://schema.org/Integer"], "description": "Value represents an integer."}, + {"enum": ["http://schema.org/Float"], "description": "Value represents a float."}, + {"enum": ["http://schema.org/DateTime"], "description": "Value represents a timestamp."}, + {"enum": ["https://metadata.datadrivendiscovery.org/types/FloatVector"], "description": "Value represents a vector of floats.", "parents": ["http://schema.org/DataType"]}, + {"enum": ["https://metadata.datadrivendiscovery.org/types/JSON"], "description": "Value represents a JSON object.", "parents": ["http://schema.org/DataType"]}, + {"enum": ["https://metadata.datadrivendiscovery.org/types/GeoJSON"], "description": "Value represents a GeoJSON object.", "parents": ["https://metadata.datadrivendiscovery.org/types/JSON"]}, + {"enum": ["https://metadata.datadrivendiscovery.org/types/CategoricalData"], "description": "Value represents categorical data."}, + {"enum": ["https://metadata.datadrivendiscovery.org/types/OrdinalData"], "description": "Value represents ordinal data."}, + {"enum": ["https://metadata.datadrivendiscovery.org/types/ColumnRole"], "description": "A column can have a role in a table."}, + {"enum": ["https://metadata.datadrivendiscovery.org/types/PrimaryKey"], "description": "Value serves as a primary key.", "parents": ["https://metadata.datadrivendiscovery.org/types/ColumnRole"]}, + {"enum": ["https://metadata.datadrivendiscovery.org/types/PrimaryMultiKey"], "description": "Value serves as a primary key without uniqueness constraint to allow the same row to be repeated multiple times.", "parents": ["https://metadata.datadrivendiscovery.org/types/ColumnRole"]}, + {"enum": ["https://metadata.datadrivendiscovery.org/types/UniqueKey"], "description": "Value serves as an unique key, i.e., it satisfies the uniqueness constraint among other values.", "parents": ["https://metadata.datadrivendiscovery.org/types/ColumnRole"]}, + {"enum": ["https://metadata.datadrivendiscovery.org/types/SuggestedGroupingKey"], "description": "Value serves as a potential grouping key to group rows (samples) together. 
Used in time-series datasets containing multiple time-series to hint how to identify individual time-series. If there are multiple columns with this semantic type, the relation between them is unspecified; they can be used individually or in combination.", "parents": ["https://metadata.datadrivendiscovery.org/types/ColumnRole"]},
+ {"enum": ["https://metadata.datadrivendiscovery.org/types/GroupingKey"], "description": "Value serves as an active grouping key to group rows (samples) together. Used in time-series datasets containing multiple time-series to identify individual time-series. Each column with this semantic type should be used individually, and if multiple columns with this semantic type exist, each column represents a different grouping.", "parents": ["https://metadata.datadrivendiscovery.org/types/ColumnRole"]},
+ {"enum": ["https://metadata.datadrivendiscovery.org/types/Attribute"], "description": "Value serves as an attribute (input feature) to fit on or be used for analysis.", "parents": ["https://metadata.datadrivendiscovery.org/types/ColumnRole"]},
+ {"enum": ["https://metadata.datadrivendiscovery.org/types/ConstructedAttribute"], "description": "Value serves as a constructed attribute (input feature). This is set by primitives when constructing attributes. It should not be used for fitting.", "parents": ["https://metadata.datadrivendiscovery.org/types/ColumnRole"]},
+ {"enum": ["https://metadata.datadrivendiscovery.org/types/SuggestedTarget"], "description": "Value serves as a potential target variable for a problem. This is a property of input data.", "parents": ["https://metadata.datadrivendiscovery.org/types/ColumnRole"]},
+ {"enum": ["https://metadata.datadrivendiscovery.org/types/RedactedTarget"], "description": "Value is redacted, but would otherwise be a target variable for a problem. This is a property of input data.", "parents": ["https://metadata.datadrivendiscovery.org/types/ColumnRole"]},
+ {"enum": ["https://metadata.datadrivendiscovery.org/types/Target"], "description": "Value serves as a target variable for a problem.", "parents": ["https://metadata.datadrivendiscovery.org/types/ColumnRole"]},
+ {"enum": ["https://metadata.datadrivendiscovery.org/types/PredictedTarget"], "description": "Value serves as a predicted target variable for a problem. This is set by primitives when predicting targets.", "parents": ["https://metadata.datadrivendiscovery.org/types/Target"]},
+ {"enum": ["https://metadata.datadrivendiscovery.org/types/TrueTarget"], "description": "Value serves as a true target variable for a problem. This is set by a runtime based on problem description.", "parents": ["https://metadata.datadrivendiscovery.org/types/Target"]},
+ {"enum": ["https://metadata.datadrivendiscovery.org/types/Score"], "description": "Value is a prediction score computed by comparing predicted and true target.", "parents": ["https://metadata.datadrivendiscovery.org/types/ColumnRole"]},
+ {"enum": ["https://metadata.datadrivendiscovery.org/types/Confidence"], "description": "Value serves as a confidence of a predicted target variable. \"confidence_for\" metadata can be used to reference for which target column(s) this column is the confidence.", "parents": ["https://metadata.datadrivendiscovery.org/types/ColumnRole"]},
+ {"enum": ["https://metadata.datadrivendiscovery.org/types/Rank"], "description": "Value serves as a rank of a predicted target variable. 
\"rank_for\" metadata can be used to reference for which target column(s) this column is rank for.", "parents": ["https://metadata.datadrivendiscovery.org/types/ColumnRole"]}, + {"enum": ["https://metadata.datadrivendiscovery.org/types/SuggestedPrivilegedData"], "description": "Value serves as a potential privileged (available during fitting but not producing) attribute.", "parents": ["https://metadata.datadrivendiscovery.org/types/ColumnRole"]}, + {"enum": ["https://metadata.datadrivendiscovery.org/types/RedactedPrivilegedData"], "description": "Value is redacted, but would otherwise be a privileged attribute.", "parents": ["https://metadata.datadrivendiscovery.org/types/ColumnRole"]}, + {"enum": ["https://metadata.datadrivendiscovery.org/types/PrivilegedData"], "description": "Value serves as a privileged (available during fitting but not producing) attribute.", "parents": ["https://metadata.datadrivendiscovery.org/types/ColumnRole"]}, + {"enum": ["https://metadata.datadrivendiscovery.org/types/EdgeSource"], "description": "Value serves as a source of a graph edge.", "parents": ["https://metadata.datadrivendiscovery.org/types/ColumnRole"]}, + {"enum": ["https://metadata.datadrivendiscovery.org/types/DirectedEdgeSource"], "description": "Value serves as a source of a directed graph edge.", "parents": ["https://metadata.datadrivendiscovery.org/types/EdgeSource"]}, + {"enum": ["https://metadata.datadrivendiscovery.org/types/UndirectedEdgeSource"], "description": "Value serves as a source of a undirected graph edge.", "parents": ["https://metadata.datadrivendiscovery.org/types/EdgeSource"]}, + {"enum": ["https://metadata.datadrivendiscovery.org/types/SimpleEdgeSource"], "description": "Value serves as a source of a simple graph edge.", "parents": ["https://metadata.datadrivendiscovery.org/types/EdgeSource"]}, + {"enum": ["https://metadata.datadrivendiscovery.org/types/MultiEdgeSource"], "description": "Value serves as a source of a multigraph edge.", "parents": ["https://metadata.datadrivendiscovery.org/types/EdgeSource"]}, + {"enum": ["https://metadata.datadrivendiscovery.org/types/EdgeTarget"], "description": "Value serves as a target of a graph edge.", "parents": ["https://metadata.datadrivendiscovery.org/types/ColumnRole"]}, + {"enum": ["https://metadata.datadrivendiscovery.org/types/DirectedEdgeTarget"], "description": "Value serves as a target of a directed graph edge.", "parents": ["https://metadata.datadrivendiscovery.org/types/EdgeTarget"]}, + {"enum": ["https://metadata.datadrivendiscovery.org/types/UndirectedEdgeTarget"], "description": "Value serves as a target of a undirected graph edge.", "parents": ["https://metadata.datadrivendiscovery.org/types/EdgeTarget"]}, + {"enum": ["https://metadata.datadrivendiscovery.org/types/SimpleEdgeTarget"], "description": "Value serves as a target of a simple graph edge.", "parents": ["https://metadata.datadrivendiscovery.org/types/EdgeTarget"]}, + {"enum": ["https://metadata.datadrivendiscovery.org/types/MultiEdgeTarget"], "description": "Value serves as a target of a multigraph edge.", "parents": ["https://metadata.datadrivendiscovery.org/types/EdgeTarget"]}, + {"enum": ["https://metadata.datadrivendiscovery.org/types/Time"], "description": "Value represents time.", "parents": ["https://metadata.datadrivendiscovery.org/types/ColumnRole"]}, + {"enum": ["https://metadata.datadrivendiscovery.org/types/Location"], "description": "Value represents a location.", "parents": ["https://metadata.datadrivendiscovery.org/types/ColumnRole"]}, + {"enum": 
["https://metadata.datadrivendiscovery.org/types/Boundary"], "description": "Value represents a boundary.", "parents": ["https://metadata.datadrivendiscovery.org/types/ColumnRole"]}, + {"enum": ["https://metadata.datadrivendiscovery.org/types/Interval"], "description": "Value represents an interval as a pair of start and end.", "parents": ["https://metadata.datadrivendiscovery.org/types/Boundary"]}, + {"enum": ["https://metadata.datadrivendiscovery.org/types/IntervalStart"], "description": "Value represents a start of an interval.", "parents": ["https://metadata.datadrivendiscovery.org/types/Boundary"]}, + {"enum": ["https://metadata.datadrivendiscovery.org/types/IntervalEnd"], "description": "Value represents an end of an interval.", "parents": ["https://metadata.datadrivendiscovery.org/types/Boundary"]}, + {"enum": ["https://metadata.datadrivendiscovery.org/types/BoundingPolygon"], "description": "Value represents a bounding polygon as a series of (X, Y) coordinate pairs of vertices in counter-clockwise order.", "parents": ["https://metadata.datadrivendiscovery.org/types/Boundary"]}, + {"enum": ["https://metadata.datadrivendiscovery.org/types/InstanceWeight"], "description": "Value serves as a weight for an instance.", "parents": ["https://metadata.datadrivendiscovery.org/types/ColumnRole"]}, + {"enum": ["https://metadata.datadrivendiscovery.org/types/UnknownType"], "description": "It is not known what the value represents."}, + {"enum": ["https://metadata.datadrivendiscovery.org/types/FileName"], "description": "Value is a filename."}, + {"enum": ["https://metadata.datadrivendiscovery.org/types/DimensionType"], "description": "Value represents a dimension."}, + {"enum": ["https://metadata.datadrivendiscovery.org/types/DatasetResource"], "description": "Value is a dataset resource.", "parents": ["https://metadata.datadrivendiscovery.org/types/DimensionType"]}, + {"enum": ["https://metadata.datadrivendiscovery.org/types/TabularRow"], "description": "Value is a row in tabular data.", "parents": ["https://metadata.datadrivendiscovery.org/types/DimensionType"]}, + {"enum": ["https://metadata.datadrivendiscovery.org/types/TabularColumn"], "description": "Value is a column in tabular data.", "parents": ["https://metadata.datadrivendiscovery.org/types/DimensionType"]}, + {"enum": ["https://metadata.datadrivendiscovery.org/types/MissingData"], "description": "Value is missing."}, + {"enum": ["https://metadata.datadrivendiscovery.org/types/InvalidData"], "description": "Value is present, but is invalid."}, + {"enum": ["https://metadata.datadrivendiscovery.org/types/HyperParameter"], "description": "Value is a hyper-parameter."}, + {"enum": ["https://metadata.datadrivendiscovery.org/types/TuningParameter"], "description": "Hyper-parameter is a tuning parameter of the primitive.", "parents": ["https://metadata.datadrivendiscovery.org/types/HyperParameter"]}, + {"enum": ["https://metadata.datadrivendiscovery.org/types/ControlParameter"], "description": "Hyper-parameter is a control parameter of the primitive.", "parents": ["https://metadata.datadrivendiscovery.org/types/HyperParameter"]}, + {"enum": ["https://metadata.datadrivendiscovery.org/types/ResourcesUseParameter"], "description": "Hyper-parameter is a parameter which controls the use of resources by the primitive.", "parents": ["https://metadata.datadrivendiscovery.org/types/HyperParameter"]}, + {"enum": ["https://metadata.datadrivendiscovery.org/types/CPUResourcesUseParameter"], "description": "Hyper-parameter is a parameter which controls 
the use of CPU resources (cores) by the primitive.", "parents": ["https://metadata.datadrivendiscovery.org/types/ResourcesUseParameter"]},
+ {"enum": ["https://metadata.datadrivendiscovery.org/types/MetafeatureParameter"], "description": "Hyper-parameter controls which meta-feature is computed by the primitive.", "parents": ["https://metadata.datadrivendiscovery.org/types/HyperParameter"]},
+ {"enum": ["https://metadata.datadrivendiscovery.org/types/ChoiceParameter"], "description": "Hyper-parameter selects one choice among multiple hyper-parameter space choices.", "parents": ["https://metadata.datadrivendiscovery.org/types/HyperParameter"]},
+ {"enum": ["https://metadata.datadrivendiscovery.org/types/DatasetEntryPoint"], "description": "Resource is a dataset entry point."},
+ {"enum": ["https://metadata.datadrivendiscovery.org/types/FilesCollection"], "description": "Resource is a files collection."},
+ {"enum": ["https://metadata.datadrivendiscovery.org/types/TokenizableIntoNumericAndAlphaTokens"], "description": "Value can be tokenized into pure numeric tokens (satisfies \"isdigit\") and pure alphabetic tokens (satisfies \"isalpha\"). E.g., value \"123abc456\" can be tokenized into (\"123\", \"abc\", \"456\")."},
+ {"enum": ["https://metadata.datadrivendiscovery.org/types/TokenizableByPunctuation"], "description": "Value can be tokenized by splitting on punctuation. E.g., value \"ab_cd;12\" can be tokenized into (\"ab\", \"cd\", \"12\")."},
+ {"enum": ["https://metadata.datadrivendiscovery.org/types/AmericanPhoneNumber"], "description": "Value can be recognized as an American style phone number, e.g., \"(310)822-1511\" and \"1-310-822-1511\"."},
+ {"enum": ["http://schema.org/email"], "description": "Value is an email address."},
+ {"enum": ["http://schema.org/URL"], "description": "Value represents a URL."},
+ {"enum": ["http://schema.org/address"], "description": "Value is an address, broadly defined."},
+ {"enum": ["http://schema.org/State"], "description": "Value is a state, could be US or foreign."},
+ {"enum": ["http://schema.org/City"], "description": "Value is a city, could be US or foreign."},
+ {"enum": ["http://schema.org/Country"], "description": "Value is a country."},
+ {"enum": ["http://schema.org/addressCountry"], "description": "Value is a country code."},
+ {"enum": ["http://schema.org/postalCode"], "description": "Value is a US postal code."},
+ {"enum": ["http://schema.org/latitude"], "description": "Value represents a latitude."},
+ {"enum": ["http://schema.org/longitude"], "description": "Value represents a longitude."},
+ {
+ "type": "string",
+ "description": "A URI not listed among commonly used URIs. Please feel encouraged to open a merge request adding semantic types you are using so that others can also learn about them.",
+ "format": "uri"
+ }
+ ]
+ }
+ },
+ "location_uris": {
+ "type": "array",
+ "description": "A list of URIs where the value is stored.",
+ "items": {
+ "type": "string",
+ "format": "uri"
+ }
+ },
+ "location_base_uris": {
+ "type": "array",
+ "description": "A list of URIs which can be used as a base to determine where the value is stored.",
+ "items": {
+ "type": "string",
+ "format": "uri"
+ }
+ },
+ "source": {
+ "type": "object",
+ "description": "Information about the source. 
Author and other information how the value came to be.", + "properties": { + "name": { + "$ref": "#/definitions/name" + }, + "contact": { + "type": "string", + "description": "An URI to contact the source.", + "format": "uri" + }, + "uris": { + "type": "array", + "description": "A list of URIs where the value is coming from, e.g., website with a dataset, or source code for a primitive.", + "items": { + "type": "string", + "format": "uri" + } + }, + "published": { + "allOf": [{"$ref": "#/definitions/timestamp"}], + "description": "A timestamp when was the value made available." + }, + "license": { + "type": "string", + "description": "License under which the value is available." + }, + "citation": { + "type": "string", + "description": "Citation of the source." + }, + "human_subjects_research": { + "type": "boolean", + "description": "Does value contain human subjects data or not." + }, + "redacted": { + "type": "boolean", + "description": "Has the value been redacted." + }, + "from": { + "type": "object", + "oneOf": [ + { + "properties": { + "type": { + "type": "string", + "enum": ["REDACTED"], + "description": "The value has been redacted from the referenced value." + }, + "dataset": { + "$ref": "#/definitions/dataset_reference" + } + }, + "required": [ + "dataset", + "type" + ] + }, + { + "properties": { + "type": { + "type": "string", + "enum": ["REDACTED"], + "description": "The value has been redacted from the referenced value." + }, + "problem": { + "$ref": "#/definitions/problem_reference" + } + }, + "required": [ + "problem", + "type" + ] + }, + { + "properties": { + "type": { + "type": "string", + "enum": ["PIPELINE"], + "description": "The pipeline has been derived from another pipeline or pipelines." + }, + "pipelines": { + "type": "array", + "description": "A list of pipelines used to derive the pipeline.", + "items": { + "$ref": "#/definitions/pipeline_reference" + }, + "minItems": 1 + } + }, + "required": [ + "pipelines", + "type" + ] + } + ], + "additionalProperties": true + } + }, + "additionalProperties": true + }, + "keywords": { + "type": "array", + "description": "A list of keywords. Strings in an unspecified language and vocabulary.", + "items": { + "type": "string" + } + }, + "foreign_key": { + "type": "object", + "description": "Columns in a table in a dataset resource can reference other resources.", + "oneOf": [ + { + "properties": { + "type": { + "type": "string", + "enum": ["COLUMN"], + "description": "The foreign key is referencing a column in a table in a dataset resource." + }, + "resource_id": { + "$ref": "#/definitions/resource_id" + }, + "column_index": { + "$ref": "#/definitions/column_index" + } + }, + "required": [ + "type", + "resource_id", + "column_index" + ] + }, + { + "properties": { + "type": { + "type": "string", + "enum": ["COLUMN"], + "description": "The foreign key is referencing a column in a table in a dataset resource." + }, + "resource_id": { + "$ref": "#/definitions/resource_id" + }, + "column_name": { + "$ref": "#/definitions/column_name" + } + }, + "required": [ + "type", + "resource_id", + "column_name" + ] + }, + { + "properties": { + "type": { + "type": "string", + "enum": ["NODE_ATTRIBUTE"], + "description": "The foreign key is referencing a node attribute in a dataset resource, a graph." 
+ }, + "resource_id": { + "$ref": "#/definitions/resource_id" + }, + "node_attribute": { + "$ref": "#/definitions/column_name" + } + }, + "required": [ + "type", + "resource_id", + "node_attribute" + ] + }, + { + "properties": { + "type": { + "type": "string", + "enum": ["EDGE_ATTRIBUTE"], + "description": "The foreign key is referencing an edge attribute in a dataset resource, a graph." + }, + "resource_id": { + "$ref": "#/definitions/resource_id" + }, + "edge_attribute": { + "$ref": "#/definitions/column_name" + } + }, + "required": [ + "type", + "resource_id", + "edge_attribute" + ] + }, + { + "properties": { + "type": { + "type": "string", + "enum": ["RESOURCE"], + "description": "The foreign key is referencing another dataset resource. The value is resource ID." + } + }, + "required": [ + "type" + ] + } + ] + }, + "boundary_for": { + "type": "object", + "description": "A column in a table can be a boundary for another column in the same table or a table in another dataset resource.", + "oneOf": [ + { + "properties": { + "resource_id": { + "$ref": "#/definitions/resource_id" + }, + "column_index": { + "$ref": "#/definitions/column_index" + } + }, + "required": [ + "column_index" + ] + }, + { + "properties": { + "resource_id": { + "$ref": "#/definitions/resource_id" + }, + "column_name": { + "$ref": "#/definitions/column_name" + } + }, + "required": [ + "column_name" + ] + } + ] + }, + "confidence_for": { + "type": "object", + "description": "A column in a table can be a confidence for other columns in the same table or a table in another dataset resource.", + "oneOf": [ + { + "properties": { + "resource_id": { + "$ref": "#/definitions/resource_id" + }, + "column_indices": { + "type": "array", + "items": { + "$ref": "#/definitions/column_index" + }, + "minItems": 1 + } + }, + "required": [ + "column_indices" + ] + }, + { + "properties": { + "resource_id": { + "$ref": "#/definitions/resource_id" + }, + "column_names": { + "type": "array", + "items": { + "$ref": "#/definitions/column_name" + }, + "minItems": 1 + } + }, + "required": [ + "column_names" + ] + } + ] + }, + "rank_for": { + "type": "object", + "description": "A column in a table can be a rank for other columns in the same table or a table in another dataset resource.", + "oneOf": [ + { + "properties": { + "resource_id": { + "$ref": "#/definitions/resource_id" + }, + "column_indices": { + "type": "array", + "items": { + "$ref": "#/definitions/column_index" + }, + "minItems": 1 + } + }, + "required": [ + "column_indices" + ] + }, + { + "properties": { + "resource_id": { + "$ref": "#/definitions/resource_id" + }, + "column_names": { + "type": "array", + "items": { + "$ref": "#/definitions/column_name" + }, + "minItems": 1 + } + }, + "required": [ + "column_names" + ] + } + ] + }, + "algorithm_types": { + "type": "array", + "description": "Algorithm type describes the underlying implementation of the primitive. 
It uses controlled, standardized, but open vocabulary which means that if types which would best describe your primitive are missing, please feel encouraged to open a merge request adding them.", + "items": { + "oneOf": [ + {"enum": ["RULE_BASED_FILTER"]}, + + {"enum": ["DUPLICATION_VALIDATION"]}, + {"enum": ["CONTINUITY_VALIDATION"]}, + {"enum": ["HP_FILTER"], "description": "https://en.wikipedia.org/wiki/Hodrick–Prescott_filter"}, + {"enum": ["BK_FILTER"]}, + {"enum": ["TEMPORAL_REGULARIZED_MATRIX_FACTORIZATION"]}, + {"enum": ["ANGLE_BASE_OUTLIER_DETECTION"]}, + {"enum": ["HISTOGRAM_BASED_OUTLIER_DETECTION"]}, + {"enum": ["ISOLATION_FOREST"]}, + {"enum": ["SUBSPACE_OUTLIER_DETECTION"]}, + {"enum": ["AUTOCORRELATION"], "description": "https://en.wikipedia.org/wiki/Autocorrelation"}, + {"enum": ["CATEGORICAL_TO_BINARY"]}, + {"enum": ["DISCRETE_COSINE_TRANSFORM"], "description": "https://en.wikipedia.org/wiki/Discrete_cosine_transform"}, + {"enum": ["FAST_FOURIER_TRANSFORM"], "description": "https://en.wikipedia.org/wiki/Fast_Fourier_transform"}, + {"enum": ["HOLT_SMOOTHING"], "description": "https://medium.com/datadriveninvestor/how-to-build-exponential-smoothing-models-using-python-simple-exponential-smoothing-holt-and-da371189e1a1"}, + {"enum": ["HOLT_WINTERS_EXPONENTIAL_SMOOTHING"], "description": "https://medium.com/datadriveninvestor/how-to-build-exponential-smoothing-models-using-python-simple-exponential-smoothing-holt-and-da371189e1a1"}, + {"enum": ["MATRIX_PROFILE"], "description": "https://en.wikipedia.org/wiki/Matrix_profile"}, + {"enum": ["MEAN_AVERAGE_TRANSFORM"], "description": "https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.rolling.html"}, + {"enum": ["MOVING_AVERAGE_TRANSFORM"], "description": "https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.rolling.html"}, + {"enum": ["NON_NEGATIVE_MATRIX_FACTORIZATION"], "description":"https://en.wikipedia.org/wiki/Non-negative_matrix_factorization"}, + {"enum": ["PYOD_COF"]}, + {"enum": ["SIMPLE_EXPONENTIAL_SMOOTHING"], "description": "https://medium.com/datadriveninvestor/how-to-build-exponential-smoothing-models-using-python-simple-exponential-smoothing-holt-and-da371189e1a1"}, + {"enum": ["SUM_CODING"]}, + {"enum": ["TIME_INTERVAL_TRANSFORM"], "description": "https://en.wikipedia.org/wiki/Time_interval_transform"}, + {"enum": ["VARIATIONAL_AUTO_ENCODER"],"description":"https://www.jeremyjordan.me/variational-autoencoders/"}, + {"enum": ["ACCURACY_SCORE"], "description": "https://en.wikipedia.org/wiki/Accuracy_and_precision"}, + {"enum": ["ADABOOST"], "description": "https://en.wikipedia.org/wiki/AdaBoost"}, + {"enum": ["ADAPTIVE_ALGORITHM"], "description": "https://en.wikipedia.org/wiki/Adaptive_algorithm"}, + {"enum": ["AGGREGATE_FUNCTION"], "description": "https://en.wikipedia.org/wiki/Aggregate_function"}, + {"enum": ["ALMEIDA_PINEDA_RECURRENT_BACKPROPAGATION"], "description": "https://en.wikipedia.org/wiki/Almeida%E2%80%93Pineda_recurrent_backpropagation"}, + {"enum": ["ALOPEX"], "description": "https://en.wikipedia.org/wiki/ALOPEX"}, + {"enum": ["ALTERNATING_DECISION_TREE"], "description": "https://en.wikipedia.org/wiki/Alternating_decision_tree"}, + {"enum": ["ANT_COLONY_OPTIMIZATION"], "description": "https://en.wikipedia.org/wiki/Ant_colony_optimization_algorithms"}, + {"enum": ["APPROXIMATE_DATA_AUGMENTATION"], "description": "Augmenting data approximately using data that has the best matching score."}, + {"enum": ["ARRAY_CONCATENATION"]}, + {"enum": ["ARRAY_SLICING"], 
"description": "https://en.wikipedia.org/wiki/Array_slicing"}, + {"enum": ["ASSOCIATION_RULE_LEARNING"], "description": "https://en.wikipedia.org/wiki/Association_rule_learning"}, + {"enum": ["ASSOCIATIVE_NEURAL_NETWORK"]}, + {"enum": ["ATTRACTOR_NETWORK"], "description": "https://en.wikipedia.org/wiki/Attractor_network"}, + {"enum": ["AUDIO_MIXING"], "description": "https://en.wikipedia.org/wiki/Audio_mixing_(recorded_music)"}, + {"enum": ["AUDIO_STREAM_MANIPULATION"], "description": "https://en.wikipedia.org/wiki/Audio_signal_processing"}, + {"enum": ["AUGMENTED_LAGRANGIAN_METHOD"], "description": "https://en.wikipedia.org/wiki/Augmented_Lagrangian_method"}, + {"enum": ["AUTOENCODER"], "description": "https://en.wikipedia.org/wiki/Autoencoder"}, + {"enum": ["AUTOREGRESSIVE_INTEGRATED_MOVING_AVERAGE"], "description": "https://en.wikipedia.org/wiki/Autoregressive_integrated_moving_average"}, + {"enum": ["BACKWARD_DIFFERENCE_CODING"], "description": "https://stats.idre.ucla.edu/r/library/r-library-contrast-coding-systems-for-categorical-variables/#backward"}, + {"enum": ["BAG_OF_WORDS_MODEL"], "description": "https://en.wikipedia.org/wiki/Bag-of-words_model"}, + {"enum": ["BATCH_NORMALIZATION"]}, + {"enum": ["BAYESIAN_LINEAR_REGRESSION"], "description": "https://en.wikipedia.org/wiki/Bayesian_linear_regression"}, + {"enum": ["BAYESIAN_MODEL_AVERAGING"], "description": "https://en.wikipedia.org/wiki/Bootstrap_aggregating"}, + {"enum": ["BAYESIAN_NETWORK"], "description": "https://en.wikipedia.org/wiki/Bayesian_network"}, + {"enum": ["BAYESIAN_OPTIMIZATION"]}, + {"enum": ["BELIEF_PROPAGATION"], "description": "https://en.wikipedia.org/wiki/Belief_propagation"}, + {"enum": ["BERT"], "description": "https://arxiv.org/abs/1810.04805"}, + {"enum": ["BINARY_CLASSIFICATION"], "description": "https://en.wikipedia.org/wiki/Binary_classification"}, + {"enum": ["BIRCH"], "description": "https://en.wikipedia.org/wiki/Bayesian_optimization"}, + {"enum": ["BOLTZMANN_MACHINE"], "description": "https://en.wikipedia.org/wiki/BIRCH"}, + {"enum": ["BOOSTING"], "description": "https://en.wikipedia.org/wiki/Boltzmann_machine"}, + {"enum": ["BOOTSTRAP_AGGREGATING"], "description": "https://en.wikipedia.org/wiki/Boosting_(machine_learning)"}, + {"enum": ["BOOTSTRAPPING"], "description": "https://en.wikipedia.org/wiki/Bootstrapping_(statistics)"}, + {"enum": ["BRANCH_AND_BOUND"], "description": "https://en.wikipedia.org/wiki/Branch_and_bound"}, + {"enum": ["BREADTH_FIRST_SEARCH"], "description": "https://en.wikipedia.org/wiki/Breadth-first_search"}, + {"enum": ["BRIER_SCORE"], "description": "https://en.wikipedia.org/wiki/Brier_score"}, + {"enum": ["BROOKS_IYENGAR"], "description": "https://en.wikipedia.org/wiki/Brooks%E2%80%93Iyengar_algorithm"}, + {"enum": ["BROWNBOOST"], "description": "https://en.wikipedia.org/wiki/BrownBoost"}, + {"enum": ["C45"], "description": "https://en.wikipedia.org/wiki/C4.5_algorithm"}, + {"enum": ["C50"]}, + {"enum": ["CANONICAL_CORRELATION_ANALYSIS"], "description": "https://en.wikipedia.org/wiki/Canonical_correlation"}, + {"enum": ["CASCADE_CORRELATION_NETWORK"]}, + {"enum": ["CASE_BASED_REASONING"], "description": "https://en.wikipedia.org/wiki/Case-based_reasoning"}, + {"enum": ["CATEGORY_ENCODER"]}, + {"enum": ["CAUSAL_ANALYSIS"], "description": "https://en.wikipedia.org/wiki/Causal_analysis"}, + {"enum": ["CLASSIFIER_CHAINS"], "description": "https://en.wikipedia.org/wiki/Classifier_chains"}, + {"enum": ["CN2"], "description": "https://en.wikipedia.org/wiki/CN2_algorithm"}, + 
{"enum": ["COBWEB"], "description": "https://en.wikipedia.org/wiki/Cobweb_(clustering)"}, + {"enum": ["COEFFICIENT_OF_DETERMINATION"], "description": "https://en.wikipedia.org/wiki/Coefficient_of_determination"}, + {"enum":["COLUMN_FILTER"], "description": "https://en.wikipedia.org/wiki/Column_filter"}, + {"enum": ["COLOR_SPACE_CONVERSION"], "description": "https://en.wikipedia.org/wiki/Color_space"}, + {"enum": ["COMMITTEE_MACHINE"], "description": "https://en.wikipedia.org/wiki/Committee_machine"}, + {"enum": ["COMPOSITIONAL_PATTERN_PRODUCING_NETWORK"], "description": "https://en.wikipedia.org/wiki/Compositional_pattern-producing_network"}, + {"enum": ["COMPUTER_ALGEBRA"], "description": "https://en.wikipedia.org/wiki/Computer_algebra"}, + {"enum": ["CONDITIONAL_RANDOM_FIELD"], "description": "https://en.wikipedia.org/wiki/Conditional_random_field"}, + {"enum": ["CONTEXTUAL_BANDIT"]}, + {"enum": ["CONVOLUTIONAL_NEURAL_NETWORK"], "description": "https://en.wikipedia.org/wiki/Convolutional_neural_network"}, + {"enum": ["CONVOLUTIONAL_NEURAL_NETWORK_LAYER"], "description": "https://en.wikipedia.org/wiki/Convolutional_neural_network#Convolutional_layer"}, + {"enum": ["COORDINATE_DESCENT"], "description": "https://en.wikipedia.org/wiki/Coordinate_descent"}, + {"enum": ["CORRELATION_CLUSTERING"], "description": "https://en.wikipedia.org/wiki/Correlation_clustering"}, + {"enum": ["CORTICAL_LEARNING"]}, + {"enum": ["COTRAINING"], "description": "https://en.wikipedia.org/wiki/Co-training"}, + {"enum": ["CROSS_ENTROPY"], "description": "https://en.wikipedia.org/wiki/Cross_entropy"}, + {"enum": ["CROSS_ENTROPY_METHOD"], "description": "https://en.wikipedia.org/wiki/Cross-entropy_method"}, + {"enum": ["CROSS_VALIDATION"], "description": "https://en.wikipedia.org/wiki/Cross-validation_(statistics)"}, + {"enum": ["CULTURAL_ALGORITHM"], "description": "https://en.wikipedia.org/wiki/Cultural_algorithm"}, + {"enum": ["DATA_CONVERSION"], "description": "https://en.wikipedia.org/wiki/Data_conversion"}, + {"enum": ["DATA_DENORMALIZATION"], "description": "https://en.wikipedia.org/wiki/Denormalization"}, + {"enum": ["DATA_MAPPING"], "description": "https://en.wikipedia.org/wiki/Data_mapping"}, + {"enum": ["DATA_NORMALIZATION"], "description": "https://en.wikipedia.org/wiki/Database_normalization"}, + {"enum": ["DATA_PROFILING"], "description": "https://en.wikipedia.org/wiki/Data_profiling"}, + {"enum": ["DATA_RETRIEVAL"], "description": "Obtaining additional data for augmentation"}, + {"enum": ["DATA_SPLITTING"], "description": "https://en.wikipedia.org/wiki/Training,_test,_and_validation_sets"}, + {"enum": ["DATA_STREAM_CLUSTERING"], "description": "https://en.wikipedia.org/wiki/Data_stream_clustering"}, + {"enum": ["DATA_STREAM_MINING"], "description": "https://en.wikipedia.org/wiki/Data_stream_mining"}, + {"enum": ["DATA_STRUCTURE_ALIGNMENT"], "description": "https://en.wikipedia.org/wiki/Data_structure_alignment"}, + {"enum": ["DBSCAN"], "description": "https://en.wikipedia.org/wiki/DBSCAN"}, + {"enum": ["DECISION_STUMP"], "description": "https://en.wikipedia.org/wiki/Decision_stump"}, + {"enum": ["DECISION_TREE"], "description": "https://en.wikipedia.org/wiki/Decision_tree"}, + {"enum": ["DEEP_BELIEF_NETWORK"], "description": "https://en.wikipedia.org/wiki/Deep_belief_network"}, + {"enum": ["DEEP_FEATURE_SYNTHESIS"], "description": "https://groups.csail.mit.edu/EVO-DesignOpt/groupWebSite/uploads/Site/DSAA_DSM_2015.pdf"}, + {"enum": ["DEEPLOG"], "description": "https://en.wikipedia.org/wiki/Deeplog"}, + 
{"enum": ["DEEP_NEURAL_NETWORK"], "description": "https://en.wikipedia.org/wiki/Deep_learning#Deep_neural_networks"}, + {"enum": ["DEINTERLACING"], "description": "https://en.wikipedia.org/wiki/Deinterlacing"}, + {"enum": ["DENSE_NEURAL_NETWORK_LAYER"]}, + {"enum": ["DISCRETIZATION"], "description": "https://en.wikipedia.org/wiki/Discretization"}, + {"enum": ["DPLL"], "description": "https://en.wikipedia.org/wiki/DPLL_algorithm"}, + {"enum": ["DROPOUT"], "description": "https://en.wikipedia.org/wiki/Dropout_(neural_networks)"}, + {"enum": ["DYNAMIC_NEURAL_NETWORK"]}, + {"enum": ["DYNAMIC_TIME_WARPING"], "description": "https://en.wikipedia.org/wiki/Dynamic_time_warping"}, + {"enum": ["EAGER_LEARNING"], "description": "https://en.wikipedia.org/wiki/Eager_learning"}, + {"enum": ["ECHO_STATE_NETWORK"], "description": "https://en.wikipedia.org/wiki/Echo_state_network"}, + {"enum": ["ECLAT"]}, + {"enum": ["EDGERANK"], "description": "https://en.wikipedia.org/wiki/EdgeRank"}, + {"enum": ["ELASTIC_NET_REGULARIZATION"], "description": "https://en.wikipedia.org/wiki/Elastic_net_regularization"}, + {"enum": ["ENCODE_BINARY"], "description": "https://en.wikipedia.org/wiki/Binary_code"}, + {"enum": ["ENCODE_ONE_HOT"], "description": "https://en.wikipedia.org/wiki/One-hot"}, + {"enum": ["ENCODE_ORDINAL"]}, + {"enum": ["ENCODE_UNARY"], "description": "https://en.wikipedia.org/wiki/Unary_numeral_system"}, + {"enum": ["EQUI_JOIN"], "description": "https://en.wikipedia.org/wiki/Join_(SQL)#Equi-join"}, + {"enum": ["ENSEMBLE_LEARNING"], "description": "https://en.wikipedia.org/wiki/Ensemble_learning"}, + {"enum": ["EVOLUTIONARY_ACQUISITION_OF_NEURAL_TOPOLOGIES"], "description": "https://en.wikipedia.org/wiki/Evolutionary_acquisition_of_neural_topologies"}, + {"enum": ["EVOLUTIONARY_MULTIMODAL_OPTIMIZATION"], "description": "https://en.wikipedia.org/wiki/Evolutionary_multimodal_optimization"}, + {"enum": ["EXPECTATION_MAXIMIZATION_ALGORITHM"], "description": "https://en.wikipedia.org/wiki/Expectation%E2%80%93maximization_algorithm"}, + {"enum": ["EXTENSION_NEURAL_NETWORK"], "description": "https://en.wikipedia.org/wiki/Extension_neural_network"}, + {"enum": ["EXTREME_LEARNING_MACHINE"], "description": "https://en.wikipedia.org/wiki/Extreme_learning_machine"}, + {"enum": ["F1_SCORE"], "description": "https://en.wikipedia.org/wiki/F1_score"}, + {"enum": ["FALSE_NEAREST_NEIGHBOR"], "description": "https://en.wikipedia.org/wiki/False_nearest_neighbor_algorithm"}, + {"enum": ["FASTICA"], "description": "https://en.wikipedia.org/wiki/FastICA"}, + {"enum": ["FEATURE_SCALING"], "description": "https://en.wikipedia.org/wiki/Feature_scaling"}, + {"enum": ["FEEDFORWARD_NEURAL_NETWORK"], "description": "https://en.wikipedia.org/wiki/Feedforward_neural_network"}, + {"enum": ["FELLEGI_SUNTER_ALGORITHM"]}, + {"enum": ["FILE_MANIPULATION"], "description": "https://en.wikipedia.org/wiki/Computer_file"}, + {"enum": ["FISHER_KERNEL"], "description": "https://en.wikipedia.org/wiki/Fisher_kernel"}, + {"enum": ["FLATTEN_NEURAL_NETWORK_LAYER"]}, + {"enum": ["FORWARD_ALGORITHM"], "description": "https://en.wikipedia.org/wiki/Forward_algorithm"}, + {"enum": ["FORWARD_BACKWARD_ALGORITHM"], "description": "https://en.wikipedia.org/wiki/Forward%E2%80%93backward_algorithm"}, + {"enum": ["FORWARD_DIFFERENCE_CODING"], "description": "https://stats.idre.ucla.edu/r/library/r-library-contrast-coding-systems-for-categorical-variables/#forward"}, + {"enum": ["FRANK_WOLFE_ALGORITHM"], "description": 
"https://en.wikipedia.org/wiki/Frank%E2%80%93Wolfe_algorithm"}, + {"enum": ["FREQUENCY_TRANSFORM"], "description": "https://en.wikipedia.org/wiki/Frequency_domain"}, + {"enum": ["FUZZY_CLUSTERING"], "description": "https://en.wikipedia.org/wiki/Fuzzy_clustering"}, + {"enum": ["GAUSSIAN_BLUR"], "description": "https://en.wikipedia.org/wiki/Gaussian_blur"}, + {"enum": ["GAUSSIAN_PROCESS"], "description": "https://en.wikipedia.org/wiki/Gaussian_process"}, + {"enum": ["GENERALIZED_HEBBIAN_ALGORITHM"], "description": "https://en.wikipedia.org/wiki/Generalized_Hebbian_Algorithm"}, + {"enum": ["GENERATIVE_TOPOGRAPHIC_MAP"], "description": "https://en.wikipedia.org/wiki/Generative_topographic_map"}, + {"enum": ["GENETIC_ALGORITHM"], "description": "https://en.wikipedia.org/wiki/Genetic_algorithm"}, + {"enum": ["GENETIC_ALGORITHM_FOR_RULE_SET_PRODUCTION"], "description": "https://en.wikipedia.org/wiki/Genetic_Algorithm_for_Rule_Set_Production"}, + {"enum": ["GENETIC_PROGRAMMING"], "description": "https://en.wikipedia.org/wiki/Genetic_programming"}, + {"enum": ["GENETIC_SCALE_RECURRENT_NEURAL_NETWORK"]}, + {"enum": ["GLOVE"], "description": "https://en.wikipedia.org/wiki/GloVe_(machine_learning)"}, + {"enum": ["GRADIENT_BOOSTING"], "description": "https://en.wikipedia.org/wiki/Gradient_boosting"}, + {"enum": ["GRADIENT_DESCENT"], "description": "https://en.wikipedia.org/wiki/Gradient_descent"}, + {"enum": ["GRAPHICAL_LASSO"], "description": "https://en.wikipedia.org/wiki/Graphical_lasso"}, + {"enum": ["GROWING_SELF_ORGANIZING_MAP"], "description": "https://en.wikipedia.org/wiki/Growing_self-organizing_map"}, + {"enum": ["HARD_CLUSTERING"]}, + {"enum": ["HASHING"], "description": "https://en.wikipedia.org/wiki/Hash_function"}, + {"enum": ["HELMERT_CODING"], "description": "https://stats.idre.ucla.edu/r/library/r-library-contrast-coding-systems-for-categorical-variables/#HELMERT"}, + {"enum": ["HEURISTIC"], "description": "https://en.wikipedia.org/wiki/Heuristic"}, + {"enum": ["HIDDEN_MARKOV_MODEL"], "description": "https://en.wikipedia.org/wiki/Hidden_Markov_model"}, + {"enum": ["HIDDEN_SEMI_MARKOV_MODEL"], "description": "https://en.wikipedia.org/wiki/Hidden_semi-Markov_model"}, + {"enum": ["HIERARCHICAL_CLUSTERING"], "description": "https://en.wikipedia.org/wiki/Hierarchical_clustering"}, + {"enum": ["HIERARCHICAL_TEMPORAL_MEMORY"], "description": "https://en.wikipedia.org/wiki/Hierarchical_temporal_memory"}, + {"enum": ["HIGHER_ORDER_SINGULAR_VALUE_DECOMPOSITION"], "description": "https://en.wikipedia.org/wiki/Higher-order_singular_value_decomposition"}, + {"enum": ["HOLDOUT"], "description": "https://en.wikipedia.org/wiki/Cross-validation_(statistics)#Holdout_method"}, + {"enum": ["HOLOGRAPHIC_ASSOCIATIVE_MEMORY"], "description": "https://en.wikipedia.org/wiki/Holographic_associative_memory"}, + {"enum": ["HOPFIELD_NETWORK"], "description": "https://en.wikipedia.org/wiki/Hopfield_network"}, + {"enum": ["HOSHEN_KOPELMAN_ALGORITHM"], "description": "https://en.wikipedia.org/wiki/Hoshen%E2%80%93Kopelman_algorithm"}, + {"enum": ["HYPER_BASIS_FUNCTION_NETWORK"], "description": "https://en.wikipedia.org/wiki/Hyper_basis_function_network"}, + {"enum": ["HYPERNEAT"], "description": "https://en.wikipedia.org/wiki/HyperNEAT"}, + {"enum": ["ID3"], "description": "https://en.wikipedia.org/wiki/ID3"}, + {"enum": ["IDENTITY_FUNCTION"], "description": "https://en.wikipedia.org/wiki/Identity_function"}, + {"enum": ["IMAGE_CROPPING"], "description": "https://en.wikipedia.org/wiki/Cropping_(image)"}, + {"enum": 
["IMAGE_PADDING"]}, + {"enum": ["IMAGE_ROTATION"]}, + {"enum": ["IMAGE_SCALING"], "description": "https://en.wikipedia.org/wiki/Image_scaling"}, + {"enum": ["IMAGE_TRANSFORM"]}, + {"enum": ["IMAGENET"], "description": "https://en.wikipedia.org/wiki/ImageNet"}, + {"enum": ["IMPUTATION"], "description": "https://en.wikipedia.org/wiki/Imputation_(statistics)"}, + {"enum": ["INDEPENDENT_COMPONENT_ANALYSIS"], "description": "https://en.wikipedia.org/wiki/Independent_component_analysis"}, + {"enum": ["INFORMATION_ENTROPY"], "description": "https://en.wikipedia.org/wiki/Entropy_(information_theory)"}, + {"enum": ["INFORMATION_FUZZY_NETWORKS"], "description": "https://en.wikipedia.org/wiki/Information_fuzzy_networks"}, + {"enum": ["INFORMATION_THEORETIC_METAFEATURE_EXTRACTION"]}, + {"enum": ["INSTANCE_BASED_LEARNING"], "description": "https://en.wikipedia.org/wiki/Instance-based_learning"}, + {"enum": ["INSTANTANEOUSLY_TRAINED_NEURAL_NETWORKS"], "description": "https://en.wikipedia.org/wiki/Instantaneously_trained_neural_networks"}, + {"enum": ["ISOMAP"], "description": "https://en.wikipedia.org/wiki/Isomap"}, + {"enum": ["ITERATIVE_LABELING"], "description": "Algorithms iteratively label unlabeled examples for semi-supervised learning."}, + {"enum": ["IVECTOR_EXTRACTION"], "description": "I-vector extration. Dehak, Najim & Kenny, Patrick & Dehak, R & Dumouchel, Pierre & Ouellet, Pierre. (2011). Front-End Factor Analysis for Speaker Verification. Audio, Speech, and Language Processing, IEEE Transactions on. 19. 788 - 798. 10.1109/TASL.2010.2064307."}, + {"enum": ["JACCARD_INDEX"], "description": "https://en.wikipedia.org/wiki/Jaccard_index"}, + {"enum": ["JUNCTION_TREE_ALGORITHM"], "description": "https://en.wikipedia.org/wiki/Junction_tree_algorithm"}, + {"enum": ["K_FOLD"], "description": "https://en.wikipedia.org/wiki/Cross-validation_(statistics)#k-fold_cross-validation"}, + {"enum": ["K_MEANS_CLUSTERING"], "description": "https://en.wikipedia.org/wiki/K-means_clustering"}, + {"enum": ["K_MEANS_PLUS_PLUS"], "description": "https://en.wikipedia.org/wiki/K-means%2B%2B"}, + {"enum": ["K_NEAREST_NEIGHBORS"], "description": "https://en.wikipedia.org/wiki/K-nearest_neighbors_algorithm"}, + {"enum": ["K_Q_FLATS"], "description": "https://en.wikipedia.org/wiki/K_q-flats"}, + {"enum": ["K_SVD"], "description": "https://en.wikipedia.org/wiki/K-SVD"}, + {"enum": ["KERNEL_ADAPTIVE_FILTER"], "description": "https://en.wikipedia.org/wiki/Kernel_adaptive_filter"}, + {"enum": ["KERNEL_INDEPENDENT_COMPONENT_ANALYSIS"], "description": "https://en.wikipedia.org/wiki/Kernel-independent_component_analysis"}, + {"enum": ["KERNEL_METHOD"], "description": "https://en.wikipedia.org/wiki/Kernel_method"}, + {"enum": ["KERNEL_PERCEPTRON"], "description": "https://en.wikipedia.org/wiki/Kernel_perceptron"}, + {"enum": ["KERNEL_PRINCIPAL_COMPONENT_ANALYSIS"], "description": "https://en.wikipedia.org/wiki/Kernel_principal_component_analysis"}, + {"enum": ["KERNEL_RANDOM_FOREST"], "description": "https://en.wikipedia.org/wiki/Random_forest#Kernel_random_forest"}, + {"enum": ["LANDMARKING_METAFEATURE_EXTRACTION"]}, + {"enum": ["LARGE_MARGIN_NEAREST_NEIGHBOR"], "description": "https://en.wikipedia.org/wiki/Large_margin_nearest_neighbor"}, + {"enum": ["LASSO"], "description": "https://en.wikipedia.org/wiki/Lasso_(statistics)"}, + {"enum": ["LATENT_DIRICHLET_ALLOCATION"], "description": "https://en.wikipedia.org/wiki/Latent_Dirichlet_allocation"}, + {"enum": ["LATENT_SEMANTIC_ANALYSIS"], "description": 
"https://en.wikipedia.org/wiki/Latent_semantic_analysis"}, + {"enum": ["LEARNING_USING_PRIVILEGED_INFORMATION"], "description": "Algorithm can leverage privileged information available in training data but absent in test data."}, + {"enum": ["LEARNING_VECTOR_QUANTIZATION"], "description": "https://en.wikipedia.org/wiki/Learning_vector_quantization"}, + {"enum": ["LEAST_SQUARES_SUPPORT_VECTOR_MACHINE"], "description": "https://en.wikipedia.org/wiki/Least_squares_support_vector_machine"}, + {"enum": ["LEAVE_ONE_OUT"], "description": "https://en.wikipedia.org/wiki/Cross-validation_(statistics)#Leave-one-out_cross-validation"}, + {"enum": ["LIGHTGBM"]}, + {"enum": ["LIMITED_MEMORY_BFGS"], "description": "https://en.wikipedia.org/wiki/Limited-memory_BFGS"}, + {"enum": ["LINDE_BUZO_GRAY_ALGORITHM"], "description": "https://en.wikipedia.org/wiki/Linde%E2%80%93Buzo%E2%80%93Gray_algorithm"}, + {"enum": ["LINEAR_DISCRIMINANT_ANALYSIS"], "description": "https://en.wikipedia.org/wiki/Linear_discriminant_analysis"}, + {"enum": ["LINEAR_FILTER"], "description": "https://en.wikipedia.org/wiki/Linear_filter"}, + {"enum": ["LINEAR_REGRESSION"], "description": "https://en.wikipedia.org/wiki/Linear_regression"}, + {"enum": ["LOBPCG"], "description": "https://en.wikipedia.org/wiki/LOBPCG"}, + {"enum": ["LOCAL_OUTLIER_FACTOR"], "description": "https://en.wikipedia.org/wiki/Local_outlier_factor"}, + {"enum": ["LOCAL_SEARCH"], "description": "https://en.wikipedia.org/wiki/Local_search_(optimization)"}, + {"enum": ["LOGISTIC_MODEL_TREE"], "description": "https://en.wikipedia.org/wiki/Logistic_model_tree"}, + {"enum": ["LOGISTIC_REGRESSION"], "description": "https://en.wikipedia.org/wiki/Logistic_regression"}, + {"enum": ["LOGITBOOST"], "description": "https://en.wikipedia.org/wiki/LogitBoost"}, + {"enum": ["LONG_SHORT_TERM_MEMORY"], "description": "https://en.wikipedia.org/wiki/Long_short-term_memory"}, + {"enum": ["LOW_RANK_MATRIX_APPROXIMATIONS"], "description": "https://en.wikipedia.org/wiki/Low-rank_matrix_approximations"}, + {"enum": ["LPBOOST"], "description": "https://en.wikipedia.org/wiki/LPBoost"}, + {"enum": ["MAP"], "description": "https://en.wikipedia.org/wiki/Map_(higher-order_function)"}, + {"enum": ["MARGIN_CLASSIFIER"], "description": "https://en.wikipedia.org/wiki/Margin_classifier"}, + {"enum": ["MARGIN_INFUSED_RELAXED_ALGORITHM"], "description": "https://en.wikipedia.org/wiki/Margin-infused_relaxed_algorithm"}, + {"enum": ["MARKOV_CHAIN"], "description": "https://en.wikipedia.org/wiki/Markov_chain"}, + {"enum": ["MARKOV_CHAIN_MONTE_CARLO"], "description": "https://en.wikipedia.org/wiki/Markov_chain_Monte_Carlo"}, + {"enum": ["MARKOV_DECISION_PROCESS"], "description": "https://en.wikipedia.org/wiki/Markov_decision_process"}, + {"enum": ["MARKOV_LOGIC_NETWORK"], "description": "https://en.wikipedia.org/wiki/Markov_logic_network"}, + {"enum": ["MARKOV_MODEL"], "description": "https://en.wikipedia.org/wiki/Markov_model"}, + {"enum": ["MARKOV_RANDOM_FIELD"], "description": "https://en.wikipedia.org/wiki/Markov_random_field"}, + {"enum": ["MAX_POOLING_NEURAL_NETWORK_LAYER"]}, + {"enum": ["MEAN_ABSOLUTE_ERROR"], "description": "https://en.wikipedia.org/wiki/Mean_absolute_error"}, + {"enum": ["MEAN_SHIFT"], "description": "https://en.wikipedia.org/wiki/Mean_shift"}, + {"enum": ["MEAN_SQUARED_ERROR"], "description": " https://en.wikipedia.org/wiki/Mean_squared_error"}, + {"enum": ["MEMETIC_ALGORITHM"], "description": "https://en.wikipedia.org/wiki/Memetic_algorithm"}, + {"enum": 
["MEMORY_PREDICTION_FRAMEWORK"], "description": "https://en.wikipedia.org/wiki/Memory-prediction_framework"}, + {"enum": ["MERSENNE_TWISTER"], "description": "https://en.wikipedia.org/wiki/Mersenne_Twister"}, + {"enum": ["MFCC_FEATURE_EXTRACTION"], "description": "The HTK Book, http://www.dsic.upv.es/docs/posgrado/20/RES/materialesDocentes/alejandroViewgraphs/htkbook.pdf"}, + {"enum": ["MIN_CONFLICTS_ALGORITHM"], "description": "https://en.wikipedia.org/wiki/Min-conflicts_algorithm"}, + {"enum": ["MINIMUM_REDUNDANCY_FEATURE_SELECTION"], "description": "https://en.wikipedia.org/wiki/Minimum_redundancy_feature_selection"}, + {"enum": ["MINMAX_SCALER"]}, + {"enum": ["MM_ALGORITHM"], "description": "https://en.wikipedia.org/wiki/MM_algorithm"}, + {"enum": ["MODEL_BASED_METAFEATURE_EXTRACTION"]}, + {"enum": ["MODULAR_NEURAL_NETWORK"], "description": "https://en.wikipedia.org/wiki/Modular_neural_network"}, + {"enum": ["MOMENTUM_CONTRAST"], "description": "Momentum Contrast for Unsupervised Visual Representation Learning, https://arxiv.org/pdf/1911.05722.pdf, He et al. FAIR"}, + {"enum": ["MONTE_CARLO_TREE_SEARCH"], "description": "https://en.wikipedia.org/wiki/Monte_Carlo_tree_search"}, + {"enum": ["MORAVEC_CORNER_DETECTION_ALGORITHM"]}, + {"enum": ["MOTION_COMPENSATION"], "description": "https://en.wikipedia.org/wiki/Motion_compensation"}, + {"enum": ["MULTI_ARMED_BANDIT"], "description": "https://en.wikipedia.org/wiki/Multi-armed_bandit"}, + {"enum": ["MULTICLASS_CLASSIFICATION"], "description": "https://en.wikipedia.org/wiki/Multiclass_classification"}, + {"enum": ["MULTILABEL_CLASSIFICATION"], "description": "https://en.wikipedia.org/wiki/Multi-label_classification"}, + {"enum": ["MULTILAYER_PERCEPTRON"], "description": "https://en.wikipedia.org/wiki/Multilayer_perceptron"}, + {"enum": ["MULTINOMIAL_LOGISTIC_REGRESSION"], "description": "https://en.wikipedia.org/wiki/Multinomial_logistic_regression"}, + {"enum": ["MULTINOMIAL_NAIVE_BAYES"], "description": "http://scikit-learn.org/stable/modules/naive_bayes.html#multinomial-naive-bayes, https://nlp.stanford.edu/IR-book/html/htmledition/naive-bayes-text-classification-1.html"}, + {"enum": ["MULTIPLICATIVE_WEIGHT_UPDATE_METHOD"], "description": "https://en.wikipedia.org/wiki/Multiplicative_weight_update_method"}, + {"enum": ["MULTIVARIATE_REGRESSION"], "description": "https://en.wikipedia.org/wiki/Multi-label_classification"}, + {"enum": ["MUTUAL_INFORMATION"], "description": "https://en.wikipedia.org/wiki/Mutual_information"}, + {"enum": ["N_GRAM"], "description": "https://en.wikipedia.org/wiki/N-gram"}, + {"enum": ["NAIVE_BAYES_CLASSIFIER"], "description": "https://en.wikipedia.org/wiki/Naive_Bayes_classifier"}, + {"enum": ["NEAREST_CENTROID_CLASSIFIER"], "description": "https://en.wikipedia.org/wiki/Nearest_centroid_classifier"}, + {"enum": ["NEIGHBOURHOOD_COMPONENTS_ANALYSIS"], "description": "https://en.wikipedia.org/wiki/Neighbourhood_components_analysis"}, + {"enum": ["NEURAL_NETWORK_BACKPROPAGATION"], "description": "https://en.wikipedia.org/wiki/Backpropagation"}, + {"enum": ["NEURO_FUZZY_NETWORK"], "description": "https://en.wikipedia.org/wiki/Neuro-fuzzy"}, + {"enum": ["NEUROEVOLUTION_OF_AUGMENTED_TOPOLOGIES"], "description": "https://en.wikipedia.org/wiki/Neuroevolution_of_augmenting_topologies"}, + {"enum": ["NOISE_REDUCTION"], "description": "https://en.wikipedia.org/wiki/Noise_reduction"}, + {"enum": ["NONOVERLAPPING_COMMUNITY_DETECTION"]}, + {"enum": ["NORMAL_DISTRIBUTION"], "description": 
"https://en.wikipedia.org/wiki/Normal_distribution"}, + {"enum": ["NUMERICAL_METHOD"], "description": "https://en.wikipedia.org/wiki/Numerical_method"}, + {"enum": ["ONE_RULE"]}, + {"enum": ["ONE_SHOT_ASSOCIATIVE_MEMORY"]}, + {"enum": ["ONE_SHOT_LEARNING"], "description": "https://en.wikipedia.org/wiki/One-shot_learning"}, + {"enum": ["OPTICS_ALGORITHM"], "description": "https://en.wikipedia.org/wiki/OPTICS_algorithm"}, + {"enum": ["OPTIMISTIC_KNOWLEDGE_GRADIENT"], "description": "https://en.wikipedia.org/wiki/Optimistic_knowledge_gradient"}, + {"enum": ["ORTHOGONAL_POLYNOMIAL_CODING"], "description": "https://stats.idre.ucla.edu/r/library/r-library-contrast-coding-systems-for-categorical-variables/#ORTHOGONAL"}, + {"enum": ["OVERLAPPING_CLUSTERING"]}, + {"enum": ["OVERLAPPING_COMMUNITY_DETECTION"]}, + {"enum": ["PACHINKO_ALLOCATION"], "description": "https://en.wikipedia.org/wiki/Pachinko_allocation"}, + {"enum": ["PAGERANK"], "description": "https://en.wikipedia.org/wiki/PageRank"}, + {"enum": ["PARAMETRIC_TRAJECTORY_MODELING"], "description": "Gish, H. and Ng, K., 1996, October. Parametric trajectory models for speech recognition. In Spoken Language, 1996. ICSLP 96. Proceedings., Fourth International Conference on (Vol. 1, pp. 466-469). IEEE."}, + {"enum": ["PARTIAL_LEAST_SQUARES_REGRESSION"], "description": "https://en.wikipedia.org/wiki/Partial_least_squares_regression"}, + {"enum": ["PARTICLE_SWARM_OPTIMIZATION"], "description": "https://en.wikipedia.org/wiki/Particle_swarm_optimization"}, + {"enum": ["PASSIVE_AGGRESSIVE"], "description": "http://jmlr.csail.mit.edu/papers/volume7/crammer06a/crammer06a.pdf"}, + {"enum": ["PERCEPTRON"], "description": "https://en.wikipedia.org/wiki/Perceptron"}, + {"enum": ["PHYSICAL_NEURAL_NETWORK"], "description": "https://en.wikipedia.org/wiki/Physical_neural_network"}, + {"enum": ["PIXELATION"], "description": "https://en.wikipedia.org/wiki/Pixelation"}, + {"enum": ["POLYNOMIAL_NEURAL_NETWORK"]}, + {"enum": ["POLYNOMIAL_REGRESSION"], "description": "https://en.wikipedia.org/wiki/Polynomial_regression"}, + {"enum": ["POPULATION_BASED_INCREMENTAL_LEARNING"], "description": "https://en.wikipedia.org/wiki/Population-based_incremental_learning"}, + {"enum": ["PREFRONTAL_CORTEX_BASAL_GANGLIA_WORKING_MEMORY"], "description": "https://en.wikipedia.org/wiki/Prefrontal_cortex_basal_ganglia_working_memory"}, + {"enum": ["PRINCIPAL_COMPONENT_ANALYSIS"], "description": "https://en.wikipedia.org/wiki/Principal_component_analysis"}, + {"enum": ["PROBABILISTIC_DATA_CLEANING"]}, + {"enum": ["PROBABILISTIC_LATENT_SEMANTIC_ANALYSIS"], "description": "https://en.wikipedia.org/wiki/Probabilistic_latent_semantic_analysis"}, + {"enum": ["PROBABILISTIC_NEURAL_NETWORK"], "description": "https://en.wikipedia.org/wiki/Probabilistic_neural_network"}, + {"enum": ["PRUNING"], "description": "https://en.wikipedia.org/wiki/Pruning_(decision_trees)"}, + {"enum": ["PSIPRED"], "description": "https://en.wikipedia.org/wiki/PSIPRED"}, + {"enum": ["Q_LEARNING"], "description": "https://en.wikipedia.org/wiki/Q-learning"}, + {"enum": ["QUADRATIC_DISCRIMINANT_ANALYSIS"], "description": "https://en.wikipedia.org/wiki/Quadratic_classifier#Quadratic_discriminant_analysis"}, + {"enum": ["QUANTUM_NEURAL_NETWORK"], "description": "https://en.wikipedia.org/wiki/Quantum_neural_network"}, + {"enum": ["QUICKPROP"], "description": "https://en.wikipedia.org/wiki/Quickprop"}, + {"enum": ["RADIAL_BASIS_FUNCTION_NETWORK"], "description": "https://en.wikipedia.org/wiki/Radial_basis_function_network"}, + 
{"enum": ["RANDOM_FOREST"], "description": "https://en.wikipedia.org/wiki/Random_forest"}, + {"enum": ["RANDOM_GRAPH"], "description": "https://en.wikipedia.org/wiki/Random_graph"}, + {"enum": ["RANDOM_PROJECTION"], "description": "https://en.wikipedia.org/wiki/Random_projection"}, + {"enum": ["RANDOM_SUBSPACE_METHOD"], "description": "https://en.wikipedia.org/wiki/Random_subspace_method"}, + {"enum": ["RANDOM_WALK"], "description": "https://en.wikipedia.org/wiki/Random_walk"}, + {"enum": ["RANDOMIZED_WEIGHTED_MAJORITY_ALGORITHM"], "description": "https://en.wikipedia.org/wiki/Randomized_weighted_majority_algorithm"}, + {"enum": ["RANKBRAIN"], "description": "https://en.wikipedia.org/wiki/RankBrain"}, + {"enum": ["RANKING_SVM"], "description": "https://en.wikipedia.org/wiki/Ranking_SVM"}, + {"enum": ["RAPIDLY_EXPLORING_RANDOM_TREE"], "description": "https://en.wikipedia.org/wiki/Rapidly-exploring_random_tree"}, + {"enum": ["RECEIVER_OPERATING_CHARACTERISTIC"], "description": "https://en.wikipedia.org/wiki/Receiver_operating_characteristic"}, + {"enum": ["RECURRENT_NEURAL_NETWORK"], "description": "https://en.wikipedia.org/wiki/Recurrent_neural_network"}, + {"enum": ["RECURSIVE_LEAST_SQUARES"], "description": "https://en.wikipedia.org/wiki/Recursive_least_squares_filter"}, + {"enum": ["RECURSIVE_PARTITIONING"], "description": "https://en.wikipedia.org/wiki/Recursive_partitioning"}, + {"enum": ["REGULARIZATION_BY_SPECTRAL_FILTERING"], "description": "https://en.wikipedia.org/wiki/Regularization_by_spectral_filtering"}, + {"enum": ["REGULARIZED_LEAST_SQUARES"], "description": "https://en.wikipedia.org/wiki/Regularized_least_squares"}, + {"enum": ["REGULATORY_FEEDBACK_NETWORK"], "description": "https://en.wikipedia.org/wiki/Regulatory_feedback_network"}, + {"enum": ["REINFORCE_ALGORITHM"]}, + {"enum": ["REJECTION_SAMPLING"], "description": "https://en.wikipedia.org/wiki/Rejection_sampling"}, + {"enum": ["RELATIONAL_ALGEBRA"], "description": "https://en.wikipedia.org/wiki/Relational_algebra"}, + {"enum": ["RELATIONAL_DATA_MINING"], "description": "https://en.wikipedia.org/wiki/Relational_data_mining"}, + {"enum": ["RELIEF"], "description": "https://en.wikipedia.org/wiki/Relief_(feature_selection)"}, + {"enum": ["RESTRICTED_BOLTZMANN_MACHINE"], "description": "https://en.wikipedia.org/wiki/Restricted_Boltzmann_machine"}, + {"enum": ["RETINANET"], "description": "https://arxiv.org/abs/1708.02002"}, + {"enum": ["REVERSE_HELMERT_CODING"], "description": "https://stats.idre.ucla.edu/r/library/r-library-contrast-coding-systems-for-categorical-variables/#reverse"}, + {"enum": ["REVERSE_MONTE_CARLO"], "description": "https://en.wikipedia.org/wiki/Reverse_Monte_Carlo"}, + {"enum": ["RIPPER"], "description": "https://en.wikipedia.org/wiki/Repeated_incremental_pruning_to_produce_error_reduction_(RIPPER)"}, + {"enum": ["ROBUST_PRINCIPAL_COMPONENT_ANALYSIS"], "description": "https://en.wikipedia.org/wiki/Robust_principal_component_analysis"}, + {"enum": ["RPROP"], "description": "https://en.wikipedia.org/wiki/Rprop"}, + {"enum": ["RULE_BASED_MACHINE_LEARNING"], "description": "https://en.wikipedia.org/wiki/Rule-based_machine_learning"}, + {"enum": ["SAMPLE_MERGING"]}, + {"enum": ["SAMPLE_SELECTION"]}, + {"enum": ["SELF_ORGANIZING_MAP"], "description": "https://en.wikipedia.org/wiki/Self-organizing_map"}, + {"enum": ["SEMIDEFINITE_EMBEDDING"], "description": "https://en.wikipedia.org/wiki/Semidefinite_embedding"}, + {"enum": ["SIGNAL_DITHERING"], "description": "https://en.wikipedia.org/wiki/Dither"}, + 
{"enum": ["SIGNAL_ENERGY"], "description": "https://en.wikipedia.org/wiki/Energy_(signal_processing)"}, + {"enum": ["SIGNAL_TO_NOISE_RATIO"], "description": "https://en.wikipedia.org/wiki/Signal-to-noise_ratio"}, + {"enum": ["SIMULATED_ANNEALING"], "description": "https://en.wikipedia.org/wiki/Simulated_annealing"}, + {"enum": ["SINGULAR_VALUE_DECOMPOSITION"], "description": "https://en.wikipedia.org/wiki/Singular-value_decomposition"}, + {"enum": ["SMOOTHED_ANALYSIS"], "description": "https://en.wikipedia.org/wiki/Smoothed_analysis"}, + {"enum": ["SOFT_CLUSTERING"], "description": "https://en.wikipedia.org/wiki/Fuzzy_clustering"}, + {"enum": ["SOFTMAX_FUNCTION"], "description": "https://en.wikipedia.org/wiki/Softmax_function"}, + {"enum": ["SPARSE_DICTIONARY_LEARNING"], "description": "https://en.wikipedia.org/wiki/Sparse_dictionary_learning"}, + {"enum": ["SPARSE_PCA"], "description": "https://en.wikipedia.org/wiki/Sparse_PCA"}, + {"enum": ["SPECTRAL_CLUSTERING"], "description": "https://en.wikipedia.org/wiki/Spectral_clustering"}, + {"enum": ["SPIKE_AND_SLAB_VARIABLE_SELECTION"], "description": "https://en.wikipedia.org/wiki/Spike-and-slab_variable_selection"}, + {"enum": ["SPIKING_NEURAL_NETWORKS"], "description": "https://en.wikipedia.org/wiki/Spiking_neural_network"}, + {"enum": ["SPRUCE"], "description": "https://gitlab.com/zinkov/spruce/blob/master/README.md"}, + {"enum": ["STATISTICAL_METAFEATURE_EXTRACTION"]}, + {"enum": ["STATISTICAL_MOMENT_ANALYSIS"], "description": "https://en.wikipedia.org/wiki/Moment_(mathematics)"}, + {"enum": ["STOCHASTIC_CHAINS_WITH_MEMORY_OF_VARIABLE_LENGTH"], "description": "https://en.wikipedia.org/wiki/Stochastic_chains_with_memory_of_variable_length"}, + {"enum": ["STOCHASTIC_GRADIENT_DESCENT"], "description": "https://en.wikipedia.org/wiki/Stochastic_gradient_descent"}, + {"enum": ["STOCHASTIC_NEURAL_NETWORK"], "description": "https://en.wikipedia.org/wiki/Stochastic_neural_network"}, + {"enum": ["STRICT_PARTITIONING_CLUSTERING"]}, + {"enum": ["STRICT_PARTITIONING_CLUSTERING_WITH_OUTLIERS"]}, + {"enum": ["STRUCTURED_KNN"], "description": "https://en.wikipedia.org/wiki/Structured_kNN"}, + {"enum": ["STRUCTURED_SPARSITY_REGULARIZATION"], "description": "https://en.wikipedia.org/wiki/Structured_sparsity_regularization"}, + {"enum": ["STRUCTURED_SUPPORT_VECTOR_MACHINE"], "description": "https://en.wikipedia.org/wiki/Structured_support_vector_machine"}, + {"enum": ["SUBSPACE_CLUSTERING"], "description": "https://en.wikipedia.org/wiki/Clustering_high-dimensional_data#Subspace_clustering"}, + {"enum": ["SUM_CODING"]}, + {"enum": ["SUPER_RECURSIVE_ALGORITHM"], "description": "https://en.wikipedia.org/wiki/Super-recursive_algorithm"}, + {"enum": ["SUPPORT_VECTOR_MACHINE"], "description": "https://en.wikipedia.org/wiki/Support_vector_machine"}, + {"enum": ["SYMBOLIC_REGRESSION"], "description": "https://en.wikipedia.org/wiki/Symbolic_regression"}, + {"enum": ["T_DISTRIBUTED_STOCHASTIC_NEIGHBOR_EMBEDDING"], "description": "https://en.wikipedia.org/wiki/T-distributed_stochastic_neighbor_embedding"}, + {"enum": ["TELEMANOM"]}, + {"enum": ["TFIDF"], "description": "https://en.wikipedia.org/wiki/Tf-idf"}, + {"enum": ["TIKHONOV_REGULARIZATION"], "description": "https://en.wikipedia.org/wiki/Tikhonov_regularization"}, + {"enum": ["TIME_DELAY_NEURAL_NETWORK"], "description": "https://en.wikipedia.org/wiki/Time_delay_neural_network"}, + {"enum": ["TRUNCATED_NEWTON_METHOD"], "description": "https://en.wikipedia.org/wiki/Truncated_Newton_method"}, + {"enum": 
["TRUNCATED_NORMAL_DISTRIBUTION"], "description": "https://en.wikipedia.org/wiki/Truncated_normal_distribution"}, + {"enum": ["UNIFORM_DISTRIBUTION"], "description": "https://en.wikipedia.org/wiki/Uniform_distribution_(continuous)"}, + {"enum": ["UNIFORM_TIME_SERIES_SEGMENTATION"], "description": "Time-series segmentation into fixed-sized segments (windows, frames)"}, + {"enum": ["UNIT_WEIGHTED_REGRESSION"], "description": "https://en.wikipedia.org/wiki/Unit-weighted_regression"}, + {"enum": ["UNIVARIATE_REGRESSION"], "description": "https://en.wikipedia.org/wiki/Multi-label_classification"}, + {"enum": ["UNIVERSAL_PORTFOLIO_ALGORITHM"], "description": "https://en.wikipedia.org/wiki/Universal_portfolio_algorithm"}, + {"enum": ["VARIABLE_ORDER_MARKOV_MODEL"], "description": "https://en.wikipedia.org/wiki/Variable-order_Markov_model"}, + {"enum": ["VARIATIONAL_BAYESIAN_METHODS"], "description": "https://en.wikipedia.org/wiki/Variational_Bayesian_methods"}, + {"enum": ["VARIATIONAL_MESSAGE_PASSING"], "description": "https://en.wikipedia.org/wiki/Variational_message_passing"}, + {"enum": ["VECTOR_AUTOREGRESSION"], "description": "https://en.wikipedia.org/wiki/Vector_autoregression"}, + {"enum": ["VECTOR_QUANTIZATION"], "description": "https://en.wikipedia.org/wiki/Vector_quantization"}, + {"enum": ["VECTORIZATION"], "description": "https://en.wikipedia.org/wiki/Vectorization"}, + {"enum": ["VERSION_SPACE_LEARNING"], "description": "https://en.wikipedia.org/wiki/Version_space_learning"}, + {"enum": ["WAKE_SLEEP_ALGORITHM"], "description": "https://en.wikipedia.org/wiki/Wake-sleep_algorithm"}, + {"enum": ["WEIGHTED_MAJORITY_ALGORITHM"], "description": "https://en.wikipedia.org/wiki/Weighted_majority_algorithm_(machine_learning)"}, + {"enum": ["WINNOW"], "description": "https://en.wikipedia.org/wiki/Winnow_(algorithm)"}, + {"enum": ["WORD2VEC"], "description": "https://en.wikipedia.org/wiki/Word2vec"} + ] + }, + "minItems": 1 + }, + "primitive_family": { + "description": "Primitive family describes the high-level purpose/nature of the primitive. Only one value per primitive is possible. 
Consider splitting a primitive into multiple primitives if this represents a problem for you.", + "oneOf": [ + {"enum": ["REINFORCEMENT"], "description": "Reinforcement Module"}, + {"enum": ["ANOMALY_DETECTION"], "description": "TODS algorithms"}, + {"enum": ["CLASSIFICATION"], "description": "https://en.wikipedia.org/wiki/Statistical_classification"}, + {"enum": ["CLUSTERING"], "description": "https://en.wikipedia.org/wiki/Cluster_analysis"}, + {"enum": ["COLLABORATIVE_FILTERING"], "description": "https://en.wikipedia.org/wiki/Collaborative_filtering"}, + {"enum": ["COMMUNITY_DETECTION"], "description": "https://en.wikipedia.org/wiki/Community_search"}, + {"enum": ["DATA_AUGMENTATION"], "description": "Adding value to base data by adding information derived from internal and external sources."}, + {"enum": ["DATA_CLEANING"], "description": "https://en.wikipedia.org/wiki/Data_cleansing"}, + {"enum": ["DATA_COMPRESSION"], "description": "https://en.wikipedia.org/wiki/Data_compression"}, + {"enum": ["DATA_GENERATION"], "description": "https://en.wikipedia.org/wiki/Data_generating_process"}, + {"enum": ["DATA_PREPROCESSING"], "description": "https://en.wikipedia.org/wiki/Data_pre-processing"}, + {"enum": ["DATA_TRANSFORMATION"], "description": "https://en.wikipedia.org/wiki/Data_transformation"}, + {"enum": ["DATA_VALIDATION"], "description": "https://en.wikipedia.org/wiki/Data_validation"}, + {"enum": ["DATA_WRANGLING"], "description": "https://en.wikipedia.org/wiki/Data_wrangling"}, + {"enum": ["DIGITAL_IMAGE_PROCESSING"], "description": "https://en.wikipedia.org/wiki/Digital_image_processing"}, + {"enum": ["DIGITAL_SIGNAL_PROCESSING"], "description": "https://en.wikipedia.org/wiki/Digital_signal_processing"}, + {"enum": ["DIMENSIONALITY_REDUCTION"], "description": "https://en.wikipedia.org/wiki/Dimensionality_reduction"}, + {"enum": ["EVALUATION"], "description": "Primitives providing validation/evaluation, like cross-validation."}, + {"enum": ["FEATURE_CONSTRUCTION"], "description": "A primitive which creates new features."}, + {"enum": ["FEATURE_EXTRACTION"], "description": "https://en.wikipedia.org/wiki/Feature_extraction"}, + {"enum": ["FEATURE_SELECTION"], "description": "https://en.wikipedia.org/wiki/Feature_selection"}, + {"enum": ["GRAPH_CLUSTERING"]}, + {"enum": ["GRAPH_MATCHING"], "description": "https://en.wikipedia.org/wiki/Graph_matching"}, + {"enum": ["LAYER"], "description": "A primitive which is a neural network layer used in construction of a neural network."}, + {"enum": ["LEARNER"], "description": "A primitive which is a learner/model."}, + {"enum": ["LINK_PREDICTION"]}, + {"enum": ["LOSS_FUNCTION"], "description": "Primitives can take a loss function as an argument. 
This family of primitives provide such loss functions and they can be passed as an argument to other primitives."}, + {"enum": ["METALEARNING"], "description": "https://en.wikipedia.org/wiki/Meta_learning_(computer_science)"}, + {"enum": ["NATURAL_LANGUAGE_PROCESSING"], "description": "https://en.wikipedia.org/wiki/Natural_language_processing"}, + {"enum": ["NORMALIZATION"]}, + {"enum": ["OBJECT_DETECTION"], "description": "https://en.wikipedia.org/wiki/Object_detection"}, + {"enum": ["OPERATOR"], "description": "A simple mathematical operator."}, + {"enum": ["REGRESSION"], "description": "A primitive which can be used to address regression problems."}, + {"enum": ["SEMISUPERVISED_CLASSIFICATION"]}, + {"enum": ["SEMISUPERVISED_REGRESSION"]}, + {"enum": ["SIMILARITY_MODELING"], "description": "A primitive which attempts to learn or infer a measure of similarity or dissimilarity between pairs of instances."}, + {"enum": ["TIME_SERIES_CLASSIFICATION"], "description": "A primitive which can be used to address classification problems of time-series."}, + {"enum": ["TIME_SERIES_EMBEDDING"], "description": "A fixed-length representation of variable-length time series." }, + {"enum": ["TIME_SERIES_FORECASTING"]}, + {"enum": ["TIME_SERIES_SEGMENTATION"], "description": "A primitive which segments an input time-series into a sequence of discrete segments in order to reveal the underlying properties of its source. https://en.wikipedia.org/wiki/Time-series_segmentation."}, + {"enum": ["VERTEX_CLASSIFICATION"]}, + {"enum": ["VERTEX_NOMINATION"]}, + {"enum": ["VIDEO_PROCESSING"], "description": "https://en.wikipedia.org/wiki/Video_processing"}, + {"enum": ["SCHEMA_DISCOVERY"]}, + {"enum": ["REMOTE_SENSING"]} + ] + }, + "preconditions": { + "type": "array", + "description": "A set of requirements for the data given as an input to this primitive. For example, a primitive may not be able to handle data with missing values.", + "items": { + "oneOf": [ + {"enum": ["NO_MISSING_VALUES"], "description": "The primitive cannot handle missing values."}, + {"enum": ["NO_CATEGORICAL_VALUES"], "description": "The primitive cannot handle categorical values."}, + {"enum": ["NO_NEGATIVE_VALUES"], "description": "The primitive cannot handle negative values."}, + {"enum": ["NO_CONTINUOUS_VALUES"], "description": "The primitive cannot handle continuous values."}, + {"enum": ["NO_JAGGED_VALUES"], "description": "The primitive cannot handle values where different elements of data have different dimensions. Both numpy arrays and pandas support only fixed dimension sizes, but a list of lists could have some sub-lists of a different length to others, or a numpy array of objects where objects are numpy arrays of different sizes."}, + {"enum": ["NO_NESTED_VALUES"], "description": "The primitive cannot handle values where a container value contains nested other values with dimensions. E.g., a Pandas DataFrame having numpy arrays as values. Not just container types have dimensions."} + ] + }, + "minItems": 1 + }, + "effects": { + "type": "array", + "description": "A set of postconditions obtained by the data processed by this primitive. 
For example, a primitive may remove missing values.", + "items": { + "oneOf":[ + {"enum": ["NO_MISSING_VALUES"], "description": "The primitive removes missing values (e.g., imputation)."}, + {"enum": ["NO_CATEGORICAL_VALUES"], "description": "The primitive removes categorical columns (e.g., label encoder)."}, + {"enum": ["NO_NEGATIVE_VALUES"], "description": "The primitive produces only non-negative values."}, + {"enum": ["NO_CONTINUOUS_VALUES"], "description": "The data produced by this primitive is discretized."}, + {"enum": ["NO_JAGGED_VALUES"], "description": "The primitive produces values with fixed dimension sizes across all elements."}, + {"enum": ["NO_NESTED_VALUES"], "description": "The primitive produces values where a container value does not contain nested any other values with dimensions."} + ] + }, + "minItems": 1 + }, + "hyperparams_to_tune": { + "type": "array", + "description": "A list containing the significant hyper-parameter names of a primitive that should be tuned (for prioritizing hyper-parameter tuning). For instance, if a primitive has 10 hyper-parameters, this metadata may be used to specify the two or three that affect the results the most.", + "items": { + "type": "string" + }, + "minItems": 1 + }, + "outliers": { + "type": "array", + "description": "Number of outliers n sigma away from mean for some list of numbers.", + "items": { + "type": "object", + "properties": { + "n": { + "type": "integer" + }, + "count": { + "type": "integer" + } + }, + "required": [ + "n", + "count" + ], + "additionalProperties": true + }, + "minItems": 1 + }, + "aggregate": { + "type": "object", + "description": "Aggregate metadata about some list of numbers.", + "properties": { + "name": { + "$ref": "#/definitions/name" + }, + "description": { + "$ref": "#/definitions/description" + }, + "count": { + "type": "integer", + "description": "A count of values in the list." + }, + "min": { + "type": "number", + "description": "Minimum value of the list." + }, + "max": { + "type": "number", + "description": "Maximum value of the list." + }, + "mean": { + "type": "number", + "description": "Mean value of the list." + }, + "median": { + "type": "number", + "description": "Median value of the list." + }, + "std": { + "type": "number", + "description": "Unbiased standard deviation value of the list." + }, + "quartile_1": { + "type": "number", + "description": "The 25th percentile value of the list." + }, + "quartile_3": { + "type": "number", + "description": "The 75th percentile value of the list." + }, + "kurtosis": { + "type": "number", + "description": "The unbiased kurtosis of the distribution using Fisher’s definition of kurtosis (kurtosis of normal == 0.0). Normalized by N-1." + }, + "skewness": { + "type": "number", + "description": "The unbiased skew of the distribution." + } + }, + "additionalProperties": true + }, + "python_value": { + "description": "A Python value. Schema allows a value of any type, even not JSON-compatible." + }, + "python_type": { + "description": "A Python type.", + "format": "python-type" + }, + "supported_media_types": { + "allOf": [{"$ref": "#/definitions/media_types"}], + "description": "Which media types a primitive knows how to manipulate." 
+ }, + "timestamp": { + "type": "string", + "description": "A timestamp.", + "anyOf": [ + {"format": "date-time"}, + {"format": "date"} + ] + }, + "problem": { + "type": "object", + "properties": { + "task_keywords": { + "type": "array", + "description": "Keywords describing the task.", + "items": { + "oneOf": [ + {"enum": ["ANOMALY_DETECTION"]}, + {"enum": ["CLASSIFICATION"], "description": "https://en.wikipedia.org/wiki/Statistical_classification"}, + {"enum": ["REGRESSION"], "description": "https://en.wikipedia.org/wiki/Regression_analysis"}, + {"enum": ["CLUSTERING"], "description": "https://en.wikipedia.org/wiki/Cluster_analysis"}, + {"enum": ["LINK_PREDICTION"]}, + {"enum": ["VERTEX_NOMINATION"]}, + {"enum": ["VERTEX_CLASSIFICATION"]}, + {"enum": ["COMMUNITY_DETECTION"], "description": "https://en.wikipedia.org/wiki/Community_search"}, + {"enum": ["GRAPH_MATCHING"], "description": "https://en.wikipedia.org/wiki/Graph_matching"}, + {"enum": ["FORECASTING"]}, + {"enum": ["COLLABORATIVE_FILTERING"], "description": "https://en.wikipedia.org/wiki/Collaborative_filtering"}, + {"enum": ["OBJECT_DETECTION"], "description": "https://en.wikipedia.org/wiki/Object_detection"}, + {"enum": ["SEMISUPERVISED"]}, + {"enum": ["BINARY"]}, + {"enum": ["MULTICLASS"]}, + {"enum": ["MULTILABEL"]}, + {"enum": ["UNIVARIATE"]}, + {"enum": ["MULTIVARIATE"]}, + {"enum": ["OVERLAPPING"]}, + {"enum": ["NONOVERLAPPING"]}, + {"enum": ["TABULAR"]}, + {"enum": ["RELATIONAL"]}, + {"enum": ["NESTED"]}, + {"enum": ["IMAGE"]}, + {"enum": ["AUDIO"]}, + {"enum": ["VIDEO"]}, + {"enum": ["SPEECH"]}, + {"enum": ["TEXT"]}, + {"enum": ["GRAPH"]}, + {"enum": ["MULTIGRAPH"]}, + {"enum": ["TIME_SERIES"]}, + {"enum": ["GROUPED"]}, + {"enum": ["GEOSPATIAL"]}, + {"enum": ["REMOTE_SENSING"], "description": "https://en.wikipedia.org/wiki/Remote_sensing"}, + {"enum": ["LUPI"]}, + {"enum": ["MISSING_METADATA"]} + ] + }, + "minItems": 1 + }, + "performance_metrics": { + "type": "array", + "description": "For which performance metrics to optimize for?", + "items": { + "$ref": "#/definitions/performance_metric" + }, + "minItems": 1 + } + }, + "additionalProperties": true + }, + "problem_inputs": { + "type": "array", + "description": "A list describing input datasets for the problem and associated targets. This list should match the list of inputs to a solution pipeline, in order.", + "items": { + "type": "object", + "description": "A description of an input dataset.", + "properties": { + "dataset_id": { + "allOf": [{"$ref": "#/definitions/id"}], + "description": "An ID of a dataset associated with this input, among known or available datasets. Information which datasets precisely (version, digest, etc.) are inputs should be available elsewhere, e.g., in a pipeline run description, while this ID serves to map problem inputs to those datasets." + }, + "targets": { + "allOf": [{"$ref": "#/definitions/targets"}], + "description": "A list of targets used for this problem from this dataset." 
+ }, + "forecasting_horizon": { + "type": "object", + "description": "In time series forecasting, the problem description can contain additional information about the horizon of forecast.", + "properties": { + "resource_id": { + "$ref": "#/definitions/resource_id" + }, + "column_index": { + "$ref": "#/definitions/column_index" + }, + "column_name": { + "$ref": "#/definitions/column_name" + }, + "horizon_value": { + "type": "number", + "description": "The maximum number of time steps in future the predictions will need to be made, in units of \"time_granularity\" of the referenced column." + } + }, + "required": [ + "resource_id", + "column_index", + "column_name", + "horizon_value" + ], + "additionalProperties": true + }, + "privileged_data": { + "type": "array", + "description": "A list of privileged data columns related to unavailable attributes during testing. These columns do not have data available in the test split of a dataset.", + "items": { + "type": "object", + "properties": { + "privileged_data_index": { + "type": "integer", + "description": "An index of the privileged data column in this list of privileged data columns, 0-based." + }, + "resource_id": { + "$ref": "#/definitions/resource_id" + }, + "column_index": { + "$ref": "#/definitions/column_index" + }, + "column_name": { + "$ref": "#/definitions/column_name" + } + }, + "required": [ + "privileged_data_index", + "resource_id", + "column_index", + "column_name" + ], + "additionalProperties": true + }, + "minItems": 1 + } + }, + "required": [ + "dataset_id" + ], + "additionalProperties": true + }, + "minItems": 1 + }, + "data_augmentation": { + "type": "array", + "description": "Information about internal or external sources of data that can be used to address the challenge of data augmentation.", + "items": { + "type": "object", + "properties": { + "domain": { + "allOf": [{"$ref": "#/definitions/keywords"}], + "description": "The application domain(s) of the source (e.g., government, census, economics)." + }, + "keywords": { + "allOf": [{"$ref": "#/definitions/keywords"}], + "description": "Additional tags that help narrow the search (e.g., housing, household income)." + } + }, + "additionalProperties": true + }, + "minItems": 1 + }, + "resource_id": { + "type": "string" + }, + "column_index": { + "type": "integer", + "description": "An index of the column, 0-based." + }, + "column_name": { + "allOf": [{"$ref": "#/definitions/name"}], + "description": "A name of the column. There are no restrictions on the content, length, it can contain whitespace, and names do not even have to be unique." + }, + "data_reference": { + "type": "string", + "description": "Data reference is a string which identifies an output of a step or a pipeline input and forms a data-flow connection between data available and an input to a step.", + "examples": [ + "steps.0.produce", + "inputs.1" + ] + }, + "context": { + "description": "Context in which a pipeline was run.", + "oneOf": [ + {"enum": ["PRETRAINING"], "description": "Pipeline was run during building/training of the system itself, e.g., during metalearning."}, + {"enum": ["TESTING"], "description": "Pipeline was run during development or testing of the system itself, e.g., during debugging. 
This is also a default context."}, + {"enum": ["EVALUATION"], "description": "Pipeline was run during evaluation of the system itself, e.g., blind evaluation."}, + {"enum": ["PRODUCTION"], "description": "Pipeline was run during regular (production) operation of the system."} + ] + }, + "users": { + "type": "array", + "description": "A list of users associated with the value.", + "items": { + "type": "object", + "properties": { + "id": { + "allOf": [{"$ref": "#/definitions/id"}], + "description": "Globally unique ID for this user. It can be opaque, but it should identify the same user across sessions. Consider using UUID variant 5 with namespace set to the name of your system and name to an ID in your system's database." + }, + "reason": { + "allOf": [{"$ref": "#/definitions/description"}], + "description": "A natural language description of what the user did to be on the list, e.g., \"Picked a pipeline from a list of pipelines.\"." + }, + "rationale": { + "allOf": [{"$ref": "#/definitions/description"}], + "description": "A natural language description by the user of what the user did, e.g., \"I picked a pipeline because it looks short in comparison with others.\"." + } + }, + "required": [ + "id" + ], + "additionalProperties": true + }, + "minItems": 1 + }, + "container_argument": { + "type": "object", + "properties": { + "type": { + "type": "string", + "description": "A regular container type output from another step or pipeline's input.", + "enum": ["CONTAINER"] + }, + "data": { + "$ref": "#/definitions/data_reference" + } + }, + "required": [ + "type", + "data" + ], + "additionalProperties": true + }, + "container_arguments": { + "type": "object", + "properties": { + "type": { + "type": "string", + "description": "A list of regular container type outputs from another steps or pipeline's inputs.", + "enum": ["CONTAINER"] + }, + "data": { + "type": "array", + "items": { + "$ref": "#/definitions/data_reference" + }, + "minItems": 1 + } + }, + "required": [ + "type", + "data" + ], + "additionalProperties": true + }, + "data_argument": { + "type": "object", + "properties": { + "type": { + "type": "string", + "description": "A singleton output from another step in a pipeline. This means that container's sole element is passed as an argument to the primitive instead of the whole container value.", + "enum": ["DATA"] + }, + "data": { + "$ref": "#/definitions/data_reference" + } + }, + "required": [ + "type", + "data" + ], + "additionalProperties": true + }, + "data_arguments": { + "type": "object", + "properties": { + "type": { + "type": "string", + "description": "A list of singleton outputs from other steps in a pipeline.", + "enum": ["DATA"] + }, + "data": { + "type": "array", + "items": { + "$ref": "#/definitions/data_reference" + }, + "minItems": 1 + } + }, + "required": [ + "type", + "data" + ], + "additionalProperties": true + }, + "primitive_argument": { + "type": "object", + "properties": { + "type": { + "type": "string", + "description": "A primitive instance to be passed as a hyper-parameter. A primitive should be part of a pipeline and is identified by its step.", + "enum": ["PRIMITIVE"] + }, + "data": { + "type": "integer", + "description": "0-based index identifying a step of which primitive is used as a value." + } + }, + "required": [ + "type", + "data" + ], + "additionalProperties": true + }, + "primitive_arguments": { + "type": "object", + "properties": { + "type": { + "type": "string", + "description": "A list of primitive instances to be passed as a hyper-parameter. 
Primitives should be part of a pipeline and are identified by their step.", + "enum": ["PRIMITIVE"] + }, + "data": { + "type": "array", + "items": { + "type": "integer", + "description": "0-based index identifying a step of which primitive is used as a value." + }, + "minItems": 1 + } + }, + "required": [ + "type", + "data" + ], + "additionalProperties": true + }, + "value_argument": { + "type": "object", + "properties": { + "type": { + "type": "string", + "description": "A constant value of a hyper-parameter. Each hyper-parameter class knows how to convert its value to a JSON-compatible structure and back.", + "enum": ["VALUE"] + }, + "data": { + "description": "Hyper-parameter value as converted to a JSON-compatible structure by a hyper-parameter class." + } + }, + "required": [ + "type", + "data" + ], + "additionalProperties": true + }, + "arguments": { + "type": "object", + "description": "A mapping between primitive's arguments and their values. Primitive's arguments are passed in turn to primitive's methods which need them. Only those which are specified as kind \"PIPELINE\" in primitive's metadata can be specified here.", + "additionalProperties": false, + "patternProperties": { + "^[A-Za-z][A-Za-z_0-9]*$": { + "oneOf": [ + { + "$ref": "#/definitions/container_argument" + }, + { + "$ref": "#/definitions/container_arguments" + }, + { + "$ref": "#/definitions/data_argument" + } + ] + } + } + }, + "hyperparams": { + "type": "object", + "description": "A mapping between primitive's hyper-parameters and their values.", + "additionalProperties": false, + "patternProperties": { + "^[A-Za-z][A-Za-z_0-9]*([.][A-Za-z][A-Za-z_0-9]*)*$": { + "oneOf": [ + { + "$ref": "#/definitions/container_argument" + }, + { + "$ref": "#/definitions/data_argument" + }, + { + "$ref": "#/definitions/primitive_argument" + }, + { + "$ref": "#/definitions/value_argument" + }, + { + "$ref": "#/definitions/data_arguments" + }, + { + "$ref": "#/definitions/primitive_arguments" + } + ] + } + } + }, + "pipeline_inputs": { + "type": "array", + "description": "Inputs to a pipeline. The order of inputs matter. Inputs are referenced by steps using a data reference.", + "items": { + "type": "object", + "properties": { + "name": { + "$ref": "#/definitions/name" + } + }, + "additionalProperties": true + } + }, + "pipeline_outputs": { + "type": "array", + "description": "Outputs from a pipeline. The order of outputs matter. Each output references an output of a step and in this way makes that step output a pipeline output as well.", + "items": { + "type": "object", + "properties": { + "name": { + "$ref": "#/definitions/name" + }, + "data": { + "$ref": "#/definitions/data_reference" + } + }, + "required": [ + "data" + ], + "additionalProperties": true + } + }, + "pipeline_steps": { + "type": "array", + "description": "Steps defining pipeline's logic.", + "items": { + "type": "object", + "oneOf": [ + { + "properties": { + "type": { + "type": "string", + "description": "A step which runs a primitive.", + "enum": ["PRIMITIVE"] + }, + "primitive": { + "$ref": "#/definitions/primitive_reference" + }, + "arguments": { + "allOf": [{"$ref": "#/definitions/arguments"}], + "description": "Arguments to a primitive as a whole. Not all arguments defined by a primitive have to be specified here. Furthermore, only those which are specified as kind \"PIPELINE\" in primitive's metadata can be specified. Constructor arguments should not be specified here, because they can be automatically created from other information." 
+ }, + "outputs": { + "type": "array", + "description": "A list of produce method names of this primitive which are outputs of this step.", + "items": { + "type": "object", + "properties": { + "id": { + "type": "string", + "description": "The name of the primitive's produce method which returns output data available by this primitive step." + } + }, + "required": [ + "id" + ], + "additionalProperties": true + }, + "minItems": 1 + }, + "hyperparams": { + "allOf": [{"$ref": "#/definitions/hyperparams"}], + "description": "Only those hyper-parameters which should be fixed as part of the pipeline should be specified here, e.g., control hyper-parameters. Any hyper-parameter specified here should not be further modified (e.g., tuned). Author of a pipeline decides which hyper-parameters are which, probably based on their semantic type." + }, + "users": { + "$ref": "#/definitions/users" + } + }, + "required": [ + "type", + "primitive" + ] + }, + { + "properties": { + "type": { + "type": "string", + "description": "A step which runs another pipeline.", + "enum": ["SUBPIPELINE"] + }, + "pipeline": { + "allOf": [{"$ref": "#/definitions/pipeline_or_pipeline_reference"}], + "description": "A pipeline to run at this step, of pipelines known to the system." + }, + "inputs": { + "type": "array", + "description": "Mapping between data references available in the context of the outer pipeline to inputs of sub-pipeline, in order.", + "items": { + "type": "object", + "properties": { + "data": { + "allOf": [{"$ref": "#/definitions/data_reference"}], + "description": "Data reference, probably of an output of a step or outer pipeline input, mapped to sub-pipeline's inputs in order." + } + }, + "required": [ + "data" + ], + "additionalProperties": true + }, + "minItems": 1 + }, + "outputs": { + "type": "array", + "description": "Mapping between outputs of a sub-pipeline to names under which they should be exposed as outputs of this step, in order. For example: [{\"id\": \"predictions\"}] would map the first output of a sub-pipeline to a data reference \"steps.X.predictions\" where \"X\" is the step number of a given sub-pipeline step.", + "items": { + "type": "object", + "properties": { + "id": { + "type": "string", + "description": "The name used in constructing the step's output data reference. If not provided, this output is skipped." + } + }, + "required": [ + "id" + ], + "additionalProperties": true + }, + "minItems": 1 + } + }, + "required": [ + "type", + "pipeline", + "inputs", + "outputs" + ] + }, + { + "properties": { + "type": { + "type": "string", + "description": "This step is used to represent a pipeline template which can be used to generate full pipelines. Not to be used in the metalearning context. Additional properties to further specify the placeholder constraints are allowed.", + "enum": ["PLACEHOLDER"] + }, + "inputs": { + "type": "array", + "description": "Mapping between data references available in the context of the outer pipeline which can be used as inputs to resulting sub-pipeline, in order. Resulting sub-pipeline does not have to use all the inputs, but it cannot use any other inputs.", + "items": { + "type": "object", + "properties": { + "data": { + "allOf": [{"$ref": "#/definitions/data_reference"}], + "description": "Data reference, probably of an output of a step or outer pipeline input, mapped to resulting sub-pipeline's inputs in order." 
+ } + }, + "required": [ + "data" + ], + "additionalProperties": true + }, + "minItems": 1 + }, + "outputs": { + "type": "array", + "description": "Mapping between outputs of a resulting sub-pipeline to names under which they should be exposed as outputs of this step, in order. For example: [{\"id\": \"predictions\"}] would map the first output of a resulting sub-pipeline to a data reference \"steps.X.predictions\" where \"X\" is the step number of a given placeholder step.", + "items": { + "type": "object", + "properties": { + "id": { + "type": "string", + "description": "The name used in constructing the step's output data reference." + } + }, + "required": [ + "id" + ], + "additionalProperties": true + }, + "minItems": 1 + } + }, + "required": [ + "type", + "inputs", + "outputs" + ] + } + ], + "additionalProperties": true + }, + "minItems": 1 + }, + "model_features": { + "type": "array", + "description": "A set of features supported by an underlying model of a primitive.", + "items": { + "enum": [ + "BINARY", + "MULTICLASS", + "MULTILABEL", + "UNIVARIATE", + "MULTIVARIATE", + "OVERLAPPING", + "NONOVERLAPPING" + ] + }, + "minItems": 1 + }, + "primitive_reference": { + "type": "object", + "properties": { + "id": { + "$ref": "#/definitions/id" + }, + "version": { + "$ref": "#/definitions/version" + }, + "python_path": { + "$ref": "#/definitions/python_path" + }, + "name": { + "$ref": "#/definitions/name" + }, + "digest": { + "$ref": "#/definitions/digest" + } + }, + "required": [ + "id", + "version", + "python_path", + "name" + ], + "additionalProperties": true + }, + "file_columns": { + "type": "array", + "description": "When the value is referencing a file with columns (e.g., a CSV file), columns metadata might be known in advance.", + "items": { + "type": "object", + "properties": { + "column_index": { + "$ref": "#/definitions/column_index" + }, + "column_name": { + "$ref": "#/definitions/column_name" + }, + "description": { + "$ref": "#/definitions/description" + }, + "semantic_types": { + "$ref": "#/definitions/semantic_types" + }, + "foreign_key": { + "$ref": "#/definitions/foreign_key" + }, + "boundary_for" : { + "$ref": "#/definitions/boundary_for" + }, + "time_granularity" : { + "$ref": "#/definitions/time_granularity" + } + }, + "required": [ + "column_index", + "column_name" + ], + "additionalProperties": true + }, + "minItems": 1 + }, + "file_columns_count": { + "type": "integer", + "description": "When the value is referencing a file with columns (e.g., a CSV file), number of columns might be known in advance." + }, + "document_reference": { + "description": "A reference to another document.", + "type": "object", + "properties": { + "id": { + "$ref": "#/definitions/id" + }, + "digest": { + "$ref": "#/definitions/digest" + } + }, + "required": [ + "id" + ], + "additionalProperties": true + }, + "pipeline_run_reference": { + "description": "A reference to a pipeline run.", + "type": "object", + "properties": { + "id": { + "$ref": "#/definitions/hash_id" + } + }, + "required": [ + "id" + ], + "additionalProperties": true + }, + "problem_reference": { + "allOf": [{"$ref": "#/definitions/document_reference"}], + "description": "A reference to a problem." + }, + "dataset_reference": { + "allOf": [{"$ref": "#/definitions/document_reference"}], + "description": "A reference to a dataset." + }, + "pipeline_reference": { + "allOf": [{"$ref": "#/definitions/document_reference"}], + "description": "A reference to a pipeline." 
+ }, + "problem_or_problem_reference": { + "anyOf": [ + { + "$ref": "#/definitions/problem_reference" + }, + { + "$ref": "https://metadata.datadrivendiscovery.org/schemas/v0/problem.json" + } + ] + }, + "dataset": { + "allOf": [{"$ref": "https://metadata.datadrivendiscovery.org/schemas/v0/container.json"}], + "description": "A dataset." + }, + "dataset_or_dataset_reference": { + "anyOf": [ + { + "$ref": "#/definitions/dataset_reference" + }, + { + "$ref": "#/definitions/dataset" + } + ] + }, + "pipeline_or_pipeline_reference": { + "anyOf": [ + { + "$ref": "#/definitions/pipeline_reference" + }, + { + "$ref": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json" + } + ] + }, + "datasets": { + "description": "A list of input datasets. The order matters because it is mapped to pipeline inputs.", + "type": "array", + "minItems": 1, + "items": { + "$ref": "#/definitions/dataset_or_dataset_reference" + } + }, + "status": { + "description": "Indicates whether a pipeline, or some portion of it, ran successfully. May include a message with more details about the status.", + "type": "object", + "properties": { + "state": { + "type": "string", + "enum": ["SUCCESS", "FAILURE"] + }, + "message": { + "description": "Further information describing the status. Though not required, this is especially helpful in a FAILURE state. It can be or include a stacktrace.", + "type": "string" + } + }, + "required": [ + "state" + ], + "additionalProperties": true + }, + "logging": { + "description": "Python LogRecord entries recorded during a method call. See https://docs.python.org/3/library/logging.html#logging.LogRecord for more information.", + "type": "array", + "minItems": 0, + "items": { + "description": "A Python LogRecord entry. Other custom fields are allowed (Python LogRecord can be extended with custom fields).", + "type": "object", + "properties": { + "name": { + "description": "The name of the logger used to log the event represented by this LogRecord. 
Note that this name will always have this value, even though it may be emitted by a handler attached to a different (ancestor) logger.", + "type": "string" + }, + "msg": { + "description": "The non-interpolated event description message.", + "type": "string" + }, + "args": { + "description": "Arguments for message interpolation, when JSON-serializable.", + "type": ["object", "array"] + }, + "levelname": { + "description": "Level at which the logging call was made.", + "type": "string" + }, + "levelno": { + "description": "Level at which the logging call was made.", + "type": "integer" + }, + "pathname": { + "description": "The full pathname of the source file where the logging call was made.", + "type": "string" + }, + "filename": { + "description": "Just the filename of the source file where the logging call was made.", + "type": "string" + }, + "module": { + "description": "Python module name where the logging call was made.", + "type": "string" + }, + "exc_text": { + "description": "Python exception and formatted stack trace as text.", + "type": "string" + }, + "exc_type": { + "description": "Python exception type name.", + "type": "string" + }, + "stack_info": { + "description": "Formatted stack trace as text.", + "type": "string" + }, + "lineno": { + "description": "The line number in the source file where the logging call was made.", + "type": "integer" + }, + "funcName": { + "description": "The name of the function or method from which the logging call was made.", + "type": "string" + }, + "created": { + "type": "number" + }, + "msecs": { + "type": "number" + }, + "relativeCreated": { + "type": "number" + }, + "thread": { + "type": "integer" + }, + "threadName": { + "type": "string" + }, + "processName": { + "type": "string" + }, + "process": { + "type": "integer" + }, + "message": { + "description": "The interpolated event description message.", + "type": "string" + }, + "asctime": { + "type": "string" + } + }, + "required": [ + "name", + "msg", + "levelname", + "levelno", + "pathname", + "filename", + "module", + "lineno", + "funcName", + "created", + "msecs", + "relativeCreated", + "message", + "asctime" + ], + "additionalProperties": true + } + }, + "method_call": { + "description": "Information about a method called on the primitive.", + "oneOf": [ + { + "allOf": [ + { + "description": "Any method call except the constructor.", + "type": "object", + "properties": { + "name": { + "type": "string", + "description": "Name of the Python method called.", + "not": { + "enum": ["__init__"] + } + }, + "arguments": { + "$ref": "#/definitions/arguments", + "description": "Pipeline arguments to methods are provided in a standard way, but methods can have additional runtime arguments or arguments overriding hyper-parameters for a call. Those are the values have to be explicitly provided here." 
+ } + }, + "required": [ + "name" + ], + "additionalProperties": true + }, + { + "$ref": "#/definitions/method_call_base" + } + ] + }, + { + "allOf": [ + { + "description": "A constructor method call.", + "type": "object", + "properties": { + "name": { + "description": "Name of the Python method called.", + "type": "string", + "enum": ["__init__"] + } + }, + "required": [ + "name" + ], + "not": { + "description": "Arguments to constructor should not be provided, because they are provided by the runtime and are runtime specific (paths to volumes, etc.).", + "required": [ + "arguments" + ] + }, + "additionalProperties": true + }, + { + "$ref": "#/definitions/method_call_base" + } + ] + } + ] + }, + "method_call_base": { + "description": "General information about a single method call, common to all method calls.", + "type": "object", + "properties": { + "logging": { + "$ref": "#/definitions/logging" + }, + "metadata": { + "description": "If the method call returns a container type, we store its metadata.", + "anyOf": [ + { + "type": "object", + "description": "For \"CallResult\", we store metadata under \"value\" key.", + "properties": { + "value": { + "$ref": "#/definitions/metadata_values" + } + } + }, + { + "type": "object", + "description": "For \"MultiCallResult\", keys should match \"values\" names, which are primitive's produce method names", + "additionalProperties": false, + "patternProperties": { + "^produce[A-Za-z_0-9]*$": { + "$ref": "#/definitions/metadata_values" + } + } + } + ] + }, + "status": { + "$ref": "#/definitions/status" + }, + "start": { + "allOf": [{"$ref": "#/definitions/timestamp"}], + "description": "Absolute timestamp of the start of the method call." + }, + "end": { + "allOf": [{"$ref": "#/definitions/timestamp"}], + "description": "Absolute timestamp of the end of the method call." + }, + "calls": { + "description": "The number of additional times this method was called consecutively with the exactly same arguments in same runtime environment. When omitted, this method was called once. When used, the corresponding \"start\" timestamp is recorded before the first method call and the corresponding \"end\" timestamp is recorded after the final method call. This is an optimization allowing the combining of identical consecutive method calls into one record.", + "type": "integer", + "minimum": 1 + }, + "environment": { + "$ref": "#/definitions/runtime_environment" + } + }, + "required": [ + "status", + "start", + "end" + ], + "additionalProperties": true + }, + "metadata_values": { + "description": "This matches the output of \"Metadata.to_json_structure\" method.", + "type": "array", + "items": { + "type": "object", + "properties": { + "selector": { + "type": "array", + "items": { + "type": ["string", "integer"] + }, + "minItems": 0 + }, + "metadata": { + "description": "Metadata associated with the value at \"selector\".", + "type": "object" + } + } + }, + "minItems": 0 + }, + "runtime_environment": { + "description": "A description of the runtime environment, including engine versions, Docker images, compute resources, and benchmarks.", + "type": "object", + "properties": { + "id": { + "description": "A hash ID computed over the whole runtime environment document to allow for faster identification of same runtime environments.", + "allOf": [{"$ref": "#/definitions/hash_id"}] + }, + "worker_id": { + "description": "A globally unique identifier for the machine on which this pipeline run occurred. 
The idea is that the worker specifies the system inside which the pipeline is run so that multiple runs on the same system can be grouped together.", + "type": "string" + }, + "reference_engine_version": { + "description": "A version of the released d3m core package with the reference engine used to run this pipeline. Provide the version of the released d3m core package even if your engine is subclassing the reference engine. Alternatively, if you are not using a released d3m core package, provide a git commit hash of the d3m core package repository with the reference engine you used.", + "anyOf": [ + { + "$ref": "#/definitions/version" + }, + { + "$ref": "#/definitions/git_commit" + } + ] + }, + "engine_version": { + "description": "A version of your engine used to run this pipeline (or reference engine, if directly using it). This can be useful for the author of the pipeline run to record, but is less useful for others. For others, \"reference_engine_version\" is probably more useful.", + "anyOf": [ + { + "$ref": "#/definitions/version" + }, + { + "$ref": "#/definitions/git_commit" + } + ] + }, + "base_docker_image": { + "description": "If a pipeline is run inside a Docker container which is based on a public image or known base image, then this field should specify that Docker image. I.e., if your system is using a private Docker image but is extending a \"complete\" Docker image, then list the \"complete\" Docker image here.", + "allOf": [{"$ref": "#/definitions/docker_image"}] + }, + "docker_image": { + "description": "If a pipeline is run inside a Docker container, this field should specify the Docker image used to run this pipeline. This can be useful for the author of the pipeline run to record, but is less useful for others. For others, \"base_docker_image\" is probably more useful.", + "allOf": [{"$ref": "#/definitions/docker_image"}] + }, + "resources": { + "$ref": "#/definitions/compute_resources" + }, + "reference_benchmarks": { + "$ref": "#/definitions/reference_benchmarks" + } + }, + "required": [ + "id", + "worker_id" + ] + }, + "pipeline_run_steps": { + "description": "All of the steps invoked in the pipeline run. There is a one-to-one correspondence between this array and the steps in the pipeline.", + "type": "array", + "items": { + "type": "object", + "oneOf": [ + { + "properties": { + "type": { + "type": "string", + "description": "A primitive step.", + "enum": ["PRIMITIVE"] + }, + "hyperparams": { + "allOf": [{"$ref": "#/definitions/hyperparams"}], + "description": "Together with hyper-parameters listed as part of a pipeline they complete all values necessary to instantiate \"hyperparams\" constructor argument of the primitive. All hyper-parameter values have to be listed explicitly, even if the value matches the default value of a hyper-parameter." + }, + "random_seed": { + "description": "Random seed used, if the primitive accepts a random seed.", + "type": "integer" + }, + "method_calls": { + "description": "Information about the methods called on the primitive, in the order called.", + "type": "array", + "items": { + "$ref": "#/definitions/method_call" + }, + "minItems": 1 + }, + "status": { + "$ref": "#/definitions/status" + }, + "start": { + "allOf": [{"$ref": "#/definitions/timestamp"}], + "description": "Absolute timestamp of the start of the execution of the primitive. 
Execution of the primitive starts with the first method call but it can also include any preparation work not captured by method calls, so timestamp can be sooner than the first method call timestamp." + }, + "end": { + "allOf": [{"$ref": "#/definitions/timestamp"}], + "description": "Absolute timestamp of the end of the execution of the primitive. Execution of the primitive ends with the last method call but it can also include any cleanup work not captured by method calls, so timestamp can be later than the last method call timestamp." + }, + "environment": { + "allOf": [{"$ref": "#/definitions/runtime_environment"}], + "description": "Provided if this step was run in a different runtime environment than the runtime environment specified at a higher level." + } + }, + "$comment": "TODO: Make \"start\" and \"end\" required when the next version of this schema is released.", + "required": [ + "type", + "status" + ], + "not": { + "required": [ + "steps" + ] + } + }, + { + "properties": { + "type": { + "type": "string", + "description": "A sub-pipeline step.", + "enum": ["SUBPIPELINE"] + }, + "steps": { + "allOf": [{"$ref": "#/definitions/pipeline_run_steps"}], + "description": "Steps of a sub-pipeline, recursively." + }, + "status": { + "$ref": "#/definitions/status" + }, + "start": { + "allOf": [{"$ref": "#/definitions/timestamp"}], + "description": "Absolute timestamp of the start of the execution of the sub-pipeline. Execution of the sub-pipeline starts with the execution of the first primitive but it can also include any preparation work not captured by primitive, so timestamp can be sooner than the first primitive timestamp." + }, + "end": { + "allOf": [{"$ref": "#/definitions/timestamp"}], + "description": "Absolute timestamp of the end of the execution of the sub-pipeline. Execution of the sub-pipeline ends with the execution of the last primitive but it can also include any cleanup work not captured by primitive, so timestamp can be later than the last primitive timestamp." + }, + "environment": { + "allOf": [{"$ref": "#/definitions/runtime_environment"}], + "description": "Provided if this step was run in a different runtime environment than the runtime environment specified at a higher level." 
+ }, + "random_seed": { + "$ref": "#/definitions/pipeline_random_seed" + } + }, + "$comment": "TODO: Make \"start\", \"end\", and \"random_seed\" required when the next version of this schema is released.", + "required": [ + "type", + "status" + ], + "not": { + "required": [ + "hyperparams", + "random_seed", + "method_calls" + ] + } + } + ], + "additionalProperties": true + }, + "minItems": 1 + }, + "performance_metric": { + "type": "object", + "properties": { + "metric": { + "type": "string" + }, + "params": { + "type": "object" + } + }, + "required": [ + "metric" + ], + "oneOf": [ + { + "properties": { + "metric": {"enum": ["ACCURACY"]} + } + }, + { + "properties": { + "metric": {"enum": ["PRECISION"]}, + "params": { + "type": "object", + "properties": { + "pos_label": { + "type": "string" + } + }, + "additionalProperties": true + } + } + }, + { + "properties": { + "metric": {"enum": ["RECALL"]}, + "params": { + "type": "object", + "properties": { + "pos_label": { + "type": "string" + } + }, + "additionalProperties": true + } + } + }, + { + "properties": { + "metric": {"enum": ["F1"]}, + "params": { + "type": "object", + "properties": { + "pos_label": { + "type": "string" + } + }, + "additionalProperties": true + } + } + }, + { + "properties": { + "metric": {"enum": ["F1_MICRO"]} + } + }, + { + "properties": { + "metric": {"enum": ["F1_MACRO"]} + } + }, + { + "properties": { + "metric": {"enum": ["ROC_AUC"]} + } + }, + { + "properties": { + "metric": {"enum": ["ROC_AUC_MICRO"]} + } + }, + { + "properties": { + "metric": {"enum": ["ROC_AUC_MACRO"]} + } + }, + { + "properties": { + "metric": {"enum": ["MEAN_SQUARED_ERROR"]} + } + }, + { + "properties": { + "metric": {"enum": ["ROOT_MEAN_SQUARED_ERROR"]} + } + }, + { + "properties": { + "metric": {"enum": ["MEAN_ABSOLUTE_ERROR"]} + } + }, + { + "properties": { + "metric": {"enum": ["R_SQUARED"]} + } + }, + { + "properties": { + "metric": {"enum": ["NORMALIZED_MUTUAL_INFORMATION"]} + } + }, + { + "properties": { + "metric": {"enum": ["JACCARD_SIMILARITY_SCORE"]}, + "params": { + "type": "object", + "properties": { + "pos_label": { + "type": "string" + } + }, + "additionalProperties": true + } + } + }, + { + "properties": { + "metric": {"enum": ["PRECISION_AT_TOP_K"]}, + "params": { + "type": "object", + "properties": { + "k": { + "type": "integer" + } + }, + "additionalProperties": true + } + } + }, + { + "properties": { + "metric": {"enum": ["OBJECT_DETECTION_AVERAGE_PRECISION"]} + } + }, + { + "properties": { + "metric": {"enum": ["HAMMING_LOSS"]} + } + }, + { + "properties": { + "metric": {"enum": ["HITS_AT_K"]}, + "params": { + "type": "object", + "properties": { + "k": { + "type": "integer" + } + }, + "additionalProperties": true + } + } + }, + { + "properties": { + "metric": {"enum": ["MEAN_RECIPROCAL_RANK"]} + } + } + ], + "additionalProperties": true + }, + "targets": { + "type": "array", + "items": { + "type": "object", + "properties": { + "target_index": { + "type": "integer", + "description": "An index of the target in this list of targets, 0-based." + }, + "resource_id": { + "$ref": "#/definitions/resource_id" + }, + "column_index": { + "$ref": "#/definitions/column_index" + }, + "column_name": { + "$ref": "#/definitions/column_name" + }, + "clusters_number": { + "type": "integer", + "description": "The number of clusters to be generated by the solution algorithm (if this information is known apriori)." 
+ } + }, + "required": [ + "target_index", + "resource_id", + "column_index", + "column_name" + ], + "additionalProperties": true + }, + "minItems": 1 + }, + "scores": { + "description": "Scores should match the output of the scoring pipeline.", + "type": "array", + "items": { + "type": "object", + "properties": { + "metric": { + "description": "Description of a metric used. Generally it should match one from the problem description, but it can also be different.", + "anyOf": [ + { + "$ref": "#/definitions/performance_metric" + }, + { + "type": "object", + "properties": { + "metric": { + "description": "A custom metric name. Any custom metric name should match the metric name in the scoring pipeline output.", + "type": "string" + } + }, + "required": [ + "metric" + ], + "additionalProperties": true + } + ] + }, + "value": { + "description": "The value of the scoring metric.", + "type": "number" + }, + "normalized": { + "description": "The normalized value of the scoring metric. Value is from the [0, 1] interval, where higher is better.", + "type": "number" + } + }, + "required": [ + "metric", + "value" + ], + "additionalProperties": true + }, + "minItems": 1 + }, + "predictions": { + "description": "The predictions table generated from the pipeline, including the index column. This follows the MIT Lincoln Labs predictions format. There is a one-to-one correspondence between the header array and the values array.", + "type": "object", + "properties": { + "header": { + "description": "A list of predictions table's column names.", + "type": "array", + "minItems": 1, + "items": { + "$ref": "#/definitions/column_name" + } + }, + "values": { + "description": "An array of predictions. Every element of this array is a column of values corresponding to one header element.", + "type": "array", + "minItems": 1, + "items": { + "description": "A single column of values.", + "type": "array", + "minItems": 1, + "items": { + "description": "A single prediction value." 
+ } + } + } + }, + "required": [ + "header", + "values" + ], + "additionalProperties": true + }, + "pipeline_run_results": { + "description": "The predictions of the pipeline and corresponding metric scores.", + "type": "object", + "properties": { + "scores": { + "$ref": "#/definitions/scores" + }, + "predictions": { + "$ref": "#/definitions/predictions" + } + }, + "additionalProperties": true + }, + "additional_pipeline": { + "description": "An auxiliary pipeline used for preparing data or scoring.", + "type": "object", + "properties": { + "pipeline": { + "$ref": "#/definitions/pipeline_or_pipeline_reference" + }, + "steps": { + "$ref": "#/definitions/pipeline_run_steps" + }, + "status": { + "$ref": "#/definitions/status" + }, + "start": { + "$ref": "#/definitions/pipeline_run_start" + }, + "end": { + "$ref": "#/definitions/pipeline_run_end" + }, + "random_seed": { + "$ref": "#/definitions/pipeline_random_seed" + }, + "environment": { + "$ref": "#/definitions/runtime_environment" + } + }, + "$comment": "TODO: Make \"start\", \"end\", and \"random_seed\" required when the next version of this schema is released.", + "required": [ + "pipeline", + "status" + ], + "additionalProperties": true + }, + "pipeline_run": { + "description": "How a pipeline was run and corresponding results.", + "type": "object", + "properties": { + "phase": { + "description": "A string representing the phase with which this pipeline run is associated.", + "anyOf": [ + { + "type": "string", + "enum": [ + "FIT", + "PRODUCE" + ] + }, + { + "type": "string", + "description": "Some other string representing the phase, for non-standard phases." + } + ] + }, + "is_standard_pipeline": { + "description": "Has been this pipeline run as a standard pipeline or not?", + "type": "boolean" + }, + "fold_group": { + "description": "Groups pipeline runs which belong together. E.g., they are part of the same cross-validation evaluation run.", + "type": "object", + "properties": { + "id": { + "$ref": "#/definitions/id" + }, + "fold": { + "description": "The cross-validation fold index. 
If not part of the cross-validation, this can be set to 0.", + "type": "integer", + "minimum": 0 + } + }, + "required": [ + "id", + "fold" + ], + "additionalProperties": true + }, + "data_preparation": { + "$ref": "#/definitions/additional_pipeline" + }, + "scoring": { + "allOf": [ + { + "$ref": "#/definitions/additional_pipeline" + }, + { + "properties": { + "datasets": { + "$ref": "#/definitions/datasets" + } + } + } + ] + }, + "results": { + "$ref": "#/definitions/pipeline_run_results" + } + }, + "oneOf": [ + { + "allOf": [ + { + "not": { + "required": [ + "data_preparation" + ] + } + }, + { + "not": { + "required": [ + "scoring" + ] + }, + "properties": { + "results": { + "not": { + "required": [ + "scores" + ] + } + } + } + } + ] + }, + { + "required": [ + "data_preparation" + ], + "not": { + "required": [ + "scoring" + ] + }, + "properties": { + "results": { + "not": { + "required": [ + "scores" + ] + } + } + } + }, + { + "properties": { + "scoring": { + "not": { + "required": [ + "datasets" + ] + } + } + }, + "required": [ + "data_preparation", + "scoring" + ] + }, + { + "properties": { + "scoring": { + "required": [ + "datasets" + ] + } + }, + "required": [ + "scoring" + ], + "not": { + "required": [ + "data_preparation" + ] + } + } + ], + "$comment": "TODO: Make \"is_standard_pipeline\" required when the next version of this schema is released.", + "required": [ + "phase" + ], + "additionalProperties": true + }, + "previous_pipeline_run": { + "allOf": [{"$ref": "#/definitions/pipeline_run_reference"}], + "description": "References a pipeline run that occurred immediately before this pipeline run. Used for reproducibility, for example a test run would reference the train run. If it is not provided, it indicates the first pipeline run." + }, + "compute_resources": { + "description": "Compute resources available.", + "type": "object", + "properties": { + "cpu": { + "description": "CPU devices on the worker. If possible, only those available ot the pipeline runtime, otherwise all.", + "type": "object", + "properties": { + "devices": { + "description": "An array of CPU devices.", + "type": "array", + "minItems": 1, + "items": { + "type": "object", + "properties": { + "name": { + "description": "A physical CPU device name.", + "type": "string" + } + }, + "additionalProperties": true + } + }, + "physical_present": { + "description": "The number of physical CPU cores present on the worker, but not necessary fully available the pipeline runtime.", + "type": "integer", + "minimum": 1 + }, + "logical_present": { + "description": "The number of logical CPU cores present on the worker, but not necessary fully available the pipeline runtime.", + "type": "integer", + "minimum": 1 + }, + "configured_available": { + "description": "The amount of CPU resources available to the pipeline runtime in Kubernetes CPU units or equivalent. See https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/#meaning-of-cpu for more information.", + "type": "string" + }, + "constraints": { + "description": "Any constraints as found in the cgroups (e.g., inside of a resource limited Docker container).", + "type": "object" + } + }, + "additionalProperties": true + }, + "gpu": { + "description": "GPU devices on the worker. 
If possible, only those available ot the pipeline runtime, otherwise all.", + "type": "object", + "properties": { + "devices": { + "description": "An array of GPU devices.", + "type": "array", + "minItems": 1, + "items": { + "type": "object", + "properties": { + "name": { + "description": "A GPU device name.", + "type": "string" + }, + "memory": { + "description": "The total GPU memory on this device, in bytes.", + "type": "integer", + "minimum": 1 + } + }, + "additionalProperties": true + } + }, + "total_memory": { + "description": "The total GPU memory over all devices, in bytes, but not necessary fully available the pipeline runtime.", + "type": "integer", + "minimum": 1 + }, + "configured_memory": { + "description": "The amount of GPU memory available to the pipeline runtime in Kubernetes memory units or equivalent. See https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/#meaning-of-memory for more information.", + "type": "string" + }, + "constraints": { + "description": "Any constraints as found in the cgroups (e.g., inside of a resource limited Docker container).", + "type": "object" + } + }, + "additionalProperties": true + }, + "memory": { + "description": "Memory devices on the worker. If possible, only those available ot the pipeline runtime, otherwise all.", + "type": "object", + "properties": { + "devices": { + "description": "An array of memory devices.", + "type": "array", + "minItems": 1, + "items": { + "type": "object", + "properties": { + "name": { + "description": "A physical memory device name.", + "type": "string" + }, + "memory": { + "description": "The amount of memory on this device, in bytes.", + "type": "integer", + "minimum": 1 + } + }, + "additionalProperties": true + } + }, + "total_memory": { + "description": "The total memory over all memory devices, in bytes, but not necessary fully available the pipeline runtime.", + "type": "integer", + "minimum": 1 + }, + "configured_memory": { + "description": "The amount of memory available to the pipeline runtime in Kubernetes memory units or equivalent. See https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/#meaning-of-memory for more information.", + "type": "string" + }, + "constraints": { + "description": "Any constraints as found in the cgroups (e.g., inside of a resource limited Docker container).", + "type": "object" + } + }, + "additionalProperties": true + } + }, + "additionalProperties": true + }, + "reference_benchmarks": { + "description": "Reference benchmarks are pipeline runs of standard and optional additional benchmark pipelines which should be run on the worker during same or equivalent session so that this pipeline run can be expected to have the same timing characteristics. 
If it is known that worker configuration has not been changed between sessions, benchmark pipeline runs can be reused.", + "type": "array", + "minItems": 1, + "items": { + "$ref": "#/definitions/pipeline_run_reference" + } + }, + "git_commit": { + "description": "A reference to a particular git commit hash.", + "type": "string", + "pattern": "^[a-fA-F0-9]{40}$" + }, + "pipeline_run_start": { + "description": "Absolute timestamp of the start of the run of the pipeline.", + "allOf": [{"$ref": "#/definitions/timestamp"}] + }, + "pipeline_run_end": { + "description": "Absolute timestamp of the end of the run of the pipeline.", + "allOf": [{"$ref": "#/definitions/timestamp"}] + }, + "pipeline_random_seed": { + "type": "integer", + "description": "The main random seed used to run the pipeline." + }, + "pure_primitive": { + "type": "boolean", + "description": "Does a primitive behave as a pure function. Are produced values always the same for same hyper-parameter values, arguments, random seed, and method calls made, including the order of method calls? Are there no side effects (mutations of state outside of primitive's internal state) when running the primitive? If primitive is connecting to Internet or some other resource not controlled by the runtime, then primitive is not pure. If primitive caches files during execution, then primitive is pure despite modifying more than primitive's internal state, given that caching is implemented so that it does not leak information between different runs of a primitive.", + "default": true + }, + "can_use_gpus": { + "type": "boolean", + "description": "Can a primitive use GPUs if available? Caller should control available GPUs to the primitive through \"CUDA_VISIBLE_DEVICES\" environment variable.", + "default": true + }, + "all_distinct_values": { + "description": "All possible distinct non-missing values in a categorical attribute.", + "type": "array", + "minItems": 1 + } + } +} diff --git a/d3m/d3m/metadata/schemas/v0/pipeline.json b/d3m/d3m/metadata/schemas/v0/pipeline.json new file mode 100644 index 0000000..8b5ffe6 --- /dev/null +++ b/d3m/d3m/metadata/schemas/v0/pipeline.json @@ -0,0 +1,56 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema", + "id": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", + "title": "Pipeline description", + "description": "Schema for a description of a pipeline.", + "type": "object", + "properties": { + "schema": { + "$ref": "https://metadata.datadrivendiscovery.org/schemas/v0/definitions.json#/definitions/schema" + }, + "id": { + "$ref": "https://metadata.datadrivendiscovery.org/schemas/v0/definitions.json#/definitions/id" + }, + "digest": { + "$ref": "https://metadata.datadrivendiscovery.org/schemas/v0/definitions.json#/definitions/digest" + }, + "source": { + "$ref": "https://metadata.datadrivendiscovery.org/schemas/v0/definitions.json#/definitions/source" + }, + "created": { + "allOf": [{"$ref": "https://metadata.datadrivendiscovery.org/schemas/v0/definitions.json#/definitions/timestamp"}], + "description": "A timestamp when was the pipeline was created." 
+ }, + "name": { + "$ref": "https://metadata.datadrivendiscovery.org/schemas/v0/definitions.json#/definitions/name" + }, + "other_names": { + "$ref": "https://metadata.datadrivendiscovery.org/schemas/v0/definitions.json#/definitions/other_names" + }, + "description": { + "$ref": "https://metadata.datadrivendiscovery.org/schemas/v0/definitions.json#/definitions/description" + }, + "users": { + "allOf": [{"$ref": "https://metadata.datadrivendiscovery.org/schemas/v0/definitions.json#/definitions/users"}], + "description": "A list of users who are associated with the creation of this pipeline." + }, + "inputs": { + "$ref": "https://metadata.datadrivendiscovery.org/schemas/v0/definitions.json#/definitions/pipeline_inputs" + }, + "outputs": { + "$ref": "https://metadata.datadrivendiscovery.org/schemas/v0/definitions.json#/definitions/pipeline_outputs" + }, + "steps": { + "$ref": "https://metadata.datadrivendiscovery.org/schemas/v0/definitions.json#/definitions/pipeline_steps" + } + }, + "required": [ + "id", + "schema", + "created", + "inputs", + "outputs", + "steps" + ], + "additionalProperties": true +} diff --git a/d3m/d3m/metadata/schemas/v0/pipeline_run.json b/d3m/d3m/metadata/schemas/v0/pipeline_run.json new file mode 100644 index 0000000..135c3d5 --- /dev/null +++ b/d3m/d3m/metadata/schemas/v0/pipeline_run.json @@ -0,0 +1,66 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema", + "id": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline_run.json", + "title": "Pipeline run description", + "description": "Schema for a description of one run of a pipeline. Pipeline outputs and scores are recorded. It includes references to input dataset(s), a problem, and a pipeline.", + "type": "object", + "properties": { + "schema": { + "$ref": "https://metadata.datadrivendiscovery.org/schemas/v0/definitions.json#/definitions/schema" + }, + "id": { + "$ref": "https://metadata.datadrivendiscovery.org/schemas/v0/definitions.json#/definitions/hash_id" + }, + "problem": { + "$ref": "https://metadata.datadrivendiscovery.org/schemas/v0/definitions.json#/definitions/problem_or_problem_reference" + }, + "datasets": { + "$ref": "https://metadata.datadrivendiscovery.org/schemas/v0/definitions.json#/definitions/datasets" + }, + "pipeline": { + "$ref": "https://metadata.datadrivendiscovery.org/schemas/v0/definitions.json#/definitions/pipeline_or_pipeline_reference" + }, + "steps": { + "$ref": "https://metadata.datadrivendiscovery.org/schemas/v0/definitions.json#/definitions/pipeline_run_steps" + }, + "status": { + "$ref": "https://metadata.datadrivendiscovery.org/schemas/v0/definitions.json#/definitions/status" + }, + "start": { + "$ref": "https://metadata.datadrivendiscovery.org/schemas/v0/definitions.json#/definitions/pipeline_run_start" + }, + "end": { + "$ref": "https://metadata.datadrivendiscovery.org/schemas/v0/definitions.json#/definitions/pipeline_run_end" + }, + "run": { + "$ref": "https://metadata.datadrivendiscovery.org/schemas/v0/definitions.json#/definitions/pipeline_run" + }, + "context": { + "$ref": "https://metadata.datadrivendiscovery.org/schemas/v0/definitions.json#/definitions/context" + }, + "previous_pipeline_run": { + "$ref": "https://metadata.datadrivendiscovery.org/schemas/v0/definitions.json#/definitions/previous_pipeline_run" + }, + "users": { + "$ref": "https://metadata.datadrivendiscovery.org/schemas/v0/definitions.json#/definitions/users" + }, + "environment": { + "$ref": 
"https://metadata.datadrivendiscovery.org/schemas/v0/definitions.json#/definitions/runtime_environment" + }, + "random_seed": { + "$ref": "https://metadata.datadrivendiscovery.org/schemas/v0/definitions.json#/definitions/pipeline_random_seed" + } + }, + "$comment": "TODO: Make \"start\", \"end\", and \"random_seed\" required when the next version of this schema is released.", + "required": [ + "schema", + "id", + "datasets", + "pipeline", + "status", + "run", + "context", + "environment" + ], + "additionalProperties": true +} diff --git a/d3m/d3m/metadata/schemas/v0/primitive.json b/d3m/d3m/metadata/schemas/v0/primitive.json new file mode 100644 index 0000000..5003720 --- /dev/null +++ b/d3m/d3m/metadata/schemas/v0/primitive.json @@ -0,0 +1,94 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema", + "id": "https://metadata.datadrivendiscovery.org/schemas/v0/primitive.json", + "title": "Primitive metadata", + "description": "Schema for metadata for primitives.", + "type": "object", + "properties": { + "schema": { + "$ref": "https://metadata.datadrivendiscovery.org/schemas/v0/definitions.json#/definitions/schema" + }, + "id": { + "$ref": "https://metadata.datadrivendiscovery.org/schemas/v0/definitions.json#/definitions/id" + }, + "version": { + "$ref": "https://metadata.datadrivendiscovery.org/schemas/v0/definitions.json#/definitions/version" + }, + "digest": { + "$ref": "https://metadata.datadrivendiscovery.org/schemas/v0/definitions.json#/definitions/digest" + }, + "name": { + "$ref": "https://metadata.datadrivendiscovery.org/schemas/v0/definitions.json#/definitions/name" + }, + "other_names": { + "$ref": "https://metadata.datadrivendiscovery.org/schemas/v0/definitions.json#/definitions/other_names" + }, + "description": { + "$ref": "https://metadata.datadrivendiscovery.org/schemas/v0/definitions.json#/definitions/description" + }, + "python_path": { + "$ref": "https://metadata.datadrivendiscovery.org/schemas/v0/definitions.json#/definitions/python_path" + }, + "original_python_path": { + "$ref": "https://metadata.datadrivendiscovery.org/schemas/v0/definitions.json#/definitions/original_python_path" + }, + "keywords": { + "$ref": "https://metadata.datadrivendiscovery.org/schemas/v0/definitions.json#/definitions/keywords" + }, + "source": { + "$ref": "https://metadata.datadrivendiscovery.org/schemas/v0/definitions.json#/definitions/source" + }, + "installation": { + "$ref": "https://metadata.datadrivendiscovery.org/schemas/v0/definitions.json#/definitions/installation" + }, + "primitive_code": { + "$ref": "https://metadata.datadrivendiscovery.org/schemas/v0/definitions.json#/definitions/primitive_code" + }, + "structural_type": { + "$ref": "https://metadata.datadrivendiscovery.org/schemas/v0/definitions.json#/definitions/structural_type" + }, + "location_uris": { + "$ref": "https://metadata.datadrivendiscovery.org/schemas/v0/definitions.json#/definitions/location_uris" + }, + "algorithm_types": { + "$ref": "https://metadata.datadrivendiscovery.org/schemas/v0/definitions.json#/definitions/algorithm_types" + }, + "primitive_family": { + "$ref": "https://metadata.datadrivendiscovery.org/schemas/v0/definitions.json#/definitions/primitive_family" + }, + "preconditions": { + "$ref": "https://metadata.datadrivendiscovery.org/schemas/v0/definitions.json#/definitions/preconditions" + }, + "effects": { + "$ref": "https://metadata.datadrivendiscovery.org/schemas/v0/definitions.json#/definitions/effects" + }, + "hyperparams_to_tune": { + "$ref": 
"https://metadata.datadrivendiscovery.org/schemas/v0/definitions.json#/definitions/hyperparams_to_tune" + }, + "supported_media_types": { + "$ref": "https://metadata.datadrivendiscovery.org/schemas/v0/definitions.json#/definitions/supported_media_types" + }, + "model_features": { + "$ref": "https://metadata.datadrivendiscovery.org/schemas/v0/definitions.json#/definitions/model_features" + }, + "pure_primitive": { + "$ref": "https://metadata.datadrivendiscovery.org/schemas/v0/definitions.json#/definitions/pure_primitive" + }, + "can_use_gpus": { + "$ref": "https://metadata.datadrivendiscovery.org/schemas/v0/definitions.json#/definitions/can_use_gpus" + } + }, + "required": [ + "algorithm_types", + "id", + "name", + "original_python_path", + "primitive_code", + "primitive_family", + "python_path", + "schema", + "structural_type", + "version" + ], + "additionalProperties": true +} diff --git a/d3m/d3m/metadata/schemas/v0/problem.json b/d3m/d3m/metadata/schemas/v0/problem.json new file mode 100644 index 0000000..d082e92 --- /dev/null +++ b/d3m/d3m/metadata/schemas/v0/problem.json @@ -0,0 +1,50 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema", + "id": "https://metadata.datadrivendiscovery.org/schemas/v0/problem.json", + "title": "Problem description", + "description": "Schema for problem description.", + "type": "object", + "properties": { + "id": { + "$ref": "https://metadata.datadrivendiscovery.org/schemas/v0/definitions.json#/definitions/id" + }, + "version": { + "$ref": "https://metadata.datadrivendiscovery.org/schemas/v0/definitions.json#/definitions/version" + }, + "name": { + "$ref": "https://metadata.datadrivendiscovery.org/schemas/v0/definitions.json#/definitions/name" + }, + "other_names": { + "$ref": "https://metadata.datadrivendiscovery.org/schemas/v0/definitions.json#/definitions/other_names" + }, + "description": { + "$ref": "https://metadata.datadrivendiscovery.org/schemas/v0/definitions.json#/definitions/description" + }, + "digest": { + "$ref": "https://metadata.datadrivendiscovery.org/schemas/v0/definitions.json#/definitions/digest" + }, + "schema": { + "$ref": "https://metadata.datadrivendiscovery.org/schemas/v0/definitions.json#/definitions/schema" + }, + "source": { + "$ref": "https://metadata.datadrivendiscovery.org/schemas/v0/definitions.json#/definitions/source" + }, + "problem": { + "$ref": "https://metadata.datadrivendiscovery.org/schemas/v0/definitions.json#/definitions/problem" + }, + "inputs": { + "$ref": "https://metadata.datadrivendiscovery.org/schemas/v0/definitions.json#/definitions/problem_inputs" + }, + "data_augmentation": { + "$ref": "https://metadata.datadrivendiscovery.org/schemas/v0/definitions.json#/definitions/data_augmentation" + }, + "location_uris": { + "$ref": "https://metadata.datadrivendiscovery.org/schemas/v0/definitions.json#/definitions/location_uris" + } + }, + "required": [ + "id", + "schema" + ], + "additionalProperties": true +} diff --git a/d3m/d3m/metrics.py b/d3m/d3m/metrics.py new file mode 100644 index 0000000..defe60c --- /dev/null +++ b/d3m/d3m/metrics.py @@ -0,0 +1,1100 @@ +import abc +import itertools +import typing + +import numpy # type: ignore +import pandas # type: ignore +from sklearn import metrics, preprocessing # type: ignore + +from d3m import container, exceptions, utils +from d3m.metadata import problem + +__ALL__ = ('class_map',) + +INDEX_COLUMN = 'd3mIndex' +CONFIDENCE_COLUMN = 'confidence' +RANK_COLUMN = 'rank' +EMPTY_VALUES = {numpy.nan, float('NaN'), ""} + +Truth = typing.TypeVar('Truth', 
bound=container.DataFrame)
+Predictions = typing.TypeVar('Predictions', bound=container.DataFrame)
+AllLabels = typing.TypeVar('AllLabels', bound=typing.Mapping[str, typing.Sequence])
+
+
+class Metric(metaclass=utils.AbstractMetaclass):
+    @abc.abstractmethod
+    def score(self, truth: Truth, predictions: Predictions) -> typing.Any:
+        raise NotImplementedError
+
+    @classmethod
+    def align(cls, truth: Truth, predictions: Predictions) -> Predictions:
+        """
+        Aligns columns and rows in ``predictions`` to match those in ``truth``.
+
+        It requires that ``predictions`` contain exactly the index values present in ``truth``,
+        and that any column name in ``truth`` is also present in ``predictions``.
+        Any additional columns present in ``predictions`` are pushed to the right.
+
+        Parameters
+        ----------
+        truth:
+            Truth DataFrame.
+        predictions:
+            Predictions DataFrame.
+
+        Returns
+        -------
+        Predictions with aligned rows.
+        """
+
+        truth_columns_set = set(truth.columns)
+        predictions_columns_set = set(predictions.columns)
+
+        if len(truth_columns_set) != len(truth.columns):
+            raise exceptions.InvalidArgumentValueError("Duplicate column names in truth.")
+        if len(predictions_columns_set) != len(predictions.columns):
+            raise exceptions.InvalidArgumentValueError("Duplicate column names in predictions.")
+
+        columns_diff = truth_columns_set - predictions_columns_set
+        if columns_diff:
+            raise exceptions.InvalidArgumentValueError(f"Not all columns which exist in truth exist in predictions: {sorted(columns_diff)}")
+
+        if INDEX_COLUMN not in truth.columns:
+            raise exceptions.InvalidArgumentValueError(f"Index column '{INDEX_COLUMN}' is missing in truth.")
+        if INDEX_COLUMN not in predictions.columns:
+            raise exceptions.InvalidArgumentValueError(f"Index column '{INDEX_COLUMN}' is missing in predictions.")
+
+        extra_predictions_columns = [column for column in predictions.columns if column not in truth_columns_set]
+
+        # Reorder columns.
+        predictions = predictions.reindex(columns=list(truth.columns) + extra_predictions_columns)
+
+        truth_index_set = set(truth.loc[:, INDEX_COLUMN])
+        predictions_index_set = set(predictions.loc[:, INDEX_COLUMN])
+
+        if truth_index_set != predictions_index_set:
+            raise exceptions.InvalidArgumentValueError(f"Predictions and truth do not have the same set of index values.")
+
+        truth_index_map: typing.Dict = {}
+        last_index = None
+        for i, index in enumerate(truth.loc[:, INDEX_COLUMN]):
+            if index in truth_index_map:
+                if last_index != index:
+                    raise exceptions.InvalidArgumentValueError(f"Truth does not have all rows with same index value grouped together.")
+            else:
+                truth_index_map[index] = i
+            last_index = index
+
+        predictions_index_order = []
+        for index in predictions.loc[:, INDEX_COLUMN]:
+            predictions_index_order.append(truth_index_map[index])
+
+        # Reorder rows.
+        # TODO: How to not use a special column name?
+        #       Currently it will fail if "__row_order__" already exists. We could set "allow_duplicates", but that would just hide
+        #       the fact that we have a duplicated column. How can we then control which one we really sort and which one we drop?
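+        # For example, if the "d3mIndex" order in truth is [7, 3, 5], then "truth_index_map" is
+        # {7: 0, 3: 1, 5: 2}, and predictions ordered [3, 5, 7] get "predictions_index_order" [1, 2, 0].
+        # Inserting that list as a temporary "__row_order__" column and stable-sorting on it below
+        # reorders prediction rows into truth's row order, while mergesort keeps rows sharing the
+        # same index value in their original relative order.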
+ predictions.insert(0, '__row_order__', predictions_index_order) + predictions.sort_values(['__row_order__'], axis=0, inplace=True, kind='mergesort') + predictions.drop('__row_order__', axis=1, inplace=True) + predictions.reset_index(drop=True, inplace=True) + + return predictions + + @classmethod + def get_target_columns(cls, dataframe: pandas.DataFrame) -> pandas.DataFrame: + """ + Returns only target columns present in ``dataframe``. + """ + + columns = list(dataframe.columns) + + index_columns = columns.count(INDEX_COLUMN) + if index_columns < 1: + raise exceptions.InvalidArgumentValueError(f"Index column '{INDEX_COLUMN}' is missing in predictions.") + elif index_columns > 1: + raise exceptions.InvalidArgumentValueError(f"Predictions contain multiple index columns '{INDEX_COLUMN}': {index_columns}") + + dataframe = dataframe.drop(columns=[INDEX_COLUMN]) + + confidence_columns = columns.count(CONFIDENCE_COLUMN) + if confidence_columns > 1: + raise exceptions.InvalidArgumentValueError(f"Predictions contain multiple confidence columns '{CONFIDENCE_COLUMN}': {confidence_columns}") + elif confidence_columns: + dataframe = dataframe.drop(columns=[CONFIDENCE_COLUMN]) + + rank_columns = columns.count(RANK_COLUMN) + if rank_columns > 1: + raise exceptions.InvalidArgumentValueError(f"Predictions contain multiple rank columns '{RANK_COLUMN}': {rank_columns}") + elif rank_columns: + dataframe = dataframe.drop(columns=[RANK_COLUMN]) + + if not len(dataframe.columns): + raise exceptions.InvalidArgumentValueError(f"Predictions do not contain any target columns.") + + return dataframe + + @classmethod + def get_index_column(cls, dataframe: pandas.DataFrame) -> pandas.DataFrame: + """ + Returns only index column present in ``dataframe``. + """ + + columns = list(dataframe.columns) + + index_columns = columns.count(INDEX_COLUMN) + if index_columns < 1: + raise exceptions.InvalidArgumentValueError(f"Index column '{INDEX_COLUMN}' is missing in predictions.") + elif index_columns > 1: + raise exceptions.InvalidArgumentValueError(f"Predictions contain multiple index columns '{INDEX_COLUMN}': {index_columns}") + + return dataframe.loc[:, [INDEX_COLUMN]] + + @classmethod + def get_confidence_column(cls, dataframe: pandas.DataFrame) -> pandas.DataFrame: + """ + Returns only confidence column present in ``dataframe``. + """ + + columns = list(dataframe.columns) + + confidence_columns = columns.count(CONFIDENCE_COLUMN) + if confidence_columns < 1: + raise exceptions.InvalidArgumentValueError(f"Confidence column '{CONFIDENCE_COLUMN}' is missing in predictions.") + elif confidence_columns > 1: + raise exceptions.InvalidArgumentValueError(f"Predictions contain multiple confidence columns '{CONFIDENCE_COLUMN}': {confidence_columns}") + + return dataframe.loc[:, [CONFIDENCE_COLUMN]] + + @classmethod + def get_rank_column(cls, dataframe: pandas.DataFrame) -> pandas.DataFrame: + """ + Returns only rank column present in ``dataframe``. 
+ """ + + columns = list(dataframe.columns) + + rank_columns = columns.count(RANK_COLUMN) + if rank_columns < 1: + raise exceptions.InvalidArgumentValueError(f"Rank column '{RANK_COLUMN}' is missing in predictions.") + elif rank_columns > 1: + raise exceptions.InvalidArgumentValueError(f"Predictions contain multiple rank columns '{RANK_COLUMN}': {rank_columns}") + + return dataframe.loc[:, [RANK_COLUMN]] + + @classmethod + def vectorize_columns(cls, dataframe: pandas.DataFrame) -> pandas.DataFrame: + """ + For every non-index column, convert all values in rows belonging to the + same index to one row with value being a tuple of values. The order of values + in a tuple follows the order of original rows and is preserved between columns. + """ + + columns_set = set(dataframe.columns) + + if len(columns_set) != len(dataframe.columns): + raise exceptions.InvalidArgumentValueError("Duplicate column names.") + + if INDEX_COLUMN not in dataframe.columns: + raise exceptions.InvalidArgumentValueError(f"Index column '{INDEX_COLUMN}' is missing.") + + columns_without_index = [column_name for column_name in dataframe.columns if column_name != INDEX_COLUMN] + + rows = {} + for index_value in dataframe.loc[:, INDEX_COLUMN].unique(): + rows[index_value] = { + # When we have multiple columns, some of them might not have values for all rows, + # and there are more rows because some other column needs them. In such case + # the column with less values should put an empty value in those extra rows + # (generally an empty string). + column_name: tuple(v for v in dataframe.loc[dataframe[INDEX_COLUMN] == index_value, column_name] if not cls.is_empty_value(v)) + for column_name in columns_without_index + } + + output = pandas.DataFrame.from_dict(rows, orient='index', columns=columns_without_index) + output.index.set_names([INDEX_COLUMN], inplace=True) + output.reset_index(inplace=True) + + return output + + @classmethod + def is_empty_value(cls, v: typing.Any) -> bool: + return v in EMPTY_VALUES or (isinstance(v, (float, numpy.float64, numpy.float32)) and numpy.isnan(v)) + + @classmethod + def one_hot_encode_target(cls, series: pandas.Series, all_labels: typing.Sequence) -> pandas.DataFrame: + """ + Returns one-hot-encoded dataframe where the columns are the labels of the target column, + which is provided as a series of tuples, where each tuple contains all labels of a + given sample. + """ + + mlb = preprocessing.MultiLabelBinarizer(all_labels) + encoded = mlb.fit_transform(series) + + return encoded + + @classmethod + def one_hot_encode_confidence(cls, series: pandas.Series, all_labels: typing.Sequence) -> pandas.DataFrame: + """ + Returns one-hot-encoded dataframe where the columns are the labels of the confidence column, + which is provided as a series of tuples, where each tuple contains confidence for all labels + of a given sample, ordered in order specified by ``labels``. + + Returned dataframe has instead of 0 or 1, a confidence value itself. 
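+
+        For example, with ``all_labels`` ``['a', 'b', 'c']`` and a series value ``(0.2, 0.5, 0.3)``,
+        the corresponding output row is ``[0.2, 0.5, 0.3]`` under columns ``a``, ``b``, and ``c``.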
+ """ + + encoded = series.apply(pandas.Series) + encoded.columns = all_labels + + return encoded + + +class _AllAsMultiLabelBase(Metric): + def __init__(self, all_labels: AllLabels = None) -> None: + self.all_labels = all_labels + + def encode_targets(self, truth: Truth, predictions: Predictions) -> typing.Sequence[typing.Tuple[pandas.DataFrame, pandas.DataFrame, typing.Sequence]]: + truth_vectorized = self.vectorize_columns(truth) + predictions_vectorized = self.vectorize_columns(predictions) + + predictions_vectorized = self.align(truth_vectorized, predictions_vectorized) + + truth_targets = self.get_target_columns(truth_vectorized) + predictions_targets = self.get_target_columns(predictions_vectorized) + + if len(truth_targets.columns) != len(predictions_targets.columns): + raise exceptions.InvalidArgumentValueError(f"The number of target columns in truth ({len(truth_targets.columns)}) and predictions ({len(predictions_targets.columns)}) do not match.") + + truth_targets_columns_set = set(truth_targets.columns) + + # This holds from checks in "align". + assert truth_targets_columns_set == set(predictions_targets.columns), (truth_targets.columns, predictions_targets.columns) + + result = [] + for column in truth_targets.columns: + # We know that column names are unique because we check in "align". + truth_target = truth_targets[column] + predictions_target = predictions_targets[column] + + truth_target_values_set = set(itertools.chain.from_iterable(truth_target)) + predictions_target_values_set = set(itertools.chain.from_iterable(predictions_target)) + + # If all labels were provided. + if self.all_labels is not None and column in self.all_labels: + all_labels_set = set(self.all_labels[column]) + + extra_truth_target_values_set = truth_target_values_set - all_labels_set + if extra_truth_target_values_set: + raise exceptions.InvalidArgumentValueError(f"Truth contains extra labels: {sorted(extra_truth_target_values_set)}") + + extra_predictions_target_values_set = predictions_target_values_set - all_labels_set + if extra_predictions_target_values_set: + raise exceptions.InvalidArgumentValueError(f"Predictions contain extra labels: {sorted(extra_predictions_target_values_set)}") + + # Otherwise we infer all labels from available data. + else: + all_labels_set = truth_target_values_set | predictions_target_values_set + + all_labels = sorted(all_labels_set) + + truth_target_encoded = self.one_hot_encode_target(truth_target, all_labels) + predictions_target_encoded = self.one_hot_encode_target(predictions_target, all_labels) + + result.append((truth_target_encoded, predictions_target_encoded, all_labels)) + + return result + + def score(self, truth: Truth, predictions: Predictions) -> float: + # We encode all as multi-label. 
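+        # For each target column, "encode_targets" returns one-hot encoded truth and predictions
+        # matrices together with the label list; e.g. with labels ['a', 'b'] the truth value ('a',)
+        # becomes the row [1, 0]. The final score is the mean of the per-column scores.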
+ encoded_targets = self.encode_targets(truth, predictions) + + if not encoded_targets: + raise exceptions.InvalidArgumentValueError("No target column.") + + scores = [] + for truth_target_encoded, predictions_target_encoded, labels in encoded_targets: + scores.append(self.score_one(truth_target_encoded, predictions_target_encoded, labels)) + + return float(numpy.mean(scores)) + + @abc.abstractmethod + def score_one(self, truth_target_encoded: pandas.DataFrame, predictions_target_encoded: pandas.DataFrame, all_labels: typing.Sequence) -> float: + raise NotImplementedError + + +class _MultiTaskBase(Metric): + def score(self, truth: Truth, predictions: Predictions) -> float: + predictions = self.align(truth, predictions) + + truth_targets = self.get_target_columns(truth) + predictions_targets = self.get_target_columns(predictions) + + if len(truth_targets.columns) != len(predictions_targets.columns): + raise exceptions.InvalidArgumentValueError(f"The number of target columns in truth ({len(truth_targets.columns)}) and predictions ({len(predictions_targets.columns)}) do not match.") + + if not len(truth_targets.columns): + raise exceptions.InvalidArgumentValueError("No target column.") + + # This holds from checks in "align". + assert set(truth_targets.columns) == set(predictions_targets.columns), (truth_targets.columns, predictions_targets.columns) + + scores = [] + for column in truth_targets.columns: + # We know that column names are unique because we check in "align". + truth_target = truth_targets[column] + predictions_target = predictions_targets[column] + + scores.append(self.score_one(truth_target, predictions_target)) + + return float(numpy.mean(scores)) + + @abc.abstractmethod + def score_one(self, truth_target: pandas.Series, predictions_target: pandas.Series) -> float: + raise NotImplementedError + + +class AccuracyMetric(_AllAsMultiLabelBase): + """ + Supports binary, multi-class, multi-label, and multi-task predictions. + """ + + def score_one(self, truth_target_encoded: pandas.DataFrame, predictions_target_encoded: pandas.DataFrame, all_labels: typing.Sequence) -> float: + return metrics.accuracy_score(truth_target_encoded, predictions_target_encoded) + + +class PrecisionMetric(_MultiTaskBase): + """ + Supports binary and multi-task predictions. + """ + + def __init__(self, pos_label: str) -> None: + self.pos_label = pos_label + + def score_one(self, truth_target: pandas.Series, predictions_target: pandas.Series) -> float: + # We do not have to pass labels because we are using binary average. + return metrics.precision_score(truth_target, predictions_target, pos_label=self.pos_label, average='binary') + + +class RecallMetric(_MultiTaskBase): + """ + Supports binary and multi-task predictions. + """ + + def __init__(self, pos_label: str) -> None: + self.pos_label = pos_label + + def score_one(self, truth_target: pandas.Series, predictions_target: pandas.Series) -> float: + # We do not have to pass labels because we are using binary average. + return metrics.recall_score(truth_target, predictions_target, pos_label=self.pos_label, average='binary') + + +class F1Metric(_MultiTaskBase): + """ + Supports binary and multi-task predictions. + """ + + def __init__(self, pos_label: str) -> None: + self.pos_label = pos_label + + def score_one(self, truth_target: pandas.Series, predictions_target: pandas.Series) -> float: + # We do not have to pass labels because we are using binary average. 
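+        # For example, with pos_label='1', truth ['1', '0', '1'] and predictions ['1', '1', '1']
+        # give precision 2/3, recall 1, and so F1 = 2 * (2/3 * 1) / (2/3 + 1) = 0.8.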
+ return metrics.f1_score(truth_target, predictions_target, pos_label=self.pos_label, average='binary') + + +class F1MicroMetric(_AllAsMultiLabelBase): + """ + Supports multi-class, multi-label, and multi-task predictions. + """ + + def score_one(self, truth_target_encoded: pandas.DataFrame, predictions_target_encoded: pandas.DataFrame, all_labels: typing.Sequence) -> float: + # We use multi-label F1 score to compute for multi-class target as well. + # We want to use all labels, so we do not pass labels on. + return metrics.f1_score(truth_target_encoded, predictions_target_encoded, average='micro') + + +class F1MacroMetric(_AllAsMultiLabelBase): + """ + Supports multi-class, multi-label, and multi-task predictions. + """ + + def score_one(self, truth_target_encoded: pandas.DataFrame, predictions_target_encoded: pandas.DataFrame, all_labels: typing.Sequence) -> float: + # We use multi-label F1 score to compute for multi-class target as well. + # We want to use all labels, so we do not pass labels on. + return metrics.f1_score(truth_target_encoded, predictions_target_encoded, average='macro') + + +class MeanSquareErrorMetric(Metric): + """ + Supports univariate and multivariate. + """ + + def score(self, truth: Truth, predictions: Predictions) -> float: + predictions = self.align(truth, predictions) + + truth_targets = self.get_target_columns(truth) + predictions_targets = self.get_target_columns(predictions) + + return metrics.mean_squared_error(truth_targets, predictions_targets, multioutput='uniform_average') + + +class RootMeanSquareErrorMetric(Metric): + """ + Supports univariate and multivariate. + """ + + def score(self, truth: Truth, predictions: Predictions) -> float: + predictions = self.align(truth, predictions) + + truth_targets = self.get_target_columns(truth) + predictions_targets = self.get_target_columns(predictions) + + mean_squared_error = metrics.mean_squared_error(truth_targets, predictions_targets, multioutput='raw_values') + + return float(numpy.mean(numpy.sqrt(mean_squared_error))) + + +class MeanAbsoluteErrorMetric(Metric): + """ + Supports univariate and multivariate. + """ + + def score(self, truth: Truth, predictions: Predictions) -> float: + predictions = self.align(truth, predictions) + + truth_targets = self.get_target_columns(truth) + predictions_targets = self.get_target_columns(predictions) + + return metrics.mean_absolute_error(truth_targets, predictions_targets, multioutput='uniform_average') + + +class RSquaredMetric(Metric): + """ + Supports univariate and multivariate. 
+ """ + + def score(self, truth: Truth, predictions: Predictions) -> float: + predictions = self.align(truth, predictions) + + truth_targets = self.get_target_columns(truth) + predictions_targets = self.get_target_columns(predictions) + + return metrics.r2_score(truth_targets, predictions_targets, multioutput='uniform_average') + + +class NormalizeMutualInformationMetric(Metric): + def score(self, truth: Truth, predictions: Predictions) -> float: + predictions = self.align(truth, predictions) + + truth_targets = self.get_target_columns(truth) + predictions_targets = self.get_target_columns(predictions) + + if len(truth_targets.columns) != len(predictions_targets.columns): + raise exceptions.InvalidArgumentValueError(f"The number of target columns in truth ({len(truth_targets.columns)}) and predictions ({len(predictions_targets.columns)}) do not match.") + + if len(truth_targets.columns) != 1: + raise exceptions.InvalidArgumentValueError("Only one target column is supported.") + + return metrics.normalized_mutual_info_score(truth_targets.iloc[:, 0].ravel(), predictions_targets.iloc[:, 0].ravel(), average_method='geometric') + + +class JaccardSimilarityScoreMetric(_MultiTaskBase): + """ + Supports binary and multi-task predictions. + """ + + def __init__(self, pos_label: str) -> None: + self.pos_label = pos_label + + def score_one(self, truth_target: pandas.Series, predictions_target: pandas.Series) -> float: + # We do not have to pass labels because we are using binary average. + return metrics.jaccard_score(truth_target, predictions_target, pos_label=self.pos_label, average='binary') + + +class PrecisionAtTopKMetric(Metric): + def __init__(self, k: int) -> None: + self.k = k + + def score(self, truth: Truth, predictions: Predictions) -> float: + predictions = self.align(truth, predictions) + + truth_targets = self.get_target_columns(truth) + predictions_targets = self.get_target_columns(predictions) + + if len(truth_targets.columns) != len(predictions_targets.columns): + raise exceptions.InvalidArgumentValueError(f"The number of target columns in truth ({len(truth_targets.columns)}) and predictions ({len(predictions_targets.columns)}) do not match.") + + if len(truth_targets.columns) != 1: + raise exceptions.InvalidArgumentValueError("Only one target column is supported.") + + truth_targets = truth_targets.values.ravel().astype(int) + predictions_targets = predictions_targets.values.ravel().astype(int) + + truth_targets = numpy.argsort(truth_targets)[::-1] + predictions_targets = numpy.argsort(predictions_targets)[::-1] + + truth_targets = truth_targets[0:self.k] + predictions_targets = predictions_targets[0:self.k] + + return numpy.float(len(numpy.intersect1d(truth_targets, predictions_targets))) / self.k + + +class ObjectDetectionAveragePrecisionMetric(Metric): + def _convert_bounding_polygon_to_box_coords(self, bounding_polygon: typing.List) -> typing.List: + # box_coords = [x_min, y_min, x_max, y_max] + if len(bounding_polygon) != 8: + raise exceptions.NotSupportedError("Polygon must contain eight vertices for this metric.") + + if bounding_polygon[0] != bounding_polygon[2] or bounding_polygon[4] != bounding_polygon[6]: + raise exceptions.NotSupportedError("X coordinates in bounding box do not match.") + + if bounding_polygon[1] != bounding_polygon[7] or bounding_polygon[3] != bounding_polygon[5]: + raise exceptions.NotSupportedError("Y coordinates in bounding box do not match.") + + box_coords = [bounding_polygon[0], bounding_polygon[1], + bounding_polygon[4], bounding_polygon[5]] + 
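+        # E.g. the polygon [100, 100, 100, 200, 200, 200, 200, 100] (vertices listed from the
+        # upper-left corner going counter-clockwise) becomes the box [100, 100, 200, 200].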
return box_coords + + def _group_gt_boxes_by_image_name(self, gt_boxes: typing.List) -> typing.Dict: + gt_dict: typing.Dict = {} + + for box in gt_boxes: + image_name = box[0] + bounding_polygon = box[1:] + bbox = self._convert_bounding_polygon_to_box_coords(bounding_polygon) + + if image_name not in gt_dict.keys(): + gt_dict[image_name] = [] + + gt_dict[image_name].append({'bbox': bbox}) + + return gt_dict + + def _voc_ap(self, rec: numpy.ndarray, prec: numpy.ndarray) -> float: + # First append sentinel values at the end. + mrec = numpy.concatenate(([0.], rec, [1.])) + mpre = numpy.concatenate(([0.], prec, [0.])) + + # Compute the precision envelope. + for i in range(mpre.size - 1, 0, -1): + mpre[i - 1] = numpy.maximum(mpre[i - 1], mpre[i]) + + # To calculate area under PR curve, look for points + # where X axis (recall) changes value. + i = numpy.where(mrec[1:] != mrec[:-1])[0] + + # And sum (\Delta recall) * prec. + ap = numpy.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) + + return float(ap) + + def _object_detection_average_precision(self, y_true: typing.List, y_pred: typing.List) -> float: + """ + This function takes a list of ground truth bounding polygons (rectangles in this case) + and a list of detected bounding polygons (also rectangles) for a given class and + computes the average precision of the detections with respect to the ground truth polygons. + Parameters: + ----------- + y_true: list + List of ground truth polygons. Each polygon is represented as a list of + vertices, starting in the upper-left corner going counter-clockwise. + Since in this case, the polygons are rectangles, they will have the + following format: + [image_name, x_min, y_min, x_min, y_max, x_max, y_max, x_max, y_min]. + y_pred: list + List of bounding box polygons with their corresponding confidence scores. Each + polygon is represented as a list of vertices, starting in the upper-left corner + going counter-clockwise. Since in this case, the polygons are rectangles, they + will have the following format: + [image_name, x_min, y_min, x_min, y_max, x_max, y_max, x_max, y_min, confidence_score]. + Returns: + -------- + ap: float + Average precision between detected polygons (rectangles) and the ground truth polylgons (rectangles). + (it is also the area under the precision-recall curve). 
+ Example 1: + >> predictions_list_1 = [['img_00001.png', 110, 110, 110, 210, 210, 210, 210, 110, 0.6], + ['img_00002.png', 5, 10, 5, 20, 20, 20, 20, 10, 0.9], + ['img_00002.png', 120, 130, 120, 200, 200, 200, 200, 130, 0.6]] + >> ground_truth_list_1 = [['img_00001.png', 100, 100, 100, 200, 200, 200, 200, 100], + ['img_00002.png', 10, 10, 10, 20, 20, 20, 20, 10], + ['img_00002.png', 70, 80, 70, 150, 140, 150, 140, 80]] + >> ap_1 = object_detection_average_precision(ground_truth_list_1, predictions_list_1) + >> print(ap_1) + 0.667 + Example 2: + >> predictions_list_2 = [['img_00285.png', 330, 463, 330, 505, 387, 505, 387, 463, 0.0739], + ['img_00285.png', 420, 433, 420, 498, 451, 498, 451, 433, 0.0910], + ['img_00285.png', 328, 465, 328, 540, 403, 540, 403, 465, 0.1008], + ['img_00285.png', 480, 477, 480, 522, 508, 522, 508, 477, 0.1012], + ['img_00285.png', 357, 460, 357, 537, 417, 537, 417, 460, 0.1058], + ['img_00285.png', 356, 456, 356, 521, 391, 521, 391, 456, 0.0843], + ['img_00225.png', 345, 460, 345, 547, 415, 547, 415, 460, 0.0539], + ['img_00225.png', 381, 362, 381, 513, 455, 513, 455, 362, 0.0542], + ['img_00225.png', 382, 366, 382, 422, 416, 422, 416, 366, 0.0559], + ['img_00225.png', 730, 463, 730, 583, 763, 583, 763, 463, 0.0588]] + >> ground_truth_list_2 = [['img_00285.png', 480, 457, 480, 529, 515, 529, 515, 457], + ['img_00285.png', 480, 457, 480, 529, 515, 529, 515, 457], + ['img_00225.png', 522, 540, 522, 660, 576, 660, 576, 540], + ['img_00225.png', 739, 460, 739, 545, 768, 545, 768, 460]] + >> ap_2 = object_detection_average_precision(ground_truth_list_2, predictions_list_2) + >> print(ap_2) + 0.125 + Example 3: + >> predictions_list_3 = [['img_00001.png', 110, 110, 110, 210, 210, 210, 210, 110, 0.6], + ['img_00002.png', 120, 130, 120, 200, 200, 200, 200, 130, 0.6], + ['img_00002.png', 5, 8, 5, 16, 15, 16, 15, 8, 0.9], + ['img_00002.png', 11, 12, 11, 18, 21, 18, 21, 12, 0.9]] + >> ground_truth_list_3 = [['img_00001.png', 100, 100, 100, 200, 200, 200, 200, 100], + ['img_00002.png', 10, 10, 10, 20, 20, 20, 20, 10], + ['img_00002.png', 70, 80, 70, 150, 140, 150, 140, 80]] + >> ap_3 = object_detection_average_precision(ground_truth_list_3, predictions_list_3) + >> print(ap_3) + 0.444 + Example 4: + (Same as example 3 except the last two box predictions in img_00002.png are switched) + >> predictions_list_4 = [['img_00001.png', 110, 110, 110, 210, 210, 210, 210, 110, 0.6], + ['img_00002.png', 120, 130, 120, 200, 200, 200, 200, 130, 0.6], + ['img_00002.png', 11, 12, 11, 18, 21, 18, 21, 12, 0.9], + ['img_00002.png', 5, 8, 5, 16, 15, 16, 15, 8, 0.9]] + >> ground_truth_list_4 = [['img_00001.png', 100, 100, 100, 200, 200, 200, 200, 100], + ['img_00002.png', 10, 10, 10, 20, 20, 20, 20, 10], + ['img_00002.png', 70, 80, 70, 150, 140, 150, 140, 80]] + >> ap_4 = object_detection_average_precision(ground_truth_list_4, predictions_list_4) + >> print(ap_4) + 0.444 + """ + + ovthresh = 0.5 + + # y_true = typing.cast(Truth, unvectorize(y_true)) + # y_pred = typing.cast(Predictions, unvectorize(y_pred)) + + # Load ground truth. + gt_dict = self._group_gt_boxes_by_image_name(y_true) + + # Extract gt objects for this class. + recs = {} + npos = 0 + + imagenames = sorted(gt_dict.keys()) + for imagename in imagenames: + Rlist = [obj for obj in gt_dict[imagename]] + bbox = numpy.array([x['bbox'] for x in Rlist]) + det = [False] * len(Rlist) + npos = npos + len(Rlist) + recs[imagename] = {'bbox': bbox, 'det': det} + + # Load detections. 
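+        # Each prediction is [image_name, eight polygon coordinates..., confidence]. Detections are
+        # sorted by descending confidence and greedily matched against ground truth boxes; a match
+        # counts as a true positive only if the intersection-over-union exceeds "ovthresh" (0.5)
+        # and that ground truth box has not been matched before.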
+ det_length = len(y_pred[0]) + + # Check that all boxes are the same size. + for det in y_pred: + assert len(det) == det_length, 'Not all boxes have the same dimensions.' + + image_ids = [x[0] for x in y_pred] + BP = numpy.array([[float(z) for z in x[1:-1]] for x in y_pred]) + BB = numpy.array([self._convert_bounding_polygon_to_box_coords(x) for x in BP]) + + confidence = numpy.array([float(x[-1]) for x in y_pred]) + boxes_w_confidences_list = numpy.hstack((BB, -1 * confidence[:, None])) + boxes_w_confidences = numpy.empty( + (boxes_w_confidences_list.shape[0],), + dtype=[ + ('x_min', float), ('y_min', float), + ('x_max', float), ('y_max', float), + ('confidence', float), + ], + ) + boxes_w_confidences[:] = [tuple(i) for i in boxes_w_confidences_list] + + # Sort by confidence. + sorted_ind = numpy.argsort( + boxes_w_confidences, kind='mergesort', + order=('confidence', 'x_min', 'y_min', 'x_max', 'y_max')) + BB = BB[sorted_ind, :] + image_ids = [image_ids[x] for x in sorted_ind] + + # Go down y_pred and mark TPs and FPs. + nd = len(image_ids) + tp = numpy.zeros(nd) + fp = numpy.zeros(nd) + for d in range(nd): + R = recs[image_ids[d]] + bb = BB[d, :].astype(float) + ovmax = -numpy.inf + BBGT = R['bbox'].astype(float) + + if BBGT.size > 0: + # Compute overlaps. + # Intersection. + ixmin = numpy.maximum(BBGT[:, 0], bb[0]) + iymin = numpy.maximum(BBGT[:, 1], bb[1]) + ixmax = numpy.minimum(BBGT[:, 2], bb[2]) + iymax = numpy.minimum(BBGT[:, 3], bb[3]) + iw = numpy.maximum(ixmax - ixmin + 1., 0.) + ih = numpy.maximum(iymax - iymin + 1., 0.) + inters = iw * ih + + # Union. + uni = ((bb[2] - bb[0] + 1.) * (bb[3] - bb[1] + 1.) + + (BBGT[:, 2] - BBGT[:, 0] + 1.) * + (BBGT[:, 3] - BBGT[:, 1] + 1.) - inters) + + overlaps = inters / uni + ovmax = numpy.max(overlaps) + jmax = numpy.argmax(overlaps) + + if ovmax > ovthresh: + if not R['det'][jmax]: + tp[d] = 1. + R['det'][jmax] = 1 + else: + fp[d] = 1. + else: + fp[d] = 1. + + # Compute precision recall. + fp = numpy.cumsum(fp) + tp = numpy.cumsum(tp) + rec = tp / float(npos) + # Avoid divide by zero in case the first detection matches a difficult ground truth. 
+        prec = tp / numpy.maximum(tp + fp, numpy.finfo(numpy.float64).eps)
+        ap = self._voc_ap(rec, prec)
+
+        return ap
+
+    def score(self, truth: Truth, predictions: Predictions) -> float:
+        predictions = self.align(truth, predictions)
+
+        truth_index = self.get_index_column(truth)
+        truth_targets = self.get_target_columns(truth)
+
+        if len(truth_targets.columns) != 1:
+            raise NotImplementedError("Support for multiple target columns is not yet implemented.")
+
+        truth_list = []
+        for i, (index, target) in enumerate(pandas.concat([truth_index, truth_targets], axis=1).itertuples(index=False, name=None)):
+            truth_list.append([index] + [float(v) for v in target.split(',')])
+
+        predictions_index = self.get_index_column(predictions)
+        predictions_targets = self.get_target_columns(predictions)
+        predictions_confidence = self.get_confidence_column(predictions)
+
+        if len(predictions_targets.columns) != 1:
+            raise NotImplementedError("Support for multiple target columns is not yet implemented.")
+
+        predictions_list = []
+        for i, (index, target, confidence) in enumerate(pandas.concat([predictions_index, predictions_targets, predictions_confidence], axis=1).itertuples(index=False, name=None)):
+            predictions_list.append([index] + [float(v) for v in target.split(',')] + [float(confidence)])
+
+        return self._object_detection_average_precision(truth_list, predictions_list)
+
+
+class HammingLossMetric(_AllAsMultiLabelBase):
+    """
+    Hamming loss gives the fraction of labels that are incorrectly predicted, out of the total number of labels.
+    The lower the Hamming loss, the better the performance of the method used.
+
+    Supports multi-label and multi-task predictions.
+    """
+
+    def score_one(self, truth_target_encoded: pandas.DataFrame, predictions_target_encoded: pandas.DataFrame, all_labels: typing.Sequence) -> float:
+        # We do not have to pass labels because they are not needed and passing them is deprecated.
+        return metrics.hamming_loss(truth_target_encoded, predictions_target_encoded)
+
+
+class _RocAucBase(Metric):
+    def __init__(self, all_labels: AllLabels = None) -> None:
+        self.all_labels = all_labels
+
+    def encode_confidence(self, truth: Truth, predictions: Predictions) -> typing.Tuple[pandas.DataFrame, pandas.DataFrame]:
+        truth_vectorized = self.vectorize_columns(truth)
+        predictions_vectorized = self.vectorize_columns(predictions)
+
+        predictions_vectorized = self.align(truth_vectorized, predictions_vectorized)
+
+        truth_targets = self.get_target_columns(truth_vectorized)
+        predictions_targets = self.get_target_columns(predictions_vectorized)
+        predictions_confidence = self.get_confidence_column(predictions_vectorized).iloc[:, 0]
+
+        if len(truth_targets.columns) != 1:
+            raise exceptions.InvalidArgumentValueError(f"Invalid number of target columns in truth: {len(truth_targets.columns)}")
+        if len(predictions_targets.columns) != 1:
+            raise exceptions.InvalidArgumentValueError(f"Invalid number of target columns in predictions: {len(predictions_targets.columns)}")
+
+        truth_targets_columns_set = set(truth_targets.columns)
+
+        # This holds from checks in "align".
+        assert truth_targets_columns_set == set(predictions_targets.columns), (truth_targets.columns, predictions_targets.columns)
+
+        target_column_name = truth_targets.columns[0]
+        truth_target = truth_targets.iloc[:, 0]
+        predictions_target = predictions_targets.iloc[:, 0]
+
+        truth_target_values_set = set(itertools.chain.from_iterable(truth_target))
+        predictions_target_values_set = set(itertools.chain.from_iterable(predictions_target))
+
+        # If all labels were provided.
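+        # ("all_labels" maps a target column name to the full list of its possible labels,
+        # e.g. {'class': ['a', 'b', 'c']}; when it is not provided, labels are inferred below.)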
+ if self.all_labels is not None and target_column_name in self.all_labels: + all_labels_set = set(self.all_labels[target_column_name]) + + extra_truth_target_values_set = truth_target_values_set - all_labels_set + if extra_truth_target_values_set: + raise exceptions.InvalidArgumentValueError(f"Truth contains extra labels: {sorted(extra_truth_target_values_set)}") + + extra_predictions_target_values_set = predictions_target_values_set - all_labels_set + if extra_predictions_target_values_set: + raise exceptions.InvalidArgumentValueError(f"Predictions contain extra labels: {sorted(extra_predictions_target_values_set)}") + + # Otherwise we infer labels from available data. + else: + all_labels_set = truth_target_values_set | predictions_target_values_set + + all_labels = sorted(all_labels_set) + + truth_target_encoded = self.one_hot_encode_target(truth_target, all_labels) + + for i, prediction_targets in enumerate(predictions_target): + prediction_targets_set = set(prediction_targets) + prediction_targets_list = list(prediction_targets) + confidences = predictions_confidence[i] + + if len(prediction_targets_set) != len(prediction_targets_list): + raise exceptions.InvalidArgumentValueError( + f"Duplicate target values ({prediction_targets_list}) for sample '{predictions.loc[i, INDEX_COLUMN]}'." + ) + if len(prediction_targets) != len(confidences): + raise exceptions.InvalidArgumentValueError( + f"The number of target values ({len(prediction_targets)}) does not match the number of confidence values ({len(confidences)}) for sample '{predictions.loc[i, INDEX_COLUMN]}'." + ) + + assert not (prediction_targets_set - all_labels_set), (prediction_targets_set, all_labels_set) + + # We have to order confidences to match labels order. + # If any label is missing in confidences, we add it with confidence 0. + if all_labels != prediction_targets_list: + confidences_map = {label: confidence for label, confidence in zip(prediction_targets, confidences)} + predictions_confidence[i] = tuple(confidences_map.get(label, 0.0) for label in all_labels) + + # Check that all confidences can be converted to float and that they sum to 1. + sum_confidences = sum(float(confidence) for confidence in predictions_confidence[i]) + if not numpy.isclose(sum_confidences, 1.0): + raise exceptions.InvalidArgumentValueError( + f"Confidences do not sum to 1.0 for sample '{predictions.loc[i, INDEX_COLUMN]}', but {sum_confidences}." + ) + + predictions_confidence_encoded = self.one_hot_encode_confidence(predictions_confidence, all_labels) + + return truth_target_encoded, predictions_confidence_encoded + + +class RocAucMetric(_RocAucBase): + """ + Supports binary predictions. + """ + + def score(self, truth: Truth, predictions: Predictions) -> float: + truth_target_encoded, predictions_confidence_encoded = self.encode_confidence(truth, predictions) + + # We use multi-label ROC AUC to compute for binary target as well. + scores = metrics.roc_auc_score(truth_target_encoded, predictions_confidence_encoded, average=None) + + if len(scores) != 2: + raise exceptions.InvalidArgumentValueError("Predictions are not binary.") + + assert numpy.isclose(scores[0], scores[1]), scores + + return scores[0] + + +class RocAucMicroMetric(_RocAucBase): + """ + Supports multi-class and multi-label predictions. 
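+
+    Per-sample confidence values must cover all the labels and sum to 1
+    (this is checked in ``encode_confidence``).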
+ """ + + def score(self, truth: Truth, predictions: Predictions) -> float: + truth_target_encoded, predictions_confidence_encoded = self.encode_confidence(truth, predictions) + + # We use multi-label ROC AUC to compute for multi-class target as well. + return metrics.roc_auc_score(truth_target_encoded, predictions_confidence_encoded, average='micro') + + +class RocAucMacroMetric(_RocAucBase): + """ + Supports multi-class and multi-label predictions. + """ + + def score(self, truth: Truth, predictions: Predictions) -> float: + truth_target_encoded, predictions_confidence_encoded = self.encode_confidence(truth, predictions) + + # We use multi-label ROC AUC to compute for multi-class target as well. + return metrics.roc_auc_score(truth_target_encoded, predictions_confidence_encoded, average='macro') + + +class _RankMetricBase(Metric): + MAX_RANK = 500 + + @classmethod + def get_merged_truth_predictions(cls, truth: Truth, predictions: Predictions) -> pandas.DataFrame: + predictions = cls.align(truth, predictions) + + truth_index = cls.get_index_column(truth) + truth_targets = cls.get_target_columns(truth) + + if len(truth_targets.columns) != 1: + raise exceptions.InvalidArgumentValueError("Only one target column is supported.") + + truth = pandas.concat([truth_index, truth_targets], axis=1) + + predictions_index = cls.get_index_column(predictions) + predictions_targets = cls.get_target_columns(predictions) + predictions_rank = cls.get_rank_column(predictions) + + if len(predictions_targets.columns) != 1: + raise exceptions.InvalidArgumentValueError("Only one target column is supported.") + + predictions = pandas.concat([predictions_index, predictions_targets, predictions_rank], axis=1) + + merged_truth_predictions = pandas.merge(truth, predictions, how='inner', on=truth.columns.values.tolist()) + + # edge-case: none of the true tuples appear in the predictions. + if merged_truth_predictions.empty: + return merged_truth_predictions + + # edge-case: some of the tuples does not appear in the predictions. In this case we give missing true tuples a MAX_RANK of 500. + if merged_truth_predictions.shape[0] != truth.shape[0]: + outer_merged_truth_predictions = pandas.merge(truth, predictions, how='outer', on=truth.columns.values.tolist()) + non_represented = outer_merged_truth_predictions[outer_merged_truth_predictions[RANK_COLUMN].isnull()] + non_represented = non_represented.fillna(cls.MAX_RANK) + merged_truth_predictions = pandas.concat([merged_truth_predictions, non_represented], axis=0) + + return merged_truth_predictions + + +class MeanReciprocalRankMetric(_RankMetricBase): + """ + This computes the mean of the reciprocal of elements of a vector of rankings. This metric is used for linkPrediction problems. + Consider the example: + learningData: + d3mIndex subject object relationship (target) + 0 James John father + 1 John Patricia sister + 2 Robert Thomas brother + ... + ... 
+ + truth: + d3mIndex relationship + 0 father + 1 sister + 2 brother + + predictions: + d3mIndex relationships rank + 0 brother 1 + 0 cousin 2 + 0 mother 3 + 0 father 4 * + 0 grandfather 5 + 1 sister 1 * + 1 mother 2 + 1 aunt 3 + 2 father 1 + 2 brother 2 * + 2 sister 3 + 2 grandfather 4 + 2 aunt 5 + + Note that ranks (of truth relationships in the predictions) = [4,1,2] + MRR = np.sum(1/ranks)/len(ranks) + MRR = 0.58333 + """ + + def score(self, truth: Truth, predictions: Predictions) -> float: + merged_truth_predictions = self.get_merged_truth_predictions(truth, predictions) + + # edge-case: none of the true tuples appear in the predictions. This should return a score of 0.0. + if merged_truth_predictions.empty: + return 0.0 + + ranks = merged_truth_predictions[RANK_COLUMN].astype(float) + return numpy.sum(1 / ranks) / len(ranks) + + +class HitsAtKMetric(_RankMetricBase): + """ + The computes how many elements of a vector of ranks make it to the top 'k' positions. + Consider the example: + learningData: + d3mIndex subject object relationship (target) + 0 James John father + 1 John Patricia sister + 2 Robert Thomas brother + ... + ... + + truth: + d3mIndex relationship + 0 father + 1 sister + 2 brother + + predictions: + d3mIndex relationships rank + 0 brother 1 + 0 cousin 2 + 0 mother 3 + 0 father 4 * + 0 grandfather 5 + 1 sister 1 * + 1 mother 2 + 1 aunt 3 + 2 father 1 + 2 brother 2 * + 2 sister 3 + 2 grandfather 4 + 2 aunt 5 + + Note that ranks (of truth relationships in the predictions) = [4,1,2] + Hits@3 = 2/3 = 0.666666 + Hits@1 = 1/3 = 0.3333333 + Hits@5 = 3/3 = 1.0 + """ + + def __init__(self, k: int) -> None: + self.k = k + + def score(self, truth: Truth, predictions: Predictions) -> float: + merged_truth_predictions = self.get_merged_truth_predictions(truth, predictions) + + # edge-case: none of the true tuples appear in the predictions. This should return a score of 0.0. 
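+        # (For the docstring example above, ranks are [4, 1, 2], so with k=3 the score is
+        # numpy.sum(ranks <= 3) / 3 = 2 / 3.)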
+ if merged_truth_predictions.empty: + return 0.0 + + ranks = merged_truth_predictions[RANK_COLUMN].astype(float) + return numpy.sum(ranks <= self.k) / len(ranks) + + +class_map: typing.Dict[problem.PerformanceMetricBase, Metric] = { + problem.PerformanceMetric.ACCURACY: AccuracyMetric, + problem.PerformanceMetric.PRECISION: PrecisionMetric, + problem.PerformanceMetric.RECALL: RecallMetric, + problem.PerformanceMetric.F1: F1Metric, + problem.PerformanceMetric.F1_MICRO: F1MicroMetric, + problem.PerformanceMetric.F1_MACRO: F1MacroMetric, + problem.PerformanceMetric.MEAN_SQUARED_ERROR: MeanSquareErrorMetric, + problem.PerformanceMetric.ROOT_MEAN_SQUARED_ERROR: RootMeanSquareErrorMetric, + problem.PerformanceMetric.MEAN_ABSOLUTE_ERROR: MeanAbsoluteErrorMetric, + problem.PerformanceMetric.R_SQUARED: RSquaredMetric, + problem.PerformanceMetric.NORMALIZED_MUTUAL_INFORMATION: NormalizeMutualInformationMetric, + problem.PerformanceMetric.JACCARD_SIMILARITY_SCORE: JaccardSimilarityScoreMetric, + problem.PerformanceMetric.PRECISION_AT_TOP_K: PrecisionAtTopKMetric, + problem.PerformanceMetric.OBJECT_DETECTION_AVERAGE_PRECISION: ObjectDetectionAveragePrecisionMetric, + problem.PerformanceMetric.HAMMING_LOSS: HammingLossMetric, + problem.PerformanceMetric.ROC_AUC: RocAucMetric, + problem.PerformanceMetric.ROC_AUC_MICRO: RocAucMicroMetric, + problem.PerformanceMetric.ROC_AUC_MACRO: RocAucMacroMetric, + problem.PerformanceMetric.MEAN_RECIPROCAL_RANK: MeanReciprocalRankMetric, + problem.PerformanceMetric.HITS_AT_K: HitsAtKMetric, +} diff --git a/d3m/d3m/namespace.py b/d3m/d3m/namespace.py new file mode 100644 index 0000000..05770eb --- /dev/null +++ b/d3m/d3m/namespace.py @@ -0,0 +1,195 @@ +import importlib.abc +import importlib.machinery +import logging +import pkg_resources +import sys +import types +import typing + +__all__ = ('setup',) + +logger = logging.getLogger(__name__) + +# For which entry points we already warned that they are ignored? +_ignored_entry_points: typing.Set[str] = set() + + +def entry_points() -> typing.Iterator[pkg_resources.EntryPoint]: + """ + Makes sure that if two entry points are conflicting (one has a path + pointing to a primitive, and another is a path pointing to a module containing + other modules or primitives), the latter entry point is returned + while the former is ignored (and warned about). This makes loading primitives + deterministic. + + We iterate every time over entry points because maybe entry points have changed. + """ + + modules = set(tuple(entry_point.name.split('.')[:-1]) for entry_point in pkg_resources.iter_entry_points('d3m.primitives')) + + for entry_point in pkg_resources.iter_entry_points('d3m.primitives'): + primitive_path = tuple(entry_point.name.split('.')) + + # "primitive_path" starts with a module path the last segment is a class name. If it exists + # as a whole among what is seen as modules for all primitives, we have a conflict. + if primitive_path in modules: + if entry_point.name not in _ignored_entry_points: + _ignored_entry_points.add(entry_point.name) + logger.warning("An entry point for a primitive is conflicting with another entry point which has it as a module: %(entry_point_name)s", {'entry_point_name': entry_point.name}) + else: + yield entry_point + + +class ModuleType(types.ModuleType): + """ + A module which loads primitives on demand under ``d3m.primitives`` namespace. 
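+
+    For example, once a package registers an entry point named
+    ``primitive_namespace.PrimitiveName`` (see ``setup`` below), accessing
+    ``d3m.primitives.primitive_namespace.PrimitiveName`` resolves the entry point
+    and returns the primitive class on first use.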
+    """
+
+    def __dir__(self) -> typing.Sequence[str]:
+        """
+        Adds to listed attributes of a module all primitive classes known from
+        entry points to be available under this module.
+
+        They are not necessarily loadable (trying to access them tries to load a primitive which
+        might fail) and it is not even guaranteed that they really point to primitive classes,
+        because this method does not try to load them yet to determine any of that.
+
+        Already loaded primitives and imported submodules are provided by the parent implementation
+        of "__dir__" because they are real attributes of this module.
+
+        We add only classes. Submodules are added as real attributes once they are
+        explicitly imported. This mimics how things work for regular modules in Python.
+        """
+
+        entries = set(super().__dir__())
+
+        current_module = self.__name__.split('.')
+
+        for entry_point in entry_points():
+            entry_point_name = ['d3m', 'primitives'] + entry_point.name.split('.')
+
+            # We assume the last segment is a class name, so we remove it.
+            entry_point_module = entry_point_name[:-1]
+
+            # If an entry point points to a class directly under this module, we add that class' name.
+            if current_module == entry_point_module:
+                # The last segment is a class name.
+                entries.add(entry_point_name[-1])
+
+        return list(entries)
+
+    def __getattr__(self, item: str) -> typing.Any:
+        """
+        This method is called when there is no real attribute with name "item" already
+        present in this module object (so not an existing method, an already loaded primitive,
+        or already imported submodule).
+
+        If it looks like "item" is pointing to a primitive, we load the primitive here and add
+        it to the module object as a real attribute by calling "register_primitive".
+
+        If it does not look like a primitive, we raise an exception and Python importing logic
+        tries to import the module instead.
+        """
+
+        # Importing here to prevent import cycle.
+        from d3m import index
+
+        item_path = self.__name__.split('.') + [item]
+
+        for entry_point in entry_points():
+            entry_point_name = ['d3m', 'primitives'] + entry_point.name.split('.')
+
+            # We assume the last segment is a class name, so the full path has to match
+            # for the path to look like it is pointing to a primitive's class.
+            if item_path == entry_point_name:
+                primitive = None
+                try:
+                    logger.debug("Loading entry point '%(entry_point_name)s'.", {'entry_point_name': entry_point.name})
+                    entry_point.require()
+                    primitive = entry_point.resolve()  # type: ignore
+                except pkg_resources.ResolutionError as error:
+                    logger.warning("While loading primitive '%(entry_point_name)s', an error has been detected: %(error)s", {'entry_point_name': entry_point.name, 'error': error})
+                    logger.warning("Attempting to load primitive '%(entry_point_name)s' without checking requirements.", {'entry_point_name': entry_point.name})
+
+                # There was an error, so we try again without checking requirements.
+                if primitive is None:
+                    primitive = entry_point.resolve()  # type: ignore
+
+                try:
+                    # We set the sentinel so that when, during registration, the attribute with name "item"
+                    # is accessed, this method is not called again (because a real attribute already
+                    # exists) but the sentinel is returned.
+                    setattr(self, item, index._SENTINEL)
+                    index.register_primitive('.'.join(entry_point_name), primitive)
+                except Exception:
+                    if getattr(self, item) is index._SENTINEL:
+                        delattr(self, item)
+                    raise
+
+                # Calling "register_primitive" should set a real attribute on this module object.
+ assert getattr(self, item) is primitive + + return primitive + + raise AttributeError('module \'{name}\' has no attribute \'{item}\''.format(name=self.__name__, item=item)) + + +class Loader(importlib.abc.Loader): + """ + A loader which returns modules of our subclass. + """ + + def create_module(self, spec: importlib.machinery.ModuleSpec) -> types.ModuleType: + return ModuleType(spec.name, ModuleType.__doc__) + + def exec_module(self, module: types.ModuleType) -> None: + pass + + +class MetaPathFinder(importlib.abc.MetaPathFinder): + """ + A finder for ``d3m.primitives`` namespace which uses our loader for entries in entry points. + """ + + def find_spec(self, fullname, path, target=None): # type: ignore + if not fullname.startswith('d3m.primitives'): + return None + + if fullname == 'd3m.primitives': + return importlib.machinery.ModuleSpec(fullname, Loader(), is_package=True) + + name = fullname.split('.') + + for entry_point in entry_points(): + entry_point_name = ['d3m', 'primitives'] + entry_point.name.split('.') + + # We assume the last segment is a class name, so we remove it. + entry_point_module = entry_point_name[:-1] + + # There is at least one entry point having this name as its module, + # so we return a module. + if len(entry_point_module) >= len(name) and entry_point_module[0:len(name)] == name: + return importlib.machinery.ModuleSpec(fullname, Loader(), is_package=True) + + return None + + +def setup() -> None: + """ + Expose all primitives under the same ``d3m.primitives`` namespace. + + This is achieved using Python entry points. Python packages containing primitives + can register them and expose them under the common namespace by adding an entry + like the following to package's ``setup.py``:: + + entry_points = { + 'd3m.primitives': [ + 'primitive_namespace.PrimitiveName = my_package.my_module:PrimitiveClassName', + ], + }, + + The example above would expose the ``my_package.my_module.PrimitiveClassName`` primitive under + ``d3m.primitives.primitive_namespace.PrimitiveName``. + """ + + sys.meta_path.append(MetaPathFinder()) diff --git a/d3m/d3m/primitive_interfaces/__init__.py b/d3m/d3m/primitive_interfaces/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/d3m/d3m/primitive_interfaces/base.py b/d3m/d3m/primitive_interfaces/base.py new file mode 100644 index 0000000..4af1cb6 --- /dev/null +++ b/d3m/d3m/primitive_interfaces/base.py @@ -0,0 +1,1293 @@ +import abc +import inspect +import logging +import time +import typing + +from d3m import exceptions, types, utils +from d3m.metadata import base as metadata_base, hyperparams, params, problem + +__all__ = ( + 'Inputs', 'Outputs', 'Params', 'Hyperparams', 'CallResult', 'MultiCallResult', 'DockerContainer', + 'PrimitiveBase', 'ContinueFitMixin', 'SamplingCompositionalityMixin', + 'ProbabilisticCompositionalityMixin', 'Gradients', + 'GradientCompositionalityMixin', 'LossFunctionMixin', + 'NeuralNetworkModuleMixin', 'NeuralNetworkObjectMixin', + 'singleton', 'inputs_across_samples', +) + + +Inputs = typing.TypeVar('Inputs', bound=typing.Union[types.Container]) # type: ignore +Outputs = typing.TypeVar('Outputs', bound=typing.Union[types.Container]) # type: ignore +# This type parameter is optional and can be set to None. +# See "TransformerPrimitiveBase" for an example. 
+Params = typing.TypeVar('Params', bound=params.Params) +Hyperparams = typing.TypeVar('Hyperparams', bound=hyperparams.Hyperparams) +Module = typing.TypeVar('Module') + +T = typing.TypeVar('T') + +# All base classes (primitive interfaces) should have docstrings starting with this language. +# This allows us to validate that primitives have changed their descriptions/docstrings to something different. +DEFAULT_DESCRIPTION = "A base class for primitives" + + +class CallResult(typing.Generic[T]): + """ + Some methods return additional metadata about the method call itself + (which is different to metadata about the value returned, which is stored + in ``metadata`` attribute of the value itself). + + For ``produce`` method call, ``has_finished`` is ``True`` if the last call + to ``produce`` has produced the final outputs and a call with more time or + more iterations cannot get different outputs. + + For ``fit`` method call, ``has_finished`` is ``True`` if a primitive has been + fully fitted on current training data and further calls to ``fit`` are + unnecessary and will not change anything. ``False`` means that more iterations + can be done (but it does not necessary mean that more iterations are beneficial). + + If a primitive has iterations internally, then ``iterations_done`` contains + how many of those iterations have been made during the last call. If primitive + does not support them, ``iterations_done`` is ``None``. + + Those methods should return value wrapped into this class. + + Parameters + ---------- + value: + The value itself of the method call. + has_finished: + Set to ``True`` if it is not reasonable to call the method again anymore. + iterations_done: + How many iterations have been done during a method call, if any. + """ + + def __init__(self, value: T, has_finished: bool = True, iterations_done: int = None) -> None: + self.value = value + self.has_finished = has_finished + self.iterations_done = iterations_done + + +class MultiCallResult: + """ + Similar to `CallResult`, but used by ``multi_produce``. + + It has no precise typing information because type would have to be a dependent type + which is not (yet) supported in standard Python typing. Type would depend on + ``produce_methods`` argument and output types of corresponding produce methods. + + Parameters + ---------- + values: + A dict of values mapping between produce method names and their value outputs. + has_finished: + Set to ``True`` if it is not reasonable to call the method again anymore. + iterations_done: + How many iterations have been done during a method call, if any. + """ + + def __init__(self, values: typing.Dict, has_finished: bool = True, iterations_done: int = None) -> None: + self.values = values + self.has_finished = has_finished + self.iterations_done = iterations_done + + +class PrimitiveBaseMeta(utils.GenericMetaclass): + """ + A metaclass which provides the primitive instance to metadata so that primitive + metadata can be automatically generated. + """ + + def __new__(mcls, class_name, bases, namespace, **kwargs): # type: ignore + cls = super().__new__(mcls, class_name, bases, namespace, **kwargs) + + if inspect.isabstract(cls): + return cls + + if not isinstance(cls.metadata, metadata_base.PrimitiveMetadata): + raise TypeError("'metadata' attribute is not an instance of PrimitiveMetadata.") + + # We are creating a class-level logger so that it can be used both from class and instance methods. + # "python_path" is a required metadata value, but we leave metadata validation to later. 
+ python_path = cls.metadata.query().get('python_path', None) + if python_path is not None: + cls.logger = logging.getLogger(python_path) + + cls.metadata.contribute_to_class(cls) + + return cls + + def __repr__(cls) -> str: + if getattr(cls, 'metadata', None) is not None: + return cls.metadata.query().get('python_path', super().__repr__()) + else: + return super().__repr__() + + +class DockerContainer(typing.NamedTuple): + """ + A tuple suitable to describe connection information necessary to connect + to exposed ports of a running Docker container. + + Attributes + ---------- + address: + An address at which the Docker container is available. + ports: + Mapping between image's exposed ports and real ports. E.g., + ``{'80/tcp': 80}``. + """ + + address: str + ports: typing.Dict[str, int] + + +class PrimitiveBase(typing.Generic[Inputs, Outputs, Params, Hyperparams], metaclass=PrimitiveBaseMeta): + """ + A base class for primitives. + + Class is parameterized using four type variables, ``Inputs``, ``Outputs``, ``Params``, + and ``Hyperparams``. + + ``Params`` has to be a subclass of `d3m.metadata.params.Params` and should define + all fields and their types for parameters which the primitive is fitting. + + ``Hyperparams`` has to be a subclass of a `d3m.metadata.hyperparams.Hyperparams`. + Hyper-parameters are those primitive's parameters which primitive is not fitting and + generally do not change during a life-time of a primitive. + + ``Params`` and ``Hyperparams`` have to be picklable and copyable. See `pickle`, + `copy`, and `copyreg` Python modules for more information. + + In this context we use term method arguments to mean both formal parameters and + actual parameters of a method. We do this to not confuse method parameters with + primitive parameters (``Params``). + + All arguments to all methods are keyword-only. No ``*args`` or ``**kwargs`` should + ever be used in any method. + + Standardized interface use few public attributes and no other public attributes are + allowed to assure future compatibility. For your attributes use the convention that + private symbols should start with ``_``. + + Primitives can have methods which are not part of standardized interface classes: + + * Additional "produce" methods which are prefixed with ``produce_`` and have + the same semantics as ``produce`` but potentially return different output + container types instead of ``Outputs`` (in such primitive ``Outputs`` is seen as + primary output type, but the primitive also has secondary output types). + They should return ``CallResult`` and have ``timeout`` and ``iterations`` arguments. + * Private methods prefixed with ``_``. + + No other public additional methods are allowed. If this represents a problem for you, + open an issue. (The rationale is that for other methods an automatic system will not + understand the semantics of the method.) + + Method arguments which start with ``_`` are seen as private and can be used for arguments + useful for debugging and testing, but they should not be used by (or even known to) a + caller during normal execution. Such arguments have to be optional (have a default value) + so that the method can be called without the knowledge of the argument. + + All arguments to all methods and all hyper-parameters together are seen as arguments to + the primitive as a whole. They are identified by their names. This means that any argument + name must have the same type and semantics across all methods, effectively be the same argument. 
+ If a method argument matches in name a hyper-parameter, it has to match it in type and semantics + as well. Such method argument overrides a hyper-parameter for a method call. All this is necessary + so that callers can have easier time determine what values to pass to arguments and that it is + easier to describe what all values are inputs to a primitive as a whole (set of all + arguments). + + To recap, subclasses can extend arguments of standard methods with explicit typed keyword + arguments used for the method call, or define new "produce" methods with arbitrary explicit + typed keyword arguments. There are multiple kinds of such arguments allowed: + + * An (additional) input argument of any container type and not necessary of ``Inputs`` + (in such primitive ``Inputs`` is seen as primary input type, but the primitive also has + secondary input types). + * An argument which is overriding a hyper-parameter for the duration of the call. + It should match a hyper-parameter in name and type. It should be a required argument + (no default value) which the caller has to supply (or with a default value of a + hyper-parameter, or with the same hyper-parameter as it was passed to the constructor, + or with some other value). This is meant just for fine-control by a caller during fitting + or producing, e.g., for a threshold or learning rate, and is not reasonable for most + hyper-parameters. + * An (additional) value argument which is one of standard data types, but not a container type. + In this case a caller will try to satisfy the input by creating part of a pipeline which + ends with a primitive with singleton produce method and extract the singleton value and + pass it without a container. This kind of an argument is **discouraged** and should probably + be a hyper-parameter instead (because it is unclear how can a caller determine which value + is a reasonable value to pass in an automatic way), but it is defined for completeness and + so that existing pipelines can be easier described. + * A private argument prefixed with ``_`` which is used for debugging and testing. + It should not be used by (or even known to) a caller during normal execution. + Such argument has to be optional (have a default value) so that the method can be called + without the knowledge of the argument. + + Each primitive's class automatically gets an instance of Python's logging logger stored + into its ``logger`` class attribute. The instance is made under the name of primitive's + ``python_path`` metadata value. Primitives can use this logger to log information at + various levels (debug, warning, error) and even associate extra data with log record + using the ``extra`` argument to the logger calls. + + Subclasses of this class allow functional compositionality. + + Attributes + ---------- + metadata: + Primitive's metadata. Available as a class attribute. + logger: + Primitive's logger. Available as a class attribute. + hyperparams: + Hyperparams passed to the constructor. + random_seed: + Random seed passed to the constructor. + docker_containers: + A dict mapping Docker image keys from primitive's metadata to (named) tuples containing + container's address under which the container is accessible by the primitive, and a + dict mapping exposed ports to ports on that address. + volumes: + A dict mapping volume keys from primitive's metadata to file and directory paths + where downloaded and extracted files are available to the primitive. 
+ temporary_directory: + An absolute path to a temporary directory a primitive can use to store any files + for the duration of the current pipeline run phase. Directory is automatically + cleaned up after the current pipeline run phase finishes. + """ + + # Primitive's metadata (annotation) should be put on "metadata' attribute to provide + # all fields (which cannot be determined automatically) inside the code. In this way metadata + # is close to the code and it is easier for consumers to make sure metadata they are using + # is really matching the code they are using. PrimitiveMetadata class will automatically + # extract additional metadata and update itself with metadata about code and other things + # it can extract automatically. + metadata: typing.ClassVar[metadata_base.PrimitiveMetadata] = None + + # This gets automatically set to primitive's logger in metaclass. + logger: typing.ClassVar[logging.Logger] = None + + hyperparams: Hyperparams + random_seed: int + docker_containers: typing.Dict[str, DockerContainer] + volumes: typing.Dict[str, str] + temporary_directory: str + + def __init__(self, *, hyperparams: Hyperparams, random_seed: int = 0, + docker_containers: typing.Dict[str, DockerContainer] = None, + volumes: typing.Dict[str, str] = None, + temporary_directory: str = None) -> None: + """ + All primitives should accept all their hyper-parameters in a constructor as one value, + an instance of type ``Hyperparams``. + + Provided random seed should control all randomness used by this primitive. + Primitive should behave exactly the same for the same random seed across multiple + invocations. You can call `numpy.random.RandomState(random_seed)` to obtain an + instance of a random generator using provided seed. If your primitive does not + use randomness, consider not exposing this argument in your primitive's constructor + to signal that. + + Primitives can be wrappers around or use one or more Docker images which they can + specify as part of ``installation`` field in their metadata. Each Docker image listed + there has a ``key`` field identifying that image. When primitive is created, + ``docker_containers`` contains a mapping between those keys and connection information + which primitive can use to connect to a running Docker container for a particular Docker + image and its exposed ports. Docker containers might be long running and shared between + multiple instances of a primitive. If your primitive does not use Docker images, + consider not exposing this argument in your primitive's constructor. + + **Note**: Support for primitives using Docker containers has been put on hold. + Currently it is not expected that any runtime running primitives will run + Docker containers for a primitive. + + Primitives can also use additional static files which can be added as a dependency + to ``installation`` metadata. When done so, given volumes are provided to the + primitive through ``volumes`` argument to the primitive's constructor as a + dict mapping volume keys to file and directory paths where downloaded and + extracted files are available to the primitive. All provided files and directories + are read-only. If your primitive does not use static files, consider not exposing + this argument in your primitive's constructor. + + Primitives can also use the provided temporary directory to store any files for + the duration of the current pipeline run phase. Directory is automatically + cleaned up after the current pipeline run phase finishes. 
Do not store in this + directory any primitive's state you would like to preserve between "fit" and + "produce" phases of pipeline execution. Use ``Params`` for that. The main intent + of this temporary directory is to store files referenced by any ``Dataset`` object + your primitive might create and followup primitives in the pipeline should have + access to. When storing files into this directory consider using capabilities + of Python's `tempfile` module to generate filenames which will not conflict with + any other files stored there. Use provided temporary directory as ``dir`` argument + to set it as base directory to generate additional temporary files and directories + as needed. If your primitive does not use temporary directory, consider not exposing + this argument in your primitive's constructor. + + No other arguments to the constructor are allowed (except for private arguments) + because we want instances of primitives to be created without a need for any other + prior computation. + + Module in which a primitive is defined should be kept lightweight and on import not do + any (pre)computation, data loading, or resource allocation/reservation. Any loading + and resource allocation/reservation should be done in the constructor. Any (pre)computation + should be done lazily when needed once requested through other methods and not in the constructor. + """ + + self.hyperparams = hyperparams + self.random_seed = random_seed + if docker_containers is None: + self.docker_containers: typing.Dict[str, DockerContainer] = {} + else: + self.docker_containers = docker_containers + if volumes is None: + self.volumes: typing.Dict[str, str] = {} + else: + self.volumes = volumes + self.temporary_directory = temporary_directory + + @abc.abstractmethod + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + """ + Produce primitive's best choice of the output for each of the inputs. + + The output value should be wrapped inside ``CallResult`` object before returning. + + In many cases producing an output is a quick operation in comparison with ``fit``, but not + all cases are like that. For example, a primitive can start a potentially long optimization + process to compute outputs. ``timeout`` and ``iterations`` can serve as a way for a caller + to guide the length of this process. + + Ideally, a primitive should adapt its call to try to produce the best outputs possible + inside the time allocated. If this is not possible and the primitive reaches the timeout + before producing outputs, it should raise a ``TimeoutError`` exception to signal that the + call was unsuccessful in the given time. The state of the primitive after the exception + should be as the method call has never happened and primitive should continue to operate + normally. The purpose of ``timeout`` is to give opportunity to a primitive to cleanly + manage its state instead of interrupting execution from outside. Maintaining stable internal + state should have precedence over respecting the ``timeout`` (caller can terminate the + misbehaving primitive from outside anyway). If a longer ``timeout`` would produce + different outputs, then ``CallResult``'s ``has_finished`` should be set to ``False``. + + Some primitives have internal iterations (for example, optimization iterations). + For those, caller can provide how many of primitive's internal iterations + should a primitive do before returning outputs. Primitives should make iterations as + small as reasonable. 
If ``iterations`` is ``None``, then there is no limit on + how many iterations the primitive should do and primitive should choose the best amount + of iterations on its own (potentially controlled through hyper-parameters). + If ``iterations`` is a number, a primitive has to do those number of iterations, + if possible. ``timeout`` should still be respected and potentially less iterations + can be done because of that. Primitives with internal iterations should make + ``CallResult`` contain correct values. + + For primitives which do not have internal iterations, any value of ``iterations`` + means that they should run fully, respecting only ``timeout``. + + If primitive should have been fitted before calling this method, but it has not been, + primitive should raise a ``PrimitiveNotFittedError`` exception. + + Parameters + ---------- + inputs: + The inputs of shape [num_inputs, ...]. + timeout: + A maximum time this primitive should take to produce outputs during this method call, in seconds. + iterations: + How many of internal iterations should the primitive do. + + Returns + ------- + The outputs of shape [num_inputs, ...] wrapped inside ``CallResult``. + """ + + def multi_produce(self, *, produce_methods: typing.Sequence[str], inputs: Inputs, timeout: float = None, iterations: int = None) -> MultiCallResult: + """ + A method calling multiple produce methods at once. + + When a primitive has multiple produce methods it is common that they might compute the + same internal results for same inputs but return different representations of those results. + If caller is interested in multiple of those representations, calling multiple produce + methods might lead to recomputing same internal results multiple times. To address this, + this method allows primitive author to implement an optimized version which computes + internal results only once for multiple calls of produce methods, but return those different + representations. + + If any additional method arguments are added to primitive's produce method(s), they have + to be added to this method as well. This method should accept an union of all arguments + accepted by primitive's produce method(s) and then use them accordingly when computing + results. + + The default implementation of this method just calls all produce methods listed in + ``produce_methods`` in order and is potentially inefficient. + + If primitive should have been fitted before calling this method, but it has not been, + primitive should raise a ``PrimitiveNotFittedError`` exception. + + Parameters + ---------- + produce_methods: + A list of names of produce methods to call. + inputs: + The inputs given to all produce methods. + timeout: + A maximum time this primitive should take to produce outputs for all produce methods + listed in ``produce_methods`` argument, in seconds. + iterations: + How many of internal iterations should the primitive do. + + Returns + ------- + A dict of values for each produce method wrapped inside ``MultiCallResult``. + """ + + return self._multi_produce(produce_methods=produce_methods, timeout=timeout, iterations=iterations, inputs=inputs) + + def _multi_produce(self, *, produce_methods: typing.Sequence[str], timeout: float = None, iterations: int = None, **kwargs: typing.Dict[str, typing.Any]) -> MultiCallResult: + """ + We do not want a public API to use ``kwargs``, but such implementation allows easier subclassing and reuse + of a default implementation. Do not call directly. 
+ """ + + results = [] + for method_name in produce_methods: + if method_name != 'produce' and not method_name.startswith('produce_'): + raise exceptions.InvalidArgumentValueError("Invalid produce method name '{method_name}'.".format(method_name=method_name)) + + if not hasattr(self, method_name): + raise exceptions.InvalidArgumentValueError("Unknown produce method name '{method_name}'.".format(method_name=method_name)) + + try: + expected_arguments = set(self.metadata.query()['primitive_code'].get('instance_methods', {})[method_name]['arguments']) + except KeyError as error: + raise exceptions.InvalidArgumentValueError("Unknown produce method name '{method_name}'.".format(method_name=method_name)) from error + + arguments = {name: value for name, value in kwargs.items() if name in expected_arguments} + + start = time.perf_counter() + results.append(getattr(self, method_name)(timeout=timeout, iterations=iterations, **arguments)) + delta = time.perf_counter() - start + + # Decrease the amount of time available to other calls. This delegates responsibility + # of raising a "TimeoutError" exception to produce methods themselves. It also assumes + # that if one passes a negative timeout value to a produce method, it raises a + # "TimeoutError" exception correctly. + if timeout is not None: + timeout -= delta + + if not isinstance(results[-1], CallResult): + raise exceptions.InvalidReturnTypeError("Primitive's produce method '{method_name}' has not returned a CallResult.".format( + method_name=method_name, + )) + + # We return the maximum number of iterations done by any produce method we called. + iterations_done = None + for result in results: + if result.iterations_done is not None: + if iterations_done is None: + iterations_done = result.iterations_done + else: + iterations_done = max(iterations_done, result.iterations_done) + + return MultiCallResult( + values={name: result.value for name, result in zip(produce_methods, results)}, + has_finished=all(result.has_finished for result in results), + iterations_done=iterations_done, + ) + + def fit_multi_produce(self, *, produce_methods: typing.Sequence[str], inputs: Inputs, outputs: Outputs, timeout: float = None, iterations: int = None) -> MultiCallResult: + """ + A method calling ``fit`` and after that multiple produce methods at once. + + This method allows primitive author to implement an optimized version of both fitting + and producing a primitive on same data. + + If any additional method arguments are added to primitive's ``set_training_data`` method + or produce method(s), or removed from them, they have to be added to or removed from this + method as well. This method should accept an union of all arguments accepted by primitive's + ``set_training_data`` method and produce method(s) and then use them accordingly when + computing results. + + The default implementation of this method just calls first ``set_training_data`` method, + ``fit`` method, and all produce methods listed in ``produce_methods`` in order and is + potentially inefficient. + + Parameters + ---------- + produce_methods: + A list of names of produce methods to call. + inputs: + The inputs given to ``set_training_data`` and all produce methods. + outputs: + The outputs given to ``set_training_data``. + timeout: + A maximum time this primitive should take to both fit the primitive and produce outputs + for all produce methods listed in ``produce_methods`` argument, in seconds. 
+ iterations: + How many of internal iterations should the primitive do for both fitting and producing + outputs of all produce methods. + + Returns + ------- + A dict of values for each produce method wrapped inside ``MultiCallResult``. + """ + + return self._fit_multi_produce(produce_methods=produce_methods, timeout=timeout, iterations=iterations, inputs=inputs, outputs=outputs) + + def _fit_multi_produce(self, *, produce_methods: typing.Sequence[str], timeout: float = None, iterations: int = None, **kwargs: typing.Dict[str, typing.Any]) -> MultiCallResult: + """ + We do not want a public API to use ``kwargs``, but such implementation allows easier subclassing and reuse + of a default implementation. Do not call directly. + """ + + try: + expected_arguments = set(self.metadata.query()['primitive_code'].get('instance_methods', {})['set_training_data']['arguments']) + except KeyError as error: + raise exceptions.InvalidArgumentValueError("Unknown produce method name '{method_name}'.".format(method_name='set_training_data')) from error + + arguments = {name: value for name, value in kwargs.items() if name in expected_arguments} + + start = time.perf_counter() + self.set_training_data(**arguments) # type: ignore + delta = time.perf_counter() - start + + # Decrease the amount of time available to other calls. This delegates responsibility + # of raising a "TimeoutError" exception to fit and produce methods themselves. + # It also assumes that if one passes a negative timeout value to a fit or a produce + # method, it raises a "TimeoutError" exception correctly. + if timeout is not None: + timeout -= delta + + start = time.perf_counter() + fit_result = self.fit(timeout=timeout, iterations=iterations) + delta = time.perf_counter() - start + + if timeout is not None: + timeout -= delta + + if not isinstance(fit_result, CallResult): + raise exceptions.InvalidReturnTypeError("Primitive's fit method has not returned a CallResult.") + + produce_results = self._multi_produce(produce_methods=produce_methods, timeout=timeout, iterations=iterations, **kwargs) + + results: typing.List[typing.Union[CallResult, MultiCallResult]] = [fit_result, produce_results] + + # We return the maximum number of iterations done by a fit method or any produce method we called. + iterations_done = None + for result in results: + if result.iterations_done is not None: + if iterations_done is None: + iterations_done = result.iterations_done + else: + iterations_done = max(iterations_done, result.iterations_done) + + return MultiCallResult( + # We return values just from produce methods. + values=produce_results.values, + has_finished=all(result.has_finished for result in results), + iterations_done=iterations_done, + ) + + @abc.abstractmethod + def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None: + """ + Sets current training data of this primitive. + + This marks training data as changed even if new training data is the same as + previous training data. + + Standard sublasses in this package do not adhere to the Liskov substitution principle when + inheriting this method because they do not necessary accept all arguments found in the base + class. This means that one has to inspect which arguments are accepted at runtime, or in + other words, one has to inspect which exactly subclass a primitive implements, if + you are accepting a wider range of primitives. This relaxation is allowed only for + standard subclasses found in this package. 
Primitives themselves should not break + the Liskov substitution principle but should inherit from a suitable base class. + + Parameters + ---------- + inputs: + The inputs. + outputs: + The outputs. + """ + + @abc.abstractmethod + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + """ + Fits primitive using inputs and outputs (if any) using currently set training data. + + The returned value should be a ``CallResult`` object with ``value`` set to ``None``. + + If ``fit`` has already been called in the past on different training data, + this method fits it **again from scratch** using currently set training data. + + On the other hand, caller can call ``fit`` multiple times on the same training data + to continue fitting. + + If ``fit`` fully fits using provided training data, there is no point in making further + calls to this method with same training data, and in fact further calls can be noops, + or a primitive can decide to fully refit from scratch. + + In the case fitting can continue with same training data (even if it is maybe not reasonable, + because the internal metric primitive is using looks like fitting will be degrading), if ``fit`` + is called again (without setting training data), the primitive has to continue fitting. + + Caller can provide ``timeout`` information to guide the length of the fitting process. + Ideally, a primitive should adapt its fitting process to try to do the best fitting possible + inside the time allocated. If this is not possible and the primitive reaches the timeout + before fitting, it should raise a ``TimeoutError`` exception to signal that fitting was + unsuccessful in the given time. The state of the primitive after the exception should be + as the method call has never happened and primitive should continue to operate normally. + The purpose of ``timeout`` is to give opportunity to a primitive to cleanly manage + its state instead of interrupting execution from outside. Maintaining stable internal state + should have precedence over respecting the ``timeout`` (caller can terminate the misbehaving + primitive from outside anyway). If a longer ``timeout`` would produce different fitting, + then ``CallResult``'s ``has_finished`` should be set to ``False``. + + Some primitives have internal fitting iterations (for example, epochs). For those, caller + can provide how many of primitive's internal iterations should a primitive do before returning. + Primitives should make iterations as small as reasonable. If ``iterations`` is ``None``, + then there is no limit on how many iterations the primitive should do and primitive should + choose the best amount of iterations on its own (potentially controlled through + hyper-parameters). If ``iterations`` is a number, a primitive has to do those number of + iterations (even if not reasonable), if possible. ``timeout`` should still be respected + and potentially less iterations can be done because of that. Primitives with internal + iterations should make ``CallResult`` contain correct values. + + For primitives which do not have internal iterations, any value of ``iterations`` + means that they should fit fully, respecting only ``timeout``. + + Parameters + ---------- + timeout: + A maximum time this primitive should be fitting during this method call, in seconds. + iterations: + How many of internal iterations should the primitive do. + + Returns + ------- + A ``CallResult`` with ``None`` value. 
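+
+ As an illustration only (not part of the interface itself), a subclass with internal
+ epochs might structure its ``fit`` roughly as follows; the ``epochs`` hyper-parameter
+ and the ``_run_epoch`` helper are hypothetical::
+
+     def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
+         # Let the caller cap the number of epochs; otherwise use a hyper-parameter.
+         epochs = iterations if iterations is not None else self.hyperparams['epochs']
+         done = 0
+         for _ in range(epochs):
+             self._run_epoch()  # assumed helper which updates internal state from training data
+             done += 1
+         # A real implementation should also watch "timeout" and set "has_finished"
+         # to False if more iterations could still change the fit.
+         return CallResult(None, True, done)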
+ """ + + @abc.abstractmethod + def get_params(self) -> Params: + """ + Returns parameters of this primitive. + + Parameters are all parameters of the primitive which can potentially change during a life-time of + a primitive. Parameters which cannot are passed through constructor. + + Parameters should include all data which is necessary to create a new instance of this primitive + behaving exactly the same as this instance, when the new instance is created by passing the same + parameters to the class constructor and calling ``set_params``. + + No other arguments to the method are allowed (except for private arguments). + + Returns + ------- + An instance of parameters. + """ + + @abc.abstractmethod + def set_params(self, *, params: Params) -> None: + """ + Sets parameters of this primitive. + + Parameters are all parameters of the primitive which can potentially change during a life-time of + a primitive. Parameters which cannot are passed through constructor. + + No other arguments to the method are allowed (except for private arguments). + + Parameters + ---------- + params: + An instance of parameters. + """ + + def __getstate__(self) -> dict: + """ + Returns state which is used to pickle an instance of a primitive. + + By default it returns standard constructor arguments and value + returned from ``get_params`` method. + + Consider extending default implementation if your primitive accepts + additional constructor arguments you would like to preserve when pickling. + + Note that unpickled primitive instances can generally continue to work only + inside the same environment they were pickled in because they continue to use + same ``docker_containers``, ``volumes``, and ``temporary_directory`` values + passed initially to primitive's constructor. Those generally do not work in + another environment where those resources might be available differently. + Consider constructing primitive instance directly providing updated constructor + arguments and then using ``get_params``/``set_params`` to restore primitive's + state. + + Returns + ------- + State to pickle. + """ + + standard_arguments = { + 'hyperparams': self.hyperparams, + 'random_seed': self.random_seed, + 'docker_containers': self.docker_containers, + 'volumes': self.volumes, + 'temporary_directory': self.temporary_directory, + } + expected_constructor_arguments = self.metadata.query()['primitive_code'].get('instance_methods', {})['__init__']['arguments'] + + return { + 'constructor': {name: value for name, value in standard_arguments.items() if name in expected_constructor_arguments}, + 'params': self.get_params(), + } + + def __setstate__(self, state: dict) -> None: + """ + Uses ``state`` to restore the state of a primitive when unpickling. + + By default it passes constructor arguments to the constructor and + calls ``get_params``. + + Parameters + ---------- + state: + Unpickled state. 
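+
+ For example (illustrative only, and assuming ``primitive`` is an already constructed
+ and fitted instance whose Docker containers, volumes, and temporary directory are
+ still available in the current environment)::
+
+     import pickle
+
+     restored = pickle.loads(pickle.dumps(primitive))
+     # "restored" was rebuilt by calling the constructor with the pickled
+     # constructor arguments and then "set_params" with the pickled params.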
+ """ + + self.__init__(**state['constructor']) # type: ignore + self.set_params(params=state['params']) + + def __repr__(self) -> str: + if 'random_seed' in self.metadata.query().get('primitive_code', {}).get('instance_methods', {}).get('__init__', {}).get('arguments', []): + return '{class_name}(hyperparams={hyperparams}, random_seed={random_seed})'.format( + class_name=self.metadata.query()['python_path'], + hyperparams=self.hyperparams, + random_seed=self.random_seed, + ) + else: + return '{class_name}(hyperparams={hyperparams})'.format( + class_name=self.metadata.query()['python_path'], + hyperparams=self.hyperparams, + ) + + +class ContinueFitMixin(typing.Generic[Inputs, Outputs, Params, Hyperparams], metaclass=utils.GenericMetaclass): + @abc.abstractmethod + def continue_fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + """ + Similar to base ``fit``, this method fits the primitive using inputs and outputs (if any) + using currently set training data. + + The difference is what happens when currently set training data is different from + what the primitive might have already been fitted on. ``fit`` resets parameters and + refits the primitive (restarts fitting), while ``continue_fit`` fits the primitive + further on new training data. ``fit`` does **not** have to be called before ``continue_fit``, + calling ``continue_fit`` first starts fitting as well. + + Caller can still call ``continue_fit`` multiple times on the same training data as well, + in which case primitive should try to improve the fit in the same way as with ``fit``. + + From the perspective of a caller of all other methods, the training data in effect + is still just currently set training data. If a caller wants to call ``gradient_output`` + on all data on which the primitive has been fitted through multiple calls of ``continue_fit`` + on different training data, the caller should pass all this data themselves through + another call to ``set_training_data``, do not call ``fit`` or ``continue_fit`` again, + and use ``gradient_output`` method. In this way primitives which truly support + continuation of fitting and need only the latest data to do another fitting, do not + have to keep all past training data around themselves. + + If a primitive supports this mixin, then both ``fit`` and ``continue_fit`` can be + called. ``continue_fit`` always continues fitting, if it was started through ``fit`` + or ``continue_fit`` and fitting has not already finished. Calling ``fit`` always restarts + fitting after ``continue_fit`` has been called, even if training data has not changed. + + Primitives supporting this mixin and which operate on categorical target columns should + use ``all_distinct_values`` metadata to obtain which all values (labels) can be in + a target column, even if currently set training data does not contain all those values. + + Parameters + ---------- + timeout: + A maximum time this primitive should be fitting during this method call, in seconds. + iterations: + How many of internal iterations should the primitive do. + + Returns + ------- + A ``CallResult`` with ``None`` value. + """ + + +class SamplingCompositionalityMixin(typing.Generic[Inputs, Outputs, Params, Hyperparams], metaclass=utils.GenericMetaclass): + """ + This mixin signals to a caller that the primitive is probabilistic but + may be likelihood free. 
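+
+ For example (usage sketch only), a caller can draw multiple samples per input and
+ index into the result, whose outer dimension is the number of samples::
+
+     result = primitive.sample(inputs=inputs, num_samples=10)
+     samples = result.value  # shaped [10, num_inputs, ...]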
+ """ + + @abc.abstractmethod + def sample(self, *, inputs: Inputs, num_samples: int = 1, timeout: float = None, iterations: int = None) -> CallResult[typing.Sequence[Outputs]]: + """ + Sample output for each input from ``inputs`` ``num_samples`` times. + + Semantics of ``timeout`` and ``iterations`` is the same as in ``produce``. + + Parameters + ---------- + inputs: + The inputs of shape [num_inputs, ...]. + num_samples: + The number of samples to return in a set of samples. + timeout: + A maximum time this primitive should take to sample outputs during this method call, in seconds. + iterations: + How many of internal iterations should the primitive do. + + Returns + ------- + The multiple sets of samples of shape [num_samples, num_inputs, ...] wrapped inside + ``CallResult``. While the output value type is specified as ``Sequence[Outputs]``, the + output value can be in fact any container type with dimensions/shape equal to combined + ``Sequence[Outputs]`` dimensions/shape. Subclasses should specify which exactly type + the output is. + """ + + +class ProbabilisticCompositionalityMixin(typing.Generic[Inputs, Outputs, Params, Hyperparams], metaclass=utils.GenericMetaclass): + """ + This mixin provides additional abstract methods which primitives should implement to + help callers with doing various end-to-end refinements using probabilistic + compositionality. + + This mixin adds methods to support at least: + + * Metropolis-Hastings + + Mixin should be used together with ``SamplingCompositionalityMixin`` mixin. + """ + + @abc.abstractmethod + def log_likelihoods(self, *, outputs: Outputs, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + """ + Returns log probability of outputs given inputs and params under this primitive: + + log(p(output_i | input_i, params)) + + Parameters + ---------- + outputs: + The outputs. The number of samples should match ``inputs``. + inputs: + The inputs. The number of samples should match ``outputs``. + timeout: + A maximum time this primitive should take to produce outputs during this method call, in seconds. + iterations: + How many of internal iterations should the primitive do. + + Returns + ------- + log(p(output_i | input_i, params))) wrapped inside ``CallResult``. + The number of columns should match the number of target columns in ``outputs``. + """ + + def log_likelihood(self, *, outputs: Outputs, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + """ + Returns log probability of outputs given inputs and params under this primitive: + + sum_i(log(p(output_i | input_i, params))) + + By default it calls ``log_likelihoods`` and tries to automatically compute a sum, but subclasses can + implement a more efficient or even correct version. + + Parameters + ---------- + outputs: + The outputs. The number of samples should match ``inputs``. + inputs: + The inputs. The number of samples should match ``outputs``. + timeout: + A maximum time this primitive should take to produce outputs during this method call, in seconds. + iterations: + How many of internal iterations should the primitive do. + + Returns + ------- + sum_i(log(p(output_i | input_i, params))) wrapped inside ``CallResult``. + The number of returned samples is always 1. + The number of columns should match the number of target columns in ``outputs``. 
+ """ + + result = self.log_likelihoods(outputs=outputs, inputs=inputs, timeout=timeout, iterations=iterations) + + return CallResult(utils.columns_sum(result.value), result.has_finished, result.iterations_done) + + +Container = typing.TypeVar('Container', bound=typing.Union[types.Container]) # type: ignore + + +# TODO: This is not yet a properly defined type which would really be recognized similar to Container. +# You should specify a proper type in your subclass. Type checking might complain that your +# type does not match the parent type, but ignore it (add "type: ignore" comment to that line). +# This type will be fixed in the future. +class Gradients(typing.Generic[Container]): + """ + A type representing a structure similar to ``Container``, but the values are of type ``Optional[float]``. + Value is ``None`` if gradient for that part of the structure is not possible. + """ + + +class GradientCompositionalityMixin(typing.Generic[Inputs, Outputs, Params, Hyperparams], metaclass=utils.GenericMetaclass): + """ + This mixin provides additional abstract methods which primitives should implement to + help callers with doing various end-to-end refinements using gradient-based + compositionality. + + This mixin adds methods to support at least: + + * gradient-based, compositional end-to-end training + * regularized pre-training + * multi-task adaptation + * black box variational inference + * Hamiltonian Monte Carlo + """ + + @abc.abstractmethod + def gradient_output(self, *, outputs: Outputs, inputs: Inputs) -> Gradients[Outputs]: + """ + Returns the gradient of loss sum_i(L(output_i, produce_one(input_i))) with respect to outputs. + + When fit term temperature is set to non-zero, it should return the gradient with respect to outputs of: + + sum_i(L(output_i, produce_one(input_i))) + temperature * sum_i(L(training_output_i, produce_one(training_input_i))) + + When used in combination with the ``ProbabilisticCompositionalityMixin``, it returns gradient + of sum_i(log(p(output_i | input_i, params))) with respect to outputs. + + When fit term temperature is set to non-zero, it should return the gradient with respect to outputs of: + + sum_i(log(p(output_i | input_i, params))) + temperature * sum_i(log(p(training_output_i | training_input_i, params))) + + Parameters + ---------- + outputs: + The outputs. + inputs: + The inputs. + + Returns + ------- + A structure similar to ``Container`` but the values are of type ``Optional[float]``. + """ + + @abc.abstractmethod + def gradient_params(self, *, outputs: Outputs, inputs: Inputs) -> Gradients[Params]: + """ + Returns the gradient of loss sum_i(L(output_i, produce_one(input_i))) with respect to params. + + When fit term temperature is set to non-zero, it should return the gradient with respect to params of: + + sum_i(L(output_i, produce_one(input_i))) + temperature * sum_i(L(training_output_i, produce_one(training_input_i))) + + When used in combination with the ``ProbabilisticCompositionalityMixin``, it returns gradient + of sum_i(log(p(output_i | input_i, params))) with respect to params. + + When fit term temperature is set to non-zero, it should return the gradient with respect to params of: + + sum_i(log(p(output_i | input_i, params))) + temperature * sum_i(log(p(training_output_i | training_input_i, params))) + + Parameters + ---------- + outputs: + The outputs. + inputs: + The inputs. + + Returns + ------- + A version of ``Params`` with all differentiable fields from ``Params`` and values set to gradient for each parameter. 
+ """ + + def forward(self, *, inputs: Inputs) -> Outputs: + """ + Similar to ``produce`` method but it is meant to be used for a forward pass during + backpropagation-based end-to-end training. Primitive can implement it differently + than ``produce``, e.g., forward pass during training can enable dropout layers, or + ``produce`` might not compute gradients while ``forward`` does. + + By default it calls ``produce`` for one iteration. + + Parameters + ---------- + inputs: + The inputs of shape [num_inputs, ...]. + + Returns + ------- + The outputs of shape [num_inputs, ...]. + """ + + return self.produce(inputs=inputs, timeout=None, iterations=1).value # type: ignore + + @abc.abstractmethod + def backward(self, *, gradient_outputs: Gradients[Outputs], fine_tune: bool = False, fine_tune_learning_rate: float = 0.00001, + fine_tune_weight_decay: float = 0.00001) -> typing.Tuple[Gradients[Inputs], Gradients[Params]]: + """ + Returns the gradient with respect to inputs and with respect to params of a loss + that is being backpropagated end-to-end in a pipeline. + + This is the standard backpropagation algorithm: backpropagation needs to be preceded by a + forward propagation (``forward`` method call). + + Parameters + ---------- + gradient_outputs: + The gradient of the loss with respect to this primitive's output. During backpropagation, + this comes from the next primitive in the pipeline, i.e., the primitive whose input + is the output of this primitive during the forward execution with ``forward`` (and ``produce``). + fine_tune: + If ``True``, executes a fine-tuning gradient descent step as a part of this call. + This provides the most straightforward way of end-to-end training/fine-tuning. + fine_tune_learning_rate: + Learning rate for end-to-end training/fine-tuning gradient descent steps. + fine_tune_weight_decay: + L2 regularization (weight decay) coefficient for end-to-end training/fine-tuning gradient + descent steps. + + Returns + ------- + A tuple of the gradient with respect to inputs and with respect to params. + """ + + @abc.abstractmethod + def set_fit_term_temperature(self, *, temperature: float = 0) -> None: + """ + Sets the temperature used in ``gradient_output`` and ``gradient_params``. + + Parameters + ---------- + temperature: + The temperature to use, [0, inf), typically, [0, 1]. + """ + + +class LossFunctionMixin(typing.Generic[Inputs, Outputs, Params, Hyperparams], metaclass=utils.GenericMetaclass): + """ + Mixin which provides abstract methods for a caller to call to inspect which + loss function or functions a primitive is using internally, and to compute + loss on given inputs and outputs. + """ + + @abc.abstractmethod + def get_loss_functions(self) -> typing.Sequence[typing.Tuple[problem.PerformanceMetric, PrimitiveBase, None]]: # type: ignore + """ + Returns a list of loss functions used by the primitive. Each element of the list can be: + + * A D3M metric value of the loss function used by the primitive during the last fitting. + * Primitives can be passed to other primitives as arguments. As such, some primitives + can accept another primitive as a loss function to use, or use it internally. A primitive + can expose this loss primitive to others, providing directly an instance of the primitive + being used during the last fitting. + * ``None`` if using a non-standard loss function. Used so that the loss function can still + be exposed through ``loss`` and ``losses`` methods. + + It should return an empty list if the primitive does not use loss functions at all. 
+ + The order in the list matters because the loss function index is used for ``loss`` and ``losses`` methods. + + Returns + ------- + A list of: a D3M standard metric value of the loss function used, + or a D3M primitive used to compute loss, or ``None``. + """ + + @abc.abstractmethod + def losses(self, *, loss_function: int, inputs: Inputs, outputs: Outputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + """ + Returns the loss L(output_i, produce_one(input_i)) for each (input_i, output_i) pair + using a loss function used by the primitive during the last fitting, identified by the + ``loss_function`` index in the list of loss functions as returned by the ``get_loss_functions``. + + Parameters + ---------- + loss_function: + An index of the loss function to use. + inputs: + The inputs. + outputs: + The outputs. + timeout: + A maximum time this primitive should take to produce outputs during this method call, in seconds. + iterations: + How many of internal iterations should the primitive do. + + Returns + ------- + L(output_i, produce_one(input_i)) for each (input_i, output_i) pair + wrapped inside ``CallResult``. + The number of columns should match the number of target columns in ``outputs``. + """ + + def loss(self, *, loss_function: int, inputs: Inputs, outputs: Outputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + """ + Returns the loss sum_i(L(output_i, produce_one(input_i))) for all (input_i, output_i) pairs + using a loss function used by the primitive during the last fitting, identified by the + ``loss_function`` index in the list of loss functions as returned by the ``get_loss_functions``. + + By default it calls ``losses`` and tries to automatically compute a sum, but subclasses can + implement a more efficient or even correct version. + + Parameters + ---------- + loss_function: + An index of the loss function to use. + inputs: + The inputs. + outputs: + The outputs. + timeout: + A maximum time this primitive should take to produce outputs during this method call, in seconds. + iterations: + How many of internal iterations should the primitive do. + + Returns + ------- + sum_i(L(output_i, produce_one(input_i))) for all (input_i, output_i) pairs + wrapped inside ``CallResult``. + The number of returned samples is always 1. + The number of columns should match the number of target columns in ``outputs``. + """ + + result = self.losses(loss_function=loss_function, inputs=inputs, outputs=outputs, timeout=timeout, iterations=iterations) + + return CallResult(utils.columns_sum(result.value), result.has_finished, result.iterations_done) + + +class NeuralNetworkModuleMixin(typing.Generic[Inputs, Outputs, Params, Hyperparams, Module], metaclass=utils.GenericMetaclass): + """ + Mixin which provides an abstract method for connecting neural network + modules together. Mixin is parameterized with type variable ``Module``. + These modules can be either single layers, or they can be blocks of layers. + The construction of these modules is done by mapping the neural network + to the pipeline structure, where primitives (exposing modules through this + abstract method) are passed to followup layers through hyper-parameters. + The whole such structure is then passed for the final time as a hyper-parameter + to a training primitive which then builds the internal representation of the neural + network and trains it. 
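+
+ As a rough sketch (names are hypothetical and not part of this interface), a layer
+ primitive might implement the abstract method below by first asking the previous
+ layer primitive, received through a hyper-parameter, for its module and then
+ stacking its own module on top::
+
+     def get_neural_network_module(self, *, input_module: Module) -> Module:
+         previous = self.hyperparams['previous_layer']  # hypothetical hyper-parameter
+         if previous is None:
+             below = input_module  # this primitive is the initial layer
+         else:
+             below = previous.get_neural_network_module(input_module=input_module)
+         return self._make_module(below)  # assumed helper building this layer's module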
+ """ + + @abc.abstractmethod + def get_neural_network_module(self, *, input_module: Module) -> Module: + """ + Returns a neural network module corresponding to this primitive. That module + might be already connected to other modules, which can be done by + primitive calling this method recursively on other primitives. If this + is initial layer of the neural network, it input is provided through + ``input_module`` argument. + + Parameters + ---------- + input_module: + The input module to the initial layer of the neural network. + + Returns + ------- + The ``Module`` instance corresponding to this primitive. + """ + + +class NeuralNetworkObjectMixin(typing.Generic[Inputs, Outputs, Params, Hyperparams, Module], metaclass=utils.GenericMetaclass): + """ + Mixin which provides an abstract method which returns auxiliary objects for use + in representing neural networks as pipelines: loss functions, optimizers, etc. + + One should consider the use of other primitive metadata (primitive family, algorithm + types) to describe the primitive implementing this mixin and limit primitives + in hyper-parameters. + """ + + @abc.abstractmethod + def get_neural_network_object(self, module: Module) -> typing.Any: + """ + Returns a neural network object. The object is opaque from the perspective + of the pipeline. The caller is responsible to assure that the returned + object is of correct type and interface and that it is passed on to + a correct consumer understanding the object. + + Parameters + ---------- + module: + The module representing the neural network for which the object is requested. + It should be always provided even if particular implementation does not use it. + + Returns + ------- + An opaque object. + """ + + +def singleton(f: typing.Callable) -> typing.Callable: + """ + If a produce method is using this decorator, it is signaling that all outputs from the produce method are + sequences of length 1. This is useful because a caller can then directly extract this element. + + Example of such produce methods are produce methods of primitives which compute loss, which are returning + one number for multiple inputs. With this decorator they can return a sequence with this one number, but + caller which cares about the loss can extract it out. At the same time, other callers which operate + only on sequences can continue to operate normally. + + We can see other produce methods as mapping produce methods, and produce methods with this decorator as + reducing produce methods. + """ + + # Mark a produce method as a singleton. This is our custom flag. + f.__singleton__ = True # type: ignore + + return f + + +def inputs_across_samples(func: typing.Callable = None, inputs: typing.Sequence[str] = None, *args: str) -> typing.Callable: + """ + A produce method can use this decorator to signal which of the inputs (arguments) is using across + all samples and not sample by sample. + + For many produce methods it does not matter if it is called 100x on 1 sample or 1x on 100 samples, + but not all produce methods are like that and some produce results based on which all inputs were + given to them. If just a subset of inputs is given, results are different. An example of this is + ``produce_distance_matrix`` method which returns a NxN matrix where N is number of samples, computing + a distance from each sample to each other sample. + + When inputs have a primary key without uniqueness constraint, then "sample" for the purpose of + this decorator means all samples with the same primary key value. 
+ + Decorator accepts a list of inputs which are used across all samples. By default, `inputs` + argument name is used. + """ + + if callable(func): + if inputs is None: + inputs = ('inputs',) + + # Make sure values are unique and sorted. + inputs = tuple(sorted(set(inputs))) + + # List inputs which a produce method computes across samples. This is our custom flag. + # That listed names are really argument names is checked during metadata generation. + func.__inputs_across_samples__ = inputs # type: ignore + + return func + + else: + def decorator(f): + # We do not have to call "functool.update_wrapper" or something similar + # because we are in fact returning the same function "f", just with + # set "__inputs_across_samples__" attribute + return inputs_across_samples(f, [s for s in [func, inputs] + list(args) if isinstance(s, str)]) + + return decorator + + +# We register additional immutable types. We are doing it this way to overcome issues with import cycles. +# This is a tricky one. Primitive instances are generally mutable, they can change state when they are used. +# But as part of hyper-parameters, they can be used as instances and are seen as immutable because the idea +# is that TA2 will make a copy of the primitive before passing it in as a hyper-parameter, leaving initial +# instance intact. +if PrimitiveBase not in utils.additional_immutable_types: + utils.additional_immutable_types += (PrimitiveBase,) diff --git a/d3m/d3m/primitive_interfaces/clustering.py b/d3m/d3m/primitive_interfaces/clustering.py new file mode 100644 index 0000000..f3caa19 --- /dev/null +++ b/d3m/d3m/primitive_interfaces/clustering.py @@ -0,0 +1,103 @@ +import abc +import typing + +from d3m import types, utils +from d3m.primitive_interfaces.base import * +from d3m.primitive_interfaces.transformer import TransformerPrimitiveBase +from d3m.primitive_interfaces.unsupervised_learning import UnsupervisedLearnerPrimitiveBase + +__all__ = ('ClusteringLearnerPrimitiveBase', 'ClusteringTransformerPrimitiveBase', 'DistanceMatrixOutput', 'ClusteringDistanceMatrixMixin') + +DistanceMatrixOutput = typing.TypeVar('DistanceMatrixOutput', bound=typing.Union[types.Container]) # type: ignore + + +class ClusteringLearnerPrimitiveBase(UnsupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]): + """ + A base class for primitives implementing a clustering algorithm which learns clusters. + """ + + @abc.abstractmethod + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + """ + ``produce`` method should return a membership map. + + A data structure that for each input sample tells to which cluster that sample was assigned to. So ``Outputs`` + should have the same number of samples than ``Inputs``, and the value at each output sample should represent + a cluster. Consider representing it with just a simple numeric identifier. + + Parameters + ---------- + inputs: + The inputs of shape [num_inputs, ...]. + timeout: + A maximum time this primitive should take to produce outputs during this method call, in seconds. + iterations: + How many of internal iterations should the primitive do. + + Returns + ------- + The outputs of shape [num_inputs, 1] wrapped inside ``CallResult`` for a simple numeric + cluster identifier. + """ + + +class ClusteringTransformerPrimitiveBase(TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): + """ + A base class for primitives implementing a clustering algorithm without learning any sort of model. 
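+
+ A minimal sketch of such a primitive (illustrative only; it assumes ``Inputs`` and
+ ``Outputs`` are ``d3m.container.DataFrame`` and the ``threshold`` hyper-parameter
+ is hypothetical)::
+
+     class ThresholdCluster(ClusteringTransformerPrimitiveBase[Inputs, Outputs, Hyperparams]):
+         metadata = ...  # primitive metadata omitted for brevity
+
+         def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
+             # Assign each sample to cluster 0 or 1 based on its first value.
+             labels = (inputs.iloc[:, 0] > self.hyperparams['threshold']).astype(int)
+             outputs = container.DataFrame({'cluster': labels}, generate_metadata=True)
+             return CallResult(outputs)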
+ """ + + @abc.abstractmethod + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + """ + ``produce`` method should return a membership map. + + A data structure that for each input sample tells to which cluster that sample was assigned to. So ``Outputs`` + should have the same number of samples than ``Inputs``, and the value at each output sample should represent + a cluster. Consider representing it with just a simple numeric identifier. + + If an implementation of this method computes clusters based on the whole set of input samples, + use ``inputs_across_samples`` decorator to mark ``inputs`` as being computed across samples. + + Parameters + ---------- + inputs: + The inputs of shape [num_inputs, ...]. + timeout: + A maximum time this primitive should take to produce outputs during this method call, in seconds. + iterations: + How many of internal iterations should the primitive do. + + Returns + ------- + The outputs of shape [num_inputs, 1] wrapped inside ``CallResult`` for a simple numeric + cluster identifier. + """ + + +class ClusteringDistanceMatrixMixin(typing.Generic[Inputs, Outputs, Params, Hyperparams, DistanceMatrixOutput], metaclass=utils.GenericMetaclass): + @abc.abstractmethod + def produce_distance_matrix(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[DistanceMatrixOutput]: + """ + Semantics of this call are the same as the call to a regular ``produce`` method, just + that the output is a distance matrix instead of a membership map. + + Implementations of this method should use ``inputs_across_samples`` decorator to mark ``inputs`` + as being computed across samples. + + When this mixin is used with `ClusteringTransformerPrimitiveBase`, ``Params`` type variable should + be set to ``None``. + + Parameters + ---------- + inputs: + The inputs of shape [num_inputs, ...]. + timeout: + A maximum time this primitive should take to produce outputs during this method call, in seconds. + iterations: + How many of internal iterations should the primitive do. + + Returns + ------- + The distance matrix of shape [num_inputs, num_inputs, ...] wrapped inside ``CallResult``, where (i, j) element + of the matrix represent a distance between i-th and j-th sample in the inputs. + """ diff --git a/d3m/d3m/primitive_interfaces/distance.py b/d3m/d3m/primitive_interfaces/distance.py new file mode 100644 index 0000000..fd2e399 --- /dev/null +++ b/d3m/d3m/primitive_interfaces/distance.py @@ -0,0 +1,197 @@ +import abc +import typing + +from d3m import types +from d3m.primitive_interfaces.base import * +from d3m.primitive_interfaces.transformer import TransformerPrimitiveBase + +__all__ = ('PairwiseDistanceLearnerPrimitiveBase', 'PairwiseDistanceTransformerPrimitiveBase', 'InputLabels') + +InputLabels = typing.TypeVar('InputLabels', bound=typing.Union[types.Container]) # type: ignore + + +# Defining Generic with all type variables allows us to specify the order and an additional type variable. +class PairwiseDistanceLearnerPrimitiveBase(PrimitiveBase[Inputs, Outputs, Params, Hyperparams], typing.Generic[Inputs, InputLabels, Outputs, Params, Hyperparams]): + """ + A base class for primitives which learn distances (however defined) between two + different sets of instances. + + Class is parameterized using five type variables, ``Inputs``, ``InputLabels``, ``Outputs``, ``Params``, and ``Hyperparams``. 
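+
+ For example (usage sketch only), with 3 query instances in ``inputs`` and 5 reference
+ instances in ``second_inputs``, ``produce`` returns a 3 by 5 distance matrix::
+
+     result = primitive.produce(inputs=query_instances, second_inputs=reference_instances)
+     distances = result.value  # element (i, j) is the distance between query i and reference j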
+ """ + + @abc.abstractmethod + def produce(self, *, inputs: Inputs, second_inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: # type: ignore + """ + Computes distance matrix between two sets of data. + + Implementations of this method should use ``inputs_across_samples`` decorator to mark ``inputs`` + and ``second_inputs`` as being computed across samples. + + Parameters + ---------- + inputs: + The first set of collections of instances. + second_inputs: + The second set of collections of instances. + timeout: + A maximum time this primitive should take to produce outputs during this method call, in seconds. + iterations: + How many of internal iterations should the primitive do. + + Returns + --------- + A n by m distance matrix describing the relationship between each instance in inputs[0] and each instance + in inputs[1] (n and m are the number of instances in inputs[0] and inputs[1], respectively), + wrapped inside ``CallResult``. + """ + + @abc.abstractmethod + def set_training_data(self, *, inputs: Inputs, input_labels: InputLabels) -> None: # type: ignore + """ + Sets training data of this primitive. + + Parameters + ---------- + inputs: + The inputs. + input_labels: + A set of class labels for the inputs. + """ + + def multi_produce(self, *, produce_methods: typing.Sequence[str], inputs: Inputs, second_inputs: Inputs, timeout: float = None, iterations: int = None) -> MultiCallResult: # type: ignore + """ + A method calling multiple produce methods at once. + + Parameters + ---------- + produce_methods: + A list of names of produce methods to call. + inputs: + The first set of collections of instances. + second_inputs: + The second set of collections of instances. + timeout: + A maximum time this primitive should take to produce outputs for all produce methods + listed in ``produce_methods`` argument, in seconds. + iterations: + How many of internal iterations should the primitive do. + + Returns + ------- + A dict of values for each produce method wrapped inside ``MultiCallResult``. + """ + + return self._multi_produce(produce_methods=produce_methods, timeout=timeout, iterations=iterations, inputs=inputs, second_inputs=second_inputs) + + def fit_multi_produce(self, *, produce_methods: typing.Sequence[str], inputs: Inputs, input_labels: InputLabels, + second_inputs: Inputs, timeout: float = None, iterations: int = None) -> MultiCallResult: # type: ignore + """ + A method calling ``fit`` and after that multiple produce methods at once. + + Parameters + ---------- + produce_methods: + A list of names of produce methods to call. + inputs: + The first set of collections of instances. + input_labels: + A set of class labels for the inputs. + second_inputs: + The second set of collections of instances. + timeout: + A maximum time this primitive should take to both fit the primitive and produce outputs + for all produce methods listed in ``produce_methods`` argument, in seconds. + iterations: + How many of internal iterations should the primitive do for both fitting and producing + outputs of all produce methods. + + Returns + ------- + A dict of values for each produce method wrapped inside ``MultiCallResult``. 
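+
+ A typical call (illustrative only) fits on ``inputs`` and ``input_labels`` and then
+ produces the distance matrix in one step::
+
+     result = primitive.fit_multi_produce(
+         produce_methods=['produce'],
+         inputs=inputs, input_labels=input_labels, second_inputs=second_inputs,
+     )
+     distance_matrix = result.values['produce']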
+ """ + + return self._fit_multi_produce(produce_methods=produce_methods, timeout=timeout, iterations=iterations, inputs=inputs, input_labels=input_labels, second_inputs=second_inputs) + + +class PairwiseDistanceTransformerPrimitiveBase(TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): + """ + A base class for primitives which compute distances (however defined) between two + different sets of instances without learning any sort of model. + """ + + @abc.abstractmethod + def produce(self, *, inputs: Inputs, second_inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: # type: ignore + """ + Computes distance matrix between two sets of data. + + Implementations of this method should use ``inputs_across_samples`` decorator to mark ``inputs`` + and ``second_inputs`` as being computed across samples. + + Parameters + ---------- + inputs: + The first set of collections of instances. + second_inputs: + The second set of collections of instances. + timeout: + A maximum time this primitive should take to produce outputs during this method call, in seconds. + iterations: + How many of internal iterations should the primitive do. + + Returns + --------- + A n by m distance matrix describing the relationship between each instance in inputs[0] and each instance + in inputs[1] (n and m are the number of instances in inputs[0] and inputs[1], respectively), + wrapped inside ``CallResult``. + """ + + def multi_produce(self, *, produce_methods: typing.Sequence[str], inputs: Inputs, second_inputs: Inputs, timeout: float = None, iterations: int = None) -> MultiCallResult: # type: ignore + """ + A method calling multiple produce methods at once. + + Parameters + ---------- + produce_methods: + A list of names of produce methods to call. + inputs: + The first set of collections of instances. + second_inputs: + The second set of collections of instances. + timeout: + A maximum time this primitive should take to produce outputs for all produce methods + listed in ``produce_methods`` argument, in seconds. + iterations: + How many of internal iterations should the primitive do. + + Returns + ------- + A dict of values for each produce method wrapped inside ``MultiCallResult``. + """ + + return self._multi_produce(produce_methods=produce_methods, timeout=timeout, iterations=iterations, inputs=inputs, second_inputs=second_inputs) + + def fit_multi_produce(self, *, produce_methods: typing.Sequence[str], inputs: Inputs, second_inputs: Inputs, timeout: float = None, iterations: int = None) -> MultiCallResult: # type: ignore + """ + A method calling ``fit`` and after that multiple produce methods at once. + + Parameters + ---------- + produce_methods: + A list of names of produce methods to call. + inputs: + The first set of collections of instances. + second_inputs: + The second set of collections of instances. + timeout: + A maximum time this primitive should take to both fit the primitive and produce outputs + for all produce methods listed in ``produce_methods`` argument, in seconds. + iterations: + How many of internal iterations should the primitive do for both fitting and producing + outputs of all produce methods. + + Returns + ------- + A dict of values for each produce method wrapped inside ``MultiCallResult``. 
+ """ + + return self._fit_multi_produce(produce_methods=produce_methods, timeout=timeout, iterations=iterations, inputs=inputs, second_inputs=second_inputs) diff --git a/d3m/d3m/primitive_interfaces/featurization.py b/d3m/d3m/primitive_interfaces/featurization.py new file mode 100644 index 0000000..4765e23 --- /dev/null +++ b/d3m/d3m/primitive_interfaces/featurization.py @@ -0,0 +1,22 @@ +from d3m.primitive_interfaces.base import * +from d3m.primitive_interfaces.transformer import TransformerPrimitiveBase + +__all__ = ('FeaturizationLearnerPrimitiveBase', 'FeaturizationTransformerPrimitiveBase') + + +class FeaturizationLearnerPrimitiveBase(PrimitiveBase[Inputs, Outputs, Params, Hyperparams]): + """ + A base class for primitives which transform raw data into a more usable form. + + Use this version for featurizers that allow for fitting (for domain-adaptation, data-specific deep + learning, etc.). Otherwise use `FeaturizationTransformerPrimitiveBase`. + """ + + +class FeaturizationTransformerPrimitiveBase(TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): + """ + A base class for primitives which transform raw data into a more usable form. + + Use this version for featurizers that do not require or allow any fitting, and simply + transform data on demand. Otherwise use `FeaturizationLearnerPrimitiveBase`. + """ diff --git a/d3m/d3m/primitive_interfaces/generator.py b/d3m/d3m/primitive_interfaces/generator.py new file mode 100644 index 0000000..a44c383 --- /dev/null +++ b/d3m/d3m/primitive_interfaces/generator.py @@ -0,0 +1,62 @@ +import abc +import typing + +from d3m import container +from d3m.primitive_interfaces.base import * + +__all__ = ('GeneratorPrimitiveBase',) + + +class GeneratorPrimitiveBase(PrimitiveBase[container.List, Outputs, Params, Hyperparams]): + """ + A base class for primitives which have to be fitted before they can start + producing (useful) outputs, but they are fitted only on output data. + Moreover, they do not accept any inputs to generate outputs, + which is represented as a sequence (list) of non-negative integer values + to ``produce`` method, only to signal how many outputs are requested, and + which one from the potential set of outputs. + + The list of integer values to ``produce`` method provides support for batching. + A caller does not have to rely on the order in which the primitive is called + but can specify the index of the requested output. + + This class is parameterized using only by three type variables, + ``Outputs``, ``Params``, and ``Hyperparams``. + """ + + @abc.abstractmethod + def set_training_data(self, *, outputs: Outputs) -> None: # type: ignore + """ + Sets training data of this primitive. + + Parameters + ---------- + outputs: + The outputs. + """ + + def fit_multi_produce(self, *, produce_methods: typing.Sequence[str], inputs: container.List, outputs: Outputs, timeout: float = None, iterations: int = None) -> MultiCallResult: + """ + A method calling ``fit`` and after that multiple produce methods at once. + + Parameters + ---------- + produce_methods: + A list of names of produce methods to call. + inputs: + The inputs given to all produce methods. + outputs: + The outputs given to ``set_training_data``. + timeout: + A maximum time this primitive should take to both fit the primitive and produce outputs + for all produce methods listed in ``produce_methods`` argument, in seconds. + iterations: + How many of internal iterations should the primitive do for both fitting and producing + outputs of all produce methods. 
+ + Returns + ------- + A dict of values for each produce method wrapped inside ``MultiCallResult``. + """ + + return self._fit_multi_produce(produce_methods=produce_methods, timeout=timeout, iterations=iterations, inputs=inputs, outputs=outputs) # type: ignore diff --git a/d3m/d3m/primitive_interfaces/supervised_learning.py b/d3m/d3m/primitive_interfaces/supervised_learning.py new file mode 100644 index 0000000..74efec4 --- /dev/null +++ b/d3m/d3m/primitive_interfaces/supervised_learning.py @@ -0,0 +1,10 @@ +from d3m.primitive_interfaces.base import * + +__all__ = ('SupervisedLearnerPrimitiveBase',) + + +class SupervisedLearnerPrimitiveBase(PrimitiveBase[Inputs, Outputs, Params, Hyperparams]): + """ + A base class for primitives which have to be fitted on both input and output data + before they can start producing (useful) outputs from inputs. + """ diff --git a/d3m/d3m/primitive_interfaces/transformer.py b/d3m/d3m/primitive_interfaces/transformer.py new file mode 100644 index 0000000..efed13e --- /dev/null +++ b/d3m/d3m/primitive_interfaces/transformer.py @@ -0,0 +1,71 @@ +import typing + +from d3m.primitive_interfaces.base import * + +__all__ = ('TransformerPrimitiveBase',) + + +class TransformerPrimitiveBase(PrimitiveBase[Inputs, Outputs, None, Hyperparams]): + """ + A base class for primitives which are not fitted at all and can + simply produce (useful) outputs from inputs directly. As such they + also do not have any state (params). + + This class is parameterized using only three type variables, ``Inputs``, + ``Outputs``, and ``Hyperparams``. + """ + + def set_training_data(self) -> None: # type: ignore + """ + A noop. + + Parameters + ---------- + """ + + return + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + """ + A noop. + """ + + return CallResult(None) + + def get_params(self) -> None: + """ + A noop. + """ + + return None + + def set_params(self, *, params: None) -> None: + """ + A noop. + """ + + return + + def fit_multi_produce(self, *, produce_methods: typing.Sequence[str], inputs: Inputs, timeout: float = None, iterations: int = None) -> MultiCallResult: # type: ignore + """ + A method calling ``fit`` and after that multiple produce methods at once. + + Parameters + ---------- + produce_methods: + A list of names of produce methods to call. + inputs: + The inputs given to all produce methods. + timeout: + A maximum time this primitive should take to both fit the primitive and produce outputs + for all produce methods listed in ``produce_methods`` argument, in seconds. + iterations: + How many of internal iterations should the primitive do for both fitting and producing + outputs of all produce methods. + + Returns + ------- + A dict of values for each produce method wrapped inside ``MultiCallResult``. 
+ """ + + return self._fit_multi_produce(produce_methods=produce_methods, timeout=timeout, iterations=iterations, inputs=inputs) diff --git a/d3m/d3m/primitive_interfaces/unsupervised_learning.py b/d3m/d3m/primitive_interfaces/unsupervised_learning.py new file mode 100644 index 0000000..796215d --- /dev/null +++ b/d3m/d3m/primitive_interfaces/unsupervised_learning.py @@ -0,0 +1,48 @@ +import abc +import typing + +from d3m.primitive_interfaces.base import * + +__all__ = ('UnsupervisedLearnerPrimitiveBase',) + + +class UnsupervisedLearnerPrimitiveBase(PrimitiveBase[Inputs, Outputs, Params, Hyperparams]): + """ + A base class for primitives which have to be fitted before they can start + producing (useful) outputs from inputs, but they are fitted only on input data. + """ + + @abc.abstractmethod + def set_training_data(self, *, inputs: Inputs) -> None: # type: ignore + """ + Sets training data of this primitive. + + Parameters + ---------- + inputs: + The inputs. + """ + + def fit_multi_produce(self, *, produce_methods: typing.Sequence[str], inputs: Inputs, timeout: float = None, iterations: int = None) -> MultiCallResult: # type: ignore + """ + A method calling ``fit`` and after that multiple produce methods at once. + + Parameters + ---------- + produce_methods: + A list of names of produce methods to call. + inputs: + The inputs given to ``set_training_data`` and all produce methods. + timeout: + A maximum time this primitive should take to both fit the primitive and produce outputs + for all produce methods listed in ``produce_methods`` argument, in seconds. + iterations: + How many of internal iterations should the primitive do for both fitting and producing + outputs of all produce methods. + + Returns + ------- + A dict of values for each produce method wrapped inside ``MultiCallResult``. + """ + + return self._fit_multi_produce(produce_methods=produce_methods, timeout=timeout, iterations=iterations, inputs=inputs) diff --git a/d3m/d3m/runtime.py b/d3m/d3m/runtime.py new file mode 100644 index 0000000..4c98310 --- /dev/null +++ b/d3m/d3m/runtime.py @@ -0,0 +1,2911 @@ +import argparse +import inspect +import json +import logging +import os +import os.path +import pickle +import re +import sys +import tempfile +import traceback +import typing +import uuid + +import jsonschema # type: ignore +import frozendict # type: ignore +import pandas # type: ignore + +from d3m import container, deprecate, exceptions, types, utils +from d3m.container import dataset as dataset_module +from d3m.container import utils as container_utils +from d3m.metadata import base as metadata_base, hyperparams as hyperparams_module, pipeline as pipeline_module, pipeline_run as pipeline_run_module, problem +from d3m.primitive_interfaces import base + +logger = logging.getLogger(__name__) + +DEFAULT_SCORING_PIPELINE_ID = 'f596cd77-25f8-4d4c-a350-bb30ab1e58f6' +DEFAULT_SCORING_PIPELINE_PATH = os.path.join( + os.path.dirname(__file__), 'contrib', 'pipelines', DEFAULT_SCORING_PIPELINE_ID + '.yml', +) + +DATASET_ID_REGEX = re.compile('(_TRAIN|_TEST|_SCORE)$') + + +class Result: + """ + Results from running a pipeline. + + Parameters + ---------- + pipeline_run: + A pipeline run description. + values: + A map between data references and their values computed during pipeline run. + error: + If during a run an exception occurred, then it is available here. 
+ """ + + def __init__(self, pipeline_run: pipeline_run_module.PipelineRun, values: typing.Dict[str, typing.Any], error: Exception = None) -> None: + self.pipeline_run = pipeline_run + self.values = values + self.error = error + + def has_error(self) -> bool: + """ + Returns ``True`` if pipeline has not successfully finished. + """ + + return self.error is not None + + def check_success(self) -> None: + """ + Throws an exception if pipeline has not successfully finished. + """ + + if self.has_error(): + raise self.error + + +class MultiResult(typing.List[Result]): + """ + Results of running a pipeline multiple times. + """ + + @property + def pipeline_runs(self) -> typing.Sequence[pipeline_run_module.PipelineRun]: + return [result.pipeline_run for result in self] + + def has_error(self) -> bool: + """ + Returns ``True`` if any of pipelines has not successfully finished. + """ + + return any(result.has_error() for result in self) + + def check_success(self) -> None: + """ + Throws an exception if pipeline has not successfully finished in any of the runs. + """ + + for result in self: + result.check_success() + + +def get_singleton_value(value: typing.Any) -> typing.Any: + """ + A helper to extract a value from a singleton value (extracting a sole element of a + container of length 1). + """ + + if isinstance(value, pandas.DataFrame): + # Fetch the row as a list. This assures different columns can be of a different type. + singleton_value = container.List([value.iloc[0, k] for k in range(len(value.columns))]) + else: + singleton_value = value[0] + + if isinstance(singleton_value, types.Container): + singleton_value.metadata = metadata_base.DataMetadata() + singleton_value.metadata = value.metadata.copy_to( + singleton_value.metadata, + (0,), + ) + # TODO: We should also remove table metadata which might not hold true anymore. + # If original value was tabular, we now copied also metadata about tabular column dimension, + # but that is not true anymore for this singleton value, it is not tabular anymore. + # See: https://gitlab.com/datadrivendiscovery/d3m/issues/336 + singleton_value.metadata = singleton_value.metadata.generate(singleton_value) + + return singleton_value + + +# TODO: Add debug logging to the runtime. +class Runtime: + """ + Reference runtime to fit and produce a pipeline. + + Parameters + ---------- + pipeline: + A pipeline to run. + hyperparams: + Values for free hyper-parameters of the pipeline. It should be a list, where each element corresponds + to free hyper-parameters of the corresponding pipeline step. Not all free hyper-parameters have to be + specified. Default values are used for those which are not. Optional. + problem_description: + A parsed problem description in standard problem description schema. + context: + In which context to run pipelines, default is ``TESTING``. + random_seed: + A random seed to use for every run. This control all randomness during the run. + volumes_dir: + Path to a directory with static files required by primitives. + In the standard directory structure (as obtained running ``python3 -m d3m index download``). + scratch_dir: + Path to a directory to store any temporary files needed during execution. + is_standard_pipeline: + Is the pipeline a standard pipeline? + environment: + A description of the runtime environment, including engine versions, + Docker images, compute resources, and benchmarks. If not provided, + an attempt is made to determine it automatically. + users: + Users associated with running the pipeline. 
+ + Attributes + ---------- + pipeline: + A pipeline to run. + hyperparams: + Values for free hyper-parameters of the pipeline. It should be a list, where each element corresponds + to free hyper-parameters of the corresponding pipeline step. Not all free hyper-parameters have to be + specified. Default values are used for those which are not. Optional. + problem_description: + A parsed problem description in standard problem description schema. + context: + In which context to run pipelines, default is ``TESTING``. + random_seed: + A random seed to use for every run. This control all randomness during the run. + volumes_dir: + Path to a directory with static files required by primitives. + In the standard directory structure (as obtained running ``python3 -m d3m index download``). + scratch_dir: + Path to a directory to store any temporary files needed during execution. + is_standard_pipeline: + Is the pipeline a standard pipeline? + environment: + A description of the runtime environment, including engine versions, + Docker images, compute resources, and benchmarks. If not provided, + an attempt is made to determine it automatically. + users: + Users associated with running the pipeline. + current_step: + Which step is currently being ran. + phase: + Which phase are we currently running. + pipeline_run: + A current instance of pipeline run. + return_values: + Which values should the runtime keep during a pipeline run, even after they are necessary. + data_values: + Map between available data references and their values during the run. + steps_state: + Fitted state for each step of the pipeline. + """ + + pipeline: pipeline_module.Pipeline + hyperparams: typing.Sequence + problem_description: problem.Problem + context: metadata_base.Context + random_seed: int + volumes_dir: str + scratch_dir: str + is_standard_pipeline: bool + environment: pipeline_run_module.RuntimeEnvironment + users: typing.Sequence[pipeline_run_module.User] + current_step: int + phase: metadata_base.PipelineRunPhase + pipeline_run: pipeline_run_module.PipelineRun + return_values: typing.Sequence[str] + data_values: typing.Dict[str, typing.Any] + steps_state: typing.List[typing.Union[typing.Any, typing.List]] + + def __init__( + self, pipeline: pipeline_module.Pipeline, hyperparams: typing.Sequence = None, *, + problem_description: problem.Problem = None, context: metadata_base.Context, + random_seed: int = 0, volumes_dir: str = None, scratch_dir: str = None, + is_standard_pipeline: bool = False, environment: pipeline_run_module.RuntimeEnvironment = None, + users: typing.Sequence[pipeline_run_module.User] = None, + ) -> None: + self.pipeline = pipeline + self.hyperparams = hyperparams + self.problem_description = problem_description + self.context = context + self.random_seed = random_seed + self.volumes_dir = volumes_dir + self.scratch_dir = scratch_dir + self.is_standard_pipeline = is_standard_pipeline + self.users = users + + if environment is None: + self.environment = pipeline_run_module.RuntimeEnvironment() + else: + self.environment = environment + + # Preliminary check. 
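+ # The pipeline must contain no placeholder steps; for a standard pipeline the check below also enforces the stricter structural constraints of standard pipelines.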
+ self.pipeline.check(allow_placeholders=False, standard_pipeline=self.is_standard_pipeline) + + if self.hyperparams is not None: + self._check_hyperparams(self.pipeline, self.hyperparams) + + self.steps_state: typing.List[typing.Union[typing.Any, typing.List, None]] = [None for step in self.pipeline.steps] + + self._previous_pipeline_run: pipeline_run_module.PipelineRun = None + + self._initialize_run_state([], None, None) + + def _initialize_data_values(self, inputs: typing.Sequence[typing.Any]) -> None: + # TODO: Remove values from the "data_values" once they are not needed anymore to optimize memory use. + self.data_values: typing.Dict[str, typing.Any] = {} + + if self.phase is None: + return + + marked_problem_inputs: typing.Set[int] = set() + if self.problem_description is None: + problem_inputs: typing.List[typing.Dict] = [] + else: + problem_inputs = self.problem_description.get('inputs', []) + + for i, input_value in enumerate(inputs): + if isinstance(input_value, container.Dataset): + if problem_inputs: + input_value, marked_problem_indices = self._mark_columns(problem_inputs, input_value) + marked_problem_inputs.update(marked_problem_indices) + else: + # All standard pipeline inputs should be Datasets. + assert not self.is_standard_pipeline + + self.data_values['inputs.{i}'.format(i=i)] = input_value + + if len(marked_problem_inputs) != len(problem_inputs): + unmarked_problem_inputs = sorted(set(range(len(problem_inputs))) - marked_problem_inputs) + + raise exceptions.InvalidProblemError( + "Not all problem description inputs could be applied to input datasets: {inputs}".format( + inputs=', '.join(str(problem_inputs[unmarked_problem_input]) for unmarked_problem_input in unmarked_problem_inputs), + ) + ) + + def _clear_data_values(self) -> None: + self.data_values = {} + + def _initialize_run_state( + self, inputs: typing.Sequence[typing.Any], + phase: typing.Optional[metadata_base.PipelineRunPhase], + return_values: typing.Optional[typing.Sequence[str]], + ) -> None: + self.current_step = 0 + self.phase = phase + + if return_values is None: + self.return_values = self._get_all_outputs() + else: + # We sort "return_values" to have deterministic order. + self.return_values = sorted(set(return_values)) + + self._initialize_data_values(inputs) + + self._initialize_base_temporary_directory() + + self._initialize_pipeline_run() + + def _get_all_outputs(self) -> typing.Sequence[str]: + return ['outputs.{i}'.format(i=i) for i, output_description in enumerate(self.pipeline.outputs)] + + def _clear_run_state(self) -> None: + """ + After a pipeline run, we clear state which was necessary while pipeline was running, but it is not needed anymore. + """ + + # We keep "steps_state" so that we can produce. + + self.current_step = 0 + self.phase = None + self.return_values = None + + self._clear_data_values() + self._clear_base_temporary_directory() + self._clear_pipeline_run() + + def _check_hyperparams(self, pipeline: pipeline_module.Pipeline, hyperparams: typing.Sequence) -> None: + """ + Check provided values for free hyper-parameters. + """ + + if not utils.is_sequence(hyperparams): + raise exceptions.InvalidArgumentTypeError("Hyper-parameter values for the pipeline '{pipeline_id}' is not a sequence.".format( + pipeline_id=pipeline.id, + )) + + if len(hyperparams) != len(pipeline.steps): + raise exceptions.InvalidArgumentValueError( + "Hyper-parameter values for the pipeline '{pipeline_id}' do not match the number of steps in the pipeline: {hyperparams_steps} vs. 
{pipeline_steps}".format( + pipeline_id=pipeline.id, + hyperparams_steps=len(hyperparams), + pipeline_steps=len(pipeline.steps), + ), + ) + + for step_index, (hyperparams_for_step, step) in enumerate(zip(hyperparams, pipeline.steps)): + # Placeholder step is not really allowed, but we have it here for completeness. + # Its "get_free_hyperparams" returns an empty list. + if isinstance(step, pipeline_module.PlaceholderStep): + if not utils.is_sequence(hyperparams_for_step): + raise exceptions.InvalidArgumentTypeError("Hyper-parameter values for placeholder step {step_index} of pipeline '{pipeline_id}' is not a sequence.".format( + step_index=step_index, + pipeline_id=pipeline.id, + )) + + elif isinstance(step, pipeline_module.SubpipelineStep): + self._check_hyperparams(step.pipeline, hyperparams_for_step) + + elif isinstance(step, pipeline_module.PrimitiveStep): + if not isinstance(hyperparams_for_step, (dict, frozendict.frozendict)): + raise exceptions.InvalidArgumentTypeError("Hyper-parameter values for primitive step {step_index} of pipeline '{pipeline_id}' is not a dict.".format( + step_index=step_index, + pipeline_id=pipeline.id, + )) + + hyperparams_for_step_keys = set(hyperparams_for_step.keys()) + free_hyperparams_keys = set(step.get_free_hyperparams().keys()) + all_hyperparams_keys = set(step.get_all_hyperparams().keys()) + + if hyperparams_for_step_keys - all_hyperparams_keys: + raise exceptions.InvalidArgumentValueError( + "Hyper-parameter values for primitive step {step_index} of pipeline '{pipeline_id}' contain values for non-existent hyper-parameters: {hyperparams}".format( + step_index=step_index, + pipeline_id=pipeline.id, + hyperparams=sorted(hyperparams_for_step_keys - all_hyperparams_keys), + ), + ) + elif hyperparams_for_step_keys - free_hyperparams_keys: + raise exceptions.InvalidArgumentValueError( + "Hyper-parameter values for primitive step {step_index} of pipeline '{pipeline_id}' are overriding hyper-parameters fixed in the pipeline: {hyperparams}".format( + step_index=step_index, + pipeline_id=pipeline.id, + hyperparams=sorted(hyperparams_for_step_keys - free_hyperparams_keys), + ), + ) + + def _get_pipeline_run_class(self) -> typing.Type[pipeline_run_module.PipelineRun]: + return pipeline_run_module.PipelineRun + + def _initialize_pipeline_run(self) -> None: + if self.phase is None: + self.pipeline_run = None + return + + self.pipeline_run = self._get_pipeline_run_class()( + pipeline=self.pipeline, + problem_description=self.problem_description, + phase=self.phase, + context=self.context, + previous_pipeline_run=self._previous_pipeline_run, + environment=self.environment, + random_seed=self.random_seed, + is_standard_pipeline=self.is_standard_pipeline, + users=self.users + ) + + input_values = [] + for i, input_value in sorted((int(data_reference.split('.')[1]), input_value) for data_reference, input_value in self.data_values.items() if data_reference.startswith('inputs.')): + input_values.append(input_value) + + all_input_values_datasets = all(isinstance(input_value, container.Dataset) for input_value in input_values) + assert all_input_values_datasets or not self.is_standard_pipeline + + # Even if the pipeline is not a standard pipeline, we still record Dataset inputs (if all are Dataset inputs) + # into pipeline run to allow generation of pipeline runs for a subset of non-standard pipelines, especially + # those computing metafeatures. 
Because having inputs recorded is required for a pipeline run, any other + # (for other types of inputs) pipeline run is not a valid stand-alone pipeline run and you get an error if + # you want to serialize it to JSON. This is on purpose. (We could have a better error message though.) + # You can still build a pipeline run object for non-standard pipelines. This is being used for data + # preparation or scoring pipelines. + # See: https://gitlab.com/datadrivendiscovery/metalearning/issues/64 + if all_input_values_datasets: + for input_value in input_values: + self.pipeline_run.add_input_dataset(input_value) + + def _clear_pipeline_run(self) -> None: + self.pipeline_run = None + + def _initialize_base_temporary_directory(self) -> None: + if self.phase is None: + self._base_temporary_directory = None + self._base_temporary_directory_path = None + return + + self._base_temporary_directory = tempfile.TemporaryDirectory(dir=self.scratch_dir) + self._base_temporary_directory_path = os.path.abspath(self._base_temporary_directory.name) + + def _clear_base_temporary_directory(self) -> None: + if self._base_temporary_directory is not None: + self._base_temporary_directory.cleanup() + self._base_temporary_directory = None + self._base_temporary_directory_path = None + + def _check_pipeline(self, inputs: typing.Sequence[typing.Any]) -> None: + """ + Check with known inputs. + """ + + input_types = {} + for i, input_value in enumerate(inputs): + input_types['inputs.{i}'.format(i=i)] = type(input_value) + + self.pipeline.check(allow_placeholders=False, standard_pipeline=self.is_standard_pipeline, input_types=input_types) + + def _run_placeholder(self, step: pipeline_module.PlaceholderStep) -> None: + raise exceptions.InvalidPipelineError("Step {step_index} of pipeline '{pipeline_id}' is a placeholder but there should be no placeholders.".format( + step_index=self.current_step, + pipeline_id=self.pipeline.id, + )) + + # TODO: Make return type be equal to the current's class type, so that it adapts if this class is subclassed. + def _create_subpipeline(self, pipeline: pipeline_module.Pipeline, hyperparams: typing.Optional[typing.Sequence]) -> 'Runtime': + """ + Creates an instance of the subpipeline's runtime. + """ + + # We change the random seed in a deterministic way so that it does not matter in which order we run steps. + # Subpipelines are generally not a standard pipeline. + return type(self)( + pipeline, + hyperparams, + # TODO: Should we pass "problem_description" as well, but make it so that it does not try to mark columns again? + problem_description=None, + context=self.context, + random_seed=self.random_seed + self.current_step, + volumes_dir=self.volumes_dir, + scratch_dir=self.scratch_dir, + is_standard_pipeline=False, + environment=self.environment, + users=self.users, + ) + + def _run_subpipeline(self, step: pipeline_module.SubpipelineStep) -> None: + if step.pipeline is None: + raise exceptions.InvalidPipelineError("Pipeline has not been resolved.") + + subpipeline_inputs: typing.List[typing.Any] = [] + for i, data_reference in enumerate(step.inputs): + subpipeline_inputs.append(self.data_values[data_reference]) + + if self.hyperparams is not None: + hyperparams = self.hyperparams[self.current_step] + + # We checked this already in "_check_hyperparams". 
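+ # For a sub-pipeline step the hyper-parameter values are themselves a sequence, with one entry per step of the sub-pipeline.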
+ assert utils.is_sequence(hyperparams), hyperparams + else: + hyperparams = None + + subpipeline = self._create_subpipeline(step.pipeline, hyperparams) + + if self.phase == metadata_base.PipelineRunPhase.FIT: + assert self.steps_state[self.current_step] is None + else: + subpipeline.set_params(typing.cast(typing.List, self.steps_state[self.current_step])) + + return_values_map = {} + return_values = set() + for i, output_id in enumerate(step.outputs): + # "output_id" can be "None" if this output is not used and should be skipped. + if output_id is not None: + data_reference = 'outputs.{i}'.format(i=i) + return_values.add(data_reference) + return_values_map['steps.{i}.{output_id}'.format(i=step.index, output_id=output_id)] = data_reference + + step_reference_prefix = 'steps.{i}.'.format(i=step.index) + for return_value in self.return_values: + # We process recursive data references for this subpipeline. + # We check that "return_value" is not in "return_values_map" because data + # references of the format "steps.{i}.{output_id}" have "step_reference_prefix" + # as a prefix but are not really a recursive data reference. + # But all references of that format are already in "return_values_map". + if return_value.startswith(step_reference_prefix) and return_value not in return_values_map: + data_reference = return_value[len(step_reference_prefix):] + # Data reference at this point should contain at least one dot, because all with the prefix + # which do not contain a dot we filtered out by checking them against "return_values_map". + assert '.' in data_reference, data_reference + return_values.add(data_reference) + return_values_map[return_value] = data_reference + + # We sort "return_values" to have deterministic order. + result = subpipeline._run(subpipeline_inputs, self.phase, return_values=sorted(return_values)) + self.pipeline_run.add_subpipeline_step(result.pipeline_run) + result.check_success() + + if self.phase == metadata_base.PipelineRunPhase.FIT: + assert self.steps_state[self.current_step] is None + self.steps_state[self.current_step] = subpipeline.get_params() + + for step_data_reference, subpipeline_data_reference in return_values_map.items(): + self.data_values[step_data_reference] = result.values[subpipeline_data_reference] + + def _get_singleton_value(self, value: typing.Any, is_argument: bool, name: str) -> typing.Any: + """ + A helper to extract a value from a singleton value (extracting a sole element of a + container of length 1). + """ + + if len(value) != 1: + if is_argument: + raise exceptions.InvalidPipelineError( + "Argument '{argument_name}' of step {step_index} of pipeline '{pipeline_id}' is singleton data, but available data is not.".format( + argument_name=name, + step_index=self.current_step, + pipeline_id=self.pipeline.id, + ), + ) + else: + raise exceptions.InvalidPipelineError( + "Hyper-parameter '{hyperparameter_name}' of step {step_index} of pipeline '{pipeline_id}' is singleton data, but available data is not.".format( + hyperparameter_name=name, + step_index=self.current_step, + pipeline_id=self.pipeline.id, + ), + ) + + return get_singleton_value(value) + + def _prepare_primitive_arguments(self, step: pipeline_module.PrimitiveStep) -> typing.Dict[str, typing.Any]: + arguments = {} + for argument_name, argument_description in step.arguments.items(): + + if argument_description['type'] == metadata_base.ArgumentType.DATA: + argument_value = self.data_values[argument_description['data']] + # We have to extract a singleton value out. 
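+ # A DATA argument references singleton data, so the referenced container must hold exactly one element; otherwise "_get_singleton_value" raises an error.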
+ argument_value = self._get_singleton_value(argument_value, True, argument_name) + + elif argument_description['type'] == metadata_base.ArgumentType.CONTAINER: + if utils.is_sequence(argument_description['data']): + values = [self.data_values[data_reference] for data_reference in argument_description['data']] + # We have to create a container List. + argument_value = self._get_list_value(values) + else: + argument_value = self.data_values[argument_description['data']] + + else: + raise exceptions.UnexpectedValueError("Unknown argument type: {argument_type}".format(argument_type=argument_description['type'])) + + arguments[argument_name] = argument_value + + return arguments + + def _get_list_value(self, values: typing.Sequence) -> container.List: + """ + Creates a container List from ``values``. It reuses existing metadata in ``values`` + to create metadata of the container List. + """ + + container_list = container.List(values, { + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': container.List, + 'dimension': { + 'length': len(values), + }, + }) + + for value_index, value in enumerate(values): + container_list.metadata = value.metadata.copy_to(container_list.metadata, (), (value_index,)) + + return container_list + + def _get_default_hyperparams(self, step: pipeline_module.PrimitiveStep) -> hyperparams_module.Hyperparams: + return step.get_primitive_hyperparams().defaults() + + def _get_runtime_hyperparams(self, step: pipeline_module.PrimitiveStep) -> typing.Dict: + if self.hyperparams is not None: + runtime_hyperparams = self.hyperparams[self.current_step] + + # We checked this already in "_check_hyperparams". + assert isinstance(runtime_hyperparams, (dict, frozendict.frozendict)), runtime_hyperparams + else: + runtime_hyperparams = {} + + return runtime_hyperparams + + def _get_pipeline_hyperparams(self, step: pipeline_module.PrimitiveStep) -> typing.Dict: + pipeline_hyperparams = {} + for hyperparameter_name, hyperparameter_description in step.hyperparams.items(): + if hyperparameter_description['type'] == metadata_base.ArgumentType.DATA: + if utils.is_sequence(hyperparameter_description['data']): + pipeline_hyperparams[hyperparameter_name] = [ + self._get_singleton_value(self.data_values[data_reference], False, hyperparameter_name) + for data_reference in hyperparameter_description['data'] + ] + else: + pipeline_hyperparams[hyperparameter_name] = self._get_singleton_value(self.data_values[hyperparameter_description['data']], False, hyperparameter_name) + + elif hyperparameter_description['type'] == metadata_base.ArgumentType.PRIMITIVE: + if utils.is_sequence(hyperparameter_description['data']): + primitive_references = hyperparameter_description['data'] + else: + primitive_references = typing.cast(typing.Sequence, [hyperparameter_description['data']]) + + primitives = [] + for primitive_reference in primitive_references: + # We make an instance of a primitive which is almost the same as the pipeline primitive + # (see "_create_pipeline_primitive"), but with a different random seed because of a different + # "current_step". Then we clone it (using "_clone_primitive") in "_handle_primitive_hyperparams" + # which uses the final random seed. This way we are handling all primitives in hyper-parameters + # the same no matter the source (it could be somebody somehow passes a primitive instance through + # produce method's output or something). + # TODO: See if an optimization (no additional clone) here is needed and how hard is to implement it. 
+ # TODO: Try to re-use existing primitive instances. + # We currently do not store primitive instances of prior steps, but we could those we know we + # will need in later steps and then just use them here, instead of creating them from scratch. + primitive = self._create_primitive_reference_primitive(primitive_reference, hyperparameter_name) + primitives.append(primitive) + + if utils.is_sequence(hyperparameter_description['data']): + pipeline_hyperparams[hyperparameter_name] = primitives + else: + assert len(primitives) == 1 + + pipeline_hyperparams[hyperparameter_name] = primitives[0] # type: ignore + + elif hyperparameter_description['type'] == metadata_base.ArgumentType.CONTAINER: + pipeline_hyperparams[hyperparameter_name] = self.data_values[hyperparameter_description['data']] + + elif hyperparameter_description['type'] == metadata_base.ArgumentType.VALUE: + pipeline_hyperparams[hyperparameter_name] = hyperparameter_description['data'] + + else: + raise exceptions.UnexpectedValueError("Unknown hyper-parameter type: {hyperparameter_type}".format(hyperparameter_type=hyperparameter_description['type'])) + + return pipeline_hyperparams + + def _prepare_primitive_hyperparams(self, step: pipeline_module.PrimitiveStep) -> typing.Tuple[hyperparams_module.Hyperparams, typing.Dict]: + default_hyperparams = self._get_default_hyperparams(step) + pipeline_hyperparams = self._get_pipeline_hyperparams(step) + runtime_hyperparams = self._get_runtime_hyperparams(step) + + # Pipeline hyper-parameters should be disjoint with runtime hyper-parameters. + # We check this in "_check_hyperparams" call from the constructor. + assert set(pipeline_hyperparams.keys()).isdisjoint(set(runtime_hyperparams.keys())), (pipeline_hyperparams, runtime_hyperparams) + + hyperparams = default_hyperparams.replace(pipeline_hyperparams).replace(runtime_hyperparams) + + # We have to handle all primitive values present in hyper-parameters. + return self._handle_primitive_hyperparams(hyperparams, 0), pipeline_hyperparams + + def _filter_arguments(self, primitive_class: typing.Type[base.PrimitiveBase], method_name: str, arguments: typing.Dict[str, typing.Any]) -> typing.Dict[str, typing.Any]: + """ + Primitive as a whole gets arguments for all its methods, so here we then filter out + only those arguments expected by a given method. 
+ """ + + method_arguments = primitive_class.metadata.query()['primitive_code'].get('instance_methods', {}).get(method_name, {}).get('arguments', []) + + filtered_arguments = {} + for argument_name in method_arguments: + if argument_name in arguments: + filtered_arguments[argument_name] = arguments[argument_name] + + return filtered_arguments + + def _get_primitive_volumes(self, primitive_class: typing.Type[base.PrimitiveBase]) -> typing.Dict: + volumes = {} + for entry in primitive_class.metadata.get_volumes(): + if self.volumes_dir is None: + raise exceptions.InvalidArgumentValueError( + "Primitive '{primitive_id}' of step {step_index} of pipeline '{pipeline_id}' requires static files (volumes) but volumes are not available.".format( + primitive_id=primitive_class.metadata.query()['id'], + step_index=self.current_step, + pipeline_id=self.pipeline.id, + ), + ) + + volume_path = os.path.join(self.volumes_dir, entry['file_digest']) + if not os.path.exists(volume_path): + raise exceptions.InvalidArgumentValueError( + "Primitive '{primitive_id}' of step {step_index} of pipeline '{pipeline_id}' requires static files (volume) but volume for key '{key}' is not available.".format( + primitive_id=primitive_class.metadata.query()['id'], + step_index=self.current_step, + pipeline_id=self.pipeline.id, + key=entry['key'], + ), + ) + + volumes[entry['key']] = volume_path + + return volumes + + def _get_primitive_temporary_directory(self, primitive_class: typing.Type[base.PrimitiveBase]) -> str: + return tempfile.mkdtemp(dir=self._base_temporary_directory_path) + + def _create_primitive_arguments(self, primitive_class: typing.Type[base.PrimitiveBase], hyperparams: hyperparams_module.Hyperparams, random_seed_offset: int) -> typing.Dict: + constructor_arguments = { + 'hyperparams': hyperparams, + # We change the random seed in a deterministic way so that it does not matter in which order we run steps. + 'random_seed': self.random_seed + self.current_step + random_seed_offset, + 'volumes': self._get_primitive_volumes(primitive_class), + 'temporary_directory': self._get_primitive_temporary_directory(primitive_class), + } + + filtered_arguments = self._filter_arguments(primitive_class, '__init__', constructor_arguments) + + return filtered_arguments + + def _create_primitive(self, primitive_class: typing.Type[base.PrimitiveBase], hyperparams: hyperparams_module.Hyperparams, random_seed_offset: int) -> base.PrimitiveBase: + """ + Creates an instance of a non-pipeline primitive. + + Constructor call is not recorded in pipeline run. + """ + + arguments = self._create_primitive_arguments(primitive_class, hyperparams, random_seed_offset) + + return primitive_class(**arguments) + + def _clone_primitive(self, primitive: base.PrimitiveBase, random_seed_offset: int) -> base.PrimitiveBase: + """ + Clone a primitive. It reuses hyper-parameters and params, but provides a + potentially different random seed and other constructor arguments. + + We are creating a new instance and not a deep copy because primitive instance might have + been created outside of the runtime and might not have valid constructor argument values. + """ + + # We have to handle all primitive values present in hyper-parameters. + # They are all already an instance, but we have to make their copies. 
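+ # Any primitive instances nested in the hyper-parameters are cloned recursively, each receiving its own derived random seed offset.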
+ hyperparams = self._handle_primitive_hyperparams(primitive.hyperparams, random_seed_offset + 1) + + primitive_clone = self._create_primitive(type(primitive), hyperparams, random_seed_offset) + + primitive_clone.set_params(params=primitive.get_params()) + + return primitive_clone + + def _create_pipeline_primitive(self, primitive_class: typing.Type[base.PrimitiveBase], hyperparams: hyperparams_module.Hyperparams) -> base.PrimitiveBase: + """ + Creates an instance of a pipeline primitive. + + Constructor call is recorded in pipeline run. + """ + + arguments = self._create_primitive_arguments(primitive_class, hyperparams, 0) + + if 'random_seed' in arguments: + self.pipeline_run.set_primitive_step_random_seed(self.current_step, arguments['random_seed']) + + return self._call_primitive_method(primitive_class, arguments) + + def _create_hyperparameter_primitive(self, primitive_class: typing.Type[base.PrimitiveBase], random_seed_offset: int) -> base.PrimitiveBase: + """ + Creates an instance of the non-pipeline primitive with default hyper-parameters. + """ + + hyperparams_class = primitive_class.metadata.get_hyperparams() + + return self._create_primitive(primitive_class, hyperparams_class.defaults(), random_seed_offset) + + def _create_primitive_reference_primitive(self, primitive_reference: int, hyperparameter_name: str) -> base.PrimitiveBase: + """ + Creates an instance of a primitive based on its primitive reference (step index), meaning the instance + of a primitive is almost the same as the pipeline primitive (see "_create_pipeline_primitive") at that + step index, but with a different random seed because of a probably different "current_step". + + Constructor call is not recorded in pipeline run. + """ + + # It could point to a sub-pipeline and not primitive. + if not isinstance(self.pipeline.steps[primitive_reference], pipeline_module.PrimitiveStep): + raise exceptions.InvalidPipelineError( + "Hyper-parameter '{hyperparameter_name}' of step {step_index} of pipeline '{pipeline_id}' does not point to a primitive step (step {primitive_reference}).".format( # noqa + hyperparameter_name=hyperparameter_name, + step_index=self.current_step, + pipeline_id=self.pipeline.id, + primitive_reference=primitive_reference, + ), + ) + + step = typing.cast(pipeline_module.PrimitiveStep, self.pipeline.steps[primitive_reference]) + hyperparams, pipeline_hyperparams = self._prepare_primitive_hyperparams(step) + # We use 0 for "random_seed_offset" because we are creating a primitive instance + # which should be the same as the pipeline primitive (see "_create_pipeline_primitive"). + primitive = self._create_primitive(step.primitive, hyperparams, 0) + primitive.set_params(params=self.steps_state[primitive_reference]) + return primitive + + def _transform_primitive_hyperparameter(self, hyperparameter: hyperparams_module.Hyperparameter, value: typing.Any, index: int) -> typing.Any: + value_is_type = utils.is_type(value) + if value_is_type and issubclass(value, base.PrimitiveBase): + return self._create_hyperparameter_primitive(value, index) + elif not value_is_type and isinstance(value, base.PrimitiveBase): + return self._clone_primitive(value, index) + else: + # Not a primitive instance or a primitive class, do not do anything. + return value + + def _handle_primitive_hyperparams(self, hyperparams: base.Hyperparams, random_seed_offset: int) -> base.Hyperparams: + """ + Handles a special case when the value is a primitive instance or a primitive class. 
+ In this case we have to make sure we create a new instance reusing its hyper-parameters, + or create an instance from the class using default hyper-parameters. + """ + + return hyperparams.transform_value(hyperparams, self._transform_primitive_hyperparameter, random_seed_offset) + + def _run_primitive(self, step: pipeline_module.PrimitiveStep) -> None: + if step.primitive is None: + raise exceptions.InvalidPipelineError("Primitive has not been resolved.") + + self.pipeline_run.add_primitive_step(step) + arguments = self._prepare_primitive_arguments(step) + + hyperparams, pipeline_hyperparams = self._prepare_primitive_hyperparams(step) + + if self.phase == metadata_base.PipelineRunPhase.FIT: + self.pipeline_run.set_primitive_step_hyperparams(self.current_step, hyperparams, pipeline_hyperparams) + + # We create a primitive just before it is being run. This assures that any primitives it depends on through its + # hyper-parameters have already been run (because they are in prior steps). Similarly, any pipeline-based value + # being passed to a hyper-parameter has already been computed. + primitive = self._create_pipeline_primitive(step.primitive, hyperparams) + + # If primitive step has no arguments we do not fit or produce it. It is meant to be used as + # unfitted primitive for another primitive's hyper-parameter. + if not arguments: + return + + if self.phase == metadata_base.PipelineRunPhase.FIT: + assert self.steps_state[self.current_step] is None + else: + primitive.set_params(params=self.steps_state[self.current_step]) + + if self.phase == metadata_base.PipelineRunPhase.FIT: + fit_multi_produce_arguments = self._filter_arguments(step.primitive, 'fit_multi_produce', dict(arguments, produce_methods=step.outputs)) + + # We fit and produce once, without any limits on iterations/time. + multi_call_result = self._call_primitive_method(primitive.fit_multi_produce, fit_multi_produce_arguments) + if not multi_call_result.has_finished: + # Because we have not set any limits on iterations/time, the primitive should finish and not stop early. + # One should be able to control through a hyper-parameter or hyper-parameters stopping criteria for the primitive. + raise exceptions.InvalidReturnValueError( + "\"fit_multi_produce\" call result should have \"has_finished\" set to true because iterations/time limits were set and the primitive should finish and not stop early.", + ) + outputs = multi_call_result.values + + elif self.phase == metadata_base.PipelineRunPhase.PRODUCE: + multi_produce_arguments = self._filter_arguments(step.primitive, 'multi_produce', dict(arguments, produce_methods=step.outputs)) + + # We produce once, without any limits on iterations/time. + multi_call_result = self._call_primitive_method(primitive.multi_produce, multi_produce_arguments) + if not multi_call_result.has_finished: + # Because we have not set any limits on iterations/time, the primitive should finish and not stop early. + # One should be able to control through a hyper-parameter or hyper-parameters stopping criteria for the primitive. + raise exceptions.InvalidReturnValueError( + "\"multi_produce\" call result should have \"has_finished\" set to true because iterations/time limits were set and the primitive should finish and not stop early.", + ) + outputs = multi_call_result.values + + else: + # TODO: Allow dispatch to a general method so that subclasses of this class can handle them if necessary. 
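+ # Reaching this branch means the phase is neither FIT nor PRODUCE.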
+ raise exceptions.UnexpectedValueError("Unknown phase: {phase}".format(phase=self.phase)) + + if self.phase == metadata_base.PipelineRunPhase.FIT: + assert self.steps_state[self.current_step] is None + self.steps_state[self.current_step] = primitive.get_params() + + for output_id in step.outputs: + output_data_reference = 'steps.{i}.{output_id}'.format(i=step.index, output_id=output_id) + + if output_id in outputs: + self.data_values[output_data_reference] = outputs[output_id] + else: + raise exceptions.InvalidReturnValueError("Missing declared output '{output_id}' in computed primitive's outputs.".format(output_id=output_id)) + + def _call_primitive_method(self, method: typing.Callable, arguments: typing.Dict) -> typing.Any: + """ + Calls a primitive method (or constructor). Records relevant information in pipeline run. + + Parameters + ---------- + method: + Primitive's method or constructor to call. + arguments: + Arguments to pass to the method. + + Returns + ------- + The result of calling the method. It method is a constructor, + returns an instance. + """ + + # A special case for the constructor. + if inspect.isclass(method): + method_name = '__init__' + else: + method_name = method.__name__ + + pipeline_run_method_call_id = self.pipeline_run.add_method_call_to_primitive_step(self.current_step, method_name) + + callback = self.pipeline_run.get_method_call_logging_callback(pipeline_run_method_call_id) + logging_handler = utils.CallbackHandler(callback) + + root = logging.getLogger() + redirect_logger = logging.getLogger('redirect') + + old_level = root.level + old_handler_levels = [handler.level for handler in root.handlers] + old_propagate = redirect_logger.propagate + try: + # We are just about to modify the root logger level, so we change levels + # of all existing handlers to retain same configuration. + for handler in root.handlers: + # If existing handler has level already set to something more restrictive than what the + # root logger has, we do not change that. Otherwise, we set it to the root logger's level. + if handler.level < old_level: + handler.setLevel(old_level) + # Record all logging which happens during the call. + root.setLevel(logging.DEBUG) + root.addHandler(logging_handler) + # We do not want to print logging from "redirect_logger" because pass-through is enabled, so we + # disable propagation from it to the root logger (by default there is a stream handler on the root + # logger which prints all logging) and install our handler directly on the redirect logger. + redirect_logger.propagate = False + redirect_logger.addHandler(logging_handler) + + # TODO: All this redirection works in a single thread, what about multi-threaded or async? + # Reference engine is single threaded, but maybe a subclass would not be? + # We redirect all stdout/stderr to logging, but pass it through to stdout/stderr as well. + with utils.redirect_to_logging(logger=redirect_logger, pass_through=True): + with utils.global_randomness_warning(): + self.pipeline_run.method_call_started(pipeline_run_method_call_id) + + try: + result = method(**arguments) + except Exception as error: + self.pipeline_run.method_call_failed(pipeline_run_method_call_id, traceback.format_exc()) + + raise error + + self.pipeline_run.method_call_successful(pipeline_run_method_call_id) + + finally: + # Restore original logging configuration. 
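+ # The handler and level changes made above are reverted so that logging outside of this method call is unaffected.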
+ root.removeHandler(logging_handler) + root.setLevel(old_level) + for i, level in enumerate(old_handler_levels): + root.handlers[i].setLevel(level) + # Just to be consistent, if somebody is doing something with the same logger. + redirect_logger.propagate = old_propagate + redirect_logger.removeHandler(logging_handler) + + self.pipeline_run.set_method_call_result_metadata(pipeline_run_method_call_id, result) + + return result + + def _run_step(self, step: pipeline_module.StepBase) -> None: + if isinstance(step, pipeline_module.PlaceholderStep): + self._run_placeholder(step) + elif isinstance(step, pipeline_module.SubpipelineStep): + self._run_subpipeline(step) + elif isinstance(step, pipeline_module.PrimitiveStep): + self._run_primitive(step) + else: + # TODO: Allow dispatch to a general method so that subclasses of this class can handle them if necessary. + raise exceptions.UnexpectedValueError("Unknown step type: {step_type}".format(step_type=type(step))) + + def _do_run_step(self, step: pipeline_module.StepBase) -> None: + self.pipeline_run.step_started(self.current_step) + + try: + self._before_step_run() + self._run_step(step) + self._after_step_run() + except Exception as error: + self.pipeline_run.step_failed(self.current_step, traceback.format_exc()) + + raise exceptions.StepFailedError( + "Step {step_index} for pipeline {pipeline_id} failed.".format( + step_index=self.current_step, pipeline_id=self.pipeline.id, + ), + ) from error + + self.pipeline_run.step_successful(self.current_step) + + def _do_run(self) -> None: + for step_index, step in enumerate(self.pipeline.steps): + self.current_step = step_index + + self._do_run_step(step) + + def _run( + self, inputs: typing.Sequence[typing.Any], phase: metadata_base.PipelineRunPhase, + return_values: typing.Optional[typing.Sequence[str]] + ) -> Result: + self._check_pipeline(inputs) + + self._initialize_run_state(inputs, phase, return_values) + + self.pipeline_run.run_started() + + error: Exception = None + try: + self._do_run() + except Exception as run_error: + self.pipeline_run.run_failed(traceback.format_exc()) + + error = run_error + + if error is None: + self.pipeline_run.run_successful() + + self._populate_output_values() + + if self.is_standard_pipeline: + self.pipeline_run.set_predictions(self.data_values['outputs.0']) + + values = self._get_return_values(error) + + pipeline_run = self.pipeline_run + + self._clear_run_state() + + # TODO: What if some internal exception happens before we set this which leaves runtime in a changed state. + # This means that state has changed, but we have not set previous pipeline run. + # So if another phase is called, it might even by accident succeed, but have invalid + # previous pipeline run set which does not explain the state of the runtime. + # Maybe we should make sure we always set this ID, even when not returning a pipeline + # run so that it can be at least visible that some pipeline run is missing in the sequence. + self._previous_pipeline_run = pipeline_run + + return Result(pipeline_run, values, error) + + def _get_return_values(self, error: typing.Optional[Exception]) -> typing.Dict: + values = {} + for name in self.return_values: + try: + values[name] = self.data_values[name] + except KeyError as value_error: + # We try to return whichever values we can, even in the case of an error. 
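+ # A missing value is only an error if the run itself succeeded; after a failed run whatever values exist are returned as-is.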
+ if error is None: + raise value_error + + return values + + def _before_step_run(self) -> None: + pass + + def _after_step_run(self) -> None: + self._delete_unnecessary_values() + + def _delete_unnecessary_values(self) -> None: + values_needed = set() + + # Which values are explicitly required to be kept until the end? + for value in self.return_values: + values_needed.add(value) + + # Outputs need values from steps. + for i, output_description in enumerate(self.pipeline.outputs): + if 'outputs.{i}'.format(i=i) in self.return_values: + values_needed.add(output_description['data']) + + # Future steps also need values. + for step in self.pipeline.steps[self.current_step + 1:]: + values_needed.update(step.get_input_data_references()) + + # Pipeline run for a standard pipeline needs predictions. + if self.is_standard_pipeline: + values_needed.add(self.pipeline.outputs[0]['data']) + + # Delete any value which is not needed anymore. + # We iterate over a list so that we can change dict while iterating. + for data_reference in list(self.data_values.keys()): + if data_reference not in values_needed: + del self.data_values[data_reference] + + def fit( + self, inputs: typing.Sequence[typing.Any], *, return_values: typing.Sequence[str] = None, + ) -> Result: + """ + Does a "fit" phase of the pipeline. + + Parameters + ---------- + inputs: + A list of inputs to the pipeline. + return_values: + A list of data references of all output values of all steps to return. + If ``None``, the output values of the whole pipeline are returned. + + Returns + ------- + A result object with kept values, pipeline run description, and any exception. + """ + + return self._run(inputs, metadata_base.PipelineRunPhase.FIT, return_values) + + def produce( + self, inputs: typing.Sequence[typing.Any], *, return_values: typing.Sequence[str] = None, + ) -> Result: + """ + Does a "produce" phase of the pipeline and returns outputs. + + Parameters + ---------- + inputs: + A list of inputs to the pipeline. + return_values: + A list of data references of all output values of all steps to return. + If ``None``, the output values of the whole pipeline are returned. + + Returns + ------- + A result object with kept values, pipeline run description, and any exception. + """ + + return self._run(inputs, metadata_base.PipelineRunPhase.PRODUCE, return_values) + + def get_params(self) -> typing.List[typing.Union[typing.Any, typing.List]]: + return self.steps_state + + def set_params(self, params: typing.List[typing.Union[typing.Any, typing.List]]) -> None: + if not isinstance(params, typing.List): + raise exceptions.InvalidArgumentValueError("Parameters not a list.") + + self._clear_run_state() + self.steps_state = params + + def _populate_output_values(self) -> None: + for i, output_description in enumerate(self.pipeline.outputs): + # Outputs might not be available because they were not requested to be returned from the run. 
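+ # Each pipeline output is an alias for the data reference of the step (or input) it points to.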
+ if output_description['data'] in self.data_values: + self.data_values['outputs.{i}'.format(i=i)] = self.data_values[output_description['data']] + + @classmethod + def _normalize_dataset_id(cls, dataset_id: str) -> str: + return DATASET_ID_REGEX.sub('', dataset_id) + + @classmethod + def _dataset_ids_match(cls, first_dataset_id: str, second_dataset_id: str) -> bool: + if first_dataset_id == second_dataset_id: + return True + + if cls._normalize_dataset_id(first_dataset_id) == cls._normalize_dataset_id(second_dataset_id): + return True + + return False + + @classmethod + def _mark_columns(cls, problem_inputs: typing.Sequence[typing.Dict], dataset: container.Dataset) -> typing.Tuple[container.Dataset, typing.Sequence[int]]: + dataset = dataset.copy() + dataset_id = dataset.metadata.query(())['id'] + + marked_problem_indices = [] + for problem_index, problem_input in enumerate(problem_inputs): + if not cls._dataset_ids_match(problem_input['dataset_id'], dataset_id): + continue + + marked_problem_indices.append(problem_index) + + for target in problem_input.get('targets', []): + if target['resource_id'] not in dataset: + raise exceptions.NotFoundError( + "Error marking target column: dataset does not contain resource with resource ID '{resource_id}'.".format( + resource_id=target['resource_id'], + ), + ) + if not isinstance(dataset[target['resource_id']], container.DataFrame): + raise TypeError( + "Error marking target column: resource '{resource_id}' is not a DataFrame.".format( + resource_id=target['resource_id'], + ), + ) + if not 0 <= target['column_index'] < dataset[target['resource_id']].shape[1]: + raise ValueError( + "Error marking target column: resource '{resource_id}' does not have a column with index '{column_index}'.".format( + resource_id=target['resource_id'], + column_index=target['column_index'], + ), + ) + + dataset.metadata = dataset.metadata.add_semantic_type( + (target['resource_id'], metadata_base.ALL_ELEMENTS, target['column_index']), + 'https://metadata.datadrivendiscovery.org/types/Target', + ) + dataset.metadata = dataset.metadata.add_semantic_type( + (target['resource_id'], metadata_base.ALL_ELEMENTS, target['column_index']), + 'https://metadata.datadrivendiscovery.org/types/TrueTarget', + ) + # If column is marked as a target, it cannot be attribute as well. + # This allows one to define in problem description otherwise attribute columns as targets. + # See: https://gitlab.com/datadrivendiscovery/d3m/issues/265 + dataset.metadata = dataset.metadata.remove_semantic_type( + (target['resource_id'], metadata_base.ALL_ELEMENTS, target['column_index']), + 'https://metadata.datadrivendiscovery.org/types/Attribute', + ) + + # TODO: Warn if privileged data columns are not set on attributes. 
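+ # Privileged data columns (available only during training) are validated and marked with the "PrivilegedData" semantic type, mirroring the target marking above.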
+ for privileged_data in problem_input.get('privileged_data', []): + if privileged_data['resource_id'] not in dataset: + raise exceptions.NotFoundError( + "Error marking privileged data column: dataset does not contain resource with resource ID '{resource_id}'.".format( + resource_id=privileged_data['resource_id'], + ), + ) + if not isinstance(dataset[privileged_data['resource_id']], container.DataFrame): + raise TypeError( + "Error marking privileged data column: resource '{resource_id}' is not a DataFrame.".format( + resource_id=privileged_data['resource_id'], + ), + ) + if not 0 <= privileged_data['column_index'] < dataset[privileged_data['resource_id']].shape[1]: + raise ValueError( + "Error marking privileged data column: resource '{resource_id}' does not have a column with index '{column_index}'.".format( + resource_id=privileged_data['resource_id'], + column_index=privileged_data['column_index'], + ), + ) + + dataset.metadata = dataset.metadata.add_semantic_type( + (privileged_data['resource_id'], metadata_base.ALL_ELEMENTS, privileged_data['column_index']), + 'https://metadata.datadrivendiscovery.org/types/PrivilegedData', + ) + + return dataset, marked_problem_indices + + +def _prepare_data_and_scoring_hyperparams(free_hyperparams: typing.Sequence, hyperparameter_values: typing.Dict) -> typing.Tuple[typing.Sequence, typing.Set[str]]: + """ + Values in ``hyperparameter_values`` should be serialized as JSON, as obtained by JSON-serializing + the output of hyper-parameter's ``value_to_json_structure`` method call. + """ + + hyperparams: typing.List[typing.Union[typing.Dict, typing.Sequence]] = [] + + hyperparameter_values_used = set() + + for free_hyperparams_for_step in free_hyperparams: + if isinstance(free_hyperparams_for_step, (dict, frozendict.frozendict)): + values = {} + for name, hyperparameter in free_hyperparams_for_step.items(): + if name in hyperparameter_values: + values[name] = hyperparameter.value_from_json_structure(json.loads(hyperparameter_values[name])) + hyperparameter_values_used.add(name) + hyperparams.append(values) + elif utils.is_sequence(free_hyperparams_for_step): + step_hyperparams, step_hyperparameter_values_used = _prepare_data_and_scoring_hyperparams(free_hyperparams_for_step, hyperparameter_values) + hyperparams.append(step_hyperparams) + hyperparameter_values_used.update(step_hyperparameter_values_used) + else: + raise exceptions.UnexpectedValueError("Unknown hyper-parameters type: {hyperparams_type}".format(hyperparams_type=type(free_hyperparams_for_step))) + + return hyperparams, hyperparameter_values_used + + +# TODO: Add debug logging. 
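Taken together, the pieces above are enough to drive a pipeline by hand. The sketch below is not part of the module: it constructs a `Runtime` for a standard pipeline, fits it on a training dataset, and produces predictions for a test dataset. The file paths are placeholders, and the loaders used (`Pipeline.from_yaml`, `Problem.load`, `Dataset.load`) are the usual d3m entry points; exact URIs depend on your setup.
```python
# Illustrative sketch only; paths below are placeholders.
from d3m import container
from d3m.metadata import base as metadata_base, pipeline as pipeline_module, problem
from d3m.runtime import Runtime

with open('pipeline.yml') as pipeline_file:
    pipeline = pipeline_module.Pipeline.from_yaml(pipeline_file)

problem_description = problem.Problem.load('file:///path/to/kpi/TRAIN/problem_TRAIN/problemDoc.json')
train_dataset = container.Dataset.load('file:///path/to/kpi/TRAIN/dataset_TRAIN/datasetDoc.json')
test_dataset = container.Dataset.load('file:///path/to/kpi/TEST/dataset_TEST/datasetDoc.json')

# Optional: free hyper-parameter values, one entry per pipeline step
# (a dict per primitive step; leaving this as None keeps all defaults).
hyperparams = None

runtime = Runtime(
    pipeline, hyperparams,
    problem_description=problem_description,
    context=metadata_base.Context.TESTING,
    is_standard_pipeline=True,
)

fit_result = runtime.fit([train_dataset])
fit_result.check_success()  # raises the recorded exception if any step failed

produce_result = runtime.produce([test_dataset])
produce_result.check_success()
predictions = produce_result.values['outputs.0']  # container.DataFrame with predictions
```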
+def fit(
+    pipeline: pipeline_module.Pipeline, inputs: typing.Sequence[container.Dataset], *,
+    problem_description: typing.Optional[problem.Problem], context: metadata_base.Context,
+    hyperparams: typing.Sequence = None, random_seed: int = 0, volumes_dir: str = None, scratch_dir: str = None,
+    runtime_environment: pipeline_run_module.RuntimeEnvironment = None, is_standard_pipeline: bool = True,
+    expose_produced_outputs: bool = False,
+) -> typing.Tuple[typing.Optional[Runtime], typing.Optional[container.DataFrame], Result]:
+    for input in inputs:
+        if not isinstance(input, container.Dataset):
+            raise TypeError("A standard pipeline's input should be of a container Dataset type, not {input_type}.".format(
+                input_type=type(input),
+            ))
+
+    if is_standard_pipeline and len(pipeline.outputs) != 1:
+        raise ValueError("A standard pipeline should have exactly one output, not {outputs}.".format(
+            outputs=len(pipeline.outputs),
+        ))
+
+    runtime = Runtime(
+        pipeline, hyperparams,
+        problem_description=problem_description, context=context,
+        random_seed=random_seed, volumes_dir=volumes_dir, scratch_dir=scratch_dir,
+        is_standard_pipeline=is_standard_pipeline, environment=runtime_environment,
+    )
+
+    if expose_produced_outputs:
+        return_values = sorted(pipeline.get_producing_outputs())
+    else:
+        return_values = ['outputs.0']
+
+    result = runtime.fit(inputs, return_values=return_values)
+
+    if result.has_error():
+        return None, None, result
+
+    output = result.values['outputs.0']
+
+    if not isinstance(output, container.DataFrame):
+        raise TypeError("A standard pipeline's output should be of a container DataFrame type, not {output_type}.".format(
+            output_type=type(output),
+        ))
+
+    return runtime, output, result
+
+
+# TODO: Add debug logging.
+def produce(
+    fitted_pipeline: Runtime, test_inputs: typing.Sequence[container.Dataset], *,
+    expose_produced_outputs: bool = False,
+) -> typing.Tuple[typing.Optional[container.DataFrame], Result]:
+    for test_input in test_inputs:
+        if not isinstance(test_input, container.Dataset):
+            raise TypeError("A standard pipeline's input should be of a container Dataset type, not {input_type}.".format(
+                input_type=type(test_input),
+            ))
+
+    # This is checked in "fit" already, but maybe somebody fitted a pipeline not through "fit".
+    if fitted_pipeline.is_standard_pipeline and len(fitted_pipeline.pipeline.outputs) != 1:
+        raise ValueError("A standard pipeline should have exactly one output, not {outputs}.".format(
+            outputs=len(fitted_pipeline.pipeline.outputs),
+        ))
+
+    if expose_produced_outputs:
+        return_values = sorted(fitted_pipeline.pipeline.get_producing_outputs())
+    else:
+        return_values = ['outputs.0']
+
+    result = fitted_pipeline.produce(test_inputs, return_values=return_values)
+    if result.has_error():
+        return None, result
+
+    output = result.values['outputs.0']
+
+    if not isinstance(output, container.DataFrame):
+        raise TypeError("A standard pipeline's output should be of a container DataFrame type, not {output_type}.".format(
+            output_type=type(output),
+        ))
+
+    return output, result
+
+
+# TODO: Add debug logging.
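A minimal sketch of how `fit` and `produce` are meant to be used together, assuming this module is importable as `d3m.runtime`; `my_pipeline`, `my_problem`, `train_dataset` and `test_dataset` are placeholders for objects loaded elsewhere (for example with `get_pipeline`, `get_problem` and `get_dataset`):
```
from d3m.metadata import base as metadata_base
from d3m.runtime import fit, produce

# Fit the pipeline on the training dataset; "check_success" raises if any step failed.
fitted_runtime, train_predictions, fit_result = fit(
    my_pipeline, [train_dataset],
    problem_description=my_problem,
    context=metadata_base.Context.TESTING,
)
fit_result.check_success()

# Produce predictions on the test dataset with the fitted runtime.
test_predictions, produce_result = produce(fitted_runtime, [test_dataset])
produce_result.check_success()
```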
+def score( + predictions: container.DataFrame, score_inputs: typing.Sequence[container.Dataset], *, scoring_pipeline: pipeline_module.Pipeline, + problem_description: typing.Optional[problem.Problem], metrics: typing.Sequence[typing.Dict], predictions_random_seed: int = None, + context: metadata_base.Context, scoring_params: typing.Dict[str, str] = None, random_seed: int = 0, volumes_dir: str = None, + scratch_dir: str = None, runtime_environment: pipeline_run_module.RuntimeEnvironment = None, +) -> typing.Tuple[typing.Optional[container.DataFrame], Result]: + for score_input in score_inputs: + if not isinstance(score_input, container.Dataset): + raise TypeError("A scoring pipeline's input should be of a container Dataset type, not {input_type}.".format( + input_type=type(score_input), + )) + + if len(scoring_pipeline.outputs) != 1: + raise ValueError("A scoring pipeline should have exactly one output, not {outputs}.".format( + outputs=len(scoring_pipeline.outputs), + )) + + metrics_hyperparameter = [] + for metric in metrics: + # Structure should match what "value_from_json_structure" would + # return for "ComputeScoresPrimitive" hyper-parameter. + # TODO: Once "ComputeScoresPrimitive" is moved to core package, use its default hyper-parameters here. + metric_hyperparameter = {'metric': metric['metric'].name, 'k': None, 'pos_label': None} + metric_hyperparameter.update(metric.get('params', {})) + metrics_hyperparameter.append(metric_hyperparameter) + + if scoring_params is None: + scoring_params = {} + + if metrics_hyperparameter: + # We have to JSON-serialize it because "_prepare_data_and_scoring_hyperparams" + # expects all values to be JSON-serialized. + scoring_params['metrics'] = json.dumps(metrics_hyperparameter) + + scoring_hyperparams, scoring_params_used = _prepare_data_and_scoring_hyperparams(scoring_pipeline.get_free_hyperparams(), scoring_params) + + scoring_params_keys_set = set(scoring_params.keys()) + if scoring_params_keys_set - scoring_params_used: + logger.warning("Not all provided hyper-parameters for the scoring pipeline %(pipeline_id)s were used: %(unused_params)s", { + 'pipeline_id': scoring_pipeline.id, + 'unused_params': ', '.join(sorted(scoring_params_keys_set - scoring_params_used)), + }) + + runtime = Runtime( + scoring_pipeline, scoring_hyperparams, + problem_description=problem_description, context=context, + random_seed=random_seed, volumes_dir=volumes_dir, scratch_dir=scratch_dir, + environment=runtime_environment, + ) + + inputs = [predictions] + list(score_inputs) # type: ignore + + # Fit + produce on same data. + result = runtime.fit(inputs, return_values=['outputs.0']) + if result.has_error(): + return None, result + + output = result.values['outputs.0'] + + if not isinstance(output, container.DataFrame): + raise TypeError("A scoring pipeline's output should be of a container DataFrame type, not {output_type}.".format( + output_type=type(output), + )) + + if predictions_random_seed is not None: + output = combine_random_seed(output, predictions_random_seed) + + return output, result + + +# TODO: Add debug logging. 
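Continuing the sketch above, `score` compares predictions against a dataset that still contains the ground truth; `score_dataset` and `my_scoring_pipeline` are placeholders (a scoring pipeline is typically resolved from `DEFAULT_SCORING_PIPELINE_PATH`), and the metrics are usually taken from the problem description:
```
from d3m.metadata import base as metadata_base
from d3m.runtime import score, get_metrics_from_problem_description

metrics = get_metrics_from_problem_description(my_problem)

scores, score_result = score(
    test_predictions, [score_dataset],
    scoring_pipeline=my_scoring_pipeline,
    problem_description=my_problem,
    metrics=metrics,
    context=metadata_base.Context.TESTING,
)
score_result.check_success()
print(scores)  # one row per metric with the achieved value
```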
+def prepare_data( + inputs: typing.Sequence[container.Dataset], *, data_pipeline: pipeline_module.Pipeline, problem_description: typing.Optional[problem.Problem], + data_params: typing.Dict[str, str], context: metadata_base.Context, random_seed: int = 0, volumes_dir: str = None, + scratch_dir: str = None, runtime_environment: pipeline_run_module.RuntimeEnvironment = None, +) -> typing.Tuple[typing.List, Result]: + """ + Values in ``data_params`` should be serialized as JSON, as obtained by JSON-serializing + the output of hyper-parameter's ``value_to_json_structure`` method call. + """ + + for input in inputs: + if not isinstance(input, container.Dataset): + raise TypeError("A data preparation pipeline's input should be of a container Dataset type, not {input_type}.".format( + input_type=type(input), + )) + + if len(data_pipeline.outputs) != 3: + raise ValueError("A data preparation pipeline should have exactly three outputs, not {outputs}.".format( + outputs=len(data_pipeline.outputs), + )) + + if 'number_of_folds' in data_params: + number_of_folds = int(data_params['number_of_folds']) + else: + # For now we assume other data preparation pipelines do only one fold. We should standardize + # more hyper-parameters to gather how many folds have to be made (and not really folds, but + # more how many input indices have to be passed to the pipeline). + number_of_folds = 1 + + data_hyperparams, data_params_used = _prepare_data_and_scoring_hyperparams(data_pipeline.get_free_hyperparams(), data_params) + + data_params_keys_set = set(data_params.keys()) + if data_params_keys_set - data_params_used: + logger.warning("Not all provided hyper-parameters for the data preparation pipeline {pipeline_id} were used: {unused_params}".format( + pipeline_id=data_pipeline.id, + unused_params=sorted(data_params_keys_set - data_params_used), + )) + + runtime = Runtime( + data_pipeline, data_hyperparams, + problem_description=problem_description, context=context, + random_seed=random_seed, volumes_dir=volumes_dir, + scratch_dir=scratch_dir, environment=runtime_environment, + ) + + # Fit + produce on same data. The inputs are the list of indices of folds + # to generate and a dataset to split. + result = runtime.fit([container.List(range(number_of_folds))] + list(inputs), return_values=['outputs.0', 'outputs.1', 'outputs.2']) # type: ignore + if result.has_error(): + return [], result + + outputs = [result.values['outputs.0'], result.values['outputs.1'], result.values['outputs.2']] + + for output in outputs: + if not isinstance(output, container.List): + raise TypeError("A data preparation pipeline's output should be of a container List type, not {input_type}.".format( + input_type=type(output), + )) + if len(output) != number_of_folds: + raise ValueError("A data preparation pipeline's output should contain {number_of_folds} datasets, not {length}.".format( + number_of_folds=number_of_folds, + length=len(output), + )) + + return outputs, result + + +# TODO: Add debug logging. 
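A minimal sketch of calling `prepare_data` with a K-fold style data preparation pipeline; `kfold_pipeline`, `full_dataset` and `my_problem` are placeholders as in the earlier sketches, the choice of 5 folds is arbitrary, and `number_of_folds` is passed JSON-serialized as the docstring above requires:
```
import json

from d3m.metadata import base as metadata_base
from d3m.runtime import prepare_data

outputs, prepare_result = prepare_data(
    [full_dataset],
    data_pipeline=kfold_pipeline,
    problem_description=my_problem,
    data_params={'number_of_folds': json.dumps(5)},
    context=metadata_base.Context.TESTING,
)
prepare_result.check_success()

# Three parallel lists, one entry per fold: train, test and score splits.
train_folds, test_folds, score_folds = outputs
```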
+def evaluate( + pipeline: pipeline_module.Pipeline, inputs: typing.Sequence[container.Dataset], *, data_pipeline: pipeline_module.Pipeline, + scoring_pipeline: pipeline_module.Pipeline, problem_description: typing.Optional[problem.Problem], + data_params: typing.Dict[str, str], metrics: typing.Sequence[typing.Dict], context: metadata_base.Context, + scoring_params: typing.Dict[str, str] = None, hyperparams: typing.Sequence = None, random_seed: int = 0, + data_random_seed: int = 0, scoring_random_seed: int = 0, volumes_dir: str = None, + scratch_dir: str = None, runtime_environment: pipeline_run_module.RuntimeEnvironment = None, +) -> typing.Tuple[typing.List[container.DataFrame], MultiResult]: + """ + Values in ``data_params`` should be serialized as JSON, as obtained by JSON-serializing + the output of hyper-parameter's ``value_to_json_structure`` method call. + """ + + outputs, data_result = prepare_data( + inputs, data_pipeline=data_pipeline, problem_description=problem_description, data_params=data_params, + context=context, random_seed=data_random_seed, volumes_dir=volumes_dir, + scratch_dir=scratch_dir, runtime_environment=runtime_environment, + ) + if data_result.has_error(): + return [], MultiResult([data_result]) + + fold_group_uuid = uuid.uuid4() + + all_scores: typing.List[container.DataFrame] = [] + all_results = MultiResult() + for fold_index, (train_inputs, test_inputs, score_inputs) in enumerate(zip(*outputs)): + fitted_pipeline, predictions, fit_result = fit( + pipeline, [train_inputs], problem_description=problem_description, context=context, hyperparams=hyperparams, + random_seed=random_seed, volumes_dir=volumes_dir, scratch_dir=scratch_dir, + runtime_environment=runtime_environment, + ) + + # Modifies "fit_result.pipeline_run" in-place. + combine_pipeline_runs( + fit_result.pipeline_run, data_pipeline_run=data_result.pipeline_run, + fold_group_uuid=fold_group_uuid, fold_index=fold_index, + ) + + all_results.append(fit_result) + if fit_result.has_error(): + assert all_results.has_error() + return all_scores, all_results + + predictions, produce_result = produce(fitted_pipeline, [test_inputs]) + + # Modifies "produce_result.pipeline_run" in-place. + combine_pipeline_runs( + produce_result.pipeline_run, data_pipeline_run=data_result.pipeline_run, + fold_group_uuid=fold_group_uuid, fold_index=fold_index + ) + + all_results.append(produce_result) + if produce_result.has_error(): + assert all_results.has_error() + return all_scores, all_results + + scores, score_result = score( + predictions, [score_inputs], scoring_pipeline=scoring_pipeline, problem_description=problem_description, metrics=metrics, + predictions_random_seed=random_seed, scoring_params=scoring_params, context=context, random_seed=scoring_random_seed, + volumes_dir=volumes_dir, scratch_dir=scratch_dir, runtime_environment=runtime_environment, + ) + + # Modifies "produce_result.pipeline_run" in-place. + combine_pipeline_runs( + produce_result.pipeline_run, scoring_pipeline_run=score_result.pipeline_run, + ) + # Sets the error, if there are any. + produce_result.error = score_result.error + + # We modified "produce_result.pipeline_run" in-place and "produce_result" + # is already among "all_results", so we do not add it again. + if score_result.has_error(): + assert all_results.has_error() + return all_scores, all_results + + # Modifies "produce_result.pipeline_run" in-place. 
+ combine_pipeline_runs( + produce_result.pipeline_run, metrics=metrics, scores=scores, + ) + + all_scores.append(scores) + + return all_scores, all_results + + +is_uri = deprecate.function(message="use d3m.utils.is_uri instead")(utils.is_uri) + +get_dataset = deprecate.function(message="use d3m.container.dataset.get_dataset instead")(dataset_module.get_dataset) +get_problem = deprecate.function(message="use d3m.metadata.problem.get_problem instead")(problem.get_problem) +get_pipeline = deprecate.function(message="use d3m.metadata.pipeline.get_pipeline instead")(pipeline_module.get_pipeline) + + +@deprecate.function(message="use d3m.utils.get_datasets_and_problems instead") +def _get_datasets_and_problems( + datasets_dir: str, handle_score_split: bool = True, +) -> typing.Tuple[typing.Dict[str, str], typing.Dict[str, str]]: + return utils.get_datasets_and_problems(datasets_dir, handle_score_split) + + +def _resolve_pipeline_run_datasets( + pipeline_run_datasets: typing.Sequence[typing.Dict[str, str]], *, + dataset_resolver: typing.Callable, compute_digest: dataset_module.ComputeDigest, strict_digest: bool, + strict_resolving: bool, datasets_dir: typing.Optional[str], handle_score_split: bool, +) -> typing.Sequence[container.Dataset]: + resolved_datasets = [] + + for dataset_reference in pipeline_run_datasets: + resolved_dataset = dataset_resolver( + dataset_reference['id'], compute_digest=compute_digest, strict_digest=strict_digest, + datasets_dir=datasets_dir, handle_score_split=handle_score_split, + ) + + resolved_dataset_digest = resolved_dataset.metadata.query(()).get('digest', None) + + if resolved_dataset_digest != dataset_reference['digest']: + if strict_resolving: + raise exceptions.DigestMismatchError( + "Digest for dataset '{dataset_id}' does not match the one specified in the dataset reference. " + "Dataset reference digest: {dataset_digest}. Resolved dataset digest: {resolved_dataset_digest}.".format( + dataset_id=dataset_reference['id'], + dataset_digest=dataset_reference['digest'], + resolved_dataset_digest=resolved_dataset_digest, + ) + ) + else: + logger.warning( + "Digest for dataset '%(dataset_id)s' does not match the one specified in the dataset reference. " + "Dataset reference digest: %(dataset_digest)s. 
Resolved dataset digest: %(resolved_dataset_digest)s.", + { + 'dataset_id': dataset_reference['id'], + 'dataset_digest': dataset_reference['digest'], + 'resolved_dataset_digest': resolved_dataset_digest, + }, + ) + + resolved_datasets.append(resolved_dataset) + + return resolved_datasets + + +def parse_pipeline_run( + pipeline_run_file: typing.IO[typing.Any], pipeline_search_paths: typing.Sequence[str], datasets_dir: typing.Optional[str], *, + pipeline_resolver: typing.Callable = None, dataset_resolver: typing.Callable = None, + problem_resolver: typing.Callable = None, strict_resolving: bool = False, + compute_digest: dataset_module.ComputeDigest = dataset_module.ComputeDigest.ONLY_IF_MISSING, + strict_digest: bool = False, handle_score_split: bool = True, +) -> typing.Sequence[typing.Dict[str, typing.Any]]: + if pipeline_resolver is None: + pipeline_resolver = pipeline_module.get_pipeline + if dataset_resolver is None: + dataset_resolver = dataset_module.get_dataset + if problem_resolver is None: + problem_resolver = problem.get_problem + + pipeline_runs = list(utils.yaml_load_all(pipeline_run_file)) + + if not pipeline_runs: + raise exceptions.InvalidArgumentValueError("Pipeline run file must contain at least one pipeline run document.") + + for pipeline_run in pipeline_runs: + try: + pipeline_run_module.validate_pipeline_run(pipeline_run) + except jsonschema.exceptions.ValidationError as error: + raise exceptions.InvalidArgumentValueError("Provided pipeline run document is not valid.") from error + + pipeline_run['datasets'] = _resolve_pipeline_run_datasets( + pipeline_run['datasets'], dataset_resolver=dataset_resolver, + compute_digest=compute_digest, strict_digest=strict_digest, + strict_resolving=strict_resolving, datasets_dir=datasets_dir, + handle_score_split=handle_score_split, + ) + + if 'problem' in pipeline_run: + pipeline_run['problem'] = problem_resolver( + pipeline_run['problem']['id'], + strict_digest=strict_digest, + datasets_dir=datasets_dir, + handle_score_split=handle_score_split, + ) + + pipeline_run['pipeline'] = pipeline_resolver( + pipeline_run['pipeline']['id'], + strict_resolving=strict_resolving, + strict_digest=strict_digest, + pipeline_search_paths=pipeline_search_paths, + ) + + if 'data_preparation' in pipeline_run['run']: + pipeline_run['run']['data_preparation']['pipeline'] = pipeline_resolver( + pipeline_run['run']['data_preparation']['pipeline']['id'], + strict_resolving=strict_resolving, + strict_digest=strict_digest, + pipeline_search_paths=pipeline_search_paths, + ) + + if 'scoring' in pipeline_run['run']: + if 'datasets' in pipeline_run['run']['scoring']: + assert 'data_preparation' not in pipeline_run['run'] + pipeline_run['run']['scoring']['datasets'] = _resolve_pipeline_run_datasets( + pipeline_run['run']['scoring']['datasets'], dataset_resolver=dataset_resolver, + compute_digest=compute_digest, strict_digest=strict_digest, strict_resolving=strict_resolving, + datasets_dir=datasets_dir, handle_score_split=handle_score_split, + ) + + if pipeline_run['run']['scoring']['pipeline']['id'] == DEFAULT_SCORING_PIPELINE_ID: + pipeline_run['run']['scoring']['pipeline'] = pipeline_resolver( + DEFAULT_SCORING_PIPELINE_PATH, + strict_resolving=strict_resolving, + strict_digest=strict_digest, + pipeline_search_paths=pipeline_search_paths, + ) + else: + pipeline_run['run']['scoring']['pipeline'] = pipeline_resolver( + pipeline_run['run']['scoring']['pipeline']['id'], + strict_resolving=strict_resolving, + strict_digest=strict_digest, + 
pipeline_search_paths=pipeline_search_paths, + ) + + return pipeline_runs + + +def _get_runtime_hyperparams_from_pipeline_run(pipeline: pipeline_module.Pipeline, pipeline_run_steps: typing.Sequence[typing.Dict]) -> typing.Sequence[typing.Union[typing.Dict, typing.Sequence]]: + free_hyperparams = pipeline.get_free_hyperparams() + + # We want to allow missing steps for failed pipeline runs. + if len(free_hyperparams) >= len(pipeline_run_steps): + pipeline_run_steps = list(pipeline_run_steps) + for i in range(len(pipeline_run_steps), len(free_hyperparams)): + pipeline_run_steps.append({}) + else: + raise exceptions.InvalidPipelineRunError("Number of steps in the pipeline run does not match the number of steps of the pipeline.") + + hyperparams: typing.List[typing.Union[typing.Dict, typing.Sequence]] = [] + + for free_hyperparams_for_step, pipeline_run_step in zip(free_hyperparams, pipeline_run_steps): + if isinstance(free_hyperparams_for_step, (dict, frozendict.frozendict)): + values = {} + hyperparams_from_step = pipeline_run_step.get('hyperparams', {}) + for name, hyperparameter in free_hyperparams_for_step.items(): + if name in hyperparams_from_step: + if hyperparams_from_step[name]['type'] == metadata_base.ArgumentType.VALUE.name: + values[name] = hyperparameter.value_from_json_structure(hyperparams_from_step[name]['data']) + else: + raise exceptions.UnexpectedValueError("Hyper-parameter '{name}' of type '{type}' cannot be set at runtime.".format(name=name, type=hyperparams_from_step[name]['type'])) + hyperparams.append(values) + + extra_hyperparams_set = set(hyperparams_from_step.keys()) - set(free_hyperparams_for_step.keys()) + if extra_hyperparams_set: + logger.warning("Pipeline run contains values for additional hyper-parameters: %(extra_hyperparams)s", { + 'extra_hyperparams': sorted(extra_hyperparams_set), + }) + + elif utils.is_sequence(free_hyperparams_for_step): + step_hyperparams = _get_runtime_hyperparams_from_pipeline_run(free_hyperparams_for_step, pipeline_run_step.get('steps', [])) + hyperparams.append(step_hyperparams) + else: + raise exceptions.UnexpectedValueError("Unknown hyper-parameters type: {hyperparams_type}".format(hyperparams_type=type(free_hyperparams_for_step))) + + return hyperparams + + +def _get_data_and_scoring_params_from_pipeline_run(pipeline_run_steps: typing.Sequence[typing.Dict]) -> typing.Dict: + params: typing.Dict[str, typing.Any] = {} + + for pipeline_run_step in pipeline_run_steps: + if pipeline_run_step['type'] == metadata_base.PipelineStepType.PRIMITIVE.name: + new_params = {} + + for hyperparameter_name, hyperparameter in pipeline_run_step.get('hyperparams', {}).items(): + if hyperparameter['type'] == metadata_base.ArgumentType.VALUE.name: + # We are comparing JSON serializations, so we need it to be deterministic, so we sort keys. 
+ new_params[hyperparameter_name] = json.dumps(hyperparameter['data'], sort_keys=True) + else: + raise exceptions.UnexpectedValueError("Hyper-parameter '{name}' of type '{type}' cannot be set at runtime.".format(name=hyperparameter_name, type=hyperparameter['type'])) + + elif pipeline_run_step['type'] == metadata_base.PipelineStepType.SUBPIPELINE.name: + new_params = _get_data_and_scoring_params_from_pipeline_run(pipeline_run_step.get('steps', [])) + + else: + raise exceptions.UnexpectedValueError("Unknown step type: {step_type}".format(step_type=pipeline_run_step['type'])) + + for name, value in new_params.items(): + if name in params: + if params[name] != value: + raise exceptions.UnexpectedValueError( + "Hyper-parameter '{name}' does not have the same value across the whole pipeline: {value1} vs {value2}.".format( + name=name, value1=params[name], value2=value, + ), + ) + else: + params[name] = value + + return params + + +def combine_random_seed(scores: container.DataFrame, random_seed: int) -> container.DataFrame: + random_seed_column = container.DataFrame({'randomSeed': [random_seed] * scores.shape[0]}) + # We add the new column at the end so that we do not have to do complicated changes to the metadata. + output_scores = pandas.concat([scores, random_seed_column], axis=1) + # There is one more column now, so we update metadata for it. + output_scores.metadata = scores.metadata.update((metadata_base.ALL_ELEMENTS,), { + 'dimension': { + 'length': output_scores.shape[1], + }, + }) + output_scores.metadata = output_scores.metadata.update_column(output_scores.shape[1] - 1, { + 'name': 'randomSeed', + 'structural_type': int, + }) + + return output_scores + + +def combine_folds(scores_list: typing.List[container.DataFrame]) -> container.DataFrame: + # We combine multiple scores tables into one output table by adding a "fold" column. + for fold, scores in enumerate(scores_list): + fold_column = container.DataFrame({'fold': [fold] * scores.shape[0]}) + # We add the new column at the end so that we do not have to do complicated + # changes to the metadata. + scores_list[fold] = pandas.concat([scores, fold_column], axis=1) + # There is one more column now, so we update metadata for it. + scores_list[fold].metadata = scores.metadata.update((metadata_base.ALL_ELEMENTS,), { + 'dimension': { + 'length': scores_list[fold].shape[1], + }, + }) + scores_list[fold].metadata = scores_list[fold].metadata.update_column(scores_list[fold].shape[1] - 1, { + 'name': 'fold', + 'structural_type': int, + }) + + scores = pandas.concat(scores_list, axis=0).reset_index(drop=True) + # We reuse metadata from the first fold and update the number of rows which is now + # combined across all folds. 
+    scores.metadata = scores_list[0].metadata.update((), {
+        'dimension': {
+            'length': scores.shape[0],
+        },
+    })
+
+    return scores
+
+
+def combine_pipeline_runs(
+    standard_pipeline_run: pipeline_run_module.PipelineRun, *,
+    data_pipeline_run: pipeline_run_module.PipelineRun = None, scoring_pipeline_run: pipeline_run_module.PipelineRun = None,
+    score_inputs: typing.Sequence[typing.Any] = None, metrics: typing.Sequence[typing.Dict] = None, scores: container.DataFrame = None,
+    fold_group_uuid: uuid.UUID = None, fold_index: int = None,
+) -> None:
+    # Use lists (not generators) so that both "any" and "all" below see every element.
+    fold_args_provided = [item is None for item in (fold_group_uuid, fold_index)]
+    if any(fold_args_provided) and not all(fold_args_provided):
+        raise exceptions.InvalidArgumentValueError("If any of 'fold_group_uuid' and 'fold_index' are provided, they must all be provided.")
+
+    scores_args_provided = [item is None for item in (scores, metrics)]
+    if any(scores_args_provided) and not all(scores_args_provided):
+        raise exceptions.InvalidArgumentValueError("If any of 'scores' or 'metrics' is provided, they must both be provided.")
+
+    if data_pipeline_run is not None:
+        standard_pipeline_run.set_data_preparation_pipeline_run(data_pipeline_run)
+
+    if fold_group_uuid is not None:
+        standard_pipeline_run.set_fold_group(fold_group_uuid, fold_index)
+
+    if scoring_pipeline_run is not None:
+        standard_pipeline_run.set_scoring_pipeline_run(scoring_pipeline_run, score_inputs)
+
+    if scores is not None:
+        standard_pipeline_run.set_scores(scores, metrics)
+
+
+@deprecate.function(message="use extended DataFrame.to_csv method instead")
+def export_dataframe(dataframe: container.DataFrame, output_file: typing.IO[typing.Any] = None) -> typing.Optional[str]:
+    return dataframe.to_csv(output_file)
+
+
+def _check_duplicate_metrics(metrics: typing.Sequence[typing.Dict]) -> None:
+    """
+    In results from scoring we identify each score by its metric name. So to map those rows in scoring
+    output back to requested metrics, names must be unique. Otherwise we would not know to which
+    metric configuration the score belongs.
+    """
+
+    only_metrics = [metric['metric'] for metric in metrics]
+
+    if utils.has_duplicates(only_metrics):
+        raise exceptions.InvalidArgumentValueError("Same metric listed multiple times.")
+
+
+def get_metrics_from_list(metrics: typing.Sequence[str]) -> typing.Sequence[typing.Dict]:
+    metric_descriptions = [{'metric': problem.PerformanceMetric[metric]} for metric in metrics]
+
+    _check_duplicate_metrics(metric_descriptions)
+
+    return metric_descriptions
+
+
+def get_metrics_from_problem_description(problem_description: typing.Optional[problem.Problem]) -> typing.Sequence[typing.Dict]:
+    if problem_description is None:
+        return []
+
+    metric_descriptions = problem_description['problem'].get('performance_metrics', [])
+
+    _check_duplicate_metrics(metric_descriptions)
+
+    return metric_descriptions
+
+
+def _output_pipeline_runs(arguments: argparse.Namespace, pipeline_runs: typing.Sequence[pipeline_run_module.PipelineRun]) -> None:
+    if not getattr(arguments, 'output_run', None):
+        return
+
+    first = True
+    for pipeline_run in pipeline_runs:
+        pipeline_run.to_yaml(arguments.output_run, appending=not first)
+        first = False
+
+    # Make sure the handle is flushed so that no data is lost. CLI file handles are generally
+    # used outside of a context manager which would otherwise handle that.
+ # See: https://gitlab.com/datadrivendiscovery/d3m/issues/436 + arguments.output_run.flush() + + +def fit_handler( + arguments: argparse.Namespace, *, pipeline_resolver: typing.Callable = None, + pipeline_run_parser: typing.Callable = None, dataset_resolver: typing.Callable = None, + problem_resolver: typing.Callable = None, +) -> None: + if pipeline_resolver is None: + pipeline_resolver = pipeline_module.get_pipeline + if pipeline_run_parser is None: + pipeline_run_parser = parse_pipeline_run + if dataset_resolver is None: + dataset_resolver = dataset_module.get_dataset + if problem_resolver is None: + problem_resolver = problem.get_problem + + context = metadata_base.Context[arguments.context] + compute_digest = dataset_module.ComputeDigest[getattr(arguments, 'compute_digest', dataset_module.ComputeDigest.ONLY_IF_MISSING.name)] + runtime_environment = pipeline_run_module.RuntimeEnvironment( + worker_id=getattr(arguments, 'worker_id', None), + ) + + if getattr(arguments, 'input_run', None) is not None: + parsed_pipeline_runs = pipeline_run_parser( + arguments.input_run, getattr(arguments, 'pipeline_search_paths', []), getattr(arguments, 'datasets_dir', None), + pipeline_resolver=pipeline_resolver, dataset_resolver=dataset_resolver, problem_resolver=problem_resolver, + strict_resolving=getattr(arguments, 'strict_resolving', False), + compute_digest=compute_digest, strict_digest=getattr(arguments, 'strict_digest', False), + ) + + if len(parsed_pipeline_runs) != 1: + raise exceptions.InvalidArgumentValueError( + "Fit requires exactly one pipeline run. {pipeline_runs} provided.".format(pipeline_runs=len(parsed_pipeline_runs)) + ) + if parsed_pipeline_runs[0]['run']['phase'] != metadata_base.PipelineRunPhase.FIT.name: + raise exceptions.InvalidArgumentValueError( + "Fit requires a FIT phase pipeline run. {phase} phase provided.".format(phase=parsed_pipeline_runs[0]['run']['phase']) + ) + fit_pipeline_run = parsed_pipeline_runs[0] + + pipeline = fit_pipeline_run['pipeline'] + problem_description = fit_pipeline_run.get('problem', None) + inputs = fit_pipeline_run['datasets'] + # Currently, "random_seed" is not yet required. + random_seed = fit_pipeline_run.get('random_seed', 0) + hyperparams = _get_runtime_hyperparams_from_pipeline_run(fit_pipeline_run['pipeline'], fit_pipeline_run.get('steps', [])) + # Currently, "is_standard_pipeline" is not yet required. + is_standard_pipeline = fit_pipeline_run['run'].get('is_standard_pipeline', True) + + else: + pipeline = pipeline_resolver( + arguments.pipeline, + strict_resolving=getattr(arguments, 'strict_resolving', False), + strict_digest=getattr(arguments, 'strict_digest', False), + pipeline_search_paths=getattr(arguments, 'pipeline_search_paths', []), + ) + + if getattr(arguments, 'problem', None) is not None: + problem_description = problem_resolver(arguments.problem, strict_digest=getattr(arguments, 'strict_digest', False)) + else: + problem_description = None + + inputs = [ + dataset_resolver( + input_uri, compute_digest=compute_digest, strict_digest=getattr(arguments, 'strict_digest', False), + ) + for input_uri in getattr(arguments, 'inputs', []) + ] + + random_seed = getattr(arguments, 'random_seed', 0) + # We use default hyper-parameter values for now. 
+ hyperparams = None + is_standard_pipeline = getattr(arguments, 'standard_pipeline', True) + + expose_produced_outputs = getattr(arguments, 'expose_produced_outputs_dir', None) is not None + + fitted_pipeline, predictions, result = fit( + pipeline, inputs, + problem_description=problem_description, + context=context, + hyperparams=hyperparams, + random_seed=random_seed, + volumes_dir=getattr(arguments, 'volumes_dir', None), + scratch_dir=getattr(arguments, 'scratch_dir', None), + runtime_environment=runtime_environment, + is_standard_pipeline=is_standard_pipeline, + expose_produced_outputs=expose_produced_outputs, + ) + + if expose_produced_outputs: + save_steps_outputs(result, arguments.expose_produced_outputs_dir) + + _output_pipeline_runs(arguments, [result.pipeline_run]) + + result.check_success() + + if getattr(arguments, 'save', None) is not None: + pickle.dump(fitted_pipeline, arguments.save) + # Make sure the handle is flushed so that no data is lost. CLI file handles are generally + # used outside of a context manager which would otherwise handle that. + # See: https://gitlab.com/datadrivendiscovery/d3m/issues/436 + arguments.save.flush() + + if getattr(arguments, 'output', None) is not None: + assert is_standard_pipeline + predictions.to_csv(arguments.output) + + +# We have "pipeline_resolver" and "problem_resolver" as arguments (even if we are not +# using them in this function) so that the signature is the same for all handlers. +def produce_handler( + arguments: argparse.Namespace, *, pipeline_resolver: typing.Callable = None, + pipeline_run_parser: typing.Callable = None, dataset_resolver: typing.Callable = None, + problem_resolver: typing.Callable = None, +) -> None: + if pipeline_run_parser is None: + pipeline_run_parser = parse_pipeline_run + if dataset_resolver is None: + dataset_resolver = dataset_module.get_dataset + + compute_digest = dataset_module.ComputeDigest[getattr(arguments, 'compute_digest', dataset_module.ComputeDigest.ONLY_IF_MISSING.name)] + + fitted_pipeline = pickle.load(arguments.fitted_pipeline) + + if not fitted_pipeline.is_standard_pipeline and getattr(arguments, 'output', None) is not None: + raise exceptions.InvalidArgumentValueError("You cannot save predictions for a non-standard pipeline.") + + if getattr(arguments, 'input_run', None) is not None: + parsed_pipeline_runs = pipeline_run_parser( + arguments.input_run, getattr(arguments, 'pipeline_search_paths', []), getattr(arguments, 'datasets_dir', None), + pipeline_resolver=pipeline_resolver, dataset_resolver=dataset_resolver, problem_resolver=problem_resolver, + strict_resolving=getattr(arguments, 'strict_resolving', False), + compute_digest=compute_digest, strict_digest=getattr(arguments, 'strict_digest', False), + ) + + if len(parsed_pipeline_runs) != 1: + raise exceptions.InvalidArgumentValueError( + "Produce requires exactly one pipeline run. {pipeline_runs} provided.".format(pipeline_runs=len(parsed_pipeline_runs)) + ) + if parsed_pipeline_runs[0]['run']['phase'] != metadata_base.PipelineRunPhase.PRODUCE.name: + raise exceptions.InvalidArgumentValueError( + "Produce requires a PRODUCE phase pipeline run. {phase} phase provided.".format(phase=parsed_pipeline_runs[0]['run']['phase']) + ) + produce_pipeline_run = parsed_pipeline_runs[0] + + # TODO: Check that pipeline (and hyperparams, is_standard_pipeline flag) and problem match those in the fitted_pipeline. 
+ + test_inputs = produce_pipeline_run['datasets'] + + else: + test_inputs = [ + dataset_resolver( + input_uri, compute_digest=compute_digest, strict_digest=getattr(arguments, 'strict_digest', False), + ) + for input_uri in getattr(arguments, 'test_inputs', []) + ] + + expose_produced_outputs = getattr(arguments, 'expose_produced_outputs_dir', None) is not None + + predictions, result = produce(fitted_pipeline, test_inputs, expose_produced_outputs=expose_produced_outputs) + + if expose_produced_outputs: + save_steps_outputs(result, arguments.expose_produced_outputs_dir) + + _output_pipeline_runs(arguments, [result.pipeline_run]) + + result.check_success() + + if getattr(arguments, 'output', None) is not None: + assert fitted_pipeline.is_standard_pipeline + predictions.to_csv(arguments.output) + + +# We have "problem_resolver" as an arguments (even if we are not +# using it in this function) so that the signature is the same for all handlers. +def score_handler( + arguments: argparse.Namespace, *, pipeline_resolver: typing.Callable = None, + pipeline_run_parser: typing.Callable = None, dataset_resolver: typing.Callable = None, + problem_resolver: typing.Callable = None, +) -> None: + if pipeline_resolver is None: + pipeline_resolver = pipeline_module.get_pipeline + if pipeline_run_parser is None: + pipeline_run_parser = parse_pipeline_run + if dataset_resolver is None: + dataset_resolver = dataset_module.get_dataset + + context = metadata_base.Context[arguments.context] + compute_digest = dataset_module.ComputeDigest[getattr(arguments, 'compute_digest', dataset_module.ComputeDigest.ONLY_IF_MISSING.name)] + runtime_environment = pipeline_run_module.RuntimeEnvironment( + worker_id=getattr(arguments, 'worker_id', None), + ) + + fitted_pipeline = pickle.load(arguments.fitted_pipeline) + + if not fitted_pipeline.is_standard_pipeline: + raise exceptions.InvalidArgumentValueError("You cannot score a non-standard pipeline.") + + if getattr(arguments, 'input_run', None) is not None: + parsed_pipeline_runs = pipeline_run_parser( + arguments.input_run, getattr(arguments, 'pipeline_search_paths', []), getattr(arguments, 'datasets_dir', None), + pipeline_resolver=pipeline_resolver, dataset_resolver=dataset_resolver, problem_resolver=problem_resolver, + strict_resolving=getattr(arguments, 'strict_resolving', False), + compute_digest=compute_digest, strict_digest=getattr(arguments, 'strict_digest', False), + ) + + if len(parsed_pipeline_runs) != 1: + raise exceptions.InvalidArgumentValueError( + "Score requires exactly one pipeline run. {pipeline_runs} provided.".format(pipeline_runs=len(parsed_pipeline_runs)) + ) + if parsed_pipeline_runs[0]['run']['phase'] != metadata_base.PipelineRunPhase.PRODUCE.name: + raise exceptions.InvalidArgumentValueError( + "Score requires a PRODUCE phase pipeline run. {phase} phase provided.".format(phase=parsed_pipeline_runs[0]['run']['phase']) + ) + produce_pipeline_run = parsed_pipeline_runs[0] + + if 'scoring' not in produce_pipeline_run['run']: + raise exceptions.InvalidArgumentValueError("Score requires a pipeline run with scoring.") + if 'datasets' not in produce_pipeline_run['run']['scoring']: + raise exceptions.InvalidArgumentValueError("Score requires scoring datasets to be referenced in the PRODUCE phase pipeline run.") + + # TODO: Check that pipeline (and hyperparams, is_standard_pipeline flag) and problem match those in the fitted_pipeline. 
+ + scoring_pipeline = produce_pipeline_run['run']['scoring']['pipeline'] + test_inputs = produce_pipeline_run['datasets'] + score_inputs = produce_pipeline_run['run']['scoring']['datasets'] + # Currently, "random_seed" is not yet required. + random_seed = produce_pipeline_run['run']['scoring'].get('random_seed', 0) + # We do not have to set metrics, because they should already be included in hyper-paramters. + metrics: typing.Sequence[typing.Dict] = [] + scoring_params = _get_data_and_scoring_params_from_pipeline_run(produce_pipeline_run['run']['scoring'].get('steps', [])) + + else: + scoring_pipeline = pipeline_resolver( + arguments.scoring_pipeline, + strict_resolving=getattr(arguments, 'strict_resolving', False), + strict_digest=getattr(arguments, 'strict_digest', False), + pipeline_search_paths=getattr(arguments, 'pipeline_search_paths', []), + ) + + test_inputs = [ + dataset_resolver( + input_uri, compute_digest=compute_digest, strict_digest=getattr(arguments, 'strict_digest', False), + ) + for input_uri in getattr(arguments, 'test_inputs', []) + ] + score_inputs = [ + dataset_resolver( + score_input_uri, compute_digest=compute_digest, strict_digest=getattr(arguments, 'strict_digest', False), + ) + for score_input_uri in getattr(arguments, 'score_inputs', []) + ] + + random_seed = getattr(arguments, 'random_seed', 0) + + if getattr(arguments, 'metrics', None) is not None: + metrics = get_metrics_from_list(arguments.metrics) + else: + metrics = get_metrics_from_problem_description(fitted_pipeline.problem_description) + + if getattr(arguments, 'scoring_params', None) is not None: + scoring_params = {name: value for name, value in arguments.scoring_params} + else: + scoring_params = {} + + expose_produced_outputs = getattr(arguments, 'expose_produced_outputs_dir', None) is not None + + predictions, produce_result = produce(fitted_pipeline, test_inputs, expose_produced_outputs=expose_produced_outputs) + + if expose_produced_outputs: + save_steps_outputs(produce_result, arguments.expose_produced_outputs_dir) + + if produce_result.has_error(): + _output_pipeline_runs(arguments, [produce_result.pipeline_run]) + + produce_result.check_success() + + assert False + + if getattr(arguments, 'output', None) is not None: + predictions.to_csv(arguments.output) + + scores, score_result = score( + predictions, + score_inputs, + scoring_pipeline=scoring_pipeline, + problem_description=fitted_pipeline.problem_description, + metrics=metrics, + predictions_random_seed=fitted_pipeline.random_seed, + scoring_params=scoring_params, + context=context, + random_seed=random_seed, + volumes_dir=getattr(arguments, 'volumes_dir', None), + scratch_dir=getattr(arguments, 'scratch_dir', None), + runtime_environment=runtime_environment, + ) + + # Modifies "produce_result.pipeline_run" in-place. + combine_pipeline_runs( + produce_result.pipeline_run, scoring_pipeline_run=score_result.pipeline_run, score_inputs=score_inputs, + ) + + if score_result.has_error(): + _output_pipeline_runs(arguments, [produce_result.pipeline_run]) + + score_result.check_success() + + assert False + + # Modifies "produce_pipeline_run" in-place. 
+ combine_pipeline_runs( + produce_result.pipeline_run, metrics=metrics, scores=scores, + ) + + _output_pipeline_runs(arguments, [produce_result.pipeline_run]) + + if getattr(arguments, 'scores', None) is not None: + scores.to_csv(arguments.scores) + + +def fit_produce_handler( + arguments: argparse.Namespace, *, pipeline_resolver: typing.Callable = None, + pipeline_run_parser: typing.Callable = None, dataset_resolver: typing.Callable = None, + problem_resolver: typing.Callable = None, +) -> None: + if pipeline_resolver is None: + pipeline_resolver = pipeline_module.get_pipeline + if pipeline_run_parser is None: + pipeline_run_parser = parse_pipeline_run + if dataset_resolver is None: + dataset_resolver = dataset_module.get_dataset + if problem_resolver is None: + problem_resolver = problem.get_problem + + context = metadata_base.Context[arguments.context] + compute_digest = dataset_module.ComputeDigest[getattr(arguments, 'compute_digest', dataset_module.ComputeDigest.ONLY_IF_MISSING.name)] + runtime_environment = pipeline_run_module.RuntimeEnvironment( + worker_id=getattr(arguments, 'worker_id', None), + ) + + if getattr(arguments, 'input_run', None) is not None: + parsed_pipeline_runs = pipeline_run_parser( + arguments.input_run, getattr(arguments, 'pipeline_search_paths', []), getattr(arguments, 'datasets_dir', None), + pipeline_resolver=pipeline_resolver, dataset_resolver=dataset_resolver, problem_resolver=problem_resolver, + strict_resolving=getattr(arguments, 'strict_resolving', False), + compute_digest=compute_digest, strict_digest=getattr(arguments, 'strict_digest', False), + ) + + if len(parsed_pipeline_runs) != 2: + raise exceptions.InvalidArgumentValueError( + "Fit-produce requires exactly two pipeline runs. {pipeline_runs} provided.".format(pipeline_runs=len(parsed_pipeline_runs)) + ) + # TODO: We might not want to require that the order in the file is strict. + # We could just require that pipeline runs belong together (using previous_pipeline_run) + # and are of FIT and PRODUCE phase and then run them in the correct order. + pipeline_run_0_phase = parsed_pipeline_runs[0]['run']['phase'] + if pipeline_run_0_phase != metadata_base.PipelineRunPhase.FIT.name: + raise exceptions.InvalidArgumentValueError( + "Fit-produce requires the first pipeline run to be a FIT phase. {phase} phase provided.".format(phase=pipeline_run_0_phase) + ) + pipeline_run_1_phase = parsed_pipeline_runs[1]['run']['phase'] + if pipeline_run_1_phase != metadata_base.PipelineRunPhase.PRODUCE.name: + raise exceptions.InvalidArgumentValueError( + "Fit-produce requires the second pipeline run to be a PRODUCE phase. 
{phase} phase provided.".format(phase=pipeline_run_1_phase) + ) + fit_pipeline_run = parsed_pipeline_runs[0] + produce_pipeline_run = parsed_pipeline_runs[1] + + if produce_pipeline_run['previous_pipeline_run']['id'] != fit_pipeline_run['id']: + raise exceptions.InvalidArgumentValueError("Fit-produce requires that the PRODUCE phase pipeline run must reference FIT phase pipeline run in \"previous_pipeline_run\".") + if fit_pipeline_run['pipeline'].id != produce_pipeline_run['pipeline'].id or fit_pipeline_run['pipeline'].get_digest() != produce_pipeline_run['pipeline'].get_digest(): + raise exceptions.InvalidArgumentValueError("Fit-produce requires that both the FIT phase and PRODUCE phase pipeline runs reference the same pipeline.") + if fit_pipeline_run['problem']['id'] != produce_pipeline_run['problem']['id'] or fit_pipeline_run['problem'].get_digest() != produce_pipeline_run['problem'].get_digest(): + raise exceptions.InvalidArgumentValueError("Fit-produce requires that both the FIT phase and PRODUCE phase pipeline runs reference the same problem description.") + + # TODO: Check that hyperparams match between both pipeline runs (but allow failed runs). + # TODO: Check that inputs match between both pipeline runs. + + pipeline = fit_pipeline_run['pipeline'] + problem_description = fit_pipeline_run.get('problem', None) + inputs = fit_pipeline_run['datasets'] + test_inputs = produce_pipeline_run['datasets'] + # Currently, "random_seed" is not yet required. + random_seed = fit_pipeline_run.get('random_seed', 0) + hyperparams = _get_runtime_hyperparams_from_pipeline_run(fit_pipeline_run['pipeline'], fit_pipeline_run.get('steps', [])) + # Currently, "is_standard_pipeline" is not yet required. + is_standard_pipeline = fit_pipeline_run['run'].get('is_standard_pipeline', True) + + else: + pipeline = pipeline_resolver( + arguments.pipeline, + strict_resolving=getattr(arguments, 'strict_resolving', False), + strict_digest=getattr(arguments, 'strict_digest', False), + pipeline_search_paths=getattr(arguments, 'pipeline_search_paths', []), + ) + + if getattr(arguments, 'problem', None) is not None: + problem_description = problem_resolver(arguments.problem, strict_digest=getattr(arguments, 'strict_digest', False)) + else: + problem_description = None + + inputs = [ + dataset_resolver( + input_uri, compute_digest=compute_digest, strict_digest=getattr(arguments, 'strict_digest', False), + ) + for input_uri in getattr(arguments, 'inputs', []) + ] + test_inputs = [ + dataset_resolver( + input_uri, compute_digest=compute_digest, strict_digest=getattr(arguments, 'strict_digest', False), + ) + for input_uri in getattr(arguments, 'test_inputs', []) + ] + + random_seed = getattr(arguments, 'random_seed', 0) + # We use default hyper-parameter values for now. 
+ hyperparams = None + is_standard_pipeline = getattr(arguments, 'standard_pipeline', True) + + fitted_pipeline, predictions, fit_result = fit( + pipeline, inputs, + problem_description=problem_description, + context=context, + hyperparams=hyperparams, + random_seed=random_seed, + volumes_dir=getattr(arguments, 'volumes_dir', None), + scratch_dir=getattr(arguments, 'scratch_dir', None), + runtime_environment=runtime_environment, + is_standard_pipeline=is_standard_pipeline, + ) + + if fit_result.has_error(): + _output_pipeline_runs(arguments, [fit_result.pipeline_run]) + + fit_result.check_success() + + assert False + + if getattr(arguments, 'save', None) is not None: + pickle.dump(fitted_pipeline, arguments.save) + # Make sure the handle is flushed so that no data is lost. CLI file handles are generally + # used outside of a context manager which would otherwise handle that. + # See: https://gitlab.com/datadrivendiscovery/d3m/issues/436 + arguments.save.flush() + + expose_produced_outputs = getattr(arguments, 'expose_produced_outputs_dir', None) is not None + + predictions, produce_result = produce(fitted_pipeline, test_inputs, expose_produced_outputs=expose_produced_outputs) + + if expose_produced_outputs: + save_steps_outputs(produce_result, arguments.expose_produced_outputs_dir) + + _output_pipeline_runs(arguments, [fit_result.pipeline_run, produce_result.pipeline_run]) + + produce_result.check_success() + + if getattr(arguments, 'output', None) is not None: + assert is_standard_pipeline + predictions.to_csv(arguments.output) + + +def fit_score_handler( + arguments: argparse.Namespace, *, pipeline_resolver: typing.Callable = None, + pipeline_run_parser: typing.Callable = None, dataset_resolver: typing.Callable = None, + problem_resolver: typing.Callable = None, +) -> None: + if pipeline_resolver is None: + pipeline_resolver = pipeline_module.get_pipeline + if pipeline_run_parser is None: + pipeline_run_parser = parse_pipeline_run + if dataset_resolver is None: + dataset_resolver = dataset_module.get_dataset + if problem_resolver is None: + problem_resolver = problem.get_problem + + context = metadata_base.Context[arguments.context] + compute_digest = dataset_module.ComputeDigest[getattr(arguments, 'compute_digest', dataset_module.ComputeDigest.ONLY_IF_MISSING.name)] + runtime_environment = pipeline_run_module.RuntimeEnvironment( + worker_id=getattr(arguments, 'worker_id', None), + ) + + if getattr(arguments, 'input_run', None) is not None: + parsed_pipeline_runs = pipeline_run_parser( + arguments.input_run, getattr(arguments, 'pipeline_search_paths', []), getattr(arguments, 'datasets_dir', None), + pipeline_resolver=pipeline_resolver, dataset_resolver=dataset_resolver, problem_resolver=problem_resolver, + strict_resolving=getattr(arguments, 'strict_resolving', False), + compute_digest=compute_digest, strict_digest=getattr(arguments, 'strict_digest', False), + ) + + if len(parsed_pipeline_runs) != 2: + raise exceptions.InvalidArgumentValueError( + "Fit-score requires exactly two pipeline runs. {pipeline_runs} provided.".format(pipeline_runs=len(parsed_pipeline_runs)) + ) + # TODO: We might not want to require that the order in the file is strict. + # We could just require that pipeline runs belong together (using previous_pipeline_run) + # and are of FIT and PRODUCE phase and then run them in the correct order. 
+        pipeline_run_0_phase = parsed_pipeline_runs[0]['run']['phase']
+        if pipeline_run_0_phase != metadata_base.PipelineRunPhase.FIT.name:
+            raise exceptions.InvalidArgumentValueError(
+                "Fit-score requires the first pipeline run to be a FIT phase. {phase} phase provided.".format(phase=pipeline_run_0_phase)
+            )
+        pipeline_run_1_phase = parsed_pipeline_runs[1]['run']['phase']
+        if pipeline_run_1_phase != metadata_base.PipelineRunPhase.PRODUCE.name:
+            raise exceptions.InvalidArgumentValueError(
+                "Fit-score requires the second pipeline run to be a PRODUCE phase. {phase} phase provided.".format(phase=pipeline_run_1_phase)
+            )
+        fit_pipeline_run = parsed_pipeline_runs[0]
+        produce_pipeline_run = parsed_pipeline_runs[1]
+
+        if produce_pipeline_run['previous_pipeline_run']['id'] != fit_pipeline_run['id']:
+            raise exceptions.InvalidArgumentValueError("Fit-score requires that the PRODUCE phase pipeline run references the FIT phase pipeline run in \"previous_pipeline_run\".")
+        if fit_pipeline_run['pipeline'].id != produce_pipeline_run['pipeline'].id or fit_pipeline_run['pipeline'].get_digest() != produce_pipeline_run['pipeline'].get_digest():
+            raise exceptions.InvalidArgumentValueError("Fit-score requires that both the FIT phase and PRODUCE phase pipeline runs reference the same pipeline.")
+        if fit_pipeline_run['problem']['id'] != produce_pipeline_run['problem']['id'] or fit_pipeline_run['problem'].get_digest() != produce_pipeline_run['problem'].get_digest():
+            raise exceptions.InvalidArgumentValueError("Fit-score requires that both the FIT phase and PRODUCE phase pipeline runs reference the same problem description.")
+        if 'scoring' not in produce_pipeline_run['run']:
+            raise exceptions.InvalidArgumentValueError("Fit-score requires the PRODUCE phase pipeline run to be a pipeline run with scoring.")
+        if 'datasets' not in produce_pipeline_run['run']['scoring']:
+            raise exceptions.InvalidArgumentValueError("Fit-score requires scoring datasets to be referenced in the PRODUCE phase pipeline run.")
+
+        # TODO: Check that hyperparams match between both pipeline runs (but allow failed runs).
+        # TODO: Check that inputs match between both pipeline runs.
+        # TODO: Check that scoring pipelines match between both pipeline runs.
+
+        pipeline = fit_pipeline_run['pipeline']
+        scoring_pipeline = produce_pipeline_run['run']['scoring']['pipeline']
+        problem_description = fit_pipeline_run.get('problem', None)
+        inputs = fit_pipeline_run['datasets']
+        test_inputs = produce_pipeline_run['datasets']
+        score_inputs = produce_pipeline_run['run']['scoring']['datasets']
+        # Currently, "random_seed" is not yet required.
+        random_seed = fit_pipeline_run.get('random_seed', 0)
+        hyperparams = _get_runtime_hyperparams_from_pipeline_run(fit_pipeline_run['pipeline'], fit_pipeline_run.get('steps', []))
+        # Currently, "random_seed" is not yet required.
+        scoring_random_seed = produce_pipeline_run['run']['scoring'].get('random_seed', 0)
+        # We do not have to set metrics, because they should already be included in hyper-parameters.
+ metrics: typing.Sequence[typing.Dict] = [] + scoring_params = _get_data_and_scoring_params_from_pipeline_run(produce_pipeline_run['run']['scoring'].get('steps', [])) + + else: + pipeline = pipeline_resolver( + arguments.pipeline, + strict_resolving=getattr(arguments, 'strict_resolving', False), + strict_digest=getattr(arguments, 'strict_digest', False), + pipeline_search_paths=getattr(arguments, 'pipeline_search_paths', []), + ) + scoring_pipeline = pipeline_resolver( + arguments.scoring_pipeline, + strict_resolving=getattr(arguments, 'strict_resolving', False), + strict_digest=getattr(arguments, 'strict_digest', False), + pipeline_search_paths=getattr(arguments, 'pipeline_search_paths', []), + ) + + if getattr(arguments, 'problem', None) is not None: + problem_description = problem_resolver(arguments.problem, strict_digest=getattr(arguments, 'strict_digest', False)) + else: + problem_description = None + + inputs = [ + dataset_resolver( + input_uri, compute_digest=compute_digest, strict_digest=getattr(arguments, 'strict_digest', False), + ) + for input_uri in getattr(arguments, 'inputs', []) + ] + test_inputs = [ + dataset_resolver( + input_uri, compute_digest=compute_digest, strict_digest=getattr(arguments, 'strict_digest', False), + ) + for input_uri in getattr(arguments, 'test_inputs', []) + ] + score_inputs = [ + dataset_resolver( + score_input_uri, compute_digest=compute_digest, strict_digest=getattr(arguments, 'strict_digest', False), + ) + for score_input_uri in getattr(arguments, 'score_inputs', []) + ] + + random_seed = getattr(arguments, 'random_seed', 0) + hyperparams = None + scoring_random_seed = getattr(arguments, 'scoring_random_seed', 0) + + if getattr(arguments, 'metrics', None) is not None: + metrics = get_metrics_from_list(arguments.metrics) + else: + metrics = get_metrics_from_problem_description(problem_description) + + if getattr(arguments, 'scoring_params', None) is not None: + scoring_params = {name: value for name, value in arguments.scoring_params} + else: + scoring_params = {} + + fitted_pipeline, predictions, fit_result = fit( + pipeline, inputs, + problem_description=problem_description, + context=context, + hyperparams=hyperparams, + random_seed=random_seed, + volumes_dir=getattr(arguments, 'volumes_dir', None), + scratch_dir=getattr(arguments, 'scratch_dir', None), + runtime_environment=runtime_environment, + ) + + if fit_result.has_error(): + _output_pipeline_runs(arguments, [fit_result.pipeline_run]) + + fit_result.check_success() + + assert False + + if getattr(arguments, 'save', None) is not None: + pickle.dump(fitted_pipeline, arguments.save) + # Make sure the handle is flushed so that no data is lost. CLI file handles are generally + # used outside of a context manager which would otherwise handle that. 
+ # See: https://gitlab.com/datadrivendiscovery/d3m/issues/436 + arguments.save.flush() + + expose_produced_outputs = getattr(arguments, 'expose_produced_outputs_dir', None) is not None + + predictions, produce_result = produce(fitted_pipeline, test_inputs, expose_produced_outputs=expose_produced_outputs) + + if expose_produced_outputs: + save_steps_outputs(produce_result, arguments.expose_produced_outputs_dir) + + if produce_result.has_error(): + _output_pipeline_runs(arguments, [fit_result.pipeline_run, produce_result.pipeline_run]) + + produce_result.check_success() + + assert False + + if getattr(arguments, 'output', None) is not None: + predictions.to_csv(arguments.output) + + scores, score_result = score( + predictions, score_inputs, + scoring_pipeline=scoring_pipeline, + problem_description=problem_description, + metrics=metrics, + predictions_random_seed=fitted_pipeline.random_seed, + scoring_params=scoring_params, context=context, + random_seed=scoring_random_seed, + volumes_dir=getattr(arguments, 'volumes_dir', None), + scratch_dir=getattr(arguments, 'scratch_dir', None), + runtime_environment=runtime_environment, + ) + + # Modifies "produce_result.pipeline_run" in-place. + combine_pipeline_runs( + produce_result.pipeline_run, scoring_pipeline_run=score_result.pipeline_run, score_inputs=score_inputs, + ) + + if score_result.has_error(): + _output_pipeline_runs(arguments, [fit_result.pipeline_run, produce_result.pipeline_run]) + + score_result.check_success() + + assert False + + # Modifies "produce_result.pipeline_run" in-place. + combine_pipeline_runs( + produce_result.pipeline_run, metrics=metrics, scores=scores, + ) + + _output_pipeline_runs(arguments, [fit_result.pipeline_run, produce_result.pipeline_run]) + + if getattr(arguments, 'scores', None) is not None: + scores.to_csv(arguments.scores) + + +# We have "pipeline_run_parser" as an arguments (even if we are not +# using it in this function) so that the signature is the same for all handlers. 
+def score_predictions_handler( + arguments: argparse.Namespace, *, pipeline_resolver: typing.Callable = None, + pipeline_run_parser: typing.Callable = None, dataset_resolver: typing.Callable = None, + problem_resolver: typing.Callable = None, +) -> None: + if pipeline_resolver is None: + pipeline_resolver = pipeline_module.get_pipeline + if dataset_resolver is None: + dataset_resolver = dataset_module.get_dataset + if problem_resolver is None: + problem_resolver = problem.get_problem + + context = metadata_base.Context[arguments.context] + compute_digest = dataset_module.ComputeDigest[getattr(arguments, 'compute_digest', dataset_module.ComputeDigest.ONLY_IF_MISSING.name)] + runtime_environment = pipeline_run_module.RuntimeEnvironment( + worker_id=getattr(arguments, 'worker_id', None), + ) + + scoring_pipeline = pipeline_resolver( + arguments.scoring_pipeline, + strict_resolving=getattr(arguments, 'strict_resolving', False), + strict_digest=getattr(arguments, 'strict_digest', False), + pipeline_search_paths=getattr(arguments, 'pipeline_search_paths', []), + ) + + if getattr(arguments, 'problem', None) is not None: + problem_description = problem_resolver(arguments.problem, strict_digest=getattr(arguments, 'strict_digest', False)) + else: + problem_description = None + + score_inputs = [ + dataset_resolver( + score_input_uri, compute_digest=compute_digest, strict_digest=getattr(arguments, 'strict_digest', False), + ) + for score_input_uri in getattr(arguments, 'score_inputs', []) + ] + + predictions_dataframe = pandas.read_csv( + arguments.predictions, + # We do not want to do any conversion of values at this point. + # This should be done by primitives later on. + dtype=str, + # We always expect one row header. + header=0, + # We want empty strings and not NaNs. + na_filter=False, + encoding='utf8', + low_memory=False, + memory_map=True, + ) + predictions_random_seed = getattr(arguments, 'predictions_random_seed', None) + scoring_random_seed = getattr(arguments, 'scoring_random_seed', 0) + + if getattr(arguments, 'metrics', None) is not None: + metrics = get_metrics_from_list(arguments.metrics) + else: + metrics = get_metrics_from_problem_description(problem_description) + + if getattr(arguments, 'scoring_params', None) is not None: + scoring_params = {name: value for name, value in arguments.scoring_params} + else: + scoring_params = {} + + # Convert pandas DataFrame to container DataFrame. 
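The predictions file is deliberately read with every value kept as a string and empty cells preserved, because value conversion is left to primitives; the frame is then wrapped as a d3m container DataFrame so metadata can be generated. A small illustration of that conversion (the file name is a placeholder):
```
import pandas
from d3m import container

# Read predictions exactly as the handler does: keep everything as strings
# and preserve empty cells (primitives do value conversion later).
predictions_dataframe = pandas.read_csv(
    'predictions.csv',  # hypothetical path
    dtype=str,
    header=0,
    na_filter=False,
    encoding='utf8',
    low_memory=False,
    memory_map=True,
)

# Wrap as a d3m container DataFrame so that basic metadata gets generated.
predictions = container.DataFrame(predictions_dataframe, generate_metadata=True)
```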
+ predictions = container.DataFrame(predictions_dataframe, generate_metadata=True) + + if getattr(arguments, 'output', None) is not None: + predictions.to_csv(arguments.output) + + scores, score_result = score( + predictions, score_inputs, + scoring_pipeline=scoring_pipeline, + problem_description=problem_description, + metrics=metrics, + predictions_random_seed=predictions_random_seed, + scoring_params=scoring_params, + context=context, + random_seed=scoring_random_seed, + volumes_dir=getattr(arguments, 'volumes_dir', None), + scratch_dir=getattr(arguments, 'scratch_dir', None), + runtime_environment=runtime_environment, + ) + + score_result.check_success() + + if getattr(arguments, 'scores', None) is not None: + scores.to_csv(arguments.scores) + + +def evaluate_handler( + arguments: argparse.Namespace, *, pipeline_resolver: typing.Callable = None, pipeline_run_parser: typing.Callable = None, + dataset_resolver: typing.Callable = None, problem_resolver: typing.Callable = None, +) -> None: + if pipeline_resolver is None: + pipeline_resolver = pipeline_module.get_pipeline + if pipeline_run_parser is None: + pipeline_run_parser = parse_pipeline_run + if dataset_resolver is None: + dataset_resolver = dataset_module.get_dataset + if problem_resolver is None: + problem_resolver = problem.get_problem + + context = metadata_base.Context[arguments.context] + compute_digest = dataset_module.ComputeDigest[getattr(arguments, 'compute_digest', dataset_module.ComputeDigest.ONLY_IF_MISSING.name)] + runtime_environment = pipeline_run_module.RuntimeEnvironment( + worker_id=getattr(arguments, 'worker_id', None), + ) + + if getattr(arguments, 'input_run', None) is not None: + parsed_pipeline_runs = pipeline_run_parser( + arguments.input_run, getattr(arguments, 'pipeline_search_paths', []), getattr(arguments, 'datasets_dir', None), + pipeline_resolver=pipeline_resolver, dataset_resolver=dataset_resolver, problem_resolver=problem_resolver, + strict_resolving=getattr(arguments, 'strict_resolving', False), + compute_digest=compute_digest, strict_digest=getattr(arguments, 'strict_digest', False), + ) + + # TODO: Support more than 2 pipeline runs (cross validation). + # See: https://gitlab.com/datadrivendiscovery/d3m/issues/407 + if len(parsed_pipeline_runs) != 2: + raise exceptions.InvalidArgumentValueError( + "Evaluate requires exactly two pipeline runs. {pipeline_runs} provided.".format(pipeline_runs=len(parsed_pipeline_runs)) + ) + # TODO: We might not want to require that the order in the file is strict. + # We could just require that pipeline runs belong together (using previous_pipeline_run) + # and are of FIT and PRODUCE phase and then run them in the correct order. + pipeline_run_0_phase = parsed_pipeline_runs[0]['run']['phase'] + if pipeline_run_0_phase != metadata_base.PipelineRunPhase.FIT.name: + raise exceptions.InvalidArgumentValueError( + "Evaluate requires the first pipeline run to be a FIT phase. {phase} phase provided.".format(phase=pipeline_run_0_phase) + ) + pipeline_run_1_phase = parsed_pipeline_runs[1]['run']['phase'] + if pipeline_run_1_phase != metadata_base.PipelineRunPhase.PRODUCE.name: + raise exceptions.InvalidArgumentValueError( + "Evaluate requires the second pipeline run to be a PRODUCE phase. 
{phase} phase provided.".format(phase=pipeline_run_1_phase) + ) + fit_pipeline_run = parsed_pipeline_runs[0] + produce_pipeline_run = parsed_pipeline_runs[1] + + if produce_pipeline_run['previous_pipeline_run']['id'] != fit_pipeline_run['id']: + raise exceptions.InvalidArgumentValueError("Evaluate requires that the PRODUCE phase pipeline run must reference FIT phase pipeline run in \"previous_pipeline_run\".") + if fit_pipeline_run['pipeline'].id != produce_pipeline_run['pipeline'].id or fit_pipeline_run['pipeline'].get_digest() != produce_pipeline_run['pipeline'].get_digest(): + raise exceptions.InvalidArgumentValueError("Evaluate requires that both the FIT phase and PRODUCE phase pipeline runs reference the same pipeline.") + if fit_pipeline_run['problem']['id'] != produce_pipeline_run['problem']['id'] or fit_pipeline_run['problem'].get_digest() != produce_pipeline_run['problem'].get_digest(): + raise exceptions.InvalidArgumentValueError("Evaluate requires that both the FIT phase and PRODUCE phase pipeline runs reference the same problem description.") + if 'scoring' not in produce_pipeline_run['run']: + raise exceptions.InvalidArgumentValueError("Evaluate requires the PRODUCE phase pipeline run to be a pipeline run with scoring.") + if 'data_preparation' not in produce_pipeline_run['run']: + raise exceptions.InvalidArgumentValueError("Evaluate requires the FIT phase pipeline run to be a pipeline run with data preparation.") + + # TODO: Check that hyperparams match between both pipeline runs (but allow failed runs). + # TODO: Check that inputs match between both pipeline runs. + # TODO: Check that data preparation pipelines match between both pipeline runs. + # TODO: Check that scoring pipelines match between both pipeline runs. + + pipeline = fit_pipeline_run['pipeline'] + data_pipeline = fit_pipeline_run['run']['data_preparation']['pipeline'] + scoring_pipeline = produce_pipeline_run['run']['scoring']['pipeline'] + problem_description = fit_pipeline_run.get('problem', None) + inputs = fit_pipeline_run['datasets'] + # Currently, "random_seed" is not yet required. + random_seed = fit_pipeline_run.get('random_seed', 0) + hyperparams = _get_runtime_hyperparams_from_pipeline_run(fit_pipeline_run['pipeline'], fit_pipeline_run.get('steps', [])) + # Currently, "random_seed" is not yet required. + data_random_seed = fit_pipeline_run['run']['data_preparation'].get('random_seed', 0) + # Currently, "random_seed" is not yet required. + scoring_random_seed = produce_pipeline_run['run']['scoring'].get('random_seed', 0) + # We do not have to set metrics, because they should already be included in hyper-paramters. 
+ metrics: typing.Sequence[typing.Dict] = [] + data_params = _get_data_and_scoring_params_from_pipeline_run(fit_pipeline_run['run']['data_preparation'].get('steps', [])) + scoring_params = _get_data_and_scoring_params_from_pipeline_run(produce_pipeline_run['run']['scoring'].get('steps', [])) + + else: + pipeline = pipeline_resolver( + arguments.pipeline, + strict_resolving=getattr(arguments, 'strict_resolving', False), + strict_digest=getattr(arguments, 'strict_digest', False), + pipeline_search_paths=getattr(arguments, 'pipeline_search_paths', []), + ) + data_pipeline = pipeline_resolver( + arguments.data_pipeline, + strict_resolving=getattr(arguments, 'strict_resolving', False), + strict_digest=getattr(arguments, 'strict_digest', False), + pipeline_search_paths=getattr(arguments, 'pipeline_search_paths', []), + ) + scoring_pipeline = pipeline_resolver( + arguments.scoring_pipeline, + strict_resolving=getattr(arguments, 'strict_resolving', False), + strict_digest=getattr(arguments, 'strict_digest', False), + pipeline_search_paths=getattr(arguments, 'pipeline_search_paths', []), + ) + + if getattr(arguments, 'problem', None) is not None: + problem_description = problem_resolver(arguments.problem, strict_digest=getattr(arguments, 'strict_digest', False)) + else: + problem_description = None + + inputs = [ + dataset_resolver( + input_uri, compute_digest=compute_digest, strict_digest=getattr(arguments, 'strict_digest', False), + ) + for input_uri in getattr(arguments, 'inputs', []) + ] + + random_seed = getattr(arguments, 'random_seed', 0) + hyperparams = None + data_random_seed = getattr(arguments, 'data_random_seed', 0) + scoring_random_seed = getattr(arguments, 'scoring_random_seed', 0) + + if getattr(arguments, 'metrics', None) is not None: + metrics = get_metrics_from_list(arguments.metrics) + else: + metrics = get_metrics_from_problem_description(problem_description) + + if getattr(arguments, 'data_params', None) is not None: + data_params = {name: value for name, value in arguments.data_params} + else: + data_params = {} + + if getattr(arguments, 'data_split_file', None) is not None: + split_file = pandas.read_csv( + arguments.data_split_file, + # We do not want to do any conversion of values at this point. + # This should be done by primitives later on. + dtype=str, + # We always expect one row header. + header=0, + # We want empty strings and not NaNs. + na_filter=False, + encoding='utf8', + low_memory=False, + memory_map=True, + ) + + # We use just the "d3mIndex" column and ignore multi-key indices. + # This works for now because it seems that every current multi-key + # dataset in fact has an unique value in "d3mIndex" alone. + # See: https://gitlab.com/datadrivendiscovery/data-supply/issues/117 + # Hyper-parameter value has to be JSON-serialized. 
+ data_params['primary_index_values'] = json.dumps(list(split_file.loc[split_file['type'] == 'TEST']['d3mIndex'])) + + if getattr(arguments, 'scoring_params', None) is not None: + scoring_params = {name: value for name, value in arguments.scoring_params} + else: + scoring_params = {} + + scores_list, results_list = evaluate( + pipeline, inputs, + data_pipeline=data_pipeline, + scoring_pipeline=scoring_pipeline, + problem_description=problem_description, + data_params=data_params, + metrics=metrics, + scoring_params=scoring_params, + context=context, + hyperparams=hyperparams, + random_seed=random_seed, + data_random_seed=data_random_seed, + scoring_random_seed=scoring_random_seed, + volumes_dir=getattr(arguments, 'volumes_dir', None), + scratch_dir=getattr(arguments, 'scratch_dir', None), + runtime_environment=runtime_environment, + ) + + _output_pipeline_runs(arguments, results_list.pipeline_runs) + + results_list.check_success() + + scores = combine_folds(scores_list) + + if getattr(arguments, 'scores', None) is not None: + scores.to_csv(arguments.scores) + + +def save_steps_outputs(results: typing.Union[Result, MultiResult], output_dir: str) -> None: + if isinstance(results, Result): + for key, step_output in results.values.items(): + container_utils.save_container(step_output, os.path.join(output_dir, key)) + elif isinstance(results, MultiResult): + for i, result in enumerate(results): + for key, step_output in result.values.items(): + container_utils.save_container(step_output, os.path.join(output_dir, str(i), key)) + else: + raise exceptions.UnexpectedTypeError("Type: {results_type}".format(results_type=type(results))) + + +def main(argv: typing.Sequence) -> None: + # We have to disable importing while type checking because it makes + # an import cycle in mypy which makes many typing errors. + if not typing.TYPE_CHECKING: + # Importing here to prevent import cycle. + from d3m import cli + + logging.basicConfig() + + logger.warning("This CLI is deprecated. Use \"python3 -m d3m runtime\" instead.") + + parser = argparse.ArgumentParser(description="Run D3M pipelines.") + cli.runtime_configure_parser(parser) + + arguments = parser.parse_args(argv[1:]) + + cli.runtime_handler(arguments, parser) + + +if __name__ == '__main__': + main(sys.argv) diff --git a/d3m/d3m/types.py b/d3m/d3m/types.py new file mode 100644 index 0000000..d1b3bd6 --- /dev/null +++ b/d3m/d3m/types.py @@ -0,0 +1,24 @@ +import numpy # type: ignore + +from d3m import container + +__all__ = ('Data', 'Container') + +# Open an issue if these standard types are too restrictive for you, +# but the idea is that callers should know in advance which data types +# are being passed in and out of primitives to be able to implement +# their introspection, serialization, and so on. + +simple_data_types = ( + str, bytes, bool, float, int, numpy.integer, numpy.float64, numpy.bool_, type(None), +) + +# A tuple representing all standard container types. +Container = ( + container.ndarray, container.DataFrame, + container.List, container.Dataset, +) + +# A tuple representing all standard data types. Data types are those which +# can be contained inside container types. 
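The split-file handling above turns the TEST rows of a data split CSV into a JSON-serialized hyper-parameter for the data preparation pipeline. A small self-contained illustration, with a placeholder file name:
```
import json
import pandas

# Read a data split file the same way the handler does (values kept as strings).
split_file = pandas.read_csv(
    'dataSplits.csv',  # hypothetical path; real files have "d3mIndex" and "type" columns
    dtype=str,
    header=0,
    na_filter=False,
)

# Keep only the "d3mIndex" values of TEST rows and JSON-serialize them,
# because hyper-parameter values passed this way have to be JSON-serialized.
data_params = {
    'primary_index_values': json.dumps(list(split_file.loc[split_file['type'] == 'TEST']['d3mIndex'])),
}
```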
+Data = Container + simple_data_types + (dict,) diff --git a/d3m/d3m/utils.py b/d3m/d3m/utils.py new file mode 100644 index 0000000..e3a1624 --- /dev/null +++ b/d3m/d3m/utils.py @@ -0,0 +1,1823 @@ +import abc +import argparse +import base64 +import collections +import contextlib +import copy +import datetime +import decimal +import enum +import functools +import gzip +import hashlib +import inspect +import json +import logging +import numbers +import operator +import os +import os.path +import pickle +import random +import re +import types +import typing +import sys +import unittest +import uuid +from urllib import parse as url_parse + +import custom_inherit # type: ignore +import frozendict # type: ignore +import git # type: ignore +import jsonpath_ng # type: ignore +import jsonschema # type: ignore +import numpy # type: ignore +import pandas # type: ignore +import typing_inspect # type: ignore +import yaml # type: ignore +import pyrsistent # type: ignore +from jsonschema import validators # type: ignore +from numpy import random as numpy_random # type: ignore +from pytypes import type_util # type: ignore + +import d3m +from d3m import deprecate, exceptions + +if yaml.__with_libyaml__: + from yaml import CSafeLoader as SafeLoader, CSafeDumper as SafeDumper # type: ignore +else: + from yaml import SafeLoader, SafeDumper + +logger = logging.getLogger(__name__) + +NONE_TYPE: typing.Type = type(None) + +# Only types without elements can be listed here. If they are elements, we have to +# check all elements as well. +KNOWN_IMMUTABLE_TYPES = ( + str, int, float, bool, numbers.Integral, decimal.Decimal, + numbers.Real, numpy.integer, numpy.float32, numpy.float64, numpy.bool_, bytes, + datetime.date, datetime.time, datetime.datetime, NONE_TYPE, enum.Enum, +) + +HASH_ID_NAMESPACE = uuid.UUID('8614b2cc-89ef-498e-9254-833233b3959b') + +PACKAGE_BASE = os.path.dirname(d3m.__file__) + + +def current_git_commit(path: str, search_parent_directories: bool = True) -> str: + """ + Returns a git commit hash of the repo at ``path`` or above if ``search_parent_directories`` is ``True``. + + When used to get a commit hash of a Python package, for this to work, the package has + to be installed in "editable" mode (``pip install -e``). + + Parameters + ---------- + path: + A path to repo or somewhere under the repo. + search_parent_directories: + Whether to search for a git repository in directories above ``path``. + + Returns + ------- + A git commit hash. + """ + + repo = git.Repo(path=path, search_parent_directories=search_parent_directories) + return repo.head.object.hexsha + + +# Using typing.TypeVar in type signature does not really work, so we are using type instead. +# See: https://github.com/python/typing/issues/520 +def get_type_arguments(cls: type, *, unique_names: bool = False) -> typing.Dict[type, type]: + """ + Returns a mapping between type arguments and their types of a given class ``cls``. + + Parameters + ---------- + cls: + A class to return mapping for. + unique_names: + Should we force unique names of type parameters. + + Returns + ------- + A mapping from type argument to its type. + """ + + # Using typing.TypeVar in type signature does not really work, so we are using type instead. 
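The `Container` and `Data` tuples in `d3m/types.py` are meant for plain `isinstance` checks against the standard container and data types. A small sketch of the intended usage, assuming the d3m package from this commit is installed:
```
import numpy
from d3m import container
from d3m.types import Container, Data

df = container.DataFrame({'value': [0.1, 0.2]}, generate_metadata=True)

isinstance(df, Container)         # True: DataFrame is a standard container type
isinstance({'a': 1}, Container)   # False: dict is a data type, not a container type
isinstance({'a': 1}, Data)        # True
isinstance(numpy.int64(3), Data)  # True: numpy.integer is among the simple data types
```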
+ # See: https://github.com/python/typing/issues/520 + result: typing.Dict[type, type] = {} + + for base_class in inspect.getmro(typing_inspect.get_origin(cls)): + if base_class == typing.Generic: + break + + if not typing_inspect.is_generic_type(base_class): + continue + + parameters = typing_inspect.get_parameters(base_class) + + # We are using _select_Generic_superclass_parameters and not get_Generic_parameters + # so that we can handle the case where the result is None. + # See: https://github.com/Stewori/pytypes/issues/20 + arguments = type_util._select_Generic_superclass_parameters(cls, base_class) + + if arguments is None: + arguments = [typing.Any] * len(parameters) + + if len(parameters) != len(arguments): + raise TypeError("Number of parameters does not match number of arguments.") + + for parameter, argument in zip(parameters, arguments): + if type_util.resolve_fw_decl(argument, module_name=base_class.__module__, globs=dir(sys.modules[base_class.__module__]))[1]: + argument = argument.__forward_value__ + + visited: typing.Set[type] = set() + while typing_inspect.is_typevar(argument) and argument in result: + if argument in visited: + raise RuntimeError("Loop while resolving type variables.") + visited.add(argument) + + argument = result[argument] + + if parameter == argument: + argument = typing.Any + + if parameter in result: + if result[parameter] != argument: + raise TypeError("Different types for same parameter across class bases: {type1} vs. {type2}".format( + type1=result[parameter], + type2=argument, + )) + else: + result[parameter] = argument + + if unique_names: + type_parameter_names = [parameter.__name__ for parameter in result.keys()] + + type_parameter_names_set = set(type_parameter_names) + + if len(type_parameter_names) != len(type_parameter_names_set): + for name in type_parameter_names_set: + type_parameter_names.remove(name) + raise TypeError("Same name reused across different type parameters: {extra_names}".format(extra_names=type_parameter_names)) + + return result + + +def is_instance(obj: typing.Any, cls: typing.Union[type, typing.Tuple[type]]) -> bool: + # We do not want really to check generators. A workaround. + # See: https://github.com/Stewori/pytypes/issues/49 + if isinstance(obj, types.GeneratorType): + return False + + if isinstance(cls, tuple): + cls = typing.Union[cls] # type: ignore + + # "bound_typevars" argument has to be passed for this function to + # correctly work with type variables. + # See: https://github.com/Stewori/pytypes/issues/24 + return type_util._issubclass(type_util.deep_type(obj), cls, bound_typevars={}) + + +def is_subclass(subclass: type, superclass: typing.Union[type, typing.Tuple[type]]) -> bool: + # "bound_typevars" argument has to be passed for this function to + # correctly work with type variables. 
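`is_instance` and `is_subclass` defer to pytypes so that `typing` constructs (generics, unions, type variables) can be checked at runtime, which plain `isinstance` cannot do. A small sketch of the expected behaviour; the results in the comments are expectations, not guarantees:
```
import typing
from d3m.utils import is_instance, is_subclass

# Runtime checks against typing constructs, which plain isinstance() cannot handle.
is_instance('abc', typing.Union[str, int, float, bool, type(None)])  # expected: True
is_instance([1, 2, 3], typing.Sequence[int])                         # expected: True
is_instance([1, 'a'], typing.Sequence[int])                          # expected: False
is_subclass(bool, typing.Union[int, float])                          # expected: True (bool subclasses int)
```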
+ # See: https://github.com/Stewori/pytypes/issues/24 + return type_util._issubclass(subclass, superclass, bound_typevars={}) + + +def get_type(obj: typing.Any) -> type: + typ = type_util.deep_type(obj, depth=1) + + if is_subclass(typ, type_util.Empty): + typ = typing_inspect.get_last_args(typ)[0] + + return typ + + +def is_instance_method_on_class(method: typing.Any) -> bool: + if is_class_method_on_class(method): + return False + + if inspect.isfunction(method): + return True + + if getattr(method, '__func__', None): + return True + + return False + + +def is_class_method_on_class(method: typing.Any) -> bool: + return inspect.ismethod(method) + + +def is_instance_method_on_object(method: typing.Any, object: typing.Any) -> bool: + if not inspect.ismethod(method): + return False + + if method.__self__ is object: + return True + + return False + + +def is_class_method_on_object(method: typing.Any, object: typing.Any) -> bool: + if not inspect.ismethod(method): + return False + + if method.__self__ is type(object): + return True + + return False + + +def is_type(obj: typing.Any) -> bool: + return isinstance(obj, type) or obj is typing.Any or typing_inspect.is_tuple_type(obj) or typing_inspect.is_union_type(obj) + + +def type_to_str(obj: type) -> str: + return type_util.type_str(obj, assumed_globals={}, update_assumed_globals=False) + + +def get_type_hints(func: typing.Callable) -> typing.Dict[str, typing.Any]: + # To skip decorators. Same stop function as used in "inspect.signature". + func = inspect.unwrap(func, stop=(lambda f: hasattr(f, '__signature__'))) + return type_util.get_type_hints(func) + + +yaml_warning_issued = False + + +def yaml_dump_all(documents: typing.Sequence[typing.Any], stream: typing.IO[typing.Any] = None, **kwds: typing.Any) -> typing.Any: + global yaml_warning_issued + + if not yaml.__with_libyaml__ and not yaml_warning_issued: + yaml_warning_issued = True + logger.warning("cyaml not found, using a slower pure Python YAML implementation.") + + return yaml.dump_all(documents, stream, Dumper=SafeDumper, **kwds) + + +def yaml_dump(data: typing.Any, stream: typing.IO[typing.Any] = None, **kwds: typing.Any) -> typing.Any: + global yaml_warning_issued + + if not yaml.__with_libyaml__ and not yaml_warning_issued: + yaml_warning_issued = True + logger.warning("cyaml not found, using a slower pure Python YAML implementation.") + + return yaml.dump_all([data], stream, Dumper=SafeDumper, **kwds) + + +def yaml_load_all(stream: typing.Union[str, typing.IO[typing.Any]]) -> typing.Any: + global yaml_warning_issued + + if not yaml.__with_libyaml__ and not yaml_warning_issued: + yaml_warning_issued = True + logger.warning("cyaml not found, using a slower pure Python YAML implementation.") + + return yaml.load_all(stream, SafeLoader) + + +def yaml_load(stream: typing.Union[str, typing.IO[typing.Any]]) -> typing.Any: + global yaml_warning_issued + + if not yaml.__with_libyaml__ and not yaml_warning_issued: + yaml_warning_issued = True + logger.warning("cyaml not found, using a slower pure Python YAML implementation.") + + return yaml.load(stream, SafeLoader) + + +def yaml_add_representer(value_type: typing.Type, represented: typing.Callable) -> None: + yaml.Dumper.add_representer(value_type, represented) + yaml.SafeDumper.add_representer(value_type, represented) + + if yaml.__with_libyaml__: + yaml.CDumper.add_representer(value_type, represented) # type: ignore + yaml.CSafeDumper.add_representer(value_type, represented) # type: ignore + + +class EnumMeta(enum.EnumMeta): + def 
__new__(mcls, class_name, bases, namespace, **kwargs): # type: ignore + def __reduce_ex__(self: typing.Any, proto: int) -> typing.Any: + return self.__class__, (self._value_,) + + if '__reduce_ex__' not in namespace: + namespace['__reduce_ex__'] = __reduce_ex__ + + cls = super().__new__(mcls, class_name, bases, namespace, **kwargs) + + def yaml_representer(dumper, data): # type: ignore + return yaml.ScalarNode('tag:yaml.org,2002:str', data.name) + + yaml_add_representer(cls, yaml_representer) + + return cls + + +class Enum(enum.Enum, metaclass=EnumMeta): + """ + An extension of `Enum` base class where: + + * Instances are equal to their string names, too. + * It registers itself with "yaml" module to serialize itself as a string. + * Allows dynamic registration of additional values using ``register_value``. + """ + + def __eq__(self, other): # type: ignore + if isinstance(other, str): + return self.name == other + + return super().__eq__(other) + + # It must hold a == b => hash(a) == hash(b). Because we allow enums to be equal to names, + # the easiest way to assure the condition is to hash everything according to their names. + def __hash__(self): # type: ignore + return hash(self.name) + + @classmethod + def register_value(cls, name: str, value: typing.Any) -> typing.Any: + # This code is based on Python's "EnumMeta.__new__" code, see + # comments there for more information about the code. + # It uses internals of Python's Enum so it is potentially fragile. + + __new__, save_new, use_args = type(cls)._find_new_({}, cls._member_type_, cls) # type: ignore + + dynamic_attributes = { + k for c in cls.mro() + for k, v in c.__dict__.items() + if isinstance(v, types.DynamicClassAttribute) + } + + if not isinstance(value, tuple): + args: typing.Tuple[typing.Any, ...] = (value,) + else: + args = value + if cls._member_type_ is tuple: # type: ignore + args = (args,) + + if not use_args: + enum_member = __new__(cls) + if not hasattr(enum_member, '_value_'): + enum_member._value_ = value + else: + enum_member = __new__(cls, *args) + if not hasattr(enum_member, '_value_'): + if cls._member_type_ is object: # type: ignore + enum_member._value_ = value + else: + enum_member._value_ = cls._member_type_(*args) # type: ignore + value = enum_member._value_ + enum_member._name_ = name + enum_member.__objclass__ = cls + enum_member.__init__(*args) + for canonical_member in cls._member_map_.values(): # type: ignore + if canonical_member._value_ == enum_member._value_: + enum_member = canonical_member + break + else: + cls._member_names_.append(name) # type: ignore + if name not in dynamic_attributes: + setattr(cls, name, enum_member) + cls._member_map_[name] = enum_member # type: ignore + try: + cls._value2member_map_[value] = enum_member # type: ignore + except TypeError: + pass + + +# Return type has to be "Any" because mypy does not support enums generated dynamically +# and complains about missing attributes when trying to access them. +def create_enum_from_json_schema_enum( + class_name: str, obj: typing.Dict, json_paths: typing.Union[typing.Sequence[str], str], *, + module: str = None, qualname: str = None, base_class: type = None +) -> typing.Any: + if qualname is None: + qualname = class_name + + if isinstance(json_paths, str): + names = _get_names(obj, json_paths) + else: + names = [] + for path in json_paths: + names += _get_names(obj, path) + + # Make the list contain unique names. It keeps the original order in Python 3.6+ + # because dicts are ordered. 
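The `Enum` extension above compares equal to the string name of a member, hashes by name so that dict and set lookups stay consistent with that equality, and can grow new members at runtime via `register_value`. A small sketch of the intended usage; the `Phase` enumeration is hypothetical:
```
from d3m.utils import Enum

class Phase(Enum):  # hypothetical enumeration for illustration
    FIT = 1
    PRODUCE = 2

Phase.FIT == 'FIT'              # True: members compare equal to their string names
hash(Phase.FIT) == hash('FIT')  # True: hashing follows names, consistent with equality

# Additional values can be registered dynamically.
Phase.register_value('SCORE', 3)
Phase.SCORE == 'SCORE'          # True
```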
We use the same string for both the name and the value. + pairs = [(name, name) for name in dict.fromkeys(names).keys()] + + return Enum(value=class_name, names=pairs, module=module, qualname=qualname, type=base_class) # type: ignore + + +def _get_names(obj: typing.Dict, path: str) -> typing.List: + json_path_expression = jsonpath_ng.parse(path) + return [match.value for match in json_path_expression.find(obj)] + + +# This allows other modules to register additional immutable values and types. +# We are doing it this way to overcome issues with import cycles. +additional_immutable_values: typing.Tuple[typing.Any, ...] = () +additional_immutable_types: typing.Tuple[type, ...] = () + + +def make_immutable_copy(obj: typing.Any) -> typing.Any: + """ + Converts a given ``obj`` into an immutable copy of it, if possible. + + Parameters + ---------- + obj: + Object to convert. + + Returns + ------- + An immutable copy of ``obj``. + """ + + if any(obj is immutable_value for immutable_value in additional_immutable_values): + return obj + + if isinstance(obj, numpy.matrix): + # One cannot iterate over a matrix segment by segment. You always get back + # a matrix (2D structure) and not an array of rows or columns. By converting + # it to an array such iteration segment by segment works. + obj = numpy.array(obj) + + if isinstance(obj, KNOWN_IMMUTABLE_TYPES): + # Because str is among known immutable types, it will not be picked apart as a sequence. + return obj + if additional_immutable_types and isinstance(obj, additional_immutable_types): + return obj + if is_type(obj): + # Assume all types are immutable. + return obj + if isinstance(obj, typing.Mapping): + # We simply always preserve order of the mapping. Because we want to make sure also mapping's + # values are converted to immutable values, we cannot simply use MappingProxyType. + return frozendict.FrozenOrderedDict((make_immutable_copy(k), make_immutable_copy(v)) for k, v in obj.items()) + if isinstance(obj, typing.Set): + return frozenset(make_immutable_copy(o) for o in obj) + if isinstance(obj, tuple): + # To preserve named tuples. + return type(obj)(make_immutable_copy(o) for o in obj) + if isinstance(obj, pandas.DataFrame): + return tuple(make_immutable_copy(o) for o in obj.itertuples(index=False, name=None)) + if isinstance(obj, (typing.Sequence, numpy.ndarray)): + return tuple(make_immutable_copy(o) for o in obj) + + raise TypeError("{obj} is not known to be immutable.".format(obj=obj)) + + +def check_immutable(obj: typing.Any) -> None: + """ + Checks that ``obj`` is immutable. Raises an exception if this is not true. + + Parameters + ---------- + obj: + Object to check. + """ + + obj_type = type(obj) + + # First check common cases. + if any(obj is immutable_value for immutable_value in additional_immutable_values): + return + if obj_type in KNOWN_IMMUTABLE_TYPES: + return + if obj_type is frozendict.FrozenOrderedDict: + for k, v in obj.items(): + check_immutable(k) + check_immutable(v) + return + if obj_type is tuple: + for o in obj: + check_immutable(o) + return + + if isinstance(obj, KNOWN_IMMUTABLE_TYPES): + return + if additional_immutable_types and isinstance(obj, additional_immutable_types): + return + if isinstance(obj, tuple): + # To support named tuples. + for o in obj: + check_immutable(o) + return + if is_type(obj): + # Assume all types are immutable. 
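`make_immutable_copy` recursively converts containers into hashable equivalents (mappings become frozen dicts, sequences become tuples, sets become frozensets) and raises for anything it cannot prove immutable; `check_immutable` verifies an existing value. A small illustration:
```
from d3m.utils import make_immutable_copy, check_immutable

frozen = make_immutable_copy({'columns': ['a', 'b'], 'tags': {'x', 'y'}, 'n': 3})
# "frozen" is a FrozenOrderedDict whose values are a tuple, a frozenset and an int.

check_immutable(frozen)      # passes silently
check_immutable(['a', 'b'])  # raises TypeError: a list is not known to be immutable
```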
+ return + if obj_type is frozenset: + for o in obj: + check_immutable(o) + return + + raise TypeError("{obj} is not known to be immutable.".format(obj=obj)) + + +class Metaclass(custom_inherit._DocInheritorBase): + """ + A metaclass which makes sure docstrings are inherited. + + It knows how to merge numpy-style docstrings and merge parent sections with + child sections. For example, then it is not necessary to repeat documentation + for parameters if they have not changed. + """ + + @staticmethod + def class_doc_inherit(prnt_doc: str = None, child_doc: str = None) -> typing.Optional[str]: + return custom_inherit.store['numpy'](prnt_doc, child_doc) + + @staticmethod + def attr_doc_inherit(prnt_doc: str = None, child_doc: str = None) -> typing.Optional[str]: + return custom_inherit.store['numpy'](prnt_doc, child_doc) + + +class AbstractMetaclass(abc.ABCMeta, Metaclass): + """ + A metaclass which makes sure docstrings are inherited. For use with abstract classes. + """ + + +class GenericMetaclass(typing.GenericMeta, Metaclass): + """ + A metaclass which makes sure docstrings are inherited. For use with generic classes (which are also abstract). + """ + + +class RefResolverNoRemote(validators.RefResolver): + def resolve_remote(self, uri: str) -> typing.Any: + raise exceptions.NotSupportedError("Remote resolving disabled: {uri}".format(uri=uri)) + + +def enum_validator(validator, enums, instance, schema): # type: ignore + if isinstance(instance, Enum): + instance = instance.name + + yield from validators.Draft7Validator.VALIDATORS['enum'](validator, enums, instance, schema) + + +def json_schema_is_string(checker: jsonschema.TypeChecker, instance: typing.Any) -> bool: + if isinstance(instance, Enum): + return True + else: + return validators.Draft7Validator.TYPE_CHECKER.is_type(instance, 'string') + + +def json_schema_is_object(checker: jsonschema.TypeChecker, instance: typing.Any) -> bool: + if isinstance(instance, (frozendict.frozendict, frozendict.FrozenOrderedDict)): + return True + else: + return validators.Draft7Validator.TYPE_CHECKER.is_type(instance, 'object') + + +def json_schema_is_array(checker: jsonschema.TypeChecker, instance: typing.Any) -> bool: + if isinstance(instance, (tuple, set)): + return True + else: + return validators.Draft7Validator.TYPE_CHECKER.is_type(instance, 'array') + + +JsonSchemaTypeChecker = validators.Draft7Validator.TYPE_CHECKER.redefine_many({ + 'string': json_schema_is_string, + 'object': json_schema_is_object, + 'array': json_schema_is_array, +}) + + +# JSON schema validator with the following extension: +# +# * If a value is an instance of Python enumeration, its name is checked against JSON +# schema enumeration, instead of the value itself. When converting to a proper JSON +# these values should be enumeration's name. +Draft7Validator = validators.extend( + validators.Draft7Validator, + validators={ + 'enum': enum_validator, + }, + type_checker=JsonSchemaTypeChecker, +) + + +draft7_format_checker = copy.deepcopy(jsonschema.draft7_format_checker) + + +@draft7_format_checker.checks('python-type') +def json_schema_is_python_type(instance: typing.Any) -> bool: + return is_type(instance) or isinstance(instance, str) + + +# We cannot use "Draft7Validator" as a type (MyPy complains), so we are using +# "validators.Draft7Validator", which has the same interface. 
+def load_schema_validators(schemas: typing.Dict, load_validators: typing.Sequence[str]) -> typing.List[validators.Draft7Validator]: + schema_validators = [] + + for schema_filename in load_validators: + for schema_uri, schema_json in schemas.items(): + if os.path.basename(schema_uri) == schema_filename: + break + else: + raise exceptions.InvalidArgumentValueError("Cannot find schema '{schema_filename}'.".format(schema_filename=schema_filename)) + + # We validate schemas using unmodified validator. + validators.Draft7Validator.check_schema(schema_json) + + validator = Draft7Validator( + schema=schema_json, + resolver=RefResolverNoRemote(schema_json['id'], schema_json, schemas), + format_checker=draft7_format_checker, + ) + + schema_validators.append(validator) + + return schema_validators + + +def datetime_for_json(timestamp: datetime.datetime) -> str: + # Since Python 3.6 "astimezone" can be called on naive instances + # that are presumed to represent system local time. + # We remove timezone information before formatting to not have "+00:00" added and + # we then manually add "Z" instead (which has equivalent meaning). + return timestamp.astimezone(datetime.timezone.utc).replace(tzinfo=None).isoformat('T') + 'Z' + + +class JsonEncoder(json.JSONEncoder): + """ + JSON encoder with extensions, among them the main ones are: + + * Frozen dict is encoded as a dict. + * Python types are encoded into strings describing them. + * Python enumerations are encoded into their string names. + * Sets are encoded into lists. + * Encodes ndarray and DataFrame as nested lists. + * Encodes datetime into ISO format with UTC timezone. + * Everything else which cannot be encoded is converted to a string. + + You probably want to use `to_json_structure` and not this class, because `to_json_structure` + also encodes ``NaN`, ``Infinity``, and ``-Infinity`` as strings. + + It does not necessary make a JSON which can then be parsed back to reconstruct original value. + """ + + def default(self, o: typing.Any) -> typing.Any: + # Importing here to prevent import cycle. + from d3m.metadata import base + + if isinstance(o, numpy.matrix): + # One cannot iterate over a matrix segment by segment. You always get back + # a matrix (2D structure) and not an array of rows or columns. By converting + # it to an array such iteration segment by segment works. + o = numpy.array(o) + + if isinstance(o, frozendict.frozendict): + return dict(o) + if isinstance(o, frozendict.FrozenOrderedDict): + return collections.OrderedDict(o) + if is_type(o): + return type_to_str(o) + if isinstance(o, Enum): + return o.name + if o is base.ALL_ELEMENTS: + return repr(o) + if o is base.NO_VALUE: + return repr(o) + # For encoding numpy.int64, numpy.float64 already works. 
+ if isinstance(o, numpy.integer): + return int(o) + if isinstance(o, numpy.bool_): + return bool(o) + if isinstance(o, typing.Mapping): + return collections.OrderedDict(o) + if isinstance(o, typing.Set): + return sorted(o, key=str) + if isinstance(o, pandas.DataFrame): + return list(o.itertuples(index=False, name=None)) + if isinstance(o, (typing.Sequence, numpy.ndarray)): + return list(o) + if isinstance(o, decimal.Decimal): + return float(o) + if isinstance(o, bytes): + return base64.b64encode(o).decode('utf8') + if isinstance(o, datetime.datetime): + return datetime_for_json(o) + + try: + return super().default(o) + except TypeError: + return str(o) + + +def normalize_numbers(obj: typing.Dict) -> typing.Dict: + return json.loads(json.dumps(obj), parse_int=float) + + +json_constant_map = { + '-Infinity': str(float('-Infinity')), + 'Infinity': str(float('Infinity')), + 'NaN': str(float('NaN')), +} + + +def to_json_structure(obj: typing.Any) -> typing.Any: + """ + In addition to what `JsonEncoder` encodes, this function also encodes as strings + float ``NaN``, ``Infinity``, and ``-Infinity``. + + It does not necessary make a JSON structure which can then be parsed back to reconstruct + original value. For that use ``to_reversible_json_structure``. + """ + + # We do not use "allow_nan=False" here because we will handle those values during loading. + # "JsonEncoder.default" is not called for float values so we cannot handle them there. + # See: https://bugs.python.org/issue36841 + json_string = json.dumps(obj, cls=JsonEncoder) + + return json.loads( + json_string, + parse_constant=lambda constant: json_constant_map[constant], + ) + + +def _json_key(key: typing.Any) -> str: + if isinstance(key, str): + return key + else: + raise TypeError("Key must be a string, not '{key_type}'.".format(key_type=type(key))) + + +def to_reversible_json_structure(obj: typing.Any) -> typing.Any: + """ + Operation is not idempotent. + """ + + if isinstance(obj, (str, bool, NONE_TYPE)): + return obj + + obj_type = type(obj) + + if _is_int(obj_type): + # To make sure it is Python int. + obj = int(obj) + + return obj + + elif _is_float(obj_type): + # To make sure it is Python float. + obj = float(obj) + + if not numpy.isfinite(obj): + return { + 'encoding': 'pickle', + 'description': str(obj), + 'value': base64.b64encode(pickle.dumps(obj)).decode('utf8'), + } + else: + return obj + + elif isinstance(obj, typing.Mapping): + if 'encoding' in obj and 'value' in obj: + return { + 'encoding': 'escape', + 'value': {_json_key(k): to_reversible_json_structure(v) for k, v in obj.items()}, + } + else: + return {_json_key(k): to_reversible_json_structure(v) for k, v in obj.items()} + + # We do not use "is_sequence" because we do not want to convert all sequences, + # because it can be loosing important information. + elif isinstance(obj, (tuple, list)): + return [to_reversible_json_structure(v) for v in obj] + + else: + return { + 'encoding': 'pickle', + 'description': str(obj), + 'value': base64.b64encode(pickle.dumps(obj)).decode('utf8'), + } + + +def from_reversible_json_structure(obj: typing.Any) -> typing.Any: + if is_instance(obj, typing.Union[str, int, float, bool, NONE_TYPE]): + return obj + + elif isinstance(obj, typing.Mapping): + if 'encoding' in obj and 'value' in obj: + if obj['encoding'] == 'pickle': + # TODO: Limit the types of values being able to load to prevent arbitrary code execution by a malicious pickle. 
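`to_json_structure` produces a best-effort, JSON-safe view of a value (non-finite floats become strings, unknown objects fall back to `str(...)`), while `to_reversible_json_structure` and `from_reversible_json_structure` round-trip values by base64-pickling anything without a native JSON form. A small illustration; the results in the comments are indicative:
```
import datetime

from d3m.utils import (
    to_json_structure,
    to_reversible_json_structure,
    from_reversible_json_structure,
)

# Lossy, JSON-safe view: NaN becomes a string, datetimes become ISO strings in UTC.
to_json_structure({'loss': float('nan'), 'when': datetime.datetime(2020, 9, 8, tzinfo=datetime.timezone.utc)})
# expected: {'loss': 'nan', 'when': '2020-09-08T00:00:00Z'}

# Reversible view: values without a native JSON form are pickled and base64-encoded.
encoded = to_reversible_json_structure({'loss': float('inf'), 'name': 'kpi'})
decoded = from_reversible_json_structure(encoded)
decoded['loss']  # inf again, reconstructed from the pickled payload
```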
+ return pickle.loads(base64.b64decode(obj['value'].encode('utf8'))) + if obj['encoding'] == 'escape': + return {_json_key(k): from_reversible_json_structure(v) for k, v in obj['value'].items()} + else: + raise ValueError("Unsupported encoding '{encoding}'.".format(encoding=obj['encoding'])) + else: + return {_json_key(k): from_reversible_json_structure(v) for k, v in obj.items()} + + # We do not use "is_sequence" because we do not want to convert all sequences, + # because it can be loosing important information. + elif isinstance(obj, (tuple, list)): + return [from_reversible_json_structure(v) for v in obj] + + else: + raise TypeError("Unsupported type '{value_type}'.".format(value_type=type(obj))) + + +class StreamToLogger: + def __init__(self, logger: logging.Logger, level: typing.Union[str, int], pass_through_stream: typing.TextIO = None) -> None: + self.logger = logger + self.level = logging._checkLevel(level) # type: ignore + self.pending_line = "" + self.closed = False + self.pass_through_stream = pass_through_stream + + # Here we are trying to test for the case of a recursive loop which can happen + # if you are using "logging.StreamHandler" in your logging configuration (e.g., to + # output logging to a console) and configure it after "redirect_to_logging' context + # manager has been entered. + def _check_recursion(self) -> bool: + # We start at "2" so that we start from outside of this file. + frame = sys._getframe(2) + line_number = None + try: + i = 0 + # If loop is happening, it is generally looping inside less than 10 frames, + # so we exit after 20 frames (just to make sure, all these values are ballpark + # values) to optimize. + while frame and i < 20: + if frame.f_code.co_filename == __file__: + # The first (in fact the last from call perspective) time we are + # in this file. + if line_number is None: + line_number = frame.f_lineno + # If we were in the same file and line already higher in the stack, + # we are in a recursive loop. + elif line_number == frame.f_lineno: + return True + frame = frame.f_back + i += 1 + finally: + del frame + + return False + + def write(self, buffer: str) -> int: + if self.closed: + raise ValueError("Stream is closed.") + + if self._check_recursion(): + # We are being called by a logger in a recursive loop. Because this message has already been logged, + # it is safe for us to just drop it to break a recursive loop. + return 0 + + # We only write complete lines to the logger. Any incomplete line will be saved to "pending_line", and flushed + # if "flush" is called or the context manager is closed. + bytes_written = 0 + lines = (self.pending_line + buffer).split('\n') + # Since we split on "\n", the last string in the list of lines will be an empty string if the last character + # in the buffer is a newline, which is what we want in this case as it resets the "pending_line" to empty. + # Otherwise the last string in the list of lines are characters after the last "\n", which is again what we + # want, setting the "pending_line" to characters not logged this time. + self.pending_line = lines[-1] + for line in lines[:-1]: + # Whitespace lines should not be logged. 
+ if line.strip(): + self.logger.log(self.level, line.rstrip()) + bytes_written += len(line) + + if self.pass_through_stream is not None: + self.pass_through_stream.write(buffer) + + return bytes_written + + def writelines(self, lines: typing.List[str]) -> None: + if self.closed: + raise ValueError("Stream is closed.") + + if self._check_recursion(): + # We are being called by a logger in a recursive loop. Because this message has already been logged, + # it is safe for us to just drop it to break a recursive loop. + return + + for line in lines: + if line.strip(): + self.logger.log(self.level, line.rstrip()) + + if self.pass_through_stream is not None: + if hasattr(self.pass_through_stream, 'writelines'): + self.pass_through_stream.writelines(lines) + else: + for line in lines: + self.pass_through_stream.write(line) + + def flush(self) -> None: + if self.closed: + raise ValueError("Stream is closed.") + + if self.pending_line.strip(): + self.logger.log(self.level, self.pending_line.rstrip()) + + if self.pass_through_stream is not None: + self.pass_through_stream.flush() + + def close(self) -> None: + if self.closed: + return + + if self.pending_line.strip(): + self.logger.log(self.level, self.pending_line.rstrip()) + self.closed = True + + def seekable(self) -> bool: + return False + + def seek(self, offset: int, whence: int = 0) -> int: + raise OSError("Stream is not seekable.") + + def tell(self) -> int: + raise OSError("Stream is not seekable.") + + def truncate(self, size: int = None) -> int: + raise OSError("Stream is not seekable.") + + def writable(self) -> bool: + return True + + def isatty(self) -> bool: + return False + + def readable(self) -> bool: + return False + + def read(self, n: int = -1) -> typing.AnyStr: + raise OSError("Stream is write-only.") + + def readline(self, limit: int = -1) -> typing.AnyStr: + raise OSError("Stream is write-only.") + + def readlines(self, hint: int = -1) -> typing.List[typing.AnyStr]: + raise OSError("Stream is write-only.") + + def fileno(self) -> int: + raise OSError("Stream does not use a file descriptor.") + + +class redirect_to_logging(contextlib.AbstractContextManager): + """ + A Python context manager which redirects all writes to stdout and stderr + to Python logging. + + Primitives should use logging to log messages, but maybe they are not doing + that or there are other libraries they are using which are not doing that. + One can then use this context manager to assure that (at least all Python) + writes to stdout and stderr by primitives are redirected to logging:: + + with redirect_to_logging(logger=PrimitiveClass.logger): + primitive = PrimitiveClass(...) + primitive.set_training_data(...) + primitive.fit(...) + primitive.produce(...) + """ + + # These are class variables to ensure that they are shared among all instances. + # We use a list to make this context manager re-entrant. 
+ _old_stdouts: typing.List[typing.TextIO] = [] + _old_stderrs: typing.List[typing.TextIO] = [] + + def __init__(self, logger: logging.Logger = None, stdout_level: typing.Union[int, str] = 'INFO', stderr_level: typing.Union[int, str] = 'ERROR', pass_through: bool = True) -> None: + if logger is None: + self.logger = logging.getLogger('redirect') + else: + self.logger = logger + + self.stdout_level = logging._checkLevel(stdout_level) # type: ignore + self.stderr_level = logging._checkLevel(stderr_level) # type: ignore + self.pass_through = pass_through + + def __enter__(self) -> logging.Logger: + self._old_stdouts.append(sys.stdout) + self._old_stderrs.append(sys.stderr) + if self.pass_through: + stdout_pass_through = self._old_stdouts[0] + stderr_pass_through = self._old_stderrs[0] + else: + stdout_pass_through = None + stderr_pass_through = None + sys.stdout = typing.cast(typing.TextIO, StreamToLogger(self.logger, self.stdout_level, stdout_pass_through)) + sys.stderr = typing.cast(typing.TextIO, StreamToLogger(self.logger, self.stdout_level, stderr_pass_through)) + return self.logger + + def __exit__(self, exc_type: typing.Optional[typing.Type[BaseException]], + exc_value: typing.Optional[BaseException], + traceback: typing.Optional[types.TracebackType]) -> typing.Optional[bool]: + sys.stdout.close() + sys.stderr.close() + sys.stdout = self._old_stdouts.pop() + sys.stderr = self._old_stderrs.pop() + return None + + +class CallbackHandler(logging.Handler): + """ + Calls a ``callback`` with logging records as they are without any conversion except for: + + * formatting the logging message and adding it to the record object + * assuring ``asctime`` is set + * converts exception ``exc_info`` into exception's name + * making sure ``args`` are JSON-compatible or removing it + * making sure there are no null values + """ + + def __init__(self, callback: typing.Callable) -> None: + super().__init__(logging.DEBUG) + + self.callback = callback + + def emit(self, record: logging.LogRecord) -> None: + try: + self.callback(self.prepare(record)) + except Exception: + self.handleError(record) + + def prepare(self, record: logging.LogRecord) -> typing.Dict: + self.format(record) + + # If "asctime" is not set, we do it ourselves. + if not hasattr(record, 'asctime'): + if self.formatter: + fmt = self.formatter + else: + fmt = logging._defaultFormatter # type: ignore + record.asctime = fmt.formatTime(record, fmt.datefmt) + + output = copy.copy(record.__dict__) + + # Exceptions are not JSON compatible. + if 'exc_info' in output: + if output['exc_info']: + if isinstance(output['exc_info'], BaseException): + output['exc_type'] = type_to_str(type(output['exc_info'])) + else: + output['exc_type'] = type_to_str(type(output['exc_info'][1])) + del output['exc_info'] + + if 'args' in output: + try: + output['args'] = to_json_structure(output['args']) + except Exception: + # We assume this means "args" is not JSON compatible. + del output['args'] + + # We iterate over a list so that we can change dict while iterating. 
+ for key, value in list(output.items()): + if value is None: + del output[key] + + return output + + +def _called_from_outside(modules: typing.Sequence[types.ModuleType]) -> bool: + # 0 == this function, 1 == wrapper, 2 == caller + frame = sys._getframe(2) + try: + if not frame: + caller_module_name = None + else: + caller_module_name = frame.f_globals.get('__name__', None) + finally: + del frame + + return all(caller_module_name != module.__name__ for module in modules) + + +def _decorate_all_methods(modules: typing.Sequence[types.ModuleType], src_obj: typing.Any, dst_obj: typing.Any, decorator: typing.Callable, ignore: typing.Set) -> None: + for name, function in inspect.getmembers(src_obj): + if name.startswith('_'): + continue + + if name in ignore: + continue + + # Wrap the method with the decorator. + if isinstance(function, (types.FunctionType, types.MethodType, types.BuiltinFunctionType, types.BuiltinMethodType)): + # For simplicity we use the name of the first module. + decorated_function = decorator(modules, modules[0].__name__, name, function) + setattr(dst_obj, name, decorated_function) + + # When functions are imported to other modules, we have to update those imported functions as well. + # Here we iterate over known modules and check if original function was copied over. If it was, + # we set it to the new decorated function. + for module in modules: + if getattr(module, name, None) == function: + setattr(module, name, decorated_function) + + +_random_warnings_enabled: typing.List[bool] = [] +_random_sources_patched = False + + +def _random_warning_decorator(modules: typing.Sequence[types.ModuleType], module_path: str, function_name: str, f: typing.Callable) -> typing.Callable: + @functools.wraps(f) + def wrapper(*args: typing.Any, **kwargs: typing.Any) -> typing.Any: + global _random_warnings_enabled + + # Some methods call into other methods. We do not want to issue a warning in such cases. + if _random_warnings_enabled and _random_warnings_enabled[-1] and _called_from_outside(modules): + log_once( + logger, + logging.WARNING, + "Using global/shared random source using '%(module_path)s.%(function_name)s' can make execution not reproducible.", + { + 'module_path': module_path, + 'function_name': function_name, + }, + stack_info=True, + ) + + return f(*args, **kwargs) + + return wrapper + + +class _RandomState(numpy_random.RandomState): + """ + A subclass just so that we can set somewhere decorated methods. The original class is read-only. + """ + + +def _patch_random_sources() -> None: + global _random_sources_patched + + if _random_sources_patched: + return + _random_sources_patched = True + + # We patch the global Python random number generator instance by decorating all methods. + # Used to support "global_randomness_warning" context manager. + # We do not issue warning for calling "getstate". + _decorate_all_methods([random], random._inst, random._inst, _random_warning_decorator, {'getstate'}) # type: ignore + + # For global NumPy random number generator we create a new random state instance first (of our subclass), + # and copy the state over. This is necessary because original random state instance has read-only methods. + old_rand = numpy.random.mtrand._rand + numpy.random.mtrand._rand = _RandomState() + numpy.random.mtrand._rand.set_state(old_rand.get_state()) + + # We do not issue warning for calling "get_state". 
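`CallbackHandler` hands every log record to a callback as a JSON-friendly dictionary, which is how primitive log output can be collected into pipeline-run documents. A minimal usage sketch, assuming the d3m package from this commit is installed:
```
import logging

from d3m.utils import CallbackHandler

records = []

logger = logging.getLogger('example')
logger.setLevel(logging.DEBUG)
logger.addHandler(CallbackHandler(records.append))  # collect prepared records as plain dicts

logger.info("processed %(rows)d rows", {'rows': 42})

# Each collected record is a dict with the formatted "message", "asctime", "levelname", etc.,
# with "args" converted to a JSON-compatible structure and null values removed.
records[0]['message']    # 'processed 42 rows'
records[0]['levelname']  # 'INFO'
```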
+ _decorate_all_methods([numpy.random, numpy.random.mtrand], old_rand, numpy.random.mtrand._rand, _random_warning_decorator, {'get_state'}) # type: ignore + + if hasattr(numpy_random, 'default_rng'): + old_default_rng = numpy_random.default_rng + + def default_rng(seed: typing.Any = None) -> typing.Any: + if seed is None: + log_once( + logger, + logging.WARNING, + "Using 'numpy.random.default_rng' without a seed can make execution not reproducible.", + stack_info=True, + ) + + return old_default_rng(seed) + + numpy_random.default_rng = default_rng + + +class global_randomness_warning(contextlib.AbstractContextManager): + """ + A Python context manager which issues a warning if global sources of + randomness are used. Currently it checks Python built-in global random + source, NumPy global random source, and NumPy ``default_rng`` being + used without a seed. + """ + + def __init__(self, enable: bool = True) -> None: + self.enable = enable + _patch_random_sources() + + def __enter__(self) -> None: + _random_warnings_enabled.append(self.enable) + + def __exit__(self, exc_type: typing.Optional[typing.Type[BaseException]], + exc_value: typing.Optional[BaseException], + traceback: typing.Optional[types.TracebackType]) -> typing.Optional[bool]: + _random_warnings_enabled.pop() + return None + + +def get_full_name(value: typing.Any) -> str: + return '{module}.{name}'.format(module=value.__module__, name=value.__name__) + + +def has_duplicates(data: typing.Sequence) -> bool: + """ + Returns ``True`` if ``data`` has duplicate elements. + + It works both with hashable and not-hashable elements. + """ + + try: + return len(set(data)) != len(data) + except TypeError: + n = len(data) + for i in range(n): + for j in range(i + 1, n): + if data[i] == data[j]: + return True + return False + + +@contextlib.contextmanager +def silence() -> typing.Generator: + """ + Hides logging and stdout output. + """ + + with unittest.TestCase().assertLogs(level=logging.DEBUG): + with redirect_to_logging(pass_through=False): + # Just to log something, otherwise "assertLogs" can fail. + logging.getLogger().debug("Silence.") + + yield + + +@deprecate.arguments('source', message="argument ignored") +def columns_sum(inputs: typing.Any, *, source: typing.Any = None) -> typing.Any: + """ + Computes sum per column. + """ + + # Importing here to prevent import cycle. + from d3m import container + + if isinstance(inputs, container.DataFrame): # type: ignore + results = container.DataFrame(inputs.agg(['sum']).reset_index(drop=True), generate_metadata=True) # type: ignore + return results + + elif isinstance(inputs, container.ndarray) and len(inputs.shape) == 2: + return numpy.sum(inputs, axis=0, keepdims=True) + + else: + raise exceptions.InvalidArgumentTypeError("Unsupported container type to sum: {type}".format( + type=type(inputs), + )) + + +def list_files(base_directory: str) -> typing.Sequence[str]: + files = [] + + base_directory = base_directory.rstrip(os.path.sep) + base_directory_prefix_length = len(base_directory) + 1 + for dirpath, dirnames, filenames in os.walk(base_directory): + for filename in filenames: + filepath = os.path.join(dirpath, filename) + + # We do not use "os.path.relpath" because it is to general + # and it first try to construct absolute path which is slow. + files.append(filepath[base_directory_prefix_length:]) + + # We sort to have a canonical order. 
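`global_randomness_warning` patches the global Python and NumPy random sources so that using them (instead of a seeded, per-primitive random state) logs a reproducibility warning. A small sketch of the intended behaviour:
```
import random
import numpy

from d3m.utils import global_randomness_warning

with global_randomness_warning():
    random.random()      # expected to log a warning: shared random source hurts reproducibility
    numpy.random.rand()  # same for the global NumPy random state

# A seeded, local random state does not go through the patched global source.
local_state = numpy.random.RandomState(42)
local_state.rand()
```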
+ files = sorted(files) + + return files + + +def _is_int(typ: type) -> bool: + # We support more types than those listed in "d3m.types.simple_data_types". + return issubclass(typ, (int, numpy.integer, numbers.Integral)) + + +def is_int(typ: type) -> bool: + return _is_int(typ) and not issubclass(typ, bool) + + +def _is_float(typ: type) -> bool: + # We support more types than those listed in "d3m.types.simple_data_types". + return issubclass(typ, (float, numpy.float32, numpy.float64, decimal.Decimal, numbers.Real)) + + +def is_float(typ: type) -> bool: + return _is_float(typ) and not is_int(typ) + + +def is_numeric(typ: type) -> bool: + return is_int(typ) or _is_float(typ) + + +def compute_hash_id(obj: typing.Dict) -> str: + """ + Input should be a JSON compatible structure. + """ + + obj = copy.copy(obj) + + if 'id' in obj: + del obj['id'] + + # We iterate over a list so that we can change dict while iterating. + for key in list(obj.keys()): + # Do not count any private field into hash. + if key.startswith('_'): + del obj[key] + + # We have to use "normalize_numbers" first so that we normalize numbers. + # We cannot do this just with a custom encoder because encoders are not + # called for float values so we cannot handle them there. + # See: https://bugs.python.org/issue36841 + to_hash_id = json.dumps(normalize_numbers(obj), sort_keys=True) + + return str(uuid.uuid5(HASH_ID_NAMESPACE, to_hash_id)) + + +def compute_digest(obj: typing.Dict, extra_data: bytes = None) -> str: + """ + Input should be a JSON compatible structure. + """ + + obj = copy.copy(obj) + + if 'digest' in obj: + del obj['digest'] + + # We iterate over a list so that we can change dict while iterating. + for key in list(obj.keys()): + # Do not count any private field into digest. + if key.startswith('_'): + del obj[key] + + # We have to use "normalize_numbers" first so that we normalize numbers. + # We cannot do this just with a custom encoder because encoders are not + # called for float values so we cannot handle them there. 
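`compute_hash_id` and `compute_digest` hash a JSON-compatible structure after dropping the `id`/`digest` field and any private (underscore-prefixed) keys, and after normalizing numbers, so logically equal documents hash identically. A small illustration:
```
from d3m.utils import compute_hash_id, compute_digest

doc = {'name': 'kpi', 'version': 1, '_notes': 'ignored', 'id': 'ignored-too'}

# Private keys and the "id"/"digest" fields do not influence the result,
# and integers are normalized to floats before hashing.
compute_hash_id(doc) == compute_hash_id({'name': 'kpi', 'version': 1.0})                      # True
compute_digest(doc) == compute_digest({'name': 'kpi', 'version': 1.0, 'digest': 'ignored'})   # True
```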
+ # See: https://bugs.python.org/issue36841 + to_digest = json.dumps(normalize_numbers(obj), sort_keys=True) + + digest = hashlib.sha256(to_digest.encode('utf8')) + + if extra_data is not None: + digest.update(extra_data) + + return digest.hexdigest() + + +def is_sequence(value: typing.Any) -> bool: + return isinstance(value, typing.Sequence) and not isinstance(value, (str, bytes)) + + +def get_dict_path(input_dict: typing.Dict, path: typing.Sequence[typing.Any]) -> typing.Any: + value: typing.Any = input_dict + + for segment in path: + value = value.get(segment, None) + + if value is None: + return None + + return value + + +def set_dict_path(input_dict: typing.Dict, path: typing.Sequence[typing.Any], value: typing.Any) -> None: + if not path: + raise exceptions.InvalidArgumentValueError("\"path\" has to be non-empty.") + + for segment in path[:-1]: + if segment not in input_dict: + input_dict[segment] = {} + input_dict = input_dict[segment] + + input_dict[path[-1]] = value + + +def register_yaml_representers() -> None: + def yaml_representer_numpy_float(dumper: yaml.Dumper, data: typing.Any) -> typing.Any: + return dumper.represent_float(float(data)) + + def yaml_representer_numpy_int(dumper: yaml.Dumper, data: typing.Any) -> typing.Any: + return dumper.represent_int(int(data)) + + def yaml_representer_numpy_bool(dumper: yaml.Dumper, data: typing.Any) -> typing.Any: + return dumper.represent_bool(bool(data)) + + representers = [ + {'type': numpy.float32, 'representer': yaml_representer_numpy_float}, + {'type': numpy.float64, 'representer': yaml_representer_numpy_float}, + {'type': numpy.int32, 'representer': yaml_representer_numpy_int}, + {'type': numpy.int64, 'representer': yaml_representer_numpy_int}, + {'type': numpy.integer, 'representer': yaml_representer_numpy_int}, + {'type': numpy.bool_, 'representer': yaml_representer_numpy_bool}, + ] + + for representer in representers: + yaml_add_representer(representer['type'], representer['representer']) + + +# Registers additional regexp for floating point resolver. +# See: https://github.com/yaml/pyyaml/issues/173 +def register_yaml_resolvers() -> None: + tag = 'tag:yaml.org,2002:float' + regexp = re.compile(r'''^(?:[-+]?(?:[0-9][0-9_]*)\.[0-9_]*(?:[eE][-+]?[0-9]+)? + |[-+]?(?:[0-9][0-9_]*)(?:[eE][-+]?[0-9]+) + |\.[0-9_]+(?:[eE][-+]?[0-9]+)?)$''', re.X) + first = list(u'-+0123456789.') + + yaml.Dumper.add_implicit_resolver(tag, regexp, first) + yaml.SafeDumper.add_implicit_resolver(tag, regexp, first) + yaml.Loader.add_implicit_resolver(tag, regexp, first) + yaml.SafeLoader.add_implicit_resolver(tag, regexp, first) + + if yaml.__with_libyaml__: + yaml.CDumper.add_implicit_resolver(tag, regexp, first) # type: ignore + yaml.CSafeDumper.add_implicit_resolver(tag, regexp, first) # type: ignore + yaml.CLoader.add_implicit_resolver(tag, regexp, first) # type: ignore + yaml.CSafeLoader.add_implicit_resolver(tag, regexp, first) # type: ignore + + +def matches_structural_type(source_structural_type: type, target_structural_type: typing.Union[str, type]) -> bool: + if isinstance(target_structural_type, str): + return type_to_str(source_structural_type) == target_structural_type + else: + return is_subclass(source_structural_type, target_structural_type) + + +# Register YAML representers and resolvers. +register_yaml_representers() +register_yaml_resolvers() + + +class PMap(pyrsistent.PMap): + """ + Extends `pyrsistent.PMap` to (by default) iterate over its items in sorted order. 
+ """ + + def iterkeys(self, *, sort: bool = True, reverse: bool = False) -> typing.Iterable: + for k, _ in self.iteritems(sort=sort, reverse=reverse): + yield k + + def itervalues(self, *, sort: bool = True, reverse: bool = False) -> typing.Iterable: + for _, v in self.iteritems(sort=sort, reverse=reverse): + yield v + + def iteritems(self, *, sort: bool = True, reverse: bool = False) -> typing.Iterable: + if sort: + yield from sorted(super().iteritems(), key=operator.itemgetter(0), reverse=reverse) + else: + yield from super().iteritems() + + # In Python 3 this is also an iterable. + def values(self, *, sort: bool = True, reverse: bool = False) -> typing.Iterable: + return self.itervalues(sort=sort, reverse=reverse) + + # In Python 3 this is also an iterable. + def keys(self, *, sort: bool = True, reverse: bool = False) -> typing.Iterable: + return self.iterkeys(sort=sort, reverse=reverse) + + # In Python 3 this is also an iterable. + def items(self, *, sort: bool = True, reverse: bool = False) -> typing.Iterable: + return self.iteritems(sort=sort, reverse=reverse) + + def evolver(self) -> 'Evolver': + return Evolver(self) + + def __reduce__(self) -> typing.Tuple[typing.Callable, typing.Tuple[typing.Dict]]: + return pmap, (dict(self),) + + +class Evolver(pyrsistent.PMap._Evolver): + def persistent(self) -> PMap: + if self.is_dirty(): + self._original_pmap = PMap(self._size, self._buckets_evolver.persistent()) + + return self._original_pmap + + +# It is OK to use a mutable default value here because it is never changed in-place. +def pmap(initial: typing.Mapping = {}, pre_size: int = 0) -> PMap: + super_pmap = pyrsistent.pmap(initial, pre_size) + + return PMap(super_pmap._size, super_pmap._buckets) + + +EMPTY_PMAP = pmap() + + +def is_uri(uri: str) -> bool: + """ + Test if a given string is an URI. + + Parameters + ---------- + uri: + A potential URI to test. + + Returns + ------- + ``True`` if string is an URI, ``False`` otherwise. + """ + + try: + parsed_uri = url_parse.urlparse(uri, allow_fragments=False) + except Exception: + return False + + return parsed_uri.scheme != '' + + +def fix_uri(uri: str, *, allow_relative_path: bool = True) -> str: + """ + Make a real file URI from a path. + + Parameters + ---------- + uri: + An input URI. + allow_relative_path: + Allow path to be relative? + + Returns + ------- + A fixed URI. + """ + + if is_uri(uri): + return uri + + if not uri.startswith('/') and not allow_relative_path: + raise exceptions.InvalidArgumentValueError(f"Path cannot be relative: {uri}") + + # Make absolute and normalize at the same time. 
+ uri = os.path.abspath(uri) + + return 'file://{uri}'.format(uri=uri) + + +def outside_package_context() -> typing.Optional[deprecate.Context]: + frame = sys._getframe(1) + try: + while frame: + if frame.f_code.co_filename == '' or os.path.commonpath([PACKAGE_BASE, frame.f_code.co_filename]) != PACKAGE_BASE: + return deprecate.Context(None, None, frame.f_code.co_filename, frame.f_globals.get('__name__', None), frame.f_lineno) + + frame = frame.f_back + + finally: + del frame + + return None + + +already_logged: typing.Set[typing.Tuple[deprecate.Context, deprecate.Context]] = set() + + +def log_once(logger: logging.Logger, level: int, msg: str, *args: typing.Any, **kwargs: typing.Any) -> None: + frame = sys._getframe(1) + try: + if not frame: + function_context = None + else: + function_context = deprecate.Context(str(level), msg, frame.f_code.co_filename, frame.f_globals.get('__name__', None), frame.f_lineno) + finally: + del frame + + module_context = outside_package_context() + + context = (module_context, function_context) + + if context in already_logged: + return + + if module_context is not None and function_context is not None: + already_logged.add(context) + + logger.log(level, msg, *args, **kwargs) + + +# A workaround to handle also binary stdin/stdout. +# See: https://gitlab.com/datadrivendiscovery/d3m/issues/353 +# See: https://bugs.python.org/issue14156 +# Moreover, if filename ends in ".gz" it decompresses the file as well. +class FileType(argparse.FileType): + def __call__(self, string: str) -> typing.IO[typing.Any]: + if string.endswith('.gz'): + # "gzip.open" has as a default binary mode, + # but we want text mode as a default. + if 't' not in self._mode and 'b' not in self._mode: # type: ignore + mode = self._mode + 't' # type: ignore + else: + mode = self._mode # type: ignore + + try: + return gzip.open(string, mode=mode, encoding=self._encoding, errors=self._errors) # type: ignore + except OSError as error: + message = argparse._("can't open '%s': %s") # type: ignore + raise argparse.ArgumentTypeError(message % (string, error)) + + handle = super().__call__(string) + + if string == '-' and 'b' in self._mode: # type: ignore + handle = handle.buffer # type: ignore + + return handle + + +def open(file: str, mode: str = 'r', buffering: int = -1, encoding: str = None, errors: str = None) -> typing.IO[typing.Any]: + try: + return FileType(mode=mode, bufsize=buffering, encoding=encoding, errors=errors)(file) + except argparse.ArgumentTypeError as error: + original_error = error.__context__ + + # So that we are outside of the except clause. 
+ raise original_error + + +def filter_local_location_uris(doc: typing.Dict, *, empty_value: typing.Any = None) -> None: + if 'location_uris' in doc: + location_uris = [] + for location_uri in doc['location_uris']: + try: + parsed_uri = url_parse.urlparse(location_uri, allow_fragments=False) + except Exception: + continue + + if parsed_uri.scheme == 'file': + continue + + location_uris.append(location_uri) + + if location_uris: + doc['location_uris'] = location_uris + elif empty_value is not None: + doc['location_uris'] = empty_value + else: + del doc['location_uris'] + + if 'location_base_uris' in doc: + location_base_uris = [] + for location_base_uri in doc['location_base_uris']: + try: + parsed_uri = url_parse.urlparse(location_base_uri, allow_fragments=False) + except Exception: + continue + + if parsed_uri.scheme == 'file': + continue + + location_base_uris.append(location_base_uri) + + if location_base_uris: + doc['location_base_uris'] = location_base_uris + elif empty_value is not None: + doc['location_base_uris'] = empty_value + else: + del doc['location_base_uris'] + + +def json_structure_equals( + obj1: typing.Any, obj2: typing.Any, ignore_keys: typing.Set = None, +) -> bool: + """ + Parameters + ---------- + obj1: + JSON serializable object to compare with ``obj2``. + obj2: + JSON serializable object to compare with ``obj1``. + ignore_keys: + If ``obj1`` and ``obj2`` are of type ``Mapping``, any keys found in this set will not be considered to + determine whether ``obj1`` and ``obj2`` are equal. + + Returns + ------- + A boolean indicating whether ``obj1`` and ``obj2`` are equal. + """ + + if ignore_keys is None: + ignore_keys = set() + + if isinstance(obj1, collections.Mapping) and isinstance(obj2, collections.Mapping): + for key1 in obj1: + if key1 in ignore_keys: + continue + if key1 not in obj2: + return False + if not json_structure_equals(obj1[key1], obj2[key1], ignore_keys): + return False + + for key2 in obj2: + if key2 in ignore_keys: + continue + if key2 not in obj1: + return False + # Already checked if values are equal. + + return True + + elif is_sequence(obj1) and is_sequence(obj2): + if len(obj1) != len(obj2): + return False + for i, (item1, item2) in enumerate(zip(obj1, obj2)): + if not json_structure_equals(item1, item2, ignore_keys): + return False + return True + + else: + return obj1 == obj2 + + +@functools.lru_cache() +def get_datasets_and_problems( + datasets_dir: str, handle_score_split: bool = True, +) -> typing.Tuple[typing.Dict[str, str], typing.Dict[str, str]]: + if datasets_dir is None: + raise exceptions.InvalidArgumentValueError("Datasets directory has to be provided.") + + datasets: typing.Dict[str, str] = {} + problem_descriptions: typing.Dict[str, str] = {} + problem_description_contents: typing.Dict[str, typing.Dict] = {} + + for dirpath, dirnames, filenames in os.walk(datasets_dir, followlinks=True): + if 'datasetDoc.json' in filenames: + # Do not traverse further (to not parse "datasetDoc.json" or "problemDoc.json" if they + # exists in raw data filename). + dirnames[:] = [] + + dataset_path = os.path.join(os.path.abspath(dirpath), 'datasetDoc.json') + + try: + with open(dataset_path, 'r', encoding='utf8') as dataset_file: + dataset_doc = json.load(dataset_file) + + dataset_id = dataset_doc['about']['datasetID'] + # Handle a special case for SCORE dataset splits (those which have "targets.csv" file). + # They are the same as TEST dataset splits, but we present them differently, so that + # SCORE dataset splits have targets as part of data. 
Because of this we also update + # corresponding dataset ID. + # See: https://gitlab.com/datadrivendiscovery/d3m/issues/176 + if handle_score_split and os.path.exists(os.path.join(dirpath, '..', 'targets.csv')) and dataset_id.endswith('_TEST'): + dataset_id = dataset_id[:-5] + '_SCORE' + + if dataset_id in datasets: + logger.warning( + "Duplicate dataset ID '%(dataset_id)s': '%(old_dataset)s' and '%(dataset)s'", { + 'dataset_id': dataset_id, + 'dataset': dataset_path, + 'old_dataset': datasets[dataset_id], + }, + ) + else: + datasets[dataset_id] = dataset_path + + except (ValueError, KeyError): + logger.exception( + "Unable to read dataset '%(dataset)s'.", { + 'dataset': dataset_path, + }, + ) + + if 'problemDoc.json' in filenames: + # We continue traversing further in this case. + + problem_path = os.path.join(os.path.abspath(dirpath), 'problemDoc.json') + + try: + with open(problem_path, 'r', encoding='utf8') as problem_file: + problem_doc = json.load(problem_file) + + problem_id = problem_doc['about']['problemID'] + # Handle a special case for SCORE dataset splits (those which have "targets.csv" file). + # They are the same as TEST dataset splits, but we present them differently, so that + # SCORE dataset splits have targets as part of data. Because of this we also update + # corresponding problem ID. + # See: https://gitlab.com/datadrivendiscovery/d3m/issues/176 + if handle_score_split and os.path.exists(os.path.join(dirpath, '..', 'targets.csv')) and problem_id.endswith('_TEST'): + problem_id = problem_id[:-5] + '_SCORE' + + # Also update dataset references. + for data in problem_doc.get('inputs', {}).get('data', []): + if data['datasetID'].endswith('_TEST'): + data['datasetID'] = data['datasetID'][:-5] + '_SCORE' + + with open(problem_path, 'r', encoding='utf8') as problem_file: + problem_description = json.load(problem_file) + + if problem_id in problem_descriptions and problem_description != problem_description_contents[problem_id]: + logger.warning( + "Duplicate problem ID '%(problem_id)s': '%(old_problem)s' and '%(problem)s'", { + 'problem_id': problem_id, + 'problem': problem_path, + 'old_problem': problem_descriptions[problem_id], + }, + ) + else: + problem_descriptions[problem_id] = problem_path + problem_description_contents[problem_id] = problem_description + + except (ValueError, KeyError): + logger.exception( + "Unable to read problem description '%(problem)s'.", { + 'problem': problem_path, + }, + ) + + return datasets, problem_descriptions diff --git a/d3m/docs/_static/custom.css b/d3m/docs/_static/custom.css new file mode 100644 index 0000000..966f953 --- /dev/null +++ b/d3m/docs/_static/custom.css @@ -0,0 +1,38 @@ +/* Making index have only one column. */ +.genindextable td { + display: table-row; +} + +/* No need to make space on the right of the TOC smaller and smaller for every level. */ +.sphinxsidebar ul ul { + margin-right: 0; +} + +/* Let sidebar sticky to the top of the viewport when scrolling down. */ +.sphinxsidebar { + position: sticky; + top: 0; +} + +@media only screen and (min-width: 1250px) { + /* Wider sidebar on large screens. */ + .sphinxsidebar { + width: 350px !important; + } + + .document .bodywrapper { + margin-left: 350px !important; + } + + /* Increase the header height by factor 1.25. */ + body > .related { + line-height: 40px; + font-size: 1.125em; + } + + /* Keep the footer height as it was. 
*/ + body > .related ~ .related { + line-height: 32px; + font-size: 0.9em; + } +} diff --git a/d3m/docs/_templates/toc.html b/d3m/docs/_templates/toc.html new file mode 100644 index 0000000..2acc35a --- /dev/null +++ b/d3m/docs/_templates/toc.html @@ -0,0 +1,8 @@ +{# + Similar to "localtoc.html" but without a link in the heading so + that the color of the heading matches other headings in the sidebar. +#} +{%- if display_toc %} +
+  <h3>{{ _('Table of Contents') }}</h3>
+ {{ toc }} +{%- endif %} diff --git a/d3m/docs/_templates/versions.html b/d3m/docs/_templates/versions.html new file mode 100644 index 0000000..1475a6a --- /dev/null +++ b/d3m/docs/_templates/versions.html @@ -0,0 +1,11 @@ +
+

{{ _('Versions') }}

+ +
diff --git a/d3m/docs/about.rst b/d3m/docs/about.rst new file mode 100644 index 0000000..85a43b1 --- /dev/null +++ b/d3m/docs/about.rst @@ -0,0 +1,12 @@ +:orphan: + +.. _about: + +About Data Driven Discovery program +----------------------------------- + +DARPA Data Driven Discovery (D3M) Program is researching ways to get +machines to build machine learning pipelines automatically. It is split +into three layers: TA1 (primitives), TA2 (systems which combine +primitives automatically into pipelines and executes them), and TA3 +(end-users interfaces). diff --git a/d3m/docs/conf.py b/d3m/docs/conf.py new file mode 100644 index 0000000..447d502 --- /dev/null +++ b/d3m/docs/conf.py @@ -0,0 +1,210 @@ +# -*- coding: utf-8 -*- +# +# Configuration file for the Sphinx documentation builder. +# +# This file does only contain a selection of the most common options. For a +# full list see the documentation: +# http://www.sphinx-doc.org/en/master/config + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# +import datetime +import os +import sys + +sys.path.insert(0, os.path.abspath('.')) +import d3m + + +# -- Project information ----------------------------------------------------- + +project = 'D3M' +project_lowercase = project.lower() + +# The short X.Y version +version = d3m.__version__ +# The full version, including alpha/beta/rc tags +release = version + +author = d3m.__author__ +copyright = '2017-{year}, {author}'.format(year=datetime.datetime.now().year, author=author) + + +# -- General configuration --------------------------------------------------- + +# If your documentation needs a minimal Sphinx version, state it here. +# +# needs_sphinx = '1.0' + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + 'sphinx.ext.autodoc', + 'sphinx.ext.doctest', + 'sphinx.ext.intersphinx', + 'sphinx.ext.todo', + 'sphinx.ext.mathjax', + 'sphinx.ext.ifconfig', + 'sphinx.ext.napoleon', + 'sphinx_autodoc_typehints', + 'sphinxcontrib.fulltoc', + 'recommonmark', + 'sphinx.ext.linkcode', +] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# The suffix(es) of source filenames. +# You can specify multiple suffix as a list of string: +# +# source_suffix = ['.rst', '.md'] + +# The master toctree document. +master_doc = 'index' + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +# +# This is also used if you do content translation via gettext catalogs. +# Usually you set "language" from the command line for these cases. +language = None + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path . +exclude_patterns = [] + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'sphinx' + + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. 
+# +html_theme = 'nature' + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +# +# html_theme_options = {} + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] + +# Custom sidebar templates, must be a dictionary that maps document names +# to template names. +# +# The default sidebars (for documents that don't match any pattern) are +# defined by theme itself. Builtin themes are using these templates by +# default: ``['localtoc.html', 'relations.html', 'sourcelink.html', +# 'searchbox.html']``. +# +html_sidebars = { + '**': [ + 'toc.html', + 'versions.html', + 'searchbox.html', + ] +} + +html_title = "{project} {version}".format(project=project, version=version) +html_show_sourcelink = False +html_copy_source = False +modindex_common_prefix = ['d3m.'] + + +# -- Options for HTMLHelp output --------------------------------------------- + +# Output file base name for HTML help builder. +htmlhelp_basename = project_lowercase + + +# -- Options for LaTeX output ------------------------------------------------ + +latex_elements = { + # The paper size ('letterpaper' or 'a4paper'). + # + # 'papersize': 'letterpaper', + + # The font size ('10pt', '11pt' or '12pt'). + # + # 'pointsize': '10pt', + + # Additional stuff for the LaTeX preamble. + # + # 'preamble': '', + + # Latex figure (float) alignment + # + # 'figure_align': 'htbp', +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, +# author, documentclass [howto, manual, or own class]). +latex_documents = [ + (master_doc, '{name}.tex'.format(name=project_lowercase), project, author, 'manual'), +] + + +# -- Options for manual page output ------------------------------------------ + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). +man_pages = [ + (master_doc, project_lowercase, d3m.__description__, [author], 1) +] + + +# -- Options for Texinfo output ---------------------------------------------- + +# Grouping the document tree into Texinfo files. List of tuples +# (source start file, target name, title, author, +# dir menu entry, description, category) +texinfo_documents = [ + (master_doc, project_lowercase, project, author, project, d3m.__description__, 'Miscellaneous'), +] + + +# -- Extension configuration ------------------------------------------------- + +# -- Options for intersphinx extension --------------------------------------- + +# Example configuration for intersphinx: refer to the Python standard library. +intersphinx_mapping = { + 'https://docs.python.org/': None, + 'pandas': ('https://pandas.pydata.org/pandas-docs/stable/', None), + 'numpy': ('https://docs.scipy.org/doc/numpy/', None), + #'numpy': ('https://numpydoc.readthedocs.io/en/latest/', None), + 'scikit-learn': ('https://scikit-learn.org/stable/', None), + 'mypy': ('https://mypy.readthedocs.io/en/stable/', None), + 'setuptools': ('https://setuptools.readthedocs.io/en/latest/', None), +} + +# -- Options for todo extension ---------------------------------------------- + +# If true, `todo` and `todoList` produce output, else they produce nothing. 
+todo_include_todos = True + + +def setup(app): + app.add_stylesheet('custom.css') + + +def linkcode_resolve(domain, info): + if domain != 'py': + return None + if not info['module']: + return None + return 'https://gitlab.com/datadrivendiscovery/d3m/blob/{version}/{path}.py'.format(version=version, path=info['module'].replace('.', '/')) diff --git a/d3m/docs/discovery.rst b/d3m/docs/discovery.rst new file mode 100644 index 0000000..7e246f4 --- /dev/null +++ b/d3m/docs/discovery.rst @@ -0,0 +1,140 @@ +Primitives discovery +================================ + +Primitives D3M namespace +------------------------ + +The :mod:`d3m.primitives` module exposes all primitives under the same +``d3m.primitives`` namespace. + +This is achieved using :ref:`Python entry points `. +Python packages containing primitives should register them and expose +them under the common namespace by adding an entry like the following to +package's ``setup.py``: + +.. code:: python + + entry_points = { + 'd3m.primitives': [ + 'primitive_namespace.PrimitiveName = my_package.my_module:PrimitiveClassName', + ], + }, + +The example above would expose the +``my_package.my_module.PrimitiveClassName`` primitive under +``d3m.primitives.primitive_namespace.PrimitiveName``. + +Configuring ``entry_points`` in your ``setup.py`` does not just put +primitives into a common namespace, but also helps with discovery of +your primitives on the system. Then your package with primitives just +have to be installed on the system and can be automatically discovered +and used by any other Python code. + + **Note:** Only primitive classes are available through the + ``d3m.primitives`` namespace, no other symbols from a source + module. In the example above, only ``PrimitiveClassName`` is + available, not other symbols inside ``my_module`` (except if they + are other classes also added to entry points). + + **Note:** Modules under ``d3m.primitives`` are created dynamically + at run-time based on information from entry points. So some tools + (IDEs, code inspectors, etc.) might not find them because there are + no corresponding files and directories under ``d3m.primitives`` + module. You have to execute Python code for modules to be available. + Static analysis cannot find them. + +Primitives discovery on PyPi +---------------------------- + +To facilitate automatic discovery of primitives on PyPi (or any other +compatible Python Package Index), publish a package with a keyword +``d3m_primitive`` in its ``setup.py`` configuration: + +.. code:: python + + keywords='d3m_primitive' + + **Note:** Be careful when automatically discovering, installing, and + using primitives from unknown sources. While primitives are designed + to be bootstrapable and automatically installable without human + involvement, there are no isolation mechanisms yet in place for + running potentially malicious primitives. Currently recommended way + is to use manually curated lists of known primitives. + +d3m.index API +-------------------------- + +The :mod:`d3m.index` module exposes the following Python utility functions. + +``search`` +~~~~~~~~~~ + +Returns a list of primitive paths (Python paths under ``d3m.primitives`` +namespace) for all known (discoverable through entry points) primitives, +or limited by the ``primitive_path_prefix`` search argument. + +``get_primitive`` +~~~~~~~~~~~~~~~~~ + +Loads (if not already) a primitive class and returns it. 
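For example, a minimal sketch (the primitive path below is the placeholder from the entry points example above; replace it with the path of a primitive actually installed on your system):

.. code:: python

    from d3m import index

    # List Python paths of all discoverable primitives, optionally limited by a prefix.
    primitive_paths = index.search(primitive_path_prefix='d3m.primitives.primitive_namespace')

    # Load (if not already loaded) a primitive class and inspect its metadata.
    PrimitiveClassName = index.get_primitive('d3m.primitives.primitive_namespace.PrimitiveName')
    print(PrimitiveClassName.metadata.query()['name'])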
+ +``get_primitive_by_id`` +~~~~~~~~~~~~~~~~~~~~~~~ + +Returns a primitive class based on its ID from all currently loaded +primitives. + +``get_loaded_primitives`` +~~~~~~~~~~~~~~~~~~~~~~~~~ + +Returns a list of all currently loaded primitives. + +``load_all`` +~~~~~~~~~~~~ + +Loads all primitives available and populates ``d3m.primitives`` +namespace with them. + +``register_primitive`` +~~~~~~~~~~~~~~~~~~~~~~ + +Registers a primitive under ``d3m.primitives`` namespace. + +This is useful to register primitives not necessary installed on the +system or which are generated at runtime. It is also useful for testing +purposes. + +``discover`` +~~~~~~~~~~~~ + +Returns package names from PyPi which provide D3M primitives. + +This is determined by them having a ``d3m_primitive`` among package +keywords. + +Command line +------------ + +The :mod:`d3m.index` module also provides a command line interface by +running ``python3 -m d3m index``. The following commands are currently +available. + +Use ``-h`` or ``--help`` argument to obtain more information about each +command and its arguments. + +``python3 -m d3m index search`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Searches locally available primitives. Lists registered Python paths for +primitives installed on the system. + +``python3 -m d3m index discover`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Discovers primitives available on PyPi. Lists package names containing +D3M primitives on PyPi. + +``python3 -m d3m index describe`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Generates a JSON description of a primitive. diff --git a/d3m/docs/index.rst b/d3m/docs/index.rst new file mode 100644 index 0000000..84179b9 --- /dev/null +++ b/d3m/docs/index.rst @@ -0,0 +1,34 @@ +D3M core package's documentation +================================ + +:Version: |version| + +This is documentation for the common code for D3M project, +the ``d3m`` core package. + +.. toctree:: + :maxdepth: 2 + + installation + quickstart + tutorial + interfaces + discovery + metadata + primitives_base_classes + pipeline + reference + primitive-checklist + +Miscellaneous pages +------------------- + +* :ref:`about` +* :ref:`repostructure` + +Indices and tables +------------------ + +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` diff --git a/d3m/docs/installation.rst b/d3m/docs/installation.rst new file mode 100644 index 0000000..a9fe9b0 --- /dev/null +++ b/d3m/docs/installation.rst @@ -0,0 +1,112 @@ +Installation +------------ + +This package works with Python 3.6+ and pip 19+. You need to have the following +packages installed on the system (for Debian/Ubuntu): + +- ``libssl-dev`` +- ``libcurl4-openssl-dev`` +- ``libyaml-dev`` + +You can install latest stable version from `PyPI `__: + +:: + + $ pip3 install d3m + +To install latest development version: + +:: + + $ pip3 install -e git+https://gitlab.com/datadrivendiscovery/d3m.git@devel#egg=d3m + +When cloning a repository, clone it recursively to get also git +submodules: + +:: + + $ git clone --recursive https://gitlab.com/datadrivendiscovery/d3m.git + +Testing +------- + +To ensure consistent performance of the D3M package a test suite and performance benchmarks are ran in the CI pipeline after every commit. +If a commit fails tests or introduces significant performance regression the pipeline fails. 
+ +Running tests +~~~~~~~~~~~~~ + +To run the test suite locally run: + +:: + + $ ./run_tests.py + +Running benchmarks +~~~~~~~~~~~~~~~~~~ + +If you want to run benchmarks locally you first need to install asv: + +:: + + $ pip install asv + +then clone the D3M repository: + +:: + + $ git clone git@gitlab.com:datadrivendiscovery/d3m.git + $ cd d3m/tests + +and run the benchmarks on a set of git commits. The following command: + +:: + + asv continuous --config asv.conf.json -f 1.1 devel HEAD + +will benchmarks changes between last commit to `devel` and latest commit to currently active feature branch. +Make sure the code you want to benchmark is commited into active git branch. + +To inspect performance changes between last two commits in the active branch run: + +:: + + $ asv continuous --config asv.conf.json -f 1.1 HEAD + · Creating environments + · Discovering benchmarks + ·· Uninstalling from virtualenv-py3.6 + ·· Installing a1bb2749 into virtualenv-py3.6. + · Running 4 total benchmarks (2 commits * 1 environments * 2 benchmarks) + [ 0.00%] · For d3m commit 3759f7a7 (round 1/2): + [ 0.00%] ·· Building for virtualenv-py3.6. + [ 0.00%] ·· Benchmarking virtualenv-py3.6 + [ 12.50%] ··· Running (metadata.DatasetMetadata.time_update_0k--).. + [ 25.00%] · For d3m commit a1bb2749 (round 1/2): + [ 25.00%] ·· Building for virtualenv-py3.6. + [ 25.00%] ·· Benchmarking virtualenv-py3.6 + [ 37.50%] ··· Running (metadata.DatasetMetadata.time_update_0k--).. + [ 50.00%] · For d3m commit a1bb2749 (round 2/2): + [ 50.00%] ·· Benchmarking virtualenv-py3.6 + [ 62.50%] ··· metadata.DatasetMetadata.time_update_0k 2.84±0.4ms + [ 75.00%] ··· metadata.DatasetMetadata.time_update_1k 174±4ms + [ 75.00%] · For d3m commit 3759f7a7 (round 2/2): + [ 75.00%] ·· Building for virtualenv-py3.6. + [ 75.00%] ·· Benchmarking virtualenv-py3.6 + [ 87.50%] ··· metadata.DatasetMetadata.time_update_0k 5.59±0.5ms + [100.00%] ··· metadata.DatasetMetadata.time_update_1k 714±10ms + before after ratio + [3759f7a7] [a1bb2749] + + - 5.59±0.5ms 2.84±0.4ms 0.51 metadata.DatasetMetadata.time_update_0k + - 714±10ms 174±4ms 0.24 metadata.DatasetMetadata.time_update_1k + + +During development, you can run a particular benchmark using the current environment and code by:: + + $ asv dev --config asv.conf.json --bench 'metadata.DatasetToJsonStructure.time_to_json_structure.*' + +For additional reference the following resources can be useful: + +- `Pandas performance test suite guide __` +- `Asv usage guide __` +- `Astropy benchmarks __` diff --git a/d3m/docs/interfaces.rst b/d3m/docs/interfaces.rst new file mode 100644 index 0000000..559697c --- /dev/null +++ b/d3m/docs/interfaces.rst @@ -0,0 +1,248 @@ +TA1 API for primitives +==================================== + +A collection of standard Python interfaces for TA1 primitives. All +primitives should extend one of the base classes available and +optionally implement available mixins. + +Design principles +----------------- + +Standard TA1 primitive interfaces have been designed to be possible for +TA2 systems to call primitives automatically and combine them into +pipelines. + +Some design principles applied: + +- Use of a de facto standard language for "glue" between different + components and libraries, Python. +- Use of keyword-only arguments for all methods so that caller does not + have to worry about the order of arguments. +- Every primitive should implement only one functionality, more or less + a function, with clear inputs and outputs. 
All parameters of the + function do not have to be known in advance and the function can be + "fitted" as part of the training step of the pipeline. +- Use of Python 3 typing extensions to annotate methods and classes + with typing information to make it easier for TA2 systems to prune + incompatible combinations of inputs and outputs and to reuse existing + Python type-checking tooling. +- Typing information serves both to detect issues and + incompatibilities in primitive implementations and to help with pipeline + construction. +- All values being passed through a primitive have metadata associated + with them. +- Primitives can operate only at a metadata level to help guide the + pipeline construction process without having to operate on data + itself. +- Primitive metadata is close to the source, the primitive code, and not in + separate files, to minimize the chance that it goes out of sync. + Metadata which can be automatically determined from the code should + be automatically determined from the code. Similarly for data + metadata. +- All randomness of primitives is captured by a random seed argument to + assure reproducibility. +- Operations can work in iterations, under time budgets, and the caller + might not always want to compute values fully. +- Through the use of mixins primitives can signal which capabilities they + support. +- Primitives are to be composed and executed in a data-flow manner. + +
Main concepts +------------- + +
Interface classes, mixins, and methods are documented in detail through +use of docstrings and typing annotations. Here we note some higher-level +concepts which can help with understanding the basic ideas behind the interfaces and +what they are trying to achieve, the big picture. This section is not +normative. + +
A primitive should extend one of the base classes available and +optionally mixins as well. Not all mixins apply to all primitives. That +being said, you probably do not want to subclass ``PrimitiveBase`` +directly, but instead one of the other base classes, to signal to a caller +more about what your primitive is doing. If your primitive belongs to a +larger set of primitives and no existing non-\ ``PrimitiveBase`` base class +suits it well, consider suggesting that a new base class is created by +opening an issue or making a merge request. + +
Base classes and mixins generally have four type arguments you have to +provide: ``Inputs``, ``Outputs``, ``Params``, and ``Hyperparams``. One +can see a primitive as parameterized by those four type arguments. You +can access them at runtime through metadata: + +
.. code:: python + + FooBarPrimitive.metadata.query()['class_type_arguments'] + +
``Inputs`` should be set to the primary input type of a primitive. +Primary, because you can define additional inputs your primitive might +need, but we will go into these details later. Similarly for +``Outputs``. The ``produce`` method then produces outputs from inputs. Other +primitive methods help the primitive (and its ``produce`` method) +achieve that, or help the runtime execute the primitive as a whole, or +optimize its behavior. + +
Both ``Inputs`` and ``Outputs`` should be of a +:ref:`container type <container_types>`. We allow a limited set of value types being +passed between primitives so that both TA2 and TA3 systems can +implement introspection for those values if needed, or a user interface +for them, etc. Moreover, this also allows us to assure that they can be +used efficiently with the Arrow/Plasma store. + +
Container values can then in turn contain values of an :ref:`extended but +still limited set of data types <data_types>`.
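For instance, a minimal sketch of constructing a container value directly (the column names are arbitrary; ``generate_metadata=True`` asks the package to compute basic metadata such as dimensions and structural types):

.. code:: python

    from d3m import container

    values = container.DataFrame({'a': [1, 2, 3], 'b': [4.0, 5.0, 6.0]}, generate_metadata=True)

    # Metadata travels with the value on its "metadata" attribute.
    values.metadata.query(())['dimension']['length']  # Number of rows.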
+ +Those values being passed between primitives also hold metadata. +Metadata is available on their ``metadata`` attribute. Metadata on +values is stored in an instance of +:class:`~d3m.metadata.base.DataMetadata` class. This is a +reason why we have :ref:`our own versions of some standard container +types `: to have the ``metadata`` attribute. + +All metadata is immutable and updating a metadata object returns a new, +updated, copy. Metadata internally remembers the history of changes, but +there is no API yet to access that. But the idea is that you will be +able to follow the whole history of change to data in a pipeline through +metadata. See :ref:`metadata API ` for more information +how to manipulate metadata. + +Primitives have a similar class ``PrimitiveMetadata``, which when +created automatically analyses its primitive and populates parts of +metadata based on that. In this way author does not have to have +information in two places (metadata and code) but just in code and +metadata is extracted from it. When possible. Some metadata author of +the primitive stil has to provide directly. + +Currently most standard interface base classes have only one ``produce`` +method, but design allows for multiple: their name has to be prefixed +with ``produce_``, have similar arguments and same semantics as all +produce methods. The main motivation for this is that some primitives +might be able to expose same results in different ways. Having multiple +produce methods allow the caller to pick which type of the result they +want. + +To keep primitive from outside simple and allow easier compositionality +in pipelines, primitives have arguments defined per primitive and not +per their method. The idea here is that once a caller satisfies +(computes a value to be passed to) an argument, any method which +requires that argument can be called on a primitive. + +There are three types of arguments: + +- pipeline – arguments which are provided by the pipeline, they are + required (otherwise caller would be able to trivially satisfy them by + always passing ``None`` or another default value) +- runtime – arguments which caller provides during pipeline execution + and they control various aspects of the execution +- hyper-parameter – a method can declare that primitive's + hyper-parameter can be overridden for the call of the method, they + have to match hyper-parameter definition + +Methods can accept additional pipeline and hyper-parameter arguments and +not just those from the standard interfaces. + +Produce methods and some other methods return results wrapped in +``CallResult``. In this way primitives can expose information about +internal iterative or optimization process and allow caller to decide +how long to run. + +When calling a primitive, to access ``Hyperparams`` class you can do: + +.. code:: python + + hyperparams_class = FooBarPrimitive.metadata.query()['class_type_arguments']['Hyperparams'] + +You can now create an instance of the class by directly providing values +for hyper-parameters, use available simple sampling, or just use default +values: + +.. code:: python + + hp1 = hyperparams_class({'threshold': 0.01}) + hp2 = hyperparams_class.sample(random_state=42) + hp3 = hyperparams_class.defaults + +You can then pass those instances as the ``hyperparams`` argument to +primitive's constructor. + +Author of a primitive has to define what internal parameters does the +primitive have, if any, by extending the ``Params`` class. 
It is just a +fancy dict, so you can both create an instance of it in the same way, +and access its values: + +
.. code:: python + + class Params(params.Params): + coefficients: numpy.ndarray + + ps = Params({'coefficients': numpy.array([1, 2, 3])}) + ps['coefficients'] + +
The ``Hyperparams`` class and ``Params`` class have to be picklable and +copyable so that instances of primitives can be serialized and restored +as needed. + +
Primitives (and some other values) are uniquely identified by their ID +and version. ID does not change through versions. + +
Primitives should not modify any input argument in-place but should always +make a copy before any modification. + +
Checklist for creating a new primitive +-------------------------------------- +1. Implement as many interfaces as are applicable to your + primitive. An up-to-date list of mixins you can implement can be + found at + + +2. Create unit tests to test all methods you implement + +3. Include all relevant hyperparameters and use the appropriate + ``Hyperparameter`` subclass for specifying the range of values a + hyperparameter can take. Try to provide good default values where + possible. Also include all relevant ``semantic_types`` + + +4. Include ``metadata`` and ``__author__`` fields in your class + definition. The ``__author__`` field should include a name or team + as well as an email. The ``metadata`` object has many fields which should + be filled in: + + * id, this is a uuid unique to this primitive. It can be generated with :code:`import uuid; uuid.uuid4()` + * version + * python_path, the name under which you want this primitive to be imported + * keywords, keywords you want your primitive to be discovered by + * installation, how to install the package which has this primitive. This is easiest if this is just a Python package on PyPI + * algorithm_types, specify which PrimitiveAlgorithmType the algorithm is, a complete list can be found in TODO + * primitive_family, specify the broad family a primitive falls under, a complete list can be found in TODO + * hyperparameters_to_tune, specify which hyperparameters you would prefer a TA2 system to tune + +
5. Make sure the primitive uses the correct container type + +6. If the container type is a DataFrame, specify which column is the + target value, which columns are the input values, and which columns + are the output values. + +7. Create an example pipeline which includes this primitive and uses one of the seed datasets as input. + +
Examples +-------- + +
Examples of simple primitives using these interfaces can be found `in +this +repository `__: + +
- `MonomialPrimitive `__ + is a simple regressor which shows how to use ``container.List``, + define and use ``Params`` and ``Hyperparams``, and implement multiple + methods needed by a supervised learner primitive +- `IncrementPrimitive `__ + is a transformer and shows how to have ``container.ndarray`` as + inputs and outputs, and how to set metadata for outputs +- `SumPrimitive `__ + is a transformer as well, but it is just a wrapper around a Docker + image, it shows how to define a Docker image in metadata and how to + connect to a running Docker container, moreover, it also shows how + inputs can be a union type of multiple other types +- `RandomPrimitive `__ + is a generator which shows how to use ``random_seed``, too. diff --git a/d3m/docs/metadata.rst new file mode 100644 index 0000000..b0e5f2f --- /dev/null +++ b/d3m/docs/metadata.rst @@ -0,0 +1,718 @@ +.. 
_metadata: + +Metadata for primitives and the values they process +=================================================== + +Metadata is a core component of any data-based system. This repository +is standardizing how we represent metadata in the D3M program and +focusing on three types of metadata: \* metadata associated with +primitives \* metadata associated with datasets \* metadata associated +with values passed inside pipelines + +This repository is also standardizing types of values being passed +between primitives in pipelines. While theoretically any value could be +passed between primitives, limiting them to a known set of values can +make primitives more compatible, efficient, and values easier to +introspect by TA3 systems. + +.. _container_types: + +Container types +--------------- + +All input and output (container) values passed between primitives should +expose a ``Sequence`` +`protocol `__ (sequence in +samples) and provide ``metadata`` attribute with metadata. + +``d3m.container`` module exposes such standard types: + +- ``Dataset`` – a class representing datasets, including D3M datasets, + implemented in + :mod:`d3m.container.dataset` module +- ``DataFrame`` – + :class:`pandas.DataFrame` + with support for ``metadata`` attribute, implemented in + :mod:`d3m.container.pandas` module +- ``ndarray`` – + :class:`numpy.ndarray` + with support for ``metadata`` attribute, implemented in + :mod:`d3m.container.numpy` module +- ``List`` – a standard :class:`list` with support for ``metadata`` + attribute, implemented in + :mod:`d3m.container.list` module + +``List`` can be used to create a simple list container. + +It is strongly encouraged to use the :class:`~d3m.container.pandas.DataFrame` container type for +primitives which do not have strong reasons to use something else +(:class:`~d3m.container.dataset.Dataset`\ s to operate on initial pipeline input, or optimized +high-dimensional packed data in :class:`~numpy.ndarray`\ s, or :class:`list`\ s to pass as +values to hyper-parameters). This makes it easier to operate just on +columns without type casting while the data is being transformed to make +it useful for models. + +When deciding which container type to use for inputs and outputs of a +primitive, consider as well where an expected place for your primitive +is in the pipeline. Generally, pipelines tend to have primitives +operating on :class:`~d3m.container.dataset.Dataset` at the beginning, then use :class:`~d3m.container.pandas.DataFrame` and +then convert to :class:`~numpy.ndarray`. + +.. _data_types: + +Data types +---------- + +Container types can contain values of the following types: + +* container types themselves +* Python builtin primitive types: + + * ``str`` + * ``bytes`` + * ``bool`` + * ``float`` + * ``int`` + * ``dict`` (consider using :class:`typing.Dict`, :class:`typing.NamedTuple`, or :ref:`TypedDict `) + * ``NoneType`` + +Metadata +-------- + +:mod:`d3m.metadata.base` module provides a +standard Python implementation for metadata object. + +When thinking about metadata, it is useful to keep in mind that metadata +can apply to different contexts: + +* primitives +* values being passed + between primitives, which we call containers (and are container types) +* datasets are a special case of a container +* to parts of data + contained inside a container +* for example, a cell in a table can have + its own metadata + +Containers and their data can be seen as multi-dimensional structures. 
+Dimensions can have numeric (arrays) or string indexes (string to value +maps, i.e., dicts). Moreover, even numeric indexes can still have names +associated with each index value, e.g., column names in a table. + +If a container type has a concept of *shape* +(:attr:`DataFrame.shape `, :attr:`ndarray.shape `), +dimensions go in that order. For tabular data and existing container +types this means that the first dimension of a container is always +traversing samples (e.g., rows in a table), and the second dimension +columns. + +Values can have nested other values and metadata dimensions go over all +of them until scalar values. So if a Pandas DataFrame contains +3-dimensional ndarrays, the whole value has 5 dimensions: two for rows +and columns of the DataFrame (even if there is only one column), and 3 +for the array. + +To tell to which part of data contained inside a container metadata +applies, we use a *selector*. Selector is a tuple of strings, integers, +or special values. Selector corresponds to a series of ``[...]`` item +getter Python operations on most values, except for Pandas DataFrame +where it corresponds to +:attr:`iloc ` +position-based selection. + +Special selector values: + +- ``ALL_ELEMENTS`` – makes metadata apply to all elements in a given + dimension (a wildcard) + +Metadata itself is represented as a (potentially nested) dict. If +multiple metadata dicts comes from different selectors for the same +resolved selector location, they are merged together in the order from +least specific to more specific, later overriding earlier. ``null`` +metadata value clears the key specified from a less specific selector. + +Example +~~~~~~~ + +To better understand how metadata is attached to various parts of the +value, A `simple tabular D3M +dataset `__ +could be represented as a multi-dimensional structure: + +.. code:: yaml + + { + "0": [ + [0, 5.1, 3.5, 1.4, 0.2, "Iris-setosa"], + [1, 4.9, 3, 1.4, 0.2, "Iris-setosa"], + ... + ] + } + +It contains one resource with ID ``"0"`` which is the first dimension +(using strings as index; it is a map not an array), then rows, which is +the second dimension, and then columns, which is the third dimension. +The last two dimensions are numeric. + +In Python, accessing third column of a second row would be +``["0"][1][2]`` which would be value ``3``. This is also the selector if +we would want to attach metadata to that cell. If this metadata is +description for this cell, we can thus describe this datum metadata as a +pair of a selector and a metadata dict: + +- selector: ``["0"][1][2]`` +- metadata: + ``{"description": "Measured personally by Ronald Fisher."}`` + +Dataset-level metadata have empty selector: + +- selector: ``[]`` +- metadata: ``{"id": "iris_dataset_1", "name": "Iris Dataset"}`` + +To describe first dimension itself, we set ``dimension`` metadata on the +dataset-level (container). ``dimension`` describes the next dimension at +that location in the data structure. + +- selector: ``[]`` +- metadata: ``{"dimension": {"name": "resources", "length": 1}}`` + +This means that the full dataset-level metadata is now: + +.. code:: json + + { + "id": "iris_dataset_1", + "name": "Iris Dataset", + "dimension": { + "name": "resources", + "length": 1 + } + } + +To attach metadata to the first (and only) resource, we can do: + +- selector: ``["0"]`` +- metadata: + ``{"structural_type": "pandas.core.frame.DataFrame", "dimension": {"length": 150, "name": "rows"}`` + +``dimension`` describes rows. 
+ +Columns dimension: + +- selector: ``["0"][ALL_ELEMENTS]`` +- metadata: ``{"dimension": {"length": 6, "name": "columns"}}`` + +Observe that there is no requirement that dimensions are aligned from +the perspective of metadata. But in this case they are, so we can use +``ALL_ELEMENTS`` wildcard to describe columns for all rows. + +Third column metadata: + +- selector: ``["0"][ALL_ELEMENTS][2]`` +- metadata: + ``{"name": "sepalWidth", "structural_type": "builtins.str", "semantic_types": ["http://schema.org/Float", "https://metadata.datadrivendiscovery.org/types/Attribute"]}`` + +Column names belong to each particular column and not all columns. Using +``name`` can serve to assign a string name to otherwise numeric +dimension. + +We attach names and types to datums themselves and not dimensions. +Because we use ``ALL_ELEMENTS`` selector, this is internally stored +efficiently. We see traditional approach of storing this information in +the header of a column as a special case of a ``ALL_ELEMENTS`` selector. + +Note that the name of a column belongs to the metadata because it is +just an alternative way to reference values in an otherwise numeric +dimension. This is different from a case where a dimension has +string-based index (a map/dict) where names of values are part of the +data structure at that dimension. Which approach is used depends on the +structure of the container for which metadata is attached to. + +Default D3M dataset loader found in this package parses all tabular +values as strings and add semantic types, if known, for what could those +strings be representing (a float) and its role (an attribute). This +allows primitives later in a pipeline to convert them to proper +structural types but also allows additional analysis on original values +before such conversion is done. + +Fetching all metadata for ``["0"][1][2]`` now returns: + +.. code:: json + + { + "name": "sepalWidth", + "structural_type": "builtins.str", + "semantic_types": [ + "http://schema.org/Float", + "https://metadata.datadrivendiscovery.org/types/Attribute" + ], + "description": "Measured personally by Ronald Fisher." + } + +.. _metadata_api: + +API +~~~ + +:mod:`d3m.metadata.base` module provides two +classes which serve for storing metadata on values: :class:`~d3m.metadata.base.DataMetadata` for +data values, and :class:`~d3m.metadata.base.PrimitiveMetadata` for primitives. It also exposes a +:const:`~d3m.metadata.base.ALL_ELEMENTS` constant to be used in selectors. + +You can see public methods available on classes documented in their +code. Some main ones are: + +- ``__init__(metadata)`` – constructs a new instance of the metadata + class and optionally initializes it with top-level metadata +- ``update(selector, metadata)`` – updates metadata at a given location + in data structure identified by a selector +- ``query(selector)`` – retrieves metadata at a given location +- ``query_with_exceptions(selector)`` – retrieves metadata at a given + location, but also returns metadata for selectors which have metadata + which differs from that of ``ALL_ELEMENTS`` +- ``remove(selector)`` – removes metadata at a given location +- ``get_elements(selector)`` – lists element names which exists at a + given location +- ``to_json()`` – converts metadata to a JSON representation +- ``pretty_print()`` – pretty-print all metadata + +``PrimitiveMetadata`` differs from ``DataMetadata`` that it does not +accept selector in its methods because there is no structure in +primitives. 
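As a short sketch of this API in use (mirroring the Iris example above; the calls follow the methods listed here):

.. code:: python

    from d3m.metadata import base as metadata_base

    metadata = metadata_base.DataMetadata()

    # Dataset-level (container) metadata uses the empty selector.
    metadata = metadata.update((), {
        'id': 'iris_dataset_1',
        'name': 'Iris Dataset',
        'dimension': {'name': 'resources', 'length': 1},
    })

    # Metadata for the third column of all rows of resource "0".
    metadata = metadata.update(('0', metadata_base.ALL_ELEMENTS, 2), {
        'name': 'sepalWidth',
        'description': 'Measured personally by Ronald Fisher.',
    })

    # Queries merge metadata from less specific to more specific selectors.
    metadata.query(('0', 1, 2))['name']  # 'sepalWidth'

Note that ``update`` does not modify metadata in-place but returns a new, updated copy, which is why the result is reassigned after every call.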
+ +Standard metadata keys +~~~~~~~~~~~~~~~~~~~~~~ + +You can use custom keys for metadata, but the following keys are +standardized, so you should use those if you are trying to represent the +same metadata: +https://metadata.datadrivendiscovery.org/schemas/v0/definitions.json + +The same key always have the same meaning and we reuse the same key in +different contexts when we need the same meaning. So instead of having +both ``primitive_name`` and ``dataset_name`` we have just ``name``. + +Different keys are expected in different contexts: + +- ``primitive`` – + https://metadata.datadrivendiscovery.org/schemas/v0/primitive.json +- ``container`` – + https://metadata.datadrivendiscovery.org/schemas/v0/container.json +- ``data`` – + https://metadata.datadrivendiscovery.org/schemas/v0/data.json + +A more user friendly visualization of schemas listed above is available +at https://metadata.datadrivendiscovery.org/. + +Contribute: Standardizing metadata schemas are an ongoing process. Feel +free to contribute suggestions and merge requests with improvements. + +.. _primitive-metadata: + +Primitive metadata +~~~~~~~~~~~~~~~~~~ + +Part of primitive metadata can be automatically obtained from +primitive's code, some can be computed through evaluation of primitives, +but some has to be provided by primitive's author. Details of which +metadata is currently standardized and what values are possible can be +found in primitive's JSON schema. This section describes author's +metadata into more detail. Example of primitive's metadata provided by +an author from `Monomial test +primitive `__, +slightly modified: + +.. code:: python + + metadata = metadata_module.PrimitiveMetadata({ + 'id': '4a0336ae-63b9-4a42-860e-86c5b64afbdd', + 'version': '0.1.0', + 'name': "Monomial Regressor", + 'keywords': ['test primitive'], + 'source': { + 'name': 'Test team', + 'uris': [ + 'https://gitlab.com/datadrivendiscovery/tests-data/blob/master/primitives/test_primitives/monomial.py', + 'https://gitlab.com/datadrivendiscovery/tests-data.git', + ], + }, + 'installation': [{ + 'type': metadata_module.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/tests-data.git@{git_commit}#egg=test_primitives&subdirectory=primitives'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + }], + 'location_uris': [ + 'https://gitlab.com/datadrivendiscovery/tests-data/raw/{git_commit}/primitives/test_primitives/monomial.py'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + ], + 'python_path': 'd3m.primitives.test.MonomialPrimitive', + 'algorithm_types': [ + metadata_module.PrimitiveAlgorithmType.LINEAR_REGRESSION, + ], + 'primitive_family': metadata_module.PrimitiveFamily.REGRESSION, + }) + +- Primitive's metadata provided by an author is defined as a class + attribute and instance of :class:`~d3m.metadata.base.PrimitiveMetadata`. +- When class is defined, class is automatically analyzed and metadata + is extended with automatically obtained values from class code. +- ``id`` can be simply generated using :func:`uuid.uuid4` in Python and + should never change. **Do not reuse IDs and do not use the ID from + this example.** +- When primitive's code changes you should update the version, a `PEP + 440 `__ compatible one. + Consider updating a version every time you change code, potentially + using `semantic versioning `__, but nothing of + this is enforced. +- ``name`` is a human-friendly name of the primitive. 
+- ``keywords`` can be anything you want to convey to users of the + primitive and which could help with primitive's discovery. +- ``source`` describes where the primitive is coming from. The required + value is ``name`` to tell information about the author, but you might + be interested also in ``contact`` where you can put an e-mail like + ``mailto:author@example.com`` as a way to contact the author. + ``uris`` can be anything. In above, one points to the code in GitLab, + and another to the repo. If there is a website for the primitive, you + might want to add it here as well. These URIs are not really meant + for automatic consumption but are more as a reference. See + ``location_uris`` for URIs to the code. +- ``installation`` is important because it describes how can your + primitive be automatically installed. Entries are installed in order + and currently the following types of entries are supported: +- A ``PIP`` package available on PyPI or some other package registry: + + :: + + ``` + { + 'type': metadata_module.PrimitiveInstallationType.PIP, + 'package': 'my-primitive-package', + 'version': '0.1.0', + } + ``` + +- A ``PIP`` package available at some URI. If this is a git repository, + then an exact git hash and ``egg`` name should be provided. ``egg`` + name should match the package name installed. Because here we have a + chicken and an egg problem: how can one commit a hash of code version + if this changes the hash, you can use a helper utility function to + provide you with a hash automatically at runtime. ``subdirectory`` + part of the URI suffix is not necessary and is here just because this + particular primitive happens to reside in a subdirectory of the + repository. +- A ``DOCKER`` image which should run while the primitive is operating. + Starting and stopping of a Docker container is managed by a caller, + which passes information about running container through primitive's + ``docker_containers`` ``__init__`` argument. The argument is a + mapping between the ``key`` value and address and ports at which the + running container is available. See `Sum test + primitive `__ + for an example: + + :: + + ``` + { + 'type': metadata_module.PrimitiveInstallationType.DOCKER, + 'key': 'summing', + 'image_name': 'registry.gitlab.com/datadrivendiscovery/tests-data/summing', + 'image_digest': 'sha256:07db5fef262c1172de5c1db5334944b2f58a679e4bb9ea6232234d71239deb64', + } + ``` + +- A ``UBUNTU`` entry can be used to describe a system library or + package required for installation or operation of your primitive. If + your other dependencies require a system library to be installed + before they can be installed, list this entry before them in + ``installation`` list. + + :: + + ``` + { + 'type': metadata_module.PrimitiveInstallationType.UBUNTU, + 'package': 'ffmpeg', + 'version': '7:3.3.4-2', + } + ``` + +- A ``FILE`` entry allows a primitive to specify a static file + dependency which should be provided by a caller to a primitive. + Caller passes information about the file path of downloaded file + through primitive's ``volumes`` ``__init__`` argument. The argument + is a mapping between the ``key`` value and file path. The filename + portion of the provided path does not necessary match the filename + portion of the file's URI. 
+ + :: + + ``` + { + 'type': metadata_module.PrimitiveInstallationType.FILE, + 'key': 'model', + 'file_uri': 'http://mmlab.ie.cuhk.edu.hk/datasets/comp_cars/googlenet_finetune_web_car_iter_10000.caffemodel', + 'file_digest': '6bdf72f703a504cd02d7c3efc6c67cbbaf506e1cbd9530937db6a698b330242e', + } + ``` + +- A ``TGZ`` entry allows a primitive to specify a static directory + dependency which should be provided by a caller to a primitive. + Caller passes information about the directory path of downloaded and + extracted file through primitive's ``volumes`` ``__init__`` argument. + The argument is a mapping between the ``key`` value and directory + path. + + :: + + ``` + { + 'type': metadata_module.PrimitiveInstallationType.TGZ, + 'key': 'mails', + 'file_uri': 'https://www.cs.cmu.edu/~enron/enron_mail_20150507.tar.gz', + 'file_digest': 'b3da1b3fe0369ec3140bb4fbce94702c33b7da810ec15d718b3fadf5cd748ca7', + } + ``` + +- If you can provide, ``location_uris`` points to an exact code used by + the primitive. This can be obtained through installing a primitive, + but it can be helpful to have an online resource as well. +- ``python_path`` is a path under which the primitive will get mapped + through ``setup.py`` entry points. This is very important to keep in + sync. +- ``algorithm_types`` and ``primitive_family`` help with discovery of a + primitive. They are required and if suitable values are not available + for you, make a merge request and propose new values. As you see in + the code here and in ``installation`` entries, you can use directly + Python enumerations to populate these values. + +Some other metadata you might be interested to provide to help callers +use your primitive better are ``preconditions`` (what preconditions +should exist on data for primitive to operate well), ``effects`` (what +changes does a primitive do to data), and a ``hyperparams_to_tune`` hint +to help callers know which hyper-parameters are most important to focus +on. + +Primitive metadata also includes descriptions of a primitive and its +methods. These descriptions are automatically obtained from primitive's +docstrings. Docstrings should be made according to :ref:`numpy docstring +format ` +(`examples `__). + +Data metadata +~~~~~~~~~~~~~ + +Every value passed around a pipeline has metadata associated with it. +Defined container types have an attribute ``metadata`` to contain it. +API available to manipulate metadata is still evolving because many +operations one can do on data are reasonable also on metadata (e.g., +slicing and combining data). Currently, every operation on data clears +and re-initializes associated metadata. + + **Note:** While part of primitive's metadata is obtained + automatically nothing like that is currently done for data metadata. + This means one has to manually populate with dimension and typing + information. This will be improved in the future with automatic + extraction of this metadata from data. + +Parameters +---------- + +A base class to be subclassed and used as a type for :class:`~d3m.metadata.params.Params` type +argument in primitive interfaces can be found in the +:mod:`d3m.metadata.params` module. An +instance of this subclass should be returned from primitive's +:meth:`~d3m.metadata.params.Params.get_params` method, and accepted in :meth:`~d3m.metadata.params.Params.set_params`. + +To define parameters a primitive has you should subclass this base class +and define parameters as class attributes with type annotations. +Example: + +.. 
code:: python
+
+    import numpy
+    from d3m.metadata import params
+
+    class Params(params.Params):
+        weights: numpy.ndarray
+        bias: float
+
+The :class:`~d3m.metadata.params.Params` class is just a fancy Python dict which checks types of
+parameters and requires all of them to be set. You can create it like:
+
+.. code:: python
+
+    ps = Params({'weights': weights, 'bias': 0.1})
+    ps['bias']
+
+::
+
+    0.1
+
+``weights`` and ``bias`` do not exist as attributes on the class or
+instance. In the class definition, they are just type annotations to
+configure which parameters are there.
+
+    **Note:** :class:`~d3m.metadata.params.Params` class uses ``parameter_name: type`` syntax
+    while :class:`~d3m.metadata.hyperparams.Hyperparams` class uses
+    ``hyperparameter_name = Descriptor(...)`` syntax. Do not confuse
+    them.
+
+.. _hyperparameters:
+
+Hyper-parameters
+----------------
+
+A base class for hyper-parameter descriptions for primitives can be
+found in the
+:mod:`d3m.metadata.hyperparams` module.
+
+To define a hyper-parameters space, you should subclass this base class
+and define hyper-parameters as class attributes. Example:
+
+.. code:: python
+
+    from d3m.metadata import hyperparams
+
+    class Hyperparams(hyperparams.Hyperparams):
+        learning_rate = hyperparams.Uniform(lower=0.0, upper=1.0, default=0.001, semantic_types=[
+            'https://metadata.datadrivendiscovery.org/types/TuningParameter'
+        ])
+        clusters = hyperparams.UniformInt(lower=1, upper=100, default=10, semantic_types=[
+            'https://metadata.datadrivendiscovery.org/types/TuningParameter'
+        ])
+
+To access the hyper-parameters space configuration, you can now call:
+
+.. code:: python
+
+    Hyperparams.configuration
+
+::
+
+    OrderedDict([('learning_rate', Uniform(lower=0.0, upper=1.0, q=None, default=0.001)), ('clusters', UniformInt(lower=1, upper=100, default=10))])
+
+To get a random sample of all hyper-parameters, call:
+
+.. code:: python
+
+    hp1 = Hyperparams.sample(random_state=42)
+
+::
+
+    Hyperparams({'learning_rate': 0.3745401188473625, 'clusters': 93})
+
+To get an instance with all default values:
+
+.. code:: python
+
+    hp2 = Hyperparams.defaults()
+
+::
+
+    Hyperparams({'learning_rate': 0.001, 'clusters': 10})
+
+The :class:`~d3m.metadata.hyperparams.Hyperparams` class is just a fancy read-only Python dict. You can
+also manually create an instance:
+
+.. code:: python
+
+    hp3 = Hyperparams({'learning_rate': 0.01, 'clusters': 20})
+    hp3['learning_rate']
+
+::
+
+    0.01
+
+If you want to use most of the default values, but set some, you can
+use this dict-construction approach:
+
+.. code:: python
+
+    hp4 = Hyperparams(Hyperparams.defaults(), clusters=30)
+
+::
+
+    Hyperparams({'learning_rate': 0.001, 'clusters': 30})
+
+There is no class- or instance-level attribute ``learning_rate`` or
+``clusters``. In the class definition, they were used only for defining
+the hyper-parameters space, but those attributes were extracted out and
+put into the ``configuration`` attribute.
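+
+Building on the ``Hyperparams`` class defined above, the following minimal sketch illustrates
+that the space definition lives in ``configuration`` and that values are validated on
+construction. The exact exception type raised for an out-of-range value is an assumption and
+may differ between core package versions:
+
+.. code:: python
+
+    # The space definition is stored in the "configuration" attribute, not on instances.
+    print(Hyperparams.configuration['clusters'])
+
+    # Values are checked against the hyper-parameter space when an instance is created.
+    hp5 = Hyperparams(Hyperparams.defaults(), learning_rate=0.5)
+    print(hp5['learning_rate'])
+
+    # An out-of-range value is rejected ("clusters" must be at least 1 in the space above).
+    try:
+        Hyperparams(Hyperparams.defaults(), clusters=0)
+    except Exception as error:
+        print(error)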
+
+There are four types of hyper-parameters:
+
+* tuning parameters which should be tuned during the hyper-parameter
+  optimization phase
+* control parameters which should be determined during the pipeline
+  construction phase and are part of the logic of the pipeline
+* parameters which control the use of resources by the primitive
+* parameters which control which meta-features are computed by the
+  primitive
+
+You can use a hyper-parameter's semantic type to differentiate between
+those types of hyper-parameters using the following URIs:
+
+* https://metadata.datadrivendiscovery.org/types/TuningParameter
+* https://metadata.datadrivendiscovery.org/types/ControlParameter
+* https://metadata.datadrivendiscovery.org/types/ResourcesUseParameter
+* https://metadata.datadrivendiscovery.org/types/MetafeatureParameter
+
+Once you define a :class:`~d3m.metadata.hyperparams.Hyperparams` class for your primitive, you can pass
+it as a class type argument in your primitive's class definition:
+
+.. code:: python
+
+    class MyPrimitive(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]):
+        ...
+
+Those class type arguments are then automatically extracted from the
+class definition and made part of the primitive's metadata. This allows the
+caller to access the :class:`~d3m.metadata.hyperparams.Hyperparams` class to create an instance to pass
+to the primitive's constructor:
+
+.. code:: python
+
+    hyperparams_class = MyPrimitive.metadata.get_hyperparams()
+    primitive = MyPrimitive(hyperparams=hyperparams_class.defaults())
+
+    **Note:** :class:`~d3m.metadata.hyperparams.Hyperparams` class uses
+    ``hyperparameter_name = Descriptor(...)`` syntax while :class:`~d3m.metadata.params.Params`
+    class uses ``parameter_name: type`` syntax. Do not confuse them.
+
+Problem description
+-------------------
+
+The :mod:`d3m.metadata.problem` module provides
+a parser for problem descriptions into a normalized Python object.
+
+You can load a problem description and get the loaded object dumped back
+by running:
+
+.. code:: bash
+
+    python3 -m d3m problem describe
+
+Dataset
+-------
+
+This package also provides a Python class to load and represent datasets,
+in the :mod:`d3m.container.dataset`
+module. This container value can serve as an input to the whole pipeline
+and be used as input for primitives which operate on a dataset as a
+whole. It allows one to register multiple loaders to support different
+formats of datasets. You pass a dataset URI and it automatically
+picks the right loader. By default it supports:
+
+- D3M dataset. Only the ``file://`` URI scheme is supported and the URI
+  should point to the ``datasetDoc.json`` file. Example:
+  ``file:///path/to/datasetDoc.json``
+- CSV file. Many URI schemes are supported, including remote ones like
+  ``http://``. The URI should point to a file with a ``.csv`` extension.
+  Example: ``http://example.com/iris.csv``
+- Sample datasets from :mod:`sklearn.datasets`.
+  Example: ``sklearn://boston``
+
+You can load a dataset and get the loaded object dumped back by running:
+
+.. code:: bash
+
+    python3 -m d3m dataset describe
diff --git a/d3m/docs/pipeline.rst b/d3m/docs/pipeline.rst
new file mode 100644
index 0000000..733238d
--- /dev/null
+++ b/d3m/docs/pipeline.rst
@@ -0,0 +1,443 @@
+Pipeline
+========
+
+A pipeline is described as a DAG consisting of interconnected steps, where
+steps can be primitives, or other (nested) pipelines.
Pipeline has +data-flow semantics, which means that steps are not necessary executed +in the order they are listed, but a step can be executed when all its +inputs are available. Some steps can even be executed in parallel. On +the other hand, each step can use only previously defined outputs from +steps coming before in the order they are listed. In JSON, the following +is a sketch of its representation: + +.. code:: yaml + + { + "id": , + "schema": , + "source": { + "name": , + "contact": , + "from": + ... # Any extra metadata author might want to add into the pipeline, like version, + # name, and config parameters of the system which produced this pipeline. + }, + "created": , + "name": , + "description": , + "users": [ + { + "id": , + "reason": , + "rationale": + } + ], + "inputs": [ + { + "name": + } + ], + "outputs": [ + { + "name": , + "data": + } + ], + "steps": [ + { + "type": "PRIMITIVE", + "primitive": { + "id": , + "version": , + "python_path": , + "name": , + "digest": + }, + # Constructor arguments should not be listed here, because they can be automatically created from other + # information. All these arguments are listed as kind "PIPELINE" in primitive's metadata. + "arguments": { + # A standard inputs argument used for both set_training_data and default "produce" method. + "inputs": { + "type": "CONTAINER", + "data": + }, + # A standard inputs argument, used for "set_training_data". + "outputs": { + "type": "CONTAINER", + "data": + }, + # An extra argument which takes as inputs outputs from another primitive in this pipeline. + "extra_data": { + "type": "CONTAINER", + "data": + }, + # An extra argument which takes as input a singleton output from another step in this pipeline. + "offset": { + "type": "DATA", + "data": + } + }, + "outputs": [ + { + # Data is made available by this step from default "produce" method. + "id": "produce" + }, + { + # Data is made available by this step from an extra "produce" method, too. + "id": "produce_score" + } + ], + # Some hyper-parameters are not really tunable and should be fixed as part of pipeline definition. This + # can be done here. Hyper-parameters listed here cannot be tuned or overridden during a run. Author of + # a pipeline decides which hyper-parameters are which, probably based on their semantic type. + # This is a map hyper-parameter names and their values using a similar format as arguments, but + # allowing also PRIMITIVE and VALUE types. + "hyperparams": { + "loss": { + "type": "PRIMITIVE", + "data": <0-based index from steps identifying a primitive to pass in> + }, + "column_to_operate_on": { + "type": "VALUE", + # Value is converted to a JSON-compatible value by hyper-parameter class. + # It also knows how to convert it back. + "data": 5 + }, + # A special case where a hyper-parameter can also be a list of primitives, + # which are then passed to the \"Set\" hyper-parameter class. + "ensemble": { + "type": "PRIMITIVE", + "data": [ + <0-based index from steps identifying a primitive to pass in>, + <0-based index from steps identifying a primitive to pass in> + ] + } + }, + "users": [ + { + "id": , + "reason": , + "rationale": + } + ] + }, + { + "type": "SUBPIPELINE", + "pipeline": { + "id": + }, + # For example: [{"data": "steps.0.produce"}] would map the data reference "steps.0.produce" of + # the outer pipeline to the first input of a sub-pipeline. 
+ "inputs": [ + { + "data": + } + ], + # For example: [{"id": "predictions"}] would map the first output of a sub-pipeline to a data + # reference "steps.X.predictions" where "X" is the step number of a given sub-pipeline step. + "outputs": [ + { + "id": + } + ] + }, + { + # Used to represent a pipeline template which can be used to generate full pipelines. Not to be used in + # the metalearning context. Additional properties to further specify the placeholder constraints are allowed. + "type": "PLACEHOLDER", + # A list of inputs which can be used as inputs to resulting sub-pipeline. + # Resulting sub-pipeline does not have to use all the inputs, but it cannot use any other inputs. + "inputs": [ + { + "data": + } + ], + # A list of outputs of the resulting sub-pipeline. + # Their (allowed) number and meaning are defined elsewhere. + "outputs": [ + { + "id": + } + ] + } + ] + } + +``id`` uniquely identifies this particular database document. + +Pipeline describes how inputs are computed into outputs. In most cases +inputs are :class:`~d3m.container.dataset.Dataset` container values and +outputs are predictions as Pandas :class:`~d3m.container.pandas.DataFrame` container +values in `Lincoln Labs predictions +format `__, +and, during training, potentially also internal losses/scores. The same +pipeline is used for both training and predicting. + +Pipeline description contains many *data references*. Data reference is +just a string which identifies an output of a step or a pipeline input +and forms a data-flow connection between data available and an input to +a step. It is recommended to be a string of the following forms: + +- ``steps..`` — ``number`` identifies the step in the list + of steps (0-based) and ``id`` identifies the name of a produce method + of the primitive, or the output of a pipeline step +- ``inputs.`` — ``number`` identifies the pipeline input + (0-based) +- ``outputs.`` — ``number`` identifies the pipeline output + (0-based) + +Inputs in the context of metalearning are expected to be datasets, and +the order of inputs match the order of datasets in a pipeline run. (In +other contexts, like TA2-TA3 API, inputs might be something else, for +example a pipeline can consist of just one primitive a TA3 wants to run +on a particular input.) + +Remember that each primitive has a set of arguments it takes as a whole, +combining all the arguments from all its methods. Each argument +(identified by its name) can have only one value associated with it and +any method accepting that argument receives that value. Once all values +for all arguments for a method are available, that method can be called. + +Remember as well that each primitive can have multiple "produce" +methods. These methods can be called after a primitive has been fitted. +In this way a primitive can have multiple outputs, for each "produce" +method one. + +Placeholders can be used to define pipeline templates to be used outside +of the metalearning context. A placeholder is replaced with a pipeline +step to form a pipeline. Restrictions of placeholders may apply on the +number of them, their position, allowed inputs and outputs, etc. + +.. _pipeline-description-example: + +Pipeline description example +---------------------------- + +The following example uses the core package and the `common primitives +repo `__, this +example provides the basic knowledge to build a pipeline in memory. This +specific example creates a pipeline for classification task. + +.. 
code:: python + + from d3m import index + from d3m.metadata.base import ArgumentType + from d3m.metadata.pipeline import Pipeline, PrimitiveStep + + # -> dataset_to_dataframe -> column_parser -> extract_columns_by_semantic_types(attributes) -> imputer -> random_forest + # extract_columns_by_semantic_types(targets) -> ^ + + # Creating pipeline + pipeline_description = Pipeline() + pipeline_description.add_input(name='inputs') + + # Step 1: dataset_to_dataframe + step_0 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.dataset_to_dataframe.Common')) + step_0.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='inputs.0') + step_0.add_output('produce') + pipeline_description.add_step(step_0) + + # Step 2: column_parser + step_1 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.column_parser.Common')) + step_1.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce') + step_1.add_output('produce') + pipeline_description.add_step(step_1) + + # Step 3: extract_columns_by_semantic_types(attributes) + step_2 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common')) + step_2.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce') + step_2.add_output('produce') + step_2.add_hyperparameter(name='semantic_types', argument_type=ArgumentType.VALUE, + data=['https://metadata.datadrivendiscovery.org/types/Attribute']) + pipeline_description.add_step(step_2) + + # Step 4: extract_columns_by_semantic_types(targets) + step_3 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common')) + step_3.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce') + step_3.add_output('produce') + step_3.add_hyperparameter(name='semantic_types', argument_type=ArgumentType.VALUE, + data=['https://metadata.datadrivendiscovery.org/types/TrueTarget']) + pipeline_description.add_step(step_3) + + attributes = 'steps.2.produce' + targets = 'steps.3.produce' + + # Step 5: imputer + step_4 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_cleaning.imputer.SKlearn')) + step_4.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference=attributes) + step_4.add_output('produce') + pipeline_description.add_step(step_4) + + # Step 6: random_forest + step_5 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.regression.random_forest.SKlearn')) + step_5.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.4.produce') + step_5.add_argument(name='outputs', argument_type=ArgumentType.CONTAINER, data_reference=targets) + step_5.add_output('produce') + pipeline_description.add_step(step_5) + + # Final Output + pipeline_description.add_output(name='output predictions', data_reference='steps.5.produce') + + # Output to YAML + print(pipeline_description.to_yaml()) + +Pipeline Run +------------ + +:mod:`d3m.metadata.pipeline_run` module contains the classes that represent the Pipeline Run. The Pipeline Run was +introduced to ensure that pipeline execution could be captured and duplicated. To accomplish this, the problem doc, +hyperparameter settings and any other variables to the pipeline execution phases are captured by the Pipeline Run. 
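+
+For orientation, here is a minimal sketch of how the captured run might be saved from Python
+when using the reference runtime described later in this document. It assumes the runtime's fit
+result exposes the captured run as ``pipeline_run`` and that it can be converted with
+``to_json_structure()``; both names are assumptions and may differ between core package
+versions. In practice, the command shown next is the usual way to produce this file:
+
+.. code:: python
+
+    import yaml
+
+    # "runtime" and "dataset" are assumed to be prepared as in the
+    # "Reference runtime" example further below.
+    fit_result = runtime.fit(inputs=[dataset])
+    fit_result.check_success()
+
+    # Assumption: the fit result carries the captured Pipeline Run object.
+    run_document = fit_result.pipeline_run.to_json_structure()
+
+    with open('pipeline_run.yml', 'w') as run_file:
+        yaml.safe_dump(run_document, run_file)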
+
+The Pipeline Run is generated during pipeline execution:
+
+::
+
+    $ python3 -m d3m runtime fit-produce -p pipeline.json -r problem/problemDoc.json -i dataset_TRAIN/datasetDoc.json \
+        -t dataset_TEST/datasetDoc.json -o results.csv -O pipeline_run.yml
+
+In YAML, the following is a sketch of the Pipeline Run representation in two phases for the above fit-produce call:
+
+.. code:: yaml
+
+    context:
+    datasets:
+
+    end:
+    environment:
+ id: e3187585-cf8b-5e31-9435-69907912c3ca + pipeline: + + problem: + + random_seed: + run: + is_standard_pipeline: true + phase: FIT + results: + + schema: https://metadata.datadrivendiscovery.org/schemas/v0/pipeline_run.json + start: + status: + state: + steps: +
+ --- + context: + datasets: + + end: + environment: +
+ id: b2e9b591-c332-5bc5-815e-d1ec73ecdb06 + pipeline: + + previous_pipeline_run: + id: e3187585-cf8b-5e31-9435-69907912c3ca + problem: + + random_seed: + run: + is_standard_pipeline: true + phase: PRODUCE + results: + + scoring: + datasets: + + end: + pipeline: + + random_seed: + start: + status: + state: + steps: +
+ schema: https://metadata.datadrivendiscovery.org/schemas/v0/pipeline_run.json + start: + status: + state: + steps: +
+ +The d3m module has a call that supports actions Pipeline Run: + +:: + + $ python3 -m d3m pipeline-run --help + +Currently there is only one command available which validates a Pipeline Run: + +:: + + $ python3 -m d3m pipeline-run validate pipeline_run.yml + +The Reference Runtime offers a way to pass an existing Pipeline Run file to a runtime command to allow it to be rerun. +Here is an example of this for the fit-produce call: + +:: + + $ python3 -m d3m runtime fit-produce -u pipeline_run.yml + +Here is the guidance from the help menu: + +:: + + -u INPUT_RUN, --input-run INPUT_RUN + path to a pipeline run file with configuration, use + "-" for stdin + + +Reference runtime +----------------- + +:mod:`d3m.runtime` module contains a reference runtime for pipelines. This +module also has an extensive command line interface you can access +through ``python3 -m d3m runtime``. + +Example of fitting and producing a pipeline with Runtime: + +.. code:: python + + from d3m.metadata import base as metadata_base, hyperparams as hyperparams_module, pipeline as pipeline_module, problem + from d3m.container.dataset import Dataset + from d3m.runtime import Runtime + + # Loading problem description. + problem_description = problem.parse_problem_description('problemDoc.json') + + # Loading dataset. + path = 'file://{uri}'.format(uri=os.path.abspath('datasetDoc.json')) + dataset = Dataset.load(dataset_uri=path) + + # Loading pipeline description file. + with open('pipeline_description.json', 'r') as file: + pipeline_description = pipeline_module.Pipeline.from_json(string_or_file=file) + + # Creating an instance on runtime with pipeline description and problem description. + runtime = Runtime(pipeline=pipeline_description, problem_description=problem_description, context=metadata_base.Context.TESTING) + + # Fitting pipeline on input dataset. + fit_results = runtime.fit(inputs=[dataset]) + fit_results.check_success() + + # Producing results using the fitted pipeline. + produce_results = runtime.produce(inputs=[dataset]) + produce_results.check_success() + + print(produce_results.values) + +Also, the Runtime provides a very useful set of tools to run pipelines +on the terminal, here is a basic example of how to fit and produce a +pipeline like the previous example: + +:: + + $ python3 -m d3m runtime fit-produce -p pipeline.json -r problem/problemDoc.json -i dataset_TRAIN/datasetDoc.json -t dataset_TEST/datasetDoc.json -o results.csv -O pipeline_run.yml + +For more information about the usage: + +:: + + $ python3 -m d3m runtime --help diff --git a/d3m/docs/primitive-checklist.rst b/d3m/docs/primitive-checklist.rst new file mode 100644 index 0000000..5871fdb --- /dev/null +++ b/d3m/docs/primitive-checklist.rst @@ -0,0 +1,139 @@ +.. _primitive-good-citizen: + +Primitive Good Citizen Checklist +================================ + +This is a list of dos, don'ts and things to consider when crafting a new primitive or updating an existing one. This +list is not exhaustive so please add new items to the list as they are discovered! An example of a primitive that +endeavors to adheres to all of the following guidance can be found `here`_: + +DO's + +* Do complete the documentation on the primitive such as: + + * Primitive family, algorithm type. + * Docstring of the primitive's Python class. + + * One line summary first: + + * Primitive name should be close to this. + * Primitive path should be close to this as well. + + * Longer documentation/description after, all in the main docstring of the class. 
+ + * Provide pipeline examples together with the primitive annotation. + * Docstrings in `numpy style`_. + * Please use `reStructuredText`_ instead of markdown or other formats. + * Maintain a change-log of alterations to the primitive (somewhere in the primitive's repo, consider using a `standard format`_). + * One should also add point of contact information and the git repository link in primitive's metadata + (``source.name``, ``source.contact`` and ``source.uris`` metadata fields). + * Add your primitive name to the `list of primitive names`_ if it does not already + exist. Chances are that your generic primitive name is in that list and you should use that name for your primitive. + +* Do annotate your Primitive with Python types. + +* Do make sure the output from your produce method is a d3m container type. + + + +* If your primitive is operating on columns and rows: + + * Do include ``d3mIndex`` column in produced output if input has ``d3mIndex`` column. + * You can make this behavior controlled by the ``add_index_columns`` hyper-parameter. + + * If a primitive has a hyper-paramer to directly set which columns to operate on, do use column + indices and not column names to identify those columns. + + * Consider using a pair of hyper-parameters: ``use_columns`` and ``exclude_columns`` with standard logic. + + * When deciding on which columns to operate, when using semantic types, do use + ``https://metadata.datadrivendiscovery.org/types/TrueTarget`` + and ``https://metadata.datadrivendiscovery.org/types/RedactedPrivilegedData`` semantic types and not + ``https://metadata.datadrivendiscovery.org/types/SuggestedTarget`` and + ``https://metadata.datadrivendiscovery.org/types/SuggestedPrivilegedData``. + The latter are semantic types which come from the dataset, the former are those which come from the problem description. + While it is true that currently generally they always match, in fact primitives should just respect those coming from + the problem description. The dataset has them so that one can create problem descriptions on the fly, if needed. + +* Be mindful that data being passed through a pipeline also has metadata: + + * If your primitive generates new data (e.g., new columns), add metadata suitable for those columns: + + * Name the column appropriately for human consumption by setting column's ``name`` metadata. + + * Set semantic types appropriately. + + * If your primitive is producing target predictions, add ``https://metadata.datadrivendiscovery.org/types/PredictedTarget`` + to a column containing those predictions. + + * Remember metadata encountered on target columns during fitting, and reuse that metadata as much + as reasonable when producing target predictions. + + * If your primitive is transforming existing data (e.g., transforming columns), reuse as much metadata from + original data as reasonable, but do update metadata based on new data. + + * If structural type of the column changes, make sure you note this change in metadata as well. + + * Support also non-standard metadata and try to pass it through as-is if possible. + +* Do write unit tests for your primitives. This greatly aids porting to a new version of the core package. + + * Test pickle and unpickle of the primitive (both fitted and unfitted primitives). + * Test with use of semantic types to select columns to operate on, and without the use of semantic types. + * Test with all return types: ``append``, ``replace``, ``new``. + * Test all hyper-parameter values with their ``sample`` method. 
+ * Use/contribute to `tests data repository`_. + +* Do clearly define hyper-parameters (bounds, descriptions, semantic types). + + * Suggest new classes of hyper-parameters if needed. + * Consider if ``upper_inclusive`` and ``lower_inclusive`` values should be included or not for every hyper-parameter + * Define reasonable hyper-parameters which can be automatically populated/searched by TA2. + A hyper-parameter such as ``hyperparams.Hyperparameter[typing.Sequence[Any]]`` is not useful in this case. + * Ensure that your primitive can be run successfully with default settings for all hyper-parameters. + * If there are combinations of hyper-parameters settings that are suboptimal please note this in the documentation. For + example: "If hyper-parameter A is set to a True, hyper-parameter B must always be a positive integer". + +* Do bump primitive version when changing hyper-parameters, method signatures or params. + In short, on any API change of your primitive. + +* If your primitive can use GPUs if available, set ``can_use_gpus`` primitive's metadata to true. + +* If your primitive can use different number of CPUs/cores, expose a hyper-parameter with semantic types + `https://metadata.datadrivendiscovery.org/types/ResourcesUseParameter` and `https://metadata.datadrivendiscovery.org/types/CPUResourcesUseParameter` + and allow caller to control the number of CPUs/cores used through it. + + * Make sure that the default value of such hyper-parameter is 1. + +DON'Ts + +* Don't change the input DataFrame! Make a copy and make changes to the copy instead. The original input DataFrame is + assumed never to change between primitives in the pipeline. +* Don't return DataFrames with a (non-default) Pandas DataFrame index. It can be utilized internally, but drop it before + returning. On output a default index should be provided. + +PLEASE CONSIDER + +* Consider using/supporting semantic types to select which columns to operate on, and use the `use_semantic_types` hyper-parameter. +* Consider allowing three types of outputs strategies: ``new``/``append``/``replace`` output, if operating on columns, + controlled by the ``return_result`` hyper-parameter. +* Consider picking the input and output format/structure of data to match other primitives of the same family/type. If + necessary, convert data to the format you need inside your primitive. Pipelines tend to start with datasets, then go + to dataframes, and then to ndarrays sometimes, returning predictions as a dataframe. + Consider where your primitive in a pipeline generally should be and + consider that when deciding on what are inputs and outputs of your primitive. Consider that your primitive will be + chosen dynamically by a TA2 and will be expected to behave in predictable ways based on family and base class. +* Consider using a specific hyper-parameter class instead of the hyper-parameter base class as it is not very useful for + TA2s. For example use ``hyperparams.Set`` instead of ``hyperparams.Hyperparameter[typing.Sequence[Any]]``. It is + better to use the former as it is far more descriptive. +* Use a base class for your primitive which makes sense based on semantics of the base class and not necessarily + how a human would understand the primitive. +* Consider that your primitive will be chosen dynamically by a TA2 and will + be expected to behave in predictable ways based on primitive family and base class. + +.. _here: https://gitlab.com/datadrivendiscovery/common-primitives/blob/master/common_primitives/random_forest.py +.. 
_numpy style: https://numpydoc.readthedocs.io/en/latest/format.html +.. _reStructuredText: http://www.sphinx-doc.org/en/master/usage/restructuredtext/basics.html +.. _tests data repository: https://gitlab.com/datadrivendiscovery/tests-data +.. _standard format: https://keepachangelog.com/en/1.0.0/ +.. _list of primitive names: https://gitlab.com/datadrivendiscovery/d3m/-/blob/devel/d3m/metadata/primitive_names.py diff --git a/d3m/docs/primitives_base_classes.rst b/d3m/docs/primitives_base_classes.rst new file mode 100644 index 0000000..c611db7 --- /dev/null +++ b/d3m/docs/primitives_base_classes.rst @@ -0,0 +1,40 @@ +High-level primitives base classes +================================== + +High-level primitives base classes provides tools to the developers +to easily create new primitives by abstracting some unnecessary and +repetitive work. + +Primitives base classes +----------------------- + +``FileReaderPrimitiveBase``: A primitive base class for reading files referenced in columns. + +``DatasetSplitPrimitiveBase``: A base class for primitives which fit on a +``Dataset`` object to produce splits of that ``Dataset`` when producing. + +``TabularSplitPrimitiveBase``: A primitive base class for splitting tabular datasets. + + +Examples +-------- + +Examples of primitives using these base classes can be found `in +this +repository `__: + +- `DataFrameImageReaderPrimitive `__ + A primitive which reads columns referencing image files. +- `FixedSplitDatasetSplitPrimitive `__ + A primitive which splits a tabular Dataset in a way that uses for the test + (score) split a fixed list of primary index values or row indices of the main + resource to be used. All other rows are added used for the train split. +- `KFoldDatasetSplitPrimitive `__ + A primitive which splits a tabular Dataset for k-fold cross-validation. +- `KFoldTimeSeriesSplitPrimitive `__ + A primitive which splits a tabular time-series Dataset for k-fold cross-validation. +- `NoSplitDatasetSplitPrimitive `__ + A primitive which splits a tabular Dataset in a way that for all splits it + produces the same (full) Dataset. +- `TrainScoreDatasetSplitPrimitive `__ + A primitive which splits a tabular Dataset into random train and score subsets. diff --git a/d3m/docs/quickstart.rst b/d3m/docs/quickstart.rst new file mode 100644 index 0000000..40cb826 --- /dev/null +++ b/d3m/docs/quickstart.rst @@ -0,0 +1,817 @@ +.. _quickstart: + +TA1 quick-start guide +===================== + +This aims to be a tutorial, or a quick-start guide, for +newcomers to the D3M project who are interested in writing TA1 primitives. +It is not meant to be a comprehensive +guide to everything about D3M, or even just TA1. The goal here is for +the reader to be able to write a new, simple, but working primitive by +the end of this tutorial. To achieve this goal, this tutorial is divided +into several sections: + +Important links +--------------- + +First, here is a list of some important links that should help you with +reference and instructional material beyond this quick start guide. Be +aware also that the d3m core package source code has extensive docstrings that +:ref:`you may find helpful `. 
+ +- Documentation of the whole D3M program: + `https://docs.datadrivendiscovery.org `__ +- Common primitives: + `https://gitlab.com/datadrivendiscovery/common-primitives `__ +- Public datasets: + `https://datasets.datadrivendiscovery.org/d3m/datasets `__ +- Docker images: + `https://docs.datadrivendiscovery.org/docker.html `__ +- Index of TA1, TA2, TA3 repositories: + `https://github.com/darpa-i2o/d3m-program-index `__ +- :ref:`primitive-good-citizen` + +.. _overview-of-primitives-and-pipelines: + +Overview of primitives and pipelines +------------------------------------ + +Let's start with basic definitions in order for us to understand a +little bit better what happens when we run a pipeline later in the +tutorial. + +A *pipeline* is basically a series of steps that are executed in order +to solve a particular *problem* (such as prediction based on historical +data). A step of a pipeline is usually a *primitive* (a step can be +something else, however, like a sub-pipeline, but for the purposes of +this tutorial, assume that each step is a primitive): something that +individually could, for example, transform data into another format, or +fit a model for prediction. There are many types of primitives (see the +`primitives index repo`_ for the full +list of available primitives). In a pipeline, the steps must be arranged +in a way such that each step must be able to read the data in the format +produced by the preceding step. + +.. _primitives index repo: https://gitlab.com/datadrivendiscovery/primitives + +For this tutorial, let's try to use the example pipeline that comes with +a primitive called +``d3m.primitives.classification.logistic_regression.SKlearn`` to predict +baseball hall-of-fame players, based on their stats (see the +`185_baseball dataset `__). + +Let's take a look at the example pipeline. Many example pipelines can be found +in `primitives index repo`_ where they demonstrate how to use particular primitives. +At the time of this writing, an example pipeline can be found `here +`__, +but this repository's directory names and files periodically change, so it is +prudent to see how to navigate to this file too. + +The index is organized as: +- ``v2020.1.9`` (version of the core package of the index, changes periodically) +- ``JPL`` (the organization that develops/maintains the primitive) +- ``d3m.primitives.classification.logistic_regression.SKlearn`` (the python path of the actual primitive) +- ``2019.11.13`` (the version of this primitive, changes periodically) +- ``pipelines`` +- ``862df0a2-2f87-450d-a6bd-24e9269a8ba6.json`` (actual pipeline description filename, changes periodically) + +Early on in this JSON document, you will see a list called ``steps``. This +is the actual list of primitive steps that run one after another in a +pipeline. Each step has the information about the primitive, as well as +arguments, outputs, and hyper-parameters, if any. This specific pipeline +has 5 steps (the ``d3m.primitives`` prefix is omitted in the following +list): + +- ``data_transformation.dataset_to_dataframe.Common`` +- ``data_transformation.column_parser.Common`` +- ``data_cleaning.imputer.SKlearn`` +- ``classification.logistic_regression.SKlearn`` +- ``data_transformation.construct_predictions.Common`` + +Now let's take a look at the first primitive step in that pipeline. We +can find the source code of this primitive in the common-primitives repo +(`common_primitives/dataset_to_dataframe.py +`__). +Take a look particularly at the ``produce`` method. 
This is essentially
+what the primitive does. Try to do this for the other primitive steps in
+the pipeline as well - take a cursory look at what each one essentially
+does (note that for the actual classifier primitive, you should look at
+the ``fit`` method as well to see how the model is trained). Primitives
+whose python path suffix is ``*.Common`` are in the `common primitives `__
+repository, and those that have a ``*.SKlearn`` suffix are in the
+`sklearn-wrap `__ repository (check out the `dist `__ branch,
+into which the primitives are generated).
+
+If you're having a hard time finding the correct source file, you can try
+taking the primitive ``id`` from the primitive step description in the
+pipeline, and ``grep`` for it. For example, if you were
+looking for the source code of the first primitive step in this
+pipeline, first look at the primitive info in that step and get its
+``id``:
+
+.. code::
+
+    "primitive": {
+        "id": "4b42ce1e-9b98-4a25-b68e-fad13311eb65",
+        "version": "0.3.0",
+        "python_path": "d3m.primitives.data_transformation.dataset_to_dataframe.Common",
+        "name": "Extract a DataFrame from a Dataset"
+    },
+
+Then, run this:
+
+.. code:: shell
+
+    git clone https://gitlab.com/datadrivendiscovery/common-primitives.git
+    cd common-primitives
+    grep -r 4b42ce1e-9b98-4a25-b68e-fad13311eb65 . | grep -F .py
+
+However, this series of commands assumes that you know exactly which
+repository the primitive's source code is located in (the ``git
+clone`` command). Since this is probably not the case for an arbitrarily
+given primitive, there is a way to find out the repository URL
+of any primitive; it requires using a d3m Docker image, which is
+described in the next section.
+
+Setting up a local d3m environment
+----------------------------------
+
+In order to run a pipeline, you must have a Python environment with the
+d3m core package installed, as well as the packages of the primitives.
+While it is possible to set up a Python virtual
+environment and install the packages through ``pip``, in this
+tutorial, we're going to use the d3m Docker images instead (in many
+cases, even beyond this tutorial, this will save you a lot of time and
+effort trying to find any missing primitive packages, manually
+installing them, and troubleshooting installation errors). So, make sure
+`Docker `__ is installed on your system.
+
+You can find the list of D3M docker images `here `__.
+The one we're going to use in this tutorial is the v2020.1.9
+primitives image (feel free to use the latest one instead;
+just modify the ``v2020.1.9`` part accordingly):
+
+.. code:: shell
+
+    docker pull registry.gitlab.com/datadrivendiscovery/images/primitives:ubuntu-bionic-python36-v2020.1.9
+
+Once you have downloaded the image, we can finally run the d3m package
+(and hence run a pipeline). Before running a pipeline though, let's
+first try to get a list of what primitives are installed in the image's
+Python environment:
+
+.. code:: shell
+
+    docker run --rm registry.gitlab.com/datadrivendiscovery/images/primitives:ubuntu-bionic-python36-v2020.1.9 python3 -m d3m index search
+
+You should get a big list of primitives. All of the primitives known to
+D3M should be there.
+
+You can also run the docker container in interactive mode (to run
+commands as if you had logged into the machine the container provides) by
+using the ``-it`` option:
+
+..
code:: shell
+
+    docker run --rm -it registry.gitlab.com/datadrivendiscovery/images/primitives:ubuntu-bionic-python36-v2020.1.9
+
+The previous section mentions a method of determining where the source
+code of an arbitrarily given primitive can be found. We can do this
+using the d3m python package within a d3m docker container. First get the
+``python_path`` of the primitive step (see the JSON snippet above with the
+primitive's info from the pipeline). Then, run this command:
+
+.. code:: shell
+
+    docker run --rm registry.gitlab.com/datadrivendiscovery/images/primitives:ubuntu-bionic-python36-v2020.1.9 python3 -m d3m index describe d3m.primitives.data_transformation.dataset_to_dataframe.Common
+
+Near the top of the huge JSON string describing the primitive, you'll see
+``"source"``, and inside it, ``"uris"``. To help read the JSON, you can use
+the ``jq`` utility:
+
+.. code:: shell
+
+    docker run --rm registry.gitlab.com/datadrivendiscovery/images/primitives:ubuntu-bionic-python36-v2020.1.9 \
+        python3 -m d3m index describe d3m.primitives.data_transformation.dataset_to_dataframe.Common | jq .source.uris
+
code:: shell + + docker run \ + --rm \ + -v /home/foo/d3m:/mnt/d3m \ + registry.gitlab.com/datadrivendiscovery/images/primitives:ubuntu-bionic-python36-v2019.11.10 \ + ls /mnt/d3m + +If you're reading this tutorial from a text editor, it might be a good +idea at this point to find and replace ``/home/foo/d3m`` with the actual +path in your system where the ``datasets``, ``pipeline-outputs``, and +``primitives`` directories are all located. This will make it easier for +you to just copy and paste the commands from here on out, instead of +changing the faux path every time. + +.. _running-example-pipeline: + +Running an example pipeline +--------------------------- + +At this point, let's try running a pipeline. Again, we're going to run +the example pipeline that comes with +``d3m.primitives.classification.logistic_regression.SKlearn``. There are +two ways to run a pipeline: by specifying all the necessary paths of the +dataset, or by specifying and using a pipeline run file. Let's +make sure first though that the dataset is available, as described in the +next subsection. + +.. _preparing-dataset: + +Preparing the dataset +~~~~~~~~~~~~~~~~~~~~~ + +Towards the end of the previous section, you were asked to git clone the +``datasets`` repo to your machine. Most likely, you might have +accomplished that like this: + +.. code:: shell + + git clone https://datasets.datadrivendiscovery.org/d3m/datasets.git + +But unless you had `git LFS `__ +installed, the entire contents of the repo might not have been really +installed. + +The repo is organized such that all files larger than 100 +KB is stored in git LFS. Thus, if you cloned without git LFS installed, you +most likely have to do a one-time extra step before you can use a dataset, as +some files of that dataset that are over 100 KB will not have the actual +data in them (although they will still exist as files in the cloned +repo). This is true even for the dataset that we will use in this +exercise, ``185_baseball``. To verify this, open this file in a text +editor:: + + datasets/training_datasets/seed_datasets_archive/185_baseball/185_baseball_dataset/tables/learningData.csv + +Then, see if it contains text similar to this:: + + version https://git-lfs.github.com/spec/v1 + oid sha256:931943cc4a675ee3f46be945becb47f53e4297ec3e470c4e3e1f1db66ad3b8d6 + size 131187 + +If it does, then this dataset has not yet been fully downloaded from git +LFS (but if it looks like a normal CSV file, then you can skip the rest +of this subsection and move on). To download this dataset, simply run +this command inside the ``datasets`` directory: + +.. code:: shell + + git lfs pull -I training_datasets/seed_datasets_archive/185_baseball/ + +Inspect the file again, and you should see that it looks like a normal +CSV file now. + +In general, if you don't know which specific dataset does a certain +example pipeline in the ``primitives`` repo uses, inspect the pipeline +run output file of that primitive (whose file path is similar to that of +the pipeline JSON file, as described in the :ref:`overview-of-primitives-and-pipelines` section, but +instead of going to ``pipelines``, go to ``pipeline_runs``). The +pipeline run is initially gzipped in the ``primitives`` repo, so +decompress it first. Then open up the actual .yml file, look at +``datasets``, and under it should be ``id``. 
If you do that for the +example pipeline run of the SKlearn logistic regression primitive +that we're looking at for this exercise, you'll find that the dataset id +is ``185_baseball_dataset``. The name of the main dataset directory is this string, +without the ``_dataset`` part. + +Now, let's actually run the pipeline using the two ways mentioned +earlier. + +Specifying all the necessary paths of a dataset +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +You can use this if there is no existing pipeline run yet for a +pipeline, or if you want to manually specify the dataset path (set the +paths for ``--problem``, ``--input``, ``--test-input``, ``--score-input``, ``--pipeline`` to your target dataset +location). + +Remember to change the bind mount paths as appropriate for your system +(specified by ``-v``). + +.. code:: shell + + docker run \ + --rm \ + -v /home/foo/d3m:/mnt/d3m \ + registry.gitlab.com/datadrivendiscovery/images/primitives:ubuntu-bionic-python36-v2020.1.9 \ + python3 -m d3m \ + runtime \ + fit-score \ + --problem /mnt/d3m/datasets/training_datasets/seed_datasets_archive/185_baseball/185_baseball_problem/problemDoc.json \ + --input /mnt/d3m/datasets/training_datasets/seed_datasets_archive/185_baseball/TRAIN/dataset_TRAIN/datasetDoc.json \ + --test-input /mnt/d3m/datasets/training_datasets/seed_datasets_archive/185_baseball/TEST/dataset_TEST/datasetDoc.json \ + --score-input /mnt/d3m/datasets/training_datasets/seed_datasets_archive/185_baseball/SCORE/dataset_TEST/datasetDoc.json \ + --pipeline /mnt/d3m/primitives/v2020.1.9/JPL/d3m.primitives.classification.logistic_regression.SKlearn/2019.11.13/pipelines/862df0a2-2f87-450d-a6bd-24e9269a8ba6.json \ + --output /mnt/d3m/pipeline-outputs/predictions.csv \ + --output-run /mnt/d3m/pipeline-outputs/run.yml + +The score is displayed after the pipeline run. The output predictions +will be stored on the path specified by ``--output``, and information about +the pipeline run is stored in the path specified by ``--output-run``. + +Again, you can use the ``-h`` flag on ``fit-score`` to access the help +string and read about the different arguments, as described earlier. + +If you get a python error that complains about missing columns, or +something that looks like this:: + + ValueError: Mismatch between column name in data 'version https://git-lfs.github.com/spec/v1' and column name in metadata 'd3mIndex'. + +Chances are that the ``185_baseball`` dataset has not yet been +downloaded through git LFS. See the :ref:`previous subsection +` for details on how to verify and do this. + +Using a pipeline run file +~~~~~~~~~~~~~~~~~~~~~~~~~ + +Instead of specifying all the specific dataset paths, you can also use +an existing pipeline run to essentially "re-run" a previous run +of the pipeline: + +.. 
code:: shell + + docker run \ + --rm \ + -v /home/foo/d3m:/mnt/d3m \ + registry.gitlab.com/datadrivendiscovery/images/primitives:ubuntu-bionic-python36-v2020.1.9 \ + python3 -m d3m \ + --pipelines-path /mnt/d3m/primitives/v2020.1.9/JPL/d3m.primitives.classification.logistic_regression.SKlearn/2019.11.13/pipelines \ + runtime \ + --datasets /mnt/d3m/datasets \ + fit-score \ + --input-run /mnt/d3m/primitives/v2020.1.9/JPL/d3m.primitives.classification.logistic_regression.SKlearn/2019.11.13/pipeline_runs/pipeline_run.yml.gz \ + --output /mnt/d3m/pipeline-outputs/predictions.csv \ + --output-run /mnt/d3m/pipeline-outputs/run.yml + +In this case, ``--input-run`` is the pipeline run file that this pipeline +will re-run, and ``---output-run`` is the new pipeline run file that will be +generated. + +Note that if you choose ``fit-score`` for the d3m runtime option, the +pipeline actually runs in two phases: fit, and produce. You can verify +this by searching for ``phase`` in the pipeline run file. + +Lastly, if you want to run multiple commands in the docker container, +simply chain your commands with ``&&`` and wrap them double quotes +(``"``) for ``bash -c``. As an example: + +.. code:: shell + + docker run \ + --rm \ + -v /home/foo/d3m:/mnt/d3m \ + registry.gitlab.com/datadrivendiscovery/images/primitives:ubuntu-bionic-python36-v2020.1.9 \ + /bin/bash -c \ + "python3 -m d3m \ + --pipelines-path /mnt/d3m/primitives/v2020.1.9/JPL/d3m.primitives.classification.logistic_regression.SKlearn/2019.11.13/pipelines \ + runtime \ + --datasets /mnt/d3m/datasets/training_datasets/seed_datasets_archive/185_baseball \ + fit-score \ + --input-run /mnt/d3m/primitives/v2020.1.9/JPL/d3m.primitives.classification.logistic_regression.SKlearn/2019.11.13/pipeline_runs/pipeline_run.yml \ + --output /mnt/d3m/pipeline-outputs/predictions.csv \ + --output-run /mnt/d3m/pipeline-outputs/run.yml && \ + head /mnt/d3m/pipeline-outputs/predictions.csv" + +Writing a new primitive +----------------------- + +Let's now try to write a very simple new primitive - one that simply +passes whatever input data it receives from the previous step to the +next step in the pipeline. Let's call this primitive "Passthrough". + +We will use this `skeleton primitive repo +`__ +as a starting point +for this exercise. A d3m primitive repo does not have to follow the +exact same directory structure as this, but this is a good structure to +start with, at least. git clone the repo into ``docs-quickstart`` at the same place +where the other repos that we have used earlier are located +(``datasets``, ``pipeline-outputs``, ``primitives``). + +Alternatively, you can also use the `test primitives +`__ +as a model/starting point. ``test_primitives/null.py`` is essentially +the same primitive that we are trying to write. + +.. _primitive-source-code: + +Primitive source code +~~~~~~~~~~~~~~~~~~~~~ + +In the ``docs-quickstart`` directory, open +``quickstart_primitives/sample_primitive1/input_to_output.py``. The first +important thing to change here is the primitive metadata, which are the +first objects defined under the ``InputToOutputPrimitive`` class. Modify the +following fields (unless otherwise noted, the values you put in must be +strings): + +- ``id``: The primitive's UUID v4 number/identifier. To generate one, + you can run simply run this simple inline Python command: + + .. code:: shell + + python3 -c "import uuid; print(uuid.uuid4())" + +- ``version``: You can use semantic versioning for this or another style + of versioning. 
Write ``"0.1.0"`` for this exercise. You should bump + the version of the primitive at least every time public interfaces + of the primitive change (e.g. hyper-parameters). + +- ``name``: The primitive's name. Write ``"Passthrough primitive"`` for + this exercise. + +- ``description``: A short description of the primitive. Write ``"A + primitive which directly outputs the input."`` for this exercise. + +- ``python_path``: This follows this format:: + + d3m.primitives... + + Primitive families can be found in the `d3m metadata page + `__ + (wait a few seconds for the page to load completely), and primitive + names can be found in the `d3m core package source code + `__. + The last segment can be used to attribute the primitive to the author and/or + describe in which way it is different from other primitives with same + primitive family and primitive name, e.g., a different implementation with different + trade-offs. + + For this exercise, write + ``"d3m.primitives.operator.input_to_output.Quickstart"``. Note that + ``input_to_output`` is not currently registered as a standard primitive name + and using it will produce a warning. For primitives you intent on publishing + make a merge request to the d3m core package to add any primitive names + you need. + +- ``primitive_family``: This must be the same as used for ``python_path``, + as enumeration value. You can use a string or Python enumeration value. + Add this import statement (if not there already): + + .. code:: python + + from d3m.metadata import base as metadata_base + + Then write ``metadata_base.PrimitiveFamily.OPERATOR`` (as + a value, not a string, so do not put quotation marks) as the value of + this field. + +- ``algorithm_types``: Algorithm type(s) that the primitive implements. + This can be multiple values in an array. Values can be chosen from + the `d3m metadata page + `__ + as well. + Write ``[metadata_base.PrimitiveAlgorithmType.IDENTITY_FUNCTION]`` + here for this exercise (as a list that contains one element, not a + string). + +- ``source``: General info about the author of this primitive. ``name`` + is usually the name of the person or the team that wrote this + primitive. ``contact`` is a ``mailto`` URI to the email address of + whoever one should contact about this primitive. ``uris`` are usually + the git clone URL of the repo, and you can also add the URL of the + source file of this primitive. + + Write these for the exercise: + + .. code:: python + + "name": "My Name", + "contact": "mailto:myname@example.com", + "uris": ["https://gitlab.com/datadrivendiscovery/docs-quickstart.git"], + +- ``keywords``: Key words for what this primitive is or does. Write + ``["passthrough"]``. + +- ``installation``: Information about how to install this primitive. Add + these import statements first: + + .. code:: python + + import os.path + from d3m import utils + + Then replace the ``installation`` entry with this: + + .. code:: python + + "installation": [{ + "type": metadata_base.PrimitiveInstallationType.PIP, + "package_uri": "git+https://gitlab.com/datadrivendiscovery/docs-quickstart@{git_commit}#egg=quickstart_primitives".format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)) + ), + }], + + In general, for your own actual primitives, you might only need to + substitute the git repo URL here as well as the python egg name. + +Next, let's take a look at the ``produce`` method. You can see that it +simply makes a new dataframe out of the input data, and returns it as +the output. 
To see for ourselves though that our primitive (and thus +this ``produce`` method) gets called during the pipeline run, let's add +a log statement here. The ``produce`` method should now look something +like this: + +.. code:: python + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + self.logger.warning('Hi, InputToOutputPrimitive.produce was called!') + return base.CallResult(value=inputs) + +Note that this is simply an example primitive that is intentionally +simple for the purposes of this tutorial. It does not necessarily model +a well-written primitive, by any means. For guidelines on how to write a +good primitive, take a look at the :ref:`primitive-good-citizen`. + +setup.py +~~~~~~~~ + +Next, we fill in the necessary information in ``setup.py`` so that +``pip`` can correctly install our primitive in our local d3m +environment. Open ``setup.py`` (in the project root), and modify the +following fields: + +- ``name``: Same as the egg name you used in ``package_uri`` + +- ``version``: Same as the primitive metadata's ``version`` + +- ``description``: Same as the primitive metadata's ``description``, + or a description of all primitives if there are multiple primitives + in the package you are making + +- ``author``: Same as the primitive metadata's ``suorce.name`` + +- ``url``: Same as main URL in the primitive metadata's + ``source.uris`` + +- ``packages``: This is an array of the python packages that this + primitive repo contains. You can use the ``find_packages`` helper: + + .. code:: python + + packages=find_packages(exclude=['pipelines']), + +- ``keywords``: A list of keywords. Important standard keyword is + ``d3m_primitive`` which makes all primitives discoverable on PyPi + +- ``install_requires``: This is an array of the python package + dependencies of the primitives contained in this repo. Our primitive + needs nothing except the d3m core package (and the + ``common-primitives`` package too for testing, but this is not a + package dependency), so write this as the value of this field: + ``['d3m']`` + +- ``entry_points``: This is how the d3m runtime maps your primitives' + d3m python paths to the your repo's local python paths. For this + exercise, it should look like this: + + .. code:: python + + entry_points={ + 'd3m.primitives': [ + 'operator.input_to_output.Quickstart = quickstart_primitives.sample_primitive1:InputToOutputPrimitive', + ], + } + +That's it for this file. Briefly review it for any possible syntax +errors. + +Primitive unit tests +~~~~~~~~~~~~~~~~~~~~ + +Let's now make a python test for this primitive, which in this case will +just assert whether the input dataframe to the primitive equals the +output dataframe. Make a new file called ``test_input_to_output.py`` +inside ``quickstart_primitives/sample_primitive1`` (the same directory as +``input_to_output.py``), and write this as its contents: + +.. 
code:: python
+
+    import unittest
+    import os
+
+    from d3m import container
+    from common_primitives import dataset_to_dataframe
+    from input_to_output import InputToOutputPrimitive
+
+
+    class InputToOutputTestCase(unittest.TestCase):
+        def test_output_equals_input(self):
+            dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..', 'tests-data', 'datasets', 'timeseries_dataset_1', 'datasetDoc.json'))
+
+            dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path))
+
+            dataframe_hyperparams_class = dataset_to_dataframe.DatasetToDataFramePrimitive.metadata.get_hyperparams()
+            dataframe_primitive = dataset_to_dataframe.DatasetToDataFramePrimitive(hyperparams=dataframe_hyperparams_class.defaults())
+            dataframe = dataframe_primitive.produce(inputs=dataset).value
+
+            i2o_hyperparams_class = InputToOutputPrimitive.metadata.get_hyperparams()
+            i2o_primitive = InputToOutputPrimitive(hyperparams=i2o_hyperparams_class.defaults())
+            output = i2o_primitive.produce(inputs=dataframe).value
+
+            self.assertTrue(output.equals(dataframe))
+
+
+    if __name__ == '__main__':
+        unittest.main()
+
+For the dataset that this test uses, add the `d3m tests-data
+`__ repository as a git submodule at the root of the
+``docs-quickstart`` repository. Then install this new primitive into the
+Docker image's d3m environment, and run the test using the command below:
+
+.. code:: shell
+
+    docker run \
+        --rm \
+        -v /home/foo/d3m:/mnt/d3m \
+        registry.gitlab.com/datadrivendiscovery/images/primitives:ubuntu-bionic-python36-v2020.1.9 \
+        /bin/bash -c \
+        "pip3 install -e /mnt/d3m/docs-quickstart && \
+        cd /mnt/d3m/docs-quickstart/quickstart_primitives/sample_primitive1 && \
+        python3 test_input_to_output.py"
+
+You should see a log statement like this, as well as the python unittest
+pass message::
+
+    Hi, InputToOutputPrimitive.produce was called!
+    .
+    ----------------------------------------------------------------------
+    Ran 1 test in 0.011s
+
+Using this primitive in a pipeline
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Having seen the primitive test pass, we can now confidently include this
+primitive in a pipeline. Let's take the same pipeline that we ran :ref:`before `
+(the SKlearn logistic regression example pipeline),
+and add a step using this primitive.
+
+In the root directory of your repository, create these directories:
+``pipelines/operator.input_to_output.Quickstart``. Then, from the d3m
+``primitives`` repo, copy the JSON pipeline description file from
+``primitives/v2020.1.9/JPL/d3m.primitives.classification.logistic_regression.SKlearn/2019.11.13/pipelines``
+into the directory we just created. Open this file, and replace the
+``id`` (generate another UUID v4 number using the inline python command
+earlier, different from the primitive ``id``), as well as the created
+timestamp using this inline python command (add ``Z`` at the end of the
+generated timestamp)::
+
+    python3 -c "import time; import datetime; \
+    print(datetime.datetime.fromtimestamp(time.time()).isoformat())"
+
+You can also rename the JSON file using the new pipeline ``id``.
+
+Next, change the output step number (shown below, ``"steps.4.produce"``)
+to be one more than the current number (at the time of this writing, it
+is ``4``, so in this case, change it to ``5``):
+
+.. 
code:: json + + "outputs": [ + { + "data": "steps.5.produce", + "name": "output predictions" + } + ], + +Then, find the step that contains the +``d3m.primitives.classification.logistic_regression.SKlearn`` primitive +(search for this string in the file), and right above it, add the +following JSON object. Remember to change ``primitive.id`` to the +primitive's id that you generated in the earlier :ref:`primitive-source-code` subsection. + +.. code:: json + + { + "type": "PRIMITIVE", + "primitive": { + "id": "30d5f2fa-4394-4e46-9857-2029ec9ed0e0", + "version": "0.1.0", + "python_path": "d3m.primitives.operator.input_to_output.Quickstart", + "name": "Passthrough primitive" + }, + "arguments": { + "inputs": { + "type": "CONTAINER", + "data": "steps.2.produce" + } + }, + "outputs": [ + { + "id": "produce" + } + ] + }, + +Make sure that the step number (``"steps.N.produce"``) in +``arguments.inputs.data`` is correct (one greater than the previous step +and one less than the next step). Do this as well for the succeeding +steps, with the following caveats: + +- For ``d3m.primitives.classification.logistic_regression.SKlearn``, + increment the step number both for ``arguments.inputs.data`` and + ``arguments.outputs.data`` (at the time of this writing, the number + should be changed to ``3``). +- For + ``d3m.primitives.data_transformation.construct_predictions.Common``, + increment the step number for ``arguments.inputs.data`` (at the time + of this writing, the number should be changed to ``4``), but do not + change the one for ``arguments.reference.data`` (the value should + stay as ``"steps.0.produce"``) + +Generally, you can also programmatically generate a pipeline, as +described in the :ref:`pipeline-description-example`. + +Now we can finally run this pipeline that uses our new primitive. In the +command below, modify the pipeline JSON filename in the ``-p`` argument +to match the filename of your pipeline file (if you changed it to the +new pipeline id that you generated). + +.. code:: shell + + docker run \ + --rm \ + -v /home/foo/d3m:/mnt/d3m \ + registry.gitlab.com/datadrivendiscovery/images/primitives:ubuntu-bionic-python36-v2020.1.9 \ + /bin/bash -c \ + "pip3 install -e /mnt/d3m/docs-quickstart && \ + python3 -m d3m \ + runtime \ + fit-score \ + --problem /mnt/d3m/datasets/training_datasets/seed_datasets_archive/185_baseball/185_baseball_problem/problemDoc.json \ + --input /mnt/d3m/datasets/training_datasets/seed_datasets_archive/185_baseball/TRAIN/dataset_TRAIN/datasetDoc.json \ + --test-input /mnt/d3m/datasets/training_datasets/seed_datasets_archive/185_baseball/TEST/dataset_TEST/datasetDoc.json \ + --score-input /mnt/d3m/datasets/training_datasets/seed_datasets_archive/185_baseball/SCORE/dataset_TEST/datasetDoc.json \ + --pipeline /mnt/d3m/docs-quickstart/pipelines/operator.input_to_output.Quickstart/0f290525-3fec-44f7-ab93-bd778747b91e.json \ + --output /mnt/d3m/pipeline-outputs/predictions_new.csv \ + --output-run /mnt/d3m/pipeline-outputs/run_new.yml" + +In the output, you should see the log statement as a warning, +before the score is shown (similar to the text below):: + + ... + WARNING:d3m.primitives.operator.input_to_output.Quickstart:Hi, InputToOutputPrimitive.produce was called! + ... 
+    metric,value,normalized,randomSeed
+    F1_MACRO,0.31696136214800263,0.31696136214800263,0
+
+Verify that the old and new ``predictions.csv`` in ``pipeline-outputs``
+are the same (you can use ``diff``), as well as the scores in the old
+and new ``run.yml`` files (search for ``scores`` in the files).
+
+Beyond this tutorial
+--------------------
+
+Congratulations! You just built your own primitive and you were able to
+use it in a d3m pipeline!
+
+Normally, when you build your own primitives, you would proceed to
+validating the primitives to be included in the d3m index of all known
+primitives. See the `primitives repo README
+`__ for details on how to do this.
diff --git a/d3m/docs/reference.rst b/d3m/docs/reference.rst
new file mode 100644
index 0000000..2fcfdb4
--- /dev/null
+++ b/d3m/docs/reference.rst
@@ -0,0 +1,9 @@
+.. _api-reference:
+
+API reference
+-------------
+
+.. toctree::
+   :maxdepth: 2
+
+   d3m
diff --git a/d3m/docs/repostructure.rst b/d3m/docs/repostructure.rst
new file mode 100644
index 0000000..f4efc61
--- /dev/null
+++ b/d3m/docs/repostructure.rst
@@ -0,0 +1,17 @@
+:orphan:
+
+.. _repostructure:
+
+Repository structure
+--------------------
+
+The ``master`` branch contains the latest stable release of the package.
+The ``devel`` branch is a staging branch for the next release.
+
+Releases are
+`tagged `__.
+
+Contributing
+~~~~~~~~~~~~~
+
+See the repo's `CODE_STYLE.md `__ document for our coding style and contribution guide. Please ensure any merge requests you open follow this guide.
diff --git a/d3m/docs/tutorial.rst b/d3m/docs/tutorial.rst
new file mode 100644
index 0000000..4f3993b
--- /dev/null
+++ b/d3m/docs/tutorial.rst
@@ -0,0 +1,493 @@
+Advanced Tutorial
+=================
+
+This tutorial assumes the reader is familiar with the d3m ecosystem in general.
+If not, please refer to other sections of the `documentation`_ first, e.g.,
+:ref:`quickstart`.
+
+.. _documentation: https://docs.datadrivendiscovery.org
+
+Overview of building a primitive
+--------------------------------
+
+1. :ref:`Recognize the base class of a primitive `.
+
+2. :ref:`Identify the input and output container types `.
+
+3. :ref:`Define metadata for each primitive `.
+
+4. :ref:`Write a unit test to verify the primitive functions `.
+
+5. :ref:`Generate the primitive annotation for the primitive `.
+
+6. :ref:`Write a pipeline demonstrating the primitive's functionality `.
+
+7. :ref:`Advanced: Primitives might use static files `.
+
+.. _primitive-class:
+
+Primitive class
+---------------
+
+There are a variety of :py:mod:`primitive interfaces/classes ` available. As an example,
+for a primitive doing just attribute extraction without requiring any fitting, a :py:class:`~d3m.primitive_interfaces.transformer.TransformerPrimitiveBase`
+from the :py:mod:`~d3m.primitive_interfaces.transformer` module can be used.
+
+Each primitive can have its own :py:mod:`hyper-parameters `. Some example hyper-parameter types one can use to describe
+a primitive's hyper-parameters are: :py:class:`~d3m.metadata.hyperparams.Constant`, :py:class:`~d3m.metadata.hyperparams.UniformBool`,
+:py:class:`~d3m.metadata.hyperparams.UniformInt`, :py:class:`~d3m.metadata.hyperparams.Choice`, :py:class:`~d3m.metadata.hyperparams.List`.
+ +Also, each hyper-parameter should be defined as one or more of the four :ref:`hyper-parameter semantic types `: + +* `https://metadata.datadrivendiscovery.org/types/TuningParameter `__ +* `https://metadata.datadrivendiscovery.org/types/ControlParameter `__ +* `https://metadata.datadrivendiscovery.org/types/ResourcesUseParameter `__ +* `https://metadata.datadrivendiscovery.org/types/MetafeatureParameter `__ + +Example +~~~~~~~ + +.. code:: python + + from d3m.primitive_interfaces import base, transformer + from d3m.metadata import base as metadata_base, hyperparams + + __all__ = ('ExampleTransformPrimitive',) + + + class Hyperparams(hyperparams.Hyperparams): + learning_rate = hyperparams.Uniform(lower=0.0, upper=1.0, default=0.001, semantic_types=[ + 'https://metadata.datadrivendiscovery.org/types/TuningParameter', + ]) + clusters = hyperparams.UniformInt(lower=1, upper=100, default=10, semantic_types=[ + 'https://metadata.datadrivendiscovery.org/types/TuningParameter', + ]) + + + class ExampleTransformPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): + """ + The docstring is very important and must to be included. It should contain + relevant information about the hyper-parameters, primitive functionality, etc. + """ + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + pass + +.. _input-output-types: + +Input/Output types +------------------ + +The acceptable inputs/outputs of a primitive must be pre-defined. D3M supports a variety of +standard input/output :ref:`container types ` such as: + +- ``pandas.DataFrame`` (as :py:class:`d3m.container.pandas.DataFrame`) + +- ``numpy.ndarray`` (as :py:class:`d3m.container.numpy.ndarray`) + +- ``list`` (as :py:class:`d3m.container.list.List`) + +.. note:: + Even thought D3M container types behave mostly as standard types, the D3M container types must be used for inputs/outputs, because D3M container types support D3M metadata. + +Example +~~~~~~~ + +.. code:: python + + from d3m import container + + Inputs = container.DataFrame + Outputs = container.DataFrame + + + class ExampleTransformPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): + ... + +.. note:: + When returning the output DataFrame, its metadata should be updated with the correct semantic and structural types. + +Example +~~~~~~~ + +.. code:: python + + # Update metadata for each DataFrame column. + for column_index in range(outputs.shape[1]): + column_metadata = {} + column_metadata['structural_type'] = type(1.0) + column_metadata['name'] = "column {i}".format(i=column_index) + column_metadata["semantic_types"] = ("http://schema.org/Float", "https://metadata.datadrivendiscovery.org/types/Attribute",) + outputs.metadata = outputs.metadata.update((metadata_base.ALL_ELEMENTS, column_index), column_metadata) + +.. _tutorial-primitive-metadata: + +Primitive Metadata +------------------ + +It is very crucial to define :ref:`primitive metadata ` for the primitive properly. +Primitive metadata can be used by TA2 systems to metalearn about primitives and in general decide which primitive to use when. + +Example +~~~~~~~ + +.. code:: python + + from d3m.primitive_interfaces import base, transformer + from d3m.metadata import base as metadata_base, hyperparams + + __all__ = ('ExampleTransformPrimitive',) + + class ExampleTransformPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): + """ + Docstring. 
+ """ + + metadata = metadata_base.PrimitiveMetadata({ + 'id': , + 'version': , + 'name': , + 'python_path': 'd3m.primitives.<>.<>.<>' # Must match path in setup.py, + 'source': { + 'name': , + 'uris': [], + 'contact': 'mailto:' + }, + 'installation': [{ + 'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+@{git_commit}#egg='.format( + git_commit=d3m_utils.current_git_commit(os.path.dirname(__file__)), + ), + }], + 'algorithm_types': [ + # Check https://metadata.datadrivendiscovery.org/devel/?definitions#definitions.algorithm_types for all available algorithm types. + # If algorithm type s not available a Merge Request should be made to add it to core package. + metadata_base.PrimitiveAlgorithmType., + ], + # Check https://metadata.datadrivendiscovery.org/devel/?definitions#definitions.primitive_family for all available primitive family types. + # If primitive family is not available a Merge Request should be made to add it to core package. + 'primitive_family': metadata_base.PrimitiveFamily. + }) + + ... + +.. _unit-tests: + +Unit tests +---------- + +Once the primitives are constructed, unit testing must be done to see if the +primitive works as intended. + +**Sample Setup** + +.. code:: python + + import os + import unittest + + from d3m.container import dataset + from d3m.metadata import base as metadata_base + from common_primitives import dataset_to_dataframe + + from example_primitive import ExampleTransformPrimitive + + + class ExampleTransformTest(unittest.TestCase): + def test_happy_path(): + # Load a dataset. + # Datasets can be obtained from: https://datasets.datadrivendiscovery.org/d3m/datasets + base_path = '../datasets/training_datasets/seed_datasets_archive/' + dataset_doc_path = os.path.join(base_path, '38_sick_dataset', 'datasetDoc.json') + dataset = dataset.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path)) + + dataframe_hyperparams_class = dataset_to_dataframe.DatasetToDataFramePrimitive.metadata.get_hyperparams() + dataframe_primitive = dataset_to_dataframe.DatasetToDataFramePrimitive(hyperparams=dataframe_hyperparams_class.defaults()) + dataframe = dataframe_primitive.produce(inputs=dataset).value + + # Call example transformer. + hyperparams_class = SampleTransform.metadata.get_hyperparams() + primitive = SampleTransform(hyperparams=hyperparams_class.defaults()) + test_out = primitive.produce(inputs=dataframe).value + + # Write assertions to make sure that the output (type, shape, metadata) is what is expected. + self.assertEqual(...) + + ... + + + if __name__ == '__main__': + unittest.main() + +It is recommended to do the testing inside the D3M Docker container: + +.. code:: shell + + docker run --rm -v /home/foo/d3m:/mnt/d3m -it \ + registry.gitlab.com/datadrivendiscovery/images/primitives:ubuntu-bionic-python36-v2020.1.9 + cd /mnt/d3m/example_primitive + python3 primitive_name_test.py + +.. _primitive-annotation: + +Primitive annotation +-------------------- + +Once primitive is constructed and unit testing is successful, the +final step in building a primitive is to generate the primitive annotation +which will be indexed and used by D3M. + +.. code:: shell + + docker run --rm -v /home/foo/d3m:/mnt/d3m -it \ + registry.gitlab.com/datadrivendiscovery/images/primitives:ubuntu-bionic-python36-v2020.1.9 + cd /mnt/d3m/example_primitive + pip3 install -e . + python3 -m d3m index describe -i 4 + +Alternatively, a `helper script `__ +can be used to generate primitive annotations as well. 
+This can be more convenient when having to manage multiple primitives. +In this case, generating the primitive annotation is done as follows: + +.. code:: shell + + docker run --rm -v /home/foo/d3m:/mnt/d3m -it \ + registry.gitlab.com/datadrivendiscovery/images/primitives:ubuntu-bionic-python36-v2020.1.9 + cd /mnt/d3m/example_primitive + pip3 install -e . + python3 generate-primitive-json.py ... + +.. _example-pipeline: + +Example pipeline +---------------- + +After building custom primitives, it has to be used in an example pipeline and run using one of +D3M seed datasets in order to be integrated with other indexed D3M primitives. + +The essential elements of pipelines are: + +``Dataset Denormalizer -> Dataset Parser -> Data Cleaner (If necessary) -> Feature Extraction -> Classifier/Regressor -> Output`` + +An example code of building pipeline is shown below: + +.. code:: python + + # D3M dependencies + from d3m import index + from d3m.metadata.base import ArgumentType + from d3m.metadata.pipeline import Pipeline, PrimitiveStep + + # Common Primitives + from common_primitives.column_parser import ColumnParserPrimitive + from common_primitives.dataset_to_dataframe import DatasetToDataFramePrimitive + from common_primitives.extract_columns_semantic_types import ExtractColumnsBySemanticTypesPrimitive + + # Testing primitive + from quickstart_primitives.sample_primitive1.input_to_output import InputToOutputPrimitive + + # Pipeline + pipeline = Pipeline() + pipeline.add_input(name='inputs') + + # Step 0: DatasetToDataFrame (Dataset Denormalizer) + step_0 = PrimitiveStep(primitive_description=DatasetToDataFramePrimitive.metadata.query()) + step_0.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='inputs.0') + step_0.add_output('produce') + pipeline.add_step(step_0) + + # Step 1: Custom primitive + step_1 = PrimitiveStep(primitive=InputToOutputPrimitive) + step_1.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce') + step_1.add_output('produce') + pipeline.add_step(step_1) + + # Step 2: Column Parser (Dataset Parser) + step_2 = PrimitiveStep(primitive_description=ColumnParserPrimitive.metadata.query()) + step_2.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce') + step_2.add_output('produce') + pipeline.add_step(step_2) + + # Step 3: Extract Attributes (Feature Extraction) + step_3 = PrimitiveStep(primitive_description=ExtractColumnsBySemanticTypesPrimitive.metadata.query()) + step_3.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.2.produce') + step_3.add_output('produce') + step_3.add_hyperparameter(name='semantic_types', argument_type=ArgumentType.VALUE, data=['https://metadata.datadrivendiscovery.org/types/Attribute'] ) + pipeline.add_step(step_3) + + # Step 4: Extract Targets (Feature Extraction) + step_4 = PrimitiveStep(primitive_description=ExtractColumnsBySemanticTypesPrimitive.metadata.query()) + step_4.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce') + step_4.add_output('produce') + step_4.add_hyperparameter(name='semantic_types', argument_type=ArgumentType.VALUE, data=['https://metadata.datadrivendiscovery.org/types/TrueTarget'] ) + pipeline.add_step(step_4) + + attributes = 'steps.3.produce' + targets = 'steps.4.produce' + + # Step 6: Imputer (Data Cleaner) + step_5 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_cleaning.imputer.SKlearn')) + 
step_5.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference=attributes) + step_5.add_output('produce') + pipeline.add_step(step_5) + + # Step 7: Classifier + step_6 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.classification.decision_tree.SKlearn')) + step_6.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.5.produce') + step_6.add_argument(name='outputs', argument_type=ArgumentType.CONTAINER, data_reference=targets) + step_6.add_output('produce') + pipeline.add_step(step_6) + + # Final Output + pipeline.add_output(name='output predictions', data_reference='steps.6.produce') + + # print(pipeline.to_json()) + with open('./pipeline.json', 'w') as write_file: + write_file.write(pipeline.to_json(indent=4, sort_keys=False, ensure_ascii=False)) + +Once pipeline is constructed and the pipeline's JSON file is generated, the pipeline is run using +``python3 -m d3m runtime`` command. +Successfully running the pipeline validates that the primitive is working as intended. + +.. code:: shell + + docker run --rm -v /home/foo/d3m:/mnt/d3m -it \ + registry.gitlab.com/datadrivendiscovery/images/primitives:ubuntu-bionic-python36-v2020.1.9 \ + /bin/bash -c "cd /mnt/d3m; \ + pip3 install -e .; \ + cd pipelines; \ + python3 -m d3m runtime fit-produce \ + --pipeline pipeline.json \ + --problem /datasets/seed_datasets_current/38_sick/TRAIN/problem_TRAIN/problemDoc.json \ + --input /datasets/seed_datasets_current/38_sick/TRAIN/dataset_TRAIN/datasetDoc.json \ + --test-input /datasets/seed_datasets_current/38_sick/TEST/dataset_TEST/datasetDoc.json \ + --output 38_sick_results.csv \ + --output-run pipeline_run.yml; \ + exit" + +.. _static-files: + +Advanced: Primitive with static files +------------------------------------- + +When building primitives that uses external/static files i.e. pre-trained weights, the +metadata for the primitive must be properly define such dependency. +The static file can be hosted anywhere based on your preference, as long as the URL to the file is a direct download link. It must +be public so that users of your primitive can access the file. Be sure to keep the URL available, as +the older version of the primitive could potentially start failing if URL stops resolving. + +.. note:: + Full code of this section can be found in the `quickstart repository `__. + +Below is a description of primitive metadata definition required, named ``_weights_configs`` for +each static file. + +.. code:: python + + _weights_configs = [{ + 'type': 'FILE', + 'key': '', + 'file_uri': '', + 'file_digest':'sha256sum of the ', + }] + + +This ``_weights_configs`` should be directly added to the ``INSTALLATION`` field of the primitive metadata. + +.. code:: python + + from d3m.primitive_interfaces import base, transformer + from d3m.metadata import base as metadata_base, hyperparams + + __all__ = ('ExampleTransform',) + + class ExampleTransform(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): + """ + Docstring. + """ + + _weights_configs = [{ + 'type': 'FILE', + 'key': '', + 'file_uri': '', + 'file_digest':'sha256sum of the ', + }] + + metadata = ... + 'installation': [{ + 'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+@{git_commit}#egg='.format( + git_commit=d3m_utils.current_git_commit(os.path.dirname(__file__)), + ), + }] + _weights_configs, + ... + + ... + +After the primitive metadata definition, it is important to include code to return the path of files. 
+An example is given as follows:
+
+.. code:: python
+
+    def _find_weights_path(self, key_filename):
+        if key_filename in self.volumes:
+            weight_file_path = self.volumes[key_filename]
+        else:
+            weight_file_path = os.path.join('.', self._weights_configs[0]['file_digest'], key_filename)
+
+        if not os.path.isfile(weight_file_path):
+            raise ValueError(
+                "Can't get weights file from volumes by key '{key_filename}' and at path '{path}'.".format(
+                    key_filename=key_filename,
+                    path=weight_file_path,
+                ),
+            )
+
+        return weight_file_path
+
+In this example code, the ``_find_weights_path`` method tries to find the static file in ``self.volumes`` based on the weight file key.
+If it cannot be found there (e.g., the runtime was not provided with static files), it looks into the current directory.
+The latter fallback is useful during development.
+
+To run a pipeline with such a primitive, you have to download the static files and provide them to the runtime:
+
+.. code:: shell
+
+    docker run --rm -v /home/foo/d3m:/mnt/d3m -it \
+        registry.gitlab.com/datadrivendiscovery/images/primitives:ubuntu-bionic-python36-v2020.1.9 \
+        /bin/bash -c "cd /mnt/d3m; \
+        pip3 install -e .; \
+        cd pipelines; \
+        mkdir /static; \
+        python3 -m d3m index download -p d3m.primitives.path.of.Primitive -o /static; \
+        python3 -m d3m runtime --volumes /static fit-produce \
+        --pipeline feature_pipeline.json \
+        --problem /datasets/seed_datasets_current/22_handgeometry/TRAIN/problem_TRAIN/problemDoc.json \
+        --input /datasets/seed_datasets_current/22_handgeometry/TRAIN/dataset_TRAIN/datasetDoc.json \
+        --test-input /datasets/seed_datasets_current/22_handgeometry/TEST/dataset_TEST/datasetDoc.json \
+        --output 22_handgeometry_results.csv \
+        --output-run feature_pipeline_run.yml; \
+        exit"
+
+The static files will be downloaded and stored locally based on the ``file_digest`` of ``_weights_configs``.
+In this way we don't duplicate the same files used by multiple primitives:
+
+.. code:: shell
+
+    mkdir /static
+    python3 -m d3m index download -p d3m.primitives.path.of.Primitive -o /static
+
+The ``-p`` optional argument downloads static files for a particular primitive, matching on its Python path.
+The ``-o`` optional argument downloads the static files into a common folder. If it is not provided, they are
+downloaded into the current directory.
+
+After the download, the file structure is given as follows::
+
+    /static/
+        <file_digest>/
+            <file>
+        <file_digest>/
+            <file>
+        ...
+        ...
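+
+As a final illustration, a primitive would typically load such a static file
+from the path returned by ``_find_weights_path`` inside ``produce`` (or in its
+constructor). The following is only a minimal sketch: the ``weights.bin`` key
+and the ``_transform`` helper are hypothetical and stand in for whatever your
+primitive actually does with its pre-trained weights.
+
+.. code:: python
+
+    def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]:
+        # The key must match the "key" field of one of the "_weights_configs" entries.
+        weights_path = self._find_weights_path('weights.bin')
+
+        # Read the pre-trained weights; how they are deserialized and applied
+        # depends entirely on the primitive.
+        with open(weights_path, 'rb') as weights_file:
+            weights = weights_file.read()
+
+        outputs = self._transform(inputs, weights)  # Hypothetical helper.
+
+        return base.CallResult(value=outputs)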
diff --git a/d3m/entry_points.ini b/d3m/entry_points.ini new file mode 100644 index 0000000..88e433b --- /dev/null +++ b/d3m/entry_points.ini @@ -0,0 +1,2 @@ +[d3m.primitives] +evaluation.compute_scores.Core = d3m.contrib.primitives.compute_scores:ComputeScoresPrimitive diff --git a/d3m/oldest_dependencies.py b/d3m/oldest_dependencies.py new file mode 100755 index 0000000..11acbab --- /dev/null +++ b/d3m/oldest_dependencies.py @@ -0,0 +1,27 @@ +#!/usr/bin/env python3 + +import pkg_resources + +package = pkg_resources.working_set.by_key['d3m'] + +oldest_dependencies = [] + +for requirement in package.requires(): + dependency = requirement.project_name + if requirement.extras: + dependency += '[' + ','.join(requirement.extras) + ']' + for comparator, version in requirement.specs: + if comparator == '==': + if len(requirement.specs) != 1: + raise ValueError('Invalid dependency: {requirement}'.format(requirement=requirement)) + dependency += '==' + version + elif comparator == '<=': + if len(requirement.specs) != 2: + raise ValueError('Invalid dependency: {requirement}'.format(requirement=requirement)) + elif comparator == '>=': + dependency += '==' + version + + oldest_dependencies.append(dependency) + +for dependency in oldest_dependencies: + print(dependency) diff --git a/d3m/run_benchmarks.sh b/d3m/run_benchmarks.sh new file mode 100755 index 0000000..092baa4 --- /dev/null +++ b/d3m/run_benchmarks.sh @@ -0,0 +1,18 @@ +#!/bin/bash -e + +if ! git remote get-url upstream > /dev/null 2>&1 ; then + git remote add upstream https://gitlab.com/datadrivendiscovery/d3m.git +fi +git fetch upstream + +asv machine --yes --config tests/asv.conf.json + +ASV_OUTPUT=$(asv continuous upstream/devel HEAD -s -f 1.1 -e --config tests/asv.conf.json) +echo "$ASV_OUTPUT" + +if echo "$ASV_OUTPUT" | egrep -q "(SOME BENCHMARKS HAVE CHANGED SIGNIFICANTLY)|( failed$)" ; then + echo "Benchmarks have errors." + exit 1 +else + echo "Benchmarks ran without errors." +fi diff --git a/d3m/run_tests.py b/d3m/run_tests.py new file mode 100755 index 0000000..16c264a --- /dev/null +++ b/d3m/run_tests.py @@ -0,0 +1,11 @@ +#!/usr/bin/env python3 + +import sys +import unittest + +runner = unittest.TextTestRunner(verbosity=1) + +tests = unittest.TestLoader().discover('tests') + +if not runner.run(tests).wasSuccessful(): + sys.exit(1) diff --git a/d3m/setup.cfg b/d3m/setup.cfg new file mode 100644 index 0000000..b6a8bc3 --- /dev/null +++ b/d3m/setup.cfg @@ -0,0 +1,25 @@ +[pycodestyle] +max-line-length = 200 + +[metadata] +description-file = README.md + +[mypy] +warn_redundant_casts = True +# TODO: Enable back once false positives are fixed. 
+# See: https://github.com/python/mypy/issues/4412 +#warn_unused_ignores = True +warn_unused_configs = True +disallow_untyped_defs = True + +# TODO: Remove once this is fixed: https://github.com/python/mypy/issues/4300 +[mypy-d3m.container.list] +ignore_errors = True + +# TODO: Remove once this is fixed: https://github.com/python/mypy/issues/4300 +[mypy-d3m.metadata.hyperparams] +ignore_errors = True + +# TODO: Remove once this is fixed: https://github.com/python/mypy/pull/4384#issuecomment-354033177 +[mypy-d3m.primitive_interfaces.distance] +ignore_errors = True diff --git a/d3m/setup.py b/d3m/setup.py new file mode 100644 index 0000000..b79da82 --- /dev/null +++ b/d3m/setup.py @@ -0,0 +1,87 @@ +import os +import os.path +import sys +from setuptools import setup, find_packages + +PACKAGE_NAME = 'd3m' +MINIMUM_PYTHON_VERSION = 3, 6 + + +def check_python_version(): + """Exit when the Python version is too low.""" + if sys.version_info < MINIMUM_PYTHON_VERSION: + sys.exit("Python {}.{}+ is required.".format(*MINIMUM_PYTHON_VERSION)) + + +def read_package_variable(key): + """Read the value of a variable from the package without importing.""" + module_path = os.path.join(PACKAGE_NAME, '__init__.py') + with open(module_path) as module: + for line in module: + parts = line.strip().split(' ') + if parts and parts[0] == key: + return parts[-1].strip("'") + raise KeyError("'{0}' not found in '{1}'".format(key, module_path)) + + +def read_readme(): + with open(os.path.join(os.path.dirname(__file__), 'README.md'), encoding='utf8') as file: + return file.read() + + +def read_entry_points(): + with open('entry_points.ini') as entry_points: + return entry_points.read() + + +check_python_version() +version = read_package_variable('__version__') +description = read_package_variable('__description__') +author = read_package_variable('__author__') + +setup( + name=PACKAGE_NAME, + version=version, + description=version, + author=author, + packages=find_packages(exclude=['contrib', 'docs', 'site', 'tests*']), + package_data={'d3m': ['metadata/schemas/*/*.json', 'contrib/pipelines/*']}, + data_files=[('./', ['./entry_points.ini'])], + install_requires=[ + 'scikit-learn[alldeps]>=0.20.3,<=0.22.2.post1', + 'pytypes==1.0b5', + 'frozendict==1.2', + 'numpy>=1.15.4,<=1.18.2', + 'jsonschema>=3.0.2,<=3.2.0', + 'requests>=2.19.1,<=2.23.0', + 'strict-rfc3339==0.7', + 'rfc3987==1.3.8', + 'webcolors>=1.8.1,<=1.11.1', + 'dateparser>=0.7.0,<=0.7.2', + 'python-dateutil==2.8.1', + 'pandas>=0.23.4,<=1.0.3', + 'typing-inspect==0.5.0', + 'GitPython==3.1.0', + 'jsonpath-ng==1.4.3', + 'custom-inherit>=2.2.0,<=2.2.2', + 'PyYAML>=5.1,<=5.3', + 'pycurl>=7.43.0.2,<=7.43.0.5', + 'pyarrow>=0.15.1,<=0.16.0', + 'gputil>=1.3.0,<=1.4.0', + 'pyrsistent>=0.14.11,<=0.15.7', + 'scipy>=1.2.1,<=1.4.1', + 'openml==0.10.1', + ], + tests_require=[ + 'asv==0.3.1', + 'docker[tls]==2.7', + ], + entry_points=read_entry_points(), + url='https://gitlab.com/datadrivendiscovery/d3m', + long_description=read_readme(), + long_description_content_type='text/markdown', + license='Apache-2.0', + classifiers=[ + 'License :: OSI Approved :: Apache Software License', + ], +) diff --git a/d3m/site/.gitignore b/d3m/site/.gitignore new file mode 100644 index 0000000..ef5c05b --- /dev/null +++ b/d3m/site/.gitignore @@ -0,0 +1,4 @@ +static/bundle.js +static/bundle.css +static/fonts +node_modules diff --git a/d3m/site/Makefile b/d3m/site/Makefile new file mode 100644 index 0000000..4de816f --- /dev/null +++ b/d3m/site/Makefile @@ -0,0 +1,14 @@ +default: static/bundle.js 
static/bundle.css static/fonts + +clean: + rm -f static/bundle.js static/bundle.css + rm -rf static/fonts + +static/bundle.js: client.js html_construction.js package-lock.json + ./node_modules/.bin/browserify -d -t [ babelify --presets [ env ] ] ./client.js > $@ + +static/bundle.css: client.less package-lock.json + ./node_modules/.bin/lessc ./client.less $@ + +static/fonts: + cp -r node_modules/font-awesome/fonts static/fonts diff --git a/d3m/site/build_site.sh b/d3m/site/build_site.sh new file mode 100755 index 0000000..e6bd7ee --- /dev/null +++ b/d3m/site/build_site.sh @@ -0,0 +1,47 @@ +#!/bin/bash -e + +# Builds sites for schemas. For each tag and `devel` branch a separate site is built. + +deploy () { + if [ ! -d site ] || [ ! -e site/package.json ] + then + return 0 + fi + + cd site + npm install + make + cd .. + + # Copying results into output directory "public". + cp -a site/static public/$1 + rm -f public/$1/schemas + cp -a d3m/metadata/schemas public/$1/schemas + + # Cleaning. + cd site + make clean + rm -fr node_modules + cd .. + + # Reverting changes, "package-lock.json" might be changed. + git checkout -- . +} + +rm -rf public +mkdir public + +git checkout devel +cp -a d3m/metadata/schemas public/schemas +deploy devel + +while read -r -a line +do + IFS='/' read -r -a parts <<< ${line[1]} + + if [[ ${parts[-1]} == v* ]] + then + git checkout ${line[0]} + deploy ${parts[-1]} + fi +done <<< $(git show-ref --tags) diff --git a/d3m/site/build_site_types.py b/d3m/site/build_site_types.py new file mode 100644 index 0000000..178fb4d --- /dev/null +++ b/d3m/site/build_site_types.py @@ -0,0 +1,284 @@ +""" +Constructs sites for ``semantic types``, site, which hierarchically displays all types, and site, which +lists all available versions of schemas. + +Sites are placed under ``types`` folder inside ``public`` folder, which should exist at the root of the repository. +""" + +import json +import os +import typing +from shutil import copyfile + +from pyquery import PyQuery +from yattag import Doc + +PREFIX = 'https://metadata.datadrivendiscovery.org/types/' + +types = {} + + +def cycle_detection(url: str, past_urls: typing.List[str]) -> None: + """ + Detects cycle in semantic types' hierarchy. + + Also checks if referenced urls in ``parents`` exist. + + Parameters + ---------- + url : str + URL of the semantic type that is to be analyzed. + past_urls : typing.List[str] + List of previously called urls. + """ + + global types + + if url not in types: + raise Exception("Cannot find referenced semantic type '{url}'".format(url=url)) + if url in past_urls: + raise Exception("Cycle in semantic types hierarchy. Cycle: '{cycle}'".format( + cycle=(' -> '.join(past_urls + [url])) + )) + + for parent in types[url]['parents']: + cycle_detection(parent, past_urls + [url]) + + +def template(tag, line): + """ + Generates HTML base for the site. + + Yields the result, so HTML end brackets (e.g. ````) are not closed. + + Usage:: + + for temp in template(tag, line): + ... + + Parameters + ---------- + tag : yattag.tag + ``tag`` from the ``yattag`` module. + line : yattag.line + ``line`` from the ``yattag`` module. + + Returns + ------- + Element of the ``yattag`` module representing container of the page. 
+ """ + + global types + + with tag('html'): + with tag('head'): + line('title', "D3M Metadata") + line('meta', '', charset='utf-8') + line('meta', '', name='viewport', content='width=device-width, initial-scale=1') + line('link', '', rel='stylesheet', href='/schema-org.css') + with tag('body'): + with tag('div', id='container'): + with tag('div', id='intro'): + with tag('div', id='pageHeader'): + with tag('div', klass='wrapper'): + with tag('div', id='sitename'): + with tag('h1'): + line('a', "metadata.datadrivendiscovery.org", href='/') + with tag('div', id='selectionbar'): + with tag('div', klass='wrapper'): + with tag('ul'): + with tag('li'): + line('a', "Types", href='/types') + with tag('li'): + line('a', "Schemas", href='/devel') + with tag('div', id='mainContent'): + yield + + +def construct_types(site, parent: str) -> None: + """ + Constructs hierarchy displayed semantic types at ``/types/`` path. + + More specifically, constructs list (HTML ``
    ``) of semantic types that have ``parent`` for ancestor. + + Parameters + ---------- + site + ``site`` from the ``yattag`` module. + parent : str + URL of the parent. + """ + + global types + + with site.tag('ul'): + for url in sorted(types, key=lambda key: types[key]['label']): + if parent in types[url]['parents'] or len(parent) == 0 and len(types[url]['parents']) == 0: + with site.tag('li'): + site.line('a', types[url]['label'], href=url) + construct_types(site, url) + + +def construct_breadcrumbs(site, ancestors: typing.List) -> None: + """ + Constructs breadcrumbs in the page. + + E.g. if ``ancestors`` equals [c, b, a], then breadcrumbs should be "... > c > b > a". + + Parameters + ---------- + site + ``site`` from the ``yattag`` module. + ancestors : typing.List + URLs of elements that are to be displayed in breadcrumbs. + """ + + global types + + parents = types[ancestors[0]]['parents'] + for parent in parents: + construct_breadcrumbs(site, [parent] + ancestors) + + if len(parents) == 0: + with site.tag('span'): + for url in ancestors: + site.line('a', types[url]['label'], href=url) + site.line('span', " > ", klass='hide-last') + site.stag('br') + + +def define_external_type(url: str) -> None: + """ + Used for adding types from ``schema.org`` domain to ``types``. Fetches ``url`` and looks for parents, + which are also recursively added to ``types``. + + Parameters + ---------- + url : str + URL of the type. Should be from ``schema.org`` domain. + """ + + global types + + if url in types: + return + + types[url] = { + 'label': url[url.rfind('/') + 1:], + 'description': '', + 'parents': [] + } + candidates = PyQuery(url)('link') + + for i in range(len(candidates)): + link = candidates.eq(i) + if link.attr('property') == 'rdfs:subClassOf': + parent = link.attr('href') + + if len(parent) > 0: + if parent not in types[url]['parents']: + types[url]['parents'].append(parent) + define_external_type(parent) + + +def main() -> None: + global types + + os.makedirs('public/types', 0o755, exist_ok=True) + copyfile('site/schema-org.css', 'public/schema-org.css') + + schema = json.load(open('d3m/metadata/schemas/v0/definitions.json')) + + # Filling "types". + for semantic_type in schema['definitions']['semantic_types']['items']['anyOf']: + if 'enum' not in semantic_type: + continue + + url = semantic_type['enum'][0] + + description = semantic_type.get('description', '') + parents = semantic_type.get('parents', []) + label = url[url.rfind('/') + 1:] + + if not isinstance(parents, list): + raise Exception("This semantic type does not have type 'list' for 'parents': {url}".format(url=url)) + + # Defining parents from 'schema.org'. + for parent in parents: + if '//schema.org' in parent: + define_external_type(parent) + + if '//schema.org' in url: + if len(parents) > 0: + raise Exception("This URL should not have parents defined in 'D3M' schema: {url}".format(url=url)) + + define_external_type(url) + types[url]['description'] = description + else: + types[url] = { + 'label': label, + 'description': description, + 'parents': parents + } + + # Cycle detection. + for url in types: + cycle_detection(url, []) + + # Constructing site at the root of the domain. + site, tag, text, line = Doc().ttl() + + for temp in template(tag, line): + line('h1', "Versions", klass='page-title', style='margin-bottom: 20px') + + with tag('div', klass='breadcrumbs'): + line('a', "devel", href='devel') + + # Sorting versions by release number. 
+ versions = {} + for folder in os.listdir('public'): + try: + versions[folder] = [-int(number) for number in folder[1:].split('.')] + except: + pass + + for version in sorted(versions, key=lambda key: versions[key]): + if version.startswith('v'): + with tag('div', klass='breadcrumbs'): + line('a', version, href=version) + + with open('public/index.html', 'w') as file: + file.write(site.getvalue()) + + # Constructing site for all types. + site, tag, text, line = Doc().ttl() + + for temp in template(tag, line): + line('h1', "Semantic Types", klass='page-title', style='margin-bottom: 20px') + line('div', schema['definitions']['semantic_types'].get('description', ''), style='margin-bottom: 20px') + construct_types(site, '') + + with open('public/types/index.html', 'w') as file: + file.write(site.getvalue()) + + # Constructing site for each type. + for url in types: + if url.startswith(PREFIX): + site, tag, text, line = Doc().ttl() + + for temp in template(tag, line): + line('h1', types[url]['label'], klass='page-title') + with tag('span', klass='canonicalUrl'): + text("Canonical URL: ") + line('a', url, href=url) + with tag('h4'): + if len(types[url]['parents']) > 0: + construct_breadcrumbs(site, [url]) + line('div', types[url]['description']) + + with open('public/types/' + types[url]['label'], 'w') as file: + file.write(site.getvalue()) + + +if __name__ == '__main__': + main() diff --git a/d3m/site/client.js b/d3m/site/client.js new file mode 100644 index 0000000..40b77a9 --- /dev/null +++ b/d3m/site/client.js @@ -0,0 +1,334 @@ +import $ from 'jquery'; + +import {reloadPage, encodeID, referenceKey, expandAll, shrinkAll} from './html_construction' + +/** + * Names of schemas as they appear in the `schemas/v0` folder, without the extension. + * @type {string[]} + */ +const names = [ + 'primitive', + 'container', + 'data', + 'problem', + 'pipeline', + 'pipeline_run', + 'definitions', +]; + +/** + * Placeholder for schemas. Keys are names of the files without the extension. + * @type {{}} + */ +export let schemas = {}; + +/** + * Schema that contains definitions. It also appears in `schemas`. + */ +let definitions; + +/** + * True while fetching schemas from the server. + * @type {boolean} + */ +export let fetching = true; + +/** + * Key to be used in the cycle detection algorithm. + * @type {string} + */ +const cycleKey = 'cycle_detection_pflk2jds32ljfi2jfoja-p-2rla'; + +// Fetching schemas. +Promise.all(names.map((name) => { + return fetch(`schemas/v0/${name}.json`, {credentials: 'same-origin'}).then((response) => { + if (response.ok) { + return response.json(); + } + else { + throw new Error("Fetch failed."); + } + }); +})).then((fetchedSchemas) => { + // Moving schemas into "schemas" and "definitions". + fetchedSchemas.forEach((schema, i) => { + schemas[names[i]] = schema; + if (names[i] === 'definitions') + definitions = schema; + }); + + // Adding references in "definitions" to their pages. + Object.keys(definitions['definitions']).forEach((i) => { + definitions['definitions'][i][referenceKey] = { + 'url': '?definitions#' + encodeID('definitions/' + i, false), + 'name': 'definitions/' + i + }; + }); + + // Connecting schemas with "definitions" and applying workarounds. 
+ Object.keys(schemas).forEach((schema) => { + schemas[schema] = resolveReferences(schemas[schema]); + }); + + fetching = false; + reloadPage(); +}); + +$('#shrink-all').click(function (event) { + removeUrlParameter(); + shrinkAll(); +}); +$('#expand-all').click(function (event) { + // Constructing url with added "expanded" parameter. + let search = window.location.search.substring(1).split('&'); + let parameters = ['expanded']; + search.forEach((parameter) => { + if (parameter.length > 0 && parameters.indexOf(parameter) === -1) + parameters.unshift(parameter); + }); + let url = '?' + parameters.join('&') + window.location.hash; + + history.replaceState(history.state, undefined, url); + expandAll(); +}); + +$(window).on('popstate', function (event) { + reloadPage(); +}); + +// Event handler for "scroll to top" button. +let $scrollToTop = $('#scroll-to-top').click(function (event) { + $('html, body').animate({scrollTop: 0, scrollLeft: 0}); +}); + +// Showing and hiding "scroll to top" button. +window.onscroll = function (event) { + if (document.body.scrollTop > 50 || document.documentElement.scrollTop > 50) + $scrollToTop.show(); + else $scrollToTop.hide(); +}; + +// Creating links to the different schemas at the top of the page. +let $container = $('#links'); +names.forEach((name, i) => { + $('').attr('href', '?' + name).text(name).appendTo($container).click(function (event) { + event.preventDefault(); + history.pushState(undefined, undefined, '?' + name); + reloadPage(); + }); + + // Appending comma. + if (i < names.length - 1) { + $('').text(', ').appendTo($container); + } +}); + +// Saving scrolling position when leaving this application, relevant if user returns. +window.onbeforeunload = function (event) { + history.state.position = window.pageYOffset || document.documentElement.scrollTop; + history.replaceState(history.state, undefined); +}; + +/** + * Replaces references (objects with the only key `$ref`) with the actual object they are pointing + * to. Objects are not copied, so there always exists only one copy of each object. That also enables + * easier cycle detection. + * + * Also applies workarounds. + * + * @param iterator Object, over which we are currently iterating. + * @returns {*} Updated object. + */ +function resolveReferences(iterator) { + if (typeof iterator === 'object') { + + // Workarounds. + iterator = workaround1(iterator); + workaround2(iterator); + + // Marking the current object, so we know that we have been here. + iterator[cycleKey] = true; + + for (let i in iterator) { + if (!iterator.hasOwnProperty(i)) + continue; + + if (i === '$ref') { + + // Getting object from "definitions". + let subSchema = getFromDefinitions(iterator[i]); + + // Checking if we have already been there. + if (subSchema.hasOwnProperty(cycleKey)) { + // Constructing new object with some description. + let ret = {'[Cycle in JSON schema, see link]': ''}; + + // Adding type, reference and description to the returned object, so user will get some idea of + // first repeated layer. + if (subSchema.hasOwnProperty('type')) ret['type'] = subSchema['type']; + if (subSchema.hasOwnProperty(referenceKey)) ret[referenceKey] = subSchema[referenceKey]; + if (subSchema.hasOwnProperty('description')) ret['description'] = subSchema['description']; + + delete iterator[cycleKey]; + return ret; + } + + // References can also appear in the sub schemas. + let ret = resolveReferences(subSchema); + + delete iterator[cycleKey]; + return ret; + } + + // If it's not a reference, go deeper. 
+ iterator[i] = resolveReferences(iterator[i]); + } + delete iterator[cycleKey]; + } + return iterator; +} + +/** + * Url represents path to object in `definitions`, this function finds it and returns it. If object at given + * path does not exist, function does not crash, but rather returns url. + * + * @param url Url of the object. + * @returns {*} Retrieved object. + */ +function getFromDefinitions(url) { + try { + + // If url is like '.../pipeline.json'. + if (url.indexOf('#') === -1) { + let ret = {'[Cycle in JSON schema, see link]': ''}; + ret[referenceKey] = { + 'name': url, + 'url': '?' + url.substring(url.lastIndexOf('/') + 1, url.lastIndexOf('.')) + }; + return ret; + } + + // Supposing that 'url' is like '#/definitions/sth'. + else { + let subSchema = definitions; + let path = url.substring(url.lastIndexOf('#') + 1, url.length).split('/'); + path.forEach((part) => { + if (part.length > 0) { + subSchema = subSchema[part]; + } + }); + return subSchema; + } + } + catch (error) { + console.log(error); + } + return {"[couldn't show this schema]": url}; +} + +/** + * Removes `expanded` parameter from the url. + */ +export function removeUrlParameter() { + let search = window.location.search.substring(1).split('&'); + let parameters = []; + search.forEach((parameter) => { + if (parameter.length > 0 && parameter !== 'expanded') + parameters.push(parameter); + }); + + let url = '?' + parameters.join('&') + window.location.hash; + history.replaceState(history.state, undefined, url); +} + +/** + * If `data` has following structure: + * + * { + * 'allOf': [one schema with description1] + * 'description': description2 + * } + * + * Then description1 is replaced with description2, and whole object is replaced with the schema. + * + * @param data Some schema or part of the schema. + * @returns {boolean} Updated object. + */ +function workaround1(data) { + if (Object.keys(data).length === 2 && + data.hasOwnProperty('allOf') && data['allOf'].length === 1 && + data.hasOwnProperty('description')) { + let child = data['allOf'][0]; + child['description'] = data['description']; + data = child; + } + return data; +} + +/** + * If `data` has following structure: + * + * { + * 'anyOf' or 'oneOf': [ + * { + * 'enum': array of length 1 + * ... + * }, + * { + * 'enum': array of length 1 + * ... + * }, + * ... + * ], + * ... + * } + * + * Then all objects with `enum` are put into one array. + * + * @param data Some schema or part of the schema. + */ +function workaround2(data) { + if (data.hasOwnProperty('anyOf') || data.hasOwnProperty('oneOf')) { + let key = data.hasOwnProperty('anyOf') ? 'anyOf' : 'oneOf'; + // Placeholder for other properties. + let other = []; + // Placeholder for enums. + let enums = []; + + // Iterating over data and searching for enums. + Object.keys(data[key]).forEach((i) => { + let object = data[key][i]; + // If it contains "enum". + if (typeof object === 'object' && object.hasOwnProperty('enum') && object['enum'].length === 1) { + object['text'] = object['enum'][0]; + delete object['enum']; + enums.push(object); + } + else { + other.push(object); + } + }); + + if (enums.length > 0) { + // Sorting. + enums.sort(function (a, b) { + return a['text'].localeCompare(b['text']); + }); + + // "oneOf -> enum" is represented as "enum" only. + if (key === 'oneOf' && other.length === 0) { + data['enum'] = enums; + delete data[key]; + } + // Typical case. + else { + // Adding "enum" at the end. + other.push({ + 'enum': enums + }); + // Updating. 
+ data[key] = other; + } + } + } +} diff --git a/d3m/site/client.less b/d3m/site/client.less new file mode 100644 index 0000000..b0392ec --- /dev/null +++ b/d3m/site/client.less @@ -0,0 +1,106 @@ +@import 'node_modules/font-awesome/less/font-awesome'; + +@fa-font-path: 'fonts'; + +body { + font-family: monospace; + font-size: 14px; + margin-left: 20px; +} + +#container { + // 95vh is added at the bottom so content won't move when clicking '-'. + margin: 2em 0 95vh 30px; +} + +#links { + margin-bottom: 30px; +} + +.instruction { + display: none; +} + +.button { + width: 0; + height: 0; + position: relative; + + & > i { + position: absolute; + font-size: 11px; + cursor: pointer; + padding: 4px 10px; + left: -18px; + } +} + +.title { + white-space: nowrap; + font-weight: bold; +} + +.paragraph { + color: transparent; + + &:hover { + color: black !important; + } +} + +.yellow { + min-width: 600px; + padding: 2px 0 2px; + background-color: rgba(251, 229, 78, 0.74); +} + +.description { + min-width: 500px; + font-style: italic; + color: grey; + font-size: 12px; +} + +.color { + white-space: nowrap; + color: #e699ff; +} + +.shift { + padding-left: 25px; +} + +.parenthesis { + white-space: nowrap; +} + +.or { + white-space: nowrap; + position: relative; + right: 15px; + margin: 10px 0 10px; + font-style: italic; + color: #00a645; +} + +.el { + display: table-cell; + padding: 0 4px 0 4px; + word-wrap: break-word; +} + +.margin { + margin-top: 6px; +} + +#scroll-to-top { + display: none; + position: fixed; + bottom: 20px; + left: 30px; + z-index: 99; + background-color: white; + padding: 7px 11px; + border-radius: 20px; + box-shadow: 0 4px 8px 0 rgba(0, 0, 0, 0.2), 0 6px 20px 0 rgba(0, 0, 0, 0.19); +} diff --git a/d3m/site/html_construction.js b/d3m/site/html_construction.js new file mode 100644 index 0000000..b803ae0 --- /dev/null +++ b/d3m/site/html_construction.js @@ -0,0 +1,526 @@ +import $ from 'jquery'; + +import {schemas, fetching, removeUrlParameter} from './client'; + +/** + * Ids that were already given by the `encodeID` function. + * @type {Set} + */ +let usedIDs = new Set(); + +/** + * Key to be used when linking some object to its original definition. + * @type {string} + */ +export const referenceKey = 'reference_kfemjkvfi39rfj39fjckslgfgv2frskfj'; + +/** + * List of keys that are not to be treated or are treated differently. + * @type {string[]} + */ +const knownSpecifiers = ['title', 'type', 'required', 'description', 'properties', '$schema', 'id', referenceKey, 'text']; + +/** + * Map between some keys and more readable version of them. + * @type {{string}} + */ +const prettierKeys = { + 'allOf': 'all of', + 'anyOf': 'any of', + 'oneOf': 'one of', + 'additionalProperties': 'additional properties', + 'patternProperties': 'pattern properties', + 'minItems': 'min items' +}; + +/** + * Array for caching content inside `#container` element. When html for schema is constructed, + * it is placed inside `#container`, and pushed here. Index of it is pushed to history.state, + * so content can be restored when the user presses browser's `back` button. + * @type {Array} + */ +let pastContents = []; + +/** + * When schema is requested and page is constructed, its copy is thrown here, so at next visit (within same session) + * construction of html doesn't need to be performed again. + * @type {{}} + */ +let cache = {}; + +/** + * Session key used to differentiate between different sessions of the page. 
If the page is reloaded, new session + * key is generated, so content is reloaded even though matching number may be present in `history.state`. + * @type {number} + */ +let session = Math.random(); + +/** + * Simulates fresh visit of the page: + * - clears content + * - ensures html representing schema + * - applies event handlers + * - expands desired parts of the schema + * - scrolls to the desired point in page + */ +export function reloadPage() { + if (fetching) return; + + // Clearing content. ".children().detach()" would keep event + // listeners, but they are lost either way when content is cloned. + let $container = $('#container').empty(); + + $('#title').hide(); + let $instructions = $('.instruction').hide(); + usedIDs.clear(); + + // Getting name of the schema. + let name = undefined; + window.location.search.substring(1).split('&').forEach((parameter) => { + if (schemas.hasOwnProperty(parameter)) { + name = parameter; + } + }); + if (name === undefined) { + return; + } + + // Creating new content, if "back" was NOT pressed. + let state = history.state, getNewContent = false; + try { + if (typeof state['index'] !== 'number' || state['index'] >= pastContents.length || state['index'] < 0 || + state['session'] !== session) + getNewContent = true; + } + catch (e) { + getNewContent = true; + } + + // Ensuring new content. + let scrollPosition, title = schemas[name].hasOwnProperty('title') ? schemas[name]['title'] : ''; + if (getNewContent) { + // If not in "cache", construct new HTML content and save it. + if (!cache.hasOwnProperty(name)) { + // Placeholder. + let content = div(); + constructHTML(schemas[name], content, title, [], []); + cache[name] = content; + } + + // Getting expanded buttons. Item inside expands plus at the root. + let expanded = [encodeID(title, false)]; + try { + if (Array.isArray(state['expanded'])) { + expanded = state['expanded']; + } + + // Getting scrolling position, relevant when user leaves this domain and returns. + scrollPosition = state['position']; + } + catch (e) {} + + // Replacing this state with new data. + history.replaceState({ + 'index': pastContents.length, + 'session': session, + 'expanded': expanded + }, undefined); + + pastContents.push(cache[name].clone()); + } + + // Showing content. + pastContents[history.state['index']].appendTo($container); + + // Event listener for links. + $container.find('.link').click(function (event) { + event.preventDefault(); + history.pushState(undefined, undefined, $(this).attr('href')); + reloadPage(); + }); + + // Event listener for buttons (+/-). + $container.find('.button').click(function (event) { + buttonClick($(this), event); + }).next('.title').css('cursor', 'pointer').click(function (event) { + buttonClick($(this).prev(), event); + }); + + // Event listener for coloring paragraph. + $container.find('.paragraph-row').mouseenter(function (event) { + $(this).children('.paragraph').css('color', '#e2e0e5'); + }).mouseleave(function (event) { + $(this).children('.paragraph').css('color', 'transparent'); + }); + + // Expanding rows above one in the URL's hash and coloring it. + if (window.location.hash.length > 1) { + let id = window.location.hash.substring(1); + // We use "getElementById" and not jQuery because "." can be in "id". + let $el = $(document.getElementById(id)); + $el.parentsUntil($container, '.container').children('.first-row').children('.button').each(function () { + expand($(this)); + }); + $el.addClass('yellow'); + } + + // Expanding rows that were expanded before. 
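+    // The ids were saved by updateHistoryState(), so the previously expanded rows are restored.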
+ Object.keys(history.state['expanded']).forEach((i) => { + let id = history.state['expanded'][i]; + if (id.length > 0) { + // We use "getElementById" and not jQuery because "." can be in "id". + let $row = $(document.getElementById(id)); + expand($row.children('.button').first()); + } + }); + + // Expanding all. + if (window.location.search.substring(1).split('&').indexOf('expanded') !== -1) + expandAll(); + + // Showing title, buttons. + $('#title').show().text(title); + $instructions.show(); + + // Scrolling content to the location in the URL's hash. + scrollToHash(); + + // Scrolling content when returning from different domain. + if (scrollPosition) { + document.documentElement.scrollTop = document.body.scrollTop = scrollPosition; + } +} + +/** + * Constructs HTML for given schema in `data`. + * + * @param data Some schema or part of the schema that is to be displayed. + * @param container jQuery element, in which schema is to be displayed. + * @param title Title of the element, which should equal the key object was retrieved with. + * @param required An array of required properties. + * @param path List of titles of previously called objects. + */ +export function constructHTML(data, container, title, required, path) { + // Needed when coloring row in yellow. + container.addClass('container'); + let firstRow = div('first-row').appendTo(container); + let showButton = false; + + // Simple case. + if (typeof data !== 'object') { + div('el').html(urlify(data)).appendTo(firstRow); + return; + } + + // Title. + if (title.length > 0) { + div('el title').html(urlify(title)).appendTo(firstRow); + path.push(title); + } + + // Text. + if (data.hasOwnProperty('text')) { + div('el').html(urlify(data['text'])).appendTo(firstRow); + } + + // Type and required. + let parenthesis = []; + if (data.hasOwnProperty('type')) { + parenthesis.push(data['type']); + } + if (required.indexOf(title) !== -1) { + parenthesis.push('required'); + } + if (parenthesis.length > 0) { + div('el parenthesis', `(${parenthesis.join(', ')})`).appendTo(firstRow); + } + + // Reference. + if (data.hasOwnProperty(referenceKey) && path.join('/') !== data[referenceKey]['name']) { + $('').addClass('el link').attr('href', data[referenceKey]['url']).text(data[referenceKey]['name']).appendTo(firstRow); + } + + if (firstRow.children().length > 0) { + // Applying margin between rows. + firstRow.before(div('margin')); + + // ID. + let id = encodeID(path.join('/'), true); + firstRow.attr('id', id).addClass('paragraph-row'); + + // Adding link. + let $link = $('').addClass('el paragraph link').attr('href', `#${id}`).appendTo(firstRow); + $('').addClass('fa fa-paragraph').attr('aria-hidden', 'true').appendTo($link); + + // Description. + if (data.hasOwnProperty('description')) + div('el description').html(urlify(data['description'])).appendTo(div('shift').appendTo(firstRow)); + + // Updating container. + container = div('shift').appendTo(container); + } + else { + firstRow.remove(); + } + + // All keys that are not listed in "knownSpecifiers". + // Constructing HTML elements. They are not shown, but pushed into "buffer". + let buffer = []; + Object.keys(data).forEach((key) => { + let el = data[key]; + + if (knownSpecifiers.indexOf(key) === -1) { + showButton = true; + path.push(key); + + // Creating wrapping "div" and appending it to buffer. + let wrapper = div(); + buffer.push([wrapper, undefined]); + + // First row. + let row = div().appendTo(wrapper); + div('el color', prettierKeys.hasOwnProperty(key) ? 
prettierKeys[key] : key).appendTo(row); + + // Simple case. + if (typeof el !== 'object') { + div('el').html(urlify(el)).appendTo(row); + } + else { + // Shifting. + row = div('shift').appendTo(wrapper); + + if (key === 'definitions') { + // Sorting. + let keys = Object.keys(el).sort(function (a, b) { + return a.localeCompare(b); + }); + + // Constructing html for each element. + keys.forEach((definition) => { + constructHTML(el[definition], div().appendTo(row), definition, [], path); + }); + } + else { + // Ensuring it's an array. + if (!Array.isArray(el)) + el = [el]; + + // Sorting if "el" consists only of strings. + let sort = true; + for (let i in el) { + if (el.hasOwnProperty(i) && typeof el[i] !== "string") { + sort = false; + break; + } + } + if (sort) { + el.sort(function (a, b) { + return a.localeCompare(b); + }); + } + + el.forEach((child, i) => { + if (el.length > 1) { + path.push(i); + } + constructHTML(child, row, '', [], path); + if (el.length > 1) { + path.pop(); + } + + // Division markers. + if (parseInt(i) !== data[key].length - 1) { + if (key === 'allOf') { + div('or').html('---------  and  ---------').appendTo(row); + } + else if (key === 'oneOf' || key === 'anyOf') { + div('or').html('---------  or  ---------').appendTo(row); + } + } + }); + } + } + path.pop(); + } + }); + + // Sorting and showing content from "buffer". + if (buffer.length > 1) { + buffer.forEach((el) => { + el[1] = el[0].find('*').length; + }); + buffer.sort(function (a, b) { + return a[1] - b[1]; + }); + } + buffer.forEach((el) => { + el[0].children().appendTo(container); + }); + + // Properties. + if (data.hasOwnProperty('properties')) { + showButton = true; + + // Sorting. + let keys = Object.keys(data['properties']).sort(function (a, b) { + return ('' + a).localeCompare(b); + }); + + // Constructing html for each element. + keys.forEach((key) => { + let row = div().appendTo(container); + constructHTML(data['properties'][key], row, key, data.hasOwnProperty('required') ? data['required'] : [], path); + }); + } + + // Button. + if (showButton && firstRow.children().length > 0) { + let button = div('button'); + $('').attr('aria-hidden', 'true').addClass('el fa fa-plus').appendTo(button); + firstRow.prepend(button); + container.hide(); + } + + if (title.length > 0) { + path.pop(); + } +} + +/** + * Creates html `
    ` element with given classes and text. + * + * @param classes + * @param text + * @returns {jQuery} Created element. + */ +function div(classes, text) { + let ret = document.createElement('div'); + + if (typeof classes !== 'undefined') { + ret.className = classes; + } + if (typeof text !== 'undefined') { + ret.innerText = text; + } + + return $(ret); +} + +/** + * Replaces characters in `text` so it can be used as an id of html element. If `unique` is True, + * then function remembers the id and doesn't return the same one twice. + * + * @param text + * @param unique + * @returns {string} Formatted text. + */ +export function encodeID(text, unique) { + text = text.replace(/\//g, '.'); + text = text.replace(/ /g, '_'); + text = text.replace(/[^a-z0-9-_:.]/gi, ''); + + if (unique) { + if (usedIDs.has(text)) { + let i = 0; + while (usedIDs.has(text + ++i)) {} + text += i; + } + usedIDs.add(text); + } + + return text; +} + +function scrollToHash() { + if (window.location.hash.length > 1) { + let id = window.location.hash.substring(1); + // We use "getElementById" and not jQuery because "." can be in "id". + let $offset = $(document.getElementById(id)).offset(); + if (typeof $offset !== 'undefined') { + $offset.left -= 20; + $offset.top -= 20; + $('html, body').animate({ + scrollTop: $offset.top, + scrollLeft: $offset.left + }, 0); + } + } +} + +function urlify(text) { + let urlRegex = /https?:\/\/[^\s]+\B./g; + return ('' + text).replace(urlRegex, function (url) { + return `${url}`; + }) +} + +/** + * Shrinks content below the element, which should represent button. + * @param $el jQuery element. + */ +function shrink($el) { + $el.children().removeClass('fa-minus').addClass('fa-plus'); + $el.parent().next().hide(); +} + +/** + * Expands content below the element, which should represent button. + * @param $el jQuery element. + */ +function expand($el) { + $el.children().removeClass('fa-plus').addClass('fa-minus'); + $el.parent().next().show(); +} + +/** + * Handles the event when button or title next to the button is clicked. + * + * @param $element jQuery button that was clicked. + * @param event Event, which holds information whether `Ctrl` key was pressed during the click. + */ +function buttonClick($element, event) { + if ($element.children().first().hasClass('fa-minus')) { + shrink($element); + $element.parent().next().find('.button').each(function () { + shrink($(this)); + }); + } + else { + expand($element); + if (event.ctrlKey) { + $element.parent().next().find('.button').each(function () { + expand($(this)); + }); + } + } + removeUrlParameter(); + updateHistoryState(); +} + +/** + * Updates which buttons are expanded in `history.state`. 
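+ * Collects the ids of all rows whose button currently shows the minus icon, so `reloadPage` can restore the same view later.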
+ */ +function updateHistoryState() { + history.state['expanded'] = []; + $('#container').find('.button').each(function () { + let $el = $(this); + if ($el.children().first().hasClass('fa-minus')) { + history.state['expanded'].push($el.parent().attr('id')); + } + }); + history.replaceState(history.state, undefined); +} + +export function shrinkAll() { + $('.button').each(function () { + shrink($(this)); + }); + updateHistoryState(); +} + +export function expandAll() { + $('.button').each(function () { + expand($(this)); + }); + updateHistoryState(); +} diff --git a/d3m/site/package-lock.json b/d3m/site/package-lock.json new file mode 100644 index 0000000..ae7a772 --- /dev/null +++ b/d3m/site/package-lock.json @@ -0,0 +1,2495 @@ +{ + "requires": true, + "lockfileVersion": 1, + "dependencies": { + "JSONStream": { + "version": "1.3.3", + "resolved": "https://registry.npmjs.org/JSONStream/-/JSONStream-1.3.3.tgz", + "integrity": "sha512-3Sp6WZZ/lXl+nTDoGpGWHEpTnnC6X5fnkolYZR6nwIfzbxxvA8utPWe1gCt7i0m9uVGsSz2IS8K8mJ7HmlduMg==", + "requires": { + "jsonparse": "^1.2.0", + "through": ">=2.2.7 <3" + } + }, + "acorn": { + "version": "5.7.1", + "resolved": "https://registry.npmjs.org/acorn/-/acorn-5.7.1.tgz", + "integrity": "sha512-d+nbxBUGKg7Arpsvbnlq61mc12ek3EY8EQldM3GPAhWJ1UVxC6TDGbIvUMNU6obBX3i1+ptCIzV4vq0gFPEGVQ==" + }, + "acorn-dynamic-import": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/acorn-dynamic-import/-/acorn-dynamic-import-3.0.0.tgz", + "integrity": "sha512-zVWV8Z8lislJoOKKqdNMOB+s6+XV5WERty8MnKBeFgwA+19XJjJHs2RP5dzM57FftIs+jQnRToLiWazKr6sSWg==", + "requires": { + "acorn": "^5.0.0" + } + }, + "acorn-node": { + "version": "1.5.2", + "resolved": "https://registry.npmjs.org/acorn-node/-/acorn-node-1.5.2.tgz", + "integrity": "sha512-krFKvw/d1F17AN3XZbybIUzEY4YEPNiGo05AfP3dBlfVKrMHETKpgjpuZkSF8qDNt9UkQcqj7am8yJLseklCMg==", + "requires": { + "acorn": "^5.7.1", + "acorn-dynamic-import": "^3.0.0", + "xtend": "^4.0.1" + } + }, + "ajv": { + "version": "4.11.8", + "resolved": "https://registry.npmjs.org/ajv/-/ajv-4.11.8.tgz", + "integrity": "sha1-gv+wKynmYq5TvcIK8VlHcGc5xTY=", + "optional": true, + "requires": { + "co": "^4.6.0", + "json-stable-stringify": "^1.0.1" + } + }, + "ansi-regex": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-2.1.1.tgz", + "integrity": "sha1-w7M6te42DYbg5ijwRorn7yfWVN8=" + }, + "ansi-styles": { + "version": "2.2.1", + "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-2.2.1.tgz", + "integrity": "sha1-tDLdM1i2NM914eRmQ2gkBTPB3b4=" + }, + "array-filter": { + "version": "0.0.1", + "resolved": "https://registry.npmjs.org/array-filter/-/array-filter-0.0.1.tgz", + "integrity": "sha1-fajPLiZijtcygDWB/SH2fKzS7uw=" + }, + "array-map": { + "version": "0.0.0", + "resolved": "https://registry.npmjs.org/array-map/-/array-map-0.0.0.tgz", + "integrity": "sha1-iKK6tz0c97zVwbEYoAP2b2ZfpmI=" + }, + "array-reduce": { + "version": "0.0.0", + "resolved": "https://registry.npmjs.org/array-reduce/-/array-reduce-0.0.0.tgz", + "integrity": "sha1-FziZ0//Rx9k4PkR5Ul2+J4yrXys=" + }, + "asap": { + "version": "2.0.6", + "resolved": "https://registry.npmjs.org/asap/-/asap-2.0.6.tgz", + "integrity": "sha1-5QNHYR1+aQlDIIu9r+vLwvuGbUY=", + "optional": true + }, + "asn1": { + "version": "0.2.4", + "resolved": "https://registry.npmjs.org/asn1/-/asn1-0.2.4.tgz", + "integrity": "sha512-jxwzQpLQjSmWXgwaCZE9Nz+glAG01yF1QnWgbhGwHI5A6FRIEY6IVqtHhIepHqI7/kyEyQEagBC5mBEFlIYvdg==", + "optional": true, + "requires": { + "safer-buffer": 
"~2.1.0" + } + }, + "asn1.js": { + "version": "4.10.1", + "resolved": "https://registry.npmjs.org/asn1.js/-/asn1.js-4.10.1.tgz", + "integrity": "sha512-p32cOF5q0Zqs9uBiONKYLm6BClCoBCM5O9JfeUSlnQLBTxYdTK+pW+nXflm8UkKd2UYlEbYz5qEi0JuZR9ckSw==", + "requires": { + "bn.js": "^4.0.0", + "inherits": "^2.0.1", + "minimalistic-assert": "^1.0.0" + } + }, + "assert": { + "version": "1.4.1", + "resolved": "https://registry.npmjs.org/assert/-/assert-1.4.1.tgz", + "integrity": "sha1-mZEtWRg2tab1s0XA8H7vwI/GXZE=", + "requires": { + "util": "0.10.3" + }, + "dependencies": { + "inherits": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/inherits/-/inherits-2.0.1.tgz", + "integrity": "sha1-sX0I0ya0Qj5Wjv9xn5GwscvfafE=" + }, + "util": { + "version": "0.10.3", + "resolved": "https://registry.npmjs.org/util/-/util-0.10.3.tgz", + "integrity": "sha1-evsa/lCAUkZInj23/g7TeTNqwPk=", + "requires": { + "inherits": "2.0.1" + } + } + } + }, + "assert-plus": { + "version": "0.2.0", + "resolved": "https://registry.npmjs.org/assert-plus/-/assert-plus-0.2.0.tgz", + "integrity": "sha1-104bh+ev/A24qttwIfP+SBAasjQ=", + "optional": true + }, + "asynckit": { + "version": "0.4.0", + "resolved": "https://registry.npmjs.org/asynckit/-/asynckit-0.4.0.tgz", + "integrity": "sha1-x57Zf380y48robyXkLzDZkdLS3k=", + "optional": true + }, + "aws-sign2": { + "version": "0.6.0", + "resolved": "https://registry.npmjs.org/aws-sign2/-/aws-sign2-0.6.0.tgz", + "integrity": "sha1-FDQt0428yU0OW4fXY81jYSwOeU8=", + "optional": true + }, + "aws4": { + "version": "1.7.0", + "resolved": "https://registry.npmjs.org/aws4/-/aws4-1.7.0.tgz", + "integrity": "sha512-32NDda82rhwD9/JBCCkB+MRYDp0oSvlo2IL6rQWA10PQi7tDUM3eqMSltXmY+Oyl/7N3P3qNtAlv7X0d9bI28w==", + "optional": true + }, + "babel-code-frame": { + "version": "6.26.0", + "resolved": "https://registry.npmjs.org/babel-code-frame/-/babel-code-frame-6.26.0.tgz", + "integrity": "sha1-Y/1D99weO7fONZR9uP42mj9Yx0s=", + "requires": { + "chalk": "^1.1.3", + "esutils": "^2.0.2", + "js-tokens": "^3.0.2" + } + }, + "babel-core": { + "version": "6.26.3", + "resolved": "https://registry.npmjs.org/babel-core/-/babel-core-6.26.3.tgz", + "integrity": "sha512-6jyFLuDmeidKmUEb3NM+/yawG0M2bDZ9Z1qbZP59cyHLz8kYGKYwpJP0UwUKKUiTRNvxfLesJnTedqczP7cTDA==", + "requires": { + "babel-code-frame": "^6.26.0", + "babel-generator": "^6.26.0", + "babel-helpers": "^6.24.1", + "babel-messages": "^6.23.0", + "babel-register": "^6.26.0", + "babel-runtime": "^6.26.0", + "babel-template": "^6.26.0", + "babel-traverse": "^6.26.0", + "babel-types": "^6.26.0", + "babylon": "^6.18.0", + "convert-source-map": "^1.5.1", + "debug": "^2.6.9", + "json5": "^0.5.1", + "lodash": "^4.17.4", + "minimatch": "^3.0.4", + "path-is-absolute": "^1.0.1", + "private": "^0.1.8", + "slash": "^1.0.0", + "source-map": "^0.5.7" + } + }, + "babel-generator": { + "version": "6.26.1", + "resolved": "https://registry.npmjs.org/babel-generator/-/babel-generator-6.26.1.tgz", + "integrity": "sha512-HyfwY6ApZj7BYTcJURpM5tznulaBvyio7/0d4zFOeMPUmfxkCjHocCuoLa2SAGzBI8AREcH3eP3758F672DppA==", + "requires": { + "babel-messages": "^6.23.0", + "babel-runtime": "^6.26.0", + "babel-types": "^6.26.0", + "detect-indent": "^4.0.0", + "jsesc": "^1.3.0", + "lodash": "^4.17.4", + "source-map": "^0.5.7", + "trim-right": "^1.0.1" + } + }, + "babel-helper-builder-binary-assignment-operator-visitor": { + "version": "6.24.1", + "resolved": 
"https://registry.npmjs.org/babel-helper-builder-binary-assignment-operator-visitor/-/babel-helper-builder-binary-assignment-operator-visitor-6.24.1.tgz", + "integrity": "sha1-zORReto1b0IgvK6KAsKzRvmlZmQ=", + "requires": { + "babel-helper-explode-assignable-expression": "^6.24.1", + "babel-runtime": "^6.22.0", + "babel-types": "^6.24.1" + } + }, + "babel-helper-call-delegate": { + "version": "6.24.1", + "resolved": "https://registry.npmjs.org/babel-helper-call-delegate/-/babel-helper-call-delegate-6.24.1.tgz", + "integrity": "sha1-7Oaqzdx25Bw0YfiL/Fdb0Nqi340=", + "requires": { + "babel-helper-hoist-variables": "^6.24.1", + "babel-runtime": "^6.22.0", + "babel-traverse": "^6.24.1", + "babel-types": "^6.24.1" + } + }, + "babel-helper-define-map": { + "version": "6.26.0", + "resolved": "https://registry.npmjs.org/babel-helper-define-map/-/babel-helper-define-map-6.26.0.tgz", + "integrity": "sha1-pfVtq0GiX5fstJjH66ypgZ+Vvl8=", + "requires": { + "babel-helper-function-name": "^6.24.1", + "babel-runtime": "^6.26.0", + "babel-types": "^6.26.0", + "lodash": "^4.17.4" + } + }, + "babel-helper-explode-assignable-expression": { + "version": "6.24.1", + "resolved": "https://registry.npmjs.org/babel-helper-explode-assignable-expression/-/babel-helper-explode-assignable-expression-6.24.1.tgz", + "integrity": "sha1-8luCz33BBDPFX3BZLVdGQArCLKo=", + "requires": { + "babel-runtime": "^6.22.0", + "babel-traverse": "^6.24.1", + "babel-types": "^6.24.1" + } + }, + "babel-helper-function-name": { + "version": "6.24.1", + "resolved": "https://registry.npmjs.org/babel-helper-function-name/-/babel-helper-function-name-6.24.1.tgz", + "integrity": "sha1-00dbjAPtmCQqJbSDUasYOZ01gKk=", + "requires": { + "babel-helper-get-function-arity": "^6.24.1", + "babel-runtime": "^6.22.0", + "babel-template": "^6.24.1", + "babel-traverse": "^6.24.1", + "babel-types": "^6.24.1" + } + }, + "babel-helper-get-function-arity": { + "version": "6.24.1", + "resolved": "https://registry.npmjs.org/babel-helper-get-function-arity/-/babel-helper-get-function-arity-6.24.1.tgz", + "integrity": "sha1-j3eCqpNAfEHTqlCQj4mwMbG2hT0=", + "requires": { + "babel-runtime": "^6.22.0", + "babel-types": "^6.24.1" + } + }, + "babel-helper-hoist-variables": { + "version": "6.24.1", + "resolved": "https://registry.npmjs.org/babel-helper-hoist-variables/-/babel-helper-hoist-variables-6.24.1.tgz", + "integrity": "sha1-HssnaJydJVE+rbyZFKc/VAi+enY=", + "requires": { + "babel-runtime": "^6.22.0", + "babel-types": "^6.24.1" + } + }, + "babel-helper-optimise-call-expression": { + "version": "6.24.1", + "resolved": "https://registry.npmjs.org/babel-helper-optimise-call-expression/-/babel-helper-optimise-call-expression-6.24.1.tgz", + "integrity": "sha1-96E0J7qfc/j0+pk8VKl4gtEkQlc=", + "requires": { + "babel-runtime": "^6.22.0", + "babel-types": "^6.24.1" + } + }, + "babel-helper-regex": { + "version": "6.26.0", + "resolved": "https://registry.npmjs.org/babel-helper-regex/-/babel-helper-regex-6.26.0.tgz", + "integrity": "sha1-MlxZ+QL4LyS3T6zu0DY5VPZJXnI=", + "requires": { + "babel-runtime": "^6.26.0", + "babel-types": "^6.26.0", + "lodash": "^4.17.4" + } + }, + "babel-helper-remap-async-to-generator": { + "version": "6.24.1", + "resolved": "https://registry.npmjs.org/babel-helper-remap-async-to-generator/-/babel-helper-remap-async-to-generator-6.24.1.tgz", + "integrity": "sha1-XsWBgnrXI/7N04HxySg5BnbkVRs=", + "requires": { + "babel-helper-function-name": "^6.24.1", + "babel-runtime": "^6.22.0", + "babel-template": "^6.24.1", + "babel-traverse": "^6.24.1", + 
"babel-types": "^6.24.1" + } + }, + "babel-helper-replace-supers": { + "version": "6.24.1", + "resolved": "https://registry.npmjs.org/babel-helper-replace-supers/-/babel-helper-replace-supers-6.24.1.tgz", + "integrity": "sha1-v22/5Dk40XNpohPKiov3S2qQqxo=", + "requires": { + "babel-helper-optimise-call-expression": "^6.24.1", + "babel-messages": "^6.23.0", + "babel-runtime": "^6.22.0", + "babel-template": "^6.24.1", + "babel-traverse": "^6.24.1", + "babel-types": "^6.24.1" + } + }, + "babel-helpers": { + "version": "6.24.1", + "resolved": "https://registry.npmjs.org/babel-helpers/-/babel-helpers-6.24.1.tgz", + "integrity": "sha1-NHHenK7DiOXIUOWX5Yom3fN2ArI=", + "requires": { + "babel-runtime": "^6.22.0", + "babel-template": "^6.24.1" + } + }, + "babel-messages": { + "version": "6.23.0", + "resolved": "https://registry.npmjs.org/babel-messages/-/babel-messages-6.23.0.tgz", + "integrity": "sha1-8830cDhYA1sqKVHG7F7fbGLyYw4=", + "requires": { + "babel-runtime": "^6.22.0" + } + }, + "babel-plugin-check-es2015-constants": { + "version": "6.22.0", + "resolved": "https://registry.npmjs.org/babel-plugin-check-es2015-constants/-/babel-plugin-check-es2015-constants-6.22.0.tgz", + "integrity": "sha1-NRV7EBQm/S/9PaP3XH0ekYNbv4o=", + "requires": { + "babel-runtime": "^6.22.0" + } + }, + "babel-plugin-syntax-async-functions": { + "version": "6.13.0", + "resolved": "https://registry.npmjs.org/babel-plugin-syntax-async-functions/-/babel-plugin-syntax-async-functions-6.13.0.tgz", + "integrity": "sha1-ytnK0RkbWtY0vzCuCHI5HgZHvpU=" + }, + "babel-plugin-syntax-exponentiation-operator": { + "version": "6.13.0", + "resolved": "https://registry.npmjs.org/babel-plugin-syntax-exponentiation-operator/-/babel-plugin-syntax-exponentiation-operator-6.13.0.tgz", + "integrity": "sha1-nufoM3KQ2pUoggGmpX9BcDF4MN4=" + }, + "babel-plugin-syntax-trailing-function-commas": { + "version": "6.22.0", + "resolved": "https://registry.npmjs.org/babel-plugin-syntax-trailing-function-commas/-/babel-plugin-syntax-trailing-function-commas-6.22.0.tgz", + "integrity": "sha1-ugNgk3+NBuQBgKQ/4NVhb/9TLPM=" + }, + "babel-plugin-transform-async-to-generator": { + "version": "6.24.1", + "resolved": "https://registry.npmjs.org/babel-plugin-transform-async-to-generator/-/babel-plugin-transform-async-to-generator-6.24.1.tgz", + "integrity": "sha1-ZTbjeK/2yx1VF6wOQOs+n8jQh2E=", + "requires": { + "babel-helper-remap-async-to-generator": "^6.24.1", + "babel-plugin-syntax-async-functions": "^6.8.0", + "babel-runtime": "^6.22.0" + } + }, + "babel-plugin-transform-es2015-arrow-functions": { + "version": "6.22.0", + "resolved": "https://registry.npmjs.org/babel-plugin-transform-es2015-arrow-functions/-/babel-plugin-transform-es2015-arrow-functions-6.22.0.tgz", + "integrity": "sha1-RSaSy3EdX3ncf4XkQM5BufJE0iE=", + "requires": { + "babel-runtime": "^6.22.0" + } + }, + "babel-plugin-transform-es2015-block-scoped-functions": { + "version": "6.22.0", + "resolved": "https://registry.npmjs.org/babel-plugin-transform-es2015-block-scoped-functions/-/babel-plugin-transform-es2015-block-scoped-functions-6.22.0.tgz", + "integrity": "sha1-u8UbSflk1wy42OC5ToICRs46YUE=", + "requires": { + "babel-runtime": "^6.22.0" + } + }, + "babel-plugin-transform-es2015-block-scoping": { + "version": "6.26.0", + "resolved": "https://registry.npmjs.org/babel-plugin-transform-es2015-block-scoping/-/babel-plugin-transform-es2015-block-scoping-6.26.0.tgz", + "integrity": "sha1-1w9SmcEwjQXBL0Y4E7CgnnOxiV8=", + "requires": { + "babel-runtime": "^6.26.0", + "babel-template": "^6.26.0", + 
"babel-traverse": "^6.26.0", + "babel-types": "^6.26.0", + "lodash": "^4.17.4" + } + }, + "babel-plugin-transform-es2015-classes": { + "version": "6.24.1", + "resolved": "https://registry.npmjs.org/babel-plugin-transform-es2015-classes/-/babel-plugin-transform-es2015-classes-6.24.1.tgz", + "integrity": "sha1-WkxYpQyclGHlZLSyo7+ryXolhNs=", + "requires": { + "babel-helper-define-map": "^6.24.1", + "babel-helper-function-name": "^6.24.1", + "babel-helper-optimise-call-expression": "^6.24.1", + "babel-helper-replace-supers": "^6.24.1", + "babel-messages": "^6.23.0", + "babel-runtime": "^6.22.0", + "babel-template": "^6.24.1", + "babel-traverse": "^6.24.1", + "babel-types": "^6.24.1" + } + }, + "babel-plugin-transform-es2015-computed-properties": { + "version": "6.24.1", + "resolved": "https://registry.npmjs.org/babel-plugin-transform-es2015-computed-properties/-/babel-plugin-transform-es2015-computed-properties-6.24.1.tgz", + "integrity": "sha1-b+Ko0WiV1WNPTNmZttNICjCBWbM=", + "requires": { + "babel-runtime": "^6.22.0", + "babel-template": "^6.24.1" + } + }, + "babel-plugin-transform-es2015-destructuring": { + "version": "6.23.0", + "resolved": "https://registry.npmjs.org/babel-plugin-transform-es2015-destructuring/-/babel-plugin-transform-es2015-destructuring-6.23.0.tgz", + "integrity": "sha1-mXux8auWf2gtKwh2/jWNYOdlxW0=", + "requires": { + "babel-runtime": "^6.22.0" + } + }, + "babel-plugin-transform-es2015-duplicate-keys": { + "version": "6.24.1", + "resolved": "https://registry.npmjs.org/babel-plugin-transform-es2015-duplicate-keys/-/babel-plugin-transform-es2015-duplicate-keys-6.24.1.tgz", + "integrity": "sha1-c+s9MQypaePvnskcU3QabxV2Qj4=", + "requires": { + "babel-runtime": "^6.22.0", + "babel-types": "^6.24.1" + } + }, + "babel-plugin-transform-es2015-for-of": { + "version": "6.23.0", + "resolved": "https://registry.npmjs.org/babel-plugin-transform-es2015-for-of/-/babel-plugin-transform-es2015-for-of-6.23.0.tgz", + "integrity": "sha1-9HyVsrYT3x0+zC/bdXNiPHUkhpE=", + "requires": { + "babel-runtime": "^6.22.0" + } + }, + "babel-plugin-transform-es2015-function-name": { + "version": "6.24.1", + "resolved": "https://registry.npmjs.org/babel-plugin-transform-es2015-function-name/-/babel-plugin-transform-es2015-function-name-6.24.1.tgz", + "integrity": "sha1-g0yJhTvDaxrw86TF26qU/Y6sqos=", + "requires": { + "babel-helper-function-name": "^6.24.1", + "babel-runtime": "^6.22.0", + "babel-types": "^6.24.1" + } + }, + "babel-plugin-transform-es2015-literals": { + "version": "6.22.0", + "resolved": "https://registry.npmjs.org/babel-plugin-transform-es2015-literals/-/babel-plugin-transform-es2015-literals-6.22.0.tgz", + "integrity": "sha1-T1SgLWzWbPkVKAAZox0xklN3yi4=", + "requires": { + "babel-runtime": "^6.22.0" + } + }, + "babel-plugin-transform-es2015-modules-amd": { + "version": "6.24.1", + "resolved": "https://registry.npmjs.org/babel-plugin-transform-es2015-modules-amd/-/babel-plugin-transform-es2015-modules-amd-6.24.1.tgz", + "integrity": "sha1-Oz5UAXI5hC1tGcMBHEvS8AoA0VQ=", + "requires": { + "babel-plugin-transform-es2015-modules-commonjs": "^6.24.1", + "babel-runtime": "^6.22.0", + "babel-template": "^6.24.1" + } + }, + "babel-plugin-transform-es2015-modules-commonjs": { + "version": "6.26.2", + "resolved": "https://registry.npmjs.org/babel-plugin-transform-es2015-modules-commonjs/-/babel-plugin-transform-es2015-modules-commonjs-6.26.2.tgz", + "integrity": "sha512-CV9ROOHEdrjcwhIaJNBGMBCodN+1cfkwtM1SbUHmvyy35KGT7fohbpOxkE2uLz1o6odKK2Ck/tz47z+VqQfi9Q==", + "requires": { + 
"babel-plugin-transform-strict-mode": "^6.24.1", + "babel-runtime": "^6.26.0", + "babel-template": "^6.26.0", + "babel-types": "^6.26.0" + } + }, + "babel-plugin-transform-es2015-modules-systemjs": { + "version": "6.24.1", + "resolved": "https://registry.npmjs.org/babel-plugin-transform-es2015-modules-systemjs/-/babel-plugin-transform-es2015-modules-systemjs-6.24.1.tgz", + "integrity": "sha1-/4mhQrkRmpBhlfXxBuzzBdlAfSM=", + "requires": { + "babel-helper-hoist-variables": "^6.24.1", + "babel-runtime": "^6.22.0", + "babel-template": "^6.24.1" + } + }, + "babel-plugin-transform-es2015-modules-umd": { + "version": "6.24.1", + "resolved": "https://registry.npmjs.org/babel-plugin-transform-es2015-modules-umd/-/babel-plugin-transform-es2015-modules-umd-6.24.1.tgz", + "integrity": "sha1-rJl+YoXNGO1hdq22B9YCNErThGg=", + "requires": { + "babel-plugin-transform-es2015-modules-amd": "^6.24.1", + "babel-runtime": "^6.22.0", + "babel-template": "^6.24.1" + } + }, + "babel-plugin-transform-es2015-object-super": { + "version": "6.24.1", + "resolved": "https://registry.npmjs.org/babel-plugin-transform-es2015-object-super/-/babel-plugin-transform-es2015-object-super-6.24.1.tgz", + "integrity": "sha1-JM72muIcuDp/hgPa0CH1cusnj40=", + "requires": { + "babel-helper-replace-supers": "^6.24.1", + "babel-runtime": "^6.22.0" + } + }, + "babel-plugin-transform-es2015-parameters": { + "version": "6.24.1", + "resolved": "https://registry.npmjs.org/babel-plugin-transform-es2015-parameters/-/babel-plugin-transform-es2015-parameters-6.24.1.tgz", + "integrity": "sha1-V6w1GrScrxSpfNE7CfZv3wpiXys=", + "requires": { + "babel-helper-call-delegate": "^6.24.1", + "babel-helper-get-function-arity": "^6.24.1", + "babel-runtime": "^6.22.0", + "babel-template": "^6.24.1", + "babel-traverse": "^6.24.1", + "babel-types": "^6.24.1" + } + }, + "babel-plugin-transform-es2015-shorthand-properties": { + "version": "6.24.1", + "resolved": "https://registry.npmjs.org/babel-plugin-transform-es2015-shorthand-properties/-/babel-plugin-transform-es2015-shorthand-properties-6.24.1.tgz", + "integrity": "sha1-JPh11nIch2YbvZmkYi5R8U3jiqA=", + "requires": { + "babel-runtime": "^6.22.0", + "babel-types": "^6.24.1" + } + }, + "babel-plugin-transform-es2015-spread": { + "version": "6.22.0", + "resolved": "https://registry.npmjs.org/babel-plugin-transform-es2015-spread/-/babel-plugin-transform-es2015-spread-6.22.0.tgz", + "integrity": "sha1-1taKmfia7cRTbIGlQujdnxdG+NE=", + "requires": { + "babel-runtime": "^6.22.0" + } + }, + "babel-plugin-transform-es2015-sticky-regex": { + "version": "6.24.1", + "resolved": "https://registry.npmjs.org/babel-plugin-transform-es2015-sticky-regex/-/babel-plugin-transform-es2015-sticky-regex-6.24.1.tgz", + "integrity": "sha1-AMHNsaynERLN8M9hJsLta0V8zbw=", + "requires": { + "babel-helper-regex": "^6.24.1", + "babel-runtime": "^6.22.0", + "babel-types": "^6.24.1" + } + }, + "babel-plugin-transform-es2015-template-literals": { + "version": "6.22.0", + "resolved": "https://registry.npmjs.org/babel-plugin-transform-es2015-template-literals/-/babel-plugin-transform-es2015-template-literals-6.22.0.tgz", + "integrity": "sha1-qEs0UPfp+PH2g51taH2oS7EjbY0=", + "requires": { + "babel-runtime": "^6.22.0" + } + }, + "babel-plugin-transform-es2015-typeof-symbol": { + "version": "6.23.0", + "resolved": "https://registry.npmjs.org/babel-plugin-transform-es2015-typeof-symbol/-/babel-plugin-transform-es2015-typeof-symbol-6.23.0.tgz", + "integrity": "sha1-3sCfHN3/lLUqxz1QXITfWdzOs3I=", + "requires": { + "babel-runtime": "^6.22.0" + } + }, + 
"babel-plugin-transform-es2015-unicode-regex": { + "version": "6.24.1", + "resolved": "https://registry.npmjs.org/babel-plugin-transform-es2015-unicode-regex/-/babel-plugin-transform-es2015-unicode-regex-6.24.1.tgz", + "integrity": "sha1-04sS9C6nMj9yk4fxinxa4frrNek=", + "requires": { + "babel-helper-regex": "^6.24.1", + "babel-runtime": "^6.22.0", + "regexpu-core": "^2.0.0" + } + }, + "babel-plugin-transform-exponentiation-operator": { + "version": "6.24.1", + "resolved": "https://registry.npmjs.org/babel-plugin-transform-exponentiation-operator/-/babel-plugin-transform-exponentiation-operator-6.24.1.tgz", + "integrity": "sha1-KrDJx/MJj6SJB3cruBP+QejeOg4=", + "requires": { + "babel-helper-builder-binary-assignment-operator-visitor": "^6.24.1", + "babel-plugin-syntax-exponentiation-operator": "^6.8.0", + "babel-runtime": "^6.22.0" + } + }, + "babel-plugin-transform-regenerator": { + "version": "6.26.0", + "resolved": "https://registry.npmjs.org/babel-plugin-transform-regenerator/-/babel-plugin-transform-regenerator-6.26.0.tgz", + "integrity": "sha1-4HA2lvveJ/Cj78rPi03KL3s6jy8=", + "requires": { + "regenerator-transform": "^0.10.0" + } + }, + "babel-plugin-transform-strict-mode": { + "version": "6.24.1", + "resolved": "https://registry.npmjs.org/babel-plugin-transform-strict-mode/-/babel-plugin-transform-strict-mode-6.24.1.tgz", + "integrity": "sha1-1fr3qleKZbvlkc9e2uBKDGcCB1g=", + "requires": { + "babel-runtime": "^6.22.0", + "babel-types": "^6.24.1" + } + }, + "babel-preset-env": { + "version": "1.6.1", + "resolved": "https://registry.npmjs.org/babel-preset-env/-/babel-preset-env-1.6.1.tgz", + "integrity": "sha512-W6VIyA6Ch9ePMI7VptNn2wBM6dbG0eSz25HEiL40nQXCsXGTGZSTZu1Iap+cj3Q0S5a7T9+529l/5Bkvd+afNA==", + "requires": { + "babel-plugin-check-es2015-constants": "^6.22.0", + "babel-plugin-syntax-trailing-function-commas": "^6.22.0", + "babel-plugin-transform-async-to-generator": "^6.22.0", + "babel-plugin-transform-es2015-arrow-functions": "^6.22.0", + "babel-plugin-transform-es2015-block-scoped-functions": "^6.22.0", + "babel-plugin-transform-es2015-block-scoping": "^6.23.0", + "babel-plugin-transform-es2015-classes": "^6.23.0", + "babel-plugin-transform-es2015-computed-properties": "^6.22.0", + "babel-plugin-transform-es2015-destructuring": "^6.23.0", + "babel-plugin-transform-es2015-duplicate-keys": "^6.22.0", + "babel-plugin-transform-es2015-for-of": "^6.23.0", + "babel-plugin-transform-es2015-function-name": "^6.22.0", + "babel-plugin-transform-es2015-literals": "^6.22.0", + "babel-plugin-transform-es2015-modules-amd": "^6.22.0", + "babel-plugin-transform-es2015-modules-commonjs": "^6.23.0", + "babel-plugin-transform-es2015-modules-systemjs": "^6.23.0", + "babel-plugin-transform-es2015-modules-umd": "^6.23.0", + "babel-plugin-transform-es2015-object-super": "^6.22.0", + "babel-plugin-transform-es2015-parameters": "^6.23.0", + "babel-plugin-transform-es2015-shorthand-properties": "^6.22.0", + "babel-plugin-transform-es2015-spread": "^6.22.0", + "babel-plugin-transform-es2015-sticky-regex": "^6.22.0", + "babel-plugin-transform-es2015-template-literals": "^6.22.0", + "babel-plugin-transform-es2015-typeof-symbol": "^6.23.0", + "babel-plugin-transform-es2015-unicode-regex": "^6.22.0", + "babel-plugin-transform-exponentiation-operator": "^6.22.0", + "babel-plugin-transform-regenerator": "^6.22.0", + "browserslist": "^2.1.2", + "invariant": "^2.2.2", + "semver": "^5.3.0" + }, + "dependencies": { + "browserslist": { + "version": "2.11.3", + "resolved": 
"https://registry.npmjs.org/browserslist/-/browserslist-2.11.3.tgz", + "integrity": "sha512-yWu5cXT7Av6mVwzWc8lMsJMHWn4xyjSuGYi4IozbVTLUOEYPSagUB8kiMDUHA1fS3zjr8nkxkn9jdvug4BBRmA==", + "requires": { + "caniuse-lite": "^1.0.30000792", + "electron-to-chromium": "^1.3.30" + } + } + } + }, + "babel-register": { + "version": "6.26.0", + "resolved": "https://registry.npmjs.org/babel-register/-/babel-register-6.26.0.tgz", + "integrity": "sha1-btAhFz4vy0htestFxgCahW9kcHE=", + "requires": { + "babel-core": "^6.26.0", + "babel-runtime": "^6.26.0", + "core-js": "^2.5.0", + "home-or-tmp": "^2.0.0", + "lodash": "^4.17.4", + "mkdirp": "^0.5.1", + "source-map-support": "^0.4.15" + } + }, + "babel-runtime": { + "version": "6.26.0", + "resolved": "https://registry.npmjs.org/babel-runtime/-/babel-runtime-6.26.0.tgz", + "integrity": "sha1-llxwWGaOgrVde/4E/yM3vItWR/4=", + "requires": { + "core-js": "^2.4.0", + "regenerator-runtime": "^0.11.0" + } + }, + "babel-template": { + "version": "6.26.0", + "resolved": "https://registry.npmjs.org/babel-template/-/babel-template-6.26.0.tgz", + "integrity": "sha1-3gPi0WOWsGn0bdn/+FIfsaDjXgI=", + "requires": { + "babel-runtime": "^6.26.0", + "babel-traverse": "^6.26.0", + "babel-types": "^6.26.0", + "babylon": "^6.18.0", + "lodash": "^4.17.4" + } + }, + "babel-traverse": { + "version": "6.26.0", + "resolved": "https://registry.npmjs.org/babel-traverse/-/babel-traverse-6.26.0.tgz", + "integrity": "sha1-RqnL1+3MYsjlwGTi0tjQ9ANXZu4=", + "requires": { + "babel-code-frame": "^6.26.0", + "babel-messages": "^6.23.0", + "babel-runtime": "^6.26.0", + "babel-types": "^6.26.0", + "babylon": "^6.18.0", + "debug": "^2.6.8", + "globals": "^9.18.0", + "invariant": "^2.2.2", + "lodash": "^4.17.4" + } + }, + "babel-types": { + "version": "6.26.0", + "resolved": "https://registry.npmjs.org/babel-types/-/babel-types-6.26.0.tgz", + "integrity": "sha1-o7Bz+Uq0nrb6Vc1lInozQ4BjJJc=", + "requires": { + "babel-runtime": "^6.26.0", + "esutils": "^2.0.2", + "lodash": "^4.17.4", + "to-fast-properties": "^1.0.3" + } + }, + "babelify": { + "version": "8.0.0", + "resolved": "https://registry.npmjs.org/babelify/-/babelify-8.0.0.tgz", + "integrity": "sha512-xVr63fKEvMWUrrIbqlHYsMcc5Zdw4FSVesAHgkgajyCE1W8gbm9rbMakqavhxKvikGYMhEcqxTwB/gQmQ6lBtw==" + }, + "babylon": { + "version": "6.18.0", + "resolved": "https://registry.npmjs.org/babylon/-/babylon-6.18.0.tgz", + "integrity": "sha512-q/UEjfGJ2Cm3oKV71DJz9d25TPnq5rhBVL2Q4fA5wcC3jcrdn7+SssEybFIxwAvvP+YCsCYNKughoF33GxgycQ==" + }, + "balanced-match": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/balanced-match/-/balanced-match-1.0.0.tgz", + "integrity": "sha1-ibTRmasr7kneFk6gK4nORi1xt2c=" + }, + "base64-js": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/base64-js/-/base64-js-1.3.0.tgz", + "integrity": "sha512-ccav/yGvoa80BQDljCxsmmQ3Xvx60/UpBIij5QN21W3wBi/hhIC9OoO+KLpu9IJTS9j4DRVJ3aDDF9cMSoa2lw==" + }, + "bcrypt-pbkdf": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/bcrypt-pbkdf/-/bcrypt-pbkdf-1.0.2.tgz", + "integrity": "sha1-pDAdOJtqQ/m2f/PKEaP2Y342Dp4=", + "optional": true, + "requires": { + "tweetnacl": "^0.14.3" + } + }, + "bn.js": { + "version": "4.11.8", + "resolved": "https://registry.npmjs.org/bn.js/-/bn.js-4.11.8.tgz", + "integrity": "sha512-ItfYfPLkWHUjckQCk8xC+LwxgK8NYcXywGigJgSwOP8Y2iyWT4f2vsZnoOXTTbo+o5yXmIUJ4gn5538SO5S3gA==" + }, + "boom": { + "version": "2.10.1", + "resolved": "https://registry.npmjs.org/boom/-/boom-2.10.1.tgz", + "integrity": "sha1-OciRjO/1eZ+D+UkqhI9iWt0Mdm8=", + 
"requires": { + "hoek": "2.x.x" + } + }, + "brace-expansion": { + "version": "1.1.11", + "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-1.1.11.tgz", + "integrity": "sha512-iCuPHDFgrHX7H2vEI/5xpz07zSHB00TpugqhmYtVmMO6518mCuRMoOYFldEBl0g187ufozdaHgWKcYFb61qGiA==", + "requires": { + "balanced-match": "^1.0.0", + "concat-map": "0.0.1" + } + }, + "brorand": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/brorand/-/brorand-1.1.0.tgz", + "integrity": "sha1-EsJe/kCkXjwyPrhnWgoM5XsiNx8=" + }, + "browser-pack": { + "version": "6.1.0", + "resolved": "https://registry.npmjs.org/browser-pack/-/browser-pack-6.1.0.tgz", + "integrity": "sha512-erYug8XoqzU3IfcU8fUgyHqyOXqIE4tUTTQ+7mqUjQlvnXkOO6OlT9c/ZoJVHYoAaqGxr09CN53G7XIsO4KtWA==", + "requires": { + "JSONStream": "^1.0.3", + "combine-source-map": "~0.8.0", + "defined": "^1.0.0", + "safe-buffer": "^5.1.1", + "through2": "^2.0.0", + "umd": "^3.0.0" + } + }, + "browser-resolve": { + "version": "1.11.3", + "resolved": "https://registry.npmjs.org/browser-resolve/-/browser-resolve-1.11.3.tgz", + "integrity": "sha512-exDi1BYWB/6raKHmDTCicQfTkqwN5fioMFV4j8BsfMU4R2DK/QfZfK7kOVkmWCNANf0snkBzqGqAJBao9gZMdQ==", + "requires": { + "resolve": "1.1.7" + }, + "dependencies": { + "resolve": { + "version": "1.1.7", + "resolved": "https://registry.npmjs.org/resolve/-/resolve-1.1.7.tgz", + "integrity": "sha1-IDEU2CrSxe2ejgQRs5ModeiJ6Xs=" + } + } + }, + "browserify": { + "version": "14.5.0", + "resolved": "https://registry.npmjs.org/browserify/-/browserify-14.5.0.tgz", + "integrity": "sha512-gKfOsNQv/toWz+60nSPfYzuwSEdzvV2WdxrVPUbPD/qui44rAkB3t3muNtmmGYHqrG56FGwX9SUEQmzNLAeS7g==", + "requires": { + "JSONStream": "^1.0.3", + "assert": "^1.4.0", + "browser-pack": "^6.0.1", + "browser-resolve": "^1.11.0", + "browserify-zlib": "~0.2.0", + "buffer": "^5.0.2", + "cached-path-relative": "^1.0.0", + "concat-stream": "~1.5.1", + "console-browserify": "^1.1.0", + "constants-browserify": "~1.0.0", + "crypto-browserify": "^3.0.0", + "defined": "^1.0.0", + "deps-sort": "^2.0.0", + "domain-browser": "~1.1.0", + "duplexer2": "~0.1.2", + "events": "~1.1.0", + "glob": "^7.1.0", + "has": "^1.0.0", + "htmlescape": "^1.1.0", + "https-browserify": "^1.0.0", + "inherits": "~2.0.1", + "insert-module-globals": "^7.0.0", + "labeled-stream-splicer": "^2.0.0", + "module-deps": "^4.0.8", + "os-browserify": "~0.3.0", + "parents": "^1.0.1", + "path-browserify": "~0.0.0", + "process": "~0.11.0", + "punycode": "^1.3.2", + "querystring-es3": "~0.2.0", + "read-only-stream": "^2.0.0", + "readable-stream": "^2.0.2", + "resolve": "^1.1.4", + "shasum": "^1.0.0", + "shell-quote": "^1.6.1", + "stream-browserify": "^2.0.0", + "stream-http": "^2.0.0", + "string_decoder": "~1.0.0", + "subarg": "^1.0.0", + "syntax-error": "^1.1.1", + "through2": "^2.0.0", + "timers-browserify": "^1.0.1", + "tty-browserify": "~0.0.0", + "url": "~0.11.0", + "util": "~0.10.1", + "vm-browserify": "~0.0.1", + "xtend": "^4.0.0" + } + }, + "browserify-aes": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/browserify-aes/-/browserify-aes-1.2.0.tgz", + "integrity": "sha512-+7CHXqGuspUn/Sl5aO7Ea0xWGAtETPXNSAjHo48JfLdPWcMng33Xe4znFvQweqc/uzk5zSOI3H52CYnjCfb5hA==", + "requires": { + "buffer-xor": "^1.0.3", + "cipher-base": "^1.0.0", + "create-hash": "^1.1.0", + "evp_bytestokey": "^1.0.3", + "inherits": "^2.0.1", + "safe-buffer": "^5.0.1" + } + }, + "browserify-cipher": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/browserify-cipher/-/browserify-cipher-1.0.1.tgz", + 
"integrity": "sha512-sPhkz0ARKbf4rRQt2hTpAHqn47X3llLkUGn+xEJzLjwY8LRs2p0v7ljvI5EyoRO/mexrNunNECisZs+gw2zz1w==", + "requires": { + "browserify-aes": "^1.0.4", + "browserify-des": "^1.0.0", + "evp_bytestokey": "^1.0.0" + } + }, + "browserify-des": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/browserify-des/-/browserify-des-1.0.2.tgz", + "integrity": "sha512-BioO1xf3hFwz4kc6iBhI3ieDFompMhrMlnDFC4/0/vd5MokpuAc3R+LYbwTA9A5Yc9pq9UYPqffKpW2ObuwX5A==", + "requires": { + "cipher-base": "^1.0.1", + "des.js": "^1.0.0", + "inherits": "^2.0.1", + "safe-buffer": "^5.1.2" + } + }, + "browserify-rsa": { + "version": "4.0.1", + "resolved": "https://registry.npmjs.org/browserify-rsa/-/browserify-rsa-4.0.1.tgz", + "integrity": "sha1-IeCr+vbyApzy+vsTNWenAdQTVSQ=", + "requires": { + "bn.js": "^4.1.0", + "randombytes": "^2.0.1" + } + }, + "browserify-sign": { + "version": "4.0.4", + "resolved": "https://registry.npmjs.org/browserify-sign/-/browserify-sign-4.0.4.tgz", + "integrity": "sha1-qk62jl17ZYuqa/alfmMMvXqT0pg=", + "requires": { + "bn.js": "^4.1.1", + "browserify-rsa": "^4.0.0", + "create-hash": "^1.1.0", + "create-hmac": "^1.1.2", + "elliptic": "^6.0.0", + "inherits": "^2.0.1", + "parse-asn1": "^5.0.0" + } + }, + "browserify-zlib": { + "version": "0.2.0", + "resolved": "https://registry.npmjs.org/browserify-zlib/-/browserify-zlib-0.2.0.tgz", + "integrity": "sha512-Z942RysHXmJrhqk88FmKBVq/v5tqmSkDz7p54G/MGyjMnCFFnC79XWNbg+Vta8W6Wb2qtSZTSxIGkJrRpCFEiA==", + "requires": { + "pako": "~1.0.5" + } + }, + "buffer": { + "version": "5.2.0", + "resolved": "https://registry.npmjs.org/buffer/-/buffer-5.2.0.tgz", + "integrity": "sha512-nUJyfChH7PMJy75eRDCCKtszSEFokUNXC1hNVSe+o+VdcgvDPLs20k3v8UXI8ruRYAJiYtyRea8mYyqPxoHWDw==", + "requires": { + "base64-js": "^1.0.2", + "ieee754": "^1.1.4" + } + }, + "buffer-from": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/buffer-from/-/buffer-from-1.1.1.tgz", + "integrity": "sha512-MQcXEUbCKtEo7bhqEs6560Hyd4XaovZlO/k9V3hjVUF/zwW7KBVdSK4gIt/bzwS9MbR5qob+F5jusZsb0YQK2A==" + }, + "buffer-xor": { + "version": "1.0.3", + "resolved": "https://registry.npmjs.org/buffer-xor/-/buffer-xor-1.0.3.tgz", + "integrity": "sha1-JuYe0UIvtw3ULm42cp7VHYVf6Nk=" + }, + "builtin-status-codes": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/builtin-status-codes/-/builtin-status-codes-3.0.0.tgz", + "integrity": "sha1-hZgoeOIbmOHGZCXgPQF0eI9Wnug=" + }, + "cached-path-relative": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/cached-path-relative/-/cached-path-relative-1.0.1.tgz", + "integrity": "sha1-0JxLUoAKpMB44t2BqGmqyQ0uVOc=" + }, + "caniuse-lite": { + "version": "1.0.30000865", + "resolved": "https://registry.npmjs.org/caniuse-lite/-/caniuse-lite-1.0.30000865.tgz", + "integrity": "sha512-vs79o1mOSKRGv/1pSkp4EXgl4ZviWeYReXw60XfacPU64uQWZwJT6vZNmxRF9O+6zu71sJwMxLK5JXxbzuVrLw==" + }, + "caseless": { + "version": "0.12.0", + "resolved": "https://registry.npmjs.org/caseless/-/caseless-0.12.0.tgz", + "integrity": "sha1-G2gcIf+EAzyCZUMJBolCDRhxUdw=", + "optional": true + }, + "chalk": { + "version": "1.1.3", + "resolved": "https://registry.npmjs.org/chalk/-/chalk-1.1.3.tgz", + "integrity": "sha1-qBFcVeSnAv5NFQq9OHKCKn4J/Jg=", + "requires": { + "ansi-styles": "^2.2.1", + "escape-string-regexp": "^1.0.2", + "has-ansi": "^2.0.0", + "strip-ansi": "^3.0.0", + "supports-color": "^2.0.0" + } + }, + "cipher-base": { + "version": "1.0.4", + "resolved": "https://registry.npmjs.org/cipher-base/-/cipher-base-1.0.4.tgz", + "integrity": 
"sha512-Kkht5ye6ZGmwv40uUDZztayT2ThLQGfnj/T71N/XzeZeo3nf8foyW7zGTsPYkEya3m5f3cAypH+qe7YOrM1U2Q==", + "requires": { + "inherits": "^2.0.1", + "safe-buffer": "^5.0.1" + } + }, + "co": { + "version": "4.6.0", + "resolved": "https://registry.npmjs.org/co/-/co-4.6.0.tgz", + "integrity": "sha1-bqa989hTrlTMuOR7+gvz+QMfsYQ=", + "optional": true + }, + "combine-source-map": { + "version": "0.8.0", + "resolved": "https://registry.npmjs.org/combine-source-map/-/combine-source-map-0.8.0.tgz", + "integrity": "sha1-pY0N8ELBhvz4IqjoAV9UUNLXmos=", + "requires": { + "convert-source-map": "~1.1.0", + "inline-source-map": "~0.6.0", + "lodash.memoize": "~3.0.3", + "source-map": "~0.5.3" + }, + "dependencies": { + "convert-source-map": { + "version": "1.1.3", + "resolved": "https://registry.npmjs.org/convert-source-map/-/convert-source-map-1.1.3.tgz", + "integrity": "sha1-SCnId+n+SbMWHzvzZziI4gRpmGA=" + } + } + }, + "combined-stream": { + "version": "1.0.6", + "resolved": "https://registry.npmjs.org/combined-stream/-/combined-stream-1.0.6.tgz", + "integrity": "sha1-cj599ugBrFYTETp+RFqbactjKBg=", + "requires": { + "delayed-stream": "~1.0.0" + } + }, + "concat-map": { + "version": "0.0.1", + "resolved": "https://registry.npmjs.org/concat-map/-/concat-map-0.0.1.tgz", + "integrity": "sha1-2Klr13/Wjfd5OnMDajug1UBdR3s=" + }, + "concat-stream": { + "version": "1.5.2", + "resolved": "https://registry.npmjs.org/concat-stream/-/concat-stream-1.5.2.tgz", + "integrity": "sha1-cIl4Yk2FavQaWnQd790mHadSwmY=", + "requires": { + "inherits": "~2.0.1", + "readable-stream": "~2.0.0", + "typedarray": "~0.0.5" + }, + "dependencies": { + "process-nextick-args": { + "version": "1.0.7", + "resolved": "https://registry.npmjs.org/process-nextick-args/-/process-nextick-args-1.0.7.tgz", + "integrity": "sha1-FQ4gt1ZZCtP5EJPyWk8q2L/zC6M=" + }, + "readable-stream": { + "version": "2.0.6", + "resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-2.0.6.tgz", + "integrity": "sha1-j5A0HmilPMySh4jaz80Rs265t44=", + "requires": { + "core-util-is": "~1.0.0", + "inherits": "~2.0.1", + "isarray": "~1.0.0", + "process-nextick-args": "~1.0.6", + "string_decoder": "~0.10.x", + "util-deprecate": "~1.0.1" + } + }, + "string_decoder": { + "version": "0.10.31", + "resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-0.10.31.tgz", + "integrity": "sha1-YuIDvEF2bGwoyfyEMB2rHFMQ+pQ=" + } + } + }, + "console-browserify": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/console-browserify/-/console-browserify-1.1.0.tgz", + "integrity": "sha1-8CQcRXMKn8YyOyBtvzjtx0HQuxA=", + "requires": { + "date-now": "^0.1.4" + } + }, + "constants-browserify": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/constants-browserify/-/constants-browserify-1.0.0.tgz", + "integrity": "sha1-wguW2MYXdIqvHBYCF2DNJ/y4y3U=" + }, + "convert-source-map": { + "version": "1.5.1", + "resolved": "https://registry.npmjs.org/convert-source-map/-/convert-source-map-1.5.1.tgz", + "integrity": "sha1-uCeAl7m8IpNl3lxiz1/K7YtVmeU=" + }, + "core-js": { + "version": "2.5.7", + "resolved": "https://registry.npmjs.org/core-js/-/core-js-2.5.7.tgz", + "integrity": "sha512-RszJCAxg/PP6uzXVXL6BsxSXx/B05oJAQ2vkJRjyjrEcNVycaqOmNb5OTxZPE3xa5gwZduqza6L9JOCenh/Ecw==" + }, + "core-util-is": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/core-util-is/-/core-util-is-1.0.2.tgz", + "integrity": "sha1-tf1UIgqivFq1eqtxQMlAdUUDwac=" + }, + "create-ecdh": { + "version": "4.0.3", + "resolved": 
"https://registry.npmjs.org/create-ecdh/-/create-ecdh-4.0.3.tgz", + "integrity": "sha512-GbEHQPMOswGpKXM9kCWVrremUcBmjteUaQ01T9rkKCPDXfUHX0IoP9LpHYo2NPFampa4e+/pFDc3jQdxrxQLaw==", + "requires": { + "bn.js": "^4.1.0", + "elliptic": "^6.0.0" + } + }, + "create-hash": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/create-hash/-/create-hash-1.2.0.tgz", + "integrity": "sha512-z00bCGNHDG8mHAkP7CtT1qVu+bFQUPjYq/4Iv3C3kWjTFV10zIjfSoeqXo9Asws8gwSHDGj/hl2u4OGIjapeCg==", + "requires": { + "cipher-base": "^1.0.1", + "inherits": "^2.0.1", + "md5.js": "^1.3.4", + "ripemd160": "^2.0.1", + "sha.js": "^2.4.0" + } + }, + "create-hmac": { + "version": "1.1.7", + "resolved": "https://registry.npmjs.org/create-hmac/-/create-hmac-1.1.7.tgz", + "integrity": "sha512-MJG9liiZ+ogc4TzUwuvbER1JRdgvUFSB5+VR/g5h82fGaIRWMWddtKBHi7/sVhfjQZ6SehlyhvQYrcYkaUIpLg==", + "requires": { + "cipher-base": "^1.0.3", + "create-hash": "^1.1.0", + "inherits": "^2.0.1", + "ripemd160": "^2.0.0", + "safe-buffer": "^5.0.1", + "sha.js": "^2.4.8" + } + }, + "cryptiles": { + "version": "2.0.5", + "resolved": "https://registry.npmjs.org/cryptiles/-/cryptiles-2.0.5.tgz", + "integrity": "sha1-O9/s3GCBR8HGcgL6KR59ylnqo7g=", + "optional": true, + "requires": { + "boom": "2.x.x" + } + }, + "crypto-browserify": { + "version": "3.12.0", + "resolved": "https://registry.npmjs.org/crypto-browserify/-/crypto-browserify-3.12.0.tgz", + "integrity": "sha512-fz4spIh+znjO2VjL+IdhEpRJ3YN6sMzITSBijk6FK2UvTqruSQW+/cCZTSNsMiZNvUeq0CqurF+dAbyiGOY6Wg==", + "requires": { + "browserify-cipher": "^1.0.0", + "browserify-sign": "^4.0.0", + "create-ecdh": "^4.0.0", + "create-hash": "^1.1.0", + "create-hmac": "^1.1.0", + "diffie-hellman": "^5.0.0", + "inherits": "^2.0.1", + "pbkdf2": "^3.0.3", + "public-encrypt": "^4.0.0", + "randombytes": "^2.0.0", + "randomfill": "^1.0.3" + } + }, + "dashdash": { + "version": "1.14.1", + "resolved": "https://registry.npmjs.org/dashdash/-/dashdash-1.14.1.tgz", + "integrity": "sha1-hTz6D3y+L+1d4gMmuN1YEDX24vA=", + "optional": true, + "requires": { + "assert-plus": "^1.0.0" + }, + "dependencies": { + "assert-plus": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/assert-plus/-/assert-plus-1.0.0.tgz", + "integrity": "sha1-8S4PPF13sLHN2RRpQuTpbB5N1SU=", + "optional": true + } + } + }, + "date-now": { + "version": "0.1.4", + "resolved": "https://registry.npmjs.org/date-now/-/date-now-0.1.4.tgz", + "integrity": "sha1-6vQ5/U1ISK105cx9vvIAZyueNFs=" + }, + "debug": { + "version": "2.6.9", + "resolved": "https://registry.npmjs.org/debug/-/debug-2.6.9.tgz", + "integrity": "sha512-bC7ElrdJaJnPbAP+1EotYvqZsb3ecl5wi6Bfi6BJTUcNowp6cvspg0jXznRTKDjm/E7AdgFBVeAPVMNcKGsHMA==", + "requires": { + "ms": "2.0.0" + } + }, + "defined": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/defined/-/defined-1.0.0.tgz", + "integrity": "sha1-yY2bzvdWdBiOEQlpFRGZ45sfppM=" + }, + "delayed-stream": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/delayed-stream/-/delayed-stream-1.0.0.tgz", + "integrity": "sha1-3zrhmayt+31ECqrgsp4icrJOxhk=" + }, + "deps-sort": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/deps-sort/-/deps-sort-2.0.0.tgz", + "integrity": "sha1-CRckkC6EZYJg65EHSMzNGvbiH7U=", + "requires": { + "JSONStream": "^1.0.3", + "shasum": "^1.0.0", + "subarg": "^1.0.0", + "through2": "^2.0.0" + } + }, + "des.js": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/des.js/-/des.js-1.0.0.tgz", + "integrity": "sha1-wHTS4qpqipoH29YfmhXCzYPsjsw=", + "requires": { + 
"inherits": "^2.0.1", + "minimalistic-assert": "^1.0.0" + } + }, + "detect-indent": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/detect-indent/-/detect-indent-4.0.0.tgz", + "integrity": "sha1-920GQ1LN9Docts5hnE7jqUdd4gg=", + "requires": { + "repeating": "^2.0.0" + } + }, + "detective": { + "version": "4.7.1", + "resolved": "https://registry.npmjs.org/detective/-/detective-4.7.1.tgz", + "integrity": "sha512-H6PmeeUcZloWtdt4DAkFyzFL94arpHr3NOwwmVILFiy+9Qd4JTxxXrzfyGk/lmct2qVGBwTSwSXagqu2BxmWig==", + "requires": { + "acorn": "^5.2.1", + "defined": "^1.0.0" + } + }, + "diffie-hellman": { + "version": "5.0.3", + "resolved": "https://registry.npmjs.org/diffie-hellman/-/diffie-hellman-5.0.3.tgz", + "integrity": "sha512-kqag/Nl+f3GwyK25fhUMYj81BUOrZ9IuJsjIcDE5icNM9FJHAVm3VcUDxdLPoQtTuUylWm6ZIknYJwwaPxsUzg==", + "requires": { + "bn.js": "^4.1.0", + "miller-rabin": "^4.0.0", + "randombytes": "^2.0.0" + } + }, + "domain-browser": { + "version": "1.1.7", + "resolved": "https://registry.npmjs.org/domain-browser/-/domain-browser-1.1.7.tgz", + "integrity": "sha1-hnqksJP6oF8d4IwG9NeyH9+GmLw=" + }, + "duplexer2": { + "version": "0.1.4", + "resolved": "https://registry.npmjs.org/duplexer2/-/duplexer2-0.1.4.tgz", + "integrity": "sha1-ixLauHjA1p4+eJEFFmKjL8a93ME=", + "requires": { + "readable-stream": "^2.0.2" + } + }, + "ecc-jsbn": { + "version": "0.1.2", + "resolved": "https://registry.npmjs.org/ecc-jsbn/-/ecc-jsbn-0.1.2.tgz", + "integrity": "sha1-OoOpBOVDUyh4dMVkt1SThoSamMk=", + "optional": true, + "requires": { + "jsbn": "~0.1.0", + "safer-buffer": "^2.1.0" + } + }, + "electron-to-chromium": { + "version": "1.3.55", + "resolved": "https://registry.npmjs.org/electron-to-chromium/-/electron-to-chromium-1.3.55.tgz", + "integrity": "sha1-8VDhCyC3fZ1Br8yjEu/gw7Gn/c4=" + }, + "elliptic": { + "version": "6.4.0", + "resolved": "https://registry.npmjs.org/elliptic/-/elliptic-6.4.0.tgz", + "integrity": "sha1-ysmvh2LIWDYYcAPI3+GT5eLq5d8=", + "requires": { + "bn.js": "^4.4.0", + "brorand": "^1.0.1", + "hash.js": "^1.0.0", + "hmac-drbg": "^1.0.0", + "inherits": "^2.0.1", + "minimalistic-assert": "^1.0.0", + "minimalistic-crypto-utils": "^1.0.0" + } + }, + "errno": { + "version": "0.1.7", + "resolved": "https://registry.npmjs.org/errno/-/errno-0.1.7.tgz", + "integrity": "sha512-MfrRBDWzIWifgq6tJj60gkAwtLNb6sQPlcFrSOflcP1aFmmruKQ2wRnze/8V6kgyz7H3FF8Npzv78mZ7XLLflg==", + "optional": true, + "requires": { + "prr": "~1.0.1" + } + }, + "escape-string-regexp": { + "version": "1.0.5", + "resolved": "https://registry.npmjs.org/escape-string-regexp/-/escape-string-regexp-1.0.5.tgz", + "integrity": "sha1-G2HAViGQqN/2rjuyzwIAyhMLhtQ=" + }, + "esutils": { + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/esutils/-/esutils-2.0.2.tgz", + "integrity": "sha1-Cr9PHKpbyx96nYrMbepPqqBLrJs=" + }, + "events": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/events/-/events-1.1.1.tgz", + "integrity": "sha1-nr23Y1rQmccNzEwqH1AEKI6L2SQ=" + }, + "evp_bytestokey": { + "version": "1.0.3", + "resolved": "https://registry.npmjs.org/evp_bytestokey/-/evp_bytestokey-1.0.3.tgz", + "integrity": "sha512-/f2Go4TognH/KvCISP7OUsHn85hT9nUkxxA9BEWxFn+Oj9o8ZNLm/40hdlgSLyuOimsrTKLUMEorQexp/aPQeA==", + "requires": { + "md5.js": "^1.3.4", + "safe-buffer": "^5.1.1" + } + }, + "extend": { + "version": "3.0.2", + "resolved": "https://registry.npmjs.org/extend/-/extend-3.0.2.tgz", + "integrity": "sha512-fjquC59cD7CyW6urNXK0FBufkZcoiGG80wTuPujX590cB5Ttln20E2UB4S/WARVqhXffZl2LNgS+gQdPIIim/g==", + "optional": true + 
}, + "extsprintf": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/extsprintf/-/extsprintf-1.3.0.tgz", + "integrity": "sha1-lpGEQOMEGnpBT4xS48V06zw+HgU=" + }, + "font-awesome": { + "version": "4.7.0", + "resolved": "https://registry.npmjs.org/font-awesome/-/font-awesome-4.7.0.tgz", + "integrity": "sha1-j6jPBBGhoxr9B7BtKQK7n8gVoTM=" + }, + "forever-agent": { + "version": "0.6.1", + "resolved": "https://registry.npmjs.org/forever-agent/-/forever-agent-0.6.1.tgz", + "integrity": "sha1-+8cfDEGt6zf5bFd60e1C2P2sypE=", + "optional": true + }, + "form-data": { + "version": "2.1.4", + "resolved": "https://registry.npmjs.org/form-data/-/form-data-2.1.4.tgz", + "integrity": "sha1-M8GDrPGTJ27KqYFDpp6Uv+4XUNE=", + "optional": true, + "requires": { + "asynckit": "^0.4.0", + "combined-stream": "^1.0.5", + "mime-types": "^2.1.12" + } + }, + "fs.realpath": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/fs.realpath/-/fs.realpath-1.0.0.tgz", + "integrity": "sha1-FQStJSMVjKpA20onh8sBQRmU6k8=" + }, + "function-bind": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/function-bind/-/function-bind-1.1.1.tgz", + "integrity": "sha512-yIovAzMX49sF8Yl58fSCWJ5svSLuaibPxXQJFLmBObTuCr0Mf1KiPopGM9NiFjiYBCbfaa2Fh6breQ6ANVTI0A==" + }, + "get-assigned-identifiers": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/get-assigned-identifiers/-/get-assigned-identifiers-1.2.0.tgz", + "integrity": "sha512-mBBwmeGTrxEMO4pMaaf/uUEFHnYtwr8FTe8Y/mer4rcV/bye0qGm6pw1bGZFGStxC5O76c5ZAVBGnqHmOaJpdQ==" + }, + "getpass": { + "version": "0.1.7", + "resolved": "https://registry.npmjs.org/getpass/-/getpass-0.1.7.tgz", + "integrity": "sha1-Xv+OPmhNVprkyysSgmBOi6YhSfo=", + "optional": true, + "requires": { + "assert-plus": "^1.0.0" + }, + "dependencies": { + "assert-plus": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/assert-plus/-/assert-plus-1.0.0.tgz", + "integrity": "sha1-8S4PPF13sLHN2RRpQuTpbB5N1SU=", + "optional": true + } + } + }, + "glob": { + "version": "7.1.2", + "resolved": "https://registry.npmjs.org/glob/-/glob-7.1.2.tgz", + "integrity": "sha512-MJTUg1kjuLeQCJ+ccE4Vpa6kKVXkPYJ2mOCQyUuKLcLQsdrMCpBPUi8qVE6+YuaJkozeA9NusTAw3hLr8Xe5EQ==", + "requires": { + "fs.realpath": "^1.0.0", + "inflight": "^1.0.4", + "inherits": "2", + "minimatch": "^3.0.4", + "once": "^1.3.0", + "path-is-absolute": "^1.0.0" + } + }, + "globals": { + "version": "9.18.0", + "resolved": "https://registry.npmjs.org/globals/-/globals-9.18.0.tgz", + "integrity": "sha512-S0nG3CLEQiY/ILxqtztTWH/3iRRdyBLw6KMDxnKMchrtbj2OFmehVh0WUCfW3DUrIgx/qFrJPICrq4Z4sTR9UQ==" + }, + "graceful-fs": { + "version": "4.1.11", + "resolved": "https://registry.npmjs.org/graceful-fs/-/graceful-fs-4.1.11.tgz", + "integrity": "sha1-Dovf5NHduIVNZOBOp8AOKgJuVlg=", + "optional": true + }, + "har-schema": { + "version": "1.0.5", + "resolved": "https://registry.npmjs.org/har-schema/-/har-schema-1.0.5.tgz", + "integrity": "sha1-0mMTX0MwfALGAq/I/pWXDAFRNp4=", + "optional": true + }, + "har-validator": { + "version": "4.2.1", + "resolved": "https://registry.npmjs.org/har-validator/-/har-validator-4.2.1.tgz", + "integrity": "sha1-M0gdDxu/9gDdID11gSpqX7oALio=", + "optional": true, + "requires": { + "ajv": "^4.9.1", + "har-schema": "^1.0.5" + } + }, + "has": { + "version": "1.0.3", + "resolved": "https://registry.npmjs.org/has/-/has-1.0.3.tgz", + "integrity": "sha512-f2dvO0VU6Oej7RkWJGrehjbzMAjFp5/VKPp5tTpWIV4JHHZK1/BxbFRtf/siA2SWTe09caDmVtYYzWEIbBS4zw==", + "requires": { + "function-bind": "^1.1.1" + } + }, + 
"has-ansi": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/has-ansi/-/has-ansi-2.0.0.tgz", + "integrity": "sha1-NPUEnOHs3ysGSa8+8k5F7TVBbZE=", + "requires": { + "ansi-regex": "^2.0.0" + } + }, + "hash-base": { + "version": "3.0.4", + "resolved": "https://registry.npmjs.org/hash-base/-/hash-base-3.0.4.tgz", + "integrity": "sha1-X8hoaEfs1zSZQDMZprCj8/auSRg=", + "requires": { + "inherits": "^2.0.1", + "safe-buffer": "^5.0.1" + } + }, + "hash.js": { + "version": "1.1.5", + "resolved": "https://registry.npmjs.org/hash.js/-/hash.js-1.1.5.tgz", + "integrity": "sha512-eWI5HG9Np+eHV1KQhisXWwM+4EPPYe5dFX1UZZH7k/E3JzDEazVH+VGlZi6R94ZqImq+A3D1mCEtrFIfg/E7sA==", + "requires": { + "inherits": "^2.0.3", + "minimalistic-assert": "^1.0.1" + } + }, + "hawk": { + "version": "3.1.3", + "resolved": "https://registry.npmjs.org/hawk/-/hawk-3.1.3.tgz", + "integrity": "sha1-B4REvXwWQLD+VA0sm3PVlnjo4cQ=", + "optional": true, + "requires": { + "boom": "2.x.x", + "cryptiles": "2.x.x", + "hoek": "2.x.x", + "sntp": "1.x.x" + } + }, + "hmac-drbg": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/hmac-drbg/-/hmac-drbg-1.0.1.tgz", + "integrity": "sha1-0nRXAQJabHdabFRXk+1QL8DGSaE=", + "requires": { + "hash.js": "^1.0.3", + "minimalistic-assert": "^1.0.0", + "minimalistic-crypto-utils": "^1.0.1" + } + }, + "hoek": { + "version": "2.16.3", + "resolved": "https://registry.npmjs.org/hoek/-/hoek-2.16.3.tgz", + "integrity": "sha1-ILt0A9POo5jpHcRxCo/xuCdKJe0=" + }, + "home-or-tmp": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/home-or-tmp/-/home-or-tmp-2.0.0.tgz", + "integrity": "sha1-42w/LSyufXRqhX440Y1fMqeILbg=", + "requires": { + "os-homedir": "^1.0.0", + "os-tmpdir": "^1.0.1" + } + }, + "htmlescape": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/htmlescape/-/htmlescape-1.1.1.tgz", + "integrity": "sha1-OgPtwiFLyjtmQko+eVk0lQnLA1E=" + }, + "http-signature": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/http-signature/-/http-signature-1.1.1.tgz", + "integrity": "sha1-33LiZwZs0Kxn+3at+OE0qPvPkb8=", + "optional": true, + "requires": { + "assert-plus": "^0.2.0", + "jsprim": "^1.2.2", + "sshpk": "^1.7.0" + } + }, + "https-browserify": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/https-browserify/-/https-browserify-1.0.0.tgz", + "integrity": "sha1-7AbBDgo0wPL68Zn3/X/Hj//QPHM=" + }, + "ieee754": { + "version": "1.1.12", + "resolved": "https://registry.npmjs.org/ieee754/-/ieee754-1.1.12.tgz", + "integrity": "sha512-GguP+DRY+pJ3soyIiGPTvdiVXjZ+DbXOxGpXn3eMvNW4x4irjqXm4wHKscC+TfxSJ0yw/S1F24tqdMNsMZTiLA==" + }, + "image-size": { + "version": "0.5.5", + "resolved": "https://registry.npmjs.org/image-size/-/image-size-0.5.5.tgz", + "integrity": "sha1-Cd/Uq50g4p6xw+gLiZA3jfnjy5w=", + "optional": true + }, + "indexof": { + "version": "0.0.1", + "resolved": "https://registry.npmjs.org/indexof/-/indexof-0.0.1.tgz", + "integrity": "sha1-gtwzbSMrkGIXnQWrMpOmYFn9Q10=" + }, + "inflight": { + "version": "1.0.6", + "resolved": "https://registry.npmjs.org/inflight/-/inflight-1.0.6.tgz", + "integrity": "sha1-Sb1jMdfQLQwJvJEKEHW6gWW1bfk=", + "requires": { + "once": "^1.3.0", + "wrappy": "1" + } + }, + "inherits": { + "version": "2.0.3", + "resolved": "https://registry.npmjs.org/inherits/-/inherits-2.0.3.tgz", + "integrity": "sha1-Yzwsg+PaQqUC9SRmAiSA9CCCYd4=" + }, + "inline-source-map": { + "version": "0.6.2", + "resolved": "https://registry.npmjs.org/inline-source-map/-/inline-source-map-0.6.2.tgz", + "integrity": 
"sha1-+Tk0ccGKedFyT4Y/o4tYY3Ct4qU=", + "requires": { + "source-map": "~0.5.3" + } + }, + "insert-module-globals": { + "version": "7.2.0", + "resolved": "https://registry.npmjs.org/insert-module-globals/-/insert-module-globals-7.2.0.tgz", + "integrity": "sha512-VE6NlW+WGn2/AeOMd496AHFYmE7eLKkUY6Ty31k4og5vmA3Fjuwe9v6ifH6Xx/Hz27QvdoMoviw1/pqWRB09Sw==", + "requires": { + "JSONStream": "^1.0.3", + "acorn-node": "^1.5.2", + "combine-source-map": "^0.8.0", + "concat-stream": "^1.6.1", + "is-buffer": "^1.1.0", + "path-is-absolute": "^1.0.1", + "process": "~0.11.0", + "through2": "^2.0.0", + "undeclared-identifiers": "^1.1.2", + "xtend": "^4.0.0" + }, + "dependencies": { + "concat-stream": { + "version": "1.6.2", + "resolved": "https://registry.npmjs.org/concat-stream/-/concat-stream-1.6.2.tgz", + "integrity": "sha512-27HBghJxjiZtIk3Ycvn/4kbJk/1uZuJFfuPEns6LaEvpvG1f0hTea8lilrouyo9mVc2GWdcEZ8OLoGmSADlrCw==", + "requires": { + "buffer-from": "^1.0.0", + "inherits": "^2.0.3", + "readable-stream": "^2.2.2", + "typedarray": "^0.0.6" + } + } + } + }, + "invariant": { + "version": "2.2.4", + "resolved": "https://registry.npmjs.org/invariant/-/invariant-2.2.4.tgz", + "integrity": "sha512-phJfQVBuaJM5raOpJjSfkiD6BpbCE4Ns//LaXl6wGYtUBY83nWS6Rf9tXm2e8VaK60JEjYldbPif/A2B1C2gNA==", + "requires": { + "loose-envify": "^1.0.0" + } + }, + "is-buffer": { + "version": "1.1.6", + "resolved": "https://registry.npmjs.org/is-buffer/-/is-buffer-1.1.6.tgz", + "integrity": "sha512-NcdALwpXkTm5Zvvbk7owOUSvVvBKDgKP5/ewfXEznmQFfs4ZRmanOeKBTjRVjka3QFoN6XJ+9F3USqfHqTaU5w==" + }, + "is-finite": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/is-finite/-/is-finite-1.0.2.tgz", + "integrity": "sha1-zGZ3aVYCvlUO8R6LSqYwU0K20Ko=", + "requires": { + "number-is-nan": "^1.0.0" + } + }, + "is-typedarray": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/is-typedarray/-/is-typedarray-1.0.0.tgz", + "integrity": "sha1-5HnICFjfDBsR3dppQPlgEfzaSpo=", + "optional": true + }, + "isarray": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/isarray/-/isarray-1.0.0.tgz", + "integrity": "sha1-u5NdSFgsuhaMBoNJV6VKPgcSTxE=" + }, + "isstream": { + "version": "0.1.2", + "resolved": "https://registry.npmjs.org/isstream/-/isstream-0.1.2.tgz", + "integrity": "sha1-R+Y/evVa+m+S4VAOaQ64uFKcCZo=", + "optional": true + }, + "jquery": { + "version": "3.3.1", + "resolved": "https://registry.npmjs.org/jquery/-/jquery-3.3.1.tgz", + "integrity": "sha512-Ubldcmxp5np52/ENotGxlLe6aGMvmF4R8S6tZjsP6Knsaxd/xp3Zrh50cG93lR6nPXyUFwzN3ZSOQI0wRJNdGg==" + }, + "js-tokens": { + "version": "3.0.2", + "resolved": "https://registry.npmjs.org/js-tokens/-/js-tokens-3.0.2.tgz", + "integrity": "sha1-mGbfOVECEw449/mWvOtlRDIJwls=" + }, + "jsbn": { + "version": "0.1.1", + "resolved": "https://registry.npmjs.org/jsbn/-/jsbn-0.1.1.tgz", + "integrity": "sha1-peZUwuWi3rXyAdls77yoDA7y9RM=", + "optional": true + }, + "jsesc": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/jsesc/-/jsesc-1.3.0.tgz", + "integrity": "sha1-RsP+yMGJKxKwgz25vHYiF226s0s=" + }, + "json-schema": { + "version": "0.2.3", + "resolved": "https://registry.npmjs.org/json-schema/-/json-schema-0.2.3.tgz", + "integrity": "sha1-tIDIkuWaLwWVTOcnvT8qTogvnhM=", + "optional": true + }, + "json-stable-stringify": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/json-stable-stringify/-/json-stable-stringify-1.0.1.tgz", + "integrity": "sha1-mnWdOcXy/1A/1TAGRu1EX4jE+a8=", + "optional": true, + "requires": { + "jsonify": "~0.0.0" + } + }, + 
"json-stringify-safe": { + "version": "5.0.1", + "resolved": "https://registry.npmjs.org/json-stringify-safe/-/json-stringify-safe-5.0.1.tgz", + "integrity": "sha1-Epai1Y/UXxmg9s4B1lcB4sc1tus=", + "optional": true + }, + "json5": { + "version": "0.5.1", + "resolved": "https://registry.npmjs.org/json5/-/json5-0.5.1.tgz", + "integrity": "sha1-Hq3nrMASA0rYTiOWdn6tn6VJWCE=" + }, + "jsonify": { + "version": "0.0.0", + "resolved": "https://registry.npmjs.org/jsonify/-/jsonify-0.0.0.tgz", + "integrity": "sha1-LHS27kHZPKUbe1qu6PUDYx0lKnM=" + }, + "jsonparse": { + "version": "1.3.1", + "resolved": "https://registry.npmjs.org/jsonparse/-/jsonparse-1.3.1.tgz", + "integrity": "sha1-P02uSpH6wxX3EGL4UhzCOfE2YoA=" + }, + "jsprim": { + "version": "1.4.1", + "resolved": "https://registry.npmjs.org/jsprim/-/jsprim-1.4.1.tgz", + "integrity": "sha1-MT5mvB5cwG5Di8G3SZwuXFastqI=", + "optional": true, + "requires": { + "assert-plus": "1.0.0", + "extsprintf": "1.3.0", + "json-schema": "0.2.3", + "verror": "1.10.0" + }, + "dependencies": { + "assert-plus": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/assert-plus/-/assert-plus-1.0.0.tgz", + "integrity": "sha1-8S4PPF13sLHN2RRpQuTpbB5N1SU=", + "optional": true + } + } + }, + "labeled-stream-splicer": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/labeled-stream-splicer/-/labeled-stream-splicer-2.0.1.tgz", + "integrity": "sha512-MC94mHZRvJ3LfykJlTUipBqenZz1pacOZEMhhQ8dMGcDHs0SBE5GbsavUXV7YtP3icBW17W0Zy1I0lfASmo9Pg==", + "requires": { + "inherits": "^2.0.1", + "isarray": "^2.0.4", + "stream-splicer": "^2.0.0" + }, + "dependencies": { + "isarray": { + "version": "2.0.4", + "resolved": "https://registry.npmjs.org/isarray/-/isarray-2.0.4.tgz", + "integrity": "sha512-GMxXOiUirWg1xTKRipM0Ek07rX+ubx4nNVElTJdNLYmNO/2YrDkgJGw9CljXn+r4EWiDQg/8lsRdHyg2PJuUaA==" + } + } + }, + "less": { + "version": "2.7.3", + "resolved": "https://registry.npmjs.org/less/-/less-2.7.3.tgz", + "integrity": "sha512-KPdIJKWcEAb02TuJtaLrhue0krtRLoRoo7x6BNJIBelO00t/CCdJQUnHW5V34OnHMWzIktSalJxRO+FvytQlCQ==", + "requires": { + "errno": "^0.1.1", + "graceful-fs": "^4.1.2", + "image-size": "~0.5.0", + "mime": "^1.2.11", + "mkdirp": "^0.5.0", + "promise": "^7.1.1", + "request": "2.81.0", + "source-map": "^0.5.3" + } + }, + "lodash": { + "version": "4.17.10", + "resolved": "https://registry.npmjs.org/lodash/-/lodash-4.17.10.tgz", + "integrity": "sha512-UejweD1pDoXu+AD825lWwp4ZGtSwgnpZxb3JDViD7StjQz+Nb/6l093lx4OQ0foGWNRoc19mWy7BzL+UAK2iVg==" + }, + "lodash.memoize": { + "version": "3.0.4", + "resolved": "https://registry.npmjs.org/lodash.memoize/-/lodash.memoize-3.0.4.tgz", + "integrity": "sha1-LcvSwofLwKVcxCMovQxzYVDVPj8=" + }, + "loose-envify": { + "version": "1.4.0", + "resolved": "https://registry.npmjs.org/loose-envify/-/loose-envify-1.4.0.tgz", + "integrity": "sha512-lyuxPGr/Wfhrlem2CL/UcnUc1zcqKAImBDzukY7Y5F/yQiNdko6+fRLevlw1HgMySw7f611UIY408EtxRSoK3Q==", + "requires": { + "js-tokens": "^3.0.0 || ^4.0.0" + } + }, + "md5.js": { + "version": "1.3.4", + "resolved": "https://registry.npmjs.org/md5.js/-/md5.js-1.3.4.tgz", + "integrity": "sha1-6b296UogpawYsENA/Fdk1bCdkB0=", + "requires": { + "hash-base": "^3.0.0", + "inherits": "^2.0.1" + } + }, + "miller-rabin": { + "version": "4.0.1", + "resolved": "https://registry.npmjs.org/miller-rabin/-/miller-rabin-4.0.1.tgz", + "integrity": "sha512-115fLhvZVqWwHPbClyntxEVfVDfl9DLLTuJvq3g2O/Oxi8AiNouAHvDSzHS0viUJc+V5vm3eq91Xwqn9dp4jRA==", + "requires": { + "bn.js": "^4.0.0", + "brorand": "^1.0.1" + } + }, + "mime": { + 
"version": "1.6.0", + "resolved": "https://registry.npmjs.org/mime/-/mime-1.6.0.tgz", + "integrity": "sha512-x0Vn8spI+wuJ1O6S7gnbaQg8Pxh4NNHb7KSINmEWKiPE4RKOplvijn+NkmYmmRgP68mc70j2EbeTFRsrswaQeg==", + "optional": true + }, + "mime-db": { + "version": "1.35.0", + "resolved": "https://registry.npmjs.org/mime-db/-/mime-db-1.35.0.tgz", + "integrity": "sha512-JWT/IcCTsB0Io3AhWUMjRqucrHSPsSf2xKLaRldJVULioggvkJvggZ3VXNNSRkCddE6D+BUI4HEIZIA2OjwIvg==" + }, + "mime-types": { + "version": "2.1.19", + "resolved": "https://registry.npmjs.org/mime-types/-/mime-types-2.1.19.tgz", + "integrity": "sha512-P1tKYHVSZ6uFo26mtnve4HQFE3koh1UWVkp8YUC+ESBHe945xWSoXuHHiGarDqcEZ+whpCDnlNw5LON0kLo+sw==", + "requires": { + "mime-db": "~1.35.0" + } + }, + "minimalistic-assert": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/minimalistic-assert/-/minimalistic-assert-1.0.1.tgz", + "integrity": "sha512-UtJcAD4yEaGtjPezWuO9wC4nwUnVH/8/Im3yEHQP4b67cXlD/Qr9hdITCU1xDbSEXg2XKNaP8jsReV7vQd00/A==" + }, + "minimalistic-crypto-utils": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/minimalistic-crypto-utils/-/minimalistic-crypto-utils-1.0.1.tgz", + "integrity": "sha1-9sAMHAsIIkblxNmd+4x8CDsrWCo=" + }, + "minimatch": { + "version": "3.0.4", + "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-3.0.4.tgz", + "integrity": "sha512-yJHVQEhyqPLUTgt9B83PXu6W3rx4MvvHvSUvToogpwoGDOUQ+yDrR0HRot+yOCdCO7u4hX3pWft6kWBBcqh0UA==", + "requires": { + "brace-expansion": "^1.1.7" + } + }, + "minimist": { + "version": "0.0.8", + "resolved": "https://registry.npmjs.org/minimist/-/minimist-0.0.8.tgz", + "integrity": "sha1-hX/Kv8M5fSYluCKCYuhqp6ARsF0=" + }, + "mkdirp": { + "version": "0.5.1", + "resolved": "https://registry.npmjs.org/mkdirp/-/mkdirp-0.5.1.tgz", + "integrity": "sha1-MAV0OOrGz3+MR2fzhkjWaX11yQM=", + "requires": { + "minimist": "0.0.8" + } + }, + "module-deps": { + "version": "4.1.1", + "resolved": "https://registry.npmjs.org/module-deps/-/module-deps-4.1.1.tgz", + "integrity": "sha1-IyFYM/HaE/1gbMuAh7RIUty4If0=", + "requires": { + "JSONStream": "^1.0.3", + "browser-resolve": "^1.7.0", + "cached-path-relative": "^1.0.0", + "concat-stream": "~1.5.0", + "defined": "^1.0.0", + "detective": "^4.0.0", + "duplexer2": "^0.1.2", + "inherits": "^2.0.1", + "parents": "^1.0.0", + "readable-stream": "^2.0.2", + "resolve": "^1.1.3", + "stream-combiner2": "^1.1.1", + "subarg": "^1.0.0", + "through2": "^2.0.0", + "xtend": "^4.0.0" + } + }, + "ms": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/ms/-/ms-2.0.0.tgz", + "integrity": "sha1-VgiurfwAvmwpAd9fmGF4jeDVl8g=" + }, + "number-is-nan": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/number-is-nan/-/number-is-nan-1.0.1.tgz", + "integrity": "sha1-CXtgK1NCKlIsGvuHkDGDNpQaAR0=" + }, + "oauth-sign": { + "version": "0.8.2", + "resolved": "https://registry.npmjs.org/oauth-sign/-/oauth-sign-0.8.2.tgz", + "integrity": "sha1-Rqarfwrq2N6unsBWV4C31O/rnUM=", + "optional": true + }, + "once": { + "version": "1.4.0", + "resolved": "https://registry.npmjs.org/once/-/once-1.4.0.tgz", + "integrity": "sha1-WDsap3WWHUsROsF9nFC6753Xa9E=", + "requires": { + "wrappy": "1" + } + }, + "os-browserify": { + "version": "0.3.0", + "resolved": "https://registry.npmjs.org/os-browserify/-/os-browserify-0.3.0.tgz", + "integrity": "sha1-hUNzx/XCMVkU/Jv8a9gjj92h7Cc=" + }, + "os-homedir": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/os-homedir/-/os-homedir-1.0.2.tgz", + "integrity": "sha1-/7xJiDNuDoM94MFox+8VISGqf7M=" + }, + 
"os-tmpdir": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/os-tmpdir/-/os-tmpdir-1.0.2.tgz", + "integrity": "sha1-u+Z0BseaqFxc/sdm/lc0VV36EnQ=" + }, + "pako": { + "version": "1.0.6", + "resolved": "https://registry.npmjs.org/pako/-/pako-1.0.6.tgz", + "integrity": "sha512-lQe48YPsMJAig+yngZ87Lus+NF+3mtu7DVOBu6b/gHO1YpKwIj5AWjZ/TOS7i46HD/UixzWb1zeWDZfGZ3iYcg==" + }, + "parents": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/parents/-/parents-1.0.1.tgz", + "integrity": "sha1-/t1NK/GTp3dF/nHjcdc8MwfZx1E=", + "requires": { + "path-platform": "~0.11.15" + } + }, + "parse-asn1": { + "version": "5.1.1", + "resolved": "https://registry.npmjs.org/parse-asn1/-/parse-asn1-5.1.1.tgz", + "integrity": "sha512-KPx7flKXg775zZpnp9SxJlz00gTd4BmJ2yJufSc44gMCRrRQ7NSzAcSJQfifuOLgW6bEi+ftrALtsgALeB2Adw==", + "requires": { + "asn1.js": "^4.0.0", + "browserify-aes": "^1.0.0", + "create-hash": "^1.1.0", + "evp_bytestokey": "^1.0.0", + "pbkdf2": "^3.0.3" + } + }, + "path-browserify": { + "version": "0.0.1", + "resolved": "https://registry.npmjs.org/path-browserify/-/path-browserify-0.0.1.tgz", + "integrity": "sha512-BapA40NHICOS+USX9SN4tyhq+A2RrN/Ws5F0Z5aMHDp98Fl86lX8Oti8B7uN93L4Ifv4fHOEA+pQw87gmMO/lQ==" + }, + "path-is-absolute": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/path-is-absolute/-/path-is-absolute-1.0.1.tgz", + "integrity": "sha1-F0uSaHNVNP+8es5r9TpanhtcX18=" + }, + "path-parse": { + "version": "1.0.5", + "resolved": "https://registry.npmjs.org/path-parse/-/path-parse-1.0.5.tgz", + "integrity": "sha1-PBrfhx6pzWyUMbbqK9dKD/BVxME=" + }, + "path-platform": { + "version": "0.11.15", + "resolved": "https://registry.npmjs.org/path-platform/-/path-platform-0.11.15.tgz", + "integrity": "sha1-6GQhf3TDaFDwhSt43Hv31KVyG/I=" + }, + "pbkdf2": { + "version": "3.0.16", + "resolved": "https://registry.npmjs.org/pbkdf2/-/pbkdf2-3.0.16.tgz", + "integrity": "sha512-y4CXP3thSxqf7c0qmOF+9UeOTrifiVTIM+u7NWlq+PRsHbr7r7dpCmvzrZxa96JJUNi0Y5w9VqG5ZNeCVMoDcA==", + "requires": { + "create-hash": "^1.1.2", + "create-hmac": "^1.1.4", + "ripemd160": "^2.0.1", + "safe-buffer": "^5.0.1", + "sha.js": "^2.4.8" + } + }, + "performance-now": { + "version": "0.2.0", + "resolved": "https://registry.npmjs.org/performance-now/-/performance-now-0.2.0.tgz", + "integrity": "sha1-M+8wxcd9TqIcWlOGnZG1bY8lVeU=", + "optional": true + }, + "private": { + "version": "0.1.8", + "resolved": "https://registry.npmjs.org/private/-/private-0.1.8.tgz", + "integrity": "sha512-VvivMrbvd2nKkiG38qjULzlc+4Vx4wm/whI9pQD35YrARNnhxeiRktSOhSukRLFNlzg6Br/cJPet5J/u19r/mg==" + }, + "process": { + "version": "0.11.10", + "resolved": "https://registry.npmjs.org/process/-/process-0.11.10.tgz", + "integrity": "sha1-czIwDoQBYb2j5podHZGn1LwW8YI=" + }, + "process-nextick-args": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/process-nextick-args/-/process-nextick-args-2.0.0.tgz", + "integrity": "sha512-MtEC1TqN0EU5nephaJ4rAtThHtC86dNN9qCuEhtshvpVBkAW5ZO7BASN9REnF9eoXGcRub+pFuKEpOHE+HbEMw==" + }, + "promise": { + "version": "7.3.1", + "resolved": "https://registry.npmjs.org/promise/-/promise-7.3.1.tgz", + "integrity": "sha512-nolQXZ/4L+bP/UGlkfaIujX9BKxGwmQ9OT4mOt5yvy8iK1h3wqTEJCijzGANTCCl9nWjY41juyAn2K3Q1hLLTg==", + "optional": true, + "requires": { + "asap": "~2.0.3" + } + }, + "prr": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/prr/-/prr-1.0.1.tgz", + "integrity": "sha1-0/wRS6BplaRexok/SEzrHXj19HY=", + "optional": true + }, + "public-encrypt": { + "version": "4.0.2", + 
"resolved": "https://registry.npmjs.org/public-encrypt/-/public-encrypt-4.0.2.tgz", + "integrity": "sha512-4kJ5Esocg8X3h8YgJsKAuoesBgB7mqH3eowiDzMUPKiRDDE7E/BqqZD1hnTByIaAFiwAw246YEltSq7tdrOH0Q==", + "requires": { + "bn.js": "^4.1.0", + "browserify-rsa": "^4.0.0", + "create-hash": "^1.1.0", + "parse-asn1": "^5.0.0", + "randombytes": "^2.0.1" + } + }, + "punycode": { + "version": "1.4.1", + "resolved": "https://registry.npmjs.org/punycode/-/punycode-1.4.1.tgz", + "integrity": "sha1-wNWmOycYgArY4esPpSachN1BhF4=" + }, + "qs": { + "version": "6.4.0", + "resolved": "https://registry.npmjs.org/qs/-/qs-6.4.0.tgz", + "integrity": "sha1-E+JtKK1rD/qpExLNO/cI7TUecjM=", + "optional": true + }, + "querystring": { + "version": "0.2.0", + "resolved": "https://registry.npmjs.org/querystring/-/querystring-0.2.0.tgz", + "integrity": "sha1-sgmEkgO7Jd+CDadW50cAWHhSFiA=" + }, + "querystring-es3": { + "version": "0.2.1", + "resolved": "https://registry.npmjs.org/querystring-es3/-/querystring-es3-0.2.1.tgz", + "integrity": "sha1-nsYfeQSYdXB9aUFFlv2Qek1xHnM=" + }, + "randombytes": { + "version": "2.0.6", + "resolved": "https://registry.npmjs.org/randombytes/-/randombytes-2.0.6.tgz", + "integrity": "sha512-CIQ5OFxf4Jou6uOKe9t1AOgqpeU5fd70A8NPdHSGeYXqXsPe6peOwI0cUl88RWZ6sP1vPMV3avd/R6cZ5/sP1A==", + "requires": { + "safe-buffer": "^5.1.0" + } + }, + "randomfill": { + "version": "1.0.4", + "resolved": "https://registry.npmjs.org/randomfill/-/randomfill-1.0.4.tgz", + "integrity": "sha512-87lcbR8+MhcWcUiQ+9e+Rwx8MyR2P7qnt15ynUlbm3TU/fjbgz4GsvfSUDTemtCCtVCqb4ZcEFlyPNTh9bBTLw==", + "requires": { + "randombytes": "^2.0.5", + "safe-buffer": "^5.1.0" + } + }, + "read-only-stream": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/read-only-stream/-/read-only-stream-2.0.0.tgz", + "integrity": "sha1-JyT9aoET1zdkrCiNQ4YnDB2/F/A=", + "requires": { + "readable-stream": "^2.0.2" + } + }, + "readable-stream": { + "version": "2.3.6", + "resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-2.3.6.tgz", + "integrity": "sha512-tQtKA9WIAhBF3+VLAseyMqZeBjW0AHJoxOtYqSUZNJxauErmLbVm2FW1y+J/YA9dUrAC39ITejlZWhVIwawkKw==", + "requires": { + "core-util-is": "~1.0.0", + "inherits": "~2.0.3", + "isarray": "~1.0.0", + "process-nextick-args": "~2.0.0", + "safe-buffer": "~5.1.1", + "string_decoder": "~1.1.1", + "util-deprecate": "~1.0.1" + }, + "dependencies": { + "string_decoder": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-1.1.1.tgz", + "integrity": "sha512-n/ShnvDi6FHbbVfviro+WojiFzv+s8MPMHBczVePfUpDJLwoLT0ht1l4YwBCbi8pJAveEEdnkHyPyTP/mzRfwg==", + "requires": { + "safe-buffer": "~5.1.0" + } + } + } + }, + "regenerate": { + "version": "1.4.0", + "resolved": "https://registry.npmjs.org/regenerate/-/regenerate-1.4.0.tgz", + "integrity": "sha512-1G6jJVDWrt0rK99kBjvEtziZNCICAuvIPkSiUFIQxVP06RCVpq3dmDo2oi6ABpYaDYaTRr67BEhL8r1wgEZZKg==" + }, + "regenerator-runtime": { + "version": "0.11.1", + "resolved": "https://registry.npmjs.org/regenerator-runtime/-/regenerator-runtime-0.11.1.tgz", + "integrity": "sha512-MguG95oij0fC3QV3URf4V2SDYGJhJnJGqvIIgdECeODCT98wSWDAJ94SSuVpYQUoTcGUIL6L4yNB7j1DFFHSBg==" + }, + "regenerator-transform": { + "version": "0.10.1", + "resolved": "https://registry.npmjs.org/regenerator-transform/-/regenerator-transform-0.10.1.tgz", + "integrity": "sha512-PJepbvDbuK1xgIgnau7Y90cwaAmO/LCLMI2mPvaXq2heGMR3aWW5/BQvYrhJ8jgmQjXewXvBjzfqKcVOmhjZ6Q==", + "requires": { + "babel-runtime": "^6.18.0", + "babel-types": "^6.19.0", + "private": "^0.1.6" + } + 
}, + "regexpu-core": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/regexpu-core/-/regexpu-core-2.0.0.tgz", + "integrity": "sha1-SdA4g3uNz4v6W5pCE5k45uoq4kA=", + "requires": { + "regenerate": "^1.2.1", + "regjsgen": "^0.2.0", + "regjsparser": "^0.1.4" + } + }, + "regjsgen": { + "version": "0.2.0", + "resolved": "https://registry.npmjs.org/regjsgen/-/regjsgen-0.2.0.tgz", + "integrity": "sha1-bAFq3qxVT3WCP+N6wFuS1aTtsfc=" + }, + "regjsparser": { + "version": "0.1.5", + "resolved": "https://registry.npmjs.org/regjsparser/-/regjsparser-0.1.5.tgz", + "integrity": "sha1-fuj4Tcb6eS0/0K4ijSS9lJ6tIFw=", + "requires": { + "jsesc": "~0.5.0" + }, + "dependencies": { + "jsesc": { + "version": "0.5.0", + "resolved": "https://registry.npmjs.org/jsesc/-/jsesc-0.5.0.tgz", + "integrity": "sha1-597mbjXW/Bb3EP6R1c9p9w8IkR0=" + } + } + }, + "repeating": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/repeating/-/repeating-2.0.1.tgz", + "integrity": "sha1-UhTFOpJtNVJwdSf7q0FdvAjQbdo=", + "requires": { + "is-finite": "^1.0.0" + } + }, + "request": { + "version": "2.81.0", + "resolved": "https://registry.npmjs.org/request/-/request-2.81.0.tgz", + "integrity": "sha1-xpKJRqDgbF+Nb4qTM0af/aRimKA=", + "optional": true, + "requires": { + "aws-sign2": "~0.6.0", + "aws4": "^1.2.1", + "caseless": "~0.12.0", + "combined-stream": "~1.0.5", + "extend": "~3.0.0", + "forever-agent": "~0.6.1", + "form-data": "~2.1.1", + "har-validator": "~4.2.1", + "hawk": "~3.1.3", + "http-signature": "~1.1.0", + "is-typedarray": "~1.0.0", + "isstream": "~0.1.2", + "json-stringify-safe": "~5.0.1", + "mime-types": "~2.1.7", + "oauth-sign": "~0.8.1", + "performance-now": "^0.2.0", + "qs": "~6.4.0", + "safe-buffer": "^5.0.1", + "stringstream": "~0.0.4", + "tough-cookie": "~2.3.0", + "tunnel-agent": "^0.6.0", + "uuid": "^3.0.0" + } + }, + "resolve": { + "version": "1.8.1", + "resolved": "https://registry.npmjs.org/resolve/-/resolve-1.8.1.tgz", + "integrity": "sha512-AicPrAC7Qu1JxPCZ9ZgCZlY35QgFnNqc+0LtbRNxnVw4TXvjQ72wnuL9JQcEBgXkI9JM8MsT9kaQoHcpCRJOYA==", + "requires": { + "path-parse": "^1.0.5" + } + }, + "ripemd160": { + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/ripemd160/-/ripemd160-2.0.2.tgz", + "integrity": "sha512-ii4iagi25WusVoiC4B4lq7pbXfAp3D9v5CwfkY33vffw2+pkDjY1D8GaN7spsxvCSx8dkPqOZCEZyfxcmJG2IA==", + "requires": { + "hash-base": "^3.0.0", + "inherits": "^2.0.1" + } + }, + "safe-buffer": { + "version": "5.1.2", + "resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.1.2.tgz", + "integrity": "sha512-Gd2UZBJDkXlY7GbJxfsE8/nvKkUEU1G38c1siN6QP6a9PT9MmHB8GnpscSmMJSoF8LOIrt8ud/wPtojys4G6+g==" + }, + "safer-buffer": { + "version": "2.1.2", + "resolved": "https://registry.npmjs.org/safer-buffer/-/safer-buffer-2.1.2.tgz", + "integrity": "sha512-YZo3K82SD7Riyi0E1EQPojLz7kpepnSQI9IyPbHHg1XXXevb5dJI7tpyN2ADxGcQbHG7vcyRHk0cbwqcQriUtg==" + }, + "semver": { + "version": "5.5.0", + "resolved": "https://registry.npmjs.org/semver/-/semver-5.5.0.tgz", + "integrity": "sha512-4SJ3dm0WAwWy/NVeioZh5AntkdJoWKxHxcmyP622fOkgHa4z3R0TdBJICINyaSDE6uNwVc8gZr+ZinwZAH4xIA==" + }, + "sha.js": { + "version": "2.4.11", + "resolved": "https://registry.npmjs.org/sha.js/-/sha.js-2.4.11.tgz", + "integrity": "sha512-QMEp5B7cftE7APOjk5Y6xgrbWu+WkLVQwk8JNjZ8nKRciZaByEW6MubieAiToS7+dwvrjGhH8jRXz3MVd0AYqQ==", + "requires": { + "inherits": "^2.0.1", + "safe-buffer": "^5.0.1" + } + }, + "shasum": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/shasum/-/shasum-1.0.2.tgz", + "integrity": 
"sha1-5wEjENj0F/TetXEhUOVni4euVl8=", + "requires": { + "json-stable-stringify": "~0.0.0", + "sha.js": "~2.4.4" + }, + "dependencies": { + "json-stable-stringify": { + "version": "0.0.1", + "resolved": "https://registry.npmjs.org/json-stable-stringify/-/json-stable-stringify-0.0.1.tgz", + "integrity": "sha1-YRwj6BTbN1Un34URk9tZ3Sryf0U=", + "requires": { + "jsonify": "~0.0.0" + } + } + } + }, + "shell-quote": { + "version": "1.6.1", + "resolved": "https://registry.npmjs.org/shell-quote/-/shell-quote-1.6.1.tgz", + "integrity": "sha1-9HgZSczkAmlxJ0MOo7PFR29IF2c=", + "requires": { + "array-filter": "~0.0.0", + "array-map": "~0.0.0", + "array-reduce": "~0.0.0", + "jsonify": "~0.0.0" + } + }, + "simple-concat": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/simple-concat/-/simple-concat-1.0.0.tgz", + "integrity": "sha1-c0TLuLbib7J9ZrL8hvn21Zl1IcY=" + }, + "slash": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/slash/-/slash-1.0.0.tgz", + "integrity": "sha1-xB8vbDn8FtHNF61LXYlhFK5HDVU=" + }, + "sntp": { + "version": "1.0.9", + "resolved": "https://registry.npmjs.org/sntp/-/sntp-1.0.9.tgz", + "integrity": "sha1-ZUEYTMkK7qbG57NeJlkIJEPGYZg=", + "optional": true, + "requires": { + "hoek": "2.x.x" + } + }, + "source-map": { + "version": "0.5.7", + "resolved": "https://registry.npmjs.org/source-map/-/source-map-0.5.7.tgz", + "integrity": "sha1-igOdLRAh0i0eoUyA2OpGi6LvP8w=" + }, + "source-map-support": { + "version": "0.4.18", + "resolved": "https://registry.npmjs.org/source-map-support/-/source-map-support-0.4.18.tgz", + "integrity": "sha512-try0/JqxPLF9nOjvSta7tVondkP5dwgyLDjVoyMDlmjugT2lRZ1OfsrYTkCd2hkDnJTKRbO/Rl3orm8vlsUzbA==", + "requires": { + "source-map": "^0.5.6" + } + }, + "sshpk": { + "version": "1.14.2", + "resolved": "https://registry.npmjs.org/sshpk/-/sshpk-1.14.2.tgz", + "integrity": "sha1-xvxhZIo9nE52T9P8306hBeSSupg=", + "optional": true, + "requires": { + "asn1": "~0.2.3", + "assert-plus": "^1.0.0", + "bcrypt-pbkdf": "^1.0.0", + "dashdash": "^1.12.0", + "ecc-jsbn": "~0.1.1", + "getpass": "^0.1.1", + "jsbn": "~0.1.0", + "safer-buffer": "^2.0.2", + "tweetnacl": "~0.14.0" + }, + "dependencies": { + "assert-plus": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/assert-plus/-/assert-plus-1.0.0.tgz", + "integrity": "sha1-8S4PPF13sLHN2RRpQuTpbB5N1SU=", + "optional": true + } + } + }, + "stream-browserify": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/stream-browserify/-/stream-browserify-2.0.1.tgz", + "integrity": "sha1-ZiZu5fm9uZQKTkUUyvtDu3Hlyds=", + "requires": { + "inherits": "~2.0.1", + "readable-stream": "^2.0.2" + } + }, + "stream-combiner2": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/stream-combiner2/-/stream-combiner2-1.1.1.tgz", + "integrity": "sha1-+02KFCDqNidk4hrUeAOXvry0HL4=", + "requires": { + "duplexer2": "~0.1.0", + "readable-stream": "^2.0.2" + } + }, + "stream-http": { + "version": "2.8.3", + "resolved": "https://registry.npmjs.org/stream-http/-/stream-http-2.8.3.tgz", + "integrity": "sha512-+TSkfINHDo4J+ZobQLWiMouQYB+UVYFttRA94FpEzzJ7ZdqcL4uUUQ7WkdkI4DSozGmgBUE/a47L+38PenXhUw==", + "requires": { + "builtin-status-codes": "^3.0.0", + "inherits": "^2.0.1", + "readable-stream": "^2.3.6", + "to-arraybuffer": "^1.0.0", + "xtend": "^4.0.0" + } + }, + "stream-splicer": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/stream-splicer/-/stream-splicer-2.0.0.tgz", + "integrity": "sha1-G2O+Q4oTPktnHMGTUZdgAXWRDYM=", + "requires": { + "inherits": "^2.0.1", + 
"readable-stream": "^2.0.2" + } + }, + "string_decoder": { + "version": "1.0.3", + "resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-1.0.3.tgz", + "integrity": "sha512-4AH6Z5fzNNBcH+6XDMfA/BTt87skxqJlO0lAh3Dker5zThcAxG6mKz+iGu308UKoPPQ8Dcqx/4JhujzltRa+hQ==", + "requires": { + "safe-buffer": "~5.1.0" + } + }, + "stringstream": { + "version": "0.0.6", + "resolved": "https://registry.npmjs.org/stringstream/-/stringstream-0.0.6.tgz", + "integrity": "sha512-87GEBAkegbBcweToUrdzf3eLhWNg06FJTebl4BVJz/JgWy8CvEr9dRtX5qWphiynMSQlxxi+QqN0z5T32SLlhA==", + "optional": true + }, + "strip-ansi": { + "version": "3.0.1", + "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-3.0.1.tgz", + "integrity": "sha1-ajhfuIU9lS1f8F0Oiq+UJ43GPc8=", + "requires": { + "ansi-regex": "^2.0.0" + } + }, + "subarg": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/subarg/-/subarg-1.0.0.tgz", + "integrity": "sha1-9izxdYHplrSPyWVpn1TAauJouNI=", + "requires": { + "minimist": "^1.1.0" + }, + "dependencies": { + "minimist": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/minimist/-/minimist-1.2.0.tgz", + "integrity": "sha1-o1AIsg9BOD7sH7kU9M1d95omQoQ=" + } + } + }, + "supports-color": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-2.0.0.tgz", + "integrity": "sha1-U10EXOa2Nj+kARcIRimZXp3zJMc=" + }, + "syntax-error": { + "version": "1.4.0", + "resolved": "https://registry.npmjs.org/syntax-error/-/syntax-error-1.4.0.tgz", + "integrity": "sha512-YPPlu67mdnHGTup2A8ff7BC2Pjq0e0Yp/IyTFN03zWO0RcK07uLcbi7C2KpGR2FvWbaB0+bfE27a+sBKebSo7w==", + "requires": { + "acorn-node": "^1.2.0" + } + }, + "through": { + "version": "2.3.8", + "resolved": "https://registry.npmjs.org/through/-/through-2.3.8.tgz", + "integrity": "sha1-DdTJ/6q8NXlgsbckEV1+Doai4fU=" + }, + "through2": { + "version": "2.0.3", + "resolved": "https://registry.npmjs.org/through2/-/through2-2.0.3.tgz", + "integrity": "sha1-AARWmzfHx0ujnEPzzteNGtlBQL4=", + "requires": { + "readable-stream": "^2.1.5", + "xtend": "~4.0.1" + } + }, + "timers-browserify": { + "version": "1.4.2", + "resolved": "https://registry.npmjs.org/timers-browserify/-/timers-browserify-1.4.2.tgz", + "integrity": "sha1-ycWLV1voQHN1y14kYtrO50NZ9B0=", + "requires": { + "process": "~0.11.0" + } + }, + "to-arraybuffer": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/to-arraybuffer/-/to-arraybuffer-1.0.1.tgz", + "integrity": "sha1-fSKbH8xjfkZsoIEYCDanqr/4P0M=" + }, + "to-fast-properties": { + "version": "1.0.3", + "resolved": "https://registry.npmjs.org/to-fast-properties/-/to-fast-properties-1.0.3.tgz", + "integrity": "sha1-uDVx+k2MJbguIxsG46MFXeTKGkc=" + }, + "tough-cookie": { + "version": "2.3.4", + "resolved": "https://registry.npmjs.org/tough-cookie/-/tough-cookie-2.3.4.tgz", + "integrity": "sha512-TZ6TTfI5NtZnuyy/Kecv+CnoROnyXn2DN97LontgQpCwsX2XyLYCC0ENhYkehSOwAp8rTQKc/NUIF7BkQ5rKLA==", + "optional": true, + "requires": { + "punycode": "^1.4.1" + } + }, + "trim-right": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/trim-right/-/trim-right-1.0.1.tgz", + "integrity": "sha1-yy4SAwZ+DI3h9hQJS5/kVwTqYAM=" + }, + "tty-browserify": { + "version": "0.0.1", + "resolved": "https://registry.npmjs.org/tty-browserify/-/tty-browserify-0.0.1.tgz", + "integrity": "sha512-C3TaO7K81YvjCgQH9Q1S3R3P3BtN3RIM8n+OvX4il1K1zgE8ZhI0op7kClgkxtutIE8hQrcrHBXvIheqKUUCxw==" + }, + "tunnel-agent": { + "version": "0.6.0", + "resolved": 
"https://registry.npmjs.org/tunnel-agent/-/tunnel-agent-0.6.0.tgz", + "integrity": "sha1-J6XeoGs2sEoKmWZ3SykIaPD8QP0=", + "optional": true, + "requires": { + "safe-buffer": "^5.0.1" + } + }, + "tweetnacl": { + "version": "0.14.5", + "resolved": "https://registry.npmjs.org/tweetnacl/-/tweetnacl-0.14.5.tgz", + "integrity": "sha1-WuaBd/GS1EViadEIr6k/+HQ/T2Q=", + "optional": true + }, + "typedarray": { + "version": "0.0.6", + "resolved": "https://registry.npmjs.org/typedarray/-/typedarray-0.0.6.tgz", + "integrity": "sha1-hnrHTjhkGHsdPUfZlqeOxciDB3c=" + }, + "umd": { + "version": "3.0.3", + "resolved": "https://registry.npmjs.org/umd/-/umd-3.0.3.tgz", + "integrity": "sha512-4IcGSufhFshvLNcMCV80UnQVlZ5pMOC8mvNPForqwA4+lzYQuetTESLDQkeLmihq8bRcnpbQa48Wb8Lh16/xow==" + }, + "undeclared-identifiers": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/undeclared-identifiers/-/undeclared-identifiers-1.1.2.tgz", + "integrity": "sha512-13EaeocO4edF/3JKime9rD7oB6QI8llAGhgn5fKOPyfkJbRb6NFv9pYV6dFEmpa4uRjKeBqLZP8GpuzqHlKDMQ==", + "requires": { + "acorn-node": "^1.3.0", + "get-assigned-identifiers": "^1.2.0", + "simple-concat": "^1.0.0", + "xtend": "^4.0.1" + } + }, + "url": { + "version": "0.11.0", + "resolved": "https://registry.npmjs.org/url/-/url-0.11.0.tgz", + "integrity": "sha1-ODjpfPxgUh63PFJajlW/3Z4uKPE=", + "requires": { + "punycode": "1.3.2", + "querystring": "0.2.0" + }, + "dependencies": { + "punycode": { + "version": "1.3.2", + "resolved": "https://registry.npmjs.org/punycode/-/punycode-1.3.2.tgz", + "integrity": "sha1-llOgNvt8HuQjQvIyXM7v6jkmxI0=" + } + } + }, + "util": { + "version": "0.10.4", + "resolved": "https://registry.npmjs.org/util/-/util-0.10.4.tgz", + "integrity": "sha512-0Pm9hTQ3se5ll1XihRic3FDIku70C+iHUdT/W926rSgHV5QgXsYbKZN8MSC3tJtSkhuROzvsQjAaFENRXr+19A==", + "requires": { + "inherits": "2.0.3" + } + }, + "util-deprecate": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/util-deprecate/-/util-deprecate-1.0.2.tgz", + "integrity": "sha1-RQ1Nyfpw3nMnYvvS1KKJgUGaDM8=" + }, + "uuid": { + "version": "3.3.2", + "resolved": "https://registry.npmjs.org/uuid/-/uuid-3.3.2.tgz", + "integrity": "sha512-yXJmeNaw3DnnKAOKJE51sL/ZaYfWJRl1pK9dr19YFCu0ObS231AB1/LbqTKRAQ5kw8A90rA6fr4riOUpTZvQZA==", + "optional": true + }, + "verror": { + "version": "1.10.0", + "resolved": "https://registry.npmjs.org/verror/-/verror-1.10.0.tgz", + "integrity": "sha1-OhBcoXBTr1XW4nDB+CiGguGNpAA=", + "optional": true, + "requires": { + "assert-plus": "^1.0.0", + "core-util-is": "1.0.2", + "extsprintf": "^1.2.0" + }, + "dependencies": { + "assert-plus": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/assert-plus/-/assert-plus-1.0.0.tgz", + "integrity": "sha1-8S4PPF13sLHN2RRpQuTpbB5N1SU=", + "optional": true + } + } + }, + "vm-browserify": { + "version": "0.0.4", + "resolved": "https://registry.npmjs.org/vm-browserify/-/vm-browserify-0.0.4.tgz", + "integrity": "sha1-XX6kW7755Kb/ZflUOOCofDV9WnM=", + "requires": { + "indexof": "0.0.1" + } + }, + "wrappy": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/wrappy/-/wrappy-1.0.2.tgz", + "integrity": "sha1-tSQ9jz7BqjXxNkYFvA0QNuMKtp8=" + }, + "xtend": { + "version": "4.0.1", + "resolved": "https://registry.npmjs.org/xtend/-/xtend-4.0.1.tgz", + "integrity": "sha1-pcbVMr5lbiPbgg77lDofBJmNY68=" + } + } +} diff --git a/d3m/site/package.json b/d3m/site/package.json new file mode 100644 index 0000000..a058bed --- /dev/null +++ b/d3m/site/package.json @@ -0,0 +1,11 @@ +{ + "dependencies": { + "less": "~2.7.3", + 
"browserify": "~14.5.0", + "babel-core": "~6.26.0", + "babel-preset-env": "~1.6.1", + "babelify": "~8.0.0", + "jquery": "~3.3.1", + "font-awesome": "~4.7.0" + } +} diff --git a/d3m/site/requirements.txt b/d3m/site/requirements.txt new file mode 100644 index 0000000..471205f --- /dev/null +++ b/d3m/site/requirements.txt @@ -0,0 +1,2 @@ +pyquery==1.4.0 +yattag==1.10.0 diff --git a/d3m/site/schema-org.css b/d3m/site/schema-org.css new file mode 100644 index 0000000..43fe67f --- /dev/null +++ b/d3m/site/schema-org.css @@ -0,0 +1,152 @@ + +/* For hiding the last `>` in breadcrumbs. */ +.hide-last:last-child { + display:none; +} + +/* Below are mostly copies from css at `schema.org`. */ +body { + color: #3A4956; + font-family: "Lucida Grande", "Lucida Sans Unicode", Verdana, Tahoma, Arial, sans-serif; + line-height: 160%; + margin: 0 0 100px 0; + padding: 0; + text-align: center; +} + +h1 { + font: bold 24px Helvetica, Arial, sans-serif; + color: #990000; + letter-spacing: -1px; + margin: 1em 0 0 0; +} + +#container { + width: 100%; + text-align: left; + margin: 0; + background: #fff; +} + +#intro { + position: relative; +} + +#pageHeader { + width: 100%; + height: 80px; + background: #990000; +} + +#pageHeader h1 { + color: #fff; + margin: 0; + padding-top: 25px; + font-family: bold Helvetica, Arial, sans-serif; + letter-spacing: -1px; + text-shadow: 0 2px 0 #510000; +} + +#pageHeader a:link, #pageHeader a:hover, #pageHeader a:visited { + color: #fff; + background-color: #990000; + text-decoration: none; +} + +@media all and (max-width: 720px) { + #pageHeader { + height: 120px; + } +} + +#pageHeader h1 { + color: #fff; + margin: 0; + padding-top: 25px; + font-family: bold Helvetica, Arial, sans-serif; + letter-spacing: -1px; + text-shadow: 0 2px 0 #510000; +} + +#mainContent, #footer, .wrapper { + margin: 0 auto !important; + padding: 0 0.5em; +} + +@media (min-width: 960px) { + #mainContent, #footer, .wrapper { + max-width: 960px; + padding: 0 1em; + } +} + +#sitename { + max-width: 500px; + min-width: auto; + display: inline-block; + text-shadow: 0 2px 0 #510000; + padding: 0; + top: 25px; + left: -40px; +} + +#mainContent { + text-align: left; + font-size: 100%; +} + +#selectionbar { + color: #fff; + height: 46px; + background: #660000; + font-size: 90%; +} + +#selectionbar ul { + float: right; + padding: 10px 0; + margin: 0 auto; + display: block; +} + +#selectionbar li { + display: inline; + list-style: none; +} + +#selectionbar a:link, #selectionbar a:visited { + color: #fff; + display: block; + float: right; + padding: 1px 9px 3px 6px; + margin: 0 6px; + text-decoration: none; +} + +#selectionbar a:hover { + color: #FFEE99; + background-color: transparent; +} + +#selectionbar .activelink a { + background: #990000; +} + +a:link { + color: #660000; + text-decoration: none; + border-bottom: dotted 1px #660000; +} + +a:visited { + color: #990000; + text-decoration: none; + border-bottom: dotted 1px #990000; +} + +a:hover { + border-bottom: none; + color: #fff; + background-color: #660000; + text-decoration: none; +} diff --git a/d3m/site/static/index.html b/d3m/site/static/index.html new file mode 100644 index 0000000..3d3c78f --- /dev/null +++ b/d3m/site/static/index.html @@ -0,0 +1,23 @@ + + + + D3M Metadata + + + + + + +

    + + shrink all + expand all +
    hold Ctrl while clicking '+' to expand the whole subtree
    + +
    + + go to top + + + + diff --git a/d3m/site/static/schemas b/d3m/site/static/schemas new file mode 120000 index 0000000..eb7c906 --- /dev/null +++ b/d3m/site/static/schemas @@ -0,0 +1 @@ +../../d3m/metadata/schemas \ No newline at end of file diff --git a/d3m/tests/asv.conf.json b/d3m/tests/asv.conf.json new file mode 100644 index 0000000..eb00b4b --- /dev/null +++ b/d3m/tests/asv.conf.json @@ -0,0 +1,161 @@ +{ + // The version of the config file format. Do not change, unless + // you know what you are doing. + "version": 1, + + // The name of the project being benchmarked + "project": "d3m", + + // The project's homepage + "project_url": "https://gitlab.com/datadrivendiscovery/d3m/", + + // The URL or local path of the source code repository for the + // project being benchmarked + "repo": "..", + + // The Python project's subdirectory in your repo. If missing or + // the empty string, the project is assumed to be located at the root + // of the repository. + // "repo_subdir": "", + + // Customizable commands for building, installing, and + // uninstalling the project. See asv.conf.json documentation. + // + // "install_command": ["python -mpip install {wheel_file}"], + // "uninstall_command": ["return-code=any python -mpip uninstall -y {project}"], + // "build_command": [ + // "python setup.py build", + // "PIP_NO_BUILD_ISOLATION=false python -mpip wheel --no-deps --no-index -w {build_cache_dir} {build_dir}" + // ], + + // List of branches to benchmark. If not provided, defaults to "master" + // (for git) or "default" (for mercurial). + // "branches": ["master"], // for git + // "branches": ["default"], // for mercurial + + // The DVCS being used. If not set, it will be automatically + // determined from "repo" by looking at the protocol in the URL + // (if remote), or by looking for special directories, such as + // ".git" (if local). + // "dvcs": "git", + + // The tool to use to create environments. May be "conda", + // "virtualenv" or other value depending on the plugins in use. + // If missing or the empty string, the tool will be automatically + // determined by looking for tools on the PATH environment + // variable. + "environment_type": "virtualenv", + + // timeout in seconds for installing any dependencies in environment + // defaults to 10 min + //"install_timeout": 600, + + // the base URL to show a commit for the project. + // "show_commit_url": "http://github.com/owner/project/commit/", + "show_commit_url": "https://gitlab.com/datadrivendiscovery/d3m/commit/", + + // The Pythons you'd like to test against. If not provided, defaults + // to the current version of Python used to run `asv`. + // "pythons": ["2.7", "3.6"], + + // The list of conda channel names to be searched for benchmark + // dependency packages in the specified order + // "conda_channels": ["conda-forge", "defaults"] + + // The matrix of dependencies to test. Each key is the name of a + // package (in PyPI) and the values are version numbers. An empty + // list or empty string indicates to just test against the default + // (latest) version. null indicates that the package is to not be + // installed. If the package to be tested is only available from + // PyPi, and the 'environment_type' is conda, then you can preface + // the package name by 'pip+', and the package will be installed via + // pip (with all the conda available packages installed first, + // followed by the pip installed packages). 
+ // + // "matrix": { + // "numpy": ["1.6", "1.7"], + // "six": ["", null], // test with and without six installed + // "pip+emcee": [""], // emcee is only available for install with pip. + // }, + + // Combinations of libraries/python versions can be excluded/included + // from the set to test. Each entry is a dictionary containing additional + // key-value pairs to include/exclude. + // + // An exclude entry excludes entries where all values match. The + // values are regexps that should match the whole string. + // + // An include entry adds an environment. Only the packages listed + // are installed. The 'python' key is required. The exclude rules + // do not apply to includes. + // + // In addition to package names, the following keys are available: + // + // - python + // Python version, as in the *pythons* variable above. + // - environment_type + // Environment type, as above. + // - sys_platform + // Platform, as in sys.platform. Possible values for the common + // cases: 'linux2', 'win32', 'cygwin', 'darwin'. + // + // "exclude": [ + // {"python": "3.2", "sys_platform": "win32"}, // skip py3.2 on windows + // {"environment_type": "conda", "six": null}, // don't run without six on conda + // ], + // + // "include": [ + // // additional env for python2.7 + // {"python": "2.7", "numpy": "1.8"}, + // // additional env if run on windows+conda + // {"platform": "win32", "environment_type": "conda", "python": "2.7", "libpython": ""}, + // ], + + // The directory (relative to the current directory) that benchmarks are + // stored in. If not provided, defaults to "benchmarks" + // "benchmark_dir": "benchmarks", + + // The directory (relative to the current directory) to cache the Python + // environments in. If not provided, defaults to "env" + "env_dir": ".asv/env", + + // The directory (relative to the current directory) that raw benchmark + // results are stored in. If not provided, defaults to "results". + "results_dir": ".asv/results", + + // The directory (relative to the current directory) that the html tree + // should be written to. If not provided, defaults to "html". + "html_dir": ".asv/html", + + // The number of characters to retain in the commit hashes. + // "hash_length": 8, + + // `asv` will cache results of the recent builds in each + // environment, making them faster to install next time. This is + // the number of builds to keep, per environment. + // "build_cache_size": 2, + + // The commits after which the regression search in `asv publish` + // should start looking for regressions. Dictionary whose keys are + // regexps matching to benchmark names, and values corresponding to + // the commit (exclusive) after which to start looking for + // regressions. The default is to start from the first commit + // with results. If the commit is `null`, regression detection is + // skipped for the matching benchmark. + // + // "regressions_first_commits": { + // "some_benchmark": "352cdf", // Consider regressions only after this commit + // "another_benchmark": null, // Skip regression detection altogether + // }, + + // The thresholds for relative change in results, after which `asv + // publish` starts reporting regressions. Dictionary of the same + // form as in ``regressions_first_commits``, with values + // indicating the thresholds. If multiple entries match, the + // maximum is taken. If no entry matches, the default is 5%. 
+ // + // "regressions_thresholds": { + // "some_benchmark": 0.01, // Threshold of 1% + // "another_benchmark": 0.5, // Threshold of 50% + // }, +} diff --git a/d3m/tests/benchmarks/__init__.py b/d3m/tests/benchmarks/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/d3m/tests/benchmarks/base_utils.py b/d3m/tests/benchmarks/base_utils.py new file mode 100644 index 0000000..648eb5f --- /dev/null +++ b/d3m/tests/benchmarks/base_utils.py @@ -0,0 +1,41 @@ +from d3m import container +from d3m.base import utils as base_utils + + +class CombineColumns: + params = [[100, 300, 500, 700, 900]] + param_names = ['columns'] + + def setup(self, columns): + self.large_dataframe_with_many_columns = container.DataFrame({str(i): [j for j in range(5)] for i in range(columns)}, columns=[str(i) for i in range(columns)], generate_metadata=True) + self.list_of_many_dataframe_columns = [ + container.DataFrame({str(i): [j for j in range(5, 10)]}, columns=[str(i)], generate_metadata=True) + for i in range(int(columns / 2)) + ] + + def time_append(self, columns): + base_utils.combine_columns( + self.large_dataframe_with_many_columns, + list(range(int(columns / 4), int(columns / 2))), # Just 1/4 of columns. + self.list_of_many_dataframe_columns, + return_result='append', + add_index_columns=True, + ) + + def time_replace(self, columns): + base_utils.combine_columns( + self.large_dataframe_with_many_columns, + list(range(int(columns / 4), int(columns / 2))), # Just 1/4 of columns. + self.list_of_many_dataframe_columns, + return_result='replace', + add_index_columns=True, + ) + + def time_new(self, columns): + base_utils.combine_columns( + self.large_dataframe_with_many_columns, + list(range(int(columns / 4), int(columns / 2))), # Just 1/4 of columns. + self.list_of_many_dataframe_columns, + return_result='new', + add_index_columns=True, + ) diff --git a/d3m/tests/benchmarks/containers.py b/d3m/tests/benchmarks/containers.py new file mode 100644 index 0000000..845af4b --- /dev/null +++ b/d3m/tests/benchmarks/containers.py @@ -0,0 +1,38 @@ +from d3m import container + + +class ContainersWithMetadata: + params = [[True, False], [100, 300, 500, 700, 900]] + param_names = ['compact', 'columns'] + + def setup(self, compact, columns): + # Compacting a DataFrame with more than 300 columns timeouts the benchmark and fails it. + # By raising a "NotImplementedError exception such combinations are skipped. 
+ if compact and columns > 300: + raise NotImplementedError + + def time_dataframe(self, compact, columns): + df = container.DataFrame({str(i): [j for j in range(5)] for i in range(columns)}, columns=[str(i) for i in range(columns)], generate_metadata=False) + df.metadata.generate(df, compact=compact) + + def time_columns(self, compact, columns): + dfs = [ + container.DataFrame({str(i): [j for j in range(5, 10)]}, columns=[str(i)], generate_metadata=False) + for i in range(int(columns / 2)) + ] + for df in dfs: + df.metadata.generate(df, compact=compact) + + +class ContainersWithoutMetadata: + params = [[100, 300, 500, 700, 900]] + param_names = ['columns'] + + def time_dataframe(self, columns): + container.DataFrame({str(i): [j for j in range(5)] for i in range(columns)}, columns=[str(i) for i in range(columns)], generate_metadata=False) + + def time_columns(self, columns): + [ + container.DataFrame({str(i): [j for j in range(5, 10)]}, columns=[str(i)], generate_metadata=False) + for i in range(int(columns / 2)) + ] diff --git a/d3m/tests/benchmarks/metadata.py b/d3m/tests/benchmarks/metadata.py new file mode 100644 index 0000000..1e75d06 --- /dev/null +++ b/d3m/tests/benchmarks/metadata.py @@ -0,0 +1,195 @@ +import os +import tempfile + +import numpy +import pandas + +from d3m import container, utils + +oneformat_dataset_json = """ +{ + "about": { + "datasetID": "benchmark_dataset", + "datasetName": "benchmark_dataset_name", + "license": "Unknown", + "datasetSchemaVersion": "3.2.0", + "redacted": false + }, + "dataResources": [ + { + "resID": "0", + "resPath": "media/", + "resType": "image", + "resFormat": [ + "image/png" + ], + "isCollection": true + } + ] +} +""" + +twoformats_dataset_json = """ +{ + "about": { + "datasetID": "benchmark_dataset", + "datasetName": "benchmark_dataset_name", + "license": "Unknown", + "datasetSchemaVersion": "3.2.0", + "redacted": false + }, + "dataResources": [ + { + "resID": "0", + "resPath": "media/", + "resType": "image", + "resFormat": [ + "image/png", + "image/jpeg" + ], + "isCollection": true + } + ] +} +""" + + +def create_oneformat_dataset(dataset_folder, n): + media_folder = os.path.join(dataset_folder, 'media') + os.makedirs(media_folder, mode=0o777, exist_ok=False) + + dataset_doc_file_path = os.path.join(dataset_folder, 'datasetDoc.json') + dataset_doc_file_uri = "file://" + os.path.abspath(dataset_doc_file_path) + + with open(dataset_doc_file_path, 'w') as f: + f.write(oneformat_dataset_json) + + filenames = ["image_{x}.png".format(x=x) for x in range(n)] + + for filename in filenames: + with open(os.path.join(media_folder, filename), 'w') as f: + pass + + return dataset_doc_file_uri + + +def create_twoformats_dataset(dataset_folder, n): + media_folder = os.path.join(dataset_folder, 'media') + os.makedirs(media_folder, mode=0o777, exist_ok=False) + + dataset_doc_file_path = os.path.join(dataset_folder, 'datasetDoc.json') + dataset_doc_file_uri = "file://" + os.path.abspath(dataset_doc_file_path) + + with open(dataset_doc_file_path, 'w') as f: + f.write(twoformats_dataset_json) + + filenames = ["image_{x}.{ext}".format(x=x, ext='png' if x % 2 else 'jpeg') for x in range(n)] + + for filename in filenames: + with open(os.path.join(media_folder, filename), 'w') as f: + pass + + return dataset_doc_file_uri + + +class OneFormatDataset: + params = [[True, False], [10000, 30000, 50000]] + param_names = ['compute_digest', 'dataset_files'] + + def setup(self, compute_digest, dataset_files): + self.temp_directory = tempfile.TemporaryDirectory() + + 
self.dataset_doc_file_uri = create_oneformat_dataset(self.temp_directory.name, dataset_files) + + def teardown(self, compute_digest, dataset_files): + self.temp_directory.cleanup() + + def time_dataset_load(self, compute_digest, dataset_files): + container.dataset.Dataset.load(self.dataset_doc_file_uri, compute_digest=container.ComputeDigest.ALWAYS if compute_digest else container.ComputeDigest.NEVER) + + +class TwoFormatsDataset: + params = [[True, False], [10000, 30000, 50000]] + param_names = ['compute_digest', 'dataset_files'] + + def setup(self, compute_digest, dataset_files): + self.temp_directory = tempfile.TemporaryDirectory() + + self.dataset_doc_file_uri = create_twoformats_dataset(self.temp_directory.name, dataset_files) + + def teardown(self, compute_digest, dataset_files): + self.temp_directory.cleanup() + + def time_dataset_load(self, compute_digest, dataset_files): + container.dataset.Dataset.load(self.dataset_doc_file_uri, compute_digest=container.ComputeDigest.ALWAYS if compute_digest else container.ComputeDigest.NEVER) + + +class DatasetToJsonStructure: + params = [[10000, 30000, 50000]] + param_names = ['dataset_files'] + + def setup(self, dataset_files): + self.temp_directory = tempfile.TemporaryDirectory() + + dataset_doc_file_uri = create_twoformats_dataset(self.temp_directory.name, dataset_files) + + self.dataset_metadata = container.dataset.Dataset.load(dataset_doc_file_uri).metadata + + def teardown(self, dataset_files): + self.temp_directory.cleanup() + + def time_to_json_structure(self, dataset_files): + self.dataset_metadata.to_internal_json_structure() + + def time_to_simple_structure_without_json(self, dataset_files): + self.dataset_metadata.to_internal_simple_structure() + + def time_to_simple_structure_with_json(self, dataset_files): + utils.to_json_structure(self.dataset_metadata.to_internal_simple_structure()) + + +class MetadataGeneration: + params = [[True, False]] + param_names = ['compact'] + + def setup(self, compact): + self.large_dataframe_with_objects = pandas.DataFrame({str(i): [str(j) for j in range(10000)] for i in range(50)}, columns=[str(i) for i in range(50)]) + self.large_list_with_objects = [container.List([str(j) for i in range(50)]) for j in range(10000)] + self.large_ndarray_with_objects = numpy.array([[[str(k) for k in range(5)] for i in range(10)] for j in range(10000)], dtype=object) + self.large_dict_with_objects = {str(i): {str(j): j for j in range(10000)} for i in range(50)} + + def time_large_dataframe_with_objects(self, compact): + df = container.DataFrame(self.large_dataframe_with_objects, generate_metadata=False) + df.metadata.generate(df, compact=compact) + + def time_large_list_with_objects(self, compact): + l = container.List(self.large_list_with_objects, generate_metadata=False) + l.metadata.generate(l, compact=compact) + + def time_large_ndarray_with_objects(self, compact): + a = container.ndarray(self.large_ndarray_with_objects, generate_metadata=False) + a.metadata.generate(a, compact=compact) + + def time_large_dict_with_objects(self, compact): + l = container.List([self.large_dict_with_objects], generate_metadata=False) + l.metadata.generate(l, compact=compact) + + +class MetadataToJsonStructure: + def setup(self): + self.large_dataframe = container.DataFrame(pandas.DataFrame({str(i): [str(j) for j in range(10000)] for i in range(50)}, columns=[str(i) for i in range(50)]), generate_metadata=True) + self.large_list = container.List([container.List([str(j) for i in range(50)]) for j in range(10000)], 
generate_metadata=True) + self.large_ndarray = container.ndarray(numpy.array([[[str(k) for k in range(5)] for i in range(10)] for j in range(10000)], dtype=object), generate_metadata=True) + self.large_dict_list = container.List({str(i): {str(j): j for j in range(10000)} for i in range(50)}, generate_metadata=True) + + def time_large_dataframe(self): + self.large_dataframe.metadata.to_internal_json_structure() + + def time_large_list(self): + self.large_list.metadata.to_internal_json_structure() + + def time_large_ndarray(self): + self.large_ndarray.metadata.to_internal_json_structure() + + def time_large_dict_list(self): + self.large_dict_list.metadata.to_internal_json_structure() diff --git a/d3m/tests/benchmarks/primitive.py b/d3m/tests/benchmarks/primitive.py new file mode 100644 index 0000000..d1e918f --- /dev/null +++ b/d3m/tests/benchmarks/primitive.py @@ -0,0 +1,32 @@ +import os +import sys + +TEST_PRIMITIVES_DIR = os.path.join(os.path.dirname(__file__), '..') + +sys.path.insert(0, TEST_PRIMITIVES_DIR) + +from test_pipeline import MockPrimitiveBuilder, MockPipelineBuilder + + +class Primitive: + def time_hash(self): + primitive_1 = MockPrimitiveBuilder({ + 'dataset': {'type': 'CONTAINER'}, + 'mean': {'type': 'CONTAINER'}, + }, {}) + primitive_2 = MockPrimitiveBuilder({ + 'a': {'type': 'CONTAINER'}, + 'b': {'type': 'CONTAINER'}, + }, {}) + + builder = MockPipelineBuilder(['input_0', 'input_1', 'input_2']) + builder.add_primitive(primitive_1.build(dataset='inputs.0', mean='inputs.2'), outputs=['produce']) + builder.add_primitive(primitive_1.build(dataset='inputs.1', mean='inputs.2'), outputs=['produce']) + builder.add_primitive(primitive_2.build(a='steps.0.produce', b='steps.1.produce'), outputs=['produce']) + builder.add_primitive(primitive_2.build(a='steps.1.produce', b='steps.0.produce'), outputs=['produce']) + builder.add_primitive(primitive_2.build(a='steps.2.produce', b='steps.3.produce'), outputs=['produce']) + builder.add_output('output', 'steps.4.produce') + pipeline_1 = builder.build() + + for _ in range(1000): + pipeline_1.hash() diff --git a/d3m/tests/benchmarks/sampling.py b/d3m/tests/benchmarks/sampling.py new file mode 100644 index 0000000..ea8e414 --- /dev/null +++ b/d3m/tests/benchmarks/sampling.py @@ -0,0 +1,36 @@ +import numpy + +from d3m.metadata import hyperparams + + +class Sampling: + def setup(self): + self.numerical = hyperparams.Uniform( + lower=0, + upper=1, + default=0.5, + ) + self.enumeration = hyperparams.Enumeration( + values=list(range(1000)), + default=0, + ) + + def time_numerical_sampling(self): + random_state = numpy.random.RandomState(0) + for i in range(100000): + self.numerical.sample(random_state) + + def time_numerical_sample_multiple(self): + random_state = numpy.random.RandomState(0) + for i in range(1000): + self.numerical.sample_multiple(500, 500, random_state, with_replacement=False) + + def time_enumeration_sampling(self): + random_state = numpy.random.RandomState(0) + for i in range(10000): + self.enumeration.sample(random_state) + + def time_enumeration_sample_multiple(self): + random_state = numpy.random.RandomState(0) + for i in range(10000): + self.enumeration.sample_multiple(500, 500, random_state, with_replacement=False) diff --git a/d3m/tests/test_base_utils.py b/d3m/tests/test_base_utils.py new file mode 100644 index 0000000..f46cae5 --- /dev/null +++ b/d3m/tests/test_base_utils.py @@ -0,0 +1,1035 @@ +import unittest + +from d3m import container, utils as d3m_utils +from d3m.base import utils +from d3m.metadata import base as 
metadata_base + + +class TestBaseUtils(unittest.TestCase): + def test_combine_columns_compact_metadata(self): + main = container.DataFrame({'a1': [1, 2, 3], 'b1': [4, 5, 6], 'c1': [7, 8, 9], 'd1': [10, 11, 12], 'e1': [13, 14, 15]}, { + 'top_level': 'main', + }, generate_metadata=False) + main.metadata = main.metadata.generate(main, compact=True) + main.metadata = main.metadata.update_column(0, {'name': 'aaa111'}) + main.metadata = main.metadata.update_column(1, {'name': 'bbb111', 'extra': 'b_column'}) + main.metadata = main.metadata.update_column(2, {'name': 'ccc111'}) + + columns2 = container.DataFrame({'a2': [21, 22, 23], 'b2': [24, 25, 26]}, { + 'top_level': 'columns2', + }, generate_metadata=False) + columns2.metadata = columns2.metadata.generate(columns2, compact=True) + columns2.metadata = columns2.metadata.update_column(0, {'name': 'aaa222'}) + columns2.metadata = columns2.metadata.update_column(1, {'name': 'bbb222'}) + + columns3 = container.DataFrame({'a3': [31, 32, 33], 'b3': [34, 35, 36]}, { + 'top_level': 'columns3', + }, generate_metadata=False) + columns3.metadata = columns3.metadata.generate(columns3, compact=True) + columns3.metadata = columns3.metadata.update_column(0, {'name': 'aaa333'}) + columns3.metadata = columns3.metadata.update_column(1, {'name': 'bbb333'}) + + result = utils.combine_columns(main, [1, 2], [columns2, columns3], return_result='append', add_index_columns=False) + + self.assertEqual(result.values.tolist(), [ + [1, 4, 7, 10, 13, 21, 24, 31, 34], + [2, 5, 8, 11, 14, 22, 25, 32, 35], + [3, 6, 9, 12, 15, 23, 26, 33, 36], + ]) + + self.assertEqual(d3m_utils.to_json_structure(result.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + 'top_level': 'main', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 9, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__'], + 'metadata': { + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': { + 'name': 'aaa111', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': { + 'name': 'bbb111', + 'extra': 'b_column', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': { + 'name': 'ccc111', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 3], + 'metadata': { + 'name': 'd1', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 4], + 'metadata': { + 'name': 'e1', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 5], + 'metadata': { + 'name': 'aaa222', + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 6], + 'metadata': { + 'name': 'bbb222', + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 7], + 'metadata': { + 'name': 'aaa333', + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 8], + 'metadata': { + 'name': 'bbb333', + 'structural_type': 'numpy.int64', + }, + }]) + + result = utils.combine_columns(main, [1, 2], [columns2, columns3], return_result='new', add_index_columns=False) + + self.assertEqual(result.values.tolist(), [ + [21, 24, 31, 34], + [22, 25, 32, 35], + 
[23, 26, 33, 36], + ]) + + self.assertEqual(d3m_utils.to_json_structure(result.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + 'top_level': 'columns2', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 4, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__'], + 'metadata': { + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': { + 'name': 'aaa222', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': { + 'name': 'bbb222', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': { + 'name': 'aaa333', + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 3], + 'metadata': { + 'name': 'bbb333', + 'structural_type': 'numpy.int64', + }, + }]) + + result = utils.combine_columns(main, [1, 2], [columns2, columns3], return_result='replace', add_index_columns=False) + + self.assertEqual(result.values.tolist(), [ + [1, 21, 24, 31, 34, 10, 13], + [2, 22, 25, 32, 35, 11, 14], + [3, 23, 26, 33, 36, 12, 15], + ]) + + self.assertEqual(d3m_utils.to_json_structure(result.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + 'top_level': 'main', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 7, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__'], + 'metadata': { + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': { + 'name': 'aaa111', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': { + 'name': 'aaa222', + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': { + 'name': 'bbb222', + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 3], + 'metadata': { + 'name': 'aaa333', + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 4], + 'metadata': { + 'name': 'bbb333', + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 5], + 'metadata': { + 'name': 'd1', + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 6], + 'metadata': { + 'name': 'e1', + 'structural_type': 'numpy.int64', + }, + }]) + + result = utils.combine_columns(main, [0, 1, 2, 3, 4], [columns2, columns3], return_result='replace', add_index_columns=False) + + self.assertEqual(result.values.tolist(), [ + [21, 24, 31, 34], + [22, 25, 32, 35], + [23, 26, 33, 36], + ]) + + self.assertEqual(d3m_utils.to_json_structure(result.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + 
'top_level': 'main', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 4, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__'], + 'metadata': { + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': { + 'name': 'aaa222', + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': { + 'name': 'bbb222', + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': { + 'name': 'aaa333', + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 3], + 'metadata': { + 'name': 'bbb333', + 'structural_type': 'numpy.int64', + }, + }]) + + result = utils.combine_columns(main, [4], [columns2, columns3], return_result='replace', add_index_columns=False) + + self.assertEqual(result.values.tolist(), [ + [1, 4, 7, 10, 21, 24, 31, 34], + [2, 5, 8, 11, 22, 25, 32, 35], + [3, 6, 9, 12, 23, 26, 33, 36], + ]) + + self.assertEqual(d3m_utils.to_json_structure(result.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + 'top_level': 'main', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 8, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__'], + 'metadata': { + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': { + 'name': 'aaa111', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': { + 'name': 'bbb111', + 'extra': 'b_column', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': { + 'name': 'ccc111', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 3], + 'metadata': { + 'name': 'd1', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 4], + 'metadata': { + 'structural_type': 'numpy.int64', + 'name': 'aaa222', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 5], + 'metadata': { + 'structural_type': 'numpy.int64', + 'name': 'bbb222', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 6], + 'metadata': { + 'structural_type': 'numpy.int64', + 'name': 'aaa333', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 7], + 'metadata': { + 'structural_type': 'numpy.int64', + 'name': 'bbb333', + }, + }]) + + result = utils.combine_columns(main, [0, 2, 4], [columns2, columns3], return_result='replace', add_index_columns=False) + + self.assertEqual(result.values.tolist(), [ + [21, 4, 24, 10, 31, 34], + [22, 5, 25, 11, 32, 35], + [23, 6, 26, 12, 33, 36], + ]) + + self.assertEqual(d3m_utils.to_json_structure(result.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + 'top_level': 'main', + 'schema': 
metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 6, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__'], + 'metadata': { + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': { + 'name': 'aaa222', + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': { + 'name': 'bbb111', + 'extra': 'b_column', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': { + 'name': 'bbb222', + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 3], + 'metadata': { + 'name': 'd1', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 4], + 'metadata': { + 'name': 'aaa333', + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 5], + 'metadata': { + 'name': 'bbb333', + 'structural_type': 'numpy.int64', + }, + }]) + def test_combine_columns_noncompact_metadata(self): + main = container.DataFrame({'a1': [1, 2, 3], 'b1': [4, 5, 6], 'c1': [7, 8, 9], 'd1': [10, 11, 12], 'e1': [13, 14, 15]}, { + 'top_level': 'main', + }, generate_metadata=False) + main.metadata = main.metadata.generate(main, compact=False) + main.metadata = main.metadata.update_column(0, {'name': 'aaa111'}) + main.metadata = main.metadata.update_column(1, {'name': 'bbb111', 'extra': 'b_column'}) + main.metadata = main.metadata.update_column(2, {'name': 'ccc111'}) + + columns2 = container.DataFrame({'a2': [21, 22, 23], 'b2': [24, 25, 26]}, { + 'top_level': 'columns2', + }, generate_metadata=False) + columns2.metadata = columns2.metadata.generate(columns2, compact=False) + columns2.metadata = columns2.metadata.update_column(0, {'name': 'aaa222'}) + columns2.metadata = columns2.metadata.update_column(1, {'name': 'bbb222'}) + + columns3 = container.DataFrame({'a3': [31, 32, 33], 'b3': [34, 35, 36]}, { + 'top_level': 'columns3', + }, generate_metadata=False) + columns3.metadata = columns3.metadata.generate(columns3, compact=False) + columns3.metadata = columns3.metadata.update_column(0, {'name': 'aaa333'}) + columns3.metadata = columns3.metadata.update_column(1, {'name': 'bbb333'}) + + result = utils.combine_columns(main, [1, 2], [columns2, columns3], return_result='append', add_index_columns=False) + + self.assertEqual(result.values.tolist(), [ + [1, 4, 7, 10, 13, 21, 24, 31, 34], + [2, 5, 8, 11, 14, 22, 25, 32, 35], + [3, 6, 9, 12, 15, 23, 26, 33, 36], + ]) + + self.assertEqual(d3m_utils.to_json_structure(result.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + 'top_level': 'main', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 9, + 
}, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': { + 'name': 'aaa111', + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': { + 'name': 'bbb111', + 'extra': 'b_column', + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': { + 'name': 'ccc111', + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 3], + 'metadata': { + 'name': 'd1', + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 4], + 'metadata': { + 'name': 'e1', + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 5], + 'metadata': { + 'name': 'aaa222', + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 6], + 'metadata': { + 'name': 'bbb222', + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 7], + 'metadata': { + 'name': 'aaa333', + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 8], + 'metadata': { + 'name': 'bbb333', + 'structural_type': 'numpy.int64', + }, + }]) + + result = utils.combine_columns(main, [1, 2], [columns2, columns3], return_result='new', add_index_columns=False) + + self.assertEqual(result.values.tolist(), [ + [21, 24, 31, 34], + [22, 25, 32, 35], + [23, 26, 33, 36], + ]) + + self.assertEqual(d3m_utils.to_json_structure(result.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + 'top_level': 'columns2', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 4, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': { + 'name': 'aaa222', + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': { + 'name': 'bbb222', + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': { + 'name': 'aaa333', + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 3], + 'metadata': { + 'name': 'bbb333', + 'structural_type': 'numpy.int64', + }, + }]) + + result = utils.combine_columns(main, [1, 2], [columns2, columns3], return_result='replace', add_index_columns=False) + + self.assertEqual(result.values.tolist(), [ + [1, 21, 24, 31, 34, 10, 13], + [2, 22, 25, 32, 35, 11, 14], + [3, 23, 26, 33, 36, 12, 15], + ]) + + self.assertEqual(d3m_utils.to_json_structure(result.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + 'top_level': 'main', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 7, + }, + }, + }, { + 'selector': 
['__ALL_ELEMENTS__', 0], + 'metadata': { + 'name': 'aaa111', + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': { + 'name': 'aaa222', + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': { + 'name': 'bbb222', + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 3], + 'metadata': { + 'name': 'aaa333', + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 4], + 'metadata': { + 'name': 'bbb333', + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 5], + 'metadata': { + 'name': 'd1', + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 6], + 'metadata': { + 'name': 'e1', + 'structural_type': 'numpy.int64', + }, + }]) + + result = utils.combine_columns(main, [0, 1, 2, 3, 4], [columns2, columns3], return_result='replace', add_index_columns=False) + + self.assertEqual(result.values.tolist(), [ + [21, 24, 31, 34], + [22, 25, 32, 35], + [23, 26, 33, 36], + ]) + + self.assertEqual(d3m_utils.to_json_structure(result.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + 'top_level': 'main', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 4, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': { + 'name': 'aaa222', + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': { + 'name': 'bbb222', + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': { + 'name': 'aaa333', + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 3], + 'metadata': { + 'name': 'bbb333', + 'structural_type': 'numpy.int64', + }, + }]) + + result = utils.combine_columns(main, [4], [columns2, columns3], return_result='replace', add_index_columns=False) + + self.assertEqual(result.values.tolist(), [ + [1, 4, 7, 10, 21, 24, 31, 34], + [2, 5, 8, 11, 22, 25, 32, 35], + [3, 6, 9, 12, 23, 26, 33, 36], + ]) + + self.assertEqual(d3m_utils.to_json_structure(result.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + 'top_level': 'main', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 8, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': { + 'name': 'aaa111', + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': { + 'name': 'bbb111', + 'extra': 'b_column', + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 
'metadata': { + 'name': 'ccc111', + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 3], + 'metadata': { + 'name': 'd1', + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 4], + 'metadata': { + 'structural_type': 'numpy.int64', + 'name': 'aaa222', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 5], + 'metadata': { + 'structural_type': 'numpy.int64', + 'name': 'bbb222', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 6], + 'metadata': { + 'structural_type': 'numpy.int64', + 'name': 'aaa333', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 7], + 'metadata': { + 'structural_type': 'numpy.int64', + 'name': 'bbb333', + }, + }]) + + result = utils.combine_columns(main, [0, 2, 4], [columns2, columns3], return_result='replace', add_index_columns=False) + + self.assertEqual(result.values.tolist(), [ + [21, 4, 24, 10, 31, 34], + [22, 5, 25, 11, 32, 35], + [23, 6, 26, 12, 33, 36], + ]) + + self.assertEqual(d3m_utils.to_json_structure(result.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + 'top_level': 'main', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 6, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': { + 'name': 'aaa222', + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': { + 'name': 'bbb111', + 'extra': 'b_column', + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': { + 'name': 'bbb222', + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 3], + 'metadata': { + 'name': 'd1', + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 4], + 'metadata': { + 'name': 'aaa333', + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 5], + 'metadata': { + 'name': 'bbb333', + 'structural_type': 'numpy.int64', + }, + }]) + + def test_combine_columns_new_with_index_compact_metadata(self): + main = container.DataFrame({'d3mIndex': [1, 2, 3], 'b1': [4, 5, 6], 'c1': [7, 8, 9]}, columns=['d3mIndex', 'b1', 'c1'], generate_metadata=False) + main.metadata = main.metadata.generate(main, compact=True) + main.metadata = main.metadata.update_column(0, {'name': 'd3mIndex', 'semantic_types': ['http://schema.org/Integer', 'https://metadata.datadrivendiscovery.org/types/PrimaryKey']}) + main.metadata = main.metadata.update_column(1, {'name': 'b1', 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute']}) + main.metadata = main.metadata.update_column(2, {'name': 'c1', 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute']}) + + columns = container.DataFrame({'d3mIndex': [1, 2, 3], 'b2': [4, 5, 6]}, columns=['d3mIndex', 'b2'], generate_metadata=False) + columns.metadata = columns.metadata.generate(columns, compact=True) + columns.metadata = columns.metadata.update_column(0, {'name': 'd3mIndex', 'semantic_types': ['http://schema.org/Integer', 'https://metadata.datadrivendiscovery.org/types/PrimaryKey']}) + 
columns.metadata = columns.metadata.update_column(1, {'name': 'b2', 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute']}) + + result = utils.combine_columns(main, [], [columns], return_result='new', add_index_columns=True) + + self.assertEqual(result.values.tolist(), [ + [1, 4], + [2, 5], + [3, 6], + ]) + + self.assertEqual(d3m_utils.to_json_structure(result.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 2, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__'], + 'metadata': { + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': { + 'name': 'd3mIndex', + 'semantic_types': ['http://schema.org/Integer', 'https://metadata.datadrivendiscovery.org/types/PrimaryKey'], + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': { + 'name': 'b2', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + }, + }]) + + def test_combine_columns_new_with_index_noncompact_metadata(self): + main = container.DataFrame({'d3mIndex': [1, 2, 3], 'b1': [4, 5, 6], 'c1': [7, 8, 9]}, columns=['d3mIndex', 'b1', 'c1'], generate_metadata=False) + main.metadata = main.metadata.generate(main, compact=False) + main.metadata = main.metadata.update_column(0, {'name': 'd3mIndex', 'semantic_types': ['http://schema.org/Integer', 'https://metadata.datadrivendiscovery.org/types/PrimaryKey']}) + main.metadata = main.metadata.update_column(1, {'name': 'b1', 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute']}) + main.metadata = main.metadata.update_column(2, {'name': 'c1', 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute']}) + + columns = container.DataFrame({'d3mIndex': [1, 2, 3], 'b2': [4, 5, 6]}, columns=['d3mIndex', 'b2'], generate_metadata=False) + columns.metadata = columns.metadata.generate(columns, compact=False) + columns.metadata = columns.metadata.update_column(0, {'name': 'd3mIndex', 'semantic_types': ['http://schema.org/Integer', 'https://metadata.datadrivendiscovery.org/types/PrimaryKey']}) + columns.metadata = columns.metadata.update_column(1, {'name': 'b2', 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute']}) + + result = utils.combine_columns(main, [], [columns], return_result='new', add_index_columns=True) + + self.assertEqual(result.values.tolist(), [ + [1, 4], + [2, 5], + [3, 6], + ]) + + self.assertEqual(d3m_utils.to_json_structure(result.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': 
['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 2, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': { + 'name': 'd3mIndex', + 'semantic_types': ['http://schema.org/Integer', 'https://metadata.datadrivendiscovery.org/types/PrimaryKey'], + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': { + 'name': 'b2', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.int64', + }, + }]) + + +if __name__ == '__main__': + unittest.main() diff --git a/d3m/tests/test_cli_runtime.py b/d3m/tests/test_cli_runtime.py new file mode 100644 index 0000000..4be42f4 --- /dev/null +++ b/d3m/tests/test_cli_runtime.py @@ -0,0 +1,1680 @@ +import contextlib +import json +import gzip +import io +import logging +import os.path +import pickle +import random +import shutil +import sys +import tempfile +import traceback +import unittest + +import pandas + +COMMON_PRIMITIVES_DIR = os.path.join(os.path.dirname(__file__), 'common-primitives') +# NOTE: This insertion should appear before any code attempting to resolve or load primitives, +# so the git submodule version of `common-primitives` is looked at first. +sys.path.insert(0, COMMON_PRIMITIVES_DIR) + +TEST_PRIMITIVES_DIR = os.path.join(os.path.dirname(__file__), 'data', 'primitives') +sys.path.insert(0, TEST_PRIMITIVES_DIR) + +from common_primitives.column_parser import ColumnParserPrimitive +from common_primitives.construct_predictions import ConstructPredictionsPrimitive +from common_primitives.dataset_to_dataframe import DatasetToDataFramePrimitive +from common_primitives.no_split import NoSplitDatasetSplitPrimitive +from common_primitives.random_forest import RandomForestClassifierPrimitive +from common_primitives.train_score_split import TrainScoreDatasetSplitPrimitive + + +from test_primitives.random_classifier import RandomClassifierPrimitive +from test_primitives.fake_score import FakeScorePrimitive + +from d3m import cli, index, runtime, utils +from d3m.container import dataset as dataset_module +from d3m.contrib.primitives.compute_scores import ComputeScoresPrimitive +from d3m.metadata import base as metadata_base, pipeline as pipeline_module, pipeline_run as pipeline_run_module, problem as problem_module + +TEST_DATA_DIR = os.path.join(os.path.dirname(__file__), 'data') +PROBLEM_DIR = os.path.join(TEST_DATA_DIR, 'problems') +DATASET_DIR = os.path.join(TEST_DATA_DIR, 'datasets') +PIPELINE_DIR = os.path.join(TEST_DATA_DIR, 'pipelines') + + +class TestCLIRuntime(unittest.TestCase): + def setUp(self): + self.test_dir = tempfile.mkdtemp() + + def tearDown(self): + shutil.rmtree(self.test_dir) + + @classmethod + def setUpClass(cls): + to_register = { + 'd3m.primitives.data_transformation.dataset_to_dataframe.Common': DatasetToDataFramePrimitive, + 'd3m.primitives.classification.random_forest.Common': RandomForestClassifierPrimitive, + 'd3m.primitives.classification.random_classifier.Test': RandomClassifierPrimitive, + 'd3m.primitives.data_transformation.column_parser.Common': ColumnParserPrimitive, + 'd3m.primitives.data_transformation.construct_predictions.Common': ConstructPredictionsPrimitive, + 'd3m.primitives.evaluation.no_split_dataset_split.Common': NoSplitDatasetSplitPrimitive, + 'd3m.primitives.evaluation.compute_scores.Test': FakeScorePrimitive, + 'd3m.primitives.evaluation.train_score_dataset_split.Common': TrainScoreDatasetSplitPrimitive, + # We do not have to load this primitive, but loading it here 
prevents the package from loading all primitives. + 'd3m.primitives.evaluation.compute_scores.Core': ComputeScoresPrimitive, + } + + # To hide any logging or stdout output. + with utils.silence(): + for python_path, primitive in to_register.items(): + index.register_primitive(python_path, primitive) + + def _call_cli_runtime(self, arg): + logger = logging.getLogger('d3m.runtime') + with utils.silence(): + with self.assertLogs(logger=logger) as cm: + # So that at least one message is logged. + logger.warning("Debugging.") + cli.main(arg) + # We skip our "debugging" message. + return cm.records[1:] + + def _call_cli_runtime_without_fail(self, arg): + try: + return self._call_cli_runtime(arg) + except Exception as e: + self.fail(traceback.format_exc()) + + def _assert_valid_saved_pipeline_runs(self, pipeline_run_save_path): + with open(pipeline_run_save_path, 'r') as f: + for pipeline_run_dict in list(utils.yaml_load_all(f)): + try: + pipeline_run_module.validate_pipeline_run(pipeline_run_dict) + except Exception as e: + self.fail(traceback.format_exc()) + + def _validate_previous_pipeline_run_ids(self, pipeline_run_save_path): + ids = set() + prev_ids = set() + with open(pipeline_run_save_path, 'r') as f: + for pipeline_run_dict in list(utils.yaml_load_all(f)): + ids.add(pipeline_run_dict['id']) + if 'previous_pipeline_run' in pipeline_run_dict: + prev_ids.add(pipeline_run_dict['previous_pipeline_run']['id']) + self.assertTrue( + prev_ids.issubset(ids), + 'Some previous pipeline run ids {} are not in the set of pipeline run ids {}'.format(prev_ids, ids) + ) + + def test_fit_multi_input(self): + pipeline_run_save_path = os.path.join(self.test_dir, 'pipeline_run.yml') + arg = [ + '', + 'runtime', + 'fit', + '--input', + os.path.join(DATASET_DIR, 'iris_dataset_1/datasetDoc.json'), + '--input', + os.path.join(DATASET_DIR, 'iris_dataset_1/datasetDoc.json'), + '--problem', + os.path.join(PROBLEM_DIR, 'iris_problem_1/problemDoc.json'), + '--pipeline', + os.path.join(PIPELINE_DIR, 'multi-input-test.json'), + '--expose-produced-outputs', + self.test_dir, + '-O', + pipeline_run_save_path, + ] + self._call_cli_runtime_without_fail(arg) + + self._assert_valid_saved_pipeline_runs(pipeline_run_save_path) + + self._assert_standard_output_metadata() + + def test_fit_without_problem(self): + pipeline_run_save_path = os.path.join(self.test_dir, 'pipeline_run.yml') + fitted_pipeline_path = os.path.join(self.test_dir, 'fitted-pipeline') + output_csv_path = os.path.join(self.test_dir, 'output.csv') + arg = [ + '', + 'runtime', + 'fit', + '--input', + os.path.join(DATASET_DIR, 'iris_dataset_1/datasetDoc.json'), + '--input', + os.path.join(DATASET_DIR, 'iris_dataset_1/datasetDoc.json'), + '--pipeline', + os.path.join(PIPELINE_DIR, 'multi-input-test.json'), + '--save', + fitted_pipeline_path, + '--expose-produced-outputs', + self.test_dir, + '--output', + output_csv_path, + '-O', + pipeline_run_save_path, + ] + self._call_cli_runtime_without_fail(arg) + + self.assertEqual(utils.list_files(self.test_dir), [ + 'fitted-pipeline', + 'output.csv', + 'outputs.0/data.csv', + 'outputs.0/metadata.json', + 'pipeline_run.yml', + 'steps.0.produce/data.csv', + 'steps.0.produce/metadata.json', + 'steps.1.produce/data.csv', + 'steps.1.produce/metadata.json', + 'steps.2.produce/data.csv', + 'steps.2.produce/metadata.json' + ]) + + self._assert_valid_saved_pipeline_runs(pipeline_run_save_path) + + self._assert_standard_output_metadata() + self._assert_prediction_sum(prediction_sum=11225, outputs_path='outputs.0/data.csv') + 
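+        # The predictions exposed under outputs.0 and the file written via --output
+        # should contain identical predictions, so both checks use the same sum.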
self._assert_prediction_sum(prediction_sum=11225, outputs_path='output.csv') + + def test_produce_without_problem(self): + pipeline_run_save_path = os.path.join(self.test_dir, 'pipeline_run.yml') + fitted_pipeline_path = os.path.join(self.test_dir, 'fitted-no-problem-pipeline') + output_csv_path = os.path.join(self.test_dir, 'output.csv') + arg = [ + '', + 'runtime', + 'fit', + '--input', + os.path.join(DATASET_DIR, 'iris_dataset_1/datasetDoc.json'), + '--input', + os.path.join(DATASET_DIR, 'iris_dataset_1/datasetDoc.json'), + '--pipeline', + os.path.join(PIPELINE_DIR, 'multi-input-test.json'), + '--save', + fitted_pipeline_path, + ] + self._call_cli_runtime_without_fail(arg) + + arg = [ + '', + 'runtime', + 'produce', + '--test-input', + os.path.join(DATASET_DIR, 'iris_dataset_1/datasetDoc.json'), + '--test-input', + os.path.join(DATASET_DIR, 'iris_dataset_1/datasetDoc.json'), + '--output', + output_csv_path, + '--fitted-pipeline', + fitted_pipeline_path, + '--expose-produced-outputs', + self.test_dir, + '-O', + pipeline_run_save_path, + ] + self._call_cli_runtime_without_fail(arg) + + self.assertEqual(utils.list_files(self.test_dir), [ + 'fitted-no-problem-pipeline', + 'output.csv', + 'outputs.0/data.csv', + 'outputs.0/metadata.json', + 'pipeline_run.yml', + 'steps.0.produce/data.csv', + 'steps.0.produce/metadata.json', + 'steps.1.produce/data.csv', + 'steps.1.produce/metadata.json', + 'steps.2.produce/data.csv', + 'steps.2.produce/metadata.json' + ]) + + self._assert_valid_saved_pipeline_runs(pipeline_run_save_path) + + self._assert_standard_output_metadata() + self._assert_prediction_sum(prediction_sum=11008, outputs_path='outputs.0/data.csv') + self._assert_prediction_sum(prediction_sum=11008, outputs_path='output.csv') + + def test_fit_produce_without_problem(self): + pipeline_run_save_path = os.path.join(self.test_dir, 'pipeline_run.yml') + output_csv_path = os.path.join(self.test_dir, 'output.csv') + arg = [ + '', + 'runtime', + 'fit-produce', + '--input', + os.path.join(DATASET_DIR, 'iris_dataset_1/datasetDoc.json'), + '--input', + os.path.join(DATASET_DIR, 'iris_dataset_1/datasetDoc.json'), + '--test-input', + os.path.join(DATASET_DIR, 'iris_dataset_1/datasetDoc.json'), + '--test-input', + os.path.join(DATASET_DIR, 'iris_dataset_1/datasetDoc.json'), + '--pipeline', + os.path.join(PIPELINE_DIR, 'multi-input-test.json'), + '--output', + output_csv_path, + '--expose-produced-outputs', + self.test_dir, + '-O', + pipeline_run_save_path, + ] + self._call_cli_runtime_without_fail(arg) + + self.assertEqual(utils.list_files(self.test_dir), [ + 'output.csv', + 'outputs.0/data.csv', + 'outputs.0/metadata.json', + 'pipeline_run.yml', + 'steps.0.produce/data.csv', + 'steps.0.produce/metadata.json', + 'steps.1.produce/data.csv', + 'steps.1.produce/metadata.json', + 'steps.2.produce/data.csv', + 'steps.2.produce/metadata.json' + ]) + + self._assert_valid_saved_pipeline_runs(pipeline_run_save_path) + self._validate_previous_pipeline_run_ids(pipeline_run_save_path) + self._assert_standard_output_metadata() + self._assert_prediction_sum(prediction_sum=11008, outputs_path='outputs.0/data.csv') + self._assert_prediction_sum(prediction_sum=11008, outputs_path='output.csv') + + def test_nonstandard_fit_without_problem(self): + pipeline_run_save_path = os.path.join(self.test_dir, 'pipeline_run.yml') + fitted_pipeline_path = os.path.join(self.test_dir, 'fitted-pipeline') + arg = [ + '', + 'runtime', + 'fit', + '--input', + os.path.join(DATASET_DIR, 'iris_dataset_1/datasetDoc.json'), + '--pipeline', + 
os.path.join(PIPELINE_DIR, 'semi-standard-pipeline.json'), + '--save', + fitted_pipeline_path, + '--expose-produced-outputs', + self.test_dir, + '--not-standard-pipeline', + '-O', + pipeline_run_save_path, + ] + self._call_cli_runtime_without_fail(arg) + + self.assertEqual(utils.list_files(self.test_dir), [ + 'fitted-pipeline', + 'outputs.0/data.csv', + 'outputs.0/metadata.json', + 'outputs.1/data.csv', + 'outputs.1/metadata.json', + 'pipeline_run.yml', + 'steps.0.produce/data.csv', + 'steps.0.produce/metadata.json', + 'steps.1.produce/data.csv', + 'steps.1.produce/metadata.json', + ]) + + self._assert_valid_saved_pipeline_runs(pipeline_run_save_path) + + self._assert_standard_output_metadata() + self._assert_prediction_sum(prediction_sum=10710, outputs_path='outputs.0/data.csv') + self._assert_nonstandard_output(outputs_name='outputs.1') + + def test_nonstandard_produce_without_problem(self): + pipeline_run_save_path = os.path.join(self.test_dir, 'pipeline_run.yml') + fitted_pipeline_path = os.path.join(self.test_dir, 'fitted-pipeline') + arg = [ + '', + 'runtime', + 'fit', + '--input', + os.path.join(DATASET_DIR, 'iris_dataset_1/datasetDoc.json'), + '--pipeline', + os.path.join(PIPELINE_DIR, 'semi-standard-pipeline.json'), + '--save', + fitted_pipeline_path, + '--not-standard-pipeline' + ] + self._call_cli_runtime_without_fail(arg) + + arg = [ + '', + 'runtime', + 'produce', + '--test-input', + os.path.join(DATASET_DIR, 'iris_dataset_1/datasetDoc.json'), + '--fitted-pipeline', + fitted_pipeline_path, + '--expose-produced-outputs', + self.test_dir, + '-O', + pipeline_run_save_path, + ] + self._call_cli_runtime_without_fail(arg) + + self.assertEqual(utils.list_files(self.test_dir), [ + 'fitted-pipeline', + 'outputs.0/data.csv', + 'outputs.0/metadata.json', + 'outputs.1/data.csv', + 'outputs.1/metadata.json', + 'pipeline_run.yml', + 'steps.0.produce/data.csv', + 'steps.0.produce/metadata.json', + 'steps.1.produce/data.csv', + 'steps.1.produce/metadata.json' + ]) + + self._assert_valid_saved_pipeline_runs(pipeline_run_save_path) + + self._assert_standard_output_metadata() + self._assert_prediction_sum(prediction_sum=12106, outputs_path='outputs.0/data.csv') + self._assert_nonstandard_output(outputs_name='outputs.1') + + def test_nonstandard_fit_produce_without_problem(self): + pipeline_run_save_path = os.path.join(self.test_dir, 'pipeline_run.yml') + arg = [ + '', + 'runtime', + 'fit-produce', + '--input', + os.path.join(DATASET_DIR, 'iris_dataset_1/datasetDoc.json'), + '--test-input', + os.path.join(DATASET_DIR, 'iris_dataset_1/datasetDoc.json'), + '--pipeline', + os.path.join(PIPELINE_DIR, 'semi-standard-pipeline.json'), + '--expose-produced-outputs', + self.test_dir, + '--not-standard-pipeline', + '-O', + pipeline_run_save_path, + ] + self._call_cli_runtime_without_fail(arg) + + self.assertEqual(utils.list_files(self.test_dir), [ + 'outputs.0/data.csv', + 'outputs.0/metadata.json', + 'outputs.1/data.csv', + 'outputs.1/metadata.json', + 'pipeline_run.yml', + 'steps.0.produce/data.csv', + 'steps.0.produce/metadata.json', + 'steps.1.produce/data.csv', + 'steps.1.produce/metadata.json', + ]) + + self._assert_valid_saved_pipeline_runs(pipeline_run_save_path) + self._validate_previous_pipeline_run_ids(pipeline_run_save_path) + self._assert_standard_output_metadata() + self._assert_prediction_sum(prediction_sum=12106, outputs_path='outputs.0/data.csv') + self._assert_nonstandard_output(outputs_name='outputs.1') + + def test_fit_produce_multi_input(self): + pipeline_run_save_path = 
os.path.join(self.test_dir, 'pipeline_run.yml') + arg = [ + '', + 'runtime', + 'fit-produce', + '--input', + os.path.join(DATASET_DIR, 'iris_dataset_1/datasetDoc.json'), + '--input', + os.path.join(DATASET_DIR, 'iris_dataset_1/datasetDoc.json'), + '--problem', + os.path.join(PROBLEM_DIR, 'iris_problem_1/problemDoc.json'), + '--test-input', + os.path.join(DATASET_DIR, 'iris_dataset_1/datasetDoc.json'), + '--test-input', + os.path.join(DATASET_DIR, 'iris_dataset_1/datasetDoc.json'), + '--pipeline', + os.path.join(PIPELINE_DIR, 'multi-input-test.json'), + '--expose-produced-outputs', + self.test_dir, + '-O', + pipeline_run_save_path, + ] + self._call_cli_runtime_without_fail(arg) + + self.assertEqual(utils.list_files(self.test_dir), [ + 'outputs.0/data.csv', + 'outputs.0/metadata.json', + 'pipeline_run.yml', + 'steps.0.produce/data.csv', + 'steps.0.produce/metadata.json', + 'steps.1.produce/data.csv', + 'steps.1.produce/metadata.json', + 'steps.2.produce/data.csv', + 'steps.2.produce/metadata.json', + ]) + + self._assert_valid_saved_pipeline_runs(pipeline_run_save_path) + self._validate_previous_pipeline_run_ids(pipeline_run_save_path) + self._assert_standard_output_metadata() + self._assert_prediction_sum(prediction_sum=11008, outputs_path='outputs.0/data.csv') + + def test_fit_score(self): + pipeline_run_save_path = os.path.join(self.test_dir, 'pipeline_run.yml') + arg = [ + '', + 'runtime', + 'fit-score', + '--input', + os.path.join(DATASET_DIR, 'iris_dataset_1/datasetDoc.json'), + '--problem', + os.path.join(PROBLEM_DIR, 'iris_problem_1/problemDoc.json'), + '--test-input', + os.path.join(DATASET_DIR, 'iris_dataset_1/datasetDoc.json'), + '--score-input', + os.path.join(DATASET_DIR, 'iris_dataset_1/datasetDoc.json'), + '--pipeline', + os.path.join(PIPELINE_DIR, 'random-forest-classifier.yml'), + '--scores', + os.path.join(self.test_dir, 'scores.csv'), + '-O', + pipeline_run_save_path, + ] + self._call_cli_runtime_without_fail(arg) + + self._assert_valid_saved_pipeline_runs(pipeline_run_save_path) + self._validate_previous_pipeline_run_ids(pipeline_run_save_path) + + dataframe = pandas.read_csv(os.path.join(self.test_dir, 'scores.csv')) + self.assertEqual(list(dataframe.columns), ['metric', 'value', 'normalized', 'randomSeed']) + self.assertEqual(dataframe.values.tolist(), [['ACCURACY', 1.0, 1.0, 0]]) + + def test_fit_score_without_problem(self): + pipeline_run_save_path = os.path.join(self.test_dir, 'pipeline_run.yml') + arg = [ + '', + 'runtime', + 'fit-score', + '--input', + os.path.join(DATASET_DIR, 'iris_dataset_1/datasetDoc.json'), + '--test-input', + os.path.join(DATASET_DIR, 'iris_dataset_1/datasetDoc.json'), + '--score-input', + os.path.join(DATASET_DIR, 'iris_dataset_1/datasetDoc.json'), + '--pipeline', + os.path.join(PIPELINE_DIR, 'random-classifier.yml'), + '--scoring-pipeline', + os.path.join(PIPELINE_DIR, 'fake_compute_score.yml'), + # this argument has no effect + '--metric', + 'F1_MACRO', + '--metric', + 'ACCURACY', + '--scores', + os.path.join(self.test_dir, 'scores.csv'), + '-O', + pipeline_run_save_path, + ] + logging_records = self._call_cli_runtime_without_fail(arg) + + self.assertEqual(len(logging_records), 1) + self.assertEqual(logging_records[0].msg, "Not all provided hyper-parameters for the scoring pipeline %(pipeline_id)s were used: %(unused_params)s") + + self._assert_valid_saved_pipeline_runs(pipeline_run_save_path) + self._validate_previous_pipeline_run_ids(pipeline_run_save_path) + + dataframe = pandas.read_csv(os.path.join(self.test_dir, 'scores.csv')) + 
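+        # fit-score is expected to write one row per computed metric to scores.csv,
+        # with the columns metric/value/normalized/randomSeed checked below.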
self.assertEqual(list(dataframe.columns), ['metric', 'value', 'normalized', 'randomSeed']) + self.assertEqual(dataframe.values.tolist(), [['ACCURACY', 1.0, 1.0, 0]]) + + @staticmethod + def _get_iris_dataset_path(): + return os.path.join(DATASET_DIR, 'iris_dataset_1/datasetDoc.json') + + @staticmethod + def _get_iris_problem_path(): + return os.path.join(PROBLEM_DIR, 'iris_problem_1/problemDoc.json') + + @staticmethod + def _get_random_forest_pipeline_path(): + return os.path.join(PIPELINE_DIR, 'random-forest-classifier.yml') + + @staticmethod + def _get_no_split_data_pipeline_path(): + return os.path.join(PIPELINE_DIR, 'data-preparation-no-split.yml') + + @staticmethod + def _get_train_test_split_data_pipeline_path(): + return os.path.join(PIPELINE_DIR, 'data-preparation-train-test-split.yml') + + def _get_pipeline_run_save_path(self): + return os.path.join(self.test_dir, 'pipeline_run.yml') + + def _get_predictions_path(self): + return os.path.join(self.test_dir, 'predictions.csv') + + def _get_scores_path(self): + return os.path.join(self.test_dir, 'scores.csv') + + def _get_pipeline_rerun_save_path(self): + return os.path.join(self.test_dir, 'pipeline_rerun.yml') + + def _get_rescores_path(self): + return os.path.join(self.test_dir, 'rescores.csv') + + def _fit_iris_random_forest( + self, *, predictions_path=None, fitted_pipeline_path=None, pipeline_run_save_path=None + ): + if pipeline_run_save_path is None: + pipeline_run_save_path = self._get_pipeline_run_save_path() + arg = [ + '', + 'runtime', + 'fit', + '--input', + self._get_iris_dataset_path(), + '--problem', + self._get_iris_problem_path(), + '--pipeline', + self._get_random_forest_pipeline_path(), + '-O', + pipeline_run_save_path + ] + if predictions_path is not None: + arg.append('--output') + arg.append(predictions_path) + if fitted_pipeline_path is not None: + arg.append('--save') + arg.append(fitted_pipeline_path) + + self._call_cli_runtime_without_fail(arg) + + def _fit_iris_random_classifier_without_problem(self, *, fitted_pipeline_path): + pipeline_run_save_path = os.path.join(self.test_dir, 'pipeline_run.yml') + arg = [ + '', + 'runtime', + 'fit', + '--input', + os.path.join(DATASET_DIR, 'iris_dataset_1/datasetDoc.json'), + '--pipeline', + os.path.join(PIPELINE_DIR, 'random-classifier.yml'), + '-O', + pipeline_run_save_path + ] + if fitted_pipeline_path is not None: + arg.append('--save') + arg.append(fitted_pipeline_path) + + self._call_cli_runtime_without_fail(arg) + + def test_fit(self): + pipeline_run_save_path = self._get_pipeline_run_save_path() + fitted_pipeline_path = os.path.join(self.test_dir, 'fitted-pipeline') + self._fit_iris_random_forest( + fitted_pipeline_path=fitted_pipeline_path, pipeline_run_save_path=pipeline_run_save_path + ) + + self._assert_valid_saved_pipeline_runs(pipeline_run_save_path) + + self.assertTrue(os.path.isfile(fitted_pipeline_path)) + self.assertTrue(os.path.isfile(pipeline_run_save_path)) + + def test_evaluate(self): + pipeline_run_save_path = os.path.join(self.test_dir, 'pipeline_run.yml') + scores_path = os.path.join(self.test_dir, 'scores.csv') + arg = [ + '', + 'runtime', + 'evaluate', + '--input', + os.path.join(DATASET_DIR, 'iris_dataset_1/datasetDoc.json'), + '--problem', + os.path.join(PROBLEM_DIR, 'iris_problem_1/problemDoc.json'), + '--pipeline', + os.path.join(PIPELINE_DIR, 'random-forest-classifier.yml'), + '--data-pipeline', + os.path.join(PIPELINE_DIR, 'data-preparation-no-split.yml'), + '--scores', + scores_path, + '--metric', + 'ACCURACY', + '--metric', + 
'F1_MACRO', + '-O', + pipeline_run_save_path + ] + self._call_cli_runtime_without_fail(arg) + + self._assert_valid_saved_pipeline_runs(pipeline_run_save_path) + self._validate_previous_pipeline_run_ids(pipeline_run_save_path) + + dataframe = pandas.read_csv(scores_path) + self.assertEqual(list(dataframe.columns), ['metric', 'value', 'normalized', 'randomSeed', 'fold']) + self.assertEqual(dataframe.values.tolist(), [['ACCURACY', 1.0, 1.0, 0, 0], ['F1_MACRO', 1.0, 1.0, 0, 0]]) + + def test_evaluate_without_problem(self): + pipeline_run_save_path = os.path.join(self.test_dir, 'pipeline_run.yml') + scores_path = os.path.join(self.test_dir, 'scores.csv') + arg = [ + '', + 'runtime', + 'evaluate', + '--input', + os.path.join(DATASET_DIR, 'iris_dataset_1/datasetDoc.json'), + '--pipeline', + os.path.join(PIPELINE_DIR, 'random-classifier.yml'), + '--data-pipeline', + os.path.join(PIPELINE_DIR, 'data-preparation-no-split.yml'), + '--scoring-pipeline', + os.path.join(PIPELINE_DIR, 'fake_compute_score.yml'), + # this argument has no effect + '--metric', + 'ACCURACY', + '--scores', + scores_path, + '-O', + pipeline_run_save_path + ] + logging_records = self._call_cli_runtime_without_fail(arg) + + self.assertEqual(len(logging_records), 1) + self.assertEqual(logging_records[0].msg, "Not all provided hyper-parameters for the scoring pipeline %(pipeline_id)s were used: %(unused_params)s") + + self._assert_valid_saved_pipeline_runs(pipeline_run_save_path) + self._validate_previous_pipeline_run_ids(pipeline_run_save_path) + + dataframe = pandas.read_csv(scores_path) + self.assertEqual(list(dataframe.columns), ['metric', 'value', 'normalized', 'randomSeed', 'fold']) + self.assertEqual(dataframe.values.tolist(), [['ACCURACY', 1.0, 1.0, 0, 0]]) + + def test_score(self): + pipeline_run_save_path = os.path.join(self.test_dir, 'pipeline_run.yml') + fitted_pipeline_path = os.path.join(self.test_dir, 'iris-pipeline') + self._fit_iris_random_forest(fitted_pipeline_path=fitted_pipeline_path) + self.assertTrue(os.path.isfile(fitted_pipeline_path)) + + scores_path = os.path.join(self.test_dir, 'scores.csv') + arg = [ + '', + 'runtime', + 'score', + '--fitted-pipeline', + fitted_pipeline_path, + '--test-input', + os.path.join(DATASET_DIR, 'iris_dataset_1/datasetDoc.json'), + '--score-input', + os.path.join(DATASET_DIR, 'iris_dataset_1/datasetDoc.json'), + '--scores', + scores_path, + '--metric', + 'F1_MACRO', + '--metric', + 'ACCURACY', + '-O', + pipeline_run_save_path, + ] + self._call_cli_runtime_without_fail(arg) + + self._assert_valid_saved_pipeline_runs(pipeline_run_save_path) + + self.assertTrue(os.path.isfile(scores_path), 'scores were not generated') + + dataframe = pandas.read_csv(scores_path) + + self.assertEqual(list(dataframe.columns), ['metric', 'value', 'normalized', 'randomSeed']) + self.assertEqual(dataframe.values.tolist(), [['F1_MACRO', 1.0, 1.0, 0], ['ACCURACY', 1.0, 1.0, 0]]) + + def test_score_without_problem_without_metric(self): + pipeline_run_save_path = os.path.join(self.test_dir, 'pipeline_run.yml') + fitted_pipeline_path = os.path.join(self.test_dir, 'iris-pipeline') + self._fit_iris_random_classifier_without_problem(fitted_pipeline_path=fitted_pipeline_path) + self.assertTrue(os.path.isfile(fitted_pipeline_path)) + + scores_path = os.path.join(self.test_dir, 'scores.csv') + arg = [ + '', + 'runtime', + 'score', + '--fitted-pipeline', + fitted_pipeline_path, + '--test-input', + os.path.join(DATASET_DIR, 'iris_dataset_1/datasetDoc.json'), + '--score-input', + os.path.join(DATASET_DIR, 
'iris_dataset_1/datasetDoc.json'), + '--scoring-pipeline', + os.path.join(PIPELINE_DIR, 'fake_compute_score.yml'), + '--scores', + scores_path, + '-O', + pipeline_run_save_path, + ] + self._call_cli_runtime_without_fail(arg) + + self._assert_valid_saved_pipeline_runs(pipeline_run_save_path) + + self.assertTrue(os.path.isfile(scores_path), 'scores were not generated') + + dataframe = pandas.read_csv(scores_path) + + self.assertEqual(list(dataframe.columns), ['metric', 'value', 'normalized', 'randomSeed']) + self.assertEqual(dataframe.values.tolist(), [['ACCURACY', 1.0, 1.0, 0]]) + + def test_score_without_problem(self): + pipeline_run_save_path = os.path.join(self.test_dir, 'pipeline_run.yml') + fitted_pipeline_path = os.path.join(self.test_dir, 'iris-pipeline') + self._fit_iris_random_classifier_without_problem(fitted_pipeline_path=fitted_pipeline_path) + self.assertTrue(os.path.isfile(fitted_pipeline_path)) + + scores_path = os.path.join(self.test_dir, 'scores.csv') + arg = [ + '', + 'runtime', + 'score', + '--fitted-pipeline', + fitted_pipeline_path, + '--test-input', + os.path.join(DATASET_DIR, 'iris_dataset_1/datasetDoc.json'), + '--score-input', + os.path.join(DATASET_DIR, 'iris_dataset_1/datasetDoc.json'), + '--scoring-pipeline', + os.path.join(PIPELINE_DIR, 'fake_compute_score.yml'), + # this argument has no effect + '--metric', + 'ACCURACY', + '--scores', + scores_path, + '-O', + pipeline_run_save_path, + ] + logging_records = self._call_cli_runtime_without_fail(arg) + + self.assertEqual(len(logging_records), 1) + self.assertEqual(logging_records[0].msg, "Not all provided hyper-parameters for the scoring pipeline %(pipeline_id)s were used: %(unused_params)s") + + self._assert_valid_saved_pipeline_runs(pipeline_run_save_path) + + self.assertTrue(os.path.isfile(scores_path), 'scores were not generated') + + dataframe = pandas.read_csv(scores_path) + + self.assertEqual(list(dataframe.columns), ['metric', 'value', 'normalized', 'randomSeed']) + self.assertEqual(dataframe.values.tolist(), [['ACCURACY', 1.0, 1.0, 0]]) + + def test_produce(self): + pipeline_run_save_path = os.path.join(self.test_dir, 'pipeline_run.yml') + fitted_pipeline_path = os.path.join(self.test_dir, 'iris-pipeline') + self._fit_iris_random_forest(fitted_pipeline_path=fitted_pipeline_path) + self.assertTrue(os.path.isfile(fitted_pipeline_path)) + + arg = [ + '', + 'runtime', + 'produce', + '--fitted-pipeline', + fitted_pipeline_path, + '--test-input', + os.path.join(DATASET_DIR, 'iris_dataset_1/datasetDoc.json'), + '-O', + pipeline_run_save_path, + ] + self._call_cli_runtime_without_fail(arg) + + self._assert_valid_saved_pipeline_runs(pipeline_run_save_path) + + def test_score_predictions(self): + predictions_path = os.path.join(self.test_dir, 'predictions.csv') + self._fit_iris_random_forest(predictions_path=predictions_path) + self.assertTrue(os.path.isfile(predictions_path)) + + scores_path = os.path.join(self.test_dir, 'scores.csv') + arg = [ + '', + 'runtime', + 'score-predictions', + '--score-input', + os.path.join(DATASET_DIR, 'iris_dataset_1/datasetDoc.json'), + '--problem', + os.path.join(PROBLEM_DIR, 'iris_problem_1/problemDoc.json'), + '--predictions', + predictions_path, + '--metric', + 'ACCURACY', + '--metric', + 'F1_MACRO', + '--scores', + scores_path, + ] + self._call_cli_runtime_without_fail(arg) + + self.assertTrue(os.path.isfile(scores_path), 'scores were not generated') + + dataframe = pandas.read_csv(scores_path) + + self.assertEqual(list(dataframe.columns), ['metric', 'value', 'normalized']) + 
self.assertEqual(dataframe.values.tolist(), [['ACCURACY', 1.0, 1.0], ['F1_MACRO', 1.0, 1.0]]) + + def test_sklearn_dataset_fit_produce(self): + self._create_sklearn_iris_problem_doc() + + pipeline_run_save_path = os.path.join(self.test_dir, 'pipeline_run.yml') + arg = [ + '', + 'runtime', + 'fit-produce', + '--input', + 'sklearn://iris', + '--input', + 'sklearn://iris', + '--problem', + os.path.join(self.test_dir, 'problemDoc.json'), + '--test-input', + 'sklearn://iris', + '--test-input', + 'sklearn://iris', + '--pipeline', + os.path.join(PIPELINE_DIR, 'multi-input-test.json'), + '--expose-produced-outputs', + self.test_dir, + '-O', + pipeline_run_save_path, + ] + self._call_cli_runtime_without_fail(arg) + + self._assert_valid_saved_pipeline_runs(pipeline_run_save_path) + self._validate_previous_pipeline_run_ids(pipeline_run_save_path) + + self.assertEqual(utils.list_files(self.test_dir), [ + 'outputs.0/data.csv', + 'outputs.0/metadata.json', + 'pipeline_run.yml', + 'problemDoc.json', + 'steps.0.produce/data.csv', + 'steps.0.produce/metadata.json', + 'steps.1.produce/data.csv', + 'steps.1.produce/metadata.json', + 'steps.2.produce/data.csv', + 'steps.2.produce/metadata.json' + ]) + self._assert_standard_output_metadata(prediction_type='numpy.int64') + self._assert_prediction_sum(prediction_sum=10648, outputs_path='outputs.0/data.csv') + + def test_sklearn_dataset_fit_produce_without_problem(self): + output_csv_path = os.path.join(self.test_dir, 'output.csv') + pipeline_run_save_path = os.path.join(self.test_dir, 'pipeline_run.yml') + fitted_pipeline_path = os.path.join(self.test_dir, 'fitted-pipeline') + arg = [ + '', + 'runtime', + 'fit-produce', + '--input', + 'sklearn://iris', + '--test-input', + 'sklearn://iris', + '--pipeline', + os.path.join(PIPELINE_DIR, 'random-classifier.yml'), + '--save', + fitted_pipeline_path, + '--output', + output_csv_path, + '--expose-produced-outputs', + self.test_dir, + '-O', + pipeline_run_save_path, + ] + + self._call_cli_runtime_without_fail(arg) + + self._assert_valid_saved_pipeline_runs(pipeline_run_save_path) + self._validate_previous_pipeline_run_ids(pipeline_run_save_path) + + self.assertEqual(utils.list_files(self.test_dir), [ + 'fitted-pipeline', + 'output.csv', + 'outputs.0/data.csv', + 'outputs.0/metadata.json', + 'pipeline_run.yml', + 'steps.0.produce/data.csv', + 'steps.0.produce/metadata.json', + 'steps.1.produce/data.csv', + 'steps.1.produce/metadata.json', + 'steps.2.produce/data.csv', + 'steps.2.produce/metadata.json', + ]) + self._assert_standard_output_metadata(prediction_type='numpy.int64') + self._assert_prediction_sum(prediction_sum=10648, outputs_path='outputs.0/data.csv') + self._assert_prediction_sum(prediction_sum=10648, outputs_path='output.csv') + + def _create_sklearn_iris_problem_doc(self): + with open(os.path.join(PROBLEM_DIR, 'iris_problem_1/problemDoc.json'), 'r', encoding='utf8') as problem_doc_file: + problem_doc = json.load(problem_doc_file) + + problem_doc['inputs']['data'][0]['datasetID'] = 'sklearn://iris' + + with open(os.path.join(self.test_dir, 'problemDoc.json'), 'x', encoding='utf8') as problem_doc_file: + json.dump(problem_doc, problem_doc_file) + + def test_sklearn_dataset_evaluate(self): + self._create_sklearn_iris_problem_doc() + + pipeline_run_save_path = os.path.join(self.test_dir, 'pipeline_run.yml') + scores_path = os.path.join(self.test_dir, 'scores.csv') + arg = [ + '', + 'runtime', + 'evaluate', + '--input', + 'sklearn://iris', + '--problem', + os.path.join(self.test_dir, 'problemDoc.json'), + 
'--pipeline', + os.path.join(PIPELINE_DIR, 'random-forest-classifier.yml'), + '--data-pipeline', + os.path.join(PIPELINE_DIR, 'data-preparation-no-split.yml'), + '--scores', + scores_path, + '--metric', + 'ACCURACY', + '--metric', + 'F1_MACRO', + '-O', + pipeline_run_save_path + ] + self._call_cli_runtime_without_fail(arg) + + self._assert_valid_saved_pipeline_runs(pipeline_run_save_path) + self._validate_previous_pipeline_run_ids(pipeline_run_save_path) + + dataframe = pandas.read_csv(scores_path) + self.assertEqual(list(dataframe.columns), ['metric', 'value', 'normalized', 'randomSeed', 'fold']) + self.assertEqual(dataframe.values.tolist(), [['ACCURACY', 1.0, 1.0, 0, 0], ['F1_MACRO', 1.0, 1.0, 0, 0]]) + + def test_sklearn_dataset_evaluate_without_problem(self): + pipeline_run_save_path = os.path.join(self.test_dir, 'pipeline_run.yml') + scores_path = os.path.join(self.test_dir, 'scores.csv') + arg = [ + '', + 'runtime', + 'evaluate', + '--input', + 'sklearn://iris', + '--pipeline', + os.path.join(PIPELINE_DIR, 'random-classifier.yml'), + '--data-pipeline', + os.path.join(PIPELINE_DIR, 'data-preparation-no-split.yml'), + '--scoring-pipeline', + os.path.join(PIPELINE_DIR, 'fake_compute_score.yml'), + # this argument has no effect + '--metric', + 'ACCURACY', + '--scores', + scores_path, + '-O', + pipeline_run_save_path + ] + logging_records = self._call_cli_runtime_without_fail(arg) + + self.assertEqual(len(logging_records), 1) + self.assertEqual(logging_records[0].msg, "Not all provided hyper-parameters for the scoring pipeline %(pipeline_id)s were used: %(unused_params)s") + + self._assert_valid_saved_pipeline_runs(pipeline_run_save_path) + self._validate_previous_pipeline_run_ids(pipeline_run_save_path) + + dataframe = pandas.read_csv(scores_path) + self.assertEqual(list(dataframe.columns), ['metric', 'value', 'normalized', 'randomSeed', 'fold']) + self.assertEqual(dataframe.values.tolist(), [['ACCURACY', 1.0, 1.0, 0, 0]]) + + def _assert_prediction_sum(self, prediction_sum, outputs_path): + if prediction_sum is not None: + with open(os.path.join(self.test_dir, outputs_path), 'r') as csv_file: + self.assertEqual(sum([int(v) for v in list(csv_file)[1:]]), prediction_sum) + + def _assert_standard_output_metadata(self, outputs_name='outputs.0', prediction_type='str'): + with open(os.path.join(self.test_dir, outputs_name, 'metadata.json'), 'r') as metadata_file: + metadata = json.load(metadata_file) + + self.assertEqual( + metadata, + [ + { + "selector": [], + "metadata": { + "dimension": { + "length": 150, + "name": "rows", + "semantic_types": ["https://metadata.datadrivendiscovery.org/types/TabularRow"], + }, + "schema": "https://metadata.datadrivendiscovery.org/schemas/v0/container.json", + "semantic_types": ["https://metadata.datadrivendiscovery.org/types/Table"], + "structural_type": "d3m.container.pandas.DataFrame", + }, + }, + { + "selector": ["__ALL_ELEMENTS__"], + "metadata": { + "dimension": { + "length": 1, + "name": "columns", + "semantic_types": ["https://metadata.datadrivendiscovery.org/types/TabularColumn"], + } + }, + }, + {"selector": ["__ALL_ELEMENTS__", 0], + "metadata": {"name": "predictions", "structural_type": prediction_type}}, + ], + ) + + def _assert_nonstandard_output(self, outputs_name='outputs.1'): + with open(os.path.join(self.test_dir, outputs_name, 'data.csv'), 'r') as csv_file: + output_dataframe = pandas.read_csv(csv_file, index_col=False) + learning_dataframe = pandas.read_csv( + os.path.join(DATASET_DIR, 'iris_dataset_1/tables/learningData.csv'), 
index_col=False) + self.assertTrue(learning_dataframe.equals(output_dataframe)) + + with open(os.path.join(self.test_dir, outputs_name, 'metadata.json'), 'r') as metadata_file: + metadata = json.load(metadata_file) + + self.assertEqual( + metadata, + [ + { + "metadata": { + "dimension": { + "length": 150, + "name": "rows", + "semantic_types": [ + "https://metadata.datadrivendiscovery.org/types/TabularRow" + ] + }, + "schema": "https://metadata.datadrivendiscovery.org/schemas/v0/container.json", + "semantic_types": [ + "https://metadata.datadrivendiscovery.org/types/Table" + ], + "structural_type": "d3m.container.pandas.DataFrame" + }, + "selector": [] + }, + { + "metadata": { + "dimension": { + "length": 6, + "name": "columns", + "semantic_types": [ + "https://metadata.datadrivendiscovery.org/types/TabularColumn" + ] + } + }, + "selector": [ + "__ALL_ELEMENTS__" + ] + }, + { + "metadata": { + "name": "d3mIndex", + "semantic_types": [ + "http://schema.org/Integer", + "https://metadata.datadrivendiscovery.org/types/PrimaryKey" + ], + "structural_type": "str" + }, + "selector": [ + "__ALL_ELEMENTS__", + 0 + ] + }, + { + "metadata": { + "name": "sepalLength", + "semantic_types": [ + "http://schema.org/Float", + "https://metadata.datadrivendiscovery.org/types/Attribute" + ], + "structural_type": "str" + }, + "selector": [ + "__ALL_ELEMENTS__", + 1 + ] + }, + { + "metadata": { + "name": "sepalWidth", + "semantic_types": [ + "http://schema.org/Float", + "https://metadata.datadrivendiscovery.org/types/Attribute" + ], + "structural_type": "str" + }, + "selector": [ + "__ALL_ELEMENTS__", + 2 + ] + }, + { + "metadata": { + "name": "petalLength", + "semantic_types": [ + "http://schema.org/Float", + "https://metadata.datadrivendiscovery.org/types/Attribute" + ], + "structural_type": "str" + }, + "selector": [ + "__ALL_ELEMENTS__", + 3 + ] + }, + { + "metadata": { + "name": "petalWidth", + "semantic_types": [ + "http://schema.org/Float", + "https://metadata.datadrivendiscovery.org/types/Attribute" + ], + "structural_type": "str" + }, + "selector": [ + "__ALL_ELEMENTS__", + 4 + ] + }, + { + "metadata": { + "name": "species", + "semantic_types": [ + "https://metadata.datadrivendiscovery.org/types/CategoricalData", + "https://metadata.datadrivendiscovery.org/types/SuggestedTarget", + "https://metadata.datadrivendiscovery.org/types/Attribute" + ], + "structural_type": "str" + }, + "selector": [ + "__ALL_ELEMENTS__", + 5 + ] + } + ] + ) + + def _assert_pipeline_runs_equal(self, pipeline_run_save_path1, pipeline_run_save_path2): + with open(pipeline_run_save_path1, 'r') as f: + pipeline_runs1 = list(utils.yaml_load_all(f)) + + with open(pipeline_run_save_path2, 'r') as f: + pipeline_runs2 = list(utils.yaml_load_all(f)) + + self.assertEqual(len(pipeline_runs1), len(pipeline_runs2)) + + for pipeline_run1, pipeline_run2 in zip(pipeline_runs1, pipeline_runs2): + self.assertTrue(pipeline_run_module.PipelineRun.json_structure_equals(pipeline_run1, pipeline_run2)) + + def test_pipeline_run_json_structure_equals(self): + pipeline_run_save_path1 = os.path.join(self.test_dir, 'pipeline_run1.yml') + self._fit_iris_random_forest(pipeline_run_save_path=pipeline_run_save_path1) + self._assert_valid_saved_pipeline_runs(pipeline_run_save_path1) + + pipeline_run_save_path2 = os.path.join(self.test_dir, 'pipeline_run2.yml') + self._fit_iris_random_forest(pipeline_run_save_path=pipeline_run_save_path2) + self._assert_valid_saved_pipeline_runs(pipeline_run_save_path2) + + self._assert_pipeline_runs_equal(pipeline_run_save_path1, 
pipeline_run_save_path2) + + def _cache_pipeline_for_rerun(self, pipeline_path, cache_dir=None): + """make pipeline searchable by id in test_dir""" + with open(pipeline_path, 'r') as f: + pipeline = utils.yaml_load(f) + if cache_dir is None: + cache_dir = self.test_dir + temp_pipeline_path = os.path.join(cache_dir, pipeline['id'] + '.yml') + with open(temp_pipeline_path, 'w') as f: + utils.yaml_dump(pipeline, f) + + @staticmethod + def _generate_seed(): + return random.randint(2**31, 2**32-1) + + def test_fit_rerun(self): + dataset_path = self._get_iris_dataset_path() + problem_path = self._get_iris_problem_path() + pipeline_path = self._get_random_forest_pipeline_path() + pipeline_run_save_path = self._get_pipeline_run_save_path() + + problem = problem_module.get_problem(problem_path) + inputs = [dataset_module.get_dataset(dataset_path)] + with open(pipeline_path) as f: + pipeline = pipeline_module.Pipeline.from_yaml(f) + + hyperparams = [{}, {}, {'n_estimators': 19}, {}] + random_seed = self._generate_seed() + + with utils.silence(): + fitted_pipeline, predictions, fit_result = runtime.fit( + pipeline, inputs, problem_description=problem, hyperparams=hyperparams, + random_seed=random_seed, context=metadata_base.Context.TESTING, + ) + + with open(pipeline_run_save_path, 'w') as f: + fit_result.pipeline_run.to_yaml(f) + + self._cache_pipeline_for_rerun(pipeline_path) + + pipeline_rerun_save_path = self._get_pipeline_rerun_save_path() + + rerun_arg = [ + '', + '--pipelines-path', + self.test_dir, + 'runtime', + '--datasets', + TEST_DATA_DIR, + 'fit', + '--input-run', + pipeline_run_save_path, + '--output-run', + pipeline_rerun_save_path, + ] + self._call_cli_runtime_without_fail(rerun_arg) + + self._assert_valid_saved_pipeline_runs(pipeline_rerun_save_path) + self._assert_pipeline_runs_equal(pipeline_run_save_path, pipeline_rerun_save_path) + + def test_produce_rerun(self): + dataset_path = self._get_iris_dataset_path() + problem_path = self._get_iris_problem_path() + pipeline_path = self._get_random_forest_pipeline_path() + pipeline_run_save_path = self._get_pipeline_run_save_path() + fitted_pipeline_path = os.path.join(self.test_dir, 'iris-pipeline') + + self._fit_iris_random_forest(fitted_pipeline_path=fitted_pipeline_path) + self.assertTrue(os.path.isfile(fitted_pipeline_path)) + + arg = [ + '', + 'runtime', + 'produce', + '--fitted-pipeline', + fitted_pipeline_path, + '--test-input', + dataset_path, + '--output-run', + pipeline_run_save_path, + ] + self._call_cli_runtime_without_fail(arg) + self._assert_valid_saved_pipeline_runs(pipeline_run_save_path) + + self._cache_pipeline_for_rerun(pipeline_path) + + pipeline_rerun_save_path = self._get_pipeline_rerun_save_path() + + rerun_arg = [ + '', + '--pipelines-path', + self.test_dir, + 'runtime', + '--datasets', + TEST_DATA_DIR, + 'produce', + '--fitted-pipeline', + fitted_pipeline_path, + '--input-run', + pipeline_run_save_path, + '--output-run', + pipeline_rerun_save_path, + ] + self._call_cli_runtime_without_fail(rerun_arg) + self._assert_valid_saved_pipeline_runs(pipeline_rerun_save_path) + + self._assert_pipeline_runs_equal(pipeline_run_save_path, pipeline_rerun_save_path) + + def _assert_scores_equal(self, scores_path, rescores_path): + scores = pandas.read_csv(scores_path) + rescores = pandas.read_csv(rescores_path) + self.assertTrue(scores.equals(rescores), '\n{}\n\n{}'.format(scores, rescores)) + + def _assert_scores_equal_pipeline_run(self, scores_path, pipeline_run_save_path): + scores = pandas.read_csv(scores_path) + 
scores.drop('fold', axis=1, inplace=True, errors='ignore') + scores_no_seed = scores.drop('randomSeed', axis=1, errors='ignore') + + with open(pipeline_run_save_path) as f: + # TODO: always use -1? + pipeline_run = list(utils.yaml_load_all(f))[-1] + self.assertEqual(pipeline_run['run']['phase'], metadata_base.PipelineRunPhase.PRODUCE.name) + # TODO: clean up preprocessing? + pipeline_run_scores_df = pandas.DataFrame(pipeline_run['run']['results']['scores']) + # TODO: is it possible to make pipeline run schema more compatible with scores csv schema? + pipeline_run_scores_df['metric'] = pipeline_run_scores_df['metric'].map(lambda cell: cell['metric']) + pipeline_run_scores_df = pipeline_run_scores_df[scores_no_seed.columns.tolist()] + + pandas.testing.assert_frame_equal(scores_no_seed, pipeline_run_scores_df) + self.assertEqual(scores['randomSeed'].iloc[0], pipeline_run['random_seed']) + + def test_score_rerun(self): + dataset_path = self._get_iris_dataset_path() + problem_path = self._get_iris_problem_path() + pipeline_path = self._get_random_forest_pipeline_path() + pipeline_run_save_path = self._get_pipeline_run_save_path() + fitted_pipeline_path = os.path.join(self.test_dir, 'iris-pipeline') + scores_path = os.path.join(self.test_dir, 'scores.csv') + + random_seed = self._generate_seed() + metrics = runtime.get_metrics_from_list(['ACCURACY', 'F1_MACRO']) + scoring_params = {'add_normalized_scores': 'false'} + scoring_random_seed = self._generate_seed() + + problem = problem_module.get_problem(problem_path) + inputs = [dataset_module.get_dataset(dataset_path)] + with open(pipeline_path) as f: + pipeline = pipeline_module.Pipeline.from_yaml(f) + with open(runtime.DEFAULT_SCORING_PIPELINE_PATH) as f: + scoring_pipeline = pipeline_module.Pipeline.from_yaml(f) + + with utils.silence(): + fitted_pipeline, predictions, fit_result = runtime.fit( + pipeline, inputs, problem_description=problem, random_seed=random_seed, + context=metadata_base.Context.TESTING, + ) + with open(fitted_pipeline_path, 'wb') as f: + pickle.dump(fitted_pipeline, f) + + predictions, produce_result = runtime.produce(fitted_pipeline, inputs) + + scores, score_result = runtime.score( + predictions, inputs, scoring_pipeline=scoring_pipeline, + problem_description=problem, metrics=metrics, predictions_random_seed=random_seed, + context=metadata_base.Context.TESTING, scoring_params=scoring_params, + random_seed=scoring_random_seed + ) + + self.assertFalse(score_result.has_error(), score_result.error) + + scores.to_csv(scores_path) + + runtime.combine_pipeline_runs( + produce_result.pipeline_run, scoring_pipeline_run=score_result.pipeline_run, score_inputs=inputs, + metrics=metrics, scores=scores + ) + with open(pipeline_run_save_path, 'w') as f: + produce_result.pipeline_run.to_yaml(f) + + self.assertTrue(os.path.isfile(fitted_pipeline_path)) + self.assertTrue(os.path.isfile(scores_path), 'scores were not generated') + self._assert_valid_saved_pipeline_runs(pipeline_run_save_path) + + dataframe = pandas.read_csv(scores_path) + + self.assertEqual(list(dataframe.columns), ['metric', 'value', 'randomSeed']) + self.assertEqual(dataframe.values.tolist(), [['ACCURACY', 1.0, random_seed], ['F1_MACRO', 1.0, random_seed]]) + + self._cache_pipeline_for_rerun(pipeline_path) + + pipeline_rerun_save_path = self._get_pipeline_rerun_save_path() + rescores_path = self._get_rescores_path() + + rerun_arg = [ + '', + '--pipelines-path', + self.test_dir, + 'runtime', + '--datasets', + TEST_DATA_DIR, + 'score', + '--fitted-pipeline', + 
fitted_pipeline_path, + '--input-run', + pipeline_run_save_path, + '--output-run', + pipeline_rerun_save_path, + '--scores', + rescores_path, + ] + self._call_cli_runtime_without_fail(rerun_arg) + self.assertTrue(os.path.isfile(pipeline_rerun_save_path)) + self._assert_valid_saved_pipeline_runs(pipeline_rerun_save_path) + self._assert_scores_equal(scores_path, rescores_path) + self._assert_scores_equal_pipeline_run(scores_path, pipeline_rerun_save_path) + self._assert_pipeline_runs_equal(pipeline_run_save_path, pipeline_rerun_save_path) + + def test_fit_produce_rerun(self): + dataset_path = self._get_iris_dataset_path() + problem_path = self._get_iris_problem_path() + pipeline_path = self._get_random_forest_pipeline_path() + pipeline_run_save_path = self._get_pipeline_run_save_path() + + hyperparams = [{}, {}, {'n_estimators': 19}, {}] + random_seed = self._generate_seed() + + problem = problem_module.get_problem(problem_path) + inputs = [dataset_module.get_dataset(dataset_path)] + with open(pipeline_path) as f: + pipeline = pipeline_module.Pipeline.from_yaml(f) + + with utils.silence(): + fitted_pipeline, predictions, fit_result = runtime.fit( + pipeline, inputs, problem_description=problem, hyperparams=hyperparams, + random_seed=random_seed, context=metadata_base.Context.TESTING, + ) + predictions, produce_result = runtime.produce(fitted_pipeline, inputs) + + with open(pipeline_run_save_path, 'w') as f: + fit_result.pipeline_run.to_yaml(f) + produce_result.pipeline_run.to_yaml(f, appending=True) + + self._cache_pipeline_for_rerun(pipeline_path) + + pipeline_rerun_save_path = self._get_pipeline_rerun_save_path() + + rerun_arg = [ + '', + '--pipelines-path', + self.test_dir, + '--strict-digest', + 'runtime', + '--datasets', + TEST_DATA_DIR, + 'fit-produce', + '--input-run', + pipeline_run_save_path, + '--output-run', + pipeline_rerun_save_path, + ] + self._call_cli_runtime_without_fail(rerun_arg) + self._assert_valid_saved_pipeline_runs(pipeline_rerun_save_path) + + self._assert_pipeline_runs_equal(pipeline_run_save_path, pipeline_rerun_save_path) + + def test_fit_score_rerun(self): + dataset_path = self._get_iris_dataset_path() + problem_path = self._get_iris_problem_path() + pipeline_path = self._get_random_forest_pipeline_path() + pipeline_run_save_path = self._get_pipeline_run_save_path() + scores_path = self._get_scores_path() + + hyperparams = [{}, {}, {'n_estimators': 19}, {}] + random_seed = self._generate_seed() + metrics = runtime.get_metrics_from_list(['ACCURACY', 'F1_MACRO']) + scoring_params = {'add_normalized_scores': 'false'} + scoring_random_seed = self._generate_seed() + + problem = problem_module.get_problem(problem_path) + inputs = [dataset_module.get_dataset(dataset_path)] + with open(pipeline_path) as f: + pipeline = pipeline_module.Pipeline.from_yaml(f) + with open(runtime.DEFAULT_SCORING_PIPELINE_PATH) as f: + scoring_pipeline = pipeline_module.Pipeline.from_yaml(f) + + with utils.silence(): + fitted_pipeline, predictions, fit_result = runtime.fit( + pipeline, inputs, problem_description=problem, hyperparams=hyperparams, + random_seed=random_seed, context=metadata_base.Context.TESTING, + ) + self.assertFalse(fit_result.has_error(), fit_result.error) + + predictions, produce_result = runtime.produce(fitted_pipeline, inputs) + self.assertFalse(produce_result.has_error(), produce_result.error) + + scores, score_result = runtime.score( + predictions, inputs, scoring_pipeline=scoring_pipeline, + problem_description=problem, metrics=metrics, + 
predictions_random_seed=fitted_pipeline.random_seed, + context=metadata_base.Context.TESTING, scoring_params=scoring_params, random_seed=scoring_random_seed + ) + + self.assertFalse(score_result.has_error(), score_result.error) + scores.to_csv(scores_path) + + runtime.combine_pipeline_runs( + produce_result.pipeline_run, scoring_pipeline_run=score_result.pipeline_run, score_inputs=inputs, + metrics=metrics, scores=scores + ) + + with open(pipeline_run_save_path, 'w') as f: + fit_result.pipeline_run.to_yaml(f) + produce_result.pipeline_run.to_yaml(f, appending=True) + + self._assert_valid_saved_pipeline_runs(pipeline_run_save_path) + + self._cache_pipeline_for_rerun(pipeline_path) + + pipeline_rerun_save_path = self._get_pipeline_rerun_save_path() + rescores_path = self._get_rescores_path() + + rerun_arg = [ + '', + '--pipelines-path', + self.test_dir, + '--strict-digest', + 'runtime', + '--datasets', + TEST_DATA_DIR, + 'fit-score', + '--input-run', + pipeline_run_save_path, + '--scores', + rescores_path, + '--output-run', + pipeline_rerun_save_path, + ] + self._call_cli_runtime_without_fail(rerun_arg) + self._assert_valid_saved_pipeline_runs(pipeline_rerun_save_path) + self._assert_scores_equal(scores_path, rescores_path) + self._assert_scores_equal_pipeline_run(scores_path, pipeline_rerun_save_path) + self._assert_pipeline_runs_equal(pipeline_run_save_path, pipeline_rerun_save_path) + + def test_evaluate_rerun(self): + dataset_path = self._get_iris_dataset_path() + problem_path = self._get_iris_problem_path() + pipeline_path = self._get_random_forest_pipeline_path() + data_pipeline_path = self._get_train_test_split_data_pipeline_path() + pipeline_run_save_path = self._get_pipeline_run_save_path() + scores_path = self._get_scores_path() + + hyperparams = [{}, {}, {'n_estimators': 19}, {}] + random_seed = self._generate_seed() + metrics = runtime.get_metrics_from_list(['ACCURACY', 'F1_MACRO']) + scoring_params = {'add_normalized_scores': 'false'} + scoring_random_seed = self._generate_seed() + data_params = {'shuffle': 'true', 'stratified': 'true', 'train_score_ratio': '0.59'} + data_random_seed = self._generate_seed() + + problem = problem_module.get_problem(problem_path) + inputs = [dataset_module.get_dataset(dataset_path)] + with open(pipeline_path) as f: + pipeline = pipeline_module.Pipeline.from_yaml(f) + with open(data_pipeline_path) as f: + data_pipeline = pipeline_module.Pipeline.from_yaml(f) + with open(runtime.DEFAULT_SCORING_PIPELINE_PATH) as f: + scoring_pipeline = pipeline_module.Pipeline.from_yaml(f) + + with utils.silence(): + dummy_runtime_environment = pipeline_run_module.RuntimeEnvironment(worker_id='dummy worker id') + + all_scores, all_results = runtime.evaluate( + pipeline, inputs, data_pipeline=data_pipeline, scoring_pipeline=scoring_pipeline, + problem_description=problem, data_params=data_params, metrics=metrics, + context=metadata_base.Context.TESTING, scoring_params=scoring_params, + hyperparams=hyperparams, random_seed=random_seed, + data_random_seed=data_random_seed, scoring_random_seed=scoring_random_seed, + runtime_environment=dummy_runtime_environment, + ) + + self.assertEqual(len(all_scores), 1) + scores = runtime.combine_folds(all_scores) + scores.to_csv(scores_path) + + if any(result.has_error() for result in all_results): + self.fail([result.error for result in all_results if result.has_error()][0]) + + with open(pipeline_run_save_path, 'w') as f: + for i, pipeline_run in enumerate(all_results.pipeline_runs): + pipeline_run.to_yaml(f, appending=i>0) + + 
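+        # Cache both the main and the data-preparation pipeline by id so the CLI rerun can resolve them via --pipelines-path.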
self._cache_pipeline_for_rerun(pipeline_path) + self._cache_pipeline_for_rerun(data_pipeline_path) + + pipeline_rerun_save_path = self._get_pipeline_rerun_save_path() + rescores_path = self._get_rescores_path() + + rerun_arg = [ + '', + '--pipelines-path', + self.test_dir, + 'runtime', + '--datasets', + TEST_DATA_DIR, + 'evaluate', + '--input-run', + pipeline_run_save_path, + '--output-run', + pipeline_rerun_save_path, + '--scores', + rescores_path, + ] + self._call_cli_runtime_without_fail(rerun_arg) + self._assert_valid_saved_pipeline_runs(pipeline_rerun_save_path) + self._assert_scores_equal(scores_path, rescores_path) + self._assert_scores_equal_pipeline_run(scores_path, pipeline_rerun_save_path) + self._assert_pipeline_runs_equal(pipeline_run_save_path, pipeline_rerun_save_path) + + # See: https://gitlab.com/datadrivendiscovery/d3m/issues/406 + # TODO: Test rerun validation code (that we throw exceptions on invalid pipeline runs). + # TODO: Test rerun with multiple inputs (non-standard pipeline). + # TODO: Test rerun without problem description. + # TODO: Test evaluate rerun with data split file. + + def test_validate_gzipped_pipeline_run(self): + # First, generate the pipeline run file + pipeline_run_save_path = self._get_pipeline_run_save_path() + gzip_pipeline_run_save_path = '{pipeline_run_save_path}.gz'.format(pipeline_run_save_path=pipeline_run_save_path) + fitted_pipeline_path = os.path.join(self.test_dir, 'fitted-pipeline') + self._fit_iris_random_forest( + fitted_pipeline_path=fitted_pipeline_path, pipeline_run_save_path=pipeline_run_save_path + ) + + # Second, gzip the pipeline run file + with open(pipeline_run_save_path, 'rb') as file_in: + with gzip.open(gzip_pipeline_run_save_path, 'wb') as file_out: + shutil.copyfileobj(file_in, file_out) + os.remove(pipeline_run_save_path) + + # Third, ensure that calling 'pipeline-run validate' on the gzipped pipeline run file is successful + arg = [ + '', + 'pipeline-run', + 'validate', + gzip_pipeline_run_save_path, + ] + self._call_cli_runtime_without_fail(arg) + + def test_help_message(self): + arg = [ + '', + 'runtime', + 'fit', + '--version', + ] + + with io.StringIO() as buffer: + with contextlib.redirect_stderr(buffer): + with self.assertRaises(SystemExit): + cli.main(arg) + + help = buffer.getvalue() + self.assertTrue('usage: d3m runtime fit' in help, help) + + +if __name__ == '__main__': + unittest.main() diff --git a/d3m/tests/test_compute_scores.py b/d3m/tests/test_compute_scores.py new file mode 100644 index 0000000..3effee4 --- /dev/null +++ b/d3m/tests/test_compute_scores.py @@ -0,0 +1,321 @@ +import os +import unittest + +import numpy + +from d3m import container, exceptions +from d3m.metadata import base as metadata_base +from d3m.contrib.primitives import compute_scores + + +class ComputeScoresPrimitiveTestCase(unittest.TestCase): + def test_regression(self): + dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'datasets', 'database_dataset_1', 'datasetDoc.json')) + + dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path)) + + # We set semantic types like runtime would. 
+ dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 4), 'https://metadata.datadrivendiscovery.org/types/Target') + dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 4), 'https://metadata.datadrivendiscovery.org/types/TrueTarget') + dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 4), 'https://metadata.datadrivendiscovery.org/types/Attribute') + + random = numpy.random.RandomState(42) + + # Create a synthetic prediction DataFrame. + d3mIndex = dataset['learningData'].iloc[:, 0].astype(int) + value = random.randn(len(d3mIndex)) + predictions = container.DataFrame({'d3mIndex': d3mIndex, 'value': value}, generate_metadata=True) + shuffled_predictions = predictions.reindex(random.permutation(predictions.index)).reset_index(drop=True) + + hyperparams_class = compute_scores.ComputeScoresPrimitive.metadata.get_hyperparams() + metrics_class = hyperparams_class.configuration['metrics'].elements + primitive = compute_scores.ComputeScoresPrimitive(hyperparams=hyperparams_class.defaults().replace({ + 'metrics': [metrics_class({ + 'metric': 'MEAN_SQUARED_ERROR', + 'pos_label': None, + 'k': None, + }), metrics_class({ + 'metric': 'ROOT_MEAN_SQUARED_ERROR', + 'pos_label': None, + 'k': None, + }), metrics_class({ + 'metric': 'MEAN_ABSOLUTE_ERROR', + 'pos_label': None, + 'k': None, + }), metrics_class({ + 'metric': 'R_SQUARED', + 'pos_label': None, + 'k': None, + })], + })) + + for name, pred in zip(['predictions', 'shuffled_predictions'], [predictions, shuffled_predictions]): + scores = primitive.produce(inputs=pred, score_dataset=dataset).value + self.assertEqual(scores.values.tolist(), [ + ['MEAN_SQUARED_ERROR', 3112.184932446708, 0.08521485450672399], + ['ROOT_MEAN_SQUARED_ERROR', 55.786960236660214, 0.9721137517700256], + ['MEAN_ABSOLUTE_ERROR', 54.579668078204385, 0.9727169385086356], + ['R_SQUARED', -22.62418041588221, 0.9881884591239001], + ], name) + + self.assertEqual(scores.metadata.query_column(0)['name'], 'metric', name) + self.assertEqual(scores.metadata.query_column(1)['name'], 'value', name) + + def test_multivariate(self): + dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'datasets', 'multivariate_dataset_1', 'datasetDoc.json')) + + dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path)) + + # We set semantic types like runtime would. 
+ dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 2), 'https://metadata.datadrivendiscovery.org/types/Target') + dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 2), 'https://metadata.datadrivendiscovery.org/types/TrueTarget') + dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 2), 'https://metadata.datadrivendiscovery.org/types/Attribute') + dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 3), 'https://metadata.datadrivendiscovery.org/types/Target') + dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 3), 'https://metadata.datadrivendiscovery.org/types/TrueTarget') + dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 3), 'https://metadata.datadrivendiscovery.org/types/Attribute') + + random = numpy.random.RandomState(42) + + # Create a synthetic prediction DataFrame. + d3mIndex = dataset['learningData'].iloc[:, 0].astype(int) + amplitude = random.randn(len(d3mIndex)) + lengthscale = random.randn(len(d3mIndex)) + predictions = container.DataFrame({'d3mIndex': d3mIndex, 'amplitude': amplitude, 'lengthscale': lengthscale}, generate_metadata=True) + shuffled_predictions = predictions.reindex(random.permutation(predictions.index)).reset_index(drop=True) + + hyperparams_class = compute_scores.ComputeScoresPrimitive.metadata.get_hyperparams() + metrics_class = hyperparams_class.configuration['metrics'].elements + primitive = compute_scores.ComputeScoresPrimitive(hyperparams=hyperparams_class.defaults().replace({ + 'metrics': [metrics_class({ + 'metric': 'MEAN_SQUARED_ERROR', + 'pos_label': None, + 'k': None, + }), metrics_class({ + 'metric': 'ROOT_MEAN_SQUARED_ERROR', + 'pos_label': None, + 'k': None, + }), metrics_class({ + 'metric': 'MEAN_ABSOLUTE_ERROR', + 'pos_label': None, + 'k': None, + })], + })) + + for name, pred in zip(['predictions', 'shuffled_predictions'], [predictions, shuffled_predictions]): + scores = primitive.produce(inputs=pred, score_dataset=dataset).value + self.assertEqual(scores.values.tolist(), [ + ['MEAN_SQUARED_ERROR', 1.7627871219522482, 0.9991186066672619], + ['ROOT_MEAN_SQUARED_ERROR', 1.3243591896125282, 0.9993378205019783], + ['MEAN_ABSOLUTE_ERROR', 1.043095768817859, 0.9994784521628801], + ], name) + + self.assertEqual(scores.metadata.query_column(0)['name'], 'metric', name) + self.assertEqual(scores.metadata.query_column(1)['name'], 'value', name) + + def test_classification(self): + dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'datasets', 'iris_dataset_1', 'datasetDoc.json')) + + dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path)) + + # We set semantic types like runtime would. + dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Target') + dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/TrueTarget') + dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Attribute') + + random = numpy.random.RandomState(42) + + # Create a synthetic prediction DataFrame. 
+ d3mIndex = dataset['learningData'].iloc[:, 0].astype(int) + species = random.choice(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], len(d3mIndex)) + predictions = container.DataFrame({'d3mIndex': d3mIndex, 'species': species}, generate_metadata=True) + shuffled_predictions = predictions.reindex(random.permutation(predictions.index)).reset_index(drop=True) + + hyperparams_class = compute_scores.ComputeScoresPrimitive.metadata.get_hyperparams() + metrics_class = hyperparams_class.configuration['metrics'].elements + primitive = compute_scores.ComputeScoresPrimitive(hyperparams=hyperparams_class.defaults().replace({ + 'metrics': [metrics_class({ + 'metric': 'ACCURACY', + 'pos_label': None, + 'k': None, + }), metrics_class({ + 'metric': 'F1_MICRO', + 'pos_label': None, + 'k': None, + }), metrics_class({ + 'metric': 'F1_MACRO', + 'pos_label': None, + 'k': None, + })], + })) + + for name, pred in zip(['predictions', 'shuffled_predictions'], [predictions, shuffled_predictions]): + scores = primitive.produce(inputs=pred, score_dataset=dataset).value + self.assertEqual(scores.values.tolist(), [ + ['ACCURACY', 0.4066666666666667, 0.4066666666666667], + ['F1_MICRO', 0.4066666666666667, 0.4066666666666667], + ['F1_MACRO', 0.4051068540623797, 0.4051068540623797], + ], name) + + self.assertEqual(scores.metadata.query_column(0)['name'], 'metric', name) + self.assertEqual(scores.metadata.query_column(1)['name'], 'value', name) + + # TODO: Test also when there is both "color_not_class" and "bounding_polygon_area" targets predicted. + def test_object_detection_just_bounding_polygon(self): + dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'datasets', 'object_dataset_1', 'datasetDoc.json')) + + dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path)) + + # We set semantic types like runtime would. + dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 3), 'https://metadata.datadrivendiscovery.org/types/Target') + dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 3), 'https://metadata.datadrivendiscovery.org/types/TrueTarget') + dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 3), 'https://metadata.datadrivendiscovery.org/types/Attribute') + + random = numpy.random.RandomState(42) + + # Create a synthetic prediction DataFrame. 
+ predictions = container.DataFrame([ + [0, '330,463,330,505,387,505,387,463', 0.0739], + [0, '420,433,420,498,451,498,451,433', 0.091], + [0, '328,465,328,540,403,540,403,465', 0.1008], + [0, '480,477,480,522,508,522,508,477', 0.1012], + [0, '357,460,357,537,417,537,417,460', 0.1058], + [0, '356,456,356,521,391,521,391,456', 0.0843], + [1, '345,460,345,547,415,547,415,460', 0.0539], + [1, '381,362,381,513,455,513,455,362', 0.0542], + [1, '382,366,382,422,416,422,416,366', 0.0559], + [1, '730,463,730,583,763,583,763,463', 0.0588], + ], columns=['d3mIndex', 'bounding_polygon_area', 'confidence'], generate_metadata=True) + shuffled_predictions = predictions.reindex(random.permutation(predictions.index)).reset_index(drop=True) + + hyperparams_class = compute_scores.ComputeScoresPrimitive.metadata.get_hyperparams() + metrics_class = hyperparams_class.configuration['metrics'].elements + primitive = compute_scores.ComputeScoresPrimitive(hyperparams=hyperparams_class.defaults().replace({ + 'metrics': [metrics_class({ + 'metric': 'OBJECT_DETECTION_AVERAGE_PRECISION', + 'pos_label': None, + 'k': None, + })], + })) + + for name, pred in zip(['predictions', 'shuffled_predictions'], [predictions, shuffled_predictions]): + scores = primitive.produce(inputs=pred, score_dataset=dataset).value + self.assertEqual(scores.values.tolist(), [ + ['OBJECT_DETECTION_AVERAGE_PRECISION', 0.125, 0.125], + ], name) + + self.assertEqual(scores.metadata.query_column(0)['name'], 'metric') + self.assertEqual(scores.metadata.query_column(1)['name'], 'value') + self.assertEqual(scores.metadata.query_column(2)['name'], 'normalized') + + def test_all_labels(self): + truth = container.DataFrame([ + [3, 'happy-pleased'], + [3, 'relaxing-calm'], + [7, 'amazed-suprised'], + [7, 'happy-pleased'], + [13, 'quiet-still'], + [13, 'sad-lonely'], + ], columns=['d3mIndex', 'class_label']) + + truth_dataset = container.Dataset({'learningData': truth}, generate_metadata=True) + + truth_dataset.metadata = truth_dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 0), 'https://metadata.datadrivendiscovery.org/types/PrimaryKey') + truth_dataset.metadata = truth_dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 1), 'https://metadata.datadrivendiscovery.org/types/Target') + truth_dataset.metadata = truth_dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 1), 'https://metadata.datadrivendiscovery.org/types/TrueTarget') + + predictions = container.DataFrame([ + [3, 'happy-pleased'], + [3, 'sad-lonely'], + [7, 'amazed-suprised'], + [7, 'happy-pleased'], + [13, 'quiet-still'], + [13, 'happy-pleased'], + ], columns=['d3mIndex', 'class_label'], generate_metadata=True) + + hyperparams_class = compute_scores.ComputeScoresPrimitive.metadata.get_hyperparams() + metrics_class = hyperparams_class.configuration['metrics'].elements + all_labels_class = hyperparams_class.configuration['all_labels'].elements + primitive = compute_scores.ComputeScoresPrimitive(hyperparams=hyperparams_class.defaults().replace({ + 'metrics': [metrics_class({ + 'metric': 'HAMMING_LOSS', + 'pos_label': None, + 'k': None, + })], + })) + + scores = primitive.produce(inputs=predictions, score_dataset=truth_dataset).value + self.assertEqual(scores.values.tolist(), [ + ['HAMMING_LOSS', 0.26666666666666666, 0.7333333333333334], + ]) + + self.assertEqual(scores.metadata.query_column(0)['name'], 'metric') + self.assertEqual(scores.metadata.query_column(1)['name'], 'value') + 
self.assertEqual(scores.metadata.query_column(2)['name'], 'normalized') + + primitive = compute_scores.ComputeScoresPrimitive(hyperparams=hyperparams_class.defaults().replace({ + 'metrics': [metrics_class({ + 'metric': 'HAMMING_LOSS', + 'pos_label': None, + 'k': None, + })], + 'all_labels': [all_labels_class({ + 'column_name': 'class_label', + 'labels': ['happy-pleased', 'relaxing-calm', 'amazed-suprised', 'quiet-still', 'sad-lonely', 'foobar'], + })], + })) + + scores = primitive.produce(inputs=predictions, score_dataset=truth_dataset).value + self.assertEqual(scores.values.tolist(), [ + ['HAMMING_LOSS', 0.2222222222222222, 0.7777777777777778], + ]) + + primitive = compute_scores.ComputeScoresPrimitive(hyperparams=hyperparams_class.defaults().replace({ + 'metrics': [metrics_class({ + 'metric': 'HAMMING_LOSS', + 'pos_label': None, + 'k': None, + })], + 'all_labels': [all_labels_class({ + 'column_name': 'class_label', + 'labels': ['happy-pleased', 'relaxing-calm', 'amazed-suprised'], + })], + })) + + with self.assertRaisesRegex(exceptions.InvalidArgumentValueError, 'Truth contains extra labels'): + primitive.produce(inputs=predictions, score_dataset=truth_dataset) + + truth_dataset.metadata = truth_dataset.metadata.update_column(1, { + 'all_distinct_values': ['happy-pleased', 'relaxing-calm', 'amazed-suprised', 'quiet-still', 'sad-lonely', 'foobar'], + }, at=('learningData',)) + + primitive = compute_scores.ComputeScoresPrimitive(hyperparams=hyperparams_class.defaults().replace({ + 'metrics': [metrics_class({ + 'metric': 'HAMMING_LOSS', + 'pos_label': None, + 'k': None, + })], + })) + + scores = primitive.produce(inputs=predictions, score_dataset=truth_dataset).value + self.assertEqual(scores.values.tolist(), [ + ['HAMMING_LOSS', 0.2222222222222222, 0.7777777777777778], + ]) + + truth_dataset.metadata = truth_dataset.metadata.update_column(1, { + 'all_distinct_values': ['happy-pleased', 'relaxing-calm', 'amazed-suprised'], + }, at=('learningData',)) + + primitive = compute_scores.ComputeScoresPrimitive(hyperparams=hyperparams_class.defaults().replace({ + 'metrics': [metrics_class({ + 'metric': 'HAMMING_LOSS', + 'pos_label': None, + 'k': None, + })], + })) + + with self.assertRaisesRegex(exceptions.InvalidArgumentValueError, 'Truth contains extra labels'): + primitive.produce(inputs=predictions, score_dataset=truth_dataset) + + +if __name__ == '__main__': + unittest.main() diff --git a/d3m/tests/test_container_metadata.py b/d3m/tests/test_container_metadata.py new file mode 100644 index 0000000..003e9e0 --- /dev/null +++ b/d3m/tests/test_container_metadata.py @@ -0,0 +1,1258 @@ +import collections +import unittest + +import numpy +import pandas + +from d3m import container, utils +from d3m.metadata import base + + +class TestContainerMetadata(unittest.TestCase): + def test_update_with_generated_metadata(self): + metadata = base.DataMetadata({ + 'schema': base.CONTAINER_SCHEMA_VERSION, + 'structural_type': container.ndarray, + }) + + cells_metadata = collections.OrderedDict() + cells_metadata[('a',)] = {'other': 1} + cells_metadata[('b',)] = {'other': 2} + cells_metadata[('c',)] = {'other': 3} + cells_metadata[(base.ALL_ELEMENTS,)] = {'foo': 'bar'} + cells_metadata[('other', 'a')] = {'other': 4} + cells_metadata[('other', 'b')] = {'other': 5} + cells_metadata[('other', 'c')] = {'other': 6} + cells_metadata[('other', base.ALL_ELEMENTS)] = {'foo': 'bar2'} + + metadata._update_with_generated_metadata(cells_metadata) + + 
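+        # Generated metadata is merged into the existing metadata, with ALL_ELEMENTS selectors listed before per-element selectors.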
self.assertEqual(utils.to_json_structure(metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + 'schema': base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.numpy.ndarray', + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': {'foo': 'bar'}, + }, { + 'selector': ['a'], + 'metadata': {'other': 1}, + }, { + 'selector': ['b'], + 'metadata': {'other': 2}, + }, { + 'selector': ['c'], + 'metadata': {'other': 3}, + }, { + 'selector': ['other', '__ALL_ELEMENTS__'], + 'metadata': {'foo': 'bar2'}, + }, { + 'selector': ['other', 'a'], + 'metadata': {'other': 4}, + }, { + 'selector': ['other', 'b'], + 'metadata': {'other': 5}, + }, { + 'selector': ['other', 'c'], + 'metadata': {'other': 6}, + }]) + + metadata = base.DataMetadata({ + 'schema': base.CONTAINER_SCHEMA_VERSION, + 'structural_type': container.ndarray, + 'semantic_types': ['http://example.com/Type1'], + 'dimension': { + 'length': 0, + 'foobar': 42, + 'semantic_types': ['http://example.com/Type2'], + } + }) + + metadata = metadata.update(('a',), { + 'semantic_types': ['http://example.com/Type3'], + 'dimension': { + 'length': 0, + 'foobar': 45, + 'semantic_types': ['http://example.com/Type4'], + } + }) + + cells_metadata = collections.OrderedDict() + cells_metadata[()] = { + 'other': 1, + 'structural_type': container.ndarray, + 'semantic_types': ['http://example.com/Type1a'], + 'dimension': { + 'length': 100, + 'name': 'test1', + 'semantic_types': ['http://example.com/Type2a'], + } + } + cells_metadata[('a',)] = { + 'semantic_types': ['http://example.com/Type3', 'http://example.com/Type3a'], + 'dimension': { + 'length': 200, + 'name': 'test2', + 'semantic_types': ['http://example.com/Type4', 'http://example.com/Type4a'], + } + } + cells_metadata[('b',)] = {'other': 2} + + metadata._update_with_generated_metadata(cells_metadata) + + self.assertEqual(utils.to_json_structure(metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + 'schema': base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.numpy.ndarray', + 'other': 1, + 'semantic_types': ['http://example.com/Type1', 'http://example.com/Type1a'], + 'dimension': { + 'length': 100, + 'name': 'test1', + 'foobar': 42, + 'semantic_types': ['http://example.com/Type2', 'http://example.com/Type2a'], + }, + }, + }, { + 'selector': ['a'], + 'metadata': { + 'semantic_types': ['http://example.com/Type3', 'http://example.com/Type3a'], + 'dimension': { + 'length': 200, + 'name': 'test2', + 'foobar': 45, + 'semantic_types': ['http://example.com/Type4', 'http://example.com/Type4a'], + }, + }, + }, { + 'selector': ['b'], + 'metadata': {'other': 2}, + }]) + + self.assertEqual(metadata.to_json_structure(), [{ + 'selector': [], + 'metadata': { + 'schema': base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.numpy.ndarray', + 'other': 1, + 'semantic_types': ['http://example.com/Type1', 'http://example.com/Type1a'], + 'dimension': { + 'length': 100, + 'name': 'test1', + 'foobar': 42, + 'semantic_types': ['http://example.com/Type2', 'http://example.com/Type2a'], + }, + }, + }, { + 'selector': ['a'], + 'metadata': { + 'semantic_types': ['http://example.com/Type3', 'http://example.com/Type3a'], + 'dimension': { + 'length': 200, + 'name': 'test2', + 'foobar': 45, + 'semantic_types': ['http://example.com/Type4', 'http://example.com/Type4a'], + }, + }, + }, { + 'selector': ['b'], + 'metadata': {'other': 2}, + }]) + + def test_dataframe(self): + df = container.DataFrame({'A': [1, 2, 3], 'B': ['a', 'b', 'c']}, 
generate_metadata=True) + + self.assertEqual(utils.to_json_structure(df.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + 'schema': base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 2, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': { + 'name': 'A', + 'structural_type': 'numpy.int64', + } + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': { + 'name': 'B', + 'structural_type': 'str', + }, + }]) + + def test_dataset(self): + dataset = container.Dataset({'0': container.DataFrame({'A': [1, 2, 3], 'B': ['a', 'b', 'c']})}, generate_metadata=False) + + compact_metadata = dataset.metadata.generate(dataset, compact=True) + noncompact_metadata = dataset.metadata.generate(dataset, compact=False) + + self.assertEqual(utils.to_json_structure(compact_metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + 'schema': base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.dataset.Dataset', + 'dimension': { + 'name': 'resources', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/DatasetResource'], + 'length': 1, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 2, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__', 0], + 'metadata': { + 'name': 'A', + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__', 1], + 'metadata': { + 'name': 'B', + 'structural_type': 'str', + }, + }]) + + self.assertEqual(utils.to_json_structure(noncompact_metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + 'schema': base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.dataset.Dataset', + 'dimension': { + 'name': 'resources', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/DatasetResource'], + 'length': 1, + }, + }, + }, { + 'selector': ['0'], + 'metadata': { + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 3, + }, + }, + }, { + 'selector': ['0', '__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 2, + }, + }, + }, { + 'selector': ['0', '__ALL_ELEMENTS__', 0], + 'metadata': { + 'name': 'A', + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['0', '__ALL_ELEMENTS__', 1], + 'metadata': { + 'name': 'B', + 
'structural_type': 'str', + }, + }]) + + def test_list(self): + lst = container.List(['a', 'b', 'c'], generate_metadata=True) + + self.assertEqual(utils.to_json_structure(lst.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + 'schema': base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.list.List', + 'dimension': { + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'structural_type': 'str', + }, + }]) + + lst = container.List([1, 'a', 2.0], generate_metadata=True) + + self.assertEqual(utils.to_json_structure(lst.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + 'schema': base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.list.List', + 'dimension': { + 'length': 3, + }, + }, + }, { + 'selector': [0], + 'metadata': { + 'structural_type': 'int', + }, + }, { + 'selector': [1], + 'metadata': { + 'structural_type': 'str', + }, + }, { + 'selector': [2], + 'metadata': { + 'structural_type': 'float', + }, + }]) + + lst = container.List([container.DataFrame({'A': [1, 2, 3], 'B': ['a', 'b', 'c']})], generate_metadata=True) + + self.assertEqual(utils.to_json_structure(lst.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + 'schema': base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.list.List', + 'dimension': { + 'length': 1, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 2, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__', 0], + 'metadata': { + 'name': 'A', + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__', 1], + 'metadata': { + 'name': 'B', + 'structural_type': 'str', + }, + }]) + + def test_ndarray(self): + array = container.ndarray(numpy.array([1, 2, 3]), generate_metadata=True) + + self.assertEqual(utils.to_json_structure(array.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + 'schema': base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.numpy.ndarray', + 'dimension': { + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'structural_type': 'numpy.int64', + }, + }]) + + def test_dataframe_with_names_kept(self): + df = container.DataFrame({'A': [1, 2, 3], 'B': ['a', 'b', 'c']}, generate_metadata=True) + + df.metadata = df.metadata.update((base.ALL_ELEMENTS, 0), { + 'name': 'first_column', + }) + df.metadata = df.metadata.update((base.ALL_ELEMENTS, 1), { + 'name': 'second_column', + }) + + self.assertEqual(utils.to_json_structure(df.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + 'schema': base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 
'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 2, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': { + 'name': 'first_column', + 'structural_type': 'numpy.int64', + } + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': { + 'name': 'second_column', + 'structural_type': 'str', + }, + }]) + + df2 = container.DataFrame({'A': [1, 2, 3, 4], 'B': ['a', 'b', 'c', 'd']}) + + df2.metadata = df.metadata.generate(df2) + + self.assertEqual(utils.to_json_structure(df2.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + 'schema': base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 4, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 2, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': { + 'name': 'first_column', + 'structural_type': 'numpy.int64', + } + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': { + 'name': 'second_column', + 'structural_type': 'str', + }, + }]) + + def test_dataframe_tabular_semantic_types(self): + # A DataFrame with explicit WRONG metadata. + df = container.DataFrame({'A': [1, 2, 3], 'B': ['a', 'b', 'c']}, { + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + }, + }, generate_metadata=True) + + self.assertEqual(utils.to_json_structure(df.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + 'schema': base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + # We respect the name, but we override the semantic types. 
+ 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 2, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': { + 'name': 'A', + 'structural_type': 'numpy.int64', + } + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': { + 'name': 'B', + 'structural_type': 'str', + }, + }]) + + def test_complex_value(self): + dataset = container.Dataset({ + '0': container.DataFrame({ + 'A': [ + container.ndarray(numpy.array(['a', 'b', 'c'])), + container.ndarray(numpy.array([1, 2, 3])), + container.ndarray(numpy.array([1.0, 2.0, 3.0])), + ], + 'B': [ + container.List(['a', 'b', 'c']), + container.List([1, 2, 3]), + container.List([1.0, 2.0, 3.0]), + ], + }), + }, generate_metadata=False) + + dataset_metadata = dataset.metadata.generate(dataset, compact=True) + + self.assertEqual(utils.to_json_structure(dataset_metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + 'schema': base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.dataset.Dataset', + 'dimension': { + 'name': 'resources', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/DatasetResource'], + 'length': 1, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 2, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__', '__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'length': 3 + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__', 0], + 'metadata': { + 'structural_type': 'd3m.container.numpy.ndarray', + 'name': 'A', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__', 1], + 'metadata': { + 'structural_type': 'd3m.container.list.List', + 'name': 'B', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0, 0, '__ALL_ELEMENTS__'], + 'metadata': { + 'structural_type': 'numpy.str_', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0, 1, '__ALL_ELEMENTS__'], + 'metadata': { + 'structural_type': 'str', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 1, 0, '__ALL_ELEMENTS__'], + 'metadata': { + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 1, 1, '__ALL_ELEMENTS__'], + 'metadata': { + 'structural_type': 'int', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 2, 0, '__ALL_ELEMENTS__'], + 'metadata': { + 'structural_type': 'numpy.float64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 2, 1, '__ALL_ELEMENTS__'], + 'metadata': { + 'structural_type': 'float', + } + }]) + + dataset_metadata = dataset.metadata.generate(dataset, compact=False) + + self.assertEqual(utils.to_json_structure(dataset_metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + 'dimension': { + 'length': 1, + 'name': 'resources', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/DatasetResource'], + }, + 'schema': 
'https://metadata.datadrivendiscovery.org/schemas/v0/container.json', + 'structural_type': 'd3m.container.dataset.Dataset', + }, + }, { + 'selector': ['0'], + 'metadata': { + 'dimension': { + 'length': 3, + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + }, + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'structural_type': 'd3m.container.pandas.DataFrame', + }, + }, + { + 'selector': ['0', '__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'length': 2, + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + }, + }, + }, + { + 'selector': ['0', '__ALL_ELEMENTS__', 0], + 'metadata': { + 'name': 'A', + }, + }, + { + 'selector': ['0', '__ALL_ELEMENTS__', 1], + 'metadata': { + 'name': 'B', + }, + }, + { + 'selector': ['0', 0, 0], + 'metadata': { + 'dimension': { + 'length': 3, + }, + 'structural_type': 'd3m.container.numpy.ndarray', + }, + }, + { + 'selector': ['0', 0, 0, '__ALL_ELEMENTS__'], + 'metadata': { + 'structural_type': 'numpy.str_' + }, + }, + { + 'selector': ['0', 0, 1], + 'metadata': { + 'dimension': { + 'length': 3, + }, + 'structural_type': 'd3m.container.list.List', + }, + }, { + 'selector': ['0', 0, 1, '__ALL_ELEMENTS__'], + 'metadata': { + 'structural_type': 'str', + }, + }, { + 'selector': ['0', 1, 0], + 'metadata': { + 'dimension': { + 'length': 3, + }, + 'structural_type': 'd3m.container.numpy.ndarray', + }, + }, { + 'selector': ['0', 1, 0, '__ALL_ELEMENTS__'], + 'metadata': { + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['0', 1, 1], + 'metadata': { + 'dimension': { + 'length': 3, + }, + 'structural_type': 'd3m.container.list.List', + }, + }, { + 'selector': ['0', 1, 1, '__ALL_ELEMENTS__'], + 'metadata': { + 'structural_type': 'int', + }, + }, { + 'selector': ['0', 2, 0], + 'metadata': { + 'dimension': { + 'length': 3, + }, + 'structural_type': 'd3m.container.numpy.ndarray', + }, + }, { + 'selector': ['0', 2, 0, '__ALL_ELEMENTS__'], + 'metadata': { + 'structural_type': 'numpy.float64', + }, + }, + { + 'selector': ['0', 2, 1], + 'metadata': { + 'dimension': { + 'length': 3, + }, + 'structural_type': 'd3m.container.list.List', + }, + }, { + 'selector': ['0', 2, 1, '__ALL_ELEMENTS__'], + 'metadata': { + 'structural_type': 'float', + }, + }]) + + def test_dataframe_with_objects(self): + df = pandas.DataFrame({str(i): [str(j) for j in range(10)] for i in range(5)}, columns=[str(i) for i in range(5)]) + + df = container.DataFrame(df, generate_metadata=False) + + compact_metadata = df.metadata.generate(df, compact=True) + noncompact_metadata = df.metadata.generate(df, compact=False) + + basic_metadata = [ + { + 'selector': [], + 'metadata': { + 'schema': base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 10, + }, + }, + }, + { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 5, + }, + }, + }, + ] + + column_names = [{'selector': ['__ALL_ELEMENTS__', i], 'metadata': {'name': str(i)}} for i in range(5)] + + self.assertEqual(utils.to_json_structure(compact_metadata.to_internal_simple_structure()), basic_metadata + [ + { + 'selector': 
['__ALL_ELEMENTS__', '__ALL_ELEMENTS__'], + 'metadata': { + 'structural_type': 'str', + }, + } + ] + column_names) + + column_names = [{'selector': ['__ALL_ELEMENTS__', i], 'metadata': {'name': str(i), 'structural_type': 'str'}} for i in range(5)] + + self.assertEqual(utils.to_json_structure(noncompact_metadata.to_internal_simple_structure()), basic_metadata + column_names) + + def test_list_with_objects(self): + l = container.List([container.List([str(j) for i in range(5)]) for j in range(10)], generate_metadata=True) + + self.assertEqual(utils.to_json_structure(l.metadata.to_internal_simple_structure()), [ + { + 'selector': [], + 'metadata': { + 'schema': base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.list.List', + 'dimension': { + 'length': 10, + }, + }, + }, + { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'structural_type': 'd3m.container.list.List', + 'dimension': { + 'length': 5, + }, + } + }, + { + 'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__'], + 'metadata': { + 'structural_type': 'str', + }, + }, + ]) + + def test_ndarray_with_objects(self): + array = numpy.array([[[str(k) for k in range(5)] for i in range(10)] for j in range(10)], dtype=object) + + array = container.ndarray(array, generate_metadata=True) + + self.assertEqual(utils.to_json_structure(array.metadata.to_internal_simple_structure()), [ + { + 'selector': [], + 'metadata': { + 'schema': base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.numpy.ndarray', + 'dimension': { + 'length': 10, + }, + }, + }, + { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'length': 10, + }, + }, + }, + { + 'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'length': 5, + }, + }, + }, + { + 'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__', '__ALL_ELEMENTS__'], + 'metadata': { + 'structural_type': 'str', + }, + }, + ]) + + def test_dict_with_objects(self): + l = container.List([{str(i): {str(j): j for j in range(10)} for i in range(5)}], generate_metadata=True) + + self.assertEqual(utils.to_json_structure(l.metadata.to_internal_simple_structure()), [ + { + 'selector': [], + 'metadata': { + 'schema': base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.list.List', + 'dimension': { + 'length': 1, + }, + }, + }, + { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'structural_type': 'dict', + 'dimension': { + 'length': 5, + }, + } + }, + { + 'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__'], + 'metadata': { + 'structural_type': 'dict', + 'dimension': { + 'length': 10,
+ }, + } + }, + { + 'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__', '__ALL_ELEMENTS__'], + 'metadata': { + 'structural_type': 'int', + }, + }, + ]) + + def test_custom_column_name_with_compacting(self): + dataframe = container.DataFrame({'a': ['1.0', '2.0', '3.0']}, generate_metadata=False) + + dataframe.metadata = dataframe.metadata.generate(dataframe, compact=True) + + dataframe.metadata = dataframe.metadata.update((base.ALL_ELEMENTS, 0), { + 'name': 'test', + 'foo': 'bar', + }) + + self.assertEqual(utils.to_json_structure(dataframe.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + 'schema': base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 1, + } + }, + }, { + 'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__'], + 'metadata': { + 'structural_type': 'str', + 'name': 'a', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': { + 'name': 'test', + 'foo': 'bar', + }, + }]) + + dataframe.metadata = dataframe.metadata.generate(dataframe, compact=True) + + self.assertEqual(utils.to_json_structure(dataframe.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + 'schema': base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 1, + } + }, + }, { + 'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__'], + 'metadata': { + 'structural_type': 'str', + 'name': 'a', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': { + 'name': 'test', + 'foo': 'bar', + }, + }]) + + dataframe.metadata = dataframe.metadata.update((base.ALL_ELEMENTS, 0), { + 'name': base.NO_VALUE, + }) + + dataframe.metadata = dataframe.metadata.generate(dataframe, compact=True) + + self.assertEqual(utils.to_json_structure(dataframe.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + 'schema': base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 1, + } + }, + }, { + 'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__'], + 'metadata': { + 'structural_type': 'str', + 'name': 'a', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': { + 'name': '__NO_VALUE__', + 'foo': 'bar', + }, + }]) + + + def test_custom_column_name_without_compacting(self): + dataframe = container.DataFrame({'a': ['1.0', 
'2.0', '3.0']}, generate_metadata=False) + + dataframe.metadata = dataframe.metadata.generate(dataframe, compact=False) + + dataframe.metadata = dataframe.metadata.update((base.ALL_ELEMENTS, 0), { + 'name': 'test', + 'foo': 'bar', + }) + + self.assertEqual(utils.to_json_structure(dataframe.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + 'schema': base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 1, + } + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': { + 'name': 'test', + 'foo': 'bar', + 'structural_type': 'str', + }, + }]) + + dataframe.metadata = dataframe.metadata.generate(dataframe, compact=False) + + self.assertEqual(utils.to_json_structure(dataframe.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + 'schema': base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 1, + } + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': { + 'name': 'test', + 'foo': 'bar', + 'structural_type': 'str', + }, + }]) + + dataframe.metadata = dataframe.metadata.update((base.ALL_ELEMENTS, 0), { + 'name': base.NO_VALUE, + }) + + dataframe.metadata = dataframe.metadata.generate(dataframe, compact=False) + + self.assertEqual(utils.to_json_structure(dataframe.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + 'schema': base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 1, + } + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': { + 'name': '__NO_VALUE__', + 'foo': 'bar', + 'structural_type': 'str', + }, + }]) + + def test_unset_structural_type(self): + dataframe = container.DataFrame({'a': ['a', 'b', 'c'], 'b': ['a', 'b', 'c']}, generate_metadata=False) + + compact_metadata = dataframe.metadata.generate(dataframe, compact=True) + + all_elements_metadata = compact_metadata.query((base.ALL_ELEMENTS, base.ALL_ELEMENTS)) + compact_metadata = compact_metadata.remove((base.ALL_ELEMENTS, base.ALL_ELEMENTS), strict_all_elements=True) + compact_metadata = compact_metadata.update((base.ALL_ELEMENTS, 0), all_elements_metadata) + compact_metadata = compact_metadata.update((base.ALL_ELEMENTS, 1), all_elements_metadata) + + compact_metadata = 
compact_metadata.generate(dataframe, compact=True) + + self.assertEqual(utils.to_json_structure(compact_metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + 'schema': base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 2, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__'], + 'metadata': { + 'structural_type': 'str', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': { + 'name': 'a', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': { + 'name': 'b', + }, + }]) + + compact_metadata = dataframe.metadata.generate(dataframe, compact=False) + + all_elements_metadata = compact_metadata.query((base.ALL_ELEMENTS, base.ALL_ELEMENTS)) + compact_metadata = compact_metadata.remove((base.ALL_ELEMENTS, base.ALL_ELEMENTS), strict_all_elements=True) + compact_metadata = compact_metadata.update((base.ALL_ELEMENTS, 0), all_elements_metadata) + compact_metadata = compact_metadata.update((base.ALL_ELEMENTS, 1), all_elements_metadata) + + compact_metadata = compact_metadata.generate(dataframe, compact=False) + + self.assertEqual(utils.to_json_structure(compact_metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + 'schema': base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 2, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': { + 'name': 'a', + 'structural_type': 'str', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': { + 'name': 'b', + 'structural_type': 'str', + }, + }]) + + +if __name__ == '__main__': + unittest.main() diff --git a/d3m/tests/test_containers.py b/d3m/tests/test_containers.py new file mode 100644 index 0000000..57bc551 --- /dev/null +++ b/d3m/tests/test_containers.py @@ -0,0 +1,2608 @@ +import copy +import os.path +import pickle +import tempfile +import unittest +import warnings + +import numpy +import pandas +import pandas.core.common + +from d3m import container, utils +from d3m.container import utils as container_utils +from d3m.metadata import base as metadata_base + +copy_functions = { + 'obj.copy()': lambda obj: obj.copy(), + 'obj[:]': lambda obj: obj[:], + 'copy.copy()': lambda obj: copy.copy(obj), + 'copy.deepcopy()': lambda obj: copy.deepcopy(obj), + 'pickle.loads(pickle.dumps())': lambda obj: pickle.loads(pickle.dumps(obj)), +} + + +class TestContainers(unittest.TestCase): + def test_list(self): + l = container.List() + + self.assertTrue(hasattr(l, 'metadata')) + + l = container.List([1, 2, 3], generate_metadata=True) + + l.metadata = l.metadata.update((), { + 'test': 'foobar', + }) + + self.assertSequenceEqual(l, [1, 2, 3]) + self.assertIsInstance(l, 
container.List) + self.assertTrue(hasattr(l, 'metadata')) + self.assertEqual(l.metadata.query(()).get('test'), 'foobar') + + self.assertIsInstance(l, container.List) + self.assertIsInstance(l, list) + + self.assertNotIsInstance([], container.List) + + for name, copy_function in copy_functions.items(): + l_copy = copy_function(l) + + self.assertIsInstance(l_copy, container.List, name) + self.assertTrue(hasattr(l_copy, 'metadata'), name) + + self.assertSequenceEqual(l, l_copy, name) + self.assertEqual(l.metadata.to_internal_json_structure(), l_copy.metadata.to_internal_json_structure(), name) + self.assertEqual(l_copy.metadata.query(()).get('test'), 'foobar', name) + + l_copy = container.List(l, { + 'test2': 'barfoo', + }, generate_metadata=True) + + self.assertIsInstance(l_copy, container.List) + self.assertTrue(hasattr(l_copy, 'metadata')) + + self.assertSequenceEqual(l, l_copy) + self.assertEqual(l_copy.metadata.query(()), { + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': container.List, + 'dimension': { + 'length': 3, + }, + 'test': 'foobar', + 'test2': 'barfoo', + }) + + self.assertEqual(l[1], 2) + + with self.assertRaisesRegex(TypeError, 'list indices must be integers or slices, not tuple'): + l[1, 2] + + l_slice = l[1:3] + + self.assertSequenceEqual(l, [1, 2, 3]) + self.assertSequenceEqual(l_slice, [2, 3]) + self.assertIsInstance(l_slice, container.List) + self.assertTrue(hasattr(l_slice, 'metadata')) + self.assertEqual(l.metadata.to_internal_json_structure(), l_slice.metadata.to_internal_json_structure()) + + l_added = l + [4, 5] + + self.assertSequenceEqual(l, [1, 2, 3]) + self.assertSequenceEqual(l_added, [1, 2, 3, 4, 5]) + self.assertIsInstance(l_added, container.List) + self.assertTrue(hasattr(l_added, 'metadata')) + self.assertEqual(l.metadata.to_internal_json_structure(), l_added.metadata.to_internal_json_structure()) + + l_added += [6, 7] + + self.assertSequenceEqual(l_added, [1, 2, 3, 4, 5, 6, 7]) + self.assertIsInstance(l_added, container.List) + self.assertTrue(hasattr(l_added, 'metadata')) + self.assertEqual(l.metadata.to_internal_json_structure(), l_added.metadata.to_internal_json_structure()) + + l_multiplied = l * 3 + + self.assertSequenceEqual(l, [1, 2, 3]) + self.assertSequenceEqual(l_multiplied, [1, 2, 3, 1, 2, 3, 1, 2, 3]) + self.assertIsInstance(l_multiplied, container.List) + self.assertTrue(hasattr(l_multiplied, 'metadata')) + self.assertEqual(l.metadata.to_internal_json_structure(), l_multiplied.metadata.to_internal_json_structure()) + + l_multiplied = 3 * l + + self.assertSequenceEqual(l, [1, 2, 3]) + self.assertSequenceEqual(l_multiplied, [1, 2, 3, 1, 2, 3, 1, 2, 3]) + self.assertIsInstance(l_multiplied, container.List) + self.assertTrue(hasattr(l_multiplied, 'metadata')) + self.assertEqual(l.metadata.to_internal_json_structure(), l_multiplied.metadata.to_internal_json_structure()) + + l_multiplied *= 2 + + self.assertSequenceEqual(l_multiplied, [1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3]) + self.assertIsInstance(l_multiplied, container.List) + self.assertTrue(hasattr(l_multiplied, 'metadata')) + self.assertEqual(l.metadata.to_internal_json_structure(), l_multiplied.metadata.to_internal_json_structure()) + + def test_ndarray(self): + array = container.ndarray(numpy.array([1, 2, 3]), generate_metadata=True) + self.assertTrue(numpy.array_equal(array, [1, 2, 3])) + self.assertIsInstance(array, container.ndarray) + self.assertTrue(hasattr(array, 'metadata')) + + self.assertIsInstance(array, numpy.ndarray) + + 
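+ # Note: a plain numpy array is not a container type on its own; it only gains
+ # the "metadata" attribute once wrapped, e.g. container.ndarray(numpy.array([1, 2, 3]),
+ # generate_metadata=True), as done above.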
self.assertNotIsInstance(numpy.array([]), container.ndarray) + + array.metadata = array.metadata.update((), { + 'test': 'foobar', + }) + + self.assertEqual(array.metadata.query(()).get('test'), 'foobar') + + for name, copy_function in copy_functions.items(): + array_copy = copy_function(array) + + self.assertIsInstance(array_copy, container.ndarray, name) + self.assertTrue(hasattr(array_copy, 'metadata'), name) + + self.assertTrue(numpy.array_equal(array, array_copy), name) + self.assertEqual(array.metadata.to_internal_json_structure(), array_copy.metadata.to_internal_json_structure(), name) + self.assertEqual(array_copy.metadata.query(()).get('test'), 'foobar', name) + + + array_copy = container.ndarray(array, { + 'test2': 'barfoo', + }, generate_metadata=True) + + self.assertIsInstance(array_copy, container.ndarray) + self.assertTrue(hasattr(array_copy, 'metadata')) + + self.assertTrue(numpy.array_equal(array, array_copy)) + self.assertEqual(array_copy.metadata.query(()), { + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': container.ndarray, + 'dimension': { + 'length': 3, + }, + 'test': 'foobar', + 'test2': 'barfoo', + }) + + array_from_list = container.ndarray([1, 2, 3], generate_metadata=True) + self.assertTrue(numpy.array_equal(array_from_list, [1, 2, 3])) + self.assertIsInstance(array_from_list, container.ndarray) + self.assertTrue(hasattr(array_from_list, 'metadata')) + + def test_dataframe_to_csv(self): + df = container.DataFrame(pandas.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}), generate_metadata=True) + df.metadata = df.metadata.update((metadata_base.ALL_ELEMENTS, 0), {'name': 'E'}) + df.metadata = df.metadata.update((metadata_base.ALL_ELEMENTS, 1), {'name': 'F'}) + + self.assertEqual(df.columns.tolist(), ['A', 'B']) + self.assertEqual(df.to_csv(), 'E,F\n1,4\n2,5\n3,6\n') + + def test_dataframe(self): + df = container.DataFrame(pandas.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]}), generate_metadata=True) + self.assertTrue(df._data.equals(pandas.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]})._data)) + self.assertIsInstance(df, container.DataFrame) + self.assertTrue(hasattr(df, 'metadata')) + + self.assertIsInstance(df, pandas.DataFrame) + + self.assertNotIsInstance(pandas.DataFrame({'A': [1, 2, 3]}), container.DataFrame) + + df.metadata = df.metadata.update((), { + 'test': 'foobar', + }) + + self.assertEqual(df.metadata.query(()).get('test'), 'foobar') + + for name, copy_function in copy_functions.items(): + df_copy = copy_function(df) + + self.assertIsInstance(df_copy, container.DataFrame, name) + self.assertTrue(hasattr(df_copy, 'metadata'), name) + + self.assertTrue(df.equals(df_copy), name) + self.assertEqual(df.metadata.to_internal_json_structure(), df_copy.metadata.to_internal_json_structure(), name) + self.assertEqual(df_copy.metadata.query(()).get('test'), 'foobar', name) + + df_copy = container.DataFrame(df, { + 'test2': 'barfoo', + }, generate_metadata=True) + + self.assertIsInstance(df_copy, container.DataFrame) + self.assertTrue(hasattr(df_copy, 'metadata')) + + self.assertTrue(numpy.array_equal(df, df_copy)) + self.assertEqual(df_copy.metadata.query(()), { + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': container.DataFrame, + 'semantic_types': ('https://metadata.datadrivendiscovery.org/types/Table',), + 'dimension': { + 'name': 'rows', + 'semantic_types': ('https://metadata.datadrivendiscovery.org/types/TabularRow',), + 'length': 3 + }, + 'test': 'foobar', + 'test2': 'barfoo', + }) + + 
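+ # The container.DataFrame constructor also accepts a plain dict of columns,
+ # mirroring pandas.DataFrame, while still attaching D3M metadata.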
df_from_dict = container.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]}, generate_metadata=True) + self.assertTrue(df_from_dict._data.equals(pandas.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]})._data)) + self.assertIsInstance(df_from_dict, container.DataFrame) + self.assertTrue(hasattr(df_from_dict, 'metadata')) + + # Regression tests to make sure column name cannot overwrite DataFrame + # attributes we use (like metadata and custom methods). + dataframe = container.DataFrame({'metadata': [0], 'select_columns': [1]}) + self.assertIsInstance(dataframe.metadata, metadata_base.DataMetadata) + self.assertIsInstance(dataframe.select_columns([0]), container.DataFrame) + self.assertEqual(dataframe.loc[0, 'metadata'], 0) + self.assertEqual(dataframe.loc[0, 'select_columns'], 1) + + def test_dataset(self): + dataset = container.Dataset.load('sklearn://boston') + + self.assertIsInstance(dataset, container.Dataset) + self.assertTrue(hasattr(dataset, 'metadata')) + + dataset.metadata = dataset.metadata.update((), { + 'test': 'foobar', + }) + + self.assertEqual(dataset.metadata.query(()).get('test'), 'foobar') + + for name, copy_function in copy_functions.items(): + # Not supported on dicts. + if name == 'obj[:]': + continue + + dataset_copy = copy_function(dataset) + + self.assertIsInstance(dataset_copy, container.Dataset, name) + self.assertTrue(hasattr(dataset_copy, 'metadata'), name) + + self.assertEqual(len(dataset), len(dataset_copy), name) + self.assertEqual(dataset.keys(), dataset_copy.keys(), name) + for resource_name in dataset.keys(): + self.assertTrue(numpy.array_equal(dataset[resource_name], dataset_copy[resource_name]), name) + self.assertEqual(dataset.metadata.to_internal_json_structure(), dataset_copy.metadata.to_internal_json_structure(), name) + self.assertEqual(dataset_copy.metadata.query(()).get('test'), 'foobar', name) + + def test_list_ndarray_int(self): + # With custom metadata which should be preserved. + l = container.List([1, 2, 3], { + 'foo': 'bar', + }, generate_metadata=True) + + self.assertEqual(utils.to_json_structure(l.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + 'foo': 'bar', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.list.List', + 'dimension': { + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'structural_type': 'int', + }, + }]) + + array = container.ndarray(l, generate_metadata=True) + + self.assertEqual(utils.to_json_structure(array.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + 'foo': 'bar', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.numpy.ndarray', + 'dimension': { + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'structural_type': 'numpy.int64', + }, + }]) + + l2 = container.List(array, generate_metadata=True) + + self.assertEqual(utils.to_json_structure(l2.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + 'foo': 'bar', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.list.List', + 'dimension': { + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'structural_type': 'numpy.int64', + }, + }]) + + def test_dataframe_ndarray_int_noncompact_metadata(self): + # With custom metadata which should be preserved. 
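+ # This test round-trips DataFrame -> ndarray -> DataFrame, regenerating
+ # non-compacted metadata after each conversion; the custom top-level entry
+ # ('foo': 'bar') is expected to survive every step.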
+ df = container.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]}, { + 'foo': 'bar', + }, generate_metadata=False) + + df.metadata = df.metadata.generate(df, compact=False) + + self.assertEqual(utils.to_json_structure(df.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + 'foo': 'bar', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': { + 'name': 'A', + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': { + 'name': 'B', + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': { + 'name': 'C', + 'structural_type': 'numpy.int64', + }, + }]) + + array = container.ndarray(df, generate_metadata=False) + + array.metadata = array.metadata.generate(array, compact=False) + + self.assertEqual(utils.to_json_structure(array.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + 'foo': 'bar', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.numpy.ndarray', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__'], + 'metadata': { + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': { + 'name': 'A', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': { + 'name': 'B', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': { + 'name': 'C', + }, + }]) + + df2 = container.DataFrame(array, generate_metadata=False) + + df2.metadata = df2.metadata.generate(df2, compact=False) + + self.assertEqual(utils.to_json_structure(df2.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + 'foo': 'bar', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__'], + 'metadata': { + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': { + 'name': 'A', + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': { + 'name': 'B', + 'structural_type': 'numpy.int64', + 
}, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': { + 'name': 'C', + 'structural_type': 'numpy.int64', + }, + }]) + + def test_dataframe_ndarray_int_compact_metadata(self): + # With custom metadata which should be preserved. + df = container.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]}, { + 'foo': 'bar', + }, generate_metadata=False) + + df.metadata = df.metadata.generate(df, compact=True) + + self.assertEqual(utils.to_json_structure(df.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + 'foo': 'bar', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__'], + 'metadata': { + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': { + 'name': 'A', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': { + 'name': 'B', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': { + 'name': 'C', + }, + }]) + + array = container.ndarray(df, generate_metadata=False) + + array.metadata = array.metadata.generate(array, compact=True) + + self.assertEqual(utils.to_json_structure(array.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + 'foo': 'bar', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.numpy.ndarray', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__'], + 'metadata': { + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': { + 'name': 'A', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': { + 'name': 'B', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': { + 'name': 'C', + }, + }]) + + df2 = container.DataFrame(array, generate_metadata=False) + + df2.metadata = df2.metadata.generate(df2, compact=True) + + self.assertEqual(utils.to_json_structure(df2.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + 'foo': 'bar', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__'], + 'metadata': { + 'structural_type': 
'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': { + 'name': 'A', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': { + 'name': 'B', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': { + 'name': 'C', + }, + }]) + + def test_dataframe_list_int_compact_metadata(self): + # With custom metadata which should be preserved. + df = container.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]}, { + 'foo': 'bar', + }, generate_metadata=False) + + df.metadata = df.metadata.generate(df, compact=True) + + self.assertEqual(utils.to_json_structure(df.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + 'foo': 'bar', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__'], + 'metadata': { + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': { + 'name': 'A', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': { + 'name': 'B', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': { + 'name': 'C', + }, + }]) + + l = container.List(df, generate_metadata=False) + + l.metadata = l.metadata.generate(l, compact=True) + + self.assertEqual(utils.to_json_structure(l.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + 'foo': 'bar', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.list.List', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'structural_type': 'd3m.container.list.List', + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__'], + 'metadata': { + 'structural_type': 'int', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': { + 'name': 'A', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': { + 'name': 'B', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': { + 'name': 'C', + }, + }]) + + df2 = container.DataFrame(l, generate_metadata=False) + + df2.metadata = df2.metadata.generate(df2, compact=True) + + self.assertEqual(utils.to_json_structure(df2.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + 'foo': 'bar', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': 
['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 3, + }, + # This is not really required, but current implementation adds it. + # It is OK if in the future this gets removed. + 'structural_type': '__NO_VALUE__', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__'], + 'metadata': { + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': { + 'name': 'A', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': { + 'name': 'B', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': { + 'name': 'C', + }, + }]) + + def test_dataframe_list_int_noncompact_metadata(self): + # With custom metadata which should be preserved. + df = container.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]}, { + 'foo': 'bar', + }, generate_metadata=False) + + df.metadata = df.metadata.generate(df, compact=False) + + self.assertEqual(utils.to_json_structure(df.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + 'foo': 'bar', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': { + 'name': 'A', + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': { + 'name': 'B', + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': { + 'name': 'C', + 'structural_type': 'numpy.int64', + }, + }]) + + l = container.List(df, generate_metadata=False) + + l.metadata = l.metadata.generate(l, compact=False) + + self.assertEqual(utils.to_json_structure(l.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + 'foo': 'bar', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.list.List', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'structural_type': 'd3m.container.list.List', + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__'], + 'metadata': { + 'structural_type': 'int', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': { + 'name': 'A', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': { + 'name': 'B', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': { + 'name': 'C', + }, + }]) + + df2 = container.DataFrame(l, generate_metadata=False) + + df2.metadata = df2.metadata.generate(df2, compact=False) + + self.assertEqual(utils.to_json_structure(df2.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + 'foo': 'bar', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': 
['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 3, + }, + # This is not really required, but current implementation adds it. + # It is OK if in the future this gets removed. + 'structural_type': '__NO_VALUE__', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__'], + 'metadata': { + 'structural_type': 'int', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': { + 'name': 'A', + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': { + 'name': 'B', + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': { + 'name': 'C', + 'structural_type': 'numpy.int64', + }, + }]) + + def test_deep_ndarray_compact_metadata(self): + # With custom metadata which should be preserved. + array = container.ndarray(numpy.arange(3 * 4 * 5 * 5 * 5).reshape((3, 4, 5, 5, 5)), { + 'foo': 'bar', + }, generate_metadata=False) + array.metadata = array.metadata.generate(array, compact=True) + + self.assertEqual(utils.to_json_structure(array.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + 'foo': 'bar', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.numpy.ndarray', + 'dimension': { + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'length': 4, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'length': 5, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__', '__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'length': 5, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__', '__ALL_ELEMENTS__', '__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'length': 5, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__', '__ALL_ELEMENTS__', '__ALL_ELEMENTS__', '__ALL_ELEMENTS__'], + 'metadata': { + 'structural_type': 'numpy.int64', + }, + }]) + + df = container.DataFrame(array, generate_metadata=False) + df.metadata = df.metadata.generate(df, compact=True) + + self.assertEqual(utils.to_json_structure(df.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + 'foo': 'bar', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'dimension': { + 'length': 3, + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + }, + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'length': 4, + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'length': 5, + }, + 'structural_type': 'd3m.container.numpy.ndarray', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__', '__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'length': 5, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__', '__ALL_ELEMENTS__', '__ALL_ELEMENTS__'], + 
'metadata': { + 'dimension': { + 'length': 5, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__', '__ALL_ELEMENTS__', '__ALL_ELEMENTS__', '__ALL_ELEMENTS__'], + 'metadata': { + 'structural_type': 'numpy.int64', + }, + }]) + + array2 = container.ndarray(df, generate_metadata=False) + array2.metadata = array2.metadata.generate(array2, compact=True) + + # We do not automatically compact numpy with nested numpy arrays into one array + # (there might be an exception if array is jagged). + self.assertEqual(utils.to_json_structure(array2.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.numpy.ndarray', + 'dimension': { + 'length': 3, + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + }, + 'foo': 'bar', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'length': 4, + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'length': 5, + }, + 'structural_type': 'd3m.container.numpy.ndarray', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__', '__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'length': 5, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__', '__ALL_ELEMENTS__', '__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'length': 5, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__', '__ALL_ELEMENTS__', '__ALL_ELEMENTS__', '__ALL_ELEMENTS__'], + 'metadata': { + 'structural_type': 'numpy.int64', + }, + }]) + + def test_deep_ndarray_noncompact_metadata(self): + # With custom metadata which should be preserved. 
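+ # The 5-dimensional array (shape 3 x 4 x 5 x 5 x 5) is described one dimension per
+ # nested "__ALL_ELEMENTS__" selector; when converted to a DataFrame, the first
+ # dimension becomes rows, the second becomes columns, and the remaining
+ # dimensions stay as nested ndarray cells.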
+ array = container.ndarray(numpy.arange(3 * 4 * 5 * 5 * 5).reshape((3, 4, 5, 5, 5)), { + 'foo': 'bar', + }, generate_metadata=False) + array.metadata = array.metadata.generate(array, compact=False) + + self.assertEqual(utils.to_json_structure(array.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + 'foo': 'bar', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.numpy.ndarray', + 'dimension': { + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'length': 4, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'length': 5, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__', '__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'length': 5, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__', '__ALL_ELEMENTS__', '__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'length': 5, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__', '__ALL_ELEMENTS__', '__ALL_ELEMENTS__', '__ALL_ELEMENTS__'], + 'metadata': { + 'structural_type': 'numpy.int64', + }, + }]) + + df = container.DataFrame(array, generate_metadata=False) + df.metadata = df.metadata.generate(df, compact=False) + + self.assertEqual(utils.to_json_structure(df.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + 'foo': 'bar', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'dimension': { + 'length': 3, + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + }, + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'length': 4, + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'length': 5, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__', '__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'length': 5, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__', '__ALL_ELEMENTS__', '__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'length': 5, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__', '__ALL_ELEMENTS__', '__ALL_ELEMENTS__', '__ALL_ELEMENTS__'], + 'metadata': { + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': { + 'dimension': { + 'length': 5, + }, + 'structural_type': 'd3m.container.numpy.ndarray', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0, '__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'length': 5, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0, '__ALL_ELEMENTS__', '__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'length': 5, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0, '__ALL_ELEMENTS__', '__ALL_ELEMENTS__', '__ALL_ELEMENTS__'], + 'metadata': { + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': { + 'dimension': { + 'length': 5, + }, + 'structural_type': 'd3m.container.numpy.ndarray', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 1, '__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'length': 5, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 1, '__ALL_ELEMENTS__', '__ALL_ELEMENTS__'], + 
'metadata': { + 'dimension': { + 'length': 5, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 1, '__ALL_ELEMENTS__', '__ALL_ELEMENTS__', '__ALL_ELEMENTS__'], + 'metadata': { + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': { + 'dimension': { + 'length': 5, + }, + 'structural_type': 'd3m.container.numpy.ndarray', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 2, '__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'length': 5, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 2, '__ALL_ELEMENTS__', '__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'length': 5, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 2, '__ALL_ELEMENTS__', '__ALL_ELEMENTS__', '__ALL_ELEMENTS__'], + 'metadata': { + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 3], + 'metadata': { + 'dimension': { + 'length': 5, + }, + 'structural_type': 'd3m.container.numpy.ndarray', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 3, '__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'length': 5, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 3, '__ALL_ELEMENTS__', '__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'length': 5, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 3, '__ALL_ELEMENTS__', '__ALL_ELEMENTS__', '__ALL_ELEMENTS__'], + 'metadata': { + 'structural_type': 'numpy.int64', + }, + }]) + + array2 = container.ndarray(df, generate_metadata=False) + array2.metadata = array2.metadata.generate(array2, compact=False) + + # We do not automatically compact numpy with nested numpy arrays into one array + # (there might be an exception if array is jagged). + self.assertEqual(utils.to_json_structure(array2.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.numpy.ndarray', + 'dimension': { + 'length': 3, + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + }, + 'foo': 'bar', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'length': 4, + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'length': 5, + }, + 'structural_type': 'd3m.container.numpy.ndarray', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__', '__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'length': 5, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__', '__ALL_ELEMENTS__', '__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'length': 5, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__', '__ALL_ELEMENTS__', '__ALL_ELEMENTS__', '__ALL_ELEMENTS__'], + 'metadata': { + 'structural_type': 'numpy.int64', + }, + }]) + + def test_simple_list_to_dataframe(self): + data = container.List([1, 2, 3], generate_metadata=True) + + dataframe = container.DataFrame(data, generate_metadata=False) + + compact_metadata = dataframe.metadata.generate(dataframe, compact=True) + noncompact_metadata = dataframe.metadata.generate(dataframe, compact=False) + + expected_metadata = [{ + 'selector': [], + 'metadata': { + 'schema': 'https://metadata.datadrivendiscovery.org/schemas/v0/container.json', + 'structural_type': 'd3m.container.pandas.DataFrame', + 'dimension': { + 
'length': 3, + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + }, + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'structural_type': '__NO_VALUE__', + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 1, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__'], + 'metadata': { + 'structural_type': 'numpy.int64', + }, + }] + + self.assertEqual(utils.to_json_structure(compact_metadata.to_internal_simple_structure()), expected_metadata) + + expected_metadata[2]['selector'] = ['__ALL_ELEMENTS__', 0] + + self.assertEqual(utils.to_json_structure(noncompact_metadata.to_internal_simple_structure()), expected_metadata) + + def test_select_columns_compact_metadata(self): + data = container.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]}, generate_metadata=False) + + data.metadata = data.metadata.generate(data, compact=True) + + data.metadata = data.metadata.update_column(0, {'name': 'aaa'}) + data.metadata = data.metadata.update_column(1, {'name': 'bbb'}) + data.metadata = data.metadata.update_column(2, {'name': 'ccc'}) + data.metadata = data.metadata.update((0, 0), {'row': '1'}) + data.metadata = data.metadata.update((1, 0), {'row': '2'}) + data.metadata = data.metadata.update((2, 0), {'row': '3'}) + data.metadata = data.metadata.update((0, metadata_base.ALL_ELEMENTS), {'all_elements_on_row': 'rowA'}) + + data_metadata_before = data.metadata.to_internal_json_structure() + + # Test "select_columns" working with a tuple. Specifically, iloc[:, tuple(1)] does not work + # (i.e. throws "{IndexingError}Too many indexers"), but iloc[:, 1] and iloc[:, [1]] work. 
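+        # For illustration (per the note above; "pdf" is a hypothetical plain pandas DataFrame):
+        #   pdf.iloc[:, [1, 0, 2, 1]]    # list indexer: selects (and repeats) columns
+        #   pdf.iloc[:, (1, 0, 2, 1)]    # tuple indexer: raises IndexingError("Too many indexers")
+        # Hence a tuple is passed to "select_columns" below to check it is handled correctly.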
+ selected = data.select_columns(tuple([1, 0, 2, 1])) + + self.assertEqual(selected.values.tolist(), [[4, 1, 7, 4], [5, 2, 8, 5], [6, 3, 9, 6]]) + + self.assertEqual(utils.to_json_structure(selected.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 4, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__'], + 'metadata': {'structural_type': 'numpy.int64'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': {'name': 'bbb'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': {'name': 'aaa'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': {'name': 'ccc'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 3], + 'metadata': {'name': 'bbb'}, + }, { + 'selector': [0, '__ALL_ELEMENTS__'], + 'metadata': {'all_elements_on_row': 'rowA'}, + }, { + 'selector': [0, 1], + 'metadata': {'row': '1'}, + }, { + 'selector': [1, 1], + 'metadata': {'row': '2'}, + }, { + 'selector': [2, 1], + 'metadata': {'row': '3'}, + }]) + + self.assertEqual(data.metadata.to_internal_json_structure(), data_metadata_before) + + selected = data.select_columns([1]) + + self.assertEqual(selected.values.tolist(), [[4], [5], [6]]) + + self.assertEqual(utils.to_json_structure(selected.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 1, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__'], + 'metadata': {'structural_type': 'numpy.int64'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': {'name': 'bbb'}, + }, { + 'selector': [0, '__ALL_ELEMENTS__'], + 'metadata': {'all_elements_on_row': 'rowA'}, + }]) + + self.assertEqual(data.metadata.to_internal_json_structure(), data_metadata_before) + + def test_select_columns_noncompact_metadata(self): + data = container.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]}, generate_metadata=False) + + data.metadata = data.metadata.generate(data, compact=False) + + data.metadata = data.metadata.update_column(0, {'name': 'aaa'}) + data.metadata = data.metadata.update_column(1, {'name': 'bbb'}) + data.metadata = data.metadata.update_column(2, {'name': 'ccc'}) + data.metadata = data.metadata.update((0, 0), {'row': '1'}) + data.metadata = data.metadata.update((1, 0), {'row': '2'}) + data.metadata = data.metadata.update((2, 0), {'row': '3'}) + data.metadata = data.metadata.update((0, metadata_base.ALL_ELEMENTS), {'all_elements_on_row': 'rowA'}) + + data_metadata_before = data.metadata.to_internal_json_structure() + + # Test "select_columns" 
working with a tuple. Specifically, iloc[:, tuple(1)] does not work + # (i.e. throws "{IndexingError}Too many indexers"), but iloc[:, 1] and iloc[:, [1]] work. + selected = data.select_columns(tuple([1, 0, 2, 1])) + + self.assertEqual(selected.values.tolist(), [[4, 1, 7, 4], [5, 2, 8, 5], [6, 3, 9, 6]]) + + self.assertEqual(utils.to_json_structure(selected.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 4, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': {'name': 'bbb', 'structural_type': 'numpy.int64'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': {'name': 'aaa', 'structural_type': 'numpy.int64'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': {'name': 'ccc', 'structural_type': 'numpy.int64'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 3], + 'metadata': {'name': 'bbb', 'structural_type': 'numpy.int64'}, + }, { + 'selector': [0, '__ALL_ELEMENTS__'], + 'metadata': {'all_elements_on_row': 'rowA'}, + }, { + 'selector': [0, 1], + 'metadata': {'row': '1'}, + }, { + 'selector': [1, 1], + 'metadata': {'row': '2'}, + }, { + 'selector': [2, 1], + 'metadata': {'row': '3'}, + }]) + + self.assertEqual(data.metadata.to_internal_json_structure(), data_metadata_before) + + selected = data.select_columns([1]) + + self.assertEqual(selected.values.tolist(), [[4], [5], [6]]) + + self.assertEqual(utils.to_json_structure(selected.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 1, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': {'name': 'bbb', 'structural_type': 'numpy.int64'}, + }, { + 'selector': [0, '__ALL_ELEMENTS__'], + 'metadata': {'all_elements_on_row': 'rowA'}, + }]) + + self.assertEqual(data.metadata.to_internal_json_structure(), data_metadata_before) + + def test_append_columns_compact_metadata(self): + left = container.DataFrame({'a1': [1, 2, 3], 'b1': [4, 5, 6], 'c1': [7, 8, 9]}, { + 'top_level': 'left', + }, generate_metadata=False) + left.metadata = left.metadata.generate(left, compact=True) + + left.metadata = left.metadata.update_column(0, {'name': 'aaa111'}) + left.metadata = left.metadata.update_column(1, {'name': 'bbb111'}) + left.metadata = left.metadata.update_column(2, {'name': 'ccc111'}) + left.metadata = left.metadata.update((0, 0), {'row': '1a'}) + left.metadata = left.metadata.update((1, 0), {'row': '2a'}) + left.metadata = left.metadata.update((2, 0), {'row': '3a'}) + left.metadata = left.metadata.update((0, 
metadata_base.ALL_ELEMENTS), {'all_elements_on_row': 'rowA'}) + + right = container.DataFrame({'a2': [11, 12, 13], 'b2': [14, 15, 16], 'c2': [17, 18, 19]}, { + 'top_level': 'right', + }, generate_metadata=False) + right.metadata = right.metadata.generate(right, compact=True) + + right.metadata = right.metadata.update_column(0, {'name': 'aaa222'}) + right.metadata = right.metadata.update_column(1, {'name': 'bbb222'}) + right.metadata = right.metadata.update_column(2, {'name': 'ccc222'}) + right.metadata = right.metadata.update((0, 1), {'row': '1b'}) + right.metadata = right.metadata.update((1, 1), {'row': '2b'}) + right.metadata = right.metadata.update((2, 1), {'row': '3b'}) + right.metadata = right.metadata.update((0, metadata_base.ALL_ELEMENTS), {'all_elements_on_row': 'rowB'}) + + right_metadata_before = right.metadata.to_internal_json_structure() + + data = left.append_columns(right, use_right_metadata=False) + + self.assertEqual(data.values.tolist(), [[1, 4, 7, 11, 14, 17], [2, 5, 8, 12, 15, 18], [3, 6, 9, 13, 16, 19]]) + + self.assertEqual(utils.to_json_structure(data.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + 'top_level': 'left', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 6, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__'], + 'metadata': { + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': {'name': 'aaa111'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': {'name': 'bbb111'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': {'name': 'ccc111'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 3], + 'metadata': {'name': 'aaa222', 'structural_type': 'numpy.int64'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 4], + 'metadata': {'name': 'bbb222', 'structural_type': 'numpy.int64'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 5], + 'metadata': {'name': 'ccc222', 'structural_type': 'numpy.int64'}, + }, { + 'selector': [0, '__ALL_ELEMENTS__'], + 'metadata': {'all_elements_on_row': 'rowA'}, + }, { + 'selector': [0, 0], + 'metadata': {'row': '1a'}, + }, { + 'selector': [0, 3], + 'metadata': {'all_elements_on_row': 'rowB'}, + }, { + 'selector': [0, 4], + 'metadata': {'row': '1b', 'all_elements_on_row': 'rowB'}, + }, { + 'selector': [0, 5], + 'metadata': {'all_elements_on_row': 'rowB'}, + }, { + 'selector': [1, 0], + 'metadata': {'row': '2a'}, + }, { + 'selector': [1, 4], + 'metadata': {'row': '2b'}, + }, { + 'selector': [2, 0], + 'metadata': {'row': '3a'}, + }, { + 'selector': [2, 4], + 'metadata': {'row': '3b'}, + }]) + + data = left.append_columns(right, use_right_metadata=True) + + self.assertEqual(data.values.tolist(), [[1, 4, 7, 11, 14, 17], [2, 5, 8, 12, 15, 18], [3, 6, 9, 13, 16, 19]]) + + self.assertEqual(utils.to_json_structure(data.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + 'top_level': 'right', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': 
['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 6, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__'], + 'metadata': { + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': {'name': 'aaa111', 'structural_type': 'numpy.int64'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': {'name': 'bbb111', 'structural_type': 'numpy.int64'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': {'name': 'ccc111', 'structural_type': 'numpy.int64'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 3], + 'metadata': {'name': 'aaa222'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 4], + 'metadata': {'name': 'bbb222'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 5], + 'metadata': {'name': 'ccc222'}, + }, { + 'selector': [0, '__ALL_ELEMENTS__'], + 'metadata': {'all_elements_on_row': 'rowB'}, + }, { + 'selector': [0, 0], + 'metadata': {'row': '1a', 'all_elements_on_row': 'rowA'}, + }, { + 'selector': [0, 1], + 'metadata': {'all_elements_on_row': 'rowA'}, + }, { + 'selector': [0, 2], + 'metadata': {'all_elements_on_row': 'rowA'}, + }, { + 'selector': [0, 4], + 'metadata': {'row': '1b'}, + }, { + 'selector': [1, 0], + 'metadata': {'row': '2a'}, + }, { + 'selector': [1, 4], + 'metadata': {'row': '2b'}, + }, { + 'selector': [2, 0], + 'metadata': {'row': '3a'}, + }, { + 'selector': [2, 4], + 'metadata': {'row': '3b'}, + }]) + + self.assertEqual(right.metadata.to_internal_json_structure(), right_metadata_before) + + def test_append_columns_noncompact_metadata(self): + left = container.DataFrame({'a1': [1, 2, 3], 'b1': [4, 5, 6], 'c1': [7, 8, 9]}, { + 'top_level': 'left', + }, generate_metadata=False) + left.metadata = left.metadata.generate(left, compact=False) + + left.metadata = left.metadata.update_column(0, {'name': 'aaa111'}) + left.metadata = left.metadata.update_column(1, {'name': 'bbb111'}) + left.metadata = left.metadata.update_column(2, {'name': 'ccc111'}) + left.metadata = left.metadata.update((0, 0), {'row': '1a'}) + left.metadata = left.metadata.update((1, 0), {'row': '2a'}) + left.metadata = left.metadata.update((2, 0), {'row': '3a'}) + left.metadata = left.metadata.update((0, metadata_base.ALL_ELEMENTS), {'all_elements_on_row': 'rowA'}) + + right = container.DataFrame({'a2': [11, 12, 13], 'b2': [14, 15, 16], 'c2': [17, 18, 19]}, { + 'top_level': 'right', + }, generate_metadata=False) + right.metadata = right.metadata.generate(right, compact=False) + + right.metadata = right.metadata.update_column(0, {'name': 'aaa222'}) + right.metadata = right.metadata.update_column(1, {'name': 'bbb222'}) + right.metadata = right.metadata.update_column(2, {'name': 'ccc222'}) + right.metadata = right.metadata.update((0, 1), {'row': '1b'}) + right.metadata = right.metadata.update((1, 1), {'row': '2b'}) + right.metadata = right.metadata.update((2, 1), {'row': '3b'}) + right.metadata = right.metadata.update((0, metadata_base.ALL_ELEMENTS), {'all_elements_on_row': 'rowB'}) + + right_metadata_before = right.metadata.to_internal_json_structure() + + data = left.append_columns(right, use_right_metadata=False) + + self.assertEqual(data.values.tolist(), [[1, 4, 7, 11, 14, 17], [2, 5, 8, 12, 15, 18], [3, 6, 9, 13, 
16, 19]]) + + self.assertEqual(utils.to_json_structure(data.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + 'top_level': 'left', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 6, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': {'name': 'aaa111', 'structural_type': 'numpy.int64'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': {'name': 'bbb111', 'structural_type': 'numpy.int64'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': {'name': 'ccc111', 'structural_type': 'numpy.int64'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 3], + 'metadata': {'name': 'aaa222', 'structural_type': 'numpy.int64'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 4], + 'metadata': {'name': 'bbb222', 'structural_type': 'numpy.int64'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 5], + 'metadata': {'name': 'ccc222', 'structural_type': 'numpy.int64'}, + }, { + 'selector': [0, '__ALL_ELEMENTS__'], + 'metadata': {'all_elements_on_row': 'rowA'}, + }, { + 'selector': [0, 0], + 'metadata': {'row': '1a'}, + }, { + 'selector': [0, 3], + 'metadata': {'all_elements_on_row': 'rowB'}, + }, { + 'selector': [0, 4], + 'metadata': {'row': '1b', 'all_elements_on_row': 'rowB'}, + }, { + 'selector': [0, 5], + 'metadata': {'all_elements_on_row': 'rowB'}, + }, { + 'selector': [1, 0], + 'metadata': {'row': '2a'}, + }, { + 'selector': [1, 4], + 'metadata': {'row': '2b'}, + }, { + 'selector': [2, 0], + 'metadata': {'row': '3a'}, + }, { + 'selector': [2, 4], + 'metadata': {'row': '3b'}, + }]) + + data = left.append_columns(right, use_right_metadata=True) + + self.assertEqual(data.values.tolist(), [[1, 4, 7, 11, 14, 17], [2, 5, 8, 12, 15, 18], [3, 6, 9, 13, 16, 19]]) + + self.assertEqual(utils.to_json_structure(data.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + 'top_level': 'right', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 6, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': {'name': 'aaa111', 'structural_type': 'numpy.int64'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': {'name': 'bbb111', 'structural_type': 'numpy.int64'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': {'name': 'ccc111', 'structural_type': 'numpy.int64'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 3], + 'metadata': {'name': 'aaa222', 'structural_type': 'numpy.int64'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 4], + 'metadata': {'name': 'bbb222', 'structural_type': 'numpy.int64'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 5], + 'metadata': {'name': 'ccc222', 'structural_type': 
'numpy.int64'}, + }, { + 'selector': [0, '__ALL_ELEMENTS__'], + 'metadata': {'all_elements_on_row': 'rowB'}, + }, { + 'selector': [0, 0], + 'metadata': {'row': '1a', 'all_elements_on_row': 'rowA'}, + }, { + 'selector': [0, 1], + 'metadata': {'all_elements_on_row': 'rowA'}, + }, { + 'selector': [0, 2], + 'metadata': {'all_elements_on_row': 'rowA'}, + }, { + 'selector': [0, 4], + 'metadata': {'row': '1b'}, + }, { + 'selector': [1, 0], + 'metadata': {'row': '2a'}, + }, { + 'selector': [1, 4], + 'metadata': {'row': '2b'}, + }, { + 'selector': [2, 0], + 'metadata': {'row': '3a'}, + }, { + 'selector': [2, 4], + 'metadata': {'row': '3b'}, + }]) + + self.assertEqual(right.metadata.to_internal_json_structure(), right_metadata_before) + + def test_replace_columns_compact_metadata(self): + main = container.DataFrame({'a1': [1, 2, 3], 'b1': [4, 5, 6], 'c1': [7, 8, 9]}, { + 'top_level': 'main', + }, generate_metadata=False) + main.metadata = main.metadata.generate(main, compact=True) + + main.metadata = main.metadata.update_column(0, {'name': 'aaa111'}) + main.metadata = main.metadata.update_column(1, {'name': 'bbb111', 'extra': 'b_column'}) + main.metadata = main.metadata.update_column(2, {'name': 'ccc111'}) + main.metadata = main.metadata.update((0, 0), {'row': '1a'}) + main.metadata = main.metadata.update((1, 0), {'row': '2a'}) + main.metadata = main.metadata.update((2, 0), {'row': '3a'}) + main.metadata = main.metadata.update((0, metadata_base.ALL_ELEMENTS), {'all_elements_on_row': 'rowA'}) + + main_metadata_before = main.metadata.to_internal_json_structure() + + columns = container.DataFrame({'a2': [11, 12, 13], 'b2': [14, 15, 16]}, { + 'top_level': 'columns', + }, generate_metadata=False) + columns.metadata = columns.metadata.generate(columns, compact=True) + + columns.metadata = columns.metadata.update_column(0, {'name': 'aaa222'}) + columns.metadata = columns.metadata.update_column(1, {'name': 'bbb222'}) + columns.metadata = columns.metadata.update((0, 1), {'row': '1b'}) + columns.metadata = columns.metadata.update((1, 1), {'row': '2b'}) + columns.metadata = columns.metadata.update((2, 1), {'row': '3b'}) + columns.metadata = columns.metadata.update((0, metadata_base.ALL_ELEMENTS), {'all_elements_on_row': 'rowB'}) + + columns_metadata_before = columns.metadata.to_internal_json_structure() + + new_main = main.replace_columns(columns, [1, 2]) + + self.assertEqual(new_main.values.tolist(), [[1, 11, 14], [2, 12, 15], [3, 13, 16]]) + + self.assertEqual(utils.to_json_structure(new_main.metadata.to_internal_simple_structure()), [{ + 'selector': [], 'metadata': { + 'top_level': 'main', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__'], + 'metadata': { + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': {'name': 'aaa111'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': {'name': 'aaa222', 'structural_type': 'numpy.int64'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': {'name': 'bbb222', 
'structural_type': 'numpy.int64'}, + }, { + 'selector': [0, '__ALL_ELEMENTS__'], + 'metadata': {'all_elements_on_row': 'rowA'}, + }, { + 'selector': [0, 0], + 'metadata': {'row': '1a'}, + }, { + 'selector': [0, 1], + 'metadata': {'all_elements_on_row': 'rowB'}, + }, { + 'selector': [0, 2], + 'metadata': {'row': '1b', 'all_elements_on_row': 'rowB'}, + }, { + 'selector': [1, 0], + 'metadata': {'row': '2a'}, + }, { + 'selector': [1, 2], + 'metadata': {'row': '2b'}, + }, { + 'selector': [2, 0], + 'metadata': {'row': '3a'}, + }, { + 'selector': [2, 2], + 'metadata': {'row': '3b'}, + }]) + + self.assertEqual(main_metadata_before, main.metadata.to_internal_json_structure()) + self.assertEqual(columns_metadata_before, columns.metadata.to_internal_json_structure()) + + new_main = main.replace_columns(columns, [0, 2]) + + self.assertEqual(new_main.values.tolist(), [[11, 4, 14], [12, 5, 15], [13, 6, 16]]) + + self.assertEqual(utils.to_json_structure(new_main.metadata.to_internal_simple_structure()), [{ + 'selector': [], 'metadata': { + 'top_level': 'main', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__'], + 'metadata': { + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': {'name': 'aaa222', 'structural_type': 'numpy.int64'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': { + 'name': 'bbb111', + 'extra': 'b_column', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': {'name': 'bbb222', 'structural_type': 'numpy.int64'}, + }, { + 'selector': [0, '__ALL_ELEMENTS__'], + 'metadata': {'all_elements_on_row': 'rowA'}, + }, { + 'selector': [0, 0], + 'metadata': {'all_elements_on_row': 'rowB'}, + }, { + 'selector': [0, 2], + 'metadata': {'row': '1b', 'all_elements_on_row': 'rowB'}, + }, { + 'selector': [1, 2], + 'metadata': {'row': '2b'}, + }, { + 'selector': [2, 2], + 'metadata': {'row': '3b'}, + }]) + + self.assertEqual(main_metadata_before, main.metadata.to_internal_json_structure()) + self.assertEqual(columns_metadata_before, columns.metadata.to_internal_json_structure()) + + new_main = main.replace_columns(columns, [1]) + + self.assertEqual(new_main.values.tolist(), [[1, 11, 14, 7], [2, 12, 15, 8], [3, 13, 16, 9]]) + + self.assertEqual(utils.to_json_structure(new_main.metadata.to_internal_simple_structure()), [{ + 'selector': [], 'metadata': { + 'top_level': 'main', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 4, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__'], + 'metadata': { + 'structural_type': 
'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': {'name': 'aaa111'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': {'name': 'aaa222', 'structural_type': 'numpy.int64'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': {'name': 'bbb222', 'structural_type': 'numpy.int64'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 3], + 'metadata': {'name': 'ccc111', 'structural_type': 'numpy.int64'}, + }, { + 'selector': [0, '__ALL_ELEMENTS__'], + 'metadata': {'all_elements_on_row': 'rowA'}, + }, { + 'selector': [0, 0], + 'metadata': {'row': '1a'}, + }, { + 'selector': [0, 1], + 'metadata': {'all_elements_on_row': 'rowB'}, + }, { + 'selector': [0, 2], + 'metadata': {'row': '1b', 'all_elements_on_row': 'rowB'}, + }, { + 'selector': [0, 3], + 'metadata': {'all_elements_on_row': 'rowA'}, + }, { + 'selector': [1, 0], + 'metadata': {'row': '2a'}, + }, { + 'selector': [1, 2], + 'metadata': {'row': '2b'}, + }, { + 'selector': [2, 0], + 'metadata': {'row': '3a'}, + }, { + 'selector': [2, 2], + 'metadata': {'row': '3b'}, + }]) + + self.assertEqual(main_metadata_before, main.metadata.to_internal_json_structure()) + self.assertEqual(columns_metadata_before, columns.metadata.to_internal_json_structure()) + + new_main = main.replace_columns(columns, [0, 1, 2]) + + self.assertEqual(new_main.values.tolist(), [[11, 14], [12, 15], [13, 16]]) + + self.assertEqual(utils.to_json_structure(new_main.metadata.to_internal_simple_structure()), [{ + 'selector': [], 'metadata': { + 'top_level': 'main', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 2, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__'], + 'metadata': { + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': {'name': 'aaa222', 'structural_type': 'numpy.int64'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': {'name': 'bbb222', 'structural_type': 'numpy.int64'}, + }, { + 'selector': [0, '__ALL_ELEMENTS__'], + 'metadata': {'all_elements_on_row': 'rowA'}, + }, { + 'selector': [0, 0], + 'metadata': {'all_elements_on_row': 'rowB'}, + }, { + 'selector': [0, 1], + 'metadata': {'row': '1b', 'all_elements_on_row': 'rowB'}, + }, { + 'selector': [1, 1], + 'metadata': {'row': '2b'}, + }, { + 'selector': [2, 1], + 'metadata': {'row': '3b'}, + }]) + + self.assertEqual(main_metadata_before, main.metadata.to_internal_json_structure()) + self.assertEqual(columns_metadata_before, columns.metadata.to_internal_json_structure()) + + def test_replace_columns_noncompact_metadata(self): + main = container.DataFrame({'a1': [1, 2, 3], 'b1': [4, 5, 6], 'c1': [7, 8, 9]}, { + 'top_level': 'main', + }, generate_metadata=False) + main.metadata = main.metadata.generate(main, compact=False) + + main.metadata = main.metadata.update_column(0, {'name': 'aaa111'}) + main.metadata = main.metadata.update_column(1, {'name': 'bbb111', 'extra': 'b_column'}) + main.metadata = main.metadata.update_column(2, {'name': 'ccc111'}) + main.metadata = main.metadata.update((0, 0), {'row': '1a'}) + main.metadata = 
main.metadata.update((1, 0), {'row': '2a'}) + main.metadata = main.metadata.update((2, 0), {'row': '3a'}) + main.metadata = main.metadata.update((0, metadata_base.ALL_ELEMENTS), {'all_elements_on_row': 'rowA'}) + + main_metadata_before = main.metadata.to_internal_json_structure() + + columns = container.DataFrame({'a2': [11, 12, 13], 'b2': [14, 15, 16]}, { + 'top_level': 'columns', + }, generate_metadata=False) + columns.metadata = columns.metadata.generate(columns, compact=False) + + columns.metadata = columns.metadata.update_column(0, {'name': 'aaa222'}) + columns.metadata = columns.metadata.update_column(1, {'name': 'bbb222'}) + columns.metadata = columns.metadata.update((0, 1), {'row': '1b'}) + columns.metadata = columns.metadata.update((1, 1), {'row': '2b'}) + columns.metadata = columns.metadata.update((2, 1), {'row': '3b'}) + columns.metadata = columns.metadata.update((0, metadata_base.ALL_ELEMENTS), {'all_elements_on_row': 'rowB'}) + + columns_metadata_before = columns.metadata.to_internal_json_structure() + + new_main = main.replace_columns(columns, [1, 2]) + + self.assertEqual(new_main.values.tolist(), [[1, 11, 14], [2, 12, 15], [3, 13, 16]]) + + self.assertEqual(utils.to_json_structure(new_main.metadata.to_internal_simple_structure()), [{ + 'selector': [], 'metadata': { + 'top_level': 'main', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': {'name': 'aaa111', 'structural_type': 'numpy.int64'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': {'name': 'aaa222', 'structural_type': 'numpy.int64'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': {'name': 'bbb222', 'structural_type': 'numpy.int64'}, + }, { + 'selector': [0, '__ALL_ELEMENTS__'], + 'metadata': {'all_elements_on_row': 'rowA'}, + }, { + 'selector': [0, 0], + 'metadata': {'row': '1a'}, + }, { + 'selector': [0, 1], + 'metadata': {'all_elements_on_row': 'rowB'}, + }, { + 'selector': [0, 2], + 'metadata': {'row': '1b', 'all_elements_on_row': 'rowB'}, + }, { + 'selector': [1, 0], + 'metadata': {'row': '2a'}, + }, { + 'selector': [1, 2], + 'metadata': {'row': '2b'}, + }, { + 'selector': [2, 0], + 'metadata': {'row': '3a'}, + }, { + 'selector': [2, 2], + 'metadata': {'row': '3b'}, + }]) + + self.assertEqual(main_metadata_before, main.metadata.to_internal_json_structure()) + self.assertEqual(columns_metadata_before, columns.metadata.to_internal_json_structure()) + + new_main = main.replace_columns(columns, [0, 2]) + + self.assertEqual(new_main.values.tolist(), [[11, 4, 14], [12, 5, 15], [13, 6, 16]]) + + self.assertEqual(utils.to_json_structure(new_main.metadata.to_internal_simple_structure()), [{ + 'selector': [], 'metadata': { + 'top_level': 'main', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 3, + }, + 
}, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': {'name': 'aaa222', 'structural_type': 'numpy.int64'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': { + 'name': 'bbb111', + 'extra': 'b_column', + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': {'name': 'bbb222', 'structural_type': 'numpy.int64'}, + }, { + 'selector': [0, '__ALL_ELEMENTS__'], + 'metadata': {'all_elements_on_row': 'rowA'}, + }, { + 'selector': [0, 0], + 'metadata': {'all_elements_on_row': 'rowB'}, + }, { + 'selector': [0, 2], + 'metadata': {'row': '1b', 'all_elements_on_row': 'rowB'}, + }, { + 'selector': [1, 2], + 'metadata': {'row': '2b'}, + }, { + 'selector': [2, 2], + 'metadata': {'row': '3b'}, + }]) + + self.assertEqual(main_metadata_before, main.metadata.to_internal_json_structure()) + self.assertEqual(columns_metadata_before, columns.metadata.to_internal_json_structure()) + + new_main = main.replace_columns(columns, [1]) + + self.assertEqual(new_main.values.tolist(), [[1, 11, 14, 7], [2, 12, 15, 8], [3, 13, 16, 9]]) + + self.assertEqual(utils.to_json_structure(new_main.metadata.to_internal_simple_structure()), [{ + 'selector': [], 'metadata': { + 'top_level': 'main', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 4, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': {'name': 'aaa111', 'structural_type': 'numpy.int64'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': {'name': 'aaa222', 'structural_type': 'numpy.int64'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': {'name': 'bbb222', 'structural_type': 'numpy.int64'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 3], + 'metadata': {'name': 'ccc111', 'structural_type': 'numpy.int64'}, + }, { + 'selector': [0, '__ALL_ELEMENTS__'], + 'metadata': {'all_elements_on_row': 'rowA'}, + }, { + 'selector': [0, 0], + 'metadata': {'row': '1a'}, + }, { + 'selector': [0, 1], + 'metadata': {'all_elements_on_row': 'rowB'}, + }, { + 'selector': [0, 2], + 'metadata': {'row': '1b', 'all_elements_on_row': 'rowB'}, + }, { + 'selector': [0, 3], + 'metadata': {'all_elements_on_row': 'rowA'}, + }, { + 'selector': [1, 0], + 'metadata': {'row': '2a'}, + }, { + 'selector': [1, 2], + 'metadata': {'row': '2b'}, + }, { + 'selector': [2, 0], + 'metadata': {'row': '3a'}, + }, { + 'selector': [2, 2], + 'metadata': {'row': '3b'}, + }]) + + self.assertEqual(main_metadata_before, main.metadata.to_internal_json_structure()) + self.assertEqual(columns_metadata_before, columns.metadata.to_internal_json_structure()) + + new_main = main.replace_columns(columns, [0, 1, 2]) + + self.assertEqual(new_main.values.tolist(), [[11, 14], [12, 15], [13, 16]]) + + self.assertEqual(utils.to_json_structure(new_main.metadata.to_internal_simple_structure()), [{ + 'selector': [], 'metadata': { + 'top_level': 'main', + 'schema': 
metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 2, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': {'name': 'aaa222', 'structural_type': 'numpy.int64'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': {'name': 'bbb222', 'structural_type': 'numpy.int64'}, + }, { + 'selector': [0, '__ALL_ELEMENTS__'], + 'metadata': {'all_elements_on_row': 'rowA'}, + }, { + 'selector': [0, 0], + 'metadata': {'all_elements_on_row': 'rowB'}, + }, { + 'selector': [0, 1], + 'metadata': {'row': '1b', 'all_elements_on_row': 'rowB'}, + }, { + 'selector': [1, 1], + 'metadata': {'row': '2b'}, + }, { + 'selector': [2, 1], + 'metadata': {'row': '3b'}, + }]) + + self.assertEqual(main_metadata_before, main.metadata.to_internal_json_structure()) + self.assertEqual(columns_metadata_before, columns.metadata.to_internal_json_structure()) + + def test_select_columns_empty(self): + data = container.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]}, generate_metadata=True) + + with self.assertRaises(Exception): + data.select_columns([]) + + with self.assertRaises(Exception): + data.metadata.select_columns([]) + + selected = data.select_columns([], allow_empty_columns=True) + + self.assertEqual(selected.shape, (3, 0)) + + self.assertEqual(utils.to_json_structure(selected.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + 'dimension': { + 'length': 3, + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + }, + 'schema': 'https://metadata.datadrivendiscovery.org/schemas/v0/container.json', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'structural_type': 'd3m.container.pandas.DataFrame', + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'length': 0, + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + }, + }, + }]) + + def test_dataframe_select_copy(self): + df = container.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]}) + + selection = df.select_columns([0]) + + with warnings.catch_warnings(record=True) as w: + selection.iloc[:, 0] = selection.iloc[:, 0].map(lambda x: x + 1) + + self.assertEqual(len(w), 0) + + self.assertEqual(selection.values.tolist(), [[2], [3], [4]]) + self.assertEqual(df.values.tolist(), [[1, 4], [2, 5], [3, 6]]) + + def test_save_container_empty_dataset(self): + dataset = container.Dataset({}, generate_metadata=True) + + with tempfile.TemporaryDirectory() as temp_directory: + container_utils.save_container(dataset, os.path.join(temp_directory, 'dataset')) + + +if __name__ == '__main__': + unittest.main() diff --git a/d3m/tests/test_dataset.py b/d3m/tests/test_dataset.py new file mode 100644 index 0000000..c213444 --- /dev/null +++ b/d3m/tests/test_dataset.py @@ -0,0 +1,2094 @@ +import collections +import datetime +import filecmp +import glob +import json +import os +import os.path +import shutil +import sys +import tempfile +import unittest +import uuid + +import frozendict +import numpy +from sklearn import datasets + 
+COMMON_PRIMITIVES_DIR = os.path.join(os.path.dirname(__file__), 'common-primitives') +# NOTE: This insertion should appear before any code attempting to resolve or load primitives, +# so the git submodule version of `common-primitives` is looked at first. +sys.path.insert(0, COMMON_PRIMITIVES_DIR) + +from common_primitives.column_parser import ColumnParserPrimitive +from common_primitives.dataset_to_dataframe import DatasetToDataFramePrimitive + +from d3m import container, exceptions, utils +from d3m.container import dataset +from d3m.metadata import base as metadata_base, pipeline_run + + +def convert_metadata(metadata): + return json.loads(json.dumps(metadata, cls=utils.JsonEncoder)) + + +def make_regular_dict_and_list(obj): + if isinstance(obj, (collections.OrderedDict, frozendict.FrozenOrderedDict, frozendict.frozendict)): + obj = dict(obj) + if isinstance(obj, tuple): + obj = list(obj) + + if isinstance(obj, list): + obj = [make_regular_dict_and_list(o) for o in obj] + + if isinstance(obj, dict): + obj = {k: make_regular_dict_and_list(v) for k, v in obj.items()} + + return obj + + +def _normalize_dataset_description(dataset_description, dataset_name=None): + for key in ('digest', 'datasetVersion', 'datasetSchemaVersion', 'redacted'): + dataset_description['about'].pop(key, None) + + for i, r in enumerate(dataset_description.get('dataResources', [])): + for j, c in enumerate(r.get('columns', [])): + if 'attribute' in c['role'] and len(c['role']) > 1: + k = c['role'].index('attribute') + c['role'].pop(k) + dataset_description['dataResources'][i]['columns'][j] = c + + if dataset_name == 'audio_dataset_1': + del dataset_description['dataResources'][1]['columns'][2]['refersTo'] + del dataset_description['dataResources'][1]['columns'][3]['refersTo'] + + if dataset_name == 'dataset_TEST': + dataset_description['about']['datasetID'] = 'object_dataset_1_TEST' + + dataset_description.pop('qualities', None) + + return dataset_description + + +class TestDataset(unittest.TestCase): + def setUp(self): + self.test_dir = tempfile.mkdtemp() + + def tearDown(self): + shutil.rmtree(self.test_dir) + + def test_d3m(self): + dataset_doc_path = os.path.abspath( + os.path.join(os.path.dirname(__file__), 'data', 'datasets', 'iris_dataset_1', 'datasetDoc.json') + ) + + ds = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path)) + + self._test_d3m(ds, dataset_doc_path) + + pipeline_run.validate_dataset(ds.to_json_structure(canonical=True)) + metadata_base.CONTAINER_SCHEMA_VALIDATOR.validate(ds.to_json_structure(canonical=True)) + + def _test_d3m(self, ds, dataset_doc_path): + ds.metadata.check(ds) + + for row in ds['learningData']: + for cell in row: + # Nothing should be parsed from a string. 
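+                # (Values are loaded verbatim as strings; parsing them into typed columns is left
+                # to primitives such as ColumnParserPrimitive, imported above.)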
+ self.assertIsInstance(cell, str, dataset_doc_path) + + self.assertEqual(len(ds['learningData']), 150, dataset_doc_path) + self.assertEqual(len(ds['learningData'].iloc[0]), 6, dataset_doc_path) + + self.maxDiff = None + + self.assertEqual( + convert_metadata(ds.metadata.query(())), + { + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.dataset.Dataset', + 'id': 'iris_dataset_1', + 'name': 'Iris Dataset', + 'location_uris': ['file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path)], + 'source': {'license': 'CC', 'redacted': False, 'human_subjects_research': False}, + 'dimension': { + 'length': 1, + 'name': 'resources', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/DatasetResource'], + }, + 'digest': '6191a49372f185f530920ffa35a3c4a78034ec47247aa23474537c449d37323b', + 'version': '4.0.0', + }, + dataset_doc_path, + ) + + self.assertEqual( + convert_metadata(ds.metadata.query(('learningData',))), + { + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/Table', + 'https://metadata.datadrivendiscovery.org/types/DatasetEntryPoint', + ], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 150, + }, + }, + dataset_doc_path, + ) + + self.assertEqual( + convert_metadata(ds.metadata.query(('learningData', metadata_base.ALL_ELEMENTS))), + { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 6, + } + }, + dataset_doc_path, + ) + + self.assertEqual( + convert_metadata(ds.metadata.query(('learningData', metadata_base.ALL_ELEMENTS, 0))), + { + 'name': 'd3mIndex', + 'structural_type': 'str', + 'semantic_types': [ + 'http://schema.org/Integer', + 'https://metadata.datadrivendiscovery.org/types/PrimaryKey', + ], + }, + dataset_doc_path, + ) + + for i in range(1, 5): + self.assertEqual( + convert_metadata(ds.metadata.query(('learningData', metadata_base.ALL_ELEMENTS, i))), + { + 'name': ['sepalLength', 'sepalWidth', 'petalLength', 'petalWidth'][i - 1], + 'structural_type': 'str', + 'semantic_types': [ + 'http://schema.org/Float', + 'https://metadata.datadrivendiscovery.org/types/Attribute', + ], + }, + dataset_doc_path, + ) + + self.assertEqual( + convert_metadata(ds.metadata.query(('learningData', metadata_base.ALL_ELEMENTS, 5))), + { + 'name': 'species', + 'structural_type': 'str', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/CategoricalData', + 'https://metadata.datadrivendiscovery.org/types/SuggestedTarget', + 'https://metadata.datadrivendiscovery.org/types/Attribute', + ], + }, + dataset_doc_path, + ) + + def test_d3m_lazy(self): + dataset_doc_path = os.path.abspath( + os.path.join(os.path.dirname(__file__), 'data', 'datasets', 'iris_dataset_1', 'datasetDoc.json') + ) + + ds = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path), lazy=True) + + ds.metadata.check(ds) + + self.assertTrue(len(ds) == 0) + self.assertTrue(ds.is_lazy()) + + self.maxDiff = None + + self.assertEqual( + convert_metadata(ds.metadata.query(())), + { + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.dataset.Dataset', + 'id': 'iris_dataset_1', + 'name': 'Iris Dataset', + 'location_uris': ['file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path)], + 'source': {'license': 'CC', 'redacted': False, 
'human_subjects_research': False}, + 'dimension': { + 'length': 0, + 'name': 'resources', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/DatasetResource'], + }, + 'digest': '6191a49372f185f530920ffa35a3c4a78034ec47247aa23474537c449d37323b', + 'version': '4.0.0', + }, + ) + + self.assertEqual(convert_metadata(ds.metadata.query(('learningData',))), {}) + self.assertEqual(convert_metadata(ds.metadata.query(('learningData', metadata_base.ALL_ELEMENTS))), {}) + self.assertEqual(convert_metadata(ds.metadata.query(('learningData', metadata_base.ALL_ELEMENTS, 0))), {}) + + ds.load_lazy() + + self.assertFalse(ds.is_lazy()) + + self._test_d3m(ds, dataset_doc_path) + + def test_d3m_minimal_metadata(self): + dataset_doc_path = os.path.abspath( + os.path.join(os.path.dirname(__file__), 'data', 'datasets', 'iris_dataset_3', 'datasetDoc.json') + ) + + ds = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path)) + + self._test_d3m_minimal_metadata(ds, dataset_doc_path) + + pipeline_run.validate_dataset(ds.to_json_structure(canonical=True)) + metadata_base.CONTAINER_SCHEMA_VALIDATOR.validate(ds.to_json_structure(canonical=True)) + + def _test_d3m_minimal_metadata(self, ds, dataset_doc_path): + ds.metadata.check(ds) + + for row in ds['learningData']: + for cell in row: + # Nothing should be parsed from a string. + self.assertIsInstance(cell, str, dataset_doc_path) + + self.assertEqual(len(ds['learningData']), 150, dataset_doc_path) + self.assertEqual(len(ds['learningData'].iloc[0]), 6, dataset_doc_path) + + self.maxDiff = None + + self.assertEqual( + convert_metadata(ds.metadata.query(())), + { + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.dataset.Dataset', + 'id': 'iris_dataset_3', + 'name': 'Iris Dataset with minimal metadata', + 'location_uris': ['file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path)], + 'source': {'license': 'CC', 'redacted': False, 'human_subjects_research': False}, + 'dimension': { + 'length': 1, + 'name': 'resources', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/DatasetResource'], + }, + 'digest': '4a0b43c5e5a76919b42b2066015ba0962512beb8600919dfffa4e2ad604e446d', + 'version': '4.0.0', + }, + dataset_doc_path, + ) + + self.assertEqual( + convert_metadata(ds.metadata.query(('learningData',))), + { + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/Table', + 'https://metadata.datadrivendiscovery.org/types/DatasetEntryPoint', + ], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 150, + }, + }, + dataset_doc_path, + ) + + self.assertEqual( + convert_metadata(ds.metadata.query(('learningData', metadata_base.ALL_ELEMENTS))), + { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 6, + } + }, + dataset_doc_path, + ) + + self.assertEqual( + convert_metadata(ds.metadata.query(('learningData', metadata_base.ALL_ELEMENTS, 0))), + { + 'name': 'd3mIndex', + 'structural_type': 'str', + 'semantic_types': [ + 'http://schema.org/Integer', + 'https://metadata.datadrivendiscovery.org/types/PrimaryKey', + ], + }, + dataset_doc_path, + ) + + for i in range(1, 6): + self.assertEqual( + convert_metadata(ds.metadata.query(('learningData', metadata_base.ALL_ELEMENTS, i))), + { + 'name': ['sepalLength', 'sepalWidth', 'petalLength', 
'petalWidth', 'species'][i - 1], + 'structural_type': 'str', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/UnknownType', + ], + }, + dataset_doc_path, + ) + + def test_d3m_saver(self): + at_least_one = False + + for dirpath, dirnames, filenames in os.walk( + os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'datasets')) + ): + if 'datasetDoc.json' in filenames: + # Do not traverse further (to not parse "datasetDoc.json" or "problemDoc.json" if they + # exists in raw data filename). + dirnames[:] = [] + + dataset_path = os.path.join(os.path.abspath(dirpath), 'datasetDoc.json') + dataset_name = dataset_path.split(os.path.sep)[-2] + + # We skip "graph_dataset_1" because saving changes GML file to a file collection. + # We skip "iris_dataset_2" and "iris_dataset_3" because when loading we add additional metadata. + if 'graph_dataset_1' not in dataset_path and 'iris_dataset_3' not in dataset_path and 'iris_dataset_2' not in dataset_path: + self._test_d3m_saver(dataset_path, dataset_name) + self._test_d3m_saver_digest(dataset_path, dataset_name) + at_least_one = True + + self.assertTrue(at_least_one) + + def test_d3m_saver_update_column_description(self): + source_dataset_path = os.path.abspath( + os.path.join(os.path.dirname(__file__), 'data', 'datasets', 'audio_dataset_1') + ) + source_dataset_uri = 'file://{dataset_path}'.format( + dataset_path=os.path.join(source_dataset_path, 'datasetDoc.json') + ) + + output_dataset_path = os.path.join(self.test_dir, 'audio_dataset_1') + output_dataset_uri = 'file://{dataset_path}'.format( + dataset_path=os.path.join(output_dataset_path, 'datasetDoc.json') + ) + + selector = ('learningData', metadata_base.ALL_ELEMENTS, 0) + new_metadata = {'description': 'Audio files'} + ds = container.Dataset.load(source_dataset_uri) + ds.metadata = ds.metadata.update(selector, new_metadata) + ds.save(output_dataset_uri) + ds2 = container.Dataset.load(output_dataset_uri) + + self.assertEqual(convert_metadata(ds.metadata.query(selector)), convert_metadata(ds2.metadata.query(selector))) + + def test_d3m_saver_file_columns(self): + source_dataset_path = os.path.abspath( + os.path.join(os.path.dirname(__file__), 'data', 'datasets', 'multivariate_dataset_1') + ) + source_dataset_uri = 'file://{dataset_path}'.format( + dataset_path=os.path.join(source_dataset_path, 'datasetDoc.json') + ) + + output_dataset_path = os.path.join(self.test_dir, 'multivariate_dataset_1') + output_dataset_uri = 'file://{dataset_path}'.format( + dataset_path=os.path.join(output_dataset_path, 'datasetDoc.json') + ) + + ds = container.Dataset.load(source_dataset_uri) + ds.save(output_dataset_uri) + + with open(os.path.join(source_dataset_path, 'datasetDoc.json'), 'r') as f: + source_dataset_description = _normalize_dataset_description(json.load(f)) + + with open(os.path.join(output_dataset_path, 'datasetDoc.json'), 'r') as f: + output_dataset_description = _normalize_dataset_description(json.load(f)) + + self.assertEqual(source_dataset_description, output_dataset_description) + + source_files = [ + x + for x in glob.iglob(os.path.join(source_dataset_path, '**'), recursive=True) + if os.path.isfile(x) and os.path.basename(x) != 'datasetDoc.json' + ] + output_files = [ + x + for x in glob.iglob(os.path.join(output_dataset_path, '**'), recursive=True) + if os.path.isfile(x) and os.path.basename(x) != 'datasetDoc.json' + ] + + for x, y in zip(source_files, output_files): + self.assertTrue(filecmp.cmp(x, y, shallow=False), (x, y)) + + source_relative_filepaths = 
[os.path.relpath(x, source_dataset_path) for x in source_files] + output_relative_filepaths = [os.path.relpath(x, output_dataset_path) for x in output_files] + self.assertEqual(source_relative_filepaths, output_relative_filepaths) + + def test_load_sklearn_save_d3m(self): + self.maxDiff = None + + for dataset_path in ['boston', 'breast_cancer', 'diabetes', 'digits', 'iris', 'linnerud']: + source_dataset_uri = 'sklearn://{dataset_path}'.format(dataset_path=dataset_path) + output_dateset_doc_path = os.path.join(self.test_dir, 'sklearn', dataset_path, 'datasetDoc.json') + output_dateset_doc_uri = 'file://{output_dateset_doc_path}'.format( + output_dateset_doc_path=output_dateset_doc_path + ) + + sklearn_dataset = container.Dataset.load(source_dataset_uri) + sklearn_dataset.save(output_dateset_doc_uri) + + self.assertTrue(os.path.exists(output_dateset_doc_path)) + + d3m_dataset = container.Dataset.load(output_dateset_doc_uri) + + sklearn_metadata = make_regular_dict_and_list(sklearn_dataset.metadata.to_internal_simple_structure()) + d3m_metadata = make_regular_dict_and_list(d3m_dataset.metadata.to_internal_simple_structure()) + + del sklearn_metadata[0]['metadata']['digest'] + del d3m_metadata[0]['metadata']['digest'] + del sklearn_metadata[0]['metadata']['location_uris'] + del d3m_metadata[0]['metadata']['location_uris'] + + # When saving, we convert all columns to string type. + for metadata_index in range(3, len(sklearn_metadata)): + sklearn_metadata[metadata_index]['metadata']['structural_type'] = str + + # Additional metadata added when saving. + sklearn_metadata.insert( + 3, + { + 'metadata': {'structural_type': str}, + 'selector': ['learningData', metadata_base.ALL_ELEMENTS, metadata_base.ALL_ELEMENTS], + }, + ) + + self.assertEqual(sklearn_metadata, d3m_metadata) + + def test_load_csv_save_d3m(self): + source_csv_path = os.path.abspath( + os.path.join(os.path.dirname(__file__), 'data', 'datasets', 'iris_dataset_1', 'tables', 'learningData.csv') + ) + output_csv_path = os.path.join( + self.test_dir, 'load_csv_save_d3m', 'iris_dataset_1', 'tables', 'learningData.csv' + ) + source_csv_uri = 'file://{source_csv_path}'.format(source_csv_path=source_csv_path) + + output_dateset_doc_path = os.path.join(self.test_dir, 'load_csv_save_d3m', 'iris_dataset_1', 'datasetDoc.json') + output_dateset_doc_uri = 'file://{output_dateset_doc_path}'.format( + output_dateset_doc_path=output_dateset_doc_path + ) + + csv_dataset = container.Dataset.load(source_csv_uri) + csv_dataset.save(output_dateset_doc_uri) + + self.assertTrue(os.path.exists(output_dateset_doc_path)) + self.assertTrue(os.path.exists(output_csv_path)) + self.assertTrue(filecmp.cmp(source_csv_path, output_csv_path)) + + d3m_dataset = container.Dataset.load(output_dateset_doc_uri) + + csv_metadata = make_regular_dict_and_list(csv_dataset.metadata.to_internal_simple_structure()) + d3m_metadata = make_regular_dict_and_list(d3m_dataset.metadata.to_internal_simple_structure()) + + del csv_metadata[0]['metadata']['digest'] + del d3m_metadata[0]['metadata']['digest'] + del csv_metadata[0]['metadata']['location_uris'] + del d3m_metadata[0]['metadata']['location_uris'] + del d3m_metadata[0]['metadata']['approximate_stored_size'] + + # Additional metadata added when saving. 
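+        # (Assumption exercised here: saving writes every cell out as a string, so the reloaded
+        # dataset is expected to gain an ALL_ELEMENTS entry with structural type "str", mirrored below.)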
+ csv_metadata.insert( + 3, + { + 'metadata': {'structural_type': str}, + 'selector': ['learningData', metadata_base.ALL_ELEMENTS, metadata_base.ALL_ELEMENTS], + }, + ) + + self.assertEqual(csv_metadata, d3m_metadata) + + def _test_d3m_saver(self, dataset_path, dataset_name): + self.maxDiff = None + + try: + input_dataset_doc_path = dataset_path + output_dateset_doc_path = os.path.join(self.test_dir, dataset_name, 'datasetDoc.json') + + with open(input_dataset_doc_path, 'r', encoding='utf8') as f: + input_dataset_description = json.load(f) + + ds = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=input_dataset_doc_path)) + ds.save('file://{dataset_doc_path}'.format(dataset_doc_path=output_dateset_doc_path)) + + with open(output_dateset_doc_path) as f: + output_dataset_description = json.load(f) + + input_dataset_description = _normalize_dataset_description(input_dataset_description) + output_dataset_description = _normalize_dataset_description(output_dataset_description, dataset_name) + + self.assertDictEqual(input_dataset_description, output_dataset_description, dataset_name) + + source_files = [ + x for x in glob.iglob(os.path.join(os.path.dirname(input_dataset_doc_path), '**'), recursive=True) + ] + output_files = [ + x for x in glob.iglob(os.path.join(os.path.dirname(output_dateset_doc_path), '**'), recursive=True) + ] + + source_relative_filepaths = [ + os.path.relpath(x, os.path.dirname(input_dataset_doc_path)) for x in source_files + ] + output_relative_filepaths = [ + os.path.relpath(x, os.path.dirname(output_dateset_doc_path)) for x in output_files + ] + + self.assertEqual(source_relative_filepaths, output_relative_filepaths, dataset_name) + + source_files = [x for x in source_files if (x != input_dataset_doc_path) and os.path.isfile(x)] + output_files = [x for x in output_files if (x != output_dateset_doc_path) and os.path.isfile(x)] + + for x, y in zip(source_files, output_files): + if dataset_name == 'dataset_TEST' and os.path.basename(x) == 'learningData.csv': + continue + + self.assertTrue(filecmp.cmp(x, y, shallow=False), (dataset_name, x, y)) + + finally: + shutil.rmtree(os.path.join(self.test_dir, dataset_name), ignore_errors=True) + + def _test_d3m_saver_digest(self, dataset_path, dataset_name): + self.maxDiff = None + + try: + # Load original dataset and store it's digest + original_dataset_uri = 'file://{dataset_path}'.format(dataset_path=dataset_path) + original_dataset = container.Dataset.load(original_dataset_uri) + original_dateset_digest = original_dataset.metadata.query(())['digest'] + + # Save the dataset to a new location + output_dataset_path = os.path.join(self.test_dir, dataset_name) + output_dataset_uri = 'file://{dataset_path}'.format( + dataset_path=os.path.join(output_dataset_path, 'datasetDoc.json') + ) + original_dataset.save(output_dataset_uri) + + # Load the dataset from the new location and store the digest + output_dataset = container.Dataset.load(output_dataset_uri) + output_dataset_digest = output_dataset.metadata.query(())['digest'] + + # Remove digest from the in-memory dataset and store the dataset to a new location + output_dataset.metadata = output_dataset.metadata.update((), {'digest': metadata_base.NO_VALUE}) + new_output_dataset_path = os.path.join(self.test_dir, dataset_name + '_new') + output_dataset_uri = 'file://{dataset_path}'.format( + dataset_path=os.path.join(new_output_dataset_path, 'datasetDoc.json') + ) + output_dataset.save(output_dataset_uri) + + # Load digest from the stored datasetDoc.json + with 
open(os.path.join(new_output_dataset_path, 'datasetDoc.json'), 'r') as f: + saved_digest = json.load(f)['about']['digest'] + + # Calculate dataset digest with the reference function + reference_dataset_digest = dataset.get_d3m_dataset_digest( + os.path.join(output_dataset_path, 'datasetDoc.json') + ) + + self.assertEqual(output_dataset_digest, saved_digest) + self.assertEqual(output_dataset_digest, reference_dataset_digest) + + finally: + shutil.rmtree(os.path.join(self.test_dir, dataset_name), ignore_errors=True) + shutil.rmtree(os.path.join(self.test_dir, dataset_name + '_new'), ignore_errors=True) + + def test_d3m_preserve_edge_list_resource_type(self): + source_dataset_path = os.path.abspath( + os.path.join(os.path.dirname(__file__), 'data', 'datasets', 'graph_dataset_2') + ) + source_dataset_uri = 'file://{dataset_path}'.format( + dataset_path=os.path.join(source_dataset_path, 'datasetDoc.json') + ) + + output_dataset_path = os.path.join(self.test_dir, 'graph_dataset_2') + output_dataset_uri = 'file://{dataset_path}'.format( + dataset_path=os.path.join(output_dataset_path, 'datasetDoc.json') + ) + + ds_1 = container.Dataset.load(source_dataset_uri) + ds_1.save(output_dataset_uri) + ds_2 = container.Dataset.load(output_dataset_uri) + + selector = ('edgeList',) + + self.assertEqual( + convert_metadata(ds_1.metadata.query(selector)), convert_metadata(ds_2.metadata.query(selector)) + ) + + def test_d3m_saver_qualities(self): + self.maxDiff = None + + source_dataset_path = os.path.abspath( + os.path.join(os.path.dirname(__file__), 'data', 'datasets', 'audio_dataset_1') + ) + source_dataset_uri = 'file://{dataset_path}'.format( + dataset_path=os.path.join(source_dataset_path, 'datasetDoc.json') + ) + output_dataset_path = os.path.join(self.test_dir, 'audio_dataset_1') + output_dataset_uri = 'file://{dataset_path}'.format( + dataset_path=os.path.join(output_dataset_path, 'datasetDoc.json') + ) + + ds = container.Dataset.load(source_dataset_uri) + # Insert a non-standard dataset value to test quality saving / loading. + ds.metadata = ds.metadata.update((), {'additional_quality': 'some value'}) + ds.save(output_dataset_uri) + ds2 = container.Dataset.load(output_dataset_uri) + + ds.metadata = ds.metadata.update((), {'location_uris': ''}) + ds.metadata = ds.metadata.update((), {'digest': ''}) + ds.metadata = ds.metadata.update(('0', metadata_base.ALL_ELEMENTS, 0), {'location_base_uris': ''}) + + ds2.metadata = ds2.metadata.update((), {'location_uris': ''}) + ds2.metadata = ds2.metadata.update((), {'digest': ''}) + ds2.metadata = ds2.metadata.update(('0', metadata_base.ALL_ELEMENTS, 0), {'location_base_uris': ''}) + + ds_metadata = make_regular_dict_and_list(ds.metadata.to_internal_simple_structure()) + ds2_metadata = make_regular_dict_and_list(ds2.metadata.to_internal_simple_structure()) + + # Additional metadata added when saving. 
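+ # The reloaded dataset gains an all-elements `str` entry for each saved resource (the file collection '0' and the 'learningData' table), so both entries are mirrored into the expected metadata below.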
+ ds_metadata.insert( + 3, + { + 'metadata': {'structural_type': str}, + 'selector': ['0', metadata_base.ALL_ELEMENTS, metadata_base.ALL_ELEMENTS], + }, + ) + ds_metadata.insert( + 7, + { + 'metadata': {'structural_type': str}, + 'selector': ['learningData', metadata_base.ALL_ELEMENTS, metadata_base.ALL_ELEMENTS], + }, + ) + + self.assertEqual(ds_metadata, ds2_metadata) + + ds2 = container.Dataset.load(output_dataset_uri, lazy=True) + + ds2.metadata = ds2.metadata.update((), {'location_uris': ''}) + ds2.metadata = ds2.metadata.update((), {'digest': ''}) + ds2.metadata = ds2.metadata.update(('0', metadata_base.ALL_ELEMENTS, 0), {'location_base_uris': ''}) + + ds_metadata = make_regular_dict_and_list(ds.metadata.query(())) + ds2_metadata = make_regular_dict_and_list(ds2.metadata.query(())) + + ds_metadata['dimension']['length'] = 0 + + self.assertEqual(ds_metadata, ds2_metadata) + + def test_d3m_saver_synthetic_dataset(self): + dataset_path = os.path.abspath(os.path.join(self.test_dir, 'synthetic_dataset_1', 'datasetDoc.json')) + dataset_uri = 'file://{dataset_path}'.format(dataset_path=dataset_path) + + df = container.DataFrame([[0]], columns=['col_1'], generate_metadata=False) + ds = container.Dataset(resources={'someData': df}, generate_metadata=True) + + ds.metadata = ds.metadata.update( + (), + { + 'custom_metadata_1': 'foo', + 'custom_metadata_2': datetime.datetime(2019, 6, 6), + 'deleted_metadata': metadata_base.NO_VALUE, + }, + ) + + with self.assertRaises(exceptions.InvalidMetadataError): + ds.save(dataset_uri) + + ds.metadata = ds.metadata.update((), {'id': 'synthetic_dataset_1', 'name': 'Synthetic dataset 1'}) + + ds.save(dataset_uri) + ds2 = container.Dataset.load(dataset_uri) + + self.assertEqual( + make_regular_dict_and_list(ds2.metadata.to_internal_simple_structure()), + [ + { + 'selector': [], + 'metadata': { + 'custom_metadata_1': 'foo', + 'custom_metadata_2': datetime.datetime(2019, 6, 6), + 'deleted_metadata': metadata_base.NO_VALUE, + 'digest': 'bc41e654599e31169061ce5f6b99133e6220eea2a83c53f55c653e4d9a4b67e2', + 'dimension': { + 'length': 1, + 'name': 'resources', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/DatasetResource'], + }, + 'id': 'synthetic_dataset_1', + 'location_uris': [dataset_uri], + 'name': 'Synthetic dataset 1', + 'schema': 'https://metadata.datadrivendiscovery.org/schemas/v0/container.json', + 'structural_type': container.Dataset, + }, + }, + { + 'selector': ['someData'], + 'metadata': { + 'dimension': { + 'length': 1, + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + }, + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'structural_type': container.DataFrame, + }, + }, + { + 'selector': ['someData', metadata_base.ALL_ELEMENTS], + 'metadata': { + 'dimension': { + 'length': 1, + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + } + }, + }, + { + 'selector': ['someData', metadata_base.ALL_ELEMENTS, metadata_base.ALL_ELEMENTS], + 'metadata': {'structural_type': str}, + }, + { + 'selector': ['someData', metadata_base.ALL_ELEMENTS, 0], + 'metadata': { + 'name': 'col_1', + 'semantic_types': ['http://schema.org/Integer'], + 'structural_type': str, + }, + }, + ], + ) + + def test_d3m_saver_synthetic_dataset_2(self): + self.maxDiff = None + + dataset_path = os.path.abspath(os.path.join(self.test_dir, 'synthetic_dataset_2', 'datasetDoc.json')) + dataset_uri = 
'file://{dataset_path}'.format(dataset_path=dataset_path) + + df = container.DataFrame({'col_1': [0], 'col_2': [0.0]}, generate_metadata=True) + synthetic_dataset = container.Dataset(resources={'learningData': df}, generate_metadata=True) + + with self.assertRaises(exceptions.InvalidMetadataError): + synthetic_dataset.save(dataset_uri) + + synthetic_dataset.metadata = synthetic_dataset.metadata.update( + (), {'id': 'synthetic_dataset_2', 'name': 'Synthetic dataset 2'} + ) + + synthetic_dataset.save(dataset_uri) + + loaded_dataset = container.Dataset.load(dataset_uri) + + hyperparams_class = DatasetToDataFramePrimitive.metadata.get_hyperparams() + primitive = DatasetToDataFramePrimitive(hyperparams=hyperparams_class.defaults()) + loaded_dataframe = primitive.produce(inputs=loaded_dataset).value + + hyperparams_class = ColumnParserPrimitive.metadata.get_hyperparams() + primitive = ColumnParserPrimitive(hyperparams=hyperparams_class.defaults()) + loaded_dataframe = primitive.produce(inputs=loaded_dataframe).value + + self.assertEqual( + make_regular_dict_and_list(loaded_dataframe.metadata.to_internal_simple_structure()), + [ + { + 'selector': [], + 'metadata': { + 'dimension': { + 'length': 1, + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + }, + 'schema': 'https://metadata.datadrivendiscovery.org/schemas/v0/container.json', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'structural_type': container.DataFrame, + }, + }, + { + 'selector': [metadata_base.ALL_ELEMENTS], + 'metadata': { + 'dimension': { + 'length': 2, + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + } + }, + }, + { + 'selector': [metadata_base.ALL_ELEMENTS, metadata_base.ALL_ELEMENTS], + 'metadata': {'structural_type': str}, + }, + { + 'selector': [metadata_base.ALL_ELEMENTS, 0], + 'metadata': { + 'name': 'col_1', + 'semantic_types': ['http://schema.org/Integer'], + 'structural_type': int, + }, + }, + { + 'selector': [metadata_base.ALL_ELEMENTS, 1], + 'metadata': { + 'name': 'col_2', + 'semantic_types': ['http://schema.org/Float'], + 'structural_type': float, + }, + }, + ], + ) + + def test_d3m_saver_unknown_type(self): + metadata = metadata_base.DataMetadata() + + metadata = metadata.update( + (), + { + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': container.DataFrame, + 'id': 'multi_source_1', + 'version': '1.0', + 'name': 'A multi source dataset', + 'source': {'license': 'CC0', 'redacted': False, 'human_subjects_research': False}, + 'dimension': { + 'length': 1, + 'name': 'resources', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/DatasetResource', + 'https://metadata.datadrivendiscovery.org/types/SuggestedTarget', + 'https://metadata.datadrivendiscovery.org/types/Attribute', + ], + }, + }, + ) + + metadata = metadata.update( + ('learningData',), + { + 'structural_type': container.DataFrame, + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/Table', + 'https://metadata.datadrivendiscovery.org/types/DatasetEntryPoint', + ], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 3, + }, + }, + ) + + metadata = metadata.update( + ('learningData', metadata_base.ALL_ELEMENTS), + { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 3, + } + }, + ) + + metadata = 
metadata.update( + ('learningData', metadata_base.ALL_ELEMENTS, 0), + { + 'name': 'd3mIndex', + 'structural_type': str, + 'semantic_types': [ + 'http://schema.org/Integer', + 'https://metadata.datadrivendiscovery.org/types/PrimaryKey', + ], + }, + ) + + metadata = metadata.update( + ('learningData', metadata_base.ALL_ELEMENTS, 1), + { + 'name': 'sepalLength', + 'structural_type': str, + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/UnknownType', + 'https://metadata.datadrivendiscovery.org/types/Attribute', + ], + }, + ) + + metadata = metadata.update( + ('learningData', metadata_base.ALL_ELEMENTS, 2), + { + 'name': 'species', + 'structural_type': str, + 'semantic_types': [ + 'http://schema.org/Float', + 'https://metadata.datadrivendiscovery.org/types/UnknownType', + 'https://metadata.datadrivendiscovery.org/types/SuggestedTarget', + ], + }, + ) + + dataset_path = os.path.abspath(os.path.join(self.test_dir, 'unknown_columns_1', 'datasetDoc.json')) + dataset_uri = 'file://{dataset_path}'.format(dataset_path=dataset_path) + + df = container.DataFrame([[0, 0.1, 'Iris-setosa']], columns=['d3mIndex', 'sepalLength', 'species']) + ds = container.Dataset(resources={'learningData': df}, metadata=metadata) + + with self.assertRaises(exceptions.InvalidMetadataError): + ds.save(dataset_uri) + + metadata = metadata.update( + ('learningData', metadata_base.ALL_ELEMENTS, 2), + { + 'name': 'species', + 'structural_type': str, + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/UnknownType', + 'https://metadata.datadrivendiscovery.org/types/SuggestedTarget', + ], + }, + ) + + ds = container.Dataset(resources={'learningData': df}, metadata=metadata) + ds.save(dataset_uri) + + ds = container.Dataset.load(dataset_uri) + + self.assertEqual( + convert_metadata(ds.metadata.query(('learningData', metadata_base.ALL_ELEMENTS, 0))), + { + 'name': 'd3mIndex', + 'structural_type': 'str', + 'semantic_types': [ + 'http://schema.org/Integer', + 'https://metadata.datadrivendiscovery.org/types/PrimaryKey', + ], + }, + ) + + self.assertEqual( + convert_metadata(ds.metadata.query(('learningData', metadata_base.ALL_ELEMENTS, 1))), + { + 'name': 'sepalLength', + 'structural_type': 'str', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/UnknownType', + 'https://metadata.datadrivendiscovery.org/types/Attribute', + ], + }, + ) + + self.assertEqual( + convert_metadata(ds.metadata.query(('learningData', metadata_base.ALL_ELEMENTS, 2))), + { + 'name': 'species', + 'structural_type': 'str', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/UnknownType', + 'https://metadata.datadrivendiscovery.org/types/SuggestedTarget', + 'https://metadata.datadrivendiscovery.org/types/Attribute', + ], + }, + ) + + def test_d3m_saver_multi_source(self): + shutil.copytree( + os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'datasets', 'raw_dataset_1')), + os.path.join(self.test_dir, 'raw_dataset_1'), + ) + shutil.copytree( + os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'datasets', 'image_dataset_1')), + os.path.join(self.test_dir, 'image_dataset_1'), + ) + shutil.copytree( + os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'datasets', 'image_dataset_1')), + os.path.join(self.test_dir, 'image_dataset_2'), + ) + + metadata = metadata_base.DataMetadata() + + metadata = metadata.update( + (), + { + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': container.DataFrame, + 'id': 'multi_source_1', + 
'version': '1.0', + 'name': 'A multi source dataset', + 'source': {'license': 'CC0', 'redacted': False, 'human_subjects_research': False}, + 'dimension': { + 'length': 1, + 'name': 'resources', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/DatasetResource'], + }, + }, + ) + + metadata = metadata.update( + ('learningData',), + { + 'structural_type': container.DataFrame, + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/Table', + 'https://metadata.datadrivendiscovery.org/types/FilesCollection', + ], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 3, + }, + }, + ) + + metadata = metadata.update( + ('learningData', metadata_base.ALL_ELEMENTS), + { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 1, + } + }, + ) + + metadata = metadata.update( + ('learningData', metadata_base.ALL_ELEMENTS, 0), + { + 'media_types': ['image/jpeg', 'image/png', 'text/csv'], + 'name': 'filename', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/PrimaryKey', + 'https://metadata.datadrivendiscovery.org/types/FileName', + 'https://metadata.datadrivendiscovery.org/types/UnspecifiedStructure', + ], + 'structural_type': str, + }, + ) + + metadata = metadata.update( + ('learningData', 0, 0), + { + 'location_base_uris': [ + 'file://{dataset_doc_path}'.format( + dataset_doc_path=os.path.join(self.test_dir, 'raw_dataset_1', 'raw') + '/' + ) + ], + 'media_types': ['text/csv'], + }, + ) + + metadata = metadata.update( + ('learningData', 1, 0), + { + 'location_base_uris': [ + 'file://{dataset_doc_path}'.format( + dataset_doc_path=os.path.join(self.test_dir, 'image_dataset_1', 'media') + '/' + ) + ], + 'media_types': ['image/png'], + }, + ) + + metadata = metadata.update( + ('learningData', 2, 0), + { + 'location_base_uris': [ + 'file://{dataset_doc_path}'.format( + dataset_doc_path=os.path.join(self.test_dir, 'image_dataset_2', 'media') + '/' + ) + ], + 'media_types': ['image/jpeg'], + }, + ) + + df = container.DataFrame({'filename': ['complementaryData.csv', 'cifar10_bird_1.png', '001_HandPhoto_left_01.jpg']}) + + ds = container.Dataset(resources={'learningData': df}, metadata=metadata) + data_path = os.path.abspath(os.path.join(self.test_dir, 'multi_source_1', 'datasetDoc.json')) + ds.save('file://' + data_path) + + self.assertTrue(os.path.exists(data_path)) + with open(data_path, 'r', encoding='utf') as data_file: + description = json.load(data_file) + + self.assertEqual( + description['dataResources'], + [ + { + 'resID': 'learningData', + 'isCollection': True, + 'resFormat': {'image/jpeg': ['jpg'], 'image/png': ['png'], 'text/csv': ['csv']}, + 'resType': 'raw', + 'resPath': 'files/', + } + ], + ) + + self.assertTrue(os.path.exists(os.path.join(self.test_dir, 'multi_source_1', 'files', 'complementaryData.csv'))) + self.assertTrue(os.path.exists(os.path.join(self.test_dir, 'multi_source_1', 'files', 'cifar10_bird_1.png'))) + self.assertTrue(os.path.exists(os.path.join(self.test_dir, 'multi_source_1', 'files', '001_HandPhoto_left_01.jpg'))) + + def test_csv_with_d3m_index(self): + dataset_path = os.path.abspath( + os.path.join(os.path.dirname(__file__), 'data', 'datasets', 'iris_dataset_1', 'tables', 'learningData.csv') + ) + + dataset_id = '219a5e7b-4499-4160-9b72-9cfa53c4924d' + dataset_name = 'Iris Dataset' + + ds = container.Dataset.load( + 
'file://{dataset_path}'.format(dataset_path=dataset_path), dataset_id=dataset_id, dataset_name=dataset_name + ) + + self._test_csv_with_d3m_index(ds, dataset_path, dataset_id, dataset_name) + + def _test_csv_with_d3m_index(self, ds, dataset_path, dataset_id, dataset_name): + ds.metadata.check(ds) + + for row in ds['learningData']: + for cell in row: + # Nothing should be parsed from a string. + self.assertIsInstance(cell, str) + + self.assertEqual(len(ds['learningData']), 150, dataset_name) + self.assertEqual(len(ds['learningData'].iloc[0]), 6, dataset_name) + + self.maxDiff = None + + self.assertEqual( + convert_metadata(ds.metadata.query(())), + { + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.dataset.Dataset', + 'id': dataset_id, + 'name': dataset_name, + 'stored_size': 4961, + 'location_uris': ['file://localhost{dataset_path}'.format(dataset_path=dataset_path)], + 'dimension': { + 'length': 1, + 'name': 'resources', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/DatasetResource'], + }, + 'digest': 'a5e827f2fb60639f1eb7b9bd3b849b0db9c308ba74d0479c20aaeaad77ccda48', + }, + dataset_name, + ) + + self.assertEqual( + convert_metadata(ds.metadata.query(('learningData',))), + { + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/Table', + 'https://metadata.datadrivendiscovery.org/types/DatasetEntryPoint', + ], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 150, + }, + }, + dataset_name, + ) + + self.assertEqual( + convert_metadata(ds.metadata.query(('learningData', metadata_base.ALL_ELEMENTS))), + { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 6, + } + }, + dataset_name, + ) + + self.assertEqual( + convert_metadata(ds.metadata.query(('learningData', metadata_base.ALL_ELEMENTS, 0))), + { + 'name': 'd3mIndex', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/UnknownType'], + 'structural_type': 'str', + }, + dataset_name, + ) + + for i in range(1, 6): + self.assertEqual( + convert_metadata(ds.metadata.query(('learningData', metadata_base.ALL_ELEMENTS, i))), + { + 'name': ['sepalLength', 'sepalWidth', 'petalLength', 'petalWidth', 'species'][i - 1], + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/UnknownType'], + 'structural_type': 'str', + }, + dataset_name, + ) + + def test_csv_lazy_with_d3m_index(self): + dataset_path = os.path.abspath( + os.path.join(os.path.dirname(__file__), 'data', 'datasets', 'iris_dataset_1', 'tables', 'learningData.csv') + ) + + dataset_id = '219a5e7b-4499-4160-9b72-9cfa53c4924d' + dataset_name = 'Iris Dataset' + + ds = container.Dataset.load( + 'file://{dataset_path}'.format(dataset_path=dataset_path), + dataset_id=dataset_id, + dataset_name=dataset_name, + lazy=True, + ) + + ds.metadata.check(ds) + + self.assertTrue(len(ds) == 0) + self.assertTrue(ds.is_lazy()) + + self.maxDiff = None + + self.assertEqual( + convert_metadata(ds.metadata.query(())), + { + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.dataset.Dataset', + 'id': dataset_id, + 'name': dataset_name, + 'location_uris': ['file://localhost{dataset_path}'.format(dataset_path=dataset_path)], + 'dimension': { + 'length': 0, + 'name': 'resources', + 'semantic_types': 
['https://metadata.datadrivendiscovery.org/types/DatasetResource'], + }, + }, + ) + + self.assertEqual(convert_metadata(ds.metadata.query(('learningData',))), {}) + self.assertEqual(convert_metadata(ds.metadata.query(('learningData', metadata_base.ALL_ELEMENTS))), {}) + self.assertEqual(convert_metadata(ds.metadata.query(('learningData', metadata_base.ALL_ELEMENTS, 0))), {}) + + ds.load_lazy() + + self.assertFalse(ds.is_lazy()) + + self._test_csv_with_d3m_index(ds, dataset_path, dataset_id, dataset_name) + + def test_sklearn(self): + for dataset_path in ['boston', 'breast_cancer', 'diabetes', 'digits', 'iris', 'linnerud']: + container.Dataset.load('sklearn://{dataset_path}'.format(dataset_path=dataset_path)) + + dataset_uri = 'sklearn://iris' + dataset_id = str(uuid.uuid3(uuid.NAMESPACE_URL, dataset_uri)) + dataset_name = 'Iris Dataset' + + ds = container.Dataset.load(dataset_uri, dataset_id=dataset_id, dataset_name=dataset_name) + + self._test_sklearn(ds, dataset_uri) + + def _test_sklearn(self, ds, dataset_uri): + ds.metadata.check(ds) + + self.assertEqual(len(ds['learningData']), 150, dataset_uri) + self.assertEqual(len(ds['learningData'].iloc[0]), 6, dataset_uri) + + self.maxDiff = None + + self.assertEqual( + convert_metadata(ds.metadata.query(())), + { + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.dataset.Dataset', + 'id': '44f6efaa-72e7-383e-9369-64bd7168fb26', + 'name': 'Iris Dataset', + 'location_uris': [dataset_uri], + 'description': datasets.load_iris()['DESCR'], + 'dimension': { + 'length': 1, + 'name': 'resources', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/DatasetResource'], + }, + 'digest': '2cd0dd490ba383fe08a9f89514f6688bb5cb77d4a7da140e9458e7c534eb82f4', + }, + dataset_uri, + ) + + self.assertEqual( + convert_metadata(ds.metadata.query(('learningData',))), + { + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/Table', + 'https://metadata.datadrivendiscovery.org/types/DatasetEntryPoint', + ], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 150, + }, + }, + dataset_uri, + ) + + self.assertEqual( + convert_metadata(ds.metadata.query(('learningData', metadata_base.ALL_ELEMENTS))), + { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 6, + } + }, + dataset_uri, + ) + + self.assertEqual( + convert_metadata(ds.metadata.query(('learningData', metadata_base.ALL_ELEMENTS, 0))), + { + 'name': 'd3mIndex', + 'structural_type': 'numpy.int64', + 'semantic_types': [ + 'http://schema.org/Integer', + 'https://metadata.datadrivendiscovery.org/types/PrimaryKey', + ], + }, + dataset_uri, + ) + + for i in range(1, 5): + self.assertEqual( + convert_metadata(ds.metadata.query(('learningData', metadata_base.ALL_ELEMENTS, i))), + { + 'name': ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)'][i - 1], + 'structural_type': 'numpy.float64', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/UnknownType', + 'https://metadata.datadrivendiscovery.org/types/Attribute', + ], + }, + dataset_uri, + ) + + self.assertEqual( + convert_metadata(ds.metadata.query(('learningData', metadata_base.ALL_ELEMENTS, 5))), + { + 'name': 'column 4', + 'structural_type': 'str', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/CategoricalData', 
+ 'https://metadata.datadrivendiscovery.org/types/SuggestedTarget', + 'https://metadata.datadrivendiscovery.org/types/Attribute', + ], + }, + dataset_uri, + ) + + @unittest.skip("requires rewrite") + # TODO: Fix. Currently "generate_metadata" is not called when not loading lazily. + # We should just always use auto generation for as much as possible. + # Or not, to make sure things are speedy? + def test_sklearn_lazy(self): + for dataset_path in ['boston', 'breast_cancer', 'diabetes', 'digits', 'iris', 'linnerud']: + container.Dataset.load('sklearn://{dataset_path}'.format(dataset_path=dataset_path)) + + dataset_uri = 'sklearn://iris' + dataset_id = str(uuid.uuid3(uuid.NAMESPACE_URL, dataset_uri)) + dataset_name = 'Iris Dataset' + + ds = container.Dataset.load(dataset_uri, dataset_id=dataset_id, dataset_name=dataset_name, lazy=True) + + ds.metadata.check(ds) + + self.assertTrue(len(ds) == 0) + self.assertTrue(ds.is_lazy()) + + self.maxDiff = None + + self.assertEqual( + convert_metadata(ds.metadata.query(())), + { + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.dataset.Dataset', + 'id': '44f6efaa-72e7-383e-9369-64bd7168fb26', + 'name': 'Iris Dataset', + 'location_uris': [dataset_uri], + 'dimension': { + 'length': 0, + 'name': 'resources', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/DatasetResource'], + }, + }, + ) + + self.assertEqual(convert_metadata(ds.metadata.query(('learningData',))), {}) + self.assertEqual(convert_metadata(ds.metadata.query(('learningData', metadata_base.ALL_ELEMENTS))), {}) + self.assertEqual(convert_metadata(ds.metadata.query(('learningData', metadata_base.ALL_ELEMENTS, 0))), {}) + + ds.load_lazy() + + self.assertFalse(ds.is_lazy()) + + self._test_sklearn(ds, dataset_uri) + + def test_multi_table(self): + dataset_doc_path = os.path.abspath( + os.path.join(os.path.dirname(__file__), 'data', 'datasets', 'database_dataset_1', 'datasetDoc.json') + ) + + container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path)) + + def test_timeseries(self): + dataset_doc_path = os.path.abspath( + os.path.join(os.path.dirname(__file__), 'data', 'datasets', 'timeseries_dataset_1', 'datasetDoc.json') + ) + + container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path)) + + def test_audio(self): + dataset_doc_path = os.path.abspath( + os.path.join(os.path.dirname(__file__), 'data', 'datasets', 'audio_dataset_1', 'datasetDoc.json') + ) + + ds = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path)) + + self.maxDiff = None + + self.assertEqual( + convert_metadata(ds.metadata.query(())), + { + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.dataset.Dataset', + 'id': 'audio_dataset_1', + 'version': '4.0.0', + 'name': 'Audio dataset to be used for tests', + 'location_uris': ['file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path)], + 'source': {'license': 'CC0', 'redacted': False}, + 'dimension': { + 'length': 2, + 'name': 'resources', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/DatasetResource'], + }, + 'digest': '4eaa4ee8ce18dc066d400d756105aab1ce92895593d09c8be23e08fdd89640e1', + }, + ) + + self.assertEqual( + convert_metadata(ds.metadata.query(('0',))), + { + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/Table', + 
'https://metadata.datadrivendiscovery.org/types/FilesCollection', + ], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 1, + }, + }, + ) + + self.assertEqual( + convert_metadata(ds.metadata.query(('0', metadata_base.ALL_ELEMENTS))), + { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 1, + } + }, + ) + + self.assertEqual( + convert_metadata(ds.metadata.query(('learningData',))), + { + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/Table', + 'https://metadata.datadrivendiscovery.org/types/DatasetEntryPoint', + ], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 1, + }, + }, + ) + + self.assertEqual( + convert_metadata(ds.metadata.query(('learningData', metadata_base.ALL_ELEMENTS))), + { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 5, + } + }, + ) + + self.assertEqual( + convert_metadata(ds.metadata.query(('learningData', metadata_base.ALL_ELEMENTS, 0))), + { + 'name': 'd3mIndex', + 'structural_type': 'str', + 'semantic_types': [ + 'http://schema.org/Integer', + 'https://metadata.datadrivendiscovery.org/types/PrimaryKey', + ], + }, + ) + + self.assertEqual( + convert_metadata(ds.metadata.query(('learningData', metadata_base.ALL_ELEMENTS, 1))), + { + 'name': 'audio_file', + 'structural_type': 'str', + 'semantic_types': [ + 'http://schema.org/Text', + 'https://metadata.datadrivendiscovery.org/types/Attribute', + ], + 'foreign_key': {'type': 'COLUMN', 'resource_id': '0', 'column_index': 0}, + }, + ) + + self.assertEqual( + convert_metadata(ds.metadata.query(('learningData', metadata_base.ALL_ELEMENTS, 2))), + { + 'name': 'start', + 'structural_type': 'str', + 'semantic_types': [ + 'http://schema.org/Float', + 'https://metadata.datadrivendiscovery.org/types/Boundary', + 'https://metadata.datadrivendiscovery.org/types/IntervalStart', + ], + 'boundary_for': {'resource_id': 'learningData', 'column_index': 1}, + }, + ) + + self.assertEqual( + convert_metadata(ds.metadata.query(('learningData', metadata_base.ALL_ELEMENTS, 3))), + { + 'name': 'end', + 'structural_type': 'str', + 'semantic_types': [ + 'http://schema.org/Float', + 'https://metadata.datadrivendiscovery.org/types/Boundary', + 'https://metadata.datadrivendiscovery.org/types/IntervalEnd', + ], + 'boundary_for': {'resource_id': 'learningData', 'column_index': 1}, + }, + ) + + self.assertEqual( + convert_metadata(ds.metadata.query(('learningData', metadata_base.ALL_ELEMENTS, 4))), + { + 'name': 'class', + 'structural_type': 'str', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/CategoricalData', + 'https://metadata.datadrivendiscovery.org/types/SuggestedTarget', + 'https://metadata.datadrivendiscovery.org/types/Attribute', + ], + }, + ) + + def test_raw(self): + dataset_doc_path = os.path.abspath( + os.path.join(os.path.dirname(__file__), 'data', 'datasets', 'raw_dataset_1', 'datasetDoc.json') + ) + + ds = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path)) + + self.assertEqual( + convert_metadata(ds.metadata.query(())), + { + 'schema': 'https://metadata.datadrivendiscovery.org/schemas/v0/container.json', + 'structural_type': 'd3m.container.dataset.Dataset', + 'id': 
'raw_dataset_1', + 'name': 'Raw dataset to be used for tests', + 'location_uris': ['file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path)], + 'dimension': { + 'name': 'resources', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/DatasetResource'], + 'length': 1, + }, + 'digest': 'e28468d602c30c7da7643aa78840bcaae68a9abb96b48cc98eb51fb94e6fd3af', + 'source': {'redacted': False}, + 'version': '4.0.0', + }, + ) + self.assertEqual( + convert_metadata(ds.metadata.query(('0', metadata_base.ALL_ELEMENTS, 0))), + { + 'location_base_uris': [ + 'file://{dataset_path}/raw/'.format(dataset_path=os.path.dirname(dataset_doc_path)) + ], + 'media_types': ['text/csv'], + 'name': 'filename', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/PrimaryKey', + 'https://metadata.datadrivendiscovery.org/types/FileName', + 'https://metadata.datadrivendiscovery.org/types/UnspecifiedStructure', + ], + 'structural_type': 'str', + }, + ) + + def test_select_rows(self): + dataset_doc_path = os.path.abspath( + os.path.join(os.path.dirname(__file__), 'data', 'datasets', 'iris_dataset_1', 'datasetDoc.json') + ) + ds = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path)) + + # add metadata for rows 0, 1, 2 + ds.metadata = ds.metadata.update(('learningData', 0), {'a': 0}) + ds.metadata = ds.metadata.update(('learningData', 1), {'b': 1}) + ds.metadata = ds.metadata.update(('learningData', 2), {'c': 2}) + + cut_dataset = ds.select_rows({'learningData': [0, 2]}) + + # verify that rows are removed from dataframe and re-indexed + self.assertListEqual([0, 1], list(cut_dataset['learningData'].index)) + self.assertListEqual(['0', '2'], list(cut_dataset['learningData'].d3mIndex)) + + # verify that metadata is removed and re-indexed + self.assertEqual(cut_dataset.metadata.query(('learningData', 0))['a'], 0) + self.assertEqual(cut_dataset.metadata.query(('learningData', 1))['c'], 2) + + def test_score_workaround(self): + dataset_doc_path = os.path.abspath( + os.path.join( + os.path.dirname(__file__), 'data', 'datasets', 'score_dataset_1', 'dataset_TEST', 'datasetDoc.json' + ) + ) + ds = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path)) + + self.assertEqual(ds.metadata.query_field((), 'id'), 'object_dataset_1_SCORE') + + self.assertEqual( + ds['learningData'].values.tolist(), + [ + ['0', 'img_00285.png', 'red', '480,457,480,529,515,529,515,457'], + ['0', 'img_00285.png', 'black', '10,117,10,329,105,329,105,117'], + ['1', 'img_00225.png', 'blue', '422,540,422,660,576,660,576,540'], + ['1', 'img_00225.png', 'red', '739,460,739,545,768,545,768,460'], + ], + ) + + def test_csv_without_d3m_index(self): + dataset_path = os.path.abspath( + os.path.join(os.path.dirname(__file__), 'data', 'datasets', 'database_dataset_1', 'tables', 'values.csv') + ) + + dataset_id = '7cd469db-e922-4418-84a9-cda9517251d1' + dataset_name = 'Database Dataset' + + ds = container.Dataset.load( + 'file://{dataset_path}'.format(dataset_path=dataset_path), dataset_id=dataset_id, dataset_name=dataset_name + ) + + self._test_csv_without_d3m_index(ds, dataset_path, dataset_id, dataset_name) + + def _test_csv_without_d3m_index(self, ds, dataset_path, dataset_id, dataset_name): + ds.metadata.check(ds) + + for row in ds['learningData']: + for cell in row: + # Nothing should be parsed from a string. 
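+ # (The CSV loader keeps raw `str` cell values; converting them into typed values is left to later column-parsing steps.)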
+ self.assertIsInstance(cell, str) + + self.assertEqual(len(ds['learningData']), 64, dataset_name) + self.assertEqual(len(ds['learningData'].iloc[0]), 5, dataset_name) + + self.maxDiff = None + + self.assertEqual( + convert_metadata(ds.metadata.query(())), + { + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.dataset.Dataset', + 'id': dataset_id, + 'name': dataset_name, + 'stored_size': 1794, + 'location_uris': ['file://localhost{dataset_path}'.format(dataset_path=dataset_path)], + 'dimension': { + 'length': 1, + 'name': 'resources', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/DatasetResource'], + }, + 'digest': 'b22431ee93c7b5fd6405c813bc67bfe6b2e1718eb6080cc50ff90ef6b2812139', + }, + dataset_name, + ) + + self.assertEqual( + convert_metadata(ds.metadata.query(('learningData',))), + { + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/Table', + 'https://metadata.datadrivendiscovery.org/types/DatasetEntryPoint', + ], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 64, + }, + }, + dataset_name, + ) + + self.assertEqual( + convert_metadata(ds.metadata.query(('learningData', metadata_base.ALL_ELEMENTS))), + { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 5, + } + }, + dataset_name, + ) + + self.assertEqual( + convert_metadata(ds.metadata.query(('learningData', metadata_base.ALL_ELEMENTS, 0))), + { + 'name': 'd3mIndex', + 'semantic_types': [ + 'http://schema.org/Integer', + 'https://metadata.datadrivendiscovery.org/types/PrimaryKey', + ], + 'structural_type': 'numpy.int64', + }, + dataset_name, + ) + + for i in range(1, 5): + self.assertEqual( + convert_metadata(ds.metadata.query(('learningData', metadata_base.ALL_ELEMENTS, i))), + { + 'name': ['code', 'key', 'year', 'value'][i - 1], + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/UnknownType'], + 'structural_type': 'str', + }, + dataset_name, + ) + + def test_csv_lazy_without_d3m_index(self): + dataset_path = os.path.abspath( + os.path.join(os.path.dirname(__file__), 'data', 'datasets', 'database_dataset_1', 'tables', 'values.csv') + ) + + dataset_id = '7cd469db-e922-4418-84a9-cda9517251d1' + dataset_name = 'Database Dataset' + + ds = container.Dataset.load( + 'file://{dataset_path}'.format(dataset_path=dataset_path), + dataset_id=dataset_id, + dataset_name=dataset_name, + lazy=True, + ) + + ds.metadata.check(ds) + + self.assertTrue(len(ds) == 0) + self.assertTrue(ds.is_lazy()) + + self.maxDiff = None + + self.assertEqual( + convert_metadata(ds.metadata.query(())), + { + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.dataset.Dataset', + 'id': dataset_id, + 'name': dataset_name, + 'location_uris': ['file://localhost{dataset_path}'.format(dataset_path=dataset_path)], + 'dimension': { + 'length': 0, + 'name': 'resources', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/DatasetResource'], + }, + }, + ) + + self.assertEqual(convert_metadata(ds.metadata.query(('learningData',))), {}) + self.assertEqual(convert_metadata(ds.metadata.query(('learningData', metadata_base.ALL_ELEMENTS))), {}) + self.assertEqual(convert_metadata(ds.metadata.query(('learningData', metadata_base.ALL_ELEMENTS, 0))), {}) + + ds.load_lazy() + + self.assertFalse(ds.is_lazy()) + + 
self._test_csv_without_d3m_index(ds, dataset_path, dataset_id, dataset_name) + + def test_openml(self): + self.maxDiff = None + + # TODO: Try also with 1414. Do we have to convert date columns to strings or something? + # See: https://github.com/openml/openml-data/issues/23 + for dataset_id in [8, 17, 61, 42, 46, 373, 41496]: + dataset_uri = 'https://www.openml.org/d/{dataset_id}'.format(dataset_id=dataset_id) + output_dataset_uri = 'file://{dataset_path}'.format( + dataset_path=os.path.join(self.test_dir, str(dataset_id), 'datasetDoc.json') + ) + + ds_1 = dataset.Dataset.load(dataset_uri=dataset_uri, dataset_id=str(dataset_id)) + ds_1.save(dataset_uri=output_dataset_uri) + ds_2 = dataset.Dataset.load(dataset_uri=output_dataset_uri) + + self._test_openml_compare_loaded(ds_1, ds_2) + + def _test_openml_compare_loaded(self, ds_1, ds_2): + keys_to_remove = ['digest', 'location_uris'] + for metadata_key in keys_to_remove: + ds_1.metadata = ds_1.metadata.update((), {metadata_key: metadata_base.NO_VALUE}) + ds_2.metadata = ds_2.metadata.update((), {metadata_key: metadata_base.NO_VALUE}) + + for resource_id in ds_1: + # Additional metadata added when saving. + ds_1.metadata = ds_1.metadata.update( + (resource_id, metadata_base.ALL_ELEMENTS, metadata_base.ALL_ELEMENTS), {'structural_type': str} + ) + + # When saving, we convert all columns to string type. + for column_index in range( + ds_1.metadata.query((resource_id, metadata_base.ALL_ELEMENTS))['dimension']['length'] + ): + ds_1.metadata = ds_1.metadata.update( + (resource_id, metadata_base.ALL_ELEMENTS, column_index), {'structural_type': str} + ) + + self.assertEqual(ds_1.metadata.to_internal_json_structure(), ds_2.metadata.to_internal_json_structure()) + + def test_openml_nonlazy(self): + dataset_id = 61 + dataset_name = 'iris' + dataset_uri = 'https://www.openml.org/d/{dataset_id}'.format(dataset_id=dataset_id) + ds = dataset.Dataset.load(dataset_uri, dataset_id=str(dataset_id), dataset_name=dataset_name) + + self._openml_check(ds, dataset_uri) + + def _openml_check_top_metadata(self, ds, dataset_uri, resources): + self.assertEqual( + convert_metadata(ds.metadata.query(())), + { + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.dataset.Dataset', + 'id': '61', + 'name': 'iris', + 'location_uris': [dataset_uri], + 'description': """**Author**: R.A. Fisher +**Source**: [UCI](https://archive.ics.uci.edu/ml/datasets/Iris) - 1936 - Donated by Michael Marshall +**Please cite**: + +**Iris Plants Database** +This is perhaps the best known database to be found in the pattern recognition literature. Fisher's paper is a classic in the field and is referenced frequently to this day. (See Duda & Hart, for example.) The data set contains 3 classes of 50 instances each, where each class refers to a type of iris plant. One class is linearly separable from the other 2; the latter are NOT linearly separable from each other. + +Predicted attribute: class of iris plant. +This is an exceedingly simple domain. + +### Attribute Information: + 1. sepal length in cm + 2. sepal width in cm + 3. petal length in cm + 4. petal width in cm + 5. 
class: + -- Iris Setosa + -- Iris Versicolour + -- Iris Virginica""", + 'dimension': { + 'length': resources, + 'name': 'resources', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/DatasetResource'], + }, + 'keywords': [ + 'study_1', + 'study_25', + 'study_4', + 'study_41', + 'study_50', + 'study_52', + 'study_7', + 'study_86', + 'study_88', + 'study_89', + 'uci', + ], + 'source': { + 'license': 'Public', + 'name': 'R.A. Fisher', + 'published': '1936-01-01T00:00:00Z', + 'uris': [ + 'https://www.openml.org/d/61', + 'https://archive.ics.uci.edu/ml/datasets/Iris', + 'http://digital.library.adelaide.edu.au/dspace/handle/2440/15227', + ], + }, + 'version': '1', + 'digest': '3b516a917d2f91d898be96391761e9e4aa7c4817bd45c2a89aace3fd6cc88d10', + 'data_metafeatures': { + 'dimensionality': float(1 / 30), + 'kurtosis_of_attributes': { + 'max': 0.2907810623654319, + 'mean': -0.7507394876837399, + 'median': -0.9459091062274964, + 'min': -1.401920800645399, + 'quartile_1': -1.3863791432688857, + 'quartile_3': 0.08006978644516216, + }, + 'mean_of_attributes': { + 'max': 5.843333333333334, + 'mean': 3.4636666666666667, + 'median': 3.406333333333333, + 'min': 1.1986666666666665, + 'quartile_1': 1.6624999999999999, + 'quartile_3': 5.322166666666667, + }, + 'number_distinct_values_of_categorical_attributes': { + 'max': 3.0, + 'min': 3.0, + 'mean': 3.0, + 'std': 0.0, + }, + 'number_of_attributes': 5, + 'number_of_binary_attributes': 0, + 'number_of_categorical_attributes': 1, + 'number_of_instances': 150, + 'number_of_instances_with_missing_values': 0, + 'number_of_missing_values': 0, + 'number_of_numeric_attributes': 4, + 'ratio_of_binary_attributes': 0.0, + 'ratio_of_categorical_attributes': 20.0, + 'ratio_of_instances_with_missing_values': 0.0, + 'ratio_of_missing_values': 0.0, + 'ratio_of_numeric_attributes': 80.0, + 'skew_of_attributes': { + 'max': 0.33405266217208907, + 'mean': 0.067375701047788, + 'median': 0.10495719724642329, + 'min': -0.2744642524737837, + 'quartile_1': -0.2320973298913695, + 'quartile_3': 0.32926723578831013, + }, + 'standard_deviation_of_attributes': { + 'max': 1.7644204199522626, + 'mean': 0.9473104002482851, + 'median': 0.795613434839352, + 'min': 0.43359431136217386, + 'quartile_1': 0.5159859189468406, + 'quartile_3': 1.5303318469586626, + }, + + + }, + }, + ) + + def _openml_check(self, ds, dataset_uri): + self.maxDiff = None + + ds.metadata.check(ds) + + self.assertEqual(len(ds['learningData']), 150) + self.assertEqual(len(ds['learningData'].iloc[0]), 6) + self.assertEqual(ds['learningData'].iloc[0, 5], 'Iris-setosa') + self.assertEqual(ds['learningData'].dtypes[5], numpy.object) + + self._openml_check_top_metadata(ds, dataset_uri, 1) + + self.assertEqual( + convert_metadata(ds.metadata.query(('learningData',))), + { + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/Table', + 'https://metadata.datadrivendiscovery.org/types/DatasetEntryPoint', + ], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 150, + }, + }, + ) + + self.assertEqual( + convert_metadata(ds.metadata.query(('learningData', metadata_base.ALL_ELEMENTS))), + { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 6, + } + }, + ) + + self.assertEqual( + convert_metadata(ds.metadata.query(('learningData', metadata_base.ALL_ELEMENTS, 0))), + { + 'name': 
'd3mIndex', + 'structural_type': 'int', + 'semantic_types': [ + 'http://schema.org/Integer', + 'https://metadata.datadrivendiscovery.org/types/PrimaryKey', + ], + }, + ) + + for i in range(1, 5): + self.assertEqual( + convert_metadata(ds.metadata.query(('learningData', metadata_base.ALL_ELEMENTS, i))), + { + 'name': ['sepallength', 'sepalwidth', 'petallength', 'petalwidth'][i - 1], + 'structural_type': 'float', + 'semantic_types': [ + 'http://schema.org/Float', + 'https://metadata.datadrivendiscovery.org/types/Attribute', + ], + }, + ) + + self.assertEqual( + convert_metadata(ds.metadata.query(('learningData', metadata_base.ALL_ELEMENTS, 5))), + { + 'name': 'class', + 'structural_type': 'str', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/CategoricalData', + 'https://metadata.datadrivendiscovery.org/types/SuggestedTarget', + 'https://metadata.datadrivendiscovery.org/types/Attribute', + ], + }, + ) + + def test_openml_lazy(self): + self.maxDiff = None + + dataset_id = 61 + dataset_name = 'iris' + dataset_uri = 'https://www.openml.org/d/{dataset_id}'.format(dataset_id=dataset_id) + ds = dataset.Dataset.load(dataset_uri, dataset_id=str(dataset_id), dataset_name=dataset_name, lazy=True) + + ds.metadata.check(ds) + + self.assertEqual(len(ds), 0) + self.assertTrue(ds.is_lazy()) + + self._openml_check_top_metadata(ds, dataset_uri, 0) + + self.assertEqual(convert_metadata(ds.metadata.query(('learningData',))), {}) + self.assertEqual(convert_metadata(ds.metadata.query(('learningData', metadata_base.ALL_ELEMENTS))), {}) + self.assertEqual(convert_metadata(ds.metadata.query(('learningData', metadata_base.ALL_ELEMENTS, 0))), {}) + + ds.load_lazy() + + self.assertEqual(len(ds), 1) + self.assertFalse(ds.is_lazy()) + + self._openml_check(ds, dataset_uri) + + +if __name__ == '__main__': + unittest.main() diff --git a/d3m/tests/test_file_reader.py b/d3m/tests/test_file_reader.py new file mode 100644 index 0000000..c5ff738 --- /dev/null +++ b/d3m/tests/test_file_reader.py @@ -0,0 +1,171 @@ +import unittest +import os.path +import sys + +COMMON_PRIMITIVES_DIR = os.path.join(os.path.dirname(__file__), 'common-primitives') +# NOTE: This insertion should appear before any code attempting to resolve or load primitives, +# so the git submodule version of `common-primitives` is looked at first. 
+sys.path.insert(0, COMMON_PRIMITIVES_DIR) + +TEST_PRIMITIVES_DIR = os.path.join(os.path.dirname(__file__), 'data', 'primitives') +sys.path.insert(0, TEST_PRIMITIVES_DIR) + +from common_primitives.dataset_to_dataframe import DatasetToDataFramePrimitive + +from test_primitives.file_reader import DummyImageReaderPrimitive + +from d3m import container, utils + + +class TestDummyImageReaderPrimitive(unittest.TestCase): + def test_basic(self): + dataset_doc_path = os.path.abspath( + os.path.join(os.path.dirname(__file__), 'data', 'datasets', 'image_dataset_1', 'datasetDoc.json') + ) + + dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path)) + + dataframe_hyperparams_class = DatasetToDataFramePrimitive.metadata.get_hyperparams() + dataframe_primitive = DatasetToDataFramePrimitive( + hyperparams=dataframe_hyperparams_class.defaults().replace({'dataframe_resource': '0'}) + ) + dataframe = dataframe_primitive.produce(inputs=dataset).value + + image_hyperparams_class = DummyImageReaderPrimitive.metadata.get_hyperparams() + image_primitive = DummyImageReaderPrimitive( + hyperparams=image_hyperparams_class.defaults().replace({'return_result': 'replace'}) + ) + images_names = image_primitive.produce(inputs=dataframe).value + + self.assertEqual(images_names.iloc[0]['filename'][0], '001_HandPhoto_left_01.jpg') + self.assertEqual(images_names.iloc[1]['filename'][0], 'cifar10_bird_1.png') + self.assertEqual(images_names.iloc[2]['filename'][0], 'cifar10_bird_2.png') + self.assertEqual(images_names.iloc[3]['filename'][0], 'mnist_0_2.png') + self.assertEqual(images_names.iloc[4]['filename'][0], 'mnist_1_1.png') + + self._test_metadata(images_names.metadata) + + def _test_metadata(self, metadata): + self.assertEqual( + utils.to_json_structure(metadata.to_internal_simple_structure()), + [ + { + 'metadata': { + 'dimension': { + 'length': 5, + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + }, + 'schema': 'https://metadata.datadrivendiscovery.org/schemas/v0/container.json', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/Table', + 'https://metadata.datadrivendiscovery.org/types/FilesCollection', + ], + 'structural_type': 'd3m.container.pandas.DataFrame', + }, + 'selector': [], + }, + { + 'metadata': { + 'dimension': { + 'length': 1, + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + } + }, + 'selector': ['__ALL_ELEMENTS__'], + }, + { + 'metadata': { + 'dimension': { + 'length': 1, + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + }, + 'location_base_uris': '__NO_VALUE__', + 'media_types': '__NO_VALUE__', + 'name': 'filename', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/PrimaryKey', + 'http://schema.org/ImageObject', + 'https://metadata.datadrivendiscovery.org/types/Table', + ], + 'structural_type': 'd3m.container.numpy.ndarray', + }, + 'selector': ['__ALL_ELEMENTS__', 0], + }, + { + 'metadata': { + 'dimension': { + 'length': 1, + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + } + }, + 'selector': ['__ALL_ELEMENTS__', 0, '__ALL_ELEMENTS__'], + }, + { + 'metadata': {'structural_type': 'str'}, + 'selector': ['__ALL_ELEMENTS__', 0, '__ALL_ELEMENTS__', '__ALL_ELEMENTS__'], + }, + { + 'metadata': { + 'image_reader_metadata': {'foobar': 42}, + 'semantic_types': [ + 
'https://metadata.datadrivendiscovery.org/types/PrimaryKey', + 'http://schema.org/ImageObject', + 'https://metadata.datadrivendiscovery.org/types/Table', + ], + }, + 'selector': [0, 0], + }, + { + 'metadata': { + 'image_reader_metadata': {'foobar': 42}, + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/PrimaryKey', + 'http://schema.org/ImageObject', + 'https://metadata.datadrivendiscovery.org/types/Table', + ], + }, + 'selector': [1, 0], + }, + { + 'metadata': { + 'image_reader_metadata': {'foobar': 42}, + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/PrimaryKey', + 'http://schema.org/ImageObject', + 'https://metadata.datadrivendiscovery.org/types/Table', + ], + }, + 'selector': [2, 0], + }, + { + 'metadata': { + 'image_reader_metadata': {'foobar': 42}, + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/PrimaryKey', + 'http://schema.org/ImageObject', + 'https://metadata.datadrivendiscovery.org/types/Table', + ], + }, + 'selector': [3, 0], + }, + { + 'metadata': { + 'image_reader_metadata': {'foobar': 42}, + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/PrimaryKey', + 'http://schema.org/ImageObject', + 'https://metadata.datadrivendiscovery.org/types/Table', + ], + }, + 'selector': [4, 0], + }, + ], + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/d3m/tests/test_hyperparams.py b/d3m/tests/test_hyperparams.py new file mode 100644 index 0000000..18b9233 --- /dev/null +++ b/d3m/tests/test_hyperparams.py @@ -0,0 +1,1795 @@ +import json +import logging +import os +import typing +import pickle +import subprocess +import sys +import unittest +from collections import OrderedDict + +import frozendict +import numpy +from sklearn.utils import validation as sklearn_validation + +from d3m import container, exceptions, index, utils +from d3m.metadata import base as metadata_base, hyperparams +from d3m.primitive_interfaces import base, transformer + +TEST_PRIMITIVES_DIR = os.path.join(os.path.dirname(__file__), 'data', 'primitives') + +sys.path.insert(0, TEST_PRIMITIVES_DIR) + +from test_primitives.monomial import MonomialPrimitive +from test_primitives.random import RandomPrimitive +from test_primitives.sum import SumPrimitive +from test_primitives.increment import IncrementPrimitive + + +# It's defined at global scope so it can be pickled. 
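+# (pickle serializes a class by reference to its module-qualified name, so a class defined inside a function or method could not be pickled.)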
+class TestPicklingHyperparams(hyperparams.Hyperparams): + choice = hyperparams.Choice( + choices={ + 'alpha': hyperparams.Hyperparams.define(OrderedDict( + value=hyperparams.Union( + OrderedDict( + float=hyperparams.Hyperparameter[float](0), + int=hyperparams.Hyperparameter[int](0) + ), + default='float' + ), + )) + }, + default='alpha', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + +class TestHyperparams(unittest.TestCase): + def test_hyperparameter(self): + hyperparameter = hyperparams.Hyperparameter[str]('nothing') + + self.assertEqual(hyperparameter.get_default(), 'nothing') + with self.assertLogs(hyperparams.logger) as cm: + self.assertEqual(hyperparameter.sample(42), 'nothing') + self.assertEqual(len(cm.records), 1) + with self.assertLogs(hyperparams.logger) as cm: + self.assertEqual(hyperparameter.sample_multiple(0, 1, 42), ('nothing',)) + self.assertEqual(len(cm.records), 1) + + with self.assertLogs(hyperparams.logger) as cm: + self.assertEqual(hyperparameter.sample_multiple(0, 0, 42), ()) + self.assertEqual(len(cm.records), 1) + + self.assertEqual(hyperparameter.to_simple_structure(), { + 'default': 'nothing', + 'semantic_types': [], + 'structural_type': str, + 'type': hyperparams.Hyperparameter, + }) + + self.assertEqual(hyperparameter.value_to_json_structure(hyperparameter.get_default()), 'nothing') + with self.assertLogs(hyperparams.logger) as cm: + self.assertEqual(hyperparameter.value_to_json_structure(hyperparameter.sample(42)), 'nothing') + self.assertEqual(len(cm.records), 1) + + self.assertEqual(hyperparameter.value_from_json_structure(hyperparameter.value_to_json_structure(hyperparameter.get_default())), hyperparameter.get_default()) + with self.assertLogs(hyperparams.logger) as cm: + self.assertEqual(hyperparameter.value_from_json_structure(hyperparameter.value_to_json_structure(hyperparameter.sample(42))), hyperparameter.sample(42)) + self.assertEqual(len(cm.records), 1) + + with self.assertRaisesRegex(TypeError, 'Value \'.*\' is not an instance of the structural type'): + hyperparams.Hyperparameter[int]('nothing') + + with self.assertRaisesRegex(ValueError, '\'max_samples\' cannot be larger than'): + hyperparameter.sample_multiple(0, 2, 42) + + def test_constant(self): + hyperparameter = hyperparams.Constant(12345) + + self.assertEqual(hyperparameter.get_default(), 12345) + self.assertEqual(hyperparameter.sample(), 12345) + self.assertEqual(hyperparameter.sample_multiple(0, 1, 42), (12345,)) + + self.assertEqual(hyperparameter.sample_multiple(0, 0, 42), ()) + + self.assertEqual(hyperparameter.to_simple_structure(), { + 'default': 12345, + 'semantic_types': [], + 'structural_type': int, + 'type': hyperparams.Constant, + }) + + self.assertEqual(hyperparameter.value_to_json_structure(hyperparameter.get_default()), 12345) + self.assertEqual(hyperparameter.value_to_json_structure(hyperparameter.sample(42)), 12345) + self.assertEqual(hyperparameter.value_from_json_structure(hyperparameter.value_to_json_structure(hyperparameter.get_default())), hyperparameter.get_default()) + self.assertEqual(hyperparameter.value_from_json_structure(hyperparameter.value_to_json_structure(hyperparameter.sample(42))), hyperparameter.sample(42)) + + with self.assertRaisesRegex(TypeError, 'Value \'.*\' is not an instance of the structural type'): + hyperparams.Hyperparameter[int]('different') + + with self.assertRaisesRegex(ValueError, 'Value \'.*\' is not the constant default value'): + hyperparameter.validate(54321) + + with 
self.assertRaisesRegex(ValueError, '\'max_samples\' cannot be larger than'): + self.assertEqual(hyperparameter.sample_multiple(0, 2, 42), {12345}) + + hyperparameter = hyperparams.Constant('constant') + + with self.assertRaisesRegex(ValueError, 'Value \'.*\' is not the constant default value'): + hyperparameter.validate('different') + + def test_bounded(self): + hyperparameter = hyperparams.Bounded[float](0.0, 1.0, 0.2) + + self.assertEqual(hyperparameter.get_default(), 0.2) + with self.assertLogs(hyperparams.logger) as cm: + self.assertEqual(hyperparameter.sample(42), 0.37454011884736255) + self.assertEqual(len(cm.records), 1) + with self.assertLogs(hyperparams.logger) as cm: + self.assertEqual(hyperparameter.sample_multiple(0, 1, 7), (0.22733907982646523,)) + self.assertEqual(len(cm.records), 1) + + self.assertEqual(hyperparameter.sample_multiple(0, 0, 42), ()) + + self.assertEqual(hyperparameter.to_simple_structure(), { + 'default': 0.2, + 'semantic_types': [], + 'structural_type': float, + 'type': hyperparams.Bounded, + 'lower': 0.0, + 'upper': 1.0, + 'lower_inclusive': True, + 'upper_inclusive': True, + }) + + self.assertEqual(hyperparameter.value_to_json_structure(hyperparameter.get_default()), 0.2) + with self.assertLogs(hyperparams.logger) as cm: + self.assertEqual(hyperparameter.value_to_json_structure(hyperparameter.sample(42)), 0.37454011884736255) + self.assertEqual(len(cm.records), 1) + + self.assertEqual(hyperparameter.value_from_json_structure(hyperparameter.value_to_json_structure(hyperparameter.get_default())), hyperparameter.get_default()) + with self.assertLogs(hyperparams.logger) as cm: + self.assertEqual(hyperparameter.value_from_json_structure(hyperparameter.value_to_json_structure(hyperparameter.sample(42))), hyperparameter.sample(42)) + self.assertEqual(len(cm.records), 1) + + with self.assertRaisesRegex(TypeError, 'Value \'.*\' is not an instance of the structural type'): + hyperparams.Bounded[str]('lower', 'upper', 0.2) + + with self.assertRaisesRegex(TypeError, 'Lower bound \'.*\' is not an instance of the structural type'): + hyperparams.Bounded[str](0.0, 'upper', 'default') + + with self.assertRaisesRegex(TypeError, 'Upper bound \'.*\' is not an instance of the structural type'): + hyperparams.Bounded[str]('lower', 1.0, 'default') + + with self.assertRaisesRegex(ValueError, 'Value \'.*\' is outside of range'): + hyperparams.Bounded[str]('lower', 'upper', 'default') + + with self.assertRaisesRegex(ValueError, 'Value \'.*\' is outside of range'): + hyperparams.Bounded[float](0.0, 1.0, 1.2) + + hyperparams.Bounded[typing.Optional[float]](0.0, None, 0.2) + hyperparams.Bounded[typing.Optional[float]](None, 1.0, 0.2) + + with self.assertRaisesRegex(ValueError, 'Lower and upper bounds cannot both be None'): + hyperparams.Bounded[typing.Optional[float]](None, None, 0.2) + + with self.assertRaisesRegex(TypeError, 'Value \'.*\' is not an instance of the structural type'): + hyperparams.Bounded[float](0.0, 1.0, None) + + with self.assertRaises(TypeError): + hyperparams.Bounded[typing.Optional[float]](0.0, 1.0, None) + + hyperparams.Bounded[typing.Optional[float]](None, 1.0, None) + hyperparams.Bounded[typing.Optional[float]](0.0, None, None) + + hyperparameter = hyperparams.Bounded[float](0.0, None, 0.2) + + with self.assertRaisesRegex(ValueError, '\'max_samples\' cannot be larger than'): + hyperparameter.sample_multiple(0, 2, 42) + + with self.assertRaisesRegex(exceptions.InvalidArgumentValueError, 'must be finite'): + hyperparams.Bounded[typing.Optional[float]](0.0, 
numpy.nan, 0) + + with self.assertRaisesRegex(exceptions.InvalidArgumentValueError, 'must be finite'): + hyperparams.Bounded[typing.Optional[float]](numpy.inf, 0.0, 0) + + def test_enumeration(self): + hyperparameter = hyperparams.Enumeration(['a', 'b', 1, 2, None], None) + + self.assertEqual(hyperparameter.get_default(), None) + self.assertEqual(hyperparameter.sample(42), 2) + self.assertEqual(hyperparameter.sample_multiple(0, 1, 42), ()) + self.assertEqual(hyperparameter.sample_multiple(0, 2, 42), ('b', None)) + self.assertEqual(hyperparameter.sample_multiple(0, 3, 42), ('b', None)) + + self.assertEqual(hyperparameter.to_simple_structure(), { + 'default': None, + 'semantic_types': [], + 'structural_type': typing.Union[str, int, type(None)], + 'type': hyperparams.Enumeration, + 'values': ['a', 'b', 1, 2, None], + }) + + self.assertEqual(hyperparameter.value_to_json_structure(hyperparameter.get_default()), None) + self.assertEqual(hyperparameter.value_to_json_structure(hyperparameter.sample(42)), 2) + + self.assertEqual(hyperparameter.value_from_json_structure(hyperparameter.value_to_json_structure(hyperparameter.get_default())), hyperparameter.get_default()) + self.assertEqual(hyperparameter.value_from_json_structure(hyperparameter.value_to_json_structure(hyperparameter.sample(42))), hyperparameter.sample(42)) + + with self.assertRaisesRegex(ValueError, 'Value \'.*\' is not among values'): + hyperparams.Enumeration(['a', 'b', 1, 2], None) + + with self.assertRaisesRegex(TypeError, 'Value \'.*\' is not an instance of the structural type'): + hyperparams.Enumeration[typing.Union[str, int]](['a', 'b', 1, 2, None], None) + + with self.assertRaisesRegex(ValueError, '\'max_samples\' cannot be larger than'): + self.assertEqual(hyperparameter.sample_multiple(0, 6, 42), ()) + + hyperparameter = hyperparams.Enumeration(['a', 'b', 'c'], 'a') + + self.assertEqual(hyperparameter.value_to_json_structure('c'), 'c') + self.assertEqual(hyperparameter.value_from_json_structure(hyperparameter.value_to_json_structure('c')), 'c') + + with self.assertRaisesRegex(exceptions.InvalidArgumentValueError, 'contain duplicates'): + hyperparams.Enumeration([1.0, 1], 1) + + hyperparameter = hyperparams.Enumeration([1.0, float('nan'), float('infinity'), float('-infinity')], 1.0) + + hyperparameter.validate(float('nan')) + + self.assertEqual(utils.to_json_structure(hyperparameter.to_simple_structure()), { + 'type': 'd3m.metadata.hyperparams.Enumeration', + 'default': 1.0, + 'structural_type': 'float', + 'semantic_types': [], + 'values': [1.0, 'nan', 'inf', '-inf'], + }) + + self.assertEqual(json.dumps(hyperparameter.value_to_json_structure(float('nan')), allow_nan=False), '{"encoding": "pickle", "value": "gANHf/gAAAAAAAAu"}') + self.assertEqual(json.dumps(hyperparameter.value_to_json_structure(float('inf')), allow_nan=False), '{"encoding": "pickle", "value": "gANHf/AAAAAAAAAu"}') + + def test_other(self): + hyperparameter = hyperparams.UniformInt(1, 10, 2) + + self.assertEqual(hyperparameter.get_default(), 2) + self.assertEqual(hyperparameter.sample(42), 7) + self.assertEqual(hyperparameter.sample_multiple(0, 1, 42), ()) + self.assertEqual(hyperparameter.sample_multiple(0, 2, 42), (4, 8)) + + self.assertEqual(hyperparameter.to_simple_structure(), { + 'default': 2, + 'semantic_types': [], + 'structural_type': int, + 'type': hyperparams.UniformInt, + 'lower': 1, + 'upper': 10, + 'lower_inclusive': True, + 'upper_inclusive': False, + }) + + with self.assertRaisesRegex(ValueError, 'Value \'.*\' is outside of range'): + 
hyperparams.UniformInt(1, 10, 0) + + with self.assertRaisesRegex(ValueError, '\'max_samples\' cannot be larger than'): + self.assertEqual(hyperparameter.sample_multiple(0, 10, 42), ()) + + hyperparameter = hyperparams.Uniform(1.0, 10.0, 2.0) + + self.assertEqual(hyperparameter.get_default(), 2.0) + self.assertEqual(hyperparameter.sample(42), 4.370861069626263) + + self.assertEqual(hyperparameter.to_simple_structure(), { + 'default': 2.0, + 'semantic_types': [], + 'structural_type': float, + 'type': hyperparams.Uniform, + 'lower': 1.0, + 'upper': 10.0, + 'lower_inclusive': True, + 'upper_inclusive': False, + }) + + with self.assertRaisesRegex(ValueError, 'Value \'.*\' is outside of range'): + hyperparams.Uniform(1.0, 10.0, 0.0) + + hyperparameter = hyperparams.LogUniform(1.0, 10.0, 2.0) + + self.assertEqual(hyperparameter.get_default(), 2.0) + self.assertEqual(hyperparameter.sample(42), 2.368863950364078) + + self.assertEqual(hyperparameter.to_simple_structure(), { + 'default': 2.0, + 'semantic_types': [], + 'structural_type': float, + 'type': hyperparams.LogUniform, + 'lower': 1.0, + 'upper': 10.0, + 'lower_inclusive': True, + 'upper_inclusive': False, + }) + + with self.assertRaisesRegex(ValueError, 'Value \'.*\' is outside of range'): + hyperparams.LogUniform(1.0, 10.0, 0.0) + + hyperparameter = hyperparams.UniformBool(True) + + self.assertEqual(hyperparameter.get_default(), True) + self.assertEqual(hyperparameter.sample(42), True) + + self.assertEqual(hyperparameter.to_simple_structure(), { + 'default': True, + 'semantic_types': [], + 'structural_type': bool, + 'type': hyperparams.UniformBool, + }) + + with self.assertRaises(exceptions.InvalidArgumentValueError): + hyperparams.UniformInt(0, 1, 1, lower_inclusive=False, upper_inclusive=False) + + hyperparameter = hyperparams.UniformInt(0, 2, 1, lower_inclusive=False, upper_inclusive=False) + + self.assertEqual(hyperparameter.sample(42), 1) + + with self.assertRaises(exceptions.InvalidArgumentValueError): + hyperparameter.sample_multiple(2, 2, 42) + + self.assertEqual(hyperparameter.sample_multiple(2, 2, 42, with_replacement=True), (1, 1)) + + def test_union(self): + hyperparameter = hyperparams.Union( + OrderedDict( + none=hyperparams.Hyperparameter(None), + range=hyperparams.UniformInt(1, 10, 2) + ), + 'none', + ) + + self.assertEqual(hyperparameter.get_default(), None) + self.assertEqual(hyperparameter.sample(45), 4) + + self.assertEqual(hyperparameter.to_simple_structure(), { + 'default': None, + 'semantic_types': [], + 'structural_type': typing.Optional[int], + 'type': hyperparams.Union, + 'configuration': { + 'none': { + 'default': None, + 'semantic_types': [], + 'structural_type': type(None), + 'type': hyperparams.Hyperparameter, + }, + 'range': { + 'default': 2, + 'semantic_types': [], + 'structural_type': int, + 'type': hyperparams.UniformInt, + 'lower': 1, + 'upper': 10, + 'lower_inclusive': True, + 'upper_inclusive': False, + } + } + }) + + self.assertEqual(hyperparameter.value_to_json_structure(hyperparameter.get_default()), {'case': 'none', 'value': None}) + self.assertEqual(hyperparameter.value_to_json_structure(hyperparameter.sample(45)), {'case': 'range', 'value': 4}) + + self.assertEqual(hyperparameter.value_from_json_structure(hyperparameter.value_to_json_structure(hyperparameter.get_default())), hyperparameter.get_default()) + with self.assertLogs(hyperparams.logger) as cm: + self.assertEqual(hyperparameter.value_from_json_structure(hyperparameter.value_to_json_structure(hyperparameter.sample(42))), 
hyperparameter.sample(42)) + self.assertEqual(len(cm.records), 1) + + with self.assertRaisesRegex(TypeError, 'Hyper-parameter name is not a string'): + hyperparams.Union(OrderedDict({1: hyperparams.Hyperparameter(None)}), 1) + + with self.assertRaisesRegex(TypeError, 'Hyper-parameter description is not an instance of the Hyperparameter class'): + hyperparams.Union(OrderedDict(none=None), 'none') + + with self.assertRaisesRegex(ValueError, 'Default value \'.*\' is not in configuration'): + hyperparams.Union(OrderedDict(range=hyperparams.UniformInt(1, 10, 2)), 'none') + + hyperparams.Union(OrderedDict(range=hyperparams.UniformInt(1, 10, 2), default=hyperparams.Hyperparameter('nothing')), 'default') + hyperparams.Union[typing.Union[str, int]](OrderedDict(range=hyperparams.UniformInt(1, 10, 2), default=hyperparams.Hyperparameter('nothing')), 'default') + + with self.assertRaisesRegex(TypeError, 'Hyper-parameter \'.*\' is not a subclass of the structural type'): + hyperparams.Union[str](OrderedDict(range=hyperparams.UniformInt(1, 10, 2), default=hyperparams.Hyperparameter('nothing')), 'default') + + def test_hyperparams(self): + class TestHyperparams(hyperparams.Hyperparams): + a = hyperparams.Union(OrderedDict( + range=hyperparams.UniformInt(1, 10, 2), + none=hyperparams.Hyperparameter(None), + ), 'range') + b = hyperparams.Uniform(1.0, 10.0, 2.0) + + testCls = hyperparams.Hyperparams.define(OrderedDict( + a=hyperparams.Union(OrderedDict( + range=hyperparams.UniformInt(1, 10, 2), + none=hyperparams.Hyperparameter(None), + ), 'range'), + b=hyperparams.Uniform(1.0, 10.0, 2.0), + ), set_names=True) + + for cls in (TestHyperparams, testCls): + self.assertEqual(cls.configuration['a'].name, 'a', cls) + + self.assertEqual(cls.defaults(), {'a': 2, 'b': 2.0}, cls) + self.assertEqual(cls.defaults(), cls({'a': 2, 'b': 2.0}), cls) + self.assertEqual(cls.sample(42), {'a': 4, 'b': 9.556428757689245}, cls) + self.assertEqual(cls.sample(42), cls({'a': 4, 'b': 9.556428757689245}), cls) + self.assertEqual(cls(cls.defaults(), b=3.0), {'a': 2, 'b': 3.0}, cls) + self.assertEqual(cls(cls.defaults(), **{'b': 4.0}), {'a': 2, 'b': 4.0}, cls) + self.assertEqual(cls.defaults('a'), 2, cls) + self.assertEqual(cls.defaults('b'), 2.0, cls) + + self.assertEqual(cls.to_simple_structure(), { + 'a': { + 'default': 2, + 'semantic_types': [], + 'structural_type': typing.Optional[int], + 'type': hyperparams.Union, + 'configuration': { + 'none': { + 'default': None, + 'semantic_types': [], + 'structural_type': type(None), + 'type': hyperparams.Hyperparameter, + }, + 'range': { + 'default': 2, + 'lower': 1, + 'semantic_types': [], + 'structural_type': int, + 'type': hyperparams.UniformInt, + 'upper': 10, + 'lower_inclusive': True, + 'upper_inclusive': False, + }, + }, + }, + 'b': { + 'default': 2.0, + 'semantic_types': [], + 'structural_type': float, + 'type': hyperparams.Uniform, + 'lower': 1.0, + 'upper': 10.0, + 'lower_inclusive': True, + 'upper_inclusive': False, + } + }, cls) + + test_hyperparams = cls({'a': cls.configuration['a'].get_default(), 'b': cls.configuration['b'].get_default()}) + + self.assertEqual(test_hyperparams['a'], 2, cls) + self.assertEqual(test_hyperparams['b'], 2.0, cls) + + self.assertEqual(test_hyperparams.values_to_json_structure(), {'a': {'case': 'range', 'value': 2}, 'b': 2.0}) + self.assertEqual(cls.values_from_json_structure(test_hyperparams.values_to_json_structure()), test_hyperparams) + + with self.assertRaisesRegex(ValueError, 'Not all hyper-parameters are specified', msg=cls): + cls({'a': 
cls.configuration['a'].get_default()}) + + with self.assertRaisesRegex(ValueError, 'Additional hyper-parameters are specified', msg=cls): + cls({'a': cls.configuration['a'].get_default(), 'b': cls.configuration['b'].get_default(), 'c': 'two'}) + + cls({'a': 3, 'b': 3.0}) + cls({'a': None, 'b': 3.0}) + + test_hyperparams = cls(a=None, b=3.0) + self.assertEqual(test_hyperparams['a'], None, cls) + self.assertEqual(test_hyperparams['b'], 3.0, cls) + + with self.assertRaisesRegex(ValueError, 'Value \'.*\' for hyper-parameter \'.*\' has not validated with any of configured hyper-parameters', msg=cls): + cls({'a': 0, 'b': 3.0}) + + with self.assertRaisesRegex(ValueError, 'Value \'.*\' for hyper-parameter \'.*\' is outside of range', msg=cls): + cls({'a': 3, 'b': 100.0}) + + class SubTestHyperparams(cls): + c = hyperparams.Hyperparameter[int](0) + + self.assertEqual(SubTestHyperparams.defaults(), {'a': 2, 'b': 2.0, 'c': 0}, cls) + + testSubCls = cls.define(OrderedDict( + c=hyperparams.Hyperparameter[int](0), + ), set_names=True) + + self.assertEqual(testSubCls.defaults(), {'a': 2, 'b': 2.0, 'c': 0}, cls) + + class ConfigurationHyperparams(hyperparams.Hyperparams): + configuration = hyperparams.Uniform(1.0, 10.0, 2.0) + + self.assertEqual(ConfigurationHyperparams.configuration['configuration'].to_simple_structure(), hyperparams.Uniform(1.0, 10.0, 2.0).to_simple_structure()) + + def test_numpy(self): + class TestHyperparams(hyperparams.Hyperparams): + value = hyperparams.Hyperparameter[container.ndarray]( + default=container.ndarray([0], generate_metadata=True), + ) + + values = TestHyperparams(value=container.ndarray([1, 2, 3], generate_metadata=True)) + + self.assertEqual(values.values_to_json_structure(), {'value': {'encoding': 'pickle', 'value': 'gANjbnVtcHkuY29yZS5tdWx0aWFycmF5Cl9yZWNvbnN0cnVjdApxAGNkM20uY29udGFpbmVyLm51bXB5Cm5kYXJyYXkKcQFLAIVxAkMBYnEDh3EEUnEFfXEGKFgFAAAAbnVtcHlxByhLAUsDhXEIY251bXB5CmR0eXBlCnEJWAIAAABpOHEKSwBLAYdxC1JxDChLA1gBAAAAPHENTk5OSv////9K/////0sAdHEOYolDGAEAAAAAAAAAAgAAAAAAAAADAAAAAAAAAHEPdHEQWAgAAABtZXRhZGF0YXERY2QzbS5tZXRhZGF0YS5iYXNlCkRhdGFNZXRhZGF0YQpxEimBcRN9cRQoWBEAAABfY3VycmVudF9tZXRhZGF0YXEVY2QzbS5tZXRhZGF0YS5iYXNlCk1ldGFkYXRhRW50cnkKcRYpgXEXTn1xGChYCAAAAGVsZW1lbnRzcRljZDNtLnV0aWxzCnBtYXAKcRp9cRuFcRxScR1YDAAAAGFsbF9lbGVtZW50c3EeaBYpgXEfTn1xIChoGWgdaB5OaBFjZnJvemVuZGljdApGcm96ZW5PcmRlcmVkRGljdApxISmBcSJ9cSMoWAUAAABfZGljdHEkY2NvbGxlY3Rpb25zCk9yZGVyZWREaWN0CnElKVJxJlgPAAAAc3RydWN0dXJhbF90eXBlcSdjbnVtcHkKaW50NjQKcShzWAUAAABfaGFzaHEpTnViWAgAAABpc19lbXB0eXEqiVgRAAAAaXNfZWxlbWVudHNfZW1wdHlxK4h1hnEsYmgRaCEpgXEtfXEuKGgkaCUpUnEvKFgGAAAAc2NoZW1hcTBYQgAAAGh0dHBzOi8vbWV0YWRhdGEuZGF0YWRyaXZlbmRpc2NvdmVyeS5vcmcvc2NoZW1hcy92MC9jb250YWluZXIuanNvbnExaCdoAVgJAAAAZGltZW5zaW9ucTJoISmBcTN9cTQoaCRoJSlScTVYBgAAAGxlbmd0aHE2SwNzaClOdWJ1aClOdWJoKoloK4h1hnE3YmgpTnVidWIu'}}) + self.assertTrue(numpy.array_equal(TestHyperparams.values_from_json_structure(values.values_to_json_structure())['value'], values['value'])) + + def test_set(self): + set_hyperparameter = hyperparams.Set(hyperparams.Hyperparameter[int](1), []) + with self.assertLogs(hyperparams.logger) as cm: + self.assertEqual(set(set_hyperparameter.sample_multiple(min_samples=2, max_samples=2)), {(1,), ()}) + self.assertEqual(len(cm.records), 1) + elements = hyperparams.Enumeration(['a', 'b', 1, 2, None], None) + set_hyperparameter = hyperparams.Set(elements, ('a', 'b', 1, 2, None), 5, 5) + + self.assertEqual(set_hyperparameter.get_default(), ('a', 'b', 1, 2, None)) + self.assertEqual(set_hyperparameter.sample(45), ('b', None, 'a', 1, 
2)) + self.assertEqual(set_hyperparameter.get_max_samples(), 1) + self.assertEqual(set_hyperparameter.sample_multiple(1, 1, 42), (('b', None, 1, 'a', 2),)) + self.assertEqual(set_hyperparameter.sample_multiple(0, 1, 42), ()) + + self.maxDiff = None + + self.assertEqual(set_hyperparameter.to_simple_structure(), { + 'default': ('a', 'b', 1, 2, None), + 'semantic_types': [], + 'structural_type': typing.Sequence[typing.Union[str, int, type(None)]], + 'type': hyperparams.Set, + 'min_size': 5, + 'max_size': 5, + 'elements': { + 'default': None, + 'semantic_types': [], + 'structural_type': typing.Union[str, int, type(None)], + 'type': hyperparams.Enumeration, + 'values': ['a', 'b', 1, 2, None], + }, + 'is_configuration': False, + }) + + self.assertEqual(set_hyperparameter.value_to_json_structure(set_hyperparameter.get_default()), ['a', 'b', 1, 2, None]) + self.assertEqual(set_hyperparameter.value_to_json_structure(set_hyperparameter.sample(45)), ['b', None, 'a', 1, 2]) + + self.assertEqual(set_hyperparameter.value_from_json_structure(set_hyperparameter.value_to_json_structure(set_hyperparameter.get_default())), set_hyperparameter.get_default()) + self.assertEqual(set_hyperparameter.value_from_json_structure(set_hyperparameter.value_to_json_structure(set_hyperparameter.sample(45))), set_hyperparameter.sample(45)) + + with self.assertRaisesRegex(ValueError, 'Value \'.*\' has less than 5 elements'): + elements = hyperparams.Enumeration(['a', 'b', 1, 2, None], None) + hyperparams.Set(elements, (), 5, 5) + + with self.assertRaisesRegex(ValueError, 'Value \'.*\' is not among values'): + elements = hyperparams.Enumeration(['a', 'b', 1, 2, None], None) + hyperparams.Set(elements, ('a', 'b', 1, 2, 3), 5, 5) + + with self.assertRaisesRegex(ValueError, 'Value \'.*\' has duplicate elements'): + elements = hyperparams.Enumeration(['a', 'b', 1, 2, None], None) + hyperparams.Set(elements, ('a', 'b', 1, 2, 2), 5, 5) + + set_hyperparameter.contribute_to_class('foo') + + with self.assertRaises(KeyError): + set_hyperparameter.get_default('foo') + + list_of_supported_metafeatures = ['f1', 'f2', 'f3'] + metafeature = hyperparams.Enumeration(list_of_supported_metafeatures, list_of_supported_metafeatures[0], semantic_types=['https://metadata.datadrivendiscovery.org/types/MetafeatureParameter']) + set_hyperparameter = hyperparams.Set(metafeature, (), 0, 3) + + self.assertEqual(set_hyperparameter.get_default(), ()) + self.assertEqual(set_hyperparameter.sample(42), ('f2', 'f3')) + self.assertEqual(set_hyperparameter.get_max_samples(), 8) + self.assertEqual(set_hyperparameter.sample_multiple(0, 3, 42), (('f2', 'f3', 'f1'), ('f2', 'f3'))) + + self.assertEqual(set_hyperparameter.value_to_json_structure(set_hyperparameter.get_default()), []) + self.assertEqual(set_hyperparameter.value_to_json_structure(set_hyperparameter.sample(42)), ['f2', 'f3']) + + self.assertEqual(set_hyperparameter.value_from_json_structure(set_hyperparameter.value_to_json_structure(set_hyperparameter.get_default())), set_hyperparameter.get_default()) + self.assertEqual(set_hyperparameter.value_from_json_structure(set_hyperparameter.value_to_json_structure(set_hyperparameter.sample(42))), set_hyperparameter.sample(42)) + + set_hyperparameter = hyperparams.Set(metafeature, (), 0, None) + + self.assertEqual(set_hyperparameter.get_default(), ()) + self.assertEqual(set_hyperparameter.sample(42), ('f2', 'f3')) + self.assertEqual(set_hyperparameter.get_max_samples(), 8) + self.assertEqual(set_hyperparameter.sample_multiple(0, 3, 42), (('f2', 'f3', 'f1'), 
('f2', 'f3'))) + + def test_set_with_hyperparams(self): + elements = hyperparams.Hyperparams.define(OrderedDict( + range=hyperparams.UniformInt(1, 10, 2), + enum=hyperparams.Enumeration(['a', 'b', 1, 2, None], None), + )) + set_hyperparameter = hyperparams.Set(elements, (elements(range=2, enum='a'),), 0, 5) + + self.assertEqual(set_hyperparameter.get_default(), ({'range': 2, 'enum': 'a'},)) + self.assertEqual(set_hyperparameter.sample(45), ({'range': 4, 'enum': None}, {'range': 1, 'enum': 2}, {'range': 5, 'enum': 'b'})) + self.assertEqual(set_hyperparameter.get_max_samples(), 1385980) + self.assertEqual(set_hyperparameter.sample_multiple(1, 1, 42), (({'range': 8, 'enum': None}, {'range': 5, 'enum': 'b'}, {'range': 3, 'enum': 1}),)) + self.assertEqual(set_hyperparameter.sample_multiple(0, 1, 42), ()) + self.maxDiff = None + + self.assertEqual(set_hyperparameter.to_simple_structure(), { + 'default': ({'range': 2, 'enum': 'a'},), + 'elements': { + 'enum': { + 'default': None, + 'semantic_types': [], + 'structural_type': typing.Union[str, int, type(None)], + 'type': hyperparams.Enumeration, + 'values': ['a', 'b', 1, 2, None], + }, + 'range': { + 'default': 2, + 'lower': 1, + 'semantic_types': [], + 'structural_type': int, + 'type': hyperparams.UniformInt, + 'upper': 10, + 'lower_inclusive': True, + 'upper_inclusive': False, + }, + }, + 'is_configuration': True, + 'max_size': 5, + 'min_size': 0, + 'semantic_types': [], + 'structural_type': typing.Sequence[elements], + 'type': hyperparams.Set, + }) + + self.assertEqual(set_hyperparameter.value_to_json_structure(set_hyperparameter.get_default()), [{'range': 2, 'enum': 'a'}]) + self.assertEqual(set_hyperparameter.value_to_json_structure(set_hyperparameter.sample(45)), [{'range': 4, 'enum': None}, {'range': 1, 'enum': 2}, {'range': 5, 'enum': 'b'}]) + + self.assertEqual(set_hyperparameter.value_from_json_structure(set_hyperparameter.value_to_json_structure(set_hyperparameter.get_default())), set_hyperparameter.get_default()) + self.assertEqual(set_hyperparameter.value_from_json_structure(set_hyperparameter.value_to_json_structure(set_hyperparameter.sample(45))), set_hyperparameter.sample(45)) + + # We have to explicitly disable setting names if we want to use it for "Set" hyper-parameter. 
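# --- Editor's illustration (a sketch using the same API as these tests; not part of the original file) ---
# Container hyper-parameters (Set, List, Choice) assign dotted names such as
# "b.choice.range.value" to the nested hyper-parameters once the container is
# attached to an owning Hyperparams class, so automatic name-setting on the
# standalone elements class is switched off with "set_names=False".
from d3m.metadata import hyperparams


class _ElementsExample(hyperparams.Hyperparams, set_names=False):
    value = hyperparams.UniformInt(1, 10, 2)


class _OwnerExample(hyperparams.Hyperparams):
    items = hyperparams.Set(_ElementsExample, (_ElementsExample(value=2),), 0, 3)


# As with the "b.choice.range.value" assertion below, the nested hyper-parameter
# name is then derived from the owning attribute (e.g. "items.value").
# --- end editor's illustration ---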
+ class SetHyperparams(hyperparams.Hyperparams, set_names=False): + choice = hyperparams.Choice({ + 'none': hyperparams.Hyperparams, + 'range': hyperparams.Hyperparams.define(OrderedDict( + value=hyperparams.UniformInt(1, 10, 2), + )), + }, 'none') + + class TestHyperparams(hyperparams.Hyperparams): + a = set_hyperparameter + b = hyperparams.Set(SetHyperparams, (SetHyperparams({'choice': {'choice': 'none'}}),), 0, 3) + + self.assertEqual(TestHyperparams.to_simple_structure(), { + 'a': { + 'type': hyperparams.Set, + 'default': ({'range': 2, 'enum': 'a'},), + 'structural_type': typing.Sequence[elements], + 'semantic_types': [], + 'elements': { + 'range': { + 'type': hyperparams.UniformInt, + 'default': 2, + 'structural_type': int, + 'semantic_types': [], + 'lower': 1, + 'upper': 10, + 'lower_inclusive': True, + 'upper_inclusive': False, + }, + 'enum': { + 'type': hyperparams.Enumeration, + 'default': None, + 'structural_type': typing.Union[str, int, type(None)], + 'semantic_types': [], + 'values': ['a', 'b', 1, 2, None], + }, + }, + 'is_configuration': True, + 'min_size': 0, + 'max_size': 5, + }, + 'b': { + 'type': hyperparams.Set, + 'default': ({'choice': {'choice': 'none'}},), + 'structural_type': typing.Sequence[SetHyperparams], + 'semantic_types': [], + 'elements': { + 'choice': { + 'type': hyperparams.Choice, + 'default': {'choice': 'none'}, + 'structural_type': typing.Dict, + 'semantic_types': [], + 'choices': { + 'none': { + 'choice': { + 'type': hyperparams.Hyperparameter, + 'default': 'none', + 'structural_type': str, + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/ChoiceParameter'], + }, + }, + 'range': { + 'value': { + 'type': hyperparams.UniformInt, + 'default': 2, + 'structural_type': int, + 'semantic_types': [], + 'lower': 1, + 'upper': 10, + 'lower_inclusive': True, + 'upper_inclusive': False, + }, + 'choice': { + 'type': hyperparams.Hyperparameter, + 'default': 'range', + 'structural_type': str, + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/ChoiceParameter'], + }, + }, + }, + }, + }, + 'is_configuration': True, + 'min_size': 0, + 'max_size': 3, + }, + }) + + self.assertEqual(TestHyperparams.configuration['b'].elements.configuration['choice'].choices['range'].configuration['value'].name, 'b.choice.range.value') + + self.assertEqual(TestHyperparams.defaults(), { + 'a': ({'range': 2, 'enum': 'a'},), + 'b': ({'choice': {'choice': 'none'}},), + }) + self.assertTrue(utils.is_instance(TestHyperparams.defaults()['a'], typing.Sequence[elements])) + self.assertTrue(utils.is_instance(TestHyperparams.defaults()['b'], typing.Sequence[SetHyperparams])) + + with self.assertLogs(hyperparams.logger) as cm: + self.assertEqual(TestHyperparams.sample(42), { + 'a': ({'range': 8, 'enum': None}, {'range': 5, 'enum': 'b'}, {'range': 3, 'enum': 1}), + 'b': ( + { + 'choice': {'value': 5, 'choice': 'range'}, + }, { + 'choice': {'value': 8, 'choice': 'range'}, + }, + ), + }) + self.assertEqual(len(cm.records), 1) + + with self.assertLogs(hyperparams.logger) as cm: + self.assertEqual(TestHyperparams.sample(42).values_to_json_structure(), { + 'a': [{'range': 8, 'enum': None}, {'range': 5, 'enum': 'b'}, {'range': 3, 'enum': 1}], + 'b': [ + { + 'choice': {'value': 5, 'choice': 'range'}, + }, { + 'choice': {'value': 8, 'choice': 'range'}, + }, + ], + }) + self.assertEqual(len(cm.records), 1) + with self.assertLogs(hyperparams.logger) as cm: + self.assertEqual(TestHyperparams.values_from_json_structure(TestHyperparams.sample(42).values_to_json_structure()), 
TestHyperparams.sample(42)) + self.assertEqual(len(cm.records), 1) + + self.assertEqual(len(list(TestHyperparams.traverse())), 8) + + self.assertEqual(TestHyperparams.defaults('a'), ({'range': 2, 'enum': 'a'},)) + self.assertEqual(TestHyperparams.defaults('a.range'), 2) + # Default of a whole "Set" hyper-parameter can be different than of nested hyper-parameters. + self.assertEqual(TestHyperparams.defaults('a.enum'), None) + self.assertEqual(TestHyperparams.defaults('b'), ({'choice': {'choice': 'none'}},)) + self.assertEqual(TestHyperparams.defaults('b.choice'), {'choice': 'none'}) + self.assertEqual(TestHyperparams.defaults('b.choice.none'), {'choice': 'none'}) + self.assertEqual(TestHyperparams.defaults('b.choice.none.choice'), 'none') + self.assertEqual(TestHyperparams.defaults('b.choice.range'), {'choice': 'range', 'value': 2}) + self.assertEqual(TestHyperparams.defaults('b.choice.range.value'), 2) + self.assertEqual(TestHyperparams.defaults('b.choice.range.choice'), 'range') + + self.assertEqual(TestHyperparams(TestHyperparams.defaults(), b=( + SetHyperparams({ + 'choice': {'value': 5, 'choice': 'range'}, + }), + SetHyperparams({ + 'choice': {'value': 8, 'choice': 'range'}, + }), + )), { + 'a': ({'range': 2, 'enum': 'a'},), + 'b': ( + { + 'choice': {'value': 5, 'choice': 'range'}, + }, + { + 'choice': {'value': 8, 'choice': 'range'}, + }, + ), + }) + self.assertEqual(TestHyperparams(TestHyperparams.defaults(), **{'a': ( + elements({'range': 8, 'enum': None}), + elements({'range': 5, 'enum': 'b'}), + elements({'range': 3, 'enum': 1}), + )}), { + 'a': ( + {'range': 8, 'enum': None}, + {'range': 5, 'enum': 'b'}, + {'range': 3, 'enum': 1}, + ), + 'b': ({'choice': {'choice': 'none'}},) + }) + + self.assertEqual(TestHyperparams.defaults().replace({'a': ( + elements({'range': 8, 'enum': None}), + elements({'range': 5, 'enum': 'b'}), + elements({'range': 3, 'enum': 1}), + )}), { + 'a': ( + {'range': 8, 'enum': None}, + {'range': 5, 'enum': 'b'}, + {'range': 3, 'enum': 1}, + ), + 'b': ({'choice': {'choice': 'none'}},), + }) + + def test_choice(self): + choices_hyperparameter = hyperparams.Choice({ + 'none': hyperparams.Hyperparams, + 'range': hyperparams.Hyperparams.define(OrderedDict( + # To test that we can use this name. + configuration=hyperparams.UniformInt(1, 10, 2), + )), + }, 'none') + + # Class should not be changed directly (when adding "choice"). 
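# --- Editor's illustration (a sketch, not part of the original test file) ---
# "Choice" adds an implicit "choice" hyper-parameter to each option, but it does
# so on its own per-choice configuration; the classes handed in (here the base
# Hyperparams class itself) are left untouched, which the assertion just below
# verifies.
from d3m.metadata import hyperparams

_choice_example = hyperparams.Choice({'none': hyperparams.Hyperparams}, 'none')

assert 'choice' in _choice_example.choices['none'].configuration  # implicit entry added to the internal copy
assert hyperparams.Hyperparams.configuration == {}                # the supplied class itself is unchanged
# --- end editor's illustration ---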
+ self.assertEqual(hyperparams.Hyperparams.configuration, {}) + + self.assertEqual(choices_hyperparameter.get_default(), {'choice': 'none'}) + with self.assertLogs(hyperparams.logger) as cm: + self.assertEqual(choices_hyperparameter.sample(45), {'choice': 'range', 'configuration': 4}) + self.assertEqual(len(cm.records), 1) + self.assertEqual(choices_hyperparameter.get_max_samples(), 10) + with self.assertLogs(hyperparams.logger) as cm: + self.assertEqual(choices_hyperparameter.sample_multiple(0, 3, 42), (frozendict.frozendict({'choice': 'range', 'configuration': 8}), frozendict.frozendict({'choice': 'none'}))) + self.assertEqual(len(cm.records), 1) + + self.maxDiff = None + + self.assertEqual(choices_hyperparameter.to_simple_structure(), { + 'default': {'choice': 'none'}, + 'semantic_types': [], + 'structural_type': typing.Dict, + 'type': hyperparams.Choice, + 'choices': { + 'none': { + 'choice': { + 'default': 'none', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/ChoiceParameter'], + 'structural_type': str, + 'type': hyperparams.Hyperparameter, + }, + }, + 'range': { + 'choice': { + 'default': 'range', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/ChoiceParameter'], + 'structural_type': str, + 'type': hyperparams.Hyperparameter, + }, + 'configuration': { + 'default': 2, + 'lower': 1, + 'lower_inclusive': True, + 'upper': 10, + 'upper_inclusive': False, + 'semantic_types': [], + 'structural_type': int, + 'type': hyperparams.UniformInt, + }, + }, + }, + }) + + self.assertEqual(choices_hyperparameter.value_to_json_structure(choices_hyperparameter.get_default()), {'choice': 'none'}) + with self.assertLogs(hyperparams.logger) as cm: + self.assertEqual(choices_hyperparameter.value_to_json_structure(choices_hyperparameter.sample(45)), {'configuration': 4, 'choice': 'range'}) + self.assertEqual(len(cm.records), 1) + + self.assertEqual(choices_hyperparameter.value_from_json_structure(choices_hyperparameter.value_to_json_structure(choices_hyperparameter.get_default())), choices_hyperparameter.get_default()) + with self.assertLogs(hyperparams.logger) as cm: + self.assertEqual(choices_hyperparameter.value_from_json_structure(choices_hyperparameter.value_to_json_structure(choices_hyperparameter.sample(45))), choices_hyperparameter.sample(45)) + self.assertEqual(len(cm.records), 1) + + # We have to explicitly disable setting names if we want to use it for "Choice" hyper-parameter. 
+ class ChoicesHyperparams(hyperparams.Hyperparams, set_names=False): + foo = hyperparams.UniformInt(5, 20, 10) + + class TestHyperparams(hyperparams.Hyperparams): + a = choices_hyperparameter + b = hyperparams.Choice({ + 'nochoice': ChoicesHyperparams, + }, 'nochoice') + + self.assertEqual(TestHyperparams.configuration['a'].choices['range'].configuration['configuration'].name, 'a.range.configuration') + + self.assertEqual(TestHyperparams.defaults(), {'a': {'choice': 'none'}, 'b': {'choice': 'nochoice', 'foo': 10}}) + self.assertIsInstance(TestHyperparams.defaults()['a'], hyperparams.Hyperparams) + self.assertIsInstance(TestHyperparams.defaults()['b'], ChoicesHyperparams) + + with self.assertLogs(hyperparams.logger) as cm: + self.assertEqual(TestHyperparams.sample(42), {'a': {'choice': 'none'}, 'b': {'choice': 'nochoice', 'foo': 8}}) + self.assertEqual(len(cm.records), 1) + + with self.assertLogs(hyperparams.logger) as cm: + self.assertEqual(TestHyperparams.sample(42).values_to_json_structure(), {'a': {'choice': 'none'}, 'b': {'choice': 'nochoice', 'foo': 8}}) + self.assertEqual(len(cm.records), 1) + with self.assertLogs(hyperparams.logger) as cm: + self.assertEqual(TestHyperparams.values_from_json_structure(TestHyperparams.sample(42).values_to_json_structure()), TestHyperparams.sample(42)) + self.assertEqual(len(cm.records), 1) + + self.assertEqual(len(list(TestHyperparams.traverse())), 7) + + self.assertEqual(TestHyperparams.defaults('a'), {'choice': 'none'}) + self.assertEqual(TestHyperparams.defaults('a.none'), {'choice': 'none'}) + self.assertEqual(TestHyperparams.defaults('a.none.choice'), 'none') + self.assertEqual(TestHyperparams.defaults('a.range'), {'choice': 'range', 'configuration': 2}) + self.assertEqual(TestHyperparams.defaults('a.range.configuration'), 2) + self.assertEqual(TestHyperparams.defaults('a.range.choice'), 'range') + self.assertEqual(TestHyperparams.defaults('b'), {'choice': 'nochoice', 'foo': 10}) + self.assertEqual(TestHyperparams.defaults('b.nochoice'), {'choice': 'nochoice', 'foo': 10}) + self.assertEqual(TestHyperparams.defaults('b.nochoice.foo'), 10) + self.assertEqual(TestHyperparams.defaults('b.nochoice.choice'), 'nochoice') + + def test_primitive(self): + # To hide any logging or stdout output. + with utils.silence(): + index.register_primitive('d3m.primitives.regression.monomial.Test', MonomialPrimitive) + index.register_primitive('d3m.primitives.data_generation.random.Test', RandomPrimitive) + index.register_primitive('d3m.primitives.operator.sum.Test', SumPrimitive) + index.register_primitive('d3m.primitives.operator.increment.Test', IncrementPrimitive) + + hyperparameter = hyperparams.Primitive(MonomialPrimitive) + + self.assertEqual(hyperparameter.structural_type, MonomialPrimitive) + self.assertEqual(hyperparameter.get_default(), MonomialPrimitive) + # To hide any logging or stdout output. 
+ with utils.silence(): + self.assertEqual(hyperparameter.sample(42), MonomialPrimitive) + + hyperparams_class = MonomialPrimitive.metadata.get_hyperparams() + primitive = MonomialPrimitive(hyperparams=hyperparams_class.defaults()) + + hyperparameter = hyperparams.Enumeration([MonomialPrimitive, RandomPrimitive, SumPrimitive, IncrementPrimitive, None], None) + + self.assertEqual(hyperparameter.structural_type, typing.Union[MonomialPrimitive, RandomPrimitive, SumPrimitive, IncrementPrimitive, type(None)]) + self.assertEqual(hyperparameter.get_default(), None) + self.assertEqual(hyperparameter.sample(42), IncrementPrimitive) + + hyperparameter = hyperparams.Enumeration[typing.Optional[base.PrimitiveBase]]([MonomialPrimitive, RandomPrimitive, SumPrimitive, IncrementPrimitive, None], None) + + self.assertEqual(hyperparameter.structural_type, typing.Optional[base.PrimitiveBase]) + self.assertEqual(hyperparameter.get_default(), None) + self.assertEqual(hyperparameter.sample(42), IncrementPrimitive) + + set_hyperparameter = hyperparams.Set(hyperparameter, (MonomialPrimitive, RandomPrimitive), 2, 4) + + self.assertEqual(set_hyperparameter.get_default(), (MonomialPrimitive, RandomPrimitive)) + self.assertEqual(set_hyperparameter.sample(42), (RandomPrimitive, None, SumPrimitive, MonomialPrimitive)) + + union_hyperparameter = hyperparams.Union(OrderedDict( + none=hyperparams.Hyperparameter(None), + primitive=hyperparams.Enumeration[base.PrimitiveBase]([MonomialPrimitive, RandomPrimitive, SumPrimitive, IncrementPrimitive], MonomialPrimitive), + ), 'none') + + self.assertEqual(union_hyperparameter.get_default(), None) + self.assertEqual(union_hyperparameter.sample(45), SumPrimitive) + + hyperparameter = hyperparams.Enumeration([primitive, RandomPrimitive, SumPrimitive, IncrementPrimitive, None], None) + + self.assertEqual(hyperparameter.structural_type, typing.Union[MonomialPrimitive, RandomPrimitive, SumPrimitive, IncrementPrimitive, type(None)]) + self.assertEqual(hyperparameter.get_default(), None) + self.assertEqual(hyperparameter.sample(42), IncrementPrimitive) + + hyperparameter = hyperparams.Enumeration[typing.Optional[base.PrimitiveBase]]([primitive, RandomPrimitive, SumPrimitive, IncrementPrimitive, None], None) + + self.assertEqual(hyperparameter.structural_type, typing.Optional[base.PrimitiveBase]) + self.assertEqual(hyperparameter.get_default(), None) + self.assertEqual(hyperparameter.sample(42), IncrementPrimitive) + + set_hyperparameter = hyperparams.Set(hyperparameter, (primitive, RandomPrimitive), 2, 4) + + self.assertEqual(set_hyperparameter.get_default(), (primitive, RandomPrimitive)) + self.assertEqual(set_hyperparameter.sample(42), (RandomPrimitive, None, SumPrimitive, primitive)) + + union_hyperparameter = hyperparams.Union(OrderedDict( + none=hyperparams.Hyperparameter(None), + primitive=hyperparams.Enumeration[base.PrimitiveBase]([primitive, RandomPrimitive, SumPrimitive, IncrementPrimitive], primitive), + ), 'none') + + self.assertEqual(union_hyperparameter.get_default(), None) + self.assertEqual(union_hyperparameter.sample(45), SumPrimitive) + + hyperparameter = hyperparams.Primitive(primitive) + + self.assertEqual(hyperparameter.structural_type, MonomialPrimitive) + self.assertEqual(hyperparameter.get_default(), primitive) + # To hide any logging or stdout output. 
+ with utils.silence(): + self.assertEqual(hyperparameter.sample(42), primitive) + + hyperparameter = hyperparams.Primitive[base.PrimitiveBase](MonomialPrimitive) + + self.assertEqual(hyperparameter.get_default(), MonomialPrimitive) + # To hide any logging or stdout output. + with utils.silence(): + # There might be additional primitives available in the system, + # so we cannot know which one will really be returned. + self.assertTrue(hyperparameter.sample(42), hyperparameter.matching_primitives) + + self.maxDiff = None + + self.assertEqual(hyperparameter.to_simple_structure(), { + 'default': MonomialPrimitive, + 'semantic_types': [], + 'structural_type': base.PrimitiveBase, + 'type': hyperparams.Primitive, + 'primitive_families': [], + 'algorithm_types': [], + 'produce_methods': [], + }) + + self.assertEqual(hyperparameter.value_to_json_structure(hyperparameter.get_default()), {'class': 'd3m.primitives.regression.monomial.Test'}) + self.assertEqual(hyperparameter.value_from_json_structure(hyperparameter.value_to_json_structure(hyperparameter.get_default())), hyperparameter.get_default()) + + self.assertTrue(hyperparameter.get_max_samples() >= 4, hyperparameter.get_max_samples()) + + hyperparameter = hyperparams.Primitive[base.PrimitiveBase](primitive) + + self.assertEqual(hyperparameter.get_default(), primitive) + + self.assertEqual(hyperparameter.to_simple_structure(), { + 'default': primitive, + 'semantic_types': [], + 'structural_type': base.PrimitiveBase, + 'type': hyperparams.Primitive, + 'primitive_families': [], + 'algorithm_types': [], + 'produce_methods': [], + }) + + self.assertEqual(hyperparameter.value_to_json_structure(hyperparameter.get_default()), {'instance': 'gANjdGVzdF9wcmltaXRpdmVzLm1vbm9taWFsCk1vbm9taWFsUHJpbWl0aXZlCnEAKYFxAX1xAihYCwAAAGNvbnN0cnVjdG9ycQN9cQQoWAsAAABoeXBlcnBhcmFtc3EFY3Rlc3RfcHJpbWl0aXZlcy5tb25vbWlhbApIeXBlcnBhcmFtcwpxBimBcQd9cQhYBAAAAGJpYXNxCUcAAAAAAAAAAHNiWAsAAAByYW5kb21fc2VlZHEKSwB1WAYAAABwYXJhbXNxC2N0ZXN0X3ByaW1pdGl2ZXMubW9ub21pYWwKUGFyYW1zCnEMKYFxDVgBAAAAYXEOSwBzdWIu'}) + + set_hyperparameter = hyperparams.Set(hyperparameter, (MonomialPrimitive, RandomPrimitive), 2, 4) + + self.assertEqual(set_hyperparameter.get_default(), (MonomialPrimitive, RandomPrimitive)) + + union_hyperparameter = hyperparams.Union(OrderedDict( + none=hyperparams.Hyperparameter(None), + primitive=hyperparameter, + ), 'none') + + self.assertEqual(union_hyperparameter.get_default(), None) + + def test_invalid_name(self): + with self.assertRaisesRegex(ValueError, 'Hyper-parameter name \'.*\' contains invalid characters.'): + hyperparams.Hyperparams.define({ + 'foo.bar': hyperparams.Uniform(1.0, 10.0, 2.0), + }) + + def test_class_as_default(self): + class Foo: + pass + + foo = Foo() + + hyperparameter = hyperparams.Enumeration(['a', 'b', 1, 2, foo], foo) + + self.assertEqual(hyperparameter.value_to_json_structure(1), {'encoding': 'pickle', 'value': 'gANLAS4='}) + + hyperparameter = hyperparams.Enumeration(['a', 'b', 1, 2], 2) + + self.assertEqual(hyperparameter.value_to_json_structure(1), 1) + + def test_configuration_immutability(self): + class TestHyperparams(hyperparams.Hyperparams): + a = hyperparams.Union(OrderedDict( + range=hyperparams.UniformInt(1, 10, 2), + none=hyperparams.Hyperparameter(None), + ), 'range') + b = hyperparams.Uniform(1.0, 10.0, 2.0) + + with self.assertRaisesRegex(TypeError, '\'FrozenOrderedDict\' object does not support item assignment'): + TestHyperparams.configuration['c'] = hyperparams.Enumeration(['a', 'b', 1, 2, None], None) + + with 
self.assertRaisesRegex(AttributeError, 'Hyper-parameters configuration is immutable'): + TestHyperparams.configuration = OrderedDict( + range=hyperparams.UniformInt(1, 10, 2), + none=hyperparams.Hyperparameter(None), + ) + + def test_dict_as_default(self): + Inputs = container.DataFrame + Outputs = container.DataFrame + + class Hyperparams(hyperparams.Hyperparams): + value = hyperparams.Hyperparameter({}, semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter']) + + # Silence any validation warnings. + with utils.silence(): + class Primitive(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): + metadata = metadata_base.PrimitiveMetadata({ + 'id': '152ea984-d8a4-4a37-87a0-29829b082e54', + 'version': '0.1.0', + 'name': "Test Primitive", + 'python_path': 'd3m.primitives.test.dict_as_default', + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.PRINCIPAL_COMPONENT_ANALYSIS, + ], + 'primitive_family': metadata_base.PrimitiveFamily.FEATURE_SELECTION, + }) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + pass + + self.assertEqual(Primitive.metadata.query()['primitive_code']['hyperparams']['value']['default'], {}) + + def test_comma_warning(self): + logger = logging.getLogger('d3m.metadata.hyperparams') + + with self.assertLogs(logger=logger, level=logging.DEBUG) as cm: + class Hyperparams(hyperparams.Hyperparams): + value = hyperparams.Hyperparameter({}, semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter']), + + self.assertEqual(len(cm.records), 1) + self.assertEqual(cm.records[0].message, 'Probably invalid definition of a hyper-parameter. Hyper-parameter should be defined as class attribute without a trailing comma.') + + def test_json_schema(self): + Inputs = container.DataFrame + Outputs = container.DataFrame + + # Silence any validation warnings. + with utils.silence(): + # Defining primitive triggers checking against JSON schema. 
+ class TestJsonPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs, TestPicklingHyperparams]): + metadata = metadata_base.PrimitiveMetadata({ + 'id': 'cdfada09-5161-4f2e-bc7f-223d843d59c1', + 'version': '0.1.0', + 'name': "Test Primitive", + 'python_path': 'd3m.primitives.test.json_schema', + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.PRINCIPAL_COMPONENT_ANALYSIS, + ], + 'primitive_family': metadata_base.PrimitiveFamily.FEATURE_SELECTION, + }) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + pass + + def test_pickling(self): + pickle.loads(pickle.dumps(TestPicklingHyperparams)) + + unpickled = pickle.loads(pickle.dumps(TestPicklingHyperparams.defaults())) + + self.assertEqual(unpickled['choice'].configuration['value'].structural_type, typing.Union[float, int]) + + def test_sorted_set(self): + set_hyperparameter = hyperparams.SortedSet(hyperparams.Hyperparameter[int](1), []) + with self.assertLogs(hyperparams.logger) as cm: + self.assertEqual(set(set_hyperparameter.sample_multiple(min_samples=2, max_samples=2)), {(1,), ()}) + self.assertEqual(len(cm.records), 1) + + elements = hyperparams.Enumeration(['a', 'b', 'c', 'd', 'e'], 'e') + set_hyperparameter = hyperparams.SortedSet(elements, ('a', 'b', 'c', 'd', 'e'), 5, 5) + + self.assertEqual(set_hyperparameter.get_default(), ('a', 'b', 'c', 'd', 'e')) + self.assertEqual(set_hyperparameter.sample(45), ('a', 'b', 'c', 'd', 'e')) + self.assertEqual(set_hyperparameter.get_max_samples(), 1) + self.assertEqual(set_hyperparameter.sample_multiple(1, 1, 42), (('a', 'b', 'c', 'd', 'e'),)) + self.assertEqual(set_hyperparameter.sample_multiple(0, 1, 42), ()) + + self.maxDiff = None + + self.assertEqual(set_hyperparameter.to_simple_structure(), { + 'default': ('a', 'b', 'c', 'd', 'e'), + 'semantic_types': [], + 'structural_type': typing.Sequence[str], + 'type': hyperparams.SortedSet, + 'min_size': 5, + 'max_size': 5, + 'elements': { + 'default': 'e', + 'semantic_types': [], + 'structural_type': str, + 'type': hyperparams.Enumeration, + 'values': ['a', 'b', 'c', 'd', 'e'], + }, + 'ascending': True, + }) + + self.assertEqual(set_hyperparameter.value_to_json_structure(set_hyperparameter.get_default()), ['a', 'b', 'c', 'd', 'e']) + self.assertEqual(set_hyperparameter.value_to_json_structure(set_hyperparameter.sample(45)), ['a', 'b', 'c', 'd', 'e']) + + self.assertEqual(set_hyperparameter.value_from_json_structure(set_hyperparameter.value_to_json_structure(set_hyperparameter.get_default())), set_hyperparameter.get_default()) + self.assertEqual(set_hyperparameter.value_from_json_structure(set_hyperparameter.value_to_json_structure(set_hyperparameter.sample(45))), set_hyperparameter.sample(45)) + + with self.assertRaisesRegex(ValueError, 'Value \'.*\' has less than 5 elements'): + elements = hyperparams.Enumeration(['a', 'b', 'c', 'd', 'e'], 'e') + hyperparams.SortedSet(elements, (), 5, 5) + + with self.assertRaisesRegex(ValueError, 'Value \'.*\' is not among values'): + elements = hyperparams.Enumeration(['a', 'b', 'c', 'd', 'e'], 'e') + hyperparams.SortedSet(elements, ('a', 'b', 'c', 'd', 'f'), 5, 5) + + with self.assertRaisesRegex(ValueError, 'Value \'.*\' has duplicate elements'): + elements = hyperparams.Enumeration(['a', 'b', 'c', 'd', 'e'], 'e') + hyperparams.SortedSet(elements, ('a', 'b', 'c', 'd', 'd'), 5, 5) + + set_hyperparameter.contribute_to_class('foo') + + with self.assertRaises(KeyError): + set_hyperparameter.get_default('foo') + + 
list_of_supported_metafeatures = ['f1', 'f2', 'f3'] + metafeature = hyperparams.Enumeration(list_of_supported_metafeatures, list_of_supported_metafeatures[0], semantic_types=['https://metadata.datadrivendiscovery.org/types/MetafeatureParameter']) + set_hyperparameter = hyperparams.SortedSet(metafeature, (), 0, 3) + + self.assertEqual(set_hyperparameter.get_default(), ()) + self.assertEqual(set_hyperparameter.sample(42), ('f2', 'f3')) + self.assertEqual(set_hyperparameter.get_max_samples(), 8) + self.assertEqual(set_hyperparameter.sample_multiple(0, 3, 42), (('f1', 'f2', 'f3'), ('f2', 'f3'))) + + self.assertEqual(set_hyperparameter.value_to_json_structure(set_hyperparameter.get_default()), []) + self.assertEqual(set_hyperparameter.value_to_json_structure(set_hyperparameter.sample(42)), ['f2', 'f3']) + + self.assertEqual(set_hyperparameter.value_from_json_structure(set_hyperparameter.value_to_json_structure(set_hyperparameter.get_default())), set_hyperparameter.get_default()) + self.assertEqual(set_hyperparameter.value_from_json_structure(set_hyperparameter.value_to_json_structure(set_hyperparameter.sample(42))), set_hyperparameter.sample(42)) + + set_hyperparameter = hyperparams.SortedSet(metafeature, (), 0, None) + + self.assertEqual(set_hyperparameter.get_default(), ()) + self.assertEqual(set_hyperparameter.sample(42), ('f2', 'f3')) + self.assertEqual(set_hyperparameter.get_max_samples(), 8) + self.assertEqual(set_hyperparameter.sample_multiple(0, 3, 42), (('f1', 'f2', 'f3'), ('f2', 'f3'))) + + set_hyperparameter = hyperparams.SortedSet(hyperparams.Hyperparameter[int](0), (0, 1), min_size=2, max_size=2) + + with self.assertLogs(hyperparams.logger) as cm: + self.assertEqual(set_hyperparameter.sample_multiple(1, 1, 42), ((0, 1),)) + self.assertEqual(len(cm.records), 1) + + with self.assertLogs(hyperparams.logger) as cm: + self.assertEqual(set_hyperparameter.sample(42), (0, 1)) + self.assertEqual(len(cm.records), 1) + + set_hyperparameter = hyperparams.SortedSet(hyperparams.Hyperparameter[int](0), (0,), min_size=1, max_size=1) + + with self.assertLogs(hyperparams.logger) as cm: + set_hyperparameter.sample(42) + self.assertEqual(len(cm.records), 1) + + set_hyperparameter = hyperparams.SortedSet(hyperparams.Uniform(0.0, 100.0, 50.0, lower_inclusive=False, upper_inclusive=False), (25.0, 75.0), min_size=2, max_size=2) + + self.assertEqual(set_hyperparameter.sample(42), (37.454011884736246, 95.07143064099162)) + + def test_sorted_set_with_hyperparams(self): + elements = hyperparams.Hyperparams.define(OrderedDict( + range=hyperparams.UniformInt(1, 10, 2), + enum=hyperparams.Enumeration(['a', 'b', 'c', 'd', 'e'], 'e'), + )) + + with self.assertRaises(exceptions.NotSupportedError): + hyperparams.SortedSet(elements, (elements(range=2, enum='a'),), 0, 5) + + def test_list(self): + list_hyperparameter = hyperparams.List(hyperparams.Hyperparameter[int](1), [], 0, 1) + with self.assertLogs(hyperparams.logger) as cm: + self.assertEqual(set(list_hyperparameter.sample_multiple(min_samples=2, max_samples=2)), {(1,), ()}) + self.assertEqual(len(cm.records), 1) + + elements = hyperparams.Enumeration(['a', 'b', 1, 2, None], None) + list_hyperparameter = hyperparams.List(elements, ('a', 'b', 1, 2, None), 5, 5) + + self.assertEqual(list_hyperparameter.get_default(), ('a', 'b', 1, 2, None)) + self.assertEqual(list_hyperparameter.sample(45), (2, 2, None, 'a', 2)) + self.assertEqual(list_hyperparameter.get_max_samples(), 3125) + self.assertEqual(list_hyperparameter.sample_multiple(1, 1, 42), ((2, None, 1, None, 
None),)) + self.assertEqual(list_hyperparameter.sample_multiple(0, 1, 42), ()) + + self.maxDiff = None + + self.assertEqual(list_hyperparameter.to_simple_structure(), { + 'default': ('a', 'b', 1, 2, None), + 'semantic_types': [], + 'structural_type': typing.Sequence[typing.Union[str, int, type(None)]], + 'type': hyperparams.List, + 'min_size': 5, + 'max_size': 5, + 'elements': { + 'default': None, + 'semantic_types': [], + 'structural_type': typing.Union[str, int, type(None)], + 'type': hyperparams.Enumeration, + 'values': ['a', 'b', 1, 2, None], + }, + 'is_configuration': False, + }) + + self.assertEqual(list_hyperparameter.value_to_json_structure(list_hyperparameter.get_default()), ['a', 'b', 1, 2, None]) + self.assertEqual(list_hyperparameter.value_to_json_structure(list_hyperparameter.sample(45)), [2, 2, None, 'a', 2]) + + self.assertEqual(list_hyperparameter.value_from_json_structure(list_hyperparameter.value_to_json_structure(list_hyperparameter.get_default())), list_hyperparameter.get_default()) + self.assertEqual(list_hyperparameter.value_from_json_structure(list_hyperparameter.value_to_json_structure(list_hyperparameter.sample(45))), list_hyperparameter.sample(45)) + + with self.assertRaisesRegex(ValueError, 'Value \'.*\' has less than 5 elements'): + elements = hyperparams.Enumeration(['a', 'b', 1, 2, None], None) + hyperparams.List(elements, (), 5, 5) + + with self.assertRaisesRegex(ValueError, 'Value \'.*\' is not among values'): + elements = hyperparams.Enumeration(['a', 'b', 1, 2, None], None) + hyperparams.List(elements, ('a', 'b', 1, 2, 3), 5, 5) + + list_hyperparameter.contribute_to_class('foo') + + with self.assertRaises(KeyError): + list_hyperparameter.get_default('foo') + + list_of_supported_metafeatures = ['f1', 'f2', 'f3'] + metafeature = hyperparams.Enumeration(list_of_supported_metafeatures, list_of_supported_metafeatures[0], semantic_types=['https://metadata.datadrivendiscovery.org/types/MetafeatureParameter']) + list_hyperparameter = hyperparams.List(metafeature, (), 0, 3) + + self.assertEqual(list_hyperparameter.get_default(), ()) + self.assertEqual(list_hyperparameter.sample(42), ('f1', 'f3')) + self.assertEqual(list_hyperparameter.get_max_samples(), 40) + self.assertEqual(list_hyperparameter.sample_multiple(0, 3, 42), (('f1', 'f3', 'f3'), ('f1', 'f1', 'f3'))) + + self.assertEqual(list_hyperparameter.value_to_json_structure(list_hyperparameter.get_default()), []) + self.assertEqual(list_hyperparameter.value_to_json_structure(list_hyperparameter.sample(42)), ['f1', 'f3']) + + self.assertEqual(list_hyperparameter.value_from_json_structure(list_hyperparameter.value_to_json_structure(list_hyperparameter.get_default())), list_hyperparameter.get_default()) + self.assertEqual(list_hyperparameter.value_from_json_structure(list_hyperparameter.value_to_json_structure(list_hyperparameter.sample(42))), list_hyperparameter.sample(42)) + + list_hyperparameter = hyperparams.List(metafeature, (), 0, 10) + + self.assertEqual(list_hyperparameter.get_default(), ()) + self.assertEqual(list_hyperparameter.sample(42), ('f1', 'f3', 'f3', 'f1', 'f1', 'f3')) + self.assertEqual(list_hyperparameter.get_max_samples(), 88573) + self.assertEqual(list_hyperparameter.sample_multiple(0, 3, 42), (('f1', 'f3', 'f3'), ('f1', 'f1', 'f3', 'f2', 'f3', 'f3', 'f3'))) + + list_hyperparameter = hyperparams.List(hyperparams.Bounded(1, None, 100), (100,), min_size=1, max_size=None) + + with self.assertLogs(hyperparams.logger) as cm: + self.assertEqual(list_hyperparameter.sample(42), (100,)) + 
self.assertEqual(len(cm.records), 1) + + def test_list_with_hyperparams(self): + elements = hyperparams.Hyperparams.define(OrderedDict( + range=hyperparams.UniformInt(1, 10, 2), + enum=hyperparams.Enumeration(['a', 'b', 1, 2, None], None), + )) + list_hyperparameter = hyperparams.List(elements, (elements(range=2, enum='a'),), 0, 5) + + self.assertEqual(list_hyperparameter.get_default(), ({'range': 2, 'enum': 'a'},)) + self.assertEqual(list_hyperparameter.sample(45), ({'range': 4, 'enum': None}, {'range': 1, 'enum': 2}, {'range': 5, 'enum': 'b'})) + self.assertEqual(list_hyperparameter.get_max_samples(), 188721946) + self.assertEqual(list_hyperparameter.sample_multiple(1, 1, 42), (({'range': 8, 'enum': None}, {'range': 5, 'enum': 'b'}, {'range': 3, 'enum': 1}),)) + self.assertEqual(list_hyperparameter.sample_multiple(0, 1, 42), ()) + self.maxDiff = None + + self.assertEqual(list_hyperparameter.to_simple_structure(), { + 'default': ({'range': 2, 'enum': 'a'},), + 'elements': { + 'enum': { + 'default': None, + 'semantic_types': [], + 'structural_type': typing.Union[str, int, type(None)], + 'type': hyperparams.Enumeration, + 'values': ['a', 'b', 1, 2, None], + }, + 'range': { + 'default': 2, + 'lower': 1, + 'semantic_types': [], + 'structural_type': int, + 'type': hyperparams.UniformInt, + 'upper': 10, + 'lower_inclusive': True, + 'upper_inclusive': False, + }, + }, + 'is_configuration': True, + 'max_size': 5, + 'min_size': 0, + 'semantic_types': [], + 'structural_type': typing.Sequence[elements], + 'type': hyperparams.List, + }) + + self.assertEqual(list_hyperparameter.value_to_json_structure(list_hyperparameter.get_default()), [{'range': 2, 'enum': 'a'}]) + self.assertEqual(list_hyperparameter.value_to_json_structure(list_hyperparameter.sample(45)), [{'range': 4, 'enum': None}, {'range': 1, 'enum': 2}, {'range': 5, 'enum': 'b'}]) + + self.assertEqual(list_hyperparameter.value_from_json_structure(list_hyperparameter.value_to_json_structure(list_hyperparameter.get_default())), list_hyperparameter.get_default()) + self.assertEqual(list_hyperparameter.value_from_json_structure(list_hyperparameter.value_to_json_structure(list_hyperparameter.sample(45))), list_hyperparameter.sample(45)) + + # We have to explicitly disable setting names if we want to use it for "List" hyper-parameter. 
+ class ListHyperparams(hyperparams.Hyperparams, set_names=False): + choice = hyperparams.Choice({ + 'none': hyperparams.Hyperparams, + 'range': hyperparams.Hyperparams.define(OrderedDict( + value=hyperparams.UniformInt(1, 10, 2), + )), + }, 'none') + + class TestHyperparams(hyperparams.Hyperparams): + a = list_hyperparameter + b = hyperparams.List(ListHyperparams, (ListHyperparams({'choice': {'choice': 'none'}}),), 0, 3) + + self.assertEqual(TestHyperparams.to_simple_structure(), { + 'a': { + 'type': hyperparams.List, + 'default': ({'range': 2, 'enum': 'a'},), + 'structural_type': typing.Sequence[elements], + 'semantic_types': [], + 'elements': { + 'range': { + 'type': hyperparams.UniformInt, + 'default': 2, + 'structural_type': int, + 'semantic_types': [], + 'lower': 1, + 'upper': 10, + 'lower_inclusive': True, + 'upper_inclusive': False, + }, + 'enum': { + 'type': hyperparams.Enumeration, + 'default': None, + 'structural_type': typing.Union[str, int, type(None)], + 'semantic_types': [], + 'values': ['a', 'b', 1, 2, None], + }, + }, + 'is_configuration': True, + 'min_size': 0, + 'max_size': 5, + }, + 'b': { + 'type': hyperparams.List, + 'default': ({'choice': {'choice': 'none'}},), + 'structural_type': typing.Sequence[ListHyperparams], + 'semantic_types': [], + 'elements': { + 'choice': { + 'type': hyperparams.Choice, + 'default': {'choice': 'none'}, + 'structural_type': typing.Dict, + 'semantic_types': [], + 'choices': { + 'none': { + 'choice': { + 'type': hyperparams.Hyperparameter, + 'default': 'none', + 'structural_type': str, + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/ChoiceParameter'], + }, + }, + 'range': { + 'value': { + 'type': hyperparams.UniformInt, + 'default': 2, + 'structural_type': int, + 'semantic_types': [], + 'lower': 1, + 'upper': 10, + 'lower_inclusive': True, + 'upper_inclusive': False, + }, + 'choice': { + 'type': hyperparams.Hyperparameter, + 'default': 'range', + 'structural_type': str, + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/ChoiceParameter'], + }, + }, + }, + }, + }, + 'is_configuration': True, + 'min_size': 0, + 'max_size': 3, + }, + }) + + self.assertEqual(TestHyperparams.configuration['b'].elements.configuration['choice'].choices['range'].configuration['value'].name, 'b.choice.range.value') + + self.assertEqual(TestHyperparams.defaults(), { + 'a': ({'range': 2, 'enum': 'a'},), + 'b': ({'choice': {'choice': 'none'}},), + }) + self.assertTrue(utils.is_instance(TestHyperparams.defaults()['a'], typing.Sequence[elements])) + self.assertTrue(utils.is_instance(TestHyperparams.defaults()['b'], typing.Sequence[ListHyperparams])) + + with self.assertLogs(hyperparams.logger) as cm: + self.assertEqual(TestHyperparams.sample(42), { + 'a': ({'range': 8, 'enum': None}, {'range': 5, 'enum': 'b'}, {'range': 3, 'enum': 1}), + 'b': ( + { + 'choice': {'value': 5, 'choice': 'range'}, + }, { + 'choice': {'value': 8, 'choice': 'range'}, + }, + ), + }) + self.assertEqual(len(cm.records), 1) + + with self.assertLogs(hyperparams.logger) as cm: + self.assertEqual(TestHyperparams.sample(42).values_to_json_structure(), { + 'a': [{'range': 8, 'enum': None}, {'range': 5, 'enum': 'b'}, {'range': 3, 'enum': 1}], + 'b': [ + { + 'choice': {'value': 5, 'choice': 'range'}, + }, { + 'choice': {'value': 8, 'choice': 'range'}, + }, + ], + }) + self.assertEqual(len(cm.records), 1) + with self.assertLogs(hyperparams.logger) as cm: + self.assertEqual(TestHyperparams.values_from_json_structure(TestHyperparams.sample(42).values_to_json_structure()), 
TestHyperparams.sample(42)) + self.assertEqual(len(cm.records), 1) + + self.assertEqual(len(list(TestHyperparams.traverse())), 8) + + self.assertEqual(TestHyperparams.defaults('a'), ({'range': 2, 'enum': 'a'},)) + self.assertEqual(TestHyperparams.defaults('a.range'), 2) + # Default of a whole "List" hyper-parameter can be different than of nested hyper-parameters. + self.assertEqual(TestHyperparams.defaults('a.enum'), None) + self.assertEqual(TestHyperparams.defaults('b'), ({'choice': {'choice': 'none'}},)) + self.assertEqual(TestHyperparams.defaults('b.choice'), {'choice': 'none'}) + self.assertEqual(TestHyperparams.defaults('b.choice.none'), {'choice': 'none'}) + self.assertEqual(TestHyperparams.defaults('b.choice.none.choice'), 'none') + self.assertEqual(TestHyperparams.defaults('b.choice.range'), {'choice': 'range', 'value': 2}) + self.assertEqual(TestHyperparams.defaults('b.choice.range.value'), 2) + self.assertEqual(TestHyperparams.defaults('b.choice.range.choice'), 'range') + + self.assertEqual(TestHyperparams(TestHyperparams.defaults(), b=( + ListHyperparams({ + 'choice': {'value': 5, 'choice': 'range'}, + }), + ListHyperparams({ + 'choice': {'value': 8, 'choice': 'range'}, + }), + )), { + 'a': ({'range': 2, 'enum': 'a'},), + 'b': ( + { + 'choice': {'value': 5, 'choice': 'range'}, + }, + { + 'choice': {'value': 8, 'choice': 'range'}, + }, + ), + }) + self.assertEqual(TestHyperparams(TestHyperparams.defaults(), **{'a': ( + elements({'range': 8, 'enum': None}), + elements({'range': 5, 'enum': 'b'}), + elements({'range': 3, 'enum': 1}), + )}), { + 'a': ( + {'range': 8, 'enum': None}, + {'range': 5, 'enum': 'b'}, + {'range': 3, 'enum': 1}, + ), + 'b': ({'choice': {'choice': 'none'}},) + }) + + self.assertEqual(TestHyperparams.defaults().replace({'a': ( + elements({'range': 8, 'enum': None}), + elements({'range': 5, 'enum': 'b'}), + elements({'range': 3, 'enum': 1}), + )}), { + 'a': ( + {'range': 8, 'enum': None}, + {'range': 5, 'enum': 'b'}, + {'range': 3, 'enum': 1}, + ), + 'b': ({'choice': {'choice': 'none'}},), + }) + + def test_sorted_list(self): + list_hyperparameter = hyperparams.SortedList(hyperparams.Hyperparameter[int](1), [], 0, 1) + with self.assertLogs(hyperparams.logger) as cm: + self.assertEqual(set(list_hyperparameter.sample_multiple(min_samples=2, max_samples=2)), {(1,), ()}) + self.assertEqual(len(cm.records), 1) + + elements = hyperparams.Enumeration(['a', 'b', 'c', 'd', 'e'], 'e') + list_hyperparameter = hyperparams.SortedList(elements, ('a', 'b', 'c', 'd', 'e'), 5, 5) + + self.assertEqual(list_hyperparameter.get_default(), ('a', 'b', 'c', 'd', 'e')) + self.assertEqual(list_hyperparameter.sample(45), ('a', 'd', 'd', 'd', 'e')) + self.assertEqual(list_hyperparameter.get_max_samples(), 126) + self.assertEqual(list_hyperparameter.sample_multiple(1, 1, 42), (('c', 'd', 'e', 'e', 'e'),)) + self.assertEqual(list_hyperparameter.sample_multiple(0, 1, 42), ()) + + self.maxDiff = None + + self.assertEqual(list_hyperparameter.to_simple_structure(), { + 'default': ('a', 'b', 'c', 'd', 'e'), + 'semantic_types': [], + 'structural_type': typing.Sequence[str], + 'type': hyperparams.SortedList, + 'min_size': 5, + 'max_size': 5, + 'elements': { + 'default': 'e', + 'semantic_types': [], + 'structural_type': str, + 'type': hyperparams.Enumeration, + 'values': ['a', 'b', 'c', 'd', 'e'], + }, + 'ascending': True, + }) + + self.assertEqual(list_hyperparameter.value_to_json_structure(list_hyperparameter.get_default()), ['a', 'b', 'c', 'd', 'e']) + 
self.assertEqual(list_hyperparameter.value_to_json_structure(list_hyperparameter.sample(45)), ['a', 'd', 'd', 'd', 'e']) + + self.assertEqual(list_hyperparameter.value_from_json_structure(list_hyperparameter.value_to_json_structure(list_hyperparameter.get_default())), list_hyperparameter.get_default()) + self.assertEqual(list_hyperparameter.value_from_json_structure(list_hyperparameter.value_to_json_structure(list_hyperparameter.sample(45))), list_hyperparameter.sample(45)) + + with self.assertRaisesRegex(ValueError, 'Value \'.*\' has less than 5 elements'): + elements = hyperparams.Enumeration(['a', 'b', 1, 2, None], None) + hyperparams.SortedList(elements, (), 5, 5) + + with self.assertRaisesRegex(ValueError, 'Value \'.*\' is not among values'): + elements = hyperparams.Enumeration(['a', 'b', 1, 2, None], None) + hyperparams.SortedList(elements, ('a', 'b', 1, 2, 3), 5, 5) + + list_hyperparameter.contribute_to_class('foo') + + with self.assertRaises(KeyError): + list_hyperparameter.get_default('foo') + + list_of_supported_metafeatures = ['f1', 'f2', 'f3'] + metafeature = hyperparams.Enumeration(list_of_supported_metafeatures, list_of_supported_metafeatures[0], semantic_types=['https://metadata.datadrivendiscovery.org/types/MetafeatureParameter']) + list_hyperparameter = hyperparams.SortedList(metafeature, (), 0, 3) + + self.assertEqual(list_hyperparameter.get_default(), ()) + self.assertEqual(list_hyperparameter.sample(42), ('f1', 'f3')) + self.assertEqual(list_hyperparameter.get_max_samples(), 20) + self.assertEqual(list_hyperparameter.sample_multiple(0, 3, 42), (('f1', 'f3', 'f3'), ('f1', 'f1', 'f3'))) + + self.assertEqual(list_hyperparameter.value_to_json_structure(list_hyperparameter.get_default()), []) + self.assertEqual(list_hyperparameter.value_to_json_structure(list_hyperparameter.sample(42)), ['f1', 'f3']) + + self.assertEqual(list_hyperparameter.value_from_json_structure(list_hyperparameter.value_to_json_structure(list_hyperparameter.get_default())), list_hyperparameter.get_default()) + self.assertEqual(list_hyperparameter.value_from_json_structure(list_hyperparameter.value_to_json_structure(list_hyperparameter.sample(42))), list_hyperparameter.sample(42)) + + list_hyperparameter = hyperparams.SortedList(metafeature, (), 0, 10) + + self.assertEqual(list_hyperparameter.get_default(), ()) + self.assertEqual(list_hyperparameter.sample(42), ('f1', 'f1', 'f1', 'f3', 'f3', 'f3')) + self.assertEqual(list_hyperparameter.get_max_samples(), 286) + self.assertEqual(list_hyperparameter.sample_multiple(0, 3, 42), (('f1', 'f3', 'f3'), ('f1', 'f1', 'f2', 'f3', 'f3', 'f3', 'f3'))) + + list_hyperparameter = hyperparams.SortedList(hyperparams.Bounded[int](1, None, 1), (1, 1), min_size=2, max_size=2) + + with self.assertLogs(hyperparams.logger) as cm: + self.assertEqual(list_hyperparameter.sample(42), (1, 1)) + self.assertEqual(len(cm.records), 1) + + def test_sorted_list_with_hyperparams(self): + elements = hyperparams.Hyperparams.define(OrderedDict( + range=hyperparams.UniformInt(1, 10, 2), + enum=hyperparams.Enumeration(['a', 'b', 'c', 'd', 'e'], 'e'), + )) + + with self.assertRaises(exceptions.NotSupportedError): + hyperparams.SortedList(elements, (elements(range=2, enum='a'),), 0, 5) + + def test_import_cycle(self): + # All references to "hyperparams_module" in "d3m.metadata.base" should be lazy: + # for example, as a string in the typing signature, because we have an import cycle. 
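+ # A hypothetical sketch of that pattern (the exact signatures in
+ # "d3m.metadata.base" may differ): instead of an eager reference such as
+ #
+ #     from d3m.metadata import hyperparams as hyperparams_module
+ #     def get_hyperparams(self) -> hyperparams_module.Hyperparams: ...
+ #
+ # the annotation is written as a string, so it is only resolved on demand:
+ #
+ #     def get_hyperparams(self) -> 'hyperparams_module.Hyperparams': ...
+ #
+ # The two subprocess calls below check that each module still imports cleanly on its own.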
+ subprocess.run([sys.executable, '-c', 'import d3m.metadata.base'], check=True) + subprocess.run([sys.executable, '-c', 'import d3m.metadata.hyperparams'], check=True) + + def test_union_float_int(self): + float_hp = hyperparams.Uniform(1, 10, 2) + int_hp = hyperparams.UniformInt(1, 10, 2) + + x = float_hp.value_from_json_structure(2.0) + self.assertEqual(x, 2.0) + self.assertIs(type(x), float) + + x = float_hp.value_from_json_structure(2) + self.assertEqual(x, 2.0) + self.assertIs(type(x), float) + + x = float_hp.value_from_json_structure(2.1) + self.assertEqual(x, 2.1) + self.assertIs(type(x), float) + + x = int_hp.value_from_json_structure(2.0) + self.assertEqual(x, 2) + self.assertIs(type(x), int) + + x = int_hp.value_from_json_structure(2) + self.assertEqual(x, 2) + self.assertIs(type(x), int) + + with self.assertRaises(exceptions.InvalidArgumentTypeError): + int_hp.value_from_json_structure(2.1) + + hyperparameter = hyperparams.Union( + OrderedDict( + float=hyperparams.Uniform(1, 5, 2), + int=hyperparams.UniformInt(6, 10, 7), + ), + 'float', + ) + + self.assertEqual(hyperparameter.value_to_json_structure(2.0), {'case': 'float', 'value': 2.0}) + self.assertEqual(hyperparameter.value_to_json_structure(7), {'case': 'int', 'value': 7}) + + x = hyperparameter.value_from_json_structure({'case': 'float', 'value': 2.0}) + self.assertEqual(x, 2.0) + self.assertIs(type(x), float) + + x = hyperparameter.value_from_json_structure({'case': 'float', 'value': 2.1}) + self.assertEqual(x, 2.1) + self.assertIs(type(x), float) + + x = hyperparameter.value_from_json_structure({'case': 'float', 'value': 2}) + self.assertEqual(x, 2.0) + self.assertIs(type(x), float) + + x = hyperparameter.value_from_json_structure({'case': 'int', 'value': 7}) + self.assertEqual(x, 7) + self.assertIs(type(x), int) + + x = hyperparameter.value_from_json_structure({'case': 'int', 'value': 7.0}) + self.assertEqual(x, 7) + self.assertIs(type(x), int) + + def test_can_serialize_to_json(self): + # See: https://gitlab.com/datadrivendiscovery/d3m/-/issues/440 + # This is enumeration internally so it tests also that enumeration values are kept as-is when sampled. 
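+ # UniformBool is effectively an enumeration over {True, False}, so a sampled value
+ # should come back as a plain Python bool (and not, presumably, a NumPy bool, which
+ # json.dumps would reject); the isinstance and json.dumps checks below verify that
+ # round-trip. Illustrative usage, not additional test code:
+ #
+ #     hp = hyperparams.UniformBool(True)
+ #     isinstance(hp.sample(), bool)  # expected to hold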
+ hyperparameter = hyperparams.UniformBool(True) + sample = hyperparameter.sample() + self.assertIsInstance(sample, bool) + x = hyperparameter.value_to_json_structure(sample) + json.dumps(x) + + def test_sampling_type(self): + sample = hyperparams.Uniform(0, 10, 5).sample() + self.assertIsInstance(sample, float) + + def test_numpy_sampling(self): + class UniformInt64(hyperparams.Bounded[numpy.int64]): + def __init__( + self, lower: numpy.int64, upper: numpy.int64, default: numpy.int64, *, lower_inclusive: bool = True, upper_inclusive: bool = False, + semantic_types: typing.Sequence[str] = None, description: str = None, + ) -> None: + if lower is None or upper is None: + raise exceptions.InvalidArgumentValueError("Bounds cannot be None.") + + super().__init__(lower, upper, default, lower_inclusive=lower_inclusive, upper_inclusive=upper_inclusive, semantic_types=semantic_types, description=description) + + def _initialize_effective_bounds(self) -> None: + self._initialize_effective_bounds_int() + + super()._initialize_effective_bounds() + + def sample(self, random_state: numpy.random.RandomState = None) -> int: + random_state = sklearn_validation.check_random_state(random_state) + + return self.structural_type(random_state.randint(self._effective_lower, self._effective_upper)) + + def get_max_samples(self) -> typing.Optional[int]: + return self._effective_upper - self._effective_lower + + with self.assertRaises(exceptions.InvalidArgumentTypeError): + UniformInt64(0, 10, 5) + + hyperparameter = UniformInt64(numpy.int64(0), numpy.int64(10), numpy.int64(5)) + sample = hyperparameter.sample() + self.assertIsInstance(sample, numpy.int64) + x = hyperparameter.value_to_json_structure(sample) + json.dumps(x) + + +if __name__ == '__main__': + unittest.main() diff --git a/d3m/tests/test_increment.py b/d3m/tests/test_increment.py new file mode 100644 index 0000000..a387e3a --- /dev/null +++ b/d3m/tests/test_increment.py @@ -0,0 +1,256 @@ +import json +import unittest +import os.path +import sys + +import numpy + +import d3m +from d3m import container, utils +from d3m.metadata import base + +TEST_PRIMITIVES_DIR = os.path.join(os.path.dirname(__file__), 'data', 'primitives') + +sys.path.insert(0, TEST_PRIMITIVES_DIR) + +from test_primitives.increment import IncrementPrimitive + + +EXPECTED_PRIMITIVE_DESCRIPTION_JSON = r""" +{ + "id": "5c9d5acf-7754-420f-a49f-90f4d9d0d694", + "version": "0.1.0", + "digest": "__DIGEST__", + "name": "Increment Values", + "keywords": [ + "test primitive" + ], + "source": { + "name": "Test team", + "contact": "mailto:author@example.com", + "uris": [ + "https://gitlab.com/datadrivendiscovery/tests-data/blob/master/primitives/test_primitives/increment.py", + "https://gitlab.com/datadrivendiscovery/tests-data.git" + ] + }, + "installation": [ + { + "type": "PIP", + "package_uri": "git+https://gitlab.com/datadrivendiscovery/tests-data.git@__GIT_COMMIT__#egg=test_primitives&subdirectory=primitives" + } + ], + "location_uris": [ + "https://gitlab.com/datadrivendiscovery/tests-data/raw/__GIT_COMMIT__/primitives/test_primitives/increment.py" + ], + "python_path": "d3m.primitives.operator.increment.Test", + "algorithm_types": [ + "COMPUTER_ALGEBRA" + ], + "primitive_family": "OPERATOR", + "preconditions": [ + "NO_MISSING_VALUES", + "NO_CATEGORICAL_VALUES" + ], + "schema": "https://metadata.datadrivendiscovery.org/schemas/v0/primitive.json", + "original_python_path": "test_primitives.increment.IncrementPrimitive", + "primitive_code": { + "class_type_arguments": { + "Inputs": 
"d3m.container.pandas.DataFrame", + "Outputs": "d3m.container.pandas.DataFrame", + "Hyperparams": "test_primitives.increment.Hyperparams", + "Params": "NoneType" + }, + "interfaces_version": "__INTERFACES_VERSION__", + "interfaces": [ + "transformer.TransformerPrimitiveBase", + "base.PrimitiveBase" + ], + "hyperparams": { + "amount": { + "type": "d3m.metadata.hyperparams.Hyperparameter", + "default": 1, + "structural_type": "float", + "semantic_types": [ + "https://metadata.datadrivendiscovery.org/types/ControlParameter" + ] + } + }, + "arguments": { + "hyperparams": { + "type": "test_primitives.increment.Hyperparams", + "kind": "RUNTIME" + }, + "timeout": { + "type": "typing.Union[NoneType, float]", + "kind": "RUNTIME", + "default": null + }, + "iterations": { + "type": "typing.Union[NoneType, int]", + "kind": "RUNTIME", + "default": null + }, + "produce_methods": { + "type": "typing.Sequence[str]", + "kind": "RUNTIME" + }, + "inputs": { + "type": "d3m.container.pandas.DataFrame", + "kind": "PIPELINE" + }, + "params": { + "type": "NoneType", + "kind": "RUNTIME" + } + }, + "class_methods": {}, + "instance_methods": { + "__init__": { + "kind": "OTHER", + "arguments": [ + "hyperparams" + ], + "returns": "NoneType" + }, + "fit": { + "kind": "OTHER", + "arguments": [ + "timeout", + "iterations" + ], + "returns": "d3m.primitive_interfaces.base.CallResult[NoneType]", + "description": "A noop.\n\nParameters\n----------\ntimeout:\n A maximum time this primitive should be fitting during this method call, in seconds.\niterations:\n How many of internal iterations should the primitive do.\n\nReturns\n-------\nA ``CallResult`` with ``None`` value." + }, + "fit_multi_produce": { + "kind": "OTHER", + "arguments": [ + "produce_methods", + "inputs", + "timeout", + "iterations" + ], + "returns": "d3m.primitive_interfaces.base.MultiCallResult", + "description": "A method calling ``fit`` and after that multiple produce methods at once.\n\nParameters\n----------\nproduce_methods:\n A list of names of produce methods to call.\ninputs:\n The inputs given to all produce methods.\ntimeout:\n A maximum time this primitive should take to both fit the primitive and produce outputs\n for all produce methods listed in ``produce_methods`` argument, in seconds.\niterations:\n How many of internal iterations should the primitive do for both fitting and producing\n outputs of all produce methods.\n\nReturns\n-------\nA dict of values for each produce method wrapped inside ``MultiCallResult``." + }, + "get_params": { + "kind": "OTHER", + "arguments": [], + "returns": "NoneType", + "description": "A noop.\n\nReturns\n-------\nAn instance of parameters." + }, + "multi_produce": { + "kind": "OTHER", + "arguments": [ + "produce_methods", + "inputs", + "timeout", + "iterations" + ], + "returns": "d3m.primitive_interfaces.base.MultiCallResult", + "description": "A method calling multiple produce methods at once.\n\nWhen a primitive has multiple produce methods it is common that they might compute the\nsame internal results for same inputs but return different representations of those results.\nIf caller is interested in multiple of those representations, calling multiple produce\nmethods might lead to recomputing same internal results multiple times. 
To address this,\nthis method allows primitive author to implement an optimized version which computes\ninternal results only once for multiple calls of produce methods, but return those different\nrepresentations.\n\nIf any additional method arguments are added to primitive's produce method(s), they have\nto be added to this method as well. This method should accept an union of all arguments\naccepted by primitive's produce method(s) and then use them accordingly when computing\nresults.\n\nThe default implementation of this method just calls all produce methods listed in\n``produce_methods`` in order and is potentially inefficient.\n\nIf primitive should have been fitted before calling this method, but it has not been,\nprimitive should raise a ``PrimitiveNotFittedError`` exception.\n\nParameters\n----------\nproduce_methods:\n A list of names of produce methods to call.\ninputs:\n The inputs given to all produce methods.\ntimeout:\n A maximum time this primitive should take to produce outputs for all produce methods\n listed in ``produce_methods`` argument, in seconds.\niterations:\n How many of internal iterations should the primitive do.\n\nReturns\n-------\nA dict of values for each produce method wrapped inside ``MultiCallResult``." + }, + "produce": { + "kind": "PRODUCE", + "arguments": [ + "inputs", + "timeout", + "iterations" + ], + "returns": "d3m.primitive_interfaces.base.CallResult[d3m.container.pandas.DataFrame]", + "singleton": false, + "inputs_across_samples": [], + "description": "Produce primitive's best choice of the output for each of the inputs.\n\nThe output value should be wrapped inside ``CallResult`` object before returning.\n\nIn many cases producing an output is a quick operation in comparison with ``fit``, but not\nall cases are like that. For example, a primitive can start a potentially long optimization\nprocess to compute outputs. ``timeout`` and ``iterations`` can serve as a way for a caller\nto guide the length of this process.\n\nIdeally, a primitive should adapt its call to try to produce the best outputs possible\ninside the time allocated. If this is not possible and the primitive reaches the timeout\nbefore producing outputs, it should raise a ``TimeoutError`` exception to signal that the\ncall was unsuccessful in the given time. The state of the primitive after the exception\nshould be as the method call has never happened and primitive should continue to operate\nnormally. The purpose of ``timeout`` is to give opportunity to a primitive to cleanly\nmanage its state instead of interrupting execution from outside. Maintaining stable internal\nstate should have precedence over respecting the ``timeout`` (caller can terminate the\nmisbehaving primitive from outside anyway). If a longer ``timeout`` would produce\ndifferent outputs, then ``CallResult``'s ``has_finished`` should be set to ``False``.\n\nSome primitives have internal iterations (for example, optimization iterations).\nFor those, caller can provide how many of primitive's internal iterations\nshould a primitive do before returning outputs. Primitives should make iterations as\nsmall as reasonable. If ``iterations`` is ``None``, then there is no limit on\nhow many iterations the primitive should do and primitive should choose the best amount\nof iterations on its own (potentially controlled through hyper-parameters).\nIf ``iterations`` is a number, a primitive has to do those number of iterations,\nif possible. 
``timeout`` should still be respected and potentially less iterations\ncan be done because of that. Primitives with internal iterations should make\n``CallResult`` contain correct values.\n\nFor primitives which do not have internal iterations, any value of ``iterations``\nmeans that they should run fully, respecting only ``timeout``.\n\nIf primitive should have been fitted before calling this method, but it has not been,\nprimitive should raise a ``PrimitiveNotFittedError`` exception.\n\nParameters\n----------\ninputs:\n The inputs of shape [num_inputs, ...].\ntimeout:\n A maximum time this primitive should take to produce outputs during this method call, in seconds.\niterations:\n How many of internal iterations should the primitive do.\n\nReturns\n-------\nThe outputs of shape [num_inputs, ...] wrapped inside ``CallResult``." + }, + "set_params": { + "kind": "OTHER", + "arguments": [ + "params" + ], + "returns": "NoneType", + "description": "A noop.\n\nParameters\n----------\nparams:\n An instance of parameters." + }, + "set_training_data": { + "kind": "OTHER", + "arguments": [], + "returns": "NoneType", + "description": "A noop.\n\nParameters\n----------" + } + }, + "class_attributes": { + "logger": "logging.Logger", + "metadata": "d3m.metadata.base.PrimitiveMetadata" + }, + "instance_attributes": { + "hyperparams": "d3m.metadata.hyperparams.Hyperparams", + "random_seed": "int", + "docker_containers": "typing.Dict[str, d3m.primitive_interfaces.base.DockerContainer]", + "volumes": "typing.Dict[str, str]", + "temporary_directory": "typing.Union[NoneType, str]" + } + }, + "structural_type": "test_primitives.increment.IncrementPrimitive", + "description": "A primitive which increments each value by a fixed amount, by default 1.\n\nAttributes\n----------\nmetadata:\n Primitive's metadata. Available as a class attribute.\nlogger:\n Primitive's logger. Available as a class attribute.\nhyperparams:\n Hyperparams passed to the constructor.\nrandom_seed:\n Random seed passed to the constructor.\ndocker_containers:\n A dict mapping Docker image keys from primitive's metadata to (named) tuples containing\n container's address under which the container is accessible by the primitive, and a\n dict mapping exposed ports to ports on that address.\nvolumes:\n A dict mapping volume keys from primitive's metadata to file and directory paths\n where downloaded and extracted files are available to the primitive.\ntemporary_directory:\n An absolute path to a temporary directory a primitive can use to store any files\n for the duration of the current pipeline run phase. Directory is automatically\n cleaned up after the current pipeline run phase finishes." +} +""".replace('__INTERFACES_VERSION__', d3m.__version__).replace('__GIT_COMMIT__', utils.current_git_commit(TEST_PRIMITIVES_DIR)).replace('__DIGEST__', IncrementPrimitive.metadata.query()['digest']) + + +class TestIncrementPrimitive(unittest.TestCase): + def call_primitive(self, primitive, method_name, **kwargs): + return getattr(primitive, method_name)(**kwargs) + + def test_basic(self): + hyperparams_class = IncrementPrimitive.metadata.get_hyperparams() + + primitive = IncrementPrimitive(hyperparams=hyperparams_class.defaults()) + + inputs = container.DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]], { + # Custom metadata. 
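+ # (Illustrative note: the final assertion of this test checks that this custom,
+ # non-schema key survives the produce call and is still queryable on the output
+ # DataFrame's metadata.)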
+ 'foo': 'bar', + }, generate_metadata=True) + + call_metadata = self.call_primitive(primitive, 'produce', inputs=inputs) + + self.assertTrue(call_metadata.value.equals(container.DataFrame([[2.0, 3.0, 4.0, 5.0], [6.0, 7.0, 8.0, 9.0]], generate_metadata=True))) + self.assertEqual(call_metadata.has_finished, True) + self.assertEqual(call_metadata.iterations_done, None) + + self.assertEqual(call_metadata.value.metadata.query(())['dimension']['length'], 2) + self.assertEqual(call_metadata.value.metadata.query((base.ALL_ELEMENTS,))['dimension']['length'], 4) + self.assertEqual(call_metadata.value.metadata.query((base.ALL_ELEMENTS, 0))['structural_type'], numpy.float64) + self.assertEqual(call_metadata.value.metadata.query((base.ALL_ELEMENTS, 1))['structural_type'], numpy.float64) + self.assertEqual(call_metadata.value.metadata.query((base.ALL_ELEMENTS, 2))['structural_type'], numpy.float64) + self.assertEqual(call_metadata.value.metadata.query((base.ALL_ELEMENTS, 3))['structural_type'], numpy.float64) + self.assertEqual(call_metadata.value.metadata.query(()).get('foo', None), 'bar') + + def test_hyperparameter(self): + hyperparams_class = IncrementPrimitive.metadata.get_hyperparams() + + primitive = IncrementPrimitive(hyperparams=hyperparams_class(amount=2)) + + inputs = container.DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]], generate_metadata=True) + + call_metadata = self.call_primitive(primitive, 'produce', inputs=inputs) + + self.assertTrue(call_metadata.value.equals(container.DataFrame([[3.0, 4.0, 5.0, 6.0], [7.0, 8.0, 9.0, 10.0]], generate_metadata=True))) + self.assertEqual(call_metadata.has_finished, True) + self.assertEqual(call_metadata.iterations_done, None) + + self.assertEqual(call_metadata.value.metadata.query(())['dimension']['length'], 2) + self.assertEqual(call_metadata.value.metadata.query((base.ALL_ELEMENTS,))['dimension']['length'], 4) + self.assertEqual(call_metadata.value.metadata.query((base.ALL_ELEMENTS, 0))['structural_type'], numpy.float64) + self.assertEqual(call_metadata.value.metadata.query((base.ALL_ELEMENTS, 1))['structural_type'], numpy.float64) + self.assertEqual(call_metadata.value.metadata.query((base.ALL_ELEMENTS, 2))['structural_type'], numpy.float64) + self.assertEqual(call_metadata.value.metadata.query((base.ALL_ELEMENTS, 3))['structural_type'], numpy.float64) + + def test_metadata(self): + expected_description = json.loads(EXPECTED_PRIMITIVE_DESCRIPTION_JSON) + + # We stringify to JSON and parse it to make sure the description can be stringified to JSON. + description = json.loads(json.dumps(IncrementPrimitive.metadata.to_json_structure())) + + self.maxDiff = None + self.assertEqual(expected_description, description) + + +if __name__ == '__main__': + unittest.main() diff --git a/d3m/tests/test_index.py b/d3m/tests/test_index.py new file mode 100644 index 0000000..7d56965 --- /dev/null +++ b/d3m/tests/test_index.py @@ -0,0 +1,133 @@ +import copy +import logging +import os.path +import pkg_resources +import sys +import types +import unittest + +COMMON_PRIMITIVES_DIR = os.path.join(os.path.dirname(__file__), 'common-primitives') +# NOTE: This insertion should appear before any code attempting to resolve or load primitives, +# so the git submodule version of `common-primitives` is looked at first. 
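+# For example, with the insertion below, `from common_primitives.column_parser import
+# ColumnParserPrimitive` resolves against the checkout in COMMON_PRIMITIVES_DIR rather
+# than any pip-installed copy, since Python searches sys.path entries in order.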
+sys.path.insert(0, COMMON_PRIMITIVES_DIR) + +from common_primitives.column_parser import ColumnParserPrimitive + +from d3m import container, index, utils +from d3m.metadata import base as metadata_base, hyperparams, pipeline_run +from d3m.primitive_interfaces import base, transformer + +Inputs = container.DataFrame +Outputs = container.DataFrame + + +class Hyperparams(hyperparams.Hyperparams): + pass + + +def create_primitive(primitive_id, python_path): + # Silence any validation warnings. + with utils.silence(): + class Primitive(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): + metadata = metadata_base.PrimitiveMetadata({ + 'id': primitive_id, + 'version': '0.1.0', + 'name': "Test Primitive", + 'python_path': python_path, + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.PRINCIPAL_COMPONENT_ANALYSIS, + ], + 'primitive_family': metadata_base.PrimitiveFamily.FEATURE_SELECTION, + }) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + pass + + return Primitive + + +FooBar2Primitive = create_primitive('e328012a-56f3-4da4-a422-2a0ade5d05b0', 'd3m.primitives.foo2.bar2.FooBar2Primitive') +FooBar3Primitive = create_primitive('266acf13-b7c3-4115-aaff-230971624a7d', 'd3m.primitives.foo3.bar3.FooBar3Primitive') +FooBar4Primitive = create_primitive('ab699c0f-434a-43eb-ad4a-f1e669cac50e', 'd3m.primitives.foo3.bar3') + + +class TestIndex(unittest.TestCase): + def test_register(self): + FooBarPrimitive = create_primitive('e2fc24f8-5b32-4759-be5b-8126a42522a3', 'd3m.primitives.foo.bar.FooBarPrimitive') + + # To hide any logging or stdout output. + with self.assertLogs(level=logging.DEBUG) as cm: + with utils.redirect_to_logging(): + index.register_primitive('d3m.primitives.foo.bar.FooBarPrimitive', FooBarPrimitive) + + # Just to log something, otherwise "assertLogs" can fail. 
+ logging.getLogger().debug("Start test.") + + index.get_primitive('d3m.primitives.foo.bar.FooBarPrimitive') + + def test_entrypoint(self): + working_set_entries = copy.copy(pkg_resources.working_set.entries) + working_set_entry_keys = copy.copy(pkg_resources.working_set.entry_keys) + working_set_by_key = copy.copy(pkg_resources.working_set.by_key) + + try: + distribution = pkg_resources.Distribution(__file__) + entry_point = pkg_resources.EntryPoint.parse('foo2.bar2.FooBar2Primitive = test_index:FooBar2Primitive', dist=distribution) + distribution._ep_map = {'d3m.primitives': {'foo2.bar2.FooBar2Primitive': entry_point}} + pkg_resources.working_set.add(distribution) + + python_path = 'd3m.primitives.foo2.bar2.FooBar2Primitive' + + self.assertIn(python_path, index.search()) + + self.assertIs(index.get_primitive(python_path), FooBar2Primitive) + + finally: + pkg_resources.working_set.entries = working_set_entries + pkg_resources.working_set.entry_keys = working_set_entry_keys + pkg_resources.working_set.by_key = working_set_by_key + + def test_entrypoint_conflict(self): + working_set_entries = copy.copy(pkg_resources.working_set.entries) + working_set_entry_keys = copy.copy(pkg_resources.working_set.entry_keys) + working_set_by_key = copy.copy(pkg_resources.working_set.by_key) + + try: + distribution = pkg_resources.Distribution(__file__) + distribution._ep_map = { + 'd3m.primitives': { + 'foo3.bar3': pkg_resources.EntryPoint.parse('foo3.bar3 = test_index:FooBar4Primitive', dist=distribution), + 'foo3.bar3.FooBar3Primitive': pkg_resources.EntryPoint.parse('foo3.bar3.FooBar3Primitive = test_index:FooBar3Primitive', dist=distribution), + }, + } + pkg_resources.working_set.add(distribution) + + with self.assertLogs(level=logging.WARNING) as cm: + from d3m.primitives.foo3 import bar3 + from d3m.primitives.foo3.bar3 import FooBar3Primitive as primitive + + self.assertIsInstance(bar3, types.ModuleType) + self.assertIs(primitive, FooBar3Primitive) + + self.assertEqual(len(cm.records), 1) + self.assertEqual(cm.records[0].msg, 'An entry point for a primitive is conflicting with another entry point which has it as a module: %(entry_point_name)s') + + finally: + pkg_resources.working_set.entries = working_set_entries + pkg_resources.working_set.entry_keys = working_set_entry_keys + pkg_resources.working_set.by_key = working_set_by_key + + def test_validate(self): + # To hide any logging or stdout output. 
+ with utils.silence(): + index.register_primitive('d3m.primitives.data_transformation.column_parser.Common', ColumnParserPrimitive) + + primitive = index.get_primitive_by_id('d510cb7a-1782-4f51-b44c-58f0236e47c7') + + primitive_description = primitive.metadata.to_json_structure() + + pipeline_run.validate_primitive(primitive_description) + + +if __name__ == '__main__': + unittest.main() diff --git a/d3m/tests/test_metadata.py b/d3m/tests/test_metadata.py new file mode 100644 index 0000000..f9c0b2c --- /dev/null +++ b/d3m/tests/test_metadata.py @@ -0,0 +1,2211 @@ +import unittest + +import jsonschema +import numpy + +from d3m import container, utils +from d3m.metadata import base + + +def copy_elements_metadata(source_metadata, target_metadata, from_selector, to_selector=(), *, ignore_all_elements=False): + return source_metadata._copy_elements_metadata(target_metadata, list(from_selector), list(to_selector), [], ignore_all_elements) + + +class TestMetadata(unittest.TestCase): + def test_basic(self): + md1 = base.Metadata({'value': 'test'}) + + self.assertEqual(md1.query(()), {'value': 'test'}) + self.assertEqual(md1.query(('foo',)), {}) + self.assertEqual(md1.query(('bar',)), {}) + + md2 = md1.update((), {'value2': 'test2'}) + + self.assertEqual(md1.query(()), {'value': 'test'}) + self.assertEqual(md1.query(('foo',)), {}) + self.assertEqual(md1.query(('bar',)), {}) + self.assertEqual(md2.query(()), {'value': 'test', 'value2': 'test2'}) + + md3 = md2.update(('foo',), {'element': 'one'}) + + self.assertEqual(md1.query(()), {'value': 'test'}) + self.assertEqual(md1.query(('foo',)), {}) + self.assertEqual(md1.query(('bar',)), {}) + self.assertEqual(md2.query(()), {'value': 'test', 'value2': 'test2'}) + self.assertEqual(md3.query(()), {'value': 'test', 'value2': 'test2'}) + self.assertEqual(md3.query(('foo',)), {'element': 'one'}) + + md4 = md3.update((base.ALL_ELEMENTS,), {'element': 'two'}) + + self.assertEqual(md1.query(()), {'value': 'test'}) + self.assertEqual(md1.query(('foo',)), {}) + self.assertEqual(md1.query(('bar',)), {}) + self.assertEqual(md2.query(()), {'value': 'test', 'value2': 'test2'}) + self.assertEqual(md3.query(()), {'value': 'test', 'value2': 'test2'}) + self.assertEqual(md3.query(('foo',)), {'element': 'one'}) + self.assertEqual(md4.query(()), {'value': 'test', 'value2': 'test2'}) + self.assertEqual(md4.query((base.ALL_ELEMENTS,)), {'element': 'two'}) + self.assertEqual(md4.query(('foo',)), {'element': 'two'}) + + md5 = md4.update(('foo',), {'element': 'three'}) + + self.assertEqual(md1.query(()), {'value': 'test'}) + self.assertEqual(md1.query(('foo',)), {}) + self.assertEqual(md1.query(('bar',)), {}) + self.assertEqual(md2.query(()), {'value': 'test', 'value2': 'test2'}) + self.assertEqual(md3.query(()), {'value': 'test', 'value2': 'test2'}) + self.assertEqual(md3.query(('foo',)), {'element': 'one'}) + self.assertEqual(md4.query(()), {'value': 'test', 'value2': 'test2'}) + self.assertEqual(md4.query((base.ALL_ELEMENTS,)), {'element': 'two'}) + self.assertEqual(md4.query(('foo',)), {'element': 'two'}) + self.assertEqual(md5.query(()), {'value': 'test', 'value2': 'test2'}) + self.assertEqual(md5.query((base.ALL_ELEMENTS,)), {'element': 'two'}) + self.assertEqual(md5.query(('foo',)), {'element': 'three'}) + + def test_all_elements(self): + md1 = base.Metadata() + + md2 = md1.update((base.ALL_ELEMENTS, 'bar'), {'value': 'test1'}) + + self.assertEqual(md2.query(('foo', 'bar')), {'value': 'test1'}) + + md3 = md2.update(('foo', 'bar'), {'value': 'test2'}) + + 
self.assertEqual(md2.query(('foo', 'bar')), {'value': 'test1'}) + self.assertEqual(md3.query(('foo', 'bar')), {'value': 'test2'}) + + md4 = md3.update((base.ALL_ELEMENTS, 'bar'), {'value': 'test3'}) + + self.assertEqual(md2.query(('foo', 'bar')), {'value': 'test1'}) + self.assertEqual(md3.query(('foo', 'bar')), {'value': 'test2'}) + self.assertEqual(md4.query(('foo', 'bar')), {'value': 'test3'}) + + md5 = md4.update(('foo', base.ALL_ELEMENTS), {'value': 'test4'}) + + self.assertEqual(md2.query(('foo', 'bar')), {'value': 'test1'}) + self.assertEqual(md3.query(('foo', 'bar')), {'value': 'test2'}) + self.assertEqual(md4.query(('foo', 'bar')), {'value': 'test3'}) + self.assertEqual(md5.query(('foo', 'bar')), {'value': 'test4'}) + + md6 = md5.update(('foo', 'bar'), {'value': 'test5'}) + + self.assertEqual(md2.query(('foo', 'bar')), {'value': 'test1'}) + self.assertEqual(md3.query(('foo', 'bar')), {'value': 'test2'}) + self.assertEqual(md4.query(('foo', 'bar')), {'value': 'test3'}) + self.assertEqual(md5.query(('foo', 'bar')), {'value': 'test4'}) + self.assertEqual(md6.query(('foo', 'bar')), {'value': 'test5'}) + + md7 = md6.update((base.ALL_ELEMENTS, base.ALL_ELEMENTS), {'value': 'test6'}) + + self.assertEqual(md2.query(('foo', 'bar')), {'value': 'test1'}) + self.assertEqual(md3.query(('foo', 'bar')), {'value': 'test2'}) + self.assertEqual(md4.query(('foo', 'bar')), {'value': 'test3'}) + self.assertEqual(md5.query(('foo', 'bar')), {'value': 'test4'}) + self.assertEqual(md6.query(('foo', 'bar')), {'value': 'test5'}) + self.assertEqual(md7.query(('foo', 'bar')), {'value': 'test6'}) + + md8 = md7.update(('foo', 'bar'), {'value': 'test7'}) + + self.assertEqual(md2.query(('foo', 'bar')), {'value': 'test1'}) + self.assertEqual(md3.query(('foo', 'bar')), {'value': 'test2'}) + self.assertEqual(md4.query(('foo', 'bar')), {'value': 'test3'}) + self.assertEqual(md5.query(('foo', 'bar')), {'value': 'test4'}) + self.assertEqual(md6.query(('foo', 'bar')), {'value': 'test5'}) + self.assertEqual(md7.query(('foo', 'bar')), {'value': 'test6'}) + self.assertEqual(md8.query(('foo', 'bar')), {'value': 'test7'}) + + self.assertEqual(md8.to_internal_json_structure(), [{ + 'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__'], + 'metadata': { + 'value': 'test6' + } + }, { + 'selector': ['foo', 'bar'], + 'metadata': { + 'value': 'test7' + } + }]) + + def test_removal(self): + md1 = base.Metadata().update((), {'value': 'test1'}) + + self.assertEqual(md1.query(()), {'value': 'test1'}) + + md2 = md1.update((), {'value': base.NO_VALUE}) + + self.assertEqual(md1.query(()), {'value': 'test1'}) + self.assertEqual(md2.query(()), {}) + self.assertEqual(md2.query((), ignore_all_elements=True), {}) + + md3 = md2.update((), {'value': {'value2': 'test2'}}) + + self.assertEqual(md1.query(()), {'value': 'test1'}) + self.assertEqual(md2.query(()), {}) + self.assertEqual(md3.query(()), {'value': {'value2': 'test2'}}) + + md4 = md3.update((), {'value': {'value2': base.NO_VALUE}}) + + self.assertEqual(md1.query(()), {'value': 'test1'}) + self.assertEqual(md2.query(()), {}) + self.assertEqual(md3.query(()), {'value': {'value2': 'test2'}}) + self.assertEqual(md4.query(()), {}) + + md5 = md4.update((), {'value': base.NO_VALUE}) + + self.assertEqual(md1.query(()), {'value': 'test1'}) + self.assertEqual(md2.query(()), {}) + self.assertEqual(md3.query(()), {'value': {'value2': 'test2'}}) + self.assertEqual(md4.query(()), {}) + self.assertEqual(md5.query(()), {}) + + def test_empty_dict(self): + md = base.Metadata().update((), {'value': {}}) + + 
self.assertEqual(md.query(()), {'value': {}}) + + md = md.update((), {'value': {'a': '1', 'b': 2}}) + + self.assertEqual(md.query(()), {'value': {'a': '1', 'b': 2}}) + + md = md.update((), {'value': {'a': base.NO_VALUE, 'b': base.NO_VALUE}}) + + self.assertEqual(md.query(()), {}) + + md = md.update((), {'value': {'a': '1', 'b': 2}}) + + self.assertEqual(md.query(()), {'value': {'a': '1', 'b': 2}}) + + md = md.update((), {'value': {'a': base.NO_VALUE}}) + + self.assertEqual(md.query(()), {'value': {'b': 2}}) + + def test_remove(self): + metadata = base.Metadata().update((), {'value': 'test1'}) + metadata = metadata.update(('a',), {'value': 'test2'}) + metadata = metadata.update(('a', 'b'), {'value': 'test3'}) + metadata = metadata.update(('a', 'b', 'c'), {'value': 'test4'}) + metadata = metadata.update((base.ALL_ELEMENTS, 'b', 'd'), {'value': 'test5'}) + metadata = metadata.update((base.ALL_ELEMENTS, 'b', 'e', base.ALL_ELEMENTS), {'value': 'test6'}) + + self.assertEqual(utils.to_json_structure(metadata.to_internal_simple_structure()), [ + { + 'selector': [], + 'metadata': { + 'value': 'test1', + } + }, + { + 'selector': ['__ALL_ELEMENTS__', 'b', 'd'], + 'metadata': { + 'value': 'test5', + }, + }, + { + 'selector': ['__ALL_ELEMENTS__', 'b', 'e', '__ALL_ELEMENTS__'], + 'metadata': { + 'value': 'test6', + }, + }, + { + 'selector': ['a'], + 'metadata': { + 'value': 'test2', + }, + }, + { + 'selector': ['a', 'b'], + 'metadata': { + 'value': 'test3', + }, + }, + { + 'selector': ['a', 'b', 'c'], + 'metadata': { + 'value': 'test4', + }, + }, + ]) + + new_metadata = metadata.remove(('a', 'b')) + + self.assertEqual(utils.to_json_structure(new_metadata.to_internal_simple_structure()), [ + { + 'selector': [], + 'metadata': { + 'value': 'test1', + }, + }, + { + 'selector': ['__ALL_ELEMENTS__', 'b', 'd'], + 'metadata': { + 'value': 'test5', + }, + }, + { + 'selector': ['__ALL_ELEMENTS__', 'b', 'e', '__ALL_ELEMENTS__'], + 'metadata': { + 'value': 'test6', + }, + }, + { + 'selector': ['a'], + 'metadata': { + 'value': 'test2', + }, + }, + { + 'selector': ['a', 'b', 'c'], + 'metadata': { + 'value': 'test4', + }, + }, + ]) + + new_metadata = metadata.remove(('a', 'b'), recursive=True) + + self.assertEqual(utils.to_json_structure(new_metadata.to_internal_simple_structure()), [ + { + 'selector': [], + 'metadata': { + 'value': 'test1', + }, + }, + { + 'selector': ['__ALL_ELEMENTS__', 'b', 'd'], + 'metadata': { + 'value': 'test5', + }, + }, + { + 'selector': ['__ALL_ELEMENTS__', 'b', 'e', '__ALL_ELEMENTS__'], + 'metadata': { + 'value': 'test6', + }, + }, + { + 'selector': ['a'], + 'metadata': { + 'value': 'test2', + }, + }, + ]) + + new_metadata = metadata.remove((), recursive=True) + + self.assertEqual(utils.to_json_structure(new_metadata.to_internal_simple_structure()), []) + + new_metadata = metadata.remove((base.ALL_ELEMENTS, 'b')) + + self.assertEqual(utils.to_json_structure(new_metadata.to_internal_simple_structure()), [ + { + 'selector': [], + 'metadata': { + 'value': 'test1', + } + }, + { + 'selector': ['__ALL_ELEMENTS__', 'b', 'd'], + 'metadata': { + 'value': 'test5', + }, + }, + { + 'selector': ['__ALL_ELEMENTS__', 'b', 'e', '__ALL_ELEMENTS__'], + 'metadata': { + 'value': 'test6', + }, + }, + { + 'selector': ['a'], + 'metadata': { + 'value': 'test2', + }, + }, + { + 'selector': ['a', 'b', 'c'], + 'metadata': { + 'value': 'test4', + }, + }, + ]) + + new_metadata = metadata.remove((base.ALL_ELEMENTS, 'b'), recursive=True) + + 
self.assertEqual(utils.to_json_structure(new_metadata.to_internal_simple_structure()), [ + { + 'selector': [], + 'metadata': { + 'value': 'test1', + } + }, + { + 'selector': ['a'], + 'metadata': { + 'value': 'test2', + }, + }, + ]) + + new_metadata = metadata.remove((base.ALL_ELEMENTS, 'b'), strict_all_elements=True) + + self.assertEqual(utils.to_json_structure(new_metadata.to_internal_simple_structure()), [ + { + 'selector': [], + 'metadata': { + 'value': 'test1', + } + }, + { + 'selector': ['__ALL_ELEMENTS__', 'b', 'd'], + 'metadata': { + 'value': 'test5', + }, + }, + { + 'selector': ['__ALL_ELEMENTS__', 'b', 'e', '__ALL_ELEMENTS__'], + 'metadata': { + 'value': 'test6', + }, + }, + { + 'selector': ['a'], + 'metadata': { + 'value': 'test2', + }, + }, + { + 'selector': ['a', 'b'], + 'metadata': { + 'value': 'test3', + }, + }, + { + 'selector': ['a', 'b', 'c'], + 'metadata': { + 'value': 'test4', + }, + }, + ]) + + new_metadata = metadata.remove((base.ALL_ELEMENTS, 'b'), recursive=True, strict_all_elements=True) + + self.assertEqual(utils.to_json_structure(new_metadata.to_internal_simple_structure()), [ + { + 'selector': [], + 'metadata': { + 'value': 'test1', + } + }, + { + 'selector': ['a'], + 'metadata': { + 'value': 'test2', + }, + }, + { + 'selector': ['a', 'b'], + 'metadata': { + 'value': 'test3', + }, + }, + { + 'selector': ['a', 'b', 'c'], + 'metadata': { + 'value': 'test4', + }, + }, + ]) + + new_metadata = metadata.remove(('a', base.ALL_ELEMENTS)) + + self.assertEqual(utils.to_json_structure(new_metadata.to_internal_simple_structure()), [ + { + 'selector': [], + 'metadata': { + 'value': 'test1', + } + }, + { + 'selector': ['__ALL_ELEMENTS__', 'b', 'd'], + 'metadata': { + 'value': 'test5', + }, + }, + { + 'selector': ['__ALL_ELEMENTS__', 'b', 'e', '__ALL_ELEMENTS__'], + 'metadata': { + 'value': 'test6', + }, + }, + { + 'selector': ['a'], + 'metadata': { + 'value': 'test2', + }, + }, + { + 'selector': ['a', 'b', 'c'], + 'metadata': { + 'value': 'test4', + }, + }, + ]) + + new_metadata = metadata.remove(('a', base.ALL_ELEMENTS), strict_all_elements=True) + + self.assertEqual(utils.to_json_structure(new_metadata.to_internal_simple_structure()), [ + { + 'selector': [], + 'metadata': { + 'value': 'test1', + } + }, + { + 'selector': ['__ALL_ELEMENTS__', 'b', 'd'], + 'metadata': { + 'value': 'test5', + }, + }, + { + 'selector': ['__ALL_ELEMENTS__', 'b', 'e', '__ALL_ELEMENTS__'], + 'metadata': { + 'value': 'test6', + }, + }, + { + 'selector': ['a'], + 'metadata': { + 'value': 'test2', + }, + }, + { + 'selector': ['a', 'b'], + 'metadata': { + 'value': 'test3', + }, + }, + { + 'selector': ['a', 'b', 'c'], + 'metadata': { + 'value': 'test4', + }, + }, + ]) + + new_metadata = metadata.remove(('a', base.ALL_ELEMENTS), recursive=True) + + self.assertEqual(utils.to_json_structure(new_metadata.to_internal_simple_structure()), [ + { + 'selector': [], + 'metadata': { + 'value': 'test1', + } + }, + { + 'selector': ['__ALL_ELEMENTS__', 'b', 'd'], + 'metadata': { + 'value': 'test5', + }, + }, + { + 'selector': ['__ALL_ELEMENTS__', 'b', 'e', '__ALL_ELEMENTS__'], + 'metadata': { + 'value': 'test6', + }, + }, + { + 'selector': ['a'], + 'metadata': { + 'value': 'test2', + }, + }, + ]) + + new_metadata = metadata.remove((base.ALL_ELEMENTS, 'b', base.ALL_ELEMENTS)) + + self.assertEqual(utils.to_json_structure(new_metadata.to_internal_simple_structure()), [ + { + 'selector': [], + 'metadata': { + 'value': 'test1', + } + }, + { + 'selector': ['__ALL_ELEMENTS__', 'b', 'e', '__ALL_ELEMENTS__'], + 'metadata': { + 
'value': 'test6', + }, + }, + { + 'selector': ['a'], + 'metadata': { + 'value': 'test2', + }, + }, + { + 'selector': ['a', 'b'], + 'metadata': { + 'value': 'test3', + }, + }, + ]) + + new_metadata = metadata.remove((base.ALL_ELEMENTS, 'b', base.ALL_ELEMENTS), recursive=True) + + self.assertEqual(utils.to_json_structure(new_metadata.to_internal_simple_structure()), [ + { + 'selector': [], + 'metadata': { + 'value': 'test1', + } + }, + { + 'selector': ['a'], + 'metadata': { + 'value': 'test2', + }, + }, + { + 'selector': ['a', 'b'], + 'metadata': { + 'value': 'test3', + }, + }, + ]) + + def test_remove_column(self): + metadata = base.DataMetadata().update((base.ALL_ELEMENTS, 0), {'name': 'column1'}) + metadata = metadata.update((base.ALL_ELEMENTS, 1), {'name': 'column2'}) + metadata = metadata.update((10, 0), {'value': 'row10.0'}) + metadata = metadata.update((10, 1), {'value': 'row10.1'}) + + self.assertEqual(utils.to_json_structure(metadata.to_internal_simple_structure()), [{ + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': {'name': 'column1'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': {'name': 'column2'}, + }, { + 'selector': [10, 0], + 'metadata': {'value': 'row10.0'}, + }, { + 'selector': [10, 1], + 'metadata': {'value': 'row10.1'}, + }]) + + metadata = metadata.remove_column(0) + + self.assertEqual(utils.to_json_structure(metadata.to_internal_simple_structure()), [{ + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': {'name': 'column2'}, + }, { + 'selector': [10, 1], + 'metadata': {'value': 'row10.1'}, + }]) + + def test_check(self): + data = container.Dataset({ + '0': container.ndarray(numpy.array([ + [1, 2, 3], + [4, 5, 6], + ])), + }) + + md1 = base.DataMetadata().update((), { + 'schema': base.CONTAINER_SCHEMA_VERSION, + 'structural_type': type(data), + 'value': 'test' + }) + + md1.check(data) + + md2 = md1.update(('missing',), {'value': 'test'}) + + with self.assertRaisesRegex(ValueError, 'cannot be resolved'): + md2.check(data) + + md3 = md1.update(('0', 1), {'value': 'test'}) + + md4 = md3.update(('0', 2), {'value': 'test'}) + + with self.assertRaisesRegex(ValueError, 'cannot be resolved'): + md4.check(data) + + md5 = md3.update(('0', 1, 3), {'value': 'test'}) + + with self.assertRaisesRegex(ValueError, 'cannot be resolved'): + md5.check(data) + + md6 = md3.update(('0', 1, 2, base.ALL_ELEMENTS), {'value': 'test'}) + + with self.assertRaisesRegex(ValueError, 'ALL_ELEMENTS set but dimension missing at'): + md6.check(data) + + def test_errors(self): + with self.assertRaisesRegex(TypeError, 'Metadata should be a dict'): + base.Metadata().update((), None) + + class Custom: + pass + + with self.assertRaisesRegex(TypeError, 'is not known to be immutable'): + base.Metadata().update((), {'foo': Custom()}) + + with self.assertRaisesRegex(TypeError, 'Selector is not a tuple or a list'): + base.Metadata().update({}, {'value': 'test'}) + + with self.assertRaisesRegex(TypeError, 'is not a str, int, or ALL_ELEMENTS'): + base.Metadata().update((1.0,), {'value': 'test'}) + + with self.assertRaisesRegex(TypeError, 'is not a str, int, or ALL_ELEMENTS'): + base.Metadata().update((None,), {'value': 'test'}) + + def test_data(self): + data = container.Dataset({ + '0': container.ndarray(numpy.array([ + [1, 2, 3], + [4, 5, 6], + ])), + }) + + md1 = base.DataMetadata() + md1.update((), {'value': 'test'}) + + with self.assertRaisesRegex(jsonschema.exceptions.ValidationError, 'is a required property'): + md1.check(md1) + + md1 = base.DataMetadata().generate(data, compact=True) + + 
md2 = md1.update((), { + 'id': 'test-dataset', + 'schema': base.CONTAINER_SCHEMA_VERSION, + 'structural_type': type(data), + 'dimension': { + 'length': 1 + } + }) + + md3 = md2.update(('0',), { + 'structural_type': type(data['0']), + 'dimension': { + 'length': 2 + } + }) + + self.assertEqual(utils.to_json_structure(md3.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + 'id': 'test-dataset', + 'schema': base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.dataset.Dataset', + 'dimension': { + 'name': 'resources', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/DatasetResource'], + 'length': 1 + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'length': 2, + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + }, + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'structural_type': 'd3m.container.numpy.ndarray', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'length': 3, + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__', '__ALL_ELEMENTS__'], + 'metadata': { + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['0'], + 'metadata': { + 'structural_type': 'd3m.container.numpy.ndarray', + 'dimension': { + 'length': 2, + }, + }, + }]) + + md1 = base.DataMetadata().generate(data, compact=False) + + md2 = md1.update((), { + 'id': 'test-dataset', + 'schema': base.CONTAINER_SCHEMA_VERSION, + 'structural_type': type(data), + 'dimension': { + 'length': 1 + } + }) + + md3 = md2.update(('0',), { + 'structural_type': type(data['0']), + 'dimension': { + 'length': 2 + } + }) + + self.assertEqual(utils.to_json_structure(md3.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + 'id': 'test-dataset', + 'schema': base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.dataset.Dataset', + 'dimension': { + 'name': 'resources', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/DatasetResource'], + 'length': 1 + }, + }, + }, { + 'selector': ['0'], + 'metadata': { + 'dimension': { + 'length': 2, + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + }, + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'structural_type': 'd3m.container.numpy.ndarray', + }, + }, { + 'selector': ['0', '__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'length': 3, + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + }, + }, + }, { + 'selector': ['0', '__ALL_ELEMENTS__', '__ALL_ELEMENTS__'], + 'metadata': { + 'structural_type': 'numpy.int64', + }, + }]) + + def test_prune_bug(self): + metadata = base.Metadata().update((base.ALL_ELEMENTS, 0), {'foo': 'bar1'}) + metadata = metadata.update((0, 1), {'foo': 'bar2'}) + metadata = metadata.update((1, 1), {'foo': 'bar2'}) + metadata = metadata.update((2, 1), {'foo': 'bar2'}) + metadata = metadata.update((base.ALL_ELEMENTS, base.ALL_ELEMENTS), {'foo': 'bar3'}) + + self.assertEqual(utils.to_json_structure(metadata.to_internal_simple_structure()), [{ + 'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__'], + 'metadata': {'foo': 'bar3'}, + }]) + + def test_remove_empty_metadata(self): + metadata = base.Metadata().update((base.ALL_ELEMENTS,), { + 'foo': { + 
'bar': 42, + }, + 'other': 1, + }) + + metadata = metadata.update((base.ALL_ELEMENTS,), { + 'foo': { + 'bar': base.NO_VALUE, + }, + }) + + self.assertEqual(metadata.query((base.ALL_ELEMENTS,)), { + 'other': 1, + }) + + metadata = base.Metadata({ + 'foo': { + 'bar': 42, + }, + 'other': 1, + }) + + metadata = metadata.update((), { + 'foo': { + 'bar': base.NO_VALUE, + }, + }) + + self.assertEqual(metadata.query(()), { + 'other': 1, + }) + + metadata = base.Metadata({ + 'foo': { + 'bar': 42, + }, + }) + + metadata = metadata.update((), { + 'foo': { + 'bar': base.NO_VALUE, + }, + }) + + self.assertEqual(metadata.query(()), {}) + + metadata = base.Metadata().update(('a',), { + 'foo': { + 'bar': 42, + }, + }) + + metadata = metadata.update((base.ALL_ELEMENTS,), { + 'foo': { + 'bar': base.NO_VALUE, + }, + }) + + self.assertEqual(metadata.query(('a',)), {}) + + self.assertEqual(utils.to_json_structure(metadata.to_internal_simple_structure()), [{ + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'foo': { + 'bar': '__NO_VALUE__', + }, + }, + }]) + + def test_ignore_all_elements(self): + metadata = base.Metadata().update((base.ALL_ELEMENTS,), { + 'foo': 'bar', + 'other': 42, + }) + + metadata = metadata.update((0,), { + 'foo': base.NO_VALUE, + }) + + metadata = metadata.update((2,), { + 'other2': 43, + }) + + self.assertEqual(metadata.query((0,)), {'other': 42}) + self.assertEqual(metadata.query((1,)), {'foo': 'bar', 'other': 42}) + self.assertEqual(metadata.query((2,)), {'foo': 'bar', 'other': 42, 'other2': 43}) + self.assertEqual(metadata.query((0,), ignore_all_elements=True), {}) + self.assertEqual(metadata.query((1,), ignore_all_elements=True), {}) + self.assertEqual(metadata.query((2,), ignore_all_elements=True), {'other2': 43}) + + metadata = metadata.update((base.ALL_ELEMENTS,), { + 'foo': 'bar2', + }) + + self.assertEqual(metadata.query((0,)), {'foo': 'bar2', 'other': 42}) + self.assertEqual(metadata.query((1,)), {'foo': 'bar2', 'other': 42}) + self.assertEqual(metadata.query((2,)), {'foo': 'bar2', 'other': 42, 'other2': 43}) + self.assertEqual(metadata.query((0,), ignore_all_elements=True), {}) + self.assertEqual(metadata.query((1,), ignore_all_elements=True), {}) + self.assertEqual(metadata.query((2,), ignore_all_elements=True), {'other2': 43}) + + def test_query_with_exceptions(self): + metadata = base.Metadata().update((base.ALL_ELEMENTS,), { + 'foo': 'bar', + 'other': 42, + }) + + metadata = metadata.update((0,), { + 'foo': base.NO_VALUE, + }) + + metadata = metadata.update((2,), { + 'other2': 43, + }) + + self.assertEqual(metadata.query((0,)), {'other': 42}) + self.assertEqual(metadata.query((1,)), {'foo': 'bar', 'other': 42}) + self.assertEqual(metadata.query((2,)), {'foo': 'bar', 'other': 42, 'other2': 43}) + + self.assertEqual(metadata.query_with_exceptions((0,)), ({'other': 42}, {})) + self.assertEqual(metadata.query_with_exceptions((1,)), ({'foo': 'bar', 'other': 42}, {})) + self.assertEqual(metadata.query_with_exceptions((2,)), ({'foo': 'bar', 'other': 42, 'other2': 43}, {})) + + self.assertEqual(metadata.query_with_exceptions((base.ALL_ELEMENTS,)), ({ + 'foo': 'bar', + 'other': 42, + }, { + (0,): {'other': 42}, + (2,): {'foo': 'bar', 'other': 42, 'other2': 43}, + })) + + metadata = metadata.update((base.ALL_ELEMENTS,), { + 'foo': 'bar2', + }) + + self.assertEqual(metadata.query_with_exceptions((base.ALL_ELEMENTS,)), ({ + 'foo': 'bar2', + 'other': 42, + }, { + (2,): {'foo': 'bar2', 'other': 42, 'other2': 43}, + })) + + metadata = base.Metadata().update((base.ALL_ELEMENTS, 0), { 
+ 'name': 'bar', + }) + + metadata = metadata.update((base.ALL_ELEMENTS, 1), { + 'name': 'foo', + }) + + metadata = metadata.update((2, 0), { + 'name': 'bar2', + }) + + metadata = metadata.update((2, 2), { + 'name': 'foo2', + }) + + self.assertEqual(metadata.query_with_exceptions((base.ALL_ELEMENTS, 0)), ({ + 'name': 'bar', + }, { + (2, 0): {'name': 'bar2'}, + })) + + self.assertEqual(metadata.query_with_exceptions((base.ALL_ELEMENTS, 1)), ({ + 'name': 'foo', + }, {})) + + self.assertEqual(metadata.query_with_exceptions((base.ALL_ELEMENTS, 2)), ({}, { + (2, 2): {'name': 'foo2'}, + })) + + self.assertEqual(metadata.query_with_exceptions((2, base.ALL_ELEMENTS)), ({}, { + (2, 0): {'name': 'bar2'}, + (2, 2): {'name': 'foo2'}, + })) + + metadata = base.Metadata().update((base.ALL_ELEMENTS, base.ALL_ELEMENTS), { + 'foo': 'bar', + 'other': 42, + }) + + metadata = metadata.update((base.ALL_ELEMENTS, 0), { + 'foo': base.NO_VALUE, + }) + + metadata = metadata.update((base.ALL_ELEMENTS, 2), { + 'other2': 43, + }) + + self.assertEqual(metadata.query((base.ALL_ELEMENTS, 0)), {'other': 42}) + self.assertEqual(metadata.query((base.ALL_ELEMENTS, 1)), {'foo': 'bar', 'other': 42}) + self.assertEqual(metadata.query((base.ALL_ELEMENTS, 2)), {'foo': 'bar', 'other': 42, 'other2': 43}) + + self.assertEqual(metadata.query_with_exceptions((base.ALL_ELEMENTS, 0)), ({'other': 42}, {})) + self.assertEqual(metadata.query_with_exceptions((base.ALL_ELEMENTS, 1)), ({'foo': 'bar', 'other': 42}, {})) + self.assertEqual(metadata.query_with_exceptions((base.ALL_ELEMENTS, 2)), ({'foo': 'bar', 'other': 42, 'other2': 43}, {})) + + self.assertEqual(metadata.query_with_exceptions((base.ALL_ELEMENTS, base.ALL_ELEMENTS)), ({ + 'foo': 'bar', + 'other': 42, + }, { + (base.ALL_ELEMENTS, 0): {'other': 42}, + (base.ALL_ELEMENTS, 2): {'foo': 'bar', 'other': 42, 'other2': 43}, + })) + + def test_semantic_types(self): + metadata = base.DataMetadata({ + 'structural_type': container.DataFrame, + 'schema': base.CONTAINER_SCHEMA_VERSION, + }) + + self.assertFalse(metadata.has_semantic_type((), 'https://metadata.datadrivendiscovery.org/types/DatasetResource')) + + metadata = metadata.add_semantic_type((), 'https://metadata.datadrivendiscovery.org/types/DatasetResource') + + self.assertTrue(metadata.has_semantic_type((), 'https://metadata.datadrivendiscovery.org/types/DatasetResource')) + + metadata = metadata.remove_semantic_type((), 'https://metadata.datadrivendiscovery.org/types/DatasetResource') + + self.assertFalse(metadata.has_semantic_type((), 'https://metadata.datadrivendiscovery.org/types/DatasetResource')) + + metadata = metadata.add_semantic_type((base.ALL_ELEMENTS, 0), 'https://metadata.datadrivendiscovery.org/types/Attribute') + metadata = metadata.add_semantic_type((base.ALL_ELEMENTS, 1), 'https://metadata.datadrivendiscovery.org/types/PrimaryKey') + metadata = metadata.add_semantic_type((base.ALL_ELEMENTS, 2), 'https://metadata.datadrivendiscovery.org/types/Attribute') + + self.assertEqual(metadata.get_elements_with_semantic_type((), 'https://metadata.datadrivendiscovery.org/types/Attribute'), []) + self.assertEqual(metadata.get_elements_with_semantic_type((base.ALL_ELEMENTS,), 'https://metadata.datadrivendiscovery.org/types/Attribute'), [0, 2]) + + def test_copy_elements_metadata(self): + metadata = base.Metadata() + + metadata = metadata.update((), {'level0': 'foobar0'}) + + metadata = metadata.update(('level1',), {'level1': 'foobar1'}) + + metadata = metadata.update((base.ALL_ELEMENTS,), {'level1a': 'foobar1a', 'level1b': 
'foobar1b'}) + + metadata = metadata.update(('level1',), {'level1b': base.NO_VALUE}) + + metadata = metadata.update(('level1', 'level2'), {'level2': 'foobar2'}) + + metadata = metadata.update((base.ALL_ELEMENTS, base.ALL_ELEMENTS), {'level2a': 'foobar2a', 'level2b': 'foobar2b'}) + + metadata = metadata.update(('level1', 'level2'), {'level2b': base.NO_VALUE}) + + metadata = metadata.update(('level1', 'level2', 'level3'), {'level3': 'foobar3'}) + + metadata = metadata.update((base.ALL_ELEMENTS, base.ALL_ELEMENTS, 'level3'), {'level3a': 'foobar3a'}) + + metadata = metadata.update(('level1', 'level2', 'level3.1'), {'level3.1': 'foobar3.1'}) + + metadata = metadata.update(('level1', 'level2', 'level3', 'level4'), {'level4': 'foobar4'}) + + metadata = metadata.update(('level1', 'level2', 'level3', 'level4.1'), {'level4.1': 'foobar4.1'}) + + self.assertEqual(utils.to_json_structure(metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': {'level0': 'foobar0'}, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': {'level1a': 'foobar1a', 'level1b': 'foobar1b'}, + }, { + 'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__'], + 'metadata': {'level2a': 'foobar2a', 'level2b': 'foobar2b'}, + }, { + 'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__', 'level3'], + 'metadata': {'level3a': 'foobar3a'}, + }, { + 'selector': ['level1'], + 'metadata': {'level1': 'foobar1', 'level1b': '__NO_VALUE__'}, + }, { + 'selector': ['level1', 'level2'], + 'metadata': {'level2': 'foobar2', 'level2b': '__NO_VALUE__'}, + }, { + 'selector': ['level1', 'level2', 'level3'], + 'metadata': {'level3': 'foobar3'}, + }, { + 'selector': ['level1', 'level2', 'level3', 'level4'], + 'metadata': {'level4': 'foobar4'}, + }, { + 'selector': ['level1', 'level2', 'level3', 'level4.1'], + 'metadata': {'level4.1': 'foobar4.1'}, + }, { + 'selector': ['level1', 'level2', 'level3.1'], + 'metadata': {'level3.1': 'foobar3.1'}, + }]) + + self.assertEqual(metadata.query(('level1', 'level2')), { + 'level2a': 'foobar2a', + 'level2': 'foobar2', + }) + + target_metadata = base.DataMetadata({ + 'schema': base.CONTAINER_SCHEMA_VERSION, + 'structural_type': container.DataFrame, + }) + + target_metadata = target_metadata.update((), {'level0z': 'foobar0z'}) + + target_metadata = copy_elements_metadata(metadata, target_metadata, ()) + + self.assertEqual(utils.to_json_structure(target_metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + 'level0z': 'foobar0z', + 'schema': base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': {'level1a': 'foobar1a', 'level1b': 'foobar1b'}, + }, { + 'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__'], + 'metadata': {'level2a': 'foobar2a', 'level2b': 'foobar2b'}, + }, { + 'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__', 'level3'], + 'metadata': {'level3a': 'foobar3a'}, + }, { + 'selector': ['level1'], + 'metadata': {'level1': 'foobar1', 'level1b': '__NO_VALUE__'}, + }, { + 'selector': ['level1', 'level2'], + 'metadata': {'level2': 'foobar2', 'level2b': '__NO_VALUE__'}, + }, { + 'selector': ['level1', 'level2', 'level3'], + 'metadata': {'level3': 'foobar3'}, + }, { + 'selector': ['level1', 'level2', 'level3', 'level4'], + 'metadata': {'level4': 'foobar4'}, + }, { + 'selector': ['level1', 'level2', 'level3', 'level4.1'], + 'metadata': {'level4.1': 'foobar4.1'}, + }, { + 'selector': ['level1', 'level2', 'level3.1'], + 'metadata': {'level3.1': 'foobar3.1'}, + }]) + + 
self.assertEqual(target_metadata.to_json_structure(), [{ + 'selector': [], + 'metadata': { + 'level0z': 'foobar0z', + 'schema': base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': {'level1a': 'foobar1a', 'level1b': 'foobar1b'}, + }, { + 'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__'], + 'metadata': {'level2a': 'foobar2a', 'level2b': 'foobar2b'}, + }, { + 'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__', 'level3'], + 'metadata': {'level3a': 'foobar3a'}, + }, { + 'selector': ['level1'], + 'metadata': {'level1': 'foobar1', 'level1a': 'foobar1a'}, + }, { + 'selector': ['level1', '__ALL_ELEMENTS__'], + 'metadata': {'level2a': 'foobar2a', 'level2b': 'foobar2b'}, + }, { + 'selector': ['level1', '__ALL_ELEMENTS__', 'level3'], + 'metadata': {'level3a': 'foobar3a'}, + }, { + 'selector': ['level1', 'level2'], + 'metadata': {'level2': 'foobar2', 'level2a': 'foobar2a'}, + }, { + 'selector': ['level1', 'level2', 'level3'], + 'metadata': {'level3': 'foobar3', 'level3a': 'foobar3a'}, + }, { + 'selector': ['level1', 'level2', 'level3', 'level4'], + 'metadata': {'level4': 'foobar4'}, + }, { + 'selector': ['level1', 'level2', 'level3', 'level4.1'], + 'metadata': {'level4.1': 'foobar4.1'}, + }, { + 'selector': ['level1', 'level2', 'level3.1'], + 'metadata': {'level3.1': 'foobar3.1'}, + }]) + + target_metadata = base.DataMetadata({ + 'schema': base.CONTAINER_SCHEMA_VERSION, + 'structural_type': container.DataFrame, + }) + + target_metadata = target_metadata.update((), {'level0z': 'foobar0z'}) + + target_metadata = copy_elements_metadata(metadata, target_metadata, ('level1',)) + + self.assertEqual(utils.to_json_structure(target_metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + 'level0z': 'foobar0z', + 'schema': base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': {'level2a': 'foobar2a', 'level2b': 'foobar2b'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 'level3'], + 'metadata': {'level3a': 'foobar3a'}, + }, { + 'selector': ['level2'], + 'metadata': {'level2': 'foobar2', 'level2b': '__NO_VALUE__'}, + }, { + 'selector': ['level2', 'level3'], + 'metadata': {'level3': 'foobar3'}, + }, { + 'selector': ['level2', 'level3', 'level4'], + 'metadata': {'level4': 'foobar4'}, + }, { + 'selector': ['level2', 'level3', 'level4.1'], + 'metadata': {'level4.1': 'foobar4.1'}, + }, { + 'selector': ['level2', 'level3.1'], + 'metadata': {'level3.1': 'foobar3.1'}, + }]) + + self.assertEqual(target_metadata.query(('level2',)), { + 'level2a': 'foobar2a', + 'level2': 'foobar2', + }) + + target_metadata = base.DataMetadata({ + 'schema': base.CONTAINER_SCHEMA_VERSION, + 'structural_type': container.DataFrame, + }) + + target_metadata = target_metadata.update((), {'level0z': 'foobar0z'}) + target_metadata = target_metadata.update(('zlevel',), {'level1z': 'foobar1z'}) + + target_metadata = copy_elements_metadata(metadata, target_metadata, ('level1',), ('zlevel',)) + + self.assertEqual(utils.to_json_structure(target_metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + 'level0z': 'foobar0z', + 'schema': base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + }, + }, { + 'selector': ['zlevel'], + 'metadata': {'level1z': 'foobar1z'}, + }, { + 'selector': ['zlevel', '__ALL_ELEMENTS__'], + 'metadata': {'level2a': 'foobar2a', 'level2b': 'foobar2b'}, + }, { + 
'selector': ['zlevel', '__ALL_ELEMENTS__', 'level3'], + 'metadata': {'level3a': 'foobar3a'}, + }, { + 'selector': ['zlevel', 'level2'], + 'metadata': {'level2': 'foobar2', 'level2b': '__NO_VALUE__'}, + }, { + 'selector': ['zlevel', 'level2', 'level3'], + 'metadata': {'level3': 'foobar3'}, + }, { + 'selector': ['zlevel', 'level2', 'level3', 'level4'], + 'metadata': {'level4': 'foobar4'}, + }, { + 'selector': ['zlevel', 'level2', 'level3', 'level4.1'], + 'metadata': {'level4.1': 'foobar4.1'}, + }, { + 'selector': ['zlevel', 'level2', 'level3.1'], + 'metadata': {'level3.1': 'foobar3.1'}, + }]) + + self.assertEqual(target_metadata.query(('zlevel', 'level2',)), { + 'level2a': 'foobar2a', + 'level2': 'foobar2', + }) + + target_metadata = base.DataMetadata({ + 'schema': base.CONTAINER_SCHEMA_VERSION, + 'structural_type': container.DataFrame, + }) + + target_metadata = target_metadata.update((), {'level0z': 'foobar0z'}) + + target_metadata = copy_elements_metadata(metadata, target_metadata, ('level1', 'level2')) + + self.assertEqual(utils.to_json_structure(target_metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + 'level0z': 'foobar0z', + 'schema': base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + }, + }, { + 'selector': ['level3'], + 'metadata': {'level3': 'foobar3', 'level3a': 'foobar3a'}, + }, { + 'selector': ['level3', 'level4'], + 'metadata': {'level4': 'foobar4'}, + }, { + 'selector': ['level3', 'level4.1'], + 'metadata': {'level4.1': 'foobar4.1'}, + }, { + 'selector': ['level3.1'], + 'metadata': {'level3.1': 'foobar3.1'}, + }]) + + target_metadata = base.DataMetadata({ + 'schema': base.CONTAINER_SCHEMA_VERSION, + 'structural_type': container.DataFrame, + }) + + target_metadata = target_metadata.update((), {'level0z': 'foobar0z'}) + target_metadata = target_metadata.update(('zlevel',), {'level1z': 'foobar1z'}) + + target_metadata = copy_elements_metadata(metadata, target_metadata, ('level1', 'level2'), ('zlevel',)) + + self.assertEqual(utils.to_json_structure(target_metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + 'level0z': 'foobar0z', + 'schema': base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + }, + }, { + 'selector': ['zlevel'], + 'metadata': {'level1z': 'foobar1z'}, + }, { + 'selector': ['zlevel', 'level3'], + 'metadata': {'level3': 'foobar3', 'level3a': 'foobar3a'}, + }, { + 'selector': ['zlevel', 'level3', 'level4'], + 'metadata': {'level4': 'foobar4'}, + }, { + 'selector': ['zlevel', 'level3', 'level4.1'], + 'metadata': {'level4.1': 'foobar4.1'}, + }, { + 'selector': ['zlevel', 'level3.1'], + 'metadata': {'level3.1': 'foobar3.1'}, + }]) + + def test_copy_metadata(self): + metadata = base.Metadata() + + metadata = metadata.update((), {'level0': 'foobar0'}) + + metadata = metadata.update(('level1',), {'level1': 'foobar1'}) + + metadata = metadata.update((base.ALL_ELEMENTS,), {'level1a': 'foobar1a', 'level1b': 'foobar1b'}) + + metadata = metadata.update(('level1',), {'level1b': base.NO_VALUE}) + + metadata = metadata.update(('level1', 'level2'), {'level2': 'foobar2'}) + + metadata = metadata.update((base.ALL_ELEMENTS, base.ALL_ELEMENTS), {'level2a': 'foobar2a', 'level2b': 'foobar2b'}) + + metadata = metadata.update(('level1', 'level2'), {'level2b': base.NO_VALUE}) + + metadata = metadata.update(('level1', 'level2', 'level3'), {'level3': 'foobar3'}) + + metadata = metadata.update((base.ALL_ELEMENTS, base.ALL_ELEMENTS, 'level3'), {'level3a': 'foobar3a'}) 
+ + metadata = metadata.update(('level1', 'level2', 'level3.1'), {'level3.1': 'foobar3.1'}) + + metadata = metadata.update(('level1', 'level2', 'level3', 'level4'), {'level4': 'foobar4'}) + + metadata = metadata.update(('level1', 'level2', 'level3', 'level4.1'), {'level4.1': 'foobar4.1'}) + + self.assertEqual(utils.to_json_structure(metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': {'level0': 'foobar0'}, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': {'level1a': 'foobar1a', 'level1b': 'foobar1b'}, + }, { + 'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__'], + 'metadata': {'level2a': 'foobar2a', 'level2b': 'foobar2b'}, + }, { + 'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__', 'level3'], + 'metadata': {'level3a': 'foobar3a'}, + }, { + 'selector': ['level1'], + 'metadata': {'level1': 'foobar1', 'level1b': '__NO_VALUE__'}, + }, { + 'selector': ['level1', 'level2'], + 'metadata': {'level2': 'foobar2', 'level2b': '__NO_VALUE__'}, + }, { + 'selector': ['level1', 'level2', 'level3'], + 'metadata': {'level3': 'foobar3'}, + }, { + 'selector': ['level1', 'level2', 'level3', 'level4'], + 'metadata': {'level4': 'foobar4'}, + }, { + 'selector': ['level1', 'level2', 'level3', 'level4.1'], + 'metadata': {'level4.1': 'foobar4.1'}, + }, { + 'selector': ['level1', 'level2', 'level3.1'], + 'metadata': {'level3.1': 'foobar3.1'}, + }]) + + self.assertEqual(metadata.query(('level1', 'level2')), { + 'level2a': 'foobar2a', + 'level2': 'foobar2', + }) + + target_metadata = base.DataMetadata({ + 'schema': base.CONTAINER_SCHEMA_VERSION, + 'structural_type': container.DataFrame, + }) + + target_metadata = target_metadata.update((), {'level0z': 'foobar0z'}) + + target_metadata = metadata.copy_to(target_metadata, ()) + + self.assertEqual(utils.to_json_structure(target_metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + 'level0': 'foobar0', + 'level0z': 'foobar0z', + 'schema': base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': {'level1a': 'foobar1a', 'level1b': 'foobar1b'}, + }, { + 'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__'], + 'metadata': {'level2a': 'foobar2a', 'level2b': 'foobar2b'}, + }, { + 'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__', 'level3'], + 'metadata': {'level3a': 'foobar3a'}, + }, { + 'selector': ['level1'], + 'metadata': {'level1': 'foobar1', 'level1b': '__NO_VALUE__'}, + }, { + 'selector': ['level1', 'level2'], + 'metadata': {'level2': 'foobar2', 'level2b': '__NO_VALUE__'}, + }, { + 'selector': ['level1', 'level2', 'level3'], + 'metadata': {'level3': 'foobar3'}, + }, { + 'selector': ['level1', 'level2', 'level3', 'level4'], + 'metadata': {'level4': 'foobar4'}, + }, { + 'selector': ['level1', 'level2', 'level3', 'level4.1'], + 'metadata': {'level4.1': 'foobar4.1'}, + }, { + 'selector': ['level1', 'level2', 'level3.1'], + 'metadata': {'level3.1': 'foobar3.1'}, + }]) + + target_metadata = base.DataMetadata({ + 'schema': base.CONTAINER_SCHEMA_VERSION, + 'structural_type': container.DataFrame, + }) + + target_metadata = target_metadata.update((), {'level0z': 'foobar0z'}) + + target_metadata = metadata.copy_to(target_metadata, ('level1',)) + + self.assertEqual(utils.to_json_structure(target_metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + 'level0z': 'foobar0z', + 'level1': 'foobar1', + 'level1b': '__NO_VALUE__', + 'level1a': 'foobar1a', + 'schema': base.CONTAINER_SCHEMA_VERSION, + 
'structural_type': 'd3m.container.pandas.DataFrame', + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': {'level2a': 'foobar2a', 'level2b': 'foobar2b'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 'level3'], + 'metadata': {'level3a': 'foobar3a'}, + }, { + 'selector': ['level2'], + 'metadata': {'level2': 'foobar2', 'level2b': '__NO_VALUE__'}, + }, { + 'selector': ['level2', 'level3'], + 'metadata': {'level3': 'foobar3'}, + }, { + 'selector': ['level2', 'level3', 'level4'], + 'metadata': {'level4': 'foobar4'}, + }, { + 'selector': ['level2', 'level3', 'level4.1'], + 'metadata': {'level4.1': 'foobar4.1'}, + }, { + 'selector': ['level2', 'level3.1'], + 'metadata': {'level3.1': 'foobar3.1'}, + }]) + + self.assertEqual(target_metadata.query(('level2',)), { + 'level2a': 'foobar2a', + 'level2': 'foobar2', + }) + + target_metadata = base.DataMetadata({ + 'schema': base.CONTAINER_SCHEMA_VERSION, + 'structural_type': container.DataFrame, + }) + + target_metadata = target_metadata.update((), {'level0z': 'foobar0z'}) + target_metadata = target_metadata.update(('zlevel',), {'level1z': 'foobar1z'}) + + target_metadata = metadata.copy_to(target_metadata, ('level1',), ('zlevel',)) + + self.assertEqual(utils.to_json_structure(target_metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + 'level0z': 'foobar0z', + 'schema': base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + }, + }, { + 'selector': ['zlevel'], + 'metadata': {'level1z': 'foobar1z', 'level1': 'foobar1', 'level1b': '__NO_VALUE__', 'level1a': 'foobar1a'}, + }, { + 'selector': ['zlevel', '__ALL_ELEMENTS__'], + 'metadata': {'level2a': 'foobar2a', 'level2b': 'foobar2b'}, + }, { + 'selector': ['zlevel', '__ALL_ELEMENTS__', 'level3'], + 'metadata': {'level3a': 'foobar3a'}, + }, { + 'selector': ['zlevel', 'level2'], + 'metadata': {'level2': 'foobar2', 'level2b': '__NO_VALUE__'}, + }, { + 'selector': ['zlevel', 'level2', 'level3'], + 'metadata': {'level3': 'foobar3'}, + }, { + 'selector': ['zlevel', 'level2', 'level3', 'level4'], + 'metadata': {'level4': 'foobar4'}, + }, { + 'selector': ['zlevel', 'level2', 'level3', 'level4.1'], + 'metadata': {'level4.1': 'foobar4.1'}, + }, { + 'selector': ['zlevel', 'level2', 'level3.1'], + 'metadata': {'level3.1': 'foobar3.1'}, + }]) + + self.assertEqual(target_metadata.query(('zlevel', 'level2',)), { + 'level2a': 'foobar2a', + 'level2': 'foobar2', + }) + + target_metadata = base.DataMetadata({ + 'schema': base.CONTAINER_SCHEMA_VERSION, + 'structural_type': container.DataFrame, + }) + + target_metadata = target_metadata.update((), {'level0z': 'foobar0z'}) + + target_metadata = metadata.copy_to(target_metadata, ('level1', 'level2')) + + self.assertEqual(utils.to_json_structure(target_metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + 'level0z': 'foobar0z', + 'level2': 'foobar2', + 'level2b': '__NO_VALUE__', + 'level2a': 'foobar2a', + 'schema': base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + }, + }, { + 'selector': ['level3'], + 'metadata': {'level3': 'foobar3', 'level3a': 'foobar3a'}, + }, { + 'selector': ['level3', 'level4'], + 'metadata': {'level4': 'foobar4'}, + }, { + 'selector': ['level3', 'level4.1'], + 'metadata': {'level4.1': 'foobar4.1'}, + }, { + 'selector': ['level3.1'], + 'metadata': {'level3.1': 'foobar3.1'}, + }]) + + target_metadata = base.DataMetadata({ + 'schema': base.CONTAINER_SCHEMA_VERSION, + 'structural_type': container.DataFrame, + }) + + target_metadata = 
target_metadata.update((), {'level0z': 'foobar0z'}) + target_metadata = target_metadata.update(('zlevel',), {'level1z': 'foobar1z'}) + + target_metadata = metadata.copy_to(target_metadata, ('level1', 'level2'), ('zlevel',)) + + self.assertEqual(utils.to_json_structure(target_metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + 'level0z': 'foobar0z', + 'schema': base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + }, + }, { + 'selector': ['zlevel'], + 'metadata': {'level1z': 'foobar1z', 'level2': 'foobar2', 'level2b': '__NO_VALUE__', 'level2a': 'foobar2a'}, + }, { + 'selector': ['zlevel', 'level3'], + 'metadata': {'level3': 'foobar3', 'level3a': 'foobar3a'}, + }, { + 'selector': ['zlevel', 'level3', 'level4'], + 'metadata': {'level4': 'foobar4'}, + }, { + 'selector': ['zlevel', 'level3', 'level4.1'], + 'metadata': {'level4.1': 'foobar4.1'}, + }, { + 'selector': ['zlevel', 'level3.1'], + 'metadata': {'level3.1': 'foobar3.1'}, + }]) + + def test_get_index_columns(self): + main = container.DataFrame({'a1': [1, 2, 3], 'b1': [4, 5, 6]}, generate_metadata=True) + + main.metadata = main.metadata.update((base.ALL_ELEMENTS, 0), { + 'name': 'image', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/PrimaryKey'], + }) + main.metadata = main.metadata.update((base.ALL_ELEMENTS, 1), { + 'name': 'd3mIndex', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/PrimaryKey'], + }) + + self.assertEqual(main.metadata.get_index_columns(), [1, 0]) + + def test_query_field(self): + md = base.Metadata() + md = md.update((1,), {'key': 'value'}) + + self.assertEqual(md.query_field((1,), 'key', strict_all_elements=False), 'value') + self.assertEqual(md.query_field((1,), 'key', strict_all_elements=True), 'value') + + with self.assertRaises(KeyError): + self.assertEqual(md.query_field((base.ALL_ELEMENTS,), 'key', strict_all_elements=False), 'value') + self.assertEqual(md.query_field((base.ALL_ELEMENTS,), 'key', strict_all_elements=True), 'value') + + with self.assertRaises(KeyError): + self.assertEqual(md.query_field((base.ALL_ELEMENTS,), 'key2', strict_all_elements=True), 'value') + + md = md.update((2,), {'key': 'value'}) + + with self.assertRaises(KeyError): + self.assertEqual(md.query_field((base.ALL_ELEMENTS,), 'key', strict_all_elements=False), 'value') + self.assertEqual(md.query_field((base.ALL_ELEMENTS,), 'key', strict_all_elements=True), 'value') + + md = md.update((3,), {'key': 'value2'}) + + with self.assertRaises(KeyError): + self.assertEqual(md.query_field((base.ALL_ELEMENTS,), 'key', strict_all_elements=False), 'value') + with self.assertRaises(KeyError): + self.assertEqual(md.query_field((base.ALL_ELEMENTS,), 'key', strict_all_elements=True), 'value') + + md = md.update((base.ALL_ELEMENTS,), {'key': 'value'}) + + self.assertEqual(md.query_field((base.ALL_ELEMENTS,), 'key', strict_all_elements=False), 'value') + self.assertEqual(md.query_field((base.ALL_ELEMENTS,), 'key', strict_all_elements=True), 'value') + + self.assertEqual(md.query_field((1,), 'key', strict_all_elements=False), 'value') + self.assertEqual(md.query_field((1,), 'key', strict_all_elements=True), 'value') + + md = md.update((3,), {'key': 'value2'}) + + self.assertEqual(md.query_field((base.ALL_ELEMENTS,), 'key', strict_all_elements=False), 'value') + with self.assertRaises(KeyError): + self.assertEqual(md.query_field((base.ALL_ELEMENTS,), 'key', strict_all_elements=True), 'value') + + def test_query_field_with_exceptions(self): + md = 
base.Metadata() + md = md.update((1,), {'key': 'value'}) + md = md.update((2,), {'key': 'value2'}) + + self.assertEqual(md.query_field_with_exceptions((1,), 'key'), ('value', {})) + self.assertEqual(md.query_field_with_exceptions((2,), 'key'), ('value2', {})) + with self.assertRaises(KeyError): + md.query_field_with_exceptions((3,), 'key') + + self.assertEqual(md.query_field_with_exceptions((base.ALL_ELEMENTS,), 'key'), (base.NO_VALUE, {(1,): 'value', (2,): 'value2'})) + + # All elements are required to have the "key" field when there is no explicit ALL_ELEMENTS metadata. + md = md.update((3,), {'key2': 'value'}) + + with self.assertRaises(KeyError): + md.query_field_with_exceptions((base.ALL_ELEMENTS,), 'key') + + md = md.update((base.ALL_ELEMENTS,), {'key': 'value'}) + + self.assertEqual(md.query_field_with_exceptions((1,), 'key'), ('value', {})) + self.assertEqual(md.query_field_with_exceptions((2,), 'key'), ('value', {})) + + self.assertEqual(md.query_field_with_exceptions((base.ALL_ELEMENTS,), 'key'), ('value', {})) + + md = md.update((3,), {'key': 'value2'}) + + self.assertEqual(md.query_field_with_exceptions((base.ALL_ELEMENTS,), 'key'), ('value', {(3,): 'value2'})) + + # Setting same value as what ALL_ELEMENTS has should not add additional exception. + md = md.update((4,), {'key': 'value'}) + + self.assertEqual(md.query_field_with_exceptions((base.ALL_ELEMENTS,), 'key'), ('value', {(3,): 'value2'})) + + # Because ALL_ELEMENTS is set, any additional elements without "key" field are ignored. + md = md.update((5,), {'key2': 'value'}) + + self.assertEqual(md.query_field_with_exceptions((base.ALL_ELEMENTS,), 'key'), ('value', {(3,): 'value2'})) + + def test_compact_generated_metadata(self): + ALL_GENERATED_KEYS = ['foo', 'name', 'other', 'structural_type'] + + compacted_metadata = base.DataMetadata._compact_metadata({}, ALL_GENERATED_KEYS) + + self.assertEqual(compacted_metadata, {}) + + # All equal. + new_metadata = { + ('a',): {'foo': 'bar', 'other': 1}, + ('b',): {'foo': 'bar', 'other': 2}, + ('c',): {'foo': 'bar', 'other': 3}, + } + + compacted_metadata = base.DataMetadata._compact_metadata(new_metadata, ALL_GENERATED_KEYS) + + self.assertEqual(compacted_metadata, { + (base.ALL_ELEMENTS,): {'foo': 'bar'}, + ('a',): {'other': 1}, + ('b',): {'other': 2}, + ('c',): {'other': 3}, + }) + + # One different. + new_metadata = { + ('a',): {'foo': 'bar', 'other': 1}, + ('b',): {'foo': 'bar', 'other': 2}, + ('c',): {'foo': 'bar2', 'other': 3,}, + } + + compacted_metadata = base.DataMetadata._compact_metadata(new_metadata, ALL_GENERATED_KEYS) + + self.assertEqual(compacted_metadata, { + ('a',): {'foo': 'bar', 'other': 1}, + ('b',): {'foo': 'bar', 'other': 2}, + ('c',): {'foo': 'bar2', 'other': 3,}, + }) + + # Recursive. 
+ new_metadata = { + ('deep', 'a'): {'foo': 'bar', 'other': 1}, + ('deep', 'b'): {'foo': 'bar', 'other': 2}, + ('deep', 'c'): {'foo': 'bar', 'other': 3}, + } + + compacted_metadata = base.DataMetadata._compact_metadata(new_metadata, ALL_GENERATED_KEYS) + + self.assertEqual(compacted_metadata, { + (base.ALL_ELEMENTS, base.ALL_ELEMENTS): {'foo': 'bar'}, + (base.ALL_ELEMENTS, 'a'): {'other': 1}, + (base.ALL_ELEMENTS, 'b'): {'other': 2}, + (base.ALL_ELEMENTS, 'c'): {'other': 3}, + }) + + new_metadata = { + ('deep', 'a'): {'foo': 'bar', 'other': 1}, + ('deep', 'b'): {'foo': 'bar', 'other': 2}, + ('deep', 'c'): {'foo': 'bar', 'other': 3}, + ('deep2', 'd'): {'foo': 'bar', 'other': 4}, + } + + compacted_metadata = base.DataMetadata._compact_metadata(new_metadata, ALL_GENERATED_KEYS) + + self.assertEqual(compacted_metadata, { + (base.ALL_ELEMENTS, base.ALL_ELEMENTS): {'foo': 'bar'}, + (base.ALL_ELEMENTS, 'a'): {'other': 1}, + (base.ALL_ELEMENTS, 'b'): {'other': 2}, + (base.ALL_ELEMENTS, 'c'): {'other': 3}, + (base.ALL_ELEMENTS, 'd'): {'other': 4}, + }) + + new_metadata = { + ('deep', 'a'): {'foo': 'bar', 'other': 1}, + ('deep', 'b'): {'foo': 'bar', 'other': 2}, + ('deep', 'c'): {'foo': 'bar', 'other': 3}, + ('deep2', 'a'): {'foo': 'bar', 'other': 4}, + ('deep2', 'b'): {'foo': 'bar', 'other': 5}, + } + + compacted_metadata = base.DataMetadata._compact_metadata(new_metadata, ALL_GENERATED_KEYS) + + self.assertEqual(compacted_metadata, { + (base.ALL_ELEMENTS, base.ALL_ELEMENTS): {'foo': 'bar'}, + (base.ALL_ELEMENTS, 'c'): {'other': 3}, + ('deep', 'a'): {'other': 1}, + ('deep', 'b'): {'other': 2}, + ('deep2', 'a'): {'other': 4}, + ('deep2', 'b'): {'other': 5}, + }) + + new_metadata = { + ('deep', 'a'): {'foo': 'bar', 'other': 1}, + ('deep', 'b'): {'foo': 'bar', 'other': 2}, + ('deep', 'c'): {'foo': 'bar2', 'other': 3}, + ('deep2', 'a'): {'foo': 'bar', 'other': 4}, + ('deep2', 'b'): {'foo': 'bar', 'other': 5}, + } + + compacted_metadata = base.DataMetadata._compact_metadata(new_metadata, ALL_GENERATED_KEYS) + + self.assertEqual(compacted_metadata, { + (base.ALL_ELEMENTS, 'a'): {'foo': 'bar'}, + (base.ALL_ELEMENTS, 'b'): {'foo': 'bar'}, + (base.ALL_ELEMENTS, 'c'): {'foo': 'bar2', 'other': 3}, + ('deep', 'a'): {'other': 1}, + ('deep', 'b'): {'other': 2}, + ('deep2', 'a'): {'other': 4}, + ('deep2', 'b'): {'other': 5}, + }) + + new_metadata = { + ('a', 'deep'): {'foo': 'bar', 'other': 1}, + ('b', 'deep'): {'foo': 'bar', 'other': 2}, + ('c', 'deep'): {'foo': 'bar2', 'other': 3}, + ('a', 'deep2'): {'foo': 'bar', 'other': 4}, + ('b', 'deep2'): {'foo': 'bar', 'other': 5}, + } + + compacted_metadata = base.DataMetadata._compact_metadata(new_metadata, ALL_GENERATED_KEYS) + + self.assertEqual(compacted_metadata, { + ('a', base.ALL_ELEMENTS): {'foo': 'bar'}, + ('a', 'deep'): {'other': 1}, + ('a', 'deep2'): {'other': 4}, + ('b', base.ALL_ELEMENTS): {'foo': 'bar'}, + ('b', 'deep'): {'other': 2}, + ('b', 'deep2'): {'other': 5}, + ('c', base.ALL_ELEMENTS): {'foo': 'bar2', 'other': 3}, + }) + + new_metadata = { + (base.ALL_ELEMENTS, 'a'): {'foo': 'bar', 'other': 1}, + (base.ALL_ELEMENTS, 'b'): {'foo': 'bar', 'other': 2}, + (base.ALL_ELEMENTS, 'c'): {'foo': 'bar', 'other': 3}, + } + + compacted_metadata = base.DataMetadata._compact_metadata(new_metadata, ALL_GENERATED_KEYS) + + self.assertEqual(compacted_metadata, { + (base.ALL_ELEMENTS, base.ALL_ELEMENTS): {'foo': 'bar'}, + (base.ALL_ELEMENTS, 'a'): {'other': 1}, + (base.ALL_ELEMENTS, 'b'): {'other': 2}, + (base.ALL_ELEMENTS, 'c'): {'other': 3}, + }) + + new_metadata = 
{ + (base.ALL_ELEMENTS, 0): {'foo': 'bar1'}, + (0, 1): {'foo': 'bar2'}, + (1, 1): {'foo': 'bar2'}, + (2, 1): {'foo': 'bar2'}, + } + + compacted_metadata = base.DataMetadata._compact_metadata(new_metadata, ALL_GENERATED_KEYS) + + self.assertEqual(compacted_metadata, { + (base.ALL_ELEMENTS, 0): {'foo': 'bar1'}, + (base.ALL_ELEMENTS, 1): {'foo': 'bar2'}, + }) + + new_metadata = { + ('deep1', 'a'): {'foo': 'bar', 'other': 1}, + ('deep1', 'b'): {'foo': 'bar2', 'other': 2}, + ('deep2', 'a'): {'foo': 'bar', 'other': 3}, + ('deep2', 'b'): {'foo': 'bar2', 'other': 4}, + ('deep3', 'a'): {'foo': 'bar', 'other': 5}, + ('deep3', 'b'): {'foo': 'bar2', 'other': 6}, + } + + compacted_metadata = base.DataMetadata._compact_metadata(new_metadata, ALL_GENERATED_KEYS) + + self.assertEqual(compacted_metadata, { + (base.ALL_ELEMENTS, 'a'): {'foo': 'bar'}, + (base.ALL_ELEMENTS, 'b'): {'foo': 'bar2'}, + ('deep1', 'a'): {'other': 1}, + ('deep1', 'b'): {'other': 2}, + ('deep2', 'a'): {'other': 3}, + ('deep2', 'b'): {'other': 4}, + ('deep3', 'a'): {'other': 5}, + ('deep3', 'b'): {'other': 6}, + }) + + new_metadata = { + ('deep1', 'a'): {'foo': 'bar', 'other': 1}, + ('deep1', 'b'): {'foo': 'bar', 'other': 2}, + ('deep2', 'c'): {'foo': 'bar', 'other': 3}, + ('deep2', 'd'): {'foo': 'bar', 'other': 4}, + ('deep3', 'e'): {'foo': 'bar', 'other': 5}, + ('deep3', 'f'): {'foo': 'bar', 'other': 6}, + } + + compacted_metadata = base.DataMetadata._compact_metadata(new_metadata, ALL_GENERATED_KEYS) + + self.assertEqual(compacted_metadata, { + (base.ALL_ELEMENTS, base.ALL_ELEMENTS): {'foo': 'bar'}, + (base.ALL_ELEMENTS, 'a'): {'other': 1}, + (base.ALL_ELEMENTS, 'b'): {'other': 2}, + (base.ALL_ELEMENTS, 'c'): {'other': 3}, + (base.ALL_ELEMENTS, 'd'): {'other': 4}, + (base.ALL_ELEMENTS, 'e'): {'other': 5}, + (base.ALL_ELEMENTS, 'f'): {'other': 6}, + }) + + new_metadata = { + ('deep1', 'a', 1): {'foo': 'bar1', 'other': 1}, + ('deep2', 'a', 2): {'foo': 'bar1', 'other': 2}, + ('deep3', 'a', 3): {'foo': 'bar1', 'other': 3}, + ('deep4', 'a', 4): {'foo': 'bar1', 'other': 4}, + ('deep1', 'b', 1): {'foo': 'bar2', 'other': 5}, + } + + compacted_metadata = base.DataMetadata._compact_metadata(new_metadata, ALL_GENERATED_KEYS) + + self.assertEqual(compacted_metadata, { + (base.ALL_ELEMENTS, base.ALL_ELEMENTS, 2): {'other': 2}, + (base.ALL_ELEMENTS, base.ALL_ELEMENTS, 3): {'other': 3}, + (base.ALL_ELEMENTS, base.ALL_ELEMENTS, 4): {'other': 4}, + (base.ALL_ELEMENTS, 'a', base.ALL_ELEMENTS): {'foo': 'bar1'}, + (base.ALL_ELEMENTS, 'a', 1): {'other': 1}, + (base.ALL_ELEMENTS, 'b', base.ALL_ELEMENTS): {'foo': 'bar2', 'other': 5}, + }) + + new_metadata = { + ('deep', 'a', 1): {'foo': 'bar1', 'other': 1}, + ('deep', 'a', 2): {'foo': 'bar1', 'other': 2}, + ('deep', 'b', 1): {'foo': 'bar2', 'other': 3}, + ('deep', 'b', 2): {'foo': 'bar2', 'other': 4}, + } + + compacted_metadata = base.DataMetadata._compact_metadata(new_metadata, ALL_GENERATED_KEYS) + + self.assertEqual(compacted_metadata, { + (base.ALL_ELEMENTS, 'a', base.ALL_ELEMENTS): {'foo': 'bar1'}, + (base.ALL_ELEMENTS, 'a', 1): {'other': 1}, + (base.ALL_ELEMENTS, 'a', 2): {'other': 2}, + (base.ALL_ELEMENTS, 'b', base.ALL_ELEMENTS): {'foo': 'bar2'}, + (base.ALL_ELEMENTS, 'b', 1): {'other': 3}, + (base.ALL_ELEMENTS, 'b', 2): {'other': 4}, + }) + + new_metadata = { + ('deep', 'a', 1): {'foo': 'bar1', 'other': 'bar1'}, + ('deep', 'a', 2): {'foo': 'bar1', 'other': 'bar2'}, + ('deep', 'b', 1): {'foo': 'bar2', 'other': 'bar1'}, + ('deep', 'b', 2): {'foo': 'bar2', 'other': 'bar2'}, + } + + 
compacted_metadata = base.DataMetadata._compact_metadata(new_metadata, ALL_GENERATED_KEYS) + + self.assertEqual(compacted_metadata, { + (base.ALL_ELEMENTS, base.ALL_ELEMENTS, 1): {'other': 'bar1'}, + (base.ALL_ELEMENTS, base.ALL_ELEMENTS, 2): {'other': 'bar2'}, + (base.ALL_ELEMENTS, 'a', base.ALL_ELEMENTS): {'foo': 'bar1'}, + (base.ALL_ELEMENTS, 'b', base.ALL_ELEMENTS): {'foo': 'bar2'}, + }) + + new_metadata = { + ('deep1', 'a', 1): {'foo': 'bar1', 'other': 1}, + ('deep1', 'a', 2): {'foo': 'bar1', 'other': 2}, + ('deep2', 'a', 3): {'foo': 'bar1', 'other': 3}, + ('deep2', 'a', 4): {'foo': 'bar1', 'other': 4}, + ('deep1', 'b', 1): {'foo': 'bar2', 'other': 1}, + ('deep1', 'b', 2): {'foo': 'bar2', 'other': 2}, + ('deep2', 'b', 3): {'foo': 'bar2', 'other': 3}, + ('deep2', 'b', 4): {'foo': 'bar2', 'other': 4}, + } + + compacted_metadata = base.DataMetadata._compact_metadata(new_metadata, ALL_GENERATED_KEYS) + + self.assertEqual(compacted_metadata, { + (base.ALL_ELEMENTS, base.ALL_ELEMENTS, 1): {'other': 1}, + (base.ALL_ELEMENTS, base.ALL_ELEMENTS, 2): {'other': 2}, + (base.ALL_ELEMENTS, base.ALL_ELEMENTS, 3): {'other': 3}, + (base.ALL_ELEMENTS, base.ALL_ELEMENTS, 4): {'other': 4}, + (base.ALL_ELEMENTS, 'a', base.ALL_ELEMENTS): {'foo': 'bar1'}, + (base.ALL_ELEMENTS, 'b', base.ALL_ELEMENTS): {'foo': 'bar2'}, + }) + + new_metadata = { + ('deep', 'a'): {'foo': 'bar', 'other': 1}, + ('deep', 'b'): {'foo': 'bar', 'other': 2}, + ('deep2', 'b'): {'other': 3}, + ('deep2', 'c'): {'foo': 'bar', 'other': 4}, + } + + compacted_metadata = base.DataMetadata._compact_metadata(new_metadata, ALL_GENERATED_KEYS) + + self.assertEqual(compacted_metadata, { + (base.ALL_ELEMENTS, 'a'): {'other': 1}, + (base.ALL_ELEMENTS, 'c'): {'foo': 'bar', 'other': 4}, + ('deep',base.ALL_ELEMENTS): {'foo': 'bar'}, + ('deep', 'b'): {'other': 2}, + ('deep2', 'b'): {'other': 3}, + }) + + new_metadata = { + ('deep', 'a'): {'foo': 'bar', 'other': 1}, + ('deep', 'b'): {'foo': 'bar', 'other': 2}, + ('deep', 'c'): {'other': 3}, + ('deep2', 'd'): {'foo': 'bar', 'other': 4}, + } + + compacted_metadata = base.DataMetadata._compact_metadata(new_metadata, ALL_GENERATED_KEYS) + + self.assertEqual(compacted_metadata, { + (base.ALL_ELEMENTS, 'a'): {'foo': 'bar', 'other': 1}, + (base.ALL_ELEMENTS, 'b'): {'foo': 'bar', 'other': 2}, + (base.ALL_ELEMENTS, 'c'): {'other': 3}, + (base.ALL_ELEMENTS, 'd'): {'foo':'bar', 'other': 4}, + }) + + new_metadata = { + (base.ALL_ELEMENTS, 0): {'structural_type': 'numpy.int64'}, + (0, 1): {'structural_type': 'str'}, + (1, 1): {'structural_type': 'str'}, + (2, 1): {'structural_type': 'str'}, + (base.ALL_ELEMENTS, 1): {'name': 'B'}, + (0, 0): {'structural_type': 'numpy.int64'}, + (1, 0): {'structural_type': 'numpy.int64'}, + (2, 0): {'structural_type': 'numpy.int64'}, + } + + compacted_metadata = base.DataMetadata._compact_metadata(new_metadata, ALL_GENERATED_KEYS) + + self.assertEqual(compacted_metadata, { + (base.ALL_ELEMENTS, 0): {'structural_type': 'numpy.int64'}, + (base.ALL_ELEMENTS, 1): {'name': 'B', 'structural_type': 'str'}, + }) + + new_metadata = { + (base.ALL_ELEMENTS, base.ALL_ELEMENTS, 0): {'structural_type': 'numpy.int64'}, + (base.ALL_ELEMENTS, base.ALL_ELEMENTS, 1): {'structural_type': 'str'}, + ('0', base.ALL_ELEMENTS, 0): {'name': 'A', 'structural_type': 'numpy.int64'}, + ('0', base.ALL_ELEMENTS, 1): {'name': 'B', 'structural_type': 'str'}, + } + + compacted_metadata = base.DataMetadata._compact_metadata(new_metadata, ALL_GENERATED_KEYS) + + self.assertEqual(compacted_metadata, { + 
(base.ALL_ELEMENTS, base.ALL_ELEMENTS, 0): {'structural_type': 'numpy.int64', 'name': 'A'}, + (base.ALL_ELEMENTS, base.ALL_ELEMENTS, 1): {'structural_type': 'str', 'name': 'B'}, + }) + + def test_greedy_prune_metadata(self): + # Warmup test 1. + selectors_to_compact = [('a',), ('b',),('c',)] + compacted_selector = [(base.ALL_ELEMENTS, )] + + pruned_selectors = base.DataMetadata._greedy_prune_selector(compacted_selector, selectors_to_compact) + + self.assertEqual(pruned_selectors, [ + (base.ALL_ELEMENTS, ) + ]) + + # Warmup test 2. + selectors_to_compact = [('deep', 'a'), ('deep', 'b'), ('deep', 'c'), ('deep2', 'd')] + compacted_selector = [(base.ALL_ELEMENTS, base.ALL_ELEMENTS,)] + + pruned_selectors = base.DataMetadata._greedy_prune_selector(compacted_selector, selectors_to_compact) + + self.assertEqual(pruned_selectors, [ + (base.ALL_ELEMENTS, base.ALL_ELEMENTS,) + ]) + + # Check if it can remove unnecessary outputs. + selectors_to_compact = [('deep', 'a'), ('deep', 'b'), ('deep2', 'a'), ('deep2', 'b')] + compacted_selector = [(base.ALL_ELEMENTS, 'a'), (base.ALL_ELEMENTS, 'b'), ('deep', 'a'), ('deep2', 'b')] + + pruned_selectors = base.DataMetadata._greedy_prune_selector(compacted_selector, selectors_to_compact) + + self.assertEqual(pruned_selectors, [ + (base.ALL_ELEMENTS, 'a'), (base.ALL_ELEMENTS, 'b') + ]) + + # Case when compacted_selector overlaps. + selectors_to_compact = [('a', 'deep'), ('b', 'deep'), ('a', 'deep2'), ('b', 'deep2')] + compacted_selector = [('a', base.ALL_ELEMENTS), ('b', base.ALL_ELEMENTS), (base.ALL_ELEMENTS, 'deep2')] + + pruned_selectors = base.DataMetadata._greedy_prune_selector(compacted_selector, selectors_to_compact) + + self.assertEqual(pruned_selectors, [ + ('a', base.ALL_ELEMENTS), ('b', base.ALL_ELEMENTS) + ]) + + # Check the order. + selectors_to_compact = [('a', 'deep'), ('b', 'deep'), ('a', 'deep2'), ('b', 'deep2')] + compacted_selector = [(base.ALL_ELEMENTS, 'deep2'), ('a', base.ALL_ELEMENTS), ('b', base.ALL_ELEMENTS),] + + pruned_selectors = base.DataMetadata._greedy_prune_selector(compacted_selector, selectors_to_compact) + + self.assertEqual(pruned_selectors, [ + ('a', base.ALL_ELEMENTS), ('b', base.ALL_ELEMENTS) + ]) + + # More complex compacted_selectors. + selectors_to_compact = [('a', 'deep'), ('b', 'deep'), ('a', 'deep2'), ('b', 'deep2')] + compacted_selector = [(base.ALL_ELEMENTS, 'deep2'), ('a', base.ALL_ELEMENTS), + (base.ALL_ELEMENTS, 'deep'), ('b', base.ALL_ELEMENTS),] + + pruned_selectors = base.DataMetadata._greedy_prune_selector(compacted_selector, selectors_to_compact) + + self.assertEqual(pruned_selectors, [ + (base.ALL_ELEMENTS, 'deep2'), (base.ALL_ELEMENTS, 'deep') + ]) + + # All-elements in selectors_to_compact. 
+ selectors_to_compact = [('deep', 'a', 1), ('deep', base.ALL_ELEMENTS, 2)] + compacted_selector = [(base.ALL_ELEMENTS, 'a', base.ALL_ELEMENTS), ('deep', base.ALL_ELEMENTS, 2)] + + pruned_selectors = base.DataMetadata._greedy_prune_selector(compacted_selector, selectors_to_compact) + + self.assertEqual(pruned_selectors, [ + (base.ALL_ELEMENTS, 'a', base.ALL_ELEMENTS), ('deep', base.ALL_ELEMENTS, 2) + ]) + + def test_semantic_types_merge(self): + metadata = base.DataMetadata().update(('0',), { + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/DatasetEntryPoint'], + }) + + metadata_regular = metadata.update((base.ALL_ELEMENTS,), { + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + }) + + self.assertEqual(metadata_regular.query(('0',)).get('semantic_types', None), ('https://metadata.datadrivendiscovery.org/types/Table',)) + + metadata._update_with_generated_metadata({ + (base.ALL_ELEMENTS,): { + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + }, + }) + + self.assertEqual(metadata.query(('0',)).get('semantic_types', None), ('https://metadata.datadrivendiscovery.org/types/DatasetEntryPoint', 'https://metadata.datadrivendiscovery.org/types/Table',)) + + def test_compact(self): + md = base.Metadata().update(('0',), { + 'key': 'value', + }) + md = md.update(('1',), { + 'key': 'value', + }) + + md = md.compact(['key']) + + self.assertEqual(md.to_internal_json_structure(), [{ + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': {'key': 'value'}, + }]) + + md = base.Metadata().update(('0',), { + 'key': 'value', + }) + md = md.update(('1',), { + 'key': 'value', + }) + md = md.update(('2',), { + 'key': 'value2', + }) + + md = md.compact(['key']) + + self.assertEqual(md.to_internal_json_structure(), [{ + 'selector': ['0'], + 'metadata': {'key': 'value'}, + }, { + 'selector': ['1'], + 'metadata': {'key': 'value'}, + }, { + 'selector': ['2'], + 'metadata': {'key': 'value2'}, + }]) + + md = base.Metadata().update(('0',), { + 'key': 'value', + 'key2': 'value', + }) + md = md.update(('1',), { + 'key': 'value', + 'key2': 'value', + }) + + md = md.compact(['key']) + + self.assertEqual(md.to_internal_json_structure(), [{ + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': {'key': 'value'}, + }, { + 'selector': ['0'], + 'metadata': {'key2': 'value'}, + }, { + 'selector': ['1'], + 'metadata': {'key2': 'value'}, + }]) + + +if __name__ == '__main__': + unittest.main() diff --git a/d3m/tests/test_metrics.py b/d3m/tests/test_metrics.py new file mode 100644 index 0000000..b6ac9ec --- /dev/null +++ b/d3m/tests/test_metrics.py @@ -0,0 +1,1261 @@ +import io +import unittest + +import pandas +import sklearn +from distutils.version import LooseVersion + +from d3m import exceptions, metrics +from d3m.metadata import problem + + +class TestMetrics(unittest.TestCase): + def _read_csv(self, csv): + return pandas.read_csv( + io.StringIO(csv), + # We do not want to do any conversion of values at this point. + # This should be done by primitives later on. + dtype=str, + # We always expect one row header. + header=0, + # We want empty strings and not NaNs. 
+ na_filter=False, + encoding='utf8', + ) + + def test_alignment(self): + truth = self._read_csv(""" +d3mIndex,class_label +1,a +2,b +3,c +4,d + """) + + predictions = self._read_csv(""" +d3mIndex,class_label,confidence +2,b,0.4 +4,d,0.5 +3,c,0.6 +1,a,0.1 + """) + + self.assertEqual(metrics.Metric.align(truth, predictions).values.tolist(), [['1', 'a', '0.1'], ['2', 'b', '0.4'], ['3', 'c', '0.6'], ['4', 'd', '0.5']]) + + predictions = self._read_csv(""" +d3mIndex,confidence,class_label +1,0.1,a +2,0.4,b +4,0.5,d +3,0.6,c + """) + + self.assertEqual(metrics.Metric.align(truth, predictions).values.tolist(), [['1', 'a', '0.1'], ['2', 'b', '0.4'], ['3', 'c', '0.6'], ['4', 'd', '0.5']]) + + predictions = self._read_csv(""" +confidence,class_label,d3mIndex +0.1,a,1 +0.4,b,2 +0.5,d,4 +0.6,c,3 + """) + + self.assertEqual(metrics.Metric.align(truth, predictions).values.tolist(), [['1', 'a', '0.1'], ['2', 'b', '0.4'], ['3', 'c', '0.6'], ['4', 'd', '0.5']]) + + predictions = self._read_csv(""" +d3mIndex +1 +2 +4 +3 + """) + + with self.assertRaises(exceptions.InvalidArgumentValueError): + metrics.Metric.align(truth, predictions) + + predictions = self._read_csv(""" +d3mIndex,class_label,confidence +1,a,0.1 +2,b,0.4 +3,c,0.6 + """) + + with self.assertRaises(exceptions.InvalidArgumentValueError): + metrics.Metric.align(truth, predictions) + + truth = self._read_csv(""" +d3mIndex,class_label +1,a1 +1,a2 +2,b +3,c1 +3,c2 +3,c3 +4,d1 +4,d2 + """) + + predictions = self._read_csv(""" +d3mIndex,class_label +2,b +4,d +3,c +1,a + """) + + self.assertEqual(metrics.Metric.align(truth, predictions).values.tolist(), [['1', 'a'], ['2', 'b'], ['3', 'c'], ['4', 'd']]) + + predictions = self._read_csv(""" +d3mIndex,class_label +4,d +2,b1 +2,b2 +2,b3 +2,b4 +2,b5 +2,b6 +3,c +1,a + """) + + self.assertEqual(metrics.Metric.align(truth, predictions).values.tolist(), [['1', 'a'], ['2', 'b1'], ['2', 'b2'], ['2', 'b3'], ['2', 'b4'], ['2', 'b5'], ['2', 'b6'], ['3', 'c'], ['4', 'd']]) + + truth = self._read_csv(""" +d3mIndex,class_label +1,a1 +1,a2 +3,c1 +2,b +3,c2 +3,c3 +4,d1 +4,d2 + """) + + with self.assertRaises(exceptions.InvalidArgumentValueError): + metrics.Metric.align(truth, predictions) + + def test_labels(self): + pred_df = pandas.DataFrame(columns=['d3mIndex', 'class'], dtype=object) + pred_df['d3mIndex'] = pandas.Series([0, 1, 2, 3, 4]) + pred_df['class'] = pandas.Series(['a', 'b', 'a', 'b', 'b']) + + ground_truth_df = pandas.DataFrame(columns=['d3mIndex', 'class'], dtype=object) + ground_truth_df['d3mIndex'] = pandas.Series([0, 1, 2, 3, 4]) + ground_truth_df['class'] = pandas.Series(['a', 'b', 'a', 'b', 'a']) + + precision_metric = metrics.PrecisionMetric(pos_label='a') + self.assertEqual(precision_metric.score(ground_truth_df, pred_df), 1.0) + + precision_metric = metrics.PrecisionMetric(pos_label='b') + self.assertAlmostEqual(precision_metric.score(ground_truth_df, pred_df), 0.6666666666666666) + + def test_hamming_loss(self): + # Testcase 1: MultiLabel, typical + + y_true = self._read_csv(""" +d3mIndex,class_label +3,happy-pleased +3,relaxing-calm +7,amazed-suprised +7,happy-pleased +13,quiet-still +13,sad-lonely + """) + + y_pred = self._read_csv(""" +d3mIndex,class_label +3,happy-pleased +3,sad-lonely +7,amazed-suprised +7,happy-pleased +13,quiet-still +13,happy-pleased + """) + + self.assertAlmostEqual(metrics.HammingLossMetric().score(y_true, y_pred), 0.26666666666666666) + + # Testcase 2: MultiLabel, Zero loss + + y_true = self._read_csv(""" +d3mIndex,class_label +3,happy-pleased +3,relaxing-calm 
+7,amazed-suprised +7,happy-pleased +13,quiet-still +13,sad-lonely + """) + + y_pred = self._read_csv(""" +d3mIndex,class_label +3,happy-pleased +3,relaxing-calm +7,amazed-suprised +7,happy-pleased +13,quiet-still +13,sad-lonely + """) + + self.assertAlmostEqual(metrics.HammingLossMetric().score(y_true, y_pred), 0.0) + + # Testcase 3: MultiLabel, Complete loss + + y_true = self._read_csv(""" +d3mIndex,class_label +3,happy-pleased +3,relaxing-calm +7,amazed-suprised +7,happy-pleased +13,quiet-still +13,sad-lonely + """) + + y_pred = self._read_csv(""" +d3mIndex,class_label +3,ecstatic +3,sad-lonely +3,quiet-still +3,amazed-suprised +7,ecstatic +7,sad-lonely +7,relaxing-calm +7,quiet-still +13,ecstatic +13,happy-pleased +13,relaxing-calm +13,amazed-suprised + """) + + self.assertAlmostEqual(metrics.HammingLossMetric().score(y_true, y_pred), 1.0) + + # Testcase 4: Multiclass, case 1 + # Multiclass is not really supported or reasonable to use, but we still test it to test also edge cases. + + y_true = self._read_csv(""" +d3mIndex,species +2,versicolor +16,virginica +17,setosa +22,versicolor +30,versicolor +31,virginica +26,versicolor +33,versicolor +1,versicolor +37,virginica + """) + + y_pred = self._read_csv(""" +d3mIndex,species +1,setosa +2,versicolor +22,versicolor +26,virginica +30,versicolor +31,virginica +33,versicolor +17,setosa +37,virginica +16,virginica + """) + + self.assertAlmostEqual(metrics.HammingLossMetric().score(y_true, y_pred), 0.1333333) + + # Testcase 5: Multiclass, case 2 + # Multiclass is not really supported or reasonable to use, but we still test it to test also edge cases. + + y_true = self._read_csv(""" +d3mIndex,species +1,versicolor +2,versicolor +16,virginica +17,setosa +22,versicolor +26,versicolor +30,versicolor +31,virginica +33,versicolor +37,virginica + """) + + y_pred = self._read_csv(""" +d3mIndex,species +1,versicolor +2,versicolor +16,virginica +17,setosa +22,versicolor +26,versicolor +30,versicolor +31,virginica +33,versicolor +37,virginica + """) + + self.assertAlmostEqual(metrics.HammingLossMetric().score(y_true, y_pred), 0.0) + + # Testcase 6: Multiclass, case 3 + # Multiclass is not really supported or reasonable to use, but we still test it to test also edge cases. 
+ + y_true = self._read_csv(""" +d3mIndex,species +1,versicolor +2,versicolor +16,versicolor +17,virginica +22,versicolor +26,versicolor +30,versicolor +31,virginica +33,versicolor +37,virginica + """) + + y_pred = self._read_csv(""" +d3mIndex,species +1,setosa +2,setosa +16,setosa +17,setosa +22,setosa +26,setosa +30,setosa +31,setosa +33,setosa +37,setosa + """) + + self.assertAlmostEqual(metrics.HammingLossMetric().score(y_true, y_pred), 0.66666666) + + def test_root_mean_squared_error(self): + y_true = self._read_csv(""" +d3mIndex,value +1,3 +2,-1.0 +17,7 +16,2 + """) + + # regression univariate, regression multivariate, forecasting, collaborative filtering + y_pred = self._read_csv(""" +d3mIndex,value +1,2.1 +2,0.0 +16,2 +17,8 + """) + + self.assertAlmostEqual(metrics.RootMeanSquareErrorMetric().score(y_true, y_pred), 0.8381527307120105) + + y_true = self._read_csv(""" +d3mIndex,value1,value2 +1,0.5,1 +2,-1,1 +16,7,-6 + """) + + y_pred = self._read_csv(""" +d3mIndex,value1,value2 +1,0,2 +2,-1,2 +16,8,-5 + """) + + self.assertAlmostEqual(metrics.RootMeanSquareErrorMetric().score(y_true, y_pred), 0.8227486121839513) + + def test_precision_at_top_k(self): + # Forecasting test + ground_truth_list_1 = self._read_csv(""" +d3mIndex,value +1,1 +6,6 +2,10 +4,5 +5,12 +7,2 +8,18 +3,7 +9,4 +10,8 + """) + predictions_list_1 = self._read_csv(""" +d3mIndex,value +1,0 +10,11 +2,2 +4,6 +5,14 +6,9 +7,3 +8,17 +9,10 +3,8 + """) + self.assertAlmostEqual(metrics.PrecisionAtTopKMetric(k=5).score(ground_truth_list_1, predictions_list_1), 0.6) + + def test_object_detection_average_precision(self): + # Object Detection test + predictions_list_1 = self._read_csv(""" +d3mIndex,box,confidence +1,"110,110,110,210,210,210,210,110",0.6 +2,"5,10,5,20,20,20,20,10",0.9 +2,"120,130,120,200,200,200,200,130",0.6 + """) + + ground_truth_list_1 = self._read_csv(""" +d3mIndex,box +1,"100,100,100,200,200,200,200,100" +2,"10,10,10,20,20,20,20,10" +2,"70,80,70,150,140,150,140,80" + """) + + self.assertAlmostEqual(metrics.ObjectDetectionAveragePrecisionMetric().score(ground_truth_list_1, predictions_list_1), 0.6666666666666666) + + predictions_list_2 = self._read_csv(""" +d3mIndex,box,confidence +285,"330,463,330,505,387,505,387,463",0.0739 +285,"420,433,420,498,451,498,451,433",0.0910 +285,"328,465,328,540,403,540,403,465",0.1008 +285,"480,477,480,522,508,522,508,477",0.1012 +285,"357,460,357,537,417,537,417,460",0.1058 +285,"356,456,356,521,391,521,391,456",0.0843 +225,"345,460,345,547,415,547,415,460",0.0539 +225,"381,362,381,513,455,513,455,362",0.0542 +225,"382,366,382,422,416,422,416,366",0.0559 +225,"730,463,730,583,763,583,763,463",0.0588 + """) + + ground_truth_list_2 = self._read_csv(""" +d3mIndex,box +285,"480,457,480,529,515,529,515,457" +285,"480,457,480,529,515,529,515,457" +225,"522,540,522,660,576,660,576,540" +225,"739,460,739,545,768,545,768,460" + """) + + self.assertAlmostEqual(metrics.ObjectDetectionAveragePrecisionMetric().score(ground_truth_list_2, predictions_list_2), 0.125) + + predictions_list_3 = self._read_csv(""" +d3mIndex,box,confidence +1,"110,110,110,210,210,210,210,110",0.6 +2,"120,130,120,200,200,200,200,130",0.6 +2,"5,8,5,16,15,16,15,8",0.9 +2,"11,12,11,18,21,18,21,12",0.9 + """) + + ground_truth_list_3 = self._read_csv(""" +d3mIndex,box +1,"100,100,100,200,200,200,200,100" +2,"10,10,10,20,20,20,20,10" +2,"70,80,70,150,140,150,140,80" + """) + + self.assertAlmostEqual(metrics.ObjectDetectionAveragePrecisionMetric().score(ground_truth_list_3, predictions_list_3), 0.4444444444444444) + + 
predictions_list_4 = self._read_csv(""" +d3mIndex,box,confidence +1,"110,110,110,210,210,210,210,110",0.6 +2,"120,130,120,200,200,200,200,130",0.6 +2,"11,12,11,18,21,18,21,12",0.9 +2,"5,8,5,16,15,16,15,8",0.9 + """) + + ground_truth_list_4 = self._read_csv(""" +d3mIndex,box +1,"100,100,100,200,200,200,200,100" +2,"10,10,10,20,20,20,20,10" +2,"70,80,70,150,140,150,140,80" + """) + + self.assertAlmostEqual(metrics.ObjectDetectionAveragePrecisionMetric().score(ground_truth_list_4, predictions_list_4), 0.4444444444444444) + + def test_normalized_mutual_info_score(self): + # Community Detection Test + predictions_list_1 = self._read_csv(""" +d3mIndex,Class +0,2 +1,2 +2,1 +3,1 + """) + + ground_truth_list_1 = self._read_csv(""" +d3mIndex,Class +0,1 +1,1 +2,1 +3,1 + """) + + self.assertAlmostEqual(metrics.NormalizeMutualInformationMetric().score(ground_truth_list_1, predictions_list_1), 0.5) + + def test_f1_score(self): + # MultiTask MultiClass Classification + y_true = self._read_csv(""" +d3mIndex,value1,value2 +1,1,1 +2,3,2 +16,4,1 + """) + + y_pred = self._read_csv(""" +d3mIndex,value1,value2 +1,1,2 +2,3,1 +16,4,2 + """) + + self.assertAlmostEqual(metrics.F1MacroMetric().score(y_true, y_pred), 0.5) + self.assertAlmostEqual(metrics.F1MicroMetric().score(y_true, y_pred), 0.5) + + # MultiClass Classification Test + y_true = self._read_csv(""" +d3mIndex,class_label +1,0 +2,1 +3,2 +4,3 + """) + + y_pred = self._read_csv(""" +d3mIndex,class_label +1,0 +2,2 +3,1 +4,3 + """) + + self.assertAlmostEqual(metrics.F1MacroMetric().score(y_true, y_pred), 0.5) + self.assertAlmostEqual(metrics.F1MicroMetric().score(y_true, y_pred), 0.5) + + # MultiTask Binary Classification + y_true = self._read_csv(""" +d3mIndex,value1,value2 +1,1,1 +2,0,0 +16,0,1 + """) + + y_pred = self._read_csv(""" +d3mIndex,value1,value2 +1,1,1 +2,0,1 +16,0,0 + """) + + self.assertAlmostEqual(metrics.F1Metric(pos_label='1').score(y_true, y_pred), 0.75) + + # MultiLabel Classification Test + y_true = self._read_csv(""" +d3mIndex,class_label +1,3 +1,1 +2,2 +3,3 + """) + + y_pred = self._read_csv(""" +d3mIndex,class_label +1,1 +1,2 +1,3 +2,1 +3,3 + """) + + self.assertEqual(metrics.F1MacroMetric().score(y_true, y_pred), 0.5555555555555555) + self.assertAlmostEqual(metrics.F1MicroMetric().score(y_true, y_pred), 0.6666666666666665) + + # MultiTask MultiLabel Classification Test + y_true = self._read_csv(""" +d3mIndex,value1,value2 +1,3,1 +1,1, +2,2,0 +3,3,1 +3,3,3 + """) + + y_pred = self._read_csv(""" +d3mIndex,value1,value2 +1,1,1 +1,2, +1,3, +2,1,3 +2,,3 +3,3,0 + """) + + self.assertEqual(metrics.F1MacroMetric().score(y_true, y_pred), 0.38888888888888884) + self.assertAlmostEqual(metrics.F1MicroMetric().score(y_true, y_pred), 0.47619047619047616) + + def test_all_labels(self): + y_true = self._read_csv(""" +d3mIndex,class_label +3,happy-pleased +3,relaxing-calm +7,amazed-suprised +7,happy-pleased +13,quiet-still +13,sad-lonely + """) + + y_pred = self._read_csv(""" +d3mIndex,class_label +3,happy-pleased +3,sad-lonely +7,amazed-suprised +7,happy-pleased +13,quiet-still +13,happy-pleased + """) + + self.assertAlmostEqual(metrics.HammingLossMetric(all_labels={'class_label': ['happy-pleased', 'relaxing-calm', 'amazed-suprised', 'quiet-still', 'sad-lonely', 'foobar']}).score(y_true, y_pred), 0.2222222222222222) + + with self.assertRaisesRegex(exceptions.InvalidArgumentValueError, 'Truth contains extra labels'): + self.assertAlmostEqual(metrics.HammingLossMetric(all_labels={'class_label': ['happy-pleased', 'relaxing-calm', 
'amazed-suprised']}).score(y_true, y_pred), 0.2222222222222222) + + def test_duplicate_columns(self): + y_true = self._read_csv(""" +d3mIndex,value1,value2 +1,1,1 +16,4,1 +2,3,2 + """) + + y_pred = self._read_csv(""" +d3mIndex,value1,value2 +1,1,2 +2,3,1 +16,4,2 + """) + + y_true.columns = ('d3mIndex', 'value1', 'value1') + y_pred.columns = ('d3mIndex', 'value1', 'value1') + + with self.assertRaises(exceptions.InvalidArgumentValueError): + (metrics.F1MicroMetric().score(y_true, y_pred), 0.5) + + def test_precision(self): + # Binary Classification Test + y_true = self._read_csv(""" +d3mIndex,class_label +1,pos +2,pos +3,neg +4,neg +5,pos + """) + + y_pred = self._read_csv(""" +d3mIndex,class_label +1,pos +2,pos +3,neg +4,neg +5,neg + """) + self.assertEqual(metrics.PrecisionMetric("pos").score(y_true, y_pred), 1.0) + + y_pred_2 = self._read_csv(""" +d3mIndex,class_label +1,pos +2,pos +3,pos +4,pos +5,neg + """) + + self.assertEqual(metrics.PrecisionMetric("pos").score(y_true, y_pred_2), 0.5) + + y_pred_3 = self._read_csv(""" +d3mIndex,class_label +1,neg +2,neg +3,pos +4,pos +5,neg + """) + + self.assertEqual(metrics.PrecisionMetric("pos").score(y_true, y_pred_3), 0.0) + + def test_accuracy(self): + # Binary Classification Test + y_true = self._read_csv(""" +d3mIndex,class_label +1,pos +2,pos +3,neg +4,neg +5,pos + """) + + y_pred = self._read_csv(""" +d3mIndex,class_label +1,pos +2,pos +3,neg +4,neg +5,pos + """) + + self.assertEqual(metrics.AccuracyMetric().score(y_true, y_pred), 1.0) + + y_pred_2 = self._read_csv(""" +d3mIndex,class_label +1,pos +2,pos +3,pos +4,pos +5,neg + """) + + self.assertEqual(metrics.AccuracyMetric().score(y_true, y_pred_2), 0.4) + + y_pred_3 = self._read_csv(""" +d3mIndex,class_label +1,neg +2,neg +3,pos +4,pos +5,neg + """) + + self.assertEqual(metrics.AccuracyMetric().score(y_true, y_pred_3), 0.0) + + # MultiClass Classification Test + y_true = self._read_csv(""" +d3mIndex,class_label +1,0 +2,1 +3,2 +4,3 + """) + + y_pred = self._read_csv(""" +d3mIndex,class_label +1,0 +2,2 +4,3 +3,1 + """) + + self.assertEqual(metrics.AccuracyMetric().score(y_true, y_pred), 0.5) + + # MultiLabel Classification Test + y_true = self._read_csv(""" +d3mIndex,class_label +1,3 +1,1 +2,2 +3,3 + """) + + y_pred = self._read_csv(""" +d3mIndex,class_label +1,1 +1,2 +1,3 +2,1 +3,3 + """) + + self.assertEqual(metrics.AccuracyMetric().score(y_true, y_pred), 0.3333333333333333) + + def test_mean_squared_error(self): + # regression univariate, regression multivariate, forecasting, collaborative filtering + y_true = self._read_csv(""" +d3mIndex,value +1,3 +16,2 +2,-1.0 +17,7 + """) + + y_pred = self._read_csv(""" +d3mIndex,value +1,2.1 +2,0.0 +16,2 +17,8 + """) + + self.assertAlmostEqual(metrics.MeanSquareErrorMetric().score(y_true, y_pred), 0.7024999999999999) + + y_true = self._read_csv(""" +d3mIndex,value1,value2 +1,0.5,1 +2,-1,1 +16,7,-6 + """) + + y_pred = self._read_csv(""" +d3mIndex,value1,value2 +1,0,2 +2,-1,2 +16,8,-5 + """) + + self.assertAlmostEqual(metrics.MeanSquareErrorMetric().score(y_true, y_pred), 0.7083333333333334) + + def test_mean_absolute_error(self): + # regression univariate, regression multivariate, forecasting, collaborative filtering + y_true = self._read_csv(""" +d3mIndex,value +1,3 +2,-0.5 +16,2 +17,7 + """) + + y_pred = self._read_csv(""" +d3mIndex,value +17,8 +1,2.5 +2,0.0 +16,2 + """) + + self.assertAlmostEqual(metrics.MeanAbsoluteErrorMetric().score(y_true, y_pred), 0.5) + + y_true = self._read_csv(""" +d3mIndex,value1,value2 +1,0.5,1 +2,-1,1 +16,7,-6 + """) + 
+ y_pred = self._read_csv(""" +d3mIndex,value2,value1 +1,2,0 +16,-5,8 +2,2,-1 + """) + + self.assertAlmostEqual(metrics.MeanAbsoluteErrorMetric().score(y_true, y_pred), 0.75) + + def test_r_squared(self): + # regression univariate, regression multivariate, forecasting, collaborative filtering + y_true = self._read_csv(""" +d3mIndex,value +1,3 +2,-0.5 +16,2 +17,7 + """) + + y_pred = self._read_csv(""" +d3mIndex,value +1,2.5 +2,0.0 +16,2 +17,8 + """) + + self.assertAlmostEqual(metrics.RSquaredMetric().score(y_true, y_pred), 0.9486081370449679) + + y_true = self._read_csv(""" +d3mIndex,value1,value2 +1,0.5,1 +2,-1,1 +16,7,-6 + """) + + y_pred = self._read_csv(""" +d3mIndex,value2,value1 +1,2,0 +16,-5,8 +2,2,-1 + """) + + self.assertAlmostEqual(metrics.RSquaredMetric().score(y_true, y_pred), 0.9368005266622779) + + y_true = self._read_csv(""" +d3mIndex,value +1,1 +2,2 +16,3 + """) + + y_pred = self._read_csv(""" +d3mIndex,value +1,1 +2,2 +16,3 + """) + + self.assertAlmostEqual(metrics.RSquaredMetric().score(y_true, y_pred), 1.0) + + y_true = self._read_csv(""" +d3mIndex,value +1,1 +2,2 +16,3 + """) + + y_pred = self._read_csv(""" +d3mIndex,value +1,2 +2,2 +16,2 + """) + + self.assertAlmostEqual(metrics.RSquaredMetric().score(y_true, y_pred), 0.0) + + y_true = self._read_csv(""" +d3mIndex,value +1,1 +2,2 +16,3 + """) + + y_pred = self._read_csv(""" +d3mIndex,value +1,3 +2,2 +16,1 + """) + + self.assertAlmostEqual(metrics.RSquaredMetric().score(y_true, y_pred), -3.0) + + def test_recall(self): + # Binary Classification Test + y_true = self._read_csv(""" +d3mIndex,value +1,0 +6,1 +3,0 +4,0 +5,1 +2,1 + """) + + y_pred = self._read_csv(""" +d3mIndex,value +3,1 +1,0 +2,1 +4,0 +5,0 +6,1 + """) + + self.assertAlmostEqual(metrics.RecallMetric(pos_label='1').score(y_true, y_pred), 0.6666666666666666) + + @unittest.skipUnless(sklearn.__version__ >= LooseVersion("0.21"), "jaccard_score introduced in sklearn version 0.21") + def test_jaccard(self): + # Binary Classification Test + y_true = self._read_csv(""" +d3mIndex,value +1,0 +2,1 +16,1 + """) + + y_pred = self._read_csv(""" +d3mIndex,value +1,1 +2,1 +16,1 + """) + + self.assertAlmostEqual(metrics.JaccardSimilarityScoreMetric(pos_label='1').score(y_true, y_pred), 0.6666666666666666) + + + def test_meanReciprocalRank(self): + y_true = self._read_csv(""" +d3mIndex,relationship +0,father +1,sister +2,brother + """) + + # case 1: all correct + y_pred = self._read_csv(""" +d3mIndex,relationship,rank +0,father,1 +0,cousin,2 +0,mother,3 +0,brother,4 +0,grandfather,5 +1,sister,1 +1,mother,2 +1,aunt,3 +2,brother,1 +2,father,2 +2,sister,3 +2,grandfather,4 +2,aunt,5 + """) + self.assertAlmostEqual(metrics.MeanReciprocalRankMetric().score(y_true, y_pred), 1.0) + + # case 2: all wrong + y_pred = self._read_csv(""" +d3mIndex,relationship,rank +0,brother,1 +0,cousin,2 +0,mother,3 +0,grandfather,4 +1,brother,1 +1,mother,2 +1,aunt,3 +2,father,1 +2,grandmother,2 +2,sister,3 +2,grandfather,4 +2,aunt,5 + """) + self.assertAlmostEqual(metrics.MeanReciprocalRankMetric().score(y_true, y_pred), 0.0) + + # case 3 (typical case): some correct and some low ranks + y_pred = self._read_csv(""" +d3mIndex,relationship,rank +0,brother,1 +0,cousin,2 +0,mother,3 +0,father,4 +0,grandfather,5 +1,sister,1 +1,mother,2 +1,aunt,3 +2,father,1 +2,brother,2 +2,sister,3 +2,grandfather,4 +2,aunt,5 + """) + self.assertAlmostEqual(metrics.MeanReciprocalRankMetric().score(y_true, y_pred), 0.5833333333333334) + + # case 4: some are not ranked at all + y_pred = self._read_csv(""" 
+d3mIndex,relationship,rank +0,brother,1 +0,cousin,2 +0,mother,3 +0,grandfather,4 +1,sister,1 +1,mother,2 +1,aunt,3 +2,father,1 +2,uncle,2 +2,sister,3 +2,grandfather,4 +2,aunt,5 + """) + self.assertAlmostEqual(metrics.MeanReciprocalRankMetric().score(y_true, y_pred), 0.33466666666666667) + + def test_hitsAtK(self): + y_true = self._read_csv(""" +d3mIndex,relationship +0,father +1,sister +2,brother + """) + + # case 1: all correct + y_pred = self._read_csv(""" +d3mIndex,relationship,rank +0,father,1 +0,cousin,2 +0,mother,3 +0,brother,4 +0,grandfather,5 +1,sister,1 +1,mother,2 +1,aunt,3 +2,brother,1 +2,father,2 +2,sister,3 +2,grandfather,4 +2,aunt,5 + """) + self.assertAlmostEqual(metrics.HitsAtKMetric(k=3).score(y_true, y_pred), 1.0) + + # case 2: all wrong + y_pred = self._read_csv(""" +d3mIndex,relationship,rank +0,brother,1 +0,cousin,2 +0,mother,3 +0,grandfather,4 +1,brother,1 +1,mother,2 +1,aunt,3 +2,father,1 +2,grandmother,2 +2,sister,3 +2,grandfather,4 +2,aunt,5 + """) + self.assertAlmostEqual(metrics.HitsAtKMetric(k=3).score(y_true, y_pred), 0.0) + + # case 3 (typical case): some correct and some low ranks + y_pred = self._read_csv(""" +d3mIndex,relationship,rank +0,brother,1 +0,cousin,2 +0,mother,3 +0,father,4 +0,grandfather,5 +1,sister,1 +1,mother,2 +1,aunt,3 +2,father,1 +2,brother,2 +2,sister,3 +2,grandfather,4 +2,aunt,5 + """) + self.assertAlmostEqual(metrics.HitsAtKMetric(k=3).score(y_true, y_pred), 0.6666666666666666) + self.assertAlmostEqual(metrics.HitsAtKMetric(k=1).score(y_true, y_pred), 0.3333333333333333) + self.assertAlmostEqual(metrics.HitsAtKMetric(k=5).score(y_true, y_pred), 1.0) + + # case 4: some are not ranked at all + y_pred = self._read_csv(""" +d3mIndex,relationship,rank +0,brother,1 +0,cousin,2 +0,mother,3 +0,grandfather,4 +1,sister,1 +1,mother,2 +1,aunt,3 +2,father,1 +2,uncle,2 +2,sister,3 +2,grandfather,4 +2,aunt,5 + """) + self.assertAlmostEqual(metrics.HitsAtKMetric(k=3).score(y_true, y_pred), 0.3333333) + + def test_custom_metric(self): + class FooBar(): + def score(self, truth: metrics.Truth, predictions: metrics.Predictions) -> float: + return 1.0 + + problem.PerformanceMetric.register_metric('FOOBAR', best_value=1.0, worst_value=0.0, score_class=FooBar) + + self.assertEqual(problem.PerformanceMetric.FOOBAR.best_value(), 1.0) + self.assertEqual(problem.PerformanceMetric['FOOBAR'].worst_value(), 0.0) + self.assertEqual(problem.PerformanceMetric('FOOBAR').requires_confidence(), False) + self.assertIs(problem.PerformanceMetric.FOOBAR.get_class(), FooBar) + + def test_roc_auc(self): + # Binary Classification Test + y_true = self._read_csv(""" +d3mIndex,value +640,0 +641,1 +642,0 +643,0 +644,1 +645,1 +646,0 + """) + + y_pred = self._read_csv(""" +d3mIndex,value,confidence +640,0,0.612 +640,1,0.388 +641,0,0.6 +641,1,0.4 +645,1,0.9 +645,0,0.1 +642,1,0.0 +642,0,1.0 +643,0,0.52 +643,1,0.48 +644,0,0.3 +644,1,0.7 +646,0,1.0 +646,1,0.0 + """) + + self.assertAlmostEqual(metrics.RocAucMetric().score(y_true, y_pred), 0.9166666666666667) + + def test_roc_auc_micro(self): + # Testcase 1: MultiLabel, typical + + y_true = self._read_csv(""" +d3mIndex,value +3,d +4,a +4,b +4,c +7,a +7,b +7,d +9,b +9,e + """) + + y_pred = self._read_csv(""" +d3mIndex,value,confidence +9,b,0.1 +4,a,0.4 +4,b,0.3 +3,a,0.2 +3,b,0.1 +3,c,0.6 +3,d,0.1 +3,e,0 +4,c,0.1 +4,e,0.1 +4,d,0.1 +7,a,0.1 +7,b,0.1 +7,d,0.7 +7,c,0.1 +7,e,0 +9,a,0.4 +9,c,0.15 +9,d,0.3 +9,e,0.05 + """) + self.assertAlmostEqual(metrics.RocAucMicroMetric().score(y_true, y_pred), 0.5151515151515151) + + def 
test_roc_auc_macro(self): + # Testcase 1: MultiLabel, typical + + y_true = self._read_csv(""" +d3mIndex,value +3,d +4,a +4,b +4,c +7,a +7,b +7,d +9,b +9,e + """) + + y_pred = self._read_csv(""" +d3mIndex,value,confidence +3,a,0.2 +3,b,0.1 +3,c,0.6 +3,d,0.1 +3,e,0 +7,b,0.1 +7,a,0.1 +4,a,0.4 +4,b,0.3 +4,c,0.1 +4,d,0.1 +4,e,0.1 +9,a,0.4 +9,b,0.1 +9,c,0.15 +9,d,0.3 +9,e,0.05 +7,c,0.1 +7,d,0.7 +7,e,0 + """) + self.assertAlmostEqual(metrics.RocAucMacroMetric().score(y_true, y_pred), 0.5) + + +if __name__ == '__main__': + unittest.main() diff --git a/d3m/tests/test_monomial.py b/d3m/tests/test_monomial.py new file mode 100644 index 0000000..9888ba5 --- /dev/null +++ b/d3m/tests/test_monomial.py @@ -0,0 +1,358 @@ +import json +import pickle +import unittest +import os.path +import sys + +import d3m +from d3m import container, utils +from d3m.metadata import base + +TEST_PRIMITIVES_DIR = os.path.join(os.path.dirname(__file__), 'data', 'primitives') + +sys.path.insert(0, TEST_PRIMITIVES_DIR) + +from test_primitives.monomial import MonomialPrimitive + + +EXPECTED_PRIMITIVE_DESCRIPTION_JSON = r""" +{ + "id": "4a0336ae-63b9-4a42-860e-86c5b64afbdd", + "version": "0.1.0", + "name": "Monomial Regressor", + "keywords": [ + "test primitive" + ], + "source": { + "name": "Test team", + "contact": "mailto:author@example.com", + "uris": [ + "https://gitlab.com/datadrivendiscovery/tests-data/blob/master/primitives/test_primitives/monomial.py", + "https://gitlab.com/datadrivendiscovery/tests-data.git" + ] + }, + "installation": [ + { + "type": "PIP", + "package_uri": "git+https://gitlab.com/datadrivendiscovery/tests-data.git@__GIT_COMMIT__#egg=test_primitives&subdirectory=primitives" + } + ], + "location_uris": [ + "https://gitlab.com/datadrivendiscovery/tests-data/raw/__GIT_COMMIT__/primitives/test_primitives/monomial.py" + ], + "python_path": "d3m.primitives.regression.monomial.Test", + "algorithm_types": [ + "LINEAR_REGRESSION" + ], + "primitive_family": "REGRESSION", + "schema": "https://metadata.datadrivendiscovery.org/schemas/v0/primitive.json", + "original_python_path": "test_primitives.monomial.MonomialPrimitive", + "primitive_code": { + "class_type_arguments": { + "Inputs": "d3m.container.list.List", + "Outputs": "d3m.container.list.List", + "Params": "test_primitives.monomial.Params", + "Hyperparams": "test_primitives.monomial.Hyperparams" + }, + "interfaces_version": "__INTERFACES_VERSION__", + "interfaces": [ + "supervised_learning.SupervisedLearnerPrimitiveBase", + "base.PrimitiveBase" + ], + "hyperparams": { + "bias": { + "type": "d3m.metadata.hyperparams.Hyperparameter", + "default": 0.0, + "structural_type": "float", + "semantic_types": [ + "https://metadata.datadrivendiscovery.org/types/TuningParameter" + ] + } + }, + "arguments": { + "hyperparams": { + "type": "test_primitives.monomial.Hyperparams", + "kind": "RUNTIME" + }, + "timeout": { + "type": "typing.Union[NoneType, float]", + "kind": "RUNTIME", + "default": null + }, + "iterations": { + "type": "typing.Union[NoneType, int]", + "kind": "RUNTIME", + "default": null + }, + "produce_methods": { + "type": "typing.Sequence[str]", + "kind": "RUNTIME" + }, + "random_seed": { + "default": 0, + "kind": "RUNTIME", + "type": "int" + }, + "inputs": { + "type": "d3m.container.list.List", + "kind": "PIPELINE" + }, + "outputs": { + "type": "d3m.container.list.List", + "kind": "PIPELINE" + }, + "params": { + "type": "test_primitives.monomial.Params", + "kind": "RUNTIME" + } + }, + "class_methods": {}, + "instance_methods": { + "__init__": { + "kind": 
"OTHER", + "arguments": [ + "hyperparams", + "random_seed" + ], + "returns": "NoneType" + }, + "fit": { + "kind": "OTHER", + "arguments": [ + "timeout", + "iterations" + ], + "returns": "d3m.primitive_interfaces.base.CallResult[NoneType]", + "description": "Fits primitive using inputs and outputs (if any) using currently set training data.\n\nThe returned value should be a ``CallResult`` object with ``value`` set to ``None``.\n\nIf ``fit`` has already been called in the past on different training data,\nthis method fits it **again from scratch** using currently set training data.\n\nOn the other hand, caller can call ``fit`` multiple times on the same training data\nto continue fitting.\n\nIf ``fit`` fully fits using provided training data, there is no point in making further\ncalls to this method with same training data, and in fact further calls can be noops,\nor a primitive can decide to fully refit from scratch.\n\nIn the case fitting can continue with same training data (even if it is maybe not reasonable,\nbecause the internal metric primitive is using looks like fitting will be degrading), if ``fit``\nis called again (without setting training data), the primitive has to continue fitting.\n\nCaller can provide ``timeout`` information to guide the length of the fitting process.\nIdeally, a primitive should adapt its fitting process to try to do the best fitting possible\ninside the time allocated. If this is not possible and the primitive reaches the timeout\nbefore fitting, it should raise a ``TimeoutError`` exception to signal that fitting was\nunsuccessful in the given time. The state of the primitive after the exception should be\nas the method call has never happened and primitive should continue to operate normally.\nThe purpose of ``timeout`` is to give opportunity to a primitive to cleanly manage\nits state instead of interrupting execution from outside. Maintaining stable internal state\nshould have precedence over respecting the ``timeout`` (caller can terminate the misbehaving\nprimitive from outside anyway). If a longer ``timeout`` would produce different fitting,\nthen ``CallResult``'s ``has_finished`` should be set to ``False``.\n\nSome primitives have internal fitting iterations (for example, epochs). For those, caller\ncan provide how many of primitive's internal iterations should a primitive do before returning.\nPrimitives should make iterations as small as reasonable. If ``iterations`` is ``None``,\nthen there is no limit on how many iterations the primitive should do and primitive should\nchoose the best amount of iterations on its own (potentially controlled through\nhyper-parameters). If ``iterations`` is a number, a primitive has to do those number of\niterations (even if not reasonable), if possible. ``timeout`` should still be respected\nand potentially less iterations can be done because of that. Primitives with internal\niterations should make ``CallResult`` contain correct values.\n\nFor primitives which do not have internal iterations, any value of ``iterations``\nmeans that they should fit fully, respecting only ``timeout``.\n\nParameters\n----------\ntimeout:\n A maximum time this primitive should be fitting during this method call, in seconds.\niterations:\n How many of internal iterations should the primitive do.\n\nReturns\n-------\nA ``CallResult`` with ``None`` value." 
+ }, + "fit_multi_produce": { + "kind": "OTHER", + "arguments": [ + "produce_methods", + "inputs", + "outputs", + "timeout", + "iterations" + ], + "returns": "d3m.primitive_interfaces.base.MultiCallResult", + "description": "A method calling ``fit`` and after that multiple produce methods at once.\n\nThis method allows primitive author to implement an optimized version of both fitting\nand producing a primitive on same data.\n\nIf any additional method arguments are added to primitive's ``set_training_data`` method\nor produce method(s), or removed from them, they have to be added to or removed from this\nmethod as well. This method should accept an union of all arguments accepted by primitive's\n``set_training_data`` method and produce method(s) and then use them accordingly when\ncomputing results.\n\nThe default implementation of this method just calls first ``set_training_data`` method,\n``fit`` method, and all produce methods listed in ``produce_methods`` in order and is\npotentially inefficient.\n\nParameters\n----------\nproduce_methods:\n A list of names of produce methods to call.\ninputs:\n The inputs given to ``set_training_data`` and all produce methods.\noutputs:\n The outputs given to ``set_training_data``.\ntimeout:\n A maximum time this primitive should take to both fit the primitive and produce outputs\n for all produce methods listed in ``produce_methods`` argument, in seconds.\niterations:\n How many of internal iterations should the primitive do for both fitting and producing\n outputs of all produce methods.\n\nReturns\n-------\nA dict of values for each produce method wrapped inside ``MultiCallResult``." + }, + "get_params": { + "kind": "OTHER", + "arguments": [], + "returns": "test_primitives.monomial.Params", + "description": "Returns parameters of this primitive.\n\nParameters are all parameters of the primitive which can potentially change during a life-time of\na primitive. Parameters which cannot are passed through constructor.\n\nParameters should include all data which is necessary to create a new instance of this primitive\nbehaving exactly the same as this instance, when the new instance is created by passing the same\nparameters to the class constructor and calling ``set_params``.\n\nNo other arguments to the method are allowed (except for private arguments).\n\nReturns\n-------\nAn instance of parameters." + }, + "multi_produce": { + "kind": "OTHER", + "arguments": [ + "produce_methods", + "inputs", + "timeout", + "iterations" + ], + "returns": "d3m.primitive_interfaces.base.MultiCallResult", + "description": "A method calling multiple produce methods at once.\n\nWhen a primitive has multiple produce methods it is common that they might compute the\nsame internal results for same inputs but return different representations of those results.\nIf caller is interested in multiple of those representations, calling multiple produce\nmethods might lead to recomputing same internal results multiple times. To address this,\nthis method allows primitive author to implement an optimized version which computes\ninternal results only once for multiple calls of produce methods, but return those different\nrepresentations.\n\nIf any additional method arguments are added to primitive's produce method(s), they have\nto be added to this method as well. 
This method should accept an union of all arguments\naccepted by primitive's produce method(s) and then use them accordingly when computing\nresults.\n\nThe default implementation of this method just calls all produce methods listed in\n``produce_methods`` in order and is potentially inefficient.\n\nIf primitive should have been fitted before calling this method, but it has not been,\nprimitive should raise a ``PrimitiveNotFittedError`` exception.\n\nParameters\n----------\nproduce_methods:\n A list of names of produce methods to call.\ninputs:\n The inputs given to all produce methods.\ntimeout:\n A maximum time this primitive should take to produce outputs for all produce methods\n listed in ``produce_methods`` argument, in seconds.\niterations:\n How many of internal iterations should the primitive do.\n\nReturns\n-------\nA dict of values for each produce method wrapped inside ``MultiCallResult``." + }, + "produce": { + "kind": "PRODUCE", + "arguments": [ + "inputs", + "timeout", + "iterations" + ], + "returns": "d3m.primitive_interfaces.base.CallResult[d3m.container.list.List]", + "singleton": false, + "inputs_across_samples": [], + "description": "Produce primitive's best choice of the output for each of the inputs.\n\nThe output value should be wrapped inside ``CallResult`` object before returning.\n\nIn many cases producing an output is a quick operation in comparison with ``fit``, but not\nall cases are like that. For example, a primitive can start a potentially long optimization\nprocess to compute outputs. ``timeout`` and ``iterations`` can serve as a way for a caller\nto guide the length of this process.\n\nIdeally, a primitive should adapt its call to try to produce the best outputs possible\ninside the time allocated. If this is not possible and the primitive reaches the timeout\nbefore producing outputs, it should raise a ``TimeoutError`` exception to signal that the\ncall was unsuccessful in the given time. The state of the primitive after the exception\nshould be as the method call has never happened and primitive should continue to operate\nnormally. The purpose of ``timeout`` is to give opportunity to a primitive to cleanly\nmanage its state instead of interrupting execution from outside. Maintaining stable internal\nstate should have precedence over respecting the ``timeout`` (caller can terminate the\nmisbehaving primitive from outside anyway). If a longer ``timeout`` would produce\ndifferent outputs, then ``CallResult``'s ``has_finished`` should be set to ``False``.\n\nSome primitives have internal iterations (for example, optimization iterations).\nFor those, caller can provide how many of primitive's internal iterations\nshould a primitive do before returning outputs. Primitives should make iterations as\nsmall as reasonable. If ``iterations`` is ``None``, then there is no limit on\nhow many iterations the primitive should do and primitive should choose the best amount\nof iterations on its own (potentially controlled through hyper-parameters).\nIf ``iterations`` is a number, a primitive has to do those number of iterations,\nif possible. ``timeout`` should still be respected and potentially less iterations\ncan be done because of that. 
Primitives with internal iterations should make\n``CallResult`` contain correct values.\n\nFor primitives which do not have internal iterations, any value of ``iterations``\nmeans that they should run fully, respecting only ``timeout``.\n\nIf primitive should have been fitted before calling this method, but it has not been,\nprimitive should raise a ``PrimitiveNotFittedError`` exception.\n\nParameters\n----------\ninputs:\n The inputs of shape [num_inputs, ...].\ntimeout:\n A maximum time this primitive should take to produce outputs during this method call, in seconds.\niterations:\n How many of internal iterations should the primitive do.\n\nReturns\n-------\nThe outputs of shape [num_inputs, ...] wrapped inside ``CallResult``." + }, + "set_params": { + "kind": "OTHER", + "arguments": [ + "params" + ], + "returns": "NoneType", + "description": "Sets parameters of this primitive.\n\nParameters are all parameters of the primitive which can potentially change during a life-time of\na primitive. Parameters which cannot are passed through constructor.\n\nNo other arguments to the method are allowed (except for private arguments).\n\nParameters\n----------\nparams:\n An instance of parameters." + }, + "set_training_data": { + "kind": "OTHER", + "arguments": [ + "inputs", + "outputs" + ], + "returns": "NoneType", + "description": "Sets current training data of this primitive.\n\nThis marks training data as changed even if new training data is the same as\nprevious training data.\n\nStandard sublasses in this package do not adhere to the Liskov substitution principle when\ninheriting this method because they do not necessary accept all arguments found in the base\nclass. This means that one has to inspect which arguments are accepted at runtime, or in\nother words, one has to inspect which exactly subclass a primitive implements, if\nyou are accepting a wider range of primitives. This relaxation is allowed only for\nstandard subclasses found in this package. Primitives themselves should not break\nthe Liskov substitution principle but should inherit from a suitable base class.\n\nParameters\n----------\ninputs:\n The inputs.\noutputs:\n The outputs." + } + }, + "class_attributes": { + "logger": "logging.Logger", + "metadata": "d3m.metadata.base.PrimitiveMetadata" + }, + "instance_attributes": { + "hyperparams": "d3m.metadata.hyperparams.Hyperparams", + "random_seed": "int", + "docker_containers": "typing.Dict[str, d3m.primitive_interfaces.base.DockerContainer]", + "volumes": "typing.Dict[str, str]", + "temporary_directory": "typing.Union[NoneType, str]" + }, + "params": { + "a": "float" + } + }, + "structural_type": "test_primitives.monomial.MonomialPrimitive", + "description": "A primitive which fits output = a * input.\n\nAttributes\n----------\nmetadata:\n Primitive's metadata. Available as a class attribute.\nlogger:\n Primitive's logger. 
Available as a class attribute.\nhyperparams:\n Hyperparams passed to the constructor.\nrandom_seed:\n Random seed passed to the constructor.\ndocker_containers:\n A dict mapping Docker image keys from primitive's metadata to (named) tuples containing\n container's address under which the container is accessible by the primitive, and a\n dict mapping exposed ports to ports on that address.\nvolumes:\n A dict mapping volume keys from primitive's metadata to file and directory paths\n where downloaded and extracted files are available to the primitive.\ntemporary_directory:\n An absolute path to a temporary directory a primitive can use to store any files\n for the duration of the current pipeline run phase. Directory is automatically\n cleaned up after the current pipeline run phase finishes.", + "digest": "__DIGEST__" +} +""".replace('__INTERFACES_VERSION__', d3m.__version__).replace('__GIT_COMMIT__', utils.current_git_commit(TEST_PRIMITIVES_DIR)).replace('__DIGEST__', MonomialPrimitive.metadata.query()['digest']) + + +class TestMonomialPrimitive(unittest.TestCase): + def call_primitive(self, primitive, method_name, **kwargs): + return getattr(primitive, method_name)(**kwargs) + + def test_basic(self): + hyperparams_class = MonomialPrimitive.metadata.get_hyperparams() + + primitive = MonomialPrimitive(hyperparams=hyperparams_class.defaults()) + + inputs = container.List([1, 2, 3, 4, 5, 6], generate_metadata=True) + + outputs = container.List([2, 4, 6, 8, 10, 12], generate_metadata=True) + + self.call_primitive(primitive, 'set_training_data', inputs=inputs, outputs=outputs) + call_metadata = self.call_primitive(primitive, 'fit') + + self.assertEqual(call_metadata.has_finished, True) + self.assertEqual(call_metadata.iterations_done, None) + + inputs = container.List([10, 20, 30], generate_metadata=True) + + call_metadata = self.call_primitive(primitive, 'produce', inputs=inputs) + + self.assertSequenceEqual(call_metadata.value, [20, 40, 60]) + self.assertEqual(call_metadata.has_finished, True) + self.assertEqual(call_metadata.iterations_done, None) + + self.assertEqual(call_metadata.value.metadata.query(())['dimension']['length'], 3) + self.assertEqual(call_metadata.value.metadata.query((base.ALL_ELEMENTS,))['structural_type'], float) + + call_metadata = primitive.multi_produce(produce_methods=('produce',), inputs=inputs) + + self.assertEqual(len(call_metadata.values), 1) + self.assertSequenceEqual(call_metadata.values['produce'], [20, 40, 60]) + self.assertEqual(call_metadata.has_finished, True) + self.assertEqual(call_metadata.iterations_done, None) + + def test_hyperparameter(self): + hyperparams_class = MonomialPrimitive.metadata.get_hyperparams() + + primitive = MonomialPrimitive(hyperparams=hyperparams_class(bias=1)) + + inputs = container.List([1, 2, 3, 4, 5, 6], generate_metadata=True) + + outputs = container.List([2, 4, 6, 8, 10, 12], generate_metadata=True) + + self.call_primitive(primitive, 'set_training_data', inputs=inputs, outputs=outputs) + call_metadata = self.call_primitive(primitive, 'fit') + + self.assertEqual(call_metadata.has_finished, True) + self.assertEqual(call_metadata.iterations_done, None) + + inputs = container.List([10, 20, 30], generate_metadata=True) + + call_metadata = self.call_primitive(primitive, 'produce', inputs=inputs) + + self.assertSequenceEqual(call_metadata.value, [21, 41, 61]) + self.assertEqual(call_metadata.has_finished, True) + self.assertEqual(call_metadata.iterations_done, None) + + 
self.assertEqual(call_metadata.value.metadata.query(())['dimension']['length'], 3) + self.assertEqual(call_metadata.value.metadata.query((base.ALL_ELEMENTS,))['structural_type'], float) + + def test_recreation(self): + hyperparams_class = MonomialPrimitive.metadata.get_hyperparams() + + primitive = MonomialPrimitive(hyperparams=hyperparams_class(bias=1)) + + inputs = container.List([1, 2, 3, 4, 5, 6], generate_metadata=True) + + outputs = container.List([2, 4, 6, 8, 10, 12], generate_metadata=True) + + self.call_primitive(primitive, 'set_training_data', inputs=inputs, outputs=outputs) + call_metadata = self.call_primitive(primitive, 'fit') + + self.assertEqual(call_metadata.has_finished, True) + self.assertEqual(call_metadata.iterations_done, None) + + params = self.call_primitive(primitive, 'get_params') + + pickled_params = pickle.dumps(params) + unpickled_params = pickle.loads(pickled_params) + + self.assertEqual(params, unpickled_params) + + pickled_hyperparams = pickle.dumps(primitive.hyperparams) + unpickled_hyperparams = pickle.loads(pickled_hyperparams) + + self.assertEqual(primitive.hyperparams, unpickled_hyperparams) + + primitive = MonomialPrimitive(hyperparams=unpickled_hyperparams) + + self.call_primitive(primitive, 'set_params', params=unpickled_params) + + inputs = container.List([10, 20, 30], generate_metadata=True) + + call_metadata =self.call_primitive(primitive, 'produce', inputs=inputs) + + self.assertSequenceEqual(call_metadata.value, [21, 41, 61]) + self.assertEqual(call_metadata.has_finished, True) + self.assertEqual(call_metadata.iterations_done, None) + + self.assertEqual(call_metadata.value.metadata.query(())['dimension']['length'], 3) + self.assertEqual(call_metadata.value.metadata.query((base.ALL_ELEMENTS,))['structural_type'], float) + + def test_pickle(self): + hyperparams_class = MonomialPrimitive.metadata.get_hyperparams() + + primitive = MonomialPrimitive(hyperparams=hyperparams_class(bias=1)) + + inputs = container.List([1, 2, 3, 4, 5, 6], generate_metadata=True) + + outputs = container.List([2, 4, 6, 8, 10, 12], generate_metadata=True) + + self.call_primitive(primitive, 'set_training_data', inputs=inputs, outputs=outputs) + call_metadata = self.call_primitive(primitive, 'fit') + + self.assertEqual(call_metadata.has_finished, True) + self.assertEqual(call_metadata.iterations_done, None) + + pickled_primitive = pickle.dumps(primitive) + unpickled_primitive = pickle.loads(pickled_primitive) + + self.assertEqual(primitive.hyperparams, unpickled_primitive.hyperparams) + self.assertEqual(primitive.random_seed, unpickled_primitive.random_seed) + self.assertEqual(primitive.docker_containers, unpickled_primitive.docker_containers) + + inputs = container.List([10, 20, 30], generate_metadata=True) + + call_metadata =self.call_primitive(unpickled_primitive, 'produce', inputs=inputs) + + self.assertSequenceEqual(call_metadata.value, [21, 41, 61]) + self.assertEqual(call_metadata.has_finished, True) + self.assertEqual(call_metadata.iterations_done, None) + + self.assertEqual(call_metadata.value.metadata.query(())['dimension']['length'], 3) + self.assertEqual(call_metadata.value.metadata.query((base.ALL_ELEMENTS,))['structural_type'], float) + + def test_metadata(self): + expected_description = json.loads(EXPECTED_PRIMITIVE_DESCRIPTION_JSON) + + # We stringify to JSON and parse it to make sure the description can be stringified to JSON. 
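+ # (Parsing the dumped JSON back also leaves only plain dicts, lists, and scalars on both
+ # sides, so the assertEqual below compares pure JSON structures rather than metadata objects.)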
+ description = json.loads(json.dumps(MonomialPrimitive.metadata.to_json_structure())) + + self.maxDiff = None + self.assertEqual(expected_description, description) + + +if __name__ == '__main__': + unittest.main() diff --git a/d3m/tests/test_null.py b/d3m/tests/test_null.py new file mode 100644 index 0000000..10e059f --- /dev/null +++ b/d3m/tests/test_null.py @@ -0,0 +1,267 @@ +import json +import unittest +import os.path +import sys + +import numpy as np + +import d3m +from d3m import container, utils +from d3m.metadata import base + +TEST_PRIMITIVES_DIR = os.path.join(os.path.dirname(__file__), 'data', 'primitives') + +sys.path.insert(0, TEST_PRIMITIVES_DIR) + +from test_primitives.null import NullTransformerPrimitive, NullDataFrameUnsupervisedLearnerPrimitive + + +EXPECTED_PRIMITIVE_DESCRIPTION_JSON = r""" +{ + "id": "e0f83c35-fe3d-4fa6-92cf-f7421408eab5", + "version": "0.1.0", + "name": "Produce the same as the input", + "keywords": [ + "test primitive" + ], + "source": { + "name": "Test team", + "contact": "mailto:author@example.com", + "uris": [ + "https://gitlab.com/datadrivendiscovery/tests-data/blob/master/primitives/test_primitives/null.py", + "https://gitlab.com/datadrivendiscovery/tests-data.git" + ] + }, + "installation": [ + { + "type": "PIP", + "package_uri": "git+https://gitlab.com/datadrivendiscovery/tests-data.git@__GIT_COMMIT__#egg=test_primitives&subdirectory=primitives" + } + ], + "location_uris": [ + "https://gitlab.com/datadrivendiscovery/tests-data/raw/__GIT_COMMIT__/primitives/test_primitives/add_primitives.py" + ], + "python_path": "d3m.primitives.operator.null.TransformerTest", + "algorithm_types": [ + "IDENTITY_FUNCTION" + ], + "primitive_family": "OPERATOR", + "schema": "https://metadata.datadrivendiscovery.org/schemas/v0/primitive.json", + "original_python_path": "test_primitives.null.NullTransformerPrimitive", + "primitive_code": { + "class_type_arguments": { + "Inputs": "d3m.container.list.List", + "Outputs": "d3m.container.list.List", + "Hyperparams": "test_primitives.null.Hyperparams", + "Params": "NoneType" + }, + "interfaces_version": "__INTERFACES_VERSION__", + "interfaces": [ + "transformer.TransformerPrimitiveBase", + "base.PrimitiveBase" + ], + "hyperparams": {}, + "arguments": { + "hyperparams": { + "type": "test_primitives.null.Hyperparams", + "kind": "RUNTIME" + }, + "random_seed": { + "type": "int", + "kind": "RUNTIME", + "default": 0 + }, + "docker_containers": { + "type": "typing.Union[NoneType, typing.Dict[str, d3m.primitive_interfaces.base.DockerContainer]]", + "kind": "RUNTIME", + "default": null + }, + "volumes": { + "type": "typing.Union[NoneType, typing.Dict[str, str]]", + "kind": "RUNTIME", + "default": null + }, + "temporary_directory": { + "type": "typing.Union[NoneType, str]", + "kind": "RUNTIME", + "default": null + }, + "timeout": { + "type": "typing.Union[NoneType, float]", + "kind": "RUNTIME", + "default": null + }, + "iterations": { + "type": "typing.Union[NoneType, int]", + "kind": "RUNTIME", + "default": null + }, + "produce_methods": { + "type": "typing.Sequence[str]", + "kind": "RUNTIME" + }, + "inputs": { + "type": "d3m.container.list.List", + "kind": "PIPELINE" + }, + "params": { + "type": "NoneType", + "kind": "RUNTIME" + } + }, + "class_methods": {}, + "instance_methods": { + "__init__": { + "kind": "OTHER", + "arguments": [ + "hyperparams", + "random_seed", + "docker_containers", + "volumes", + "temporary_directory" + ], + "returns": "NoneType", + "description": "All primitives should accept all their hyper-parameters 
in a constructor as one value,\nan instance of type ``Hyperparams``.\n\nProvided random seed should control all randomness used by this primitive.\nPrimitive should behave exactly the same for the same random seed across multiple\ninvocations. You can call `numpy.random.RandomState(random_seed)` to obtain an\ninstance of a random generator using provided seed. If your primitive does not\nuse randomness, consider not exposing this argument in your primitive's constructor\nto signal that.\n\nPrimitives can be wrappers around or use one or more Docker images which they can\nspecify as part of ``installation`` field in their metadata. Each Docker image listed\nthere has a ``key`` field identifying that image. When primitive is created,\n``docker_containers`` contains a mapping between those keys and connection information\nwhich primitive can use to connect to a running Docker container for a particular Docker\nimage and its exposed ports. Docker containers might be long running and shared between\nmultiple instances of a primitive. If your primitive does not use Docker images,\nconsider not exposing this argument in your primitive's constructor.\n\n**Note**: Support for primitives using Docker containers has been put on hold.\nCurrently it is not expected that any runtime running primitives will run\nDocker containers for a primitive.\n\nPrimitives can also use additional static files which can be added as a dependency\nto ``installation`` metadata. When done so, given volumes are provided to the\nprimitive through ``volumes`` argument to the primitive's constructor as a\ndict mapping volume keys to file and directory paths where downloaded and\nextracted files are available to the primitive. All provided files and directories\nare read-only. If your primitive does not use static files, consider not exposing\nthis argument in your primitive's constructor.\n\nPrimitives can also use the provided temporary directory to store any files for\nthe duration of the current pipeline run phase. Directory is automatically\ncleaned up after the current pipeline run phase finishes. Do not store in this\ndirectory any primitive's state you would like to preserve between \"fit\" and\n\"produce\" phases of pipeline execution. Use ``Params`` for that. The main intent\nof this temporary directory is to store files referenced by any ``Dataset`` object\nyour primitive might create and followup primitives in the pipeline should have\naccess to. When storing files into this directory consider using capabilities\nof Python's `tempfile` module to generate filenames which will not conflict with\nany other files stored there. Use provided temporary directory as ``dir`` argument\nto set it as base directory to generate additional temporary files and directories\nas needed. If your primitive does not use temporary directory, consider not exposing\nthis argument in your primitive's constructor.\n\nNo other arguments to the constructor are allowed (except for private arguments)\nbecause we want instances of primitives to be created without a need for any other\nprior computation.\n\nModule in which a primitive is defined should be kept lightweight and on import not do\nany (pre)computation, data loading, or resource allocation/reservation. Any loading\nand resource allocation/reservation should be done in the constructor. Any (pre)computation\nshould be done lazily when needed once requested through other methods and not in the constructor." 
+ }, + "fit": { + "kind": "OTHER", + "arguments": [ + "timeout", + "iterations" + ], + "returns": "d3m.primitive_interfaces.base.CallResult[NoneType]", + "description": "A noop.\n\nParameters\n----------\ntimeout:\n A maximum time this primitive should be fitting during this method call, in seconds.\niterations:\n How many of internal iterations should the primitive do.\n\nReturns\n-------\nA ``CallResult`` with ``None`` value." + }, + "fit_multi_produce": { + "kind": "OTHER", + "arguments": [ + "produce_methods", + "inputs", + "timeout", + "iterations" + ], + "returns": "d3m.primitive_interfaces.base.MultiCallResult", + "description": "A method calling ``fit`` and after that multiple produce methods at once.\n\nParameters\n----------\nproduce_methods:\n A list of names of produce methods to call.\ninputs:\n The inputs given to all produce methods.\ntimeout:\n A maximum time this primitive should take to both fit the primitive and produce outputs\n for all produce methods listed in ``produce_methods`` argument, in seconds.\niterations:\n How many of internal iterations should the primitive do for both fitting and producing\n outputs of all produce methods.\n\nReturns\n-------\nA dict of values for each produce method wrapped inside ``MultiCallResult``." + }, + "get_params": { + "kind": "OTHER", + "arguments": [], + "returns": "NoneType", + "description": "A noop.\n\nReturns\n-------\nAn instance of parameters." + }, + "multi_produce": { + "kind": "OTHER", + "arguments": [ + "produce_methods", + "inputs", + "timeout", + "iterations" + ], + "returns": "d3m.primitive_interfaces.base.MultiCallResult", + "description": "A method calling multiple produce methods at once.\n\nWhen a primitive has multiple produce methods it is common that they might compute the\nsame internal results for same inputs but return different representations of those results.\nIf caller is interested in multiple of those representations, calling multiple produce\nmethods might lead to recomputing same internal results multiple times. To address this,\nthis method allows primitive author to implement an optimized version which computes\ninternal results only once for multiple calls of produce methods, but return those different\nrepresentations.\n\nIf any additional method arguments are added to primitive's produce method(s), they have\nto be added to this method as well. This method should accept an union of all arguments\naccepted by primitive's produce method(s) and then use them accordingly when computing\nresults.\n\nThe default implementation of this method just calls all produce methods listed in\n``produce_methods`` in order and is potentially inefficient.\n\nIf primitive should have been fitted before calling this method, but it has not been,\nprimitive should raise a ``PrimitiveNotFittedError`` exception.\n\nParameters\n----------\nproduce_methods:\n A list of names of produce methods to call.\ninputs:\n The inputs given to all produce methods.\ntimeout:\n A maximum time this primitive should take to produce outputs for all produce methods\n listed in ``produce_methods`` argument, in seconds.\niterations:\n How many of internal iterations should the primitive do.\n\nReturns\n-------\nA dict of values for each produce method wrapped inside ``MultiCallResult``." 
+ }, + "produce": { + "kind": "PRODUCE", + "arguments": [ + "inputs", + "timeout", + "iterations" + ], + "returns": "d3m.primitive_interfaces.base.CallResult[d3m.container.list.List]", + "singleton": false, + "inputs_across_samples": [], + "description": "Produce primitive's best choice of the output for each of the inputs.\n\nThe output value should be wrapped inside ``CallResult`` object before returning.\n\nIn many cases producing an output is a quick operation in comparison with ``fit``, but not\nall cases are like that. For example, a primitive can start a potentially long optimization\nprocess to compute outputs. ``timeout`` and ``iterations`` can serve as a way for a caller\nto guide the length of this process.\n\nIdeally, a primitive should adapt its call to try to produce the best outputs possible\ninside the time allocated. If this is not possible and the primitive reaches the timeout\nbefore producing outputs, it should raise a ``TimeoutError`` exception to signal that the\ncall was unsuccessful in the given time. The state of the primitive after the exception\nshould be as the method call has never happened and primitive should continue to operate\nnormally. The purpose of ``timeout`` is to give opportunity to a primitive to cleanly\nmanage its state instead of interrupting execution from outside. Maintaining stable internal\nstate should have precedence over respecting the ``timeout`` (caller can terminate the\nmisbehaving primitive from outside anyway). If a longer ``timeout`` would produce\ndifferent outputs, then ``CallResult``'s ``has_finished`` should be set to ``False``.\n\nSome primitives have internal iterations (for example, optimization iterations).\nFor those, caller can provide how many of primitive's internal iterations\nshould a primitive do before returning outputs. Primitives should make iterations as\nsmall as reasonable. If ``iterations`` is ``None``, then there is no limit on\nhow many iterations the primitive should do and primitive should choose the best amount\nof iterations on its own (potentially controlled through hyper-parameters).\nIf ``iterations`` is a number, a primitive has to do those number of iterations,\nif possible. ``timeout`` should still be respected and potentially less iterations\ncan be done because of that. Primitives with internal iterations should make\n``CallResult`` contain correct values.\n\nFor primitives which do not have internal iterations, any value of ``iterations``\nmeans that they should run fully, respecting only ``timeout``.\n\nIf primitive should have been fitted before calling this method, but it has not been,\nprimitive should raise a ``PrimitiveNotFittedError`` exception.\n\nParameters\n----------\ninputs:\n The inputs of shape [num_inputs, ...].\ntimeout:\n A maximum time this primitive should take to produce outputs during this method call, in seconds.\niterations:\n How many of internal iterations should the primitive do.\n\nReturns\n-------\nThe outputs of shape [num_inputs, ...] wrapped inside ``CallResult``." + }, + "set_params": { + "kind": "OTHER", + "arguments": [ + "params" + ], + "returns": "NoneType", + "description": "A noop.\n\nParameters\n----------\nparams:\n An instance of parameters." 
+ }, + "set_training_data": { + "kind": "OTHER", + "arguments": [], + "returns": "NoneType", + "description": "A noop.\n\nParameters\n----------" + } + }, + "class_attributes": { + "logger": "logging.Logger", + "metadata": "d3m.metadata.base.PrimitiveMetadata" + }, + "instance_attributes": { + "hyperparams": "d3m.metadata.hyperparams.Hyperparams", + "random_seed": "int", + "docker_containers": "typing.Dict[str, d3m.primitive_interfaces.base.DockerContainer]", + "volumes": "typing.Dict[str, str]", + "temporary_directory": "typing.Union[NoneType, str]" + } + }, + "structural_type": "test_primitives.null.NullTransformerPrimitive", + "description": "A primitive which passes through inputs as outputs.\n\nIt does not really care if inputs is list.\n\nAttributes\n----------\nmetadata:\n Primitive's metadata. Available as a class attribute.\nlogger:\n Primitive's logger. Available as a class attribute.\nhyperparams:\n Hyperparams passed to the constructor.\nrandom_seed:\n Random seed passed to the constructor.\ndocker_containers:\n A dict mapping Docker image keys from primitive's metadata to (named) tuples containing\n container's address under which the container is accessible by the primitive, and a\n dict mapping exposed ports to ports on that address.\nvolumes:\n A dict mapping volume keys from primitive's metadata to file and directory paths\n where downloaded and extracted files are available to the primitive.\ntemporary_directory:\n An absolute path to a temporary directory a primitive can use to store any files\n for the duration of the current pipeline run phase. Directory is automatically\n cleaned up after the current pipeline run phase finishes.", + "digest": "__DIGEST__" +} +""".replace('__INTERFACES_VERSION__', d3m.__version__).replace('__GIT_COMMIT__', utils.current_git_commit(TEST_PRIMITIVES_DIR)).replace('__DIGEST__', NullTransformerPrimitive.metadata.query()['digest']) + + +class TestNullPrimitives(unittest.TestCase): + def call_primitive(self, primitive, method_name, extra_arguments=None, **kwargs): + primitive_arguments = primitive.metadata.query()['primitive_code'].get('arguments', {}) + + if extra_arguments is None: + extra_arguments = {} + + arguments = {} + + for argument_name, argument_value in dict(extra_arguments, **kwargs).items(): + if primitive_arguments[argument_name]['kind'] == base.PrimitiveArgumentKind.PIPELINE: + arguments[argument_name] = argument_value.metadata + else: + arguments[argument_name] = type(argument_value) + + return getattr(primitive, method_name)(**kwargs) + + def test_null_transformer(self): + hyperparams_class = NullTransformerPrimitive.metadata.get_hyperparams() + + primitive = NullTransformerPrimitive(hyperparams=hyperparams_class.defaults()) + + inputs = container.List([10, 20, 30], generate_metadata=True) + call_metadata = self.call_primitive(primitive, 'produce', inputs=inputs) + + self.assertSequenceEqual(call_metadata.value, [10, 20, 30]) + self.assertEqual(call_metadata.has_finished, True) + self.assertEqual(call_metadata.iterations_done, None) + + self.assertEqual(call_metadata.value.metadata.query(())['dimension']['length'], 3) + self.assertEqual(call_metadata.value.metadata.query((base.ALL_ELEMENTS,))['structural_type'], int) + + def test_null_transformer_metadata(self): + expected_description = json.loads(EXPECTED_PRIMITIVE_DESCRIPTION_JSON) + + # We stringify to JSON and parse it to make sure the description can be stringified to JSON. 
+ description = json.loads(json.dumps(NullTransformerPrimitive.metadata.to_json_structure())) + + self.maxDiff = None + self.assertEqual(expected_description, description) + + def test_null_dataframe_unsupervised_learner(self): + hyperparams_class = NullDataFrameUnsupervisedLearnerPrimitive.metadata.get_hyperparams() + + primitive = NullDataFrameUnsupervisedLearnerPrimitive(hyperparams=hyperparams_class.defaults()) + + inputs = container.DataFrame([10, 20, 30], generate_metadata=True) + call_metadata = self.call_primitive(primitive, 'produce', inputs=inputs) + + self.assertSequenceEqual(call_metadata.value.values.tolist(), [[10], [20], [30]]) + self.assertEqual(call_metadata.has_finished, True) + self.assertEqual(call_metadata.iterations_done, None) + + self.assertEqual(call_metadata.value.metadata.query(())['dimension']['length'], 3) + self.assertEqual(call_metadata.value.metadata.query((base.ALL_ELEMENTS, 0,))['structural_type'], np.int64) + + +if __name__ == '__main__': + unittest.main() diff --git a/d3m/tests/test_params.py b/d3m/tests/test_params.py new file mode 100644 index 0000000..a29db27 --- /dev/null +++ b/d3m/tests/test_params.py @@ -0,0 +1,53 @@ +import typing +import unittest + +import numpy + +from d3m.metadata import params +from d3m import container + + +class TestParams(unittest.TestCase): + def test_params(self): + class TestParams(params.Params): + a: str + b: int + + test_params = TestParams({'a': 'foo', 'b': 42}) + + self.assertEqual(test_params['a'], 'foo') + self.assertEqual(test_params['b'], 42) + + with self.assertRaisesRegex(ValueError, 'Not all parameters are specified'): + TestParams({'a': 'foo'}) + + with self.assertRaisesRegex(ValueError, 'Additional parameters are specified'): + TestParams({'a': 'foo', 'b': 42, 'c': None}) + + test_params = TestParams(a='bar', b=10) + self.assertEqual(test_params['a'], 'bar') + self.assertEqual(test_params['b'], 10) + + with self.assertRaisesRegex(TypeError, 'Value \'.*\' is not an instance of the type'): + TestParams({'a': 'foo', 'b': 10.1}) + + with self.assertRaisesRegex(TypeError, 'Only methods and attribute type annotations can be defined on Params class'): + class ErrorParams(params.Params): + a = str + b = int + + def test_numpy(self): + class TestParams(params.Params): + state: container.ndarray + + TestParams(state=container.ndarray([1, 2, 3], generate_metadata=True)) + + def test_list_int64(self): + class TestParams(params.Params): + mapping: typing.Dict + + TestParams(mapping={'a': [numpy.int64(1), numpy.int64(1)]}) + + +if __name__ == '__main__': + unittest.main() diff --git a/d3m/tests/test_pipeline.py b/d3m/tests/test_pipeline.py new file mode 100644 index 0000000..0f64758 --- /dev/null +++ b/d3m/tests/test_pipeline.py @@ -0,0 +1,1487 @@ +import collections +import copy +import datetime +import json +import logging +import os +import sys +import typing +import unittest +import uuid + +COMMON_PRIMITIVES_DIR = os.path.join(os.path.dirname(__file__), 'common-primitives') +# NOTE: This insertion should appear before any code attempting to resolve or load primitives, +# so the git submodule version of `common-primitives` is looked at first. 
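+# (The bundled test primitives directory under data/primitives is added to sys.path in the
+# same way just below, for the same reason.)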
+sys.path.insert(0, COMMON_PRIMITIVES_DIR) + +TEST_PRIMITIVES_DIR = os.path.join(os.path.dirname(__file__), 'data', 'primitives') +sys.path.insert(0, TEST_PRIMITIVES_DIR) + +from common_primitives.dataset_to_dataframe import DatasetToDataFramePrimitive +from common_primitives.column_parser import ColumnParserPrimitive +from common_primitives.random_forest import RandomForestClassifierPrimitive + +from test_primitives.monomial import MonomialPrimitive +from test_primitives.random import RandomPrimitive +from test_primitives.sum import SumPrimitive +from test_primitives.increment import IncrementPrimitive + +from d3m import container, exceptions, index, utils +from d3m.metadata import base as metadata_base, hyperparams, params, pipeline +from d3m.primitive_interfaces import base, transformer, supervised_learning + + +TEST_PIPELINE_1 = """ +{ + "id": "2b50a7db-c5e2-434c-b02d-9e595bd56788", + "digest": "b87dbbd5b8bcc1470050a756cf22d6def2662a61482debf55c09948225372411", + "schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", + "source": { + "name": "Test author", + "contact": "mailto:test@example.com" + }, + "created": "2018-02-28T09:42:27.443844Z", + "name": "Test pipeline", + "description": "Just a test pipeline", + "users": [ + { + "id": "f32467bc-698c-4ab6-a489-2e8f73fcfdaa", + "reason": "User was making a test", + "rationale": "I made a test" + } + ], + "inputs": [ + { + "name": "dataframe inputs" + }, + { + "name": "dataframe outputs" + }, + { + "name": "extra data" + } + ], + "outputs": [ + { + "name": "dataframe predictions", + "data": "steps.6.main" + } + ], + "steps": [ + { + "type": "PRIMITIVE", + "primitive": { + "id": "efa24fae-49c4-4482-b49f-ceb351c0d916", + "version": "0.1.0", + "python_path": "d3m.primitives.test.LossPrimitive", + "name": "Loss Primitive" + }, + "arguments": { + "inputs": { + "type": "CONTAINER", + "data": "inputs.0" + }, + "outputs": { + "type": "CONTAINER", + "data": "inputs.1" + } + }, + "outputs": [ + { + "id": "produce" + } + ] + }, + { + "type": "PRIMITIVE", + "primitive": { + "id": "00c3a435-a87c-405b-bed9-3a8c402d4431", + "version": "0.1.0", + "python_path": "d3m.primitives.test.Model1Primitive", + "name": "Model 1 Primitive" + }, + "arguments": { + "inputs": { + "type": "CONTAINER", + "data": "inputs.0" + }, + "outputs": { + "type": "CONTAINER", + "data": "inputs.1" + } + }, + "outputs": [ + { + "id": "produce" + } + ] + }, + { + "type": "PRIMITIVE", + "primitive": { + "id": "4987c4b0-cf4c-4f7f-9bcc-557a6d72589d", + "version": "0.1.0", + "python_path": "d3m.primitives.test.Model2Primitive", + "name": "Model 2 Primitive" + }, + "arguments": { + "inputs": { + "type": "CONTAINER", + "data": "inputs.0" + }, + "outputs": { + "type": "CONTAINER", + "data": "inputs.1" + } + }, + "outputs": [ + { + "id": "produce" + } + ] + }, + { + "type": "PRIMITIVE", + "primitive": { + "id": "9c00d42d-382d-4177-a0e7-082da88a29c8", + "version": "0.1.0", + "python_path": "d3m.primitives.operator.sum.Test", + "name": "Sum Values", + "digest": "__SUM_DIGEST__" + }, + "arguments": { + "inputs": { + "type": "CONTAINER", + "data": "inputs.0" + } + }, + "outputs": [ + { + "id": "produce" + } + ] + }, + { + "type": "PRIMITIVE", + "primitive": { + "id": "e42e6f17-77cc-4611-8cca-bba36a46e806", + "version": "0.1.0", + "python_path": "d3m.primitives.test.PipelineTestPrimitive", + "name": "Pipeline Test Primitive" + }, + "arguments": { + "inputs": { + "type": "CONTAINER", + "data": "inputs.0" + }, + "extra_data": { + "type": "CONTAINER", + "data": "inputs.2" + }, 
+ "offset": { + "type": "DATA", + "data": "steps.3.produce" + } + }, + "outputs": [ + { + "id": "produce" + }, + { + "id": "produce_score" + } + ], + "hyperparams": { + "loss": { + "type": "PRIMITIVE", + "data": 0 + }, + "column_to_operate_on": { + "type": "VALUE", + "data": 5 + }, + "ensemble": { + "type": "PRIMITIVE", + "data": [ + 1, + 2 + ] + }, + "columns_to_operate_on": { + "type": "VALUE", + "data": [3, 6, 7] + } + }, + "users": [ + { + "id": "98e5cc4a-7edc-41a3-ac98-ee799fb6a41b", + "reason": "User clicked on a button", + "rationale": "I dragged an icon" + } + ] + }, + { + "type": "SUBPIPELINE", + "pipeline": { + "id": "0113b91f-3010-4a47-bd56-a50c4e28a4a4", + "digest": "83430addfcb9430ad02fd59f114ac7c723806058ca90d6b0f226d1031826ac8d" + }, + "inputs": [ + { + "data": "steps.4.produce" + } + ], + "outputs": [ + { + "id": "pipeline_output" + } + ] + }, + { + "type": "PLACEHOLDER", + "inputs": [ + { + "data": "steps.5.pipeline_output" + }, + { + "data": "steps.4.produce_score" + } + ], + "outputs": [ + { + "id": "main" + } + ] + } + ] +} +""".replace('__SUM_DIGEST__', SumPrimitive.metadata.query()['digest']) + +TEST_PIPELINE_2 = """ +{ + "id": "0113b91f-3010-4a47-bd56-a50c4e28a4a4", + "digest": "83430addfcb9430ad02fd59f114ac7c723806058ca90d6b0f226d1031826ac8d", + "schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", + "created": "2018-02-28T09:42:27.443844Z", + "name": "Test pipeline", + "description": "Just a test pipeline", + "inputs": [ + {} + ], + "outputs": [ + { + "data": "steps.0.produce" + } + ], + "steps": [ + { + "type": "PRIMITIVE", + "primitive": { + "id": "4987c4b0-cf4c-4f7f-9bcc-557a6d72589d", + "version": "0.1.0", + "python_path": "d3m.primitives.test.Model2Primitive", + "name": "Model 2 Primitive" + }, + "arguments": { + "inputs": { + "type": "CONTAINER", + "data": "inputs.0" + }, + "outputs": { + "type": "CONTAINER", + "data": "inputs.0" + } + }, + "outputs": [ + { + "id": "produce" + } + ] + } + ] +} +""" + + +class MockPrimitiveBuilder: + """ + This class helps build mock primitives from scratch without checking. + """ + + def __init__(self, inputs, hyperparams, primitive_id=None, version='0.0.0', name='mock_primitive_name', python_path='d3m.primitives.mock.foobar', digest='f' * 64): + """ + inputs : Dict + It will be used to fill the 'arguments' field. 
+ outputs : List + List of output names + """ + + self.primitive_dict = { + 'type': 'PRIMITIVE', + 'primitive': { + 'id': primitive_id if primitive_id is not None else str(uuid.uuid4()), + 'version': version, + 'python_path': python_path, + 'name': name, + 'digest': digest, + }, + 'arguments': inputs, + 'hyperparams': hyperparams, + 'outputs': None, + } + + def build(self, **inputs_data): + primitive_dict = copy.deepcopy(self.primitive_dict) + primitive_dict['arguments'] = copy.deepcopy({name: self.primitive_dict['arguments'][name] for name in inputs_data.keys() if name in self.primitive_dict['arguments']}) + primitive_dict['hyperparams'] = copy.deepcopy({name: self.primitive_dict['hyperparams'][name] for name in inputs_data.keys() if name in self.primitive_dict['hyperparams']}) + + for name, data in inputs_data.items(): + if name in primitive_dict['arguments']: + primitive_dict['arguments'][name]['data'] = data + elif name in primitive_dict['hyperparams']: + primitive_dict['hyperparams'][name]['data'] = data + else: + raise IndexError("No match name found for '{name}' in primitive {primitive_name}".format(name=name, primitive_name=self.primitive_dict['primitive']['name'])) + return primitive_dict + + +class MockPipelineBuilder: + """ + This class helps build pipelines for testing from scratch without checking. + """ + + def __init__(self, input_names, pipeline_id=None, name='mock_name', description='mock_description'): + self._increase_counter = 0 + self.pipeline_dict = { + 'id': pipeline_id if pipeline_id is not None else str(uuid.uuid4()), + 'schema': 'https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json', + 'source': {'name': 'Test author'}, + 'created': datetime.datetime.now(tz=datetime.timezone.utc).isoformat(), + 'name': name, + 'description': description, + 'inputs': [{'name': n} for n in input_names], + 'outputs': [], + 'steps': [] + } + self._subpipelines = {} + + def add_primitive(self, primitive_dict, outputs): + """ + Add primitives. + """ + + primitive_dict['outputs'] = [{'id': o} for o in outputs] + self.pipeline_dict['steps'].append(primitive_dict) + + def add_placeholder(self, inputs, outputs): + placeholder_dict = { + 'type': 'PLACEHOLDER', + 'inputs': [{'data': input_data_ref} for input_data_ref in inputs], + 'outputs': [{'id': output_id for output_id in outputs}] + } + self.pipeline_dict['steps'].append(placeholder_dict) + + def add_subpipeline(self, pipeline: pipeline.Pipeline, inputs, outputs): + self._subpipelines[pipeline.id] = pipeline + subpipeline_dict = { + 'type': 'SUBPIPELINE', + 'pipeline': {'id': pipeline.id}, + 'inputs': [{'data': input_data_ref} for input_data_ref in inputs], + 'outputs': [{'id': output_id for output_id in outputs}] + } + self.pipeline_dict['steps'].append(subpipeline_dict) + + def add_output(self, name, data): + self.pipeline_dict['outputs'].append({'name': name, 'data': data}) + + def build(self, primitive_loading='ignore') -> pipeline.Pipeline: + """ + Output built pipeline instance. + + Parameters + ---------- + primitive_loading : str or callable + If `primitive_loading` == 'ignore', the primitive resolving function will be skipped. + If `primitive_loading` == 'default', a default primitive resolving function will be loaded. + If `primitive_loading` is a function, it will become the resolving function. + + Returns + ------- + Pipeline + A pipeline instance. 
+ """ + + resolver = pipeline.Resolver() + resolver.get_pipeline = lambda pipeline_description: self._subpipelines[pipeline_description['id']] + if primitive_loading == 'ignore': + resolver.get_primitive = lambda primitive_description: None + elif primitive_loading == 'full': + pass + elif callable(primitive_loading): + resolver.get_primitive = primitive_loading + else: + raise ValueError("unknown value of 'primitive_loading'") + + return pipeline.Pipeline.from_json_structure(self.pipeline_dict, resolver=resolver) + + +class Resolver(pipeline.Resolver): + def _get_primitive(self, primitive_description: typing.Dict) -> typing.Optional[typing.Type[base.PrimitiveBase]]: + # To hide any logging or stdout output. + with utils.silence(): + return super()._get_primitive(primitive_description) + + def get_pipeline(self, pipeline_description: typing.Dict) -> pipeline.Pipeline: + if pipeline_description['id'] == '0113b91f-3010-4a47-bd56-a50c4e28a4a4': + return pipeline.Pipeline.from_json(TEST_PIPELINE_2, resolver=self) + + return super().get_pipeline(pipeline_description) + + +Inputs = container.DataFrame +Outputs = container.DataFrame + + +class Hyperparams(hyperparams.Hyperparams): + pass + + +class Params(params.Params): + pass + + +# Silence any validation warnings. +with utils.silence(): + class LossPrimitive(supervised_learning.SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]): + metadata = metadata_base.PrimitiveMetadata({ + 'id': 'efa24fae-49c4-4482-b49f-ceb351c0d916', + 'version': '0.1.0', + 'name': "Loss Primitive", + 'python_path': 'd3m.primitives.test.LossPrimitive', + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.CROSS_ENTROPY, + ], + 'primitive_family': metadata_base.PrimitiveFamily.LOSS_FUNCTION, + }) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + pass + + def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None: + pass + + def fit(self, *, timeout: float = None, iterations: int = None) -> base.CallResult[None]: + pass + + def get_params(self) -> Params: + pass + + def set_params(self, *, params: Params) -> None: + pass + + class Model1Primitive(supervised_learning.SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]): + metadata = metadata_base.PrimitiveMetadata({ + 'id': '00c3a435-a87c-405b-bed9-3a8c402d4431', + 'version': '0.1.0', + 'name': "Model 1 Primitive", + 'python_path': 'd3m.primitives.test.Model1Primitive', + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.RANDOM_FOREST, + ], + 'primitive_family': metadata_base.PrimitiveFamily.CLASSIFICATION, + }) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + pass + + def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None: + pass + + def fit(self, *, timeout: float = None, iterations: int = None) -> base.CallResult[None]: + pass + + def get_params(self) -> Params: + pass + + def set_params(self, *, params: Params) -> None: + pass + + class Model2Hyperparams(hyperparams.Hyperparams): + # To test that a primitive instance can be a default value. 
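+        # The serialized default is expected to come out as the primitive's
+        # construction expression (checked in test_primitive_annotation below).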
+ base_estimator = hyperparams.Hyperparameter[base.PrimitiveBase]( + default=LossPrimitive(hyperparams=Hyperparams.defaults()), + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + + class Model2Primitive(supervised_learning.SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Model2Hyperparams]): + metadata = metadata_base.PrimitiveMetadata({ + 'id': '4987c4b0-cf4c-4f7f-9bcc-557a6d72589d', + 'version': '0.1.0', + 'name': "Model 2 Primitive", + 'python_path': 'd3m.primitives.test.Model2Primitive', + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.SUPPORT_VECTOR_MACHINE, + ], + 'primitive_family': metadata_base.PrimitiveFamily.CLASSIFICATION, + }) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + pass + + def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None: + pass + + def fit(self, *, timeout: float = None, iterations: int = None) -> base.CallResult[None]: + pass + + def get_params(self) -> Params: + pass + + def set_params(self, *, params: Params) -> None: + pass + + +class TestPipeline(unittest.TestCase): + @classmethod + def setUpClass(cls): + column_index = hyperparams.Hyperparameter[int](-1) + + class PipelineTestHyperparams(hyperparams.Hyperparams): + loss = hyperparams.Hyperparameter[typing.Optional[base.PrimitiveBase]](default=None, semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']) + column_to_operate_on = hyperparams.Hyperparameter[int](default=-1, semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter']) + ensemble = hyperparams.Set(elements=hyperparams.Hyperparameter[base.PrimitiveBase](default=MonomialPrimitive), default=(), max_size=10, semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']) + columns_to_operate_on = hyperparams.Set(column_index, (), 0, None, semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter']) + + PipelineTestInputs = typing.Union[container.Dataset, container.DataFrame] + + # Silence any validation warnings. 
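+        # PipelineTestPrimitive below provides the 'offset' argument and the
+        # 'produce_score' method referenced by TEST_PIPELINE_1.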
+ with utils.silence(): + class PipelineTestPrimitive(transformer.TransformerPrimitiveBase[PipelineTestInputs, Outputs, PipelineTestHyperparams]): + metadata = metadata_base.PrimitiveMetadata({ + 'id': 'e42e6f17-77cc-4611-8cca-bba36a46e806', + 'version': '0.1.0', + 'name': "Pipeline Test Primitive", + 'python_path': 'd3m.primitives.test.PipelineTestPrimitive', + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.CROSS_ENTROPY, + ], + 'primitive_family': metadata_base.PrimitiveFamily.LOSS_FUNCTION, + }) + + def produce(self, *, inputs: PipelineTestInputs, extra_data: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + pass + + def produce_score(self, *, inputs: PipelineTestInputs, offset: float, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + pass + + def multi_produce(self, *, produce_methods: typing.Sequence[str], inputs: PipelineTestInputs, extra_data: Inputs, offset: float, timeout: float = None, iterations: int = None) -> base.MultiCallResult: + return self._multi_produce(produce_methods=produce_methods, timeout=timeout, iterations=iterations, inputs=inputs, extra_data=extra_data, offset=offset) + + def fit_multi_produce(self, *, produce_methods: typing.Sequence[str], inputs: PipelineTestInputs, extra_data: Inputs, offset: float, timeout: float = None, iterations: int = None) -> base.MultiCallResult: + return self._fit_multi_produce(produce_methods=produce_methods, timeout=timeout, iterations=iterations, inputs=inputs, extra_data=extra_data, offset=offset) + + class SimplePipelineTestPrimitive(transformer.TransformerPrimitiveBase[PipelineTestInputs, Outputs, PipelineTestHyperparams]): + metadata = metadata_base.PrimitiveMetadata({ + 'id': '02d966d6-4e4f-465b-ad93-83b14c7c47be', + 'version': '0.1.0', + 'name': "Simple Pipeline Test Primitive", + 'python_path': 'd3m.primitives.test.SimplePipelineTestPrimitive', + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.CROSS_ENTROPY, + ], + 'primitive_family': metadata_base.PrimitiveFamily.LOSS_FUNCTION, + }) + + def produce(self, *, inputs: PipelineTestInputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + pass + + ColumnsInputs = container.Dataset + ColumnsOutputs = container.List + + # Silence any validation warnings. + with utils.silence(): + class ColumnSelectionPrimitive(transformer.TransformerPrimitiveBase[ColumnsInputs, ColumnsOutputs, Hyperparams]): + metadata = metadata_base.PrimitiveMetadata({ + 'id': 'fdabb0c2-0555-4188-8f08-eeda722e1f04', + 'version': '0.1.0', + 'name': "Column Selection Primitive", + 'python_path': 'd3m.primitives.test.ColumnSelectionPrimitive', + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.PRINCIPAL_COMPONENT_ANALYSIS, + ], + 'primitive_family': metadata_base.PrimitiveFamily.FEATURE_SELECTION, + }) + + def produce(self, *, inputs: ColumnsInputs, timeout: float = None, iterations: int = None) -> base.CallResult[ColumnsOutputs]: + pass + + # To hide any logging or stdout output. 
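+        # Register the test primitives under their python paths so that the resolver
+        # can find them when loading the test pipelines.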
+ with utils.silence(): + index.register_primitive('d3m.primitives.regression.monomial.Test', MonomialPrimitive) + index.register_primitive('d3m.primitives.data_generation.random.Test', RandomPrimitive) + index.register_primitive('d3m.primitives.operator.sum.Test', SumPrimitive) + index.register_primitive('d3m.primitives.operator.increment.Test', IncrementPrimitive) + index.register_primitive('d3m.primitives.test.LossPrimitive', LossPrimitive) + index.register_primitive('d3m.primitives.test.Model1Primitive', Model1Primitive) + index.register_primitive('d3m.primitives.test.Model2Primitive', Model2Primitive) + index.register_primitive('d3m.primitives.test.PipelineTestPrimitive', PipelineTestPrimitive) + index.register_primitive('d3m.primitives.test.SimplePipelineTestPrimitive', SimplePipelineTestPrimitive) + index.register_primitive('d3m.primitives.test.ColumnSelectionPrimitive', ColumnSelectionPrimitive) + + def test_basic(self): + self.maxDiff = None + + p = pipeline.Pipeline.from_json(TEST_PIPELINE_2, resolver=Resolver()) + + p_json_input = json.loads(TEST_PIPELINE_2) + p_json_output = p.to_json_structure() + + self.assertEqual(p_json_input, p_json_output) + + p.check(standard_pipeline=False) + + p = pipeline.Pipeline.from_json(TEST_PIPELINE_1, resolver=Resolver()) + + p.check(allow_placeholders=True, input_types={'inputs.0': container.DataFrame, 'inputs.1': container.DataFrame, 'inputs.2': container.DataFrame}) + + with self.assertRaisesRegex(exceptions.InvalidPipelineError, 'Step .* of pipeline \'.*\' is a placeholder but there should be no placeholders'): + p.check(allow_placeholders=False, input_types={'inputs.0': container.DataFrame, 'inputs.1': container.DataFrame, 'inputs.2': container.DataFrame}) + + p_json_input = json.loads(TEST_PIPELINE_1) + p_json_output = p.to_json_structure() + + p_json_input.pop('digest', None) + p_json_output.pop('digest', None) + self.assertEqual(p_json_input, p_json_output) + + p_from_json = pipeline.Pipeline.from_json(p.to_json(), resolver=Resolver()).to_json_structure() + p_from_json.pop('digest', None) + self.assertEqual(p_json_input, p_from_json) + + p_from_yaml = pipeline.Pipeline.from_yaml(p.to_yaml(), resolver=Resolver()).to_json_structure() + p_from_yaml.pop('digest', None) + self.assertEqual(p_json_input, p_from_yaml) + + self.assertEqual(p.get_producing_outputs(), {'outputs.0', 'steps.0.produce', 'steps.1.produce', 'steps.2.produce', 'steps.3.produce', 'steps.4.produce', 'steps.4.produce_score', 'steps.5.outputs.0', 'steps.5.pipeline_output', 'steps.5.steps.0.produce', 'steps.6.main'}) + + def test_non_strict_resolving(self): + test_pipeline = json.loads(TEST_PIPELINE_1) + + full_primitive_description = copy.deepcopy(test_pipeline['steps'][3]['primitive']) + full_pipeline_description = copy.deepcopy(test_pipeline['steps'][5]['pipeline']) + + test_pipeline['steps'][3]['primitive']['version'] = '0.0.1' + test_pipeline['steps'][3]['primitive']['name'] = 'Something Else' + del test_pipeline['steps'][3]['primitive']['digest'] + del test_pipeline['steps'][5]['pipeline']['digest'] + test_pipeline['digest'] = utils.compute_digest(test_pipeline) + + logger = logging.getLogger('d3m.metadata.pipeline') + + with self.assertLogs(logger=logger, level=logging.DEBUG) as cm: + p = pipeline.Pipeline.from_json(json.dumps(test_pipeline), resolver=Resolver()) + + self.assertEqual(len(cm.records), 2) + self.assertEqual(cm.records[0].msg, "Version for primitive '%(primitive_id)s' does not match the one specified in the primitive description. 
Primitive description version: '%(primitive_version)s'. Resolved primitive version: '%(resolved_primitive_version)s'.") + self.assertEqual(cm.records[1].msg, "Name for primitive '%(primitive_id)s' does not match the one specified in the primitive description. Primitive description name: '%(primitive_name)s'. Resolved primitive name: '%(resolved_primitive_name)s'.") + + # After loading, primitive and pipeline information should be updated and fully populated. + self.assertEqual(p.to_json_structure()['steps'][3]['primitive'], full_primitive_description) + self.assertEqual(p.to_json_structure()['steps'][5]['pipeline'], full_pipeline_description) + + def test_nested_to_json_structure(self): + p = pipeline.Pipeline.from_json(TEST_PIPELINE_1, resolver=Resolver()) + + self.assertEqual(p.to_json_structure()['steps'][5]['pipeline'], { + 'id': '0113b91f-3010-4a47-bd56-a50c4e28a4a4', + 'digest': '83430addfcb9430ad02fd59f114ac7c723806058ca90d6b0f226d1031826ac8d', + }) + + p2 = pipeline.Pipeline.from_json_structure(p.to_json_structure(), resolver=Resolver()) + + self.assertEqual(p.to_json_structure(nest_subpipelines=True), p2.to_json_structure(nest_subpipelines=True)) + + self.assertEqual(p.to_json_structure(nest_subpipelines=True)['steps'][5]['pipeline'], json.loads(TEST_PIPELINE_2)) + + class TestResolver(Resolver): + def _from_file(self, pipeline_description): + raise AssertionError("Should not be called.") + + p2 = pipeline.Pipeline.from_json_structure(p.to_json_structure(nest_subpipelines=True), resolver=TestResolver()) + + self.assertEqual(p.to_json_structure(nest_subpipelines=True), p2.to_json_structure(nest_subpipelines=True)) + + def test_primitive_annotation(self): + # This test does not really belong here but it is easiest to make it here. + # Test that hyper-parameter can have a primitive instance as a default value + # and that such primitive can have its metadata converted to JSON. + + self.assertEqual(index.get_primitive('d3m.primitives.test.Model2Primitive').metadata.to_json_structure()['primitive_code']['hyperparams']['base_estimator'], { + 'type': 'd3m.metadata.hyperparams.Hyperparameter', + 'default': 'd3m.primitives.test.LossPrimitive(hyperparams=Hyperparams({}), random_seed=0)', + 'structural_type': 'd3m.primitive_interfaces.base.PrimitiveBase', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + }) + + @unittest.skipUnless(sys.version_info >= (3, 7), "Pickling of generic types does not work before Python 3.7.") + def test_primitive_annotation_python37(self): + # This test does not really belong here but it is easiest to make it here. + # Test that hyper-parameter can have a primitive instance as a default value + # and that such primitive can have its metadata converted to JSON. 
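+        # Unlike test_primitive_annotation above, this uses to_internal_json_structure,
+        # which relies on pickling of generic types and thus requires Python 3.7+.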
+ + self.assertEqual(index.get_primitive('d3m.primitives.test.Model2Primitive').metadata.to_internal_json_structure()['primitive_code']['hyperparams']['base_estimator'], { + 'type': 'd3m.metadata.hyperparams.Hyperparameter', + 'default': 'd3m.primitives.test.LossPrimitive(hyperparams=Hyperparams({}), random_seed=0)', + 'structural_type': 'd3m.primitive_interfaces.base.PrimitiveBase', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + }) + + def test_pipeline_digest_mismatch(self): + logger = logging.getLogger('d3m.metadata.pipeline') + + with self.assertLogs(logger=logger, level=logging.DEBUG) as cm: + pipeline.Pipeline.from_json(""" + { + "id": "c12a8de1-d4d7-4d4b-b51f-66488e1adcc6", + "digest": "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855", + "schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", + "created": "2018-02-28T09:42:27.443844Z", + "name": "Test pipeline", + "description": "Just a test pipeline", + "inputs": [ + {} + ], + "outputs": [ + { + "data": "steps.0.produce" + } + ], + "steps": [ + { + "type": "PRIMITIVE", + "primitive": { + "id": "9c00d42d-382d-4177-a0e7-082da88a29c8", + "version": "0.1.0", + "python_path": "d3m.primitives.operator.sum.Test", + "name": "Sum Values", + "digest": "__SUM_DIGEST__" + }, + "arguments": { + "inputs": { + "type": "CONTAINER", + "data": "inputs.0" + } + }, + "outputs": [ + { + "id": "produce" + } + ] + } + ] + } + """.replace('__SUM_DIGEST__', SumPrimitive.metadata.query()['digest']), resolver=Resolver()) + + self.assertEqual(len(cm.records), 1) + self.assertEqual(cm.records[0].msg, "Digest for pipeline '%(pipeline_id)s' does not match a computed one. Provided digest: %(pipeline_digest)s. Computed digest: %(new_pipeline_digest)s.") + + def test_digest_mismatch(self): + test_pipeline = json.loads(TEST_PIPELINE_1) + + full_primitive_description = copy.deepcopy(test_pipeline['steps'][3]['primitive']) + full_pipeline_description = copy.deepcopy(test_pipeline['steps'][5]['pipeline']) + + test_pipeline['steps'][3]['primitive']['digest'] = 'e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855' + test_pipeline['steps'][5]['pipeline']['digest'] = 'e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855' + + logger = logging.getLogger('d3m.metadata.pipeline') + + with self.assertLogs(logger=logger, level=logging.DEBUG) as cm: + p = pipeline.Pipeline.from_json(json.dumps(test_pipeline), resolver=Resolver()) + + self.assertEqual(len(cm.records), 2) + self.assertEqual(cm.records[0].msg, "Digest for pipeline '%(pipeline_id)s' does not match a computed one. Provided digest: %(pipeline_digest)s. Computed digest: %(new_pipeline_digest)s.") + self.assertEqual(cm.records[1].msg, "Digest for primitive '%(primitive_id)s' does not match the one specified in the primitive description. Primitive description digest: %(primitive_digest)s. Resolved primitive digest: %(resolved_primitive_digest)s.") + + # After loading, primitive and pipeline information should be updated and fully populated. 
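+        # In particular, the zeroed-out digests set above should be replaced by the
+        # digests of the resolved primitive and sub-pipeline.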
+ self.assertEqual(p.to_json_structure()['steps'][3]['primitive'], full_primitive_description) + self.assertEqual(p.to_json_structure()['steps'][5]['pipeline'], full_pipeline_description) + + def test_invalid_data_reference(self): + with self.assertRaisesRegex(exceptions.InvalidArgumentValueError, 'Cannot add step .*'): + pipeline.Pipeline.from_json(""" + { + "id": "c12a8de1-d4d7-4d4b-b51f-66488e1adcc6", + "schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", + "created": "2018-02-28T09:42:27.443844Z", + "name": "Test pipeline", + "description": "Just a test pipeline", + "inputs": [ + {} + ], + "outputs": [ + { + "data": "steps.0.produce" + } + ], + "steps": [ + { + "type": "PRIMITIVE", + "primitive": { + "id": "9c00d42d-382d-4177-a0e7-082da88a29c8", + "version": "0.1.0", + "python_path": "d3m.primitives.operator.sum.Test", + "name": "Sum Values", + "digest": "__SUM_DIGEST__" + }, + "arguments": { + "inputs": { + "type": "CONTAINER", + "data": "inputs.1" + } + }, + "outputs": [ + { + "id": "produce" + } + ] + } + ] + } + """.replace('__SUM_DIGEST__', SumPrimitive.metadata.query()['digest']), resolver=Resolver()) + + def test_invalid_data_reference_in_argument_list(self): + with self.assertRaisesRegex(exceptions.InvalidArgumentValueError, 'Cannot add step .*'): + pipeline.Pipeline.from_json(""" + { + "id": "c12a8de1-d4d7-4d4b-b51f-66488e1adcc6", + "schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", + "created": "2018-02-28T09:42:27.443844Z", + "name": "Test pipeline", + "description": "Just a test pipeline", + "inputs": [ + {} + ], + "outputs": [ + { + "data": "steps.0.produce" + } + ], + "steps": [ + { + "type": "PRIMITIVE", + "primitive": { + "id": "9c00d42d-382d-4177-a0e7-082da88a29c8", + "version": "0.1.0", + "python_path": "d3m.primitives.operator.sum.Test", + "name": "Sum Values", + "digest": "__SUM_DIGEST__" + }, + "arguments": { + "inputs": { + "type": "CONTAINER", + "data": [ + "inputs.1" + ] + } + }, + "outputs": [ + { + "id": "produce" + } + ] + } + ] + } + """.replace('__SUM_DIGEST__', SumPrimitive.metadata.query()['digest']), resolver=Resolver()) + + def test_invalid_argument_list_type_check(self): + with self.assertRaisesRegex(exceptions.InvalidPipelineError, 'should have type \'List\' to support getting a list of values'): + pipeline.Pipeline.from_json(""" + { + "id": "c12a8de1-d4d7-4d4b-b51f-66488e1adcc6", + "schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", + "created": "2018-02-28T09:42:27.443844Z", + "name": "Test pipeline", + "description": "Just a test pipeline", + "inputs": [ + {} + ], + "outputs": [ + { + "data": "steps.0.produce" + } + ], + "steps": [ + { + "type": "PRIMITIVE", + "primitive": { + "id": "9c00d42d-382d-4177-a0e7-082da88a29c8", + "version": "0.1.0", + "python_path": "d3m.primitives.operator.sum.Test", + "name": "Sum Values", + "digest": "__SUM_DIGEST__" + }, + "arguments": { + "inputs": { + "type": "CONTAINER", + "data": [ + "inputs.0" + ] + } + }, + "outputs": [ + { + "id": "produce" + } + ] + } + ] + } + """.replace('__SUM_DIGEST__', SumPrimitive.metadata.query()['digest']), resolver=Resolver()).check() + + def test_list_of_columns(self): + pipeline.Pipeline.from_json(""" + { + "id": "48fa0619-53f2-4a36-8a90-31ba8e08df02", + "schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", + "created": "2018-02-28T09:42:27.443844Z", + "name": "Test pipeline", + "description": "Just a test pipeline", + "inputs": [ + {} + ], + "outputs": [ + { + "data": 
"steps.1.produce" + } + ], + "steps": [ + { + "type": "PRIMITIVE", + "primitive": { + "id": "fdabb0c2-0555-4188-8f08-eeda722e1f04", + "version": "0.1.0", + "python_path": "d3m.primitives.test.ColumnSelectionPrimitive", + "name": "Column Selection Primitive" + }, + "arguments": { + "inputs": { + "type": "CONTAINER", + "data": "inputs.0" + } + }, + "outputs": [ + { + "id": "produce" + } + ] + }, + { + "type": "PRIMITIVE", + "primitive": { + "id": "02d966d6-4e4f-465b-ad93-83b14c7c47be", + "version": "0.1.0", + "python_path": "d3m.primitives.test.SimplePipelineTestPrimitive", + "name": "Simple Pipeline Test Primitive" + }, + "arguments": { + "inputs": { + "type": "CONTAINER", + "data": "inputs.0" + } + }, + "outputs": [ + { + "id": "produce" + } + ], + "hyperparams": { + "columns_to_operate_on": { + "type": "CONTAINER", + "data": "steps.0.produce" + } + } + } + ] + } + """, resolver=Resolver()).check() + + def test_type_check(self): + with self.assertRaisesRegex(exceptions.InvalidPipelineError, 'Argument \'.*\' of step .* of pipeline \'.*\' has type \'.*\', but it is getting a type \'.*\''): + pipeline.Pipeline.from_json(""" + { + "id": "e8c4dd86-420d-4e1c-ad25-d592a5b5bb0b", + "schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", + "created": "2018-02-28T09:42:27.443844Z", + "name": "Test pipeline", + "description": "Just a test pipeline", + "inputs": [ + {} + ], + "outputs": [ + { + "data": "steps.0.produce" + } + ], + "steps": [ + { + "type": "PRIMITIVE", + "primitive": { + "id": "9c00d42d-382d-4177-a0e7-082da88a29c8", + "version": "0.1.0", + "python_path": "d3m.primitives.operator.sum.Test", + "name": "Sum Values", + "digest": "__SUM_DIGEST__" + }, + "arguments": { + "inputs": { + "type": "CONTAINER", + "data": "inputs.0" + } + }, + "outputs": [ + { + "id": "produce" + } + ] + } + ] + } + """.replace('__SUM_DIGEST__', SumPrimitive.metadata.query()['digest']), resolver=Resolver()).check() + + def _get_mock_primitive(self, primitive_id, use_set_hyperparamaters=False): + class _defaultdict(collections.defaultdict): + def get(self, k, default=None): + return self[k] + + def __contains__(self, item): + return True + + def _get_special_hyperparam(): + if use_set_hyperparamaters: + h = hyperparams.Set(hyperparams.Hyperparameter[object](None), ()) + else: + h = hyperparams.Hyperparameter[object](None) + h.value_from_json_structure = lambda x: x + return h + + class MockMetadata: + def get_hyperparams(self): + return self.query()['primitive_code']['class_type_arguments']['Hyperparams'] + + def query(self): + hparams = hyperparams.Hyperparams() + hparams.configuration = _defaultdict(_get_special_hyperparam) + arguments = _defaultdict(lambda: {'kind': metadata_base.PrimitiveArgumentKind.PIPELINE}) + produces = _defaultdict(lambda: {'kind': metadata_base.PrimitiveMethodKind.PRODUCE}) + return { + 'id': primitive_id, + 'primitive_code': { + 'class_type_arguments': { + 'Hyperparams': hparams + }, + 'arguments': arguments, + 'instance_methods': produces + } + } + + class MockPrimitve: + def __init__(self): + self.metadata = MockMetadata() + + return MockPrimitve() + + def _quick_build_pipeline_with_real_primitives(self, primitive_dicts: typing.List[dict]) -> pipeline.Pipeline: + pipe = pipeline.Pipeline() + pipe.add_input('inputs') + + for primitive_dict in primitive_dicts: + step = pipeline.PrimitiveStep(primitive=primitive_dict['primitive']) + + for name, data_ref in primitive_dict.get('container_args', {}).items(): + step.add_argument(name, 'CONTAINER', data_ref) + for name, 
data in primitive_dict.get('value_args', {}).items(): + step.add_argument(name, 'VALUE', data) + + for name, data_ref in primitive_dict.get('container_hyperparams', {}).items(): + step.add_hyperparameter(name, 'CONTAINER', data_ref) + for name, data in primitive_dict.get('value_hyperparams', {}).items(): + step.add_hyperparameter(name, 'VALUE', data) + + step.add_output('produce') + pipe.add_step(step) + + pipe.add_output(name='Output', data_reference=f'steps.{len(primitive_dicts)-1}.produce') + return pipe + + def test_pipeline_isomorphism_check(self): + primitive_1 = MockPrimitiveBuilder({ + 'dataset': {'type': 'CONTAINER'}, + 'mean': {'type': 'CONTAINER'}, + }, {}) + primitive_2 = MockPrimitiveBuilder({ + 'a': {'type': 'CONTAINER'}, + 'b': {'type': 'CONTAINER'}, + }, {}) + + # With hyperparameters + primitive_h1 = MockPrimitiveBuilder({ + 'dataset': {'type': 'CONTAINER'}, + 'mean': {'type': 'CONTAINER'}, + }, { + 'index': {'type': 'VALUE'}, + 'masks': {'type': 'DATA'}, + 'mat': {'type': 'CONTAINER'}, + }) + + primitive_h2 = MockPrimitiveBuilder({ + 'a': {'type': 'CONTAINER'}, + 'b': {'type': 'CONTAINER'}, + }, { + 'v': {'type': 'VALUE'}, + 'd': {'type': 'DATA'}, + 'p': {'type': 'PRIMITIVE'}, + 'c': {'type': 'CONTAINER'}, + }) + + builder = MockPipelineBuilder(['input_0', 'input_1', 'input_2']) + builder.add_primitive(primitive_1.build(dataset='inputs.0', mean='inputs.2'), outputs=['produce']) + builder.add_primitive(primitive_1.build(dataset='inputs.1', mean='inputs.2'), outputs=['produce']) + builder.add_primitive(primitive_2.build(a='steps.0.produce', b='steps.1.produce'), outputs=['produce']) + builder.add_primitive(primitive_2.build(a='steps.1.produce', b='steps.0.produce'), outputs=['produce']) + builder.add_primitive(primitive_2.build(a='steps.2.produce', b='steps.3.produce'), outputs=['produce']) + builder.add_output('output', 'steps.4.produce') + pipeline_1 = builder.build() + + # [Structure invariance test] Another mirrored pipeline + builder = MockPipelineBuilder(['input_0', 'input_1', 'input_2']) + builder.add_primitive(primitive_1.build(dataset='inputs.1', mean='inputs.2'), outputs=['produce']) + builder.add_primitive(primitive_1.build(dataset='inputs.0', mean='inputs.2'), outputs=['produce']) + builder.add_primitive(primitive_2.build(a='steps.1.produce', b='steps.0.produce'), outputs=['produce']) + builder.add_primitive(primitive_2.build(a='steps.0.produce', b='steps.1.produce'), outputs=['produce']) + builder.add_primitive(primitive_2.build(a='steps.2.produce', b='steps.3.produce'), outputs=['produce']) + builder.add_output('output', 'steps.4.produce') + pipeline_2 = builder.build() + + # [Primitive output names invariance test] Another pipeline with different output names + builder = MockPipelineBuilder(['input_0', 'input_1', 'input_2']) + builder.add_primitive(primitive_1.build(dataset='inputs.1', mean='inputs.2'), outputs=['world']) + builder.add_primitive(primitive_1.build(dataset='inputs.0', mean='inputs.2'), outputs=['gas']) + builder.add_primitive(primitive_2.build(a='steps.1.gas', b='steps.0.world'), outputs=['land']) + builder.add_primitive(primitive_2.build(a='steps.0.world', b='steps.1.gas'), outputs=['produce']) + builder.add_primitive(primitive_2.build(a='steps.2.land', b='steps.3.produce'), outputs=['moon']) + builder.add_output('planet', 'steps.4.moon') + pipeline_3 = builder.build() + + # [Pipeline input names invariance test] Another pipeline with different input names + builder = MockPipelineBuilder(['cake', 'bread', 'ham']) + 
builder.add_primitive(primitive_1.build(dataset='inputs.1', mean='inputs.2'), outputs=['world']) + builder.add_primitive(primitive_1.build(dataset='inputs.0', mean='inputs.2'), outputs=['gas']) + builder.add_primitive(primitive_2.build(a='steps.1.gas', b='steps.0.world'), outputs=['land']) + builder.add_primitive(primitive_2.build(a='steps.0.world', b='steps.1.gas'), outputs=['produce']) + builder.add_primitive(primitive_2.build(a='steps.2.land', b='steps.3.produce'), outputs=['moon']) + builder.add_output('planet', 'steps.4.moon') + pipeline_4 = builder.build() + + self.assertTrue(pipeline_1.hash() == pipeline_2.hash() == pipeline_3.hash() == pipeline_4.hash()) + # Strict order check. + self.assertFalse(pipeline_1.equals(pipeline_2, strict_order=True)) # Differ in steps order. + self.assertTrue(pipeline_2.equals(pipeline_3, strict_order=True)) # Only differ in names. + + # Different pipelines + builder = MockPipelineBuilder(['input_0', 'input_1', 'input_2']) + builder.add_primitive(primitive_1.build(dataset='inputs.0', mean='inputs.2'), outputs=['produce']) + builder.add_primitive(primitive_1.build(dataset='inputs.1', mean='inputs.2'), outputs=['produce']) + builder.add_primitive(primitive_2.build(a='steps.1.produce', b='steps.0.produce'), outputs=['produce']) + builder.add_primitive(primitive_2.build(a='steps.0.produce', b='steps.1.produce'), outputs=['produce']) + builder.add_primitive(primitive_2.build(a='steps.2.produce', b='steps.3.produce'), outputs=['produce']) + builder.add_output('output', 'steps.4.produce') + pipeline_5 = builder.build() + + builder = MockPipelineBuilder(['input_1', 'input_0', 'input_2']) + builder.add_primitive(primitive_1.build(dataset='inputs.1', mean='inputs.2'), outputs=['produce']) + builder.add_primitive(primitive_1.build(dataset='inputs.0', mean='inputs.2'), outputs=['produce']) + builder.add_primitive(primitive_2.build(a='steps.0.produce', b='steps.1.produce'), outputs=['produce']) + builder.add_primitive(primitive_2.build(a='steps.1.produce', b='steps.0.produce'), outputs=['produce']) + builder.add_primitive(primitive_2.build(a='steps.2.produce', b='steps.3.produce'), outputs=['produce']) + builder.add_output('output', 'steps.4.produce') + pipeline_inputs_order_matters = builder.build() + + builder = MockPipelineBuilder(['input_0', 'input_1', 'input_2']) + builder.add_primitive(primitive_1.build(dataset='inputs.0', mean='inputs.2'), outputs=['produce']) + builder.add_primitive(primitive_1.build(dataset='inputs.1', mean='inputs.2'), outputs=['produce']) + builder.add_primitive(primitive_2.build(a='steps.0.produce', b='steps.1.produce'), outputs=['produce']) + builder.add_primitive(primitive_2.build(a='steps.1.produce', b='steps.0.produce'), outputs=['produce']) + builder.add_primitive(primitive_2.build(a='steps.2.produce', b='steps.3.produce'), outputs=['produce']) + builder.add_output('output_1', 'steps.4.produce') + builder.add_output('output_2', 'steps.3.produce') + pipeline_output_order_matters_1 = builder.build() + + builder = MockPipelineBuilder(['input_0', 'input_1', 'input_2']) + builder.add_primitive(primitive_1.build(dataset='inputs.0', mean='inputs.2'), outputs=['produce']) + builder.add_primitive(primitive_1.build(dataset='inputs.1', mean='inputs.2'), outputs=['produce']) + builder.add_primitive(primitive_2.build(a='steps.0.produce', b='steps.1.produce'), outputs=['produce']) + builder.add_primitive(primitive_2.build(a='steps.1.produce', b='steps.0.produce'), outputs=['produce']) + builder.add_primitive(primitive_2.build(a='steps.2.produce', 
b='steps.3.produce'), outputs=['produce']) + builder.add_output('output_2', 'steps.3.produce') + builder.add_output('output_1', 'steps.4.produce') + pipeline_output_order_matters_2 = builder.build() + + self.assertFalse(pipeline_5.equals(pipeline_1)) + self.assertFalse(pipeline_inputs_order_matters.equals(pipeline_1)) + self.assertFalse(pipeline_output_order_matters_1.equals(pipeline_output_order_matters_2)) + + # [Harder structure invariance test that assumes the immutable property of primitives] A extreme test case + builder = MockPipelineBuilder(['input_0']) + builder.add_primitive(primitive_1.build(dataset='inputs.0', mean='inputs.0'), outputs=['produce']) + builder.add_primitive(primitive_1.build(dataset='inputs.0', mean='inputs.0'), outputs=['produce']) + builder.add_primitive(primitive_2.build(a='steps.0.produce', b='steps.1.produce'), outputs=['produce']) + builder.add_primitive(primitive_2.build(a='steps.1.produce', b='steps.0.produce'), outputs=['produce']) + builder.add_primitive(primitive_2.build(a='steps.2.produce', b='steps.3.produce'), outputs=['produce']) + builder.add_output('output', 'steps.4.produce') + pipeline_6 = builder.build() + + # pipeline_7 should be same with pipeline_6 because step.0 & step.1 are indistinguishable. + builder = MockPipelineBuilder(['input_0']) + builder.add_primitive(primitive_1.build(dataset='inputs.0', mean='inputs.0'), outputs=['produce']) + builder.add_primitive(primitive_1.build(dataset='inputs.0', mean='inputs.0'), outputs=['produce']) + builder.add_primitive(primitive_2.build(a='steps.1.produce', b='steps.0.produce'), outputs=['produce']) + builder.add_primitive(primitive_2.build(a='steps.0.produce', b='steps.1.produce'), outputs=['produce']) + builder.add_primitive(primitive_2.build(a='steps.2.produce', b='steps.3.produce'), outputs=['produce']) + builder.add_output('output', 'steps.4.produce') + pipeline_7 = builder.build() + + self.assertTrue(pipeline_6.equals(pipeline_7)) + self.assertEqual(pipeline_6.hash(), pipeline_7.hash()) + + # A pipeline with placeholders + builder = MockPipelineBuilder(['input_0', 'input_1', 'input_2']) + builder.add_primitive(primitive_1.build(dataset='inputs.0', mean='inputs.2'), outputs=['produce']) + builder.add_primitive(primitive_1.build(dataset='inputs.1', mean='inputs.2'), outputs=['produce']) + builder.add_primitive(primitive_2.build(a='steps.0.produce', b='steps.1.produce'), outputs=['produce']) + builder.add_placeholder(['steps.1.produce', 'steps.0.produce'], outputs=['produce']) + builder.add_primitive(primitive_2.build(a='steps.2.produce', b='steps.3.produce'), outputs=['produce']) + builder.add_output('output', 'steps.4.produce') + pipeline_placeholder_1 = builder.build() + + # [Placeholder test] Another pipeline with placeholders + builder = MockPipelineBuilder(['input_0', 'input_1', 'input_2']) + builder.add_primitive(primitive_1.build(dataset='inputs.1', mean='inputs.2'), outputs=['world']) + builder.add_primitive(primitive_1.build(dataset='inputs.0', mean='inputs.2'), outputs=['gas']) + builder.add_primitive(primitive_2.build(a='steps.1.gas', b='steps.0.world'), outputs=['land']) + builder.add_placeholder(['steps.0.world', 'steps.1.gas'], outputs=['produce']) + builder.add_primitive(primitive_2.build(a='steps.2.land', b='steps.3.produce'), outputs=['moon']) + builder.add_output('planet', 'steps.4.moon') + pipeline_placeholder_2 = builder.build() + + self.assertTrue(pipeline_placeholder_1.equals(pipeline_placeholder_2)) + + # [Subgraph expanding test] A pipeline with subpipelines + builder = 
MockPipelineBuilder(['steps_0_world', 'steps_1_gas']) + builder.add_primitive(primitive_2.build(a='inputs.1', b='inputs.0'), outputs=['land']) + builder.add_placeholder(['inputs.0', 'inputs.1'], outputs=['produce']) + builder.add_primitive(primitive_2.build(a='steps.0.land', b='steps.1.produce'), outputs=['sun']) + builder.add_output('blaze', 'steps.2.sun') + subpipeline_1 = builder.build() + builder = MockPipelineBuilder(['input_0', 'input_1', 'input_2']) + builder.add_primitive(primitive_1.build(dataset='inputs.1', mean='inputs.2'), outputs=['world']) + builder.add_primitive(primitive_1.build(dataset='inputs.0', mean='inputs.2'), outputs=['gas']) + builder.add_subpipeline(subpipeline_1, ['steps.0.world', 'steps.1.gas'], outputs=['moon']) + builder.add_output('planet', 'steps.2.moon') + pipeline_subpipeline_1 = builder.build() + + self.assertTrue(pipeline_placeholder_1.equals(pipeline_subpipeline_1)) + self.assertEqual(pipeline_placeholder_1.hash(), pipeline_subpipeline_1.hash()) + + # Pipeline with hyperparameter test + builder = MockPipelineBuilder(['input_0', 'input_1', 'input_2']) + builder.add_primitive(primitive_h1.build(dataset='inputs.0', mean='inputs.2', index=0, masks=['inputs.0'], mat='inputs.1'), outputs=['produce']) + builder.add_primitive(primitive_h1.build(dataset='inputs.1', mean='inputs.2', index=1, masks=['inputs.1'], mat='inputs.2'), outputs=['produce']) + builder.add_primitive(primitive_h2.build(a='steps.0.produce', b='steps.1.produce', v='aa', p=0, d=['steps.0.produce'], c='steps.1.produce'), outputs=['produce']) + builder.add_primitive(primitive_h2.build(a='steps.1.produce', b='steps.0.produce', v='bb', p=1, d=['steps.1.produce'], c='steps.0.produce'), outputs=['produce']) + builder.add_primitive(primitive_h2.build(a='steps.2.produce', b='steps.3.produce', v='cc', p=3, d=['steps.2.produce'], c='steps.3.produce'), outputs=['produce']) + builder.add_output('output', 'steps.4.produce') + pipeline_hyperparams_1 = builder.build() + + builder = MockPipelineBuilder(['input_0', 'input_1', 'input_2']) + builder.add_primitive(primitive_h1.build(dataset='inputs.1', mean='inputs.2', index=1, masks=['inputs.1'], mat='inputs.2'), outputs=['produce']) + builder.add_primitive(primitive_h1.build(dataset='inputs.0', mean='inputs.2', index=0, masks=['inputs.0'], mat='inputs.1'), outputs=['produce']) + builder.add_primitive(primitive_h2.build(a='steps.1.produce', b='steps.0.produce', v='aa', p=1, d=['steps.1.produce'], c='steps.0.produce'), outputs=['produce']) + builder.add_primitive(primitive_h2.build(a='steps.0.produce', b='steps.1.produce', v='bb', p=0, d=['steps.0.produce'], c='steps.1.produce'), outputs=['produce']) + builder.add_primitive(primitive_h2.build(a='steps.2.produce', b='steps.3.produce', v='cc', p=3, d=['steps.2.produce'], c='steps.3.produce'), outputs=['produce']) + builder.add_output('output', 'steps.4.produce') + pipeline_hyperparams_2 = builder.build() + + self.assertTrue(pipeline_hyperparams_1.equals(pipeline_hyperparams_2)) + self.assertEqual(pipeline_hyperparams_1.hash(), pipeline_hyperparams_2.hash()) + + # A different pipeline + builder = MockPipelineBuilder(['input_0', 'input_1', 'input_2']) + builder.add_primitive(primitive_h1.build(dataset='inputs.1', mean='inputs.2', index=1, masks=['inputs.1'], mat='inputs.2'), outputs=['produce']) + builder.add_primitive(primitive_h1.build(dataset='inputs.0', mean='inputs.2', index=0, masks=['inputs.0'], mat='inputs.1'), outputs=['produce']) + builder.add_primitive(primitive_h2.build(a='steps.1.produce', 
b='steps.0.produce', v='aa', p=0, d=['steps.1.produce'], c='steps.0.produce'), outputs=['produce']) + builder.add_primitive(primitive_h2.build(a='steps.0.produce', b='steps.1.produce', v='bb', p=1, d=['steps.0.produce'], c='steps.1.produce'), outputs=['produce']) + builder.add_primitive(primitive_h2.build(a='steps.2.produce', b='steps.3.produce', v='cc', p=3, d=['steps.2.produce'], c='steps.3.produce'), outputs=['produce']) + builder.add_output('output', 'steps.4.produce') + pipeline_hyperparams_3 = builder.build() + + self.assertFalse(pipeline_hyperparams_1.equals(pipeline_hyperparams_3)) + + # Primitives hyperparameter value encoding test. + builder = MockPipelineBuilder(['input_0', 'input_1', 'input_2']) + builder.add_primitive(primitive_h1.build(dataset='inputs.0', mean='inputs.2', index=0, masks=['inputs.0'], mat='inputs.1'), outputs=['produce']) + builder.add_primitive(primitive_h1.build(dataset='inputs.1', mean='inputs.2', index=1, masks=['inputs.1'], mat='inputs.2'), outputs=['produce']) + builder.add_primitive(primitive_h2.build(a='steps.0.produce', b='steps.1.produce', v=object(), p=0, d=['steps.0.produce'], c='steps.1.produce'), outputs=['produce']) + builder.add_primitive(primitive_h2.build(a='steps.1.produce', b='steps.0.produce', v='bb', p=1, d=['steps.1.produce'], c='steps.0.produce'), outputs=['produce']) + builder.add_primitive(primitive_h2.build(a='steps.2.produce', b='steps.3.produce', v='cc', p=3, d=['steps.2.produce'], c='steps.3.produce'), outputs=['produce']) + builder.add_output('output', 'steps.4.produce') + pipeline_hyperparams_encoding = builder.build() + target_step = pipeline_hyperparams_encoding.steps[2] + assert isinstance(target_step, pipeline.PrimitiveStep) + target_step.primitive = self._get_mock_primitive(target_step.primitive_description['id']) + target_step.primitive_description = None + repr_1 = pipeline.PipelineHasher(pipeline_hyperparams_1).unique_equivalence_class_repr() + hasher_2 = pipeline.PipelineHasher(pipeline_hyperparams_encoding) + hasher_2.graph.step_nodes[3]._serialize_hyperparamter_value = lambda *_: '"aa"' + self.assertEqual(repr_1, hasher_2.unique_equivalence_class_repr()) + + # Orders of sequential hyperparameters matter. 
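+        # Reordering list-valued 'p' (PRIMITIVE) or 'd' (DATA) hyper-parameter
+        # references must change pipeline identity.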
+ builder = MockPipelineBuilder(['input_0', 'input_1', 'input_2']) + builder.add_primitive(primitive_h1.build(dataset='inputs.0', mean='inputs.2', index=0, masks=['inputs.0'], mat='inputs.1'), outputs=['produce']) + builder.add_primitive(primitive_h1.build(dataset='inputs.1', mean='inputs.2', index=1, masks=['inputs.1'], mat='inputs.2'), outputs=['produce']) + builder.add_primitive(primitive_h2.build(a='steps.0.produce', b='steps.1.produce', v='aa', p=[0, 1], d=['steps.0.produce'], c='steps.1.produce'), outputs=['produce']) + builder.add_primitive(primitive_h2.build(a='steps.1.produce', b='steps.0.produce', v='bb', p=1, d=['steps.1.produce', 'steps.2.produce'], c='steps.0.produce'), outputs=['produce']) + builder.add_primitive(primitive_h2.build(a='steps.2.produce', b='steps.3.produce', v='cc', p=3, d=['steps.2.produce'], c='steps.3.produce'), outputs=['produce']) + builder.add_output('output', 'steps.4.produce') + pipeline_hyperparams_sequential_1 = builder.build() + + builder = MockPipelineBuilder(['input_0', 'input_1', 'input_2']) + builder.add_primitive(primitive_h1.build(dataset='inputs.0', mean='inputs.2', index=0, masks=['inputs.0'], mat='inputs.1'), outputs=['produce']) + builder.add_primitive(primitive_h1.build(dataset='inputs.1', mean='inputs.2', index=1, masks=['inputs.1'], mat='inputs.2'), outputs=['produce']) + builder.add_primitive(primitive_h2.build(a='steps.0.produce', b='steps.1.produce', v='aa', p=[0, 1], d=['steps.0.produce'], c='steps.1.produce'), outputs=['produce']) + builder.add_primitive(primitive_h2.build(a='steps.1.produce', b='steps.0.produce', v='bb', p=1, d=['steps.1.produce', 'steps.2.produce'], c='steps.0.produce'), outputs=['produce']) + builder.add_primitive(primitive_h2.build(a='steps.2.produce', b='steps.3.produce', v='cc', p=3, d=['steps.2.produce'], c='steps.3.produce'), outputs=['produce']) + builder.add_output('output', 'steps.4.produce') + pipeline_hyperparams_sequential_2 = builder.build() + + builder = MockPipelineBuilder(['input_0', 'input_1', 'input_2']) + builder.add_primitive(primitive_h1.build(dataset='inputs.0', mean='inputs.2', index=0, masks=['inputs.0'], mat='inputs.1'), outputs=['produce']) + builder.add_primitive(primitive_h1.build(dataset='inputs.1', mean='inputs.2', index=1, masks=['inputs.1'], mat='inputs.2'), outputs=['produce']) + builder.add_primitive(primitive_h2.build(a='steps.0.produce', b='steps.1.produce', v='aa', p=[1, 0], d=['steps.0.produce'], c='steps.1.produce'), outputs=['produce']) + builder.add_primitive(primitive_h2.build(a='steps.1.produce', b='steps.0.produce', v='bb', p=1, d=['steps.1.produce', 'steps.2.produce'], c='steps.0.produce'), outputs=['produce']) + builder.add_primitive(primitive_h2.build(a='steps.2.produce', b='steps.3.produce', v='cc', p=3, d=['steps.2.produce'], c='steps.3.produce'), outputs=['produce']) + builder.add_output('output', 'steps.4.produce') + pipeline_hyperparams_sequential_3 = builder.build() + + builder = MockPipelineBuilder(['input_0', 'input_1', 'input_2']) + builder.add_primitive(primitive_h1.build(dataset='inputs.0', mean='inputs.2', index=0, masks=['inputs.0'], mat='inputs.1'), outputs=['produce']) + builder.add_primitive(primitive_h1.build(dataset='inputs.1', mean='inputs.2', index=1, masks=['inputs.1'], mat='inputs.2'), outputs=['produce']) + builder.add_primitive(primitive_h2.build(a='steps.0.produce', b='steps.1.produce', v='aa', p=[0, 1], d=['steps.0.produce'], c='steps.1.produce'), outputs=['produce']) + builder.add_primitive(primitive_h2.build(a='steps.1.produce', 
b='steps.0.produce', v='bb', p=1, d=['steps.2.produce', 'steps.1.produce'], c='steps.0.produce'), outputs=['produce']) + builder.add_primitive(primitive_h2.build(a='steps.2.produce', b='steps.3.produce', v='cc', p=3, d=['steps.2.produce'], c='steps.3.produce'), outputs=['produce']) + builder.add_output('output', 'steps.4.produce') + pipeline_hyperparams_sequential_4 = builder.build() + + self.assertTrue(pipeline_hyperparams_sequential_1.equals(pipeline_hyperparams_sequential_2)) + self.assertFalse(pipeline_hyperparams_sequential_1.equals(pipeline_hyperparams_sequential_3)) + self.assertFalse(pipeline_hyperparams_sequential_1.equals(pipeline_hyperparams_sequential_4)) + + # `Set` hyperparameters test + builder = MockPipelineBuilder(['input_0', 'input_1', 'input_2']) + builder.add_primitive(primitive_h1.build(dataset='inputs.0', mean='inputs.2', index=[0], masks=['inputs.0'], mat='inputs.1'), outputs=['produce']) + builder.add_primitive(primitive_h1.build(dataset='inputs.1', mean='inputs.2', index=[1], masks=['inputs.1'], mat='inputs.2'), outputs=['produce']) + builder.add_primitive(primitive_h2.build(a='steps.0.produce', b='steps.1.produce', v=['aa'], p=[0], d=['steps.0.produce'], c='steps.1.produce'), outputs=['produce']) + builder.add_primitive(primitive_h2.build(a='steps.1.produce', b='steps.0.produce', v=['bb'], p=[1], d=['steps.1.produce'], c='steps.0.produce'), outputs=['produce']) + builder.add_primitive(primitive_h2.build(a='steps.2.produce', b='steps.3.produce', v=[{'cc': 1, 'dd': 2}, {'ee': 1, 'ff': 2}], p=[3, 2], d=['steps.2.produce'], c='steps.3.produce'), outputs=['produce']) + builder.add_output('output', 'steps.4.produce') + pipeline_set_hyperparams_1 = builder.build(primitive_loading=lambda primitive_description: self._get_mock_primitive(primitive_description['id'], True)) + + builder = MockPipelineBuilder(['input_0', 'input_1', 'input_2']) + builder.add_primitive(primitive_h1.build(dataset='inputs.0', mean='inputs.2', index=[0], masks=['inputs.0'], mat='inputs.1'), outputs=['produce']) + builder.add_primitive(primitive_h1.build(dataset='inputs.1', mean='inputs.2', index=[1], masks=['inputs.1'], mat='inputs.2'), outputs=['produce']) + builder.add_primitive(primitive_h2.build(a='steps.0.produce', b='steps.1.produce', v=['aa'], p=[0], d=['steps.0.produce'], c='steps.1.produce'), outputs=['produce']) + builder.add_primitive(primitive_h2.build(a='steps.1.produce', b='steps.0.produce', v=['bb'], p=[1], d=['steps.1.produce'], c='steps.0.produce'), outputs=['produce']) + builder.add_primitive(primitive_h2.build(a='steps.2.produce', b='steps.3.produce', v=[{'cc': 1, 'dd': 2}, {'ee': 1, 'ff': 2}], p=[2, 3], d=['steps.2.produce'], c='steps.3.produce'), outputs=['produce']) + builder.add_output('output', 'steps.4.produce') + pipeline_set_hyperparams_2 = builder.build(primitive_loading=lambda primitive_description: self._get_mock_primitive(primitive_description['id'], True)) + + builder = MockPipelineBuilder(['input_0', 'input_1', 'input_2']) + builder.add_primitive(primitive_h1.build(dataset='inputs.0', mean='inputs.2', index=[0], masks=['inputs.0'], mat='inputs.1'), outputs=['produce']) + builder.add_primitive(primitive_h1.build(dataset='inputs.1', mean='inputs.2', index=[1], masks=['inputs.1'], mat='inputs.2'), outputs=['produce']) + builder.add_primitive(primitive_h2.build(a='steps.0.produce', b='steps.1.produce', v=['aa'], p=[0], d=['steps.0.produce'], c='steps.1.produce'), outputs=['produce']) + builder.add_primitive(primitive_h2.build(a='steps.1.produce', b='steps.0.produce', 
v=['bb'], p=[1], d=['steps.1.produce'], c='steps.0.produce'), outputs=['produce']) + builder.add_primitive(primitive_h2.build(a='steps.2.produce', b='steps.3.produce', v=[{'ee': 1, 'ff': 2}, {'cc': 1, 'dd': 2}], p=[3, 2], d=['steps.2.produce'], c='steps.3.produce'), outputs=['produce']) + builder.add_output('output', 'steps.4.produce') + pipeline_set_hyperparams_3 = builder.build(primitive_loading=lambda primitive_description: self._get_mock_primitive(primitive_description['id'], True)) + + self.assertTrue(pipeline_set_hyperparams_1.equals(pipeline_set_hyperparams_2)) + self.assertFalse(pipeline_set_hyperparams_1.equals(pipeline_set_hyperparams_2, strict_order=True)) + + self.assertTrue(pipeline_set_hyperparams_1.equals(pipeline_set_hyperparams_3)) + self.assertFalse(pipeline_set_hyperparams_1.equals(pipeline_set_hyperparams_3, strict_order=True)) + + def test_pipeline_isomorphism_check_control_only(self): + # Pipelines with different tuning hyperparameters should still be equal with + # the only_control_hyperparams flag set. + pipeline_diff_tuningparams_a = self._quick_build_pipeline_with_real_primitives([ + { + 'primitive': DatasetToDataFramePrimitive, + 'container_args': {'inputs': 'inputs.0'}, + }, { + 'primitive': RandomForestClassifierPrimitive, + 'container_args': {'inputs': 'steps.0.produce', 'outputs': 'steps.0.produce'}, + 'value_hyperparams': {'n_estimators': 250} + } + ]) + pipeline_diff_tuningparams_b = self._quick_build_pipeline_with_real_primitives([ + { + 'primitive': DatasetToDataFramePrimitive, + 'container_args': {'inputs': 'inputs.0'}, + }, { + 'primitive': RandomForestClassifierPrimitive, + 'container_args': {'inputs': 'steps.0.produce', 'outputs': 'steps.0.produce'}, + 'value_hyperparams': {'n_estimators': 500} # different value + } + ]) + self.assertFalse(pipeline_diff_tuningparams_a.equals(pipeline_diff_tuningparams_b)) + self.assertFalse(pipeline_diff_tuningparams_b.equals(pipeline_diff_tuningparams_a)) + self.assertTrue(pipeline_diff_tuningparams_a.equals(pipeline_diff_tuningparams_b, only_control_hyperparams=True)) + self.assertTrue(pipeline_diff_tuningparams_b.equals(pipeline_diff_tuningparams_a, only_control_hyperparams=True)) + pipeline_diff_tuningparams_a_copy = copy.deepcopy(pipeline_diff_tuningparams_a) + self.assertTrue(pipeline_diff_tuningparams_a.equals(pipeline_diff_tuningparams_a_copy)) + self.assertTrue(pipeline_diff_tuningparams_a.equals(pipeline_diff_tuningparams_a_copy, only_control_hyperparams=True)) + + # Pipelines with different control hyperparameters should not be equal, + # even with the only_control_hyperparams flag set. 
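+        # Here only the 'return_result' control hyper-parameter of ColumnParserPrimitive
+        # is varied between the two pipelines.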
+ pipeline_diff_controlparams_a = self._quick_build_pipeline_with_real_primitives([ + { + 'primitive': DatasetToDataFramePrimitive, + 'container_args': {'inputs': 'inputs.0'}, + }, { + 'primitive': ColumnParserPrimitive, + 'container_args': {'inputs': 'steps.0.produce'}, + 'value_hyperparams': {'return_result': 'replace'} + } + ]) + pipeline_diff_controlparams_b = self._quick_build_pipeline_with_real_primitives([ + { + 'primitive': DatasetToDataFramePrimitive, + 'container_args': {'inputs': 'inputs.0'}, + }, { + 'primitive': ColumnParserPrimitive, + 'container_args': {'inputs': 'steps.0.produce'}, + 'value_hyperparams': {'return_result': 'new'} # different value + } + ]) + self.assertFalse(pipeline_diff_controlparams_a.equals(pipeline_diff_controlparams_b)) + self.assertFalse(pipeline_diff_controlparams_b.equals(pipeline_diff_controlparams_a)) + self.assertFalse(pipeline_diff_controlparams_a.equals(pipeline_diff_controlparams_b, only_control_hyperparams=True)) + self.assertFalse(pipeline_diff_controlparams_b.equals(pipeline_diff_controlparams_a, only_control_hyperparams=True)) + pipeline_diff_controlparams_a_copy = copy.deepcopy(pipeline_diff_controlparams_a) + self.assertTrue(pipeline_diff_controlparams_a.equals(pipeline_diff_controlparams_a_copy)) + self.assertTrue(pipeline_diff_controlparams_a.equals(pipeline_diff_controlparams_a_copy, only_control_hyperparams=True)) + + +if __name__ == '__main__': + unittest.main() diff --git a/d3m/tests/test_pipeline_run.py b/d3m/tests/test_pipeline_run.py new file mode 100644 index 0000000..74cf5d9 --- /dev/null +++ b/d3m/tests/test_pipeline_run.py @@ -0,0 +1,280 @@ +import copy +import json +import os +import unittest + +import jsonschema + +import d3m +from d3m import utils +from d3m.environment_variables import D3M_BASE_IMAGE_NAME, D3M_BASE_IMAGE_DIGEST, D3M_IMAGE_NAME, D3M_IMAGE_DIGEST +from d3m.metadata import base as metadata_base +from d3m.metadata.pipeline_run import RuntimeEnvironment + + +class TestComputeResources(unittest.TestCase): + # todo + pass + + +class TestRuntimeEnvironment(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.repo_path = os.path.realpath(d3m.__file__).rsplit('d3m',1)[0] + cls.original_git_path = os.path.join(cls.repo_path, '.git') + cls.moved_git_path = os.path.join(cls.repo_path, '.git_moved') + + @classmethod + def tearDown(cls): + if os.path.exists(cls.moved_git_path): + os.rename(cls.moved_git_path, cls.original_git_path) + + def test_empty_instantiation(self): + with utils.silence(): + RuntimeEnvironment() + + def test_deterministic_id(self): + with utils.silence(): + env = RuntimeEnvironment() + id_ = env['id'] + del env['id'] + gen_id = utils.compute_hash_id(env) + self.assertEqual(id_, gen_id, 'environment.id not deterministically generated') + + def _set_env_vars(self): + self.D3M_BASE_IMAGE_NAME_set_previously = False + if D3M_BASE_IMAGE_NAME in os.environ: + self.D3M_BASE_IMAGE_NAME_set_previously = True + self.D3M_BASE_IMAGE_NAME_previous_value = os.environ[D3M_BASE_IMAGE_NAME] + os.environ[D3M_BASE_IMAGE_NAME] = 'D3M_BASE_IMAGE_NAME_VALUE' + + self.D3M_BASE_IMAGE_DIGEST_set_previously = False + if D3M_BASE_IMAGE_DIGEST in os.environ: + self.D3M_BASE_IMAGE_DIGEST_set_previously = True + self.D3M_BASE_IMAGE_DIGEST_previous_value = os.environ[D3M_BASE_IMAGE_DIGEST] + os.environ[D3M_BASE_IMAGE_DIGEST] = 'D3M_BASE_IMAGE_DIGEST_VALUE' + + self.D3M_IMAGE_NAME_set_previously = False + if D3M_IMAGE_NAME in os.environ: + self.D3M_IMAGE_NAME_set_previously = True + self.D3M_IMAGE_NAME_previous_value = 
os.environ[D3M_IMAGE_NAME] + os.environ[D3M_IMAGE_NAME] = 'D3M_IMAGE_NAME_VALUE' + + self.D3M_IMAGE_DIGEST_set_previously = False + if D3M_IMAGE_DIGEST in os.environ: + self.D3M_IMAGE_DIGEST_set_previously = True + self.D3M_IMAGE_DIGEST_previous_value = os.environ[D3M_IMAGE_DIGEST] + os.environ[D3M_IMAGE_DIGEST] = 'D3M_IMAGE_DIGEST_VALUE' + + def _unset_env_vars(self): + if self.D3M_BASE_IMAGE_NAME_set_previously: + os.environ[D3M_BASE_IMAGE_NAME] = self.D3M_BASE_IMAGE_NAME_previous_value + else: + del os.environ[D3M_BASE_IMAGE_NAME] + if self.D3M_BASE_IMAGE_DIGEST_set_previously: + os.environ[D3M_BASE_IMAGE_DIGEST] = self.D3M_BASE_IMAGE_DIGEST_previous_value + else: + del os.environ[D3M_BASE_IMAGE_DIGEST] + if self.D3M_IMAGE_NAME_set_previously: + os.environ[D3M_IMAGE_NAME] = self.D3M_IMAGE_NAME_previous_value + else: + del os.environ[D3M_IMAGE_NAME] + if self.D3M_IMAGE_DIGEST_set_previously: + os.environ[D3M_IMAGE_DIGEST] = self.D3M_IMAGE_DIGEST_previous_value + else: + del os.environ[D3M_IMAGE_DIGEST] + + def test_env_vars(self): + self._set_env_vars() + try: + with utils.silence(): + env = RuntimeEnvironment() + + self.assertEqual( + env['base_docker_image']['image_name'], + os.environ[D3M_BASE_IMAGE_NAME], + 'base_image_name incorrectly extracted from environment variables' + ) + self.assertEqual( + env['base_docker_image']['image_digest'], + os.environ[D3M_BASE_IMAGE_DIGEST], + 'base_image_digest incorrectly extracted from environment variables' + ) + self.assertEqual( + env['docker_image']['image_name'], + os.environ[D3M_IMAGE_NAME], + 'image_name incorrectly extracted from environment variables' + ) + self.assertEqual( + env['docker_image']['image_digest'], + os.environ[D3M_IMAGE_DIGEST], + 'image_digest incorrectly extracted from environment variables' + ) + + finally: + self._unset_env_vars() + + def test_no_git_repo(self): + git_path_moved = False + if os.path.exists(self.original_git_path): + os.rename(self.original_git_path, self.moved_git_path) + git_path_moved = True + try: + with utils.silence(): + env = RuntimeEnvironment() + + self.assertEqual( + env['reference_engine_version'], d3m.__version__, + 'reference_engine_version incorrectly extracted from d3m repo' + ) + + self.assertEqual( + env['engine_version'], d3m.__version__, + 'reference_engine_version incorrectly extracted from d3m repo' + ) + finally: + if git_path_moved: + os.rename(self.moved_git_path, self.original_git_path) + + +class TestPipelineRunSchema(unittest.TestCase): + def test_scoring(self): + """ + When scoring of a pipeline is performed without a data preparation pipeline, + the scoring datasets must be recorded in the pipeline run. + When scoring pipeline information is not recored in pipeline run, results.scores + should also not be recorded. 
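+        In other words, a scoring entry must either accompany a data preparation
+        pipeline or record the scoring datasets itself (but not both at once), and
+        results.scores may only appear when scoring is recorded in one of those ways.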
+ """ + + schemas = copy.copy(metadata_base.SCHEMAS) + schemas['http://example.com/testing_run.json'] = copy.copy(metadata_base.DEFINITIONS_JSON) + schemas['http://example.com/testing_run.json']['id'] = 'http://example.com/testing_run.json' + schemas['http://example.com/testing_run.json'].update(metadata_base.DEFINITIONS_JSON['definitions']['pipeline_run']) + + validator, = utils.load_schema_validators(schemas, ('testing_run.json',)) + + id_digest = { + 'id': '0000000000000000000000000000000000000000000000000000000000000000', + 'digest': '0000000000000000000000000000000000000000000000000000000000000000' + } + status = {'state': 'SUCCESS'} + run_base_json = {'phase': 'FIT'} + data_preparation_base_json = { + 'pipeline': id_digest, + 'steps': [ + { + 'type': 'PRIMITIVE', + 'status': status + } + ], + 'status': status + } + scoring_base_json = data_preparation_base_json + results_scores = { + 'results': { + 'scores': [ + { + 'metric': { + 'metric': 'ACCURACY', + }, + 'value': 0.5, + } + ] + } + } + + valid_cases = [ + { + **run_base_json + }, + { + **run_base_json, + 'data_preparation': data_preparation_base_json + }, + { + **run_base_json, + 'data_preparation': data_preparation_base_json, + 'scoring': scoring_base_json + }, + { + **run_base_json, + 'scoring': { + **scoring_base_json, + 'datasets': [ + id_digest + ] + } + }, + { + **run_base_json, + 'data_preparation': data_preparation_base_json, + 'scoring': scoring_base_json, + **results_scores, + }, + { + **run_base_json, + 'scoring': { + **scoring_base_json, + 'datasets': [ + id_digest + ] + }, + **results_scores, + }, + ] + + invalid_cases = [ + { + **run_base_json, + 'scoring': scoring_base_json + }, + { + **run_base_json, + **results_scores, + }, + { + **run_base_json, + 'data_preparation': data_preparation_base_json, + **results_scores, + }, + { + **run_base_json, + 'scoring': scoring_base_json, + **results_scores, + }, + { + **run_base_json, + 'data_preparation': data_preparation_base_json, + 'scoring': { + **scoring_base_json, + 'datasets': [ + id_digest + ] + } + }, + { + **run_base_json, + 'data_preparation': data_preparation_base_json, + 'scoring': { + **scoring_base_json, + 'datasets': [ + id_digest + ] + }, + **results_scores, + }, + ] + + for i, valid_case in enumerate(valid_cases): + try: + validator.validate(valid_case) + except jsonschema.exceptions.ValidationError as e: + self.fail(f'{i}: {e}') + + for i, invalid_case in enumerate(invalid_cases): + with self.assertRaises(jsonschema.exceptions.ValidationError, msg=str(i)): + validator.validate(invalid_case) + + +if __name__ == '__main__': + unittest.main() diff --git a/d3m/tests/test_plasma.py b/d3m/tests/test_plasma.py new file mode 100644 index 0000000..db55ceb --- /dev/null +++ b/d3m/tests/test_plasma.py @@ -0,0 +1,115 @@ +import os +import signal +import subprocess +import time +import unittest + +import numpy +import pandas + +# See: https://gitlab.com/datadrivendiscovery/d3m/issues/66 +try: + from pyarrow import plasma +except ModuleNotFoundError: + plasma = None + +from d3m import container + + +@unittest.skipIf(plasma is None, "requires Plasma") +class TestPlasma(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.process = subprocess.Popen(['plasma_store', '-m', '1000000', '-s', '/tmp/plasma', '-d', '/dev/shm'], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, encoding='utf8', preexec_fn=os.setpgrp) + time.sleep(5) + cls.client = plasma.connect('/tmp/plasma') + + @classmethod + def tearDownClass(cls): + cls.client.disconnect() + 
os.killpg(os.getpgid(cls.process.pid), signal.SIGTERM) + + def test_list(self): + l = container.List([1, 2, 3], generate_metadata=True) + + l.metadata = l.metadata.update((), { + 'test': 'foobar', + }) + + object_id = self.client.put(l) + l_copy = self.client.get(object_id) + + self.assertIsInstance(l_copy, container.List) + self.assertTrue(hasattr(l_copy, 'metadata')) + + self.assertSequenceEqual(l, l_copy) + self.assertEqual(l.metadata.to_internal_json_structure(), l_copy.metadata.to_internal_json_structure()) + self.assertEqual(l_copy.metadata.query(()).get('test'), 'foobar') + + def test_ndarray(self): + for name, dtype, values in ( + ('ints', numpy.int64, [1, 2, 3]), + ('strings', numpy.dtype(' base.CallResult[Outputs]: + pass + + self.assertEqual(TestPrimitive.metadata.query()['primitive_code'].get('hyperparams', {}), { + 'n_components': { + 'type': hyperparams.Hyperparameter, + 'default': None, + 'structural_type': typing.Optional[int], + 'semantic_types': ('https://metadata.datadrivendiscovery.org/types/TuningParameter',), + 'description': 'Number of components (< n_classes - 1) for dimensionality reduction.', + }, + 'learning_rate': { + 'type': hyperparams.Uniform, + 'default': 0.1, + 'structural_type': float, + 'semantic_types': ( + 'https://metadata.datadrivendiscovery.org/types/TuningParameter', + 'https://metadata.datadrivendiscovery.org/types/ResourcesUseParameter', + ), + 'description': 'Learning rate shrinks the contribution of each classifier by ``learning_rate``. There is a trade-off between ``learning_rate`` and ``n_estimators``.', + 'lower': 0.01, + 'upper': 2, + 'lower_inclusive': True, + 'upper_inclusive': False, + }, + 'array1': { + 'type': hyperparams.Hyperparameter, + 'default': ((1, 2), (3, 4)), + 'structural_type': container.ndarray, + 'semantic_types': ( + 'https://metadata.datadrivendiscovery.org/types/TuningParameter', + ), + }, + 'array2': { + 'type': hyperparams.Hyperparameter, + 'default': ((1, 2), (3, 4)), + 'structural_type': container.DataFrame, + 'semantic_types': ( + 'https://metadata.datadrivendiscovery.org/types/TuningParameter', + ), + }, + }) + + json.dumps(TestPrimitive.metadata.to_json_structure()) + + def test_package_validation(self): + Inputs = container.List + Outputs = container.List + + class Hyperparams(hyperparams.Hyperparams): + pass + + with self.assertRaisesRegex(ValueError, 'Invalid package name'): + # Silence any validation warnings. + with utils.silence(): + class TestPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): + metadata = metadata_base.PrimitiveMetadata({ + 'id': '67568a80-dec2-4597-a10f-39afb13d3b9c', + 'version': '0.1.0', + 'name': "Test Primitive", + 'source': { + 'name': 'Test', + }, + 'installation': [{ + 'type': metadata_base.PrimitiveInstallationType.PIP, + 'package': 'git+https://gitlab.com/datadrivendiscovery/tests-data.git', + 'version': '0.1.0', + }], + 'python_path': 'd3m.primitives.test.TestPrimitive', + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.NUMERICAL_METHOD, + ], + 'primitive_family': metadata_base.PrimitiveFamily.OPERATOR, + }) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + pass + + def test_package_uri_validation(self): + Inputs = container.List + Outputs = container.List + + class Hyperparams(hyperparams.Hyperparams): + pass + + with self.assertRaisesRegex(ValueError, 'Package URI does not include a commit hash'): + # Silence any validation warnings. 
+ with utils.silence(): + class TestPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): + metadata = metadata_base.PrimitiveMetadata({ + 'id': '67568a80-dec2-4597-a10f-39afb13d3b9c', + 'version': '0.1.0', + 'name': "Test Primitive", + 'source': { + 'name': 'Test', + }, + 'installation': [{ + 'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/tests-data.git', + }], + 'python_path': 'd3m.primitives.test.TestPrimitive', + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.NUMERICAL_METHOD, + ], + 'primitive_family': metadata_base.PrimitiveFamily.OPERATOR, + }) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + pass + + with self.assertRaisesRegex(ValueError, 'Package URI does not include a commit hash'): + # Silence any validation warnings. + with utils.silence(): + class TestPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): + metadata = metadata_base.PrimitiveMetadata({ + 'id': '67568a80-dec2-4597-a10f-39afb13d3b9c', + 'version': '0.1.0', + 'name': "Test Primitive", + 'source': { + 'name': 'Test', + }, + 'installation': [{ + # Once with string. + 'type': 'PIP', + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/tests-data.git@v0.1.0', + }], + 'python_path': 'd3m.primitives.test.TestPrimitive', + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.NUMERICAL_METHOD, + ], + 'primitive_family': metadata_base.PrimitiveFamily.OPERATOR, + }) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + pass + + with self.assertRaisesRegex(ValueError, 'Package URI does not include a commit hash'): + # Silence any validation warnings. + with utils.silence(): + class TestPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): + metadata = metadata_base.PrimitiveMetadata({ + 'id': '67568a80-dec2-4597-a10f-39afb13d3b9c', + 'version': '0.1.0', + 'name': "Test Primitive", + 'source': { + 'name': 'Test', + }, + 'installation': [{ + # Once with enum value. + 'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/tests-data.git@v0.1.0', + }], + 'python_path': 'd3m.primitives.test.TestPrimitive', + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.NUMERICAL_METHOD, + ], + 'primitive_family': metadata_base.PrimitiveFamily.OPERATOR, + }) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + pass + + def test_union_extra_argument(self): + Inputs = typing.Union[container.List, container.ndarray] + Outputs = container.List + + class Hyperparams(hyperparams.Hyperparams): + pass + + # Silence any validation warnings. 
+ with utils.silence(): + class TestPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): + metadata = metadata_base.PrimitiveMetadata({ + 'id': '5431cc97-9ebe-48c6-ae6d-e97a611e4a24', + 'version': '0.1.0', + 'name': "Test Primitive", + 'source': { + 'name': 'Test', + }, + 'python_path': 'd3m.primitives.test.TestPrimitive', + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.NUMERICAL_METHOD, + ], + 'primitive_family': metadata_base.PrimitiveFamily.OPERATOR, + }) + + def produce(self, *, inputs: Inputs, additional: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + pass + + def multi_produce(self, *, produce_methods: typing.Sequence[str], inputs: Inputs, additional: Inputs, timeout: float = None, iterations: int = None) -> base.MultiCallResult: + return self._multi_produce(produce_methods=produce_methods, timeout=timeout, iterations=iterations, inputs=inputs, additional=additional) + + def fit_multi_produce(self, *, produce_methods: typing.Sequence[str], inputs: Inputs, additional: Inputs, timeout: float = None, iterations: int = None) -> base.MultiCallResult: + return self._fit_multi_produce(produce_methods=produce_methods, timeout=timeout, iterations=iterations, inputs=inputs, additional=additional) + + def test_subclass(self): + Inputs = container.List + Outputs = container.List + + class TestHyperparams(hyperparams.Hyperparams): + a = hyperparams.Hyperparameter( + default=0, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + + # Silence any validation warnings. + with utils.silence(): + class TestPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs, TestHyperparams]): + metadata = metadata_base.PrimitiveMetadata({ + 'id': 'fd89a661-6aed-49ad-aa65-3d41ba9ee903', + 'version': '0.1.0', + 'name': "Test Primitive", + 'source': { + 'name': 'Test', + }, + 'python_path': 'd3m.primitives.test.TestPrimitive', + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.NUMERICAL_METHOD, + ], + 'primitive_family': metadata_base.PrimitiveFamily.OPERATOR, + }) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + pass + + class SubclassTestHyperparams(TestHyperparams): + b = hyperparams.Hyperparameter( + default=1, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + + # Silence any validation warnings. 
+ with utils.silence(): + class SubclassTestPrimitive(TestPrimitive, transformer.TransformerPrimitiveBase[Inputs, Outputs, SubclassTestHyperparams]): + metadata = metadata_base.PrimitiveMetadata({ + 'id': 'f7ba1f51-ed06-4466-8fbd-857637b2d322', + 'version': '0.1.0', + 'name': "Subclass Test Primitive", + 'source': { + 'name': 'Test', + }, + 'python_path': 'd3m.primitives.test.TestPrimitive', + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.NUMERICAL_METHOD, + ], + 'primitive_family': metadata_base.PrimitiveFamily.OPERATOR, + }) + + self.assertEqual(SubclassTestPrimitive.metadata.query()['id'], 'f7ba1f51-ed06-4466-8fbd-857637b2d322') + self.assertIs(SubclassTestPrimitive.metadata.get_hyperparams(), SubclassTestHyperparams) + self.assertEqual(set(SubclassTestHyperparams.configuration.keys()), {'a', 'b'}) + + self.assertEqual(TestPrimitive.metadata.query()['id'], 'fd89a661-6aed-49ad-aa65-3d41ba9ee903') + self.assertIs(TestPrimitive.metadata.get_hyperparams(), TestHyperparams) + self.assertEqual(set(TestHyperparams.configuration.keys()), {'a'}) + + def test_base_class_descriptions_constant(self): + for loader, module_name, is_pkg in pkgutil.walk_packages(primitive_interfaces.__path__, primitive_interfaces.__name__ + '.'): + if is_pkg: + continue + + module = importlib.import_module(module_name) + for name, cls in inspect.getmembers(module, inspect.isclass): + if not issubclass(cls, base.PrimitiveBase): + continue + + # For each class that is a subclass of PrimitiveBase, check the doc string. + self.assertTrue(cls.__doc__.startswith(base.DEFAULT_DESCRIPTION), '{module_name}.{name}'.format(module_name=module_name, name=name)) + + +if __name__ == '__main__': + unittest.main() diff --git a/d3m/tests/test_primitive_sum.py b/d3m/tests/test_primitive_sum.py new file mode 100644 index 0000000..463569c --- /dev/null +++ b/d3m/tests/test_primitive_sum.py @@ -0,0 +1,283 @@ +import json +import unittest +import os.path +import sys + +import d3m +from d3m import container, utils +from d3m.metadata import base + +TEST_PRIMITIVES_DIR = os.path.join(os.path.dirname(__file__), 'data', 'primitives') + +sys.path.insert(0, TEST_PRIMITIVES_DIR) + +from test_primitives.primitive_sum import PrimitiveSumPrimitive +from test_primitives.null import NullTransformerPrimitive + + +EXPECTED_PRIMITIVE_DESCRIPTION_JSON = r""" +{ + "id": "6b061902-5e40-4a7a-9a21-b995dce1b2aa", + "version": "0.1.0", + "name": "Sum results of other primitives", + "keywords": [ + "test primitive" + ], + "source": { + "name": "Test team", + "contact": "mailto:author@example.com", + "uris": [ + "https://gitlab.com/datadrivendiscovery/tests-data/blob/master/primitives/test_primitives/primitive_sum.py", + "https://gitlab.com/datadrivendiscovery/tests-data.git" + ] + }, + "installation": [ + { + "type": "PIP", + "package_uri": "git+https://gitlab.com/datadrivendiscovery/tests-data.git@__GIT_COMMIT__#egg=test_primitives&subdirectory=primitives" + } + ], + "location_uris": [ + "https://gitlab.com/datadrivendiscovery/tests-data/raw/__GIT_COMMIT__/primitives/test_primitives/add_primitives.py" + ], + "python_path": "d3m.primitives.operator.primitive_sum.Test", + "algorithm_types": [ + "COMPUTER_ALGEBRA" + ], + "primitive_family": "OPERATOR", + "preconditions": [ + "NO_MISSING_VALUES", + "NO_CATEGORICAL_VALUES" + ], + "schema": "https://metadata.datadrivendiscovery.org/schemas/v0/primitive.json", + "original_python_path": "test_primitives.primitive_sum.PrimitiveSumPrimitive", + "primitive_code": { + "class_type_arguments": { + "Inputs": 
"d3m.container.list.List", + "Outputs": "d3m.container.list.List", + "Hyperparams": "test_primitives.primitive_sum.Hyperparams", + "Params": "NoneType" + }, + "interfaces_version": "__INTERFACES_VERSION__", + "interfaces": [ + "transformer.TransformerPrimitiveBase", + "base.PrimitiveBase" + ], + "hyperparams": { + "primitive_1": { + "type": "d3m.metadata.hyperparams.Primitive", + "default": "test_primitives.null.NullTransformerPrimitive", + "structural_type": "d3m.primitive_interfaces.base.PrimitiveBase", + "semantic_types": [ + "https://metadata.datadrivendiscovery.org/types/ControlParameter" + ], + "primitive_families": [ + + ], + "algorithm_types": [ + + ], + "produce_methods": [ + + ] + }, + "primitive_2": { + "type": "d3m.metadata.hyperparams.Primitive", + "default": "test_primitives.null.NullTransformerPrimitive", + "structural_type": "d3m.primitive_interfaces.base.PrimitiveBase", + "semantic_types": [ + "https://metadata.datadrivendiscovery.org/types/ControlParameter" + ], + "primitive_families": [ + + ], + "algorithm_types": [ + + ], + "produce_methods": [ + + ] + } + }, + "arguments": { + "hyperparams": { + "type": "test_primitives.primitive_sum.Hyperparams", + "kind": "RUNTIME" + }, + "random_seed": { + "type": "int", + "kind": "RUNTIME", + "default":0 + }, + "docker_containers": { + "type": "typing.Union[NoneType, typing.Dict[str, d3m.primitive_interfaces.base.DockerContainer]]", + "kind": "RUNTIME", + "default":null + }, + "volumes": { + "type": "typing.Union[NoneType, typing.Dict[str, str]]", + "kind": "RUNTIME", + "default":null + }, + "temporary_directory": { + "type": "typing.Union[NoneType, str]", + "kind": "RUNTIME", + "default": null + }, + "timeout": { + "type": "typing.Union[NoneType, float]", + "kind": "RUNTIME", + "default":null + }, + "iterations": { + "type": "typing.Union[NoneType, int]", + "kind": "RUNTIME", + "default":null + }, + "produce_methods": { + "type": "typing.Sequence[str]", + "kind": "RUNTIME" + }, + "inputs": { + "type": "d3m.container.list.List", + "kind": "PIPELINE" + }, + "params": { + "type": "NoneType", + "kind": "RUNTIME" + } + }, + "instance_methods": { + "__init__": { + "kind": "OTHER", + "arguments": [ + "hyperparams", + "random_seed", + "docker_containers", + "volumes", + "temporary_directory" + ], + "returns": "NoneType", + "description": "All primitives should accept all their hyper-parameters in a constructor as one value,\nan instance of type ``Hyperparams``.\n\nProvided random seed should control all randomness used by this primitive.\nPrimitive should behave exactly the same for the same random seed across multiple\ninvocations. You can call `numpy.random.RandomState(random_seed)` to obtain an\ninstance of a random generator using provided seed. If your primitive does not\nuse randomness, consider not exposing this argument in your primitive's constructor\nto signal that.\n\nPrimitives can be wrappers around or use one or more Docker images which they can\nspecify as part of ``installation`` field in their metadata. Each Docker image listed\nthere has a ``key`` field identifying that image. When primitive is created,\n``docker_containers`` contains a mapping between those keys and connection information\nwhich primitive can use to connect to a running Docker container for a particular Docker\nimage and its exposed ports. Docker containers might be long running and shared between\nmultiple instances of a primitive. 
If your primitive does not use Docker images,\nconsider not exposing this argument in your primitive's constructor.\n\n**Note**: Support for primitives using Docker containers has been put on hold.\nCurrently it is not expected that any runtime running primitives will run\nDocker containers for a primitive.\n\nPrimitives can also use additional static files which can be added as a dependency\nto ``installation`` metadata. When done so, given volumes are provided to the\nprimitive through ``volumes`` argument to the primitive's constructor as a\ndict mapping volume keys to file and directory paths where downloaded and\nextracted files are available to the primitive. All provided files and directories\nare read-only. If your primitive does not use static files, consider not exposing\nthis argument in your primitive's constructor.\n\nPrimitives can also use the provided temporary directory to store any files for\nthe duration of the current pipeline run phase. Directory is automatically\ncleaned up after the current pipeline run phase finishes. Do not store in this\ndirectory any primitive's state you would like to preserve between \"fit\" and\n\"produce\" phases of pipeline execution. Use ``Params`` for that. The main intent\nof this temporary directory is to store files referenced by any ``Dataset`` object\nyour primitive might create and followup primitives in the pipeline should have\naccess to. When storing files into this directory consider using capabilities\nof Python's `tempfile` module to generate filenames which will not conflict with\nany other files stored there. Use provided temporary directory as ``dir`` argument\nto set it as base directory to generate additional temporary files and directories\nas needed. If your primitive does not use temporary directory, consider not exposing\nthis argument in your primitive's constructor.\n\nNo other arguments to the constructor are allowed (except for private arguments)\nbecause we want instances of primitives to be created without a need for any other\nprior computation.\n\nModule in which a primitive is defined should be kept lightweight and on import not do\nany (pre)computation, data loading, or resource allocation/reservation. Any loading\nand resource allocation/reservation should be done in the constructor. Any (pre)computation\nshould be done lazily when needed once requested through other methods and not in the constructor." + }, + "fit": { + "kind": "OTHER", + "arguments": [ + "timeout", + "iterations" + ], + "returns": "d3m.primitive_interfaces.base.CallResult[NoneType]", + "description": "A noop.\n\nParameters\n----------\ntimeout:\n A maximum time this primitive should be fitting during this method call, in seconds.\niterations:\n How many of internal iterations should the primitive do.\n\nReturns\n-------\nA ``CallResult`` with ``None`` value." 
+ }, + "fit_multi_produce": { + "kind": "OTHER", + "arguments": [ + "produce_methods", + "inputs", + "timeout", + "iterations" + ], + "returns": "d3m.primitive_interfaces.base.MultiCallResult", + "description": "A method calling ``fit`` and after that multiple produce methods at once.\n\nParameters\n----------\nproduce_methods:\n A list of names of produce methods to call.\ninputs:\n The inputs given to all produce methods.\ntimeout:\n A maximum time this primitive should take to both fit the primitive and produce outputs\n for all produce methods listed in ``produce_methods`` argument, in seconds.\niterations:\n How many of internal iterations should the primitive do for both fitting and producing\n outputs of all produce methods.\n\nReturns\n-------\nA dict of values for each produce method wrapped inside ``MultiCallResult``." + }, + "get_params": { + "kind": "OTHER", + "arguments": [ + + ], + "returns": "NoneType", + "description": "A noop.\n\nReturns\n-------\nAn instance of parameters." + }, + "multi_produce": { + "kind": "OTHER", + "arguments": [ + "produce_methods", + "inputs", + "timeout", + "iterations" + ], + "returns": "d3m.primitive_interfaces.base.MultiCallResult", + "description": "A method calling multiple produce methods at once.\n\nWhen a primitive has multiple produce methods it is common that they might compute the\nsame internal results for same inputs but return different representations of those results.\nIf caller is interested in multiple of those representations, calling multiple produce\nmethods might lead to recomputing same internal results multiple times. To address this,\nthis method allows primitive author to implement an optimized version which computes\ninternal results only once for multiple calls of produce methods, but return those different\nrepresentations.\n\nIf any additional method arguments are added to primitive's produce method(s), they have\nto be added to this method as well. This method should accept an union of all arguments\naccepted by primitive's produce method(s) and then use them accordingly when computing\nresults.\n\nThe default implementation of this method just calls all produce methods listed in\n``produce_methods`` in order and is potentially inefficient.\n\nIf primitive should have been fitted before calling this method, but it has not been,\nprimitive should raise a ``PrimitiveNotFittedError`` exception.\n\nParameters\n----------\nproduce_methods:\n A list of names of produce methods to call.\ninputs:\n The inputs given to all produce methods.\ntimeout:\n A maximum time this primitive should take to produce outputs for all produce methods\n listed in ``produce_methods`` argument, in seconds.\niterations:\n How many of internal iterations should the primitive do.\n\nReturns\n-------\nA dict of values for each produce method wrapped inside ``MultiCallResult``." + }, + "produce": { + "kind": "PRODUCE", + "arguments": [ + "inputs", + "timeout", + "iterations" + ], + "returns": "d3m.primitive_interfaces.base.CallResult[d3m.container.list.List]", + "singleton":false, + "inputs_across_samples": [ + + ], + "description": "Produce primitive's best choice of the output for each of the inputs.\n\nThe output value should be wrapped inside ``CallResult`` object before returning.\n\nIn many cases producing an output is a quick operation in comparison with ``fit``, but not\nall cases are like that. For example, a primitive can start a potentially long optimization\nprocess to compute outputs. 
``timeout`` and ``iterations`` can serve as a way for a caller\nto guide the length of this process.\n\nIdeally, a primitive should adapt its call to try to produce the best outputs possible\ninside the time allocated. If this is not possible and the primitive reaches the timeout\nbefore producing outputs, it should raise a ``TimeoutError`` exception to signal that the\ncall was unsuccessful in the given time. The state of the primitive after the exception\nshould be as the method call has never happened and primitive should continue to operate\nnormally. The purpose of ``timeout`` is to give opportunity to a primitive to cleanly\nmanage its state instead of interrupting execution from outside. Maintaining stable internal\nstate should have precedence over respecting the ``timeout`` (caller can terminate the\nmisbehaving primitive from outside anyway). If a longer ``timeout`` would produce\ndifferent outputs, then ``CallResult``'s ``has_finished`` should be set to ``False``.\n\nSome primitives have internal iterations (for example, optimization iterations).\nFor those, caller can provide how many of primitive's internal iterations\nshould a primitive do before returning outputs. Primitives should make iterations as\nsmall as reasonable. If ``iterations`` is ``None``, then there is no limit on\nhow many iterations the primitive should do and primitive should choose the best amount\nof iterations on its own (potentially controlled through hyper-parameters).\nIf ``iterations`` is a number, a primitive has to do those number of iterations,\nif possible. ``timeout`` should still be respected and potentially less iterations\ncan be done because of that. Primitives with internal iterations should make\n``CallResult`` contain correct values.\n\nFor primitives which do not have internal iterations, any value of ``iterations``\nmeans that they should run fully, respecting only ``timeout``.\n\nIf primitive should have been fitted before calling this method, but it has not been,\nprimitive should raise a ``PrimitiveNotFittedError`` exception.\n\nParameters\n----------\ninputs:\n The inputs of shape [num_inputs, ...].\ntimeout:\n A maximum time this primitive should take to produce outputs during this method call, in seconds.\niterations:\n How many of internal iterations should the primitive do.\n\nReturns\n-------\nThe outputs of shape [num_inputs, ...] wrapped inside ``CallResult``." + }, + "set_params": { + "kind": "OTHER", + "arguments": [ + "params" + ], + "returns": "NoneType", + "description": "A noop.\n\nParameters\n----------\nparams:\n An instance of parameters." + }, + "set_training_data": { + "kind": "OTHER", + "arguments": [ + + ], + "returns": "NoneType", + "description": "A noop.\n\nParameters\n----------" + } + }, + "class_attributes": { + "logger": "logging.Logger", + "metadata": "d3m.metadata.base.PrimitiveMetadata" + }, + "class_methods": {}, + "instance_attributes": { + "hyperparams": "d3m.metadata.hyperparams.Hyperparams", + "random_seed": "int", + "docker_containers": "typing.Dict[str, d3m.primitive_interfaces.base.DockerContainer]", + "volumes": "typing.Dict[str, str]", + "temporary_directory": "typing.Union[NoneType, str]" + } + }, + "structural_type": "test_primitives.primitive_sum.PrimitiveSumPrimitive", + "description": "A primitive which element-wise sums the produced results of two other primitives. 
Each of those two primitives\nare given inputs (a list of numbers) to this primitive first as their inputs, are expected to return a list\nof numbers back, and then those lists are element-wise summed together, to produce the final list.\n\nThis primitive exists just as a demonstration. To sum results you would otherwise just simply\nsum the results directly instead of getting an instance of the primitive and call\nproduce methods on it. But this does allow more complicated ways of interacting with a\nprimitive and this primitive demonstrates it.\n\nAttributes\n----------\nmetadata:\n Primitive's metadata. Available as a class attribute.\nlogger:\n Primitive's logger. Available as a class attribute.\nhyperparams:\n Hyperparams passed to the constructor.\nrandom_seed:\n Random seed passed to the constructor.\ndocker_containers:\n A dict mapping Docker image keys from primitive's metadata to (named) tuples containing\n container's address under which the container is accessible by the primitive, and a\n dict mapping exposed ports to ports on that address.\nvolumes:\n A dict mapping volume keys from primitive's metadata to file and directory paths\n where downloaded and extracted files are available to the primitive.\ntemporary_directory:\n An absolute path to a temporary directory a primitive can use to store any files\n for the duration of the current pipeline run phase. Directory is automatically\n cleaned up after the current pipeline run phase finishes.", + "digest": "__DIGEST__" +} +""".replace('__INTERFACES_VERSION__', d3m.__version__).replace('__GIT_COMMIT__', utils.current_git_commit(TEST_PRIMITIVES_DIR)).replace('__DIGEST__', PrimitiveSumPrimitive.metadata.query()['digest']) + + +class TestPrimitiveSumPrimitive(unittest.TestCase): + def call_primitive(self, primitive, method_name, **kwargs): + return getattr(primitive, method_name)(**kwargs) + + def test_basic(self): + hyperparam_primitive1 = NullTransformerPrimitive(hyperparams=NullTransformerPrimitive.metadata.get_hyperparams().defaults()) + hyperparam_primitive2 = NullTransformerPrimitive(hyperparams=NullTransformerPrimitive.metadata.get_hyperparams().defaults()) + + primitive = PrimitiveSumPrimitive(hyperparams={'primitive_1': hyperparam_primitive1, 'primitive_2': hyperparam_primitive2}) + inputs = container.List([10, 20, 30], generate_metadata=True) + call_metadata = self.call_primitive(primitive, 'produce', inputs=inputs) + + self.assertSequenceEqual(call_metadata.value, [20, 40, 60]) + self.assertEqual(call_metadata.has_finished, True) + self.assertEqual(call_metadata.iterations_done, None) + + self.assertEqual(call_metadata.value.metadata.query(())['dimension']['length'], 3) + self.assertEqual(call_metadata.value.metadata.query((base.ALL_ELEMENTS,))['structural_type'], int) + + def test_metadata(self): + expected_description = json.loads(EXPECTED_PRIMITIVE_DESCRIPTION_JSON) + + # We stringify to JSON and parse it to make sure the description can be stringified to JSON. 
+ description = json.loads(json.dumps(PrimitiveSumPrimitive.metadata.to_json_structure())) + + self.maxDiff = None + self.assertEqual(expected_description, description) + + +if __name__ == '__main__': + unittest.main() diff --git a/d3m/tests/test_primitive_validation.py b/d3m/tests/test_primitive_validation.py new file mode 100644 index 0000000..8ae6e74 --- /dev/null +++ b/d3m/tests/test_primitive_validation.py @@ -0,0 +1,808 @@ +import typing +import unittest +import logging + +from d3m import container, exceptions, utils +from d3m.metadata import base as metadata_base, hyperparams, params +from d3m.primitive_interfaces import base, transformer, unsupervised_learning + +Inputs = container.List +Outputs = container.List + + +class Hyperparams(hyperparams.Hyperparams): + pass + + +class TestPrimitiveValidation(unittest.TestCase): + def test_multi_produce_missing_argument(self): + with self.assertRaisesRegex(exceptions.InvalidPrimitiveCodeError, '\'multi_produce\' method arguments have to be an union of all arguments of all produce methods, but it does not accept all expected arguments'): + # Silence any validation warnings. + with utils.silence(): + class TestPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): + metadata = metadata_base.PrimitiveMetadata({ + 'id': '67568a80-dec2-4597-a10f-39afb13d3b9c', + 'version': '0.1.0', + 'name': "Test Primitive", + 'python_path': 'd3m.primitives.test.TestPrimitive', + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.NUMERICAL_METHOD, + ], + 'primitive_family': metadata_base.PrimitiveFamily.OPERATOR, + }) + + def produce(self, *, inputs: Inputs, second_inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + pass + + def test_fit_multi_produce_missing_argument(self): + with self.assertRaisesRegex(exceptions.InvalidPrimitiveCodeError, '\'fit_multi_produce\' method arguments have to be an union of all arguments of \'set_training_data\' method and all produce methods, but it does not accept all expected arguments'): + # Silence any validation warnings. + with utils.silence(): + class TestPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): + metadata = metadata_base.PrimitiveMetadata({ + 'id': '67568a80-dec2-4597-a10f-39afb13d3b9c', + 'version': '0.1.0', + 'name': "Test Primitive", + 'python_path': 'd3m.primitives.test.TestPrimitive', + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.NUMERICAL_METHOD, + ], + 'primitive_family': metadata_base.PrimitiveFamily.OPERATOR, + }) + + def produce(self, *, inputs: Inputs, second_inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + pass + + def multi_produce(self, *, produce_methods: typing.Sequence[str], inputs: Inputs, second_inputs: Inputs, timeout: float = None, iterations: int = None) -> base.MultiCallResult: + pass + + def test_multi_produce_extra_argument(self): + with self.assertRaisesRegex(exceptions.InvalidPrimitiveCodeError, '\'multi_produce\' method arguments have to be an union of all arguments of all produce methods, but it accepts unexpected arguments'): + # Silence any validation warnings. 
+ with utils.silence(): + class TestPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): + metadata = metadata_base.PrimitiveMetadata({ + 'id': '67568a80-dec2-4597-a10f-39afb13d3b9c', + 'version': '0.1.0', + 'name': "Test Primitive", + 'python_path': 'd3m.primitives.test.TestPrimitive', + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.NUMERICAL_METHOD, + ], + 'primitive_family': metadata_base.PrimitiveFamily.OPERATOR, + }) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + pass + + def multi_produce(self, *, produce_methods: typing.Sequence[str], inputs: Inputs, second_inputs: Inputs, timeout: float = None, iterations: int = None) -> base.MultiCallResult: + pass + + def test_fit_multi_produce_extra_argument(self): + with self.assertRaisesRegex(exceptions.InvalidPrimitiveCodeError, '\'fit_multi_produce\' method arguments have to be an union of all arguments of \'set_training_data\' method and all produce methods, but it accepts unexpected arguments'): + # Silence any validation warnings. + with utils.silence(): + class TestPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): + metadata = metadata_base.PrimitiveMetadata({ + 'id': '67568a80-dec2-4597-a10f-39afb13d3b9c', + 'version': '0.1.0', + 'name': "Test Primitive", + 'python_path': 'd3m.primitives.test.TestPrimitive', + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.NUMERICAL_METHOD, + ], + 'primitive_family': metadata_base.PrimitiveFamily.OPERATOR, + }) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + pass + + def fit_multi_produce(self, *, produce_methods: typing.Sequence[str], inputs: Inputs, second_inputs: Inputs, timeout: float = None, iterations: int = None) -> base.MultiCallResult: + pass + + def test_produce_using_produce_methods(self): + with self.assertRaisesRegex(exceptions.InvalidPrimitiveCodeError, 'Produce method cannot use \'produce_methods\' argument'): + # Silence any validation warnings. + with utils.silence(): + class TestPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): + metadata = metadata_base.PrimitiveMetadata({ + 'id': '67568a80-dec2-4597-a10f-39afb13d3b9c', + 'version': '0.1.0', + 'name': "Test Primitive", + 'python_path': 'd3m.primitives.test.TestPrimitive', + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.NUMERICAL_METHOD, + ], + 'primitive_family': metadata_base.PrimitiveFamily.OPERATOR, + }) + + def produce(self, *, inputs: Inputs, produce_methods: typing.Sequence[str], timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + pass + + def test_hyperparams_to_tune(self): + with self.assertRaisesRegex(exceptions.InvalidMetadataError, 'Hyper-parameter in \'hyperparams_to_tune\' metadata does not exist'): + # Silence any validation warnings. 
+ with utils.silence(): + class TestPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): + metadata = metadata_base.PrimitiveMetadata({ + 'id': '67568a80-dec2-4597-a10f-39afb13d3b9c', + 'version': '0.1.0', + 'name': "Test Primitive", + 'python_path': 'd3m.primitives.test.TestPrimitive', + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.NUMERICAL_METHOD, + ], + 'primitive_family': metadata_base.PrimitiveFamily.OPERATOR, + 'hyperparams_to_tune': [ + 'foobar', + ] + }) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + pass + + def test_inputs_across_samples(self): + with self.assertRaisesRegex(exceptions.InvalidPrimitiveCodeError, 'Method \'.*\' has an argument \'.*\' set as computing across samples, but it does not exist'): + # Silence any validation warnings. + with utils.silence(): + class TestPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): + metadata = metadata_base.PrimitiveMetadata({ + 'id': '67568a80-dec2-4597-a10f-39afb13d3b9c', + 'version': '0.1.0', + 'name': "Test Primitive", + 'python_path': 'd3m.primitives.test.TestPrimitive', + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.NUMERICAL_METHOD, + ], + 'primitive_family': metadata_base.PrimitiveFamily.OPERATOR, + 'hyperparams_to_tune': [ + 'foobar', + ] + }) + + @base.inputs_across_samples('foobar') + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + pass + + with self.assertRaisesRegex(exceptions.InvalidPrimitiveCodeError, 'Method \'.*\' has an argument \'.*\' set as computing across samples, but it is not a PIPELINE argument'): + # Silence any validation warnings. + with utils.silence(): + class TestPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): + metadata = metadata_base.PrimitiveMetadata({ + 'id': '67568a80-dec2-4597-a10f-39afb13d3b9c', + 'version': '0.1.0', + 'name': "Test Primitive", + 'python_path': 'd3m.primitives.test.TestPrimitive', + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.NUMERICAL_METHOD, + ], + 'primitive_family': metadata_base.PrimitiveFamily.OPERATOR, + 'hyperparams_to_tune': [ + 'foobar', + ] + }) + + @base.inputs_across_samples('timeout') + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + pass + + def test_can_detect_too_many_package_components(self): + logger = logging.getLogger('d3m.metadata.base') + + # Ensure a warning message is generated for too many package components + with self.assertLogs(logger=logger, level=logging.DEBUG) as cm: + metadata_base.PrimitiveMetadata()._validate_namespace_compliance('d3m.primitives.classification.random_forest.SKLearn.toomany', metadata_base.PrimitiveFamily.CLASSIFICATION) + + self.assertEqual(len(cm.records), 1) + self.assertEqual(cm.records[0].msg, + "%(python_path)s: Primitive's Python path does not adhere to d3m.primitives namespace specification. 
" + "Reason: must have 5 segments.") + + # Ensure a warning message is NOT generated for an acceptable number of components + with self.assertLogs(logger=logger, level=logging.DEBUG) as cm: + logger.debug("Dummy log") + metadata_base.PrimitiveMetadata()._validate_namespace_compliance('d3m.primitives.classification.random_forest.SKLearn', metadata_base.PrimitiveFamily.CLASSIFICATION) + + self.assertEqual(len(cm.records), 1) + + def test_with_string_instead_of_enum(self): + logger = logging.getLogger(metadata_base.__name__) + + # Ensure a warning message is NOT generated for an acceptable number of components + with self.assertLogs(logger=logger, level=logging.DEBUG) as cm: + logger.debug("Dummy log") + metadata_base.PrimitiveMetadata()._validate_namespace_compliance('d3m.primitives.classification.random_forest.SKLearn', metadata_base.PrimitiveFamily.CLASSIFICATION.name) + + self.assertEqual(len(cm.records), 1) + + def test_can_detect_too_few_package_components(self): + logger = logging.getLogger(metadata_base.__name__) + + # Ensure a warning message is generated for too few package components + with self.assertLogs(logger=logger, level=logging.DEBUG) as cm: + metadata_base.PrimitiveMetadata()._validate_namespace_compliance('d3m.primitives.classification.too_few', metadata_base.PrimitiveFamily.CLASSIFICATION) + + self.assertEqual(len(cm.records), 1) + self.assertEqual(cm.records[0].msg, + "%(python_path)s: Primitive's Python path does not adhere to d3m.primitives namespace specification. " + "Reason: must have 5 segments.") + + # Ensure a warning message is NOT generated for an acceptable number of components + with self.assertLogs(logger=logger, level=logging.DEBUG) as cm: + logger.debug("Dummy log") + metadata_base.PrimitiveMetadata()._validate_namespace_compliance('d3m.primitives.classification.random_forest.SKLearn', metadata_base.PrimitiveFamily.CLASSIFICATION) + + self.assertEqual(len(cm.records), 1) + + def test_can_detect_bad_primitive_family(self): + logger = logging.getLogger(metadata_base.__name__) + + # Ensure a warning message is generated for a bad primitive family + with self.assertLogs(logger=logger, level=logging.DEBUG) as cm: + metadata_base.PrimitiveMetadata()._validate_namespace_compliance('d3m.primitives.bad_family.random_forest.SKLearn', metadata_base.PrimitiveFamily.CLASSIFICATION) + + self.assertEqual(len(cm.records), 1) + self.assertEqual(cm.records[0].msg, + "%(python_path)s: Primitive's Python path does not adhere to d3m.primitives namespace specification." 
+ " Reason: primitive family segment must match primitive's primitive family.") + + # Ensure a warning message is NOT generated for an acceptable primitive family + with self.assertLogs(logger=logger, level=logging.DEBUG) as cm: + logger.debug("Dummy log") + metadata_base.PrimitiveMetadata()._validate_namespace_compliance('d3m.primitives.classification.random_forest.SKLearn', metadata_base.PrimitiveFamily.CLASSIFICATION) + + self.assertEqual(len(cm.records), 1) + + def test_can_detect_bad_primitive_name(self): + logger = logging.getLogger(metadata_base.__name__) + + # Ensure a warning message is generated for a bad primitive name + with self.assertLogs(logger=logger, level=logging.DEBUG) as cm: + metadata_base.PrimitiveMetadata()._validate_namespace_compliance('d3m.primitives.classification.bad_name.SKLearn', metadata_base.PrimitiveFamily.CLASSIFICATION) + + self.assertEqual(len(cm.records), 1) + self.assertEqual(cm.records[0].msg, + "%(python_path)s: Primitive's Python path does not adhere to d3m.primitives namespace specification. " + "Reason: must have a known primitive name segment.") + + # Ensure a warning message is NOT generated for an acceptable primitive name + with self.assertLogs(logger=logger, level=logging.DEBUG) as cm: + logger.debug("Dummy log") + metadata_base.PrimitiveMetadata()._validate_namespace_compliance('d3m.primitives.classification.random_forest.SKLearn', metadata_base.PrimitiveFamily.CLASSIFICATION) + + self.assertEqual(len(cm.records), 1) + + def test_can_detect_kind_not_capitalized(self): + logger = logging.getLogger(metadata_base.__name__) + + # Ensure a warning message is generated for a primitive kind not capitalized properly + with self.assertLogs(logger=logger, level=logging.DEBUG) as cm: + metadata_base.PrimitiveMetadata()._validate_namespace_compliance('d3m.primitives.classification.random_forest.sklearn', metadata_base.PrimitiveFamily.CLASSIFICATION) + + self.assertEqual(len(cm.records), 1) + self.assertEqual(cm.records[0].msg, + "%(python_path)s: Primitive's Python path does not adhere to d3m.primitives namespace specification. 
" + "Reason: primitive kind segment must start with upper case.") + + # Ensure a warning message is NOT generated for an acceptable primitive kind + with self.assertLogs(logger=logger, level=logging.DEBUG) as cm: + logger.debug("Dummy log") + metadata_base.PrimitiveMetadata()._validate_namespace_compliance('d3m.primitives.classification.random_forest.SKLearn', metadata_base.PrimitiveFamily.CLASSIFICATION) + + self.assertEqual(len(cm.records), 1) + + def test_will_generate_warning_for_missing_contact(self): + logger = logging.getLogger(metadata_base.__name__) + + bad_metadata = metadata_base.PrimitiveMetadata({ + 'id': 'id', + 'version': '0.1.0', + 'name': "Test Primitive", + 'python_path': 'path', + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.PRINCIPAL_COMPONENT_ANALYSIS, + ], + 'primitive_family': metadata_base.PrimitiveFamily.FEATURE_SELECTION, + 'installation': [{ + 'type': metadata_base.PrimitiveInstallationType.PIP, + 'package': 'foobar', + 'version': '0.1.0', + }], + 'source': { + 'name': 'Test author', + # 'contact': 'mailto:test@example.com', + 'uris': 'http://someplace' + } + }) + + good_metadata = metadata_base.PrimitiveMetadata({ + 'id': 'id', + 'version': '0.1.0', + 'name': "Test Primitive", + 'python_path': 'path', + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.PRINCIPAL_COMPONENT_ANALYSIS, + ], + 'primitive_family': metadata_base.PrimitiveFamily.FEATURE_SELECTION, + 'installation': [{ + 'type': metadata_base.PrimitiveInstallationType.PIP, + 'package': 'foobar', + 'version': '0.1.0', + }], + 'source': { + 'name': 'Test author', + 'contact': 'mailto:test@example.com', + 'uris': 'http://someplace' + } + }) + + # Ensure a warning message is generated for a primitive with no contact specified in the metadata.source + with self.assertLogs(logger=logger, level=logging.DEBUG) as cm: + metadata_base.PrimitiveMetadata()._validate_contact_information(bad_metadata.query()) + + self.assertEqual(len(cm.records), 1) + self.assertEqual(cm.records[0].msg, "%(python_path)s: Contact information such as the email address of the author (e.g., \"mailto:author@example.com\") should be specified in primitive metadata in its \"source.contact\" field.") + + # Ensure a warning message is NOT generated for a primitive with a contact specified in the metadata.source + with self.assertLogs(logger=logger, level=logging.DEBUG) as cm: + logger.debug("Dummy log") + metadata_base.PrimitiveMetadata()._validate_contact_information(good_metadata.query()) + + self.assertEqual(len(cm.records), 1) + + def test_will_generate_warning_for_empty_contact(self): + logger = logging.getLogger(metadata_base.__name__) + + bad_metadata = metadata_base.PrimitiveMetadata({ + 'id': 'id', + 'version': '0.1.0', + 'name': "Test Primitive", + 'python_path': 'path', + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.PRINCIPAL_COMPONENT_ANALYSIS, + ], + 'primitive_family': metadata_base.PrimitiveFamily.FEATURE_SELECTION, + 'installation': [{ + 'type': metadata_base.PrimitiveInstallationType.PIP, + 'package': 'foobar', + 'version': '0.1.0', + }], + 'source': { + 'name': 'Test author', + 'contact': '', + 'uris': ['http://someplace'] + } + }) + + good_metadata = metadata_base.PrimitiveMetadata({ + 'id': 'id', + 'version': '0.1.0', + 'name': "Test Primitive", + 'python_path': 'path', + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.PRINCIPAL_COMPONENT_ANALYSIS, + ], + 'primitive_family': metadata_base.PrimitiveFamily.FEATURE_SELECTION, + 'installation': [{ + 'type': 
metadata_base.PrimitiveInstallationType.PIP, + 'package': 'foobar', + 'version': '0.1.0', + }], + 'source': { + 'name': 'Test author', + 'contact': 'mailto:test@example.com', + 'uris': ['http://someplace'] + } + }) + + # Ensure a warning message is generated for a primitive with empty contact specified in the metadata.source. + with self.assertLogs(logger=logger, level=logging.DEBUG) as cm: + metadata_base.PrimitiveMetadata()._validate_contact_information(bad_metadata.query()) + + self.assertEqual(len(cm.records), 1) + self.assertEqual(cm.records[0].msg, "%(python_path)s: Contact information such as the email address of the author (e.g., \"mailto:author@example.com\") should be specified in primitive metadata in its \"source.contact\" field.") + + # Ensure a warning message is NOT generated when a contact value is specified. + with self.assertLogs(logger=logger, level=logging.DEBUG) as cm: + logger.debug("Dummy log") + metadata_base.PrimitiveMetadata()._validate_contact_information(good_metadata.query()) + + self.assertEqual(len(cm.records), 1) + + def test_will_not_generate_missing_contact_warning_when_installation_not_specified(self): + logger = logging.getLogger(metadata_base.__name__) + + good_metadata = metadata_base.PrimitiveMetadata({ + 'id': 'id', + 'version': '0.1.0', + 'name': "Test Primitive", + 'python_path': 'path', + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.PRINCIPAL_COMPONENT_ANALYSIS, + ], + 'primitive_family': metadata_base.PrimitiveFamily.FEATURE_SELECTION, + 'source': { + 'name': 'Test author', + 'uris': ['http://someplace'] + } + }) + + # Ensure a warning message is NOT generated when a contact value is not specified when installation is also + # not specified. + with self.assertLogs(logger=logger, level=logging.DEBUG) as cm: + logger.debug("Dummy log") + metadata_base.PrimitiveMetadata()._validate_contact_information(good_metadata.query()) + + self.assertEqual(len(cm.records), 1) + + def test_will_generate_warning_for_missing_uris(self): + logger = logging.getLogger(metadata_base.__name__) + + bad_metadata = metadata_base.PrimitiveMetadata({ + 'id': 'id', + 'version': '0.1.0', + 'name': "Test Primitive", + 'python_path': 'path', + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.PRINCIPAL_COMPONENT_ANALYSIS, + ], + 'primitive_family': metadata_base.PrimitiveFamily.FEATURE_SELECTION, + 'installation': [{ + 'type': metadata_base.PrimitiveInstallationType.PIP, + 'package': 'foobar', + 'version': '0.1.0', + }], + 'source': { + 'name': 'Test author', + 'contact': 'mailto:test@example.com', + } + }) + + good_metadata = metadata_base.PrimitiveMetadata({ + 'id': 'id', + 'version': '0.1.0', + 'name': "Test Primitive", + 'python_path': 'path', + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.PRINCIPAL_COMPONENT_ANALYSIS, + ], + 'primitive_family': metadata_base.PrimitiveFamily.FEATURE_SELECTION, + 'installation': [{ + 'type': metadata_base.PrimitiveInstallationType.PIP, + 'package': 'foobar', + 'version': '0.1.0', + }], + 'source': { + 'name': 'Test author', + 'contact': 'mailto:test@example.com', + 'uris': ['http://someplace'], + } + }) + + # Ensure a warning message is generated for a primitive with no uris specified in the metadata.source. 
+ with self.assertLogs(logger=logger, level=logging.DEBUG) as cm: + metadata_base.PrimitiveMetadata()._validate_contact_information(bad_metadata.query()) + + self.assertEqual(len(cm.records), 1) + self.assertEqual(cm.records[0].msg, "%(python_path)s: A bug reporting URI should be specified in primitive metadata in its \"source.uris\" field.") + + # Ensure a warning message is NOT generated when uris are specified in the metadata.source. + with self.assertLogs(logger=logger, level=logging.DEBUG) as cm: + logger.debug("Dummy log") + metadata_base.PrimitiveMetadata()._validate_contact_information(good_metadata.query()) + + self.assertEqual(len(cm.records), 1) + + def test_will_generate_warning_for_empty_uris(self): + logger = logging.getLogger(metadata_base.__name__) + + bad_metadata = metadata_base.PrimitiveMetadata({ + 'id': 'id', + 'version': '0.1.0', + 'name': "Test Primitive", + 'python_path': 'path', + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.PRINCIPAL_COMPONENT_ANALYSIS, + ], + 'primitive_family': metadata_base.PrimitiveFamily.FEATURE_SELECTION, + 'installation': [{ + 'type': metadata_base.PrimitiveInstallationType.PIP, + 'package': 'foobar', + 'version': '0.1.0', + }], + 'source': { + 'name': 'Test author', + 'contact': 'mailto:test@example.com', + 'uris': [], + } + }) + + good_metadata = metadata_base.PrimitiveMetadata({ + 'id': 'id', + 'version': '0.1.0', + 'name': "Test Primitive", + 'python_path': 'path', + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.PRINCIPAL_COMPONENT_ANALYSIS, + ], + 'primitive_family': metadata_base.PrimitiveFamily.FEATURE_SELECTION, + 'installation': [{ + 'type': metadata_base.PrimitiveInstallationType.PIP, + 'package': 'foobar', + 'version': '0.1.0', + }], + 'source': { + 'name': 'Test author', + 'contact': 'mailto:test@example.com', + 'uris': ['http://someplace'], + } + }) + + # Ensure a warning message is generated for a primitive with empty uris specified in the metadata.source. + with self.assertLogs(logger=logger, level=logging.DEBUG) as cm: + metadata_base.PrimitiveMetadata()._validate_contact_information(bad_metadata.query()) + + self.assertEqual(len(cm.records), 1) + self.assertEqual(cm.records[0].msg, "%(python_path)s: A bug reporting URI should be specified in primitive metadata in its \"source.uris\" field.") + + # Ensure a warning message is NOT generated when non empty uris are specified in the metadata.source. 
+ with self.assertLogs(logger=logger, level=logging.DEBUG) as cm: + logger.debug("Dummy log") + metadata_base.PrimitiveMetadata()._validate_contact_information(good_metadata.query()) + + self.assertEqual(len(cm.records), 1) + + def test_validation_will_warn_on_missing_source(self): + logger = logging.getLogger(metadata_base.__name__) + + bad_metadata = metadata_base.PrimitiveMetadata({ + 'id': 'id', + 'version': '0.1.0', + 'name': "Test Primitive", + 'python_path': 'path', + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.PRINCIPAL_COMPONENT_ANALYSIS, + ], + 'primitive_family': metadata_base.PrimitiveFamily.FEATURE_SELECTION, + 'installation': [{ + 'type': metadata_base.PrimitiveInstallationType.PIP, + 'package': 'foobar', + 'version': '0.1.0', + }], + }) + + good_metadata = metadata_base.PrimitiveMetadata({ + 'id': 'id', + 'version': '0.1.0', + 'name': "Test Primitive", + 'python_path': 'path', + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.PRINCIPAL_COMPONENT_ANALYSIS, + ], + 'primitive_family': metadata_base.PrimitiveFamily.FEATURE_SELECTION, + 'installation': [{ + 'type': metadata_base.PrimitiveInstallationType.PIP, + 'package': 'foobar', + 'version': '0.1.0', + }], + 'source': { + 'name': 'Test author', + 'contact': 'mailto:test@example.com', + 'uris': ['http://someplace'], + } + }) + + # Ensure a warning message is generated for a primitive with no source + with self.assertLogs(logger=logger, level=logging.DEBUG) as cm: + metadata_base.PrimitiveMetadata()._validate_contact_information(bad_metadata.query()) + + self.assertEqual(len(cm.records), 1) + self.assertEqual(cm.records[0].msg, "%(python_path)s: No \"source\" field in the primitive metadata. Metadata should contain contact information and bug reporting URI.") + + # Ensure a warning message is NOT generated when source is present + with self.assertLogs(logger=logger, level=logging.DEBUG) as cm: + logger.debug("Dummy log") + metadata_base.PrimitiveMetadata()._validate_contact_information(good_metadata.query()) + + self.assertEqual(len(cm.records), 1) + + def test_validation_will_warn_on_missing_description(self): + logger = logging.getLogger(metadata_base.__name__) + + bad_metadata = metadata_base.PrimitiveMetadata({ + 'id': 'id', + 'version': '0.1.0', + 'name': "Test Primitive", + 'python_path': 'path', + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.PRINCIPAL_COMPONENT_ANALYSIS, + ], + 'primitive_family': metadata_base.PrimitiveFamily.FEATURE_SELECTION, + 'installation': [{ + 'type': metadata_base.PrimitiveInstallationType.PIP, + 'package': 'foobar', + 'version': '0.1.0', + }], + }) + + good_metadata = metadata_base.PrimitiveMetadata({ + 'id': 'id', + 'version': '0.1.0', + 'name': "Test Primitive", + 'python_path': 'path', + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.PRINCIPAL_COMPONENT_ANALYSIS, + ], + 'primitive_family': metadata_base.PrimitiveFamily.FEATURE_SELECTION, + 'installation': [{ + 'type': metadata_base.PrimitiveInstallationType.PIP, + 'package': 'foobar', + 'version': '0.1.0', + }], + 'description': 'primitive description' + }) + + # Ensure a warning message is generated for a primitive with no description + with self.assertLogs(logger=logger, level=logging.DEBUG) as cm: + metadata_base.PrimitiveMetadata()._validate_description(bad_metadata.query()) + + self.assertEqual(len(cm.records), 1) + self.assertEqual(cm.records[0].msg, "%(python_path)s: Primitive is not providing a description through its docstring.") + + # Ensure a warning message is NOT generated 
when description is present + with self.assertLogs(logger=logger, level=logging.DEBUG) as cm: + logger.debug("Dummy log") + metadata_base.PrimitiveMetadata()._validate_description(good_metadata.query()) + + self.assertEqual(len(cm.records), 1) + + def test_validation_will_warn_on_empty_description(self): + logger = logging.getLogger(metadata_base.__name__) + + bad_metadata = metadata_base.PrimitiveMetadata({ + 'id': 'id', + 'version': '0.1.0', + 'name': "Test Primitive", + 'python_path': 'path', + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.PRINCIPAL_COMPONENT_ANALYSIS, + ], + 'primitive_family': metadata_base.PrimitiveFamily.FEATURE_SELECTION, + 'installation': [{ + 'type': metadata_base.PrimitiveInstallationType.PIP, + 'package': 'foobar', + 'version': '0.1.0', + }], + 'description': '' + }) + + good_metadata = metadata_base.PrimitiveMetadata({ + 'id': 'id', + 'version': '0.1.0', + 'name': "Test Primitive", + 'python_path': 'path', + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.PRINCIPAL_COMPONENT_ANALYSIS, + ], + 'primitive_family': metadata_base.PrimitiveFamily.FEATURE_SELECTION, + 'installation': [{ + 'type': metadata_base.PrimitiveInstallationType.PIP, + 'package': 'foobar', + 'version': '0.1.0', + }], + 'description': 'primitive description' + }) + + # Ensure a warning message is generated for a primitive with no description + with self.assertLogs(logger=logger, level=logging.DEBUG) as cm: + metadata_base.PrimitiveMetadata()._validate_description(bad_metadata.query()) + + self.assertEqual(len(cm.records), 1) + self.assertEqual(cm.records[0].msg, "%(python_path)s: Primitive is not providing a description through its docstring.") + + # Ensure a warning message is NOT generated when description is present + with self.assertLogs(logger=logger, level=logging.DEBUG) as cm: + logger.debug("Dummy log") + metadata_base.PrimitiveMetadata()._validate_description(good_metadata.query()) + + self.assertEqual(len(cm.records), 1) + + def test_validation_will_warn_on_inherited_description(self): + logger = logging.getLogger(metadata_base.__name__) + + bad_metadata = metadata_base.PrimitiveMetadata({ + 'id': 'id', + 'version': '0.1.0', + 'name': "Test Primitive", + 'python_path': 'path', + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.PRINCIPAL_COMPONENT_ANALYSIS, + ], + 'primitive_family': metadata_base.PrimitiveFamily.FEATURE_SELECTION, + 'installation': [{ + 'type': metadata_base.PrimitiveInstallationType.PIP, + 'package': 'foobar', + 'version': '0.1.0', + }], + 'description': 'A base class for primitives description' + }) + + good_metadata = metadata_base.PrimitiveMetadata({ + 'id': 'id', + 'version': '0.1.0', + 'name': "Test Primitive", + 'python_path': 'path', + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.PRINCIPAL_COMPONENT_ANALYSIS, + ], + 'primitive_family': metadata_base.PrimitiveFamily.FEATURE_SELECTION, + 'installation': [{ + 'type': metadata_base.PrimitiveInstallationType.PIP, + 'package': 'foobar', + 'version': '0.1.0', + }], + 'description': 'primitive description' + }) + + # Ensure a warning message is generated for a primitive with no description + with self.assertLogs(logger=logger, level=logging.DEBUG) as cm: + metadata_base.PrimitiveMetadata()._validate_description(bad_metadata.query()) + + self.assertEqual(len(cm.records), 1) + self.assertEqual(cm.records[0].msg, "%(python_path)s: Primitive is not providing a description through its docstring.") + + # Ensure a warning message is NOT generated when description is present 
+ with self.assertLogs(logger=logger, level=logging.DEBUG) as cm: + logger.debug("Dummy log") + metadata_base.PrimitiveMetadata()._validate_description(good_metadata.query()) + + self.assertEqual(len(cm.records), 1) + + def test_neural_network_mixin(self): + class MyNeuralNetworkModuleBase: + pass + + class Params(params.Params): + pass + + class MyNeuralNetworkModule(MyNeuralNetworkModuleBase): + pass + + # Silence any validation warnings. + with utils.silence(): + class TestPrimitive( + base.NeuralNetworkModuleMixin[Inputs, Outputs, Params, Hyperparams, MyNeuralNetworkModuleBase], + unsupervised_learning.UnsupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams], + ): + metadata = metadata_base.PrimitiveMetadata({ + 'id': '4164deb6-2418-4c96-9959-3d475dcf9584', + 'version': '0.1.0', + 'name': "Test neural network module", + 'python_path': 'd3m.primitives.layer.super.TestPrimitive', + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.CONVOLUTIONAL_NEURAL_NETWORK_LAYER, + ], + 'primitive_family': metadata_base.PrimitiveFamily.LAYER, + }) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + raise exceptions.NotSupportedError + + def set_training_data(self, *, inputs: Inputs) -> None: + raise exceptions.NotSupportedError + + def fit(self, *, timeout: float = None, iterations: int = None) -> base.CallResult[None]: + raise exceptions.NotSupportedError + + def get_params(self) -> Params: + return Params() + + def set_params(self, *, params: Params) -> None: + pass + + def get_module(self, *, input_module: MyNeuralNetworkModuleBase) -> MyNeuralNetworkModuleBase: + return MyNeuralNetworkModule() + + +if __name__ == '__main__': + unittest.main() diff --git a/d3m/tests/test_problem.py b/d3m/tests/test_problem.py new file mode 100644 index 0000000..bc90b2b --- /dev/null +++ b/d3m/tests/test_problem.py @@ -0,0 +1,232 @@ +import os.path +import pickle +import unittest + +from d3m import utils +from d3m.metadata import problem, pipeline_run + + +class TestProblem(unittest.TestCase): + def test_basic(self): + self.maxDiff = None + + problem_doc_path = os.path.join(os.path.dirname(__file__), 'data', 'problems', 'iris_problem_1', 'problemDoc.json') + + problem_uri = 'file://{problem_doc_path}'.format(problem_doc_path=problem_doc_path) + + problem_description = problem.Problem.load(problem_uri) + + self.assertEqual(problem_description.to_simple_structure(), { + 'id': 'iris_problem_1', + 'digest': '1a12135422967aa0de0c4629f4f58d08d39e97f9133f7b50da71420781aa18a5', + 'version': '4.0.0', + 'location_uris': [ + problem_uri, + ], + 'name': 'Distinguish Iris flowers', + 'description': 'Distinguish Iris flowers of three related species.', + 'schema': problem.PROBLEM_SCHEMA_VERSION, + 'problem': { + 'task_keywords': [problem.TaskKeyword.CLASSIFICATION, problem.TaskKeyword.MULTICLASS], + 'performance_metrics': [ + { + 'metric': problem.PerformanceMetric.ACCURACY, + } + ] + }, + 'inputs': [ + { + 'dataset_id': 'iris_dataset_1', + 'targets': [ + { + 'target_index': 0, + 'resource_id': 'learningData', + 'column_index': 5, + 'column_name': 'species', + } + ] + } + ], + }) + + self.assertEqual(problem_description.to_json_structure(), { + 'id': 'iris_problem_1', + 'digest': '1a12135422967aa0de0c4629f4f58d08d39e97f9133f7b50da71420781aa18a5', + 'version': '4.0.0', + 'location_uris': [ + problem_uri, + ], + 'name': 'Distinguish Iris flowers', + 'description': 'Distinguish Iris flowers of three related species.', + 'schema': 
problem.PROBLEM_SCHEMA_VERSION, + 'problem': { + 'task_keywords': [problem.TaskKeyword.CLASSIFICATION, problem.TaskKeyword.MULTICLASS], + 'performance_metrics': [ + { + 'metric': problem.PerformanceMetric.ACCURACY, + } + ] + }, + 'inputs': [ + { + 'dataset_id': 'iris_dataset_1', + 'targets': [ + { + 'target_index': 0, + 'resource_id': 'learningData', + 'column_index': 5, + 'column_name': 'species', + } + ] + } + ], + }) + + self.assertEqual(problem_description.to_json_structure(), { + 'id': 'iris_problem_1', + 'digest': '1a12135422967aa0de0c4629f4f58d08d39e97f9133f7b50da71420781aa18a5', + 'version': '4.0.0', + 'location_uris': [ + problem_uri, + ], + 'name': 'Distinguish Iris flowers', + 'description': 'Distinguish Iris flowers of three related species.', + 'schema': problem.PROBLEM_SCHEMA_VERSION, + 'problem': { + 'task_keywords': ['CLASSIFICATION', 'MULTICLASS'], + 'performance_metrics': [ + { + 'metric': 'ACCURACY', + } + ] + }, + 'inputs': [ + { + 'dataset_id': 'iris_dataset_1', + 'targets': [ + { + 'target_index': 0, + 'resource_id': 'learningData', + 'column_index': 5, + 'column_name': 'species', + } + ] + } + ], + }) + + pipeline_run.validate_problem(problem_description.to_json_structure(canonical=True)) + problem.PROBLEM_SCHEMA_VALIDATOR.validate(problem_description.to_json_structure(canonical=True)) + + def test_conversion(self): + problem_doc_path = os.path.join(os.path.dirname(__file__), 'data', 'problems', 'iris_problem_1', 'problemDoc.json') + + problem_uri = 'file://{problem_doc_path}'.format(problem_doc_path=problem_doc_path) + + problem_description = problem.Problem.load(problem_uri) + + self.assertEqual(problem_description.to_simple_structure(), problem.Problem.from_json_structure(problem_description.to_json_structure(), strict_digest=True).to_simple_structure()) + + # Legacy. 
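+ # (i.e., the older serialization path: the simple structure converted with utils.to_json_structure should still round-trip through from_json_structure.)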
+ self.assertEqual(utils.to_json_structure(problem_description.to_simple_structure()), problem.Problem.from_json_structure(utils.to_json_structure(problem_description.to_simple_structure()), strict_digest=True).to_simple_structure()) + + self.assertIs(problem.Problem.from_json_structure(problem_description.to_json_structure(), strict_digest=True)['problem']['task_keywords'][0], problem.TaskKeyword.CLASSIFICATION) + + def test_unparse(self): + self.assertEqual(problem.TaskKeyword.CLASSIFICATION.unparse(), 'classification') + self.assertEqual(problem.TaskKeyword.MULTICLASS.unparse(), 'multiClass') + self.assertEqual(problem.PerformanceMetric.ACCURACY.unparse(), 'accuracy') + + def test_normalize(self): + self.assertEqual(problem.PerformanceMetric._normalize(0, 1, 0.5), 0.5) + self.assertEqual(problem.PerformanceMetric._normalize(0, 2, 0.5), 0.25) + self.assertEqual(problem.PerformanceMetric._normalize(1, 2, 1.5), 0.5) + + self.assertEqual(problem.PerformanceMetric._normalize(-1, 0, -0.5), 0.5) + self.assertEqual(problem.PerformanceMetric._normalize(-2, 0, -1.5), 0.25) + self.assertEqual(problem.PerformanceMetric._normalize(-2, -1, -1.5), 0.5) + + self.assertEqual(problem.PerformanceMetric._normalize(1, 0, 0.5), 0.5) + self.assertEqual(problem.PerformanceMetric._normalize(2, 0, 0.5), 0.75) + self.assertEqual(problem.PerformanceMetric._normalize(2, 1, 1.5), 0.5) + + self.assertEqual(problem.PerformanceMetric._normalize(0, -1, -0.5), 0.5) + self.assertEqual(problem.PerformanceMetric._normalize(0, -2, -1.5), 0.75) + self.assertEqual(problem.PerformanceMetric._normalize(-1, -2, -1.5), 0.5) + + self.assertAlmostEqual(problem.PerformanceMetric._normalize(float('inf'), 0, 0.0), 1.0) + self.assertAlmostEqual(problem.PerformanceMetric._normalize(float('inf'), 0, 0.5), 0.9997500000052083) + self.assertAlmostEqual(problem.PerformanceMetric._normalize(float('inf'), 0, 1000.0), 0.5378828427399902) + self.assertAlmostEqual(problem.PerformanceMetric._normalize(float('inf'), 0, 5000.0), 0.013385701848569713) + + self.assertAlmostEqual(problem.PerformanceMetric._normalize(float('inf'), 1, 1.0), 1.0) + self.assertAlmostEqual(problem.PerformanceMetric._normalize(float('inf'), 1, 1.5), 0.9997500000052083) + self.assertAlmostEqual(problem.PerformanceMetric._normalize(float('inf'), 1, 1000.0), 0.5382761574524354) + self.assertAlmostEqual(problem.PerformanceMetric._normalize(float('inf'), 1, 5000.0), 0.013399004523107192) + + self.assertAlmostEqual(problem.PerformanceMetric._normalize(float('inf'), -1, -1.0), 1.0) + self.assertAlmostEqual(problem.PerformanceMetric._normalize(float('inf'), -1, -0.5), 0.9997500000052083) + self.assertAlmostEqual(problem.PerformanceMetric._normalize(float('inf'), -1, 1000.0), 0.5374897097430198) + self.assertAlmostEqual(problem.PerformanceMetric._normalize(float('inf'), -1, 5000.0), 0.01337241229216877) + self.assertAlmostEqual(problem.PerformanceMetric._normalize(float('inf'), -1, 0.0), 0.9995000000416667) + + self.assertAlmostEqual(problem.PerformanceMetric._normalize(float('-inf'), 0, 0.0), 1.0) + self.assertAlmostEqual(problem.PerformanceMetric._normalize(float('-inf'), 0, -0.5), 0.9997500000052083) + self.assertAlmostEqual(problem.PerformanceMetric._normalize(float('-inf'), 0, -1000.0), 0.5378828427399902) + self.assertAlmostEqual(problem.PerformanceMetric._normalize(float('-inf'), 0, -5000.0), 0.013385701848569713) + + self.assertAlmostEqual(problem.PerformanceMetric._normalize(float('-inf'), 1, 1.0), 1.0) + 
self.assertAlmostEqual(problem.PerformanceMetric._normalize(float('-inf'), 1, 0.5), 0.9997500000052083) + self.assertAlmostEqual(problem.PerformanceMetric._normalize(float('-inf'), 1, -1000.0), 0.5374897097430198) + self.assertAlmostEqual(problem.PerformanceMetric._normalize(float('-inf'), 1, -5000.0), 0.01337241229216877) + self.assertAlmostEqual(problem.PerformanceMetric._normalize(float('-inf'), 1, 0.0), 0.9995000000416667) + + self.assertAlmostEqual(problem.PerformanceMetric._normalize(float('-inf'), -1, -1.0), 1.0) + self.assertAlmostEqual(problem.PerformanceMetric._normalize(float('-inf'), -1, -1.5), 0.9997500000052083) + self.assertAlmostEqual(problem.PerformanceMetric._normalize(float('-inf'), -1, -1000.0), 0.5382761574524354) + self.assertAlmostEqual(problem.PerformanceMetric._normalize(float('-inf'), -1, -5000.0), 0.013399004523107192) + + self.assertAlmostEqual(problem.PerformanceMetric._normalize(0, float('inf'), 0.0), 1 - 1.0) + self.assertAlmostEqual(problem.PerformanceMetric._normalize(0, float('inf'), 0.5), 1 - 0.9997500000052083) + self.assertAlmostEqual(problem.PerformanceMetric._normalize(0, float('inf'), 1000.0), 1 - 0.5378828427399902) + self.assertAlmostEqual(problem.PerformanceMetric._normalize(0, float('inf'), 5000.0), 1 - 0.013385701848569713) + + self.assertAlmostEqual(problem.PerformanceMetric._normalize(1, float('inf'), 1.0), 1 - 1.0) + self.assertAlmostEqual(problem.PerformanceMetric._normalize(1, float('inf'), 1.5), 1 - 0.9997500000052083) + self.assertAlmostEqual(problem.PerformanceMetric._normalize(1, float('inf'), 1000.0), 1 - 0.5382761574524354) + self.assertAlmostEqual(problem.PerformanceMetric._normalize(1, float('inf'), 5000.0), 1 - 0.013399004523107192) + + self.assertAlmostEqual(problem.PerformanceMetric._normalize(-1, float('inf'), -1.0), 1 - 1.0) + self.assertAlmostEqual(problem.PerformanceMetric._normalize(-1, float('inf'), -0.5), 1 - 0.9997500000052083) + self.assertAlmostEqual(problem.PerformanceMetric._normalize(-1, float('inf'), 1000.0), 1 - 0.5374897097430198) + self.assertAlmostEqual(problem.PerformanceMetric._normalize(-1, float('inf'), 5000.0), 1 - 0.01337241229216877) + self.assertAlmostEqual(problem.PerformanceMetric._normalize(-1, float('inf'), 0.0), 1 - 0.9995000000416667) + + self.assertAlmostEqual(problem.PerformanceMetric._normalize(0, float('-inf'), 0.0), 1 - 1.0) + self.assertAlmostEqual(problem.PerformanceMetric._normalize(0, float('-inf'), -0.5), 1 - 0.9997500000052083) + self.assertAlmostEqual(problem.PerformanceMetric._normalize(0, float('-inf'), -1000.0), 1 - 0.5378828427399902) + self.assertAlmostEqual(problem.PerformanceMetric._normalize(0, float('-inf'), -5000.0), 1 - 0.013385701848569713) + + self.assertAlmostEqual(problem.PerformanceMetric._normalize(1, float('-inf'), 1.0), 1 - 1.0) + self.assertAlmostEqual(problem.PerformanceMetric._normalize(1, float('-inf'), 0.5), 1 - 0.9997500000052083) + self.assertAlmostEqual(problem.PerformanceMetric._normalize(1, float('-inf'), -1000.0), 1 - 0.5374897097430198) + self.assertAlmostEqual(problem.PerformanceMetric._normalize(1, float('-inf'), -5000.0), 1 - 0.01337241229216877) + self.assertAlmostEqual(problem.PerformanceMetric._normalize(1, float('-inf'), 0.0), 1 - 0.9995000000416667) + + self.assertAlmostEqual(problem.PerformanceMetric._normalize(-1, float('-inf'), -1.0), 1 - 1.0) + self.assertAlmostEqual(problem.PerformanceMetric._normalize(-1, float('-inf'), -1.5), 1 - 0.9997500000052083) + self.assertAlmostEqual(problem.PerformanceMetric._normalize(-1, float('-inf'), -1000.0), 1 
- 0.5382761574524354) + self.assertAlmostEqual(problem.PerformanceMetric._normalize(-1, float('-inf'), -5000.0), 1 - 0.013399004523107192) + + def test_pickle(self): + value = problem.PerformanceMetric.ACCURACY + + pickled = pickle.dumps(value) + unpickled = pickle.loads(pickled) + + self.assertEqual(value, unpickled) + self.assertIs(value.get_class(), unpickled.get_class()) + + +if __name__ == '__main__': + unittest.main() diff --git a/d3m/tests/test_random.py b/d3m/tests/test_random.py new file mode 100644 index 0000000..09ac3c4 --- /dev/null +++ b/d3m/tests/test_random.py @@ -0,0 +1,269 @@ +import json +import unittest +import os.path +import pickle +import sys + +import numpy + +import d3m +from d3m import container, utils +from d3m.metadata import base + +TEST_PRIMITIVES_DIR = os.path.join(os.path.dirname(__file__), 'data', 'primitives') + +sys.path.insert(0, TEST_PRIMITIVES_DIR) + +from test_primitives.random import RandomPrimitive + + +EXPECTED_PRIMITIVE_DESCRIPTION_JSON = r""" +{ + "id": "df3153a1-4411-47e2-bbc0-9d5e9925ad79", + "version": "0.1.0", + "name": "Random Samples", + "keywords": [ + "test primitive" + ], + "source": { + "name": "Test team", + "contact": "mailto:author@example.com", + "uris": [ + "https://gitlab.com/datadrivendiscovery/tests-data/blob/master/primitives/test_primitives/random.py", + "https://gitlab.com/datadrivendiscovery/tests-data.git" + ] + }, + "installation": [ + { + "type": "PIP", + "package_uri": "git+https://gitlab.com/datadrivendiscovery/tests-data.git@__GIT_COMMIT__#egg=test_primitives&subdirectory=primitives" + } + ], + "location_uris": [ + "https://gitlab.com/datadrivendiscovery/tests-data/raw/__GIT_COMMIT__/primitives/test_primitives/random.py" + ], + "python_path": "d3m.primitives.data_generation.random.Test", + "algorithm_types": [ + "MERSENNE_TWISTER", + "NORMAL_DISTRIBUTION" + ], + "primitive_family": "DATA_GENERATION", + "schema": "https://metadata.datadrivendiscovery.org/schemas/v0/primitive.json", + "original_python_path": "test_primitives.random.RandomPrimitive", + "primitive_code": { + "class_type_arguments": { + "Outputs": "d3m.container.pandas.DataFrame", + "Params": "NoneType", + "Hyperparams": "test_primitives.random.Hyperparams", + "Inputs": "d3m.container.list.List" + }, + "interfaces_version": "__INTERFACES_VERSION__", + "interfaces": [ + "generator.GeneratorPrimitiveBase", + "base.PrimitiveBase" + ], + "hyperparams": { + "mu": { + "type": "d3m.metadata.hyperparams.Hyperparameter", + "default": 0.0, + "structural_type": "float", + "semantic_types": [ + "https://metadata.datadrivendiscovery.org/types/ControlParameter", + "https://metadata.datadrivendiscovery.org/types/TuningParameter" + ] + }, + "sigma": { + "type": "d3m.metadata.hyperparams.Hyperparameter", + "default": 1.0, + "structural_type": "float", + "semantic_types": [ + "https://metadata.datadrivendiscovery.org/types/ControlParameter", + "https://metadata.datadrivendiscovery.org/types/TuningParameter" + ] + } + }, + "arguments": { + "hyperparams": { + "type": "test_primitives.random.Hyperparams", + "kind": "RUNTIME" + }, + "random_seed": { + "type": "int", + "kind": "RUNTIME", + "default": 0 + }, + "timeout": { + "type": "typing.Union[NoneType, float]", + "kind": "RUNTIME", + "default": null + }, + "iterations": { + "type": "typing.Union[NoneType, int]", + "kind": "RUNTIME", + "default": null + }, + "produce_methods": { + "type": "typing.Sequence[str]", + "kind": "RUNTIME" + }, + "inputs": { + "type": "d3m.container.list.List", + "kind": "PIPELINE" + }, + "params": { + 
"type": "NoneType", + "kind": "RUNTIME" + } + }, + "class_methods": {}, + "instance_methods": { + "__init__": { + "kind": "OTHER", + "arguments": [ + "hyperparams", + "random_seed" + ], + "returns": "NoneType" + }, + "fit": { + "kind": "OTHER", + "arguments": [ + "timeout", + "iterations" + ], + "returns": "d3m.primitive_interfaces.base.CallResult[NoneType]", + "description": "A noop.\n\nParameters\n----------\ntimeout:\n A maximum time this primitive should be fitting during this method call, in seconds.\niterations:\n How many of internal iterations should the primitive do.\n\nReturns\n-------\nA ``CallResult`` with ``None`` value." + }, + "fit_multi_produce": { + "kind": "OTHER", + "arguments": [ + "produce_methods", + "inputs", + "timeout", + "iterations" + ], + "returns": "d3m.primitive_interfaces.base.MultiCallResult", + "description": "A method calling ``fit`` and after that multiple produce methods at once.\n\nParameters\n----------\nproduce_methods : Sequence[str]\n A list of names of produce methods to call.\ninputs : List\n The inputs given to all produce methods.\ntimeout : float\n A maximum time this primitive should take to both fit the primitive and produce outputs\n for all produce methods listed in ``produce_methods`` argument, in seconds.\niterations : int\n How many of internal iterations should the primitive do for both fitting and producing\n outputs of all produce methods.\n\nReturns\n-------\nMultiCallResult\n A dict of values for each produce method wrapped inside ``MultiCallResult``." + }, + "get_params": { + "kind": "OTHER", + "arguments": [], + "returns": "NoneType", + "description": "A noop.\n\nReturns\n-------\nAn instance of parameters." + }, + "multi_produce": { + "kind": "OTHER", + "arguments": [ + "produce_methods", + "inputs", + "timeout", + "iterations" + ], + "returns": "d3m.primitive_interfaces.base.MultiCallResult", + "description": "A method calling multiple produce methods at once.\n\nWhen a primitive has multiple produce methods it is common that they might compute the\nsame internal results for same inputs but return different representations of those results.\nIf caller is interested in multiple of those representations, calling multiple produce\nmethods might lead to recomputing same internal results multiple times. To address this,\nthis method allows primitive author to implement an optimized version which computes\ninternal results only once for multiple calls of produce methods, but return those different\nrepresentations.\n\nIf any additional method arguments are added to primitive's produce method(s), they have\nto be added to this method as well. 
This method should accept an union of all arguments\naccepted by primitive's produce method(s) and then use them accordingly when computing\nresults.\n\nThe default implementation of this method just calls all produce methods listed in\n``produce_methods`` in order and is potentially inefficient.\n\nIf primitive should have been fitted before calling this method, but it has not been,\nprimitive should raise a ``PrimitiveNotFittedError`` exception.\n\nParameters\n----------\nproduce_methods:\n A list of names of produce methods to call.\ninputs:\n The inputs given to all produce methods.\ntimeout:\n A maximum time this primitive should take to produce outputs for all produce methods\n listed in ``produce_methods`` argument, in seconds.\niterations:\n How many of internal iterations should the primitive do.\n\nReturns\n-------\nA dict of values for each produce method wrapped inside ``MultiCallResult``." + }, + "produce": { + "kind": "PRODUCE", + "arguments": [ + "inputs", + "timeout", + "iterations" + ], + "returns": "d3m.primitive_interfaces.base.CallResult[d3m.container.pandas.DataFrame]", + "singleton": false, + "inputs_across_samples": [], + "description": "Produce primitive's best choice of the output for each of the inputs.\n\nThe output value should be wrapped inside ``CallResult`` object before returning.\n\nIn many cases producing an output is a quick operation in comparison with ``fit``, but not\nall cases are like that. For example, a primitive can start a potentially long optimization\nprocess to compute outputs. ``timeout`` and ``iterations`` can serve as a way for a caller\nto guide the length of this process.\n\nIdeally, a primitive should adapt its call to try to produce the best outputs possible\ninside the time allocated. If this is not possible and the primitive reaches the timeout\nbefore producing outputs, it should raise a ``TimeoutError`` exception to signal that the\ncall was unsuccessful in the given time. The state of the primitive after the exception\nshould be as the method call has never happened and primitive should continue to operate\nnormally. The purpose of ``timeout`` is to give opportunity to a primitive to cleanly\nmanage its state instead of interrupting execution from outside. Maintaining stable internal\nstate should have precedence over respecting the ``timeout`` (caller can terminate the\nmisbehaving primitive from outside anyway). If a longer ``timeout`` would produce\ndifferent outputs, then ``CallResult``'s ``has_finished`` should be set to ``False``.\n\nSome primitives have internal iterations (for example, optimization iterations).\nFor those, caller can provide how many of primitive's internal iterations\nshould a primitive do before returning outputs. Primitives should make iterations as\nsmall as reasonable. If ``iterations`` is ``None``, then there is no limit on\nhow many iterations the primitive should do and primitive should choose the best amount\nof iterations on its own (potentially controlled through hyper-parameters).\nIf ``iterations`` is a number, a primitive has to do those number of iterations,\nif possible. ``timeout`` should still be respected and potentially less iterations\ncan be done because of that. 
Primitives with internal iterations should make\n``CallResult`` contain correct values.\n\nFor primitives which do not have internal iterations, any value of ``iterations``\nmeans that they should run fully, respecting only ``timeout``.\n\nIf primitive should have been fitted before calling this method, but it has not been,\nprimitive should raise a ``PrimitiveNotFittedError`` exception.\n\nParameters\n----------\ninputs:\n The inputs of shape [num_inputs, ...].\ntimeout:\n A maximum time this primitive should take to produce outputs during this method call, in seconds.\niterations:\n How many of internal iterations should the primitive do.\n\nReturns\n-------\nThe outputs of shape [num_inputs, ...] wrapped inside ``CallResult``." + }, + "set_params": { + "kind": "OTHER", + "arguments": [ + "params" + ], + "returns": "NoneType", + "description": "A noop.\n\nParameters\n----------\nparams:\n An instance of parameters." + }, + "set_training_data": { + "kind": "OTHER", + "arguments": [], + "returns": "NoneType", + "description": "A noop.\n\nParameters\n----------\noutputs:\n The outputs." + } + }, + "class_attributes": { + "logger": "logging.Logger", + "metadata": "d3m.metadata.base.PrimitiveMetadata" + }, + "instance_attributes": { + "hyperparams": "d3m.metadata.hyperparams.Hyperparams", + "random_seed": "int", + "docker_containers": "typing.Dict[str, d3m.primitive_interfaces.base.DockerContainer]", + "volumes": "typing.Dict[str, str]", + "temporary_directory": "typing.Union[NoneType, str]" + } + }, + "structural_type": "test_primitives.random.RandomPrimitive", + "description": "A primitive which draws random samples from a normal distribution.\n\nAttributes\n----------\nmetadata:\n Primitive's metadata. Available as a class attribute.\nlogger:\n Primitive's logger. Available as a class attribute.\nhyperparams:\n Hyperparams passed to the constructor.\nrandom_seed:\n Random seed passed to the constructor.\ndocker_containers:\n A dict mapping Docker image keys from primitive's metadata to (named) tuples containing\n container's address under which the container is accessible by the primitive, and a\n dict mapping exposed ports to ports on that address.\nvolumes:\n A dict mapping volume keys from primitive's metadata to file and directory paths\n where downloaded and extracted files are available to the primitive.\ntemporary_directory:\n An absolute path to a temporary directory a primitive can use to store any files\n for the duration of the current pipeline run phase. 
Directory is automatically\n cleaned up after the current pipeline run phase finishes.", + "digest": "__DIGEST__" +} +""".replace('__INTERFACES_VERSION__', d3m.__version__).replace('__GIT_COMMIT__', utils.current_git_commit(TEST_PRIMITIVES_DIR)).replace('__DIGEST__', RandomPrimitive.metadata.query()['digest']) + + +class TestRandomPrimitive(unittest.TestCase): + def call_primitive(self, primitive, method_name, **kwargs): + return getattr(primitive, method_name)(**kwargs) + + def test_basic(self): + hyperparams_class = RandomPrimitive.metadata.get_hyperparams() + + primitive = RandomPrimitive(random_seed=42, hyperparams=hyperparams_class.defaults()) + + inputs = container.List(list(range(4)), generate_metadata=True) + + call_metadata = self.call_primitive(primitive, 'produce', inputs=inputs) + + self.assertTrue(numpy.allclose(call_metadata.value.values, container.ndarray([0.496714153011, -0.138264301171, 0.647688538101, 1.52302985641]).reshape((4, 1)))) + self.assertEqual(call_metadata.has_finished, True) + self.assertEqual(call_metadata.iterations_done, None) + + self.assertEqual(call_metadata.value.metadata.query((base.ALL_ELEMENTS, 0))['structural_type'], numpy.float64) + + def test_pickle(self): + # This test is not really useful anymore because primitive now does not keep random state + # anymore but outputs depend only on inputs, and not on previous calls to "produce" method. + + hyperparams_class = RandomPrimitive.metadata.get_hyperparams() + + primitive = RandomPrimitive(random_seed=42, hyperparams=hyperparams_class.defaults()) + + inputs = container.List(list(range(4)), generate_metadata=True) + + call_metadata = self.call_primitive(primitive, 'produce', inputs=inputs) + + self.assertTrue(numpy.allclose(call_metadata.value.values, container.ndarray([0.496714153011, -0.138264301171, 0.647688538101, 1.52302985641]).reshape(4, 1))) + + pickled_primitive = pickle.dumps(primitive) + + inputs = container.List(list(range(4, 8)), generate_metadata=True) + + call_metadata = self.call_primitive(primitive, 'produce', inputs=inputs) + + self.assertTrue(numpy.allclose(call_metadata.value.values, container.ndarray([-0.23415337, -0.23413696, 1.57921282, 0.76743473]).reshape(4, 1))) + + unpickled_primitive = pickle.loads(pickled_primitive) + + call_metadata = self.call_primitive(unpickled_primitive, 'produce', inputs=inputs) + + self.assertTrue(numpy.allclose(call_metadata.value.values, container.ndarray([-0.23415337, -0.23413696, 1.57921282, 0.76743473]).reshape(4, 1))) + + def test_metadata(self): + expected_description = json.loads(EXPECTED_PRIMITIVE_DESCRIPTION_JSON) + + # We stringify to JSON and parse it to make sure the description can be stringified to JSON. + description = json.loads(json.dumps(RandomPrimitive.metadata.to_json_structure())) + + self.maxDiff = None + self.assertEqual(expected_description, description) + + +if __name__ == '__main__': + unittest.main() diff --git a/d3m/tests/test_runtime.py b/d3m/tests/test_runtime.py new file mode 100644 index 0000000..fc64224 --- /dev/null +++ b/d3m/tests/test_runtime.py @@ -0,0 +1,1534 @@ +import json +import os +import pickle +import shutil +import sys +import tempfile +import typing +import unittest + +import jsonschema +import pandas + +COMMON_PRIMITIVES_DIR = os.path.join(os.path.dirname(__file__), 'common-primitives') +# NOTE: This insertion should appear before any code attempting to resolve or load primitives, +# so the git submodule version of `common_primitives` is looked at first. 
+sys.path.insert(0, COMMON_PRIMITIVES_DIR) + +from common_primitives.dataset_to_dataframe import DatasetToDataFramePrimitive +from common_primitives.redact_columns import RedactColumnsPrimitive +from common_primitives.train_score_split import TrainScoreDatasetSplitPrimitive +from common_primitives.random_forest import RandomForestClassifierPrimitive +from common_primitives.column_parser import ColumnParserPrimitive +from common_primitives.construct_predictions import ConstructPredictionsPrimitive +from common_primitives.no_split import NoSplitDatasetSplitPrimitive +from common_primitives.remove_columns import RemoveColumnsPrimitive +from common_primitives.simple_profiler import SimpleProfilerPrimitive + +TEST_PRIMITIVES_DIR = os.path.join(os.path.dirname(__file__), 'data', 'primitives') +sys.path.insert(0, TEST_PRIMITIVES_DIR) + +from test_primitives.monomial import MonomialPrimitive +from test_primitives.random import RandomPrimitive +from test_primitives.sum import SumPrimitive +from test_primitives.increment import IncrementPrimitive, Hyperparams as IncrementHyperparams +from test_primitives.primitive_sum import PrimitiveSumPrimitive +from test_primitives.null import NullUnsupervisedLearnerPrimitive +from test_primitives.null import NullTransformerPrimitive +from test_primitives.random_classifier import RandomClassifierPrimitive +from test_primitives.fail import FailPrimitive +from test_primitives.data_hyperparam import DataHyperparamPrimitive +from test_primitives.abs_sum import AbsSumPrimitive +from test_primitives.container_hyperparam import ContainerHyperparamPrimitive +from test_primitives.multi_data_hyperparam import MultiDataHyperparamPrimitive +from test_primitives.primitive_hyperparam import PrimitiveHyperparamPrimitive + +from d3m import container, exceptions, index, runtime, utils +from d3m.metadata import base as metadata_base, hyperparams, pipeline as pipeline_module, problem +from d3m.metadata.pipeline_run import PIPELINE_RUN_SCHEMA_VALIDATOR, PipelineRun, RuntimeEnvironment, _validate_pipeline_run_status_consistency, _validate_pipeline_run_random_seeds, _validate_pipeline_run_timestamps +from d3m.primitive_interfaces import base, transformer + + +TEST_PIPELINE_1 = """ +{ + "created": "2018-11-05T04:14:02.720699Z", + "id": "3ffcc6a0-313e-44ae-b551-2ade1386c11e", + "inputs": [ + { + "name": "inputs1" + }, + { + "name": "inputs2" + }, + { + "name": "inputs3" + } + ], + "outputs": [ + { + "data": "steps.1.produce", + "name": "Metafeatures" + } + ], + "schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", + "steps": [ + { + "arguments": { + "inputs": { + "data": [ + "inputs.0", + "inputs.1", + "inputs.2" + ], + "type": "CONTAINER" + } + }, + "outputs": [ + { + "id": "produce" + } + ], + "primitive": { + "id": "8a8a8c15-bb69-488e-834c-f129de2dd2f6", + "name": "Vertical Concatenate Primitive", + "python_path": "d3m.primitives.data_transformation.vertical_concatenate.Test", + "version": "0.1.0" + }, + "type": "PRIMITIVE" + }, + { + "arguments": { + "inputs": { + "data": "steps.0.produce", + "type": "CONTAINER" + } + }, + "outputs": [ + { + "id": "produce" + } + ], + "primitive": { + "id": "aea7fc39-f40b-43ce-b926-89758e560e50", + "name": "Voting Primitive", + "python_path": "d3m.primitives.classification.voting.Test", + "version": "0.1.0" + }, + "type": "PRIMITIVE" + } + ] +} +""" + + +class Resolver(pipeline_module.Resolver): + def _get_primitive(self, primitive_description: typing.Dict) -> typing.Optional[typing.Type[base.PrimitiveBase]]: + # To hide any 
logging or stdout output. + with utils.silence(): + return super()._get_primitive(primitive_description) + + +class Hyperparams(hyperparams.Hyperparams): + pass + + +DataFramesInputs = container.List +DataFrameOutputs = container.DataFrame + + +class VerticalConcatenatePrimitive(transformer.TransformerPrimitiveBase[DataFramesInputs, DataFrameOutputs, Hyperparams]): + """Description.""" + + metadata = metadata_base.PrimitiveMetadata({ + 'id': '8a8a8c15-bb69-488e-834c-f129de2dd2f6', + 'version': '0.1.0', + 'name': "Vertical Concatenate Primitive", + 'python_path': 'd3m.primitives.data_transformation.vertical_concatenate.Test', + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.ARRAY_CONCATENATION, + ], + 'primitive_family': metadata_base.PrimitiveFamily.DATA_TRANSFORMATION + }) + + def produce(self, *, inputs: DataFramesInputs, timeout: float = None, iterations: int = None) -> base.CallResult[DataFrameOutputs]: + for i in range(len(inputs)): + if not inputs.metadata.has_semantic_type((i, metadata_base.ALL_ELEMENTS, 1), 'https://metadata.datadrivendiscovery.org/types/PredictedTarget'): + raise Exception("Required metadata missing.") + + outputs = pandas.concat(inputs, ignore_index=True) + outputs.metadata = outputs.metadata.generate(outputs) + return base.CallResult(outputs) + + +VotingInputs = container.DataFrame +VotingOutputs = container.DataFrame + + +class VotingPrimitive(transformer.TransformerPrimitiveBase[VotingInputs, VotingOutputs, Hyperparams]): + """Description.""" + + metadata = metadata_base.PrimitiveMetadata({ + 'id': 'aea7fc39-f40b-43ce-b926-89758e560e50', + 'version': '0.1.0', + 'name': "Voting Primitive", + 'python_path': 'd3m.primitives.classification.voting.Test', + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.AGGREGATE_FUNCTION, + ], + 'primitive_family': metadata_base.PrimitiveFamily.CLASSIFICATION + }) + + def produce(self, *, inputs: VotingInputs, timeout: float = None, iterations: int = None) -> base.CallResult[VotingOutputs]: + result = inputs.groupby('d3mIndex').apply(lambda x: x['class'].mode()) + result.columns = ['class'] + result = result.reset_index() + return base.CallResult(container.DataFrame(result, generate_metadata=True)) + + +def set_additionProperties_False(schema_json): + if isinstance(schema_json, typing.Dict): + if 'additionalProperties' in schema_json: + schema_json['additionalProperties'] = False + for key, value in schema_json.items(): + set_additionProperties_False(value) + elif isinstance(schema_json, typing.List): + for item in schema_json: + set_additionProperties_False(item) + + +class TestRuntime(unittest.TestCase): + def setUp(self): + self.test_dir = tempfile.mkdtemp() + + def tearDown(self): + shutil.rmtree(self.test_dir) + + @classmethod + def setUpClass(cls): + to_register = { + 'd3m.primitives.regression.monomial.Test': MonomialPrimitive, + 'd3m.primitives.data_generation.random.Test': RandomPrimitive, + 'd3m.primitives.operator.sum.Test': SumPrimitive, + 'd3m.primitives.operator.increment.Test': IncrementPrimitive, + 'd3m.primitives.operator.primitive_sum.Test': PrimitiveSumPrimitive, + 'd3m.primitives.classification.voting.Test': VotingPrimitive, + 'd3m.primitives.data_transformation.vertical_concatenate.Test': VerticalConcatenatePrimitive, + 'd3m.primitives.operator.null.FailTest': FailPrimitive, + 'd3m.primitives.operator.sum.ContainerHyperparamTest': ContainerHyperparamPrimitive, + 'd3m.primitives.operator.sum.DataHyperparamTest': DataHyperparamPrimitive, + 
'd3m.primitives.operator.sum.MultiDataHyperparamTest': MultiDataHyperparamPrimitive, + 'd3m.primitives.operator.sum.PrimitiveHyperparamTest': PrimitiveHyperparamPrimitive, + 'd3m.primitives.operator.sum.AbsTest': AbsSumPrimitive, + 'd3m.primitives.operator.null.UnsupervisedLearnerTest': NullUnsupervisedLearnerPrimitive, + 'd3m.primitives.operator.null.TransformerTest': NullTransformerPrimitive, + 'd3m.primitives.data_transformation.dataset_to_dataframe.Common': DatasetToDataFramePrimitive, + 'd3m.primitives.classification.random_classifier.Test': RandomClassifierPrimitive, + 'd3m.primitives.evaluation.redact_columns.Common': RedactColumnsPrimitive, + 'd3m.primitives.evaluation.train_score_dataset_split.Common': TrainScoreDatasetSplitPrimitive, + 'd3m.primitives.classification.random_forest.Common': RandomForestClassifierPrimitive, + 'd3m.primitives.data_transformation.column_parser.Common': ColumnParserPrimitive, + 'd3m.primitives.data_transformation.construct_predictions.Common': ConstructPredictionsPrimitive, + 'd3m.primitives.evaluation.no_split_dataset_split.Common': NoSplitDatasetSplitPrimitive, + 'd3m.primitives.data_transformation.remove_columns.Common': RemoveColumnsPrimitive, + 'd3m.primitives.schema_discovery.profiler.Common': SimpleProfilerPrimitive + } + + # To hide any logging or stdout output. + with utils.silence(): + for python_path, primitive in to_register.items(): + index.register_primitive(python_path, primitive) + + from common_primitives.dataset_map import DataFrameDatasetMapPrimitive + + # We have to do it here because it depends on other primitives being first registered. + index.register_primitive('d3m.primitives.operator.dataset_map.DataFrameCommon', DataFrameDatasetMapPrimitive) + + # We create runtime environment ourselves so that it is done only once. 
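+ # The worker id and image digests below are dummy placeholder values; presumably only their format matters for pipeline-run validation.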
+ with utils.silence(): + cls.runtime_enviroment = RuntimeEnvironment( + worker_id='test', + base_docker_image={ + 'image_name': 'test', + 'image_digest': 'sha256:' + ('0' * 64), + }, + docker_image={ + 'image_name': 'test', + 'image_digest': 'sha256:' + ('0' * 64), + }, + ) + + def test_basic(self): + with open(os.path.join(os.path.dirname(__file__), 'data', 'pipelines', 'random-sample.yml'), 'r') as pipeline_file: + p = pipeline_module.Pipeline.from_yaml(pipeline_file, resolver=Resolver()) + + r = runtime.Runtime(p, context=metadata_base.Context.TESTING, environment=self.runtime_enviroment) + + inputs = [container.List([0, 1, 42], generate_metadata=True)] + + result = r.fit(inputs, return_values=['outputs.0']) + result.check_success() + + self.assertTrue(result.pipeline_run) + + self.assertEqual(len(result.values), 1) + + dataframe = result.values['outputs.0'] + + self.assertEqual(dataframe.values.tolist(), [ + [1.764052345967664 + 1], + [0.4001572083672233 + 1], + [-1.7062701906250126 + 1], + ]) + + result = r.produce(inputs, return_values=['outputs.0']) + result.check_success() + + self.assertEqual(len(result.values), 1) + self.assertTrue(result.pipeline_run) + + dataframe = result.values['outputs.0'] + + self.assertEqual(dataframe.values.tolist(), [ + [1.764052345967664 + 1], + [0.4001572083672233 + 1], + [-1.7062701906250126 + 1], + ]) + + pickled = pickle.dumps(r) + restored = pickle.loads(pickled) + + result = restored.produce(inputs, return_values=['outputs.0']) + result.check_success() + + self.assertEqual(len(result.values), 1) + self.assertTrue(result.pipeline_run) + + dataframe = result.values['outputs.0'] + + self.assertEqual(dataframe.values.tolist(), [ + [1.764052345967664 + 1], + [0.4001572083672233 + 1], + [-1.7062701906250126 + 1], + ]) + + pickle.dumps(r) + + r = runtime.Runtime(p, random_seed=42, context=metadata_base.Context.TESTING, environment=self.runtime_enviroment) + + inputs = [container.List([0, 1, 42], generate_metadata=True)] + + result = r.fit(inputs, return_values=['outputs.0']) + result.check_success() + + self.assertEqual(len(result.values), 1) + self.assertTrue(result.pipeline_run) + + dataframe = result.values['outputs.0'] + + self.assertEqual(dataframe.values.tolist(), [ + [0.4967141530112327 + 1], + [-0.13826430117118466 + 1], + [-0.11564828238824053 + 1], + ]) + + r = runtime.Runtime(p, [{}, {'amount': 10}], random_seed=42, context=metadata_base.Context.TESTING, environment=self.runtime_enviroment) + + pickle.dumps(r) + + inputs = [container.List([0, 1, 42], generate_metadata=True)] + + result = r.fit(inputs, return_values=['outputs.0']) + result.check_success() + + self.assertEqual(len(result.values), 1) + self.assertTrue(result.pipeline_run) + + dataframe = result.values['outputs.0'] + + self.assertEqual(dataframe.values.tolist(), [ + [0.4967141530112327 + 10], + [-0.13826430117118466 + 10], + [-0.11564828238824053 + 10], + ]) + + pickle.dumps(r) + + def test_argument_list(self): + p = pipeline_module.Pipeline.from_json(TEST_PIPELINE_1, resolver=Resolver()) + + r = runtime.Runtime(p, context=metadata_base.Context.TESTING, environment=self.runtime_enviroment) + + inputs = [ + container.DataFrame({'d3mIndex': [1, 2, 3], 'class': [0, 0, 0]}, generate_metadata=True), + container.DataFrame({'d3mIndex': [1, 2, 3], 'class': [0, 0, 1]}, generate_metadata=True), + container.DataFrame({'d3mIndex': [1, 2, 3], 'class': [0, 1, 1]}, generate_metadata=True), + ] + + for df in inputs: + df.metadata = df.metadata.add_semantic_type((metadata_base.ALL_ELEMENTS, 1), 
'https://metadata.datadrivendiscovery.org/types/PredictedTarget') + + result = r.fit(inputs, return_values=['outputs.0']) + result.check_success() + dataframe = result.values['outputs.0'] + + self.assertEqual(dataframe.values.tolist(), [[1, 0], [2, 0], [3, 1]]) + + pickle.dumps(r) + + def test_pipeline_with_primitives_as_hyperparams_from_pipeline(self): + # We create the pipeline. + pipeline_description = pipeline_module.Pipeline() + pipeline_description.add_input(name='input_0') + pipeline_description.add_input(name='input_1') + + step_0_primitive = index.get_primitive('d3m.primitives.regression.monomial.Test') + step_0_primitive_metadata = step_0_primitive.metadata.query() + step_0_primitive_description = { + 'id': step_0_primitive_metadata['id'], + 'version': step_0_primitive_metadata['version'], + 'python_path': step_0_primitive_metadata['python_path'], + 'name': step_0_primitive_metadata['name'], + 'digest': step_0_primitive_metadata['digest'], + } + + step_0 = pipeline_module.PrimitiveStep(primitive_description=step_0_primitive_description) + step_0.add_argument(name='inputs', argument_type=metadata_base.ArgumentType.CONTAINER, data_reference='inputs.0') + step_0.add_argument(name='outputs', argument_type=metadata_base.ArgumentType.CONTAINER, data_reference='inputs.1') + step_0.add_output('produce') + pipeline_description.add_step(step_0) + + step_1_primitive = index.get_primitive('d3m.primitives.operator.primitive_sum.Test') + step_1_primitive_metadata = step_1_primitive.metadata.query() + step_1_primitive_description = { + 'id': step_1_primitive_metadata['id'], + 'version': step_1_primitive_metadata['version'], + 'python_path': step_1_primitive_metadata['python_path'], + 'name': step_1_primitive_metadata['name'], + 'digest': step_1_primitive_metadata['digest'], + } + + step_1 = pipeline_module.PrimitiveStep(primitive_description=step_1_primitive_description) + step_1.add_argument(name='inputs', argument_type=metadata_base.ArgumentType.CONTAINER, data_reference='inputs.0') + step_1.add_hyperparameter(name='primitive_1', argument_type=metadata_base.ArgumentType.PRIMITIVE, data=0) + step_1.add_hyperparameter(name='primitive_2', argument_type=metadata_base.ArgumentType.PRIMITIVE, data=0) + step_1.add_output('produce') + pipeline_description.add_step(step_1) + + pipeline_description.add_output(name='output', data_reference='steps.1.produce') + + r = runtime.Runtime(pipeline_description, context=metadata_base.Context.TESTING, environment=self.runtime_enviroment) + + inputs = [container.List([1, 2, 3, 4, 5], generate_metadata=True), container.List([2, 4, 6, 8, 100], generate_metadata=True)] + + result = r.fit(inputs, return_values=['outputs.0']) + result.check_success() + + self.assertEqual(len(result.values), 1) + + results = result.values['outputs.0'] + + self.assertEqual(results, [ + 11.2, + 22.4, + 33.599999999999994, + 44.8, + 56.0, + ]) + + result = r.produce(inputs, return_values=['outputs.0']) + result.check_success() + + self.assertEqual(len(result.values), 1) + + results = result.values['outputs.0'] + + self.assertEqual(results, [ + 11.2, + 22.4, + 33.599999999999994, + 44.8, + 56.0, + ]) + + # Random seed should be different from 0 for hyper-parameter primitive instance. + self.assertEqual(result.pipeline_run.previous_pipeline_run.steps[1].hyperparams['primitive_1'].random_seed, 1) + # Primitive should not be the same instance. 
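+ # (The runtime is expected to instantiate a separate primitive for each primitive hyper-parameter, hence the distinct random seed asserted above and the identity check below.)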
+ self.assertIsNot(result.pipeline_run.previous_pipeline_run.steps[1].hyperparams['primitive_1'], result.pipeline_run.previous_pipeline_run.steps[1].hyperparams['primitive_2']) + + pickle._dumps(r) + + def test_pipeline_with_primitives_as_hyperparams_as_class_value(self): + # We create the pipeline. + pipeline_description = pipeline_module.Pipeline() + pipeline_description.add_input(name='input_0') + + null_primitive = index.get_primitive('d3m.primitives.operator.null.TransformerTest') + + step_0_primitive = index.get_primitive('d3m.primitives.operator.primitive_sum.Test') + step_0_primitive_metadata = step_0_primitive.metadata.query() + step_0_primitive_description = { + 'id': step_0_primitive_metadata['id'], + 'version': step_0_primitive_metadata['version'], + 'python_path': step_0_primitive_metadata['python_path'], + 'name': step_0_primitive_metadata['name'], + 'digest': step_0_primitive_metadata['digest'], + } + + step_0 = pipeline_module.PrimitiveStep(primitive_description=step_0_primitive_description) + step_0.add_argument(name='inputs', argument_type=metadata_base.ArgumentType.CONTAINER, data_reference='inputs.0') + step_0.add_hyperparameter(name='primitive_1', argument_type=metadata_base.ArgumentType.VALUE, data=null_primitive) + step_0.add_hyperparameter(name='primitive_2', argument_type=metadata_base.ArgumentType.VALUE, data=null_primitive) + step_0.add_output('produce') + pipeline_description.add_step(step_0) + + pipeline_description.add_output(name='output', data_reference='steps.0.produce') + + r = runtime.Runtime(pipeline_description, context=metadata_base.Context.TESTING, environment=self.runtime_enviroment) + + inputs = [container.List([1, 2, 3, 4, 5], generate_metadata=True)] + + result = r.fit(inputs, return_values=['outputs.0']) + result.check_success() + + self.assertEqual(len(result.values), 1) + + results = result.values['outputs.0'] + + self.assertEqual(results, [ + 2, 4, 6, 8, 10, + ]) + + result = r.produce(inputs, return_values=['outputs.0']) + result.check_success() + + self.assertEqual(len(result.values), 1) + + results = result.values['outputs.0'] + + self.assertEqual(results, [ + 2, 4, 6, 8, 10, + ]) + + # Primitive should not be the same instance. + self.assertIsNot(result.pipeline_run.previous_pipeline_run.steps[0].hyperparams['primitive_1'], result.pipeline_run.previous_pipeline_run.steps[0].hyperparams['primitive_2']) + + pickle.dumps(r) + + def test_pipeline_with_primitives_as_hyperparams_as_instance_value(self): + # We create the pipeline. 
+ pipeline_description = pipeline_module.Pipeline() + pipeline_description.add_input(name='input_0') + + null_primitive = index.get_primitive('d3m.primitives.operator.null.TransformerTest') + + hyperparams_class = null_primitive.metadata.get_hyperparams() + + primitive = null_primitive(hyperparams=hyperparams_class.defaults()) + + step_0_primitive = index.get_primitive('d3m.primitives.operator.primitive_sum.Test') + step_0_primitive_metadata = step_0_primitive.metadata.query() + step_0_primitive_description = { + 'id': step_0_primitive_metadata['id'], + 'version': step_0_primitive_metadata['version'], + 'python_path': step_0_primitive_metadata['python_path'], + 'name': step_0_primitive_metadata['name'], + 'digest': step_0_primitive_metadata['digest'], + } + + step_0 = pipeline_module.PrimitiveStep(primitive_description=step_0_primitive_description) + step_0.add_argument(name='inputs', argument_type=metadata_base.ArgumentType.CONTAINER, data_reference='inputs.0') + step_0.add_hyperparameter(name='primitive_1', argument_type=metadata_base.ArgumentType.VALUE, data=primitive) + step_0.add_hyperparameter(name='primitive_2', argument_type=metadata_base.ArgumentType.VALUE, data=primitive) + step_0.add_output('produce') + pipeline_description.add_step(step_0) + + pipeline_description.add_output(name='output', data_reference='steps.0.produce') + + r = runtime.Runtime(pipeline_description, context=metadata_base.Context.TESTING, environment=self.runtime_enviroment) + + inputs = [container.List([1, 2, 3, 4, 5], generate_metadata=True)] + + result = r.fit(inputs, return_values=['outputs.0']) + result.check_success() + + self.assertEqual(len(result.values), 1) + + results = result.values['outputs.0'] + + self.assertEqual(results, [ + 2, 4, 6, 8, 10, + ]) + + result = r.produce(inputs, return_values=['outputs.0']) + result.check_success() + + self.assertEqual(len(result.values), 1) + + results = result.values['outputs.0'] + + self.assertEqual(results, [ + 2, 4, 6, 8, 10, + ]) + + # Primitive should not be the same instance. + self.assertIsNot(null_primitive, result.pipeline_run.previous_pipeline_run.steps[0].hyperparams['primitive_1']) + self.assertIsNot(result.pipeline_run.previous_pipeline_run.steps[0].hyperparams['primitive_1'], result.pipeline_run.previous_pipeline_run.steps[0].hyperparams['primitive_2']) + + pickle.dumps(r) + + def _fake_inputs(self, runtime, pipeline_run, inputs): + # We fake that inputs were added even if this is not a standard pipeline. + # TODO: Make tests not require this. 
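+ # (add_input_dataset appears to read the dataset's 'id' and 'digest' metadata, which is why _get_inputs below attaches dummy values for both.)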
+ for input_dataset in inputs: + pipeline_run.add_input_dataset(input_dataset) + if runtime is not None: + runtime._previous_pipeline_run_id = pipeline_run.get_id() + + def _build_pipeline(self, pipeline_id: str, sequence=None): + if sequence is None: + sequence = [{'primitive_class': RandomPrimitive}, {'primitive_class': IncrementPrimitive}] + + pipeline_description = { + 'source': { + 'name': 'Test team' + }, + 'name': 'Test pipeline', + 'description': 'Pipeline created to test pipeline-run' + } + + pipe = pipeline_module.Pipeline( + pipeline_id, + source=pipeline_module.Pipeline._get_source(pipeline_description), + name=pipeline_description['name'], + description=pipeline_description['description'], + ) + + pipe.add_input('input_data') + + for index, element in enumerate(sequence): + # default input, argument name is 'inputs', value specified below + if index == 0: + inputs = 'inputs.0' + else: + inputs = 'steps.{}.produce'.format(index - 1) + + if isinstance(element, pipeline_module.Pipeline): + step = pipeline_module.SubpipelineStep(element.to_json_structure(nest_subpipelines=True)) + step.add_input(inputs) + elif isinstance(element, dict): + primitive_description = element['primitive_class'].metadata.query() + step = pipeline_module.PrimitiveStep(primitive_description) + if 'INPUTS' in element: + for arg_name, value in element['INPUTS']: + value_str = 'steps.{}.produce'.format(value) + step.add_argument(arg_name, metadata_base.ArgumentType.CONTAINER, value_str) + else: + # if not specified, use default + step.add_argument('inputs', metadata_base.ArgumentType.CONTAINER, inputs) + if 'HYPERPARAMS' in element: + for hyperparam_name in element['HYPERPARAMS']: + hyperparam = element['HYPERPARAMS'][hyperparam_name] + step.add_hyperparameter(hyperparam_name, hyperparam['TYPE'], hyperparam['DATA']) + else: + raise exceptions.InvalidArgumentTypeError( + 'Unknown type {} in parameter \'sequence\''.format(type(element))) + step.add_output('produce') + pipe.add_step(step) + + pipe.add_output('steps.{}.produce'.format(len(sequence) - 1)) + + return pipe + + def _get_inputs(self): + # TODO: Make tests use a real Dataset instead of a list. Pipeline runs are defined on standard pipelines. + input_data = container.List([1, 3, 4, 2, 5, 3], generate_metadata=True) + # First have to add dummy metadata to the list, which otherwise exist in the dataset. + input_data.metadata = input_data.metadata.update((), { + 'id': '0000000000000000000000000000000000000000000000000000000000000000', + 'digest': '0000000000000000000000000000000000000000000000000000000000000000' + }) + inputs = [input_data] + return inputs + + def _fit_pipeline( + self, pipeline, inputs, problem_description=None, context=metadata_base.Context.TESTING, return_values=None + ): + r = runtime.Runtime( + pipeline, problem_description=problem_description, context=context, + environment=self.runtime_enviroment, + ) + fit_result = r.fit(inputs, return_values=return_values) + self.assertTrue(fit_result.pipeline_run) + # We fake that inputs were added even if this is not a standard pipeline. + # TODO: Make tests not require this. 
+ for input_dataset in inputs: + fit_result.pipeline_run.add_input_dataset(input_dataset) + return fit_result.pipeline_run + + def _fit_and_produce_pipeline( + self, pipeline, inputs, problem_description = None, context = metadata_base.Context.TESTING + ): + r = runtime.Runtime( + pipeline, problem_description=problem_description, context=context, + environment=self.runtime_enviroment, + ) + fit_result = r.fit(inputs) + self.assertTrue(fit_result.pipeline_run) + self._fake_inputs(r, fit_result.pipeline_run, inputs) + self._check_pipelines_valid_and_succeeded([fit_result.pipeline_run]) + + produce_result = r.produce(inputs) + self.assertTrue(produce_result.pipeline_run) + self._fake_inputs(r, produce_result.pipeline_run, inputs) + self._check_pipelines_valid_and_succeeded([produce_result.pipeline_run]) + + return (fit_result.pipeline_run, produce_result.pipeline_run) + + def _is_pipeline_run_successful(self, pipeline_run_json): + if pipeline_run_json['status']['state'] == metadata_base.PipelineRunStatusState.SUCCESS: + return True + elif pipeline_run_json['status']['state'] == metadata_base.PipelineRunStatusState.FAILURE: + return False + else: + self.fail('Pipeline-run document status state set to invalid value') + + def _validate_pipeline_run_structure(self, json_structure): + try: + PIPELINE_RUN_SCHEMA_VALIDATOR.validate(json_structure) + _validate_pipeline_run_status_consistency(json_structure) + _validate_pipeline_run_timestamps(json_structure) + _validate_pipeline_run_random_seeds(json_structure) + except jsonschema.exceptions.ValidationError as error: + print('\n', error, '\n') + print("##### PRINTING RECURSIVE SUBERRORS #####\n") + self.print_recursive_suberrors(error, indent='\n') + self.fail("Pipeline_run document failed to validate against the schema") + + def _invalidate_pipeline_run_structure(self, json_structure): + is_valid = False + try: + PIPELINE_RUN_SCHEMA_VALIDATOR.validate(json_structure) + is_valid = True + except jsonschema.exceptions.ValidationError as error: + pass + if is_valid: + self.fail("Pipeline_run document should not have validated against the schema") + + def _check_pipelines_valid_and_succeeded(self, pipeline_runs): + for pipeline_run in pipeline_runs: + pipeline_run_json = pipeline_run.to_json_structure() + self._validate_pipeline_run_structure(pipeline_run_json) + self.assertTrue(self._is_pipeline_run_successful(pipeline_run_json), json.dumps(pipeline_run_json, indent=4)) + + def _check_pipelines_valid_and_failed(self, pipeline_runs): + for pipeline_run in pipeline_runs: + pipeline_run_json = pipeline_run.to_json_structure() + self._validate_pipeline_run_structure(pipeline_run_json) + self.assertFalse(self._is_pipeline_run_successful(pipeline_run_json)) + + def _check_pipelines_invalid(self, pipeline_runs): + for pipeline_run in pipeline_runs: + pipeline_run_json = pipeline_run.to_json_structure() + self._invalidate_pipeline_run_structure(pipeline_run_json) + + def test_basic_pipeline_run(self): + inputs = self._get_inputs() + pipe = self._build_pipeline('1490432b-b48a-4a62-8977-5a56e52a3e85') + pipeline_runs = self._fit_and_produce_pipeline(pipe, inputs) + self._check_pipelines_valid_and_succeeded(pipeline_runs) + + def test_pipeline_fit_with_return_values(self): + inputs = self._get_inputs() + pipe = self._build_pipeline('cf2e4f93-4b9a-4a49-9ab5-92927b3125df') + pipeline_runs = self._fit_pipeline(pipe, inputs, return_values=['steps.0.produce']) + self._check_pipelines_valid_and_succeeded([pipeline_runs]) + + def test_pipeline_run_failure(self): + 
inputs = self._get_inputs() + for hyperparam in ('__init__', 'set_training_data', 'fit', 'produce'): + failure_pipeline = self._build_pipeline('18e96ab3-e3c5-4b29-a446-3e81982eba9c', sequence=[{'primitive_class': RandomPrimitive}, + {'primitive_class': FailPrimitive, 'HYPERPARAMS': {'method_to_fail': {'TYPE': metadata_base.ArgumentType.VALUE, 'DATA': hyperparam}}}]) + fit_pipeline_run = self._fit_pipeline(failure_pipeline, inputs) + self._check_pipelines_valid_and_failed([fit_pipeline_run]) + + def test_pipeline_run_failure_return_error(self): + inputs = self._get_inputs() + pipeline = self._build_pipeline('80dee50d-9ca4-4ad5-9a52-7ea30f3eb3e5', sequence=[{'primitive_class': RandomPrimitive}, + {'primitive_class': FailPrimitive, 'HYPERPARAMS': {'method_to_fail': {'TYPE': metadata_base.ArgumentType.VALUE, 'DATA': 'fit'}}}]) + r = runtime.Runtime( + pipeline, context=metadata_base.Context.TESTING, + environment=self.runtime_enviroment, + ) + fit_result = r.fit(inputs) + + self.assertTrue(fit_result.error) + self.assertEqual(str(fit_result.error), 'Step 1 for pipeline 80dee50d-9ca4-4ad5-9a52-7ea30f3eb3e5 failed.') + self.assertIsInstance(fit_result.error, exceptions.StepFailedError) + + with self.assertRaises(exceptions.StepFailedError) as cm: + fit_result.check_success() + + self.assertEqual(str(cm.exception), 'Step 1 for pipeline 80dee50d-9ca4-4ad5-9a52-7ea30f3eb3e5 failed.') + + def test_pipeline_run_failure_with_subpipeline(self): + inputs = self._get_inputs() + for hyperparam in ('__init__', 'set_training_data', 'fit', 'produce'): + failure_subpipeline = self._build_pipeline('bcd96144-34ae-4a67-a1b5-b911a07d03ed', sequence=[{'primitive_class': FailPrimitive, 'HYPERPARAMS': {'method_to_fail': {'TYPE': metadata_base.ArgumentType.VALUE, 'DATA': hyperparam}}}]) + failure_pipeline = self._build_pipeline('cbec1cb2-64df-4d4a-81ea-a829eeac0612', sequence=[{'primitive_class': RandomPrimitive}, failure_subpipeline, {'primitive_class': IncrementPrimitive}]) + fit_pipeline_run = self._fit_pipeline(failure_pipeline, inputs) + self._check_pipelines_valid_and_failed([fit_pipeline_run]) + + # tests previous_pipeline_run when it should be None, and when it should be full + def test_all_previous_pipeline_run_types(self): + inputs = self._get_inputs() + pipe = self._build_pipeline('2617ca0c-552a-4014-a999-2904184ed648') + fit_pipeline_run, produce_pipeline_run = self._fit_and_produce_pipeline(pipe, inputs) + self._check_pipelines_valid_and_succeeded([fit_pipeline_run, produce_pipeline_run]) + fit_pipeline_run_json = fit_pipeline_run.to_json_structure() + self.assertTrue( + 'previous_pipeline_run' not in fit_pipeline_run_json, + 'pipeline_run should not contain previous_pipeline_run' + ) + produce_pipeline_run_json = produce_pipeline_run.to_json_structure() + self.assertNotEqual(produce_pipeline_run_json['previous_pipeline_run'], None) + self.assertEqual(fit_pipeline_run_json['id'], produce_pipeline_run_json['previous_pipeline_run']['id']) + + # tests pipeline_run given each type of context + def test_all_pipeline_run_context_types(self): + inputs = self._get_inputs() + pipe = self._build_pipeline('4fb64b4b-baa6-404a-afe3-1ad68a1993c1') + + for context in metadata_base.Context: + pipeline_runs = self._fit_and_produce_pipeline( + pipe, inputs, context=context + ) + self._check_pipelines_valid_and_succeeded(pipeline_runs) + + class InvalidContext: + def __init__(self, name): + self.name = name + + invalid_context = InvalidContext('INVALID_CONTEXT') + pipe = 
self._build_pipeline('1c05ae77-1f74-48bd-9341-c31338a9c9f0') + with self.assertRaises(jsonschema.exceptions.ValidationError): + pipeline_runs = self._fit_and_produce_pipeline(pipe, inputs, context=invalid_context) + + # tests pipeline_run given primitive steps and given subpipeline steps + def test_all_pipeline_run_step_types(self): + inputs = self._get_inputs() + + pipeline_without_subpipeline = self._build_pipeline('dca8efbe-4daa-47a6-a811-9ca633ffc90b', [{'primitive_class': RandomPrimitive}, {'primitive_class': IncrementPrimitive}, {'primitive_class': IncrementPrimitive}, {'primitive_class': IncrementPrimitive}]) + pipeline_runs = self._fit_and_produce_pipeline(pipeline_without_subpipeline, inputs) + self._check_pipelines_valid_and_succeeded(pipeline_runs) + + subpipeline = self._build_pipeline('06dfb07a-f151-467c-9f1c-51a6bf6378a3', [{'primitive_class': IncrementPrimitive}, {'primitive_class': IncrementPrimitive}]) + pipeline_with_subpipeline = self._build_pipeline('293c1883-f81a-459d-a1a8-ba19467d5ad6', [{'primitive_class': RandomPrimitive}, subpipeline, {'primitive_class': IncrementPrimitive}]) + pipeline_runs = self._fit_and_produce_pipeline(pipeline_with_subpipeline, inputs) + self._check_pipelines_valid_and_succeeded(pipeline_runs) + + # tests when there is a subpipeline within a subpipeline + def test_recursive_subpipeline(self): + inputs = self._get_inputs() + subpipeline = self._build_pipeline('1eba8278-45da-448e-92a8-a6daf780563f', [{'primitive_class': IncrementPrimitive}, {'primitive_class': IncrementPrimitive}]) + subpipeline = self._build_pipeline('b350beb3-4421-4627-906c-92cbbe900834', [{'primitive_class': IncrementPrimitive}, subpipeline, {'primitive_class': IncrementPrimitive}]) + pipeline_with_recursive_subpipeline = self._build_pipeline('17e3ae59-e132-4c56-8573-20be6f84ea05', [{'primitive_class': RandomPrimitive}, subpipeline, {'primitive_class': IncrementPrimitive}]) + pipeline_runs = self._fit_and_produce_pipeline(pipeline_with_recursive_subpipeline, inputs) + self._check_pipelines_valid_and_succeeded(pipeline_runs) + + def test_all_pipeline_run_hyperparam_types(self): + inputs = self._get_inputs() + + # test value_argument hyperparams (runtime sets defaults) + pipeline = self._build_pipeline('301702a9-cf1e-4332-9116-696c9908586a') + pipeline_runs = self._fit_and_produce_pipeline(pipeline, inputs) + self._check_pipelines_valid_and_succeeded(pipeline_runs) + + # test container_argument + pipeline = self._build_pipeline('8390ab6f-d619-4cc5-b343-22b91f81eecd', sequence=[{'primitive_class': RandomPrimitive}, + {'primitive_class': ContainerHyperparamPrimitive, 'HYPERPARAMS': {'dataframe': {'TYPE': metadata_base.ArgumentType.CONTAINER, 'DATA': 'steps.0.produce'}}}]) + pipeline_runs = self._fit_and_produce_pipeline(pipeline, inputs) + self._check_pipelines_valid_and_succeeded(pipeline_runs) + + # test data_argument + pipeline = self._build_pipeline('f0e0e370-97db-4e67-9eff-5e9b79f253e6', sequence=[{'primitive_class': RandomPrimitive}, {'primitive_class': AbsSumPrimitive}, + {'primitive_class': DataHyperparamPrimitive, 'INPUTS': [('inputs', 0)], 'HYPERPARAMS': {'value': {'TYPE': metadata_base.ArgumentType.DATA, 'DATA': 'steps.1.produce'}}}]) + pipeline_runs = self._fit_and_produce_pipeline(pipeline, inputs) + self._check_pipelines_valid_and_succeeded(pipeline_runs) + + # test data_arguments + pipeline = self._build_pipeline('ab71ff74-5cd1-4e36-8c63-c2cd79085173', sequence=[{'primitive_class': RandomPrimitive}, {'primitive_class': AbsSumPrimitive}, {'primitive_class': 
AbsSumPrimitive, 'INPUTS': [('inputs', 0)]}, + {'primitive_class': MultiDataHyperparamPrimitive, 'INPUTS': [('inputs', 0)], 'HYPERPARAMS': {'values': {'TYPE': metadata_base.ArgumentType.DATA, 'DATA': ['steps.1.produce', 'steps.2.produce']}}}]) + pipeline_runs = self._fit_and_produce_pipeline(pipeline, inputs) + self._check_pipelines_valid_and_succeeded(pipeline_runs) + + # test primitive argument + pipeline = self._build_pipeline('c8b291f1-ff67-49e0-b8a3-a0e6a2d6f013', sequence=[{'primitive_class': RandomPrimitive}, {'primitive_class': AbsSumPrimitive}, + {'primitive_class': PrimitiveHyperparamPrimitive, 'INPUTS': [('inputs', 0)], 'HYPERPARAMS': {'primitive': {'TYPE': metadata_base.ArgumentType.PRIMITIVE, 'DATA': 1}}}]) + pipeline_runs = self._fit_and_produce_pipeline(pipeline, inputs) + self._check_pipelines_valid_and_succeeded(pipeline_runs) + + def test_all_pipeline_run_method_call_base_metadata_types(self): + pipeline = pipeline_module.Pipeline.from_json(TEST_PIPELINE_1, resolver=Resolver()) + pipeline_run = PipelineRun( + pipeline, phase=metadata_base.PipelineRunPhase.FIT, context=metadata_base.Context.TESTING, + environment=self.runtime_enviroment, random_seed=0 + ) + inputs = self._get_inputs()[0] + pipeline_run.add_input_dataset(inputs) + pipeline_run.run_started() + pipeline_run.step_started(0) + primitive_step_id = pipeline_run.add_primitive_step(pipeline.steps[0]) + method_call_id = pipeline_run.add_method_call_to_primitive_step(primitive_step_id, 'fit') + pipeline_run.method_call_started(method_call_id) + result = base.CallResult(inputs) + pipeline_run.method_call_successful(method_call_id) + pipeline_run.set_method_call_result_metadata(method_call_id, result) + pipeline_run.step_successful(primitive_step_id) + pipeline_run.run_successful() + self._validate_pipeline_run_structure(pipeline_run.to_json_structure()) + + # test that the phase is set correctly for fit and produce + def test_all_pipeline_run_phase_types(self): + inputs = self._get_inputs() + pipeline = self._build_pipeline('d95a9816-8ede-4fe2-89c5-f5c9d9f1d9fd') + pipeline_runs = self._fit_and_produce_pipeline(pipeline, inputs) + self._check_pipelines_valid_and_succeeded(pipeline_runs) + + fit_pipeline_run = pipeline_runs[0] + fit_pipeline_run_json = fit_pipeline_run.to_json_structure() + self.assertEqual(fit_pipeline_run_json['run']['phase'], 'FIT') + + produce_pipeline_run = pipeline_runs[1] + produce_pipeline_run_json = produce_pipeline_run.to_json_structure() + self.assertEqual(produce_pipeline_run_json['run']['phase'], 'PRODUCE') + + # tests that the first method_call of each step is __init__() + def test_pipeline_run_init_method_calls(self): + inputs = self._get_inputs() + pipeline = self._build_pipeline('5a9321df-7e40-443b-9e12-f1d840a677cd') + pipeline_runs = self._fit_and_produce_pipeline(pipeline, inputs) + for pipeline_run in pipeline_runs: + pipeline_run_json = pipeline_run.to_json_structure() + if pipeline_run_json['run']['phase'] == 'FIT': + for step in pipeline_run_json['steps']: + first_method_call = step['method_calls'][0] + self.assertEqual(first_method_call['name'], '__init__') + + def print_recursive_suberrors(self, error, indent): + for suberror in sorted(error.context, key=lambda e: e.schema_path): + print(f'{indent}', list(suberror.schema_path), ", ", suberror.message) + self.print_recursive_suberrors(suberror, indent + '\t') + + def get_data(self, dataset_name='iris_dataset_1', problem_name='iris_problem_1'): + if problem_name: + problem_doc_path = os.path.join( + os.path.dirname(__file__), 
'data', 'problems', problem_name, 'problemDoc.json'
+ )
+ problem_description = problem.Problem.load('file://' + problem_doc_path)
+ else:
+ problem_description = None
+
+ datasetDoc_path = 'file://' + os.path.join(os.path.dirname(__file__), 'data', 'datasets', dataset_name, 'datasetDoc.json')
+ iris_dataset = container.Dataset.load(datasetDoc_path)
+ return problem_description, iris_dataset
+
+ def test_recording_hyperparams(self):
+ pipeline = self._build_pipeline(
+ '84d5dbb8-6e82-4187-801e-83a46069608f',
+ sequence=[
+ {
+ 'primitive_class': IncrementPrimitive
+ },
+ {
+ 'primitive_class': IncrementPrimitive,
+ 'HYPERPARAMS': {
+ 'amount': {
+ 'TYPE': metadata_base.ArgumentType.VALUE,
+ 'DATA': 3.14
+ }
+ }
+ },
+ {
+ 'primitive_class': IncrementPrimitive
+ }
+ ],
+ )
+ runtime_hyperparams = [{}, {}, {'amount': 2.72}]
+ inputs = [container.DataFrame({'a': [1,2,3], 'b': [3,5,8]}, generate_metadata=True)]
+ # TODO: Make tests use a real Dataset instead of a dataframe. Pipeline runs are defined on standard pipelines.
+ # First have to add dummy metadata to the dataframe, which otherwise exist in the dataset.
+ inputs[0].metadata = inputs[0].metadata.update((), {
+ 'id': '0000000000000000000000000000000000000000000000000000000000000000',
+ 'digest': '0000000000000000000000000000000000000000000000000000000000000000'
+ })
+ r = runtime.Runtime(pipeline, runtime_hyperparams, context=metadata_base.Context.TESTING, environment=self.runtime_enviroment)
+ fit_result = r.fit(inputs=inputs)
+ self._fake_inputs(r, fit_result.pipeline_run, inputs)
+ fit_pipeline_run_json = fit_result.pipeline_run.to_json_structure()
+
+ # test default hyperparams recorded in pipeline_run
+ self.assertTrue(
+ 'amount' in fit_pipeline_run_json['steps'][0]['hyperparams'],
+ 'default hyperparams not recorded in pipeline_run'
+ )
+ self.assertEqual(
+ IncrementHyperparams.defaults().values_to_json_structure()['amount'],
+ fit_pipeline_run_json['steps'][0]['hyperparams']['amount']['data'],
+ 'default hyperparams incorrectly recorded in pipeline_run'
+ )
+
+ # test hyperparams specified in pipeline not recorded in pipeline_run
+ self.assertFalse(
+ 'hyperparams' in fit_pipeline_run_json['steps'][1],
+ 'hyperparams specified in the pipeline should not be recorded in the pipeline_run'
+ )
+
+ # test hyperparams set at runtime recorded in pipeline_run
+ self.assertTrue(
+ 'amount' in fit_pipeline_run_json['steps'][2]['hyperparams'],
+ 'runtime hyperparams not recorded in pipeline_run'
+ )
+ self.assertEqual(
+ runtime_hyperparams[2]['amount'],
+ fit_pipeline_run_json['steps'][2]['hyperparams']['amount']['data'],
+ 'runtime hyperparams incorrectly recorded in pipeline_run'
+ )
+
+ produce_result = r.produce(inputs=inputs)
+ self._fake_inputs(r, produce_result.pipeline_run, inputs)
+ for step in produce_result.pipeline_run.to_json_structure()['steps']:
+ self.assertFalse(
+ 'hyperparams' in step,
+ 'hyperparams should not be set in produce pipeline_runs'
+ )
+
+ def test_recording_arguments(self):
+ pipeline = self._build_pipeline('46bb32a5-f9a0-4c33-97c8-f426ed147e0a')
+ inputs = self._get_inputs()
+ r = runtime.Runtime(pipeline, context=metadata_base.Context.TESTING, environment=self.runtime_enviroment)
+ fit_result = r.fit(inputs=inputs)
+ self._fake_inputs(r, fit_result.pipeline_run, inputs)
+ fit_pipeline_run_json = fit_result.pipeline_run.to_json_structure()
+
+ pipeline_json_structure = pipeline.to_json_structure()
+ for pipeline_step, pipeline_run_step in zip(pipeline_json_structure['steps'],
fit_pipeline_run_json['steps']): + if 'arguments' in pipeline_run_step: + for argument_name in pipeline_step['arguments']: + self.assertFalse( + argument_name in pipeline_run_step['arguments'], + 'pipeline step arguments should not be recorded in pipeline_run method_call arguments' + ) + + produce_result = r.produce(inputs=inputs) + self._fake_inputs(r, produce_result.pipeline_run, inputs) + produce_pipeline_run_json = produce_result.pipeline_run.to_json_structure() + + for pipeline_step, pipeline_run_step in zip(pipeline_json_structure['steps'], produce_pipeline_run_json['steps']): + if 'arguments' in pipeline_run_step: + for argument_name in pipeline_step['arguments']: + self.assertFalse( + argument_name in pipeline_run_step['arguments'], + 'pipeline step arguments should not be recorded in pipeline_run method_call arguments' + ) + + def test_saving_to_file(self): + if not os.path.exists(self.test_dir): + os.makedirs(self.test_dir) + inputs = self._get_inputs() + pipeline = self._build_pipeline('4327ce61-0580-48b3-9aeb-d3e35c09376d') + + r = runtime.Runtime(pipeline, context=metadata_base.Context.TESTING, environment=self.runtime_enviroment) + fit_result = r.fit(inputs=inputs) + self._fake_inputs(r, fit_result.pipeline_run, inputs) + fit_pipeline_run = fit_result.pipeline_run + fit_pipeline_run_json = fit_pipeline_run.to_json_structure() + fit_file_name = '{}.json'.format(fit_pipeline_run_json['id']) + fit_file_path = os.path.join(self.test_dir, fit_file_name) + with open(fit_file_path, 'w') as fit_file: + fit_pipeline_run.to_yaml(fit_file) + self.assertTrue(os.path.exists(fit_file_path), 'The fit pipeline_run object should have been saved to {}'.format(fit_file_path)) + with open(fit_file_path, 'r') as fit_file: + fit_json = utils.yaml_load(fit_file) + self._validate_pipeline_run_structure(fit_json) + self.assertEqual(fit_json['id'], fit_pipeline_run_json['id']) + self.assertEqual(len(fit_json['steps']), len(fit_pipeline_run.steps)) + self.assertEqual(fit_json['status'], fit_pipeline_run.status) + + produce_result = r.produce(inputs=inputs) + self._fake_inputs(r, produce_result.pipeline_run, inputs) + produce_pipeline_run = produce_result.pipeline_run + produce_pipeline_run_json = produce_pipeline_run.to_json_structure() + fit_produce_file_name = 'produce_pipeline.json' + fit_produce_file_path = os.path.join(self.test_dir, fit_produce_file_name) + with open(fit_produce_file_path, 'w') as fit_produce_file: + fit_pipeline_run.to_yaml(fit_produce_file) + produce_pipeline_run.to_yaml(fit_produce_file, appending=True) + self.assertTrue(os.path.exists(fit_produce_file_path), 'The fit and produce pipeline_run objects should have been saved to {}'.format(fit_produce_file_path)) + with open(fit_produce_file_path, 'r') as fit_produce_file: + fit_produce_jsons = list(utils.yaml_load_all(fit_produce_file)) + self.assertIsInstance(fit_produce_jsons, typing.Sequence, 'The fit_produce_file should contain a sequence of pipeline_run objects') + self.assertEqual(len(fit_produce_jsons), 2, 'The fit_produce_file should contain 2 pipeline_run objects') + fit_json = fit_produce_jsons[0] + self._validate_pipeline_run_structure(fit_json) + self.assertEqual(fit_json['id'], fit_pipeline_run_json['id']) + self.assertEqual(len(fit_json['steps']), len(fit_pipeline_run.steps)) + self.assertEqual(fit_json['status'], fit_pipeline_run.status) + produce_json = fit_produce_jsons[1] + self._validate_pipeline_run_structure(produce_json) + self.assertEqual(produce_json['id'], produce_pipeline_run_json['id']) + 
self.assertEqual(len(produce_json['steps']), len(produce_pipeline_run.steps)) + self.assertEqual(produce_json['status'], produce_pipeline_run.status) + + def test_fit(self): + pipeline = self._build_pipeline( + '6e79c2cc-e36d-4f22-9016-8184d3385714', + sequence=[ + { + 'primitive_class': DatasetToDataFramePrimitive, + }, + { + 'primitive_class': RandomClassifierPrimitive, + 'INPUTS': [('inputs', 0), ('outputs', 0)], + }, + ], + ) + iris_problem, iris_dataset = self.get_data() + inputs = [iris_dataset] + hyperparams = None + random_seed = 0 + volumes_dir: str = None + fitted_pipeline, predictions, fit_result = runtime.fit( + pipeline, inputs, problem_description=iris_problem, hyperparams=hyperparams, random_seed=random_seed, + volumes_dir=volumes_dir, context=metadata_base.Context.TESTING, + runtime_environment=self.runtime_enviroment, + ) + self._validate_pipeline_run_structure(fit_result.pipeline_run.to_json_structure()) + + def test_prepare_data(self): + with open( + os.path.join(os.path.dirname(__file__), 'data', 'pipelines', 'data-preparation-no-split.yml'), + 'r', + ) as data_pipeline_file: + data_pipeline = pipeline_module.Pipeline.from_yaml(data_pipeline_file, resolver=Resolver()) + + with open( + os.path.join(os.path.dirname(__file__), 'data', 'pipelines', 'random-forest-classifier.yml'), + 'r', + ) as data_pipeline_file: + with utils.silence(): + pipeline = pipeline_module.Pipeline.from_yaml(data_pipeline_file, resolver=Resolver()) + + iris_problem, iris_dataset = self.get_data(dataset_name='iris_dataset_1', problem_name='iris_problem_1') + inputs = [iris_dataset] + outputs, data_result = runtime.prepare_data( + data_pipeline=data_pipeline, problem_description=iris_problem, inputs=inputs, + data_params={}, context=metadata_base.Context.TESTING, runtime_environment=self.runtime_enviroment) + + fitted_pipeline, predictions, fit_result = runtime.fit( + pipeline, inputs, problem_description=iris_problem, context=metadata_base.Context.TESTING, + runtime_environment=self.runtime_enviroment, + ) + self.assertFalse(fit_result.has_error(), fit_result.error) + self.assertFalse(data_result.has_error(), data_result.error) + + with self.assertRaisesRegex(exceptions.InvalidStateError, "Pipeline run for a non-standard pipeline cannot be converted to a JSON structure."): + data_result.pipeline_run.to_json_structure() + + runtime.combine_pipeline_runs( + fit_result.pipeline_run, data_pipeline_run=data_result.pipeline_run, + ) + self.assertFalse(fit_result.has_error(), fit_result.error) + self.assertEqual(len(outputs), 3) + self._validate_pipeline_run_structure(fit_result.pipeline_run.to_json_structure()) + + def test_multi_input_fit(self): + with open( + os.path.join(os.path.dirname(__file__), 'data', 'pipelines', 'multi-input-test.json'), 'r' + ) as pipeline_file: + with utils.silence(): + pipeline = pipeline_module.Pipeline.from_json(pipeline_file, resolver=Resolver()) + + iris_problem, iris_dataset = self.get_data(dataset_name='iris_dataset_1', problem_name='multi_dataset_problem') + _, boston_dataset = self.get_data(dataset_name='boston_dataset_1', problem_name='') + inputs = [iris_dataset, boston_dataset] + hyperparams = None + random_seed = 0 + volumes_dir: str = None + fitted_pipeline, predictions, fit_result = runtime.fit( + pipeline, inputs, problem_description=iris_problem, hyperparams=hyperparams, random_seed=random_seed, + volumes_dir=volumes_dir, context=metadata_base.Context.TESTING, + runtime_environment=self.runtime_enviroment, + ) + 
self._validate_pipeline_run_structure(fit_result.pipeline_run.to_json_structure()) + + def test_multi_input_fit_without_problem(self): + with open( + os.path.join(os.path.dirname(__file__), 'data', 'pipelines', 'multi-input-test.json'), 'r' + ) as pipeline_file: + with utils.silence(): + pipeline = pipeline_module.Pipeline.from_json(pipeline_file, resolver=Resolver()) + + _, iris_dataset = self.get_data(dataset_name='iris_dataset_1', problem_name='') + _, boston_dataset = self.get_data(dataset_name='boston_dataset_1', problem_name='') + inputs = [iris_dataset, boston_dataset] + hyperparams = None + random_seed = 0 + volumes_dir: str = None + r = runtime.Runtime(pipeline, context=metadata_base.Context.TESTING, environment=self.runtime_enviroment, + hyperparams=hyperparams, random_seed=random_seed, volumes_dir=volumes_dir) + r.fit(inputs=inputs) + + def test_multi_input_fit_with_one_dataset_associated(self): + with open( + os.path.join(os.path.dirname(__file__), 'data', 'pipelines', 'multi-input-test.json'), 'r' + ) as pipeline_file: + with utils.silence(): + pipeline = pipeline_module.Pipeline.from_json(pipeline_file, resolver=Resolver()) + _, iris_dataset = self.get_data(dataset_name='iris_dataset_1', problem_name='') + boston_problem, boston_dataset = self.get_data(dataset_name='boston_dataset_1', problem_name='boston_problem_1') + inputs = [iris_dataset, boston_dataset] + hyperparams = None + random_seed = 0 + volumes_dir: str = None + r = runtime.Runtime(pipeline, context=metadata_base.Context.TESTING, environment=self.runtime_enviroment, + hyperparams=hyperparams, random_seed=random_seed, volumes_dir=volumes_dir, + problem_description=boston_problem) + r.fit(inputs=inputs) + + def test_produce(self): + pipeline = self._build_pipeline( + 'c99ae185-2a74-4919-88b1-66d02e2e21b2', + sequence=[ + { + 'primitive_class': DatasetToDataFramePrimitive + }, + { + 'primitive_class': RandomClassifierPrimitive, + 'INPUTS': [('inputs', 0), ('outputs', 0)], + }, + ], + ) + iris_problem, iris_dataset = self.get_data() + inputs = [iris_dataset] + hyperparams = None + random_seed = 0 + volumes_dir: str = None + fitted_pipeline, predictions, fit_result = runtime.fit( + pipeline, inputs, problem_description=iris_problem, hyperparams=hyperparams, random_seed=random_seed, + volumes_dir=volumes_dir, context=metadata_base.Context.TESTING, + runtime_environment=self.runtime_enviroment, + ) + predictions, produce_result = runtime.produce(fitted_pipeline, inputs) + self._validate_pipeline_run_structure(produce_result.pipeline_run.to_json_structure()) + + def test_multi_input_produce(self): + with open( + os.path.join(os.path.dirname(__file__), 'data', 'pipelines', 'multi-input-test.json'), 'r' + ) as pipeline_file: + with utils.silence(): + pipeline = pipeline_module.Pipeline.from_json(pipeline_file, resolver=Resolver()) + iris_problem, iris_dataset = self.get_data(dataset_name='iris_dataset_1', problem_name='multi_dataset_problem') + _, iris_dataset_2 = self.get_data(dataset_name='boston_dataset_1', problem_name='') + inputs = [iris_dataset, iris_dataset_2] + hyperparams = None + random_seed = 0 + volumes_dir: str = None + r = runtime.Runtime(pipeline, context=metadata_base.Context.TESTING, environment=self.runtime_enviroment, + hyperparams=hyperparams, random_seed=random_seed, volumes_dir=volumes_dir, + problem_description=iris_problem) + r.fit(inputs=inputs) + r.produce(inputs=inputs) + + def test_multi_input_produce_without_problem(self): + with open( + os.path.join(os.path.dirname(__file__), 'data', 
'pipelines', 'multi-input-test.json'), 'r' + ) as pipeline_file: + with utils.silence(): + pipeline = pipeline_module.Pipeline.from_json(pipeline_file, resolver=Resolver()) + _, iris_dataset = self.get_data(dataset_name='iris_dataset_1', problem_name='') + _, boston_dataset = self.get_data(dataset_name='boston_dataset_1', problem_name='') + inputs = [iris_dataset, boston_dataset] + hyperparams = None + random_seed = 0 + volumes_dir: str = None + r = runtime.Runtime(pipeline, context=metadata_base.Context.TESTING, environment=self.runtime_enviroment, + hyperparams=hyperparams, random_seed=random_seed, volumes_dir=volumes_dir) + r.fit(inputs=inputs) + r.produce(inputs=inputs) + + def test_multi_input_produce_with_one_dataset_associated(self): + with open( + os.path.join(os.path.dirname(__file__), 'data', 'pipelines', 'multi-input-test.json'), 'r' + ) as pipeline_file: + with utils.silence(): + pipeline = pipeline_module.Pipeline.from_json(pipeline_file, resolver=Resolver()) + _, iris_dataset_1 = self.get_data(dataset_name='iris_dataset_1', problem_name='') + boston_problem, iris_dataset_2 = self.get_data(dataset_name='boston_dataset_1', problem_name='boston_problem_1') + inputs = [iris_dataset_1, iris_dataset_2] + hyperparams = None + random_seed = 0 + volumes_dir: str = None + r = runtime.Runtime(pipeline, context=metadata_base.Context.TESTING, environment=self.runtime_enviroment, + hyperparams=hyperparams, random_seed=random_seed, volumes_dir=volumes_dir, + problem_description=boston_problem) + r.fit(inputs=inputs) + r.produce(inputs=inputs) + + @staticmethod + def _build_fail_runtime(method_name, message): + + class FailRuntime(runtime.Runtime): + pass + + def fail_method(*args, **kwargs): + raise Exception(message) + + setattr(FailRuntime, method_name, fail_method) + + return FailRuntime + + def test_error_propgation(self): + for method_name in [ + '_call_primitive_method', '_create_pipeline_primitive', + '_run_primitive', '_run_subpipeline', '_run_step', '_do_run_step', '_do_run', + ]: + error_message = 'runtime failed in method "{}"'.format(method_name) + + inputs = self._get_inputs() + subpipeline = self._build_pipeline('06dfb07a-f151-467c-9f1c-51a6bf6378a3', [{'primitive_class': IncrementPrimitive}, {'primitive_class': IncrementPrimitive}]) + pipeline_with_subpipeline = self._build_pipeline('293c1883-f81a-459d-a1a8-ba19467d5ad6', [{'primitive_class': RandomPrimitive}, subpipeline, {'primitive_class': IncrementPrimitive}]) + fail_runtime_class = self._build_fail_runtime(method_name, error_message) + + r = fail_runtime_class( + pipeline_with_subpipeline, context=metadata_base.Context.TESTING, + environment=self.runtime_enviroment, + ) + + fit_result = r.fit(inputs) + self.assertTrue(fit_result.pipeline_run) + self._fake_inputs(r, fit_result.pipeline_run, inputs) + self._check_pipelines_valid_and_failed([fit_result.pipeline_run]) + self.assertTrue( + str(fit_result.error) in [ + error_message, + 'Step 0 for pipeline 293c1883-f81a-459d-a1a8-ba19467d5ad6 failed.', + 'Step 1 for pipeline 293c1883-f81a-459d-a1a8-ba19467d5ad6 failed.', + ], + 'Unexpected error message: {}'.format(fit_result.error) + ) + + def test_get_singleton_value(self): + l = container.List([1], generate_metadata=True) + l.metadata = l.metadata.update((0,), {'custom': 'metadata'}) + + s = runtime.get_singleton_value(l) + + self.assertEqual(s, 1) + + l = container.List([container.List([1], generate_metadata=True)], generate_metadata=True) + l.metadata = l.metadata.update((0,), {'custom': 'metadata1'}) + l.metadata = 
l.metadata.update((0, 0), {'custom': 'metadata2'}) + + s = runtime.get_singleton_value(l) + + self.assertEqual(s, [1]) + self.assertEqual(utils.to_json_structure(s.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + 'custom': 'metadata1', + 'dimension': {'length': 1}, + 'schema': 'https://metadata.datadrivendiscovery.org/schemas/v0/container.json', + 'structural_type': 'd3m.container.list.List' + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': {'structural_type': 'int'}, + }, { + 'selector': [0], + 'metadata': {'custom': 'metadata2'}, + }]) + + d = container.DataFrame({'a': [1], 'b': ['one']}, generate_metadata=True) + + s = runtime.get_singleton_value(d) + + self.assertEqual(s, [1, 'one']) + self.assertEqual(utils.to_json_structure(s.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + 'dimension': { + 'length': 2, + # TODO: "name" and "semantic_types" here should be removed. + # See: https://gitlab.com/datadrivendiscovery/d3m/issues/336 + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + }, + 'schema': 'https://metadata.datadrivendiscovery.org/schemas/v0/container.json', + 'structural_type': 'd3m.container.list.List', + }, + }, { + 'selector': [0], + 'metadata': {'name': 'a', 'structural_type': 'numpy.int64'}, + }, { + 'selector': [1], + 'metadata': {'name': 'b', 'structural_type': 'str'}, + }]) + + def test_unfitted_primitive(self): + pipeline = pipeline_module.Pipeline() + pipeline.add_input() + + step = pipeline_module.PrimitiveStep( + { + 'id': '3b09ba74-cc90-4f22-9e0a-0cf4f29a7e28', + 'version': '0.1.0', + 'name': "Removes columns", + 'python_path': 'd3m.primitives.data_transformation.remove_columns.Common', + }, + resolver=pipeline_module.Resolver(), + ) + step.add_hyperparameter('columns', metadata_base.ArgumentType.VALUE, [3]) + + pipeline.add_step(step) + + step = pipeline_module.PrimitiveStep( + { + 'id': '5bef5738-1638-48d6-9935-72445f0eecdc', + 'version': '0.1.0', + 'name': "Map DataFrame resources to new resources using provided primitive", + 'python_path': 'd3m.primitives.operator.dataset_map.DataFrameCommon', + }, + resolver=pipeline_module.Resolver(), + ) + step.add_argument('inputs', metadata_base.ArgumentType.CONTAINER, 'inputs.0') + step.add_output('produce') + step.add_hyperparameter('primitive', metadata_base.ArgumentType.PRIMITIVE, 0) + + pipeline.add_step(step) + + pipeline.add_output('steps.1.produce') + + pipeline.check(allow_placeholders=False, standard_pipeline=False, input_types={'inputs.0': container.Dataset}) + + _, dataset = self.get_data() + + self.assertEqual(dataset['learningData'].shape, (150, 6)) + + r = runtime.Runtime(pipeline, context=metadata_base.Context.TESTING, is_standard_pipeline=False, environment=self.runtime_enviroment) + + inputs = [dataset] + + result = r.fit(inputs, return_values=['outputs.0']) + result.check_success() + + self.assertTrue(result.pipeline_run) + + self.assertEqual(len(result.values), 1) + + output_dataset = result.values['outputs.0'] + + self.assertEqual(output_dataset['learningData'].shape, (150, 5)) + + result = r.produce(inputs, return_values=['outputs.0']) + result.check_success() + + self.assertEqual(len(result.values), 1) + self.assertTrue(result.pipeline_run) + + output_dataset = result.values['outputs.0'] + + self.assertEqual(output_dataset['learningData'].shape, (150, 5)) + + pickled = pickle.dumps(r) + restored = pickle.loads(pickled) + + result = restored.produce(inputs, 
return_values=['outputs.0'])
+ result.check_success()
+
+ self.assertEqual(len(result.values), 1)
+ self.assertTrue(result.pipeline_run)
+
+ output_dataset = result.values['outputs.0']
+
+ self.assertEqual(output_dataset['learningData'].shape, (150, 5))
+
+ pickle.dumps(r)
+
+ def test_pipeline_openml(self):
+ # Creating pipeline
+ pipeline_description = pipeline_module.Pipeline()
+ pipeline_description.add_input(name='inputs')
+
+ # Step 0: dataset_to_dataframe
+ step_0 = pipeline_module.PrimitiveStep(
+ primitive=index.get_primitive('d3m.primitives.data_transformation.dataset_to_dataframe.Common'),
+ )
+ step_0.add_argument(name='inputs', argument_type=metadata_base.ArgumentType.CONTAINER, data_reference='inputs.0')
+ step_0.add_output('produce')
+ pipeline_description.add_step(step_0)
+
+ # Step 1: profiler
+ step_1 = pipeline_module.PrimitiveStep(
+ primitive=index.get_primitive('d3m.primitives.schema_discovery.profiler.Common'),
+ )
+ step_1.add_argument(name='inputs', argument_type=metadata_base.ArgumentType.CONTAINER, data_reference='steps.0.produce')
+ step_1.add_output('produce')
+ pipeline_description.add_step(step_1)
+
+ # Step 2: column_parser
+ step_2 = pipeline_module.PrimitiveStep(
+ primitive=index.get_primitive('d3m.primitives.data_transformation.column_parser.Common'),
+ )
+ step_2.add_argument(name='inputs', argument_type=metadata_base.ArgumentType.CONTAINER, data_reference='steps.1.produce')
+ step_2.add_output('produce')
+ pipeline_description.add_step(step_2)
+
+ # Step 3: random_forest
+ step_3 = pipeline_module.PrimitiveStep(
+ primitive=index.get_primitive('d3m.primitives.classification.random_forest.Common'),
+ )
+ step_3.add_argument(name='inputs', argument_type=metadata_base.ArgumentType.CONTAINER, data_reference='steps.2.produce')
+ step_3.add_argument(name='outputs', argument_type=metadata_base.ArgumentType.CONTAINER, data_reference='steps.2.produce')
+ step_3.add_hyperparameter(name='return_result', argument_type=metadata_base.ArgumentType.VALUE, data='replace')
+ step_3.add_output('produce')
+ pipeline_description.add_step(step_3)
+
+ # Step 4: construct_predictions
+ step_4 = pipeline_module.PrimitiveStep(
+ primitive=index.get_primitive('d3m.primitives.data_transformation.construct_predictions.Common'),
+ )
+ step_4.add_argument(name='inputs', argument_type=metadata_base.ArgumentType.CONTAINER, data_reference='steps.3.produce')
+ step_4.add_argument(name='reference', argument_type=metadata_base.ArgumentType.CONTAINER, data_reference='steps.2.produce')
+ step_4.add_output('produce')
+ pipeline_description.add_step(step_4)
+
+ # Final output
+ pipeline_description.add_output(name='output predictions', data_reference='steps.4.produce')
+
+ # Load OpenML dataset
+ dataset_id = 61
+ dataset_name = 'iris'
+ openml_dataset_uri = 'https://www.openml.org/d/{dataset_id}'.format(dataset_id=dataset_id)
+ ds = container.Dataset.load(openml_dataset_uri, dataset_id=str(dataset_id), dataset_name=dataset_name)
+
+ with utils.silence():
+ r = runtime.Runtime(pipeline=pipeline_description, context=metadata_base.Context.TESTING)
+ r.fit(inputs=[ds])
+ result = r.produce(inputs=[ds])
+
+ result.check_success()
+ predictions = result.values['outputs.0']
+
+ self.assertEqual(predictions.shape, (150, 2))
+ self.assertTrue(predictions.metadata.has_semantic_type(
+ (metadata_base.ALL_ELEMENTS, 1),
+ 'https://metadata.datadrivendiscovery.org/types/PredictedTarget'),
+ )
+ self.assertFalse(predictions.metadata.has_semantic_type(
+ (metadata_base.ALL_ELEMENTS, 1),
+
'https://metadata.datadrivendiscovery.org/types/TrueTarget'), + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/d3m/tests/test_split.py b/d3m/tests/test_split.py new file mode 100644 index 0000000..084c43f --- /dev/null +++ b/d3m/tests/test_split.py @@ -0,0 +1,22 @@ +import os +import sys +import unittest + +COMMON_PRIMITIVES_DIR = os.path.join(os.path.dirname(__file__), 'common-primitives') +# NOTE: This insertion should appear before any code attempting to resolve or load primitives, +# so the git submodule version of `common-primitives` is looked at first. +sys.path.insert(0, COMMON_PRIMITIVES_DIR) + +COMMON_PRIMITIVES_TESTS_DIR = os.path.join(os.path.dirname(__file__), 'common-primitives', 'tests') +sys.path.insert(0, COMMON_PRIMITIVES_TESTS_DIR) + +import test_train_score_split + + +# We just reuse existings tests. This allows us to test the high-level data splitting class. +class TrainScoreDatasetSplitPrimitiveTestCase(test_train_score_split.TrainScoreDatasetSplitPrimitiveTestCase): + pass + + +if __name__ == '__main__': + unittest.main() diff --git a/d3m/tests/test_sum.py b/d3m/tests/test_sum.py new file mode 100644 index 0000000..910504f --- /dev/null +++ b/d3m/tests/test_sum.py @@ -0,0 +1,319 @@ +import json +import unittest +import os +import os.path +import sys +import time + +import docker +import numpy + +import d3m +from d3m import container, utils +from d3m.metadata import base as metadata_base +from d3m.primitive_interfaces import base + +TEST_PRIMITIVES_DIR = os.path.join(os.path.dirname(__file__), 'data', 'primitives') + +sys.path.insert(0, TEST_PRIMITIVES_DIR) + +from test_primitives.sum import SumPrimitive + + +EXPECTED_PRIMITIVE_DESCRIPTION_JSON = r""" +{ + "id": "9c00d42d-382d-4177-a0e7-082da88a29c8", + "version": "0.1.0", + "name": "Sum Values", + "keywords": [ + "test primitive" + ], + "source": { + "name": "Test team", + "contact": "mailto:author@example.com", + "uris": [ + "https://gitlab.com/datadrivendiscovery/tests-data/blob/master/primitives/test_primitives/sum.py", + "https://gitlab.com/datadrivendiscovery/tests-data.git" + ] + }, + "installation": [ + { + "type": "PIP", + "package_uri": "git+https://gitlab.com/datadrivendiscovery/tests-data.git@__GIT_COMMIT__#egg=test_primitives&subdirectory=primitives" + }, + { + "type": "DOCKER", + "key": "summing", + "image_name": "registry.gitlab.com/datadrivendiscovery/tests-data/summing", + "image_digest": "sha256:f75e21720e44cfa29d8a8e239b5746c715aa7cf99f9fde7916623fabc30d3364" + } + ], + "location_uris": [ + "https://gitlab.com/datadrivendiscovery/tests-data/raw/__GIT_COMMIT__/primitives/test_primitives/sum.py" + ], + "python_path": "d3m.primitives.operator.sum.Test", + "algorithm_types": [ + "COMPUTER_ALGEBRA" + ], + "primitive_family": "OPERATOR", + "preconditions": [ + "NO_MISSING_VALUES", + "NO_CATEGORICAL_VALUES" + ], + "schema": "https://metadata.datadrivendiscovery.org/schemas/v0/primitive.json", + "original_python_path": "test_primitives.sum.SumPrimitive", + "primitive_code": { + "class_type_arguments": { + "Inputs": "typing.Union[d3m.container.list.List, d3m.container.numpy.ndarray, d3m.container.pandas.DataFrame]", + "Outputs": "d3m.container.list.List", + "Hyperparams": "test_primitives.sum.Hyperparams", + "Params": "NoneType" + }, + "interfaces_version": "__INTERFACES_VERSION__", + "interfaces": [ + "transformer.TransformerPrimitiveBase", + "base.PrimitiveBase" + ], + "hyperparams": {}, + "arguments": { + "hyperparams": { + "type": "test_primitives.sum.Hyperparams", + "kind": "RUNTIME" + 
}, + "docker_containers": { + "type": "typing.Union[NoneType, typing.Dict[str, d3m.primitive_interfaces.base.DockerContainer]]", + "kind": "RUNTIME", + "default": null + }, + "timeout": { + "type": "typing.Union[NoneType, float]", + "kind": "RUNTIME", + "default": null + }, + "iterations": { + "type": "typing.Union[NoneType, int]", + "kind": "RUNTIME", + "default": null + }, + "produce_methods": { + "type": "typing.Sequence[str]", + "kind": "RUNTIME" + }, + "inputs": { + "type": "typing.Union[d3m.container.list.List, d3m.container.numpy.ndarray, d3m.container.pandas.DataFrame]", + "kind": "PIPELINE" + }, + "params": { + "type": "NoneType", + "kind": "RUNTIME" + } + }, + "class_methods": {}, + "instance_methods": { + "__init__": { + "kind": "OTHER", + "arguments": [ + "hyperparams", + "docker_containers" + ], + "returns": "NoneType" + }, + "fit": { + "kind": "OTHER", + "arguments": [ + "timeout", + "iterations" + ], + "returns": "d3m.primitive_interfaces.base.CallResult[NoneType]", + "description": "A noop.\n\nParameters\n----------\ntimeout:\n A maximum time this primitive should be fitting during this method call, in seconds.\niterations:\n How many of internal iterations should the primitive do.\n\nReturns\n-------\nA ``CallResult`` with ``None`` value." + }, + "fit_multi_produce": { + "kind": "OTHER", + "arguments": [ + "produce_methods", + "inputs", + "timeout", + "iterations" + ], + "returns": "d3m.primitive_interfaces.base.MultiCallResult", + "description": "A method calling ``fit`` and after that multiple produce methods at once.\n\nParameters\n----------\nproduce_methods:\n A list of names of produce methods to call.\ninputs:\n The inputs given to all produce methods.\ntimeout:\n A maximum time this primitive should take to both fit the primitive and produce outputs\n for all produce methods listed in ``produce_methods`` argument, in seconds.\niterations:\n How many of internal iterations should the primitive do for both fitting and producing\n outputs of all produce methods.\n\nReturns\n-------\nA dict of values for each produce method wrapped inside ``MultiCallResult``." + }, + "get_params": { + "kind": "OTHER", + "arguments": [], + "returns": "NoneType", + "description": "A noop.\n\nReturns\n-------\nAn instance of parameters." + }, + "multi_produce": { + "kind": "OTHER", + "arguments": [ + "produce_methods", + "inputs", + "timeout", + "iterations" + ], + "returns": "d3m.primitive_interfaces.base.MultiCallResult", + "description": "A method calling multiple produce methods at once.\n\nWhen a primitive has multiple produce methods it is common that they might compute the\nsame internal results for same inputs but return different representations of those results.\nIf caller is interested in multiple of those representations, calling multiple produce\nmethods might lead to recomputing same internal results multiple times. To address this,\nthis method allows primitive author to implement an optimized version which computes\ninternal results only once for multiple calls of produce methods, but return those different\nrepresentations.\n\nIf any additional method arguments are added to primitive's produce method(s), they have\nto be added to this method as well. 
This method should accept an union of all arguments\naccepted by primitive's produce method(s) and then use them accordingly when computing\nresults.\n\nThe default implementation of this method just calls all produce methods listed in\n``produce_methods`` in order and is potentially inefficient.\n\nIf primitive should have been fitted before calling this method, but it has not been,\nprimitive should raise a ``PrimitiveNotFittedError`` exception.\n\nParameters\n----------\nproduce_methods:\n A list of names of produce methods to call.\ninputs:\n The inputs given to all produce methods.\ntimeout:\n A maximum time this primitive should take to produce outputs for all produce methods\n listed in ``produce_methods`` argument, in seconds.\niterations:\n How many of internal iterations should the primitive do.\n\nReturns\n-------\nA dict of values for each produce method wrapped inside ``MultiCallResult``." + }, + "produce": { + "kind": "PRODUCE", + "arguments": [ + "inputs", + "timeout", + "iterations" + ], + "returns": "d3m.primitive_interfaces.base.CallResult[d3m.container.list.List]", + "singleton": true, + "inputs_across_samples": [], + "description": "Produce primitive's best choice of the output for each of the inputs.\n\nThe output value should be wrapped inside ``CallResult`` object before returning.\n\nIn many cases producing an output is a quick operation in comparison with ``fit``, but not\nall cases are like that. For example, a primitive can start a potentially long optimization\nprocess to compute outputs. ``timeout`` and ``iterations`` can serve as a way for a caller\nto guide the length of this process.\n\nIdeally, a primitive should adapt its call to try to produce the best outputs possible\ninside the time allocated. If this is not possible and the primitive reaches the timeout\nbefore producing outputs, it should raise a ``TimeoutError`` exception to signal that the\ncall was unsuccessful in the given time. The state of the primitive after the exception\nshould be as the method call has never happened and primitive should continue to operate\nnormally. The purpose of ``timeout`` is to give opportunity to a primitive to cleanly\nmanage its state instead of interrupting execution from outside. Maintaining stable internal\nstate should have precedence over respecting the ``timeout`` (caller can terminate the\nmisbehaving primitive from outside anyway). If a longer ``timeout`` would produce\ndifferent outputs, then ``CallResult``'s ``has_finished`` should be set to ``False``.\n\nSome primitives have internal iterations (for example, optimization iterations).\nFor those, caller can provide how many of primitive's internal iterations\nshould a primitive do before returning outputs. Primitives should make iterations as\nsmall as reasonable. If ``iterations`` is ``None``, then there is no limit on\nhow many iterations the primitive should do and primitive should choose the best amount\nof iterations on its own (potentially controlled through hyper-parameters).\nIf ``iterations`` is a number, a primitive has to do those number of iterations,\nif possible. ``timeout`` should still be respected and potentially less iterations\ncan be done because of that. 
Primitives with internal iterations should make\n``CallResult`` contain correct values.\n\nFor primitives which do not have internal iterations, any value of ``iterations``\nmeans that they should run fully, respecting only ``timeout``.\n\nIf primitive should have been fitted before calling this method, but it has not been,\nprimitive should raise a ``PrimitiveNotFittedError`` exception.\n\nParameters\n----------\ninputs:\n The inputs of shape [num_inputs, ...].\ntimeout:\n A maximum time this primitive should take to produce outputs during this method call, in seconds.\niterations:\n How many of internal iterations should the primitive do.\n\nReturns\n-------\nThe outputs of shape [num_inputs, ...] wrapped inside ``CallResult``." + }, + "set_params": { + "kind": "OTHER", + "arguments": [ + "params" + ], + "returns": "NoneType", + "description": "A noop.\n\nParameters\n----------\nparams:\n An instance of parameters." + }, + "set_training_data": { + "kind": "OTHER", + "arguments": [], + "returns": "NoneType", + "description": "A noop.\n\nParameters\n----------" + } + }, + "class_attributes": { + "logger": "logging.Logger", + "metadata": "d3m.metadata.base.PrimitiveMetadata" + }, + "instance_attributes": { + "hyperparams": "d3m.metadata.hyperparams.Hyperparams", + "random_seed": "int", + "docker_containers": "typing.Dict[str, d3m.primitive_interfaces.base.DockerContainer]", + "volumes": "typing.Dict[str, str]", + "temporary_directory": "typing.Union[NoneType, str]" + } + }, + "structural_type": "test_primitives.sum.SumPrimitive", + "description": "A primitive which sums all the values on input into one number.\n\nAttributes\n----------\nmetadata:\n Primitive's metadata. Available as a class attribute.\nlogger:\n Primitive's logger. Available as a class attribute.\nhyperparams:\n Hyperparams passed to the constructor.\nrandom_seed:\n Random seed passed to the constructor.\ndocker_containers:\n A dict mapping Docker image keys from primitive's metadata to (named) tuples containing\n container's address under which the container is accessible by the primitive, and a\n dict mapping exposed ports to ports on that address.\nvolumes:\n A dict mapping volume keys from primitive's metadata to file and directory paths\n where downloaded and extracted files are available to the primitive.\ntemporary_directory:\n An absolute path to a temporary directory a primitive can use to store any files\n for the duration of the current pipeline run phase. Directory is automatically\n cleaned up after the current pipeline run phase finishes.", + "digest": "__DIGEST__" +} +""".replace('__INTERFACES_VERSION__', d3m.__version__).replace('__GIT_COMMIT__', utils.current_git_commit(TEST_PRIMITIVES_DIR)).replace('__DIGEST__', SumPrimitive.metadata.query()['digest']) + + +class TestSumPrimitive(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.docker_client = docker.from_env() + + cls.docker_containers = {} + + # Start all containers (this pulls images if they do not yet exist). 
+ installation = SumPrimitive.metadata.query().get('installation', [])
+ for entry in installation:
+ if entry['type'] != metadata_base.PrimitiveInstallationType.DOCKER:
+ continue
+
+ cls.docker_containers[entry['key']] = cls.docker_client.containers.run(
+ '{image_name}@{image_digest}'.format(image_name=entry['image_name'], image_digest=entry['image_digest']),
+ # Ports are mapped to random ports on the host so that they work in GitLab CI and Docker-in-Docker
+ # environment (ports are mapped to the Docker-in-Docker container itself, not the real host).
+ # In Docker-in-Docker environment you cannot directly connect to a container.
+ detach=True, auto_remove=True, publish_all_ports=True,
+ )
+
+ # Wait a bit for things to run. Even if status is "running" it does
+ # not really mean all services inside are already running.
+ time.sleep(5) # 5 s
+
+ # Wait for containers to be running.
+ for container in cls.docker_containers.values():
+ for _ in range(100): # 100 * 100 ms = 10 s
+ container.reload()
+ if container.status == 'running':
+ assert container.attrs.get('NetworkSettings', {}).get('IPAddress', None)
+ break
+ elif container.status in ('removing', 'paused', 'exited', 'dead'):
+ raise ValueError("Container '{container}' is not running.".format(container=container))
+
+ time.sleep(0.1) # 100 ms
+ else:
+ raise ValueError("Container '{container}' is not running.".format(container=container))
+
+ @classmethod
+ def tearDownClass(cls):
+ for key, container in cls.docker_containers.items():
+ container.stop()
+
+ cls.docker_containers = {}
+
+ def call_primitive(self, primitive, method_name, **kwargs):
+ return getattr(primitive, method_name)(**kwargs)
+
+ def _map_ports(self, ports):
+ return {port: int(port_map[0]['HostPort']) for port, port_map in ports.items()}
+
+ def get_docker_containers(self):
+ if os.environ.get('GITLAB_CI', None):
+ # In GitLab CI we use Docker-in-Docker to run containers, so container's ports are mapped to Docker-in-Docker
+ # container itself (with hostname "docker") and not to the host.
+ return {key: base.DockerContainer('docker', self._map_ports(container.attrs['NetworkSettings']['Ports'])) for key, container in self.docker_containers.items()}
+ else:
+ return {key: base.DockerContainer('localhost', self._map_ports(container.attrs['NetworkSettings']['Ports'])) for key, container in self.docker_containers.items()}
+
+ def test_ndarray(self):
+ with self.assertLogs(SumPrimitive.metadata.query()['python_path'], level='DEBUG') as cm:
+ hyperparams_class = SumPrimitive.metadata.get_hyperparams()
+
+ primitive = SumPrimitive(hyperparams=hyperparams_class.defaults(), docker_containers=self.get_docker_containers())
+
+ inputs = container.ndarray([[1, 2, 3, 4], [5, 6, 7, 8]], generate_metadata=True)
+
+ call_metadata = self.call_primitive(primitive, 'produce', inputs=inputs)
+
+ # Because it is a singleton produce method we can know that there is exactly one value in outputs.
+ result = call_metadata.value[0] + + self.assertEqual(result, 36) + self.assertEqual(call_metadata.has_finished, True) + self.assertEqual(call_metadata.iterations_done, None) + + self.assertEqual(call_metadata.value.metadata.query((metadata_base.ALL_ELEMENTS,))['structural_type'], float) + + self.assertEqual(len(cm.records), 2) + self.assertEqual(cm.records[0].name, SumPrimitive.metadata.query()['python_path']) + self.assertEqual(cm.records[1].name, SumPrimitive.metadata.query()['python_path']) + + self.assertIsInstance(cm.records[0].data, numpy.ndarray) + self.assertEqual(cm.records[1].response.status, 200) + + def test_lists(self): + hyperparams_class = SumPrimitive.metadata.get_hyperparams() + + primitive = SumPrimitive(hyperparams=hyperparams_class.defaults(), docker_containers=self.get_docker_containers()) + + inputs = container.List([container.List([1, 2, 3, 4]), container.List([5, 6, 7, 8])], generate_metadata=True) + + call_metadata = self.call_primitive(primitive, 'produce', inputs=inputs) + + # Because it is a singleton produce method we can know that there is exactly one value in outputs. + result = call_metadata.value[0] + + self.assertEqual(result, 36) + self.assertEqual(call_metadata.has_finished, True) + self.assertEqual(call_metadata.iterations_done, None) + + self.assertEqual(call_metadata.value.metadata.query((metadata_base.ALL_ELEMENTS,))['structural_type'], float) + + def test_metadata(self): + expected_description = json.loads(EXPECTED_PRIMITIVE_DESCRIPTION_JSON) + + # We stringify to JSON and parse it to make sure the description can be stringified to JSON. + description = json.loads(json.dumps(SumPrimitive.metadata.to_json_structure())) + + self.maxDiff = None + self.assertEqual(expected_description, description) + + +if __name__ == '__main__': + unittest.main() diff --git a/d3m/tests/test_utils.py b/d3m/tests/test_utils.py new file mode 100644 index 0000000..efcc560 --- /dev/null +++ b/d3m/tests/test_utils.py @@ -0,0 +1,506 @@ +import builtins +import copy +import io +import json +import logging +import sys +import random +import typing +import unittest + +import jsonschema +import numpy + +from d3m import container, types, utils +from d3m.container import list +from d3m.metadata import base as metadata_base + + +class TestUtils(unittest.TestCase): + def test_get_type_arguments(self): + A = typing.TypeVar('A') + B = typing.TypeVar('B') + C = typing.TypeVar('C') + + class Base(typing.Generic[A, B]): + pass + + class Foo(Base[A, None]): + pass + + class Bar(Foo[A], typing.Generic[A, C]): + pass + + class Baz(Bar[float, int]): + pass + + self.assertEqual(utils.get_type_arguments(Bar), { + A: typing.Any, + B: type(None), + C: typing.Any, + }) + self.assertEqual(utils.get_type_arguments(Baz), { + A: float, + B: type(None), + C: int, + }) + + self.assertEqual(utils.get_type_arguments(Base), { + A: typing.Any, + B: typing.Any, + }) + + self.assertEqual(utils.get_type_arguments(Base[float, int]), { + A: float, + B: int, + }) + + self.assertEqual(utils.get_type_arguments(Foo), { + A: typing.Any, + B: type(None), + }) + + self.assertEqual(utils.get_type_arguments(Foo[float]), { + A: float, + B: type(None), + }) + + def test_issubclass(self): + self.assertTrue(utils.is_subclass(list.List, types.Container)) + + T1 = typing.TypeVar('T1', bound=list.List) + self.assertTrue(utils.is_subclass(list.List, T1)) + + def test_create_enum(self): + obj = { + 'definitions': { + 'foobar1':{ + 'type': 'array', + 'items': { + 'anyOf':[ + {'enum': ['AAA']}, + {'enum': ['BBB']}, + {'enum': 
['CCC']}, + {'enum': ['DDD']}, + ], + }, + }, + 'foobar2': { + 'type': 'array', + 'items': { + 'type': 'object', + 'anyOf': [ + { + 'properties': { + 'type': { + 'type': 'string', + 'enum': ['EEE'], + }, + }, + }, + { + 'properties': { + 'type': { + 'type': 'string', + 'enum': ['FFF'], + }, + }, + }, + { + 'properties': { + 'type': { + 'type': 'string', + 'enum': ['GGG'], + }, + }, + }, + ], + }, + }, + 'foobar3': { + 'type': 'string', + 'enum': ['HHH', 'HHH', 'III', 'JJJ'], + } + }, + } + + Foobar1 = utils.create_enum_from_json_schema_enum('Foobar1', obj, 'definitions.foobar1.items.anyOf[*].enum[*]') + Foobar2 = utils.create_enum_from_json_schema_enum('Foobar2', obj, 'definitions.foobar2.items.anyOf[*].properties.type.enum[*]') + Foobar3 = utils.create_enum_from_json_schema_enum('Foobar3', obj, 'definitions.foobar3.enum[*]') + + self.assertSequenceEqual(builtins.list(Foobar1.__members__.keys()), ['AAA', 'BBB', 'CCC', 'DDD']) + self.assertSequenceEqual([value.value for value in Foobar1.__members__.values()], ['AAA', 'BBB', 'CCC', 'DDD']) + + self.assertSequenceEqual(builtins.list(Foobar2.__members__.keys()), ['EEE', 'FFF', 'GGG']) + self.assertSequenceEqual([value.value for value in Foobar2.__members__.values()], ['EEE', 'FFF', 'GGG']) + + self.assertSequenceEqual(builtins.list(Foobar3.__members__.keys()), ['HHH', 'III', 'JJJ']) + self.assertSequenceEqual([value.value for value in Foobar3.__members__.values()], ['HHH', 'III', 'JJJ']) + + self.assertTrue(Foobar1.AAA.name == 'AAA') + self.assertTrue(Foobar1.AAA.value == 'AAA') + self.assertTrue(Foobar1.AAA == Foobar1.AAA) + self.assertTrue(Foobar1.AAA == 'AAA') + + def test_extendable_enum(self): + class Foobar(utils.Enum): + AAA = 1 + BBB = 2 + CCC = 3 + + self.assertSequenceEqual(builtins.list(Foobar.__members__.keys()), ['AAA', 'BBB', 'CCC']) + self.assertSequenceEqual([value.value for value in Foobar.__members__.values()], [1, 2, 3]) + + with self.assertRaises(AttributeError): + Foobar.register_value('CCC', 5) + + self.assertSequenceEqual(builtins.list(Foobar.__members__.keys()), ['AAA', 'BBB', 'CCC']) + self.assertSequenceEqual([value.value for value in Foobar.__members__.values()], [1, 2, 3]) + + Foobar.register_value('DDD', 4) + + self.assertSequenceEqual(builtins.list(Foobar.__members__.keys()), ['AAA', 'BBB', 'CCC', 'DDD']) + self.assertSequenceEqual([value.value for value in Foobar.__members__.values()], [1, 2, 3, 4]) + + self.assertEqual(Foobar['DDD'], 'DDD') + self.assertEqual(Foobar(4), 'DDD') + + Foobar.register_value('EEE', 4) + + self.assertSequenceEqual(builtins.list(Foobar.__members__.keys()), ['AAA', 'BBB', 'CCC', 'DDD', 'EEE']) + self.assertSequenceEqual([value.value for value in Foobar.__members__.values()], [1, 2, 3, 4, 4]) + + self.assertEqual(Foobar['EEE'], 'DDD') + self.assertEqual(Foobar(4), 'DDD') + + def test_redirect(self): + old_stdout = sys.stdout + old_stderr = sys.stderr + + test_stream = io.StringIO() + sys.stdout = test_stream + sys.stderr = test_stream + + logger = logging.getLogger('test_logger') + with self.assertLogs(logger=logger, level=logging.DEBUG) as cm: + with utils.redirect_to_logging(logger=logger, pass_through=False): + print("Test.") + + self.assertEqual(len(cm.records), 1) + self.assertEqual(cm.records[0].message, "Test.") + + with self.assertLogs(logger=logger, level=logging.DEBUG) as cm: + with utils.redirect_to_logging(logger=logger, pass_through=False): + print("foo", "bar") + + self.assertEqual(len(cm.records), 1) + self.assertEqual(cm.records[0].message, "foo bar") + + with 
self.assertLogs(logger=logger, level=logging.DEBUG) as cm: + with utils.redirect_to_logging(logger=logger, pass_through=False): + print("Test.\nTe", end="") + print("st2.", end="") + + # The incomplete line should not be written to the logger. + self.assertEqual(len(cm.records), 1) + self.assertEqual(cm.records[0].message, "Test.") + + # Remaining contents should be written to logger upon closing. + self.assertEqual(len(cm.records), 2) + self.assertEqual(cm.records[0].message, "Test.") + self.assertEqual(cm.records[1].message, "Test2.") + + with self.assertLogs(logger=logger, level=logging.DEBUG) as cm: + with utils.redirect_to_logging(logger=logger, pass_through=False): + print("Test. ") + print(" ") + print(" Test2.") + print(" ") + + # Trailing whitespace and new lines should not be logged. + self.assertEqual(len(cm.records), 2) + self.assertEqual(cm.records[0].message, "Test.") + self.assertEqual(cm.records[1].message, " Test2.") + + logger2 = logging.getLogger('test_logger2') + with self.assertLogs(logger=logger, level=logging.DEBUG) as cm: + with self.assertLogs(logger=logger2, level=logging.DEBUG) as cm2: + with utils.redirect_to_logging(logger=logger, pass_through=True): + print("Test.") + with utils.redirect_to_logging(logger=logger2, pass_through=True): + print("Test2.") + + self.assertEqual(len(cm.records), 1) + self.assertEqual(cm.records[0].message, "Test.") + self.assertEqual(len(cm2.records), 1) + self.assertEqual(cm2.records[0].message, "Test2.") + + pass_through_lines = test_stream.getvalue().split('\n') + self.assertEqual(len(pass_through_lines), 3) + self.assertEqual(pass_through_lines[0], "Test.") + self.assertEqual(pass_through_lines[1], "Test2.") + self.assertEqual(pass_through_lines[2], "") + + records = [] + + def callback(record): + nonlocal records + records.append(record) + + # Test recursion prevention. + with self.assertLogs(logger=logger, level=logging.DEBUG) as cm: + with self.assertLogs(logger=logger2, level=logging.DEBUG) as cm2: + # We add it twice so that we test that handler does not modify record while running. + logger2.addHandler(utils.CallbackHandler(callback)) + logger2.addHandler(utils.CallbackHandler(callback)) + + with utils.redirect_to_logging(logger=logger, pass_through=False): + print("Test.") + with utils.redirect_to_logging(logger=logger2, pass_through=False): + # We configure handler after redirecting. + handler = logging.StreamHandler(sys.stdout) + handler.setFormatter(logging.Formatter('Test format: %(message)s')) + logger2.addHandler(handler) + print("Test2.") + + # We use outer "redirect_to_logging" to make sure nothing from inner gets out. + self.assertEqual(len(cm.records), 1) + self.assertEqual(cm.records[0].message, "Test.") + + self.assertEqual(len(cm2.records), 2) + # This one comes from the print. + self.assertEqual(cm2.records[0].message, "Test2.") + # And this one comes from the stream handler. 
+ self.assertEqual(cm2.records[1].message, "Test format: Test2.") + + self.assertEqual(len(records), 4) + self.assertEqual(records[0]['message'], "Test2.") + self.assertEqual(records[1]['message'], "Test2.") + self.assertEqual(records[2]['message'], "Test format: Test2.") + self.assertEqual(records[3]['message'], "Test format: Test2.") + + test_stream.close() + sys.stdout = old_stdout + sys.stderr = old_stderr + + def test_columns_sum(self): + dataframe = container.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]}, generate_metadata=True) + + dataframe_sum = utils.columns_sum(dataframe) + + self.assertEqual(dataframe_sum.values.tolist(), [[6, 15]]) + self.assertEqual(dataframe_sum.metadata.query((metadata_base.ALL_ELEMENTS, 0))['name'], 'a') + self.assertEqual(dataframe_sum.metadata.query((metadata_base.ALL_ELEMENTS, 1))['name'], 'b') + + array = container.ndarray(dataframe, generate_metadata=True) + + array_sum = utils.columns_sum(array) + + self.assertEqual(array_sum.tolist(), [[6, 15]]) + self.assertEqual(array_sum.metadata.query((metadata_base.ALL_ELEMENTS, 0))['name'], 'a') + self.assertEqual(array_sum.metadata.query((metadata_base.ALL_ELEMENTS, 1))['name'], 'b') + + def test_numeric(self): + self.assertTrue(utils.is_float(type(1.0))) + self.assertFalse(utils.is_float(type(1))) + self.assertFalse(utils.is_int(type(1.0))) + self.assertTrue(utils.is_int(type(1))) + self.assertTrue(utils.is_numeric(type(1.0))) + self.assertTrue(utils.is_numeric(type(1))) + + def test_yaml_representers(self): + self.assertEqual(utils.yaml_load(utils.yaml_dump(numpy.int32(1))), 1) + self.assertEqual(utils.yaml_load(utils.yaml_dump(numpy.int64(1))), 1) + self.assertEqual(utils.yaml_load(utils.yaml_dump(numpy.float32(1.0))), 1.0) + self.assertEqual(utils.yaml_load(utils.yaml_dump(numpy.float64(1.0))), 1.0) + + def test_json_schema_python_type(self): + schemas = copy.copy(metadata_base.SCHEMAS) + schemas['http://example.com/testing_python_type.json'] = { + 'id': 'http://example.com/testing_python_type.json', + 'properties': { + 'foobar': { + '$ref': 'https://metadata.datadrivendiscovery.org/schemas/v0/definitions.json#/definitions/python_type', + }, + }, + } + + validator, = utils.load_schema_validators(schemas, ('testing_python_type.json',)) + + validator.validate({'foobar': 'str'}) + validator.validate({'foobar': str}) + + with self.assertRaisesRegex(jsonschema.exceptions.ValidationError, 'python-type'): + validator.validate({'foobar': 1}) + + def test_json_schema_numeric(self): + schemas = copy.copy(metadata_base.SCHEMAS) + schemas['http://example.com/testing_numeric.json'] = { + 'id': 'http://example.com/testing_numeric.json', + 'properties': { + 'int': { + 'type': 'integer', + }, + 'float': { + 'type': 'number', + }, + }, + } + + validator, = utils.load_schema_validators(schemas, ('testing_numeric.json',)) + + validator.validate({'float': 0}) + validator.validate({'float': 1.0}) + validator.validate({'float': 1.2}) + + with self.assertRaisesRegex(jsonschema.exceptions.ValidationError, 'float'): + validator.validate({'float': '1.2'}) + + validator.validate({'int': 0}) + validator.validate({'int': 1.0}) + + with self.assertRaisesRegex(jsonschema.exceptions.ValidationError, 'int'): + validator.validate({'int': 1.2}) + + with self.assertRaisesRegex(jsonschema.exceptions.ValidationError, 'int'): + validator.validate({'int': '1.0'}) + + def test_digest(self): + self.assertEqual(utils.compute_digest({'a': 1.0, 'digest': 'xxx'}), utils.compute_digest({'a': 1.0})) + self.assertEqual(utils.compute_hash_id({'a': 1.0, 
'id': 'xxx'}), utils.compute_hash_id({'a': 1.0})) + + self.assertEqual(utils.compute_digest({'a': 1.0}), utils.compute_digest({'a': 1})) + self.assertEqual(utils.compute_hash_id({'a': 1.0}), utils.compute_hash_id({'a': 1})) + + def test_json_equals(self): + basic_cases = ['hello', 0, -2, 3.14, False, True, [1, 2, 3], {'a': 1}, set(['z', 'y', 'x'])] + for case in basic_cases: + self.assertTrue(utils.json_structure_equals(case, case)) + + self.assertFalse(utils.json_structure_equals({'extra_key': 'value'}, {})) + self.assertFalse(utils.json_structure_equals({}, {'extra_key': 'value'})) + self.assertTrue(utils.json_structure_equals({}, {'extra_key': 'value'}, ignore_keys={'extra_key'})) + + list1 = {'a': builtins.list('type')} + list2 = {'a': builtins.list('typo')} + self.assertFalse(utils.json_structure_equals(list1, list2)) + + json1 = { + 'a': 1, + 'b': True, + 'c': 'hello', + 'd': -2.4, + 'e': { + 'a': 'world', + }, + 'f': [ + 0, + 1, + 2 + ], + 'ignore': { + 'a': False + }, + 'deep': [ + { + 'a': {}, + 'ignore': {} + }, + { + 'b': [], + 'ignore': -1 + } + ] + } + json2 = { + 'a': 1, + 'b': True, + 'c': 'hello', + 'd': -2.4, + 'e': { + 'a': 'world', + }, + 'f': [ + 0, + 1, + 2 + ], + 'ignore': { + 'a': True + }, + 'deep': [ + { + 'a': {}, + 'ignore': { + 'not_empty': 'hello world' + } + }, + { + 'b': [], + 'ignore': 1 + } + ] + } + + self.assertTrue(utils.json_structure_equals(json1, json2, ignore_keys={'ignore'})) + self.assertFalse(utils.json_structure_equals(json1, json2)) + + def test_reversible_json(self): + for obj in [ + 1, + "foobar", + b"foobar", + [1, 2, 3], + [1, [2], 3], + 1.2, + type(None), + int, + str, + numpy.ndarray, + {'foo': 'bar'}, + {'encoding': 'something', 'value': 'else'}, + metadata_base.NO_VALUE, + metadata_base.ALL_ELEMENTS, + ]: + self.assertEqual(utils.from_reversible_json_structure(json.loads(json.dumps(utils.to_reversible_json_structure(obj)))), obj, str(obj)) + + self.assertTrue(numpy.isnan(utils.from_reversible_json_structure(json.loads(json.dumps(utils.to_reversible_json_structure(float('nan'))))))) + + self.assertEqual(utils.from_reversible_json_structure(json.loads(json.dumps(utils.to_reversible_json_structure(numpy.array([1, 2, 3]))))).tolist(), [1, 2, 3]) + + with self.assertRaises(TypeError): + utils.to_reversible_json_structure({1: 2}) + + def test_global_randomness_warning(self): + with self.assertLogs(logger=utils.logger, level=logging.DEBUG) as cm: + with utils.global_randomness_warning(): + random.randint(0, 10) + + self.assertEqual(len(cm.records), 1) + self.assertEqual(cm.records[0].message, "Using global/shared random source using 'random.randint' can make execution not reproducible.") + + with self.assertLogs(logger=utils.logger, level=logging.DEBUG) as cm: + with utils.global_randomness_warning(): + numpy.random.randint(0, 10) + + self.assertEqual(len(cm.records), 1) + self.assertEqual(cm.records[0].message, "Using global/shared random source using 'numpy.random.randint' can make execution not reproducible.") + + if hasattr(numpy.random, 'default_rng'): + with self.assertLogs(logger=utils.logger, level=logging.DEBUG) as cm: + with utils.global_randomness_warning(): + numpy.random.default_rng() + + self.assertEqual(len(cm.records), 1) + self.assertEqual(cm.records[0].message, "Using 'numpy.random.default_rng' without a seed can make execution not reproducible.") + + def test_yaml_float_parsing(self): + self.assertEqual(json.loads('1000.0'), 1000) + self.assertEqual(utils.yaml_load('1000.0'), 1000) + + self.assertEqual(json.loads('1e+3'), 
1000) + self.assertEqual(utils.yaml_load('1e+3'), 1000) + + +if __name__ == '__main__': + unittest.main() diff --git a/datasets/anomaly/kpi/SCORE/dataset_TEST/datasetDoc.json b/datasets/anomaly/kpi/SCORE/dataset_TEST/datasetDoc.json new file mode 100644 index 0000000..2a04d60 --- /dev/null +++ b/datasets/anomaly/kpi/SCORE/dataset_TEST/datasetDoc.json @@ -0,0 +1,63 @@ +{ + "about": { + "datasetID": "kpi_dataset_TEST", + "datasetName": "NULL", + "description": "Database of baseball players and play statistics, including 'Games_played', 'At_bats', 'Runs', 'Hits', 'Doubles', 'Triples', 'Home_runs', 'RBIs', 'Walks', 'Strikeouts', 'Batting_average', 'On_base_pct', 'Slugging_pct' and 'Fielding_ave'", + "citation": " @book{simonoff2003analyzing,title={Analyzing Categorical Data},author={Simonoff, J.S.},isbn={9780387007496},lccn={2003044946},series={Springer Texts in Statistics},url={https://books.google.com/books?id=G8wrifweAoC},year={2003},publisher={Springer New York}} ", + "license": " CC Public Domain Mark 1.0 ", + "source": "OpenML", + "sourceURI": "http://www.openml.org/d/185", + "approximateSize": "", + "datasetSchemaVersion": "4.0.0", + "redacted": false, + "datasetVersion": "4.0.0" + }, + "dataResources": [ + { + "resID": "learningData", + "resPath": "tables/learningData.csv", + "resType": "table", + "resFormat": { + "text/csv": [ + "csv" + ] + }, + "isCollection": false, + "columns": [ + { + "colIndex": 0, + "colName": "d3mIndex", + "colType": "integer", + "role": [ + "index" + ] + }, + { + "colIndex": 1, + "colName": "timestamp", + "colType": "integer", + "role": [ + "attribute" + ] + }, + { + "colIndex": 2, + "colName": "value", + "colType": "real", + "role": [ + "attribute" + ] + }, + { + "colIndex": 3, + "colName": "ground_truth", + "colType": "integer", + "role": [ + "suggestedTarget" + ] + } + ], + "columnsCount": 4 + } + ] +} \ No newline at end of file diff --git a/datasets/anomaly/kpi/SCORE/dataset_TEST/tables/learningData.csv b/datasets/anomaly/kpi/SCORE/dataset_TEST/tables/learningData.csv new file mode 100644 index 0000000..b9e432d --- /dev/null +++ b/datasets/anomaly/kpi/SCORE/dataset_TEST/tables/learningData.csv @@ -0,0 +1,1758 @@ +d3mIndex,timestamp,value,ground_truth +7027,1475026500,0.32264705162415364,0 +7028,1475026800,0.32183430507799304,0 +7029,1475027100,0.31787914535951506,0 +7030,1475027400,0.3296732765365322,0 +7031,1475027700,0.33072178162272026,0 +7032,1475028000,0.3282773378117453,0 +7033,1475028300,0.3412378533449643,0 +7034,1475028600,0.3444485124115538,0 +7035,1475028900,0.34747304631385745,0 +7036,1475029200,0.34477423144747743,0 +7037,1475029500,0.34249419819706234,0 +7038,1475029800,0.3547319276800169,0 +7039,1475030100,0.3569188983482892,0 +7040,1475030400,0.3528241447571223,0 +7041,1475030700,0.3536617079911538,0 +7042,1475031000,0.3595928965267984,0 +7043,1475031300,0.3414456931108743,0 +7044,1475031600,0.3444702270131781,0 +7045,1475031900,0.3567327731850544,0 +7046,1475032200,0.344169324666176,0 +7047,1475032500,0.34747304631385745,0 +7048,1475032800,0.3413309159271072,0 +7049,1475033100,0.3411665053665474,0 +7050,1475033400,0.3484253867327069,0 +7051,1475033700,0.3466571976814503,0 +7052,1475034000,0.3524518944306527,0 +7053,1475034300,0.3450999504823503,0 +7054,1475034600,0.34230807303382743,0 +7055,1475034900,0.32953368266384336,0 +7056,1475035200,0.3585940248174029,0 +7057,1475035500,0.3494738918188949,0 +7058,1475035800,0.3478918279308732,0 +7059,1475036100,0.3570584922209781,0 +7060,1475036400,0.3642925568982159,0 
+7061,1475036700,0.3735522837694128,0 +7062,1475037000,0.371529723662751,0 +7063,1475037300,0.36375899809743295,0 +7064,1475037600,0.3717623801165319,0 +7065,1475037900,0.3745759721677299,0 +7066,1475038200,0.3771134785597968,0 +7067,1475038500,0.38916508287951657,0 +7068,1475038800,0.3930954259090729,0 +7069,1475039100,0.3960051826276095,0 +7070,1475039400,0.3930023633279808,0 +7071,1475039700,0.37371669432997257,0 +7072,1475040000,0.3825328228962828,0 +7073,1475040300,0.35663971060291155,0 +7074,1475040600,0.3567793044756004,0 +7075,1475040900,0.3473334524411687,0 +7076,1475041200,0.35570908478673724,0 +7077,1475041500,0.3453077902482603,0 +7078,1475041800,0.3417031662535769,0 +7079,1475042100,0.34623841606443456,0 +7080,1475042400,0.3279050874852757,0 +7081,1475042700,0.3283486857912131,0 +7082,1475043000,0.3181583331038419,0 +7083,1475043300,0.3134586727324244,0 +7084,1475043600,0.3296050306433111,0 +7085,1475043900,0.3221817387155411,0 +7086,1475044200,0.3210184564455859,0 +7087,1475044500,0.31680892567065194,0 +7088,1475044800,0.3279764354647434,0 +7089,1475045100,0.3037088162647441,0 +7090,1475045400,0.30889860623264503,0 +7091,1475045700,0.3124349843341073,0 +7092,1475046000,0.3008021616321388,0 +7093,1475046300,0.3049651611168421,0 +7094,1475046600,0.3101084197941969,0 +7095,1475046900,0.302173283668004,0 +7096,1475047200,0.3071304171824393,0 +7097,1475047500,0.3046177274782433,0 +7098,1475047800,0.3044998482082296,0 +7099,1475048100,0.296124215862766,0 +7100,1475048400,0.3014287830150645,0 +7101,1475048700,0.2965678141684933,0 +7102,1475049000,0.2985686596733205,0 +7103,1475049300,0.2996854106527297,0 +7104,1475049600,0.2962420951327797,0 +7105,1475049900,0.2976845651479024,0 +7106,1475050200,0.29400859317396144,0 +7107,1475050500,0.3005912197805077,0 +7108,1475050800,0.29345021768425683,0 +7109,1475051100,0.28656358664446197,0 +7110,1475051400,0.2895881205470809,0 +7111,1475051700,0.2858873318847437,0 +7112,1475052000,0.28960983514944083,0 +7113,1475052300,0.28656358664446197,0 +7114,1475052600,0.2957302509339364,0 +7115,1475052900,0.2834677047625855,0 +7116,1475053200,0.29375112003146897,0 +7117,1475053500,0.2945204373729098,0 +7118,1475053800,0.2802353310943717,0 +7119,1475054100,0.2943808435004311,0 +7120,1475054400,0.2876555209354749,0 +7121,1475054700,0.28837830698605443,0 +7122,1475055000,0.2714161004429708,0 +7123,1475055300,0.27325563747295945,0 +7124,1475055600,0.2763267026664397,0 +7125,1475055900,0.2695083175197609,0 +7126,1475056200,0.2664620690147821,0 +7127,1475056500,0.2637849687502365,0 +7128,1475056800,0.2611575018625886,0 +7129,1475057100,0.2521304314454878,0 +7130,1475057400,0.2520373688638704,0 +7131,1475057700,0.2764197652480572,0 +7132,1475058000,0.2607387202452576,0 +7133,1475058300,0.24971080432343465,0 +7134,1475058600,0.2539916830779415,0 +7135,1475058900,0.2424519229571701,0 +7136,1475059200,0.2554093364045628,0 +7137,1475059500,0.2629008742249235,0 +7138,1475059800,0.2538055579147066,0 +7139,1475060100,0.2649482510205069,0 +7140,1475060400,0.2632979412397895,0 +7141,1475060700,0.264926536418147,0 +7142,1475061000,0.2610644392809712,0 +7143,1475061300,0.25969021515906965,0 +7144,1475061600,0.27055682260605396,0 +7145,1475061900,0.2705102913151928,0 +7146,1475062200,0.27753651622746633,0 +7147,1475062500,0.2853754876857953,0 +7148,1475062800,0.2817243324002506,0 +7149,1475063100,0.2922404041232303,0 +7150,1475063400,0.29065834023573395,0 +7151,1475063700,0.2966360600617144,0 +7152,1475064000,0.2832350483085944,0 +7153,1475064300,0.293518463577478,0 
+7154,1475064600,0.2871436767365265,0 +7155,1475064900,0.29491440230173943,0 +7156,1475065200,0.2984042491124456,0 +7157,1475065500,0.3362590052292199,0 +7158,1475065800,0.31566735800232104,0 +7159,1475066100,0.2938907139039477,0 +7160,1475066400,0.3159465457476987,0 +7161,1475066700,0.30140706841270465,0 +7162,1475067000,0.2997536565458457,0 +7163,1475067300,0.2992200977446425,0 +7164,1475067600,0.2795125450437922,0 +7165,1475067900,0.2827945520889036,0 +7166,1475068200,0.2817243324002506,0 +7167,1475068500,0.2869358369709317,0 +7168,1475068800,0.29658952877085315,0 +7169,1475069100,0.2933106238117782,0 +7170,1475069400,0.2914959034702908,0 +7171,1475069700,0.2925195918680826,0 +7172,1475070000,0.30205850648402666,0 +7173,1475070300,0.2888901511848977,0 +7174,1475070600,0.2945886832660259,0 +7175,1475070900,0.30221981495876016,0 +7176,1475071200,0.2890297450573764,0 +7177,1475071500,0.2816995157118543,0 +7178,1475071800,0.2820252347475679,0 +7179,1475072100,0.2830023918544983,0 +7180,1475072400,0.2746050449064648,0 +7181,1475072700,0.2782344855895445,0 +7182,1475073000,0.281165956910546,0 +7183,1475073300,0.2609248454084925,0 +7184,1475073600,0.2648086571480282,0 +7185,1475073900,0.2579220261082334,0 +7186,1475074200,0.2676470658874123,0 +7187,1475074500,0.24947814786944364,0 +7188,1475074800,0.2447784874977109,0 +7189,1475075100,0.23386534875986525,0 +7190,1475075400,0.2357514170806101,0 +7191,1475075700,0.2283281251534706,0 +7192,1475076000,0.2276084411890325,0 +7193,1475076300,0.21660223986956945,0 +7194,1475076600,0.21753286568574376,0 +7195,1475076900,0.20918205002846632,0 +7196,1475077200,0.20020151090228985,0 +7197,1475077500,0.20303991964166346,0 +7198,1475077800,0.19931741637691366,0 +7199,1475078100,0.18635690084348466,0 +7200,1475078400,0.18859040280232395,0 +7201,1475078700,0.17681798622754055,0 +7202,1475079000,0.18230867854307395,0 +7203,1475079300,0.16653457095865698,0 +7204,1475079600,0.1629764782548348,0 +7205,1475079900,0.15660169141388328,0 +7206,1475080200,0.1467835890530869,0 +7207,1475080500,0.14766768357850502,0 +7208,1475080800,0.13910592606959646,0 +7209,1475081100,0.13633576322343344,0 +7210,1475081400,0.13194010728494932,0 +7211,1475081700,0.1272869782039728,0 +7212,1475082000,0.12249425525062264,0 +7213,1475082300,0.11211777740012167,0 +7214,1475082600,0.10723199186515407,0 +7215,1475082900,0.10087891962666752,0 +7216,1475083200,0.0976465459584538,0 +7217,1475083500,0.09383098011203414,0 +7218,1475083800,0.08482562429739833,0 +7219,1475084100,0.08080221868569913,0 +7220,1475084400,0.08082393328732347,0 +7221,1475084700,0.07554418282352629,0 +7222,1475085000,0.07328586417683709,0 +7223,1475085300,0.07188992545205025,0 +7224,1475085600,0.06672495217307105,0 +7225,1475085900,0.06828530145736683,0 +7226,1475086200,0.062422358815994186,0 +7227,1475086500,0.0631203281783876,0 +7228,1475086800,0.0577226984435259,0 +7229,1475087100,0.056280228429243837,0 +7230,1475087400,0.054626816561754436,0 +7231,1475087700,0.05669901004625956,0 +7232,1475088000,0.052579439766171,0 +7233,1475088300,0.057396979408653015,0 +7234,1475088600,0.05271903363885985,0 +7235,1475088900,0.051276563623527,0 +7236,1475089200,0.05116178643975985,0 +7237,1475089500,0.0513013803124487,0 +7238,1475089800,0.05243984589453297,0 +7239,1475090100,0.052067595568063264,0 +7240,1475090400,0.05172016192946443,0 +7241,1475090700,0.047439283175062685,0 +7242,1475091000,0.047740185522064715,0 +7243,1475091300,0.046462126068342366,0 +7244,1475091600,0.04441474927275894,0 +7245,1475091900,0.0436237173292735,0 
+7246,1475092200,0.04120409020616953,0 +7247,1475092500,0.04348412345658465,0 +7248,1475092800,0.04278615409419124,0 +7249,1475093100,0.04299399386010122,0 +7250,1475093400,0.04108621093615582,0 +7251,1475093700,0.04246043505826752,0 +7252,1475094000,0.0424139037677215,0 +7253,1475094300,0.04117927351829865,0 +7254,1475094600,0.04285439998741237,0 +7255,1475094900,0.04255349764041035,0 +7256,1475095200,0.041504992554222346,0 +7257,1475095500,0.041132742227752636,0 +7258,1475095800,0.04339106087444181,0 +7259,1475096100,0.040971433752388674,0 +7260,1475096400,0.03929630728327494,0 +7261,1475096700,0.04031999568159207,0 +7262,1475097000,0.04280786869686635,0 +7263,1475097300,0.04115755891562354,0 +7264,1475097600,0.04255349764041035,0 +7265,1475097900,0.04208818473179781,0 +7266,1475098200,0.0413188673909875,0 +7267,1475098500,0.0413188673909875,0 +7268,1475098800,0.04557803154271409,0 +7269,1475099100,0.04632253219565356,0 +7270,1475099400,0.04841644028178297,0 +7271,1475099700,0.04683437639481212,0 +7272,1475100000,0.05144097418513755,0 +7273,1475100300,0.05232506870971503,0 +7274,1475100600,0.05797706949998192,0 +7275,1475100900,0.056767255939480725,0 +7276,1475101200,0.06551513861151907,0 +7277,1475101500,0.06414401657565393,0 +7278,1475101800,0.0640974852851079,0 +7279,1475102100,0.0660052682080025,0 +7280,1475102400,0.07026133027453338,0 +7281,1475102700,0.07149596052395624,0 +7282,1475103000,0.07317108699306994,0 +7283,1475103300,0.07514711580918569,0 +7284,1475103600,0.07859043132850517,0 +7285,1475103900,0.08021902650707287,0 +7286,1475104200,0.08454643655307144,0 +7287,1475104500,0.0854305310776489,0 +7288,1475104800,0.08652556745543376,0 +7289,1475105100,0.08682646980243576,0 +7290,1475105400,0.08719872012890545,0 +7291,1475105700,0.0928072917138221,0 +7292,1475106000,0.0912252278268512,0 +7293,1475106300,0.09501597698466444,0 +7294,1475106600,0.10369561376369177,0 +7295,1475106900,0.10176301415208568,0 +7296,1475107200,0.10364908247283053,0 +7297,1475107500,0.1077438360641025,0 +7298,1475107800,0.11169899578289576,0 +7299,1475108100,0.1208408433838688,0 +7300,1475108400,0.11490965484874965,0 +7301,1475108700,0.11870040400666795,0 +7302,1475109000,0.11879346658828535,0 +7303,1475109300,0.1214922814551908,0 +7304,1475109600,0.12835719789262576,0 +7305,1475109900,0.12186453178166053,0 +7306,1475110200,0.12688991118910678,0 +7307,1475110500,0.13098466478037873,0 +7308,1475110800,0.14366599256895554,0 +7309,1475111100,0.13545166869801534,0 +7310,1475111400,0.1395464222892873,0 +7311,1475111700,0.14645787001747845,0 +7312,1475112000,0.14405995749778516,0 +7313,1475112300,0.15041302973627171,0 +7314,1475112600,0.15622944108741355,0 +7315,1475112900,0.15308702791478088,0 +7316,1475113200,0.15367022009288173,0 +7317,1475113500,0.1577401569957574,0 +7318,1475113800,0.16897901476921154,0 +7319,1475114100,0.16688510668276682,0 +7320,1475114400,0.1693977963864374,0 +7321,1475114700,0.168001857662176,0 +7322,1475115000,0.16997788647860698,0 +7323,1475115300,0.17419051933989266,0 +7324,1475115600,0.16960563615213722,0 +7325,1475115900,0.17532898492166168,0 +7326,1475116200,0.1670712318460017,0 +7327,1475116500,0.17877230044150655,0 +7328,1475116800,0.1746558322479798,0 +7329,1475117100,0.1758191145181451,0 +7330,1475117400,0.17467754685033965,0 +7331,1475117700,0.1788188317323678,0 +7332,1475118000,0.1797494575485421,0 +7333,1475118300,0.18596293591465504,0 +7334,1475118600,0.1844739346086711,0 +7335,1475118900,0.17339948739609193,0 +7336,1475119200,0.17705064268163664,0 
+7337,1475119500,0.17128076262125094,0 +7338,1475119800,0.16681375870350929,0 +7339,1475120100,0.1842164614662838,0 +7340,1475120400,0.1843095240479012,0 +7341,1475120700,0.18377596524659293,0 +7342,1475121000,0.19908475992285968,0 +7343,1475121300,0.1939415012453997,0 +7344,1475121600,0.2005737612287596,0 +7345,1475121900,0.21095023907930247,0 +7346,1475122200,0.22572237286818256,0 +7347,1475122500,0.2368216367692631,0 +7348,1475122800,0.2511067430476961,0 +7349,1475123100,0.2615762834798145,0 +7350,1475123400,0.28062929602320136,0 +7351,1475123700,0.277744355993061,0 +7352,1475124000,0.2713478545498547,0 +7353,1475124300,0.2875872750422537,0 +7354,1475124600,0.2715339797130896,0 +7355,1475124900,0.27039241204517905,0 +7356,1475125200,0.2593179648325999,0 +7357,1475125500,0.2555489302770415,0 +7358,1475125800,0.2493602685993249,0 +7359,1475126100,0.2434290800642057,0 +7360,1475126400,0.2381245129119071,0 +7361,1475126700,0.22241865122071133,0 +7362,1475127000,0.20887804559521767,0 +7363,1475127300,0.20415667062108311,0 +7364,1475127600,0.19252384791874685,0 +7365,1475127900,0.1901507520874604,0 +7366,1475128200,0.17954161778294736,0 +7367,1475128500,0.17546857879403527,0 +7368,1475128800,0.17046801607509038,0 +7369,1475129100,0.16383575609175155,0 +7370,1475129400,0.16162707082132954,0 +7371,1475129700,0.1565551601231271,0 +7372,1475130000,0.16313778672956827,0 +7373,1475130300,0.16139441436723342,0 +7374,1475130600,0.16497732375955698,0 +7375,1475130900,0.16320913470882584,0 +7376,1475131200,0.16688510668276682,0 +7377,1475131500,0.16078950758677268,0 +7378,1475131800,0.16348832245367814,0 +7379,1475132100,0.15636903495989227,0 +7380,1475132400,0.16825622871863194,0 +7381,1475132700,0.1679770409737797,0 +7382,1475133000,0.1589034392659227,0 +7383,1475133300,0.16420800641822128,0 +7384,1475133600,0.1661406060298274,0 +7385,1475133900,0.15827681788299702,0 +7386,1475134200,0.1581372240106234,0 +7387,1475134500,0.16104387864322867,0 +7388,1475134800,0.16095081606161127,0 +7389,1475135100,0.16290513027557724,0 +7390,1475135400,0.1705610786567078,0 +7391,1475135700,0.16592966417819624,0 +7392,1475136000,0.16188144187778555,0 +7393,1475136300,0.1679553263714198,0 +7394,1475136600,0.16597619546895245,0 +7395,1475136900,0.16923338582566752,0 +7396,1475137200,0.16869982702435926,0 +7397,1475137500,0.16816316613701454,0 +7398,1475137800,0.16895419808081524,0 +7399,1475138100,0.17228273641668282,0 +7400,1475138400,0.17402610877901767,0 +7401,1475138700,0.1787040545483905,0 +7402,1475139000,0.18149593199691336,0 +7403,1475139300,0.1832641210476445,0 +7404,1475139600,0.1798673368185558,0 +7405,1475139900,0.18086620852795124,0 +7406,1475140200,0.1858698733330376,0 +7407,1475140500,0.2018052893921881,0 +7408,1475140800,0.19847985314234112,0 +7409,1475141100,0.1994104789585312,0 +7410,1475141400,0.1929178128475975,0 +7411,1475141700,0.20757516945254226,0 +7412,1475142000,0.2017835747898072,0 +7413,1475142300,0.2094612377733186,0 +7414,1475142600,0.21634786881311344,0 +7415,1475142900,0.2105997033551927,0 +7416,1475143200,0.2147658049256172,0 +7417,1475143500,0.2158112079257688,0 +7418,1475143800,0.2109254223908012,0 +7419,1475144100,0.2267925925568356,0 +7420,1475144400,0.21751115108338387,0 +7421,1475144700,0.2202564972411505,0 +7422,1475145000,0.2234671563070044,0 +7423,1475145300,0.21932587142497625,0 +7424,1475145600,0.22132671692969846,0 +7425,1475145900,0.2261876857762697,0 +7426,1475146200,0.23635632386117605,0 +7427,1475146500,0.2259798460106749,0 +7428,1475146800,0.2263738109395045,0 
+7429,1475147100,0.2305864438007903,0 +7430,1475147400,0.2418004848858481,0 +7431,1475147700,0.23889072816720644,0 +7432,1475148000,0.2488732410888778,0 +7433,1475148300,0.2500830546499044,0 +7434,1475148600,0.25289664670089224,0 +7435,1475148900,0.2560390598735249,0 +7436,1475149200,0.2524561504812013,0 +7437,1475149500,0.2655996890918289,0 +7438,1475149800,0.2696944426829958,0 +7439,1475150100,0.2600159341946781,0 +7440,1475150400,0.2715805110038457,0 +7441,1475150700,0.2727437932741161,0 +7442,1475151000,0.27148744842222833,0 +7443,1475151300,0.2720458239119329,0 +7444,1475151600,0.2779552978446922,0 +7445,1475151900,0.2750455411261556,0 +7446,1475152200,0.27969867020702704,0 +7447,1475152500,0.27292991843735104,0 +7448,1475152800,0.2846309870328559,0 +7449,1475153100,0.2743723884523686,0 +7450,1475153400,0.2822113599108027,0 +7451,1475153700,0.29072658612885,0 +7452,1475154000,0.2876803376238712,0 +7453,1475154300,0.28414395952240884,0 +7454,1475154600,0.2844913931603772,0 +7455,1475154900,0.28647052406284457,0 +7456,1475155200,0.28076888989568005,0 +7457,1475155500,0.29070487152649016,0 +7458,1475155800,0.2940768390671825,0 +7459,1475156100,0.2925661231588387,0 +7460,1475156400,0.28363211532346044,0 +7461,1475156700,0.28833177569519314,0 +7462,1475157000,0.2873080872974014,0 +7463,1475157300,0.29170374323588555,0 +7464,1475157600,0.2816312698186332,0 +7465,1475157900,0.28635264479283085,0 +7466,1475158200,0.2927057170313174,0 +7467,1475158500,0.2786067359161193,0 +7468,1475158800,0.28986730829193325,0 +7469,1475159100,0.2874942124606363,0 +7470,1475159400,0.2751634203961693,0 +7471,1475159700,0.2706250684992752,0 +7472,1475160000,0.2682519726679782,0 +7473,1475160300,0.26411068778595004,0 +7474,1475160600,0.2754891394318829,0 +7475,1475160900,0.26818372677475705,0 +7476,1475161200,0.2478030214002248,0 +7477,1475161500,0.2479426152727035,0 +7478,1475161800,0.23232981616312515,0 +7479,1475162100,0.23754132073380624,0 +7480,1475162400,0.2335861610150129,0 +7481,1475162700,0.2379849190395335,0 +7482,1475163000,0.21734674052250888,0 +7483,1475163300,0.22041780571588404,0 +7484,1475163600,0.2128580220024073,0 +7485,1475163900,0.205645671926983,0 +7486,1475164200,0.19403456382701714,0 +7487,1475164500,0.18023958714502555,0 +7488,1475164800,0.18449564921113606,0 +7489,1475165100,0.17760901817134125,0 +7490,1475165400,0.1739578628857965,0 +7491,1475165700,0.15727484408767026,0 +7492,1475166000,0.15083181135360266,0 +7493,1475166300,0.15085352595596255,0 +7494,1475166600,0.1451549938748344,0 +7495,1475166900,0.1325201973770138,0 +7496,1475167200,0.12510000753601574,0 +7497,1475167500,0.1167709064810982,0 +7498,1475167800,0.11490965484874965,0 +7499,1475168100,0.11058224480348663,0 +7500,1475168400,0.10295111311075236,0 +7501,1475168700,0.0993682037184288,0 +7502,1475169000,0.0974138895043577,0 +7503,1475169300,0.08927091361235978,0 +7504,1475169600,0.08459296784361743,0 +7505,1475169900,0.08147537135917088,0 +7506,1475170200,0.07542630355351257,0 +7507,1475170500,0.07442743184411715,0 +7508,1475170800,0.07314627030414825,0 +7509,1475171100,0.06926245856513794,0 +7510,1475171400,0.06732985895332161,0 +7511,1475171700,0.061631326872508725,0 +7512,1475172000,0.06195704590738161,0 +7513,1475172300,0.0646775753774876,0 +7514,1475172600,0.06293420301515275,0 +7515,1475172900,0.05776922973512274,0 +7516,1475173200,0.05662766206679187,0 +7517,1475173500,0.05367447614395582,0 +7518,1475173800,0.05248637718507899,0 +7519,1475174100,0.05309128396532956,0 +7520,1475174400,0.05386060130719069,0 
+7521,1475174700,0.050743004822744096,0 +7522,1475175000,0.05230025202184412,0 +7523,1475175300,0.0541863203420636,0 +7524,1475175600,0.048649096736614654,0 +7525,1475175900,0.05009156675089673,0 +7526,1475176200,0.04841644028178297,0 +7527,1475176500,0.04608987574187265,0 +7528,1475176800,0.046716497124798376,0 +7529,1475177100,0.04401768225736754,0 +7530,1475177400,0.04359890064035179,0 +7531,1475177700,0.04294746256850437,0 +7532,1475178000,0.04264656022150239,0 +7533,1475178300,0.04101796504293469,0 +7534,1475178600,0.0405061208448269,0 +7535,1475178900,0.04243561837039664,0 +7536,1475179200,0.04190205956856294,0 +7537,1475179500,0.041458461262625534,0 +7538,1475179800,0.041551523844768366,0 +7539,1475180100,0.039901214063525516,0 +7540,1475180400,0.04190205956856294,0 +7541,1475180700,0.040692246008061775,0 +7542,1475181000,0.0389240569568052,0 +7543,1475181300,0.04139021536940436,0 +7544,1475181600,0.041715934405328094,0 +7545,1475181900,0.038179556303865776,0 +7546,1475182200,0.04010905382943551,0 +7547,1475182500,0.04080702319182894,0 +7548,1475182800,0.04115755891562354,0 +7549,1475183100,0.037016274033910605,0 +7550,1475183400,0.04010905382943551,0 +7551,1475183700,0.04122580480884467,0 +7552,1475184000,0.04038824157481321,0 +7553,1475184300,0.040155585119981525,0 +7554,1475184600,0.04083183987969982,0 +7555,1475184900,0.03987639737565465,0 +7556,1475185200,0.04227430989503268,0 +7557,1475185500,0.04478699959922865,0 +7558,1475185800,0.04536708969160833,0 +7559,1475186100,0.04664825123157725,0 +7560,1475186400,0.05146268878676186,0 +7561,1475186700,0.05274385032778155,0 +7562,1475187000,0.05834931982645161,0 +7563,1475187300,0.05932647693422274,0 +7564,1475187600,0.0621648856732916,0 +7565,1475187900,0.0668645460447091,0 +7566,1475188200,0.06679630015148794,0 +7567,1475188500,0.06863273509596564,0 +7568,1475188800,0.07035439285562536,0 +7569,1475189100,0.0710306476153437,0 +7570,1475189400,0.07372946248277452,0 +7571,1475189700,0.07898749834389658,0 +7572,1475190000,0.08243081386426683,0 +7573,1475190300,0.08659381334865487,0 +7574,1475190600,0.09250328728057353,0 +7575,1475190900,0.08682646980243576,0 +7576,1475191200,0.09178360331655576,0 +7577,1475191500,0.092968600189081,0 +7578,1475191800,0.08922438232181379,0 +7579,1475192100,0.09697029119873553,0 +7580,1475192400,0.09655150958140456,0 +7581,1475192700,0.1079764925180935,0 +7582,1475193000,0.10769730477324126,0 +7583,1475193300,0.10509155248795324,0 +7584,1475193600,0.1082773948654108,0 +7585,1475193900,0.1082773948654108,0 +7586,1475194200,0.11711834011917162,0 +7587,1475194500,0.11483830686949208,0 +7588,1475194800,0.1143047480681838,0 +7589,1475195100,0.1182133764962209,0 +7590,1475195400,0.12000328014931196,0 +7591,1475195700,0.12100525394474378,0 +7592,1475196000,0.12430897559221506,0 +7593,1475196300,0.1248673510819196,0 +7594,1475196600,0.13007885565260074,0 +7595,1475196900,0.1352438289324206,0 +7596,1475197200,0.1384762026006343,0 +7597,1475197500,0.1411284861767836,0 +7598,1475197800,0.14229176844705402,0 +7599,1475198100,0.14790034003260116,0 +7600,1475198400,0.14543418161968671,0 +7601,1475198700,0.14838736754304815,0 +7602,1475199000,0.15004077940980198,0 +7603,1475199300,0.1525069378227164,0 +7604,1475199600,0.15427512687344755,0 +7605,1475199900,0.16239328607662887,0 +7606,1475200200,0.16502385505041825,0 +7607,1475200500,0.16672069612189186,0 +7608,1475200800,0.16267247382148114,0 +7609,1475201100,0.16946604227965853,0 +7610,1475201400,0.17847139809429438,0 +7611,1475201700,0.17644573590107082,0 
+7612,1475202000,0.1721431425442041,0 +7613,1475202300,0.16921167122320258,0 +7614,1475202600,0.1674434821724714,0 +7615,1475202900,0.16509210094363938,0 +7616,1475203200,0.16818798282541084,0 +7617,1475203500,0.17802779978856711,0 +7618,1475203800,0.17521420773768434,0 +7619,1475204100,0.17979598883940334,0 +7620,1475204400,0.1797742742369384,0 +7621,1475204700,0.17847139809429438,0 +7622,1475205000,0.1763526733194534,0 +7623,1475205300,0.16628019990230608,0 +7624,1475205600,0.1747706094319571,0 +7625,1475205900,0.1673038883000978,0 +7626,1475206200,0.17037495349347295,0 +7627,1475206500,0.17526073902844053,0 +7628,1475206800,0.1769823967884155,0 +7629,1475207100,0.18035436432910792,0 +7630,1475207400,0.18947449732770005,0 +7631,1475207700,0.18742712053208507,0 +7632,1475208000,0.19871250959638787,0 +7633,1475208300,0.2065762977431678,0 +7634,1475208600,0.22039609111352407,0 +7635,1475208900,0.2313061277653334,0 +7636,1475209200,0.2403114835799692,0 +7637,1475209500,0.24731289180374136,0 +7638,1475209800,0.2546648357516233,0 +7639,1475210100,0.2574101819093901,0 +7640,1475210400,0.2671817529793252,0 +7641,1475210700,0.27814142300792705,0 +7642,1475211000,0.2727655078764761,0 +7643,1475211300,0.2663441897447683,0 +7644,1475211600,0.2507344927212264,0 +7645,1475211900,0.2422657977939353,0 +7646,1475212200,0.2380314503302897,0 +7647,1475212500,0.2371938870957329,0 +7648,1475212800,0.21858137077203685,0 +7649,1475213100,0.2115086145690072,0 +7650,1475213400,0.2101343904471056,0 +7651,1475213700,0.205971390962644,0 +7652,1475214000,0.1927565043727904,0 +7653,1475214300,0.18398380501218767,0 +7654,1475214600,0.1925703792095556,0 +7655,1475214900,0.17586564580900635,0 +7656,1475215200,0.17246886157991764,0 +7657,1475215500,0.17300242038122596,0 +7658,1475215800,0.1697700467130122,0 +7659,1475216100,0.16169531671455067,0 +7660,1475216400,0.16427935439747882,0 +7661,1475216700,0.16381404148939166,0 +7662,1475217000,0.1567164685978606,0 +7663,1475217300,0.15992712766371445,0 +7664,1475217600,0.1548800336540134,0 +7665,1475217900,0.1579045675565273,0 +7666,1475218200,0.15499481083799072,0 +7667,1475218500,0.16190625856618182,0 +7668,1475218800,0.15862425152107046,0 +7669,1475219100,0.16262594253072496,0 +7670,1475219400,0.1547621543838946,0 +7671,1475219700,0.15848465764869685,0 +7672,1475220000,0.16239328607662887,0 +7673,1475220300,0.15753231723005756,0 +7674,1475220600,0.15278612556756868,0 +7675,1475220900,0.15643728085300831,0 +7676,1475221200,0.14631827614499976,0 +7677,1475221500,0.16069644500515526,0 +7678,1475221800,0.15457602922065972,0 +7679,1475222100,0.15888172466356282,0 +7680,1475222400,0.1556462489093127,0 +7681,1475222700,0.15962622531650228,0 +7682,1475223000,0.16155572284207198,0 +7683,1475223300,0.1650703863411744,0 +7684,1475223600,0.16590794957583635,0 +7685,1475223900,0.1687463583151154,0 +7686,1475224200,0.1584164117554757,0 +7687,1475224500,0.17023535962109934,0 +7688,1475224800,0.16169531671455067,0 +7689,1475225100,0.17416570265139128,0 +7690,1475225400,0.17053626196831154,0 +7691,1475225700,0.17721505324251166,0 +7692,1475226000,0.1789584256047414,0 +7693,1475226300,0.18075143134397392,0 +7694,1475226600,0.17847139809429438,0 +7695,1475226900,0.1831462417776308,0 +7696,1475227200,0.1974561647445347,0 +7697,1475227500,0.18500749340997927,0 +7698,1475227800,0.19433856826030774,0 +7699,1475228100,0.19650072223991047,0 +7700,1475228400,0.188987469817232,0 +7701,1475228700,0.19617500320424952,0 +7702,1475229000,0.2046685148199895,0 +7703,1475229300,0.20329429069808794,0 
+7704,1475229600,0.1920585350106492,0 +7705,1475229900,0.20122519930009208,0 +7706,1475230200,0.2010390741368572,0 +7707,1475230500,0.20613269943745105,0 +7708,1475230800,0.20613269943745105,0 +7709,1475231100,0.2081118303398764,0 +7710,1475231400,0.21327680361973828,0 +7711,1475231700,0.20487635458560524,0 +7712,1475232000,0.20620404741669807,0 +7713,1475232300,0.21858137077203685,0 +7714,1475232600,0.21692795890517788,0 +7715,1475232900,0.21699930688443544,0 +7716,1475233200,0.21788340140985354,0 +7717,1475233500,0.2159973330890036,0 +7718,1475233800,0.2291439737856676,0 +7719,1475234100,0.22923703636728496,0 +7720,1475234400,0.22260477638394616,0 +7721,1475234700,0.22681740924523186,0 +7722,1475235000,0.22253653049083008,0 +7723,1475235300,0.22877172345919786,0 +7724,1475235600,0.2339584113414827,0 +7725,1475235900,0.2378670397694147,0 +7726,1475236200,0.2493137373085687,0 +7727,1475236500,0.24619614082433225,0 +7728,1475236800,0.25831909312320445,0 +7729,1475237100,0.25375902662384536,0 +7730,1475237400,0.2555954615677977,0 +7731,1475237700,0.26911435259093136,0 +7732,1475238000,0.2707181310808926,0 +7733,1475238300,0.2657640996527039,0 +7734,1475238600,0.2643216296375812,0 +7735,1475238900,0.26597193941829866,0 +7736,1475239200,0.25790031150587345,0 +7737,1475239500,0.26818372677475705,0 +7738,1475239800,0.2628791596224585,0 +7739,1475240100,0.25515496534810683,0 +7740,1475240400,0.2565509040724733,0 +7741,1475240700,0.26615806458153346,0 +7742,1475241000,0.2681123787954995,0 +7743,1475241300,0.27895416955408764,0 +7744,1475241600,0.2816995157118543,0 +7745,1475241900,0.27118344398897976,0 +7746,1475242200,0.27434757176397234,0 +7747,1475242500,0.2703706974428192,0 +7748,1475242800,0.2749772952329345,0 +7749,1475243100,0.2694152549381435,0 +7750,1475243400,0.27248632013162377,0 +7751,1475243700,0.26908953590253504,0 +7752,1475244000,0.2738822588558852,0 +7753,1475244300,0.2652057241629993,0 +7754,1475244600,0.2707181310808926,0 +7755,1475244900,0.2743723884523686,0 +7756,1475245200,0.26841638322885314,0 +7757,1475245500,0.25808643666910835,0 +7758,1475245800,0.2680658475047433,0 +7759,1475246100,0.2583873390163205,0 +7760,1475246400,0.2562251850367598,0 +7761,1475246700,0.25375902662384536,0 +7762,1475247000,0.2494316165785824,0 +7763,1475247300,0.2524096191904452,0 +7764,1475247600,0.2515472392673869,0 +7765,1475247900,0.24626748880358976,0 +7766,1475248200,0.23751960613144635,0 +7767,1475248500,0.2333317899585569,0 +7768,1475248800,0.220185149261893,0 +7769,1475249100,0.21776552213983985,0 +7770,1475249400,0.20429626449350924,0 +7771,1475249700,0.20143303906570784,0 +7772,1475250000,0.1998975064689993,0 +7773,1475250300,0.19215159759226666,0 +7774,1475250600,0.1914753428325063,0 +7775,1475250900,0.1825661516855664,0 +7776,1475251200,0.17537551621241784,0 +7777,1475251500,0.1741439880490314,0 +7778,1475251800,0.16150919155131582,0 +7779,1475252100,0.15555318632769527,0 +7780,1475252400,0.14405995749778516,0 +7781,1475252700,0.14264230417105875,0 +7782,1475253000,0.12889075669393404,0 +7783,1475253300,0.1319618218873092,0 +7784,1475253600,0.12316740792430453,0 +7785,1475253900,0.12081912878150892,0 +7786,1475254200,0.11986368627693834,0 +7787,1475254500,0.11253655901745263,0 +7788,1475254800,0.10185607673370307,0 +7789,1475255100,0.10059973188181526,0 +7790,1475255400,0.0938526947144991,0 +7791,1475255700,0.0920845056635578,0 +7792,1475256000,0.08059127683354259,0 +7793,1475256300,0.07779939938501973,0 +7794,1475256600,0.07600949573213882,0 +7795,1475256900,0.06965642349323196,0 
+7796,1475257200,0.06805264500358592,0 +7797,1475257500,0.06900498542243537,0 +7798,1475257800,0.06928417316676225,0 +7799,1475258100,0.06405095399456187,0 +7800,1475258400,0.06004926298448705,0 +7801,1475258700,0.061746104056275876,0 +7802,1475259000,0.058653324259700185,0 +7803,1475259300,0.05779094433674705,0 +7804,1475259600,0.05497735228659987,0 +7805,1475259900,0.05569703625061759,0 +7806,1475260200,0.05397537849095788,0 +7807,1475260500,0.04809072124691008,0 +7808,1475260800,0.05055687965950928,0 +7809,1475261100,0.04729968930237384,0 +7810,1475261400,0.04815896714013123,0 +7811,1475261700,0.05015981264411786,0 +7812,1475262000,0.04534537508893322,0 +7813,1475262300,0.04387808838467869,0 +7814,1475262600,0.04699568486912525,0 +7815,1475262900,0.04557803154271409,0 +7816,1475263200,0.045878933889716124,0 +7817,1475263500,0.04443646387543407,0 +7818,1475263800,0.04359890064035179,0 +7819,1475264100,0.0424139037677215,0 +7820,1475264400,0.04227430989503268,0 +7821,1475264700,0.04117927351829865,0 +7822,1475265000,0.04348412345658465,0 +7823,1475265300,0.0400873392267604,0 +7824,1475265600,0.043459306767662936,0 +7825,1475265900,0.04178418029854925,0 +7826,1475266200,0.04004080793621436,0 +7827,1475266500,0.03957549502865262,0 +7828,1475266800,0.0395289637370558,0 +7829,1475267100,0.042202961915565,0 +7830,1475267400,0.0382260875944118,0 +7831,1475267700,0.0388527089773375,0 +7832,1475268000,0.039783334793511815,0 +7833,1475268300,0.03924977599272892,0 +7834,1475268600,0.03913189672271518,0 +7835,1475268900,0.038340864779229766,0 +7836,1475269200,0.03969027221241976,0 +7837,1475269500,0.03924977599272892,0 +7838,1475269800,0.03969027221241976,0 +7839,1475270100,0.03929630728327494,0 +7840,1475270400,0.038179556303865776,0 +7841,1475270700,0.03827261888600862,0 +7842,1475271000,0.040133870518357186,0 +7843,1475271300,0.04287921667633407,0 +7844,1475271600,0.041504992554222346,0 +7845,1475271900,0.04348412345658465,0 +7846,1475272200,0.046276000905107535,0 +7847,1475272500,0.04339106087444181,0 +7848,1475272800,0.051906287092699295,0 +7849,1475273100,0.05334875710803215,0 +7850,1475273400,0.05697819779058647,0 +7851,1475273700,0.05969872726069245,0 +7852,1475274000,0.061026420091207324,0 +7853,1475274300,0.062096639780070476,0 +7854,1475274600,0.06393307472454816,0 +7855,1475274900,0.06486370054072246,0 +7856,1475275200,0.0694237670394511,0 +7857,1475275500,0.07098411632479767,0 +7858,1475275800,0.07351852063061798,0 +7859,1475276100,0.07542630355351257,0 +7860,1475276400,0.07989330747114919,0 +7861,1475276700,0.07791727865503344,0 +7862,1475277000,0.08215162611888915,0 +7863,1475277300,0.08252387644535887,0 +7864,1475277600,0.0872452514194515,0 +7865,1475277900,0.08471084711363118,0 +7866,1475278200,0.0895966326482835,0 +7867,1475278500,0.09175878662763408,0 +7868,1475278800,0.09513385625478324,0 +7869,1475279100,0.09994829381049326,0 +7870,1475279400,0.10325201545796454,0 +7871,1475279700,0.10981292746204584,0 +7872,1475280000,0.10332336343722208,0 +7873,1475280300,0.10737158573763278,0 +7874,1475280600,0.10809126970207084,0 +7875,1475280900,0.10655573710543582,0 +7876,1475281200,0.11616599970053235,0 +7877,1475281500,0.11725793399154524,0 +7878,1475281800,0.1149561861395058,0 +7879,1475282100,0.11574721808330647,0 +7880,1475282400,0.11965584651123852,0 +7881,1475282700,0.12440203817383247,0 +7882,1475283000,0.12926300702040378,0 +7883,1475283300,0.129706605326131,0 +7884,1475283600,0.14210564328381914,0 +7885,1475283900,0.14026920833986686,0 +7886,1475284200,0.14934281004761873,0 
+7887,1475284500,0.1468766516347043,0 +7888,1475284800,0.14971506037408844,0 +7889,1475285100,0.14850524681306188,0 +7890,1475285400,0.15548494043447414,0 +7891,1475285700,0.15899650184754016,0 +7892,1475286000,0.1566947539955007,0 +7893,1475286300,0.15543840914371795,0 +7894,1475286600,0.1652565115044093,0 +7895,1475286900,0.16409322923424394,0 +7896,1475287200,0.16576835570335768,0 +7897,1475287500,0.1690255460599677,0 +7898,1475287800,0.16972351542215094,0 +7899,1475288100,0.16579007030571755,0 +7900,1475288400,0.17028189091185553,0 +7901,1475288700,0.17828527293105953,0 +7902,1475289000,0.1741439880490314,0 +7903,1475289300,0.16686028999437053,0 +7904,1475289600,0.16727907161159644,0 +7905,1475289900,0.1813098068336785,0 +7906,1475290200,0.16688510668276682,0 +7907,1475290500,0.17458448426872225,0 +7908,1475290800,0.16790879508055853,0 +7909,1475291100,0.17365385845254794,0 +7910,1475291400,0.1763309587170935,0 +7911,1475291700,0.17095504358553745,0 +7912,1475292000,0.17125904801889108,0 +7913,1475292300,0.16432588568823495,0 +7914,1475292600,0.16304472414795085,0 +7915,1475292900,0.16041725726030295,0 +7916,1475293200,0.16279035309159995,0 +7917,1475293500,0.16988482389698953,0 +7918,1475293800,0.17272323263637365,0 +7919,1475294100,0.18377596524659293,0 +7920,1475294400,0.2057387345086004,0 +7921,1475294700,0.2024101961727013,0 +7922,1475295000,0.21739327181337006,0 +7923,1475295300,0.22681740924523186,0 +7924,1475295600,0.2403331981823291,0 +7925,1475295900,0.2480573924566808,0 +7926,1475296200,0.2518729583031005,0 +7927,1475296500,0.2597367464498258,0 +7928,1475296800,0.26169106066379183,0 +7929,1475297100,0.26206331099026153,0 +7930,1475297400,0.2561786537460036,0 +7931,1475297700,0.257828963526616,0 +7932,1475298000,0.2540382143686977,0 +7933,1475298300,0.2381710442027684,0 +7934,1475298600,0.2365424490244108,0 +7935,1475298900,0.2220464008942416,0 +7936,1475299200,0.21478751952797706,0 +7937,1475299500,0.21115807884489726,0 +7938,1475299800,0.1968977892548228,0 +7939,1475300100,0.19999056905061666,0 +7940,1475300400,0.17563298935491026,0 +7941,1475300700,0.1812384588544209,0 +7942,1475301000,0.17479542612035334,0 +7943,1475301300,0.16502385505041825,0 +7944,1475301600,0.161301351785616,0 +7945,1475301900,0.16109040993398485,0 +7946,1475302200,0.1569956563427129,0 +7947,1475302500,0.1618131959845644,0 +7948,1475302800,0.15690259376109547,0 +7949,1475303100,0.16243981736749016,0 +7950,1475303400,0.15720659819444913,0 +7951,1475303700,0.16586141828497508,0 +7952,1475304000,0.16274382180073868,0 +7953,1475304300,0.15555318632769527,0 +7954,1475304600,0.1553918778528567,0 +7955,1475304900,0.16234675478587268,0 +7956,1475305200,0.1593005062808938,0 +7957,1475305500,0.15997365895457571,0 +7958,1475305800,0.15415724760343386,0 +7959,1475306100,0.15706700432197046,0 +7960,1475306400,0.16793050968291842,0 +7961,1475306700,0.1586707828119317,0 +7962,1475307000,0.16281206769395984,0 +7963,1475307300,0.15150496402728456,0 +7964,1475307600,0.15345927824125058,0 +7965,1475307900,0.15567106559770902,0 +7966,1475308200,0.14796858592571718,0 +7967,1475308500,0.15429684147580744,0 +7968,1475308800,0.15390287654697785,0 +7969,1475309100,0.15743925464844016,0 +7970,1475309400,0.15792628215899224,0 +7971,1475309700,0.16139441436723342,0 +7972,1475310000,0.15888172466356282,0 +7973,1475310300,0.1562759723782748,0 +7974,1475310600,0.17025707422345926,0 +7975,1475310900,0.16346350576528185,0 +7976,1475311200,0.1683492913002494,0 +7977,1475311500,0.1675830760449501,0 
+7978,1475311800,0.18107715037968747,0 +7979,1475312100,0.1748884887019708,0 +7980,1475312400,0.1825196203947051,0 +7981,1475312700,0.17905148818635885,0 +7982,1475313000,0.18240174112469126,0 +7983,1475313300,0.18789243344018275,0 +7984,1475313600,0.1883825630367082,0 +7985,1475313900,0.19768882119858144,0 +7986,1475314200,0.1948721270615845,0 +7987,1475314500,0.1946860018983496,0 +7988,1475314800,0.19417415769944327,0 +7989,1475315100,0.20108560542766601,0 +7990,1475315400,0.20622576201906848,0 +7991,1475315700,0.2017587581013688,0 +7992,1475316000,0.2038774828762308,0 +7993,1475316300,0.20934335850330488,0 +7994,1475316600,0.20485463998322434,0 +7995,1475316900,0.2118343336046156,0 +7996,1475317200,0.219930778205437,0 +7997,1475317500,0.2248165637404045,0 +7998,1475317800,0.21397477298181647,0 +7999,1475318100,0.2097156088297746,0 +8000,1475318400,0.2229304954196597,0 +8001,1475318700,0.2194189340065937,0 +8002,1475319000,0.22072181014923767,0 +8003,1475319300,0.2256075956842053,0 +8004,1475319600,0.22253653049083008,0 +8005,1475319900,0.22483827834286954,0 +8006,1475320200,0.2247452157612521,0 +8007,1475320500,0.22488480963362573,0 +8008,1475320800,0.2274223160257977,0 +8009,1475321100,0.2296310012962197,0 +8010,1475321400,0.23975310809026465,0 +8011,1475321700,0.23612366740707985,0 +8012,1475322000,0.2513145828133959,0 +8013,1475322300,0.24161435972261325,0 +8014,1475322600,0.2467793330024331,0 +8015,1475322900,0.2492672060177074,0 +8016,1475323200,0.2460100156610973,0 +8017,1475323500,0.2464753285691845,0 +8018,1475323800,0.2538272725170665,0 +8019,1475324100,0.2534550221905968,0 +8020,1475324400,0.26064565766364023,0 +8021,1475324700,0.2619950650971455,0 +8022,1475325000,0.2581794992507257,0 +8023,1475325300,0.265180907474498,0 +8024,1475325600,0.2650661302905206,0 +8025,1475325900,0.2565043727816121,0 +8026,1475326200,0.2698123219531146,0 +8027,1475326500,0.26776494515753113,0 +8028,1475326800,0.2658571622343213,0 +8029,1475327100,0.2705102913151928,0 +8030,1475327400,0.26601847070915985,0 +8031,1475327700,0.2646225319847933,0 +8032,1475328000,0.27118344398897976,0 +8033,1475328300,0.2724180742384026,0 +8034,1475328600,0.2671817529793252,0 +8035,1475328900,0.2662511271631509,0 +8036,1475329200,0.27830273148276563,0 +8037,1475329500,0.26643725232638577,0 +8038,1475329800,0.2686490396828442,0 +8039,1475330100,0.2789324549517277,0 +8040,1475330400,0.27211406980515396,0 +8041,1475330700,0.25757459247026504,0 +8042,1475331000,0.2658571622343213,0 +8043,1475331300,0.2618306545362705,0 +8044,1475331600,0.2555954615677977,0 +8045,1475331900,0.2600159341946781,0 +8046,1475332200,0.2541995228435362,0 +8047,1475332500,0.2596436838682084,0 +8048,1475332800,0.2443131745895187,0 +8049,1475333100,0.2586913434496742,0 +8050,1475333400,0.2416826056158344,0 +8051,1475333700,0.2493385539969649,0 +8052,1475334000,0.24147476585023964,0 +8053,1475334300,0.23323872737693954,0 +8054,1475334600,0.2253501225417129,0 +8055,1475334900,0.22223252605747645,0 +8056,1475335200,0.2181843037570657,0 +8057,1475335500,0.2226978389655636,0 +8058,1475335800,0.2092037646308262,0 +8059,1475336100,0.20797223646745025,0 +8060,1475336400,0.19617500320424952,0 +8061,1475336700,0.18796378141941927,0 +8062,1475337000,0.17798126849781093,0 +8063,1475337300,0.1806583687623565,0 +8064,1475337600,0.1725619241615351,0 +8065,1475337900,0.17367867514094426,0 +8066,1475338200,0.16292994696397356,0 +8067,1475338500,0.15922915830163625,0 +8068,1475338800,0.1495289352108536,0 +8069,1475339100,0.15108618240995358,0 
+8070,1475339400,0.13836142541665702,0 +8071,1475339700,0.14219870586543654,0 +8072,1475340000,0.12626328980618107,0 +8073,1475340300,0.1246346946279286,0 +8074,1475340600,0.12184281717930065,0 +8075,1475340900,0.12409803374058392,0 +8076,1475341200,0.11860734142505053,0 +8077,1475341500,0.10323030085560464,0 +8078,1475341800,0.10592911572251007,0 +8079,1475342100,0.09080644620983548,0 +8080,1475342400,0.08519787462386805,0 +8081,1475342700,0.08652556745543376,0 +8082,1475343000,0.07703318412940516,0 +8083,1475343300,0.07761327422178488,0 +8084,1475343600,0.07095929963587598,0 +8085,1475343900,0.06609833079014532,0 +8086,1475344200,0.06621310797391249,0 +8087,1475344500,0.06395789141241906,0 +8088,1475344800,0.060979888800661325,0 +8089,1475345100,0.05797706949998192,0 +8090,1475345400,0.05588316141385245,0 +8091,1475345700,0.057837475628343876,0 +8092,1475346000,0.05583663012330643,0 +8093,1475346300,0.05325569452588934,0 +8094,1475346600,0.05406844107204989,0 +8095,1475346900,0.05078953611329014,0 +8096,1475347200,0.0493935973895541,0 +8097,1475347500,0.04846297157337981,0 +8098,1475347800,0.05109043846029216,0 +8099,1475348100,0.04648384067101752,0 +8100,1475348400,0.04680955970589041,0 +8101,1475348700,0.04855603415447182,0 +8102,1475349000,0.0480659045579884,0 +8103,1475349300,0.0459502818691838,0 +8104,1475349600,0.04711356413913901,0 +8105,1475349900,0.04466912032921496,0 +8106,1475350200,0.043785025803586654,0 +8107,1475350500,0.045180964528373516,0 +8108,1475350800,0.042600028930956366,0 +8109,1475351100,0.044157276130056385,0 +8110,1475351400,0.043040525150647206,0 +8111,1475351700,0.04418209281897807,0 +8112,1475352000,0.044250338711148425,0 +8113,1475352300,0.043018810547972096,0 +8114,1475352600,0.042249493207161766,0 +8115,1475352900,0.043530654747130665,0 +8116,1475353200,0.04162287182423609,0 +8117,1475353500,0.040133870518357186,0 +8118,1475353800,0.04048130415590522,0 +8119,1475354100,0.04297227925742608,0 +8120,1475354400,0.04004080793621436,0 +8121,1475354700,0.040971433752388674,0 +8122,1475355000,0.04315840442066095,0 +8123,1475355300,0.03992292866620067,0 +8124,1475355600,0.03976162019188748,0 +8125,1475355900,0.041504992554222346,0 +8126,1475356200,0.03889924026893432,0 +8127,1475356500,0.042342555788253806,0 +8128,1475356800,0.04250696634986431,0 +8129,1475357100,0.041064496334531485,0 +8130,1475357400,0.039783334793511815,0 +8131,1475357700,0.0424139037677215,0 +8132,1475358000,0.04273962280364522,0 +8133,1475358300,0.042600028930956366,0 +8134,1475358600,0.044203807420602405,0 +8135,1475358900,0.04515924992569837,0 +8136,1475359200,0.05181322451160726,0 +8137,1475359500,0.05164881394999672,0 +8138,1475359800,0.05351006558234531,0 +8139,1475360100,0.05765135046510901,0 +8140,1475360400,0.06367870366809217,0 +8141,1475360700,0.06321339075947963,0 +8142,1475361000,0.06670323757039591,0 +8143,1475361300,0.06698242531472283,0 +8144,1475361600,0.06486370054072246,0 +8145,1475361900,0.07121677277857852,0 +8146,1475362200,0.07184339416150426,0 +8147,1475362500,0.07600949573213882,0 +8148,1475362800,0.08238428257267004,0 +8149,1475363100,0.08119618361379316,0 +8150,1475363400,0.08436031138983659,0 +8151,1475363700,0.08689781778190346,0 +8152,1475364000,0.09006194555689602,0 +8153,1475364300,0.08680475519976064,0 +8154,1475364600,0.09048072717391176,0 +8155,1475364900,0.09059550435767892,0 +8156,1475365200,0.08822240852617177,0 +8157,1475365500,0.0978078544332924,0 +8158,1475365800,0.10206701858533423,0 +8159,1475366100,0.1094654938239724,0 
+8160,1475366400,0.1019956706060767,0 +8161,1475366700,0.11239696514497396,0 +8162,1475367000,0.11909747102153395,0 +8163,1475367300,0.11595505784890126,0 +8164,1475367600,0.11348889943598682,0 +8165,1475367900,0.1190261230422764,0 +8166,1475368200,0.12342488106679694,0 +8167,1475368500,0.12423762761295752,0 +8168,1475368800,0.12395843986810524,0 +8169,1475369100,0.1252861326992506,0 +8170,1475369400,0.1326132599586312,0 +8171,1475369700,0.12952048016289616,0 +8172,1475370000,0.1318222280149356,0 +8173,1475370300,0.1438986490229466,0 +8174,1475370600,0.14771421486936626,0 +8175,1475370900,0.14429261395177614,0 +8176,1475371200,0.14350158200808058,0 +8177,1475371500,0.1494110559408399,0 +8178,1475371800,0.15097140522597627,0 +8179,1475372100,0.14731714785439518,0 +8180,1475372400,0.1518772143537543,0 +8181,1475372700,0.15520575268962186,0 +8182,1475373000,0.16132306638808094,0 +8183,1475373300,0.161301351785616,0 +8184,1475373600,0.17335295610533574,0 +8185,1475373900,0.1797494575485421,0 +8186,1475374200,0.1705827932590677,0 +8187,1475374500,0.1814711153085171,0 +8188,1475374800,0.17037495349347295,0 +8189,1475375100,0.1767032090435632,0 +8190,1475375400,0.17035013680507669,0 +8191,1475375700,0.16823451411627208,0 +8192,1475376000,0.17435182781462613,1 +8193,1475376300,0.23153878421932444,1 +8194,1475376600,0.17970292625778592,1 +8195,1475376900,0.1819612449050005,0 +8196,1475377200,0.17205007996258667,0 +8197,1475377500,0.1790763048748602,0 +8198,1475377800,0.17889017971162532,0 +8199,1475378100,0.17500326588594814,0 +8200,1475378400,0.17586564580900635,0 +8201,1475378700,0.1729341744880048,0 +8202,1475379000,0.1612765350972197,0 +8203,1475379300,0.1682810454070283,0 +8204,1475379600,0.1717708922177344,0 +8205,1475379900,0.1767249236459231,0 +8206,1475380200,0.1915001595209341,0 +8207,1475380500,0.19173281597497768,0 +8208,1475380800,0.2029685716624164,0 +8209,1475381100,0.2225117138023287,0 +8210,1475381400,0.2210940604757074,0 +8211,1475381700,0.23949563494777226,0 +8212,1475382000,0.2506166134512127,0 +8213,1475382300,0.2600159341946781,0 +8214,1475382600,0.264926536418147,0 +8215,1475382900,0.27865326720687544,0 +8216,1475383200,0.26401762520433264,0 +8217,1475383500,0.2631583473673108,0 +8218,1475383800,0.26906782130017515,0 +8219,1475384100,0.26031993862803177,0 +8220,1475384400,0.2616445293730357,0 +8221,1475384700,0.2449863272633057,0 +8222,1475385000,0.25259574435368004,0 +8223,1475385300,0.23072603767326896,0 +8224,1475385600,0.2132519869312369,0 +8225,1475385900,0.2184883081904194,0 +8226,1475386200,0.2117412710229981,0 +8227,1475386500,0.1969195038572005,0 +8228,1475386800,0.1853580291340892,0 +8229,1475387100,0.17968121165532094,0 +8230,1475387400,0.17509632846756554,0 +8231,1475387700,0.16427935439747882,0 +8232,1475388000,0.16313778672956827,0 +8233,1475388300,0.1644654795607137,0 +8234,1475388600,0.15988059637295826,0 +8235,1475388900,0.16969869873375468,0 +8236,1475389200,0.15860253691871054,0 +8237,1475389500,0.1612548204948598,0 +8238,1475389800,0.1541107163125726,0 +8239,1475390100,0.15860253691871054,0 +8240,1475390400,0.1602311320970681,0 +8241,1475390700,0.15660169141388328,0 +8242,1475391000,0.1562976869806347,0 +8243,1475391300,0.1527613088790673,0 +8244,1475391600,0.16895419808081524,0 +8245,1475391900,0.1567164685978606,0 +8246,1475392200,0.1580193447406097,0 +8247,1475392500,0.1595796940257461,0 +8248,1475392800,0.15827681788299702,0 +8249,1475393100,0.1570887189244354,0 +8250,1475393400,0.15034168175701418,0 +8251,1475393700,0.15716006690358789,0 
+8252,1475394000,0.161301351785616,0 +8253,1475394300,0.15825200119460073,0 +8254,1475394600,0.15611156181739985,0 +8255,1475394900,0.16388228738250776,0 +8256,1475395200,0.15888172466356282,0 +8257,1475395500,0.16025284669942802,0 +8258,1475395800,0.16565047643334396,0 +8259,1475396100,0.1583915950670794,0 +8260,1475396400,0.16337044318366442,0 +8261,1475396700,0.1674651967748313,0 +8262,1475397000,0.16395363536176527,0 +8263,1475397300,0.1682810454070283,0 +8264,1475397600,0.17314201425359954,0 +8265,1475397900,0.17028189091185553,0 +8266,1475398200,0.17472407814109586,0 +8267,1475398500,0.17237579899830022,0 +8268,1475398800,0.187845902149374,0 +8269,1475399100,0.18947449732770005,0 +8270,1475399400,0.18500749340997927,0 +8271,1475399700,0.19007940410822385,0 +8272,1475400000,0.1904516544346936,0 +8273,1475400300,0.19038340854151445,0 +8274,1475400600,0.1944316308419252,0 +8275,1475400900,0.19933913097928416,0 +8276,1475401200,0.2038061348969838,0 +8277,1475401500,0.20622576201906848,0 +8278,1475401800,0.21416089814505126,0 +8279,1475402100,0.2043893270751267,0 +8280,1475402400,0.2103918635894929,0 +8281,1475402700,0.2101126758446406,0 +8282,1475403000,0.2224899991999688,0 +8283,1475403300,0.21276495942078988,0 +8284,1475403600,0.21022745302872287,0 +8285,1475403900,0.2270252490108266,0 +8286,1475404200,0.2216989672561681,0 +8287,1475404500,0.2264451589187621,0 +8288,1475404800,0.21909321497088013,0 +8289,1475405100,0.2317249093825593,0 +8290,1475405400,0.2230949059805347,0 +8291,1475405700,0.22923703636728496,0 +8292,1475406000,0.22348887090936426,0 +8293,1475406300,0.23326044197929946,0 +8294,1475406600,0.2246521531795296,0 +8295,1475406900,0.2422657977939353,0 +8296,1475407200,0.2369364139532404,0 +8297,1475407500,0.2413351719777609,0 +8298,1475407800,0.2487801785072604,0 +8299,1475408100,0.2448715500793283,0 +8300,1475408400,0.2541064602619188,0 +8301,1475408700,0.2493137373085687,0 +8302,1475409000,0.2652057241629993,0 +8303,1475409300,0.2569914002920591,0 +8304,1475409600,0.2727437932741161,0 +8305,1475409900,0.2700666930095706,0 +8306,1475410200,0.2849815227569657,0 +8307,1475410500,0.283306396287852,0 +8308,1475410800,0.2834459901602256,0 +8309,1475411100,0.28746939577223996,0 +8310,1475411400,0.2713695691522146,0 +8311,1475411700,0.27406838401912004,0 +8312,1475412000,0.27099731882574485,0 +8313,1475412300,0.2791185801149626,0 +8314,1475412600,0.27325563747295945,0 +8315,1475412900,0.2724180742384026,0 +8316,1475413200,0.2730446956213283,0 +8317,1475413500,0.2845162098488785,0 +8318,1475413800,0.28428355339478245,0 +8319,1475414100,0.2833994588694694,0 +8320,1475414400,0.2872832706090052,0 +8321,1475414700,0.2887257406241278,0 +8322,1475415000,0.2860269257571173,0 +8323,1475415300,0.2836786466143217,0 +8324,1475415600,0.2758148584674913,0 +8325,1475415900,0.2862130509203521,0 +8326,1475416200,0.2816312698186332,0 +8327,1475416500,0.2731377582029457,0 +8328,1475416800,0.2852358938134217,0 +8329,1475417100,0.2791868260081837,0 +8330,1475417400,0.27248632013162377,0 +8331,1475417700,0.2745336969272072,0 +8332,1475418000,0.2675540033057949,0 +8333,1475418300,0.2645542860916773,0 +8334,1475418600,0.2692074151725488,0 +8335,1475418900,0.2608317828268751,0 +8336,1475419200,0.2719062300395593,0 +8337,1475419500,0.25864481215881296,0 +8338,1475419800,0.26218119026038034,0 +8339,1475420100,0.26259997187760625,0 +8340,1475420400,0.2500117066706468,0 +8341,1475420700,0.24433488919198365,0 +8342,1475421000,0.2362849758819185,0 +8343,1475421300,0.23933432647293365,0 
+8344,1475421600,0.23579794837147136,0 +8345,1475421900,0.22211774887349914,0 +8346,1475422200,0.22716484288330524,0 +8347,1475422500,0.21504499267046945,0 +8348,1475422800,0.20831967010549213,0 +8349,1475423100,0.19703738312724994,0 +8350,1475423400,0.2101126758446406,0 +8351,1475423700,0.1844739346086711,0 +8352,1475424000,0.19152187412331506,0 +8353,1475424300,0.18763806238375824,0 +8354,1475424600,0.16888595218759409,0 +8355,1475424900,0.16807010355539712,0 +8356,1475425200,0.15438990405742484,0 +8357,1475425500,0.1532266217871545,0 +8358,1475425800,0.14045533350310171,0 +8359,1475426100,0.13042628929056907,0 +8360,1475426400,0.1263315356994022,0 +8361,1475426700,0.11765500100651632,0 +8362,1475427000,0.11837468497095438,0 +8363,1475427300,0.10769730477324126,0 +8364,1475427600,0.1044618290189911,0 +8365,1475427900,0.1014838264072334,0 +8366,1475428200,0.09811185886654096,0 +8367,1475428500,0.08587412938358634,0 +8368,1475428800,0.08489697227686599,0 +8369,1475429100,0.08308225193506344,0 +8370,1475429400,0.07424130668088229,0 +8371,1475429700,0.07389077095708768,0 +8372,1475430000,0.07207605061528513,0 +8373,1475430300,0.06802782831571506,0 +8374,1475430600,0.06540036142775191,0 +8375,1475430900,0.0640974852851079,0 +8376,1475431200,0.06177092074414677,0 +8377,1475431500,0.061094665984428476,0 +8378,1475431800,0.06014232556557906,0 +8379,1475432100,0.056441536903557035,0 +8380,1475432400,0.0541863203420636,0 +8381,1475432700,0.05376753872504785,0 +8382,1475433000,0.05709297497540442,0 +8383,1475433300,0.055278254633601864,0 +8384,1475433600,0.05004503546035072,0 +8385,1475433900,0.048602565445017835,0 +8386,1475434200,0.051906287092699295,0 +8387,1475434500,0.050764719425419255,0 +8388,1475434800,0.051834939113231566,0 +8389,1475435100,0.04953319126119216,0 +8390,1475435400,0.046741313812669286,0 +8391,1475435700,0.0448800621803207,0 +8392,1475436000,0.0461829383229647,0 +8393,1475436300,0.0458324025991701,0 +8394,1475436600,0.04608987574187265,0 +8395,1475436900,0.04287921667633407,0 +8396,1475437200,0.04476218291030699,0 +8397,1475437500,0.04099314835506378,0 +8398,1475437800,0.04278615409419124,0 +8399,1475438100,0.039783334793511815,0 +8400,1475438400,0.03922495930380724,0 +8401,1475438700,0.038573521233010624,0 +8402,1475439000,0.042202961915565,0 +8403,1475439300,0.03927149059540405,0 +8404,1475439600,0.041111027625077526,0 +8405,1475439900,0.03766771210470722,0 +8406,1475440200,0.03982986608510861,0 +8407,1475440500,0.03999427664566834,0 +8408,1475440800,0.040155585119981525,0 +8409,1475441100,0.0397368035029658,0 +8410,1475441400,0.044250338711148425,0 +8411,1475441700,0.04252868095148864,0 +8412,1475442000,0.04085355448237496,0 +8413,1475442300,0.040760491901282926,0 +8414,1475442600,0.042342555788253806,0 +8415,1475442900,0.042342555788253806,0 +8416,1475443200,0.041808996987470905,0 +8417,1475443500,0.04139021536940436,0 +8418,1475443800,0.042296024497707814,0 +8419,1475444100,0.041458461262625534,0 +8420,1475444400,0.04429687000274525,0 +8421,1475444700,0.049626253843334966,0 +8422,1475445000,0.04725315801182785,0 +8423,1475445300,0.04692743897590413,0 +8424,1475445600,0.05453375398066244,0 +8425,1475445900,0.05376753872504785,0 +8426,1475446200,0.05844238240859444,0 +8427,1475446500,0.05977007523910935,0 +8428,1475446800,0.061398670417677076,0 +8429,1475447100,0.0638182975397302,0 +8430,1475447400,0.06681801475416305,0 +8431,1475447700,0.06565473248420793,0 +8432,1475448000,0.07363639990063169,0 +8433,1475448300,0.07277401997767856,0 +8434,1475448600,0.07328586417683709,0 
+8435,1475448900,0.07852218543528401,0 +8436,1475449200,0.08098834384893397,0 +8437,1475449500,0.0774519657474717,0 +8438,1475449800,0.08494350356741202,0 +8439,1475450100,0.08454643655307144,0 +8440,1475450400,0.0931081940614546,0 +8441,1475450700,0.09131829040794323,0 +8442,1475451000,0.09513385625478324,0 +8443,1475451300,0.0912252278268512,0 +8444,1475451600,0.0941318824593514,0 +8445,1475451900,0.09994829381049326,0 +8446,1475452200,0.10059973188181526,0 +8447,1475452500,0.10690627282954564,0 +8448,1475452800,0.10325201545796454,0 +8449,1475453100,0.11302358652789968,0 +8450,1475453400,0.11379290386934048,0 +8451,1475453700,0.11230390256335654,0 +8452,1475454000,0.11169899578289576,0 +8453,1475454300,0.11790937206286728,0 +8454,1475454600,0.11942319005724747,0 +8455,1475454900,0.11800243464448468,0 +8456,1475455200,0.12349312696001807,0 +8457,1475455500,0.12102696854710365,0 +8458,1475455800,0.12754134926042876,0 +8459,1475456100,0.13356870246330688,0 +8460,1475456400,0.13231235761141902,0 +8461,1475456700,0.1428284293342936,0 +8462,1475457000,0.1424561790078239,0 +8463,1475457300,0.1556462489093127,0 +8464,1475457600,0.15176243716977694,0 +8465,1475457900,0.15466909180227714,0 +8466,1475458200,0.1644654795607137,0 +8467,1475458500,0.1521564020986066,0 +8468,1475458800,0.15878866208194542,0 +8469,1475459100,0.15611156181739985,0 +8470,1475459400,0.15848465764869685,0 +8471,1475459700,0.16893248347835027,0 +8472,1475460000,0.16860676444274178,0 +8473,1475460300,0.1711194541464124,0 +8474,1475460600,0.16776920120818495,0 +8475,1475460900,0.16176666469370315,0 +8476,1475461200,0.16983829260612826,0 +8477,1475461500,0.17181742350859566,0 +8478,1475461800,0.17588736041136624,0 +8479,1475462100,0.16993135518774571,0 +8480,1475462400,0.17025707422345926,0 +8481,1475462700,0.17551511008489654,0 +8482,1475463000,0.1729558890903647,0 +8483,1475463300,0.17276976392712984,0 +8484,1475463600,0.16997788647860698,0 +8485,1475463900,0.16223197760189534,0 +8486,1475464200,0.1652565115044093,0 +8487,1475464500,0.1732133622328571,0 +8488,1475464800,0.17274804932476995,0 +8489,1475465100,0.167837447101301,0 +8490,1475465400,0.16488426117793956,0 +8491,1475465700,0.16586141828497508,0 +8492,1475466000,0.1767032090435632,0 +8493,1475466300,0.18579852535378008,0 +8494,1475466600,0.1806335520739602,0 +8495,1475466900,0.1916149367049325,0 +8496,1475467200,0.2024101961727013,0 +8497,1475467500,0.2003876360655247,0 +8498,1475467800,0.21788340140985354,0 +8499,1475468100,0.2363780384635358,0 +8500,1475468400,0.2384967632383768,0 +8501,1475468700,0.25157205595578325,0 +8502,1475469000,0.25024436312474296,0 +8503,1475469300,0.2617624086430493,0 +8504,1475469600,0.2573884673070302,0 +8505,1475469900,0.26139015831657963,0 +8506,1475470200,0.2494316165785824,0 +8507,1475470500,0.2497573356142959,0 +8508,1475470800,0.2464753285691845,0 +8509,1475471100,0.2477316734210724,0 +8510,1475471400,0.23298125423444715,0 +8511,1475471700,0.22274437025642485,0 +8512,1475472000,0.2129510845840248,0 +8513,1475472300,0.20662282903397647,0 +8514,1475472600,0.20062029251956826,0 +8515,1475472900,0.18891612183798487,0 +8516,1475473200,0.1844491179202748,0 +8517,1475473500,0.1824948037063088,0 +8518,1475473800,0.1835184921041005,0 +8519,1475474100,0.1708154497131638,0 +8520,1475474400,0.16893248347835027,0 +8521,1475474700,0.16460507343308728,0 +8522,1475475000,0.16037072596944169,0 +8523,1475475300,0.16230022349501144,0 +8524,1475475600,0.15464737719991725,0 +8525,1475475900,0.16490597578040453,0 +8526,1475476200,0.15639074956225213,0 
+8527,1475476500,0.1554135924552166,0 +8528,1475476800,0.16018460080620686,0 +8529,1475477100,0.15829853248546194,0 +8530,1475477400,0.15964793991886214,0 +8531,1475477700,0.16281206769395984,0 +8532,1475478000,0.1629764782548348,0 +8533,1475478300,0.15713525021519156,0 +8534,1475478600,0.16064991371429402,0 +8535,1475478900,0.15850947433709311,0 +8536,1475479200,0.16655938764715836,0 +8537,1475479500,0.16886113549919782,0 +8538,1475479800,0.16569700772410015,0 +8539,1475480100,0.1640684125457426,0 +8540,1475480400,0.16367444761691294,0 +8541,1475480700,0.1718856694017117,0 +8542,1475481000,0.16604754344820996,0 +8543,1475481300,0.17484195741121464,0 +8544,1475481600,0.1695373902589161,0 +8545,1475481900,0.16941951098890234,0 +8546,1475482200,0.18179683434412552,0 +8547,1475482500,0.1759587083906238,0 +8548,1475482800,0.17772689744135495,0 +8549,1475483100,0.18977850176099065,0 +8550,1475483400,0.1864034321342408,0 +8551,1475483700,0.1859164046237938,0 +8552,1475484000,0.19566315900534315,0 +8553,1475484300,0.20448238965674406,0 +8554,1475484600,0.20336563867732446,0 +8555,1475484900,0.20364482642217674,0 +8556,1475485200,0.2022488876978943,0 +8557,1475485500,0.203272576095707,0 +8558,1475485800,0.2183952456086969,0 +8559,1475486100,0.22351368759776047,0 +8560,1475486400,0.2229770267104159,0 +8561,1475486700,0.2339832280298789,0 +8562,1475487000,0.2281885312810969,0 +8563,1475487300,0.22439778212317868,0 +8564,1475487600,0.2375661374222025,0 +8565,1475487900,0.23768091460617985,0 +8566,1475488200,0.24021842099835186,0 +8567,1475488500,0.244614076936836,0 +8568,1475488800,0.2427311107020224,0 +8569,1475489100,0.2529679946801497,0 +8570,1475489400,0.2408698590696738,0 +8571,1475489700,0.2430785443400958,0 +8572,1475490000,0.2477316734210724,0 +8573,1475490300,0.2582942764347031,0 +8574,1475490600,0.2470585207472853,0 +8575,1475490900,0.25324718242500205,0 +8576,1475491200,0.262389030025975,0 +8577,1475491500,0.2555489302770415,0 +8578,1475491800,0.26294740551567963,0 +8579,1475492100,0.26169106066379183,0 +8580,1475492400,0.2627612803524448,0 +8581,1475492700,0.2650195989997645,0 +8582,1475493000,0.26843809783121303,0 +8583,1475493300,0.2716022256062056,0 +8584,1475493600,0.2857725547006613,0 +8585,1475493900,0.2765345424320345,0 +8586,1475494200,0.2863774614812272,0 +8587,1475494500,0.2893089328022287,0 +8588,1475494800,0.30087040752535993,0 +8589,1475495100,0.29000690216441194,0 +8590,1475495400,0.29977847323434714,0 +8591,1475495700,0.302405940121995,0 +8592,1475496000,0.3125962928094713,0 +8593,1475496300,0.300451625908029,0 +8594,1475496600,0.31501591993152445,0 +8595,1475496900,0.3256964022153791,0 +8596,1475497200,0.3201126473183333,0 +8597,1475497500,0.32469442841973706,0 +8598,1475497800,0.3282090919185243,0 +8599,1475498100,0.3270923409391151,0 +8600,1475498400,0.3254637457615982,0 +8601,1475498700,0.3320711890559103,0 +8602,1475499000,0.3287209361176828,0 +8603,1475499300,0.3341650971420397,0 +8604,1475499600,0.32704580964856905,0 +8605,1475499900,0.3424476669065163,0 +8606,1475500200,0.343120819579988,0 +8607,1475500500,0.3388864721161323,0 +8608,1475500800,0.3437505430491603,0 +8609,1475501100,0.3553833657518644,0 +8610,1475501400,0.3595928965267984,0 +8611,1475501700,0.35884839587385897,0 +8612,1475502000,0.35598827253211496,0 +8613,1475502300,0.3504727635282904,0 +8614,1475502600,0.3672240282194276,0 +8615,1475502900,0.3714583756832833,0 +8616,1475503200,0.36385206067852505,0 +8617,1475503500,0.3524053631401066,0 +8618,1475503800,0.34889069964026864,0 
+8619,1475504100,0.3505658261093824,0 +8620,1475504400,0.3418892914168117,0 +8621,1475504700,0.3457048572636517,0 +8622,1475505000,0.3403754734220112,0 +8623,1475505300,0.3398170979323066,0 +8624,1475505600,0.3360015320865174,0 +8625,1475505900,0.32495190156243964,0 +8626,1475506200,0.3362341885402983,0 +8627,1475506500,0.3115726044111541,0 +8628,1475506800,0.3145754237118336,0 +8629,1475507100,0.30903820010533384,0 +8630,1475507400,0.30396628940744663,0 +8631,1475507700,0.3023842255196351,0 +8632,1475508000,0.3015683768874381,0 +8633,1475508300,0.2820965827267203,0 +8634,1475508600,0.2733487000545769,0 +8635,1475508900,0.2742793258707512,0 +8636,1475509200,0.2616693460614319,0 +8637,1475509500,0.2604130012096492,0 +8638,1475509800,0.23049338121917284,0 +8639,1475510100,0.2343771929587086,0 +8640,1475510400,0.2283994731327281,0 +8641,1475510700,0.2214663108021772,0 +8642,1475511000,0.21551030557855647,0 +8643,1475511300,0.2069237313812097,0 +8644,1475511600,0.19531572536730124,0 +8645,1475511900,0.17591217709976253,0 +8646,1475512200,0.1748884887019708,0 +8647,1475512500,0.17030360551421542,0 +8648,1475512800,0.1574609692509051,0 +8649,1475513100,0.15429684147580744,0 +8650,1475513400,0.15888172466356282,0 +8651,1475513700,0.138615796473113,0 +8652,1475514000,0.13214794705054406,0 +8653,1475514300,0.13594179829460384,0 +8654,1475514600,0.12181800049090435,0 +8655,1475514900,0.11532843646597553,0 +8656,1475515200,0.1120929607117254,0 +8657,1475515500,0.10830221155380708,0 +8658,1475515800,0.10267192536590007,0 +8659,1475516100,0.09199144308246578,0 +8660,1475516400,0.09143306759276118,0 +8661,1475516700,0.0912717591173972,0 +8662,1475517000,0.08331490838884432,0 +8663,1475517300,0.08475737840417719,0 +8664,1475517600,0.07612427291590597,0 +8665,1475517900,0.07358986861008568,0 +8666,1475518200,0.07002867381970168,0 +8667,1475518500,0.07005349050862336,0 +8668,1475518800,0.06993561123860964,0 +8669,1475519100,0.06807435960626106,0 +8670,1475519400,0.06982083405484249,0 +8671,1475519700,0.06872579767705768,0 +8672,1475520000,0.06081547824010159,0 +8673,1475520300,0.05834931982645161,0 +8674,1475520600,0.05930476233154761,0 +8675,1475520900,0.05646635359247872,0 +8676,1475521200,0.053395288398578135,0 +8677,1475521500,0.05392884720041181,0 +8678,1475521800,0.051602282659450716,0 +8679,1475522100,0.05092912998597899,0 +8680,1475522400,0.050066750063025835,0 +8681,1475522700,0.05111525514921381,0 +8682,1475523000,0.04983409360924496,0 +8683,1475523300,0.0467630284153444,0 +8684,1475523600,0.04764712294097267,0 +8685,1475523900,0.04702050155804696,0 +8686,1475524200,0.04380984249250836,0 +8687,1475524500,0.04666996583425236,0 +8688,1475524800,0.045298843798387216,0 +8689,1475525100,0.045553214854843226,0 +8690,1475525400,0.04478699959922865,0 +8691,1475525700,0.04318011902333607,0 +8692,1475526000,0.04504137065568466,0 +8693,1475526300,0.0454601522727004,0 +8694,1475526600,0.043902905073600375,0 +8695,1475526900,0.041808996987470905,0 +8696,1475527200,0.04257521224203466,0 +8697,1475527500,0.04108621093615582,0 +8698,1475527800,0.041551523844768366,0 +8699,1475528100,0.04197030546178409,0 +8700,1475528400,0.04608987574187265,0 +8701,1475528700,0.04257521224203466,0 +8702,1475529000,0.044228624109524085,0 +8703,1475529300,0.04273962280364522,0 +8704,1475529600,0.0454601522727004,0 +8705,1475529900,0.04399596765574322,0 +8706,1475530200,0.04373849451304064,0 +8707,1475530500,0.04653037196156353,0 +8708,1475530800,0.04727487261450297,0 +8709,1475531100,0.04969449973655612,0 
+8710,1475531400,0.052346783312390135,0 +8711,1475531700,0.05490600430713215,0 +8712,1475532000,0.05937300822476872,0 +8713,1475532300,0.06000273169394104,0 +8714,1475532600,0.0673081443506465,0 +8715,1475532900,0.07182167955882914,0 +8716,1475533200,0.07900921294657168,0 +8717,1475533500,0.08422071751714774,0 +8718,1475533800,0.08508309744010087,0 +8719,1475534100,0.09389922600525527,0 +8720,1475534400,0.09038766459176896,0 +8721,1475534700,0.10285805052913492,0 +8722,1475535000,0.10360255118207436,0 +8723,1475535300,0.10913977478836394,0 +8724,1475535600,0.1112553974771685,0 +8725,1475535900,0.11593334324654135,0 +8726,1475536200,0.12603063335219006,0 +8727,1475536500,0.12761269723968632,0 +8728,1475536800,0.1336834796472842,0 +8729,1475537100,0.14617868227262615,0 +8730,1475537400,0.14199086609973674,0 +8731,1475537700,0.14903880561437013,0 +8732,1475538000,0.1516910891905194,0 +8733,1475538300,0.16262594253072496,0 +8734,1475538600,0.15690259376109547,0 +8735,1475538900,0.1707006725291865,0 +8736,1475539200,0.16974523002451086,0 +8737,1475539500,0.17491020330433069,0 +8738,1475539800,0.17400439417665778,0 +8739,1475540100,0.18689356173082933,0 +8740,1475540400,0.19103484661284686,0 +8741,1475540700,0.19103484661284686,0 +8742,1475541000,0.1991778225044771,0 +8743,1475541300,0.20071335510118568,0 +8744,1475541600,0.20792570517664155,0 +8745,1475541900,0.21101848497241865,0 +8746,1475542200,0.21416089814505126,0 +8747,1475542500,0.2257937208474401,0 +8748,1475542800,0.24287070457450105,0 +8749,1475543100,0.2380314503302897,0 +8750,1475543400,0.2489663036704952,0 +8751,1475543700,0.2555489302770415,0 +8752,1475544000,0.2535946160629704,0 +8753,1475544300,0.2590853083785038,0 +8754,1475544600,0.2673213468518039,0 +8755,1475544900,0.2681123787954995,0 +8756,1475545200,0.2762088233963209,0 +8757,1475545500,0.2662045958723947,0 +8758,1475545800,0.2700915096979669,0 +8759,1475546100,0.2864922386652045,0 +8760,1475546400,0.2939372451947039,0 +8761,1475546700,0.2877268689147324,0 +8762,1475547000,0.2870754308434104,0 +8763,1475547300,0.29945275419863354,0 +8764,1475547600,0.2889583970781189,0 +8765,1475547900,0.29738056071465385,0 +8766,1475548200,0.3018723813207918,0 +8767,1475548500,0.3191354902105622,0 +8768,1475548800,0.3033613826261452,0 +8769,1475549100,0.30647897911059185,0 +8770,1475549400,0.30010419226995555,0 +8771,1475549700,0.31322601627864355,0 +8772,1475550000,0.297172720948954,0 +8773,1475550300,0.30457119618769724,0 +8774,1475550600,0.29947446880099343,0 +8775,1475550900,0.3089668521258661,0 +8776,1475551200,0.29372940542910914,0 +8777,1475551500,0.2971479042605577,0 +8778,1475551800,0.2978706903111373,0 +8779,1475552100,0.3058492556414195,0 +8780,1475552400,0.30889860623264503,0 +8781,1475552700,0.3077818552532359,0 +8782,1475553000,0.309059914708009,0 +8783,1475553300,0.30985094665149443,0 diff --git a/datasets/anomaly/kpi/SCORE/problem_TEST/dataSplits.csv b/datasets/anomaly/kpi/SCORE/problem_TEST/dataSplits.csv new file mode 100644 index 0000000..1f92bd4 --- /dev/null +++ b/datasets/anomaly/kpi/SCORE/problem_TEST/dataSplits.csv @@ -0,0 +1,7028 @@ +d3mIndex,type,repeat,fold +7027,TEST,0,0 +7028,TEST,0,0 +7029,TEST,0,0 +7030,TEST,0,0 +7031,TEST,0,0 +7032,TEST,0,0 +7033,TEST,0,0 +7034,TEST,0,0 +7035,TEST,0,0 +7036,TEST,0,0 +7037,TEST,0,0 +7038,TEST,0,0 +7039,TEST,0,0 +7040,TEST,0,0 +7041,TEST,0,0 +7042,TEST,0,0 +7043,TEST,0,0 +7044,TEST,0,0 +7045,TEST,0,0 +7046,TEST,0,0 +7047,TEST,0,0 +7048,TEST,0,0 +7049,TEST,0,0 +7050,TEST,0,0 +7051,TEST,0,0 +7052,TEST,0,0 +7053,TEST,0,0 
+7054,TEST,0,0 +7055,TEST,0,0 +7056,TEST,0,0 +7057,TEST,0,0 +7058,TEST,0,0 +7059,TEST,0,0 +7060,TEST,0,0 +7061,TEST,0,0 +7062,TEST,0,0 +7063,TEST,0,0 +7064,TEST,0,0 +7065,TEST,0,0 +7066,TEST,0,0 +7067,TEST,0,0 +7068,TEST,0,0 +7069,TEST,0,0 +7070,TEST,0,0 +7071,TEST,0,0 +7072,TEST,0,0 +7073,TEST,0,0 +7074,TEST,0,0 +7075,TEST,0,0 +7076,TEST,0,0 +7077,TEST,0,0 +7078,TEST,0,0 +7079,TEST,0,0 +7080,TEST,0,0 +7081,TEST,0,0 +7082,TEST,0,0 +7083,TEST,0,0 +7084,TEST,0,0 +7085,TEST,0,0 +7086,TEST,0,0 +7087,TEST,0,0 +7088,TEST,0,0 +7089,TEST,0,0 +7090,TEST,0,0 +7091,TEST,0,0 +7092,TEST,0,0 +7093,TEST,0,0 +7094,TEST,0,0 +7095,TEST,0,0 +7096,TEST,0,0 +7097,TEST,0,0 +7098,TEST,0,0 +7099,TEST,0,0 +7100,TEST,0,0 +7101,TEST,0,0 +7102,TEST,0,0 +7103,TEST,0,0 +7104,TEST,0,0 +7105,TEST,0,0 +7106,TEST,0,0 +7107,TEST,0,0 +7108,TEST,0,0 +7109,TEST,0,0 +7110,TEST,0,0 +7111,TEST,0,0 +7112,TEST,0,0 +7113,TEST,0,0 +7114,TEST,0,0 +7115,TEST,0,0 +7116,TEST,0,0 +7117,TEST,0,0 +7118,TEST,0,0 +7119,TEST,0,0 +7120,TEST,0,0 +7121,TEST,0,0 +7122,TEST,0,0 +7123,TEST,0,0 +7124,TEST,0,0 +7125,TEST,0,0 +7126,TEST,0,0 +7127,TEST,0,0 +7128,TEST,0,0 +7129,TEST,0,0 +7130,TEST,0,0 +7131,TEST,0,0 +7132,TEST,0,0 +7133,TEST,0,0 +7134,TEST,0,0 +7135,TEST,0,0 +7136,TEST,0,0 +7137,TEST,0,0 +7138,TEST,0,0 +7139,TEST,0,0 +7140,TEST,0,0 +7141,TEST,0,0 +7142,TEST,0,0 +7143,TEST,0,0 +7144,TEST,0,0 +7145,TEST,0,0 +7146,TEST,0,0 +7147,TEST,0,0 +7148,TEST,0,0 +7149,TEST,0,0 +7150,TEST,0,0 +7151,TEST,0,0 +7152,TEST,0,0 +7153,TEST,0,0 +7154,TEST,0,0 +7155,TEST,0,0 +7156,TEST,0,0 +7157,TEST,0,0 +7158,TEST,0,0 +7159,TEST,0,0 +7160,TEST,0,0 +7161,TEST,0,0 +7162,TEST,0,0 +7163,TEST,0,0 +7164,TEST,0,0 +7165,TEST,0,0 +7166,TEST,0,0 +7167,TEST,0,0 +7168,TEST,0,0 +7169,TEST,0,0 +7170,TEST,0,0 +7171,TEST,0,0 +7172,TEST,0,0 +7173,TEST,0,0 +7174,TEST,0,0 +7175,TEST,0,0 +7176,TEST,0,0 +7177,TEST,0,0 +7178,TEST,0,0 +7179,TEST,0,0 +7180,TEST,0,0 +7181,TEST,0,0 +7182,TEST,0,0 +7183,TEST,0,0 +7184,TEST,0,0 +7185,TEST,0,0 +7186,TEST,0,0 +7187,TEST,0,0 +7188,TEST,0,0 +7189,TEST,0,0 +7190,TEST,0,0 +7191,TEST,0,0 +7192,TEST,0,0 +7193,TEST,0,0 +7194,TEST,0,0 +7195,TEST,0,0 +7196,TEST,0,0 +7197,TEST,0,0 +7198,TEST,0,0 +7199,TEST,0,0 +7200,TEST,0,0 +7201,TEST,0,0 +7202,TEST,0,0 +7203,TEST,0,0 +7204,TEST,0,0 +7205,TEST,0,0 +7206,TEST,0,0 +7207,TEST,0,0 +7208,TEST,0,0 +7209,TEST,0,0 +7210,TEST,0,0 +7211,TEST,0,0 +7212,TEST,0,0 +7213,TEST,0,0 +7214,TEST,0,0 +7215,TEST,0,0 +7216,TEST,0,0 +7217,TEST,0,0 +7218,TEST,0,0 +7219,TEST,0,0 +7220,TEST,0,0 +7221,TEST,0,0 +7222,TEST,0,0 +7223,TEST,0,0 +7224,TEST,0,0 +7225,TEST,0,0 +7226,TEST,0,0 +7227,TEST,0,0 +7228,TEST,0,0 +7229,TEST,0,0 +7230,TEST,0,0 +7231,TEST,0,0 +7232,TEST,0,0 +7233,TEST,0,0 +7234,TEST,0,0 +7235,TEST,0,0 +7236,TEST,0,0 +7237,TEST,0,0 +7238,TEST,0,0 +7239,TEST,0,0 +7240,TEST,0,0 +7241,TEST,0,0 +7242,TEST,0,0 +7243,TEST,0,0 +7244,TEST,0,0 +7245,TEST,0,0 +7246,TEST,0,0 +7247,TEST,0,0 +7248,TEST,0,0 +7249,TEST,0,0 +7250,TEST,0,0 +7251,TEST,0,0 +7252,TEST,0,0 +7253,TEST,0,0 +7254,TEST,0,0 +7255,TEST,0,0 +7256,TEST,0,0 +7257,TEST,0,0 +7258,TEST,0,0 +7259,TEST,0,0 +7260,TEST,0,0 +7261,TEST,0,0 +7262,TEST,0,0 +7263,TEST,0,0 +7264,TEST,0,0 +7265,TEST,0,0 +7266,TEST,0,0 +7267,TEST,0,0 +7268,TEST,0,0 +7269,TEST,0,0 +7270,TEST,0,0 +7271,TEST,0,0 +7272,TEST,0,0 +7273,TEST,0,0 +7274,TEST,0,0 +7275,TEST,0,0 +7276,TEST,0,0 +7277,TEST,0,0 +7278,TEST,0,0 +7279,TEST,0,0 +7280,TEST,0,0 +7281,TEST,0,0 +7282,TEST,0,0 +7283,TEST,0,0 +7284,TEST,0,0 +7285,TEST,0,0 +7286,TEST,0,0 +7287,TEST,0,0 +7288,TEST,0,0 +7289,TEST,0,0 +7290,TEST,0,0 
+7291,TEST,0,0 +7292,TEST,0,0 +7293,TEST,0,0 +7294,TEST,0,0 +7295,TEST,0,0 +7296,TEST,0,0 +7297,TEST,0,0 +7298,TEST,0,0 +7299,TEST,0,0 +7300,TEST,0,0 +7301,TEST,0,0 +7302,TEST,0,0 +7303,TEST,0,0 +7304,TEST,0,0 +7305,TEST,0,0 +7306,TEST,0,0 +7307,TEST,0,0 +7308,TEST,0,0 +7309,TEST,0,0 +7310,TEST,0,0 +7311,TEST,0,0 +7312,TEST,0,0 +7313,TEST,0,0 +7314,TEST,0,0 +7315,TEST,0,0 +7316,TEST,0,0 +7317,TEST,0,0 +7318,TEST,0,0 +7319,TEST,0,0 +7320,TEST,0,0 +7321,TEST,0,0 +7322,TEST,0,0 +7323,TEST,0,0 +7324,TEST,0,0 +7325,TEST,0,0 +7326,TEST,0,0 +7327,TEST,0,0 +7328,TEST,0,0 +7329,TEST,0,0 +7330,TEST,0,0 +7331,TEST,0,0 +7332,TEST,0,0 +7333,TEST,0,0 +7334,TEST,0,0 +7335,TEST,0,0 +7336,TEST,0,0 +7337,TEST,0,0 +7338,TEST,0,0 +7339,TEST,0,0 +7340,TEST,0,0 +7341,TEST,0,0 +7342,TEST,0,0 +7343,TEST,0,0 +7344,TEST,0,0 +7345,TEST,0,0 +7346,TEST,0,0 +7347,TEST,0,0 +7348,TEST,0,0 +7349,TEST,0,0 +7350,TEST,0,0 +7351,TEST,0,0 +7352,TEST,0,0 +7353,TEST,0,0 +7354,TEST,0,0 +7355,TEST,0,0 +7356,TEST,0,0 +7357,TEST,0,0 +7358,TEST,0,0 +7359,TEST,0,0 +7360,TEST,0,0 +7361,TEST,0,0 +7362,TEST,0,0 +7363,TEST,0,0 +7364,TEST,0,0 +7365,TEST,0,0 +7366,TEST,0,0 +7367,TEST,0,0 +7368,TEST,0,0 +7369,TEST,0,0 +7370,TEST,0,0 +7371,TEST,0,0 +7372,TEST,0,0 +7373,TEST,0,0 +7374,TEST,0,0 +7375,TEST,0,0 +7376,TEST,0,0 +7377,TEST,0,0 +7378,TEST,0,0 +7379,TEST,0,0 +7380,TEST,0,0 +7381,TEST,0,0 +7382,TEST,0,0 +7383,TEST,0,0 +7384,TEST,0,0 +7385,TEST,0,0 +7386,TEST,0,0 +7387,TEST,0,0 +7388,TEST,0,0 +7389,TEST,0,0 +7390,TEST,0,0 +7391,TEST,0,0 +7392,TEST,0,0 +7393,TEST,0,0 +7394,TEST,0,0 +7395,TEST,0,0 +7396,TEST,0,0 +7397,TEST,0,0 +7398,TEST,0,0 +7399,TEST,0,0 +7400,TEST,0,0 +7401,TEST,0,0 +7402,TEST,0,0 +7403,TEST,0,0 +7404,TEST,0,0 +7405,TEST,0,0 +7406,TEST,0,0 +7407,TEST,0,0 +7408,TEST,0,0 +7409,TEST,0,0 +7410,TEST,0,0 +7411,TEST,0,0 +7412,TEST,0,0 +7413,TEST,0,0 +7414,TEST,0,0 +7415,TEST,0,0 +7416,TEST,0,0 +7417,TEST,0,0 +7418,TEST,0,0 +7419,TEST,0,0 +7420,TEST,0,0 +7421,TEST,0,0 +7422,TEST,0,0 +7423,TEST,0,0 +7424,TEST,0,0 +7425,TEST,0,0 +7426,TEST,0,0 +7427,TEST,0,0 +7428,TEST,0,0 +7429,TEST,0,0 +7430,TEST,0,0 +7431,TEST,0,0 +7432,TEST,0,0 +7433,TEST,0,0 +7434,TEST,0,0 +7435,TEST,0,0 +7436,TEST,0,0 +7437,TEST,0,0 +7438,TEST,0,0 +7439,TEST,0,0 +7440,TEST,0,0 +7441,TEST,0,0 +7442,TEST,0,0 +7443,TEST,0,0 +7444,TEST,0,0 +7445,TEST,0,0 +7446,TEST,0,0 +7447,TEST,0,0 +7448,TEST,0,0 +7449,TEST,0,0 +7450,TEST,0,0 +7451,TEST,0,0 +7452,TEST,0,0 +7453,TEST,0,0 +7454,TEST,0,0 +7455,TEST,0,0 +7456,TEST,0,0 +7457,TEST,0,0 +7458,TEST,0,0 +7459,TEST,0,0 +7460,TEST,0,0 +7461,TEST,0,0 +7462,TEST,0,0 +7463,TEST,0,0 +7464,TEST,0,0 +7465,TEST,0,0 +7466,TEST,0,0 +7467,TEST,0,0 +7468,TEST,0,0 +7469,TEST,0,0 +7470,TEST,0,0 +7471,TEST,0,0 +7472,TEST,0,0 +7473,TEST,0,0 +7474,TEST,0,0 +7475,TEST,0,0 +7476,TEST,0,0 +7477,TEST,0,0 +7478,TEST,0,0 +7479,TEST,0,0 +7480,TEST,0,0 +7481,TEST,0,0 +7482,TEST,0,0 +7483,TEST,0,0 +7484,TEST,0,0 +7485,TEST,0,0 +7486,TEST,0,0 +7487,TEST,0,0 +7488,TEST,0,0 +7489,TEST,0,0 +7490,TEST,0,0 +7491,TEST,0,0 +7492,TEST,0,0 +7493,TEST,0,0 +7494,TEST,0,0 +7495,TEST,0,0 +7496,TEST,0,0 +7497,TEST,0,0 +7498,TEST,0,0 +7499,TEST,0,0 +7500,TEST,0,0 +7501,TEST,0,0 +7502,TEST,0,0 +7503,TEST,0,0 +7504,TEST,0,0 +7505,TEST,0,0 +7506,TEST,0,0 +7507,TEST,0,0 +7508,TEST,0,0 +7509,TEST,0,0 +7510,TEST,0,0 +7511,TEST,0,0 +7512,TEST,0,0 +7513,TEST,0,0 +7514,TEST,0,0 +7515,TEST,0,0 +7516,TEST,0,0 +7517,TEST,0,0 +7518,TEST,0,0 +7519,TEST,0,0 +7520,TEST,0,0 +7521,TEST,0,0 +7522,TEST,0,0 +7523,TEST,0,0 +7524,TEST,0,0 +7525,TEST,0,0 +7526,TEST,0,0 +7527,TEST,0,0 
+7528,TEST,0,0 +7529,TEST,0,0 +7530,TEST,0,0 +7531,TEST,0,0 +7532,TEST,0,0 +7533,TEST,0,0 +7534,TEST,0,0 +7535,TEST,0,0 +7536,TEST,0,0 +7537,TEST,0,0 +7538,TEST,0,0 +7539,TEST,0,0 +7540,TEST,0,0 +7541,TEST,0,0 +7542,TEST,0,0 +7543,TEST,0,0 +7544,TEST,0,0 +7545,TEST,0,0 +7546,TEST,0,0 +7547,TEST,0,0 +7548,TEST,0,0 +7549,TEST,0,0 +7550,TEST,0,0 +7551,TEST,0,0 +7552,TEST,0,0 +7553,TEST,0,0 +7554,TEST,0,0 +7555,TEST,0,0 +7556,TEST,0,0 +7557,TEST,0,0 +7558,TEST,0,0 +7559,TEST,0,0 +7560,TEST,0,0 +7561,TEST,0,0 +7562,TEST,0,0 +7563,TEST,0,0 +7564,TEST,0,0 +7565,TEST,0,0 +7566,TEST,0,0 +7567,TEST,0,0 +7568,TEST,0,0 +7569,TEST,0,0 +7570,TEST,0,0 +7571,TEST,0,0 +7572,TEST,0,0 +7573,TEST,0,0 +7574,TEST,0,0 +7575,TEST,0,0 +7576,TEST,0,0 +7577,TEST,0,0 +7578,TEST,0,0 +7579,TEST,0,0 +7580,TEST,0,0 +7581,TEST,0,0 +7582,TEST,0,0 +7583,TEST,0,0 +7584,TEST,0,0 +7585,TEST,0,0 +7586,TEST,0,0 +7587,TEST,0,0 +7588,TEST,0,0 +7589,TEST,0,0 +7590,TEST,0,0 +7591,TEST,0,0 +7592,TEST,0,0 +7593,TEST,0,0 +7594,TEST,0,0 +7595,TEST,0,0 +7596,TEST,0,0 +7597,TEST,0,0 +7598,TEST,0,0 +7599,TEST,0,0 +7600,TEST,0,0 +7601,TEST,0,0 +7602,TEST,0,0 +7603,TEST,0,0 +7604,TEST,0,0 +7605,TEST,0,0 +7606,TEST,0,0 +7607,TEST,0,0 +7608,TEST,0,0 +7609,TEST,0,0 +7610,TEST,0,0 +7611,TEST,0,0 +7612,TEST,0,0 +7613,TEST,0,0 +7614,TEST,0,0 +7615,TEST,0,0 +7616,TEST,0,0 +7617,TEST,0,0 +7618,TEST,0,0 +7619,TEST,0,0 +7620,TEST,0,0 +7621,TEST,0,0 +7622,TEST,0,0 +7623,TEST,0,0 +7624,TEST,0,0 +7625,TEST,0,0 +7626,TEST,0,0 +7627,TEST,0,0 +7628,TEST,0,0 +7629,TEST,0,0 +7630,TEST,0,0 +7631,TEST,0,0 +7632,TEST,0,0 +7633,TEST,0,0 +7634,TEST,0,0 +7635,TEST,0,0 +7636,TEST,0,0 +7637,TEST,0,0 +7638,TEST,0,0 +7639,TEST,0,0 +7640,TEST,0,0 +7641,TEST,0,0 +7642,TEST,0,0 +7643,TEST,0,0 +7644,TEST,0,0 +7645,TEST,0,0 +7646,TEST,0,0 +7647,TEST,0,0 +7648,TEST,0,0 +7649,TEST,0,0 +7650,TEST,0,0 +7651,TEST,0,0 +7652,TEST,0,0 +7653,TEST,0,0 +7654,TEST,0,0 +7655,TEST,0,0 +7656,TEST,0,0 +7657,TEST,0,0 +7658,TEST,0,0 +7659,TEST,0,0 +7660,TEST,0,0 +7661,TEST,0,0 +7662,TEST,0,0 +7663,TEST,0,0 +7664,TEST,0,0 +7665,TEST,0,0 +7666,TEST,0,0 +7667,TEST,0,0 +7668,TEST,0,0 +7669,TEST,0,0 +7670,TEST,0,0 +7671,TEST,0,0 +7672,TEST,0,0 +7673,TEST,0,0 +7674,TEST,0,0 +7675,TEST,0,0 +7676,TEST,0,0 +7677,TEST,0,0 +7678,TEST,0,0 +7679,TEST,0,0 +7680,TEST,0,0 +7681,TEST,0,0 +7682,TEST,0,0 +7683,TEST,0,0 +7684,TEST,0,0 +7685,TEST,0,0 +7686,TEST,0,0 +7687,TEST,0,0 +7688,TEST,0,0 +7689,TEST,0,0 +7690,TEST,0,0 +7691,TEST,0,0 +7692,TEST,0,0 +7693,TEST,0,0 +7694,TEST,0,0 +7695,TEST,0,0 +7696,TEST,0,0 +7697,TEST,0,0 +7698,TEST,0,0 +7699,TEST,0,0 +7700,TEST,0,0 +7701,TEST,0,0 +7702,TEST,0,0 +7703,TEST,0,0 +7704,TEST,0,0 +7705,TEST,0,0 +7706,TEST,0,0 +7707,TEST,0,0 +7708,TEST,0,0 +7709,TEST,0,0 +7710,TEST,0,0 +7711,TEST,0,0 +7712,TEST,0,0 +7713,TEST,0,0 +7714,TEST,0,0 +7715,TEST,0,0 +7716,TEST,0,0 +7717,TEST,0,0 +7718,TEST,0,0 +7719,TEST,0,0 +7720,TEST,0,0 +7721,TEST,0,0 +7722,TEST,0,0 +7723,TEST,0,0 +7724,TEST,0,0 +7725,TEST,0,0 +7726,TEST,0,0 +7727,TEST,0,0 +7728,TEST,0,0 +7729,TEST,0,0 +7730,TEST,0,0 +7731,TEST,0,0 +7732,TEST,0,0 +7733,TEST,0,0 +7734,TEST,0,0 +7735,TEST,0,0 +7736,TEST,0,0 +7737,TEST,0,0 +7738,TEST,0,0 +7739,TEST,0,0 +7740,TEST,0,0 +7741,TEST,0,0 +7742,TEST,0,0 +7743,TEST,0,0 +7744,TEST,0,0 +7745,TEST,0,0 +7746,TEST,0,0 +7747,TEST,0,0 +7748,TEST,0,0 +7749,TEST,0,0 +7750,TEST,0,0 +7751,TEST,0,0 +7752,TEST,0,0 +7753,TEST,0,0 +7754,TEST,0,0 +7755,TEST,0,0 +7756,TEST,0,0 +7757,TEST,0,0 +7758,TEST,0,0 +7759,TEST,0,0 +7760,TEST,0,0 +7761,TEST,0,0 +7762,TEST,0,0 +7763,TEST,0,0 +7764,TEST,0,0 
+7765,TEST,0,0 +7766,TEST,0,0 +7767,TEST,0,0 +7768,TEST,0,0 +7769,TEST,0,0 +7770,TEST,0,0 +7771,TEST,0,0 +7772,TEST,0,0 +7773,TEST,0,0 +7774,TEST,0,0 +7775,TEST,0,0 +7776,TEST,0,0 +7777,TEST,0,0 +7778,TEST,0,0 +7779,TEST,0,0 +7780,TEST,0,0 +7781,TEST,0,0 +7782,TEST,0,0 +7783,TEST,0,0 +7784,TEST,0,0 +7785,TEST,0,0 +7786,TEST,0,0 +7787,TEST,0,0 +7788,TEST,0,0 +7789,TEST,0,0 +7790,TEST,0,0 +7791,TEST,0,0 +7792,TEST,0,0 +7793,TEST,0,0 +7794,TEST,0,0 +7795,TEST,0,0 +7796,TEST,0,0 +7797,TEST,0,0 +7798,TEST,0,0 +7799,TEST,0,0 +7800,TEST,0,0 +7801,TEST,0,0 +7802,TEST,0,0 +7803,TEST,0,0 +7804,TEST,0,0 +7805,TEST,0,0 +7806,TEST,0,0 +7807,TEST,0,0 +7808,TEST,0,0 +7809,TEST,0,0 +7810,TEST,0,0 +7811,TEST,0,0 +7812,TEST,0,0 +7813,TEST,0,0 +7814,TEST,0,0 +7815,TEST,0,0 +7816,TEST,0,0 +7817,TEST,0,0 +7818,TEST,0,0 +7819,TEST,0,0 +7820,TEST,0,0 +7821,TEST,0,0 +7822,TEST,0,0 +7823,TEST,0,0 +7824,TEST,0,0 +7825,TEST,0,0 +7826,TEST,0,0 +7827,TEST,0,0 +7828,TEST,0,0 +7829,TEST,0,0 +7830,TEST,0,0 +7831,TEST,0,0 +7832,TEST,0,0 +7833,TEST,0,0 +7834,TEST,0,0 +7835,TEST,0,0 +7836,TEST,0,0 +7837,TEST,0,0 +7838,TEST,0,0 +7839,TEST,0,0 +7840,TEST,0,0 +7841,TEST,0,0 +7842,TEST,0,0 +7843,TEST,0,0 +7844,TEST,0,0 +7845,TEST,0,0 +7846,TEST,0,0 +7847,TEST,0,0 +7848,TEST,0,0 +7849,TEST,0,0 +7850,TEST,0,0 +7851,TEST,0,0 +7852,TEST,0,0 +7853,TEST,0,0 +7854,TEST,0,0 +7855,TEST,0,0 +7856,TEST,0,0 +7857,TEST,0,0 +7858,TEST,0,0 +7859,TEST,0,0 +7860,TEST,0,0 +7861,TEST,0,0 +7862,TEST,0,0 +7863,TEST,0,0 +7864,TEST,0,0 +7865,TEST,0,0 +7866,TEST,0,0 +7867,TEST,0,0 +7868,TEST,0,0 +7869,TEST,0,0 +7870,TEST,0,0 +7871,TEST,0,0 +7872,TEST,0,0 +7873,TEST,0,0 +7874,TEST,0,0 +7875,TEST,0,0 +7876,TEST,0,0 +7877,TEST,0,0 +7878,TEST,0,0 +7879,TEST,0,0 +7880,TEST,0,0 +7881,TEST,0,0 +7882,TEST,0,0 +7883,TEST,0,0 +7884,TEST,0,0 +7885,TEST,0,0 +7886,TEST,0,0 +7887,TEST,0,0 +7888,TEST,0,0 +7889,TEST,0,0 +7890,TEST,0,0 +7891,TEST,0,0 +7892,TEST,0,0 +7893,TEST,0,0 +7894,TEST,0,0 +7895,TEST,0,0 +7896,TEST,0,0 +7897,TEST,0,0 +7898,TEST,0,0 +7899,TEST,0,0 +7900,TEST,0,0 +7901,TEST,0,0 +7902,TEST,0,0 +7903,TEST,0,0 +7904,TEST,0,0 +7905,TEST,0,0 +7906,TEST,0,0 +7907,TEST,0,0 +7908,TEST,0,0 +7909,TEST,0,0 +7910,TEST,0,0 +7911,TEST,0,0 +7912,TEST,0,0 +7913,TEST,0,0 +7914,TEST,0,0 +7915,TEST,0,0 +7916,TEST,0,0 +7917,TEST,0,0 +7918,TEST,0,0 +7919,TEST,0,0 +7920,TEST,0,0 +7921,TEST,0,0 +7922,TEST,0,0 +7923,TEST,0,0 +7924,TEST,0,0 +7925,TEST,0,0 +7926,TEST,0,0 +7927,TEST,0,0 +7928,TEST,0,0 +7929,TEST,0,0 +7930,TEST,0,0 +7931,TEST,0,0 +7932,TEST,0,0 +7933,TEST,0,0 +7934,TEST,0,0 +7935,TEST,0,0 +7936,TEST,0,0 +7937,TEST,0,0 +7938,TEST,0,0 +7939,TEST,0,0 +7940,TEST,0,0 +7941,TEST,0,0 +7942,TEST,0,0 +7943,TEST,0,0 +7944,TEST,0,0 +7945,TEST,0,0 +7946,TEST,0,0 +7947,TEST,0,0 +7948,TEST,0,0 +7949,TEST,0,0 +7950,TEST,0,0 +7951,TEST,0,0 +7952,TEST,0,0 +7953,TEST,0,0 +7954,TEST,0,0 +7955,TEST,0,0 +7956,TEST,0,0 +7957,TEST,0,0 +7958,TEST,0,0 +7959,TEST,0,0 +7960,TEST,0,0 +7961,TEST,0,0 +7962,TEST,0,0 +7963,TEST,0,0 +7964,TEST,0,0 +7965,TEST,0,0 +7966,TEST,0,0 +7967,TEST,0,0 +7968,TEST,0,0 +7969,TEST,0,0 +7970,TEST,0,0 +7971,TEST,0,0 +7972,TEST,0,0 +7973,TEST,0,0 +7974,TEST,0,0 +7975,TEST,0,0 +7976,TEST,0,0 +7977,TEST,0,0 +7978,TEST,0,0 +7979,TEST,0,0 +7980,TEST,0,0 +7981,TEST,0,0 +7982,TEST,0,0 +7983,TEST,0,0 +7984,TEST,0,0 +7985,TEST,0,0 +7986,TEST,0,0 +7987,TEST,0,0 +7988,TEST,0,0 +7989,TEST,0,0 +7990,TEST,0,0 +7991,TEST,0,0 +7992,TEST,0,0 +7993,TEST,0,0 +7994,TEST,0,0 +7995,TEST,0,0 +7996,TEST,0,0 +7997,TEST,0,0 +7998,TEST,0,0 +7999,TEST,0,0 +8000,TEST,0,0 +8001,TEST,0,0 
+8002,TEST,0,0 +8003,TEST,0,0 +8004,TEST,0,0 +8005,TEST,0,0 +8006,TEST,0,0 +8007,TEST,0,0 +8008,TEST,0,0 +8009,TEST,0,0 +8010,TEST,0,0 +8011,TEST,0,0 +8012,TEST,0,0 +8013,TEST,0,0 +8014,TEST,0,0 +8015,TEST,0,0 +8016,TEST,0,0 +8017,TEST,0,0 +8018,TEST,0,0 +8019,TEST,0,0 +8020,TEST,0,0 +8021,TEST,0,0 +8022,TEST,0,0 +8023,TEST,0,0 +8024,TEST,0,0 +8025,TEST,0,0 +8026,TEST,0,0 +8027,TEST,0,0 +8028,TEST,0,0 +8029,TEST,0,0 +8030,TEST,0,0 +8031,TEST,0,0 +8032,TEST,0,0 +8033,TEST,0,0 +8034,TEST,0,0 +8035,TEST,0,0 +8036,TEST,0,0 +8037,TEST,0,0 +8038,TEST,0,0 +8039,TEST,0,0 +8040,TEST,0,0 +8041,TEST,0,0 +8042,TEST,0,0 +8043,TEST,0,0 +8044,TEST,0,0 +8045,TEST,0,0 +8046,TEST,0,0 +8047,TEST,0,0 +8048,TEST,0,0 +8049,TEST,0,0 +8050,TEST,0,0 +8051,TEST,0,0 +8052,TEST,0,0 +8053,TEST,0,0 +8054,TEST,0,0 +8055,TEST,0,0 +8056,TEST,0,0 +8057,TEST,0,0 +8058,TEST,0,0 +8059,TEST,0,0 +8060,TEST,0,0 +8061,TEST,0,0 +8062,TEST,0,0 +8063,TEST,0,0 +8064,TEST,0,0 +8065,TEST,0,0 +8066,TEST,0,0 +8067,TEST,0,0 +8068,TEST,0,0 +8069,TEST,0,0 +8070,TEST,0,0 +8071,TEST,0,0 +8072,TEST,0,0 +8073,TEST,0,0 +8074,TEST,0,0 +8075,TEST,0,0 +8076,TEST,0,0 +8077,TEST,0,0 +8078,TEST,0,0 +8079,TEST,0,0 +8080,TEST,0,0 +8081,TEST,0,0 +8082,TEST,0,0 +8083,TEST,0,0 +8084,TEST,0,0 +8085,TEST,0,0 +8086,TEST,0,0 +8087,TEST,0,0 +8088,TEST,0,0 +8089,TEST,0,0 +8090,TEST,0,0 +8091,TEST,0,0 +8092,TEST,0,0 +8093,TEST,0,0 +8094,TEST,0,0 +8095,TEST,0,0 +8096,TEST,0,0 +8097,TEST,0,0 +8098,TEST,0,0 +8099,TEST,0,0 +8100,TEST,0,0 +8101,TEST,0,0 +8102,TEST,0,0 +8103,TEST,0,0 +8104,TEST,0,0 +8105,TEST,0,0 +8106,TEST,0,0 +8107,TEST,0,0 +8108,TEST,0,0 +8109,TEST,0,0 +8110,TEST,0,0 +8111,TEST,0,0 +8112,TEST,0,0 +8113,TEST,0,0 +8114,TEST,0,0 +8115,TEST,0,0 +8116,TEST,0,0 +8117,TEST,0,0 +8118,TEST,0,0 +8119,TEST,0,0 +8120,TEST,0,0 +8121,TEST,0,0 +8122,TEST,0,0 +8123,TEST,0,0 +8124,TEST,0,0 +8125,TEST,0,0 +8126,TEST,0,0 +8127,TEST,0,0 +8128,TEST,0,0 +8129,TEST,0,0 +8130,TEST,0,0 +8131,TEST,0,0 +8132,TEST,0,0 +8133,TEST,0,0 +8134,TEST,0,0 +8135,TEST,0,0 +8136,TEST,0,0 +8137,TEST,0,0 +8138,TEST,0,0 +8139,TEST,0,0 +8140,TEST,0,0 +8141,TEST,0,0 +8142,TEST,0,0 +8143,TEST,0,0 +8144,TEST,0,0 +8145,TEST,0,0 +8146,TEST,0,0 +8147,TEST,0,0 +8148,TEST,0,0 +8149,TEST,0,0 +8150,TEST,0,0 +8151,TEST,0,0 +8152,TEST,0,0 +8153,TEST,0,0 +8154,TEST,0,0 +8155,TEST,0,0 +8156,TEST,0,0 +8157,TEST,0,0 +8158,TEST,0,0 +8159,TEST,0,0 +8160,TEST,0,0 +8161,TEST,0,0 +8162,TEST,0,0 +8163,TEST,0,0 +8164,TEST,0,0 +8165,TEST,0,0 +8166,TEST,0,0 +8167,TEST,0,0 +8168,TEST,0,0 +8169,TEST,0,0 +8170,TEST,0,0 +8171,TEST,0,0 +8172,TEST,0,0 +8173,TEST,0,0 +8174,TEST,0,0 +8175,TEST,0,0 +8176,TEST,0,0 +8177,TEST,0,0 +8178,TEST,0,0 +8179,TEST,0,0 +8180,TEST,0,0 +8181,TEST,0,0 +8182,TEST,0,0 +8183,TEST,0,0 +8184,TEST,0,0 +8185,TEST,0,0 +8186,TEST,0,0 +8187,TEST,0,0 +8188,TEST,0,0 +8189,TEST,0,0 +8190,TEST,0,0 +8191,TEST,0,0 +8192,TEST,0,0 +8193,TEST,0,0 +8194,TEST,0,0 +8195,TEST,0,0 +8196,TEST,0,0 +8197,TEST,0,0 +8198,TEST,0,0 +8199,TEST,0,0 +8200,TEST,0,0 +8201,TEST,0,0 +8202,TEST,0,0 +8203,TEST,0,0 +8204,TEST,0,0 +8205,TEST,0,0 +8206,TEST,0,0 +8207,TEST,0,0 +8208,TEST,0,0 +8209,TEST,0,0 +8210,TEST,0,0 +8211,TEST,0,0 +8212,TEST,0,0 +8213,TEST,0,0 +8214,TEST,0,0 +8215,TEST,0,0 +8216,TEST,0,0 +8217,TEST,0,0 +8218,TEST,0,0 +8219,TEST,0,0 +8220,TEST,0,0 +8221,TEST,0,0 +8222,TEST,0,0 +8223,TEST,0,0 +8224,TEST,0,0 +8225,TEST,0,0 +8226,TEST,0,0 +8227,TEST,0,0 +8228,TEST,0,0 +8229,TEST,0,0 +8230,TEST,0,0 +8231,TEST,0,0 +8232,TEST,0,0 +8233,TEST,0,0 +8234,TEST,0,0 +8235,TEST,0,0 +8236,TEST,0,0 +8237,TEST,0,0 +8238,TEST,0,0 
+8239,TEST,0,0 +8240,TEST,0,0 +8241,TEST,0,0 +8242,TEST,0,0 +8243,TEST,0,0 +8244,TEST,0,0 +8245,TEST,0,0 +8246,TEST,0,0 +8247,TEST,0,0 +8248,TEST,0,0 +8249,TEST,0,0 +8250,TEST,0,0 +8251,TEST,0,0 +8252,TEST,0,0 +8253,TEST,0,0 +8254,TEST,0,0 +8255,TEST,0,0 +8256,TEST,0,0 +8257,TEST,0,0 +8258,TEST,0,0 +8259,TEST,0,0 +8260,TEST,0,0 +8261,TEST,0,0 +8262,TEST,0,0 +8263,TEST,0,0 +8264,TEST,0,0 +8265,TEST,0,0 +8266,TEST,0,0 +8267,TEST,0,0 +8268,TEST,0,0 +8269,TEST,0,0 +8270,TEST,0,0 +8271,TEST,0,0 +8272,TEST,0,0 +8273,TEST,0,0 +8274,TEST,0,0 +8275,TEST,0,0 +8276,TEST,0,0 +8277,TEST,0,0 +8278,TEST,0,0 +8279,TEST,0,0 +8280,TEST,0,0 +8281,TEST,0,0 +8282,TEST,0,0 +8283,TEST,0,0 +8284,TEST,0,0 +8285,TEST,0,0 +8286,TEST,0,0 +8287,TEST,0,0 +8288,TEST,0,0 +8289,TEST,0,0 +8290,TEST,0,0 +8291,TEST,0,0 +8292,TEST,0,0 +8293,TEST,0,0 +8294,TEST,0,0 +8295,TEST,0,0 +8296,TEST,0,0 +8297,TEST,0,0 +8298,TEST,0,0 +8299,TEST,0,0 +8300,TEST,0,0 +8301,TEST,0,0 +8302,TEST,0,0 +8303,TEST,0,0 +8304,TEST,0,0 +8305,TEST,0,0 +8306,TEST,0,0 +8307,TEST,0,0 +8308,TEST,0,0 +8309,TEST,0,0 +8310,TEST,0,0 +8311,TEST,0,0 +8312,TEST,0,0 +8313,TEST,0,0 +8314,TEST,0,0 +8315,TEST,0,0 +8316,TEST,0,0 +8317,TEST,0,0 +8318,TEST,0,0 +8319,TEST,0,0 +8320,TEST,0,0 +8321,TEST,0,0 +8322,TEST,0,0 +8323,TEST,0,0 +8324,TEST,0,0 +8325,TEST,0,0 +8326,TEST,0,0 +8327,TEST,0,0 +8328,TEST,0,0 +8329,TEST,0,0 +8330,TEST,0,0 +8331,TEST,0,0 +8332,TEST,0,0 +8333,TEST,0,0 +8334,TEST,0,0 +8335,TEST,0,0 +8336,TEST,0,0 +8337,TEST,0,0 +8338,TEST,0,0 +8339,TEST,0,0 +8340,TEST,0,0 +8341,TEST,0,0 +8342,TEST,0,0 +8343,TEST,0,0 +8344,TEST,0,0 +8345,TEST,0,0 +8346,TEST,0,0 +8347,TEST,0,0 +8348,TEST,0,0 +8349,TEST,0,0 +8350,TEST,0,0 +8351,TEST,0,0 +8352,TEST,0,0 +8353,TEST,0,0 +8354,TEST,0,0 +8355,TEST,0,0 +8356,TEST,0,0 +8357,TEST,0,0 +8358,TEST,0,0 +8359,TEST,0,0 +8360,TEST,0,0 +8361,TEST,0,0 +8362,TEST,0,0 +8363,TEST,0,0 +8364,TEST,0,0 +8365,TEST,0,0 +8366,TEST,0,0 +8367,TEST,0,0 +8368,TEST,0,0 +8369,TEST,0,0 +8370,TEST,0,0 +8371,TEST,0,0 +8372,TEST,0,0 +8373,TEST,0,0 +8374,TEST,0,0 +8375,TEST,0,0 +8376,TEST,0,0 +8377,TEST,0,0 +8378,TEST,0,0 +8379,TEST,0,0 +8380,TEST,0,0 +8381,TEST,0,0 +8382,TEST,0,0 +8383,TEST,0,0 +8384,TEST,0,0 +8385,TEST,0,0 +8386,TEST,0,0 +8387,TEST,0,0 +8388,TEST,0,0 +8389,TEST,0,0 +8390,TEST,0,0 +8391,TEST,0,0 +8392,TEST,0,0 +8393,TEST,0,0 +8394,TEST,0,0 +8395,TEST,0,0 +8396,TEST,0,0 +8397,TEST,0,0 +8398,TEST,0,0 +8399,TEST,0,0 +8400,TEST,0,0 +8401,TEST,0,0 +8402,TEST,0,0 +8403,TEST,0,0 +8404,TEST,0,0 +8405,TEST,0,0 +8406,TEST,0,0 +8407,TEST,0,0 +8408,TEST,0,0 +8409,TEST,0,0 +8410,TEST,0,0 +8411,TEST,0,0 +8412,TEST,0,0 +8413,TEST,0,0 +8414,TEST,0,0 +8415,TEST,0,0 +8416,TEST,0,0 +8417,TEST,0,0 +8418,TEST,0,0 +8419,TEST,0,0 +8420,TEST,0,0 +8421,TEST,0,0 +8422,TEST,0,0 +8423,TEST,0,0 +8424,TEST,0,0 +8425,TEST,0,0 +8426,TEST,0,0 +8427,TEST,0,0 +8428,TEST,0,0 +8429,TEST,0,0 +8430,TEST,0,0 +8431,TEST,0,0 +8432,TEST,0,0 +8433,TEST,0,0 +8434,TEST,0,0 +8435,TEST,0,0 +8436,TEST,0,0 +8437,TEST,0,0 +8438,TEST,0,0 +8439,TEST,0,0 +8440,TEST,0,0 +8441,TEST,0,0 +8442,TEST,0,0 +8443,TEST,0,0 +8444,TEST,0,0 +8445,TEST,0,0 +8446,TEST,0,0 +8447,TEST,0,0 +8448,TEST,0,0 +8449,TEST,0,0 +8450,TEST,0,0 +8451,TEST,0,0 +8452,TEST,0,0 +8453,TEST,0,0 +8454,TEST,0,0 +8455,TEST,0,0 +8456,TEST,0,0 +8457,TEST,0,0 +8458,TEST,0,0 +8459,TEST,0,0 +8460,TEST,0,0 +8461,TEST,0,0 +8462,TEST,0,0 +8463,TEST,0,0 +8464,TEST,0,0 +8465,TEST,0,0 +8466,TEST,0,0 +8467,TEST,0,0 +8468,TEST,0,0 +8469,TEST,0,0 +8470,TEST,0,0 +8471,TEST,0,0 +8472,TEST,0,0 +8473,TEST,0,0 +8474,TEST,0,0 +8475,TEST,0,0 
+8476,TEST,0,0 +8477,TEST,0,0 +8478,TEST,0,0 +8479,TEST,0,0 +8480,TEST,0,0 +8481,TEST,0,0 +8482,TEST,0,0 +8483,TEST,0,0 +8484,TEST,0,0 +8485,TEST,0,0 +8486,TEST,0,0 +8487,TEST,0,0 +8488,TEST,0,0 +8489,TEST,0,0 +8490,TEST,0,0 +8491,TEST,0,0 +8492,TEST,0,0 +8493,TEST,0,0 +8494,TEST,0,0 +8495,TEST,0,0 +8496,TEST,0,0 +8497,TEST,0,0 +8498,TEST,0,0 +8499,TEST,0,0 +8500,TEST,0,0 +8501,TEST,0,0 +8502,TEST,0,0 +8503,TEST,0,0 +8504,TEST,0,0 +8505,TEST,0,0 +8506,TEST,0,0 +8507,TEST,0,0 +8508,TEST,0,0 +8509,TEST,0,0 +8510,TEST,0,0 +8511,TEST,0,0 +8512,TEST,0,0 +8513,TEST,0,0 +8514,TEST,0,0 +8515,TEST,0,0 +8516,TEST,0,0 +8517,TEST,0,0 +8518,TEST,0,0 +8519,TEST,0,0 +8520,TEST,0,0 +8521,TEST,0,0 +8522,TEST,0,0 +8523,TEST,0,0 +8524,TEST,0,0 +8525,TEST,0,0 +8526,TEST,0,0 +8527,TEST,0,0 +8528,TEST,0,0 +8529,TEST,0,0 +8530,TEST,0,0 +8531,TEST,0,0 +8532,TEST,0,0 +8533,TEST,0,0 +8534,TEST,0,0 +8535,TEST,0,0 +8536,TEST,0,0 +8537,TEST,0,0 +8538,TEST,0,0 +8539,TEST,0,0 +8540,TEST,0,0 +8541,TEST,0,0 +8542,TEST,0,0 +8543,TEST,0,0 +8544,TEST,0,0 +8545,TEST,0,0 +8546,TEST,0,0 +8547,TEST,0,0 +8548,TEST,0,0 +8549,TEST,0,0 +8550,TEST,0,0 +8551,TEST,0,0 +8552,TEST,0,0 +8553,TEST,0,0 +8554,TEST,0,0 +8555,TEST,0,0 +8556,TEST,0,0 +8557,TEST,0,0 +8558,TEST,0,0 +8559,TEST,0,0 +8560,TEST,0,0 +8561,TEST,0,0 +8562,TEST,0,0 +8563,TEST,0,0 +8564,TEST,0,0 +8565,TEST,0,0 +8566,TEST,0,0 +8567,TEST,0,0 +8568,TEST,0,0 +8569,TEST,0,0 +8570,TEST,0,0 +8571,TEST,0,0 +8572,TEST,0,0 +8573,TEST,0,0 +8574,TEST,0,0 +8575,TEST,0,0 +8576,TEST,0,0 +8577,TEST,0,0 +8578,TEST,0,0 +8579,TEST,0,0 +8580,TEST,0,0 +8581,TEST,0,0 +8582,TEST,0,0 +8583,TEST,0,0 +8584,TEST,0,0 +8585,TEST,0,0 +8586,TEST,0,0 +8587,TEST,0,0 +8588,TEST,0,0 +8589,TEST,0,0 +8590,TEST,0,0 +8591,TEST,0,0 +8592,TEST,0,0 +8593,TEST,0,0 +8594,TEST,0,0 +8595,TEST,0,0 +8596,TEST,0,0 +8597,TEST,0,0 +8598,TEST,0,0 +8599,TEST,0,0 +8600,TEST,0,0 +8601,TEST,0,0 +8602,TEST,0,0 +8603,TEST,0,0 +8604,TEST,0,0 +8605,TEST,0,0 +8606,TEST,0,0 +8607,TEST,0,0 +8608,TEST,0,0 +8609,TEST,0,0 +8610,TEST,0,0 +8611,TEST,0,0 +8612,TEST,0,0 +8613,TEST,0,0 +8614,TEST,0,0 +8615,TEST,0,0 +8616,TEST,0,0 +8617,TEST,0,0 +8618,TEST,0,0 +8619,TEST,0,0 +8620,TEST,0,0 +8621,TEST,0,0 +8622,TEST,0,0 +8623,TEST,0,0 +8624,TEST,0,0 +8625,TEST,0,0 +8626,TEST,0,0 +8627,TEST,0,0 +8628,TEST,0,0 +8629,TEST,0,0 +8630,TEST,0,0 +8631,TEST,0,0 +8632,TEST,0,0 +8633,TEST,0,0 +8634,TEST,0,0 +8635,TEST,0,0 +8636,TEST,0,0 +8637,TEST,0,0 +8638,TEST,0,0 +8639,TEST,0,0 +8640,TEST,0,0 +8641,TEST,0,0 +8642,TEST,0,0 +8643,TEST,0,0 +8644,TEST,0,0 +8645,TEST,0,0 +8646,TEST,0,0 +8647,TEST,0,0 +8648,TEST,0,0 +8649,TEST,0,0 +8650,TEST,0,0 +8651,TEST,0,0 +8652,TEST,0,0 +8653,TEST,0,0 +8654,TEST,0,0 +8655,TEST,0,0 +8656,TEST,0,0 +8657,TEST,0,0 +8658,TEST,0,0 +8659,TEST,0,0 +8660,TEST,0,0 +8661,TEST,0,0 +8662,TEST,0,0 +8663,TEST,0,0 +8664,TEST,0,0 +8665,TEST,0,0 +8666,TEST,0,0 +8667,TEST,0,0 +8668,TEST,0,0 +8669,TEST,0,0 +8670,TEST,0,0 +8671,TEST,0,0 +8672,TEST,0,0 +8673,TEST,0,0 +8674,TEST,0,0 +8675,TEST,0,0 +8676,TEST,0,0 +8677,TEST,0,0 +8678,TEST,0,0 +8679,TEST,0,0 +8680,TEST,0,0 +8681,TEST,0,0 +8682,TEST,0,0 +8683,TEST,0,0 +8684,TEST,0,0 +8685,TEST,0,0 +8686,TEST,0,0 +8687,TEST,0,0 +8688,TEST,0,0 +8689,TEST,0,0 +8690,TEST,0,0 +8691,TEST,0,0 +8692,TEST,0,0 +8693,TEST,0,0 +8694,TEST,0,0 +8695,TEST,0,0 +8696,TEST,0,0 +8697,TEST,0,0 +8698,TEST,0,0 +8699,TEST,0,0 +8700,TEST,0,0 +8701,TEST,0,0 +8702,TEST,0,0 +8703,TEST,0,0 +8704,TEST,0,0 +8705,TEST,0,0 +8706,TEST,0,0 +8707,TEST,0,0 +8708,TEST,0,0 +8709,TEST,0,0 +8710,TEST,0,0 +8711,TEST,0,0 +8712,TEST,0,0 
+8713,TEST,0,0 +8714,TEST,0,0 +8715,TEST,0,0 +8716,TEST,0,0 +8717,TEST,0,0 +8718,TEST,0,0 +8719,TEST,0,0 +8720,TEST,0,0 +8721,TEST,0,0 +8722,TEST,0,0 +8723,TEST,0,0 +8724,TEST,0,0 +8725,TEST,0,0 +8726,TEST,0,0 +8727,TEST,0,0 +8728,TEST,0,0 +8729,TEST,0,0 +8730,TEST,0,0 +8731,TEST,0,0 +8732,TEST,0,0 +8733,TEST,0,0 +8734,TEST,0,0 +8735,TEST,0,0 +8736,TEST,0,0 +8737,TEST,0,0 +8738,TEST,0,0 +8739,TEST,0,0 +8740,TEST,0,0 +8741,TEST,0,0 +8742,TEST,0,0 +8743,TEST,0,0 +8744,TEST,0,0 +8745,TEST,0,0 +8746,TEST,0,0 +8747,TEST,0,0 +8748,TEST,0,0 +8749,TEST,0,0 +8750,TEST,0,0 +8751,TEST,0,0 +8752,TEST,0,0 +8753,TEST,0,0 +8754,TEST,0,0 +8755,TEST,0,0 +8756,TEST,0,0 +8757,TEST,0,0 +8758,TEST,0,0 +8759,TEST,0,0 +8760,TEST,0,0 +8761,TEST,0,0 +8762,TEST,0,0 +8763,TEST,0,0 +8764,TEST,0,0 +8765,TEST,0,0 +8766,TEST,0,0 +8767,TEST,0,0 +8768,TEST,0,0 +8769,TEST,0,0 +8770,TEST,0,0 +8771,TEST,0,0 +8772,TEST,0,0 +8773,TEST,0,0 +8774,TEST,0,0 +8775,TEST,0,0 +8776,TEST,0,0 +8777,TEST,0,0 +8778,TEST,0,0 +8779,TEST,0,0 +8780,TEST,0,0 +8781,TEST,0,0 +8782,TEST,0,0 +8783,TEST,0,0 +8784,TEST,0,0 +8785,TEST,0,0 +8786,TEST,0,0 +8787,TEST,0,0 +8788,TEST,0,0 +8789,TEST,0,0 +8790,TEST,0,0 +8791,TEST,0,0 +8792,TEST,0,0 +8793,TEST,0,0 +8794,TEST,0,0 +8795,TEST,0,0 +8796,TEST,0,0 +8797,TEST,0,0 +8798,TEST,0,0 +8799,TEST,0,0 +8800,TEST,0,0 +8801,TEST,0,0 +8802,TEST,0,0 +8803,TEST,0,0 +8804,TEST,0,0 +8805,TEST,0,0 +8806,TEST,0,0 +8807,TEST,0,0 +8808,TEST,0,0 +8809,TEST,0,0 +8810,TEST,0,0 +8811,TEST,0,0 +8812,TEST,0,0 +8813,TEST,0,0 +8814,TEST,0,0 +8815,TEST,0,0 +8816,TEST,0,0 +8817,TEST,0,0 +8818,TEST,0,0 +8819,TEST,0,0 +8820,TEST,0,0 +8821,TEST,0,0 +8822,TEST,0,0 +8823,TEST,0,0 +8824,TEST,0,0 +8825,TEST,0,0 +8826,TEST,0,0 +8827,TEST,0,0 +8828,TEST,0,0 +8829,TEST,0,0 +8830,TEST,0,0 +8831,TEST,0,0 +8832,TEST,0,0 +8833,TEST,0,0 +8834,TEST,0,0 +8835,TEST,0,0 +8836,TEST,0,0 +8837,TEST,0,0 +8838,TEST,0,0 +8839,TEST,0,0 +8840,TEST,0,0 +8841,TEST,0,0 +8842,TEST,0,0 +8843,TEST,0,0 +8844,TEST,0,0 +8845,TEST,0,0 +8846,TEST,0,0 +8847,TEST,0,0 +8848,TEST,0,0 +8849,TEST,0,0 +8850,TEST,0,0 +8851,TEST,0,0 +8852,TEST,0,0 +8853,TEST,0,0 +8854,TEST,0,0 +8855,TEST,0,0 +8856,TEST,0,0 +8857,TEST,0,0 +8858,TEST,0,0 +8859,TEST,0,0 +8860,TEST,0,0 +8861,TEST,0,0 +8862,TEST,0,0 +8863,TEST,0,0 +8864,TEST,0,0 +8865,TEST,0,0 +8866,TEST,0,0 +8867,TEST,0,0 +8868,TEST,0,0 +8869,TEST,0,0 +8870,TEST,0,0 +8871,TEST,0,0 +8872,TEST,0,0 +8873,TEST,0,0 +8874,TEST,0,0 +8875,TEST,0,0 +8876,TEST,0,0 +8877,TEST,0,0 +8878,TEST,0,0 +8879,TEST,0,0 +8880,TEST,0,0 +8881,TEST,0,0 +8882,TEST,0,0 +8883,TEST,0,0 +8884,TEST,0,0 +8885,TEST,0,0 +8886,TEST,0,0 +8887,TEST,0,0 +8888,TEST,0,0 +8889,TEST,0,0 +8890,TEST,0,0 +8891,TEST,0,0 +8892,TEST,0,0 +8893,TEST,0,0 +8894,TEST,0,0 +8895,TEST,0,0 +8896,TEST,0,0 +8897,TEST,0,0 +8898,TEST,0,0 +8899,TEST,0,0 +8900,TEST,0,0 +8901,TEST,0,0 +8902,TEST,0,0 +8903,TEST,0,0 +8904,TEST,0,0 +8905,TEST,0,0 +8906,TEST,0,0 +8907,TEST,0,0 +8908,TEST,0,0 +8909,TEST,0,0 +8910,TEST,0,0 +8911,TEST,0,0 +8912,TEST,0,0 +8913,TEST,0,0 +8914,TEST,0,0 +8915,TEST,0,0 +8916,TEST,0,0 +8917,TEST,0,0 +8918,TEST,0,0 +8919,TEST,0,0 +8920,TEST,0,0 +8921,TEST,0,0 +8922,TEST,0,0 +8923,TEST,0,0 +8924,TEST,0,0 +8925,TEST,0,0 +8926,TEST,0,0 +8927,TEST,0,0 +8928,TEST,0,0 +8929,TEST,0,0 +8930,TEST,0,0 +8931,TEST,0,0 +8932,TEST,0,0 +8933,TEST,0,0 +8934,TEST,0,0 +8935,TEST,0,0 +8936,TEST,0,0 +8937,TEST,0,0 +8938,TEST,0,0 +8939,TEST,0,0 +8940,TEST,0,0 +8941,TEST,0,0 +8942,TEST,0,0 +8943,TEST,0,0 +8944,TEST,0,0 +8945,TEST,0,0 +8946,TEST,0,0 +8947,TEST,0,0 +8948,TEST,0,0 +8949,TEST,0,0 
+8950,TEST,0,0 +8951,TEST,0,0 +8952,TEST,0,0 +8953,TEST,0,0 +8954,TEST,0,0 +8955,TEST,0,0 +8956,TEST,0,0 +8957,TEST,0,0 +8958,TEST,0,0 +8959,TEST,0,0 +8960,TEST,0,0 +8961,TEST,0,0 +8962,TEST,0,0 +8963,TEST,0,0 +8964,TEST,0,0 +8965,TEST,0,0 +8966,TEST,0,0 +8967,TEST,0,0 +8968,TEST,0,0 +8969,TEST,0,0 +8970,TEST,0,0 +8971,TEST,0,0 +8972,TEST,0,0 +8973,TEST,0,0 +8974,TEST,0,0 +8975,TEST,0,0 +8976,TEST,0,0 +8977,TEST,0,0 +8978,TEST,0,0 +8979,TEST,0,0 +8980,TEST,0,0 +8981,TEST,0,0 +8982,TEST,0,0 +8983,TEST,0,0 +8984,TEST,0,0 +8985,TEST,0,0 +8986,TEST,0,0 +8987,TEST,0,0 +8988,TEST,0,0 +8989,TEST,0,0 +8990,TEST,0,0 +8991,TEST,0,0 +8992,TEST,0,0 +8993,TEST,0,0 +8994,TEST,0,0 +8995,TEST,0,0 +8996,TEST,0,0 +8997,TEST,0,0 +8998,TEST,0,0 +8999,TEST,0,0 +9000,TEST,0,0 +9001,TEST,0,0 +9002,TEST,0,0 +9003,TEST,0,0 +9004,TEST,0,0 +9005,TEST,0,0 +9006,TEST,0,0 +9007,TEST,0,0 +9008,TEST,0,0 +9009,TEST,0,0 +9010,TEST,0,0 +9011,TEST,0,0 +9012,TEST,0,0 +9013,TEST,0,0 +9014,TEST,0,0 +9015,TEST,0,0 +9016,TEST,0,0 +9017,TEST,0,0 +9018,TEST,0,0 +9019,TEST,0,0 +9020,TEST,0,0 +9021,TEST,0,0 +9022,TEST,0,0 +9023,TEST,0,0 +9024,TEST,0,0 +9025,TEST,0,0 +9026,TEST,0,0 +9027,TEST,0,0 +9028,TEST,0,0 +9029,TEST,0,0 +9030,TEST,0,0 +9031,TEST,0,0 +9032,TEST,0,0 +9033,TEST,0,0 +9034,TEST,0,0 +9035,TEST,0,0 +9036,TEST,0,0 +9037,TEST,0,0 +9038,TEST,0,0 +9039,TEST,0,0 +9040,TEST,0,0 +9041,TEST,0,0 +9042,TEST,0,0 +9043,TEST,0,0 +9044,TEST,0,0 +9045,TEST,0,0 +9046,TEST,0,0 +9047,TEST,0,0 +9048,TEST,0,0 +9049,TEST,0,0 +9050,TEST,0,0 +9051,TEST,0,0 +9052,TEST,0,0 +9053,TEST,0,0 +9054,TEST,0,0 +9055,TEST,0,0 +9056,TEST,0,0 +9057,TEST,0,0 +9058,TEST,0,0 +9059,TEST,0,0 +9060,TEST,0,0 +9061,TEST,0,0 +9062,TEST,0,0 +9063,TEST,0,0 +9064,TEST,0,0 +9065,TEST,0,0 +9066,TEST,0,0 +9067,TEST,0,0 +9068,TEST,0,0 +9069,TEST,0,0 +9070,TEST,0,0 +9071,TEST,0,0 +9072,TEST,0,0 +9073,TEST,0,0 +9074,TEST,0,0 +9075,TEST,0,0 +9076,TEST,0,0 +9077,TEST,0,0 +9078,TEST,0,0 +9079,TEST,0,0 +9080,TEST,0,0 +9081,TEST,0,0 +9082,TEST,0,0 +9083,TEST,0,0 +9084,TEST,0,0 +9085,TEST,0,0 +9086,TEST,0,0 +9087,TEST,0,0 +9088,TEST,0,0 +9089,TEST,0,0 +9090,TEST,0,0 +9091,TEST,0,0 +9092,TEST,0,0 +9093,TEST,0,0 +9094,TEST,0,0 +9095,TEST,0,0 +9096,TEST,0,0 +9097,TEST,0,0 +9098,TEST,0,0 +9099,TEST,0,0 +9100,TEST,0,0 +9101,TEST,0,0 +9102,TEST,0,0 +9103,TEST,0,0 +9104,TEST,0,0 +9105,TEST,0,0 +9106,TEST,0,0 +9107,TEST,0,0 +9108,TEST,0,0 +9109,TEST,0,0 +9110,TEST,0,0 +9111,TEST,0,0 +9112,TEST,0,0 +9113,TEST,0,0 +9114,TEST,0,0 +9115,TEST,0,0 +9116,TEST,0,0 +9117,TEST,0,0 +9118,TEST,0,0 +9119,TEST,0,0 +9120,TEST,0,0 +9121,TEST,0,0 +9122,TEST,0,0 +9123,TEST,0,0 +9124,TEST,0,0 +9125,TEST,0,0 +9126,TEST,0,0 +9127,TEST,0,0 +9128,TEST,0,0 +9129,TEST,0,0 +9130,TEST,0,0 +9131,TEST,0,0 +9132,TEST,0,0 +9133,TEST,0,0 +9134,TEST,0,0 +9135,TEST,0,0 +9136,TEST,0,0 +9137,TEST,0,0 +9138,TEST,0,0 +9139,TEST,0,0 +9140,TEST,0,0 +9141,TEST,0,0 +9142,TEST,0,0 +9143,TEST,0,0 +9144,TEST,0,0 +9145,TEST,0,0 +9146,TEST,0,0 +9147,TEST,0,0 +9148,TEST,0,0 +9149,TEST,0,0 +9150,TEST,0,0 +9151,TEST,0,0 +9152,TEST,0,0 +9153,TEST,0,0 +9154,TEST,0,0 +9155,TEST,0,0 +9156,TEST,0,0 +9157,TEST,0,0 +9158,TEST,0,0 +9159,TEST,0,0 +9160,TEST,0,0 +9161,TEST,0,0 +9162,TEST,0,0 +9163,TEST,0,0 +9164,TEST,0,0 +9165,TEST,0,0 +9166,TEST,0,0 +9167,TEST,0,0 +9168,TEST,0,0 +9169,TEST,0,0 +9170,TEST,0,0 +9171,TEST,0,0 +9172,TEST,0,0 +9173,TEST,0,0 +9174,TEST,0,0 +9175,TEST,0,0 +9176,TEST,0,0 +9177,TEST,0,0 +9178,TEST,0,0 +9179,TEST,0,0 +9180,TEST,0,0 +9181,TEST,0,0 +9182,TEST,0,0 +9183,TEST,0,0 +9184,TEST,0,0 +9185,TEST,0,0 +9186,TEST,0,0 
+9187,TEST,0,0 +9188,TEST,0,0 +9189,TEST,0,0 +9190,TEST,0,0 +9191,TEST,0,0 +9192,TEST,0,0 +9193,TEST,0,0 +9194,TEST,0,0 +9195,TEST,0,0 +9196,TEST,0,0 +9197,TEST,0,0 +9198,TEST,0,0 +9199,TEST,0,0 +9200,TEST,0,0 +9201,TEST,0,0 +9202,TEST,0,0 +9203,TEST,0,0 +9204,TEST,0,0 +9205,TEST,0,0 +9206,TEST,0,0 +9207,TEST,0,0 +9208,TEST,0,0 +9209,TEST,0,0 +9210,TEST,0,0 +9211,TEST,0,0 +9212,TEST,0,0 +9213,TEST,0,0 +9214,TEST,0,0 +9215,TEST,0,0 +9216,TEST,0,0 +9217,TEST,0,0 +9218,TEST,0,0 +9219,TEST,0,0 +9220,TEST,0,0 +9221,TEST,0,0 +9222,TEST,0,0 +9223,TEST,0,0 +9224,TEST,0,0 +9225,TEST,0,0 +9226,TEST,0,0 +9227,TEST,0,0 +9228,TEST,0,0 +9229,TEST,0,0 +9230,TEST,0,0 +9231,TEST,0,0 +9232,TEST,0,0 +9233,TEST,0,0 +9234,TEST,0,0 +9235,TEST,0,0 +9236,TEST,0,0 +9237,TEST,0,0 +9238,TEST,0,0 +9239,TEST,0,0 +9240,TEST,0,0 +9241,TEST,0,0 +9242,TEST,0,0 +9243,TEST,0,0 +9244,TEST,0,0 +9245,TEST,0,0 +9246,TEST,0,0 +9247,TEST,0,0 +9248,TEST,0,0 +9249,TEST,0,0 +9250,TEST,0,0 +9251,TEST,0,0 +9252,TEST,0,0 +9253,TEST,0,0 +9254,TEST,0,0 +9255,TEST,0,0 +9256,TEST,0,0 +9257,TEST,0,0 +9258,TEST,0,0 +9259,TEST,0,0 +9260,TEST,0,0 +9261,TEST,0,0 +9262,TEST,0,0 +9263,TEST,0,0 +9264,TEST,0,0 +9265,TEST,0,0 +9266,TEST,0,0 +9267,TEST,0,0 +9268,TEST,0,0 +9269,TEST,0,0 +9270,TEST,0,0 +9271,TEST,0,0 +9272,TEST,0,0 +9273,TEST,0,0 +9274,TEST,0,0 +9275,TEST,0,0 +9276,TEST,0,0 +9277,TEST,0,0 +9278,TEST,0,0 +9279,TEST,0,0 +9280,TEST,0,0 +9281,TEST,0,0 +9282,TEST,0,0 +9283,TEST,0,0 +9284,TEST,0,0 +9285,TEST,0,0 +9286,TEST,0,0 +9287,TEST,0,0 +9288,TEST,0,0 +9289,TEST,0,0 +9290,TEST,0,0 +9291,TEST,0,0 +9292,TEST,0,0 +9293,TEST,0,0 +9294,TEST,0,0 +9295,TEST,0,0 +9296,TEST,0,0 +9297,TEST,0,0 +9298,TEST,0,0 +9299,TEST,0,0 +9300,TEST,0,0 +9301,TEST,0,0 +9302,TEST,0,0 +9303,TEST,0,0 +9304,TEST,0,0 +9305,TEST,0,0 +9306,TEST,0,0 +9307,TEST,0,0 +9308,TEST,0,0 +9309,TEST,0,0 +9310,TEST,0,0 +9311,TEST,0,0 +9312,TEST,0,0 +9313,TEST,0,0 +9314,TEST,0,0 +9315,TEST,0,0 +9316,TEST,0,0 +9317,TEST,0,0 +9318,TEST,0,0 +9319,TEST,0,0 +9320,TEST,0,0 +9321,TEST,0,0 +9322,TEST,0,0 +9323,TEST,0,0 +9324,TEST,0,0 +9325,TEST,0,0 +9326,TEST,0,0 +9327,TEST,0,0 +9328,TEST,0,0 +9329,TEST,0,0 +9330,TEST,0,0 +9331,TEST,0,0 +9332,TEST,0,0 +9333,TEST,0,0 +9334,TEST,0,0 +9335,TEST,0,0 +9336,TEST,0,0 +9337,TEST,0,0 +9338,TEST,0,0 +9339,TEST,0,0 +9340,TEST,0,0 +9341,TEST,0,0 +9342,TEST,0,0 +9343,TEST,0,0 +9344,TEST,0,0 +9345,TEST,0,0 +9346,TEST,0,0 +9347,TEST,0,0 +9348,TEST,0,0 +9349,TEST,0,0 +9350,TEST,0,0 +9351,TEST,0,0 +9352,TEST,0,0 +9353,TEST,0,0 +9354,TEST,0,0 +9355,TEST,0,0 +9356,TEST,0,0 +9357,TEST,0,0 +9358,TEST,0,0 +9359,TEST,0,0 +9360,TEST,0,0 +9361,TEST,0,0 +9362,TEST,0,0 +9363,TEST,0,0 +9364,TEST,0,0 +9365,TEST,0,0 +9366,TEST,0,0 +9367,TEST,0,0 +9368,TEST,0,0 +9369,TEST,0,0 +9370,TEST,0,0 +9371,TEST,0,0 +9372,TEST,0,0 +9373,TEST,0,0 +9374,TEST,0,0 +9375,TEST,0,0 +9376,TEST,0,0 +9377,TEST,0,0 +9378,TEST,0,0 +9379,TEST,0,0 +9380,TEST,0,0 +9381,TEST,0,0 +9382,TEST,0,0 +9383,TEST,0,0 +9384,TEST,0,0 +9385,TEST,0,0 +9386,TEST,0,0 +9387,TEST,0,0 +9388,TEST,0,0 +9389,TEST,0,0 +9390,TEST,0,0 +9391,TEST,0,0 +9392,TEST,0,0 +9393,TEST,0,0 +9394,TEST,0,0 +9395,TEST,0,0 +9396,TEST,0,0 +9397,TEST,0,0 +9398,TEST,0,0 +9399,TEST,0,0 +9400,TEST,0,0 +9401,TEST,0,0 +9402,TEST,0,0 +9403,TEST,0,0 +9404,TEST,0,0 +9405,TEST,0,0 +9406,TEST,0,0 +9407,TEST,0,0 +9408,TEST,0,0 +9409,TEST,0,0 +9410,TEST,0,0 +9411,TEST,0,0 +9412,TEST,0,0 +9413,TEST,0,0 +9414,TEST,0,0 +9415,TEST,0,0 +9416,TEST,0,0 +9417,TEST,0,0 +9418,TEST,0,0 +9419,TEST,0,0 +9420,TEST,0,0 +9421,TEST,0,0 +9422,TEST,0,0 +9423,TEST,0,0 
+9424,TEST,0,0
+[rows 9425 through 13898 continue in the same pattern: `<d3mIndex>,TEST,0,0` — identical TEST-split rows with value 0 and ground truth 0, condensed here for readability]
+13899,TEST,0,0
+13900,TEST,0,0 +13901,TEST,0,0 +13902,TEST,0,0 +13903,TEST,0,0 +13904,TEST,0,0 +13905,TEST,0,0 +13906,TEST,0,0 +13907,TEST,0,0 +13908,TEST,0,0 +13909,TEST,0,0 +13910,TEST,0,0 +13911,TEST,0,0 +13912,TEST,0,0 +13913,TEST,0,0 +13914,TEST,0,0 +13915,TEST,0,0 +13916,TEST,0,0 +13917,TEST,0,0 +13918,TEST,0,0 +13919,TEST,0,0 +13920,TEST,0,0 +13921,TEST,0,0 +13922,TEST,0,0 +13923,TEST,0,0 +13924,TEST,0,0 +13925,TEST,0,0 +13926,TEST,0,0 +13927,TEST,0,0 +13928,TEST,0,0 +13929,TEST,0,0 +13930,TEST,0,0 +13931,TEST,0,0 +13932,TEST,0,0 +13933,TEST,0,0 +13934,TEST,0,0 +13935,TEST,0,0 +13936,TEST,0,0 +13937,TEST,0,0 +13938,TEST,0,0 +13939,TEST,0,0 +13940,TEST,0,0 +13941,TEST,0,0 +13942,TEST,0,0 +13943,TEST,0,0 +13944,TEST,0,0 +13945,TEST,0,0 +13946,TEST,0,0 +13947,TEST,0,0 +13948,TEST,0,0 +13949,TEST,0,0 +13950,TEST,0,0 +13951,TEST,0,0 +13952,TEST,0,0 +13953,TEST,0,0 +13954,TEST,0,0 +13955,TEST,0,0 +13956,TEST,0,0 +13957,TEST,0,0 +13958,TEST,0,0 +13959,TEST,0,0 +13960,TEST,0,0 +13961,TEST,0,0 +13962,TEST,0,0 +13963,TEST,0,0 +13964,TEST,0,0 +13965,TEST,0,0 +13966,TEST,0,0 +13967,TEST,0,0 +13968,TEST,0,0 +13969,TEST,0,0 +13970,TEST,0,0 +13971,TEST,0,0 +13972,TEST,0,0 +13973,TEST,0,0 +13974,TEST,0,0 +13975,TEST,0,0 +13976,TEST,0,0 +13977,TEST,0,0 +13978,TEST,0,0 +13979,TEST,0,0 +13980,TEST,0,0 +13981,TEST,0,0 +13982,TEST,0,0 +13983,TEST,0,0 +13984,TEST,0,0 +13985,TEST,0,0 +13986,TEST,0,0 +13987,TEST,0,0 +13988,TEST,0,0 +13989,TEST,0,0 +13990,TEST,0,0 +13991,TEST,0,0 +13992,TEST,0,0 +13993,TEST,0,0 +13994,TEST,0,0 +13995,TEST,0,0 +13996,TEST,0,0 +13997,TEST,0,0 +13998,TEST,0,0 +13999,TEST,0,0 +14000,TEST,0,0 +14001,TEST,0,0 +14002,TEST,0,0 +14003,TEST,0,0 +14004,TEST,0,0 +14005,TEST,0,0 +14006,TEST,0,0 +14007,TEST,0,0 +14008,TEST,0,0 +14009,TEST,0,0 +14010,TEST,0,0 +14011,TEST,0,0 +14012,TEST,0,0 +14013,TEST,0,0 +14014,TEST,0,0 +14015,TEST,0,0 +14016,TEST,0,0 +14017,TEST,0,0 +14018,TEST,0,0 +14019,TEST,0,0 +14020,TEST,0,0 +14021,TEST,0,0 +14022,TEST,0,0 +14023,TEST,0,0 +14024,TEST,0,0 +14025,TEST,0,0 +14026,TEST,0,0 +14027,TEST,0,0 +14028,TEST,0,0 +14029,TEST,0,0 +14030,TEST,0,0 +14031,TEST,0,0 +14032,TEST,0,0 +14033,TEST,0,0 +14034,TEST,0,0 +14035,TEST,0,0 +14036,TEST,0,0 +14037,TEST,0,0 +14038,TEST,0,0 +14039,TEST,0,0 +14040,TEST,0,0 +14041,TEST,0,0 +14042,TEST,0,0 +14043,TEST,0,0 +14044,TEST,0,0 +14045,TEST,0,0 +14046,TEST,0,0 +14047,TEST,0,0 +14048,TEST,0,0 +14049,TEST,0,0 +14050,TEST,0,0 +14051,TEST,0,0 +14052,TEST,0,0 +14053,TEST,0,0 diff --git a/datasets/anomaly/kpi/SCORE/problem_TEST/problemDoc.json b/datasets/anomaly/kpi/SCORE/problem_TEST/problemDoc.json new file mode 100644 index 0000000..1fd55ad --- /dev/null +++ b/datasets/anomaly/kpi/SCORE/problem_TEST/problemDoc.json @@ -0,0 +1,65 @@ +{ + "about": { + "problemID": "kpi_problem", + "problemName": "kpi_problem", + "problemDescription": "Anomaly detection", + "problemVersion": "4.0.0", + "problemSchemaVersion": "4.0.0", + "taskKeywords": [ + "classification", + "binary", + "tabular" + ] + }, + "inputs": { + "data": [ + { + "datasetID": "kpi_dataset", + "targets": [ + { + "targetIndex": 0, + "resID": "learningData", + "colIndex": 3, + "colName": "ground_truth" + } + ] + } + ], + "dataSplits": { + "method": "holdOut", + "testSize": 0.2, + "stratified": true, + "numRepeats": 0, + "randomSeed": 42, + "splitsFile": "dataSplits.csv", + "datasetViewMaps": { + "train": [ + { + "from": "kpi_dataset", + "to": "kpi_dataset_TRAIN" + } + ], + "test": [ + { + "from": "kpi_dataset", + "to": "kpi_dataset_TEST" + } + ], + "score": [ + { + "from": "kpi_dataset", + 
"to": "kpi_dataset_SCORE" + } + ] + } + }, + "performanceMetrics": [ + { + "metric": "f1Macro" + } + ] + }, + "expectedOutputs": { + "predictionsFile": "predictions.csv" + } +} \ No newline at end of file diff --git a/datasets/anomaly/kpi/SCORE/targets.csv b/datasets/anomaly/kpi/SCORE/targets.csv new file mode 100644 index 0000000..e69de29 diff --git a/datasets/anomaly/kpi/TEST/dataset_TEST/datasetDoc.json b/datasets/anomaly/kpi/TEST/dataset_TEST/datasetDoc.json new file mode 100644 index 0000000..2a04d60 --- /dev/null +++ b/datasets/anomaly/kpi/TEST/dataset_TEST/datasetDoc.json @@ -0,0 +1,63 @@ +{ + "about": { + "datasetID": "kpi_dataset_TEST", + "datasetName": "NULL", + "description": "Database of baseball players and play statistics, including 'Games_played', 'At_bats', 'Runs', 'Hits', 'Doubles', 'Triples', 'Home_runs', 'RBIs', 'Walks', 'Strikeouts', 'Batting_average', 'On_base_pct', 'Slugging_pct' and 'Fielding_ave'", + "citation": " @book{simonoff2003analyzing,title={Analyzing Categorical Data},author={Simonoff, J.S.},isbn={9780387007496},lccn={2003044946},series={Springer Texts in Statistics},url={https://books.google.com/books?id=G8wrifweAoC},year={2003},publisher={Springer New York}} ", + "license": " CC Public Domain Mark 1.0 ", + "source": "OpenML", + "sourceURI": "http://www.openml.org/d/185", + "approximateSize": "", + "datasetSchemaVersion": "4.0.0", + "redacted": false, + "datasetVersion": "4.0.0" + }, + "dataResources": [ + { + "resID": "learningData", + "resPath": "tables/learningData.csv", + "resType": "table", + "resFormat": { + "text/csv": [ + "csv" + ] + }, + "isCollection": false, + "columns": [ + { + "colIndex": 0, + "colName": "d3mIndex", + "colType": "integer", + "role": [ + "index" + ] + }, + { + "colIndex": 1, + "colName": "timestamp", + "colType": "integer", + "role": [ + "attribute" + ] + }, + { + "colIndex": 2, + "colName": "value", + "colType": "real", + "role": [ + "attribute" + ] + }, + { + "colIndex": 3, + "colName": "ground_truth", + "colType": "integer", + "role": [ + "suggestedTarget" + ] + } + ], + "columnsCount": 4 + } + ] +} \ No newline at end of file diff --git a/datasets/anomaly/kpi/TEST/dataset_TEST/tables/learningData.csv b/datasets/anomaly/kpi/TEST/dataset_TEST/tables/learningData.csv new file mode 100644 index 0000000..b9e432d --- /dev/null +++ b/datasets/anomaly/kpi/TEST/dataset_TEST/tables/learningData.csv @@ -0,0 +1,1758 @@ +d3mIndex,timestamp,value,ground_truth +7027,1475026500,0.32264705162415364,0 +7028,1475026800,0.32183430507799304,0 +7029,1475027100,0.31787914535951506,0 +7030,1475027400,0.3296732765365322,0 +7031,1475027700,0.33072178162272026,0 +7032,1475028000,0.3282773378117453,0 +7033,1475028300,0.3412378533449643,0 +7034,1475028600,0.3444485124115538,0 +7035,1475028900,0.34747304631385745,0 +7036,1475029200,0.34477423144747743,0 +7037,1475029500,0.34249419819706234,0 +7038,1475029800,0.3547319276800169,0 +7039,1475030100,0.3569188983482892,0 +7040,1475030400,0.3528241447571223,0 +7041,1475030700,0.3536617079911538,0 +7042,1475031000,0.3595928965267984,0 +7043,1475031300,0.3414456931108743,0 +7044,1475031600,0.3444702270131781,0 +7045,1475031900,0.3567327731850544,0 +7046,1475032200,0.344169324666176,0 +7047,1475032500,0.34747304631385745,0 +7048,1475032800,0.3413309159271072,0 +7049,1475033100,0.3411665053665474,0 +7050,1475033400,0.3484253867327069,0 +7051,1475033700,0.3466571976814503,0 +7052,1475034000,0.3524518944306527,0 +7053,1475034300,0.3450999504823503,0 +7054,1475034600,0.34230807303382743,0 
+7055,1475034900,0.32953368266384336,0 +7056,1475035200,0.3585940248174029,0 +7057,1475035500,0.3494738918188949,0 +7058,1475035800,0.3478918279308732,0 +7059,1475036100,0.3570584922209781,0 +7060,1475036400,0.3642925568982159,0 +7061,1475036700,0.3735522837694128,0 +7062,1475037000,0.371529723662751,0 +7063,1475037300,0.36375899809743295,0 +7064,1475037600,0.3717623801165319,0 +7065,1475037900,0.3745759721677299,0 +7066,1475038200,0.3771134785597968,0 +7067,1475038500,0.38916508287951657,0 +7068,1475038800,0.3930954259090729,0 +7069,1475039100,0.3960051826276095,0 +7070,1475039400,0.3930023633279808,0 +7071,1475039700,0.37371669432997257,0 +7072,1475040000,0.3825328228962828,0 +7073,1475040300,0.35663971060291155,0 +7074,1475040600,0.3567793044756004,0 +7075,1475040900,0.3473334524411687,0 +7076,1475041200,0.35570908478673724,0 +7077,1475041500,0.3453077902482603,0 +7078,1475041800,0.3417031662535769,0 +7079,1475042100,0.34623841606443456,0 +7080,1475042400,0.3279050874852757,0 +7081,1475042700,0.3283486857912131,0 +7082,1475043000,0.3181583331038419,0 +7083,1475043300,0.3134586727324244,0 +7084,1475043600,0.3296050306433111,0 +7085,1475043900,0.3221817387155411,0 +7086,1475044200,0.3210184564455859,0 +7087,1475044500,0.31680892567065194,0 +7088,1475044800,0.3279764354647434,0 +7089,1475045100,0.3037088162647441,0 +7090,1475045400,0.30889860623264503,0 +7091,1475045700,0.3124349843341073,0 +7092,1475046000,0.3008021616321388,0 +7093,1475046300,0.3049651611168421,0 +7094,1475046600,0.3101084197941969,0 +7095,1475046900,0.302173283668004,0 +7096,1475047200,0.3071304171824393,0 +7097,1475047500,0.3046177274782433,0 +7098,1475047800,0.3044998482082296,0 +7099,1475048100,0.296124215862766,0 +7100,1475048400,0.3014287830150645,0 +7101,1475048700,0.2965678141684933,0 +7102,1475049000,0.2985686596733205,0 +7103,1475049300,0.2996854106527297,0 +7104,1475049600,0.2962420951327797,0 +7105,1475049900,0.2976845651479024,0 +7106,1475050200,0.29400859317396144,0 +7107,1475050500,0.3005912197805077,0 +7108,1475050800,0.29345021768425683,0 +7109,1475051100,0.28656358664446197,0 +7110,1475051400,0.2895881205470809,0 +7111,1475051700,0.2858873318847437,0 +7112,1475052000,0.28960983514944083,0 +7113,1475052300,0.28656358664446197,0 +7114,1475052600,0.2957302509339364,0 +7115,1475052900,0.2834677047625855,0 +7116,1475053200,0.29375112003146897,0 +7117,1475053500,0.2945204373729098,0 +7118,1475053800,0.2802353310943717,0 +7119,1475054100,0.2943808435004311,0 +7120,1475054400,0.2876555209354749,0 +7121,1475054700,0.28837830698605443,0 +7122,1475055000,0.2714161004429708,0 +7123,1475055300,0.27325563747295945,0 +7124,1475055600,0.2763267026664397,0 +7125,1475055900,0.2695083175197609,0 +7126,1475056200,0.2664620690147821,0 +7127,1475056500,0.2637849687502365,0 +7128,1475056800,0.2611575018625886,0 +7129,1475057100,0.2521304314454878,0 +7130,1475057400,0.2520373688638704,0 +7131,1475057700,0.2764197652480572,0 +7132,1475058000,0.2607387202452576,0 +7133,1475058300,0.24971080432343465,0 +7134,1475058600,0.2539916830779415,0 +7135,1475058900,0.2424519229571701,0 +7136,1475059200,0.2554093364045628,0 +7137,1475059500,0.2629008742249235,0 +7138,1475059800,0.2538055579147066,0 +7139,1475060100,0.2649482510205069,0 +7140,1475060400,0.2632979412397895,0 +7141,1475060700,0.264926536418147,0 +7142,1475061000,0.2610644392809712,0 +7143,1475061300,0.25969021515906965,0 +7144,1475061600,0.27055682260605396,0 +7145,1475061900,0.2705102913151928,0 +7146,1475062200,0.27753651622746633,0 +7147,1475062500,0.2853754876857953,0 
+7148,1475062800,0.2817243324002506,0 +7149,1475063100,0.2922404041232303,0 +7150,1475063400,0.29065834023573395,0 +7151,1475063700,0.2966360600617144,0 +7152,1475064000,0.2832350483085944,0 +7153,1475064300,0.293518463577478,0 +7154,1475064600,0.2871436767365265,0 +7155,1475064900,0.29491440230173943,0 +7156,1475065200,0.2984042491124456,0 +7157,1475065500,0.3362590052292199,0 +7158,1475065800,0.31566735800232104,0 +7159,1475066100,0.2938907139039477,0 +7160,1475066400,0.3159465457476987,0 +7161,1475066700,0.30140706841270465,0 +7162,1475067000,0.2997536565458457,0 +7163,1475067300,0.2992200977446425,0 +7164,1475067600,0.2795125450437922,0 +7165,1475067900,0.2827945520889036,0 +7166,1475068200,0.2817243324002506,0 +7167,1475068500,0.2869358369709317,0 +7168,1475068800,0.29658952877085315,0 +7169,1475069100,0.2933106238117782,0 +7170,1475069400,0.2914959034702908,0 +7171,1475069700,0.2925195918680826,0 +7172,1475070000,0.30205850648402666,0 +7173,1475070300,0.2888901511848977,0 +7174,1475070600,0.2945886832660259,0 +7175,1475070900,0.30221981495876016,0 +7176,1475071200,0.2890297450573764,0 +7177,1475071500,0.2816995157118543,0 +7178,1475071800,0.2820252347475679,0 +7179,1475072100,0.2830023918544983,0 +7180,1475072400,0.2746050449064648,0 +7181,1475072700,0.2782344855895445,0 +7182,1475073000,0.281165956910546,0 +7183,1475073300,0.2609248454084925,0 +7184,1475073600,0.2648086571480282,0 +7185,1475073900,0.2579220261082334,0 +7186,1475074200,0.2676470658874123,0 +7187,1475074500,0.24947814786944364,0 +7188,1475074800,0.2447784874977109,0 +7189,1475075100,0.23386534875986525,0 +7190,1475075400,0.2357514170806101,0 +7191,1475075700,0.2283281251534706,0 +7192,1475076000,0.2276084411890325,0 +7193,1475076300,0.21660223986956945,0 +7194,1475076600,0.21753286568574376,0 +7195,1475076900,0.20918205002846632,0 +7196,1475077200,0.20020151090228985,0 +7197,1475077500,0.20303991964166346,0 +7198,1475077800,0.19931741637691366,0 +7199,1475078100,0.18635690084348466,0 +7200,1475078400,0.18859040280232395,0 +7201,1475078700,0.17681798622754055,0 +7202,1475079000,0.18230867854307395,0 +7203,1475079300,0.16653457095865698,0 +7204,1475079600,0.1629764782548348,0 +7205,1475079900,0.15660169141388328,0 +7206,1475080200,0.1467835890530869,0 +7207,1475080500,0.14766768357850502,0 +7208,1475080800,0.13910592606959646,0 +7209,1475081100,0.13633576322343344,0 +7210,1475081400,0.13194010728494932,0 +7211,1475081700,0.1272869782039728,0 +7212,1475082000,0.12249425525062264,0 +7213,1475082300,0.11211777740012167,0 +7214,1475082600,0.10723199186515407,0 +7215,1475082900,0.10087891962666752,0 +7216,1475083200,0.0976465459584538,0 +7217,1475083500,0.09383098011203414,0 +7218,1475083800,0.08482562429739833,0 +7219,1475084100,0.08080221868569913,0 +7220,1475084400,0.08082393328732347,0 +7221,1475084700,0.07554418282352629,0 +7222,1475085000,0.07328586417683709,0 +7223,1475085300,0.07188992545205025,0 +7224,1475085600,0.06672495217307105,0 +7225,1475085900,0.06828530145736683,0 +7226,1475086200,0.062422358815994186,0 +7227,1475086500,0.0631203281783876,0 +7228,1475086800,0.0577226984435259,0 +7229,1475087100,0.056280228429243837,0 +7230,1475087400,0.054626816561754436,0 +7231,1475087700,0.05669901004625956,0 +7232,1475088000,0.052579439766171,0 +7233,1475088300,0.057396979408653015,0 +7234,1475088600,0.05271903363885985,0 +7235,1475088900,0.051276563623527,0 +7236,1475089200,0.05116178643975985,0 +7237,1475089500,0.0513013803124487,0 +7238,1475089800,0.05243984589453297,0 +7239,1475090100,0.052067595568063264,0 
+7240,1475090400,0.05172016192946443,0 +7241,1475090700,0.047439283175062685,0 +7242,1475091000,0.047740185522064715,0 +7243,1475091300,0.046462126068342366,0 +7244,1475091600,0.04441474927275894,0 +7245,1475091900,0.0436237173292735,0 +7246,1475092200,0.04120409020616953,0 +7247,1475092500,0.04348412345658465,0 +7248,1475092800,0.04278615409419124,0 +7249,1475093100,0.04299399386010122,0 +7250,1475093400,0.04108621093615582,0 +7251,1475093700,0.04246043505826752,0 +7252,1475094000,0.0424139037677215,0 +7253,1475094300,0.04117927351829865,0 +7254,1475094600,0.04285439998741237,0 +7255,1475094900,0.04255349764041035,0 +7256,1475095200,0.041504992554222346,0 +7257,1475095500,0.041132742227752636,0 +7258,1475095800,0.04339106087444181,0 +7259,1475096100,0.040971433752388674,0 +7260,1475096400,0.03929630728327494,0 +7261,1475096700,0.04031999568159207,0 +7262,1475097000,0.04280786869686635,0 +7263,1475097300,0.04115755891562354,0 +7264,1475097600,0.04255349764041035,0 +7265,1475097900,0.04208818473179781,0 +7266,1475098200,0.0413188673909875,0 +7267,1475098500,0.0413188673909875,0 +7268,1475098800,0.04557803154271409,0 +7269,1475099100,0.04632253219565356,0 +7270,1475099400,0.04841644028178297,0 +7271,1475099700,0.04683437639481212,0 +7272,1475100000,0.05144097418513755,0 +7273,1475100300,0.05232506870971503,0 +7274,1475100600,0.05797706949998192,0 +7275,1475100900,0.056767255939480725,0 +7276,1475101200,0.06551513861151907,0 +7277,1475101500,0.06414401657565393,0 +7278,1475101800,0.0640974852851079,0 +7279,1475102100,0.0660052682080025,0 +7280,1475102400,0.07026133027453338,0 +7281,1475102700,0.07149596052395624,0 +7282,1475103000,0.07317108699306994,0 +7283,1475103300,0.07514711580918569,0 +7284,1475103600,0.07859043132850517,0 +7285,1475103900,0.08021902650707287,0 +7286,1475104200,0.08454643655307144,0 +7287,1475104500,0.0854305310776489,0 +7288,1475104800,0.08652556745543376,0 +7289,1475105100,0.08682646980243576,0 +7290,1475105400,0.08719872012890545,0 +7291,1475105700,0.0928072917138221,0 +7292,1475106000,0.0912252278268512,0 +7293,1475106300,0.09501597698466444,0 +7294,1475106600,0.10369561376369177,0 +7295,1475106900,0.10176301415208568,0 +7296,1475107200,0.10364908247283053,0 +7297,1475107500,0.1077438360641025,0 +7298,1475107800,0.11169899578289576,0 +7299,1475108100,0.1208408433838688,0 +7300,1475108400,0.11490965484874965,0 +7301,1475108700,0.11870040400666795,0 +7302,1475109000,0.11879346658828535,0 +7303,1475109300,0.1214922814551908,0 +7304,1475109600,0.12835719789262576,0 +7305,1475109900,0.12186453178166053,0 +7306,1475110200,0.12688991118910678,0 +7307,1475110500,0.13098466478037873,0 +7308,1475110800,0.14366599256895554,0 +7309,1475111100,0.13545166869801534,0 +7310,1475111400,0.1395464222892873,0 +7311,1475111700,0.14645787001747845,0 +7312,1475112000,0.14405995749778516,0 +7313,1475112300,0.15041302973627171,0 +7314,1475112600,0.15622944108741355,0 +7315,1475112900,0.15308702791478088,0 +7316,1475113200,0.15367022009288173,0 +7317,1475113500,0.1577401569957574,0 +7318,1475113800,0.16897901476921154,0 +7319,1475114100,0.16688510668276682,0 +7320,1475114400,0.1693977963864374,0 +7321,1475114700,0.168001857662176,0 +7322,1475115000,0.16997788647860698,0 +7323,1475115300,0.17419051933989266,0 +7324,1475115600,0.16960563615213722,0 +7325,1475115900,0.17532898492166168,0 +7326,1475116200,0.1670712318460017,0 +7327,1475116500,0.17877230044150655,0 +7328,1475116800,0.1746558322479798,0 +7329,1475117100,0.1758191145181451,0 +7330,1475117400,0.17467754685033965,0 
+7331,1475117700,0.1788188317323678,0 +7332,1475118000,0.1797494575485421,0 +7333,1475118300,0.18596293591465504,0 +7334,1475118600,0.1844739346086711,0 +7335,1475118900,0.17339948739609193,0 +7336,1475119200,0.17705064268163664,0 +7337,1475119500,0.17128076262125094,0 +7338,1475119800,0.16681375870350929,0 +7339,1475120100,0.1842164614662838,0 +7340,1475120400,0.1843095240479012,0 +7341,1475120700,0.18377596524659293,0 +7342,1475121000,0.19908475992285968,0 +7343,1475121300,0.1939415012453997,0 +7344,1475121600,0.2005737612287596,0 +7345,1475121900,0.21095023907930247,0 +7346,1475122200,0.22572237286818256,0 +7347,1475122500,0.2368216367692631,0 +7348,1475122800,0.2511067430476961,0 +7349,1475123100,0.2615762834798145,0 +7350,1475123400,0.28062929602320136,0 +7351,1475123700,0.277744355993061,0 +7352,1475124000,0.2713478545498547,0 +7353,1475124300,0.2875872750422537,0 +7354,1475124600,0.2715339797130896,0 +7355,1475124900,0.27039241204517905,0 +7356,1475125200,0.2593179648325999,0 +7357,1475125500,0.2555489302770415,0 +7358,1475125800,0.2493602685993249,0 +7359,1475126100,0.2434290800642057,0 +7360,1475126400,0.2381245129119071,0 +7361,1475126700,0.22241865122071133,0 +7362,1475127000,0.20887804559521767,0 +7363,1475127300,0.20415667062108311,0 +7364,1475127600,0.19252384791874685,0 +7365,1475127900,0.1901507520874604,0 +7366,1475128200,0.17954161778294736,0 +7367,1475128500,0.17546857879403527,0 +7368,1475128800,0.17046801607509038,0 +7369,1475129100,0.16383575609175155,0 +7370,1475129400,0.16162707082132954,0 +7371,1475129700,0.1565551601231271,0 +7372,1475130000,0.16313778672956827,0 +7373,1475130300,0.16139441436723342,0 +7374,1475130600,0.16497732375955698,0 +7375,1475130900,0.16320913470882584,0 +7376,1475131200,0.16688510668276682,0 +7377,1475131500,0.16078950758677268,0 +7378,1475131800,0.16348832245367814,0 +7379,1475132100,0.15636903495989227,0 +7380,1475132400,0.16825622871863194,0 +7381,1475132700,0.1679770409737797,0 +7382,1475133000,0.1589034392659227,0 +7383,1475133300,0.16420800641822128,0 +7384,1475133600,0.1661406060298274,0 +7385,1475133900,0.15827681788299702,0 +7386,1475134200,0.1581372240106234,0 +7387,1475134500,0.16104387864322867,0 +7388,1475134800,0.16095081606161127,0 +7389,1475135100,0.16290513027557724,0 +7390,1475135400,0.1705610786567078,0 +7391,1475135700,0.16592966417819624,0 +7392,1475136000,0.16188144187778555,0 +7393,1475136300,0.1679553263714198,0 +7394,1475136600,0.16597619546895245,0 +7395,1475136900,0.16923338582566752,0 +7396,1475137200,0.16869982702435926,0 +7397,1475137500,0.16816316613701454,0 +7398,1475137800,0.16895419808081524,0 +7399,1475138100,0.17228273641668282,0 +7400,1475138400,0.17402610877901767,0 +7401,1475138700,0.1787040545483905,0 +7402,1475139000,0.18149593199691336,0 +7403,1475139300,0.1832641210476445,0 +7404,1475139600,0.1798673368185558,0 +7405,1475139900,0.18086620852795124,0 +7406,1475140200,0.1858698733330376,0 +7407,1475140500,0.2018052893921881,0 +7408,1475140800,0.19847985314234112,0 +7409,1475141100,0.1994104789585312,0 +7410,1475141400,0.1929178128475975,0 +7411,1475141700,0.20757516945254226,0 +7412,1475142000,0.2017835747898072,0 +7413,1475142300,0.2094612377733186,0 +7414,1475142600,0.21634786881311344,0 +7415,1475142900,0.2105997033551927,0 +7416,1475143200,0.2147658049256172,0 +7417,1475143500,0.2158112079257688,0 +7418,1475143800,0.2109254223908012,0 +7419,1475144100,0.2267925925568356,0 +7420,1475144400,0.21751115108338387,0 +7421,1475144700,0.2202564972411505,0 +7422,1475145000,0.2234671563070044,0 
+7423,1475145300,0.21932587142497625,0 +7424,1475145600,0.22132671692969846,0 +7425,1475145900,0.2261876857762697,0 +7426,1475146200,0.23635632386117605,0 +7427,1475146500,0.2259798460106749,0 +7428,1475146800,0.2263738109395045,0 +7429,1475147100,0.2305864438007903,0 +7430,1475147400,0.2418004848858481,0 +7431,1475147700,0.23889072816720644,0 +7432,1475148000,0.2488732410888778,0 +7433,1475148300,0.2500830546499044,0 +7434,1475148600,0.25289664670089224,0 +7435,1475148900,0.2560390598735249,0 +7436,1475149200,0.2524561504812013,0 +7437,1475149500,0.2655996890918289,0 +7438,1475149800,0.2696944426829958,0 +7439,1475150100,0.2600159341946781,0 +7440,1475150400,0.2715805110038457,0 +7441,1475150700,0.2727437932741161,0 +7442,1475151000,0.27148744842222833,0 +7443,1475151300,0.2720458239119329,0 +7444,1475151600,0.2779552978446922,0 +7445,1475151900,0.2750455411261556,0 +7446,1475152200,0.27969867020702704,0 +7447,1475152500,0.27292991843735104,0 +7448,1475152800,0.2846309870328559,0 +7449,1475153100,0.2743723884523686,0 +7450,1475153400,0.2822113599108027,0 +7451,1475153700,0.29072658612885,0 +7452,1475154000,0.2876803376238712,0 +7453,1475154300,0.28414395952240884,0 +7454,1475154600,0.2844913931603772,0 +7455,1475154900,0.28647052406284457,0 +7456,1475155200,0.28076888989568005,0 +7457,1475155500,0.29070487152649016,0 +7458,1475155800,0.2940768390671825,0 +7459,1475156100,0.2925661231588387,0 +7460,1475156400,0.28363211532346044,0 +7461,1475156700,0.28833177569519314,0 +7462,1475157000,0.2873080872974014,0 +7463,1475157300,0.29170374323588555,0 +7464,1475157600,0.2816312698186332,0 +7465,1475157900,0.28635264479283085,0 +7466,1475158200,0.2927057170313174,0 +7467,1475158500,0.2786067359161193,0 +7468,1475158800,0.28986730829193325,0 +7469,1475159100,0.2874942124606363,0 +7470,1475159400,0.2751634203961693,0 +7471,1475159700,0.2706250684992752,0 +7472,1475160000,0.2682519726679782,0 +7473,1475160300,0.26411068778595004,0 +7474,1475160600,0.2754891394318829,0 +7475,1475160900,0.26818372677475705,0 +7476,1475161200,0.2478030214002248,0 +7477,1475161500,0.2479426152727035,0 +7478,1475161800,0.23232981616312515,0 +7479,1475162100,0.23754132073380624,0 +7480,1475162400,0.2335861610150129,0 +7481,1475162700,0.2379849190395335,0 +7482,1475163000,0.21734674052250888,0 +7483,1475163300,0.22041780571588404,0 +7484,1475163600,0.2128580220024073,0 +7485,1475163900,0.205645671926983,0 +7486,1475164200,0.19403456382701714,0 +7487,1475164500,0.18023958714502555,0 +7488,1475164800,0.18449564921113606,0 +7489,1475165100,0.17760901817134125,0 +7490,1475165400,0.1739578628857965,0 +7491,1475165700,0.15727484408767026,0 +7492,1475166000,0.15083181135360266,0 +7493,1475166300,0.15085352595596255,0 +7494,1475166600,0.1451549938748344,0 +7495,1475166900,0.1325201973770138,0 +7496,1475167200,0.12510000753601574,0 +7497,1475167500,0.1167709064810982,0 +7498,1475167800,0.11490965484874965,0 +7499,1475168100,0.11058224480348663,0 +7500,1475168400,0.10295111311075236,0 +7501,1475168700,0.0993682037184288,0 +7502,1475169000,0.0974138895043577,0 +7503,1475169300,0.08927091361235978,0 +7504,1475169600,0.08459296784361743,0 +7505,1475169900,0.08147537135917088,0 +7506,1475170200,0.07542630355351257,0 +7507,1475170500,0.07442743184411715,0 +7508,1475170800,0.07314627030414825,0 +7509,1475171100,0.06926245856513794,0 +7510,1475171400,0.06732985895332161,0 +7511,1475171700,0.061631326872508725,0 +7512,1475172000,0.06195704590738161,0 +7513,1475172300,0.0646775753774876,0 +7514,1475172600,0.06293420301515275,0 
+7515,1475172900,0.05776922973512274,0 +7516,1475173200,0.05662766206679187,0 +7517,1475173500,0.05367447614395582,0 +7518,1475173800,0.05248637718507899,0 +7519,1475174100,0.05309128396532956,0 +7520,1475174400,0.05386060130719069,0 +7521,1475174700,0.050743004822744096,0 +7522,1475175000,0.05230025202184412,0 +7523,1475175300,0.0541863203420636,0 +7524,1475175600,0.048649096736614654,0 +7525,1475175900,0.05009156675089673,0 +7526,1475176200,0.04841644028178297,0 +7527,1475176500,0.04608987574187265,0 +7528,1475176800,0.046716497124798376,0 +7529,1475177100,0.04401768225736754,0 +7530,1475177400,0.04359890064035179,0 +7531,1475177700,0.04294746256850437,0 +7532,1475178000,0.04264656022150239,0 +7533,1475178300,0.04101796504293469,0 +7534,1475178600,0.0405061208448269,0 +7535,1475178900,0.04243561837039664,0 +7536,1475179200,0.04190205956856294,0 +7537,1475179500,0.041458461262625534,0 +7538,1475179800,0.041551523844768366,0 +7539,1475180100,0.039901214063525516,0 +7540,1475180400,0.04190205956856294,0 +7541,1475180700,0.040692246008061775,0 +7542,1475181000,0.0389240569568052,0 +7543,1475181300,0.04139021536940436,0 +7544,1475181600,0.041715934405328094,0 +7545,1475181900,0.038179556303865776,0 +7546,1475182200,0.04010905382943551,0 +7547,1475182500,0.04080702319182894,0 +7548,1475182800,0.04115755891562354,0 +7549,1475183100,0.037016274033910605,0 +7550,1475183400,0.04010905382943551,0 +7551,1475183700,0.04122580480884467,0 +7552,1475184000,0.04038824157481321,0 +7553,1475184300,0.040155585119981525,0 +7554,1475184600,0.04083183987969982,0 +7555,1475184900,0.03987639737565465,0 +7556,1475185200,0.04227430989503268,0 +7557,1475185500,0.04478699959922865,0 +7558,1475185800,0.04536708969160833,0 +7559,1475186100,0.04664825123157725,0 +7560,1475186400,0.05146268878676186,0 +7561,1475186700,0.05274385032778155,0 +7562,1475187000,0.05834931982645161,0 +7563,1475187300,0.05932647693422274,0 +7564,1475187600,0.0621648856732916,0 +7565,1475187900,0.0668645460447091,0 +7566,1475188200,0.06679630015148794,0 +7567,1475188500,0.06863273509596564,0 +7568,1475188800,0.07035439285562536,0 +7569,1475189100,0.0710306476153437,0 +7570,1475189400,0.07372946248277452,0 +7571,1475189700,0.07898749834389658,0 +7572,1475190000,0.08243081386426683,0 +7573,1475190300,0.08659381334865487,0 +7574,1475190600,0.09250328728057353,0 +7575,1475190900,0.08682646980243576,0 +7576,1475191200,0.09178360331655576,0 +7577,1475191500,0.092968600189081,0 +7578,1475191800,0.08922438232181379,0 +7579,1475192100,0.09697029119873553,0 +7580,1475192400,0.09655150958140456,0 +7581,1475192700,0.1079764925180935,0 +7582,1475193000,0.10769730477324126,0 +7583,1475193300,0.10509155248795324,0 +7584,1475193600,0.1082773948654108,0 +7585,1475193900,0.1082773948654108,0 +7586,1475194200,0.11711834011917162,0 +7587,1475194500,0.11483830686949208,0 +7588,1475194800,0.1143047480681838,0 +7589,1475195100,0.1182133764962209,0 +7590,1475195400,0.12000328014931196,0 +7591,1475195700,0.12100525394474378,0 +7592,1475196000,0.12430897559221506,0 +7593,1475196300,0.1248673510819196,0 +7594,1475196600,0.13007885565260074,0 +7595,1475196900,0.1352438289324206,0 +7596,1475197200,0.1384762026006343,0 +7597,1475197500,0.1411284861767836,0 +7598,1475197800,0.14229176844705402,0 +7599,1475198100,0.14790034003260116,0 +7600,1475198400,0.14543418161968671,0 +7601,1475198700,0.14838736754304815,0 +7602,1475199000,0.15004077940980198,0 +7603,1475199300,0.1525069378227164,0 +7604,1475199600,0.15427512687344755,0 +7605,1475199900,0.16239328607662887,0 
+7606,1475200200,0.16502385505041825,0 +7607,1475200500,0.16672069612189186,0 +7608,1475200800,0.16267247382148114,0 +7609,1475201100,0.16946604227965853,0 +7610,1475201400,0.17847139809429438,0 +7611,1475201700,0.17644573590107082,0 +7612,1475202000,0.1721431425442041,0 +7613,1475202300,0.16921167122320258,0 +7614,1475202600,0.1674434821724714,0 +7615,1475202900,0.16509210094363938,0 +7616,1475203200,0.16818798282541084,0 +7617,1475203500,0.17802779978856711,0 +7618,1475203800,0.17521420773768434,0 +7619,1475204100,0.17979598883940334,0 +7620,1475204400,0.1797742742369384,0 +7621,1475204700,0.17847139809429438,0 +7622,1475205000,0.1763526733194534,0 +7623,1475205300,0.16628019990230608,0 +7624,1475205600,0.1747706094319571,0 +7625,1475205900,0.1673038883000978,0 +7626,1475206200,0.17037495349347295,0 +7627,1475206500,0.17526073902844053,0 +7628,1475206800,0.1769823967884155,0 +7629,1475207100,0.18035436432910792,0 +7630,1475207400,0.18947449732770005,0 +7631,1475207700,0.18742712053208507,0 +7632,1475208000,0.19871250959638787,0 +7633,1475208300,0.2065762977431678,0 +7634,1475208600,0.22039609111352407,0 +7635,1475208900,0.2313061277653334,0 +7636,1475209200,0.2403114835799692,0 +7637,1475209500,0.24731289180374136,0 +7638,1475209800,0.2546648357516233,0 +7639,1475210100,0.2574101819093901,0 +7640,1475210400,0.2671817529793252,0 +7641,1475210700,0.27814142300792705,0 +7642,1475211000,0.2727655078764761,0 +7643,1475211300,0.2663441897447683,0 +7644,1475211600,0.2507344927212264,0 +7645,1475211900,0.2422657977939353,0 +7646,1475212200,0.2380314503302897,0 +7647,1475212500,0.2371938870957329,0 +7648,1475212800,0.21858137077203685,0 +7649,1475213100,0.2115086145690072,0 +7650,1475213400,0.2101343904471056,0 +7651,1475213700,0.205971390962644,0 +7652,1475214000,0.1927565043727904,0 +7653,1475214300,0.18398380501218767,0 +7654,1475214600,0.1925703792095556,0 +7655,1475214900,0.17586564580900635,0 +7656,1475215200,0.17246886157991764,0 +7657,1475215500,0.17300242038122596,0 +7658,1475215800,0.1697700467130122,0 +7659,1475216100,0.16169531671455067,0 +7660,1475216400,0.16427935439747882,0 +7661,1475216700,0.16381404148939166,0 +7662,1475217000,0.1567164685978606,0 +7663,1475217300,0.15992712766371445,0 +7664,1475217600,0.1548800336540134,0 +7665,1475217900,0.1579045675565273,0 +7666,1475218200,0.15499481083799072,0 +7667,1475218500,0.16190625856618182,0 +7668,1475218800,0.15862425152107046,0 +7669,1475219100,0.16262594253072496,0 +7670,1475219400,0.1547621543838946,0 +7671,1475219700,0.15848465764869685,0 +7672,1475220000,0.16239328607662887,0 +7673,1475220300,0.15753231723005756,0 +7674,1475220600,0.15278612556756868,0 +7675,1475220900,0.15643728085300831,0 +7676,1475221200,0.14631827614499976,0 +7677,1475221500,0.16069644500515526,0 +7678,1475221800,0.15457602922065972,0 +7679,1475222100,0.15888172466356282,0 +7680,1475222400,0.1556462489093127,0 +7681,1475222700,0.15962622531650228,0 +7682,1475223000,0.16155572284207198,0 +7683,1475223300,0.1650703863411744,0 +7684,1475223600,0.16590794957583635,0 +7685,1475223900,0.1687463583151154,0 +7686,1475224200,0.1584164117554757,0 +7687,1475224500,0.17023535962109934,0 +7688,1475224800,0.16169531671455067,0 +7689,1475225100,0.17416570265139128,0 +7690,1475225400,0.17053626196831154,0 +7691,1475225700,0.17721505324251166,0 +7692,1475226000,0.1789584256047414,0 +7693,1475226300,0.18075143134397392,0 +7694,1475226600,0.17847139809429438,0 +7695,1475226900,0.1831462417776308,0 +7696,1475227200,0.1974561647445347,0 +7697,1475227500,0.18500749340997927,0 
+7698,1475227800,0.19433856826030774,0 +7699,1475228100,0.19650072223991047,0 +7700,1475228400,0.188987469817232,0 +7701,1475228700,0.19617500320424952,0 +7702,1475229000,0.2046685148199895,0 +7703,1475229300,0.20329429069808794,0 +7704,1475229600,0.1920585350106492,0 +7705,1475229900,0.20122519930009208,0 +7706,1475230200,0.2010390741368572,0 +7707,1475230500,0.20613269943745105,0 +7708,1475230800,0.20613269943745105,0 +7709,1475231100,0.2081118303398764,0 +7710,1475231400,0.21327680361973828,0 +7711,1475231700,0.20487635458560524,0 +7712,1475232000,0.20620404741669807,0 +7713,1475232300,0.21858137077203685,0 +7714,1475232600,0.21692795890517788,0 +7715,1475232900,0.21699930688443544,0 +7716,1475233200,0.21788340140985354,0 +7717,1475233500,0.2159973330890036,0 +7718,1475233800,0.2291439737856676,0 +7719,1475234100,0.22923703636728496,0 +7720,1475234400,0.22260477638394616,0 +7721,1475234700,0.22681740924523186,0 +7722,1475235000,0.22253653049083008,0 +7723,1475235300,0.22877172345919786,0 +7724,1475235600,0.2339584113414827,0 +7725,1475235900,0.2378670397694147,0 +7726,1475236200,0.2493137373085687,0 +7727,1475236500,0.24619614082433225,0 +7728,1475236800,0.25831909312320445,0 +7729,1475237100,0.25375902662384536,0 +7730,1475237400,0.2555954615677977,0 +7731,1475237700,0.26911435259093136,0 +7732,1475238000,0.2707181310808926,0 +7733,1475238300,0.2657640996527039,0 +7734,1475238600,0.2643216296375812,0 +7735,1475238900,0.26597193941829866,0 +7736,1475239200,0.25790031150587345,0 +7737,1475239500,0.26818372677475705,0 +7738,1475239800,0.2628791596224585,0 +7739,1475240100,0.25515496534810683,0 +7740,1475240400,0.2565509040724733,0 +7741,1475240700,0.26615806458153346,0 +7742,1475241000,0.2681123787954995,0 +7743,1475241300,0.27895416955408764,0 +7744,1475241600,0.2816995157118543,0 +7745,1475241900,0.27118344398897976,0 +7746,1475242200,0.27434757176397234,0 +7747,1475242500,0.2703706974428192,0 +7748,1475242800,0.2749772952329345,0 +7749,1475243100,0.2694152549381435,0 +7750,1475243400,0.27248632013162377,0 +7751,1475243700,0.26908953590253504,0 +7752,1475244000,0.2738822588558852,0 +7753,1475244300,0.2652057241629993,0 +7754,1475244600,0.2707181310808926,0 +7755,1475244900,0.2743723884523686,0 +7756,1475245200,0.26841638322885314,0 +7757,1475245500,0.25808643666910835,0 +7758,1475245800,0.2680658475047433,0 +7759,1475246100,0.2583873390163205,0 +7760,1475246400,0.2562251850367598,0 +7761,1475246700,0.25375902662384536,0 +7762,1475247000,0.2494316165785824,0 +7763,1475247300,0.2524096191904452,0 +7764,1475247600,0.2515472392673869,0 +7765,1475247900,0.24626748880358976,0 +7766,1475248200,0.23751960613144635,0 +7767,1475248500,0.2333317899585569,0 +7768,1475248800,0.220185149261893,0 +7769,1475249100,0.21776552213983985,0 +7770,1475249400,0.20429626449350924,0 +7771,1475249700,0.20143303906570784,0 +7772,1475250000,0.1998975064689993,0 +7773,1475250300,0.19215159759226666,0 +7774,1475250600,0.1914753428325063,0 +7775,1475250900,0.1825661516855664,0 +7776,1475251200,0.17537551621241784,0 +7777,1475251500,0.1741439880490314,0 +7778,1475251800,0.16150919155131582,0 +7779,1475252100,0.15555318632769527,0 +7780,1475252400,0.14405995749778516,0 +7781,1475252700,0.14264230417105875,0 +7782,1475253000,0.12889075669393404,0 +7783,1475253300,0.1319618218873092,0 +7784,1475253600,0.12316740792430453,0 +7785,1475253900,0.12081912878150892,0 +7786,1475254200,0.11986368627693834,0 +7787,1475254500,0.11253655901745263,0 +7788,1475254800,0.10185607673370307,0 +7789,1475255100,0.10059973188181526,0 
+7790,1475255400,0.0938526947144991,0 +7791,1475255700,0.0920845056635578,0 +7792,1475256000,0.08059127683354259,0 +7793,1475256300,0.07779939938501973,0 +7794,1475256600,0.07600949573213882,0 +7795,1475256900,0.06965642349323196,0 +7796,1475257200,0.06805264500358592,0 +7797,1475257500,0.06900498542243537,0 +7798,1475257800,0.06928417316676225,0 +7799,1475258100,0.06405095399456187,0 +7800,1475258400,0.06004926298448705,0 +7801,1475258700,0.061746104056275876,0 +7802,1475259000,0.058653324259700185,0 +7803,1475259300,0.05779094433674705,0 +7804,1475259600,0.05497735228659987,0 +7805,1475259900,0.05569703625061759,0 +7806,1475260200,0.05397537849095788,0 +7807,1475260500,0.04809072124691008,0 +7808,1475260800,0.05055687965950928,0 +7809,1475261100,0.04729968930237384,0 +7810,1475261400,0.04815896714013123,0 +7811,1475261700,0.05015981264411786,0 +7812,1475262000,0.04534537508893322,0 +7813,1475262300,0.04387808838467869,0 +7814,1475262600,0.04699568486912525,0 +7815,1475262900,0.04557803154271409,0 +7816,1475263200,0.045878933889716124,0 +7817,1475263500,0.04443646387543407,0 +7818,1475263800,0.04359890064035179,0 +7819,1475264100,0.0424139037677215,0 +7820,1475264400,0.04227430989503268,0 +7821,1475264700,0.04117927351829865,0 +7822,1475265000,0.04348412345658465,0 +7823,1475265300,0.0400873392267604,0 +7824,1475265600,0.043459306767662936,0 +7825,1475265900,0.04178418029854925,0 +7826,1475266200,0.04004080793621436,0 +7827,1475266500,0.03957549502865262,0 +7828,1475266800,0.0395289637370558,0 +7829,1475267100,0.042202961915565,0 +7830,1475267400,0.0382260875944118,0 +7831,1475267700,0.0388527089773375,0 +7832,1475268000,0.039783334793511815,0 +7833,1475268300,0.03924977599272892,0 +7834,1475268600,0.03913189672271518,0 +7835,1475268900,0.038340864779229766,0 +7836,1475269200,0.03969027221241976,0 +7837,1475269500,0.03924977599272892,0 +7838,1475269800,0.03969027221241976,0 +7839,1475270100,0.03929630728327494,0 +7840,1475270400,0.038179556303865776,0 +7841,1475270700,0.03827261888600862,0 +7842,1475271000,0.040133870518357186,0 +7843,1475271300,0.04287921667633407,0 +7844,1475271600,0.041504992554222346,0 +7845,1475271900,0.04348412345658465,0 +7846,1475272200,0.046276000905107535,0 +7847,1475272500,0.04339106087444181,0 +7848,1475272800,0.051906287092699295,0 +7849,1475273100,0.05334875710803215,0 +7850,1475273400,0.05697819779058647,0 +7851,1475273700,0.05969872726069245,0 +7852,1475274000,0.061026420091207324,0 +7853,1475274300,0.062096639780070476,0 +7854,1475274600,0.06393307472454816,0 +7855,1475274900,0.06486370054072246,0 +7856,1475275200,0.0694237670394511,0 +7857,1475275500,0.07098411632479767,0 +7858,1475275800,0.07351852063061798,0 +7859,1475276100,0.07542630355351257,0 +7860,1475276400,0.07989330747114919,0 +7861,1475276700,0.07791727865503344,0 +7862,1475277000,0.08215162611888915,0 +7863,1475277300,0.08252387644535887,0 +7864,1475277600,0.0872452514194515,0 +7865,1475277900,0.08471084711363118,0 +7866,1475278200,0.0895966326482835,0 +7867,1475278500,0.09175878662763408,0 +7868,1475278800,0.09513385625478324,0 +7869,1475279100,0.09994829381049326,0 +7870,1475279400,0.10325201545796454,0 +7871,1475279700,0.10981292746204584,0 +7872,1475280000,0.10332336343722208,0 +7873,1475280300,0.10737158573763278,0 +7874,1475280600,0.10809126970207084,0 +7875,1475280900,0.10655573710543582,0 +7876,1475281200,0.11616599970053235,0 +7877,1475281500,0.11725793399154524,0 +7878,1475281800,0.1149561861395058,0 +7879,1475282100,0.11574721808330647,0 +7880,1475282400,0.11965584651123852,0 
+7881,1475282700,0.12440203817383247,0 +7882,1475283000,0.12926300702040378,0 +7883,1475283300,0.129706605326131,0 +7884,1475283600,0.14210564328381914,0 +7885,1475283900,0.14026920833986686,0 +7886,1475284200,0.14934281004761873,0 +7887,1475284500,0.1468766516347043,0 +7888,1475284800,0.14971506037408844,0 +7889,1475285100,0.14850524681306188,0 +7890,1475285400,0.15548494043447414,0 +7891,1475285700,0.15899650184754016,0 +7892,1475286000,0.1566947539955007,0 +7893,1475286300,0.15543840914371795,0 +7894,1475286600,0.1652565115044093,0 +7895,1475286900,0.16409322923424394,0 +7896,1475287200,0.16576835570335768,0 +7897,1475287500,0.1690255460599677,0 +7898,1475287800,0.16972351542215094,0 +7899,1475288100,0.16579007030571755,0 +7900,1475288400,0.17028189091185553,0 +7901,1475288700,0.17828527293105953,0 +7902,1475289000,0.1741439880490314,0 +7903,1475289300,0.16686028999437053,0 +7904,1475289600,0.16727907161159644,0 +7905,1475289900,0.1813098068336785,0 +7906,1475290200,0.16688510668276682,0 +7907,1475290500,0.17458448426872225,0 +7908,1475290800,0.16790879508055853,0 +7909,1475291100,0.17365385845254794,0 +7910,1475291400,0.1763309587170935,0 +7911,1475291700,0.17095504358553745,0 +7912,1475292000,0.17125904801889108,0 +7913,1475292300,0.16432588568823495,0 +7914,1475292600,0.16304472414795085,0 +7915,1475292900,0.16041725726030295,0 +7916,1475293200,0.16279035309159995,0 +7917,1475293500,0.16988482389698953,0 +7918,1475293800,0.17272323263637365,0 +7919,1475294100,0.18377596524659293,0 +7920,1475294400,0.2057387345086004,0 +7921,1475294700,0.2024101961727013,0 +7922,1475295000,0.21739327181337006,0 +7923,1475295300,0.22681740924523186,0 +7924,1475295600,0.2403331981823291,0 +7925,1475295900,0.2480573924566808,0 +7926,1475296200,0.2518729583031005,0 +7927,1475296500,0.2597367464498258,0 +7928,1475296800,0.26169106066379183,0 +7929,1475297100,0.26206331099026153,0 +7930,1475297400,0.2561786537460036,0 +7931,1475297700,0.257828963526616,0 +7932,1475298000,0.2540382143686977,0 +7933,1475298300,0.2381710442027684,0 +7934,1475298600,0.2365424490244108,0 +7935,1475298900,0.2220464008942416,0 +7936,1475299200,0.21478751952797706,0 +7937,1475299500,0.21115807884489726,0 +7938,1475299800,0.1968977892548228,0 +7939,1475300100,0.19999056905061666,0 +7940,1475300400,0.17563298935491026,0 +7941,1475300700,0.1812384588544209,0 +7942,1475301000,0.17479542612035334,0 +7943,1475301300,0.16502385505041825,0 +7944,1475301600,0.161301351785616,0 +7945,1475301900,0.16109040993398485,0 +7946,1475302200,0.1569956563427129,0 +7947,1475302500,0.1618131959845644,0 +7948,1475302800,0.15690259376109547,0 +7949,1475303100,0.16243981736749016,0 +7950,1475303400,0.15720659819444913,0 +7951,1475303700,0.16586141828497508,0 +7952,1475304000,0.16274382180073868,0 +7953,1475304300,0.15555318632769527,0 +7954,1475304600,0.1553918778528567,0 +7955,1475304900,0.16234675478587268,0 +7956,1475305200,0.1593005062808938,0 +7957,1475305500,0.15997365895457571,0 +7958,1475305800,0.15415724760343386,0 +7959,1475306100,0.15706700432197046,0 +7960,1475306400,0.16793050968291842,0 +7961,1475306700,0.1586707828119317,0 +7962,1475307000,0.16281206769395984,0 +7963,1475307300,0.15150496402728456,0 +7964,1475307600,0.15345927824125058,0 +7965,1475307900,0.15567106559770902,0 +7966,1475308200,0.14796858592571718,0 +7967,1475308500,0.15429684147580744,0 +7968,1475308800,0.15390287654697785,0 +7969,1475309100,0.15743925464844016,0 +7970,1475309400,0.15792628215899224,0 +7971,1475309700,0.16139441436723342,0 
+7972,1475310000,0.15888172466356282,0 +7973,1475310300,0.1562759723782748,0 +7974,1475310600,0.17025707422345926,0 +7975,1475310900,0.16346350576528185,0 +7976,1475311200,0.1683492913002494,0 +7977,1475311500,0.1675830760449501,0 +7978,1475311800,0.18107715037968747,0 +7979,1475312100,0.1748884887019708,0 +7980,1475312400,0.1825196203947051,0 +7981,1475312700,0.17905148818635885,0 +7982,1475313000,0.18240174112469126,0 +7983,1475313300,0.18789243344018275,0 +7984,1475313600,0.1883825630367082,0 +7985,1475313900,0.19768882119858144,0 +7986,1475314200,0.1948721270615845,0 +7987,1475314500,0.1946860018983496,0 +7988,1475314800,0.19417415769944327,0 +7989,1475315100,0.20108560542766601,0 +7990,1475315400,0.20622576201906848,0 +7991,1475315700,0.2017587581013688,0 +7992,1475316000,0.2038774828762308,0 +7993,1475316300,0.20934335850330488,0 +7994,1475316600,0.20485463998322434,0 +7995,1475316900,0.2118343336046156,0 +7996,1475317200,0.219930778205437,0 +7997,1475317500,0.2248165637404045,0 +7998,1475317800,0.21397477298181647,0 +7999,1475318100,0.2097156088297746,0 +8000,1475318400,0.2229304954196597,0 +8001,1475318700,0.2194189340065937,0 +8002,1475319000,0.22072181014923767,0 +8003,1475319300,0.2256075956842053,0 +8004,1475319600,0.22253653049083008,0 +8005,1475319900,0.22483827834286954,0 +8006,1475320200,0.2247452157612521,0 +8007,1475320500,0.22488480963362573,0 +8008,1475320800,0.2274223160257977,0 +8009,1475321100,0.2296310012962197,0 +8010,1475321400,0.23975310809026465,0 +8011,1475321700,0.23612366740707985,0 +8012,1475322000,0.2513145828133959,0 +8013,1475322300,0.24161435972261325,0 +8014,1475322600,0.2467793330024331,0 +8015,1475322900,0.2492672060177074,0 +8016,1475323200,0.2460100156610973,0 +8017,1475323500,0.2464753285691845,0 +8018,1475323800,0.2538272725170665,0 +8019,1475324100,0.2534550221905968,0 +8020,1475324400,0.26064565766364023,0 +8021,1475324700,0.2619950650971455,0 +8022,1475325000,0.2581794992507257,0 +8023,1475325300,0.265180907474498,0 +8024,1475325600,0.2650661302905206,0 +8025,1475325900,0.2565043727816121,0 +8026,1475326200,0.2698123219531146,0 +8027,1475326500,0.26776494515753113,0 +8028,1475326800,0.2658571622343213,0 +8029,1475327100,0.2705102913151928,0 +8030,1475327400,0.26601847070915985,0 +8031,1475327700,0.2646225319847933,0 +8032,1475328000,0.27118344398897976,0 +8033,1475328300,0.2724180742384026,0 +8034,1475328600,0.2671817529793252,0 +8035,1475328900,0.2662511271631509,0 +8036,1475329200,0.27830273148276563,0 +8037,1475329500,0.26643725232638577,0 +8038,1475329800,0.2686490396828442,0 +8039,1475330100,0.2789324549517277,0 +8040,1475330400,0.27211406980515396,0 +8041,1475330700,0.25757459247026504,0 +8042,1475331000,0.2658571622343213,0 +8043,1475331300,0.2618306545362705,0 +8044,1475331600,0.2555954615677977,0 +8045,1475331900,0.2600159341946781,0 +8046,1475332200,0.2541995228435362,0 +8047,1475332500,0.2596436838682084,0 +8048,1475332800,0.2443131745895187,0 +8049,1475333100,0.2586913434496742,0 +8050,1475333400,0.2416826056158344,0 +8051,1475333700,0.2493385539969649,0 +8052,1475334000,0.24147476585023964,0 +8053,1475334300,0.23323872737693954,0 +8054,1475334600,0.2253501225417129,0 +8055,1475334900,0.22223252605747645,0 +8056,1475335200,0.2181843037570657,0 +8057,1475335500,0.2226978389655636,0 +8058,1475335800,0.2092037646308262,0 +8059,1475336100,0.20797223646745025,0 +8060,1475336400,0.19617500320424952,0 +8061,1475336700,0.18796378141941927,0 +8062,1475337000,0.17798126849781093,0 +8063,1475337300,0.1806583687623565,0 
+8064,1475337600,0.1725619241615351,0 +8065,1475337900,0.17367867514094426,0 +8066,1475338200,0.16292994696397356,0 +8067,1475338500,0.15922915830163625,0 +8068,1475338800,0.1495289352108536,0 +8069,1475339100,0.15108618240995358,0 +8070,1475339400,0.13836142541665702,0 +8071,1475339700,0.14219870586543654,0 +8072,1475340000,0.12626328980618107,0 +8073,1475340300,0.1246346946279286,0 +8074,1475340600,0.12184281717930065,0 +8075,1475340900,0.12409803374058392,0 +8076,1475341200,0.11860734142505053,0 +8077,1475341500,0.10323030085560464,0 +8078,1475341800,0.10592911572251007,0 +8079,1475342100,0.09080644620983548,0 +8080,1475342400,0.08519787462386805,0 +8081,1475342700,0.08652556745543376,0 +8082,1475343000,0.07703318412940516,0 +8083,1475343300,0.07761327422178488,0 +8084,1475343600,0.07095929963587598,0 +8085,1475343900,0.06609833079014532,0 +8086,1475344200,0.06621310797391249,0 +8087,1475344500,0.06395789141241906,0 +8088,1475344800,0.060979888800661325,0 +8089,1475345100,0.05797706949998192,0 +8090,1475345400,0.05588316141385245,0 +8091,1475345700,0.057837475628343876,0 +8092,1475346000,0.05583663012330643,0 +8093,1475346300,0.05325569452588934,0 +8094,1475346600,0.05406844107204989,0 +8095,1475346900,0.05078953611329014,0 +8096,1475347200,0.0493935973895541,0 +8097,1475347500,0.04846297157337981,0 +8098,1475347800,0.05109043846029216,0 +8099,1475348100,0.04648384067101752,0 +8100,1475348400,0.04680955970589041,0 +8101,1475348700,0.04855603415447182,0 +8102,1475349000,0.0480659045579884,0 +8103,1475349300,0.0459502818691838,0 +8104,1475349600,0.04711356413913901,0 +8105,1475349900,0.04466912032921496,0 +8106,1475350200,0.043785025803586654,0 +8107,1475350500,0.045180964528373516,0 +8108,1475350800,0.042600028930956366,0 +8109,1475351100,0.044157276130056385,0 +8110,1475351400,0.043040525150647206,0 +8111,1475351700,0.04418209281897807,0 +8112,1475352000,0.044250338711148425,0 +8113,1475352300,0.043018810547972096,0 +8114,1475352600,0.042249493207161766,0 +8115,1475352900,0.043530654747130665,0 +8116,1475353200,0.04162287182423609,0 +8117,1475353500,0.040133870518357186,0 +8118,1475353800,0.04048130415590522,0 +8119,1475354100,0.04297227925742608,0 +8120,1475354400,0.04004080793621436,0 +8121,1475354700,0.040971433752388674,0 +8122,1475355000,0.04315840442066095,0 +8123,1475355300,0.03992292866620067,0 +8124,1475355600,0.03976162019188748,0 +8125,1475355900,0.041504992554222346,0 +8126,1475356200,0.03889924026893432,0 +8127,1475356500,0.042342555788253806,0 +8128,1475356800,0.04250696634986431,0 +8129,1475357100,0.041064496334531485,0 +8130,1475357400,0.039783334793511815,0 +8131,1475357700,0.0424139037677215,0 +8132,1475358000,0.04273962280364522,0 +8133,1475358300,0.042600028930956366,0 +8134,1475358600,0.044203807420602405,0 +8135,1475358900,0.04515924992569837,0 +8136,1475359200,0.05181322451160726,0 +8137,1475359500,0.05164881394999672,0 +8138,1475359800,0.05351006558234531,0 +8139,1475360100,0.05765135046510901,0 +8140,1475360400,0.06367870366809217,0 +8141,1475360700,0.06321339075947963,0 +8142,1475361000,0.06670323757039591,0 +8143,1475361300,0.06698242531472283,0 +8144,1475361600,0.06486370054072246,0 +8145,1475361900,0.07121677277857852,0 +8146,1475362200,0.07184339416150426,0 +8147,1475362500,0.07600949573213882,0 +8148,1475362800,0.08238428257267004,0 +8149,1475363100,0.08119618361379316,0 +8150,1475363400,0.08436031138983659,0 +8151,1475363700,0.08689781778190346,0 +8152,1475364000,0.09006194555689602,0 +8153,1475364300,0.08680475519976064,0 
+8154,1475364600,0.09048072717391176,0 +8155,1475364900,0.09059550435767892,0 +8156,1475365200,0.08822240852617177,0 +8157,1475365500,0.0978078544332924,0 +8158,1475365800,0.10206701858533423,0 +8159,1475366100,0.1094654938239724,0 +8160,1475366400,0.1019956706060767,0 +8161,1475366700,0.11239696514497396,0 +8162,1475367000,0.11909747102153395,0 +8163,1475367300,0.11595505784890126,0 +8164,1475367600,0.11348889943598682,0 +8165,1475367900,0.1190261230422764,0 +8166,1475368200,0.12342488106679694,0 +8167,1475368500,0.12423762761295752,0 +8168,1475368800,0.12395843986810524,0 +8169,1475369100,0.1252861326992506,0 +8170,1475369400,0.1326132599586312,0 +8171,1475369700,0.12952048016289616,0 +8172,1475370000,0.1318222280149356,0 +8173,1475370300,0.1438986490229466,0 +8174,1475370600,0.14771421486936626,0 +8175,1475370900,0.14429261395177614,0 +8176,1475371200,0.14350158200808058,0 +8177,1475371500,0.1494110559408399,0 +8178,1475371800,0.15097140522597627,0 +8179,1475372100,0.14731714785439518,0 +8180,1475372400,0.1518772143537543,0 +8181,1475372700,0.15520575268962186,0 +8182,1475373000,0.16132306638808094,0 +8183,1475373300,0.161301351785616,0 +8184,1475373600,0.17335295610533574,0 +8185,1475373900,0.1797494575485421,0 +8186,1475374200,0.1705827932590677,0 +8187,1475374500,0.1814711153085171,0 +8188,1475374800,0.17037495349347295,0 +8189,1475375100,0.1767032090435632,0 +8190,1475375400,0.17035013680507669,0 +8191,1475375700,0.16823451411627208,0 +8192,1475376000,0.17435182781462613,1 +8193,1475376300,0.23153878421932444,1 +8194,1475376600,0.17970292625778592,1 +8195,1475376900,0.1819612449050005,0 +8196,1475377200,0.17205007996258667,0 +8197,1475377500,0.1790763048748602,0 +8198,1475377800,0.17889017971162532,0 +8199,1475378100,0.17500326588594814,0 +8200,1475378400,0.17586564580900635,0 +8201,1475378700,0.1729341744880048,0 +8202,1475379000,0.1612765350972197,0 +8203,1475379300,0.1682810454070283,0 +8204,1475379600,0.1717708922177344,0 +8205,1475379900,0.1767249236459231,0 +8206,1475380200,0.1915001595209341,0 +8207,1475380500,0.19173281597497768,0 +8208,1475380800,0.2029685716624164,0 +8209,1475381100,0.2225117138023287,0 +8210,1475381400,0.2210940604757074,0 +8211,1475381700,0.23949563494777226,0 +8212,1475382000,0.2506166134512127,0 +8213,1475382300,0.2600159341946781,0 +8214,1475382600,0.264926536418147,0 +8215,1475382900,0.27865326720687544,0 +8216,1475383200,0.26401762520433264,0 +8217,1475383500,0.2631583473673108,0 +8218,1475383800,0.26906782130017515,0 +8219,1475384100,0.26031993862803177,0 +8220,1475384400,0.2616445293730357,0 +8221,1475384700,0.2449863272633057,0 +8222,1475385000,0.25259574435368004,0 +8223,1475385300,0.23072603767326896,0 +8224,1475385600,0.2132519869312369,0 +8225,1475385900,0.2184883081904194,0 +8226,1475386200,0.2117412710229981,0 +8227,1475386500,0.1969195038572005,0 +8228,1475386800,0.1853580291340892,0 +8229,1475387100,0.17968121165532094,0 +8230,1475387400,0.17509632846756554,0 +8231,1475387700,0.16427935439747882,0 +8232,1475388000,0.16313778672956827,0 +8233,1475388300,0.1644654795607137,0 +8234,1475388600,0.15988059637295826,0 +8235,1475388900,0.16969869873375468,0 +8236,1475389200,0.15860253691871054,0 +8237,1475389500,0.1612548204948598,0 +8238,1475389800,0.1541107163125726,0 +8239,1475390100,0.15860253691871054,0 +8240,1475390400,0.1602311320970681,0 +8241,1475390700,0.15660169141388328,0 +8242,1475391000,0.1562976869806347,0 +8243,1475391300,0.1527613088790673,0 +8244,1475391600,0.16895419808081524,0 +8245,1475391900,0.1567164685978606,0 
+8246,1475392200,0.1580193447406097,0 +8247,1475392500,0.1595796940257461,0 +8248,1475392800,0.15827681788299702,0 +8249,1475393100,0.1570887189244354,0 +8250,1475393400,0.15034168175701418,0 +8251,1475393700,0.15716006690358789,0 +8252,1475394000,0.161301351785616,0 +8253,1475394300,0.15825200119460073,0 +8254,1475394600,0.15611156181739985,0 +8255,1475394900,0.16388228738250776,0 +8256,1475395200,0.15888172466356282,0 +8257,1475395500,0.16025284669942802,0 +8258,1475395800,0.16565047643334396,0 +8259,1475396100,0.1583915950670794,0 +8260,1475396400,0.16337044318366442,0 +8261,1475396700,0.1674651967748313,0 +8262,1475397000,0.16395363536176527,0 +8263,1475397300,0.1682810454070283,0 +8264,1475397600,0.17314201425359954,0 +8265,1475397900,0.17028189091185553,0 +8266,1475398200,0.17472407814109586,0 +8267,1475398500,0.17237579899830022,0 +8268,1475398800,0.187845902149374,0 +8269,1475399100,0.18947449732770005,0 +8270,1475399400,0.18500749340997927,0 +8271,1475399700,0.19007940410822385,0 +8272,1475400000,0.1904516544346936,0 +8273,1475400300,0.19038340854151445,0 +8274,1475400600,0.1944316308419252,0 +8275,1475400900,0.19933913097928416,0 +8276,1475401200,0.2038061348969838,0 +8277,1475401500,0.20622576201906848,0 +8278,1475401800,0.21416089814505126,0 +8279,1475402100,0.2043893270751267,0 +8280,1475402400,0.2103918635894929,0 +8281,1475402700,0.2101126758446406,0 +8282,1475403000,0.2224899991999688,0 +8283,1475403300,0.21276495942078988,0 +8284,1475403600,0.21022745302872287,0 +8285,1475403900,0.2270252490108266,0 +8286,1475404200,0.2216989672561681,0 +8287,1475404500,0.2264451589187621,0 +8288,1475404800,0.21909321497088013,0 +8289,1475405100,0.2317249093825593,0 +8290,1475405400,0.2230949059805347,0 +8291,1475405700,0.22923703636728496,0 +8292,1475406000,0.22348887090936426,0 +8293,1475406300,0.23326044197929946,0 +8294,1475406600,0.2246521531795296,0 +8295,1475406900,0.2422657977939353,0 +8296,1475407200,0.2369364139532404,0 +8297,1475407500,0.2413351719777609,0 +8298,1475407800,0.2487801785072604,0 +8299,1475408100,0.2448715500793283,0 +8300,1475408400,0.2541064602619188,0 +8301,1475408700,0.2493137373085687,0 +8302,1475409000,0.2652057241629993,0 +8303,1475409300,0.2569914002920591,0 +8304,1475409600,0.2727437932741161,0 +8305,1475409900,0.2700666930095706,0 +8306,1475410200,0.2849815227569657,0 +8307,1475410500,0.283306396287852,0 +8308,1475410800,0.2834459901602256,0 +8309,1475411100,0.28746939577223996,0 +8310,1475411400,0.2713695691522146,0 +8311,1475411700,0.27406838401912004,0 +8312,1475412000,0.27099731882574485,0 +8313,1475412300,0.2791185801149626,0 +8314,1475412600,0.27325563747295945,0 +8315,1475412900,0.2724180742384026,0 +8316,1475413200,0.2730446956213283,0 +8317,1475413500,0.2845162098488785,0 +8318,1475413800,0.28428355339478245,0 +8319,1475414100,0.2833994588694694,0 +8320,1475414400,0.2872832706090052,0 +8321,1475414700,0.2887257406241278,0 +8322,1475415000,0.2860269257571173,0 +8323,1475415300,0.2836786466143217,0 +8324,1475415600,0.2758148584674913,0 +8325,1475415900,0.2862130509203521,0 +8326,1475416200,0.2816312698186332,0 +8327,1475416500,0.2731377582029457,0 +8328,1475416800,0.2852358938134217,0 +8329,1475417100,0.2791868260081837,0 +8330,1475417400,0.27248632013162377,0 +8331,1475417700,0.2745336969272072,0 +8332,1475418000,0.2675540033057949,0 +8333,1475418300,0.2645542860916773,0 +8334,1475418600,0.2692074151725488,0 +8335,1475418900,0.2608317828268751,0 +8336,1475419200,0.2719062300395593,0 +8337,1475419500,0.25864481215881296,0 
+8338,1475419800,0.26218119026038034,0 +8339,1475420100,0.26259997187760625,0 +8340,1475420400,0.2500117066706468,0 +8341,1475420700,0.24433488919198365,0 +8342,1475421000,0.2362849758819185,0 +8343,1475421300,0.23933432647293365,0 +8344,1475421600,0.23579794837147136,0 +8345,1475421900,0.22211774887349914,0 +8346,1475422200,0.22716484288330524,0 +8347,1475422500,0.21504499267046945,0 +8348,1475422800,0.20831967010549213,0 +8349,1475423100,0.19703738312724994,0 +8350,1475423400,0.2101126758446406,0 +8351,1475423700,0.1844739346086711,0 +8352,1475424000,0.19152187412331506,0 +8353,1475424300,0.18763806238375824,0 +8354,1475424600,0.16888595218759409,0 +8355,1475424900,0.16807010355539712,0 +8356,1475425200,0.15438990405742484,0 +8357,1475425500,0.1532266217871545,0 +8358,1475425800,0.14045533350310171,0 +8359,1475426100,0.13042628929056907,0 +8360,1475426400,0.1263315356994022,0 +8361,1475426700,0.11765500100651632,0 +8362,1475427000,0.11837468497095438,0 +8363,1475427300,0.10769730477324126,0 +8364,1475427600,0.1044618290189911,0 +8365,1475427900,0.1014838264072334,0 +8366,1475428200,0.09811185886654096,0 +8367,1475428500,0.08587412938358634,0 +8368,1475428800,0.08489697227686599,0 +8369,1475429100,0.08308225193506344,0 +8370,1475429400,0.07424130668088229,0 +8371,1475429700,0.07389077095708768,0 +8372,1475430000,0.07207605061528513,0 +8373,1475430300,0.06802782831571506,0 +8374,1475430600,0.06540036142775191,0 +8375,1475430900,0.0640974852851079,0 +8376,1475431200,0.06177092074414677,0 +8377,1475431500,0.061094665984428476,0 +8378,1475431800,0.06014232556557906,0 +8379,1475432100,0.056441536903557035,0 +8380,1475432400,0.0541863203420636,0 +8381,1475432700,0.05376753872504785,0 +8382,1475433000,0.05709297497540442,0 +8383,1475433300,0.055278254633601864,0 +8384,1475433600,0.05004503546035072,0 +8385,1475433900,0.048602565445017835,0 +8386,1475434200,0.051906287092699295,0 +8387,1475434500,0.050764719425419255,0 +8388,1475434800,0.051834939113231566,0 +8389,1475435100,0.04953319126119216,0 +8390,1475435400,0.046741313812669286,0 +8391,1475435700,0.0448800621803207,0 +8392,1475436000,0.0461829383229647,0 +8393,1475436300,0.0458324025991701,0 +8394,1475436600,0.04608987574187265,0 +8395,1475436900,0.04287921667633407,0 +8396,1475437200,0.04476218291030699,0 +8397,1475437500,0.04099314835506378,0 +8398,1475437800,0.04278615409419124,0 +8399,1475438100,0.039783334793511815,0 +8400,1475438400,0.03922495930380724,0 +8401,1475438700,0.038573521233010624,0 +8402,1475439000,0.042202961915565,0 +8403,1475439300,0.03927149059540405,0 +8404,1475439600,0.041111027625077526,0 +8405,1475439900,0.03766771210470722,0 +8406,1475440200,0.03982986608510861,0 +8407,1475440500,0.03999427664566834,0 +8408,1475440800,0.040155585119981525,0 +8409,1475441100,0.0397368035029658,0 +8410,1475441400,0.044250338711148425,0 +8411,1475441700,0.04252868095148864,0 +8412,1475442000,0.04085355448237496,0 +8413,1475442300,0.040760491901282926,0 +8414,1475442600,0.042342555788253806,0 +8415,1475442900,0.042342555788253806,0 +8416,1475443200,0.041808996987470905,0 +8417,1475443500,0.04139021536940436,0 +8418,1475443800,0.042296024497707814,0 +8419,1475444100,0.041458461262625534,0 +8420,1475444400,0.04429687000274525,0 +8421,1475444700,0.049626253843334966,0 +8422,1475445000,0.04725315801182785,0 +8423,1475445300,0.04692743897590413,0 +8424,1475445600,0.05453375398066244,0 +8425,1475445900,0.05376753872504785,0 +8426,1475446200,0.05844238240859444,0 +8427,1475446500,0.05977007523910935,0 +8428,1475446800,0.061398670417677076,0 
+8429,1475447100,0.0638182975397302,0 +8430,1475447400,0.06681801475416305,0 +8431,1475447700,0.06565473248420793,0 +8432,1475448000,0.07363639990063169,0 +8433,1475448300,0.07277401997767856,0 +8434,1475448600,0.07328586417683709,0 +8435,1475448900,0.07852218543528401,0 +8436,1475449200,0.08098834384893397,0 +8437,1475449500,0.0774519657474717,0 +8438,1475449800,0.08494350356741202,0 +8439,1475450100,0.08454643655307144,0 +8440,1475450400,0.0931081940614546,0 +8441,1475450700,0.09131829040794323,0 +8442,1475451000,0.09513385625478324,0 +8443,1475451300,0.0912252278268512,0 +8444,1475451600,0.0941318824593514,0 +8445,1475451900,0.09994829381049326,0 +8446,1475452200,0.10059973188181526,0 +8447,1475452500,0.10690627282954564,0 +8448,1475452800,0.10325201545796454,0 +8449,1475453100,0.11302358652789968,0 +8450,1475453400,0.11379290386934048,0 +8451,1475453700,0.11230390256335654,0 +8452,1475454000,0.11169899578289576,0 +8453,1475454300,0.11790937206286728,0 +8454,1475454600,0.11942319005724747,0 +8455,1475454900,0.11800243464448468,0 +8456,1475455200,0.12349312696001807,0 +8457,1475455500,0.12102696854710365,0 +8458,1475455800,0.12754134926042876,0 +8459,1475456100,0.13356870246330688,0 +8460,1475456400,0.13231235761141902,0 +8461,1475456700,0.1428284293342936,0 +8462,1475457000,0.1424561790078239,0 +8463,1475457300,0.1556462489093127,0 +8464,1475457600,0.15176243716977694,0 +8465,1475457900,0.15466909180227714,0 +8466,1475458200,0.1644654795607137,0 +8467,1475458500,0.1521564020986066,0 +8468,1475458800,0.15878866208194542,0 +8469,1475459100,0.15611156181739985,0 +8470,1475459400,0.15848465764869685,0 +8471,1475459700,0.16893248347835027,0 +8472,1475460000,0.16860676444274178,0 +8473,1475460300,0.1711194541464124,0 +8474,1475460600,0.16776920120818495,0 +8475,1475460900,0.16176666469370315,0 +8476,1475461200,0.16983829260612826,0 +8477,1475461500,0.17181742350859566,0 +8478,1475461800,0.17588736041136624,0 +8479,1475462100,0.16993135518774571,0 +8480,1475462400,0.17025707422345926,0 +8481,1475462700,0.17551511008489654,0 +8482,1475463000,0.1729558890903647,0 +8483,1475463300,0.17276976392712984,0 +8484,1475463600,0.16997788647860698,0 +8485,1475463900,0.16223197760189534,0 +8486,1475464200,0.1652565115044093,0 +8487,1475464500,0.1732133622328571,0 +8488,1475464800,0.17274804932476995,0 +8489,1475465100,0.167837447101301,0 +8490,1475465400,0.16488426117793956,0 +8491,1475465700,0.16586141828497508,0 +8492,1475466000,0.1767032090435632,0 +8493,1475466300,0.18579852535378008,0 +8494,1475466600,0.1806335520739602,0 +8495,1475466900,0.1916149367049325,0 +8496,1475467200,0.2024101961727013,0 +8497,1475467500,0.2003876360655247,0 +8498,1475467800,0.21788340140985354,0 +8499,1475468100,0.2363780384635358,0 +8500,1475468400,0.2384967632383768,0 +8501,1475468700,0.25157205595578325,0 +8502,1475469000,0.25024436312474296,0 +8503,1475469300,0.2617624086430493,0 +8504,1475469600,0.2573884673070302,0 +8505,1475469900,0.26139015831657963,0 +8506,1475470200,0.2494316165785824,0 +8507,1475470500,0.2497573356142959,0 +8508,1475470800,0.2464753285691845,0 +8509,1475471100,0.2477316734210724,0 +8510,1475471400,0.23298125423444715,0 +8511,1475471700,0.22274437025642485,0 +8512,1475472000,0.2129510845840248,0 +8513,1475472300,0.20662282903397647,0 +8514,1475472600,0.20062029251956826,0 +8515,1475472900,0.18891612183798487,0 +8516,1475473200,0.1844491179202748,0 +8517,1475473500,0.1824948037063088,0 +8518,1475473800,0.1835184921041005,0 +8519,1475474100,0.1708154497131638,0 +8520,1475474400,0.16893248347835027,0 
+8521,1475474700,0.16460507343308728,0 +8522,1475475000,0.16037072596944169,0 +8523,1475475300,0.16230022349501144,0 +8524,1475475600,0.15464737719991725,0 +8525,1475475900,0.16490597578040453,0 +8526,1475476200,0.15639074956225213,0 +8527,1475476500,0.1554135924552166,0 +8528,1475476800,0.16018460080620686,0 +8529,1475477100,0.15829853248546194,0 +8530,1475477400,0.15964793991886214,0 +8531,1475477700,0.16281206769395984,0 +8532,1475478000,0.1629764782548348,0 +8533,1475478300,0.15713525021519156,0 +8534,1475478600,0.16064991371429402,0 +8535,1475478900,0.15850947433709311,0 +8536,1475479200,0.16655938764715836,0 +8537,1475479500,0.16886113549919782,0 +8538,1475479800,0.16569700772410015,0 +8539,1475480100,0.1640684125457426,0 +8540,1475480400,0.16367444761691294,0 +8541,1475480700,0.1718856694017117,0 +8542,1475481000,0.16604754344820996,0 +8543,1475481300,0.17484195741121464,0 +8544,1475481600,0.1695373902589161,0 +8545,1475481900,0.16941951098890234,0 +8546,1475482200,0.18179683434412552,0 +8547,1475482500,0.1759587083906238,0 +8548,1475482800,0.17772689744135495,0 +8549,1475483100,0.18977850176099065,0 +8550,1475483400,0.1864034321342408,0 +8551,1475483700,0.1859164046237938,0 +8552,1475484000,0.19566315900534315,0 +8553,1475484300,0.20448238965674406,0 +8554,1475484600,0.20336563867732446,0 +8555,1475484900,0.20364482642217674,0 +8556,1475485200,0.2022488876978943,0 +8557,1475485500,0.203272576095707,0 +8558,1475485800,0.2183952456086969,0 +8559,1475486100,0.22351368759776047,0 +8560,1475486400,0.2229770267104159,0 +8561,1475486700,0.2339832280298789,0 +8562,1475487000,0.2281885312810969,0 +8563,1475487300,0.22439778212317868,0 +8564,1475487600,0.2375661374222025,0 +8565,1475487900,0.23768091460617985,0 +8566,1475488200,0.24021842099835186,0 +8567,1475488500,0.244614076936836,0 +8568,1475488800,0.2427311107020224,0 +8569,1475489100,0.2529679946801497,0 +8570,1475489400,0.2408698590696738,0 +8571,1475489700,0.2430785443400958,0 +8572,1475490000,0.2477316734210724,0 +8573,1475490300,0.2582942764347031,0 +8574,1475490600,0.2470585207472853,0 +8575,1475490900,0.25324718242500205,0 +8576,1475491200,0.262389030025975,0 +8577,1475491500,0.2555489302770415,0 +8578,1475491800,0.26294740551567963,0 +8579,1475492100,0.26169106066379183,0 +8580,1475492400,0.2627612803524448,0 +8581,1475492700,0.2650195989997645,0 +8582,1475493000,0.26843809783121303,0 +8583,1475493300,0.2716022256062056,0 +8584,1475493600,0.2857725547006613,0 +8585,1475493900,0.2765345424320345,0 +8586,1475494200,0.2863774614812272,0 +8587,1475494500,0.2893089328022287,0 +8588,1475494800,0.30087040752535993,0 +8589,1475495100,0.29000690216441194,0 +8590,1475495400,0.29977847323434714,0 +8591,1475495700,0.302405940121995,0 +8592,1475496000,0.3125962928094713,0 +8593,1475496300,0.300451625908029,0 +8594,1475496600,0.31501591993152445,0 +8595,1475496900,0.3256964022153791,0 +8596,1475497200,0.3201126473183333,0 +8597,1475497500,0.32469442841973706,0 +8598,1475497800,0.3282090919185243,0 +8599,1475498100,0.3270923409391151,0 +8600,1475498400,0.3254637457615982,0 +8601,1475498700,0.3320711890559103,0 +8602,1475499000,0.3287209361176828,0 +8603,1475499300,0.3341650971420397,0 +8604,1475499600,0.32704580964856905,0 +8605,1475499900,0.3424476669065163,0 +8606,1475500200,0.343120819579988,0 +8607,1475500500,0.3388864721161323,0 +8608,1475500800,0.3437505430491603,0 +8609,1475501100,0.3553833657518644,0 +8610,1475501400,0.3595928965267984,0 +8611,1475501700,0.35884839587385897,0 +8612,1475502000,0.35598827253211496,0 
+8613,1475502300,0.3504727635282904,0 +8614,1475502600,0.3672240282194276,0 +8615,1475502900,0.3714583756832833,0 +8616,1475503200,0.36385206067852505,0 +8617,1475503500,0.3524053631401066,0 +8618,1475503800,0.34889069964026864,0 +8619,1475504100,0.3505658261093824,0 +8620,1475504400,0.3418892914168117,0 +8621,1475504700,0.3457048572636517,0 +8622,1475505000,0.3403754734220112,0 +8623,1475505300,0.3398170979323066,0 +8624,1475505600,0.3360015320865174,0 +8625,1475505900,0.32495190156243964,0 +8626,1475506200,0.3362341885402983,0 +8627,1475506500,0.3115726044111541,0 +8628,1475506800,0.3145754237118336,0 +8629,1475507100,0.30903820010533384,0 +8630,1475507400,0.30396628940744663,0 +8631,1475507700,0.3023842255196351,0 +8632,1475508000,0.3015683768874381,0 +8633,1475508300,0.2820965827267203,0 +8634,1475508600,0.2733487000545769,0 +8635,1475508900,0.2742793258707512,0 +8636,1475509200,0.2616693460614319,0 +8637,1475509500,0.2604130012096492,0 +8638,1475509800,0.23049338121917284,0 +8639,1475510100,0.2343771929587086,0 +8640,1475510400,0.2283994731327281,0 +8641,1475510700,0.2214663108021772,0 +8642,1475511000,0.21551030557855647,0 +8643,1475511300,0.2069237313812097,0 +8644,1475511600,0.19531572536730124,0 +8645,1475511900,0.17591217709976253,0 +8646,1475512200,0.1748884887019708,0 +8647,1475512500,0.17030360551421542,0 +8648,1475512800,0.1574609692509051,0 +8649,1475513100,0.15429684147580744,0 +8650,1475513400,0.15888172466356282,0 +8651,1475513700,0.138615796473113,0 +8652,1475514000,0.13214794705054406,0 +8653,1475514300,0.13594179829460384,0 +8654,1475514600,0.12181800049090435,0 +8655,1475514900,0.11532843646597553,0 +8656,1475515200,0.1120929607117254,0 +8657,1475515500,0.10830221155380708,0 +8658,1475515800,0.10267192536590007,0 +8659,1475516100,0.09199144308246578,0 +8660,1475516400,0.09143306759276118,0 +8661,1475516700,0.0912717591173972,0 +8662,1475517000,0.08331490838884432,0 +8663,1475517300,0.08475737840417719,0 +8664,1475517600,0.07612427291590597,0 +8665,1475517900,0.07358986861008568,0 +8666,1475518200,0.07002867381970168,0 +8667,1475518500,0.07005349050862336,0 +8668,1475518800,0.06993561123860964,0 +8669,1475519100,0.06807435960626106,0 +8670,1475519400,0.06982083405484249,0 +8671,1475519700,0.06872579767705768,0 +8672,1475520000,0.06081547824010159,0 +8673,1475520300,0.05834931982645161,0 +8674,1475520600,0.05930476233154761,0 +8675,1475520900,0.05646635359247872,0 +8676,1475521200,0.053395288398578135,0 +8677,1475521500,0.05392884720041181,0 +8678,1475521800,0.051602282659450716,0 +8679,1475522100,0.05092912998597899,0 +8680,1475522400,0.050066750063025835,0 +8681,1475522700,0.05111525514921381,0 +8682,1475523000,0.04983409360924496,0 +8683,1475523300,0.0467630284153444,0 +8684,1475523600,0.04764712294097267,0 +8685,1475523900,0.04702050155804696,0 +8686,1475524200,0.04380984249250836,0 +8687,1475524500,0.04666996583425236,0 +8688,1475524800,0.045298843798387216,0 +8689,1475525100,0.045553214854843226,0 +8690,1475525400,0.04478699959922865,0 +8691,1475525700,0.04318011902333607,0 +8692,1475526000,0.04504137065568466,0 +8693,1475526300,0.0454601522727004,0 +8694,1475526600,0.043902905073600375,0 +8695,1475526900,0.041808996987470905,0 +8696,1475527200,0.04257521224203466,0 +8697,1475527500,0.04108621093615582,0 +8698,1475527800,0.041551523844768366,0 +8699,1475528100,0.04197030546178409,0 +8700,1475528400,0.04608987574187265,0 +8701,1475528700,0.04257521224203466,0 +8702,1475529000,0.044228624109524085,0 +8703,1475529300,0.04273962280364522,0 
+8704,1475529600,0.0454601522727004,0 +8705,1475529900,0.04399596765574322,0 +8706,1475530200,0.04373849451304064,0 +8707,1475530500,0.04653037196156353,0 +8708,1475530800,0.04727487261450297,0 +8709,1475531100,0.04969449973655612,0 +8710,1475531400,0.052346783312390135,0 +8711,1475531700,0.05490600430713215,0 +8712,1475532000,0.05937300822476872,0 +8713,1475532300,0.06000273169394104,0 +8714,1475532600,0.0673081443506465,0 +8715,1475532900,0.07182167955882914,0 +8716,1475533200,0.07900921294657168,0 +8717,1475533500,0.08422071751714774,0 +8718,1475533800,0.08508309744010087,0 +8719,1475534100,0.09389922600525527,0 +8720,1475534400,0.09038766459176896,0 +8721,1475534700,0.10285805052913492,0 +8722,1475535000,0.10360255118207436,0 +8723,1475535300,0.10913977478836394,0 +8724,1475535600,0.1112553974771685,0 +8725,1475535900,0.11593334324654135,0 +8726,1475536200,0.12603063335219006,0 +8727,1475536500,0.12761269723968632,0 +8728,1475536800,0.1336834796472842,0 +8729,1475537100,0.14617868227262615,0 +8730,1475537400,0.14199086609973674,0 +8731,1475537700,0.14903880561437013,0 +8732,1475538000,0.1516910891905194,0 +8733,1475538300,0.16262594253072496,0 +8734,1475538600,0.15690259376109547,0 +8735,1475538900,0.1707006725291865,0 +8736,1475539200,0.16974523002451086,0 +8737,1475539500,0.17491020330433069,0 +8738,1475539800,0.17400439417665778,0 +8739,1475540100,0.18689356173082933,0 +8740,1475540400,0.19103484661284686,0 +8741,1475540700,0.19103484661284686,0 +8742,1475541000,0.1991778225044771,0 +8743,1475541300,0.20071335510118568,0 +8744,1475541600,0.20792570517664155,0 +8745,1475541900,0.21101848497241865,0 +8746,1475542200,0.21416089814505126,0 +8747,1475542500,0.2257937208474401,0 +8748,1475542800,0.24287070457450105,0 +8749,1475543100,0.2380314503302897,0 +8750,1475543400,0.2489663036704952,0 +8751,1475543700,0.2555489302770415,0 +8752,1475544000,0.2535946160629704,0 +8753,1475544300,0.2590853083785038,0 +8754,1475544600,0.2673213468518039,0 +8755,1475544900,0.2681123787954995,0 +8756,1475545200,0.2762088233963209,0 +8757,1475545500,0.2662045958723947,0 +8758,1475545800,0.2700915096979669,0 +8759,1475546100,0.2864922386652045,0 +8760,1475546400,0.2939372451947039,0 +8761,1475546700,0.2877268689147324,0 +8762,1475547000,0.2870754308434104,0 +8763,1475547300,0.29945275419863354,0 +8764,1475547600,0.2889583970781189,0 +8765,1475547900,0.29738056071465385,0 +8766,1475548200,0.3018723813207918,0 +8767,1475548500,0.3191354902105622,0 +8768,1475548800,0.3033613826261452,0 +8769,1475549100,0.30647897911059185,0 +8770,1475549400,0.30010419226995555,0 +8771,1475549700,0.31322601627864355,0 +8772,1475550000,0.297172720948954,0 +8773,1475550300,0.30457119618769724,0 +8774,1475550600,0.29947446880099343,0 +8775,1475550900,0.3089668521258661,0 +8776,1475551200,0.29372940542910914,0 +8777,1475551500,0.2971479042605577,0 +8778,1475551800,0.2978706903111373,0 +8779,1475552100,0.3058492556414195,0 +8780,1475552400,0.30889860623264503,0 +8781,1475552700,0.3077818552532359,0 +8782,1475553000,0.309059914708009,0 +8783,1475553300,0.30985094665149443,0 diff --git a/datasets/anomaly/kpi/TEST/problem_TEST/dataSplits.csv b/datasets/anomaly/kpi/TEST/problem_TEST/dataSplits.csv new file mode 100644 index 0000000..1f92bd4 --- /dev/null +++ b/datasets/anomaly/kpi/TEST/problem_TEST/dataSplits.csv @@ -0,0 +1,7028 @@ +d3mIndex,type,repeat,fold +7027,TEST,0,0 +7028,TEST,0,0 +7029,TEST,0,0 +7030,TEST,0,0 +7031,TEST,0,0 +7032,TEST,0,0 +7033,TEST,0,0 +7034,TEST,0,0 +7035,TEST,0,0 +7036,TEST,0,0 +7037,TEST,0,0 +7038,TEST,0,0 
+7039,TEST,0,0 +7040,TEST,0,0 +7041,TEST,0,0 +7042,TEST,0,0 +7043,TEST,0,0 +7044,TEST,0,0 +7045,TEST,0,0 +7046,TEST,0,0 +7047,TEST,0,0 +7048,TEST,0,0 +7049,TEST,0,0 +7050,TEST,0,0 +7051,TEST,0,0 +7052,TEST,0,0 +7053,TEST,0,0 +7054,TEST,0,0 +7055,TEST,0,0 +7056,TEST,0,0 +7057,TEST,0,0 +7058,TEST,0,0 +7059,TEST,0,0 +7060,TEST,0,0 +7061,TEST,0,0 +7062,TEST,0,0 +7063,TEST,0,0 +7064,TEST,0,0 +7065,TEST,0,0 +7066,TEST,0,0 +7067,TEST,0,0 +7068,TEST,0,0 +7069,TEST,0,0 +7070,TEST,0,0 +7071,TEST,0,0 +7072,TEST,0,0 +7073,TEST,0,0 +7074,TEST,0,0 +7075,TEST,0,0 +7076,TEST,0,0 +7077,TEST,0,0 +7078,TEST,0,0 +7079,TEST,0,0 +7080,TEST,0,0 +7081,TEST,0,0 +7082,TEST,0,0 +7083,TEST,0,0 +7084,TEST,0,0 +7085,TEST,0,0 +7086,TEST,0,0 +7087,TEST,0,0 +7088,TEST,0,0 +7089,TEST,0,0 +7090,TEST,0,0 +7091,TEST,0,0 +7092,TEST,0,0 +7093,TEST,0,0 +7094,TEST,0,0 +7095,TEST,0,0 +7096,TEST,0,0 +7097,TEST,0,0 +7098,TEST,0,0 +7099,TEST,0,0 +7100,TEST,0,0 +7101,TEST,0,0 +7102,TEST,0,0 +7103,TEST,0,0 +7104,TEST,0,0 +7105,TEST,0,0 +7106,TEST,0,0 +7107,TEST,0,0 +7108,TEST,0,0 +7109,TEST,0,0 +7110,TEST,0,0 +7111,TEST,0,0 +7112,TEST,0,0 +7113,TEST,0,0 +7114,TEST,0,0 +7115,TEST,0,0 +7116,TEST,0,0 +7117,TEST,0,0 +7118,TEST,0,0 +7119,TEST,0,0 +7120,TEST,0,0 +7121,TEST,0,0 +7122,TEST,0,0 +7123,TEST,0,0 +7124,TEST,0,0 +7125,TEST,0,0 +7126,TEST,0,0 +7127,TEST,0,0 +7128,TEST,0,0 +7129,TEST,0,0 +7130,TEST,0,0 +7131,TEST,0,0 +7132,TEST,0,0 +7133,TEST,0,0 +7134,TEST,0,0 +7135,TEST,0,0 +7136,TEST,0,0 +7137,TEST,0,0 +7138,TEST,0,0 +7139,TEST,0,0 +7140,TEST,0,0 +7141,TEST,0,0 +7142,TEST,0,0 +7143,TEST,0,0 +7144,TEST,0,0 +7145,TEST,0,0 +7146,TEST,0,0 +7147,TEST,0,0 +7148,TEST,0,0 +7149,TEST,0,0 +7150,TEST,0,0 +7151,TEST,0,0 +7152,TEST,0,0 +7153,TEST,0,0 +7154,TEST,0,0 +7155,TEST,0,0 +7156,TEST,0,0 +7157,TEST,0,0 +7158,TEST,0,0 +7159,TEST,0,0 +7160,TEST,0,0 +7161,TEST,0,0 +7162,TEST,0,0 +7163,TEST,0,0 +7164,TEST,0,0 +7165,TEST,0,0 +7166,TEST,0,0 +7167,TEST,0,0 +7168,TEST,0,0 +7169,TEST,0,0 +7170,TEST,0,0 +7171,TEST,0,0 +7172,TEST,0,0 +7173,TEST,0,0 +7174,TEST,0,0 +7175,TEST,0,0 +7176,TEST,0,0 +7177,TEST,0,0 +7178,TEST,0,0 +7179,TEST,0,0 +7180,TEST,0,0 +7181,TEST,0,0 +7182,TEST,0,0 +7183,TEST,0,0 +7184,TEST,0,0 +7185,TEST,0,0 +7186,TEST,0,0 +7187,TEST,0,0 +7188,TEST,0,0 +7189,TEST,0,0 +7190,TEST,0,0 +7191,TEST,0,0 +7192,TEST,0,0 +7193,TEST,0,0 +7194,TEST,0,0 +7195,TEST,0,0 +7196,TEST,0,0 +7197,TEST,0,0 +7198,TEST,0,0 +7199,TEST,0,0 +7200,TEST,0,0 +7201,TEST,0,0 +7202,TEST,0,0 +7203,TEST,0,0 +7204,TEST,0,0 +7205,TEST,0,0 +7206,TEST,0,0 +7207,TEST,0,0 +7208,TEST,0,0 +7209,TEST,0,0 +7210,TEST,0,0 +7211,TEST,0,0 +7212,TEST,0,0 +7213,TEST,0,0 +7214,TEST,0,0 +7215,TEST,0,0 +7216,TEST,0,0 +7217,TEST,0,0 +7218,TEST,0,0 +7219,TEST,0,0 +7220,TEST,0,0 +7221,TEST,0,0 +7222,TEST,0,0 +7223,TEST,0,0 +7224,TEST,0,0 +7225,TEST,0,0 +7226,TEST,0,0 +7227,TEST,0,0 +7228,TEST,0,0 +7229,TEST,0,0 +7230,TEST,0,0 +7231,TEST,0,0 +7232,TEST,0,0 +7233,TEST,0,0 +7234,TEST,0,0 +7235,TEST,0,0 +7236,TEST,0,0 +7237,TEST,0,0 +7238,TEST,0,0 +7239,TEST,0,0 +7240,TEST,0,0 +7241,TEST,0,0 +7242,TEST,0,0 +7243,TEST,0,0 +7244,TEST,0,0 +7245,TEST,0,0 +7246,TEST,0,0 +7247,TEST,0,0 +7248,TEST,0,0 +7249,TEST,0,0 +7250,TEST,0,0 +7251,TEST,0,0 +7252,TEST,0,0 +7253,TEST,0,0 +7254,TEST,0,0 +7255,TEST,0,0 +7256,TEST,0,0 +7257,TEST,0,0 +7258,TEST,0,0 +7259,TEST,0,0 +7260,TEST,0,0 +7261,TEST,0,0 +7262,TEST,0,0 +7263,TEST,0,0 +7264,TEST,0,0 +7265,TEST,0,0 +7266,TEST,0,0 +7267,TEST,0,0 +7268,TEST,0,0 +7269,TEST,0,0 +7270,TEST,0,0 +7271,TEST,0,0 +7272,TEST,0,0 +7273,TEST,0,0 +7274,TEST,0,0 +7275,TEST,0,0 
+7276,TEST,0,0 +7277,TEST,0,0 +7278,TEST,0,0 +7279,TEST,0,0 +7280,TEST,0,0 +7281,TEST,0,0 +7282,TEST,0,0 +7283,TEST,0,0 +7284,TEST,0,0 +7285,TEST,0,0 +7286,TEST,0,0 +7287,TEST,0,0 +7288,TEST,0,0 +7289,TEST,0,0 +7290,TEST,0,0 +7291,TEST,0,0 +7292,TEST,0,0 +7293,TEST,0,0 +7294,TEST,0,0 +7295,TEST,0,0 +7296,TEST,0,0 +7297,TEST,0,0 +7298,TEST,0,0 +7299,TEST,0,0 +7300,TEST,0,0 +7301,TEST,0,0 +7302,TEST,0,0 +7303,TEST,0,0 +7304,TEST,0,0 +7305,TEST,0,0 +7306,TEST,0,0 +7307,TEST,0,0 +7308,TEST,0,0 +7309,TEST,0,0 +7310,TEST,0,0 +7311,TEST,0,0 +7312,TEST,0,0 +7313,TEST,0,0 +7314,TEST,0,0 +7315,TEST,0,0 +7316,TEST,0,0 +7317,TEST,0,0 +7318,TEST,0,0 +7319,TEST,0,0 +7320,TEST,0,0 +7321,TEST,0,0 +7322,TEST,0,0 +7323,TEST,0,0 +7324,TEST,0,0 +7325,TEST,0,0 +7326,TEST,0,0 +7327,TEST,0,0 +7328,TEST,0,0 +7329,TEST,0,0 +7330,TEST,0,0 +7331,TEST,0,0 +7332,TEST,0,0 +7333,TEST,0,0 +7334,TEST,0,0 +7335,TEST,0,0 +7336,TEST,0,0 +7337,TEST,0,0 +7338,TEST,0,0 +7339,TEST,0,0 +7340,TEST,0,0 +7341,TEST,0,0 +7342,TEST,0,0 +7343,TEST,0,0 +7344,TEST,0,0 +7345,TEST,0,0 +7346,TEST,0,0 +7347,TEST,0,0 +7348,TEST,0,0 +7349,TEST,0,0 +7350,TEST,0,0 +7351,TEST,0,0 +7352,TEST,0,0 +7353,TEST,0,0 +7354,TEST,0,0 +7355,TEST,0,0 +7356,TEST,0,0 +7357,TEST,0,0 +7358,TEST,0,0 +7359,TEST,0,0 +7360,TEST,0,0 +7361,TEST,0,0 +7362,TEST,0,0 +7363,TEST,0,0 +7364,TEST,0,0 +7365,TEST,0,0 +7366,TEST,0,0 +7367,TEST,0,0 +7368,TEST,0,0 +7369,TEST,0,0 +7370,TEST,0,0 +7371,TEST,0,0 +7372,TEST,0,0 +7373,TEST,0,0 +7374,TEST,0,0 +7375,TEST,0,0 +7376,TEST,0,0 +7377,TEST,0,0 +7378,TEST,0,0 +7379,TEST,0,0 +7380,TEST,0,0 +7381,TEST,0,0 +7382,TEST,0,0 +7383,TEST,0,0 +7384,TEST,0,0 +7385,TEST,0,0 +7386,TEST,0,0 +7387,TEST,0,0 +7388,TEST,0,0 +7389,TEST,0,0 +7390,TEST,0,0 +7391,TEST,0,0 +7392,TEST,0,0 +7393,TEST,0,0 +7394,TEST,0,0 +7395,TEST,0,0 +7396,TEST,0,0 +7397,TEST,0,0 +7398,TEST,0,0 +7399,TEST,0,0 +7400,TEST,0,0 +7401,TEST,0,0 +7402,TEST,0,0 +7403,TEST,0,0 +7404,TEST,0,0 +7405,TEST,0,0 +7406,TEST,0,0 +7407,TEST,0,0 +7408,TEST,0,0 +7409,TEST,0,0 +7410,TEST,0,0 +7411,TEST,0,0 +7412,TEST,0,0 +7413,TEST,0,0 +7414,TEST,0,0 +7415,TEST,0,0 +7416,TEST,0,0 +7417,TEST,0,0 +7418,TEST,0,0 +7419,TEST,0,0 +7420,TEST,0,0 +7421,TEST,0,0 +7422,TEST,0,0 +7423,TEST,0,0 +7424,TEST,0,0 +7425,TEST,0,0 +7426,TEST,0,0 +7427,TEST,0,0 +7428,TEST,0,0 +7429,TEST,0,0 +7430,TEST,0,0 +7431,TEST,0,0 +7432,TEST,0,0 +7433,TEST,0,0 +7434,TEST,0,0 +7435,TEST,0,0 +7436,TEST,0,0 +7437,TEST,0,0 +7438,TEST,0,0 +7439,TEST,0,0 +7440,TEST,0,0 +7441,TEST,0,0 +7442,TEST,0,0 +7443,TEST,0,0 +7444,TEST,0,0 +7445,TEST,0,0 +7446,TEST,0,0 +7447,TEST,0,0 +7448,TEST,0,0 +7449,TEST,0,0 +7450,TEST,0,0 +7451,TEST,0,0 +7452,TEST,0,0 +7453,TEST,0,0 +7454,TEST,0,0 +7455,TEST,0,0 +7456,TEST,0,0 +7457,TEST,0,0 +7458,TEST,0,0 +7459,TEST,0,0 +7460,TEST,0,0 +7461,TEST,0,0 +7462,TEST,0,0 +7463,TEST,0,0 +7464,TEST,0,0 +7465,TEST,0,0 +7466,TEST,0,0 +7467,TEST,0,0 +7468,TEST,0,0 +7469,TEST,0,0 +7470,TEST,0,0 +7471,TEST,0,0 +7472,TEST,0,0 +7473,TEST,0,0 +7474,TEST,0,0 +7475,TEST,0,0 +7476,TEST,0,0 +7477,TEST,0,0 +7478,TEST,0,0 +7479,TEST,0,0 +7480,TEST,0,0 +7481,TEST,0,0 +7482,TEST,0,0 +7483,TEST,0,0 +7484,TEST,0,0 +7485,TEST,0,0 +7486,TEST,0,0 +7487,TEST,0,0 +7488,TEST,0,0 +7489,TEST,0,0 +7490,TEST,0,0 +7491,TEST,0,0 +7492,TEST,0,0 +7493,TEST,0,0 +7494,TEST,0,0 +7495,TEST,0,0 +7496,TEST,0,0 +7497,TEST,0,0 +7498,TEST,0,0 +7499,TEST,0,0 +7500,TEST,0,0 +7501,TEST,0,0 +7502,TEST,0,0 +7503,TEST,0,0 +7504,TEST,0,0 +7505,TEST,0,0 +7506,TEST,0,0 +7507,TEST,0,0 +7508,TEST,0,0 +7509,TEST,0,0 +7510,TEST,0,0 +7511,TEST,0,0 +7512,TEST,0,0 
+7513,TEST,0,0 +7514,TEST,0,0 +7515,TEST,0,0 +7516,TEST,0,0 +7517,TEST,0,0 +7518,TEST,0,0 +7519,TEST,0,0 +7520,TEST,0,0 +7521,TEST,0,0 +7522,TEST,0,0 +7523,TEST,0,0 +7524,TEST,0,0 +7525,TEST,0,0 +7526,TEST,0,0 +7527,TEST,0,0 +7528,TEST,0,0 +7529,TEST,0,0 +7530,TEST,0,0 +7531,TEST,0,0 +7532,TEST,0,0 +7533,TEST,0,0 +7534,TEST,0,0 +7535,TEST,0,0 +7536,TEST,0,0 +7537,TEST,0,0 +7538,TEST,0,0 +7539,TEST,0,0 +7540,TEST,0,0 +7541,TEST,0,0 +7542,TEST,0,0 +7543,TEST,0,0 +7544,TEST,0,0 +7545,TEST,0,0 +7546,TEST,0,0 +7547,TEST,0,0 +7548,TEST,0,0 +7549,TEST,0,0 +7550,TEST,0,0 +7551,TEST,0,0 +7552,TEST,0,0 +7553,TEST,0,0 +7554,TEST,0,0 +7555,TEST,0,0 +7556,TEST,0,0 +7557,TEST,0,0 +7558,TEST,0,0 +7559,TEST,0,0 +7560,TEST,0,0 +7561,TEST,0,0 +7562,TEST,0,0 +7563,TEST,0,0 +7564,TEST,0,0 +7565,TEST,0,0 +7566,TEST,0,0 +7567,TEST,0,0 +7568,TEST,0,0 +7569,TEST,0,0 +7570,TEST,0,0 +7571,TEST,0,0 +7572,TEST,0,0 +7573,TEST,0,0 +7574,TEST,0,0 +7575,TEST,0,0 +7576,TEST,0,0 +7577,TEST,0,0 +7578,TEST,0,0 +7579,TEST,0,0 +7580,TEST,0,0 +7581,TEST,0,0 +7582,TEST,0,0 +7583,TEST,0,0 +7584,TEST,0,0 +7585,TEST,0,0 +7586,TEST,0,0 +7587,TEST,0,0 +7588,TEST,0,0 +7589,TEST,0,0 +7590,TEST,0,0 +7591,TEST,0,0 +7592,TEST,0,0 +7593,TEST,0,0 +7594,TEST,0,0 +7595,TEST,0,0 +7596,TEST,0,0 +7597,TEST,0,0 +7598,TEST,0,0 +7599,TEST,0,0 +7600,TEST,0,0 +7601,TEST,0,0 +7602,TEST,0,0 +7603,TEST,0,0 +7604,TEST,0,0 +7605,TEST,0,0 +7606,TEST,0,0 +7607,TEST,0,0 +7608,TEST,0,0 +7609,TEST,0,0 +7610,TEST,0,0 +7611,TEST,0,0 +7612,TEST,0,0 +7613,TEST,0,0 +7614,TEST,0,0 +7615,TEST,0,0 +7616,TEST,0,0 +7617,TEST,0,0 +7618,TEST,0,0 +7619,TEST,0,0 +7620,TEST,0,0 +7621,TEST,0,0 +7622,TEST,0,0 +7623,TEST,0,0 +7624,TEST,0,0 +7625,TEST,0,0 +7626,TEST,0,0 +7627,TEST,0,0 +7628,TEST,0,0 +7629,TEST,0,0 +7630,TEST,0,0 +7631,TEST,0,0 +7632,TEST,0,0 +7633,TEST,0,0 +7634,TEST,0,0 +7635,TEST,0,0 +7636,TEST,0,0 +7637,TEST,0,0 +7638,TEST,0,0 +7639,TEST,0,0 +7640,TEST,0,0 +7641,TEST,0,0 +7642,TEST,0,0 +7643,TEST,0,0 +7644,TEST,0,0 +7645,TEST,0,0 +7646,TEST,0,0 +7647,TEST,0,0 +7648,TEST,0,0 +7649,TEST,0,0 +7650,TEST,0,0 +7651,TEST,0,0 +7652,TEST,0,0 +7653,TEST,0,0 +7654,TEST,0,0 +7655,TEST,0,0 +7656,TEST,0,0 +7657,TEST,0,0 +7658,TEST,0,0 +7659,TEST,0,0 +7660,TEST,0,0 +7661,TEST,0,0 +7662,TEST,0,0 +7663,TEST,0,0 +7664,TEST,0,0 +7665,TEST,0,0 +7666,TEST,0,0 +7667,TEST,0,0 +7668,TEST,0,0 +7669,TEST,0,0 +7670,TEST,0,0 +7671,TEST,0,0 +7672,TEST,0,0 +7673,TEST,0,0 +7674,TEST,0,0 +7675,TEST,0,0 +7676,TEST,0,0 +7677,TEST,0,0 +7678,TEST,0,0 +7679,TEST,0,0 +7680,TEST,0,0 +7681,TEST,0,0 +7682,TEST,0,0 +7683,TEST,0,0 +7684,TEST,0,0 +7685,TEST,0,0 +7686,TEST,0,0 +7687,TEST,0,0 +7688,TEST,0,0 +7689,TEST,0,0 +7690,TEST,0,0 +7691,TEST,0,0 +7692,TEST,0,0 +7693,TEST,0,0 +7694,TEST,0,0 +7695,TEST,0,0 +7696,TEST,0,0 +7697,TEST,0,0 +7698,TEST,0,0 +7699,TEST,0,0 +7700,TEST,0,0 +7701,TEST,0,0 +7702,TEST,0,0 +7703,TEST,0,0 +7704,TEST,0,0 +7705,TEST,0,0 +7706,TEST,0,0 +7707,TEST,0,0 +7708,TEST,0,0 +7709,TEST,0,0 +7710,TEST,0,0 +7711,TEST,0,0 +7712,TEST,0,0 +7713,TEST,0,0 +7714,TEST,0,0 +7715,TEST,0,0 +7716,TEST,0,0 +7717,TEST,0,0 +7718,TEST,0,0 +7719,TEST,0,0 +7720,TEST,0,0 +7721,TEST,0,0 +7722,TEST,0,0 +7723,TEST,0,0 +7724,TEST,0,0 +7725,TEST,0,0 +7726,TEST,0,0 +7727,TEST,0,0 +7728,TEST,0,0 +7729,TEST,0,0 +7730,TEST,0,0 +7731,TEST,0,0 +7732,TEST,0,0 +7733,TEST,0,0 +7734,TEST,0,0 +7735,TEST,0,0 +7736,TEST,0,0 +7737,TEST,0,0 +7738,TEST,0,0 +7739,TEST,0,0 +7740,TEST,0,0 +7741,TEST,0,0 +7742,TEST,0,0 +7743,TEST,0,0 +7744,TEST,0,0 +7745,TEST,0,0 +7746,TEST,0,0 +7747,TEST,0,0 +7748,TEST,0,0 +7749,TEST,0,0 
+7750,TEST,0,0 +7751,TEST,0,0 +7752,TEST,0,0 +7753,TEST,0,0 +7754,TEST,0,0 +7755,TEST,0,0 +7756,TEST,0,0 +7757,TEST,0,0 +7758,TEST,0,0 +7759,TEST,0,0 +7760,TEST,0,0 +7761,TEST,0,0 +7762,TEST,0,0 +7763,TEST,0,0 +7764,TEST,0,0 +7765,TEST,0,0 +7766,TEST,0,0 +7767,TEST,0,0 +7768,TEST,0,0 +7769,TEST,0,0 +7770,TEST,0,0 +7771,TEST,0,0 +7772,TEST,0,0 +7773,TEST,0,0 +7774,TEST,0,0 +7775,TEST,0,0 +7776,TEST,0,0 +7777,TEST,0,0 +7778,TEST,0,0 +7779,TEST,0,0 +7780,TEST,0,0 +7781,TEST,0,0 +7782,TEST,0,0 +7783,TEST,0,0 +7784,TEST,0,0 +7785,TEST,0,0 +7786,TEST,0,0 +7787,TEST,0,0 +7788,TEST,0,0 +7789,TEST,0,0 +7790,TEST,0,0 +7791,TEST,0,0 +7792,TEST,0,0 +7793,TEST,0,0 +7794,TEST,0,0 +7795,TEST,0,0 +7796,TEST,0,0 +7797,TEST,0,0 +7798,TEST,0,0 +7799,TEST,0,0 +7800,TEST,0,0 +7801,TEST,0,0 +7802,TEST,0,0 +7803,TEST,0,0 +7804,TEST,0,0 +7805,TEST,0,0 +7806,TEST,0,0 +7807,TEST,0,0 +7808,TEST,0,0 +7809,TEST,0,0 +7810,TEST,0,0 +7811,TEST,0,0 +7812,TEST,0,0 +7813,TEST,0,0 +7814,TEST,0,0 +7815,TEST,0,0 +7816,TEST,0,0 +7817,TEST,0,0 +7818,TEST,0,0 +7819,TEST,0,0 +7820,TEST,0,0 +7821,TEST,0,0 +7822,TEST,0,0 +7823,TEST,0,0 +7824,TEST,0,0 +7825,TEST,0,0 +7826,TEST,0,0 +7827,TEST,0,0 +7828,TEST,0,0 +7829,TEST,0,0 +7830,TEST,0,0 +7831,TEST,0,0 +7832,TEST,0,0 +7833,TEST,0,0 +7834,TEST,0,0 +7835,TEST,0,0 +7836,TEST,0,0 +7837,TEST,0,0 +7838,TEST,0,0 +7839,TEST,0,0 +7840,TEST,0,0 +7841,TEST,0,0 +7842,TEST,0,0 +7843,TEST,0,0 +7844,TEST,0,0 +7845,TEST,0,0 +7846,TEST,0,0 +7847,TEST,0,0 +7848,TEST,0,0 +7849,TEST,0,0 +7850,TEST,0,0 +7851,TEST,0,0 +7852,TEST,0,0 +7853,TEST,0,0 +7854,TEST,0,0 +7855,TEST,0,0 +7856,TEST,0,0 +7857,TEST,0,0 +7858,TEST,0,0 +7859,TEST,0,0 +7860,TEST,0,0 +7861,TEST,0,0 +7862,TEST,0,0 +7863,TEST,0,0 +7864,TEST,0,0 +7865,TEST,0,0 +7866,TEST,0,0 +7867,TEST,0,0 +7868,TEST,0,0 +7869,TEST,0,0 +7870,TEST,0,0 +7871,TEST,0,0 +7872,TEST,0,0 +7873,TEST,0,0 +7874,TEST,0,0 +7875,TEST,0,0 +7876,TEST,0,0 +7877,TEST,0,0 +7878,TEST,0,0 +7879,TEST,0,0 +7880,TEST,0,0 +7881,TEST,0,0 +7882,TEST,0,0 +7883,TEST,0,0 +7884,TEST,0,0 +7885,TEST,0,0 +7886,TEST,0,0 +7887,TEST,0,0 +7888,TEST,0,0 +7889,TEST,0,0 +7890,TEST,0,0 +7891,TEST,0,0 +7892,TEST,0,0 +7893,TEST,0,0 +7894,TEST,0,0 +7895,TEST,0,0 +7896,TEST,0,0 +7897,TEST,0,0 +7898,TEST,0,0 +7899,TEST,0,0 +7900,TEST,0,0 +7901,TEST,0,0 +7902,TEST,0,0 +7903,TEST,0,0 +7904,TEST,0,0 +7905,TEST,0,0 +7906,TEST,0,0 +7907,TEST,0,0 +7908,TEST,0,0 +7909,TEST,0,0 +7910,TEST,0,0 +7911,TEST,0,0 +7912,TEST,0,0 +7913,TEST,0,0 +7914,TEST,0,0 +7915,TEST,0,0 +7916,TEST,0,0 +7917,TEST,0,0 +7918,TEST,0,0 +7919,TEST,0,0 +7920,TEST,0,0 +7921,TEST,0,0 +7922,TEST,0,0 +7923,TEST,0,0 +7924,TEST,0,0 +7925,TEST,0,0 +7926,TEST,0,0 +7927,TEST,0,0 +7928,TEST,0,0 +7929,TEST,0,0 +7930,TEST,0,0 +7931,TEST,0,0 +7932,TEST,0,0 +7933,TEST,0,0 +7934,TEST,0,0 +7935,TEST,0,0 +7936,TEST,0,0 +7937,TEST,0,0 +7938,TEST,0,0 +7939,TEST,0,0 +7940,TEST,0,0 +7941,TEST,0,0 +7942,TEST,0,0 +7943,TEST,0,0 +7944,TEST,0,0 +7945,TEST,0,0 +7946,TEST,0,0 +7947,TEST,0,0 +7948,TEST,0,0 +7949,TEST,0,0 +7950,TEST,0,0 +7951,TEST,0,0 +7952,TEST,0,0 +7953,TEST,0,0 +7954,TEST,0,0 +7955,TEST,0,0 +7956,TEST,0,0 +7957,TEST,0,0 +7958,TEST,0,0 +7959,TEST,0,0 +7960,TEST,0,0 +7961,TEST,0,0 +7962,TEST,0,0 +7963,TEST,0,0 +7964,TEST,0,0 +7965,TEST,0,0 +7966,TEST,0,0 +7967,TEST,0,0 +7968,TEST,0,0 +7969,TEST,0,0 +7970,TEST,0,0 +7971,TEST,0,0 +7972,TEST,0,0 +7973,TEST,0,0 +7974,TEST,0,0 +7975,TEST,0,0 +7976,TEST,0,0 +7977,TEST,0,0 +7978,TEST,0,0 +7979,TEST,0,0 +7980,TEST,0,0 +7981,TEST,0,0 +7982,TEST,0,0 +7983,TEST,0,0 +7984,TEST,0,0 +7985,TEST,0,0 +7986,TEST,0,0 
+7987,TEST,0,0 +7988,TEST,0,0 +7989,TEST,0,0 +7990,TEST,0,0 +7991,TEST,0,0 +7992,TEST,0,0 +7993,TEST,0,0 +7994,TEST,0,0 +7995,TEST,0,0 +7996,TEST,0,0 +7997,TEST,0,0 +7998,TEST,0,0 +7999,TEST,0,0 +8000,TEST,0,0 +8001,TEST,0,0 +8002,TEST,0,0 +8003,TEST,0,0 +8004,TEST,0,0 +8005,TEST,0,0 +8006,TEST,0,0 +8007,TEST,0,0 +8008,TEST,0,0 +8009,TEST,0,0 +8010,TEST,0,0 +8011,TEST,0,0 +8012,TEST,0,0 +8013,TEST,0,0 +8014,TEST,0,0 +8015,TEST,0,0 +8016,TEST,0,0 +8017,TEST,0,0 +8018,TEST,0,0 +8019,TEST,0,0 +8020,TEST,0,0 +8021,TEST,0,0 +8022,TEST,0,0 +8023,TEST,0,0 +8024,TEST,0,0 +8025,TEST,0,0 +8026,TEST,0,0 +8027,TEST,0,0 +8028,TEST,0,0 +8029,TEST,0,0 +8030,TEST,0,0 +8031,TEST,0,0 +8032,TEST,0,0 +8033,TEST,0,0 +8034,TEST,0,0 +8035,TEST,0,0 +8036,TEST,0,0 +8037,TEST,0,0 +8038,TEST,0,0 +8039,TEST,0,0 +8040,TEST,0,0 +8041,TEST,0,0 +8042,TEST,0,0 +8043,TEST,0,0 +8044,TEST,0,0 +8045,TEST,0,0 +8046,TEST,0,0 +8047,TEST,0,0 +8048,TEST,0,0 +8049,TEST,0,0 +8050,TEST,0,0 +8051,TEST,0,0 +8052,TEST,0,0 +8053,TEST,0,0 +8054,TEST,0,0 +8055,TEST,0,0 +8056,TEST,0,0 +8057,TEST,0,0 +8058,TEST,0,0 +8059,TEST,0,0 +8060,TEST,0,0 +8061,TEST,0,0 +8062,TEST,0,0 +8063,TEST,0,0 +8064,TEST,0,0 +8065,TEST,0,0 +8066,TEST,0,0 +8067,TEST,0,0 +8068,TEST,0,0 +8069,TEST,0,0 +8070,TEST,0,0 +8071,TEST,0,0 +8072,TEST,0,0 +8073,TEST,0,0 +8074,TEST,0,0 +8075,TEST,0,0 +8076,TEST,0,0 +8077,TEST,0,0 +8078,TEST,0,0 +8079,TEST,0,0 +8080,TEST,0,0 +8081,TEST,0,0 +8082,TEST,0,0 +8083,TEST,0,0 +8084,TEST,0,0 +8085,TEST,0,0 +8086,TEST,0,0 +8087,TEST,0,0 +8088,TEST,0,0 +8089,TEST,0,0 +8090,TEST,0,0 +8091,TEST,0,0 +8092,TEST,0,0 +8093,TEST,0,0 +8094,TEST,0,0 +8095,TEST,0,0 +8096,TEST,0,0 +8097,TEST,0,0 +8098,TEST,0,0 +8099,TEST,0,0 +8100,TEST,0,0 +8101,TEST,0,0 +8102,TEST,0,0 +8103,TEST,0,0 +8104,TEST,0,0 +8105,TEST,0,0 +8106,TEST,0,0 +8107,TEST,0,0 +8108,TEST,0,0 +8109,TEST,0,0 +8110,TEST,0,0 +8111,TEST,0,0 +8112,TEST,0,0 +8113,TEST,0,0 +8114,TEST,0,0 +8115,TEST,0,0 +8116,TEST,0,0 +8117,TEST,0,0 +8118,TEST,0,0 +8119,TEST,0,0 +8120,TEST,0,0 +8121,TEST,0,0 +8122,TEST,0,0 +8123,TEST,0,0 +8124,TEST,0,0 +8125,TEST,0,0 +8126,TEST,0,0 +8127,TEST,0,0 +8128,TEST,0,0 +8129,TEST,0,0 +8130,TEST,0,0 +8131,TEST,0,0 +8132,TEST,0,0 +8133,TEST,0,0 +8134,TEST,0,0 +8135,TEST,0,0 +8136,TEST,0,0 +8137,TEST,0,0 +8138,TEST,0,0 +8139,TEST,0,0 +8140,TEST,0,0 +8141,TEST,0,0 +8142,TEST,0,0 +8143,TEST,0,0 +8144,TEST,0,0 +8145,TEST,0,0 +8146,TEST,0,0 +8147,TEST,0,0 +8148,TEST,0,0 +8149,TEST,0,0 +8150,TEST,0,0 +8151,TEST,0,0 +8152,TEST,0,0 +8153,TEST,0,0 +8154,TEST,0,0 +8155,TEST,0,0 +8156,TEST,0,0 +8157,TEST,0,0 +8158,TEST,0,0 +8159,TEST,0,0 +8160,TEST,0,0 +8161,TEST,0,0 +8162,TEST,0,0 +8163,TEST,0,0 +8164,TEST,0,0 +8165,TEST,0,0 +8166,TEST,0,0 +8167,TEST,0,0 +8168,TEST,0,0 +8169,TEST,0,0 +8170,TEST,0,0 +8171,TEST,0,0 +8172,TEST,0,0 +8173,TEST,0,0 +8174,TEST,0,0 +8175,TEST,0,0 +8176,TEST,0,0 +8177,TEST,0,0 +8178,TEST,0,0 +8179,TEST,0,0 +8180,TEST,0,0 +8181,TEST,0,0 +8182,TEST,0,0 +8183,TEST,0,0 +8184,TEST,0,0 +8185,TEST,0,0 +8186,TEST,0,0 +8187,TEST,0,0 +8188,TEST,0,0 +8189,TEST,0,0 +8190,TEST,0,0 +8191,TEST,0,0 +8192,TEST,0,0 +8193,TEST,0,0 +8194,TEST,0,0 +8195,TEST,0,0 +8196,TEST,0,0 +8197,TEST,0,0 +8198,TEST,0,0 +8199,TEST,0,0 +8200,TEST,0,0 +8201,TEST,0,0 +8202,TEST,0,0 +8203,TEST,0,0 +8204,TEST,0,0 +8205,TEST,0,0 +8206,TEST,0,0 +8207,TEST,0,0 +8208,TEST,0,0 +8209,TEST,0,0 +8210,TEST,0,0 +8211,TEST,0,0 +8212,TEST,0,0 +8213,TEST,0,0 +8214,TEST,0,0 +8215,TEST,0,0 +8216,TEST,0,0 +8217,TEST,0,0 +8218,TEST,0,0 +8219,TEST,0,0 +8220,TEST,0,0 +8221,TEST,0,0 +8222,TEST,0,0 +8223,TEST,0,0 
+8224,TEST,0,0 +8225,TEST,0,0 +8226,TEST,0,0 +8227,TEST,0,0 +8228,TEST,0,0 +8229,TEST,0,0 +8230,TEST,0,0 +8231,TEST,0,0 +8232,TEST,0,0 +8233,TEST,0,0 +8234,TEST,0,0 +8235,TEST,0,0 +8236,TEST,0,0 +8237,TEST,0,0 +8238,TEST,0,0 +8239,TEST,0,0 +8240,TEST,0,0 +8241,TEST,0,0 +8242,TEST,0,0 +8243,TEST,0,0 +8244,TEST,0,0 +8245,TEST,0,0 +8246,TEST,0,0 +8247,TEST,0,0 +8248,TEST,0,0 +8249,TEST,0,0 +8250,TEST,0,0 +8251,TEST,0,0 +8252,TEST,0,0 +8253,TEST,0,0 +8254,TEST,0,0 +8255,TEST,0,0 +8256,TEST,0,0 +8257,TEST,0,0 +8258,TEST,0,0 +8259,TEST,0,0 +8260,TEST,0,0 +8261,TEST,0,0 +8262,TEST,0,0 +8263,TEST,0,0 +8264,TEST,0,0 +8265,TEST,0,0 +8266,TEST,0,0 +8267,TEST,0,0 +8268,TEST,0,0 +8269,TEST,0,0 +8270,TEST,0,0 +8271,TEST,0,0 +8272,TEST,0,0 +8273,TEST,0,0 +8274,TEST,0,0 +8275,TEST,0,0 +8276,TEST,0,0 +8277,TEST,0,0 +8278,TEST,0,0 +8279,TEST,0,0 +8280,TEST,0,0 +8281,TEST,0,0 +8282,TEST,0,0 +8283,TEST,0,0 +8284,TEST,0,0 +8285,TEST,0,0 +8286,TEST,0,0 +8287,TEST,0,0 +8288,TEST,0,0 +8289,TEST,0,0 +8290,TEST,0,0 +8291,TEST,0,0 +8292,TEST,0,0 +8293,TEST,0,0 +8294,TEST,0,0 +8295,TEST,0,0 +8296,TEST,0,0 +8297,TEST,0,0 +8298,TEST,0,0 +8299,TEST,0,0 +8300,TEST,0,0 +8301,TEST,0,0 +8302,TEST,0,0 +8303,TEST,0,0 +8304,TEST,0,0 +8305,TEST,0,0 +8306,TEST,0,0 +8307,TEST,0,0 +8308,TEST,0,0 +8309,TEST,0,0 +8310,TEST,0,0 +8311,TEST,0,0 +8312,TEST,0,0 +8313,TEST,0,0 +8314,TEST,0,0 +8315,TEST,0,0 +8316,TEST,0,0 +8317,TEST,0,0 +8318,TEST,0,0 +8319,TEST,0,0 +8320,TEST,0,0 +8321,TEST,0,0 +8322,TEST,0,0 +8323,TEST,0,0 +8324,TEST,0,0 +8325,TEST,0,0 +8326,TEST,0,0 +8327,TEST,0,0 +8328,TEST,0,0 +8329,TEST,0,0 +8330,TEST,0,0 +8331,TEST,0,0 +8332,TEST,0,0 +8333,TEST,0,0 +8334,TEST,0,0 +8335,TEST,0,0 +8336,TEST,0,0 +8337,TEST,0,0 +8338,TEST,0,0 +8339,TEST,0,0 +8340,TEST,0,0 +8341,TEST,0,0 +8342,TEST,0,0 +8343,TEST,0,0 +8344,TEST,0,0 +8345,TEST,0,0 +8346,TEST,0,0 +8347,TEST,0,0 +8348,TEST,0,0 +8349,TEST,0,0 +8350,TEST,0,0 +8351,TEST,0,0 +8352,TEST,0,0 +8353,TEST,0,0 +8354,TEST,0,0 +8355,TEST,0,0 +8356,TEST,0,0 +8357,TEST,0,0 +8358,TEST,0,0 +8359,TEST,0,0 +8360,TEST,0,0 +8361,TEST,0,0 +8362,TEST,0,0 +8363,TEST,0,0 +8364,TEST,0,0 +8365,TEST,0,0 +8366,TEST,0,0 +8367,TEST,0,0 +8368,TEST,0,0 +8369,TEST,0,0 +8370,TEST,0,0 +8371,TEST,0,0 +8372,TEST,0,0 +8373,TEST,0,0 +8374,TEST,0,0 +8375,TEST,0,0 +8376,TEST,0,0 +8377,TEST,0,0 +8378,TEST,0,0 +8379,TEST,0,0 +8380,TEST,0,0 +8381,TEST,0,0 +8382,TEST,0,0 +8383,TEST,0,0 +8384,TEST,0,0 +8385,TEST,0,0 +8386,TEST,0,0 +8387,TEST,0,0 +8388,TEST,0,0 +8389,TEST,0,0 +8390,TEST,0,0 +8391,TEST,0,0 +8392,TEST,0,0 +8393,TEST,0,0 +8394,TEST,0,0 +8395,TEST,0,0 +8396,TEST,0,0 +8397,TEST,0,0 +8398,TEST,0,0 +8399,TEST,0,0 +8400,TEST,0,0 +8401,TEST,0,0 +8402,TEST,0,0 +8403,TEST,0,0 +8404,TEST,0,0 +8405,TEST,0,0 +8406,TEST,0,0 +8407,TEST,0,0 +8408,TEST,0,0 +8409,TEST,0,0 +8410,TEST,0,0 +8411,TEST,0,0 +8412,TEST,0,0 +8413,TEST,0,0 +8414,TEST,0,0 +8415,TEST,0,0 +8416,TEST,0,0 +8417,TEST,0,0 +8418,TEST,0,0 +8419,TEST,0,0 +8420,TEST,0,0 +8421,TEST,0,0 +8422,TEST,0,0 +8423,TEST,0,0 +8424,TEST,0,0 +8425,TEST,0,0 +8426,TEST,0,0 +8427,TEST,0,0 +8428,TEST,0,0 +8429,TEST,0,0 +8430,TEST,0,0 +8431,TEST,0,0 +8432,TEST,0,0 +8433,TEST,0,0 +8434,TEST,0,0 +8435,TEST,0,0 +8436,TEST,0,0 +8437,TEST,0,0 +8438,TEST,0,0 +8439,TEST,0,0 +8440,TEST,0,0 +8441,TEST,0,0 +8442,TEST,0,0 +8443,TEST,0,0 +8444,TEST,0,0 +8445,TEST,0,0 +8446,TEST,0,0 +8447,TEST,0,0 +8448,TEST,0,0 +8449,TEST,0,0 +8450,TEST,0,0 +8451,TEST,0,0 +8452,TEST,0,0 +8453,TEST,0,0 +8454,TEST,0,0 +8455,TEST,0,0 +8456,TEST,0,0 +8457,TEST,0,0 +8458,TEST,0,0 +8459,TEST,0,0 +8460,TEST,0,0 
+8461,TEST,0,0 +8462,TEST,0,0 +8463,TEST,0,0 +8464,TEST,0,0 +8465,TEST,0,0 +8466,TEST,0,0 +8467,TEST,0,0 +8468,TEST,0,0 +8469,TEST,0,0 +8470,TEST,0,0 +8471,TEST,0,0 +8472,TEST,0,0 +8473,TEST,0,0 +8474,TEST,0,0 +8475,TEST,0,0 +8476,TEST,0,0 +8477,TEST,0,0 +8478,TEST,0,0 +8479,TEST,0,0 +8480,TEST,0,0 +8481,TEST,0,0 +8482,TEST,0,0 +8483,TEST,0,0 +8484,TEST,0,0 +8485,TEST,0,0 +8486,TEST,0,0 +8487,TEST,0,0 +8488,TEST,0,0 +8489,TEST,0,0 +8490,TEST,0,0 +8491,TEST,0,0 +8492,TEST,0,0 +8493,TEST,0,0 +8494,TEST,0,0 +8495,TEST,0,0 +8496,TEST,0,0 +8497,TEST,0,0 +8498,TEST,0,0 +8499,TEST,0,0 +8500,TEST,0,0 +8501,TEST,0,0 +8502,TEST,0,0 +8503,TEST,0,0 +8504,TEST,0,0 +8505,TEST,0,0 +8506,TEST,0,0 +8507,TEST,0,0 +8508,TEST,0,0 +8509,TEST,0,0 +8510,TEST,0,0 +8511,TEST,0,0 +8512,TEST,0,0 +8513,TEST,0,0 +8514,TEST,0,0 +8515,TEST,0,0 +8516,TEST,0,0 +8517,TEST,0,0 +8518,TEST,0,0 +8519,TEST,0,0 +8520,TEST,0,0 +8521,TEST,0,0 +8522,TEST,0,0 +8523,TEST,0,0 +8524,TEST,0,0 +8525,TEST,0,0 +8526,TEST,0,0 +8527,TEST,0,0 +8528,TEST,0,0 +8529,TEST,0,0 +8530,TEST,0,0 +8531,TEST,0,0 +8532,TEST,0,0 +8533,TEST,0,0 +8534,TEST,0,0 +8535,TEST,0,0 +8536,TEST,0,0 +8537,TEST,0,0 +8538,TEST,0,0 +8539,TEST,0,0 +8540,TEST,0,0 +8541,TEST,0,0 +8542,TEST,0,0 +8543,TEST,0,0 +8544,TEST,0,0 +8545,TEST,0,0 +8546,TEST,0,0 +8547,TEST,0,0 +8548,TEST,0,0 +8549,TEST,0,0 +8550,TEST,0,0 +8551,TEST,0,0 +8552,TEST,0,0 +8553,TEST,0,0 +8554,TEST,0,0 +8555,TEST,0,0 +8556,TEST,0,0 +8557,TEST,0,0 +8558,TEST,0,0 +8559,TEST,0,0 +8560,TEST,0,0 +8561,TEST,0,0 +8562,TEST,0,0 +8563,TEST,0,0 +8564,TEST,0,0 +8565,TEST,0,0 +8566,TEST,0,0 +8567,TEST,0,0 +8568,TEST,0,0 +8569,TEST,0,0 +8570,TEST,0,0 +8571,TEST,0,0 +8572,TEST,0,0 +8573,TEST,0,0 +8574,TEST,0,0 +8575,TEST,0,0 +8576,TEST,0,0 +8577,TEST,0,0 +8578,TEST,0,0 +8579,TEST,0,0 +8580,TEST,0,0 +8581,TEST,0,0 +8582,TEST,0,0 +8583,TEST,0,0 +8584,TEST,0,0 +8585,TEST,0,0 +8586,TEST,0,0 +8587,TEST,0,0 +8588,TEST,0,0 +8589,TEST,0,0 +8590,TEST,0,0 +8591,TEST,0,0 +8592,TEST,0,0 +8593,TEST,0,0 +8594,TEST,0,0 +8595,TEST,0,0 +8596,TEST,0,0 +8597,TEST,0,0 +8598,TEST,0,0 +8599,TEST,0,0 +8600,TEST,0,0 +8601,TEST,0,0 +8602,TEST,0,0 +8603,TEST,0,0 +8604,TEST,0,0 +8605,TEST,0,0 +8606,TEST,0,0 +8607,TEST,0,0 +8608,TEST,0,0 +8609,TEST,0,0 +8610,TEST,0,0 +8611,TEST,0,0 +8612,TEST,0,0 +8613,TEST,0,0 +8614,TEST,0,0 +8615,TEST,0,0 +8616,TEST,0,0 +8617,TEST,0,0 +8618,TEST,0,0 +8619,TEST,0,0 +8620,TEST,0,0 +8621,TEST,0,0 +8622,TEST,0,0 +8623,TEST,0,0 +8624,TEST,0,0 +8625,TEST,0,0 +8626,TEST,0,0 +8627,TEST,0,0 +8628,TEST,0,0 +8629,TEST,0,0 +8630,TEST,0,0 +8631,TEST,0,0 +8632,TEST,0,0 +8633,TEST,0,0 +8634,TEST,0,0 +8635,TEST,0,0 +8636,TEST,0,0 +8637,TEST,0,0 +8638,TEST,0,0 +8639,TEST,0,0 +8640,TEST,0,0 +8641,TEST,0,0 +8642,TEST,0,0 +8643,TEST,0,0 +8644,TEST,0,0 +8645,TEST,0,0 +8646,TEST,0,0 +8647,TEST,0,0 +8648,TEST,0,0 +8649,TEST,0,0 +8650,TEST,0,0 +8651,TEST,0,0 +8652,TEST,0,0 +8653,TEST,0,0 +8654,TEST,0,0 +8655,TEST,0,0 +8656,TEST,0,0 +8657,TEST,0,0 +8658,TEST,0,0 +8659,TEST,0,0 +8660,TEST,0,0 +8661,TEST,0,0 +8662,TEST,0,0 +8663,TEST,0,0 +8664,TEST,0,0 +8665,TEST,0,0 +8666,TEST,0,0 +8667,TEST,0,0 +8668,TEST,0,0 +8669,TEST,0,0 +8670,TEST,0,0 +8671,TEST,0,0 +8672,TEST,0,0 +8673,TEST,0,0 +8674,TEST,0,0 +8675,TEST,0,0 +8676,TEST,0,0 +8677,TEST,0,0 +8678,TEST,0,0 +8679,TEST,0,0 +8680,TEST,0,0 +8681,TEST,0,0 +8682,TEST,0,0 +8683,TEST,0,0 +8684,TEST,0,0 +8685,TEST,0,0 +8686,TEST,0,0 +8687,TEST,0,0 +8688,TEST,0,0 +8689,TEST,0,0 +8690,TEST,0,0 +8691,TEST,0,0 +8692,TEST,0,0 +8693,TEST,0,0 +8694,TEST,0,0 +8695,TEST,0,0 +8696,TEST,0,0 +8697,TEST,0,0 
+8698,TEST,0,0 +8699,TEST,0,0 +8700,TEST,0,0 +8701,TEST,0,0 +8702,TEST,0,0 +8703,TEST,0,0 +8704,TEST,0,0 +8705,TEST,0,0 +8706,TEST,0,0 +8707,TEST,0,0 +8708,TEST,0,0 +8709,TEST,0,0 +8710,TEST,0,0 +8711,TEST,0,0 +8712,TEST,0,0 +8713,TEST,0,0 +8714,TEST,0,0 +8715,TEST,0,0 +8716,TEST,0,0 +8717,TEST,0,0 +8718,TEST,0,0 +8719,TEST,0,0 +8720,TEST,0,0 +8721,TEST,0,0 +8722,TEST,0,0 +8723,TEST,0,0 +8724,TEST,0,0 +8725,TEST,0,0 +8726,TEST,0,0 +8727,TEST,0,0 +8728,TEST,0,0 +8729,TEST,0,0 +8730,TEST,0,0 +8731,TEST,0,0 +8732,TEST,0,0 +8733,TEST,0,0 +8734,TEST,0,0 +8735,TEST,0,0 +8736,TEST,0,0 +8737,TEST,0,0 +8738,TEST,0,0 +8739,TEST,0,0 +8740,TEST,0,0 +8741,TEST,0,0 +8742,TEST,0,0 +8743,TEST,0,0 +8744,TEST,0,0 +8745,TEST,0,0 +8746,TEST,0,0 +8747,TEST,0,0 +8748,TEST,0,0 +8749,TEST,0,0 +8750,TEST,0,0 +8751,TEST,0,0 +8752,TEST,0,0 +8753,TEST,0,0 +8754,TEST,0,0 +8755,TEST,0,0 +8756,TEST,0,0 +8757,TEST,0,0 +8758,TEST,0,0 +8759,TEST,0,0 +8760,TEST,0,0 +8761,TEST,0,0 +8762,TEST,0,0 +8763,TEST,0,0 +8764,TEST,0,0 +8765,TEST,0,0 +8766,TEST,0,0 +8767,TEST,0,0 +8768,TEST,0,0 +8769,TEST,0,0 +8770,TEST,0,0 +8771,TEST,0,0 +8772,TEST,0,0 +8773,TEST,0,0 +8774,TEST,0,0 +8775,TEST,0,0 +8776,TEST,0,0 +8777,TEST,0,0 +8778,TEST,0,0 +8779,TEST,0,0 +8780,TEST,0,0 +8781,TEST,0,0 +8782,TEST,0,0 +8783,TEST,0,0 +8784,TEST,0,0 +8785,TEST,0,0 +8786,TEST,0,0 +8787,TEST,0,0 +8788,TEST,0,0 +8789,TEST,0,0 +8790,TEST,0,0 +8791,TEST,0,0 +8792,TEST,0,0 +8793,TEST,0,0 +8794,TEST,0,0 +8795,TEST,0,0 +8796,TEST,0,0 +8797,TEST,0,0 +8798,TEST,0,0 +8799,TEST,0,0 +8800,TEST,0,0 +8801,TEST,0,0 +8802,TEST,0,0 +8803,TEST,0,0 +8804,TEST,0,0 +8805,TEST,0,0 +8806,TEST,0,0 +8807,TEST,0,0 +8808,TEST,0,0 +8809,TEST,0,0 +8810,TEST,0,0 +8811,TEST,0,0 +8812,TEST,0,0 +8813,TEST,0,0 +8814,TEST,0,0 +8815,TEST,0,0 +8816,TEST,0,0 +8817,TEST,0,0 +8818,TEST,0,0 +8819,TEST,0,0 +8820,TEST,0,0 +8821,TEST,0,0 +8822,TEST,0,0 +8823,TEST,0,0 +8824,TEST,0,0 +8825,TEST,0,0 +8826,TEST,0,0 +8827,TEST,0,0 +8828,TEST,0,0 +8829,TEST,0,0 +8830,TEST,0,0 +8831,TEST,0,0 +8832,TEST,0,0 +8833,TEST,0,0 +8834,TEST,0,0 +8835,TEST,0,0 +8836,TEST,0,0 +8837,TEST,0,0 +8838,TEST,0,0 +8839,TEST,0,0 +8840,TEST,0,0 +8841,TEST,0,0 +8842,TEST,0,0 +8843,TEST,0,0 +8844,TEST,0,0 +8845,TEST,0,0 +8846,TEST,0,0 +8847,TEST,0,0 +8848,TEST,0,0 +8849,TEST,0,0 +8850,TEST,0,0 +8851,TEST,0,0 +8852,TEST,0,0 +8853,TEST,0,0 +8854,TEST,0,0 +8855,TEST,0,0 +8856,TEST,0,0 +8857,TEST,0,0 +8858,TEST,0,0 +8859,TEST,0,0 +8860,TEST,0,0 +8861,TEST,0,0 +8862,TEST,0,0 +8863,TEST,0,0 +8864,TEST,0,0 +8865,TEST,0,0 +8866,TEST,0,0 +8867,TEST,0,0 +8868,TEST,0,0 +8869,TEST,0,0 +8870,TEST,0,0 +8871,TEST,0,0 +8872,TEST,0,0 +8873,TEST,0,0 +8874,TEST,0,0 +8875,TEST,0,0 +8876,TEST,0,0 +8877,TEST,0,0 +8878,TEST,0,0 +8879,TEST,0,0 +8880,TEST,0,0 +8881,TEST,0,0 +8882,TEST,0,0 +8883,TEST,0,0 +8884,TEST,0,0 +8885,TEST,0,0 +8886,TEST,0,0 +8887,TEST,0,0 +8888,TEST,0,0 +8889,TEST,0,0 +8890,TEST,0,0 +8891,TEST,0,0 +8892,TEST,0,0 +8893,TEST,0,0 +8894,TEST,0,0 +8895,TEST,0,0 +8896,TEST,0,0 +8897,TEST,0,0 +8898,TEST,0,0 +8899,TEST,0,0 +8900,TEST,0,0 +8901,TEST,0,0 +8902,TEST,0,0 +8903,TEST,0,0 +8904,TEST,0,0 +8905,TEST,0,0 +8906,TEST,0,0 +8907,TEST,0,0 +8908,TEST,0,0 +8909,TEST,0,0 +8910,TEST,0,0 +8911,TEST,0,0 +8912,TEST,0,0 +8913,TEST,0,0 +8914,TEST,0,0 +8915,TEST,0,0 +8916,TEST,0,0 +8917,TEST,0,0 +8918,TEST,0,0 +8919,TEST,0,0 +8920,TEST,0,0 +8921,TEST,0,0 +8922,TEST,0,0 +8923,TEST,0,0 +8924,TEST,0,0 +8925,TEST,0,0 +8926,TEST,0,0 +8927,TEST,0,0 +8928,TEST,0,0 +8929,TEST,0,0 +8930,TEST,0,0 +8931,TEST,0,0 +8932,TEST,0,0 +8933,TEST,0,0 +8934,TEST,0,0 
+8935,TEST,0,0 +8936,TEST,0,0 +8937,TEST,0,0 +8938,TEST,0,0 +8939,TEST,0,0 +8940,TEST,0,0 +8941,TEST,0,0 +8942,TEST,0,0 +8943,TEST,0,0 +8944,TEST,0,0 +8945,TEST,0,0 +8946,TEST,0,0 +8947,TEST,0,0 +8948,TEST,0,0 +8949,TEST,0,0 +8950,TEST,0,0 +8951,TEST,0,0 +8952,TEST,0,0 +8953,TEST,0,0 +8954,TEST,0,0 +8955,TEST,0,0 +8956,TEST,0,0 +8957,TEST,0,0 +8958,TEST,0,0 +8959,TEST,0,0 +8960,TEST,0,0 +8961,TEST,0,0 +8962,TEST,0,0 +8963,TEST,0,0 +8964,TEST,0,0 +8965,TEST,0,0 +8966,TEST,0,0 +8967,TEST,0,0 +8968,TEST,0,0 +8969,TEST,0,0 +8970,TEST,0,0 +8971,TEST,0,0 +8972,TEST,0,0 +8973,TEST,0,0 +8974,TEST,0,0 +8975,TEST,0,0 +8976,TEST,0,0 +8977,TEST,0,0 +8978,TEST,0,0 +8979,TEST,0,0 +8980,TEST,0,0 +8981,TEST,0,0 +8982,TEST,0,0 +8983,TEST,0,0 +8984,TEST,0,0 +8985,TEST,0,0 +8986,TEST,0,0 +8987,TEST,0,0 +8988,TEST,0,0 +8989,TEST,0,0 +8990,TEST,0,0 +8991,TEST,0,0 +8992,TEST,0,0 +8993,TEST,0,0 +8994,TEST,0,0 +8995,TEST,0,0 +8996,TEST,0,0 +8997,TEST,0,0 +8998,TEST,0,0 +8999,TEST,0,0 +9000,TEST,0,0 +9001,TEST,0,0 +9002,TEST,0,0 +9003,TEST,0,0 +9004,TEST,0,0 +9005,TEST,0,0 +9006,TEST,0,0 +9007,TEST,0,0 +9008,TEST,0,0 +9009,TEST,0,0 +9010,TEST,0,0 +9011,TEST,0,0 +9012,TEST,0,0 +9013,TEST,0,0 +9014,TEST,0,0 +9015,TEST,0,0 +9016,TEST,0,0 +9017,TEST,0,0 +9018,TEST,0,0 +9019,TEST,0,0 +9020,TEST,0,0 +9021,TEST,0,0 +9022,TEST,0,0 +9023,TEST,0,0 +9024,TEST,0,0 +9025,TEST,0,0 +9026,TEST,0,0 +9027,TEST,0,0 +9028,TEST,0,0 +9029,TEST,0,0 +9030,TEST,0,0 +9031,TEST,0,0 +9032,TEST,0,0 +9033,TEST,0,0 +9034,TEST,0,0 +9035,TEST,0,0 +9036,TEST,0,0 +9037,TEST,0,0 +9038,TEST,0,0 +9039,TEST,0,0 +9040,TEST,0,0 +9041,TEST,0,0 +9042,TEST,0,0 +9043,TEST,0,0 +9044,TEST,0,0 +9045,TEST,0,0 +9046,TEST,0,0 +9047,TEST,0,0 +9048,TEST,0,0 +9049,TEST,0,0 +9050,TEST,0,0 +9051,TEST,0,0 +9052,TEST,0,0 +9053,TEST,0,0 +9054,TEST,0,0 +9055,TEST,0,0 +9056,TEST,0,0 +9057,TEST,0,0 +9058,TEST,0,0 +9059,TEST,0,0 +9060,TEST,0,0 +9061,TEST,0,0 +9062,TEST,0,0 +9063,TEST,0,0 +9064,TEST,0,0 +9065,TEST,0,0 +9066,TEST,0,0 +9067,TEST,0,0 +9068,TEST,0,0 +9069,TEST,0,0 +9070,TEST,0,0 +9071,TEST,0,0 +9072,TEST,0,0 +9073,TEST,0,0 +9074,TEST,0,0 +9075,TEST,0,0 +9076,TEST,0,0 +9077,TEST,0,0 +9078,TEST,0,0 +9079,TEST,0,0 +9080,TEST,0,0 +9081,TEST,0,0 +9082,TEST,0,0 +9083,TEST,0,0 +9084,TEST,0,0 +9085,TEST,0,0 +9086,TEST,0,0 +9087,TEST,0,0 +9088,TEST,0,0 +9089,TEST,0,0 +9090,TEST,0,0 +9091,TEST,0,0 +9092,TEST,0,0 +9093,TEST,0,0 +9094,TEST,0,0 +9095,TEST,0,0 +9096,TEST,0,0 +9097,TEST,0,0 +9098,TEST,0,0 +9099,TEST,0,0 +9100,TEST,0,0 +9101,TEST,0,0 +9102,TEST,0,0 +9103,TEST,0,0 +9104,TEST,0,0 +9105,TEST,0,0 +9106,TEST,0,0 +9107,TEST,0,0 +9108,TEST,0,0 +9109,TEST,0,0 +9110,TEST,0,0 +9111,TEST,0,0 +9112,TEST,0,0 +9113,TEST,0,0 +9114,TEST,0,0 +9115,TEST,0,0 +9116,TEST,0,0 +9117,TEST,0,0 +9118,TEST,0,0 +9119,TEST,0,0 +9120,TEST,0,0 +9121,TEST,0,0 +9122,TEST,0,0 +9123,TEST,0,0 +9124,TEST,0,0 +9125,TEST,0,0 +9126,TEST,0,0 +9127,TEST,0,0 +9128,TEST,0,0 +9129,TEST,0,0 +9130,TEST,0,0 +9131,TEST,0,0 +9132,TEST,0,0 +9133,TEST,0,0 +9134,TEST,0,0 +9135,TEST,0,0 +9136,TEST,0,0 +9137,TEST,0,0 +9138,TEST,0,0 +9139,TEST,0,0 +9140,TEST,0,0 +9141,TEST,0,0 +9142,TEST,0,0 +9143,TEST,0,0 +9144,TEST,0,0 +9145,TEST,0,0 +9146,TEST,0,0 +9147,TEST,0,0 +9148,TEST,0,0 +9149,TEST,0,0 +9150,TEST,0,0 +9151,TEST,0,0 +9152,TEST,0,0 +9153,TEST,0,0 +9154,TEST,0,0 +9155,TEST,0,0 +9156,TEST,0,0 +9157,TEST,0,0 +9158,TEST,0,0 +9159,TEST,0,0 +9160,TEST,0,0 +9161,TEST,0,0 +9162,TEST,0,0 +9163,TEST,0,0 +9164,TEST,0,0 +9165,TEST,0,0 +9166,TEST,0,0 +9167,TEST,0,0 +9168,TEST,0,0 +9169,TEST,0,0 +9170,TEST,0,0 +9171,TEST,0,0 
+9172,TEST,0,0 +9173,TEST,0,0 +9174,TEST,0,0 +9175,TEST,0,0 +9176,TEST,0,0 +9177,TEST,0,0 +9178,TEST,0,0 +9179,TEST,0,0 +9180,TEST,0,0 +9181,TEST,0,0 +9182,TEST,0,0 +9183,TEST,0,0 +9184,TEST,0,0 +9185,TEST,0,0 +9186,TEST,0,0 +9187,TEST,0,0 +9188,TEST,0,0 +9189,TEST,0,0 +9190,TEST,0,0 +9191,TEST,0,0 +9192,TEST,0,0 +9193,TEST,0,0 +9194,TEST,0,0 +9195,TEST,0,0 +9196,TEST,0,0 +9197,TEST,0,0 +9198,TEST,0,0 +9199,TEST,0,0 +9200,TEST,0,0 +9201,TEST,0,0 +9202,TEST,0,0 +9203,TEST,0,0 +9204,TEST,0,0 +9205,TEST,0,0 +9206,TEST,0,0 +9207,TEST,0,0 +9208,TEST,0,0 +9209,TEST,0,0 +9210,TEST,0,0 +9211,TEST,0,0 +9212,TEST,0,0 +9213,TEST,0,0 +9214,TEST,0,0 +9215,TEST,0,0 +9216,TEST,0,0 +9217,TEST,0,0 +9218,TEST,0,0 +9219,TEST,0,0 +9220,TEST,0,0 +9221,TEST,0,0 +9222,TEST,0,0 +9223,TEST,0,0 +9224,TEST,0,0 +9225,TEST,0,0 +9226,TEST,0,0 +9227,TEST,0,0 +9228,TEST,0,0 +9229,TEST,0,0 +9230,TEST,0,0 +9231,TEST,0,0 +9232,TEST,0,0 +9233,TEST,0,0 +9234,TEST,0,0 +9235,TEST,0,0 +9236,TEST,0,0 +9237,TEST,0,0 +9238,TEST,0,0 +9239,TEST,0,0 +9240,TEST,0,0 +9241,TEST,0,0 +9242,TEST,0,0 +9243,TEST,0,0 +9244,TEST,0,0 +9245,TEST,0,0 +9246,TEST,0,0 +9247,TEST,0,0 +9248,TEST,0,0 +9249,TEST,0,0 +9250,TEST,0,0 +9251,TEST,0,0 +9252,TEST,0,0 +9253,TEST,0,0 +9254,TEST,0,0 +9255,TEST,0,0 +9256,TEST,0,0 +9257,TEST,0,0 +9258,TEST,0,0 +9259,TEST,0,0 +9260,TEST,0,0 +9261,TEST,0,0 +9262,TEST,0,0 +9263,TEST,0,0 +9264,TEST,0,0 +9265,TEST,0,0 +9266,TEST,0,0 +9267,TEST,0,0 +9268,TEST,0,0 +9269,TEST,0,0 +9270,TEST,0,0 +9271,TEST,0,0 +9272,TEST,0,0 +9273,TEST,0,0 +9274,TEST,0,0 +9275,TEST,0,0 +9276,TEST,0,0 +9277,TEST,0,0 +9278,TEST,0,0 +9279,TEST,0,0 +9280,TEST,0,0 +9281,TEST,0,0 +9282,TEST,0,0 +9283,TEST,0,0 +9284,TEST,0,0 +9285,TEST,0,0 +9286,TEST,0,0 +9287,TEST,0,0 +9288,TEST,0,0 +9289,TEST,0,0 +9290,TEST,0,0 +9291,TEST,0,0 +9292,TEST,0,0 +9293,TEST,0,0 +9294,TEST,0,0 +9295,TEST,0,0 +9296,TEST,0,0 +9297,TEST,0,0 +9298,TEST,0,0 +9299,TEST,0,0 +9300,TEST,0,0 +9301,TEST,0,0 +9302,TEST,0,0 +9303,TEST,0,0 +9304,TEST,0,0 +9305,TEST,0,0 +9306,TEST,0,0 +9307,TEST,0,0 +9308,TEST,0,0 +9309,TEST,0,0 +9310,TEST,0,0 +9311,TEST,0,0 +9312,TEST,0,0 +9313,TEST,0,0 +9314,TEST,0,0 +9315,TEST,0,0 +9316,TEST,0,0 +9317,TEST,0,0 +9318,TEST,0,0 +9319,TEST,0,0 +9320,TEST,0,0 +9321,TEST,0,0 +9322,TEST,0,0 +9323,TEST,0,0 +9324,TEST,0,0 +9325,TEST,0,0 +9326,TEST,0,0 +9327,TEST,0,0 +9328,TEST,0,0 +9329,TEST,0,0 +9330,TEST,0,0 +9331,TEST,0,0 +9332,TEST,0,0 +9333,TEST,0,0 +9334,TEST,0,0 +9335,TEST,0,0 +9336,TEST,0,0 +9337,TEST,0,0 +9338,TEST,0,0 +9339,TEST,0,0 +9340,TEST,0,0 +9341,TEST,0,0 +9342,TEST,0,0 +9343,TEST,0,0 +9344,TEST,0,0 +9345,TEST,0,0 +9346,TEST,0,0 +9347,TEST,0,0 +9348,TEST,0,0 +9349,TEST,0,0 +9350,TEST,0,0 +9351,TEST,0,0 +9352,TEST,0,0 +9353,TEST,0,0 +9354,TEST,0,0 +9355,TEST,0,0 +9356,TEST,0,0 +9357,TEST,0,0 +9358,TEST,0,0 +9359,TEST,0,0 +9360,TEST,0,0 +9361,TEST,0,0 +9362,TEST,0,0 +9363,TEST,0,0 +9364,TEST,0,0 +9365,TEST,0,0 +9366,TEST,0,0 +9367,TEST,0,0 +9368,TEST,0,0 +9369,TEST,0,0 +9370,TEST,0,0 +9371,TEST,0,0 +9372,TEST,0,0 +9373,TEST,0,0 +9374,TEST,0,0 +9375,TEST,0,0 +9376,TEST,0,0 +9377,TEST,0,0 +9378,TEST,0,0 +9379,TEST,0,0 +9380,TEST,0,0 +9381,TEST,0,0 +9382,TEST,0,0 +9383,TEST,0,0 +9384,TEST,0,0 +9385,TEST,0,0 +9386,TEST,0,0 +9387,TEST,0,0 +9388,TEST,0,0 +9389,TEST,0,0 +9390,TEST,0,0 +9391,TEST,0,0 +9392,TEST,0,0 +9393,TEST,0,0 +9394,TEST,0,0 +9395,TEST,0,0 +9396,TEST,0,0 +9397,TEST,0,0 +9398,TEST,0,0 +9399,TEST,0,0 +9400,TEST,0,0 +9401,TEST,0,0 +9402,TEST,0,0 +9403,TEST,0,0 +9404,TEST,0,0 +9405,TEST,0,0 +9406,TEST,0,0 +9407,TEST,0,0 +9408,TEST,0,0 
+9409,TEST,0,0 +9410,TEST,0,0 +9411,TEST,0,0 +9412,TEST,0,0 +9413,TEST,0,0 +9414,TEST,0,0 +9415,TEST,0,0 +9416,TEST,0,0 +9417,TEST,0,0 +9418,TEST,0,0 +9419,TEST,0,0 +9420,TEST,0,0 +9421,TEST,0,0 +9422,TEST,0,0 +9423,TEST,0,0 +9424,TEST,0,0 +9425,TEST,0,0 +9426,TEST,0,0 +9427,TEST,0,0 +9428,TEST,0,0 +9429,TEST,0,0 +9430,TEST,0,0 +9431,TEST,0,0 +9432,TEST,0,0 +9433,TEST,0,0 +9434,TEST,0,0 +9435,TEST,0,0 +9436,TEST,0,0 +9437,TEST,0,0 +9438,TEST,0,0 +9439,TEST,0,0 +9440,TEST,0,0 +9441,TEST,0,0 +9442,TEST,0,0 +9443,TEST,0,0 +9444,TEST,0,0 +9445,TEST,0,0 +9446,TEST,0,0 +9447,TEST,0,0 +9448,TEST,0,0 +9449,TEST,0,0 +9450,TEST,0,0 +9451,TEST,0,0 +9452,TEST,0,0 +9453,TEST,0,0 +9454,TEST,0,0 +9455,TEST,0,0 +9456,TEST,0,0 +9457,TEST,0,0 +9458,TEST,0,0 +9459,TEST,0,0 +9460,TEST,0,0 +9461,TEST,0,0 +9462,TEST,0,0 +9463,TEST,0,0 +9464,TEST,0,0 +9465,TEST,0,0 +9466,TEST,0,0 +9467,TEST,0,0 +9468,TEST,0,0 +9469,TEST,0,0 +9470,TEST,0,0 +9471,TEST,0,0 +9472,TEST,0,0 +9473,TEST,0,0 +9474,TEST,0,0 +9475,TEST,0,0 +9476,TEST,0,0 +9477,TEST,0,0 +9478,TEST,0,0 +9479,TEST,0,0 +9480,TEST,0,0 +9481,TEST,0,0 +9482,TEST,0,0 +9483,TEST,0,0 +9484,TEST,0,0 +9485,TEST,0,0 +9486,TEST,0,0 +9487,TEST,0,0 +9488,TEST,0,0 +9489,TEST,0,0 +9490,TEST,0,0 +9491,TEST,0,0 +9492,TEST,0,0 +9493,TEST,0,0 +9494,TEST,0,0 +9495,TEST,0,0 +9496,TEST,0,0 +9497,TEST,0,0 +9498,TEST,0,0 +9499,TEST,0,0 +9500,TEST,0,0 +9501,TEST,0,0 +9502,TEST,0,0 +9503,TEST,0,0 +9504,TEST,0,0 +9505,TEST,0,0 +9506,TEST,0,0 +9507,TEST,0,0 +9508,TEST,0,0 +9509,TEST,0,0 +9510,TEST,0,0 +9511,TEST,0,0 +9512,TEST,0,0 +9513,TEST,0,0 +9514,TEST,0,0 +9515,TEST,0,0 +9516,TEST,0,0 +9517,TEST,0,0 +9518,TEST,0,0 +9519,TEST,0,0 +9520,TEST,0,0 +9521,TEST,0,0 +9522,TEST,0,0 +9523,TEST,0,0 +9524,TEST,0,0 +9525,TEST,0,0 +9526,TEST,0,0 +9527,TEST,0,0 +9528,TEST,0,0 +9529,TEST,0,0 +9530,TEST,0,0 +9531,TEST,0,0 +9532,TEST,0,0 +9533,TEST,0,0 +9534,TEST,0,0 +9535,TEST,0,0 +9536,TEST,0,0 +9537,TEST,0,0 +9538,TEST,0,0 +9539,TEST,0,0 +9540,TEST,0,0 +9541,TEST,0,0 +9542,TEST,0,0 +9543,TEST,0,0 +9544,TEST,0,0 +9545,TEST,0,0 +9546,TEST,0,0 +9547,TEST,0,0 +9548,TEST,0,0 +9549,TEST,0,0 +9550,TEST,0,0 +9551,TEST,0,0 +9552,TEST,0,0 +9553,TEST,0,0 +9554,TEST,0,0 +9555,TEST,0,0 +9556,TEST,0,0 +9557,TEST,0,0 +9558,TEST,0,0 +9559,TEST,0,0 +9560,TEST,0,0 +9561,TEST,0,0 +9562,TEST,0,0 +9563,TEST,0,0 +9564,TEST,0,0 +9565,TEST,0,0 +9566,TEST,0,0 +9567,TEST,0,0 +9568,TEST,0,0 +9569,TEST,0,0 +9570,TEST,0,0 +9571,TEST,0,0 +9572,TEST,0,0 +9573,TEST,0,0 +9574,TEST,0,0 +9575,TEST,0,0 +9576,TEST,0,0 +9577,TEST,0,0 +9578,TEST,0,0 +9579,TEST,0,0 +9580,TEST,0,0 +9581,TEST,0,0 +9582,TEST,0,0 +9583,TEST,0,0 +9584,TEST,0,0 +9585,TEST,0,0 +9586,TEST,0,0 +9587,TEST,0,0 +9588,TEST,0,0 +9589,TEST,0,0 +9590,TEST,0,0 +9591,TEST,0,0 +9592,TEST,0,0 +9593,TEST,0,0 +9594,TEST,0,0 +9595,TEST,0,0 +9596,TEST,0,0 +9597,TEST,0,0 +9598,TEST,0,0 +9599,TEST,0,0 +9600,TEST,0,0 +9601,TEST,0,0 +9602,TEST,0,0 +9603,TEST,0,0 +9604,TEST,0,0 +9605,TEST,0,0 +9606,TEST,0,0 +9607,TEST,0,0 +9608,TEST,0,0 +9609,TEST,0,0 +9610,TEST,0,0 +9611,TEST,0,0 +9612,TEST,0,0 +9613,TEST,0,0 +9614,TEST,0,0 +9615,TEST,0,0 +9616,TEST,0,0 +9617,TEST,0,0 +9618,TEST,0,0 +9619,TEST,0,0 +9620,TEST,0,0 +9621,TEST,0,0 +9622,TEST,0,0 +9623,TEST,0,0 +9624,TEST,0,0 +9625,TEST,0,0 +9626,TEST,0,0 +9627,TEST,0,0 +9628,TEST,0,0 +9629,TEST,0,0 +9630,TEST,0,0 +9631,TEST,0,0 +9632,TEST,0,0 +9633,TEST,0,0 +9634,TEST,0,0 +9635,TEST,0,0 +9636,TEST,0,0 +9637,TEST,0,0 +9638,TEST,0,0 +9639,TEST,0,0 +9640,TEST,0,0 +9641,TEST,0,0 +9642,TEST,0,0 +9643,TEST,0,0 +9644,TEST,0,0 +9645,TEST,0,0 
+9646,TEST,0,0 +9647,TEST,0,0 +9648,TEST,0,0 +9649,TEST,0,0 +9650,TEST,0,0 +9651,TEST,0,0 +9652,TEST,0,0 +9653,TEST,0,0 +9654,TEST,0,0 +9655,TEST,0,0 +9656,TEST,0,0 +9657,TEST,0,0 +9658,TEST,0,0 +9659,TEST,0,0 +9660,TEST,0,0 +9661,TEST,0,0 +9662,TEST,0,0 +9663,TEST,0,0 +9664,TEST,0,0 +9665,TEST,0,0 +9666,TEST,0,0 +9667,TEST,0,0 +9668,TEST,0,0 +9669,TEST,0,0 +9670,TEST,0,0 +9671,TEST,0,0 +9672,TEST,0,0 +9673,TEST,0,0 +9674,TEST,0,0 +9675,TEST,0,0 +9676,TEST,0,0 +9677,TEST,0,0 +9678,TEST,0,0 +9679,TEST,0,0 +9680,TEST,0,0 +9681,TEST,0,0 +9682,TEST,0,0 +9683,TEST,0,0 +9684,TEST,0,0 +9685,TEST,0,0 +9686,TEST,0,0 +9687,TEST,0,0 +9688,TEST,0,0 +9689,TEST,0,0 +9690,TEST,0,0 +9691,TEST,0,0 +9692,TEST,0,0 +9693,TEST,0,0 +9694,TEST,0,0 +9695,TEST,0,0 +9696,TEST,0,0 +9697,TEST,0,0 +9698,TEST,0,0 +9699,TEST,0,0 +9700,TEST,0,0 +9701,TEST,0,0 +9702,TEST,0,0 +9703,TEST,0,0 +9704,TEST,0,0 +9705,TEST,0,0 +9706,TEST,0,0 +9707,TEST,0,0 +9708,TEST,0,0 +9709,TEST,0,0 +9710,TEST,0,0 +9711,TEST,0,0 +9712,TEST,0,0 +9713,TEST,0,0 +9714,TEST,0,0 +9715,TEST,0,0 +9716,TEST,0,0 +9717,TEST,0,0 +9718,TEST,0,0 +9719,TEST,0,0 +9720,TEST,0,0 +9721,TEST,0,0 +9722,TEST,0,0 +9723,TEST,0,0 +9724,TEST,0,0 +9725,TEST,0,0 +9726,TEST,0,0 +9727,TEST,0,0 +9728,TEST,0,0 +9729,TEST,0,0 +9730,TEST,0,0 +9731,TEST,0,0 +9732,TEST,0,0 +9733,TEST,0,0 +9734,TEST,0,0 +9735,TEST,0,0 +9736,TEST,0,0 +9737,TEST,0,0 +9738,TEST,0,0 +9739,TEST,0,0 +9740,TEST,0,0 +9741,TEST,0,0 +9742,TEST,0,0 +9743,TEST,0,0 +9744,TEST,0,0 +9745,TEST,0,0 +9746,TEST,0,0 +9747,TEST,0,0 +9748,TEST,0,0 +9749,TEST,0,0 +9750,TEST,0,0 +9751,TEST,0,0 +9752,TEST,0,0 +9753,TEST,0,0 +9754,TEST,0,0 +9755,TEST,0,0 +9756,TEST,0,0 +9757,TEST,0,0 +9758,TEST,0,0 +9759,TEST,0,0 +9760,TEST,0,0 +9761,TEST,0,0 +9762,TEST,0,0 +9763,TEST,0,0 +9764,TEST,0,0 +9765,TEST,0,0 +9766,TEST,0,0 +9767,TEST,0,0 +9768,TEST,0,0 +9769,TEST,0,0 +9770,TEST,0,0 +9771,TEST,0,0 +9772,TEST,0,0 +9773,TEST,0,0 +9774,TEST,0,0 +9775,TEST,0,0 +9776,TEST,0,0 +9777,TEST,0,0 +9778,TEST,0,0 +9779,TEST,0,0 +9780,TEST,0,0 +9781,TEST,0,0 +9782,TEST,0,0 +9783,TEST,0,0 +9784,TEST,0,0 +9785,TEST,0,0 +9786,TEST,0,0 +9787,TEST,0,0 +9788,TEST,0,0 +9789,TEST,0,0 +9790,TEST,0,0 +9791,TEST,0,0 +9792,TEST,0,0 +9793,TEST,0,0 +9794,TEST,0,0 +9795,TEST,0,0 +9796,TEST,0,0 +9797,TEST,0,0 +9798,TEST,0,0 +9799,TEST,0,0 +9800,TEST,0,0 +9801,TEST,0,0 +9802,TEST,0,0 +9803,TEST,0,0 +9804,TEST,0,0 +9805,TEST,0,0 +9806,TEST,0,0 +9807,TEST,0,0 +9808,TEST,0,0 +9809,TEST,0,0 +9810,TEST,0,0 +9811,TEST,0,0 +9812,TEST,0,0 +9813,TEST,0,0 +9814,TEST,0,0 +9815,TEST,0,0 +9816,TEST,0,0 +9817,TEST,0,0 +9818,TEST,0,0 +9819,TEST,0,0 +9820,TEST,0,0 +9821,TEST,0,0 +9822,TEST,0,0 +9823,TEST,0,0 +9824,TEST,0,0 +9825,TEST,0,0 +9826,TEST,0,0 +9827,TEST,0,0 +9828,TEST,0,0 +9829,TEST,0,0 +9830,TEST,0,0 +9831,TEST,0,0 +9832,TEST,0,0 +9833,TEST,0,0 +9834,TEST,0,0 +9835,TEST,0,0 +9836,TEST,0,0 +9837,TEST,0,0 +9838,TEST,0,0 +9839,TEST,0,0 +9840,TEST,0,0 +9841,TEST,0,0 +9842,TEST,0,0 +9843,TEST,0,0 +9844,TEST,0,0 +9845,TEST,0,0 +9846,TEST,0,0 +9847,TEST,0,0 +9848,TEST,0,0 +9849,TEST,0,0 +9850,TEST,0,0 +9851,TEST,0,0 +9852,TEST,0,0 +9853,TEST,0,0 +9854,TEST,0,0 +9855,TEST,0,0 +9856,TEST,0,0 +9857,TEST,0,0 +9858,TEST,0,0 +9859,TEST,0,0 +9860,TEST,0,0 +9861,TEST,0,0 +9862,TEST,0,0 +9863,TEST,0,0 +9864,TEST,0,0 +9865,TEST,0,0 +9866,TEST,0,0 +9867,TEST,0,0 +9868,TEST,0,0 +9869,TEST,0,0 +9870,TEST,0,0 +9871,TEST,0,0 +9872,TEST,0,0 +9873,TEST,0,0 +9874,TEST,0,0 +9875,TEST,0,0 +9876,TEST,0,0 +9877,TEST,0,0 +9878,TEST,0,0 +9879,TEST,0,0 +9880,TEST,0,0 +9881,TEST,0,0 +9882,TEST,0,0 
+9883,TEST,0,0 +9884,TEST,0,0 +9885,TEST,0,0 +9886,TEST,0,0 +9887,TEST,0,0 +9888,TEST,0,0 +9889,TEST,0,0 +9890,TEST,0,0 +9891,TEST,0,0 +9892,TEST,0,0 +9893,TEST,0,0 +9894,TEST,0,0 +9895,TEST,0,0 +9896,TEST,0,0 +9897,TEST,0,0 +9898,TEST,0,0 +9899,TEST,0,0 +9900,TEST,0,0 +9901,TEST,0,0 +9902,TEST,0,0 +9903,TEST,0,0 +9904,TEST,0,0 +9905,TEST,0,0 +9906,TEST,0,0 +9907,TEST,0,0 +9908,TEST,0,0 +9909,TEST,0,0 +9910,TEST,0,0 +9911,TEST,0,0 +9912,TEST,0,0 +9913,TEST,0,0 +9914,TEST,0,0 +9915,TEST,0,0 +9916,TEST,0,0 +9917,TEST,0,0 +9918,TEST,0,0 +9919,TEST,0,0 +9920,TEST,0,0 +9921,TEST,0,0 +9922,TEST,0,0 +9923,TEST,0,0 +9924,TEST,0,0 +9925,TEST,0,0 +9926,TEST,0,0 +9927,TEST,0,0 +9928,TEST,0,0 +9929,TEST,0,0 +9930,TEST,0,0 +9931,TEST,0,0 +9932,TEST,0,0 +9933,TEST,0,0 +9934,TEST,0,0 +9935,TEST,0,0 +9936,TEST,0,0 +9937,TEST,0,0 +9938,TEST,0,0 +9939,TEST,0,0 +9940,TEST,0,0 +9941,TEST,0,0 +9942,TEST,0,0 +9943,TEST,0,0 +9944,TEST,0,0 +9945,TEST,0,0 +9946,TEST,0,0 +9947,TEST,0,0 +9948,TEST,0,0 +9949,TEST,0,0 +9950,TEST,0,0 +9951,TEST,0,0 +9952,TEST,0,0 +9953,TEST,0,0 +9954,TEST,0,0 +9955,TEST,0,0 +9956,TEST,0,0 +9957,TEST,0,0 +9958,TEST,0,0 +9959,TEST,0,0 +9960,TEST,0,0 +9961,TEST,0,0 +9962,TEST,0,0 +9963,TEST,0,0 +9964,TEST,0,0 +9965,TEST,0,0 +9966,TEST,0,0 +9967,TEST,0,0 +9968,TEST,0,0 +9969,TEST,0,0 +9970,TEST,0,0 +9971,TEST,0,0 +9972,TEST,0,0 +9973,TEST,0,0 +9974,TEST,0,0 +9975,TEST,0,0 +9976,TEST,0,0 +9977,TEST,0,0 +9978,TEST,0,0 +9979,TEST,0,0 +9980,TEST,0,0 +9981,TEST,0,0 +9982,TEST,0,0 +9983,TEST,0,0 +9984,TEST,0,0 +9985,TEST,0,0 +9986,TEST,0,0 +9987,TEST,0,0 +9988,TEST,0,0 +9989,TEST,0,0 +9990,TEST,0,0 +9991,TEST,0,0 +9992,TEST,0,0 +9993,TEST,0,0 +9994,TEST,0,0 +9995,TEST,0,0 +9996,TEST,0,0 +9997,TEST,0,0 +9998,TEST,0,0 +9999,TEST,0,0 +10000,TEST,0,0 +10001,TEST,0,0 +10002,TEST,0,0 +10003,TEST,0,0 +10004,TEST,0,0 +10005,TEST,0,0 +10006,TEST,0,0 +10007,TEST,0,0 +10008,TEST,0,0 +10009,TEST,0,0 +10010,TEST,0,0 +10011,TEST,0,0 +10012,TEST,0,0 +10013,TEST,0,0 +10014,TEST,0,0 +10015,TEST,0,0 +10016,TEST,0,0 +10017,TEST,0,0 +10018,TEST,0,0 +10019,TEST,0,0 +10020,TEST,0,0 +10021,TEST,0,0 +10022,TEST,0,0 +10023,TEST,0,0 +10024,TEST,0,0 +10025,TEST,0,0 +10026,TEST,0,0 +10027,TEST,0,0 +10028,TEST,0,0 +10029,TEST,0,0 +10030,TEST,0,0 +10031,TEST,0,0 +10032,TEST,0,0 +10033,TEST,0,0 +10034,TEST,0,0 +10035,TEST,0,0 +10036,TEST,0,0 +10037,TEST,0,0 +10038,TEST,0,0 +10039,TEST,0,0 +10040,TEST,0,0 +10041,TEST,0,0 +10042,TEST,0,0 +10043,TEST,0,0 +10044,TEST,0,0 +10045,TEST,0,0 +10046,TEST,0,0 +10047,TEST,0,0 +10048,TEST,0,0 +10049,TEST,0,0 +10050,TEST,0,0 +10051,TEST,0,0 +10052,TEST,0,0 +10053,TEST,0,0 +10054,TEST,0,0 +10055,TEST,0,0 +10056,TEST,0,0 +10057,TEST,0,0 +10058,TEST,0,0 +10059,TEST,0,0 +10060,TEST,0,0 +10061,TEST,0,0 +10062,TEST,0,0 +10063,TEST,0,0 +10064,TEST,0,0 +10065,TEST,0,0 +10066,TEST,0,0 +10067,TEST,0,0 +10068,TEST,0,0 +10069,TEST,0,0 +10070,TEST,0,0 +10071,TEST,0,0 +10072,TEST,0,0 +10073,TEST,0,0 +10074,TEST,0,0 +10075,TEST,0,0 +10076,TEST,0,0 +10077,TEST,0,0 +10078,TEST,0,0 +10079,TEST,0,0 +10080,TEST,0,0 +10081,TEST,0,0 +10082,TEST,0,0 +10083,TEST,0,0 +10084,TEST,0,0 +10085,TEST,0,0 +10086,TEST,0,0 +10087,TEST,0,0 +10088,TEST,0,0 +10089,TEST,0,0 +10090,TEST,0,0 +10091,TEST,0,0 +10092,TEST,0,0 +10093,TEST,0,0 +10094,TEST,0,0 +10095,TEST,0,0 +10096,TEST,0,0 +10097,TEST,0,0 +10098,TEST,0,0 +10099,TEST,0,0 +10100,TEST,0,0 +10101,TEST,0,0 +10102,TEST,0,0 +10103,TEST,0,0 +10104,TEST,0,0 +10105,TEST,0,0 +10106,TEST,0,0 +10107,TEST,0,0 +10108,TEST,0,0 +10109,TEST,0,0 +10110,TEST,0,0 +10111,TEST,0,0 
+10112,TEST,0,0 +10113,TEST,0,0 +10114,TEST,0,0 +10115,TEST,0,0 +10116,TEST,0,0 +10117,TEST,0,0 +10118,TEST,0,0 +10119,TEST,0,0 +10120,TEST,0,0 +10121,TEST,0,0 +10122,TEST,0,0 +10123,TEST,0,0 +10124,TEST,0,0 +10125,TEST,0,0 +10126,TEST,0,0 +10127,TEST,0,0 +10128,TEST,0,0 +10129,TEST,0,0 +10130,TEST,0,0 +10131,TEST,0,0 +10132,TEST,0,0 +10133,TEST,0,0 +10134,TEST,0,0 +10135,TEST,0,0 +10136,TEST,0,0 +10137,TEST,0,0 +10138,TEST,0,0 +10139,TEST,0,0 +10140,TEST,0,0 +10141,TEST,0,0 +10142,TEST,0,0 +10143,TEST,0,0 +10144,TEST,0,0 +10145,TEST,0,0 +10146,TEST,0,0 +10147,TEST,0,0 +10148,TEST,0,0 +10149,TEST,0,0 +10150,TEST,0,0 +10151,TEST,0,0 +10152,TEST,0,0 +10153,TEST,0,0 +10154,TEST,0,0 +10155,TEST,0,0 +10156,TEST,0,0 +10157,TEST,0,0 +10158,TEST,0,0 +10159,TEST,0,0 +10160,TEST,0,0 +10161,TEST,0,0 +10162,TEST,0,0 +10163,TEST,0,0 +10164,TEST,0,0 +10165,TEST,0,0 +10166,TEST,0,0 +10167,TEST,0,0 +10168,TEST,0,0 +10169,TEST,0,0 +10170,TEST,0,0 +10171,TEST,0,0 +10172,TEST,0,0 +10173,TEST,0,0 +10174,TEST,0,0 +10175,TEST,0,0 +10176,TEST,0,0 +10177,TEST,0,0 +10178,TEST,0,0 +10179,TEST,0,0 +10180,TEST,0,0 +10181,TEST,0,0 +10182,TEST,0,0 +10183,TEST,0,0 +10184,TEST,0,0 +10185,TEST,0,0 +10186,TEST,0,0 +10187,TEST,0,0 +10188,TEST,0,0 +10189,TEST,0,0 +10190,TEST,0,0 +10191,TEST,0,0 +10192,TEST,0,0 +10193,TEST,0,0 +10194,TEST,0,0 +10195,TEST,0,0 +10196,TEST,0,0 +10197,TEST,0,0 +10198,TEST,0,0 +10199,TEST,0,0 +10200,TEST,0,0 +10201,TEST,0,0 +10202,TEST,0,0 +10203,TEST,0,0 +10204,TEST,0,0 +10205,TEST,0,0 +10206,TEST,0,0 +10207,TEST,0,0 +10208,TEST,0,0 +10209,TEST,0,0 +10210,TEST,0,0 +10211,TEST,0,0 +10212,TEST,0,0 +10213,TEST,0,0 +10214,TEST,0,0 +10215,TEST,0,0 +10216,TEST,0,0 +10217,TEST,0,0 +10218,TEST,0,0 +10219,TEST,0,0 +10220,TEST,0,0 +10221,TEST,0,0 +10222,TEST,0,0 +10223,TEST,0,0 +10224,TEST,0,0 +10225,TEST,0,0 +10226,TEST,0,0 +10227,TEST,0,0 +10228,TEST,0,0 +10229,TEST,0,0 +10230,TEST,0,0 +10231,TEST,0,0 +10232,TEST,0,0 +10233,TEST,0,0 +10234,TEST,0,0 +10235,TEST,0,0 +10236,TEST,0,0 +10237,TEST,0,0 +10238,TEST,0,0 +10239,TEST,0,0 +10240,TEST,0,0 +10241,TEST,0,0 +10242,TEST,0,0 +10243,TEST,0,0 +10244,TEST,0,0 +10245,TEST,0,0 +10246,TEST,0,0 +10247,TEST,0,0 +10248,TEST,0,0 +10249,TEST,0,0 +10250,TEST,0,0 +10251,TEST,0,0 +10252,TEST,0,0 +10253,TEST,0,0 +10254,TEST,0,0 +10255,TEST,0,0 +10256,TEST,0,0 +10257,TEST,0,0 +10258,TEST,0,0 +10259,TEST,0,0 +10260,TEST,0,0 +10261,TEST,0,0 +10262,TEST,0,0 +10263,TEST,0,0 +10264,TEST,0,0 +10265,TEST,0,0 +10266,TEST,0,0 +10267,TEST,0,0 +10268,TEST,0,0 +10269,TEST,0,0 +10270,TEST,0,0 +10271,TEST,0,0 +10272,TEST,0,0 +10273,TEST,0,0 +10274,TEST,0,0 +10275,TEST,0,0 +10276,TEST,0,0 +10277,TEST,0,0 +10278,TEST,0,0 +10279,TEST,0,0 +10280,TEST,0,0 +10281,TEST,0,0 +10282,TEST,0,0 +10283,TEST,0,0 +10284,TEST,0,0 +10285,TEST,0,0 +10286,TEST,0,0 +10287,TEST,0,0 +10288,TEST,0,0 +10289,TEST,0,0 +10290,TEST,0,0 +10291,TEST,0,0 +10292,TEST,0,0 +10293,TEST,0,0 +10294,TEST,0,0 +10295,TEST,0,0 +10296,TEST,0,0 +10297,TEST,0,0 +10298,TEST,0,0 +10299,TEST,0,0 +10300,TEST,0,0 +10301,TEST,0,0 +10302,TEST,0,0 +10303,TEST,0,0 +10304,TEST,0,0 +10305,TEST,0,0 +10306,TEST,0,0 +10307,TEST,0,0 +10308,TEST,0,0 +10309,TEST,0,0 +10310,TEST,0,0 +10311,TEST,0,0 +10312,TEST,0,0 +10313,TEST,0,0 +10314,TEST,0,0 +10315,TEST,0,0 +10316,TEST,0,0 +10317,TEST,0,0 +10318,TEST,0,0 +10319,TEST,0,0 +10320,TEST,0,0 +10321,TEST,0,0 +10322,TEST,0,0 +10323,TEST,0,0 +10324,TEST,0,0 +10325,TEST,0,0 +10326,TEST,0,0 +10327,TEST,0,0 +10328,TEST,0,0 +10329,TEST,0,0 +10330,TEST,0,0 +10331,TEST,0,0 +10332,TEST,0,0 +10333,TEST,0,0 
+10334,TEST,0,0 +10335,TEST,0,0 +10336,TEST,0,0 +10337,TEST,0,0 +10338,TEST,0,0 +10339,TEST,0,0 +10340,TEST,0,0 +10341,TEST,0,0 +10342,TEST,0,0 +10343,TEST,0,0 +10344,TEST,0,0 +10345,TEST,0,0 +10346,TEST,0,0 +10347,TEST,0,0 +10348,TEST,0,0 +10349,TEST,0,0 +10350,TEST,0,0 +10351,TEST,0,0 +10352,TEST,0,0 +10353,TEST,0,0 +10354,TEST,0,0 +10355,TEST,0,0 +10356,TEST,0,0 +10357,TEST,0,0 +10358,TEST,0,0 +10359,TEST,0,0 +10360,TEST,0,0 +10361,TEST,0,0 +10362,TEST,0,0 +10363,TEST,0,0 +10364,TEST,0,0 +10365,TEST,0,0 +10366,TEST,0,0 +10367,TEST,0,0 +10368,TEST,0,0 +10369,TEST,0,0 +10370,TEST,0,0 +10371,TEST,0,0 +10372,TEST,0,0 +10373,TEST,0,0 +10374,TEST,0,0 +10375,TEST,0,0 +10376,TEST,0,0 +10377,TEST,0,0 +10378,TEST,0,0 +10379,TEST,0,0 +10380,TEST,0,0 +10381,TEST,0,0 +10382,TEST,0,0 +10383,TEST,0,0 +10384,TEST,0,0 +10385,TEST,0,0 +10386,TEST,0,0 +10387,TEST,0,0 +10388,TEST,0,0 +10389,TEST,0,0 +10390,TEST,0,0 +10391,TEST,0,0 +10392,TEST,0,0 +10393,TEST,0,0 +10394,TEST,0,0 +10395,TEST,0,0 +10396,TEST,0,0 +10397,TEST,0,0 +10398,TEST,0,0 +10399,TEST,0,0 +10400,TEST,0,0 +10401,TEST,0,0 +10402,TEST,0,0 +10403,TEST,0,0 +10404,TEST,0,0 +10405,TEST,0,0 +10406,TEST,0,0 +10407,TEST,0,0 +10408,TEST,0,0 +10409,TEST,0,0 +10410,TEST,0,0 +10411,TEST,0,0 +10412,TEST,0,0 +10413,TEST,0,0 +10414,TEST,0,0 +10415,TEST,0,0 +10416,TEST,0,0 +10417,TEST,0,0 +10418,TEST,0,0 +10419,TEST,0,0 +10420,TEST,0,0 +10421,TEST,0,0 +10422,TEST,0,0 +10423,TEST,0,0 +10424,TEST,0,0 +10425,TEST,0,0 +10426,TEST,0,0 +10427,TEST,0,0 +10428,TEST,0,0 +10429,TEST,0,0 +10430,TEST,0,0 +10431,TEST,0,0 +10432,TEST,0,0 +10433,TEST,0,0 +10434,TEST,0,0 +10435,TEST,0,0 +10436,TEST,0,0 +10437,TEST,0,0 +10438,TEST,0,0 +10439,TEST,0,0 +10440,TEST,0,0 +10441,TEST,0,0 +10442,TEST,0,0 +10443,TEST,0,0 +10444,TEST,0,0 +10445,TEST,0,0 +10446,TEST,0,0 +10447,TEST,0,0 +10448,TEST,0,0 +10449,TEST,0,0 +10450,TEST,0,0 +10451,TEST,0,0 +10452,TEST,0,0 +10453,TEST,0,0 +10454,TEST,0,0 +10455,TEST,0,0 +10456,TEST,0,0 +10457,TEST,0,0 +10458,TEST,0,0 +10459,TEST,0,0 +10460,TEST,0,0 +10461,TEST,0,0 +10462,TEST,0,0 +10463,TEST,0,0 +10464,TEST,0,0 +10465,TEST,0,0 +10466,TEST,0,0 +10467,TEST,0,0 +10468,TEST,0,0 +10469,TEST,0,0 +10470,TEST,0,0 +10471,TEST,0,0 +10472,TEST,0,0 +10473,TEST,0,0 +10474,TEST,0,0 +10475,TEST,0,0 +10476,TEST,0,0 +10477,TEST,0,0 +10478,TEST,0,0 +10479,TEST,0,0 +10480,TEST,0,0 +10481,TEST,0,0 +10482,TEST,0,0 +10483,TEST,0,0 +10484,TEST,0,0 +10485,TEST,0,0 +10486,TEST,0,0 +10487,TEST,0,0 +10488,TEST,0,0 +10489,TEST,0,0 +10490,TEST,0,0 +10491,TEST,0,0 +10492,TEST,0,0 +10493,TEST,0,0 +10494,TEST,0,0 +10495,TEST,0,0 +10496,TEST,0,0 +10497,TEST,0,0 +10498,TEST,0,0 +10499,TEST,0,0 +10500,TEST,0,0 +10501,TEST,0,0 +10502,TEST,0,0 +10503,TEST,0,0 +10504,TEST,0,0 +10505,TEST,0,0 +10506,TEST,0,0 +10507,TEST,0,0 +10508,TEST,0,0 +10509,TEST,0,0 +10510,TEST,0,0 +10511,TEST,0,0 +10512,TEST,0,0 +10513,TEST,0,0 +10514,TEST,0,0 +10515,TEST,0,0 +10516,TEST,0,0 +10517,TEST,0,0 +10518,TEST,0,0 +10519,TEST,0,0 +10520,TEST,0,0 +10521,TEST,0,0 +10522,TEST,0,0 +10523,TEST,0,0 +10524,TEST,0,0 +10525,TEST,0,0 +10526,TEST,0,0 +10527,TEST,0,0 +10528,TEST,0,0 +10529,TEST,0,0 +10530,TEST,0,0 +10531,TEST,0,0 +10532,TEST,0,0 +10533,TEST,0,0 +10534,TEST,0,0 +10535,TEST,0,0 +10536,TEST,0,0 +10537,TEST,0,0 +10538,TEST,0,0 +10539,TEST,0,0 +10540,TEST,0,0 +10541,TEST,0,0 +10542,TEST,0,0 +10543,TEST,0,0 +10544,TEST,0,0 +10545,TEST,0,0 +10546,TEST,0,0 +10547,TEST,0,0 +10548,TEST,0,0 +10549,TEST,0,0 +10550,TEST,0,0 +10551,TEST,0,0 +10552,TEST,0,0 +10553,TEST,0,0 +10554,TEST,0,0 +10555,TEST,0,0 
+10556,TEST,0,0 +10557,TEST,0,0 +10558,TEST,0,0 +10559,TEST,0,0 +10560,TEST,0,0 +10561,TEST,0,0 +10562,TEST,0,0 +10563,TEST,0,0 +10564,TEST,0,0 +10565,TEST,0,0 +10566,TEST,0,0 +10567,TEST,0,0 +10568,TEST,0,0 +10569,TEST,0,0 +10570,TEST,0,0 +10571,TEST,0,0 +10572,TEST,0,0 +10573,TEST,0,0 +10574,TEST,0,0 +10575,TEST,0,0 +10576,TEST,0,0 +10577,TEST,0,0 +10578,TEST,0,0 +10579,TEST,0,0 +10580,TEST,0,0 +10581,TEST,0,0 +10582,TEST,0,0 +10583,TEST,0,0 +10584,TEST,0,0 +10585,TEST,0,0 +10586,TEST,0,0 +10587,TEST,0,0 +10588,TEST,0,0 +10589,TEST,0,0 +10590,TEST,0,0 +10591,TEST,0,0 +10592,TEST,0,0 +10593,TEST,0,0 +10594,TEST,0,0 +10595,TEST,0,0 +10596,TEST,0,0 +10597,TEST,0,0 +10598,TEST,0,0 +10599,TEST,0,0 +10600,TEST,0,0 +10601,TEST,0,0 +10602,TEST,0,0 +10603,TEST,0,0 +10604,TEST,0,0 +10605,TEST,0,0 +10606,TEST,0,0 +10607,TEST,0,0 +10608,TEST,0,0 +10609,TEST,0,0 +10610,TEST,0,0 +10611,TEST,0,0 +10612,TEST,0,0 +10613,TEST,0,0 +10614,TEST,0,0 +10615,TEST,0,0 +10616,TEST,0,0 +10617,TEST,0,0 +10618,TEST,0,0 +10619,TEST,0,0 +10620,TEST,0,0 +10621,TEST,0,0 +10622,TEST,0,0 +10623,TEST,0,0 +10624,TEST,0,0 +10625,TEST,0,0 +10626,TEST,0,0 +10627,TEST,0,0 +10628,TEST,0,0 +10629,TEST,0,0 +10630,TEST,0,0 +10631,TEST,0,0 +10632,TEST,0,0 +10633,TEST,0,0 +10634,TEST,0,0 +10635,TEST,0,0 +10636,TEST,0,0 +10637,TEST,0,0 +10638,TEST,0,0 +10639,TEST,0,0 +10640,TEST,0,0 +10641,TEST,0,0 +10642,TEST,0,0 +10643,TEST,0,0 +10644,TEST,0,0 +10645,TEST,0,0 +10646,TEST,0,0 +10647,TEST,0,0 +10648,TEST,0,0 +10649,TEST,0,0 +10650,TEST,0,0 +10651,TEST,0,0 +10652,TEST,0,0 +10653,TEST,0,0 +10654,TEST,0,0 +10655,TEST,0,0 +10656,TEST,0,0 +10657,TEST,0,0 +10658,TEST,0,0 +10659,TEST,0,0 +10660,TEST,0,0 +10661,TEST,0,0 +10662,TEST,0,0 +10663,TEST,0,0 +10664,TEST,0,0 +10665,TEST,0,0 +10666,TEST,0,0 +10667,TEST,0,0 +10668,TEST,0,0 +10669,TEST,0,0 +10670,TEST,0,0 +10671,TEST,0,0 +10672,TEST,0,0 +10673,TEST,0,0 +10674,TEST,0,0 +10675,TEST,0,0 +10676,TEST,0,0 +10677,TEST,0,0 +10678,TEST,0,0 +10679,TEST,0,0 +10680,TEST,0,0 +10681,TEST,0,0 +10682,TEST,0,0 +10683,TEST,0,0 +10684,TEST,0,0 +10685,TEST,0,0 +10686,TEST,0,0 +10687,TEST,0,0 +10688,TEST,0,0 +10689,TEST,0,0 +10690,TEST,0,0 +10691,TEST,0,0 +10692,TEST,0,0 +10693,TEST,0,0 +10694,TEST,0,0 +10695,TEST,0,0 +10696,TEST,0,0 +10697,TEST,0,0 +10698,TEST,0,0 +10699,TEST,0,0 +10700,TEST,0,0 +10701,TEST,0,0 +10702,TEST,0,0 +10703,TEST,0,0 +10704,TEST,0,0 +10705,TEST,0,0 +10706,TEST,0,0 +10707,TEST,0,0 +10708,TEST,0,0 +10709,TEST,0,0 +10710,TEST,0,0 +10711,TEST,0,0 +10712,TEST,0,0 +10713,TEST,0,0 +10714,TEST,0,0 +10715,TEST,0,0 +10716,TEST,0,0 +10717,TEST,0,0 +10718,TEST,0,0 +10719,TEST,0,0 +10720,TEST,0,0 +10721,TEST,0,0 +10722,TEST,0,0 +10723,TEST,0,0 +10724,TEST,0,0 +10725,TEST,0,0 +10726,TEST,0,0 +10727,TEST,0,0 +10728,TEST,0,0 +10729,TEST,0,0 +10730,TEST,0,0 +10731,TEST,0,0 +10732,TEST,0,0 +10733,TEST,0,0 +10734,TEST,0,0 +10735,TEST,0,0 +10736,TEST,0,0 +10737,TEST,0,0 +10738,TEST,0,0 +10739,TEST,0,0 +10740,TEST,0,0 +10741,TEST,0,0 +10742,TEST,0,0 +10743,TEST,0,0 +10744,TEST,0,0 +10745,TEST,0,0 +10746,TEST,0,0 +10747,TEST,0,0 +10748,TEST,0,0 +10749,TEST,0,0 +10750,TEST,0,0 +10751,TEST,0,0 +10752,TEST,0,0 +10753,TEST,0,0 +10754,TEST,0,0 +10755,TEST,0,0 +10756,TEST,0,0 +10757,TEST,0,0 +10758,TEST,0,0 +10759,TEST,0,0 +10760,TEST,0,0 +10761,TEST,0,0 +10762,TEST,0,0 +10763,TEST,0,0 +10764,TEST,0,0 +10765,TEST,0,0 +10766,TEST,0,0 +10767,TEST,0,0 +10768,TEST,0,0 +10769,TEST,0,0 +10770,TEST,0,0 +10771,TEST,0,0 +10772,TEST,0,0 +10773,TEST,0,0 +10774,TEST,0,0 +10775,TEST,0,0 +10776,TEST,0,0 +10777,TEST,0,0 
+10778,TEST,0,0 +10779,TEST,0,0 +10780,TEST,0,0 +10781,TEST,0,0 +10782,TEST,0,0 +10783,TEST,0,0 +10784,TEST,0,0 +10785,TEST,0,0 +10786,TEST,0,0 +10787,TEST,0,0 +10788,TEST,0,0 +10789,TEST,0,0 +10790,TEST,0,0 +10791,TEST,0,0 +10792,TEST,0,0 +10793,TEST,0,0 +10794,TEST,0,0 +10795,TEST,0,0 +10796,TEST,0,0 +10797,TEST,0,0 +10798,TEST,0,0 +10799,TEST,0,0 +10800,TEST,0,0 +10801,TEST,0,0 +10802,TEST,0,0 +10803,TEST,0,0 +10804,TEST,0,0 +10805,TEST,0,0 +10806,TEST,0,0 +10807,TEST,0,0 +10808,TEST,0,0 +10809,TEST,0,0 +10810,TEST,0,0 +10811,TEST,0,0 +10812,TEST,0,0 +10813,TEST,0,0 +10814,TEST,0,0 +10815,TEST,0,0 +10816,TEST,0,0 +10817,TEST,0,0 +10818,TEST,0,0 +10819,TEST,0,0 +10820,TEST,0,0 +10821,TEST,0,0 +10822,TEST,0,0 +10823,TEST,0,0 +10824,TEST,0,0 +10825,TEST,0,0 +10826,TEST,0,0 +10827,TEST,0,0 +10828,TEST,0,0 +10829,TEST,0,0 +10830,TEST,0,0 +10831,TEST,0,0 +10832,TEST,0,0 +10833,TEST,0,0 +10834,TEST,0,0 +10835,TEST,0,0 +10836,TEST,0,0 +10837,TEST,0,0 +10838,TEST,0,0 +10839,TEST,0,0 +10840,TEST,0,0 +10841,TEST,0,0 +10842,TEST,0,0 +10843,TEST,0,0 +10844,TEST,0,0 +10845,TEST,0,0 +10846,TEST,0,0 +10847,TEST,0,0 +10848,TEST,0,0 +10849,TEST,0,0 +10850,TEST,0,0 +10851,TEST,0,0 +10852,TEST,0,0 +10853,TEST,0,0 +10854,TEST,0,0 +10855,TEST,0,0 +10856,TEST,0,0 +10857,TEST,0,0 +10858,TEST,0,0 +10859,TEST,0,0 +10860,TEST,0,0 +10861,TEST,0,0 +10862,TEST,0,0 +10863,TEST,0,0 +10864,TEST,0,0 +10865,TEST,0,0 +10866,TEST,0,0 +10867,TEST,0,0 +10868,TEST,0,0 +10869,TEST,0,0 +10870,TEST,0,0 +10871,TEST,0,0 +10872,TEST,0,0 +10873,TEST,0,0 +10874,TEST,0,0 +10875,TEST,0,0 +10876,TEST,0,0 +10877,TEST,0,0 +10878,TEST,0,0 +10879,TEST,0,0 +10880,TEST,0,0 +10881,TEST,0,0 +10882,TEST,0,0 +10883,TEST,0,0 +10884,TEST,0,0 +10885,TEST,0,0 +10886,TEST,0,0 +10887,TEST,0,0 +10888,TEST,0,0 +10889,TEST,0,0 +10890,TEST,0,0 +10891,TEST,0,0 +10892,TEST,0,0 +10893,TEST,0,0 +10894,TEST,0,0 +10895,TEST,0,0 +10896,TEST,0,0 +10897,TEST,0,0 +10898,TEST,0,0 +10899,TEST,0,0 +10900,TEST,0,0 +10901,TEST,0,0 +10902,TEST,0,0 +10903,TEST,0,0 +10904,TEST,0,0 +10905,TEST,0,0 +10906,TEST,0,0 +10907,TEST,0,0 +10908,TEST,0,0 +10909,TEST,0,0 +10910,TEST,0,0 +10911,TEST,0,0 +10912,TEST,0,0 +10913,TEST,0,0 +10914,TEST,0,0 +10915,TEST,0,0 +10916,TEST,0,0 +10917,TEST,0,0 +10918,TEST,0,0 +10919,TEST,0,0 +10920,TEST,0,0 +10921,TEST,0,0 +10922,TEST,0,0 +10923,TEST,0,0 +10924,TEST,0,0 +10925,TEST,0,0 +10926,TEST,0,0 +10927,TEST,0,0 +10928,TEST,0,0 +10929,TEST,0,0 +10930,TEST,0,0 +10931,TEST,0,0 +10932,TEST,0,0 +10933,TEST,0,0 +10934,TEST,0,0 +10935,TEST,0,0 +10936,TEST,0,0 +10937,TEST,0,0 +10938,TEST,0,0 +10939,TEST,0,0 +10940,TEST,0,0 +10941,TEST,0,0 +10942,TEST,0,0 +10943,TEST,0,0 +10944,TEST,0,0 +10945,TEST,0,0 +10946,TEST,0,0 +10947,TEST,0,0 +10948,TEST,0,0 +10949,TEST,0,0 +10950,TEST,0,0 +10951,TEST,0,0 +10952,TEST,0,0 +10953,TEST,0,0 +10954,TEST,0,0 +10955,TEST,0,0 +10956,TEST,0,0 +10957,TEST,0,0 +10958,TEST,0,0 +10959,TEST,0,0 +10960,TEST,0,0 +10961,TEST,0,0 +10962,TEST,0,0 +10963,TEST,0,0 +10964,TEST,0,0 +10965,TEST,0,0 +10966,TEST,0,0 +10967,TEST,0,0 +10968,TEST,0,0 +10969,TEST,0,0 +10970,TEST,0,0 +10971,TEST,0,0 +10972,TEST,0,0 +10973,TEST,0,0 +10974,TEST,0,0 +10975,TEST,0,0 +10976,TEST,0,0 +10977,TEST,0,0 +10978,TEST,0,0 +10979,TEST,0,0 +10980,TEST,0,0 +10981,TEST,0,0 +10982,TEST,0,0 +10983,TEST,0,0 +10984,TEST,0,0 +10985,TEST,0,0 +10986,TEST,0,0 +10987,TEST,0,0 +10988,TEST,0,0 +10989,TEST,0,0 +10990,TEST,0,0 +10991,TEST,0,0 +10992,TEST,0,0 +10993,TEST,0,0 +10994,TEST,0,0 +10995,TEST,0,0 +10996,TEST,0,0 +10997,TEST,0,0 +10998,TEST,0,0 +10999,TEST,0,0 
+11000,TEST,0,0 +11001,TEST,0,0 +11002,TEST,0,0 +11003,TEST,0,0 +11004,TEST,0,0 +11005,TEST,0,0 +11006,TEST,0,0 +11007,TEST,0,0 +11008,TEST,0,0 +11009,TEST,0,0 +11010,TEST,0,0 +11011,TEST,0,0 +11012,TEST,0,0 +11013,TEST,0,0 +11014,TEST,0,0 +11015,TEST,0,0 +11016,TEST,0,0 +11017,TEST,0,0 +11018,TEST,0,0 +11019,TEST,0,0 +11020,TEST,0,0 +11021,TEST,0,0 +11022,TEST,0,0 +11023,TEST,0,0 +11024,TEST,0,0 +11025,TEST,0,0 +11026,TEST,0,0 +11027,TEST,0,0 +11028,TEST,0,0 +11029,TEST,0,0 +11030,TEST,0,0 +11031,TEST,0,0 +11032,TEST,0,0 +11033,TEST,0,0 +11034,TEST,0,0 +11035,TEST,0,0 +11036,TEST,0,0 +11037,TEST,0,0 +11038,TEST,0,0 +11039,TEST,0,0 +11040,TEST,0,0 +11041,TEST,0,0 +11042,TEST,0,0 +11043,TEST,0,0 +11044,TEST,0,0 +11045,TEST,0,0 +11046,TEST,0,0 +11047,TEST,0,0 +11048,TEST,0,0 +11049,TEST,0,0 +11050,TEST,0,0 +11051,TEST,0,0 +11052,TEST,0,0 +11053,TEST,0,0 +11054,TEST,0,0 +11055,TEST,0,0 +11056,TEST,0,0 +11057,TEST,0,0 +11058,TEST,0,0 +11059,TEST,0,0 +11060,TEST,0,0 +11061,TEST,0,0 +11062,TEST,0,0 +11063,TEST,0,0 +11064,TEST,0,0 +11065,TEST,0,0 +11066,TEST,0,0 +11067,TEST,0,0 +11068,TEST,0,0 +11069,TEST,0,0 +11070,TEST,0,0 +11071,TEST,0,0 +11072,TEST,0,0 +11073,TEST,0,0 +11074,TEST,0,0 +11075,TEST,0,0 +11076,TEST,0,0 +11077,TEST,0,0 +11078,TEST,0,0 +11079,TEST,0,0 +11080,TEST,0,0 +11081,TEST,0,0 +11082,TEST,0,0 +11083,TEST,0,0 +11084,TEST,0,0 +11085,TEST,0,0 +11086,TEST,0,0 +11087,TEST,0,0 +11088,TEST,0,0 +11089,TEST,0,0 +11090,TEST,0,0 +11091,TEST,0,0 +11092,TEST,0,0 +11093,TEST,0,0 +11094,TEST,0,0 +11095,TEST,0,0 +11096,TEST,0,0 +11097,TEST,0,0 +11098,TEST,0,0 +11099,TEST,0,0 +11100,TEST,0,0 +11101,TEST,0,0 +11102,TEST,0,0 +11103,TEST,0,0 +11104,TEST,0,0 +11105,TEST,0,0 +11106,TEST,0,0 +11107,TEST,0,0 +11108,TEST,0,0 +11109,TEST,0,0 +11110,TEST,0,0 +11111,TEST,0,0 +11112,TEST,0,0 +11113,TEST,0,0 +11114,TEST,0,0 +11115,TEST,0,0 +11116,TEST,0,0 +11117,TEST,0,0 +11118,TEST,0,0 +11119,TEST,0,0 +11120,TEST,0,0 +11121,TEST,0,0 +11122,TEST,0,0 +11123,TEST,0,0 +11124,TEST,0,0 +11125,TEST,0,0 +11126,TEST,0,0 +11127,TEST,0,0 +11128,TEST,0,0 +11129,TEST,0,0 +11130,TEST,0,0 +11131,TEST,0,0 +11132,TEST,0,0 +11133,TEST,0,0 +11134,TEST,0,0 +11135,TEST,0,0 +11136,TEST,0,0 +11137,TEST,0,0 +11138,TEST,0,0 +11139,TEST,0,0 +11140,TEST,0,0 +11141,TEST,0,0 +11142,TEST,0,0 +11143,TEST,0,0 +11144,TEST,0,0 +11145,TEST,0,0 +11146,TEST,0,0 +11147,TEST,0,0 +11148,TEST,0,0 +11149,TEST,0,0 +11150,TEST,0,0 +11151,TEST,0,0 +11152,TEST,0,0 +11153,TEST,0,0 +11154,TEST,0,0 +11155,TEST,0,0 +11156,TEST,0,0 +11157,TEST,0,0 +11158,TEST,0,0 +11159,TEST,0,0 +11160,TEST,0,0 +11161,TEST,0,0 +11162,TEST,0,0 +11163,TEST,0,0 +11164,TEST,0,0 +11165,TEST,0,0 +11166,TEST,0,0 +11167,TEST,0,0 +11168,TEST,0,0 +11169,TEST,0,0 +11170,TEST,0,0 +11171,TEST,0,0 +11172,TEST,0,0 +11173,TEST,0,0 +11174,TEST,0,0 +11175,TEST,0,0 +11176,TEST,0,0 +11177,TEST,0,0 +11178,TEST,0,0 +11179,TEST,0,0 +11180,TEST,0,0 +11181,TEST,0,0 +11182,TEST,0,0 +11183,TEST,0,0 +11184,TEST,0,0 +11185,TEST,0,0 +11186,TEST,0,0 +11187,TEST,0,0 +11188,TEST,0,0 +11189,TEST,0,0 +11190,TEST,0,0 +11191,TEST,0,0 +11192,TEST,0,0 +11193,TEST,0,0 +11194,TEST,0,0 +11195,TEST,0,0 +11196,TEST,0,0 +11197,TEST,0,0 +11198,TEST,0,0 +11199,TEST,0,0 +11200,TEST,0,0 +11201,TEST,0,0 +11202,TEST,0,0 +11203,TEST,0,0 +11204,TEST,0,0 +11205,TEST,0,0 +11206,TEST,0,0 +11207,TEST,0,0 +11208,TEST,0,0 +11209,TEST,0,0 +11210,TEST,0,0 +11211,TEST,0,0 +11212,TEST,0,0 +11213,TEST,0,0 +11214,TEST,0,0 +11215,TEST,0,0 +11216,TEST,0,0 +11217,TEST,0,0 +11218,TEST,0,0 +11219,TEST,0,0 +11220,TEST,0,0 +11221,TEST,0,0 
+11222,TEST,0,0 +11223,TEST,0,0 +11224,TEST,0,0 +11225,TEST,0,0 +11226,TEST,0,0 +11227,TEST,0,0 +11228,TEST,0,0 +11229,TEST,0,0 +11230,TEST,0,0 +11231,TEST,0,0 +11232,TEST,0,0 +11233,TEST,0,0 +11234,TEST,0,0 +11235,TEST,0,0 +11236,TEST,0,0 +11237,TEST,0,0 +11238,TEST,0,0 +11239,TEST,0,0 +11240,TEST,0,0 +11241,TEST,0,0 +11242,TEST,0,0 +11243,TEST,0,0 +11244,TEST,0,0 +11245,TEST,0,0 +11246,TEST,0,0 +11247,TEST,0,0 +11248,TEST,0,0 +11249,TEST,0,0 +11250,TEST,0,0 +11251,TEST,0,0 +11252,TEST,0,0 +11253,TEST,0,0 +11254,TEST,0,0 +11255,TEST,0,0 +11256,TEST,0,0 +11257,TEST,0,0 +11258,TEST,0,0 +11259,TEST,0,0 +11260,TEST,0,0 +11261,TEST,0,0 +11262,TEST,0,0 +11263,TEST,0,0 +11264,TEST,0,0 +11265,TEST,0,0 +11266,TEST,0,0 +11267,TEST,0,0 +11268,TEST,0,0 +11269,TEST,0,0 +11270,TEST,0,0 +11271,TEST,0,0 +11272,TEST,0,0 +11273,TEST,0,0 +11274,TEST,0,0 +11275,TEST,0,0 +11276,TEST,0,0 +11277,TEST,0,0 +11278,TEST,0,0 +11279,TEST,0,0 +11280,TEST,0,0 +11281,TEST,0,0 +11282,TEST,0,0 +11283,TEST,0,0 +11284,TEST,0,0 +11285,TEST,0,0 +11286,TEST,0,0 +11287,TEST,0,0 +11288,TEST,0,0 +11289,TEST,0,0 +11290,TEST,0,0 +11291,TEST,0,0 +11292,TEST,0,0 +11293,TEST,0,0 +11294,TEST,0,0 +11295,TEST,0,0 +11296,TEST,0,0 +11297,TEST,0,0 +11298,TEST,0,0 +11299,TEST,0,0 +11300,TEST,0,0 +11301,TEST,0,0 +11302,TEST,0,0 +11303,TEST,0,0 +11304,TEST,0,0 +11305,TEST,0,0 +11306,TEST,0,0 +11307,TEST,0,0 +11308,TEST,0,0 +11309,TEST,0,0 +11310,TEST,0,0 +11311,TEST,0,0 +11312,TEST,0,0 +11313,TEST,0,0 +11314,TEST,0,0 +11315,TEST,0,0 +11316,TEST,0,0 +11317,TEST,0,0 +11318,TEST,0,0 +11319,TEST,0,0 +11320,TEST,0,0 +11321,TEST,0,0 +11322,TEST,0,0 +11323,TEST,0,0 +11324,TEST,0,0 +11325,TEST,0,0 +11326,TEST,0,0 +11327,TEST,0,0 +11328,TEST,0,0 +11329,TEST,0,0 +11330,TEST,0,0 +11331,TEST,0,0 +11332,TEST,0,0 +11333,TEST,0,0 +11334,TEST,0,0 +11335,TEST,0,0 +11336,TEST,0,0 +11337,TEST,0,0 +11338,TEST,0,0 +11339,TEST,0,0 +11340,TEST,0,0 +11341,TEST,0,0 +11342,TEST,0,0 +11343,TEST,0,0 +11344,TEST,0,0 +11345,TEST,0,0 +11346,TEST,0,0 +11347,TEST,0,0 +11348,TEST,0,0 +11349,TEST,0,0 +11350,TEST,0,0 +11351,TEST,0,0 +11352,TEST,0,0 +11353,TEST,0,0 +11354,TEST,0,0 +11355,TEST,0,0 +11356,TEST,0,0 +11357,TEST,0,0 +11358,TEST,0,0 +11359,TEST,0,0 +11360,TEST,0,0 +11361,TEST,0,0 +11362,TEST,0,0 +11363,TEST,0,0 +11364,TEST,0,0 +11365,TEST,0,0 +11366,TEST,0,0 +11367,TEST,0,0 +11368,TEST,0,0 +11369,TEST,0,0 +11370,TEST,0,0 +11371,TEST,0,0 +11372,TEST,0,0 +11373,TEST,0,0 +11374,TEST,0,0 +11375,TEST,0,0 +11376,TEST,0,0 +11377,TEST,0,0 +11378,TEST,0,0 +11379,TEST,0,0 +11380,TEST,0,0 +11381,TEST,0,0 +11382,TEST,0,0 +11383,TEST,0,0 +11384,TEST,0,0 +11385,TEST,0,0 +11386,TEST,0,0 +11387,TEST,0,0 +11388,TEST,0,0 +11389,TEST,0,0 +11390,TEST,0,0 +11391,TEST,0,0 +11392,TEST,0,0 +11393,TEST,0,0 +11394,TEST,0,0 +11395,TEST,0,0 +11396,TEST,0,0 +11397,TEST,0,0 +11398,TEST,0,0 +11399,TEST,0,0 +11400,TEST,0,0 +11401,TEST,0,0 +11402,TEST,0,0 +11403,TEST,0,0 +11404,TEST,0,0 +11405,TEST,0,0 +11406,TEST,0,0 +11407,TEST,0,0 +11408,TEST,0,0 +11409,TEST,0,0 +11410,TEST,0,0 +11411,TEST,0,0 +11412,TEST,0,0 +11413,TEST,0,0 +11414,TEST,0,0 +11415,TEST,0,0 +11416,TEST,0,0 +11417,TEST,0,0 +11418,TEST,0,0 +11419,TEST,0,0 +11420,TEST,0,0 +11421,TEST,0,0 +11422,TEST,0,0 +11423,TEST,0,0 +11424,TEST,0,0 +11425,TEST,0,0 +11426,TEST,0,0 +11427,TEST,0,0 +11428,TEST,0,0 +11429,TEST,0,0 +11430,TEST,0,0 +11431,TEST,0,0 +11432,TEST,0,0 +11433,TEST,0,0 +11434,TEST,0,0 +11435,TEST,0,0 +11436,TEST,0,0 +11437,TEST,0,0 +11438,TEST,0,0 +11439,TEST,0,0 +11440,TEST,0,0 +11441,TEST,0,0 +11442,TEST,0,0 +11443,TEST,0,0 
+11444,TEST,0,0 +11445,TEST,0,0 +11446,TEST,0,0 +11447,TEST,0,0 +11448,TEST,0,0 +11449,TEST,0,0 +11450,TEST,0,0 +11451,TEST,0,0 +11452,TEST,0,0 +11453,TEST,0,0 +11454,TEST,0,0 +11455,TEST,0,0 +11456,TEST,0,0 +11457,TEST,0,0 +11458,TEST,0,0 +11459,TEST,0,0 +11460,TEST,0,0 +11461,TEST,0,0 +11462,TEST,0,0 +11463,TEST,0,0 +11464,TEST,0,0 +11465,TEST,0,0 +11466,TEST,0,0 +11467,TEST,0,0 +11468,TEST,0,0 +11469,TEST,0,0 +11470,TEST,0,0 +11471,TEST,0,0 +11472,TEST,0,0 +11473,TEST,0,0 +11474,TEST,0,0 +11475,TEST,0,0 +11476,TEST,0,0 +11477,TEST,0,0 +11478,TEST,0,0 +11479,TEST,0,0 +11480,TEST,0,0 +11481,TEST,0,0 +11482,TEST,0,0 +11483,TEST,0,0 +11484,TEST,0,0 +11485,TEST,0,0 +11486,TEST,0,0 +11487,TEST,0,0 +11488,TEST,0,0 +11489,TEST,0,0 +11490,TEST,0,0 +11491,TEST,0,0 +11492,TEST,0,0 +11493,TEST,0,0 +11494,TEST,0,0 +11495,TEST,0,0 +11496,TEST,0,0 +11497,TEST,0,0 +11498,TEST,0,0 +11499,TEST,0,0 +11500,TEST,0,0 +11501,TEST,0,0 +11502,TEST,0,0 +11503,TEST,0,0 +11504,TEST,0,0 +11505,TEST,0,0 +11506,TEST,0,0 +11507,TEST,0,0 +11508,TEST,0,0 +11509,TEST,0,0 +11510,TEST,0,0 +11511,TEST,0,0 +11512,TEST,0,0 +11513,TEST,0,0 +11514,TEST,0,0 +11515,TEST,0,0 +11516,TEST,0,0 +11517,TEST,0,0 +11518,TEST,0,0 +11519,TEST,0,0 +11520,TEST,0,0 +11521,TEST,0,0 +11522,TEST,0,0 +11523,TEST,0,0 +11524,TEST,0,0 +11525,TEST,0,0 +11526,TEST,0,0 +11527,TEST,0,0 +11528,TEST,0,0 +11529,TEST,0,0 +11530,TEST,0,0 +11531,TEST,0,0 +11532,TEST,0,0 +11533,TEST,0,0 +11534,TEST,0,0 +11535,TEST,0,0 +11536,TEST,0,0 +11537,TEST,0,0 +11538,TEST,0,0 +11539,TEST,0,0 +11540,TEST,0,0 +11541,TEST,0,0 +11542,TEST,0,0 +11543,TEST,0,0 +11544,TEST,0,0 +11545,TEST,0,0 +11546,TEST,0,0 +11547,TEST,0,0 +11548,TEST,0,0 +11549,TEST,0,0 +11550,TEST,0,0 +11551,TEST,0,0 +11552,TEST,0,0 +11553,TEST,0,0 +11554,TEST,0,0 +11555,TEST,0,0 +11556,TEST,0,0 +11557,TEST,0,0 +11558,TEST,0,0 +11559,TEST,0,0 +11560,TEST,0,0 +11561,TEST,0,0 +11562,TEST,0,0 +11563,TEST,0,0 +11564,TEST,0,0 +11565,TEST,0,0 +11566,TEST,0,0 +11567,TEST,0,0 +11568,TEST,0,0 +11569,TEST,0,0 +11570,TEST,0,0 +11571,TEST,0,0 +11572,TEST,0,0 +11573,TEST,0,0 +11574,TEST,0,0 +11575,TEST,0,0 +11576,TEST,0,0 +11577,TEST,0,0 +11578,TEST,0,0 +11579,TEST,0,0 +11580,TEST,0,0 +11581,TEST,0,0 +11582,TEST,0,0 +11583,TEST,0,0 +11584,TEST,0,0 +11585,TEST,0,0 +11586,TEST,0,0 +11587,TEST,0,0 +11588,TEST,0,0 +11589,TEST,0,0 +11590,TEST,0,0 +11591,TEST,0,0 +11592,TEST,0,0 +11593,TEST,0,0 +11594,TEST,0,0 +11595,TEST,0,0 +11596,TEST,0,0 +11597,TEST,0,0 +11598,TEST,0,0 +11599,TEST,0,0 +11600,TEST,0,0 +11601,TEST,0,0 +11602,TEST,0,0 +11603,TEST,0,0 +11604,TEST,0,0 +11605,TEST,0,0 +11606,TEST,0,0 +11607,TEST,0,0 +11608,TEST,0,0 +11609,TEST,0,0 +11610,TEST,0,0 +11611,TEST,0,0 +11612,TEST,0,0 +11613,TEST,0,0 +11614,TEST,0,0 +11615,TEST,0,0 +11616,TEST,0,0 +11617,TEST,0,0 +11618,TEST,0,0 +11619,TEST,0,0 +11620,TEST,0,0 +11621,TEST,0,0 +11622,TEST,0,0 +11623,TEST,0,0 +11624,TEST,0,0 +11625,TEST,0,0 +11626,TEST,0,0 +11627,TEST,0,0 +11628,TEST,0,0 +11629,TEST,0,0 +11630,TEST,0,0 +11631,TEST,0,0 +11632,TEST,0,0 +11633,TEST,0,0 +11634,TEST,0,0 +11635,TEST,0,0 +11636,TEST,0,0 +11637,TEST,0,0 +11638,TEST,0,0 +11639,TEST,0,0 +11640,TEST,0,0 +11641,TEST,0,0 +11642,TEST,0,0 +11643,TEST,0,0 +11644,TEST,0,0 +11645,TEST,0,0 +11646,TEST,0,0 +11647,TEST,0,0 +11648,TEST,0,0 +11649,TEST,0,0 +11650,TEST,0,0 +11651,TEST,0,0 +11652,TEST,0,0 +11653,TEST,0,0 +11654,TEST,0,0 +11655,TEST,0,0 +11656,TEST,0,0 +11657,TEST,0,0 +11658,TEST,0,0 +11659,TEST,0,0 +11660,TEST,0,0 +11661,TEST,0,0 +11662,TEST,0,0 +11663,TEST,0,0 +11664,TEST,0,0 +11665,TEST,0,0 
+11666,TEST,0,0 +11667,TEST,0,0 +11668,TEST,0,0 +11669,TEST,0,0 +11670,TEST,0,0 +11671,TEST,0,0 +11672,TEST,0,0 +11673,TEST,0,0 +11674,TEST,0,0 +11675,TEST,0,0 +11676,TEST,0,0 +11677,TEST,0,0 +11678,TEST,0,0 +11679,TEST,0,0 +11680,TEST,0,0 +11681,TEST,0,0 +11682,TEST,0,0 +11683,TEST,0,0 +11684,TEST,0,0 +11685,TEST,0,0 +11686,TEST,0,0 +11687,TEST,0,0 +11688,TEST,0,0 +11689,TEST,0,0 +11690,TEST,0,0 +11691,TEST,0,0 +11692,TEST,0,0 +11693,TEST,0,0 +11694,TEST,0,0 +11695,TEST,0,0 +11696,TEST,0,0 +11697,TEST,0,0 +11698,TEST,0,0 +11699,TEST,0,0 +11700,TEST,0,0 +11701,TEST,0,0 +11702,TEST,0,0 +11703,TEST,0,0 +11704,TEST,0,0 +11705,TEST,0,0 +11706,TEST,0,0 +11707,TEST,0,0 +11708,TEST,0,0 +11709,TEST,0,0 +11710,TEST,0,0 +11711,TEST,0,0 +11712,TEST,0,0 +11713,TEST,0,0 +11714,TEST,0,0 +11715,TEST,0,0 +11716,TEST,0,0 +11717,TEST,0,0 +11718,TEST,0,0 +11719,TEST,0,0 +11720,TEST,0,0 +11721,TEST,0,0 +11722,TEST,0,0 +11723,TEST,0,0 +11724,TEST,0,0 +11725,TEST,0,0 +11726,TEST,0,0 +11727,TEST,0,0 +11728,TEST,0,0 +11729,TEST,0,0 +11730,TEST,0,0 +11731,TEST,0,0 +11732,TEST,0,0 +11733,TEST,0,0 +11734,TEST,0,0 +11735,TEST,0,0 +11736,TEST,0,0 +11737,TEST,0,0 +11738,TEST,0,0 +11739,TEST,0,0 +11740,TEST,0,0 +11741,TEST,0,0 +11742,TEST,0,0 +11743,TEST,0,0 +11744,TEST,0,0 +11745,TEST,0,0 +11746,TEST,0,0 +11747,TEST,0,0 +11748,TEST,0,0 +11749,TEST,0,0 +11750,TEST,0,0 +11751,TEST,0,0 +11752,TEST,0,0 +11753,TEST,0,0 +11754,TEST,0,0 +11755,TEST,0,0 +11756,TEST,0,0 +11757,TEST,0,0 +11758,TEST,0,0 +11759,TEST,0,0 +11760,TEST,0,0 +11761,TEST,0,0 +11762,TEST,0,0 +11763,TEST,0,0 +11764,TEST,0,0 +11765,TEST,0,0 +11766,TEST,0,0 +11767,TEST,0,0 +11768,TEST,0,0 +11769,TEST,0,0 +11770,TEST,0,0 +11771,TEST,0,0 +11772,TEST,0,0 +11773,TEST,0,0 +11774,TEST,0,0 +11775,TEST,0,0 +11776,TEST,0,0 +11777,TEST,0,0 +11778,TEST,0,0 +11779,TEST,0,0 +11780,TEST,0,0 +11781,TEST,0,0 +11782,TEST,0,0 +11783,TEST,0,0 +11784,TEST,0,0 +11785,TEST,0,0 +11786,TEST,0,0 +11787,TEST,0,0 +11788,TEST,0,0 +11789,TEST,0,0 +11790,TEST,0,0 +11791,TEST,0,0 +11792,TEST,0,0 +11793,TEST,0,0 +11794,TEST,0,0 +11795,TEST,0,0 +11796,TEST,0,0 +11797,TEST,0,0 +11798,TEST,0,0 +11799,TEST,0,0 +11800,TEST,0,0 +11801,TEST,0,0 +11802,TEST,0,0 +11803,TEST,0,0 +11804,TEST,0,0 +11805,TEST,0,0 +11806,TEST,0,0 +11807,TEST,0,0 +11808,TEST,0,0 +11809,TEST,0,0 +11810,TEST,0,0 +11811,TEST,0,0 +11812,TEST,0,0 +11813,TEST,0,0 +11814,TEST,0,0 +11815,TEST,0,0 +11816,TEST,0,0 +11817,TEST,0,0 +11818,TEST,0,0 +11819,TEST,0,0 +11820,TEST,0,0 +11821,TEST,0,0 +11822,TEST,0,0 +11823,TEST,0,0 +11824,TEST,0,0 +11825,TEST,0,0 +11826,TEST,0,0 +11827,TEST,0,0 +11828,TEST,0,0 +11829,TEST,0,0 +11830,TEST,0,0 +11831,TEST,0,0 +11832,TEST,0,0 +11833,TEST,0,0 +11834,TEST,0,0 +11835,TEST,0,0 +11836,TEST,0,0 +11837,TEST,0,0 +11838,TEST,0,0 +11839,TEST,0,0 +11840,TEST,0,0 +11841,TEST,0,0 +11842,TEST,0,0 +11843,TEST,0,0 +11844,TEST,0,0 +11845,TEST,0,0 +11846,TEST,0,0 +11847,TEST,0,0 +11848,TEST,0,0 +11849,TEST,0,0 +11850,TEST,0,0 +11851,TEST,0,0 +11852,TEST,0,0 +11853,TEST,0,0 +11854,TEST,0,0 +11855,TEST,0,0 +11856,TEST,0,0 +11857,TEST,0,0 +11858,TEST,0,0 +11859,TEST,0,0 +11860,TEST,0,0 +11861,TEST,0,0 +11862,TEST,0,0 +11863,TEST,0,0 +11864,TEST,0,0 +11865,TEST,0,0 +11866,TEST,0,0 +11867,TEST,0,0 +11868,TEST,0,0 +11869,TEST,0,0 +11870,TEST,0,0 +11871,TEST,0,0 +11872,TEST,0,0 +11873,TEST,0,0 +11874,TEST,0,0 +11875,TEST,0,0 +11876,TEST,0,0 +11877,TEST,0,0 +11878,TEST,0,0 +11879,TEST,0,0 +11880,TEST,0,0 +11881,TEST,0,0 +11882,TEST,0,0 +11883,TEST,0,0 +11884,TEST,0,0 +11885,TEST,0,0 +11886,TEST,0,0 +11887,TEST,0,0 
+11888,TEST,0,0 +11889,TEST,0,0 +11890,TEST,0,0 +11891,TEST,0,0 +11892,TEST,0,0 +11893,TEST,0,0 +11894,TEST,0,0 +11895,TEST,0,0 +11896,TEST,0,0 +11897,TEST,0,0 +11898,TEST,0,0 +11899,TEST,0,0 +11900,TEST,0,0 +11901,TEST,0,0 +11902,TEST,0,0 +11903,TEST,0,0 +11904,TEST,0,0 +11905,TEST,0,0 +11906,TEST,0,0 +11907,TEST,0,0 +11908,TEST,0,0 +11909,TEST,0,0 +11910,TEST,0,0 +11911,TEST,0,0 +11912,TEST,0,0 +11913,TEST,0,0 +11914,TEST,0,0 +11915,TEST,0,0 +11916,TEST,0,0 +11917,TEST,0,0 +11918,TEST,0,0 +11919,TEST,0,0 +11920,TEST,0,0 +11921,TEST,0,0 +11922,TEST,0,0 +11923,TEST,0,0 +11924,TEST,0,0 +11925,TEST,0,0 +11926,TEST,0,0 +11927,TEST,0,0 +11928,TEST,0,0 +11929,TEST,0,0 +11930,TEST,0,0 +11931,TEST,0,0 +11932,TEST,0,0 +11933,TEST,0,0 +11934,TEST,0,0 +11935,TEST,0,0 +11936,TEST,0,0 +11937,TEST,0,0 +11938,TEST,0,0 +11939,TEST,0,0 +11940,TEST,0,0 +11941,TEST,0,0 +11942,TEST,0,0 +11943,TEST,0,0 +11944,TEST,0,0 +11945,TEST,0,0 +11946,TEST,0,0 +11947,TEST,0,0 +11948,TEST,0,0 +11949,TEST,0,0 +11950,TEST,0,0 +11951,TEST,0,0 +11952,TEST,0,0 +11953,TEST,0,0 +11954,TEST,0,0 +11955,TEST,0,0 +11956,TEST,0,0 +11957,TEST,0,0 +11958,TEST,0,0 +11959,TEST,0,0 +11960,TEST,0,0 +11961,TEST,0,0 +11962,TEST,0,0 +11963,TEST,0,0 +11964,TEST,0,0 +11965,TEST,0,0 +11966,TEST,0,0 +11967,TEST,0,0 +11968,TEST,0,0 +11969,TEST,0,0 +11970,TEST,0,0 +11971,TEST,0,0 +11972,TEST,0,0 +11973,TEST,0,0 +11974,TEST,0,0 +11975,TEST,0,0 +11976,TEST,0,0 +11977,TEST,0,0 +11978,TEST,0,0 +11979,TEST,0,0 +11980,TEST,0,0 +11981,TEST,0,0 +11982,TEST,0,0 +11983,TEST,0,0 +11984,TEST,0,0 +11985,TEST,0,0 +11986,TEST,0,0 +11987,TEST,0,0 +11988,TEST,0,0 +11989,TEST,0,0 +11990,TEST,0,0 +11991,TEST,0,0 +11992,TEST,0,0 +11993,TEST,0,0 +11994,TEST,0,0 +11995,TEST,0,0 +11996,TEST,0,0 +11997,TEST,0,0 +11998,TEST,0,0 +11999,TEST,0,0 +12000,TEST,0,0 +12001,TEST,0,0 +12002,TEST,0,0 +12003,TEST,0,0 +12004,TEST,0,0 +12005,TEST,0,0 +12006,TEST,0,0 +12007,TEST,0,0 +12008,TEST,0,0 +12009,TEST,0,0 +12010,TEST,0,0 +12011,TEST,0,0 +12012,TEST,0,0 +12013,TEST,0,0 +12014,TEST,0,0 +12015,TEST,0,0 +12016,TEST,0,0 +12017,TEST,0,0 +12018,TEST,0,0 +12019,TEST,0,0 +12020,TEST,0,0 +12021,TEST,0,0 +12022,TEST,0,0 +12023,TEST,0,0 +12024,TEST,0,0 +12025,TEST,0,0 +12026,TEST,0,0 +12027,TEST,0,0 +12028,TEST,0,0 +12029,TEST,0,0 +12030,TEST,0,0 +12031,TEST,0,0 +12032,TEST,0,0 +12033,TEST,0,0 +12034,TEST,0,0 +12035,TEST,0,0 +12036,TEST,0,0 +12037,TEST,0,0 +12038,TEST,0,0 +12039,TEST,0,0 +12040,TEST,0,0 +12041,TEST,0,0 +12042,TEST,0,0 +12043,TEST,0,0 +12044,TEST,0,0 +12045,TEST,0,0 +12046,TEST,0,0 +12047,TEST,0,0 +12048,TEST,0,0 +12049,TEST,0,0 +12050,TEST,0,0 +12051,TEST,0,0 +12052,TEST,0,0 +12053,TEST,0,0 +12054,TEST,0,0 +12055,TEST,0,0 +12056,TEST,0,0 +12057,TEST,0,0 +12058,TEST,0,0 +12059,TEST,0,0 +12060,TEST,0,0 +12061,TEST,0,0 +12062,TEST,0,0 +12063,TEST,0,0 +12064,TEST,0,0 +12065,TEST,0,0 +12066,TEST,0,0 +12067,TEST,0,0 +12068,TEST,0,0 +12069,TEST,0,0 +12070,TEST,0,0 +12071,TEST,0,0 +12072,TEST,0,0 +12073,TEST,0,0 +12074,TEST,0,0 +12075,TEST,0,0 +12076,TEST,0,0 +12077,TEST,0,0 +12078,TEST,0,0 +12079,TEST,0,0 +12080,TEST,0,0 +12081,TEST,0,0 +12082,TEST,0,0 +12083,TEST,0,0 +12084,TEST,0,0 +12085,TEST,0,0 +12086,TEST,0,0 +12087,TEST,0,0 +12088,TEST,0,0 +12089,TEST,0,0 +12090,TEST,0,0 +12091,TEST,0,0 +12092,TEST,0,0 +12093,TEST,0,0 +12094,TEST,0,0 +12095,TEST,0,0 +12096,TEST,0,0 +12097,TEST,0,0 +12098,TEST,0,0 +12099,TEST,0,0 +12100,TEST,0,0 +12101,TEST,0,0 +12102,TEST,0,0 +12103,TEST,0,0 +12104,TEST,0,0 +12105,TEST,0,0 +12106,TEST,0,0 +12107,TEST,0,0 +12108,TEST,0,0 +12109,TEST,0,0 
+12110,TEST,0,0 +12111,TEST,0,0 +12112,TEST,0,0 +12113,TEST,0,0 +12114,TEST,0,0 +12115,TEST,0,0 +12116,TEST,0,0 +12117,TEST,0,0 +12118,TEST,0,0 +12119,TEST,0,0 +12120,TEST,0,0 +12121,TEST,0,0 +12122,TEST,0,0 +12123,TEST,0,0 +12124,TEST,0,0 +12125,TEST,0,0 +12126,TEST,0,0 +12127,TEST,0,0 +12128,TEST,0,0 +12129,TEST,0,0 +12130,TEST,0,0 +12131,TEST,0,0 +12132,TEST,0,0 +12133,TEST,0,0 +12134,TEST,0,0 +12135,TEST,0,0 +12136,TEST,0,0 +12137,TEST,0,0 +12138,TEST,0,0 +12139,TEST,0,0 +12140,TEST,0,0 +12141,TEST,0,0 +12142,TEST,0,0 +12143,TEST,0,0 +12144,TEST,0,0 +12145,TEST,0,0 +12146,TEST,0,0 +12147,TEST,0,0 +12148,TEST,0,0 +12149,TEST,0,0 +12150,TEST,0,0 +12151,TEST,0,0 +12152,TEST,0,0 +12153,TEST,0,0 +12154,TEST,0,0 +12155,TEST,0,0 +12156,TEST,0,0 +12157,TEST,0,0 +12158,TEST,0,0 +12159,TEST,0,0 +12160,TEST,0,0 +12161,TEST,0,0 +12162,TEST,0,0 +12163,TEST,0,0 +12164,TEST,0,0 +12165,TEST,0,0 +12166,TEST,0,0 +12167,TEST,0,0 +12168,TEST,0,0 +12169,TEST,0,0 +12170,TEST,0,0 +12171,TEST,0,0 +12172,TEST,0,0 +12173,TEST,0,0 +12174,TEST,0,0 +12175,TEST,0,0 +12176,TEST,0,0 +12177,TEST,0,0 +12178,TEST,0,0 +12179,TEST,0,0 +12180,TEST,0,0 +12181,TEST,0,0 +12182,TEST,0,0 +12183,TEST,0,0 +12184,TEST,0,0 +12185,TEST,0,0 +12186,TEST,0,0 +12187,TEST,0,0 +12188,TEST,0,0 +12189,TEST,0,0 +12190,TEST,0,0 +12191,TEST,0,0 +12192,TEST,0,0 +12193,TEST,0,0 +12194,TEST,0,0 +12195,TEST,0,0 +12196,TEST,0,0 +12197,TEST,0,0 +12198,TEST,0,0 +12199,TEST,0,0 +12200,TEST,0,0 +12201,TEST,0,0 +12202,TEST,0,0 +12203,TEST,0,0 +12204,TEST,0,0 +12205,TEST,0,0 +12206,TEST,0,0 +12207,TEST,0,0 +12208,TEST,0,0 +12209,TEST,0,0 +12210,TEST,0,0 +12211,TEST,0,0 +12212,TEST,0,0 +12213,TEST,0,0 +12214,TEST,0,0 +12215,TEST,0,0 +12216,TEST,0,0 +12217,TEST,0,0 +12218,TEST,0,0 +12219,TEST,0,0 +12220,TEST,0,0 +12221,TEST,0,0 +12222,TEST,0,0 +12223,TEST,0,0 +12224,TEST,0,0 +12225,TEST,0,0 +12226,TEST,0,0 +12227,TEST,0,0 +12228,TEST,0,0 +12229,TEST,0,0 +12230,TEST,0,0 +12231,TEST,0,0 +12232,TEST,0,0 +12233,TEST,0,0 +12234,TEST,0,0 +12235,TEST,0,0 +12236,TEST,0,0 +12237,TEST,0,0 +12238,TEST,0,0 +12239,TEST,0,0 +12240,TEST,0,0 +12241,TEST,0,0 +12242,TEST,0,0 +12243,TEST,0,0 +12244,TEST,0,0 +12245,TEST,0,0 +12246,TEST,0,0 +12247,TEST,0,0 +12248,TEST,0,0 +12249,TEST,0,0 +12250,TEST,0,0 +12251,TEST,0,0 +12252,TEST,0,0 +12253,TEST,0,0 +12254,TEST,0,0 +12255,TEST,0,0 +12256,TEST,0,0 +12257,TEST,0,0 +12258,TEST,0,0 +12259,TEST,0,0 +12260,TEST,0,0 +12261,TEST,0,0 +12262,TEST,0,0 +12263,TEST,0,0 +12264,TEST,0,0 +12265,TEST,0,0 +12266,TEST,0,0 +12267,TEST,0,0 +12268,TEST,0,0 +12269,TEST,0,0 +12270,TEST,0,0 +12271,TEST,0,0 +12272,TEST,0,0 +12273,TEST,0,0 +12274,TEST,0,0 +12275,TEST,0,0 +12276,TEST,0,0 +12277,TEST,0,0 +12278,TEST,0,0 +12279,TEST,0,0 +12280,TEST,0,0 +12281,TEST,0,0 +12282,TEST,0,0 +12283,TEST,0,0 +12284,TEST,0,0 +12285,TEST,0,0 +12286,TEST,0,0 +12287,TEST,0,0 +12288,TEST,0,0 +12289,TEST,0,0 +12290,TEST,0,0 +12291,TEST,0,0 +12292,TEST,0,0 +12293,TEST,0,0 +12294,TEST,0,0 +12295,TEST,0,0 +12296,TEST,0,0 +12297,TEST,0,0 +12298,TEST,0,0 +12299,TEST,0,0 +12300,TEST,0,0 +12301,TEST,0,0 +12302,TEST,0,0 +12303,TEST,0,0 +12304,TEST,0,0 +12305,TEST,0,0 +12306,TEST,0,0 +12307,TEST,0,0 +12308,TEST,0,0 +12309,TEST,0,0 +12310,TEST,0,0 +12311,TEST,0,0 +12312,TEST,0,0 +12313,TEST,0,0 +12314,TEST,0,0 +12315,TEST,0,0 +12316,TEST,0,0 +12317,TEST,0,0 +12318,TEST,0,0 +12319,TEST,0,0 +12320,TEST,0,0 +12321,TEST,0,0 +12322,TEST,0,0 +12323,TEST,0,0 +12324,TEST,0,0 +12325,TEST,0,0 +12326,TEST,0,0 +12327,TEST,0,0 +12328,TEST,0,0 +12329,TEST,0,0 +12330,TEST,0,0 +12331,TEST,0,0 
+12332,TEST,0,0 +12333,TEST,0,0 +12334,TEST,0,0 +12335,TEST,0,0 +12336,TEST,0,0 +12337,TEST,0,0 +12338,TEST,0,0 +12339,TEST,0,0 +12340,TEST,0,0 +12341,TEST,0,0 +12342,TEST,0,0 +12343,TEST,0,0 +12344,TEST,0,0 +12345,TEST,0,0 +12346,TEST,0,0 +12347,TEST,0,0 +12348,TEST,0,0 +12349,TEST,0,0 +12350,TEST,0,0 +12351,TEST,0,0 +12352,TEST,0,0 +12353,TEST,0,0 +12354,TEST,0,0 +12355,TEST,0,0 +12356,TEST,0,0 +12357,TEST,0,0 +12358,TEST,0,0 +12359,TEST,0,0 +12360,TEST,0,0 +12361,TEST,0,0 +12362,TEST,0,0 +12363,TEST,0,0 +12364,TEST,0,0 +12365,TEST,0,0 +12366,TEST,0,0 +12367,TEST,0,0 +12368,TEST,0,0 +12369,TEST,0,0 +12370,TEST,0,0 +12371,TEST,0,0 +12372,TEST,0,0 +12373,TEST,0,0 +12374,TEST,0,0 +12375,TEST,0,0 +12376,TEST,0,0 +12377,TEST,0,0 +12378,TEST,0,0 +12379,TEST,0,0 +12380,TEST,0,0 +12381,TEST,0,0 +12382,TEST,0,0 +12383,TEST,0,0 +12384,TEST,0,0 +12385,TEST,0,0 +12386,TEST,0,0 +12387,TEST,0,0 +12388,TEST,0,0 +12389,TEST,0,0 +12390,TEST,0,0 +12391,TEST,0,0 +12392,TEST,0,0 +12393,TEST,0,0 +12394,TEST,0,0 +12395,TEST,0,0 +12396,TEST,0,0 +12397,TEST,0,0 +12398,TEST,0,0 +12399,TEST,0,0 +12400,TEST,0,0 +12401,TEST,0,0 +12402,TEST,0,0 +12403,TEST,0,0 +12404,TEST,0,0 +12405,TEST,0,0 +12406,TEST,0,0 +12407,TEST,0,0 +12408,TEST,0,0 +12409,TEST,0,0 +12410,TEST,0,0 +12411,TEST,0,0 +12412,TEST,0,0 +12413,TEST,0,0 +12414,TEST,0,0 +12415,TEST,0,0 +12416,TEST,0,0 +12417,TEST,0,0 +12418,TEST,0,0 +12419,TEST,0,0 +12420,TEST,0,0 +12421,TEST,0,0 +12422,TEST,0,0 +12423,TEST,0,0 +12424,TEST,0,0 +12425,TEST,0,0 +12426,TEST,0,0 +12427,TEST,0,0 +12428,TEST,0,0 +12429,TEST,0,0 +12430,TEST,0,0 +12431,TEST,0,0 +12432,TEST,0,0 +12433,TEST,0,0 +12434,TEST,0,0 +12435,TEST,0,0 +12436,TEST,0,0 +12437,TEST,0,0 +12438,TEST,0,0 +12439,TEST,0,0 +12440,TEST,0,0 +12441,TEST,0,0 +12442,TEST,0,0 +12443,TEST,0,0 +12444,TEST,0,0 +12445,TEST,0,0 +12446,TEST,0,0 +12447,TEST,0,0 +12448,TEST,0,0 +12449,TEST,0,0 +12450,TEST,0,0 +12451,TEST,0,0 +12452,TEST,0,0 +12453,TEST,0,0 +12454,TEST,0,0 +12455,TEST,0,0 +12456,TEST,0,0 +12457,TEST,0,0 +12458,TEST,0,0 +12459,TEST,0,0 +12460,TEST,0,0 +12461,TEST,0,0 +12462,TEST,0,0 +12463,TEST,0,0 +12464,TEST,0,0 +12465,TEST,0,0 +12466,TEST,0,0 +12467,TEST,0,0 +12468,TEST,0,0 +12469,TEST,0,0 +12470,TEST,0,0 +12471,TEST,0,0 +12472,TEST,0,0 +12473,TEST,0,0 +12474,TEST,0,0 +12475,TEST,0,0 +12476,TEST,0,0 +12477,TEST,0,0 +12478,TEST,0,0 +12479,TEST,0,0 +12480,TEST,0,0 +12481,TEST,0,0 +12482,TEST,0,0 +12483,TEST,0,0 +12484,TEST,0,0 +12485,TEST,0,0 +12486,TEST,0,0 +12487,TEST,0,0 +12488,TEST,0,0 +12489,TEST,0,0 +12490,TEST,0,0 +12491,TEST,0,0 +12492,TEST,0,0 +12493,TEST,0,0 +12494,TEST,0,0 +12495,TEST,0,0 +12496,TEST,0,0 +12497,TEST,0,0 +12498,TEST,0,0 +12499,TEST,0,0 +12500,TEST,0,0 +12501,TEST,0,0 +12502,TEST,0,0 +12503,TEST,0,0 +12504,TEST,0,0 +12505,TEST,0,0 +12506,TEST,0,0 +12507,TEST,0,0 +12508,TEST,0,0 +12509,TEST,0,0 +12510,TEST,0,0 +12511,TEST,0,0 +12512,TEST,0,0 +12513,TEST,0,0 +12514,TEST,0,0 +12515,TEST,0,0 +12516,TEST,0,0 +12517,TEST,0,0 +12518,TEST,0,0 +12519,TEST,0,0 +12520,TEST,0,0 +12521,TEST,0,0 +12522,TEST,0,0 +12523,TEST,0,0 +12524,TEST,0,0 +12525,TEST,0,0 +12526,TEST,0,0 +12527,TEST,0,0 +12528,TEST,0,0 +12529,TEST,0,0 +12530,TEST,0,0 +12531,TEST,0,0 +12532,TEST,0,0 +12533,TEST,0,0 +12534,TEST,0,0 +12535,TEST,0,0 +12536,TEST,0,0 +12537,TEST,0,0 +12538,TEST,0,0 +12539,TEST,0,0 +12540,TEST,0,0 +12541,TEST,0,0 +12542,TEST,0,0 +12543,TEST,0,0 +12544,TEST,0,0 +12545,TEST,0,0 +12546,TEST,0,0 +12547,TEST,0,0 +12548,TEST,0,0 +12549,TEST,0,0 +12550,TEST,0,0 +12551,TEST,0,0 +12552,TEST,0,0 +12553,TEST,0,0 
+12554,TEST,0,0 +12555,TEST,0,0 +12556,TEST,0,0 +12557,TEST,0,0 +12558,TEST,0,0 +12559,TEST,0,0 +12560,TEST,0,0 +12561,TEST,0,0 +12562,TEST,0,0 +12563,TEST,0,0 +12564,TEST,0,0 +12565,TEST,0,0 +12566,TEST,0,0 +12567,TEST,0,0 +12568,TEST,0,0 +12569,TEST,0,0 +12570,TEST,0,0 +12571,TEST,0,0 +12572,TEST,0,0 +12573,TEST,0,0 +12574,TEST,0,0 +12575,TEST,0,0 +12576,TEST,0,0 +12577,TEST,0,0 +12578,TEST,0,0 +12579,TEST,0,0 +12580,TEST,0,0 +12581,TEST,0,0 +12582,TEST,0,0 +12583,TEST,0,0 +12584,TEST,0,0 +12585,TEST,0,0 +12586,TEST,0,0 +12587,TEST,0,0 +12588,TEST,0,0 +12589,TEST,0,0 +12590,TEST,0,0 +12591,TEST,0,0 +12592,TEST,0,0 +12593,TEST,0,0 +12594,TEST,0,0 +12595,TEST,0,0 +12596,TEST,0,0 +12597,TEST,0,0 +12598,TEST,0,0 +12599,TEST,0,0 +12600,TEST,0,0 +12601,TEST,0,0 +12602,TEST,0,0 +12603,TEST,0,0 +12604,TEST,0,0 +12605,TEST,0,0 +12606,TEST,0,0 +12607,TEST,0,0 +12608,TEST,0,0 +12609,TEST,0,0 +12610,TEST,0,0 +12611,TEST,0,0 +12612,TEST,0,0 +12613,TEST,0,0 +12614,TEST,0,0 +12615,TEST,0,0 +12616,TEST,0,0 +12617,TEST,0,0 +12618,TEST,0,0 +12619,TEST,0,0 +12620,TEST,0,0 +12621,TEST,0,0 +12622,TEST,0,0 +12623,TEST,0,0 +12624,TEST,0,0 +12625,TEST,0,0 +12626,TEST,0,0 +12627,TEST,0,0 +12628,TEST,0,0 +12629,TEST,0,0 +12630,TEST,0,0 +12631,TEST,0,0 +12632,TEST,0,0 +12633,TEST,0,0 +12634,TEST,0,0 +12635,TEST,0,0 +12636,TEST,0,0 +12637,TEST,0,0 +12638,TEST,0,0 +12639,TEST,0,0 +12640,TEST,0,0 +12641,TEST,0,0 +12642,TEST,0,0 +12643,TEST,0,0 +12644,TEST,0,0 +12645,TEST,0,0 +12646,TEST,0,0 +12647,TEST,0,0 +12648,TEST,0,0 +12649,TEST,0,0 +12650,TEST,0,0 +12651,TEST,0,0 +12652,TEST,0,0 +12653,TEST,0,0 +12654,TEST,0,0 +12655,TEST,0,0 +12656,TEST,0,0 +12657,TEST,0,0 +12658,TEST,0,0 +12659,TEST,0,0 +12660,TEST,0,0 +12661,TEST,0,0 +12662,TEST,0,0 +12663,TEST,0,0 +12664,TEST,0,0 +12665,TEST,0,0 +12666,TEST,0,0 +12667,TEST,0,0 +12668,TEST,0,0 +12669,TEST,0,0 +12670,TEST,0,0 +12671,TEST,0,0 +12672,TEST,0,0 +12673,TEST,0,0 +12674,TEST,0,0 +12675,TEST,0,0 +12676,TEST,0,0 +12677,TEST,0,0 +12678,TEST,0,0 +12679,TEST,0,0 +12680,TEST,0,0 +12681,TEST,0,0 +12682,TEST,0,0 +12683,TEST,0,0 +12684,TEST,0,0 +12685,TEST,0,0 +12686,TEST,0,0 +12687,TEST,0,0 +12688,TEST,0,0 +12689,TEST,0,0 +12690,TEST,0,0 +12691,TEST,0,0 +12692,TEST,0,0 +12693,TEST,0,0 +12694,TEST,0,0 +12695,TEST,0,0 +12696,TEST,0,0 +12697,TEST,0,0 +12698,TEST,0,0 +12699,TEST,0,0 +12700,TEST,0,0 +12701,TEST,0,0 +12702,TEST,0,0 +12703,TEST,0,0 +12704,TEST,0,0 +12705,TEST,0,0 +12706,TEST,0,0 +12707,TEST,0,0 +12708,TEST,0,0 +12709,TEST,0,0 +12710,TEST,0,0 +12711,TEST,0,0 +12712,TEST,0,0 +12713,TEST,0,0 +12714,TEST,0,0 +12715,TEST,0,0 +12716,TEST,0,0 +12717,TEST,0,0 +12718,TEST,0,0 +12719,TEST,0,0 +12720,TEST,0,0 +12721,TEST,0,0 +12722,TEST,0,0 +12723,TEST,0,0 +12724,TEST,0,0 +12725,TEST,0,0 +12726,TEST,0,0 +12727,TEST,0,0 +12728,TEST,0,0 +12729,TEST,0,0 +12730,TEST,0,0 +12731,TEST,0,0 +12732,TEST,0,0 +12733,TEST,0,0 +12734,TEST,0,0 +12735,TEST,0,0 +12736,TEST,0,0 +12737,TEST,0,0 +12738,TEST,0,0 +12739,TEST,0,0 +12740,TEST,0,0 +12741,TEST,0,0 +12742,TEST,0,0 +12743,TEST,0,0 +12744,TEST,0,0 +12745,TEST,0,0 +12746,TEST,0,0 +12747,TEST,0,0 +12748,TEST,0,0 +12749,TEST,0,0 +12750,TEST,0,0 +12751,TEST,0,0 +12752,TEST,0,0 +12753,TEST,0,0 +12754,TEST,0,0 +12755,TEST,0,0 +12756,TEST,0,0 +12757,TEST,0,0 +12758,TEST,0,0 +12759,TEST,0,0 +12760,TEST,0,0 +12761,TEST,0,0 +12762,TEST,0,0 +12763,TEST,0,0 +12764,TEST,0,0 +12765,TEST,0,0 +12766,TEST,0,0 +12767,TEST,0,0 +12768,TEST,0,0 +12769,TEST,0,0 +12770,TEST,0,0 +12771,TEST,0,0 +12772,TEST,0,0 +12773,TEST,0,0 +12774,TEST,0,0 +12775,TEST,0,0 
+12776,TEST,0,0 +12777,TEST,0,0 +12778,TEST,0,0 +12779,TEST,0,0 +12780,TEST,0,0 +12781,TEST,0,0 +12782,TEST,0,0 +12783,TEST,0,0 +12784,TEST,0,0 +12785,TEST,0,0 +12786,TEST,0,0 +12787,TEST,0,0 +12788,TEST,0,0 +12789,TEST,0,0 +12790,TEST,0,0 +12791,TEST,0,0 +12792,TEST,0,0 +12793,TEST,0,0 +12794,TEST,0,0 +12795,TEST,0,0 +12796,TEST,0,0 +12797,TEST,0,0 +12798,TEST,0,0 +12799,TEST,0,0 +12800,TEST,0,0 +12801,TEST,0,0 +12802,TEST,0,0 +12803,TEST,0,0 +12804,TEST,0,0 +12805,TEST,0,0 +12806,TEST,0,0 +12807,TEST,0,0 +12808,TEST,0,0 +12809,TEST,0,0 +12810,TEST,0,0 +12811,TEST,0,0 +12812,TEST,0,0 +12813,TEST,0,0 +12814,TEST,0,0 +12815,TEST,0,0 +12816,TEST,0,0 +12817,TEST,0,0 +12818,TEST,0,0 +12819,TEST,0,0 +12820,TEST,0,0 +12821,TEST,0,0 +12822,TEST,0,0 +12823,TEST,0,0 +12824,TEST,0,0 +12825,TEST,0,0 +12826,TEST,0,0 +12827,TEST,0,0 +12828,TEST,0,0 +12829,TEST,0,0 +12830,TEST,0,0 +12831,TEST,0,0 +12832,TEST,0,0 +12833,TEST,0,0 +12834,TEST,0,0 +12835,TEST,0,0 +12836,TEST,0,0 +12837,TEST,0,0 +12838,TEST,0,0 +12839,TEST,0,0 +12840,TEST,0,0 +12841,TEST,0,0 +12842,TEST,0,0 +12843,TEST,0,0 +12844,TEST,0,0 +12845,TEST,0,0 +12846,TEST,0,0 +12847,TEST,0,0 +12848,TEST,0,0 +12849,TEST,0,0 +12850,TEST,0,0 +12851,TEST,0,0 +12852,TEST,0,0 +12853,TEST,0,0 +12854,TEST,0,0 +12855,TEST,0,0 +12856,TEST,0,0 +12857,TEST,0,0 +12858,TEST,0,0 +12859,TEST,0,0 +12860,TEST,0,0 +12861,TEST,0,0 +12862,TEST,0,0 +12863,TEST,0,0 +12864,TEST,0,0 +12865,TEST,0,0 +12866,TEST,0,0 +12867,TEST,0,0 +12868,TEST,0,0 +12869,TEST,0,0 +12870,TEST,0,0 +12871,TEST,0,0 +12872,TEST,0,0 +12873,TEST,0,0 +12874,TEST,0,0 +12875,TEST,0,0 +12876,TEST,0,0 +12877,TEST,0,0 +12878,TEST,0,0 +12879,TEST,0,0 +12880,TEST,0,0 +12881,TEST,0,0 +12882,TEST,0,0 +12883,TEST,0,0 +12884,TEST,0,0 +12885,TEST,0,0 +12886,TEST,0,0 +12887,TEST,0,0 +12888,TEST,0,0 +12889,TEST,0,0 +12890,TEST,0,0 +12891,TEST,0,0 +12892,TEST,0,0 +12893,TEST,0,0 +12894,TEST,0,0 +12895,TEST,0,0 +12896,TEST,0,0 +12897,TEST,0,0 +12898,TEST,0,0 +12899,TEST,0,0 +12900,TEST,0,0 +12901,TEST,0,0 +12902,TEST,0,0 +12903,TEST,0,0 +12904,TEST,0,0 +12905,TEST,0,0 +12906,TEST,0,0 +12907,TEST,0,0 +12908,TEST,0,0 +12909,TEST,0,0 +12910,TEST,0,0 +12911,TEST,0,0 +12912,TEST,0,0 +12913,TEST,0,0 +12914,TEST,0,0 +12915,TEST,0,0 +12916,TEST,0,0 +12917,TEST,0,0 +12918,TEST,0,0 +12919,TEST,0,0 +12920,TEST,0,0 +12921,TEST,0,0 +12922,TEST,0,0 +12923,TEST,0,0 +12924,TEST,0,0 +12925,TEST,0,0 +12926,TEST,0,0 +12927,TEST,0,0 +12928,TEST,0,0 +12929,TEST,0,0 +12930,TEST,0,0 +12931,TEST,0,0 +12932,TEST,0,0 +12933,TEST,0,0 +12934,TEST,0,0 +12935,TEST,0,0 +12936,TEST,0,0 +12937,TEST,0,0 +12938,TEST,0,0 +12939,TEST,0,0 +12940,TEST,0,0 +12941,TEST,0,0 +12942,TEST,0,0 +12943,TEST,0,0 +12944,TEST,0,0 +12945,TEST,0,0 +12946,TEST,0,0 +12947,TEST,0,0 +12948,TEST,0,0 +12949,TEST,0,0 +12950,TEST,0,0 +12951,TEST,0,0 +12952,TEST,0,0 +12953,TEST,0,0 +12954,TEST,0,0 +12955,TEST,0,0 +12956,TEST,0,0 +12957,TEST,0,0 +12958,TEST,0,0 +12959,TEST,0,0 +12960,TEST,0,0 +12961,TEST,0,0 +12962,TEST,0,0 +12963,TEST,0,0 +12964,TEST,0,0 +12965,TEST,0,0 +12966,TEST,0,0 +12967,TEST,0,0 +12968,TEST,0,0 +12969,TEST,0,0 +12970,TEST,0,0 +12971,TEST,0,0 +12972,TEST,0,0 +12973,TEST,0,0 +12974,TEST,0,0 +12975,TEST,0,0 +12976,TEST,0,0 +12977,TEST,0,0 +12978,TEST,0,0 +12979,TEST,0,0 +12980,TEST,0,0 +12981,TEST,0,0 +12982,TEST,0,0 +12983,TEST,0,0 +12984,TEST,0,0 +12985,TEST,0,0 +12986,TEST,0,0 +12987,TEST,0,0 +12988,TEST,0,0 +12989,TEST,0,0 +12990,TEST,0,0 +12991,TEST,0,0 +12992,TEST,0,0 +12993,TEST,0,0 +12994,TEST,0,0 +12995,TEST,0,0 +12996,TEST,0,0 +12997,TEST,0,0 
+12998,TEST,0,0 +12999,TEST,0,0 +13000,TEST,0,0 +13001,TEST,0,0 +13002,TEST,0,0 +13003,TEST,0,0 +13004,TEST,0,0 +13005,TEST,0,0 +13006,TEST,0,0 +13007,TEST,0,0 +13008,TEST,0,0 +13009,TEST,0,0 +13010,TEST,0,0 +13011,TEST,0,0 +13012,TEST,0,0 +13013,TEST,0,0 +13014,TEST,0,0 +13015,TEST,0,0 +13016,TEST,0,0 +13017,TEST,0,0 +13018,TEST,0,0 +13019,TEST,0,0 +13020,TEST,0,0 +13021,TEST,0,0 +13022,TEST,0,0 +13023,TEST,0,0 +13024,TEST,0,0 +13025,TEST,0,0 +13026,TEST,0,0 +13027,TEST,0,0 +13028,TEST,0,0 +13029,TEST,0,0 +13030,TEST,0,0 +13031,TEST,0,0 +13032,TEST,0,0 +13033,TEST,0,0 +13034,TEST,0,0 +13035,TEST,0,0 +13036,TEST,0,0 +13037,TEST,0,0 +13038,TEST,0,0 +13039,TEST,0,0 +13040,TEST,0,0 +13041,TEST,0,0 +13042,TEST,0,0 +13043,TEST,0,0 +13044,TEST,0,0 +13045,TEST,0,0 +13046,TEST,0,0 +13047,TEST,0,0 +13048,TEST,0,0 +13049,TEST,0,0 +13050,TEST,0,0 +13051,TEST,0,0 +13052,TEST,0,0 +13053,TEST,0,0 +13054,TEST,0,0 +13055,TEST,0,0 +13056,TEST,0,0 +13057,TEST,0,0 +13058,TEST,0,0 +13059,TEST,0,0 +13060,TEST,0,0 +13061,TEST,0,0 +13062,TEST,0,0 +13063,TEST,0,0 +13064,TEST,0,0 +13065,TEST,0,0 +13066,TEST,0,0 +13067,TEST,0,0 +13068,TEST,0,0 +13069,TEST,0,0 +13070,TEST,0,0 +13071,TEST,0,0 +13072,TEST,0,0 +13073,TEST,0,0 +13074,TEST,0,0 +13075,TEST,0,0 +13076,TEST,0,0 +13077,TEST,0,0 +13078,TEST,0,0 +13079,TEST,0,0 +13080,TEST,0,0 +13081,TEST,0,0 +13082,TEST,0,0 +13083,TEST,0,0 +13084,TEST,0,0 +13085,TEST,0,0 +13086,TEST,0,0 +13087,TEST,0,0 +13088,TEST,0,0 +13089,TEST,0,0 +13090,TEST,0,0 +13091,TEST,0,0 +13092,TEST,0,0 +13093,TEST,0,0 +13094,TEST,0,0 +13095,TEST,0,0 +13096,TEST,0,0 +13097,TEST,0,0 +13098,TEST,0,0 +13099,TEST,0,0 +13100,TEST,0,0 +13101,TEST,0,0 +13102,TEST,0,0 +13103,TEST,0,0 +13104,TEST,0,0 +13105,TEST,0,0 +13106,TEST,0,0 +13107,TEST,0,0 +13108,TEST,0,0 +13109,TEST,0,0 +13110,TEST,0,0 +13111,TEST,0,0 +13112,TEST,0,0 +13113,TEST,0,0 +13114,TEST,0,0 +13115,TEST,0,0 +13116,TEST,0,0 +13117,TEST,0,0 +13118,TEST,0,0 +13119,TEST,0,0 +13120,TEST,0,0 +13121,TEST,0,0 +13122,TEST,0,0 +13123,TEST,0,0 +13124,TEST,0,0 +13125,TEST,0,0 +13126,TEST,0,0 +13127,TEST,0,0 +13128,TEST,0,0 +13129,TEST,0,0 +13130,TEST,0,0 +13131,TEST,0,0 +13132,TEST,0,0 +13133,TEST,0,0 +13134,TEST,0,0 +13135,TEST,0,0 +13136,TEST,0,0 +13137,TEST,0,0 +13138,TEST,0,0 +13139,TEST,0,0 +13140,TEST,0,0 +13141,TEST,0,0 +13142,TEST,0,0 +13143,TEST,0,0 +13144,TEST,0,0 +13145,TEST,0,0 +13146,TEST,0,0 +13147,TEST,0,0 +13148,TEST,0,0 +13149,TEST,0,0 +13150,TEST,0,0 +13151,TEST,0,0 +13152,TEST,0,0 +13153,TEST,0,0 +13154,TEST,0,0 +13155,TEST,0,0 +13156,TEST,0,0 +13157,TEST,0,0 +13158,TEST,0,0 +13159,TEST,0,0 +13160,TEST,0,0 +13161,TEST,0,0 +13162,TEST,0,0 +13163,TEST,0,0 +13164,TEST,0,0 +13165,TEST,0,0 +13166,TEST,0,0 +13167,TEST,0,0 +13168,TEST,0,0 +13169,TEST,0,0 +13170,TEST,0,0 +13171,TEST,0,0 +13172,TEST,0,0 +13173,TEST,0,0 +13174,TEST,0,0 +13175,TEST,0,0 +13176,TEST,0,0 +13177,TEST,0,0 +13178,TEST,0,0 +13179,TEST,0,0 +13180,TEST,0,0 +13181,TEST,0,0 +13182,TEST,0,0 +13183,TEST,0,0 +13184,TEST,0,0 +13185,TEST,0,0 +13186,TEST,0,0 +13187,TEST,0,0 +13188,TEST,0,0 +13189,TEST,0,0 +13190,TEST,0,0 +13191,TEST,0,0 +13192,TEST,0,0 +13193,TEST,0,0 +13194,TEST,0,0 +13195,TEST,0,0 +13196,TEST,0,0 +13197,TEST,0,0 +13198,TEST,0,0 +13199,TEST,0,0 +13200,TEST,0,0 +13201,TEST,0,0 +13202,TEST,0,0 +13203,TEST,0,0 +13204,TEST,0,0 +13205,TEST,0,0 +13206,TEST,0,0 +13207,TEST,0,0 +13208,TEST,0,0 +13209,TEST,0,0 +13210,TEST,0,0 +13211,TEST,0,0 +13212,TEST,0,0 +13213,TEST,0,0 +13214,TEST,0,0 +13215,TEST,0,0 +13216,TEST,0,0 +13217,TEST,0,0 +13218,TEST,0,0 +13219,TEST,0,0 
+13220,TEST,0,0 +13221,TEST,0,0 +13222,TEST,0,0 +13223,TEST,0,0 +13224,TEST,0,0 +13225,TEST,0,0 +13226,TEST,0,0 +13227,TEST,0,0 +13228,TEST,0,0 +13229,TEST,0,0 +13230,TEST,0,0 +13231,TEST,0,0 +13232,TEST,0,0 +13233,TEST,0,0 +13234,TEST,0,0 +13235,TEST,0,0 +13236,TEST,0,0 +13237,TEST,0,0 +13238,TEST,0,0 +13239,TEST,0,0 +13240,TEST,0,0 +13241,TEST,0,0 +13242,TEST,0,0 +13243,TEST,0,0 +13244,TEST,0,0 +13245,TEST,0,0 +13246,TEST,0,0 +13247,TEST,0,0 +13248,TEST,0,0 +13249,TEST,0,0 +13250,TEST,0,0 +13251,TEST,0,0 +13252,TEST,0,0 +13253,TEST,0,0 +13254,TEST,0,0 +13255,TEST,0,0 +13256,TEST,0,0 +13257,TEST,0,0 +13258,TEST,0,0 +13259,TEST,0,0 +13260,TEST,0,0 +13261,TEST,0,0 +13262,TEST,0,0 +13263,TEST,0,0 +13264,TEST,0,0 +13265,TEST,0,0 +13266,TEST,0,0 +13267,TEST,0,0 +13268,TEST,0,0 +13269,TEST,0,0 +13270,TEST,0,0 +13271,TEST,0,0 +13272,TEST,0,0 +13273,TEST,0,0 +13274,TEST,0,0 +13275,TEST,0,0 +13276,TEST,0,0 +13277,TEST,0,0 +13278,TEST,0,0 +13279,TEST,0,0 +13280,TEST,0,0 +13281,TEST,0,0 +13282,TEST,0,0 +13283,TEST,0,0 +13284,TEST,0,0 +13285,TEST,0,0 +13286,TEST,0,0 +13287,TEST,0,0 +13288,TEST,0,0 +13289,TEST,0,0 +13290,TEST,0,0 +13291,TEST,0,0 +13292,TEST,0,0 +13293,TEST,0,0 +13294,TEST,0,0 +13295,TEST,0,0 +13296,TEST,0,0 +13297,TEST,0,0 +13298,TEST,0,0 +13299,TEST,0,0 +13300,TEST,0,0 +13301,TEST,0,0 +13302,TEST,0,0 +13303,TEST,0,0 +13304,TEST,0,0 +13305,TEST,0,0 +13306,TEST,0,0 +13307,TEST,0,0 +13308,TEST,0,0 +13309,TEST,0,0 +13310,TEST,0,0 +13311,TEST,0,0 +13312,TEST,0,0 +13313,TEST,0,0 +13314,TEST,0,0 +13315,TEST,0,0 +13316,TEST,0,0 +13317,TEST,0,0 +13318,TEST,0,0 +13319,TEST,0,0 +13320,TEST,0,0 +13321,TEST,0,0 +13322,TEST,0,0 +13323,TEST,0,0 +13324,TEST,0,0 +13325,TEST,0,0 +13326,TEST,0,0 +13327,TEST,0,0 +13328,TEST,0,0 +13329,TEST,0,0 +13330,TEST,0,0 +13331,TEST,0,0 +13332,TEST,0,0 +13333,TEST,0,0 +13334,TEST,0,0 +13335,TEST,0,0 +13336,TEST,0,0 +13337,TEST,0,0 +13338,TEST,0,0 +13339,TEST,0,0 +13340,TEST,0,0 +13341,TEST,0,0 +13342,TEST,0,0 +13343,TEST,0,0 +13344,TEST,0,0 +13345,TEST,0,0 +13346,TEST,0,0 +13347,TEST,0,0 +13348,TEST,0,0 +13349,TEST,0,0 +13350,TEST,0,0 +13351,TEST,0,0 +13352,TEST,0,0 +13353,TEST,0,0 +13354,TEST,0,0 +13355,TEST,0,0 +13356,TEST,0,0 +13357,TEST,0,0 +13358,TEST,0,0 +13359,TEST,0,0 +13360,TEST,0,0 +13361,TEST,0,0 +13362,TEST,0,0 +13363,TEST,0,0 +13364,TEST,0,0 +13365,TEST,0,0 +13366,TEST,0,0 +13367,TEST,0,0 +13368,TEST,0,0 +13369,TEST,0,0 +13370,TEST,0,0 +13371,TEST,0,0 +13372,TEST,0,0 +13373,TEST,0,0 +13374,TEST,0,0 +13375,TEST,0,0 +13376,TEST,0,0 +13377,TEST,0,0 +13378,TEST,0,0 +13379,TEST,0,0 +13380,TEST,0,0 +13381,TEST,0,0 +13382,TEST,0,0 +13383,TEST,0,0 +13384,TEST,0,0 +13385,TEST,0,0 +13386,TEST,0,0 +13387,TEST,0,0 +13388,TEST,0,0 +13389,TEST,0,0 +13390,TEST,0,0 +13391,TEST,0,0 +13392,TEST,0,0 +13393,TEST,0,0 +13394,TEST,0,0 +13395,TEST,0,0 +13396,TEST,0,0 +13397,TEST,0,0 +13398,TEST,0,0 +13399,TEST,0,0 +13400,TEST,0,0 +13401,TEST,0,0 +13402,TEST,0,0 +13403,TEST,0,0 +13404,TEST,0,0 +13405,TEST,0,0 +13406,TEST,0,0 +13407,TEST,0,0 +13408,TEST,0,0 +13409,TEST,0,0 +13410,TEST,0,0 +13411,TEST,0,0 +13412,TEST,0,0 +13413,TEST,0,0 +13414,TEST,0,0 +13415,TEST,0,0 +13416,TEST,0,0 +13417,TEST,0,0 +13418,TEST,0,0 +13419,TEST,0,0 +13420,TEST,0,0 +13421,TEST,0,0 +13422,TEST,0,0 +13423,TEST,0,0 +13424,TEST,0,0 +13425,TEST,0,0 +13426,TEST,0,0 +13427,TEST,0,0 +13428,TEST,0,0 +13429,TEST,0,0 +13430,TEST,0,0 +13431,TEST,0,0 +13432,TEST,0,0 +13433,TEST,0,0 +13434,TEST,0,0 +13435,TEST,0,0 +13436,TEST,0,0 +13437,TEST,0,0 +13438,TEST,0,0 +13439,TEST,0,0 +13440,TEST,0,0 +13441,TEST,0,0 
+13442,TEST,0,0 +13443,TEST,0,0 +13444,TEST,0,0 +13445,TEST,0,0 +13446,TEST,0,0 +13447,TEST,0,0 +13448,TEST,0,0 +13449,TEST,0,0 +13450,TEST,0,0 +13451,TEST,0,0 +13452,TEST,0,0 +13453,TEST,0,0 +13454,TEST,0,0 +13455,TEST,0,0 +13456,TEST,0,0 +13457,TEST,0,0 +13458,TEST,0,0 +13459,TEST,0,0 +13460,TEST,0,0 +13461,TEST,0,0 +13462,TEST,0,0 +13463,TEST,0,0 +13464,TEST,0,0 +13465,TEST,0,0 +13466,TEST,0,0 +13467,TEST,0,0 +13468,TEST,0,0 +13469,TEST,0,0 +13470,TEST,0,0 +13471,TEST,0,0 +13472,TEST,0,0 +13473,TEST,0,0 +13474,TEST,0,0 +13475,TEST,0,0 +13476,TEST,0,0 +13477,TEST,0,0 +13478,TEST,0,0 +13479,TEST,0,0 +13480,TEST,0,0 +13481,TEST,0,0 +13482,TEST,0,0 +13483,TEST,0,0 +13484,TEST,0,0 +13485,TEST,0,0 +13486,TEST,0,0 +13487,TEST,0,0 +13488,TEST,0,0 +13489,TEST,0,0 +13490,TEST,0,0 +13491,TEST,0,0 +13492,TEST,0,0 +13493,TEST,0,0 +13494,TEST,0,0 +13495,TEST,0,0 +13496,TEST,0,0 +13497,TEST,0,0 +13498,TEST,0,0 +13499,TEST,0,0 +13500,TEST,0,0 +13501,TEST,0,0 +13502,TEST,0,0 +13503,TEST,0,0 +13504,TEST,0,0 +13505,TEST,0,0 +13506,TEST,0,0 +13507,TEST,0,0 +13508,TEST,0,0 +13509,TEST,0,0 +13510,TEST,0,0 +13511,TEST,0,0 +13512,TEST,0,0 +13513,TEST,0,0 +13514,TEST,0,0 +13515,TEST,0,0 +13516,TEST,0,0 +13517,TEST,0,0 +13518,TEST,0,0 +13519,TEST,0,0 +13520,TEST,0,0 +13521,TEST,0,0 +13522,TEST,0,0 +13523,TEST,0,0 +13524,TEST,0,0 +13525,TEST,0,0 +13526,TEST,0,0 +13527,TEST,0,0 +13528,TEST,0,0 +13529,TEST,0,0 +13530,TEST,0,0 +13531,TEST,0,0 +13532,TEST,0,0 +13533,TEST,0,0 +13534,TEST,0,0 +13535,TEST,0,0 +13536,TEST,0,0 +13537,TEST,0,0 +13538,TEST,0,0 +13539,TEST,0,0 +13540,TEST,0,0 +13541,TEST,0,0 +13542,TEST,0,0 +13543,TEST,0,0 +13544,TEST,0,0 +13545,TEST,0,0 +13546,TEST,0,0 +13547,TEST,0,0 +13548,TEST,0,0 +13549,TEST,0,0 +13550,TEST,0,0 +13551,TEST,0,0 +13552,TEST,0,0 +13553,TEST,0,0 +13554,TEST,0,0 +13555,TEST,0,0 +13556,TEST,0,0 +13557,TEST,0,0 +13558,TEST,0,0 +13559,TEST,0,0 +13560,TEST,0,0 +13561,TEST,0,0 +13562,TEST,0,0 +13563,TEST,0,0 +13564,TEST,0,0 +13565,TEST,0,0 +13566,TEST,0,0 +13567,TEST,0,0 +13568,TEST,0,0 +13569,TEST,0,0 +13570,TEST,0,0 +13571,TEST,0,0 +13572,TEST,0,0 +13573,TEST,0,0 +13574,TEST,0,0 +13575,TEST,0,0 +13576,TEST,0,0 +13577,TEST,0,0 +13578,TEST,0,0 +13579,TEST,0,0 +13580,TEST,0,0 +13581,TEST,0,0 +13582,TEST,0,0 +13583,TEST,0,0 +13584,TEST,0,0 +13585,TEST,0,0 +13586,TEST,0,0 +13587,TEST,0,0 +13588,TEST,0,0 +13589,TEST,0,0 +13590,TEST,0,0 +13591,TEST,0,0 +13592,TEST,0,0 +13593,TEST,0,0 +13594,TEST,0,0 +13595,TEST,0,0 +13596,TEST,0,0 +13597,TEST,0,0 +13598,TEST,0,0 +13599,TEST,0,0 +13600,TEST,0,0 +13601,TEST,0,0 +13602,TEST,0,0 +13603,TEST,0,0 +13604,TEST,0,0 +13605,TEST,0,0 +13606,TEST,0,0 +13607,TEST,0,0 +13608,TEST,0,0 +13609,TEST,0,0 +13610,TEST,0,0 +13611,TEST,0,0 +13612,TEST,0,0 +13613,TEST,0,0 +13614,TEST,0,0 +13615,TEST,0,0 +13616,TEST,0,0 +13617,TEST,0,0 +13618,TEST,0,0 +13619,TEST,0,0 +13620,TEST,0,0 +13621,TEST,0,0 +13622,TEST,0,0 +13623,TEST,0,0 +13624,TEST,0,0 +13625,TEST,0,0 +13626,TEST,0,0 +13627,TEST,0,0 +13628,TEST,0,0 +13629,TEST,0,0 +13630,TEST,0,0 +13631,TEST,0,0 +13632,TEST,0,0 +13633,TEST,0,0 +13634,TEST,0,0 +13635,TEST,0,0 +13636,TEST,0,0 +13637,TEST,0,0 +13638,TEST,0,0 +13639,TEST,0,0 +13640,TEST,0,0 +13641,TEST,0,0 +13642,TEST,0,0 +13643,TEST,0,0 +13644,TEST,0,0 +13645,TEST,0,0 +13646,TEST,0,0 +13647,TEST,0,0 +13648,TEST,0,0 +13649,TEST,0,0 +13650,TEST,0,0 +13651,TEST,0,0 +13652,TEST,0,0 +13653,TEST,0,0 +13654,TEST,0,0 +13655,TEST,0,0 +13656,TEST,0,0 +13657,TEST,0,0 +13658,TEST,0,0 +13659,TEST,0,0 +13660,TEST,0,0 +13661,TEST,0,0 +13662,TEST,0,0 +13663,TEST,0,0 
+13664,TEST,0,0 +13665,TEST,0,0 +13666,TEST,0,0 +13667,TEST,0,0 +13668,TEST,0,0 +13669,TEST,0,0 +13670,TEST,0,0 +13671,TEST,0,0 +13672,TEST,0,0 +13673,TEST,0,0 +13674,TEST,0,0 +13675,TEST,0,0 +13676,TEST,0,0 +13677,TEST,0,0 +13678,TEST,0,0 +13679,TEST,0,0 +13680,TEST,0,0 +13681,TEST,0,0 +13682,TEST,0,0 +13683,TEST,0,0 +13684,TEST,0,0 +13685,TEST,0,0 +13686,TEST,0,0 +13687,TEST,0,0 +13688,TEST,0,0 +13689,TEST,0,0 +13690,TEST,0,0 +13691,TEST,0,0 +13692,TEST,0,0 +13693,TEST,0,0 +13694,TEST,0,0 +13695,TEST,0,0 +13696,TEST,0,0 +13697,TEST,0,0 +13698,TEST,0,0 +13699,TEST,0,0 +13700,TEST,0,0 +13701,TEST,0,0 +13702,TEST,0,0 +13703,TEST,0,0 +13704,TEST,0,0 +13705,TEST,0,0 +13706,TEST,0,0 +13707,TEST,0,0 +13708,TEST,0,0 +13709,TEST,0,0 +13710,TEST,0,0 +13711,TEST,0,0 +13712,TEST,0,0 +13713,TEST,0,0 +13714,TEST,0,0 +13715,TEST,0,0 +13716,TEST,0,0 +13717,TEST,0,0 +13718,TEST,0,0 +13719,TEST,0,0 +13720,TEST,0,0 +13721,TEST,0,0 +13722,TEST,0,0 +13723,TEST,0,0 +13724,TEST,0,0 +13725,TEST,0,0 +13726,TEST,0,0 +13727,TEST,0,0 +13728,TEST,0,0 +13729,TEST,0,0 +13730,TEST,0,0 +13731,TEST,0,0 +13732,TEST,0,0 +13733,TEST,0,0 +13734,TEST,0,0 +13735,TEST,0,0 +13736,TEST,0,0 +13737,TEST,0,0 +13738,TEST,0,0 +13739,TEST,0,0 +13740,TEST,0,0 +13741,TEST,0,0 +13742,TEST,0,0 +13743,TEST,0,0 +13744,TEST,0,0 +13745,TEST,0,0 +13746,TEST,0,0 +13747,TEST,0,0 +13748,TEST,0,0 +13749,TEST,0,0 +13750,TEST,0,0 +13751,TEST,0,0 +13752,TEST,0,0 +13753,TEST,0,0 +13754,TEST,0,0 +13755,TEST,0,0 +13756,TEST,0,0 +13757,TEST,0,0 +13758,TEST,0,0 +13759,TEST,0,0 +13760,TEST,0,0 +13761,TEST,0,0 +13762,TEST,0,0 +13763,TEST,0,0 +13764,TEST,0,0 +13765,TEST,0,0 +13766,TEST,0,0 +13767,TEST,0,0 +13768,TEST,0,0 +13769,TEST,0,0 +13770,TEST,0,0 +13771,TEST,0,0 +13772,TEST,0,0 +13773,TEST,0,0 +13774,TEST,0,0 +13775,TEST,0,0 +13776,TEST,0,0 +13777,TEST,0,0 +13778,TEST,0,0 +13779,TEST,0,0 +13780,TEST,0,0 +13781,TEST,0,0 +13782,TEST,0,0 +13783,TEST,0,0 +13784,TEST,0,0 +13785,TEST,0,0 +13786,TEST,0,0 +13787,TEST,0,0 +13788,TEST,0,0 +13789,TEST,0,0 +13790,TEST,0,0 +13791,TEST,0,0 +13792,TEST,0,0 +13793,TEST,0,0 +13794,TEST,0,0 +13795,TEST,0,0 +13796,TEST,0,0 +13797,TEST,0,0 +13798,TEST,0,0 +13799,TEST,0,0 +13800,TEST,0,0 +13801,TEST,0,0 +13802,TEST,0,0 +13803,TEST,0,0 +13804,TEST,0,0 +13805,TEST,0,0 +13806,TEST,0,0 +13807,TEST,0,0 +13808,TEST,0,0 +13809,TEST,0,0 +13810,TEST,0,0 +13811,TEST,0,0 +13812,TEST,0,0 +13813,TEST,0,0 +13814,TEST,0,0 +13815,TEST,0,0 +13816,TEST,0,0 +13817,TEST,0,0 +13818,TEST,0,0 +13819,TEST,0,0 +13820,TEST,0,0 +13821,TEST,0,0 +13822,TEST,0,0 +13823,TEST,0,0 +13824,TEST,0,0 +13825,TEST,0,0 +13826,TEST,0,0 +13827,TEST,0,0 +13828,TEST,0,0 +13829,TEST,0,0 +13830,TEST,0,0 +13831,TEST,0,0 +13832,TEST,0,0 +13833,TEST,0,0 +13834,TEST,0,0 +13835,TEST,0,0 +13836,TEST,0,0 +13837,TEST,0,0 +13838,TEST,0,0 +13839,TEST,0,0 +13840,TEST,0,0 +13841,TEST,0,0 +13842,TEST,0,0 +13843,TEST,0,0 +13844,TEST,0,0 +13845,TEST,0,0 +13846,TEST,0,0 +13847,TEST,0,0 +13848,TEST,0,0 +13849,TEST,0,0 +13850,TEST,0,0 +13851,TEST,0,0 +13852,TEST,0,0 +13853,TEST,0,0 +13854,TEST,0,0 +13855,TEST,0,0 +13856,TEST,0,0 +13857,TEST,0,0 +13858,TEST,0,0 +13859,TEST,0,0 +13860,TEST,0,0 +13861,TEST,0,0 +13862,TEST,0,0 +13863,TEST,0,0 +13864,TEST,0,0 +13865,TEST,0,0 +13866,TEST,0,0 +13867,TEST,0,0 +13868,TEST,0,0 +13869,TEST,0,0 +13870,TEST,0,0 +13871,TEST,0,0 +13872,TEST,0,0 +13873,TEST,0,0 +13874,TEST,0,0 +13875,TEST,0,0 +13876,TEST,0,0 +13877,TEST,0,0 +13878,TEST,0,0 +13879,TEST,0,0 +13880,TEST,0,0 +13881,TEST,0,0 +13882,TEST,0,0 +13883,TEST,0,0 +13884,TEST,0,0 +13885,TEST,0,0 
+13886,TEST,0,0 +13887,TEST,0,0 +13888,TEST,0,0 +13889,TEST,0,0 +13890,TEST,0,0 +13891,TEST,0,0 +13892,TEST,0,0 +13893,TEST,0,0 +13894,TEST,0,0 +13895,TEST,0,0 +13896,TEST,0,0 +13897,TEST,0,0 +13898,TEST,0,0 +13899,TEST,0,0 +13900,TEST,0,0 +13901,TEST,0,0 +13902,TEST,0,0 +13903,TEST,0,0 +13904,TEST,0,0 +13905,TEST,0,0 +13906,TEST,0,0 +13907,TEST,0,0 +13908,TEST,0,0 +13909,TEST,0,0 +13910,TEST,0,0 +13911,TEST,0,0 +13912,TEST,0,0 +13913,TEST,0,0 +13914,TEST,0,0 +13915,TEST,0,0 +13916,TEST,0,0 +13917,TEST,0,0 +13918,TEST,0,0 +13919,TEST,0,0 +13920,TEST,0,0 +13921,TEST,0,0 +13922,TEST,0,0 +13923,TEST,0,0 +13924,TEST,0,0 +13925,TEST,0,0 +13926,TEST,0,0 +13927,TEST,0,0 +13928,TEST,0,0 +13929,TEST,0,0 +13930,TEST,0,0 +13931,TEST,0,0 +13932,TEST,0,0 +13933,TEST,0,0 +13934,TEST,0,0 +13935,TEST,0,0 +13936,TEST,0,0 +13937,TEST,0,0 +13938,TEST,0,0 +13939,TEST,0,0 +13940,TEST,0,0 +13941,TEST,0,0 +13942,TEST,0,0 +13943,TEST,0,0 +13944,TEST,0,0 +13945,TEST,0,0 +13946,TEST,0,0 +13947,TEST,0,0 +13948,TEST,0,0 +13949,TEST,0,0 +13950,TEST,0,0 +13951,TEST,0,0 +13952,TEST,0,0 +13953,TEST,0,0 +13954,TEST,0,0 +13955,TEST,0,0 +13956,TEST,0,0 +13957,TEST,0,0 +13958,TEST,0,0 +13959,TEST,0,0 +13960,TEST,0,0 +13961,TEST,0,0 +13962,TEST,0,0 +13963,TEST,0,0 +13964,TEST,0,0 +13965,TEST,0,0 +13966,TEST,0,0 +13967,TEST,0,0 +13968,TEST,0,0 +13969,TEST,0,0 +13970,TEST,0,0 +13971,TEST,0,0 +13972,TEST,0,0 +13973,TEST,0,0 +13974,TEST,0,0 +13975,TEST,0,0 +13976,TEST,0,0 +13977,TEST,0,0 +13978,TEST,0,0 +13979,TEST,0,0 +13980,TEST,0,0 +13981,TEST,0,0 +13982,TEST,0,0 +13983,TEST,0,0 +13984,TEST,0,0 +13985,TEST,0,0 +13986,TEST,0,0 +13987,TEST,0,0 +13988,TEST,0,0 +13989,TEST,0,0 +13990,TEST,0,0 +13991,TEST,0,0 +13992,TEST,0,0 +13993,TEST,0,0 +13994,TEST,0,0 +13995,TEST,0,0 +13996,TEST,0,0 +13997,TEST,0,0 +13998,TEST,0,0 +13999,TEST,0,0 +14000,TEST,0,0 +14001,TEST,0,0 +14002,TEST,0,0 +14003,TEST,0,0 +14004,TEST,0,0 +14005,TEST,0,0 +14006,TEST,0,0 +14007,TEST,0,0 +14008,TEST,0,0 +14009,TEST,0,0 +14010,TEST,0,0 +14011,TEST,0,0 +14012,TEST,0,0 +14013,TEST,0,0 +14014,TEST,0,0 +14015,TEST,0,0 +14016,TEST,0,0 +14017,TEST,0,0 +14018,TEST,0,0 +14019,TEST,0,0 +14020,TEST,0,0 +14021,TEST,0,0 +14022,TEST,0,0 +14023,TEST,0,0 +14024,TEST,0,0 +14025,TEST,0,0 +14026,TEST,0,0 +14027,TEST,0,0 +14028,TEST,0,0 +14029,TEST,0,0 +14030,TEST,0,0 +14031,TEST,0,0 +14032,TEST,0,0 +14033,TEST,0,0 +14034,TEST,0,0 +14035,TEST,0,0 +14036,TEST,0,0 +14037,TEST,0,0 +14038,TEST,0,0 +14039,TEST,0,0 +14040,TEST,0,0 +14041,TEST,0,0 +14042,TEST,0,0 +14043,TEST,0,0 +14044,TEST,0,0 +14045,TEST,0,0 +14046,TEST,0,0 +14047,TEST,0,0 +14048,TEST,0,0 +14049,TEST,0,0 +14050,TEST,0,0 +14051,TEST,0,0 +14052,TEST,0,0 +14053,TEST,0,0 diff --git a/datasets/anomaly/kpi/TEST/problem_TEST/problemDoc.json b/datasets/anomaly/kpi/TEST/problem_TEST/problemDoc.json new file mode 100644 index 0000000..1fd55ad --- /dev/null +++ b/datasets/anomaly/kpi/TEST/problem_TEST/problemDoc.json @@ -0,0 +1,65 @@ +{ + "about": { + "problemID": "kpi_problem", + "problemName": "kpi_problem", + "problemDescription": "Anomaly detection", + "problemVersion": "4.0.0", + "problemSchemaVersion": "4.0.0", + "taskKeywords": [ + "classification", + "binary", + "tabular" + ] + }, + "inputs": { + "data": [ + { + "datasetID": "kpi_dataset", + "targets": [ + { + "targetIndex": 0, + "resID": "learningData", + "colIndex": 3, + "colName": "ground_truth" + } + ] + } + ], + "dataSplits": { + "method": "holdOut", + "testSize": 0.2, + "stratified": true, + "numRepeats": 0, + "randomSeed": 42, + "splitsFile": "dataSplits.csv", + 
"datasetViewMaps": { + "train": [ + { + "from": "kpi_dataset", + "to": "kpi_dataset_TRAIN" + } + ], + "test": [ + { + "from": "kpi_dataset", + "to": "kpi_dataset_TEST" + } + ], + "score": [ + { + "from": "kpi_dataset", + "to": "kpi_dataset_SCORE" + } + ] + } + }, + "performanceMetrics": [ + { + "metric": "f1Macro" + } + ] + }, + "expectedOutputs": { + "predictionsFile": "predictions.csv" + } +} \ No newline at end of file diff --git a/datasets/anomaly/kpi/TRAIN/dataset_TRAIN/datasetDoc.json b/datasets/anomaly/kpi/TRAIN/dataset_TRAIN/datasetDoc.json new file mode 100644 index 0000000..a196e7d --- /dev/null +++ b/datasets/anomaly/kpi/TRAIN/dataset_TRAIN/datasetDoc.json @@ -0,0 +1,63 @@ +{ + "about": { + "datasetID": "kpi_dataset_TRAIN", + "datasetName": "NULL", + "description": "Database of baseball players and play statistics, including 'Games_played', 'At_bats', 'Runs', 'Hits', 'Doubles', 'Triples', 'Home_runs', 'RBIs', 'Walks', 'Strikeouts', 'Batting_average', 'On_base_pct', 'Slugging_pct' and 'Fielding_ave'", + "citation": " @book{simonoff2003analyzing,title={Analyzing Categorical Data},author={Simonoff, J.S.},isbn={9780387007496},lccn={2003044946},series={Springer Texts in Statistics},url={https://books.google.com/books?id=G8wrifweAoC},year={2003},publisher={Springer New York}} ", + "license": " CC Public Domain Mark 1.0 ", + "source": "OpenML", + "sourceURI": "http://www.openml.org/d/185", + "approximateSize": "", + "datasetSchemaVersion": "4.0.0", + "redacted": false, + "datasetVersion": "4.0.0" + }, + "dataResources": [ + { + "resID": "learningData", + "resPath": "tables/learningData.csv", + "resType": "table", + "resFormat": { + "text/csv": [ + "csv" + ] + }, + "isCollection": false, + "columns": [ + { + "colIndex": 0, + "colName": "d3mIndex", + "colType": "integer", + "role": [ + "index" + ] + }, + { + "colIndex": 1, + "colName": "timestamp", + "colType": "integer", + "role": [ + "attribute" + ] + }, + { + "colIndex": 2, + "colName": "value", + "colType": "real", + "role": [ + "attribute" + ] + }, + { + "colIndex": 3, + "colName": "ground_truth", + "colType": "integer", + "role": [ + "suggestedTarget" + ] + } + ], + "columnsCount": 4 + } + ] +} \ No newline at end of file diff --git a/datasets/anomaly/kpi/TRAIN/dataset_TRAIN/tables/learningData.csv.REMOVED.git-id b/datasets/anomaly/kpi/TRAIN/dataset_TRAIN/tables/learningData.csv.REMOVED.git-id new file mode 100644 index 0000000..7376921 --- /dev/null +++ b/datasets/anomaly/kpi/TRAIN/dataset_TRAIN/tables/learningData.csv.REMOVED.git-id @@ -0,0 +1 @@ +44db328c252a8156434142a37ef65765869e7548 \ No newline at end of file diff --git a/datasets/anomaly/kpi/TRAIN/problem_TRAIN/dataSplits.csv b/datasets/anomaly/kpi/TRAIN/problem_TRAIN/dataSplits.csv new file mode 100644 index 0000000..41a5012 --- /dev/null +++ b/datasets/anomaly/kpi/TRAIN/problem_TRAIN/dataSplits.csv @@ -0,0 +1,7028 @@ +d3mIndex,type,repeat,fold +0,TRAIN,0,0 +1,TRAIN,0,0 +2,TRAIN,0,0 +3,TRAIN,0,0 +4,TRAIN,0,0 +5,TRAIN,0,0 +6,TRAIN,0,0 +7,TRAIN,0,0 +8,TRAIN,0,0 +9,TRAIN,0,0 +10,TRAIN,0,0 +11,TRAIN,0,0 +12,TRAIN,0,0 +13,TRAIN,0,0 +14,TRAIN,0,0 +15,TRAIN,0,0 +16,TRAIN,0,0 +17,TRAIN,0,0 +18,TRAIN,0,0 +19,TRAIN,0,0 +20,TRAIN,0,0 +21,TRAIN,0,0 +22,TRAIN,0,0 +23,TRAIN,0,0 +24,TRAIN,0,0 +25,TRAIN,0,0 +26,TRAIN,0,0 +27,TRAIN,0,0 +28,TRAIN,0,0 +29,TRAIN,0,0 +30,TRAIN,0,0 +31,TRAIN,0,0 +32,TRAIN,0,0 +33,TRAIN,0,0 +34,TRAIN,0,0 +35,TRAIN,0,0 +36,TRAIN,0,0 +37,TRAIN,0,0 +38,TRAIN,0,0 +39,TRAIN,0,0 +40,TRAIN,0,0 +41,TRAIN,0,0 +42,TRAIN,0,0 +43,TRAIN,0,0 +44,TRAIN,0,0 +45,TRAIN,0,0 
+46,TRAIN,0,0 +47,TRAIN,0,0 +48,TRAIN,0,0 +49,TRAIN,0,0 +50,TRAIN,0,0 +51,TRAIN,0,0 +52,TRAIN,0,0 +53,TRAIN,0,0 +54,TRAIN,0,0 +55,TRAIN,0,0 +56,TRAIN,0,0 +57,TRAIN,0,0 +58,TRAIN,0,0 +59,TRAIN,0,0 +60,TRAIN,0,0 +61,TRAIN,0,0 +62,TRAIN,0,0 +63,TRAIN,0,0 +64,TRAIN,0,0 +65,TRAIN,0,0 +66,TRAIN,0,0 +67,TRAIN,0,0 +68,TRAIN,0,0 +69,TRAIN,0,0 +70,TRAIN,0,0 +71,TRAIN,0,0 +72,TRAIN,0,0 +73,TRAIN,0,0 +74,TRAIN,0,0 +75,TRAIN,0,0 +76,TRAIN,0,0 +77,TRAIN,0,0 +78,TRAIN,0,0 +79,TRAIN,0,0 +80,TRAIN,0,0 +81,TRAIN,0,0 +82,TRAIN,0,0 +83,TRAIN,0,0 +84,TRAIN,0,0 +85,TRAIN,0,0 +86,TRAIN,0,0 +87,TRAIN,0,0 +88,TRAIN,0,0 +89,TRAIN,0,0 +90,TRAIN,0,0 +91,TRAIN,0,0 +92,TRAIN,0,0 +93,TRAIN,0,0 +94,TRAIN,0,0 +95,TRAIN,0,0 +96,TRAIN,0,0 +97,TRAIN,0,0 +98,TRAIN,0,0 +99,TRAIN,0,0 +100,TRAIN,0,0 +101,TRAIN,0,0 +102,TRAIN,0,0 +103,TRAIN,0,0 +104,TRAIN,0,0 +105,TRAIN,0,0 +106,TRAIN,0,0 +107,TRAIN,0,0 +108,TRAIN,0,0 +109,TRAIN,0,0 +110,TRAIN,0,0 +111,TRAIN,0,0 +112,TRAIN,0,0 +113,TRAIN,0,0 +114,TRAIN,0,0 +115,TRAIN,0,0 +116,TRAIN,0,0 +117,TRAIN,0,0 +118,TRAIN,0,0 +119,TRAIN,0,0 +120,TRAIN,0,0 +121,TRAIN,0,0 +122,TRAIN,0,0 +123,TRAIN,0,0 +124,TRAIN,0,0 +125,TRAIN,0,0 +126,TRAIN,0,0 +127,TRAIN,0,0 +128,TRAIN,0,0 +129,TRAIN,0,0 +130,TRAIN,0,0 +131,TRAIN,0,0 +132,TRAIN,0,0 +133,TRAIN,0,0 +134,TRAIN,0,0 +135,TRAIN,0,0 +136,TRAIN,0,0 +137,TRAIN,0,0 +138,TRAIN,0,0 +139,TRAIN,0,0 +140,TRAIN,0,0 +141,TRAIN,0,0 +142,TRAIN,0,0 +143,TRAIN,0,0 +144,TRAIN,0,0 +145,TRAIN,0,0 +146,TRAIN,0,0 +147,TRAIN,0,0 +148,TRAIN,0,0 +149,TRAIN,0,0 +150,TRAIN,0,0 +151,TRAIN,0,0 +152,TRAIN,0,0 +153,TRAIN,0,0 +154,TRAIN,0,0 +155,TRAIN,0,0 +156,TRAIN,0,0 +157,TRAIN,0,0 +158,TRAIN,0,0 +159,TRAIN,0,0 +160,TRAIN,0,0 +161,TRAIN,0,0 +162,TRAIN,0,0 +163,TRAIN,0,0 +164,TRAIN,0,0 +165,TRAIN,0,0 +166,TRAIN,0,0 +167,TRAIN,0,0 +168,TRAIN,0,0 +169,TRAIN,0,0 +170,TRAIN,0,0 +171,TRAIN,0,0 +172,TRAIN,0,0 +173,TRAIN,0,0 +174,TRAIN,0,0 +175,TRAIN,0,0 +176,TRAIN,0,0 +177,TRAIN,0,0 +178,TRAIN,0,0 +179,TRAIN,0,0 +180,TRAIN,0,0 +181,TRAIN,0,0 +182,TRAIN,0,0 +183,TRAIN,0,0 +184,TRAIN,0,0 +185,TRAIN,0,0 +186,TRAIN,0,0 +187,TRAIN,0,0 +188,TRAIN,0,0 +189,TRAIN,0,0 +190,TRAIN,0,0 +191,TRAIN,0,0 +192,TRAIN,0,0 +193,TRAIN,0,0 +194,TRAIN,0,0 +195,TRAIN,0,0 +196,TRAIN,0,0 +197,TRAIN,0,0 +198,TRAIN,0,0 +199,TRAIN,0,0 +200,TRAIN,0,0 +201,TRAIN,0,0 +202,TRAIN,0,0 +203,TRAIN,0,0 +204,TRAIN,0,0 +205,TRAIN,0,0 +206,TRAIN,0,0 +207,TRAIN,0,0 +208,TRAIN,0,0 +209,TRAIN,0,0 +210,TRAIN,0,0 +211,TRAIN,0,0 +212,TRAIN,0,0 +213,TRAIN,0,0 +214,TRAIN,0,0 +215,TRAIN,0,0 +216,TRAIN,0,0 +217,TRAIN,0,0 +218,TRAIN,0,0 +219,TRAIN,0,0 +220,TRAIN,0,0 +221,TRAIN,0,0 +222,TRAIN,0,0 +223,TRAIN,0,0 +224,TRAIN,0,0 +225,TRAIN,0,0 +226,TRAIN,0,0 +227,TRAIN,0,0 +228,TRAIN,0,0 +229,TRAIN,0,0 +230,TRAIN,0,0 +231,TRAIN,0,0 +232,TRAIN,0,0 +233,TRAIN,0,0 +234,TRAIN,0,0 +235,TRAIN,0,0 +236,TRAIN,0,0 +237,TRAIN,0,0 +238,TRAIN,0,0 +239,TRAIN,0,0 +240,TRAIN,0,0 +241,TRAIN,0,0 +242,TRAIN,0,0 +243,TRAIN,0,0 +244,TRAIN,0,0 +245,TRAIN,0,0 +246,TRAIN,0,0 +247,TRAIN,0,0 +248,TRAIN,0,0 +249,TRAIN,0,0 +250,TRAIN,0,0 +251,TRAIN,0,0 +252,TRAIN,0,0 +253,TRAIN,0,0 +254,TRAIN,0,0 +255,TRAIN,0,0 +256,TRAIN,0,0 +257,TRAIN,0,0 +258,TRAIN,0,0 +259,TRAIN,0,0 +260,TRAIN,0,0 +261,TRAIN,0,0 +262,TRAIN,0,0 +263,TRAIN,0,0 +264,TRAIN,0,0 +265,TRAIN,0,0 +266,TRAIN,0,0 +267,TRAIN,0,0 +268,TRAIN,0,0 +269,TRAIN,0,0 +270,TRAIN,0,0 +271,TRAIN,0,0 +272,TRAIN,0,0 +273,TRAIN,0,0 +274,TRAIN,0,0 +275,TRAIN,0,0 +276,TRAIN,0,0 +277,TRAIN,0,0 +278,TRAIN,0,0 +279,TRAIN,0,0 +280,TRAIN,0,0 +281,TRAIN,0,0 +282,TRAIN,0,0 +283,TRAIN,0,0 +284,TRAIN,0,0 +285,TRAIN,0,0 
+286,TRAIN,0,0 +287,TRAIN,0,0 +288,TRAIN,0,0 +289,TRAIN,0,0 +290,TRAIN,0,0 +291,TRAIN,0,0 +292,TRAIN,0,0 +293,TRAIN,0,0 +294,TRAIN,0,0 +295,TRAIN,0,0 +296,TRAIN,0,0 +297,TRAIN,0,0 +298,TRAIN,0,0 +299,TRAIN,0,0 +300,TRAIN,0,0 +301,TRAIN,0,0 +302,TRAIN,0,0 +303,TRAIN,0,0 +304,TRAIN,0,0 +305,TRAIN,0,0 +306,TRAIN,0,0 +307,TRAIN,0,0 +308,TRAIN,0,0 +309,TRAIN,0,0 +310,TRAIN,0,0 +311,TRAIN,0,0 +312,TRAIN,0,0 +313,TRAIN,0,0 +314,TRAIN,0,0 +315,TRAIN,0,0 +316,TRAIN,0,0 +317,TRAIN,0,0 +318,TRAIN,0,0 +319,TRAIN,0,0 +320,TRAIN,0,0 +321,TRAIN,0,0 +322,TRAIN,0,0 +323,TRAIN,0,0 +324,TRAIN,0,0 +325,TRAIN,0,0 +326,TRAIN,0,0 +327,TRAIN,0,0 +328,TRAIN,0,0 +329,TRAIN,0,0 +330,TRAIN,0,0 +331,TRAIN,0,0 +332,TRAIN,0,0 +333,TRAIN,0,0 +334,TRAIN,0,0 +335,TRAIN,0,0 +336,TRAIN,0,0 +337,TRAIN,0,0 +338,TRAIN,0,0 +339,TRAIN,0,0 +340,TRAIN,0,0 +341,TRAIN,0,0 +342,TRAIN,0,0 +343,TRAIN,0,0 +344,TRAIN,0,0 +345,TRAIN,0,0 +346,TRAIN,0,0 +347,TRAIN,0,0 +348,TRAIN,0,0 +349,TRAIN,0,0 +350,TRAIN,0,0 +351,TRAIN,0,0 +352,TRAIN,0,0 +353,TRAIN,0,0 +354,TRAIN,0,0 +355,TRAIN,0,0 +356,TRAIN,0,0 +357,TRAIN,0,0 +358,TRAIN,0,0 +359,TRAIN,0,0 +360,TRAIN,0,0 +361,TRAIN,0,0 +362,TRAIN,0,0 +363,TRAIN,0,0 +364,TRAIN,0,0 +365,TRAIN,0,0 +366,TRAIN,0,0 +367,TRAIN,0,0 +368,TRAIN,0,0 +369,TRAIN,0,0 +370,TRAIN,0,0 +371,TRAIN,0,0 +372,TRAIN,0,0 +373,TRAIN,0,0 +374,TRAIN,0,0 +375,TRAIN,0,0 +376,TRAIN,0,0 +377,TRAIN,0,0 +378,TRAIN,0,0 +379,TRAIN,0,0 +380,TRAIN,0,0 +381,TRAIN,0,0 +382,TRAIN,0,0 +383,TRAIN,0,0 +384,TRAIN,0,0 +385,TRAIN,0,0 +386,TRAIN,0,0 +387,TRAIN,0,0 +388,TRAIN,0,0 +389,TRAIN,0,0 +390,TRAIN,0,0 +391,TRAIN,0,0 +392,TRAIN,0,0 +393,TRAIN,0,0 +394,TRAIN,0,0 +395,TRAIN,0,0 +396,TRAIN,0,0 +397,TRAIN,0,0 +398,TRAIN,0,0 +399,TRAIN,0,0 +400,TRAIN,0,0 +401,TRAIN,0,0 +402,TRAIN,0,0 +403,TRAIN,0,0 +404,TRAIN,0,0 +405,TRAIN,0,0 +406,TRAIN,0,0 +407,TRAIN,0,0 +408,TRAIN,0,0 +409,TRAIN,0,0 +410,TRAIN,0,0 +411,TRAIN,0,0 +412,TRAIN,0,0 +413,TRAIN,0,0 +414,TRAIN,0,0 +415,TRAIN,0,0 +416,TRAIN,0,0 +417,TRAIN,0,0 +418,TRAIN,0,0 +419,TRAIN,0,0 +420,TRAIN,0,0 +421,TRAIN,0,0 +422,TRAIN,0,0 +423,TRAIN,0,0 +424,TRAIN,0,0 +425,TRAIN,0,0 +426,TRAIN,0,0 +427,TRAIN,0,0 +428,TRAIN,0,0 +429,TRAIN,0,0 +430,TRAIN,0,0 +431,TRAIN,0,0 +432,TRAIN,0,0 +433,TRAIN,0,0 +434,TRAIN,0,0 +435,TRAIN,0,0 +436,TRAIN,0,0 +437,TRAIN,0,0 +438,TRAIN,0,0 +439,TRAIN,0,0 +440,TRAIN,0,0 +441,TRAIN,0,0 +442,TRAIN,0,0 +443,TRAIN,0,0 +444,TRAIN,0,0 +445,TRAIN,0,0 +446,TRAIN,0,0 +447,TRAIN,0,0 +448,TRAIN,0,0 +449,TRAIN,0,0 +450,TRAIN,0,0 +451,TRAIN,0,0 +452,TRAIN,0,0 +453,TRAIN,0,0 +454,TRAIN,0,0 +455,TRAIN,0,0 +456,TRAIN,0,0 +457,TRAIN,0,0 +458,TRAIN,0,0 +459,TRAIN,0,0 +460,TRAIN,0,0 +461,TRAIN,0,0 +462,TRAIN,0,0 +463,TRAIN,0,0 +464,TRAIN,0,0 +465,TRAIN,0,0 +466,TRAIN,0,0 +467,TRAIN,0,0 +468,TRAIN,0,0 +469,TRAIN,0,0 +470,TRAIN,0,0 +471,TRAIN,0,0 +472,TRAIN,0,0 +473,TRAIN,0,0 +474,TRAIN,0,0 +475,TRAIN,0,0 +476,TRAIN,0,0 +477,TRAIN,0,0 +478,TRAIN,0,0 +479,TRAIN,0,0 +480,TRAIN,0,0 +481,TRAIN,0,0 +482,TRAIN,0,0 +483,TRAIN,0,0 +484,TRAIN,0,0 +485,TRAIN,0,0 +486,TRAIN,0,0 +487,TRAIN,0,0 +488,TRAIN,0,0 +489,TRAIN,0,0 +490,TRAIN,0,0 +491,TRAIN,0,0 +492,TRAIN,0,0 +493,TRAIN,0,0 +494,TRAIN,0,0 +495,TRAIN,0,0 +496,TRAIN,0,0 +497,TRAIN,0,0 +498,TRAIN,0,0 +499,TRAIN,0,0 +500,TRAIN,0,0 +501,TRAIN,0,0 +502,TRAIN,0,0 +503,TRAIN,0,0 +504,TRAIN,0,0 +505,TRAIN,0,0 +506,TRAIN,0,0 +507,TRAIN,0,0 +508,TRAIN,0,0 +509,TRAIN,0,0 +510,TRAIN,0,0 +511,TRAIN,0,0 +512,TRAIN,0,0 +513,TRAIN,0,0 +514,TRAIN,0,0 +515,TRAIN,0,0 +516,TRAIN,0,0 +517,TRAIN,0,0 +518,TRAIN,0,0 +519,TRAIN,0,0 +520,TRAIN,0,0 +521,TRAIN,0,0 +522,TRAIN,0,0 
+523,TRAIN,0,0 +524,TRAIN,0,0 +525,TRAIN,0,0 +526,TRAIN,0,0 +527,TRAIN,0,0 +528,TRAIN,0,0 +529,TRAIN,0,0 +530,TRAIN,0,0 +531,TRAIN,0,0 +532,TRAIN,0,0 +533,TRAIN,0,0 +534,TRAIN,0,0 +535,TRAIN,0,0 +536,TRAIN,0,0 +537,TRAIN,0,0 +538,TRAIN,0,0 +539,TRAIN,0,0 +540,TRAIN,0,0 +541,TRAIN,0,0 +542,TRAIN,0,0 +543,TRAIN,0,0 +544,TRAIN,0,0 +545,TRAIN,0,0 +546,TRAIN,0,0 +547,TRAIN,0,0 +548,TRAIN,0,0 +549,TRAIN,0,0 +550,TRAIN,0,0 +551,TRAIN,0,0 +552,TRAIN,0,0 +553,TRAIN,0,0 +554,TRAIN,0,0 +555,TRAIN,0,0 +556,TRAIN,0,0 +557,TRAIN,0,0 +558,TRAIN,0,0 +559,TRAIN,0,0 +560,TRAIN,0,0 +561,TRAIN,0,0 +562,TRAIN,0,0 +563,TRAIN,0,0 +564,TRAIN,0,0 +565,TRAIN,0,0 +566,TRAIN,0,0 +567,TRAIN,0,0 +568,TRAIN,0,0 +569,TRAIN,0,0 +570,TRAIN,0,0 +571,TRAIN,0,0 +572,TRAIN,0,0 +573,TRAIN,0,0 +574,TRAIN,0,0 +575,TRAIN,0,0 +576,TRAIN,0,0 +577,TRAIN,0,0 +578,TRAIN,0,0 +579,TRAIN,0,0 +580,TRAIN,0,0 +581,TRAIN,0,0 +582,TRAIN,0,0 +583,TRAIN,0,0 +584,TRAIN,0,0 +585,TRAIN,0,0 +586,TRAIN,0,0 +587,TRAIN,0,0 +588,TRAIN,0,0 +589,TRAIN,0,0 +590,TRAIN,0,0 +591,TRAIN,0,0 +592,TRAIN,0,0 +593,TRAIN,0,0 +594,TRAIN,0,0 +595,TRAIN,0,0 +596,TRAIN,0,0 +597,TRAIN,0,0 +598,TRAIN,0,0 +599,TRAIN,0,0 +600,TRAIN,0,0 +601,TRAIN,0,0 +602,TRAIN,0,0 +603,TRAIN,0,0 +604,TRAIN,0,0 +605,TRAIN,0,0 +606,TRAIN,0,0 +607,TRAIN,0,0 +608,TRAIN,0,0 +609,TRAIN,0,0 +610,TRAIN,0,0 +611,TRAIN,0,0 +612,TRAIN,0,0 +613,TRAIN,0,0 +614,TRAIN,0,0 +615,TRAIN,0,0 +616,TRAIN,0,0 +617,TRAIN,0,0 +618,TRAIN,0,0 +619,TRAIN,0,0 +620,TRAIN,0,0 +621,TRAIN,0,0 +622,TRAIN,0,0 +623,TRAIN,0,0 +624,TRAIN,0,0 +625,TRAIN,0,0 +626,TRAIN,0,0 +627,TRAIN,0,0 +628,TRAIN,0,0 +629,TRAIN,0,0 +630,TRAIN,0,0 +631,TRAIN,0,0 +632,TRAIN,0,0 +633,TRAIN,0,0 +634,TRAIN,0,0 +635,TRAIN,0,0 +636,TRAIN,0,0 +637,TRAIN,0,0 +638,TRAIN,0,0 +639,TRAIN,0,0 +640,TRAIN,0,0 +641,TRAIN,0,0 +642,TRAIN,0,0 +643,TRAIN,0,0 +644,TRAIN,0,0 +645,TRAIN,0,0 +646,TRAIN,0,0 +647,TRAIN,0,0 +648,TRAIN,0,0 +649,TRAIN,0,0 +650,TRAIN,0,0 +651,TRAIN,0,0 +652,TRAIN,0,0 +653,TRAIN,0,0 +654,TRAIN,0,0 +655,TRAIN,0,0 +656,TRAIN,0,0 +657,TRAIN,0,0 +658,TRAIN,0,0 +659,TRAIN,0,0 +660,TRAIN,0,0 +661,TRAIN,0,0 +662,TRAIN,0,0 +663,TRAIN,0,0 +664,TRAIN,0,0 +665,TRAIN,0,0 +666,TRAIN,0,0 +667,TRAIN,0,0 +668,TRAIN,0,0 +669,TRAIN,0,0 +670,TRAIN,0,0 +671,TRAIN,0,0 +672,TRAIN,0,0 +673,TRAIN,0,0 +674,TRAIN,0,0 +675,TRAIN,0,0 +676,TRAIN,0,0 +677,TRAIN,0,0 +678,TRAIN,0,0 +679,TRAIN,0,0 +680,TRAIN,0,0 +681,TRAIN,0,0 +682,TRAIN,0,0 +683,TRAIN,0,0 +684,TRAIN,0,0 +685,TRAIN,0,0 +686,TRAIN,0,0 +687,TRAIN,0,0 +688,TRAIN,0,0 +689,TRAIN,0,0 +690,TRAIN,0,0 +691,TRAIN,0,0 +692,TRAIN,0,0 +693,TRAIN,0,0 +694,TRAIN,0,0 +695,TRAIN,0,0 +696,TRAIN,0,0 +697,TRAIN,0,0 +698,TRAIN,0,0 +699,TRAIN,0,0 +700,TRAIN,0,0 +701,TRAIN,0,0 +702,TRAIN,0,0 +703,TRAIN,0,0 +704,TRAIN,0,0 +705,TRAIN,0,0 +706,TRAIN,0,0 +707,TRAIN,0,0 +708,TRAIN,0,0 +709,TRAIN,0,0 +710,TRAIN,0,0 +711,TRAIN,0,0 +712,TRAIN,0,0 +713,TRAIN,0,0 +714,TRAIN,0,0 +715,TRAIN,0,0 +716,TRAIN,0,0 +717,TRAIN,0,0 +718,TRAIN,0,0 +719,TRAIN,0,0 +720,TRAIN,0,0 +721,TRAIN,0,0 +722,TRAIN,0,0 +723,TRAIN,0,0 +724,TRAIN,0,0 +725,TRAIN,0,0 +726,TRAIN,0,0 +727,TRAIN,0,0 +728,TRAIN,0,0 +729,TRAIN,0,0 +730,TRAIN,0,0 +731,TRAIN,0,0 +732,TRAIN,0,0 +733,TRAIN,0,0 +734,TRAIN,0,0 +735,TRAIN,0,0 +736,TRAIN,0,0 +737,TRAIN,0,0 +738,TRAIN,0,0 +739,TRAIN,0,0 +740,TRAIN,0,0 +741,TRAIN,0,0 +742,TRAIN,0,0 +743,TRAIN,0,0 +744,TRAIN,0,0 +745,TRAIN,0,0 +746,TRAIN,0,0 +747,TRAIN,0,0 +748,TRAIN,0,0 +749,TRAIN,0,0 +750,TRAIN,0,0 +751,TRAIN,0,0 +752,TRAIN,0,0 +753,TRAIN,0,0 +754,TRAIN,0,0 +755,TRAIN,0,0 +756,TRAIN,0,0 +757,TRAIN,0,0 +758,TRAIN,0,0 +759,TRAIN,0,0 
+760,TRAIN,0,0 +761,TRAIN,0,0 +762,TRAIN,0,0 +763,TRAIN,0,0 +764,TRAIN,0,0 +765,TRAIN,0,0 +766,TRAIN,0,0 +767,TRAIN,0,0 +768,TRAIN,0,0 +769,TRAIN,0,0 +770,TRAIN,0,0 +771,TRAIN,0,0 +772,TRAIN,0,0 +773,TRAIN,0,0 +774,TRAIN,0,0 +775,TRAIN,0,0 +776,TRAIN,0,0 +777,TRAIN,0,0 +778,TRAIN,0,0 +779,TRAIN,0,0 +780,TRAIN,0,0 +781,TRAIN,0,0 +782,TRAIN,0,0 +783,TRAIN,0,0 +784,TRAIN,0,0 +785,TRAIN,0,0 +786,TRAIN,0,0 +787,TRAIN,0,0 +788,TRAIN,0,0 +789,TRAIN,0,0 +790,TRAIN,0,0 +791,TRAIN,0,0 +792,TRAIN,0,0 +793,TRAIN,0,0 +794,TRAIN,0,0 +795,TRAIN,0,0 +796,TRAIN,0,0 +797,TRAIN,0,0 +798,TRAIN,0,0 +799,TRAIN,0,0 +800,TRAIN,0,0 +801,TRAIN,0,0 +802,TRAIN,0,0 +803,TRAIN,0,0 +804,TRAIN,0,0 +805,TRAIN,0,0 +806,TRAIN,0,0 +807,TRAIN,0,0 +808,TRAIN,0,0 +809,TRAIN,0,0 +810,TRAIN,0,0 +811,TRAIN,0,0 +812,TRAIN,0,0 +813,TRAIN,0,0 +814,TRAIN,0,0 +815,TRAIN,0,0 +816,TRAIN,0,0 +817,TRAIN,0,0 +818,TRAIN,0,0 +819,TRAIN,0,0 +820,TRAIN,0,0 +821,TRAIN,0,0 +822,TRAIN,0,0 +823,TRAIN,0,0 +824,TRAIN,0,0 +825,TRAIN,0,0 +826,TRAIN,0,0 +827,TRAIN,0,0 +828,TRAIN,0,0 +829,TRAIN,0,0 +830,TRAIN,0,0 +831,TRAIN,0,0 +832,TRAIN,0,0 +833,TRAIN,0,0 +834,TRAIN,0,0 +835,TRAIN,0,0 +836,TRAIN,0,0 +837,TRAIN,0,0 +838,TRAIN,0,0 +839,TRAIN,0,0 +840,TRAIN,0,0 +841,TRAIN,0,0 +842,TRAIN,0,0 +843,TRAIN,0,0 +844,TRAIN,0,0 +845,TRAIN,0,0 +846,TRAIN,0,0 +847,TRAIN,0,0 +848,TRAIN,0,0 +849,TRAIN,0,0 +850,TRAIN,0,0 +851,TRAIN,0,0 +852,TRAIN,0,0 +853,TRAIN,0,0 +854,TRAIN,0,0 +855,TRAIN,0,0 +856,TRAIN,0,0 +857,TRAIN,0,0 +858,TRAIN,0,0 +859,TRAIN,0,0 +860,TRAIN,0,0 +861,TRAIN,0,0 +862,TRAIN,0,0 +863,TRAIN,0,0 +864,TRAIN,0,0 +865,TRAIN,0,0 +866,TRAIN,0,0 +867,TRAIN,0,0 +868,TRAIN,0,0 +869,TRAIN,0,0 +870,TRAIN,0,0 +871,TRAIN,0,0 +872,TRAIN,0,0 +873,TRAIN,0,0 +874,TRAIN,0,0 +875,TRAIN,0,0 +876,TRAIN,0,0 +877,TRAIN,0,0 +878,TRAIN,0,0 +879,TRAIN,0,0 +880,TRAIN,0,0 +881,TRAIN,0,0 +882,TRAIN,0,0 +883,TRAIN,0,0 +884,TRAIN,0,0 +885,TRAIN,0,0 +886,TRAIN,0,0 +887,TRAIN,0,0 +888,TRAIN,0,0 +889,TRAIN,0,0 +890,TRAIN,0,0 +891,TRAIN,0,0 +892,TRAIN,0,0 +893,TRAIN,0,0 +894,TRAIN,0,0 +895,TRAIN,0,0 +896,TRAIN,0,0 +897,TRAIN,0,0 +898,TRAIN,0,0 +899,TRAIN,0,0 +900,TRAIN,0,0 +901,TRAIN,0,0 +902,TRAIN,0,0 +903,TRAIN,0,0 +904,TRAIN,0,0 +905,TRAIN,0,0 +906,TRAIN,0,0 +907,TRAIN,0,0 +908,TRAIN,0,0 +909,TRAIN,0,0 +910,TRAIN,0,0 +911,TRAIN,0,0 +912,TRAIN,0,0 +913,TRAIN,0,0 +914,TRAIN,0,0 +915,TRAIN,0,0 +916,TRAIN,0,0 +917,TRAIN,0,0 +918,TRAIN,0,0 +919,TRAIN,0,0 +920,TRAIN,0,0 +921,TRAIN,0,0 +922,TRAIN,0,0 +923,TRAIN,0,0 +924,TRAIN,0,0 +925,TRAIN,0,0 +926,TRAIN,0,0 +927,TRAIN,0,0 +928,TRAIN,0,0 +929,TRAIN,0,0 +930,TRAIN,0,0 +931,TRAIN,0,0 +932,TRAIN,0,0 +933,TRAIN,0,0 +934,TRAIN,0,0 +935,TRAIN,0,0 +936,TRAIN,0,0 +937,TRAIN,0,0 +938,TRAIN,0,0 +939,TRAIN,0,0 +940,TRAIN,0,0 +941,TRAIN,0,0 +942,TRAIN,0,0 +943,TRAIN,0,0 +944,TRAIN,0,0 +945,TRAIN,0,0 +946,TRAIN,0,0 +947,TRAIN,0,0 +948,TRAIN,0,0 +949,TRAIN,0,0 +950,TRAIN,0,0 +951,TRAIN,0,0 +952,TRAIN,0,0 +953,TRAIN,0,0 +954,TRAIN,0,0 +955,TRAIN,0,0 +956,TRAIN,0,0 +957,TRAIN,0,0 +958,TRAIN,0,0 +959,TRAIN,0,0 +960,TRAIN,0,0 +961,TRAIN,0,0 +962,TRAIN,0,0 +963,TRAIN,0,0 +964,TRAIN,0,0 +965,TRAIN,0,0 +966,TRAIN,0,0 +967,TRAIN,0,0 +968,TRAIN,0,0 +969,TRAIN,0,0 +970,TRAIN,0,0 +971,TRAIN,0,0 +972,TRAIN,0,0 +973,TRAIN,0,0 +974,TRAIN,0,0 +975,TRAIN,0,0 +976,TRAIN,0,0 +977,TRAIN,0,0 +978,TRAIN,0,0 +979,TRAIN,0,0 +980,TRAIN,0,0 +981,TRAIN,0,0 +982,TRAIN,0,0 +983,TRAIN,0,0 +984,TRAIN,0,0 +985,TRAIN,0,0 +986,TRAIN,0,0 +987,TRAIN,0,0 +988,TRAIN,0,0 +989,TRAIN,0,0 +990,TRAIN,0,0 +991,TRAIN,0,0 +992,TRAIN,0,0 +993,TRAIN,0,0 +994,TRAIN,0,0 +995,TRAIN,0,0 +996,TRAIN,0,0 
+997,TRAIN,0,0 +998,TRAIN,0,0 +999,TRAIN,0,0 +1000,TRAIN,0,0 +1001,TRAIN,0,0 +1002,TRAIN,0,0 +1003,TRAIN,0,0 +1004,TRAIN,0,0 +1005,TRAIN,0,0 +1006,TRAIN,0,0 +1007,TRAIN,0,0 +1008,TRAIN,0,0 +1009,TRAIN,0,0 +1010,TRAIN,0,0 +1011,TRAIN,0,0 +1012,TRAIN,0,0 +1013,TRAIN,0,0 +1014,TRAIN,0,0 +1015,TRAIN,0,0 +1016,TRAIN,0,0 +1017,TRAIN,0,0 +1018,TRAIN,0,0 +1019,TRAIN,0,0 +1020,TRAIN,0,0 +1021,TRAIN,0,0 +1022,TRAIN,0,0 +1023,TRAIN,0,0 +1024,TRAIN,0,0 +1025,TRAIN,0,0 +1026,TRAIN,0,0 +1027,TRAIN,0,0 +1028,TRAIN,0,0 +1029,TRAIN,0,0 +1030,TRAIN,0,0 +1031,TRAIN,0,0 +1032,TRAIN,0,0 +1033,TRAIN,0,0 +1034,TRAIN,0,0 +1035,TRAIN,0,0 +1036,TRAIN,0,0 +1037,TRAIN,0,0 +1038,TRAIN,0,0 +1039,TRAIN,0,0 +1040,TRAIN,0,0 +1041,TRAIN,0,0 +1042,TRAIN,0,0 +1043,TRAIN,0,0 +1044,TRAIN,0,0 +1045,TRAIN,0,0 +1046,TRAIN,0,0 +1047,TRAIN,0,0 +1048,TRAIN,0,0 +1049,TRAIN,0,0 +1050,TRAIN,0,0 +1051,TRAIN,0,0 +1052,TRAIN,0,0 +1053,TRAIN,0,0 +1054,TRAIN,0,0 +1055,TRAIN,0,0 +1056,TRAIN,0,0 +1057,TRAIN,0,0 +1058,TRAIN,0,0 +1059,TRAIN,0,0 +1060,TRAIN,0,0 +1061,TRAIN,0,0 +1062,TRAIN,0,0 +1063,TRAIN,0,0 +1064,TRAIN,0,0 +1065,TRAIN,0,0 +1066,TRAIN,0,0 +1067,TRAIN,0,0 +1068,TRAIN,0,0 +1069,TRAIN,0,0 +1070,TRAIN,0,0 +1071,TRAIN,0,0 +1072,TRAIN,0,0 +1073,TRAIN,0,0 +1074,TRAIN,0,0 +1075,TRAIN,0,0 +1076,TRAIN,0,0 +1077,TRAIN,0,0 +1078,TRAIN,0,0 +1079,TRAIN,0,0 +1080,TRAIN,0,0 +1081,TRAIN,0,0 +1082,TRAIN,0,0 +1083,TRAIN,0,0 +1084,TRAIN,0,0 +1085,TRAIN,0,0 +1086,TRAIN,0,0 +1087,TRAIN,0,0 +1088,TRAIN,0,0 +1089,TRAIN,0,0 +1090,TRAIN,0,0 +1091,TRAIN,0,0 +1092,TRAIN,0,0 +1093,TRAIN,0,0 +1094,TRAIN,0,0 +1095,TRAIN,0,0 +1096,TRAIN,0,0 +1097,TRAIN,0,0 +1098,TRAIN,0,0 +1099,TRAIN,0,0 +1100,TRAIN,0,0 +1101,TRAIN,0,0 +1102,TRAIN,0,0 +1103,TRAIN,0,0 +1104,TRAIN,0,0 +1105,TRAIN,0,0 +1106,TRAIN,0,0 +1107,TRAIN,0,0 +1108,TRAIN,0,0 +1109,TRAIN,0,0 +1110,TRAIN,0,0 +1111,TRAIN,0,0 +1112,TRAIN,0,0 +1113,TRAIN,0,0 +1114,TRAIN,0,0 +1115,TRAIN,0,0 +1116,TRAIN,0,0 +1117,TRAIN,0,0 +1118,TRAIN,0,0 +1119,TRAIN,0,0 +1120,TRAIN,0,0 +1121,TRAIN,0,0 +1122,TRAIN,0,0 +1123,TRAIN,0,0 +1124,TRAIN,0,0 +1125,TRAIN,0,0 +1126,TRAIN,0,0 +1127,TRAIN,0,0 +1128,TRAIN,0,0 +1129,TRAIN,0,0 +1130,TRAIN,0,0 +1131,TRAIN,0,0 +1132,TRAIN,0,0 +1133,TRAIN,0,0 +1134,TRAIN,0,0 +1135,TRAIN,0,0 +1136,TRAIN,0,0 +1137,TRAIN,0,0 +1138,TRAIN,0,0 +1139,TRAIN,0,0 +1140,TRAIN,0,0 +1141,TRAIN,0,0 +1142,TRAIN,0,0 +1143,TRAIN,0,0 +1144,TRAIN,0,0 +1145,TRAIN,0,0 +1146,TRAIN,0,0 +1147,TRAIN,0,0 +1148,TRAIN,0,0 +1149,TRAIN,0,0 +1150,TRAIN,0,0 +1151,TRAIN,0,0 +1152,TRAIN,0,0 +1153,TRAIN,0,0 +1154,TRAIN,0,0 +1155,TRAIN,0,0 +1156,TRAIN,0,0 +1157,TRAIN,0,0 +1158,TRAIN,0,0 +1159,TRAIN,0,0 +1160,TRAIN,0,0 +1161,TRAIN,0,0 +1162,TRAIN,0,0 +1163,TRAIN,0,0 +1164,TRAIN,0,0 +1165,TRAIN,0,0 +1166,TRAIN,0,0 +1167,TRAIN,0,0 +1168,TRAIN,0,0 +1169,TRAIN,0,0 +1170,TRAIN,0,0 +1171,TRAIN,0,0 +1172,TRAIN,0,0 +1173,TRAIN,0,0 +1174,TRAIN,0,0 +1175,TRAIN,0,0 +1176,TRAIN,0,0 +1177,TRAIN,0,0 +1178,TRAIN,0,0 +1179,TRAIN,0,0 +1180,TRAIN,0,0 +1181,TRAIN,0,0 +1182,TRAIN,0,0 +1183,TRAIN,0,0 +1184,TRAIN,0,0 +1185,TRAIN,0,0 +1186,TRAIN,0,0 +1187,TRAIN,0,0 +1188,TRAIN,0,0 +1189,TRAIN,0,0 +1190,TRAIN,0,0 +1191,TRAIN,0,0 +1192,TRAIN,0,0 +1193,TRAIN,0,0 +1194,TRAIN,0,0 +1195,TRAIN,0,0 +1196,TRAIN,0,0 +1197,TRAIN,0,0 +1198,TRAIN,0,0 +1199,TRAIN,0,0 +1200,TRAIN,0,0 +1201,TRAIN,0,0 +1202,TRAIN,0,0 +1203,TRAIN,0,0 +1204,TRAIN,0,0 +1205,TRAIN,0,0 +1206,TRAIN,0,0 +1207,TRAIN,0,0 +1208,TRAIN,0,0 +1209,TRAIN,0,0 +1210,TRAIN,0,0 +1211,TRAIN,0,0 +1212,TRAIN,0,0 +1213,TRAIN,0,0 +1214,TRAIN,0,0 +1215,TRAIN,0,0 +1216,TRAIN,0,0 +1217,TRAIN,0,0 +1218,TRAIN,0,0 
+1219,TRAIN,0,0 +1220,TRAIN,0,0 +1221,TRAIN,0,0 +1222,TRAIN,0,0 +1223,TRAIN,0,0 +1224,TRAIN,0,0 +1225,TRAIN,0,0 +1226,TRAIN,0,0 +1227,TRAIN,0,0 +1228,TRAIN,0,0 +1229,TRAIN,0,0 +1230,TRAIN,0,0 +1231,TRAIN,0,0 +1232,TRAIN,0,0 +1233,TRAIN,0,0 +1234,TRAIN,0,0 +1235,TRAIN,0,0 +1236,TRAIN,0,0 +1237,TRAIN,0,0 +1238,TRAIN,0,0 +1239,TRAIN,0,0 +1240,TRAIN,0,0 +1241,TRAIN,0,0 +1242,TRAIN,0,0 +1243,TRAIN,0,0 +1244,TRAIN,0,0 +1245,TRAIN,0,0 +1246,TRAIN,0,0 +1247,TRAIN,0,0 +1248,TRAIN,0,0 +1249,TRAIN,0,0 +1250,TRAIN,0,0 +1251,TRAIN,0,0 +1252,TRAIN,0,0 +1253,TRAIN,0,0 +1254,TRAIN,0,0 +1255,TRAIN,0,0 +1256,TRAIN,0,0 +1257,TRAIN,0,0 +1258,TRAIN,0,0 +1259,TRAIN,0,0 +1260,TRAIN,0,0 +1261,TRAIN,0,0 +1262,TRAIN,0,0 +1263,TRAIN,0,0 +1264,TRAIN,0,0 +1265,TRAIN,0,0 +1266,TRAIN,0,0 +1267,TRAIN,0,0 +1268,TRAIN,0,0 +1269,TRAIN,0,0 +1270,TRAIN,0,0 +1271,TRAIN,0,0 +1272,TRAIN,0,0 +1273,TRAIN,0,0 +1274,TRAIN,0,0 +1275,TRAIN,0,0 +1276,TRAIN,0,0 +1277,TRAIN,0,0 +1278,TRAIN,0,0 +1279,TRAIN,0,0 +1280,TRAIN,0,0 +1281,TRAIN,0,0 +1282,TRAIN,0,0 +1283,TRAIN,0,0 +1284,TRAIN,0,0 +1285,TRAIN,0,0 +1286,TRAIN,0,0 +1287,TRAIN,0,0 +1288,TRAIN,0,0 +1289,TRAIN,0,0 +1290,TRAIN,0,0 +1291,TRAIN,0,0 +1292,TRAIN,0,0 +1293,TRAIN,0,0 +1294,TRAIN,0,0 +1295,TRAIN,0,0 +1296,TRAIN,0,0 +1297,TRAIN,0,0 +1298,TRAIN,0,0 +1299,TRAIN,0,0 +1300,TRAIN,0,0 +1301,TRAIN,0,0 +1302,TRAIN,0,0 +1303,TRAIN,0,0 +1304,TRAIN,0,0 +1305,TRAIN,0,0 +1306,TRAIN,0,0 +1307,TRAIN,0,0 +1308,TRAIN,0,0 +1309,TRAIN,0,0 +1310,TRAIN,0,0 +1311,TRAIN,0,0 +1312,TRAIN,0,0 +1313,TRAIN,0,0 +1314,TRAIN,0,0 +1315,TRAIN,0,0 +1316,TRAIN,0,0 +1317,TRAIN,0,0 +1318,TRAIN,0,0 +1319,TRAIN,0,0 +1320,TRAIN,0,0 +1321,TRAIN,0,0 +1322,TRAIN,0,0 +1323,TRAIN,0,0 +1324,TRAIN,0,0 +1325,TRAIN,0,0 +1326,TRAIN,0,0 +1327,TRAIN,0,0 +1328,TRAIN,0,0 +1329,TRAIN,0,0 +1330,TRAIN,0,0 +1331,TRAIN,0,0 +1332,TRAIN,0,0 +1333,TRAIN,0,0 +1334,TRAIN,0,0 +1335,TRAIN,0,0 +1336,TRAIN,0,0 +1337,TRAIN,0,0 +1338,TRAIN,0,0 +1339,TRAIN,0,0 +1340,TRAIN,0,0 +1341,TRAIN,0,0 +1342,TRAIN,0,0 +1343,TRAIN,0,0 +1344,TRAIN,0,0 +1345,TRAIN,0,0 +1346,TRAIN,0,0 +1347,TRAIN,0,0 +1348,TRAIN,0,0 +1349,TRAIN,0,0 +1350,TRAIN,0,0 +1351,TRAIN,0,0 +1352,TRAIN,0,0 +1353,TRAIN,0,0 +1354,TRAIN,0,0 +1355,TRAIN,0,0 +1356,TRAIN,0,0 +1357,TRAIN,0,0 +1358,TRAIN,0,0 +1359,TRAIN,0,0 +1360,TRAIN,0,0 +1361,TRAIN,0,0 +1362,TRAIN,0,0 +1363,TRAIN,0,0 +1364,TRAIN,0,0 +1365,TRAIN,0,0 +1366,TRAIN,0,0 +1367,TRAIN,0,0 +1368,TRAIN,0,0 +1369,TRAIN,0,0 +1370,TRAIN,0,0 +1371,TRAIN,0,0 +1372,TRAIN,0,0 +1373,TRAIN,0,0 +1374,TRAIN,0,0 +1375,TRAIN,0,0 +1376,TRAIN,0,0 +1377,TRAIN,0,0 +1378,TRAIN,0,0 +1379,TRAIN,0,0 +1380,TRAIN,0,0 +1381,TRAIN,0,0 +1382,TRAIN,0,0 +1383,TRAIN,0,0 +1384,TRAIN,0,0 +1385,TRAIN,0,0 +1386,TRAIN,0,0 +1387,TRAIN,0,0 +1388,TRAIN,0,0 +1389,TRAIN,0,0 +1390,TRAIN,0,0 +1391,TRAIN,0,0 +1392,TRAIN,0,0 +1393,TRAIN,0,0 +1394,TRAIN,0,0 +1395,TRAIN,0,0 +1396,TRAIN,0,0 +1397,TRAIN,0,0 +1398,TRAIN,0,0 +1399,TRAIN,0,0 +1400,TRAIN,0,0 +1401,TRAIN,0,0 +1402,TRAIN,0,0 +1403,TRAIN,0,0 +1404,TRAIN,0,0 +1405,TRAIN,0,0 +1406,TRAIN,0,0 +1407,TRAIN,0,0 +1408,TRAIN,0,0 +1409,TRAIN,0,0 +1410,TRAIN,0,0 +1411,TRAIN,0,0 +1412,TRAIN,0,0 +1413,TRAIN,0,0 +1414,TRAIN,0,0 +1415,TRAIN,0,0 +1416,TRAIN,0,0 +1417,TRAIN,0,0 +1418,TRAIN,0,0 +1419,TRAIN,0,0 +1420,TRAIN,0,0 +1421,TRAIN,0,0 +1422,TRAIN,0,0 +1423,TRAIN,0,0 +1424,TRAIN,0,0 +1425,TRAIN,0,0 +1426,TRAIN,0,0 +1427,TRAIN,0,0 +1428,TRAIN,0,0 +1429,TRAIN,0,0 +1430,TRAIN,0,0 +1431,TRAIN,0,0 +1432,TRAIN,0,0 +1433,TRAIN,0,0 +1434,TRAIN,0,0 +1435,TRAIN,0,0 +1436,TRAIN,0,0 +1437,TRAIN,0,0 +1438,TRAIN,0,0 +1439,TRAIN,0,0 +1440,TRAIN,0,0 
+1441,TRAIN,0,0 +1442,TRAIN,0,0 +1443,TRAIN,0,0 +1444,TRAIN,0,0 +1445,TRAIN,0,0 +1446,TRAIN,0,0 +1447,TRAIN,0,0 +1448,TRAIN,0,0 +1449,TRAIN,0,0 +1450,TRAIN,0,0 +1451,TRAIN,0,0 +1452,TRAIN,0,0 +1453,TRAIN,0,0 +1454,TRAIN,0,0 +1455,TRAIN,0,0 +1456,TRAIN,0,0 +1457,TRAIN,0,0 +1458,TRAIN,0,0 +1459,TRAIN,0,0 +1460,TRAIN,0,0 +1461,TRAIN,0,0 +1462,TRAIN,0,0 +1463,TRAIN,0,0 +1464,TRAIN,0,0 +1465,TRAIN,0,0 +1466,TRAIN,0,0 +1467,TRAIN,0,0 +1468,TRAIN,0,0 +1469,TRAIN,0,0 +1470,TRAIN,0,0 +1471,TRAIN,0,0 +1472,TRAIN,0,0 +1473,TRAIN,0,0 +1474,TRAIN,0,0 +1475,TRAIN,0,0 +1476,TRAIN,0,0 +1477,TRAIN,0,0 +1478,TRAIN,0,0 +1479,TRAIN,0,0 +1480,TRAIN,0,0 +1481,TRAIN,0,0 +1482,TRAIN,0,0 +1483,TRAIN,0,0 +1484,TRAIN,0,0 +1485,TRAIN,0,0 +1486,TRAIN,0,0 +1487,TRAIN,0,0 +1488,TRAIN,0,0 +1489,TRAIN,0,0 +1490,TRAIN,0,0 +1491,TRAIN,0,0 +1492,TRAIN,0,0 +1493,TRAIN,0,0 +1494,TRAIN,0,0 +1495,TRAIN,0,0 +1496,TRAIN,0,0 +1497,TRAIN,0,0 +1498,TRAIN,0,0 +1499,TRAIN,0,0 +1500,TRAIN,0,0 +1501,TRAIN,0,0 +1502,TRAIN,0,0 +1503,TRAIN,0,0 +1504,TRAIN,0,0 +1505,TRAIN,0,0 +1506,TRAIN,0,0 +1507,TRAIN,0,0 +1508,TRAIN,0,0 +1509,TRAIN,0,0 +1510,TRAIN,0,0 +1511,TRAIN,0,0 +1512,TRAIN,0,0 +1513,TRAIN,0,0 +1514,TRAIN,0,0 +1515,TRAIN,0,0 +1516,TRAIN,0,0 +1517,TRAIN,0,0 +1518,TRAIN,0,0 +1519,TRAIN,0,0 +1520,TRAIN,0,0 +1521,TRAIN,0,0 +1522,TRAIN,0,0 +1523,TRAIN,0,0 +1524,TRAIN,0,0 +1525,TRAIN,0,0 +1526,TRAIN,0,0 +1527,TRAIN,0,0 +1528,TRAIN,0,0 +1529,TRAIN,0,0 +1530,TRAIN,0,0 +1531,TRAIN,0,0 +1532,TRAIN,0,0 +1533,TRAIN,0,0 +1534,TRAIN,0,0 +1535,TRAIN,0,0 +1536,TRAIN,0,0 +1537,TRAIN,0,0 +1538,TRAIN,0,0 +1539,TRAIN,0,0 +1540,TRAIN,0,0 +1541,TRAIN,0,0 +1542,TRAIN,0,0 +1543,TRAIN,0,0 +1544,TRAIN,0,0 +1545,TRAIN,0,0 +1546,TRAIN,0,0 +1547,TRAIN,0,0 +1548,TRAIN,0,0 +1549,TRAIN,0,0 +1550,TRAIN,0,0 +1551,TRAIN,0,0 +1552,TRAIN,0,0 +1553,TRAIN,0,0 +1554,TRAIN,0,0 +1555,TRAIN,0,0 +1556,TRAIN,0,0 +1557,TRAIN,0,0 +1558,TRAIN,0,0 +1559,TRAIN,0,0 +1560,TRAIN,0,0 +1561,TRAIN,0,0 +1562,TRAIN,0,0 +1563,TRAIN,0,0 +1564,TRAIN,0,0 +1565,TRAIN,0,0 +1566,TRAIN,0,0 +1567,TRAIN,0,0 +1568,TRAIN,0,0 +1569,TRAIN,0,0 +1570,TRAIN,0,0 +1571,TRAIN,0,0 +1572,TRAIN,0,0 +1573,TRAIN,0,0 +1574,TRAIN,0,0 +1575,TRAIN,0,0 +1576,TRAIN,0,0 +1577,TRAIN,0,0 +1578,TRAIN,0,0 +1579,TRAIN,0,0 +1580,TRAIN,0,0 +1581,TRAIN,0,0 +1582,TRAIN,0,0 +1583,TRAIN,0,0 +1584,TRAIN,0,0 +1585,TRAIN,0,0 +1586,TRAIN,0,0 +1587,TRAIN,0,0 +1588,TRAIN,0,0 +1589,TRAIN,0,0 +1590,TRAIN,0,0 +1591,TRAIN,0,0 +1592,TRAIN,0,0 +1593,TRAIN,0,0 +1594,TRAIN,0,0 +1595,TRAIN,0,0 +1596,TRAIN,0,0 +1597,TRAIN,0,0 +1598,TRAIN,0,0 +1599,TRAIN,0,0 +1600,TRAIN,0,0 +1601,TRAIN,0,0 +1602,TRAIN,0,0 +1603,TRAIN,0,0 +1604,TRAIN,0,0 +1605,TRAIN,0,0 +1606,TRAIN,0,0 +1607,TRAIN,0,0 +1608,TRAIN,0,0 +1609,TRAIN,0,0 +1610,TRAIN,0,0 +1611,TRAIN,0,0 +1612,TRAIN,0,0 +1613,TRAIN,0,0 +1614,TRAIN,0,0 +1615,TRAIN,0,0 +1616,TRAIN,0,0 +1617,TRAIN,0,0 +1618,TRAIN,0,0 +1619,TRAIN,0,0 +1620,TRAIN,0,0 +1621,TRAIN,0,0 +1622,TRAIN,0,0 +1623,TRAIN,0,0 +1624,TRAIN,0,0 +1625,TRAIN,0,0 +1626,TRAIN,0,0 +1627,TRAIN,0,0 +1628,TRAIN,0,0 +1629,TRAIN,0,0 +1630,TRAIN,0,0 +1631,TRAIN,0,0 +1632,TRAIN,0,0 +1633,TRAIN,0,0 +1634,TRAIN,0,0 +1635,TRAIN,0,0 +1636,TRAIN,0,0 +1637,TRAIN,0,0 +1638,TRAIN,0,0 +1639,TRAIN,0,0 +1640,TRAIN,0,0 +1641,TRAIN,0,0 +1642,TRAIN,0,0 +1643,TRAIN,0,0 +1644,TRAIN,0,0 +1645,TRAIN,0,0 +1646,TRAIN,0,0 +1647,TRAIN,0,0 +1648,TRAIN,0,0 +1649,TRAIN,0,0 +1650,TRAIN,0,0 +1651,TRAIN,0,0 +1652,TRAIN,0,0 +1653,TRAIN,0,0 +1654,TRAIN,0,0 +1655,TRAIN,0,0 +1656,TRAIN,0,0 +1657,TRAIN,0,0 +1658,TRAIN,0,0 +1659,TRAIN,0,0 +1660,TRAIN,0,0 +1661,TRAIN,0,0 +1662,TRAIN,0,0 
+1663,TRAIN,0,0 +1664,TRAIN,0,0 +1665,TRAIN,0,0 +1666,TRAIN,0,0 +1667,TRAIN,0,0 +1668,TRAIN,0,0 +1669,TRAIN,0,0 +1670,TRAIN,0,0 +1671,TRAIN,0,0 +1672,TRAIN,0,0 +1673,TRAIN,0,0 +1674,TRAIN,0,0 +1675,TRAIN,0,0 +1676,TRAIN,0,0 +1677,TRAIN,0,0 +1678,TRAIN,0,0 +1679,TRAIN,0,0 +1680,TRAIN,0,0 +1681,TRAIN,0,0 +1682,TRAIN,0,0 +1683,TRAIN,0,0 +1684,TRAIN,0,0 +1685,TRAIN,0,0 +1686,TRAIN,0,0 +1687,TRAIN,0,0 +1688,TRAIN,0,0 +1689,TRAIN,0,0 +1690,TRAIN,0,0 +1691,TRAIN,0,0 +1692,TRAIN,0,0 +1693,TRAIN,0,0 +1694,TRAIN,0,0 +1695,TRAIN,0,0 +1696,TRAIN,0,0 +1697,TRAIN,0,0 +1698,TRAIN,0,0 +1699,TRAIN,0,0 +1700,TRAIN,0,0 +1701,TRAIN,0,0 +1702,TRAIN,0,0 +1703,TRAIN,0,0 +1704,TRAIN,0,0 +1705,TRAIN,0,0 +1706,TRAIN,0,0 +1707,TRAIN,0,0 +1708,TRAIN,0,0 +1709,TRAIN,0,0 +1710,TRAIN,0,0 +1711,TRAIN,0,0 +1712,TRAIN,0,0 +1713,TRAIN,0,0 +1714,TRAIN,0,0 +1715,TRAIN,0,0 +1716,TRAIN,0,0 +1717,TRAIN,0,0 +1718,TRAIN,0,0 +1719,TRAIN,0,0 +1720,TRAIN,0,0 +1721,TRAIN,0,0 +1722,TRAIN,0,0 +1723,TRAIN,0,0 +1724,TRAIN,0,0 +1725,TRAIN,0,0 +1726,TRAIN,0,0 +1727,TRAIN,0,0 +1728,TRAIN,0,0 +1729,TRAIN,0,0 +1730,TRAIN,0,0 +1731,TRAIN,0,0 +1732,TRAIN,0,0 +1733,TRAIN,0,0 +1734,TRAIN,0,0 +1735,TRAIN,0,0 +1736,TRAIN,0,0 +1737,TRAIN,0,0 +1738,TRAIN,0,0 +1739,TRAIN,0,0 +1740,TRAIN,0,0 +1741,TRAIN,0,0 +1742,TRAIN,0,0 +1743,TRAIN,0,0 +1744,TRAIN,0,0 +1745,TRAIN,0,0 +1746,TRAIN,0,0 +1747,TRAIN,0,0 +1748,TRAIN,0,0 +1749,TRAIN,0,0 +1750,TRAIN,0,0 +1751,TRAIN,0,0 +1752,TRAIN,0,0 +1753,TRAIN,0,0 +1754,TRAIN,0,0 +1755,TRAIN,0,0 +1756,TRAIN,0,0 +1757,TRAIN,0,0 +1758,TRAIN,0,0 +1759,TRAIN,0,0 +1760,TRAIN,0,0 +1761,TRAIN,0,0 +1762,TRAIN,0,0 +1763,TRAIN,0,0 +1764,TRAIN,0,0 +1765,TRAIN,0,0 +1766,TRAIN,0,0 +1767,TRAIN,0,0 +1768,TRAIN,0,0 +1769,TRAIN,0,0 +1770,TRAIN,0,0 +1771,TRAIN,0,0 +1772,TRAIN,0,0 +1773,TRAIN,0,0 +1774,TRAIN,0,0 +1775,TRAIN,0,0 +1776,TRAIN,0,0 +1777,TRAIN,0,0 +1778,TRAIN,0,0 +1779,TRAIN,0,0 +1780,TRAIN,0,0 +1781,TRAIN,0,0 +1782,TRAIN,0,0 +1783,TRAIN,0,0 +1784,TRAIN,0,0 +1785,TRAIN,0,0 +1786,TRAIN,0,0 +1787,TRAIN,0,0 +1788,TRAIN,0,0 +1789,TRAIN,0,0 +1790,TRAIN,0,0 +1791,TRAIN,0,0 +1792,TRAIN,0,0 +1793,TRAIN,0,0 +1794,TRAIN,0,0 +1795,TRAIN,0,0 +1796,TRAIN,0,0 +1797,TRAIN,0,0 +1798,TRAIN,0,0 +1799,TRAIN,0,0 +1800,TRAIN,0,0 +1801,TRAIN,0,0 +1802,TRAIN,0,0 +1803,TRAIN,0,0 +1804,TRAIN,0,0 +1805,TRAIN,0,0 +1806,TRAIN,0,0 +1807,TRAIN,0,0 +1808,TRAIN,0,0 +1809,TRAIN,0,0 +1810,TRAIN,0,0 +1811,TRAIN,0,0 +1812,TRAIN,0,0 +1813,TRAIN,0,0 +1814,TRAIN,0,0 +1815,TRAIN,0,0 +1816,TRAIN,0,0 +1817,TRAIN,0,0 +1818,TRAIN,0,0 +1819,TRAIN,0,0 +1820,TRAIN,0,0 +1821,TRAIN,0,0 +1822,TRAIN,0,0 +1823,TRAIN,0,0 +1824,TRAIN,0,0 +1825,TRAIN,0,0 +1826,TRAIN,0,0 +1827,TRAIN,0,0 +1828,TRAIN,0,0 +1829,TRAIN,0,0 +1830,TRAIN,0,0 +1831,TRAIN,0,0 +1832,TRAIN,0,0 +1833,TRAIN,0,0 +1834,TRAIN,0,0 +1835,TRAIN,0,0 +1836,TRAIN,0,0 +1837,TRAIN,0,0 +1838,TRAIN,0,0 +1839,TRAIN,0,0 +1840,TRAIN,0,0 +1841,TRAIN,0,0 +1842,TRAIN,0,0 +1843,TRAIN,0,0 +1844,TRAIN,0,0 +1845,TRAIN,0,0 +1846,TRAIN,0,0 +1847,TRAIN,0,0 +1848,TRAIN,0,0 +1849,TRAIN,0,0 +1850,TRAIN,0,0 +1851,TRAIN,0,0 +1852,TRAIN,0,0 +1853,TRAIN,0,0 +1854,TRAIN,0,0 +1855,TRAIN,0,0 +1856,TRAIN,0,0 +1857,TRAIN,0,0 +1858,TRAIN,0,0 +1859,TRAIN,0,0 +1860,TRAIN,0,0 +1861,TRAIN,0,0 +1862,TRAIN,0,0 +1863,TRAIN,0,0 +1864,TRAIN,0,0 +1865,TRAIN,0,0 +1866,TRAIN,0,0 +1867,TRAIN,0,0 +1868,TRAIN,0,0 +1869,TRAIN,0,0 +1870,TRAIN,0,0 +1871,TRAIN,0,0 +1872,TRAIN,0,0 +1873,TRAIN,0,0 +1874,TRAIN,0,0 +1875,TRAIN,0,0 +1876,TRAIN,0,0 +1877,TRAIN,0,0 +1878,TRAIN,0,0 +1879,TRAIN,0,0 +1880,TRAIN,0,0 +1881,TRAIN,0,0 +1882,TRAIN,0,0 +1883,TRAIN,0,0 +1884,TRAIN,0,0 
+1885,TRAIN,0,0 +1886,TRAIN,0,0 +1887,TRAIN,0,0 +1888,TRAIN,0,0 +1889,TRAIN,0,0 +1890,TRAIN,0,0 +1891,TRAIN,0,0 +1892,TRAIN,0,0 +1893,TRAIN,0,0 +1894,TRAIN,0,0 +1895,TRAIN,0,0 +1896,TRAIN,0,0 +1897,TRAIN,0,0 +1898,TRAIN,0,0 +1899,TRAIN,0,0 +1900,TRAIN,0,0 +1901,TRAIN,0,0 +1902,TRAIN,0,0 +1903,TRAIN,0,0 +1904,TRAIN,0,0 +1905,TRAIN,0,0 +1906,TRAIN,0,0 +1907,TRAIN,0,0 +1908,TRAIN,0,0 +1909,TRAIN,0,0 +1910,TRAIN,0,0 +1911,TRAIN,0,0 +1912,TRAIN,0,0 +1913,TRAIN,0,0 +1914,TRAIN,0,0 +1915,TRAIN,0,0 +1916,TRAIN,0,0 +1917,TRAIN,0,0 +1918,TRAIN,0,0 +1919,TRAIN,0,0 +1920,TRAIN,0,0 +1921,TRAIN,0,0 +1922,TRAIN,0,0 +1923,TRAIN,0,0 +1924,TRAIN,0,0 +1925,TRAIN,0,0 +1926,TRAIN,0,0 +1927,TRAIN,0,0 +1928,TRAIN,0,0 +1929,TRAIN,0,0 +1930,TRAIN,0,0 +1931,TRAIN,0,0 +1932,TRAIN,0,0 +1933,TRAIN,0,0 +1934,TRAIN,0,0 +1935,TRAIN,0,0 +1936,TRAIN,0,0 +1937,TRAIN,0,0 +1938,TRAIN,0,0 +1939,TRAIN,0,0 +1940,TRAIN,0,0 +1941,TRAIN,0,0 +1942,TRAIN,0,0 +1943,TRAIN,0,0 +1944,TRAIN,0,0 +1945,TRAIN,0,0 +1946,TRAIN,0,0 +1947,TRAIN,0,0 +1948,TRAIN,0,0 +1949,TRAIN,0,0 +1950,TRAIN,0,0 +1951,TRAIN,0,0 +1952,TRAIN,0,0 +1953,TRAIN,0,0 +1954,TRAIN,0,0 +1955,TRAIN,0,0 +1956,TRAIN,0,0 +1957,TRAIN,0,0 +1958,TRAIN,0,0 +1959,TRAIN,0,0 +1960,TRAIN,0,0 +1961,TRAIN,0,0 +1962,TRAIN,0,0 +1963,TRAIN,0,0 +1964,TRAIN,0,0 +1965,TRAIN,0,0 +1966,TRAIN,0,0 +1967,TRAIN,0,0 +1968,TRAIN,0,0 +1969,TRAIN,0,0 +1970,TRAIN,0,0 +1971,TRAIN,0,0 +1972,TRAIN,0,0 +1973,TRAIN,0,0 +1974,TRAIN,0,0 +1975,TRAIN,0,0 +1976,TRAIN,0,0 +1977,TRAIN,0,0 +1978,TRAIN,0,0 +1979,TRAIN,0,0 +1980,TRAIN,0,0 +1981,TRAIN,0,0 +1982,TRAIN,0,0 +1983,TRAIN,0,0 +1984,TRAIN,0,0 +1985,TRAIN,0,0 +1986,TRAIN,0,0 +1987,TRAIN,0,0 +1988,TRAIN,0,0 +1989,TRAIN,0,0 +1990,TRAIN,0,0 +1991,TRAIN,0,0 +1992,TRAIN,0,0 +1993,TRAIN,0,0 +1994,TRAIN,0,0 +1995,TRAIN,0,0 +1996,TRAIN,0,0 +1997,TRAIN,0,0 +1998,TRAIN,0,0 +1999,TRAIN,0,0 +2000,TRAIN,0,0 +2001,TRAIN,0,0 +2002,TRAIN,0,0 +2003,TRAIN,0,0 +2004,TRAIN,0,0 +2005,TRAIN,0,0 +2006,TRAIN,0,0 +2007,TRAIN,0,0 +2008,TRAIN,0,0 +2009,TRAIN,0,0 +2010,TRAIN,0,0 +2011,TRAIN,0,0 +2012,TRAIN,0,0 +2013,TRAIN,0,0 +2014,TRAIN,0,0 +2015,TRAIN,0,0 +2016,TRAIN,0,0 +2017,TRAIN,0,0 +2018,TRAIN,0,0 +2019,TRAIN,0,0 +2020,TRAIN,0,0 +2021,TRAIN,0,0 +2022,TRAIN,0,0 +2023,TRAIN,0,0 +2024,TRAIN,0,0 +2025,TRAIN,0,0 +2026,TRAIN,0,0 +2027,TRAIN,0,0 +2028,TRAIN,0,0 +2029,TRAIN,0,0 +2030,TRAIN,0,0 +2031,TRAIN,0,0 +2032,TRAIN,0,0 +2033,TRAIN,0,0 +2034,TRAIN,0,0 +2035,TRAIN,0,0 +2036,TRAIN,0,0 +2037,TRAIN,0,0 +2038,TRAIN,0,0 +2039,TRAIN,0,0 +2040,TRAIN,0,0 +2041,TRAIN,0,0 +2042,TRAIN,0,0 +2043,TRAIN,0,0 +2044,TRAIN,0,0 +2045,TRAIN,0,0 +2046,TRAIN,0,0 +2047,TRAIN,0,0 +2048,TRAIN,0,0 +2049,TRAIN,0,0 +2050,TRAIN,0,0 +2051,TRAIN,0,0 +2052,TRAIN,0,0 +2053,TRAIN,0,0 +2054,TRAIN,0,0 +2055,TRAIN,0,0 +2056,TRAIN,0,0 +2057,TRAIN,0,0 +2058,TRAIN,0,0 +2059,TRAIN,0,0 +2060,TRAIN,0,0 +2061,TRAIN,0,0 +2062,TRAIN,0,0 +2063,TRAIN,0,0 +2064,TRAIN,0,0 +2065,TRAIN,0,0 +2066,TRAIN,0,0 +2067,TRAIN,0,0 +2068,TRAIN,0,0 +2069,TRAIN,0,0 +2070,TRAIN,0,0 +2071,TRAIN,0,0 +2072,TRAIN,0,0 +2073,TRAIN,0,0 +2074,TRAIN,0,0 +2075,TRAIN,0,0 +2076,TRAIN,0,0 +2077,TRAIN,0,0 +2078,TRAIN,0,0 +2079,TRAIN,0,0 +2080,TRAIN,0,0 +2081,TRAIN,0,0 +2082,TRAIN,0,0 +2083,TRAIN,0,0 +2084,TRAIN,0,0 +2085,TRAIN,0,0 +2086,TRAIN,0,0 +2087,TRAIN,0,0 +2088,TRAIN,0,0 +2089,TRAIN,0,0 +2090,TRAIN,0,0 +2091,TRAIN,0,0 +2092,TRAIN,0,0 +2093,TRAIN,0,0 +2094,TRAIN,0,0 +2095,TRAIN,0,0 +2096,TRAIN,0,0 +2097,TRAIN,0,0 +2098,TRAIN,0,0 +2099,TRAIN,0,0 +2100,TRAIN,0,0 +2101,TRAIN,0,0 +2102,TRAIN,0,0 +2103,TRAIN,0,0 +2104,TRAIN,0,0 +2105,TRAIN,0,0 +2106,TRAIN,0,0 
+2107,TRAIN,0,0 +2108,TRAIN,0,0 +2109,TRAIN,0,0 +2110,TRAIN,0,0 +2111,TRAIN,0,0 +2112,TRAIN,0,0 +2113,TRAIN,0,0 +2114,TRAIN,0,0 +2115,TRAIN,0,0 +2116,TRAIN,0,0 +2117,TRAIN,0,0 +2118,TRAIN,0,0 +2119,TRAIN,0,0 +2120,TRAIN,0,0 +2121,TRAIN,0,0 +2122,TRAIN,0,0 +2123,TRAIN,0,0 +2124,TRAIN,0,0 +2125,TRAIN,0,0 +2126,TRAIN,0,0 +2127,TRAIN,0,0 +2128,TRAIN,0,0 +2129,TRAIN,0,0 +2130,TRAIN,0,0 +2131,TRAIN,0,0 +2132,TRAIN,0,0 +2133,TRAIN,0,0 +2134,TRAIN,0,0 +2135,TRAIN,0,0 +2136,TRAIN,0,0 +2137,TRAIN,0,0 +2138,TRAIN,0,0 +2139,TRAIN,0,0 +2140,TRAIN,0,0 +2141,TRAIN,0,0 +2142,TRAIN,0,0 +2143,TRAIN,0,0 +2144,TRAIN,0,0 +2145,TRAIN,0,0 +2146,TRAIN,0,0 +2147,TRAIN,0,0 +2148,TRAIN,0,0 +2149,TRAIN,0,0 +2150,TRAIN,0,0 +2151,TRAIN,0,0 +2152,TRAIN,0,0 +2153,TRAIN,0,0 +2154,TRAIN,0,0 +2155,TRAIN,0,0 +2156,TRAIN,0,0 +2157,TRAIN,0,0 +2158,TRAIN,0,0 +2159,TRAIN,0,0 +2160,TRAIN,0,0 +2161,TRAIN,0,0 +2162,TRAIN,0,0 +2163,TRAIN,0,0 +2164,TRAIN,0,0 +2165,TRAIN,0,0 +2166,TRAIN,0,0 +2167,TRAIN,0,0 +2168,TRAIN,0,0 +2169,TRAIN,0,0 +2170,TRAIN,0,0 +2171,TRAIN,0,0 +2172,TRAIN,0,0 +2173,TRAIN,0,0 +2174,TRAIN,0,0 +2175,TRAIN,0,0 +2176,TRAIN,0,0 +2177,TRAIN,0,0 +2178,TRAIN,0,0 +2179,TRAIN,0,0 +2180,TRAIN,0,0 +2181,TRAIN,0,0 +2182,TRAIN,0,0 +2183,TRAIN,0,0 +2184,TRAIN,0,0 +2185,TRAIN,0,0 +2186,TRAIN,0,0 +2187,TRAIN,0,0 +2188,TRAIN,0,0 +2189,TRAIN,0,0 +2190,TRAIN,0,0 +2191,TRAIN,0,0 +2192,TRAIN,0,0 +2193,TRAIN,0,0 +2194,TRAIN,0,0 +2195,TRAIN,0,0 +2196,TRAIN,0,0 +2197,TRAIN,0,0 +2198,TRAIN,0,0 +2199,TRAIN,0,0 +2200,TRAIN,0,0 +2201,TRAIN,0,0 +2202,TRAIN,0,0 +2203,TRAIN,0,0 +2204,TRAIN,0,0 +2205,TRAIN,0,0 +2206,TRAIN,0,0 +2207,TRAIN,0,0 +2208,TRAIN,0,0 +2209,TRAIN,0,0 +2210,TRAIN,0,0 +2211,TRAIN,0,0 +2212,TRAIN,0,0 +2213,TRAIN,0,0 +2214,TRAIN,0,0 +2215,TRAIN,0,0 +2216,TRAIN,0,0 +2217,TRAIN,0,0 +2218,TRAIN,0,0 +2219,TRAIN,0,0 +2220,TRAIN,0,0 +2221,TRAIN,0,0 +2222,TRAIN,0,0 +2223,TRAIN,0,0 +2224,TRAIN,0,0 +2225,TRAIN,0,0 +2226,TRAIN,0,0 +2227,TRAIN,0,0 +2228,TRAIN,0,0 +2229,TRAIN,0,0 +2230,TRAIN,0,0 +2231,TRAIN,0,0 +2232,TRAIN,0,0 +2233,TRAIN,0,0 +2234,TRAIN,0,0 +2235,TRAIN,0,0 +2236,TRAIN,0,0 +2237,TRAIN,0,0 +2238,TRAIN,0,0 +2239,TRAIN,0,0 +2240,TRAIN,0,0 +2241,TRAIN,0,0 +2242,TRAIN,0,0 +2243,TRAIN,0,0 +2244,TRAIN,0,0 +2245,TRAIN,0,0 +2246,TRAIN,0,0 +2247,TRAIN,0,0 +2248,TRAIN,0,0 +2249,TRAIN,0,0 +2250,TRAIN,0,0 +2251,TRAIN,0,0 +2252,TRAIN,0,0 +2253,TRAIN,0,0 +2254,TRAIN,0,0 +2255,TRAIN,0,0 +2256,TRAIN,0,0 +2257,TRAIN,0,0 +2258,TRAIN,0,0 +2259,TRAIN,0,0 +2260,TRAIN,0,0 +2261,TRAIN,0,0 +2262,TRAIN,0,0 +2263,TRAIN,0,0 +2264,TRAIN,0,0 +2265,TRAIN,0,0 +2266,TRAIN,0,0 +2267,TRAIN,0,0 +2268,TRAIN,0,0 +2269,TRAIN,0,0 +2270,TRAIN,0,0 +2271,TRAIN,0,0 +2272,TRAIN,0,0 +2273,TRAIN,0,0 +2274,TRAIN,0,0 +2275,TRAIN,0,0 +2276,TRAIN,0,0 +2277,TRAIN,0,0 +2278,TRAIN,0,0 +2279,TRAIN,0,0 +2280,TRAIN,0,0 +2281,TRAIN,0,0 +2282,TRAIN,0,0 +2283,TRAIN,0,0 +2284,TRAIN,0,0 +2285,TRAIN,0,0 +2286,TRAIN,0,0 +2287,TRAIN,0,0 +2288,TRAIN,0,0 +2289,TRAIN,0,0 +2290,TRAIN,0,0 +2291,TRAIN,0,0 +2292,TRAIN,0,0 +2293,TRAIN,0,0 +2294,TRAIN,0,0 +2295,TRAIN,0,0 +2296,TRAIN,0,0 +2297,TRAIN,0,0 +2298,TRAIN,0,0 +2299,TRAIN,0,0 +2300,TRAIN,0,0 +2301,TRAIN,0,0 +2302,TRAIN,0,0 +2303,TRAIN,0,0 +2304,TRAIN,0,0 +2305,TRAIN,0,0 +2306,TRAIN,0,0 +2307,TRAIN,0,0 +2308,TRAIN,0,0 +2309,TRAIN,0,0 +2310,TRAIN,0,0 +2311,TRAIN,0,0 +2312,TRAIN,0,0 +2313,TRAIN,0,0 +2314,TRAIN,0,0 +2315,TRAIN,0,0 +2316,TRAIN,0,0 +2317,TRAIN,0,0 +2318,TRAIN,0,0 +2319,TRAIN,0,0 +2320,TRAIN,0,0 +2321,TRAIN,0,0 +2322,TRAIN,0,0 +2323,TRAIN,0,0 +2324,TRAIN,0,0 +2325,TRAIN,0,0 +2326,TRAIN,0,0 +2327,TRAIN,0,0 +2328,TRAIN,0,0 
+2329,TRAIN,0,0 +2330,TRAIN,0,0 +2331,TRAIN,0,0 +2332,TRAIN,0,0 +2333,TRAIN,0,0 +2334,TRAIN,0,0 +2335,TRAIN,0,0 +2336,TRAIN,0,0 +2337,TRAIN,0,0 +2338,TRAIN,0,0 +2339,TRAIN,0,0 +2340,TRAIN,0,0 +2341,TRAIN,0,0 +2342,TRAIN,0,0 +2343,TRAIN,0,0 +2344,TRAIN,0,0 +2345,TRAIN,0,0 +2346,TRAIN,0,0 +2347,TRAIN,0,0 +2348,TRAIN,0,0 +2349,TRAIN,0,0 +2350,TRAIN,0,0 +2351,TRAIN,0,0 +2352,TRAIN,0,0 +2353,TRAIN,0,0 +2354,TRAIN,0,0 +2355,TRAIN,0,0 +2356,TRAIN,0,0 +2357,TRAIN,0,0 +2358,TRAIN,0,0 +2359,TRAIN,0,0 +2360,TRAIN,0,0 +2361,TRAIN,0,0 +2362,TRAIN,0,0 +2363,TRAIN,0,0 +2364,TRAIN,0,0 +2365,TRAIN,0,0 +2366,TRAIN,0,0 +2367,TRAIN,0,0 +2368,TRAIN,0,0 +2369,TRAIN,0,0 +2370,TRAIN,0,0 +2371,TRAIN,0,0 +2372,TRAIN,0,0 +2373,TRAIN,0,0 +2374,TRAIN,0,0 +2375,TRAIN,0,0 +2376,TRAIN,0,0 +2377,TRAIN,0,0 +2378,TRAIN,0,0 +2379,TRAIN,0,0 +2380,TRAIN,0,0 +2381,TRAIN,0,0 +2382,TRAIN,0,0 +2383,TRAIN,0,0 +2384,TRAIN,0,0 +2385,TRAIN,0,0 +2386,TRAIN,0,0 +2387,TRAIN,0,0 +2388,TRAIN,0,0 +2389,TRAIN,0,0 +2390,TRAIN,0,0 +2391,TRAIN,0,0 +2392,TRAIN,0,0 +2393,TRAIN,0,0 +2394,TRAIN,0,0 +2395,TRAIN,0,0 +2396,TRAIN,0,0 +2397,TRAIN,0,0 +2398,TRAIN,0,0 +2399,TRAIN,0,0 +2400,TRAIN,0,0 +2401,TRAIN,0,0 +2402,TRAIN,0,0 +2403,TRAIN,0,0 +2404,TRAIN,0,0 +2405,TRAIN,0,0 +2406,TRAIN,0,0 +2407,TRAIN,0,0 +2408,TRAIN,0,0 +2409,TRAIN,0,0 +2410,TRAIN,0,0 +2411,TRAIN,0,0 +2412,TRAIN,0,0 +2413,TRAIN,0,0 +2414,TRAIN,0,0 +2415,TRAIN,0,0 +2416,TRAIN,0,0 +2417,TRAIN,0,0 +2418,TRAIN,0,0 +2419,TRAIN,0,0 +2420,TRAIN,0,0 +2421,TRAIN,0,0 +2422,TRAIN,0,0 +2423,TRAIN,0,0 +2424,TRAIN,0,0 +2425,TRAIN,0,0 +2426,TRAIN,0,0 +2427,TRAIN,0,0 +2428,TRAIN,0,0 +2429,TRAIN,0,0 +2430,TRAIN,0,0 +2431,TRAIN,0,0 +2432,TRAIN,0,0 +2433,TRAIN,0,0 +2434,TRAIN,0,0 +2435,TRAIN,0,0 +2436,TRAIN,0,0 +2437,TRAIN,0,0 +2438,TRAIN,0,0 +2439,TRAIN,0,0 +2440,TRAIN,0,0 +2441,TRAIN,0,0 +2442,TRAIN,0,0 +2443,TRAIN,0,0 +2444,TRAIN,0,0 +2445,TRAIN,0,0 +2446,TRAIN,0,0 +2447,TRAIN,0,0 +2448,TRAIN,0,0 +2449,TRAIN,0,0 +2450,TRAIN,0,0 +2451,TRAIN,0,0 +2452,TRAIN,0,0 +2453,TRAIN,0,0 +2454,TRAIN,0,0 +2455,TRAIN,0,0 +2456,TRAIN,0,0 +2457,TRAIN,0,0 +2458,TRAIN,0,0 +2459,TRAIN,0,0 +2460,TRAIN,0,0 +2461,TRAIN,0,0 +2462,TRAIN,0,0 +2463,TRAIN,0,0 +2464,TRAIN,0,0 +2465,TRAIN,0,0 +2466,TRAIN,0,0 +2467,TRAIN,0,0 +2468,TRAIN,0,0 +2469,TRAIN,0,0 +2470,TRAIN,0,0 +2471,TRAIN,0,0 +2472,TRAIN,0,0 +2473,TRAIN,0,0 +2474,TRAIN,0,0 +2475,TRAIN,0,0 +2476,TRAIN,0,0 +2477,TRAIN,0,0 +2478,TRAIN,0,0 +2479,TRAIN,0,0 +2480,TRAIN,0,0 +2481,TRAIN,0,0 +2482,TRAIN,0,0 +2483,TRAIN,0,0 +2484,TRAIN,0,0 +2485,TRAIN,0,0 +2486,TRAIN,0,0 +2487,TRAIN,0,0 +2488,TRAIN,0,0 +2489,TRAIN,0,0 +2490,TRAIN,0,0 +2491,TRAIN,0,0 +2492,TRAIN,0,0 +2493,TRAIN,0,0 +2494,TRAIN,0,0 +2495,TRAIN,0,0 +2496,TRAIN,0,0 +2497,TRAIN,0,0 +2498,TRAIN,0,0 +2499,TRAIN,0,0 +2500,TRAIN,0,0 +2501,TRAIN,0,0 +2502,TRAIN,0,0 +2503,TRAIN,0,0 +2504,TRAIN,0,0 +2505,TRAIN,0,0 +2506,TRAIN,0,0 +2507,TRAIN,0,0 +2508,TRAIN,0,0 +2509,TRAIN,0,0 +2510,TRAIN,0,0 +2511,TRAIN,0,0 +2512,TRAIN,0,0 +2513,TRAIN,0,0 +2514,TRAIN,0,0 +2515,TRAIN,0,0 +2516,TRAIN,0,0 +2517,TRAIN,0,0 +2518,TRAIN,0,0 +2519,TRAIN,0,0 +2520,TRAIN,0,0 +2521,TRAIN,0,0 +2522,TRAIN,0,0 +2523,TRAIN,0,0 +2524,TRAIN,0,0 +2525,TRAIN,0,0 +2526,TRAIN,0,0 +2527,TRAIN,0,0 +2528,TRAIN,0,0 +2529,TRAIN,0,0 +2530,TRAIN,0,0 +2531,TRAIN,0,0 +2532,TRAIN,0,0 +2533,TRAIN,0,0 +2534,TRAIN,0,0 +2535,TRAIN,0,0 +2536,TRAIN,0,0 +2537,TRAIN,0,0 +2538,TRAIN,0,0 +2539,TRAIN,0,0 +2540,TRAIN,0,0 +2541,TRAIN,0,0 +2542,TRAIN,0,0 +2543,TRAIN,0,0 +2544,TRAIN,0,0 +2545,TRAIN,0,0 +2546,TRAIN,0,0 +2547,TRAIN,0,0 +2548,TRAIN,0,0 +2549,TRAIN,0,0 +2550,TRAIN,0,0 
+2551,TRAIN,0,0
+2552,TRAIN,0,0
+2553,TRAIN,0,0
[... rows 2554 through 6990 of dataSplits.csv omitted here; every row in this span repeats the same pattern "<d3mIndex>,TRAIN,0,0" ...]
+6991,TRAIN,0,0 +6992,TRAIN,0,0 +6993,TRAIN,0,0 +6994,TRAIN,0,0 +6995,TRAIN,0,0 +6996,TRAIN,0,0 +6997,TRAIN,0,0 +6998,TRAIN,0,0 +6999,TRAIN,0,0 +7000,TRAIN,0,0 +7001,TRAIN,0,0 +7002,TRAIN,0,0 +7003,TRAIN,0,0 +7004,TRAIN,0,0 +7005,TRAIN,0,0 +7006,TRAIN,0,0 +7007,TRAIN,0,0 +7008,TRAIN,0,0 +7009,TRAIN,0,0 +7010,TRAIN,0,0 +7011,TRAIN,0,0 +7012,TRAIN,0,0 +7013,TRAIN,0,0 +7014,TRAIN,0,0 +7015,TRAIN,0,0 +7016,TRAIN,0,0 +7017,TRAIN,0,0 +7018,TRAIN,0,0 +7019,TRAIN,0,0 +7020,TRAIN,0,0 +7021,TRAIN,0,0 +7022,TRAIN,0,0 +7023,TRAIN,0,0 +7024,TRAIN,0,0 +7025,TRAIN,0,0 +7026,TRAIN,0,0 diff --git a/datasets/anomaly/kpi/TRAIN/problem_TRAIN/problemDoc.json b/datasets/anomaly/kpi/TRAIN/problem_TRAIN/problemDoc.json new file mode 100644 index 0000000..1fd55ad --- /dev/null +++ b/datasets/anomaly/kpi/TRAIN/problem_TRAIN/problemDoc.json @@ -0,0 +1,65 @@ +{ + "about": { + "problemID": "kpi_problem", + "problemName": "kpi_problem", + "problemDescription": "Anomaly detection", + "problemVersion": "4.0.0", + "problemSchemaVersion": "4.0.0", + "taskKeywords": [ + "classification", + "binary", + "tabular" + ] + }, + "inputs": { + "data": [ + { + "datasetID": "kpi_dataset", + "targets": [ + { + "targetIndex": 0, + "resID": "learningData", + "colIndex": 3, + "colName": "ground_truth" + } + ] + } + ], + "dataSplits": { + "method": "holdOut", + "testSize": 0.2, + "stratified": true, + "numRepeats": 0, + "randomSeed": 42, + "splitsFile": "dataSplits.csv", + "datasetViewMaps": { + "train": [ + { + "from": "kpi_dataset", + "to": "kpi_dataset_TRAIN" + } + ], + "test": [ + { + "from": "kpi_dataset", + "to": "kpi_dataset_TEST" + } + ], + "score": [ + { + "from": "kpi_dataset", + "to": "kpi_dataset_SCORE" + } + ] + } + }, + "performanceMetrics": [ + { + "metric": "f1Macro" + } + ] + }, + "expectedOutputs": { + "predictionsFile": "predictions.csv" + } +} \ No newline at end of file diff --git a/datasets/anomaly/kpi/kpi_dataset/datasetDoc.json b/datasets/anomaly/kpi/kpi_dataset/datasetDoc.json new file mode 100644 index 0000000..c062016 --- /dev/null +++ b/datasets/anomaly/kpi/kpi_dataset/datasetDoc.json @@ -0,0 +1,63 @@ +{ + "about": { + "datasetID": "kpi_dataset", + "datasetName": "kpi", + "description": "Database of baseball players and play statistics, including 'Games_played', 'At_bats', 'Runs', 'Hits', 'Doubles', 'Triples', 'Home_runs', 'RBIs', 'Walks', 'Strikeouts', 'Batting_average', 'On_base_pct', 'Slugging_pct' and 'Fielding_ave'", + "citation": " @book{simonoff2003analyzing,title={Analyzing Categorical Data},author={Simonoff, J.S.},isbn={9780387007496},lccn={2003044946},series={Springer Texts in Statistics},url={https://books.google.com/books?id=G8wrifweAoC},year={2003},publisher={Springer New York}} ", + "license": " CC Public Domain Mark 1.0 ", + "source": "OpenML", + "sourceURI": "http://www.openml.org/d/185", + "approximateSize": "", + "datasetSchemaVersion": "4.0.0", + "redacted": false, + "datasetVersion": "4.0.0" + }, + "dataResources": [ + { + "resID": "learningData", + "resPath": "tables/learningData.csv", + "resType": "table", + "resFormat": { + "text/csv": [ + "csv" + ] + }, + "isCollection": false, + "columns": [ + { + "colIndex": 0, + "colName": "d3mIndex", + "colType": "integer", + "role": [ + "index" + ] + }, + { + "colIndex": 1, + "colName": "timestamp", + "colType": "integer", + "role": [ + "attribute" + ] + }, + { + "colIndex": 2, + "colName": "value", + "colType": "real", + "role": [ + "attribute" + ] + }, + { + "colIndex": 3, + "colName": "ground_truth", + "colType": "integer", + "role": [ + 
"suggestedTarget" + ] + } + ], + "columnsCount": 4 + } + ] +} \ No newline at end of file diff --git a/datasets/anomaly/kpi/kpi_dataset/tables/learningData.csv.REMOVED.git-id b/datasets/anomaly/kpi/kpi_dataset/tables/learningData.csv.REMOVED.git-id new file mode 100644 index 0000000..3a466ae --- /dev/null +++ b/datasets/anomaly/kpi/kpi_dataset/tables/learningData.csv.REMOVED.git-id @@ -0,0 +1 @@ +d80846dc46c173472f646a52005a1fb3670ccd09 \ No newline at end of file diff --git a/datasets/anomaly/kpi/kpi_problem/dataSplits.csv.REMOVED.git-id b/datasets/anomaly/kpi/kpi_problem/dataSplits.csv.REMOVED.git-id new file mode 100644 index 0000000..7376921 --- /dev/null +++ b/datasets/anomaly/kpi/kpi_problem/dataSplits.csv.REMOVED.git-id @@ -0,0 +1 @@ +44db328c252a8156434142a37ef65765869e7548 \ No newline at end of file diff --git a/datasets/anomaly/kpi/kpi_problem/problemDoc.json b/datasets/anomaly/kpi/kpi_problem/problemDoc.json new file mode 100644 index 0000000..1fd55ad --- /dev/null +++ b/datasets/anomaly/kpi/kpi_problem/problemDoc.json @@ -0,0 +1,65 @@ +{ + "about": { + "problemID": "kpi_problem", + "problemName": "kpi_problem", + "problemDescription": "Anomaly detection", + "problemVersion": "4.0.0", + "problemSchemaVersion": "4.0.0", + "taskKeywords": [ + "classification", + "binary", + "tabular" + ] + }, + "inputs": { + "data": [ + { + "datasetID": "kpi_dataset", + "targets": [ + { + "targetIndex": 0, + "resID": "learningData", + "colIndex": 3, + "colName": "ground_truth" + } + ] + } + ], + "dataSplits": { + "method": "holdOut", + "testSize": 0.2, + "stratified": true, + "numRepeats": 0, + "randomSeed": 42, + "splitsFile": "dataSplits.csv", + "datasetViewMaps": { + "train": [ + { + "from": "kpi_dataset", + "to": "kpi_dataset_TRAIN" + } + ], + "test": [ + { + "from": "kpi_dataset", + "to": "kpi_dataset_TEST" + } + ], + "score": [ + { + "from": "kpi_dataset", + "to": "kpi_dataset_SCORE" + } + ] + } + }, + "performanceMetrics": [ + { + "metric": "f1Macro" + } + ] + }, + "expectedOutputs": { + "predictionsFile": "predictions.csv" + } +} \ No newline at end of file diff --git a/datasets/anomaly/raw_data/kpi.csv.REMOVED.git-id b/datasets/anomaly/raw_data/kpi.csv.REMOVED.git-id new file mode 100644 index 0000000..9049ed1 --- /dev/null +++ b/datasets/anomaly/raw_data/kpi.csv.REMOVED.git-id @@ -0,0 +1 @@ +bea5d1c052730eaba76b84ff5df854477cdfa80b \ No newline at end of file diff --git a/datasets/anomaly/raw_data/yahoo_sub_5.csv b/datasets/anomaly/raw_data/yahoo_sub_5.csv new file mode 100644 index 0000000..d3b267e --- /dev/null +++ b/datasets/anomaly/raw_data/yahoo_sub_5.csv @@ -0,0 +1,1401 @@ +,timestamp,value_0,value_1,value_2,value_3,value_4,is_anomaly +0,1,12183,0.0,3.7166666666667,5,2109,0 +1,2,12715,0.091757964510557,3.6108333333332996,60,3229,0 +2,3,12736,0.17229675238449998,3.4813888888888997,88,3637,0 +3,4,12716,0.22621935431999,3.3802777777778,84,1982,0 +4,5,12739,0.17635798469946,3.1933333333332996,111,2751,0 +5,6,12737,0.09049124547605099,2.7866666666667004,112,2128,0 +6,7,12857,0.08460994072769001,2.4627777777777995,1235,2109,0 +7,8,12884,0.06842699169496,2.2541666666667,710,2328,0 +8,9,12894,0.13330269689422,2.1180555555556,618,2453,0 +9,10,12675,0.085026586189321,2.0691666666667,84,2847,0 +10,11,13260,0.097073068447328,2.1972222222222,100,3659,0 +11,12,13470,0.0,2.3188888888889,125,5207,0 +12,13,13060,0.031063767542922,2.34,114,5146,0 +13,14,12949,0.017732750501525,2.4902777777778,145,4712,0 +14,15,13035,0.063354504072079,2.6438888888889,91,6363,0 
+15,16,12980,0.087870391896335,2.8486111111111003,94,5010,0 +16,17,13677,0.11546815687728999,2.8833333333332996,79,3956,0 +17,18,13381,0.073413457727404,2.8808333333332996,50,4063,0 +18,19,12737,0.040392584616896,2.9005555555556,39,3748,0 +19,20,12554,0.08911335594722301,3.0855555555556,28,3047,0 +20,21,12470,0.09803005371153099,3.3536111111111,29,4099,0 +21,22,12490,0.047140641497552004,3.7438888888888995,24,2122,0 +22,23,12539,0.10481279080241,3.7947222222222,19,3387,0 +23,24,12530,0.20478886838927998,3.8011111111111004,21,1950,0 +24,25,13002,0.04485100631921201,3.6508333333332996,27,2927,0 +25,26,12989,0.1053622140254,3.555,46,1889,0 +26,27,13038,0.08436887679639,3.4769444444444,133,1910,0 +27,28,13011,0.097980673762982,3.2158333333332996,143,3747,0 +28,29,12984,0.10165726215275,3.1141666666667,86,4994,0 +29,30,13079,0.056764513454874,2.7983333333332996,118,2009,0 +30,31,13048,0.074428708878932,2.4252777777778,56,2899,0 +31,32,13096,0.091244453451818,2.14,92,2298,0 +32,33,13003,0.094529332881679,1.9822222222222001,85,1894,0 +33,34,13057,0.016638011234698,1.9694444444443997,122,1999,0 +34,35,13023,0.038096861957006005,2.0741666666667,74,3007,0 +35,36,13033,0.064497814457643,2.2505555555556,84,2838,0 +36,37,13034,0.030426401876333997,2.2819444444443997,54,4113,0 +37,38,13068,0.095423209955973,2.4216666666667,77,2150,0 +38,39,13057,0.069688744272108,2.5997222222222005,84,3007,0 +39,40,13047,0.03468622413034,2.7544444444444003,139,2484,0 +40,41,13795,0.089564461084836,2.7258333333333,65,2101,0 +41,42,13528,0.07337616196456799,2.8302777777778,38,2001,0 +42,43,13032,0.061939295606039,2.9422222222222,35,2102,0 +43,44,13084,0.11419089175512,3.0919444444444,47,2129,0 +44,45,13000,0.10475925920163,3.3519444444444,37,4422,0 +45,46,13008,0.079657960399444,3.6952777777778,53,4573,0 +46,47,12978,0.14475546275415999,3.8269444444444,55,1989,0 +47,48,13067,0.1421711341096,3.7877777777778,45,1953,0 +48,49,13086,0.07696963969656899,3.7536111111111,46,1872,0 +49,50,13023,0.06393273436444799,3.61,35,1850,0 +50,51,13046,0.14973281021845003,3.5091666666667,68,2879,0 +51,52,13032,0.041478839355346,3.4205555555556,82,1840,0 +52,53,13012,0.089317973365284,3.2647222222222,154,2134,0 +53,54,13051,0.088820248166203,2.7944444444444,128,2234,0 +54,55,12979,0.054872994406929,2.46,79,3769,0 +55,56,13025,0.07913553329046401,2.2075,66,2717,0 +56,57,13007,0.16317996709063,2.1758333333333,92,2171,0 +57,58,13036,0.08671926699280201,2.3058333333333,67,2224,0 +58,59,13043,0.0733999511789,2.3983333333332997,58,1967,0 +59,60,13023,0.0,2.55,58,2148,0 +60,61,13022,0.032756244361869,2.7302777777778,63,1978,0 +61,62,13033,0.054893891024455,2.8169444444444003,61,2021,0 +62,63,13024,0.068514114108229,2.9247222222222002,55,2060,0 +63,64,13048,0.05279414163165401,2.8911111111111003,71,2096,0 +64,65,13740,0.023853017353212,2.9575,64,2082,0 +65,66,13540,0.07426125441559799,2.9080555555556,92,2175,0 +66,67,12724,0.024228588329878998,3.0088888888888996,44,2332,0 +67,68,13070,0.09233413002519698,3.2033333333333,35,2147,0 +68,69,13106,0.15930655332113,3.6213888888889,53,2163,0 +69,70,13025,0.12755838225296,4.0322222222222,49,2406,0 +70,71,13074,0.10152541717054,4.1227777777778,49,2022,0 +71,72,13079,0.04014845396824399,3.9736111111111003,103,2188,0 +72,73,13184,0.087208372094752,3.8425,107,2758,0 +73,74,13194,0.074209918996797,3.7097222222222,74,2925,0 +74,75,13191,0.05904453736940401,3.6258333333332997,56,3223,0 +75,76,13059,0.06248169832921499,3.4705555555556,60,2507,0 +76,77,13169,0.08876527685714598,3.2877777777778,73,2435,0 
+77,78,13114,0.051354431854972,2.9286111111111004,99,2552,0 +78,79,13037,0.074790104163639,2.4888888888889,84,2540,0 +79,80,13179,0.091817341555971,2.2744444444444,129,2642,0 +80,81,13152,0.14762794333026003,2.1733333333333,101,2254,0 +81,82,13095,0.07101004447510299,2.3416666666667,101,2539,0 +82,83,13144,0.07689756334240598,2.3808333333332996,51,2596,0 +83,84,13170,0.08412575787388402,2.4663888888889,95,2573,0 +84,85,13162,0.06328921386603299,2.6608333333333,48,2302,0 +85,86,13117,0.057393902128707,2.7558333333332996,40,2991,0 +86,87,13129,0.041819399065704,2.8636111111111004,55,3141,0 +87,88,13386,0.073729686380986,2.7586111111111005,56,3285,0 +88,89,13929,0.15365285617975,2.7377777777778,935,3807,0 +89,90,13385,0.06035585974240701,2.6961111111111005,34,2892,0 +90,91,13106,0.10644586288975,2.8569444444444,57,2538,0 +91,92,13113,0.05931428636012699,3.1833333333333,70,2234,0 +92,93,13155,0.096293806236591,3.5544444444444,72,2707,0 +93,94,13186,0.085101425467407,3.8894444444444,66,2382,0 +94,95,13151,0.11149072274185,4.1138888888889,72,2426,0 +95,96,13156,0.076266981262989,3.9519444444444,49,2451,0 +96,97,12813,0.097952120177625,3.8275,41,2288,0 +97,98,12821,0.17250021935572,3.6438888888889,42,2256,0 +98,99,12867,0.11389182319254,3.5608333333332998,39,2884,0 +99,100,12837,0.08999961787521,3.5013888888888998,81,2398,0 +100,101,12911,0.048649372449385005,3.3088888888889,90,2239,0 +101,102,12842,0.13861764684085998,2.9063888888888996,92,2248,0 +102,103,12905,0.1088795585287,2.5027777777777995,81,2387,0 +103,104,12993,0.054235162564995,2.2466666666667003,145,3876,0 +104,105,12974,0.0390040506742,2.1869444444444,47,3073,0 +105,106,13039,0.0744713077811,2.2402777777778,63,3113,0 +106,107,13322,0.040258943675435,2.3727777777777996,118,3363,0 +107,108,13606,0.0,2.4566666666667003,56,3796,0 +108,109,13536,0.027955712584728,2.5452777777777995,127,4924,0 +109,110,13341,0.047309968420241,2.6830555555556,48,4300,0 +110,111,13360,0.016602764360002,2.805,114,5225,0 +111,112,13450,0.04243257762835399,2.7386111111111004,78,4047,0 +112,113,14102,0.051191743726562995,2.7438888888888995,58,4134,0 +113,114,14026,0.0,2.7586111111111005,56,4786,0 +114,115,13162,0.056724832354639,2.9013888888888997,67,4184,0 +115,116,13118,0.055771058827737,3.19,155,2888,0 +116,117,12953,0.081014772096658,3.5561111111111003,123,2674,0 +117,118,12854,0.08253629738290899,3.8433333333333,118,2574,0 +118,119,12952,0.11499203730886,4.0319444444444,133,3123,0 +119,120,12915,0.07668513845109799,3.8844444444444,75,3369,0 +120,121,11994,0.070057457403873,3.6908333333332997,29,3284,0 +121,122,11868,0.07031477357556501,3.6141666666667,68,2127,0 +122,123,11977,0.091946448716499,3.5019444444444,91,2117,0 +123,124,11874,0.14560588482235998,3.4205555555556,101,2271,0 +124,125,11913,0.09477432932347199,3.1780555555556,22,2513,0 +125,126,11933,0.10217989327054,2.8361111111111,20,2746,0 +126,127,11844,0.04854243074027901,2.5222222222222004,27,2076,0 +127,128,11968,0.068760549683423,2.2416666666667004,45,2297,0 +128,129,11996,0.075440683881139,2.1588888888889,42,2312,0 +129,130,12006,0.11771339431815,2.2763888888889,59,2834,0 +130,131,12225,0.069437397660265,2.3391666666667,52,3584,0 +131,132,12482,0.0,2.4841666666667,62,4009,0 +132,133,12289,0.0,2.4911111111111,81,4142,0 +133,134,12219,0.0,2.6922222222222,84,3876,0 +134,135,12282,0.027395404320488,2.8205555555556,104,4098,0 +135,136,12367,0.055202605299814,2.8216666666667,111,3831,0 +136,137,13042,0.078387348178452,2.7122222222222,91,3842,0 
+137,138,12665,0.11851571646444001,2.6744444444443998,33,4129,0 +138,139,12133,0.068395341911942,2.8097222222222,26,3509,0 +139,140,12023,0.04720597158087901,3.1838888888889,37,2450,0 +140,141,11847,0.07910648512645599,3.5130555555556,23,2270,0 +141,142,11980,0.067550601916344,3.7722222222222,29,2360,0 +142,143,12026,0.080666570182724,3.9058333333333,45,2431,0 +143,144,11852,0.044973875852863,3.7697222222222,49,2042,0 +144,145,12152,0.065734580284861,3.6027777777778,27,1833,0 +145,146,12148,0.068759646748575,3.5038888888888997,46,1852,0 +146,147,12236,0.027278224398313,3.445,39,1927,0 +147,148,12155,0.067695565422881,3.3494444444444,72,1999,0 +148,149,12113,0.07244669924777,3.1961111111111005,81,2030,0 +149,150,12175,0.028882930937167997,2.8905555555555997,64,1963,0 +150,151,12103,0.021568136982842,2.5805555555556,79,2116,0 +151,152,12206,0.064254625408408,2.3380555555556004,132,2461,0 +152,153,12239,0.073869151016554,2.2116666666667,127,2388,0 +153,154,12398,0.026644044055307004,2.2013888888889,121,2846,0 +154,155,12582,0.051289858799957,2.3236111111111,98,2974,0 +155,156,12705,0.099217337562612,2.3002777777778,128,3776,0 +156,157,12555,0.016615805334675,2.385,158,3885,0 +157,158,12476,0.078387348178452,2.5597222222222005,78,3865,0 +158,159,12706,0.0,2.6941666666667,65,4319,0 +159,160,12671,0.049384244324413,2.7169444444444,81,4646,0 +160,161,13277,0.043044731483849,2.6369444444443997,586,3873,0 +161,162,12757,0.042155048516160004,2.6572222222222,48,3489,0 +162,163,12401,0.042236538352835,2.8466666666667004,38,2790,0 +163,164,12248,0.1001564296112,3.1955555555556,30,2641,0 +164,165,12156,0.17378132267942997,3.5633333333332997,28,2960,0 +165,166,12210,0.12005519462967999,3.8113888888889,36,2192,0 +166,167,11983,0.14491137762023998,3.9655555555556,50,2145,0 +167,168,12374,0.07336941078506799,3.8483333333333,47,2133,0 +168,169,12230,0.12395626148951999,3.6441666666667003,82,2330,0 +169,170,12200,0.15077430423660998,3.5213888888889,56,2235,0 +170,171,12135,0.18960071033689,3.4702777777777998,140,2258,0 +171,172,12131,0.06051348935254,3.3033333333333,145,2200,0 +172,173,12165,0.072057993662839,3.1933333333332996,114,2161,0 +173,174,12193,0.082361078437032,2.8183333333332996,129,2159,0 +174,175,12165,0.12343775199875999,2.52,143,2088,0 +175,176,12304,0.10718177844830001,2.2886111111111,113,2473,0 +176,177,12275,0.10359394556778999,2.0822222222222,108,3217,0 +177,178,12369,0.021162435488903,2.1416666666667,93,2994,0 +178,179,12569,0.074524398314698,2.2688888888889,63,3827,0 +179,180,12766,0.12687067454443,2.335,103,4176,0 +180,181,12621,0.04175261832616001,2.4388888888888998,114,4227,0 +181,182,12611,0.0,2.5386111111111,67,4290,0 +182,183,12618,0.040819652463459,2.6288888888889,106,4691,0 +183,184,12631,0.082668981599835,2.7511111111111,160,4442,0 +184,185,13121,0.06181362481077901,2.7744444444444,81,5775,0 +185,186,12871,0.0,2.8297222222222,113,3840,0 +186,187,12252,0.076137992226715,2.9708333333333,37,3721,0 +187,188,12155,0.12107639529965,3.1333333333332996,70,2498,0 +188,189,12186,0.0,3.3544444444444,82,2265,0 +189,190,12179,0.19840339729984,3.6780555555556,76,2451,0 +190,191,12109,0.20112394005693,3.8038888888888995,59,2892,0 +191,192,12142,0.096833471661634,3.8177777777778,58,2166,0 +192,193,12145,0.10338450919956,3.6916666666667,49,2040,0 +193,194,12162,0.10142513773096001,3.5197222222222,36,2013,0 +194,195,12165,0.09779274451732001,3.5186111111111003,111,2000,0 +195,196,12125,0.14744152252573,3.2597222222222,81,2117,0 +196,197,12097,0.083396348606149,3.0930555555556,92,2775,0 
+197,198,12099,0.09563749800691301,2.7825,113,2116,0 +198,199,12140,0.14768844039376003,2.4494444444443997,90,1991,0 +199,200,12188,0.1131872329372,2.2369444444443998,183,3162,0 +200,201,12157,0.073729686380986,2.0961111111111,117,2958,0 +201,202,12128,0.064614077523704,2.0377777777778,110,3153,0 +202,203,12190,0.05601995959727501,2.0730555555556003,179,2190,0 +203,204,12151,0.074812141908008,2.1655555555556,134,2172,0 +204,205,12214,0.024893884278452006,2.285,135,2074,0 +205,206,12275,0.023695834967821,2.4283333333333,100,2078,0 +206,207,12164,0.058680009072634,2.6186111111111,47,2406,0 +207,208,12120,0.10008779345816002,2.7372222222222002,88,2018,0 +208,209,12693,0.066566772961868,2.8266666666667004,74,2091,0 +209,210,12624,0.070501147961051,2.8469444444444,58,2310,0 +210,211,12163,0.098779019649936,2.9855555555556,100,2113,0 +211,212,12100,0.11803653713500999,3.1038888888889,49,2518,0 +212,213,12162,0.10076746585103,3.4058333333333,36,2605,0 +213,214,12106,0.053210709415362996,3.6138888888888996,40,2680,0 +214,215,12156,0.099346579713514,3.93,50,2228,0 +215,216,12120,0.047275248011591,3.8155555555556,58,2023,0 +216,217,12420,0.09126220979158199,3.6588888888888995,50,3702,0 +217,218,12417,0.038593218846487996,3.5913888888888996,53,1992,0 +218,219,12450,0.070273907645883,3.4644444444444003,93,1988,0 +219,220,12395,0.029431888410362997,3.3944444444444,78,1919,0 +220,221,12382,0.09685476998430699,3.2227777777778,84,2213,0 +221,222,12438,0.11656453357642,2.7961111111111,112,2181,0 +222,223,12363,0.12109055114779,2.4383333333332997,73,2152,0 +223,224,12393,0.20381554615785996,2.2647222222222005,91,2393,0 +224,225,12399,0.04631176800502201,2.1886111111111,114,2173,0 +225,226,12456,0.18261306403662,2.2825,127,2109,0 +226,227,12442,0.021992750543024,2.3333333333332997,69,3606,0 +227,228,12481,0.088072259040681,2.445,59,2114,0 +228,229,12432,0.037896500450725,2.5811111111111,64,2135,0 +229,230,12403,0.09882843339863001,2.7094444444444,75,2303,0 +230,231,12406,0.076277687882641,2.88,44,2137,0 +231,232,12462,0.022875979046570998,2.8555555555556,52,2264,0 +232,233,13034,0.10022162220861001,2.7791666666667,42,2245,0 +233,234,12830,0.08117200437078799,2.7772222222222,45,2151,0 +234,235,12439,0.09750667785645802,3.02,26,2330,0 +235,236,12541,0.05680722879784299,3.2213888888888995,29,3357,0 +236,237,12462,0.12240855732315001,3.6211111111111003,32,3152,0 +237,238,12394,0.1715485140175,4.0219444444444,44,2693,0 +238,239,12507,0.075015592829224,4.0980555555556,41,3798,0 +239,240,12512,0.11388410095531,3.9080555555556,42,4596,0 +240,241,12093,0.10519027968795,3.7269444444444,46,2529,0 +241,242,12197,0.1150532998405,3.6244444444444,40,2124,0 +242,243,12138,0.10890530980571,3.5252777777778,64,2762,0 +243,244,12174,0.09935062148508599,3.4675,70,2973,0 +244,245,12163,0.12889794040441002,3.3316666666667003,69,3041,0 +245,246,12096,0.12069378235889001,2.9497222222222,73,2179,0 +246,247,12166,0.13053034917739,2.5708333333332996,85,2322,0 +247,248,12187,0.078977758004111,2.3086111111111,63,2274,0 +248,249,12246,0.08088416337864099,2.2311111111111,67,2448,0 +249,250,12335,0.04008956024204,2.3119444444444,68,3811,0 +250,251,12556,0.05063725351997099,2.3536111111111,62,3761,0 +251,252,12652,0.039066291775136,2.4819444444444,69,4269,0 +252,253,12646,0.028611752774163998,2.6605555555556,82,4244,0 +253,254,12803,0.040593364983329,2.7527777777778,56,4417,0 +254,255,12570,0.038807415292018,3.0741666666667005,38,3758,0 +255,256,12633,0.07832796288132202,2.8522222222222,30,4375,0 
+256,257,13146,0.066320996162546,2.7277777777778,48,4158,0 +257,258,12994,0.083175583471284,2.7502777777778,63,3410,0 +258,259,12314,0.06802464587725401,2.8797222222222,34,2853,0 +259,260,12193,0.051675070535005994,3.2027777777778,11,2628,0 +260,261,12127,0.04412911220799701,3.5633333333332997,22,2287,0 +261,262,12140,0.037685894365982006,3.8808333333332996,22,3334,0 +262,263,12174,0.09341456146583801,4.0352777777778,12,2795,0 +263,264,12180,0.06987083046098,3.8966666666667003,10,2089,0 +264,265,12861,0.021992750543024,3.7225,14,2260,0 +265,266,12957,0.11305566197523,3.73,39,3176,0 +266,267,12981,0.030884138240845002,3.5558333333333,55,4049,0 +267,268,12958,0.10381377439313,3.3169444444444003,90,2902,0 +268,269,12913,0.048953768695625004,3.2322222222222,68,3743,0 +269,270,12939,0.042258794089861,2.8658333333333,95,4280,0 +270,271,12933,0.04838868558547099,2.5169444444443996,70,3977,0 +271,272,13006,0.034197830567692,2.3,96,4518,0 +272,273,13091,0.08835953066771099,2.1888888888888998,45,2707,0 +273,274,13201,0.086890518272785,2.2030555555556,96,3522,0 +274,275,13520,0.031087561676959,2.2711111111111,74,4584,0 +275,276,13675,0.071287463233942,2.4697222222222,82,4141,0 +276,277,13594,0.14372616993938,2.5988888888889,82,4831,0 +277,278,13466,0.12647517487142998,2.7258333333333,45,3991,0 +278,279,13448,0.042854531198562,2.7858333333333,134,4645,0 +279,280,13492,0.039930389849143995,2.7922222222222,119,4967,0 +280,281,14123,0.076184645265048,2.6988888888888996,86,4578,0 +281,282,13839,0.037830020408535,2.7663888888889,75,4972,0 +282,283,13335,0.030884138240845002,2.8938888888889,45,5522,0 +283,284,13196,0.048316550276279,3.1875,50,2832,0 +284,285,13047,0.10986585566763,3.6463888888889,31,2826,0 +285,286,13008,0.025485002897852004,3.8666666666667004,88,2855,0 +286,287,12763,0.12451757643335,3.9808333333332997,42,2660,0 +287,288,12949,0.12875690949235,3.8277777777778,70,2447,0 +288,289,13009,0.15720639094135,3.6269444444444,106,2545,0 +289,290,13008,0.079092017261926,3.5266666666667,44,3842,0 +290,291,12890,0.14711499890479998,3.5077777777778,57,2332,0 +291,292,13004,0.0531410973178,3.3455555555556,95,2294,0 +292,293,12918,0.10136246281349001,3.1241666666667003,91,3016,0 +293,294,12910,0.053119315802353,2.8713888888889,66,3944,0 +294,295,12915,0.11313351589999002,2.5133333333333,66,2332,0 +295,296,13121,0.076760188212735,2.2197222222222,82,2405,0 +296,297,13076,0.08890522133351199,2.205,73,2572,0 +297,298,13096,0.10095551301750001,2.2677777777777997,69,2558,0 +298,299,13339,0.15685427502807,2.2991666666667,107,3701,0 +299,300,13635,0.11090638960365,2.4277777777778,101,4228,0 +300,301,13493,0.054798089981891,2.5333333333333,66,3990,0 +301,302,13402,0.08461316628091001,2.6422222222222005,47,4707,0 +302,303,13417,0.15790425505315,2.8211111111111005,47,3857,0 +303,304,13382,0.021675109392133997,2.7625,66,3874,0 +304,305,14199,0.14112049645292002,2.7391666666667,102,4369,0 +305,306,13973,0.059612111520904,2.7525,71,4488,0 +306,307,13284,0.067835890522602,2.8644444444444,53,3637,0 +307,308,13070,0.047414460026828,3.1927777777778,28,2705,0 +308,309,12983,0.050348669783997005,3.5872222222222,24,2429,0 +309,310,13075,0.07296715773193299,3.8305555555556,23,2839,0 +310,311,12991,0.10713527159168999,3.8827777777778,30,2371,0 +311,312,12993,0.073622496612493,3.7291666666667003,25,2758,0 +312,313,13121,0.11556476355437,3.6172222222222,29,2291,0 +313,314,13097,0.034160489683707995,3.4491666666667005,27,2220,0 +314,315,13150,0.019571935182124002,3.4097222222222,77,2620,0 
+315,316,13078,0.15720996206912,3.2605555555556,46,2467,0 +316,317,13140,0.11515041454164,3.2191666666667,86,2088,0 +317,318,13102,0.086415715789296,2.9586111111111,97,2137,0 +318,319,13110,0.092606306920552,2.6036111111111,88,2907,0 +319,320,13138,0.04645857903869201,2.3319444444444,110,2558,0 +320,321,13238,0.10977831600416,2.2025,89,2823,0 +321,322,13317,0.11090009191451,2.2711111111111,134,2465,0 +322,323,13512,0.076652795374797,2.2897222222222005,84,4399,0 +323,324,13669,0.1087202400467,2.3297222222222005,109,4088,0 +324,325,13651,0.11471628863897,2.395,57,5099,0 +325,326,13580,0.11070024667119,2.5063888888889,49,5157,0 +326,327,13538,0.026827723134057995,2.7077777777778,83,3782,0 +327,328,13657,0.029426630692549,2.735,101,4008,0 +328,329,14183,0.028611752774163998,2.6958333333332996,88,4534,0 +329,330,14117,0.05310618109238201,2.6930555555556,56,3242,0 +330,331,13166,0.055538160906184006,2.875,31,2808,0 +331,332,13265,0.11009690391165,3.1788888888888995,22,3676,0 +332,333,13085,0.10979978093137,3.5808333333333,32,3523,0 +333,334,13167,0.036174223284821,3.8508333333333,27,3038,0 +334,335,13170,0.048361321378982004,3.9180555555556,17,2299,0 +335,336,13132,0.10958125953197999,3.815,27,2345,0 +336,337,13055,0.047305343559722,3.6080555555556,38,2565,0 +337,338,13025,0.04531686866460401,3.4927777777778,73,2576,0 +338,339,13076,0.13255054531036,3.4316666666667004,56,2327,0 +339,340,13044,0.079695587369141,3.3436111111111004,49,2211,0 +340,341,13035,0.10277355185943,3.0663888888888997,90,2642,0 +341,342,13103,0.15061124796385,2.7894444444444,106,3646,0 +342,343,13067,0.14509169704095,2.4994444444444,51,2281,0 +343,344,13183,0.054445250001619004,2.2544444444444,99,2474,0 +344,345,13144,0.082058799915824,2.0847222222222,104,2536,0 +345,346,13166,0.04215131178281901,2.0888888888889,119,2900,0 +346,347,13406,0.05740470330970599,2.1594444444443996,73,3144,0 +347,348,13544,0.040891918425583,2.2533333333332997,92,3725,0 +348,349,13608,0.045224636676714995,2.3880555555556002,57,4305,0 +349,350,13522,0.0,2.6338888888889,100,3665,0 +350,351,13595,0.0,2.6588888888889,93,3791,0 +351,352,13420,0.10335456693442999,2.7586111111111005,111,3897,0 +352,353,14163,0.033846222120808,2.8797222222222,91,3494,0 +353,354,13678,0.026167129419327997,2.785,43,3353,0 +354,355,13272,0.08571767780871499,2.8219444444444,91,2741,0 +355,356,13071,0.12459953631184001,3.0055555555556,63,2463,0 +356,357,13004,0.054750658073534006,3.2936111111111,60,3477,0 +357,358,13068,0.20799106772677,3.5575,56,2792,0 +358,359,13031,0.10314231079956,3.6761111111111004,59,2183,0 +359,360,13013,0.12212653292147,3.7166666666667,48,2874,0 +360,361,12998,0.19159058299176,3.6013888888889,65,2147,0 +361,362,12971,0.10782180851978,3.4455555555556,77,2754,0 +362,363,13000,0.06408869538637901,3.4166666666667003,60,2007,0 +363,364,12998,0.095540168894753,3.1791666666667004,94,2564,0 +364,365,12906,0.039360296791109,3.0013888888888998,84,3020,0 +365,366,12969,0.086611479249287,2.72,99,2004,0 +366,367,12963,0.058455074416030005,2.4527777777777997,61,2047,0 +367,368,12933,0.051490800079599004,2.1816666666667,60,3531,0 +368,369,12990,0.075496432869001,2.0161111111111,78,2383,0 +369,370,12980,0.10358625218721,1.9769444444443998,81,2112,0 +370,371,12982,0.062806431427897,2.0597222222222,61,2554,0 +371,372,12989,0.08970338978685001,2.2111111111111,68,2371,0 +372,373,13073,0.09451731613096799,2.3141666666667002,53,2060,0 +373,374,12950,0.032322011663911,2.4280555555556003,49,2086,0 +374,375,12990,0.047911560407608,2.5855555555556,40,2130,0 
+375,376,13035,0.062001214431213,2.6977777777778,125,2072,0 +376,377,13681,0.027102718749392,2.7777777777778,61,2033,0 +377,378,13304,0.034703114844079,2.7988888888888996,111,2683,0 +378,379,12965,0.066236017573192,2.8927777777778,32,2046,0 +379,380,12966,0.032230355211769,3.0413888888889,21,2064,0 +380,381,12943,0.11559664215716,3.3569444444444,14,2067,0 +381,382,12958,0.021952502374124,3.4808333333332997,32,2496,0 +382,383,13005,0.13347711194703,3.7641666666667004,29,4758,0 +383,384,12923,0.10579408349833999,3.8097222222222,26,2806,0 +384,385,12812,0.10679035350244001,3.6911111111111,52,2227,0 +385,386,12803,0.068633627680319,3.4902777777778,39,3123,0 +386,387,12850,0.04699518011436099,3.3769444444444,78,3460,0 +387,388,12797,0.14159640074335997,3.3011111111111004,78,3587,0 +388,389,12732,0.078500039299167,3.1369444444444,83,2558,0 +389,390,12817,0.049232295047845,2.8475,63,2306,0 +390,391,12818,0.078777592482879,2.4544444444443996,108,2083,0 +391,392,12815,0.08993433499951,2.1247222222222,158,3073,0 +392,393,12805,0.081869163858473,2.0266666666667,115,3325,0 +393,394,12703,0.14556064903749,2.1763888888889,112,2321,0 +394,395,12771,0.0,2.3088888888889,73,2846,0 +395,396,12847,0.0,2.4213888888889,93,2482,0 +396,397,12872,0.030693547421212,2.6436111111111,65,2306,0 +397,398,12815,0.0,2.6602777777777997,91,2298,0 +398,399,12844,0.046999447831426996,2.7677777777778,106,2907,0 +399,400,12811,0.028815579681692002,2.8066666666667004,66,2329,0 +400,401,13472,0.0,2.7661111111111003,26,2456,0 +401,402,13063,0.039360296791109,2.8133333333332997,23,2178,0 +402,403,12833,0.039570832199428,2.9186111111111,24,2142,0 +403,404,12842,0.090659246308087,3.1930555555556,19,2277,0 +404,405,12804,0.10540579050057002,3.565,23,3066,0 +405,406,12852,0.062601610466313,3.9133333333333,30,3619,0 +406,407,12862,0.051455855638306,3.9658333333332996,23,3726,0 +407,408,12799,0.05463175864878501,3.8930555555556,35,2282,0 +408,409,12789,0.09017822949731,3.7297222222222,41,3079,0 +409,410,12815,0.04528752509160901,3.6516666666667,63,2448,0 +410,411,12887,0.033344698319951,3.5927777777778,33,2574,0 +411,412,12903,0.080098394586215,3.4694444444444,50,3697,0 +412,413,12892,0.025162301034707,3.2536111111111,88,3067,0 +413,414,12907,0.078260793447992,2.8986111111111,115,3491,0 +414,415,12883,0.07223863924679201,2.4488888888889,69,3195,0 +415,416,12965,0.042917873674349,2.2119444444444,116,2763,0 +416,417,12932,0.04720597158087901,2.2011111111111,73,2605,0 +417,418,13134,0.048273008229067,2.2338888888889,75,2755,0 +418,419,13440,0.036987975876273,2.3116666666667003,56,3300,0 +419,420,13544,0.06291463671717,2.3869444444443997,66,3838,0 +420,421,13508,0.033319304393751,2.5119444444443997,70,3608,0 +421,422,13401,0.029115275623859,2.5713888888889,52,3845,0 +422,423,13410,0.06821638123436,2.5088888888889,32,3563,0 +423,424,13482,0.015408589348188001,2.4155555555556,16,5478,0 +424,425,14124,0.01916018435633,3.6455555555556,46,3656,0 +425,426,13703,0.06374239746477901,2.4625,53,3491,0 +426,427,13250,0.09973889072880301,2.5808333333333,67,3430,0 +427,428,13092,0.10950621554455,3.0033333333332997,58,2807,0 +428,429,13012,0.061389206215894006,3.3486111111111003,17,2524,0 +429,430,12901,0.05130763806024401,3.6644444444444,26,2964,0 +430,431,12848,0.082471571552878,4.0083333333333,13,3969,0 +431,432,13025,0.060122448878635,3.8530555555556,8,3561,0 +432,433,11352,0.07469842969719999,3.6183333333333,20,3394,0 +433,434,8761,0.056170625137636994,3.4922222222222,23,3005,0 +434,435,10433,0.052668952946360995,3.4958333333333,34,2350,0 
+435,436,10088,0.068871884486763,3.2738888888888997,35,2139,0 +436,437,9485,0.04023605711093899,3.2102777777778,48,2098,0 +437,438,8865,0.053200012471362995,2.8475,67,2341,0 +438,439,8920,0.056725172482787994,2.4883333333332995,38,2698,0 +439,440,8798,0.035229341473877,2.1955555555556003,33,2968,0 +440,441,8927,0.0,2.1461111111111,40,2824,0 +441,442,9211,0.020190723068726,2.1522222222222,37,3003,0 +442,443,9286,0.093342961377898,2.3122222222222004,51,3551,0 +443,444,9725,0.0,2.4033333333332996,52,4689,0 +444,445,11050,0.015717168144981003,2.4944444444443996,57,3481,0 +445,446,11521,0.017190609993733997,2.6622222222222005,82,3376,0 +446,447,11603,0.0,2.675,74,3198,0 +447,448,11665,0.043273461915965,2.6997222222222,80,3059,0 +448,449,12153,0.029854520963497996,2.6997222222222,78,2937,0 +449,450,11672,0.017383620014121998,2.7194444444444,58,2881,0 +450,451,11119,0.046391383573699006,2.8258333333333,41,2777,0 +451,452,11124,0.042155878228,3.1044444444444,34,2510,0 +452,453,10734,0.05268422233957901,3.4736111111111003,35,2356,0 +453,454,11612,0.063573954212613,3.6972222222222,40,2383,0 +454,455,11523,0.077413583128967,3.8038888888888995,35,2455,0 +455,456,11632,0.069605078732108,3.7494444444444,37,2285,0 +456,457,12838,0.075937967855042,3.6813888888888995,43,2455,0 +457,458,11637,0.04735400243835201,3.4791666666667003,45,4298,0 +458,459,12542,0.044000040388062,3.4530555555555997,48,2400,0 +459,460,12394,0.09513097192459499,3.2841666666667004,77,3431,0 +460,461,12419,0.069274987547704,3.205,79,2252,0 +461,462,12484,0.061118974117397,2.8436111111111004,59,2628,0 +462,463,12413,0.056393740750134,2.4441666666667,107,3266,0 +463,464,12440,0.06125086589409901,2.275,100,2620,0 +464,465,12614,0.047746883512707,2.1788888888889,84,2824,0 +465,466,12693,0.047136440673386,2.2083333333332997,99,2801,0 +466,467,12989,0.0,2.2997222222222002,103,3106,0 +467,468,13200,0.0,2.3155555555556004,47,3532,0 +468,469,13108,0.049828520132601,2.41,67,4210,0 +469,470,12886,0.0,2.5902777777778,65,3646,0 +470,471,13000,0.0,2.6636111111111,65,3768,0 +471,472,13071,0.043576825212603996,2.7105555555556,70,5342,0 +472,473,13563,0.035173891965944996,2.6811111111111,76,5327,0 +473,474,13333,0.04413510379665099,2.715,40,3363,0 +474,475,12672,0.016955671451488998,2.7083333333332997,54,3016,0 +475,476,12547,0.1330396486107,3.0038888888888997,45,3257,0 +476,477,12289,0.016462114132943,3.3911111111111003,32,2619,0 +477,478,12584,0.055696363369897,3.6375,26,2573,0 +478,479,12526,0.036411774365824996,3.7755555555556,25,2575,0 +479,480,12416,0.047966724418057,3.5786111111111003,34,5355,0 +480,481,12450,0.05609961782665,3.4222222222222,43,5809,0 +481,482,12460,0.09699047978112099,3.2538888888888997,68,3823,0 +482,483,12425,0.11147038220963999,3.1683333333332997,60,3116,0 +483,484,12430,0.044797927381498,3.0677777777778,74,2321,0 +484,485,12418,0.024403519177111,2.94,68,2193,0 +485,486,12437,0.08532776818426499,2.7291666666667003,43,2982,0 +486,487,12484,0.043615168647623,2.4147222222222005,73,4140,0 +487,488,12380,0.056692005942856,2.1419444444443996,72,2353,0 +488,489,12620,0.033708553131457,2.0244444444444,66,3350,0 +489,490,12674,0.04014845396824399,2.0458333333333,90,3184,0 +490,491,12855,0.099551526697496,2.09,104,3469,0 +491,492,13053,0.0,2.1575,114,4204,0 +492,493,12898,0.036157867549893995,2.2655555555556,98,6447,0 +493,494,12809,0.052738784696875,2.2561111111111,70,4898,0 +494,495,12964,0.021636091422946997,2.4669444444443998,101,3633,0 +495,496,12956,0.03712022063964399,2.5277777777778,77,4189,0 
+496,497,13625,0.034467327401996005,2.5266666666667,69,4012,0 +497,498,13285,0.0,2.5438888888889,19,4009,0 +498,499,12715,0.09680701971025901,2.6511111111111,47,4346,0 +499,500,12637,0.059601475230884,2.9711111111111004,38,2781,0 +500,501,12535,0.068431521141608,3.2288888888889,22,2811,0 +501,502,12512,0.09611085542803999,3.505,20,2415,0 +502,503,12549,0.064177980162036,3.4944444444444,26,3589,0 +503,504,12567,0.11565746993409,3.4633333333332996,24,2878,0 +504,505,12362,0.073501732487291,3.3177777777778,27,3471,0 +505,506,12326,0.072746100819649,3.1963888888888996,25,2697,0 +506,507,12450,0.07557888002360401,3.1069444444444,57,2583,0 +507,508,12404,0.036816888038697,3.0172222222222,58,3173,0 +508,509,12362,0.09396923545355901,2.9247222222222002,81,3341,0 +509,510,12431,0.034848294186597004,2.5336111111111,81,2305,0 +510,511,12351,0.084191269180943,2.2480555555556,69,2186,0 +511,512,12528,0.13109036514766,2.0383333333333002,50,4439,0 +512,513,12559,0.061132356147447,1.8852777777778,55,3173,0 +513,514,12586,0.019478099970089,1.9225,57,2831,0 +514,515,12864,0.0,1.9719444444443999,78,16385,0 +515,516,13026,0.0,2.0608333333333,57,83955,0 +516,517,12880,0.017965204407153,2.16,78,4574,0 +517,518,12743,0.019202263481758998,2.3077777777777997,95,4987,0 +518,519,12812,0.0,2.415,88,5110,0 +519,520,12878,0.052306327013631,2.4669444444443998,108,4893,0 +520,521,13427,0.08536575533023,2.5125,87,3807,0 +521,522,13081,0.05246136025669901,2.6294444444444,87,3447,0 +522,523,12752,0.035302992848671,2.8183333333332996,44,4329,0 +523,524,12594,0.028682734942579,3.0547222222222,39,5166,0 +524,525,12507,0.024204462299365,3.33,27,3454,0 +525,526,12494,0.034360100307537,3.5738888888888996,23,3578,0 +526,527,12487,0.018977302969238,3.6888888888888998,11,2406,0 +527,528,12404,0.034308847257872,3.7111111111111,13,2073,0 +528,529,11147,0.07460088255490599,3.7180555555556,24,1925,0 +529,530,11147,0.055037935083209005,3.6041666666667003,77,2357,0 +530,531,11128,0.039311673522385,3.4483333333333,54,1947,0 +531,532,11106,0.046619928266775,3.2413888888888995,45,1912,0 +532,533,11115,0.048227542028920996,3.1355555555556,36,2107,0 +533,534,11044,0.020367863848113996,2.8172222222222,59,2985,0 +534,535,11110,0.063069968046591,2.4275,81,2081,0 +535,536,11190,0.05447086605697401,2.2513888888888998,50,2631,0 +536,537,11063,0.0,2.0691666666667,53,2130,0 +537,538,11078,0.059261864411046,2.0155555555556,44,2085,0 +538,539,11146,0.064174002348993,2.0952777777777998,87,2211,0 +539,540,11010,0.0,2.2397222222222,94,2105,0 +540,541,11139,0.021912411214588,2.3275,128,2585,0 +541,542,11117,0.05795826200210599,2.5255555555556004,82,3695,0 +542,543,11081,0.035358633773416,2.665,49,3198,0 +543,544,11128,0.029191244440102997,2.7975,79,3191,0 +544,545,11720,0.054981313823219,2.8597222222222,62,2016,0 +545,546,11384,0.06405347705857799,2.7983333333332996,64,2124,0 +546,547,11018,0.0,2.9322222222222,34,2105,0 +547,548,11104,0.055445634363329,3.08,41,2031,0 +548,549,11084,0.040996998867197,3.3466666666667004,47,1964,0 +549,550,11106,0.027670189755404,3.6869444444444,31,2016,0 +550,551,11055,0.054579839310753,3.7966666666667,26,3909,0 +551,552,11098,0.04483364007329901,3.7805555555556,17,2105,0 +552,553,11028,0.03282297151413,3.7422222222222,30,2405,0 +553,554,11152,0.017696014614986,3.6391666666667004,17,2141,0 +554,555,11025,0.09418709999244,3.4775,28,1910,0 +555,556,11015,0.061817529149429,3.3283333333333,20,1951,0 +556,557,11125,0.054000161367617996,3.1702777777778,85,2310,0 +557,558,11035,0.061656002495989994,2.7688888888889,52,2047,0 
+558,559,11103,0.055915839259234004,2.4266666666667,143,2048,0 +559,560,11100,0.062788330996733,2.1963888888889,106,3083,0 +560,561,11170,0.044888048273534,2.135,244,3619,0 +561,562,11078,0.09525948495633699,2.3186111111111,2005,2172,0 +562,563,11150,0.021952502374124,2.3383333333332996,124,3142,0 +563,564,11149,0.0,2.5002777777777996,109,2256,0 +564,565,10984,0.0,2.6527777777778,148,2200,0 +565,566,11034,0.0,2.7661111111111003,126,2183,0 +566,567,11050,0.061557079663167,2.7347222222222,46,2030,0 +567,568,11102,0.14186075040414,2.6069444444444,49,2297,0 +568,569,11743,0.0,2.5547222222222,40,2213,0 +569,570,11371,0.077457673524504,2.4716666666667004,39,4014,0 +570,571,11078,0.16422977329792998,2.6530555555556004,25,2809,0 +571,572,11224,0.049366067455729,2.9488888888888996,37,2355,0 +572,573,11146,0.10064381631633,3.3383333333332996,32,2372,0 +573,574,11199,0.11909159312805999,3.5419444444444,47,2387,0 +574,575,11181,0.09003816676619801,5.3302777777778,34,2359,0 +575,576,11022,0.055882659245704,3.7727777777778,40,2485,0 +576,577,11073,0.1836893913223,3.6333333333332996,46,3728,0 +577,578,11120,0.08574268253550299,3.5430555555556,35,2820,0 +578,579,11008,0.12559700716583,3.6711111111111,61,2426,0 +579,580,11078,0.086129850619071,3.4572222222222,56,2307,0 +580,581,11121,0.04175261832616001,3.2,72,2233,0 +581,582,11041,0.094396473652892,2.7772222222222,110,2178,0 +582,583,11168,0.045323960075285004,2.415,135,2243,0 +583,584,11213,0.13808411333909,2.2530555555556004,133,2713,0 +584,585,11238,0.08029349854683501,2.0994444444443996,148,3168,0 +585,586,11273,0.06507307495461,2.1780555555556003,86,3163,0 +586,587,11479,0.084518021856329,2.2638888888889,132,3289,0 +587,588,11839,0.030507395540508,2.3575,73,4001,0 +588,589,11735,0.058925029212997006,2.4680555555556003,95,4684,0 +589,590,11574,0.0,2.6208333333333,74,4137,0 +590,591,11531,0.033075906123641,2.6863888888889,51,4787,0 +591,592,11420,0.16633704704670998,2.6172222222222,65,4278,0 +592,593,12301,0.10228536028167,2.6194444444443996,95,3898,0 +593,594,11845,0.16949365549682996,2.6358333333333,72,3728,0 +594,595,11374,0.08260397756200501,2.8661111111111004,41,4047,0 +595,596,11370,0.024378363844867995,3.0533333333333,38,3373,0 +596,597,11197,0.15686874147816002,3.4438888888888997,32,2669,0 +597,598,11171,0.063929461148943,3.6552777777778,22,3289,0 +598,599,11197,0.12602019009982998,3.8519444444444,29,2556,0 +599,600,11114,0.035137191893634005,3.8069444444444,32,2557,0 +600,601,12564,0.14965728062748998,3.5961111111111004,40,3003,0 +601,602,12459,0.10046170077382,3.5344444444444,59,2441,0 +602,603,12508,0.13163105487926,3.3972222222222,52,2396,0 +603,604,12464,0.043899611017859004,3.3936111111111003,42,3426,0 +604,605,12438,0.19567092855859,3.1025,46,2379,0 +605,606,12449,0.19135011734274998,2.8630555555556,97,3026,0 +606,607,12373,0.11171915024595,2.4255555555556003,72,2336,0 +607,608,12594,0.032053604746412,1.8619444444443998,81,2850,0 +608,609,12623,0.09644836158065499,1.8930555555556001,81,3016,0 +609,610,12759,0.07934996156433399,2.2080555555556,70,3537,0 +610,611,12841,0.024581173073577996,2.3052777777777997,89,3899,0 +611,612,13063,0.025596039426134,2.3777777777777995,87,5044,0 +612,613,13023,0.027922074309281005,2.5161111111111,125,4806,0 +613,614,12884,0.025935450238779998,2.6411111111111,69,4139,0 +614,615,13007,0.033086949155743,2.8011111111111004,57,4776,0 +615,616,13016,0.047260069860172005,2.7236111111111003,99,4065,0 +616,617,13588,0.03848713016603201,2.6813888888889,111,4969,0 
+617,618,13272,0.16080169828563,2.7336111111111,71,3784,0 +618,619,12589,0.12635270044885,2.8863888888888996,71,3297,0 +619,620,12651,0.046904491868436,3.1225,48,3347,0 +620,621,12616,0.059534673085297,3.4613888888888997,76,3170,0 +621,622,12492,0.12198352023567999,3.8297222222222,56,2241,0 +622,623,12497,0.052131597947041995,3.8936111111111003,35,2301,0 +623,624,12623,0.094084438832673,3.7588888888888996,35,2303,0 +624,625,12481,0.13486764750848,3.5827777777778,29,2587,0 +625,626,12434,0.062226183256115,3.4730555555555997,38,3211,0 +626,627,12495,0.09120203546303399,3.4175,69,2604,0 +627,628,12375,0.09613785932463101,3.3533333333332997,77,2841,0 +628,629,12357,0.10449109200784999,3.1963888888888996,20,2168,0 +629,630,12433,0.097127966420289,2.8852777777778,24,2265,0 +630,631,12432,0.064404980330111,2.4880555555556003,83,2908,0 +631,632,12429,0.10188181868693,2.2325,62,3180,0 +632,633,12551,0.19953464365013,2.1044444444444,54,3118,0 +633,634,12799,0.0747839457206,2.1097222222222,54,3296,0 +634,635,12818,0.0,2.235,60,4432,0 +635,636,13071,0.0,2.3516666666667003,63,4336,0 +636,637,12897,0.0,2.5138888888889,95,4534,0 +637,638,12961,0.041436571087464,2.6105555555556004,69,4261,0 +638,639,12925,0.038671790863765,2.7233333333333,68,5248,0 +639,640,12968,0.03581063431610201,2.6633333333333,58,5014,0 +640,641,13525,0.1409929213297,2.5580555555556,107,3864,0 +641,642,12993,0.0,2.6627777777777997,48,5682,0 +642,643,12369,0.052915080344848,2.7625,64,4404,0 +643,644,12195,0.11966022897483,3.0283333333332996,52,3705,0 +644,645,12464,0.12973870706052,3.3727777777778,61,2738,0 +645,646,12470,0.023838633821410996,3.6369444444444,47,2887,0 +646,647,12475,0.12358680271021001,3.7088888888889,58,3776,0 +647,648,12482,0.089095336472172,3.5847222222222,51,3532,0 +648,649,12221,0.019762530636926998,3.4836111111111,61,3724,0 +649,650,12325,0.020994992941051005,3.4077777777777998,53,2786,0 +650,651,12258,0.10380294658324002,3.4441666666667,55,2941,0 +651,652,11980,0.079228021087742,3.1683333333332997,52,2351,0 +652,653,11947,0.039012779943635,3.0527777777778,89,2316,0 +653,654,12291,0.10658713601061,2.8527777777778,85,2350,0 +654,655,12293,0.14426278476756,2.5433333333332997,106,2916,0 +655,656,12341,0.08706206992122,2.1997222222222,88,2437,0 +656,657,12390,0.16325946030154,2.1036111111111,59,2761,0 +657,658,12611,0.0,2.2133333333332996,48,3941,0 +658,659,12737,0.0,2.2086111111111,66,4025,0 +659,660,12882,0.07729609083366701,2.2883333333333,95,4466,0 +660,661,12891,0.058100747891124,2.3222222222222,82,4401,0 +661,662,12756,0.06119152331234099,2.47,76,4747,0 +662,663,12875,0.08592375974441901,2.685,104,4051,0 +663,664,12847,0.033467197342518996,2.6763888888889,54,4448,0 +664,665,13518,0.030265788895452003,2.5838888888889,43,3736,0 +665,666,13217,0.11950310860409,2.6130555555556003,39,3918,0 +666,667,12621,0.09169148327055698,2.7633333333333,48,3408,0 +667,668,12591,0.18439354827551,3.0708333333332996,38,2883,0 +668,669,12332,0.10741924067542,3.4347222222222,45,3631,0 +669,670,12404,0.15862461647089002,3.7030555555555997,64,2609,0 +670,671,12457,0.14957813136313,3.8138888888888998,35,2533,0 +671,672,12370,0.24059408570531,3.8508333333333,66,2469,0 +672,673,11509,0.15511115210127,3.8961111111111,61,2458,0 +673,674,11433,0.19582462633147998,3.4763888888889,58,2458,0 +674,675,11317,0.13981560037535998,3.4041666666667,51,2043,0 +675,676,11364,0.1392329990551,3.2352777777778,55,1985,0 +676,677,11350,0.13079770999921,3.1508333333332996,126,2032,0 +677,678,11348,0.05367288121870901,2.7863888888888995,61,3409,0 
+678,679,11365,0.10971373742228,2.4861111111111,94,2018,0 +679,680,11505,0.13825204927093,2.2444444444443996,83,2461,0 +680,681,11468,0.13912778922607,2.1286111111111,136,2318,0 +681,682,11562,0.10215803640865,2.1261111111111,104,2787,0 +682,683,11858,0.096617489053804,2.2405555555556003,77,3186,0 +683,684,11933,0.0,2.2991666666667,109,3490,0 +684,685,11813,0.0,2.3627777777778,146,3407,0 +685,686,11735,0.0,2.5863888888889,69,3193,0 +686,687,11848,0.0,2.7286111111111,121,3412,0 +687,688,11843,0.0,2.8355555555556,53,3563,0 +688,689,12318,0.068897518746959,2.7875,61,3247,0 +689,690,11846,0.05418569809170299,2.7825,82,3012,0 +690,691,11066,0.06507307495461,2.7972222222222,37,2382,0 +691,692,10920,0.10547682048851,3.0355555555555997,19,2012,0 +692,693,10836,0.056437861708265,3.2486111111111002,19,1915,0 +693,694,10879,0.09870371159383699,3.6077777777778,19,1982,0 +694,695,10796,0.14331889652193,3.76,54,1950,0 +695,696,10785,0.057044494886419994,3.8066666666667004,44,4176,0 +696,697,9469,0.0,3.6638888888889,46,3654,0 +697,698,9278,0.032146952736052,3.5161111111111003,53,3063,0 +698,699,9417,0.068135614649249,3.3286111111111003,83,1916,0 +699,700,9253,0.034514299845882,3.2166666666667,92,1848,0 +700,701,9435,0.028306668795131003,2.9783333333332997,94,1704,0 +701,702,9356,0.13119921991025002,2.7211111111111004,111,1680,0 +702,703,9354,0.09360977200772301,2.4102777777777997,84,2011,0 +703,704,9405,0.11179018663123,2.1366666666667,52,1772,0 +704,705,9326,0.065272680657868,1.9947222222222,68,1838,0 +705,706,9549,0.15901886092526998,1.9936111111111001,35,1924,0 +706,707,9499,0.0,2.0788888888889,40,2038,0 +707,708,9371,0.26537507315217,2.1736111111111,47,1991,0 +708,709,9462,0.0,2.4027777777778,85,1729,0 +709,710,9509,0.05661033690817299,2.4580555555556,59,1673,0 +710,711,9469,0.026644044055307004,2.6102777777777995,61,1656,0 +711,712,9522,0.040819652463459,2.7597222222222,45,1774,0 +712,713,9885,0.13497701521251,2.8122222222222,47,1784,0 +713,714,9802,0.16853433621426,2.8427777777778,72,1818,0 +714,715,9461,0.08655557751574,2.87,69,1981,0 +715,716,9393,0.057411277886819004,2.9769444444444,17,2004,0 +716,717,9638,0.037244401880164,3.3241666666667005,47,1788,0 +717,718,9435,0.1132743034971,3.6375,37,1786,0 +718,719,9519,0.15690958465910998,3.8652777777778,57,1781,0 +719,720,9492,0.09604225449090802,3.8091666666667003,62,2024,0 +720,721,9458,0.06746445682560599,3.6844444444444,72,1669,0 +721,722,9420,0.05837314521040401,3.5913888888888996,43,1729,0 +722,723,9429,0.048008603166117006,3.5255555555556,57,1682,0 +723,724,9461,0.12614216994504,3.3277777777778,47,1714,0 +724,725,9404,0.077186121310215,3.07,61,1679,0 +725,726,9366,0.042879382350005,2.7622222222222,53,1739,0 +726,727,9488,0.031014262794497004,2.3872222222222,78,1669,0 +727,728,9515,0.13957171072647,2.1308333333333,100,1806,0 +728,729,9487,0.027108383258305998,2.1563888888889,104,1650,0 +729,730,9497,0.0,2.2547222222222003,56,1751,0 +730,731,9516,0.0,2.3397222222222003,89,1685,0 +731,732,9504,0.0,2.4808333333332997,108,1645,0 +732,733,9422,0.025265991419407996,2.6208333333333,67,2133,0 +733,734,9543,0.0,2.8138888888888998,83,1618,0 +734,735,9395,0.047219926720593,2.9275,90,1623,0 +735,736,9352,0.083109434319356,2.8663888888888995,82,1697,0 +736,737,9884,0.10860709298782001,2.7794444444444,76,1684,0 +737,738,9820,0.09831971809508301,2.8194444444444002,34,1779,0 +738,739,9439,0.02201293380153,2.9458333333332996,43,2982,0 +739,740,9560,0.064929719079082,3.2413888888888995,40,1848,0 +740,741,9589,0.036960535765785,3.7166666666667,40,1772,0 
+741,742,9575,0.068536856116777,4.1333333333333,57,1841,0 +742,743,9541,0.012398281267648999,4.2697222222222,60,1834,0 +743,744,9490,0.03530531183359101,4.2797222222222,53,1860,0 +744,745,7160,0.024153733176505,4.0,44,1647,0 +745,746,7233,0.031750779212929,3.8877777777777998,48,2129,0 +746,747,7166,0.092612685693125,3.6633333333333,50,1763,0 +747,748,7245,0.12674340154738,3.6127777777778,65,1433,0 +748,749,7299,0.068594711667718,3.3175,93,1428,0 +749,750,7169,0.13866540834682,2.8930555555556,105,1521,0 +750,751,7228,0.04681302439000701,2.4722222222222,94,1622,0 +751,752,7123,0.072990045810784,2.2294444444444,53,1580,0 +752,753,7199,0.17156759541908997,2.1286111111111,59,1468,0 +753,754,7167,0.05187669973457199,2.2219444444443996,63,1520,0 +754,755,7212,0.031958698733102996,2.3366666666667,61,1529,0 +755,756,7206,0.07333373485157901,2.4155555555556,72,1611,0 +756,757,7149,0.0,2.5408333333332997,93,1511,0 +757,758,7284,0.023187512335638,2.6511111111111,62,1906,0 +758,759,7265,0.031672522871666,2.8405555555556,50,2632,0 +759,760,7221,0.09110385536221399,2.8336111111111,42,1483,0 +760,761,7588,0.0,2.6575,62,1611,0 +761,762,7423,0.0983398607742,2.6622222222222005,21,1676,0 +762,763,7198,0.08011943311413,2.7719444444444,28,1670,0 +763,764,7279,0.043646436319699,3.0344444444444,65,1631,0 +764,765,7174,0.091445521226266,3.3741666666667003,37,1799,0 +765,766,7259,0.067771120773973,3.6925,20,1511,0 +766,767,7166,0.049768578185777006,3.8136111111111,47,1605,0 +767,768,7171,0.067455979006223,3.8202777777778,45,1758,0 +768,769,6883,0.14102875351082,3.7547222222222,49,1509,0 +769,770,6859,0.04521932948417,3.6077777777778,46,1591,0 +770,771,6817,0.032382889221133,3.5330555555556,30,1543,0 +771,772,6877,0.075100266089453,3.3544444444444,30,1573,0 +772,773,6785,0.038989846359505,3.1155555555556,48,1473,0 +773,774,6665,0.093396608626074,2.8463888888888995,36,1476,0 +774,775,6805,0.06797619687558401,2.4411111111111,46,1712,0 +775,776,6863,0.08326287339845401,2.1455555555556,27,1801,0 +776,777,6926,0.015112630017379001,2.0025,79,1902,0 +777,778,7004,0.031549757127405,2.1247222222222,65,2005,0 +778,779,6950,0.0,2.2741666666667,57,2363,0 +779,780,7262,0.0,2.3272222222222005,61,2513,0 +780,781,7361,0.017214486216241002,2.4363888888889,89,2664,0 +781,782,7288,0.015541991667356,2.6155555555556003,80,2714,0 +782,783,7463,0.0,2.7272222222222,79,2754,0 +783,784,7188,0.027199843934104,2.6552777777778,113,2670,0 +784,785,7658,0.053744802378685,2.6086111111111,71,2584,0 +785,786,7575,0.056755112785469006,2.6025,53,2466,0 +786,787,6954,0.070873939193717,2.7372222222222002,64,2137,0 +787,788,6862,0.19022950977106,3.0125,43,1931,0 +788,789,6896,0.17589540947937002,3.3477777777778,34,1743,0 +789,790,6954,0.022875979046570998,3.6236111111111002,29,1713,0 +790,791,6869,0.0,3.7383333333332995,30,1649,0 +791,792,6890,0.13681403156951,3.7772222222222,24,1633,0 +792,793,9742,0.058507485759525,3.6966666666667,40,1993,0 +793,794,9730,0.10227075584147999,3.7733333333332997,32,1940,0 +794,795,9810,0.06726096113022301,3.6408333333333,39,1951,0 +795,796,9688,0.15267199916685997,3.3922222222222,67,1894,0 +796,797,9849,0.069818221889972,3.1627777777778,65,1801,0 +797,798,9765,0.030305771594538997,2.6875,49,1962,0 +798,799,9812,0.09211700324247198,2.3533333333332997,41,2123,0 +799,800,9931,0.12298177354813,2.0425,50,2434,0 +800,801,9908,0.08705722689013601,1.9738888888889,48,2402,0 +801,802,10066,0.07529920073678098,2.0425,59,3013,0 +802,803,10184,0.06217694957317299,2.1563888888889,51,3086,0 
+803,804,10295,0.020886039183631,2.2866666666667004,43,3527,0 +804,805,10113,0.08148200392528,2.3919444444443996,72,3716,0 +805,806,10218,0.027014133895137002,2.5513888888889,52,3577,0 +806,807,10322,0.08271940630361399,2.6030555555556,68,3430,0 +807,808,10269,0.038537180887872,2.6647222222222005,74,3413,0 +808,809,10781,0.090543853269643,2.5930555555556003,46,3755,0 +809,810,10486,0.025935450238779998,2.5513888888889,64,4806,0 +810,811,10124,0.090692829340129,2.76,38,3127,0 +811,812,9993,0.09154630234853098,3.0636111111111,40,3421,0 +812,813,9801,0.09562635368432303,3.4016666666667,50,2475,0 +813,814,9760,0.0,3.7277777777778,42,2440,0 +814,815,9858,0.0,3.7902777777778,37,2731,0 +815,816,9884,0.027267039980187,3.7355555555556,34,2493,0 +816,817,7781,0.024102810048699,3.535,37,1665,0 +817,818,7742,0.072297652068167,3.5819444444444,47,1771,0 +818,819,7682,0.12348623922845,3.3847222222222,67,2293,0 +819,820,7831,0.077453588867077,3.2547222222222,66,1959,0 +820,821,7641,0.05662557916213299,3.125,91,1498,0 +821,822,7641,0.15509029304093,2.7766666666667,132,1537,0 +822,823,7759,0.079595064406905,2.4725,149,1580,0 +823,824,7748,0.053225613553496996,2.1927777777778,65,1901,0 +824,825,7776,0.057411277886819004,2.1283333333333,50,1916,0 +825,826,7938,0.077171346852694,2.2319444444444,70,2213,0 +826,827,8031,0.0,2.3061111111111,82,2205,0 +827,828,8117,0.07512642149906099,2.3363888888889,72,2486,0 +828,829,8099,0.0,2.3686111111111,98,2580,0 +829,830,8002,0.0,2.4986111111111002,78,2530,0 +830,831,7944,0.026463035590685,2.6433333333333,86,2664,0 +831,832,7963,0.024228588329878998,2.7563888888888997,76,4368,0 +832,833,8602,0.055182797357095005,2.6652777777777996,95,3103,0 +833,834,8269,0.09607690135522999,2.6844444444444,63,2249,0 +834,835,7871,0.059431847203259,2.7902777777778,32,2070,0 +835,836,7709,0.018731901987648,3.1119444444444,30,2833,0 +836,837,7726,0.033970515582906,3.5491666666667,27,1734,0 +837,838,7781,0.049963174087431,3.7102777777778,22,2151,0 +838,839,7762,0.073295374096872,3.7961111111111,19,2103,0 +839,840,7692,0.017715537831218996,3.7730555555556,32,1725,0 +840,841,6608,0.014656639469103998,3.5919444444444,45,1895,0 +841,842,6526,0.15513271231042,3.5580555555555997,65,1959,0 +842,843,6531,0.06544162031760599,3.4588888888889,73,1637,0 +843,844,6483,0.12276447331552001,3.2969444444444003,52,1658,0 +844,845,6602,0.054046416943085,3.2288888888889,93,1666,0 +845,846,6555,0.06827770027642299,2.7358333333332996,68,2410,0 +846,847,6610,0.10171854295932001,2.4636111111111,127,1787,0 +847,848,6690,0.093454285728882,2.1894444444444,105,2264,0 +848,849,6651,0.04318436192577,2.1227777777778,75,2007,0 +849,850,6759,0.10050707347524,2.1369444444443997,77,2107,0 +850,851,6836,0.019571935182124002,2.2230555555556,140,2355,0 +851,852,6894,0.0,2.3188888888889,132,2726,0 +852,853,6844,0.0,2.4166666666667003,100,2875,0 +853,854,6773,0.02713995635286,2.5777777777777997,174,2780,0 +854,855,6802,0.092632629280125,2.7869444444444,82,3936,0 +855,856,6947,0.09867663820799799,2.8586111111111,128,3116,0 +856,857,7248,0.0,3.0816666666667003,79,3770,0 +857,858,6885,0.11132365864914,2.8713888888889,71,2382,0 +858,859,6643,0.09473018999010001,2.9386111111111,60,2152,0 +859,860,6560,0.061070711161473,2.9827777777778,60,1754,0 +860,861,6554,0.18477832073133,3.3197222222222,56,1783,0 +861,862,6600,0.055986690710270993,3.5961111111111004,78,1780,0 +862,863,6525,0.16264480046039997,3.7613888888888996,60,1582,0 +863,864,6543,0.026215643469447998,3.7305555555556,48,2271,0 +864,865,9018,0.0,3.5580555555555997,48,2592,0 
+865,866,9225,0.054655616583012,3.5136111111111004,42,2921,0 +866,867,9112,0.07076692500883701,3.3772222222222,64,1814,0 +867,868,9195,0.067217215228375,3.2402777777778,36,3219,0 +868,869,9206,0.046060828388587,3.0586111111111003,40,2567,0 +869,870,9224,0.08329795085471901,2.7908333333332997,18,1899,0 +870,871,9408,0.08219020764935,2.3761111111111,35,1801,0 +871,872,9082,0.046792553198475,2.1347222222222,44,2005,0 +872,873,9168,0.06755714954154099,1.9991666666666998,105,2572,0 +873,874,9258,0.09905088200828699,1.9983333333333,71,3563,0 +874,875,9158,0.0,2.0908333333333,65,2777,0 +875,876,9140,0.10824637351267001,2.2311111111111,74,3362,0 +876,877,9206,0.0,2.3219444444443997,34,3590,0 +877,878,9186,0.0,2.4727777777777997,49,2930,0 +878,879,9155,0.037750185176735,2.5952777777777998,44,2481,0 +879,880,9174,0.030345867660395,2.7416666666667004,57,2571,0 +880,881,9758,0.057665227298857,2.7652777777778,102,3546,0 +881,882,9451,0.16774071722374,2.7980555555556,106,4984,0 +882,883,9153,0.10462164884166,2.7597222222222,58,1994,0 +883,884,9233,0.051974117163581995,3.0116666666667005,57,3060,0 +884,885,9250,0.070438547008222,3.2916666666667003,62,2151,0 +885,886,9317,0.11437533048243999,3.5547222222222,42,2158,0 +886,887,9130,0.028754095353637002,3.7580555555556,35,2319,0 +887,888,9249,0.06874265819680701,3.7330555555556,28,1909,0 +888,889,8297,0.041552255552731,3.5886111111111005,27,1627,0 +889,890,8245,0.033571347720577,3.5255555555556,35,2459,0 +890,891,8298,0.014724878652831001,3.3858333333333,50,3167,0 +891,892,8247,0.046095580964192,3.2677777777778,69,1839,0 +892,893,8387,0.031859774913781,3.1247222222222,64,3887,0 +893,894,8392,0.09412153625342401,2.7213888888888995,69,2031,0 +894,895,8531,0.11471874999036,2.3972222222222004,58,1522,0 +895,896,8437,0.09375530196425098,2.0836111111111,58,1732,0 +896,897,8344,0.10898948864078999,2.0644444444444,51,2169,0 +897,898,8274,0.031129909255124,2.2063888888889,46,1679,0 +898,899,8328,0.0,2.3044444444443997,84,1941,0 +899,900,8351,0.020155867044518997,2.47,144,1638,0 +900,901,8380,0.016795241270985,2.5697222222222003,86,1725,0 +901,902,8332,0.0,2.7625,69,1903,0 +902,903,8366,0.0,2.9436111111111005,81,2074,0 +903,904,8357,0.01748186857624,2.7905555555556,175,1848,0 +904,905,8867,0.015638795432702,2.7527777777778,65,1761,0 +905,906,8659,0.037878946671491,2.6980555555556,48,1838,0 +906,907,8458,0.14870829462531002,2.9102777777778,33,1640,0 +907,908,8360,0.07322030784057598,3.2663888888889,35,1715,0 +908,909,8330,0.10504553292421,3.5372222222222,37,1717,0 +909,910,8298,0.10771048774666,3.86,31,1758,0 +910,911,8381,0.07484115005697,3.9216666666667,36,1975,0 +911,912,8393,0.10377526695926,3.8766666666667002,30,1865,0 +912,913,3998,0.052336696506499,3.6463888888889,28,3575,0 +913,914,3733,0.039930389849143995,3.6552777777778,24,1413,0 +914,915,3735,0.052659026600132004,3.5880555555556,68,1414,0 +915,916,3709,0.071593754146172,3.3594444444444003,26,1170,0 +916,917,3755,0.072107773186609,3.1888888888888998,78,1209,0 +917,918,3782,0.14407221323011,2.7575,90,1170,0 +918,919,3849,0.078873737285415,2.3936111111111,76,1328,0 +919,920,3801,0.090543853269643,2.1925,94,1258,0 +920,921,3787,0.0,2.16,70,1427,0 +921,922,3835,0.18229662394063,2.2719444444444,129,1480,0 +922,923,4035,0.10064381631633,2.3994444444444,120,1687,0 +923,924,4173,0.0,2.2836111111111,122,1942,0 +924,925,3995,0.0,2.5422222222222004,100,1967,0 +925,926,4016,0.0,2.6908333333332997,102,2110,0 +926,927,4049,0.064661049677152,2.7702777777778,118,1956,0 
+927,928,4014,0.10610212880951,2.7405555555556,86,1984,0 +928,929,4263,0.098345239553664,2.6908333333332997,92,1893,0 +929,930,3941,0.055426072308289,2.7008333333333,44,1821,0 +930,931,4023,0.026036719363444,2.8322222222222,25,1641,0 +931,932,3917,0.058176601538018,3.0922222222222002,54,1604,0 +932,933,3910,0.11644035456955001,3.4363888888889,48,1265,0 +933,934,3934,0.067489738764642,3.7530555555556,56,1407,0 +934,935,3783,0.09115553454055801,3.9127777777778,42,1342,0 +935,936,3834,0.052217414705359004,3.7608333333333,41,1216,0 +936,937,8698,0.028401045145692,3.6472222222222,32,2569,0 +937,938,8969,0.06030991242653401,3.5544444444444,48,2150,0 +938,939,8928,0.057683225704233,3.5036111111111,40,2317,0 +939,940,9020,0.049602244305934996,3.2538888888888997,26,2047,0 +940,941,8865,0.054771618715138,3.1886111111111,55,2065,0 +941,942,8830,0.014455899164978,2.7341666666667,52,1909,0 +942,943,8879,0.05563571922395901,2.3655555555556003,34,1910,0 +943,944,9120,0.077488949885965,2.1688888888889,61,2037,0 +944,945,9111,0.06776025909838901,2.0977777777778,34,3065,0 +945,946,9071,0.033919453583666,2.3077777777777997,50,2452,0 +946,947,9205,0.030948232299767998,2.3611111111111,47,3226,0 +947,948,9355,0.0,2.4986111111111002,56,3271,0 +948,949,9372,0.0,2.5691666666667,76,3471,0 +949,950,9392,0.0,2.7463888888889,60,3922,0 +950,951,9416,0.0,2.8063888888888995,100,3296,0 +951,952,9394,0.0,2.8091666666667003,80,3171,0 +952,953,9810,0.10150033578287,2.715,74,3208,0 +953,954,9594,0.13650296233629,2.6869444444444,24,3602,0 +954,955,9006,0.048341331534980006,2.8180555555556,41,3208,0 +955,956,9140,0.055919636698742996,3.0541666666667004,19,3455,0 +956,957,8925,0.05282677388968401,3.4711111111111004,24,2833,0 +957,958,9047,0.07932984590431501,3.7566666666667,18,3453,0 +958,959,9030,0.033310879512461,3.8633333333332995,28,3155,0 +959,960,9088,0.048306771033287996,3.7519444444444,5,2145,0 +960,961,8569,0.034002578802562,3.6480555555556,12,1999,0 +961,962,8616,0.04780164047085401,3.5061111111111005,35,2135,0 +962,963,8497,0.13378075099383,3.47,41,1813,0 +963,964,8439,0.063853685461221,3.3086111111111003,30,2020,0 +964,965,8567,0.0,3.1194444444444,22,2127,0 +965,966,8694,0.073869151016554,2.8044444444444,56,1764,0 +966,967,8739,0.04358290846692801,2.4205555555556004,34,2249,0 +967,968,8761,0.0,2.1180555555556,73,3119,0 +968,969,8838,0.062006969698131,2.1266666666667,86,2031,0 +969,970,8908,0.14006961492891,2.1708333333333,68,2246,0 +970,971,9053,0.11198565566103999,2.3247222222222,36,3214,0 +971,972,9346,0.0,2.4208333333332996,66,4207,0 +972,973,8989,0.05842745555499299,2.5563888888889,74,4195,0 +973,974,8807,0.070887934206661,2.7086111111111,78,3179,0 +974,975,9020,0.031869233863638,2.8027777777778,66,2739,0 +975,976,9034,0.0,2.7711111111111,118,2394,0 +976,977,9558,0.055680379884383,2.74,81,3750,0 +977,978,9042,0.030919398857213,2.6869444444444,85,3000,0 +978,979,8804,0.04022215086538101,2.8113888888889,69,2646,0 +979,980,8885,0.08462727078727299,3.1258333333332997,49,2375,0 +980,981,8721,0.15790637433488,3.4711111111111004,56,2442,0 +981,982,8676,0.09916557184644699,3.7419444444444,64,2069,0 +982,983,9029,0.051043016646698,3.7258333333333,48,1899,0 +983,984,8670,0.023695834967821,3.5369444444444,65,2277,0 +984,985,8537,0.13363180896924,3.4911111111111004,53,1926,0 +985,986,8418,0.14375985835531,3.3769444444444,70,1949,0 +986,987,8481,0.13890523887057998,3.3327777777778,51,2222,0 +987,988,8535,0.096357518724471,3.1925,30,1797,0 +988,989,8535,0.098277544249084,3.135,97,1860,0 
+989,990,8442,0.11251833989481,2.8338888888889,41,2870,0 +990,991,8448,0.074768662666532,2.4997222222222004,32,1899,0 +991,992,8527,0.038008655416852,2.2297222222222004,47,2336,0 +992,993,8541,0.016354174968753,2.1158333333333,34,2703,0 +993,994,8635,0.11898350916153001,2.1966666666667,54,2773,0 +994,995,8867,0.0,2.2591666666667,69,2577,0 +995,996,9033,0.0,2.3002777777778,109,2816,0 +996,997,8875,0.0,2.3797222222222003,76,3133,0 +997,998,8708,0.0,2.625,47,3366,0 +998,999,8455,0.020636446066963,2.6661111111111,44,3062,0 +999,1000,8713,0.043044731483849,2.6694444444444,92,3003,0 +1000,1001,8934,0.12513578187909,2.6541666666667,67,3044,0 +1001,1002,8745,0.099581351017555,2.6483333333332997,26,3230,0 +1002,1003,8674,0.085903047711976,2.7444444444444,42,2793,0 +1003,1004,8606,0.066698820830796,3.0788888888889,69,1945,0 +1004,1005,8508,0.034228320502586,3.4833333333332996,32,2716,0 +1005,1006,8558,0.028479870560763,3.6063888888888997,41,2103,0 +1006,1007,8529,0.16430377699282997,3.8069444444444,52,1795,0 +1007,1008,8520,0.020290722486788003,3.6475,56,2840,0 +1008,1009,6662,0.17253761895951003,3.5219444444444,47,2653,0 +1009,1010,6491,0.1150267570489,3.3708333333333,65,2819,0 +1010,1011,6498,0.14119445755296,3.3086111111111003,70,1706,0 +1011,1012,6500,0.079900598296651,3.2411111111111004,84,1801,0 +1012,1013,6471,0.11459361685243,3.0525,71,3271,0 +1013,1014,6354,0.11299850955195001,2.7419444444444,110,2001,0 +1014,1015,6592,0.078187238738118,2.4305555555556,65,1678,0 +1015,1016,6552,0.15222680511595002,2.1852777777778,68,1703,0 +1016,1017,6492,0.05823703723779,2.0644444444444,74,2441,0 +1017,1018,6577,0.038270957919533,2.1961111111111,43,2304,0 +1018,1019,6777,0.045436612403901,2.2886111111111,55,3124,0 +1019,1020,6844,0.051111263534218,2.3219444444443997,53,3605,0 +1020,1021,6769,0.0,2.4436111111111,64,2985,0 +1021,1022,6642,0.0,2.6463888888889,58,2934,0 +1022,1023,6782,0.05724849659412799,2.735,54,3044,0 +1023,1024,6715,0.0,2.7586111111111005,121,3463,0 +1024,1025,6915,0.084808608043399,2.7138888888888997,103,3199,0 +1025,1026,6569,0.05823703723779,2.7119444444444,66,2684,0 +1026,1027,6486,0.12640598881102003,2.8027777777778,73,3317,0 +1027,1028,6504,0.08602692657241201,2.9777777777778,71,2159,0 +1028,1029,6445,0.13712331887199,3.2961111111111,37,2043,0 +1029,1030,6427,0.12184008568978999,3.4869444444444,46,2003,0 +1030,1031,6365,0.050317612906927996,3.6736111111111005,40,2260,0 +1031,1032,6277,0.07167380324199299,3.7469444444444,26,3522,0 +1032,1033,5231,0.051289858799957,3.6133333333332995,42,1840,0 +1033,1034,5166,0.094021005766084,3.4752777777778,63,1820,0 +1034,1035,5303,0.020566298353792,3.3602777777778,68,1856,0 +1035,1036,5306,0.12275234276969,3.1605555555555997,87,1715,0 +1036,1037,5298,0.1054190746845,3.0733333333333,60,1695,0 +1037,1038,5268,0.19050318144252,2.7130555555556,94,2254,0 +1038,1039,5251,0.10472332930133,2.2886111111111,121,1652,0 +1039,1040,5194,0.12644994481537,2.0783333333333,128,1602,0 +1040,1041,5230,0.08859454436104999,1.9188888888889,68,1792,0 +1041,1042,5244,0.0,1.9355555555556003,76,1954,0 +1042,1043,5102,0.09532581107230802,2.0569444444443996,77,1808,0 +1043,1044,5244,0.15766772749983,2.1902777777778,158,1629,0 +1044,1045,5249,0.06429178708826701,2.3477777777777997,112,2140,0 +1045,1046,5261,0.068395341911942,2.5502777777778,85,2390,0 +1046,1047,5339,0.025992957736547993,2.6597222222222,77,1707,0 +1047,1048,5241,0.0,2.7238888888888995,89,1901,0 +1048,1049,5491,0.021142167244917996,2.7375,106,1820,0 +1049,1050,5374,0.072067861729848,2.7483333333332998,47,2167,0 
+1050,1051,5354,0.1275228688396,2.8525,34,2063,0 +1051,1052,5232,0.043846003986674,3.0038888888888997,32,2184,0 +1052,1053,5217,0.10247450096433999,3.2761111111111005,22,1981,0 +1053,1054,5258,0.07584150637714701,3.5761111111111004,16,1813,0 +1054,1055,5251,0.020496657705832002,3.8172222222222,32,2033,0 +1055,1056,5223,0.13399493992192998,3.6691666666667,16,1629,0 +1056,1057,3952,0.091121163023619,3.5558333333333,20,1485,0 +1057,1058,3949,0.11809705541338,3.4266666666667,56,1527,0 +1058,1059,4021,0.033014047837867995,3.435,74,2561,0 +1059,1060,3815,0.16367597832104,3.2111111111111,116,1523,0 +1060,1061,3855,0.12469537397569001,3.1297222222222,72,1446,0 +1061,1062,3892,0.095002031789468,2.7538888888888997,66,1499,0 +1062,1063,3948,0.1028064299952,2.3116666666667003,56,1368,0 +1063,1064,3860,0.028861851985229003,2.0988888888889,61,1426,0 +1064,1065,3830,0.05806984314166,2.0983333333333,2151,3528,0 +1065,1066,3821,0.050886592113012004,2.1986111111111,459,2279,0 +1066,1067,3886,0.05081829754409599,2.3677777777777997,84,1421,0 +1067,1068,3954,0.0,2.5036111111111,55,2008,0 +1068,1069,3839,0.08354288831032201,2.5786111111111,61,1429,0 +1069,1070,3921,0.0,2.8172222222222,19,1497,0 +1070,1071,3874,0.08142390858425298,2.8727777777778,30,1604,0 +1071,1072,3996,0.047911560407608,2.8294444444444,73,1595,0 +1072,1073,4246,0.12201534565884,2.7136111111111005,63,2217,0 +1073,1074,3803,0.088739417881303,2.7058333333333,35,1580,0 +1074,1075,3594,0.08276214539547999,2.8161111111111,57,1466,0 +1075,1076,3778,0.066779641097052,3.1541666666667,50,1717,0 +1076,1077,3745,0.11367082443275,3.5791666666667004,48,1564,0 +1077,1078,3747,0.021597223158314,3.8158333333332997,40,1752,0 +1078,1079,3726,0.16874893592242002,3.9405555555556,36,1598,0 +1079,1080,3729,0.041971530556774,3.7294444444444,59,1842,0 +1080,1081,8513,0.042983941794881,3.6183333333333,14,3066,0 +1081,1082,8738,0.14500733624043,3.4911111111111004,16,2272,0 +1082,1083,8709,0.04672709003112901,3.4566666666667003,36,4344,0 +1083,1084,8601,0.032553617944112004,3.37,65,3242,0 +1084,1085,8719,0.040039251102491,3.1658333333332997,80,2291,0 +1085,1086,8820,0.05515375910112699,2.7261111111111003,91,2240,0 +1086,1087,8674,0.057511810177119004,2.3533333333332997,102,2012,0 +1087,1088,8859,0.041202889821452,2.1158333333333,85,2305,0 +1088,1089,8905,0.07854024449462599,2.0852777777778,69,2295,0 +1089,1090,8920,0.11628975245152,2.1422222222222,79,2370,0 +1090,1091,9062,0.087543035971238,2.3172222222222003,66,3066,0 +1091,1092,9139,0.0,2.3983333333332997,47,3132,0 +1092,1093,8866,0.031151045483538996,2.55,51,3006,0 +1093,1094,8997,0.0,2.7413888888888995,20,3101,0 +1094,1095,9122,0.029949950026121004,2.7636111111111004,62,3739,0 +1095,1096,9191,0.067297142748812,2.7002777777777998,54,3933,0 +1096,1097,9795,0.08450527625030299,2.7247222222222,99,4537,0 +1097,1098,9255,0.04985210926935801,2.5866666666667,64,3856,0 +1098,1099,8924,0.094084438832673,2.8597222222222,66,2862,0 +1099,1100,9012,0.044896125591910994,3.1269444444444,49,2449,0 +1100,1101,9023,0.07328004196455701,3.5019444444444,73,2222,0 +1101,1102,8875,0.13104465124262998,3.7786111111111005,47,2159,0 +1102,1103,8800,0.10394116672902001,3.8727777777778,48,2486,0 +1103,1104,8785,0.033616505813902,3.7041666666667004,35,3148,0 +1104,1105,8474,0.02672150953308,3.5533333333333,27,3207,0 +1105,1106,8412,0.082058799915824,3.4461111111111005,19,2057,0 +1106,1107,8491,0.057321827873555005,3.4341666666667003,37,2029,0 +1107,1108,8391,0.067005870534182,3.3141666666667002,45,3127,0 
+1108,1109,8216,0.13429243256821,3.0438888888888997,45,2597,0 +1109,1110,8292,0.015094533525413,2.6791666666667004,32,2350,0 +1110,1111,8406,0.063949370932991,2.3202777777778,99,2364,0 +1111,1112,8509,0.09437881174246199,2.0691666666667,71,2095,0 +1112,1113,8486,0.02139340711812,2.0091666666667,93,2978,0 +1113,1114,8616,0.0,2.1886111111111,78,2743,0 +1114,1115,8642,0.0,2.3088888888889,71,2668,0 +1115,1116,8823,0.0,2.3794444444444,91,3054,0 +1116,1117,8774,0.0,2.5994444444443996,31,3733,0 +1117,1118,8810,0.0,2.7119444444444,35,4312,0 +1118,1119,8611,0.0,2.76,25,4112,0 +1119,1120,8798,0.10029435223064,2.6975,45,3541,0 +1120,1121,9179,0.0,2.5466666666667,33,3901,0 +1121,1122,9057,0.10365337249761998,2.6036111111111,34,4371,0 +1122,1123,8633,0.12418226954696002,2.7927777777778,40,4099,0 +1123,1124,8517,0.0,2.9788888888889,17,3039,0 +1124,1125,8427,0.051166116772473,3.4080555555556,17,3197,0 +1125,1126,8615,0.04022215086538101,3.6813888888888995,16,2346,0 +1126,1127,8690,0.17057206553854998,3.7983333333332996,26,2285,0 +1127,1128,8438,0.12861588337799,3.6338888888888996,19,2313,0 +1128,1129,10388,0.0,3.5111111111111004,30,3216,0 +1129,1130,10588,0.0,3.3613888888888996,94,3860,0 +1130,1131,10533,0.14569364884757002,3.3072222222222,73,4781,0 +1131,1132,10397,0.18198813530019,3.2447222222222,59,2957,0 +1132,1133,10347,0.038073868368754996,3.1152777777778,53,2171,0 +1133,1134,10405,0.11491272575332001,2.6994444444444,56,2856,0 +1134,1135,10411,0.064841538076484,2.3497222222222005,70,2714,0 +1135,1136,10503,0.048708312546253,2.0619444444444,60,2602,0 +1136,1137,10598,0.11629780056153,2.0625,83,2331,0 +1137,1138,10692,0.07659916149791901,2.1905555555556004,265,3586,0 +1138,1139,10874,0.0,2.2588888888889,944,3363,0 +1139,1140,11043,0.043763623117499,2.3983333333332997,36,3879,0 +1140,1141,11009,0.0,2.5536111111111,42,3556,0 +1141,1142,10818,0.041436571087464,2.7408333333333,23,4381,0 +1142,1143,10985,0.0,2.7375,75,4777,0 +1143,1144,10861,0.08191467409622599,2.7780555555556,68,4879,0 +1144,1145,12282,0.11084389924027,2.6225,23,3553,0 +1145,1146,11225,0.12510294083344,2.6386111111111,35,3177,0 +1146,1147,10775,0.10213470511716999,2.7908333333332997,38,2727,0 +1147,1148,10688,0.06332743445339299,3.0922222222222002,69,2758,0 +1148,1149,10601,0.033666593475508995,3.4291666666667004,57,4124,0 +1149,1150,10634,0.057459020289436,3.6752777777778,58,3076,0 +1150,1151,10646,0.023008391787587002,3.7361111111111005,43,2291,0 +1151,1152,10562,0.037622360322277996,3.5905555555556,65,2482,0 +1152,1153,10608,0.026766196308354,3.3872222222222,60,2537,0 +1153,1154,10618,0.13691041072327,3.3186111111111005,55,2434,0 +1154,1155,10636,0.024581173073577996,3.2775,49,2608,0 +1155,1156,10583,0.050723618686514,3.1625,54,2614,0 +1156,1157,10613,0.038807415292018,3.1391666666667004,66,2904,0 +1157,1158,10603,0.10731539561588001,2.7616666666667005,59,2204,0 +1158,1159,10601,0.13649131550296,2.4675,107,2326,0 +1159,1160,10757,0.11190990870167998,2.2166666666667,104,3002,0 +1160,1161,10815,0.17879123074031,2.1205555555556,100,3472,0 +1161,1162,10790,0.08728058888363299,2.2044444444443996,133,3496,0 +1162,1163,11082,0.0,2.3147222222222004,65,3168,0 +1163,1164,11121,0.07099894663641,2.2416666666667004,152,4268,0 +1164,1165,10913,0.098617038600063,2.405,83,4350,0 +1165,1166,11004,0.0,2.5705555555556003,158,3555,0 +1166,1167,11135,0.10519721128315,2.7088888888889,145,4986,0 +1167,1168,10960,0.10928571467638999,2.6913888888888997,77,4576,0 +1168,1169,11686,0.14969099592127,2.6427777777777997,13,4451,0 
+1169,1170,11244,0.060122448878635,2.705,67,3627,0 +1170,1171,10931,0.068254139999346,2.8738888888889,25,3485,0 +1171,1172,10811,0.05698767181974299,3.0819444444444,27,3046,0 +1172,1173,10679,0.09466793501476899,3.4491666666667005,23,2657,0 +1173,1174,10648,0.13287358772218,3.6275,28,2423,0 +1174,1175,10757,0.032507012295146,3.8027777777778,25,2374,0 +1175,1176,10706,0.14779741522058998,3.6436111111111003,28,2493,0 +1176,1177,9077,0.10864900088005,3.4861111111111005,30,2495,0 +1177,1178,8836,0.12602969813907,3.3266666666667004,31,2189,0 +1178,1179,8971,0.07253718299881,3.1866666666667003,31,2214,0 +1179,1180,8972,0.31381296416887,3.2213888888888995,44,2374,0 +1180,1181,8903,0.2312064012582,3.0102777777778,27,3230,0 +1181,1182,8967,0.17687421373190998,2.6658333333332997,36,2132,0 +1182,1183,8962,0.022073721703464003,2.3902777777777997,61,3042,0 +1183,1184,9044,0.11600086139072999,2.1380555555556002,64,2053,0 +1184,1185,8931,0.10418807549523,2.0161111111111,118,2349,0 +1185,1186,9028,0.04022215086538101,2.0641666666667,98,3381,0 +1186,1187,9240,0.06812462580532,2.1844444444443996,76,3436,0 +1187,1188,9227,0.055328485037955,2.2822222222222,57,3280,0 +1188,1189,9227,0.027788383289498998,2.4002777777777995,74,4357,0 +1189,1190,9125,0.0,2.5433333333332997,72,4522,0 +1190,1191,9075,0.0,2.7469444444444,78,4094,0 +1191,1192,9117,0.035137191893634005,2.6872222222222,69,3296,0 +1192,1193,9562,0.035137191893634005,2.6980555555556,125,4129,0 +1193,1194,9305,0.11258759940039,2.7380555555556,157,3036,0 +1194,1195,8965,0.16105265701128,2.7858333333333,61,2628,0 +1195,1196,8862,0.15210502999287,3.0502777777778,12,2296,0 +1196,1197,8858,0.07673479360192201,3.2991666666667,16,2221,0 +1197,1198,8820,0.17013715283392,3.5533333333333,36,1991,0 +1198,1199,8876,0.1609412187274,3.6652777777778,27,2778,0 +1199,1200,8797,0.12008642730107,3.6116666666667,22,2511,0 +1200,1201,9074,0.045995324803682,3.5463888888888997,22,2103,0 +1201,1202,9318,0.23802438276872,3.4013888888888997,35,2111,0 +1202,1203,9286,0.18078076076243,3.245,67,2055,0 +1203,1204,9320,0.12741851179236,3.1644444444444,46,1930,0 +1204,1205,9280,0.08024661572906401,2.9361111111111002,72,2456,0 +1205,1206,9333,0.32656213417732,2.6952777777778,96,2952,0 +1206,1207,9334,0.28639695711596,2.3702777777777997,117,2147,0 +1207,1208,9337,0.083900984173012,2.0947222222222,113,2051,0 +1208,1209,9405,0.12853338721539,1.9538888888888999,140,2281,0 +1209,1210,9263,0.032414228925828,1.9925,107,2102,0 +1210,1211,9326,0.08237281480963901,2.0363888888889,102,2062,0 +1211,1212,9421,0.0,2.1919444444444,85,2796,0 +1212,1213,9275,0.0,2.3211111111111,49,2005,0 +1213,1214,9323,0.0,2.4955555555556,69,2075,0 +1214,1215,9347,0.45868581620054,2.6980555555556,68,2058,1 +1215,1216,9333,0.19590927087360002,2.7219444444444,104,2733,0 +1216,1217,9846,0.7871265862012701,2.725,111,2170,1 +1217,1218,9497,0.18267963393082,2.7816666666667,88,2282,0 +1218,1219,9383,0.26777755992147,2.7811111111111004,64,2178,0 +1219,1220,9300,0.30404676514833,2.955,29,2283,0 +1220,1221,9389,0.28226806095289003,3.3158333333332997,32,2097,0 +1221,1222,9364,0.32093016819692,3.5669444444444003,29,2738,0 +1222,1223,9227,0.24793583772273,3.7419444444444,21,2678,0 +1223,1224,9309,0.27376916868294,3.6236111111111002,33,2404,0 +1224,1225,6204,0.32069151905173,3.4416666666667,37,1497,0 +1225,1226,6048,0.16728853165162,3.4172222222222,57,1496,0 +1226,1227,5949,0.17244047836378998,3.3016666666667,72,1935,0 +1227,1228,5981,0.21356200193615002,3.1963888888888996,86,1521,0 
+1228,1229,5897,0.08833993625230199,3.0641666666667002,70,2879,0 +1229,1230,6038,0.20141526375625,2.735,63,1561,0 +1230,1231,6094,0.12271171189386001,2.3288888888889,49,1381,0 +1231,1232,6022,0.15111333507662,2.0938888888889,81,1826,0 +1232,1233,6122,0.3688420983862,2.1338888888889,58,1896,0 +1233,1234,6034,0.15672074166098002,2.2247222222222005,70,2083,0 +1234,1235,6079,0.09947623679378201,2.3308333333333,67,1792,0 +1235,1236,5998,0.18394691317126002,2.3902777777777997,70,3258,0 +1236,1237,6004,0.076264605227629,2.5819444444444,95,2265,0 +1237,1238,5908,0.058100747891124,2.6661111111111,100,2775,0 +1238,1239,6022,0.18015967729618,2.8258333333333,116,1545,0 +1239,1240,5981,0.059431847203259,2.7502777777778,123,1818,0 +1240,1241,6399,0.14870829462531002,2.6730555555556004,71,1481,0 +1241,1242,6119,0.09565694822541,2.7536111111111,65,1677,0 +1242,1243,6114,0.16022629962173002,2.9677777777778,73,1858,0 +1243,1244,5915,0.4140256163498,3.37,53,1643,0 +1244,1245,6192,0.32447726333369004,3.5958333333333,79,1582,0 +1245,1246,6021,0.15394421357627,3.8144444444444,77,1611,0 +1246,1247,6060,0.060070368432037995,3.8283333333333,59,1803,0 +1247,1248,7510,0.14236976564388001,3.7030555555555997,66,2121,0 +1248,1249,7560,0.12741851179236,3.5802777777778,54,2375,0 +1249,1250,7525,0.093634078744746,3.4197222222222,54,1866,0 +1250,1251,7483,0.13709947889982,3.4438888888888997,89,2398,0 +1251,1252,7452,0.06298116794216299,3.3425,85,2577,0 +1252,1253,7512,0.13125017838571001,3.1608333333333,96,1801,0 +1253,1254,7572,0.21161148728916002,2.7413888888888995,149,1840,0 +1254,1255,7629,0.06783428261124,2.3808333333332996,139,1985,0 +1255,1256,7529,0.20877561051189,2.12,90,2041,0 +1256,1257,7623,0.10394294206935002,2.1533333333333,68,2075,0 +1257,1258,7637,0.0,2.2569444444444,445,2564,0 +1258,1259,7921,0.076424293095548,2.3183333333332996,100,2734,0 +1259,1260,7790,0.08809461878011901,2.3583333333332996,138,3143,0 +1260,1261,7782,0.03428038631974299,2.5072222222222003,104,3119,0 +1261,1262,7829,0.039360296791109,2.5927777777778,82,3590,0 +1262,1263,7902,0.0,2.6894444444444,208,3893,0 +1263,1264,8039,0.03894406599435601,2.6291666666667,92,3264,0 +1264,1265,8350,0.18176011684739002,2.6469444444444,53,3963,0 +1265,1266,8142,0.18521047165852,2.7461111111111003,65,2757,0 +1266,1267,7886,0.13079770999921,2.9363888888889,62,2306,0 +1267,1268,7743,0.13310058077443,3.2797222222222002,73,2549,0 +1268,1269,7707,0.054750658073534006,3.5194444444444,84,2212,0 +1269,1270,7726,0.030588852697706,3.8130555555556,90,2286,0 +1270,1271,7717,0.12998124134227002,3.7941666666667,80,2979,0 +1271,1272,10331,0.09100057249197198,3.6086111111111,90,3158,0 +1272,1273,10515,0.19464543002904003,3.3858333333333,84,2645,0 +1273,1274,10415,0.22178651521516,3.3336111111111,34,3161,0 +1274,1275,10387,0.22983578430825,3.3116666666667003,67,4460,0 +1275,1276,10471,0.298229429356,3.2616666666667005,74,2630,0 +1276,1277,10385,0.12923377484588,3.0044444444444003,44,2593,0 +1277,1278,10439,0.19609416059774,2.6741666666667,64,2625,0 +1278,1279,10516,0.04051853381938501,2.3191666666667,70,4834,0 +1279,1280,10587,0.07099894663641,2.0597222222222,96,4056,0 +1280,1281,10586,0.07584150637714701,2.0547222222222,110,5713,0 +1281,1282,10684,0.08180100127782801,2.1511111111111,68,3940,0 +1282,1283,10880,0.0,2.2602777777778,90,4414,0 +1283,1284,10830,0.0,2.2883333333333,90,5044,0 +1284,1285,10794,0.09140162014739302,2.3736111111111002,69,3894,0 +1285,1286,10843,0.0,2.5869444444444,46,3993,0 +1286,1287,10805,0.0,2.6480555555556,74,4404,0 
+1287,1288,10996,0.0,2.6077777777777995,68,4072,0 +1288,1289,11327,0.05363316840061,2.6069444444444,67,4182,0 +1289,1290,11090,0.26818151064716,2.6908333333332997,51,3351,0 +1290,1291,10578,0.21887772653901,2.9019444444444003,39,4183,0 +1291,1292,10528,0.32371296573811,3.2711111111111,26,4068,0 +1292,1293,10475,0.12565805017257,3.5872222222222,25,8139,0 +1293,1294,10664,0.092277247744574,3.6913888888888997,32,11000,0 +1294,1295,10513,0.077016875742983,3.6313888888888997,17,2975,0 +1295,1296,9072,0.37144807973125005,3.5605555555556,19,2692,0 +1296,1297,9069,0.19332372237792,3.4402777777778,16,2502,0 +1297,1298,9089,0.06345811641554701,3.35,28,2510,0 +1298,1299,9027,0.22671215594729996,3.3469444444444,24,2663,0 +1299,1300,8969,0.053072279964629,3.2708333333332997,35,3575,0 +1300,1301,9073,0.13336345197744,3.2519444444444,49,2586,0 +1301,1302,8957,0.1252855094715,2.7311111111111,106,2908,0 +1302,1303,9126,0.096211952864224,2.3875,80,3530,0 +1303,1304,9122,0.09652446751775501,2.0847222222222,90,2776,0 +1304,1305,9231,0.08924770147957402,2.0975,169,2962,0 +1305,1306,9368,0.11889606284161999,2.1763888888889,98,3441,0 +1306,1307,9458,0.031429841710104,2.2327777777777995,92,4376,0 +1307,1308,9463,0.0,2.2725,91,3857,0 +1308,1309,9356,0.036512411627867995,2.3202777777778,99,4685,0 +1309,1310,9340,0.0,2.5425,90,4585,0 +1310,1311,9340,0.0,2.5986111111111,126,3542,0 +1311,1312,9276,0.0,2.6319444444444,102,3370,0 +1312,1313,9611,0.10106696361212,2.5836111111111,132,3515,0 +1313,1314,9532,0.14854949043035,2.675,88,3793,0 +1314,1315,9156,0.08612162048398898,2.8522222222222,135,2954,0 +1315,1316,9222,0.16494200410492002,3.1302777777778,114,2627,0 +1316,1317,9282,0.28637713141253,3.4805555555556,35,2550,0 +1317,1318,9573,0.13206535647488,3.5994444444444,24,2480,0 +1318,1319,9333,0.27364025607799,3.5847222222222,44,2521,0 +1319,1320,9987,0.38382339961227,3.4963888888889,26,2860,0 +1320,1321,10133,0.08426242877623301,3.3825,37,3675,0 +1321,1322,10010,0.32904135680259006,3.2694444444444,45,2704,0 +1322,1323,10028,0.22632868808707998,3.2322222222222,42,3121,0 +1323,1324,9984,0.17914189971361,3.1936111111111005,47,2603,0 +1324,1325,10041,0.30046815361859003,3.0536111111111004,34,3984,0 +1325,1326,10072,0.22650915594248,2.7819444444444,56,2537,0 +1326,1327,10025,0.0,2.4152777777777996,87,3349,0 +1327,1328,10116,0.1223093269317,2.1569444444443997,74,3958,0 +1328,1329,10232,0.1696074188221,2.1125,90,4243,0 +1329,1330,10516,0.0,2.1833333333333003,79,4159,0 +1330,1331,10449,0.028193633007367002,2.205,97,5637,0 +1331,1332,10598,0.0,2.1697222222222,90,8142,0 +1332,1333,10337,0.0,2.3075,77,5713,0 +1333,1334,10469,0.097305232437507,2.4575,101,3668,0 +1334,1335,10426,0.11905908868378999,2.6077777777777995,74,4307,0 +1335,1336,10531,0.11660374103282001,2.6275,439,4354,0 +1336,1337,10875,0.06047429775658401,2.6144444444443997,79,4262,0 +1337,1338,10494,0.22568442027805,2.6477777777777995,165,3446,0 +1338,1339,10195,0.14077736537045002,2.8594444444444003,139,2677,0 +1339,1340,9918,0.1924574892026,3.2675,56,4450,0 +1340,1341,9889,0.18922597300629002,3.5136111111111004,102,3044,0 +1341,1342,9947,0.041593949118095004,3.5725,101,3428,0 +1342,1343,9977,0.2502095174271,3.6863888888889,41,2845,0 +1343,1344,10835,0.18663972932643,3.5636111111111,94,2781,0 +1344,1345,10765,0.07351854082400298,3.4127777777778,116,2743,0 +1345,1346,10656,0.081949111399618,3.295,94,4470,0 +1346,1347,10485,0.20148511394008997,3.2666666666667004,89,2596,0 +1347,1348,10681,0.11515101921294,3.1933333333332996,141,3249,0 
+1348,1349,10852,0.07797276382811,3.0688888888888997,167,2529,0 +1349,1350,10728,0.07244862879413201,2.8102777777778,148,2452,0 +1350,1351,10874,0.07310929970435699,2.42,105,2934,0 +1351,1352,10964,0.066868365737218,2.1358333333333,210,3159,0 +1352,1353,10984,0.057885125015937004,1.9916666666667,145,3974,0 +1353,1354,11055,0.09727414207464802,2.0947222222222,136,4305,0 +1354,1355,11233,0.033270317741557996,2.1591666666667,126,5012,0 +1355,1356,11161,0.0,2.2377777777778,157,4455,0 +1356,1357,10966,0.038270957919533,2.2511111111111,105,4108,0 +1357,1358,11193,0.08728058888363299,2.4208333333332996,114,4339,0 +1358,1359,11167,0.10536774813238,2.5241666666667,104,5056,0 +1359,1360,11367,0.1233991317089,2.5794444444443996,69,5573,0 +1360,1361,51251,0.042565915766552,2.5936111111111,75,3366,1 +1361,1362,17953,0.23147422367229,2.6830555555556,73,2559,1 +1362,1363,170029,0.08983405162538902,2.8188888888888997,74,1999,1 +1363,1364,10955,0.07464756469365201,2.9513888888888995,126,1993,0 +1364,1365,10984,0.09924410491893401,3.2830555555556,67,1913,0 +1365,1366,10964,0.11535172009194,3.4819444444444,32,1760,0 +1366,1367,10980,0.21774881707851998,3.5886111111111005,38,1890,0 +1367,1368,10852,0.1305066423559,3.4836111111111,34,2469,0 +1368,1369,10786,0.10054853030204,3.3955555555556,36,2133,0 +1369,1370,10841,0.02468393737575,3.2847222222222,26,3359,0 +1370,1371,10762,0.10018007414459,3.2383333333332995,74,3783,0 +1371,1372,10419,0.12522619841308,3.2188888888888996,85,1809,0 +1372,1373,10467,0.11781887197077001,2.9483333333333,67,2143,0 +1373,1374,10502,0.13417256350298,2.5855555555556,84,2567,0 +1374,1375,10519,0.07474686582090599,2.3005555555556003,1630,2176,0 +1375,1376,10579,0.13570963056519,2.0855555555556,1435,1929,0 +1376,1377,10502,0.076431907457478,1.9027777777777999,857,2244,0 +1377,1378,10661,0.0,1.9411111111111,31,1810,0 +1378,1379,10818,0.1936428046839,2.0444444444444,500,2088,0 +1379,1380,10918,0.05282677388968401,2.1363888888889,53,2371,0 +1380,1381,10871,0.0,2.22,61,1843,0 +1381,1382,10796,0.054466597481213,2.3530555555556,158,2668,0 +1382,1383,10774,0.057459020289436,2.545,184,2309,0 +1383,1384,10898,0.28750562005936,2.6202777777777997,91,1998,0 +1384,1385,11442,0.075538554674309,2.6847222222222,60,2480,0 +1385,1386,11113,0.08112608570492501,2.6591666666667004,107,2147,0 +1386,1387,10888,0.21563803296368,2.7863888888888995,5157,1802,0 +1387,1388,10894,0.09572500230568501,3.0269444444444003,28,1789,0 +1388,1389,10888,0.17516056892320997,3.3227777777778,24,1999,0 +1389,1390,10896,0.32902836018585996,3.6097222222222,21,2142,0 +1390,1391,10800,0.10216065221678,3.6805555555555998,12,1904,0 +1391,1392,11000,0.19741931250852,3.6075,24,1876,0 +1392,1393,10985,0.10149107903671001,3.4091666666667004,17,2434,0 +1393,1394,11017,0.17479255893624,3.3666666666667004,48,2472,0 +1394,1395,10863,0.034385029573777,3.3158333333332997,41,1744,0 +1395,1396,10875,0.21988771218053,3.1622222222222,1088,2404,0 +1396,1397,10987,0.10149107903671001,3.1086111111111,68,1971,0 +1397,1398,10778,0.10269981175444999,2.6552777777778,2575,1713,0 +1398,1399,10957,0.11258759940039,2.2730555555556,4688,1765,0 +1399,1400,10832,0.13022351806001,2.0591666666667,477,3156,0 diff --git a/datasets/anomaly/template/datasetDoc.json b/datasets/anomaly/template/datasetDoc.json new file mode 100644 index 0000000..0494777 --- /dev/null +++ b/datasets/anomaly/template/datasetDoc.json @@ -0,0 +1,183 @@ +{ + "about": { + "datasetID": "template", + "datasetName": "baseball", + "description": "Database of baseball players and play statistics, 
including 'Games_played', 'At_bats', 'Runs', 'Hits', 'Doubles', 'Triples', 'Home_runs', 'RBIs', 'Walks', 'Strikeouts', 'Batting_average', 'On_base_pct', 'Slugging_pct' and 'Fielding_ave'", + "citation": " @book{simonoff2003analyzing,title={Analyzing Categorical Data},author={Simonoff, J.S.},isbn={9780387007496},lccn={2003044946},series={Springer Texts in Statistics},url={https://books.google.com/books?id=G8wrifweAoC},year={2003},publisher={Springer New York}} ", + "license": " CC Public Domain Mark 1.0 ", + "source": "OpenML", + "sourceURI": "http://www.openml.org/d/185", + "approximateSize": "", + "datasetSchemaVersion": "4.0.0", + "redacted": false, + "datasetVersion": "4.0.0" + }, + "dataResources": [ + { + "resID": "learningData", + "resPath": "tables/learningData.csv", + "resType": "table", + "resFormat": { + "text/csv": [ + "csv" + ] + }, + "isCollection": false, + "columns": [ + { + "colIndex": 0, + "colName": "d3mIndex", + "colType": "integer", + "role": [ + "index" + ] + }, + { + "colIndex": 1, + "colName": "Player", + "colType": "categorical", + "role": [ + "attribute" + ] + }, + { + "colIndex": 2, + "colName": "Number_seasons", + "colType": "integer", + "role": [ + "attribute" + ] + }, + { + "colIndex": 3, + "colName": "Games_played", + "colType": "integer", + "role": [ + "attribute" + ] + }, + { + "colIndex": 4, + "colName": "At_bats", + "colType": "integer", + "role": [ + "attribute" + ] + }, + { + "colIndex": 5, + "colName": "Runs", + "colType": "integer", + "role": [ + "attribute" + ] + }, + { + "colIndex": 6, + "colName": "Hits", + "colType": "integer", + "role": [ + "attribute" + ] + }, + { + "colIndex": 7, + "colName": "Doubles", + "colType": "integer", + "role": [ + "attribute" + ] + }, + { + "colIndex": 8, + "colName": "Triples", + "colType": "integer", + "role": [ + "attribute" + ] + }, + { + "colIndex": 9, + "colName": "Home_runs", + "colType": "integer", + "role": [ + "attribute" + ] + }, + { + "colIndex": 10, + "colName": "RBIs", + "colType": "integer", + "role": [ + "attribute" + ] + }, + { + "colIndex": 11, + "colName": "Walks", + "colType": "integer", + "role": [ + "attribute" + ] + }, + { + "colIndex": 12, + "colName": "Strikeouts", + "colType": "integer", + "role": [ + "attribute" + ] + }, + { + "colIndex": 13, + "colName": "Batting_average", + "colType": "real", + "role": [ + "attribute" + ] + }, + { + "colIndex": 14, + "colName": "On_base_pct", + "colType": "real", + "role": [ + "attribute" + ] + }, + { + "colIndex": 15, + "colName": "Slugging_pct", + "colType": "real", + "role": [ + "attribute" + ] + }, + { + "colIndex": 16, + "colName": "Fielding_ave", + "colType": "real", + "role": [ + "attribute" + ] + }, + { + "colIndex": 17, + "colName": "Position", + "colType": "categorical", + "role": [ + "attribute" + ] + }, + { + "colIndex": 18, + "colName": "Hall_of_Fame", + "colType": "categorical", + "role": [ + "suggestedTarget" + ] + } + ], + "columnsCount": 19 + } + ] +} diff --git a/datasets/anomaly/template/problemDoc.json b/datasets/anomaly/template/problemDoc.json new file mode 100644 index 0000000..514a80a --- /dev/null +++ b/datasets/anomaly/template/problemDoc.json @@ -0,0 +1,65 @@ +{ + "about": { + "problemID": "template", + "problemName": "baseball_problem", + "problemDescription": "**Author**: Jeffrey S. Simonoff \n**Source**: [AnalCatData](http://www.stern.nyu.edu/~jsimonof/AnalCatData) - 2003 \n**Please cite**: Jeffrey S. 
Simonoff, Analyzing Categorical Data, Springer-Verlag, New York, 2003 \n \nDatabase of baseball players and play statistics, including 'Games_played', 'At_bats', 'Runs', 'Hits', 'Doubles', 'Triples', 'Home_runs', 'RBIs', 'Walks', 'Strikeouts', 'Batting_average', 'On_base_pct', 'Slugging_pct' and 'Fielding_ave' \n\nNotes: \n* Quotes, Single-Quotes and Backslashes were removed, Blanks replaced with Underscores\n* Player is an identifier that should be ignored when modelling the data", + "problemVersion": "4.0.0", + "problemSchemaVersion": "4.0.0", + "taskKeywords": [ + "classification", + "multiClass", + "tabular" + ] + }, + "inputs": { + "data": [ + { + "datasetID": "185_baseball_dataset", + "targets": [ + { + "targetIndex": 0, + "resID": "learningData", + "colIndex": 18, + "colName": "Hall_of_Fame" + } + ] + } + ], + "dataSplits": { + "method": "holdOut", + "testSize": 0.2, + "stratified": true, + "numRepeats": 0, + "randomSeed": 42, + "splitsFile": "dataSplits.csv", + "datasetViewMaps": { + "train": [ + { + "from": "185_baseball_dataset", + "to": "185_baseball_dataset_TRAIN" + } + ], + "test": [ + { + "from": "185_baseball_dataset", + "to": "185_baseball_dataset_TEST" + } + ], + "score": [ + { + "from": "185_baseball_dataset", + "to": "185_baseball_dataset_SCORE" + } + ] + } + }, + "performanceMetrics": [ + { + "metric": "f1Macro" + } + ] + }, + "expectedOutputs": { + "predictionsFile": "predictions.csv" + } +} diff --git a/datasets/anomaly/transform_kpi.py b/datasets/anomaly/transform_kpi.py new file mode 100644 index 0000000..e8bd319 --- /dev/null +++ b/datasets/anomaly/transform_kpi.py @@ -0,0 +1,160 @@ +# TODO: Wrap it as a class and connect it to GUI +# A script to transform anomaly data to d3m format +import pandas as pd +import numpy as np +import os +import json + +############################## +# Some information for the dataset to be transformed +# Designed for time series data +name = 'kpi' +src_path = './raw_data/kpi.csv' +label_name = 'label' +timestamp_name = 'timestamp' +value_names = ['value'] +ratio = 0.8 # Ratio of training data, the rest is for testing + +############################### + + + +dst_root = './' + name +dirs = ['./', 'SCORE', 'TEST', 'TRAIN'] +maps = {'./': None, 'SCORE': 'TEST', 'TEST': 'TEST', 'TRAIN': 'TRAIN'} + +# Create the corresponding directories +for d in dirs: + if maps[d] is not None: + dataset_name = 'dataset_' + maps[d] + problem_name = 'problem_' + maps[d] + else: + dataset_name = name + '_dataset' + problem_name = name + '_problem' + tables_dir = os.path.join(dst_root, d, dataset_name, 'tables') + if not os.path.exists(tables_dir): + os.makedirs(tables_dir) + problem_dir = os.path.join(dst_root, d, problem_name) + if not os.path.exists(problem_dir): + os.makedirs(problem_dir) + +# Process data +_df = pd.DataFrame() +df = pd.read_csv(src_path) +_df['d3mIndex'] = df.index +_df['timestamp'] = df[timestamp_name] +for value_name in value_names: + _df[value_name] = df[value_name] +_df['ground_truth'] = df[label_name] +df = _df +cols = df.columns.tolist() + +# Save all the data +df.to_csv(os.path.join(dst_root, name+'_dataset', 'tables', 'learningData.csv'), index=False) + +# Save training and testing data +train_df, test_df = df[:int(df.shape[0]*ratio)], df[int(df.shape[0]*ratio):] + +train_df.to_csv(os.path.join(dst_root, 'TRAIN', 'dataset_TRAIN', 'tables', 'learningData.csv'), index=False) +test_df.to_csv(os.path.join(dst_root, 'TEST', 'dataset_TEST', 'tables', 'learningData.csv'), index=False) +test_df.to_csv(os.path.join(dst_root, 'SCORE', 
'dataset_TEST', 'tables', 'learningData.csv'), index=False) + +# Data splits +row_0 = train_df.shape[0] +row_1 = test_df.shape[0] +row = row_0 + row_1 +df = pd.DataFrame(np.array([[i for i in range(row)], ['TRAIN' for _ in range(row_0)] + ['TEST' for _ in range(row_1)], [0 for _ in range(row)], [0 for _ in range(row)]]).transpose(), columns = ['d3mIndex', 'type', 'repeat', 'fold']) + +# Save data splits for all data +df.to_csv(os.path.join(dst_root, name+'_problem', 'dataSplits.csv'), index=False) + +# Save training and testing splits +train_df, test_df = df[:row_0], df[row_0:] +train_df.to_csv(os.path.join(dst_root, 'TRAIN', 'problem_TRAIN', 'dataSplits.csv'), index=False) +test_df.to_csv(os.path.join(dst_root, 'TEST', 'problem_TEST', 'dataSplits.csv'), index=False) +test_df.to_csv(os.path.join(dst_root, 'SCORE', 'problem_TEST', 'dataSplits.csv'), index=False) + + +# Dataset JSON files +# Load template +with open('template/datasetDoc.json') as json_file: + data = json.load(json_file) +columns = [] +for i in range(len(cols)): + c = {} + c['colIndex'] = i + c['colName'] = cols[i] + if i == 0: + c['colType'] = 'integer' + c['role'] = ['index'] + elif i == 1: + c['colType'] = 'integer' + c['role'] = ['attribute'] + elif i == len(cols)-1: + c['colType'] = 'integer' + c['role'] = ['suggestedTarget'] + else: + c['colType'] = 'real' + c['role'] = ['attribute'] + columns.append(c) +data['dataResources'][0]['columns'] = columns +data['dataResources'][0]['columnsCount'] = len(cols) + +data['about']['datasetID'] = name + '_dataset' +data['about']['datasetName'] = name +with open(os.path.join(dst_root, name+'_dataset', 'datasetDoc.json'), 'w') as outfile: + json.dump(data, outfile, indent=4) + +data['about']['datasetID'] = name +'_dataset_TRAIN' +data['about']['datasetName'] = "NULL" +with open(os.path.join(dst_root, 'TRAIN', 'dataset_TRAIN', 'datasetDoc.json'), 'w') as outfile: + json.dump(data, outfile, indent=4) + +data['about']['datasetID'] = name + '_dataset_TEST' +data['about']['datasetName'] = 'NULL' +with open(os.path.join(dst_root, 'TEST', 'dataset_TEST', 'datasetDoc.json'), 'w') as outfile: + json.dump(data, outfile, indent=4) + +data['about']['datasetID'] = name + '_dataset_TEST' +data['about']['datasetName'] = 'NULL' +with open(os.path.join(dst_root, 'SCORE', 'dataset_TEST', 'datasetDoc.json'), 'w') as outfile: + json.dump(data, outfile, indent=4) + +# Problem JSON files +# Load template +with open('template/problemDoc.json') as json_file: + data = json.load(json_file) + +data['about']['problemID'] = name+'_problem' +data['about']['problemName'] = name+'_problem' +data['about']['problemDescription'] = 'Anomaly detection' +data['about']['taskKeywords'] = ['classification', 'binary', 'tabular'] +data['inputs']['data'][0]['datasetID'] = name + '_dataset' +data['inputs']['data'][0]['targets'][0]['colIndex'] = len(cols)-1 +data['inputs']['data'][0]['targets'][0]['colName'] = cols[-1] +data['inputs']['dataSplits']['datasetViewMaps']['train'][0]['from'] = name+'_dataset' +data['inputs']['dataSplits']['datasetViewMaps']['test'][0]['from'] = name+'_dataset' +data['inputs']['dataSplits']['datasetViewMaps']['score'][0]['from'] = name+'_dataset' +data['inputs']['dataSplits']['datasetViewMaps']['train'][0]['to'] = name+'_dataset_TRAIN' +data['inputs']['dataSplits']['datasetViewMaps']['test'][0]['to'] = name+'_dataset_TEST' +data['inputs']['dataSplits']['datasetViewMaps']['score'][0]['to'] = name+'_dataset_SCORE' + +with open(os.path.join(dst_root, name+'_problem', 'problemDoc.json'), 'w') as
outfile: + json.dump(data, outfile, indent=4) + +with open(os.path.join(dst_root, 'TRAIN', 'problem_TRAIN', 'problemDoc.json'), 'w') as outfile: + json.dump(data, outfile, indent=4) + +with open(os.path.join(dst_root, 'TEST', 'problem_TEST', 'problemDoc.json'), 'w') as outfile: + json.dump(data, outfile, indent=4) + +with open(os.path.join(dst_root, 'SCORE', 'problem_TEST', 'problemDoc.json'), 'w') as outfile: + json.dump(data, outfile, indent=4) + +# Make an empty targets.csv +with open(os.path.join(dst_root, 'SCORE', 'targets.csv'), 'w') as outfile: + outfile.write('') + + + + diff --git a/datasets/anomaly/transform_yahoo.py b/datasets/anomaly/transform_yahoo.py new file mode 100644 index 0000000..3f4a7ab --- /dev/null +++ b/datasets/anomaly/transform_yahoo.py @@ -0,0 +1,160 @@ +# TODO: Wrap it as a class and connect it to GUI +# A script to transform anomaly data to d3m format +import pandas as pd +import numpy as np +import os +import json + +############################## +# Some information for the dataset to be transformed +# Designed for time series data +name = 'yahoo_sub_5' +src_path = './raw_data/yahoo_sub_5.csv' +label_name = 'is_anomaly' +timestamp_name = 'timestamp' +value_names = ['value_{}'.format(i) for i in range(5)] +ratio = 0.9 # Ratio of training data, the rest is for testing + +############################### + + + +dst_root = './' + name +dirs = ['./', 'SCORE', 'TEST', 'TRAIN'] +maps = {'./': None, 'SCORE': 'TEST', 'TEST': 'TEST', 'TRAIN': 'TRAIN'} + +# Create the corresponding directories +for d in dirs: + if maps[d] is not None: + dataset_name = 'dataset_' + maps[d] + problem_name = 'problem_' + maps[d] + else: + dataset_name = name + '_dataset' + problem_name = name + '_problem' + tables_dir = os.path.join(dst_root, d, dataset_name, 'tables') + if not os.path.exists(tables_dir): + os.makedirs(tables_dir) + problem_dir = os.path.join(dst_root, d, problem_name) + if not os.path.exists(problem_dir): + os.makedirs(problem_dir) + +# Process data +_df = pd.DataFrame() +df = pd.read_csv(src_path) +_df['d3mIndex'] = df.index +_df['timestamp'] = df[timestamp_name] +for value_name in value_names: + _df[value_name] = df[value_name] +_df['ground_truth'] = df[label_name] +df = _df +cols = df.columns.tolist() + +# Save all the data +df.to_csv(os.path.join(dst_root, name+'_dataset', 'tables', 'learningData.csv'), index=False) + +# Save training and testing data +train_df, test_df = df[:int(df.shape[0]*ratio)], df[int(df.shape[0]*ratio):] + +train_df.to_csv(os.path.join(dst_root, 'TRAIN', 'dataset_TRAIN', 'tables', 'learningData.csv'), index=False) +test_df.to_csv(os.path.join(dst_root, 'TEST', 'dataset_TEST', 'tables', 'learningData.csv'), index=False) +test_df.to_csv(os.path.join(dst_root, 'SCORE', 'dataset_TEST', 'tables', 'learningData.csv'), index=False) + +# Data splits +row_0 = train_df.shape[0] +row_1 = test_df.shape[0] +row = row_0 + row_1 +df = pd.DataFrame(np.array([[i for i in range(row)], ['TRAIN' for _ in range(row_0)] + ['TEST' for _ in range(row_1)], [0 for _ in range(row)], [0 for _ in range(row)]]).transpose(), columns = ['d3mIndex', 'type', 'repeat', 'fold']) + +# Save data splits for all data +df.to_csv(os.path.join(dst_root, name+'_problem', 'dataSplits.csv'), index=False) + +# Save training and testing splits +train_df, test_df = df[:row_0], df[row_0:] +train_df.to_csv(os.path.join(dst_root, 'TRAIN', 'problem_TRAIN', 'dataSplits.csv'), index=False) +test_df.to_csv(os.path.join(dst_root, 'TEST', 'problem_TEST', 'dataSplits.csv'), index=False)
+test_df.to_csv(os.path.join(dst_root, 'SCORE', 'problem_TEST', 'dataSplits.csv'), index=False) + + +# Dataset JSON files +# Load template +with open('template/datasetDoc.json') as json_file: + data = json.load(json_file) +columns = [] +for i in range(len(cols)): + c = {} + c['colIndex'] = i + c['colName'] = cols[i] + if i == 0: + c['colType'] = 'integer' + c['role'] = ['index'] + elif i == 1: + c['colType'] = 'integer' + c['role'] = ['attribute'] + elif i == len(cols)-1: + c['colType'] = 'integer' + c['role'] = ['suggestedTarget'] + else: + c['colType'] = 'real' + c['role'] = ['attribute'] + columns.append(c) +data['dataResources'][0]['columns'] = columns +data['dataResources'][0]['columnsCount'] = len(cols) + +data['about']['datasetID'] = name + '_dataset' +data['about']['datasetName'] = name +with open(os.path.join(dst_root, name+'_dataset', 'datasetDoc.json'), 'w') as outfile: + json.dump(data, outfile, indent=4) + +data['about']['datasetID'] = name +'_dataset_TRAIN' +data['about']['datasetName'] = "NULL" +with open(os.path.join(dst_root, 'TRAIN', 'dataset_TRAIN', 'datasetDoc.json'), 'w') as outfile: + json.dump(data, outfile, indent=4) + +data['about']['datasetID'] = name + '_dataset_TEST' +data['about']['datasetName'] = 'NULL' +with open(os.path.join(dst_root, 'TEST', 'dataset_TEST', 'datasetDoc.json'), 'w') as outfile: + json.dump(data, outfile, indent=4) + +data['about']['datasetID'] = name + '_dataset_TEST' +data['about']['datasetName'] = 'NULL' +with open(os.path.join(dst_root, 'SCORE', 'dataset_TEST', 'datasetDoc.json'), 'w') as outfile: + json.dump(data, outfile, indent=4) + +# Problem JSON files +# Load template +with open('template/problemDoc.json') as json_file: + data = json.load(json_file) + +data['about']['problemID'] = name+'_problem' +data['about']['problemName'] = name+'_problem' +data['about']['problemDescription'] = 'Anomaly detection' +data['about']['taskKeywords'] = ['classification', 'binary', 'tabular'] +data['inputs']['data'][0]['datasetID'] = name + '_dataset' +data['inputs']['data'][0]['targets'][0]['colIndex'] = len(cols)-1 +data['inputs']['data'][0]['targets'][0]['colName'] = cols[-1] +data['inputs']['dataSplits']['datasetViewMaps']['train'][0]['from'] = name+'_dataset' +data['inputs']['dataSplits']['datasetViewMaps']['test'][0]['from'] = name+'_dataset' +data['inputs']['dataSplits']['datasetViewMaps']['score'][0]['from'] = name+'_dataset' +data['inputs']['dataSplits']['datasetViewMaps']['train'][0]['to'] = name+'_dataset_TRAIN' +data['inputs']['dataSplits']['datasetViewMaps']['test'][0]['to'] = name+'_dataset_TEST' +data['inputs']['dataSplits']['datasetViewMaps']['score'][0]['to'] = name+'_dataset_SCORE' + +with open(os.path.join(dst_root, name+'_problem', 'problemDoc.json'), 'w') as outfile: + json.dump(data, outfile, indent=4) + +with open(os.path.join(dst_root, 'TRAIN', 'problem_TRAIN', 'problemDoc.json'), 'w') as outfile: + json.dump(data, outfile, indent=4) + +with open(os.path.join(dst_root, 'TEST', 'problem_TEST', 'problemDoc.json'), 'w') as outfile: + json.dump(data, outfile, indent=4) + +with open(os.path.join(dst_root, 'SCORE', 'problem_TEST', 'problemDoc.json'), 'w') as outfile: + json.dump(data, outfile, indent=4) + +# Make an empty targets.csv +with open(os.path.join(dst_root, 'SCORE', 'targets.csv'), 'w') as outfile: + outfile.write('') + + + + diff --git a/datasets/anomaly/yahoo_sub_5/SCORE/dataset_TEST/datasetDoc.json b/datasets/anomaly/yahoo_sub_5/SCORE/dataset_TEST/datasetDoc.json new file mode 100644 index 0000000..ff5dec4 --- /dev/null 
+++ b/datasets/anomaly/yahoo_sub_5/SCORE/dataset_TEST/datasetDoc.json @@ -0,0 +1,95 @@ +{ + "about": { + "datasetID": "yahoo_sub_5_dataset_TEST", + "datasetName": "NULL", + "description": "Database of baseball players and play statistics, including 'Games_played', 'At_bats', 'Runs', 'Hits', 'Doubles', 'Triples', 'Home_runs', 'RBIs', 'Walks', 'Strikeouts', 'Batting_average', 'On_base_pct', 'Slugging_pct' and 'Fielding_ave'", + "citation": " @book{simonoff2003analyzing,title={Analyzing Categorical Data},author={Simonoff, J.S.},isbn={9780387007496},lccn={2003044946},series={Springer Texts in Statistics},url={https://books.google.com/books?id=G8wrifweAoC},year={2003},publisher={Springer New York}} ", + "license": " CC Public Domain Mark 1.0 ", + "source": "OpenML", + "sourceURI": "http://www.openml.org/d/185", + "approximateSize": "", + "datasetSchemaVersion": "4.0.0", + "redacted": false, + "datasetVersion": "4.0.0" + }, + "dataResources": [ + { + "resID": "learningData", + "resPath": "tables/learningData.csv", + "resType": "table", + "resFormat": { + "text/csv": [ + "csv" + ] + }, + "isCollection": false, + "columns": [ + { + "colIndex": 0, + "colName": "d3mIndex", + "colType": "integer", + "role": [ + "index" + ] + }, + { + "colIndex": 1, + "colName": "timestamp", + "colType": "integer", + "role": [ + "attribute" + ] + }, + { + "colIndex": 2, + "colName": "value_0", + "colType": "real", + "role": [ + "attribute" + ] + }, + { + "colIndex": 3, + "colName": "value_1", + "colType": "real", + "role": [ + "attribute" + ] + }, + { + "colIndex": 4, + "colName": "value_2", + "colType": "real", + "role": [ + "attribute" + ] + }, + { + "colIndex": 5, + "colName": "value_3", + "colType": "real", + "role": [ + "attribute" + ] + }, + { + "colIndex": 6, + "colName": "value_4", + "colType": "real", + "role": [ + "attribute" + ] + }, + { + "colIndex": 7, + "colName": "ground_truth", + "colType": "integer", + "role": [ + "suggestedTarget" + ] + } + ], + "columnsCount": 8 + } + ] +} \ No newline at end of file diff --git a/datasets/anomaly/yahoo_sub_5/SCORE/dataset_TEST/tables/learningData.csv b/datasets/anomaly/yahoo_sub_5/SCORE/dataset_TEST/tables/learningData.csv new file mode 100644 index 0000000..e3d5131 --- /dev/null +++ b/datasets/anomaly/yahoo_sub_5/SCORE/dataset_TEST/tables/learningData.csv @@ -0,0 +1,141 @@ +d3mIndex,timestamp,value_0,value_1,value_2,value_3,value_4,ground_truth +1260,1261,7782,0.034280386319742985,2.5072222222222003,104,3119,0 +1261,1262,7829,0.039360296791109,2.5927777777778,82,3590,0 +1262,1263,7902,0.0,2.6894444444444,208,3893,0 +1263,1264,8039,0.038944065994356014,2.6291666666667,92,3264,0 +1264,1265,8350,0.18176011684739,2.6469444444444,53,3963,0 +1265,1266,8142,0.18521047165852,2.7461111111111003,65,2757,0 +1266,1267,7886,0.13079770999921,2.9363888888889,62,2306,0 +1267,1268,7743,0.13310058077443,3.2797222222222,73,2549,0 +1268,1269,7707,0.054750658073534006,3.5194444444444,84,2212,0 +1269,1270,7726,0.030588852697706,3.8130555555556,90,2286,0 +1270,1271,7717,0.12998124134227002,3.7941666666667,80,2979,0 +1271,1272,10331,0.09100057249197198,3.6086111111111,90,3158,0 +1272,1273,10515,0.19464543002904006,3.3858333333333,84,2645,0 +1273,1274,10415,0.22178651521516,3.3336111111111,34,3161,0 +1274,1275,10387,0.22983578430825,3.3116666666667003,67,4460,0 +1275,1276,10471,0.298229429356,3.2616666666667005,74,2630,0 +1276,1277,10385,0.12923377484588,3.0044444444444003,44,2593,0 +1277,1278,10439,0.19609416059774,2.6741666666667,64,2625,0 
+1278,1279,10516,0.040518533819385014,2.3191666666667,70,4834,0 +1279,1280,10587,0.07099894663641,2.0597222222222,96,4056,0 +1280,1281,10586,0.07584150637714701,2.0547222222222,110,5713,0 +1281,1282,10684,0.08180100127782801,2.1511111111111,68,3940,0 +1282,1283,10880,0.0,2.2602777777778,90,4414,0 +1283,1284,10830,0.0,2.2883333333333,90,5044,0 +1284,1285,10794,0.09140162014739303,2.3736111111111,69,3894,0 +1285,1286,10843,0.0,2.5869444444444,46,3993,0 +1286,1287,10805,0.0,2.6480555555556,74,4404,0 +1287,1288,10996,0.0,2.6077777777777995,68,4072,0 +1288,1289,11327,0.05363316840061,2.6069444444444,67,4182,0 +1289,1290,11090,0.26818151064716,2.6908333333333,51,3351,0 +1290,1291,10578,0.21887772653901,2.9019444444444003,39,4183,0 +1291,1292,10528,0.32371296573811,3.2711111111111,26,4068,0 +1292,1293,10475,0.12565805017257,3.5872222222222,25,8139,0 +1293,1294,10664,0.092277247744574,3.6913888888889,32,11000,0 +1294,1295,10513,0.077016875742983,3.6313888888889,17,2975,0 +1295,1296,9072,0.3714480797312501,3.5605555555556,19,2692,0 +1296,1297,9069,0.19332372237792,3.4402777777778,16,2502,0 +1297,1298,9089,0.06345811641554701,3.35,28,2510,0 +1298,1299,9027,0.2267121559473,3.3469444444444,24,2663,0 +1299,1300,8969,0.053072279964629,3.2708333333333,35,3575,0 +1300,1301,9073,0.13336345197744,3.2519444444444,49,2586,0 +1301,1302,8957,0.1252855094715,2.7311111111111,106,2908,0 +1302,1303,9126,0.096211952864224,2.3875,80,3530,0 +1303,1304,9122,0.096524467517755,2.0847222222222,90,2776,0 +1304,1305,9231,0.08924770147957402,2.0975,169,2962,0 +1305,1306,9368,0.11889606284162,2.1763888888889,98,3441,0 +1306,1307,9458,0.031429841710104,2.2327777777777995,92,4376,0 +1307,1308,9463,0.0,2.2725,91,3857,0 +1308,1309,9356,0.036512411627868,2.3202777777778,99,4685,0 +1309,1310,9340,0.0,2.5425,90,4585,0 +1310,1311,9340,0.0,2.5986111111111,126,3542,0 +1311,1312,9276,0.0,2.6319444444444,102,3370,0 +1312,1313,9611,0.10106696361212,2.5836111111111,132,3515,0 +1313,1314,9532,0.14854949043035,2.675,88,3793,0 +1314,1315,9156,0.08612162048398897,2.8522222222222,135,2954,0 +1315,1316,9222,0.16494200410492002,3.1302777777778,114,2627,0 +1316,1317,9282,0.28637713141253,3.4805555555556,35,2550,0 +1317,1318,9573,0.13206535647488,3.5994444444444,24,2480,0 +1318,1319,9333,0.27364025607799,3.5847222222222,44,2521,0 +1319,1320,9987,0.38382339961227,3.4963888888889,26,2860,0 +1320,1321,10133,0.08426242877623301,3.3825,37,3675,0 +1321,1322,10010,0.3290413568025901,3.2694444444444,45,2704,0 +1322,1323,10028,0.22632868808708,3.2322222222222,42,3121,0 +1323,1324,9984,0.17914189971361,3.1936111111111005,47,2603,0 +1324,1325,10041,0.30046815361859003,3.0536111111111004,34,3984,0 +1325,1326,10072,0.22650915594248,2.7819444444444,56,2537,0 +1326,1327,10025,0.0,2.4152777777778,87,3349,0 +1327,1328,10116,0.1223093269317,2.1569444444444,74,3958,0 +1328,1329,10232,0.1696074188221,2.1125,90,4243,0 +1329,1330,10516,0.0,2.1833333333333003,79,4159,0 +1330,1331,10449,0.028193633007367,2.205,97,5637,0 +1331,1332,10598,0.0,2.1697222222222,90,8142,0 +1332,1333,10337,0.0,2.3075,77,5713,0 +1333,1334,10469,0.097305232437507,2.4575,101,3668,0 +1334,1335,10426,0.11905908868379,2.6077777777777995,74,4307,0 +1335,1336,10531,0.11660374103282,2.6275,439,4354,0 +1336,1337,10875,0.060474297756584014,2.6144444444444,79,4262,0 +1337,1338,10494,0.22568442027805,2.6477777777777995,165,3446,0 +1338,1339,10195,0.14077736537045002,2.8594444444444003,139,2677,0 +1339,1340,9918,0.1924574892026,3.2675,56,4450,0 +1340,1341,9889,0.18922597300629,3.5136111111111004,102,3044,0 
+1341,1342,9947,0.041593949118095004,3.5725,101,3428,0 +1342,1343,9977,0.2502095174271,3.6863888888889,41,2845,0 +1343,1344,10835,0.18663972932643,3.5636111111111,94,2781,0 +1344,1345,10765,0.07351854082400297,3.4127777777778,116,2743,0 +1345,1346,10656,0.081949111399618,3.295,94,4470,0 +1346,1347,10485,0.20148511394009,3.2666666666667004,89,2596,0 +1347,1348,10681,0.11515101921294,3.1933333333333,141,3249,0 +1348,1349,10852,0.07797276382811,3.0688888888889,167,2529,0 +1349,1350,10728,0.07244862879413201,2.8102777777778,148,2452,0 +1350,1351,10874,0.07310929970435699,2.42,105,2934,0 +1351,1352,10964,0.066868365737218,2.1358333333333,210,3159,0 +1352,1353,10984,0.05788512501593701,1.9916666666667,145,3974,0 +1353,1354,11055,0.09727414207464803,2.0947222222222,136,4305,0 +1354,1355,11233,0.033270317741558,2.1591666666667,126,5012,0 +1355,1356,11161,0.0,2.2377777777778,157,4455,0 +1356,1357,10966,0.038270957919533,2.2511111111111,105,4108,0 +1357,1358,11193,0.08728058888363299,2.4208333333333,114,4339,0 +1358,1359,11167,0.10536774813238,2.5241666666667,104,5056,0 +1359,1360,11367,0.1233991317089,2.5794444444444,69,5573,0 +1360,1361,51251,0.042565915766552,2.5936111111111,75,3366,1 +1361,1362,17953,0.23147422367229,2.6830555555556,73,2559,1 +1362,1363,170029,0.08983405162538903,2.8188888888889,74,1999,1 +1363,1364,10955,0.07464756469365201,2.9513888888888995,126,1993,0 +1364,1365,10984,0.099244104918934,3.2830555555556,67,1913,0 +1365,1366,10964,0.11535172009194,3.4819444444444,32,1760,0 +1366,1367,10980,0.21774881707852,3.5886111111111005,38,1890,0 +1367,1368,10852,0.1305066423559,3.4836111111111,34,2469,0 +1368,1369,10786,0.10054853030204,3.3955555555556,36,2133,0 +1369,1370,10841,0.02468393737575,3.2847222222222,26,3359,0 +1370,1371,10762,0.10018007414459,3.2383333333332995,74,3783,0 +1371,1372,10419,0.12522619841308,3.2188888888889,85,1809,0 +1372,1373,10467,0.11781887197077,2.9483333333333,67,2143,0 +1373,1374,10502,0.13417256350298,2.5855555555556,84,2567,0 +1374,1375,10519,0.07474686582090599,2.3005555555556003,1630,2176,0 +1375,1376,10579,0.13570963056519,2.0855555555556,1435,1929,0 +1376,1377,10502,0.076431907457478,1.9027777777778,857,2244,0 +1377,1378,10661,0.0,1.9411111111111,31,1810,0 +1378,1379,10818,0.1936428046839,2.0444444444444,500,2088,0 +1379,1380,10918,0.052826773889684014,2.1363888888889,53,2371,0 +1380,1381,10871,0.0,2.22,61,1843,0 +1381,1382,10796,0.054466597481213,2.3530555555556,158,2668,0 +1382,1383,10774,0.057459020289436,2.545,184,2309,0 +1383,1384,10898,0.28750562005936,2.6202777777778,91,1998,0 +1384,1385,11442,0.075538554674309,2.6847222222222,60,2480,0 +1385,1386,11113,0.08112608570492501,2.6591666666667004,107,2147,0 +1386,1387,10888,0.21563803296368,2.7863888888888995,5157,1802,0 +1387,1388,10894,0.095725002305685,3.0269444444444003,28,1789,0 +1388,1389,10888,0.17516056892320994,3.3227777777778,24,1999,0 +1389,1390,10896,0.32902836018586,3.6097222222222,21,2142,0 +1390,1391,10800,0.10216065221678,3.6805555555556,12,1904,0 +1391,1392,11000,0.19741931250852,3.6075,24,1876,0 +1392,1393,10985,0.10149107903671,3.4091666666667004,17,2434,0 +1393,1394,11017,0.17479255893624,3.3666666666667004,48,2472,0 +1394,1395,10863,0.034385029573777,3.3158333333333,41,1744,0 +1395,1396,10875,0.21988771218053,3.1622222222222,1088,2404,0 +1396,1397,10987,0.10149107903671,3.1086111111111,68,1971,0 +1397,1398,10778,0.10269981175445,2.6552777777778,2575,1713,0 +1398,1399,10957,0.11258759940039,2.2730555555556,4688,1765,0 +1399,1400,10832,0.13022351806001,2.0591666666667,477,3156,0 
diff --git a/datasets/anomaly/yahoo_sub_5/SCORE/problem_TEST/dataSplits.csv b/datasets/anomaly/yahoo_sub_5/SCORE/problem_TEST/dataSplits.csv new file mode 100644 index 0000000..c72d454 --- /dev/null +++ b/datasets/anomaly/yahoo_sub_5/SCORE/problem_TEST/dataSplits.csv @@ -0,0 +1,1261 @@ +d3mIndex,type,repeat,fold +1260,TEST,0,0 +1261,TEST,0,0 +1262,TEST,0,0 +1263,TEST,0,0 +1264,TEST,0,0 +1265,TEST,0,0 +1266,TEST,0,0 +1267,TEST,0,0 +1268,TEST,0,0 +1269,TEST,0,0 +1270,TEST,0,0 +1271,TEST,0,0 +1272,TEST,0,0 +1273,TEST,0,0 +1274,TEST,0,0 +1275,TEST,0,0 +1276,TEST,0,0 +1277,TEST,0,0 +1278,TEST,0,0 +1279,TEST,0,0 +1280,TEST,0,0 +1281,TEST,0,0 +1282,TEST,0,0 +1283,TEST,0,0 +1284,TEST,0,0 +1285,TEST,0,0 +1286,TEST,0,0 +1287,TEST,0,0 +1288,TEST,0,0 +1289,TEST,0,0 +1290,TEST,0,0 +1291,TEST,0,0 +1292,TEST,0,0 +1293,TEST,0,0 +1294,TEST,0,0 +1295,TEST,0,0 +1296,TEST,0,0 +1297,TEST,0,0 +1298,TEST,0,0 +1299,TEST,0,0 +1300,TEST,0,0 +1301,TEST,0,0 +1302,TEST,0,0 +1303,TEST,0,0 +1304,TEST,0,0 +1305,TEST,0,0 +1306,TEST,0,0 +1307,TEST,0,0 +1308,TEST,0,0 +1309,TEST,0,0 +1310,TEST,0,0 +1311,TEST,0,0 +1312,TEST,0,0 +1313,TEST,0,0 +1314,TEST,0,0 +1315,TEST,0,0 +1316,TEST,0,0 +1317,TEST,0,0 +1318,TEST,0,0 +1319,TEST,0,0 +1320,TEST,0,0 +1321,TEST,0,0 +1322,TEST,0,0 +1323,TEST,0,0 +1324,TEST,0,0 +1325,TEST,0,0 +1326,TEST,0,0 +1327,TEST,0,0 +1328,TEST,0,0 +1329,TEST,0,0 +1330,TEST,0,0 +1331,TEST,0,0 +1332,TEST,0,0 +1333,TEST,0,0 +1334,TEST,0,0 +1335,TEST,0,0 +1336,TEST,0,0 +1337,TEST,0,0 +1338,TEST,0,0 +1339,TEST,0,0 +1340,TEST,0,0 +1341,TEST,0,0 +1342,TEST,0,0 +1343,TEST,0,0 +1344,TEST,0,0 +1345,TEST,0,0 +1346,TEST,0,0 +1347,TEST,0,0 +1348,TEST,0,0 +1349,TEST,0,0 +1350,TEST,0,0 +1351,TEST,0,0 +1352,TEST,0,0 +1353,TEST,0,0 +1354,TEST,0,0 +1355,TEST,0,0 +1356,TEST,0,0 +1357,TEST,0,0 +1358,TEST,0,0 +1359,TEST,0,0 +1360,TEST,0,0 +1361,TEST,0,0 +1362,TEST,0,0 +1363,TEST,0,0 +1364,TEST,0,0 +1365,TEST,0,0 +1366,TEST,0,0 +1367,TEST,0,0 +1368,TEST,0,0 +1369,TEST,0,0 +1370,TEST,0,0 +1371,TEST,0,0 +1372,TEST,0,0 +1373,TEST,0,0 +1374,TEST,0,0 +1375,TEST,0,0 +1376,TEST,0,0 +1377,TEST,0,0 +1378,TEST,0,0 +1379,TEST,0,0 +1380,TEST,0,0 +1381,TEST,0,0 +1382,TEST,0,0 +1383,TEST,0,0 +1384,TEST,0,0 +1385,TEST,0,0 +1386,TEST,0,0 +1387,TEST,0,0 +1388,TEST,0,0 +1389,TEST,0,0 +1390,TEST,0,0 +1391,TEST,0,0 +1392,TEST,0,0 +1393,TEST,0,0 +1394,TEST,0,0 +1395,TEST,0,0 +1396,TEST,0,0 +1397,TEST,0,0 +1398,TEST,0,0 +1399,TEST,0,0 +1400,TEST,0,0 +1401,TEST,0,0 +1402,TEST,0,0 +1403,TEST,0,0 +1404,TEST,0,0 +1405,TEST,0,0 +1406,TEST,0,0 +1407,TEST,0,0 +1408,TEST,0,0 +1409,TEST,0,0 +1410,TEST,0,0 +1411,TEST,0,0 +1412,TEST,0,0 +1413,TEST,0,0 +1414,TEST,0,0 +1415,TEST,0,0 +1416,TEST,0,0 +1417,TEST,0,0 +1418,TEST,0,0 +1419,TEST,0,0 +1420,TEST,0,0 +1421,TEST,0,0 +1422,TEST,0,0 +1423,TEST,0,0 +1424,TEST,0,0 +1425,TEST,0,0 +1426,TEST,0,0 +1427,TEST,0,0 +1428,TEST,0,0 +1429,TEST,0,0 +1430,TEST,0,0 +1431,TEST,0,0 +1432,TEST,0,0 +1433,TEST,0,0 +1434,TEST,0,0 +1435,TEST,0,0 +1436,TEST,0,0 +1437,TEST,0,0 +1438,TEST,0,0 +1439,TEST,0,0 +1440,TEST,0,0 +1441,TEST,0,0 +1442,TEST,0,0 +1443,TEST,0,0 +1444,TEST,0,0 +1445,TEST,0,0 +1446,TEST,0,0 +1447,TEST,0,0 +1448,TEST,0,0 +1449,TEST,0,0 +1450,TEST,0,0 +1451,TEST,0,0 +1452,TEST,0,0 +1453,TEST,0,0 +1454,TEST,0,0 +1455,TEST,0,0 +1456,TEST,0,0 +1457,TEST,0,0 +1458,TEST,0,0 +1459,TEST,0,0 +1460,TEST,0,0 +1461,TEST,0,0 +1462,TEST,0,0 +1463,TEST,0,0 +1464,TEST,0,0 +1465,TEST,0,0 +1466,TEST,0,0 +1467,TEST,0,0 +1468,TEST,0,0 +1469,TEST,0,0 +1470,TEST,0,0 +1471,TEST,0,0 +1472,TEST,0,0 +1473,TEST,0,0 +1474,TEST,0,0 +1475,TEST,0,0 
+1476,TEST,0,0 +1477,TEST,0,0 +1478,TEST,0,0 +1479,TEST,0,0 +1480,TEST,0,0 +1481,TEST,0,0 +1482,TEST,0,0 +1483,TEST,0,0 +1484,TEST,0,0 +1485,TEST,0,0 +1486,TEST,0,0 +1487,TEST,0,0 +1488,TEST,0,0 +1489,TEST,0,0 +1490,TEST,0,0 +1491,TEST,0,0 +1492,TEST,0,0 +1493,TEST,0,0 +1494,TEST,0,0 +1495,TEST,0,0 +1496,TEST,0,0 +1497,TEST,0,0 +1498,TEST,0,0 +1499,TEST,0,0 +1500,TEST,0,0 +1501,TEST,0,0 +1502,TEST,0,0 +1503,TEST,0,0 +1504,TEST,0,0 +1505,TEST,0,0 +1506,TEST,0,0 +1507,TEST,0,0 +1508,TEST,0,0 +1509,TEST,0,0 +1510,TEST,0,0 +1511,TEST,0,0 +1512,TEST,0,0 +1513,TEST,0,0 +1514,TEST,0,0 +1515,TEST,0,0 +1516,TEST,0,0 +1517,TEST,0,0 +1518,TEST,0,0 +1519,TEST,0,0 +1520,TEST,0,0 +1521,TEST,0,0 +1522,TEST,0,0 +1523,TEST,0,0 +1524,TEST,0,0 +1525,TEST,0,0 +1526,TEST,0,0 +1527,TEST,0,0 +1528,TEST,0,0 +1529,TEST,0,0 +1530,TEST,0,0 +1531,TEST,0,0 +1532,TEST,0,0 +1533,TEST,0,0 +1534,TEST,0,0 +1535,TEST,0,0 +1536,TEST,0,0 +1537,TEST,0,0 +1538,TEST,0,0 +1539,TEST,0,0 +1540,TEST,0,0 +1541,TEST,0,0 +1542,TEST,0,0 +1543,TEST,0,0 +1544,TEST,0,0 +1545,TEST,0,0 +1546,TEST,0,0 +1547,TEST,0,0 +1548,TEST,0,0 +1549,TEST,0,0 +1550,TEST,0,0 +1551,TEST,0,0 +1552,TEST,0,0 +1553,TEST,0,0 +1554,TEST,0,0 +1555,TEST,0,0 +1556,TEST,0,0 +1557,TEST,0,0 +1558,TEST,0,0 +1559,TEST,0,0 +1560,TEST,0,0 +1561,TEST,0,0 +1562,TEST,0,0 +1563,TEST,0,0 +1564,TEST,0,0 +1565,TEST,0,0 +1566,TEST,0,0 +1567,TEST,0,0 +1568,TEST,0,0 +1569,TEST,0,0 +1570,TEST,0,0 +1571,TEST,0,0 +1572,TEST,0,0 +1573,TEST,0,0 +1574,TEST,0,0 +1575,TEST,0,0 +1576,TEST,0,0 +1577,TEST,0,0 +1578,TEST,0,0 +1579,TEST,0,0 +1580,TEST,0,0 +1581,TEST,0,0 +1582,TEST,0,0 +1583,TEST,0,0 +1584,TEST,0,0 +1585,TEST,0,0 +1586,TEST,0,0 +1587,TEST,0,0 +1588,TEST,0,0 +1589,TEST,0,0 +1590,TEST,0,0 +1591,TEST,0,0 +1592,TEST,0,0 +1593,TEST,0,0 +1594,TEST,0,0 +1595,TEST,0,0 +1596,TEST,0,0 +1597,TEST,0,0 +1598,TEST,0,0 +1599,TEST,0,0 +1600,TEST,0,0 +1601,TEST,0,0 +1602,TEST,0,0 +1603,TEST,0,0 +1604,TEST,0,0 +1605,TEST,0,0 +1606,TEST,0,0 +1607,TEST,0,0 +1608,TEST,0,0 +1609,TEST,0,0 +1610,TEST,0,0 +1611,TEST,0,0 +1612,TEST,0,0 +1613,TEST,0,0 +1614,TEST,0,0 +1615,TEST,0,0 +1616,TEST,0,0 +1617,TEST,0,0 +1618,TEST,0,0 +1619,TEST,0,0 +1620,TEST,0,0 +1621,TEST,0,0 +1622,TEST,0,0 +1623,TEST,0,0 +1624,TEST,0,0 +1625,TEST,0,0 +1626,TEST,0,0 +1627,TEST,0,0 +1628,TEST,0,0 +1629,TEST,0,0 +1630,TEST,0,0 +1631,TEST,0,0 +1632,TEST,0,0 +1633,TEST,0,0 +1634,TEST,0,0 +1635,TEST,0,0 +1636,TEST,0,0 +1637,TEST,0,0 +1638,TEST,0,0 +1639,TEST,0,0 +1640,TEST,0,0 +1641,TEST,0,0 +1642,TEST,0,0 +1643,TEST,0,0 +1644,TEST,0,0 +1645,TEST,0,0 +1646,TEST,0,0 +1647,TEST,0,0 +1648,TEST,0,0 +1649,TEST,0,0 +1650,TEST,0,0 +1651,TEST,0,0 +1652,TEST,0,0 +1653,TEST,0,0 +1654,TEST,0,0 +1655,TEST,0,0 +1656,TEST,0,0 +1657,TEST,0,0 +1658,TEST,0,0 +1659,TEST,0,0 +1660,TEST,0,0 +1661,TEST,0,0 +1662,TEST,0,0 +1663,TEST,0,0 +1664,TEST,0,0 +1665,TEST,0,0 +1666,TEST,0,0 +1667,TEST,0,0 +1668,TEST,0,0 +1669,TEST,0,0 +1670,TEST,0,0 +1671,TEST,0,0 +1672,TEST,0,0 +1673,TEST,0,0 +1674,TEST,0,0 +1675,TEST,0,0 +1676,TEST,0,0 +1677,TEST,0,0 +1678,TEST,0,0 +1679,TEST,0,0 +1680,TEST,0,0 +1681,TEST,0,0 +1682,TEST,0,0 +1683,TEST,0,0 +1684,TEST,0,0 +1685,TEST,0,0 +1686,TEST,0,0 +1687,TEST,0,0 +1688,TEST,0,0 +1689,TEST,0,0 +1690,TEST,0,0 +1691,TEST,0,0 +1692,TEST,0,0 +1693,TEST,0,0 +1694,TEST,0,0 +1695,TEST,0,0 +1696,TEST,0,0 +1697,TEST,0,0 +1698,TEST,0,0 +1699,TEST,0,0 +1700,TEST,0,0 +1701,TEST,0,0 +1702,TEST,0,0 +1703,TEST,0,0 +1704,TEST,0,0 +1705,TEST,0,0 +1706,TEST,0,0 +1707,TEST,0,0 +1708,TEST,0,0 +1709,TEST,0,0 +1710,TEST,0,0 +1711,TEST,0,0 +1712,TEST,0,0 
+1713,TEST,0,0 +1714,TEST,0,0 +1715,TEST,0,0 +1716,TEST,0,0 +1717,TEST,0,0 +1718,TEST,0,0 +1719,TEST,0,0 +1720,TEST,0,0 +1721,TEST,0,0 +1722,TEST,0,0 +1723,TEST,0,0 +1724,TEST,0,0 +1725,TEST,0,0 +1726,TEST,0,0 +1727,TEST,0,0 +1728,TEST,0,0 +1729,TEST,0,0 +1730,TEST,0,0 +1731,TEST,0,0 +1732,TEST,0,0 +1733,TEST,0,0 +1734,TEST,0,0 +1735,TEST,0,0 +1736,TEST,0,0 +1737,TEST,0,0 +1738,TEST,0,0 +1739,TEST,0,0 +1740,TEST,0,0 +1741,TEST,0,0 +1742,TEST,0,0 +1743,TEST,0,0 +1744,TEST,0,0 +1745,TEST,0,0 +1746,TEST,0,0 +1747,TEST,0,0 +1748,TEST,0,0 +1749,TEST,0,0 +1750,TEST,0,0 +1751,TEST,0,0 +1752,TEST,0,0 +1753,TEST,0,0 +1754,TEST,0,0 +1755,TEST,0,0 +1756,TEST,0,0 +1757,TEST,0,0 +1758,TEST,0,0 +1759,TEST,0,0 +1760,TEST,0,0 +1761,TEST,0,0 +1762,TEST,0,0 +1763,TEST,0,0 +1764,TEST,0,0 +1765,TEST,0,0 +1766,TEST,0,0 +1767,TEST,0,0 +1768,TEST,0,0 +1769,TEST,0,0 +1770,TEST,0,0 +1771,TEST,0,0 +1772,TEST,0,0 +1773,TEST,0,0 +1774,TEST,0,0 +1775,TEST,0,0 +1776,TEST,0,0 +1777,TEST,0,0 +1778,TEST,0,0 +1779,TEST,0,0 +1780,TEST,0,0 +1781,TEST,0,0 +1782,TEST,0,0 +1783,TEST,0,0 +1784,TEST,0,0 +1785,TEST,0,0 +1786,TEST,0,0 +1787,TEST,0,0 +1788,TEST,0,0 +1789,TEST,0,0 +1790,TEST,0,0 +1791,TEST,0,0 +1792,TEST,0,0 +1793,TEST,0,0 +1794,TEST,0,0 +1795,TEST,0,0 +1796,TEST,0,0 +1797,TEST,0,0 +1798,TEST,0,0 +1799,TEST,0,0 +1800,TEST,0,0 +1801,TEST,0,0 +1802,TEST,0,0 +1803,TEST,0,0 +1804,TEST,0,0 +1805,TEST,0,0 +1806,TEST,0,0 +1807,TEST,0,0 +1808,TEST,0,0 +1809,TEST,0,0 +1810,TEST,0,0 +1811,TEST,0,0 +1812,TEST,0,0 +1813,TEST,0,0 +1814,TEST,0,0 +1815,TEST,0,0 +1816,TEST,0,0 +1817,TEST,0,0 +1818,TEST,0,0 +1819,TEST,0,0 +1820,TEST,0,0 +1821,TEST,0,0 +1822,TEST,0,0 +1823,TEST,0,0 +1824,TEST,0,0 +1825,TEST,0,0 +1826,TEST,0,0 +1827,TEST,0,0 +1828,TEST,0,0 +1829,TEST,0,0 +1830,TEST,0,0 +1831,TEST,0,0 +1832,TEST,0,0 +1833,TEST,0,0 +1834,TEST,0,0 +1835,TEST,0,0 +1836,TEST,0,0 +1837,TEST,0,0 +1838,TEST,0,0 +1839,TEST,0,0 +1840,TEST,0,0 +1841,TEST,0,0 +1842,TEST,0,0 +1843,TEST,0,0 +1844,TEST,0,0 +1845,TEST,0,0 +1846,TEST,0,0 +1847,TEST,0,0 +1848,TEST,0,0 +1849,TEST,0,0 +1850,TEST,0,0 +1851,TEST,0,0 +1852,TEST,0,0 +1853,TEST,0,0 +1854,TEST,0,0 +1855,TEST,0,0 +1856,TEST,0,0 +1857,TEST,0,0 +1858,TEST,0,0 +1859,TEST,0,0 +1860,TEST,0,0 +1861,TEST,0,0 +1862,TEST,0,0 +1863,TEST,0,0 +1864,TEST,0,0 +1865,TEST,0,0 +1866,TEST,0,0 +1867,TEST,0,0 +1868,TEST,0,0 +1869,TEST,0,0 +1870,TEST,0,0 +1871,TEST,0,0 +1872,TEST,0,0 +1873,TEST,0,0 +1874,TEST,0,0 +1875,TEST,0,0 +1876,TEST,0,0 +1877,TEST,0,0 +1878,TEST,0,0 +1879,TEST,0,0 +1880,TEST,0,0 +1881,TEST,0,0 +1882,TEST,0,0 +1883,TEST,0,0 +1884,TEST,0,0 +1885,TEST,0,0 +1886,TEST,0,0 +1887,TEST,0,0 +1888,TEST,0,0 +1889,TEST,0,0 +1890,TEST,0,0 +1891,TEST,0,0 +1892,TEST,0,0 +1893,TEST,0,0 +1894,TEST,0,0 +1895,TEST,0,0 +1896,TEST,0,0 +1897,TEST,0,0 +1898,TEST,0,0 +1899,TEST,0,0 +1900,TEST,0,0 +1901,TEST,0,0 +1902,TEST,0,0 +1903,TEST,0,0 +1904,TEST,0,0 +1905,TEST,0,0 +1906,TEST,0,0 +1907,TEST,0,0 +1908,TEST,0,0 +1909,TEST,0,0 +1910,TEST,0,0 +1911,TEST,0,0 +1912,TEST,0,0 +1913,TEST,0,0 +1914,TEST,0,0 +1915,TEST,0,0 +1916,TEST,0,0 +1917,TEST,0,0 +1918,TEST,0,0 +1919,TEST,0,0 +1920,TEST,0,0 +1921,TEST,0,0 +1922,TEST,0,0 +1923,TEST,0,0 +1924,TEST,0,0 +1925,TEST,0,0 +1926,TEST,0,0 +1927,TEST,0,0 +1928,TEST,0,0 +1929,TEST,0,0 +1930,TEST,0,0 +1931,TEST,0,0 +1932,TEST,0,0 +1933,TEST,0,0 +1934,TEST,0,0 +1935,TEST,0,0 +1936,TEST,0,0 +1937,TEST,0,0 +1938,TEST,0,0 +1939,TEST,0,0 +1940,TEST,0,0 +1941,TEST,0,0 +1942,TEST,0,0 +1943,TEST,0,0 +1944,TEST,0,0 +1945,TEST,0,0 +1946,TEST,0,0 +1947,TEST,0,0 +1948,TEST,0,0 +1949,TEST,0,0 
+1950,TEST,0,0 +1951,TEST,0,0 +1952,TEST,0,0 +1953,TEST,0,0 +1954,TEST,0,0 +1955,TEST,0,0 +1956,TEST,0,0 +1957,TEST,0,0 +1958,TEST,0,0 +1959,TEST,0,0 +1960,TEST,0,0 +1961,TEST,0,0 +1962,TEST,0,0 +1963,TEST,0,0 +1964,TEST,0,0 +1965,TEST,0,0 +1966,TEST,0,0 +1967,TEST,0,0 +1968,TEST,0,0 +1969,TEST,0,0 +1970,TEST,0,0 +1971,TEST,0,0 +1972,TEST,0,0 +1973,TEST,0,0 +1974,TEST,0,0 +1975,TEST,0,0 +1976,TEST,0,0 +1977,TEST,0,0 +1978,TEST,0,0 +1979,TEST,0,0 +1980,TEST,0,0 +1981,TEST,0,0 +1982,TEST,0,0 +1983,TEST,0,0 +1984,TEST,0,0 +1985,TEST,0,0 +1986,TEST,0,0 +1987,TEST,0,0 +1988,TEST,0,0 +1989,TEST,0,0 +1990,TEST,0,0 +1991,TEST,0,0 +1992,TEST,0,0 +1993,TEST,0,0 +1994,TEST,0,0 +1995,TEST,0,0 +1996,TEST,0,0 +1997,TEST,0,0 +1998,TEST,0,0 +1999,TEST,0,0 +2000,TEST,0,0 +2001,TEST,0,0 +2002,TEST,0,0 +2003,TEST,0,0 +2004,TEST,0,0 +2005,TEST,0,0 +2006,TEST,0,0 +2007,TEST,0,0 +2008,TEST,0,0 +2009,TEST,0,0 +2010,TEST,0,0 +2011,TEST,0,0 +2012,TEST,0,0 +2013,TEST,0,0 +2014,TEST,0,0 +2015,TEST,0,0 +2016,TEST,0,0 +2017,TEST,0,0 +2018,TEST,0,0 +2019,TEST,0,0 +2020,TEST,0,0 +2021,TEST,0,0 +2022,TEST,0,0 +2023,TEST,0,0 +2024,TEST,0,0 +2025,TEST,0,0 +2026,TEST,0,0 +2027,TEST,0,0 +2028,TEST,0,0 +2029,TEST,0,0 +2030,TEST,0,0 +2031,TEST,0,0 +2032,TEST,0,0 +2033,TEST,0,0 +2034,TEST,0,0 +2035,TEST,0,0 +2036,TEST,0,0 +2037,TEST,0,0 +2038,TEST,0,0 +2039,TEST,0,0 +2040,TEST,0,0 +2041,TEST,0,0 +2042,TEST,0,0 +2043,TEST,0,0 +2044,TEST,0,0 +2045,TEST,0,0 +2046,TEST,0,0 +2047,TEST,0,0 +2048,TEST,0,0 +2049,TEST,0,0 +2050,TEST,0,0 +2051,TEST,0,0 +2052,TEST,0,0 +2053,TEST,0,0 +2054,TEST,0,0 +2055,TEST,0,0 +2056,TEST,0,0 +2057,TEST,0,0 +2058,TEST,0,0 +2059,TEST,0,0 +2060,TEST,0,0 +2061,TEST,0,0 +2062,TEST,0,0 +2063,TEST,0,0 +2064,TEST,0,0 +2065,TEST,0,0 +2066,TEST,0,0 +2067,TEST,0,0 +2068,TEST,0,0 +2069,TEST,0,0 +2070,TEST,0,0 +2071,TEST,0,0 +2072,TEST,0,0 +2073,TEST,0,0 +2074,TEST,0,0 +2075,TEST,0,0 +2076,TEST,0,0 +2077,TEST,0,0 +2078,TEST,0,0 +2079,TEST,0,0 +2080,TEST,0,0 +2081,TEST,0,0 +2082,TEST,0,0 +2083,TEST,0,0 +2084,TEST,0,0 +2085,TEST,0,0 +2086,TEST,0,0 +2087,TEST,0,0 +2088,TEST,0,0 +2089,TEST,0,0 +2090,TEST,0,0 +2091,TEST,0,0 +2092,TEST,0,0 +2093,TEST,0,0 +2094,TEST,0,0 +2095,TEST,0,0 +2096,TEST,0,0 +2097,TEST,0,0 +2098,TEST,0,0 +2099,TEST,0,0 +2100,TEST,0,0 +2101,TEST,0,0 +2102,TEST,0,0 +2103,TEST,0,0 +2104,TEST,0,0 +2105,TEST,0,0 +2106,TEST,0,0 +2107,TEST,0,0 +2108,TEST,0,0 +2109,TEST,0,0 +2110,TEST,0,0 +2111,TEST,0,0 +2112,TEST,0,0 +2113,TEST,0,0 +2114,TEST,0,0 +2115,TEST,0,0 +2116,TEST,0,0 +2117,TEST,0,0 +2118,TEST,0,0 +2119,TEST,0,0 +2120,TEST,0,0 +2121,TEST,0,0 +2122,TEST,0,0 +2123,TEST,0,0 +2124,TEST,0,0 +2125,TEST,0,0 +2126,TEST,0,0 +2127,TEST,0,0 +2128,TEST,0,0 +2129,TEST,0,0 +2130,TEST,0,0 +2131,TEST,0,0 +2132,TEST,0,0 +2133,TEST,0,0 +2134,TEST,0,0 +2135,TEST,0,0 +2136,TEST,0,0 +2137,TEST,0,0 +2138,TEST,0,0 +2139,TEST,0,0 +2140,TEST,0,0 +2141,TEST,0,0 +2142,TEST,0,0 +2143,TEST,0,0 +2144,TEST,0,0 +2145,TEST,0,0 +2146,TEST,0,0 +2147,TEST,0,0 +2148,TEST,0,0 +2149,TEST,0,0 +2150,TEST,0,0 +2151,TEST,0,0 +2152,TEST,0,0 +2153,TEST,0,0 +2154,TEST,0,0 +2155,TEST,0,0 +2156,TEST,0,0 +2157,TEST,0,0 +2158,TEST,0,0 +2159,TEST,0,0 +2160,TEST,0,0 +2161,TEST,0,0 +2162,TEST,0,0 +2163,TEST,0,0 +2164,TEST,0,0 +2165,TEST,0,0 +2166,TEST,0,0 +2167,TEST,0,0 +2168,TEST,0,0 +2169,TEST,0,0 +2170,TEST,0,0 +2171,TEST,0,0 +2172,TEST,0,0 +2173,TEST,0,0 +2174,TEST,0,0 +2175,TEST,0,0 +2176,TEST,0,0 +2177,TEST,0,0 +2178,TEST,0,0 +2179,TEST,0,0 +2180,TEST,0,0 +2181,TEST,0,0 +2182,TEST,0,0 +2183,TEST,0,0 +2184,TEST,0,0 +2185,TEST,0,0 +2186,TEST,0,0 
+2187,TEST,0,0 +2188,TEST,0,0 +2189,TEST,0,0 +2190,TEST,0,0 +2191,TEST,0,0 +2192,TEST,0,0 +2193,TEST,0,0 +2194,TEST,0,0 +2195,TEST,0,0 +2196,TEST,0,0 +2197,TEST,0,0 +2198,TEST,0,0 +2199,TEST,0,0 +2200,TEST,0,0 +2201,TEST,0,0 +2202,TEST,0,0 +2203,TEST,0,0 +2204,TEST,0,0 +2205,TEST,0,0 +2206,TEST,0,0 +2207,TEST,0,0 +2208,TEST,0,0 +2209,TEST,0,0 +2210,TEST,0,0 +2211,TEST,0,0 +2212,TEST,0,0 +2213,TEST,0,0 +2214,TEST,0,0 +2215,TEST,0,0 +2216,TEST,0,0 +2217,TEST,0,0 +2218,TEST,0,0 +2219,TEST,0,0 +2220,TEST,0,0 +2221,TEST,0,0 +2222,TEST,0,0 +2223,TEST,0,0 +2224,TEST,0,0 +2225,TEST,0,0 +2226,TEST,0,0 +2227,TEST,0,0 +2228,TEST,0,0 +2229,TEST,0,0 +2230,TEST,0,0 +2231,TEST,0,0 +2232,TEST,0,0 +2233,TEST,0,0 +2234,TEST,0,0 +2235,TEST,0,0 +2236,TEST,0,0 +2237,TEST,0,0 +2238,TEST,0,0 +2239,TEST,0,0 +2240,TEST,0,0 +2241,TEST,0,0 +2242,TEST,0,0 +2243,TEST,0,0 +2244,TEST,0,0 +2245,TEST,0,0 +2246,TEST,0,0 +2247,TEST,0,0 +2248,TEST,0,0 +2249,TEST,0,0 +2250,TEST,0,0 +2251,TEST,0,0 +2252,TEST,0,0 +2253,TEST,0,0 +2254,TEST,0,0 +2255,TEST,0,0 +2256,TEST,0,0 +2257,TEST,0,0 +2258,TEST,0,0 +2259,TEST,0,0 +2260,TEST,0,0 +2261,TEST,0,0 +2262,TEST,0,0 +2263,TEST,0,0 +2264,TEST,0,0 +2265,TEST,0,0 +2266,TEST,0,0 +2267,TEST,0,0 +2268,TEST,0,0 +2269,TEST,0,0 +2270,TEST,0,0 +2271,TEST,0,0 +2272,TEST,0,0 +2273,TEST,0,0 +2274,TEST,0,0 +2275,TEST,0,0 +2276,TEST,0,0 +2277,TEST,0,0 +2278,TEST,0,0 +2279,TEST,0,0 +2280,TEST,0,0 +2281,TEST,0,0 +2282,TEST,0,0 +2283,TEST,0,0 +2284,TEST,0,0 +2285,TEST,0,0 +2286,TEST,0,0 +2287,TEST,0,0 +2288,TEST,0,0 +2289,TEST,0,0 +2290,TEST,0,0 +2291,TEST,0,0 +2292,TEST,0,0 +2293,TEST,0,0 +2294,TEST,0,0 +2295,TEST,0,0 +2296,TEST,0,0 +2297,TEST,0,0 +2298,TEST,0,0 +2299,TEST,0,0 +2300,TEST,0,0 +2301,TEST,0,0 +2302,TEST,0,0 +2303,TEST,0,0 +2304,TEST,0,0 +2305,TEST,0,0 +2306,TEST,0,0 +2307,TEST,0,0 +2308,TEST,0,0 +2309,TEST,0,0 +2310,TEST,0,0 +2311,TEST,0,0 +2312,TEST,0,0 +2313,TEST,0,0 +2314,TEST,0,0 +2315,TEST,0,0 +2316,TEST,0,0 +2317,TEST,0,0 +2318,TEST,0,0 +2319,TEST,0,0 +2320,TEST,0,0 +2321,TEST,0,0 +2322,TEST,0,0 +2323,TEST,0,0 +2324,TEST,0,0 +2325,TEST,0,0 +2326,TEST,0,0 +2327,TEST,0,0 +2328,TEST,0,0 +2329,TEST,0,0 +2330,TEST,0,0 +2331,TEST,0,0 +2332,TEST,0,0 +2333,TEST,0,0 +2334,TEST,0,0 +2335,TEST,0,0 +2336,TEST,0,0 +2337,TEST,0,0 +2338,TEST,0,0 +2339,TEST,0,0 +2340,TEST,0,0 +2341,TEST,0,0 +2342,TEST,0,0 +2343,TEST,0,0 +2344,TEST,0,0 +2345,TEST,0,0 +2346,TEST,0,0 +2347,TEST,0,0 +2348,TEST,0,0 +2349,TEST,0,0 +2350,TEST,0,0 +2351,TEST,0,0 +2352,TEST,0,0 +2353,TEST,0,0 +2354,TEST,0,0 +2355,TEST,0,0 +2356,TEST,0,0 +2357,TEST,0,0 +2358,TEST,0,0 +2359,TEST,0,0 +2360,TEST,0,0 +2361,TEST,0,0 +2362,TEST,0,0 +2363,TEST,0,0 +2364,TEST,0,0 +2365,TEST,0,0 +2366,TEST,0,0 +2367,TEST,0,0 +2368,TEST,0,0 +2369,TEST,0,0 +2370,TEST,0,0 +2371,TEST,0,0 +2372,TEST,0,0 +2373,TEST,0,0 +2374,TEST,0,0 +2375,TEST,0,0 +2376,TEST,0,0 +2377,TEST,0,0 +2378,TEST,0,0 +2379,TEST,0,0 +2380,TEST,0,0 +2381,TEST,0,0 +2382,TEST,0,0 +2383,TEST,0,0 +2384,TEST,0,0 +2385,TEST,0,0 +2386,TEST,0,0 +2387,TEST,0,0 +2388,TEST,0,0 +2389,TEST,0,0 +2390,TEST,0,0 +2391,TEST,0,0 +2392,TEST,0,0 +2393,TEST,0,0 +2394,TEST,0,0 +2395,TEST,0,0 +2396,TEST,0,0 +2397,TEST,0,0 +2398,TEST,0,0 +2399,TEST,0,0 +2400,TEST,0,0 +2401,TEST,0,0 +2402,TEST,0,0 +2403,TEST,0,0 +2404,TEST,0,0 +2405,TEST,0,0 +2406,TEST,0,0 +2407,TEST,0,0 +2408,TEST,0,0 +2409,TEST,0,0 +2410,TEST,0,0 +2411,TEST,0,0 +2412,TEST,0,0 +2413,TEST,0,0 +2414,TEST,0,0 +2415,TEST,0,0 +2416,TEST,0,0 +2417,TEST,0,0 +2418,TEST,0,0 +2419,TEST,0,0 +2420,TEST,0,0 +2421,TEST,0,0 +2422,TEST,0,0 +2423,TEST,0,0 
+2424,TEST,0,0 +2425,TEST,0,0 +2426,TEST,0,0 +2427,TEST,0,0 +2428,TEST,0,0 +2429,TEST,0,0 +2430,TEST,0,0 +2431,TEST,0,0 +2432,TEST,0,0 +2433,TEST,0,0 +2434,TEST,0,0 +2435,TEST,0,0 +2436,TEST,0,0 +2437,TEST,0,0 +2438,TEST,0,0 +2439,TEST,0,0 +2440,TEST,0,0 +2441,TEST,0,0 +2442,TEST,0,0 +2443,TEST,0,0 +2444,TEST,0,0 +2445,TEST,0,0 +2446,TEST,0,0 +2447,TEST,0,0 +2448,TEST,0,0 +2449,TEST,0,0 +2450,TEST,0,0 +2451,TEST,0,0 +2452,TEST,0,0 +2453,TEST,0,0 +2454,TEST,0,0 +2455,TEST,0,0 +2456,TEST,0,0 +2457,TEST,0,0 +2458,TEST,0,0 +2459,TEST,0,0 +2460,TEST,0,0 +2461,TEST,0,0 +2462,TEST,0,0 +2463,TEST,0,0 +2464,TEST,0,0 +2465,TEST,0,0 +2466,TEST,0,0 +2467,TEST,0,0 +2468,TEST,0,0 +2469,TEST,0,0 +2470,TEST,0,0 +2471,TEST,0,0 +2472,TEST,0,0 +2473,TEST,0,0 +2474,TEST,0,0 +2475,TEST,0,0 +2476,TEST,0,0 +2477,TEST,0,0 +2478,TEST,0,0 +2479,TEST,0,0 +2480,TEST,0,0 +2481,TEST,0,0 +2482,TEST,0,0 +2483,TEST,0,0 +2484,TEST,0,0 +2485,TEST,0,0 +2486,TEST,0,0 +2487,TEST,0,0 +2488,TEST,0,0 +2489,TEST,0,0 +2490,TEST,0,0 +2491,TEST,0,0 +2492,TEST,0,0 +2493,TEST,0,0 +2494,TEST,0,0 +2495,TEST,0,0 +2496,TEST,0,0 +2497,TEST,0,0 +2498,TEST,0,0 +2499,TEST,0,0 +2500,TEST,0,0 +2501,TEST,0,0 +2502,TEST,0,0 +2503,TEST,0,0 +2504,TEST,0,0 +2505,TEST,0,0 +2506,TEST,0,0 +2507,TEST,0,0 +2508,TEST,0,0 +2509,TEST,0,0 +2510,TEST,0,0 +2511,TEST,0,0 +2512,TEST,0,0 +2513,TEST,0,0 +2514,TEST,0,0 +2515,TEST,0,0 +2516,TEST,0,0 +2517,TEST,0,0 +2518,TEST,0,0 +2519,TEST,0,0 diff --git a/datasets/anomaly/yahoo_sub_5/SCORE/problem_TEST/problemDoc.json b/datasets/anomaly/yahoo_sub_5/SCORE/problem_TEST/problemDoc.json new file mode 100644 index 0000000..417cb6b --- /dev/null +++ b/datasets/anomaly/yahoo_sub_5/SCORE/problem_TEST/problemDoc.json @@ -0,0 +1,65 @@ +{ + "about": { + "problemID": "yahoo_sub_5_problem", + "problemName": "yahoo_sub_5_problem", + "problemDescription": "Anomaly detection", + "problemVersion": "4.0.0", + "problemSchemaVersion": "4.0.0", + "taskKeywords": [ + "classification", + "binary", + "tabular" + ] + }, + "inputs": { + "data": [ + { + "datasetID": "yahoo_sub_5_dataset", + "targets": [ + { + "targetIndex": 0, + "resID": "learningData", + "colIndex": 7, + "colName": "ground_truth" + } + ] + } + ], + "dataSplits": { + "method": "holdOut", + "testSize": 0.2, + "stratified": true, + "numRepeats": 0, + "randomSeed": 42, + "splitsFile": "dataSplits.csv", + "datasetViewMaps": { + "train": [ + { + "from": "yahoo_sub_5_dataset", + "to": "yahoo_sub_5_dataset_TRAIN" + } + ], + "test": [ + { + "from": "yahoo_sub_5_dataset", + "to": "yahoo_sub_5_dataset_TEST" + } + ], + "score": [ + { + "from": "yahoo_sub_5_dataset", + "to": "yahoo_sub_5_dataset_SCORE" + } + ] + } + }, + "performanceMetrics": [ + { + "metric": "f1Macro" + } + ] + }, + "expectedOutputs": { + "predictionsFile": "predictions.csv" + } +} \ No newline at end of file diff --git a/datasets/anomaly/yahoo_sub_5/SCORE/targets.csv b/datasets/anomaly/yahoo_sub_5/SCORE/targets.csv new file mode 100644 index 0000000..e69de29 diff --git a/datasets/anomaly/yahoo_sub_5/TEST/dataset_TEST/datasetDoc.json b/datasets/anomaly/yahoo_sub_5/TEST/dataset_TEST/datasetDoc.json new file mode 100644 index 0000000..ff5dec4 --- /dev/null +++ b/datasets/anomaly/yahoo_sub_5/TEST/dataset_TEST/datasetDoc.json @@ -0,0 +1,95 @@ +{ + "about": { + "datasetID": "yahoo_sub_5_dataset_TEST", + "datasetName": "NULL", + "description": "Database of baseball players and play statistics, including 'Games_played', 'At_bats', 'Runs', 'Hits', 'Doubles', 'Triples', 'Home_runs', 'RBIs', 'Walks', 'Strikeouts', 
'Batting_average', 'On_base_pct', 'Slugging_pct' and 'Fielding_ave'", + "citation": " @book{simonoff2003analyzing,title={Analyzing Categorical Data},author={Simonoff, J.S.},isbn={9780387007496},lccn={2003044946},series={Springer Texts in Statistics},url={https://books.google.com/books?id=G8wrifweAoC},year={2003},publisher={Springer New York}} ", + "license": " CC Public Domain Mark 1.0 ", + "source": "OpenML", + "sourceURI": "http://www.openml.org/d/185", + "approximateSize": "", + "datasetSchemaVersion": "4.0.0", + "redacted": false, + "datasetVersion": "4.0.0" + }, + "dataResources": [ + { + "resID": "learningData", + "resPath": "tables/learningData.csv", + "resType": "table", + "resFormat": { + "text/csv": [ + "csv" + ] + }, + "isCollection": false, + "columns": [ + { + "colIndex": 0, + "colName": "d3mIndex", + "colType": "integer", + "role": [ + "index" + ] + }, + { + "colIndex": 1, + "colName": "timestamp", + "colType": "integer", + "role": [ + "attribute" + ] + }, + { + "colIndex": 2, + "colName": "value_0", + "colType": "real", + "role": [ + "attribute" + ] + }, + { + "colIndex": 3, + "colName": "value_1", + "colType": "real", + "role": [ + "attribute" + ] + }, + { + "colIndex": 4, + "colName": "value_2", + "colType": "real", + "role": [ + "attribute" + ] + }, + { + "colIndex": 5, + "colName": "value_3", + "colType": "real", + "role": [ + "attribute" + ] + }, + { + "colIndex": 6, + "colName": "value_4", + "colType": "real", + "role": [ + "attribute" + ] + }, + { + "colIndex": 7, + "colName": "ground_truth", + "colType": "integer", + "role": [ + "suggestedTarget" + ] + } + ], + "columnsCount": 8 + } + ] +} \ No newline at end of file diff --git a/datasets/anomaly/yahoo_sub_5/TEST/dataset_TEST/tables/learningData.csv b/datasets/anomaly/yahoo_sub_5/TEST/dataset_TEST/tables/learningData.csv new file mode 100644 index 0000000..e3d5131 --- /dev/null +++ b/datasets/anomaly/yahoo_sub_5/TEST/dataset_TEST/tables/learningData.csv @@ -0,0 +1,141 @@ +d3mIndex,timestamp,value_0,value_1,value_2,value_3,value_4,ground_truth +1260,1261,7782,0.034280386319742985,2.5072222222222003,104,3119,0 +1261,1262,7829,0.039360296791109,2.5927777777778,82,3590,0 +1262,1263,7902,0.0,2.6894444444444,208,3893,0 +1263,1264,8039,0.038944065994356014,2.6291666666667,92,3264,0 +1264,1265,8350,0.18176011684739,2.6469444444444,53,3963,0 +1265,1266,8142,0.18521047165852,2.7461111111111003,65,2757,0 +1266,1267,7886,0.13079770999921,2.9363888888889,62,2306,0 +1267,1268,7743,0.13310058077443,3.2797222222222,73,2549,0 +1268,1269,7707,0.054750658073534006,3.5194444444444,84,2212,0 +1269,1270,7726,0.030588852697706,3.8130555555556,90,2286,0 +1270,1271,7717,0.12998124134227002,3.7941666666667,80,2979,0 +1271,1272,10331,0.09100057249197198,3.6086111111111,90,3158,0 +1272,1273,10515,0.19464543002904006,3.3858333333333,84,2645,0 +1273,1274,10415,0.22178651521516,3.3336111111111,34,3161,0 +1274,1275,10387,0.22983578430825,3.3116666666667003,67,4460,0 +1275,1276,10471,0.298229429356,3.2616666666667005,74,2630,0 +1276,1277,10385,0.12923377484588,3.0044444444444003,44,2593,0 +1277,1278,10439,0.19609416059774,2.6741666666667,64,2625,0 +1278,1279,10516,0.040518533819385014,2.3191666666667,70,4834,0 +1279,1280,10587,0.07099894663641,2.0597222222222,96,4056,0 +1280,1281,10586,0.07584150637714701,2.0547222222222,110,5713,0 +1281,1282,10684,0.08180100127782801,2.1511111111111,68,3940,0 +1282,1283,10880,0.0,2.2602777777778,90,4414,0 +1283,1284,10830,0.0,2.2883333333333,90,5044,0 +1284,1285,10794,0.09140162014739303,2.3736111111111,69,3894,0 
+1285,1286,10843,0.0,2.5869444444444,46,3993,0 +1286,1287,10805,0.0,2.6480555555556,74,4404,0 +1287,1288,10996,0.0,2.6077777777777995,68,4072,0 +1288,1289,11327,0.05363316840061,2.6069444444444,67,4182,0 +1289,1290,11090,0.26818151064716,2.6908333333333,51,3351,0 +1290,1291,10578,0.21887772653901,2.9019444444444003,39,4183,0 +1291,1292,10528,0.32371296573811,3.2711111111111,26,4068,0 +1292,1293,10475,0.12565805017257,3.5872222222222,25,8139,0 +1293,1294,10664,0.092277247744574,3.6913888888889,32,11000,0 +1294,1295,10513,0.077016875742983,3.6313888888889,17,2975,0 +1295,1296,9072,0.3714480797312501,3.5605555555556,19,2692,0 +1296,1297,9069,0.19332372237792,3.4402777777778,16,2502,0 +1297,1298,9089,0.06345811641554701,3.35,28,2510,0 +1298,1299,9027,0.2267121559473,3.3469444444444,24,2663,0 +1299,1300,8969,0.053072279964629,3.2708333333333,35,3575,0 +1300,1301,9073,0.13336345197744,3.2519444444444,49,2586,0 +1301,1302,8957,0.1252855094715,2.7311111111111,106,2908,0 +1302,1303,9126,0.096211952864224,2.3875,80,3530,0 +1303,1304,9122,0.096524467517755,2.0847222222222,90,2776,0 +1304,1305,9231,0.08924770147957402,2.0975,169,2962,0 +1305,1306,9368,0.11889606284162,2.1763888888889,98,3441,0 +1306,1307,9458,0.031429841710104,2.2327777777777995,92,4376,0 +1307,1308,9463,0.0,2.2725,91,3857,0 +1308,1309,9356,0.036512411627868,2.3202777777778,99,4685,0 +1309,1310,9340,0.0,2.5425,90,4585,0 +1310,1311,9340,0.0,2.5986111111111,126,3542,0 +1311,1312,9276,0.0,2.6319444444444,102,3370,0 +1312,1313,9611,0.10106696361212,2.5836111111111,132,3515,0 +1313,1314,9532,0.14854949043035,2.675,88,3793,0 +1314,1315,9156,0.08612162048398897,2.8522222222222,135,2954,0 +1315,1316,9222,0.16494200410492002,3.1302777777778,114,2627,0 +1316,1317,9282,0.28637713141253,3.4805555555556,35,2550,0 +1317,1318,9573,0.13206535647488,3.5994444444444,24,2480,0 +1318,1319,9333,0.27364025607799,3.5847222222222,44,2521,0 +1319,1320,9987,0.38382339961227,3.4963888888889,26,2860,0 +1320,1321,10133,0.08426242877623301,3.3825,37,3675,0 +1321,1322,10010,0.3290413568025901,3.2694444444444,45,2704,0 +1322,1323,10028,0.22632868808708,3.2322222222222,42,3121,0 +1323,1324,9984,0.17914189971361,3.1936111111111005,47,2603,0 +1324,1325,10041,0.30046815361859003,3.0536111111111004,34,3984,0 +1325,1326,10072,0.22650915594248,2.7819444444444,56,2537,0 +1326,1327,10025,0.0,2.4152777777778,87,3349,0 +1327,1328,10116,0.1223093269317,2.1569444444444,74,3958,0 +1328,1329,10232,0.1696074188221,2.1125,90,4243,0 +1329,1330,10516,0.0,2.1833333333333003,79,4159,0 +1330,1331,10449,0.028193633007367,2.205,97,5637,0 +1331,1332,10598,0.0,2.1697222222222,90,8142,0 +1332,1333,10337,0.0,2.3075,77,5713,0 +1333,1334,10469,0.097305232437507,2.4575,101,3668,0 +1334,1335,10426,0.11905908868379,2.6077777777777995,74,4307,0 +1335,1336,10531,0.11660374103282,2.6275,439,4354,0 +1336,1337,10875,0.060474297756584014,2.6144444444444,79,4262,0 +1337,1338,10494,0.22568442027805,2.6477777777777995,165,3446,0 +1338,1339,10195,0.14077736537045002,2.8594444444444003,139,2677,0 +1339,1340,9918,0.1924574892026,3.2675,56,4450,0 +1340,1341,9889,0.18922597300629,3.5136111111111004,102,3044,0 +1341,1342,9947,0.041593949118095004,3.5725,101,3428,0 +1342,1343,9977,0.2502095174271,3.6863888888889,41,2845,0 +1343,1344,10835,0.18663972932643,3.5636111111111,94,2781,0 +1344,1345,10765,0.07351854082400297,3.4127777777778,116,2743,0 +1345,1346,10656,0.081949111399618,3.295,94,4470,0 +1346,1347,10485,0.20148511394009,3.2666666666667004,89,2596,0 
+1347,1348,10681,0.11515101921294,3.1933333333333,141,3249,0 +1348,1349,10852,0.07797276382811,3.0688888888889,167,2529,0 +1349,1350,10728,0.07244862879413201,2.8102777777778,148,2452,0 +1350,1351,10874,0.07310929970435699,2.42,105,2934,0 +1351,1352,10964,0.066868365737218,2.1358333333333,210,3159,0 +1352,1353,10984,0.05788512501593701,1.9916666666667,145,3974,0 +1353,1354,11055,0.09727414207464803,2.0947222222222,136,4305,0 +1354,1355,11233,0.033270317741558,2.1591666666667,126,5012,0 +1355,1356,11161,0.0,2.2377777777778,157,4455,0 +1356,1357,10966,0.038270957919533,2.2511111111111,105,4108,0 +1357,1358,11193,0.08728058888363299,2.4208333333333,114,4339,0 +1358,1359,11167,0.10536774813238,2.5241666666667,104,5056,0 +1359,1360,11367,0.1233991317089,2.5794444444444,69,5573,0 +1360,1361,51251,0.042565915766552,2.5936111111111,75,3366,1 +1361,1362,17953,0.23147422367229,2.6830555555556,73,2559,1 +1362,1363,170029,0.08983405162538903,2.8188888888889,74,1999,1 +1363,1364,10955,0.07464756469365201,2.9513888888888995,126,1993,0 +1364,1365,10984,0.099244104918934,3.2830555555556,67,1913,0 +1365,1366,10964,0.11535172009194,3.4819444444444,32,1760,0 +1366,1367,10980,0.21774881707852,3.5886111111111005,38,1890,0 +1367,1368,10852,0.1305066423559,3.4836111111111,34,2469,0 +1368,1369,10786,0.10054853030204,3.3955555555556,36,2133,0 +1369,1370,10841,0.02468393737575,3.2847222222222,26,3359,0 +1370,1371,10762,0.10018007414459,3.2383333333332995,74,3783,0 +1371,1372,10419,0.12522619841308,3.2188888888889,85,1809,0 +1372,1373,10467,0.11781887197077,2.9483333333333,67,2143,0 +1373,1374,10502,0.13417256350298,2.5855555555556,84,2567,0 +1374,1375,10519,0.07474686582090599,2.3005555555556003,1630,2176,0 +1375,1376,10579,0.13570963056519,2.0855555555556,1435,1929,0 +1376,1377,10502,0.076431907457478,1.9027777777778,857,2244,0 +1377,1378,10661,0.0,1.9411111111111,31,1810,0 +1378,1379,10818,0.1936428046839,2.0444444444444,500,2088,0 +1379,1380,10918,0.052826773889684014,2.1363888888889,53,2371,0 +1380,1381,10871,0.0,2.22,61,1843,0 +1381,1382,10796,0.054466597481213,2.3530555555556,158,2668,0 +1382,1383,10774,0.057459020289436,2.545,184,2309,0 +1383,1384,10898,0.28750562005936,2.6202777777778,91,1998,0 +1384,1385,11442,0.075538554674309,2.6847222222222,60,2480,0 +1385,1386,11113,0.08112608570492501,2.6591666666667004,107,2147,0 +1386,1387,10888,0.21563803296368,2.7863888888888995,5157,1802,0 +1387,1388,10894,0.095725002305685,3.0269444444444003,28,1789,0 +1388,1389,10888,0.17516056892320994,3.3227777777778,24,1999,0 +1389,1390,10896,0.32902836018586,3.6097222222222,21,2142,0 +1390,1391,10800,0.10216065221678,3.6805555555556,12,1904,0 +1391,1392,11000,0.19741931250852,3.6075,24,1876,0 +1392,1393,10985,0.10149107903671,3.4091666666667004,17,2434,0 +1393,1394,11017,0.17479255893624,3.3666666666667004,48,2472,0 +1394,1395,10863,0.034385029573777,3.3158333333333,41,1744,0 +1395,1396,10875,0.21988771218053,3.1622222222222,1088,2404,0 +1396,1397,10987,0.10149107903671,3.1086111111111,68,1971,0 +1397,1398,10778,0.10269981175445,2.6552777777778,2575,1713,0 +1398,1399,10957,0.11258759940039,2.2730555555556,4688,1765,0 +1399,1400,10832,0.13022351806001,2.0591666666667,477,3156,0 diff --git a/datasets/anomaly/yahoo_sub_5/TEST/problem_TEST/dataSplits.csv b/datasets/anomaly/yahoo_sub_5/TEST/problem_TEST/dataSplits.csv new file mode 100644 index 0000000..c72d454 --- /dev/null +++ b/datasets/anomaly/yahoo_sub_5/TEST/problem_TEST/dataSplits.csv @@ -0,0 +1,1261 @@ +d3mIndex,type,repeat,fold +1260,TEST,0,0 +1261,TEST,0,0 
+1262,TEST,0,0 +1263,TEST,0,0 +1264,TEST,0,0 +1265,TEST,0,0 +1266,TEST,0,0 +1267,TEST,0,0 +1268,TEST,0,0 +1269,TEST,0,0 +1270,TEST,0,0 +1271,TEST,0,0 +1272,TEST,0,0 +1273,TEST,0,0 +1274,TEST,0,0 +1275,TEST,0,0 +1276,TEST,0,0 +1277,TEST,0,0 +1278,TEST,0,0 +1279,TEST,0,0 +1280,TEST,0,0 +1281,TEST,0,0 +1282,TEST,0,0 +1283,TEST,0,0 +1284,TEST,0,0 +1285,TEST,0,0 +1286,TEST,0,0 +1287,TEST,0,0 +1288,TEST,0,0 +1289,TEST,0,0 +1290,TEST,0,0 +1291,TEST,0,0 +1292,TEST,0,0 +1293,TEST,0,0 +1294,TEST,0,0 +1295,TEST,0,0 +1296,TEST,0,0 +1297,TEST,0,0 +1298,TEST,0,0 +1299,TEST,0,0 +1300,TEST,0,0 +1301,TEST,0,0 +1302,TEST,0,0 +1303,TEST,0,0 +1304,TEST,0,0 +1305,TEST,0,0 +1306,TEST,0,0 +1307,TEST,0,0 +1308,TEST,0,0 +1309,TEST,0,0 +1310,TEST,0,0 +1311,TEST,0,0 +1312,TEST,0,0 +1313,TEST,0,0 +1314,TEST,0,0 +1315,TEST,0,0 +1316,TEST,0,0 +1317,TEST,0,0 +1318,TEST,0,0 +1319,TEST,0,0 +1320,TEST,0,0 +1321,TEST,0,0 +1322,TEST,0,0 +1323,TEST,0,0 +1324,TEST,0,0 +1325,TEST,0,0 +1326,TEST,0,0 +1327,TEST,0,0 +1328,TEST,0,0 +1329,TEST,0,0 +1330,TEST,0,0 +1331,TEST,0,0 +1332,TEST,0,0 +1333,TEST,0,0 +1334,TEST,0,0 +1335,TEST,0,0 +1336,TEST,0,0 +1337,TEST,0,0 +1338,TEST,0,0 +1339,TEST,0,0 +1340,TEST,0,0 +1341,TEST,0,0 +1342,TEST,0,0 +1343,TEST,0,0 +1344,TEST,0,0 +1345,TEST,0,0 +1346,TEST,0,0 +1347,TEST,0,0 +1348,TEST,0,0 +1349,TEST,0,0 +1350,TEST,0,0 +1351,TEST,0,0 +1352,TEST,0,0 +1353,TEST,0,0 +1354,TEST,0,0 +1355,TEST,0,0 +1356,TEST,0,0 +1357,TEST,0,0 +1358,TEST,0,0 +1359,TEST,0,0 +1360,TEST,0,0 +1361,TEST,0,0 +1362,TEST,0,0 +1363,TEST,0,0 +1364,TEST,0,0 +1365,TEST,0,0 +1366,TEST,0,0 +1367,TEST,0,0 +1368,TEST,0,0 +1369,TEST,0,0 +1370,TEST,0,0 +1371,TEST,0,0 +1372,TEST,0,0 +1373,TEST,0,0 +1374,TEST,0,0 +1375,TEST,0,0 +1376,TEST,0,0 +1377,TEST,0,0 +1378,TEST,0,0 +1379,TEST,0,0 +1380,TEST,0,0 +1381,TEST,0,0 +1382,TEST,0,0 +1383,TEST,0,0 +1384,TEST,0,0 +1385,TEST,0,0 +1386,TEST,0,0 +1387,TEST,0,0 +1388,TEST,0,0 +1389,TEST,0,0 +1390,TEST,0,0 +1391,TEST,0,0 +1392,TEST,0,0 +1393,TEST,0,0 +1394,TEST,0,0 +1395,TEST,0,0 +1396,TEST,0,0 +1397,TEST,0,0 +1398,TEST,0,0 +1399,TEST,0,0 +1400,TEST,0,0 +1401,TEST,0,0 +1402,TEST,0,0 +1403,TEST,0,0 +1404,TEST,0,0 +1405,TEST,0,0 +1406,TEST,0,0 +1407,TEST,0,0 +1408,TEST,0,0 +1409,TEST,0,0 +1410,TEST,0,0 +1411,TEST,0,0 +1412,TEST,0,0 +1413,TEST,0,0 +1414,TEST,0,0 +1415,TEST,0,0 +1416,TEST,0,0 +1417,TEST,0,0 +1418,TEST,0,0 +1419,TEST,0,0 +1420,TEST,0,0 +1421,TEST,0,0 +1422,TEST,0,0 +1423,TEST,0,0 +1424,TEST,0,0 +1425,TEST,0,0 +1426,TEST,0,0 +1427,TEST,0,0 +1428,TEST,0,0 +1429,TEST,0,0 +1430,TEST,0,0 +1431,TEST,0,0 +1432,TEST,0,0 +1433,TEST,0,0 +1434,TEST,0,0 +1435,TEST,0,0 +1436,TEST,0,0 +1437,TEST,0,0 +1438,TEST,0,0 +1439,TEST,0,0 +1440,TEST,0,0 +1441,TEST,0,0 +1442,TEST,0,0 +1443,TEST,0,0 +1444,TEST,0,0 +1445,TEST,0,0 +1446,TEST,0,0 +1447,TEST,0,0 +1448,TEST,0,0 +1449,TEST,0,0 +1450,TEST,0,0 +1451,TEST,0,0 +1452,TEST,0,0 +1453,TEST,0,0 +1454,TEST,0,0 +1455,TEST,0,0 +1456,TEST,0,0 +1457,TEST,0,0 +1458,TEST,0,0 +1459,TEST,0,0 +1460,TEST,0,0 +1461,TEST,0,0 +1462,TEST,0,0 +1463,TEST,0,0 +1464,TEST,0,0 +1465,TEST,0,0 +1466,TEST,0,0 +1467,TEST,0,0 +1468,TEST,0,0 +1469,TEST,0,0 +1470,TEST,0,0 +1471,TEST,0,0 +1472,TEST,0,0 +1473,TEST,0,0 +1474,TEST,0,0 +1475,TEST,0,0 +1476,TEST,0,0 +1477,TEST,0,0 +1478,TEST,0,0 +1479,TEST,0,0 +1480,TEST,0,0 +1481,TEST,0,0 +1482,TEST,0,0 +1483,TEST,0,0 +1484,TEST,0,0 +1485,TEST,0,0 +1486,TEST,0,0 +1487,TEST,0,0 +1488,TEST,0,0 +1489,TEST,0,0 +1490,TEST,0,0 +1491,TEST,0,0 +1492,TEST,0,0 +1493,TEST,0,0 +1494,TEST,0,0 +1495,TEST,0,0 +1496,TEST,0,0 +1497,TEST,0,0 +1498,TEST,0,0 
+1499,TEST,0,0 +1500,TEST,0,0 +1501,TEST,0,0 +1502,TEST,0,0 +1503,TEST,0,0 +1504,TEST,0,0 +1505,TEST,0,0 +1506,TEST,0,0 +1507,TEST,0,0 +1508,TEST,0,0 +1509,TEST,0,0 +1510,TEST,0,0 +1511,TEST,0,0 +1512,TEST,0,0 +1513,TEST,0,0 +1514,TEST,0,0 +1515,TEST,0,0 +1516,TEST,0,0 +1517,TEST,0,0 +1518,TEST,0,0 +1519,TEST,0,0 +1520,TEST,0,0 +1521,TEST,0,0 +1522,TEST,0,0 +1523,TEST,0,0 +1524,TEST,0,0 +1525,TEST,0,0 +1526,TEST,0,0 +1527,TEST,0,0 +1528,TEST,0,0 +1529,TEST,0,0 +1530,TEST,0,0 +1531,TEST,0,0 +1532,TEST,0,0 +1533,TEST,0,0 +1534,TEST,0,0 +1535,TEST,0,0 +1536,TEST,0,0 +1537,TEST,0,0 +1538,TEST,0,0 +1539,TEST,0,0 +1540,TEST,0,0 +1541,TEST,0,0 +1542,TEST,0,0 +1543,TEST,0,0 +1544,TEST,0,0 +1545,TEST,0,0 +1546,TEST,0,0 +1547,TEST,0,0 +1548,TEST,0,0 +1549,TEST,0,0 +1550,TEST,0,0 +1551,TEST,0,0 +1552,TEST,0,0 +1553,TEST,0,0 +1554,TEST,0,0 +1555,TEST,0,0 +1556,TEST,0,0 +1557,TEST,0,0 +1558,TEST,0,0 +1559,TEST,0,0 +1560,TEST,0,0 +1561,TEST,0,0 +1562,TEST,0,0 +1563,TEST,0,0 +1564,TEST,0,0 +1565,TEST,0,0 +1566,TEST,0,0 +1567,TEST,0,0 +1568,TEST,0,0 +1569,TEST,0,0 +1570,TEST,0,0 +1571,TEST,0,0 +1572,TEST,0,0 +1573,TEST,0,0 +1574,TEST,0,0 +1575,TEST,0,0 +1576,TEST,0,0 +1577,TEST,0,0 +1578,TEST,0,0 +1579,TEST,0,0 +1580,TEST,0,0 +1581,TEST,0,0 +1582,TEST,0,0 +1583,TEST,0,0 +1584,TEST,0,0 +1585,TEST,0,0 +1586,TEST,0,0 +1587,TEST,0,0 +1588,TEST,0,0 +1589,TEST,0,0 +1590,TEST,0,0 +1591,TEST,0,0 +1592,TEST,0,0 +1593,TEST,0,0 +1594,TEST,0,0 +1595,TEST,0,0 +1596,TEST,0,0 +1597,TEST,0,0 +1598,TEST,0,0 +1599,TEST,0,0 +1600,TEST,0,0 +1601,TEST,0,0 +1602,TEST,0,0 +1603,TEST,0,0 +1604,TEST,0,0 +1605,TEST,0,0 +1606,TEST,0,0 +1607,TEST,0,0 +1608,TEST,0,0 +1609,TEST,0,0 +1610,TEST,0,0 +1611,TEST,0,0 +1612,TEST,0,0 +1613,TEST,0,0 +1614,TEST,0,0 +1615,TEST,0,0 +1616,TEST,0,0 +1617,TEST,0,0 +1618,TEST,0,0 +1619,TEST,0,0 +1620,TEST,0,0 +1621,TEST,0,0 +1622,TEST,0,0 +1623,TEST,0,0 +1624,TEST,0,0 +1625,TEST,0,0 +1626,TEST,0,0 +1627,TEST,0,0 +1628,TEST,0,0 +1629,TEST,0,0 +1630,TEST,0,0 +1631,TEST,0,0 +1632,TEST,0,0 +1633,TEST,0,0 +1634,TEST,0,0 +1635,TEST,0,0 +1636,TEST,0,0 +1637,TEST,0,0 +1638,TEST,0,0 +1639,TEST,0,0 +1640,TEST,0,0 +1641,TEST,0,0 +1642,TEST,0,0 +1643,TEST,0,0 +1644,TEST,0,0 +1645,TEST,0,0 +1646,TEST,0,0 +1647,TEST,0,0 +1648,TEST,0,0 +1649,TEST,0,0 +1650,TEST,0,0 +1651,TEST,0,0 +1652,TEST,0,0 +1653,TEST,0,0 +1654,TEST,0,0 +1655,TEST,0,0 +1656,TEST,0,0 +1657,TEST,0,0 +1658,TEST,0,0 +1659,TEST,0,0 +1660,TEST,0,0 +1661,TEST,0,0 +1662,TEST,0,0 +1663,TEST,0,0 +1664,TEST,0,0 +1665,TEST,0,0 +1666,TEST,0,0 +1667,TEST,0,0 +1668,TEST,0,0 +1669,TEST,0,0 +1670,TEST,0,0 +1671,TEST,0,0 +1672,TEST,0,0 +1673,TEST,0,0 +1674,TEST,0,0 +1675,TEST,0,0 +1676,TEST,0,0 +1677,TEST,0,0 +1678,TEST,0,0 +1679,TEST,0,0 +1680,TEST,0,0 +1681,TEST,0,0 +1682,TEST,0,0 +1683,TEST,0,0 +1684,TEST,0,0 +1685,TEST,0,0 +1686,TEST,0,0 +1687,TEST,0,0 +1688,TEST,0,0 +1689,TEST,0,0 +1690,TEST,0,0 +1691,TEST,0,0 +1692,TEST,0,0 +1693,TEST,0,0 +1694,TEST,0,0 +1695,TEST,0,0 +1696,TEST,0,0 +1697,TEST,0,0 +1698,TEST,0,0 +1699,TEST,0,0 +1700,TEST,0,0 +1701,TEST,0,0 +1702,TEST,0,0 +1703,TEST,0,0 +1704,TEST,0,0 +1705,TEST,0,0 +1706,TEST,0,0 +1707,TEST,0,0 +1708,TEST,0,0 +1709,TEST,0,0 +1710,TEST,0,0 +1711,TEST,0,0 +1712,TEST,0,0 +1713,TEST,0,0 +1714,TEST,0,0 +1715,TEST,0,0 +1716,TEST,0,0 +1717,TEST,0,0 +1718,TEST,0,0 +1719,TEST,0,0 +1720,TEST,0,0 +1721,TEST,0,0 +1722,TEST,0,0 +1723,TEST,0,0 +1724,TEST,0,0 +1725,TEST,0,0 +1726,TEST,0,0 +1727,TEST,0,0 +1728,TEST,0,0 +1729,TEST,0,0 +1730,TEST,0,0 +1731,TEST,0,0 +1732,TEST,0,0 +1733,TEST,0,0 +1734,TEST,0,0 +1735,TEST,0,0 
+1736,TEST,0,0 +1737,TEST,0,0 +1738,TEST,0,0 +1739,TEST,0,0 +1740,TEST,0,0 +1741,TEST,0,0 +1742,TEST,0,0 +1743,TEST,0,0 +1744,TEST,0,0 +1745,TEST,0,0 +1746,TEST,0,0 +1747,TEST,0,0 +1748,TEST,0,0 +1749,TEST,0,0 +1750,TEST,0,0 +1751,TEST,0,0 +1752,TEST,0,0 +1753,TEST,0,0 +1754,TEST,0,0 +1755,TEST,0,0 +1756,TEST,0,0 +1757,TEST,0,0 +1758,TEST,0,0 +1759,TEST,0,0 +1760,TEST,0,0 +1761,TEST,0,0 +1762,TEST,0,0 +1763,TEST,0,0 +1764,TEST,0,0 +1765,TEST,0,0 +1766,TEST,0,0 +1767,TEST,0,0 +1768,TEST,0,0 +1769,TEST,0,0 +1770,TEST,0,0 +1771,TEST,0,0 +1772,TEST,0,0 +1773,TEST,0,0 +1774,TEST,0,0 +1775,TEST,0,0 +1776,TEST,0,0 +1777,TEST,0,0 +1778,TEST,0,0 +1779,TEST,0,0 +1780,TEST,0,0 +1781,TEST,0,0 +1782,TEST,0,0 +1783,TEST,0,0 +1784,TEST,0,0 +1785,TEST,0,0 +1786,TEST,0,0 +1787,TEST,0,0 +1788,TEST,0,0 +1789,TEST,0,0 +1790,TEST,0,0 +1791,TEST,0,0 +1792,TEST,0,0 +1793,TEST,0,0 +1794,TEST,0,0 +1795,TEST,0,0 +1796,TEST,0,0 +1797,TEST,0,0 +1798,TEST,0,0 +1799,TEST,0,0 +1800,TEST,0,0 +1801,TEST,0,0 +1802,TEST,0,0 +1803,TEST,0,0 +1804,TEST,0,0 +1805,TEST,0,0 +1806,TEST,0,0 +1807,TEST,0,0 +1808,TEST,0,0 +1809,TEST,0,0 +1810,TEST,0,0 +1811,TEST,0,0 +1812,TEST,0,0 +1813,TEST,0,0 +1814,TEST,0,0 +1815,TEST,0,0 +1816,TEST,0,0 +1817,TEST,0,0 +1818,TEST,0,0 +1819,TEST,0,0 +1820,TEST,0,0 +1821,TEST,0,0 +1822,TEST,0,0 +1823,TEST,0,0 +1824,TEST,0,0 +1825,TEST,0,0 +1826,TEST,0,0 +1827,TEST,0,0 +1828,TEST,0,0 +1829,TEST,0,0 +1830,TEST,0,0 +1831,TEST,0,0 +1832,TEST,0,0 +1833,TEST,0,0 +1834,TEST,0,0 +1835,TEST,0,0 +1836,TEST,0,0 +1837,TEST,0,0 +1838,TEST,0,0 +1839,TEST,0,0 +1840,TEST,0,0 +1841,TEST,0,0 +1842,TEST,0,0 +1843,TEST,0,0 +1844,TEST,0,0 +1845,TEST,0,0 +1846,TEST,0,0 +1847,TEST,0,0 +1848,TEST,0,0 +1849,TEST,0,0 +1850,TEST,0,0 +1851,TEST,0,0 +1852,TEST,0,0 +1853,TEST,0,0 +1854,TEST,0,0 +1855,TEST,0,0 +1856,TEST,0,0 +1857,TEST,0,0 +1858,TEST,0,0 +1859,TEST,0,0 +1860,TEST,0,0 +1861,TEST,0,0 +1862,TEST,0,0 +1863,TEST,0,0 +1864,TEST,0,0 +1865,TEST,0,0 +1866,TEST,0,0 +1867,TEST,0,0 +1868,TEST,0,0 +1869,TEST,0,0 +1870,TEST,0,0 +1871,TEST,0,0 +1872,TEST,0,0 +1873,TEST,0,0 +1874,TEST,0,0 +1875,TEST,0,0 +1876,TEST,0,0 +1877,TEST,0,0 +1878,TEST,0,0 +1879,TEST,0,0 +1880,TEST,0,0 +1881,TEST,0,0 +1882,TEST,0,0 +1883,TEST,0,0 +1884,TEST,0,0 +1885,TEST,0,0 +1886,TEST,0,0 +1887,TEST,0,0 +1888,TEST,0,0 +1889,TEST,0,0 +1890,TEST,0,0 +1891,TEST,0,0 +1892,TEST,0,0 +1893,TEST,0,0 +1894,TEST,0,0 +1895,TEST,0,0 +1896,TEST,0,0 +1897,TEST,0,0 +1898,TEST,0,0 +1899,TEST,0,0 +1900,TEST,0,0 +1901,TEST,0,0 +1902,TEST,0,0 +1903,TEST,0,0 +1904,TEST,0,0 +1905,TEST,0,0 +1906,TEST,0,0 +1907,TEST,0,0 +1908,TEST,0,0 +1909,TEST,0,0 +1910,TEST,0,0 +1911,TEST,0,0 +1912,TEST,0,0 +1913,TEST,0,0 +1914,TEST,0,0 +1915,TEST,0,0 +1916,TEST,0,0 +1917,TEST,0,0 +1918,TEST,0,0 +1919,TEST,0,0 +1920,TEST,0,0 +1921,TEST,0,0 +1922,TEST,0,0 +1923,TEST,0,0 +1924,TEST,0,0 +1925,TEST,0,0 +1926,TEST,0,0 +1927,TEST,0,0 +1928,TEST,0,0 +1929,TEST,0,0 +1930,TEST,0,0 +1931,TEST,0,0 +1932,TEST,0,0 +1933,TEST,0,0 +1934,TEST,0,0 +1935,TEST,0,0 +1936,TEST,0,0 +1937,TEST,0,0 +1938,TEST,0,0 +1939,TEST,0,0 +1940,TEST,0,0 +1941,TEST,0,0 +1942,TEST,0,0 +1943,TEST,0,0 +1944,TEST,0,0 +1945,TEST,0,0 +1946,TEST,0,0 +1947,TEST,0,0 +1948,TEST,0,0 +1949,TEST,0,0 +1950,TEST,0,0 +1951,TEST,0,0 +1952,TEST,0,0 +1953,TEST,0,0 +1954,TEST,0,0 +1955,TEST,0,0 +1956,TEST,0,0 +1957,TEST,0,0 +1958,TEST,0,0 +1959,TEST,0,0 +1960,TEST,0,0 +1961,TEST,0,0 +1962,TEST,0,0 +1963,TEST,0,0 +1964,TEST,0,0 +1965,TEST,0,0 +1966,TEST,0,0 +1967,TEST,0,0 +1968,TEST,0,0 +1969,TEST,0,0 +1970,TEST,0,0 +1971,TEST,0,0 +1972,TEST,0,0 
+1973,TEST,0,0 +1974,TEST,0,0 +1975,TEST,0,0 +1976,TEST,0,0 +1977,TEST,0,0 +1978,TEST,0,0 +1979,TEST,0,0 +1980,TEST,0,0 +1981,TEST,0,0 +1982,TEST,0,0 +1983,TEST,0,0 +1984,TEST,0,0 +1985,TEST,0,0 +1986,TEST,0,0 +1987,TEST,0,0 +1988,TEST,0,0 +1989,TEST,0,0 +1990,TEST,0,0 +1991,TEST,0,0 +1992,TEST,0,0 +1993,TEST,0,0 +1994,TEST,0,0 +1995,TEST,0,0 +1996,TEST,0,0 +1997,TEST,0,0 +1998,TEST,0,0 +1999,TEST,0,0 +2000,TEST,0,0 +2001,TEST,0,0 +2002,TEST,0,0 +2003,TEST,0,0 +2004,TEST,0,0 +2005,TEST,0,0 +2006,TEST,0,0 +2007,TEST,0,0 +2008,TEST,0,0 +2009,TEST,0,0 +2010,TEST,0,0 +2011,TEST,0,0 +2012,TEST,0,0 +2013,TEST,0,0 +2014,TEST,0,0 +2015,TEST,0,0 +2016,TEST,0,0 +2017,TEST,0,0 +2018,TEST,0,0 +2019,TEST,0,0 +2020,TEST,0,0 +2021,TEST,0,0 +2022,TEST,0,0 +2023,TEST,0,0 +2024,TEST,0,0 +2025,TEST,0,0 +2026,TEST,0,0 +2027,TEST,0,0 +2028,TEST,0,0 +2029,TEST,0,0 +2030,TEST,0,0 +2031,TEST,0,0 +2032,TEST,0,0 +2033,TEST,0,0 +2034,TEST,0,0 +2035,TEST,0,0 +2036,TEST,0,0 +2037,TEST,0,0 +2038,TEST,0,0 +2039,TEST,0,0 +2040,TEST,0,0 +2041,TEST,0,0 +2042,TEST,0,0 +2043,TEST,0,0 +2044,TEST,0,0 +2045,TEST,0,0 +2046,TEST,0,0 +2047,TEST,0,0 +2048,TEST,0,0 +2049,TEST,0,0 +2050,TEST,0,0 +2051,TEST,0,0 +2052,TEST,0,0 +2053,TEST,0,0 +2054,TEST,0,0 +2055,TEST,0,0 +2056,TEST,0,0 +2057,TEST,0,0 +2058,TEST,0,0 +2059,TEST,0,0 +2060,TEST,0,0 +2061,TEST,0,0 +2062,TEST,0,0 +2063,TEST,0,0 +2064,TEST,0,0 +2065,TEST,0,0 +2066,TEST,0,0 +2067,TEST,0,0 +2068,TEST,0,0 +2069,TEST,0,0 +2070,TEST,0,0 +2071,TEST,0,0 +2072,TEST,0,0 +2073,TEST,0,0 +2074,TEST,0,0 +2075,TEST,0,0 +2076,TEST,0,0 +2077,TEST,0,0 +2078,TEST,0,0 +2079,TEST,0,0 +2080,TEST,0,0 +2081,TEST,0,0 +2082,TEST,0,0 +2083,TEST,0,0 +2084,TEST,0,0 +2085,TEST,0,0 +2086,TEST,0,0 +2087,TEST,0,0 +2088,TEST,0,0 +2089,TEST,0,0 +2090,TEST,0,0 +2091,TEST,0,0 +2092,TEST,0,0 +2093,TEST,0,0 +2094,TEST,0,0 +2095,TEST,0,0 +2096,TEST,0,0 +2097,TEST,0,0 +2098,TEST,0,0 +2099,TEST,0,0 +2100,TEST,0,0 +2101,TEST,0,0 +2102,TEST,0,0 +2103,TEST,0,0 +2104,TEST,0,0 +2105,TEST,0,0 +2106,TEST,0,0 +2107,TEST,0,0 +2108,TEST,0,0 +2109,TEST,0,0 +2110,TEST,0,0 +2111,TEST,0,0 +2112,TEST,0,0 +2113,TEST,0,0 +2114,TEST,0,0 +2115,TEST,0,0 +2116,TEST,0,0 +2117,TEST,0,0 +2118,TEST,0,0 +2119,TEST,0,0 +2120,TEST,0,0 +2121,TEST,0,0 +2122,TEST,0,0 +2123,TEST,0,0 +2124,TEST,0,0 +2125,TEST,0,0 +2126,TEST,0,0 +2127,TEST,0,0 +2128,TEST,0,0 +2129,TEST,0,0 +2130,TEST,0,0 +2131,TEST,0,0 +2132,TEST,0,0 +2133,TEST,0,0 +2134,TEST,0,0 +2135,TEST,0,0 +2136,TEST,0,0 +2137,TEST,0,0 +2138,TEST,0,0 +2139,TEST,0,0 +2140,TEST,0,0 +2141,TEST,0,0 +2142,TEST,0,0 +2143,TEST,0,0 +2144,TEST,0,0 +2145,TEST,0,0 +2146,TEST,0,0 +2147,TEST,0,0 +2148,TEST,0,0 +2149,TEST,0,0 +2150,TEST,0,0 +2151,TEST,0,0 +2152,TEST,0,0 +2153,TEST,0,0 +2154,TEST,0,0 +2155,TEST,0,0 +2156,TEST,0,0 +2157,TEST,0,0 +2158,TEST,0,0 +2159,TEST,0,0 +2160,TEST,0,0 +2161,TEST,0,0 +2162,TEST,0,0 +2163,TEST,0,0 +2164,TEST,0,0 +2165,TEST,0,0 +2166,TEST,0,0 +2167,TEST,0,0 +2168,TEST,0,0 +2169,TEST,0,0 +2170,TEST,0,0 +2171,TEST,0,0 +2172,TEST,0,0 +2173,TEST,0,0 +2174,TEST,0,0 +2175,TEST,0,0 +2176,TEST,0,0 +2177,TEST,0,0 +2178,TEST,0,0 +2179,TEST,0,0 +2180,TEST,0,0 +2181,TEST,0,0 +2182,TEST,0,0 +2183,TEST,0,0 +2184,TEST,0,0 +2185,TEST,0,0 +2186,TEST,0,0 +2187,TEST,0,0 +2188,TEST,0,0 +2189,TEST,0,0 +2190,TEST,0,0 +2191,TEST,0,0 +2192,TEST,0,0 +2193,TEST,0,0 +2194,TEST,0,0 +2195,TEST,0,0 +2196,TEST,0,0 +2197,TEST,0,0 +2198,TEST,0,0 +2199,TEST,0,0 +2200,TEST,0,0 +2201,TEST,0,0 +2202,TEST,0,0 +2203,TEST,0,0 +2204,TEST,0,0 +2205,TEST,0,0 +2206,TEST,0,0 +2207,TEST,0,0 +2208,TEST,0,0 +2209,TEST,0,0 
+2210,TEST,0,0 +2211,TEST,0,0 +2212,TEST,0,0 +2213,TEST,0,0 +2214,TEST,0,0 +2215,TEST,0,0 +2216,TEST,0,0 +2217,TEST,0,0 +2218,TEST,0,0 +2219,TEST,0,0 +2220,TEST,0,0 +2221,TEST,0,0 +2222,TEST,0,0 +2223,TEST,0,0 +2224,TEST,0,0 +2225,TEST,0,0 +2226,TEST,0,0 +2227,TEST,0,0 +2228,TEST,0,0 +2229,TEST,0,0 +2230,TEST,0,0 +2231,TEST,0,0 +2232,TEST,0,0 +2233,TEST,0,0 +2234,TEST,0,0 +2235,TEST,0,0 +2236,TEST,0,0 +2237,TEST,0,0 +2238,TEST,0,0 +2239,TEST,0,0 +2240,TEST,0,0 +2241,TEST,0,0 +2242,TEST,0,0 +2243,TEST,0,0 +2244,TEST,0,0 +2245,TEST,0,0 +2246,TEST,0,0 +2247,TEST,0,0 +2248,TEST,0,0 +2249,TEST,0,0 +2250,TEST,0,0 +2251,TEST,0,0 +2252,TEST,0,0 +2253,TEST,0,0 +2254,TEST,0,0 +2255,TEST,0,0 +2256,TEST,0,0 +2257,TEST,0,0 +2258,TEST,0,0 +2259,TEST,0,0 +2260,TEST,0,0 +2261,TEST,0,0 +2262,TEST,0,0 +2263,TEST,0,0 +2264,TEST,0,0 +2265,TEST,0,0 +2266,TEST,0,0 +2267,TEST,0,0 +2268,TEST,0,0 +2269,TEST,0,0 +2270,TEST,0,0 +2271,TEST,0,0 +2272,TEST,0,0 +2273,TEST,0,0 +2274,TEST,0,0 +2275,TEST,0,0 +2276,TEST,0,0 +2277,TEST,0,0 +2278,TEST,0,0 +2279,TEST,0,0 +2280,TEST,0,0 +2281,TEST,0,0 +2282,TEST,0,0 +2283,TEST,0,0 +2284,TEST,0,0 +2285,TEST,0,0 +2286,TEST,0,0 +2287,TEST,0,0 +2288,TEST,0,0 +2289,TEST,0,0 +2290,TEST,0,0 +2291,TEST,0,0 +2292,TEST,0,0 +2293,TEST,0,0 +2294,TEST,0,0 +2295,TEST,0,0 +2296,TEST,0,0 +2297,TEST,0,0 +2298,TEST,0,0 +2299,TEST,0,0 +2300,TEST,0,0 +2301,TEST,0,0 +2302,TEST,0,0 +2303,TEST,0,0 +2304,TEST,0,0 +2305,TEST,0,0 +2306,TEST,0,0 +2307,TEST,0,0 +2308,TEST,0,0 +2309,TEST,0,0 +2310,TEST,0,0 +2311,TEST,0,0 +2312,TEST,0,0 +2313,TEST,0,0 +2314,TEST,0,0 +2315,TEST,0,0 +2316,TEST,0,0 +2317,TEST,0,0 +2318,TEST,0,0 +2319,TEST,0,0 +2320,TEST,0,0 +2321,TEST,0,0 +2322,TEST,0,0 +2323,TEST,0,0 +2324,TEST,0,0 +2325,TEST,0,0 +2326,TEST,0,0 +2327,TEST,0,0 +2328,TEST,0,0 +2329,TEST,0,0 +2330,TEST,0,0 +2331,TEST,0,0 +2332,TEST,0,0 +2333,TEST,0,0 +2334,TEST,0,0 +2335,TEST,0,0 +2336,TEST,0,0 +2337,TEST,0,0 +2338,TEST,0,0 +2339,TEST,0,0 +2340,TEST,0,0 +2341,TEST,0,0 +2342,TEST,0,0 +2343,TEST,0,0 +2344,TEST,0,0 +2345,TEST,0,0 +2346,TEST,0,0 +2347,TEST,0,0 +2348,TEST,0,0 +2349,TEST,0,0 +2350,TEST,0,0 +2351,TEST,0,0 +2352,TEST,0,0 +2353,TEST,0,0 +2354,TEST,0,0 +2355,TEST,0,0 +2356,TEST,0,0 +2357,TEST,0,0 +2358,TEST,0,0 +2359,TEST,0,0 +2360,TEST,0,0 +2361,TEST,0,0 +2362,TEST,0,0 +2363,TEST,0,0 +2364,TEST,0,0 +2365,TEST,0,0 +2366,TEST,0,0 +2367,TEST,0,0 +2368,TEST,0,0 +2369,TEST,0,0 +2370,TEST,0,0 +2371,TEST,0,0 +2372,TEST,0,0 +2373,TEST,0,0 +2374,TEST,0,0 +2375,TEST,0,0 +2376,TEST,0,0 +2377,TEST,0,0 +2378,TEST,0,0 +2379,TEST,0,0 +2380,TEST,0,0 +2381,TEST,0,0 +2382,TEST,0,0 +2383,TEST,0,0 +2384,TEST,0,0 +2385,TEST,0,0 +2386,TEST,0,0 +2387,TEST,0,0 +2388,TEST,0,0 +2389,TEST,0,0 +2390,TEST,0,0 +2391,TEST,0,0 +2392,TEST,0,0 +2393,TEST,0,0 +2394,TEST,0,0 +2395,TEST,0,0 +2396,TEST,0,0 +2397,TEST,0,0 +2398,TEST,0,0 +2399,TEST,0,0 +2400,TEST,0,0 +2401,TEST,0,0 +2402,TEST,0,0 +2403,TEST,0,0 +2404,TEST,0,0 +2405,TEST,0,0 +2406,TEST,0,0 +2407,TEST,0,0 +2408,TEST,0,0 +2409,TEST,0,0 +2410,TEST,0,0 +2411,TEST,0,0 +2412,TEST,0,0 +2413,TEST,0,0 +2414,TEST,0,0 +2415,TEST,0,0 +2416,TEST,0,0 +2417,TEST,0,0 +2418,TEST,0,0 +2419,TEST,0,0 +2420,TEST,0,0 +2421,TEST,0,0 +2422,TEST,0,0 +2423,TEST,0,0 +2424,TEST,0,0 +2425,TEST,0,0 +2426,TEST,0,0 +2427,TEST,0,0 +2428,TEST,0,0 +2429,TEST,0,0 +2430,TEST,0,0 +2431,TEST,0,0 +2432,TEST,0,0 +2433,TEST,0,0 +2434,TEST,0,0 +2435,TEST,0,0 +2436,TEST,0,0 +2437,TEST,0,0 +2438,TEST,0,0 +2439,TEST,0,0 +2440,TEST,0,0 +2441,TEST,0,0 +2442,TEST,0,0 +2443,TEST,0,0 +2444,TEST,0,0 +2445,TEST,0,0 +2446,TEST,0,0 
+2447,TEST,0,0 +2448,TEST,0,0 +2449,TEST,0,0 +2450,TEST,0,0 +2451,TEST,0,0 +2452,TEST,0,0 +2453,TEST,0,0 +2454,TEST,0,0 +2455,TEST,0,0 +2456,TEST,0,0 +2457,TEST,0,0 +2458,TEST,0,0 +2459,TEST,0,0 +2460,TEST,0,0 +2461,TEST,0,0 +2462,TEST,0,0 +2463,TEST,0,0 +2464,TEST,0,0 +2465,TEST,0,0 +2466,TEST,0,0 +2467,TEST,0,0 +2468,TEST,0,0 +2469,TEST,0,0 +2470,TEST,0,0 +2471,TEST,0,0 +2472,TEST,0,0 +2473,TEST,0,0 +2474,TEST,0,0 +2475,TEST,0,0 +2476,TEST,0,0 +2477,TEST,0,0 +2478,TEST,0,0 +2479,TEST,0,0 +2480,TEST,0,0 +2481,TEST,0,0 +2482,TEST,0,0 +2483,TEST,0,0 +2484,TEST,0,0 +2485,TEST,0,0 +2486,TEST,0,0 +2487,TEST,0,0 +2488,TEST,0,0 +2489,TEST,0,0 +2490,TEST,0,0 +2491,TEST,0,0 +2492,TEST,0,0 +2493,TEST,0,0 +2494,TEST,0,0 +2495,TEST,0,0 +2496,TEST,0,0 +2497,TEST,0,0 +2498,TEST,0,0 +2499,TEST,0,0 +2500,TEST,0,0 +2501,TEST,0,0 +2502,TEST,0,0 +2503,TEST,0,0 +2504,TEST,0,0 +2505,TEST,0,0 +2506,TEST,0,0 +2507,TEST,0,0 +2508,TEST,0,0 +2509,TEST,0,0 +2510,TEST,0,0 +2511,TEST,0,0 +2512,TEST,0,0 +2513,TEST,0,0 +2514,TEST,0,0 +2515,TEST,0,0 +2516,TEST,0,0 +2517,TEST,0,0 +2518,TEST,0,0 +2519,TEST,0,0 diff --git a/datasets/anomaly/yahoo_sub_5/TEST/problem_TEST/problemDoc.json b/datasets/anomaly/yahoo_sub_5/TEST/problem_TEST/problemDoc.json new file mode 100644 index 0000000..417cb6b --- /dev/null +++ b/datasets/anomaly/yahoo_sub_5/TEST/problem_TEST/problemDoc.json @@ -0,0 +1,65 @@ +{ + "about": { + "problemID": "yahoo_sub_5_problem", + "problemName": "yahoo_sub_5_problem", + "problemDescription": "Anomaly detection", + "problemVersion": "4.0.0", + "problemSchemaVersion": "4.0.0", + "taskKeywords": [ + "classification", + "binary", + "tabular" + ] + }, + "inputs": { + "data": [ + { + "datasetID": "yahoo_sub_5_dataset", + "targets": [ + { + "targetIndex": 0, + "resID": "learningData", + "colIndex": 7, + "colName": "ground_truth" + } + ] + } + ], + "dataSplits": { + "method": "holdOut", + "testSize": 0.2, + "stratified": true, + "numRepeats": 0, + "randomSeed": 42, + "splitsFile": "dataSplits.csv", + "datasetViewMaps": { + "train": [ + { + "from": "yahoo_sub_5_dataset", + "to": "yahoo_sub_5_dataset_TRAIN" + } + ], + "test": [ + { + "from": "yahoo_sub_5_dataset", + "to": "yahoo_sub_5_dataset_TEST" + } + ], + "score": [ + { + "from": "yahoo_sub_5_dataset", + "to": "yahoo_sub_5_dataset_SCORE" + } + ] + } + }, + "performanceMetrics": [ + { + "metric": "f1Macro" + } + ] + }, + "expectedOutputs": { + "predictionsFile": "predictions.csv" + } +} \ No newline at end of file diff --git a/datasets/anomaly/yahoo_sub_5/TRAIN/dataset_TRAIN/datasetDoc.json b/datasets/anomaly/yahoo_sub_5/TRAIN/dataset_TRAIN/datasetDoc.json new file mode 100644 index 0000000..be6f5c0 --- /dev/null +++ b/datasets/anomaly/yahoo_sub_5/TRAIN/dataset_TRAIN/datasetDoc.json @@ -0,0 +1,95 @@ +{ + "about": { + "datasetID": "yahoo_sub_5_dataset_TRAIN", + "datasetName": "NULL", + "description": "Database of baseball players and play statistics, including 'Games_played', 'At_bats', 'Runs', 'Hits', 'Doubles', 'Triples', 'Home_runs', 'RBIs', 'Walks', 'Strikeouts', 'Batting_average', 'On_base_pct', 'Slugging_pct' and 'Fielding_ave'", + "citation": " @book{simonoff2003analyzing,title={Analyzing Categorical Data},author={Simonoff, J.S.},isbn={9780387007496},lccn={2003044946},series={Springer Texts in Statistics},url={https://books.google.com/books?id=G8wrifweAoC},year={2003},publisher={Springer New York}} ", + "license": " CC Public Domain Mark 1.0 ", + "source": "OpenML", + "sourceURI": "http://www.openml.org/d/185", + "approximateSize": "", + 
"datasetSchemaVersion": "4.0.0", + "redacted": false, + "datasetVersion": "4.0.0" + }, + "dataResources": [ + { + "resID": "learningData", + "resPath": "tables/learningData.csv", + "resType": "table", + "resFormat": { + "text/csv": [ + "csv" + ] + }, + "isCollection": false, + "columns": [ + { + "colIndex": 0, + "colName": "d3mIndex", + "colType": "integer", + "role": [ + "index" + ] + }, + { + "colIndex": 1, + "colName": "timestamp", + "colType": "integer", + "role": [ + "attribute" + ] + }, + { + "colIndex": 2, + "colName": "value_0", + "colType": "real", + "role": [ + "attribute" + ] + }, + { + "colIndex": 3, + "colName": "value_1", + "colType": "real", + "role": [ + "attribute" + ] + }, + { + "colIndex": 4, + "colName": "value_2", + "colType": "real", + "role": [ + "attribute" + ] + }, + { + "colIndex": 5, + "colName": "value_3", + "colType": "real", + "role": [ + "attribute" + ] + }, + { + "colIndex": 6, + "colName": "value_4", + "colType": "real", + "role": [ + "attribute" + ] + }, + { + "colIndex": 7, + "colName": "ground_truth", + "colType": "integer", + "role": [ + "suggestedTarget" + ] + } + ], + "columnsCount": 8 + } + ] +} \ No newline at end of file diff --git a/datasets/anomaly/yahoo_sub_5/TRAIN/dataset_TRAIN/tables/learningData.csv b/datasets/anomaly/yahoo_sub_5/TRAIN/dataset_TRAIN/tables/learningData.csv new file mode 100644 index 0000000..c07dc45 --- /dev/null +++ b/datasets/anomaly/yahoo_sub_5/TRAIN/dataset_TRAIN/tables/learningData.csv @@ -0,0 +1,1261 @@ +d3mIndex,timestamp,value_0,value_1,value_2,value_3,value_4,ground_truth +0,1,12183,0.0,3.7166666666667,5,2109,0 +1,2,12715,0.091757964510557,3.6108333333333,60,3229,0 +2,3,12736,0.17229675238449998,3.4813888888889,88,3637,0 +3,4,12716,0.22621935431999,3.3802777777778,84,1982,0 +4,5,12739,0.17635798469946,3.1933333333333,111,2751,0 +5,6,12737,0.090491245476051,2.7866666666667004,112,2128,0 +6,7,12857,0.08460994072769001,2.4627777777777995,1235,2109,0 +7,8,12884,0.06842699169496,2.2541666666667,710,2328,0 +8,9,12894,0.13330269689422,2.1180555555556,618,2453,0 +9,10,12675,0.085026586189321,2.0691666666667,84,2847,0 +10,11,13260,0.097073068447328,2.1972222222222,100,3659,0 +11,12,13470,0.0,2.3188888888889,125,5207,0 +12,13,13060,0.031063767542922,2.34,114,5146,0 +13,14,12949,0.017732750501525,2.4902777777778,145,4712,0 +14,15,13035,0.063354504072079,2.6438888888889,91,6363,0 +15,16,12980,0.087870391896335,2.8486111111111003,94,5010,0 +16,17,13677,0.11546815687729,2.8833333333333,79,3956,0 +17,18,13381,0.073413457727404,2.8808333333333,50,4063,0 +18,19,12737,0.040392584616896,2.9005555555556,39,3748,0 +19,20,12554,0.08911335594722301,3.0855555555556,28,3047,0 +20,21,12470,0.098030053711531,3.3536111111111,29,4099,0 +21,22,12490,0.047140641497552,3.7438888888889,24,2122,0 +22,23,12539,0.10481279080241,3.7947222222222,19,3387,0 +23,24,12530,0.20478886838928,3.801111111111101,21,1950,0 +24,25,13002,0.04485100631921201,3.6508333333333,27,2927,0 +25,26,12989,0.1053622140254,3.555,46,1889,0 +26,27,13038,0.08436887679639,3.4769444444444,133,1910,0 +27,28,13011,0.097980673762982,3.2158333333333,143,3747,0 +28,29,12984,0.10165726215275,3.1141666666667,86,4994,0 +29,30,13079,0.056764513454874,2.7983333333333,118,2009,0 +30,31,13048,0.074428708878932,2.4252777777778,56,2899,0 +31,32,13096,0.091244453451818,2.14,92,2298,0 +32,33,13003,0.094529332881679,1.9822222222222,85,1894,0 +33,34,13057,0.016638011234698,1.9694444444444,122,1999,0 +34,35,13023,0.038096861957006005,2.0741666666667,74,3007,0 
+35,36,13033,0.064497814457643,2.2505555555556,84,2838,0 +36,37,13034,0.030426401876334,2.2819444444444,54,4113,0 +37,38,13068,0.095423209955973,2.4216666666667,77,2150,0 +38,39,13057,0.069688744272108,2.5997222222222005,84,3007,0 +39,40,13047,0.03468622413034,2.7544444444444003,139,2484,0 +40,41,13795,0.089564461084836,2.7258333333333,65,2101,0 +41,42,13528,0.07337616196456799,2.8302777777778,38,2001,0 +42,43,13032,0.061939295606039,2.9422222222222,35,2102,0 +43,44,13084,0.11419089175512,3.0919444444444,47,2129,0 +44,45,13000,0.10475925920163,3.3519444444444,37,4422,0 +45,46,13008,0.079657960399444,3.6952777777778,53,4573,0 +46,47,12978,0.14475546275416,3.8269444444444,55,1989,0 +47,48,13067,0.1421711341096,3.7877777777778,45,1953,0 +48,49,13086,0.07696963969656899,3.7536111111111,46,1872,0 +49,50,13023,0.06393273436444799,3.61,35,1850,0 +50,51,13046,0.14973281021845006,3.5091666666667,68,2879,0 +51,52,13032,0.041478839355346,3.4205555555556,82,1840,0 +52,53,13012,0.089317973365284,3.2647222222222,154,2134,0 +53,54,13051,0.088820248166203,2.7944444444444,128,2234,0 +54,55,12979,0.054872994406929,2.46,79,3769,0 +55,56,13025,0.07913553329046401,2.2075,66,2717,0 +56,57,13007,0.16317996709063,2.1758333333333,92,2171,0 +57,58,13036,0.08671926699280201,2.3058333333333,67,2224,0 +58,59,13043,0.0733999511789,2.3983333333333,58,1967,0 +59,60,13023,0.0,2.55,58,2148,0 +60,61,13022,0.032756244361869,2.7302777777778,63,1978,0 +61,62,13033,0.054893891024455,2.8169444444444003,61,2021,0 +62,63,13024,0.068514114108229,2.9247222222222,55,2060,0 +63,64,13048,0.05279414163165401,2.8911111111111003,71,2096,0 +64,65,13740,0.023853017353212,2.9575,64,2082,0 +65,66,13540,0.07426125441559799,2.9080555555556,92,2175,0 +66,67,12724,0.024228588329879,3.0088888888889,44,2332,0 +67,68,13070,0.09233413002519697,3.2033333333333,35,2147,0 +68,69,13106,0.15930655332113,3.6213888888889,53,2163,0 +69,70,13025,0.12755838225296,4.0322222222222,49,2406,0 +70,71,13074,0.10152541717054,4.1227777777778,49,2022,0 +71,72,13079,0.040148453968243986,3.9736111111111,103,2188,0 +72,73,13184,0.087208372094752,3.8425,107,2758,0 +73,74,13194,0.074209918996797,3.7097222222222,74,2925,0 +74,75,13191,0.059044537369404015,3.6258333333333,56,3223,0 +75,76,13059,0.06248169832921499,3.4705555555556,60,2507,0 +76,77,13169,0.08876527685714597,3.2877777777778,73,2435,0 +77,78,13114,0.051354431854972,2.9286111111111004,99,2552,0 +78,79,13037,0.074790104163639,2.4888888888889,84,2540,0 +79,80,13179,0.091817341555971,2.2744444444444,129,2642,0 +80,81,13152,0.14762794333026005,2.1733333333333,101,2254,0 +81,82,13095,0.07101004447510299,2.3416666666667,101,2539,0 +82,83,13144,0.07689756334240598,2.3808333333333,51,2596,0 +83,84,13170,0.08412575787388403,2.4663888888889,95,2573,0 +84,85,13162,0.06328921386603299,2.6608333333333,48,2302,0 +85,86,13117,0.057393902128707,2.7558333333333,40,2991,0 +86,87,13129,0.041819399065704,2.8636111111111004,55,3141,0 +87,88,13386,0.073729686380986,2.7586111111111005,56,3285,0 +88,89,13929,0.15365285617975,2.7377777777778,935,3807,0 +89,90,13385,0.060355859742407016,2.6961111111111005,34,2892,0 +90,91,13106,0.10644586288975,2.8569444444444,57,2538,0 +91,92,13113,0.059314286360126985,3.1833333333333,70,2234,0 +92,93,13155,0.096293806236591,3.5544444444444,72,2707,0 +93,94,13186,0.085101425467407,3.8894444444444,66,2382,0 +94,95,13151,0.11149072274185,4.1138888888889,72,2426,0 +95,96,13156,0.076266981262989,3.9519444444444,49,2451,0 +96,97,12813,0.097952120177625,3.8275,41,2288,0 
+97,98,12821,0.17250021935572,3.6438888888889,42,2256,0 +98,99,12867,0.11389182319254,3.5608333333333,39,2884,0 +99,100,12837,0.08999961787521,3.5013888888889,81,2398,0 +100,101,12911,0.048649372449385005,3.3088888888889,90,2239,0 +101,102,12842,0.13861764684085998,2.9063888888889,92,2248,0 +102,103,12905,0.1088795585287,2.5027777777777995,81,2387,0 +103,104,12993,0.054235162564995,2.2466666666667003,145,3876,0 +104,105,12974,0.0390040506742,2.1869444444444,47,3073,0 +105,106,13039,0.0744713077811,2.2402777777778,63,3113,0 +106,107,13322,0.040258943675435,2.3727777777778,118,3363,0 +107,108,13606,0.0,2.4566666666667003,56,3796,0 +108,109,13536,0.027955712584728,2.5452777777777995,127,4924,0 +109,110,13341,0.047309968420241,2.6830555555556,48,4300,0 +110,111,13360,0.016602764360002,2.805,114,5225,0 +111,112,13450,0.042432577628353986,2.7386111111111004,78,4047,0 +112,113,14102,0.051191743726563,2.7438888888888995,58,4134,0 +113,114,14026,0.0,2.7586111111111005,56,4786,0 +114,115,13162,0.056724832354639,2.9013888888889,67,4184,0 +115,116,13118,0.055771058827737,3.19,155,2888,0 +116,117,12953,0.081014772096658,3.5561111111111003,123,2674,0 +117,118,12854,0.08253629738290899,3.8433333333333,118,2574,0 +118,119,12952,0.11499203730886,4.0319444444444,133,3123,0 +119,120,12915,0.07668513845109799,3.8844444444444,75,3369,0 +120,121,11994,0.070057457403873,3.6908333333333,29,3284,0 +121,122,11868,0.07031477357556501,3.6141666666667,68,2127,0 +122,123,11977,0.091946448716499,3.5019444444444,91,2117,0 +123,124,11874,0.14560588482235998,3.4205555555556,101,2271,0 +124,125,11913,0.094774329323472,3.1780555555556,22,2513,0 +125,126,11933,0.10217989327054,2.8361111111111,20,2746,0 +126,127,11844,0.04854243074027901,2.5222222222222004,27,2076,0 +127,128,11968,0.068760549683423,2.2416666666667004,45,2297,0 +128,129,11996,0.075440683881139,2.1588888888889,42,2312,0 +129,130,12006,0.11771339431815,2.2763888888889,59,2834,0 +130,131,12225,0.069437397660265,2.3391666666667,52,3584,0 +131,132,12482,0.0,2.4841666666667,62,4009,0 +132,133,12289,0.0,2.4911111111111,81,4142,0 +133,134,12219,0.0,2.6922222222222,84,3876,0 +134,135,12282,0.027395404320488,2.8205555555556,104,4098,0 +135,136,12367,0.055202605299814,2.8216666666667,111,3831,0 +136,137,13042,0.078387348178452,2.7122222222222,91,3842,0 +137,138,12665,0.11851571646444,2.6744444444444,33,4129,0 +138,139,12133,0.068395341911942,2.8097222222222,26,3509,0 +139,140,12023,0.04720597158087901,3.1838888888889,37,2450,0 +140,141,11847,0.07910648512645599,3.5130555555556,23,2270,0 +141,142,11980,0.067550601916344,3.7722222222222,29,2360,0 +142,143,12026,0.080666570182724,3.9058333333333,45,2431,0 +143,144,11852,0.044973875852863,3.7697222222222,49,2042,0 +144,145,12152,0.065734580284861,3.6027777777778,27,1833,0 +145,146,12148,0.068759646748575,3.5038888888889,46,1852,0 +146,147,12236,0.027278224398313,3.445,39,1927,0 +147,148,12155,0.067695565422881,3.3494444444444,72,1999,0 +148,149,12113,0.07244669924777,3.1961111111111005,81,2030,0 +149,150,12175,0.028882930937168,2.8905555555556,64,1963,0 +150,151,12103,0.021568136982842,2.5805555555556,79,2116,0 +151,152,12206,0.064254625408408,2.3380555555556004,132,2461,0 +152,153,12239,0.073869151016554,2.2116666666667,127,2388,0 +153,154,12398,0.026644044055307004,2.2013888888889,121,2846,0 +154,155,12582,0.051289858799957,2.3236111111111,98,2974,0 +155,156,12705,0.099217337562612,2.3002777777778,128,3776,0 +156,157,12555,0.016615805334675,2.385,158,3885,0 +157,158,12476,0.078387348178452,2.5597222222222005,78,3865,0 
+158,159,12706,0.0,2.6941666666667,65,4319,0 +159,160,12671,0.049384244324413,2.7169444444444,81,4646,0 +160,161,13277,0.043044731483849,2.6369444444444,586,3873,0 +161,162,12757,0.04215504851616,2.6572222222222,48,3489,0 +162,163,12401,0.042236538352835,2.8466666666667004,38,2790,0 +163,164,12248,0.1001564296112,3.1955555555556,30,2641,0 +164,165,12156,0.17378132267942994,3.5633333333333,28,2960,0 +165,166,12210,0.12005519462968,3.8113888888889,36,2192,0 +166,167,11983,0.14491137762023998,3.9655555555556,50,2145,0 +167,168,12374,0.07336941078506799,3.8483333333333,47,2133,0 +168,169,12230,0.12395626148952,3.6441666666667,82,2330,0 +169,170,12200,0.15077430423660998,3.5213888888889,56,2235,0 +170,171,12135,0.18960071033689,3.4702777777778,140,2258,0 +171,172,12131,0.06051348935254,3.3033333333333,145,2200,0 +172,173,12165,0.072057993662839,3.1933333333333,114,2161,0 +173,174,12193,0.082361078437032,2.8183333333333,129,2159,0 +174,175,12165,0.12343775199876,2.52,143,2088,0 +175,176,12304,0.1071817784483,2.2886111111111,113,2473,0 +176,177,12275,0.10359394556779,2.0822222222222,108,3217,0 +177,178,12369,0.021162435488903,2.1416666666667,93,2994,0 +178,179,12569,0.074524398314698,2.2688888888889,63,3827,0 +179,180,12766,0.12687067454443,2.335,103,4176,0 +180,181,12621,0.041752618326160014,2.4388888888889,114,4227,0 +181,182,12611,0.0,2.5386111111111,67,4290,0 +182,183,12618,0.040819652463459,2.6288888888889,106,4691,0 +183,184,12631,0.082668981599835,2.7511111111111,160,4442,0 +184,185,13121,0.06181362481077901,2.7744444444444,81,5775,0 +185,186,12871,0.0,2.8297222222222,113,3840,0 +186,187,12252,0.076137992226715,2.9708333333333,37,3721,0 +187,188,12155,0.12107639529965,3.1333333333333,70,2498,0 +188,189,12186,0.0,3.3544444444444,82,2265,0 +189,190,12179,0.19840339729984,3.6780555555556,76,2451,0 +190,191,12109,0.20112394005693,3.8038888888889,59,2892,0 +191,192,12142,0.096833471661634,3.8177777777778,58,2166,0 +192,193,12145,0.10338450919956,3.6916666666667,49,2040,0 +193,194,12162,0.10142513773096,3.5197222222222,36,2013,0 +194,195,12165,0.09779274451732,3.5186111111111003,111,2000,0 +195,196,12125,0.14744152252573,3.2597222222222,81,2117,0 +196,197,12097,0.083396348606149,3.0930555555556,92,2775,0 +197,198,12099,0.095637498006913,2.7825,113,2116,0 +198,199,12140,0.14768844039376006,2.4494444444444,90,1991,0 +199,200,12188,0.1131872329372,2.2369444444444,183,3162,0 +200,201,12157,0.073729686380986,2.0961111111111,117,2958,0 +201,202,12128,0.064614077523704,2.0377777777778,110,3153,0 +202,203,12190,0.056019959597275015,2.0730555555556003,179,2190,0 +203,204,12151,0.074812141908008,2.1655555555556,134,2172,0 +204,205,12214,0.02489388427845201,2.285,135,2074,0 +205,206,12275,0.023695834967821,2.4283333333333,100,2078,0 +206,207,12164,0.058680009072634,2.6186111111111,47,2406,0 +207,208,12120,0.10008779345816002,2.7372222222222,88,2018,0 +208,209,12693,0.066566772961868,2.8266666666667004,74,2091,0 +209,210,12624,0.070501147961051,2.8469444444444,58,2310,0 +210,211,12163,0.098779019649936,2.9855555555556,100,2113,0 +211,212,12100,0.11803653713501,3.1038888888889,49,2518,0 +212,213,12162,0.10076746585103,3.4058333333333,36,2605,0 +213,214,12106,0.053210709415363,3.6138888888889,40,2680,0 +214,215,12156,0.099346579713514,3.93,50,2228,0 +215,216,12120,0.047275248011591,3.8155555555556,58,2023,0 +216,217,12420,0.091262209791582,3.6588888888889,50,3702,0 +217,218,12417,0.038593218846488,3.5913888888889,53,1992,0 +218,219,12450,0.070273907645883,3.4644444444444003,93,1988,0 
+219,220,12395,0.029431888410363,3.3944444444444,78,1919,0 +220,221,12382,0.096854769984307,3.2227777777778,84,2213,0 +221,222,12438,0.11656453357642,2.7961111111111,112,2181,0 +222,223,12363,0.12109055114779,2.4383333333333,73,2152,0 +223,224,12393,0.20381554615786,2.2647222222222005,91,2393,0 +224,225,12399,0.046311768005022014,2.1886111111111,114,2173,0 +225,226,12456,0.18261306403662,2.2825,127,2109,0 +226,227,12442,0.021992750543024,2.3333333333333,69,3606,0 +227,228,12481,0.088072259040681,2.445,59,2114,0 +228,229,12432,0.037896500450725,2.5811111111111,64,2135,0 +229,230,12403,0.09882843339863,2.7094444444444,75,2303,0 +230,231,12406,0.076277687882641,2.88,44,2137,0 +231,232,12462,0.022875979046571,2.8555555555556,52,2264,0 +232,233,13034,0.10022162220861,2.7791666666667,42,2245,0 +233,234,12830,0.08117200437078799,2.7772222222222,45,2151,0 +234,235,12439,0.09750667785645803,3.02,26,2330,0 +235,236,12541,0.05680722879784299,3.2213888888888995,29,3357,0 +236,237,12462,0.12240855732315,3.6211111111111,32,3152,0 +237,238,12394,0.1715485140175,4.0219444444444,44,2693,0 +238,239,12507,0.075015592829224,4.0980555555556,41,3798,0 +239,240,12512,0.11388410095531,3.9080555555556,42,4596,0 +240,241,12093,0.10519027968795,3.7269444444444,46,2529,0 +241,242,12197,0.1150532998405,3.6244444444444,40,2124,0 +242,243,12138,0.10890530980571,3.5252777777778,64,2762,0 +243,244,12174,0.099350621485086,3.4675,70,2973,0 +244,245,12163,0.12889794040441002,3.3316666666667003,69,3041,0 +245,246,12096,0.12069378235889,2.9497222222222,73,2179,0 +246,247,12166,0.13053034917739,2.5708333333333,85,2322,0 +247,248,12187,0.078977758004111,2.3086111111111,63,2274,0 +248,249,12246,0.08088416337864099,2.2311111111111,67,2448,0 +249,250,12335,0.04008956024204,2.3119444444444,68,3811,0 +250,251,12556,0.05063725351997099,2.3536111111111,62,3761,0 +251,252,12652,0.039066291775136,2.4819444444444,69,4269,0 +252,253,12646,0.028611752774164,2.6605555555556,82,4244,0 +253,254,12803,0.040593364983329,2.7527777777778,56,4417,0 +254,255,12570,0.038807415292018,3.0741666666667005,38,3758,0 +255,256,12633,0.07832796288132203,2.8522222222222,30,4375,0 +256,257,13146,0.066320996162546,2.7277777777778,48,4158,0 +257,258,12994,0.083175583471284,2.7502777777778,63,3410,0 +258,259,12314,0.06802464587725401,2.8797222222222,34,2853,0 +259,260,12193,0.051675070535006,3.2027777777778,11,2628,0 +260,261,12127,0.044129112207997014,3.5633333333333,22,2287,0 +261,262,12140,0.037685894365982006,3.8808333333333,22,3334,0 +262,263,12174,0.093414561465838,4.0352777777778,12,2795,0 +263,264,12180,0.06987083046098,3.8966666666667,10,2089,0 +264,265,12861,0.021992750543024,3.7225,14,2260,0 +265,266,12957,0.11305566197523,3.73,39,3176,0 +266,267,12981,0.030884138240845,3.5558333333333,55,4049,0 +267,268,12958,0.10381377439313,3.3169444444444003,90,2902,0 +268,269,12913,0.048953768695625004,3.2322222222222,68,3743,0 +269,270,12939,0.042258794089861,2.8658333333333,95,4280,0 +270,271,12933,0.048388685585470985,2.5169444444444,70,3977,0 +271,272,13006,0.034197830567692,2.3,96,4518,0 +272,273,13091,0.08835953066771099,2.1888888888889,45,2707,0 +273,274,13201,0.086890518272785,2.2030555555556,96,3522,0 +274,275,13520,0.031087561676959,2.2711111111111,74,4584,0 +275,276,13675,0.071287463233942,2.4697222222222,82,4141,0 +276,277,13594,0.14372616993938,2.5988888888889,82,4831,0 +277,278,13466,0.12647517487142998,2.7258333333333,45,3991,0 +278,279,13448,0.042854531198562,2.7858333333333,134,4645,0 +279,280,13492,0.039930389849144,2.7922222222222,119,4967,0 
+280,281,14123,0.076184645265048,2.6988888888889,86,4578,0 +281,282,13839,0.037830020408535,2.7663888888889,75,4972,0 +282,283,13335,0.030884138240845,2.8938888888889,45,5522,0 +283,284,13196,0.048316550276279,3.1875,50,2832,0 +284,285,13047,0.10986585566763,3.6463888888889,31,2826,0 +285,286,13008,0.025485002897852004,3.866666666666701,88,2855,0 +286,287,12763,0.12451757643335,3.9808333333333,42,2660,0 +287,288,12949,0.12875690949235,3.8277777777778,70,2447,0 +288,289,13009,0.15720639094135,3.6269444444444,106,2545,0 +289,290,13008,0.079092017261926,3.5266666666667,44,3842,0 +290,291,12890,0.14711499890479998,3.5077777777778,57,2332,0 +291,292,13004,0.0531410973178,3.3455555555556,95,2294,0 +292,293,12918,0.10136246281349,3.1241666666667003,91,3016,0 +293,294,12910,0.053119315802353,2.8713888888889,66,3944,0 +294,295,12915,0.11313351589999003,2.5133333333333,66,2332,0 +295,296,13121,0.076760188212735,2.2197222222222,82,2405,0 +296,297,13076,0.08890522133351199,2.205,73,2572,0 +297,298,13096,0.1009555130175,2.2677777777778,69,2558,0 +298,299,13339,0.15685427502807,2.2991666666667,107,3701,0 +299,300,13635,0.11090638960365,2.4277777777778,101,4228,0 +300,301,13493,0.054798089981891,2.5333333333333,66,3990,0 +301,302,13402,0.08461316628091001,2.6422222222222005,47,4707,0 +302,303,13417,0.15790425505315,2.8211111111111005,47,3857,0 +303,304,13382,0.021675109392134,2.7625,66,3874,0 +304,305,14199,0.14112049645292002,2.7391666666667,102,4369,0 +305,306,13973,0.059612111520904,2.7525,71,4488,0 +306,307,13284,0.067835890522602,2.8644444444444,53,3637,0 +307,308,13070,0.047414460026828,3.1927777777778,28,2705,0 +308,309,12983,0.050348669783997005,3.5872222222222,24,2429,0 +309,310,13075,0.07296715773193299,3.8305555555556,23,2839,0 +310,311,12991,0.10713527159169,3.8827777777778,30,2371,0 +311,312,12993,0.073622496612493,3.7291666666667,25,2758,0 +312,313,13121,0.11556476355437,3.6172222222222,29,2291,0 +313,314,13097,0.034160489683707995,3.4491666666667005,27,2220,0 +314,315,13150,0.019571935182124,3.4097222222222,77,2620,0 +315,316,13078,0.15720996206912,3.2605555555556,46,2467,0 +316,317,13140,0.11515041454164,3.2191666666667,86,2088,0 +317,318,13102,0.086415715789296,2.9586111111111,97,2137,0 +318,319,13110,0.092606306920552,2.6036111111111,88,2907,0 +319,320,13138,0.046458579038692015,2.3319444444444,110,2558,0 +320,321,13238,0.10977831600416,2.2025,89,2823,0 +321,322,13317,0.11090009191451,2.2711111111111,134,2465,0 +322,323,13512,0.076652795374797,2.2897222222222005,84,4399,0 +323,324,13669,0.1087202400467,2.3297222222222005,109,4088,0 +324,325,13651,0.11471628863897,2.395,57,5099,0 +325,326,13580,0.11070024667119,2.5063888888889,49,5157,0 +326,327,13538,0.026827723134058,2.7077777777778,83,3782,0 +327,328,13657,0.029426630692549,2.735,101,4008,0 +328,329,14183,0.028611752774164,2.6958333333333,88,4534,0 +329,330,14117,0.053106181092382014,2.6930555555556,56,3242,0 +330,331,13166,0.055538160906184006,2.875,31,2808,0 +331,332,13265,0.11009690391165,3.1788888888888995,22,3676,0 +332,333,13085,0.10979978093137,3.5808333333333,32,3523,0 +333,334,13167,0.036174223284821,3.8508333333333,27,3038,0 +334,335,13170,0.048361321378982,3.9180555555556,17,2299,0 +335,336,13132,0.10958125953198,3.815,27,2345,0 +336,337,13055,0.047305343559722,3.6080555555556,38,2565,0 +337,338,13025,0.045316868664604014,3.4927777777778,73,2576,0 +338,339,13076,0.13255054531036,3.4316666666667004,56,2327,0 +339,340,13044,0.079695587369141,3.3436111111111004,49,2211,0 
+340,341,13035,0.10277355185943,3.0663888888889,90,2642,0 +341,342,13103,0.15061124796385,2.7894444444444,106,3646,0 +342,343,13067,0.14509169704095,2.4994444444444,51,2281,0 +343,344,13183,0.054445250001619004,2.2544444444444,99,2474,0 +344,345,13144,0.082058799915824,2.0847222222222,104,2536,0 +345,346,13166,0.042151311782819015,2.0888888888889,119,2900,0 +346,347,13406,0.057404703309705984,2.1594444444444,73,3144,0 +347,348,13544,0.040891918425583,2.2533333333333,92,3725,0 +348,349,13608,0.045224636676715,2.3880555555556,57,4305,0 +349,350,13522,0.0,2.6338888888889,100,3665,0 +350,351,13595,0.0,2.6588888888889,93,3791,0 +351,352,13420,0.10335456693443,2.7586111111111005,111,3897,0 +352,353,14163,0.033846222120808,2.8797222222222,91,3494,0 +353,354,13678,0.026167129419328,2.785,43,3353,0 +354,355,13272,0.08571767780871499,2.8219444444444,91,2741,0 +355,356,13071,0.12459953631184,3.0055555555556,63,2463,0 +356,357,13004,0.054750658073534006,3.2936111111111,60,3477,0 +357,358,13068,0.20799106772677,3.5575,56,2792,0 +358,359,13031,0.10314231079956,3.676111111111101,59,2183,0 +359,360,13013,0.12212653292147,3.7166666666667,48,2874,0 +360,361,12998,0.19159058299176,3.6013888888889,65,2147,0 +361,362,12971,0.10782180851978,3.4455555555556,77,2754,0 +362,363,13000,0.06408869538637901,3.4166666666667003,60,2007,0 +363,364,12998,0.095540168894753,3.1791666666667004,94,2564,0 +364,365,12906,0.039360296791109,3.0013888888889,84,3020,0 +365,366,12969,0.086611479249287,2.72,99,2004,0 +366,367,12963,0.05845507441603001,2.4527777777778,61,2047,0 +367,368,12933,0.051490800079599004,2.1816666666667,60,3531,0 +368,369,12990,0.075496432869001,2.0161111111111,78,2383,0 +369,370,12980,0.10358625218721,1.9769444444444,81,2112,0 +370,371,12982,0.062806431427897,2.0597222222222,61,2554,0 +371,372,12989,0.08970338978685001,2.2111111111111,68,2371,0 +372,373,13073,0.094517316130968,2.3141666666667,53,2060,0 +373,374,12950,0.032322011663911,2.4280555555556003,49,2086,0 +374,375,12990,0.047911560407608,2.5855555555556,40,2130,0 +375,376,13035,0.062001214431213,2.6977777777778,125,2072,0 +376,377,13681,0.027102718749392,2.7777777777778,61,2033,0 +377,378,13304,0.034703114844079,2.7988888888889,111,2683,0 +378,379,12965,0.066236017573192,2.8927777777778,32,2046,0 +379,380,12966,0.032230355211769,3.0413888888889,21,2064,0 +380,381,12943,0.11559664215716,3.3569444444444,14,2067,0 +381,382,12958,0.021952502374124,3.4808333333333,32,2496,0 +382,383,13005,0.13347711194703,3.764166666666701,29,4758,0 +383,384,12923,0.10579408349834,3.8097222222222,26,2806,0 +384,385,12812,0.10679035350244,3.6911111111111,52,2227,0 +385,386,12803,0.068633627680319,3.4902777777778,39,3123,0 +386,387,12850,0.04699518011436099,3.3769444444444,78,3460,0 +387,388,12797,0.14159640074335994,3.3011111111111004,78,3587,0 +388,389,12732,0.078500039299167,3.1369444444444,83,2558,0 +389,390,12817,0.049232295047845,2.8475,63,2306,0 +390,391,12818,0.078777592482879,2.4544444444444,108,2083,0 +391,392,12815,0.08993433499951,2.1247222222222,158,3073,0 +392,393,12805,0.081869163858473,2.0266666666667,115,3325,0 +393,394,12703,0.14556064903749,2.1763888888889,112,2321,0 +394,395,12771,0.0,2.3088888888889,73,2846,0 +395,396,12847,0.0,2.4213888888889,93,2482,0 +396,397,12872,0.030693547421212,2.6436111111111,65,2306,0 +397,398,12815,0.0,2.6602777777778,91,2298,0 +398,399,12844,0.046999447831427,2.7677777777778,106,2907,0 +399,400,12811,0.028815579681692,2.8066666666667004,66,2329,0 +400,401,13472,0.0,2.7661111111111003,26,2456,0 
+401,402,13063,0.039360296791109,2.8133333333333,23,2178,0 +402,403,12833,0.039570832199428,2.9186111111111,24,2142,0 +403,404,12842,0.090659246308087,3.1930555555556,19,2277,0 +404,405,12804,0.10540579050057003,3.565,23,3066,0 +405,406,12852,0.062601610466313,3.9133333333333,30,3619,0 +406,407,12862,0.051455855638306,3.9658333333333,23,3726,0 +407,408,12799,0.054631758648785014,3.8930555555556,35,2282,0 +408,409,12789,0.09017822949731,3.7297222222222,41,3079,0 +409,410,12815,0.045287525091609014,3.6516666666667,63,2448,0 +410,411,12887,0.033344698319951,3.5927777777778,33,2574,0 +411,412,12903,0.080098394586215,3.4694444444444,50,3697,0 +412,413,12892,0.025162301034707,3.2536111111111,88,3067,0 +413,414,12907,0.078260793447992,2.8986111111111,115,3491,0 +414,415,12883,0.07223863924679201,2.4488888888889,69,3195,0 +415,416,12965,0.042917873674349,2.2119444444444,116,2763,0 +416,417,12932,0.04720597158087901,2.2011111111111,73,2605,0 +417,418,13134,0.048273008229067,2.2338888888889,75,2755,0 +418,419,13440,0.036987975876273,2.3116666666667003,56,3300,0 +419,420,13544,0.06291463671717,2.3869444444444,66,3838,0 +420,421,13508,0.033319304393751,2.5119444444444,70,3608,0 +421,422,13401,0.029115275623859,2.5713888888889,52,3845,0 +422,423,13410,0.06821638123436,2.5088888888889,32,3563,0 +423,424,13482,0.015408589348188,2.4155555555556,16,5478,0 +424,425,14124,0.01916018435633,3.6455555555556,46,3656,0 +425,426,13703,0.06374239746477901,2.4625,53,3491,0 +426,427,13250,0.099738890728803,2.5808333333333,67,3430,0 +427,428,13092,0.10950621554455,3.0033333333333,58,2807,0 +428,429,13012,0.06138920621589401,3.3486111111111003,17,2524,0 +429,430,12901,0.051307638060244014,3.6644444444444,26,2964,0 +430,431,12848,0.082471571552878,4.0083333333333,13,3969,0 +431,432,13025,0.060122448878635,3.8530555555556,8,3561,0 +432,433,11352,0.07469842969719999,3.6183333333333,20,3394,0 +433,434,8761,0.056170625137636994,3.4922222222222,23,3005,0 +434,435,10433,0.052668952946361,3.4958333333333,34,2350,0 +435,436,10088,0.068871884486763,3.2738888888889,35,2139,0 +436,437,9485,0.040236057110938986,3.2102777777778,48,2098,0 +437,438,8865,0.053200012471363,2.8475,67,2341,0 +438,439,8920,0.056725172482788,2.4883333333332995,38,2698,0 +439,440,8798,0.035229341473877,2.1955555555556003,33,2968,0 +440,441,8927,0.0,2.1461111111111,40,2824,0 +441,442,9211,0.020190723068726,2.1522222222222,37,3003,0 +442,443,9286,0.093342961377898,2.3122222222222004,51,3551,0 +443,444,9725,0.0,2.4033333333333,52,4689,0 +444,445,11050,0.015717168144981003,2.4944444444444,57,3481,0 +445,446,11521,0.017190609993733997,2.6622222222222005,82,3376,0 +446,447,11603,0.0,2.675,74,3198,0 +447,448,11665,0.043273461915965,2.6997222222222,80,3059,0 +448,449,12153,0.029854520963498,2.6997222222222,78,2937,0 +449,450,11672,0.017383620014121998,2.7194444444444,58,2881,0 +450,451,11119,0.046391383573699006,2.8258333333333,41,2777,0 +451,452,11124,0.042155878228,3.1044444444444,34,2510,0 +452,453,10734,0.052684222339579014,3.4736111111111003,35,2356,0 +453,454,11612,0.063573954212613,3.6972222222222,40,2383,0 +454,455,11523,0.077413583128967,3.8038888888889,35,2455,0 +455,456,11632,0.069605078732108,3.7494444444444,37,2285,0 +456,457,12838,0.075937967855042,3.6813888888889,43,2455,0 +457,458,11637,0.047354002438352014,3.4791666666667003,45,4298,0 +458,459,12542,0.044000040388062,3.4530555555556,48,2400,0 +459,460,12394,0.095130971924595,3.2841666666667004,77,3431,0 +460,461,12419,0.069274987547704,3.205,79,2252,0 
+461,462,12484,0.061118974117397,2.8436111111111004,59,2628,0 +462,463,12413,0.056393740750134,2.4441666666667,107,3266,0 +463,464,12440,0.06125086589409901,2.275,100,2620,0 +464,465,12614,0.047746883512707,2.1788888888889,84,2824,0 +465,466,12693,0.047136440673386,2.2083333333333,99,2801,0 +466,467,12989,0.0,2.2997222222222,103,3106,0 +467,468,13200,0.0,2.3155555555556004,47,3532,0 +468,469,13108,0.049828520132601,2.41,67,4210,0 +469,470,12886,0.0,2.5902777777778,65,3646,0 +470,471,13000,0.0,2.6636111111111,65,3768,0 +471,472,13071,0.043576825212604,2.7105555555556,70,5342,0 +472,473,13563,0.035173891965945,2.6811111111111,76,5327,0 +473,474,13333,0.04413510379665099,2.715,40,3363,0 +474,475,12672,0.016955671451488998,2.7083333333333,54,3016,0 +475,476,12547,0.1330396486107,3.0038888888889,45,3257,0 +476,477,12289,0.016462114132943,3.3911111111111003,32,2619,0 +477,478,12584,0.055696363369897,3.6375,26,2573,0 +478,479,12526,0.036411774365825,3.7755555555556,25,2575,0 +479,480,12416,0.047966724418057,3.5786111111111003,34,5355,0 +480,481,12450,0.05609961782665,3.4222222222222,43,5809,0 +481,482,12460,0.096990479781121,3.2538888888889,68,3823,0 +482,483,12425,0.11147038220964,3.1683333333333,60,3116,0 +483,484,12430,0.044797927381498,3.0677777777778,74,2321,0 +484,485,12418,0.024403519177111,2.94,68,2193,0 +485,486,12437,0.08532776818426499,2.7291666666667003,43,2982,0 +486,487,12484,0.043615168647623,2.4147222222222005,73,4140,0 +487,488,12380,0.056692005942856,2.1419444444444,72,2353,0 +488,489,12620,0.033708553131457,2.0244444444444,66,3350,0 +489,490,12674,0.040148453968243986,2.0458333333333,90,3184,0 +490,491,12855,0.099551526697496,2.09,104,3469,0 +491,492,13053,0.0,2.1575,114,4204,0 +492,493,12898,0.036157867549894,2.2655555555556,98,6447,0 +493,494,12809,0.052738784696875,2.2561111111111,70,4898,0 +494,495,12964,0.021636091422947,2.4669444444444,101,3633,0 +495,496,12956,0.037120220639643986,2.5277777777778,77,4189,0 +496,497,13625,0.034467327401996005,2.5266666666667,69,4012,0 +497,498,13285,0.0,2.5438888888889,19,4009,0 +498,499,12715,0.096807019710259,2.6511111111111,47,4346,0 +499,500,12637,0.059601475230884,2.9711111111111004,38,2781,0 +500,501,12535,0.068431521141608,3.2288888888889,22,2811,0 +501,502,12512,0.09611085542804,3.505,20,2415,0 +502,503,12549,0.064177980162036,3.4944444444444,26,3589,0 +503,504,12567,0.11565746993409,3.4633333333333,24,2878,0 +504,505,12362,0.073501732487291,3.3177777777778,27,3471,0 +505,506,12326,0.072746100819649,3.1963888888889,25,2697,0 +506,507,12450,0.07557888002360401,3.1069444444444,57,2583,0 +507,508,12404,0.036816888038697,3.0172222222222,58,3173,0 +508,509,12362,0.093969235453559,2.9247222222222,81,3341,0 +509,510,12431,0.034848294186597004,2.5336111111111,81,2305,0 +510,511,12351,0.084191269180943,2.2480555555556,69,2186,0 +511,512,12528,0.13109036514766,2.0383333333333,50,4439,0 +512,513,12559,0.061132356147447,1.8852777777778,55,3173,0 +513,514,12586,0.019478099970089,1.9225,57,2831,0 +514,515,12864,0.0,1.9719444444444,78,16385,0 +515,516,13026,0.0,2.0608333333333,57,83955,0 +516,517,12880,0.017965204407153,2.16,78,4574,0 +517,518,12743,0.019202263481759,2.3077777777778,95,4987,0 +518,519,12812,0.0,2.415,88,5110,0 +519,520,12878,0.052306327013631,2.4669444444444,108,4893,0 +520,521,13427,0.08536575533023,2.5125,87,3807,0 +521,522,13081,0.052461360256699015,2.6294444444444,87,3447,0 +522,523,12752,0.035302992848671,2.8183333333333,44,4329,0 +523,524,12594,0.028682734942579,3.0547222222222,39,5166,0 
+524,525,12507,0.024204462299365,3.33,27,3454,0 +525,526,12494,0.034360100307537,3.5738888888889,23,3578,0 +526,527,12487,0.018977302969238,3.6888888888889,11,2406,0 +527,528,12404,0.034308847257872,3.7111111111111,13,2073,0 +528,529,11147,0.07460088255490599,3.7180555555556,24,1925,0 +529,530,11147,0.055037935083209005,3.6041666666667,77,2357,0 +530,531,11128,0.039311673522385,3.4483333333333,54,1947,0 +531,532,11106,0.046619928266775,3.2413888888888995,45,1912,0 +532,533,11115,0.048227542028921,3.1355555555556,36,2107,0 +533,534,11044,0.020367863848114,2.8172222222222,59,2985,0 +534,535,11110,0.063069968046591,2.4275,81,2081,0 +535,536,11190,0.054470866056974016,2.2513888888889,50,2631,0 +536,537,11063,0.0,2.0691666666667,53,2130,0 +537,538,11078,0.059261864411046,2.0155555555556,44,2085,0 +538,539,11146,0.064174002348993,2.0952777777778,87,2211,0 +539,540,11010,0.0,2.2397222222222,94,2105,0 +540,541,11139,0.021912411214588,2.3275,128,2585,0 +541,542,11117,0.057958262002105985,2.5255555555556004,82,3695,0 +542,543,11081,0.035358633773416,2.665,49,3198,0 +543,544,11128,0.029191244440103,2.7975,79,3191,0 +544,545,11720,0.054981313823219,2.8597222222222,62,2016,0 +545,546,11384,0.06405347705857799,2.7983333333333,64,2124,0 +546,547,11018,0.0,2.9322222222222,34,2105,0 +547,548,11104,0.055445634363329,3.08,41,2031,0 +548,549,11084,0.040996998867197,3.3466666666667004,47,1964,0 +549,550,11106,0.027670189755404,3.6869444444444,31,2016,0 +550,551,11055,0.054579839310753,3.7966666666667,26,3909,0 +551,552,11098,0.044833640073299014,3.7805555555556,17,2105,0 +552,553,11028,0.03282297151413,3.7422222222222,30,2405,0 +553,554,11152,0.017696014614986,3.639166666666701,17,2141,0 +554,555,11025,0.09418709999244,3.4775,28,1910,0 +555,556,11015,0.061817529149429,3.3283333333333,20,1951,0 +556,557,11125,0.054000161367618,3.1702777777778,85,2310,0 +557,558,11035,0.06165600249599,2.7688888888889,52,2047,0 +558,559,11103,0.055915839259234,2.4266666666667,143,2048,0 +559,560,11100,0.062788330996733,2.1963888888889,106,3083,0 +560,561,11170,0.044888048273534,2.135,244,3619,0 +561,562,11078,0.095259484956337,2.3186111111111,2005,2172,0 +562,563,11150,0.021952502374124,2.3383333333333,124,3142,0 +563,564,11149,0.0,2.5002777777778,109,2256,0 +564,565,10984,0.0,2.6527777777778,148,2200,0 +565,566,11034,0.0,2.7661111111111003,126,2183,0 +566,567,11050,0.061557079663167,2.7347222222222,46,2030,0 +567,568,11102,0.14186075040414,2.6069444444444,49,2297,0 +568,569,11743,0.0,2.5547222222222,40,2213,0 +569,570,11371,0.077457673524504,2.4716666666667004,39,4014,0 +570,571,11078,0.16422977329792998,2.6530555555556004,25,2809,0 +571,572,11224,0.049366067455729,2.9488888888889,37,2355,0 +572,573,11146,0.10064381631633,3.3383333333333,32,2372,0 +573,574,11199,0.11909159312806,3.5419444444444,47,2387,0 +574,575,11181,0.09003816676619801,5.3302777777778,34,2359,0 +575,576,11022,0.055882659245704,3.7727777777778,40,2485,0 +576,577,11073,0.1836893913223,3.6333333333333,46,3728,0 +577,578,11120,0.08574268253550299,3.5430555555556,35,2820,0 +578,579,11008,0.12559700716583,3.6711111111111,61,2426,0 +579,580,11078,0.086129850619071,3.4572222222222,56,2307,0 +580,581,11121,0.041752618326160014,3.2,72,2233,0 +581,582,11041,0.094396473652892,2.7772222222222,110,2178,0 +582,583,11168,0.045323960075285004,2.415,135,2243,0 +583,584,11213,0.13808411333909,2.2530555555556004,133,2713,0 +584,585,11238,0.08029349854683501,2.0994444444444,148,3168,0 +585,586,11273,0.06507307495461,2.1780555555556003,86,3163,0 
+586,587,11479,0.084518021856329,2.2638888888889,132,3289,0 +587,588,11839,0.030507395540508,2.3575,73,4001,0 +588,589,11735,0.05892502921299701,2.4680555555556003,95,4684,0 +589,590,11574,0.0,2.6208333333333,74,4137,0 +590,591,11531,0.033075906123641,2.6863888888889,51,4787,0 +591,592,11420,0.16633704704670998,2.6172222222222,65,4278,0 +592,593,12301,0.10228536028167,2.6194444444444,95,3898,0 +593,594,11845,0.16949365549682996,2.6358333333333,72,3728,0 +594,595,11374,0.08260397756200501,2.8661111111111004,41,4047,0 +595,596,11370,0.024378363844868,3.0533333333333,38,3373,0 +596,597,11197,0.15686874147816002,3.4438888888889,32,2669,0 +597,598,11171,0.063929461148943,3.6552777777778,22,3289,0 +598,599,11197,0.12602019009982998,3.8519444444444,29,2556,0 +599,600,11114,0.035137191893634005,3.8069444444444,32,2557,0 +600,601,12564,0.14965728062748998,3.5961111111111004,40,3003,0 +601,602,12459,0.10046170077382,3.5344444444444,59,2441,0 +602,603,12508,0.13163105487926,3.3972222222222,52,2396,0 +603,604,12464,0.043899611017859004,3.3936111111111003,42,3426,0 +604,605,12438,0.19567092855859,3.1025,46,2379,0 +605,606,12449,0.19135011734275,2.8630555555556,97,3026,0 +606,607,12373,0.11171915024595,2.4255555555556003,72,2336,0 +607,608,12594,0.032053604746412,1.8619444444444,81,2850,0 +608,609,12623,0.096448361580655,1.8930555555556,81,3016,0 +609,610,12759,0.07934996156433399,2.2080555555556,70,3537,0 +610,611,12841,0.024581173073578,2.3052777777778,89,3899,0 +611,612,13063,0.025596039426134,2.3777777777777995,87,5044,0 +612,613,13023,0.027922074309281,2.5161111111111,125,4806,0 +613,614,12884,0.02593545023878,2.6411111111111,69,4139,0 +614,615,13007,0.033086949155743,2.8011111111111004,57,4776,0 +615,616,13016,0.047260069860172005,2.7236111111111003,99,4065,0 +616,617,13588,0.038487130166032016,2.6813888888889,111,4969,0 +617,618,13272,0.16080169828563,2.7336111111111,71,3784,0 +618,619,12589,0.12635270044885,2.8863888888889,71,3297,0 +619,620,12651,0.046904491868436,3.1225,48,3347,0 +620,621,12616,0.059534673085297,3.4613888888889,76,3170,0 +621,622,12492,0.12198352023568,3.8297222222222,56,2241,0 +622,623,12497,0.052131597947042,3.8936111111111,35,2301,0 +623,624,12623,0.094084438832673,3.7588888888889,35,2303,0 +624,625,12481,0.13486764750848,3.5827777777778,29,2587,0 +625,626,12434,0.062226183256115,3.4730555555556,38,3211,0 +626,627,12495,0.091202035463034,3.4175,69,2604,0 +627,628,12375,0.096137859324631,3.3533333333333,77,2841,0 +628,629,12357,0.10449109200785,3.1963888888889,20,2168,0 +629,630,12433,0.097127966420289,2.8852777777778,24,2265,0 +630,631,12432,0.064404980330111,2.4880555555556003,83,2908,0 +631,632,12429,0.10188181868693,2.2325,62,3180,0 +632,633,12551,0.19953464365013,2.1044444444444,54,3118,0 +633,634,12799,0.0747839457206,2.1097222222222,54,3296,0 +634,635,12818,0.0,2.235,60,4432,0 +635,636,13071,0.0,2.3516666666667003,63,4336,0 +636,637,12897,0.0,2.5138888888889,95,4534,0 +637,638,12961,0.041436571087464,2.6105555555556004,69,4261,0 +638,639,12925,0.038671790863765,2.7233333333333,68,5248,0 +639,640,12968,0.035810634316102014,2.6633333333333,58,5014,0 +640,641,13525,0.1409929213297,2.5580555555556,107,3864,0 +641,642,12993,0.0,2.6627777777778,48,5682,0 +642,643,12369,0.052915080344848,2.7625,64,4404,0 +643,644,12195,0.11966022897483,3.0283333333333,52,3705,0 +644,645,12464,0.12973870706052,3.3727777777778,61,2738,0 +645,646,12470,0.023838633821411,3.6369444444444,47,2887,0 +646,647,12475,0.12358680271021,3.7088888888889,58,3776,0 
+647,648,12482,0.089095336472172,3.5847222222222,51,3532,0 +648,649,12221,0.019762530636927,3.4836111111111,61,3724,0 +649,650,12325,0.020994992941051,3.4077777777778,53,2786,0 +650,651,12258,0.10380294658324002,3.4441666666667,55,2941,0 +651,652,11980,0.079228021087742,3.1683333333333,52,2351,0 +652,653,11947,0.039012779943635,3.0527777777778,89,2316,0 +653,654,12291,0.10658713601061,2.8527777777778,85,2350,0 +654,655,12293,0.14426278476756,2.5433333333333,106,2916,0 +655,656,12341,0.08706206992122,2.1997222222222,88,2437,0 +656,657,12390,0.16325946030154,2.1036111111111,59,2761,0 +657,658,12611,0.0,2.2133333333333,48,3941,0 +658,659,12737,0.0,2.2086111111111,66,4025,0 +659,660,12882,0.07729609083366701,2.2883333333333,95,4466,0 +660,661,12891,0.058100747891124,2.3222222222222,82,4401,0 +661,662,12756,0.061191523312340984,2.47,76,4747,0 +662,663,12875,0.08592375974441901,2.685,104,4051,0 +663,664,12847,0.033467197342519,2.6763888888889,54,4448,0 +664,665,13518,0.030265788895452006,2.5838888888889,43,3736,0 +665,666,13217,0.11950310860409,2.6130555555556003,39,3918,0 +666,667,12621,0.09169148327055697,2.7633333333333,48,3408,0 +667,668,12591,0.18439354827551,3.0708333333333,38,2883,0 +668,669,12332,0.10741924067542,3.4347222222222,45,3631,0 +669,670,12404,0.15862461647089002,3.7030555555556,64,2609,0 +670,671,12457,0.14957813136313,3.8138888888889,35,2533,0 +671,672,12370,0.24059408570531,3.8508333333333,66,2469,0 +672,673,11509,0.15511115210127,3.8961111111111,61,2458,0 +673,674,11433,0.19582462633148,3.4763888888889,58,2458,0 +674,675,11317,0.13981560037535998,3.4041666666667,51,2043,0 +675,676,11364,0.1392329990551,3.2352777777778,55,1985,0 +676,677,11350,0.13079770999921,3.1508333333333,126,2032,0 +677,678,11348,0.053672881218709015,2.7863888888888995,61,3409,0 +678,679,11365,0.10971373742228,2.4861111111111,94,2018,0 +679,680,11505,0.13825204927093,2.2444444444444,83,2461,0 +680,681,11468,0.13912778922607,2.1286111111111,136,2318,0 +681,682,11562,0.10215803640865,2.1261111111111,104,2787,0 +682,683,11858,0.096617489053804,2.2405555555556003,77,3186,0 +683,684,11933,0.0,2.2991666666667,109,3490,0 +684,685,11813,0.0,2.3627777777778,146,3407,0 +685,686,11735,0.0,2.5863888888889,69,3193,0 +686,687,11848,0.0,2.7286111111111,121,3412,0 +687,688,11843,0.0,2.8355555555556,53,3563,0 +688,689,12318,0.068897518746959,2.7875,61,3247,0 +689,690,11846,0.05418569809170299,2.7825,82,3012,0 +690,691,11066,0.06507307495461,2.7972222222222,37,2382,0 +691,692,10920,0.10547682048851,3.0355555555556,19,2012,0 +692,693,10836,0.056437861708265,3.2486111111111,19,1915,0 +693,694,10879,0.098703711593837,3.6077777777778,19,1982,0 +694,695,10796,0.14331889652193,3.76,54,1950,0 +695,696,10785,0.05704449488642,3.806666666666701,44,4176,0 +696,697,9469,0.0,3.6638888888889,46,3654,0 +697,698,9278,0.032146952736052,3.5161111111111003,53,3063,0 +698,699,9417,0.068135614649249,3.3286111111111003,83,1916,0 +699,700,9253,0.034514299845882,3.2166666666667,92,1848,0 +700,701,9435,0.028306668795131006,2.9783333333333,94,1704,0 +701,702,9356,0.13119921991025002,2.7211111111111004,111,1680,0 +702,703,9354,0.093609772007723,2.4102777777778,84,2011,0 +703,704,9405,0.11179018663123,2.1366666666667,52,1772,0 +704,705,9326,0.065272680657868,1.9947222222222,68,1838,0 +705,706,9549,0.15901886092526998,1.9936111111111,35,1924,0 +706,707,9499,0.0,2.0788888888889,40,2038,0 +707,708,9371,0.26537507315217,2.1736111111111,47,1991,0 +708,709,9462,0.0,2.4027777777778,85,1729,0 +709,710,9509,0.056610336908172985,2.4580555555556,59,1673,0 
+710,711,9469,0.026644044055307004,2.6102777777777995,61,1656,0 +711,712,9522,0.040819652463459,2.7597222222222,45,1774,0 +712,713,9885,0.13497701521251,2.8122222222222,47,1784,0 +713,714,9802,0.16853433621426,2.8427777777778,72,1818,0 +714,715,9461,0.08655557751574,2.87,69,1981,0 +715,716,9393,0.05741127788681901,2.9769444444444,17,2004,0 +716,717,9638,0.037244401880164,3.3241666666667005,47,1788,0 +717,718,9435,0.1132743034971,3.6375,37,1786,0 +718,719,9519,0.15690958465910998,3.8652777777778,57,1781,0 +719,720,9492,0.09604225449090803,3.8091666666667,62,2024,0 +720,721,9458,0.06746445682560599,3.6844444444444,72,1669,0 +721,722,9420,0.058373145210404015,3.5913888888889,43,1729,0 +722,723,9429,0.048008603166117006,3.5255555555556,57,1682,0 +723,724,9461,0.12614216994504,3.3277777777778,47,1714,0 +724,725,9404,0.077186121310215,3.07,61,1679,0 +725,726,9366,0.042879382350005,2.7622222222222,53,1739,0 +726,727,9488,0.031014262794497007,2.3872222222222,78,1669,0 +727,728,9515,0.13957171072647,2.1308333333333,100,1806,0 +728,729,9487,0.027108383258306,2.1563888888889,104,1650,0 +729,730,9497,0.0,2.2547222222222003,56,1751,0 +730,731,9516,0.0,2.3397222222222003,89,1685,0 +731,732,9504,0.0,2.4808333333333,108,1645,0 +732,733,9422,0.025265991419408,2.6208333333333,67,2133,0 +733,734,9543,0.0,2.8138888888889,83,1618,0 +734,735,9395,0.047219926720593,2.9275,90,1623,0 +735,736,9352,0.083109434319356,2.8663888888888995,82,1697,0 +736,737,9884,0.10860709298782,2.7794444444444,76,1684,0 +737,738,9820,0.098319718095083,2.8194444444444,34,1779,0 +738,739,9439,0.02201293380153,2.9458333333333,43,2982,0 +739,740,9560,0.064929719079082,3.2413888888888995,40,1848,0 +740,741,9589,0.036960535765785,3.7166666666667,40,1772,0 +741,742,9575,0.068536856116777,4.1333333333333,57,1841,0 +742,743,9541,0.012398281267649,4.2697222222222,60,1834,0 +743,744,9490,0.035305311833591015,4.2797222222222,53,1860,0 +744,745,7160,0.024153733176505,4.0,44,1647,0 +745,746,7233,0.031750779212929,3.8877777777778,48,2129,0 +746,747,7166,0.092612685693125,3.6633333333333,50,1763,0 +747,748,7245,0.12674340154738,3.6127777777778,65,1433,0 +748,749,7299,0.068594711667718,3.3175,93,1428,0 +749,750,7169,0.13866540834682,2.8930555555556,105,1521,0 +750,751,7228,0.046813024390007014,2.4722222222222,94,1622,0 +751,752,7123,0.072990045810784,2.2294444444444,53,1580,0 +752,753,7199,0.17156759541908995,2.1286111111111,59,1468,0 +753,754,7167,0.051876699734571985,2.2219444444444,63,1520,0 +754,755,7212,0.031958698733103,2.3366666666667,61,1529,0 +755,756,7206,0.07333373485157901,2.4155555555556,72,1611,0 +756,757,7149,0.0,2.5408333333333,93,1511,0 +757,758,7284,0.023187512335638,2.6511111111111,62,1906,0 +758,759,7265,0.031672522871666,2.8405555555556,50,2632,0 +759,760,7221,0.091103855362214,2.8336111111111,42,1483,0 +760,761,7588,0.0,2.6575,62,1611,0 +761,762,7423,0.0983398607742,2.6622222222222005,21,1676,0 +762,763,7198,0.08011943311413,2.7719444444444,28,1670,0 +763,764,7279,0.043646436319699,3.0344444444444,65,1631,0 +764,765,7174,0.091445521226266,3.3741666666667003,37,1799,0 +765,766,7259,0.067771120773973,3.6925,20,1511,0 +766,767,7166,0.049768578185777006,3.8136111111111,47,1605,0 +767,768,7171,0.067455979006223,3.8202777777778,45,1758,0 +768,769,6883,0.14102875351082,3.7547222222222,49,1509,0 +769,770,6859,0.04521932948417,3.6077777777778,46,1591,0 +770,771,6817,0.032382889221133,3.5330555555556,30,1543,0 +771,772,6877,0.075100266089453,3.3544444444444,30,1573,0 +772,773,6785,0.038989846359505,3.1155555555556,48,1473,0 
+773,774,6665,0.093396608626074,2.8463888888888995,36,1476,0 +774,775,6805,0.06797619687558401,2.4411111111111,46,1712,0 +775,776,6863,0.08326287339845401,2.1455555555556,27,1801,0 +776,777,6926,0.015112630017379,2.0025,79,1902,0 +777,778,7004,0.031549757127405,2.1247222222222,65,2005,0 +778,779,6950,0.0,2.2741666666667,57,2363,0 +779,780,7262,0.0,2.3272222222222005,61,2513,0 +780,781,7361,0.017214486216241002,2.4363888888889,89,2664,0 +781,782,7288,0.015541991667356,2.6155555555556003,80,2714,0 +782,783,7463,0.0,2.7272222222222,79,2754,0 +783,784,7188,0.027199843934104,2.6552777777778,113,2670,0 +784,785,7658,0.053744802378685,2.6086111111111,71,2584,0 +785,786,7575,0.05675511278546901,2.6025,53,2466,0 +786,787,6954,0.070873939193717,2.7372222222222,64,2137,0 +787,788,6862,0.19022950977106,3.0125,43,1931,0 +788,789,6896,0.17589540947937002,3.3477777777778,34,1743,0 +789,790,6954,0.022875979046571,3.6236111111111,29,1713,0 +790,791,6869,0.0,3.7383333333333,30,1649,0 +791,792,6890,0.13681403156951,3.7772222222222,24,1633,0 +792,793,9742,0.058507485759525,3.6966666666667,40,1993,0 +793,794,9730,0.10227075584148,3.7733333333333,32,1940,0 +794,795,9810,0.06726096113022301,3.6408333333333,39,1951,0 +795,796,9688,0.15267199916685995,3.3922222222222,67,1894,0 +796,797,9849,0.069818221889972,3.1627777777778,65,1801,0 +797,798,9765,0.030305771594539,2.6875,49,1962,0 +798,799,9812,0.09211700324247198,2.3533333333333,41,2123,0 +799,800,9931,0.12298177354813,2.0425,50,2434,0 +800,801,9908,0.08705722689013601,1.9738888888889,48,2402,0 +801,802,10066,0.07529920073678098,2.0425,59,3013,0 +802,803,10184,0.06217694957317299,2.1563888888889,51,3086,0 +803,804,10295,0.020886039183631,2.2866666666667004,43,3527,0 +804,805,10113,0.08148200392528,2.3919444444444,72,3716,0 +805,806,10218,0.027014133895137,2.5513888888889,52,3577,0 +806,807,10322,0.08271940630361399,2.6030555555556,68,3430,0 +807,808,10269,0.038537180887872,2.6647222222222005,74,3413,0 +808,809,10781,0.090543853269643,2.5930555555556003,46,3755,0 +809,810,10486,0.02593545023878,2.5513888888889,64,4806,0 +810,811,10124,0.090692829340129,2.76,38,3127,0 +811,812,9993,0.09154630234853098,3.0636111111111,40,3421,0 +812,813,9801,0.09562635368432304,3.4016666666667,50,2475,0 +813,814,9760,0.0,3.7277777777778,42,2440,0 +814,815,9858,0.0,3.7902777777778,37,2731,0 +815,816,9884,0.027267039980187,3.7355555555556,34,2493,0 +816,817,7781,0.024102810048699,3.535,37,1665,0 +817,818,7742,0.072297652068167,3.5819444444444,47,1771,0 +818,819,7682,0.12348623922845,3.3847222222222,67,2293,0 +819,820,7831,0.077453588867077,3.2547222222222,66,1959,0 +820,821,7641,0.05662557916213299,3.125,91,1498,0 +821,822,7641,0.15509029304093,2.7766666666667,132,1537,0 +822,823,7759,0.079595064406905,2.4725,149,1580,0 +823,824,7748,0.053225613553497,2.1927777777778,65,1901,0 +824,825,7776,0.05741127788681901,2.1283333333333,50,1916,0 +825,826,7938,0.077171346852694,2.2319444444444,70,2213,0 +826,827,8031,0.0,2.3061111111111,82,2205,0 +827,828,8117,0.07512642149906099,2.3363888888889,72,2486,0 +828,829,8099,0.0,2.3686111111111,98,2580,0 +829,830,8002,0.0,2.4986111111111,78,2530,0 +830,831,7944,0.026463035590685,2.6433333333333,86,2664,0 +831,832,7963,0.024228588329879,2.7563888888889,76,4368,0 +832,833,8602,0.055182797357095005,2.6652777777778,95,3103,0 +833,834,8269,0.09607690135523,2.6844444444444,63,2249,0 +834,835,7871,0.059431847203259,2.7902777777778,32,2070,0 +835,836,7709,0.018731901987648,3.1119444444444,30,2833,0 +836,837,7726,0.033970515582906,3.5491666666667,27,1734,0 
+837,838,7781,0.049963174087431,3.7102777777778,22,2151,0 +838,839,7762,0.073295374096872,3.7961111111111,19,2103,0 +839,840,7692,0.017715537831218996,3.7730555555556,32,1725,0 +840,841,6608,0.014656639469103996,3.5919444444444,45,1895,0 +841,842,6526,0.15513271231042,3.5580555555556,65,1959,0 +842,843,6531,0.06544162031760599,3.4588888888889,73,1637,0 +843,844,6483,0.12276447331552,3.2969444444444003,52,1658,0 +844,845,6602,0.054046416943085,3.2288888888889,93,1666,0 +845,846,6555,0.06827770027642299,2.7358333333333,68,2410,0 +846,847,6610,0.10171854295932,2.4636111111111,127,1787,0 +847,848,6690,0.093454285728882,2.1894444444444,105,2264,0 +848,849,6651,0.04318436192577,2.1227777777778,75,2007,0 +849,850,6759,0.10050707347524,2.1369444444444,77,2107,0 +850,851,6836,0.019571935182124,2.2230555555556,140,2355,0 +851,852,6894,0.0,2.3188888888889,132,2726,0 +852,853,6844,0.0,2.4166666666667003,100,2875,0 +853,854,6773,0.02713995635286,2.5777777777778,174,2780,0 +854,855,6802,0.092632629280125,2.7869444444444,82,3936,0 +855,856,6947,0.098676638207998,2.8586111111111,128,3116,0 +856,857,7248,0.0,3.0816666666667003,79,3770,0 +857,858,6885,0.11132365864914,2.8713888888889,71,2382,0 +858,859,6643,0.0947301899901,2.9386111111111,60,2152,0 +859,860,6560,0.061070711161473,2.9827777777778,60,1754,0 +860,861,6554,0.18477832073133,3.3197222222222,56,1783,0 +861,862,6600,0.055986690710270993,3.5961111111111004,78,1780,0 +862,863,6525,0.16264480046039995,3.7613888888889,60,1582,0 +863,864,6543,0.026215643469448,3.7305555555556,48,2271,0 +864,865,9018,0.0,3.5580555555556,48,2592,0 +865,866,9225,0.054655616583012,3.5136111111111004,42,2921,0 +866,867,9112,0.07076692500883701,3.3772222222222,64,1814,0 +867,868,9195,0.067217215228375,3.2402777777778,36,3219,0 +868,869,9206,0.046060828388587,3.0586111111111003,40,2567,0 +869,870,9224,0.08329795085471901,2.7908333333333,18,1899,0 +870,871,9408,0.08219020764935,2.3761111111111,35,1801,0 +871,872,9082,0.046792553198475,2.1347222222222,44,2005,0 +872,873,9168,0.06755714954154099,1.9991666666667,105,2572,0 +873,874,9258,0.099050882008287,1.9983333333333,71,3563,0 +874,875,9158,0.0,2.0908333333333,65,2777,0 +875,876,9140,0.10824637351267,2.2311111111111,74,3362,0 +876,877,9206,0.0,2.3219444444444,34,3590,0 +877,878,9186,0.0,2.4727777777778,49,2930,0 +878,879,9155,0.037750185176735,2.5952777777778,44,2481,0 +879,880,9174,0.030345867660395,2.7416666666667004,57,2571,0 +880,881,9758,0.057665227298857,2.7652777777778,102,3546,0 +881,882,9451,0.16774071722374,2.7980555555556,106,4984,0 +882,883,9153,0.10462164884166,2.7597222222222,58,1994,0 +883,884,9233,0.051974117163582,3.0116666666667005,57,3060,0 +884,885,9250,0.070438547008222,3.2916666666667003,62,2151,0 +885,886,9317,0.11437533048244,3.5547222222222,42,2158,0 +886,887,9130,0.028754095353637,3.7580555555556,35,2319,0 +887,888,9249,0.06874265819680701,3.7330555555556,28,1909,0 +888,889,8297,0.041552255552731,3.5886111111111005,27,1627,0 +889,890,8245,0.033571347720577,3.5255555555556,35,2459,0 +890,891,8298,0.014724878652831,3.3858333333333,50,3167,0 +891,892,8247,0.046095580964192,3.2677777777778,69,1839,0 +892,893,8387,0.031859774913781,3.1247222222222,64,3887,0 +893,894,8392,0.094121536253424,2.7213888888888995,69,2031,0 +894,895,8531,0.11471874999036,2.3972222222222004,58,1522,0 +895,896,8437,0.09375530196425097,2.0836111111111,58,1732,0 +896,897,8344,0.10898948864079,2.0644444444444,51,2169,0 +897,898,8274,0.031129909255124,2.2063888888889,46,1679,0 +898,899,8328,0.0,2.3044444444444,84,1941,0 
+899,900,8351,0.020155867044519,2.47,144,1638,0 +900,901,8380,0.016795241270985,2.5697222222222003,86,1725,0 +901,902,8332,0.0,2.7625,69,1903,0 +902,903,8366,0.0,2.9436111111111005,81,2074,0 +903,904,8357,0.01748186857624,2.7905555555556,175,1848,0 +904,905,8867,0.015638795432702,2.7527777777778,65,1761,0 +905,906,8659,0.037878946671491,2.6980555555556,48,1838,0 +906,907,8458,0.14870829462531002,2.9102777777778,33,1640,0 +907,908,8360,0.07322030784057597,3.2663888888889,35,1715,0 +908,909,8330,0.10504553292421,3.5372222222222,37,1717,0 +909,910,8298,0.10771048774666,3.86,31,1758,0 +910,911,8381,0.07484115005697,3.9216666666667,36,1975,0 +911,912,8393,0.10377526695926,3.8766666666667,30,1865,0 +912,913,3998,0.052336696506499,3.6463888888889,28,3575,0 +913,914,3733,0.039930389849144,3.6552777777778,24,1413,0 +914,915,3735,0.052659026600132,3.5880555555556,68,1414,0 +915,916,3709,0.071593754146172,3.3594444444444003,26,1170,0 +916,917,3755,0.072107773186609,3.1888888888889,78,1209,0 +917,918,3782,0.14407221323011,2.7575,90,1170,0 +918,919,3849,0.078873737285415,2.3936111111111,76,1328,0 +919,920,3801,0.090543853269643,2.1925,94,1258,0 +920,921,3787,0.0,2.16,70,1427,0 +921,922,3835,0.18229662394063,2.2719444444444,129,1480,0 +922,923,4035,0.10064381631633,2.3994444444444,120,1687,0 +923,924,4173,0.0,2.2836111111111,122,1942,0 +924,925,3995,0.0,2.5422222222222004,100,1967,0 +925,926,4016,0.0,2.6908333333333,102,2110,0 +926,927,4049,0.064661049677152,2.7702777777778,118,1956,0 +927,928,4014,0.10610212880951,2.7405555555556,86,1984,0 +928,929,4263,0.098345239553664,2.6908333333333,92,1893,0 +929,930,3941,0.055426072308289,2.7008333333333,44,1821,0 +930,931,4023,0.026036719363444,2.8322222222222,25,1641,0 +931,932,3917,0.058176601538018,3.0922222222222,54,1604,0 +932,933,3910,0.11644035456955,3.4363888888889,48,1265,0 +933,934,3934,0.067489738764642,3.7530555555556,56,1407,0 +934,935,3783,0.091155534540558,3.9127777777778,42,1342,0 +935,936,3834,0.052217414705359004,3.7608333333333,41,1216,0 +936,937,8698,0.028401045145692,3.6472222222222,32,2569,0 +937,938,8969,0.06030991242653401,3.5544444444444,48,2150,0 +938,939,8928,0.057683225704233,3.5036111111111,40,2317,0 +939,940,9020,0.049602244305935,3.2538888888889,26,2047,0 +940,941,8865,0.054771618715138,3.1886111111111,55,2065,0 +941,942,8830,0.014455899164978,2.7341666666667,52,1909,0 +942,943,8879,0.05563571922395901,2.3655555555556003,34,1910,0 +943,944,9120,0.077488949885965,2.1688888888889,61,2037,0 +944,945,9111,0.06776025909838901,2.0977777777778,34,3065,0 +945,946,9071,0.033919453583666,2.3077777777778,50,2452,0 +946,947,9205,0.030948232299768,2.3611111111111,47,3226,0 +947,948,9355,0.0,2.4986111111111,56,3271,0 +948,949,9372,0.0,2.5691666666667,76,3471,0 +949,950,9392,0.0,2.7463888888889,60,3922,0 +950,951,9416,0.0,2.8063888888888995,100,3296,0 +951,952,9394,0.0,2.8091666666667003,80,3171,0 +952,953,9810,0.10150033578287,2.715,74,3208,0 +953,954,9594,0.13650296233629,2.6869444444444,24,3602,0 +954,955,9006,0.048341331534980006,2.8180555555556,41,3208,0 +955,956,9140,0.055919636698743,3.0541666666667004,19,3455,0 +956,957,8925,0.052826773889684014,3.4711111111111004,24,2833,0 +957,958,9047,0.07932984590431501,3.7566666666667,18,3453,0 +958,959,9030,0.033310879512461,3.8633333333333,28,3155,0 +959,960,9088,0.048306771033288,3.7519444444444,5,2145,0 +960,961,8569,0.034002578802562,3.6480555555556,12,1999,0 +961,962,8616,0.047801640470854015,3.5061111111111005,35,2135,0 +962,963,8497,0.13378075099383,3.47,41,1813,0 
+963,964,8439,0.063853685461221,3.3086111111111003,30,2020,0 +964,965,8567,0.0,3.1194444444444,22,2127,0 +965,966,8694,0.073869151016554,2.8044444444444,56,1764,0 +966,967,8739,0.043582908466928014,2.4205555555556004,34,2249,0 +967,968,8761,0.0,2.1180555555556,73,3119,0 +968,969,8838,0.062006969698131,2.1266666666667,86,2031,0 +969,970,8908,0.14006961492891,2.1708333333333,68,2246,0 +970,971,9053,0.11198565566104,2.3247222222222,36,3214,0 +971,972,9346,0.0,2.4208333333333,66,4207,0 +972,973,8989,0.058427455554992985,2.5563888888889,74,4195,0 +973,974,8807,0.070887934206661,2.7086111111111,78,3179,0 +974,975,9020,0.031869233863638,2.8027777777778,66,2739,0 +975,976,9034,0.0,2.7711111111111,118,2394,0 +976,977,9558,0.055680379884383,2.74,81,3750,0 +977,978,9042,0.030919398857213,2.6869444444444,85,3000,0 +978,979,8804,0.040222150865381015,2.8113888888889,69,2646,0 +979,980,8885,0.08462727078727299,3.1258333333333,49,2375,0 +980,981,8721,0.15790637433488,3.4711111111111004,56,2442,0 +981,982,8676,0.099165571846447,3.7419444444444,64,2069,0 +982,983,9029,0.051043016646698,3.7258333333333,48,1899,0 +983,984,8670,0.023695834967821,3.5369444444444,65,2277,0 +984,985,8537,0.13363180896924,3.4911111111111004,53,1926,0 +985,986,8418,0.14375985835531,3.3769444444444,70,1949,0 +986,987,8481,0.13890523887057998,3.3327777777778,51,2222,0 +987,988,8535,0.096357518724471,3.1925,30,1797,0 +988,989,8535,0.098277544249084,3.135,97,1860,0 +989,990,8442,0.11251833989481,2.8338888888889,41,2870,0 +990,991,8448,0.074768662666532,2.4997222222222004,32,1899,0 +991,992,8527,0.038008655416852,2.2297222222222004,47,2336,0 +992,993,8541,0.016354174968753,2.1158333333333,34,2703,0 +993,994,8635,0.11898350916153,2.1966666666667,54,2773,0 +994,995,8867,0.0,2.2591666666667,69,2577,0 +995,996,9033,0.0,2.3002777777778,109,2816,0 +996,997,8875,0.0,2.3797222222222003,76,3133,0 +997,998,8708,0.0,2.625,47,3366,0 +998,999,8455,0.020636446066963,2.6661111111111,44,3062,0 +999,1000,8713,0.043044731483849,2.6694444444444,92,3003,0 +1000,1001,8934,0.12513578187909,2.6541666666667,67,3044,0 +1001,1002,8745,0.099581351017555,2.6483333333333,26,3230,0 +1002,1003,8674,0.085903047711976,2.7444444444444,42,2793,0 +1003,1004,8606,0.066698820830796,3.0788888888889,69,1945,0 +1004,1005,8508,0.034228320502586,3.4833333333333,32,2716,0 +1005,1006,8558,0.028479870560763,3.6063888888889,41,2103,0 +1006,1007,8529,0.16430377699282994,3.8069444444444,52,1795,0 +1007,1008,8520,0.020290722486788003,3.6475,56,2840,0 +1008,1009,6662,0.17253761895951006,3.5219444444444,47,2653,0 +1009,1010,6491,0.1150267570489,3.3708333333333,65,2819,0 +1010,1011,6498,0.14119445755296,3.3086111111111003,70,1706,0 +1011,1012,6500,0.079900598296651,3.2411111111111004,84,1801,0 +1012,1013,6471,0.11459361685243,3.0525,71,3271,0 +1013,1014,6354,0.11299850955195,2.7419444444444,110,2001,0 +1014,1015,6592,0.078187238738118,2.4305555555556,65,1678,0 +1015,1016,6552,0.15222680511595002,2.1852777777778,68,1703,0 +1016,1017,6492,0.05823703723779,2.0644444444444,74,2441,0 +1017,1018,6577,0.038270957919533,2.1961111111111,43,2304,0 +1018,1019,6777,0.045436612403901,2.2886111111111,55,3124,0 +1019,1020,6844,0.051111263534218,2.3219444444444,53,3605,0 +1020,1021,6769,0.0,2.4436111111111,64,2985,0 +1021,1022,6642,0.0,2.6463888888889,58,2934,0 +1022,1023,6782,0.057248496594127986,2.735,54,3044,0 +1023,1024,6715,0.0,2.7586111111111005,121,3463,0 +1024,1025,6915,0.084808608043399,2.7138888888889,103,3199,0 +1025,1026,6569,0.05823703723779,2.7119444444444,66,2684,0 
+1026,1027,6486,0.12640598881102005,2.8027777777778,73,3317,0 +1027,1028,6504,0.08602692657241201,2.9777777777778,71,2159,0 +1028,1029,6445,0.13712331887199,3.2961111111111,37,2043,0 +1029,1030,6427,0.12184008568979,3.4869444444444,46,2003,0 +1030,1031,6365,0.050317612906928,3.673611111111101,40,2260,0 +1031,1032,6277,0.07167380324199299,3.7469444444444,26,3522,0 +1032,1033,5231,0.051289858799957,3.6133333333333,42,1840,0 +1033,1034,5166,0.094021005766084,3.4752777777778,63,1820,0 +1034,1035,5303,0.020566298353792,3.3602777777778,68,1856,0 +1035,1036,5306,0.12275234276969,3.1605555555556,87,1715,0 +1036,1037,5298,0.1054190746845,3.0733333333333,60,1695,0 +1037,1038,5268,0.19050318144252,2.7130555555556,94,2254,0 +1038,1039,5251,0.10472332930133,2.2886111111111,121,1652,0 +1039,1040,5194,0.12644994481537,2.0783333333333,128,1602,0 +1040,1041,5230,0.08859454436104999,1.9188888888889,68,1792,0 +1041,1042,5244,0.0,1.9355555555556003,76,1954,0 +1042,1043,5102,0.09532581107230803,2.0569444444444,77,1808,0 +1043,1044,5244,0.15766772749983,2.1902777777778,158,1629,0 +1044,1045,5249,0.06429178708826701,2.3477777777778,112,2140,0 +1045,1046,5261,0.068395341911942,2.5502777777778,85,2390,0 +1046,1047,5339,0.025992957736547997,2.6597222222222,77,1707,0 +1047,1048,5241,0.0,2.7238888888888995,89,1901,0 +1048,1049,5491,0.021142167244918,2.7375,106,1820,0 +1049,1050,5374,0.072067861729848,2.7483333333333,47,2167,0 +1050,1051,5354,0.1275228688396,2.8525,34,2063,0 +1051,1052,5232,0.043846003986674,3.0038888888889,32,2184,0 +1052,1053,5217,0.10247450096434,3.2761111111111005,22,1981,0 +1053,1054,5258,0.07584150637714701,3.5761111111111004,16,1813,0 +1054,1055,5251,0.020496657705832,3.8172222222222,32,2033,0 +1055,1056,5223,0.13399493992192998,3.6691666666667,16,1629,0 +1056,1057,3952,0.091121163023619,3.5558333333333,20,1485,0 +1057,1058,3949,0.11809705541338,3.4266666666667,56,1527,0 +1058,1059,4021,0.033014047837867995,3.435,74,2561,0 +1059,1060,3815,0.16367597832104,3.2111111111111,116,1523,0 +1060,1061,3855,0.12469537397569,3.1297222222222,72,1446,0 +1061,1062,3892,0.095002031789468,2.7538888888889,66,1499,0 +1062,1063,3948,0.1028064299952,2.3116666666667003,56,1368,0 +1063,1064,3860,0.028861851985229007,2.0988888888889,61,1426,0 +1064,1065,3830,0.05806984314166,2.0983333333333,2151,3528,0 +1065,1066,3821,0.050886592113012,2.1986111111111,459,2279,0 +1066,1067,3886,0.05081829754409599,2.3677777777778,84,1421,0 +1067,1068,3954,0.0,2.5036111111111,55,2008,0 +1068,1069,3839,0.08354288831032201,2.5786111111111,61,1429,0 +1069,1070,3921,0.0,2.8172222222222,19,1497,0 +1070,1071,3874,0.08142390858425297,2.8727777777778,30,1604,0 +1071,1072,3996,0.047911560407608,2.8294444444444,73,1595,0 +1072,1073,4246,0.12201534565884,2.7136111111111005,63,2217,0 +1073,1074,3803,0.088739417881303,2.7058333333333,35,1580,0 +1074,1075,3594,0.08276214539547999,2.8161111111111,57,1466,0 +1075,1076,3778,0.066779641097052,3.1541666666667,50,1717,0 +1076,1077,3745,0.11367082443275,3.5791666666667004,48,1564,0 +1077,1078,3747,0.021597223158314,3.8158333333333,40,1752,0 +1078,1079,3726,0.16874893592242002,3.9405555555556,36,1598,0 +1079,1080,3729,0.041971530556774,3.7294444444444,59,1842,0 +1080,1081,8513,0.042983941794881,3.6183333333333,14,3066,0 +1081,1082,8738,0.14500733624043,3.4911111111111004,16,2272,0 +1082,1083,8709,0.046727090031129015,3.4566666666667003,36,4344,0 +1083,1084,8601,0.032553617944112004,3.37,65,3242,0 +1084,1085,8719,0.040039251102491,3.1658333333333,80,2291,0 
+1085,1086,8820,0.055153759101126985,2.7261111111111003,91,2240,0 +1086,1087,8674,0.05751181017711901,2.3533333333333,102,2012,0 +1087,1088,8859,0.041202889821452,2.1158333333333,85,2305,0 +1088,1089,8905,0.07854024449462599,2.0852777777778,69,2295,0 +1089,1090,8920,0.11628975245152,2.1422222222222,79,2370,0 +1090,1091,9062,0.087543035971238,2.3172222222222003,66,3066,0 +1091,1092,9139,0.0,2.3983333333333,47,3132,0 +1092,1093,8866,0.031151045483539,2.55,51,3006,0 +1093,1094,8997,0.0,2.7413888888888995,20,3101,0 +1094,1095,9122,0.029949950026121008,2.7636111111111004,62,3739,0 +1095,1096,9191,0.067297142748812,2.7002777777778,54,3933,0 +1096,1097,9795,0.08450527625030299,2.7247222222222,99,4537,0 +1097,1098,9255,0.049852109269358014,2.5866666666667,64,3856,0 +1098,1099,8924,0.094084438832673,2.8597222222222,66,2862,0 +1099,1100,9012,0.044896125591910994,3.1269444444444,49,2449,0 +1100,1101,9023,0.07328004196455701,3.5019444444444,73,2222,0 +1101,1102,8875,0.13104465124262998,3.778611111111101,47,2159,0 +1102,1103,8800,0.10394116672902,3.8727777777778,48,2486,0 +1103,1104,8785,0.033616505813902,3.704166666666701,35,3148,0 +1104,1105,8474,0.02672150953308,3.5533333333333,27,3207,0 +1105,1106,8412,0.082058799915824,3.4461111111111005,19,2057,0 +1106,1107,8491,0.05732182787355501,3.4341666666667003,37,2029,0 +1107,1108,8391,0.067005870534182,3.3141666666667,45,3127,0 +1108,1109,8216,0.13429243256821,3.0438888888889,45,2597,0 +1109,1110,8292,0.015094533525413,2.6791666666667004,32,2350,0 +1110,1111,8406,0.063949370932991,2.3202777777778,99,2364,0 +1111,1112,8509,0.094378811742462,2.0691666666667,71,2095,0 +1112,1113,8486,0.02139340711812,2.0091666666667,93,2978,0 +1113,1114,8616,0.0,2.1886111111111,78,2743,0 +1114,1115,8642,0.0,2.3088888888889,71,2668,0 +1115,1116,8823,0.0,2.3794444444444,91,3054,0 +1116,1117,8774,0.0,2.5994444444444,31,3733,0 +1117,1118,8810,0.0,2.7119444444444,35,4312,0 +1118,1119,8611,0.0,2.76,25,4112,0 +1119,1120,8798,0.10029435223064,2.6975,45,3541,0 +1120,1121,9179,0.0,2.5466666666667,33,3901,0 +1121,1122,9057,0.10365337249761998,2.6036111111111,34,4371,0 +1122,1123,8633,0.12418226954696003,2.7927777777778,40,4099,0 +1123,1124,8517,0.0,2.9788888888889,17,3039,0 +1124,1125,8427,0.051166116772473,3.4080555555556,17,3197,0 +1125,1126,8615,0.040222150865381015,3.6813888888889,16,2346,0 +1126,1127,8690,0.17057206553854998,3.7983333333333,26,2285,0 +1127,1128,8438,0.12861588337799,3.6338888888889,19,2313,0 +1128,1129,10388,0.0,3.5111111111111004,30,3216,0 +1129,1130,10588,0.0,3.3613888888889,94,3860,0 +1130,1131,10533,0.14569364884757002,3.3072222222222,73,4781,0 +1131,1132,10397,0.18198813530019,3.2447222222222,59,2957,0 +1132,1133,10347,0.038073868368755,3.1152777777778,53,2171,0 +1133,1134,10405,0.11491272575332,2.6994444444444,56,2856,0 +1134,1135,10411,0.064841538076484,2.3497222222222005,70,2714,0 +1135,1136,10503,0.048708312546253,2.0619444444444,60,2602,0 +1136,1137,10598,0.11629780056153,2.0625,83,2331,0 +1137,1138,10692,0.07659916149791901,2.1905555555556004,265,3586,0 +1138,1139,10874,0.0,2.2588888888889,944,3363,0 +1139,1140,11043,0.043763623117499,2.3983333333333,36,3879,0 +1140,1141,11009,0.0,2.5536111111111,42,3556,0 +1141,1142,10818,0.041436571087464,2.7408333333333,23,4381,0 +1142,1143,10985,0.0,2.7375,75,4777,0 +1143,1144,10861,0.08191467409622599,2.7780555555556,68,4879,0 +1144,1145,12282,0.11084389924027,2.6225,23,3553,0 +1145,1146,11225,0.12510294083344,2.6386111111111,35,3177,0 +1146,1147,10775,0.10213470511717,2.7908333333333,38,2727,0 
+1147,1148,10688,0.06332743445339299,3.0922222222222,69,2758,0 +1148,1149,10601,0.033666593475508995,3.4291666666667004,57,4124,0 +1149,1150,10634,0.057459020289436,3.6752777777778,58,3076,0 +1150,1151,10646,0.023008391787587,3.736111111111101,43,2291,0 +1151,1152,10562,0.037622360322278,3.5905555555556,65,2482,0 +1152,1153,10608,0.026766196308354,3.3872222222222,60,2537,0 +1153,1154,10618,0.13691041072327,3.3186111111111005,55,2434,0 +1154,1155,10636,0.024581173073578,3.2775,49,2608,0 +1155,1156,10583,0.050723618686514,3.1625,54,2614,0 +1156,1157,10613,0.038807415292018,3.1391666666667004,66,2904,0 +1157,1158,10603,0.10731539561588,2.7616666666667005,59,2204,0 +1158,1159,10601,0.13649131550296,2.4675,107,2326,0 +1159,1160,10757,0.11190990870167998,2.2166666666667,104,3002,0 +1160,1161,10815,0.17879123074031,2.1205555555556,100,3472,0 +1161,1162,10790,0.08728058888363299,2.2044444444444,133,3496,0 +1162,1163,11082,0.0,2.3147222222222004,65,3168,0 +1163,1164,11121,0.07099894663641,2.2416666666667004,152,4268,0 +1164,1165,10913,0.098617038600063,2.405,83,4350,0 +1165,1166,11004,0.0,2.5705555555556003,158,3555,0 +1166,1167,11135,0.10519721128315,2.7088888888889,145,4986,0 +1167,1168,10960,0.10928571467639,2.6913888888889,77,4576,0 +1168,1169,11686,0.14969099592127,2.6427777777778,13,4451,0 +1169,1170,11244,0.060122448878635,2.705,67,3627,0 +1170,1171,10931,0.068254139999346,2.8738888888889,25,3485,0 +1171,1172,10811,0.056987671819742985,3.0819444444444,27,3046,0 +1172,1173,10679,0.094667935014769,3.4491666666667005,23,2657,0 +1173,1174,10648,0.13287358772218,3.6275,28,2423,0 +1174,1175,10757,0.032507012295146,3.8027777777778,25,2374,0 +1175,1176,10706,0.14779741522058998,3.6436111111111,28,2493,0 +1176,1177,9077,0.10864900088005,3.4861111111111005,30,2495,0 +1177,1178,8836,0.12602969813907,3.3266666666667004,31,2189,0 +1178,1179,8971,0.07253718299881,3.1866666666667003,31,2214,0 +1179,1180,8972,0.31381296416887,3.2213888888888995,44,2374,0 +1180,1181,8903,0.2312064012582,3.0102777777778,27,3230,0 +1181,1182,8967,0.17687421373190998,2.6658333333333,36,2132,0 +1182,1183,8962,0.022073721703464003,2.3902777777778,61,3042,0 +1183,1184,9044,0.11600086139073,2.1380555555556,64,2053,0 +1184,1185,8931,0.10418807549523,2.0161111111111,118,2349,0 +1185,1186,9028,0.040222150865381015,2.0641666666667,98,3381,0 +1186,1187,9240,0.06812462580532,2.1844444444444,76,3436,0 +1187,1188,9227,0.055328485037955,2.2822222222222,57,3280,0 +1188,1189,9227,0.027788383289499,2.4002777777777995,74,4357,0 +1189,1190,9125,0.0,2.5433333333333,72,4522,0 +1190,1191,9075,0.0,2.7469444444444,78,4094,0 +1191,1192,9117,0.035137191893634005,2.6872222222222,69,3296,0 +1192,1193,9562,0.035137191893634005,2.6980555555556,125,4129,0 +1193,1194,9305,0.11258759940039,2.7380555555556,157,3036,0 +1194,1195,8965,0.16105265701128,2.7858333333333,61,2628,0 +1195,1196,8862,0.15210502999287,3.0502777777778,12,2296,0 +1196,1197,8858,0.07673479360192201,3.2991666666667,16,2221,0 +1197,1198,8820,0.17013715283392,3.5533333333333,36,1991,0 +1198,1199,8876,0.1609412187274,3.6652777777778,27,2778,0 +1199,1200,8797,0.12008642730107,3.6116666666667,22,2511,0 +1200,1201,9074,0.045995324803682,3.5463888888889,22,2103,0 +1201,1202,9318,0.23802438276872,3.4013888888889,35,2111,0 +1202,1203,9286,0.18078076076243,3.245,67,2055,0 +1203,1204,9320,0.12741851179236,3.1644444444444,46,1930,0 +1204,1205,9280,0.08024661572906401,2.9361111111111,72,2456,0 +1205,1206,9333,0.32656213417732,2.6952777777778,96,2952,0 
+1206,1207,9334,0.28639695711596,2.3702777777778,117,2147,0 +1207,1208,9337,0.083900984173012,2.0947222222222,113,2051,0 +1208,1209,9405,0.12853338721539,1.9538888888889,140,2281,0 +1209,1210,9263,0.032414228925828,1.9925,107,2102,0 +1210,1211,9326,0.08237281480963901,2.0363888888889,102,2062,0 +1211,1212,9421,0.0,2.1919444444444,85,2796,0 +1212,1213,9275,0.0,2.3211111111111,49,2005,0 +1213,1214,9323,0.0,2.4955555555556,69,2075,0 +1214,1215,9347,0.45868581620054,2.6980555555556,68,2058,1 +1215,1216,9333,0.1959092708736,2.7219444444444,104,2733,0 +1216,1217,9846,0.7871265862012701,2.725,111,2170,1 +1217,1218,9497,0.18267963393082,2.7816666666667,88,2282,0 +1218,1219,9383,0.26777755992147,2.7811111111111004,64,2178,0 +1219,1220,9300,0.30404676514833,2.955,29,2283,0 +1220,1221,9389,0.28226806095289003,3.3158333333333,32,2097,0 +1221,1222,9364,0.32093016819692,3.5669444444444003,29,2738,0 +1222,1223,9227,0.24793583772273,3.7419444444444,21,2678,0 +1223,1224,9309,0.27376916868294,3.6236111111111,33,2404,0 +1224,1225,6204,0.32069151905173,3.4416666666667,37,1497,0 +1225,1226,6048,0.16728853165162,3.4172222222222,57,1496,0 +1226,1227,5949,0.17244047836378998,3.3016666666667,72,1935,0 +1227,1228,5981,0.21356200193615,3.1963888888889,86,1521,0 +1228,1229,5897,0.08833993625230199,3.0641666666667,70,2879,0 +1229,1230,6038,0.20141526375625,2.735,63,1561,0 +1230,1231,6094,0.12271171189386,2.3288888888889,49,1381,0 +1231,1232,6022,0.15111333507662,2.0938888888889,81,1826,0 +1232,1233,6122,0.3688420983862,2.1338888888889,58,1896,0 +1233,1234,6034,0.15672074166098002,2.2247222222222005,70,2083,0 +1234,1235,6079,0.099476236793782,2.3308333333333,67,1792,0 +1235,1236,5998,0.18394691317126,2.3902777777778,70,3258,0 +1236,1237,6004,0.076264605227629,2.5819444444444,95,2265,0 +1237,1238,5908,0.058100747891124,2.6661111111111,100,2775,0 +1238,1239,6022,0.18015967729618,2.8258333333333,116,1545,0 +1239,1240,5981,0.059431847203259,2.7502777777778,123,1818,0 +1240,1241,6399,0.14870829462531002,2.6730555555556004,71,1481,0 +1241,1242,6119,0.09565694822541,2.7536111111111,65,1677,0 +1242,1243,6114,0.16022629962173002,2.9677777777778,73,1858,0 +1243,1244,5915,0.4140256163498,3.37,53,1643,0 +1244,1245,6192,0.32447726333369004,3.5958333333333,79,1582,0 +1245,1246,6021,0.15394421357627,3.8144444444444,77,1611,0 +1246,1247,6060,0.060070368432038,3.8283333333333,59,1803,0 +1247,1248,7510,0.14236976564388,3.7030555555556,66,2121,0 +1248,1249,7560,0.12741851179236,3.5802777777778,54,2375,0 +1249,1250,7525,0.093634078744746,3.4197222222222,54,1866,0 +1250,1251,7483,0.13709947889982,3.4438888888889,89,2398,0 +1251,1252,7452,0.06298116794216299,3.3425,85,2577,0 +1252,1253,7512,0.13125017838571,3.1608333333333,96,1801,0 +1253,1254,7572,0.21161148728916,2.7413888888888995,149,1840,0 +1254,1255,7629,0.06783428261124,2.3808333333333,139,1985,0 +1255,1256,7529,0.20877561051189,2.12,90,2041,0 +1256,1257,7623,0.10394294206935002,2.1533333333333,68,2075,0 +1257,1258,7637,0.0,2.2569444444444,445,2564,0 +1258,1259,7921,0.076424293095548,2.3183333333333,100,2734,0 +1259,1260,7790,0.08809461878011901,2.3583333333333,138,3143,0 diff --git a/datasets/anomaly/yahoo_sub_5/TRAIN/problem_TRAIN/dataSplits.csv b/datasets/anomaly/yahoo_sub_5/TRAIN/problem_TRAIN/dataSplits.csv new file mode 100644 index 0000000..b017fe5 --- /dev/null +++ b/datasets/anomaly/yahoo_sub_5/TRAIN/problem_TRAIN/dataSplits.csv @@ -0,0 +1,1261 @@ +d3mIndex,type,repeat,fold +0,TRAIN,0,0 +1,TRAIN,0,0 +2,TRAIN,0,0 +3,TRAIN,0,0 +4,TRAIN,0,0 +5,TRAIN,0,0 +6,TRAIN,0,0 
+7,TRAIN,0,0 +8,TRAIN,0,0 +9,TRAIN,0,0 +10,TRAIN,0,0 +11,TRAIN,0,0 +12,TRAIN,0,0 +13,TRAIN,0,0 +14,TRAIN,0,0 +15,TRAIN,0,0 +16,TRAIN,0,0 +17,TRAIN,0,0 +18,TRAIN,0,0 +19,TRAIN,0,0 +20,TRAIN,0,0 +21,TRAIN,0,0 +22,TRAIN,0,0 +23,TRAIN,0,0 +24,TRAIN,0,0 +25,TRAIN,0,0 +26,TRAIN,0,0 +27,TRAIN,0,0 +28,TRAIN,0,0 +29,TRAIN,0,0 +30,TRAIN,0,0 +31,TRAIN,0,0 +32,TRAIN,0,0 +33,TRAIN,0,0 +34,TRAIN,0,0 +35,TRAIN,0,0 +36,TRAIN,0,0 +37,TRAIN,0,0 +38,TRAIN,0,0 +39,TRAIN,0,0 +40,TRAIN,0,0 +41,TRAIN,0,0 +42,TRAIN,0,0 +43,TRAIN,0,0 +44,TRAIN,0,0 +45,TRAIN,0,0 +46,TRAIN,0,0 +47,TRAIN,0,0 +48,TRAIN,0,0 +49,TRAIN,0,0 +50,TRAIN,0,0 +51,TRAIN,0,0 +52,TRAIN,0,0 +53,TRAIN,0,0 +54,TRAIN,0,0 +55,TRAIN,0,0 +56,TRAIN,0,0 +57,TRAIN,0,0 +58,TRAIN,0,0 +59,TRAIN,0,0 +60,TRAIN,0,0 +61,TRAIN,0,0 +62,TRAIN,0,0 +63,TRAIN,0,0 +64,TRAIN,0,0 +65,TRAIN,0,0 +66,TRAIN,0,0 +67,TRAIN,0,0 +68,TRAIN,0,0 +69,TRAIN,0,0 +70,TRAIN,0,0 +71,TRAIN,0,0 +72,TRAIN,0,0 +73,TRAIN,0,0 +74,TRAIN,0,0 +75,TRAIN,0,0 +76,TRAIN,0,0 +77,TRAIN,0,0 +78,TRAIN,0,0 +79,TRAIN,0,0 +80,TRAIN,0,0 +81,TRAIN,0,0 +82,TRAIN,0,0 +83,TRAIN,0,0 +84,TRAIN,0,0 +85,TRAIN,0,0 +86,TRAIN,0,0 +87,TRAIN,0,0 +88,TRAIN,0,0 +89,TRAIN,0,0 +90,TRAIN,0,0 +91,TRAIN,0,0 +92,TRAIN,0,0 +93,TRAIN,0,0 +94,TRAIN,0,0 +95,TRAIN,0,0 +96,TRAIN,0,0 +97,TRAIN,0,0 +98,TRAIN,0,0 +99,TRAIN,0,0 +100,TRAIN,0,0 +101,TRAIN,0,0 +102,TRAIN,0,0 +103,TRAIN,0,0 +104,TRAIN,0,0 +105,TRAIN,0,0 +106,TRAIN,0,0 +107,TRAIN,0,0 +108,TRAIN,0,0 +109,TRAIN,0,0 +110,TRAIN,0,0 +111,TRAIN,0,0 +112,TRAIN,0,0 +113,TRAIN,0,0 +114,TRAIN,0,0 +115,TRAIN,0,0 +116,TRAIN,0,0 +117,TRAIN,0,0 +118,TRAIN,0,0 +119,TRAIN,0,0 +120,TRAIN,0,0 +121,TRAIN,0,0 +122,TRAIN,0,0 +123,TRAIN,0,0 +124,TRAIN,0,0 +125,TRAIN,0,0 +126,TRAIN,0,0 +127,TRAIN,0,0 +128,TRAIN,0,0 +129,TRAIN,0,0 +130,TRAIN,0,0 +131,TRAIN,0,0 +132,TRAIN,0,0 +133,TRAIN,0,0 +134,TRAIN,0,0 +135,TRAIN,0,0 +136,TRAIN,0,0 +137,TRAIN,0,0 +138,TRAIN,0,0 +139,TRAIN,0,0 +140,TRAIN,0,0 +141,TRAIN,0,0 +142,TRAIN,0,0 +143,TRAIN,0,0 +144,TRAIN,0,0 +145,TRAIN,0,0 +146,TRAIN,0,0 +147,TRAIN,0,0 +148,TRAIN,0,0 +149,TRAIN,0,0 +150,TRAIN,0,0 +151,TRAIN,0,0 +152,TRAIN,0,0 +153,TRAIN,0,0 +154,TRAIN,0,0 +155,TRAIN,0,0 +156,TRAIN,0,0 +157,TRAIN,0,0 +158,TRAIN,0,0 +159,TRAIN,0,0 +160,TRAIN,0,0 +161,TRAIN,0,0 +162,TRAIN,0,0 +163,TRAIN,0,0 +164,TRAIN,0,0 +165,TRAIN,0,0 +166,TRAIN,0,0 +167,TRAIN,0,0 +168,TRAIN,0,0 +169,TRAIN,0,0 +170,TRAIN,0,0 +171,TRAIN,0,0 +172,TRAIN,0,0 +173,TRAIN,0,0 +174,TRAIN,0,0 +175,TRAIN,0,0 +176,TRAIN,0,0 +177,TRAIN,0,0 +178,TRAIN,0,0 +179,TRAIN,0,0 +180,TRAIN,0,0 +181,TRAIN,0,0 +182,TRAIN,0,0 +183,TRAIN,0,0 +184,TRAIN,0,0 +185,TRAIN,0,0 +186,TRAIN,0,0 +187,TRAIN,0,0 +188,TRAIN,0,0 +189,TRAIN,0,0 +190,TRAIN,0,0 +191,TRAIN,0,0 +192,TRAIN,0,0 +193,TRAIN,0,0 +194,TRAIN,0,0 +195,TRAIN,0,0 +196,TRAIN,0,0 +197,TRAIN,0,0 +198,TRAIN,0,0 +199,TRAIN,0,0 +200,TRAIN,0,0 +201,TRAIN,0,0 +202,TRAIN,0,0 +203,TRAIN,0,0 +204,TRAIN,0,0 +205,TRAIN,0,0 +206,TRAIN,0,0 +207,TRAIN,0,0 +208,TRAIN,0,0 +209,TRAIN,0,0 +210,TRAIN,0,0 +211,TRAIN,0,0 +212,TRAIN,0,0 +213,TRAIN,0,0 +214,TRAIN,0,0 +215,TRAIN,0,0 +216,TRAIN,0,0 +217,TRAIN,0,0 +218,TRAIN,0,0 +219,TRAIN,0,0 +220,TRAIN,0,0 +221,TRAIN,0,0 +222,TRAIN,0,0 +223,TRAIN,0,0 +224,TRAIN,0,0 +225,TRAIN,0,0 +226,TRAIN,0,0 +227,TRAIN,0,0 +228,TRAIN,0,0 +229,TRAIN,0,0 +230,TRAIN,0,0 +231,TRAIN,0,0 +232,TRAIN,0,0 +233,TRAIN,0,0 +234,TRAIN,0,0 +235,TRAIN,0,0 +236,TRAIN,0,0 +237,TRAIN,0,0 +238,TRAIN,0,0 +239,TRAIN,0,0 +240,TRAIN,0,0 +241,TRAIN,0,0 +242,TRAIN,0,0 +243,TRAIN,0,0 +244,TRAIN,0,0 +245,TRAIN,0,0 +246,TRAIN,0,0 +247,TRAIN,0,0 +248,TRAIN,0,0 +249,TRAIN,0,0 
+250,TRAIN,0,0 +251,TRAIN,0,0 +252,TRAIN,0,0 +253,TRAIN,0,0 +254,TRAIN,0,0 +255,TRAIN,0,0 +256,TRAIN,0,0 +257,TRAIN,0,0 +258,TRAIN,0,0 +259,TRAIN,0,0 +260,TRAIN,0,0 +261,TRAIN,0,0 +262,TRAIN,0,0 +263,TRAIN,0,0 +264,TRAIN,0,0 +265,TRAIN,0,0 +266,TRAIN,0,0 +267,TRAIN,0,0 +268,TRAIN,0,0 +269,TRAIN,0,0 +270,TRAIN,0,0 +271,TRAIN,0,0 +272,TRAIN,0,0 +273,TRAIN,0,0 +274,TRAIN,0,0 +275,TRAIN,0,0 +276,TRAIN,0,0 +277,TRAIN,0,0 +278,TRAIN,0,0 +279,TRAIN,0,0 +280,TRAIN,0,0 +281,TRAIN,0,0 +282,TRAIN,0,0 +283,TRAIN,0,0 +284,TRAIN,0,0 +285,TRAIN,0,0 +286,TRAIN,0,0 +287,TRAIN,0,0 +288,TRAIN,0,0 +289,TRAIN,0,0 +290,TRAIN,0,0 +291,TRAIN,0,0 +292,TRAIN,0,0 +293,TRAIN,0,0 +294,TRAIN,0,0 +295,TRAIN,0,0 +296,TRAIN,0,0 +297,TRAIN,0,0 +298,TRAIN,0,0 +299,TRAIN,0,0 +300,TRAIN,0,0 +301,TRAIN,0,0 +302,TRAIN,0,0 +303,TRAIN,0,0 +304,TRAIN,0,0 +305,TRAIN,0,0 +306,TRAIN,0,0 +307,TRAIN,0,0 +308,TRAIN,0,0 +309,TRAIN,0,0 +310,TRAIN,0,0 +311,TRAIN,0,0 +312,TRAIN,0,0 +313,TRAIN,0,0 +314,TRAIN,0,0 +315,TRAIN,0,0 +316,TRAIN,0,0 +317,TRAIN,0,0 +318,TRAIN,0,0 +319,TRAIN,0,0 +320,TRAIN,0,0 +321,TRAIN,0,0 +322,TRAIN,0,0 +323,TRAIN,0,0 +324,TRAIN,0,0 +325,TRAIN,0,0 +326,TRAIN,0,0 +327,TRAIN,0,0 +328,TRAIN,0,0 +329,TRAIN,0,0 +330,TRAIN,0,0 +331,TRAIN,0,0 +332,TRAIN,0,0 +333,TRAIN,0,0 +334,TRAIN,0,0 +335,TRAIN,0,0 +336,TRAIN,0,0 +337,TRAIN,0,0 +338,TRAIN,0,0 +339,TRAIN,0,0 +340,TRAIN,0,0 +341,TRAIN,0,0 +342,TRAIN,0,0 +343,TRAIN,0,0 +344,TRAIN,0,0 +345,TRAIN,0,0 +346,TRAIN,0,0 +347,TRAIN,0,0 +348,TRAIN,0,0 +349,TRAIN,0,0 +350,TRAIN,0,0 +351,TRAIN,0,0 +352,TRAIN,0,0 +353,TRAIN,0,0 +354,TRAIN,0,0 +355,TRAIN,0,0 +356,TRAIN,0,0 +357,TRAIN,0,0 +358,TRAIN,0,0 +359,TRAIN,0,0 +360,TRAIN,0,0 +361,TRAIN,0,0 +362,TRAIN,0,0 +363,TRAIN,0,0 +364,TRAIN,0,0 +365,TRAIN,0,0 +366,TRAIN,0,0 +367,TRAIN,0,0 +368,TRAIN,0,0 +369,TRAIN,0,0 +370,TRAIN,0,0 +371,TRAIN,0,0 +372,TRAIN,0,0 +373,TRAIN,0,0 +374,TRAIN,0,0 +375,TRAIN,0,0 +376,TRAIN,0,0 +377,TRAIN,0,0 +378,TRAIN,0,0 +379,TRAIN,0,0 +380,TRAIN,0,0 +381,TRAIN,0,0 +382,TRAIN,0,0 +383,TRAIN,0,0 +384,TRAIN,0,0 +385,TRAIN,0,0 +386,TRAIN,0,0 +387,TRAIN,0,0 +388,TRAIN,0,0 +389,TRAIN,0,0 +390,TRAIN,0,0 +391,TRAIN,0,0 +392,TRAIN,0,0 +393,TRAIN,0,0 +394,TRAIN,0,0 +395,TRAIN,0,0 +396,TRAIN,0,0 +397,TRAIN,0,0 +398,TRAIN,0,0 +399,TRAIN,0,0 +400,TRAIN,0,0 +401,TRAIN,0,0 +402,TRAIN,0,0 +403,TRAIN,0,0 +404,TRAIN,0,0 +405,TRAIN,0,0 +406,TRAIN,0,0 +407,TRAIN,0,0 +408,TRAIN,0,0 +409,TRAIN,0,0 +410,TRAIN,0,0 +411,TRAIN,0,0 +412,TRAIN,0,0 +413,TRAIN,0,0 +414,TRAIN,0,0 +415,TRAIN,0,0 +416,TRAIN,0,0 +417,TRAIN,0,0 +418,TRAIN,0,0 +419,TRAIN,0,0 +420,TRAIN,0,0 +421,TRAIN,0,0 +422,TRAIN,0,0 +423,TRAIN,0,0 +424,TRAIN,0,0 +425,TRAIN,0,0 +426,TRAIN,0,0 +427,TRAIN,0,0 +428,TRAIN,0,0 +429,TRAIN,0,0 +430,TRAIN,0,0 +431,TRAIN,0,0 +432,TRAIN,0,0 +433,TRAIN,0,0 +434,TRAIN,0,0 +435,TRAIN,0,0 +436,TRAIN,0,0 +437,TRAIN,0,0 +438,TRAIN,0,0 +439,TRAIN,0,0 +440,TRAIN,0,0 +441,TRAIN,0,0 +442,TRAIN,0,0 +443,TRAIN,0,0 +444,TRAIN,0,0 +445,TRAIN,0,0 +446,TRAIN,0,0 +447,TRAIN,0,0 +448,TRAIN,0,0 +449,TRAIN,0,0 +450,TRAIN,0,0 +451,TRAIN,0,0 +452,TRAIN,0,0 +453,TRAIN,0,0 +454,TRAIN,0,0 +455,TRAIN,0,0 +456,TRAIN,0,0 +457,TRAIN,0,0 +458,TRAIN,0,0 +459,TRAIN,0,0 +460,TRAIN,0,0 +461,TRAIN,0,0 +462,TRAIN,0,0 +463,TRAIN,0,0 +464,TRAIN,0,0 +465,TRAIN,0,0 +466,TRAIN,0,0 +467,TRAIN,0,0 +468,TRAIN,0,0 +469,TRAIN,0,0 +470,TRAIN,0,0 +471,TRAIN,0,0 +472,TRAIN,0,0 +473,TRAIN,0,0 +474,TRAIN,0,0 +475,TRAIN,0,0 +476,TRAIN,0,0 +477,TRAIN,0,0 +478,TRAIN,0,0 +479,TRAIN,0,0 +480,TRAIN,0,0 +481,TRAIN,0,0 +482,TRAIN,0,0 +483,TRAIN,0,0 +484,TRAIN,0,0 +485,TRAIN,0,0 +486,TRAIN,0,0 
+487,TRAIN,0,0 +488,TRAIN,0,0 +489,TRAIN,0,0 +490,TRAIN,0,0 +491,TRAIN,0,0 +492,TRAIN,0,0 +493,TRAIN,0,0 +494,TRAIN,0,0 +495,TRAIN,0,0 +496,TRAIN,0,0 +497,TRAIN,0,0 +498,TRAIN,0,0 +499,TRAIN,0,0 +500,TRAIN,0,0 +501,TRAIN,0,0 +502,TRAIN,0,0 +503,TRAIN,0,0 +504,TRAIN,0,0 +505,TRAIN,0,0 +506,TRAIN,0,0 +507,TRAIN,0,0 +508,TRAIN,0,0 +509,TRAIN,0,0 +510,TRAIN,0,0 +511,TRAIN,0,0 +512,TRAIN,0,0 +513,TRAIN,0,0 +514,TRAIN,0,0 +515,TRAIN,0,0 +516,TRAIN,0,0 +517,TRAIN,0,0 +518,TRAIN,0,0 +519,TRAIN,0,0 +520,TRAIN,0,0 +521,TRAIN,0,0 +522,TRAIN,0,0 +523,TRAIN,0,0 +524,TRAIN,0,0 +525,TRAIN,0,0 +526,TRAIN,0,0 +527,TRAIN,0,0 +528,TRAIN,0,0 +529,TRAIN,0,0 +530,TRAIN,0,0 +531,TRAIN,0,0 +532,TRAIN,0,0 +533,TRAIN,0,0 +534,TRAIN,0,0 +535,TRAIN,0,0 +536,TRAIN,0,0 +537,TRAIN,0,0 +538,TRAIN,0,0 +539,TRAIN,0,0 +540,TRAIN,0,0 +541,TRAIN,0,0 +542,TRAIN,0,0 +543,TRAIN,0,0 +544,TRAIN,0,0 +545,TRAIN,0,0 +546,TRAIN,0,0 +547,TRAIN,0,0 +548,TRAIN,0,0 +549,TRAIN,0,0 +550,TRAIN,0,0 +551,TRAIN,0,0 +552,TRAIN,0,0 +553,TRAIN,0,0 +554,TRAIN,0,0 +555,TRAIN,0,0 +556,TRAIN,0,0 +557,TRAIN,0,0 +558,TRAIN,0,0 +559,TRAIN,0,0 +560,TRAIN,0,0 +561,TRAIN,0,0 +562,TRAIN,0,0 +563,TRAIN,0,0 +564,TRAIN,0,0 +565,TRAIN,0,0 +566,TRAIN,0,0 +567,TRAIN,0,0 +568,TRAIN,0,0 +569,TRAIN,0,0 +570,TRAIN,0,0 +571,TRAIN,0,0 +572,TRAIN,0,0 +573,TRAIN,0,0 +574,TRAIN,0,0 +575,TRAIN,0,0 +576,TRAIN,0,0 +577,TRAIN,0,0 +578,TRAIN,0,0 +579,TRAIN,0,0 +580,TRAIN,0,0 +581,TRAIN,0,0 +582,TRAIN,0,0 +583,TRAIN,0,0 +584,TRAIN,0,0 +585,TRAIN,0,0 +586,TRAIN,0,0 +587,TRAIN,0,0 +588,TRAIN,0,0 +589,TRAIN,0,0 +590,TRAIN,0,0 +591,TRAIN,0,0 +592,TRAIN,0,0 +593,TRAIN,0,0 +594,TRAIN,0,0 +595,TRAIN,0,0 +596,TRAIN,0,0 +597,TRAIN,0,0 +598,TRAIN,0,0 +599,TRAIN,0,0 +600,TRAIN,0,0 +601,TRAIN,0,0 +602,TRAIN,0,0 +603,TRAIN,0,0 +604,TRAIN,0,0 +605,TRAIN,0,0 +606,TRAIN,0,0 +607,TRAIN,0,0 +608,TRAIN,0,0 +609,TRAIN,0,0 +610,TRAIN,0,0 +611,TRAIN,0,0 +612,TRAIN,0,0 +613,TRAIN,0,0 +614,TRAIN,0,0 +615,TRAIN,0,0 +616,TRAIN,0,0 +617,TRAIN,0,0 +618,TRAIN,0,0 +619,TRAIN,0,0 +620,TRAIN,0,0 +621,TRAIN,0,0 +622,TRAIN,0,0 +623,TRAIN,0,0 +624,TRAIN,0,0 +625,TRAIN,0,0 +626,TRAIN,0,0 +627,TRAIN,0,0 +628,TRAIN,0,0 +629,TRAIN,0,0 +630,TRAIN,0,0 +631,TRAIN,0,0 +632,TRAIN,0,0 +633,TRAIN,0,0 +634,TRAIN,0,0 +635,TRAIN,0,0 +636,TRAIN,0,0 +637,TRAIN,0,0 +638,TRAIN,0,0 +639,TRAIN,0,0 +640,TRAIN,0,0 +641,TRAIN,0,0 +642,TRAIN,0,0 +643,TRAIN,0,0 +644,TRAIN,0,0 +645,TRAIN,0,0 +646,TRAIN,0,0 +647,TRAIN,0,0 +648,TRAIN,0,0 +649,TRAIN,0,0 +650,TRAIN,0,0 +651,TRAIN,0,0 +652,TRAIN,0,0 +653,TRAIN,0,0 +654,TRAIN,0,0 +655,TRAIN,0,0 +656,TRAIN,0,0 +657,TRAIN,0,0 +658,TRAIN,0,0 +659,TRAIN,0,0 +660,TRAIN,0,0 +661,TRAIN,0,0 +662,TRAIN,0,0 +663,TRAIN,0,0 +664,TRAIN,0,0 +665,TRAIN,0,0 +666,TRAIN,0,0 +667,TRAIN,0,0 +668,TRAIN,0,0 +669,TRAIN,0,0 +670,TRAIN,0,0 +671,TRAIN,0,0 +672,TRAIN,0,0 +673,TRAIN,0,0 +674,TRAIN,0,0 +675,TRAIN,0,0 +676,TRAIN,0,0 +677,TRAIN,0,0 +678,TRAIN,0,0 +679,TRAIN,0,0 +680,TRAIN,0,0 +681,TRAIN,0,0 +682,TRAIN,0,0 +683,TRAIN,0,0 +684,TRAIN,0,0 +685,TRAIN,0,0 +686,TRAIN,0,0 +687,TRAIN,0,0 +688,TRAIN,0,0 +689,TRAIN,0,0 +690,TRAIN,0,0 +691,TRAIN,0,0 +692,TRAIN,0,0 +693,TRAIN,0,0 +694,TRAIN,0,0 +695,TRAIN,0,0 +696,TRAIN,0,0 +697,TRAIN,0,0 +698,TRAIN,0,0 +699,TRAIN,0,0 +700,TRAIN,0,0 +701,TRAIN,0,0 +702,TRAIN,0,0 +703,TRAIN,0,0 +704,TRAIN,0,0 +705,TRAIN,0,0 +706,TRAIN,0,0 +707,TRAIN,0,0 +708,TRAIN,0,0 +709,TRAIN,0,0 +710,TRAIN,0,0 +711,TRAIN,0,0 +712,TRAIN,0,0 +713,TRAIN,0,0 +714,TRAIN,0,0 +715,TRAIN,0,0 +716,TRAIN,0,0 +717,TRAIN,0,0 +718,TRAIN,0,0 +719,TRAIN,0,0 +720,TRAIN,0,0 +721,TRAIN,0,0 +722,TRAIN,0,0 +723,TRAIN,0,0 
+724,TRAIN,0,0 +725,TRAIN,0,0 +726,TRAIN,0,0 +727,TRAIN,0,0 +728,TRAIN,0,0 +729,TRAIN,0,0 +730,TRAIN,0,0 +731,TRAIN,0,0 +732,TRAIN,0,0 +733,TRAIN,0,0 +734,TRAIN,0,0 +735,TRAIN,0,0 +736,TRAIN,0,0 +737,TRAIN,0,0 +738,TRAIN,0,0 +739,TRAIN,0,0 +740,TRAIN,0,0 +741,TRAIN,0,0 +742,TRAIN,0,0 +743,TRAIN,0,0 +744,TRAIN,0,0 +745,TRAIN,0,0 +746,TRAIN,0,0 +747,TRAIN,0,0 +748,TRAIN,0,0 +749,TRAIN,0,0 +750,TRAIN,0,0 +751,TRAIN,0,0 +752,TRAIN,0,0 +753,TRAIN,0,0 +754,TRAIN,0,0 +755,TRAIN,0,0 +756,TRAIN,0,0 +757,TRAIN,0,0 +758,TRAIN,0,0 +759,TRAIN,0,0 +760,TRAIN,0,0 +761,TRAIN,0,0 +762,TRAIN,0,0 +763,TRAIN,0,0 +764,TRAIN,0,0 +765,TRAIN,0,0 +766,TRAIN,0,0 +767,TRAIN,0,0 +768,TRAIN,0,0 +769,TRAIN,0,0 +770,TRAIN,0,0 +771,TRAIN,0,0 +772,TRAIN,0,0 +773,TRAIN,0,0 +774,TRAIN,0,0 +775,TRAIN,0,0 +776,TRAIN,0,0 +777,TRAIN,0,0 +778,TRAIN,0,0 +779,TRAIN,0,0 +780,TRAIN,0,0 +781,TRAIN,0,0 +782,TRAIN,0,0 +783,TRAIN,0,0 +784,TRAIN,0,0 +785,TRAIN,0,0 +786,TRAIN,0,0 +787,TRAIN,0,0 +788,TRAIN,0,0 +789,TRAIN,0,0 +790,TRAIN,0,0 +791,TRAIN,0,0 +792,TRAIN,0,0 +793,TRAIN,0,0 +794,TRAIN,0,0 +795,TRAIN,0,0 +796,TRAIN,0,0 +797,TRAIN,0,0 +798,TRAIN,0,0 +799,TRAIN,0,0 +800,TRAIN,0,0 +801,TRAIN,0,0 +802,TRAIN,0,0 +803,TRAIN,0,0 +804,TRAIN,0,0 +805,TRAIN,0,0 +806,TRAIN,0,0 +807,TRAIN,0,0 +808,TRAIN,0,0 +809,TRAIN,0,0 +810,TRAIN,0,0 +811,TRAIN,0,0 +812,TRAIN,0,0 +813,TRAIN,0,0 +814,TRAIN,0,0 +815,TRAIN,0,0 +816,TRAIN,0,0 +817,TRAIN,0,0 +818,TRAIN,0,0 +819,TRAIN,0,0 +820,TRAIN,0,0 +821,TRAIN,0,0 +822,TRAIN,0,0 +823,TRAIN,0,0 +824,TRAIN,0,0 +825,TRAIN,0,0 +826,TRAIN,0,0 +827,TRAIN,0,0 +828,TRAIN,0,0 +829,TRAIN,0,0 +830,TRAIN,0,0 +831,TRAIN,0,0 +832,TRAIN,0,0 +833,TRAIN,0,0 +834,TRAIN,0,0 +835,TRAIN,0,0 +836,TRAIN,0,0 +837,TRAIN,0,0 +838,TRAIN,0,0 +839,TRAIN,0,0 +840,TRAIN,0,0 +841,TRAIN,0,0 +842,TRAIN,0,0 +843,TRAIN,0,0 +844,TRAIN,0,0 +845,TRAIN,0,0 +846,TRAIN,0,0 +847,TRAIN,0,0 +848,TRAIN,0,0 +849,TRAIN,0,0 +850,TRAIN,0,0 +851,TRAIN,0,0 +852,TRAIN,0,0 +853,TRAIN,0,0 +854,TRAIN,0,0 +855,TRAIN,0,0 +856,TRAIN,0,0 +857,TRAIN,0,0 +858,TRAIN,0,0 +859,TRAIN,0,0 +860,TRAIN,0,0 +861,TRAIN,0,0 +862,TRAIN,0,0 +863,TRAIN,0,0 +864,TRAIN,0,0 +865,TRAIN,0,0 +866,TRAIN,0,0 +867,TRAIN,0,0 +868,TRAIN,0,0 +869,TRAIN,0,0 +870,TRAIN,0,0 +871,TRAIN,0,0 +872,TRAIN,0,0 +873,TRAIN,0,0 +874,TRAIN,0,0 +875,TRAIN,0,0 +876,TRAIN,0,0 +877,TRAIN,0,0 +878,TRAIN,0,0 +879,TRAIN,0,0 +880,TRAIN,0,0 +881,TRAIN,0,0 +882,TRAIN,0,0 +883,TRAIN,0,0 +884,TRAIN,0,0 +885,TRAIN,0,0 +886,TRAIN,0,0 +887,TRAIN,0,0 +888,TRAIN,0,0 +889,TRAIN,0,0 +890,TRAIN,0,0 +891,TRAIN,0,0 +892,TRAIN,0,0 +893,TRAIN,0,0 +894,TRAIN,0,0 +895,TRAIN,0,0 +896,TRAIN,0,0 +897,TRAIN,0,0 +898,TRAIN,0,0 +899,TRAIN,0,0 +900,TRAIN,0,0 +901,TRAIN,0,0 +902,TRAIN,0,0 +903,TRAIN,0,0 +904,TRAIN,0,0 +905,TRAIN,0,0 +906,TRAIN,0,0 +907,TRAIN,0,0 +908,TRAIN,0,0 +909,TRAIN,0,0 +910,TRAIN,0,0 +911,TRAIN,0,0 +912,TRAIN,0,0 +913,TRAIN,0,0 +914,TRAIN,0,0 +915,TRAIN,0,0 +916,TRAIN,0,0 +917,TRAIN,0,0 +918,TRAIN,0,0 +919,TRAIN,0,0 +920,TRAIN,0,0 +921,TRAIN,0,0 +922,TRAIN,0,0 +923,TRAIN,0,0 +924,TRAIN,0,0 +925,TRAIN,0,0 +926,TRAIN,0,0 +927,TRAIN,0,0 +928,TRAIN,0,0 +929,TRAIN,0,0 +930,TRAIN,0,0 +931,TRAIN,0,0 +932,TRAIN,0,0 +933,TRAIN,0,0 +934,TRAIN,0,0 +935,TRAIN,0,0 +936,TRAIN,0,0 +937,TRAIN,0,0 +938,TRAIN,0,0 +939,TRAIN,0,0 +940,TRAIN,0,0 +941,TRAIN,0,0 +942,TRAIN,0,0 +943,TRAIN,0,0 +944,TRAIN,0,0 +945,TRAIN,0,0 +946,TRAIN,0,0 +947,TRAIN,0,0 +948,TRAIN,0,0 +949,TRAIN,0,0 +950,TRAIN,0,0 +951,TRAIN,0,0 +952,TRAIN,0,0 +953,TRAIN,0,0 +954,TRAIN,0,0 +955,TRAIN,0,0 +956,TRAIN,0,0 +957,TRAIN,0,0 +958,TRAIN,0,0 +959,TRAIN,0,0 +960,TRAIN,0,0 
+961,TRAIN,0,0 +962,TRAIN,0,0 +963,TRAIN,0,0 +964,TRAIN,0,0 +965,TRAIN,0,0 +966,TRAIN,0,0 +967,TRAIN,0,0 +968,TRAIN,0,0 +969,TRAIN,0,0 +970,TRAIN,0,0 +971,TRAIN,0,0 +972,TRAIN,0,0 +973,TRAIN,0,0 +974,TRAIN,0,0 +975,TRAIN,0,0 +976,TRAIN,0,0 +977,TRAIN,0,0 +978,TRAIN,0,0 +979,TRAIN,0,0 +980,TRAIN,0,0 +981,TRAIN,0,0 +982,TRAIN,0,0 +983,TRAIN,0,0 +984,TRAIN,0,0 +985,TRAIN,0,0 +986,TRAIN,0,0 +987,TRAIN,0,0 +988,TRAIN,0,0 +989,TRAIN,0,0 +990,TRAIN,0,0 +991,TRAIN,0,0 +992,TRAIN,0,0 +993,TRAIN,0,0 +994,TRAIN,0,0 +995,TRAIN,0,0 +996,TRAIN,0,0 +997,TRAIN,0,0 +998,TRAIN,0,0 +999,TRAIN,0,0 +1000,TRAIN,0,0 +1001,TRAIN,0,0 +1002,TRAIN,0,0 +1003,TRAIN,0,0 +1004,TRAIN,0,0 +1005,TRAIN,0,0 +1006,TRAIN,0,0 +1007,TRAIN,0,0 +1008,TRAIN,0,0 +1009,TRAIN,0,0 +1010,TRAIN,0,0 +1011,TRAIN,0,0 +1012,TRAIN,0,0 +1013,TRAIN,0,0 +1014,TRAIN,0,0 +1015,TRAIN,0,0 +1016,TRAIN,0,0 +1017,TRAIN,0,0 +1018,TRAIN,0,0 +1019,TRAIN,0,0 +1020,TRAIN,0,0 +1021,TRAIN,0,0 +1022,TRAIN,0,0 +1023,TRAIN,0,0 +1024,TRAIN,0,0 +1025,TRAIN,0,0 +1026,TRAIN,0,0 +1027,TRAIN,0,0 +1028,TRAIN,0,0 +1029,TRAIN,0,0 +1030,TRAIN,0,0 +1031,TRAIN,0,0 +1032,TRAIN,0,0 +1033,TRAIN,0,0 +1034,TRAIN,0,0 +1035,TRAIN,0,0 +1036,TRAIN,0,0 +1037,TRAIN,0,0 +1038,TRAIN,0,0 +1039,TRAIN,0,0 +1040,TRAIN,0,0 +1041,TRAIN,0,0 +1042,TRAIN,0,0 +1043,TRAIN,0,0 +1044,TRAIN,0,0 +1045,TRAIN,0,0 +1046,TRAIN,0,0 +1047,TRAIN,0,0 +1048,TRAIN,0,0 +1049,TRAIN,0,0 +1050,TRAIN,0,0 +1051,TRAIN,0,0 +1052,TRAIN,0,0 +1053,TRAIN,0,0 +1054,TRAIN,0,0 +1055,TRAIN,0,0 +1056,TRAIN,0,0 +1057,TRAIN,0,0 +1058,TRAIN,0,0 +1059,TRAIN,0,0 +1060,TRAIN,0,0 +1061,TRAIN,0,0 +1062,TRAIN,0,0 +1063,TRAIN,0,0 +1064,TRAIN,0,0 +1065,TRAIN,0,0 +1066,TRAIN,0,0 +1067,TRAIN,0,0 +1068,TRAIN,0,0 +1069,TRAIN,0,0 +1070,TRAIN,0,0 +1071,TRAIN,0,0 +1072,TRAIN,0,0 +1073,TRAIN,0,0 +1074,TRAIN,0,0 +1075,TRAIN,0,0 +1076,TRAIN,0,0 +1077,TRAIN,0,0 +1078,TRAIN,0,0 +1079,TRAIN,0,0 +1080,TRAIN,0,0 +1081,TRAIN,0,0 +1082,TRAIN,0,0 +1083,TRAIN,0,0 +1084,TRAIN,0,0 +1085,TRAIN,0,0 +1086,TRAIN,0,0 +1087,TRAIN,0,0 +1088,TRAIN,0,0 +1089,TRAIN,0,0 +1090,TRAIN,0,0 +1091,TRAIN,0,0 +1092,TRAIN,0,0 +1093,TRAIN,0,0 +1094,TRAIN,0,0 +1095,TRAIN,0,0 +1096,TRAIN,0,0 +1097,TRAIN,0,0 +1098,TRAIN,0,0 +1099,TRAIN,0,0 +1100,TRAIN,0,0 +1101,TRAIN,0,0 +1102,TRAIN,0,0 +1103,TRAIN,0,0 +1104,TRAIN,0,0 +1105,TRAIN,0,0 +1106,TRAIN,0,0 +1107,TRAIN,0,0 +1108,TRAIN,0,0 +1109,TRAIN,0,0 +1110,TRAIN,0,0 +1111,TRAIN,0,0 +1112,TRAIN,0,0 +1113,TRAIN,0,0 +1114,TRAIN,0,0 +1115,TRAIN,0,0 +1116,TRAIN,0,0 +1117,TRAIN,0,0 +1118,TRAIN,0,0 +1119,TRAIN,0,0 +1120,TRAIN,0,0 +1121,TRAIN,0,0 +1122,TRAIN,0,0 +1123,TRAIN,0,0 +1124,TRAIN,0,0 +1125,TRAIN,0,0 +1126,TRAIN,0,0 +1127,TRAIN,0,0 +1128,TRAIN,0,0 +1129,TRAIN,0,0 +1130,TRAIN,0,0 +1131,TRAIN,0,0 +1132,TRAIN,0,0 +1133,TRAIN,0,0 +1134,TRAIN,0,0 +1135,TRAIN,0,0 +1136,TRAIN,0,0 +1137,TRAIN,0,0 +1138,TRAIN,0,0 +1139,TRAIN,0,0 +1140,TRAIN,0,0 +1141,TRAIN,0,0 +1142,TRAIN,0,0 +1143,TRAIN,0,0 +1144,TRAIN,0,0 +1145,TRAIN,0,0 +1146,TRAIN,0,0 +1147,TRAIN,0,0 +1148,TRAIN,0,0 +1149,TRAIN,0,0 +1150,TRAIN,0,0 +1151,TRAIN,0,0 +1152,TRAIN,0,0 +1153,TRAIN,0,0 +1154,TRAIN,0,0 +1155,TRAIN,0,0 +1156,TRAIN,0,0 +1157,TRAIN,0,0 +1158,TRAIN,0,0 +1159,TRAIN,0,0 +1160,TRAIN,0,0 +1161,TRAIN,0,0 +1162,TRAIN,0,0 +1163,TRAIN,0,0 +1164,TRAIN,0,0 +1165,TRAIN,0,0 +1166,TRAIN,0,0 +1167,TRAIN,0,0 +1168,TRAIN,0,0 +1169,TRAIN,0,0 +1170,TRAIN,0,0 +1171,TRAIN,0,0 +1172,TRAIN,0,0 +1173,TRAIN,0,0 +1174,TRAIN,0,0 +1175,TRAIN,0,0 +1176,TRAIN,0,0 +1177,TRAIN,0,0 +1178,TRAIN,0,0 +1179,TRAIN,0,0 +1180,TRAIN,0,0 +1181,TRAIN,0,0 +1182,TRAIN,0,0 +1183,TRAIN,0,0 +1184,TRAIN,0,0 
+1185,TRAIN,0,0 +1186,TRAIN,0,0 +1187,TRAIN,0,0 +1188,TRAIN,0,0 +1189,TRAIN,0,0 +1190,TRAIN,0,0 +1191,TRAIN,0,0 +1192,TRAIN,0,0 +1193,TRAIN,0,0 +1194,TRAIN,0,0 +1195,TRAIN,0,0 +1196,TRAIN,0,0 +1197,TRAIN,0,0 +1198,TRAIN,0,0 +1199,TRAIN,0,0 +1200,TRAIN,0,0 +1201,TRAIN,0,0 +1202,TRAIN,0,0 +1203,TRAIN,0,0 +1204,TRAIN,0,0 +1205,TRAIN,0,0 +1206,TRAIN,0,0 +1207,TRAIN,0,0 +1208,TRAIN,0,0 +1209,TRAIN,0,0 +1210,TRAIN,0,0 +1211,TRAIN,0,0 +1212,TRAIN,0,0 +1213,TRAIN,0,0 +1214,TRAIN,0,0 +1215,TRAIN,0,0 +1216,TRAIN,0,0 +1217,TRAIN,0,0 +1218,TRAIN,0,0 +1219,TRAIN,0,0 +1220,TRAIN,0,0 +1221,TRAIN,0,0 +1222,TRAIN,0,0 +1223,TRAIN,0,0 +1224,TRAIN,0,0 +1225,TRAIN,0,0 +1226,TRAIN,0,0 +1227,TRAIN,0,0 +1228,TRAIN,0,0 +1229,TRAIN,0,0 +1230,TRAIN,0,0 +1231,TRAIN,0,0 +1232,TRAIN,0,0 +1233,TRAIN,0,0 +1234,TRAIN,0,0 +1235,TRAIN,0,0 +1236,TRAIN,0,0 +1237,TRAIN,0,0 +1238,TRAIN,0,0 +1239,TRAIN,0,0 +1240,TRAIN,0,0 +1241,TRAIN,0,0 +1242,TRAIN,0,0 +1243,TRAIN,0,0 +1244,TRAIN,0,0 +1245,TRAIN,0,0 +1246,TRAIN,0,0 +1247,TRAIN,0,0 +1248,TRAIN,0,0 +1249,TRAIN,0,0 +1250,TRAIN,0,0 +1251,TRAIN,0,0 +1252,TRAIN,0,0 +1253,TRAIN,0,0 +1254,TRAIN,0,0 +1255,TRAIN,0,0 +1256,TRAIN,0,0 +1257,TRAIN,0,0 +1258,TRAIN,0,0 +1259,TRAIN,0,0 diff --git a/datasets/anomaly/yahoo_sub_5/TRAIN/problem_TRAIN/problemDoc.json b/datasets/anomaly/yahoo_sub_5/TRAIN/problem_TRAIN/problemDoc.json new file mode 100644 index 0000000..417cb6b --- /dev/null +++ b/datasets/anomaly/yahoo_sub_5/TRAIN/problem_TRAIN/problemDoc.json @@ -0,0 +1,65 @@ +{ + "about": { + "problemID": "yahoo_sub_5_problem", + "problemName": "yahoo_sub_5_problem", + "problemDescription": "Anomaly detection", + "problemVersion": "4.0.0", + "problemSchemaVersion": "4.0.0", + "taskKeywords": [ + "classification", + "binary", + "tabular" + ] + }, + "inputs": { + "data": [ + { + "datasetID": "yahoo_sub_5_dataset", + "targets": [ + { + "targetIndex": 0, + "resID": "learningData", + "colIndex": 7, + "colName": "ground_truth" + } + ] + } + ], + "dataSplits": { + "method": "holdOut", + "testSize": 0.2, + "stratified": true, + "numRepeats": 0, + "randomSeed": 42, + "splitsFile": "dataSplits.csv", + "datasetViewMaps": { + "train": [ + { + "from": "yahoo_sub_5_dataset", + "to": "yahoo_sub_5_dataset_TRAIN" + } + ], + "test": [ + { + "from": "yahoo_sub_5_dataset", + "to": "yahoo_sub_5_dataset_TEST" + } + ], + "score": [ + { + "from": "yahoo_sub_5_dataset", + "to": "yahoo_sub_5_dataset_SCORE" + } + ] + } + }, + "performanceMetrics": [ + { + "metric": "f1Macro" + } + ] + }, + "expectedOutputs": { + "predictionsFile": "predictions.csv" + } +} \ No newline at end of file diff --git a/datasets/anomaly/yahoo_sub_5/yahoo_sub_5_dataset/datasetDoc.json b/datasets/anomaly/yahoo_sub_5/yahoo_sub_5_dataset/datasetDoc.json new file mode 100644 index 0000000..08f39bf --- /dev/null +++ b/datasets/anomaly/yahoo_sub_5/yahoo_sub_5_dataset/datasetDoc.json @@ -0,0 +1,95 @@ +{ + "about": { + "datasetID": "yahoo_sub_5_dataset", + "datasetName": "yahoo_sub_5", + "description": "Database of baseball players and play statistics, including 'Games_played', 'At_bats', 'Runs', 'Hits', 'Doubles', 'Triples', 'Home_runs', 'RBIs', 'Walks', 'Strikeouts', 'Batting_average', 'On_base_pct', 'Slugging_pct' and 'Fielding_ave'", + "citation": " @book{simonoff2003analyzing,title={Analyzing Categorical Data},author={Simonoff, J.S.},isbn={9780387007496},lccn={2003044946},series={Springer Texts in Statistics},url={https://books.google.com/books?id=G8wrifweAoC},year={2003},publisher={Springer New York}} ", + "license": " CC Public Domain Mark 1.0 ", + 
"source": "OpenML", + "sourceURI": "http://www.openml.org/d/185", + "approximateSize": "", + "datasetSchemaVersion": "4.0.0", + "redacted": false, + "datasetVersion": "4.0.0" + }, + "dataResources": [ + { + "resID": "learningData", + "resPath": "tables/learningData.csv", + "resType": "table", + "resFormat": { + "text/csv": [ + "csv" + ] + }, + "isCollection": false, + "columns": [ + { + "colIndex": 0, + "colName": "d3mIndex", + "colType": "integer", + "role": [ + "index" + ] + }, + { + "colIndex": 1, + "colName": "timestamp", + "colType": "integer", + "role": [ + "attribute" + ] + }, + { + "colIndex": 2, + "colName": "value_0", + "colType": "real", + "role": [ + "attribute" + ] + }, + { + "colIndex": 3, + "colName": "value_1", + "colType": "real", + "role": [ + "attribute" + ] + }, + { + "colIndex": 4, + "colName": "value_2", + "colType": "real", + "role": [ + "attribute" + ] + }, + { + "colIndex": 5, + "colName": "value_3", + "colType": "real", + "role": [ + "attribute" + ] + }, + { + "colIndex": 6, + "colName": "value_4", + "colType": "real", + "role": [ + "attribute" + ] + }, + { + "colIndex": 7, + "colName": "ground_truth", + "colType": "integer", + "role": [ + "suggestedTarget" + ] + } + ], + "columnsCount": 8 + } + ] +} \ No newline at end of file diff --git a/datasets/anomaly/yahoo_sub_5/yahoo_sub_5_dataset/tables/learningData.csv b/datasets/anomaly/yahoo_sub_5/yahoo_sub_5_dataset/tables/learningData.csv new file mode 100644 index 0000000..afae224 --- /dev/null +++ b/datasets/anomaly/yahoo_sub_5/yahoo_sub_5_dataset/tables/learningData.csv @@ -0,0 +1,1401 @@ +d3mIndex,timestamp,value_0,value_1,value_2,value_3,value_4,ground_truth +0,1,12183,0.0,3.7166666666667,5,2109,0 +1,2,12715,0.091757964510557,3.6108333333333,60,3229,0 +2,3,12736,0.17229675238449998,3.4813888888889,88,3637,0 +3,4,12716,0.22621935431999,3.3802777777778,84,1982,0 +4,5,12739,0.17635798469946,3.1933333333333,111,2751,0 +5,6,12737,0.090491245476051,2.7866666666667004,112,2128,0 +6,7,12857,0.08460994072769001,2.4627777777777995,1235,2109,0 +7,8,12884,0.06842699169496,2.2541666666667,710,2328,0 +8,9,12894,0.13330269689422,2.1180555555556,618,2453,0 +9,10,12675,0.085026586189321,2.0691666666667,84,2847,0 +10,11,13260,0.097073068447328,2.1972222222222,100,3659,0 +11,12,13470,0.0,2.3188888888889,125,5207,0 +12,13,13060,0.031063767542922,2.34,114,5146,0 +13,14,12949,0.017732750501525,2.4902777777778,145,4712,0 +14,15,13035,0.063354504072079,2.6438888888889,91,6363,0 +15,16,12980,0.087870391896335,2.8486111111111003,94,5010,0 +16,17,13677,0.11546815687729,2.8833333333333,79,3956,0 +17,18,13381,0.073413457727404,2.8808333333333,50,4063,0 +18,19,12737,0.040392584616896,2.9005555555556,39,3748,0 +19,20,12554,0.08911335594722301,3.0855555555556,28,3047,0 +20,21,12470,0.098030053711531,3.3536111111111,29,4099,0 +21,22,12490,0.047140641497552,3.7438888888889,24,2122,0 +22,23,12539,0.10481279080241,3.7947222222222,19,3387,0 +23,24,12530,0.20478886838928,3.801111111111101,21,1950,0 +24,25,13002,0.04485100631921201,3.6508333333333,27,2927,0 +25,26,12989,0.1053622140254,3.555,46,1889,0 +26,27,13038,0.08436887679639,3.4769444444444,133,1910,0 +27,28,13011,0.097980673762982,3.2158333333333,143,3747,0 +28,29,12984,0.10165726215275,3.1141666666667,86,4994,0 +29,30,13079,0.056764513454874,2.7983333333333,118,2009,0 +30,31,13048,0.074428708878932,2.4252777777778,56,2899,0 +31,32,13096,0.091244453451818,2.14,92,2298,0 +32,33,13003,0.094529332881679,1.9822222222222,85,1894,0 +33,34,13057,0.016638011234698,1.9694444444444,122,1999,0 
+34,35,13023,0.038096861957006005,2.0741666666667,74,3007,0 +35,36,13033,0.064497814457643,2.2505555555556,84,2838,0 +36,37,13034,0.030426401876334,2.2819444444444,54,4113,0 +37,38,13068,0.095423209955973,2.4216666666667,77,2150,0 +38,39,13057,0.069688744272108,2.5997222222222005,84,3007,0 +39,40,13047,0.03468622413034,2.7544444444444003,139,2484,0 +40,41,13795,0.089564461084836,2.7258333333333,65,2101,0 +41,42,13528,0.07337616196456799,2.8302777777778,38,2001,0 +42,43,13032,0.061939295606039,2.9422222222222,35,2102,0 +43,44,13084,0.11419089175512,3.0919444444444,47,2129,0 +44,45,13000,0.10475925920163,3.3519444444444,37,4422,0 +45,46,13008,0.079657960399444,3.6952777777778,53,4573,0 +46,47,12978,0.14475546275416,3.8269444444444,55,1989,0 +47,48,13067,0.1421711341096,3.7877777777778,45,1953,0 +48,49,13086,0.07696963969656899,3.7536111111111,46,1872,0 +49,50,13023,0.06393273436444799,3.61,35,1850,0 +50,51,13046,0.14973281021845006,3.5091666666667,68,2879,0 +51,52,13032,0.041478839355346,3.4205555555556,82,1840,0 +52,53,13012,0.089317973365284,3.2647222222222,154,2134,0 +53,54,13051,0.088820248166203,2.7944444444444,128,2234,0 +54,55,12979,0.054872994406929,2.46,79,3769,0 +55,56,13025,0.07913553329046401,2.2075,66,2717,0 +56,57,13007,0.16317996709063,2.1758333333333,92,2171,0 +57,58,13036,0.08671926699280201,2.3058333333333,67,2224,0 +58,59,13043,0.0733999511789,2.3983333333333,58,1967,0 +59,60,13023,0.0,2.55,58,2148,0 +60,61,13022,0.032756244361869,2.7302777777778,63,1978,0 +61,62,13033,0.054893891024455,2.8169444444444003,61,2021,0 +62,63,13024,0.068514114108229,2.9247222222222,55,2060,0 +63,64,13048,0.05279414163165401,2.8911111111111003,71,2096,0 +64,65,13740,0.023853017353212,2.9575,64,2082,0 +65,66,13540,0.07426125441559799,2.9080555555556,92,2175,0 +66,67,12724,0.024228588329879,3.0088888888889,44,2332,0 +67,68,13070,0.09233413002519697,3.2033333333333,35,2147,0 +68,69,13106,0.15930655332113,3.6213888888889,53,2163,0 +69,70,13025,0.12755838225296,4.0322222222222,49,2406,0 +70,71,13074,0.10152541717054,4.1227777777778,49,2022,0 +71,72,13079,0.040148453968243986,3.9736111111111,103,2188,0 +72,73,13184,0.087208372094752,3.8425,107,2758,0 +73,74,13194,0.074209918996797,3.7097222222222,74,2925,0 +74,75,13191,0.059044537369404015,3.6258333333333,56,3223,0 +75,76,13059,0.06248169832921499,3.4705555555556,60,2507,0 +76,77,13169,0.08876527685714597,3.2877777777778,73,2435,0 +77,78,13114,0.051354431854972,2.9286111111111004,99,2552,0 +78,79,13037,0.074790104163639,2.4888888888889,84,2540,0 +79,80,13179,0.091817341555971,2.2744444444444,129,2642,0 +80,81,13152,0.14762794333026005,2.1733333333333,101,2254,0 +81,82,13095,0.07101004447510299,2.3416666666667,101,2539,0 +82,83,13144,0.07689756334240598,2.3808333333333,51,2596,0 +83,84,13170,0.08412575787388403,2.4663888888889,95,2573,0 +84,85,13162,0.06328921386603299,2.6608333333333,48,2302,0 +85,86,13117,0.057393902128707,2.7558333333333,40,2991,0 +86,87,13129,0.041819399065704,2.8636111111111004,55,3141,0 +87,88,13386,0.073729686380986,2.7586111111111005,56,3285,0 +88,89,13929,0.15365285617975,2.7377777777778,935,3807,0 +89,90,13385,0.060355859742407016,2.6961111111111005,34,2892,0 +90,91,13106,0.10644586288975,2.8569444444444,57,2538,0 +91,92,13113,0.059314286360126985,3.1833333333333,70,2234,0 +92,93,13155,0.096293806236591,3.5544444444444,72,2707,0 +93,94,13186,0.085101425467407,3.8894444444444,66,2382,0 +94,95,13151,0.11149072274185,4.1138888888889,72,2426,0 +95,96,13156,0.076266981262989,3.9519444444444,49,2451,0 
+96,97,12813,0.097952120177625,3.8275,41,2288,0 +97,98,12821,0.17250021935572,3.6438888888889,42,2256,0 +98,99,12867,0.11389182319254,3.5608333333333,39,2884,0 +99,100,12837,0.08999961787521,3.5013888888889,81,2398,0 +100,101,12911,0.048649372449385005,3.3088888888889,90,2239,0 +101,102,12842,0.13861764684085998,2.9063888888889,92,2248,0 +102,103,12905,0.1088795585287,2.5027777777777995,81,2387,0 +103,104,12993,0.054235162564995,2.2466666666667003,145,3876,0 +104,105,12974,0.0390040506742,2.1869444444444,47,3073,0 +105,106,13039,0.0744713077811,2.2402777777778,63,3113,0 +106,107,13322,0.040258943675435,2.3727777777778,118,3363,0 +107,108,13606,0.0,2.4566666666667003,56,3796,0 +108,109,13536,0.027955712584728,2.5452777777777995,127,4924,0 +109,110,13341,0.047309968420241,2.6830555555556,48,4300,0 +110,111,13360,0.016602764360002,2.805,114,5225,0 +111,112,13450,0.042432577628353986,2.7386111111111004,78,4047,0 +112,113,14102,0.051191743726563,2.7438888888888995,58,4134,0 +113,114,14026,0.0,2.7586111111111005,56,4786,0 +114,115,13162,0.056724832354639,2.9013888888889,67,4184,0 +115,116,13118,0.055771058827737,3.19,155,2888,0 +116,117,12953,0.081014772096658,3.5561111111111003,123,2674,0 +117,118,12854,0.08253629738290899,3.8433333333333,118,2574,0 +118,119,12952,0.11499203730886,4.0319444444444,133,3123,0 +119,120,12915,0.07668513845109799,3.8844444444444,75,3369,0 +120,121,11994,0.070057457403873,3.6908333333333,29,3284,0 +121,122,11868,0.07031477357556501,3.6141666666667,68,2127,0 +122,123,11977,0.091946448716499,3.5019444444444,91,2117,0 +123,124,11874,0.14560588482235998,3.4205555555556,101,2271,0 +124,125,11913,0.094774329323472,3.1780555555556,22,2513,0 +125,126,11933,0.10217989327054,2.8361111111111,20,2746,0 +126,127,11844,0.04854243074027901,2.5222222222222004,27,2076,0 +127,128,11968,0.068760549683423,2.2416666666667004,45,2297,0 +128,129,11996,0.075440683881139,2.1588888888889,42,2312,0 +129,130,12006,0.11771339431815,2.2763888888889,59,2834,0 +130,131,12225,0.069437397660265,2.3391666666667,52,3584,0 +131,132,12482,0.0,2.4841666666667,62,4009,0 +132,133,12289,0.0,2.4911111111111,81,4142,0 +133,134,12219,0.0,2.6922222222222,84,3876,0 +134,135,12282,0.027395404320488,2.8205555555556,104,4098,0 +135,136,12367,0.055202605299814,2.8216666666667,111,3831,0 +136,137,13042,0.078387348178452,2.7122222222222,91,3842,0 +137,138,12665,0.11851571646444,2.6744444444444,33,4129,0 +138,139,12133,0.068395341911942,2.8097222222222,26,3509,0 +139,140,12023,0.04720597158087901,3.1838888888889,37,2450,0 +140,141,11847,0.07910648512645599,3.5130555555556,23,2270,0 +141,142,11980,0.067550601916344,3.7722222222222,29,2360,0 +142,143,12026,0.080666570182724,3.9058333333333,45,2431,0 +143,144,11852,0.044973875852863,3.7697222222222,49,2042,0 +144,145,12152,0.065734580284861,3.6027777777778,27,1833,0 +145,146,12148,0.068759646748575,3.5038888888889,46,1852,0 +146,147,12236,0.027278224398313,3.445,39,1927,0 +147,148,12155,0.067695565422881,3.3494444444444,72,1999,0 +148,149,12113,0.07244669924777,3.1961111111111005,81,2030,0 +149,150,12175,0.028882930937168,2.8905555555556,64,1963,0 +150,151,12103,0.021568136982842,2.5805555555556,79,2116,0 +151,152,12206,0.064254625408408,2.3380555555556004,132,2461,0 +152,153,12239,0.073869151016554,2.2116666666667,127,2388,0 +153,154,12398,0.026644044055307004,2.2013888888889,121,2846,0 +154,155,12582,0.051289858799957,2.3236111111111,98,2974,0 +155,156,12705,0.099217337562612,2.3002777777778,128,3776,0 +156,157,12555,0.016615805334675,2.385,158,3885,0 
+157,158,12476,0.078387348178452,2.5597222222222005,78,3865,0 +158,159,12706,0.0,2.6941666666667,65,4319,0 +159,160,12671,0.049384244324413,2.7169444444444,81,4646,0 +160,161,13277,0.043044731483849,2.6369444444444,586,3873,0 +161,162,12757,0.04215504851616,2.6572222222222,48,3489,0 +162,163,12401,0.042236538352835,2.8466666666667004,38,2790,0 +163,164,12248,0.1001564296112,3.1955555555556,30,2641,0 +164,165,12156,0.17378132267942994,3.5633333333333,28,2960,0 +165,166,12210,0.12005519462968,3.8113888888889,36,2192,0 +166,167,11983,0.14491137762023998,3.9655555555556,50,2145,0 +167,168,12374,0.07336941078506799,3.8483333333333,47,2133,0 +168,169,12230,0.12395626148952,3.6441666666667,82,2330,0 +169,170,12200,0.15077430423660998,3.5213888888889,56,2235,0 +170,171,12135,0.18960071033689,3.4702777777778,140,2258,0 +171,172,12131,0.06051348935254,3.3033333333333,145,2200,0 +172,173,12165,0.072057993662839,3.1933333333333,114,2161,0 +173,174,12193,0.082361078437032,2.8183333333333,129,2159,0 +174,175,12165,0.12343775199876,2.52,143,2088,0 +175,176,12304,0.1071817784483,2.2886111111111,113,2473,0 +176,177,12275,0.10359394556779,2.0822222222222,108,3217,0 +177,178,12369,0.021162435488903,2.1416666666667,93,2994,0 +178,179,12569,0.074524398314698,2.2688888888889,63,3827,0 +179,180,12766,0.12687067454443,2.335,103,4176,0 +180,181,12621,0.041752618326160014,2.4388888888889,114,4227,0 +181,182,12611,0.0,2.5386111111111,67,4290,0 +182,183,12618,0.040819652463459,2.6288888888889,106,4691,0 +183,184,12631,0.082668981599835,2.7511111111111,160,4442,0 +184,185,13121,0.06181362481077901,2.7744444444444,81,5775,0 +185,186,12871,0.0,2.8297222222222,113,3840,0 +186,187,12252,0.076137992226715,2.9708333333333,37,3721,0 +187,188,12155,0.12107639529965,3.1333333333333,70,2498,0 +188,189,12186,0.0,3.3544444444444,82,2265,0 +189,190,12179,0.19840339729984,3.6780555555556,76,2451,0 +190,191,12109,0.20112394005693,3.8038888888889,59,2892,0 +191,192,12142,0.096833471661634,3.8177777777778,58,2166,0 +192,193,12145,0.10338450919956,3.6916666666667,49,2040,0 +193,194,12162,0.10142513773096,3.5197222222222,36,2013,0 +194,195,12165,0.09779274451732,3.5186111111111003,111,2000,0 +195,196,12125,0.14744152252573,3.2597222222222,81,2117,0 +196,197,12097,0.083396348606149,3.0930555555556,92,2775,0 +197,198,12099,0.095637498006913,2.7825,113,2116,0 +198,199,12140,0.14768844039376006,2.4494444444444,90,1991,0 +199,200,12188,0.1131872329372,2.2369444444444,183,3162,0 +200,201,12157,0.073729686380986,2.0961111111111,117,2958,0 +201,202,12128,0.064614077523704,2.0377777777778,110,3153,0 +202,203,12190,0.056019959597275015,2.0730555555556003,179,2190,0 +203,204,12151,0.074812141908008,2.1655555555556,134,2172,0 +204,205,12214,0.02489388427845201,2.285,135,2074,0 +205,206,12275,0.023695834967821,2.4283333333333,100,2078,0 +206,207,12164,0.058680009072634,2.6186111111111,47,2406,0 +207,208,12120,0.10008779345816002,2.7372222222222,88,2018,0 +208,209,12693,0.066566772961868,2.8266666666667004,74,2091,0 +209,210,12624,0.070501147961051,2.8469444444444,58,2310,0 +210,211,12163,0.098779019649936,2.9855555555556,100,2113,0 +211,212,12100,0.11803653713501,3.1038888888889,49,2518,0 +212,213,12162,0.10076746585103,3.4058333333333,36,2605,0 +213,214,12106,0.053210709415363,3.6138888888889,40,2680,0 +214,215,12156,0.099346579713514,3.93,50,2228,0 +215,216,12120,0.047275248011591,3.8155555555556,58,2023,0 +216,217,12420,0.091262209791582,3.6588888888889,50,3702,0 +217,218,12417,0.038593218846488,3.5913888888889,53,1992,0 
+218,219,12450,0.070273907645883,3.4644444444444003,93,1988,0 +219,220,12395,0.029431888410363,3.3944444444444,78,1919,0 +220,221,12382,0.096854769984307,3.2227777777778,84,2213,0 +221,222,12438,0.11656453357642,2.7961111111111,112,2181,0 +222,223,12363,0.12109055114779,2.4383333333333,73,2152,0 +223,224,12393,0.20381554615786,2.2647222222222005,91,2393,0 +224,225,12399,0.046311768005022014,2.1886111111111,114,2173,0 +225,226,12456,0.18261306403662,2.2825,127,2109,0 +226,227,12442,0.021992750543024,2.3333333333333,69,3606,0 +227,228,12481,0.088072259040681,2.445,59,2114,0 +228,229,12432,0.037896500450725,2.5811111111111,64,2135,0 +229,230,12403,0.09882843339863,2.7094444444444,75,2303,0 +230,231,12406,0.076277687882641,2.88,44,2137,0 +231,232,12462,0.022875979046571,2.8555555555556,52,2264,0 +232,233,13034,0.10022162220861,2.7791666666667,42,2245,0 +233,234,12830,0.08117200437078799,2.7772222222222,45,2151,0 +234,235,12439,0.09750667785645803,3.02,26,2330,0 +235,236,12541,0.05680722879784299,3.2213888888888995,29,3357,0 +236,237,12462,0.12240855732315,3.6211111111111,32,3152,0 +237,238,12394,0.1715485140175,4.0219444444444,44,2693,0 +238,239,12507,0.075015592829224,4.0980555555556,41,3798,0 +239,240,12512,0.11388410095531,3.9080555555556,42,4596,0 +240,241,12093,0.10519027968795,3.7269444444444,46,2529,0 +241,242,12197,0.1150532998405,3.6244444444444,40,2124,0 +242,243,12138,0.10890530980571,3.5252777777778,64,2762,0 +243,244,12174,0.099350621485086,3.4675,70,2973,0 +244,245,12163,0.12889794040441002,3.3316666666667003,69,3041,0 +245,246,12096,0.12069378235889,2.9497222222222,73,2179,0 +246,247,12166,0.13053034917739,2.5708333333333,85,2322,0 +247,248,12187,0.078977758004111,2.3086111111111,63,2274,0 +248,249,12246,0.08088416337864099,2.2311111111111,67,2448,0 +249,250,12335,0.04008956024204,2.3119444444444,68,3811,0 +250,251,12556,0.05063725351997099,2.3536111111111,62,3761,0 +251,252,12652,0.039066291775136,2.4819444444444,69,4269,0 +252,253,12646,0.028611752774164,2.6605555555556,82,4244,0 +253,254,12803,0.040593364983329,2.7527777777778,56,4417,0 +254,255,12570,0.038807415292018,3.0741666666667005,38,3758,0 +255,256,12633,0.07832796288132203,2.8522222222222,30,4375,0 +256,257,13146,0.066320996162546,2.7277777777778,48,4158,0 +257,258,12994,0.083175583471284,2.7502777777778,63,3410,0 +258,259,12314,0.06802464587725401,2.8797222222222,34,2853,0 +259,260,12193,0.051675070535006,3.2027777777778,11,2628,0 +260,261,12127,0.044129112207997014,3.5633333333333,22,2287,0 +261,262,12140,0.037685894365982006,3.8808333333333,22,3334,0 +262,263,12174,0.093414561465838,4.0352777777778,12,2795,0 +263,264,12180,0.06987083046098,3.8966666666667,10,2089,0 +264,265,12861,0.021992750543024,3.7225,14,2260,0 +265,266,12957,0.11305566197523,3.73,39,3176,0 +266,267,12981,0.030884138240845,3.5558333333333,55,4049,0 +267,268,12958,0.10381377439313,3.3169444444444003,90,2902,0 +268,269,12913,0.048953768695625004,3.2322222222222,68,3743,0 +269,270,12939,0.042258794089861,2.8658333333333,95,4280,0 +270,271,12933,0.048388685585470985,2.5169444444444,70,3977,0 +271,272,13006,0.034197830567692,2.3,96,4518,0 +272,273,13091,0.08835953066771099,2.1888888888889,45,2707,0 +273,274,13201,0.086890518272785,2.2030555555556,96,3522,0 +274,275,13520,0.031087561676959,2.2711111111111,74,4584,0 +275,276,13675,0.071287463233942,2.4697222222222,82,4141,0 +276,277,13594,0.14372616993938,2.5988888888889,82,4831,0 +277,278,13466,0.12647517487142998,2.7258333333333,45,3991,0 +278,279,13448,0.042854531198562,2.7858333333333,134,4645,0 
+279,280,13492,0.039930389849144,2.7922222222222,119,4967,0 +280,281,14123,0.076184645265048,2.6988888888889,86,4578,0 +281,282,13839,0.037830020408535,2.7663888888889,75,4972,0 +282,283,13335,0.030884138240845,2.8938888888889,45,5522,0 +283,284,13196,0.048316550276279,3.1875,50,2832,0 +284,285,13047,0.10986585566763,3.6463888888889,31,2826,0 +285,286,13008,0.025485002897852004,3.866666666666701,88,2855,0 +286,287,12763,0.12451757643335,3.9808333333333,42,2660,0 +287,288,12949,0.12875690949235,3.8277777777778,70,2447,0 +288,289,13009,0.15720639094135,3.6269444444444,106,2545,0 +289,290,13008,0.079092017261926,3.5266666666667,44,3842,0 +290,291,12890,0.14711499890479998,3.5077777777778,57,2332,0 +291,292,13004,0.0531410973178,3.3455555555556,95,2294,0 +292,293,12918,0.10136246281349,3.1241666666667003,91,3016,0 +293,294,12910,0.053119315802353,2.8713888888889,66,3944,0 +294,295,12915,0.11313351589999003,2.5133333333333,66,2332,0 +295,296,13121,0.076760188212735,2.2197222222222,82,2405,0 +296,297,13076,0.08890522133351199,2.205,73,2572,0 +297,298,13096,0.1009555130175,2.2677777777778,69,2558,0 +298,299,13339,0.15685427502807,2.2991666666667,107,3701,0 +299,300,13635,0.11090638960365,2.4277777777778,101,4228,0 +300,301,13493,0.054798089981891,2.5333333333333,66,3990,0 +301,302,13402,0.08461316628091001,2.6422222222222005,47,4707,0 +302,303,13417,0.15790425505315,2.8211111111111005,47,3857,0 +303,304,13382,0.021675109392134,2.7625,66,3874,0 +304,305,14199,0.14112049645292002,2.7391666666667,102,4369,0 +305,306,13973,0.059612111520904,2.7525,71,4488,0 +306,307,13284,0.067835890522602,2.8644444444444,53,3637,0 +307,308,13070,0.047414460026828,3.1927777777778,28,2705,0 +308,309,12983,0.050348669783997005,3.5872222222222,24,2429,0 +309,310,13075,0.07296715773193299,3.8305555555556,23,2839,0 +310,311,12991,0.10713527159169,3.8827777777778,30,2371,0 +311,312,12993,0.073622496612493,3.7291666666667,25,2758,0 +312,313,13121,0.11556476355437,3.6172222222222,29,2291,0 +313,314,13097,0.034160489683707995,3.4491666666667005,27,2220,0 +314,315,13150,0.019571935182124,3.4097222222222,77,2620,0 +315,316,13078,0.15720996206912,3.2605555555556,46,2467,0 +316,317,13140,0.11515041454164,3.2191666666667,86,2088,0 +317,318,13102,0.086415715789296,2.9586111111111,97,2137,0 +318,319,13110,0.092606306920552,2.6036111111111,88,2907,0 +319,320,13138,0.046458579038692015,2.3319444444444,110,2558,0 +320,321,13238,0.10977831600416,2.2025,89,2823,0 +321,322,13317,0.11090009191451,2.2711111111111,134,2465,0 +322,323,13512,0.076652795374797,2.2897222222222005,84,4399,0 +323,324,13669,0.1087202400467,2.3297222222222005,109,4088,0 +324,325,13651,0.11471628863897,2.395,57,5099,0 +325,326,13580,0.11070024667119,2.5063888888889,49,5157,0 +326,327,13538,0.026827723134058,2.7077777777778,83,3782,0 +327,328,13657,0.029426630692549,2.735,101,4008,0 +328,329,14183,0.028611752774164,2.6958333333333,88,4534,0 +329,330,14117,0.053106181092382014,2.6930555555556,56,3242,0 +330,331,13166,0.055538160906184006,2.875,31,2808,0 +331,332,13265,0.11009690391165,3.1788888888888995,22,3676,0 +332,333,13085,0.10979978093137,3.5808333333333,32,3523,0 +333,334,13167,0.036174223284821,3.8508333333333,27,3038,0 +334,335,13170,0.048361321378982,3.9180555555556,17,2299,0 +335,336,13132,0.10958125953198,3.815,27,2345,0 +336,337,13055,0.047305343559722,3.6080555555556,38,2565,0 +337,338,13025,0.045316868664604014,3.4927777777778,73,2576,0 +338,339,13076,0.13255054531036,3.4316666666667004,56,2327,0 
+339,340,13044,0.079695587369141,3.3436111111111004,49,2211,0 +340,341,13035,0.10277355185943,3.0663888888889,90,2642,0 +341,342,13103,0.15061124796385,2.7894444444444,106,3646,0 +342,343,13067,0.14509169704095,2.4994444444444,51,2281,0 +343,344,13183,0.054445250001619004,2.2544444444444,99,2474,0 +344,345,13144,0.082058799915824,2.0847222222222,104,2536,0 +345,346,13166,0.042151311782819015,2.0888888888889,119,2900,0 +346,347,13406,0.057404703309705984,2.1594444444444,73,3144,0 +347,348,13544,0.040891918425583,2.2533333333333,92,3725,0 +348,349,13608,0.045224636676715,2.3880555555556,57,4305,0 +349,350,13522,0.0,2.6338888888889,100,3665,0 +350,351,13595,0.0,2.6588888888889,93,3791,0 +351,352,13420,0.10335456693443,2.7586111111111005,111,3897,0 +352,353,14163,0.033846222120808,2.8797222222222,91,3494,0 +353,354,13678,0.026167129419328,2.785,43,3353,0 +354,355,13272,0.08571767780871499,2.8219444444444,91,2741,0 +355,356,13071,0.12459953631184,3.0055555555556,63,2463,0 +356,357,13004,0.054750658073534006,3.2936111111111,60,3477,0 +357,358,13068,0.20799106772677,3.5575,56,2792,0 +358,359,13031,0.10314231079956,3.676111111111101,59,2183,0 +359,360,13013,0.12212653292147,3.7166666666667,48,2874,0 +360,361,12998,0.19159058299176,3.6013888888889,65,2147,0 +361,362,12971,0.10782180851978,3.4455555555556,77,2754,0 +362,363,13000,0.06408869538637901,3.4166666666667003,60,2007,0 +363,364,12998,0.095540168894753,3.1791666666667004,94,2564,0 +364,365,12906,0.039360296791109,3.0013888888889,84,3020,0 +365,366,12969,0.086611479249287,2.72,99,2004,0 +366,367,12963,0.05845507441603001,2.4527777777778,61,2047,0 +367,368,12933,0.051490800079599004,2.1816666666667,60,3531,0 +368,369,12990,0.075496432869001,2.0161111111111,78,2383,0 +369,370,12980,0.10358625218721,1.9769444444444,81,2112,0 +370,371,12982,0.062806431427897,2.0597222222222,61,2554,0 +371,372,12989,0.08970338978685001,2.2111111111111,68,2371,0 +372,373,13073,0.094517316130968,2.3141666666667,53,2060,0 +373,374,12950,0.032322011663911,2.4280555555556003,49,2086,0 +374,375,12990,0.047911560407608,2.5855555555556,40,2130,0 +375,376,13035,0.062001214431213,2.6977777777778,125,2072,0 +376,377,13681,0.027102718749392,2.7777777777778,61,2033,0 +377,378,13304,0.034703114844079,2.7988888888889,111,2683,0 +378,379,12965,0.066236017573192,2.8927777777778,32,2046,0 +379,380,12966,0.032230355211769,3.0413888888889,21,2064,0 +380,381,12943,0.11559664215716,3.3569444444444,14,2067,0 +381,382,12958,0.021952502374124,3.4808333333333,32,2496,0 +382,383,13005,0.13347711194703,3.764166666666701,29,4758,0 +383,384,12923,0.10579408349834,3.8097222222222,26,2806,0 +384,385,12812,0.10679035350244,3.6911111111111,52,2227,0 +385,386,12803,0.068633627680319,3.4902777777778,39,3123,0 +386,387,12850,0.04699518011436099,3.3769444444444,78,3460,0 +387,388,12797,0.14159640074335994,3.3011111111111004,78,3587,0 +388,389,12732,0.078500039299167,3.1369444444444,83,2558,0 +389,390,12817,0.049232295047845,2.8475,63,2306,0 +390,391,12818,0.078777592482879,2.4544444444444,108,2083,0 +391,392,12815,0.08993433499951,2.1247222222222,158,3073,0 +392,393,12805,0.081869163858473,2.0266666666667,115,3325,0 +393,394,12703,0.14556064903749,2.1763888888889,112,2321,0 +394,395,12771,0.0,2.3088888888889,73,2846,0 +395,396,12847,0.0,2.4213888888889,93,2482,0 +396,397,12872,0.030693547421212,2.6436111111111,65,2306,0 +397,398,12815,0.0,2.6602777777778,91,2298,0 +398,399,12844,0.046999447831427,2.7677777777778,106,2907,0 +399,400,12811,0.028815579681692,2.8066666666667004,66,2329,0 
+400,401,13472,0.0,2.7661111111111003,26,2456,0 +401,402,13063,0.039360296791109,2.8133333333333,23,2178,0 +402,403,12833,0.039570832199428,2.9186111111111,24,2142,0 +403,404,12842,0.090659246308087,3.1930555555556,19,2277,0 +404,405,12804,0.10540579050057003,3.565,23,3066,0 +405,406,12852,0.062601610466313,3.9133333333333,30,3619,0 +406,407,12862,0.051455855638306,3.9658333333333,23,3726,0 +407,408,12799,0.054631758648785014,3.8930555555556,35,2282,0 +408,409,12789,0.09017822949731,3.7297222222222,41,3079,0 +409,410,12815,0.045287525091609014,3.6516666666667,63,2448,0 +410,411,12887,0.033344698319951,3.5927777777778,33,2574,0 +411,412,12903,0.080098394586215,3.4694444444444,50,3697,0 +412,413,12892,0.025162301034707,3.2536111111111,88,3067,0 +413,414,12907,0.078260793447992,2.8986111111111,115,3491,0 +414,415,12883,0.07223863924679201,2.4488888888889,69,3195,0 +415,416,12965,0.042917873674349,2.2119444444444,116,2763,0 +416,417,12932,0.04720597158087901,2.2011111111111,73,2605,0 +417,418,13134,0.048273008229067,2.2338888888889,75,2755,0 +418,419,13440,0.036987975876273,2.3116666666667003,56,3300,0 +419,420,13544,0.06291463671717,2.3869444444444,66,3838,0 +420,421,13508,0.033319304393751,2.5119444444444,70,3608,0 +421,422,13401,0.029115275623859,2.5713888888889,52,3845,0 +422,423,13410,0.06821638123436,2.5088888888889,32,3563,0 +423,424,13482,0.015408589348188,2.4155555555556,16,5478,0 +424,425,14124,0.01916018435633,3.6455555555556,46,3656,0 +425,426,13703,0.06374239746477901,2.4625,53,3491,0 +426,427,13250,0.099738890728803,2.5808333333333,67,3430,0 +427,428,13092,0.10950621554455,3.0033333333333,58,2807,0 +428,429,13012,0.06138920621589401,3.3486111111111003,17,2524,0 +429,430,12901,0.051307638060244014,3.6644444444444,26,2964,0 +430,431,12848,0.082471571552878,4.0083333333333,13,3969,0 +431,432,13025,0.060122448878635,3.8530555555556,8,3561,0 +432,433,11352,0.07469842969719999,3.6183333333333,20,3394,0 +433,434,8761,0.056170625137636994,3.4922222222222,23,3005,0 +434,435,10433,0.052668952946361,3.4958333333333,34,2350,0 +435,436,10088,0.068871884486763,3.2738888888889,35,2139,0 +436,437,9485,0.040236057110938986,3.2102777777778,48,2098,0 +437,438,8865,0.053200012471363,2.8475,67,2341,0 +438,439,8920,0.056725172482788,2.4883333333332995,38,2698,0 +439,440,8798,0.035229341473877,2.1955555555556003,33,2968,0 +440,441,8927,0.0,2.1461111111111,40,2824,0 +441,442,9211,0.020190723068726,2.1522222222222,37,3003,0 +442,443,9286,0.093342961377898,2.3122222222222004,51,3551,0 +443,444,9725,0.0,2.4033333333333,52,4689,0 +444,445,11050,0.015717168144981003,2.4944444444444,57,3481,0 +445,446,11521,0.017190609993733997,2.6622222222222005,82,3376,0 +446,447,11603,0.0,2.675,74,3198,0 +447,448,11665,0.043273461915965,2.6997222222222,80,3059,0 +448,449,12153,0.029854520963498,2.6997222222222,78,2937,0 +449,450,11672,0.017383620014121998,2.7194444444444,58,2881,0 +450,451,11119,0.046391383573699006,2.8258333333333,41,2777,0 +451,452,11124,0.042155878228,3.1044444444444,34,2510,0 +452,453,10734,0.052684222339579014,3.4736111111111003,35,2356,0 +453,454,11612,0.063573954212613,3.6972222222222,40,2383,0 +454,455,11523,0.077413583128967,3.8038888888889,35,2455,0 +455,456,11632,0.069605078732108,3.7494444444444,37,2285,0 +456,457,12838,0.075937967855042,3.6813888888889,43,2455,0 +457,458,11637,0.047354002438352014,3.4791666666667003,45,4298,0 +458,459,12542,0.044000040388062,3.4530555555556,48,2400,0 +459,460,12394,0.095130971924595,3.2841666666667004,77,3431,0 +460,461,12419,0.069274987547704,3.205,79,2252,0 
+461,462,12484,0.061118974117397,2.8436111111111004,59,2628,0 +462,463,12413,0.056393740750134,2.4441666666667,107,3266,0 +463,464,12440,0.06125086589409901,2.275,100,2620,0 +464,465,12614,0.047746883512707,2.1788888888889,84,2824,0 +465,466,12693,0.047136440673386,2.2083333333333,99,2801,0 +466,467,12989,0.0,2.2997222222222,103,3106,0 +467,468,13200,0.0,2.3155555555556004,47,3532,0 +468,469,13108,0.049828520132601,2.41,67,4210,0 +469,470,12886,0.0,2.5902777777778,65,3646,0 +470,471,13000,0.0,2.6636111111111,65,3768,0 +471,472,13071,0.043576825212604,2.7105555555556,70,5342,0 +472,473,13563,0.035173891965945,2.6811111111111,76,5327,0 +473,474,13333,0.04413510379665099,2.715,40,3363,0 +474,475,12672,0.016955671451488998,2.7083333333333,54,3016,0 +475,476,12547,0.1330396486107,3.0038888888889,45,3257,0 +476,477,12289,0.016462114132943,3.3911111111111003,32,2619,0 +477,478,12584,0.055696363369897,3.6375,26,2573,0 +478,479,12526,0.036411774365825,3.7755555555556,25,2575,0 +479,480,12416,0.047966724418057,3.5786111111111003,34,5355,0 +480,481,12450,0.05609961782665,3.4222222222222,43,5809,0 +481,482,12460,0.096990479781121,3.2538888888889,68,3823,0 +482,483,12425,0.11147038220964,3.1683333333333,60,3116,0 +483,484,12430,0.044797927381498,3.0677777777778,74,2321,0 +484,485,12418,0.024403519177111,2.94,68,2193,0 +485,486,12437,0.08532776818426499,2.7291666666667003,43,2982,0 +486,487,12484,0.043615168647623,2.4147222222222005,73,4140,0 +487,488,12380,0.056692005942856,2.1419444444444,72,2353,0 +488,489,12620,0.033708553131457,2.0244444444444,66,3350,0 +489,490,12674,0.040148453968243986,2.0458333333333,90,3184,0 +490,491,12855,0.099551526697496,2.09,104,3469,0 +491,492,13053,0.0,2.1575,114,4204,0 +492,493,12898,0.036157867549894,2.2655555555556,98,6447,0 +493,494,12809,0.052738784696875,2.2561111111111,70,4898,0 +494,495,12964,0.021636091422947,2.4669444444444,101,3633,0 +495,496,12956,0.037120220639643986,2.5277777777778,77,4189,0 +496,497,13625,0.034467327401996005,2.5266666666667,69,4012,0 +497,498,13285,0.0,2.5438888888889,19,4009,0 +498,499,12715,0.096807019710259,2.6511111111111,47,4346,0 +499,500,12637,0.059601475230884,2.9711111111111004,38,2781,0 +500,501,12535,0.068431521141608,3.2288888888889,22,2811,0 +501,502,12512,0.09611085542804,3.505,20,2415,0 +502,503,12549,0.064177980162036,3.4944444444444,26,3589,0 +503,504,12567,0.11565746993409,3.4633333333333,24,2878,0 +504,505,12362,0.073501732487291,3.3177777777778,27,3471,0 +505,506,12326,0.072746100819649,3.1963888888889,25,2697,0 +506,507,12450,0.07557888002360401,3.1069444444444,57,2583,0 +507,508,12404,0.036816888038697,3.0172222222222,58,3173,0 +508,509,12362,0.093969235453559,2.9247222222222,81,3341,0 +509,510,12431,0.034848294186597004,2.5336111111111,81,2305,0 +510,511,12351,0.084191269180943,2.2480555555556,69,2186,0 +511,512,12528,0.13109036514766,2.0383333333333,50,4439,0 +512,513,12559,0.061132356147447,1.8852777777778,55,3173,0 +513,514,12586,0.019478099970089,1.9225,57,2831,0 +514,515,12864,0.0,1.9719444444444,78,16385,0 +515,516,13026,0.0,2.0608333333333,57,83955,0 +516,517,12880,0.017965204407153,2.16,78,4574,0 +517,518,12743,0.019202263481759,2.3077777777778,95,4987,0 +518,519,12812,0.0,2.415,88,5110,0 +519,520,12878,0.052306327013631,2.4669444444444,108,4893,0 +520,521,13427,0.08536575533023,2.5125,87,3807,0 +521,522,13081,0.052461360256699015,2.6294444444444,87,3447,0 +522,523,12752,0.035302992848671,2.8183333333333,44,4329,0 +523,524,12594,0.028682734942579,3.0547222222222,39,5166,0 
+524,525,12507,0.024204462299365,3.33,27,3454,0 +525,526,12494,0.034360100307537,3.5738888888889,23,3578,0 +526,527,12487,0.018977302969238,3.6888888888889,11,2406,0 +527,528,12404,0.034308847257872,3.7111111111111,13,2073,0 +528,529,11147,0.07460088255490599,3.7180555555556,24,1925,0 +529,530,11147,0.055037935083209005,3.6041666666667,77,2357,0 +530,531,11128,0.039311673522385,3.4483333333333,54,1947,0 +531,532,11106,0.046619928266775,3.2413888888888995,45,1912,0 +532,533,11115,0.048227542028921,3.1355555555556,36,2107,0 +533,534,11044,0.020367863848114,2.8172222222222,59,2985,0 +534,535,11110,0.063069968046591,2.4275,81,2081,0 +535,536,11190,0.054470866056974016,2.2513888888889,50,2631,0 +536,537,11063,0.0,2.0691666666667,53,2130,0 +537,538,11078,0.059261864411046,2.0155555555556,44,2085,0 +538,539,11146,0.064174002348993,2.0952777777778,87,2211,0 +539,540,11010,0.0,2.2397222222222,94,2105,0 +540,541,11139,0.021912411214588,2.3275,128,2585,0 +541,542,11117,0.057958262002105985,2.5255555555556004,82,3695,0 +542,543,11081,0.035358633773416,2.665,49,3198,0 +543,544,11128,0.029191244440103,2.7975,79,3191,0 +544,545,11720,0.054981313823219,2.8597222222222,62,2016,0 +545,546,11384,0.06405347705857799,2.7983333333333,64,2124,0 +546,547,11018,0.0,2.9322222222222,34,2105,0 +547,548,11104,0.055445634363329,3.08,41,2031,0 +548,549,11084,0.040996998867197,3.3466666666667004,47,1964,0 +549,550,11106,0.027670189755404,3.6869444444444,31,2016,0 +550,551,11055,0.054579839310753,3.7966666666667,26,3909,0 +551,552,11098,0.044833640073299014,3.7805555555556,17,2105,0 +552,553,11028,0.03282297151413,3.7422222222222,30,2405,0 +553,554,11152,0.017696014614986,3.639166666666701,17,2141,0 +554,555,11025,0.09418709999244,3.4775,28,1910,0 +555,556,11015,0.061817529149429,3.3283333333333,20,1951,0 +556,557,11125,0.054000161367618,3.1702777777778,85,2310,0 +557,558,11035,0.06165600249599,2.7688888888889,52,2047,0 +558,559,11103,0.055915839259234,2.4266666666667,143,2048,0 +559,560,11100,0.062788330996733,2.1963888888889,106,3083,0 +560,561,11170,0.044888048273534,2.135,244,3619,0 +561,562,11078,0.095259484956337,2.3186111111111,2005,2172,0 +562,563,11150,0.021952502374124,2.3383333333333,124,3142,0 +563,564,11149,0.0,2.5002777777778,109,2256,0 +564,565,10984,0.0,2.6527777777778,148,2200,0 +565,566,11034,0.0,2.7661111111111003,126,2183,0 +566,567,11050,0.061557079663167,2.7347222222222,46,2030,0 +567,568,11102,0.14186075040414,2.6069444444444,49,2297,0 +568,569,11743,0.0,2.5547222222222,40,2213,0 +569,570,11371,0.077457673524504,2.4716666666667004,39,4014,0 +570,571,11078,0.16422977329792998,2.6530555555556004,25,2809,0 +571,572,11224,0.049366067455729,2.9488888888889,37,2355,0 +572,573,11146,0.10064381631633,3.3383333333333,32,2372,0 +573,574,11199,0.11909159312806,3.5419444444444,47,2387,0 +574,575,11181,0.09003816676619801,5.3302777777778,34,2359,0 +575,576,11022,0.055882659245704,3.7727777777778,40,2485,0 +576,577,11073,0.1836893913223,3.6333333333333,46,3728,0 +577,578,11120,0.08574268253550299,3.5430555555556,35,2820,0 +578,579,11008,0.12559700716583,3.6711111111111,61,2426,0 +579,580,11078,0.086129850619071,3.4572222222222,56,2307,0 +580,581,11121,0.041752618326160014,3.2,72,2233,0 +581,582,11041,0.094396473652892,2.7772222222222,110,2178,0 +582,583,11168,0.045323960075285004,2.415,135,2243,0 +583,584,11213,0.13808411333909,2.2530555555556004,133,2713,0 +584,585,11238,0.08029349854683501,2.0994444444444,148,3168,0 +585,586,11273,0.06507307495461,2.1780555555556003,86,3163,0 
+586,587,11479,0.084518021856329,2.2638888888889,132,3289,0 +587,588,11839,0.030507395540508,2.3575,73,4001,0 +588,589,11735,0.05892502921299701,2.4680555555556003,95,4684,0 +589,590,11574,0.0,2.6208333333333,74,4137,0 +590,591,11531,0.033075906123641,2.6863888888889,51,4787,0 +591,592,11420,0.16633704704670998,2.6172222222222,65,4278,0 +592,593,12301,0.10228536028167,2.6194444444444,95,3898,0 +593,594,11845,0.16949365549682996,2.6358333333333,72,3728,0 +594,595,11374,0.08260397756200501,2.8661111111111004,41,4047,0 +595,596,11370,0.024378363844868,3.0533333333333,38,3373,0 +596,597,11197,0.15686874147816002,3.4438888888889,32,2669,0 +597,598,11171,0.063929461148943,3.6552777777778,22,3289,0 +598,599,11197,0.12602019009982998,3.8519444444444,29,2556,0 +599,600,11114,0.035137191893634005,3.8069444444444,32,2557,0 +600,601,12564,0.14965728062748998,3.5961111111111004,40,3003,0 +601,602,12459,0.10046170077382,3.5344444444444,59,2441,0 +602,603,12508,0.13163105487926,3.3972222222222,52,2396,0 +603,604,12464,0.043899611017859004,3.3936111111111003,42,3426,0 +604,605,12438,0.19567092855859,3.1025,46,2379,0 +605,606,12449,0.19135011734275,2.8630555555556,97,3026,0 +606,607,12373,0.11171915024595,2.4255555555556003,72,2336,0 +607,608,12594,0.032053604746412,1.8619444444444,81,2850,0 +608,609,12623,0.096448361580655,1.8930555555556,81,3016,0 +609,610,12759,0.07934996156433399,2.2080555555556,70,3537,0 +610,611,12841,0.024581173073578,2.3052777777778,89,3899,0 +611,612,13063,0.025596039426134,2.3777777777777995,87,5044,0 +612,613,13023,0.027922074309281,2.5161111111111,125,4806,0 +613,614,12884,0.02593545023878,2.6411111111111,69,4139,0 +614,615,13007,0.033086949155743,2.8011111111111004,57,4776,0 +615,616,13016,0.047260069860172005,2.7236111111111003,99,4065,0 +616,617,13588,0.038487130166032016,2.6813888888889,111,4969,0 +617,618,13272,0.16080169828563,2.7336111111111,71,3784,0 +618,619,12589,0.12635270044885,2.8863888888889,71,3297,0 +619,620,12651,0.046904491868436,3.1225,48,3347,0 +620,621,12616,0.059534673085297,3.4613888888889,76,3170,0 +621,622,12492,0.12198352023568,3.8297222222222,56,2241,0 +622,623,12497,0.052131597947042,3.8936111111111,35,2301,0 +623,624,12623,0.094084438832673,3.7588888888889,35,2303,0 +624,625,12481,0.13486764750848,3.5827777777778,29,2587,0 +625,626,12434,0.062226183256115,3.4730555555556,38,3211,0 +626,627,12495,0.091202035463034,3.4175,69,2604,0 +627,628,12375,0.096137859324631,3.3533333333333,77,2841,0 +628,629,12357,0.10449109200785,3.1963888888889,20,2168,0 +629,630,12433,0.097127966420289,2.8852777777778,24,2265,0 +630,631,12432,0.064404980330111,2.4880555555556003,83,2908,0 +631,632,12429,0.10188181868693,2.2325,62,3180,0 +632,633,12551,0.19953464365013,2.1044444444444,54,3118,0 +633,634,12799,0.0747839457206,2.1097222222222,54,3296,0 +634,635,12818,0.0,2.235,60,4432,0 +635,636,13071,0.0,2.3516666666667003,63,4336,0 +636,637,12897,0.0,2.5138888888889,95,4534,0 +637,638,12961,0.041436571087464,2.6105555555556004,69,4261,0 +638,639,12925,0.038671790863765,2.7233333333333,68,5248,0 +639,640,12968,0.035810634316102014,2.6633333333333,58,5014,0 +640,641,13525,0.1409929213297,2.5580555555556,107,3864,0 +641,642,12993,0.0,2.6627777777778,48,5682,0 +642,643,12369,0.052915080344848,2.7625,64,4404,0 +643,644,12195,0.11966022897483,3.0283333333333,52,3705,0 +644,645,12464,0.12973870706052,3.3727777777778,61,2738,0 +645,646,12470,0.023838633821411,3.6369444444444,47,2887,0 +646,647,12475,0.12358680271021,3.7088888888889,58,3776,0 
+647,648,12482,0.089095336472172,3.5847222222222,51,3532,0 +648,649,12221,0.019762530636927,3.4836111111111,61,3724,0 +649,650,12325,0.020994992941051,3.4077777777778,53,2786,0 +650,651,12258,0.10380294658324002,3.4441666666667,55,2941,0 +651,652,11980,0.079228021087742,3.1683333333333,52,2351,0 +652,653,11947,0.039012779943635,3.0527777777778,89,2316,0 +653,654,12291,0.10658713601061,2.8527777777778,85,2350,0 +654,655,12293,0.14426278476756,2.5433333333333,106,2916,0 +655,656,12341,0.08706206992122,2.1997222222222,88,2437,0 +656,657,12390,0.16325946030154,2.1036111111111,59,2761,0 +657,658,12611,0.0,2.2133333333333,48,3941,0 +658,659,12737,0.0,2.2086111111111,66,4025,0 +659,660,12882,0.07729609083366701,2.2883333333333,95,4466,0 +660,661,12891,0.058100747891124,2.3222222222222,82,4401,0 +661,662,12756,0.061191523312340984,2.47,76,4747,0 +662,663,12875,0.08592375974441901,2.685,104,4051,0 +663,664,12847,0.033467197342519,2.6763888888889,54,4448,0 +664,665,13518,0.030265788895452006,2.5838888888889,43,3736,0 +665,666,13217,0.11950310860409,2.6130555555556003,39,3918,0 +666,667,12621,0.09169148327055697,2.7633333333333,48,3408,0 +667,668,12591,0.18439354827551,3.0708333333333,38,2883,0 +668,669,12332,0.10741924067542,3.4347222222222,45,3631,0 +669,670,12404,0.15862461647089002,3.7030555555556,64,2609,0 +670,671,12457,0.14957813136313,3.8138888888889,35,2533,0 +671,672,12370,0.24059408570531,3.8508333333333,66,2469,0 +672,673,11509,0.15511115210127,3.8961111111111,61,2458,0 +673,674,11433,0.19582462633148,3.4763888888889,58,2458,0 +674,675,11317,0.13981560037535998,3.4041666666667,51,2043,0 +675,676,11364,0.1392329990551,3.2352777777778,55,1985,0 +676,677,11350,0.13079770999921,3.1508333333333,126,2032,0 +677,678,11348,0.053672881218709015,2.7863888888888995,61,3409,0 +678,679,11365,0.10971373742228,2.4861111111111,94,2018,0 +679,680,11505,0.13825204927093,2.2444444444444,83,2461,0 +680,681,11468,0.13912778922607,2.1286111111111,136,2318,0 +681,682,11562,0.10215803640865,2.1261111111111,104,2787,0 +682,683,11858,0.096617489053804,2.2405555555556003,77,3186,0 +683,684,11933,0.0,2.2991666666667,109,3490,0 +684,685,11813,0.0,2.3627777777778,146,3407,0 +685,686,11735,0.0,2.5863888888889,69,3193,0 +686,687,11848,0.0,2.7286111111111,121,3412,0 +687,688,11843,0.0,2.8355555555556,53,3563,0 +688,689,12318,0.068897518746959,2.7875,61,3247,0 +689,690,11846,0.05418569809170299,2.7825,82,3012,0 +690,691,11066,0.06507307495461,2.7972222222222,37,2382,0 +691,692,10920,0.10547682048851,3.0355555555556,19,2012,0 +692,693,10836,0.056437861708265,3.2486111111111,19,1915,0 +693,694,10879,0.098703711593837,3.6077777777778,19,1982,0 +694,695,10796,0.14331889652193,3.76,54,1950,0 +695,696,10785,0.05704449488642,3.806666666666701,44,4176,0 +696,697,9469,0.0,3.6638888888889,46,3654,0 +697,698,9278,0.032146952736052,3.5161111111111003,53,3063,0 +698,699,9417,0.068135614649249,3.3286111111111003,83,1916,0 +699,700,9253,0.034514299845882,3.2166666666667,92,1848,0 +700,701,9435,0.028306668795131006,2.9783333333333,94,1704,0 +701,702,9356,0.13119921991025002,2.7211111111111004,111,1680,0 +702,703,9354,0.093609772007723,2.4102777777778,84,2011,0 +703,704,9405,0.11179018663123,2.1366666666667,52,1772,0 +704,705,9326,0.065272680657868,1.9947222222222,68,1838,0 +705,706,9549,0.15901886092526998,1.9936111111111,35,1924,0 +706,707,9499,0.0,2.0788888888889,40,2038,0 +707,708,9371,0.26537507315217,2.1736111111111,47,1991,0 +708,709,9462,0.0,2.4027777777778,85,1729,0 +709,710,9509,0.056610336908172985,2.4580555555556,59,1673,0 
+710,711,9469,0.026644044055307004,2.6102777777777995,61,1656,0 +711,712,9522,0.040819652463459,2.7597222222222,45,1774,0 +712,713,9885,0.13497701521251,2.8122222222222,47,1784,0 +713,714,9802,0.16853433621426,2.8427777777778,72,1818,0 +714,715,9461,0.08655557751574,2.87,69,1981,0 +715,716,9393,0.05741127788681901,2.9769444444444,17,2004,0 +716,717,9638,0.037244401880164,3.3241666666667005,47,1788,0 +717,718,9435,0.1132743034971,3.6375,37,1786,0 +718,719,9519,0.15690958465910998,3.8652777777778,57,1781,0 +719,720,9492,0.09604225449090803,3.8091666666667,62,2024,0 +720,721,9458,0.06746445682560599,3.6844444444444,72,1669,0 +721,722,9420,0.058373145210404015,3.5913888888889,43,1729,0 +722,723,9429,0.048008603166117006,3.5255555555556,57,1682,0 +723,724,9461,0.12614216994504,3.3277777777778,47,1714,0 +724,725,9404,0.077186121310215,3.07,61,1679,0 +725,726,9366,0.042879382350005,2.7622222222222,53,1739,0 +726,727,9488,0.031014262794497007,2.3872222222222,78,1669,0 +727,728,9515,0.13957171072647,2.1308333333333,100,1806,0 +728,729,9487,0.027108383258306,2.1563888888889,104,1650,0 +729,730,9497,0.0,2.2547222222222003,56,1751,0 +730,731,9516,0.0,2.3397222222222003,89,1685,0 +731,732,9504,0.0,2.4808333333333,108,1645,0 +732,733,9422,0.025265991419408,2.6208333333333,67,2133,0 +733,734,9543,0.0,2.8138888888889,83,1618,0 +734,735,9395,0.047219926720593,2.9275,90,1623,0 +735,736,9352,0.083109434319356,2.8663888888888995,82,1697,0 +736,737,9884,0.10860709298782,2.7794444444444,76,1684,0 +737,738,9820,0.098319718095083,2.8194444444444,34,1779,0 +738,739,9439,0.02201293380153,2.9458333333333,43,2982,0 +739,740,9560,0.064929719079082,3.2413888888888995,40,1848,0 +740,741,9589,0.036960535765785,3.7166666666667,40,1772,0 +741,742,9575,0.068536856116777,4.1333333333333,57,1841,0 +742,743,9541,0.012398281267649,4.2697222222222,60,1834,0 +743,744,9490,0.035305311833591015,4.2797222222222,53,1860,0 +744,745,7160,0.024153733176505,4.0,44,1647,0 +745,746,7233,0.031750779212929,3.8877777777778,48,2129,0 +746,747,7166,0.092612685693125,3.6633333333333,50,1763,0 +747,748,7245,0.12674340154738,3.6127777777778,65,1433,0 +748,749,7299,0.068594711667718,3.3175,93,1428,0 +749,750,7169,0.13866540834682,2.8930555555556,105,1521,0 +750,751,7228,0.046813024390007014,2.4722222222222,94,1622,0 +751,752,7123,0.072990045810784,2.2294444444444,53,1580,0 +752,753,7199,0.17156759541908995,2.1286111111111,59,1468,0 +753,754,7167,0.051876699734571985,2.2219444444444,63,1520,0 +754,755,7212,0.031958698733103,2.3366666666667,61,1529,0 +755,756,7206,0.07333373485157901,2.4155555555556,72,1611,0 +756,757,7149,0.0,2.5408333333333,93,1511,0 +757,758,7284,0.023187512335638,2.6511111111111,62,1906,0 +758,759,7265,0.031672522871666,2.8405555555556,50,2632,0 +759,760,7221,0.091103855362214,2.8336111111111,42,1483,0 +760,761,7588,0.0,2.6575,62,1611,0 +761,762,7423,0.0983398607742,2.6622222222222005,21,1676,0 +762,763,7198,0.08011943311413,2.7719444444444,28,1670,0 +763,764,7279,0.043646436319699,3.0344444444444,65,1631,0 +764,765,7174,0.091445521226266,3.3741666666667003,37,1799,0 +765,766,7259,0.067771120773973,3.6925,20,1511,0 +766,767,7166,0.049768578185777006,3.8136111111111,47,1605,0 +767,768,7171,0.067455979006223,3.8202777777778,45,1758,0 +768,769,6883,0.14102875351082,3.7547222222222,49,1509,0 +769,770,6859,0.04521932948417,3.6077777777778,46,1591,0 +770,771,6817,0.032382889221133,3.5330555555556,30,1543,0 +771,772,6877,0.075100266089453,3.3544444444444,30,1573,0 +772,773,6785,0.038989846359505,3.1155555555556,48,1473,0 
+773,774,6665,0.093396608626074,2.8463888888888995,36,1476,0 +774,775,6805,0.06797619687558401,2.4411111111111,46,1712,0 +775,776,6863,0.08326287339845401,2.1455555555556,27,1801,0 +776,777,6926,0.015112630017379,2.0025,79,1902,0 +777,778,7004,0.031549757127405,2.1247222222222,65,2005,0 +778,779,6950,0.0,2.2741666666667,57,2363,0 +779,780,7262,0.0,2.3272222222222005,61,2513,0 +780,781,7361,0.017214486216241002,2.4363888888889,89,2664,0 +781,782,7288,0.015541991667356,2.6155555555556003,80,2714,0 +782,783,7463,0.0,2.7272222222222,79,2754,0 +783,784,7188,0.027199843934104,2.6552777777778,113,2670,0 +784,785,7658,0.053744802378685,2.6086111111111,71,2584,0 +785,786,7575,0.05675511278546901,2.6025,53,2466,0 +786,787,6954,0.070873939193717,2.7372222222222,64,2137,0 +787,788,6862,0.19022950977106,3.0125,43,1931,0 +788,789,6896,0.17589540947937002,3.3477777777778,34,1743,0 +789,790,6954,0.022875979046571,3.6236111111111,29,1713,0 +790,791,6869,0.0,3.7383333333333,30,1649,0 +791,792,6890,0.13681403156951,3.7772222222222,24,1633,0 +792,793,9742,0.058507485759525,3.6966666666667,40,1993,0 +793,794,9730,0.10227075584148,3.7733333333333,32,1940,0 +794,795,9810,0.06726096113022301,3.6408333333333,39,1951,0 +795,796,9688,0.15267199916685995,3.3922222222222,67,1894,0 +796,797,9849,0.069818221889972,3.1627777777778,65,1801,0 +797,798,9765,0.030305771594539,2.6875,49,1962,0 +798,799,9812,0.09211700324247198,2.3533333333333,41,2123,0 +799,800,9931,0.12298177354813,2.0425,50,2434,0 +800,801,9908,0.08705722689013601,1.9738888888889,48,2402,0 +801,802,10066,0.07529920073678098,2.0425,59,3013,0 +802,803,10184,0.06217694957317299,2.1563888888889,51,3086,0 +803,804,10295,0.020886039183631,2.2866666666667004,43,3527,0 +804,805,10113,0.08148200392528,2.3919444444444,72,3716,0 +805,806,10218,0.027014133895137,2.5513888888889,52,3577,0 +806,807,10322,0.08271940630361399,2.6030555555556,68,3430,0 +807,808,10269,0.038537180887872,2.6647222222222005,74,3413,0 +808,809,10781,0.090543853269643,2.5930555555556003,46,3755,0 +809,810,10486,0.02593545023878,2.5513888888889,64,4806,0 +810,811,10124,0.090692829340129,2.76,38,3127,0 +811,812,9993,0.09154630234853098,3.0636111111111,40,3421,0 +812,813,9801,0.09562635368432304,3.4016666666667,50,2475,0 +813,814,9760,0.0,3.7277777777778,42,2440,0 +814,815,9858,0.0,3.7902777777778,37,2731,0 +815,816,9884,0.027267039980187,3.7355555555556,34,2493,0 +816,817,7781,0.024102810048699,3.535,37,1665,0 +817,818,7742,0.072297652068167,3.5819444444444,47,1771,0 +818,819,7682,0.12348623922845,3.3847222222222,67,2293,0 +819,820,7831,0.077453588867077,3.2547222222222,66,1959,0 +820,821,7641,0.05662557916213299,3.125,91,1498,0 +821,822,7641,0.15509029304093,2.7766666666667,132,1537,0 +822,823,7759,0.079595064406905,2.4725,149,1580,0 +823,824,7748,0.053225613553497,2.1927777777778,65,1901,0 +824,825,7776,0.05741127788681901,2.1283333333333,50,1916,0 +825,826,7938,0.077171346852694,2.2319444444444,70,2213,0 +826,827,8031,0.0,2.3061111111111,82,2205,0 +827,828,8117,0.07512642149906099,2.3363888888889,72,2486,0 +828,829,8099,0.0,2.3686111111111,98,2580,0 +829,830,8002,0.0,2.4986111111111,78,2530,0 +830,831,7944,0.026463035590685,2.6433333333333,86,2664,0 +831,832,7963,0.024228588329879,2.7563888888889,76,4368,0 +832,833,8602,0.055182797357095005,2.6652777777778,95,3103,0 +833,834,8269,0.09607690135523,2.6844444444444,63,2249,0 +834,835,7871,0.059431847203259,2.7902777777778,32,2070,0 +835,836,7709,0.018731901987648,3.1119444444444,30,2833,0 +836,837,7726,0.033970515582906,3.5491666666667,27,1734,0 
+837,838,7781,0.049963174087431,3.7102777777778,22,2151,0 +838,839,7762,0.073295374096872,3.7961111111111,19,2103,0 +839,840,7692,0.017715537831218996,3.7730555555556,32,1725,0 +840,841,6608,0.014656639469103996,3.5919444444444,45,1895,0 +841,842,6526,0.15513271231042,3.5580555555556,65,1959,0 +842,843,6531,0.06544162031760599,3.4588888888889,73,1637,0 +843,844,6483,0.12276447331552,3.2969444444444003,52,1658,0 +844,845,6602,0.054046416943085,3.2288888888889,93,1666,0 +845,846,6555,0.06827770027642299,2.7358333333333,68,2410,0 +846,847,6610,0.10171854295932,2.4636111111111,127,1787,0 +847,848,6690,0.093454285728882,2.1894444444444,105,2264,0 +848,849,6651,0.04318436192577,2.1227777777778,75,2007,0 +849,850,6759,0.10050707347524,2.1369444444444,77,2107,0 +850,851,6836,0.019571935182124,2.2230555555556,140,2355,0 +851,852,6894,0.0,2.3188888888889,132,2726,0 +852,853,6844,0.0,2.4166666666667003,100,2875,0 +853,854,6773,0.02713995635286,2.5777777777778,174,2780,0 +854,855,6802,0.092632629280125,2.7869444444444,82,3936,0 +855,856,6947,0.098676638207998,2.8586111111111,128,3116,0 +856,857,7248,0.0,3.0816666666667003,79,3770,0 +857,858,6885,0.11132365864914,2.8713888888889,71,2382,0 +858,859,6643,0.0947301899901,2.9386111111111,60,2152,0 +859,860,6560,0.061070711161473,2.9827777777778,60,1754,0 +860,861,6554,0.18477832073133,3.3197222222222,56,1783,0 +861,862,6600,0.055986690710270993,3.5961111111111004,78,1780,0 +862,863,6525,0.16264480046039995,3.7613888888889,60,1582,0 +863,864,6543,0.026215643469448,3.7305555555556,48,2271,0 +864,865,9018,0.0,3.5580555555556,48,2592,0 +865,866,9225,0.054655616583012,3.5136111111111004,42,2921,0 +866,867,9112,0.07076692500883701,3.3772222222222,64,1814,0 +867,868,9195,0.067217215228375,3.2402777777778,36,3219,0 +868,869,9206,0.046060828388587,3.0586111111111003,40,2567,0 +869,870,9224,0.08329795085471901,2.7908333333333,18,1899,0 +870,871,9408,0.08219020764935,2.3761111111111,35,1801,0 +871,872,9082,0.046792553198475,2.1347222222222,44,2005,0 +872,873,9168,0.06755714954154099,1.9991666666667,105,2572,0 +873,874,9258,0.099050882008287,1.9983333333333,71,3563,0 +874,875,9158,0.0,2.0908333333333,65,2777,0 +875,876,9140,0.10824637351267,2.2311111111111,74,3362,0 +876,877,9206,0.0,2.3219444444444,34,3590,0 +877,878,9186,0.0,2.4727777777778,49,2930,0 +878,879,9155,0.037750185176735,2.5952777777778,44,2481,0 +879,880,9174,0.030345867660395,2.7416666666667004,57,2571,0 +880,881,9758,0.057665227298857,2.7652777777778,102,3546,0 +881,882,9451,0.16774071722374,2.7980555555556,106,4984,0 +882,883,9153,0.10462164884166,2.7597222222222,58,1994,0 +883,884,9233,0.051974117163582,3.0116666666667005,57,3060,0 +884,885,9250,0.070438547008222,3.2916666666667003,62,2151,0 +885,886,9317,0.11437533048244,3.5547222222222,42,2158,0 +886,887,9130,0.028754095353637,3.7580555555556,35,2319,0 +887,888,9249,0.06874265819680701,3.7330555555556,28,1909,0 +888,889,8297,0.041552255552731,3.5886111111111005,27,1627,0 +889,890,8245,0.033571347720577,3.5255555555556,35,2459,0 +890,891,8298,0.014724878652831,3.3858333333333,50,3167,0 +891,892,8247,0.046095580964192,3.2677777777778,69,1839,0 +892,893,8387,0.031859774913781,3.1247222222222,64,3887,0 +893,894,8392,0.094121536253424,2.7213888888888995,69,2031,0 +894,895,8531,0.11471874999036,2.3972222222222004,58,1522,0 +895,896,8437,0.09375530196425097,2.0836111111111,58,1732,0 +896,897,8344,0.10898948864079,2.0644444444444,51,2169,0 +897,898,8274,0.031129909255124,2.2063888888889,46,1679,0 +898,899,8328,0.0,2.3044444444444,84,1941,0 
+899,900,8351,0.020155867044519,2.47,144,1638,0 +900,901,8380,0.016795241270985,2.5697222222222003,86,1725,0 +901,902,8332,0.0,2.7625,69,1903,0 +902,903,8366,0.0,2.9436111111111005,81,2074,0 +903,904,8357,0.01748186857624,2.7905555555556,175,1848,0 +904,905,8867,0.015638795432702,2.7527777777778,65,1761,0 +905,906,8659,0.037878946671491,2.6980555555556,48,1838,0 +906,907,8458,0.14870829462531002,2.9102777777778,33,1640,0 +907,908,8360,0.07322030784057597,3.2663888888889,35,1715,0 +908,909,8330,0.10504553292421,3.5372222222222,37,1717,0 +909,910,8298,0.10771048774666,3.86,31,1758,0 +910,911,8381,0.07484115005697,3.9216666666667,36,1975,0 +911,912,8393,0.10377526695926,3.8766666666667,30,1865,0 +912,913,3998,0.052336696506499,3.6463888888889,28,3575,0 +913,914,3733,0.039930389849144,3.6552777777778,24,1413,0 +914,915,3735,0.052659026600132,3.5880555555556,68,1414,0 +915,916,3709,0.071593754146172,3.3594444444444003,26,1170,0 +916,917,3755,0.072107773186609,3.1888888888889,78,1209,0 +917,918,3782,0.14407221323011,2.7575,90,1170,0 +918,919,3849,0.078873737285415,2.3936111111111,76,1328,0 +919,920,3801,0.090543853269643,2.1925,94,1258,0 +920,921,3787,0.0,2.16,70,1427,0 +921,922,3835,0.18229662394063,2.2719444444444,129,1480,0 +922,923,4035,0.10064381631633,2.3994444444444,120,1687,0 +923,924,4173,0.0,2.2836111111111,122,1942,0 +924,925,3995,0.0,2.5422222222222004,100,1967,0 +925,926,4016,0.0,2.6908333333333,102,2110,0 +926,927,4049,0.064661049677152,2.7702777777778,118,1956,0 +927,928,4014,0.10610212880951,2.7405555555556,86,1984,0 +928,929,4263,0.098345239553664,2.6908333333333,92,1893,0 +929,930,3941,0.055426072308289,2.7008333333333,44,1821,0 +930,931,4023,0.026036719363444,2.8322222222222,25,1641,0 +931,932,3917,0.058176601538018,3.0922222222222,54,1604,0 +932,933,3910,0.11644035456955,3.4363888888889,48,1265,0 +933,934,3934,0.067489738764642,3.7530555555556,56,1407,0 +934,935,3783,0.091155534540558,3.9127777777778,42,1342,0 +935,936,3834,0.052217414705359004,3.7608333333333,41,1216,0 +936,937,8698,0.028401045145692,3.6472222222222,32,2569,0 +937,938,8969,0.06030991242653401,3.5544444444444,48,2150,0 +938,939,8928,0.057683225704233,3.5036111111111,40,2317,0 +939,940,9020,0.049602244305935,3.2538888888889,26,2047,0 +940,941,8865,0.054771618715138,3.1886111111111,55,2065,0 +941,942,8830,0.014455899164978,2.7341666666667,52,1909,0 +942,943,8879,0.05563571922395901,2.3655555555556003,34,1910,0 +943,944,9120,0.077488949885965,2.1688888888889,61,2037,0 +944,945,9111,0.06776025909838901,2.0977777777778,34,3065,0 +945,946,9071,0.033919453583666,2.3077777777778,50,2452,0 +946,947,9205,0.030948232299768,2.3611111111111,47,3226,0 +947,948,9355,0.0,2.4986111111111,56,3271,0 +948,949,9372,0.0,2.5691666666667,76,3471,0 +949,950,9392,0.0,2.7463888888889,60,3922,0 +950,951,9416,0.0,2.8063888888888995,100,3296,0 +951,952,9394,0.0,2.8091666666667003,80,3171,0 +952,953,9810,0.10150033578287,2.715,74,3208,0 +953,954,9594,0.13650296233629,2.6869444444444,24,3602,0 +954,955,9006,0.048341331534980006,2.8180555555556,41,3208,0 +955,956,9140,0.055919636698743,3.0541666666667004,19,3455,0 +956,957,8925,0.052826773889684014,3.4711111111111004,24,2833,0 +957,958,9047,0.07932984590431501,3.7566666666667,18,3453,0 +958,959,9030,0.033310879512461,3.8633333333333,28,3155,0 +959,960,9088,0.048306771033288,3.7519444444444,5,2145,0 +960,961,8569,0.034002578802562,3.6480555555556,12,1999,0 +961,962,8616,0.047801640470854015,3.5061111111111005,35,2135,0 +962,963,8497,0.13378075099383,3.47,41,1813,0 
+963,964,8439,0.063853685461221,3.3086111111111003,30,2020,0 +964,965,8567,0.0,3.1194444444444,22,2127,0 +965,966,8694,0.073869151016554,2.8044444444444,56,1764,0 +966,967,8739,0.043582908466928014,2.4205555555556004,34,2249,0 +967,968,8761,0.0,2.1180555555556,73,3119,0 +968,969,8838,0.062006969698131,2.1266666666667,86,2031,0 +969,970,8908,0.14006961492891,2.1708333333333,68,2246,0 +970,971,9053,0.11198565566104,2.3247222222222,36,3214,0 +971,972,9346,0.0,2.4208333333333,66,4207,0 +972,973,8989,0.058427455554992985,2.5563888888889,74,4195,0 +973,974,8807,0.070887934206661,2.7086111111111,78,3179,0 +974,975,9020,0.031869233863638,2.8027777777778,66,2739,0 +975,976,9034,0.0,2.7711111111111,118,2394,0 +976,977,9558,0.055680379884383,2.74,81,3750,0 +977,978,9042,0.030919398857213,2.6869444444444,85,3000,0 +978,979,8804,0.040222150865381015,2.8113888888889,69,2646,0 +979,980,8885,0.08462727078727299,3.1258333333333,49,2375,0 +980,981,8721,0.15790637433488,3.4711111111111004,56,2442,0 +981,982,8676,0.099165571846447,3.7419444444444,64,2069,0 +982,983,9029,0.051043016646698,3.7258333333333,48,1899,0 +983,984,8670,0.023695834967821,3.5369444444444,65,2277,0 +984,985,8537,0.13363180896924,3.4911111111111004,53,1926,0 +985,986,8418,0.14375985835531,3.3769444444444,70,1949,0 +986,987,8481,0.13890523887057998,3.3327777777778,51,2222,0 +987,988,8535,0.096357518724471,3.1925,30,1797,0 +988,989,8535,0.098277544249084,3.135,97,1860,0 +989,990,8442,0.11251833989481,2.8338888888889,41,2870,0 +990,991,8448,0.074768662666532,2.4997222222222004,32,1899,0 +991,992,8527,0.038008655416852,2.2297222222222004,47,2336,0 +992,993,8541,0.016354174968753,2.1158333333333,34,2703,0 +993,994,8635,0.11898350916153,2.1966666666667,54,2773,0 +994,995,8867,0.0,2.2591666666667,69,2577,0 +995,996,9033,0.0,2.3002777777778,109,2816,0 +996,997,8875,0.0,2.3797222222222003,76,3133,0 +997,998,8708,0.0,2.625,47,3366,0 +998,999,8455,0.020636446066963,2.6661111111111,44,3062,0 +999,1000,8713,0.043044731483849,2.6694444444444,92,3003,0 +1000,1001,8934,0.12513578187909,2.6541666666667,67,3044,0 +1001,1002,8745,0.099581351017555,2.6483333333333,26,3230,0 +1002,1003,8674,0.085903047711976,2.7444444444444,42,2793,0 +1003,1004,8606,0.066698820830796,3.0788888888889,69,1945,0 +1004,1005,8508,0.034228320502586,3.4833333333333,32,2716,0 +1005,1006,8558,0.028479870560763,3.6063888888889,41,2103,0 +1006,1007,8529,0.16430377699282994,3.8069444444444,52,1795,0 +1007,1008,8520,0.020290722486788003,3.6475,56,2840,0 +1008,1009,6662,0.17253761895951006,3.5219444444444,47,2653,0 +1009,1010,6491,0.1150267570489,3.3708333333333,65,2819,0 +1010,1011,6498,0.14119445755296,3.3086111111111003,70,1706,0 +1011,1012,6500,0.079900598296651,3.2411111111111004,84,1801,0 +1012,1013,6471,0.11459361685243,3.0525,71,3271,0 +1013,1014,6354,0.11299850955195,2.7419444444444,110,2001,0 +1014,1015,6592,0.078187238738118,2.4305555555556,65,1678,0 +1015,1016,6552,0.15222680511595002,2.1852777777778,68,1703,0 +1016,1017,6492,0.05823703723779,2.0644444444444,74,2441,0 +1017,1018,6577,0.038270957919533,2.1961111111111,43,2304,0 +1018,1019,6777,0.045436612403901,2.2886111111111,55,3124,0 +1019,1020,6844,0.051111263534218,2.3219444444444,53,3605,0 +1020,1021,6769,0.0,2.4436111111111,64,2985,0 +1021,1022,6642,0.0,2.6463888888889,58,2934,0 +1022,1023,6782,0.057248496594127986,2.735,54,3044,0 +1023,1024,6715,0.0,2.7586111111111005,121,3463,0 +1024,1025,6915,0.084808608043399,2.7138888888889,103,3199,0 +1025,1026,6569,0.05823703723779,2.7119444444444,66,2684,0 
+1026,1027,6486,0.12640598881102005,2.8027777777778,73,3317,0 +1027,1028,6504,0.08602692657241201,2.9777777777778,71,2159,0 +1028,1029,6445,0.13712331887199,3.2961111111111,37,2043,0 +1029,1030,6427,0.12184008568979,3.4869444444444,46,2003,0 +1030,1031,6365,0.050317612906928,3.673611111111101,40,2260,0 +1031,1032,6277,0.07167380324199299,3.7469444444444,26,3522,0 +1032,1033,5231,0.051289858799957,3.6133333333333,42,1840,0 +1033,1034,5166,0.094021005766084,3.4752777777778,63,1820,0 +1034,1035,5303,0.020566298353792,3.3602777777778,68,1856,0 +1035,1036,5306,0.12275234276969,3.1605555555556,87,1715,0 +1036,1037,5298,0.1054190746845,3.0733333333333,60,1695,0 +1037,1038,5268,0.19050318144252,2.7130555555556,94,2254,0 +1038,1039,5251,0.10472332930133,2.2886111111111,121,1652,0 +1039,1040,5194,0.12644994481537,2.0783333333333,128,1602,0 +1040,1041,5230,0.08859454436104999,1.9188888888889,68,1792,0 +1041,1042,5244,0.0,1.9355555555556003,76,1954,0 +1042,1043,5102,0.09532581107230803,2.0569444444444,77,1808,0 +1043,1044,5244,0.15766772749983,2.1902777777778,158,1629,0 +1044,1045,5249,0.06429178708826701,2.3477777777778,112,2140,0 +1045,1046,5261,0.068395341911942,2.5502777777778,85,2390,0 +1046,1047,5339,0.025992957736547997,2.6597222222222,77,1707,0 +1047,1048,5241,0.0,2.7238888888888995,89,1901,0 +1048,1049,5491,0.021142167244918,2.7375,106,1820,0 +1049,1050,5374,0.072067861729848,2.7483333333333,47,2167,0 +1050,1051,5354,0.1275228688396,2.8525,34,2063,0 +1051,1052,5232,0.043846003986674,3.0038888888889,32,2184,0 +1052,1053,5217,0.10247450096434,3.2761111111111005,22,1981,0 +1053,1054,5258,0.07584150637714701,3.5761111111111004,16,1813,0 +1054,1055,5251,0.020496657705832,3.8172222222222,32,2033,0 +1055,1056,5223,0.13399493992192998,3.6691666666667,16,1629,0 +1056,1057,3952,0.091121163023619,3.5558333333333,20,1485,0 +1057,1058,3949,0.11809705541338,3.4266666666667,56,1527,0 +1058,1059,4021,0.033014047837867995,3.435,74,2561,0 +1059,1060,3815,0.16367597832104,3.2111111111111,116,1523,0 +1060,1061,3855,0.12469537397569,3.1297222222222,72,1446,0 +1061,1062,3892,0.095002031789468,2.7538888888889,66,1499,0 +1062,1063,3948,0.1028064299952,2.3116666666667003,56,1368,0 +1063,1064,3860,0.028861851985229007,2.0988888888889,61,1426,0 +1064,1065,3830,0.05806984314166,2.0983333333333,2151,3528,0 +1065,1066,3821,0.050886592113012,2.1986111111111,459,2279,0 +1066,1067,3886,0.05081829754409599,2.3677777777778,84,1421,0 +1067,1068,3954,0.0,2.5036111111111,55,2008,0 +1068,1069,3839,0.08354288831032201,2.5786111111111,61,1429,0 +1069,1070,3921,0.0,2.8172222222222,19,1497,0 +1070,1071,3874,0.08142390858425297,2.8727777777778,30,1604,0 +1071,1072,3996,0.047911560407608,2.8294444444444,73,1595,0 +1072,1073,4246,0.12201534565884,2.7136111111111005,63,2217,0 +1073,1074,3803,0.088739417881303,2.7058333333333,35,1580,0 +1074,1075,3594,0.08276214539547999,2.8161111111111,57,1466,0 +1075,1076,3778,0.066779641097052,3.1541666666667,50,1717,0 +1076,1077,3745,0.11367082443275,3.5791666666667004,48,1564,0 +1077,1078,3747,0.021597223158314,3.8158333333333,40,1752,0 +1078,1079,3726,0.16874893592242002,3.9405555555556,36,1598,0 +1079,1080,3729,0.041971530556774,3.7294444444444,59,1842,0 +1080,1081,8513,0.042983941794881,3.6183333333333,14,3066,0 +1081,1082,8738,0.14500733624043,3.4911111111111004,16,2272,0 +1082,1083,8709,0.046727090031129015,3.4566666666667003,36,4344,0 +1083,1084,8601,0.032553617944112004,3.37,65,3242,0 +1084,1085,8719,0.040039251102491,3.1658333333333,80,2291,0 
+1085,1086,8820,0.055153759101126985,2.7261111111111003,91,2240,0 +1086,1087,8674,0.05751181017711901,2.3533333333333,102,2012,0 +1087,1088,8859,0.041202889821452,2.1158333333333,85,2305,0 +1088,1089,8905,0.07854024449462599,2.0852777777778,69,2295,0 +1089,1090,8920,0.11628975245152,2.1422222222222,79,2370,0 +1090,1091,9062,0.087543035971238,2.3172222222222003,66,3066,0 +1091,1092,9139,0.0,2.3983333333333,47,3132,0 +1092,1093,8866,0.031151045483539,2.55,51,3006,0 +1093,1094,8997,0.0,2.7413888888888995,20,3101,0 +1094,1095,9122,0.029949950026121008,2.7636111111111004,62,3739,0 +1095,1096,9191,0.067297142748812,2.7002777777778,54,3933,0 +1096,1097,9795,0.08450527625030299,2.7247222222222,99,4537,0 +1097,1098,9255,0.049852109269358014,2.5866666666667,64,3856,0 +1098,1099,8924,0.094084438832673,2.8597222222222,66,2862,0 +1099,1100,9012,0.044896125591910994,3.1269444444444,49,2449,0 +1100,1101,9023,0.07328004196455701,3.5019444444444,73,2222,0 +1101,1102,8875,0.13104465124262998,3.778611111111101,47,2159,0 +1102,1103,8800,0.10394116672902,3.8727777777778,48,2486,0 +1103,1104,8785,0.033616505813902,3.704166666666701,35,3148,0 +1104,1105,8474,0.02672150953308,3.5533333333333,27,3207,0 +1105,1106,8412,0.082058799915824,3.4461111111111005,19,2057,0 +1106,1107,8491,0.05732182787355501,3.4341666666667003,37,2029,0 +1107,1108,8391,0.067005870534182,3.3141666666667,45,3127,0 +1108,1109,8216,0.13429243256821,3.0438888888889,45,2597,0 +1109,1110,8292,0.015094533525413,2.6791666666667004,32,2350,0 +1110,1111,8406,0.063949370932991,2.3202777777778,99,2364,0 +1111,1112,8509,0.094378811742462,2.0691666666667,71,2095,0 +1112,1113,8486,0.02139340711812,2.0091666666667,93,2978,0 +1113,1114,8616,0.0,2.1886111111111,78,2743,0 +1114,1115,8642,0.0,2.3088888888889,71,2668,0 +1115,1116,8823,0.0,2.3794444444444,91,3054,0 +1116,1117,8774,0.0,2.5994444444444,31,3733,0 +1117,1118,8810,0.0,2.7119444444444,35,4312,0 +1118,1119,8611,0.0,2.76,25,4112,0 +1119,1120,8798,0.10029435223064,2.6975,45,3541,0 +1120,1121,9179,0.0,2.5466666666667,33,3901,0 +1121,1122,9057,0.10365337249761998,2.6036111111111,34,4371,0 +1122,1123,8633,0.12418226954696003,2.7927777777778,40,4099,0 +1123,1124,8517,0.0,2.9788888888889,17,3039,0 +1124,1125,8427,0.051166116772473,3.4080555555556,17,3197,0 +1125,1126,8615,0.040222150865381015,3.6813888888889,16,2346,0 +1126,1127,8690,0.17057206553854998,3.7983333333333,26,2285,0 +1127,1128,8438,0.12861588337799,3.6338888888889,19,2313,0 +1128,1129,10388,0.0,3.5111111111111004,30,3216,0 +1129,1130,10588,0.0,3.3613888888889,94,3860,0 +1130,1131,10533,0.14569364884757002,3.3072222222222,73,4781,0 +1131,1132,10397,0.18198813530019,3.2447222222222,59,2957,0 +1132,1133,10347,0.038073868368755,3.1152777777778,53,2171,0 +1133,1134,10405,0.11491272575332,2.6994444444444,56,2856,0 +1134,1135,10411,0.064841538076484,2.3497222222222005,70,2714,0 +1135,1136,10503,0.048708312546253,2.0619444444444,60,2602,0 +1136,1137,10598,0.11629780056153,2.0625,83,2331,0 +1137,1138,10692,0.07659916149791901,2.1905555555556004,265,3586,0 +1138,1139,10874,0.0,2.2588888888889,944,3363,0 +1139,1140,11043,0.043763623117499,2.3983333333333,36,3879,0 +1140,1141,11009,0.0,2.5536111111111,42,3556,0 +1141,1142,10818,0.041436571087464,2.7408333333333,23,4381,0 +1142,1143,10985,0.0,2.7375,75,4777,0 +1143,1144,10861,0.08191467409622599,2.7780555555556,68,4879,0 +1144,1145,12282,0.11084389924027,2.6225,23,3553,0 +1145,1146,11225,0.12510294083344,2.6386111111111,35,3177,0 +1146,1147,10775,0.10213470511717,2.7908333333333,38,2727,0 
+1147,1148,10688,0.06332743445339299,3.0922222222222,69,2758,0 +1148,1149,10601,0.033666593475508995,3.4291666666667004,57,4124,0 +1149,1150,10634,0.057459020289436,3.6752777777778,58,3076,0 +1150,1151,10646,0.023008391787587,3.736111111111101,43,2291,0 +1151,1152,10562,0.037622360322278,3.5905555555556,65,2482,0 +1152,1153,10608,0.026766196308354,3.3872222222222,60,2537,0 +1153,1154,10618,0.13691041072327,3.3186111111111005,55,2434,0 +1154,1155,10636,0.024581173073578,3.2775,49,2608,0 +1155,1156,10583,0.050723618686514,3.1625,54,2614,0 +1156,1157,10613,0.038807415292018,3.1391666666667004,66,2904,0 +1157,1158,10603,0.10731539561588,2.7616666666667005,59,2204,0 +1158,1159,10601,0.13649131550296,2.4675,107,2326,0 +1159,1160,10757,0.11190990870167998,2.2166666666667,104,3002,0 +1160,1161,10815,0.17879123074031,2.1205555555556,100,3472,0 +1161,1162,10790,0.08728058888363299,2.2044444444444,133,3496,0 +1162,1163,11082,0.0,2.3147222222222004,65,3168,0 +1163,1164,11121,0.07099894663641,2.2416666666667004,152,4268,0 +1164,1165,10913,0.098617038600063,2.405,83,4350,0 +1165,1166,11004,0.0,2.5705555555556003,158,3555,0 +1166,1167,11135,0.10519721128315,2.7088888888889,145,4986,0 +1167,1168,10960,0.10928571467639,2.6913888888889,77,4576,0 +1168,1169,11686,0.14969099592127,2.6427777777778,13,4451,0 +1169,1170,11244,0.060122448878635,2.705,67,3627,0 +1170,1171,10931,0.068254139999346,2.8738888888889,25,3485,0 +1171,1172,10811,0.056987671819742985,3.0819444444444,27,3046,0 +1172,1173,10679,0.094667935014769,3.4491666666667005,23,2657,0 +1173,1174,10648,0.13287358772218,3.6275,28,2423,0 +1174,1175,10757,0.032507012295146,3.8027777777778,25,2374,0 +1175,1176,10706,0.14779741522058998,3.6436111111111,28,2493,0 +1176,1177,9077,0.10864900088005,3.4861111111111005,30,2495,0 +1177,1178,8836,0.12602969813907,3.3266666666667004,31,2189,0 +1178,1179,8971,0.07253718299881,3.1866666666667003,31,2214,0 +1179,1180,8972,0.31381296416887,3.2213888888888995,44,2374,0 +1180,1181,8903,0.2312064012582,3.0102777777778,27,3230,0 +1181,1182,8967,0.17687421373190998,2.6658333333333,36,2132,0 +1182,1183,8962,0.022073721703464003,2.3902777777778,61,3042,0 +1183,1184,9044,0.11600086139073,2.1380555555556,64,2053,0 +1184,1185,8931,0.10418807549523,2.0161111111111,118,2349,0 +1185,1186,9028,0.040222150865381015,2.0641666666667,98,3381,0 +1186,1187,9240,0.06812462580532,2.1844444444444,76,3436,0 +1187,1188,9227,0.055328485037955,2.2822222222222,57,3280,0 +1188,1189,9227,0.027788383289499,2.4002777777777995,74,4357,0 +1189,1190,9125,0.0,2.5433333333333,72,4522,0 +1190,1191,9075,0.0,2.7469444444444,78,4094,0 +1191,1192,9117,0.035137191893634005,2.6872222222222,69,3296,0 +1192,1193,9562,0.035137191893634005,2.6980555555556,125,4129,0 +1193,1194,9305,0.11258759940039,2.7380555555556,157,3036,0 +1194,1195,8965,0.16105265701128,2.7858333333333,61,2628,0 +1195,1196,8862,0.15210502999287,3.0502777777778,12,2296,0 +1196,1197,8858,0.07673479360192201,3.2991666666667,16,2221,0 +1197,1198,8820,0.17013715283392,3.5533333333333,36,1991,0 +1198,1199,8876,0.1609412187274,3.6652777777778,27,2778,0 +1199,1200,8797,0.12008642730107,3.6116666666667,22,2511,0 +1200,1201,9074,0.045995324803682,3.5463888888889,22,2103,0 +1201,1202,9318,0.23802438276872,3.4013888888889,35,2111,0 +1202,1203,9286,0.18078076076243,3.245,67,2055,0 +1203,1204,9320,0.12741851179236,3.1644444444444,46,1930,0 +1204,1205,9280,0.08024661572906401,2.9361111111111,72,2456,0 +1205,1206,9333,0.32656213417732,2.6952777777778,96,2952,0 
+1206,1207,9334,0.28639695711596,2.3702777777778,117,2147,0 +1207,1208,9337,0.083900984173012,2.0947222222222,113,2051,0 +1208,1209,9405,0.12853338721539,1.9538888888889,140,2281,0 +1209,1210,9263,0.032414228925828,1.9925,107,2102,0 +1210,1211,9326,0.08237281480963901,2.0363888888889,102,2062,0 +1211,1212,9421,0.0,2.1919444444444,85,2796,0 +1212,1213,9275,0.0,2.3211111111111,49,2005,0 +1213,1214,9323,0.0,2.4955555555556,69,2075,0 +1214,1215,9347,0.45868581620054,2.6980555555556,68,2058,1 +1215,1216,9333,0.1959092708736,2.7219444444444,104,2733,0 +1216,1217,9846,0.7871265862012701,2.725,111,2170,1 +1217,1218,9497,0.18267963393082,2.7816666666667,88,2282,0 +1218,1219,9383,0.26777755992147,2.7811111111111004,64,2178,0 +1219,1220,9300,0.30404676514833,2.955,29,2283,0 +1220,1221,9389,0.28226806095289003,3.3158333333333,32,2097,0 +1221,1222,9364,0.32093016819692,3.5669444444444003,29,2738,0 +1222,1223,9227,0.24793583772273,3.7419444444444,21,2678,0 +1223,1224,9309,0.27376916868294,3.6236111111111,33,2404,0 +1224,1225,6204,0.32069151905173,3.4416666666667,37,1497,0 +1225,1226,6048,0.16728853165162,3.4172222222222,57,1496,0 +1226,1227,5949,0.17244047836378998,3.3016666666667,72,1935,0 +1227,1228,5981,0.21356200193615,3.1963888888889,86,1521,0 +1228,1229,5897,0.08833993625230199,3.0641666666667,70,2879,0 +1229,1230,6038,0.20141526375625,2.735,63,1561,0 +1230,1231,6094,0.12271171189386,2.3288888888889,49,1381,0 +1231,1232,6022,0.15111333507662,2.0938888888889,81,1826,0 +1232,1233,6122,0.3688420983862,2.1338888888889,58,1896,0 +1233,1234,6034,0.15672074166098002,2.2247222222222005,70,2083,0 +1234,1235,6079,0.099476236793782,2.3308333333333,67,1792,0 +1235,1236,5998,0.18394691317126,2.3902777777778,70,3258,0 +1236,1237,6004,0.076264605227629,2.5819444444444,95,2265,0 +1237,1238,5908,0.058100747891124,2.6661111111111,100,2775,0 +1238,1239,6022,0.18015967729618,2.8258333333333,116,1545,0 +1239,1240,5981,0.059431847203259,2.7502777777778,123,1818,0 +1240,1241,6399,0.14870829462531002,2.6730555555556004,71,1481,0 +1241,1242,6119,0.09565694822541,2.7536111111111,65,1677,0 +1242,1243,6114,0.16022629962173002,2.9677777777778,73,1858,0 +1243,1244,5915,0.4140256163498,3.37,53,1643,0 +1244,1245,6192,0.32447726333369004,3.5958333333333,79,1582,0 +1245,1246,6021,0.15394421357627,3.8144444444444,77,1611,0 +1246,1247,6060,0.060070368432038,3.8283333333333,59,1803,0 +1247,1248,7510,0.14236976564388,3.7030555555556,66,2121,0 +1248,1249,7560,0.12741851179236,3.5802777777778,54,2375,0 +1249,1250,7525,0.093634078744746,3.4197222222222,54,1866,0 +1250,1251,7483,0.13709947889982,3.4438888888889,89,2398,0 +1251,1252,7452,0.06298116794216299,3.3425,85,2577,0 +1252,1253,7512,0.13125017838571,3.1608333333333,96,1801,0 +1253,1254,7572,0.21161148728916,2.7413888888888995,149,1840,0 +1254,1255,7629,0.06783428261124,2.3808333333333,139,1985,0 +1255,1256,7529,0.20877561051189,2.12,90,2041,0 +1256,1257,7623,0.10394294206935002,2.1533333333333,68,2075,0 +1257,1258,7637,0.0,2.2569444444444,445,2564,0 +1258,1259,7921,0.076424293095548,2.3183333333333,100,2734,0 +1259,1260,7790,0.08809461878011901,2.3583333333333,138,3143,0 +1260,1261,7782,0.034280386319742985,2.5072222222222003,104,3119,0 +1261,1262,7829,0.039360296791109,2.5927777777778,82,3590,0 +1262,1263,7902,0.0,2.6894444444444,208,3893,0 +1263,1264,8039,0.038944065994356014,2.6291666666667,92,3264,0 +1264,1265,8350,0.18176011684739,2.6469444444444,53,3963,0 +1265,1266,8142,0.18521047165852,2.7461111111111003,65,2757,0 +1266,1267,7886,0.13079770999921,2.9363888888889,62,2306,0 
+1267,1268,7743,0.13310058077443,3.2797222222222,73,2549,0 +1268,1269,7707,0.054750658073534006,3.5194444444444,84,2212,0 +1269,1270,7726,0.030588852697706,3.8130555555556,90,2286,0 +1270,1271,7717,0.12998124134227002,3.7941666666667,80,2979,0 +1271,1272,10331,0.09100057249197198,3.6086111111111,90,3158,0 +1272,1273,10515,0.19464543002904006,3.3858333333333,84,2645,0 +1273,1274,10415,0.22178651521516,3.3336111111111,34,3161,0 +1274,1275,10387,0.22983578430825,3.3116666666667003,67,4460,0 +1275,1276,10471,0.298229429356,3.2616666666667005,74,2630,0 +1276,1277,10385,0.12923377484588,3.0044444444444003,44,2593,0 +1277,1278,10439,0.19609416059774,2.6741666666667,64,2625,0 +1278,1279,10516,0.040518533819385014,2.3191666666667,70,4834,0 +1279,1280,10587,0.07099894663641,2.0597222222222,96,4056,0 +1280,1281,10586,0.07584150637714701,2.0547222222222,110,5713,0 +1281,1282,10684,0.08180100127782801,2.1511111111111,68,3940,0 +1282,1283,10880,0.0,2.2602777777778,90,4414,0 +1283,1284,10830,0.0,2.2883333333333,90,5044,0 +1284,1285,10794,0.09140162014739303,2.3736111111111,69,3894,0 +1285,1286,10843,0.0,2.5869444444444,46,3993,0 +1286,1287,10805,0.0,2.6480555555556,74,4404,0 +1287,1288,10996,0.0,2.6077777777777995,68,4072,0 +1288,1289,11327,0.05363316840061,2.6069444444444,67,4182,0 +1289,1290,11090,0.26818151064716,2.6908333333333,51,3351,0 +1290,1291,10578,0.21887772653901,2.9019444444444003,39,4183,0 +1291,1292,10528,0.32371296573811,3.2711111111111,26,4068,0 +1292,1293,10475,0.12565805017257,3.5872222222222,25,8139,0 +1293,1294,10664,0.092277247744574,3.6913888888889,32,11000,0 +1294,1295,10513,0.077016875742983,3.6313888888889,17,2975,0 +1295,1296,9072,0.3714480797312501,3.5605555555556,19,2692,0 +1296,1297,9069,0.19332372237792,3.4402777777778,16,2502,0 +1297,1298,9089,0.06345811641554701,3.35,28,2510,0 +1298,1299,9027,0.2267121559473,3.3469444444444,24,2663,0 +1299,1300,8969,0.053072279964629,3.2708333333333,35,3575,0 +1300,1301,9073,0.13336345197744,3.2519444444444,49,2586,0 +1301,1302,8957,0.1252855094715,2.7311111111111,106,2908,0 +1302,1303,9126,0.096211952864224,2.3875,80,3530,0 +1303,1304,9122,0.096524467517755,2.0847222222222,90,2776,0 +1304,1305,9231,0.08924770147957402,2.0975,169,2962,0 +1305,1306,9368,0.11889606284162,2.1763888888889,98,3441,0 +1306,1307,9458,0.031429841710104,2.2327777777777995,92,4376,0 +1307,1308,9463,0.0,2.2725,91,3857,0 +1308,1309,9356,0.036512411627868,2.3202777777778,99,4685,0 +1309,1310,9340,0.0,2.5425,90,4585,0 +1310,1311,9340,0.0,2.5986111111111,126,3542,0 +1311,1312,9276,0.0,2.6319444444444,102,3370,0 +1312,1313,9611,0.10106696361212,2.5836111111111,132,3515,0 +1313,1314,9532,0.14854949043035,2.675,88,3793,0 +1314,1315,9156,0.08612162048398897,2.8522222222222,135,2954,0 +1315,1316,9222,0.16494200410492002,3.1302777777778,114,2627,0 +1316,1317,9282,0.28637713141253,3.4805555555556,35,2550,0 +1317,1318,9573,0.13206535647488,3.5994444444444,24,2480,0 +1318,1319,9333,0.27364025607799,3.5847222222222,44,2521,0 +1319,1320,9987,0.38382339961227,3.4963888888889,26,2860,0 +1320,1321,10133,0.08426242877623301,3.3825,37,3675,0 +1321,1322,10010,0.3290413568025901,3.2694444444444,45,2704,0 +1322,1323,10028,0.22632868808708,3.2322222222222,42,3121,0 +1323,1324,9984,0.17914189971361,3.1936111111111005,47,2603,0 +1324,1325,10041,0.30046815361859003,3.0536111111111004,34,3984,0 +1325,1326,10072,0.22650915594248,2.7819444444444,56,2537,0 +1326,1327,10025,0.0,2.4152777777778,87,3349,0 +1327,1328,10116,0.1223093269317,2.1569444444444,74,3958,0 
+1328,1329,10232,0.1696074188221,2.1125,90,4243,0 +1329,1330,10516,0.0,2.1833333333333003,79,4159,0 +1330,1331,10449,0.028193633007367,2.205,97,5637,0 +1331,1332,10598,0.0,2.1697222222222,90,8142,0 +1332,1333,10337,0.0,2.3075,77,5713,0 +1333,1334,10469,0.097305232437507,2.4575,101,3668,0 +1334,1335,10426,0.11905908868379,2.6077777777777995,74,4307,0 +1335,1336,10531,0.11660374103282,2.6275,439,4354,0 +1336,1337,10875,0.060474297756584014,2.6144444444444,79,4262,0 +1337,1338,10494,0.22568442027805,2.6477777777777995,165,3446,0 +1338,1339,10195,0.14077736537045002,2.8594444444444003,139,2677,0 +1339,1340,9918,0.1924574892026,3.2675,56,4450,0 +1340,1341,9889,0.18922597300629,3.5136111111111004,102,3044,0 +1341,1342,9947,0.041593949118095004,3.5725,101,3428,0 +1342,1343,9977,0.2502095174271,3.6863888888889,41,2845,0 +1343,1344,10835,0.18663972932643,3.5636111111111,94,2781,0 +1344,1345,10765,0.07351854082400297,3.4127777777778,116,2743,0 +1345,1346,10656,0.081949111399618,3.295,94,4470,0 +1346,1347,10485,0.20148511394009,3.2666666666667004,89,2596,0 +1347,1348,10681,0.11515101921294,3.1933333333333,141,3249,0 +1348,1349,10852,0.07797276382811,3.0688888888889,167,2529,0 +1349,1350,10728,0.07244862879413201,2.8102777777778,148,2452,0 +1350,1351,10874,0.07310929970435699,2.42,105,2934,0 +1351,1352,10964,0.066868365737218,2.1358333333333,210,3159,0 +1352,1353,10984,0.05788512501593701,1.9916666666667,145,3974,0 +1353,1354,11055,0.09727414207464803,2.0947222222222,136,4305,0 +1354,1355,11233,0.033270317741558,2.1591666666667,126,5012,0 +1355,1356,11161,0.0,2.2377777777778,157,4455,0 +1356,1357,10966,0.038270957919533,2.2511111111111,105,4108,0 +1357,1358,11193,0.08728058888363299,2.4208333333333,114,4339,0 +1358,1359,11167,0.10536774813238,2.5241666666667,104,5056,0 +1359,1360,11367,0.1233991317089,2.5794444444444,69,5573,0 +1360,1361,51251,0.042565915766552,2.5936111111111,75,3366,1 +1361,1362,17953,0.23147422367229,2.6830555555556,73,2559,1 +1362,1363,170029,0.08983405162538903,2.8188888888889,74,1999,1 +1363,1364,10955,0.07464756469365201,2.9513888888888995,126,1993,0 +1364,1365,10984,0.099244104918934,3.2830555555556,67,1913,0 +1365,1366,10964,0.11535172009194,3.4819444444444,32,1760,0 +1366,1367,10980,0.21774881707852,3.5886111111111005,38,1890,0 +1367,1368,10852,0.1305066423559,3.4836111111111,34,2469,0 +1368,1369,10786,0.10054853030204,3.3955555555556,36,2133,0 +1369,1370,10841,0.02468393737575,3.2847222222222,26,3359,0 +1370,1371,10762,0.10018007414459,3.2383333333332995,74,3783,0 +1371,1372,10419,0.12522619841308,3.2188888888889,85,1809,0 +1372,1373,10467,0.11781887197077,2.9483333333333,67,2143,0 +1373,1374,10502,0.13417256350298,2.5855555555556,84,2567,0 +1374,1375,10519,0.07474686582090599,2.3005555555556003,1630,2176,0 +1375,1376,10579,0.13570963056519,2.0855555555556,1435,1929,0 +1376,1377,10502,0.076431907457478,1.9027777777778,857,2244,0 +1377,1378,10661,0.0,1.9411111111111,31,1810,0 +1378,1379,10818,0.1936428046839,2.0444444444444,500,2088,0 +1379,1380,10918,0.052826773889684014,2.1363888888889,53,2371,0 +1380,1381,10871,0.0,2.22,61,1843,0 +1381,1382,10796,0.054466597481213,2.3530555555556,158,2668,0 +1382,1383,10774,0.057459020289436,2.545,184,2309,0 +1383,1384,10898,0.28750562005936,2.6202777777778,91,1998,0 +1384,1385,11442,0.075538554674309,2.6847222222222,60,2480,0 +1385,1386,11113,0.08112608570492501,2.6591666666667004,107,2147,0 +1386,1387,10888,0.21563803296368,2.7863888888888995,5157,1802,0 +1387,1388,10894,0.095725002305685,3.0269444444444003,28,1789,0 
+1388,1389,10888,0.17516056892320994,3.3227777777778,24,1999,0 +1389,1390,10896,0.32902836018586,3.6097222222222,21,2142,0 +1390,1391,10800,0.10216065221678,3.6805555555556,12,1904,0 +1391,1392,11000,0.19741931250852,3.6075,24,1876,0 +1392,1393,10985,0.10149107903671,3.4091666666667004,17,2434,0 +1393,1394,11017,0.17479255893624,3.3666666666667004,48,2472,0 +1394,1395,10863,0.034385029573777,3.3158333333333,41,1744,0 +1395,1396,10875,0.21988771218053,3.1622222222222,1088,2404,0 +1396,1397,10987,0.10149107903671,3.1086111111111,68,1971,0 +1397,1398,10778,0.10269981175445,2.6552777777778,2575,1713,0 +1398,1399,10957,0.11258759940039,2.2730555555556,4688,1765,0 +1399,1400,10832,0.13022351806001,2.0591666666667,477,3156,0 diff --git a/datasets/anomaly/yahoo_sub_5/yahoo_sub_5_problem/dataSplits.csv b/datasets/anomaly/yahoo_sub_5/yahoo_sub_5_problem/dataSplits.csv new file mode 100644 index 0000000..c07dc45 --- /dev/null +++ b/datasets/anomaly/yahoo_sub_5/yahoo_sub_5_problem/dataSplits.csv @@ -0,0 +1,1261 @@ +d3mIndex,timestamp,value_0,value_1,value_2,value_3,value_4,ground_truth +0,1,12183,0.0,3.7166666666667,5,2109,0 +1,2,12715,0.091757964510557,3.6108333333333,60,3229,0 +2,3,12736,0.17229675238449998,3.4813888888889,88,3637,0 +3,4,12716,0.22621935431999,3.3802777777778,84,1982,0 +4,5,12739,0.17635798469946,3.1933333333333,111,2751,0 +5,6,12737,0.090491245476051,2.7866666666667004,112,2128,0 +6,7,12857,0.08460994072769001,2.4627777777777995,1235,2109,0 +7,8,12884,0.06842699169496,2.2541666666667,710,2328,0 +8,9,12894,0.13330269689422,2.1180555555556,618,2453,0 +9,10,12675,0.085026586189321,2.0691666666667,84,2847,0 +10,11,13260,0.097073068447328,2.1972222222222,100,3659,0 +11,12,13470,0.0,2.3188888888889,125,5207,0 +12,13,13060,0.031063767542922,2.34,114,5146,0 +13,14,12949,0.017732750501525,2.4902777777778,145,4712,0 +14,15,13035,0.063354504072079,2.6438888888889,91,6363,0 +15,16,12980,0.087870391896335,2.8486111111111003,94,5010,0 +16,17,13677,0.11546815687729,2.8833333333333,79,3956,0 +17,18,13381,0.073413457727404,2.8808333333333,50,4063,0 +18,19,12737,0.040392584616896,2.9005555555556,39,3748,0 +19,20,12554,0.08911335594722301,3.0855555555556,28,3047,0 +20,21,12470,0.098030053711531,3.3536111111111,29,4099,0 +21,22,12490,0.047140641497552,3.7438888888889,24,2122,0 +22,23,12539,0.10481279080241,3.7947222222222,19,3387,0 +23,24,12530,0.20478886838928,3.801111111111101,21,1950,0 +24,25,13002,0.04485100631921201,3.6508333333333,27,2927,0 +25,26,12989,0.1053622140254,3.555,46,1889,0 +26,27,13038,0.08436887679639,3.4769444444444,133,1910,0 +27,28,13011,0.097980673762982,3.2158333333333,143,3747,0 +28,29,12984,0.10165726215275,3.1141666666667,86,4994,0 +29,30,13079,0.056764513454874,2.7983333333333,118,2009,0 +30,31,13048,0.074428708878932,2.4252777777778,56,2899,0 +31,32,13096,0.091244453451818,2.14,92,2298,0 +32,33,13003,0.094529332881679,1.9822222222222,85,1894,0 +33,34,13057,0.016638011234698,1.9694444444444,122,1999,0 +34,35,13023,0.038096861957006005,2.0741666666667,74,3007,0 +35,36,13033,0.064497814457643,2.2505555555556,84,2838,0 +36,37,13034,0.030426401876334,2.2819444444444,54,4113,0 +37,38,13068,0.095423209955973,2.4216666666667,77,2150,0 +38,39,13057,0.069688744272108,2.5997222222222005,84,3007,0 +39,40,13047,0.03468622413034,2.7544444444444003,139,2484,0 +40,41,13795,0.089564461084836,2.7258333333333,65,2101,0 +41,42,13528,0.07337616196456799,2.8302777777778,38,2001,0 +42,43,13032,0.061939295606039,2.9422222222222,35,2102,0 +43,44,13084,0.11419089175512,3.0919444444444,47,2129,0 
+44,45,13000,0.10475925920163,3.3519444444444,37,4422,0 +45,46,13008,0.079657960399444,3.6952777777778,53,4573,0 +46,47,12978,0.14475546275416,3.8269444444444,55,1989,0 +47,48,13067,0.1421711341096,3.7877777777778,45,1953,0 +48,49,13086,0.07696963969656899,3.7536111111111,46,1872,0 +49,50,13023,0.06393273436444799,3.61,35,1850,0 +50,51,13046,0.14973281021845006,3.5091666666667,68,2879,0 +51,52,13032,0.041478839355346,3.4205555555556,82,1840,0 +52,53,13012,0.089317973365284,3.2647222222222,154,2134,0 +53,54,13051,0.088820248166203,2.7944444444444,128,2234,0 +54,55,12979,0.054872994406929,2.46,79,3769,0 +55,56,13025,0.07913553329046401,2.2075,66,2717,0 +56,57,13007,0.16317996709063,2.1758333333333,92,2171,0 +57,58,13036,0.08671926699280201,2.3058333333333,67,2224,0 +58,59,13043,0.0733999511789,2.3983333333333,58,1967,0 +59,60,13023,0.0,2.55,58,2148,0 +60,61,13022,0.032756244361869,2.7302777777778,63,1978,0 +61,62,13033,0.054893891024455,2.8169444444444003,61,2021,0 +62,63,13024,0.068514114108229,2.9247222222222,55,2060,0 +63,64,13048,0.05279414163165401,2.8911111111111003,71,2096,0 +64,65,13740,0.023853017353212,2.9575,64,2082,0 +65,66,13540,0.07426125441559799,2.9080555555556,92,2175,0 +66,67,12724,0.024228588329879,3.0088888888889,44,2332,0 +67,68,13070,0.09233413002519697,3.2033333333333,35,2147,0 +68,69,13106,0.15930655332113,3.6213888888889,53,2163,0 +69,70,13025,0.12755838225296,4.0322222222222,49,2406,0 +70,71,13074,0.10152541717054,4.1227777777778,49,2022,0 +71,72,13079,0.040148453968243986,3.9736111111111,103,2188,0 +72,73,13184,0.087208372094752,3.8425,107,2758,0 +73,74,13194,0.074209918996797,3.7097222222222,74,2925,0 +74,75,13191,0.059044537369404015,3.6258333333333,56,3223,0 +75,76,13059,0.06248169832921499,3.4705555555556,60,2507,0 +76,77,13169,0.08876527685714597,3.2877777777778,73,2435,0 +77,78,13114,0.051354431854972,2.9286111111111004,99,2552,0 +78,79,13037,0.074790104163639,2.4888888888889,84,2540,0 +79,80,13179,0.091817341555971,2.2744444444444,129,2642,0 +80,81,13152,0.14762794333026005,2.1733333333333,101,2254,0 +81,82,13095,0.07101004447510299,2.3416666666667,101,2539,0 +82,83,13144,0.07689756334240598,2.3808333333333,51,2596,0 +83,84,13170,0.08412575787388403,2.4663888888889,95,2573,0 +84,85,13162,0.06328921386603299,2.6608333333333,48,2302,0 +85,86,13117,0.057393902128707,2.7558333333333,40,2991,0 +86,87,13129,0.041819399065704,2.8636111111111004,55,3141,0 +87,88,13386,0.073729686380986,2.7586111111111005,56,3285,0 +88,89,13929,0.15365285617975,2.7377777777778,935,3807,0 +89,90,13385,0.060355859742407016,2.6961111111111005,34,2892,0 +90,91,13106,0.10644586288975,2.8569444444444,57,2538,0 +91,92,13113,0.059314286360126985,3.1833333333333,70,2234,0 +92,93,13155,0.096293806236591,3.5544444444444,72,2707,0 +93,94,13186,0.085101425467407,3.8894444444444,66,2382,0 +94,95,13151,0.11149072274185,4.1138888888889,72,2426,0 +95,96,13156,0.076266981262989,3.9519444444444,49,2451,0 +96,97,12813,0.097952120177625,3.8275,41,2288,0 +97,98,12821,0.17250021935572,3.6438888888889,42,2256,0 +98,99,12867,0.11389182319254,3.5608333333333,39,2884,0 +99,100,12837,0.08999961787521,3.5013888888889,81,2398,0 +100,101,12911,0.048649372449385005,3.3088888888889,90,2239,0 +101,102,12842,0.13861764684085998,2.9063888888889,92,2248,0 +102,103,12905,0.1088795585287,2.5027777777777995,81,2387,0 +103,104,12993,0.054235162564995,2.2466666666667003,145,3876,0 +104,105,12974,0.0390040506742,2.1869444444444,47,3073,0 +105,106,13039,0.0744713077811,2.2402777777778,63,3113,0 
+106,107,13322,0.040258943675435,2.3727777777778,118,3363,0 +107,108,13606,0.0,2.4566666666667003,56,3796,0 +108,109,13536,0.027955712584728,2.5452777777777995,127,4924,0 +109,110,13341,0.047309968420241,2.6830555555556,48,4300,0 +110,111,13360,0.016602764360002,2.805,114,5225,0 +111,112,13450,0.042432577628353986,2.7386111111111004,78,4047,0 +112,113,14102,0.051191743726563,2.7438888888888995,58,4134,0 +113,114,14026,0.0,2.7586111111111005,56,4786,0 +114,115,13162,0.056724832354639,2.9013888888889,67,4184,0 +115,116,13118,0.055771058827737,3.19,155,2888,0 +116,117,12953,0.081014772096658,3.5561111111111003,123,2674,0 +117,118,12854,0.08253629738290899,3.8433333333333,118,2574,0 +118,119,12952,0.11499203730886,4.0319444444444,133,3123,0 +119,120,12915,0.07668513845109799,3.8844444444444,75,3369,0 +120,121,11994,0.070057457403873,3.6908333333333,29,3284,0 +121,122,11868,0.07031477357556501,3.6141666666667,68,2127,0 +122,123,11977,0.091946448716499,3.5019444444444,91,2117,0 +123,124,11874,0.14560588482235998,3.4205555555556,101,2271,0 +124,125,11913,0.094774329323472,3.1780555555556,22,2513,0 +125,126,11933,0.10217989327054,2.8361111111111,20,2746,0 +126,127,11844,0.04854243074027901,2.5222222222222004,27,2076,0 +127,128,11968,0.068760549683423,2.2416666666667004,45,2297,0 +128,129,11996,0.075440683881139,2.1588888888889,42,2312,0 +129,130,12006,0.11771339431815,2.2763888888889,59,2834,0 +130,131,12225,0.069437397660265,2.3391666666667,52,3584,0 +131,132,12482,0.0,2.4841666666667,62,4009,0 +132,133,12289,0.0,2.4911111111111,81,4142,0 +133,134,12219,0.0,2.6922222222222,84,3876,0 +134,135,12282,0.027395404320488,2.8205555555556,104,4098,0 +135,136,12367,0.055202605299814,2.8216666666667,111,3831,0 +136,137,13042,0.078387348178452,2.7122222222222,91,3842,0 +137,138,12665,0.11851571646444,2.6744444444444,33,4129,0 +138,139,12133,0.068395341911942,2.8097222222222,26,3509,0 +139,140,12023,0.04720597158087901,3.1838888888889,37,2450,0 +140,141,11847,0.07910648512645599,3.5130555555556,23,2270,0 +141,142,11980,0.067550601916344,3.7722222222222,29,2360,0 +142,143,12026,0.080666570182724,3.9058333333333,45,2431,0 +143,144,11852,0.044973875852863,3.7697222222222,49,2042,0 +144,145,12152,0.065734580284861,3.6027777777778,27,1833,0 +145,146,12148,0.068759646748575,3.5038888888889,46,1852,0 +146,147,12236,0.027278224398313,3.445,39,1927,0 +147,148,12155,0.067695565422881,3.3494444444444,72,1999,0 +148,149,12113,0.07244669924777,3.1961111111111005,81,2030,0 +149,150,12175,0.028882930937168,2.8905555555556,64,1963,0 +150,151,12103,0.021568136982842,2.5805555555556,79,2116,0 +151,152,12206,0.064254625408408,2.3380555555556004,132,2461,0 +152,153,12239,0.073869151016554,2.2116666666667,127,2388,0 +153,154,12398,0.026644044055307004,2.2013888888889,121,2846,0 +154,155,12582,0.051289858799957,2.3236111111111,98,2974,0 +155,156,12705,0.099217337562612,2.3002777777778,128,3776,0 +156,157,12555,0.016615805334675,2.385,158,3885,0 +157,158,12476,0.078387348178452,2.5597222222222005,78,3865,0 +158,159,12706,0.0,2.6941666666667,65,4319,0 +159,160,12671,0.049384244324413,2.7169444444444,81,4646,0 +160,161,13277,0.043044731483849,2.6369444444444,586,3873,0 +161,162,12757,0.04215504851616,2.6572222222222,48,3489,0 +162,163,12401,0.042236538352835,2.8466666666667004,38,2790,0 +163,164,12248,0.1001564296112,3.1955555555556,30,2641,0 +164,165,12156,0.17378132267942994,3.5633333333333,28,2960,0 +165,166,12210,0.12005519462968,3.8113888888889,36,2192,0 +166,167,11983,0.14491137762023998,3.9655555555556,50,2145,0 
+167,168,12374,0.07336941078506799,3.8483333333333,47,2133,0 +168,169,12230,0.12395626148952,3.6441666666667,82,2330,0 +169,170,12200,0.15077430423660998,3.5213888888889,56,2235,0 +170,171,12135,0.18960071033689,3.4702777777778,140,2258,0 +171,172,12131,0.06051348935254,3.3033333333333,145,2200,0 +172,173,12165,0.072057993662839,3.1933333333333,114,2161,0 +173,174,12193,0.082361078437032,2.8183333333333,129,2159,0 +174,175,12165,0.12343775199876,2.52,143,2088,0 +175,176,12304,0.1071817784483,2.2886111111111,113,2473,0 +176,177,12275,0.10359394556779,2.0822222222222,108,3217,0 +177,178,12369,0.021162435488903,2.1416666666667,93,2994,0 +178,179,12569,0.074524398314698,2.2688888888889,63,3827,0 +179,180,12766,0.12687067454443,2.335,103,4176,0 +180,181,12621,0.041752618326160014,2.4388888888889,114,4227,0 +181,182,12611,0.0,2.5386111111111,67,4290,0 +182,183,12618,0.040819652463459,2.6288888888889,106,4691,0 +183,184,12631,0.082668981599835,2.7511111111111,160,4442,0 +184,185,13121,0.06181362481077901,2.7744444444444,81,5775,0 +185,186,12871,0.0,2.8297222222222,113,3840,0 +186,187,12252,0.076137992226715,2.9708333333333,37,3721,0 +187,188,12155,0.12107639529965,3.1333333333333,70,2498,0 +188,189,12186,0.0,3.3544444444444,82,2265,0 +189,190,12179,0.19840339729984,3.6780555555556,76,2451,0 +190,191,12109,0.20112394005693,3.8038888888889,59,2892,0 +191,192,12142,0.096833471661634,3.8177777777778,58,2166,0 +192,193,12145,0.10338450919956,3.6916666666667,49,2040,0 +193,194,12162,0.10142513773096,3.5197222222222,36,2013,0 +194,195,12165,0.09779274451732,3.5186111111111003,111,2000,0 +195,196,12125,0.14744152252573,3.2597222222222,81,2117,0 +196,197,12097,0.083396348606149,3.0930555555556,92,2775,0 +197,198,12099,0.095637498006913,2.7825,113,2116,0 +198,199,12140,0.14768844039376006,2.4494444444444,90,1991,0 +199,200,12188,0.1131872329372,2.2369444444444,183,3162,0 +200,201,12157,0.073729686380986,2.0961111111111,117,2958,0 +201,202,12128,0.064614077523704,2.0377777777778,110,3153,0 +202,203,12190,0.056019959597275015,2.0730555555556003,179,2190,0 +203,204,12151,0.074812141908008,2.1655555555556,134,2172,0 +204,205,12214,0.02489388427845201,2.285,135,2074,0 +205,206,12275,0.023695834967821,2.4283333333333,100,2078,0 +206,207,12164,0.058680009072634,2.6186111111111,47,2406,0 +207,208,12120,0.10008779345816002,2.7372222222222,88,2018,0 +208,209,12693,0.066566772961868,2.8266666666667004,74,2091,0 +209,210,12624,0.070501147961051,2.8469444444444,58,2310,0 +210,211,12163,0.098779019649936,2.9855555555556,100,2113,0 +211,212,12100,0.11803653713501,3.1038888888889,49,2518,0 +212,213,12162,0.10076746585103,3.4058333333333,36,2605,0 +213,214,12106,0.053210709415363,3.6138888888889,40,2680,0 +214,215,12156,0.099346579713514,3.93,50,2228,0 +215,216,12120,0.047275248011591,3.8155555555556,58,2023,0 +216,217,12420,0.091262209791582,3.6588888888889,50,3702,0 +217,218,12417,0.038593218846488,3.5913888888889,53,1992,0 +218,219,12450,0.070273907645883,3.4644444444444003,93,1988,0 +219,220,12395,0.029431888410363,3.3944444444444,78,1919,0 +220,221,12382,0.096854769984307,3.2227777777778,84,2213,0 +221,222,12438,0.11656453357642,2.7961111111111,112,2181,0 +222,223,12363,0.12109055114779,2.4383333333333,73,2152,0 +223,224,12393,0.20381554615786,2.2647222222222005,91,2393,0 +224,225,12399,0.046311768005022014,2.1886111111111,114,2173,0 +225,226,12456,0.18261306403662,2.2825,127,2109,0 +226,227,12442,0.021992750543024,2.3333333333333,69,3606,0 +227,228,12481,0.088072259040681,2.445,59,2114,0 
+228,229,12432,0.037896500450725,2.5811111111111,64,2135,0 +229,230,12403,0.09882843339863,2.7094444444444,75,2303,0 +230,231,12406,0.076277687882641,2.88,44,2137,0 +231,232,12462,0.022875979046571,2.8555555555556,52,2264,0 +232,233,13034,0.10022162220861,2.7791666666667,42,2245,0 +233,234,12830,0.08117200437078799,2.7772222222222,45,2151,0 +234,235,12439,0.09750667785645803,3.02,26,2330,0 +235,236,12541,0.05680722879784299,3.2213888888888995,29,3357,0 +236,237,12462,0.12240855732315,3.6211111111111,32,3152,0 +237,238,12394,0.1715485140175,4.0219444444444,44,2693,0 +238,239,12507,0.075015592829224,4.0980555555556,41,3798,0 +239,240,12512,0.11388410095531,3.9080555555556,42,4596,0 +240,241,12093,0.10519027968795,3.7269444444444,46,2529,0 +241,242,12197,0.1150532998405,3.6244444444444,40,2124,0 +242,243,12138,0.10890530980571,3.5252777777778,64,2762,0 +243,244,12174,0.099350621485086,3.4675,70,2973,0 +244,245,12163,0.12889794040441002,3.3316666666667003,69,3041,0 +245,246,12096,0.12069378235889,2.9497222222222,73,2179,0 +246,247,12166,0.13053034917739,2.5708333333333,85,2322,0 +247,248,12187,0.078977758004111,2.3086111111111,63,2274,0 +248,249,12246,0.08088416337864099,2.2311111111111,67,2448,0 +249,250,12335,0.04008956024204,2.3119444444444,68,3811,0 +250,251,12556,0.05063725351997099,2.3536111111111,62,3761,0 +251,252,12652,0.039066291775136,2.4819444444444,69,4269,0 +252,253,12646,0.028611752774164,2.6605555555556,82,4244,0 +253,254,12803,0.040593364983329,2.7527777777778,56,4417,0 +254,255,12570,0.038807415292018,3.0741666666667005,38,3758,0 +255,256,12633,0.07832796288132203,2.8522222222222,30,4375,0 +256,257,13146,0.066320996162546,2.7277777777778,48,4158,0 +257,258,12994,0.083175583471284,2.7502777777778,63,3410,0 +258,259,12314,0.06802464587725401,2.8797222222222,34,2853,0 +259,260,12193,0.051675070535006,3.2027777777778,11,2628,0 +260,261,12127,0.044129112207997014,3.5633333333333,22,2287,0 +261,262,12140,0.037685894365982006,3.8808333333333,22,3334,0 +262,263,12174,0.093414561465838,4.0352777777778,12,2795,0 +263,264,12180,0.06987083046098,3.8966666666667,10,2089,0 +264,265,12861,0.021992750543024,3.7225,14,2260,0 +265,266,12957,0.11305566197523,3.73,39,3176,0 +266,267,12981,0.030884138240845,3.5558333333333,55,4049,0 +267,268,12958,0.10381377439313,3.3169444444444003,90,2902,0 +268,269,12913,0.048953768695625004,3.2322222222222,68,3743,0 +269,270,12939,0.042258794089861,2.8658333333333,95,4280,0 +270,271,12933,0.048388685585470985,2.5169444444444,70,3977,0 +271,272,13006,0.034197830567692,2.3,96,4518,0 +272,273,13091,0.08835953066771099,2.1888888888889,45,2707,0 +273,274,13201,0.086890518272785,2.2030555555556,96,3522,0 +274,275,13520,0.031087561676959,2.2711111111111,74,4584,0 +275,276,13675,0.071287463233942,2.4697222222222,82,4141,0 +276,277,13594,0.14372616993938,2.5988888888889,82,4831,0 +277,278,13466,0.12647517487142998,2.7258333333333,45,3991,0 +278,279,13448,0.042854531198562,2.7858333333333,134,4645,0 +279,280,13492,0.039930389849144,2.7922222222222,119,4967,0 +280,281,14123,0.076184645265048,2.6988888888889,86,4578,0 +281,282,13839,0.037830020408535,2.7663888888889,75,4972,0 +282,283,13335,0.030884138240845,2.8938888888889,45,5522,0 +283,284,13196,0.048316550276279,3.1875,50,2832,0 +284,285,13047,0.10986585566763,3.6463888888889,31,2826,0 +285,286,13008,0.025485002897852004,3.866666666666701,88,2855,0 +286,287,12763,0.12451757643335,3.9808333333333,42,2660,0 +287,288,12949,0.12875690949235,3.8277777777778,70,2447,0 
+288,289,13009,0.15720639094135,3.6269444444444,106,2545,0 +289,290,13008,0.079092017261926,3.5266666666667,44,3842,0 +290,291,12890,0.14711499890479998,3.5077777777778,57,2332,0 +291,292,13004,0.0531410973178,3.3455555555556,95,2294,0 +292,293,12918,0.10136246281349,3.1241666666667003,91,3016,0 +293,294,12910,0.053119315802353,2.8713888888889,66,3944,0 +294,295,12915,0.11313351589999003,2.5133333333333,66,2332,0 +295,296,13121,0.076760188212735,2.2197222222222,82,2405,0 +296,297,13076,0.08890522133351199,2.205,73,2572,0 +297,298,13096,0.1009555130175,2.2677777777778,69,2558,0 +298,299,13339,0.15685427502807,2.2991666666667,107,3701,0 +299,300,13635,0.11090638960365,2.4277777777778,101,4228,0 +300,301,13493,0.054798089981891,2.5333333333333,66,3990,0 +301,302,13402,0.08461316628091001,2.6422222222222005,47,4707,0 +302,303,13417,0.15790425505315,2.8211111111111005,47,3857,0 +303,304,13382,0.021675109392134,2.7625,66,3874,0 +304,305,14199,0.14112049645292002,2.7391666666667,102,4369,0 +305,306,13973,0.059612111520904,2.7525,71,4488,0 +306,307,13284,0.067835890522602,2.8644444444444,53,3637,0 +307,308,13070,0.047414460026828,3.1927777777778,28,2705,0 +308,309,12983,0.050348669783997005,3.5872222222222,24,2429,0 +309,310,13075,0.07296715773193299,3.8305555555556,23,2839,0 +310,311,12991,0.10713527159169,3.8827777777778,30,2371,0 +311,312,12993,0.073622496612493,3.7291666666667,25,2758,0 +312,313,13121,0.11556476355437,3.6172222222222,29,2291,0 +313,314,13097,0.034160489683707995,3.4491666666667005,27,2220,0 +314,315,13150,0.019571935182124,3.4097222222222,77,2620,0 +315,316,13078,0.15720996206912,3.2605555555556,46,2467,0 +316,317,13140,0.11515041454164,3.2191666666667,86,2088,0 +317,318,13102,0.086415715789296,2.9586111111111,97,2137,0 +318,319,13110,0.092606306920552,2.6036111111111,88,2907,0 +319,320,13138,0.046458579038692015,2.3319444444444,110,2558,0 +320,321,13238,0.10977831600416,2.2025,89,2823,0 +321,322,13317,0.11090009191451,2.2711111111111,134,2465,0 +322,323,13512,0.076652795374797,2.2897222222222005,84,4399,0 +323,324,13669,0.1087202400467,2.3297222222222005,109,4088,0 +324,325,13651,0.11471628863897,2.395,57,5099,0 +325,326,13580,0.11070024667119,2.5063888888889,49,5157,0 +326,327,13538,0.026827723134058,2.7077777777778,83,3782,0 +327,328,13657,0.029426630692549,2.735,101,4008,0 +328,329,14183,0.028611752774164,2.6958333333333,88,4534,0 +329,330,14117,0.053106181092382014,2.6930555555556,56,3242,0 +330,331,13166,0.055538160906184006,2.875,31,2808,0 +331,332,13265,0.11009690391165,3.1788888888888995,22,3676,0 +332,333,13085,0.10979978093137,3.5808333333333,32,3523,0 +333,334,13167,0.036174223284821,3.8508333333333,27,3038,0 +334,335,13170,0.048361321378982,3.9180555555556,17,2299,0 +335,336,13132,0.10958125953198,3.815,27,2345,0 +336,337,13055,0.047305343559722,3.6080555555556,38,2565,0 +337,338,13025,0.045316868664604014,3.4927777777778,73,2576,0 +338,339,13076,0.13255054531036,3.4316666666667004,56,2327,0 +339,340,13044,0.079695587369141,3.3436111111111004,49,2211,0 +340,341,13035,0.10277355185943,3.0663888888889,90,2642,0 +341,342,13103,0.15061124796385,2.7894444444444,106,3646,0 +342,343,13067,0.14509169704095,2.4994444444444,51,2281,0 +343,344,13183,0.054445250001619004,2.2544444444444,99,2474,0 +344,345,13144,0.082058799915824,2.0847222222222,104,2536,0 +345,346,13166,0.042151311782819015,2.0888888888889,119,2900,0 +346,347,13406,0.057404703309705984,2.1594444444444,73,3144,0 +347,348,13544,0.040891918425583,2.2533333333333,92,3725,0 
+348,349,13608,0.045224636676715,2.3880555555556,57,4305,0 +349,350,13522,0.0,2.6338888888889,100,3665,0 +350,351,13595,0.0,2.6588888888889,93,3791,0 +351,352,13420,0.10335456693443,2.7586111111111005,111,3897,0 +352,353,14163,0.033846222120808,2.8797222222222,91,3494,0 +353,354,13678,0.026167129419328,2.785,43,3353,0 +354,355,13272,0.08571767780871499,2.8219444444444,91,2741,0 +355,356,13071,0.12459953631184,3.0055555555556,63,2463,0 +356,357,13004,0.054750658073534006,3.2936111111111,60,3477,0 +357,358,13068,0.20799106772677,3.5575,56,2792,0 +358,359,13031,0.10314231079956,3.676111111111101,59,2183,0 +359,360,13013,0.12212653292147,3.7166666666667,48,2874,0 +360,361,12998,0.19159058299176,3.6013888888889,65,2147,0 +361,362,12971,0.10782180851978,3.4455555555556,77,2754,0 +362,363,13000,0.06408869538637901,3.4166666666667003,60,2007,0 +363,364,12998,0.095540168894753,3.1791666666667004,94,2564,0 +364,365,12906,0.039360296791109,3.0013888888889,84,3020,0 +365,366,12969,0.086611479249287,2.72,99,2004,0 +366,367,12963,0.05845507441603001,2.4527777777778,61,2047,0 +367,368,12933,0.051490800079599004,2.1816666666667,60,3531,0 +368,369,12990,0.075496432869001,2.0161111111111,78,2383,0 +369,370,12980,0.10358625218721,1.9769444444444,81,2112,0 +370,371,12982,0.062806431427897,2.0597222222222,61,2554,0 +371,372,12989,0.08970338978685001,2.2111111111111,68,2371,0 +372,373,13073,0.094517316130968,2.3141666666667,53,2060,0 +373,374,12950,0.032322011663911,2.4280555555556003,49,2086,0 +374,375,12990,0.047911560407608,2.5855555555556,40,2130,0 +375,376,13035,0.062001214431213,2.6977777777778,125,2072,0 +376,377,13681,0.027102718749392,2.7777777777778,61,2033,0 +377,378,13304,0.034703114844079,2.7988888888889,111,2683,0 +378,379,12965,0.066236017573192,2.8927777777778,32,2046,0 +379,380,12966,0.032230355211769,3.0413888888889,21,2064,0 +380,381,12943,0.11559664215716,3.3569444444444,14,2067,0 +381,382,12958,0.021952502374124,3.4808333333333,32,2496,0 +382,383,13005,0.13347711194703,3.764166666666701,29,4758,0 +383,384,12923,0.10579408349834,3.8097222222222,26,2806,0 +384,385,12812,0.10679035350244,3.6911111111111,52,2227,0 +385,386,12803,0.068633627680319,3.4902777777778,39,3123,0 +386,387,12850,0.04699518011436099,3.3769444444444,78,3460,0 +387,388,12797,0.14159640074335994,3.3011111111111004,78,3587,0 +388,389,12732,0.078500039299167,3.1369444444444,83,2558,0 +389,390,12817,0.049232295047845,2.8475,63,2306,0 +390,391,12818,0.078777592482879,2.4544444444444,108,2083,0 +391,392,12815,0.08993433499951,2.1247222222222,158,3073,0 +392,393,12805,0.081869163858473,2.0266666666667,115,3325,0 +393,394,12703,0.14556064903749,2.1763888888889,112,2321,0 +394,395,12771,0.0,2.3088888888889,73,2846,0 +395,396,12847,0.0,2.4213888888889,93,2482,0 +396,397,12872,0.030693547421212,2.6436111111111,65,2306,0 +397,398,12815,0.0,2.6602777777778,91,2298,0 +398,399,12844,0.046999447831427,2.7677777777778,106,2907,0 +399,400,12811,0.028815579681692,2.8066666666667004,66,2329,0 +400,401,13472,0.0,2.7661111111111003,26,2456,0 +401,402,13063,0.039360296791109,2.8133333333333,23,2178,0 +402,403,12833,0.039570832199428,2.9186111111111,24,2142,0 +403,404,12842,0.090659246308087,3.1930555555556,19,2277,0 +404,405,12804,0.10540579050057003,3.565,23,3066,0 +405,406,12852,0.062601610466313,3.9133333333333,30,3619,0 +406,407,12862,0.051455855638306,3.9658333333333,23,3726,0 +407,408,12799,0.054631758648785014,3.8930555555556,35,2282,0 +408,409,12789,0.09017822949731,3.7297222222222,41,3079,0 
+409,410,12815,0.045287525091609014,3.6516666666667,63,2448,0 +410,411,12887,0.033344698319951,3.5927777777778,33,2574,0 +411,412,12903,0.080098394586215,3.4694444444444,50,3697,0 +412,413,12892,0.025162301034707,3.2536111111111,88,3067,0 +413,414,12907,0.078260793447992,2.8986111111111,115,3491,0 +414,415,12883,0.07223863924679201,2.4488888888889,69,3195,0 +415,416,12965,0.042917873674349,2.2119444444444,116,2763,0 +416,417,12932,0.04720597158087901,2.2011111111111,73,2605,0 +417,418,13134,0.048273008229067,2.2338888888889,75,2755,0 +418,419,13440,0.036987975876273,2.3116666666667003,56,3300,0 +419,420,13544,0.06291463671717,2.3869444444444,66,3838,0 +420,421,13508,0.033319304393751,2.5119444444444,70,3608,0 +421,422,13401,0.029115275623859,2.5713888888889,52,3845,0 +422,423,13410,0.06821638123436,2.5088888888889,32,3563,0 +423,424,13482,0.015408589348188,2.4155555555556,16,5478,0 +424,425,14124,0.01916018435633,3.6455555555556,46,3656,0 +425,426,13703,0.06374239746477901,2.4625,53,3491,0 +426,427,13250,0.099738890728803,2.5808333333333,67,3430,0 +427,428,13092,0.10950621554455,3.0033333333333,58,2807,0 +428,429,13012,0.06138920621589401,3.3486111111111003,17,2524,0 +429,430,12901,0.051307638060244014,3.6644444444444,26,2964,0 +430,431,12848,0.082471571552878,4.0083333333333,13,3969,0 +431,432,13025,0.060122448878635,3.8530555555556,8,3561,0 +432,433,11352,0.07469842969719999,3.6183333333333,20,3394,0 +433,434,8761,0.056170625137636994,3.4922222222222,23,3005,0 +434,435,10433,0.052668952946361,3.4958333333333,34,2350,0 +435,436,10088,0.068871884486763,3.2738888888889,35,2139,0 +436,437,9485,0.040236057110938986,3.2102777777778,48,2098,0 +437,438,8865,0.053200012471363,2.8475,67,2341,0 +438,439,8920,0.056725172482788,2.4883333333332995,38,2698,0 +439,440,8798,0.035229341473877,2.1955555555556003,33,2968,0 +440,441,8927,0.0,2.1461111111111,40,2824,0 +441,442,9211,0.020190723068726,2.1522222222222,37,3003,0 +442,443,9286,0.093342961377898,2.3122222222222004,51,3551,0 +443,444,9725,0.0,2.4033333333333,52,4689,0 +444,445,11050,0.015717168144981003,2.4944444444444,57,3481,0 +445,446,11521,0.017190609993733997,2.6622222222222005,82,3376,0 +446,447,11603,0.0,2.675,74,3198,0 +447,448,11665,0.043273461915965,2.6997222222222,80,3059,0 +448,449,12153,0.029854520963498,2.6997222222222,78,2937,0 +449,450,11672,0.017383620014121998,2.7194444444444,58,2881,0 +450,451,11119,0.046391383573699006,2.8258333333333,41,2777,0 +451,452,11124,0.042155878228,3.1044444444444,34,2510,0 +452,453,10734,0.052684222339579014,3.4736111111111003,35,2356,0 +453,454,11612,0.063573954212613,3.6972222222222,40,2383,0 +454,455,11523,0.077413583128967,3.8038888888889,35,2455,0 +455,456,11632,0.069605078732108,3.7494444444444,37,2285,0 +456,457,12838,0.075937967855042,3.6813888888889,43,2455,0 +457,458,11637,0.047354002438352014,3.4791666666667003,45,4298,0 +458,459,12542,0.044000040388062,3.4530555555556,48,2400,0 +459,460,12394,0.095130971924595,3.2841666666667004,77,3431,0 +460,461,12419,0.069274987547704,3.205,79,2252,0 +461,462,12484,0.061118974117397,2.8436111111111004,59,2628,0 +462,463,12413,0.056393740750134,2.4441666666667,107,3266,0 +463,464,12440,0.06125086589409901,2.275,100,2620,0 +464,465,12614,0.047746883512707,2.1788888888889,84,2824,0 +465,466,12693,0.047136440673386,2.2083333333333,99,2801,0 +466,467,12989,0.0,2.2997222222222,103,3106,0 +467,468,13200,0.0,2.3155555555556004,47,3532,0 +468,469,13108,0.049828520132601,2.41,67,4210,0 +469,470,12886,0.0,2.5902777777778,65,3646,0 
+470,471,13000,0.0,2.6636111111111,65,3768,0 +471,472,13071,0.043576825212604,2.7105555555556,70,5342,0 +472,473,13563,0.035173891965945,2.6811111111111,76,5327,0 +473,474,13333,0.04413510379665099,2.715,40,3363,0 +474,475,12672,0.016955671451488998,2.7083333333333,54,3016,0 +475,476,12547,0.1330396486107,3.0038888888889,45,3257,0 +476,477,12289,0.016462114132943,3.3911111111111003,32,2619,0 +477,478,12584,0.055696363369897,3.6375,26,2573,0 +478,479,12526,0.036411774365825,3.7755555555556,25,2575,0 +479,480,12416,0.047966724418057,3.5786111111111003,34,5355,0 +480,481,12450,0.05609961782665,3.4222222222222,43,5809,0 +481,482,12460,0.096990479781121,3.2538888888889,68,3823,0 +482,483,12425,0.11147038220964,3.1683333333333,60,3116,0 +483,484,12430,0.044797927381498,3.0677777777778,74,2321,0 +484,485,12418,0.024403519177111,2.94,68,2193,0 +485,486,12437,0.08532776818426499,2.7291666666667003,43,2982,0 +486,487,12484,0.043615168647623,2.4147222222222005,73,4140,0 +487,488,12380,0.056692005942856,2.1419444444444,72,2353,0 +488,489,12620,0.033708553131457,2.0244444444444,66,3350,0 +489,490,12674,0.040148453968243986,2.0458333333333,90,3184,0 +490,491,12855,0.099551526697496,2.09,104,3469,0 +491,492,13053,0.0,2.1575,114,4204,0 +492,493,12898,0.036157867549894,2.2655555555556,98,6447,0 +493,494,12809,0.052738784696875,2.2561111111111,70,4898,0 +494,495,12964,0.021636091422947,2.4669444444444,101,3633,0 +495,496,12956,0.037120220639643986,2.5277777777778,77,4189,0 +496,497,13625,0.034467327401996005,2.5266666666667,69,4012,0 +497,498,13285,0.0,2.5438888888889,19,4009,0 +498,499,12715,0.096807019710259,2.6511111111111,47,4346,0 +499,500,12637,0.059601475230884,2.9711111111111004,38,2781,0 +500,501,12535,0.068431521141608,3.2288888888889,22,2811,0 +501,502,12512,0.09611085542804,3.505,20,2415,0 +502,503,12549,0.064177980162036,3.4944444444444,26,3589,0 +503,504,12567,0.11565746993409,3.4633333333333,24,2878,0 +504,505,12362,0.073501732487291,3.3177777777778,27,3471,0 +505,506,12326,0.072746100819649,3.1963888888889,25,2697,0 +506,507,12450,0.07557888002360401,3.1069444444444,57,2583,0 +507,508,12404,0.036816888038697,3.0172222222222,58,3173,0 +508,509,12362,0.093969235453559,2.9247222222222,81,3341,0 +509,510,12431,0.034848294186597004,2.5336111111111,81,2305,0 +510,511,12351,0.084191269180943,2.2480555555556,69,2186,0 +511,512,12528,0.13109036514766,2.0383333333333,50,4439,0 +512,513,12559,0.061132356147447,1.8852777777778,55,3173,0 +513,514,12586,0.019478099970089,1.9225,57,2831,0 +514,515,12864,0.0,1.9719444444444,78,16385,0 +515,516,13026,0.0,2.0608333333333,57,83955,0 +516,517,12880,0.017965204407153,2.16,78,4574,0 +517,518,12743,0.019202263481759,2.3077777777778,95,4987,0 +518,519,12812,0.0,2.415,88,5110,0 +519,520,12878,0.052306327013631,2.4669444444444,108,4893,0 +520,521,13427,0.08536575533023,2.5125,87,3807,0 +521,522,13081,0.052461360256699015,2.6294444444444,87,3447,0 +522,523,12752,0.035302992848671,2.8183333333333,44,4329,0 +523,524,12594,0.028682734942579,3.0547222222222,39,5166,0 +524,525,12507,0.024204462299365,3.33,27,3454,0 +525,526,12494,0.034360100307537,3.5738888888889,23,3578,0 +526,527,12487,0.018977302969238,3.6888888888889,11,2406,0 +527,528,12404,0.034308847257872,3.7111111111111,13,2073,0 +528,529,11147,0.07460088255490599,3.7180555555556,24,1925,0 +529,530,11147,0.055037935083209005,3.6041666666667,77,2357,0 +530,531,11128,0.039311673522385,3.4483333333333,54,1947,0 +531,532,11106,0.046619928266775,3.2413888888888995,45,1912,0 
+532,533,11115,0.048227542028921,3.1355555555556,36,2107,0 +533,534,11044,0.020367863848114,2.8172222222222,59,2985,0 +534,535,11110,0.063069968046591,2.4275,81,2081,0 +535,536,11190,0.054470866056974016,2.2513888888889,50,2631,0 +536,537,11063,0.0,2.0691666666667,53,2130,0 +537,538,11078,0.059261864411046,2.0155555555556,44,2085,0 +538,539,11146,0.064174002348993,2.0952777777778,87,2211,0 +539,540,11010,0.0,2.2397222222222,94,2105,0 +540,541,11139,0.021912411214588,2.3275,128,2585,0 +541,542,11117,0.057958262002105985,2.5255555555556004,82,3695,0 +542,543,11081,0.035358633773416,2.665,49,3198,0 +543,544,11128,0.029191244440103,2.7975,79,3191,0 +544,545,11720,0.054981313823219,2.8597222222222,62,2016,0 +545,546,11384,0.06405347705857799,2.7983333333333,64,2124,0 +546,547,11018,0.0,2.9322222222222,34,2105,0 +547,548,11104,0.055445634363329,3.08,41,2031,0 +548,549,11084,0.040996998867197,3.3466666666667004,47,1964,0 +549,550,11106,0.027670189755404,3.6869444444444,31,2016,0 +550,551,11055,0.054579839310753,3.7966666666667,26,3909,0 +551,552,11098,0.044833640073299014,3.7805555555556,17,2105,0 +552,553,11028,0.03282297151413,3.7422222222222,30,2405,0 +553,554,11152,0.017696014614986,3.639166666666701,17,2141,0 +554,555,11025,0.09418709999244,3.4775,28,1910,0 +555,556,11015,0.061817529149429,3.3283333333333,20,1951,0 +556,557,11125,0.054000161367618,3.1702777777778,85,2310,0 +557,558,11035,0.06165600249599,2.7688888888889,52,2047,0 +558,559,11103,0.055915839259234,2.4266666666667,143,2048,0 +559,560,11100,0.062788330996733,2.1963888888889,106,3083,0 +560,561,11170,0.044888048273534,2.135,244,3619,0 +561,562,11078,0.095259484956337,2.3186111111111,2005,2172,0 +562,563,11150,0.021952502374124,2.3383333333333,124,3142,0 +563,564,11149,0.0,2.5002777777778,109,2256,0 +564,565,10984,0.0,2.6527777777778,148,2200,0 +565,566,11034,0.0,2.7661111111111003,126,2183,0 +566,567,11050,0.061557079663167,2.7347222222222,46,2030,0 +567,568,11102,0.14186075040414,2.6069444444444,49,2297,0 +568,569,11743,0.0,2.5547222222222,40,2213,0 +569,570,11371,0.077457673524504,2.4716666666667004,39,4014,0 +570,571,11078,0.16422977329792998,2.6530555555556004,25,2809,0 +571,572,11224,0.049366067455729,2.9488888888889,37,2355,0 +572,573,11146,0.10064381631633,3.3383333333333,32,2372,0 +573,574,11199,0.11909159312806,3.5419444444444,47,2387,0 +574,575,11181,0.09003816676619801,5.3302777777778,34,2359,0 +575,576,11022,0.055882659245704,3.7727777777778,40,2485,0 +576,577,11073,0.1836893913223,3.6333333333333,46,3728,0 +577,578,11120,0.08574268253550299,3.5430555555556,35,2820,0 +578,579,11008,0.12559700716583,3.6711111111111,61,2426,0 +579,580,11078,0.086129850619071,3.4572222222222,56,2307,0 +580,581,11121,0.041752618326160014,3.2,72,2233,0 +581,582,11041,0.094396473652892,2.7772222222222,110,2178,0 +582,583,11168,0.045323960075285004,2.415,135,2243,0 +583,584,11213,0.13808411333909,2.2530555555556004,133,2713,0 +584,585,11238,0.08029349854683501,2.0994444444444,148,3168,0 +585,586,11273,0.06507307495461,2.1780555555556003,86,3163,0 +586,587,11479,0.084518021856329,2.2638888888889,132,3289,0 +587,588,11839,0.030507395540508,2.3575,73,4001,0 +588,589,11735,0.05892502921299701,2.4680555555556003,95,4684,0 +589,590,11574,0.0,2.6208333333333,74,4137,0 +590,591,11531,0.033075906123641,2.6863888888889,51,4787,0 +591,592,11420,0.16633704704670998,2.6172222222222,65,4278,0 +592,593,12301,0.10228536028167,2.6194444444444,95,3898,0 +593,594,11845,0.16949365549682996,2.6358333333333,72,3728,0 
+594,595,11374,0.08260397756200501,2.8661111111111004,41,4047,0 +595,596,11370,0.024378363844868,3.0533333333333,38,3373,0 +596,597,11197,0.15686874147816002,3.4438888888889,32,2669,0 +597,598,11171,0.063929461148943,3.6552777777778,22,3289,0 +598,599,11197,0.12602019009982998,3.8519444444444,29,2556,0 +599,600,11114,0.035137191893634005,3.8069444444444,32,2557,0 +600,601,12564,0.14965728062748998,3.5961111111111004,40,3003,0 +601,602,12459,0.10046170077382,3.5344444444444,59,2441,0 +602,603,12508,0.13163105487926,3.3972222222222,52,2396,0 +603,604,12464,0.043899611017859004,3.3936111111111003,42,3426,0 +604,605,12438,0.19567092855859,3.1025,46,2379,0 +605,606,12449,0.19135011734275,2.8630555555556,97,3026,0 +606,607,12373,0.11171915024595,2.4255555555556003,72,2336,0 +607,608,12594,0.032053604746412,1.8619444444444,81,2850,0 +608,609,12623,0.096448361580655,1.8930555555556,81,3016,0 +609,610,12759,0.07934996156433399,2.2080555555556,70,3537,0 +610,611,12841,0.024581173073578,2.3052777777778,89,3899,0 +611,612,13063,0.025596039426134,2.3777777777777995,87,5044,0 +612,613,13023,0.027922074309281,2.5161111111111,125,4806,0 +613,614,12884,0.02593545023878,2.6411111111111,69,4139,0 +614,615,13007,0.033086949155743,2.8011111111111004,57,4776,0 +615,616,13016,0.047260069860172005,2.7236111111111003,99,4065,0 +616,617,13588,0.038487130166032016,2.6813888888889,111,4969,0 +617,618,13272,0.16080169828563,2.7336111111111,71,3784,0 +618,619,12589,0.12635270044885,2.8863888888889,71,3297,0 +619,620,12651,0.046904491868436,3.1225,48,3347,0 +620,621,12616,0.059534673085297,3.4613888888889,76,3170,0 +621,622,12492,0.12198352023568,3.8297222222222,56,2241,0 +622,623,12497,0.052131597947042,3.8936111111111,35,2301,0 +623,624,12623,0.094084438832673,3.7588888888889,35,2303,0 +624,625,12481,0.13486764750848,3.5827777777778,29,2587,0 +625,626,12434,0.062226183256115,3.4730555555556,38,3211,0 +626,627,12495,0.091202035463034,3.4175,69,2604,0 +627,628,12375,0.096137859324631,3.3533333333333,77,2841,0 +628,629,12357,0.10449109200785,3.1963888888889,20,2168,0 +629,630,12433,0.097127966420289,2.8852777777778,24,2265,0 +630,631,12432,0.064404980330111,2.4880555555556003,83,2908,0 +631,632,12429,0.10188181868693,2.2325,62,3180,0 +632,633,12551,0.19953464365013,2.1044444444444,54,3118,0 +633,634,12799,0.0747839457206,2.1097222222222,54,3296,0 +634,635,12818,0.0,2.235,60,4432,0 +635,636,13071,0.0,2.3516666666667003,63,4336,0 +636,637,12897,0.0,2.5138888888889,95,4534,0 +637,638,12961,0.041436571087464,2.6105555555556004,69,4261,0 +638,639,12925,0.038671790863765,2.7233333333333,68,5248,0 +639,640,12968,0.035810634316102014,2.6633333333333,58,5014,0 +640,641,13525,0.1409929213297,2.5580555555556,107,3864,0 +641,642,12993,0.0,2.6627777777778,48,5682,0 +642,643,12369,0.052915080344848,2.7625,64,4404,0 +643,644,12195,0.11966022897483,3.0283333333333,52,3705,0 +644,645,12464,0.12973870706052,3.3727777777778,61,2738,0 +645,646,12470,0.023838633821411,3.6369444444444,47,2887,0 +646,647,12475,0.12358680271021,3.7088888888889,58,3776,0 +647,648,12482,0.089095336472172,3.5847222222222,51,3532,0 +648,649,12221,0.019762530636927,3.4836111111111,61,3724,0 +649,650,12325,0.020994992941051,3.4077777777778,53,2786,0 +650,651,12258,0.10380294658324002,3.4441666666667,55,2941,0 +651,652,11980,0.079228021087742,3.1683333333333,52,2351,0 +652,653,11947,0.039012779943635,3.0527777777778,89,2316,0 +653,654,12291,0.10658713601061,2.8527777777778,85,2350,0 +654,655,12293,0.14426278476756,2.5433333333333,106,2916,0 
+655,656,12341,0.08706206992122,2.1997222222222,88,2437,0 +656,657,12390,0.16325946030154,2.1036111111111,59,2761,0 +657,658,12611,0.0,2.2133333333333,48,3941,0 +658,659,12737,0.0,2.2086111111111,66,4025,0 +659,660,12882,0.07729609083366701,2.2883333333333,95,4466,0 +660,661,12891,0.058100747891124,2.3222222222222,82,4401,0 +661,662,12756,0.061191523312340984,2.47,76,4747,0 +662,663,12875,0.08592375974441901,2.685,104,4051,0 +663,664,12847,0.033467197342519,2.6763888888889,54,4448,0 +664,665,13518,0.030265788895452006,2.5838888888889,43,3736,0 +665,666,13217,0.11950310860409,2.6130555555556003,39,3918,0 +666,667,12621,0.09169148327055697,2.7633333333333,48,3408,0 +667,668,12591,0.18439354827551,3.0708333333333,38,2883,0 +668,669,12332,0.10741924067542,3.4347222222222,45,3631,0 +669,670,12404,0.15862461647089002,3.7030555555556,64,2609,0 +670,671,12457,0.14957813136313,3.8138888888889,35,2533,0 +671,672,12370,0.24059408570531,3.8508333333333,66,2469,0 +672,673,11509,0.15511115210127,3.8961111111111,61,2458,0 +673,674,11433,0.19582462633148,3.4763888888889,58,2458,0 +674,675,11317,0.13981560037535998,3.4041666666667,51,2043,0 +675,676,11364,0.1392329990551,3.2352777777778,55,1985,0 +676,677,11350,0.13079770999921,3.1508333333333,126,2032,0 +677,678,11348,0.053672881218709015,2.7863888888888995,61,3409,0 +678,679,11365,0.10971373742228,2.4861111111111,94,2018,0 +679,680,11505,0.13825204927093,2.2444444444444,83,2461,0 +680,681,11468,0.13912778922607,2.1286111111111,136,2318,0 +681,682,11562,0.10215803640865,2.1261111111111,104,2787,0 +682,683,11858,0.096617489053804,2.2405555555556003,77,3186,0 +683,684,11933,0.0,2.2991666666667,109,3490,0 +684,685,11813,0.0,2.3627777777778,146,3407,0 +685,686,11735,0.0,2.5863888888889,69,3193,0 +686,687,11848,0.0,2.7286111111111,121,3412,0 +687,688,11843,0.0,2.8355555555556,53,3563,0 +688,689,12318,0.068897518746959,2.7875,61,3247,0 +689,690,11846,0.05418569809170299,2.7825,82,3012,0 +690,691,11066,0.06507307495461,2.7972222222222,37,2382,0 +691,692,10920,0.10547682048851,3.0355555555556,19,2012,0 +692,693,10836,0.056437861708265,3.2486111111111,19,1915,0 +693,694,10879,0.098703711593837,3.6077777777778,19,1982,0 +694,695,10796,0.14331889652193,3.76,54,1950,0 +695,696,10785,0.05704449488642,3.806666666666701,44,4176,0 +696,697,9469,0.0,3.6638888888889,46,3654,0 +697,698,9278,0.032146952736052,3.5161111111111003,53,3063,0 +698,699,9417,0.068135614649249,3.3286111111111003,83,1916,0 +699,700,9253,0.034514299845882,3.2166666666667,92,1848,0 +700,701,9435,0.028306668795131006,2.9783333333333,94,1704,0 +701,702,9356,0.13119921991025002,2.7211111111111004,111,1680,0 +702,703,9354,0.093609772007723,2.4102777777778,84,2011,0 +703,704,9405,0.11179018663123,2.1366666666667,52,1772,0 +704,705,9326,0.065272680657868,1.9947222222222,68,1838,0 +705,706,9549,0.15901886092526998,1.9936111111111,35,1924,0 +706,707,9499,0.0,2.0788888888889,40,2038,0 +707,708,9371,0.26537507315217,2.1736111111111,47,1991,0 +708,709,9462,0.0,2.4027777777778,85,1729,0 +709,710,9509,0.056610336908172985,2.4580555555556,59,1673,0 +710,711,9469,0.026644044055307004,2.6102777777777995,61,1656,0 +711,712,9522,0.040819652463459,2.7597222222222,45,1774,0 +712,713,9885,0.13497701521251,2.8122222222222,47,1784,0 +713,714,9802,0.16853433621426,2.8427777777778,72,1818,0 +714,715,9461,0.08655557751574,2.87,69,1981,0 +715,716,9393,0.05741127788681901,2.9769444444444,17,2004,0 +716,717,9638,0.037244401880164,3.3241666666667005,47,1788,0 +717,718,9435,0.1132743034971,3.6375,37,1786,0 
+718,719,9519,0.15690958465910998,3.8652777777778,57,1781,0 +719,720,9492,0.09604225449090803,3.8091666666667,62,2024,0 +720,721,9458,0.06746445682560599,3.6844444444444,72,1669,0 +721,722,9420,0.058373145210404015,3.5913888888889,43,1729,0 +722,723,9429,0.048008603166117006,3.5255555555556,57,1682,0 +723,724,9461,0.12614216994504,3.3277777777778,47,1714,0 +724,725,9404,0.077186121310215,3.07,61,1679,0 +725,726,9366,0.042879382350005,2.7622222222222,53,1739,0 +726,727,9488,0.031014262794497007,2.3872222222222,78,1669,0 +727,728,9515,0.13957171072647,2.1308333333333,100,1806,0 +728,729,9487,0.027108383258306,2.1563888888889,104,1650,0 +729,730,9497,0.0,2.2547222222222003,56,1751,0 +730,731,9516,0.0,2.3397222222222003,89,1685,0 +731,732,9504,0.0,2.4808333333333,108,1645,0 +732,733,9422,0.025265991419408,2.6208333333333,67,2133,0 +733,734,9543,0.0,2.8138888888889,83,1618,0 +734,735,9395,0.047219926720593,2.9275,90,1623,0 +735,736,9352,0.083109434319356,2.8663888888888995,82,1697,0 +736,737,9884,0.10860709298782,2.7794444444444,76,1684,0 +737,738,9820,0.098319718095083,2.8194444444444,34,1779,0 +738,739,9439,0.02201293380153,2.9458333333333,43,2982,0 +739,740,9560,0.064929719079082,3.2413888888888995,40,1848,0 +740,741,9589,0.036960535765785,3.7166666666667,40,1772,0 +741,742,9575,0.068536856116777,4.1333333333333,57,1841,0 +742,743,9541,0.012398281267649,4.2697222222222,60,1834,0 +743,744,9490,0.035305311833591015,4.2797222222222,53,1860,0 +744,745,7160,0.024153733176505,4.0,44,1647,0 +745,746,7233,0.031750779212929,3.8877777777778,48,2129,0 +746,747,7166,0.092612685693125,3.6633333333333,50,1763,0 +747,748,7245,0.12674340154738,3.6127777777778,65,1433,0 +748,749,7299,0.068594711667718,3.3175,93,1428,0 +749,750,7169,0.13866540834682,2.8930555555556,105,1521,0 +750,751,7228,0.046813024390007014,2.4722222222222,94,1622,0 +751,752,7123,0.072990045810784,2.2294444444444,53,1580,0 +752,753,7199,0.17156759541908995,2.1286111111111,59,1468,0 +753,754,7167,0.051876699734571985,2.2219444444444,63,1520,0 +754,755,7212,0.031958698733103,2.3366666666667,61,1529,0 +755,756,7206,0.07333373485157901,2.4155555555556,72,1611,0 +756,757,7149,0.0,2.5408333333333,93,1511,0 +757,758,7284,0.023187512335638,2.6511111111111,62,1906,0 +758,759,7265,0.031672522871666,2.8405555555556,50,2632,0 +759,760,7221,0.091103855362214,2.8336111111111,42,1483,0 +760,761,7588,0.0,2.6575,62,1611,0 +761,762,7423,0.0983398607742,2.6622222222222005,21,1676,0 +762,763,7198,0.08011943311413,2.7719444444444,28,1670,0 +763,764,7279,0.043646436319699,3.0344444444444,65,1631,0 +764,765,7174,0.091445521226266,3.3741666666667003,37,1799,0 +765,766,7259,0.067771120773973,3.6925,20,1511,0 +766,767,7166,0.049768578185777006,3.8136111111111,47,1605,0 +767,768,7171,0.067455979006223,3.8202777777778,45,1758,0 +768,769,6883,0.14102875351082,3.7547222222222,49,1509,0 +769,770,6859,0.04521932948417,3.6077777777778,46,1591,0 +770,771,6817,0.032382889221133,3.5330555555556,30,1543,0 +771,772,6877,0.075100266089453,3.3544444444444,30,1573,0 +772,773,6785,0.038989846359505,3.1155555555556,48,1473,0 +773,774,6665,0.093396608626074,2.8463888888888995,36,1476,0 +774,775,6805,0.06797619687558401,2.4411111111111,46,1712,0 +775,776,6863,0.08326287339845401,2.1455555555556,27,1801,0 +776,777,6926,0.015112630017379,2.0025,79,1902,0 +777,778,7004,0.031549757127405,2.1247222222222,65,2005,0 +778,779,6950,0.0,2.2741666666667,57,2363,0 +779,780,7262,0.0,2.3272222222222005,61,2513,0 +780,781,7361,0.017214486216241002,2.4363888888889,89,2664,0 
+781,782,7288,0.015541991667356,2.6155555555556003,80,2714,0 +782,783,7463,0.0,2.7272222222222,79,2754,0 +783,784,7188,0.027199843934104,2.6552777777778,113,2670,0 +784,785,7658,0.053744802378685,2.6086111111111,71,2584,0 +785,786,7575,0.05675511278546901,2.6025,53,2466,0 +786,787,6954,0.070873939193717,2.7372222222222,64,2137,0 +787,788,6862,0.19022950977106,3.0125,43,1931,0 +788,789,6896,0.17589540947937002,3.3477777777778,34,1743,0 +789,790,6954,0.022875979046571,3.6236111111111,29,1713,0 +790,791,6869,0.0,3.7383333333333,30,1649,0 +791,792,6890,0.13681403156951,3.7772222222222,24,1633,0 +792,793,9742,0.058507485759525,3.6966666666667,40,1993,0 +793,794,9730,0.10227075584148,3.7733333333333,32,1940,0 +794,795,9810,0.06726096113022301,3.6408333333333,39,1951,0 +795,796,9688,0.15267199916685995,3.3922222222222,67,1894,0 +796,797,9849,0.069818221889972,3.1627777777778,65,1801,0 +797,798,9765,0.030305771594539,2.6875,49,1962,0 +798,799,9812,0.09211700324247198,2.3533333333333,41,2123,0 +799,800,9931,0.12298177354813,2.0425,50,2434,0 +800,801,9908,0.08705722689013601,1.9738888888889,48,2402,0 +801,802,10066,0.07529920073678098,2.0425,59,3013,0 +802,803,10184,0.06217694957317299,2.1563888888889,51,3086,0 +803,804,10295,0.020886039183631,2.2866666666667004,43,3527,0 +804,805,10113,0.08148200392528,2.3919444444444,72,3716,0 +805,806,10218,0.027014133895137,2.5513888888889,52,3577,0 +806,807,10322,0.08271940630361399,2.6030555555556,68,3430,0 +807,808,10269,0.038537180887872,2.6647222222222005,74,3413,0 +808,809,10781,0.090543853269643,2.5930555555556003,46,3755,0 +809,810,10486,0.02593545023878,2.5513888888889,64,4806,0 +810,811,10124,0.090692829340129,2.76,38,3127,0 +811,812,9993,0.09154630234853098,3.0636111111111,40,3421,0 +812,813,9801,0.09562635368432304,3.4016666666667,50,2475,0 +813,814,9760,0.0,3.7277777777778,42,2440,0 +814,815,9858,0.0,3.7902777777778,37,2731,0 +815,816,9884,0.027267039980187,3.7355555555556,34,2493,0 +816,817,7781,0.024102810048699,3.535,37,1665,0 +817,818,7742,0.072297652068167,3.5819444444444,47,1771,0 +818,819,7682,0.12348623922845,3.3847222222222,67,2293,0 +819,820,7831,0.077453588867077,3.2547222222222,66,1959,0 +820,821,7641,0.05662557916213299,3.125,91,1498,0 +821,822,7641,0.15509029304093,2.7766666666667,132,1537,0 +822,823,7759,0.079595064406905,2.4725,149,1580,0 +823,824,7748,0.053225613553497,2.1927777777778,65,1901,0 +824,825,7776,0.05741127788681901,2.1283333333333,50,1916,0 +825,826,7938,0.077171346852694,2.2319444444444,70,2213,0 +826,827,8031,0.0,2.3061111111111,82,2205,0 +827,828,8117,0.07512642149906099,2.3363888888889,72,2486,0 +828,829,8099,0.0,2.3686111111111,98,2580,0 +829,830,8002,0.0,2.4986111111111,78,2530,0 +830,831,7944,0.026463035590685,2.6433333333333,86,2664,0 +831,832,7963,0.024228588329879,2.7563888888889,76,4368,0 +832,833,8602,0.055182797357095005,2.6652777777778,95,3103,0 +833,834,8269,0.09607690135523,2.6844444444444,63,2249,0 +834,835,7871,0.059431847203259,2.7902777777778,32,2070,0 +835,836,7709,0.018731901987648,3.1119444444444,30,2833,0 +836,837,7726,0.033970515582906,3.5491666666667,27,1734,0 +837,838,7781,0.049963174087431,3.7102777777778,22,2151,0 +838,839,7762,0.073295374096872,3.7961111111111,19,2103,0 +839,840,7692,0.017715537831218996,3.7730555555556,32,1725,0 +840,841,6608,0.014656639469103996,3.5919444444444,45,1895,0 +841,842,6526,0.15513271231042,3.5580555555556,65,1959,0 +842,843,6531,0.06544162031760599,3.4588888888889,73,1637,0 +843,844,6483,0.12276447331552,3.2969444444444003,52,1658,0 
+844,845,6602,0.054046416943085,3.2288888888889,93,1666,0 +845,846,6555,0.06827770027642299,2.7358333333333,68,2410,0 +846,847,6610,0.10171854295932,2.4636111111111,127,1787,0 +847,848,6690,0.093454285728882,2.1894444444444,105,2264,0 +848,849,6651,0.04318436192577,2.1227777777778,75,2007,0 +849,850,6759,0.10050707347524,2.1369444444444,77,2107,0 +850,851,6836,0.019571935182124,2.2230555555556,140,2355,0 +851,852,6894,0.0,2.3188888888889,132,2726,0 +852,853,6844,0.0,2.4166666666667003,100,2875,0 +853,854,6773,0.02713995635286,2.5777777777778,174,2780,0 +854,855,6802,0.092632629280125,2.7869444444444,82,3936,0 +855,856,6947,0.098676638207998,2.8586111111111,128,3116,0 +856,857,7248,0.0,3.0816666666667003,79,3770,0 +857,858,6885,0.11132365864914,2.8713888888889,71,2382,0 +858,859,6643,0.0947301899901,2.9386111111111,60,2152,0 +859,860,6560,0.061070711161473,2.9827777777778,60,1754,0 +860,861,6554,0.18477832073133,3.3197222222222,56,1783,0 +861,862,6600,0.055986690710270993,3.5961111111111004,78,1780,0 +862,863,6525,0.16264480046039995,3.7613888888889,60,1582,0 +863,864,6543,0.026215643469448,3.7305555555556,48,2271,0 +864,865,9018,0.0,3.5580555555556,48,2592,0 +865,866,9225,0.054655616583012,3.5136111111111004,42,2921,0 +866,867,9112,0.07076692500883701,3.3772222222222,64,1814,0 +867,868,9195,0.067217215228375,3.2402777777778,36,3219,0 +868,869,9206,0.046060828388587,3.0586111111111003,40,2567,0 +869,870,9224,0.08329795085471901,2.7908333333333,18,1899,0 +870,871,9408,0.08219020764935,2.3761111111111,35,1801,0 +871,872,9082,0.046792553198475,2.1347222222222,44,2005,0 +872,873,9168,0.06755714954154099,1.9991666666667,105,2572,0 +873,874,9258,0.099050882008287,1.9983333333333,71,3563,0 +874,875,9158,0.0,2.0908333333333,65,2777,0 +875,876,9140,0.10824637351267,2.2311111111111,74,3362,0 +876,877,9206,0.0,2.3219444444444,34,3590,0 +877,878,9186,0.0,2.4727777777778,49,2930,0 +878,879,9155,0.037750185176735,2.5952777777778,44,2481,0 +879,880,9174,0.030345867660395,2.7416666666667004,57,2571,0 +880,881,9758,0.057665227298857,2.7652777777778,102,3546,0 +881,882,9451,0.16774071722374,2.7980555555556,106,4984,0 +882,883,9153,0.10462164884166,2.7597222222222,58,1994,0 +883,884,9233,0.051974117163582,3.0116666666667005,57,3060,0 +884,885,9250,0.070438547008222,3.2916666666667003,62,2151,0 +885,886,9317,0.11437533048244,3.5547222222222,42,2158,0 +886,887,9130,0.028754095353637,3.7580555555556,35,2319,0 +887,888,9249,0.06874265819680701,3.7330555555556,28,1909,0 +888,889,8297,0.041552255552731,3.5886111111111005,27,1627,0 +889,890,8245,0.033571347720577,3.5255555555556,35,2459,0 +890,891,8298,0.014724878652831,3.3858333333333,50,3167,0 +891,892,8247,0.046095580964192,3.2677777777778,69,1839,0 +892,893,8387,0.031859774913781,3.1247222222222,64,3887,0 +893,894,8392,0.094121536253424,2.7213888888888995,69,2031,0 +894,895,8531,0.11471874999036,2.3972222222222004,58,1522,0 +895,896,8437,0.09375530196425097,2.0836111111111,58,1732,0 +896,897,8344,0.10898948864079,2.0644444444444,51,2169,0 +897,898,8274,0.031129909255124,2.2063888888889,46,1679,0 +898,899,8328,0.0,2.3044444444444,84,1941,0 +899,900,8351,0.020155867044519,2.47,144,1638,0 +900,901,8380,0.016795241270985,2.5697222222222003,86,1725,0 +901,902,8332,0.0,2.7625,69,1903,0 +902,903,8366,0.0,2.9436111111111005,81,2074,0 +903,904,8357,0.01748186857624,2.7905555555556,175,1848,0 +904,905,8867,0.015638795432702,2.7527777777778,65,1761,0 +905,906,8659,0.037878946671491,2.6980555555556,48,1838,0 +906,907,8458,0.14870829462531002,2.9102777777778,33,1640,0 
+907,908,8360,0.07322030784057597,3.2663888888889,35,1715,0 +908,909,8330,0.10504553292421,3.5372222222222,37,1717,0 +909,910,8298,0.10771048774666,3.86,31,1758,0 +910,911,8381,0.07484115005697,3.9216666666667,36,1975,0 +911,912,8393,0.10377526695926,3.8766666666667,30,1865,0 +912,913,3998,0.052336696506499,3.6463888888889,28,3575,0 +913,914,3733,0.039930389849144,3.6552777777778,24,1413,0 +914,915,3735,0.052659026600132,3.5880555555556,68,1414,0 +915,916,3709,0.071593754146172,3.3594444444444003,26,1170,0 +916,917,3755,0.072107773186609,3.1888888888889,78,1209,0 +917,918,3782,0.14407221323011,2.7575,90,1170,0 +918,919,3849,0.078873737285415,2.3936111111111,76,1328,0 +919,920,3801,0.090543853269643,2.1925,94,1258,0 +920,921,3787,0.0,2.16,70,1427,0 +921,922,3835,0.18229662394063,2.2719444444444,129,1480,0 +922,923,4035,0.10064381631633,2.3994444444444,120,1687,0 +923,924,4173,0.0,2.2836111111111,122,1942,0 +924,925,3995,0.0,2.5422222222222004,100,1967,0 +925,926,4016,0.0,2.6908333333333,102,2110,0 +926,927,4049,0.064661049677152,2.7702777777778,118,1956,0 +927,928,4014,0.10610212880951,2.7405555555556,86,1984,0 +928,929,4263,0.098345239553664,2.6908333333333,92,1893,0 +929,930,3941,0.055426072308289,2.7008333333333,44,1821,0 +930,931,4023,0.026036719363444,2.8322222222222,25,1641,0 +931,932,3917,0.058176601538018,3.0922222222222,54,1604,0 +932,933,3910,0.11644035456955,3.4363888888889,48,1265,0 +933,934,3934,0.067489738764642,3.7530555555556,56,1407,0 +934,935,3783,0.091155534540558,3.9127777777778,42,1342,0 +935,936,3834,0.052217414705359004,3.7608333333333,41,1216,0 +936,937,8698,0.028401045145692,3.6472222222222,32,2569,0 +937,938,8969,0.06030991242653401,3.5544444444444,48,2150,0 +938,939,8928,0.057683225704233,3.5036111111111,40,2317,0 +939,940,9020,0.049602244305935,3.2538888888889,26,2047,0 +940,941,8865,0.054771618715138,3.1886111111111,55,2065,0 +941,942,8830,0.014455899164978,2.7341666666667,52,1909,0 +942,943,8879,0.05563571922395901,2.3655555555556003,34,1910,0 +943,944,9120,0.077488949885965,2.1688888888889,61,2037,0 +944,945,9111,0.06776025909838901,2.0977777777778,34,3065,0 +945,946,9071,0.033919453583666,2.3077777777778,50,2452,0 +946,947,9205,0.030948232299768,2.3611111111111,47,3226,0 +947,948,9355,0.0,2.4986111111111,56,3271,0 +948,949,9372,0.0,2.5691666666667,76,3471,0 +949,950,9392,0.0,2.7463888888889,60,3922,0 +950,951,9416,0.0,2.8063888888888995,100,3296,0 +951,952,9394,0.0,2.8091666666667003,80,3171,0 +952,953,9810,0.10150033578287,2.715,74,3208,0 +953,954,9594,0.13650296233629,2.6869444444444,24,3602,0 +954,955,9006,0.048341331534980006,2.8180555555556,41,3208,0 +955,956,9140,0.055919636698743,3.0541666666667004,19,3455,0 +956,957,8925,0.052826773889684014,3.4711111111111004,24,2833,0 +957,958,9047,0.07932984590431501,3.7566666666667,18,3453,0 +958,959,9030,0.033310879512461,3.8633333333333,28,3155,0 +959,960,9088,0.048306771033288,3.7519444444444,5,2145,0 +960,961,8569,0.034002578802562,3.6480555555556,12,1999,0 +961,962,8616,0.047801640470854015,3.5061111111111005,35,2135,0 +962,963,8497,0.13378075099383,3.47,41,1813,0 +963,964,8439,0.063853685461221,3.3086111111111003,30,2020,0 +964,965,8567,0.0,3.1194444444444,22,2127,0 +965,966,8694,0.073869151016554,2.8044444444444,56,1764,0 +966,967,8739,0.043582908466928014,2.4205555555556004,34,2249,0 +967,968,8761,0.0,2.1180555555556,73,3119,0 +968,969,8838,0.062006969698131,2.1266666666667,86,2031,0 +969,970,8908,0.14006961492891,2.1708333333333,68,2246,0 +970,971,9053,0.11198565566104,2.3247222222222,36,3214,0 
+971,972,9346,0.0,2.4208333333333,66,4207,0 +972,973,8989,0.058427455554992985,2.5563888888889,74,4195,0 +973,974,8807,0.070887934206661,2.7086111111111,78,3179,0 +974,975,9020,0.031869233863638,2.8027777777778,66,2739,0 +975,976,9034,0.0,2.7711111111111,118,2394,0 +976,977,9558,0.055680379884383,2.74,81,3750,0 +977,978,9042,0.030919398857213,2.6869444444444,85,3000,0 +978,979,8804,0.040222150865381015,2.8113888888889,69,2646,0 +979,980,8885,0.08462727078727299,3.1258333333333,49,2375,0 +980,981,8721,0.15790637433488,3.4711111111111004,56,2442,0 +981,982,8676,0.099165571846447,3.7419444444444,64,2069,0 +982,983,9029,0.051043016646698,3.7258333333333,48,1899,0 +983,984,8670,0.023695834967821,3.5369444444444,65,2277,0 +984,985,8537,0.13363180896924,3.4911111111111004,53,1926,0 +985,986,8418,0.14375985835531,3.3769444444444,70,1949,0 +986,987,8481,0.13890523887057998,3.3327777777778,51,2222,0 +987,988,8535,0.096357518724471,3.1925,30,1797,0 +988,989,8535,0.098277544249084,3.135,97,1860,0 +989,990,8442,0.11251833989481,2.8338888888889,41,2870,0 +990,991,8448,0.074768662666532,2.4997222222222004,32,1899,0 +991,992,8527,0.038008655416852,2.2297222222222004,47,2336,0 +992,993,8541,0.016354174968753,2.1158333333333,34,2703,0 +993,994,8635,0.11898350916153,2.1966666666667,54,2773,0 +994,995,8867,0.0,2.2591666666667,69,2577,0 +995,996,9033,0.0,2.3002777777778,109,2816,0 +996,997,8875,0.0,2.3797222222222003,76,3133,0 +997,998,8708,0.0,2.625,47,3366,0 +998,999,8455,0.020636446066963,2.6661111111111,44,3062,0 +999,1000,8713,0.043044731483849,2.6694444444444,92,3003,0 +1000,1001,8934,0.12513578187909,2.6541666666667,67,3044,0 +1001,1002,8745,0.099581351017555,2.6483333333333,26,3230,0 +1002,1003,8674,0.085903047711976,2.7444444444444,42,2793,0 +1003,1004,8606,0.066698820830796,3.0788888888889,69,1945,0 +1004,1005,8508,0.034228320502586,3.4833333333333,32,2716,0 +1005,1006,8558,0.028479870560763,3.6063888888889,41,2103,0 +1006,1007,8529,0.16430377699282994,3.8069444444444,52,1795,0 +1007,1008,8520,0.020290722486788003,3.6475,56,2840,0 +1008,1009,6662,0.17253761895951006,3.5219444444444,47,2653,0 +1009,1010,6491,0.1150267570489,3.3708333333333,65,2819,0 +1010,1011,6498,0.14119445755296,3.3086111111111003,70,1706,0 +1011,1012,6500,0.079900598296651,3.2411111111111004,84,1801,0 +1012,1013,6471,0.11459361685243,3.0525,71,3271,0 +1013,1014,6354,0.11299850955195,2.7419444444444,110,2001,0 +1014,1015,6592,0.078187238738118,2.4305555555556,65,1678,0 +1015,1016,6552,0.15222680511595002,2.1852777777778,68,1703,0 +1016,1017,6492,0.05823703723779,2.0644444444444,74,2441,0 +1017,1018,6577,0.038270957919533,2.1961111111111,43,2304,0 +1018,1019,6777,0.045436612403901,2.2886111111111,55,3124,0 +1019,1020,6844,0.051111263534218,2.3219444444444,53,3605,0 +1020,1021,6769,0.0,2.4436111111111,64,2985,0 +1021,1022,6642,0.0,2.6463888888889,58,2934,0 +1022,1023,6782,0.057248496594127986,2.735,54,3044,0 +1023,1024,6715,0.0,2.7586111111111005,121,3463,0 +1024,1025,6915,0.084808608043399,2.7138888888889,103,3199,0 +1025,1026,6569,0.05823703723779,2.7119444444444,66,2684,0 +1026,1027,6486,0.12640598881102005,2.8027777777778,73,3317,0 +1027,1028,6504,0.08602692657241201,2.9777777777778,71,2159,0 +1028,1029,6445,0.13712331887199,3.2961111111111,37,2043,0 +1029,1030,6427,0.12184008568979,3.4869444444444,46,2003,0 +1030,1031,6365,0.050317612906928,3.673611111111101,40,2260,0 +1031,1032,6277,0.07167380324199299,3.7469444444444,26,3522,0 +1032,1033,5231,0.051289858799957,3.6133333333333,42,1840,0 
+1033,1034,5166,0.094021005766084,3.4752777777778,63,1820,0 +1034,1035,5303,0.020566298353792,3.3602777777778,68,1856,0 +1035,1036,5306,0.12275234276969,3.1605555555556,87,1715,0 +1036,1037,5298,0.1054190746845,3.0733333333333,60,1695,0 +1037,1038,5268,0.19050318144252,2.7130555555556,94,2254,0 +1038,1039,5251,0.10472332930133,2.2886111111111,121,1652,0 +1039,1040,5194,0.12644994481537,2.0783333333333,128,1602,0 +1040,1041,5230,0.08859454436104999,1.9188888888889,68,1792,0 +1041,1042,5244,0.0,1.9355555555556003,76,1954,0 +1042,1043,5102,0.09532581107230803,2.0569444444444,77,1808,0 +1043,1044,5244,0.15766772749983,2.1902777777778,158,1629,0 +1044,1045,5249,0.06429178708826701,2.3477777777778,112,2140,0 +1045,1046,5261,0.068395341911942,2.5502777777778,85,2390,0 +1046,1047,5339,0.025992957736547997,2.6597222222222,77,1707,0 +1047,1048,5241,0.0,2.7238888888888995,89,1901,0 +1048,1049,5491,0.021142167244918,2.7375,106,1820,0 +1049,1050,5374,0.072067861729848,2.7483333333333,47,2167,0 +1050,1051,5354,0.1275228688396,2.8525,34,2063,0 +1051,1052,5232,0.043846003986674,3.0038888888889,32,2184,0 +1052,1053,5217,0.10247450096434,3.2761111111111005,22,1981,0 +1053,1054,5258,0.07584150637714701,3.5761111111111004,16,1813,0 +1054,1055,5251,0.020496657705832,3.8172222222222,32,2033,0 +1055,1056,5223,0.13399493992192998,3.6691666666667,16,1629,0 +1056,1057,3952,0.091121163023619,3.5558333333333,20,1485,0 +1057,1058,3949,0.11809705541338,3.4266666666667,56,1527,0 +1058,1059,4021,0.033014047837867995,3.435,74,2561,0 +1059,1060,3815,0.16367597832104,3.2111111111111,116,1523,0 +1060,1061,3855,0.12469537397569,3.1297222222222,72,1446,0 +1061,1062,3892,0.095002031789468,2.7538888888889,66,1499,0 +1062,1063,3948,0.1028064299952,2.3116666666667003,56,1368,0 +1063,1064,3860,0.028861851985229007,2.0988888888889,61,1426,0 +1064,1065,3830,0.05806984314166,2.0983333333333,2151,3528,0 +1065,1066,3821,0.050886592113012,2.1986111111111,459,2279,0 +1066,1067,3886,0.05081829754409599,2.3677777777778,84,1421,0 +1067,1068,3954,0.0,2.5036111111111,55,2008,0 +1068,1069,3839,0.08354288831032201,2.5786111111111,61,1429,0 +1069,1070,3921,0.0,2.8172222222222,19,1497,0 +1070,1071,3874,0.08142390858425297,2.8727777777778,30,1604,0 +1071,1072,3996,0.047911560407608,2.8294444444444,73,1595,0 +1072,1073,4246,0.12201534565884,2.7136111111111005,63,2217,0 +1073,1074,3803,0.088739417881303,2.7058333333333,35,1580,0 +1074,1075,3594,0.08276214539547999,2.8161111111111,57,1466,0 +1075,1076,3778,0.066779641097052,3.1541666666667,50,1717,0 +1076,1077,3745,0.11367082443275,3.5791666666667004,48,1564,0 +1077,1078,3747,0.021597223158314,3.8158333333333,40,1752,0 +1078,1079,3726,0.16874893592242002,3.9405555555556,36,1598,0 +1079,1080,3729,0.041971530556774,3.7294444444444,59,1842,0 +1080,1081,8513,0.042983941794881,3.6183333333333,14,3066,0 +1081,1082,8738,0.14500733624043,3.4911111111111004,16,2272,0 +1082,1083,8709,0.046727090031129015,3.4566666666667003,36,4344,0 +1083,1084,8601,0.032553617944112004,3.37,65,3242,0 +1084,1085,8719,0.040039251102491,3.1658333333333,80,2291,0 +1085,1086,8820,0.055153759101126985,2.7261111111111003,91,2240,0 +1086,1087,8674,0.05751181017711901,2.3533333333333,102,2012,0 +1087,1088,8859,0.041202889821452,2.1158333333333,85,2305,0 +1088,1089,8905,0.07854024449462599,2.0852777777778,69,2295,0 +1089,1090,8920,0.11628975245152,2.1422222222222,79,2370,0 +1090,1091,9062,0.087543035971238,2.3172222222222003,66,3066,0 +1091,1092,9139,0.0,2.3983333333333,47,3132,0 +1092,1093,8866,0.031151045483539,2.55,51,3006,0 
+1093,1094,8997,0.0,2.7413888888888995,20,3101,0 +1094,1095,9122,0.029949950026121008,2.7636111111111004,62,3739,0 +1095,1096,9191,0.067297142748812,2.7002777777778,54,3933,0 +1096,1097,9795,0.08450527625030299,2.7247222222222,99,4537,0 +1097,1098,9255,0.049852109269358014,2.5866666666667,64,3856,0 +1098,1099,8924,0.094084438832673,2.8597222222222,66,2862,0 +1099,1100,9012,0.044896125591910994,3.1269444444444,49,2449,0 +1100,1101,9023,0.07328004196455701,3.5019444444444,73,2222,0 +1101,1102,8875,0.13104465124262998,3.778611111111101,47,2159,0 +1102,1103,8800,0.10394116672902,3.8727777777778,48,2486,0 +1103,1104,8785,0.033616505813902,3.704166666666701,35,3148,0 +1104,1105,8474,0.02672150953308,3.5533333333333,27,3207,0 +1105,1106,8412,0.082058799915824,3.4461111111111005,19,2057,0 +1106,1107,8491,0.05732182787355501,3.4341666666667003,37,2029,0 +1107,1108,8391,0.067005870534182,3.3141666666667,45,3127,0 +1108,1109,8216,0.13429243256821,3.0438888888889,45,2597,0 +1109,1110,8292,0.015094533525413,2.6791666666667004,32,2350,0 +1110,1111,8406,0.063949370932991,2.3202777777778,99,2364,0 +1111,1112,8509,0.094378811742462,2.0691666666667,71,2095,0 +1112,1113,8486,0.02139340711812,2.0091666666667,93,2978,0 +1113,1114,8616,0.0,2.1886111111111,78,2743,0 +1114,1115,8642,0.0,2.3088888888889,71,2668,0 +1115,1116,8823,0.0,2.3794444444444,91,3054,0 +1116,1117,8774,0.0,2.5994444444444,31,3733,0 +1117,1118,8810,0.0,2.7119444444444,35,4312,0 +1118,1119,8611,0.0,2.76,25,4112,0 +1119,1120,8798,0.10029435223064,2.6975,45,3541,0 +1120,1121,9179,0.0,2.5466666666667,33,3901,0 +1121,1122,9057,0.10365337249761998,2.6036111111111,34,4371,0 +1122,1123,8633,0.12418226954696003,2.7927777777778,40,4099,0 +1123,1124,8517,0.0,2.9788888888889,17,3039,0 +1124,1125,8427,0.051166116772473,3.4080555555556,17,3197,0 +1125,1126,8615,0.040222150865381015,3.6813888888889,16,2346,0 +1126,1127,8690,0.17057206553854998,3.7983333333333,26,2285,0 +1127,1128,8438,0.12861588337799,3.6338888888889,19,2313,0 +1128,1129,10388,0.0,3.5111111111111004,30,3216,0 +1129,1130,10588,0.0,3.3613888888889,94,3860,0 +1130,1131,10533,0.14569364884757002,3.3072222222222,73,4781,0 +1131,1132,10397,0.18198813530019,3.2447222222222,59,2957,0 +1132,1133,10347,0.038073868368755,3.1152777777778,53,2171,0 +1133,1134,10405,0.11491272575332,2.6994444444444,56,2856,0 +1134,1135,10411,0.064841538076484,2.3497222222222005,70,2714,0 +1135,1136,10503,0.048708312546253,2.0619444444444,60,2602,0 +1136,1137,10598,0.11629780056153,2.0625,83,2331,0 +1137,1138,10692,0.07659916149791901,2.1905555555556004,265,3586,0 +1138,1139,10874,0.0,2.2588888888889,944,3363,0 +1139,1140,11043,0.043763623117499,2.3983333333333,36,3879,0 +1140,1141,11009,0.0,2.5536111111111,42,3556,0 +1141,1142,10818,0.041436571087464,2.7408333333333,23,4381,0 +1142,1143,10985,0.0,2.7375,75,4777,0 +1143,1144,10861,0.08191467409622599,2.7780555555556,68,4879,0 +1144,1145,12282,0.11084389924027,2.6225,23,3553,0 +1145,1146,11225,0.12510294083344,2.6386111111111,35,3177,0 +1146,1147,10775,0.10213470511717,2.7908333333333,38,2727,0 +1147,1148,10688,0.06332743445339299,3.0922222222222,69,2758,0 +1148,1149,10601,0.033666593475508995,3.4291666666667004,57,4124,0 +1149,1150,10634,0.057459020289436,3.6752777777778,58,3076,0 +1150,1151,10646,0.023008391787587,3.736111111111101,43,2291,0 +1151,1152,10562,0.037622360322278,3.5905555555556,65,2482,0 +1152,1153,10608,0.026766196308354,3.3872222222222,60,2537,0 +1153,1154,10618,0.13691041072327,3.3186111111111005,55,2434,0 
+1154,1155,10636,0.024581173073578,3.2775,49,2608,0 +1155,1156,10583,0.050723618686514,3.1625,54,2614,0 +1156,1157,10613,0.038807415292018,3.1391666666667004,66,2904,0 +1157,1158,10603,0.10731539561588,2.7616666666667005,59,2204,0 +1158,1159,10601,0.13649131550296,2.4675,107,2326,0 +1159,1160,10757,0.11190990870167998,2.2166666666667,104,3002,0 +1160,1161,10815,0.17879123074031,2.1205555555556,100,3472,0 +1161,1162,10790,0.08728058888363299,2.2044444444444,133,3496,0 +1162,1163,11082,0.0,2.3147222222222004,65,3168,0 +1163,1164,11121,0.07099894663641,2.2416666666667004,152,4268,0 +1164,1165,10913,0.098617038600063,2.405,83,4350,0 +1165,1166,11004,0.0,2.5705555555556003,158,3555,0 +1166,1167,11135,0.10519721128315,2.7088888888889,145,4986,0 +1167,1168,10960,0.10928571467639,2.6913888888889,77,4576,0 +1168,1169,11686,0.14969099592127,2.6427777777778,13,4451,0 +1169,1170,11244,0.060122448878635,2.705,67,3627,0 +1170,1171,10931,0.068254139999346,2.8738888888889,25,3485,0 +1171,1172,10811,0.056987671819742985,3.0819444444444,27,3046,0 +1172,1173,10679,0.094667935014769,3.4491666666667005,23,2657,0 +1173,1174,10648,0.13287358772218,3.6275,28,2423,0 +1174,1175,10757,0.032507012295146,3.8027777777778,25,2374,0 +1175,1176,10706,0.14779741522058998,3.6436111111111,28,2493,0 +1176,1177,9077,0.10864900088005,3.4861111111111005,30,2495,0 +1177,1178,8836,0.12602969813907,3.3266666666667004,31,2189,0 +1178,1179,8971,0.07253718299881,3.1866666666667003,31,2214,0 +1179,1180,8972,0.31381296416887,3.2213888888888995,44,2374,0 +1180,1181,8903,0.2312064012582,3.0102777777778,27,3230,0 +1181,1182,8967,0.17687421373190998,2.6658333333333,36,2132,0 +1182,1183,8962,0.022073721703464003,2.3902777777778,61,3042,0 +1183,1184,9044,0.11600086139073,2.1380555555556,64,2053,0 +1184,1185,8931,0.10418807549523,2.0161111111111,118,2349,0 +1185,1186,9028,0.040222150865381015,2.0641666666667,98,3381,0 +1186,1187,9240,0.06812462580532,2.1844444444444,76,3436,0 +1187,1188,9227,0.055328485037955,2.2822222222222,57,3280,0 +1188,1189,9227,0.027788383289499,2.4002777777777995,74,4357,0 +1189,1190,9125,0.0,2.5433333333333,72,4522,0 +1190,1191,9075,0.0,2.7469444444444,78,4094,0 +1191,1192,9117,0.035137191893634005,2.6872222222222,69,3296,0 +1192,1193,9562,0.035137191893634005,2.6980555555556,125,4129,0 +1193,1194,9305,0.11258759940039,2.7380555555556,157,3036,0 +1194,1195,8965,0.16105265701128,2.7858333333333,61,2628,0 +1195,1196,8862,0.15210502999287,3.0502777777778,12,2296,0 +1196,1197,8858,0.07673479360192201,3.2991666666667,16,2221,0 +1197,1198,8820,0.17013715283392,3.5533333333333,36,1991,0 +1198,1199,8876,0.1609412187274,3.6652777777778,27,2778,0 +1199,1200,8797,0.12008642730107,3.6116666666667,22,2511,0 +1200,1201,9074,0.045995324803682,3.5463888888889,22,2103,0 +1201,1202,9318,0.23802438276872,3.4013888888889,35,2111,0 +1202,1203,9286,0.18078076076243,3.245,67,2055,0 +1203,1204,9320,0.12741851179236,3.1644444444444,46,1930,0 +1204,1205,9280,0.08024661572906401,2.9361111111111,72,2456,0 +1205,1206,9333,0.32656213417732,2.6952777777778,96,2952,0 +1206,1207,9334,0.28639695711596,2.3702777777778,117,2147,0 +1207,1208,9337,0.083900984173012,2.0947222222222,113,2051,0 +1208,1209,9405,0.12853338721539,1.9538888888889,140,2281,0 +1209,1210,9263,0.032414228925828,1.9925,107,2102,0 +1210,1211,9326,0.08237281480963901,2.0363888888889,102,2062,0 +1211,1212,9421,0.0,2.1919444444444,85,2796,0 +1212,1213,9275,0.0,2.3211111111111,49,2005,0 +1213,1214,9323,0.0,2.4955555555556,69,2075,0 +1214,1215,9347,0.45868581620054,2.6980555555556,68,2058,1 
+1215,1216,9333,0.1959092708736,2.7219444444444,104,2733,0 +1216,1217,9846,0.7871265862012701,2.725,111,2170,1 +1217,1218,9497,0.18267963393082,2.7816666666667,88,2282,0 +1218,1219,9383,0.26777755992147,2.7811111111111004,64,2178,0 +1219,1220,9300,0.30404676514833,2.955,29,2283,0 +1220,1221,9389,0.28226806095289003,3.3158333333333,32,2097,0 +1221,1222,9364,0.32093016819692,3.5669444444444003,29,2738,0 +1222,1223,9227,0.24793583772273,3.7419444444444,21,2678,0 +1223,1224,9309,0.27376916868294,3.6236111111111,33,2404,0 +1224,1225,6204,0.32069151905173,3.4416666666667,37,1497,0 +1225,1226,6048,0.16728853165162,3.4172222222222,57,1496,0 +1226,1227,5949,0.17244047836378998,3.3016666666667,72,1935,0 +1227,1228,5981,0.21356200193615,3.1963888888889,86,1521,0 +1228,1229,5897,0.08833993625230199,3.0641666666667,70,2879,0 +1229,1230,6038,0.20141526375625,2.735,63,1561,0 +1230,1231,6094,0.12271171189386,2.3288888888889,49,1381,0 +1231,1232,6022,0.15111333507662,2.0938888888889,81,1826,0 +1232,1233,6122,0.3688420983862,2.1338888888889,58,1896,0 +1233,1234,6034,0.15672074166098002,2.2247222222222005,70,2083,0 +1234,1235,6079,0.099476236793782,2.3308333333333,67,1792,0 +1235,1236,5998,0.18394691317126,2.3902777777778,70,3258,0 +1236,1237,6004,0.076264605227629,2.5819444444444,95,2265,0 +1237,1238,5908,0.058100747891124,2.6661111111111,100,2775,0 +1238,1239,6022,0.18015967729618,2.8258333333333,116,1545,0 +1239,1240,5981,0.059431847203259,2.7502777777778,123,1818,0 +1240,1241,6399,0.14870829462531002,2.6730555555556004,71,1481,0 +1241,1242,6119,0.09565694822541,2.7536111111111,65,1677,0 +1242,1243,6114,0.16022629962173002,2.9677777777778,73,1858,0 +1243,1244,5915,0.4140256163498,3.37,53,1643,0 +1244,1245,6192,0.32447726333369004,3.5958333333333,79,1582,0 +1245,1246,6021,0.15394421357627,3.8144444444444,77,1611,0 +1246,1247,6060,0.060070368432038,3.8283333333333,59,1803,0 +1247,1248,7510,0.14236976564388,3.7030555555556,66,2121,0 +1248,1249,7560,0.12741851179236,3.5802777777778,54,2375,0 +1249,1250,7525,0.093634078744746,3.4197222222222,54,1866,0 +1250,1251,7483,0.13709947889982,3.4438888888889,89,2398,0 +1251,1252,7452,0.06298116794216299,3.3425,85,2577,0 +1252,1253,7512,0.13125017838571,3.1608333333333,96,1801,0 +1253,1254,7572,0.21161148728916,2.7413888888888995,149,1840,0 +1254,1255,7629,0.06783428261124,2.3808333333333,139,1985,0 +1255,1256,7529,0.20877561051189,2.12,90,2041,0 +1256,1257,7623,0.10394294206935002,2.1533333333333,68,2075,0 +1257,1258,7637,0.0,2.2569444444444,445,2564,0 +1258,1259,7921,0.076424293095548,2.3183333333333,100,2734,0 +1259,1260,7790,0.08809461878011901,2.3583333333333,138,3143,0 diff --git a/datasets/anomaly/yahoo_sub_5/yahoo_sub_5_problem/problemDoc.json b/datasets/anomaly/yahoo_sub_5/yahoo_sub_5_problem/problemDoc.json new file mode 100644 index 0000000..417cb6b --- /dev/null +++ b/datasets/anomaly/yahoo_sub_5/yahoo_sub_5_problem/problemDoc.json @@ -0,0 +1,65 @@ +{ + "about": { + "problemID": "yahoo_sub_5_problem", + "problemName": "yahoo_sub_5_problem", + "problemDescription": "Anomaly detection", + "problemVersion": "4.0.0", + "problemSchemaVersion": "4.0.0", + "taskKeywords": [ + "classification", + "binary", + "tabular" + ] + }, + "inputs": { + "data": [ + { + "datasetID": "yahoo_sub_5_dataset", + "targets": [ + { + "targetIndex": 0, + "resID": "learningData", + "colIndex": 7, + "colName": "ground_truth" + } + ] + } + ], + "dataSplits": { + "method": "holdOut", + "testSize": 0.2, + "stratified": true, + "numRepeats": 0, + "randomSeed": 42, + "splitsFile": 
"dataSplits.csv", + "datasetViewMaps": { + "train": [ + { + "from": "yahoo_sub_5_dataset", + "to": "yahoo_sub_5_dataset_TRAIN" + } + ], + "test": [ + { + "from": "yahoo_sub_5_dataset", + "to": "yahoo_sub_5_dataset_TEST" + } + ], + "score": [ + { + "from": "yahoo_sub_5_dataset", + "to": "yahoo_sub_5_dataset_SCORE" + } + ] + } + }, + "performanceMetrics": [ + { + "metric": "f1Macro" + } + ] + }, + "expectedOutputs": { + "predictionsFile": "predictions.csv" + } +} \ No newline at end of file diff --git a/datasets/anomaly_reserve/kpi/SCORE/dataset_TEST/datasetDoc.json b/datasets/anomaly_reserve/kpi/SCORE/dataset_TEST/datasetDoc.json new file mode 100644 index 0000000..2a04d60 --- /dev/null +++ b/datasets/anomaly_reserve/kpi/SCORE/dataset_TEST/datasetDoc.json @@ -0,0 +1,63 @@ +{ + "about": { + "datasetID": "kpi_dataset_TEST", + "datasetName": "NULL", + "description": "Database of baseball players and play statistics, including 'Games_played', 'At_bats', 'Runs', 'Hits', 'Doubles', 'Triples', 'Home_runs', 'RBIs', 'Walks', 'Strikeouts', 'Batting_average', 'On_base_pct', 'Slugging_pct' and 'Fielding_ave'", + "citation": " @book{simonoff2003analyzing,title={Analyzing Categorical Data},author={Simonoff, J.S.},isbn={9780387007496},lccn={2003044946},series={Springer Texts in Statistics},url={https://books.google.com/books?id=G8wrifweAoC},year={2003},publisher={Springer New York}} ", + "license": " CC Public Domain Mark 1.0 ", + "source": "OpenML", + "sourceURI": "http://www.openml.org/d/185", + "approximateSize": "", + "datasetSchemaVersion": "4.0.0", + "redacted": false, + "datasetVersion": "4.0.0" + }, + "dataResources": [ + { + "resID": "learningData", + "resPath": "tables/learningData.csv", + "resType": "table", + "resFormat": { + "text/csv": [ + "csv" + ] + }, + "isCollection": false, + "columns": [ + { + "colIndex": 0, + "colName": "d3mIndex", + "colType": "integer", + "role": [ + "index" + ] + }, + { + "colIndex": 1, + "colName": "timestamp", + "colType": "integer", + "role": [ + "attribute" + ] + }, + { + "colIndex": 2, + "colName": "value", + "colType": "real", + "role": [ + "attribute" + ] + }, + { + "colIndex": 3, + "colName": "ground_truth", + "colType": "integer", + "role": [ + "suggestedTarget" + ] + } + ], + "columnsCount": 4 + } + ] +} \ No newline at end of file diff --git a/datasets/anomaly_reserve/kpi/SCORE/dataset_TEST/tables/learningData.csv b/datasets/anomaly_reserve/kpi/SCORE/dataset_TEST/tables/learningData.csv new file mode 100644 index 0000000..b9e432d --- /dev/null +++ b/datasets/anomaly_reserve/kpi/SCORE/dataset_TEST/tables/learningData.csv @@ -0,0 +1,1758 @@ +d3mIndex,timestamp,value,ground_truth +7027,1475026500,0.32264705162415364,0 +7028,1475026800,0.32183430507799304,0 +7029,1475027100,0.31787914535951506,0 +7030,1475027400,0.3296732765365322,0 +7031,1475027700,0.33072178162272026,0 +7032,1475028000,0.3282773378117453,0 +7033,1475028300,0.3412378533449643,0 +7034,1475028600,0.3444485124115538,0 +7035,1475028900,0.34747304631385745,0 +7036,1475029200,0.34477423144747743,0 +7037,1475029500,0.34249419819706234,0 +7038,1475029800,0.3547319276800169,0 +7039,1475030100,0.3569188983482892,0 +7040,1475030400,0.3528241447571223,0 +7041,1475030700,0.3536617079911538,0 +7042,1475031000,0.3595928965267984,0 +7043,1475031300,0.3414456931108743,0 +7044,1475031600,0.3444702270131781,0 +7045,1475031900,0.3567327731850544,0 +7046,1475032200,0.344169324666176,0 +7047,1475032500,0.34747304631385745,0 +7048,1475032800,0.3413309159271072,0 +7049,1475033100,0.3411665053665474,0 
+7050,1475033400,0.3484253867327069,0 +7051,1475033700,0.3466571976814503,0 +7052,1475034000,0.3524518944306527,0 +7053,1475034300,0.3450999504823503,0 +7054,1475034600,0.34230807303382743,0 +7055,1475034900,0.32953368266384336,0 +7056,1475035200,0.3585940248174029,0 +7057,1475035500,0.3494738918188949,0 +7058,1475035800,0.3478918279308732,0 +7059,1475036100,0.3570584922209781,0 +7060,1475036400,0.3642925568982159,0 +7061,1475036700,0.3735522837694128,0 +7062,1475037000,0.371529723662751,0 +7063,1475037300,0.36375899809743295,0 +7064,1475037600,0.3717623801165319,0 +7065,1475037900,0.3745759721677299,0 +7066,1475038200,0.3771134785597968,0 +7067,1475038500,0.38916508287951657,0 +7068,1475038800,0.3930954259090729,0 +7069,1475039100,0.3960051826276095,0 +7070,1475039400,0.3930023633279808,0 +7071,1475039700,0.37371669432997257,0 +7072,1475040000,0.3825328228962828,0 +7073,1475040300,0.35663971060291155,0 +7074,1475040600,0.3567793044756004,0 +7075,1475040900,0.3473334524411687,0 +7076,1475041200,0.35570908478673724,0 +7077,1475041500,0.3453077902482603,0 +7078,1475041800,0.3417031662535769,0 +7079,1475042100,0.34623841606443456,0 +7080,1475042400,0.3279050874852757,0 +7081,1475042700,0.3283486857912131,0 +7082,1475043000,0.3181583331038419,0 +7083,1475043300,0.3134586727324244,0 +7084,1475043600,0.3296050306433111,0 +7085,1475043900,0.3221817387155411,0 +7086,1475044200,0.3210184564455859,0 +7087,1475044500,0.31680892567065194,0 +7088,1475044800,0.3279764354647434,0 +7089,1475045100,0.3037088162647441,0 +7090,1475045400,0.30889860623264503,0 +7091,1475045700,0.3124349843341073,0 +7092,1475046000,0.3008021616321388,0 +7093,1475046300,0.3049651611168421,0 +7094,1475046600,0.3101084197941969,0 +7095,1475046900,0.302173283668004,0 +7096,1475047200,0.3071304171824393,0 +7097,1475047500,0.3046177274782433,0 +7098,1475047800,0.3044998482082296,0 +7099,1475048100,0.296124215862766,0 +7100,1475048400,0.3014287830150645,0 +7101,1475048700,0.2965678141684933,0 +7102,1475049000,0.2985686596733205,0 +7103,1475049300,0.2996854106527297,0 +7104,1475049600,0.2962420951327797,0 +7105,1475049900,0.2976845651479024,0 +7106,1475050200,0.29400859317396144,0 +7107,1475050500,0.3005912197805077,0 +7108,1475050800,0.29345021768425683,0 +7109,1475051100,0.28656358664446197,0 +7110,1475051400,0.2895881205470809,0 +7111,1475051700,0.2858873318847437,0 +7112,1475052000,0.28960983514944083,0 +7113,1475052300,0.28656358664446197,0 +7114,1475052600,0.2957302509339364,0 +7115,1475052900,0.2834677047625855,0 +7116,1475053200,0.29375112003146897,0 +7117,1475053500,0.2945204373729098,0 +7118,1475053800,0.2802353310943717,0 +7119,1475054100,0.2943808435004311,0 +7120,1475054400,0.2876555209354749,0 +7121,1475054700,0.28837830698605443,0 +7122,1475055000,0.2714161004429708,0 +7123,1475055300,0.27325563747295945,0 +7124,1475055600,0.2763267026664397,0 +7125,1475055900,0.2695083175197609,0 +7126,1475056200,0.2664620690147821,0 +7127,1475056500,0.2637849687502365,0 +7128,1475056800,0.2611575018625886,0 +7129,1475057100,0.2521304314454878,0 +7130,1475057400,0.2520373688638704,0 +7131,1475057700,0.2764197652480572,0 +7132,1475058000,0.2607387202452576,0 +7133,1475058300,0.24971080432343465,0 +7134,1475058600,0.2539916830779415,0 +7135,1475058900,0.2424519229571701,0 +7136,1475059200,0.2554093364045628,0 +7137,1475059500,0.2629008742249235,0 +7138,1475059800,0.2538055579147066,0 +7139,1475060100,0.2649482510205069,0 +7140,1475060400,0.2632979412397895,0 +7141,1475060700,0.264926536418147,0 +7142,1475061000,0.2610644392809712,0 
+7143,1475061300,0.25969021515906965,0 +7144,1475061600,0.27055682260605396,0 +7145,1475061900,0.2705102913151928,0 +7146,1475062200,0.27753651622746633,0 +7147,1475062500,0.2853754876857953,0 +7148,1475062800,0.2817243324002506,0 +7149,1475063100,0.2922404041232303,0 +7150,1475063400,0.29065834023573395,0 +7151,1475063700,0.2966360600617144,0 +7152,1475064000,0.2832350483085944,0 +7153,1475064300,0.293518463577478,0 +7154,1475064600,0.2871436767365265,0 +7155,1475064900,0.29491440230173943,0 +7156,1475065200,0.2984042491124456,0 +7157,1475065500,0.3362590052292199,0 +7158,1475065800,0.31566735800232104,0 +7159,1475066100,0.2938907139039477,0 +7160,1475066400,0.3159465457476987,0 +7161,1475066700,0.30140706841270465,0 +7162,1475067000,0.2997536565458457,0 +7163,1475067300,0.2992200977446425,0 +7164,1475067600,0.2795125450437922,0 +7165,1475067900,0.2827945520889036,0 +7166,1475068200,0.2817243324002506,0 +7167,1475068500,0.2869358369709317,0 +7168,1475068800,0.29658952877085315,0 +7169,1475069100,0.2933106238117782,0 +7170,1475069400,0.2914959034702908,0 +7171,1475069700,0.2925195918680826,0 +7172,1475070000,0.30205850648402666,0 +7173,1475070300,0.2888901511848977,0 +7174,1475070600,0.2945886832660259,0 +7175,1475070900,0.30221981495876016,0 +7176,1475071200,0.2890297450573764,0 +7177,1475071500,0.2816995157118543,0 +7178,1475071800,0.2820252347475679,0 +7179,1475072100,0.2830023918544983,0 +7180,1475072400,0.2746050449064648,0 +7181,1475072700,0.2782344855895445,0 +7182,1475073000,0.281165956910546,0 +7183,1475073300,0.2609248454084925,0 +7184,1475073600,0.2648086571480282,0 +7185,1475073900,0.2579220261082334,0 +7186,1475074200,0.2676470658874123,0 +7187,1475074500,0.24947814786944364,0 +7188,1475074800,0.2447784874977109,0 +7189,1475075100,0.23386534875986525,0 +7190,1475075400,0.2357514170806101,0 +7191,1475075700,0.2283281251534706,0 +7192,1475076000,0.2276084411890325,0 +7193,1475076300,0.21660223986956945,0 +7194,1475076600,0.21753286568574376,0 +7195,1475076900,0.20918205002846632,0 +7196,1475077200,0.20020151090228985,0 +7197,1475077500,0.20303991964166346,0 +7198,1475077800,0.19931741637691366,0 +7199,1475078100,0.18635690084348466,0 +7200,1475078400,0.18859040280232395,0 +7201,1475078700,0.17681798622754055,0 +7202,1475079000,0.18230867854307395,0 +7203,1475079300,0.16653457095865698,0 +7204,1475079600,0.1629764782548348,0 +7205,1475079900,0.15660169141388328,0 +7206,1475080200,0.1467835890530869,0 +7207,1475080500,0.14766768357850502,0 +7208,1475080800,0.13910592606959646,0 +7209,1475081100,0.13633576322343344,0 +7210,1475081400,0.13194010728494932,0 +7211,1475081700,0.1272869782039728,0 +7212,1475082000,0.12249425525062264,0 +7213,1475082300,0.11211777740012167,0 +7214,1475082600,0.10723199186515407,0 +7215,1475082900,0.10087891962666752,0 +7216,1475083200,0.0976465459584538,0 +7217,1475083500,0.09383098011203414,0 +7218,1475083800,0.08482562429739833,0 +7219,1475084100,0.08080221868569913,0 +7220,1475084400,0.08082393328732347,0 +7221,1475084700,0.07554418282352629,0 +7222,1475085000,0.07328586417683709,0 +7223,1475085300,0.07188992545205025,0 +7224,1475085600,0.06672495217307105,0 +7225,1475085900,0.06828530145736683,0 +7226,1475086200,0.062422358815994186,0 +7227,1475086500,0.0631203281783876,0 +7228,1475086800,0.0577226984435259,0 +7229,1475087100,0.056280228429243837,0 +7230,1475087400,0.054626816561754436,0 +7231,1475087700,0.05669901004625956,0 +7232,1475088000,0.052579439766171,0 +7233,1475088300,0.057396979408653015,0 +7234,1475088600,0.05271903363885985,0 
+7235,1475088900,0.051276563623527,0 +7236,1475089200,0.05116178643975985,0 +7237,1475089500,0.0513013803124487,0 +7238,1475089800,0.05243984589453297,0 +7239,1475090100,0.052067595568063264,0 +7240,1475090400,0.05172016192946443,0 +7241,1475090700,0.047439283175062685,0 +7242,1475091000,0.047740185522064715,0 +7243,1475091300,0.046462126068342366,0 +7244,1475091600,0.04441474927275894,0 +7245,1475091900,0.0436237173292735,0 +7246,1475092200,0.04120409020616953,0 +7247,1475092500,0.04348412345658465,0 +7248,1475092800,0.04278615409419124,0 +7249,1475093100,0.04299399386010122,0 +7250,1475093400,0.04108621093615582,0 +7251,1475093700,0.04246043505826752,0 +7252,1475094000,0.0424139037677215,0 +7253,1475094300,0.04117927351829865,0 +7254,1475094600,0.04285439998741237,0 +7255,1475094900,0.04255349764041035,0 +7256,1475095200,0.041504992554222346,0 +7257,1475095500,0.041132742227752636,0 +7258,1475095800,0.04339106087444181,0 +7259,1475096100,0.040971433752388674,0 +7260,1475096400,0.03929630728327494,0 +7261,1475096700,0.04031999568159207,0 +7262,1475097000,0.04280786869686635,0 +7263,1475097300,0.04115755891562354,0 +7264,1475097600,0.04255349764041035,0 +7265,1475097900,0.04208818473179781,0 +7266,1475098200,0.0413188673909875,0 +7267,1475098500,0.0413188673909875,0 +7268,1475098800,0.04557803154271409,0 +7269,1475099100,0.04632253219565356,0 +7270,1475099400,0.04841644028178297,0 +7271,1475099700,0.04683437639481212,0 +7272,1475100000,0.05144097418513755,0 +7273,1475100300,0.05232506870971503,0 +7274,1475100600,0.05797706949998192,0 +7275,1475100900,0.056767255939480725,0 +7276,1475101200,0.06551513861151907,0 +7277,1475101500,0.06414401657565393,0 +7278,1475101800,0.0640974852851079,0 +7279,1475102100,0.0660052682080025,0 +7280,1475102400,0.07026133027453338,0 +7281,1475102700,0.07149596052395624,0 +7282,1475103000,0.07317108699306994,0 +7283,1475103300,0.07514711580918569,0 +7284,1475103600,0.07859043132850517,0 +7285,1475103900,0.08021902650707287,0 +7286,1475104200,0.08454643655307144,0 +7287,1475104500,0.0854305310776489,0 +7288,1475104800,0.08652556745543376,0 +7289,1475105100,0.08682646980243576,0 +7290,1475105400,0.08719872012890545,0 +7291,1475105700,0.0928072917138221,0 +7292,1475106000,0.0912252278268512,0 +7293,1475106300,0.09501597698466444,0 +7294,1475106600,0.10369561376369177,0 +7295,1475106900,0.10176301415208568,0 +7296,1475107200,0.10364908247283053,0 +7297,1475107500,0.1077438360641025,0 +7298,1475107800,0.11169899578289576,0 +7299,1475108100,0.1208408433838688,0 +7300,1475108400,0.11490965484874965,0 +7301,1475108700,0.11870040400666795,0 +7302,1475109000,0.11879346658828535,0 +7303,1475109300,0.1214922814551908,0 +7304,1475109600,0.12835719789262576,0 +7305,1475109900,0.12186453178166053,0 +7306,1475110200,0.12688991118910678,0 +7307,1475110500,0.13098466478037873,0 +7308,1475110800,0.14366599256895554,0 +7309,1475111100,0.13545166869801534,0 +7310,1475111400,0.1395464222892873,0 +7311,1475111700,0.14645787001747845,0 +7312,1475112000,0.14405995749778516,0 +7313,1475112300,0.15041302973627171,0 +7314,1475112600,0.15622944108741355,0 +7315,1475112900,0.15308702791478088,0 +7316,1475113200,0.15367022009288173,0 +7317,1475113500,0.1577401569957574,0 +7318,1475113800,0.16897901476921154,0 +7319,1475114100,0.16688510668276682,0 +7320,1475114400,0.1693977963864374,0 +7321,1475114700,0.168001857662176,0 +7322,1475115000,0.16997788647860698,0 +7323,1475115300,0.17419051933989266,0 +7324,1475115600,0.16960563615213722,0 +7325,1475115900,0.17532898492166168,0 
+7326,1475116200,0.1670712318460017,0 +7327,1475116500,0.17877230044150655,0 +7328,1475116800,0.1746558322479798,0 +7329,1475117100,0.1758191145181451,0 +7330,1475117400,0.17467754685033965,0 +7331,1475117700,0.1788188317323678,0 +7332,1475118000,0.1797494575485421,0 +7333,1475118300,0.18596293591465504,0 +7334,1475118600,0.1844739346086711,0 +7335,1475118900,0.17339948739609193,0 +7336,1475119200,0.17705064268163664,0 +7337,1475119500,0.17128076262125094,0 +7338,1475119800,0.16681375870350929,0 +7339,1475120100,0.1842164614662838,0 +7340,1475120400,0.1843095240479012,0 +7341,1475120700,0.18377596524659293,0 +7342,1475121000,0.19908475992285968,0 +7343,1475121300,0.1939415012453997,0 +7344,1475121600,0.2005737612287596,0 +7345,1475121900,0.21095023907930247,0 +7346,1475122200,0.22572237286818256,0 +7347,1475122500,0.2368216367692631,0 +7348,1475122800,0.2511067430476961,0 +7349,1475123100,0.2615762834798145,0 +7350,1475123400,0.28062929602320136,0 +7351,1475123700,0.277744355993061,0 +7352,1475124000,0.2713478545498547,0 +7353,1475124300,0.2875872750422537,0 +7354,1475124600,0.2715339797130896,0 +7355,1475124900,0.27039241204517905,0 +7356,1475125200,0.2593179648325999,0 +7357,1475125500,0.2555489302770415,0 +7358,1475125800,0.2493602685993249,0 +7359,1475126100,0.2434290800642057,0 +7360,1475126400,0.2381245129119071,0 +7361,1475126700,0.22241865122071133,0 +7362,1475127000,0.20887804559521767,0 +7363,1475127300,0.20415667062108311,0 +7364,1475127600,0.19252384791874685,0 +7365,1475127900,0.1901507520874604,0 +7366,1475128200,0.17954161778294736,0 +7367,1475128500,0.17546857879403527,0 +7368,1475128800,0.17046801607509038,0 +7369,1475129100,0.16383575609175155,0 +7370,1475129400,0.16162707082132954,0 +7371,1475129700,0.1565551601231271,0 +7372,1475130000,0.16313778672956827,0 +7373,1475130300,0.16139441436723342,0 +7374,1475130600,0.16497732375955698,0 +7375,1475130900,0.16320913470882584,0 +7376,1475131200,0.16688510668276682,0 +7377,1475131500,0.16078950758677268,0 +7378,1475131800,0.16348832245367814,0 +7379,1475132100,0.15636903495989227,0 +7380,1475132400,0.16825622871863194,0 +7381,1475132700,0.1679770409737797,0 +7382,1475133000,0.1589034392659227,0 +7383,1475133300,0.16420800641822128,0 +7384,1475133600,0.1661406060298274,0 +7385,1475133900,0.15827681788299702,0 +7386,1475134200,0.1581372240106234,0 +7387,1475134500,0.16104387864322867,0 +7388,1475134800,0.16095081606161127,0 +7389,1475135100,0.16290513027557724,0 +7390,1475135400,0.1705610786567078,0 +7391,1475135700,0.16592966417819624,0 +7392,1475136000,0.16188144187778555,0 +7393,1475136300,0.1679553263714198,0 +7394,1475136600,0.16597619546895245,0 +7395,1475136900,0.16923338582566752,0 +7396,1475137200,0.16869982702435926,0 +7397,1475137500,0.16816316613701454,0 +7398,1475137800,0.16895419808081524,0 +7399,1475138100,0.17228273641668282,0 +7400,1475138400,0.17402610877901767,0 +7401,1475138700,0.1787040545483905,0 +7402,1475139000,0.18149593199691336,0 +7403,1475139300,0.1832641210476445,0 +7404,1475139600,0.1798673368185558,0 +7405,1475139900,0.18086620852795124,0 +7406,1475140200,0.1858698733330376,0 +7407,1475140500,0.2018052893921881,0 +7408,1475140800,0.19847985314234112,0 +7409,1475141100,0.1994104789585312,0 +7410,1475141400,0.1929178128475975,0 +7411,1475141700,0.20757516945254226,0 +7412,1475142000,0.2017835747898072,0 +7413,1475142300,0.2094612377733186,0 +7414,1475142600,0.21634786881311344,0 +7415,1475142900,0.2105997033551927,0 +7416,1475143200,0.2147658049256172,0 +7417,1475143500,0.2158112079257688,0 
+7418,1475143800,0.2109254223908012,0 +7419,1475144100,0.2267925925568356,0 +7420,1475144400,0.21751115108338387,0 +7421,1475144700,0.2202564972411505,0 +7422,1475145000,0.2234671563070044,0 +7423,1475145300,0.21932587142497625,0 +7424,1475145600,0.22132671692969846,0 +7425,1475145900,0.2261876857762697,0 +7426,1475146200,0.23635632386117605,0 +7427,1475146500,0.2259798460106749,0 +7428,1475146800,0.2263738109395045,0 +7429,1475147100,0.2305864438007903,0 +7430,1475147400,0.2418004848858481,0 +7431,1475147700,0.23889072816720644,0 +7432,1475148000,0.2488732410888778,0 +7433,1475148300,0.2500830546499044,0 +7434,1475148600,0.25289664670089224,0 +7435,1475148900,0.2560390598735249,0 +7436,1475149200,0.2524561504812013,0 +7437,1475149500,0.2655996890918289,0 +7438,1475149800,0.2696944426829958,0 +7439,1475150100,0.2600159341946781,0 +7440,1475150400,0.2715805110038457,0 +7441,1475150700,0.2727437932741161,0 +7442,1475151000,0.27148744842222833,0 +7443,1475151300,0.2720458239119329,0 +7444,1475151600,0.2779552978446922,0 +7445,1475151900,0.2750455411261556,0 +7446,1475152200,0.27969867020702704,0 +7447,1475152500,0.27292991843735104,0 +7448,1475152800,0.2846309870328559,0 +7449,1475153100,0.2743723884523686,0 +7450,1475153400,0.2822113599108027,0 +7451,1475153700,0.29072658612885,0 +7452,1475154000,0.2876803376238712,0 +7453,1475154300,0.28414395952240884,0 +7454,1475154600,0.2844913931603772,0 +7455,1475154900,0.28647052406284457,0 +7456,1475155200,0.28076888989568005,0 +7457,1475155500,0.29070487152649016,0 +7458,1475155800,0.2940768390671825,0 +7459,1475156100,0.2925661231588387,0 +7460,1475156400,0.28363211532346044,0 +7461,1475156700,0.28833177569519314,0 +7462,1475157000,0.2873080872974014,0 +7463,1475157300,0.29170374323588555,0 +7464,1475157600,0.2816312698186332,0 +7465,1475157900,0.28635264479283085,0 +7466,1475158200,0.2927057170313174,0 +7467,1475158500,0.2786067359161193,0 +7468,1475158800,0.28986730829193325,0 +7469,1475159100,0.2874942124606363,0 +7470,1475159400,0.2751634203961693,0 +7471,1475159700,0.2706250684992752,0 +7472,1475160000,0.2682519726679782,0 +7473,1475160300,0.26411068778595004,0 +7474,1475160600,0.2754891394318829,0 +7475,1475160900,0.26818372677475705,0 +7476,1475161200,0.2478030214002248,0 +7477,1475161500,0.2479426152727035,0 +7478,1475161800,0.23232981616312515,0 +7479,1475162100,0.23754132073380624,0 +7480,1475162400,0.2335861610150129,0 +7481,1475162700,0.2379849190395335,0 +7482,1475163000,0.21734674052250888,0 +7483,1475163300,0.22041780571588404,0 +7484,1475163600,0.2128580220024073,0 +7485,1475163900,0.205645671926983,0 +7486,1475164200,0.19403456382701714,0 +7487,1475164500,0.18023958714502555,0 +7488,1475164800,0.18449564921113606,0 +7489,1475165100,0.17760901817134125,0 +7490,1475165400,0.1739578628857965,0 +7491,1475165700,0.15727484408767026,0 +7492,1475166000,0.15083181135360266,0 +7493,1475166300,0.15085352595596255,0 +7494,1475166600,0.1451549938748344,0 +7495,1475166900,0.1325201973770138,0 +7496,1475167200,0.12510000753601574,0 +7497,1475167500,0.1167709064810982,0 +7498,1475167800,0.11490965484874965,0 +7499,1475168100,0.11058224480348663,0 +7500,1475168400,0.10295111311075236,0 +7501,1475168700,0.0993682037184288,0 +7502,1475169000,0.0974138895043577,0 +7503,1475169300,0.08927091361235978,0 +7504,1475169600,0.08459296784361743,0 +7505,1475169900,0.08147537135917088,0 +7506,1475170200,0.07542630355351257,0 +7507,1475170500,0.07442743184411715,0 +7508,1475170800,0.07314627030414825,0 +7509,1475171100,0.06926245856513794,0 
+7510,1475171400,0.06732985895332161,0 +7511,1475171700,0.061631326872508725,0 +7512,1475172000,0.06195704590738161,0 +7513,1475172300,0.0646775753774876,0 +7514,1475172600,0.06293420301515275,0 +7515,1475172900,0.05776922973512274,0 +7516,1475173200,0.05662766206679187,0 +7517,1475173500,0.05367447614395582,0 +7518,1475173800,0.05248637718507899,0 +7519,1475174100,0.05309128396532956,0 +7520,1475174400,0.05386060130719069,0 +7521,1475174700,0.050743004822744096,0 +7522,1475175000,0.05230025202184412,0 +7523,1475175300,0.0541863203420636,0 +7524,1475175600,0.048649096736614654,0 +7525,1475175900,0.05009156675089673,0 +7526,1475176200,0.04841644028178297,0 +7527,1475176500,0.04608987574187265,0 +7528,1475176800,0.046716497124798376,0 +7529,1475177100,0.04401768225736754,0 +7530,1475177400,0.04359890064035179,0 +7531,1475177700,0.04294746256850437,0 +7532,1475178000,0.04264656022150239,0 +7533,1475178300,0.04101796504293469,0 +7534,1475178600,0.0405061208448269,0 +7535,1475178900,0.04243561837039664,0 +7536,1475179200,0.04190205956856294,0 +7537,1475179500,0.041458461262625534,0 +7538,1475179800,0.041551523844768366,0 +7539,1475180100,0.039901214063525516,0 +7540,1475180400,0.04190205956856294,0 +7541,1475180700,0.040692246008061775,0 +7542,1475181000,0.0389240569568052,0 +7543,1475181300,0.04139021536940436,0 +7544,1475181600,0.041715934405328094,0 +7545,1475181900,0.038179556303865776,0 +7546,1475182200,0.04010905382943551,0 +7547,1475182500,0.04080702319182894,0 +7548,1475182800,0.04115755891562354,0 +7549,1475183100,0.037016274033910605,0 +7550,1475183400,0.04010905382943551,0 +7551,1475183700,0.04122580480884467,0 +7552,1475184000,0.04038824157481321,0 +7553,1475184300,0.040155585119981525,0 +7554,1475184600,0.04083183987969982,0 +7555,1475184900,0.03987639737565465,0 +7556,1475185200,0.04227430989503268,0 +7557,1475185500,0.04478699959922865,0 +7558,1475185800,0.04536708969160833,0 +7559,1475186100,0.04664825123157725,0 +7560,1475186400,0.05146268878676186,0 +7561,1475186700,0.05274385032778155,0 +7562,1475187000,0.05834931982645161,0 +7563,1475187300,0.05932647693422274,0 +7564,1475187600,0.0621648856732916,0 +7565,1475187900,0.0668645460447091,0 +7566,1475188200,0.06679630015148794,0 +7567,1475188500,0.06863273509596564,0 +7568,1475188800,0.07035439285562536,0 +7569,1475189100,0.0710306476153437,0 +7570,1475189400,0.07372946248277452,0 +7571,1475189700,0.07898749834389658,0 +7572,1475190000,0.08243081386426683,0 +7573,1475190300,0.08659381334865487,0 +7574,1475190600,0.09250328728057353,0 +7575,1475190900,0.08682646980243576,0 +7576,1475191200,0.09178360331655576,0 +7577,1475191500,0.092968600189081,0 +7578,1475191800,0.08922438232181379,0 +7579,1475192100,0.09697029119873553,0 +7580,1475192400,0.09655150958140456,0 +7581,1475192700,0.1079764925180935,0 +7582,1475193000,0.10769730477324126,0 +7583,1475193300,0.10509155248795324,0 +7584,1475193600,0.1082773948654108,0 +7585,1475193900,0.1082773948654108,0 +7586,1475194200,0.11711834011917162,0 +7587,1475194500,0.11483830686949208,0 +7588,1475194800,0.1143047480681838,0 +7589,1475195100,0.1182133764962209,0 +7590,1475195400,0.12000328014931196,0 +7591,1475195700,0.12100525394474378,0 +7592,1475196000,0.12430897559221506,0 +7593,1475196300,0.1248673510819196,0 +7594,1475196600,0.13007885565260074,0 +7595,1475196900,0.1352438289324206,0 +7596,1475197200,0.1384762026006343,0 +7597,1475197500,0.1411284861767836,0 +7598,1475197800,0.14229176844705402,0 +7599,1475198100,0.14790034003260116,0 +7600,1475198400,0.14543418161968671,0 
+7601,1475198700,0.14838736754304815,0 +7602,1475199000,0.15004077940980198,0 +7603,1475199300,0.1525069378227164,0 +7604,1475199600,0.15427512687344755,0 +7605,1475199900,0.16239328607662887,0 +7606,1475200200,0.16502385505041825,0 +7607,1475200500,0.16672069612189186,0 +7608,1475200800,0.16267247382148114,0 +7609,1475201100,0.16946604227965853,0 +7610,1475201400,0.17847139809429438,0 +7611,1475201700,0.17644573590107082,0 +7612,1475202000,0.1721431425442041,0 +7613,1475202300,0.16921167122320258,0 +7614,1475202600,0.1674434821724714,0 +7615,1475202900,0.16509210094363938,0 +7616,1475203200,0.16818798282541084,0 +7617,1475203500,0.17802779978856711,0 +7618,1475203800,0.17521420773768434,0 +7619,1475204100,0.17979598883940334,0 +7620,1475204400,0.1797742742369384,0 +7621,1475204700,0.17847139809429438,0 +7622,1475205000,0.1763526733194534,0 +7623,1475205300,0.16628019990230608,0 +7624,1475205600,0.1747706094319571,0 +7625,1475205900,0.1673038883000978,0 +7626,1475206200,0.17037495349347295,0 +7627,1475206500,0.17526073902844053,0 +7628,1475206800,0.1769823967884155,0 +7629,1475207100,0.18035436432910792,0 +7630,1475207400,0.18947449732770005,0 +7631,1475207700,0.18742712053208507,0 +7632,1475208000,0.19871250959638787,0 +7633,1475208300,0.2065762977431678,0 +7634,1475208600,0.22039609111352407,0 +7635,1475208900,0.2313061277653334,0 +7636,1475209200,0.2403114835799692,0 +7637,1475209500,0.24731289180374136,0 +7638,1475209800,0.2546648357516233,0 +7639,1475210100,0.2574101819093901,0 +7640,1475210400,0.2671817529793252,0 +7641,1475210700,0.27814142300792705,0 +7642,1475211000,0.2727655078764761,0 +7643,1475211300,0.2663441897447683,0 +7644,1475211600,0.2507344927212264,0 +7645,1475211900,0.2422657977939353,0 +7646,1475212200,0.2380314503302897,0 +7647,1475212500,0.2371938870957329,0 +7648,1475212800,0.21858137077203685,0 +7649,1475213100,0.2115086145690072,0 +7650,1475213400,0.2101343904471056,0 +7651,1475213700,0.205971390962644,0 +7652,1475214000,0.1927565043727904,0 +7653,1475214300,0.18398380501218767,0 +7654,1475214600,0.1925703792095556,0 +7655,1475214900,0.17586564580900635,0 +7656,1475215200,0.17246886157991764,0 +7657,1475215500,0.17300242038122596,0 +7658,1475215800,0.1697700467130122,0 +7659,1475216100,0.16169531671455067,0 +7660,1475216400,0.16427935439747882,0 +7661,1475216700,0.16381404148939166,0 +7662,1475217000,0.1567164685978606,0 +7663,1475217300,0.15992712766371445,0 +7664,1475217600,0.1548800336540134,0 +7665,1475217900,0.1579045675565273,0 +7666,1475218200,0.15499481083799072,0 +7667,1475218500,0.16190625856618182,0 +7668,1475218800,0.15862425152107046,0 +7669,1475219100,0.16262594253072496,0 +7670,1475219400,0.1547621543838946,0 +7671,1475219700,0.15848465764869685,0 +7672,1475220000,0.16239328607662887,0 +7673,1475220300,0.15753231723005756,0 +7674,1475220600,0.15278612556756868,0 +7675,1475220900,0.15643728085300831,0 +7676,1475221200,0.14631827614499976,0 +7677,1475221500,0.16069644500515526,0 +7678,1475221800,0.15457602922065972,0 +7679,1475222100,0.15888172466356282,0 +7680,1475222400,0.1556462489093127,0 +7681,1475222700,0.15962622531650228,0 +7682,1475223000,0.16155572284207198,0 +7683,1475223300,0.1650703863411744,0 +7684,1475223600,0.16590794957583635,0 +7685,1475223900,0.1687463583151154,0 +7686,1475224200,0.1584164117554757,0 +7687,1475224500,0.17023535962109934,0 +7688,1475224800,0.16169531671455067,0 +7689,1475225100,0.17416570265139128,0 +7690,1475225400,0.17053626196831154,0 +7691,1475225700,0.17721505324251166,0 +7692,1475226000,0.1789584256047414,0 
+7693,1475226300,0.18075143134397392,0 +7694,1475226600,0.17847139809429438,0 +7695,1475226900,0.1831462417776308,0 +7696,1475227200,0.1974561647445347,0 +7697,1475227500,0.18500749340997927,0 +7698,1475227800,0.19433856826030774,0 +7699,1475228100,0.19650072223991047,0 +7700,1475228400,0.188987469817232,0 +7701,1475228700,0.19617500320424952,0 +7702,1475229000,0.2046685148199895,0 +7703,1475229300,0.20329429069808794,0 +7704,1475229600,0.1920585350106492,0 +7705,1475229900,0.20122519930009208,0 +7706,1475230200,0.2010390741368572,0 +7707,1475230500,0.20613269943745105,0 +7708,1475230800,0.20613269943745105,0 +7709,1475231100,0.2081118303398764,0 +7710,1475231400,0.21327680361973828,0 +7711,1475231700,0.20487635458560524,0 +7712,1475232000,0.20620404741669807,0 +7713,1475232300,0.21858137077203685,0 +7714,1475232600,0.21692795890517788,0 +7715,1475232900,0.21699930688443544,0 +7716,1475233200,0.21788340140985354,0 +7717,1475233500,0.2159973330890036,0 +7718,1475233800,0.2291439737856676,0 +7719,1475234100,0.22923703636728496,0 +7720,1475234400,0.22260477638394616,0 +7721,1475234700,0.22681740924523186,0 +7722,1475235000,0.22253653049083008,0 +7723,1475235300,0.22877172345919786,0 +7724,1475235600,0.2339584113414827,0 +7725,1475235900,0.2378670397694147,0 +7726,1475236200,0.2493137373085687,0 +7727,1475236500,0.24619614082433225,0 +7728,1475236800,0.25831909312320445,0 +7729,1475237100,0.25375902662384536,0 +7730,1475237400,0.2555954615677977,0 +7731,1475237700,0.26911435259093136,0 +7732,1475238000,0.2707181310808926,0 +7733,1475238300,0.2657640996527039,0 +7734,1475238600,0.2643216296375812,0 +7735,1475238900,0.26597193941829866,0 +7736,1475239200,0.25790031150587345,0 +7737,1475239500,0.26818372677475705,0 +7738,1475239800,0.2628791596224585,0 +7739,1475240100,0.25515496534810683,0 +7740,1475240400,0.2565509040724733,0 +7741,1475240700,0.26615806458153346,0 +7742,1475241000,0.2681123787954995,0 +7743,1475241300,0.27895416955408764,0 +7744,1475241600,0.2816995157118543,0 +7745,1475241900,0.27118344398897976,0 +7746,1475242200,0.27434757176397234,0 +7747,1475242500,0.2703706974428192,0 +7748,1475242800,0.2749772952329345,0 +7749,1475243100,0.2694152549381435,0 +7750,1475243400,0.27248632013162377,0 +7751,1475243700,0.26908953590253504,0 +7752,1475244000,0.2738822588558852,0 +7753,1475244300,0.2652057241629993,0 +7754,1475244600,0.2707181310808926,0 +7755,1475244900,0.2743723884523686,0 +7756,1475245200,0.26841638322885314,0 +7757,1475245500,0.25808643666910835,0 +7758,1475245800,0.2680658475047433,0 +7759,1475246100,0.2583873390163205,0 +7760,1475246400,0.2562251850367598,0 +7761,1475246700,0.25375902662384536,0 +7762,1475247000,0.2494316165785824,0 +7763,1475247300,0.2524096191904452,0 +7764,1475247600,0.2515472392673869,0 +7765,1475247900,0.24626748880358976,0 +7766,1475248200,0.23751960613144635,0 +7767,1475248500,0.2333317899585569,0 +7768,1475248800,0.220185149261893,0 +7769,1475249100,0.21776552213983985,0 +7770,1475249400,0.20429626449350924,0 +7771,1475249700,0.20143303906570784,0 +7772,1475250000,0.1998975064689993,0 +7773,1475250300,0.19215159759226666,0 +7774,1475250600,0.1914753428325063,0 +7775,1475250900,0.1825661516855664,0 +7776,1475251200,0.17537551621241784,0 +7777,1475251500,0.1741439880490314,0 +7778,1475251800,0.16150919155131582,0 +7779,1475252100,0.15555318632769527,0 +7780,1475252400,0.14405995749778516,0 +7781,1475252700,0.14264230417105875,0 +7782,1475253000,0.12889075669393404,0 +7783,1475253300,0.1319618218873092,0 +7784,1475253600,0.12316740792430453,0 
+7785,1475253900,0.12081912878150892,0 +7786,1475254200,0.11986368627693834,0 +7787,1475254500,0.11253655901745263,0 +7788,1475254800,0.10185607673370307,0 +7789,1475255100,0.10059973188181526,0 +7790,1475255400,0.0938526947144991,0 +7791,1475255700,0.0920845056635578,0 +7792,1475256000,0.08059127683354259,0 +7793,1475256300,0.07779939938501973,0 +7794,1475256600,0.07600949573213882,0 +7795,1475256900,0.06965642349323196,0 +7796,1475257200,0.06805264500358592,0 +7797,1475257500,0.06900498542243537,0 +7798,1475257800,0.06928417316676225,0 +7799,1475258100,0.06405095399456187,0 +7800,1475258400,0.06004926298448705,0 +7801,1475258700,0.061746104056275876,0 +7802,1475259000,0.058653324259700185,0 +7803,1475259300,0.05779094433674705,0 +7804,1475259600,0.05497735228659987,0 +7805,1475259900,0.05569703625061759,0 +7806,1475260200,0.05397537849095788,0 +7807,1475260500,0.04809072124691008,0 +7808,1475260800,0.05055687965950928,0 +7809,1475261100,0.04729968930237384,0 +7810,1475261400,0.04815896714013123,0 +7811,1475261700,0.05015981264411786,0 +7812,1475262000,0.04534537508893322,0 +7813,1475262300,0.04387808838467869,0 +7814,1475262600,0.04699568486912525,0 +7815,1475262900,0.04557803154271409,0 +7816,1475263200,0.045878933889716124,0 +7817,1475263500,0.04443646387543407,0 +7818,1475263800,0.04359890064035179,0 +7819,1475264100,0.0424139037677215,0 +7820,1475264400,0.04227430989503268,0 +7821,1475264700,0.04117927351829865,0 +7822,1475265000,0.04348412345658465,0 +7823,1475265300,0.0400873392267604,0 +7824,1475265600,0.043459306767662936,0 +7825,1475265900,0.04178418029854925,0 +7826,1475266200,0.04004080793621436,0 +7827,1475266500,0.03957549502865262,0 +7828,1475266800,0.0395289637370558,0 +7829,1475267100,0.042202961915565,0 +7830,1475267400,0.0382260875944118,0 +7831,1475267700,0.0388527089773375,0 +7832,1475268000,0.039783334793511815,0 +7833,1475268300,0.03924977599272892,0 +7834,1475268600,0.03913189672271518,0 +7835,1475268900,0.038340864779229766,0 +7836,1475269200,0.03969027221241976,0 +7837,1475269500,0.03924977599272892,0 +7838,1475269800,0.03969027221241976,0 +7839,1475270100,0.03929630728327494,0 +7840,1475270400,0.038179556303865776,0 +7841,1475270700,0.03827261888600862,0 +7842,1475271000,0.040133870518357186,0 +7843,1475271300,0.04287921667633407,0 +7844,1475271600,0.041504992554222346,0 +7845,1475271900,0.04348412345658465,0 +7846,1475272200,0.046276000905107535,0 +7847,1475272500,0.04339106087444181,0 +7848,1475272800,0.051906287092699295,0 +7849,1475273100,0.05334875710803215,0 +7850,1475273400,0.05697819779058647,0 +7851,1475273700,0.05969872726069245,0 +7852,1475274000,0.061026420091207324,0 +7853,1475274300,0.062096639780070476,0 +7854,1475274600,0.06393307472454816,0 +7855,1475274900,0.06486370054072246,0 +7856,1475275200,0.0694237670394511,0 +7857,1475275500,0.07098411632479767,0 +7858,1475275800,0.07351852063061798,0 +7859,1475276100,0.07542630355351257,0 +7860,1475276400,0.07989330747114919,0 +7861,1475276700,0.07791727865503344,0 +7862,1475277000,0.08215162611888915,0 +7863,1475277300,0.08252387644535887,0 +7864,1475277600,0.0872452514194515,0 +7865,1475277900,0.08471084711363118,0 +7866,1475278200,0.0895966326482835,0 +7867,1475278500,0.09175878662763408,0 +7868,1475278800,0.09513385625478324,0 +7869,1475279100,0.09994829381049326,0 +7870,1475279400,0.10325201545796454,0 +7871,1475279700,0.10981292746204584,0 +7872,1475280000,0.10332336343722208,0 +7873,1475280300,0.10737158573763278,0 +7874,1475280600,0.10809126970207084,0 +7875,1475280900,0.10655573710543582,0 
+7876,1475281200,0.11616599970053235,0 +7877,1475281500,0.11725793399154524,0 +7878,1475281800,0.1149561861395058,0 +7879,1475282100,0.11574721808330647,0 +7880,1475282400,0.11965584651123852,0 +7881,1475282700,0.12440203817383247,0 +7882,1475283000,0.12926300702040378,0 +7883,1475283300,0.129706605326131,0 +7884,1475283600,0.14210564328381914,0 +7885,1475283900,0.14026920833986686,0 +7886,1475284200,0.14934281004761873,0 +7887,1475284500,0.1468766516347043,0 +7888,1475284800,0.14971506037408844,0 +7889,1475285100,0.14850524681306188,0 +7890,1475285400,0.15548494043447414,0 +7891,1475285700,0.15899650184754016,0 +7892,1475286000,0.1566947539955007,0 +7893,1475286300,0.15543840914371795,0 +7894,1475286600,0.1652565115044093,0 +7895,1475286900,0.16409322923424394,0 +7896,1475287200,0.16576835570335768,0 +7897,1475287500,0.1690255460599677,0 +7898,1475287800,0.16972351542215094,0 +7899,1475288100,0.16579007030571755,0 +7900,1475288400,0.17028189091185553,0 +7901,1475288700,0.17828527293105953,0 +7902,1475289000,0.1741439880490314,0 +7903,1475289300,0.16686028999437053,0 +7904,1475289600,0.16727907161159644,0 +7905,1475289900,0.1813098068336785,0 +7906,1475290200,0.16688510668276682,0 +7907,1475290500,0.17458448426872225,0 +7908,1475290800,0.16790879508055853,0 +7909,1475291100,0.17365385845254794,0 +7910,1475291400,0.1763309587170935,0 +7911,1475291700,0.17095504358553745,0 +7912,1475292000,0.17125904801889108,0 +7913,1475292300,0.16432588568823495,0 +7914,1475292600,0.16304472414795085,0 +7915,1475292900,0.16041725726030295,0 +7916,1475293200,0.16279035309159995,0 +7917,1475293500,0.16988482389698953,0 +7918,1475293800,0.17272323263637365,0 +7919,1475294100,0.18377596524659293,0 +7920,1475294400,0.2057387345086004,0 +7921,1475294700,0.2024101961727013,0 +7922,1475295000,0.21739327181337006,0 +7923,1475295300,0.22681740924523186,0 +7924,1475295600,0.2403331981823291,0 +7925,1475295900,0.2480573924566808,0 +7926,1475296200,0.2518729583031005,0 +7927,1475296500,0.2597367464498258,0 +7928,1475296800,0.26169106066379183,0 +7929,1475297100,0.26206331099026153,0 +7930,1475297400,0.2561786537460036,0 +7931,1475297700,0.257828963526616,0 +7932,1475298000,0.2540382143686977,0 +7933,1475298300,0.2381710442027684,0 +7934,1475298600,0.2365424490244108,0 +7935,1475298900,0.2220464008942416,0 +7936,1475299200,0.21478751952797706,0 +7937,1475299500,0.21115807884489726,0 +7938,1475299800,0.1968977892548228,0 +7939,1475300100,0.19999056905061666,0 +7940,1475300400,0.17563298935491026,0 +7941,1475300700,0.1812384588544209,0 +7942,1475301000,0.17479542612035334,0 +7943,1475301300,0.16502385505041825,0 +7944,1475301600,0.161301351785616,0 +7945,1475301900,0.16109040993398485,0 +7946,1475302200,0.1569956563427129,0 +7947,1475302500,0.1618131959845644,0 +7948,1475302800,0.15690259376109547,0 +7949,1475303100,0.16243981736749016,0 +7950,1475303400,0.15720659819444913,0 +7951,1475303700,0.16586141828497508,0 +7952,1475304000,0.16274382180073868,0 +7953,1475304300,0.15555318632769527,0 +7954,1475304600,0.1553918778528567,0 +7955,1475304900,0.16234675478587268,0 +7956,1475305200,0.1593005062808938,0 +7957,1475305500,0.15997365895457571,0 +7958,1475305800,0.15415724760343386,0 +7959,1475306100,0.15706700432197046,0 +7960,1475306400,0.16793050968291842,0 +7961,1475306700,0.1586707828119317,0 +7962,1475307000,0.16281206769395984,0 +7963,1475307300,0.15150496402728456,0 +7964,1475307600,0.15345927824125058,0 +7965,1475307900,0.15567106559770902,0 +7966,1475308200,0.14796858592571718,0 
+7967,1475308500,0.15429684147580744,0 +7968,1475308800,0.15390287654697785,0 +7969,1475309100,0.15743925464844016,0 +7970,1475309400,0.15792628215899224,0 +7971,1475309700,0.16139441436723342,0 +7972,1475310000,0.15888172466356282,0 +7973,1475310300,0.1562759723782748,0 +7974,1475310600,0.17025707422345926,0 +7975,1475310900,0.16346350576528185,0 +7976,1475311200,0.1683492913002494,0 +7977,1475311500,0.1675830760449501,0 +7978,1475311800,0.18107715037968747,0 +7979,1475312100,0.1748884887019708,0 +7980,1475312400,0.1825196203947051,0 +7981,1475312700,0.17905148818635885,0 +7982,1475313000,0.18240174112469126,0 +7983,1475313300,0.18789243344018275,0 +7984,1475313600,0.1883825630367082,0 +7985,1475313900,0.19768882119858144,0 +7986,1475314200,0.1948721270615845,0 +7987,1475314500,0.1946860018983496,0 +7988,1475314800,0.19417415769944327,0 +7989,1475315100,0.20108560542766601,0 +7990,1475315400,0.20622576201906848,0 +7991,1475315700,0.2017587581013688,0 +7992,1475316000,0.2038774828762308,0 +7993,1475316300,0.20934335850330488,0 +7994,1475316600,0.20485463998322434,0 +7995,1475316900,0.2118343336046156,0 +7996,1475317200,0.219930778205437,0 +7997,1475317500,0.2248165637404045,0 +7998,1475317800,0.21397477298181647,0 +7999,1475318100,0.2097156088297746,0 +8000,1475318400,0.2229304954196597,0 +8001,1475318700,0.2194189340065937,0 +8002,1475319000,0.22072181014923767,0 +8003,1475319300,0.2256075956842053,0 +8004,1475319600,0.22253653049083008,0 +8005,1475319900,0.22483827834286954,0 +8006,1475320200,0.2247452157612521,0 +8007,1475320500,0.22488480963362573,0 +8008,1475320800,0.2274223160257977,0 +8009,1475321100,0.2296310012962197,0 +8010,1475321400,0.23975310809026465,0 +8011,1475321700,0.23612366740707985,0 +8012,1475322000,0.2513145828133959,0 +8013,1475322300,0.24161435972261325,0 +8014,1475322600,0.2467793330024331,0 +8015,1475322900,0.2492672060177074,0 +8016,1475323200,0.2460100156610973,0 +8017,1475323500,0.2464753285691845,0 +8018,1475323800,0.2538272725170665,0 +8019,1475324100,0.2534550221905968,0 +8020,1475324400,0.26064565766364023,0 +8021,1475324700,0.2619950650971455,0 +8022,1475325000,0.2581794992507257,0 +8023,1475325300,0.265180907474498,0 +8024,1475325600,0.2650661302905206,0 +8025,1475325900,0.2565043727816121,0 +8026,1475326200,0.2698123219531146,0 +8027,1475326500,0.26776494515753113,0 +8028,1475326800,0.2658571622343213,0 +8029,1475327100,0.2705102913151928,0 +8030,1475327400,0.26601847070915985,0 +8031,1475327700,0.2646225319847933,0 +8032,1475328000,0.27118344398897976,0 +8033,1475328300,0.2724180742384026,0 +8034,1475328600,0.2671817529793252,0 +8035,1475328900,0.2662511271631509,0 +8036,1475329200,0.27830273148276563,0 +8037,1475329500,0.26643725232638577,0 +8038,1475329800,0.2686490396828442,0 +8039,1475330100,0.2789324549517277,0 +8040,1475330400,0.27211406980515396,0 +8041,1475330700,0.25757459247026504,0 +8042,1475331000,0.2658571622343213,0 +8043,1475331300,0.2618306545362705,0 +8044,1475331600,0.2555954615677977,0 +8045,1475331900,0.2600159341946781,0 +8046,1475332200,0.2541995228435362,0 +8047,1475332500,0.2596436838682084,0 +8048,1475332800,0.2443131745895187,0 +8049,1475333100,0.2586913434496742,0 +8050,1475333400,0.2416826056158344,0 +8051,1475333700,0.2493385539969649,0 +8052,1475334000,0.24147476585023964,0 +8053,1475334300,0.23323872737693954,0 +8054,1475334600,0.2253501225417129,0 +8055,1475334900,0.22223252605747645,0 +8056,1475335200,0.2181843037570657,0 +8057,1475335500,0.2226978389655636,0 +8058,1475335800,0.2092037646308262,0 
+8059,1475336100,0.20797223646745025,0 +8060,1475336400,0.19617500320424952,0 +8061,1475336700,0.18796378141941927,0 +8062,1475337000,0.17798126849781093,0 +8063,1475337300,0.1806583687623565,0 +8064,1475337600,0.1725619241615351,0 +8065,1475337900,0.17367867514094426,0 +8066,1475338200,0.16292994696397356,0 +8067,1475338500,0.15922915830163625,0 +8068,1475338800,0.1495289352108536,0 +8069,1475339100,0.15108618240995358,0 +8070,1475339400,0.13836142541665702,0 +8071,1475339700,0.14219870586543654,0 +8072,1475340000,0.12626328980618107,0 +8073,1475340300,0.1246346946279286,0 +8074,1475340600,0.12184281717930065,0 +8075,1475340900,0.12409803374058392,0 +8076,1475341200,0.11860734142505053,0 +8077,1475341500,0.10323030085560464,0 +8078,1475341800,0.10592911572251007,0 +8079,1475342100,0.09080644620983548,0 +8080,1475342400,0.08519787462386805,0 +8081,1475342700,0.08652556745543376,0 +8082,1475343000,0.07703318412940516,0 +8083,1475343300,0.07761327422178488,0 +8084,1475343600,0.07095929963587598,0 +8085,1475343900,0.06609833079014532,0 +8086,1475344200,0.06621310797391249,0 +8087,1475344500,0.06395789141241906,0 +8088,1475344800,0.060979888800661325,0 +8089,1475345100,0.05797706949998192,0 +8090,1475345400,0.05588316141385245,0 +8091,1475345700,0.057837475628343876,0 +8092,1475346000,0.05583663012330643,0 +8093,1475346300,0.05325569452588934,0 +8094,1475346600,0.05406844107204989,0 +8095,1475346900,0.05078953611329014,0 +8096,1475347200,0.0493935973895541,0 +8097,1475347500,0.04846297157337981,0 +8098,1475347800,0.05109043846029216,0 +8099,1475348100,0.04648384067101752,0 +8100,1475348400,0.04680955970589041,0 +8101,1475348700,0.04855603415447182,0 +8102,1475349000,0.0480659045579884,0 +8103,1475349300,0.0459502818691838,0 +8104,1475349600,0.04711356413913901,0 +8105,1475349900,0.04466912032921496,0 +8106,1475350200,0.043785025803586654,0 +8107,1475350500,0.045180964528373516,0 +8108,1475350800,0.042600028930956366,0 +8109,1475351100,0.044157276130056385,0 +8110,1475351400,0.043040525150647206,0 +8111,1475351700,0.04418209281897807,0 +8112,1475352000,0.044250338711148425,0 +8113,1475352300,0.043018810547972096,0 +8114,1475352600,0.042249493207161766,0 +8115,1475352900,0.043530654747130665,0 +8116,1475353200,0.04162287182423609,0 +8117,1475353500,0.040133870518357186,0 +8118,1475353800,0.04048130415590522,0 +8119,1475354100,0.04297227925742608,0 +8120,1475354400,0.04004080793621436,0 +8121,1475354700,0.040971433752388674,0 +8122,1475355000,0.04315840442066095,0 +8123,1475355300,0.03992292866620067,0 +8124,1475355600,0.03976162019188748,0 +8125,1475355900,0.041504992554222346,0 +8126,1475356200,0.03889924026893432,0 +8127,1475356500,0.042342555788253806,0 +8128,1475356800,0.04250696634986431,0 +8129,1475357100,0.041064496334531485,0 +8130,1475357400,0.039783334793511815,0 +8131,1475357700,0.0424139037677215,0 +8132,1475358000,0.04273962280364522,0 +8133,1475358300,0.042600028930956366,0 +8134,1475358600,0.044203807420602405,0 +8135,1475358900,0.04515924992569837,0 +8136,1475359200,0.05181322451160726,0 +8137,1475359500,0.05164881394999672,0 +8138,1475359800,0.05351006558234531,0 +8139,1475360100,0.05765135046510901,0 +8140,1475360400,0.06367870366809217,0 +8141,1475360700,0.06321339075947963,0 +8142,1475361000,0.06670323757039591,0 +8143,1475361300,0.06698242531472283,0 +8144,1475361600,0.06486370054072246,0 +8145,1475361900,0.07121677277857852,0 +8146,1475362200,0.07184339416150426,0 +8147,1475362500,0.07600949573213882,0 +8148,1475362800,0.08238428257267004,0 
+8149,1475363100,0.08119618361379316,0 +8150,1475363400,0.08436031138983659,0 +8151,1475363700,0.08689781778190346,0 +8152,1475364000,0.09006194555689602,0 +8153,1475364300,0.08680475519976064,0 +8154,1475364600,0.09048072717391176,0 +8155,1475364900,0.09059550435767892,0 +8156,1475365200,0.08822240852617177,0 +8157,1475365500,0.0978078544332924,0 +8158,1475365800,0.10206701858533423,0 +8159,1475366100,0.1094654938239724,0 +8160,1475366400,0.1019956706060767,0 +8161,1475366700,0.11239696514497396,0 +8162,1475367000,0.11909747102153395,0 +8163,1475367300,0.11595505784890126,0 +8164,1475367600,0.11348889943598682,0 +8165,1475367900,0.1190261230422764,0 +8166,1475368200,0.12342488106679694,0 +8167,1475368500,0.12423762761295752,0 +8168,1475368800,0.12395843986810524,0 +8169,1475369100,0.1252861326992506,0 +8170,1475369400,0.1326132599586312,0 +8171,1475369700,0.12952048016289616,0 +8172,1475370000,0.1318222280149356,0 +8173,1475370300,0.1438986490229466,0 +8174,1475370600,0.14771421486936626,0 +8175,1475370900,0.14429261395177614,0 +8176,1475371200,0.14350158200808058,0 +8177,1475371500,0.1494110559408399,0 +8178,1475371800,0.15097140522597627,0 +8179,1475372100,0.14731714785439518,0 +8180,1475372400,0.1518772143537543,0 +8181,1475372700,0.15520575268962186,0 +8182,1475373000,0.16132306638808094,0 +8183,1475373300,0.161301351785616,0 +8184,1475373600,0.17335295610533574,0 +8185,1475373900,0.1797494575485421,0 +8186,1475374200,0.1705827932590677,0 +8187,1475374500,0.1814711153085171,0 +8188,1475374800,0.17037495349347295,0 +8189,1475375100,0.1767032090435632,0 +8190,1475375400,0.17035013680507669,0 +8191,1475375700,0.16823451411627208,0 +8192,1475376000,0.17435182781462613,1 +8193,1475376300,0.23153878421932444,1 +8194,1475376600,0.17970292625778592,1 +8195,1475376900,0.1819612449050005,0 +8196,1475377200,0.17205007996258667,0 +8197,1475377500,0.1790763048748602,0 +8198,1475377800,0.17889017971162532,0 +8199,1475378100,0.17500326588594814,0 +8200,1475378400,0.17586564580900635,0 +8201,1475378700,0.1729341744880048,0 +8202,1475379000,0.1612765350972197,0 +8203,1475379300,0.1682810454070283,0 +8204,1475379600,0.1717708922177344,0 +8205,1475379900,0.1767249236459231,0 +8206,1475380200,0.1915001595209341,0 +8207,1475380500,0.19173281597497768,0 +8208,1475380800,0.2029685716624164,0 +8209,1475381100,0.2225117138023287,0 +8210,1475381400,0.2210940604757074,0 +8211,1475381700,0.23949563494777226,0 +8212,1475382000,0.2506166134512127,0 +8213,1475382300,0.2600159341946781,0 +8214,1475382600,0.264926536418147,0 +8215,1475382900,0.27865326720687544,0 +8216,1475383200,0.26401762520433264,0 +8217,1475383500,0.2631583473673108,0 +8218,1475383800,0.26906782130017515,0 +8219,1475384100,0.26031993862803177,0 +8220,1475384400,0.2616445293730357,0 +8221,1475384700,0.2449863272633057,0 +8222,1475385000,0.25259574435368004,0 +8223,1475385300,0.23072603767326896,0 +8224,1475385600,0.2132519869312369,0 +8225,1475385900,0.2184883081904194,0 +8226,1475386200,0.2117412710229981,0 +8227,1475386500,0.1969195038572005,0 +8228,1475386800,0.1853580291340892,0 +8229,1475387100,0.17968121165532094,0 +8230,1475387400,0.17509632846756554,0 +8231,1475387700,0.16427935439747882,0 +8232,1475388000,0.16313778672956827,0 +8233,1475388300,0.1644654795607137,0 +8234,1475388600,0.15988059637295826,0 +8235,1475388900,0.16969869873375468,0 +8236,1475389200,0.15860253691871054,0 +8237,1475389500,0.1612548204948598,0 +8238,1475389800,0.1541107163125726,0 +8239,1475390100,0.15860253691871054,0 +8240,1475390400,0.1602311320970681,0 
+8241,1475390700,0.15660169141388328,0 +8242,1475391000,0.1562976869806347,0 +8243,1475391300,0.1527613088790673,0 +8244,1475391600,0.16895419808081524,0 +8245,1475391900,0.1567164685978606,0 +8246,1475392200,0.1580193447406097,0 +8247,1475392500,0.1595796940257461,0 +8248,1475392800,0.15827681788299702,0 +8249,1475393100,0.1570887189244354,0 +8250,1475393400,0.15034168175701418,0 +8251,1475393700,0.15716006690358789,0 +8252,1475394000,0.161301351785616,0 +8253,1475394300,0.15825200119460073,0 +8254,1475394600,0.15611156181739985,0 +8255,1475394900,0.16388228738250776,0 +8256,1475395200,0.15888172466356282,0 +8257,1475395500,0.16025284669942802,0 +8258,1475395800,0.16565047643334396,0 +8259,1475396100,0.1583915950670794,0 +8260,1475396400,0.16337044318366442,0 +8261,1475396700,0.1674651967748313,0 +8262,1475397000,0.16395363536176527,0 +8263,1475397300,0.1682810454070283,0 +8264,1475397600,0.17314201425359954,0 +8265,1475397900,0.17028189091185553,0 +8266,1475398200,0.17472407814109586,0 +8267,1475398500,0.17237579899830022,0 +8268,1475398800,0.187845902149374,0 +8269,1475399100,0.18947449732770005,0 +8270,1475399400,0.18500749340997927,0 +8271,1475399700,0.19007940410822385,0 +8272,1475400000,0.1904516544346936,0 +8273,1475400300,0.19038340854151445,0 +8274,1475400600,0.1944316308419252,0 +8275,1475400900,0.19933913097928416,0 +8276,1475401200,0.2038061348969838,0 +8277,1475401500,0.20622576201906848,0 +8278,1475401800,0.21416089814505126,0 +8279,1475402100,0.2043893270751267,0 +8280,1475402400,0.2103918635894929,0 +8281,1475402700,0.2101126758446406,0 +8282,1475403000,0.2224899991999688,0 +8283,1475403300,0.21276495942078988,0 +8284,1475403600,0.21022745302872287,0 +8285,1475403900,0.2270252490108266,0 +8286,1475404200,0.2216989672561681,0 +8287,1475404500,0.2264451589187621,0 +8288,1475404800,0.21909321497088013,0 +8289,1475405100,0.2317249093825593,0 +8290,1475405400,0.2230949059805347,0 +8291,1475405700,0.22923703636728496,0 +8292,1475406000,0.22348887090936426,0 +8293,1475406300,0.23326044197929946,0 +8294,1475406600,0.2246521531795296,0 +8295,1475406900,0.2422657977939353,0 +8296,1475407200,0.2369364139532404,0 +8297,1475407500,0.2413351719777609,0 +8298,1475407800,0.2487801785072604,0 +8299,1475408100,0.2448715500793283,0 +8300,1475408400,0.2541064602619188,0 +8301,1475408700,0.2493137373085687,0 +8302,1475409000,0.2652057241629993,0 +8303,1475409300,0.2569914002920591,0 +8304,1475409600,0.2727437932741161,0 +8305,1475409900,0.2700666930095706,0 +8306,1475410200,0.2849815227569657,0 +8307,1475410500,0.283306396287852,0 +8308,1475410800,0.2834459901602256,0 +8309,1475411100,0.28746939577223996,0 +8310,1475411400,0.2713695691522146,0 +8311,1475411700,0.27406838401912004,0 +8312,1475412000,0.27099731882574485,0 +8313,1475412300,0.2791185801149626,0 +8314,1475412600,0.27325563747295945,0 +8315,1475412900,0.2724180742384026,0 +8316,1475413200,0.2730446956213283,0 +8317,1475413500,0.2845162098488785,0 +8318,1475413800,0.28428355339478245,0 +8319,1475414100,0.2833994588694694,0 +8320,1475414400,0.2872832706090052,0 +8321,1475414700,0.2887257406241278,0 +8322,1475415000,0.2860269257571173,0 +8323,1475415300,0.2836786466143217,0 +8324,1475415600,0.2758148584674913,0 +8325,1475415900,0.2862130509203521,0 +8326,1475416200,0.2816312698186332,0 +8327,1475416500,0.2731377582029457,0 +8328,1475416800,0.2852358938134217,0 +8329,1475417100,0.2791868260081837,0 +8330,1475417400,0.27248632013162377,0 +8331,1475417700,0.2745336969272072,0 +8332,1475418000,0.2675540033057949,0 
+8333,1475418300,0.2645542860916773,0 +8334,1475418600,0.2692074151725488,0 +8335,1475418900,0.2608317828268751,0 +8336,1475419200,0.2719062300395593,0 +8337,1475419500,0.25864481215881296,0 +8338,1475419800,0.26218119026038034,0 +8339,1475420100,0.26259997187760625,0 +8340,1475420400,0.2500117066706468,0 +8341,1475420700,0.24433488919198365,0 +8342,1475421000,0.2362849758819185,0 +8343,1475421300,0.23933432647293365,0 +8344,1475421600,0.23579794837147136,0 +8345,1475421900,0.22211774887349914,0 +8346,1475422200,0.22716484288330524,0 +8347,1475422500,0.21504499267046945,0 +8348,1475422800,0.20831967010549213,0 +8349,1475423100,0.19703738312724994,0 +8350,1475423400,0.2101126758446406,0 +8351,1475423700,0.1844739346086711,0 +8352,1475424000,0.19152187412331506,0 +8353,1475424300,0.18763806238375824,0 +8354,1475424600,0.16888595218759409,0 +8355,1475424900,0.16807010355539712,0 +8356,1475425200,0.15438990405742484,0 +8357,1475425500,0.1532266217871545,0 +8358,1475425800,0.14045533350310171,0 +8359,1475426100,0.13042628929056907,0 +8360,1475426400,0.1263315356994022,0 +8361,1475426700,0.11765500100651632,0 +8362,1475427000,0.11837468497095438,0 +8363,1475427300,0.10769730477324126,0 +8364,1475427600,0.1044618290189911,0 +8365,1475427900,0.1014838264072334,0 +8366,1475428200,0.09811185886654096,0 +8367,1475428500,0.08587412938358634,0 +8368,1475428800,0.08489697227686599,0 +8369,1475429100,0.08308225193506344,0 +8370,1475429400,0.07424130668088229,0 +8371,1475429700,0.07389077095708768,0 +8372,1475430000,0.07207605061528513,0 +8373,1475430300,0.06802782831571506,0 +8374,1475430600,0.06540036142775191,0 +8375,1475430900,0.0640974852851079,0 +8376,1475431200,0.06177092074414677,0 +8377,1475431500,0.061094665984428476,0 +8378,1475431800,0.06014232556557906,0 +8379,1475432100,0.056441536903557035,0 +8380,1475432400,0.0541863203420636,0 +8381,1475432700,0.05376753872504785,0 +8382,1475433000,0.05709297497540442,0 +8383,1475433300,0.055278254633601864,0 +8384,1475433600,0.05004503546035072,0 +8385,1475433900,0.048602565445017835,0 +8386,1475434200,0.051906287092699295,0 +8387,1475434500,0.050764719425419255,0 +8388,1475434800,0.051834939113231566,0 +8389,1475435100,0.04953319126119216,0 +8390,1475435400,0.046741313812669286,0 +8391,1475435700,0.0448800621803207,0 +8392,1475436000,0.0461829383229647,0 +8393,1475436300,0.0458324025991701,0 +8394,1475436600,0.04608987574187265,0 +8395,1475436900,0.04287921667633407,0 +8396,1475437200,0.04476218291030699,0 +8397,1475437500,0.04099314835506378,0 +8398,1475437800,0.04278615409419124,0 +8399,1475438100,0.039783334793511815,0 +8400,1475438400,0.03922495930380724,0 +8401,1475438700,0.038573521233010624,0 +8402,1475439000,0.042202961915565,0 +8403,1475439300,0.03927149059540405,0 +8404,1475439600,0.041111027625077526,0 +8405,1475439900,0.03766771210470722,0 +8406,1475440200,0.03982986608510861,0 +8407,1475440500,0.03999427664566834,0 +8408,1475440800,0.040155585119981525,0 +8409,1475441100,0.0397368035029658,0 +8410,1475441400,0.044250338711148425,0 +8411,1475441700,0.04252868095148864,0 +8412,1475442000,0.04085355448237496,0 +8413,1475442300,0.040760491901282926,0 +8414,1475442600,0.042342555788253806,0 +8415,1475442900,0.042342555788253806,0 +8416,1475443200,0.041808996987470905,0 +8417,1475443500,0.04139021536940436,0 +8418,1475443800,0.042296024497707814,0 +8419,1475444100,0.041458461262625534,0 +8420,1475444400,0.04429687000274525,0 +8421,1475444700,0.049626253843334966,0 +8422,1475445000,0.04725315801182785,0 +8423,1475445300,0.04692743897590413,0 
+8424,1475445600,0.05453375398066244,0 +8425,1475445900,0.05376753872504785,0 +8426,1475446200,0.05844238240859444,0 +8427,1475446500,0.05977007523910935,0 +8428,1475446800,0.061398670417677076,0 +8429,1475447100,0.0638182975397302,0 +8430,1475447400,0.06681801475416305,0 +8431,1475447700,0.06565473248420793,0 +8432,1475448000,0.07363639990063169,0 +8433,1475448300,0.07277401997767856,0 +8434,1475448600,0.07328586417683709,0 +8435,1475448900,0.07852218543528401,0 +8436,1475449200,0.08098834384893397,0 +8437,1475449500,0.0774519657474717,0 +8438,1475449800,0.08494350356741202,0 +8439,1475450100,0.08454643655307144,0 +8440,1475450400,0.0931081940614546,0 +8441,1475450700,0.09131829040794323,0 +8442,1475451000,0.09513385625478324,0 +8443,1475451300,0.0912252278268512,0 +8444,1475451600,0.0941318824593514,0 +8445,1475451900,0.09994829381049326,0 +8446,1475452200,0.10059973188181526,0 +8447,1475452500,0.10690627282954564,0 +8448,1475452800,0.10325201545796454,0 +8449,1475453100,0.11302358652789968,0 +8450,1475453400,0.11379290386934048,0 +8451,1475453700,0.11230390256335654,0 +8452,1475454000,0.11169899578289576,0 +8453,1475454300,0.11790937206286728,0 +8454,1475454600,0.11942319005724747,0 +8455,1475454900,0.11800243464448468,0 +8456,1475455200,0.12349312696001807,0 +8457,1475455500,0.12102696854710365,0 +8458,1475455800,0.12754134926042876,0 +8459,1475456100,0.13356870246330688,0 +8460,1475456400,0.13231235761141902,0 +8461,1475456700,0.1428284293342936,0 +8462,1475457000,0.1424561790078239,0 +8463,1475457300,0.1556462489093127,0 +8464,1475457600,0.15176243716977694,0 +8465,1475457900,0.15466909180227714,0 +8466,1475458200,0.1644654795607137,0 +8467,1475458500,0.1521564020986066,0 +8468,1475458800,0.15878866208194542,0 +8469,1475459100,0.15611156181739985,0 +8470,1475459400,0.15848465764869685,0 +8471,1475459700,0.16893248347835027,0 +8472,1475460000,0.16860676444274178,0 +8473,1475460300,0.1711194541464124,0 +8474,1475460600,0.16776920120818495,0 +8475,1475460900,0.16176666469370315,0 +8476,1475461200,0.16983829260612826,0 +8477,1475461500,0.17181742350859566,0 +8478,1475461800,0.17588736041136624,0 +8479,1475462100,0.16993135518774571,0 +8480,1475462400,0.17025707422345926,0 +8481,1475462700,0.17551511008489654,0 +8482,1475463000,0.1729558890903647,0 +8483,1475463300,0.17276976392712984,0 +8484,1475463600,0.16997788647860698,0 +8485,1475463900,0.16223197760189534,0 +8486,1475464200,0.1652565115044093,0 +8487,1475464500,0.1732133622328571,0 +8488,1475464800,0.17274804932476995,0 +8489,1475465100,0.167837447101301,0 +8490,1475465400,0.16488426117793956,0 +8491,1475465700,0.16586141828497508,0 +8492,1475466000,0.1767032090435632,0 +8493,1475466300,0.18579852535378008,0 +8494,1475466600,0.1806335520739602,0 +8495,1475466900,0.1916149367049325,0 +8496,1475467200,0.2024101961727013,0 +8497,1475467500,0.2003876360655247,0 +8498,1475467800,0.21788340140985354,0 +8499,1475468100,0.2363780384635358,0 +8500,1475468400,0.2384967632383768,0 +8501,1475468700,0.25157205595578325,0 +8502,1475469000,0.25024436312474296,0 +8503,1475469300,0.2617624086430493,0 +8504,1475469600,0.2573884673070302,0 +8505,1475469900,0.26139015831657963,0 +8506,1475470200,0.2494316165785824,0 +8507,1475470500,0.2497573356142959,0 +8508,1475470800,0.2464753285691845,0 +8509,1475471100,0.2477316734210724,0 +8510,1475471400,0.23298125423444715,0 +8511,1475471700,0.22274437025642485,0 +8512,1475472000,0.2129510845840248,0 +8513,1475472300,0.20662282903397647,0 +8514,1475472600,0.20062029251956826,0 
+8515,1475472900,0.18891612183798487,0 +8516,1475473200,0.1844491179202748,0 +8517,1475473500,0.1824948037063088,0 +8518,1475473800,0.1835184921041005,0 +8519,1475474100,0.1708154497131638,0 +8520,1475474400,0.16893248347835027,0 +8521,1475474700,0.16460507343308728,0 +8522,1475475000,0.16037072596944169,0 +8523,1475475300,0.16230022349501144,0 +8524,1475475600,0.15464737719991725,0 +8525,1475475900,0.16490597578040453,0 +8526,1475476200,0.15639074956225213,0 +8527,1475476500,0.1554135924552166,0 +8528,1475476800,0.16018460080620686,0 +8529,1475477100,0.15829853248546194,0 +8530,1475477400,0.15964793991886214,0 +8531,1475477700,0.16281206769395984,0 +8532,1475478000,0.1629764782548348,0 +8533,1475478300,0.15713525021519156,0 +8534,1475478600,0.16064991371429402,0 +8535,1475478900,0.15850947433709311,0 +8536,1475479200,0.16655938764715836,0 +8537,1475479500,0.16886113549919782,0 +8538,1475479800,0.16569700772410015,0 +8539,1475480100,0.1640684125457426,0 +8540,1475480400,0.16367444761691294,0 +8541,1475480700,0.1718856694017117,0 +8542,1475481000,0.16604754344820996,0 +8543,1475481300,0.17484195741121464,0 +8544,1475481600,0.1695373902589161,0 +8545,1475481900,0.16941951098890234,0 +8546,1475482200,0.18179683434412552,0 +8547,1475482500,0.1759587083906238,0 +8548,1475482800,0.17772689744135495,0 +8549,1475483100,0.18977850176099065,0 +8550,1475483400,0.1864034321342408,0 +8551,1475483700,0.1859164046237938,0 +8552,1475484000,0.19566315900534315,0 +8553,1475484300,0.20448238965674406,0 +8554,1475484600,0.20336563867732446,0 +8555,1475484900,0.20364482642217674,0 +8556,1475485200,0.2022488876978943,0 +8557,1475485500,0.203272576095707,0 +8558,1475485800,0.2183952456086969,0 +8559,1475486100,0.22351368759776047,0 +8560,1475486400,0.2229770267104159,0 +8561,1475486700,0.2339832280298789,0 +8562,1475487000,0.2281885312810969,0 +8563,1475487300,0.22439778212317868,0 +8564,1475487600,0.2375661374222025,0 +8565,1475487900,0.23768091460617985,0 +8566,1475488200,0.24021842099835186,0 +8567,1475488500,0.244614076936836,0 +8568,1475488800,0.2427311107020224,0 +8569,1475489100,0.2529679946801497,0 +8570,1475489400,0.2408698590696738,0 +8571,1475489700,0.2430785443400958,0 +8572,1475490000,0.2477316734210724,0 +8573,1475490300,0.2582942764347031,0 +8574,1475490600,0.2470585207472853,0 +8575,1475490900,0.25324718242500205,0 +8576,1475491200,0.262389030025975,0 +8577,1475491500,0.2555489302770415,0 +8578,1475491800,0.26294740551567963,0 +8579,1475492100,0.26169106066379183,0 +8580,1475492400,0.2627612803524448,0 +8581,1475492700,0.2650195989997645,0 +8582,1475493000,0.26843809783121303,0 +8583,1475493300,0.2716022256062056,0 +8584,1475493600,0.2857725547006613,0 +8585,1475493900,0.2765345424320345,0 +8586,1475494200,0.2863774614812272,0 +8587,1475494500,0.2893089328022287,0 +8588,1475494800,0.30087040752535993,0 +8589,1475495100,0.29000690216441194,0 +8590,1475495400,0.29977847323434714,0 +8591,1475495700,0.302405940121995,0 +8592,1475496000,0.3125962928094713,0 +8593,1475496300,0.300451625908029,0 +8594,1475496600,0.31501591993152445,0 +8595,1475496900,0.3256964022153791,0 +8596,1475497200,0.3201126473183333,0 +8597,1475497500,0.32469442841973706,0 +8598,1475497800,0.3282090919185243,0 +8599,1475498100,0.3270923409391151,0 +8600,1475498400,0.3254637457615982,0 +8601,1475498700,0.3320711890559103,0 +8602,1475499000,0.3287209361176828,0 +8603,1475499300,0.3341650971420397,0 +8604,1475499600,0.32704580964856905,0 +8605,1475499900,0.3424476669065163,0 +8606,1475500200,0.343120819579988,0 
+8607,1475500500,0.3388864721161323,0 +8608,1475500800,0.3437505430491603,0 +8609,1475501100,0.3553833657518644,0 +8610,1475501400,0.3595928965267984,0 +8611,1475501700,0.35884839587385897,0 +8612,1475502000,0.35598827253211496,0 +8613,1475502300,0.3504727635282904,0 +8614,1475502600,0.3672240282194276,0 +8615,1475502900,0.3714583756832833,0 +8616,1475503200,0.36385206067852505,0 +8617,1475503500,0.3524053631401066,0 +8618,1475503800,0.34889069964026864,0 +8619,1475504100,0.3505658261093824,0 +8620,1475504400,0.3418892914168117,0 +8621,1475504700,0.3457048572636517,0 +8622,1475505000,0.3403754734220112,0 +8623,1475505300,0.3398170979323066,0 +8624,1475505600,0.3360015320865174,0 +8625,1475505900,0.32495190156243964,0 +8626,1475506200,0.3362341885402983,0 +8627,1475506500,0.3115726044111541,0 +8628,1475506800,0.3145754237118336,0 +8629,1475507100,0.30903820010533384,0 +8630,1475507400,0.30396628940744663,0 +8631,1475507700,0.3023842255196351,0 +8632,1475508000,0.3015683768874381,0 +8633,1475508300,0.2820965827267203,0 +8634,1475508600,0.2733487000545769,0 +8635,1475508900,0.2742793258707512,0 +8636,1475509200,0.2616693460614319,0 +8637,1475509500,0.2604130012096492,0 +8638,1475509800,0.23049338121917284,0 +8639,1475510100,0.2343771929587086,0 +8640,1475510400,0.2283994731327281,0 +8641,1475510700,0.2214663108021772,0 +8642,1475511000,0.21551030557855647,0 +8643,1475511300,0.2069237313812097,0 +8644,1475511600,0.19531572536730124,0 +8645,1475511900,0.17591217709976253,0 +8646,1475512200,0.1748884887019708,0 +8647,1475512500,0.17030360551421542,0 +8648,1475512800,0.1574609692509051,0 +8649,1475513100,0.15429684147580744,0 +8650,1475513400,0.15888172466356282,0 +8651,1475513700,0.138615796473113,0 +8652,1475514000,0.13214794705054406,0 +8653,1475514300,0.13594179829460384,0 +8654,1475514600,0.12181800049090435,0 +8655,1475514900,0.11532843646597553,0 +8656,1475515200,0.1120929607117254,0 +8657,1475515500,0.10830221155380708,0 +8658,1475515800,0.10267192536590007,0 +8659,1475516100,0.09199144308246578,0 +8660,1475516400,0.09143306759276118,0 +8661,1475516700,0.0912717591173972,0 +8662,1475517000,0.08331490838884432,0 +8663,1475517300,0.08475737840417719,0 +8664,1475517600,0.07612427291590597,0 +8665,1475517900,0.07358986861008568,0 +8666,1475518200,0.07002867381970168,0 +8667,1475518500,0.07005349050862336,0 +8668,1475518800,0.06993561123860964,0 +8669,1475519100,0.06807435960626106,0 +8670,1475519400,0.06982083405484249,0 +8671,1475519700,0.06872579767705768,0 +8672,1475520000,0.06081547824010159,0 +8673,1475520300,0.05834931982645161,0 +8674,1475520600,0.05930476233154761,0 +8675,1475520900,0.05646635359247872,0 +8676,1475521200,0.053395288398578135,0 +8677,1475521500,0.05392884720041181,0 +8678,1475521800,0.051602282659450716,0 +8679,1475522100,0.05092912998597899,0 +8680,1475522400,0.050066750063025835,0 +8681,1475522700,0.05111525514921381,0 +8682,1475523000,0.04983409360924496,0 +8683,1475523300,0.0467630284153444,0 +8684,1475523600,0.04764712294097267,0 +8685,1475523900,0.04702050155804696,0 +8686,1475524200,0.04380984249250836,0 +8687,1475524500,0.04666996583425236,0 +8688,1475524800,0.045298843798387216,0 +8689,1475525100,0.045553214854843226,0 +8690,1475525400,0.04478699959922865,0 +8691,1475525700,0.04318011902333607,0 +8692,1475526000,0.04504137065568466,0 +8693,1475526300,0.0454601522727004,0 +8694,1475526600,0.043902905073600375,0 +8695,1475526900,0.041808996987470905,0 +8696,1475527200,0.04257521224203466,0 +8697,1475527500,0.04108621093615582,0 
+8698,1475527800,0.041551523844768366,0 +8699,1475528100,0.04197030546178409,0 +8700,1475528400,0.04608987574187265,0 +8701,1475528700,0.04257521224203466,0 +8702,1475529000,0.044228624109524085,0 +8703,1475529300,0.04273962280364522,0 +8704,1475529600,0.0454601522727004,0 +8705,1475529900,0.04399596765574322,0 +8706,1475530200,0.04373849451304064,0 +8707,1475530500,0.04653037196156353,0 +8708,1475530800,0.04727487261450297,0 +8709,1475531100,0.04969449973655612,0 +8710,1475531400,0.052346783312390135,0 +8711,1475531700,0.05490600430713215,0 +8712,1475532000,0.05937300822476872,0 +8713,1475532300,0.06000273169394104,0 +8714,1475532600,0.0673081443506465,0 +8715,1475532900,0.07182167955882914,0 +8716,1475533200,0.07900921294657168,0 +8717,1475533500,0.08422071751714774,0 +8718,1475533800,0.08508309744010087,0 +8719,1475534100,0.09389922600525527,0 +8720,1475534400,0.09038766459176896,0 +8721,1475534700,0.10285805052913492,0 +8722,1475535000,0.10360255118207436,0 +8723,1475535300,0.10913977478836394,0 +8724,1475535600,0.1112553974771685,0 +8725,1475535900,0.11593334324654135,0 +8726,1475536200,0.12603063335219006,0 +8727,1475536500,0.12761269723968632,0 +8728,1475536800,0.1336834796472842,0 +8729,1475537100,0.14617868227262615,0 +8730,1475537400,0.14199086609973674,0 +8731,1475537700,0.14903880561437013,0 +8732,1475538000,0.1516910891905194,0 +8733,1475538300,0.16262594253072496,0 +8734,1475538600,0.15690259376109547,0 +8735,1475538900,0.1707006725291865,0 +8736,1475539200,0.16974523002451086,0 +8737,1475539500,0.17491020330433069,0 +8738,1475539800,0.17400439417665778,0 +8739,1475540100,0.18689356173082933,0 +8740,1475540400,0.19103484661284686,0 +8741,1475540700,0.19103484661284686,0 +8742,1475541000,0.1991778225044771,0 +8743,1475541300,0.20071335510118568,0 +8744,1475541600,0.20792570517664155,0 +8745,1475541900,0.21101848497241865,0 +8746,1475542200,0.21416089814505126,0 +8747,1475542500,0.2257937208474401,0 +8748,1475542800,0.24287070457450105,0 +8749,1475543100,0.2380314503302897,0 +8750,1475543400,0.2489663036704952,0 +8751,1475543700,0.2555489302770415,0 +8752,1475544000,0.2535946160629704,0 +8753,1475544300,0.2590853083785038,0 +8754,1475544600,0.2673213468518039,0 +8755,1475544900,0.2681123787954995,0 +8756,1475545200,0.2762088233963209,0 +8757,1475545500,0.2662045958723947,0 +8758,1475545800,0.2700915096979669,0 +8759,1475546100,0.2864922386652045,0 +8760,1475546400,0.2939372451947039,0 +8761,1475546700,0.2877268689147324,0 +8762,1475547000,0.2870754308434104,0 +8763,1475547300,0.29945275419863354,0 +8764,1475547600,0.2889583970781189,0 +8765,1475547900,0.29738056071465385,0 +8766,1475548200,0.3018723813207918,0 +8767,1475548500,0.3191354902105622,0 +8768,1475548800,0.3033613826261452,0 +8769,1475549100,0.30647897911059185,0 +8770,1475549400,0.30010419226995555,0 +8771,1475549700,0.31322601627864355,0 +8772,1475550000,0.297172720948954,0 +8773,1475550300,0.30457119618769724,0 +8774,1475550600,0.29947446880099343,0 +8775,1475550900,0.3089668521258661,0 +8776,1475551200,0.29372940542910914,0 +8777,1475551500,0.2971479042605577,0 +8778,1475551800,0.2978706903111373,0 +8779,1475552100,0.3058492556414195,0 +8780,1475552400,0.30889860623264503,0 +8781,1475552700,0.3077818552532359,0 +8782,1475553000,0.309059914708009,0 +8783,1475553300,0.30985094665149443,0 diff --git a/datasets/anomaly_reserve/kpi/SCORE/problem_TEST/dataSplits.csv b/datasets/anomaly_reserve/kpi/SCORE/problem_TEST/dataSplits.csv new file mode 100644 index 0000000..1f92bd4 --- /dev/null +++ 
b/datasets/anomaly_reserve/kpi/SCORE/problem_TEST/dataSplits.csv @@ -0,0 +1,7028 @@ +d3mIndex,type,repeat,fold +7027,TEST,0,0 +7028,TEST,0,0 +7029,TEST,0,0 +7030,TEST,0,0 +7031,TEST,0,0 +7032,TEST,0,0 +7033,TEST,0,0 +7034,TEST,0,0 +7035,TEST,0,0 +7036,TEST,0,0 +7037,TEST,0,0 +7038,TEST,0,0 +7039,TEST,0,0 +7040,TEST,0,0 +7041,TEST,0,0 +7042,TEST,0,0 +7043,TEST,0,0 +7044,TEST,0,0 +7045,TEST,0,0 +7046,TEST,0,0 +7047,TEST,0,0 +7048,TEST,0,0 +7049,TEST,0,0 +7050,TEST,0,0 +7051,TEST,0,0 +7052,TEST,0,0 +7053,TEST,0,0 +7054,TEST,0,0 +7055,TEST,0,0 +7056,TEST,0,0 +7057,TEST,0,0 +7058,TEST,0,0 +7059,TEST,0,0 +7060,TEST,0,0 +7061,TEST,0,0 +7062,TEST,0,0 +7063,TEST,0,0 +7064,TEST,0,0 +7065,TEST,0,0 +7066,TEST,0,0 +7067,TEST,0,0 +7068,TEST,0,0 +7069,TEST,0,0 +7070,TEST,0,0 +7071,TEST,0,0 +7072,TEST,0,0 +7073,TEST,0,0 +7074,TEST,0,0 +7075,TEST,0,0 +7076,TEST,0,0 +7077,TEST,0,0 +7078,TEST,0,0 +7079,TEST,0,0 +7080,TEST,0,0 +7081,TEST,0,0 +7082,TEST,0,0 +7083,TEST,0,0 +7084,TEST,0,0 +7085,TEST,0,0 +7086,TEST,0,0 +7087,TEST,0,0 +7088,TEST,0,0 +7089,TEST,0,0 +7090,TEST,0,0 +7091,TEST,0,0 +7092,TEST,0,0 +7093,TEST,0,0 +7094,TEST,0,0 +7095,TEST,0,0 +7096,TEST,0,0 +7097,TEST,0,0 +7098,TEST,0,0 +7099,TEST,0,0 +7100,TEST,0,0 +7101,TEST,0,0 +7102,TEST,0,0 +7103,TEST,0,0 +7104,TEST,0,0 +7105,TEST,0,0 +7106,TEST,0,0 +7107,TEST,0,0 +7108,TEST,0,0 +7109,TEST,0,0 +7110,TEST,0,0 +7111,TEST,0,0 +7112,TEST,0,0 +7113,TEST,0,0 +7114,TEST,0,0 +7115,TEST,0,0 +7116,TEST,0,0 +7117,TEST,0,0 +7118,TEST,0,0 +7119,TEST,0,0 +7120,TEST,0,0 +7121,TEST,0,0 +7122,TEST,0,0 +7123,TEST,0,0 +7124,TEST,0,0 +7125,TEST,0,0 +7126,TEST,0,0 +7127,TEST,0,0 +7128,TEST,0,0 +7129,TEST,0,0 +7130,TEST,0,0 +7131,TEST,0,0 +7132,TEST,0,0 +7133,TEST,0,0 +7134,TEST,0,0 +7135,TEST,0,0 +7136,TEST,0,0 +7137,TEST,0,0 +7138,TEST,0,0 +7139,TEST,0,0 +7140,TEST,0,0 +7141,TEST,0,0 +7142,TEST,0,0 +7143,TEST,0,0 +7144,TEST,0,0 +7145,TEST,0,0 +7146,TEST,0,0 +7147,TEST,0,0 +7148,TEST,0,0 +7149,TEST,0,0 +7150,TEST,0,0 +7151,TEST,0,0 +7152,TEST,0,0 +7153,TEST,0,0 +7154,TEST,0,0 +7155,TEST,0,0 +7156,TEST,0,0 +7157,TEST,0,0 +7158,TEST,0,0 +7159,TEST,0,0 +7160,TEST,0,0 +7161,TEST,0,0 +7162,TEST,0,0 +7163,TEST,0,0 +7164,TEST,0,0 +7165,TEST,0,0 +7166,TEST,0,0 +7167,TEST,0,0 +7168,TEST,0,0 +7169,TEST,0,0 +7170,TEST,0,0 +7171,TEST,0,0 +7172,TEST,0,0 +7173,TEST,0,0 +7174,TEST,0,0 +7175,TEST,0,0 +7176,TEST,0,0 +7177,TEST,0,0 +7178,TEST,0,0 +7179,TEST,0,0 +7180,TEST,0,0 +7181,TEST,0,0 +7182,TEST,0,0 +7183,TEST,0,0 +7184,TEST,0,0 +7185,TEST,0,0 +7186,TEST,0,0 +7187,TEST,0,0 +7188,TEST,0,0 +7189,TEST,0,0 +7190,TEST,0,0 +7191,TEST,0,0 +7192,TEST,0,0 +7193,TEST,0,0 +7194,TEST,0,0 +7195,TEST,0,0 +7196,TEST,0,0 +7197,TEST,0,0 +7198,TEST,0,0 +7199,TEST,0,0 +7200,TEST,0,0 +7201,TEST,0,0 +7202,TEST,0,0 +7203,TEST,0,0 +7204,TEST,0,0 +7205,TEST,0,0 +7206,TEST,0,0 +7207,TEST,0,0 +7208,TEST,0,0 +7209,TEST,0,0 +7210,TEST,0,0 +7211,TEST,0,0 +7212,TEST,0,0 +7213,TEST,0,0 +7214,TEST,0,0 +7215,TEST,0,0 +7216,TEST,0,0 +7217,TEST,0,0 +7218,TEST,0,0 +7219,TEST,0,0 +7220,TEST,0,0 +7221,TEST,0,0 +7222,TEST,0,0 +7223,TEST,0,0 +7224,TEST,0,0 +7225,TEST,0,0 +7226,TEST,0,0 +7227,TEST,0,0 +7228,TEST,0,0 +7229,TEST,0,0 +7230,TEST,0,0 +7231,TEST,0,0 +7232,TEST,0,0 +7233,TEST,0,0 +7234,TEST,0,0 +7235,TEST,0,0 +7236,TEST,0,0 +7237,TEST,0,0 +7238,TEST,0,0 +7239,TEST,0,0 +7240,TEST,0,0 +7241,TEST,0,0 +7242,TEST,0,0 +7243,TEST,0,0 +7244,TEST,0,0 +7245,TEST,0,0 +7246,TEST,0,0 +7247,TEST,0,0 +7248,TEST,0,0 +7249,TEST,0,0 +7250,TEST,0,0 +7251,TEST,0,0 +7252,TEST,0,0 +7253,TEST,0,0 +7254,TEST,0,0 +7255,TEST,0,0 
+7256,TEST,0,0 +7257,TEST,0,0 +7258,TEST,0,0 +7259,TEST,0,0 +7260,TEST,0,0 +7261,TEST,0,0 +7262,TEST,0,0 +7263,TEST,0,0 +7264,TEST,0,0 +7265,TEST,0,0 +7266,TEST,0,0 +7267,TEST,0,0 +7268,TEST,0,0 +7269,TEST,0,0 +7270,TEST,0,0 +7271,TEST,0,0 +7272,TEST,0,0 +7273,TEST,0,0 +7274,TEST,0,0 +7275,TEST,0,0 +7276,TEST,0,0 +7277,TEST,0,0 +7278,TEST,0,0 +7279,TEST,0,0 +7280,TEST,0,0 +7281,TEST,0,0 +7282,TEST,0,0 +7283,TEST,0,0 +7284,TEST,0,0 +7285,TEST,0,0 +7286,TEST,0,0 +7287,TEST,0,0 +7288,TEST,0,0 +7289,TEST,0,0 +7290,TEST,0,0 +7291,TEST,0,0 +7292,TEST,0,0 +7293,TEST,0,0 +7294,TEST,0,0 +7295,TEST,0,0 +7296,TEST,0,0 +7297,TEST,0,0 +7298,TEST,0,0 +7299,TEST,0,0 +7300,TEST,0,0 +7301,TEST,0,0 +7302,TEST,0,0 +7303,TEST,0,0 +7304,TEST,0,0 +7305,TEST,0,0 +7306,TEST,0,0 +7307,TEST,0,0 +7308,TEST,0,0 +7309,TEST,0,0 +7310,TEST,0,0 +7311,TEST,0,0 +7312,TEST,0,0 +7313,TEST,0,0 +7314,TEST,0,0 +7315,TEST,0,0 +7316,TEST,0,0 +7317,TEST,0,0 +7318,TEST,0,0 +7319,TEST,0,0 +7320,TEST,0,0 +7321,TEST,0,0 +7322,TEST,0,0 +7323,TEST,0,0 +7324,TEST,0,0 +7325,TEST,0,0 +7326,TEST,0,0 +7327,TEST,0,0 +7328,TEST,0,0 +7329,TEST,0,0 +7330,TEST,0,0 +7331,TEST,0,0 +7332,TEST,0,0 +7333,TEST,0,0 +7334,TEST,0,0 +7335,TEST,0,0 +7336,TEST,0,0 +7337,TEST,0,0 +7338,TEST,0,0 +7339,TEST,0,0 +7340,TEST,0,0 +7341,TEST,0,0 +7342,TEST,0,0 +7343,TEST,0,0 +7344,TEST,0,0 +7345,TEST,0,0 +7346,TEST,0,0 +7347,TEST,0,0 +7348,TEST,0,0 +7349,TEST,0,0 +7350,TEST,0,0 +7351,TEST,0,0 +7352,TEST,0,0 +7353,TEST,0,0 +7354,TEST,0,0 +7355,TEST,0,0 +7356,TEST,0,0 +7357,TEST,0,0 +7358,TEST,0,0 +7359,TEST,0,0 +7360,TEST,0,0 +7361,TEST,0,0 +7362,TEST,0,0 +7363,TEST,0,0 +7364,TEST,0,0 +7365,TEST,0,0 +7366,TEST,0,0 +7367,TEST,0,0 +7368,TEST,0,0 +7369,TEST,0,0 +7370,TEST,0,0 +7371,TEST,0,0 +7372,TEST,0,0 +7373,TEST,0,0 +7374,TEST,0,0 +7375,TEST,0,0 +7376,TEST,0,0 +7377,TEST,0,0 +7378,TEST,0,0 +7379,TEST,0,0 +7380,TEST,0,0 +7381,TEST,0,0 +7382,TEST,0,0 +7383,TEST,0,0 +7384,TEST,0,0 +7385,TEST,0,0 +7386,TEST,0,0 +7387,TEST,0,0 +7388,TEST,0,0 +7389,TEST,0,0 +7390,TEST,0,0 +7391,TEST,0,0 +7392,TEST,0,0 +7393,TEST,0,0 +7394,TEST,0,0 +7395,TEST,0,0 +7396,TEST,0,0 +7397,TEST,0,0 +7398,TEST,0,0 +7399,TEST,0,0 +7400,TEST,0,0 +7401,TEST,0,0 +7402,TEST,0,0 +7403,TEST,0,0 +7404,TEST,0,0 +7405,TEST,0,0 +7406,TEST,0,0 +7407,TEST,0,0 +7408,TEST,0,0 +7409,TEST,0,0 +7410,TEST,0,0 +7411,TEST,0,0 +7412,TEST,0,0 +7413,TEST,0,0 +7414,TEST,0,0 +7415,TEST,0,0 +7416,TEST,0,0 +7417,TEST,0,0 +7418,TEST,0,0 +7419,TEST,0,0 +7420,TEST,0,0 +7421,TEST,0,0 +7422,TEST,0,0 +7423,TEST,0,0 +7424,TEST,0,0 +7425,TEST,0,0 +7426,TEST,0,0 +7427,TEST,0,0 +7428,TEST,0,0 +7429,TEST,0,0 +7430,TEST,0,0 +7431,TEST,0,0 +7432,TEST,0,0 +7433,TEST,0,0 +7434,TEST,0,0 +7435,TEST,0,0 +7436,TEST,0,0 +7437,TEST,0,0 +7438,TEST,0,0 +7439,TEST,0,0 +7440,TEST,0,0 +7441,TEST,0,0 +7442,TEST,0,0 +7443,TEST,0,0 +7444,TEST,0,0 +7445,TEST,0,0 +7446,TEST,0,0 +7447,TEST,0,0 +7448,TEST,0,0 +7449,TEST,0,0 +7450,TEST,0,0 +7451,TEST,0,0 +7452,TEST,0,0 +7453,TEST,0,0 +7454,TEST,0,0 +7455,TEST,0,0 +7456,TEST,0,0 +7457,TEST,0,0 +7458,TEST,0,0 +7459,TEST,0,0 +7460,TEST,0,0 +7461,TEST,0,0 +7462,TEST,0,0 +7463,TEST,0,0 +7464,TEST,0,0 +7465,TEST,0,0 +7466,TEST,0,0 +7467,TEST,0,0 +7468,TEST,0,0 +7469,TEST,0,0 +7470,TEST,0,0 +7471,TEST,0,0 +7472,TEST,0,0 +7473,TEST,0,0 +7474,TEST,0,0 +7475,TEST,0,0 +7476,TEST,0,0 +7477,TEST,0,0 +7478,TEST,0,0 +7479,TEST,0,0 +7480,TEST,0,0 +7481,TEST,0,0 +7482,TEST,0,0 +7483,TEST,0,0 +7484,TEST,0,0 +7485,TEST,0,0 +7486,TEST,0,0 +7487,TEST,0,0 +7488,TEST,0,0 +7489,TEST,0,0 +7490,TEST,0,0 +7491,TEST,0,0 +7492,TEST,0,0 
+7493,TEST,0,0 +7494,TEST,0,0 +7495,TEST,0,0 +7496,TEST,0,0 +7497,TEST,0,0 +7498,TEST,0,0 +7499,TEST,0,0 +7500,TEST,0,0 +7501,TEST,0,0 +7502,TEST,0,0 +7503,TEST,0,0 +7504,TEST,0,0 +7505,TEST,0,0 +7506,TEST,0,0 +7507,TEST,0,0 +7508,TEST,0,0 +7509,TEST,0,0 +7510,TEST,0,0 +7511,TEST,0,0 +7512,TEST,0,0 +7513,TEST,0,0 +7514,TEST,0,0 +7515,TEST,0,0 +7516,TEST,0,0 +7517,TEST,0,0 +7518,TEST,0,0 +7519,TEST,0,0 +7520,TEST,0,0 +7521,TEST,0,0 +7522,TEST,0,0 +7523,TEST,0,0 +7524,TEST,0,0 +7525,TEST,0,0 +7526,TEST,0,0 +7527,TEST,0,0 +7528,TEST,0,0 +7529,TEST,0,0 +7530,TEST,0,0 +7531,TEST,0,0 +7532,TEST,0,0 +7533,TEST,0,0 +7534,TEST,0,0 +7535,TEST,0,0 +7536,TEST,0,0 +7537,TEST,0,0 +7538,TEST,0,0 +7539,TEST,0,0 +7540,TEST,0,0 +7541,TEST,0,0 +7542,TEST,0,0 +7543,TEST,0,0 +7544,TEST,0,0 +7545,TEST,0,0 +7546,TEST,0,0 +7547,TEST,0,0 +7548,TEST,0,0 +7549,TEST,0,0 +7550,TEST,0,0 +7551,TEST,0,0 +7552,TEST,0,0 +7553,TEST,0,0 +7554,TEST,0,0 +7555,TEST,0,0 +7556,TEST,0,0 +7557,TEST,0,0 +7558,TEST,0,0 +7559,TEST,0,0 +7560,TEST,0,0 +7561,TEST,0,0 +7562,TEST,0,0 +7563,TEST,0,0 +7564,TEST,0,0 +7565,TEST,0,0 +7566,TEST,0,0 +7567,TEST,0,0 +7568,TEST,0,0 +7569,TEST,0,0 +7570,TEST,0,0 +7571,TEST,0,0 +7572,TEST,0,0 +7573,TEST,0,0 +7574,TEST,0,0 +7575,TEST,0,0 +7576,TEST,0,0 +7577,TEST,0,0 +7578,TEST,0,0 +7579,TEST,0,0 +7580,TEST,0,0 +7581,TEST,0,0 +7582,TEST,0,0 +7583,TEST,0,0 +7584,TEST,0,0 +7585,TEST,0,0 +7586,TEST,0,0 +7587,TEST,0,0 +7588,TEST,0,0 +7589,TEST,0,0 +7590,TEST,0,0 +7591,TEST,0,0 +7592,TEST,0,0 +7593,TEST,0,0 +7594,TEST,0,0 +7595,TEST,0,0 +7596,TEST,0,0 +7597,TEST,0,0 +7598,TEST,0,0 +7599,TEST,0,0 +7600,TEST,0,0 +7601,TEST,0,0 +7602,TEST,0,0 +7603,TEST,0,0 +7604,TEST,0,0 +7605,TEST,0,0 +7606,TEST,0,0 +7607,TEST,0,0 +7608,TEST,0,0 +7609,TEST,0,0 +7610,TEST,0,0 +7611,TEST,0,0 +7612,TEST,0,0 +7613,TEST,0,0 +7614,TEST,0,0 +7615,TEST,0,0 +7616,TEST,0,0 +7617,TEST,0,0 +7618,TEST,0,0 +7619,TEST,0,0 +7620,TEST,0,0 +7621,TEST,0,0 +7622,TEST,0,0 +7623,TEST,0,0 +7624,TEST,0,0 +7625,TEST,0,0 +7626,TEST,0,0 +7627,TEST,0,0 +7628,TEST,0,0 +7629,TEST,0,0 +7630,TEST,0,0 +7631,TEST,0,0 +7632,TEST,0,0 +7633,TEST,0,0 +7634,TEST,0,0 +7635,TEST,0,0 +7636,TEST,0,0 +7637,TEST,0,0 +7638,TEST,0,0 +7639,TEST,0,0 +7640,TEST,0,0 +7641,TEST,0,0 +7642,TEST,0,0 +7643,TEST,0,0 +7644,TEST,0,0 +7645,TEST,0,0 +7646,TEST,0,0 +7647,TEST,0,0 +7648,TEST,0,0 +7649,TEST,0,0 +7650,TEST,0,0 +7651,TEST,0,0 +7652,TEST,0,0 +7653,TEST,0,0 +7654,TEST,0,0 +7655,TEST,0,0 +7656,TEST,0,0 +7657,TEST,0,0 +7658,TEST,0,0 +7659,TEST,0,0 +7660,TEST,0,0 +7661,TEST,0,0 +7662,TEST,0,0 +7663,TEST,0,0 +7664,TEST,0,0 +7665,TEST,0,0 +7666,TEST,0,0 +7667,TEST,0,0 +7668,TEST,0,0 +7669,TEST,0,0 +7670,TEST,0,0 +7671,TEST,0,0 +7672,TEST,0,0 +7673,TEST,0,0 +7674,TEST,0,0 +7675,TEST,0,0 +7676,TEST,0,0 +7677,TEST,0,0 +7678,TEST,0,0 +7679,TEST,0,0 +7680,TEST,0,0 +7681,TEST,0,0 +7682,TEST,0,0 +7683,TEST,0,0 +7684,TEST,0,0 +7685,TEST,0,0 +7686,TEST,0,0 +7687,TEST,0,0 +7688,TEST,0,0 +7689,TEST,0,0 +7690,TEST,0,0 +7691,TEST,0,0 +7692,TEST,0,0 +7693,TEST,0,0 +7694,TEST,0,0 +7695,TEST,0,0 +7696,TEST,0,0 +7697,TEST,0,0 +7698,TEST,0,0 +7699,TEST,0,0 +7700,TEST,0,0 +7701,TEST,0,0 +7702,TEST,0,0 +7703,TEST,0,0 +7704,TEST,0,0 +7705,TEST,0,0 +7706,TEST,0,0 +7707,TEST,0,0 +7708,TEST,0,0 +7709,TEST,0,0 +7710,TEST,0,0 +7711,TEST,0,0 +7712,TEST,0,0 +7713,TEST,0,0 +7714,TEST,0,0 +7715,TEST,0,0 +7716,TEST,0,0 +7717,TEST,0,0 +7718,TEST,0,0 +7719,TEST,0,0 +7720,TEST,0,0 +7721,TEST,0,0 +7722,TEST,0,0 +7723,TEST,0,0 +7724,TEST,0,0 +7725,TEST,0,0 +7726,TEST,0,0 +7727,TEST,0,0 +7728,TEST,0,0 +7729,TEST,0,0 
+7730,TEST,0,0 +7731,TEST,0,0 +7732,TEST,0,0 +7733,TEST,0,0 +7734,TEST,0,0 +7735,TEST,0,0 +7736,TEST,0,0 +7737,TEST,0,0 +7738,TEST,0,0 +7739,TEST,0,0 +7740,TEST,0,0 +7741,TEST,0,0 +7742,TEST,0,0 +7743,TEST,0,0 +7744,TEST,0,0 +7745,TEST,0,0 +7746,TEST,0,0 +7747,TEST,0,0 +7748,TEST,0,0 +7749,TEST,0,0 +7750,TEST,0,0 +7751,TEST,0,0 +7752,TEST,0,0 +7753,TEST,0,0 +7754,TEST,0,0 +7755,TEST,0,0 +7756,TEST,0,0 +7757,TEST,0,0 +7758,TEST,0,0 +7759,TEST,0,0 +7760,TEST,0,0 +7761,TEST,0,0 +7762,TEST,0,0 +7763,TEST,0,0 +7764,TEST,0,0 +7765,TEST,0,0 +7766,TEST,0,0 +7767,TEST,0,0 +7768,TEST,0,0 +7769,TEST,0,0 +7770,TEST,0,0 +7771,TEST,0,0 +7772,TEST,0,0 +7773,TEST,0,0 +7774,TEST,0,0 +7775,TEST,0,0 +7776,TEST,0,0 +7777,TEST,0,0 +7778,TEST,0,0 +7779,TEST,0,0 +7780,TEST,0,0 +7781,TEST,0,0 +7782,TEST,0,0 +7783,TEST,0,0 +7784,TEST,0,0 +7785,TEST,0,0 +7786,TEST,0,0 +7787,TEST,0,0 +7788,TEST,0,0 +7789,TEST,0,0 +7790,TEST,0,0 +7791,TEST,0,0 +7792,TEST,0,0 +7793,TEST,0,0 +7794,TEST,0,0 +7795,TEST,0,0 +7796,TEST,0,0 +7797,TEST,0,0 +7798,TEST,0,0 +7799,TEST,0,0 +7800,TEST,0,0 +7801,TEST,0,0 +7802,TEST,0,0 +7803,TEST,0,0 +7804,TEST,0,0 +7805,TEST,0,0 +7806,TEST,0,0 +7807,TEST,0,0 +7808,TEST,0,0 +7809,TEST,0,0 +7810,TEST,0,0 +7811,TEST,0,0 +7812,TEST,0,0 +7813,TEST,0,0 +7814,TEST,0,0 +7815,TEST,0,0 +7816,TEST,0,0 +7817,TEST,0,0 +7818,TEST,0,0 +7819,TEST,0,0 +7820,TEST,0,0 +7821,TEST,0,0 +7822,TEST,0,0 +7823,TEST,0,0 +7824,TEST,0,0 +7825,TEST,0,0 +7826,TEST,0,0 +7827,TEST,0,0 +7828,TEST,0,0 +7829,TEST,0,0 +7830,TEST,0,0 +7831,TEST,0,0 +7832,TEST,0,0 +7833,TEST,0,0 +7834,TEST,0,0 +7835,TEST,0,0 +7836,TEST,0,0 +7837,TEST,0,0 +7838,TEST,0,0 +7839,TEST,0,0 +7840,TEST,0,0 +7841,TEST,0,0 +7842,TEST,0,0 +7843,TEST,0,0 +7844,TEST,0,0 +7845,TEST,0,0 +7846,TEST,0,0 +7847,TEST,0,0 +7848,TEST,0,0 +7849,TEST,0,0 +7850,TEST,0,0 +7851,TEST,0,0 +7852,TEST,0,0 +7853,TEST,0,0 +7854,TEST,0,0 +7855,TEST,0,0 +7856,TEST,0,0 +7857,TEST,0,0 +7858,TEST,0,0 +7859,TEST,0,0 +7860,TEST,0,0 +7861,TEST,0,0 +7862,TEST,0,0 +7863,TEST,0,0 +7864,TEST,0,0 +7865,TEST,0,0 +7866,TEST,0,0 +7867,TEST,0,0 +7868,TEST,0,0 +7869,TEST,0,0 +7870,TEST,0,0 +7871,TEST,0,0 +7872,TEST,0,0 +7873,TEST,0,0 +7874,TEST,0,0 +7875,TEST,0,0 +7876,TEST,0,0 +7877,TEST,0,0 +7878,TEST,0,0 +7879,TEST,0,0 +7880,TEST,0,0 +7881,TEST,0,0 +7882,TEST,0,0 +7883,TEST,0,0 +7884,TEST,0,0 +7885,TEST,0,0 +7886,TEST,0,0 +7887,TEST,0,0 +7888,TEST,0,0 +7889,TEST,0,0 +7890,TEST,0,0 +7891,TEST,0,0 +7892,TEST,0,0 +7893,TEST,0,0 +7894,TEST,0,0 +7895,TEST,0,0 +7896,TEST,0,0 +7897,TEST,0,0 +7898,TEST,0,0 +7899,TEST,0,0 +7900,TEST,0,0 +7901,TEST,0,0 +7902,TEST,0,0 +7903,TEST,0,0 +7904,TEST,0,0 +7905,TEST,0,0 +7906,TEST,0,0 +7907,TEST,0,0 +7908,TEST,0,0 +7909,TEST,0,0 +7910,TEST,0,0 +7911,TEST,0,0 +7912,TEST,0,0 +7913,TEST,0,0 +7914,TEST,0,0 +7915,TEST,0,0 +7916,TEST,0,0 +7917,TEST,0,0 +7918,TEST,0,0 +7919,TEST,0,0 +7920,TEST,0,0 +7921,TEST,0,0 +7922,TEST,0,0 +7923,TEST,0,0 +7924,TEST,0,0 +7925,TEST,0,0 +7926,TEST,0,0 +7927,TEST,0,0 +7928,TEST,0,0 +7929,TEST,0,0 +7930,TEST,0,0 +7931,TEST,0,0 +7932,TEST,0,0 +7933,TEST,0,0 +7934,TEST,0,0 +7935,TEST,0,0 +7936,TEST,0,0 +7937,TEST,0,0 +7938,TEST,0,0 +7939,TEST,0,0 +7940,TEST,0,0 +7941,TEST,0,0 +7942,TEST,0,0 +7943,TEST,0,0 +7944,TEST,0,0 +7945,TEST,0,0 +7946,TEST,0,0 +7947,TEST,0,0 +7948,TEST,0,0 +7949,TEST,0,0 +7950,TEST,0,0 +7951,TEST,0,0 +7952,TEST,0,0 +7953,TEST,0,0 +7954,TEST,0,0 +7955,TEST,0,0 +7956,TEST,0,0 +7957,TEST,0,0 +7958,TEST,0,0 +7959,TEST,0,0 +7960,TEST,0,0 +7961,TEST,0,0 +7962,TEST,0,0 +7963,TEST,0,0 +7964,TEST,0,0 +7965,TEST,0,0 +7966,TEST,0,0 
+7967,TEST,0,0 +7968,TEST,0,0 +7969,TEST,0,0 +7970,TEST,0,0 +7971,TEST,0,0 +7972,TEST,0,0 +7973,TEST,0,0 +7974,TEST,0,0 +7975,TEST,0,0 +7976,TEST,0,0 +7977,TEST,0,0 +7978,TEST,0,0 +7979,TEST,0,0 +7980,TEST,0,0 +7981,TEST,0,0 +7982,TEST,0,0 +7983,TEST,0,0 +7984,TEST,0,0 +7985,TEST,0,0 +7986,TEST,0,0 +7987,TEST,0,0 +7988,TEST,0,0 +7989,TEST,0,0 +7990,TEST,0,0 +7991,TEST,0,0 +7992,TEST,0,0 +7993,TEST,0,0 +7994,TEST,0,0 +7995,TEST,0,0 +7996,TEST,0,0 +7997,TEST,0,0 +7998,TEST,0,0 +7999,TEST,0,0 +8000,TEST,0,0 +8001,TEST,0,0 +8002,TEST,0,0 +8003,TEST,0,0 +8004,TEST,0,0 +8005,TEST,0,0 +8006,TEST,0,0 +8007,TEST,0,0 +8008,TEST,0,0 +8009,TEST,0,0 +8010,TEST,0,0 +8011,TEST,0,0 +8012,TEST,0,0 +8013,TEST,0,0 +8014,TEST,0,0 +8015,TEST,0,0 +8016,TEST,0,0 +8017,TEST,0,0 +8018,TEST,0,0 +8019,TEST,0,0 +8020,TEST,0,0 +8021,TEST,0,0 +8022,TEST,0,0 +8023,TEST,0,0 +8024,TEST,0,0 +8025,TEST,0,0 +8026,TEST,0,0 +8027,TEST,0,0 +8028,TEST,0,0 +8029,TEST,0,0 +8030,TEST,0,0 +8031,TEST,0,0 +8032,TEST,0,0 +8033,TEST,0,0 +8034,TEST,0,0 +8035,TEST,0,0 +8036,TEST,0,0 +8037,TEST,0,0 +8038,TEST,0,0 +8039,TEST,0,0 +8040,TEST,0,0 +8041,TEST,0,0 +8042,TEST,0,0 +8043,TEST,0,0 +8044,TEST,0,0 +8045,TEST,0,0 +8046,TEST,0,0 +8047,TEST,0,0 +8048,TEST,0,0 +8049,TEST,0,0 +8050,TEST,0,0 +8051,TEST,0,0 +8052,TEST,0,0 +8053,TEST,0,0 +8054,TEST,0,0 +8055,TEST,0,0 +8056,TEST,0,0 +8057,TEST,0,0 +8058,TEST,0,0 +8059,TEST,0,0 +8060,TEST,0,0 +8061,TEST,0,0 +8062,TEST,0,0 +8063,TEST,0,0 +8064,TEST,0,0 +8065,TEST,0,0 +8066,TEST,0,0 +8067,TEST,0,0 +8068,TEST,0,0 +8069,TEST,0,0 +8070,TEST,0,0 +8071,TEST,0,0 +8072,TEST,0,0 +8073,TEST,0,0 +8074,TEST,0,0 +8075,TEST,0,0 +8076,TEST,0,0 +8077,TEST,0,0 +8078,TEST,0,0 +8079,TEST,0,0 +8080,TEST,0,0 +8081,TEST,0,0 +8082,TEST,0,0 +8083,TEST,0,0 +8084,TEST,0,0 +8085,TEST,0,0 +8086,TEST,0,0 +8087,TEST,0,0 +8088,TEST,0,0 +8089,TEST,0,0 +8090,TEST,0,0 +8091,TEST,0,0 +8092,TEST,0,0 +8093,TEST,0,0 +8094,TEST,0,0 +8095,TEST,0,0 +8096,TEST,0,0 +8097,TEST,0,0 +8098,TEST,0,0 +8099,TEST,0,0 +8100,TEST,0,0 +8101,TEST,0,0 +8102,TEST,0,0 +8103,TEST,0,0 +8104,TEST,0,0 +8105,TEST,0,0 +8106,TEST,0,0 +8107,TEST,0,0 +8108,TEST,0,0 +8109,TEST,0,0 +8110,TEST,0,0 +8111,TEST,0,0 +8112,TEST,0,0 +8113,TEST,0,0 +8114,TEST,0,0 +8115,TEST,0,0 +8116,TEST,0,0 +8117,TEST,0,0 +8118,TEST,0,0 +8119,TEST,0,0 +8120,TEST,0,0 +8121,TEST,0,0 +8122,TEST,0,0 +8123,TEST,0,0 +8124,TEST,0,0 +8125,TEST,0,0 +8126,TEST,0,0 +8127,TEST,0,0 +8128,TEST,0,0 +8129,TEST,0,0 +8130,TEST,0,0 +8131,TEST,0,0 +8132,TEST,0,0 +8133,TEST,0,0 +8134,TEST,0,0 +8135,TEST,0,0 +8136,TEST,0,0 +8137,TEST,0,0 +8138,TEST,0,0 +8139,TEST,0,0 +8140,TEST,0,0 +8141,TEST,0,0 +8142,TEST,0,0 +8143,TEST,0,0 +8144,TEST,0,0 +8145,TEST,0,0 +8146,TEST,0,0 +8147,TEST,0,0 +8148,TEST,0,0 +8149,TEST,0,0 +8150,TEST,0,0 +8151,TEST,0,0 +8152,TEST,0,0 +8153,TEST,0,0 +8154,TEST,0,0 +8155,TEST,0,0 +8156,TEST,0,0 +8157,TEST,0,0 +8158,TEST,0,0 +8159,TEST,0,0 +8160,TEST,0,0 +8161,TEST,0,0 +8162,TEST,0,0 +8163,TEST,0,0 +8164,TEST,0,0 +8165,TEST,0,0 +8166,TEST,0,0 +8167,TEST,0,0 +8168,TEST,0,0 +8169,TEST,0,0 +8170,TEST,0,0 +8171,TEST,0,0 +8172,TEST,0,0 +8173,TEST,0,0 +8174,TEST,0,0 +8175,TEST,0,0 +8176,TEST,0,0 +8177,TEST,0,0 +8178,TEST,0,0 +8179,TEST,0,0 +8180,TEST,0,0 +8181,TEST,0,0 +8182,TEST,0,0 +8183,TEST,0,0 +8184,TEST,0,0 +8185,TEST,0,0 +8186,TEST,0,0 +8187,TEST,0,0 +8188,TEST,0,0 +8189,TEST,0,0 +8190,TEST,0,0 +8191,TEST,0,0 +8192,TEST,0,0 +8193,TEST,0,0 +8194,TEST,0,0 +8195,TEST,0,0 +8196,TEST,0,0 +8197,TEST,0,0 +8198,TEST,0,0 +8199,TEST,0,0 +8200,TEST,0,0 +8201,TEST,0,0 +8202,TEST,0,0 +8203,TEST,0,0 
+8204,TEST,0,0 +8205,TEST,0,0 +8206,TEST,0,0 +8207,TEST,0,0 +8208,TEST,0,0 +8209,TEST,0,0 +8210,TEST,0,0 +8211,TEST,0,0 +8212,TEST,0,0 +8213,TEST,0,0 +8214,TEST,0,0 +8215,TEST,0,0 +8216,TEST,0,0 +8217,TEST,0,0 +8218,TEST,0,0 +8219,TEST,0,0 +8220,TEST,0,0 +8221,TEST,0,0 +8222,TEST,0,0 +8223,TEST,0,0 +8224,TEST,0,0 +8225,TEST,0,0 +8226,TEST,0,0 +8227,TEST,0,0 +8228,TEST,0,0 +8229,TEST,0,0 +8230,TEST,0,0 +8231,TEST,0,0 +8232,TEST,0,0 +8233,TEST,0,0 +8234,TEST,0,0 +8235,TEST,0,0 +8236,TEST,0,0 +8237,TEST,0,0 +8238,TEST,0,0 +8239,TEST,0,0 +8240,TEST,0,0 +8241,TEST,0,0 +8242,TEST,0,0 +8243,TEST,0,0 +8244,TEST,0,0 +8245,TEST,0,0 +8246,TEST,0,0 +8247,TEST,0,0 +8248,TEST,0,0 +8249,TEST,0,0 +8250,TEST,0,0 +8251,TEST,0,0 +8252,TEST,0,0 +8253,TEST,0,0 +8254,TEST,0,0 +8255,TEST,0,0 +8256,TEST,0,0 +8257,TEST,0,0 +8258,TEST,0,0 +8259,TEST,0,0 +8260,TEST,0,0 +8261,TEST,0,0 +8262,TEST,0,0 +8263,TEST,0,0 +8264,TEST,0,0 +8265,TEST,0,0 +8266,TEST,0,0 +8267,TEST,0,0 +8268,TEST,0,0 +8269,TEST,0,0 +8270,TEST,0,0 +8271,TEST,0,0 +8272,TEST,0,0 +8273,TEST,0,0 +8274,TEST,0,0 +8275,TEST,0,0 +8276,TEST,0,0 +8277,TEST,0,0 +8278,TEST,0,0 +8279,TEST,0,0 +8280,TEST,0,0 +8281,TEST,0,0 +8282,TEST,0,0 +8283,TEST,0,0 +8284,TEST,0,0 +8285,TEST,0,0 +8286,TEST,0,0 +8287,TEST,0,0 +8288,TEST,0,0 +8289,TEST,0,0 +8290,TEST,0,0 +8291,TEST,0,0 +8292,TEST,0,0 +8293,TEST,0,0 +8294,TEST,0,0 +8295,TEST,0,0 +8296,TEST,0,0 +8297,TEST,0,0 +8298,TEST,0,0 +8299,TEST,0,0 +8300,TEST,0,0 +8301,TEST,0,0 +8302,TEST,0,0 +8303,TEST,0,0 +8304,TEST,0,0 +8305,TEST,0,0 +8306,TEST,0,0 +8307,TEST,0,0 +8308,TEST,0,0 +8309,TEST,0,0 +8310,TEST,0,0 +8311,TEST,0,0 +8312,TEST,0,0 +8313,TEST,0,0 +8314,TEST,0,0 +8315,TEST,0,0 +8316,TEST,0,0 +8317,TEST,0,0 +8318,TEST,0,0 +8319,TEST,0,0 +8320,TEST,0,0 +8321,TEST,0,0 +8322,TEST,0,0 +8323,TEST,0,0 +8324,TEST,0,0 +8325,TEST,0,0 +8326,TEST,0,0 +8327,TEST,0,0 +8328,TEST,0,0 +8329,TEST,0,0 +8330,TEST,0,0 +8331,TEST,0,0 +8332,TEST,0,0 +8333,TEST,0,0 +8334,TEST,0,0 +8335,TEST,0,0 +8336,TEST,0,0 +8337,TEST,0,0 +8338,TEST,0,0 +8339,TEST,0,0 +8340,TEST,0,0 +8341,TEST,0,0 +8342,TEST,0,0 +8343,TEST,0,0 +8344,TEST,0,0 +8345,TEST,0,0 +8346,TEST,0,0 +8347,TEST,0,0 +8348,TEST,0,0 +8349,TEST,0,0 +8350,TEST,0,0 +8351,TEST,0,0 +8352,TEST,0,0 +8353,TEST,0,0 +8354,TEST,0,0 +8355,TEST,0,0 +8356,TEST,0,0 +8357,TEST,0,0 +8358,TEST,0,0 +8359,TEST,0,0 +8360,TEST,0,0 +8361,TEST,0,0 +8362,TEST,0,0 +8363,TEST,0,0 +8364,TEST,0,0 +8365,TEST,0,0 +8366,TEST,0,0 +8367,TEST,0,0 +8368,TEST,0,0 +8369,TEST,0,0 +8370,TEST,0,0 +8371,TEST,0,0 +8372,TEST,0,0 +8373,TEST,0,0 +8374,TEST,0,0 +8375,TEST,0,0 +8376,TEST,0,0 +8377,TEST,0,0 +8378,TEST,0,0 +8379,TEST,0,0 +8380,TEST,0,0 +8381,TEST,0,0 +8382,TEST,0,0 +8383,TEST,0,0 +8384,TEST,0,0 +8385,TEST,0,0 +8386,TEST,0,0 +8387,TEST,0,0 +8388,TEST,0,0 +8389,TEST,0,0 +8390,TEST,0,0 +8391,TEST,0,0 +8392,TEST,0,0 +8393,TEST,0,0 +8394,TEST,0,0 +8395,TEST,0,0 +8396,TEST,0,0 +8397,TEST,0,0 +8398,TEST,0,0 +8399,TEST,0,0 +8400,TEST,0,0 +8401,TEST,0,0 +8402,TEST,0,0 +8403,TEST,0,0 +8404,TEST,0,0 +8405,TEST,0,0 +8406,TEST,0,0 +8407,TEST,0,0 +8408,TEST,0,0 +8409,TEST,0,0 +8410,TEST,0,0 +8411,TEST,0,0 +8412,TEST,0,0 +8413,TEST,0,0 +8414,TEST,0,0 +8415,TEST,0,0 +8416,TEST,0,0 +8417,TEST,0,0 +8418,TEST,0,0 +8419,TEST,0,0 +8420,TEST,0,0 +8421,TEST,0,0 +8422,TEST,0,0 +8423,TEST,0,0 +8424,TEST,0,0 +8425,TEST,0,0 +8426,TEST,0,0 +8427,TEST,0,0 +8428,TEST,0,0 +8429,TEST,0,0 +8430,TEST,0,0 +8431,TEST,0,0 +8432,TEST,0,0 +8433,TEST,0,0 +8434,TEST,0,0 +8435,TEST,0,0 +8436,TEST,0,0 +8437,TEST,0,0 +8438,TEST,0,0 +8439,TEST,0,0 +8440,TEST,0,0 
+8441,TEST,0,0 +8442,TEST,0,0 +8443,TEST,0,0 +8444,TEST,0,0 +8445,TEST,0,0 +8446,TEST,0,0 +8447,TEST,0,0 +8448,TEST,0,0 +8449,TEST,0,0 +8450,TEST,0,0 +8451,TEST,0,0 +8452,TEST,0,0 +8453,TEST,0,0 +8454,TEST,0,0 +8455,TEST,0,0 +8456,TEST,0,0 +8457,TEST,0,0 +8458,TEST,0,0 +8459,TEST,0,0 +8460,TEST,0,0 +8461,TEST,0,0 +8462,TEST,0,0 +8463,TEST,0,0 +8464,TEST,0,0 +8465,TEST,0,0 +8466,TEST,0,0 +8467,TEST,0,0 +8468,TEST,0,0 +8469,TEST,0,0 +8470,TEST,0,0 +8471,TEST,0,0 +8472,TEST,0,0 +8473,TEST,0,0 +8474,TEST,0,0 +8475,TEST,0,0 +8476,TEST,0,0 +8477,TEST,0,0 +8478,TEST,0,0 +8479,TEST,0,0 +8480,TEST,0,0 +8481,TEST,0,0 +8482,TEST,0,0 +8483,TEST,0,0 +8484,TEST,0,0 +8485,TEST,0,0 +8486,TEST,0,0 +8487,TEST,0,0 +8488,TEST,0,0 +8489,TEST,0,0 +8490,TEST,0,0 +8491,TEST,0,0 +8492,TEST,0,0 +8493,TEST,0,0 +8494,TEST,0,0 +8495,TEST,0,0 +8496,TEST,0,0 +8497,TEST,0,0 +8498,TEST,0,0 +8499,TEST,0,0 +8500,TEST,0,0 +8501,TEST,0,0 +8502,TEST,0,0 +8503,TEST,0,0 +8504,TEST,0,0 +8505,TEST,0,0 +8506,TEST,0,0 +8507,TEST,0,0 +8508,TEST,0,0 +8509,TEST,0,0 +8510,TEST,0,0 +8511,TEST,0,0 +8512,TEST,0,0 +8513,TEST,0,0 +8514,TEST,0,0 +8515,TEST,0,0 +8516,TEST,0,0 +8517,TEST,0,0 +8518,TEST,0,0 +8519,TEST,0,0 +8520,TEST,0,0 +8521,TEST,0,0 +8522,TEST,0,0 +8523,TEST,0,0 +8524,TEST,0,0 +8525,TEST,0,0 +8526,TEST,0,0 +8527,TEST,0,0 +8528,TEST,0,0 +8529,TEST,0,0 +8530,TEST,0,0 +8531,TEST,0,0 +8532,TEST,0,0 +8533,TEST,0,0 +8534,TEST,0,0 +8535,TEST,0,0 +8536,TEST,0,0 +8537,TEST,0,0 +8538,TEST,0,0 +8539,TEST,0,0 +8540,TEST,0,0 +8541,TEST,0,0 +8542,TEST,0,0 +8543,TEST,0,0 +8544,TEST,0,0 +8545,TEST,0,0 +8546,TEST,0,0 +8547,TEST,0,0 +8548,TEST,0,0 +8549,TEST,0,0 +8550,TEST,0,0 +8551,TEST,0,0 +8552,TEST,0,0 +8553,TEST,0,0 +8554,TEST,0,0 +8555,TEST,0,0 +8556,TEST,0,0 +8557,TEST,0,0 +8558,TEST,0,0 +8559,TEST,0,0 +8560,TEST,0,0 +8561,TEST,0,0 +8562,TEST,0,0 +8563,TEST,0,0 +8564,TEST,0,0 +8565,TEST,0,0 +8566,TEST,0,0 +8567,TEST,0,0 +8568,TEST,0,0 +8569,TEST,0,0 +8570,TEST,0,0 +8571,TEST,0,0 +8572,TEST,0,0 +8573,TEST,0,0 +8574,TEST,0,0 +8575,TEST,0,0 +8576,TEST,0,0 +8577,TEST,0,0 +8578,TEST,0,0 +8579,TEST,0,0 +8580,TEST,0,0 +8581,TEST,0,0 +8582,TEST,0,0 +8583,TEST,0,0 +8584,TEST,0,0 +8585,TEST,0,0 +8586,TEST,0,0 +8587,TEST,0,0 +8588,TEST,0,0 +8589,TEST,0,0 +8590,TEST,0,0 +8591,TEST,0,0 +8592,TEST,0,0 +8593,TEST,0,0 +8594,TEST,0,0 +8595,TEST,0,0 +8596,TEST,0,0 +8597,TEST,0,0 +8598,TEST,0,0 +8599,TEST,0,0 +8600,TEST,0,0 +8601,TEST,0,0 +8602,TEST,0,0 +8603,TEST,0,0 +8604,TEST,0,0 +8605,TEST,0,0 +8606,TEST,0,0 +8607,TEST,0,0 +8608,TEST,0,0 +8609,TEST,0,0 +8610,TEST,0,0 +8611,TEST,0,0 +8612,TEST,0,0 +8613,TEST,0,0 +8614,TEST,0,0 +8615,TEST,0,0 +8616,TEST,0,0 +8617,TEST,0,0 +8618,TEST,0,0 +8619,TEST,0,0 +8620,TEST,0,0 +8621,TEST,0,0 +8622,TEST,0,0 +8623,TEST,0,0 +8624,TEST,0,0 +8625,TEST,0,0 +8626,TEST,0,0 +8627,TEST,0,0 +8628,TEST,0,0 +8629,TEST,0,0 +8630,TEST,0,0 +8631,TEST,0,0 +8632,TEST,0,0 +8633,TEST,0,0 +8634,TEST,0,0 +8635,TEST,0,0 +8636,TEST,0,0 +8637,TEST,0,0 +8638,TEST,0,0 +8639,TEST,0,0 +8640,TEST,0,0 +8641,TEST,0,0 +8642,TEST,0,0 +8643,TEST,0,0 +8644,TEST,0,0 +8645,TEST,0,0 +8646,TEST,0,0 +8647,TEST,0,0 +8648,TEST,0,0 +8649,TEST,0,0 +8650,TEST,0,0 +8651,TEST,0,0 +8652,TEST,0,0 +8653,TEST,0,0 +8654,TEST,0,0 +8655,TEST,0,0 +8656,TEST,0,0 +8657,TEST,0,0 +8658,TEST,0,0 +8659,TEST,0,0 +8660,TEST,0,0 +8661,TEST,0,0 +8662,TEST,0,0 +8663,TEST,0,0 +8664,TEST,0,0 +8665,TEST,0,0 +8666,TEST,0,0 +8667,TEST,0,0 +8668,TEST,0,0 +8669,TEST,0,0 +8670,TEST,0,0 +8671,TEST,0,0 +8672,TEST,0,0 +8673,TEST,0,0 +8674,TEST,0,0 +8675,TEST,0,0 +8676,TEST,0,0 +8677,TEST,0,0 
+8678,TEST,0,0 +8679,TEST,0,0 +8680,TEST,0,0 +8681,TEST,0,0 +8682,TEST,0,0 +8683,TEST,0,0 +8684,TEST,0,0 +8685,TEST,0,0 +8686,TEST,0,0 +8687,TEST,0,0 +8688,TEST,0,0 +8689,TEST,0,0 +8690,TEST,0,0 +8691,TEST,0,0 +8692,TEST,0,0 +8693,TEST,0,0 +8694,TEST,0,0 +8695,TEST,0,0 +8696,TEST,0,0 +8697,TEST,0,0 +8698,TEST,0,0 +8699,TEST,0,0 +8700,TEST,0,0 +8701,TEST,0,0 +8702,TEST,0,0 +8703,TEST,0,0 +8704,TEST,0,0 +8705,TEST,0,0 +8706,TEST,0,0 +8707,TEST,0,0 +8708,TEST,0,0 +8709,TEST,0,0 +8710,TEST,0,0 +8711,TEST,0,0 +8712,TEST,0,0 +8713,TEST,0,0 +8714,TEST,0,0 +8715,TEST,0,0 +8716,TEST,0,0 +8717,TEST,0,0 +8718,TEST,0,0 +8719,TEST,0,0 +8720,TEST,0,0 +8721,TEST,0,0 +8722,TEST,0,0 +8723,TEST,0,0 +8724,TEST,0,0 +8725,TEST,0,0 +8726,TEST,0,0 +8727,TEST,0,0 +8728,TEST,0,0 +8729,TEST,0,0 +8730,TEST,0,0 +8731,TEST,0,0 +8732,TEST,0,0 +8733,TEST,0,0 +8734,TEST,0,0 +8735,TEST,0,0 +8736,TEST,0,0 +8737,TEST,0,0 +8738,TEST,0,0 +8739,TEST,0,0 +8740,TEST,0,0 +8741,TEST,0,0 +8742,TEST,0,0 +8743,TEST,0,0 +8744,TEST,0,0 +8745,TEST,0,0 +8746,TEST,0,0 +8747,TEST,0,0 +8748,TEST,0,0 +8749,TEST,0,0 +8750,TEST,0,0 +8751,TEST,0,0 +8752,TEST,0,0 +8753,TEST,0,0 +8754,TEST,0,0 +8755,TEST,0,0 +8756,TEST,0,0 +8757,TEST,0,0 +8758,TEST,0,0 +8759,TEST,0,0 +8760,TEST,0,0 +8761,TEST,0,0 +8762,TEST,0,0 +8763,TEST,0,0 +8764,TEST,0,0 +8765,TEST,0,0 +8766,TEST,0,0 +8767,TEST,0,0 +8768,TEST,0,0 +8769,TEST,0,0 +8770,TEST,0,0 +8771,TEST,0,0 +8772,TEST,0,0 +8773,TEST,0,0 +8774,TEST,0,0 +8775,TEST,0,0 +8776,TEST,0,0 +8777,TEST,0,0 +8778,TEST,0,0 +8779,TEST,0,0 +8780,TEST,0,0 +8781,TEST,0,0 +8782,TEST,0,0 +8783,TEST,0,0 +8784,TEST,0,0 +8785,TEST,0,0 +8786,TEST,0,0 +8787,TEST,0,0 +8788,TEST,0,0 +8789,TEST,0,0 +8790,TEST,0,0 +8791,TEST,0,0 +8792,TEST,0,0 +8793,TEST,0,0 +8794,TEST,0,0 +8795,TEST,0,0 +8796,TEST,0,0 +8797,TEST,0,0 +8798,TEST,0,0 +8799,TEST,0,0 +8800,TEST,0,0 +8801,TEST,0,0 +8802,TEST,0,0 +8803,TEST,0,0 +8804,TEST,0,0 +8805,TEST,0,0 +8806,TEST,0,0 +8807,TEST,0,0 +8808,TEST,0,0 +8809,TEST,0,0 +8810,TEST,0,0 +8811,TEST,0,0 +8812,TEST,0,0 +8813,TEST,0,0 +8814,TEST,0,0 +8815,TEST,0,0 +8816,TEST,0,0 +8817,TEST,0,0 +8818,TEST,0,0 +8819,TEST,0,0 +8820,TEST,0,0 +8821,TEST,0,0 +8822,TEST,0,0 +8823,TEST,0,0 +8824,TEST,0,0 +8825,TEST,0,0 +8826,TEST,0,0 +8827,TEST,0,0 +8828,TEST,0,0 +8829,TEST,0,0 +8830,TEST,0,0 +8831,TEST,0,0 +8832,TEST,0,0 +8833,TEST,0,0 +8834,TEST,0,0 +8835,TEST,0,0 +8836,TEST,0,0 +8837,TEST,0,0 +8838,TEST,0,0 +8839,TEST,0,0 +8840,TEST,0,0 +8841,TEST,0,0 +8842,TEST,0,0 +8843,TEST,0,0 +8844,TEST,0,0 +8845,TEST,0,0 +8846,TEST,0,0 +8847,TEST,0,0 +8848,TEST,0,0 +8849,TEST,0,0 +8850,TEST,0,0 +8851,TEST,0,0 +8852,TEST,0,0 +8853,TEST,0,0 +8854,TEST,0,0 +8855,TEST,0,0 +8856,TEST,0,0 +8857,TEST,0,0 +8858,TEST,0,0 +8859,TEST,0,0 +8860,TEST,0,0 +8861,TEST,0,0 +8862,TEST,0,0 +8863,TEST,0,0 +8864,TEST,0,0 +8865,TEST,0,0 +8866,TEST,0,0 +8867,TEST,0,0 +8868,TEST,0,0 +8869,TEST,0,0 +8870,TEST,0,0 +8871,TEST,0,0 +8872,TEST,0,0 +8873,TEST,0,0 +8874,TEST,0,0 +8875,TEST,0,0 +8876,TEST,0,0 +8877,TEST,0,0 +8878,TEST,0,0 +8879,TEST,0,0 +8880,TEST,0,0 +8881,TEST,0,0 +8882,TEST,0,0 +8883,TEST,0,0 +8884,TEST,0,0 +8885,TEST,0,0 +8886,TEST,0,0 +8887,TEST,0,0 +8888,TEST,0,0 +8889,TEST,0,0 +8890,TEST,0,0 +8891,TEST,0,0 +8892,TEST,0,0 +8893,TEST,0,0 +8894,TEST,0,0 +8895,TEST,0,0 +8896,TEST,0,0 +8897,TEST,0,0 +8898,TEST,0,0 +8899,TEST,0,0 +8900,TEST,0,0 +8901,TEST,0,0 +8902,TEST,0,0 +8903,TEST,0,0 +8904,TEST,0,0 +8905,TEST,0,0 +8906,TEST,0,0 +8907,TEST,0,0 +8908,TEST,0,0 +8909,TEST,0,0 +8910,TEST,0,0 +8911,TEST,0,0 +8912,TEST,0,0 +8913,TEST,0,0 +8914,TEST,0,0 
+8915,TEST,0,0 +8916,TEST,0,0 +8917,TEST,0,0 +8918,TEST,0,0 +8919,TEST,0,0 +8920,TEST,0,0 +8921,TEST,0,0 +8922,TEST,0,0 +8923,TEST,0,0 +8924,TEST,0,0 +8925,TEST,0,0 +8926,TEST,0,0 +8927,TEST,0,0 +8928,TEST,0,0 +8929,TEST,0,0 +8930,TEST,0,0 +8931,TEST,0,0 +8932,TEST,0,0 +8933,TEST,0,0 +8934,TEST,0,0 +8935,TEST,0,0 +8936,TEST,0,0 +8937,TEST,0,0 +8938,TEST,0,0 +8939,TEST,0,0 +8940,TEST,0,0 +8941,TEST,0,0 +8942,TEST,0,0 +8943,TEST,0,0 +8944,TEST,0,0 +8945,TEST,0,0 +8946,TEST,0,0 +8947,TEST,0,0 +8948,TEST,0,0 +8949,TEST,0,0 +8950,TEST,0,0 +8951,TEST,0,0 +8952,TEST,0,0 +8953,TEST,0,0 +8954,TEST,0,0 +8955,TEST,0,0 +8956,TEST,0,0 +8957,TEST,0,0 +8958,TEST,0,0 +8959,TEST,0,0 +8960,TEST,0,0 +8961,TEST,0,0 +8962,TEST,0,0 +8963,TEST,0,0 +8964,TEST,0,0 +8965,TEST,0,0 +8966,TEST,0,0 +8967,TEST,0,0 +8968,TEST,0,0 +8969,TEST,0,0 +8970,TEST,0,0 +8971,TEST,0,0 +8972,TEST,0,0 +8973,TEST,0,0 +8974,TEST,0,0 +8975,TEST,0,0 +8976,TEST,0,0 +8977,TEST,0,0 +8978,TEST,0,0 +8979,TEST,0,0 +8980,TEST,0,0 +8981,TEST,0,0 +8982,TEST,0,0 +8983,TEST,0,0 +8984,TEST,0,0 +8985,TEST,0,0 +8986,TEST,0,0 +8987,TEST,0,0 +8988,TEST,0,0 +8989,TEST,0,0 +8990,TEST,0,0 +8991,TEST,0,0 +8992,TEST,0,0 +8993,TEST,0,0 +8994,TEST,0,0 +8995,TEST,0,0 +8996,TEST,0,0 +8997,TEST,0,0 +8998,TEST,0,0 +8999,TEST,0,0 +9000,TEST,0,0 +9001,TEST,0,0 +9002,TEST,0,0 +9003,TEST,0,0 +9004,TEST,0,0 +9005,TEST,0,0 +9006,TEST,0,0 +9007,TEST,0,0 +9008,TEST,0,0 +9009,TEST,0,0 +9010,TEST,0,0 +9011,TEST,0,0 +9012,TEST,0,0 +9013,TEST,0,0 +9014,TEST,0,0 +9015,TEST,0,0 +9016,TEST,0,0 +9017,TEST,0,0 +9018,TEST,0,0 +9019,TEST,0,0 +9020,TEST,0,0 +9021,TEST,0,0 +9022,TEST,0,0 +9023,TEST,0,0 +9024,TEST,0,0 +9025,TEST,0,0 +9026,TEST,0,0 +9027,TEST,0,0 +9028,TEST,0,0 +9029,TEST,0,0 +9030,TEST,0,0 +9031,TEST,0,0 +9032,TEST,0,0 +9033,TEST,0,0 +9034,TEST,0,0 +9035,TEST,0,0 +9036,TEST,0,0 +9037,TEST,0,0 +9038,TEST,0,0 +9039,TEST,0,0 +9040,TEST,0,0 +9041,TEST,0,0 +9042,TEST,0,0 +9043,TEST,0,0 +9044,TEST,0,0 +9045,TEST,0,0 +9046,TEST,0,0 +9047,TEST,0,0 +9048,TEST,0,0 +9049,TEST,0,0 +9050,TEST,0,0 +9051,TEST,0,0 +9052,TEST,0,0 +9053,TEST,0,0 +9054,TEST,0,0 +9055,TEST,0,0 +9056,TEST,0,0 +9057,TEST,0,0 +9058,TEST,0,0 +9059,TEST,0,0 +9060,TEST,0,0 +9061,TEST,0,0 +9062,TEST,0,0 +9063,TEST,0,0 +9064,TEST,0,0 +9065,TEST,0,0 +9066,TEST,0,0 +9067,TEST,0,0 +9068,TEST,0,0 +9069,TEST,0,0 +9070,TEST,0,0 +9071,TEST,0,0 +9072,TEST,0,0 +9073,TEST,0,0 +9074,TEST,0,0 +9075,TEST,0,0 +9076,TEST,0,0 +9077,TEST,0,0 +9078,TEST,0,0 +9079,TEST,0,0 +9080,TEST,0,0 +9081,TEST,0,0 +9082,TEST,0,0 +9083,TEST,0,0 +9084,TEST,0,0 +9085,TEST,0,0 +9086,TEST,0,0 +9087,TEST,0,0 +9088,TEST,0,0 +9089,TEST,0,0 +9090,TEST,0,0 +9091,TEST,0,0 +9092,TEST,0,0 +9093,TEST,0,0 +9094,TEST,0,0 +9095,TEST,0,0 +9096,TEST,0,0 +9097,TEST,0,0 +9098,TEST,0,0 +9099,TEST,0,0 +9100,TEST,0,0 +9101,TEST,0,0 +9102,TEST,0,0 +9103,TEST,0,0 +9104,TEST,0,0 +9105,TEST,0,0 +9106,TEST,0,0 +9107,TEST,0,0 +9108,TEST,0,0 +9109,TEST,0,0 +9110,TEST,0,0 +9111,TEST,0,0 +9112,TEST,0,0 +9113,TEST,0,0 +9114,TEST,0,0 +9115,TEST,0,0 +9116,TEST,0,0 +9117,TEST,0,0 +9118,TEST,0,0 +9119,TEST,0,0 +9120,TEST,0,0 +9121,TEST,0,0 +9122,TEST,0,0 +9123,TEST,0,0 +9124,TEST,0,0 +9125,TEST,0,0 +9126,TEST,0,0 +9127,TEST,0,0 +9128,TEST,0,0 +9129,TEST,0,0 +9130,TEST,0,0 +9131,TEST,0,0 +9132,TEST,0,0 +9133,TEST,0,0 +9134,TEST,0,0 +9135,TEST,0,0 +9136,TEST,0,0 +9137,TEST,0,0 +9138,TEST,0,0 +9139,TEST,0,0 +9140,TEST,0,0 +9141,TEST,0,0 +9142,TEST,0,0 +9143,TEST,0,0 +9144,TEST,0,0 +9145,TEST,0,0 +9146,TEST,0,0 +9147,TEST,0,0 +9148,TEST,0,0 +9149,TEST,0,0 +9150,TEST,0,0 +9151,TEST,0,0 
+9152,TEST,0,0 +9153,TEST,0,0 +9154,TEST,0,0 +9155,TEST,0,0 +9156,TEST,0,0 +9157,TEST,0,0 +9158,TEST,0,0 +9159,TEST,0,0 +9160,TEST,0,0 +9161,TEST,0,0 +9162,TEST,0,0 +9163,TEST,0,0 +9164,TEST,0,0 +9165,TEST,0,0 +9166,TEST,0,0 +9167,TEST,0,0 +9168,TEST,0,0 +9169,TEST,0,0 +9170,TEST,0,0 +9171,TEST,0,0 +9172,TEST,0,0 +9173,TEST,0,0 +9174,TEST,0,0 +9175,TEST,0,0 +9176,TEST,0,0 +9177,TEST,0,0 +9178,TEST,0,0 +9179,TEST,0,0 +9180,TEST,0,0 +9181,TEST,0,0 +9182,TEST,0,0 +9183,TEST,0,0 +9184,TEST,0,0 +9185,TEST,0,0 +9186,TEST,0,0 +9187,TEST,0,0 +9188,TEST,0,0 +9189,TEST,0,0 +9190,TEST,0,0 +9191,TEST,0,0 +9192,TEST,0,0 +9193,TEST,0,0 +9194,TEST,0,0 +9195,TEST,0,0 +9196,TEST,0,0 +9197,TEST,0,0 +9198,TEST,0,0 +9199,TEST,0,0 +9200,TEST,0,0 +9201,TEST,0,0 +9202,TEST,0,0 +9203,TEST,0,0 +9204,TEST,0,0 +9205,TEST,0,0 +9206,TEST,0,0 +9207,TEST,0,0 +9208,TEST,0,0 +9209,TEST,0,0 +9210,TEST,0,0 +9211,TEST,0,0 +9212,TEST,0,0 +9213,TEST,0,0 +9214,TEST,0,0 +9215,TEST,0,0 +9216,TEST,0,0 +9217,TEST,0,0 +9218,TEST,0,0 +9219,TEST,0,0 +9220,TEST,0,0 +9221,TEST,0,0 +9222,TEST,0,0 +9223,TEST,0,0 +9224,TEST,0,0 +9225,TEST,0,0 +9226,TEST,0,0 +9227,TEST,0,0 +9228,TEST,0,0 +9229,TEST,0,0 +9230,TEST,0,0 +9231,TEST,0,0 +9232,TEST,0,0 +9233,TEST,0,0 +9234,TEST,0,0 +9235,TEST,0,0 +9236,TEST,0,0 +9237,TEST,0,0 +9238,TEST,0,0 +9239,TEST,0,0 +9240,TEST,0,0 +9241,TEST,0,0 +9242,TEST,0,0 +9243,TEST,0,0 +9244,TEST,0,0 +9245,TEST,0,0 +9246,TEST,0,0 +9247,TEST,0,0 +9248,TEST,0,0 +9249,TEST,0,0 +9250,TEST,0,0 +9251,TEST,0,0 +9252,TEST,0,0 +9253,TEST,0,0 +9254,TEST,0,0 +9255,TEST,0,0 +9256,TEST,0,0 +9257,TEST,0,0 +9258,TEST,0,0 +9259,TEST,0,0 +9260,TEST,0,0 +9261,TEST,0,0 +9262,TEST,0,0 +9263,TEST,0,0 +9264,TEST,0,0 +9265,TEST,0,0 +9266,TEST,0,0 +9267,TEST,0,0 +9268,TEST,0,0 +9269,TEST,0,0 +9270,TEST,0,0 +9271,TEST,0,0 +9272,TEST,0,0 +9273,TEST,0,0 +9274,TEST,0,0 +9275,TEST,0,0 +9276,TEST,0,0 +9277,TEST,0,0 +9278,TEST,0,0 +9279,TEST,0,0 +9280,TEST,0,0 +9281,TEST,0,0 +9282,TEST,0,0 +9283,TEST,0,0 +9284,TEST,0,0 +9285,TEST,0,0 +9286,TEST,0,0 +9287,TEST,0,0 +9288,TEST,0,0 +9289,TEST,0,0 +9290,TEST,0,0 +9291,TEST,0,0 +9292,TEST,0,0 +9293,TEST,0,0 +9294,TEST,0,0 +9295,TEST,0,0 +9296,TEST,0,0 +9297,TEST,0,0 +9298,TEST,0,0 +9299,TEST,0,0 +9300,TEST,0,0 +9301,TEST,0,0 +9302,TEST,0,0 +9303,TEST,0,0 +9304,TEST,0,0 +9305,TEST,0,0 +9306,TEST,0,0 +9307,TEST,0,0 +9308,TEST,0,0 +9309,TEST,0,0 +9310,TEST,0,0 +9311,TEST,0,0 +9312,TEST,0,0 +9313,TEST,0,0 +9314,TEST,0,0 +9315,TEST,0,0 +9316,TEST,0,0 +9317,TEST,0,0 +9318,TEST,0,0 +9319,TEST,0,0 +9320,TEST,0,0 +9321,TEST,0,0 +9322,TEST,0,0 +9323,TEST,0,0 +9324,TEST,0,0 +9325,TEST,0,0 +9326,TEST,0,0 +9327,TEST,0,0 +9328,TEST,0,0 +9329,TEST,0,0 +9330,TEST,0,0 +9331,TEST,0,0 +9332,TEST,0,0 +9333,TEST,0,0 +9334,TEST,0,0 +9335,TEST,0,0 +9336,TEST,0,0 +9337,TEST,0,0 +9338,TEST,0,0 +9339,TEST,0,0 +9340,TEST,0,0 +9341,TEST,0,0 +9342,TEST,0,0 +9343,TEST,0,0 +9344,TEST,0,0 +9345,TEST,0,0 +9346,TEST,0,0 +9347,TEST,0,0 +9348,TEST,0,0 +9349,TEST,0,0 +9350,TEST,0,0 +9351,TEST,0,0 +9352,TEST,0,0 +9353,TEST,0,0 +9354,TEST,0,0 +9355,TEST,0,0 +9356,TEST,0,0 +9357,TEST,0,0 +9358,TEST,0,0 +9359,TEST,0,0 +9360,TEST,0,0 +9361,TEST,0,0 +9362,TEST,0,0 +9363,TEST,0,0 +9364,TEST,0,0 +9365,TEST,0,0 +9366,TEST,0,0 +9367,TEST,0,0 +9368,TEST,0,0 +9369,TEST,0,0 +9370,TEST,0,0 +9371,TEST,0,0 +9372,TEST,0,0 +9373,TEST,0,0 +9374,TEST,0,0 +9375,TEST,0,0 +9376,TEST,0,0 +9377,TEST,0,0 +9378,TEST,0,0 +9379,TEST,0,0 +9380,TEST,0,0 +9381,TEST,0,0 +9382,TEST,0,0 +9383,TEST,0,0 +9384,TEST,0,0 +9385,TEST,0,0 +9386,TEST,0,0 +9387,TEST,0,0 +9388,TEST,0,0 
+9389,TEST,0,0 +9390,TEST,0,0 +9391,TEST,0,0 +9392,TEST,0,0 +9393,TEST,0,0 +9394,TEST,0,0 +9395,TEST,0,0 +9396,TEST,0,0 +9397,TEST,0,0 +9398,TEST,0,0 +9399,TEST,0,0 +9400,TEST,0,0 +9401,TEST,0,0 +9402,TEST,0,0 +9403,TEST,0,0 +9404,TEST,0,0 +9405,TEST,0,0 +9406,TEST,0,0 +9407,TEST,0,0 +9408,TEST,0,0 +9409,TEST,0,0 +9410,TEST,0,0 +9411,TEST,0,0 +9412,TEST,0,0 +9413,TEST,0,0 +9414,TEST,0,0 +9415,TEST,0,0 +9416,TEST,0,0 +9417,TEST,0,0 +9418,TEST,0,0 +9419,TEST,0,0 +9420,TEST,0,0 +9421,TEST,0,0 +9422,TEST,0,0 +9423,TEST,0,0 +9424,TEST,0,0 +9425,TEST,0,0 +9426,TEST,0,0 +9427,TEST,0,0 +9428,TEST,0,0 +9429,TEST,0,0 +9430,TEST,0,0 +9431,TEST,0,0 +9432,TEST,0,0 +9433,TEST,0,0 +9434,TEST,0,0 +9435,TEST,0,0 +9436,TEST,0,0 +9437,TEST,0,0 +9438,TEST,0,0 +9439,TEST,0,0 +9440,TEST,0,0 +9441,TEST,0,0 +9442,TEST,0,0 +9443,TEST,0,0 +9444,TEST,0,0 +9445,TEST,0,0 +9446,TEST,0,0 +9447,TEST,0,0 +9448,TEST,0,0 +9449,TEST,0,0 +9450,TEST,0,0 +9451,TEST,0,0 +9452,TEST,0,0 +9453,TEST,0,0 +9454,TEST,0,0 +9455,TEST,0,0 +9456,TEST,0,0 +9457,TEST,0,0 +9458,TEST,0,0 +9459,TEST,0,0 +9460,TEST,0,0 +9461,TEST,0,0 +9462,TEST,0,0 +9463,TEST,0,0 +9464,TEST,0,0 +9465,TEST,0,0 +9466,TEST,0,0 +9467,TEST,0,0 +9468,TEST,0,0 +9469,TEST,0,0 +9470,TEST,0,0 +9471,TEST,0,0 +9472,TEST,0,0 +9473,TEST,0,0 +9474,TEST,0,0 +9475,TEST,0,0 +9476,TEST,0,0 +9477,TEST,0,0 +9478,TEST,0,0 +9479,TEST,0,0 +9480,TEST,0,0 +9481,TEST,0,0 +9482,TEST,0,0 +9483,TEST,0,0 +9484,TEST,0,0 +9485,TEST,0,0 +9486,TEST,0,0 +9487,TEST,0,0 +9488,TEST,0,0 +9489,TEST,0,0 +9490,TEST,0,0 +9491,TEST,0,0 +9492,TEST,0,0 +9493,TEST,0,0 +9494,TEST,0,0 +9495,TEST,0,0 +9496,TEST,0,0 +9497,TEST,0,0 +9498,TEST,0,0 +9499,TEST,0,0 +9500,TEST,0,0 +9501,TEST,0,0 +9502,TEST,0,0 +9503,TEST,0,0 +9504,TEST,0,0 +9505,TEST,0,0 +9506,TEST,0,0 +9507,TEST,0,0 +9508,TEST,0,0 +9509,TEST,0,0 +9510,TEST,0,0 +9511,TEST,0,0 +9512,TEST,0,0 +9513,TEST,0,0 +9514,TEST,0,0 +9515,TEST,0,0 +9516,TEST,0,0 +9517,TEST,0,0 +9518,TEST,0,0 +9519,TEST,0,0 +9520,TEST,0,0 +9521,TEST,0,0 +9522,TEST,0,0 +9523,TEST,0,0 +9524,TEST,0,0 +9525,TEST,0,0 +9526,TEST,0,0 +9527,TEST,0,0 +9528,TEST,0,0 +9529,TEST,0,0 +9530,TEST,0,0 +9531,TEST,0,0 +9532,TEST,0,0 +9533,TEST,0,0 +9534,TEST,0,0 +9535,TEST,0,0 +9536,TEST,0,0 +9537,TEST,0,0 +9538,TEST,0,0 +9539,TEST,0,0 +9540,TEST,0,0 +9541,TEST,0,0 +9542,TEST,0,0 +9543,TEST,0,0 +9544,TEST,0,0 +9545,TEST,0,0 +9546,TEST,0,0 +9547,TEST,0,0 +9548,TEST,0,0 +9549,TEST,0,0 +9550,TEST,0,0 +9551,TEST,0,0 +9552,TEST,0,0 +9553,TEST,0,0 +9554,TEST,0,0 +9555,TEST,0,0 +9556,TEST,0,0 +9557,TEST,0,0 +9558,TEST,0,0 +9559,TEST,0,0 +9560,TEST,0,0 +9561,TEST,0,0 +9562,TEST,0,0 +9563,TEST,0,0 +9564,TEST,0,0 +9565,TEST,0,0 +9566,TEST,0,0 +9567,TEST,0,0 +9568,TEST,0,0 +9569,TEST,0,0 +9570,TEST,0,0 +9571,TEST,0,0 +9572,TEST,0,0 +9573,TEST,0,0 +9574,TEST,0,0 +9575,TEST,0,0 +9576,TEST,0,0 +9577,TEST,0,0 +9578,TEST,0,0 +9579,TEST,0,0 +9580,TEST,0,0 +9581,TEST,0,0 +9582,TEST,0,0 +9583,TEST,0,0 +9584,TEST,0,0 +9585,TEST,0,0 +9586,TEST,0,0 +9587,TEST,0,0 +9588,TEST,0,0 +9589,TEST,0,0 +9590,TEST,0,0 +9591,TEST,0,0 +9592,TEST,0,0 +9593,TEST,0,0 +9594,TEST,0,0 +9595,TEST,0,0 +9596,TEST,0,0 +9597,TEST,0,0 +9598,TEST,0,0 +9599,TEST,0,0 +9600,TEST,0,0 +9601,TEST,0,0 +9602,TEST,0,0 +9603,TEST,0,0 +9604,TEST,0,0 +9605,TEST,0,0 +9606,TEST,0,0 +9607,TEST,0,0 +9608,TEST,0,0 +9609,TEST,0,0 +9610,TEST,0,0 +9611,TEST,0,0 +9612,TEST,0,0 +9613,TEST,0,0 +9614,TEST,0,0 +9615,TEST,0,0 +9616,TEST,0,0 +9617,TEST,0,0 +9618,TEST,0,0 +9619,TEST,0,0 +9620,TEST,0,0 +9621,TEST,0,0 +9622,TEST,0,0 +9623,TEST,0,0 +9624,TEST,0,0 +9625,TEST,0,0 
+9626,TEST,0,0 +9627,TEST,0,0 +9628,TEST,0,0 +9629,TEST,0,0 +9630,TEST,0,0 +9631,TEST,0,0 +9632,TEST,0,0 +9633,TEST,0,0 +9634,TEST,0,0 +9635,TEST,0,0 +9636,TEST,0,0 +9637,TEST,0,0 +9638,TEST,0,0 +9639,TEST,0,0 +9640,TEST,0,0 +9641,TEST,0,0 +9642,TEST,0,0 +9643,TEST,0,0 +9644,TEST,0,0 +9645,TEST,0,0 +9646,TEST,0,0 +9647,TEST,0,0 +9648,TEST,0,0 +9649,TEST,0,0 +9650,TEST,0,0 +9651,TEST,0,0 +9652,TEST,0,0 +9653,TEST,0,0 +9654,TEST,0,0 +9655,TEST,0,0 +9656,TEST,0,0 +9657,TEST,0,0 +9658,TEST,0,0 +9659,TEST,0,0 +9660,TEST,0,0 +9661,TEST,0,0 +9662,TEST,0,0 +9663,TEST,0,0 +9664,TEST,0,0 +9665,TEST,0,0 +9666,TEST,0,0 +9667,TEST,0,0 +9668,TEST,0,0 +9669,TEST,0,0 +9670,TEST,0,0 +9671,TEST,0,0 +9672,TEST,0,0 +9673,TEST,0,0 +9674,TEST,0,0 +9675,TEST,0,0 +9676,TEST,0,0 +9677,TEST,0,0 +9678,TEST,0,0 +9679,TEST,0,0 +9680,TEST,0,0 +9681,TEST,0,0 +9682,TEST,0,0 +9683,TEST,0,0 +9684,TEST,0,0 +9685,TEST,0,0 +9686,TEST,0,0 +9687,TEST,0,0 +9688,TEST,0,0 +9689,TEST,0,0 +9690,TEST,0,0 +9691,TEST,0,0 +9692,TEST,0,0 +9693,TEST,0,0 +9694,TEST,0,0 +9695,TEST,0,0 +9696,TEST,0,0 +9697,TEST,0,0 +9698,TEST,0,0 +9699,TEST,0,0 +9700,TEST,0,0 +9701,TEST,0,0 +9702,TEST,0,0 +9703,TEST,0,0 +9704,TEST,0,0 +9705,TEST,0,0 +9706,TEST,0,0 +9707,TEST,0,0 +9708,TEST,0,0 +9709,TEST,0,0 +9710,TEST,0,0 +9711,TEST,0,0 +9712,TEST,0,0 +9713,TEST,0,0 +9714,TEST,0,0 +9715,TEST,0,0 +9716,TEST,0,0 +9717,TEST,0,0 +9718,TEST,0,0 +9719,TEST,0,0 +9720,TEST,0,0 +9721,TEST,0,0 +9722,TEST,0,0 +9723,TEST,0,0 +9724,TEST,0,0 +9725,TEST,0,0 +9726,TEST,0,0 +9727,TEST,0,0 +9728,TEST,0,0 +9729,TEST,0,0 +9730,TEST,0,0 +9731,TEST,0,0 +9732,TEST,0,0 +9733,TEST,0,0 +9734,TEST,0,0 +9735,TEST,0,0 +9736,TEST,0,0 +9737,TEST,0,0 +9738,TEST,0,0 +9739,TEST,0,0 +9740,TEST,0,0 +9741,TEST,0,0 +9742,TEST,0,0 +9743,TEST,0,0 +9744,TEST,0,0 +9745,TEST,0,0 +9746,TEST,0,0 +9747,TEST,0,0 +9748,TEST,0,0 +9749,TEST,0,0 +9750,TEST,0,0 +9751,TEST,0,0 +9752,TEST,0,0 +9753,TEST,0,0 +9754,TEST,0,0 +9755,TEST,0,0 +9756,TEST,0,0 +9757,TEST,0,0 +9758,TEST,0,0 +9759,TEST,0,0 +9760,TEST,0,0 +9761,TEST,0,0 +9762,TEST,0,0 +9763,TEST,0,0 +9764,TEST,0,0 +9765,TEST,0,0 +9766,TEST,0,0 +9767,TEST,0,0 +9768,TEST,0,0 +9769,TEST,0,0 +9770,TEST,0,0 +9771,TEST,0,0 +9772,TEST,0,0 +9773,TEST,0,0 +9774,TEST,0,0 +9775,TEST,0,0 +9776,TEST,0,0 +9777,TEST,0,0 +9778,TEST,0,0 +9779,TEST,0,0 +9780,TEST,0,0 +9781,TEST,0,0 +9782,TEST,0,0 +9783,TEST,0,0 +9784,TEST,0,0 +9785,TEST,0,0 +9786,TEST,0,0 +9787,TEST,0,0 +9788,TEST,0,0 +9789,TEST,0,0 +9790,TEST,0,0 +9791,TEST,0,0 +9792,TEST,0,0 +9793,TEST,0,0 +9794,TEST,0,0 +9795,TEST,0,0 +9796,TEST,0,0 +9797,TEST,0,0 +9798,TEST,0,0 +9799,TEST,0,0 +9800,TEST,0,0 +9801,TEST,0,0 +9802,TEST,0,0 +9803,TEST,0,0 +9804,TEST,0,0 +9805,TEST,0,0 +9806,TEST,0,0 +9807,TEST,0,0 +9808,TEST,0,0 +9809,TEST,0,0 +9810,TEST,0,0 +9811,TEST,0,0 +9812,TEST,0,0 +9813,TEST,0,0 +9814,TEST,0,0 +9815,TEST,0,0 +9816,TEST,0,0 +9817,TEST,0,0 +9818,TEST,0,0 +9819,TEST,0,0 +9820,TEST,0,0 +9821,TEST,0,0 +9822,TEST,0,0 +9823,TEST,0,0 +9824,TEST,0,0 +9825,TEST,0,0 +9826,TEST,0,0 +9827,TEST,0,0 +9828,TEST,0,0 +9829,TEST,0,0 +9830,TEST,0,0 +9831,TEST,0,0 +9832,TEST,0,0 +9833,TEST,0,0 +9834,TEST,0,0 +9835,TEST,0,0 +9836,TEST,0,0 +9837,TEST,0,0 +9838,TEST,0,0 +9839,TEST,0,0 +9840,TEST,0,0 +9841,TEST,0,0 +9842,TEST,0,0 +9843,TEST,0,0 +9844,TEST,0,0 +9845,TEST,0,0 +9846,TEST,0,0 +9847,TEST,0,0 +9848,TEST,0,0 +9849,TEST,0,0 +9850,TEST,0,0 +9851,TEST,0,0 +9852,TEST,0,0 +9853,TEST,0,0 +9854,TEST,0,0 +9855,TEST,0,0 +9856,TEST,0,0 +9857,TEST,0,0 +9858,TEST,0,0 +9859,TEST,0,0 +9860,TEST,0,0 +9861,TEST,0,0 +9862,TEST,0,0 
+9863,TEST,0,0 +9864,TEST,0,0 +9865,TEST,0,0 +9866,TEST,0,0 +9867,TEST,0,0 +9868,TEST,0,0 +9869,TEST,0,0 +9870,TEST,0,0 +9871,TEST,0,0 +9872,TEST,0,0 +9873,TEST,0,0 +9874,TEST,0,0 +9875,TEST,0,0 +9876,TEST,0,0 +9877,TEST,0,0 +9878,TEST,0,0 +9879,TEST,0,0 +9880,TEST,0,0 +9881,TEST,0,0 +9882,TEST,0,0 +9883,TEST,0,0 +9884,TEST,0,0 +9885,TEST,0,0 +9886,TEST,0,0 +9887,TEST,0,0 +9888,TEST,0,0 +9889,TEST,0,0 +9890,TEST,0,0 +9891,TEST,0,0 +9892,TEST,0,0 +9893,TEST,0,0 +9894,TEST,0,0 +9895,TEST,0,0 +9896,TEST,0,0 +9897,TEST,0,0 +9898,TEST,0,0 +9899,TEST,0,0 +9900,TEST,0,0 +9901,TEST,0,0 +9902,TEST,0,0 +9903,TEST,0,0 +9904,TEST,0,0 +9905,TEST,0,0 +9906,TEST,0,0 +9907,TEST,0,0 +9908,TEST,0,0 +9909,TEST,0,0 +9910,TEST,0,0 +9911,TEST,0,0 +9912,TEST,0,0 +9913,TEST,0,0 +9914,TEST,0,0 +9915,TEST,0,0 +9916,TEST,0,0 +9917,TEST,0,0 +9918,TEST,0,0 +9919,TEST,0,0 +9920,TEST,0,0 +9921,TEST,0,0 +9922,TEST,0,0 +9923,TEST,0,0 +9924,TEST,0,0 +9925,TEST,0,0 +9926,TEST,0,0 +9927,TEST,0,0 +9928,TEST,0,0 +9929,TEST,0,0 +9930,TEST,0,0 +9931,TEST,0,0 +9932,TEST,0,0 +9933,TEST,0,0 +9934,TEST,0,0 +9935,TEST,0,0 +9936,TEST,0,0 +9937,TEST,0,0 +9938,TEST,0,0 +9939,TEST,0,0 +9940,TEST,0,0 +9941,TEST,0,0 +9942,TEST,0,0 +9943,TEST,0,0 +9944,TEST,0,0 +9945,TEST,0,0 +9946,TEST,0,0 +9947,TEST,0,0 +9948,TEST,0,0 +9949,TEST,0,0 +9950,TEST,0,0 +9951,TEST,0,0 +9952,TEST,0,0 +9953,TEST,0,0 +9954,TEST,0,0 +9955,TEST,0,0 +9956,TEST,0,0 +9957,TEST,0,0 +9958,TEST,0,0 +9959,TEST,0,0 +9960,TEST,0,0 +9961,TEST,0,0 +9962,TEST,0,0 +9963,TEST,0,0 +9964,TEST,0,0 +9965,TEST,0,0 +9966,TEST,0,0 +9967,TEST,0,0 +9968,TEST,0,0 +9969,TEST,0,0 +9970,TEST,0,0 +9971,TEST,0,0 +9972,TEST,0,0 +9973,TEST,0,0 +9974,TEST,0,0 +9975,TEST,0,0 +9976,TEST,0,0 +9977,TEST,0,0 +9978,TEST,0,0 +9979,TEST,0,0 +9980,TEST,0,0 +9981,TEST,0,0 +9982,TEST,0,0 +9983,TEST,0,0 +9984,TEST,0,0 +9985,TEST,0,0 +9986,TEST,0,0 +9987,TEST,0,0 +9988,TEST,0,0 +9989,TEST,0,0 +9990,TEST,0,0 +9991,TEST,0,0 +9992,TEST,0,0 +9993,TEST,0,0 +9994,TEST,0,0 +9995,TEST,0,0 +9996,TEST,0,0 +9997,TEST,0,0 +9998,TEST,0,0 +9999,TEST,0,0 +10000,TEST,0,0 +10001,TEST,0,0 +10002,TEST,0,0 +10003,TEST,0,0 +10004,TEST,0,0 +10005,TEST,0,0 +10006,TEST,0,0 +10007,TEST,0,0 +10008,TEST,0,0 +10009,TEST,0,0 +10010,TEST,0,0 +10011,TEST,0,0 +10012,TEST,0,0 +10013,TEST,0,0 +10014,TEST,0,0 +10015,TEST,0,0 +10016,TEST,0,0 +10017,TEST,0,0 +10018,TEST,0,0 +10019,TEST,0,0 +10020,TEST,0,0 +10021,TEST,0,0 +10022,TEST,0,0 +10023,TEST,0,0 +10024,TEST,0,0 +10025,TEST,0,0 +10026,TEST,0,0 +10027,TEST,0,0 +10028,TEST,0,0 +10029,TEST,0,0 +10030,TEST,0,0 +10031,TEST,0,0 +10032,TEST,0,0 +10033,TEST,0,0 +10034,TEST,0,0 +10035,TEST,0,0 +10036,TEST,0,0 +10037,TEST,0,0 +10038,TEST,0,0 +10039,TEST,0,0 +10040,TEST,0,0 +10041,TEST,0,0 +10042,TEST,0,0 +10043,TEST,0,0 +10044,TEST,0,0 +10045,TEST,0,0 +10046,TEST,0,0 +10047,TEST,0,0 +10048,TEST,0,0 +10049,TEST,0,0 +10050,TEST,0,0 +10051,TEST,0,0 +10052,TEST,0,0 +10053,TEST,0,0 +10054,TEST,0,0 +10055,TEST,0,0 +10056,TEST,0,0 +10057,TEST,0,0 +10058,TEST,0,0 +10059,TEST,0,0 +10060,TEST,0,0 +10061,TEST,0,0 +10062,TEST,0,0 +10063,TEST,0,0 +10064,TEST,0,0 +10065,TEST,0,0 +10066,TEST,0,0 +10067,TEST,0,0 +10068,TEST,0,0 +10069,TEST,0,0 +10070,TEST,0,0 +10071,TEST,0,0 +10072,TEST,0,0 +10073,TEST,0,0 +10074,TEST,0,0 +10075,TEST,0,0 +10076,TEST,0,0 +10077,TEST,0,0 +10078,TEST,0,0 +10079,TEST,0,0 +10080,TEST,0,0 +10081,TEST,0,0 +10082,TEST,0,0 +10083,TEST,0,0 +10084,TEST,0,0 +10085,TEST,0,0 +10086,TEST,0,0 +10087,TEST,0,0 +10088,TEST,0,0 +10089,TEST,0,0 +10090,TEST,0,0 +10091,TEST,0,0 +10092,TEST,0,0 
+10093,TEST,0,0 +10094,TEST,0,0 +10095,TEST,0,0 +10096,TEST,0,0 +10097,TEST,0,0 +10098,TEST,0,0 +10099,TEST,0,0 +10100,TEST,0,0 +10101,TEST,0,0 +10102,TEST,0,0 +10103,TEST,0,0 +10104,TEST,0,0 +10105,TEST,0,0 +10106,TEST,0,0 +10107,TEST,0,0 +10108,TEST,0,0 +10109,TEST,0,0 +10110,TEST,0,0 +10111,TEST,0,0 +10112,TEST,0,0 +10113,TEST,0,0 +10114,TEST,0,0 +10115,TEST,0,0 +10116,TEST,0,0 +10117,TEST,0,0 +10118,TEST,0,0 +10119,TEST,0,0 +10120,TEST,0,0 +10121,TEST,0,0 +10122,TEST,0,0 +10123,TEST,0,0 +10124,TEST,0,0 +10125,TEST,0,0 +10126,TEST,0,0 +10127,TEST,0,0 +10128,TEST,0,0 +10129,TEST,0,0 +10130,TEST,0,0 +10131,TEST,0,0 +10132,TEST,0,0 +10133,TEST,0,0 +10134,TEST,0,0 +10135,TEST,0,0 +10136,TEST,0,0 +10137,TEST,0,0 +10138,TEST,0,0 +10139,TEST,0,0 +10140,TEST,0,0 +10141,TEST,0,0 +10142,TEST,0,0 +10143,TEST,0,0 +10144,TEST,0,0 +10145,TEST,0,0 +10146,TEST,0,0 +10147,TEST,0,0 +10148,TEST,0,0 +10149,TEST,0,0 +10150,TEST,0,0 +10151,TEST,0,0 +10152,TEST,0,0 +10153,TEST,0,0 +10154,TEST,0,0 +10155,TEST,0,0 +10156,TEST,0,0 +10157,TEST,0,0 +10158,TEST,0,0 +10159,TEST,0,0 +10160,TEST,0,0 +10161,TEST,0,0 +10162,TEST,0,0 +10163,TEST,0,0 +10164,TEST,0,0 +10165,TEST,0,0 +10166,TEST,0,0 +10167,TEST,0,0 +10168,TEST,0,0 +10169,TEST,0,0 +10170,TEST,0,0 +10171,TEST,0,0 +10172,TEST,0,0 +10173,TEST,0,0 +10174,TEST,0,0 +10175,TEST,0,0 +10176,TEST,0,0 +10177,TEST,0,0 +10178,TEST,0,0 +10179,TEST,0,0 +10180,TEST,0,0 +10181,TEST,0,0 +10182,TEST,0,0 +10183,TEST,0,0 +10184,TEST,0,0 +10185,TEST,0,0 +10186,TEST,0,0 +10187,TEST,0,0 +10188,TEST,0,0 +10189,TEST,0,0 +10190,TEST,0,0 +10191,TEST,0,0 +10192,TEST,0,0 +10193,TEST,0,0 +10194,TEST,0,0 +10195,TEST,0,0 +10196,TEST,0,0 +10197,TEST,0,0 +10198,TEST,0,0 +10199,TEST,0,0 +10200,TEST,0,0 +10201,TEST,0,0 +10202,TEST,0,0 +10203,TEST,0,0 +10204,TEST,0,0 +10205,TEST,0,0 +10206,TEST,0,0 +10207,TEST,0,0 +10208,TEST,0,0 +10209,TEST,0,0 +10210,TEST,0,0 +10211,TEST,0,0 +10212,TEST,0,0 +10213,TEST,0,0 +10214,TEST,0,0 +10215,TEST,0,0 +10216,TEST,0,0 +10217,TEST,0,0 +10218,TEST,0,0 +10219,TEST,0,0 +10220,TEST,0,0 +10221,TEST,0,0 +10222,TEST,0,0 +10223,TEST,0,0 +10224,TEST,0,0 +10225,TEST,0,0 +10226,TEST,0,0 +10227,TEST,0,0 +10228,TEST,0,0 +10229,TEST,0,0 +10230,TEST,0,0 +10231,TEST,0,0 +10232,TEST,0,0 +10233,TEST,0,0 +10234,TEST,0,0 +10235,TEST,0,0 +10236,TEST,0,0 +10237,TEST,0,0 +10238,TEST,0,0 +10239,TEST,0,0 +10240,TEST,0,0 +10241,TEST,0,0 +10242,TEST,0,0 +10243,TEST,0,0 +10244,TEST,0,0 +10245,TEST,0,0 +10246,TEST,0,0 +10247,TEST,0,0 +10248,TEST,0,0 +10249,TEST,0,0 +10250,TEST,0,0 +10251,TEST,0,0 +10252,TEST,0,0 +10253,TEST,0,0 +10254,TEST,0,0 +10255,TEST,0,0 +10256,TEST,0,0 +10257,TEST,0,0 +10258,TEST,0,0 +10259,TEST,0,0 +10260,TEST,0,0 +10261,TEST,0,0 +10262,TEST,0,0 +10263,TEST,0,0 +10264,TEST,0,0 +10265,TEST,0,0 +10266,TEST,0,0 +10267,TEST,0,0 +10268,TEST,0,0 +10269,TEST,0,0 +10270,TEST,0,0 +10271,TEST,0,0 +10272,TEST,0,0 +10273,TEST,0,0 +10274,TEST,0,0 +10275,TEST,0,0 +10276,TEST,0,0 +10277,TEST,0,0 +10278,TEST,0,0 +10279,TEST,0,0 +10280,TEST,0,0 +10281,TEST,0,0 +10282,TEST,0,0 +10283,TEST,0,0 +10284,TEST,0,0 +10285,TEST,0,0 +10286,TEST,0,0 +10287,TEST,0,0 +10288,TEST,0,0 +10289,TEST,0,0 +10290,TEST,0,0 +10291,TEST,0,0 +10292,TEST,0,0 +10293,TEST,0,0 +10294,TEST,0,0 +10295,TEST,0,0 +10296,TEST,0,0 +10297,TEST,0,0 +10298,TEST,0,0 +10299,TEST,0,0 +10300,TEST,0,0 +10301,TEST,0,0 +10302,TEST,0,0 +10303,TEST,0,0 +10304,TEST,0,0 +10305,TEST,0,0 +10306,TEST,0,0 +10307,TEST,0,0 +10308,TEST,0,0 +10309,TEST,0,0 +10310,TEST,0,0 +10311,TEST,0,0 +10312,TEST,0,0 +10313,TEST,0,0 +10314,TEST,0,0 
+10315,TEST,0,0 +10316,TEST,0,0 +10317,TEST,0,0 +10318,TEST,0,0 +10319,TEST,0,0 +10320,TEST,0,0 +10321,TEST,0,0 +10322,TEST,0,0 +10323,TEST,0,0 +10324,TEST,0,0 +10325,TEST,0,0 +10326,TEST,0,0 +10327,TEST,0,0 +10328,TEST,0,0 +10329,TEST,0,0 +10330,TEST,0,0 +10331,TEST,0,0 +10332,TEST,0,0 +10333,TEST,0,0 +10334,TEST,0,0 +10335,TEST,0,0 +10336,TEST,0,0 +10337,TEST,0,0 +10338,TEST,0,0 +10339,TEST,0,0 +10340,TEST,0,0 +10341,TEST,0,0 +10342,TEST,0,0 +10343,TEST,0,0 +10344,TEST,0,0 +10345,TEST,0,0 +10346,TEST,0,0 +10347,TEST,0,0 +10348,TEST,0,0 +10349,TEST,0,0 +10350,TEST,0,0 +10351,TEST,0,0 +10352,TEST,0,0 +10353,TEST,0,0 +10354,TEST,0,0 +10355,TEST,0,0 +10356,TEST,0,0 +10357,TEST,0,0 +10358,TEST,0,0 +10359,TEST,0,0 +10360,TEST,0,0 +10361,TEST,0,0 +10362,TEST,0,0 +10363,TEST,0,0 +10364,TEST,0,0 +10365,TEST,0,0 +10366,TEST,0,0 +10367,TEST,0,0 +10368,TEST,0,0 +10369,TEST,0,0 +10370,TEST,0,0 +10371,TEST,0,0 +10372,TEST,0,0 +10373,TEST,0,0 +10374,TEST,0,0 +10375,TEST,0,0 +10376,TEST,0,0 +10377,TEST,0,0 +10378,TEST,0,0 +10379,TEST,0,0 +10380,TEST,0,0 +10381,TEST,0,0 +10382,TEST,0,0 +10383,TEST,0,0 +10384,TEST,0,0 +10385,TEST,0,0 +10386,TEST,0,0 +10387,TEST,0,0 +10388,TEST,0,0 +10389,TEST,0,0 +10390,TEST,0,0 +10391,TEST,0,0 +10392,TEST,0,0 +10393,TEST,0,0 +10394,TEST,0,0 +10395,TEST,0,0 +10396,TEST,0,0 +10397,TEST,0,0 +10398,TEST,0,0 +10399,TEST,0,0 +10400,TEST,0,0 +10401,TEST,0,0 +10402,TEST,0,0 +10403,TEST,0,0 +10404,TEST,0,0 +10405,TEST,0,0 +10406,TEST,0,0 +10407,TEST,0,0 +10408,TEST,0,0 +10409,TEST,0,0 +10410,TEST,0,0 +10411,TEST,0,0 +10412,TEST,0,0 +10413,TEST,0,0 +10414,TEST,0,0 +10415,TEST,0,0 +10416,TEST,0,0 +10417,TEST,0,0 +10418,TEST,0,0 +10419,TEST,0,0 +10420,TEST,0,0 +10421,TEST,0,0 +10422,TEST,0,0 +10423,TEST,0,0 +10424,TEST,0,0 +10425,TEST,0,0 +10426,TEST,0,0 +10427,TEST,0,0 +10428,TEST,0,0 +10429,TEST,0,0 +10430,TEST,0,0 +10431,TEST,0,0 +10432,TEST,0,0 +10433,TEST,0,0 +10434,TEST,0,0 +10435,TEST,0,0 +10436,TEST,0,0 +10437,TEST,0,0 +10438,TEST,0,0 +10439,TEST,0,0 +10440,TEST,0,0 +10441,TEST,0,0 +10442,TEST,0,0 +10443,TEST,0,0 +10444,TEST,0,0 +10445,TEST,0,0 +10446,TEST,0,0 +10447,TEST,0,0 +10448,TEST,0,0 +10449,TEST,0,0 +10450,TEST,0,0 +10451,TEST,0,0 +10452,TEST,0,0 +10453,TEST,0,0 +10454,TEST,0,0 +10455,TEST,0,0 +10456,TEST,0,0 +10457,TEST,0,0 +10458,TEST,0,0 +10459,TEST,0,0 +10460,TEST,0,0 +10461,TEST,0,0 +10462,TEST,0,0 +10463,TEST,0,0 +10464,TEST,0,0 +10465,TEST,0,0 +10466,TEST,0,0 +10467,TEST,0,0 +10468,TEST,0,0 +10469,TEST,0,0 +10470,TEST,0,0 +10471,TEST,0,0 +10472,TEST,0,0 +10473,TEST,0,0 +10474,TEST,0,0 +10475,TEST,0,0 +10476,TEST,0,0 +10477,TEST,0,0 +10478,TEST,0,0 +10479,TEST,0,0 +10480,TEST,0,0 +10481,TEST,0,0 +10482,TEST,0,0 +10483,TEST,0,0 +10484,TEST,0,0 +10485,TEST,0,0 +10486,TEST,0,0 +10487,TEST,0,0 +10488,TEST,0,0 +10489,TEST,0,0 +10490,TEST,0,0 +10491,TEST,0,0 +10492,TEST,0,0 +10493,TEST,0,0 +10494,TEST,0,0 +10495,TEST,0,0 +10496,TEST,0,0 +10497,TEST,0,0 +10498,TEST,0,0 +10499,TEST,0,0 +10500,TEST,0,0 +10501,TEST,0,0 +10502,TEST,0,0 +10503,TEST,0,0 +10504,TEST,0,0 +10505,TEST,0,0 +10506,TEST,0,0 +10507,TEST,0,0 +10508,TEST,0,0 +10509,TEST,0,0 +10510,TEST,0,0 +10511,TEST,0,0 +10512,TEST,0,0 +10513,TEST,0,0 +10514,TEST,0,0 +10515,TEST,0,0 +10516,TEST,0,0 +10517,TEST,0,0 +10518,TEST,0,0 +10519,TEST,0,0 +10520,TEST,0,0 +10521,TEST,0,0 +10522,TEST,0,0 +10523,TEST,0,0 +10524,TEST,0,0 +10525,TEST,0,0 +10526,TEST,0,0 +10527,TEST,0,0 +10528,TEST,0,0 +10529,TEST,0,0 +10530,TEST,0,0 +10531,TEST,0,0 +10532,TEST,0,0 +10533,TEST,0,0 +10534,TEST,0,0 +10535,TEST,0,0 +10536,TEST,0,0 
+10537,TEST,0,0 +10538,TEST,0,0 +10539,TEST,0,0 +10540,TEST,0,0 +10541,TEST,0,0 +10542,TEST,0,0 +10543,TEST,0,0 +10544,TEST,0,0 +10545,TEST,0,0 +10546,TEST,0,0 +10547,TEST,0,0 +10548,TEST,0,0 +10549,TEST,0,0 +10550,TEST,0,0 +10551,TEST,0,0 +10552,TEST,0,0 +10553,TEST,0,0 +10554,TEST,0,0 +10555,TEST,0,0 +10556,TEST,0,0 +10557,TEST,0,0 +10558,TEST,0,0 +10559,TEST,0,0 +10560,TEST,0,0 +10561,TEST,0,0 +10562,TEST,0,0 +10563,TEST,0,0 +10564,TEST,0,0 +10565,TEST,0,0 +10566,TEST,0,0 +10567,TEST,0,0 +10568,TEST,0,0 +10569,TEST,0,0 +10570,TEST,0,0 +10571,TEST,0,0 +10572,TEST,0,0 +10573,TEST,0,0 +10574,TEST,0,0 +10575,TEST,0,0 +10576,TEST,0,0 +10577,TEST,0,0 +10578,TEST,0,0 +10579,TEST,0,0 +10580,TEST,0,0 +10581,TEST,0,0 +10582,TEST,0,0 +10583,TEST,0,0 +10584,TEST,0,0 +10585,TEST,0,0 +10586,TEST,0,0 +10587,TEST,0,0 +10588,TEST,0,0 +10589,TEST,0,0 +10590,TEST,0,0 +10591,TEST,0,0 +10592,TEST,0,0 +10593,TEST,0,0 +10594,TEST,0,0 +10595,TEST,0,0 +10596,TEST,0,0 +10597,TEST,0,0 +10598,TEST,0,0 +10599,TEST,0,0 +10600,TEST,0,0 +10601,TEST,0,0 +10602,TEST,0,0 +10603,TEST,0,0 +10604,TEST,0,0 +10605,TEST,0,0 +10606,TEST,0,0 +10607,TEST,0,0 +10608,TEST,0,0 +10609,TEST,0,0 +10610,TEST,0,0 +10611,TEST,0,0 +10612,TEST,0,0 +10613,TEST,0,0 +10614,TEST,0,0 +10615,TEST,0,0 +10616,TEST,0,0 +10617,TEST,0,0 +10618,TEST,0,0 +10619,TEST,0,0 +10620,TEST,0,0 +10621,TEST,0,0 +10622,TEST,0,0 +10623,TEST,0,0 +10624,TEST,0,0 +10625,TEST,0,0 +10626,TEST,0,0 +10627,TEST,0,0 +10628,TEST,0,0 +10629,TEST,0,0 +10630,TEST,0,0 +10631,TEST,0,0 +10632,TEST,0,0 +10633,TEST,0,0 +10634,TEST,0,0 +10635,TEST,0,0 +10636,TEST,0,0 +10637,TEST,0,0 +10638,TEST,0,0 +10639,TEST,0,0 +10640,TEST,0,0 +10641,TEST,0,0 +10642,TEST,0,0 +10643,TEST,0,0 +10644,TEST,0,0 +10645,TEST,0,0 +10646,TEST,0,0 +10647,TEST,0,0 +10648,TEST,0,0 +10649,TEST,0,0 +10650,TEST,0,0 +10651,TEST,0,0 +10652,TEST,0,0 +10653,TEST,0,0 +10654,TEST,0,0 +10655,TEST,0,0 +10656,TEST,0,0 +10657,TEST,0,0 +10658,TEST,0,0 +10659,TEST,0,0 +10660,TEST,0,0 +10661,TEST,0,0 +10662,TEST,0,0 +10663,TEST,0,0 +10664,TEST,0,0 +10665,TEST,0,0 +10666,TEST,0,0 +10667,TEST,0,0 +10668,TEST,0,0 +10669,TEST,0,0 +10670,TEST,0,0 +10671,TEST,0,0 +10672,TEST,0,0 +10673,TEST,0,0 +10674,TEST,0,0 +10675,TEST,0,0 +10676,TEST,0,0 +10677,TEST,0,0 +10678,TEST,0,0 +10679,TEST,0,0 +10680,TEST,0,0 +10681,TEST,0,0 +10682,TEST,0,0 +10683,TEST,0,0 +10684,TEST,0,0 +10685,TEST,0,0 +10686,TEST,0,0 +10687,TEST,0,0 +10688,TEST,0,0 +10689,TEST,0,0 +10690,TEST,0,0 +10691,TEST,0,0 +10692,TEST,0,0 +10693,TEST,0,0 +10694,TEST,0,0 +10695,TEST,0,0 +10696,TEST,0,0 +10697,TEST,0,0 +10698,TEST,0,0 +10699,TEST,0,0 +10700,TEST,0,0 +10701,TEST,0,0 +10702,TEST,0,0 +10703,TEST,0,0 +10704,TEST,0,0 +10705,TEST,0,0 +10706,TEST,0,0 +10707,TEST,0,0 +10708,TEST,0,0 +10709,TEST,0,0 +10710,TEST,0,0 +10711,TEST,0,0 +10712,TEST,0,0 +10713,TEST,0,0 +10714,TEST,0,0 +10715,TEST,0,0 +10716,TEST,0,0 +10717,TEST,0,0 +10718,TEST,0,0 +10719,TEST,0,0 +10720,TEST,0,0 +10721,TEST,0,0 +10722,TEST,0,0 +10723,TEST,0,0 +10724,TEST,0,0 +10725,TEST,0,0 +10726,TEST,0,0 +10727,TEST,0,0 +10728,TEST,0,0 +10729,TEST,0,0 +10730,TEST,0,0 +10731,TEST,0,0 +10732,TEST,0,0 +10733,TEST,0,0 +10734,TEST,0,0 +10735,TEST,0,0 +10736,TEST,0,0 +10737,TEST,0,0 +10738,TEST,0,0 +10739,TEST,0,0 +10740,TEST,0,0 +10741,TEST,0,0 +10742,TEST,0,0 +10743,TEST,0,0 +10744,TEST,0,0 +10745,TEST,0,0 +10746,TEST,0,0 +10747,TEST,0,0 +10748,TEST,0,0 +10749,TEST,0,0 +10750,TEST,0,0 +10751,TEST,0,0 +10752,TEST,0,0 +10753,TEST,0,0 +10754,TEST,0,0 +10755,TEST,0,0 +10756,TEST,0,0 +10757,TEST,0,0 +10758,TEST,0,0 
+10759,TEST,0,0 +10760,TEST,0,0 +10761,TEST,0,0 +10762,TEST,0,0 +10763,TEST,0,0 +10764,TEST,0,0 +10765,TEST,0,0 +10766,TEST,0,0 +10767,TEST,0,0 +10768,TEST,0,0 +10769,TEST,0,0 +10770,TEST,0,0 +10771,TEST,0,0 +10772,TEST,0,0 +10773,TEST,0,0 +10774,TEST,0,0 +10775,TEST,0,0 +10776,TEST,0,0 +10777,TEST,0,0 +10778,TEST,0,0 +10779,TEST,0,0 +10780,TEST,0,0 +10781,TEST,0,0 +10782,TEST,0,0 +10783,TEST,0,0 +10784,TEST,0,0 +10785,TEST,0,0 +10786,TEST,0,0 +10787,TEST,0,0 +10788,TEST,0,0 +10789,TEST,0,0 +10790,TEST,0,0 +10791,TEST,0,0 +10792,TEST,0,0 +10793,TEST,0,0 +10794,TEST,0,0 +10795,TEST,0,0 +10796,TEST,0,0 +10797,TEST,0,0 +10798,TEST,0,0 +10799,TEST,0,0 +10800,TEST,0,0 +10801,TEST,0,0 +10802,TEST,0,0 +10803,TEST,0,0 +10804,TEST,0,0 +10805,TEST,0,0 +10806,TEST,0,0 +10807,TEST,0,0 +10808,TEST,0,0 +10809,TEST,0,0 +10810,TEST,0,0 +10811,TEST,0,0 +10812,TEST,0,0 +10813,TEST,0,0 +10814,TEST,0,0 +10815,TEST,0,0 +10816,TEST,0,0 +10817,TEST,0,0 +10818,TEST,0,0 +10819,TEST,0,0 +10820,TEST,0,0 +10821,TEST,0,0 +10822,TEST,0,0 +10823,TEST,0,0 +10824,TEST,0,0 +10825,TEST,0,0 +10826,TEST,0,0 +10827,TEST,0,0 +10828,TEST,0,0 +10829,TEST,0,0 +10830,TEST,0,0 +10831,TEST,0,0 +10832,TEST,0,0 +10833,TEST,0,0 +10834,TEST,0,0 +10835,TEST,0,0 +10836,TEST,0,0 +10837,TEST,0,0 +10838,TEST,0,0 +10839,TEST,0,0 +10840,TEST,0,0 +10841,TEST,0,0 +10842,TEST,0,0 +10843,TEST,0,0 +10844,TEST,0,0 +10845,TEST,0,0 +10846,TEST,0,0 +10847,TEST,0,0 +10848,TEST,0,0 +10849,TEST,0,0 +10850,TEST,0,0 +10851,TEST,0,0 +10852,TEST,0,0 +10853,TEST,0,0 +10854,TEST,0,0 +10855,TEST,0,0 +10856,TEST,0,0 +10857,TEST,0,0 +10858,TEST,0,0 +10859,TEST,0,0 +10860,TEST,0,0 +10861,TEST,0,0 +10862,TEST,0,0 +10863,TEST,0,0 +10864,TEST,0,0 +10865,TEST,0,0 +10866,TEST,0,0 +10867,TEST,0,0 +10868,TEST,0,0 +10869,TEST,0,0 +10870,TEST,0,0 +10871,TEST,0,0 +10872,TEST,0,0 +10873,TEST,0,0 +10874,TEST,0,0 +10875,TEST,0,0 +10876,TEST,0,0 +10877,TEST,0,0 +10878,TEST,0,0 +10879,TEST,0,0 +10880,TEST,0,0 +10881,TEST,0,0 +10882,TEST,0,0 +10883,TEST,0,0 +10884,TEST,0,0 +10885,TEST,0,0 +10886,TEST,0,0 +10887,TEST,0,0 +10888,TEST,0,0 +10889,TEST,0,0 +10890,TEST,0,0 +10891,TEST,0,0 +10892,TEST,0,0 +10893,TEST,0,0 +10894,TEST,0,0 +10895,TEST,0,0 +10896,TEST,0,0 +10897,TEST,0,0 +10898,TEST,0,0 +10899,TEST,0,0 +10900,TEST,0,0 +10901,TEST,0,0 +10902,TEST,0,0 +10903,TEST,0,0 +10904,TEST,0,0 +10905,TEST,0,0 +10906,TEST,0,0 +10907,TEST,0,0 +10908,TEST,0,0 +10909,TEST,0,0 +10910,TEST,0,0 +10911,TEST,0,0 +10912,TEST,0,0 +10913,TEST,0,0 +10914,TEST,0,0 +10915,TEST,0,0 +10916,TEST,0,0 +10917,TEST,0,0 +10918,TEST,0,0 +10919,TEST,0,0 +10920,TEST,0,0 +10921,TEST,0,0 +10922,TEST,0,0 +10923,TEST,0,0 +10924,TEST,0,0 +10925,TEST,0,0 +10926,TEST,0,0 +10927,TEST,0,0 +10928,TEST,0,0 +10929,TEST,0,0 +10930,TEST,0,0 +10931,TEST,0,0 +10932,TEST,0,0 +10933,TEST,0,0 +10934,TEST,0,0 +10935,TEST,0,0 +10936,TEST,0,0 +10937,TEST,0,0 +10938,TEST,0,0 +10939,TEST,0,0 +10940,TEST,0,0 +10941,TEST,0,0 +10942,TEST,0,0 +10943,TEST,0,0 +10944,TEST,0,0 +10945,TEST,0,0 +10946,TEST,0,0 +10947,TEST,0,0 +10948,TEST,0,0 +10949,TEST,0,0 +10950,TEST,0,0 +10951,TEST,0,0 +10952,TEST,0,0 +10953,TEST,0,0 +10954,TEST,0,0 +10955,TEST,0,0 +10956,TEST,0,0 +10957,TEST,0,0 +10958,TEST,0,0 +10959,TEST,0,0 +10960,TEST,0,0 +10961,TEST,0,0 +10962,TEST,0,0 +10963,TEST,0,0 +10964,TEST,0,0 +10965,TEST,0,0 +10966,TEST,0,0 +10967,TEST,0,0 +10968,TEST,0,0 +10969,TEST,0,0 +10970,TEST,0,0 +10971,TEST,0,0 +10972,TEST,0,0 +10973,TEST,0,0 +10974,TEST,0,0 +10975,TEST,0,0 +10976,TEST,0,0 +10977,TEST,0,0 +10978,TEST,0,0 +10979,TEST,0,0 +10980,TEST,0,0 
+10981,TEST,0,0 +10982,TEST,0,0 +10983,TEST,0,0 +10984,TEST,0,0 +10985,TEST,0,0 +10986,TEST,0,0 +10987,TEST,0,0 +10988,TEST,0,0 +10989,TEST,0,0 +10990,TEST,0,0 +10991,TEST,0,0 +10992,TEST,0,0 +10993,TEST,0,0 +10994,TEST,0,0 +10995,TEST,0,0 +10996,TEST,0,0 +10997,TEST,0,0 +10998,TEST,0,0 +10999,TEST,0,0 +11000,TEST,0,0 +11001,TEST,0,0 +11002,TEST,0,0 +11003,TEST,0,0 +11004,TEST,0,0 +11005,TEST,0,0 +11006,TEST,0,0 +11007,TEST,0,0 +11008,TEST,0,0 +11009,TEST,0,0 +11010,TEST,0,0 +11011,TEST,0,0 +11012,TEST,0,0 +11013,TEST,0,0 +11014,TEST,0,0 +11015,TEST,0,0 +11016,TEST,0,0 +11017,TEST,0,0 +11018,TEST,0,0 +11019,TEST,0,0 +11020,TEST,0,0 +11021,TEST,0,0 +11022,TEST,0,0 +11023,TEST,0,0 +11024,TEST,0,0 +11025,TEST,0,0 +11026,TEST,0,0 +11027,TEST,0,0 +11028,TEST,0,0 +11029,TEST,0,0 +11030,TEST,0,0 +11031,TEST,0,0 +11032,TEST,0,0 +11033,TEST,0,0 +11034,TEST,0,0 +11035,TEST,0,0 +11036,TEST,0,0 +11037,TEST,0,0 +11038,TEST,0,0 +11039,TEST,0,0 +11040,TEST,0,0 +11041,TEST,0,0 +11042,TEST,0,0 +11043,TEST,0,0 +11044,TEST,0,0 +11045,TEST,0,0 +11046,TEST,0,0 +11047,TEST,0,0 +11048,TEST,0,0 +11049,TEST,0,0 +11050,TEST,0,0 +11051,TEST,0,0 +11052,TEST,0,0 +11053,TEST,0,0 +11054,TEST,0,0 +11055,TEST,0,0 +11056,TEST,0,0 +11057,TEST,0,0 +11058,TEST,0,0 +11059,TEST,0,0 +11060,TEST,0,0 +11061,TEST,0,0 +11062,TEST,0,0 +11063,TEST,0,0 +11064,TEST,0,0 +11065,TEST,0,0 +11066,TEST,0,0 +11067,TEST,0,0 +11068,TEST,0,0 +11069,TEST,0,0 +11070,TEST,0,0 +11071,TEST,0,0 +11072,TEST,0,0 +11073,TEST,0,0 +11074,TEST,0,0 +11075,TEST,0,0 +11076,TEST,0,0 +11077,TEST,0,0 +11078,TEST,0,0 +11079,TEST,0,0 +11080,TEST,0,0 +11081,TEST,0,0 +11082,TEST,0,0 +11083,TEST,0,0 +11084,TEST,0,0 +11085,TEST,0,0 +11086,TEST,0,0 +11087,TEST,0,0 +11088,TEST,0,0 +11089,TEST,0,0 +11090,TEST,0,0 +11091,TEST,0,0 +11092,TEST,0,0 +11093,TEST,0,0 +11094,TEST,0,0 +11095,TEST,0,0 +11096,TEST,0,0 +11097,TEST,0,0 +11098,TEST,0,0 +11099,TEST,0,0 +11100,TEST,0,0 +11101,TEST,0,0 +11102,TEST,0,0 +11103,TEST,0,0 +11104,TEST,0,0 +11105,TEST,0,0 +11106,TEST,0,0 +11107,TEST,0,0 +11108,TEST,0,0 +11109,TEST,0,0 +11110,TEST,0,0 +11111,TEST,0,0 +11112,TEST,0,0 +11113,TEST,0,0 +11114,TEST,0,0 +11115,TEST,0,0 +11116,TEST,0,0 +11117,TEST,0,0 +11118,TEST,0,0 +11119,TEST,0,0 +11120,TEST,0,0 +11121,TEST,0,0 +11122,TEST,0,0 +11123,TEST,0,0 +11124,TEST,0,0 +11125,TEST,0,0 +11126,TEST,0,0 +11127,TEST,0,0 +11128,TEST,0,0 +11129,TEST,0,0 +11130,TEST,0,0 +11131,TEST,0,0 +11132,TEST,0,0 +11133,TEST,0,0 +11134,TEST,0,0 +11135,TEST,0,0 +11136,TEST,0,0 +11137,TEST,0,0 +11138,TEST,0,0 +11139,TEST,0,0 +11140,TEST,0,0 +11141,TEST,0,0 +11142,TEST,0,0 +11143,TEST,0,0 +11144,TEST,0,0 +11145,TEST,0,0 +11146,TEST,0,0 +11147,TEST,0,0 +11148,TEST,0,0 +11149,TEST,0,0 +11150,TEST,0,0 +11151,TEST,0,0 +11152,TEST,0,0 +11153,TEST,0,0 +11154,TEST,0,0 +11155,TEST,0,0 +11156,TEST,0,0 +11157,TEST,0,0 +11158,TEST,0,0 +11159,TEST,0,0 +11160,TEST,0,0 +11161,TEST,0,0 +11162,TEST,0,0 +11163,TEST,0,0 +11164,TEST,0,0 +11165,TEST,0,0 +11166,TEST,0,0 +11167,TEST,0,0 +11168,TEST,0,0 +11169,TEST,0,0 +11170,TEST,0,0 +11171,TEST,0,0 +11172,TEST,0,0 +11173,TEST,0,0 +11174,TEST,0,0 +11175,TEST,0,0 +11176,TEST,0,0 +11177,TEST,0,0 +11178,TEST,0,0 +11179,TEST,0,0 +11180,TEST,0,0 +11181,TEST,0,0 +11182,TEST,0,0 +11183,TEST,0,0 +11184,TEST,0,0 +11185,TEST,0,0 +11186,TEST,0,0 +11187,TEST,0,0 +11188,TEST,0,0 +11189,TEST,0,0 +11190,TEST,0,0 +11191,TEST,0,0 +11192,TEST,0,0 +11193,TEST,0,0 +11194,TEST,0,0 +11195,TEST,0,0 +11196,TEST,0,0 +11197,TEST,0,0 +11198,TEST,0,0 +11199,TEST,0,0 +11200,TEST,0,0 +11201,TEST,0,0 +11202,TEST,0,0 
+11203,TEST,0,0 +11204,TEST,0,0 +11205,TEST,0,0 +11206,TEST,0,0 +11207,TEST,0,0 +11208,TEST,0,0 +11209,TEST,0,0 +11210,TEST,0,0 +11211,TEST,0,0 +11212,TEST,0,0 +11213,TEST,0,0 +11214,TEST,0,0 +11215,TEST,0,0 +11216,TEST,0,0 +11217,TEST,0,0 +11218,TEST,0,0 +11219,TEST,0,0 +11220,TEST,0,0 +11221,TEST,0,0 +11222,TEST,0,0 +11223,TEST,0,0 +11224,TEST,0,0 +11225,TEST,0,0 +11226,TEST,0,0 +11227,TEST,0,0 +11228,TEST,0,0 +11229,TEST,0,0 +11230,TEST,0,0 +11231,TEST,0,0 +11232,TEST,0,0 +11233,TEST,0,0 +11234,TEST,0,0 +11235,TEST,0,0 +11236,TEST,0,0 +11237,TEST,0,0 +11238,TEST,0,0 +11239,TEST,0,0 +11240,TEST,0,0 +11241,TEST,0,0 +11242,TEST,0,0 +11243,TEST,0,0 +11244,TEST,0,0 +11245,TEST,0,0 +11246,TEST,0,0 +11247,TEST,0,0 +11248,TEST,0,0 +11249,TEST,0,0 +11250,TEST,0,0 +11251,TEST,0,0 +11252,TEST,0,0 +11253,TEST,0,0 +11254,TEST,0,0 +11255,TEST,0,0 +11256,TEST,0,0 +11257,TEST,0,0 +11258,TEST,0,0 +11259,TEST,0,0 +11260,TEST,0,0 +11261,TEST,0,0 +11262,TEST,0,0 +11263,TEST,0,0 +11264,TEST,0,0 +11265,TEST,0,0 +11266,TEST,0,0 +11267,TEST,0,0 +11268,TEST,0,0 +11269,TEST,0,0 +11270,TEST,0,0 +11271,TEST,0,0 +11272,TEST,0,0 +11273,TEST,0,0 +11274,TEST,0,0 +11275,TEST,0,0 +11276,TEST,0,0 +11277,TEST,0,0 +11278,TEST,0,0 +11279,TEST,0,0 +11280,TEST,0,0 +11281,TEST,0,0 +11282,TEST,0,0 +11283,TEST,0,0 +11284,TEST,0,0 +11285,TEST,0,0 +11286,TEST,0,0 +11287,TEST,0,0 +11288,TEST,0,0 +11289,TEST,0,0 +11290,TEST,0,0 +11291,TEST,0,0 +11292,TEST,0,0 +11293,TEST,0,0 +11294,TEST,0,0 +11295,TEST,0,0 +11296,TEST,0,0 +11297,TEST,0,0 +11298,TEST,0,0 +11299,TEST,0,0 +11300,TEST,0,0 +11301,TEST,0,0 +11302,TEST,0,0 +11303,TEST,0,0 +11304,TEST,0,0 +11305,TEST,0,0 +11306,TEST,0,0 +11307,TEST,0,0 +11308,TEST,0,0 +11309,TEST,0,0 +11310,TEST,0,0 +11311,TEST,0,0 +11312,TEST,0,0 +11313,TEST,0,0 +11314,TEST,0,0 +11315,TEST,0,0 +11316,TEST,0,0 +11317,TEST,0,0 +11318,TEST,0,0 +11319,TEST,0,0 +11320,TEST,0,0 +11321,TEST,0,0 +11322,TEST,0,0 +11323,TEST,0,0 +11324,TEST,0,0 +11325,TEST,0,0 +11326,TEST,0,0 +11327,TEST,0,0 +11328,TEST,0,0 +11329,TEST,0,0 +11330,TEST,0,0 +11331,TEST,0,0 +11332,TEST,0,0 +11333,TEST,0,0 +11334,TEST,0,0 +11335,TEST,0,0 +11336,TEST,0,0 +11337,TEST,0,0 +11338,TEST,0,0 +11339,TEST,0,0 +11340,TEST,0,0 +11341,TEST,0,0 +11342,TEST,0,0 +11343,TEST,0,0 +11344,TEST,0,0 +11345,TEST,0,0 +11346,TEST,0,0 +11347,TEST,0,0 +11348,TEST,0,0 +11349,TEST,0,0 +11350,TEST,0,0 +11351,TEST,0,0 +11352,TEST,0,0 +11353,TEST,0,0 +11354,TEST,0,0 +11355,TEST,0,0 +11356,TEST,0,0 +11357,TEST,0,0 +11358,TEST,0,0 +11359,TEST,0,0 +11360,TEST,0,0 +11361,TEST,0,0 +11362,TEST,0,0 +11363,TEST,0,0 +11364,TEST,0,0 +11365,TEST,0,0 +11366,TEST,0,0 +11367,TEST,0,0 +11368,TEST,0,0 +11369,TEST,0,0 +11370,TEST,0,0 +11371,TEST,0,0 +11372,TEST,0,0 +11373,TEST,0,0 +11374,TEST,0,0 +11375,TEST,0,0 +11376,TEST,0,0 +11377,TEST,0,0 +11378,TEST,0,0 +11379,TEST,0,0 +11380,TEST,0,0 +11381,TEST,0,0 +11382,TEST,0,0 +11383,TEST,0,0 +11384,TEST,0,0 +11385,TEST,0,0 +11386,TEST,0,0 +11387,TEST,0,0 +11388,TEST,0,0 +11389,TEST,0,0 +11390,TEST,0,0 +11391,TEST,0,0 +11392,TEST,0,0 +11393,TEST,0,0 +11394,TEST,0,0 +11395,TEST,0,0 +11396,TEST,0,0 +11397,TEST,0,0 +11398,TEST,0,0 +11399,TEST,0,0 +11400,TEST,0,0 +11401,TEST,0,0 +11402,TEST,0,0 +11403,TEST,0,0 +11404,TEST,0,0 +11405,TEST,0,0 +11406,TEST,0,0 +11407,TEST,0,0 +11408,TEST,0,0 +11409,TEST,0,0 +11410,TEST,0,0 +11411,TEST,0,0 +11412,TEST,0,0 +11413,TEST,0,0 +11414,TEST,0,0 +11415,TEST,0,0 +11416,TEST,0,0 +11417,TEST,0,0 +11418,TEST,0,0 +11419,TEST,0,0 +11420,TEST,0,0 +11421,TEST,0,0 +11422,TEST,0,0 +11423,TEST,0,0 +11424,TEST,0,0 
+11425,TEST,0,0 +11426,TEST,0,0 +11427,TEST,0,0 +11428,TEST,0,0 +11429,TEST,0,0 +11430,TEST,0,0 +11431,TEST,0,0 +11432,TEST,0,0 +11433,TEST,0,0 +11434,TEST,0,0 +11435,TEST,0,0 +11436,TEST,0,0 +11437,TEST,0,0 +11438,TEST,0,0 +11439,TEST,0,0 +11440,TEST,0,0 +11441,TEST,0,0 +11442,TEST,0,0 +11443,TEST,0,0 +11444,TEST,0,0 +11445,TEST,0,0 +11446,TEST,0,0 +11447,TEST,0,0 +11448,TEST,0,0 +11449,TEST,0,0 +11450,TEST,0,0 +11451,TEST,0,0 +11452,TEST,0,0 +11453,TEST,0,0 +11454,TEST,0,0 +11455,TEST,0,0 +11456,TEST,0,0 +11457,TEST,0,0 +11458,TEST,0,0 +11459,TEST,0,0 +11460,TEST,0,0 +11461,TEST,0,0 +11462,TEST,0,0 +11463,TEST,0,0 +11464,TEST,0,0 +11465,TEST,0,0 +11466,TEST,0,0 +11467,TEST,0,0 +11468,TEST,0,0 +11469,TEST,0,0 +11470,TEST,0,0 +11471,TEST,0,0 +11472,TEST,0,0 +11473,TEST,0,0 +11474,TEST,0,0 +11475,TEST,0,0 +11476,TEST,0,0 +11477,TEST,0,0 +11478,TEST,0,0 +11479,TEST,0,0 +11480,TEST,0,0 +11481,TEST,0,0 +11482,TEST,0,0 +11483,TEST,0,0 +11484,TEST,0,0 +11485,TEST,0,0 +11486,TEST,0,0 +11487,TEST,0,0 +11488,TEST,0,0 +11489,TEST,0,0 +11490,TEST,0,0 +11491,TEST,0,0 +11492,TEST,0,0 +11493,TEST,0,0 +11494,TEST,0,0 +11495,TEST,0,0 +11496,TEST,0,0 +11497,TEST,0,0 +11498,TEST,0,0 +11499,TEST,0,0 +11500,TEST,0,0 +11501,TEST,0,0 +11502,TEST,0,0 +11503,TEST,0,0 +11504,TEST,0,0 +11505,TEST,0,0 +11506,TEST,0,0 +11507,TEST,0,0 +11508,TEST,0,0 +11509,TEST,0,0 +11510,TEST,0,0 +11511,TEST,0,0 +11512,TEST,0,0 +11513,TEST,0,0 +11514,TEST,0,0 +11515,TEST,0,0 +11516,TEST,0,0 +11517,TEST,0,0 +11518,TEST,0,0 +11519,TEST,0,0 +11520,TEST,0,0 +11521,TEST,0,0 +11522,TEST,0,0 +11523,TEST,0,0 +11524,TEST,0,0 +11525,TEST,0,0 +11526,TEST,0,0 +11527,TEST,0,0 +11528,TEST,0,0 +11529,TEST,0,0 +11530,TEST,0,0 +11531,TEST,0,0 +11532,TEST,0,0 +11533,TEST,0,0 +11534,TEST,0,0 +11535,TEST,0,0 +11536,TEST,0,0 +11537,TEST,0,0 +11538,TEST,0,0 +11539,TEST,0,0 +11540,TEST,0,0 +11541,TEST,0,0 +11542,TEST,0,0 +11543,TEST,0,0 +11544,TEST,0,0 +11545,TEST,0,0 +11546,TEST,0,0 +11547,TEST,0,0 +11548,TEST,0,0 +11549,TEST,0,0 +11550,TEST,0,0 +11551,TEST,0,0 +11552,TEST,0,0 +11553,TEST,0,0 +11554,TEST,0,0 +11555,TEST,0,0 +11556,TEST,0,0 +11557,TEST,0,0 +11558,TEST,0,0 +11559,TEST,0,0 +11560,TEST,0,0 +11561,TEST,0,0 +11562,TEST,0,0 +11563,TEST,0,0 +11564,TEST,0,0 +11565,TEST,0,0 +11566,TEST,0,0 +11567,TEST,0,0 +11568,TEST,0,0 +11569,TEST,0,0 +11570,TEST,0,0 +11571,TEST,0,0 +11572,TEST,0,0 +11573,TEST,0,0 +11574,TEST,0,0 +11575,TEST,0,0 +11576,TEST,0,0 +11577,TEST,0,0 +11578,TEST,0,0 +11579,TEST,0,0 +11580,TEST,0,0 +11581,TEST,0,0 +11582,TEST,0,0 +11583,TEST,0,0 +11584,TEST,0,0 +11585,TEST,0,0 +11586,TEST,0,0 +11587,TEST,0,0 +11588,TEST,0,0 +11589,TEST,0,0 +11590,TEST,0,0 +11591,TEST,0,0 +11592,TEST,0,0 +11593,TEST,0,0 +11594,TEST,0,0 +11595,TEST,0,0 +11596,TEST,0,0 +11597,TEST,0,0 +11598,TEST,0,0 +11599,TEST,0,0 +11600,TEST,0,0 +11601,TEST,0,0 +11602,TEST,0,0 +11603,TEST,0,0 +11604,TEST,0,0 +11605,TEST,0,0 +11606,TEST,0,0 +11607,TEST,0,0 +11608,TEST,0,0 +11609,TEST,0,0 +11610,TEST,0,0 +11611,TEST,0,0 +11612,TEST,0,0 +11613,TEST,0,0 +11614,TEST,0,0 +11615,TEST,0,0 +11616,TEST,0,0 +11617,TEST,0,0 +11618,TEST,0,0 +11619,TEST,0,0 +11620,TEST,0,0 +11621,TEST,0,0 +11622,TEST,0,0 +11623,TEST,0,0 +11624,TEST,0,0 +11625,TEST,0,0 +11626,TEST,0,0 +11627,TEST,0,0 +11628,TEST,0,0 +11629,TEST,0,0 +11630,TEST,0,0 +11631,TEST,0,0 +11632,TEST,0,0 +11633,TEST,0,0 +11634,TEST,0,0 +11635,TEST,0,0 +11636,TEST,0,0 +11637,TEST,0,0 +11638,TEST,0,0 +11639,TEST,0,0 +11640,TEST,0,0 +11641,TEST,0,0 +11642,TEST,0,0 +11643,TEST,0,0 +11644,TEST,0,0 +11645,TEST,0,0 +11646,TEST,0,0 
+11647,TEST,0,0 +11648,TEST,0,0 +11649,TEST,0,0 +11650,TEST,0,0 +11651,TEST,0,0 +11652,TEST,0,0 +11653,TEST,0,0 +11654,TEST,0,0 +11655,TEST,0,0 +11656,TEST,0,0 +11657,TEST,0,0 +11658,TEST,0,0 +11659,TEST,0,0 +11660,TEST,0,0 +11661,TEST,0,0 +11662,TEST,0,0 +11663,TEST,0,0 +11664,TEST,0,0 +11665,TEST,0,0 +11666,TEST,0,0 +11667,TEST,0,0 +11668,TEST,0,0 +11669,TEST,0,0 +11670,TEST,0,0 +11671,TEST,0,0 +11672,TEST,0,0 +11673,TEST,0,0 +11674,TEST,0,0 +11675,TEST,0,0 +11676,TEST,0,0 +11677,TEST,0,0 +11678,TEST,0,0 +11679,TEST,0,0 +11680,TEST,0,0 +11681,TEST,0,0 +11682,TEST,0,0 +11683,TEST,0,0 +11684,TEST,0,0 +11685,TEST,0,0 +11686,TEST,0,0 +11687,TEST,0,0 +11688,TEST,0,0 +11689,TEST,0,0 +11690,TEST,0,0 +11691,TEST,0,0 +11692,TEST,0,0 +11693,TEST,0,0 +11694,TEST,0,0 +11695,TEST,0,0 +11696,TEST,0,0 +11697,TEST,0,0 +11698,TEST,0,0 +11699,TEST,0,0 +11700,TEST,0,0 +11701,TEST,0,0 +11702,TEST,0,0 +11703,TEST,0,0 +11704,TEST,0,0 +11705,TEST,0,0 +11706,TEST,0,0 +11707,TEST,0,0 +11708,TEST,0,0 +11709,TEST,0,0 +11710,TEST,0,0 +11711,TEST,0,0 +11712,TEST,0,0 +11713,TEST,0,0 +11714,TEST,0,0 +11715,TEST,0,0 +11716,TEST,0,0 +11717,TEST,0,0 +11718,TEST,0,0 +11719,TEST,0,0 +11720,TEST,0,0 +11721,TEST,0,0 +11722,TEST,0,0 +11723,TEST,0,0 +11724,TEST,0,0 +11725,TEST,0,0 +11726,TEST,0,0 +11727,TEST,0,0 +11728,TEST,0,0 +11729,TEST,0,0 +11730,TEST,0,0 +11731,TEST,0,0 +11732,TEST,0,0 +11733,TEST,0,0 +11734,TEST,0,0 +11735,TEST,0,0 +11736,TEST,0,0 +11737,TEST,0,0 +11738,TEST,0,0 +11739,TEST,0,0 +11740,TEST,0,0 +11741,TEST,0,0 +11742,TEST,0,0 +11743,TEST,0,0 +11744,TEST,0,0 +11745,TEST,0,0 +11746,TEST,0,0 +11747,TEST,0,0 +11748,TEST,0,0 +11749,TEST,0,0 +11750,TEST,0,0 +11751,TEST,0,0 +11752,TEST,0,0 +11753,TEST,0,0 +11754,TEST,0,0 +11755,TEST,0,0 +11756,TEST,0,0 +11757,TEST,0,0 +11758,TEST,0,0 +11759,TEST,0,0 +11760,TEST,0,0 +11761,TEST,0,0 +11762,TEST,0,0 +11763,TEST,0,0 +11764,TEST,0,0 +11765,TEST,0,0 +11766,TEST,0,0 +11767,TEST,0,0 +11768,TEST,0,0 +11769,TEST,0,0 +11770,TEST,0,0 +11771,TEST,0,0 +11772,TEST,0,0 +11773,TEST,0,0 +11774,TEST,0,0 +11775,TEST,0,0 +11776,TEST,0,0 +11777,TEST,0,0 +11778,TEST,0,0 +11779,TEST,0,0 +11780,TEST,0,0 +11781,TEST,0,0 +11782,TEST,0,0 +11783,TEST,0,0 +11784,TEST,0,0 +11785,TEST,0,0 +11786,TEST,0,0 +11787,TEST,0,0 +11788,TEST,0,0 +11789,TEST,0,0 +11790,TEST,0,0 +11791,TEST,0,0 +11792,TEST,0,0 +11793,TEST,0,0 +11794,TEST,0,0 +11795,TEST,0,0 +11796,TEST,0,0 +11797,TEST,0,0 +11798,TEST,0,0 +11799,TEST,0,0 +11800,TEST,0,0 +11801,TEST,0,0 +11802,TEST,0,0 +11803,TEST,0,0 +11804,TEST,0,0 +11805,TEST,0,0 +11806,TEST,0,0 +11807,TEST,0,0 +11808,TEST,0,0 +11809,TEST,0,0 +11810,TEST,0,0 +11811,TEST,0,0 +11812,TEST,0,0 +11813,TEST,0,0 +11814,TEST,0,0 +11815,TEST,0,0 +11816,TEST,0,0 +11817,TEST,0,0 +11818,TEST,0,0 +11819,TEST,0,0 +11820,TEST,0,0 +11821,TEST,0,0 +11822,TEST,0,0 +11823,TEST,0,0 +11824,TEST,0,0 +11825,TEST,0,0 +11826,TEST,0,0 +11827,TEST,0,0 +11828,TEST,0,0 +11829,TEST,0,0 +11830,TEST,0,0 +11831,TEST,0,0 +11832,TEST,0,0 +11833,TEST,0,0 +11834,TEST,0,0 +11835,TEST,0,0 +11836,TEST,0,0 +11837,TEST,0,0 +11838,TEST,0,0 +11839,TEST,0,0 +11840,TEST,0,0 +11841,TEST,0,0 +11842,TEST,0,0 +11843,TEST,0,0 +11844,TEST,0,0 +11845,TEST,0,0 +11846,TEST,0,0 +11847,TEST,0,0 +11848,TEST,0,0 +11849,TEST,0,0 +11850,TEST,0,0 +11851,TEST,0,0 +11852,TEST,0,0 +11853,TEST,0,0 +11854,TEST,0,0 +11855,TEST,0,0 +11856,TEST,0,0 +11857,TEST,0,0 +11858,TEST,0,0 +11859,TEST,0,0 +11860,TEST,0,0 +11861,TEST,0,0 +11862,TEST,0,0 +11863,TEST,0,0 +11864,TEST,0,0 +11865,TEST,0,0 +11866,TEST,0,0 +11867,TEST,0,0 +11868,TEST,0,0 
+11869,TEST,0,0 +11870,TEST,0,0 +11871,TEST,0,0 +11872,TEST,0,0 +11873,TEST,0,0 +11874,TEST,0,0 +11875,TEST,0,0 +11876,TEST,0,0 +11877,TEST,0,0 +11878,TEST,0,0 +11879,TEST,0,0 +11880,TEST,0,0 +11881,TEST,0,0 +11882,TEST,0,0 +11883,TEST,0,0 +11884,TEST,0,0 +11885,TEST,0,0 +11886,TEST,0,0 +11887,TEST,0,0 +11888,TEST,0,0 +11889,TEST,0,0 +11890,TEST,0,0 +11891,TEST,0,0 +11892,TEST,0,0 +11893,TEST,0,0 +11894,TEST,0,0 +11895,TEST,0,0 +11896,TEST,0,0 +11897,TEST,0,0 +11898,TEST,0,0 +11899,TEST,0,0 +11900,TEST,0,0 +11901,TEST,0,0 +11902,TEST,0,0 +11903,TEST,0,0 +11904,TEST,0,0 +11905,TEST,0,0 +11906,TEST,0,0 +11907,TEST,0,0 +11908,TEST,0,0 +11909,TEST,0,0 +11910,TEST,0,0 +11911,TEST,0,0 +11912,TEST,0,0 +11913,TEST,0,0 +11914,TEST,0,0 +11915,TEST,0,0 +11916,TEST,0,0 +11917,TEST,0,0 +11918,TEST,0,0 +11919,TEST,0,0 +11920,TEST,0,0 +11921,TEST,0,0 +11922,TEST,0,0 +11923,TEST,0,0 +11924,TEST,0,0 +11925,TEST,0,0 +11926,TEST,0,0 +11927,TEST,0,0 +11928,TEST,0,0 +11929,TEST,0,0 +11930,TEST,0,0 +11931,TEST,0,0 +11932,TEST,0,0 +11933,TEST,0,0 +11934,TEST,0,0 +11935,TEST,0,0 +11936,TEST,0,0 +11937,TEST,0,0 +11938,TEST,0,0 +11939,TEST,0,0 +11940,TEST,0,0 +11941,TEST,0,0 +11942,TEST,0,0 +11943,TEST,0,0 +11944,TEST,0,0 +11945,TEST,0,0 +11946,TEST,0,0 +11947,TEST,0,0 +11948,TEST,0,0 +11949,TEST,0,0 +11950,TEST,0,0 +11951,TEST,0,0 +11952,TEST,0,0 +11953,TEST,0,0 +11954,TEST,0,0 +11955,TEST,0,0 +11956,TEST,0,0 +11957,TEST,0,0 +11958,TEST,0,0 +11959,TEST,0,0 +11960,TEST,0,0 +11961,TEST,0,0 +11962,TEST,0,0 +11963,TEST,0,0 +11964,TEST,0,0 +11965,TEST,0,0 +11966,TEST,0,0 +11967,TEST,0,0 +11968,TEST,0,0 +11969,TEST,0,0 +11970,TEST,0,0 +11971,TEST,0,0 +11972,TEST,0,0 +11973,TEST,0,0 +11974,TEST,0,0 +11975,TEST,0,0 +11976,TEST,0,0 +11977,TEST,0,0 +11978,TEST,0,0 +11979,TEST,0,0 +11980,TEST,0,0 +11981,TEST,0,0 +11982,TEST,0,0 +11983,TEST,0,0 +11984,TEST,0,0 +11985,TEST,0,0 +11986,TEST,0,0 +11987,TEST,0,0 +11988,TEST,0,0 +11989,TEST,0,0 +11990,TEST,0,0 +11991,TEST,0,0 +11992,TEST,0,0 +11993,TEST,0,0 +11994,TEST,0,0 +11995,TEST,0,0 +11996,TEST,0,0 +11997,TEST,0,0 +11998,TEST,0,0 +11999,TEST,0,0 +12000,TEST,0,0 +12001,TEST,0,0 +12002,TEST,0,0 +12003,TEST,0,0 +12004,TEST,0,0 +12005,TEST,0,0 +12006,TEST,0,0 +12007,TEST,0,0 +12008,TEST,0,0 +12009,TEST,0,0 +12010,TEST,0,0 +12011,TEST,0,0 +12012,TEST,0,0 +12013,TEST,0,0 +12014,TEST,0,0 +12015,TEST,0,0 +12016,TEST,0,0 +12017,TEST,0,0 +12018,TEST,0,0 +12019,TEST,0,0 +12020,TEST,0,0 +12021,TEST,0,0 +12022,TEST,0,0 +12023,TEST,0,0 +12024,TEST,0,0 +12025,TEST,0,0 +12026,TEST,0,0 +12027,TEST,0,0 +12028,TEST,0,0 +12029,TEST,0,0 +12030,TEST,0,0 +12031,TEST,0,0 +12032,TEST,0,0 +12033,TEST,0,0 +12034,TEST,0,0 +12035,TEST,0,0 +12036,TEST,0,0 +12037,TEST,0,0 +12038,TEST,0,0 +12039,TEST,0,0 +12040,TEST,0,0 +12041,TEST,0,0 +12042,TEST,0,0 +12043,TEST,0,0 +12044,TEST,0,0 +12045,TEST,0,0 +12046,TEST,0,0 +12047,TEST,0,0 +12048,TEST,0,0 +12049,TEST,0,0 +12050,TEST,0,0 +12051,TEST,0,0 +12052,TEST,0,0 +12053,TEST,0,0 +12054,TEST,0,0 +12055,TEST,0,0 +12056,TEST,0,0 +12057,TEST,0,0 +12058,TEST,0,0 +12059,TEST,0,0 +12060,TEST,0,0 +12061,TEST,0,0 +12062,TEST,0,0 +12063,TEST,0,0 +12064,TEST,0,0 +12065,TEST,0,0 +12066,TEST,0,0 +12067,TEST,0,0 +12068,TEST,0,0 +12069,TEST,0,0 +12070,TEST,0,0 +12071,TEST,0,0 +12072,TEST,0,0 +12073,TEST,0,0 +12074,TEST,0,0 +12075,TEST,0,0 +12076,TEST,0,0 +12077,TEST,0,0 +12078,TEST,0,0 +12079,TEST,0,0 +12080,TEST,0,0 +12081,TEST,0,0 +12082,TEST,0,0 +12083,TEST,0,0 +12084,TEST,0,0 +12085,TEST,0,0 +12086,TEST,0,0 +12087,TEST,0,0 +12088,TEST,0,0 +12089,TEST,0,0 +12090,TEST,0,0 
+12091,TEST,0,0 +12092,TEST,0,0 +12093,TEST,0,0 +12094,TEST,0,0 +12095,TEST,0,0 +12096,TEST,0,0 +12097,TEST,0,0 +12098,TEST,0,0 +12099,TEST,0,0 +12100,TEST,0,0 +12101,TEST,0,0 +12102,TEST,0,0 +12103,TEST,0,0 +12104,TEST,0,0 +12105,TEST,0,0 +12106,TEST,0,0 +12107,TEST,0,0 +12108,TEST,0,0 +12109,TEST,0,0 +12110,TEST,0,0 +12111,TEST,0,0 +12112,TEST,0,0 +12113,TEST,0,0 +12114,TEST,0,0 +12115,TEST,0,0 +12116,TEST,0,0 +12117,TEST,0,0 +12118,TEST,0,0 +12119,TEST,0,0 +12120,TEST,0,0 +12121,TEST,0,0 +12122,TEST,0,0 +12123,TEST,0,0 +12124,TEST,0,0 +12125,TEST,0,0 +12126,TEST,0,0 +12127,TEST,0,0 +12128,TEST,0,0 +12129,TEST,0,0 +12130,TEST,0,0 +12131,TEST,0,0 +12132,TEST,0,0 +12133,TEST,0,0 +12134,TEST,0,0 +12135,TEST,0,0 +12136,TEST,0,0 +12137,TEST,0,0 +12138,TEST,0,0 +12139,TEST,0,0 +12140,TEST,0,0 +12141,TEST,0,0 +12142,TEST,0,0 +12143,TEST,0,0 +12144,TEST,0,0 +12145,TEST,0,0 +12146,TEST,0,0 +12147,TEST,0,0 +12148,TEST,0,0 +12149,TEST,0,0 +12150,TEST,0,0 +12151,TEST,0,0 +12152,TEST,0,0 +12153,TEST,0,0 +12154,TEST,0,0 +12155,TEST,0,0 +12156,TEST,0,0 +12157,TEST,0,0 +12158,TEST,0,0 +12159,TEST,0,0 +12160,TEST,0,0 +12161,TEST,0,0 +12162,TEST,0,0 +12163,TEST,0,0 +12164,TEST,0,0 +12165,TEST,0,0 +12166,TEST,0,0 +12167,TEST,0,0 +12168,TEST,0,0 +12169,TEST,0,0 +12170,TEST,0,0 +12171,TEST,0,0 +12172,TEST,0,0 +12173,TEST,0,0 +12174,TEST,0,0 +12175,TEST,0,0 +12176,TEST,0,0 +12177,TEST,0,0 +12178,TEST,0,0 +12179,TEST,0,0 +12180,TEST,0,0 +12181,TEST,0,0 +12182,TEST,0,0 +12183,TEST,0,0 +12184,TEST,0,0 +12185,TEST,0,0 +12186,TEST,0,0 +12187,TEST,0,0 +12188,TEST,0,0 +12189,TEST,0,0 +12190,TEST,0,0 +12191,TEST,0,0 +12192,TEST,0,0 +12193,TEST,0,0 +12194,TEST,0,0 +12195,TEST,0,0 +12196,TEST,0,0 +12197,TEST,0,0 +12198,TEST,0,0 +12199,TEST,0,0 +12200,TEST,0,0 +12201,TEST,0,0 +12202,TEST,0,0 +12203,TEST,0,0 +12204,TEST,0,0 +12205,TEST,0,0 +12206,TEST,0,0 +12207,TEST,0,0 +12208,TEST,0,0 +12209,TEST,0,0 +12210,TEST,0,0 +12211,TEST,0,0 +12212,TEST,0,0 +12213,TEST,0,0 +12214,TEST,0,0 +12215,TEST,0,0 +12216,TEST,0,0 +12217,TEST,0,0 +12218,TEST,0,0 +12219,TEST,0,0 +12220,TEST,0,0 +12221,TEST,0,0 +12222,TEST,0,0 +12223,TEST,0,0 +12224,TEST,0,0 +12225,TEST,0,0 +12226,TEST,0,0 +12227,TEST,0,0 +12228,TEST,0,0 +12229,TEST,0,0 +12230,TEST,0,0 +12231,TEST,0,0 +12232,TEST,0,0 +12233,TEST,0,0 +12234,TEST,0,0 +12235,TEST,0,0 +12236,TEST,0,0 +12237,TEST,0,0 +12238,TEST,0,0 +12239,TEST,0,0 +12240,TEST,0,0 +12241,TEST,0,0 +12242,TEST,0,0 +12243,TEST,0,0 +12244,TEST,0,0 +12245,TEST,0,0 +12246,TEST,0,0 +12247,TEST,0,0 +12248,TEST,0,0 +12249,TEST,0,0 +12250,TEST,0,0 +12251,TEST,0,0 +12252,TEST,0,0 +12253,TEST,0,0 +12254,TEST,0,0 +12255,TEST,0,0 +12256,TEST,0,0 +12257,TEST,0,0 +12258,TEST,0,0 +12259,TEST,0,0 +12260,TEST,0,0 +12261,TEST,0,0 +12262,TEST,0,0 +12263,TEST,0,0 +12264,TEST,0,0 +12265,TEST,0,0 +12266,TEST,0,0 +12267,TEST,0,0 +12268,TEST,0,0 +12269,TEST,0,0 +12270,TEST,0,0 +12271,TEST,0,0 +12272,TEST,0,0 +12273,TEST,0,0 +12274,TEST,0,0 +12275,TEST,0,0 +12276,TEST,0,0 +12277,TEST,0,0 +12278,TEST,0,0 +12279,TEST,0,0 +12280,TEST,0,0 +12281,TEST,0,0 +12282,TEST,0,0 +12283,TEST,0,0 +12284,TEST,0,0 +12285,TEST,0,0 +12286,TEST,0,0 +12287,TEST,0,0 +12288,TEST,0,0 +12289,TEST,0,0 +12290,TEST,0,0 +12291,TEST,0,0 +12292,TEST,0,0 +12293,TEST,0,0 +12294,TEST,0,0 +12295,TEST,0,0 +12296,TEST,0,0 +12297,TEST,0,0 +12298,TEST,0,0 +12299,TEST,0,0 +12300,TEST,0,0 +12301,TEST,0,0 +12302,TEST,0,0 +12303,TEST,0,0 +12304,TEST,0,0 +12305,TEST,0,0 +12306,TEST,0,0 +12307,TEST,0,0 +12308,TEST,0,0 +12309,TEST,0,0 +12310,TEST,0,0 +12311,TEST,0,0 +12312,TEST,0,0 
+12313,TEST,0,0 +12314,TEST,0,0 +12315,TEST,0,0 +12316,TEST,0,0 +12317,TEST,0,0 +12318,TEST,0,0 +12319,TEST,0,0 +12320,TEST,0,0 +12321,TEST,0,0 +12322,TEST,0,0 +12323,TEST,0,0 +12324,TEST,0,0 +12325,TEST,0,0 +12326,TEST,0,0 +12327,TEST,0,0 +12328,TEST,0,0 +12329,TEST,0,0 +12330,TEST,0,0 +12331,TEST,0,0 +12332,TEST,0,0 +12333,TEST,0,0 +12334,TEST,0,0 +12335,TEST,0,0 +12336,TEST,0,0 +12337,TEST,0,0 +12338,TEST,0,0 +12339,TEST,0,0 +12340,TEST,0,0 +12341,TEST,0,0 +12342,TEST,0,0 +12343,TEST,0,0 +12344,TEST,0,0 +12345,TEST,0,0 +12346,TEST,0,0 +12347,TEST,0,0 +12348,TEST,0,0 +12349,TEST,0,0 +12350,TEST,0,0 +12351,TEST,0,0 +12352,TEST,0,0 +12353,TEST,0,0 +12354,TEST,0,0 +12355,TEST,0,0 +12356,TEST,0,0 +12357,TEST,0,0 +12358,TEST,0,0 +12359,TEST,0,0 +12360,TEST,0,0 +12361,TEST,0,0 +12362,TEST,0,0 +12363,TEST,0,0 +12364,TEST,0,0 +12365,TEST,0,0 +12366,TEST,0,0 +12367,TEST,0,0 +12368,TEST,0,0 +12369,TEST,0,0 +12370,TEST,0,0 +12371,TEST,0,0 +12372,TEST,0,0 +12373,TEST,0,0 +12374,TEST,0,0 +12375,TEST,0,0 +12376,TEST,0,0 +12377,TEST,0,0 +12378,TEST,0,0 +12379,TEST,0,0 +12380,TEST,0,0 +12381,TEST,0,0 +12382,TEST,0,0 +12383,TEST,0,0 +12384,TEST,0,0 +12385,TEST,0,0 +12386,TEST,0,0 +12387,TEST,0,0 +12388,TEST,0,0 +12389,TEST,0,0 +12390,TEST,0,0 +12391,TEST,0,0 +12392,TEST,0,0 +12393,TEST,0,0 +12394,TEST,0,0 +12395,TEST,0,0 +12396,TEST,0,0 +12397,TEST,0,0 +12398,TEST,0,0 +12399,TEST,0,0 +12400,TEST,0,0 +12401,TEST,0,0 +12402,TEST,0,0 +12403,TEST,0,0 +12404,TEST,0,0 +12405,TEST,0,0 +12406,TEST,0,0 +12407,TEST,0,0 +12408,TEST,0,0 +12409,TEST,0,0 +12410,TEST,0,0 +12411,TEST,0,0 +12412,TEST,0,0 +12413,TEST,0,0 +12414,TEST,0,0 +12415,TEST,0,0 +12416,TEST,0,0 +12417,TEST,0,0 +12418,TEST,0,0 +12419,TEST,0,0 +12420,TEST,0,0 +12421,TEST,0,0 +12422,TEST,0,0 +12423,TEST,0,0 +12424,TEST,0,0 +12425,TEST,0,0 +12426,TEST,0,0 +12427,TEST,0,0 +12428,TEST,0,0 +12429,TEST,0,0 +12430,TEST,0,0 +12431,TEST,0,0 +12432,TEST,0,0 +12433,TEST,0,0 +12434,TEST,0,0 +12435,TEST,0,0 +12436,TEST,0,0 +12437,TEST,0,0 +12438,TEST,0,0 +12439,TEST,0,0 +12440,TEST,0,0 +12441,TEST,0,0 +12442,TEST,0,0 +12443,TEST,0,0 +12444,TEST,0,0 +12445,TEST,0,0 +12446,TEST,0,0 +12447,TEST,0,0 +12448,TEST,0,0 +12449,TEST,0,0 +12450,TEST,0,0 +12451,TEST,0,0 +12452,TEST,0,0 +12453,TEST,0,0 +12454,TEST,0,0 +12455,TEST,0,0 +12456,TEST,0,0 +12457,TEST,0,0 +12458,TEST,0,0 +12459,TEST,0,0 +12460,TEST,0,0 +12461,TEST,0,0 +12462,TEST,0,0 +12463,TEST,0,0 +12464,TEST,0,0 +12465,TEST,0,0 +12466,TEST,0,0 +12467,TEST,0,0 +12468,TEST,0,0 +12469,TEST,0,0 +12470,TEST,0,0 +12471,TEST,0,0 +12472,TEST,0,0 +12473,TEST,0,0 +12474,TEST,0,0 +12475,TEST,0,0 +12476,TEST,0,0 +12477,TEST,0,0 +12478,TEST,0,0 +12479,TEST,0,0 +12480,TEST,0,0 +12481,TEST,0,0 +12482,TEST,0,0 +12483,TEST,0,0 +12484,TEST,0,0 +12485,TEST,0,0 +12486,TEST,0,0 +12487,TEST,0,0 +12488,TEST,0,0 +12489,TEST,0,0 +12490,TEST,0,0 +12491,TEST,0,0 +12492,TEST,0,0 +12493,TEST,0,0 +12494,TEST,0,0 +12495,TEST,0,0 +12496,TEST,0,0 +12497,TEST,0,0 +12498,TEST,0,0 +12499,TEST,0,0 +12500,TEST,0,0 +12501,TEST,0,0 +12502,TEST,0,0 +12503,TEST,0,0 +12504,TEST,0,0 +12505,TEST,0,0 +12506,TEST,0,0 +12507,TEST,0,0 +12508,TEST,0,0 +12509,TEST,0,0 +12510,TEST,0,0 +12511,TEST,0,0 +12512,TEST,0,0 +12513,TEST,0,0 +12514,TEST,0,0 +12515,TEST,0,0 +12516,TEST,0,0 +12517,TEST,0,0 +12518,TEST,0,0 +12519,TEST,0,0 +12520,TEST,0,0 +12521,TEST,0,0 +12522,TEST,0,0 +12523,TEST,0,0 +12524,TEST,0,0 +12525,TEST,0,0 +12526,TEST,0,0 +12527,TEST,0,0 +12528,TEST,0,0 +12529,TEST,0,0 +12530,TEST,0,0 +12531,TEST,0,0 +12532,TEST,0,0 +12533,TEST,0,0 +12534,TEST,0,0 
+12535,TEST,0,0 +12536,TEST,0,0 +12537,TEST,0,0 +12538,TEST,0,0 +12539,TEST,0,0 +12540,TEST,0,0 +12541,TEST,0,0 +12542,TEST,0,0 +12543,TEST,0,0 +12544,TEST,0,0 +12545,TEST,0,0 +12546,TEST,0,0 +12547,TEST,0,0 +12548,TEST,0,0 +12549,TEST,0,0 +12550,TEST,0,0 +12551,TEST,0,0 +12552,TEST,0,0 +12553,TEST,0,0 +12554,TEST,0,0 +12555,TEST,0,0 +12556,TEST,0,0 +12557,TEST,0,0 +12558,TEST,0,0 +12559,TEST,0,0 +12560,TEST,0,0 +12561,TEST,0,0 +12562,TEST,0,0 +12563,TEST,0,0 +12564,TEST,0,0 +12565,TEST,0,0 +12566,TEST,0,0 +12567,TEST,0,0 +12568,TEST,0,0 +12569,TEST,0,0 +12570,TEST,0,0 +12571,TEST,0,0 +12572,TEST,0,0 +12573,TEST,0,0 +12574,TEST,0,0 +12575,TEST,0,0 +12576,TEST,0,0 +12577,TEST,0,0 +12578,TEST,0,0 +12579,TEST,0,0 +12580,TEST,0,0 +12581,TEST,0,0 +12582,TEST,0,0 +12583,TEST,0,0 +12584,TEST,0,0 +12585,TEST,0,0 +12586,TEST,0,0 +12587,TEST,0,0 +12588,TEST,0,0 +12589,TEST,0,0 +12590,TEST,0,0 +12591,TEST,0,0 +12592,TEST,0,0 +12593,TEST,0,0 +12594,TEST,0,0 +12595,TEST,0,0 +12596,TEST,0,0 +12597,TEST,0,0 +12598,TEST,0,0 +12599,TEST,0,0 +12600,TEST,0,0 +12601,TEST,0,0 +12602,TEST,0,0 +12603,TEST,0,0 +12604,TEST,0,0 +12605,TEST,0,0 +12606,TEST,0,0 +12607,TEST,0,0 +12608,TEST,0,0 +12609,TEST,0,0 +12610,TEST,0,0 +12611,TEST,0,0 +12612,TEST,0,0 +12613,TEST,0,0 +12614,TEST,0,0 +12615,TEST,0,0 +12616,TEST,0,0 +12617,TEST,0,0 +12618,TEST,0,0 +12619,TEST,0,0 +12620,TEST,0,0 +12621,TEST,0,0 +12622,TEST,0,0 +12623,TEST,0,0 +12624,TEST,0,0 +12625,TEST,0,0 +12626,TEST,0,0 +12627,TEST,0,0 +12628,TEST,0,0 +12629,TEST,0,0 +12630,TEST,0,0 +12631,TEST,0,0 +12632,TEST,0,0 +12633,TEST,0,0 +12634,TEST,0,0 +12635,TEST,0,0 +12636,TEST,0,0 +12637,TEST,0,0 +12638,TEST,0,0 +12639,TEST,0,0 +12640,TEST,0,0 +12641,TEST,0,0 +12642,TEST,0,0 +12643,TEST,0,0 +12644,TEST,0,0 +12645,TEST,0,0 +12646,TEST,0,0 +12647,TEST,0,0 +12648,TEST,0,0 +12649,TEST,0,0 +12650,TEST,0,0 +12651,TEST,0,0 +12652,TEST,0,0 +12653,TEST,0,0 +12654,TEST,0,0 +12655,TEST,0,0 +12656,TEST,0,0 +12657,TEST,0,0 +12658,TEST,0,0 +12659,TEST,0,0 +12660,TEST,0,0 +12661,TEST,0,0 +12662,TEST,0,0 +12663,TEST,0,0 +12664,TEST,0,0 +12665,TEST,0,0 +12666,TEST,0,0 +12667,TEST,0,0 +12668,TEST,0,0 +12669,TEST,0,0 +12670,TEST,0,0 +12671,TEST,0,0 +12672,TEST,0,0 +12673,TEST,0,0 +12674,TEST,0,0 +12675,TEST,0,0 +12676,TEST,0,0 +12677,TEST,0,0 +12678,TEST,0,0 +12679,TEST,0,0 +12680,TEST,0,0 +12681,TEST,0,0 +12682,TEST,0,0 +12683,TEST,0,0 +12684,TEST,0,0 +12685,TEST,0,0 +12686,TEST,0,0 +12687,TEST,0,0 +12688,TEST,0,0 +12689,TEST,0,0 +12690,TEST,0,0 +12691,TEST,0,0 +12692,TEST,0,0 +12693,TEST,0,0 +12694,TEST,0,0 +12695,TEST,0,0 +12696,TEST,0,0 +12697,TEST,0,0 +12698,TEST,0,0 +12699,TEST,0,0 +12700,TEST,0,0 +12701,TEST,0,0 +12702,TEST,0,0 +12703,TEST,0,0 +12704,TEST,0,0 +12705,TEST,0,0 +12706,TEST,0,0 +12707,TEST,0,0 +12708,TEST,0,0 +12709,TEST,0,0 +12710,TEST,0,0 +12711,TEST,0,0 +12712,TEST,0,0 +12713,TEST,0,0 +12714,TEST,0,0 +12715,TEST,0,0 +12716,TEST,0,0 +12717,TEST,0,0 +12718,TEST,0,0 +12719,TEST,0,0 +12720,TEST,0,0 +12721,TEST,0,0 +12722,TEST,0,0 +12723,TEST,0,0 +12724,TEST,0,0 +12725,TEST,0,0 +12726,TEST,0,0 +12727,TEST,0,0 +12728,TEST,0,0 +12729,TEST,0,0 +12730,TEST,0,0 +12731,TEST,0,0 +12732,TEST,0,0 +12733,TEST,0,0 +12734,TEST,0,0 +12735,TEST,0,0 +12736,TEST,0,0 +12737,TEST,0,0 +12738,TEST,0,0 +12739,TEST,0,0 +12740,TEST,0,0 +12741,TEST,0,0 +12742,TEST,0,0 +12743,TEST,0,0 +12744,TEST,0,0 +12745,TEST,0,0 +12746,TEST,0,0 +12747,TEST,0,0 +12748,TEST,0,0 +12749,TEST,0,0 +12750,TEST,0,0 +12751,TEST,0,0 +12752,TEST,0,0 +12753,TEST,0,0 +12754,TEST,0,0 +12755,TEST,0,0 +12756,TEST,0,0 
+12757,TEST,0,0 +12758,TEST,0,0 +12759,TEST,0,0 +12760,TEST,0,0 +12761,TEST,0,0 +12762,TEST,0,0 +12763,TEST,0,0 +12764,TEST,0,0 +12765,TEST,0,0 +12766,TEST,0,0 +12767,TEST,0,0 +12768,TEST,0,0 +12769,TEST,0,0 +12770,TEST,0,0 +12771,TEST,0,0 +12772,TEST,0,0 +12773,TEST,0,0 +12774,TEST,0,0 +12775,TEST,0,0 +12776,TEST,0,0 +12777,TEST,0,0 +12778,TEST,0,0 +12779,TEST,0,0 +12780,TEST,0,0 +12781,TEST,0,0 +12782,TEST,0,0 +12783,TEST,0,0 +12784,TEST,0,0 +12785,TEST,0,0 +12786,TEST,0,0 +12787,TEST,0,0 +12788,TEST,0,0 +12789,TEST,0,0 +12790,TEST,0,0 +12791,TEST,0,0 +12792,TEST,0,0 +12793,TEST,0,0 +12794,TEST,0,0 +12795,TEST,0,0 +12796,TEST,0,0 +12797,TEST,0,0 +12798,TEST,0,0 +12799,TEST,0,0 +12800,TEST,0,0 +12801,TEST,0,0 +12802,TEST,0,0 +12803,TEST,0,0 +12804,TEST,0,0 +12805,TEST,0,0 +12806,TEST,0,0 +12807,TEST,0,0 +12808,TEST,0,0 +12809,TEST,0,0 +12810,TEST,0,0 +12811,TEST,0,0 +12812,TEST,0,0 +12813,TEST,0,0 +12814,TEST,0,0 +12815,TEST,0,0 +12816,TEST,0,0 +12817,TEST,0,0 +12818,TEST,0,0 +12819,TEST,0,0 +12820,TEST,0,0 +12821,TEST,0,0 +12822,TEST,0,0 +12823,TEST,0,0 +12824,TEST,0,0 +12825,TEST,0,0 +12826,TEST,0,0 +12827,TEST,0,0 +12828,TEST,0,0 +12829,TEST,0,0 +12830,TEST,0,0 +12831,TEST,0,0 +12832,TEST,0,0 +12833,TEST,0,0 +12834,TEST,0,0 +12835,TEST,0,0 +12836,TEST,0,0 +12837,TEST,0,0 +12838,TEST,0,0 +12839,TEST,0,0 +12840,TEST,0,0 +12841,TEST,0,0 +12842,TEST,0,0 +12843,TEST,0,0 +12844,TEST,0,0 +12845,TEST,0,0 +12846,TEST,0,0 +12847,TEST,0,0 +12848,TEST,0,0 +12849,TEST,0,0 +12850,TEST,0,0 +12851,TEST,0,0 +12852,TEST,0,0 +12853,TEST,0,0 +12854,TEST,0,0 +12855,TEST,0,0 +12856,TEST,0,0 +12857,TEST,0,0 +12858,TEST,0,0 +12859,TEST,0,0 +12860,TEST,0,0 +12861,TEST,0,0 +12862,TEST,0,0 +12863,TEST,0,0 +12864,TEST,0,0 +12865,TEST,0,0 +12866,TEST,0,0 +12867,TEST,0,0 +12868,TEST,0,0 +12869,TEST,0,0 +12870,TEST,0,0 +12871,TEST,0,0 +12872,TEST,0,0 +12873,TEST,0,0 +12874,TEST,0,0 +12875,TEST,0,0 +12876,TEST,0,0 +12877,TEST,0,0 +12878,TEST,0,0 +12879,TEST,0,0 +12880,TEST,0,0 +12881,TEST,0,0 +12882,TEST,0,0 +12883,TEST,0,0 +12884,TEST,0,0 +12885,TEST,0,0 +12886,TEST,0,0 +12887,TEST,0,0 +12888,TEST,0,0 +12889,TEST,0,0 +12890,TEST,0,0 +12891,TEST,0,0 +12892,TEST,0,0 +12893,TEST,0,0 +12894,TEST,0,0 +12895,TEST,0,0 +12896,TEST,0,0 +12897,TEST,0,0 +12898,TEST,0,0 +12899,TEST,0,0 +12900,TEST,0,0 +12901,TEST,0,0 +12902,TEST,0,0 +12903,TEST,0,0 +12904,TEST,0,0 +12905,TEST,0,0 +12906,TEST,0,0 +12907,TEST,0,0 +12908,TEST,0,0 +12909,TEST,0,0 +12910,TEST,0,0 +12911,TEST,0,0 +12912,TEST,0,0 +12913,TEST,0,0 +12914,TEST,0,0 +12915,TEST,0,0 +12916,TEST,0,0 +12917,TEST,0,0 +12918,TEST,0,0 +12919,TEST,0,0 +12920,TEST,0,0 +12921,TEST,0,0 +12922,TEST,0,0 +12923,TEST,0,0 +12924,TEST,0,0 +12925,TEST,0,0 +12926,TEST,0,0 +12927,TEST,0,0 +12928,TEST,0,0 +12929,TEST,0,0 +12930,TEST,0,0 +12931,TEST,0,0 +12932,TEST,0,0 +12933,TEST,0,0 +12934,TEST,0,0 +12935,TEST,0,0 +12936,TEST,0,0 +12937,TEST,0,0 +12938,TEST,0,0 +12939,TEST,0,0 +12940,TEST,0,0 +12941,TEST,0,0 +12942,TEST,0,0 +12943,TEST,0,0 +12944,TEST,0,0 +12945,TEST,0,0 +12946,TEST,0,0 +12947,TEST,0,0 +12948,TEST,0,0 +12949,TEST,0,0 +12950,TEST,0,0 +12951,TEST,0,0 +12952,TEST,0,0 +12953,TEST,0,0 +12954,TEST,0,0 +12955,TEST,0,0 +12956,TEST,0,0 +12957,TEST,0,0 +12958,TEST,0,0 +12959,TEST,0,0 +12960,TEST,0,0 +12961,TEST,0,0 +12962,TEST,0,0 +12963,TEST,0,0 +12964,TEST,0,0 +12965,TEST,0,0 +12966,TEST,0,0 +12967,TEST,0,0 +12968,TEST,0,0 +12969,TEST,0,0 +12970,TEST,0,0 +12971,TEST,0,0 +12972,TEST,0,0 +12973,TEST,0,0 +12974,TEST,0,0 +12975,TEST,0,0 +12976,TEST,0,0 +12977,TEST,0,0 +12978,TEST,0,0 
+12979,TEST,0,0 +12980,TEST,0,0 +12981,TEST,0,0 +12982,TEST,0,0 +12983,TEST,0,0 +12984,TEST,0,0 +12985,TEST,0,0 +12986,TEST,0,0 +12987,TEST,0,0 +12988,TEST,0,0 +12989,TEST,0,0 +12990,TEST,0,0 +12991,TEST,0,0 +12992,TEST,0,0 +12993,TEST,0,0 +12994,TEST,0,0 +12995,TEST,0,0 +12996,TEST,0,0 +12997,TEST,0,0 +12998,TEST,0,0 +12999,TEST,0,0 +13000,TEST,0,0 +13001,TEST,0,0 +13002,TEST,0,0 +13003,TEST,0,0 +13004,TEST,0,0 +13005,TEST,0,0 +13006,TEST,0,0 +13007,TEST,0,0 +13008,TEST,0,0 +13009,TEST,0,0 +13010,TEST,0,0 +13011,TEST,0,0 +13012,TEST,0,0 +13013,TEST,0,0 +13014,TEST,0,0 +13015,TEST,0,0 +13016,TEST,0,0 +13017,TEST,0,0 +13018,TEST,0,0 +13019,TEST,0,0 +13020,TEST,0,0 +13021,TEST,0,0 +13022,TEST,0,0 +13023,TEST,0,0 +13024,TEST,0,0 +13025,TEST,0,0 +13026,TEST,0,0 +13027,TEST,0,0 +13028,TEST,0,0 +13029,TEST,0,0 +13030,TEST,0,0 +13031,TEST,0,0 +13032,TEST,0,0 +13033,TEST,0,0 +13034,TEST,0,0 +13035,TEST,0,0 +13036,TEST,0,0 +13037,TEST,0,0 +13038,TEST,0,0 +13039,TEST,0,0 +13040,TEST,0,0 +13041,TEST,0,0 +13042,TEST,0,0 +13043,TEST,0,0 +13044,TEST,0,0 +13045,TEST,0,0 +13046,TEST,0,0 +13047,TEST,0,0 +13048,TEST,0,0 +13049,TEST,0,0 +13050,TEST,0,0 +13051,TEST,0,0 +13052,TEST,0,0 +13053,TEST,0,0 +13054,TEST,0,0 +13055,TEST,0,0 +13056,TEST,0,0 +13057,TEST,0,0 +13058,TEST,0,0 +13059,TEST,0,0 +13060,TEST,0,0 +13061,TEST,0,0 +13062,TEST,0,0 +13063,TEST,0,0 +13064,TEST,0,0 +13065,TEST,0,0 +13066,TEST,0,0 +13067,TEST,0,0 +13068,TEST,0,0 +13069,TEST,0,0 +13070,TEST,0,0 +13071,TEST,0,0 +13072,TEST,0,0 +13073,TEST,0,0 +13074,TEST,0,0 +13075,TEST,0,0 +13076,TEST,0,0 +13077,TEST,0,0 +13078,TEST,0,0 +13079,TEST,0,0 +13080,TEST,0,0 +13081,TEST,0,0 +13082,TEST,0,0 +13083,TEST,0,0 +13084,TEST,0,0 +13085,TEST,0,0 +13086,TEST,0,0 +13087,TEST,0,0 +13088,TEST,0,0 +13089,TEST,0,0 +13090,TEST,0,0 +13091,TEST,0,0 +13092,TEST,0,0 +13093,TEST,0,0 +13094,TEST,0,0 +13095,TEST,0,0 +13096,TEST,0,0 +13097,TEST,0,0 +13098,TEST,0,0 +13099,TEST,0,0 +13100,TEST,0,0 +13101,TEST,0,0 +13102,TEST,0,0 +13103,TEST,0,0 +13104,TEST,0,0 +13105,TEST,0,0 +13106,TEST,0,0 +13107,TEST,0,0 +13108,TEST,0,0 +13109,TEST,0,0 +13110,TEST,0,0 +13111,TEST,0,0 +13112,TEST,0,0 +13113,TEST,0,0 +13114,TEST,0,0 +13115,TEST,0,0 +13116,TEST,0,0 +13117,TEST,0,0 +13118,TEST,0,0 +13119,TEST,0,0 +13120,TEST,0,0 +13121,TEST,0,0 +13122,TEST,0,0 +13123,TEST,0,0 +13124,TEST,0,0 +13125,TEST,0,0 +13126,TEST,0,0 +13127,TEST,0,0 +13128,TEST,0,0 +13129,TEST,0,0 +13130,TEST,0,0 +13131,TEST,0,0 +13132,TEST,0,0 +13133,TEST,0,0 +13134,TEST,0,0 +13135,TEST,0,0 +13136,TEST,0,0 +13137,TEST,0,0 +13138,TEST,0,0 +13139,TEST,0,0 +13140,TEST,0,0 +13141,TEST,0,0 +13142,TEST,0,0 +13143,TEST,0,0 +13144,TEST,0,0 +13145,TEST,0,0 +13146,TEST,0,0 +13147,TEST,0,0 +13148,TEST,0,0 +13149,TEST,0,0 +13150,TEST,0,0 +13151,TEST,0,0 +13152,TEST,0,0 +13153,TEST,0,0 +13154,TEST,0,0 +13155,TEST,0,0 +13156,TEST,0,0 +13157,TEST,0,0 +13158,TEST,0,0 +13159,TEST,0,0 +13160,TEST,0,0 +13161,TEST,0,0 +13162,TEST,0,0 +13163,TEST,0,0 +13164,TEST,0,0 +13165,TEST,0,0 +13166,TEST,0,0 +13167,TEST,0,0 +13168,TEST,0,0 +13169,TEST,0,0 +13170,TEST,0,0 +13171,TEST,0,0 +13172,TEST,0,0 +13173,TEST,0,0 +13174,TEST,0,0 +13175,TEST,0,0 +13176,TEST,0,0 +13177,TEST,0,0 +13178,TEST,0,0 +13179,TEST,0,0 +13180,TEST,0,0 +13181,TEST,0,0 +13182,TEST,0,0 +13183,TEST,0,0 +13184,TEST,0,0 +13185,TEST,0,0 +13186,TEST,0,0 +13187,TEST,0,0 +13188,TEST,0,0 +13189,TEST,0,0 +13190,TEST,0,0 +13191,TEST,0,0 +13192,TEST,0,0 +13193,TEST,0,0 +13194,TEST,0,0 +13195,TEST,0,0 +13196,TEST,0,0 +13197,TEST,0,0 +13198,TEST,0,0 +13199,TEST,0,0 +13200,TEST,0,0 
+13201,TEST,0,0 +13202,TEST,0,0 +13203,TEST,0,0 +13204,TEST,0,0 +13205,TEST,0,0 +13206,TEST,0,0 +13207,TEST,0,0 +13208,TEST,0,0 +13209,TEST,0,0 +13210,TEST,0,0 +13211,TEST,0,0 +13212,TEST,0,0 +13213,TEST,0,0 +13214,TEST,0,0 +13215,TEST,0,0 +13216,TEST,0,0 +13217,TEST,0,0 +13218,TEST,0,0 +13219,TEST,0,0 +13220,TEST,0,0 +13221,TEST,0,0 +13222,TEST,0,0 +13223,TEST,0,0 +13224,TEST,0,0 +13225,TEST,0,0 +13226,TEST,0,0 +13227,TEST,0,0 +13228,TEST,0,0 +13229,TEST,0,0 +13230,TEST,0,0 +13231,TEST,0,0 +13232,TEST,0,0 +13233,TEST,0,0 +13234,TEST,0,0 +13235,TEST,0,0 +13236,TEST,0,0 +13237,TEST,0,0 +13238,TEST,0,0 +13239,TEST,0,0 +13240,TEST,0,0 +13241,TEST,0,0 +13242,TEST,0,0 +13243,TEST,0,0 +13244,TEST,0,0 +13245,TEST,0,0 +13246,TEST,0,0 +13247,TEST,0,0 +13248,TEST,0,0 +13249,TEST,0,0 +13250,TEST,0,0 +13251,TEST,0,0 +13252,TEST,0,0 +13253,TEST,0,0 +13254,TEST,0,0 +13255,TEST,0,0 +13256,TEST,0,0 +13257,TEST,0,0 +13258,TEST,0,0 +13259,TEST,0,0 +13260,TEST,0,0 +13261,TEST,0,0 +13262,TEST,0,0 +13263,TEST,0,0 +13264,TEST,0,0 +13265,TEST,0,0 +13266,TEST,0,0 +13267,TEST,0,0 +13268,TEST,0,0 +13269,TEST,0,0 +13270,TEST,0,0 +13271,TEST,0,0 +13272,TEST,0,0 +13273,TEST,0,0 +13274,TEST,0,0 +13275,TEST,0,0 +13276,TEST,0,0 +13277,TEST,0,0 +13278,TEST,0,0 +13279,TEST,0,0 +13280,TEST,0,0 +13281,TEST,0,0 +13282,TEST,0,0 +13283,TEST,0,0 +13284,TEST,0,0 +13285,TEST,0,0 +13286,TEST,0,0 +13287,TEST,0,0 +13288,TEST,0,0 +13289,TEST,0,0 +13290,TEST,0,0 +13291,TEST,0,0 +13292,TEST,0,0 +13293,TEST,0,0 +13294,TEST,0,0 +13295,TEST,0,0 +13296,TEST,0,0 +13297,TEST,0,0 +13298,TEST,0,0 +13299,TEST,0,0 +13300,TEST,0,0 +13301,TEST,0,0 +13302,TEST,0,0 +13303,TEST,0,0 +13304,TEST,0,0 +13305,TEST,0,0 +13306,TEST,0,0 +13307,TEST,0,0 +13308,TEST,0,0 +13309,TEST,0,0 +13310,TEST,0,0 +13311,TEST,0,0 +13312,TEST,0,0 +13313,TEST,0,0 +13314,TEST,0,0 +13315,TEST,0,0 +13316,TEST,0,0 +13317,TEST,0,0 +13318,TEST,0,0 +13319,TEST,0,0 +13320,TEST,0,0 +13321,TEST,0,0 +13322,TEST,0,0 +13323,TEST,0,0 +13324,TEST,0,0 +13325,TEST,0,0 +13326,TEST,0,0 +13327,TEST,0,0 +13328,TEST,0,0 +13329,TEST,0,0 +13330,TEST,0,0 +13331,TEST,0,0 +13332,TEST,0,0 +13333,TEST,0,0 +13334,TEST,0,0 +13335,TEST,0,0 +13336,TEST,0,0 +13337,TEST,0,0 +13338,TEST,0,0 +13339,TEST,0,0 +13340,TEST,0,0 +13341,TEST,0,0 +13342,TEST,0,0 +13343,TEST,0,0 +13344,TEST,0,0 +13345,TEST,0,0 +13346,TEST,0,0 +13347,TEST,0,0 +13348,TEST,0,0 +13349,TEST,0,0 +13350,TEST,0,0 +13351,TEST,0,0 +13352,TEST,0,0 +13353,TEST,0,0 +13354,TEST,0,0 +13355,TEST,0,0 +13356,TEST,0,0 +13357,TEST,0,0 +13358,TEST,0,0 +13359,TEST,0,0 +13360,TEST,0,0 +13361,TEST,0,0 +13362,TEST,0,0 +13363,TEST,0,0 +13364,TEST,0,0 +13365,TEST,0,0 +13366,TEST,0,0 +13367,TEST,0,0 +13368,TEST,0,0 +13369,TEST,0,0 +13370,TEST,0,0 +13371,TEST,0,0 +13372,TEST,0,0 +13373,TEST,0,0 +13374,TEST,0,0 +13375,TEST,0,0 +13376,TEST,0,0 +13377,TEST,0,0 +13378,TEST,0,0 +13379,TEST,0,0 +13380,TEST,0,0 +13381,TEST,0,0 +13382,TEST,0,0 +13383,TEST,0,0 +13384,TEST,0,0 +13385,TEST,0,0 +13386,TEST,0,0 +13387,TEST,0,0 +13388,TEST,0,0 +13389,TEST,0,0 +13390,TEST,0,0 +13391,TEST,0,0 +13392,TEST,0,0 +13393,TEST,0,0 +13394,TEST,0,0 +13395,TEST,0,0 +13396,TEST,0,0 +13397,TEST,0,0 +13398,TEST,0,0 +13399,TEST,0,0 +13400,TEST,0,0 +13401,TEST,0,0 +13402,TEST,0,0 +13403,TEST,0,0 +13404,TEST,0,0 +13405,TEST,0,0 +13406,TEST,0,0 +13407,TEST,0,0 +13408,TEST,0,0 +13409,TEST,0,0 +13410,TEST,0,0 +13411,TEST,0,0 +13412,TEST,0,0 +13413,TEST,0,0 +13414,TEST,0,0 +13415,TEST,0,0 +13416,TEST,0,0 +13417,TEST,0,0 +13418,TEST,0,0 +13419,TEST,0,0 +13420,TEST,0,0 +13421,TEST,0,0 +13422,TEST,0,0 
+13423,TEST,0,0 +13424,TEST,0,0 +13425,TEST,0,0 +13426,TEST,0,0 +13427,TEST,0,0 +13428,TEST,0,0 +13429,TEST,0,0 +13430,TEST,0,0 +13431,TEST,0,0 +13432,TEST,0,0 +13433,TEST,0,0 +13434,TEST,0,0 +13435,TEST,0,0 +13436,TEST,0,0 +13437,TEST,0,0 +13438,TEST,0,0 +13439,TEST,0,0 +13440,TEST,0,0 +13441,TEST,0,0 +13442,TEST,0,0 +13443,TEST,0,0 +13444,TEST,0,0 +13445,TEST,0,0 +13446,TEST,0,0 +13447,TEST,0,0 +13448,TEST,0,0 +13449,TEST,0,0 +13450,TEST,0,0 +13451,TEST,0,0 +13452,TEST,0,0 +13453,TEST,0,0 +13454,TEST,0,0 +13455,TEST,0,0 +13456,TEST,0,0 +13457,TEST,0,0 +13458,TEST,0,0 +13459,TEST,0,0 +13460,TEST,0,0 +13461,TEST,0,0 +13462,TEST,0,0 +13463,TEST,0,0 +13464,TEST,0,0 +13465,TEST,0,0 +13466,TEST,0,0 +13467,TEST,0,0 +13468,TEST,0,0 +13469,TEST,0,0 +13470,TEST,0,0 +13471,TEST,0,0 +13472,TEST,0,0 +13473,TEST,0,0 +13474,TEST,0,0 +13475,TEST,0,0 +13476,TEST,0,0 +13477,TEST,0,0 +13478,TEST,0,0 +13479,TEST,0,0 +13480,TEST,0,0 +13481,TEST,0,0 +13482,TEST,0,0 +13483,TEST,0,0 +13484,TEST,0,0 +13485,TEST,0,0 +13486,TEST,0,0 +13487,TEST,0,0 +13488,TEST,0,0 +13489,TEST,0,0 +13490,TEST,0,0 +13491,TEST,0,0 +13492,TEST,0,0 +13493,TEST,0,0 +13494,TEST,0,0 +13495,TEST,0,0 +13496,TEST,0,0 +13497,TEST,0,0 +13498,TEST,0,0 +13499,TEST,0,0 +13500,TEST,0,0 +13501,TEST,0,0 +13502,TEST,0,0 +13503,TEST,0,0 +13504,TEST,0,0 +13505,TEST,0,0 +13506,TEST,0,0 +13507,TEST,0,0 +13508,TEST,0,0 +13509,TEST,0,0 +13510,TEST,0,0 +13511,TEST,0,0 +13512,TEST,0,0 +13513,TEST,0,0 +13514,TEST,0,0 +13515,TEST,0,0 +13516,TEST,0,0 +13517,TEST,0,0 +13518,TEST,0,0 +13519,TEST,0,0 +13520,TEST,0,0 +13521,TEST,0,0 +13522,TEST,0,0 +13523,TEST,0,0 +13524,TEST,0,0 +13525,TEST,0,0 +13526,TEST,0,0 +13527,TEST,0,0 +13528,TEST,0,0 +13529,TEST,0,0 +13530,TEST,0,0 +13531,TEST,0,0 +13532,TEST,0,0 +13533,TEST,0,0 +13534,TEST,0,0 +13535,TEST,0,0 +13536,TEST,0,0 +13537,TEST,0,0 +13538,TEST,0,0 +13539,TEST,0,0 +13540,TEST,0,0 +13541,TEST,0,0 +13542,TEST,0,0 +13543,TEST,0,0 +13544,TEST,0,0 +13545,TEST,0,0 +13546,TEST,0,0 +13547,TEST,0,0 +13548,TEST,0,0 +13549,TEST,0,0 +13550,TEST,0,0 +13551,TEST,0,0 +13552,TEST,0,0 +13553,TEST,0,0 +13554,TEST,0,0 +13555,TEST,0,0 +13556,TEST,0,0 +13557,TEST,0,0 +13558,TEST,0,0 +13559,TEST,0,0 +13560,TEST,0,0 +13561,TEST,0,0 +13562,TEST,0,0 +13563,TEST,0,0 +13564,TEST,0,0 +13565,TEST,0,0 +13566,TEST,0,0 +13567,TEST,0,0 +13568,TEST,0,0 +13569,TEST,0,0 +13570,TEST,0,0 +13571,TEST,0,0 +13572,TEST,0,0 +13573,TEST,0,0 +13574,TEST,0,0 +13575,TEST,0,0 +13576,TEST,0,0 +13577,TEST,0,0 +13578,TEST,0,0 +13579,TEST,0,0 +13580,TEST,0,0 +13581,TEST,0,0 +13582,TEST,0,0 +13583,TEST,0,0 +13584,TEST,0,0 +13585,TEST,0,0 +13586,TEST,0,0 +13587,TEST,0,0 +13588,TEST,0,0 +13589,TEST,0,0 +13590,TEST,0,0 +13591,TEST,0,0 +13592,TEST,0,0 +13593,TEST,0,0 +13594,TEST,0,0 +13595,TEST,0,0 +13596,TEST,0,0 +13597,TEST,0,0 +13598,TEST,0,0 +13599,TEST,0,0 +13600,TEST,0,0 +13601,TEST,0,0 +13602,TEST,0,0 +13603,TEST,0,0 +13604,TEST,0,0 +13605,TEST,0,0 +13606,TEST,0,0 +13607,TEST,0,0 +13608,TEST,0,0 +13609,TEST,0,0 +13610,TEST,0,0 +13611,TEST,0,0 +13612,TEST,0,0 +13613,TEST,0,0 +13614,TEST,0,0 +13615,TEST,0,0 +13616,TEST,0,0 +13617,TEST,0,0 +13618,TEST,0,0 +13619,TEST,0,0 +13620,TEST,0,0 +13621,TEST,0,0 +13622,TEST,0,0 +13623,TEST,0,0 +13624,TEST,0,0 +13625,TEST,0,0 +13626,TEST,0,0 +13627,TEST,0,0 +13628,TEST,0,0 +13629,TEST,0,0 +13630,TEST,0,0 +13631,TEST,0,0 +13632,TEST,0,0 +13633,TEST,0,0 +13634,TEST,0,0 +13635,TEST,0,0 +13636,TEST,0,0 +13637,TEST,0,0 +13638,TEST,0,0 +13639,TEST,0,0 +13640,TEST,0,0 +13641,TEST,0,0 +13642,TEST,0,0 +13643,TEST,0,0 +13644,TEST,0,0 
+13645,TEST,0,0 +13646,TEST,0,0 +13647,TEST,0,0 +13648,TEST,0,0 +13649,TEST,0,0 +13650,TEST,0,0 +13651,TEST,0,0 +13652,TEST,0,0 +13653,TEST,0,0 +13654,TEST,0,0 +13655,TEST,0,0 +13656,TEST,0,0 +13657,TEST,0,0 +13658,TEST,0,0 +13659,TEST,0,0 +13660,TEST,0,0 +13661,TEST,0,0 +13662,TEST,0,0 +13663,TEST,0,0 +13664,TEST,0,0 +13665,TEST,0,0 +13666,TEST,0,0 +13667,TEST,0,0 +13668,TEST,0,0 +13669,TEST,0,0 +13670,TEST,0,0 +13671,TEST,0,0 +13672,TEST,0,0 +13673,TEST,0,0 +13674,TEST,0,0 +13675,TEST,0,0 +13676,TEST,0,0 +13677,TEST,0,0 +13678,TEST,0,0 +13679,TEST,0,0 +13680,TEST,0,0 +13681,TEST,0,0 +13682,TEST,0,0 +13683,TEST,0,0 +13684,TEST,0,0 +13685,TEST,0,0 +13686,TEST,0,0 +13687,TEST,0,0 +13688,TEST,0,0 +13689,TEST,0,0 +13690,TEST,0,0 +13691,TEST,0,0 +13692,TEST,0,0 +13693,TEST,0,0 +13694,TEST,0,0 +13695,TEST,0,0 +13696,TEST,0,0 +13697,TEST,0,0 +13698,TEST,0,0 +13699,TEST,0,0 +13700,TEST,0,0 +13701,TEST,0,0 +13702,TEST,0,0 +13703,TEST,0,0 +13704,TEST,0,0 +13705,TEST,0,0 +13706,TEST,0,0 +13707,TEST,0,0 +13708,TEST,0,0 +13709,TEST,0,0 +13710,TEST,0,0 +13711,TEST,0,0 +13712,TEST,0,0 +13713,TEST,0,0 +13714,TEST,0,0 +13715,TEST,0,0 +13716,TEST,0,0 +13717,TEST,0,0 +13718,TEST,0,0 +13719,TEST,0,0 +13720,TEST,0,0 +13721,TEST,0,0 +13722,TEST,0,0 +13723,TEST,0,0 +13724,TEST,0,0 +13725,TEST,0,0 +13726,TEST,0,0 +13727,TEST,0,0 +13728,TEST,0,0 +13729,TEST,0,0 +13730,TEST,0,0 +13731,TEST,0,0 +13732,TEST,0,0 +13733,TEST,0,0 +13734,TEST,0,0 +13735,TEST,0,0 +13736,TEST,0,0 +13737,TEST,0,0 +13738,TEST,0,0 +13739,TEST,0,0 +13740,TEST,0,0 +13741,TEST,0,0 +13742,TEST,0,0 +13743,TEST,0,0 +13744,TEST,0,0 +13745,TEST,0,0 +13746,TEST,0,0 +13747,TEST,0,0 +13748,TEST,0,0 +13749,TEST,0,0 +13750,TEST,0,0 +13751,TEST,0,0 +13752,TEST,0,0 +13753,TEST,0,0 +13754,TEST,0,0 +13755,TEST,0,0 +13756,TEST,0,0 +13757,TEST,0,0 +13758,TEST,0,0 +13759,TEST,0,0 +13760,TEST,0,0 +13761,TEST,0,0 +13762,TEST,0,0 +13763,TEST,0,0 +13764,TEST,0,0 +13765,TEST,0,0 +13766,TEST,0,0 +13767,TEST,0,0 +13768,TEST,0,0 +13769,TEST,0,0 +13770,TEST,0,0 +13771,TEST,0,0 +13772,TEST,0,0 +13773,TEST,0,0 +13774,TEST,0,0 +13775,TEST,0,0 +13776,TEST,0,0 +13777,TEST,0,0 +13778,TEST,0,0 +13779,TEST,0,0 +13780,TEST,0,0 +13781,TEST,0,0 +13782,TEST,0,0 +13783,TEST,0,0 +13784,TEST,0,0 +13785,TEST,0,0 +13786,TEST,0,0 +13787,TEST,0,0 +13788,TEST,0,0 +13789,TEST,0,0 +13790,TEST,0,0 +13791,TEST,0,0 +13792,TEST,0,0 +13793,TEST,0,0 +13794,TEST,0,0 +13795,TEST,0,0 +13796,TEST,0,0 +13797,TEST,0,0 +13798,TEST,0,0 +13799,TEST,0,0 +13800,TEST,0,0 +13801,TEST,0,0 +13802,TEST,0,0 +13803,TEST,0,0 +13804,TEST,0,0 +13805,TEST,0,0 +13806,TEST,0,0 +13807,TEST,0,0 +13808,TEST,0,0 +13809,TEST,0,0 +13810,TEST,0,0 +13811,TEST,0,0 +13812,TEST,0,0 +13813,TEST,0,0 +13814,TEST,0,0 +13815,TEST,0,0 +13816,TEST,0,0 +13817,TEST,0,0 +13818,TEST,0,0 +13819,TEST,0,0 +13820,TEST,0,0 +13821,TEST,0,0 +13822,TEST,0,0 +13823,TEST,0,0 +13824,TEST,0,0 +13825,TEST,0,0 +13826,TEST,0,0 +13827,TEST,0,0 +13828,TEST,0,0 +13829,TEST,0,0 +13830,TEST,0,0 +13831,TEST,0,0 +13832,TEST,0,0 +13833,TEST,0,0 +13834,TEST,0,0 +13835,TEST,0,0 +13836,TEST,0,0 +13837,TEST,0,0 +13838,TEST,0,0 +13839,TEST,0,0 +13840,TEST,0,0 +13841,TEST,0,0 +13842,TEST,0,0 +13843,TEST,0,0 +13844,TEST,0,0 +13845,TEST,0,0 +13846,TEST,0,0 +13847,TEST,0,0 +13848,TEST,0,0 +13849,TEST,0,0 +13850,TEST,0,0 +13851,TEST,0,0 +13852,TEST,0,0 +13853,TEST,0,0 +13854,TEST,0,0 +13855,TEST,0,0 +13856,TEST,0,0 +13857,TEST,0,0 +13858,TEST,0,0 +13859,TEST,0,0 +13860,TEST,0,0 +13861,TEST,0,0 +13862,TEST,0,0 +13863,TEST,0,0 +13864,TEST,0,0 +13865,TEST,0,0 +13866,TEST,0,0 
+13867,TEST,0,0 +13868,TEST,0,0 +13869,TEST,0,0 +13870,TEST,0,0 +13871,TEST,0,0 +13872,TEST,0,0 +13873,TEST,0,0 +13874,TEST,0,0 +13875,TEST,0,0 +13876,TEST,0,0 +13877,TEST,0,0 +13878,TEST,0,0 +13879,TEST,0,0 +13880,TEST,0,0 +13881,TEST,0,0 +13882,TEST,0,0 +13883,TEST,0,0 +13884,TEST,0,0 +13885,TEST,0,0 +13886,TEST,0,0 +13887,TEST,0,0 +13888,TEST,0,0 +13889,TEST,0,0 +13890,TEST,0,0 +13891,TEST,0,0 +13892,TEST,0,0 +13893,TEST,0,0 +13894,TEST,0,0 +13895,TEST,0,0 +13896,TEST,0,0 +13897,TEST,0,0 +13898,TEST,0,0 +13899,TEST,0,0 +13900,TEST,0,0 +13901,TEST,0,0 +13902,TEST,0,0 +13903,TEST,0,0 +13904,TEST,0,0 +13905,TEST,0,0 +13906,TEST,0,0 +13907,TEST,0,0 +13908,TEST,0,0 +13909,TEST,0,0 +13910,TEST,0,0 +13911,TEST,0,0 +13912,TEST,0,0 +13913,TEST,0,0 +13914,TEST,0,0 +13915,TEST,0,0 +13916,TEST,0,0 +13917,TEST,0,0 +13918,TEST,0,0 +13919,TEST,0,0 +13920,TEST,0,0 +13921,TEST,0,0 +13922,TEST,0,0 +13923,TEST,0,0 +13924,TEST,0,0 +13925,TEST,0,0 +13926,TEST,0,0 +13927,TEST,0,0 +13928,TEST,0,0 +13929,TEST,0,0 +13930,TEST,0,0 +13931,TEST,0,0 +13932,TEST,0,0 +13933,TEST,0,0 +13934,TEST,0,0 +13935,TEST,0,0 +13936,TEST,0,0 +13937,TEST,0,0 +13938,TEST,0,0 +13939,TEST,0,0 +13940,TEST,0,0 +13941,TEST,0,0 +13942,TEST,0,0 +13943,TEST,0,0 +13944,TEST,0,0 +13945,TEST,0,0 +13946,TEST,0,0 +13947,TEST,0,0 +13948,TEST,0,0 +13949,TEST,0,0 +13950,TEST,0,0 +13951,TEST,0,0 +13952,TEST,0,0 +13953,TEST,0,0 +13954,TEST,0,0 +13955,TEST,0,0 +13956,TEST,0,0 +13957,TEST,0,0 +13958,TEST,0,0 +13959,TEST,0,0 +13960,TEST,0,0 +13961,TEST,0,0 +13962,TEST,0,0 +13963,TEST,0,0 +13964,TEST,0,0 +13965,TEST,0,0 +13966,TEST,0,0 +13967,TEST,0,0 +13968,TEST,0,0 +13969,TEST,0,0 +13970,TEST,0,0 +13971,TEST,0,0 +13972,TEST,0,0 +13973,TEST,0,0 +13974,TEST,0,0 +13975,TEST,0,0 +13976,TEST,0,0 +13977,TEST,0,0 +13978,TEST,0,0 +13979,TEST,0,0 +13980,TEST,0,0 +13981,TEST,0,0 +13982,TEST,0,0 +13983,TEST,0,0 +13984,TEST,0,0 +13985,TEST,0,0 +13986,TEST,0,0 +13987,TEST,0,0 +13988,TEST,0,0 +13989,TEST,0,0 +13990,TEST,0,0 +13991,TEST,0,0 +13992,TEST,0,0 +13993,TEST,0,0 +13994,TEST,0,0 +13995,TEST,0,0 +13996,TEST,0,0 +13997,TEST,0,0 +13998,TEST,0,0 +13999,TEST,0,0 +14000,TEST,0,0 +14001,TEST,0,0 +14002,TEST,0,0 +14003,TEST,0,0 +14004,TEST,0,0 +14005,TEST,0,0 +14006,TEST,0,0 +14007,TEST,0,0 +14008,TEST,0,0 +14009,TEST,0,0 +14010,TEST,0,0 +14011,TEST,0,0 +14012,TEST,0,0 +14013,TEST,0,0 +14014,TEST,0,0 +14015,TEST,0,0 +14016,TEST,0,0 +14017,TEST,0,0 +14018,TEST,0,0 +14019,TEST,0,0 +14020,TEST,0,0 +14021,TEST,0,0 +14022,TEST,0,0 +14023,TEST,0,0 +14024,TEST,0,0 +14025,TEST,0,0 +14026,TEST,0,0 +14027,TEST,0,0 +14028,TEST,0,0 +14029,TEST,0,0 +14030,TEST,0,0 +14031,TEST,0,0 +14032,TEST,0,0 +14033,TEST,0,0 +14034,TEST,0,0 +14035,TEST,0,0 +14036,TEST,0,0 +14037,TEST,0,0 +14038,TEST,0,0 +14039,TEST,0,0 +14040,TEST,0,0 +14041,TEST,0,0 +14042,TEST,0,0 +14043,TEST,0,0 +14044,TEST,0,0 +14045,TEST,0,0 +14046,TEST,0,0 +14047,TEST,0,0 +14048,TEST,0,0 +14049,TEST,0,0 +14050,TEST,0,0 +14051,TEST,0,0 +14052,TEST,0,0 +14053,TEST,0,0 diff --git a/datasets/anomaly_reserve/kpi/SCORE/problem_TEST/problemDoc.json b/datasets/anomaly_reserve/kpi/SCORE/problem_TEST/problemDoc.json new file mode 100644 index 0000000..1fd55ad --- /dev/null +++ b/datasets/anomaly_reserve/kpi/SCORE/problem_TEST/problemDoc.json @@ -0,0 +1,65 @@ +{ + "about": { + "problemID": "kpi_problem", + "problemName": "kpi_problem", + "problemDescription": "Anomaly detection", + "problemVersion": "4.0.0", + "problemSchemaVersion": "4.0.0", + "taskKeywords": [ + "classification", + "binary", + "tabular" + ] + }, + "inputs": { 
+ "data": [ + { + "datasetID": "kpi_dataset", + "targets": [ + { + "targetIndex": 0, + "resID": "learningData", + "colIndex": 3, + "colName": "ground_truth" + } + ] + } + ], + "dataSplits": { + "method": "holdOut", + "testSize": 0.2, + "stratified": true, + "numRepeats": 0, + "randomSeed": 42, + "splitsFile": "dataSplits.csv", + "datasetViewMaps": { + "train": [ + { + "from": "kpi_dataset", + "to": "kpi_dataset_TRAIN" + } + ], + "test": [ + { + "from": "kpi_dataset", + "to": "kpi_dataset_TEST" + } + ], + "score": [ + { + "from": "kpi_dataset", + "to": "kpi_dataset_SCORE" + } + ] + } + }, + "performanceMetrics": [ + { + "metric": "f1Macro" + } + ] + }, + "expectedOutputs": { + "predictionsFile": "predictions.csv" + } +} \ No newline at end of file diff --git a/datasets/anomaly_reserve/kpi/SCORE/targets.csv b/datasets/anomaly_reserve/kpi/SCORE/targets.csv new file mode 100644 index 0000000..e69de29 diff --git a/datasets/anomaly_reserve/kpi/TEST/dataset_TEST/datasetDoc.json b/datasets/anomaly_reserve/kpi/TEST/dataset_TEST/datasetDoc.json new file mode 100644 index 0000000..2a04d60 --- /dev/null +++ b/datasets/anomaly_reserve/kpi/TEST/dataset_TEST/datasetDoc.json @@ -0,0 +1,63 @@ +{ + "about": { + "datasetID": "kpi_dataset_TEST", + "datasetName": "NULL", + "description": "Database of baseball players and play statistics, including 'Games_played', 'At_bats', 'Runs', 'Hits', 'Doubles', 'Triples', 'Home_runs', 'RBIs', 'Walks', 'Strikeouts', 'Batting_average', 'On_base_pct', 'Slugging_pct' and 'Fielding_ave'", + "citation": " @book{simonoff2003analyzing,title={Analyzing Categorical Data},author={Simonoff, J.S.},isbn={9780387007496},lccn={2003044946},series={Springer Texts in Statistics},url={https://books.google.com/books?id=G8wrifweAoC},year={2003},publisher={Springer New York}} ", + "license": " CC Public Domain Mark 1.0 ", + "source": "OpenML", + "sourceURI": "http://www.openml.org/d/185", + "approximateSize": "", + "datasetSchemaVersion": "4.0.0", + "redacted": false, + "datasetVersion": "4.0.0" + }, + "dataResources": [ + { + "resID": "learningData", + "resPath": "tables/learningData.csv", + "resType": "table", + "resFormat": { + "text/csv": [ + "csv" + ] + }, + "isCollection": false, + "columns": [ + { + "colIndex": 0, + "colName": "d3mIndex", + "colType": "integer", + "role": [ + "index" + ] + }, + { + "colIndex": 1, + "colName": "timestamp", + "colType": "integer", + "role": [ + "attribute" + ] + }, + { + "colIndex": 2, + "colName": "value", + "colType": "real", + "role": [ + "attribute" + ] + }, + { + "colIndex": 3, + "colName": "ground_truth", + "colType": "integer", + "role": [ + "suggestedTarget" + ] + } + ], + "columnsCount": 4 + } + ] +} \ No newline at end of file diff --git a/datasets/anomaly_reserve/kpi/TEST/dataset_TEST/tables/learningData.csv b/datasets/anomaly_reserve/kpi/TEST/dataset_TEST/tables/learningData.csv new file mode 100644 index 0000000..b9e432d --- /dev/null +++ b/datasets/anomaly_reserve/kpi/TEST/dataset_TEST/tables/learningData.csv @@ -0,0 +1,1758 @@ +d3mIndex,timestamp,value,ground_truth +7027,1475026500,0.32264705162415364,0 +7028,1475026800,0.32183430507799304,0 +7029,1475027100,0.31787914535951506,0 +7030,1475027400,0.3296732765365322,0 +7031,1475027700,0.33072178162272026,0 +7032,1475028000,0.3282773378117453,0 +7033,1475028300,0.3412378533449643,0 +7034,1475028600,0.3444485124115538,0 +7035,1475028900,0.34747304631385745,0 +7036,1475029200,0.34477423144747743,0 +7037,1475029500,0.34249419819706234,0 +7038,1475029800,0.3547319276800169,0 
+7039,1475030100,0.3569188983482892,0 +7040,1475030400,0.3528241447571223,0 +7041,1475030700,0.3536617079911538,0 +7042,1475031000,0.3595928965267984,0 +7043,1475031300,0.3414456931108743,0 +7044,1475031600,0.3444702270131781,0 +7045,1475031900,0.3567327731850544,0 +7046,1475032200,0.344169324666176,0 +7047,1475032500,0.34747304631385745,0 +7048,1475032800,0.3413309159271072,0 +7049,1475033100,0.3411665053665474,0 +7050,1475033400,0.3484253867327069,0 +7051,1475033700,0.3466571976814503,0 +7052,1475034000,0.3524518944306527,0 +7053,1475034300,0.3450999504823503,0 +7054,1475034600,0.34230807303382743,0 +7055,1475034900,0.32953368266384336,0 +7056,1475035200,0.3585940248174029,0 +7057,1475035500,0.3494738918188949,0 +7058,1475035800,0.3478918279308732,0 +7059,1475036100,0.3570584922209781,0 +7060,1475036400,0.3642925568982159,0 +7061,1475036700,0.3735522837694128,0 +7062,1475037000,0.371529723662751,0 +7063,1475037300,0.36375899809743295,0 +7064,1475037600,0.3717623801165319,0 +7065,1475037900,0.3745759721677299,0 +7066,1475038200,0.3771134785597968,0 +7067,1475038500,0.38916508287951657,0 +7068,1475038800,0.3930954259090729,0 +7069,1475039100,0.3960051826276095,0 +7070,1475039400,0.3930023633279808,0 +7071,1475039700,0.37371669432997257,0 +7072,1475040000,0.3825328228962828,0 +7073,1475040300,0.35663971060291155,0 +7074,1475040600,0.3567793044756004,0 +7075,1475040900,0.3473334524411687,0 +7076,1475041200,0.35570908478673724,0 +7077,1475041500,0.3453077902482603,0 +7078,1475041800,0.3417031662535769,0 +7079,1475042100,0.34623841606443456,0 +7080,1475042400,0.3279050874852757,0 +7081,1475042700,0.3283486857912131,0 +7082,1475043000,0.3181583331038419,0 +7083,1475043300,0.3134586727324244,0 +7084,1475043600,0.3296050306433111,0 +7085,1475043900,0.3221817387155411,0 +7086,1475044200,0.3210184564455859,0 +7087,1475044500,0.31680892567065194,0 +7088,1475044800,0.3279764354647434,0 +7089,1475045100,0.3037088162647441,0 +7090,1475045400,0.30889860623264503,0 +7091,1475045700,0.3124349843341073,0 +7092,1475046000,0.3008021616321388,0 +7093,1475046300,0.3049651611168421,0 +7094,1475046600,0.3101084197941969,0 +7095,1475046900,0.302173283668004,0 +7096,1475047200,0.3071304171824393,0 +7097,1475047500,0.3046177274782433,0 +7098,1475047800,0.3044998482082296,0 +7099,1475048100,0.296124215862766,0 +7100,1475048400,0.3014287830150645,0 +7101,1475048700,0.2965678141684933,0 +7102,1475049000,0.2985686596733205,0 +7103,1475049300,0.2996854106527297,0 +7104,1475049600,0.2962420951327797,0 +7105,1475049900,0.2976845651479024,0 +7106,1475050200,0.29400859317396144,0 +7107,1475050500,0.3005912197805077,0 +7108,1475050800,0.29345021768425683,0 +7109,1475051100,0.28656358664446197,0 +7110,1475051400,0.2895881205470809,0 +7111,1475051700,0.2858873318847437,0 +7112,1475052000,0.28960983514944083,0 +7113,1475052300,0.28656358664446197,0 +7114,1475052600,0.2957302509339364,0 +7115,1475052900,0.2834677047625855,0 +7116,1475053200,0.29375112003146897,0 +7117,1475053500,0.2945204373729098,0 +7118,1475053800,0.2802353310943717,0 +7119,1475054100,0.2943808435004311,0 +7120,1475054400,0.2876555209354749,0 +7121,1475054700,0.28837830698605443,0 +7122,1475055000,0.2714161004429708,0 +7123,1475055300,0.27325563747295945,0 +7124,1475055600,0.2763267026664397,0 +7125,1475055900,0.2695083175197609,0 +7126,1475056200,0.2664620690147821,0 +7127,1475056500,0.2637849687502365,0 +7128,1475056800,0.2611575018625886,0 +7129,1475057100,0.2521304314454878,0 +7130,1475057400,0.2520373688638704,0 +7131,1475057700,0.2764197652480572,0 
+7132,1475058000,0.2607387202452576,0 +7133,1475058300,0.24971080432343465,0 +7134,1475058600,0.2539916830779415,0 +7135,1475058900,0.2424519229571701,0 +7136,1475059200,0.2554093364045628,0 +7137,1475059500,0.2629008742249235,0 +7138,1475059800,0.2538055579147066,0 +7139,1475060100,0.2649482510205069,0 +7140,1475060400,0.2632979412397895,0 +7141,1475060700,0.264926536418147,0 +7142,1475061000,0.2610644392809712,0 +7143,1475061300,0.25969021515906965,0 +7144,1475061600,0.27055682260605396,0 +7145,1475061900,0.2705102913151928,0 +7146,1475062200,0.27753651622746633,0 +7147,1475062500,0.2853754876857953,0 +7148,1475062800,0.2817243324002506,0 +7149,1475063100,0.2922404041232303,0 +7150,1475063400,0.29065834023573395,0 +7151,1475063700,0.2966360600617144,0 +7152,1475064000,0.2832350483085944,0 +7153,1475064300,0.293518463577478,0 +7154,1475064600,0.2871436767365265,0 +7155,1475064900,0.29491440230173943,0 +7156,1475065200,0.2984042491124456,0 +7157,1475065500,0.3362590052292199,0 +7158,1475065800,0.31566735800232104,0 +7159,1475066100,0.2938907139039477,0 +7160,1475066400,0.3159465457476987,0 +7161,1475066700,0.30140706841270465,0 +7162,1475067000,0.2997536565458457,0 +7163,1475067300,0.2992200977446425,0 +7164,1475067600,0.2795125450437922,0 +7165,1475067900,0.2827945520889036,0 +7166,1475068200,0.2817243324002506,0 +7167,1475068500,0.2869358369709317,0 +7168,1475068800,0.29658952877085315,0 +7169,1475069100,0.2933106238117782,0 +7170,1475069400,0.2914959034702908,0 +7171,1475069700,0.2925195918680826,0 +7172,1475070000,0.30205850648402666,0 +7173,1475070300,0.2888901511848977,0 +7174,1475070600,0.2945886832660259,0 +7175,1475070900,0.30221981495876016,0 +7176,1475071200,0.2890297450573764,0 +7177,1475071500,0.2816995157118543,0 +7178,1475071800,0.2820252347475679,0 +7179,1475072100,0.2830023918544983,0 +7180,1475072400,0.2746050449064648,0 +7181,1475072700,0.2782344855895445,0 +7182,1475073000,0.281165956910546,0 +7183,1475073300,0.2609248454084925,0 +7184,1475073600,0.2648086571480282,0 +7185,1475073900,0.2579220261082334,0 +7186,1475074200,0.2676470658874123,0 +7187,1475074500,0.24947814786944364,0 +7188,1475074800,0.2447784874977109,0 +7189,1475075100,0.23386534875986525,0 +7190,1475075400,0.2357514170806101,0 +7191,1475075700,0.2283281251534706,0 +7192,1475076000,0.2276084411890325,0 +7193,1475076300,0.21660223986956945,0 +7194,1475076600,0.21753286568574376,0 +7195,1475076900,0.20918205002846632,0 +7196,1475077200,0.20020151090228985,0 +7197,1475077500,0.20303991964166346,0 +7198,1475077800,0.19931741637691366,0 +7199,1475078100,0.18635690084348466,0 +7200,1475078400,0.18859040280232395,0 +7201,1475078700,0.17681798622754055,0 +7202,1475079000,0.18230867854307395,0 +7203,1475079300,0.16653457095865698,0 +7204,1475079600,0.1629764782548348,0 +7205,1475079900,0.15660169141388328,0 +7206,1475080200,0.1467835890530869,0 +7207,1475080500,0.14766768357850502,0 +7208,1475080800,0.13910592606959646,0 +7209,1475081100,0.13633576322343344,0 +7210,1475081400,0.13194010728494932,0 +7211,1475081700,0.1272869782039728,0 +7212,1475082000,0.12249425525062264,0 +7213,1475082300,0.11211777740012167,0 +7214,1475082600,0.10723199186515407,0 +7215,1475082900,0.10087891962666752,0 +7216,1475083200,0.0976465459584538,0 +7217,1475083500,0.09383098011203414,0 +7218,1475083800,0.08482562429739833,0 +7219,1475084100,0.08080221868569913,0 +7220,1475084400,0.08082393328732347,0 +7221,1475084700,0.07554418282352629,0 +7222,1475085000,0.07328586417683709,0 +7223,1475085300,0.07188992545205025,0 
+7224,1475085600,0.06672495217307105,0 +7225,1475085900,0.06828530145736683,0 +7226,1475086200,0.062422358815994186,0 +7227,1475086500,0.0631203281783876,0 +7228,1475086800,0.0577226984435259,0 +7229,1475087100,0.056280228429243837,0 +7230,1475087400,0.054626816561754436,0 +7231,1475087700,0.05669901004625956,0 +7232,1475088000,0.052579439766171,0 +7233,1475088300,0.057396979408653015,0 +7234,1475088600,0.05271903363885985,0 +7235,1475088900,0.051276563623527,0 +7236,1475089200,0.05116178643975985,0 +7237,1475089500,0.0513013803124487,0 +7238,1475089800,0.05243984589453297,0 +7239,1475090100,0.052067595568063264,0 +7240,1475090400,0.05172016192946443,0 +7241,1475090700,0.047439283175062685,0 +7242,1475091000,0.047740185522064715,0 +7243,1475091300,0.046462126068342366,0 +7244,1475091600,0.04441474927275894,0 +7245,1475091900,0.0436237173292735,0 +7246,1475092200,0.04120409020616953,0 +7247,1475092500,0.04348412345658465,0 +7248,1475092800,0.04278615409419124,0 +7249,1475093100,0.04299399386010122,0 +7250,1475093400,0.04108621093615582,0 +7251,1475093700,0.04246043505826752,0 +7252,1475094000,0.0424139037677215,0 +7253,1475094300,0.04117927351829865,0 +7254,1475094600,0.04285439998741237,0 +7255,1475094900,0.04255349764041035,0 +7256,1475095200,0.041504992554222346,0 +7257,1475095500,0.041132742227752636,0 +7258,1475095800,0.04339106087444181,0 +7259,1475096100,0.040971433752388674,0 +7260,1475096400,0.03929630728327494,0 +7261,1475096700,0.04031999568159207,0 +7262,1475097000,0.04280786869686635,0 +7263,1475097300,0.04115755891562354,0 +7264,1475097600,0.04255349764041035,0 +7265,1475097900,0.04208818473179781,0 +7266,1475098200,0.0413188673909875,0 +7267,1475098500,0.0413188673909875,0 +7268,1475098800,0.04557803154271409,0 +7269,1475099100,0.04632253219565356,0 +7270,1475099400,0.04841644028178297,0 +7271,1475099700,0.04683437639481212,0 +7272,1475100000,0.05144097418513755,0 +7273,1475100300,0.05232506870971503,0 +7274,1475100600,0.05797706949998192,0 +7275,1475100900,0.056767255939480725,0 +7276,1475101200,0.06551513861151907,0 +7277,1475101500,0.06414401657565393,0 +7278,1475101800,0.0640974852851079,0 +7279,1475102100,0.0660052682080025,0 +7280,1475102400,0.07026133027453338,0 +7281,1475102700,0.07149596052395624,0 +7282,1475103000,0.07317108699306994,0 +7283,1475103300,0.07514711580918569,0 +7284,1475103600,0.07859043132850517,0 +7285,1475103900,0.08021902650707287,0 +7286,1475104200,0.08454643655307144,0 +7287,1475104500,0.0854305310776489,0 +7288,1475104800,0.08652556745543376,0 +7289,1475105100,0.08682646980243576,0 +7290,1475105400,0.08719872012890545,0 +7291,1475105700,0.0928072917138221,0 +7292,1475106000,0.0912252278268512,0 +7293,1475106300,0.09501597698466444,0 +7294,1475106600,0.10369561376369177,0 +7295,1475106900,0.10176301415208568,0 +7296,1475107200,0.10364908247283053,0 +7297,1475107500,0.1077438360641025,0 +7298,1475107800,0.11169899578289576,0 +7299,1475108100,0.1208408433838688,0 +7300,1475108400,0.11490965484874965,0 +7301,1475108700,0.11870040400666795,0 +7302,1475109000,0.11879346658828535,0 +7303,1475109300,0.1214922814551908,0 +7304,1475109600,0.12835719789262576,0 +7305,1475109900,0.12186453178166053,0 +7306,1475110200,0.12688991118910678,0 +7307,1475110500,0.13098466478037873,0 +7308,1475110800,0.14366599256895554,0 +7309,1475111100,0.13545166869801534,0 +7310,1475111400,0.1395464222892873,0 +7311,1475111700,0.14645787001747845,0 +7312,1475112000,0.14405995749778516,0 +7313,1475112300,0.15041302973627171,0 +7314,1475112600,0.15622944108741355,0 
+7315,1475112900,0.15308702791478088,0 +7316,1475113200,0.15367022009288173,0 +7317,1475113500,0.1577401569957574,0 +7318,1475113800,0.16897901476921154,0 +7319,1475114100,0.16688510668276682,0 +7320,1475114400,0.1693977963864374,0 +7321,1475114700,0.168001857662176,0 +7322,1475115000,0.16997788647860698,0 +7323,1475115300,0.17419051933989266,0 +7324,1475115600,0.16960563615213722,0 +7325,1475115900,0.17532898492166168,0 +7326,1475116200,0.1670712318460017,0 +7327,1475116500,0.17877230044150655,0 +7328,1475116800,0.1746558322479798,0 +7329,1475117100,0.1758191145181451,0 +7330,1475117400,0.17467754685033965,0 +7331,1475117700,0.1788188317323678,0 +7332,1475118000,0.1797494575485421,0 +7333,1475118300,0.18596293591465504,0 +7334,1475118600,0.1844739346086711,0 +7335,1475118900,0.17339948739609193,0 +7336,1475119200,0.17705064268163664,0 +7337,1475119500,0.17128076262125094,0 +7338,1475119800,0.16681375870350929,0 +7339,1475120100,0.1842164614662838,0 +7340,1475120400,0.1843095240479012,0 +7341,1475120700,0.18377596524659293,0 +7342,1475121000,0.19908475992285968,0 +7343,1475121300,0.1939415012453997,0 +7344,1475121600,0.2005737612287596,0 +7345,1475121900,0.21095023907930247,0 +7346,1475122200,0.22572237286818256,0 +7347,1475122500,0.2368216367692631,0 +7348,1475122800,0.2511067430476961,0 +7349,1475123100,0.2615762834798145,0 +7350,1475123400,0.28062929602320136,0 +7351,1475123700,0.277744355993061,0 +7352,1475124000,0.2713478545498547,0 +7353,1475124300,0.2875872750422537,0 +7354,1475124600,0.2715339797130896,0 +7355,1475124900,0.27039241204517905,0 +7356,1475125200,0.2593179648325999,0 +7357,1475125500,0.2555489302770415,0 +7358,1475125800,0.2493602685993249,0 +7359,1475126100,0.2434290800642057,0 +7360,1475126400,0.2381245129119071,0 +7361,1475126700,0.22241865122071133,0 +7362,1475127000,0.20887804559521767,0 +7363,1475127300,0.20415667062108311,0 +7364,1475127600,0.19252384791874685,0 +7365,1475127900,0.1901507520874604,0 +7366,1475128200,0.17954161778294736,0 +7367,1475128500,0.17546857879403527,0 +7368,1475128800,0.17046801607509038,0 +7369,1475129100,0.16383575609175155,0 +7370,1475129400,0.16162707082132954,0 +7371,1475129700,0.1565551601231271,0 +7372,1475130000,0.16313778672956827,0 +7373,1475130300,0.16139441436723342,0 +7374,1475130600,0.16497732375955698,0 +7375,1475130900,0.16320913470882584,0 +7376,1475131200,0.16688510668276682,0 +7377,1475131500,0.16078950758677268,0 +7378,1475131800,0.16348832245367814,0 +7379,1475132100,0.15636903495989227,0 +7380,1475132400,0.16825622871863194,0 +7381,1475132700,0.1679770409737797,0 +7382,1475133000,0.1589034392659227,0 +7383,1475133300,0.16420800641822128,0 +7384,1475133600,0.1661406060298274,0 +7385,1475133900,0.15827681788299702,0 +7386,1475134200,0.1581372240106234,0 +7387,1475134500,0.16104387864322867,0 +7388,1475134800,0.16095081606161127,0 +7389,1475135100,0.16290513027557724,0 +7390,1475135400,0.1705610786567078,0 +7391,1475135700,0.16592966417819624,0 +7392,1475136000,0.16188144187778555,0 +7393,1475136300,0.1679553263714198,0 +7394,1475136600,0.16597619546895245,0 +7395,1475136900,0.16923338582566752,0 +7396,1475137200,0.16869982702435926,0 +7397,1475137500,0.16816316613701454,0 +7398,1475137800,0.16895419808081524,0 +7399,1475138100,0.17228273641668282,0 +7400,1475138400,0.17402610877901767,0 +7401,1475138700,0.1787040545483905,0 +7402,1475139000,0.18149593199691336,0 +7403,1475139300,0.1832641210476445,0 +7404,1475139600,0.1798673368185558,0 +7405,1475139900,0.18086620852795124,0 +7406,1475140200,0.1858698733330376,0 
+7407,1475140500,0.2018052893921881,0 +7408,1475140800,0.19847985314234112,0 +7409,1475141100,0.1994104789585312,0 +7410,1475141400,0.1929178128475975,0 +7411,1475141700,0.20757516945254226,0 +7412,1475142000,0.2017835747898072,0 +7413,1475142300,0.2094612377733186,0 +7414,1475142600,0.21634786881311344,0 +7415,1475142900,0.2105997033551927,0 +7416,1475143200,0.2147658049256172,0 +7417,1475143500,0.2158112079257688,0 +7418,1475143800,0.2109254223908012,0 +7419,1475144100,0.2267925925568356,0 +7420,1475144400,0.21751115108338387,0 +7421,1475144700,0.2202564972411505,0 +7422,1475145000,0.2234671563070044,0 +7423,1475145300,0.21932587142497625,0 +7424,1475145600,0.22132671692969846,0 +7425,1475145900,0.2261876857762697,0 +7426,1475146200,0.23635632386117605,0 +7427,1475146500,0.2259798460106749,0 +7428,1475146800,0.2263738109395045,0 +7429,1475147100,0.2305864438007903,0 +7430,1475147400,0.2418004848858481,0 +7431,1475147700,0.23889072816720644,0 +7432,1475148000,0.2488732410888778,0 +7433,1475148300,0.2500830546499044,0 +7434,1475148600,0.25289664670089224,0 +7435,1475148900,0.2560390598735249,0 +7436,1475149200,0.2524561504812013,0 +7437,1475149500,0.2655996890918289,0 +7438,1475149800,0.2696944426829958,0 +7439,1475150100,0.2600159341946781,0 +7440,1475150400,0.2715805110038457,0 +7441,1475150700,0.2727437932741161,0 +7442,1475151000,0.27148744842222833,0 +7443,1475151300,0.2720458239119329,0 +7444,1475151600,0.2779552978446922,0 +7445,1475151900,0.2750455411261556,0 +7446,1475152200,0.27969867020702704,0 +7447,1475152500,0.27292991843735104,0 +7448,1475152800,0.2846309870328559,0 +7449,1475153100,0.2743723884523686,0 +7450,1475153400,0.2822113599108027,0 +7451,1475153700,0.29072658612885,0 +7452,1475154000,0.2876803376238712,0 +7453,1475154300,0.28414395952240884,0 +7454,1475154600,0.2844913931603772,0 +7455,1475154900,0.28647052406284457,0 +7456,1475155200,0.28076888989568005,0 +7457,1475155500,0.29070487152649016,0 +7458,1475155800,0.2940768390671825,0 +7459,1475156100,0.2925661231588387,0 +7460,1475156400,0.28363211532346044,0 +7461,1475156700,0.28833177569519314,0 +7462,1475157000,0.2873080872974014,0 +7463,1475157300,0.29170374323588555,0 +7464,1475157600,0.2816312698186332,0 +7465,1475157900,0.28635264479283085,0 +7466,1475158200,0.2927057170313174,0 +7467,1475158500,0.2786067359161193,0 +7468,1475158800,0.28986730829193325,0 +7469,1475159100,0.2874942124606363,0 +7470,1475159400,0.2751634203961693,0 +7471,1475159700,0.2706250684992752,0 +7472,1475160000,0.2682519726679782,0 +7473,1475160300,0.26411068778595004,0 +7474,1475160600,0.2754891394318829,0 +7475,1475160900,0.26818372677475705,0 +7476,1475161200,0.2478030214002248,0 +7477,1475161500,0.2479426152727035,0 +7478,1475161800,0.23232981616312515,0 +7479,1475162100,0.23754132073380624,0 +7480,1475162400,0.2335861610150129,0 +7481,1475162700,0.2379849190395335,0 +7482,1475163000,0.21734674052250888,0 +7483,1475163300,0.22041780571588404,0 +7484,1475163600,0.2128580220024073,0 +7485,1475163900,0.205645671926983,0 +7486,1475164200,0.19403456382701714,0 +7487,1475164500,0.18023958714502555,0 +7488,1475164800,0.18449564921113606,0 +7489,1475165100,0.17760901817134125,0 +7490,1475165400,0.1739578628857965,0 +7491,1475165700,0.15727484408767026,0 +7492,1475166000,0.15083181135360266,0 +7493,1475166300,0.15085352595596255,0 +7494,1475166600,0.1451549938748344,0 +7495,1475166900,0.1325201973770138,0 +7496,1475167200,0.12510000753601574,0 +7497,1475167500,0.1167709064810982,0 +7498,1475167800,0.11490965484874965,0 
+7499,1475168100,0.11058224480348663,0 +7500,1475168400,0.10295111311075236,0 +7501,1475168700,0.0993682037184288,0 +7502,1475169000,0.0974138895043577,0 +7503,1475169300,0.08927091361235978,0 +7504,1475169600,0.08459296784361743,0 +7505,1475169900,0.08147537135917088,0 +7506,1475170200,0.07542630355351257,0 +7507,1475170500,0.07442743184411715,0 +7508,1475170800,0.07314627030414825,0 +7509,1475171100,0.06926245856513794,0 +7510,1475171400,0.06732985895332161,0 +7511,1475171700,0.061631326872508725,0 +7512,1475172000,0.06195704590738161,0 +7513,1475172300,0.0646775753774876,0 +7514,1475172600,0.06293420301515275,0 +7515,1475172900,0.05776922973512274,0 +7516,1475173200,0.05662766206679187,0 +7517,1475173500,0.05367447614395582,0 +7518,1475173800,0.05248637718507899,0 +7519,1475174100,0.05309128396532956,0 +7520,1475174400,0.05386060130719069,0 +7521,1475174700,0.050743004822744096,0 +7522,1475175000,0.05230025202184412,0 +7523,1475175300,0.0541863203420636,0 +7524,1475175600,0.048649096736614654,0 +7525,1475175900,0.05009156675089673,0 +7526,1475176200,0.04841644028178297,0 +7527,1475176500,0.04608987574187265,0 +7528,1475176800,0.046716497124798376,0 +7529,1475177100,0.04401768225736754,0 +7530,1475177400,0.04359890064035179,0 +7531,1475177700,0.04294746256850437,0 +7532,1475178000,0.04264656022150239,0 +7533,1475178300,0.04101796504293469,0 +7534,1475178600,0.0405061208448269,0 +7535,1475178900,0.04243561837039664,0 +7536,1475179200,0.04190205956856294,0 +7537,1475179500,0.041458461262625534,0 +7538,1475179800,0.041551523844768366,0 +7539,1475180100,0.039901214063525516,0 +7540,1475180400,0.04190205956856294,0 +7541,1475180700,0.040692246008061775,0 +7542,1475181000,0.0389240569568052,0 +7543,1475181300,0.04139021536940436,0 +7544,1475181600,0.041715934405328094,0 +7545,1475181900,0.038179556303865776,0 +7546,1475182200,0.04010905382943551,0 +7547,1475182500,0.04080702319182894,0 +7548,1475182800,0.04115755891562354,0 +7549,1475183100,0.037016274033910605,0 +7550,1475183400,0.04010905382943551,0 +7551,1475183700,0.04122580480884467,0 +7552,1475184000,0.04038824157481321,0 +7553,1475184300,0.040155585119981525,0 +7554,1475184600,0.04083183987969982,0 +7555,1475184900,0.03987639737565465,0 +7556,1475185200,0.04227430989503268,0 +7557,1475185500,0.04478699959922865,0 +7558,1475185800,0.04536708969160833,0 +7559,1475186100,0.04664825123157725,0 +7560,1475186400,0.05146268878676186,0 +7561,1475186700,0.05274385032778155,0 +7562,1475187000,0.05834931982645161,0 +7563,1475187300,0.05932647693422274,0 +7564,1475187600,0.0621648856732916,0 +7565,1475187900,0.0668645460447091,0 +7566,1475188200,0.06679630015148794,0 +7567,1475188500,0.06863273509596564,0 +7568,1475188800,0.07035439285562536,0 +7569,1475189100,0.0710306476153437,0 +7570,1475189400,0.07372946248277452,0 +7571,1475189700,0.07898749834389658,0 +7572,1475190000,0.08243081386426683,0 +7573,1475190300,0.08659381334865487,0 +7574,1475190600,0.09250328728057353,0 +7575,1475190900,0.08682646980243576,0 +7576,1475191200,0.09178360331655576,0 +7577,1475191500,0.092968600189081,0 +7578,1475191800,0.08922438232181379,0 +7579,1475192100,0.09697029119873553,0 +7580,1475192400,0.09655150958140456,0 +7581,1475192700,0.1079764925180935,0 +7582,1475193000,0.10769730477324126,0 +7583,1475193300,0.10509155248795324,0 +7584,1475193600,0.1082773948654108,0 +7585,1475193900,0.1082773948654108,0 +7586,1475194200,0.11711834011917162,0 +7587,1475194500,0.11483830686949208,0 +7588,1475194800,0.1143047480681838,0 +7589,1475195100,0.1182133764962209,0 
+7590,1475195400,0.12000328014931196,0 +7591,1475195700,0.12100525394474378,0 +7592,1475196000,0.12430897559221506,0 +7593,1475196300,0.1248673510819196,0 +7594,1475196600,0.13007885565260074,0 +7595,1475196900,0.1352438289324206,0 +7596,1475197200,0.1384762026006343,0 +7597,1475197500,0.1411284861767836,0 +7598,1475197800,0.14229176844705402,0 +7599,1475198100,0.14790034003260116,0 +7600,1475198400,0.14543418161968671,0 +7601,1475198700,0.14838736754304815,0 +7602,1475199000,0.15004077940980198,0 +7603,1475199300,0.1525069378227164,0 +7604,1475199600,0.15427512687344755,0 +7605,1475199900,0.16239328607662887,0 +7606,1475200200,0.16502385505041825,0 +7607,1475200500,0.16672069612189186,0 +7608,1475200800,0.16267247382148114,0 +7609,1475201100,0.16946604227965853,0 +7610,1475201400,0.17847139809429438,0 +7611,1475201700,0.17644573590107082,0 +7612,1475202000,0.1721431425442041,0 +7613,1475202300,0.16921167122320258,0 +7614,1475202600,0.1674434821724714,0 +7615,1475202900,0.16509210094363938,0 +7616,1475203200,0.16818798282541084,0 +7617,1475203500,0.17802779978856711,0 +7618,1475203800,0.17521420773768434,0 +7619,1475204100,0.17979598883940334,0 +7620,1475204400,0.1797742742369384,0 +7621,1475204700,0.17847139809429438,0 +7622,1475205000,0.1763526733194534,0 +7623,1475205300,0.16628019990230608,0 +7624,1475205600,0.1747706094319571,0 +7625,1475205900,0.1673038883000978,0 +7626,1475206200,0.17037495349347295,0 +7627,1475206500,0.17526073902844053,0 +7628,1475206800,0.1769823967884155,0 +7629,1475207100,0.18035436432910792,0 +7630,1475207400,0.18947449732770005,0 +7631,1475207700,0.18742712053208507,0 +7632,1475208000,0.19871250959638787,0 +7633,1475208300,0.2065762977431678,0 +7634,1475208600,0.22039609111352407,0 +7635,1475208900,0.2313061277653334,0 +7636,1475209200,0.2403114835799692,0 +7637,1475209500,0.24731289180374136,0 +7638,1475209800,0.2546648357516233,0 +7639,1475210100,0.2574101819093901,0 +7640,1475210400,0.2671817529793252,0 +7641,1475210700,0.27814142300792705,0 +7642,1475211000,0.2727655078764761,0 +7643,1475211300,0.2663441897447683,0 +7644,1475211600,0.2507344927212264,0 +7645,1475211900,0.2422657977939353,0 +7646,1475212200,0.2380314503302897,0 +7647,1475212500,0.2371938870957329,0 +7648,1475212800,0.21858137077203685,0 +7649,1475213100,0.2115086145690072,0 +7650,1475213400,0.2101343904471056,0 +7651,1475213700,0.205971390962644,0 +7652,1475214000,0.1927565043727904,0 +7653,1475214300,0.18398380501218767,0 +7654,1475214600,0.1925703792095556,0 +7655,1475214900,0.17586564580900635,0 +7656,1475215200,0.17246886157991764,0 +7657,1475215500,0.17300242038122596,0 +7658,1475215800,0.1697700467130122,0 +7659,1475216100,0.16169531671455067,0 +7660,1475216400,0.16427935439747882,0 +7661,1475216700,0.16381404148939166,0 +7662,1475217000,0.1567164685978606,0 +7663,1475217300,0.15992712766371445,0 +7664,1475217600,0.1548800336540134,0 +7665,1475217900,0.1579045675565273,0 +7666,1475218200,0.15499481083799072,0 +7667,1475218500,0.16190625856618182,0 +7668,1475218800,0.15862425152107046,0 +7669,1475219100,0.16262594253072496,0 +7670,1475219400,0.1547621543838946,0 +7671,1475219700,0.15848465764869685,0 +7672,1475220000,0.16239328607662887,0 +7673,1475220300,0.15753231723005756,0 +7674,1475220600,0.15278612556756868,0 +7675,1475220900,0.15643728085300831,0 +7676,1475221200,0.14631827614499976,0 +7677,1475221500,0.16069644500515526,0 +7678,1475221800,0.15457602922065972,0 +7679,1475222100,0.15888172466356282,0 +7680,1475222400,0.1556462489093127,0 +7681,1475222700,0.15962622531650228,0 
+7682,1475223000,0.16155572284207198,0 +7683,1475223300,0.1650703863411744,0 +7684,1475223600,0.16590794957583635,0 +7685,1475223900,0.1687463583151154,0 +7686,1475224200,0.1584164117554757,0 +7687,1475224500,0.17023535962109934,0 +7688,1475224800,0.16169531671455067,0 +7689,1475225100,0.17416570265139128,0 +7690,1475225400,0.17053626196831154,0 +7691,1475225700,0.17721505324251166,0 +7692,1475226000,0.1789584256047414,0 +7693,1475226300,0.18075143134397392,0 +7694,1475226600,0.17847139809429438,0 +7695,1475226900,0.1831462417776308,0 +7696,1475227200,0.1974561647445347,0 +7697,1475227500,0.18500749340997927,0 +7698,1475227800,0.19433856826030774,0 +7699,1475228100,0.19650072223991047,0 +7700,1475228400,0.188987469817232,0 +7701,1475228700,0.19617500320424952,0 +7702,1475229000,0.2046685148199895,0 +7703,1475229300,0.20329429069808794,0 +7704,1475229600,0.1920585350106492,0 +7705,1475229900,0.20122519930009208,0 +7706,1475230200,0.2010390741368572,0 +7707,1475230500,0.20613269943745105,0 +7708,1475230800,0.20613269943745105,0 +7709,1475231100,0.2081118303398764,0 +7710,1475231400,0.21327680361973828,0 +7711,1475231700,0.20487635458560524,0 +7712,1475232000,0.20620404741669807,0 +7713,1475232300,0.21858137077203685,0 +7714,1475232600,0.21692795890517788,0 +7715,1475232900,0.21699930688443544,0 +7716,1475233200,0.21788340140985354,0 +7717,1475233500,0.2159973330890036,0 +7718,1475233800,0.2291439737856676,0 +7719,1475234100,0.22923703636728496,0 +7720,1475234400,0.22260477638394616,0 +7721,1475234700,0.22681740924523186,0 +7722,1475235000,0.22253653049083008,0 +7723,1475235300,0.22877172345919786,0 +7724,1475235600,0.2339584113414827,0 +7725,1475235900,0.2378670397694147,0 +7726,1475236200,0.2493137373085687,0 +7727,1475236500,0.24619614082433225,0 +7728,1475236800,0.25831909312320445,0 +7729,1475237100,0.25375902662384536,0 +7730,1475237400,0.2555954615677977,0 +7731,1475237700,0.26911435259093136,0 +7732,1475238000,0.2707181310808926,0 +7733,1475238300,0.2657640996527039,0 +7734,1475238600,0.2643216296375812,0 +7735,1475238900,0.26597193941829866,0 +7736,1475239200,0.25790031150587345,0 +7737,1475239500,0.26818372677475705,0 +7738,1475239800,0.2628791596224585,0 +7739,1475240100,0.25515496534810683,0 +7740,1475240400,0.2565509040724733,0 +7741,1475240700,0.26615806458153346,0 +7742,1475241000,0.2681123787954995,0 +7743,1475241300,0.27895416955408764,0 +7744,1475241600,0.2816995157118543,0 +7745,1475241900,0.27118344398897976,0 +7746,1475242200,0.27434757176397234,0 +7747,1475242500,0.2703706974428192,0 +7748,1475242800,0.2749772952329345,0 +7749,1475243100,0.2694152549381435,0 +7750,1475243400,0.27248632013162377,0 +7751,1475243700,0.26908953590253504,0 +7752,1475244000,0.2738822588558852,0 +7753,1475244300,0.2652057241629993,0 +7754,1475244600,0.2707181310808926,0 +7755,1475244900,0.2743723884523686,0 +7756,1475245200,0.26841638322885314,0 +7757,1475245500,0.25808643666910835,0 +7758,1475245800,0.2680658475047433,0 +7759,1475246100,0.2583873390163205,0 +7760,1475246400,0.2562251850367598,0 +7761,1475246700,0.25375902662384536,0 +7762,1475247000,0.2494316165785824,0 +7763,1475247300,0.2524096191904452,0 +7764,1475247600,0.2515472392673869,0 +7765,1475247900,0.24626748880358976,0 +7766,1475248200,0.23751960613144635,0 +7767,1475248500,0.2333317899585569,0 +7768,1475248800,0.220185149261893,0 +7769,1475249100,0.21776552213983985,0 +7770,1475249400,0.20429626449350924,0 +7771,1475249700,0.20143303906570784,0 +7772,1475250000,0.1998975064689993,0 +7773,1475250300,0.19215159759226666,0 
+7774,1475250600,0.1914753428325063,0 +7775,1475250900,0.1825661516855664,0 +7776,1475251200,0.17537551621241784,0 +7777,1475251500,0.1741439880490314,0 +7778,1475251800,0.16150919155131582,0 +7779,1475252100,0.15555318632769527,0 +7780,1475252400,0.14405995749778516,0 +7781,1475252700,0.14264230417105875,0 +7782,1475253000,0.12889075669393404,0 +7783,1475253300,0.1319618218873092,0 +7784,1475253600,0.12316740792430453,0 +7785,1475253900,0.12081912878150892,0 +7786,1475254200,0.11986368627693834,0 +7787,1475254500,0.11253655901745263,0 +7788,1475254800,0.10185607673370307,0 +7789,1475255100,0.10059973188181526,0 +7790,1475255400,0.0938526947144991,0 +7791,1475255700,0.0920845056635578,0 +7792,1475256000,0.08059127683354259,0 +7793,1475256300,0.07779939938501973,0 +7794,1475256600,0.07600949573213882,0 +7795,1475256900,0.06965642349323196,0 +7796,1475257200,0.06805264500358592,0 +7797,1475257500,0.06900498542243537,0 +7798,1475257800,0.06928417316676225,0 +7799,1475258100,0.06405095399456187,0 +7800,1475258400,0.06004926298448705,0 +7801,1475258700,0.061746104056275876,0 +7802,1475259000,0.058653324259700185,0 +7803,1475259300,0.05779094433674705,0 +7804,1475259600,0.05497735228659987,0 +7805,1475259900,0.05569703625061759,0 +7806,1475260200,0.05397537849095788,0 +7807,1475260500,0.04809072124691008,0 +7808,1475260800,0.05055687965950928,0 +7809,1475261100,0.04729968930237384,0 +7810,1475261400,0.04815896714013123,0 +7811,1475261700,0.05015981264411786,0 +7812,1475262000,0.04534537508893322,0 +7813,1475262300,0.04387808838467869,0 +7814,1475262600,0.04699568486912525,0 +7815,1475262900,0.04557803154271409,0 +7816,1475263200,0.045878933889716124,0 +7817,1475263500,0.04443646387543407,0 +7818,1475263800,0.04359890064035179,0 +7819,1475264100,0.0424139037677215,0 +7820,1475264400,0.04227430989503268,0 +7821,1475264700,0.04117927351829865,0 +7822,1475265000,0.04348412345658465,0 +7823,1475265300,0.0400873392267604,0 +7824,1475265600,0.043459306767662936,0 +7825,1475265900,0.04178418029854925,0 +7826,1475266200,0.04004080793621436,0 +7827,1475266500,0.03957549502865262,0 +7828,1475266800,0.0395289637370558,0 +7829,1475267100,0.042202961915565,0 +7830,1475267400,0.0382260875944118,0 +7831,1475267700,0.0388527089773375,0 +7832,1475268000,0.039783334793511815,0 +7833,1475268300,0.03924977599272892,0 +7834,1475268600,0.03913189672271518,0 +7835,1475268900,0.038340864779229766,0 +7836,1475269200,0.03969027221241976,0 +7837,1475269500,0.03924977599272892,0 +7838,1475269800,0.03969027221241976,0 +7839,1475270100,0.03929630728327494,0 +7840,1475270400,0.038179556303865776,0 +7841,1475270700,0.03827261888600862,0 +7842,1475271000,0.040133870518357186,0 +7843,1475271300,0.04287921667633407,0 +7844,1475271600,0.041504992554222346,0 +7845,1475271900,0.04348412345658465,0 +7846,1475272200,0.046276000905107535,0 +7847,1475272500,0.04339106087444181,0 +7848,1475272800,0.051906287092699295,0 +7849,1475273100,0.05334875710803215,0 +7850,1475273400,0.05697819779058647,0 +7851,1475273700,0.05969872726069245,0 +7852,1475274000,0.061026420091207324,0 +7853,1475274300,0.062096639780070476,0 +7854,1475274600,0.06393307472454816,0 +7855,1475274900,0.06486370054072246,0 +7856,1475275200,0.0694237670394511,0 +7857,1475275500,0.07098411632479767,0 +7858,1475275800,0.07351852063061798,0 +7859,1475276100,0.07542630355351257,0 +7860,1475276400,0.07989330747114919,0 +7861,1475276700,0.07791727865503344,0 +7862,1475277000,0.08215162611888915,0 +7863,1475277300,0.08252387644535887,0 +7864,1475277600,0.0872452514194515,0 
+7865,1475277900,0.08471084711363118,0 +7866,1475278200,0.0895966326482835,0 +7867,1475278500,0.09175878662763408,0 +7868,1475278800,0.09513385625478324,0 +7869,1475279100,0.09994829381049326,0 +7870,1475279400,0.10325201545796454,0 +7871,1475279700,0.10981292746204584,0 +7872,1475280000,0.10332336343722208,0 +7873,1475280300,0.10737158573763278,0 +7874,1475280600,0.10809126970207084,0 +7875,1475280900,0.10655573710543582,0 +7876,1475281200,0.11616599970053235,0 +7877,1475281500,0.11725793399154524,0 +7878,1475281800,0.1149561861395058,0 +7879,1475282100,0.11574721808330647,0 +7880,1475282400,0.11965584651123852,0 +7881,1475282700,0.12440203817383247,0 +7882,1475283000,0.12926300702040378,0 +7883,1475283300,0.129706605326131,0 +7884,1475283600,0.14210564328381914,0 +7885,1475283900,0.14026920833986686,0 +7886,1475284200,0.14934281004761873,0 +7887,1475284500,0.1468766516347043,0 +7888,1475284800,0.14971506037408844,0 +7889,1475285100,0.14850524681306188,0 +7890,1475285400,0.15548494043447414,0 +7891,1475285700,0.15899650184754016,0 +7892,1475286000,0.1566947539955007,0 +7893,1475286300,0.15543840914371795,0 +7894,1475286600,0.1652565115044093,0 +7895,1475286900,0.16409322923424394,0 +7896,1475287200,0.16576835570335768,0 +7897,1475287500,0.1690255460599677,0 +7898,1475287800,0.16972351542215094,0 +7899,1475288100,0.16579007030571755,0 +7900,1475288400,0.17028189091185553,0 +7901,1475288700,0.17828527293105953,0 +7902,1475289000,0.1741439880490314,0 +7903,1475289300,0.16686028999437053,0 +7904,1475289600,0.16727907161159644,0 +7905,1475289900,0.1813098068336785,0 +7906,1475290200,0.16688510668276682,0 +7907,1475290500,0.17458448426872225,0 +7908,1475290800,0.16790879508055853,0 +7909,1475291100,0.17365385845254794,0 +7910,1475291400,0.1763309587170935,0 +7911,1475291700,0.17095504358553745,0 +7912,1475292000,0.17125904801889108,0 +7913,1475292300,0.16432588568823495,0 +7914,1475292600,0.16304472414795085,0 +7915,1475292900,0.16041725726030295,0 +7916,1475293200,0.16279035309159995,0 +7917,1475293500,0.16988482389698953,0 +7918,1475293800,0.17272323263637365,0 +7919,1475294100,0.18377596524659293,0 +7920,1475294400,0.2057387345086004,0 +7921,1475294700,0.2024101961727013,0 +7922,1475295000,0.21739327181337006,0 +7923,1475295300,0.22681740924523186,0 +7924,1475295600,0.2403331981823291,0 +7925,1475295900,0.2480573924566808,0 +7926,1475296200,0.2518729583031005,0 +7927,1475296500,0.2597367464498258,0 +7928,1475296800,0.26169106066379183,0 +7929,1475297100,0.26206331099026153,0 +7930,1475297400,0.2561786537460036,0 +7931,1475297700,0.257828963526616,0 +7932,1475298000,0.2540382143686977,0 +7933,1475298300,0.2381710442027684,0 +7934,1475298600,0.2365424490244108,0 +7935,1475298900,0.2220464008942416,0 +7936,1475299200,0.21478751952797706,0 +7937,1475299500,0.21115807884489726,0 +7938,1475299800,0.1968977892548228,0 +7939,1475300100,0.19999056905061666,0 +7940,1475300400,0.17563298935491026,0 +7941,1475300700,0.1812384588544209,0 +7942,1475301000,0.17479542612035334,0 +7943,1475301300,0.16502385505041825,0 +7944,1475301600,0.161301351785616,0 +7945,1475301900,0.16109040993398485,0 +7946,1475302200,0.1569956563427129,0 +7947,1475302500,0.1618131959845644,0 +7948,1475302800,0.15690259376109547,0 +7949,1475303100,0.16243981736749016,0 +7950,1475303400,0.15720659819444913,0 +7951,1475303700,0.16586141828497508,0 +7952,1475304000,0.16274382180073868,0 +7953,1475304300,0.15555318632769527,0 +7954,1475304600,0.1553918778528567,0 +7955,1475304900,0.16234675478587268,0 
+7956,1475305200,0.1593005062808938,0 +7957,1475305500,0.15997365895457571,0 +7958,1475305800,0.15415724760343386,0 +7959,1475306100,0.15706700432197046,0 +7960,1475306400,0.16793050968291842,0 +7961,1475306700,0.1586707828119317,0 +7962,1475307000,0.16281206769395984,0 +7963,1475307300,0.15150496402728456,0 +7964,1475307600,0.15345927824125058,0 +7965,1475307900,0.15567106559770902,0 +7966,1475308200,0.14796858592571718,0 +7967,1475308500,0.15429684147580744,0 +7968,1475308800,0.15390287654697785,0 +7969,1475309100,0.15743925464844016,0 +7970,1475309400,0.15792628215899224,0 +7971,1475309700,0.16139441436723342,0 +7972,1475310000,0.15888172466356282,0 +7973,1475310300,0.1562759723782748,0 +7974,1475310600,0.17025707422345926,0 +7975,1475310900,0.16346350576528185,0 +7976,1475311200,0.1683492913002494,0 +7977,1475311500,0.1675830760449501,0 +7978,1475311800,0.18107715037968747,0 +7979,1475312100,0.1748884887019708,0 +7980,1475312400,0.1825196203947051,0 +7981,1475312700,0.17905148818635885,0 +7982,1475313000,0.18240174112469126,0 +7983,1475313300,0.18789243344018275,0 +7984,1475313600,0.1883825630367082,0 +7985,1475313900,0.19768882119858144,0 +7986,1475314200,0.1948721270615845,0 +7987,1475314500,0.1946860018983496,0 +7988,1475314800,0.19417415769944327,0 +7989,1475315100,0.20108560542766601,0 +7990,1475315400,0.20622576201906848,0 +7991,1475315700,0.2017587581013688,0 +7992,1475316000,0.2038774828762308,0 +7993,1475316300,0.20934335850330488,0 +7994,1475316600,0.20485463998322434,0 +7995,1475316900,0.2118343336046156,0 +7996,1475317200,0.219930778205437,0 +7997,1475317500,0.2248165637404045,0 +7998,1475317800,0.21397477298181647,0 +7999,1475318100,0.2097156088297746,0 +8000,1475318400,0.2229304954196597,0 +8001,1475318700,0.2194189340065937,0 +8002,1475319000,0.22072181014923767,0 +8003,1475319300,0.2256075956842053,0 +8004,1475319600,0.22253653049083008,0 +8005,1475319900,0.22483827834286954,0 +8006,1475320200,0.2247452157612521,0 +8007,1475320500,0.22488480963362573,0 +8008,1475320800,0.2274223160257977,0 +8009,1475321100,0.2296310012962197,0 +8010,1475321400,0.23975310809026465,0 +8011,1475321700,0.23612366740707985,0 +8012,1475322000,0.2513145828133959,0 +8013,1475322300,0.24161435972261325,0 +8014,1475322600,0.2467793330024331,0 +8015,1475322900,0.2492672060177074,0 +8016,1475323200,0.2460100156610973,0 +8017,1475323500,0.2464753285691845,0 +8018,1475323800,0.2538272725170665,0 +8019,1475324100,0.2534550221905968,0 +8020,1475324400,0.26064565766364023,0 +8021,1475324700,0.2619950650971455,0 +8022,1475325000,0.2581794992507257,0 +8023,1475325300,0.265180907474498,0 +8024,1475325600,0.2650661302905206,0 +8025,1475325900,0.2565043727816121,0 +8026,1475326200,0.2698123219531146,0 +8027,1475326500,0.26776494515753113,0 +8028,1475326800,0.2658571622343213,0 +8029,1475327100,0.2705102913151928,0 +8030,1475327400,0.26601847070915985,0 +8031,1475327700,0.2646225319847933,0 +8032,1475328000,0.27118344398897976,0 +8033,1475328300,0.2724180742384026,0 +8034,1475328600,0.2671817529793252,0 +8035,1475328900,0.2662511271631509,0 +8036,1475329200,0.27830273148276563,0 +8037,1475329500,0.26643725232638577,0 +8038,1475329800,0.2686490396828442,0 +8039,1475330100,0.2789324549517277,0 +8040,1475330400,0.27211406980515396,0 +8041,1475330700,0.25757459247026504,0 +8042,1475331000,0.2658571622343213,0 +8043,1475331300,0.2618306545362705,0 +8044,1475331600,0.2555954615677977,0 +8045,1475331900,0.2600159341946781,0 +8046,1475332200,0.2541995228435362,0 +8047,1475332500,0.2596436838682084,0 
+8048,1475332800,0.2443131745895187,0 +8049,1475333100,0.2586913434496742,0 +8050,1475333400,0.2416826056158344,0 +8051,1475333700,0.2493385539969649,0 +8052,1475334000,0.24147476585023964,0 +8053,1475334300,0.23323872737693954,0 +8054,1475334600,0.2253501225417129,0 +8055,1475334900,0.22223252605747645,0 +8056,1475335200,0.2181843037570657,0 +8057,1475335500,0.2226978389655636,0 +8058,1475335800,0.2092037646308262,0 +8059,1475336100,0.20797223646745025,0 +8060,1475336400,0.19617500320424952,0 +8061,1475336700,0.18796378141941927,0 +8062,1475337000,0.17798126849781093,0 +8063,1475337300,0.1806583687623565,0 +8064,1475337600,0.1725619241615351,0 +8065,1475337900,0.17367867514094426,0 +8066,1475338200,0.16292994696397356,0 +8067,1475338500,0.15922915830163625,0 +8068,1475338800,0.1495289352108536,0 +8069,1475339100,0.15108618240995358,0 +8070,1475339400,0.13836142541665702,0 +8071,1475339700,0.14219870586543654,0 +8072,1475340000,0.12626328980618107,0 +8073,1475340300,0.1246346946279286,0 +8074,1475340600,0.12184281717930065,0 +8075,1475340900,0.12409803374058392,0 +8076,1475341200,0.11860734142505053,0 +8077,1475341500,0.10323030085560464,0 +8078,1475341800,0.10592911572251007,0 +8079,1475342100,0.09080644620983548,0 +8080,1475342400,0.08519787462386805,0 +8081,1475342700,0.08652556745543376,0 +8082,1475343000,0.07703318412940516,0 +8083,1475343300,0.07761327422178488,0 +8084,1475343600,0.07095929963587598,0 +8085,1475343900,0.06609833079014532,0 +8086,1475344200,0.06621310797391249,0 +8087,1475344500,0.06395789141241906,0 +8088,1475344800,0.060979888800661325,0 +8089,1475345100,0.05797706949998192,0 +8090,1475345400,0.05588316141385245,0 +8091,1475345700,0.057837475628343876,0 +8092,1475346000,0.05583663012330643,0 +8093,1475346300,0.05325569452588934,0 +8094,1475346600,0.05406844107204989,0 +8095,1475346900,0.05078953611329014,0 +8096,1475347200,0.0493935973895541,0 +8097,1475347500,0.04846297157337981,0 +8098,1475347800,0.05109043846029216,0 +8099,1475348100,0.04648384067101752,0 +8100,1475348400,0.04680955970589041,0 +8101,1475348700,0.04855603415447182,0 +8102,1475349000,0.0480659045579884,0 +8103,1475349300,0.0459502818691838,0 +8104,1475349600,0.04711356413913901,0 +8105,1475349900,0.04466912032921496,0 +8106,1475350200,0.043785025803586654,0 +8107,1475350500,0.045180964528373516,0 +8108,1475350800,0.042600028930956366,0 +8109,1475351100,0.044157276130056385,0 +8110,1475351400,0.043040525150647206,0 +8111,1475351700,0.04418209281897807,0 +8112,1475352000,0.044250338711148425,0 +8113,1475352300,0.043018810547972096,0 +8114,1475352600,0.042249493207161766,0 +8115,1475352900,0.043530654747130665,0 +8116,1475353200,0.04162287182423609,0 +8117,1475353500,0.040133870518357186,0 +8118,1475353800,0.04048130415590522,0 +8119,1475354100,0.04297227925742608,0 +8120,1475354400,0.04004080793621436,0 +8121,1475354700,0.040971433752388674,0 +8122,1475355000,0.04315840442066095,0 +8123,1475355300,0.03992292866620067,0 +8124,1475355600,0.03976162019188748,0 +8125,1475355900,0.041504992554222346,0 +8126,1475356200,0.03889924026893432,0 +8127,1475356500,0.042342555788253806,0 +8128,1475356800,0.04250696634986431,0 +8129,1475357100,0.041064496334531485,0 +8130,1475357400,0.039783334793511815,0 +8131,1475357700,0.0424139037677215,0 +8132,1475358000,0.04273962280364522,0 +8133,1475358300,0.042600028930956366,0 +8134,1475358600,0.044203807420602405,0 +8135,1475358900,0.04515924992569837,0 +8136,1475359200,0.05181322451160726,0 +8137,1475359500,0.05164881394999672,0 +8138,1475359800,0.05351006558234531,0 
+8139,1475360100,0.05765135046510901,0 +8140,1475360400,0.06367870366809217,0 +8141,1475360700,0.06321339075947963,0 +8142,1475361000,0.06670323757039591,0 +8143,1475361300,0.06698242531472283,0 +8144,1475361600,0.06486370054072246,0 +8145,1475361900,0.07121677277857852,0 +8146,1475362200,0.07184339416150426,0 +8147,1475362500,0.07600949573213882,0 +8148,1475362800,0.08238428257267004,0 +8149,1475363100,0.08119618361379316,0 +8150,1475363400,0.08436031138983659,0 +8151,1475363700,0.08689781778190346,0 +8152,1475364000,0.09006194555689602,0 +8153,1475364300,0.08680475519976064,0 +8154,1475364600,0.09048072717391176,0 +8155,1475364900,0.09059550435767892,0 +8156,1475365200,0.08822240852617177,0 +8157,1475365500,0.0978078544332924,0 +8158,1475365800,0.10206701858533423,0 +8159,1475366100,0.1094654938239724,0 +8160,1475366400,0.1019956706060767,0 +8161,1475366700,0.11239696514497396,0 +8162,1475367000,0.11909747102153395,0 +8163,1475367300,0.11595505784890126,0 +8164,1475367600,0.11348889943598682,0 +8165,1475367900,0.1190261230422764,0 +8166,1475368200,0.12342488106679694,0 +8167,1475368500,0.12423762761295752,0 +8168,1475368800,0.12395843986810524,0 +8169,1475369100,0.1252861326992506,0 +8170,1475369400,0.1326132599586312,0 +8171,1475369700,0.12952048016289616,0 +8172,1475370000,0.1318222280149356,0 +8173,1475370300,0.1438986490229466,0 +8174,1475370600,0.14771421486936626,0 +8175,1475370900,0.14429261395177614,0 +8176,1475371200,0.14350158200808058,0 +8177,1475371500,0.1494110559408399,0 +8178,1475371800,0.15097140522597627,0 +8179,1475372100,0.14731714785439518,0 +8180,1475372400,0.1518772143537543,0 +8181,1475372700,0.15520575268962186,0 +8182,1475373000,0.16132306638808094,0 +8183,1475373300,0.161301351785616,0 +8184,1475373600,0.17335295610533574,0 +8185,1475373900,0.1797494575485421,0 +8186,1475374200,0.1705827932590677,0 +8187,1475374500,0.1814711153085171,0 +8188,1475374800,0.17037495349347295,0 +8189,1475375100,0.1767032090435632,0 +8190,1475375400,0.17035013680507669,0 +8191,1475375700,0.16823451411627208,0 +8192,1475376000,0.17435182781462613,1 +8193,1475376300,0.23153878421932444,1 +8194,1475376600,0.17970292625778592,1 +8195,1475376900,0.1819612449050005,0 +8196,1475377200,0.17205007996258667,0 +8197,1475377500,0.1790763048748602,0 +8198,1475377800,0.17889017971162532,0 +8199,1475378100,0.17500326588594814,0 +8200,1475378400,0.17586564580900635,0 +8201,1475378700,0.1729341744880048,0 +8202,1475379000,0.1612765350972197,0 +8203,1475379300,0.1682810454070283,0 +8204,1475379600,0.1717708922177344,0 +8205,1475379900,0.1767249236459231,0 +8206,1475380200,0.1915001595209341,0 +8207,1475380500,0.19173281597497768,0 +8208,1475380800,0.2029685716624164,0 +8209,1475381100,0.2225117138023287,0 +8210,1475381400,0.2210940604757074,0 +8211,1475381700,0.23949563494777226,0 +8212,1475382000,0.2506166134512127,0 +8213,1475382300,0.2600159341946781,0 +8214,1475382600,0.264926536418147,0 +8215,1475382900,0.27865326720687544,0 +8216,1475383200,0.26401762520433264,0 +8217,1475383500,0.2631583473673108,0 +8218,1475383800,0.26906782130017515,0 +8219,1475384100,0.26031993862803177,0 +8220,1475384400,0.2616445293730357,0 +8221,1475384700,0.2449863272633057,0 +8222,1475385000,0.25259574435368004,0 +8223,1475385300,0.23072603767326896,0 +8224,1475385600,0.2132519869312369,0 +8225,1475385900,0.2184883081904194,0 +8226,1475386200,0.2117412710229981,0 +8227,1475386500,0.1969195038572005,0 +8228,1475386800,0.1853580291340892,0 +8229,1475387100,0.17968121165532094,0 +8230,1475387400,0.17509632846756554,0 
+8231,1475387700,0.16427935439747882,0 +8232,1475388000,0.16313778672956827,0 +8233,1475388300,0.1644654795607137,0 +8234,1475388600,0.15988059637295826,0 +8235,1475388900,0.16969869873375468,0 +8236,1475389200,0.15860253691871054,0 +8237,1475389500,0.1612548204948598,0 +8238,1475389800,0.1541107163125726,0 +8239,1475390100,0.15860253691871054,0 +8240,1475390400,0.1602311320970681,0 +8241,1475390700,0.15660169141388328,0 +8242,1475391000,0.1562976869806347,0 +8243,1475391300,0.1527613088790673,0 +8244,1475391600,0.16895419808081524,0 +8245,1475391900,0.1567164685978606,0 +8246,1475392200,0.1580193447406097,0 +8247,1475392500,0.1595796940257461,0 +8248,1475392800,0.15827681788299702,0 +8249,1475393100,0.1570887189244354,0 +8250,1475393400,0.15034168175701418,0 +8251,1475393700,0.15716006690358789,0 +8252,1475394000,0.161301351785616,0 +8253,1475394300,0.15825200119460073,0 +8254,1475394600,0.15611156181739985,0 +8255,1475394900,0.16388228738250776,0 +8256,1475395200,0.15888172466356282,0 +8257,1475395500,0.16025284669942802,0 +8258,1475395800,0.16565047643334396,0 +8259,1475396100,0.1583915950670794,0 +8260,1475396400,0.16337044318366442,0 +8261,1475396700,0.1674651967748313,0 +8262,1475397000,0.16395363536176527,0 +8263,1475397300,0.1682810454070283,0 +8264,1475397600,0.17314201425359954,0 +8265,1475397900,0.17028189091185553,0 +8266,1475398200,0.17472407814109586,0 +8267,1475398500,0.17237579899830022,0 +8268,1475398800,0.187845902149374,0 +8269,1475399100,0.18947449732770005,0 +8270,1475399400,0.18500749340997927,0 +8271,1475399700,0.19007940410822385,0 +8272,1475400000,0.1904516544346936,0 +8273,1475400300,0.19038340854151445,0 +8274,1475400600,0.1944316308419252,0 +8275,1475400900,0.19933913097928416,0 +8276,1475401200,0.2038061348969838,0 +8277,1475401500,0.20622576201906848,0 +8278,1475401800,0.21416089814505126,0 +8279,1475402100,0.2043893270751267,0 +8280,1475402400,0.2103918635894929,0 +8281,1475402700,0.2101126758446406,0 +8282,1475403000,0.2224899991999688,0 +8283,1475403300,0.21276495942078988,0 +8284,1475403600,0.21022745302872287,0 +8285,1475403900,0.2270252490108266,0 +8286,1475404200,0.2216989672561681,0 +8287,1475404500,0.2264451589187621,0 +8288,1475404800,0.21909321497088013,0 +8289,1475405100,0.2317249093825593,0 +8290,1475405400,0.2230949059805347,0 +8291,1475405700,0.22923703636728496,0 +8292,1475406000,0.22348887090936426,0 +8293,1475406300,0.23326044197929946,0 +8294,1475406600,0.2246521531795296,0 +8295,1475406900,0.2422657977939353,0 +8296,1475407200,0.2369364139532404,0 +8297,1475407500,0.2413351719777609,0 +8298,1475407800,0.2487801785072604,0 +8299,1475408100,0.2448715500793283,0 +8300,1475408400,0.2541064602619188,0 +8301,1475408700,0.2493137373085687,0 +8302,1475409000,0.2652057241629993,0 +8303,1475409300,0.2569914002920591,0 +8304,1475409600,0.2727437932741161,0 +8305,1475409900,0.2700666930095706,0 +8306,1475410200,0.2849815227569657,0 +8307,1475410500,0.283306396287852,0 +8308,1475410800,0.2834459901602256,0 +8309,1475411100,0.28746939577223996,0 +8310,1475411400,0.2713695691522146,0 +8311,1475411700,0.27406838401912004,0 +8312,1475412000,0.27099731882574485,0 +8313,1475412300,0.2791185801149626,0 +8314,1475412600,0.27325563747295945,0 +8315,1475412900,0.2724180742384026,0 +8316,1475413200,0.2730446956213283,0 +8317,1475413500,0.2845162098488785,0 +8318,1475413800,0.28428355339478245,0 +8319,1475414100,0.2833994588694694,0 +8320,1475414400,0.2872832706090052,0 +8321,1475414700,0.2887257406241278,0 +8322,1475415000,0.2860269257571173,0 
+8323,1475415300,0.2836786466143217,0 +8324,1475415600,0.2758148584674913,0 +8325,1475415900,0.2862130509203521,0 +8326,1475416200,0.2816312698186332,0 +8327,1475416500,0.2731377582029457,0 +8328,1475416800,0.2852358938134217,0 +8329,1475417100,0.2791868260081837,0 +8330,1475417400,0.27248632013162377,0 +8331,1475417700,0.2745336969272072,0 +8332,1475418000,0.2675540033057949,0 +8333,1475418300,0.2645542860916773,0 +8334,1475418600,0.2692074151725488,0 +8335,1475418900,0.2608317828268751,0 +8336,1475419200,0.2719062300395593,0 +8337,1475419500,0.25864481215881296,0 +8338,1475419800,0.26218119026038034,0 +8339,1475420100,0.26259997187760625,0 +8340,1475420400,0.2500117066706468,0 +8341,1475420700,0.24433488919198365,0 +8342,1475421000,0.2362849758819185,0 +8343,1475421300,0.23933432647293365,0 +8344,1475421600,0.23579794837147136,0 +8345,1475421900,0.22211774887349914,0 +8346,1475422200,0.22716484288330524,0 +8347,1475422500,0.21504499267046945,0 +8348,1475422800,0.20831967010549213,0 +8349,1475423100,0.19703738312724994,0 +8350,1475423400,0.2101126758446406,0 +8351,1475423700,0.1844739346086711,0 +8352,1475424000,0.19152187412331506,0 +8353,1475424300,0.18763806238375824,0 +8354,1475424600,0.16888595218759409,0 +8355,1475424900,0.16807010355539712,0 +8356,1475425200,0.15438990405742484,0 +8357,1475425500,0.1532266217871545,0 +8358,1475425800,0.14045533350310171,0 +8359,1475426100,0.13042628929056907,0 +8360,1475426400,0.1263315356994022,0 +8361,1475426700,0.11765500100651632,0 +8362,1475427000,0.11837468497095438,0 +8363,1475427300,0.10769730477324126,0 +8364,1475427600,0.1044618290189911,0 +8365,1475427900,0.1014838264072334,0 +8366,1475428200,0.09811185886654096,0 +8367,1475428500,0.08587412938358634,0 +8368,1475428800,0.08489697227686599,0 +8369,1475429100,0.08308225193506344,0 +8370,1475429400,0.07424130668088229,0 +8371,1475429700,0.07389077095708768,0 +8372,1475430000,0.07207605061528513,0 +8373,1475430300,0.06802782831571506,0 +8374,1475430600,0.06540036142775191,0 +8375,1475430900,0.0640974852851079,0 +8376,1475431200,0.06177092074414677,0 +8377,1475431500,0.061094665984428476,0 +8378,1475431800,0.06014232556557906,0 +8379,1475432100,0.056441536903557035,0 +8380,1475432400,0.0541863203420636,0 +8381,1475432700,0.05376753872504785,0 +8382,1475433000,0.05709297497540442,0 +8383,1475433300,0.055278254633601864,0 +8384,1475433600,0.05004503546035072,0 +8385,1475433900,0.048602565445017835,0 +8386,1475434200,0.051906287092699295,0 +8387,1475434500,0.050764719425419255,0 +8388,1475434800,0.051834939113231566,0 +8389,1475435100,0.04953319126119216,0 +8390,1475435400,0.046741313812669286,0 +8391,1475435700,0.0448800621803207,0 +8392,1475436000,0.0461829383229647,0 +8393,1475436300,0.0458324025991701,0 +8394,1475436600,0.04608987574187265,0 +8395,1475436900,0.04287921667633407,0 +8396,1475437200,0.04476218291030699,0 +8397,1475437500,0.04099314835506378,0 +8398,1475437800,0.04278615409419124,0 +8399,1475438100,0.039783334793511815,0 +8400,1475438400,0.03922495930380724,0 +8401,1475438700,0.038573521233010624,0 +8402,1475439000,0.042202961915565,0 +8403,1475439300,0.03927149059540405,0 +8404,1475439600,0.041111027625077526,0 +8405,1475439900,0.03766771210470722,0 +8406,1475440200,0.03982986608510861,0 +8407,1475440500,0.03999427664566834,0 +8408,1475440800,0.040155585119981525,0 +8409,1475441100,0.0397368035029658,0 +8410,1475441400,0.044250338711148425,0 +8411,1475441700,0.04252868095148864,0 +8412,1475442000,0.04085355448237496,0 +8413,1475442300,0.040760491901282926,0 
+8414,1475442600,0.042342555788253806,0 +8415,1475442900,0.042342555788253806,0 +8416,1475443200,0.041808996987470905,0 +8417,1475443500,0.04139021536940436,0 +8418,1475443800,0.042296024497707814,0 +8419,1475444100,0.041458461262625534,0 +8420,1475444400,0.04429687000274525,0 +8421,1475444700,0.049626253843334966,0 +8422,1475445000,0.04725315801182785,0 +8423,1475445300,0.04692743897590413,0 +8424,1475445600,0.05453375398066244,0 +8425,1475445900,0.05376753872504785,0 +8426,1475446200,0.05844238240859444,0 +8427,1475446500,0.05977007523910935,0 +8428,1475446800,0.061398670417677076,0 +8429,1475447100,0.0638182975397302,0 +8430,1475447400,0.06681801475416305,0 +8431,1475447700,0.06565473248420793,0 +8432,1475448000,0.07363639990063169,0 +8433,1475448300,0.07277401997767856,0 +8434,1475448600,0.07328586417683709,0 +8435,1475448900,0.07852218543528401,0 +8436,1475449200,0.08098834384893397,0 +8437,1475449500,0.0774519657474717,0 +8438,1475449800,0.08494350356741202,0 +8439,1475450100,0.08454643655307144,0 +8440,1475450400,0.0931081940614546,0 +8441,1475450700,0.09131829040794323,0 +8442,1475451000,0.09513385625478324,0 +8443,1475451300,0.0912252278268512,0 +8444,1475451600,0.0941318824593514,0 +8445,1475451900,0.09994829381049326,0 +8446,1475452200,0.10059973188181526,0 +8447,1475452500,0.10690627282954564,0 +8448,1475452800,0.10325201545796454,0 +8449,1475453100,0.11302358652789968,0 +8450,1475453400,0.11379290386934048,0 +8451,1475453700,0.11230390256335654,0 +8452,1475454000,0.11169899578289576,0 +8453,1475454300,0.11790937206286728,0 +8454,1475454600,0.11942319005724747,0 +8455,1475454900,0.11800243464448468,0 +8456,1475455200,0.12349312696001807,0 +8457,1475455500,0.12102696854710365,0 +8458,1475455800,0.12754134926042876,0 +8459,1475456100,0.13356870246330688,0 +8460,1475456400,0.13231235761141902,0 +8461,1475456700,0.1428284293342936,0 +8462,1475457000,0.1424561790078239,0 +8463,1475457300,0.1556462489093127,0 +8464,1475457600,0.15176243716977694,0 +8465,1475457900,0.15466909180227714,0 +8466,1475458200,0.1644654795607137,0 +8467,1475458500,0.1521564020986066,0 +8468,1475458800,0.15878866208194542,0 +8469,1475459100,0.15611156181739985,0 +8470,1475459400,0.15848465764869685,0 +8471,1475459700,0.16893248347835027,0 +8472,1475460000,0.16860676444274178,0 +8473,1475460300,0.1711194541464124,0 +8474,1475460600,0.16776920120818495,0 +8475,1475460900,0.16176666469370315,0 +8476,1475461200,0.16983829260612826,0 +8477,1475461500,0.17181742350859566,0 +8478,1475461800,0.17588736041136624,0 +8479,1475462100,0.16993135518774571,0 +8480,1475462400,0.17025707422345926,0 +8481,1475462700,0.17551511008489654,0 +8482,1475463000,0.1729558890903647,0 +8483,1475463300,0.17276976392712984,0 +8484,1475463600,0.16997788647860698,0 +8485,1475463900,0.16223197760189534,0 +8486,1475464200,0.1652565115044093,0 +8487,1475464500,0.1732133622328571,0 +8488,1475464800,0.17274804932476995,0 +8489,1475465100,0.167837447101301,0 +8490,1475465400,0.16488426117793956,0 +8491,1475465700,0.16586141828497508,0 +8492,1475466000,0.1767032090435632,0 +8493,1475466300,0.18579852535378008,0 +8494,1475466600,0.1806335520739602,0 +8495,1475466900,0.1916149367049325,0 +8496,1475467200,0.2024101961727013,0 +8497,1475467500,0.2003876360655247,0 +8498,1475467800,0.21788340140985354,0 +8499,1475468100,0.2363780384635358,0 +8500,1475468400,0.2384967632383768,0 +8501,1475468700,0.25157205595578325,0 +8502,1475469000,0.25024436312474296,0 +8503,1475469300,0.2617624086430493,0 +8504,1475469600,0.2573884673070302,0 
+8505,1475469900,0.26139015831657963,0 +8506,1475470200,0.2494316165785824,0 +8507,1475470500,0.2497573356142959,0 +8508,1475470800,0.2464753285691845,0 +8509,1475471100,0.2477316734210724,0 +8510,1475471400,0.23298125423444715,0 +8511,1475471700,0.22274437025642485,0 +8512,1475472000,0.2129510845840248,0 +8513,1475472300,0.20662282903397647,0 +8514,1475472600,0.20062029251956826,0 +8515,1475472900,0.18891612183798487,0 +8516,1475473200,0.1844491179202748,0 +8517,1475473500,0.1824948037063088,0 +8518,1475473800,0.1835184921041005,0 +8519,1475474100,0.1708154497131638,0 +8520,1475474400,0.16893248347835027,0 +8521,1475474700,0.16460507343308728,0 +8522,1475475000,0.16037072596944169,0 +8523,1475475300,0.16230022349501144,0 +8524,1475475600,0.15464737719991725,0 +8525,1475475900,0.16490597578040453,0 +8526,1475476200,0.15639074956225213,0 +8527,1475476500,0.1554135924552166,0 +8528,1475476800,0.16018460080620686,0 +8529,1475477100,0.15829853248546194,0 +8530,1475477400,0.15964793991886214,0 +8531,1475477700,0.16281206769395984,0 +8532,1475478000,0.1629764782548348,0 +8533,1475478300,0.15713525021519156,0 +8534,1475478600,0.16064991371429402,0 +8535,1475478900,0.15850947433709311,0 +8536,1475479200,0.16655938764715836,0 +8537,1475479500,0.16886113549919782,0 +8538,1475479800,0.16569700772410015,0 +8539,1475480100,0.1640684125457426,0 +8540,1475480400,0.16367444761691294,0 +8541,1475480700,0.1718856694017117,0 +8542,1475481000,0.16604754344820996,0 +8543,1475481300,0.17484195741121464,0 +8544,1475481600,0.1695373902589161,0 +8545,1475481900,0.16941951098890234,0 +8546,1475482200,0.18179683434412552,0 +8547,1475482500,0.1759587083906238,0 +8548,1475482800,0.17772689744135495,0 +8549,1475483100,0.18977850176099065,0 +8550,1475483400,0.1864034321342408,0 +8551,1475483700,0.1859164046237938,0 +8552,1475484000,0.19566315900534315,0 +8553,1475484300,0.20448238965674406,0 +8554,1475484600,0.20336563867732446,0 +8555,1475484900,0.20364482642217674,0 +8556,1475485200,0.2022488876978943,0 +8557,1475485500,0.203272576095707,0 +8558,1475485800,0.2183952456086969,0 +8559,1475486100,0.22351368759776047,0 +8560,1475486400,0.2229770267104159,0 +8561,1475486700,0.2339832280298789,0 +8562,1475487000,0.2281885312810969,0 +8563,1475487300,0.22439778212317868,0 +8564,1475487600,0.2375661374222025,0 +8565,1475487900,0.23768091460617985,0 +8566,1475488200,0.24021842099835186,0 +8567,1475488500,0.244614076936836,0 +8568,1475488800,0.2427311107020224,0 +8569,1475489100,0.2529679946801497,0 +8570,1475489400,0.2408698590696738,0 +8571,1475489700,0.2430785443400958,0 +8572,1475490000,0.2477316734210724,0 +8573,1475490300,0.2582942764347031,0 +8574,1475490600,0.2470585207472853,0 +8575,1475490900,0.25324718242500205,0 +8576,1475491200,0.262389030025975,0 +8577,1475491500,0.2555489302770415,0 +8578,1475491800,0.26294740551567963,0 +8579,1475492100,0.26169106066379183,0 +8580,1475492400,0.2627612803524448,0 +8581,1475492700,0.2650195989997645,0 +8582,1475493000,0.26843809783121303,0 +8583,1475493300,0.2716022256062056,0 +8584,1475493600,0.2857725547006613,0 +8585,1475493900,0.2765345424320345,0 +8586,1475494200,0.2863774614812272,0 +8587,1475494500,0.2893089328022287,0 +8588,1475494800,0.30087040752535993,0 +8589,1475495100,0.29000690216441194,0 +8590,1475495400,0.29977847323434714,0 +8591,1475495700,0.302405940121995,0 +8592,1475496000,0.3125962928094713,0 +8593,1475496300,0.300451625908029,0 +8594,1475496600,0.31501591993152445,0 +8595,1475496900,0.3256964022153791,0 +8596,1475497200,0.3201126473183333,0 
+8597,1475497500,0.32469442841973706,0 +8598,1475497800,0.3282090919185243,0 +8599,1475498100,0.3270923409391151,0 +8600,1475498400,0.3254637457615982,0 +8601,1475498700,0.3320711890559103,0 +8602,1475499000,0.3287209361176828,0 +8603,1475499300,0.3341650971420397,0 +8604,1475499600,0.32704580964856905,0 +8605,1475499900,0.3424476669065163,0 +8606,1475500200,0.343120819579988,0 +8607,1475500500,0.3388864721161323,0 +8608,1475500800,0.3437505430491603,0 +8609,1475501100,0.3553833657518644,0 +8610,1475501400,0.3595928965267984,0 +8611,1475501700,0.35884839587385897,0 +8612,1475502000,0.35598827253211496,0 +8613,1475502300,0.3504727635282904,0 +8614,1475502600,0.3672240282194276,0 +8615,1475502900,0.3714583756832833,0 +8616,1475503200,0.36385206067852505,0 +8617,1475503500,0.3524053631401066,0 +8618,1475503800,0.34889069964026864,0 +8619,1475504100,0.3505658261093824,0 +8620,1475504400,0.3418892914168117,0 +8621,1475504700,0.3457048572636517,0 +8622,1475505000,0.3403754734220112,0 +8623,1475505300,0.3398170979323066,0 +8624,1475505600,0.3360015320865174,0 +8625,1475505900,0.32495190156243964,0 +8626,1475506200,0.3362341885402983,0 +8627,1475506500,0.3115726044111541,0 +8628,1475506800,0.3145754237118336,0 +8629,1475507100,0.30903820010533384,0 +8630,1475507400,0.30396628940744663,0 +8631,1475507700,0.3023842255196351,0 +8632,1475508000,0.3015683768874381,0 +8633,1475508300,0.2820965827267203,0 +8634,1475508600,0.2733487000545769,0 +8635,1475508900,0.2742793258707512,0 +8636,1475509200,0.2616693460614319,0 +8637,1475509500,0.2604130012096492,0 +8638,1475509800,0.23049338121917284,0 +8639,1475510100,0.2343771929587086,0 +8640,1475510400,0.2283994731327281,0 +8641,1475510700,0.2214663108021772,0 +8642,1475511000,0.21551030557855647,0 +8643,1475511300,0.2069237313812097,0 +8644,1475511600,0.19531572536730124,0 +8645,1475511900,0.17591217709976253,0 +8646,1475512200,0.1748884887019708,0 +8647,1475512500,0.17030360551421542,0 +8648,1475512800,0.1574609692509051,0 +8649,1475513100,0.15429684147580744,0 +8650,1475513400,0.15888172466356282,0 +8651,1475513700,0.138615796473113,0 +8652,1475514000,0.13214794705054406,0 +8653,1475514300,0.13594179829460384,0 +8654,1475514600,0.12181800049090435,0 +8655,1475514900,0.11532843646597553,0 +8656,1475515200,0.1120929607117254,0 +8657,1475515500,0.10830221155380708,0 +8658,1475515800,0.10267192536590007,0 +8659,1475516100,0.09199144308246578,0 +8660,1475516400,0.09143306759276118,0 +8661,1475516700,0.0912717591173972,0 +8662,1475517000,0.08331490838884432,0 +8663,1475517300,0.08475737840417719,0 +8664,1475517600,0.07612427291590597,0 +8665,1475517900,0.07358986861008568,0 +8666,1475518200,0.07002867381970168,0 +8667,1475518500,0.07005349050862336,0 +8668,1475518800,0.06993561123860964,0 +8669,1475519100,0.06807435960626106,0 +8670,1475519400,0.06982083405484249,0 +8671,1475519700,0.06872579767705768,0 +8672,1475520000,0.06081547824010159,0 +8673,1475520300,0.05834931982645161,0 +8674,1475520600,0.05930476233154761,0 +8675,1475520900,0.05646635359247872,0 +8676,1475521200,0.053395288398578135,0 +8677,1475521500,0.05392884720041181,0 +8678,1475521800,0.051602282659450716,0 +8679,1475522100,0.05092912998597899,0 +8680,1475522400,0.050066750063025835,0 +8681,1475522700,0.05111525514921381,0 +8682,1475523000,0.04983409360924496,0 +8683,1475523300,0.0467630284153444,0 +8684,1475523600,0.04764712294097267,0 +8685,1475523900,0.04702050155804696,0 +8686,1475524200,0.04380984249250836,0 +8687,1475524500,0.04666996583425236,0 +8688,1475524800,0.045298843798387216,0 
+8689,1475525100,0.045553214854843226,0 +8690,1475525400,0.04478699959922865,0 +8691,1475525700,0.04318011902333607,0 +8692,1475526000,0.04504137065568466,0 +8693,1475526300,0.0454601522727004,0 +8694,1475526600,0.043902905073600375,0 +8695,1475526900,0.041808996987470905,0 +8696,1475527200,0.04257521224203466,0 +8697,1475527500,0.04108621093615582,0 +8698,1475527800,0.041551523844768366,0 +8699,1475528100,0.04197030546178409,0 +8700,1475528400,0.04608987574187265,0 +8701,1475528700,0.04257521224203466,0 +8702,1475529000,0.044228624109524085,0 +8703,1475529300,0.04273962280364522,0 +8704,1475529600,0.0454601522727004,0 +8705,1475529900,0.04399596765574322,0 +8706,1475530200,0.04373849451304064,0 +8707,1475530500,0.04653037196156353,0 +8708,1475530800,0.04727487261450297,0 +8709,1475531100,0.04969449973655612,0 +8710,1475531400,0.052346783312390135,0 +8711,1475531700,0.05490600430713215,0 +8712,1475532000,0.05937300822476872,0 +8713,1475532300,0.06000273169394104,0 +8714,1475532600,0.0673081443506465,0 +8715,1475532900,0.07182167955882914,0 +8716,1475533200,0.07900921294657168,0 +8717,1475533500,0.08422071751714774,0 +8718,1475533800,0.08508309744010087,0 +8719,1475534100,0.09389922600525527,0 +8720,1475534400,0.09038766459176896,0 +8721,1475534700,0.10285805052913492,0 +8722,1475535000,0.10360255118207436,0 +8723,1475535300,0.10913977478836394,0 +8724,1475535600,0.1112553974771685,0 +8725,1475535900,0.11593334324654135,0 +8726,1475536200,0.12603063335219006,0 +8727,1475536500,0.12761269723968632,0 +8728,1475536800,0.1336834796472842,0 +8729,1475537100,0.14617868227262615,0 +8730,1475537400,0.14199086609973674,0 +8731,1475537700,0.14903880561437013,0 +8732,1475538000,0.1516910891905194,0 +8733,1475538300,0.16262594253072496,0 +8734,1475538600,0.15690259376109547,0 +8735,1475538900,0.1707006725291865,0 +8736,1475539200,0.16974523002451086,0 +8737,1475539500,0.17491020330433069,0 +8738,1475539800,0.17400439417665778,0 +8739,1475540100,0.18689356173082933,0 +8740,1475540400,0.19103484661284686,0 +8741,1475540700,0.19103484661284686,0 +8742,1475541000,0.1991778225044771,0 +8743,1475541300,0.20071335510118568,0 +8744,1475541600,0.20792570517664155,0 +8745,1475541900,0.21101848497241865,0 +8746,1475542200,0.21416089814505126,0 +8747,1475542500,0.2257937208474401,0 +8748,1475542800,0.24287070457450105,0 +8749,1475543100,0.2380314503302897,0 +8750,1475543400,0.2489663036704952,0 +8751,1475543700,0.2555489302770415,0 +8752,1475544000,0.2535946160629704,0 +8753,1475544300,0.2590853083785038,0 +8754,1475544600,0.2673213468518039,0 +8755,1475544900,0.2681123787954995,0 +8756,1475545200,0.2762088233963209,0 +8757,1475545500,0.2662045958723947,0 +8758,1475545800,0.2700915096979669,0 +8759,1475546100,0.2864922386652045,0 +8760,1475546400,0.2939372451947039,0 +8761,1475546700,0.2877268689147324,0 +8762,1475547000,0.2870754308434104,0 +8763,1475547300,0.29945275419863354,0 +8764,1475547600,0.2889583970781189,0 +8765,1475547900,0.29738056071465385,0 +8766,1475548200,0.3018723813207918,0 +8767,1475548500,0.3191354902105622,0 +8768,1475548800,0.3033613826261452,0 +8769,1475549100,0.30647897911059185,0 +8770,1475549400,0.30010419226995555,0 +8771,1475549700,0.31322601627864355,0 +8772,1475550000,0.297172720948954,0 +8773,1475550300,0.30457119618769724,0 +8774,1475550600,0.29947446880099343,0 +8775,1475550900,0.3089668521258661,0 +8776,1475551200,0.29372940542910914,0 +8777,1475551500,0.2971479042605577,0 +8778,1475551800,0.2978706903111373,0 +8779,1475552100,0.3058492556414195,0 
+8780,1475552400,0.30889860623264503,0 +8781,1475552700,0.3077818552532359,0 +8782,1475553000,0.309059914708009,0 +8783,1475553300,0.30985094665149443,0 diff --git a/datasets/anomaly_reserve/kpi/TEST/problem_TEST/dataSplits.csv b/datasets/anomaly_reserve/kpi/TEST/problem_TEST/dataSplits.csv new file mode 100644 index 0000000..1f92bd4 --- /dev/null +++ b/datasets/anomaly_reserve/kpi/TEST/problem_TEST/dataSplits.csv @@ -0,0 +1,7028 @@ +d3mIndex,type,repeat,fold +7027,TEST,0,0 +7028,TEST,0,0 +7029,TEST,0,0 +7030,TEST,0,0 +7031,TEST,0,0 +7032,TEST,0,0 +7033,TEST,0,0 +7034,TEST,0,0 +7035,TEST,0,0 +7036,TEST,0,0 +7037,TEST,0,0 +7038,TEST,0,0 +7039,TEST,0,0 +7040,TEST,0,0 +7041,TEST,0,0 +7042,TEST,0,0 +7043,TEST,0,0 +7044,TEST,0,0 +7045,TEST,0,0 +7046,TEST,0,0 +7047,TEST,0,0 +7048,TEST,0,0 +7049,TEST,0,0 +7050,TEST,0,0 +7051,TEST,0,0 +7052,TEST,0,0 +7053,TEST,0,0 +7054,TEST,0,0 +7055,TEST,0,0 +7056,TEST,0,0 +7057,TEST,0,0 +7058,TEST,0,0 +7059,TEST,0,0 +7060,TEST,0,0 +7061,TEST,0,0 +7062,TEST,0,0 +7063,TEST,0,0 +7064,TEST,0,0 +7065,TEST,0,0 +7066,TEST,0,0 +7067,TEST,0,0 +7068,TEST,0,0 +7069,TEST,0,0 +7070,TEST,0,0 +7071,TEST,0,0 +7072,TEST,0,0 +7073,TEST,0,0 +7074,TEST,0,0 +7075,TEST,0,0 +7076,TEST,0,0 +7077,TEST,0,0 +7078,TEST,0,0 +7079,TEST,0,0 +7080,TEST,0,0 +7081,TEST,0,0 +7082,TEST,0,0 +7083,TEST,0,0 +7084,TEST,0,0 +7085,TEST,0,0 +7086,TEST,0,0 +7087,TEST,0,0 +7088,TEST,0,0 +7089,TEST,0,0 +7090,TEST,0,0 +7091,TEST,0,0 +7092,TEST,0,0 +7093,TEST,0,0 +7094,TEST,0,0 +7095,TEST,0,0 +7096,TEST,0,0 +7097,TEST,0,0 +7098,TEST,0,0 +7099,TEST,0,0 +7100,TEST,0,0 +7101,TEST,0,0 +7102,TEST,0,0 +7103,TEST,0,0 +7104,TEST,0,0 +7105,TEST,0,0 +7106,TEST,0,0 +7107,TEST,0,0 +7108,TEST,0,0 +7109,TEST,0,0 +7110,TEST,0,0 +7111,TEST,0,0 +7112,TEST,0,0 +7113,TEST,0,0 +7114,TEST,0,0 +7115,TEST,0,0 +7116,TEST,0,0 +7117,TEST,0,0 +7118,TEST,0,0 +7119,TEST,0,0 +7120,TEST,0,0 +7121,TEST,0,0 +7122,TEST,0,0 +7123,TEST,0,0 +7124,TEST,0,0 +7125,TEST,0,0 +7126,TEST,0,0 +7127,TEST,0,0 +7128,TEST,0,0 +7129,TEST,0,0 +7130,TEST,0,0 +7131,TEST,0,0 +7132,TEST,0,0 +7133,TEST,0,0 +7134,TEST,0,0 +7135,TEST,0,0 +7136,TEST,0,0 +7137,TEST,0,0 +7138,TEST,0,0 +7139,TEST,0,0 +7140,TEST,0,0 +7141,TEST,0,0 +7142,TEST,0,0 +7143,TEST,0,0 +7144,TEST,0,0 +7145,TEST,0,0 +7146,TEST,0,0 +7147,TEST,0,0 +7148,TEST,0,0 +7149,TEST,0,0 +7150,TEST,0,0 +7151,TEST,0,0 +7152,TEST,0,0 +7153,TEST,0,0 +7154,TEST,0,0 +7155,TEST,0,0 +7156,TEST,0,0 +7157,TEST,0,0 +7158,TEST,0,0 +7159,TEST,0,0 +7160,TEST,0,0 +7161,TEST,0,0 +7162,TEST,0,0 +7163,TEST,0,0 +7164,TEST,0,0 +7165,TEST,0,0 +7166,TEST,0,0 +7167,TEST,0,0 +7168,TEST,0,0 +7169,TEST,0,0 +7170,TEST,0,0 +7171,TEST,0,0 +7172,TEST,0,0 +7173,TEST,0,0 +7174,TEST,0,0 +7175,TEST,0,0 +7176,TEST,0,0 +7177,TEST,0,0 +7178,TEST,0,0 +7179,TEST,0,0 +7180,TEST,0,0 +7181,TEST,0,0 +7182,TEST,0,0 +7183,TEST,0,0 +7184,TEST,0,0 +7185,TEST,0,0 +7186,TEST,0,0 +7187,TEST,0,0 +7188,TEST,0,0 +7189,TEST,0,0 +7190,TEST,0,0 +7191,TEST,0,0 +7192,TEST,0,0 +7193,TEST,0,0 +7194,TEST,0,0 +7195,TEST,0,0 +7196,TEST,0,0 +7197,TEST,0,0 +7198,TEST,0,0 +7199,TEST,0,0 +7200,TEST,0,0 +7201,TEST,0,0 +7202,TEST,0,0 +7203,TEST,0,0 +7204,TEST,0,0 +7205,TEST,0,0 +7206,TEST,0,0 +7207,TEST,0,0 +7208,TEST,0,0 +7209,TEST,0,0 +7210,TEST,0,0 +7211,TEST,0,0 +7212,TEST,0,0 +7213,TEST,0,0 +7214,TEST,0,0 +7215,TEST,0,0 +7216,TEST,0,0 +7217,TEST,0,0 +7218,TEST,0,0 +7219,TEST,0,0 +7220,TEST,0,0 +7221,TEST,0,0 +7222,TEST,0,0 +7223,TEST,0,0 +7224,TEST,0,0 +7225,TEST,0,0 +7226,TEST,0,0 +7227,TEST,0,0 +7228,TEST,0,0 +7229,TEST,0,0 +7230,TEST,0,0 +7231,TEST,0,0 +7232,TEST,0,0 
+7233,TEST,0,0 +7234,TEST,0,0 +7235,TEST,0,0 +7236,TEST,0,0 +7237,TEST,0,0 +7238,TEST,0,0 +7239,TEST,0,0 +7240,TEST,0,0 +7241,TEST,0,0 +7242,TEST,0,0 +7243,TEST,0,0 +7244,TEST,0,0 +7245,TEST,0,0 +7246,TEST,0,0 +7247,TEST,0,0 +7248,TEST,0,0 +7249,TEST,0,0 +7250,TEST,0,0 +7251,TEST,0,0 +7252,TEST,0,0 +7253,TEST,0,0 +7254,TEST,0,0 +7255,TEST,0,0 +7256,TEST,0,0 +7257,TEST,0,0 +7258,TEST,0,0 +7259,TEST,0,0 +7260,TEST,0,0 +7261,TEST,0,0 +7262,TEST,0,0 +7263,TEST,0,0 +7264,TEST,0,0 +7265,TEST,0,0 +7266,TEST,0,0 +7267,TEST,0,0 +7268,TEST,0,0 +7269,TEST,0,0 +7270,TEST,0,0 +7271,TEST,0,0 +7272,TEST,0,0 +7273,TEST,0,0 +7274,TEST,0,0 +7275,TEST,0,0 +7276,TEST,0,0 +7277,TEST,0,0 +7278,TEST,0,0 +7279,TEST,0,0 +7280,TEST,0,0 +7281,TEST,0,0 +7282,TEST,0,0 +7283,TEST,0,0 +7284,TEST,0,0 +7285,TEST,0,0 +7286,TEST,0,0 +7287,TEST,0,0 +7288,TEST,0,0 +7289,TEST,0,0 +7290,TEST,0,0 +7291,TEST,0,0 +7292,TEST,0,0 +7293,TEST,0,0 +7294,TEST,0,0 +7295,TEST,0,0 +7296,TEST,0,0 +7297,TEST,0,0 +7298,TEST,0,0 +7299,TEST,0,0 +7300,TEST,0,0 +7301,TEST,0,0 +7302,TEST,0,0 +7303,TEST,0,0 +7304,TEST,0,0 +7305,TEST,0,0 +7306,TEST,0,0 +7307,TEST,0,0 +7308,TEST,0,0 +7309,TEST,0,0 +7310,TEST,0,0 +7311,TEST,0,0 +7312,TEST,0,0 +7313,TEST,0,0 +7314,TEST,0,0 +7315,TEST,0,0 +7316,TEST,0,0 +7317,TEST,0,0 +7318,TEST,0,0 +7319,TEST,0,0 +7320,TEST,0,0 +7321,TEST,0,0 +7322,TEST,0,0 +7323,TEST,0,0 +7324,TEST,0,0 +7325,TEST,0,0 +7326,TEST,0,0 +7327,TEST,0,0 +7328,TEST,0,0 +7329,TEST,0,0 +7330,TEST,0,0 +7331,TEST,0,0 +7332,TEST,0,0 +7333,TEST,0,0 +7334,TEST,0,0 +7335,TEST,0,0 +7336,TEST,0,0 +7337,TEST,0,0 +7338,TEST,0,0 +7339,TEST,0,0 +7340,TEST,0,0 +7341,TEST,0,0 +7342,TEST,0,0 +7343,TEST,0,0 +7344,TEST,0,0 +7345,TEST,0,0 +7346,TEST,0,0 +7347,TEST,0,0 +7348,TEST,0,0 +7349,TEST,0,0 +7350,TEST,0,0 +7351,TEST,0,0 +7352,TEST,0,0 +7353,TEST,0,0 +7354,TEST,0,0 +7355,TEST,0,0 +7356,TEST,0,0 +7357,TEST,0,0 +7358,TEST,0,0 +7359,TEST,0,0 +7360,TEST,0,0 +7361,TEST,0,0 +7362,TEST,0,0 +7363,TEST,0,0 +7364,TEST,0,0 +7365,TEST,0,0 +7366,TEST,0,0 +7367,TEST,0,0 +7368,TEST,0,0 +7369,TEST,0,0 +7370,TEST,0,0 +7371,TEST,0,0 +7372,TEST,0,0 +7373,TEST,0,0 +7374,TEST,0,0 +7375,TEST,0,0 +7376,TEST,0,0 +7377,TEST,0,0 +7378,TEST,0,0 +7379,TEST,0,0 +7380,TEST,0,0 +7381,TEST,0,0 +7382,TEST,0,0 +7383,TEST,0,0 +7384,TEST,0,0 +7385,TEST,0,0 +7386,TEST,0,0 +7387,TEST,0,0 +7388,TEST,0,0 +7389,TEST,0,0 +7390,TEST,0,0 +7391,TEST,0,0 +7392,TEST,0,0 +7393,TEST,0,0 +7394,TEST,0,0 +7395,TEST,0,0 +7396,TEST,0,0 +7397,TEST,0,0 +7398,TEST,0,0 +7399,TEST,0,0 +7400,TEST,0,0 +7401,TEST,0,0 +7402,TEST,0,0 +7403,TEST,0,0 +7404,TEST,0,0 +7405,TEST,0,0 +7406,TEST,0,0 +7407,TEST,0,0 +7408,TEST,0,0 +7409,TEST,0,0 +7410,TEST,0,0 +7411,TEST,0,0 +7412,TEST,0,0 +7413,TEST,0,0 +7414,TEST,0,0 +7415,TEST,0,0 +7416,TEST,0,0 +7417,TEST,0,0 +7418,TEST,0,0 +7419,TEST,0,0 +7420,TEST,0,0 +7421,TEST,0,0 +7422,TEST,0,0 +7423,TEST,0,0 +7424,TEST,0,0 +7425,TEST,0,0 +7426,TEST,0,0 +7427,TEST,0,0 +7428,TEST,0,0 +7429,TEST,0,0 +7430,TEST,0,0 +7431,TEST,0,0 +7432,TEST,0,0 +7433,TEST,0,0 +7434,TEST,0,0 +7435,TEST,0,0 +7436,TEST,0,0 +7437,TEST,0,0 +7438,TEST,0,0 +7439,TEST,0,0 +7440,TEST,0,0 +7441,TEST,0,0 +7442,TEST,0,0 +7443,TEST,0,0 +7444,TEST,0,0 +7445,TEST,0,0 +7446,TEST,0,0 +7447,TEST,0,0 +7448,TEST,0,0 +7449,TEST,0,0 +7450,TEST,0,0 +7451,TEST,0,0 +7452,TEST,0,0 +7453,TEST,0,0 +7454,TEST,0,0 +7455,TEST,0,0 +7456,TEST,0,0 +7457,TEST,0,0 +7458,TEST,0,0 +7459,TEST,0,0 +7460,TEST,0,0 +7461,TEST,0,0 +7462,TEST,0,0 +7463,TEST,0,0 +7464,TEST,0,0 +7465,TEST,0,0 +7466,TEST,0,0 +7467,TEST,0,0 +7468,TEST,0,0 +7469,TEST,0,0 
+7470,TEST,0,0 +7471,TEST,0,0 +7472,TEST,0,0 +7473,TEST,0,0 +7474,TEST,0,0 +7475,TEST,0,0 +7476,TEST,0,0 +7477,TEST,0,0 +7478,TEST,0,0 +7479,TEST,0,0 +7480,TEST,0,0 +7481,TEST,0,0 +7482,TEST,0,0 +7483,TEST,0,0 +7484,TEST,0,0 +7485,TEST,0,0 +7486,TEST,0,0 +7487,TEST,0,0 +7488,TEST,0,0 +7489,TEST,0,0 +7490,TEST,0,0 +7491,TEST,0,0 +7492,TEST,0,0 +7493,TEST,0,0 +7494,TEST,0,0 +7495,TEST,0,0 +7496,TEST,0,0 +7497,TEST,0,0 +7498,TEST,0,0 +7499,TEST,0,0 +7500,TEST,0,0 +7501,TEST,0,0 +7502,TEST,0,0 +7503,TEST,0,0 +7504,TEST,0,0 +7505,TEST,0,0 +7506,TEST,0,0 +7507,TEST,0,0 +7508,TEST,0,0 +7509,TEST,0,0 +7510,TEST,0,0 +7511,TEST,0,0 +7512,TEST,0,0 +7513,TEST,0,0 +7514,TEST,0,0 +7515,TEST,0,0 +7516,TEST,0,0 +7517,TEST,0,0 +7518,TEST,0,0 +7519,TEST,0,0 +7520,TEST,0,0 +7521,TEST,0,0 +7522,TEST,0,0 +7523,TEST,0,0 +7524,TEST,0,0 +7525,TEST,0,0 +7526,TEST,0,0 +7527,TEST,0,0 +7528,TEST,0,0 +7529,TEST,0,0 +7530,TEST,0,0 +7531,TEST,0,0 +7532,TEST,0,0 +7533,TEST,0,0 +7534,TEST,0,0 +7535,TEST,0,0 +7536,TEST,0,0 +7537,TEST,0,0 +7538,TEST,0,0 +7539,TEST,0,0 +7540,TEST,0,0 +7541,TEST,0,0 +7542,TEST,0,0 +7543,TEST,0,0 +7544,TEST,0,0 +7545,TEST,0,0 +7546,TEST,0,0 +7547,TEST,0,0 +7548,TEST,0,0 +7549,TEST,0,0 +7550,TEST,0,0 +7551,TEST,0,0 +7552,TEST,0,0 +7553,TEST,0,0 +7554,TEST,0,0 +7555,TEST,0,0 +7556,TEST,0,0 +7557,TEST,0,0 +7558,TEST,0,0 +7559,TEST,0,0 +7560,TEST,0,0 +7561,TEST,0,0 +7562,TEST,0,0 +7563,TEST,0,0 +7564,TEST,0,0 +7565,TEST,0,0 +7566,TEST,0,0 +7567,TEST,0,0 +7568,TEST,0,0 +7569,TEST,0,0 +7570,TEST,0,0 +7571,TEST,0,0 +7572,TEST,0,0 +7573,TEST,0,0 +7574,TEST,0,0 +7575,TEST,0,0 +7576,TEST,0,0 +7577,TEST,0,0 +7578,TEST,0,0 +7579,TEST,0,0 +7580,TEST,0,0 +7581,TEST,0,0 +7582,TEST,0,0 +7583,TEST,0,0 +7584,TEST,0,0 +7585,TEST,0,0 +7586,TEST,0,0 +7587,TEST,0,0 +7588,TEST,0,0 +7589,TEST,0,0 +7590,TEST,0,0 +7591,TEST,0,0 +7592,TEST,0,0 +7593,TEST,0,0 +7594,TEST,0,0 +7595,TEST,0,0 +7596,TEST,0,0 +7597,TEST,0,0 +7598,TEST,0,0 +7599,TEST,0,0 +7600,TEST,0,0 +7601,TEST,0,0 +7602,TEST,0,0 +7603,TEST,0,0 +7604,TEST,0,0 +7605,TEST,0,0 +7606,TEST,0,0 +7607,TEST,0,0 +7608,TEST,0,0 +7609,TEST,0,0 +7610,TEST,0,0 +7611,TEST,0,0 +7612,TEST,0,0 +7613,TEST,0,0 +7614,TEST,0,0 +7615,TEST,0,0 +7616,TEST,0,0 +7617,TEST,0,0 +7618,TEST,0,0 +7619,TEST,0,0 +7620,TEST,0,0 +7621,TEST,0,0 +7622,TEST,0,0 +7623,TEST,0,0 +7624,TEST,0,0 +7625,TEST,0,0 +7626,TEST,0,0 +7627,TEST,0,0 +7628,TEST,0,0 +7629,TEST,0,0 +7630,TEST,0,0 +7631,TEST,0,0 +7632,TEST,0,0 +7633,TEST,0,0 +7634,TEST,0,0 +7635,TEST,0,0 +7636,TEST,0,0 +7637,TEST,0,0 +7638,TEST,0,0 +7639,TEST,0,0 +7640,TEST,0,0 +7641,TEST,0,0 +7642,TEST,0,0 +7643,TEST,0,0 +7644,TEST,0,0 +7645,TEST,0,0 +7646,TEST,0,0 +7647,TEST,0,0 +7648,TEST,0,0 +7649,TEST,0,0 +7650,TEST,0,0 +7651,TEST,0,0 +7652,TEST,0,0 +7653,TEST,0,0 +7654,TEST,0,0 +7655,TEST,0,0 +7656,TEST,0,0 +7657,TEST,0,0 +7658,TEST,0,0 +7659,TEST,0,0 +7660,TEST,0,0 +7661,TEST,0,0 +7662,TEST,0,0 +7663,TEST,0,0 +7664,TEST,0,0 +7665,TEST,0,0 +7666,TEST,0,0 +7667,TEST,0,0 +7668,TEST,0,0 +7669,TEST,0,0 +7670,TEST,0,0 +7671,TEST,0,0 +7672,TEST,0,0 +7673,TEST,0,0 +7674,TEST,0,0 +7675,TEST,0,0 +7676,TEST,0,0 +7677,TEST,0,0 +7678,TEST,0,0 +7679,TEST,0,0 +7680,TEST,0,0 +7681,TEST,0,0 +7682,TEST,0,0 +7683,TEST,0,0 +7684,TEST,0,0 +7685,TEST,0,0 +7686,TEST,0,0 +7687,TEST,0,0 +7688,TEST,0,0 +7689,TEST,0,0 +7690,TEST,0,0 +7691,TEST,0,0 +7692,TEST,0,0 +7693,TEST,0,0 +7694,TEST,0,0 +7695,TEST,0,0 +7696,TEST,0,0 +7697,TEST,0,0 +7698,TEST,0,0 +7699,TEST,0,0 +7700,TEST,0,0 +7701,TEST,0,0 +7702,TEST,0,0 +7703,TEST,0,0 +7704,TEST,0,0 +7705,TEST,0,0 +7706,TEST,0,0 
+7707,TEST,0,0 +7708,TEST,0,0 +7709,TEST,0,0 +7710,TEST,0,0 +7711,TEST,0,0 +7712,TEST,0,0 +7713,TEST,0,0 +7714,TEST,0,0 +7715,TEST,0,0 +7716,TEST,0,0 +7717,TEST,0,0 +7718,TEST,0,0 +7719,TEST,0,0 +7720,TEST,0,0 +7721,TEST,0,0 +7722,TEST,0,0 +7723,TEST,0,0 +7724,TEST,0,0 +7725,TEST,0,0 +7726,TEST,0,0 +7727,TEST,0,0 +7728,TEST,0,0 +7729,TEST,0,0 +7730,TEST,0,0 +7731,TEST,0,0 +7732,TEST,0,0 +7733,TEST,0,0 +7734,TEST,0,0 +7735,TEST,0,0 +7736,TEST,0,0 +7737,TEST,0,0 +7738,TEST,0,0 +7739,TEST,0,0 +7740,TEST,0,0 +7741,TEST,0,0 +7742,TEST,0,0 +7743,TEST,0,0 +7744,TEST,0,0 +7745,TEST,0,0 +7746,TEST,0,0 +7747,TEST,0,0 +7748,TEST,0,0 +7749,TEST,0,0 +7750,TEST,0,0 +7751,TEST,0,0 +7752,TEST,0,0 +7753,TEST,0,0 +7754,TEST,0,0 +7755,TEST,0,0 +7756,TEST,0,0 +7757,TEST,0,0 +7758,TEST,0,0 +7759,TEST,0,0 +7760,TEST,0,0 +7761,TEST,0,0 +7762,TEST,0,0 +7763,TEST,0,0 +7764,TEST,0,0 +7765,TEST,0,0 +7766,TEST,0,0 +7767,TEST,0,0 +7768,TEST,0,0 +7769,TEST,0,0 +7770,TEST,0,0 +7771,TEST,0,0 +7772,TEST,0,0 +7773,TEST,0,0 +7774,TEST,0,0 +7775,TEST,0,0 +7776,TEST,0,0 +7777,TEST,0,0 +7778,TEST,0,0 +7779,TEST,0,0 +7780,TEST,0,0 +7781,TEST,0,0 +7782,TEST,0,0 +7783,TEST,0,0 +7784,TEST,0,0 +7785,TEST,0,0 +7786,TEST,0,0 +7787,TEST,0,0 +7788,TEST,0,0 +7789,TEST,0,0 +7790,TEST,0,0 +7791,TEST,0,0 +7792,TEST,0,0 +7793,TEST,0,0 +7794,TEST,0,0 +7795,TEST,0,0 +7796,TEST,0,0 +7797,TEST,0,0 +7798,TEST,0,0 +7799,TEST,0,0 +7800,TEST,0,0 +7801,TEST,0,0 +7802,TEST,0,0 +7803,TEST,0,0 +7804,TEST,0,0 +7805,TEST,0,0 +7806,TEST,0,0 +7807,TEST,0,0 +7808,TEST,0,0 +7809,TEST,0,0 +7810,TEST,0,0 +7811,TEST,0,0 +7812,TEST,0,0 +7813,TEST,0,0 +7814,TEST,0,0 +7815,TEST,0,0 +7816,TEST,0,0 +7817,TEST,0,0 +7818,TEST,0,0 +7819,TEST,0,0 +7820,TEST,0,0 +7821,TEST,0,0 +7822,TEST,0,0 +7823,TEST,0,0 +7824,TEST,0,0 +7825,TEST,0,0 +7826,TEST,0,0 +7827,TEST,0,0 +7828,TEST,0,0 +7829,TEST,0,0 +7830,TEST,0,0 +7831,TEST,0,0 +7832,TEST,0,0 +7833,TEST,0,0 +7834,TEST,0,0 +7835,TEST,0,0 +7836,TEST,0,0 +7837,TEST,0,0 +7838,TEST,0,0 +7839,TEST,0,0 +7840,TEST,0,0 +7841,TEST,0,0 +7842,TEST,0,0 +7843,TEST,0,0 +7844,TEST,0,0 +7845,TEST,0,0 +7846,TEST,0,0 +7847,TEST,0,0 +7848,TEST,0,0 +7849,TEST,0,0 +7850,TEST,0,0 +7851,TEST,0,0 +7852,TEST,0,0 +7853,TEST,0,0 +7854,TEST,0,0 +7855,TEST,0,0 +7856,TEST,0,0 +7857,TEST,0,0 +7858,TEST,0,0 +7859,TEST,0,0 +7860,TEST,0,0 +7861,TEST,0,0 +7862,TEST,0,0 +7863,TEST,0,0 +7864,TEST,0,0 +7865,TEST,0,0 +7866,TEST,0,0 +7867,TEST,0,0 +7868,TEST,0,0 +7869,TEST,0,0 +7870,TEST,0,0 +7871,TEST,0,0 +7872,TEST,0,0 +7873,TEST,0,0 +7874,TEST,0,0 +7875,TEST,0,0 +7876,TEST,0,0 +7877,TEST,0,0 +7878,TEST,0,0 +7879,TEST,0,0 +7880,TEST,0,0 +7881,TEST,0,0 +7882,TEST,0,0 +7883,TEST,0,0 +7884,TEST,0,0 +7885,TEST,0,0 +7886,TEST,0,0 +7887,TEST,0,0 +7888,TEST,0,0 +7889,TEST,0,0 +7890,TEST,0,0 +7891,TEST,0,0 +7892,TEST,0,0 +7893,TEST,0,0 +7894,TEST,0,0 +7895,TEST,0,0 +7896,TEST,0,0 +7897,TEST,0,0 +7898,TEST,0,0 +7899,TEST,0,0 +7900,TEST,0,0 +7901,TEST,0,0 +7902,TEST,0,0 +7903,TEST,0,0 +7904,TEST,0,0 +7905,TEST,0,0 +7906,TEST,0,0 +7907,TEST,0,0 +7908,TEST,0,0 +7909,TEST,0,0 +7910,TEST,0,0 +7911,TEST,0,0 +7912,TEST,0,0 +7913,TEST,0,0 +7914,TEST,0,0 +7915,TEST,0,0 +7916,TEST,0,0 +7917,TEST,0,0 +7918,TEST,0,0 +7919,TEST,0,0 +7920,TEST,0,0 +7921,TEST,0,0 +7922,TEST,0,0 +7923,TEST,0,0 +7924,TEST,0,0 +7925,TEST,0,0 +7926,TEST,0,0 +7927,TEST,0,0 +7928,TEST,0,0 +7929,TEST,0,0 +7930,TEST,0,0 +7931,TEST,0,0 +7932,TEST,0,0 +7933,TEST,0,0 +7934,TEST,0,0 +7935,TEST,0,0 +7936,TEST,0,0 +7937,TEST,0,0 +7938,TEST,0,0 +7939,TEST,0,0 +7940,TEST,0,0 +7941,TEST,0,0 +7942,TEST,0,0 +7943,TEST,0,0 
+7944,TEST,0,0 +7945,TEST,0,0 +7946,TEST,0,0 +7947,TEST,0,0 +7948,TEST,0,0 +7949,TEST,0,0 +7950,TEST,0,0 +7951,TEST,0,0 +7952,TEST,0,0 +7953,TEST,0,0 +7954,TEST,0,0 +7955,TEST,0,0 +7956,TEST,0,0 +7957,TEST,0,0 +7958,TEST,0,0 +7959,TEST,0,0 +7960,TEST,0,0 +7961,TEST,0,0 +7962,TEST,0,0 +7963,TEST,0,0 +7964,TEST,0,0 +7965,TEST,0,0 +7966,TEST,0,0 +7967,TEST,0,0 +7968,TEST,0,0 +7969,TEST,0,0 +7970,TEST,0,0 +7971,TEST,0,0 +7972,TEST,0,0 +7973,TEST,0,0 +7974,TEST,0,0 +7975,TEST,0,0 +7976,TEST,0,0 +7977,TEST,0,0 +7978,TEST,0,0 +7979,TEST,0,0 +7980,TEST,0,0 +7981,TEST,0,0 +7982,TEST,0,0 +7983,TEST,0,0 +7984,TEST,0,0 +7985,TEST,0,0 +7986,TEST,0,0 +7987,TEST,0,0 +7988,TEST,0,0 +7989,TEST,0,0 +7990,TEST,0,0 +7991,TEST,0,0 +7992,TEST,0,0 +7993,TEST,0,0 +7994,TEST,0,0 +7995,TEST,0,0 +7996,TEST,0,0 +7997,TEST,0,0 +7998,TEST,0,0 +7999,TEST,0,0 +8000,TEST,0,0 +8001,TEST,0,0 +8002,TEST,0,0 +8003,TEST,0,0 +8004,TEST,0,0 +8005,TEST,0,0 +8006,TEST,0,0 +8007,TEST,0,0 +8008,TEST,0,0 +8009,TEST,0,0 +8010,TEST,0,0 +8011,TEST,0,0 +8012,TEST,0,0 +8013,TEST,0,0 +8014,TEST,0,0 +8015,TEST,0,0 +8016,TEST,0,0 +8017,TEST,0,0 +8018,TEST,0,0 +8019,TEST,0,0 +8020,TEST,0,0 +8021,TEST,0,0 +8022,TEST,0,0 +8023,TEST,0,0 +8024,TEST,0,0 +8025,TEST,0,0 +8026,TEST,0,0 +8027,TEST,0,0 +8028,TEST,0,0 +8029,TEST,0,0 +8030,TEST,0,0 +8031,TEST,0,0 +8032,TEST,0,0 +8033,TEST,0,0 +8034,TEST,0,0 +8035,TEST,0,0 +8036,TEST,0,0 +8037,TEST,0,0 +8038,TEST,0,0 +8039,TEST,0,0 +8040,TEST,0,0 +8041,TEST,0,0 +8042,TEST,0,0 +8043,TEST,0,0 +8044,TEST,0,0 +8045,TEST,0,0 +8046,TEST,0,0 +8047,TEST,0,0 +8048,TEST,0,0 +8049,TEST,0,0 +8050,TEST,0,0 +8051,TEST,0,0 +8052,TEST,0,0 +8053,TEST,0,0 +8054,TEST,0,0 +8055,TEST,0,0 +8056,TEST,0,0 +8057,TEST,0,0 +8058,TEST,0,0 +8059,TEST,0,0 +8060,TEST,0,0 +8061,TEST,0,0 +8062,TEST,0,0 +8063,TEST,0,0 +8064,TEST,0,0 +8065,TEST,0,0 +8066,TEST,0,0 +8067,TEST,0,0 +8068,TEST,0,0 +8069,TEST,0,0 +8070,TEST,0,0 +8071,TEST,0,0 +8072,TEST,0,0 +8073,TEST,0,0 +8074,TEST,0,0 +8075,TEST,0,0 +8076,TEST,0,0 +8077,TEST,0,0 +8078,TEST,0,0 +8079,TEST,0,0 +8080,TEST,0,0 +8081,TEST,0,0 +8082,TEST,0,0 +8083,TEST,0,0 +8084,TEST,0,0 +8085,TEST,0,0 +8086,TEST,0,0 +8087,TEST,0,0 +8088,TEST,0,0 +8089,TEST,0,0 +8090,TEST,0,0 +8091,TEST,0,0 +8092,TEST,0,0 +8093,TEST,0,0 +8094,TEST,0,0 +8095,TEST,0,0 +8096,TEST,0,0 +8097,TEST,0,0 +8098,TEST,0,0 +8099,TEST,0,0 +8100,TEST,0,0 +8101,TEST,0,0 +8102,TEST,0,0 +8103,TEST,0,0 +8104,TEST,0,0 +8105,TEST,0,0 +8106,TEST,0,0 +8107,TEST,0,0 +8108,TEST,0,0 +8109,TEST,0,0 +8110,TEST,0,0 +8111,TEST,0,0 +8112,TEST,0,0 +8113,TEST,0,0 +8114,TEST,0,0 +8115,TEST,0,0 +8116,TEST,0,0 +8117,TEST,0,0 +8118,TEST,0,0 +8119,TEST,0,0 +8120,TEST,0,0 +8121,TEST,0,0 +8122,TEST,0,0 +8123,TEST,0,0 +8124,TEST,0,0 +8125,TEST,0,0 +8126,TEST,0,0 +8127,TEST,0,0 +8128,TEST,0,0 +8129,TEST,0,0 +8130,TEST,0,0 +8131,TEST,0,0 +8132,TEST,0,0 +8133,TEST,0,0 +8134,TEST,0,0 +8135,TEST,0,0 +8136,TEST,0,0 +8137,TEST,0,0 +8138,TEST,0,0 +8139,TEST,0,0 +8140,TEST,0,0 +8141,TEST,0,0 +8142,TEST,0,0 +8143,TEST,0,0 +8144,TEST,0,0 +8145,TEST,0,0 +8146,TEST,0,0 +8147,TEST,0,0 +8148,TEST,0,0 +8149,TEST,0,0 +8150,TEST,0,0 +8151,TEST,0,0 +8152,TEST,0,0 +8153,TEST,0,0 +8154,TEST,0,0 +8155,TEST,0,0 +8156,TEST,0,0 +8157,TEST,0,0 +8158,TEST,0,0 +8159,TEST,0,0 +8160,TEST,0,0 +8161,TEST,0,0 +8162,TEST,0,0 +8163,TEST,0,0 +8164,TEST,0,0 +8165,TEST,0,0 +8166,TEST,0,0 +8167,TEST,0,0 +8168,TEST,0,0 +8169,TEST,0,0 +8170,TEST,0,0 +8171,TEST,0,0 +8172,TEST,0,0 +8173,TEST,0,0 +8174,TEST,0,0 +8175,TEST,0,0 +8176,TEST,0,0 +8177,TEST,0,0 +8178,TEST,0,0 +8179,TEST,0,0 +8180,TEST,0,0 
+8181,TEST,0,0 +8182,TEST,0,0 +8183,TEST,0,0 +8184,TEST,0,0 +8185,TEST,0,0 +8186,TEST,0,0 +8187,TEST,0,0 +8188,TEST,0,0 +8189,TEST,0,0 +8190,TEST,0,0 +8191,TEST,0,0 +8192,TEST,0,0 +8193,TEST,0,0 +8194,TEST,0,0 +8195,TEST,0,0 +8196,TEST,0,0 +8197,TEST,0,0 +8198,TEST,0,0 +8199,TEST,0,0 +8200,TEST,0,0 +8201,TEST,0,0 +8202,TEST,0,0 +8203,TEST,0,0 +8204,TEST,0,0 +8205,TEST,0,0 +8206,TEST,0,0 +8207,TEST,0,0 +8208,TEST,0,0 +8209,TEST,0,0 +8210,TEST,0,0 +8211,TEST,0,0 +8212,TEST,0,0 +8213,TEST,0,0 +8214,TEST,0,0 +8215,TEST,0,0 +8216,TEST,0,0 +8217,TEST,0,0 +8218,TEST,0,0 +8219,TEST,0,0 +8220,TEST,0,0 +8221,TEST,0,0 +8222,TEST,0,0 +8223,TEST,0,0 +8224,TEST,0,0 +8225,TEST,0,0 +8226,TEST,0,0 +8227,TEST,0,0 +8228,TEST,0,0 +8229,TEST,0,0 +8230,TEST,0,0 +8231,TEST,0,0 +8232,TEST,0,0 +8233,TEST,0,0 +8234,TEST,0,0 +8235,TEST,0,0 +8236,TEST,0,0 +8237,TEST,0,0 +8238,TEST,0,0 +8239,TEST,0,0 +8240,TEST,0,0 +8241,TEST,0,0 +8242,TEST,0,0 +8243,TEST,0,0 +8244,TEST,0,0 +8245,TEST,0,0 +8246,TEST,0,0 +8247,TEST,0,0 +8248,TEST,0,0 +8249,TEST,0,0 +8250,TEST,0,0 +8251,TEST,0,0 +8252,TEST,0,0 +8253,TEST,0,0 +8254,TEST,0,0 +8255,TEST,0,0 +8256,TEST,0,0 +8257,TEST,0,0 +8258,TEST,0,0 +8259,TEST,0,0 +8260,TEST,0,0 +8261,TEST,0,0 +8262,TEST,0,0 +8263,TEST,0,0 +8264,TEST,0,0 +8265,TEST,0,0 +8266,TEST,0,0 +8267,TEST,0,0 +8268,TEST,0,0 +8269,TEST,0,0 +8270,TEST,0,0 +8271,TEST,0,0 +8272,TEST,0,0 +8273,TEST,0,0 +8274,TEST,0,0 +8275,TEST,0,0 +8276,TEST,0,0 +8277,TEST,0,0 +8278,TEST,0,0 +8279,TEST,0,0 +8280,TEST,0,0 +8281,TEST,0,0 +8282,TEST,0,0 +8283,TEST,0,0 +8284,TEST,0,0 +8285,TEST,0,0 +8286,TEST,0,0 +8287,TEST,0,0 +8288,TEST,0,0 +8289,TEST,0,0 +8290,TEST,0,0 +8291,TEST,0,0 +8292,TEST,0,0 +8293,TEST,0,0 +8294,TEST,0,0 +8295,TEST,0,0 +8296,TEST,0,0 +8297,TEST,0,0 +8298,TEST,0,0 +8299,TEST,0,0 +8300,TEST,0,0 +8301,TEST,0,0 +8302,TEST,0,0 +8303,TEST,0,0 +8304,TEST,0,0 +8305,TEST,0,0 +8306,TEST,0,0 +8307,TEST,0,0 +8308,TEST,0,0 +8309,TEST,0,0 +8310,TEST,0,0 +8311,TEST,0,0 +8312,TEST,0,0 +8313,TEST,0,0 +8314,TEST,0,0 +8315,TEST,0,0 +8316,TEST,0,0 +8317,TEST,0,0 +8318,TEST,0,0 +8319,TEST,0,0 +8320,TEST,0,0 +8321,TEST,0,0 +8322,TEST,0,0 +8323,TEST,0,0 +8324,TEST,0,0 +8325,TEST,0,0 +8326,TEST,0,0 +8327,TEST,0,0 +8328,TEST,0,0 +8329,TEST,0,0 +8330,TEST,0,0 +8331,TEST,0,0 +8332,TEST,0,0 +8333,TEST,0,0 +8334,TEST,0,0 +8335,TEST,0,0 +8336,TEST,0,0 +8337,TEST,0,0 +8338,TEST,0,0 +8339,TEST,0,0 +8340,TEST,0,0 +8341,TEST,0,0 +8342,TEST,0,0 +8343,TEST,0,0 +8344,TEST,0,0 +8345,TEST,0,0 +8346,TEST,0,0 +8347,TEST,0,0 +8348,TEST,0,0 +8349,TEST,0,0 +8350,TEST,0,0 +8351,TEST,0,0 +8352,TEST,0,0 +8353,TEST,0,0 +8354,TEST,0,0 +8355,TEST,0,0 +8356,TEST,0,0 +8357,TEST,0,0 +8358,TEST,0,0 +8359,TEST,0,0 +8360,TEST,0,0 +8361,TEST,0,0 +8362,TEST,0,0 +8363,TEST,0,0 +8364,TEST,0,0 +8365,TEST,0,0 +8366,TEST,0,0 +8367,TEST,0,0 +8368,TEST,0,0 +8369,TEST,0,0 +8370,TEST,0,0 +8371,TEST,0,0 +8372,TEST,0,0 +8373,TEST,0,0 +8374,TEST,0,0 +8375,TEST,0,0 +8376,TEST,0,0 +8377,TEST,0,0 +8378,TEST,0,0 +8379,TEST,0,0 +8380,TEST,0,0 +8381,TEST,0,0 +8382,TEST,0,0 +8383,TEST,0,0 +8384,TEST,0,0 +8385,TEST,0,0 +8386,TEST,0,0 +8387,TEST,0,0 +8388,TEST,0,0 +8389,TEST,0,0 +8390,TEST,0,0 +8391,TEST,0,0 +8392,TEST,0,0 +8393,TEST,0,0 +8394,TEST,0,0 +8395,TEST,0,0 +8396,TEST,0,0 +8397,TEST,0,0 +8398,TEST,0,0 +8399,TEST,0,0 +8400,TEST,0,0 +8401,TEST,0,0 +8402,TEST,0,0 +8403,TEST,0,0 +8404,TEST,0,0 +8405,TEST,0,0 +8406,TEST,0,0 +8407,TEST,0,0 +8408,TEST,0,0 +8409,TEST,0,0 +8410,TEST,0,0 +8411,TEST,0,0 +8412,TEST,0,0 +8413,TEST,0,0 +8414,TEST,0,0 +8415,TEST,0,0 +8416,TEST,0,0 +8417,TEST,0,0 
+8418,TEST,0,0 +8419,TEST,0,0 +8420,TEST,0,0 +8421,TEST,0,0 +8422,TEST,0,0 +8423,TEST,0,0 +8424,TEST,0,0 +8425,TEST,0,0 +8426,TEST,0,0 +8427,TEST,0,0 +8428,TEST,0,0 +8429,TEST,0,0 +8430,TEST,0,0 +8431,TEST,0,0 +8432,TEST,0,0 +8433,TEST,0,0 +8434,TEST,0,0 +8435,TEST,0,0 +8436,TEST,0,0 +8437,TEST,0,0 +8438,TEST,0,0 +8439,TEST,0,0 +8440,TEST,0,0 +8441,TEST,0,0 +8442,TEST,0,0 +8443,TEST,0,0 +8444,TEST,0,0 +8445,TEST,0,0 +8446,TEST,0,0 +8447,TEST,0,0 +8448,TEST,0,0 +8449,TEST,0,0 +8450,TEST,0,0 +8451,TEST,0,0 +8452,TEST,0,0 +8453,TEST,0,0 +8454,TEST,0,0 +8455,TEST,0,0 +8456,TEST,0,0 +8457,TEST,0,0 +8458,TEST,0,0 +8459,TEST,0,0 +8460,TEST,0,0 +8461,TEST,0,0 +8462,TEST,0,0 +8463,TEST,0,0 +8464,TEST,0,0 +8465,TEST,0,0 +8466,TEST,0,0 +8467,TEST,0,0 +8468,TEST,0,0 +8469,TEST,0,0 +8470,TEST,0,0 +8471,TEST,0,0 +8472,TEST,0,0 +8473,TEST,0,0 +8474,TEST,0,0 +8475,TEST,0,0 +8476,TEST,0,0 +8477,TEST,0,0 +8478,TEST,0,0 +8479,TEST,0,0 +8480,TEST,0,0 +8481,TEST,0,0 +8482,TEST,0,0 +8483,TEST,0,0 +8484,TEST,0,0 +8485,TEST,0,0 +8486,TEST,0,0 +8487,TEST,0,0 +8488,TEST,0,0 +8489,TEST,0,0 +8490,TEST,0,0 +8491,TEST,0,0 +8492,TEST,0,0 +8493,TEST,0,0 +8494,TEST,0,0 +8495,TEST,0,0 +8496,TEST,0,0 +8497,TEST,0,0 +8498,TEST,0,0 +8499,TEST,0,0 +8500,TEST,0,0 +8501,TEST,0,0 +8502,TEST,0,0 +8503,TEST,0,0 +8504,TEST,0,0 +8505,TEST,0,0 +8506,TEST,0,0 +8507,TEST,0,0 +8508,TEST,0,0 +8509,TEST,0,0 +8510,TEST,0,0 +8511,TEST,0,0 +8512,TEST,0,0 +8513,TEST,0,0 +8514,TEST,0,0 +8515,TEST,0,0 +8516,TEST,0,0 +8517,TEST,0,0 +8518,TEST,0,0 +8519,TEST,0,0 +8520,TEST,0,0 +8521,TEST,0,0 +8522,TEST,0,0 +8523,TEST,0,0 +8524,TEST,0,0 +8525,TEST,0,0 +8526,TEST,0,0 +8527,TEST,0,0 +8528,TEST,0,0 +8529,TEST,0,0 +8530,TEST,0,0 +8531,TEST,0,0 +8532,TEST,0,0 +8533,TEST,0,0 +8534,TEST,0,0 +8535,TEST,0,0 +8536,TEST,0,0 +8537,TEST,0,0 +8538,TEST,0,0 +8539,TEST,0,0 +8540,TEST,0,0 +8541,TEST,0,0 +8542,TEST,0,0 +8543,TEST,0,0 +8544,TEST,0,0 +8545,TEST,0,0 +8546,TEST,0,0 +8547,TEST,0,0 +8548,TEST,0,0 +8549,TEST,0,0 +8550,TEST,0,0 +8551,TEST,0,0 +8552,TEST,0,0 +8553,TEST,0,0 +8554,TEST,0,0 +8555,TEST,0,0 +8556,TEST,0,0 +8557,TEST,0,0 +8558,TEST,0,0 +8559,TEST,0,0 +8560,TEST,0,0 +8561,TEST,0,0 +8562,TEST,0,0 +8563,TEST,0,0 +8564,TEST,0,0 +8565,TEST,0,0 +8566,TEST,0,0 +8567,TEST,0,0 +8568,TEST,0,0 +8569,TEST,0,0 +8570,TEST,0,0 +8571,TEST,0,0 +8572,TEST,0,0 +8573,TEST,0,0 +8574,TEST,0,0 +8575,TEST,0,0 +8576,TEST,0,0 +8577,TEST,0,0 +8578,TEST,0,0 +8579,TEST,0,0 +8580,TEST,0,0 +8581,TEST,0,0 +8582,TEST,0,0 +8583,TEST,0,0 +8584,TEST,0,0 +8585,TEST,0,0 +8586,TEST,0,0 +8587,TEST,0,0 +8588,TEST,0,0 +8589,TEST,0,0 +8590,TEST,0,0 +8591,TEST,0,0 +8592,TEST,0,0 +8593,TEST,0,0 +8594,TEST,0,0 +8595,TEST,0,0 +8596,TEST,0,0 +8597,TEST,0,0 +8598,TEST,0,0 +8599,TEST,0,0 +8600,TEST,0,0 +8601,TEST,0,0 +8602,TEST,0,0 +8603,TEST,0,0 +8604,TEST,0,0 +8605,TEST,0,0 +8606,TEST,0,0 +8607,TEST,0,0 +8608,TEST,0,0 +8609,TEST,0,0 +8610,TEST,0,0 +8611,TEST,0,0 +8612,TEST,0,0 +8613,TEST,0,0 +8614,TEST,0,0 +8615,TEST,0,0 +8616,TEST,0,0 +8617,TEST,0,0 +8618,TEST,0,0 +8619,TEST,0,0 +8620,TEST,0,0 +8621,TEST,0,0 +8622,TEST,0,0 +8623,TEST,0,0 +8624,TEST,0,0 +8625,TEST,0,0 +8626,TEST,0,0 +8627,TEST,0,0 +8628,TEST,0,0 +8629,TEST,0,0 +8630,TEST,0,0 +8631,TEST,0,0 +8632,TEST,0,0 +8633,TEST,0,0 +8634,TEST,0,0 +8635,TEST,0,0 +8636,TEST,0,0 +8637,TEST,0,0 +8638,TEST,0,0 +8639,TEST,0,0 +8640,TEST,0,0 +8641,TEST,0,0 +8642,TEST,0,0 +8643,TEST,0,0 +8644,TEST,0,0 +8645,TEST,0,0 +8646,TEST,0,0 +8647,TEST,0,0 +8648,TEST,0,0 +8649,TEST,0,0 +8650,TEST,0,0 +8651,TEST,0,0 +8652,TEST,0,0 +8653,TEST,0,0 +8654,TEST,0,0 
+8655,TEST,0,0 +8656,TEST,0,0 +8657,TEST,0,0 +8658,TEST,0,0 +8659,TEST,0,0 +8660,TEST,0,0 +8661,TEST,0,0 +8662,TEST,0,0 +8663,TEST,0,0 +8664,TEST,0,0 +8665,TEST,0,0 +8666,TEST,0,0 +8667,TEST,0,0 +8668,TEST,0,0 +8669,TEST,0,0 +8670,TEST,0,0 +8671,TEST,0,0 +8672,TEST,0,0 +8673,TEST,0,0 +8674,TEST,0,0 +8675,TEST,0,0 +8676,TEST,0,0 +8677,TEST,0,0 +8678,TEST,0,0 +8679,TEST,0,0 +8680,TEST,0,0 +8681,TEST,0,0 +8682,TEST,0,0 +8683,TEST,0,0 +8684,TEST,0,0 +8685,TEST,0,0 +8686,TEST,0,0 +8687,TEST,0,0 +8688,TEST,0,0 +8689,TEST,0,0 +8690,TEST,0,0 +8691,TEST,0,0 +8692,TEST,0,0 +8693,TEST,0,0 +8694,TEST,0,0 +8695,TEST,0,0 +8696,TEST,0,0 +8697,TEST,0,0 +8698,TEST,0,0 +8699,TEST,0,0 +8700,TEST,0,0 +8701,TEST,0,0 +8702,TEST,0,0 +8703,TEST,0,0 +8704,TEST,0,0 +8705,TEST,0,0 +8706,TEST,0,0 +8707,TEST,0,0 +8708,TEST,0,0 +8709,TEST,0,0 +8710,TEST,0,0 +8711,TEST,0,0 +8712,TEST,0,0 +8713,TEST,0,0 +8714,TEST,0,0 +8715,TEST,0,0 +8716,TEST,0,0 +8717,TEST,0,0 +8718,TEST,0,0 +8719,TEST,0,0 +8720,TEST,0,0 +8721,TEST,0,0 +8722,TEST,0,0 +8723,TEST,0,0 +8724,TEST,0,0 +8725,TEST,0,0 +8726,TEST,0,0 +8727,TEST,0,0 +8728,TEST,0,0 +8729,TEST,0,0 +8730,TEST,0,0 +8731,TEST,0,0 +8732,TEST,0,0 +8733,TEST,0,0 +8734,TEST,0,0 +8735,TEST,0,0 +8736,TEST,0,0 +8737,TEST,0,0 +8738,TEST,0,0 +8739,TEST,0,0 +8740,TEST,0,0 +8741,TEST,0,0 +8742,TEST,0,0 +8743,TEST,0,0 +8744,TEST,0,0 +8745,TEST,0,0 +8746,TEST,0,0 +8747,TEST,0,0 +8748,TEST,0,0 +8749,TEST,0,0 +8750,TEST,0,0 +8751,TEST,0,0 +8752,TEST,0,0 +8753,TEST,0,0 +8754,TEST,0,0 +8755,TEST,0,0 +8756,TEST,0,0 +8757,TEST,0,0 +8758,TEST,0,0 +8759,TEST,0,0 +8760,TEST,0,0 +8761,TEST,0,0 +8762,TEST,0,0 +8763,TEST,0,0 +8764,TEST,0,0 +8765,TEST,0,0 +8766,TEST,0,0 +8767,TEST,0,0 +8768,TEST,0,0 +8769,TEST,0,0 +8770,TEST,0,0 +8771,TEST,0,0 +8772,TEST,0,0 +8773,TEST,0,0 +8774,TEST,0,0 +8775,TEST,0,0 +8776,TEST,0,0 +8777,TEST,0,0 +8778,TEST,0,0 +8779,TEST,0,0 +8780,TEST,0,0 +8781,TEST,0,0 +8782,TEST,0,0 +8783,TEST,0,0 +8784,TEST,0,0 +8785,TEST,0,0 +8786,TEST,0,0 +8787,TEST,0,0 +8788,TEST,0,0 +8789,TEST,0,0 +8790,TEST,0,0 +8791,TEST,0,0 +8792,TEST,0,0 +8793,TEST,0,0 +8794,TEST,0,0 +8795,TEST,0,0 +8796,TEST,0,0 +8797,TEST,0,0 +8798,TEST,0,0 +8799,TEST,0,0 +8800,TEST,0,0 +8801,TEST,0,0 +8802,TEST,0,0 +8803,TEST,0,0 +8804,TEST,0,0 +8805,TEST,0,0 +8806,TEST,0,0 +8807,TEST,0,0 +8808,TEST,0,0 +8809,TEST,0,0 +8810,TEST,0,0 +8811,TEST,0,0 +8812,TEST,0,0 +8813,TEST,0,0 +8814,TEST,0,0 +8815,TEST,0,0 +8816,TEST,0,0 +8817,TEST,0,0 +8818,TEST,0,0 +8819,TEST,0,0 +8820,TEST,0,0 +8821,TEST,0,0 +8822,TEST,0,0 +8823,TEST,0,0 +8824,TEST,0,0 +8825,TEST,0,0 +8826,TEST,0,0 +8827,TEST,0,0 +8828,TEST,0,0 +8829,TEST,0,0 +8830,TEST,0,0 +8831,TEST,0,0 +8832,TEST,0,0 +8833,TEST,0,0 +8834,TEST,0,0 +8835,TEST,0,0 +8836,TEST,0,0 +8837,TEST,0,0 +8838,TEST,0,0 +8839,TEST,0,0 +8840,TEST,0,0 +8841,TEST,0,0 +8842,TEST,0,0 +8843,TEST,0,0 +8844,TEST,0,0 +8845,TEST,0,0 +8846,TEST,0,0 +8847,TEST,0,0 +8848,TEST,0,0 +8849,TEST,0,0 +8850,TEST,0,0 +8851,TEST,0,0 +8852,TEST,0,0 +8853,TEST,0,0 +8854,TEST,0,0 +8855,TEST,0,0 +8856,TEST,0,0 +8857,TEST,0,0 +8858,TEST,0,0 +8859,TEST,0,0 +8860,TEST,0,0 +8861,TEST,0,0 +8862,TEST,0,0 +8863,TEST,0,0 +8864,TEST,0,0 +8865,TEST,0,0 +8866,TEST,0,0 +8867,TEST,0,0 +8868,TEST,0,0 +8869,TEST,0,0 +8870,TEST,0,0 +8871,TEST,0,0 +8872,TEST,0,0 +8873,TEST,0,0 +8874,TEST,0,0 +8875,TEST,0,0 +8876,TEST,0,0 +8877,TEST,0,0 +8878,TEST,0,0 +8879,TEST,0,0 +8880,TEST,0,0 +8881,TEST,0,0 +8882,TEST,0,0 +8883,TEST,0,0 +8884,TEST,0,0 +8885,TEST,0,0 +8886,TEST,0,0 +8887,TEST,0,0 +8888,TEST,0,0 +8889,TEST,0,0 +8890,TEST,0,0 +8891,TEST,0,0 
+8892,TEST,0,0 +8893,TEST,0,0 +8894,TEST,0,0 +8895,TEST,0,0 +8896,TEST,0,0 +8897,TEST,0,0 +8898,TEST,0,0 +8899,TEST,0,0 +8900,TEST,0,0 +8901,TEST,0,0 +8902,TEST,0,0 +8903,TEST,0,0 +8904,TEST,0,0 +8905,TEST,0,0 +8906,TEST,0,0 +8907,TEST,0,0 +8908,TEST,0,0 +8909,TEST,0,0 +8910,TEST,0,0 +8911,TEST,0,0 +8912,TEST,0,0 +8913,TEST,0,0 +8914,TEST,0,0 +8915,TEST,0,0 +8916,TEST,0,0 +8917,TEST,0,0 +8918,TEST,0,0 +8919,TEST,0,0 +8920,TEST,0,0 +8921,TEST,0,0 +8922,TEST,0,0 +8923,TEST,0,0 +8924,TEST,0,0 +8925,TEST,0,0 +8926,TEST,0,0 +8927,TEST,0,0 +8928,TEST,0,0 +8929,TEST,0,0 +8930,TEST,0,0 +8931,TEST,0,0 +8932,TEST,0,0 +8933,TEST,0,0 +8934,TEST,0,0 +8935,TEST,0,0 +8936,TEST,0,0 +8937,TEST,0,0 +8938,TEST,0,0 +8939,TEST,0,0 +8940,TEST,0,0 +8941,TEST,0,0 +8942,TEST,0,0 +8943,TEST,0,0 +8944,TEST,0,0 +8945,TEST,0,0 +8946,TEST,0,0 +8947,TEST,0,0 +8948,TEST,0,0 +8949,TEST,0,0 +8950,TEST,0,0 +8951,TEST,0,0 +8952,TEST,0,0 +8953,TEST,0,0 +8954,TEST,0,0 +8955,TEST,0,0 +8956,TEST,0,0 +8957,TEST,0,0 +8958,TEST,0,0 +8959,TEST,0,0 +8960,TEST,0,0 +8961,TEST,0,0 +8962,TEST,0,0 +8963,TEST,0,0 +8964,TEST,0,0 +8965,TEST,0,0 +8966,TEST,0,0 +8967,TEST,0,0 +8968,TEST,0,0 +8969,TEST,0,0 +8970,TEST,0,0 +8971,TEST,0,0 +8972,TEST,0,0 +8973,TEST,0,0 +8974,TEST,0,0 +8975,TEST,0,0 +8976,TEST,0,0 +8977,TEST,0,0 +8978,TEST,0,0 +8979,TEST,0,0 +8980,TEST,0,0 +8981,TEST,0,0 +8982,TEST,0,0 +8983,TEST,0,0 +8984,TEST,0,0 +8985,TEST,0,0 +8986,TEST,0,0 +8987,TEST,0,0 +8988,TEST,0,0 +8989,TEST,0,0 +8990,TEST,0,0 +8991,TEST,0,0 +8992,TEST,0,0 +8993,TEST,0,0 +8994,TEST,0,0 +8995,TEST,0,0 +8996,TEST,0,0 +8997,TEST,0,0 +8998,TEST,0,0 +8999,TEST,0,0 +9000,TEST,0,0 +9001,TEST,0,0 +9002,TEST,0,0 +9003,TEST,0,0 +9004,TEST,0,0 +9005,TEST,0,0 +9006,TEST,0,0 +9007,TEST,0,0 +9008,TEST,0,0 +9009,TEST,0,0 +9010,TEST,0,0 +9011,TEST,0,0 +9012,TEST,0,0 +9013,TEST,0,0 +9014,TEST,0,0 +9015,TEST,0,0 +9016,TEST,0,0 +9017,TEST,0,0 +9018,TEST,0,0 +9019,TEST,0,0 +9020,TEST,0,0 +9021,TEST,0,0 +9022,TEST,0,0 +9023,TEST,0,0 +9024,TEST,0,0 +9025,TEST,0,0 +9026,TEST,0,0 +9027,TEST,0,0 +9028,TEST,0,0 +9029,TEST,0,0 +9030,TEST,0,0 +9031,TEST,0,0 +9032,TEST,0,0 +9033,TEST,0,0 +9034,TEST,0,0 +9035,TEST,0,0 +9036,TEST,0,0 +9037,TEST,0,0 +9038,TEST,0,0 +9039,TEST,0,0 +9040,TEST,0,0 +9041,TEST,0,0 +9042,TEST,0,0 +9043,TEST,0,0 +9044,TEST,0,0 +9045,TEST,0,0 +9046,TEST,0,0 +9047,TEST,0,0 +9048,TEST,0,0 +9049,TEST,0,0 +9050,TEST,0,0 +9051,TEST,0,0 +9052,TEST,0,0 +9053,TEST,0,0 +9054,TEST,0,0 +9055,TEST,0,0 +9056,TEST,0,0 +9057,TEST,0,0 +9058,TEST,0,0 +9059,TEST,0,0 +9060,TEST,0,0 +9061,TEST,0,0 +9062,TEST,0,0 +9063,TEST,0,0 +9064,TEST,0,0 +9065,TEST,0,0 +9066,TEST,0,0 +9067,TEST,0,0 +9068,TEST,0,0 +9069,TEST,0,0 +9070,TEST,0,0 +9071,TEST,0,0 +9072,TEST,0,0 +9073,TEST,0,0 +9074,TEST,0,0 +9075,TEST,0,0 +9076,TEST,0,0 +9077,TEST,0,0 +9078,TEST,0,0 +9079,TEST,0,0 +9080,TEST,0,0 +9081,TEST,0,0 +9082,TEST,0,0 +9083,TEST,0,0 +9084,TEST,0,0 +9085,TEST,0,0 +9086,TEST,0,0 +9087,TEST,0,0 +9088,TEST,0,0 +9089,TEST,0,0 +9090,TEST,0,0 +9091,TEST,0,0 +9092,TEST,0,0 +9093,TEST,0,0 +9094,TEST,0,0 +9095,TEST,0,0 +9096,TEST,0,0 +9097,TEST,0,0 +9098,TEST,0,0 +9099,TEST,0,0 +9100,TEST,0,0 +9101,TEST,0,0 +9102,TEST,0,0 +9103,TEST,0,0 +9104,TEST,0,0 +9105,TEST,0,0 +9106,TEST,0,0 +9107,TEST,0,0 +9108,TEST,0,0 +9109,TEST,0,0 +9110,TEST,0,0 +9111,TEST,0,0 +9112,TEST,0,0 +9113,TEST,0,0 +9114,TEST,0,0 +9115,TEST,0,0 +9116,TEST,0,0 +9117,TEST,0,0 +9118,TEST,0,0 +9119,TEST,0,0 +9120,TEST,0,0 +9121,TEST,0,0 +9122,TEST,0,0 +9123,TEST,0,0 +9124,TEST,0,0 +9125,TEST,0,0 +9126,TEST,0,0 +9127,TEST,0,0 +9128,TEST,0,0 
+9129,TEST,0,0 +9130,TEST,0,0 +9131,TEST,0,0 +9132,TEST,0,0 +9133,TEST,0,0 +9134,TEST,0,0 +9135,TEST,0,0 +9136,TEST,0,0 +9137,TEST,0,0 +9138,TEST,0,0 +9139,TEST,0,0 +9140,TEST,0,0 +9141,TEST,0,0 +9142,TEST,0,0 +9143,TEST,0,0 +9144,TEST,0,0 +9145,TEST,0,0 +9146,TEST,0,0 +9147,TEST,0,0 +9148,TEST,0,0 +9149,TEST,0,0 +9150,TEST,0,0 +9151,TEST,0,0 +9152,TEST,0,0 +9153,TEST,0,0 +9154,TEST,0,0 +9155,TEST,0,0 +9156,TEST,0,0 +9157,TEST,0,0 +9158,TEST,0,0 +9159,TEST,0,0 +9160,TEST,0,0 +9161,TEST,0,0 +9162,TEST,0,0 +9163,TEST,0,0 +9164,TEST,0,0 +9165,TEST,0,0 +9166,TEST,0,0 +9167,TEST,0,0 +9168,TEST,0,0 +9169,TEST,0,0 +9170,TEST,0,0 +9171,TEST,0,0 +9172,TEST,0,0 +9173,TEST,0,0 +9174,TEST,0,0 +9175,TEST,0,0 +9176,TEST,0,0 +9177,TEST,0,0 +9178,TEST,0,0 +9179,TEST,0,0 +9180,TEST,0,0 +9181,TEST,0,0 +9182,TEST,0,0 +9183,TEST,0,0 +9184,TEST,0,0 +9185,TEST,0,0 +9186,TEST,0,0 +9187,TEST,0,0 +9188,TEST,0,0 +9189,TEST,0,0 +9190,TEST,0,0 +9191,TEST,0,0 +9192,TEST,0,0 +9193,TEST,0,0 +9194,TEST,0,0 +9195,TEST,0,0 +9196,TEST,0,0 +9197,TEST,0,0 +9198,TEST,0,0 +9199,TEST,0,0 +9200,TEST,0,0 +9201,TEST,0,0 +9202,TEST,0,0 +9203,TEST,0,0 +9204,TEST,0,0 +9205,TEST,0,0 +9206,TEST,0,0 +9207,TEST,0,0 +9208,TEST,0,0 +9209,TEST,0,0 +9210,TEST,0,0 +9211,TEST,0,0 +9212,TEST,0,0 +9213,TEST,0,0 +9214,TEST,0,0 +9215,TEST,0,0 +9216,TEST,0,0 +9217,TEST,0,0 +9218,TEST,0,0 +9219,TEST,0,0 +9220,TEST,0,0 +9221,TEST,0,0 +9222,TEST,0,0 +9223,TEST,0,0 +9224,TEST,0,0 +9225,TEST,0,0 +9226,TEST,0,0 +9227,TEST,0,0 +9228,TEST,0,0 +9229,TEST,0,0 +9230,TEST,0,0 +9231,TEST,0,0 +9232,TEST,0,0 +9233,TEST,0,0 +9234,TEST,0,0 +9235,TEST,0,0 +9236,TEST,0,0 +9237,TEST,0,0 +9238,TEST,0,0 +9239,TEST,0,0 +9240,TEST,0,0 +9241,TEST,0,0 +9242,TEST,0,0 +9243,TEST,0,0 +9244,TEST,0,0 +9245,TEST,0,0 +9246,TEST,0,0 +9247,TEST,0,0 +9248,TEST,0,0 +9249,TEST,0,0 +9250,TEST,0,0 +9251,TEST,0,0 +9252,TEST,0,0 +9253,TEST,0,0 +9254,TEST,0,0 +9255,TEST,0,0 +9256,TEST,0,0 +9257,TEST,0,0 +9258,TEST,0,0 +9259,TEST,0,0 +9260,TEST,0,0 +9261,TEST,0,0 +9262,TEST,0,0 +9263,TEST,0,0 +9264,TEST,0,0 +9265,TEST,0,0 +9266,TEST,0,0 +9267,TEST,0,0 +9268,TEST,0,0 +9269,TEST,0,0 +9270,TEST,0,0 +9271,TEST,0,0 +9272,TEST,0,0 +9273,TEST,0,0 +9274,TEST,0,0 +9275,TEST,0,0 +9276,TEST,0,0 +9277,TEST,0,0 +9278,TEST,0,0 +9279,TEST,0,0 +9280,TEST,0,0 +9281,TEST,0,0 +9282,TEST,0,0 +9283,TEST,0,0 +9284,TEST,0,0 +9285,TEST,0,0 +9286,TEST,0,0 +9287,TEST,0,0 +9288,TEST,0,0 +9289,TEST,0,0 +9290,TEST,0,0 +9291,TEST,0,0 +9292,TEST,0,0 +9293,TEST,0,0 +9294,TEST,0,0 +9295,TEST,0,0 +9296,TEST,0,0 +9297,TEST,0,0 +9298,TEST,0,0 +9299,TEST,0,0 +9300,TEST,0,0 +9301,TEST,0,0 +9302,TEST,0,0 +9303,TEST,0,0 +9304,TEST,0,0 +9305,TEST,0,0 +9306,TEST,0,0 +9307,TEST,0,0 +9308,TEST,0,0 +9309,TEST,0,0 +9310,TEST,0,0 +9311,TEST,0,0 +9312,TEST,0,0 +9313,TEST,0,0 +9314,TEST,0,0 +9315,TEST,0,0 +9316,TEST,0,0 +9317,TEST,0,0 +9318,TEST,0,0 +9319,TEST,0,0 +9320,TEST,0,0 +9321,TEST,0,0 +9322,TEST,0,0 +9323,TEST,0,0 +9324,TEST,0,0 +9325,TEST,0,0 +9326,TEST,0,0 +9327,TEST,0,0 +9328,TEST,0,0 +9329,TEST,0,0 +9330,TEST,0,0 +9331,TEST,0,0 +9332,TEST,0,0 +9333,TEST,0,0 +9334,TEST,0,0 +9335,TEST,0,0 +9336,TEST,0,0 +9337,TEST,0,0 +9338,TEST,0,0 +9339,TEST,0,0 +9340,TEST,0,0 +9341,TEST,0,0 +9342,TEST,0,0 +9343,TEST,0,0 +9344,TEST,0,0 +9345,TEST,0,0 +9346,TEST,0,0 +9347,TEST,0,0 +9348,TEST,0,0 +9349,TEST,0,0 +9350,TEST,0,0 +9351,TEST,0,0 +9352,TEST,0,0 +9353,TEST,0,0 +9354,TEST,0,0 +9355,TEST,0,0 +9356,TEST,0,0 +9357,TEST,0,0 +9358,TEST,0,0 +9359,TEST,0,0 +9360,TEST,0,0 +9361,TEST,0,0 +9362,TEST,0,0 +9363,TEST,0,0 +9364,TEST,0,0 +9365,TEST,0,0 
+9366,TEST,0,0 +9367,TEST,0,0 +9368,TEST,0,0 +9369,TEST,0,0 +9370,TEST,0,0 +9371,TEST,0,0 +9372,TEST,0,0 +9373,TEST,0,0 +9374,TEST,0,0 +9375,TEST,0,0 +9376,TEST,0,0 +9377,TEST,0,0 +9378,TEST,0,0 +9379,TEST,0,0 +9380,TEST,0,0 +9381,TEST,0,0 +9382,TEST,0,0 +9383,TEST,0,0 +9384,TEST,0,0 +9385,TEST,0,0 +9386,TEST,0,0 +9387,TEST,0,0 +9388,TEST,0,0 +9389,TEST,0,0 +9390,TEST,0,0 +9391,TEST,0,0 +9392,TEST,0,0 +9393,TEST,0,0 +9394,TEST,0,0 +9395,TEST,0,0 +9396,TEST,0,0 +9397,TEST,0,0 +9398,TEST,0,0 +9399,TEST,0,0 +9400,TEST,0,0 +9401,TEST,0,0 +9402,TEST,0,0 +9403,TEST,0,0 +9404,TEST,0,0 +9405,TEST,0,0 +9406,TEST,0,0 +9407,TEST,0,0 +9408,TEST,0,0 +9409,TEST,0,0 +9410,TEST,0,0 +9411,TEST,0,0 +9412,TEST,0,0 +9413,TEST,0,0 +9414,TEST,0,0 +9415,TEST,0,0 +9416,TEST,0,0 +9417,TEST,0,0 +9418,TEST,0,0 +9419,TEST,0,0 +9420,TEST,0,0 +9421,TEST,0,0 +9422,TEST,0,0 +9423,TEST,0,0 +9424,TEST,0,0 +9425,TEST,0,0 +9426,TEST,0,0 +9427,TEST,0,0 +9428,TEST,0,0 +9429,TEST,0,0 +9430,TEST,0,0 +9431,TEST,0,0 +9432,TEST,0,0 +9433,TEST,0,0 +9434,TEST,0,0 +9435,TEST,0,0 +9436,TEST,0,0 +9437,TEST,0,0 +9438,TEST,0,0 +9439,TEST,0,0 +9440,TEST,0,0 +9441,TEST,0,0 +9442,TEST,0,0 +9443,TEST,0,0 +9444,TEST,0,0 +9445,TEST,0,0 +9446,TEST,0,0 +9447,TEST,0,0 +9448,TEST,0,0 +9449,TEST,0,0 +9450,TEST,0,0 +9451,TEST,0,0 +9452,TEST,0,0 +9453,TEST,0,0 +9454,TEST,0,0 +9455,TEST,0,0 +9456,TEST,0,0 +9457,TEST,0,0 +9458,TEST,0,0 +9459,TEST,0,0 +9460,TEST,0,0 +9461,TEST,0,0 +9462,TEST,0,0 +9463,TEST,0,0 +9464,TEST,0,0 +9465,TEST,0,0 +9466,TEST,0,0 +9467,TEST,0,0 +9468,TEST,0,0 +9469,TEST,0,0 +9470,TEST,0,0 +9471,TEST,0,0 +9472,TEST,0,0 +9473,TEST,0,0 +9474,TEST,0,0 +9475,TEST,0,0 +9476,TEST,0,0 +9477,TEST,0,0 +9478,TEST,0,0 +9479,TEST,0,0 +9480,TEST,0,0 +9481,TEST,0,0 +9482,TEST,0,0 +9483,TEST,0,0 +9484,TEST,0,0 +9485,TEST,0,0 +9486,TEST,0,0 +9487,TEST,0,0 +9488,TEST,0,0 +9489,TEST,0,0 +9490,TEST,0,0 +9491,TEST,0,0 +9492,TEST,0,0 +9493,TEST,0,0 +9494,TEST,0,0 +9495,TEST,0,0 +9496,TEST,0,0 +9497,TEST,0,0 +9498,TEST,0,0 +9499,TEST,0,0 +9500,TEST,0,0 +9501,TEST,0,0 +9502,TEST,0,0 +9503,TEST,0,0 +9504,TEST,0,0 +9505,TEST,0,0 +9506,TEST,0,0 +9507,TEST,0,0 +9508,TEST,0,0 +9509,TEST,0,0 +9510,TEST,0,0 +9511,TEST,0,0 +9512,TEST,0,0 +9513,TEST,0,0 +9514,TEST,0,0 +9515,TEST,0,0 +9516,TEST,0,0 +9517,TEST,0,0 +9518,TEST,0,0 +9519,TEST,0,0 +9520,TEST,0,0 +9521,TEST,0,0 +9522,TEST,0,0 +9523,TEST,0,0 +9524,TEST,0,0 +9525,TEST,0,0 +9526,TEST,0,0 +9527,TEST,0,0 +9528,TEST,0,0 +9529,TEST,0,0 +9530,TEST,0,0 +9531,TEST,0,0 +9532,TEST,0,0 +9533,TEST,0,0 +9534,TEST,0,0 +9535,TEST,0,0 +9536,TEST,0,0 +9537,TEST,0,0 +9538,TEST,0,0 +9539,TEST,0,0 +9540,TEST,0,0 +9541,TEST,0,0 +9542,TEST,0,0 +9543,TEST,0,0 +9544,TEST,0,0 +9545,TEST,0,0 +9546,TEST,0,0 +9547,TEST,0,0 +9548,TEST,0,0 +9549,TEST,0,0 +9550,TEST,0,0 +9551,TEST,0,0 +9552,TEST,0,0 +9553,TEST,0,0 +9554,TEST,0,0 +9555,TEST,0,0 +9556,TEST,0,0 +9557,TEST,0,0 +9558,TEST,0,0 +9559,TEST,0,0 +9560,TEST,0,0 +9561,TEST,0,0 +9562,TEST,0,0 +9563,TEST,0,0 +9564,TEST,0,0 +9565,TEST,0,0 +9566,TEST,0,0 +9567,TEST,0,0 +9568,TEST,0,0 +9569,TEST,0,0 +9570,TEST,0,0 +9571,TEST,0,0 +9572,TEST,0,0 +9573,TEST,0,0 +9574,TEST,0,0 +9575,TEST,0,0 +9576,TEST,0,0 +9577,TEST,0,0 +9578,TEST,0,0 +9579,TEST,0,0 +9580,TEST,0,0 +9581,TEST,0,0 +9582,TEST,0,0 +9583,TEST,0,0 +9584,TEST,0,0 +9585,TEST,0,0 +9586,TEST,0,0 +9587,TEST,0,0 +9588,TEST,0,0 +9589,TEST,0,0 +9590,TEST,0,0 +9591,TEST,0,0 +9592,TEST,0,0 +9593,TEST,0,0 +9594,TEST,0,0 +9595,TEST,0,0 +9596,TEST,0,0 +9597,TEST,0,0 +9598,TEST,0,0 +9599,TEST,0,0 +9600,TEST,0,0 +9601,TEST,0,0 +9602,TEST,0,0 
+9603,TEST,0,0 +9604,TEST,0,0 +9605,TEST,0,0 +9606,TEST,0,0 +9607,TEST,0,0 +9608,TEST,0,0 +9609,TEST,0,0 +9610,TEST,0,0 +9611,TEST,0,0 +9612,TEST,0,0 +9613,TEST,0,0 +9614,TEST,0,0 +9615,TEST,0,0 +9616,TEST,0,0 +9617,TEST,0,0 +9618,TEST,0,0 +9619,TEST,0,0 +9620,TEST,0,0 +9621,TEST,0,0 +9622,TEST,0,0 +9623,TEST,0,0 +9624,TEST,0,0 +9625,TEST,0,0 +9626,TEST,0,0 +9627,TEST,0,0 +9628,TEST,0,0 +9629,TEST,0,0 +9630,TEST,0,0 +9631,TEST,0,0 +9632,TEST,0,0 +9633,TEST,0,0 +9634,TEST,0,0 +9635,TEST,0,0 +9636,TEST,0,0 +9637,TEST,0,0 +9638,TEST,0,0 +9639,TEST,0,0 +9640,TEST,0,0 +9641,TEST,0,0 +9642,TEST,0,0 +9643,TEST,0,0 +9644,TEST,0,0 +9645,TEST,0,0 +9646,TEST,0,0 +9647,TEST,0,0 +9648,TEST,0,0 +9649,TEST,0,0 +9650,TEST,0,0 +9651,TEST,0,0 +9652,TEST,0,0 +9653,TEST,0,0 +9654,TEST,0,0 +9655,TEST,0,0 +9656,TEST,0,0 +9657,TEST,0,0 +9658,TEST,0,0 +9659,TEST,0,0 +9660,TEST,0,0 +9661,TEST,0,0 +9662,TEST,0,0 +9663,TEST,0,0 +9664,TEST,0,0 +9665,TEST,0,0 +9666,TEST,0,0 +9667,TEST,0,0 +9668,TEST,0,0 +9669,TEST,0,0 +9670,TEST,0,0 +9671,TEST,0,0 +9672,TEST,0,0 +9673,TEST,0,0 +9674,TEST,0,0 +9675,TEST,0,0 +9676,TEST,0,0 +9677,TEST,0,0 +9678,TEST,0,0 +9679,TEST,0,0 +9680,TEST,0,0 +9681,TEST,0,0 +9682,TEST,0,0 +9683,TEST,0,0 +9684,TEST,0,0 +9685,TEST,0,0 +9686,TEST,0,0 +9687,TEST,0,0 +9688,TEST,0,0 +9689,TEST,0,0 +9690,TEST,0,0 +9691,TEST,0,0 +9692,TEST,0,0 +9693,TEST,0,0 +9694,TEST,0,0 +9695,TEST,0,0 +9696,TEST,0,0 +9697,TEST,0,0 +9698,TEST,0,0 +9699,TEST,0,0 +9700,TEST,0,0 +9701,TEST,0,0 +9702,TEST,0,0 +9703,TEST,0,0 +9704,TEST,0,0 +9705,TEST,0,0 +9706,TEST,0,0 +9707,TEST,0,0 +9708,TEST,0,0 +9709,TEST,0,0 +9710,TEST,0,0 +9711,TEST,0,0 +9712,TEST,0,0 +9713,TEST,0,0 +9714,TEST,0,0 +9715,TEST,0,0 +9716,TEST,0,0 +9717,TEST,0,0 +9718,TEST,0,0 +9719,TEST,0,0 +9720,TEST,0,0 +9721,TEST,0,0 +9722,TEST,0,0 +9723,TEST,0,0 +9724,TEST,0,0 +9725,TEST,0,0 +9726,TEST,0,0 +9727,TEST,0,0 +9728,TEST,0,0 +9729,TEST,0,0 +9730,TEST,0,0 +9731,TEST,0,0 +9732,TEST,0,0 +9733,TEST,0,0 +9734,TEST,0,0 +9735,TEST,0,0 +9736,TEST,0,0 +9737,TEST,0,0 +9738,TEST,0,0 +9739,TEST,0,0 +9740,TEST,0,0 +9741,TEST,0,0 +9742,TEST,0,0 +9743,TEST,0,0 +9744,TEST,0,0 +9745,TEST,0,0 +9746,TEST,0,0 +9747,TEST,0,0 +9748,TEST,0,0 +9749,TEST,0,0 +9750,TEST,0,0 +9751,TEST,0,0 +9752,TEST,0,0 +9753,TEST,0,0 +9754,TEST,0,0 +9755,TEST,0,0 +9756,TEST,0,0 +9757,TEST,0,0 +9758,TEST,0,0 +9759,TEST,0,0 +9760,TEST,0,0 +9761,TEST,0,0 +9762,TEST,0,0 +9763,TEST,0,0 +9764,TEST,0,0 +9765,TEST,0,0 +9766,TEST,0,0 +9767,TEST,0,0 +9768,TEST,0,0 +9769,TEST,0,0 +9770,TEST,0,0 +9771,TEST,0,0 +9772,TEST,0,0 +9773,TEST,0,0 +9774,TEST,0,0 +9775,TEST,0,0 +9776,TEST,0,0 +9777,TEST,0,0 +9778,TEST,0,0 +9779,TEST,0,0 +9780,TEST,0,0 +9781,TEST,0,0 +9782,TEST,0,0 +9783,TEST,0,0 +9784,TEST,0,0 +9785,TEST,0,0 +9786,TEST,0,0 +9787,TEST,0,0 +9788,TEST,0,0 +9789,TEST,0,0 +9790,TEST,0,0 +9791,TEST,0,0 +9792,TEST,0,0 +9793,TEST,0,0 +9794,TEST,0,0 +9795,TEST,0,0 +9796,TEST,0,0 +9797,TEST,0,0 +9798,TEST,0,0 +9799,TEST,0,0 +9800,TEST,0,0 +9801,TEST,0,0 +9802,TEST,0,0 +9803,TEST,0,0 +9804,TEST,0,0 +9805,TEST,0,0 +9806,TEST,0,0 +9807,TEST,0,0 +9808,TEST,0,0 +9809,TEST,0,0 +9810,TEST,0,0 +9811,TEST,0,0 +9812,TEST,0,0 +9813,TEST,0,0 +9814,TEST,0,0 +9815,TEST,0,0 +9816,TEST,0,0 +9817,TEST,0,0 +9818,TEST,0,0 +9819,TEST,0,0 +9820,TEST,0,0 +9821,TEST,0,0 +9822,TEST,0,0 +9823,TEST,0,0 +9824,TEST,0,0 +9825,TEST,0,0 +9826,TEST,0,0 +9827,TEST,0,0 +9828,TEST,0,0 +9829,TEST,0,0 +9830,TEST,0,0 +9831,TEST,0,0 +9832,TEST,0,0 +9833,TEST,0,0 +9834,TEST,0,0 +9835,TEST,0,0 +9836,TEST,0,0 +9837,TEST,0,0 +9838,TEST,0,0 +9839,TEST,0,0 
+9840,TEST,0,0 +9841,TEST,0,0 +9842,TEST,0,0 +9843,TEST,0,0 +9844,TEST,0,0 +9845,TEST,0,0 +9846,TEST,0,0 +9847,TEST,0,0 +9848,TEST,0,0 +9849,TEST,0,0 +9850,TEST,0,0 +9851,TEST,0,0 +9852,TEST,0,0 +9853,TEST,0,0 +9854,TEST,0,0 +9855,TEST,0,0 +9856,TEST,0,0 +9857,TEST,0,0 +9858,TEST,0,0 +9859,TEST,0,0 +9860,TEST,0,0 +9861,TEST,0,0 +9862,TEST,0,0 +9863,TEST,0,0 +9864,TEST,0,0 +9865,TEST,0,0 +9866,TEST,0,0 +9867,TEST,0,0 +9868,TEST,0,0 +9869,TEST,0,0 +9870,TEST,0,0 +9871,TEST,0,0 +9872,TEST,0,0 +9873,TEST,0,0 +9874,TEST,0,0 +9875,TEST,0,0 +9876,TEST,0,0 +9877,TEST,0,0 +9878,TEST,0,0 +9879,TEST,0,0 +9880,TEST,0,0 +9881,TEST,0,0 +9882,TEST,0,0 +9883,TEST,0,0 +9884,TEST,0,0 +9885,TEST,0,0 +9886,TEST,0,0 +9887,TEST,0,0 +9888,TEST,0,0 +9889,TEST,0,0 +9890,TEST,0,0 +9891,TEST,0,0 +9892,TEST,0,0 +9893,TEST,0,0 +9894,TEST,0,0 +9895,TEST,0,0 +9896,TEST,0,0 +9897,TEST,0,0 +9898,TEST,0,0 +9899,TEST,0,0 +9900,TEST,0,0 +9901,TEST,0,0 +9902,TEST,0,0 +9903,TEST,0,0 +9904,TEST,0,0 +9905,TEST,0,0 +9906,TEST,0,0 +9907,TEST,0,0 +9908,TEST,0,0 +9909,TEST,0,0 +9910,TEST,0,0 +9911,TEST,0,0 +9912,TEST,0,0 +9913,TEST,0,0 +9914,TEST,0,0 +9915,TEST,0,0 +9916,TEST,0,0 +9917,TEST,0,0 +9918,TEST,0,0 +9919,TEST,0,0 +9920,TEST,0,0 +9921,TEST,0,0 +9922,TEST,0,0 +9923,TEST,0,0 +9924,TEST,0,0 +9925,TEST,0,0 +9926,TEST,0,0 +9927,TEST,0,0 +9928,TEST,0,0 +9929,TEST,0,0 +9930,TEST,0,0 +9931,TEST,0,0 +9932,TEST,0,0 +9933,TEST,0,0 +9934,TEST,0,0 +9935,TEST,0,0 +9936,TEST,0,0 +9937,TEST,0,0 +9938,TEST,0,0 +9939,TEST,0,0 +9940,TEST,0,0 +9941,TEST,0,0 +9942,TEST,0,0 +9943,TEST,0,0 +9944,TEST,0,0 +9945,TEST,0,0 +9946,TEST,0,0 +9947,TEST,0,0 +9948,TEST,0,0 +9949,TEST,0,0 +9950,TEST,0,0 +9951,TEST,0,0 +9952,TEST,0,0 +9953,TEST,0,0 +9954,TEST,0,0 +9955,TEST,0,0 +9956,TEST,0,0 +9957,TEST,0,0 +9958,TEST,0,0 +9959,TEST,0,0 +9960,TEST,0,0 +9961,TEST,0,0 +9962,TEST,0,0 +9963,TEST,0,0 +9964,TEST,0,0 +9965,TEST,0,0 +9966,TEST,0,0 +9967,TEST,0,0 +9968,TEST,0,0 +9969,TEST,0,0 +9970,TEST,0,0 +9971,TEST,0,0 +9972,TEST,0,0 +9973,TEST,0,0 +9974,TEST,0,0 +9975,TEST,0,0 +9976,TEST,0,0 +9977,TEST,0,0 +9978,TEST,0,0 +9979,TEST,0,0 +9980,TEST,0,0 +9981,TEST,0,0 +9982,TEST,0,0 +9983,TEST,0,0 +9984,TEST,0,0 +9985,TEST,0,0 +9986,TEST,0,0 +9987,TEST,0,0 +9988,TEST,0,0 +9989,TEST,0,0 +9990,TEST,0,0 +9991,TEST,0,0 +9992,TEST,0,0 +9993,TEST,0,0 +9994,TEST,0,0 +9995,TEST,0,0 +9996,TEST,0,0 +9997,TEST,0,0 +9998,TEST,0,0 +9999,TEST,0,0 +10000,TEST,0,0 +10001,TEST,0,0 +10002,TEST,0,0 +10003,TEST,0,0 +10004,TEST,0,0 +10005,TEST,0,0 +10006,TEST,0,0 +10007,TEST,0,0 +10008,TEST,0,0 +10009,TEST,0,0 +10010,TEST,0,0 +10011,TEST,0,0 +10012,TEST,0,0 +10013,TEST,0,0 +10014,TEST,0,0 +10015,TEST,0,0 +10016,TEST,0,0 +10017,TEST,0,0 +10018,TEST,0,0 +10019,TEST,0,0 +10020,TEST,0,0 +10021,TEST,0,0 +10022,TEST,0,0 +10023,TEST,0,0 +10024,TEST,0,0 +10025,TEST,0,0 +10026,TEST,0,0 +10027,TEST,0,0 +10028,TEST,0,0 +10029,TEST,0,0 +10030,TEST,0,0 +10031,TEST,0,0 +10032,TEST,0,0 +10033,TEST,0,0 +10034,TEST,0,0 +10035,TEST,0,0 +10036,TEST,0,0 +10037,TEST,0,0 +10038,TEST,0,0 +10039,TEST,0,0 +10040,TEST,0,0 +10041,TEST,0,0 +10042,TEST,0,0 +10043,TEST,0,0 +10044,TEST,0,0 +10045,TEST,0,0 +10046,TEST,0,0 +10047,TEST,0,0 +10048,TEST,0,0 +10049,TEST,0,0 +10050,TEST,0,0 +10051,TEST,0,0 +10052,TEST,0,0 +10053,TEST,0,0 +10054,TEST,0,0 +10055,TEST,0,0 +10056,TEST,0,0 +10057,TEST,0,0 +10058,TEST,0,0 +10059,TEST,0,0 +10060,TEST,0,0 +10061,TEST,0,0 +10062,TEST,0,0 +10063,TEST,0,0 +10064,TEST,0,0 +10065,TEST,0,0 +10066,TEST,0,0 +10067,TEST,0,0 +10068,TEST,0,0 +10069,TEST,0,0 +10070,TEST,0,0 +10071,TEST,0,0 
+10072,TEST,0,0 +10073,TEST,0,0 +10074,TEST,0,0 +10075,TEST,0,0 +10076,TEST,0,0 +10077,TEST,0,0 +10078,TEST,0,0 +10079,TEST,0,0 +10080,TEST,0,0 +10081,TEST,0,0 +10082,TEST,0,0 +10083,TEST,0,0 +10084,TEST,0,0 +10085,TEST,0,0 +10086,TEST,0,0 +10087,TEST,0,0 +10088,TEST,0,0 +10089,TEST,0,0 +10090,TEST,0,0 +10091,TEST,0,0 +10092,TEST,0,0 +10093,TEST,0,0 +10094,TEST,0,0 +10095,TEST,0,0 +10096,TEST,0,0 +10097,TEST,0,0 +10098,TEST,0,0 +10099,TEST,0,0 +10100,TEST,0,0 +10101,TEST,0,0 +10102,TEST,0,0 +10103,TEST,0,0 +10104,TEST,0,0 +10105,TEST,0,0 +10106,TEST,0,0 +10107,TEST,0,0 +10108,TEST,0,0 +10109,TEST,0,0 +10110,TEST,0,0 +10111,TEST,0,0 +10112,TEST,0,0 +10113,TEST,0,0 +10114,TEST,0,0 +10115,TEST,0,0 +10116,TEST,0,0 +10117,TEST,0,0 +10118,TEST,0,0 +10119,TEST,0,0 +10120,TEST,0,0 +10121,TEST,0,0 +10122,TEST,0,0 +10123,TEST,0,0 +10124,TEST,0,0 +10125,TEST,0,0 +10126,TEST,0,0 +10127,TEST,0,0 +10128,TEST,0,0 +10129,TEST,0,0 +10130,TEST,0,0 +10131,TEST,0,0 +10132,TEST,0,0 +10133,TEST,0,0 +10134,TEST,0,0 +10135,TEST,0,0 +10136,TEST,0,0 +10137,TEST,0,0 +10138,TEST,0,0 +10139,TEST,0,0 +10140,TEST,0,0 +10141,TEST,0,0 +10142,TEST,0,0 +10143,TEST,0,0 +10144,TEST,0,0 +10145,TEST,0,0 +10146,TEST,0,0 +10147,TEST,0,0 +10148,TEST,0,0 +10149,TEST,0,0 +10150,TEST,0,0 +10151,TEST,0,0 +10152,TEST,0,0 +10153,TEST,0,0 +10154,TEST,0,0 +10155,TEST,0,0 +10156,TEST,0,0 +10157,TEST,0,0 +10158,TEST,0,0 +10159,TEST,0,0 +10160,TEST,0,0 +10161,TEST,0,0 +10162,TEST,0,0 +10163,TEST,0,0 +10164,TEST,0,0 +10165,TEST,0,0 +10166,TEST,0,0 +10167,TEST,0,0 +10168,TEST,0,0 +10169,TEST,0,0 +10170,TEST,0,0 +10171,TEST,0,0 +10172,TEST,0,0 +10173,TEST,0,0 +10174,TEST,0,0 +10175,TEST,0,0 +10176,TEST,0,0 +10177,TEST,0,0 +10178,TEST,0,0 +10179,TEST,0,0 +10180,TEST,0,0 +10181,TEST,0,0 +10182,TEST,0,0 +10183,TEST,0,0 +10184,TEST,0,0 +10185,TEST,0,0 +10186,TEST,0,0 +10187,TEST,0,0 +10188,TEST,0,0 +10189,TEST,0,0 +10190,TEST,0,0 +10191,TEST,0,0 +10192,TEST,0,0 +10193,TEST,0,0 +10194,TEST,0,0 +10195,TEST,0,0 +10196,TEST,0,0 +10197,TEST,0,0 +10198,TEST,0,0 +10199,TEST,0,0 +10200,TEST,0,0 +10201,TEST,0,0 +10202,TEST,0,0 +10203,TEST,0,0 +10204,TEST,0,0 +10205,TEST,0,0 +10206,TEST,0,0 +10207,TEST,0,0 +10208,TEST,0,0 +10209,TEST,0,0 +10210,TEST,0,0 +10211,TEST,0,0 +10212,TEST,0,0 +10213,TEST,0,0 +10214,TEST,0,0 +10215,TEST,0,0 +10216,TEST,0,0 +10217,TEST,0,0 +10218,TEST,0,0 +10219,TEST,0,0 +10220,TEST,0,0 +10221,TEST,0,0 +10222,TEST,0,0 +10223,TEST,0,0 +10224,TEST,0,0 +10225,TEST,0,0 +10226,TEST,0,0 +10227,TEST,0,0 +10228,TEST,0,0 +10229,TEST,0,0 +10230,TEST,0,0 +10231,TEST,0,0 +10232,TEST,0,0 +10233,TEST,0,0 +10234,TEST,0,0 +10235,TEST,0,0 +10236,TEST,0,0 +10237,TEST,0,0 +10238,TEST,0,0 +10239,TEST,0,0 +10240,TEST,0,0 +10241,TEST,0,0 +10242,TEST,0,0 +10243,TEST,0,0 +10244,TEST,0,0 +10245,TEST,0,0 +10246,TEST,0,0 +10247,TEST,0,0 +10248,TEST,0,0 +10249,TEST,0,0 +10250,TEST,0,0 +10251,TEST,0,0 +10252,TEST,0,0 +10253,TEST,0,0 +10254,TEST,0,0 +10255,TEST,0,0 +10256,TEST,0,0 +10257,TEST,0,0 +10258,TEST,0,0 +10259,TEST,0,0 +10260,TEST,0,0 +10261,TEST,0,0 +10262,TEST,0,0 +10263,TEST,0,0 +10264,TEST,0,0 +10265,TEST,0,0 +10266,TEST,0,0 +10267,TEST,0,0 +10268,TEST,0,0 +10269,TEST,0,0 +10270,TEST,0,0 +10271,TEST,0,0 +10272,TEST,0,0 +10273,TEST,0,0 +10274,TEST,0,0 +10275,TEST,0,0 +10276,TEST,0,0 +10277,TEST,0,0 +10278,TEST,0,0 +10279,TEST,0,0 +10280,TEST,0,0 +10281,TEST,0,0 +10282,TEST,0,0 +10283,TEST,0,0 +10284,TEST,0,0 +10285,TEST,0,0 +10286,TEST,0,0 +10287,TEST,0,0 +10288,TEST,0,0 +10289,TEST,0,0 +10290,TEST,0,0 +10291,TEST,0,0 +10292,TEST,0,0 +10293,TEST,0,0 
+10294,TEST,0,0 +10295,TEST,0,0 +10296,TEST,0,0 +10297,TEST,0,0 +10298,TEST,0,0 +10299,TEST,0,0 +10300,TEST,0,0 +10301,TEST,0,0 +10302,TEST,0,0 +10303,TEST,0,0 +10304,TEST,0,0 +10305,TEST,0,0 +10306,TEST,0,0 +10307,TEST,0,0 +10308,TEST,0,0 +10309,TEST,0,0 +10310,TEST,0,0 +10311,TEST,0,0 +10312,TEST,0,0 +10313,TEST,0,0 +10314,TEST,0,0 +10315,TEST,0,0 +10316,TEST,0,0 +10317,TEST,0,0 +10318,TEST,0,0 +10319,TEST,0,0 +10320,TEST,0,0 +10321,TEST,0,0 +10322,TEST,0,0 +10323,TEST,0,0 +10324,TEST,0,0 +10325,TEST,0,0 +10326,TEST,0,0 +10327,TEST,0,0 +10328,TEST,0,0 +10329,TEST,0,0 +10330,TEST,0,0 +10331,TEST,0,0 +10332,TEST,0,0 +10333,TEST,0,0 +10334,TEST,0,0 +10335,TEST,0,0 +10336,TEST,0,0 +10337,TEST,0,0 +10338,TEST,0,0 +10339,TEST,0,0 +10340,TEST,0,0 +10341,TEST,0,0 +10342,TEST,0,0 +10343,TEST,0,0 +10344,TEST,0,0 +10345,TEST,0,0 +10346,TEST,0,0 +10347,TEST,0,0 +10348,TEST,0,0 +10349,TEST,0,0 +10350,TEST,0,0 +10351,TEST,0,0 +10352,TEST,0,0 +10353,TEST,0,0 +10354,TEST,0,0 +10355,TEST,0,0 +10356,TEST,0,0 +10357,TEST,0,0 +10358,TEST,0,0 +10359,TEST,0,0 +10360,TEST,0,0 +10361,TEST,0,0 +10362,TEST,0,0 +10363,TEST,0,0 +10364,TEST,0,0 +10365,TEST,0,0 +10366,TEST,0,0 +10367,TEST,0,0 +10368,TEST,0,0 +10369,TEST,0,0 +10370,TEST,0,0 +10371,TEST,0,0 +10372,TEST,0,0 +10373,TEST,0,0 +10374,TEST,0,0 +10375,TEST,0,0 +10376,TEST,0,0 +10377,TEST,0,0 +10378,TEST,0,0 +10379,TEST,0,0 +10380,TEST,0,0 +10381,TEST,0,0 +10382,TEST,0,0 +10383,TEST,0,0 +10384,TEST,0,0 +10385,TEST,0,0 +10386,TEST,0,0 +10387,TEST,0,0 +10388,TEST,0,0 +10389,TEST,0,0 +10390,TEST,0,0 +10391,TEST,0,0 +10392,TEST,0,0 +10393,TEST,0,0 +10394,TEST,0,0 +10395,TEST,0,0 +10396,TEST,0,0 +10397,TEST,0,0 +10398,TEST,0,0 +10399,TEST,0,0 +10400,TEST,0,0 +10401,TEST,0,0 +10402,TEST,0,0 +10403,TEST,0,0 +10404,TEST,0,0 +10405,TEST,0,0 +10406,TEST,0,0 +10407,TEST,0,0 +10408,TEST,0,0 +10409,TEST,0,0 +10410,TEST,0,0 +10411,TEST,0,0 +10412,TEST,0,0 +10413,TEST,0,0 +10414,TEST,0,0 +10415,TEST,0,0 +10416,TEST,0,0 +10417,TEST,0,0 +10418,TEST,0,0 +10419,TEST,0,0 +10420,TEST,0,0 +10421,TEST,0,0 +10422,TEST,0,0 +10423,TEST,0,0 +10424,TEST,0,0 +10425,TEST,0,0 +10426,TEST,0,0 +10427,TEST,0,0 +10428,TEST,0,0 +10429,TEST,0,0 +10430,TEST,0,0 +10431,TEST,0,0 +10432,TEST,0,0 +10433,TEST,0,0 +10434,TEST,0,0 +10435,TEST,0,0 +10436,TEST,0,0 +10437,TEST,0,0 +10438,TEST,0,0 +10439,TEST,0,0 +10440,TEST,0,0 +10441,TEST,0,0 +10442,TEST,0,0 +10443,TEST,0,0 +10444,TEST,0,0 +10445,TEST,0,0 +10446,TEST,0,0 +10447,TEST,0,0 +10448,TEST,0,0 +10449,TEST,0,0 +10450,TEST,0,0 +10451,TEST,0,0 +10452,TEST,0,0 +10453,TEST,0,0 +10454,TEST,0,0 +10455,TEST,0,0 +10456,TEST,0,0 +10457,TEST,0,0 +10458,TEST,0,0 +10459,TEST,0,0 +10460,TEST,0,0 +10461,TEST,0,0 +10462,TEST,0,0 +10463,TEST,0,0 +10464,TEST,0,0 +10465,TEST,0,0 +10466,TEST,0,0 +10467,TEST,0,0 +10468,TEST,0,0 +10469,TEST,0,0 +10470,TEST,0,0 +10471,TEST,0,0 +10472,TEST,0,0 +10473,TEST,0,0 +10474,TEST,0,0 +10475,TEST,0,0 +10476,TEST,0,0 +10477,TEST,0,0 +10478,TEST,0,0 +10479,TEST,0,0 +10480,TEST,0,0 +10481,TEST,0,0 +10482,TEST,0,0 +10483,TEST,0,0 +10484,TEST,0,0 +10485,TEST,0,0 +10486,TEST,0,0 +10487,TEST,0,0 +10488,TEST,0,0 +10489,TEST,0,0 +10490,TEST,0,0 +10491,TEST,0,0 +10492,TEST,0,0 +10493,TEST,0,0 +10494,TEST,0,0 +10495,TEST,0,0 +10496,TEST,0,0 +10497,TEST,0,0 +10498,TEST,0,0 +10499,TEST,0,0 +10500,TEST,0,0 +10501,TEST,0,0 +10502,TEST,0,0 +10503,TEST,0,0 +10504,TEST,0,0 +10505,TEST,0,0 +10506,TEST,0,0 +10507,TEST,0,0 +10508,TEST,0,0 +10509,TEST,0,0 +10510,TEST,0,0 +10511,TEST,0,0 +10512,TEST,0,0 +10513,TEST,0,0 +10514,TEST,0,0 +10515,TEST,0,0 
+10516,TEST,0,0 +10517,TEST,0,0 +10518,TEST,0,0 +10519,TEST,0,0 +10520,TEST,0,0 +10521,TEST,0,0 +10522,TEST,0,0 +10523,TEST,0,0 +10524,TEST,0,0 +10525,TEST,0,0 +10526,TEST,0,0 +10527,TEST,0,0 +10528,TEST,0,0 +10529,TEST,0,0 +10530,TEST,0,0 +10531,TEST,0,0 +10532,TEST,0,0 +10533,TEST,0,0 +10534,TEST,0,0 +10535,TEST,0,0 +10536,TEST,0,0 +10537,TEST,0,0 +10538,TEST,0,0 +10539,TEST,0,0 +10540,TEST,0,0 +10541,TEST,0,0 +10542,TEST,0,0 +10543,TEST,0,0 +10544,TEST,0,0 +10545,TEST,0,0 +10546,TEST,0,0 +10547,TEST,0,0 +10548,TEST,0,0 +10549,TEST,0,0 +10550,TEST,0,0 +10551,TEST,0,0 +10552,TEST,0,0 +10553,TEST,0,0 +10554,TEST,0,0 +10555,TEST,0,0 +10556,TEST,0,0 +10557,TEST,0,0 +10558,TEST,0,0 +10559,TEST,0,0 +10560,TEST,0,0 +10561,TEST,0,0 +10562,TEST,0,0 +10563,TEST,0,0 +10564,TEST,0,0 +10565,TEST,0,0 +10566,TEST,0,0 +10567,TEST,0,0 +10568,TEST,0,0 +10569,TEST,0,0 +10570,TEST,0,0 +10571,TEST,0,0 +10572,TEST,0,0 +10573,TEST,0,0 +10574,TEST,0,0 +10575,TEST,0,0 +10576,TEST,0,0 +10577,TEST,0,0 +10578,TEST,0,0 +10579,TEST,0,0 +10580,TEST,0,0 +10581,TEST,0,0 +10582,TEST,0,0 +10583,TEST,0,0 +10584,TEST,0,0 +10585,TEST,0,0 +10586,TEST,0,0 +10587,TEST,0,0 +10588,TEST,0,0 +10589,TEST,0,0 +10590,TEST,0,0 +10591,TEST,0,0 +10592,TEST,0,0 +10593,TEST,0,0 +10594,TEST,0,0 +10595,TEST,0,0 +10596,TEST,0,0 +10597,TEST,0,0 +10598,TEST,0,0 +10599,TEST,0,0 +10600,TEST,0,0 +10601,TEST,0,0 +10602,TEST,0,0 +10603,TEST,0,0 +10604,TEST,0,0 +10605,TEST,0,0 +10606,TEST,0,0 +10607,TEST,0,0 +10608,TEST,0,0 +10609,TEST,0,0 +10610,TEST,0,0 +10611,TEST,0,0 +10612,TEST,0,0 +10613,TEST,0,0 +10614,TEST,0,0 +10615,TEST,0,0 +10616,TEST,0,0 +10617,TEST,0,0 +10618,TEST,0,0 +10619,TEST,0,0 +10620,TEST,0,0 +10621,TEST,0,0 +10622,TEST,0,0 +10623,TEST,0,0 +10624,TEST,0,0 +10625,TEST,0,0 +10626,TEST,0,0 +10627,TEST,0,0 +10628,TEST,0,0 +10629,TEST,0,0 +10630,TEST,0,0 +10631,TEST,0,0 +10632,TEST,0,0 +10633,TEST,0,0 +10634,TEST,0,0 +10635,TEST,0,0 +10636,TEST,0,0 +10637,TEST,0,0 +10638,TEST,0,0 +10639,TEST,0,0 +10640,TEST,0,0 +10641,TEST,0,0 +10642,TEST,0,0 +10643,TEST,0,0 +10644,TEST,0,0 +10645,TEST,0,0 +10646,TEST,0,0 +10647,TEST,0,0 +10648,TEST,0,0 +10649,TEST,0,0 +10650,TEST,0,0 +10651,TEST,0,0 +10652,TEST,0,0 +10653,TEST,0,0 +10654,TEST,0,0 +10655,TEST,0,0 +10656,TEST,0,0 +10657,TEST,0,0 +10658,TEST,0,0 +10659,TEST,0,0 +10660,TEST,0,0 +10661,TEST,0,0 +10662,TEST,0,0 +10663,TEST,0,0 +10664,TEST,0,0 +10665,TEST,0,0 +10666,TEST,0,0 +10667,TEST,0,0 +10668,TEST,0,0 +10669,TEST,0,0 +10670,TEST,0,0 +10671,TEST,0,0 +10672,TEST,0,0 +10673,TEST,0,0 +10674,TEST,0,0 +10675,TEST,0,0 +10676,TEST,0,0 +10677,TEST,0,0 +10678,TEST,0,0 +10679,TEST,0,0 +10680,TEST,0,0 +10681,TEST,0,0 +10682,TEST,0,0 +10683,TEST,0,0 +10684,TEST,0,0 +10685,TEST,0,0 +10686,TEST,0,0 +10687,TEST,0,0 +10688,TEST,0,0 +10689,TEST,0,0 +10690,TEST,0,0 +10691,TEST,0,0 +10692,TEST,0,0 +10693,TEST,0,0 +10694,TEST,0,0 +10695,TEST,0,0 +10696,TEST,0,0 +10697,TEST,0,0 +10698,TEST,0,0 +10699,TEST,0,0 +10700,TEST,0,0 +10701,TEST,0,0 +10702,TEST,0,0 +10703,TEST,0,0 +10704,TEST,0,0 +10705,TEST,0,0 +10706,TEST,0,0 +10707,TEST,0,0 +10708,TEST,0,0 +10709,TEST,0,0 +10710,TEST,0,0 +10711,TEST,0,0 +10712,TEST,0,0 +10713,TEST,0,0 +10714,TEST,0,0 +10715,TEST,0,0 +10716,TEST,0,0 +10717,TEST,0,0 +10718,TEST,0,0 +10719,TEST,0,0 +10720,TEST,0,0 +10721,TEST,0,0 +10722,TEST,0,0 +10723,TEST,0,0 +10724,TEST,0,0 +10725,TEST,0,0 +10726,TEST,0,0 +10727,TEST,0,0 +10728,TEST,0,0 +10729,TEST,0,0 +10730,TEST,0,0 +10731,TEST,0,0 +10732,TEST,0,0 +10733,TEST,0,0 +10734,TEST,0,0 +10735,TEST,0,0 +10736,TEST,0,0 +10737,TEST,0,0 
+10738,TEST,0,0 +10739,TEST,0,0 +10740,TEST,0,0 +10741,TEST,0,0 +10742,TEST,0,0 +10743,TEST,0,0 +10744,TEST,0,0 +10745,TEST,0,0 +10746,TEST,0,0 +10747,TEST,0,0 +10748,TEST,0,0 +10749,TEST,0,0 +10750,TEST,0,0 +10751,TEST,0,0 +10752,TEST,0,0 +10753,TEST,0,0 +10754,TEST,0,0 +10755,TEST,0,0 +10756,TEST,0,0 +10757,TEST,0,0 +10758,TEST,0,0 +10759,TEST,0,0 +10760,TEST,0,0 +10761,TEST,0,0 +10762,TEST,0,0 +10763,TEST,0,0 +10764,TEST,0,0 +10765,TEST,0,0 +10766,TEST,0,0 +10767,TEST,0,0 +10768,TEST,0,0 +10769,TEST,0,0 +10770,TEST,0,0 +10771,TEST,0,0 +10772,TEST,0,0 +10773,TEST,0,0 +10774,TEST,0,0 +10775,TEST,0,0 +10776,TEST,0,0 +10777,TEST,0,0 +10778,TEST,0,0 +10779,TEST,0,0 +10780,TEST,0,0 +10781,TEST,0,0 +10782,TEST,0,0 +10783,TEST,0,0 +10784,TEST,0,0 +10785,TEST,0,0 +10786,TEST,0,0 +10787,TEST,0,0 +10788,TEST,0,0 +10789,TEST,0,0 +10790,TEST,0,0 +10791,TEST,0,0 +10792,TEST,0,0 +10793,TEST,0,0 +10794,TEST,0,0 +10795,TEST,0,0 +10796,TEST,0,0 +10797,TEST,0,0 +10798,TEST,0,0 +10799,TEST,0,0 +10800,TEST,0,0 +10801,TEST,0,0 +10802,TEST,0,0 +10803,TEST,0,0 +10804,TEST,0,0 +10805,TEST,0,0 +10806,TEST,0,0 +10807,TEST,0,0 +10808,TEST,0,0 +10809,TEST,0,0 +10810,TEST,0,0 +10811,TEST,0,0 +10812,TEST,0,0 +10813,TEST,0,0 +10814,TEST,0,0 +10815,TEST,0,0 +10816,TEST,0,0 +10817,TEST,0,0 +10818,TEST,0,0 +10819,TEST,0,0 +10820,TEST,0,0 +10821,TEST,0,0 +10822,TEST,0,0 +10823,TEST,0,0 +10824,TEST,0,0 +10825,TEST,0,0 +10826,TEST,0,0 +10827,TEST,0,0 +10828,TEST,0,0 +10829,TEST,0,0 +10830,TEST,0,0 +10831,TEST,0,0 +10832,TEST,0,0 +10833,TEST,0,0 +10834,TEST,0,0 +10835,TEST,0,0 +10836,TEST,0,0 +10837,TEST,0,0 +10838,TEST,0,0 +10839,TEST,0,0 +10840,TEST,0,0 +10841,TEST,0,0 +10842,TEST,0,0 +10843,TEST,0,0 +10844,TEST,0,0 +10845,TEST,0,0 +10846,TEST,0,0 +10847,TEST,0,0 +10848,TEST,0,0 +10849,TEST,0,0 +10850,TEST,0,0 +10851,TEST,0,0 +10852,TEST,0,0 +10853,TEST,0,0 +10854,TEST,0,0 +10855,TEST,0,0 +10856,TEST,0,0 +10857,TEST,0,0 +10858,TEST,0,0 +10859,TEST,0,0 +10860,TEST,0,0 +10861,TEST,0,0 +10862,TEST,0,0 +10863,TEST,0,0 +10864,TEST,0,0 +10865,TEST,0,0 +10866,TEST,0,0 +10867,TEST,0,0 +10868,TEST,0,0 +10869,TEST,0,0 +10870,TEST,0,0 +10871,TEST,0,0 +10872,TEST,0,0 +10873,TEST,0,0 +10874,TEST,0,0 +10875,TEST,0,0 +10876,TEST,0,0 +10877,TEST,0,0 +10878,TEST,0,0 +10879,TEST,0,0 +10880,TEST,0,0 +10881,TEST,0,0 +10882,TEST,0,0 +10883,TEST,0,0 +10884,TEST,0,0 +10885,TEST,0,0 +10886,TEST,0,0 +10887,TEST,0,0 +10888,TEST,0,0 +10889,TEST,0,0 +10890,TEST,0,0 +10891,TEST,0,0 +10892,TEST,0,0 +10893,TEST,0,0 +10894,TEST,0,0 +10895,TEST,0,0 +10896,TEST,0,0 +10897,TEST,0,0 +10898,TEST,0,0 +10899,TEST,0,0 +10900,TEST,0,0 +10901,TEST,0,0 +10902,TEST,0,0 +10903,TEST,0,0 +10904,TEST,0,0 +10905,TEST,0,0 +10906,TEST,0,0 +10907,TEST,0,0 +10908,TEST,0,0 +10909,TEST,0,0 +10910,TEST,0,0 +10911,TEST,0,0 +10912,TEST,0,0 +10913,TEST,0,0 +10914,TEST,0,0 +10915,TEST,0,0 +10916,TEST,0,0 +10917,TEST,0,0 +10918,TEST,0,0 +10919,TEST,0,0 +10920,TEST,0,0 +10921,TEST,0,0 +10922,TEST,0,0 +10923,TEST,0,0 +10924,TEST,0,0 +10925,TEST,0,0 +10926,TEST,0,0 +10927,TEST,0,0 +10928,TEST,0,0 +10929,TEST,0,0 +10930,TEST,0,0 +10931,TEST,0,0 +10932,TEST,0,0 +10933,TEST,0,0 +10934,TEST,0,0 +10935,TEST,0,0 +10936,TEST,0,0 +10937,TEST,0,0 +10938,TEST,0,0 +10939,TEST,0,0 +10940,TEST,0,0 +10941,TEST,0,0 +10942,TEST,0,0 +10943,TEST,0,0 +10944,TEST,0,0 +10945,TEST,0,0 +10946,TEST,0,0 +10947,TEST,0,0 +10948,TEST,0,0 +10949,TEST,0,0 +10950,TEST,0,0 +10951,TEST,0,0 +10952,TEST,0,0 +10953,TEST,0,0 +10954,TEST,0,0 +10955,TEST,0,0 +10956,TEST,0,0 +10957,TEST,0,0 +10958,TEST,0,0 +10959,TEST,0,0 
+10960,TEST,0,0 +10961,TEST,0,0 +10962,TEST,0,0 +10963,TEST,0,0 +10964,TEST,0,0 +10965,TEST,0,0 +10966,TEST,0,0 +10967,TEST,0,0 +10968,TEST,0,0 +10969,TEST,0,0 +10970,TEST,0,0 +10971,TEST,0,0 +10972,TEST,0,0 +10973,TEST,0,0 +10974,TEST,0,0 +10975,TEST,0,0 +10976,TEST,0,0 +10977,TEST,0,0 +10978,TEST,0,0 +10979,TEST,0,0 +10980,TEST,0,0 +10981,TEST,0,0 +10982,TEST,0,0 +10983,TEST,0,0 +10984,TEST,0,0 +10985,TEST,0,0 +10986,TEST,0,0 +10987,TEST,0,0 +10988,TEST,0,0 +10989,TEST,0,0 +10990,TEST,0,0 +10991,TEST,0,0 +10992,TEST,0,0 +10993,TEST,0,0 +10994,TEST,0,0 +10995,TEST,0,0 +10996,TEST,0,0 +10997,TEST,0,0 +10998,TEST,0,0 +10999,TEST,0,0 +11000,TEST,0,0 +11001,TEST,0,0 +11002,TEST,0,0 +11003,TEST,0,0 +11004,TEST,0,0 +11005,TEST,0,0 +11006,TEST,0,0 +11007,TEST,0,0 +11008,TEST,0,0 +11009,TEST,0,0 +11010,TEST,0,0 +11011,TEST,0,0 +11012,TEST,0,0 +11013,TEST,0,0 +11014,TEST,0,0 +11015,TEST,0,0 +11016,TEST,0,0 +11017,TEST,0,0 +11018,TEST,0,0 +11019,TEST,0,0 +11020,TEST,0,0 +11021,TEST,0,0 +11022,TEST,0,0 +11023,TEST,0,0 +11024,TEST,0,0 +11025,TEST,0,0 +11026,TEST,0,0 +11027,TEST,0,0 +11028,TEST,0,0 +11029,TEST,0,0 +11030,TEST,0,0 +11031,TEST,0,0 +11032,TEST,0,0 +11033,TEST,0,0 +11034,TEST,0,0 +11035,TEST,0,0 +11036,TEST,0,0 +11037,TEST,0,0 +11038,TEST,0,0 +11039,TEST,0,0 +11040,TEST,0,0 +11041,TEST,0,0 +11042,TEST,0,0 +11043,TEST,0,0 +11044,TEST,0,0 +11045,TEST,0,0 +11046,TEST,0,0 +11047,TEST,0,0 +11048,TEST,0,0 +11049,TEST,0,0 +11050,TEST,0,0 +11051,TEST,0,0 +11052,TEST,0,0 +11053,TEST,0,0 +11054,TEST,0,0 +11055,TEST,0,0 +11056,TEST,0,0 +11057,TEST,0,0 +11058,TEST,0,0 +11059,TEST,0,0 +11060,TEST,0,0 +11061,TEST,0,0 +11062,TEST,0,0 +11063,TEST,0,0 +11064,TEST,0,0 +11065,TEST,0,0 +11066,TEST,0,0 +11067,TEST,0,0 +11068,TEST,0,0 +11069,TEST,0,0 +11070,TEST,0,0 +11071,TEST,0,0 +11072,TEST,0,0 +11073,TEST,0,0 +11074,TEST,0,0 +11075,TEST,0,0 +11076,TEST,0,0 +11077,TEST,0,0 +11078,TEST,0,0 +11079,TEST,0,0 +11080,TEST,0,0 +11081,TEST,0,0 +11082,TEST,0,0 +11083,TEST,0,0 +11084,TEST,0,0 +11085,TEST,0,0 +11086,TEST,0,0 +11087,TEST,0,0 +11088,TEST,0,0 +11089,TEST,0,0 +11090,TEST,0,0 +11091,TEST,0,0 +11092,TEST,0,0 +11093,TEST,0,0 +11094,TEST,0,0 +11095,TEST,0,0 +11096,TEST,0,0 +11097,TEST,0,0 +11098,TEST,0,0 +11099,TEST,0,0 +11100,TEST,0,0 +11101,TEST,0,0 +11102,TEST,0,0 +11103,TEST,0,0 +11104,TEST,0,0 +11105,TEST,0,0 +11106,TEST,0,0 +11107,TEST,0,0 +11108,TEST,0,0 +11109,TEST,0,0 +11110,TEST,0,0 +11111,TEST,0,0 +11112,TEST,0,0 +11113,TEST,0,0 +11114,TEST,0,0 +11115,TEST,0,0 +11116,TEST,0,0 +11117,TEST,0,0 +11118,TEST,0,0 +11119,TEST,0,0 +11120,TEST,0,0 +11121,TEST,0,0 +11122,TEST,0,0 +11123,TEST,0,0 +11124,TEST,0,0 +11125,TEST,0,0 +11126,TEST,0,0 +11127,TEST,0,0 +11128,TEST,0,0 +11129,TEST,0,0 +11130,TEST,0,0 +11131,TEST,0,0 +11132,TEST,0,0 +11133,TEST,0,0 +11134,TEST,0,0 +11135,TEST,0,0 +11136,TEST,0,0 +11137,TEST,0,0 +11138,TEST,0,0 +11139,TEST,0,0 +11140,TEST,0,0 +11141,TEST,0,0 +11142,TEST,0,0 +11143,TEST,0,0 +11144,TEST,0,0 +11145,TEST,0,0 +11146,TEST,0,0 +11147,TEST,0,0 +11148,TEST,0,0 +11149,TEST,0,0 +11150,TEST,0,0 +11151,TEST,0,0 +11152,TEST,0,0 +11153,TEST,0,0 +11154,TEST,0,0 +11155,TEST,0,0 +11156,TEST,0,0 +11157,TEST,0,0 +11158,TEST,0,0 +11159,TEST,0,0 +11160,TEST,0,0 +11161,TEST,0,0 +11162,TEST,0,0 +11163,TEST,0,0 +11164,TEST,0,0 +11165,TEST,0,0 +11166,TEST,0,0 +11167,TEST,0,0 +11168,TEST,0,0 +11169,TEST,0,0 +11170,TEST,0,0 +11171,TEST,0,0 +11172,TEST,0,0 +11173,TEST,0,0 +11174,TEST,0,0 +11175,TEST,0,0 +11176,TEST,0,0 +11177,TEST,0,0 +11178,TEST,0,0 +11179,TEST,0,0 +11180,TEST,0,0 +11181,TEST,0,0 
+11182,TEST,0,0 +11183,TEST,0,0 +11184,TEST,0,0 +11185,TEST,0,0 +11186,TEST,0,0 +11187,TEST,0,0 +11188,TEST,0,0 +11189,TEST,0,0 +11190,TEST,0,0 +11191,TEST,0,0 +11192,TEST,0,0 +11193,TEST,0,0 +11194,TEST,0,0 +11195,TEST,0,0 +11196,TEST,0,0 +11197,TEST,0,0 +11198,TEST,0,0 +11199,TEST,0,0 +11200,TEST,0,0 +11201,TEST,0,0 +11202,TEST,0,0 +11203,TEST,0,0 +11204,TEST,0,0 +11205,TEST,0,0 +11206,TEST,0,0 +11207,TEST,0,0 +11208,TEST,0,0 +11209,TEST,0,0 +11210,TEST,0,0 +11211,TEST,0,0 +11212,TEST,0,0 +11213,TEST,0,0 +11214,TEST,0,0 +11215,TEST,0,0 +11216,TEST,0,0 +11217,TEST,0,0 +11218,TEST,0,0 +11219,TEST,0,0 +11220,TEST,0,0 +11221,TEST,0,0 +11222,TEST,0,0 +11223,TEST,0,0 +11224,TEST,0,0 +11225,TEST,0,0 +11226,TEST,0,0 +11227,TEST,0,0 +11228,TEST,0,0 +11229,TEST,0,0 +11230,TEST,0,0 +11231,TEST,0,0 +11232,TEST,0,0 +11233,TEST,0,0 +11234,TEST,0,0 +11235,TEST,0,0 +11236,TEST,0,0 +11237,TEST,0,0 +11238,TEST,0,0 +11239,TEST,0,0 +11240,TEST,0,0 +11241,TEST,0,0 +11242,TEST,0,0 +11243,TEST,0,0 +11244,TEST,0,0 +11245,TEST,0,0 +11246,TEST,0,0 +11247,TEST,0,0 +11248,TEST,0,0 +11249,TEST,0,0 +11250,TEST,0,0 +11251,TEST,0,0 +11252,TEST,0,0 +11253,TEST,0,0 +11254,TEST,0,0 +11255,TEST,0,0 +11256,TEST,0,0 +11257,TEST,0,0 +11258,TEST,0,0 +11259,TEST,0,0 +11260,TEST,0,0 +11261,TEST,0,0 +11262,TEST,0,0 +11263,TEST,0,0 +11264,TEST,0,0 +11265,TEST,0,0 +11266,TEST,0,0 +11267,TEST,0,0 +11268,TEST,0,0 +11269,TEST,0,0 +11270,TEST,0,0 +11271,TEST,0,0 +11272,TEST,0,0 +11273,TEST,0,0 +11274,TEST,0,0 +11275,TEST,0,0 +11276,TEST,0,0 +11277,TEST,0,0 +11278,TEST,0,0 +11279,TEST,0,0 +11280,TEST,0,0 +11281,TEST,0,0 +11282,TEST,0,0 +11283,TEST,0,0 +11284,TEST,0,0 +11285,TEST,0,0 +11286,TEST,0,0 +11287,TEST,0,0 +11288,TEST,0,0 +11289,TEST,0,0 +11290,TEST,0,0 +11291,TEST,0,0 +11292,TEST,0,0 +11293,TEST,0,0 +11294,TEST,0,0 +11295,TEST,0,0 +11296,TEST,0,0 +11297,TEST,0,0 +11298,TEST,0,0 +11299,TEST,0,0 +11300,TEST,0,0 +11301,TEST,0,0 +11302,TEST,0,0 +11303,TEST,0,0 +11304,TEST,0,0 +11305,TEST,0,0 +11306,TEST,0,0 +11307,TEST,0,0 +11308,TEST,0,0 +11309,TEST,0,0 +11310,TEST,0,0 +11311,TEST,0,0 +11312,TEST,0,0 +11313,TEST,0,0 +11314,TEST,0,0 +11315,TEST,0,0 +11316,TEST,0,0 +11317,TEST,0,0 +11318,TEST,0,0 +11319,TEST,0,0 +11320,TEST,0,0 +11321,TEST,0,0 +11322,TEST,0,0 +11323,TEST,0,0 +11324,TEST,0,0 +11325,TEST,0,0 +11326,TEST,0,0 +11327,TEST,0,0 +11328,TEST,0,0 +11329,TEST,0,0 +11330,TEST,0,0 +11331,TEST,0,0 +11332,TEST,0,0 +11333,TEST,0,0 +11334,TEST,0,0 +11335,TEST,0,0 +11336,TEST,0,0 +11337,TEST,0,0 +11338,TEST,0,0 +11339,TEST,0,0 +11340,TEST,0,0 +11341,TEST,0,0 +11342,TEST,0,0 +11343,TEST,0,0 +11344,TEST,0,0 +11345,TEST,0,0 +11346,TEST,0,0 +11347,TEST,0,0 +11348,TEST,0,0 +11349,TEST,0,0 +11350,TEST,0,0 +11351,TEST,0,0 +11352,TEST,0,0 +11353,TEST,0,0 +11354,TEST,0,0 +11355,TEST,0,0 +11356,TEST,0,0 +11357,TEST,0,0 +11358,TEST,0,0 +11359,TEST,0,0 +11360,TEST,0,0 +11361,TEST,0,0 +11362,TEST,0,0 +11363,TEST,0,0 +11364,TEST,0,0 +11365,TEST,0,0 +11366,TEST,0,0 +11367,TEST,0,0 +11368,TEST,0,0 +11369,TEST,0,0 +11370,TEST,0,0 +11371,TEST,0,0 +11372,TEST,0,0 +11373,TEST,0,0 +11374,TEST,0,0 +11375,TEST,0,0 +11376,TEST,0,0 +11377,TEST,0,0 +11378,TEST,0,0 +11379,TEST,0,0 +11380,TEST,0,0 +11381,TEST,0,0 +11382,TEST,0,0 +11383,TEST,0,0 +11384,TEST,0,0 +11385,TEST,0,0 +11386,TEST,0,0 +11387,TEST,0,0 +11388,TEST,0,0 +11389,TEST,0,0 +11390,TEST,0,0 +11391,TEST,0,0 +11392,TEST,0,0 +11393,TEST,0,0 +11394,TEST,0,0 +11395,TEST,0,0 +11396,TEST,0,0 +11397,TEST,0,0 +11398,TEST,0,0 +11399,TEST,0,0 +11400,TEST,0,0 +11401,TEST,0,0 +11402,TEST,0,0 +11403,TEST,0,0 
+11404,TEST,0,0 +11405,TEST,0,0 +11406,TEST,0,0 +11407,TEST,0,0 +11408,TEST,0,0 +11409,TEST,0,0 +11410,TEST,0,0 +11411,TEST,0,0 +11412,TEST,0,0 +11413,TEST,0,0 +11414,TEST,0,0 +11415,TEST,0,0 +11416,TEST,0,0 +11417,TEST,0,0 +11418,TEST,0,0 +11419,TEST,0,0 +11420,TEST,0,0 +11421,TEST,0,0 +11422,TEST,0,0 +11423,TEST,0,0 +11424,TEST,0,0 +11425,TEST,0,0 +11426,TEST,0,0 +11427,TEST,0,0 +11428,TEST,0,0 +11429,TEST,0,0 +11430,TEST,0,0 +11431,TEST,0,0 +11432,TEST,0,0 +11433,TEST,0,0 +11434,TEST,0,0 +11435,TEST,0,0 +11436,TEST,0,0 +11437,TEST,0,0 +11438,TEST,0,0 +11439,TEST,0,0 +11440,TEST,0,0 +11441,TEST,0,0 +11442,TEST,0,0 +11443,TEST,0,0 +11444,TEST,0,0 +11445,TEST,0,0 +11446,TEST,0,0 +11447,TEST,0,0 +11448,TEST,0,0 +11449,TEST,0,0 +11450,TEST,0,0 +11451,TEST,0,0 +11452,TEST,0,0 +11453,TEST,0,0 +11454,TEST,0,0 +11455,TEST,0,0 +11456,TEST,0,0 +11457,TEST,0,0 +11458,TEST,0,0 +11459,TEST,0,0 +11460,TEST,0,0 +11461,TEST,0,0 +11462,TEST,0,0 +11463,TEST,0,0 +11464,TEST,0,0 +11465,TEST,0,0 +11466,TEST,0,0 +11467,TEST,0,0 +11468,TEST,0,0 +11469,TEST,0,0 +11470,TEST,0,0 +11471,TEST,0,0 +11472,TEST,0,0 +11473,TEST,0,0 +11474,TEST,0,0 +11475,TEST,0,0 +11476,TEST,0,0 +11477,TEST,0,0 +11478,TEST,0,0 +11479,TEST,0,0 +11480,TEST,0,0 +11481,TEST,0,0 +11482,TEST,0,0 +11483,TEST,0,0 +11484,TEST,0,0 +11485,TEST,0,0 +11486,TEST,0,0 +11487,TEST,0,0 +11488,TEST,0,0 +11489,TEST,0,0 +11490,TEST,0,0 +11491,TEST,0,0 +11492,TEST,0,0 +11493,TEST,0,0 +11494,TEST,0,0 +11495,TEST,0,0 +11496,TEST,0,0 +11497,TEST,0,0 +11498,TEST,0,0 +11499,TEST,0,0 +11500,TEST,0,0 +11501,TEST,0,0 +11502,TEST,0,0 +11503,TEST,0,0 +11504,TEST,0,0 +11505,TEST,0,0 +11506,TEST,0,0 +11507,TEST,0,0 +11508,TEST,0,0 +11509,TEST,0,0 +11510,TEST,0,0 +11511,TEST,0,0 +11512,TEST,0,0 +11513,TEST,0,0 +11514,TEST,0,0 +11515,TEST,0,0 +11516,TEST,0,0 +11517,TEST,0,0 +11518,TEST,0,0 +11519,TEST,0,0 +11520,TEST,0,0 +11521,TEST,0,0 +11522,TEST,0,0 +11523,TEST,0,0 +11524,TEST,0,0 +11525,TEST,0,0 +11526,TEST,0,0 +11527,TEST,0,0 +11528,TEST,0,0 +11529,TEST,0,0 +11530,TEST,0,0 +11531,TEST,0,0 +11532,TEST,0,0 +11533,TEST,0,0 +11534,TEST,0,0 +11535,TEST,0,0 +11536,TEST,0,0 +11537,TEST,0,0 +11538,TEST,0,0 +11539,TEST,0,0 +11540,TEST,0,0 +11541,TEST,0,0 +11542,TEST,0,0 +11543,TEST,0,0 +11544,TEST,0,0 +11545,TEST,0,0 +11546,TEST,0,0 +11547,TEST,0,0 +11548,TEST,0,0 +11549,TEST,0,0 +11550,TEST,0,0 +11551,TEST,0,0 +11552,TEST,0,0 +11553,TEST,0,0 +11554,TEST,0,0 +11555,TEST,0,0 +11556,TEST,0,0 +11557,TEST,0,0 +11558,TEST,0,0 +11559,TEST,0,0 +11560,TEST,0,0 +11561,TEST,0,0 +11562,TEST,0,0 +11563,TEST,0,0 +11564,TEST,0,0 +11565,TEST,0,0 +11566,TEST,0,0 +11567,TEST,0,0 +11568,TEST,0,0 +11569,TEST,0,0 +11570,TEST,0,0 +11571,TEST,0,0 +11572,TEST,0,0 +11573,TEST,0,0 +11574,TEST,0,0 +11575,TEST,0,0 +11576,TEST,0,0 +11577,TEST,0,0 +11578,TEST,0,0 +11579,TEST,0,0 +11580,TEST,0,0 +11581,TEST,0,0 +11582,TEST,0,0 +11583,TEST,0,0 +11584,TEST,0,0 +11585,TEST,0,0 +11586,TEST,0,0 +11587,TEST,0,0 +11588,TEST,0,0 +11589,TEST,0,0 +11590,TEST,0,0 +11591,TEST,0,0 +11592,TEST,0,0 +11593,TEST,0,0 +11594,TEST,0,0 +11595,TEST,0,0 +11596,TEST,0,0 +11597,TEST,0,0 +11598,TEST,0,0 +11599,TEST,0,0 +11600,TEST,0,0 +11601,TEST,0,0 +11602,TEST,0,0 +11603,TEST,0,0 +11604,TEST,0,0 +11605,TEST,0,0 +11606,TEST,0,0 +11607,TEST,0,0 +11608,TEST,0,0 +11609,TEST,0,0 +11610,TEST,0,0 +11611,TEST,0,0 +11612,TEST,0,0 +11613,TEST,0,0 +11614,TEST,0,0 +11615,TEST,0,0 +11616,TEST,0,0 +11617,TEST,0,0 +11618,TEST,0,0 +11619,TEST,0,0 +11620,TEST,0,0 +11621,TEST,0,0 +11622,TEST,0,0 +11623,TEST,0,0 +11624,TEST,0,0 +11625,TEST,0,0 
+11626,TEST,0,0 +11627,TEST,0,0 +11628,TEST,0,0 +11629,TEST,0,0 +11630,TEST,0,0 +11631,TEST,0,0 +11632,TEST,0,0 +11633,TEST,0,0 +11634,TEST,0,0 +11635,TEST,0,0 +11636,TEST,0,0 +11637,TEST,0,0 +11638,TEST,0,0 +11639,TEST,0,0 +11640,TEST,0,0 +11641,TEST,0,0 +11642,TEST,0,0 +11643,TEST,0,0 +11644,TEST,0,0 +11645,TEST,0,0 +11646,TEST,0,0 +11647,TEST,0,0 +11648,TEST,0,0 +11649,TEST,0,0 +11650,TEST,0,0 +11651,TEST,0,0 +11652,TEST,0,0 +11653,TEST,0,0 +11654,TEST,0,0 +11655,TEST,0,0 +11656,TEST,0,0 +11657,TEST,0,0 +11658,TEST,0,0 +11659,TEST,0,0 +11660,TEST,0,0 +11661,TEST,0,0 +11662,TEST,0,0 +11663,TEST,0,0 +11664,TEST,0,0 +11665,TEST,0,0 +11666,TEST,0,0 +11667,TEST,0,0 +11668,TEST,0,0 +11669,TEST,0,0 +11670,TEST,0,0 +11671,TEST,0,0 +11672,TEST,0,0 +11673,TEST,0,0 +11674,TEST,0,0 +11675,TEST,0,0 +11676,TEST,0,0 +11677,TEST,0,0 +11678,TEST,0,0 +11679,TEST,0,0 +11680,TEST,0,0 +11681,TEST,0,0 +11682,TEST,0,0 +11683,TEST,0,0 +11684,TEST,0,0 +11685,TEST,0,0 +11686,TEST,0,0 +11687,TEST,0,0 +11688,TEST,0,0 +11689,TEST,0,0 +11690,TEST,0,0 +11691,TEST,0,0 +11692,TEST,0,0 +11693,TEST,0,0 +11694,TEST,0,0 +11695,TEST,0,0 +11696,TEST,0,0 +11697,TEST,0,0 +11698,TEST,0,0 +11699,TEST,0,0 +11700,TEST,0,0 +11701,TEST,0,0 +11702,TEST,0,0 +11703,TEST,0,0 +11704,TEST,0,0 +11705,TEST,0,0 +11706,TEST,0,0 +11707,TEST,0,0 +11708,TEST,0,0 +11709,TEST,0,0 +11710,TEST,0,0 +11711,TEST,0,0 +11712,TEST,0,0 +11713,TEST,0,0 +11714,TEST,0,0 +11715,TEST,0,0 +11716,TEST,0,0 +11717,TEST,0,0 +11718,TEST,0,0 +11719,TEST,0,0 +11720,TEST,0,0 +11721,TEST,0,0 +11722,TEST,0,0 +11723,TEST,0,0 +11724,TEST,0,0 +11725,TEST,0,0 +11726,TEST,0,0 +11727,TEST,0,0 +11728,TEST,0,0 +11729,TEST,0,0 +11730,TEST,0,0 +11731,TEST,0,0 +11732,TEST,0,0 +11733,TEST,0,0 +11734,TEST,0,0 +11735,TEST,0,0 +11736,TEST,0,0 +11737,TEST,0,0 +11738,TEST,0,0 +11739,TEST,0,0 +11740,TEST,0,0 +11741,TEST,0,0 +11742,TEST,0,0 +11743,TEST,0,0 +11744,TEST,0,0 +11745,TEST,0,0 +11746,TEST,0,0 +11747,TEST,0,0 +11748,TEST,0,0 +11749,TEST,0,0 +11750,TEST,0,0 +11751,TEST,0,0 +11752,TEST,0,0 +11753,TEST,0,0 +11754,TEST,0,0 +11755,TEST,0,0 +11756,TEST,0,0 +11757,TEST,0,0 +11758,TEST,0,0 +11759,TEST,0,0 +11760,TEST,0,0 +11761,TEST,0,0 +11762,TEST,0,0 +11763,TEST,0,0 +11764,TEST,0,0 +11765,TEST,0,0 +11766,TEST,0,0 +11767,TEST,0,0 +11768,TEST,0,0 +11769,TEST,0,0 +11770,TEST,0,0 +11771,TEST,0,0 +11772,TEST,0,0 +11773,TEST,0,0 +11774,TEST,0,0 +11775,TEST,0,0 +11776,TEST,0,0 +11777,TEST,0,0 +11778,TEST,0,0 +11779,TEST,0,0 +11780,TEST,0,0 +11781,TEST,0,0 +11782,TEST,0,0 +11783,TEST,0,0 +11784,TEST,0,0 +11785,TEST,0,0 +11786,TEST,0,0 +11787,TEST,0,0 +11788,TEST,0,0 +11789,TEST,0,0 +11790,TEST,0,0 +11791,TEST,0,0 +11792,TEST,0,0 +11793,TEST,0,0 +11794,TEST,0,0 +11795,TEST,0,0 +11796,TEST,0,0 +11797,TEST,0,0 +11798,TEST,0,0 +11799,TEST,0,0 +11800,TEST,0,0 +11801,TEST,0,0 +11802,TEST,0,0 +11803,TEST,0,0 +11804,TEST,0,0 +11805,TEST,0,0 +11806,TEST,0,0 +11807,TEST,0,0 +11808,TEST,0,0 +11809,TEST,0,0 +11810,TEST,0,0 +11811,TEST,0,0 +11812,TEST,0,0 +11813,TEST,0,0 +11814,TEST,0,0 +11815,TEST,0,0 +11816,TEST,0,0 +11817,TEST,0,0 +11818,TEST,0,0 +11819,TEST,0,0 +11820,TEST,0,0 +11821,TEST,0,0 +11822,TEST,0,0 +11823,TEST,0,0 +11824,TEST,0,0 +11825,TEST,0,0 +11826,TEST,0,0 +11827,TEST,0,0 +11828,TEST,0,0 +11829,TEST,0,0 +11830,TEST,0,0 +11831,TEST,0,0 +11832,TEST,0,0 +11833,TEST,0,0 +11834,TEST,0,0 +11835,TEST,0,0 +11836,TEST,0,0 +11837,TEST,0,0 +11838,TEST,0,0 +11839,TEST,0,0 +11840,TEST,0,0 +11841,TEST,0,0 +11842,TEST,0,0 +11843,TEST,0,0 +11844,TEST,0,0 +11845,TEST,0,0 +11846,TEST,0,0 +11847,TEST,0,0 
+11848,TEST,0,0 +11849,TEST,0,0 +11850,TEST,0,0 +11851,TEST,0,0 +11852,TEST,0,0 +11853,TEST,0,0 +11854,TEST,0,0 +11855,TEST,0,0 +11856,TEST,0,0 +11857,TEST,0,0 +11858,TEST,0,0 +11859,TEST,0,0 +11860,TEST,0,0 +11861,TEST,0,0 +11862,TEST,0,0 +11863,TEST,0,0 +11864,TEST,0,0 +11865,TEST,0,0 +11866,TEST,0,0 +11867,TEST,0,0 +11868,TEST,0,0 +11869,TEST,0,0 +11870,TEST,0,0 +11871,TEST,0,0 +11872,TEST,0,0 +11873,TEST,0,0 +11874,TEST,0,0 +11875,TEST,0,0 +11876,TEST,0,0 +11877,TEST,0,0 +11878,TEST,0,0 +11879,TEST,0,0 +11880,TEST,0,0 +11881,TEST,0,0 +11882,TEST,0,0 +11883,TEST,0,0 +11884,TEST,0,0 +11885,TEST,0,0 +11886,TEST,0,0 +11887,TEST,0,0 +11888,TEST,0,0 +11889,TEST,0,0 +11890,TEST,0,0 +11891,TEST,0,0 +11892,TEST,0,0 +11893,TEST,0,0 +11894,TEST,0,0 +11895,TEST,0,0 +11896,TEST,0,0 +11897,TEST,0,0 +11898,TEST,0,0 +11899,TEST,0,0 +11900,TEST,0,0 +11901,TEST,0,0 +11902,TEST,0,0 +11903,TEST,0,0 +11904,TEST,0,0 +11905,TEST,0,0 +11906,TEST,0,0 +11907,TEST,0,0 +11908,TEST,0,0 +11909,TEST,0,0 +11910,TEST,0,0 +11911,TEST,0,0 +11912,TEST,0,0 +11913,TEST,0,0 +11914,TEST,0,0 +11915,TEST,0,0 +11916,TEST,0,0 +11917,TEST,0,0 +11918,TEST,0,0 +11919,TEST,0,0 +11920,TEST,0,0 +11921,TEST,0,0 +11922,TEST,0,0 +11923,TEST,0,0 +11924,TEST,0,0 +11925,TEST,0,0 +11926,TEST,0,0 +11927,TEST,0,0 +11928,TEST,0,0 +11929,TEST,0,0 +11930,TEST,0,0 +11931,TEST,0,0 +11932,TEST,0,0 +11933,TEST,0,0 +11934,TEST,0,0 +11935,TEST,0,0 +11936,TEST,0,0 +11937,TEST,0,0 +11938,TEST,0,0 +11939,TEST,0,0 +11940,TEST,0,0 +11941,TEST,0,0 +11942,TEST,0,0 +11943,TEST,0,0 +11944,TEST,0,0 +11945,TEST,0,0 +11946,TEST,0,0 +11947,TEST,0,0 +11948,TEST,0,0 +11949,TEST,0,0 +11950,TEST,0,0 +11951,TEST,0,0 +11952,TEST,0,0 +11953,TEST,0,0 +11954,TEST,0,0 +11955,TEST,0,0 +11956,TEST,0,0 +11957,TEST,0,0 +11958,TEST,0,0 +11959,TEST,0,0 +11960,TEST,0,0 +11961,TEST,0,0 +11962,TEST,0,0 +11963,TEST,0,0 +11964,TEST,0,0 +11965,TEST,0,0 +11966,TEST,0,0 +11967,TEST,0,0 +11968,TEST,0,0 +11969,TEST,0,0 +11970,TEST,0,0 +11971,TEST,0,0 +11972,TEST,0,0 +11973,TEST,0,0 +11974,TEST,0,0 +11975,TEST,0,0 +11976,TEST,0,0 +11977,TEST,0,0 +11978,TEST,0,0 +11979,TEST,0,0 +11980,TEST,0,0 +11981,TEST,0,0 +11982,TEST,0,0 +11983,TEST,0,0 +11984,TEST,0,0 +11985,TEST,0,0 +11986,TEST,0,0 +11987,TEST,0,0 +11988,TEST,0,0 +11989,TEST,0,0 +11990,TEST,0,0 +11991,TEST,0,0 +11992,TEST,0,0 +11993,TEST,0,0 +11994,TEST,0,0 +11995,TEST,0,0 +11996,TEST,0,0 +11997,TEST,0,0 +11998,TEST,0,0 +11999,TEST,0,0 +12000,TEST,0,0 +12001,TEST,0,0 +12002,TEST,0,0 +12003,TEST,0,0 +12004,TEST,0,0 +12005,TEST,0,0 +12006,TEST,0,0 +12007,TEST,0,0 +12008,TEST,0,0 +12009,TEST,0,0 +12010,TEST,0,0 +12011,TEST,0,0 +12012,TEST,0,0 +12013,TEST,0,0 +12014,TEST,0,0 +12015,TEST,0,0 +12016,TEST,0,0 +12017,TEST,0,0 +12018,TEST,0,0 +12019,TEST,0,0 +12020,TEST,0,0 +12021,TEST,0,0 +12022,TEST,0,0 +12023,TEST,0,0 +12024,TEST,0,0 +12025,TEST,0,0 +12026,TEST,0,0 +12027,TEST,0,0 +12028,TEST,0,0 +12029,TEST,0,0 +12030,TEST,0,0 +12031,TEST,0,0 +12032,TEST,0,0 +12033,TEST,0,0 +12034,TEST,0,0 +12035,TEST,0,0 +12036,TEST,0,0 +12037,TEST,0,0 +12038,TEST,0,0 +12039,TEST,0,0 +12040,TEST,0,0 +12041,TEST,0,0 +12042,TEST,0,0 +12043,TEST,0,0 +12044,TEST,0,0 +12045,TEST,0,0 +12046,TEST,0,0 +12047,TEST,0,0 +12048,TEST,0,0 +12049,TEST,0,0 +12050,TEST,0,0 +12051,TEST,0,0 +12052,TEST,0,0 +12053,TEST,0,0 +12054,TEST,0,0 +12055,TEST,0,0 +12056,TEST,0,0 +12057,TEST,0,0 +12058,TEST,0,0 +12059,TEST,0,0 +12060,TEST,0,0 +12061,TEST,0,0 +12062,TEST,0,0 +12063,TEST,0,0 +12064,TEST,0,0 +12065,TEST,0,0 +12066,TEST,0,0 +12067,TEST,0,0 +12068,TEST,0,0 +12069,TEST,0,0 
+12070,TEST,0,0 +12071,TEST,0,0 +12072,TEST,0,0 +12073,TEST,0,0 +12074,TEST,0,0 +12075,TEST,0,0 +12076,TEST,0,0 +12077,TEST,0,0 +12078,TEST,0,0 +12079,TEST,0,0 +12080,TEST,0,0 +12081,TEST,0,0 +12082,TEST,0,0 +12083,TEST,0,0 +12084,TEST,0,0 +12085,TEST,0,0 +12086,TEST,0,0 +12087,TEST,0,0 +12088,TEST,0,0 +12089,TEST,0,0 +12090,TEST,0,0 +12091,TEST,0,0 +12092,TEST,0,0 +12093,TEST,0,0 +12094,TEST,0,0 +12095,TEST,0,0 +12096,TEST,0,0 +12097,TEST,0,0 +12098,TEST,0,0 +12099,TEST,0,0 +12100,TEST,0,0 +12101,TEST,0,0 +12102,TEST,0,0 +12103,TEST,0,0 +12104,TEST,0,0 +12105,TEST,0,0 +12106,TEST,0,0 +12107,TEST,0,0 +12108,TEST,0,0 +12109,TEST,0,0 +12110,TEST,0,0 +12111,TEST,0,0 +12112,TEST,0,0 +12113,TEST,0,0 +12114,TEST,0,0 +12115,TEST,0,0 +12116,TEST,0,0 +12117,TEST,0,0 +12118,TEST,0,0 +12119,TEST,0,0 +12120,TEST,0,0 +12121,TEST,0,0 +12122,TEST,0,0 +12123,TEST,0,0 +12124,TEST,0,0 +12125,TEST,0,0 +12126,TEST,0,0 +12127,TEST,0,0 +12128,TEST,0,0 +12129,TEST,0,0 +12130,TEST,0,0 +12131,TEST,0,0 +12132,TEST,0,0 +12133,TEST,0,0 +12134,TEST,0,0 +12135,TEST,0,0 +12136,TEST,0,0 +12137,TEST,0,0 +12138,TEST,0,0 +12139,TEST,0,0 +12140,TEST,0,0 +12141,TEST,0,0 +12142,TEST,0,0 +12143,TEST,0,0 +12144,TEST,0,0 +12145,TEST,0,0 +12146,TEST,0,0 +12147,TEST,0,0 +12148,TEST,0,0 +12149,TEST,0,0 +12150,TEST,0,0 +12151,TEST,0,0 +12152,TEST,0,0 +12153,TEST,0,0 +12154,TEST,0,0 +12155,TEST,0,0 +12156,TEST,0,0 +12157,TEST,0,0 +12158,TEST,0,0 +12159,TEST,0,0 +12160,TEST,0,0 +12161,TEST,0,0 +12162,TEST,0,0 +12163,TEST,0,0 +12164,TEST,0,0 +12165,TEST,0,0 +12166,TEST,0,0 +12167,TEST,0,0 +12168,TEST,0,0 +12169,TEST,0,0 +12170,TEST,0,0 +12171,TEST,0,0 +12172,TEST,0,0 +12173,TEST,0,0 +12174,TEST,0,0 +12175,TEST,0,0 +12176,TEST,0,0 +12177,TEST,0,0 +12178,TEST,0,0 +12179,TEST,0,0 +12180,TEST,0,0 +12181,TEST,0,0 +12182,TEST,0,0 +12183,TEST,0,0 +12184,TEST,0,0 +12185,TEST,0,0 +12186,TEST,0,0 +12187,TEST,0,0 +12188,TEST,0,0 +12189,TEST,0,0 +12190,TEST,0,0 +12191,TEST,0,0 +12192,TEST,0,0 +12193,TEST,0,0 +12194,TEST,0,0 +12195,TEST,0,0 +12196,TEST,0,0 +12197,TEST,0,0 +12198,TEST,0,0 +12199,TEST,0,0 +12200,TEST,0,0 +12201,TEST,0,0 +12202,TEST,0,0 +12203,TEST,0,0 +12204,TEST,0,0 +12205,TEST,0,0 +12206,TEST,0,0 +12207,TEST,0,0 +12208,TEST,0,0 +12209,TEST,0,0 +12210,TEST,0,0 +12211,TEST,0,0 +12212,TEST,0,0 +12213,TEST,0,0 +12214,TEST,0,0 +12215,TEST,0,0 +12216,TEST,0,0 +12217,TEST,0,0 +12218,TEST,0,0 +12219,TEST,0,0 +12220,TEST,0,0 +12221,TEST,0,0 +12222,TEST,0,0 +12223,TEST,0,0 +12224,TEST,0,0 +12225,TEST,0,0 +12226,TEST,0,0 +12227,TEST,0,0 +12228,TEST,0,0 +12229,TEST,0,0 +12230,TEST,0,0 +12231,TEST,0,0 +12232,TEST,0,0 +12233,TEST,0,0 +12234,TEST,0,0 +12235,TEST,0,0 +12236,TEST,0,0 +12237,TEST,0,0 +12238,TEST,0,0 +12239,TEST,0,0 +12240,TEST,0,0 +12241,TEST,0,0 +12242,TEST,0,0 +12243,TEST,0,0 +12244,TEST,0,0 +12245,TEST,0,0 +12246,TEST,0,0 +12247,TEST,0,0 +12248,TEST,0,0 +12249,TEST,0,0 +12250,TEST,0,0 +12251,TEST,0,0 +12252,TEST,0,0 +12253,TEST,0,0 +12254,TEST,0,0 +12255,TEST,0,0 +12256,TEST,0,0 +12257,TEST,0,0 +12258,TEST,0,0 +12259,TEST,0,0 +12260,TEST,0,0 +12261,TEST,0,0 +12262,TEST,0,0 +12263,TEST,0,0 +12264,TEST,0,0 +12265,TEST,0,0 +12266,TEST,0,0 +12267,TEST,0,0 +12268,TEST,0,0 +12269,TEST,0,0 +12270,TEST,0,0 +12271,TEST,0,0 +12272,TEST,0,0 +12273,TEST,0,0 +12274,TEST,0,0 +12275,TEST,0,0 +12276,TEST,0,0 +12277,TEST,0,0 +12278,TEST,0,0 +12279,TEST,0,0 +12280,TEST,0,0 +12281,TEST,0,0 +12282,TEST,0,0 +12283,TEST,0,0 +12284,TEST,0,0 +12285,TEST,0,0 +12286,TEST,0,0 +12287,TEST,0,0 +12288,TEST,0,0 +12289,TEST,0,0 +12290,TEST,0,0 +12291,TEST,0,0 
+12292,TEST,0,0 +12293,TEST,0,0 +12294,TEST,0,0 +12295,TEST,0,0 +12296,TEST,0,0 +12297,TEST,0,0 +12298,TEST,0,0 +12299,TEST,0,0 +12300,TEST,0,0 +12301,TEST,0,0 +12302,TEST,0,0 +12303,TEST,0,0 +12304,TEST,0,0 +12305,TEST,0,0 +12306,TEST,0,0 +12307,TEST,0,0 +12308,TEST,0,0 +12309,TEST,0,0 +12310,TEST,0,0 +12311,TEST,0,0 +12312,TEST,0,0 +12313,TEST,0,0 +12314,TEST,0,0 +12315,TEST,0,0 +12316,TEST,0,0 +12317,TEST,0,0 +12318,TEST,0,0 +12319,TEST,0,0 +12320,TEST,0,0 +12321,TEST,0,0 +12322,TEST,0,0 +12323,TEST,0,0 +12324,TEST,0,0 +12325,TEST,0,0 +12326,TEST,0,0 +12327,TEST,0,0 +12328,TEST,0,0 +12329,TEST,0,0 +12330,TEST,0,0 +12331,TEST,0,0 +12332,TEST,0,0 +12333,TEST,0,0 +12334,TEST,0,0 +12335,TEST,0,0 +12336,TEST,0,0 +12337,TEST,0,0 +12338,TEST,0,0 +12339,TEST,0,0 +12340,TEST,0,0 +12341,TEST,0,0 +12342,TEST,0,0 +12343,TEST,0,0 +12344,TEST,0,0 +12345,TEST,0,0 +12346,TEST,0,0 +12347,TEST,0,0 +12348,TEST,0,0 +12349,TEST,0,0 +12350,TEST,0,0 +12351,TEST,0,0 +12352,TEST,0,0 +12353,TEST,0,0 +12354,TEST,0,0 +12355,TEST,0,0 +12356,TEST,0,0 +12357,TEST,0,0 +12358,TEST,0,0 +12359,TEST,0,0 +12360,TEST,0,0 +12361,TEST,0,0 +12362,TEST,0,0 +12363,TEST,0,0 +12364,TEST,0,0 +12365,TEST,0,0 +12366,TEST,0,0 +12367,TEST,0,0 +12368,TEST,0,0 +12369,TEST,0,0 +12370,TEST,0,0 +12371,TEST,0,0 +12372,TEST,0,0 +12373,TEST,0,0 +12374,TEST,0,0 +12375,TEST,0,0 +12376,TEST,0,0 +12377,TEST,0,0 +12378,TEST,0,0 +12379,TEST,0,0 +12380,TEST,0,0 +12381,TEST,0,0 +12382,TEST,0,0 +12383,TEST,0,0 +12384,TEST,0,0 +12385,TEST,0,0 +12386,TEST,0,0 +12387,TEST,0,0 +12388,TEST,0,0 +12389,TEST,0,0 +12390,TEST,0,0 +12391,TEST,0,0 +12392,TEST,0,0 +12393,TEST,0,0 +12394,TEST,0,0 +12395,TEST,0,0 +12396,TEST,0,0 +12397,TEST,0,0 +12398,TEST,0,0 +12399,TEST,0,0 +12400,TEST,0,0 +12401,TEST,0,0 +12402,TEST,0,0 +12403,TEST,0,0 +12404,TEST,0,0 +12405,TEST,0,0 +12406,TEST,0,0 +12407,TEST,0,0 +12408,TEST,0,0 +12409,TEST,0,0 +12410,TEST,0,0 +12411,TEST,0,0 +12412,TEST,0,0 +12413,TEST,0,0 +12414,TEST,0,0 +12415,TEST,0,0 +12416,TEST,0,0 +12417,TEST,0,0 +12418,TEST,0,0 +12419,TEST,0,0 +12420,TEST,0,0 +12421,TEST,0,0 +12422,TEST,0,0 +12423,TEST,0,0 +12424,TEST,0,0 +12425,TEST,0,0 +12426,TEST,0,0 +12427,TEST,0,0 +12428,TEST,0,0 +12429,TEST,0,0 +12430,TEST,0,0 +12431,TEST,0,0 +12432,TEST,0,0 +12433,TEST,0,0 +12434,TEST,0,0 +12435,TEST,0,0 +12436,TEST,0,0 +12437,TEST,0,0 +12438,TEST,0,0 +12439,TEST,0,0 +12440,TEST,0,0 +12441,TEST,0,0 +12442,TEST,0,0 +12443,TEST,0,0 +12444,TEST,0,0 +12445,TEST,0,0 +12446,TEST,0,0 +12447,TEST,0,0 +12448,TEST,0,0 +12449,TEST,0,0 +12450,TEST,0,0 +12451,TEST,0,0 +12452,TEST,0,0 +12453,TEST,0,0 +12454,TEST,0,0 +12455,TEST,0,0 +12456,TEST,0,0 +12457,TEST,0,0 +12458,TEST,0,0 +12459,TEST,0,0 +12460,TEST,0,0 +12461,TEST,0,0 +12462,TEST,0,0 +12463,TEST,0,0 +12464,TEST,0,0 +12465,TEST,0,0 +12466,TEST,0,0 +12467,TEST,0,0 +12468,TEST,0,0 +12469,TEST,0,0 +12470,TEST,0,0 +12471,TEST,0,0 +12472,TEST,0,0 +12473,TEST,0,0 +12474,TEST,0,0 +12475,TEST,0,0 +12476,TEST,0,0 +12477,TEST,0,0 +12478,TEST,0,0 +12479,TEST,0,0 +12480,TEST,0,0 +12481,TEST,0,0 +12482,TEST,0,0 +12483,TEST,0,0 +12484,TEST,0,0 +12485,TEST,0,0 +12486,TEST,0,0 +12487,TEST,0,0 +12488,TEST,0,0 +12489,TEST,0,0 +12490,TEST,0,0 +12491,TEST,0,0 +12492,TEST,0,0 +12493,TEST,0,0 +12494,TEST,0,0 +12495,TEST,0,0 +12496,TEST,0,0 +12497,TEST,0,0 +12498,TEST,0,0 +12499,TEST,0,0 +12500,TEST,0,0 +12501,TEST,0,0 +12502,TEST,0,0 +12503,TEST,0,0 +12504,TEST,0,0 +12505,TEST,0,0 +12506,TEST,0,0 +12507,TEST,0,0 +12508,TEST,0,0 +12509,TEST,0,0 +12510,TEST,0,0 +12511,TEST,0,0 +12512,TEST,0,0 +12513,TEST,0,0 
+12514,TEST,0,0 +12515,TEST,0,0 +12516,TEST,0,0 +12517,TEST,0,0 +12518,TEST,0,0 +12519,TEST,0,0 +12520,TEST,0,0 +12521,TEST,0,0 +12522,TEST,0,0 +12523,TEST,0,0 +12524,TEST,0,0 +12525,TEST,0,0 +12526,TEST,0,0 +12527,TEST,0,0 +12528,TEST,0,0 +12529,TEST,0,0 +12530,TEST,0,0 +12531,TEST,0,0 +12532,TEST,0,0 +12533,TEST,0,0 +12534,TEST,0,0 +12535,TEST,0,0 +12536,TEST,0,0 +12537,TEST,0,0 +12538,TEST,0,0 +12539,TEST,0,0 +12540,TEST,0,0 +12541,TEST,0,0 +12542,TEST,0,0 +12543,TEST,0,0 +12544,TEST,0,0 +12545,TEST,0,0 +12546,TEST,0,0 +12547,TEST,0,0 +12548,TEST,0,0 +12549,TEST,0,0 +12550,TEST,0,0 +12551,TEST,0,0 +12552,TEST,0,0 +12553,TEST,0,0 +12554,TEST,0,0 +12555,TEST,0,0 +12556,TEST,0,0 +12557,TEST,0,0 +12558,TEST,0,0 +12559,TEST,0,0 +12560,TEST,0,0 +12561,TEST,0,0 +12562,TEST,0,0 +12563,TEST,0,0 +12564,TEST,0,0 +12565,TEST,0,0 +12566,TEST,0,0 +12567,TEST,0,0 +12568,TEST,0,0 +12569,TEST,0,0 +12570,TEST,0,0 +12571,TEST,0,0 +12572,TEST,0,0 +12573,TEST,0,0 +12574,TEST,0,0 +12575,TEST,0,0 +12576,TEST,0,0 +12577,TEST,0,0 +12578,TEST,0,0 +12579,TEST,0,0 +12580,TEST,0,0 +12581,TEST,0,0 +12582,TEST,0,0 +12583,TEST,0,0 +12584,TEST,0,0 +12585,TEST,0,0 +12586,TEST,0,0 +12587,TEST,0,0 +12588,TEST,0,0 +12589,TEST,0,0 +12590,TEST,0,0 +12591,TEST,0,0 +12592,TEST,0,0 +12593,TEST,0,0 +12594,TEST,0,0 +12595,TEST,0,0 +12596,TEST,0,0 +12597,TEST,0,0 +12598,TEST,0,0 +12599,TEST,0,0 +12600,TEST,0,0 +12601,TEST,0,0 +12602,TEST,0,0 +12603,TEST,0,0 +12604,TEST,0,0 +12605,TEST,0,0 +12606,TEST,0,0 +12607,TEST,0,0 +12608,TEST,0,0 +12609,TEST,0,0 +12610,TEST,0,0 +12611,TEST,0,0 +12612,TEST,0,0 +12613,TEST,0,0 +12614,TEST,0,0 +12615,TEST,0,0 +12616,TEST,0,0 +12617,TEST,0,0 +12618,TEST,0,0 +12619,TEST,0,0 +12620,TEST,0,0 +12621,TEST,0,0 +12622,TEST,0,0 +12623,TEST,0,0 +12624,TEST,0,0 +12625,TEST,0,0 +12626,TEST,0,0 +12627,TEST,0,0 +12628,TEST,0,0 +12629,TEST,0,0 +12630,TEST,0,0 +12631,TEST,0,0 +12632,TEST,0,0 +12633,TEST,0,0 +12634,TEST,0,0 +12635,TEST,0,0 +12636,TEST,0,0 +12637,TEST,0,0 +12638,TEST,0,0 +12639,TEST,0,0 +12640,TEST,0,0 +12641,TEST,0,0 +12642,TEST,0,0 +12643,TEST,0,0 +12644,TEST,0,0 +12645,TEST,0,0 +12646,TEST,0,0 +12647,TEST,0,0 +12648,TEST,0,0 +12649,TEST,0,0 +12650,TEST,0,0 +12651,TEST,0,0 +12652,TEST,0,0 +12653,TEST,0,0 +12654,TEST,0,0 +12655,TEST,0,0 +12656,TEST,0,0 +12657,TEST,0,0 +12658,TEST,0,0 +12659,TEST,0,0 +12660,TEST,0,0 +12661,TEST,0,0 +12662,TEST,0,0 +12663,TEST,0,0 +12664,TEST,0,0 +12665,TEST,0,0 +12666,TEST,0,0 +12667,TEST,0,0 +12668,TEST,0,0 +12669,TEST,0,0 +12670,TEST,0,0 +12671,TEST,0,0 +12672,TEST,0,0 +12673,TEST,0,0 +12674,TEST,0,0 +12675,TEST,0,0 +12676,TEST,0,0 +12677,TEST,0,0 +12678,TEST,0,0 +12679,TEST,0,0 +12680,TEST,0,0 +12681,TEST,0,0 +12682,TEST,0,0 +12683,TEST,0,0 +12684,TEST,0,0 +12685,TEST,0,0 +12686,TEST,0,0 +12687,TEST,0,0 +12688,TEST,0,0 +12689,TEST,0,0 +12690,TEST,0,0 +12691,TEST,0,0 +12692,TEST,0,0 +12693,TEST,0,0 +12694,TEST,0,0 +12695,TEST,0,0 +12696,TEST,0,0 +12697,TEST,0,0 +12698,TEST,0,0 +12699,TEST,0,0 +12700,TEST,0,0 +12701,TEST,0,0 +12702,TEST,0,0 +12703,TEST,0,0 +12704,TEST,0,0 +12705,TEST,0,0 +12706,TEST,0,0 +12707,TEST,0,0 +12708,TEST,0,0 +12709,TEST,0,0 +12710,TEST,0,0 +12711,TEST,0,0 +12712,TEST,0,0 +12713,TEST,0,0 +12714,TEST,0,0 +12715,TEST,0,0 +12716,TEST,0,0 +12717,TEST,0,0 +12718,TEST,0,0 +12719,TEST,0,0 +12720,TEST,0,0 +12721,TEST,0,0 +12722,TEST,0,0 +12723,TEST,0,0 +12724,TEST,0,0 +12725,TEST,0,0 +12726,TEST,0,0 +12727,TEST,0,0 +12728,TEST,0,0 +12729,TEST,0,0 +12730,TEST,0,0 +12731,TEST,0,0 +12732,TEST,0,0 +12733,TEST,0,0 +12734,TEST,0,0 +12735,TEST,0,0 
+12736,TEST,0,0 +12737,TEST,0,0 +12738,TEST,0,0 +12739,TEST,0,0 +12740,TEST,0,0 +12741,TEST,0,0 +12742,TEST,0,0 +12743,TEST,0,0 +12744,TEST,0,0 +12745,TEST,0,0 +12746,TEST,0,0 +12747,TEST,0,0 +12748,TEST,0,0 +12749,TEST,0,0 +12750,TEST,0,0 +12751,TEST,0,0 +12752,TEST,0,0 +12753,TEST,0,0 +12754,TEST,0,0 +12755,TEST,0,0 +12756,TEST,0,0 +12757,TEST,0,0 +12758,TEST,0,0 +12759,TEST,0,0 +12760,TEST,0,0 +12761,TEST,0,0 +12762,TEST,0,0 +12763,TEST,0,0 +12764,TEST,0,0 +12765,TEST,0,0 +12766,TEST,0,0 +12767,TEST,0,0 +12768,TEST,0,0 +12769,TEST,0,0 +12770,TEST,0,0 +12771,TEST,0,0 +12772,TEST,0,0 +12773,TEST,0,0 +12774,TEST,0,0 +12775,TEST,0,0 +12776,TEST,0,0 +12777,TEST,0,0 +12778,TEST,0,0 +12779,TEST,0,0 +12780,TEST,0,0 +12781,TEST,0,0 +12782,TEST,0,0 +12783,TEST,0,0 +12784,TEST,0,0 +12785,TEST,0,0 +12786,TEST,0,0 +12787,TEST,0,0 +12788,TEST,0,0 +12789,TEST,0,0 +12790,TEST,0,0 +12791,TEST,0,0 +12792,TEST,0,0 +12793,TEST,0,0 +12794,TEST,0,0 +12795,TEST,0,0 +12796,TEST,0,0 +12797,TEST,0,0 +12798,TEST,0,0 +12799,TEST,0,0 +12800,TEST,0,0 +12801,TEST,0,0 +12802,TEST,0,0 +12803,TEST,0,0 +12804,TEST,0,0 +12805,TEST,0,0 +12806,TEST,0,0 +12807,TEST,0,0 +12808,TEST,0,0 +12809,TEST,0,0 +12810,TEST,0,0 +12811,TEST,0,0 +12812,TEST,0,0 +12813,TEST,0,0 +12814,TEST,0,0 +12815,TEST,0,0 +12816,TEST,0,0 +12817,TEST,0,0 +12818,TEST,0,0 +12819,TEST,0,0 +12820,TEST,0,0 +12821,TEST,0,0 +12822,TEST,0,0 +12823,TEST,0,0 +12824,TEST,0,0 +12825,TEST,0,0 +12826,TEST,0,0 +12827,TEST,0,0 +12828,TEST,0,0 +12829,TEST,0,0 +12830,TEST,0,0 +12831,TEST,0,0 +12832,TEST,0,0 +12833,TEST,0,0 +12834,TEST,0,0 +12835,TEST,0,0 +12836,TEST,0,0 +12837,TEST,0,0 +12838,TEST,0,0 +12839,TEST,0,0 +12840,TEST,0,0 +12841,TEST,0,0 +12842,TEST,0,0 +12843,TEST,0,0 +12844,TEST,0,0 +12845,TEST,0,0 +12846,TEST,0,0 +12847,TEST,0,0 +12848,TEST,0,0 +12849,TEST,0,0 +12850,TEST,0,0 +12851,TEST,0,0 +12852,TEST,0,0 +12853,TEST,0,0 +12854,TEST,0,0 +12855,TEST,0,0 +12856,TEST,0,0 +12857,TEST,0,0 +12858,TEST,0,0 +12859,TEST,0,0 +12860,TEST,0,0 +12861,TEST,0,0 +12862,TEST,0,0 +12863,TEST,0,0 +12864,TEST,0,0 +12865,TEST,0,0 +12866,TEST,0,0 +12867,TEST,0,0 +12868,TEST,0,0 +12869,TEST,0,0 +12870,TEST,0,0 +12871,TEST,0,0 +12872,TEST,0,0 +12873,TEST,0,0 +12874,TEST,0,0 +12875,TEST,0,0 +12876,TEST,0,0 +12877,TEST,0,0 +12878,TEST,0,0 +12879,TEST,0,0 +12880,TEST,0,0 +12881,TEST,0,0 +12882,TEST,0,0 +12883,TEST,0,0 +12884,TEST,0,0 +12885,TEST,0,0 +12886,TEST,0,0 +12887,TEST,0,0 +12888,TEST,0,0 +12889,TEST,0,0 +12890,TEST,0,0 +12891,TEST,0,0 +12892,TEST,0,0 +12893,TEST,0,0 +12894,TEST,0,0 +12895,TEST,0,0 +12896,TEST,0,0 +12897,TEST,0,0 +12898,TEST,0,0 +12899,TEST,0,0 +12900,TEST,0,0 +12901,TEST,0,0 +12902,TEST,0,0 +12903,TEST,0,0 +12904,TEST,0,0 +12905,TEST,0,0 +12906,TEST,0,0 +12907,TEST,0,0 +12908,TEST,0,0 +12909,TEST,0,0 +12910,TEST,0,0 +12911,TEST,0,0 +12912,TEST,0,0 +12913,TEST,0,0 +12914,TEST,0,0 +12915,TEST,0,0 +12916,TEST,0,0 +12917,TEST,0,0 +12918,TEST,0,0 +12919,TEST,0,0 +12920,TEST,0,0 +12921,TEST,0,0 +12922,TEST,0,0 +12923,TEST,0,0 +12924,TEST,0,0 +12925,TEST,0,0 +12926,TEST,0,0 +12927,TEST,0,0 +12928,TEST,0,0 +12929,TEST,0,0 +12930,TEST,0,0 +12931,TEST,0,0 +12932,TEST,0,0 +12933,TEST,0,0 +12934,TEST,0,0 +12935,TEST,0,0 +12936,TEST,0,0 +12937,TEST,0,0 +12938,TEST,0,0 +12939,TEST,0,0 +12940,TEST,0,0 +12941,TEST,0,0 +12942,TEST,0,0 +12943,TEST,0,0 +12944,TEST,0,0 +12945,TEST,0,0 +12946,TEST,0,0 +12947,TEST,0,0 +12948,TEST,0,0 +12949,TEST,0,0 +12950,TEST,0,0 +12951,TEST,0,0 +12952,TEST,0,0 +12953,TEST,0,0 +12954,TEST,0,0 +12955,TEST,0,0 +12956,TEST,0,0 +12957,TEST,0,0 
+12958,TEST,0,0 +12959,TEST,0,0 +12960,TEST,0,0 +12961,TEST,0,0 +12962,TEST,0,0 +12963,TEST,0,0 +12964,TEST,0,0 +12965,TEST,0,0 +12966,TEST,0,0 +12967,TEST,0,0 +12968,TEST,0,0 +12969,TEST,0,0 +12970,TEST,0,0 +12971,TEST,0,0 +12972,TEST,0,0 +12973,TEST,0,0 +12974,TEST,0,0 +12975,TEST,0,0 +12976,TEST,0,0 +12977,TEST,0,0 +12978,TEST,0,0 +12979,TEST,0,0 +12980,TEST,0,0 +12981,TEST,0,0 +12982,TEST,0,0 +12983,TEST,0,0 +12984,TEST,0,0 +12985,TEST,0,0 +12986,TEST,0,0 +12987,TEST,0,0 +12988,TEST,0,0 +12989,TEST,0,0 +12990,TEST,0,0 +12991,TEST,0,0 +12992,TEST,0,0 +12993,TEST,0,0 +12994,TEST,0,0 +12995,TEST,0,0 +12996,TEST,0,0 +12997,TEST,0,0 +12998,TEST,0,0 +12999,TEST,0,0 +13000,TEST,0,0 +13001,TEST,0,0 +13002,TEST,0,0 +13003,TEST,0,0 +13004,TEST,0,0 +13005,TEST,0,0 +13006,TEST,0,0 +13007,TEST,0,0 +13008,TEST,0,0 +13009,TEST,0,0 +13010,TEST,0,0 +13011,TEST,0,0 +13012,TEST,0,0 +13013,TEST,0,0 +13014,TEST,0,0 +13015,TEST,0,0 +13016,TEST,0,0 +13017,TEST,0,0 +13018,TEST,0,0 +13019,TEST,0,0 +13020,TEST,0,0 +13021,TEST,0,0 +13022,TEST,0,0 +13023,TEST,0,0 +13024,TEST,0,0 +13025,TEST,0,0 +13026,TEST,0,0 +13027,TEST,0,0 +13028,TEST,0,0 +13029,TEST,0,0 +13030,TEST,0,0 +13031,TEST,0,0 +13032,TEST,0,0 +13033,TEST,0,0 +13034,TEST,0,0 +13035,TEST,0,0 +13036,TEST,0,0 +13037,TEST,0,0 +13038,TEST,0,0 +13039,TEST,0,0 +13040,TEST,0,0 +13041,TEST,0,0 +13042,TEST,0,0 +13043,TEST,0,0 +13044,TEST,0,0 +13045,TEST,0,0 +13046,TEST,0,0 +13047,TEST,0,0 +13048,TEST,0,0 +13049,TEST,0,0 +13050,TEST,0,0 +13051,TEST,0,0 +13052,TEST,0,0 +13053,TEST,0,0 +13054,TEST,0,0 +13055,TEST,0,0 +13056,TEST,0,0 +13057,TEST,0,0 +13058,TEST,0,0 +13059,TEST,0,0 +13060,TEST,0,0 +13061,TEST,0,0 +13062,TEST,0,0 +13063,TEST,0,0 +13064,TEST,0,0 +13065,TEST,0,0 +13066,TEST,0,0 +13067,TEST,0,0 +13068,TEST,0,0 +13069,TEST,0,0 +13070,TEST,0,0 +13071,TEST,0,0 +13072,TEST,0,0 +13073,TEST,0,0 +13074,TEST,0,0 +13075,TEST,0,0 +13076,TEST,0,0 +13077,TEST,0,0 +13078,TEST,0,0 +13079,TEST,0,0 +13080,TEST,0,0 +13081,TEST,0,0 +13082,TEST,0,0 +13083,TEST,0,0 +13084,TEST,0,0 +13085,TEST,0,0 +13086,TEST,0,0 +13087,TEST,0,0 +13088,TEST,0,0 +13089,TEST,0,0 +13090,TEST,0,0 +13091,TEST,0,0 +13092,TEST,0,0 +13093,TEST,0,0 +13094,TEST,0,0 +13095,TEST,0,0 +13096,TEST,0,0 +13097,TEST,0,0 +13098,TEST,0,0 +13099,TEST,0,0 +13100,TEST,0,0 +13101,TEST,0,0 +13102,TEST,0,0 +13103,TEST,0,0 +13104,TEST,0,0 +13105,TEST,0,0 +13106,TEST,0,0 +13107,TEST,0,0 +13108,TEST,0,0 +13109,TEST,0,0 +13110,TEST,0,0 +13111,TEST,0,0 +13112,TEST,0,0 +13113,TEST,0,0 +13114,TEST,0,0 +13115,TEST,0,0 +13116,TEST,0,0 +13117,TEST,0,0 +13118,TEST,0,0 +13119,TEST,0,0 +13120,TEST,0,0 +13121,TEST,0,0 +13122,TEST,0,0 +13123,TEST,0,0 +13124,TEST,0,0 +13125,TEST,0,0 +13126,TEST,0,0 +13127,TEST,0,0 +13128,TEST,0,0 +13129,TEST,0,0 +13130,TEST,0,0 +13131,TEST,0,0 +13132,TEST,0,0 +13133,TEST,0,0 +13134,TEST,0,0 +13135,TEST,0,0 +13136,TEST,0,0 +13137,TEST,0,0 +13138,TEST,0,0 +13139,TEST,0,0 +13140,TEST,0,0 +13141,TEST,0,0 +13142,TEST,0,0 +13143,TEST,0,0 +13144,TEST,0,0 +13145,TEST,0,0 +13146,TEST,0,0 +13147,TEST,0,0 +13148,TEST,0,0 +13149,TEST,0,0 +13150,TEST,0,0 +13151,TEST,0,0 +13152,TEST,0,0 +13153,TEST,0,0 +13154,TEST,0,0 +13155,TEST,0,0 +13156,TEST,0,0 +13157,TEST,0,0 +13158,TEST,0,0 +13159,TEST,0,0 +13160,TEST,0,0 +13161,TEST,0,0 +13162,TEST,0,0 +13163,TEST,0,0 +13164,TEST,0,0 +13165,TEST,0,0 +13166,TEST,0,0 +13167,TEST,0,0 +13168,TEST,0,0 +13169,TEST,0,0 +13170,TEST,0,0 +13171,TEST,0,0 +13172,TEST,0,0 +13173,TEST,0,0 +13174,TEST,0,0 +13175,TEST,0,0 +13176,TEST,0,0 +13177,TEST,0,0 +13178,TEST,0,0 +13179,TEST,0,0 
+13180,TEST,0,0 +13181,TEST,0,0 +13182,TEST,0,0 +13183,TEST,0,0 +13184,TEST,0,0 +13185,TEST,0,0 +13186,TEST,0,0 +13187,TEST,0,0 +13188,TEST,0,0 +13189,TEST,0,0 +13190,TEST,0,0 +13191,TEST,0,0 +13192,TEST,0,0 +13193,TEST,0,0 +13194,TEST,0,0 +13195,TEST,0,0 +13196,TEST,0,0 +13197,TEST,0,0 +13198,TEST,0,0 +13199,TEST,0,0 +13200,TEST,0,0 +13201,TEST,0,0 +13202,TEST,0,0 +13203,TEST,0,0 +13204,TEST,0,0 +13205,TEST,0,0 +13206,TEST,0,0 +13207,TEST,0,0 +13208,TEST,0,0 +13209,TEST,0,0 +13210,TEST,0,0 +13211,TEST,0,0 +13212,TEST,0,0 +13213,TEST,0,0 +13214,TEST,0,0 +13215,TEST,0,0 +13216,TEST,0,0 +13217,TEST,0,0 +13218,TEST,0,0 +13219,TEST,0,0 +13220,TEST,0,0 +13221,TEST,0,0 +13222,TEST,0,0 +13223,TEST,0,0 +13224,TEST,0,0 +13225,TEST,0,0 +13226,TEST,0,0 +13227,TEST,0,0 +13228,TEST,0,0 +13229,TEST,0,0 +13230,TEST,0,0 +13231,TEST,0,0 +13232,TEST,0,0 +13233,TEST,0,0 +13234,TEST,0,0 +13235,TEST,0,0 +13236,TEST,0,0 +13237,TEST,0,0 +13238,TEST,0,0 +13239,TEST,0,0 +13240,TEST,0,0 +13241,TEST,0,0 +13242,TEST,0,0 +13243,TEST,0,0 +13244,TEST,0,0 +13245,TEST,0,0 +13246,TEST,0,0 +13247,TEST,0,0 +13248,TEST,0,0 +13249,TEST,0,0 +13250,TEST,0,0 +13251,TEST,0,0 +13252,TEST,0,0 +13253,TEST,0,0 +13254,TEST,0,0 +13255,TEST,0,0 +13256,TEST,0,0 +13257,TEST,0,0 +13258,TEST,0,0 +13259,TEST,0,0 +13260,TEST,0,0 +13261,TEST,0,0 +13262,TEST,0,0 +13263,TEST,0,0 +13264,TEST,0,0 +13265,TEST,0,0 +13266,TEST,0,0 +13267,TEST,0,0 +13268,TEST,0,0 +13269,TEST,0,0 +13270,TEST,0,0 +13271,TEST,0,0 +13272,TEST,0,0 +13273,TEST,0,0 +13274,TEST,0,0 +13275,TEST,0,0 +13276,TEST,0,0 +13277,TEST,0,0 +13278,TEST,0,0 +13279,TEST,0,0 +13280,TEST,0,0 +13281,TEST,0,0 +13282,TEST,0,0 +13283,TEST,0,0 +13284,TEST,0,0 +13285,TEST,0,0 +13286,TEST,0,0 +13287,TEST,0,0 +13288,TEST,0,0 +13289,TEST,0,0 +13290,TEST,0,0 +13291,TEST,0,0 +13292,TEST,0,0 +13293,TEST,0,0 +13294,TEST,0,0 +13295,TEST,0,0 +13296,TEST,0,0 +13297,TEST,0,0 +13298,TEST,0,0 +13299,TEST,0,0 +13300,TEST,0,0 +13301,TEST,0,0 +13302,TEST,0,0 +13303,TEST,0,0 +13304,TEST,0,0 +13305,TEST,0,0 +13306,TEST,0,0 +13307,TEST,0,0 +13308,TEST,0,0 +13309,TEST,0,0 +13310,TEST,0,0 +13311,TEST,0,0 +13312,TEST,0,0 +13313,TEST,0,0 +13314,TEST,0,0 +13315,TEST,0,0 +13316,TEST,0,0 +13317,TEST,0,0 +13318,TEST,0,0 +13319,TEST,0,0 +13320,TEST,0,0 +13321,TEST,0,0 +13322,TEST,0,0 +13323,TEST,0,0 +13324,TEST,0,0 +13325,TEST,0,0 +13326,TEST,0,0 +13327,TEST,0,0 +13328,TEST,0,0 +13329,TEST,0,0 +13330,TEST,0,0 +13331,TEST,0,0 +13332,TEST,0,0 +13333,TEST,0,0 +13334,TEST,0,0 +13335,TEST,0,0 +13336,TEST,0,0 +13337,TEST,0,0 +13338,TEST,0,0 +13339,TEST,0,0 +13340,TEST,0,0 +13341,TEST,0,0 +13342,TEST,0,0 +13343,TEST,0,0 +13344,TEST,0,0 +13345,TEST,0,0 +13346,TEST,0,0 +13347,TEST,0,0 +13348,TEST,0,0 +13349,TEST,0,0 +13350,TEST,0,0 +13351,TEST,0,0 +13352,TEST,0,0 +13353,TEST,0,0 +13354,TEST,0,0 +13355,TEST,0,0 +13356,TEST,0,0 +13357,TEST,0,0 +13358,TEST,0,0 +13359,TEST,0,0 +13360,TEST,0,0 +13361,TEST,0,0 +13362,TEST,0,0 +13363,TEST,0,0 +13364,TEST,0,0 +13365,TEST,0,0 +13366,TEST,0,0 +13367,TEST,0,0 +13368,TEST,0,0 +13369,TEST,0,0 +13370,TEST,0,0 +13371,TEST,0,0 +13372,TEST,0,0 +13373,TEST,0,0 +13374,TEST,0,0 +13375,TEST,0,0 +13376,TEST,0,0 +13377,TEST,0,0 +13378,TEST,0,0 +13379,TEST,0,0 +13380,TEST,0,0 +13381,TEST,0,0 +13382,TEST,0,0 +13383,TEST,0,0 +13384,TEST,0,0 +13385,TEST,0,0 +13386,TEST,0,0 +13387,TEST,0,0 +13388,TEST,0,0 +13389,TEST,0,0 +13390,TEST,0,0 +13391,TEST,0,0 +13392,TEST,0,0 +13393,TEST,0,0 +13394,TEST,0,0 +13395,TEST,0,0 +13396,TEST,0,0 +13397,TEST,0,0 +13398,TEST,0,0 +13399,TEST,0,0 +13400,TEST,0,0 +13401,TEST,0,0 
+13402,TEST,0,0 +13403,TEST,0,0 +13404,TEST,0,0 +13405,TEST,0,0 +13406,TEST,0,0 +13407,TEST,0,0 +13408,TEST,0,0 +13409,TEST,0,0 +13410,TEST,0,0 +13411,TEST,0,0 +13412,TEST,0,0 +13413,TEST,0,0 +13414,TEST,0,0 +13415,TEST,0,0 +13416,TEST,0,0 +13417,TEST,0,0 +13418,TEST,0,0 +13419,TEST,0,0 +13420,TEST,0,0 +13421,TEST,0,0 +13422,TEST,0,0 +13423,TEST,0,0 +13424,TEST,0,0 +13425,TEST,0,0 +13426,TEST,0,0 +13427,TEST,0,0 +13428,TEST,0,0 +13429,TEST,0,0 +13430,TEST,0,0 +13431,TEST,0,0 +13432,TEST,0,0 +13433,TEST,0,0 +13434,TEST,0,0 +13435,TEST,0,0 +13436,TEST,0,0 +13437,TEST,0,0 +13438,TEST,0,0 +13439,TEST,0,0 +13440,TEST,0,0 +13441,TEST,0,0 +13442,TEST,0,0 +13443,TEST,0,0 +13444,TEST,0,0 +13445,TEST,0,0 +13446,TEST,0,0 +13447,TEST,0,0 +13448,TEST,0,0 +13449,TEST,0,0 +13450,TEST,0,0 +13451,TEST,0,0 +13452,TEST,0,0 +13453,TEST,0,0 +13454,TEST,0,0 +13455,TEST,0,0 +13456,TEST,0,0 +13457,TEST,0,0 +13458,TEST,0,0 +13459,TEST,0,0 +13460,TEST,0,0 +13461,TEST,0,0 +13462,TEST,0,0 +13463,TEST,0,0 +13464,TEST,0,0 +13465,TEST,0,0 +13466,TEST,0,0 +13467,TEST,0,0 +13468,TEST,0,0 +13469,TEST,0,0 +13470,TEST,0,0 +13471,TEST,0,0 +13472,TEST,0,0 +13473,TEST,0,0 +13474,TEST,0,0 +13475,TEST,0,0 +13476,TEST,0,0 +13477,TEST,0,0 +13478,TEST,0,0 +13479,TEST,0,0 +13480,TEST,0,0 +13481,TEST,0,0 +13482,TEST,0,0 +13483,TEST,0,0 +13484,TEST,0,0 +13485,TEST,0,0 +13486,TEST,0,0 +13487,TEST,0,0 +13488,TEST,0,0 +13489,TEST,0,0 +13490,TEST,0,0 +13491,TEST,0,0 +13492,TEST,0,0 +13493,TEST,0,0 +13494,TEST,0,0 +13495,TEST,0,0 +13496,TEST,0,0 +13497,TEST,0,0 +13498,TEST,0,0 +13499,TEST,0,0 +13500,TEST,0,0 +13501,TEST,0,0 +13502,TEST,0,0 +13503,TEST,0,0 +13504,TEST,0,0 +13505,TEST,0,0 +13506,TEST,0,0 +13507,TEST,0,0 +13508,TEST,0,0 +13509,TEST,0,0 +13510,TEST,0,0 +13511,TEST,0,0 +13512,TEST,0,0 +13513,TEST,0,0 +13514,TEST,0,0 +13515,TEST,0,0 +13516,TEST,0,0 +13517,TEST,0,0 +13518,TEST,0,0 +13519,TEST,0,0 +13520,TEST,0,0 +13521,TEST,0,0 +13522,TEST,0,0 +13523,TEST,0,0 +13524,TEST,0,0 +13525,TEST,0,0 +13526,TEST,0,0 +13527,TEST,0,0 +13528,TEST,0,0 +13529,TEST,0,0 +13530,TEST,0,0 +13531,TEST,0,0 +13532,TEST,0,0 +13533,TEST,0,0 +13534,TEST,0,0 +13535,TEST,0,0 +13536,TEST,0,0 +13537,TEST,0,0 +13538,TEST,0,0 +13539,TEST,0,0 +13540,TEST,0,0 +13541,TEST,0,0 +13542,TEST,0,0 +13543,TEST,0,0 +13544,TEST,0,0 +13545,TEST,0,0 +13546,TEST,0,0 +13547,TEST,0,0 +13548,TEST,0,0 +13549,TEST,0,0 +13550,TEST,0,0 +13551,TEST,0,0 +13552,TEST,0,0 +13553,TEST,0,0 +13554,TEST,0,0 +13555,TEST,0,0 +13556,TEST,0,0 +13557,TEST,0,0 +13558,TEST,0,0 +13559,TEST,0,0 +13560,TEST,0,0 +13561,TEST,0,0 +13562,TEST,0,0 +13563,TEST,0,0 +13564,TEST,0,0 +13565,TEST,0,0 +13566,TEST,0,0 +13567,TEST,0,0 +13568,TEST,0,0 +13569,TEST,0,0 +13570,TEST,0,0 +13571,TEST,0,0 +13572,TEST,0,0 +13573,TEST,0,0 +13574,TEST,0,0 +13575,TEST,0,0 +13576,TEST,0,0 +13577,TEST,0,0 +13578,TEST,0,0 +13579,TEST,0,0 +13580,TEST,0,0 +13581,TEST,0,0 +13582,TEST,0,0 +13583,TEST,0,0 +13584,TEST,0,0 +13585,TEST,0,0 +13586,TEST,0,0 +13587,TEST,0,0 +13588,TEST,0,0 +13589,TEST,0,0 +13590,TEST,0,0 +13591,TEST,0,0 +13592,TEST,0,0 +13593,TEST,0,0 +13594,TEST,0,0 +13595,TEST,0,0 +13596,TEST,0,0 +13597,TEST,0,0 +13598,TEST,0,0 +13599,TEST,0,0 +13600,TEST,0,0 +13601,TEST,0,0 +13602,TEST,0,0 +13603,TEST,0,0 +13604,TEST,0,0 +13605,TEST,0,0 +13606,TEST,0,0 +13607,TEST,0,0 +13608,TEST,0,0 +13609,TEST,0,0 +13610,TEST,0,0 +13611,TEST,0,0 +13612,TEST,0,0 +13613,TEST,0,0 +13614,TEST,0,0 +13615,TEST,0,0 +13616,TEST,0,0 +13617,TEST,0,0 +13618,TEST,0,0 +13619,TEST,0,0 +13620,TEST,0,0 +13621,TEST,0,0 +13622,TEST,0,0 +13623,TEST,0,0 
+13624,TEST,0,0 +13625,TEST,0,0 +13626,TEST,0,0 +13627,TEST,0,0 +13628,TEST,0,0 +13629,TEST,0,0 +13630,TEST,0,0 +13631,TEST,0,0 +13632,TEST,0,0 +13633,TEST,0,0 +13634,TEST,0,0 +13635,TEST,0,0 +13636,TEST,0,0 +13637,TEST,0,0 +13638,TEST,0,0 +13639,TEST,0,0 +13640,TEST,0,0 +13641,TEST,0,0 +13642,TEST,0,0 +13643,TEST,0,0 +13644,TEST,0,0 +13645,TEST,0,0 +13646,TEST,0,0 +13647,TEST,0,0 +13648,TEST,0,0 +13649,TEST,0,0 +13650,TEST,0,0 +13651,TEST,0,0 +13652,TEST,0,0 +13653,TEST,0,0 +13654,TEST,0,0 +13655,TEST,0,0 +13656,TEST,0,0 +13657,TEST,0,0 +13658,TEST,0,0 +13659,TEST,0,0 +13660,TEST,0,0 +13661,TEST,0,0 +13662,TEST,0,0 +13663,TEST,0,0 +13664,TEST,0,0 +13665,TEST,0,0 +13666,TEST,0,0 +13667,TEST,0,0 +13668,TEST,0,0 +13669,TEST,0,0 +13670,TEST,0,0 +13671,TEST,0,0 +13672,TEST,0,0 +13673,TEST,0,0 +13674,TEST,0,0 +13675,TEST,0,0 +13676,TEST,0,0 +13677,TEST,0,0 +13678,TEST,0,0 +13679,TEST,0,0 +13680,TEST,0,0 +13681,TEST,0,0 +13682,TEST,0,0 +13683,TEST,0,0 +13684,TEST,0,0 +13685,TEST,0,0 +13686,TEST,0,0 +13687,TEST,0,0 +13688,TEST,0,0 +13689,TEST,0,0 +13690,TEST,0,0 +13691,TEST,0,0 +13692,TEST,0,0 +13693,TEST,0,0 +13694,TEST,0,0 +13695,TEST,0,0 +13696,TEST,0,0 +13697,TEST,0,0 +13698,TEST,0,0 +13699,TEST,0,0 +13700,TEST,0,0 +13701,TEST,0,0 +13702,TEST,0,0 +13703,TEST,0,0 +13704,TEST,0,0 +13705,TEST,0,0 +13706,TEST,0,0 +13707,TEST,0,0 +13708,TEST,0,0 +13709,TEST,0,0 +13710,TEST,0,0 +13711,TEST,0,0 +13712,TEST,0,0 +13713,TEST,0,0 +13714,TEST,0,0 +13715,TEST,0,0 +13716,TEST,0,0 +13717,TEST,0,0 +13718,TEST,0,0 +13719,TEST,0,0 +13720,TEST,0,0 +13721,TEST,0,0 +13722,TEST,0,0 +13723,TEST,0,0 +13724,TEST,0,0 +13725,TEST,0,0 +13726,TEST,0,0 +13727,TEST,0,0 +13728,TEST,0,0 +13729,TEST,0,0 +13730,TEST,0,0 +13731,TEST,0,0 +13732,TEST,0,0 +13733,TEST,0,0 +13734,TEST,0,0 +13735,TEST,0,0 +13736,TEST,0,0 +13737,TEST,0,0 +13738,TEST,0,0 +13739,TEST,0,0 +13740,TEST,0,0 +13741,TEST,0,0 +13742,TEST,0,0 +13743,TEST,0,0 +13744,TEST,0,0 +13745,TEST,0,0 +13746,TEST,0,0 +13747,TEST,0,0 +13748,TEST,0,0 +13749,TEST,0,0 +13750,TEST,0,0 +13751,TEST,0,0 +13752,TEST,0,0 +13753,TEST,0,0 +13754,TEST,0,0 +13755,TEST,0,0 +13756,TEST,0,0 +13757,TEST,0,0 +13758,TEST,0,0 +13759,TEST,0,0 +13760,TEST,0,0 +13761,TEST,0,0 +13762,TEST,0,0 +13763,TEST,0,0 +13764,TEST,0,0 +13765,TEST,0,0 +13766,TEST,0,0 +13767,TEST,0,0 +13768,TEST,0,0 +13769,TEST,0,0 +13770,TEST,0,0 +13771,TEST,0,0 +13772,TEST,0,0 +13773,TEST,0,0 +13774,TEST,0,0 +13775,TEST,0,0 +13776,TEST,0,0 +13777,TEST,0,0 +13778,TEST,0,0 +13779,TEST,0,0 +13780,TEST,0,0 +13781,TEST,0,0 +13782,TEST,0,0 +13783,TEST,0,0 +13784,TEST,0,0 +13785,TEST,0,0 +13786,TEST,0,0 +13787,TEST,0,0 +13788,TEST,0,0 +13789,TEST,0,0 +13790,TEST,0,0 +13791,TEST,0,0 +13792,TEST,0,0 +13793,TEST,0,0 +13794,TEST,0,0 +13795,TEST,0,0 +13796,TEST,0,0 +13797,TEST,0,0 +13798,TEST,0,0 +13799,TEST,0,0 +13800,TEST,0,0 +13801,TEST,0,0 +13802,TEST,0,0 +13803,TEST,0,0 +13804,TEST,0,0 +13805,TEST,0,0 +13806,TEST,0,0 +13807,TEST,0,0 +13808,TEST,0,0 +13809,TEST,0,0 +13810,TEST,0,0 +13811,TEST,0,0 +13812,TEST,0,0 +13813,TEST,0,0 +13814,TEST,0,0 +13815,TEST,0,0 +13816,TEST,0,0 +13817,TEST,0,0 +13818,TEST,0,0 +13819,TEST,0,0 +13820,TEST,0,0 +13821,TEST,0,0 +13822,TEST,0,0 +13823,TEST,0,0 +13824,TEST,0,0 +13825,TEST,0,0 +13826,TEST,0,0 +13827,TEST,0,0 +13828,TEST,0,0 +13829,TEST,0,0 +13830,TEST,0,0 +13831,TEST,0,0 +13832,TEST,0,0 +13833,TEST,0,0 +13834,TEST,0,0 +13835,TEST,0,0 +13836,TEST,0,0 +13837,TEST,0,0 +13838,TEST,0,0 +13839,TEST,0,0 +13840,TEST,0,0 +13841,TEST,0,0 +13842,TEST,0,0 +13843,TEST,0,0 +13844,TEST,0,0 +13845,TEST,0,0 
+13846,TEST,0,0 +13847,TEST,0,0 +13848,TEST,0,0 +13849,TEST,0,0 +13850,TEST,0,0 +13851,TEST,0,0 +13852,TEST,0,0 +13853,TEST,0,0 +13854,TEST,0,0 +13855,TEST,0,0 +13856,TEST,0,0 +13857,TEST,0,0 +13858,TEST,0,0 +13859,TEST,0,0 +13860,TEST,0,0 +13861,TEST,0,0 +13862,TEST,0,0 +13863,TEST,0,0 +13864,TEST,0,0 +13865,TEST,0,0 +13866,TEST,0,0 +13867,TEST,0,0 +13868,TEST,0,0 +13869,TEST,0,0 +13870,TEST,0,0 +13871,TEST,0,0 +13872,TEST,0,0 +13873,TEST,0,0 +13874,TEST,0,0 +13875,TEST,0,0 +13876,TEST,0,0 +13877,TEST,0,0 +13878,TEST,0,0 +13879,TEST,0,0 +13880,TEST,0,0 +13881,TEST,0,0 +13882,TEST,0,0 +13883,TEST,0,0 +13884,TEST,0,0 +13885,TEST,0,0 +13886,TEST,0,0 +13887,TEST,0,0 +13888,TEST,0,0 +13889,TEST,0,0 +13890,TEST,0,0 +13891,TEST,0,0 +13892,TEST,0,0 +13893,TEST,0,0 +13894,TEST,0,0 +13895,TEST,0,0 +13896,TEST,0,0 +13897,TEST,0,0 +13898,TEST,0,0 +13899,TEST,0,0 +13900,TEST,0,0 +13901,TEST,0,0 +13902,TEST,0,0 +13903,TEST,0,0 +13904,TEST,0,0 +13905,TEST,0,0 +13906,TEST,0,0 +13907,TEST,0,0 +13908,TEST,0,0 +13909,TEST,0,0 +13910,TEST,0,0 +13911,TEST,0,0 +13912,TEST,0,0 +13913,TEST,0,0 +13914,TEST,0,0 +13915,TEST,0,0 +13916,TEST,0,0 +13917,TEST,0,0 +13918,TEST,0,0 +13919,TEST,0,0 +13920,TEST,0,0 +13921,TEST,0,0 +13922,TEST,0,0 +13923,TEST,0,0 +13924,TEST,0,0 +13925,TEST,0,0 +13926,TEST,0,0 +13927,TEST,0,0 +13928,TEST,0,0 +13929,TEST,0,0 +13930,TEST,0,0 +13931,TEST,0,0 +13932,TEST,0,0 +13933,TEST,0,0 +13934,TEST,0,0 +13935,TEST,0,0 +13936,TEST,0,0 +13937,TEST,0,0 +13938,TEST,0,0 +13939,TEST,0,0 +13940,TEST,0,0 +13941,TEST,0,0 +13942,TEST,0,0 +13943,TEST,0,0 +13944,TEST,0,0 +13945,TEST,0,0 +13946,TEST,0,0 +13947,TEST,0,0 +13948,TEST,0,0 +13949,TEST,0,0 +13950,TEST,0,0 +13951,TEST,0,0 +13952,TEST,0,0 +13953,TEST,0,0 +13954,TEST,0,0 +13955,TEST,0,0 +13956,TEST,0,0 +13957,TEST,0,0 +13958,TEST,0,0 +13959,TEST,0,0 +13960,TEST,0,0 +13961,TEST,0,0 +13962,TEST,0,0 +13963,TEST,0,0 +13964,TEST,0,0 +13965,TEST,0,0 +13966,TEST,0,0 +13967,TEST,0,0 +13968,TEST,0,0 +13969,TEST,0,0 +13970,TEST,0,0 +13971,TEST,0,0 +13972,TEST,0,0 +13973,TEST,0,0 +13974,TEST,0,0 +13975,TEST,0,0 +13976,TEST,0,0 +13977,TEST,0,0 +13978,TEST,0,0 +13979,TEST,0,0 +13980,TEST,0,0 +13981,TEST,0,0 +13982,TEST,0,0 +13983,TEST,0,0 +13984,TEST,0,0 +13985,TEST,0,0 +13986,TEST,0,0 +13987,TEST,0,0 +13988,TEST,0,0 +13989,TEST,0,0 +13990,TEST,0,0 +13991,TEST,0,0 +13992,TEST,0,0 +13993,TEST,0,0 +13994,TEST,0,0 +13995,TEST,0,0 +13996,TEST,0,0 +13997,TEST,0,0 +13998,TEST,0,0 +13999,TEST,0,0 +14000,TEST,0,0 +14001,TEST,0,0 +14002,TEST,0,0 +14003,TEST,0,0 +14004,TEST,0,0 +14005,TEST,0,0 +14006,TEST,0,0 +14007,TEST,0,0 +14008,TEST,0,0 +14009,TEST,0,0 +14010,TEST,0,0 +14011,TEST,0,0 +14012,TEST,0,0 +14013,TEST,0,0 +14014,TEST,0,0 +14015,TEST,0,0 +14016,TEST,0,0 +14017,TEST,0,0 +14018,TEST,0,0 +14019,TEST,0,0 +14020,TEST,0,0 +14021,TEST,0,0 +14022,TEST,0,0 +14023,TEST,0,0 +14024,TEST,0,0 +14025,TEST,0,0 +14026,TEST,0,0 +14027,TEST,0,0 +14028,TEST,0,0 +14029,TEST,0,0 +14030,TEST,0,0 +14031,TEST,0,0 +14032,TEST,0,0 +14033,TEST,0,0 +14034,TEST,0,0 +14035,TEST,0,0 +14036,TEST,0,0 +14037,TEST,0,0 +14038,TEST,0,0 +14039,TEST,0,0 +14040,TEST,0,0 +14041,TEST,0,0 +14042,TEST,0,0 +14043,TEST,0,0 +14044,TEST,0,0 +14045,TEST,0,0 +14046,TEST,0,0 +14047,TEST,0,0 +14048,TEST,0,0 +14049,TEST,0,0 +14050,TEST,0,0 +14051,TEST,0,0 +14052,TEST,0,0 +14053,TEST,0,0 diff --git a/datasets/anomaly_reserve/kpi/TEST/problem_TEST/problemDoc.json b/datasets/anomaly_reserve/kpi/TEST/problem_TEST/problemDoc.json new file mode 100644 index 0000000..1fd55ad --- /dev/null +++ 
b/datasets/anomaly_reserve/kpi/TEST/problem_TEST/problemDoc.json @@ -0,0 +1,65 @@ +{ + "about": { + "problemID": "kpi_problem", + "problemName": "kpi_problem", + "problemDescription": "Anomaly detection", + "problemVersion": "4.0.0", + "problemSchemaVersion": "4.0.0", + "taskKeywords": [ + "classification", + "binary", + "tabular" + ] + }, + "inputs": { + "data": [ + { + "datasetID": "kpi_dataset", + "targets": [ + { + "targetIndex": 0, + "resID": "learningData", + "colIndex": 3, + "colName": "ground_truth" + } + ] + } + ], + "dataSplits": { + "method": "holdOut", + "testSize": 0.2, + "stratified": true, + "numRepeats": 0, + "randomSeed": 42, + "splitsFile": "dataSplits.csv", + "datasetViewMaps": { + "train": [ + { + "from": "kpi_dataset", + "to": "kpi_dataset_TRAIN" + } + ], + "test": [ + { + "from": "kpi_dataset", + "to": "kpi_dataset_TEST" + } + ], + "score": [ + { + "from": "kpi_dataset", + "to": "kpi_dataset_SCORE" + } + ] + } + }, + "performanceMetrics": [ + { + "metric": "f1Macro" + } + ] + }, + "expectedOutputs": { + "predictionsFile": "predictions.csv" + } +} \ No newline at end of file diff --git a/datasets/anomaly_reserve/kpi/TRAIN/dataset_TRAIN/datasetDoc.json b/datasets/anomaly_reserve/kpi/TRAIN/dataset_TRAIN/datasetDoc.json new file mode 100644 index 0000000..a196e7d --- /dev/null +++ b/datasets/anomaly_reserve/kpi/TRAIN/dataset_TRAIN/datasetDoc.json @@ -0,0 +1,63 @@ +{ + "about": { + "datasetID": "kpi_dataset_TRAIN", + "datasetName": "NULL", + "description": "Database of baseball players and play statistics, including 'Games_played', 'At_bats', 'Runs', 'Hits', 'Doubles', 'Triples', 'Home_runs', 'RBIs', 'Walks', 'Strikeouts', 'Batting_average', 'On_base_pct', 'Slugging_pct' and 'Fielding_ave'", + "citation": " @book{simonoff2003analyzing,title={Analyzing Categorical Data},author={Simonoff, J.S.},isbn={9780387007496},lccn={2003044946},series={Springer Texts in Statistics},url={https://books.google.com/books?id=G8wrifweAoC},year={2003},publisher={Springer New York}} ", + "license": " CC Public Domain Mark 1.0 ", + "source": "OpenML", + "sourceURI": "http://www.openml.org/d/185", + "approximateSize": "", + "datasetSchemaVersion": "4.0.0", + "redacted": false, + "datasetVersion": "4.0.0" + }, + "dataResources": [ + { + "resID": "learningData", + "resPath": "tables/learningData.csv", + "resType": "table", + "resFormat": { + "text/csv": [ + "csv" + ] + }, + "isCollection": false, + "columns": [ + { + "colIndex": 0, + "colName": "d3mIndex", + "colType": "integer", + "role": [ + "index" + ] + }, + { + "colIndex": 1, + "colName": "timestamp", + "colType": "integer", + "role": [ + "attribute" + ] + }, + { + "colIndex": 2, + "colName": "value", + "colType": "real", + "role": [ + "attribute" + ] + }, + { + "colIndex": 3, + "colName": "ground_truth", + "colType": "integer", + "role": [ + "suggestedTarget" + ] + } + ], + "columnsCount": 4 + } + ] +} \ No newline at end of file diff --git a/datasets/anomaly_reserve/kpi/TRAIN/dataset_TRAIN/tables/learningData.csv.REMOVED.git-id b/datasets/anomaly_reserve/kpi/TRAIN/dataset_TRAIN/tables/learningData.csv.REMOVED.git-id new file mode 100644 index 0000000..7376921 --- /dev/null +++ b/datasets/anomaly_reserve/kpi/TRAIN/dataset_TRAIN/tables/learningData.csv.REMOVED.git-id @@ -0,0 +1 @@ +44db328c252a8156434142a37ef65765869e7548 \ No newline at end of file diff --git a/datasets/anomaly_reserve/kpi/TRAIN/problem_TRAIN/dataSplits.csv b/datasets/anomaly_reserve/kpi/TRAIN/problem_TRAIN/dataSplits.csv new file mode 100644 index 0000000..41a5012 --- 
/dev/null +++ b/datasets/anomaly_reserve/kpi/TRAIN/problem_TRAIN/dataSplits.csv @@ -0,0 +1,7028 @@ +d3mIndex,type,repeat,fold +0,TRAIN,0,0 +1,TRAIN,0,0 +2,TRAIN,0,0 +3,TRAIN,0,0 +4,TRAIN,0,0 +5,TRAIN,0,0 +6,TRAIN,0,0 +7,TRAIN,0,0 +8,TRAIN,0,0 +9,TRAIN,0,0 +10,TRAIN,0,0 +11,TRAIN,0,0 +12,TRAIN,0,0 +13,TRAIN,0,0 +14,TRAIN,0,0 +15,TRAIN,0,0 +16,TRAIN,0,0 +17,TRAIN,0,0 +18,TRAIN,0,0 +19,TRAIN,0,0 +20,TRAIN,0,0 +21,TRAIN,0,0 +22,TRAIN,0,0 +23,TRAIN,0,0 +24,TRAIN,0,0 +25,TRAIN,0,0 +26,TRAIN,0,0 +27,TRAIN,0,0 +28,TRAIN,0,0 +29,TRAIN,0,0 +30,TRAIN,0,0 +31,TRAIN,0,0 +32,TRAIN,0,0 +33,TRAIN,0,0 +34,TRAIN,0,0 +35,TRAIN,0,0 +36,TRAIN,0,0 +37,TRAIN,0,0 +38,TRAIN,0,0 +39,TRAIN,0,0 +40,TRAIN,0,0 +41,TRAIN,0,0 +42,TRAIN,0,0 +43,TRAIN,0,0 +44,TRAIN,0,0 +45,TRAIN,0,0 +46,TRAIN,0,0 +47,TRAIN,0,0 +48,TRAIN,0,0 +49,TRAIN,0,0 +50,TRAIN,0,0 +51,TRAIN,0,0 +52,TRAIN,0,0 +53,TRAIN,0,0 +54,TRAIN,0,0 +55,TRAIN,0,0 +56,TRAIN,0,0 +57,TRAIN,0,0 +58,TRAIN,0,0 +59,TRAIN,0,0 +60,TRAIN,0,0 +61,TRAIN,0,0 +62,TRAIN,0,0 +63,TRAIN,0,0 +64,TRAIN,0,0 +65,TRAIN,0,0 +66,TRAIN,0,0 +67,TRAIN,0,0 +68,TRAIN,0,0 +69,TRAIN,0,0 +70,TRAIN,0,0 +71,TRAIN,0,0 +72,TRAIN,0,0 +73,TRAIN,0,0 +74,TRAIN,0,0 +75,TRAIN,0,0 +76,TRAIN,0,0 +77,TRAIN,0,0 +78,TRAIN,0,0 +79,TRAIN,0,0 +80,TRAIN,0,0 +81,TRAIN,0,0 +82,TRAIN,0,0 +83,TRAIN,0,0 +84,TRAIN,0,0 +85,TRAIN,0,0 +86,TRAIN,0,0 +87,TRAIN,0,0 +88,TRAIN,0,0 +89,TRAIN,0,0 +90,TRAIN,0,0 +91,TRAIN,0,0 +92,TRAIN,0,0 +93,TRAIN,0,0 +94,TRAIN,0,0 +95,TRAIN,0,0 +96,TRAIN,0,0 +97,TRAIN,0,0 +98,TRAIN,0,0 +99,TRAIN,0,0 +100,TRAIN,0,0 +101,TRAIN,0,0 +102,TRAIN,0,0 +103,TRAIN,0,0 +104,TRAIN,0,0 +105,TRAIN,0,0 +106,TRAIN,0,0 +107,TRAIN,0,0 +108,TRAIN,0,0 +109,TRAIN,0,0 +110,TRAIN,0,0 +111,TRAIN,0,0 +112,TRAIN,0,0 +113,TRAIN,0,0 +114,TRAIN,0,0 +115,TRAIN,0,0 +116,TRAIN,0,0 +117,TRAIN,0,0 +118,TRAIN,0,0 +119,TRAIN,0,0 +120,TRAIN,0,0 +121,TRAIN,0,0 +122,TRAIN,0,0 +123,TRAIN,0,0 +124,TRAIN,0,0 +125,TRAIN,0,0 +126,TRAIN,0,0 +127,TRAIN,0,0 +128,TRAIN,0,0 +129,TRAIN,0,0 +130,TRAIN,0,0 +131,TRAIN,0,0 +132,TRAIN,0,0 +133,TRAIN,0,0 +134,TRAIN,0,0 +135,TRAIN,0,0 +136,TRAIN,0,0 +137,TRAIN,0,0 +138,TRAIN,0,0 +139,TRAIN,0,0 +140,TRAIN,0,0 +141,TRAIN,0,0 +142,TRAIN,0,0 +143,TRAIN,0,0 +144,TRAIN,0,0 +145,TRAIN,0,0 +146,TRAIN,0,0 +147,TRAIN,0,0 +148,TRAIN,0,0 +149,TRAIN,0,0 +150,TRAIN,0,0 +151,TRAIN,0,0 +152,TRAIN,0,0 +153,TRAIN,0,0 +154,TRAIN,0,0 +155,TRAIN,0,0 +156,TRAIN,0,0 +157,TRAIN,0,0 +158,TRAIN,0,0 +159,TRAIN,0,0 +160,TRAIN,0,0 +161,TRAIN,0,0 +162,TRAIN,0,0 +163,TRAIN,0,0 +164,TRAIN,0,0 +165,TRAIN,0,0 +166,TRAIN,0,0 +167,TRAIN,0,0 +168,TRAIN,0,0 +169,TRAIN,0,0 +170,TRAIN,0,0 +171,TRAIN,0,0 +172,TRAIN,0,0 +173,TRAIN,0,0 +174,TRAIN,0,0 +175,TRAIN,0,0 +176,TRAIN,0,0 +177,TRAIN,0,0 +178,TRAIN,0,0 +179,TRAIN,0,0 +180,TRAIN,0,0 +181,TRAIN,0,0 +182,TRAIN,0,0 +183,TRAIN,0,0 +184,TRAIN,0,0 +185,TRAIN,0,0 +186,TRAIN,0,0 +187,TRAIN,0,0 +188,TRAIN,0,0 +189,TRAIN,0,0 +190,TRAIN,0,0 +191,TRAIN,0,0 +192,TRAIN,0,0 +193,TRAIN,0,0 +194,TRAIN,0,0 +195,TRAIN,0,0 +196,TRAIN,0,0 +197,TRAIN,0,0 +198,TRAIN,0,0 +199,TRAIN,0,0 +200,TRAIN,0,0 +201,TRAIN,0,0 +202,TRAIN,0,0 +203,TRAIN,0,0 +204,TRAIN,0,0 +205,TRAIN,0,0 +206,TRAIN,0,0 +207,TRAIN,0,0 +208,TRAIN,0,0 +209,TRAIN,0,0 +210,TRAIN,0,0 +211,TRAIN,0,0 +212,TRAIN,0,0 +213,TRAIN,0,0 +214,TRAIN,0,0 +215,TRAIN,0,0 +216,TRAIN,0,0 +217,TRAIN,0,0 +218,TRAIN,0,0 +219,TRAIN,0,0 +220,TRAIN,0,0 +221,TRAIN,0,0 +222,TRAIN,0,0 +223,TRAIN,0,0 +224,TRAIN,0,0 +225,TRAIN,0,0 +226,TRAIN,0,0 +227,TRAIN,0,0 +228,TRAIN,0,0 +229,TRAIN,0,0 +230,TRAIN,0,0 +231,TRAIN,0,0 +232,TRAIN,0,0 +233,TRAIN,0,0 +234,TRAIN,0,0 
+235,TRAIN,0,0 +236,TRAIN,0,0 +237,TRAIN,0,0 +238,TRAIN,0,0 +239,TRAIN,0,0 +240,TRAIN,0,0 +241,TRAIN,0,0 +242,TRAIN,0,0 +243,TRAIN,0,0 +244,TRAIN,0,0 +245,TRAIN,0,0 +246,TRAIN,0,0 +247,TRAIN,0,0 +248,TRAIN,0,0 +249,TRAIN,0,0 +250,TRAIN,0,0 +251,TRAIN,0,0 +252,TRAIN,0,0 +253,TRAIN,0,0 +254,TRAIN,0,0 +255,TRAIN,0,0 +256,TRAIN,0,0 +257,TRAIN,0,0 +258,TRAIN,0,0 +259,TRAIN,0,0 +260,TRAIN,0,0 +261,TRAIN,0,0 +262,TRAIN,0,0 +263,TRAIN,0,0 +264,TRAIN,0,0 +265,TRAIN,0,0 +266,TRAIN,0,0 +267,TRAIN,0,0 +268,TRAIN,0,0 +269,TRAIN,0,0 +270,TRAIN,0,0 +271,TRAIN,0,0 +272,TRAIN,0,0 +273,TRAIN,0,0 +274,TRAIN,0,0 +275,TRAIN,0,0 +276,TRAIN,0,0 +277,TRAIN,0,0 +278,TRAIN,0,0 +279,TRAIN,0,0 +280,TRAIN,0,0 +281,TRAIN,0,0 +282,TRAIN,0,0 +283,TRAIN,0,0 +284,TRAIN,0,0 +285,TRAIN,0,0 +286,TRAIN,0,0 +287,TRAIN,0,0 +288,TRAIN,0,0 +289,TRAIN,0,0 +290,TRAIN,0,0 +291,TRAIN,0,0 +292,TRAIN,0,0 +293,TRAIN,0,0 +294,TRAIN,0,0 +295,TRAIN,0,0 +296,TRAIN,0,0 +297,TRAIN,0,0 +298,TRAIN,0,0 +299,TRAIN,0,0 +300,TRAIN,0,0 +301,TRAIN,0,0 +302,TRAIN,0,0 +303,TRAIN,0,0 +304,TRAIN,0,0 +305,TRAIN,0,0 +306,TRAIN,0,0 +307,TRAIN,0,0 +308,TRAIN,0,0 +309,TRAIN,0,0 +310,TRAIN,0,0 +311,TRAIN,0,0 +312,TRAIN,0,0 +313,TRAIN,0,0 +314,TRAIN,0,0 +315,TRAIN,0,0 +316,TRAIN,0,0 +317,TRAIN,0,0 +318,TRAIN,0,0 +319,TRAIN,0,0 +320,TRAIN,0,0 +321,TRAIN,0,0 +322,TRAIN,0,0 +323,TRAIN,0,0 +324,TRAIN,0,0 +325,TRAIN,0,0 +326,TRAIN,0,0 +327,TRAIN,0,0 +328,TRAIN,0,0 +329,TRAIN,0,0 +330,TRAIN,0,0 +331,TRAIN,0,0 +332,TRAIN,0,0 +333,TRAIN,0,0 +334,TRAIN,0,0 +335,TRAIN,0,0 +336,TRAIN,0,0 +337,TRAIN,0,0 +338,TRAIN,0,0 +339,TRAIN,0,0 +340,TRAIN,0,0 +341,TRAIN,0,0 +342,TRAIN,0,0 +343,TRAIN,0,0 +344,TRAIN,0,0 +345,TRAIN,0,0 +346,TRAIN,0,0 +347,TRAIN,0,0 +348,TRAIN,0,0 +349,TRAIN,0,0 +350,TRAIN,0,0 +351,TRAIN,0,0 +352,TRAIN,0,0 +353,TRAIN,0,0 +354,TRAIN,0,0 +355,TRAIN,0,0 +356,TRAIN,0,0 +357,TRAIN,0,0 +358,TRAIN,0,0 +359,TRAIN,0,0 +360,TRAIN,0,0 +361,TRAIN,0,0 +362,TRAIN,0,0 +363,TRAIN,0,0 +364,TRAIN,0,0 +365,TRAIN,0,0 +366,TRAIN,0,0 +367,TRAIN,0,0 +368,TRAIN,0,0 +369,TRAIN,0,0 +370,TRAIN,0,0 +371,TRAIN,0,0 +372,TRAIN,0,0 +373,TRAIN,0,0 +374,TRAIN,0,0 +375,TRAIN,0,0 +376,TRAIN,0,0 +377,TRAIN,0,0 +378,TRAIN,0,0 +379,TRAIN,0,0 +380,TRAIN,0,0 +381,TRAIN,0,0 +382,TRAIN,0,0 +383,TRAIN,0,0 +384,TRAIN,0,0 +385,TRAIN,0,0 +386,TRAIN,0,0 +387,TRAIN,0,0 +388,TRAIN,0,0 +389,TRAIN,0,0 +390,TRAIN,0,0 +391,TRAIN,0,0 +392,TRAIN,0,0 +393,TRAIN,0,0 +394,TRAIN,0,0 +395,TRAIN,0,0 +396,TRAIN,0,0 +397,TRAIN,0,0 +398,TRAIN,0,0 +399,TRAIN,0,0 +400,TRAIN,0,0 +401,TRAIN,0,0 +402,TRAIN,0,0 +403,TRAIN,0,0 +404,TRAIN,0,0 +405,TRAIN,0,0 +406,TRAIN,0,0 +407,TRAIN,0,0 +408,TRAIN,0,0 +409,TRAIN,0,0 +410,TRAIN,0,0 +411,TRAIN,0,0 +412,TRAIN,0,0 +413,TRAIN,0,0 +414,TRAIN,0,0 +415,TRAIN,0,0 +416,TRAIN,0,0 +417,TRAIN,0,0 +418,TRAIN,0,0 +419,TRAIN,0,0 +420,TRAIN,0,0 +421,TRAIN,0,0 +422,TRAIN,0,0 +423,TRAIN,0,0 +424,TRAIN,0,0 +425,TRAIN,0,0 +426,TRAIN,0,0 +427,TRAIN,0,0 +428,TRAIN,0,0 +429,TRAIN,0,0 +430,TRAIN,0,0 +431,TRAIN,0,0 +432,TRAIN,0,0 +433,TRAIN,0,0 +434,TRAIN,0,0 +435,TRAIN,0,0 +436,TRAIN,0,0 +437,TRAIN,0,0 +438,TRAIN,0,0 +439,TRAIN,0,0 +440,TRAIN,0,0 +441,TRAIN,0,0 +442,TRAIN,0,0 +443,TRAIN,0,0 +444,TRAIN,0,0 +445,TRAIN,0,0 +446,TRAIN,0,0 +447,TRAIN,0,0 +448,TRAIN,0,0 +449,TRAIN,0,0 +450,TRAIN,0,0 +451,TRAIN,0,0 +452,TRAIN,0,0 +453,TRAIN,0,0 +454,TRAIN,0,0 +455,TRAIN,0,0 +456,TRAIN,0,0 +457,TRAIN,0,0 +458,TRAIN,0,0 +459,TRAIN,0,0 +460,TRAIN,0,0 +461,TRAIN,0,0 +462,TRAIN,0,0 +463,TRAIN,0,0 +464,TRAIN,0,0 +465,TRAIN,0,0 +466,TRAIN,0,0 +467,TRAIN,0,0 +468,TRAIN,0,0 +469,TRAIN,0,0 +470,TRAIN,0,0 +471,TRAIN,0,0 
+472,TRAIN,0,0 +473,TRAIN,0,0 +474,TRAIN,0,0 +475,TRAIN,0,0 +476,TRAIN,0,0 +477,TRAIN,0,0 +478,TRAIN,0,0 +479,TRAIN,0,0 +480,TRAIN,0,0 +481,TRAIN,0,0 +482,TRAIN,0,0 +483,TRAIN,0,0 +484,TRAIN,0,0 +485,TRAIN,0,0 +486,TRAIN,0,0 +487,TRAIN,0,0 +488,TRAIN,0,0 +489,TRAIN,0,0 +490,TRAIN,0,0 +491,TRAIN,0,0 +492,TRAIN,0,0 +493,TRAIN,0,0 +494,TRAIN,0,0 +495,TRAIN,0,0 +496,TRAIN,0,0 +497,TRAIN,0,0 +498,TRAIN,0,0 +499,TRAIN,0,0 +500,TRAIN,0,0 +501,TRAIN,0,0 +502,TRAIN,0,0 +503,TRAIN,0,0 +504,TRAIN,0,0 +505,TRAIN,0,0 +506,TRAIN,0,0 +507,TRAIN,0,0 +508,TRAIN,0,0 +509,TRAIN,0,0 +510,TRAIN,0,0 +511,TRAIN,0,0 +512,TRAIN,0,0 +513,TRAIN,0,0 +514,TRAIN,0,0 +515,TRAIN,0,0 +516,TRAIN,0,0 +517,TRAIN,0,0 +518,TRAIN,0,0 +519,TRAIN,0,0 +520,TRAIN,0,0 +521,TRAIN,0,0 +522,TRAIN,0,0 +523,TRAIN,0,0 +524,TRAIN,0,0 +525,TRAIN,0,0 +526,TRAIN,0,0 +527,TRAIN,0,0 +528,TRAIN,0,0 +529,TRAIN,0,0 +530,TRAIN,0,0 +531,TRAIN,0,0 +532,TRAIN,0,0 +533,TRAIN,0,0 +534,TRAIN,0,0 +535,TRAIN,0,0 +536,TRAIN,0,0 +537,TRAIN,0,0 +538,TRAIN,0,0 +539,TRAIN,0,0 +540,TRAIN,0,0 +541,TRAIN,0,0 +542,TRAIN,0,0 +543,TRAIN,0,0 +544,TRAIN,0,0 +545,TRAIN,0,0 +546,TRAIN,0,0 +547,TRAIN,0,0 +548,TRAIN,0,0 +549,TRAIN,0,0 +550,TRAIN,0,0 +551,TRAIN,0,0 +552,TRAIN,0,0 +553,TRAIN,0,0 +554,TRAIN,0,0 +555,TRAIN,0,0 +556,TRAIN,0,0 +557,TRAIN,0,0 +558,TRAIN,0,0 +559,TRAIN,0,0 +560,TRAIN,0,0 +561,TRAIN,0,0 +562,TRAIN,0,0 +563,TRAIN,0,0 +564,TRAIN,0,0 +565,TRAIN,0,0 +566,TRAIN,0,0 +567,TRAIN,0,0 +568,TRAIN,0,0 +569,TRAIN,0,0 +570,TRAIN,0,0 +571,TRAIN,0,0 +572,TRAIN,0,0 +573,TRAIN,0,0 +574,TRAIN,0,0 +575,TRAIN,0,0 +576,TRAIN,0,0 +577,TRAIN,0,0 +578,TRAIN,0,0 +579,TRAIN,0,0 +580,TRAIN,0,0 +581,TRAIN,0,0 +582,TRAIN,0,0 +583,TRAIN,0,0 +584,TRAIN,0,0 +585,TRAIN,0,0 +586,TRAIN,0,0 +587,TRAIN,0,0 +588,TRAIN,0,0 +589,TRAIN,0,0 +590,TRAIN,0,0 +591,TRAIN,0,0 +592,TRAIN,0,0 +593,TRAIN,0,0 +594,TRAIN,0,0 +595,TRAIN,0,0 +596,TRAIN,0,0 +597,TRAIN,0,0 +598,TRAIN,0,0 +599,TRAIN,0,0 +600,TRAIN,0,0 +601,TRAIN,0,0 +602,TRAIN,0,0 +603,TRAIN,0,0 +604,TRAIN,0,0 +605,TRAIN,0,0 +606,TRAIN,0,0 +607,TRAIN,0,0 +608,TRAIN,0,0 +609,TRAIN,0,0 +610,TRAIN,0,0 +611,TRAIN,0,0 +612,TRAIN,0,0 +613,TRAIN,0,0 +614,TRAIN,0,0 +615,TRAIN,0,0 +616,TRAIN,0,0 +617,TRAIN,0,0 +618,TRAIN,0,0 +619,TRAIN,0,0 +620,TRAIN,0,0 +621,TRAIN,0,0 +622,TRAIN,0,0 +623,TRAIN,0,0 +624,TRAIN,0,0 +625,TRAIN,0,0 +626,TRAIN,0,0 +627,TRAIN,0,0 +628,TRAIN,0,0 +629,TRAIN,0,0 +630,TRAIN,0,0 +631,TRAIN,0,0 +632,TRAIN,0,0 +633,TRAIN,0,0 +634,TRAIN,0,0 +635,TRAIN,0,0 +636,TRAIN,0,0 +637,TRAIN,0,0 +638,TRAIN,0,0 +639,TRAIN,0,0 +640,TRAIN,0,0 +641,TRAIN,0,0 +642,TRAIN,0,0 +643,TRAIN,0,0 +644,TRAIN,0,0 +645,TRAIN,0,0 +646,TRAIN,0,0 +647,TRAIN,0,0 +648,TRAIN,0,0 +649,TRAIN,0,0 +650,TRAIN,0,0 +651,TRAIN,0,0 +652,TRAIN,0,0 +653,TRAIN,0,0 +654,TRAIN,0,0 +655,TRAIN,0,0 +656,TRAIN,0,0 +657,TRAIN,0,0 +658,TRAIN,0,0 +659,TRAIN,0,0 +660,TRAIN,0,0 +661,TRAIN,0,0 +662,TRAIN,0,0 +663,TRAIN,0,0 +664,TRAIN,0,0 +665,TRAIN,0,0 +666,TRAIN,0,0 +667,TRAIN,0,0 +668,TRAIN,0,0 +669,TRAIN,0,0 +670,TRAIN,0,0 +671,TRAIN,0,0 +672,TRAIN,0,0 +673,TRAIN,0,0 +674,TRAIN,0,0 +675,TRAIN,0,0 +676,TRAIN,0,0 +677,TRAIN,0,0 +678,TRAIN,0,0 +679,TRAIN,0,0 +680,TRAIN,0,0 +681,TRAIN,0,0 +682,TRAIN,0,0 +683,TRAIN,0,0 +684,TRAIN,0,0 +685,TRAIN,0,0 +686,TRAIN,0,0 +687,TRAIN,0,0 +688,TRAIN,0,0 +689,TRAIN,0,0 +690,TRAIN,0,0 +691,TRAIN,0,0 +692,TRAIN,0,0 +693,TRAIN,0,0 +694,TRAIN,0,0 +695,TRAIN,0,0 +696,TRAIN,0,0 +697,TRAIN,0,0 +698,TRAIN,0,0 +699,TRAIN,0,0 +700,TRAIN,0,0 +701,TRAIN,0,0 +702,TRAIN,0,0 +703,TRAIN,0,0 +704,TRAIN,0,0 +705,TRAIN,0,0 +706,TRAIN,0,0 +707,TRAIN,0,0 +708,TRAIN,0,0 
+709,TRAIN,0,0 +710,TRAIN,0,0 +711,TRAIN,0,0 +712,TRAIN,0,0 +713,TRAIN,0,0 +714,TRAIN,0,0 +715,TRAIN,0,0 +716,TRAIN,0,0 +717,TRAIN,0,0 +718,TRAIN,0,0 +719,TRAIN,0,0 +720,TRAIN,0,0 +721,TRAIN,0,0 +722,TRAIN,0,0 +723,TRAIN,0,0 +724,TRAIN,0,0 +725,TRAIN,0,0 +726,TRAIN,0,0 +727,TRAIN,0,0 +728,TRAIN,0,0 +729,TRAIN,0,0 +730,TRAIN,0,0 +731,TRAIN,0,0 +732,TRAIN,0,0 +733,TRAIN,0,0 +734,TRAIN,0,0 +735,TRAIN,0,0 +736,TRAIN,0,0 +737,TRAIN,0,0 +738,TRAIN,0,0 +739,TRAIN,0,0 +740,TRAIN,0,0 +741,TRAIN,0,0 +742,TRAIN,0,0 +743,TRAIN,0,0 +744,TRAIN,0,0 +745,TRAIN,0,0 +746,TRAIN,0,0 +747,TRAIN,0,0 +748,TRAIN,0,0 +749,TRAIN,0,0 +750,TRAIN,0,0 +751,TRAIN,0,0 +752,TRAIN,0,0 +753,TRAIN,0,0 +754,TRAIN,0,0 +755,TRAIN,0,0 +756,TRAIN,0,0 +757,TRAIN,0,0 +758,TRAIN,0,0 +759,TRAIN,0,0 +760,TRAIN,0,0 +761,TRAIN,0,0 +762,TRAIN,0,0 +763,TRAIN,0,0 +764,TRAIN,0,0 +765,TRAIN,0,0 +766,TRAIN,0,0 +767,TRAIN,0,0 +768,TRAIN,0,0 +769,TRAIN,0,0 +770,TRAIN,0,0 +771,TRAIN,0,0 +772,TRAIN,0,0 +773,TRAIN,0,0 +774,TRAIN,0,0 +775,TRAIN,0,0 +776,TRAIN,0,0 +777,TRAIN,0,0 +778,TRAIN,0,0 +779,TRAIN,0,0 +780,TRAIN,0,0 +781,TRAIN,0,0 +782,TRAIN,0,0 +783,TRAIN,0,0 +784,TRAIN,0,0 +785,TRAIN,0,0 +786,TRAIN,0,0 +787,TRAIN,0,0 +788,TRAIN,0,0 +789,TRAIN,0,0 +790,TRAIN,0,0 +791,TRAIN,0,0 +792,TRAIN,0,0 +793,TRAIN,0,0 +794,TRAIN,0,0 +795,TRAIN,0,0 +796,TRAIN,0,0 +797,TRAIN,0,0 +798,TRAIN,0,0 +799,TRAIN,0,0 +800,TRAIN,0,0 +801,TRAIN,0,0 +802,TRAIN,0,0 +803,TRAIN,0,0 +804,TRAIN,0,0 +805,TRAIN,0,0 +806,TRAIN,0,0 +807,TRAIN,0,0 +808,TRAIN,0,0 +809,TRAIN,0,0 +810,TRAIN,0,0 +811,TRAIN,0,0 +812,TRAIN,0,0 +813,TRAIN,0,0 +814,TRAIN,0,0 +815,TRAIN,0,0 +816,TRAIN,0,0 +817,TRAIN,0,0 +818,TRAIN,0,0 +819,TRAIN,0,0 +820,TRAIN,0,0 +821,TRAIN,0,0 +822,TRAIN,0,0 +823,TRAIN,0,0 +824,TRAIN,0,0 +825,TRAIN,0,0 +826,TRAIN,0,0 +827,TRAIN,0,0 +828,TRAIN,0,0 +829,TRAIN,0,0 +830,TRAIN,0,0 +831,TRAIN,0,0 +832,TRAIN,0,0 +833,TRAIN,0,0 +834,TRAIN,0,0 +835,TRAIN,0,0 +836,TRAIN,0,0 +837,TRAIN,0,0 +838,TRAIN,0,0 +839,TRAIN,0,0 +840,TRAIN,0,0 +841,TRAIN,0,0 +842,TRAIN,0,0 +843,TRAIN,0,0 +844,TRAIN,0,0 +845,TRAIN,0,0 +846,TRAIN,0,0 +847,TRAIN,0,0 +848,TRAIN,0,0 +849,TRAIN,0,0 +850,TRAIN,0,0 +851,TRAIN,0,0 +852,TRAIN,0,0 +853,TRAIN,0,0 +854,TRAIN,0,0 +855,TRAIN,0,0 +856,TRAIN,0,0 +857,TRAIN,0,0 +858,TRAIN,0,0 +859,TRAIN,0,0 +860,TRAIN,0,0 +861,TRAIN,0,0 +862,TRAIN,0,0 +863,TRAIN,0,0 +864,TRAIN,0,0 +865,TRAIN,0,0 +866,TRAIN,0,0 +867,TRAIN,0,0 +868,TRAIN,0,0 +869,TRAIN,0,0 +870,TRAIN,0,0 +871,TRAIN,0,0 +872,TRAIN,0,0 +873,TRAIN,0,0 +874,TRAIN,0,0 +875,TRAIN,0,0 +876,TRAIN,0,0 +877,TRAIN,0,0 +878,TRAIN,0,0 +879,TRAIN,0,0 +880,TRAIN,0,0 +881,TRAIN,0,0 +882,TRAIN,0,0 +883,TRAIN,0,0 +884,TRAIN,0,0 +885,TRAIN,0,0 +886,TRAIN,0,0 +887,TRAIN,0,0 +888,TRAIN,0,0 +889,TRAIN,0,0 +890,TRAIN,0,0 +891,TRAIN,0,0 +892,TRAIN,0,0 +893,TRAIN,0,0 +894,TRAIN,0,0 +895,TRAIN,0,0 +896,TRAIN,0,0 +897,TRAIN,0,0 +898,TRAIN,0,0 +899,TRAIN,0,0 +900,TRAIN,0,0 +901,TRAIN,0,0 +902,TRAIN,0,0 +903,TRAIN,0,0 +904,TRAIN,0,0 +905,TRAIN,0,0 +906,TRAIN,0,0 +907,TRAIN,0,0 +908,TRAIN,0,0 +909,TRAIN,0,0 +910,TRAIN,0,0 +911,TRAIN,0,0 +912,TRAIN,0,0 +913,TRAIN,0,0 +914,TRAIN,0,0 +915,TRAIN,0,0 +916,TRAIN,0,0 +917,TRAIN,0,0 +918,TRAIN,0,0 +919,TRAIN,0,0 +920,TRAIN,0,0 +921,TRAIN,0,0 +922,TRAIN,0,0 +923,TRAIN,0,0 +924,TRAIN,0,0 +925,TRAIN,0,0 +926,TRAIN,0,0 +927,TRAIN,0,0 +928,TRAIN,0,0 +929,TRAIN,0,0 +930,TRAIN,0,0 +931,TRAIN,0,0 +932,TRAIN,0,0 +933,TRAIN,0,0 +934,TRAIN,0,0 +935,TRAIN,0,0 +936,TRAIN,0,0 +937,TRAIN,0,0 +938,TRAIN,0,0 +939,TRAIN,0,0 +940,TRAIN,0,0 +941,TRAIN,0,0 +942,TRAIN,0,0 +943,TRAIN,0,0 +944,TRAIN,0,0 +945,TRAIN,0,0 
+946,TRAIN,0,0 +947,TRAIN,0,0 +948,TRAIN,0,0 +949,TRAIN,0,0 +950,TRAIN,0,0 +951,TRAIN,0,0 +952,TRAIN,0,0 +953,TRAIN,0,0 +954,TRAIN,0,0 +955,TRAIN,0,0 +956,TRAIN,0,0 +957,TRAIN,0,0 +958,TRAIN,0,0 +959,TRAIN,0,0 +960,TRAIN,0,0 +961,TRAIN,0,0 +962,TRAIN,0,0 +963,TRAIN,0,0 +964,TRAIN,0,0 +965,TRAIN,0,0 +966,TRAIN,0,0 +967,TRAIN,0,0 +968,TRAIN,0,0 +969,TRAIN,0,0 +970,TRAIN,0,0 +971,TRAIN,0,0 +972,TRAIN,0,0 +973,TRAIN,0,0 +974,TRAIN,0,0 +975,TRAIN,0,0 +976,TRAIN,0,0 +977,TRAIN,0,0 +978,TRAIN,0,0 +979,TRAIN,0,0 +980,TRAIN,0,0 +981,TRAIN,0,0 +982,TRAIN,0,0 +983,TRAIN,0,0 +984,TRAIN,0,0 +985,TRAIN,0,0 +986,TRAIN,0,0 +987,TRAIN,0,0 +988,TRAIN,0,0 +989,TRAIN,0,0 +990,TRAIN,0,0 +991,TRAIN,0,0 +992,TRAIN,0,0 +993,TRAIN,0,0 +994,TRAIN,0,0 +995,TRAIN,0,0 +996,TRAIN,0,0 +997,TRAIN,0,0 +998,TRAIN,0,0 +999,TRAIN,0,0 +1000,TRAIN,0,0 +1001,TRAIN,0,0 +1002,TRAIN,0,0 +1003,TRAIN,0,0 +1004,TRAIN,0,0 +1005,TRAIN,0,0 +1006,TRAIN,0,0 +1007,TRAIN,0,0 +1008,TRAIN,0,0 +1009,TRAIN,0,0 +1010,TRAIN,0,0 +1011,TRAIN,0,0 +1012,TRAIN,0,0 +1013,TRAIN,0,0 +1014,TRAIN,0,0 +1015,TRAIN,0,0 +1016,TRAIN,0,0 +1017,TRAIN,0,0 +1018,TRAIN,0,0 +1019,TRAIN,0,0 +1020,TRAIN,0,0 +1021,TRAIN,0,0 +1022,TRAIN,0,0 +1023,TRAIN,0,0 +1024,TRAIN,0,0 +1025,TRAIN,0,0 +1026,TRAIN,0,0 +1027,TRAIN,0,0 +1028,TRAIN,0,0 +1029,TRAIN,0,0 +1030,TRAIN,0,0 +1031,TRAIN,0,0 +1032,TRAIN,0,0 +1033,TRAIN,0,0 +1034,TRAIN,0,0 +1035,TRAIN,0,0 +1036,TRAIN,0,0 +1037,TRAIN,0,0 +1038,TRAIN,0,0 +1039,TRAIN,0,0 +1040,TRAIN,0,0 +1041,TRAIN,0,0 +1042,TRAIN,0,0 +1043,TRAIN,0,0 +1044,TRAIN,0,0 +1045,TRAIN,0,0 +1046,TRAIN,0,0 +1047,TRAIN,0,0 +1048,TRAIN,0,0 +1049,TRAIN,0,0 +1050,TRAIN,0,0 +1051,TRAIN,0,0 +1052,TRAIN,0,0 +1053,TRAIN,0,0 +1054,TRAIN,0,0 +1055,TRAIN,0,0 +1056,TRAIN,0,0 +1057,TRAIN,0,0 +1058,TRAIN,0,0 +1059,TRAIN,0,0 +1060,TRAIN,0,0 +1061,TRAIN,0,0 +1062,TRAIN,0,0 +1063,TRAIN,0,0 +1064,TRAIN,0,0 +1065,TRAIN,0,0 +1066,TRAIN,0,0 +1067,TRAIN,0,0 +1068,TRAIN,0,0 +1069,TRAIN,0,0 +1070,TRAIN,0,0 +1071,TRAIN,0,0 +1072,TRAIN,0,0 +1073,TRAIN,0,0 +1074,TRAIN,0,0 +1075,TRAIN,0,0 +1076,TRAIN,0,0 +1077,TRAIN,0,0 +1078,TRAIN,0,0 +1079,TRAIN,0,0 +1080,TRAIN,0,0 +1081,TRAIN,0,0 +1082,TRAIN,0,0 +1083,TRAIN,0,0 +1084,TRAIN,0,0 +1085,TRAIN,0,0 +1086,TRAIN,0,0 +1087,TRAIN,0,0 +1088,TRAIN,0,0 +1089,TRAIN,0,0 +1090,TRAIN,0,0 +1091,TRAIN,0,0 +1092,TRAIN,0,0 +1093,TRAIN,0,0 +1094,TRAIN,0,0 +1095,TRAIN,0,0 +1096,TRAIN,0,0 +1097,TRAIN,0,0 +1098,TRAIN,0,0 +1099,TRAIN,0,0 +1100,TRAIN,0,0 +1101,TRAIN,0,0 +1102,TRAIN,0,0 +1103,TRAIN,0,0 +1104,TRAIN,0,0 +1105,TRAIN,0,0 +1106,TRAIN,0,0 +1107,TRAIN,0,0 +1108,TRAIN,0,0 +1109,TRAIN,0,0 +1110,TRAIN,0,0 +1111,TRAIN,0,0 +1112,TRAIN,0,0 +1113,TRAIN,0,0 +1114,TRAIN,0,0 +1115,TRAIN,0,0 +1116,TRAIN,0,0 +1117,TRAIN,0,0 +1118,TRAIN,0,0 +1119,TRAIN,0,0 +1120,TRAIN,0,0 +1121,TRAIN,0,0 +1122,TRAIN,0,0 +1123,TRAIN,0,0 +1124,TRAIN,0,0 +1125,TRAIN,0,0 +1126,TRAIN,0,0 +1127,TRAIN,0,0 +1128,TRAIN,0,0 +1129,TRAIN,0,0 +1130,TRAIN,0,0 +1131,TRAIN,0,0 +1132,TRAIN,0,0 +1133,TRAIN,0,0 +1134,TRAIN,0,0 +1135,TRAIN,0,0 +1136,TRAIN,0,0 +1137,TRAIN,0,0 +1138,TRAIN,0,0 +1139,TRAIN,0,0 +1140,TRAIN,0,0 +1141,TRAIN,0,0 +1142,TRAIN,0,0 +1143,TRAIN,0,0 +1144,TRAIN,0,0 +1145,TRAIN,0,0 +1146,TRAIN,0,0 +1147,TRAIN,0,0 +1148,TRAIN,0,0 +1149,TRAIN,0,0 +1150,TRAIN,0,0 +1151,TRAIN,0,0 +1152,TRAIN,0,0 +1153,TRAIN,0,0 +1154,TRAIN,0,0 +1155,TRAIN,0,0 +1156,TRAIN,0,0 +1157,TRAIN,0,0 +1158,TRAIN,0,0 +1159,TRAIN,0,0 +1160,TRAIN,0,0 +1161,TRAIN,0,0 +1162,TRAIN,0,0 +1163,TRAIN,0,0 +1164,TRAIN,0,0 +1165,TRAIN,0,0 +1166,TRAIN,0,0 +1167,TRAIN,0,0 +1168,TRAIN,0,0 +1169,TRAIN,0,0 +1170,TRAIN,0,0 
+1171,TRAIN,0,0 +1172,TRAIN,0,0 +1173,TRAIN,0,0 +1174,TRAIN,0,0 +1175,TRAIN,0,0 +1176,TRAIN,0,0 +1177,TRAIN,0,0 +1178,TRAIN,0,0 +1179,TRAIN,0,0 +1180,TRAIN,0,0 +1181,TRAIN,0,0 +1182,TRAIN,0,0 +1183,TRAIN,0,0 +1184,TRAIN,0,0 +1185,TRAIN,0,0 +1186,TRAIN,0,0 +1187,TRAIN,0,0 +1188,TRAIN,0,0 +1189,TRAIN,0,0 +1190,TRAIN,0,0 +1191,TRAIN,0,0 +1192,TRAIN,0,0 +1193,TRAIN,0,0 +1194,TRAIN,0,0 +1195,TRAIN,0,0 +1196,TRAIN,0,0 +1197,TRAIN,0,0 +1198,TRAIN,0,0 +1199,TRAIN,0,0 +1200,TRAIN,0,0 +1201,TRAIN,0,0 +1202,TRAIN,0,0 +1203,TRAIN,0,0 +1204,TRAIN,0,0 +1205,TRAIN,0,0 +1206,TRAIN,0,0 +1207,TRAIN,0,0 +1208,TRAIN,0,0 +1209,TRAIN,0,0 +1210,TRAIN,0,0 +1211,TRAIN,0,0 +1212,TRAIN,0,0 +1213,TRAIN,0,0 +1214,TRAIN,0,0 +1215,TRAIN,0,0 +1216,TRAIN,0,0 +1217,TRAIN,0,0 +1218,TRAIN,0,0 +1219,TRAIN,0,0 +1220,TRAIN,0,0 +1221,TRAIN,0,0 +1222,TRAIN,0,0 +1223,TRAIN,0,0 +1224,TRAIN,0,0 +1225,TRAIN,0,0 +1226,TRAIN,0,0 +1227,TRAIN,0,0 +1228,TRAIN,0,0 +1229,TRAIN,0,0 +1230,TRAIN,0,0 +1231,TRAIN,0,0 +1232,TRAIN,0,0 +1233,TRAIN,0,0 +1234,TRAIN,0,0 +1235,TRAIN,0,0 +1236,TRAIN,0,0 +1237,TRAIN,0,0 +1238,TRAIN,0,0 +1239,TRAIN,0,0 +1240,TRAIN,0,0 +1241,TRAIN,0,0 +1242,TRAIN,0,0 +1243,TRAIN,0,0 +1244,TRAIN,0,0 +1245,TRAIN,0,0 +1246,TRAIN,0,0 +1247,TRAIN,0,0 +1248,TRAIN,0,0 +1249,TRAIN,0,0 +1250,TRAIN,0,0 +1251,TRAIN,0,0 +1252,TRAIN,0,0 +1253,TRAIN,0,0 +1254,TRAIN,0,0 +1255,TRAIN,0,0 +1256,TRAIN,0,0 +1257,TRAIN,0,0 +1258,TRAIN,0,0 +1259,TRAIN,0,0 +1260,TRAIN,0,0 +1261,TRAIN,0,0 +1262,TRAIN,0,0 +1263,TRAIN,0,0 +1264,TRAIN,0,0 +1265,TRAIN,0,0 +1266,TRAIN,0,0 +1267,TRAIN,0,0 +1268,TRAIN,0,0 +1269,TRAIN,0,0 +1270,TRAIN,0,0 +1271,TRAIN,0,0 +1272,TRAIN,0,0 +1273,TRAIN,0,0 +1274,TRAIN,0,0 +1275,TRAIN,0,0 +1276,TRAIN,0,0 +1277,TRAIN,0,0 +1278,TRAIN,0,0 +1279,TRAIN,0,0 +1280,TRAIN,0,0 +1281,TRAIN,0,0 +1282,TRAIN,0,0 +1283,TRAIN,0,0 +1284,TRAIN,0,0 +1285,TRAIN,0,0 +1286,TRAIN,0,0 +1287,TRAIN,0,0 +1288,TRAIN,0,0 +1289,TRAIN,0,0 +1290,TRAIN,0,0 +1291,TRAIN,0,0 +1292,TRAIN,0,0 +1293,TRAIN,0,0 +1294,TRAIN,0,0 +1295,TRAIN,0,0 +1296,TRAIN,0,0 +1297,TRAIN,0,0 +1298,TRAIN,0,0 +1299,TRAIN,0,0 +1300,TRAIN,0,0 +1301,TRAIN,0,0 +1302,TRAIN,0,0 +1303,TRAIN,0,0 +1304,TRAIN,0,0 +1305,TRAIN,0,0 +1306,TRAIN,0,0 +1307,TRAIN,0,0 +1308,TRAIN,0,0 +1309,TRAIN,0,0 +1310,TRAIN,0,0 +1311,TRAIN,0,0 +1312,TRAIN,0,0 +1313,TRAIN,0,0 +1314,TRAIN,0,0 +1315,TRAIN,0,0 +1316,TRAIN,0,0 +1317,TRAIN,0,0 +1318,TRAIN,0,0 +1319,TRAIN,0,0 +1320,TRAIN,0,0 +1321,TRAIN,0,0 +1322,TRAIN,0,0 +1323,TRAIN,0,0 +1324,TRAIN,0,0 +1325,TRAIN,0,0 +1326,TRAIN,0,0 +1327,TRAIN,0,0 +1328,TRAIN,0,0 +1329,TRAIN,0,0 +1330,TRAIN,0,0 +1331,TRAIN,0,0 +1332,TRAIN,0,0 +1333,TRAIN,0,0 +1334,TRAIN,0,0 +1335,TRAIN,0,0 +1336,TRAIN,0,0 +1337,TRAIN,0,0 +1338,TRAIN,0,0 +1339,TRAIN,0,0 +1340,TRAIN,0,0 +1341,TRAIN,0,0 +1342,TRAIN,0,0 +1343,TRAIN,0,0 +1344,TRAIN,0,0 +1345,TRAIN,0,0 +1346,TRAIN,0,0 +1347,TRAIN,0,0 +1348,TRAIN,0,0 +1349,TRAIN,0,0 +1350,TRAIN,0,0 +1351,TRAIN,0,0 +1352,TRAIN,0,0 +1353,TRAIN,0,0 +1354,TRAIN,0,0 +1355,TRAIN,0,0 +1356,TRAIN,0,0 +1357,TRAIN,0,0 +1358,TRAIN,0,0 +1359,TRAIN,0,0 +1360,TRAIN,0,0 +1361,TRAIN,0,0 +1362,TRAIN,0,0 +1363,TRAIN,0,0 +1364,TRAIN,0,0 +1365,TRAIN,0,0 +1366,TRAIN,0,0 +1367,TRAIN,0,0 +1368,TRAIN,0,0 +1369,TRAIN,0,0 +1370,TRAIN,0,0 +1371,TRAIN,0,0 +1372,TRAIN,0,0 +1373,TRAIN,0,0 +1374,TRAIN,0,0 +1375,TRAIN,0,0 +1376,TRAIN,0,0 +1377,TRAIN,0,0 +1378,TRAIN,0,0 +1379,TRAIN,0,0 +1380,TRAIN,0,0 +1381,TRAIN,0,0 +1382,TRAIN,0,0 +1383,TRAIN,0,0 +1384,TRAIN,0,0 +1385,TRAIN,0,0 +1386,TRAIN,0,0 +1387,TRAIN,0,0 +1388,TRAIN,0,0 +1389,TRAIN,0,0 +1390,TRAIN,0,0 +1391,TRAIN,0,0 +1392,TRAIN,0,0 
+1393,TRAIN,0,0 +1394,TRAIN,0,0 +1395,TRAIN,0,0 +1396,TRAIN,0,0 +1397,TRAIN,0,0 +1398,TRAIN,0,0 +1399,TRAIN,0,0 +1400,TRAIN,0,0 +1401,TRAIN,0,0 +1402,TRAIN,0,0 +1403,TRAIN,0,0 +1404,TRAIN,0,0 +1405,TRAIN,0,0 +1406,TRAIN,0,0 +1407,TRAIN,0,0 +1408,TRAIN,0,0 +1409,TRAIN,0,0 +1410,TRAIN,0,0 +1411,TRAIN,0,0 +1412,TRAIN,0,0 +1413,TRAIN,0,0 +1414,TRAIN,0,0 +1415,TRAIN,0,0 +1416,TRAIN,0,0 +1417,TRAIN,0,0 +1418,TRAIN,0,0 +1419,TRAIN,0,0 +1420,TRAIN,0,0 +1421,TRAIN,0,0 +1422,TRAIN,0,0 +1423,TRAIN,0,0 +1424,TRAIN,0,0 +1425,TRAIN,0,0 +1426,TRAIN,0,0 +1427,TRAIN,0,0 +1428,TRAIN,0,0 +1429,TRAIN,0,0 +1430,TRAIN,0,0 +1431,TRAIN,0,0 +1432,TRAIN,0,0 +1433,TRAIN,0,0 +1434,TRAIN,0,0 +1435,TRAIN,0,0 +1436,TRAIN,0,0 +1437,TRAIN,0,0 +1438,TRAIN,0,0 +1439,TRAIN,0,0 +1440,TRAIN,0,0 +1441,TRAIN,0,0 +1442,TRAIN,0,0 +1443,TRAIN,0,0 +1444,TRAIN,0,0 +1445,TRAIN,0,0 +1446,TRAIN,0,0 +1447,TRAIN,0,0 +1448,TRAIN,0,0 +1449,TRAIN,0,0 +1450,TRAIN,0,0 +1451,TRAIN,0,0 +1452,TRAIN,0,0 +1453,TRAIN,0,0 +1454,TRAIN,0,0 +1455,TRAIN,0,0 +1456,TRAIN,0,0 +1457,TRAIN,0,0 +1458,TRAIN,0,0 +1459,TRAIN,0,0 +1460,TRAIN,0,0 +1461,TRAIN,0,0 +1462,TRAIN,0,0 +1463,TRAIN,0,0 +1464,TRAIN,0,0 +1465,TRAIN,0,0 +1466,TRAIN,0,0 +1467,TRAIN,0,0 +1468,TRAIN,0,0 +1469,TRAIN,0,0 +1470,TRAIN,0,0 +1471,TRAIN,0,0 +1472,TRAIN,0,0 +1473,TRAIN,0,0 +1474,TRAIN,0,0 +1475,TRAIN,0,0 +1476,TRAIN,0,0 +1477,TRAIN,0,0 +1478,TRAIN,0,0 +1479,TRAIN,0,0 +1480,TRAIN,0,0 +1481,TRAIN,0,0 +1482,TRAIN,0,0 +1483,TRAIN,0,0 +1484,TRAIN,0,0 +1485,TRAIN,0,0 +1486,TRAIN,0,0 +1487,TRAIN,0,0 +1488,TRAIN,0,0 +1489,TRAIN,0,0 +1490,TRAIN,0,0 +1491,TRAIN,0,0 +1492,TRAIN,0,0 +1493,TRAIN,0,0 +1494,TRAIN,0,0 +1495,TRAIN,0,0 +1496,TRAIN,0,0 +1497,TRAIN,0,0 +1498,TRAIN,0,0 +1499,TRAIN,0,0 +1500,TRAIN,0,0 +1501,TRAIN,0,0 +1502,TRAIN,0,0 +1503,TRAIN,0,0 +1504,TRAIN,0,0 +1505,TRAIN,0,0 +1506,TRAIN,0,0 +1507,TRAIN,0,0 +1508,TRAIN,0,0 +1509,TRAIN,0,0 +1510,TRAIN,0,0 +1511,TRAIN,0,0 +1512,TRAIN,0,0 +1513,TRAIN,0,0 +1514,TRAIN,0,0 +1515,TRAIN,0,0 +1516,TRAIN,0,0 +1517,TRAIN,0,0 +1518,TRAIN,0,0 +1519,TRAIN,0,0 +1520,TRAIN,0,0 +1521,TRAIN,0,0 +1522,TRAIN,0,0 +1523,TRAIN,0,0 +1524,TRAIN,0,0 +1525,TRAIN,0,0 +1526,TRAIN,0,0 +1527,TRAIN,0,0 +1528,TRAIN,0,0 +1529,TRAIN,0,0 +1530,TRAIN,0,0 +1531,TRAIN,0,0 +1532,TRAIN,0,0 +1533,TRAIN,0,0 +1534,TRAIN,0,0 +1535,TRAIN,0,0 +1536,TRAIN,0,0 +1537,TRAIN,0,0 +1538,TRAIN,0,0 +1539,TRAIN,0,0 +1540,TRAIN,0,0 +1541,TRAIN,0,0 +1542,TRAIN,0,0 +1543,TRAIN,0,0 +1544,TRAIN,0,0 +1545,TRAIN,0,0 +1546,TRAIN,0,0 +1547,TRAIN,0,0 +1548,TRAIN,0,0 +1549,TRAIN,0,0 +1550,TRAIN,0,0 +1551,TRAIN,0,0 +1552,TRAIN,0,0 +1553,TRAIN,0,0 +1554,TRAIN,0,0 +1555,TRAIN,0,0 +1556,TRAIN,0,0 +1557,TRAIN,0,0 +1558,TRAIN,0,0 +1559,TRAIN,0,0 +1560,TRAIN,0,0 +1561,TRAIN,0,0 +1562,TRAIN,0,0 +1563,TRAIN,0,0 +1564,TRAIN,0,0 +1565,TRAIN,0,0 +1566,TRAIN,0,0 +1567,TRAIN,0,0 +1568,TRAIN,0,0 +1569,TRAIN,0,0 +1570,TRAIN,0,0 +1571,TRAIN,0,0 +1572,TRAIN,0,0 +1573,TRAIN,0,0 +1574,TRAIN,0,0 +1575,TRAIN,0,0 +1576,TRAIN,0,0 +1577,TRAIN,0,0 +1578,TRAIN,0,0 +1579,TRAIN,0,0 +1580,TRAIN,0,0 +1581,TRAIN,0,0 +1582,TRAIN,0,0 +1583,TRAIN,0,0 +1584,TRAIN,0,0 +1585,TRAIN,0,0 +1586,TRAIN,0,0 +1587,TRAIN,0,0 +1588,TRAIN,0,0 +1589,TRAIN,0,0 +1590,TRAIN,0,0 +1591,TRAIN,0,0 +1592,TRAIN,0,0 +1593,TRAIN,0,0 +1594,TRAIN,0,0 +1595,TRAIN,0,0 +1596,TRAIN,0,0 +1597,TRAIN,0,0 +1598,TRAIN,0,0 +1599,TRAIN,0,0 +1600,TRAIN,0,0 +1601,TRAIN,0,0 +1602,TRAIN,0,0 +1603,TRAIN,0,0 +1604,TRAIN,0,0 +1605,TRAIN,0,0 +1606,TRAIN,0,0 +1607,TRAIN,0,0 +1608,TRAIN,0,0 +1609,TRAIN,0,0 +1610,TRAIN,0,0 +1611,TRAIN,0,0 +1612,TRAIN,0,0 +1613,TRAIN,0,0 +1614,TRAIN,0,0 
+1615,TRAIN,0,0 +1616,TRAIN,0,0 +1617,TRAIN,0,0 +1618,TRAIN,0,0 +1619,TRAIN,0,0 +1620,TRAIN,0,0 +1621,TRAIN,0,0 +1622,TRAIN,0,0 +1623,TRAIN,0,0 +1624,TRAIN,0,0 +1625,TRAIN,0,0 +1626,TRAIN,0,0 +1627,TRAIN,0,0 +1628,TRAIN,0,0 +1629,TRAIN,0,0 +1630,TRAIN,0,0 +1631,TRAIN,0,0 +1632,TRAIN,0,0 +1633,TRAIN,0,0 +1634,TRAIN,0,0 +1635,TRAIN,0,0 +1636,TRAIN,0,0 +1637,TRAIN,0,0 +1638,TRAIN,0,0 +1639,TRAIN,0,0 +1640,TRAIN,0,0 +1641,TRAIN,0,0 +1642,TRAIN,0,0 +1643,TRAIN,0,0 +1644,TRAIN,0,0 +1645,TRAIN,0,0 +1646,TRAIN,0,0 +1647,TRAIN,0,0 +1648,TRAIN,0,0 +1649,TRAIN,0,0 +1650,TRAIN,0,0 +1651,TRAIN,0,0 +1652,TRAIN,0,0 +1653,TRAIN,0,0 +1654,TRAIN,0,0 +1655,TRAIN,0,0 +1656,TRAIN,0,0 +1657,TRAIN,0,0 +1658,TRAIN,0,0 +1659,TRAIN,0,0 +1660,TRAIN,0,0 +1661,TRAIN,0,0 +1662,TRAIN,0,0 +1663,TRAIN,0,0 +1664,TRAIN,0,0 +1665,TRAIN,0,0 +1666,TRAIN,0,0 +1667,TRAIN,0,0 +1668,TRAIN,0,0 +1669,TRAIN,0,0 +1670,TRAIN,0,0 +1671,TRAIN,0,0 +1672,TRAIN,0,0 +1673,TRAIN,0,0 +1674,TRAIN,0,0 +1675,TRAIN,0,0 +1676,TRAIN,0,0 +1677,TRAIN,0,0 +1678,TRAIN,0,0 +1679,TRAIN,0,0 +1680,TRAIN,0,0 +1681,TRAIN,0,0 +1682,TRAIN,0,0 +1683,TRAIN,0,0 +1684,TRAIN,0,0 +1685,TRAIN,0,0 +1686,TRAIN,0,0 +1687,TRAIN,0,0 +1688,TRAIN,0,0 +1689,TRAIN,0,0 +1690,TRAIN,0,0 +1691,TRAIN,0,0 +1692,TRAIN,0,0 +1693,TRAIN,0,0 +1694,TRAIN,0,0 +1695,TRAIN,0,0 +1696,TRAIN,0,0 +1697,TRAIN,0,0 +1698,TRAIN,0,0 +1699,TRAIN,0,0 +1700,TRAIN,0,0 +1701,TRAIN,0,0 +1702,TRAIN,0,0 +1703,TRAIN,0,0 +1704,TRAIN,0,0 +1705,TRAIN,0,0 +1706,TRAIN,0,0 +1707,TRAIN,0,0 +1708,TRAIN,0,0 +1709,TRAIN,0,0 +1710,TRAIN,0,0 +1711,TRAIN,0,0 +1712,TRAIN,0,0 +1713,TRAIN,0,0 +1714,TRAIN,0,0 +1715,TRAIN,0,0 +1716,TRAIN,0,0 +1717,TRAIN,0,0 +1718,TRAIN,0,0 +1719,TRAIN,0,0 +1720,TRAIN,0,0 +1721,TRAIN,0,0 +1722,TRAIN,0,0 +1723,TRAIN,0,0 +1724,TRAIN,0,0 +1725,TRAIN,0,0 +1726,TRAIN,0,0 +1727,TRAIN,0,0 +1728,TRAIN,0,0 +1729,TRAIN,0,0 +1730,TRAIN,0,0 +1731,TRAIN,0,0 +1732,TRAIN,0,0 +1733,TRAIN,0,0 +1734,TRAIN,0,0 +1735,TRAIN,0,0 +1736,TRAIN,0,0 +1737,TRAIN,0,0 +1738,TRAIN,0,0 +1739,TRAIN,0,0 +1740,TRAIN,0,0 +1741,TRAIN,0,0 +1742,TRAIN,0,0 +1743,TRAIN,0,0 +1744,TRAIN,0,0 +1745,TRAIN,0,0 +1746,TRAIN,0,0 +1747,TRAIN,0,0 +1748,TRAIN,0,0 +1749,TRAIN,0,0 +1750,TRAIN,0,0 +1751,TRAIN,0,0 +1752,TRAIN,0,0 +1753,TRAIN,0,0 +1754,TRAIN,0,0 +1755,TRAIN,0,0 +1756,TRAIN,0,0 +1757,TRAIN,0,0 +1758,TRAIN,0,0 +1759,TRAIN,0,0 +1760,TRAIN,0,0 +1761,TRAIN,0,0 +1762,TRAIN,0,0 +1763,TRAIN,0,0 +1764,TRAIN,0,0 +1765,TRAIN,0,0 +1766,TRAIN,0,0 +1767,TRAIN,0,0 +1768,TRAIN,0,0 +1769,TRAIN,0,0 +1770,TRAIN,0,0 +1771,TRAIN,0,0 +1772,TRAIN,0,0 +1773,TRAIN,0,0 +1774,TRAIN,0,0 +1775,TRAIN,0,0 +1776,TRAIN,0,0 +1777,TRAIN,0,0 +1778,TRAIN,0,0 +1779,TRAIN,0,0 +1780,TRAIN,0,0 +1781,TRAIN,0,0 +1782,TRAIN,0,0 +1783,TRAIN,0,0 +1784,TRAIN,0,0 +1785,TRAIN,0,0 +1786,TRAIN,0,0 +1787,TRAIN,0,0 +1788,TRAIN,0,0 +1789,TRAIN,0,0 +1790,TRAIN,0,0 +1791,TRAIN,0,0 +1792,TRAIN,0,0 +1793,TRAIN,0,0 +1794,TRAIN,0,0 +1795,TRAIN,0,0 +1796,TRAIN,0,0 +1797,TRAIN,0,0 +1798,TRAIN,0,0 +1799,TRAIN,0,0 +1800,TRAIN,0,0 +1801,TRAIN,0,0 +1802,TRAIN,0,0 +1803,TRAIN,0,0 +1804,TRAIN,0,0 +1805,TRAIN,0,0 +1806,TRAIN,0,0 +1807,TRAIN,0,0 +1808,TRAIN,0,0 +1809,TRAIN,0,0 +1810,TRAIN,0,0 +1811,TRAIN,0,0 +1812,TRAIN,0,0 +1813,TRAIN,0,0 +1814,TRAIN,0,0 +1815,TRAIN,0,0 +1816,TRAIN,0,0 +1817,TRAIN,0,0 +1818,TRAIN,0,0 +1819,TRAIN,0,0 +1820,TRAIN,0,0 +1821,TRAIN,0,0 +1822,TRAIN,0,0 +1823,TRAIN,0,0 +1824,TRAIN,0,0 +1825,TRAIN,0,0 +1826,TRAIN,0,0 +1827,TRAIN,0,0 +1828,TRAIN,0,0 +1829,TRAIN,0,0 +1830,TRAIN,0,0 +1831,TRAIN,0,0 +1832,TRAIN,0,0 +1833,TRAIN,0,0 +1834,TRAIN,0,0 +1835,TRAIN,0,0 +1836,TRAIN,0,0 
+1837,TRAIN,0,0 +1838,TRAIN,0,0 +1839,TRAIN,0,0 +1840,TRAIN,0,0 +1841,TRAIN,0,0 +1842,TRAIN,0,0 +1843,TRAIN,0,0 +1844,TRAIN,0,0 +1845,TRAIN,0,0 +1846,TRAIN,0,0 +1847,TRAIN,0,0 +1848,TRAIN,0,0 +1849,TRAIN,0,0 +1850,TRAIN,0,0 +1851,TRAIN,0,0 +1852,TRAIN,0,0 +1853,TRAIN,0,0 +1854,TRAIN,0,0 +1855,TRAIN,0,0 +1856,TRAIN,0,0 +1857,TRAIN,0,0 +1858,TRAIN,0,0 +1859,TRAIN,0,0 +1860,TRAIN,0,0 +1861,TRAIN,0,0 +1862,TRAIN,0,0 +1863,TRAIN,0,0 +1864,TRAIN,0,0 +1865,TRAIN,0,0 +1866,TRAIN,0,0 +1867,TRAIN,0,0 +1868,TRAIN,0,0 +1869,TRAIN,0,0 +1870,TRAIN,0,0 +1871,TRAIN,0,0 +1872,TRAIN,0,0 +1873,TRAIN,0,0 +1874,TRAIN,0,0 +1875,TRAIN,0,0 +1876,TRAIN,0,0 +1877,TRAIN,0,0 +1878,TRAIN,0,0 +1879,TRAIN,0,0 +1880,TRAIN,0,0 +1881,TRAIN,0,0 +1882,TRAIN,0,0 +1883,TRAIN,0,0 +1884,TRAIN,0,0 +1885,TRAIN,0,0 +1886,TRAIN,0,0 +1887,TRAIN,0,0 +1888,TRAIN,0,0 +1889,TRAIN,0,0 +1890,TRAIN,0,0 +1891,TRAIN,0,0 +1892,TRAIN,0,0 +1893,TRAIN,0,0 +1894,TRAIN,0,0 +1895,TRAIN,0,0 +1896,TRAIN,0,0 +1897,TRAIN,0,0 +1898,TRAIN,0,0 +1899,TRAIN,0,0 +1900,TRAIN,0,0 +1901,TRAIN,0,0 +1902,TRAIN,0,0 +1903,TRAIN,0,0 +1904,TRAIN,0,0 +1905,TRAIN,0,0 +1906,TRAIN,0,0 +1907,TRAIN,0,0 +1908,TRAIN,0,0 +1909,TRAIN,0,0 +1910,TRAIN,0,0 +1911,TRAIN,0,0 +1912,TRAIN,0,0 +1913,TRAIN,0,0 +1914,TRAIN,0,0 +1915,TRAIN,0,0 +1916,TRAIN,0,0 +1917,TRAIN,0,0 +1918,TRAIN,0,0 +1919,TRAIN,0,0 +1920,TRAIN,0,0 +1921,TRAIN,0,0 +1922,TRAIN,0,0 +1923,TRAIN,0,0 +1924,TRAIN,0,0 +1925,TRAIN,0,0 +1926,TRAIN,0,0 +1927,TRAIN,0,0 +1928,TRAIN,0,0 +1929,TRAIN,0,0 +1930,TRAIN,0,0 +1931,TRAIN,0,0 +1932,TRAIN,0,0 +1933,TRAIN,0,0 +1934,TRAIN,0,0 +1935,TRAIN,0,0 +1936,TRAIN,0,0 +1937,TRAIN,0,0 +1938,TRAIN,0,0 +1939,TRAIN,0,0 +1940,TRAIN,0,0 +1941,TRAIN,0,0 +1942,TRAIN,0,0 +1943,TRAIN,0,0 +1944,TRAIN,0,0 +1945,TRAIN,0,0 +1946,TRAIN,0,0 +1947,TRAIN,0,0 +1948,TRAIN,0,0 +1949,TRAIN,0,0 +1950,TRAIN,0,0 +1951,TRAIN,0,0 +1952,TRAIN,0,0 +1953,TRAIN,0,0 +1954,TRAIN,0,0 +1955,TRAIN,0,0 +1956,TRAIN,0,0 +1957,TRAIN,0,0 +1958,TRAIN,0,0 +1959,TRAIN,0,0 +1960,TRAIN,0,0 +1961,TRAIN,0,0 +1962,TRAIN,0,0 +1963,TRAIN,0,0 +1964,TRAIN,0,0 +1965,TRAIN,0,0 +1966,TRAIN,0,0 +1967,TRAIN,0,0 +1968,TRAIN,0,0 +1969,TRAIN,0,0 +1970,TRAIN,0,0 +1971,TRAIN,0,0 +1972,TRAIN,0,0 +1973,TRAIN,0,0 +1974,TRAIN,0,0 +1975,TRAIN,0,0 +1976,TRAIN,0,0 +1977,TRAIN,0,0 +1978,TRAIN,0,0 +1979,TRAIN,0,0 +1980,TRAIN,0,0 +1981,TRAIN,0,0 +1982,TRAIN,0,0 +1983,TRAIN,0,0 +1984,TRAIN,0,0 +1985,TRAIN,0,0 +1986,TRAIN,0,0 +1987,TRAIN,0,0 +1988,TRAIN,0,0 +1989,TRAIN,0,0 +1990,TRAIN,0,0 +1991,TRAIN,0,0 +1992,TRAIN,0,0 +1993,TRAIN,0,0 +1994,TRAIN,0,0 +1995,TRAIN,0,0 +1996,TRAIN,0,0 +1997,TRAIN,0,0 +1998,TRAIN,0,0 +1999,TRAIN,0,0 +2000,TRAIN,0,0 +2001,TRAIN,0,0 +2002,TRAIN,0,0 +2003,TRAIN,0,0 +2004,TRAIN,0,0 +2005,TRAIN,0,0 +2006,TRAIN,0,0 +2007,TRAIN,0,0 +2008,TRAIN,0,0 +2009,TRAIN,0,0 +2010,TRAIN,0,0 +2011,TRAIN,0,0 +2012,TRAIN,0,0 +2013,TRAIN,0,0 +2014,TRAIN,0,0 +2015,TRAIN,0,0 +2016,TRAIN,0,0 +2017,TRAIN,0,0 +2018,TRAIN,0,0 +2019,TRAIN,0,0 +2020,TRAIN,0,0 +2021,TRAIN,0,0 +2022,TRAIN,0,0 +2023,TRAIN,0,0 +2024,TRAIN,0,0 +2025,TRAIN,0,0 +2026,TRAIN,0,0 +2027,TRAIN,0,0 +2028,TRAIN,0,0 +2029,TRAIN,0,0 +2030,TRAIN,0,0 +2031,TRAIN,0,0 +2032,TRAIN,0,0 +2033,TRAIN,0,0 +2034,TRAIN,0,0 +2035,TRAIN,0,0 +2036,TRAIN,0,0 +2037,TRAIN,0,0 +2038,TRAIN,0,0 +2039,TRAIN,0,0 +2040,TRAIN,0,0 +2041,TRAIN,0,0 +2042,TRAIN,0,0 +2043,TRAIN,0,0 +2044,TRAIN,0,0 +2045,TRAIN,0,0 +2046,TRAIN,0,0 +2047,TRAIN,0,0 +2048,TRAIN,0,0 +2049,TRAIN,0,0 +2050,TRAIN,0,0 +2051,TRAIN,0,0 +2052,TRAIN,0,0 +2053,TRAIN,0,0 +2054,TRAIN,0,0 +2055,TRAIN,0,0 +2056,TRAIN,0,0 +2057,TRAIN,0,0 +2058,TRAIN,0,0 
+2059,TRAIN,0,0 +2060,TRAIN,0,0 +2061,TRAIN,0,0 +2062,TRAIN,0,0 +2063,TRAIN,0,0 +2064,TRAIN,0,0 +2065,TRAIN,0,0 +2066,TRAIN,0,0 +2067,TRAIN,0,0 +2068,TRAIN,0,0 +2069,TRAIN,0,0 +2070,TRAIN,0,0 +2071,TRAIN,0,0 +2072,TRAIN,0,0 +2073,TRAIN,0,0 +2074,TRAIN,0,0 +2075,TRAIN,0,0 +2076,TRAIN,0,0 +2077,TRAIN,0,0 +2078,TRAIN,0,0 +2079,TRAIN,0,0 +2080,TRAIN,0,0 +2081,TRAIN,0,0 +2082,TRAIN,0,0 +2083,TRAIN,0,0 +2084,TRAIN,0,0 +2085,TRAIN,0,0 +2086,TRAIN,0,0 +2087,TRAIN,0,0 +2088,TRAIN,0,0 +2089,TRAIN,0,0 +2090,TRAIN,0,0 +2091,TRAIN,0,0 +2092,TRAIN,0,0 +2093,TRAIN,0,0 +2094,TRAIN,0,0 +2095,TRAIN,0,0 +2096,TRAIN,0,0 +2097,TRAIN,0,0 +2098,TRAIN,0,0 +2099,TRAIN,0,0 +2100,TRAIN,0,0 +2101,TRAIN,0,0 +2102,TRAIN,0,0 +2103,TRAIN,0,0 +2104,TRAIN,0,0 +2105,TRAIN,0,0 +2106,TRAIN,0,0 +2107,TRAIN,0,0 +2108,TRAIN,0,0 +2109,TRAIN,0,0 +2110,TRAIN,0,0 +2111,TRAIN,0,0 +2112,TRAIN,0,0 +2113,TRAIN,0,0 +2114,TRAIN,0,0 +2115,TRAIN,0,0 +2116,TRAIN,0,0 +2117,TRAIN,0,0 +2118,TRAIN,0,0 +2119,TRAIN,0,0 +2120,TRAIN,0,0 +2121,TRAIN,0,0 +2122,TRAIN,0,0 +2123,TRAIN,0,0 +2124,TRAIN,0,0 +2125,TRAIN,0,0 +2126,TRAIN,0,0 +2127,TRAIN,0,0 +2128,TRAIN,0,0 +2129,TRAIN,0,0 +2130,TRAIN,0,0 +2131,TRAIN,0,0 +2132,TRAIN,0,0 +2133,TRAIN,0,0 +2134,TRAIN,0,0 +2135,TRAIN,0,0 +2136,TRAIN,0,0 +2137,TRAIN,0,0 +2138,TRAIN,0,0 +2139,TRAIN,0,0 +2140,TRAIN,0,0 +2141,TRAIN,0,0 +2142,TRAIN,0,0 +2143,TRAIN,0,0 +2144,TRAIN,0,0 +2145,TRAIN,0,0 +2146,TRAIN,0,0 +2147,TRAIN,0,0 +2148,TRAIN,0,0 +2149,TRAIN,0,0 +2150,TRAIN,0,0 +2151,TRAIN,0,0 +2152,TRAIN,0,0 +2153,TRAIN,0,0 +2154,TRAIN,0,0 +2155,TRAIN,0,0 +2156,TRAIN,0,0 +2157,TRAIN,0,0 +2158,TRAIN,0,0 +2159,TRAIN,0,0 +2160,TRAIN,0,0 +2161,TRAIN,0,0 +2162,TRAIN,0,0 +2163,TRAIN,0,0 +2164,TRAIN,0,0 +2165,TRAIN,0,0 +2166,TRAIN,0,0 +2167,TRAIN,0,0 +2168,TRAIN,0,0 +2169,TRAIN,0,0 +2170,TRAIN,0,0 +2171,TRAIN,0,0 +2172,TRAIN,0,0 +2173,TRAIN,0,0 +2174,TRAIN,0,0 +2175,TRAIN,0,0 +2176,TRAIN,0,0 +2177,TRAIN,0,0 +2178,TRAIN,0,0 +2179,TRAIN,0,0 +2180,TRAIN,0,0 +2181,TRAIN,0,0 +2182,TRAIN,0,0 +2183,TRAIN,0,0 +2184,TRAIN,0,0 +2185,TRAIN,0,0 +2186,TRAIN,0,0 +2187,TRAIN,0,0 +2188,TRAIN,0,0 +2189,TRAIN,0,0 +2190,TRAIN,0,0 +2191,TRAIN,0,0 +2192,TRAIN,0,0 +2193,TRAIN,0,0 +2194,TRAIN,0,0 +2195,TRAIN,0,0 +2196,TRAIN,0,0 +2197,TRAIN,0,0 +2198,TRAIN,0,0 +2199,TRAIN,0,0 +2200,TRAIN,0,0 +2201,TRAIN,0,0 +2202,TRAIN,0,0 +2203,TRAIN,0,0 +2204,TRAIN,0,0 +2205,TRAIN,0,0 +2206,TRAIN,0,0 +2207,TRAIN,0,0 +2208,TRAIN,0,0 +2209,TRAIN,0,0 +2210,TRAIN,0,0 +2211,TRAIN,0,0 +2212,TRAIN,0,0 +2213,TRAIN,0,0 +2214,TRAIN,0,0 +2215,TRAIN,0,0 +2216,TRAIN,0,0 +2217,TRAIN,0,0 +2218,TRAIN,0,0 +2219,TRAIN,0,0 +2220,TRAIN,0,0 +2221,TRAIN,0,0 +2222,TRAIN,0,0 +2223,TRAIN,0,0 +2224,TRAIN,0,0 +2225,TRAIN,0,0 +2226,TRAIN,0,0 +2227,TRAIN,0,0 +2228,TRAIN,0,0 +2229,TRAIN,0,0 +2230,TRAIN,0,0 +2231,TRAIN,0,0 +2232,TRAIN,0,0 +2233,TRAIN,0,0 +2234,TRAIN,0,0 +2235,TRAIN,0,0 +2236,TRAIN,0,0 +2237,TRAIN,0,0 +2238,TRAIN,0,0 +2239,TRAIN,0,0 +2240,TRAIN,0,0 +2241,TRAIN,0,0 +2242,TRAIN,0,0 +2243,TRAIN,0,0 +2244,TRAIN,0,0 +2245,TRAIN,0,0 +2246,TRAIN,0,0 +2247,TRAIN,0,0 +2248,TRAIN,0,0 +2249,TRAIN,0,0 +2250,TRAIN,0,0 +2251,TRAIN,0,0 +2252,TRAIN,0,0 +2253,TRAIN,0,0 +2254,TRAIN,0,0 +2255,TRAIN,0,0 +2256,TRAIN,0,0 +2257,TRAIN,0,0 +2258,TRAIN,0,0 +2259,TRAIN,0,0 +2260,TRAIN,0,0 +2261,TRAIN,0,0 +2262,TRAIN,0,0 +2263,TRAIN,0,0 +2264,TRAIN,0,0 +2265,TRAIN,0,0 +2266,TRAIN,0,0 +2267,TRAIN,0,0 +2268,TRAIN,0,0 +2269,TRAIN,0,0 +2270,TRAIN,0,0 +2271,TRAIN,0,0 +2272,TRAIN,0,0 +2273,TRAIN,0,0 +2274,TRAIN,0,0 +2275,TRAIN,0,0 +2276,TRAIN,0,0 +2277,TRAIN,0,0 +2278,TRAIN,0,0 +2279,TRAIN,0,0 +2280,TRAIN,0,0 
+2281,TRAIN,0,0 +2282,TRAIN,0,0 +2283,TRAIN,0,0 +2284,TRAIN,0,0 +2285,TRAIN,0,0 +2286,TRAIN,0,0 +2287,TRAIN,0,0 +2288,TRAIN,0,0 +2289,TRAIN,0,0 +2290,TRAIN,0,0 +2291,TRAIN,0,0 +2292,TRAIN,0,0 +2293,TRAIN,0,0 +2294,TRAIN,0,0 +2295,TRAIN,0,0 +2296,TRAIN,0,0 +2297,TRAIN,0,0 +2298,TRAIN,0,0 +2299,TRAIN,0,0 +2300,TRAIN,0,0 +2301,TRAIN,0,0 +2302,TRAIN,0,0 +2303,TRAIN,0,0 +2304,TRAIN,0,0 +2305,TRAIN,0,0 +2306,TRAIN,0,0 +2307,TRAIN,0,0 +2308,TRAIN,0,0 +2309,TRAIN,0,0 +2310,TRAIN,0,0 +2311,TRAIN,0,0 +2312,TRAIN,0,0 +2313,TRAIN,0,0 +2314,TRAIN,0,0 +2315,TRAIN,0,0 +2316,TRAIN,0,0 +2317,TRAIN,0,0 +2318,TRAIN,0,0 +2319,TRAIN,0,0 +2320,TRAIN,0,0 +2321,TRAIN,0,0 +2322,TRAIN,0,0 +2323,TRAIN,0,0 +2324,TRAIN,0,0 +2325,TRAIN,0,0 +2326,TRAIN,0,0 +2327,TRAIN,0,0 +2328,TRAIN,0,0 +2329,TRAIN,0,0 +2330,TRAIN,0,0 +2331,TRAIN,0,0 +2332,TRAIN,0,0 +2333,TRAIN,0,0 +2334,TRAIN,0,0 +2335,TRAIN,0,0 +2336,TRAIN,0,0 +2337,TRAIN,0,0 +2338,TRAIN,0,0 +2339,TRAIN,0,0 +2340,TRAIN,0,0 +2341,TRAIN,0,0 +2342,TRAIN,0,0 +2343,TRAIN,0,0 +2344,TRAIN,0,0 +2345,TRAIN,0,0 +2346,TRAIN,0,0 +2347,TRAIN,0,0 +2348,TRAIN,0,0 +2349,TRAIN,0,0 +2350,TRAIN,0,0 +2351,TRAIN,0,0 +2352,TRAIN,0,0 +2353,TRAIN,0,0 +2354,TRAIN,0,0 +2355,TRAIN,0,0 +2356,TRAIN,0,0 +2357,TRAIN,0,0 +2358,TRAIN,0,0 +2359,TRAIN,0,0 +2360,TRAIN,0,0 +2361,TRAIN,0,0 +2362,TRAIN,0,0 +2363,TRAIN,0,0 +2364,TRAIN,0,0 +2365,TRAIN,0,0 +2366,TRAIN,0,0 +2367,TRAIN,0,0 +2368,TRAIN,0,0 +2369,TRAIN,0,0 +2370,TRAIN,0,0 +2371,TRAIN,0,0 +2372,TRAIN,0,0 +2373,TRAIN,0,0 +2374,TRAIN,0,0 +2375,TRAIN,0,0 +2376,TRAIN,0,0 +2377,TRAIN,0,0 +2378,TRAIN,0,0 +2379,TRAIN,0,0 +2380,TRAIN,0,0 +2381,TRAIN,0,0 +2382,TRAIN,0,0 +2383,TRAIN,0,0 +2384,TRAIN,0,0 +2385,TRAIN,0,0 +2386,TRAIN,0,0 +2387,TRAIN,0,0 +2388,TRAIN,0,0 +2389,TRAIN,0,0 +2390,TRAIN,0,0 +2391,TRAIN,0,0 +2392,TRAIN,0,0 +2393,TRAIN,0,0 +2394,TRAIN,0,0 +2395,TRAIN,0,0 +2396,TRAIN,0,0 +2397,TRAIN,0,0 +2398,TRAIN,0,0 +2399,TRAIN,0,0 +2400,TRAIN,0,0 +2401,TRAIN,0,0 +2402,TRAIN,0,0 +2403,TRAIN,0,0 +2404,TRAIN,0,0 +2405,TRAIN,0,0 +2406,TRAIN,0,0 +2407,TRAIN,0,0 +2408,TRAIN,0,0 +2409,TRAIN,0,0 +2410,TRAIN,0,0 +2411,TRAIN,0,0 +2412,TRAIN,0,0 +2413,TRAIN,0,0 +2414,TRAIN,0,0 +2415,TRAIN,0,0 +2416,TRAIN,0,0 +2417,TRAIN,0,0 +2418,TRAIN,0,0 +2419,TRAIN,0,0 +2420,TRAIN,0,0 +2421,TRAIN,0,0 +2422,TRAIN,0,0 +2423,TRAIN,0,0 +2424,TRAIN,0,0 +2425,TRAIN,0,0 +2426,TRAIN,0,0 +2427,TRAIN,0,0 +2428,TRAIN,0,0 +2429,TRAIN,0,0 +2430,TRAIN,0,0 +2431,TRAIN,0,0 +2432,TRAIN,0,0 +2433,TRAIN,0,0 +2434,TRAIN,0,0 +2435,TRAIN,0,0 +2436,TRAIN,0,0 +2437,TRAIN,0,0 +2438,TRAIN,0,0 +2439,TRAIN,0,0 +2440,TRAIN,0,0 +2441,TRAIN,0,0 +2442,TRAIN,0,0 +2443,TRAIN,0,0 +2444,TRAIN,0,0 +2445,TRAIN,0,0 +2446,TRAIN,0,0 +2447,TRAIN,0,0 +2448,TRAIN,0,0 +2449,TRAIN,0,0 +2450,TRAIN,0,0 +2451,TRAIN,0,0 +2452,TRAIN,0,0 +2453,TRAIN,0,0 +2454,TRAIN,0,0 +2455,TRAIN,0,0 +2456,TRAIN,0,0 +2457,TRAIN,0,0 +2458,TRAIN,0,0 +2459,TRAIN,0,0 +2460,TRAIN,0,0 +2461,TRAIN,0,0 +2462,TRAIN,0,0 +2463,TRAIN,0,0 +2464,TRAIN,0,0 +2465,TRAIN,0,0 +2466,TRAIN,0,0 +2467,TRAIN,0,0 +2468,TRAIN,0,0 +2469,TRAIN,0,0 +2470,TRAIN,0,0 +2471,TRAIN,0,0 +2472,TRAIN,0,0 +2473,TRAIN,0,0 +2474,TRAIN,0,0 +2475,TRAIN,0,0 +2476,TRAIN,0,0 +2477,TRAIN,0,0 +2478,TRAIN,0,0 +2479,TRAIN,0,0 +2480,TRAIN,0,0 +2481,TRAIN,0,0 +2482,TRAIN,0,0 +2483,TRAIN,0,0 +2484,TRAIN,0,0 +2485,TRAIN,0,0 +2486,TRAIN,0,0 +2487,TRAIN,0,0 +2488,TRAIN,0,0 +2489,TRAIN,0,0 +2490,TRAIN,0,0 +2491,TRAIN,0,0 +2492,TRAIN,0,0 +2493,TRAIN,0,0 +2494,TRAIN,0,0 +2495,TRAIN,0,0 +2496,TRAIN,0,0 +2497,TRAIN,0,0 +2498,TRAIN,0,0 +2499,TRAIN,0,0 +2500,TRAIN,0,0 +2501,TRAIN,0,0 +2502,TRAIN,0,0 
+2503,TRAIN,0,0 +2504,TRAIN,0,0 +2505,TRAIN,0,0 +2506,TRAIN,0,0 +2507,TRAIN,0,0 +2508,TRAIN,0,0 +2509,TRAIN,0,0 +2510,TRAIN,0,0 +2511,TRAIN,0,0 +2512,TRAIN,0,0 +2513,TRAIN,0,0 +2514,TRAIN,0,0 +2515,TRAIN,0,0 +2516,TRAIN,0,0 +2517,TRAIN,0,0 +2518,TRAIN,0,0 +2519,TRAIN,0,0 +2520,TRAIN,0,0 +2521,TRAIN,0,0 +2522,TRAIN,0,0 +2523,TRAIN,0,0 +2524,TRAIN,0,0 +2525,TRAIN,0,0 +2526,TRAIN,0,0 +2527,TRAIN,0,0 +2528,TRAIN,0,0 +2529,TRAIN,0,0 +2530,TRAIN,0,0 +2531,TRAIN,0,0 +2532,TRAIN,0,0 +2533,TRAIN,0,0 +2534,TRAIN,0,0 +2535,TRAIN,0,0 +2536,TRAIN,0,0 +2537,TRAIN,0,0 +2538,TRAIN,0,0 +2539,TRAIN,0,0 +2540,TRAIN,0,0 +2541,TRAIN,0,0 +2542,TRAIN,0,0 +2543,TRAIN,0,0 +2544,TRAIN,0,0 +2545,TRAIN,0,0 +2546,TRAIN,0,0 +2547,TRAIN,0,0 +2548,TRAIN,0,0 +2549,TRAIN,0,0 +2550,TRAIN,0,0 +2551,TRAIN,0,0 +2552,TRAIN,0,0 +2553,TRAIN,0,0 +2554,TRAIN,0,0 +2555,TRAIN,0,0 +2556,TRAIN,0,0 +2557,TRAIN,0,0 +2558,TRAIN,0,0 +2559,TRAIN,0,0 +2560,TRAIN,0,0 +2561,TRAIN,0,0 +2562,TRAIN,0,0 +2563,TRAIN,0,0 +2564,TRAIN,0,0 +2565,TRAIN,0,0 +2566,TRAIN,0,0 +2567,TRAIN,0,0 +2568,TRAIN,0,0 +2569,TRAIN,0,0 +2570,TRAIN,0,0 +2571,TRAIN,0,0 +2572,TRAIN,0,0 +2573,TRAIN,0,0 +2574,TRAIN,0,0 +2575,TRAIN,0,0 +2576,TRAIN,0,0 +2577,TRAIN,0,0 +2578,TRAIN,0,0 +2579,TRAIN,0,0 +2580,TRAIN,0,0 +2581,TRAIN,0,0 +2582,TRAIN,0,0 +2583,TRAIN,0,0 +2584,TRAIN,0,0 +2585,TRAIN,0,0 +2586,TRAIN,0,0 +2587,TRAIN,0,0 +2588,TRAIN,0,0 +2589,TRAIN,0,0 +2590,TRAIN,0,0 +2591,TRAIN,0,0 +2592,TRAIN,0,0 +2593,TRAIN,0,0 +2594,TRAIN,0,0 +2595,TRAIN,0,0 +2596,TRAIN,0,0 +2597,TRAIN,0,0 +2598,TRAIN,0,0 +2599,TRAIN,0,0 +2600,TRAIN,0,0 +2601,TRAIN,0,0 +2602,TRAIN,0,0 +2603,TRAIN,0,0 +2604,TRAIN,0,0 +2605,TRAIN,0,0 +2606,TRAIN,0,0 +2607,TRAIN,0,0 +2608,TRAIN,0,0 +2609,TRAIN,0,0 +2610,TRAIN,0,0 +2611,TRAIN,0,0 +2612,TRAIN,0,0 +2613,TRAIN,0,0 +2614,TRAIN,0,0 +2615,TRAIN,0,0 +2616,TRAIN,0,0 +2617,TRAIN,0,0 +2618,TRAIN,0,0 +2619,TRAIN,0,0 +2620,TRAIN,0,0 +2621,TRAIN,0,0 +2622,TRAIN,0,0 +2623,TRAIN,0,0 +2624,TRAIN,0,0 +2625,TRAIN,0,0 +2626,TRAIN,0,0 +2627,TRAIN,0,0 +2628,TRAIN,0,0 +2629,TRAIN,0,0 +2630,TRAIN,0,0 +2631,TRAIN,0,0 +2632,TRAIN,0,0 +2633,TRAIN,0,0 +2634,TRAIN,0,0 +2635,TRAIN,0,0 +2636,TRAIN,0,0 +2637,TRAIN,0,0 +2638,TRAIN,0,0 +2639,TRAIN,0,0 +2640,TRAIN,0,0 +2641,TRAIN,0,0 +2642,TRAIN,0,0 +2643,TRAIN,0,0 +2644,TRAIN,0,0 +2645,TRAIN,0,0 +2646,TRAIN,0,0 +2647,TRAIN,0,0 +2648,TRAIN,0,0 +2649,TRAIN,0,0 +2650,TRAIN,0,0 +2651,TRAIN,0,0 +2652,TRAIN,0,0 +2653,TRAIN,0,0 +2654,TRAIN,0,0 +2655,TRAIN,0,0 +2656,TRAIN,0,0 +2657,TRAIN,0,0 +2658,TRAIN,0,0 +2659,TRAIN,0,0 +2660,TRAIN,0,0 +2661,TRAIN,0,0 +2662,TRAIN,0,0 +2663,TRAIN,0,0 +2664,TRAIN,0,0 +2665,TRAIN,0,0 +2666,TRAIN,0,0 +2667,TRAIN,0,0 +2668,TRAIN,0,0 +2669,TRAIN,0,0 +2670,TRAIN,0,0 +2671,TRAIN,0,0 +2672,TRAIN,0,0 +2673,TRAIN,0,0 +2674,TRAIN,0,0 +2675,TRAIN,0,0 +2676,TRAIN,0,0 +2677,TRAIN,0,0 +2678,TRAIN,0,0 +2679,TRAIN,0,0 +2680,TRAIN,0,0 +2681,TRAIN,0,0 +2682,TRAIN,0,0 +2683,TRAIN,0,0 +2684,TRAIN,0,0 +2685,TRAIN,0,0 +2686,TRAIN,0,0 +2687,TRAIN,0,0 +2688,TRAIN,0,0 +2689,TRAIN,0,0 +2690,TRAIN,0,0 +2691,TRAIN,0,0 +2692,TRAIN,0,0 +2693,TRAIN,0,0 +2694,TRAIN,0,0 +2695,TRAIN,0,0 +2696,TRAIN,0,0 +2697,TRAIN,0,0 +2698,TRAIN,0,0 +2699,TRAIN,0,0 +2700,TRAIN,0,0 +2701,TRAIN,0,0 +2702,TRAIN,0,0 +2703,TRAIN,0,0 +2704,TRAIN,0,0 +2705,TRAIN,0,0 +2706,TRAIN,0,0 +2707,TRAIN,0,0 +2708,TRAIN,0,0 +2709,TRAIN,0,0 +2710,TRAIN,0,0 +2711,TRAIN,0,0 +2712,TRAIN,0,0 +2713,TRAIN,0,0 +2714,TRAIN,0,0 +2715,TRAIN,0,0 +2716,TRAIN,0,0 +2717,TRAIN,0,0 +2718,TRAIN,0,0 +2719,TRAIN,0,0 +2720,TRAIN,0,0 +2721,TRAIN,0,0 +2722,TRAIN,0,0 +2723,TRAIN,0,0 +2724,TRAIN,0,0 
+2725,TRAIN,0,0 +2726,TRAIN,0,0 +2727,TRAIN,0,0 +2728,TRAIN,0,0 +2729,TRAIN,0,0 +2730,TRAIN,0,0 +2731,TRAIN,0,0 +2732,TRAIN,0,0 +2733,TRAIN,0,0 +2734,TRAIN,0,0 +2735,TRAIN,0,0 +2736,TRAIN,0,0 +2737,TRAIN,0,0 +2738,TRAIN,0,0 +2739,TRAIN,0,0 +2740,TRAIN,0,0 +2741,TRAIN,0,0 +2742,TRAIN,0,0 +2743,TRAIN,0,0 +2744,TRAIN,0,0 +2745,TRAIN,0,0 +2746,TRAIN,0,0 +2747,TRAIN,0,0 +2748,TRAIN,0,0 +2749,TRAIN,0,0 +2750,TRAIN,0,0 +2751,TRAIN,0,0 +2752,TRAIN,0,0 +2753,TRAIN,0,0 +2754,TRAIN,0,0 +2755,TRAIN,0,0 +2756,TRAIN,0,0 +2757,TRAIN,0,0 +2758,TRAIN,0,0 +2759,TRAIN,0,0 +2760,TRAIN,0,0 +2761,TRAIN,0,0 +2762,TRAIN,0,0 +2763,TRAIN,0,0 +2764,TRAIN,0,0 +2765,TRAIN,0,0 +2766,TRAIN,0,0 +2767,TRAIN,0,0 +2768,TRAIN,0,0 +2769,TRAIN,0,0 +2770,TRAIN,0,0 +2771,TRAIN,0,0 +2772,TRAIN,0,0 +2773,TRAIN,0,0 +2774,TRAIN,0,0 +2775,TRAIN,0,0 +2776,TRAIN,0,0 +2777,TRAIN,0,0 +2778,TRAIN,0,0 +2779,TRAIN,0,0 +2780,TRAIN,0,0 +2781,TRAIN,0,0 +2782,TRAIN,0,0 +2783,TRAIN,0,0 +2784,TRAIN,0,0 +2785,TRAIN,0,0 +2786,TRAIN,0,0 +2787,TRAIN,0,0 +2788,TRAIN,0,0 +2789,TRAIN,0,0 +2790,TRAIN,0,0 +2791,TRAIN,0,0 +2792,TRAIN,0,0 +2793,TRAIN,0,0 +2794,TRAIN,0,0 +2795,TRAIN,0,0 +2796,TRAIN,0,0 +2797,TRAIN,0,0 +2798,TRAIN,0,0 +2799,TRAIN,0,0 +2800,TRAIN,0,0 +2801,TRAIN,0,0 +2802,TRAIN,0,0 +2803,TRAIN,0,0 +2804,TRAIN,0,0 +2805,TRAIN,0,0 +2806,TRAIN,0,0 +2807,TRAIN,0,0 +2808,TRAIN,0,0 +2809,TRAIN,0,0 +2810,TRAIN,0,0 +2811,TRAIN,0,0 +2812,TRAIN,0,0 +2813,TRAIN,0,0 +2814,TRAIN,0,0 +2815,TRAIN,0,0 +2816,TRAIN,0,0 +2817,TRAIN,0,0 +2818,TRAIN,0,0 +2819,TRAIN,0,0 +2820,TRAIN,0,0 +2821,TRAIN,0,0 +2822,TRAIN,0,0 +2823,TRAIN,0,0 +2824,TRAIN,0,0 +2825,TRAIN,0,0 +2826,TRAIN,0,0 +2827,TRAIN,0,0 +2828,TRAIN,0,0 +2829,TRAIN,0,0 +2830,TRAIN,0,0 +2831,TRAIN,0,0 +2832,TRAIN,0,0 +2833,TRAIN,0,0 +2834,TRAIN,0,0 +2835,TRAIN,0,0 +2836,TRAIN,0,0 +2837,TRAIN,0,0 +2838,TRAIN,0,0 +2839,TRAIN,0,0 +2840,TRAIN,0,0 +2841,TRAIN,0,0 +2842,TRAIN,0,0 +2843,TRAIN,0,0 +2844,TRAIN,0,0 +2845,TRAIN,0,0 +2846,TRAIN,0,0 +2847,TRAIN,0,0 +2848,TRAIN,0,0 +2849,TRAIN,0,0 +2850,TRAIN,0,0 +2851,TRAIN,0,0 +2852,TRAIN,0,0 +2853,TRAIN,0,0 +2854,TRAIN,0,0 +2855,TRAIN,0,0 +2856,TRAIN,0,0 +2857,TRAIN,0,0 +2858,TRAIN,0,0 +2859,TRAIN,0,0 +2860,TRAIN,0,0 +2861,TRAIN,0,0 +2862,TRAIN,0,0 +2863,TRAIN,0,0 +2864,TRAIN,0,0 +2865,TRAIN,0,0 +2866,TRAIN,0,0 +2867,TRAIN,0,0 +2868,TRAIN,0,0 +2869,TRAIN,0,0 +2870,TRAIN,0,0 +2871,TRAIN,0,0 +2872,TRAIN,0,0 +2873,TRAIN,0,0 +2874,TRAIN,0,0 +2875,TRAIN,0,0 +2876,TRAIN,0,0 +2877,TRAIN,0,0 +2878,TRAIN,0,0 +2879,TRAIN,0,0 +2880,TRAIN,0,0 +2881,TRAIN,0,0 +2882,TRAIN,0,0 +2883,TRAIN,0,0 +2884,TRAIN,0,0 +2885,TRAIN,0,0 +2886,TRAIN,0,0 +2887,TRAIN,0,0 +2888,TRAIN,0,0 +2889,TRAIN,0,0 +2890,TRAIN,0,0 +2891,TRAIN,0,0 +2892,TRAIN,0,0 +2893,TRAIN,0,0 +2894,TRAIN,0,0 +2895,TRAIN,0,0 +2896,TRAIN,0,0 +2897,TRAIN,0,0 +2898,TRAIN,0,0 +2899,TRAIN,0,0 +2900,TRAIN,0,0 +2901,TRAIN,0,0 +2902,TRAIN,0,0 +2903,TRAIN,0,0 +2904,TRAIN,0,0 +2905,TRAIN,0,0 +2906,TRAIN,0,0 +2907,TRAIN,0,0 +2908,TRAIN,0,0 +2909,TRAIN,0,0 +2910,TRAIN,0,0 +2911,TRAIN,0,0 +2912,TRAIN,0,0 +2913,TRAIN,0,0 +2914,TRAIN,0,0 +2915,TRAIN,0,0 +2916,TRAIN,0,0 +2917,TRAIN,0,0 +2918,TRAIN,0,0 +2919,TRAIN,0,0 +2920,TRAIN,0,0 +2921,TRAIN,0,0 +2922,TRAIN,0,0 +2923,TRAIN,0,0 +2924,TRAIN,0,0 +2925,TRAIN,0,0 +2926,TRAIN,0,0 +2927,TRAIN,0,0 +2928,TRAIN,0,0 +2929,TRAIN,0,0 +2930,TRAIN,0,0 +2931,TRAIN,0,0 +2932,TRAIN,0,0 +2933,TRAIN,0,0 +2934,TRAIN,0,0 +2935,TRAIN,0,0 +2936,TRAIN,0,0 +2937,TRAIN,0,0 +2938,TRAIN,0,0 +2939,TRAIN,0,0 +2940,TRAIN,0,0 +2941,TRAIN,0,0 +2942,TRAIN,0,0 +2943,TRAIN,0,0 +2944,TRAIN,0,0 +2945,TRAIN,0,0 +2946,TRAIN,0,0 
+2947,TRAIN,0,0 +2948,TRAIN,0,0 +2949,TRAIN,0,0 +2950,TRAIN,0,0 +2951,TRAIN,0,0 +2952,TRAIN,0,0 +2953,TRAIN,0,0 +2954,TRAIN,0,0 +2955,TRAIN,0,0 +2956,TRAIN,0,0 +2957,TRAIN,0,0 +2958,TRAIN,0,0 +2959,TRAIN,0,0 +2960,TRAIN,0,0 +2961,TRAIN,0,0 +2962,TRAIN,0,0 +2963,TRAIN,0,0 +2964,TRAIN,0,0 +2965,TRAIN,0,0 +2966,TRAIN,0,0 +2967,TRAIN,0,0 +2968,TRAIN,0,0 +2969,TRAIN,0,0 +2970,TRAIN,0,0 +2971,TRAIN,0,0 +2972,TRAIN,0,0 +2973,TRAIN,0,0 +2974,TRAIN,0,0 +2975,TRAIN,0,0 +2976,TRAIN,0,0 +2977,TRAIN,0,0 +2978,TRAIN,0,0 +2979,TRAIN,0,0 +2980,TRAIN,0,0 +2981,TRAIN,0,0 +2982,TRAIN,0,0 +2983,TRAIN,0,0 +2984,TRAIN,0,0 +2985,TRAIN,0,0 +2986,TRAIN,0,0 +2987,TRAIN,0,0 +2988,TRAIN,0,0 +2989,TRAIN,0,0 +2990,TRAIN,0,0 +2991,TRAIN,0,0 +2992,TRAIN,0,0 +2993,TRAIN,0,0 +2994,TRAIN,0,0 +2995,TRAIN,0,0 +2996,TRAIN,0,0 +2997,TRAIN,0,0 +2998,TRAIN,0,0 +2999,TRAIN,0,0 +3000,TRAIN,0,0 +3001,TRAIN,0,0 +3002,TRAIN,0,0 +3003,TRAIN,0,0 +3004,TRAIN,0,0 +3005,TRAIN,0,0 +3006,TRAIN,0,0 +3007,TRAIN,0,0 +3008,TRAIN,0,0 +3009,TRAIN,0,0 +3010,TRAIN,0,0 +3011,TRAIN,0,0 +3012,TRAIN,0,0 +3013,TRAIN,0,0 +3014,TRAIN,0,0 +3015,TRAIN,0,0 +3016,TRAIN,0,0 +3017,TRAIN,0,0 +3018,TRAIN,0,0 +3019,TRAIN,0,0 +3020,TRAIN,0,0 +3021,TRAIN,0,0 +3022,TRAIN,0,0 +3023,TRAIN,0,0 +3024,TRAIN,0,0 +3025,TRAIN,0,0 +3026,TRAIN,0,0 +3027,TRAIN,0,0 +3028,TRAIN,0,0 +3029,TRAIN,0,0 +3030,TRAIN,0,0 +3031,TRAIN,0,0 +3032,TRAIN,0,0 +3033,TRAIN,0,0 +3034,TRAIN,0,0 +3035,TRAIN,0,0 +3036,TRAIN,0,0 +3037,TRAIN,0,0 +3038,TRAIN,0,0 +3039,TRAIN,0,0 +3040,TRAIN,0,0 +3041,TRAIN,0,0 +3042,TRAIN,0,0 +3043,TRAIN,0,0 +3044,TRAIN,0,0 +3045,TRAIN,0,0 +3046,TRAIN,0,0 +3047,TRAIN,0,0 +3048,TRAIN,0,0 +3049,TRAIN,0,0 +3050,TRAIN,0,0 +3051,TRAIN,0,0 +3052,TRAIN,0,0 +3053,TRAIN,0,0 +3054,TRAIN,0,0 +3055,TRAIN,0,0 +3056,TRAIN,0,0 +3057,TRAIN,0,0 +3058,TRAIN,0,0 +3059,TRAIN,0,0 +3060,TRAIN,0,0 +3061,TRAIN,0,0 +3062,TRAIN,0,0 +3063,TRAIN,0,0 +3064,TRAIN,0,0 +3065,TRAIN,0,0 +3066,TRAIN,0,0 +3067,TRAIN,0,0 +3068,TRAIN,0,0 +3069,TRAIN,0,0 +3070,TRAIN,0,0 +3071,TRAIN,0,0 +3072,TRAIN,0,0 +3073,TRAIN,0,0 +3074,TRAIN,0,0 +3075,TRAIN,0,0 +3076,TRAIN,0,0 +3077,TRAIN,0,0 +3078,TRAIN,0,0 +3079,TRAIN,0,0 +3080,TRAIN,0,0 +3081,TRAIN,0,0 +3082,TRAIN,0,0 +3083,TRAIN,0,0 +3084,TRAIN,0,0 +3085,TRAIN,0,0 +3086,TRAIN,0,0 +3087,TRAIN,0,0 +3088,TRAIN,0,0 +3089,TRAIN,0,0 +3090,TRAIN,0,0 +3091,TRAIN,0,0 +3092,TRAIN,0,0 +3093,TRAIN,0,0 +3094,TRAIN,0,0 +3095,TRAIN,0,0 +3096,TRAIN,0,0 +3097,TRAIN,0,0 +3098,TRAIN,0,0 +3099,TRAIN,0,0 +3100,TRAIN,0,0 +3101,TRAIN,0,0 +3102,TRAIN,0,0 +3103,TRAIN,0,0 +3104,TRAIN,0,0 +3105,TRAIN,0,0 +3106,TRAIN,0,0 +3107,TRAIN,0,0 +3108,TRAIN,0,0 +3109,TRAIN,0,0 +3110,TRAIN,0,0 +3111,TRAIN,0,0 +3112,TRAIN,0,0 +3113,TRAIN,0,0 +3114,TRAIN,0,0 +3115,TRAIN,0,0 +3116,TRAIN,0,0 +3117,TRAIN,0,0 +3118,TRAIN,0,0 +3119,TRAIN,0,0 +3120,TRAIN,0,0 +3121,TRAIN,0,0 +3122,TRAIN,0,0 +3123,TRAIN,0,0 +3124,TRAIN,0,0 +3125,TRAIN,0,0 +3126,TRAIN,0,0 +3127,TRAIN,0,0 +3128,TRAIN,0,0 +3129,TRAIN,0,0 +3130,TRAIN,0,0 +3131,TRAIN,0,0 +3132,TRAIN,0,0 +3133,TRAIN,0,0 +3134,TRAIN,0,0 +3135,TRAIN,0,0 +3136,TRAIN,0,0 +3137,TRAIN,0,0 +3138,TRAIN,0,0 +3139,TRAIN,0,0 +3140,TRAIN,0,0 +3141,TRAIN,0,0 +3142,TRAIN,0,0 +3143,TRAIN,0,0 +3144,TRAIN,0,0 +3145,TRAIN,0,0 +3146,TRAIN,0,0 +3147,TRAIN,0,0 +3148,TRAIN,0,0 +3149,TRAIN,0,0 +3150,TRAIN,0,0 +3151,TRAIN,0,0 +3152,TRAIN,0,0 +3153,TRAIN,0,0 +3154,TRAIN,0,0 +3155,TRAIN,0,0 +3156,TRAIN,0,0 +3157,TRAIN,0,0 +3158,TRAIN,0,0 +3159,TRAIN,0,0 +3160,TRAIN,0,0 +3161,TRAIN,0,0 +3162,TRAIN,0,0 +3163,TRAIN,0,0 +3164,TRAIN,0,0 +3165,TRAIN,0,0 +3166,TRAIN,0,0 +3167,TRAIN,0,0 +3168,TRAIN,0,0 
+3169,TRAIN,0,0 +3170,TRAIN,0,0 +3171,TRAIN,0,0 +3172,TRAIN,0,0 +3173,TRAIN,0,0 +3174,TRAIN,0,0 +3175,TRAIN,0,0 +3176,TRAIN,0,0 +3177,TRAIN,0,0 +3178,TRAIN,0,0 +3179,TRAIN,0,0 +3180,TRAIN,0,0 +3181,TRAIN,0,0 +3182,TRAIN,0,0 +3183,TRAIN,0,0 +3184,TRAIN,0,0 +3185,TRAIN,0,0 +3186,TRAIN,0,0 +3187,TRAIN,0,0 +3188,TRAIN,0,0 +3189,TRAIN,0,0 +3190,TRAIN,0,0 +3191,TRAIN,0,0 +3192,TRAIN,0,0 +3193,TRAIN,0,0 +3194,TRAIN,0,0 +3195,TRAIN,0,0 +3196,TRAIN,0,0 +3197,TRAIN,0,0 +3198,TRAIN,0,0 +3199,TRAIN,0,0 +3200,TRAIN,0,0 +3201,TRAIN,0,0 +3202,TRAIN,0,0 +3203,TRAIN,0,0 +3204,TRAIN,0,0 +3205,TRAIN,0,0 +3206,TRAIN,0,0 +3207,TRAIN,0,0 +3208,TRAIN,0,0 +3209,TRAIN,0,0 +3210,TRAIN,0,0 +3211,TRAIN,0,0 +3212,TRAIN,0,0 +3213,TRAIN,0,0 +3214,TRAIN,0,0 +3215,TRAIN,0,0 +3216,TRAIN,0,0 +3217,TRAIN,0,0 +3218,TRAIN,0,0 +3219,TRAIN,0,0 +3220,TRAIN,0,0 +3221,TRAIN,0,0 +3222,TRAIN,0,0 +3223,TRAIN,0,0 +3224,TRAIN,0,0 +3225,TRAIN,0,0 +3226,TRAIN,0,0 +3227,TRAIN,0,0 +3228,TRAIN,0,0 +3229,TRAIN,0,0 +3230,TRAIN,0,0 +3231,TRAIN,0,0 +3232,TRAIN,0,0 +3233,TRAIN,0,0 +3234,TRAIN,0,0 +3235,TRAIN,0,0 +3236,TRAIN,0,0 +3237,TRAIN,0,0 +3238,TRAIN,0,0 +3239,TRAIN,0,0 +3240,TRAIN,0,0 +3241,TRAIN,0,0 +3242,TRAIN,0,0 +3243,TRAIN,0,0 +3244,TRAIN,0,0 +3245,TRAIN,0,0 +3246,TRAIN,0,0 +3247,TRAIN,0,0 +3248,TRAIN,0,0 +3249,TRAIN,0,0 +3250,TRAIN,0,0 +3251,TRAIN,0,0 +3252,TRAIN,0,0 +3253,TRAIN,0,0 +3254,TRAIN,0,0 +3255,TRAIN,0,0 +3256,TRAIN,0,0 +3257,TRAIN,0,0 +3258,TRAIN,0,0 +3259,TRAIN,0,0 +3260,TRAIN,0,0 +3261,TRAIN,0,0 +3262,TRAIN,0,0 +3263,TRAIN,0,0 +3264,TRAIN,0,0 +3265,TRAIN,0,0 +3266,TRAIN,0,0 +3267,TRAIN,0,0 +3268,TRAIN,0,0 +3269,TRAIN,0,0 +3270,TRAIN,0,0 +3271,TRAIN,0,0 +3272,TRAIN,0,0 +3273,TRAIN,0,0 +3274,TRAIN,0,0 +3275,TRAIN,0,0 +3276,TRAIN,0,0 +3277,TRAIN,0,0 +3278,TRAIN,0,0 +3279,TRAIN,0,0 +3280,TRAIN,0,0 +3281,TRAIN,0,0 +3282,TRAIN,0,0 +3283,TRAIN,0,0 +3284,TRAIN,0,0 +3285,TRAIN,0,0 +3286,TRAIN,0,0 +3287,TRAIN,0,0 +3288,TRAIN,0,0 +3289,TRAIN,0,0 +3290,TRAIN,0,0 +3291,TRAIN,0,0 +3292,TRAIN,0,0 +3293,TRAIN,0,0 +3294,TRAIN,0,0 +3295,TRAIN,0,0 +3296,TRAIN,0,0 +3297,TRAIN,0,0 +3298,TRAIN,0,0 +3299,TRAIN,0,0 +3300,TRAIN,0,0 +3301,TRAIN,0,0 +3302,TRAIN,0,0 +3303,TRAIN,0,0 +3304,TRAIN,0,0 +3305,TRAIN,0,0 +3306,TRAIN,0,0 +3307,TRAIN,0,0 +3308,TRAIN,0,0 +3309,TRAIN,0,0 +3310,TRAIN,0,0 +3311,TRAIN,0,0 +3312,TRAIN,0,0 +3313,TRAIN,0,0 +3314,TRAIN,0,0 +3315,TRAIN,0,0 +3316,TRAIN,0,0 +3317,TRAIN,0,0 +3318,TRAIN,0,0 +3319,TRAIN,0,0 +3320,TRAIN,0,0 +3321,TRAIN,0,0 +3322,TRAIN,0,0 +3323,TRAIN,0,0 +3324,TRAIN,0,0 +3325,TRAIN,0,0 +3326,TRAIN,0,0 +3327,TRAIN,0,0 +3328,TRAIN,0,0 +3329,TRAIN,0,0 +3330,TRAIN,0,0 +3331,TRAIN,0,0 +3332,TRAIN,0,0 +3333,TRAIN,0,0 +3334,TRAIN,0,0 +3335,TRAIN,0,0 +3336,TRAIN,0,0 +3337,TRAIN,0,0 +3338,TRAIN,0,0 +3339,TRAIN,0,0 +3340,TRAIN,0,0 +3341,TRAIN,0,0 +3342,TRAIN,0,0 +3343,TRAIN,0,0 +3344,TRAIN,0,0 +3345,TRAIN,0,0 +3346,TRAIN,0,0 +3347,TRAIN,0,0 +3348,TRAIN,0,0 +3349,TRAIN,0,0 +3350,TRAIN,0,0 +3351,TRAIN,0,0 +3352,TRAIN,0,0 +3353,TRAIN,0,0 +3354,TRAIN,0,0 +3355,TRAIN,0,0 +3356,TRAIN,0,0 +3357,TRAIN,0,0 +3358,TRAIN,0,0 +3359,TRAIN,0,0 +3360,TRAIN,0,0 +3361,TRAIN,0,0 +3362,TRAIN,0,0 +3363,TRAIN,0,0 +3364,TRAIN,0,0 +3365,TRAIN,0,0 +3366,TRAIN,0,0 +3367,TRAIN,0,0 +3368,TRAIN,0,0 +3369,TRAIN,0,0 +3370,TRAIN,0,0 +3371,TRAIN,0,0 +3372,TRAIN,0,0 +3373,TRAIN,0,0 +3374,TRAIN,0,0 +3375,TRAIN,0,0 +3376,TRAIN,0,0 +3377,TRAIN,0,0 +3378,TRAIN,0,0 +3379,TRAIN,0,0 +3380,TRAIN,0,0 +3381,TRAIN,0,0 +3382,TRAIN,0,0 +3383,TRAIN,0,0 +3384,TRAIN,0,0 +3385,TRAIN,0,0 +3386,TRAIN,0,0 +3387,TRAIN,0,0 +3388,TRAIN,0,0 +3389,TRAIN,0,0 +3390,TRAIN,0,0 
+3391,TRAIN,0,0 +3392,TRAIN,0,0 +3393,TRAIN,0,0 +3394,TRAIN,0,0 +3395,TRAIN,0,0 +3396,TRAIN,0,0 +3397,TRAIN,0,0 +3398,TRAIN,0,0 +3399,TRAIN,0,0 +3400,TRAIN,0,0 +3401,TRAIN,0,0 +3402,TRAIN,0,0 +3403,TRAIN,0,0 +3404,TRAIN,0,0 +3405,TRAIN,0,0 +3406,TRAIN,0,0 +3407,TRAIN,0,0 +3408,TRAIN,0,0 +3409,TRAIN,0,0 +3410,TRAIN,0,0 +3411,TRAIN,0,0 +3412,TRAIN,0,0 +3413,TRAIN,0,0 +3414,TRAIN,0,0 +3415,TRAIN,0,0 +3416,TRAIN,0,0 +3417,TRAIN,0,0 +3418,TRAIN,0,0 +3419,TRAIN,0,0 +3420,TRAIN,0,0 +3421,TRAIN,0,0 +3422,TRAIN,0,0 +3423,TRAIN,0,0 +3424,TRAIN,0,0 +3425,TRAIN,0,0 +3426,TRAIN,0,0 +3427,TRAIN,0,0 +3428,TRAIN,0,0 +3429,TRAIN,0,0 +3430,TRAIN,0,0 +3431,TRAIN,0,0 +3432,TRAIN,0,0 +3433,TRAIN,0,0 +3434,TRAIN,0,0 +3435,TRAIN,0,0 +3436,TRAIN,0,0 +3437,TRAIN,0,0 +3438,TRAIN,0,0 +3439,TRAIN,0,0 +3440,TRAIN,0,0 +3441,TRAIN,0,0 +3442,TRAIN,0,0 +3443,TRAIN,0,0 +3444,TRAIN,0,0 +3445,TRAIN,0,0 +3446,TRAIN,0,0 +3447,TRAIN,0,0 +3448,TRAIN,0,0 +3449,TRAIN,0,0 +3450,TRAIN,0,0 +3451,TRAIN,0,0 +3452,TRAIN,0,0 +3453,TRAIN,0,0 +3454,TRAIN,0,0 +3455,TRAIN,0,0 +3456,TRAIN,0,0 +3457,TRAIN,0,0 +3458,TRAIN,0,0 +3459,TRAIN,0,0 +3460,TRAIN,0,0 +3461,TRAIN,0,0 +3462,TRAIN,0,0 +3463,TRAIN,0,0 +3464,TRAIN,0,0 +3465,TRAIN,0,0 +3466,TRAIN,0,0 +3467,TRAIN,0,0 +3468,TRAIN,0,0 +3469,TRAIN,0,0 +3470,TRAIN,0,0 +3471,TRAIN,0,0 +3472,TRAIN,0,0 +3473,TRAIN,0,0 +3474,TRAIN,0,0 +3475,TRAIN,0,0 +3476,TRAIN,0,0 +3477,TRAIN,0,0 +3478,TRAIN,0,0 +3479,TRAIN,0,0 +3480,TRAIN,0,0 +3481,TRAIN,0,0 +3482,TRAIN,0,0 +3483,TRAIN,0,0 +3484,TRAIN,0,0 +3485,TRAIN,0,0 +3486,TRAIN,0,0 +3487,TRAIN,0,0 +3488,TRAIN,0,0 +3489,TRAIN,0,0 +3490,TRAIN,0,0 +3491,TRAIN,0,0 +3492,TRAIN,0,0 +3493,TRAIN,0,0 +3494,TRAIN,0,0 +3495,TRAIN,0,0 +3496,TRAIN,0,0 +3497,TRAIN,0,0 +3498,TRAIN,0,0 +3499,TRAIN,0,0 +3500,TRAIN,0,0 +3501,TRAIN,0,0 +3502,TRAIN,0,0 +3503,TRAIN,0,0 +3504,TRAIN,0,0 +3505,TRAIN,0,0 +3506,TRAIN,0,0 +3507,TRAIN,0,0 +3508,TRAIN,0,0 +3509,TRAIN,0,0 +3510,TRAIN,0,0 +3511,TRAIN,0,0 +3512,TRAIN,0,0 +3513,TRAIN,0,0 +3514,TRAIN,0,0 +3515,TRAIN,0,0 +3516,TRAIN,0,0 +3517,TRAIN,0,0 +3518,TRAIN,0,0 +3519,TRAIN,0,0 +3520,TRAIN,0,0 +3521,TRAIN,0,0 +3522,TRAIN,0,0 +3523,TRAIN,0,0 +3524,TRAIN,0,0 +3525,TRAIN,0,0 +3526,TRAIN,0,0 +3527,TRAIN,0,0 +3528,TRAIN,0,0 +3529,TRAIN,0,0 +3530,TRAIN,0,0 +3531,TRAIN,0,0 +3532,TRAIN,0,0 +3533,TRAIN,0,0 +3534,TRAIN,0,0 +3535,TRAIN,0,0 +3536,TRAIN,0,0 +3537,TRAIN,0,0 +3538,TRAIN,0,0 +3539,TRAIN,0,0 +3540,TRAIN,0,0 +3541,TRAIN,0,0 +3542,TRAIN,0,0 +3543,TRAIN,0,0 +3544,TRAIN,0,0 +3545,TRAIN,0,0 +3546,TRAIN,0,0 +3547,TRAIN,0,0 +3548,TRAIN,0,0 +3549,TRAIN,0,0 +3550,TRAIN,0,0 +3551,TRAIN,0,0 +3552,TRAIN,0,0 +3553,TRAIN,0,0 +3554,TRAIN,0,0 +3555,TRAIN,0,0 +3556,TRAIN,0,0 +3557,TRAIN,0,0 +3558,TRAIN,0,0 +3559,TRAIN,0,0 +3560,TRAIN,0,0 +3561,TRAIN,0,0 +3562,TRAIN,0,0 +3563,TRAIN,0,0 +3564,TRAIN,0,0 +3565,TRAIN,0,0 +3566,TRAIN,0,0 +3567,TRAIN,0,0 +3568,TRAIN,0,0 +3569,TRAIN,0,0 +3570,TRAIN,0,0 +3571,TRAIN,0,0 +3572,TRAIN,0,0 +3573,TRAIN,0,0 +3574,TRAIN,0,0 +3575,TRAIN,0,0 +3576,TRAIN,0,0 +3577,TRAIN,0,0 +3578,TRAIN,0,0 +3579,TRAIN,0,0 +3580,TRAIN,0,0 +3581,TRAIN,0,0 +3582,TRAIN,0,0 +3583,TRAIN,0,0 +3584,TRAIN,0,0 +3585,TRAIN,0,0 +3586,TRAIN,0,0 +3587,TRAIN,0,0 +3588,TRAIN,0,0 +3589,TRAIN,0,0 +3590,TRAIN,0,0 +3591,TRAIN,0,0 +3592,TRAIN,0,0 +3593,TRAIN,0,0 +3594,TRAIN,0,0 +3595,TRAIN,0,0 +3596,TRAIN,0,0 +3597,TRAIN,0,0 +3598,TRAIN,0,0 +3599,TRAIN,0,0 +3600,TRAIN,0,0 +3601,TRAIN,0,0 +3602,TRAIN,0,0 +3603,TRAIN,0,0 +3604,TRAIN,0,0 +3605,TRAIN,0,0 +3606,TRAIN,0,0 +3607,TRAIN,0,0 +3608,TRAIN,0,0 +3609,TRAIN,0,0 +3610,TRAIN,0,0 +3611,TRAIN,0,0 +3612,TRAIN,0,0 
+3613,TRAIN,0,0 +3614,TRAIN,0,0 +3615,TRAIN,0,0 +3616,TRAIN,0,0 +3617,TRAIN,0,0 +3618,TRAIN,0,0 +3619,TRAIN,0,0 +3620,TRAIN,0,0 +3621,TRAIN,0,0 +3622,TRAIN,0,0 +3623,TRAIN,0,0 +3624,TRAIN,0,0 +3625,TRAIN,0,0 +3626,TRAIN,0,0 +3627,TRAIN,0,0 +3628,TRAIN,0,0 +3629,TRAIN,0,0 +3630,TRAIN,0,0 +3631,TRAIN,0,0 +3632,TRAIN,0,0 +3633,TRAIN,0,0 +3634,TRAIN,0,0 +3635,TRAIN,0,0 +3636,TRAIN,0,0 +3637,TRAIN,0,0 +3638,TRAIN,0,0 +3639,TRAIN,0,0 +3640,TRAIN,0,0 +3641,TRAIN,0,0 +3642,TRAIN,0,0 +3643,TRAIN,0,0 +3644,TRAIN,0,0 +3645,TRAIN,0,0 +3646,TRAIN,0,0 +3647,TRAIN,0,0 +3648,TRAIN,0,0 +3649,TRAIN,0,0 +3650,TRAIN,0,0 +3651,TRAIN,0,0 +3652,TRAIN,0,0 +3653,TRAIN,0,0 +3654,TRAIN,0,0 +3655,TRAIN,0,0 +3656,TRAIN,0,0 +3657,TRAIN,0,0 +3658,TRAIN,0,0 +3659,TRAIN,0,0 +3660,TRAIN,0,0 +3661,TRAIN,0,0 +3662,TRAIN,0,0 +3663,TRAIN,0,0 +3664,TRAIN,0,0 +3665,TRAIN,0,0 +3666,TRAIN,0,0 +3667,TRAIN,0,0 +3668,TRAIN,0,0 +3669,TRAIN,0,0 +3670,TRAIN,0,0 +3671,TRAIN,0,0 +3672,TRAIN,0,0 +3673,TRAIN,0,0 +3674,TRAIN,0,0 +3675,TRAIN,0,0 +3676,TRAIN,0,0 +3677,TRAIN,0,0 +3678,TRAIN,0,0 +3679,TRAIN,0,0 +3680,TRAIN,0,0 +3681,TRAIN,0,0 +3682,TRAIN,0,0 +3683,TRAIN,0,0 +3684,TRAIN,0,0 +3685,TRAIN,0,0 +3686,TRAIN,0,0 +3687,TRAIN,0,0 +3688,TRAIN,0,0 +3689,TRAIN,0,0 +3690,TRAIN,0,0 +3691,TRAIN,0,0 +3692,TRAIN,0,0 +3693,TRAIN,0,0 +3694,TRAIN,0,0 +3695,TRAIN,0,0 +3696,TRAIN,0,0 +3697,TRAIN,0,0 +3698,TRAIN,0,0 +3699,TRAIN,0,0 +3700,TRAIN,0,0 +3701,TRAIN,0,0 +3702,TRAIN,0,0 +3703,TRAIN,0,0 +3704,TRAIN,0,0 +3705,TRAIN,0,0 +3706,TRAIN,0,0 +3707,TRAIN,0,0 +3708,TRAIN,0,0 +3709,TRAIN,0,0 +3710,TRAIN,0,0 +3711,TRAIN,0,0 +3712,TRAIN,0,0 +3713,TRAIN,0,0 +3714,TRAIN,0,0 +3715,TRAIN,0,0 +3716,TRAIN,0,0 +3717,TRAIN,0,0 +3718,TRAIN,0,0 +3719,TRAIN,0,0 +3720,TRAIN,0,0 +3721,TRAIN,0,0 +3722,TRAIN,0,0 +3723,TRAIN,0,0 +3724,TRAIN,0,0 +3725,TRAIN,0,0 +3726,TRAIN,0,0 +3727,TRAIN,0,0 +3728,TRAIN,0,0 +3729,TRAIN,0,0 +3730,TRAIN,0,0 +3731,TRAIN,0,0 +3732,TRAIN,0,0 +3733,TRAIN,0,0 +3734,TRAIN,0,0 +3735,TRAIN,0,0 +3736,TRAIN,0,0 +3737,TRAIN,0,0 +3738,TRAIN,0,0 +3739,TRAIN,0,0 +3740,TRAIN,0,0 +3741,TRAIN,0,0 +3742,TRAIN,0,0 +3743,TRAIN,0,0 +3744,TRAIN,0,0 +3745,TRAIN,0,0 +3746,TRAIN,0,0 +3747,TRAIN,0,0 +3748,TRAIN,0,0 +3749,TRAIN,0,0 +3750,TRAIN,0,0 +3751,TRAIN,0,0 +3752,TRAIN,0,0 +3753,TRAIN,0,0 +3754,TRAIN,0,0 +3755,TRAIN,0,0 +3756,TRAIN,0,0 +3757,TRAIN,0,0 +3758,TRAIN,0,0 +3759,TRAIN,0,0 +3760,TRAIN,0,0 +3761,TRAIN,0,0 +3762,TRAIN,0,0 +3763,TRAIN,0,0 +3764,TRAIN,0,0 +3765,TRAIN,0,0 +3766,TRAIN,0,0 +3767,TRAIN,0,0 +3768,TRAIN,0,0 +3769,TRAIN,0,0 +3770,TRAIN,0,0 +3771,TRAIN,0,0 +3772,TRAIN,0,0 +3773,TRAIN,0,0 +3774,TRAIN,0,0 +3775,TRAIN,0,0 +3776,TRAIN,0,0 +3777,TRAIN,0,0 +3778,TRAIN,0,0 +3779,TRAIN,0,0 +3780,TRAIN,0,0 +3781,TRAIN,0,0 +3782,TRAIN,0,0 +3783,TRAIN,0,0 +3784,TRAIN,0,0 +3785,TRAIN,0,0 +3786,TRAIN,0,0 +3787,TRAIN,0,0 +3788,TRAIN,0,0 +3789,TRAIN,0,0 +3790,TRAIN,0,0 +3791,TRAIN,0,0 +3792,TRAIN,0,0 +3793,TRAIN,0,0 +3794,TRAIN,0,0 +3795,TRAIN,0,0 +3796,TRAIN,0,0 +3797,TRAIN,0,0 +3798,TRAIN,0,0 +3799,TRAIN,0,0 +3800,TRAIN,0,0 +3801,TRAIN,0,0 +3802,TRAIN,0,0 +3803,TRAIN,0,0 +3804,TRAIN,0,0 +3805,TRAIN,0,0 +3806,TRAIN,0,0 +3807,TRAIN,0,0 +3808,TRAIN,0,0 +3809,TRAIN,0,0 +3810,TRAIN,0,0 +3811,TRAIN,0,0 +3812,TRAIN,0,0 +3813,TRAIN,0,0 +3814,TRAIN,0,0 +3815,TRAIN,0,0 +3816,TRAIN,0,0 +3817,TRAIN,0,0 +3818,TRAIN,0,0 +3819,TRAIN,0,0 +3820,TRAIN,0,0 +3821,TRAIN,0,0 +3822,TRAIN,0,0 +3823,TRAIN,0,0 +3824,TRAIN,0,0 +3825,TRAIN,0,0 +3826,TRAIN,0,0 +3827,TRAIN,0,0 +3828,TRAIN,0,0 +3829,TRAIN,0,0 +3830,TRAIN,0,0 +3831,TRAIN,0,0 +3832,TRAIN,0,0 +3833,TRAIN,0,0 +3834,TRAIN,0,0 
+3835,TRAIN,0,0 +3836,TRAIN,0,0 +3837,TRAIN,0,0 +3838,TRAIN,0,0 +3839,TRAIN,0,0 +3840,TRAIN,0,0 +3841,TRAIN,0,0 +3842,TRAIN,0,0 +3843,TRAIN,0,0 +3844,TRAIN,0,0 +3845,TRAIN,0,0 +3846,TRAIN,0,0 +3847,TRAIN,0,0 +3848,TRAIN,0,0 +3849,TRAIN,0,0 +3850,TRAIN,0,0 +3851,TRAIN,0,0 +3852,TRAIN,0,0 +3853,TRAIN,0,0 +3854,TRAIN,0,0 +3855,TRAIN,0,0 +3856,TRAIN,0,0 +3857,TRAIN,0,0 +3858,TRAIN,0,0 +3859,TRAIN,0,0 +3860,TRAIN,0,0 +3861,TRAIN,0,0 +3862,TRAIN,0,0 +3863,TRAIN,0,0 +3864,TRAIN,0,0 +3865,TRAIN,0,0 +3866,TRAIN,0,0 +3867,TRAIN,0,0 +3868,TRAIN,0,0 +3869,TRAIN,0,0 +3870,TRAIN,0,0 +3871,TRAIN,0,0 +3872,TRAIN,0,0 +3873,TRAIN,0,0 +3874,TRAIN,0,0 +3875,TRAIN,0,0 +3876,TRAIN,0,0 +3877,TRAIN,0,0 +3878,TRAIN,0,0 +3879,TRAIN,0,0 +3880,TRAIN,0,0 +3881,TRAIN,0,0 +3882,TRAIN,0,0 +3883,TRAIN,0,0 +3884,TRAIN,0,0 +3885,TRAIN,0,0 +3886,TRAIN,0,0 +3887,TRAIN,0,0 +3888,TRAIN,0,0 +3889,TRAIN,0,0 +3890,TRAIN,0,0 +3891,TRAIN,0,0 +3892,TRAIN,0,0 +3893,TRAIN,0,0 +3894,TRAIN,0,0 +3895,TRAIN,0,0 +3896,TRAIN,0,0 +3897,TRAIN,0,0 +3898,TRAIN,0,0 +3899,TRAIN,0,0 +3900,TRAIN,0,0 +3901,TRAIN,0,0 +3902,TRAIN,0,0 +3903,TRAIN,0,0 +3904,TRAIN,0,0 +3905,TRAIN,0,0 +3906,TRAIN,0,0 +3907,TRAIN,0,0 +3908,TRAIN,0,0 +3909,TRAIN,0,0 +3910,TRAIN,0,0 +3911,TRAIN,0,0 +3912,TRAIN,0,0 +3913,TRAIN,0,0 +3914,TRAIN,0,0 +3915,TRAIN,0,0 +3916,TRAIN,0,0 +3917,TRAIN,0,0 +3918,TRAIN,0,0 +3919,TRAIN,0,0 +3920,TRAIN,0,0 +3921,TRAIN,0,0 +3922,TRAIN,0,0 +3923,TRAIN,0,0 +3924,TRAIN,0,0 +3925,TRAIN,0,0 +3926,TRAIN,0,0 +3927,TRAIN,0,0 +3928,TRAIN,0,0 +3929,TRAIN,0,0 +3930,TRAIN,0,0 +3931,TRAIN,0,0 +3932,TRAIN,0,0 +3933,TRAIN,0,0 +3934,TRAIN,0,0 +3935,TRAIN,0,0 +3936,TRAIN,0,0 +3937,TRAIN,0,0 +3938,TRAIN,0,0 +3939,TRAIN,0,0 +3940,TRAIN,0,0 +3941,TRAIN,0,0 +3942,TRAIN,0,0 +3943,TRAIN,0,0 +3944,TRAIN,0,0 +3945,TRAIN,0,0 +3946,TRAIN,0,0 +3947,TRAIN,0,0 +3948,TRAIN,0,0 +3949,TRAIN,0,0 +3950,TRAIN,0,0 +3951,TRAIN,0,0 +3952,TRAIN,0,0 +3953,TRAIN,0,0 +3954,TRAIN,0,0 +3955,TRAIN,0,0 +3956,TRAIN,0,0 +3957,TRAIN,0,0 +3958,TRAIN,0,0 +3959,TRAIN,0,0 +3960,TRAIN,0,0 +3961,TRAIN,0,0 +3962,TRAIN,0,0 +3963,TRAIN,0,0 +3964,TRAIN,0,0 +3965,TRAIN,0,0 +3966,TRAIN,0,0 +3967,TRAIN,0,0 +3968,TRAIN,0,0 +3969,TRAIN,0,0 +3970,TRAIN,0,0 +3971,TRAIN,0,0 +3972,TRAIN,0,0 +3973,TRAIN,0,0 +3974,TRAIN,0,0 +3975,TRAIN,0,0 +3976,TRAIN,0,0 +3977,TRAIN,0,0 +3978,TRAIN,0,0 +3979,TRAIN,0,0 +3980,TRAIN,0,0 +3981,TRAIN,0,0 +3982,TRAIN,0,0 +3983,TRAIN,0,0 +3984,TRAIN,0,0 +3985,TRAIN,0,0 +3986,TRAIN,0,0 +3987,TRAIN,0,0 +3988,TRAIN,0,0 +3989,TRAIN,0,0 +3990,TRAIN,0,0 +3991,TRAIN,0,0 +3992,TRAIN,0,0 +3993,TRAIN,0,0 +3994,TRAIN,0,0 +3995,TRAIN,0,0 +3996,TRAIN,0,0 +3997,TRAIN,0,0 +3998,TRAIN,0,0 +3999,TRAIN,0,0 +4000,TRAIN,0,0 +4001,TRAIN,0,0 +4002,TRAIN,0,0 +4003,TRAIN,0,0 +4004,TRAIN,0,0 +4005,TRAIN,0,0 +4006,TRAIN,0,0 +4007,TRAIN,0,0 +4008,TRAIN,0,0 +4009,TRAIN,0,0 +4010,TRAIN,0,0 +4011,TRAIN,0,0 +4012,TRAIN,0,0 +4013,TRAIN,0,0 +4014,TRAIN,0,0 +4015,TRAIN,0,0 +4016,TRAIN,0,0 +4017,TRAIN,0,0 +4018,TRAIN,0,0 +4019,TRAIN,0,0 +4020,TRAIN,0,0 +4021,TRAIN,0,0 +4022,TRAIN,0,0 +4023,TRAIN,0,0 +4024,TRAIN,0,0 +4025,TRAIN,0,0 +4026,TRAIN,0,0 +4027,TRAIN,0,0 +4028,TRAIN,0,0 +4029,TRAIN,0,0 +4030,TRAIN,0,0 +4031,TRAIN,0,0 +4032,TRAIN,0,0 +4033,TRAIN,0,0 +4034,TRAIN,0,0 +4035,TRAIN,0,0 +4036,TRAIN,0,0 +4037,TRAIN,0,0 +4038,TRAIN,0,0 +4039,TRAIN,0,0 +4040,TRAIN,0,0 +4041,TRAIN,0,0 +4042,TRAIN,0,0 +4043,TRAIN,0,0 +4044,TRAIN,0,0 +4045,TRAIN,0,0 +4046,TRAIN,0,0 +4047,TRAIN,0,0 +4048,TRAIN,0,0 +4049,TRAIN,0,0 +4050,TRAIN,0,0 +4051,TRAIN,0,0 +4052,TRAIN,0,0 +4053,TRAIN,0,0 +4054,TRAIN,0,0 +4055,TRAIN,0,0 +4056,TRAIN,0,0 
+4057,TRAIN,0,0 +4058,TRAIN,0,0 +4059,TRAIN,0,0 +4060,TRAIN,0,0 +4061,TRAIN,0,0 +4062,TRAIN,0,0 +4063,TRAIN,0,0 +4064,TRAIN,0,0 +4065,TRAIN,0,0 +4066,TRAIN,0,0 +4067,TRAIN,0,0 +4068,TRAIN,0,0 +4069,TRAIN,0,0 +4070,TRAIN,0,0 +4071,TRAIN,0,0 +4072,TRAIN,0,0 +4073,TRAIN,0,0 +4074,TRAIN,0,0 +4075,TRAIN,0,0 +4076,TRAIN,0,0 +4077,TRAIN,0,0 +4078,TRAIN,0,0 +4079,TRAIN,0,0 +4080,TRAIN,0,0 +4081,TRAIN,0,0 +4082,TRAIN,0,0 +4083,TRAIN,0,0 +4084,TRAIN,0,0 +4085,TRAIN,0,0 +4086,TRAIN,0,0 +4087,TRAIN,0,0 +4088,TRAIN,0,0 +4089,TRAIN,0,0 +4090,TRAIN,0,0 +4091,TRAIN,0,0 +4092,TRAIN,0,0 +4093,TRAIN,0,0 +4094,TRAIN,0,0 +4095,TRAIN,0,0 +4096,TRAIN,0,0 +4097,TRAIN,0,0 +4098,TRAIN,0,0 +4099,TRAIN,0,0 +4100,TRAIN,0,0 +4101,TRAIN,0,0 +4102,TRAIN,0,0 +4103,TRAIN,0,0 +4104,TRAIN,0,0 +4105,TRAIN,0,0 +4106,TRAIN,0,0 +4107,TRAIN,0,0 +4108,TRAIN,0,0 +4109,TRAIN,0,0 +4110,TRAIN,0,0 +4111,TRAIN,0,0 +4112,TRAIN,0,0 +4113,TRAIN,0,0 +4114,TRAIN,0,0 +4115,TRAIN,0,0 +4116,TRAIN,0,0 +4117,TRAIN,0,0 +4118,TRAIN,0,0 +4119,TRAIN,0,0 +4120,TRAIN,0,0 +4121,TRAIN,0,0 +4122,TRAIN,0,0 +4123,TRAIN,0,0 +4124,TRAIN,0,0 +4125,TRAIN,0,0 +4126,TRAIN,0,0 +4127,TRAIN,0,0 +4128,TRAIN,0,0 +4129,TRAIN,0,0 +4130,TRAIN,0,0 +4131,TRAIN,0,0 +4132,TRAIN,0,0 +4133,TRAIN,0,0 +4134,TRAIN,0,0 +4135,TRAIN,0,0 +4136,TRAIN,0,0 +4137,TRAIN,0,0 +4138,TRAIN,0,0 +4139,TRAIN,0,0 +4140,TRAIN,0,0 +4141,TRAIN,0,0 +4142,TRAIN,0,0 +4143,TRAIN,0,0 +4144,TRAIN,0,0 +4145,TRAIN,0,0 +4146,TRAIN,0,0 +4147,TRAIN,0,0 +4148,TRAIN,0,0 +4149,TRAIN,0,0 +4150,TRAIN,0,0 +4151,TRAIN,0,0 +4152,TRAIN,0,0 +4153,TRAIN,0,0 +4154,TRAIN,0,0 +4155,TRAIN,0,0 +4156,TRAIN,0,0 +4157,TRAIN,0,0 +4158,TRAIN,0,0 +4159,TRAIN,0,0 +4160,TRAIN,0,0 +4161,TRAIN,0,0 +4162,TRAIN,0,0 +4163,TRAIN,0,0 +4164,TRAIN,0,0 +4165,TRAIN,0,0 +4166,TRAIN,0,0 +4167,TRAIN,0,0 +4168,TRAIN,0,0 +4169,TRAIN,0,0 +4170,TRAIN,0,0 +4171,TRAIN,0,0 +4172,TRAIN,0,0 +4173,TRAIN,0,0 +4174,TRAIN,0,0 +4175,TRAIN,0,0 +4176,TRAIN,0,0 +4177,TRAIN,0,0 +4178,TRAIN,0,0 +4179,TRAIN,0,0 +4180,TRAIN,0,0 +4181,TRAIN,0,0 +4182,TRAIN,0,0 +4183,TRAIN,0,0 +4184,TRAIN,0,0 +4185,TRAIN,0,0 +4186,TRAIN,0,0 +4187,TRAIN,0,0 +4188,TRAIN,0,0 +4189,TRAIN,0,0 +4190,TRAIN,0,0 +4191,TRAIN,0,0 +4192,TRAIN,0,0 +4193,TRAIN,0,0 +4194,TRAIN,0,0 +4195,TRAIN,0,0 +4196,TRAIN,0,0 +4197,TRAIN,0,0 +4198,TRAIN,0,0 +4199,TRAIN,0,0 +4200,TRAIN,0,0 +4201,TRAIN,0,0 +4202,TRAIN,0,0 +4203,TRAIN,0,0 +4204,TRAIN,0,0 +4205,TRAIN,0,0 +4206,TRAIN,0,0 +4207,TRAIN,0,0 +4208,TRAIN,0,0 +4209,TRAIN,0,0 +4210,TRAIN,0,0 +4211,TRAIN,0,0 +4212,TRAIN,0,0 +4213,TRAIN,0,0 +4214,TRAIN,0,0 +4215,TRAIN,0,0 +4216,TRAIN,0,0 +4217,TRAIN,0,0 +4218,TRAIN,0,0 +4219,TRAIN,0,0 +4220,TRAIN,0,0 +4221,TRAIN,0,0 +4222,TRAIN,0,0 +4223,TRAIN,0,0 +4224,TRAIN,0,0 +4225,TRAIN,0,0 +4226,TRAIN,0,0 +4227,TRAIN,0,0 +4228,TRAIN,0,0 +4229,TRAIN,0,0 +4230,TRAIN,0,0 +4231,TRAIN,0,0 +4232,TRAIN,0,0 +4233,TRAIN,0,0 +4234,TRAIN,0,0 +4235,TRAIN,0,0 +4236,TRAIN,0,0 +4237,TRAIN,0,0 +4238,TRAIN,0,0 +4239,TRAIN,0,0 +4240,TRAIN,0,0 +4241,TRAIN,0,0 +4242,TRAIN,0,0 +4243,TRAIN,0,0 +4244,TRAIN,0,0 +4245,TRAIN,0,0 +4246,TRAIN,0,0 +4247,TRAIN,0,0 +4248,TRAIN,0,0 +4249,TRAIN,0,0 +4250,TRAIN,0,0 +4251,TRAIN,0,0 +4252,TRAIN,0,0 +4253,TRAIN,0,0 +4254,TRAIN,0,0 +4255,TRAIN,0,0 +4256,TRAIN,0,0 +4257,TRAIN,0,0 +4258,TRAIN,0,0 +4259,TRAIN,0,0 +4260,TRAIN,0,0 +4261,TRAIN,0,0 +4262,TRAIN,0,0 +4263,TRAIN,0,0 +4264,TRAIN,0,0 +4265,TRAIN,0,0 +4266,TRAIN,0,0 +4267,TRAIN,0,0 +4268,TRAIN,0,0 +4269,TRAIN,0,0 +4270,TRAIN,0,0 +4271,TRAIN,0,0 +4272,TRAIN,0,0 +4273,TRAIN,0,0 +4274,TRAIN,0,0 +4275,TRAIN,0,0 +4276,TRAIN,0,0 +4277,TRAIN,0,0 +4278,TRAIN,0,0 
+4279,TRAIN,0,0 +4280,TRAIN,0,0 +4281,TRAIN,0,0 +4282,TRAIN,0,0 +4283,TRAIN,0,0 +4284,TRAIN,0,0 +4285,TRAIN,0,0 +4286,TRAIN,0,0 +4287,TRAIN,0,0 +4288,TRAIN,0,0 +4289,TRAIN,0,0 +4290,TRAIN,0,0 +4291,TRAIN,0,0 +4292,TRAIN,0,0 +4293,TRAIN,0,0 +4294,TRAIN,0,0 +4295,TRAIN,0,0 +4296,TRAIN,0,0 +4297,TRAIN,0,0 +4298,TRAIN,0,0 +4299,TRAIN,0,0 +4300,TRAIN,0,0 +4301,TRAIN,0,0 +4302,TRAIN,0,0 +4303,TRAIN,0,0 +4304,TRAIN,0,0 +4305,TRAIN,0,0 +4306,TRAIN,0,0 +4307,TRAIN,0,0 +4308,TRAIN,0,0 +4309,TRAIN,0,0 +4310,TRAIN,0,0 +4311,TRAIN,0,0 +4312,TRAIN,0,0 +4313,TRAIN,0,0 +4314,TRAIN,0,0 +4315,TRAIN,0,0 +4316,TRAIN,0,0 +4317,TRAIN,0,0 +4318,TRAIN,0,0 +4319,TRAIN,0,0 +4320,TRAIN,0,0 +4321,TRAIN,0,0 +4322,TRAIN,0,0 +4323,TRAIN,0,0 +4324,TRAIN,0,0 +4325,TRAIN,0,0 +4326,TRAIN,0,0 +4327,TRAIN,0,0 +4328,TRAIN,0,0 +4329,TRAIN,0,0 +4330,TRAIN,0,0 +4331,TRAIN,0,0 +4332,TRAIN,0,0 +4333,TRAIN,0,0 +4334,TRAIN,0,0 +4335,TRAIN,0,0 +4336,TRAIN,0,0 +4337,TRAIN,0,0 +4338,TRAIN,0,0 +4339,TRAIN,0,0 +4340,TRAIN,0,0 +4341,TRAIN,0,0 +4342,TRAIN,0,0 +4343,TRAIN,0,0 +4344,TRAIN,0,0 +4345,TRAIN,0,0 +4346,TRAIN,0,0 +4347,TRAIN,0,0 +4348,TRAIN,0,0 +4349,TRAIN,0,0 +4350,TRAIN,0,0 +4351,TRAIN,0,0 +4352,TRAIN,0,0 +4353,TRAIN,0,0 +4354,TRAIN,0,0 +4355,TRAIN,0,0 +4356,TRAIN,0,0 +4357,TRAIN,0,0 +4358,TRAIN,0,0 +4359,TRAIN,0,0 +4360,TRAIN,0,0 +4361,TRAIN,0,0 +4362,TRAIN,0,0 +4363,TRAIN,0,0 +4364,TRAIN,0,0 +4365,TRAIN,0,0 +4366,TRAIN,0,0 +4367,TRAIN,0,0 +4368,TRAIN,0,0 +4369,TRAIN,0,0 +4370,TRAIN,0,0 +4371,TRAIN,0,0 +4372,TRAIN,0,0 +4373,TRAIN,0,0 +4374,TRAIN,0,0 +4375,TRAIN,0,0 +4376,TRAIN,0,0 +4377,TRAIN,0,0 +4378,TRAIN,0,0 +4379,TRAIN,0,0 +4380,TRAIN,0,0 +4381,TRAIN,0,0 +4382,TRAIN,0,0 +4383,TRAIN,0,0 +4384,TRAIN,0,0 +4385,TRAIN,0,0 +4386,TRAIN,0,0 +4387,TRAIN,0,0 +4388,TRAIN,0,0 +4389,TRAIN,0,0 +4390,TRAIN,0,0 +4391,TRAIN,0,0 +4392,TRAIN,0,0 +4393,TRAIN,0,0 +4394,TRAIN,0,0 +4395,TRAIN,0,0 +4396,TRAIN,0,0 +4397,TRAIN,0,0 +4398,TRAIN,0,0 +4399,TRAIN,0,0 +4400,TRAIN,0,0 +4401,TRAIN,0,0 +4402,TRAIN,0,0 +4403,TRAIN,0,0 +4404,TRAIN,0,0 +4405,TRAIN,0,0 +4406,TRAIN,0,0 +4407,TRAIN,0,0 +4408,TRAIN,0,0 +4409,TRAIN,0,0 +4410,TRAIN,0,0 +4411,TRAIN,0,0 +4412,TRAIN,0,0 +4413,TRAIN,0,0 +4414,TRAIN,0,0 +4415,TRAIN,0,0 +4416,TRAIN,0,0 +4417,TRAIN,0,0 +4418,TRAIN,0,0 +4419,TRAIN,0,0 +4420,TRAIN,0,0 +4421,TRAIN,0,0 +4422,TRAIN,0,0 +4423,TRAIN,0,0 +4424,TRAIN,0,0 +4425,TRAIN,0,0 +4426,TRAIN,0,0 +4427,TRAIN,0,0 +4428,TRAIN,0,0 +4429,TRAIN,0,0 +4430,TRAIN,0,0 +4431,TRAIN,0,0 +4432,TRAIN,0,0 +4433,TRAIN,0,0 +4434,TRAIN,0,0 +4435,TRAIN,0,0 +4436,TRAIN,0,0 +4437,TRAIN,0,0 +4438,TRAIN,0,0 +4439,TRAIN,0,0 +4440,TRAIN,0,0 +4441,TRAIN,0,0 +4442,TRAIN,0,0 +4443,TRAIN,0,0 +4444,TRAIN,0,0 +4445,TRAIN,0,0 +4446,TRAIN,0,0 +4447,TRAIN,0,0 +4448,TRAIN,0,0 +4449,TRAIN,0,0 +4450,TRAIN,0,0 +4451,TRAIN,0,0 +4452,TRAIN,0,0 +4453,TRAIN,0,0 +4454,TRAIN,0,0 +4455,TRAIN,0,0 +4456,TRAIN,0,0 +4457,TRAIN,0,0 +4458,TRAIN,0,0 +4459,TRAIN,0,0 +4460,TRAIN,0,0 +4461,TRAIN,0,0 +4462,TRAIN,0,0 +4463,TRAIN,0,0 +4464,TRAIN,0,0 +4465,TRAIN,0,0 +4466,TRAIN,0,0 +4467,TRAIN,0,0 +4468,TRAIN,0,0 +4469,TRAIN,0,0 +4470,TRAIN,0,0 +4471,TRAIN,0,0 +4472,TRAIN,0,0 +4473,TRAIN,0,0 +4474,TRAIN,0,0 +4475,TRAIN,0,0 +4476,TRAIN,0,0 +4477,TRAIN,0,0 +4478,TRAIN,0,0 +4479,TRAIN,0,0 +4480,TRAIN,0,0 +4481,TRAIN,0,0 +4482,TRAIN,0,0 +4483,TRAIN,0,0 +4484,TRAIN,0,0 +4485,TRAIN,0,0 +4486,TRAIN,0,0 +4487,TRAIN,0,0 +4488,TRAIN,0,0 +4489,TRAIN,0,0 +4490,TRAIN,0,0 +4491,TRAIN,0,0 +4492,TRAIN,0,0 +4493,TRAIN,0,0 +4494,TRAIN,0,0 +4495,TRAIN,0,0 +4496,TRAIN,0,0 +4497,TRAIN,0,0 +4498,TRAIN,0,0 +4499,TRAIN,0,0 +4500,TRAIN,0,0 
+4501,TRAIN,0,0 +4502,TRAIN,0,0 +4503,TRAIN,0,0 +4504,TRAIN,0,0 +4505,TRAIN,0,0 +4506,TRAIN,0,0 +4507,TRAIN,0,0 +4508,TRAIN,0,0 +4509,TRAIN,0,0 +4510,TRAIN,0,0 +4511,TRAIN,0,0 +4512,TRAIN,0,0 +4513,TRAIN,0,0 +4514,TRAIN,0,0 +4515,TRAIN,0,0 +4516,TRAIN,0,0 +4517,TRAIN,0,0 +4518,TRAIN,0,0 +4519,TRAIN,0,0 +4520,TRAIN,0,0 +4521,TRAIN,0,0 +4522,TRAIN,0,0 +4523,TRAIN,0,0 +4524,TRAIN,0,0 +4525,TRAIN,0,0 +4526,TRAIN,0,0 +4527,TRAIN,0,0 +4528,TRAIN,0,0 +4529,TRAIN,0,0 +4530,TRAIN,0,0 +4531,TRAIN,0,0 +4532,TRAIN,0,0 +4533,TRAIN,0,0 +4534,TRAIN,0,0 +4535,TRAIN,0,0 +4536,TRAIN,0,0 +4537,TRAIN,0,0 +4538,TRAIN,0,0 +4539,TRAIN,0,0 +4540,TRAIN,0,0 +4541,TRAIN,0,0 +4542,TRAIN,0,0 +4543,TRAIN,0,0 +4544,TRAIN,0,0 +4545,TRAIN,0,0 +4546,TRAIN,0,0 +4547,TRAIN,0,0 +4548,TRAIN,0,0 +4549,TRAIN,0,0 +4550,TRAIN,0,0 +4551,TRAIN,0,0 +4552,TRAIN,0,0 +4553,TRAIN,0,0 +4554,TRAIN,0,0 +4555,TRAIN,0,0 +4556,TRAIN,0,0 +4557,TRAIN,0,0 +4558,TRAIN,0,0 +4559,TRAIN,0,0 +4560,TRAIN,0,0 +4561,TRAIN,0,0 +4562,TRAIN,0,0 +4563,TRAIN,0,0 +4564,TRAIN,0,0 +4565,TRAIN,0,0 +4566,TRAIN,0,0 +4567,TRAIN,0,0 +4568,TRAIN,0,0 +4569,TRAIN,0,0 +4570,TRAIN,0,0 +4571,TRAIN,0,0 +4572,TRAIN,0,0 +4573,TRAIN,0,0 +4574,TRAIN,0,0 +4575,TRAIN,0,0 +4576,TRAIN,0,0 +4577,TRAIN,0,0 +4578,TRAIN,0,0 +4579,TRAIN,0,0 +4580,TRAIN,0,0 +4581,TRAIN,0,0 +4582,TRAIN,0,0 +4583,TRAIN,0,0 +4584,TRAIN,0,0 +4585,TRAIN,0,0 +4586,TRAIN,0,0 +4587,TRAIN,0,0 +4588,TRAIN,0,0 +4589,TRAIN,0,0 +4590,TRAIN,0,0 +4591,TRAIN,0,0 +4592,TRAIN,0,0 +4593,TRAIN,0,0 +4594,TRAIN,0,0 +4595,TRAIN,0,0 +4596,TRAIN,0,0 +4597,TRAIN,0,0 +4598,TRAIN,0,0 +4599,TRAIN,0,0 +4600,TRAIN,0,0 +4601,TRAIN,0,0 +4602,TRAIN,0,0 +4603,TRAIN,0,0 +4604,TRAIN,0,0 +4605,TRAIN,0,0 +4606,TRAIN,0,0 +4607,TRAIN,0,0 +4608,TRAIN,0,0 +4609,TRAIN,0,0 +4610,TRAIN,0,0 +4611,TRAIN,0,0 +4612,TRAIN,0,0 +4613,TRAIN,0,0 +4614,TRAIN,0,0 +4615,TRAIN,0,0 +4616,TRAIN,0,0 +4617,TRAIN,0,0 +4618,TRAIN,0,0 +4619,TRAIN,0,0 +4620,TRAIN,0,0 +4621,TRAIN,0,0 +4622,TRAIN,0,0 +4623,TRAIN,0,0 +4624,TRAIN,0,0 +4625,TRAIN,0,0 +4626,TRAIN,0,0 +4627,TRAIN,0,0 +4628,TRAIN,0,0 +4629,TRAIN,0,0 +4630,TRAIN,0,0 +4631,TRAIN,0,0 +4632,TRAIN,0,0 +4633,TRAIN,0,0 +4634,TRAIN,0,0 +4635,TRAIN,0,0 +4636,TRAIN,0,0 +4637,TRAIN,0,0 +4638,TRAIN,0,0 +4639,TRAIN,0,0 +4640,TRAIN,0,0 +4641,TRAIN,0,0 +4642,TRAIN,0,0 +4643,TRAIN,0,0 +4644,TRAIN,0,0 +4645,TRAIN,0,0 +4646,TRAIN,0,0 +4647,TRAIN,0,0 +4648,TRAIN,0,0 +4649,TRAIN,0,0 +4650,TRAIN,0,0 +4651,TRAIN,0,0 +4652,TRAIN,0,0 +4653,TRAIN,0,0 +4654,TRAIN,0,0 +4655,TRAIN,0,0 +4656,TRAIN,0,0 +4657,TRAIN,0,0 +4658,TRAIN,0,0 +4659,TRAIN,0,0 +4660,TRAIN,0,0 +4661,TRAIN,0,0 +4662,TRAIN,0,0 +4663,TRAIN,0,0 +4664,TRAIN,0,0 +4665,TRAIN,0,0 +4666,TRAIN,0,0 +4667,TRAIN,0,0 +4668,TRAIN,0,0 +4669,TRAIN,0,0 +4670,TRAIN,0,0 +4671,TRAIN,0,0 +4672,TRAIN,0,0 +4673,TRAIN,0,0 +4674,TRAIN,0,0 +4675,TRAIN,0,0 +4676,TRAIN,0,0 +4677,TRAIN,0,0 +4678,TRAIN,0,0 +4679,TRAIN,0,0 +4680,TRAIN,0,0 +4681,TRAIN,0,0 +4682,TRAIN,0,0 +4683,TRAIN,0,0 +4684,TRAIN,0,0 +4685,TRAIN,0,0 +4686,TRAIN,0,0 +4687,TRAIN,0,0 +4688,TRAIN,0,0 +4689,TRAIN,0,0 +4690,TRAIN,0,0 +4691,TRAIN,0,0 +4692,TRAIN,0,0 +4693,TRAIN,0,0 +4694,TRAIN,0,0 +4695,TRAIN,0,0 +4696,TRAIN,0,0 +4697,TRAIN,0,0 +4698,TRAIN,0,0 +4699,TRAIN,0,0 +4700,TRAIN,0,0 +4701,TRAIN,0,0 +4702,TRAIN,0,0 +4703,TRAIN,0,0 +4704,TRAIN,0,0 +4705,TRAIN,0,0 +4706,TRAIN,0,0 +4707,TRAIN,0,0 +4708,TRAIN,0,0 +4709,TRAIN,0,0 +4710,TRAIN,0,0 +4711,TRAIN,0,0 +4712,TRAIN,0,0 +4713,TRAIN,0,0 +4714,TRAIN,0,0 +4715,TRAIN,0,0 +4716,TRAIN,0,0 +4717,TRAIN,0,0 +4718,TRAIN,0,0 +4719,TRAIN,0,0 +4720,TRAIN,0,0 +4721,TRAIN,0,0 +4722,TRAIN,0,0 
+4723,TRAIN,0,0 +4724,TRAIN,0,0 +4725,TRAIN,0,0 +4726,TRAIN,0,0 +4727,TRAIN,0,0 +4728,TRAIN,0,0 +4729,TRAIN,0,0 +4730,TRAIN,0,0 +4731,TRAIN,0,0 +4732,TRAIN,0,0 +4733,TRAIN,0,0 +4734,TRAIN,0,0 +4735,TRAIN,0,0 +4736,TRAIN,0,0 +4737,TRAIN,0,0 +4738,TRAIN,0,0 +4739,TRAIN,0,0 +4740,TRAIN,0,0 +4741,TRAIN,0,0 +4742,TRAIN,0,0 +4743,TRAIN,0,0 +4744,TRAIN,0,0 +4745,TRAIN,0,0 +4746,TRAIN,0,0 +4747,TRAIN,0,0 +4748,TRAIN,0,0 +4749,TRAIN,0,0 +4750,TRAIN,0,0 +4751,TRAIN,0,0 +4752,TRAIN,0,0 +4753,TRAIN,0,0 +4754,TRAIN,0,0 +4755,TRAIN,0,0 +4756,TRAIN,0,0 +4757,TRAIN,0,0 +4758,TRAIN,0,0 +4759,TRAIN,0,0 +4760,TRAIN,0,0 +4761,TRAIN,0,0 +4762,TRAIN,0,0 +4763,TRAIN,0,0 +4764,TRAIN,0,0 +4765,TRAIN,0,0 +4766,TRAIN,0,0 +4767,TRAIN,0,0 +4768,TRAIN,0,0 +4769,TRAIN,0,0 +4770,TRAIN,0,0 +4771,TRAIN,0,0 +4772,TRAIN,0,0 +4773,TRAIN,0,0 +4774,TRAIN,0,0 +4775,TRAIN,0,0 +4776,TRAIN,0,0 +4777,TRAIN,0,0 +4778,TRAIN,0,0 +4779,TRAIN,0,0 +4780,TRAIN,0,0 +4781,TRAIN,0,0 +4782,TRAIN,0,0 +4783,TRAIN,0,0 +4784,TRAIN,0,0 +4785,TRAIN,0,0 +4786,TRAIN,0,0 +4787,TRAIN,0,0 +4788,TRAIN,0,0 +4789,TRAIN,0,0 +4790,TRAIN,0,0 +4791,TRAIN,0,0 +4792,TRAIN,0,0 +4793,TRAIN,0,0 +4794,TRAIN,0,0 +4795,TRAIN,0,0 +4796,TRAIN,0,0 +4797,TRAIN,0,0 +4798,TRAIN,0,0 +4799,TRAIN,0,0 +4800,TRAIN,0,0 +4801,TRAIN,0,0 +4802,TRAIN,0,0 +4803,TRAIN,0,0 +4804,TRAIN,0,0 +4805,TRAIN,0,0 +4806,TRAIN,0,0 +4807,TRAIN,0,0 +4808,TRAIN,0,0 +4809,TRAIN,0,0 +4810,TRAIN,0,0 +4811,TRAIN,0,0 +4812,TRAIN,0,0 +4813,TRAIN,0,0 +4814,TRAIN,0,0 +4815,TRAIN,0,0 +4816,TRAIN,0,0 +4817,TRAIN,0,0 +4818,TRAIN,0,0 +4819,TRAIN,0,0 +4820,TRAIN,0,0 +4821,TRAIN,0,0 +4822,TRAIN,0,0 +4823,TRAIN,0,0 +4824,TRAIN,0,0 +4825,TRAIN,0,0 +4826,TRAIN,0,0 +4827,TRAIN,0,0 +4828,TRAIN,0,0 +4829,TRAIN,0,0 +4830,TRAIN,0,0 +4831,TRAIN,0,0 +4832,TRAIN,0,0 +4833,TRAIN,0,0 +4834,TRAIN,0,0 +4835,TRAIN,0,0 +4836,TRAIN,0,0 +4837,TRAIN,0,0 +4838,TRAIN,0,0 +4839,TRAIN,0,0 +4840,TRAIN,0,0 +4841,TRAIN,0,0 +4842,TRAIN,0,0 +4843,TRAIN,0,0 +4844,TRAIN,0,0 +4845,TRAIN,0,0 +4846,TRAIN,0,0 +4847,TRAIN,0,0 +4848,TRAIN,0,0 +4849,TRAIN,0,0 +4850,TRAIN,0,0 +4851,TRAIN,0,0 +4852,TRAIN,0,0 +4853,TRAIN,0,0 +4854,TRAIN,0,0 +4855,TRAIN,0,0 +4856,TRAIN,0,0 +4857,TRAIN,0,0 +4858,TRAIN,0,0 +4859,TRAIN,0,0 +4860,TRAIN,0,0 +4861,TRAIN,0,0 +4862,TRAIN,0,0 +4863,TRAIN,0,0 +4864,TRAIN,0,0 +4865,TRAIN,0,0 +4866,TRAIN,0,0 +4867,TRAIN,0,0 +4868,TRAIN,0,0 +4869,TRAIN,0,0 +4870,TRAIN,0,0 +4871,TRAIN,0,0 +4872,TRAIN,0,0 +4873,TRAIN,0,0 +4874,TRAIN,0,0 +4875,TRAIN,0,0 +4876,TRAIN,0,0 +4877,TRAIN,0,0 +4878,TRAIN,0,0 +4879,TRAIN,0,0 +4880,TRAIN,0,0 +4881,TRAIN,0,0 +4882,TRAIN,0,0 +4883,TRAIN,0,0 +4884,TRAIN,0,0 +4885,TRAIN,0,0 +4886,TRAIN,0,0 +4887,TRAIN,0,0 +4888,TRAIN,0,0 +4889,TRAIN,0,0 +4890,TRAIN,0,0 +4891,TRAIN,0,0 +4892,TRAIN,0,0 +4893,TRAIN,0,0 +4894,TRAIN,0,0 +4895,TRAIN,0,0 +4896,TRAIN,0,0 +4897,TRAIN,0,0 +4898,TRAIN,0,0 +4899,TRAIN,0,0 +4900,TRAIN,0,0 +4901,TRAIN,0,0 +4902,TRAIN,0,0 +4903,TRAIN,0,0 +4904,TRAIN,0,0 +4905,TRAIN,0,0 +4906,TRAIN,0,0 +4907,TRAIN,0,0 +4908,TRAIN,0,0 +4909,TRAIN,0,0 +4910,TRAIN,0,0 +4911,TRAIN,0,0 +4912,TRAIN,0,0 +4913,TRAIN,0,0 +4914,TRAIN,0,0 +4915,TRAIN,0,0 +4916,TRAIN,0,0 +4917,TRAIN,0,0 +4918,TRAIN,0,0 +4919,TRAIN,0,0 +4920,TRAIN,0,0 +4921,TRAIN,0,0 +4922,TRAIN,0,0 +4923,TRAIN,0,0 +4924,TRAIN,0,0 +4925,TRAIN,0,0 +4926,TRAIN,0,0 +4927,TRAIN,0,0 +4928,TRAIN,0,0 +4929,TRAIN,0,0 +4930,TRAIN,0,0 +4931,TRAIN,0,0 +4932,TRAIN,0,0 +4933,TRAIN,0,0 +4934,TRAIN,0,0 +4935,TRAIN,0,0 +4936,TRAIN,0,0 +4937,TRAIN,0,0 +4938,TRAIN,0,0 +4939,TRAIN,0,0 +4940,TRAIN,0,0 +4941,TRAIN,0,0 +4942,TRAIN,0,0 +4943,TRAIN,0,0 +4944,TRAIN,0,0 
+4945,TRAIN,0,0 +4946,TRAIN,0,0 +4947,TRAIN,0,0 +4948,TRAIN,0,0 +4949,TRAIN,0,0 +4950,TRAIN,0,0 +4951,TRAIN,0,0 +4952,TRAIN,0,0 +4953,TRAIN,0,0 +4954,TRAIN,0,0 +4955,TRAIN,0,0 +4956,TRAIN,0,0 +4957,TRAIN,0,0 +4958,TRAIN,0,0 +4959,TRAIN,0,0 +4960,TRAIN,0,0 +4961,TRAIN,0,0 +4962,TRAIN,0,0 +4963,TRAIN,0,0 +4964,TRAIN,0,0 +4965,TRAIN,0,0 +4966,TRAIN,0,0 +4967,TRAIN,0,0 +4968,TRAIN,0,0 +4969,TRAIN,0,0 +4970,TRAIN,0,0 +4971,TRAIN,0,0 +4972,TRAIN,0,0 +4973,TRAIN,0,0 +4974,TRAIN,0,0 +4975,TRAIN,0,0 +4976,TRAIN,0,0 +4977,TRAIN,0,0 +4978,TRAIN,0,0 +4979,TRAIN,0,0 +4980,TRAIN,0,0 +4981,TRAIN,0,0 +4982,TRAIN,0,0 +4983,TRAIN,0,0 +4984,TRAIN,0,0 +4985,TRAIN,0,0 +4986,TRAIN,0,0 +4987,TRAIN,0,0 +4988,TRAIN,0,0 +4989,TRAIN,0,0 +4990,TRAIN,0,0 +4991,TRAIN,0,0 +4992,TRAIN,0,0 +4993,TRAIN,0,0 +4994,TRAIN,0,0 +4995,TRAIN,0,0 +4996,TRAIN,0,0 +4997,TRAIN,0,0 +4998,TRAIN,0,0 +4999,TRAIN,0,0 +5000,TRAIN,0,0 +5001,TRAIN,0,0 +5002,TRAIN,0,0 +5003,TRAIN,0,0 +5004,TRAIN,0,0 +5005,TRAIN,0,0 +5006,TRAIN,0,0 +5007,TRAIN,0,0 +5008,TRAIN,0,0 +5009,TRAIN,0,0 +5010,TRAIN,0,0 +5011,TRAIN,0,0 +5012,TRAIN,0,0 +5013,TRAIN,0,0 +5014,TRAIN,0,0 +5015,TRAIN,0,0 +5016,TRAIN,0,0 +5017,TRAIN,0,0 +5018,TRAIN,0,0 +5019,TRAIN,0,0 +5020,TRAIN,0,0 +5021,TRAIN,0,0 +5022,TRAIN,0,0 +5023,TRAIN,0,0 +5024,TRAIN,0,0 +5025,TRAIN,0,0 +5026,TRAIN,0,0 +5027,TRAIN,0,0 +5028,TRAIN,0,0 +5029,TRAIN,0,0 +5030,TRAIN,0,0 +5031,TRAIN,0,0 +5032,TRAIN,0,0 +5033,TRAIN,0,0 +5034,TRAIN,0,0 +5035,TRAIN,0,0 +5036,TRAIN,0,0 +5037,TRAIN,0,0 +5038,TRAIN,0,0 +5039,TRAIN,0,0 +5040,TRAIN,0,0 +5041,TRAIN,0,0 +5042,TRAIN,0,0 +5043,TRAIN,0,0 +5044,TRAIN,0,0 +5045,TRAIN,0,0 +5046,TRAIN,0,0 +5047,TRAIN,0,0 +5048,TRAIN,0,0 +5049,TRAIN,0,0 +5050,TRAIN,0,0 +5051,TRAIN,0,0 +5052,TRAIN,0,0 +5053,TRAIN,0,0 +5054,TRAIN,0,0 +5055,TRAIN,0,0 +5056,TRAIN,0,0 +5057,TRAIN,0,0 +5058,TRAIN,0,0 +5059,TRAIN,0,0 +5060,TRAIN,0,0 +5061,TRAIN,0,0 +5062,TRAIN,0,0 +5063,TRAIN,0,0 +5064,TRAIN,0,0 +5065,TRAIN,0,0 +5066,TRAIN,0,0 +5067,TRAIN,0,0 +5068,TRAIN,0,0 +5069,TRAIN,0,0 +5070,TRAIN,0,0 +5071,TRAIN,0,0 +5072,TRAIN,0,0 +5073,TRAIN,0,0 +5074,TRAIN,0,0 +5075,TRAIN,0,0 +5076,TRAIN,0,0 +5077,TRAIN,0,0 +5078,TRAIN,0,0 +5079,TRAIN,0,0 +5080,TRAIN,0,0 +5081,TRAIN,0,0 +5082,TRAIN,0,0 +5083,TRAIN,0,0 +5084,TRAIN,0,0 +5085,TRAIN,0,0 +5086,TRAIN,0,0 +5087,TRAIN,0,0 +5088,TRAIN,0,0 +5089,TRAIN,0,0 +5090,TRAIN,0,0 +5091,TRAIN,0,0 +5092,TRAIN,0,0 +5093,TRAIN,0,0 +5094,TRAIN,0,0 +5095,TRAIN,0,0 +5096,TRAIN,0,0 +5097,TRAIN,0,0 +5098,TRAIN,0,0 +5099,TRAIN,0,0 +5100,TRAIN,0,0 +5101,TRAIN,0,0 +5102,TRAIN,0,0 +5103,TRAIN,0,0 +5104,TRAIN,0,0 +5105,TRAIN,0,0 +5106,TRAIN,0,0 +5107,TRAIN,0,0 +5108,TRAIN,0,0 +5109,TRAIN,0,0 +5110,TRAIN,0,0 +5111,TRAIN,0,0 +5112,TRAIN,0,0 +5113,TRAIN,0,0 +5114,TRAIN,0,0 +5115,TRAIN,0,0 +5116,TRAIN,0,0 +5117,TRAIN,0,0 +5118,TRAIN,0,0 +5119,TRAIN,0,0 +5120,TRAIN,0,0 +5121,TRAIN,0,0 +5122,TRAIN,0,0 +5123,TRAIN,0,0 +5124,TRAIN,0,0 +5125,TRAIN,0,0 +5126,TRAIN,0,0 +5127,TRAIN,0,0 +5128,TRAIN,0,0 +5129,TRAIN,0,0 +5130,TRAIN,0,0 +5131,TRAIN,0,0 +5132,TRAIN,0,0 +5133,TRAIN,0,0 +5134,TRAIN,0,0 +5135,TRAIN,0,0 +5136,TRAIN,0,0 +5137,TRAIN,0,0 +5138,TRAIN,0,0 +5139,TRAIN,0,0 +5140,TRAIN,0,0 +5141,TRAIN,0,0 +5142,TRAIN,0,0 +5143,TRAIN,0,0 +5144,TRAIN,0,0 +5145,TRAIN,0,0 +5146,TRAIN,0,0 +5147,TRAIN,0,0 +5148,TRAIN,0,0 +5149,TRAIN,0,0 +5150,TRAIN,0,0 +5151,TRAIN,0,0 +5152,TRAIN,0,0 +5153,TRAIN,0,0 +5154,TRAIN,0,0 +5155,TRAIN,0,0 +5156,TRAIN,0,0 +5157,TRAIN,0,0 +5158,TRAIN,0,0 +5159,TRAIN,0,0 +5160,TRAIN,0,0 +5161,TRAIN,0,0 +5162,TRAIN,0,0 +5163,TRAIN,0,0 +5164,TRAIN,0,0 +5165,TRAIN,0,0 +5166,TRAIN,0,0 
+5167,TRAIN,0,0 +5168,TRAIN,0,0 +5169,TRAIN,0,0 +5170,TRAIN,0,0 +5171,TRAIN,0,0 +5172,TRAIN,0,0 +5173,TRAIN,0,0 +5174,TRAIN,0,0 +5175,TRAIN,0,0 +5176,TRAIN,0,0 +5177,TRAIN,0,0 +5178,TRAIN,0,0 +5179,TRAIN,0,0 +5180,TRAIN,0,0 +5181,TRAIN,0,0 +5182,TRAIN,0,0 +5183,TRAIN,0,0 +5184,TRAIN,0,0 +5185,TRAIN,0,0 +5186,TRAIN,0,0 +5187,TRAIN,0,0 +5188,TRAIN,0,0 +5189,TRAIN,0,0 +5190,TRAIN,0,0 +5191,TRAIN,0,0 +5192,TRAIN,0,0 +5193,TRAIN,0,0 +5194,TRAIN,0,0 +5195,TRAIN,0,0 +5196,TRAIN,0,0 +5197,TRAIN,0,0 +5198,TRAIN,0,0 +5199,TRAIN,0,0 +5200,TRAIN,0,0 +5201,TRAIN,0,0 +5202,TRAIN,0,0 +5203,TRAIN,0,0 +5204,TRAIN,0,0 +5205,TRAIN,0,0 +5206,TRAIN,0,0 +5207,TRAIN,0,0 +5208,TRAIN,0,0 +5209,TRAIN,0,0 +5210,TRAIN,0,0 +5211,TRAIN,0,0 +5212,TRAIN,0,0 +5213,TRAIN,0,0 +5214,TRAIN,0,0 +5215,TRAIN,0,0 +5216,TRAIN,0,0 +5217,TRAIN,0,0 +5218,TRAIN,0,0 +5219,TRAIN,0,0 +5220,TRAIN,0,0 +5221,TRAIN,0,0 +5222,TRAIN,0,0 +5223,TRAIN,0,0 +5224,TRAIN,0,0 +5225,TRAIN,0,0 +5226,TRAIN,0,0 +5227,TRAIN,0,0 +5228,TRAIN,0,0 +5229,TRAIN,0,0 +5230,TRAIN,0,0 +5231,TRAIN,0,0 +5232,TRAIN,0,0 +5233,TRAIN,0,0 +5234,TRAIN,0,0 +5235,TRAIN,0,0 +5236,TRAIN,0,0 +5237,TRAIN,0,0 +5238,TRAIN,0,0 +5239,TRAIN,0,0 +5240,TRAIN,0,0 +5241,TRAIN,0,0 +5242,TRAIN,0,0 +5243,TRAIN,0,0 +5244,TRAIN,0,0 +5245,TRAIN,0,0 +5246,TRAIN,0,0 +5247,TRAIN,0,0 +5248,TRAIN,0,0 +5249,TRAIN,0,0 +5250,TRAIN,0,0 +5251,TRAIN,0,0 +5252,TRAIN,0,0 +5253,TRAIN,0,0 +5254,TRAIN,0,0 +5255,TRAIN,0,0 +5256,TRAIN,0,0 +5257,TRAIN,0,0 +5258,TRAIN,0,0 +5259,TRAIN,0,0 +5260,TRAIN,0,0 +5261,TRAIN,0,0 +5262,TRAIN,0,0 +5263,TRAIN,0,0 +5264,TRAIN,0,0 +5265,TRAIN,0,0 +5266,TRAIN,0,0 +5267,TRAIN,0,0 +5268,TRAIN,0,0 +5269,TRAIN,0,0 +5270,TRAIN,0,0 +5271,TRAIN,0,0 +5272,TRAIN,0,0 +5273,TRAIN,0,0 +5274,TRAIN,0,0 +5275,TRAIN,0,0 +5276,TRAIN,0,0 +5277,TRAIN,0,0 +5278,TRAIN,0,0 +5279,TRAIN,0,0 +5280,TRAIN,0,0 +5281,TRAIN,0,0 +5282,TRAIN,0,0 +5283,TRAIN,0,0 +5284,TRAIN,0,0 +5285,TRAIN,0,0 +5286,TRAIN,0,0 +5287,TRAIN,0,0 +5288,TRAIN,0,0 +5289,TRAIN,0,0 +5290,TRAIN,0,0 +5291,TRAIN,0,0 +5292,TRAIN,0,0 +5293,TRAIN,0,0 +5294,TRAIN,0,0 +5295,TRAIN,0,0 +5296,TRAIN,0,0 +5297,TRAIN,0,0 +5298,TRAIN,0,0 +5299,TRAIN,0,0 +5300,TRAIN,0,0 +5301,TRAIN,0,0 +5302,TRAIN,0,0 +5303,TRAIN,0,0 +5304,TRAIN,0,0 +5305,TRAIN,0,0 +5306,TRAIN,0,0 +5307,TRAIN,0,0 +5308,TRAIN,0,0 +5309,TRAIN,0,0 +5310,TRAIN,0,0 +5311,TRAIN,0,0 +5312,TRAIN,0,0 +5313,TRAIN,0,0 +5314,TRAIN,0,0 +5315,TRAIN,0,0 +5316,TRAIN,0,0 +5317,TRAIN,0,0 +5318,TRAIN,0,0 +5319,TRAIN,0,0 +5320,TRAIN,0,0 +5321,TRAIN,0,0 +5322,TRAIN,0,0 +5323,TRAIN,0,0 +5324,TRAIN,0,0 +5325,TRAIN,0,0 +5326,TRAIN,0,0 +5327,TRAIN,0,0 +5328,TRAIN,0,0 +5329,TRAIN,0,0 +5330,TRAIN,0,0 +5331,TRAIN,0,0 +5332,TRAIN,0,0 +5333,TRAIN,0,0 +5334,TRAIN,0,0 +5335,TRAIN,0,0 +5336,TRAIN,0,0 +5337,TRAIN,0,0 +5338,TRAIN,0,0 +5339,TRAIN,0,0 +5340,TRAIN,0,0 +5341,TRAIN,0,0 +5342,TRAIN,0,0 +5343,TRAIN,0,0 +5344,TRAIN,0,0 +5345,TRAIN,0,0 +5346,TRAIN,0,0 +5347,TRAIN,0,0 +5348,TRAIN,0,0 +5349,TRAIN,0,0 +5350,TRAIN,0,0 +5351,TRAIN,0,0 +5352,TRAIN,0,0 +5353,TRAIN,0,0 +5354,TRAIN,0,0 +5355,TRAIN,0,0 +5356,TRAIN,0,0 +5357,TRAIN,0,0 +5358,TRAIN,0,0 +5359,TRAIN,0,0 +5360,TRAIN,0,0 +5361,TRAIN,0,0 +5362,TRAIN,0,0 +5363,TRAIN,0,0 +5364,TRAIN,0,0 +5365,TRAIN,0,0 +5366,TRAIN,0,0 +5367,TRAIN,0,0 +5368,TRAIN,0,0 +5369,TRAIN,0,0 +5370,TRAIN,0,0 +5371,TRAIN,0,0 +5372,TRAIN,0,0 +5373,TRAIN,0,0 +5374,TRAIN,0,0 +5375,TRAIN,0,0 +5376,TRAIN,0,0 +5377,TRAIN,0,0 +5378,TRAIN,0,0 +5379,TRAIN,0,0 +5380,TRAIN,0,0 +5381,TRAIN,0,0 +5382,TRAIN,0,0 +5383,TRAIN,0,0 +5384,TRAIN,0,0 +5385,TRAIN,0,0 +5386,TRAIN,0,0 +5387,TRAIN,0,0 +5388,TRAIN,0,0 
+5389,TRAIN,0,0 +5390,TRAIN,0,0 +5391,TRAIN,0,0 +5392,TRAIN,0,0 +5393,TRAIN,0,0 +5394,TRAIN,0,0 +5395,TRAIN,0,0 +5396,TRAIN,0,0 +5397,TRAIN,0,0 +5398,TRAIN,0,0 +5399,TRAIN,0,0 +5400,TRAIN,0,0 +5401,TRAIN,0,0 +5402,TRAIN,0,0 +5403,TRAIN,0,0 +5404,TRAIN,0,0 +5405,TRAIN,0,0 +5406,TRAIN,0,0 +5407,TRAIN,0,0 +5408,TRAIN,0,0 +5409,TRAIN,0,0 +5410,TRAIN,0,0 +5411,TRAIN,0,0 +5412,TRAIN,0,0 +5413,TRAIN,0,0 +5414,TRAIN,0,0 +5415,TRAIN,0,0 +5416,TRAIN,0,0 +5417,TRAIN,0,0 +5418,TRAIN,0,0 +5419,TRAIN,0,0 +5420,TRAIN,0,0 +5421,TRAIN,0,0 +5422,TRAIN,0,0 +5423,TRAIN,0,0 +5424,TRAIN,0,0 +5425,TRAIN,0,0 +5426,TRAIN,0,0 +5427,TRAIN,0,0 +5428,TRAIN,0,0 +5429,TRAIN,0,0 +5430,TRAIN,0,0 +5431,TRAIN,0,0 +5432,TRAIN,0,0 +5433,TRAIN,0,0 +5434,TRAIN,0,0 +5435,TRAIN,0,0 +5436,TRAIN,0,0 +5437,TRAIN,0,0 +5438,TRAIN,0,0 +5439,TRAIN,0,0 +5440,TRAIN,0,0 +5441,TRAIN,0,0 +5442,TRAIN,0,0 +5443,TRAIN,0,0 +5444,TRAIN,0,0 +5445,TRAIN,0,0 +5446,TRAIN,0,0 +5447,TRAIN,0,0 +5448,TRAIN,0,0 +5449,TRAIN,0,0 +5450,TRAIN,0,0 +5451,TRAIN,0,0 +5452,TRAIN,0,0 +5453,TRAIN,0,0 +5454,TRAIN,0,0 +5455,TRAIN,0,0 +5456,TRAIN,0,0 +5457,TRAIN,0,0 +5458,TRAIN,0,0 +5459,TRAIN,0,0 +5460,TRAIN,0,0 +5461,TRAIN,0,0 +5462,TRAIN,0,0 +5463,TRAIN,0,0 +5464,TRAIN,0,0 +5465,TRAIN,0,0 +5466,TRAIN,0,0 +5467,TRAIN,0,0 +5468,TRAIN,0,0 +5469,TRAIN,0,0 +5470,TRAIN,0,0 +5471,TRAIN,0,0 +5472,TRAIN,0,0 +5473,TRAIN,0,0 +5474,TRAIN,0,0 +5475,TRAIN,0,0 +5476,TRAIN,0,0 +5477,TRAIN,0,0 +5478,TRAIN,0,0 +5479,TRAIN,0,0 +5480,TRAIN,0,0 +5481,TRAIN,0,0 +5482,TRAIN,0,0 +5483,TRAIN,0,0 +5484,TRAIN,0,0 +5485,TRAIN,0,0 +5486,TRAIN,0,0 +5487,TRAIN,0,0 +5488,TRAIN,0,0 +5489,TRAIN,0,0 +5490,TRAIN,0,0 +5491,TRAIN,0,0 +5492,TRAIN,0,0 +5493,TRAIN,0,0 +5494,TRAIN,0,0 +5495,TRAIN,0,0 +5496,TRAIN,0,0 +5497,TRAIN,0,0 +5498,TRAIN,0,0 +5499,TRAIN,0,0 +5500,TRAIN,0,0 +5501,TRAIN,0,0 +5502,TRAIN,0,0 +5503,TRAIN,0,0 +5504,TRAIN,0,0 +5505,TRAIN,0,0 +5506,TRAIN,0,0 +5507,TRAIN,0,0 +5508,TRAIN,0,0 +5509,TRAIN,0,0 +5510,TRAIN,0,0 +5511,TRAIN,0,0 +5512,TRAIN,0,0 +5513,TRAIN,0,0 +5514,TRAIN,0,0 +5515,TRAIN,0,0 +5516,TRAIN,0,0 +5517,TRAIN,0,0 +5518,TRAIN,0,0 +5519,TRAIN,0,0 +5520,TRAIN,0,0 +5521,TRAIN,0,0 +5522,TRAIN,0,0 +5523,TRAIN,0,0 +5524,TRAIN,0,0 +5525,TRAIN,0,0 +5526,TRAIN,0,0 +5527,TRAIN,0,0 +5528,TRAIN,0,0 +5529,TRAIN,0,0 +5530,TRAIN,0,0 +5531,TRAIN,0,0 +5532,TRAIN,0,0 +5533,TRAIN,0,0 +5534,TRAIN,0,0 +5535,TRAIN,0,0 +5536,TRAIN,0,0 +5537,TRAIN,0,0 +5538,TRAIN,0,0 +5539,TRAIN,0,0 +5540,TRAIN,0,0 +5541,TRAIN,0,0 +5542,TRAIN,0,0 +5543,TRAIN,0,0 +5544,TRAIN,0,0 +5545,TRAIN,0,0 +5546,TRAIN,0,0 +5547,TRAIN,0,0 +5548,TRAIN,0,0 +5549,TRAIN,0,0 +5550,TRAIN,0,0 +5551,TRAIN,0,0 +5552,TRAIN,0,0 +5553,TRAIN,0,0 +5554,TRAIN,0,0 +5555,TRAIN,0,0 +5556,TRAIN,0,0 +5557,TRAIN,0,0 +5558,TRAIN,0,0 +5559,TRAIN,0,0 +5560,TRAIN,0,0 +5561,TRAIN,0,0 +5562,TRAIN,0,0 +5563,TRAIN,0,0 +5564,TRAIN,0,0 +5565,TRAIN,0,0 +5566,TRAIN,0,0 +5567,TRAIN,0,0 +5568,TRAIN,0,0 +5569,TRAIN,0,0 +5570,TRAIN,0,0 +5571,TRAIN,0,0 +5572,TRAIN,0,0 +5573,TRAIN,0,0 +5574,TRAIN,0,0 +5575,TRAIN,0,0 +5576,TRAIN,0,0 +5577,TRAIN,0,0 +5578,TRAIN,0,0 +5579,TRAIN,0,0 +5580,TRAIN,0,0 +5581,TRAIN,0,0 +5582,TRAIN,0,0 +5583,TRAIN,0,0 +5584,TRAIN,0,0 +5585,TRAIN,0,0 +5586,TRAIN,0,0 +5587,TRAIN,0,0 +5588,TRAIN,0,0 +5589,TRAIN,0,0 +5590,TRAIN,0,0 +5591,TRAIN,0,0 +5592,TRAIN,0,0 +5593,TRAIN,0,0 +5594,TRAIN,0,0 +5595,TRAIN,0,0 +5596,TRAIN,0,0 +5597,TRAIN,0,0 +5598,TRAIN,0,0 +5599,TRAIN,0,0 +5600,TRAIN,0,0 +5601,TRAIN,0,0 +5602,TRAIN,0,0 +5603,TRAIN,0,0 +5604,TRAIN,0,0 +5605,TRAIN,0,0 +5606,TRAIN,0,0 +5607,TRAIN,0,0 +5608,TRAIN,0,0 +5609,TRAIN,0,0 +5610,TRAIN,0,0 
+5611,TRAIN,0,0 +5612,TRAIN,0,0 +5613,TRAIN,0,0 +5614,TRAIN,0,0 +5615,TRAIN,0,0 +5616,TRAIN,0,0 +5617,TRAIN,0,0 +5618,TRAIN,0,0 +5619,TRAIN,0,0 +5620,TRAIN,0,0 +5621,TRAIN,0,0 +5622,TRAIN,0,0 +5623,TRAIN,0,0 +5624,TRAIN,0,0 +5625,TRAIN,0,0 +5626,TRAIN,0,0 +5627,TRAIN,0,0 +5628,TRAIN,0,0 +5629,TRAIN,0,0 +5630,TRAIN,0,0 +5631,TRAIN,0,0 +5632,TRAIN,0,0 +5633,TRAIN,0,0 +5634,TRAIN,0,0 +5635,TRAIN,0,0 +5636,TRAIN,0,0 +5637,TRAIN,0,0 +5638,TRAIN,0,0 +5639,TRAIN,0,0 +5640,TRAIN,0,0 +5641,TRAIN,0,0 +5642,TRAIN,0,0 +5643,TRAIN,0,0 +5644,TRAIN,0,0 +5645,TRAIN,0,0 +5646,TRAIN,0,0 +5647,TRAIN,0,0 +5648,TRAIN,0,0 +5649,TRAIN,0,0 +5650,TRAIN,0,0 +5651,TRAIN,0,0 +5652,TRAIN,0,0 +5653,TRAIN,0,0 +5654,TRAIN,0,0 +5655,TRAIN,0,0 +5656,TRAIN,0,0 +5657,TRAIN,0,0 +5658,TRAIN,0,0 +5659,TRAIN,0,0 +5660,TRAIN,0,0 +5661,TRAIN,0,0 +5662,TRAIN,0,0 +5663,TRAIN,0,0 +5664,TRAIN,0,0 +5665,TRAIN,0,0 +5666,TRAIN,0,0 +5667,TRAIN,0,0 +5668,TRAIN,0,0 +5669,TRAIN,0,0 +5670,TRAIN,0,0 +5671,TRAIN,0,0 +5672,TRAIN,0,0 +5673,TRAIN,0,0 +5674,TRAIN,0,0 +5675,TRAIN,0,0 +5676,TRAIN,0,0 +5677,TRAIN,0,0 +5678,TRAIN,0,0 +5679,TRAIN,0,0 +5680,TRAIN,0,0 +5681,TRAIN,0,0 +5682,TRAIN,0,0 +5683,TRAIN,0,0 +5684,TRAIN,0,0 +5685,TRAIN,0,0 +5686,TRAIN,0,0 +5687,TRAIN,0,0 +5688,TRAIN,0,0 +5689,TRAIN,0,0 +5690,TRAIN,0,0 +5691,TRAIN,0,0 +5692,TRAIN,0,0 +5693,TRAIN,0,0 +5694,TRAIN,0,0 +5695,TRAIN,0,0 +5696,TRAIN,0,0 +5697,TRAIN,0,0 +5698,TRAIN,0,0 +5699,TRAIN,0,0 +5700,TRAIN,0,0 +5701,TRAIN,0,0 +5702,TRAIN,0,0 +5703,TRAIN,0,0 +5704,TRAIN,0,0 +5705,TRAIN,0,0 +5706,TRAIN,0,0 +5707,TRAIN,0,0 +5708,TRAIN,0,0 +5709,TRAIN,0,0 +5710,TRAIN,0,0 +5711,TRAIN,0,0 +5712,TRAIN,0,0 +5713,TRAIN,0,0 +5714,TRAIN,0,0 +5715,TRAIN,0,0 +5716,TRAIN,0,0 +5717,TRAIN,0,0 +5718,TRAIN,0,0 +5719,TRAIN,0,0 +5720,TRAIN,0,0 +5721,TRAIN,0,0 +5722,TRAIN,0,0 +5723,TRAIN,0,0 +5724,TRAIN,0,0 +5725,TRAIN,0,0 +5726,TRAIN,0,0 +5727,TRAIN,0,0 +5728,TRAIN,0,0 +5729,TRAIN,0,0 +5730,TRAIN,0,0 +5731,TRAIN,0,0 +5732,TRAIN,0,0 +5733,TRAIN,0,0 +5734,TRAIN,0,0 +5735,TRAIN,0,0 +5736,TRAIN,0,0 +5737,TRAIN,0,0 +5738,TRAIN,0,0 +5739,TRAIN,0,0 +5740,TRAIN,0,0 +5741,TRAIN,0,0 +5742,TRAIN,0,0 +5743,TRAIN,0,0 +5744,TRAIN,0,0 +5745,TRAIN,0,0 +5746,TRAIN,0,0 +5747,TRAIN,0,0 +5748,TRAIN,0,0 +5749,TRAIN,0,0 +5750,TRAIN,0,0 +5751,TRAIN,0,0 +5752,TRAIN,0,0 +5753,TRAIN,0,0 +5754,TRAIN,0,0 +5755,TRAIN,0,0 +5756,TRAIN,0,0 +5757,TRAIN,0,0 +5758,TRAIN,0,0 +5759,TRAIN,0,0 +5760,TRAIN,0,0 +5761,TRAIN,0,0 +5762,TRAIN,0,0 +5763,TRAIN,0,0 +5764,TRAIN,0,0 +5765,TRAIN,0,0 +5766,TRAIN,0,0 +5767,TRAIN,0,0 +5768,TRAIN,0,0 +5769,TRAIN,0,0 +5770,TRAIN,0,0 +5771,TRAIN,0,0 +5772,TRAIN,0,0 +5773,TRAIN,0,0 +5774,TRAIN,0,0 +5775,TRAIN,0,0 +5776,TRAIN,0,0 +5777,TRAIN,0,0 +5778,TRAIN,0,0 +5779,TRAIN,0,0 +5780,TRAIN,0,0 +5781,TRAIN,0,0 +5782,TRAIN,0,0 +5783,TRAIN,0,0 +5784,TRAIN,0,0 +5785,TRAIN,0,0 +5786,TRAIN,0,0 +5787,TRAIN,0,0 +5788,TRAIN,0,0 +5789,TRAIN,0,0 +5790,TRAIN,0,0 +5791,TRAIN,0,0 +5792,TRAIN,0,0 +5793,TRAIN,0,0 +5794,TRAIN,0,0 +5795,TRAIN,0,0 +5796,TRAIN,0,0 +5797,TRAIN,0,0 +5798,TRAIN,0,0 +5799,TRAIN,0,0 +5800,TRAIN,0,0 +5801,TRAIN,0,0 +5802,TRAIN,0,0 +5803,TRAIN,0,0 +5804,TRAIN,0,0 +5805,TRAIN,0,0 +5806,TRAIN,0,0 +5807,TRAIN,0,0 +5808,TRAIN,0,0 +5809,TRAIN,0,0 +5810,TRAIN,0,0 +5811,TRAIN,0,0 +5812,TRAIN,0,0 +5813,TRAIN,0,0 +5814,TRAIN,0,0 +5815,TRAIN,0,0 +5816,TRAIN,0,0 +5817,TRAIN,0,0 +5818,TRAIN,0,0 +5819,TRAIN,0,0 +5820,TRAIN,0,0 +5821,TRAIN,0,0 +5822,TRAIN,0,0 +5823,TRAIN,0,0 +5824,TRAIN,0,0 +5825,TRAIN,0,0 +5826,TRAIN,0,0 +5827,TRAIN,0,0 +5828,TRAIN,0,0 +5829,TRAIN,0,0 +5830,TRAIN,0,0 +5831,TRAIN,0,0 +5832,TRAIN,0,0 
+5833,TRAIN,0,0 +5834,TRAIN,0,0 +5835,TRAIN,0,0 +5836,TRAIN,0,0 +5837,TRAIN,0,0 +5838,TRAIN,0,0 +5839,TRAIN,0,0 +5840,TRAIN,0,0 +5841,TRAIN,0,0 +5842,TRAIN,0,0 +5843,TRAIN,0,0 +5844,TRAIN,0,0 +5845,TRAIN,0,0 +5846,TRAIN,0,0 +5847,TRAIN,0,0 +5848,TRAIN,0,0 +5849,TRAIN,0,0 +5850,TRAIN,0,0 +5851,TRAIN,0,0 +5852,TRAIN,0,0 +5853,TRAIN,0,0 +5854,TRAIN,0,0 +5855,TRAIN,0,0 +5856,TRAIN,0,0 +5857,TRAIN,0,0 +5858,TRAIN,0,0 +5859,TRAIN,0,0 +5860,TRAIN,0,0 +5861,TRAIN,0,0 +5862,TRAIN,0,0 +5863,TRAIN,0,0 +5864,TRAIN,0,0 +5865,TRAIN,0,0 +5866,TRAIN,0,0 +5867,TRAIN,0,0 +5868,TRAIN,0,0 +5869,TRAIN,0,0 +5870,TRAIN,0,0 +5871,TRAIN,0,0 +5872,TRAIN,0,0 +5873,TRAIN,0,0 +5874,TRAIN,0,0 +5875,TRAIN,0,0 +5876,TRAIN,0,0 +5877,TRAIN,0,0 +5878,TRAIN,0,0 +5879,TRAIN,0,0 +5880,TRAIN,0,0 +5881,TRAIN,0,0 +5882,TRAIN,0,0 +5883,TRAIN,0,0 +5884,TRAIN,0,0 +5885,TRAIN,0,0 +5886,TRAIN,0,0 +5887,TRAIN,0,0 +5888,TRAIN,0,0 +5889,TRAIN,0,0 +5890,TRAIN,0,0 +5891,TRAIN,0,0 +5892,TRAIN,0,0 +5893,TRAIN,0,0 +5894,TRAIN,0,0 +5895,TRAIN,0,0 +5896,TRAIN,0,0 +5897,TRAIN,0,0 +5898,TRAIN,0,0 +5899,TRAIN,0,0 +5900,TRAIN,0,0 +5901,TRAIN,0,0 +5902,TRAIN,0,0 +5903,TRAIN,0,0 +5904,TRAIN,0,0 +5905,TRAIN,0,0 +5906,TRAIN,0,0 +5907,TRAIN,0,0 +5908,TRAIN,0,0 +5909,TRAIN,0,0 +5910,TRAIN,0,0 +5911,TRAIN,0,0 +5912,TRAIN,0,0 +5913,TRAIN,0,0 +5914,TRAIN,0,0 +5915,TRAIN,0,0 +5916,TRAIN,0,0 +5917,TRAIN,0,0 +5918,TRAIN,0,0 +5919,TRAIN,0,0 +5920,TRAIN,0,0 +5921,TRAIN,0,0 +5922,TRAIN,0,0 +5923,TRAIN,0,0 +5924,TRAIN,0,0 +5925,TRAIN,0,0 +5926,TRAIN,0,0 +5927,TRAIN,0,0 +5928,TRAIN,0,0 +5929,TRAIN,0,0 +5930,TRAIN,0,0 +5931,TRAIN,0,0 +5932,TRAIN,0,0 +5933,TRAIN,0,0 +5934,TRAIN,0,0 +5935,TRAIN,0,0 +5936,TRAIN,0,0 +5937,TRAIN,0,0 +5938,TRAIN,0,0 +5939,TRAIN,0,0 +5940,TRAIN,0,0 +5941,TRAIN,0,0 +5942,TRAIN,0,0 +5943,TRAIN,0,0 +5944,TRAIN,0,0 +5945,TRAIN,0,0 +5946,TRAIN,0,0 +5947,TRAIN,0,0 +5948,TRAIN,0,0 +5949,TRAIN,0,0 +5950,TRAIN,0,0 +5951,TRAIN,0,0 +5952,TRAIN,0,0 +5953,TRAIN,0,0 +5954,TRAIN,0,0 +5955,TRAIN,0,0 +5956,TRAIN,0,0 +5957,TRAIN,0,0 +5958,TRAIN,0,0 +5959,TRAIN,0,0 +5960,TRAIN,0,0 +5961,TRAIN,0,0 +5962,TRAIN,0,0 +5963,TRAIN,0,0 +5964,TRAIN,0,0 +5965,TRAIN,0,0 +5966,TRAIN,0,0 +5967,TRAIN,0,0 +5968,TRAIN,0,0 +5969,TRAIN,0,0 +5970,TRAIN,0,0 +5971,TRAIN,0,0 +5972,TRAIN,0,0 +5973,TRAIN,0,0 +5974,TRAIN,0,0 +5975,TRAIN,0,0 +5976,TRAIN,0,0 +5977,TRAIN,0,0 +5978,TRAIN,0,0 +5979,TRAIN,0,0 +5980,TRAIN,0,0 +5981,TRAIN,0,0 +5982,TRAIN,0,0 +5983,TRAIN,0,0 +5984,TRAIN,0,0 +5985,TRAIN,0,0 +5986,TRAIN,0,0 +5987,TRAIN,0,0 +5988,TRAIN,0,0 +5989,TRAIN,0,0 +5990,TRAIN,0,0 +5991,TRAIN,0,0 +5992,TRAIN,0,0 +5993,TRAIN,0,0 +5994,TRAIN,0,0 +5995,TRAIN,0,0 +5996,TRAIN,0,0 +5997,TRAIN,0,0 +5998,TRAIN,0,0 +5999,TRAIN,0,0 +6000,TRAIN,0,0 +6001,TRAIN,0,0 +6002,TRAIN,0,0 +6003,TRAIN,0,0 +6004,TRAIN,0,0 +6005,TRAIN,0,0 +6006,TRAIN,0,0 +6007,TRAIN,0,0 +6008,TRAIN,0,0 +6009,TRAIN,0,0 +6010,TRAIN,0,0 +6011,TRAIN,0,0 +6012,TRAIN,0,0 +6013,TRAIN,0,0 +6014,TRAIN,0,0 +6015,TRAIN,0,0 +6016,TRAIN,0,0 +6017,TRAIN,0,0 +6018,TRAIN,0,0 +6019,TRAIN,0,0 +6020,TRAIN,0,0 +6021,TRAIN,0,0 +6022,TRAIN,0,0 +6023,TRAIN,0,0 +6024,TRAIN,0,0 +6025,TRAIN,0,0 +6026,TRAIN,0,0 +6027,TRAIN,0,0 +6028,TRAIN,0,0 +6029,TRAIN,0,0 +6030,TRAIN,0,0 +6031,TRAIN,0,0 +6032,TRAIN,0,0 +6033,TRAIN,0,0 +6034,TRAIN,0,0 +6035,TRAIN,0,0 +6036,TRAIN,0,0 +6037,TRAIN,0,0 +6038,TRAIN,0,0 +6039,TRAIN,0,0 +6040,TRAIN,0,0 +6041,TRAIN,0,0 +6042,TRAIN,0,0 +6043,TRAIN,0,0 +6044,TRAIN,0,0 +6045,TRAIN,0,0 +6046,TRAIN,0,0 +6047,TRAIN,0,0 +6048,TRAIN,0,0 +6049,TRAIN,0,0 +6050,TRAIN,0,0 +6051,TRAIN,0,0 +6052,TRAIN,0,0 +6053,TRAIN,0,0 +6054,TRAIN,0,0 
+6055,TRAIN,0,0 +6056,TRAIN,0,0 +6057,TRAIN,0,0 +6058,TRAIN,0,0 +6059,TRAIN,0,0 +6060,TRAIN,0,0 +6061,TRAIN,0,0 +6062,TRAIN,0,0 +6063,TRAIN,0,0 +6064,TRAIN,0,0 +6065,TRAIN,0,0 +6066,TRAIN,0,0 +6067,TRAIN,0,0 +6068,TRAIN,0,0 +6069,TRAIN,0,0 +6070,TRAIN,0,0 +6071,TRAIN,0,0 +6072,TRAIN,0,0 +6073,TRAIN,0,0 +6074,TRAIN,0,0 +6075,TRAIN,0,0 +6076,TRAIN,0,0 +6077,TRAIN,0,0 +6078,TRAIN,0,0 +6079,TRAIN,0,0 +6080,TRAIN,0,0 +6081,TRAIN,0,0 +6082,TRAIN,0,0 +6083,TRAIN,0,0 +6084,TRAIN,0,0 +6085,TRAIN,0,0 +6086,TRAIN,0,0 +6087,TRAIN,0,0 +6088,TRAIN,0,0 +6089,TRAIN,0,0 +6090,TRAIN,0,0 +6091,TRAIN,0,0 +6092,TRAIN,0,0 +6093,TRAIN,0,0 +6094,TRAIN,0,0 +6095,TRAIN,0,0 +6096,TRAIN,0,0 +6097,TRAIN,0,0 +6098,TRAIN,0,0 +6099,TRAIN,0,0 +6100,TRAIN,0,0 +6101,TRAIN,0,0 +6102,TRAIN,0,0 +6103,TRAIN,0,0 +6104,TRAIN,0,0 +6105,TRAIN,0,0 +6106,TRAIN,0,0 +6107,TRAIN,0,0 +6108,TRAIN,0,0 +6109,TRAIN,0,0 +6110,TRAIN,0,0 +6111,TRAIN,0,0 +6112,TRAIN,0,0 +6113,TRAIN,0,0 +6114,TRAIN,0,0 +6115,TRAIN,0,0 +6116,TRAIN,0,0 +6117,TRAIN,0,0 +6118,TRAIN,0,0 +6119,TRAIN,0,0 +6120,TRAIN,0,0 +6121,TRAIN,0,0 +6122,TRAIN,0,0 +6123,TRAIN,0,0 +6124,TRAIN,0,0 +6125,TRAIN,0,0 +6126,TRAIN,0,0 +6127,TRAIN,0,0 +6128,TRAIN,0,0 +6129,TRAIN,0,0 +6130,TRAIN,0,0 +6131,TRAIN,0,0 +6132,TRAIN,0,0 +6133,TRAIN,0,0 +6134,TRAIN,0,0 +6135,TRAIN,0,0 +6136,TRAIN,0,0 +6137,TRAIN,0,0 +6138,TRAIN,0,0 +6139,TRAIN,0,0 +6140,TRAIN,0,0 +6141,TRAIN,0,0 +6142,TRAIN,0,0 +6143,TRAIN,0,0 +6144,TRAIN,0,0 +6145,TRAIN,0,0 +6146,TRAIN,0,0 +6147,TRAIN,0,0 +6148,TRAIN,0,0 +6149,TRAIN,0,0 +6150,TRAIN,0,0 +6151,TRAIN,0,0 +6152,TRAIN,0,0 +6153,TRAIN,0,0 +6154,TRAIN,0,0 +6155,TRAIN,0,0 +6156,TRAIN,0,0 +6157,TRAIN,0,0 +6158,TRAIN,0,0 +6159,TRAIN,0,0 +6160,TRAIN,0,0 +6161,TRAIN,0,0 +6162,TRAIN,0,0 +6163,TRAIN,0,0 +6164,TRAIN,0,0 +6165,TRAIN,0,0 +6166,TRAIN,0,0 +6167,TRAIN,0,0 +6168,TRAIN,0,0 +6169,TRAIN,0,0 +6170,TRAIN,0,0 +6171,TRAIN,0,0 +6172,TRAIN,0,0 +6173,TRAIN,0,0 +6174,TRAIN,0,0 +6175,TRAIN,0,0 +6176,TRAIN,0,0 +6177,TRAIN,0,0 +6178,TRAIN,0,0 +6179,TRAIN,0,0 +6180,TRAIN,0,0 +6181,TRAIN,0,0 +6182,TRAIN,0,0 +6183,TRAIN,0,0 +6184,TRAIN,0,0 +6185,TRAIN,0,0 +6186,TRAIN,0,0 +6187,TRAIN,0,0 +6188,TRAIN,0,0 +6189,TRAIN,0,0 +6190,TRAIN,0,0 +6191,TRAIN,0,0 +6192,TRAIN,0,0 +6193,TRAIN,0,0 +6194,TRAIN,0,0 +6195,TRAIN,0,0 +6196,TRAIN,0,0 +6197,TRAIN,0,0 +6198,TRAIN,0,0 +6199,TRAIN,0,0 +6200,TRAIN,0,0 +6201,TRAIN,0,0 +6202,TRAIN,0,0 +6203,TRAIN,0,0 +6204,TRAIN,0,0 +6205,TRAIN,0,0 +6206,TRAIN,0,0 +6207,TRAIN,0,0 +6208,TRAIN,0,0 +6209,TRAIN,0,0 +6210,TRAIN,0,0 +6211,TRAIN,0,0 +6212,TRAIN,0,0 +6213,TRAIN,0,0 +6214,TRAIN,0,0 +6215,TRAIN,0,0 +6216,TRAIN,0,0 +6217,TRAIN,0,0 +6218,TRAIN,0,0 +6219,TRAIN,0,0 +6220,TRAIN,0,0 +6221,TRAIN,0,0 +6222,TRAIN,0,0 +6223,TRAIN,0,0 +6224,TRAIN,0,0 +6225,TRAIN,0,0 +6226,TRAIN,0,0 +6227,TRAIN,0,0 +6228,TRAIN,0,0 +6229,TRAIN,0,0 +6230,TRAIN,0,0 +6231,TRAIN,0,0 +6232,TRAIN,0,0 +6233,TRAIN,0,0 +6234,TRAIN,0,0 +6235,TRAIN,0,0 +6236,TRAIN,0,0 +6237,TRAIN,0,0 +6238,TRAIN,0,0 +6239,TRAIN,0,0 +6240,TRAIN,0,0 +6241,TRAIN,0,0 +6242,TRAIN,0,0 +6243,TRAIN,0,0 +6244,TRAIN,0,0 +6245,TRAIN,0,0 +6246,TRAIN,0,0 +6247,TRAIN,0,0 +6248,TRAIN,0,0 +6249,TRAIN,0,0 +6250,TRAIN,0,0 +6251,TRAIN,0,0 +6252,TRAIN,0,0 +6253,TRAIN,0,0 +6254,TRAIN,0,0 +6255,TRAIN,0,0 +6256,TRAIN,0,0 +6257,TRAIN,0,0 +6258,TRAIN,0,0 +6259,TRAIN,0,0 +6260,TRAIN,0,0 +6261,TRAIN,0,0 +6262,TRAIN,0,0 +6263,TRAIN,0,0 +6264,TRAIN,0,0 +6265,TRAIN,0,0 +6266,TRAIN,0,0 +6267,TRAIN,0,0 +6268,TRAIN,0,0 +6269,TRAIN,0,0 +6270,TRAIN,0,0 +6271,TRAIN,0,0 +6272,TRAIN,0,0 +6273,TRAIN,0,0 +6274,TRAIN,0,0 +6275,TRAIN,0,0 +6276,TRAIN,0,0 
+6277,TRAIN,0,0 +6278,TRAIN,0,0 +6279,TRAIN,0,0 +6280,TRAIN,0,0 +6281,TRAIN,0,0 +6282,TRAIN,0,0 +6283,TRAIN,0,0 +6284,TRAIN,0,0 +6285,TRAIN,0,0 +6286,TRAIN,0,0 +6287,TRAIN,0,0 +6288,TRAIN,0,0 +6289,TRAIN,0,0 +6290,TRAIN,0,0 +6291,TRAIN,0,0 +6292,TRAIN,0,0 +6293,TRAIN,0,0 +6294,TRAIN,0,0 +6295,TRAIN,0,0 +6296,TRAIN,0,0 +6297,TRAIN,0,0 +6298,TRAIN,0,0 +6299,TRAIN,0,0 +6300,TRAIN,0,0 +6301,TRAIN,0,0 +6302,TRAIN,0,0 +6303,TRAIN,0,0 +6304,TRAIN,0,0 +6305,TRAIN,0,0 +6306,TRAIN,0,0 +6307,TRAIN,0,0 +6308,TRAIN,0,0 +6309,TRAIN,0,0 +6310,TRAIN,0,0 +6311,TRAIN,0,0 +6312,TRAIN,0,0 +6313,TRAIN,0,0 +6314,TRAIN,0,0 +6315,TRAIN,0,0 +6316,TRAIN,0,0 +6317,TRAIN,0,0 +6318,TRAIN,0,0 +6319,TRAIN,0,0 +6320,TRAIN,0,0 +6321,TRAIN,0,0 +6322,TRAIN,0,0 +6323,TRAIN,0,0 +6324,TRAIN,0,0 +6325,TRAIN,0,0 +6326,TRAIN,0,0 +6327,TRAIN,0,0 +6328,TRAIN,0,0 +6329,TRAIN,0,0 +6330,TRAIN,0,0 +6331,TRAIN,0,0 +6332,TRAIN,0,0 +6333,TRAIN,0,0 +6334,TRAIN,0,0 +6335,TRAIN,0,0 +6336,TRAIN,0,0 +6337,TRAIN,0,0 +6338,TRAIN,0,0 +6339,TRAIN,0,0 +6340,TRAIN,0,0 +6341,TRAIN,0,0 +6342,TRAIN,0,0 +6343,TRAIN,0,0 +6344,TRAIN,0,0 +6345,TRAIN,0,0 +6346,TRAIN,0,0 +6347,TRAIN,0,0 +6348,TRAIN,0,0 +6349,TRAIN,0,0 +6350,TRAIN,0,0 +6351,TRAIN,0,0 +6352,TRAIN,0,0 +6353,TRAIN,0,0 +6354,TRAIN,0,0 +6355,TRAIN,0,0 +6356,TRAIN,0,0 +6357,TRAIN,0,0 +6358,TRAIN,0,0 +6359,TRAIN,0,0 +6360,TRAIN,0,0 +6361,TRAIN,0,0 +6362,TRAIN,0,0 +6363,TRAIN,0,0 +6364,TRAIN,0,0 +6365,TRAIN,0,0 +6366,TRAIN,0,0 +6367,TRAIN,0,0 +6368,TRAIN,0,0 +6369,TRAIN,0,0 +6370,TRAIN,0,0 +6371,TRAIN,0,0 +6372,TRAIN,0,0 +6373,TRAIN,0,0 +6374,TRAIN,0,0 +6375,TRAIN,0,0 +6376,TRAIN,0,0 +6377,TRAIN,0,0 +6378,TRAIN,0,0 +6379,TRAIN,0,0 +6380,TRAIN,0,0 +6381,TRAIN,0,0 +6382,TRAIN,0,0 +6383,TRAIN,0,0 +6384,TRAIN,0,0 +6385,TRAIN,0,0 +6386,TRAIN,0,0 +6387,TRAIN,0,0 +6388,TRAIN,0,0 +6389,TRAIN,0,0 +6390,TRAIN,0,0 +6391,TRAIN,0,0 +6392,TRAIN,0,0 +6393,TRAIN,0,0 +6394,TRAIN,0,0 +6395,TRAIN,0,0 +6396,TRAIN,0,0 +6397,TRAIN,0,0 +6398,TRAIN,0,0 +6399,TRAIN,0,0 +6400,TRAIN,0,0 +6401,TRAIN,0,0 +6402,TRAIN,0,0 +6403,TRAIN,0,0 +6404,TRAIN,0,0 +6405,TRAIN,0,0 +6406,TRAIN,0,0 +6407,TRAIN,0,0 +6408,TRAIN,0,0 +6409,TRAIN,0,0 +6410,TRAIN,0,0 +6411,TRAIN,0,0 +6412,TRAIN,0,0 +6413,TRAIN,0,0 +6414,TRAIN,0,0 +6415,TRAIN,0,0 +6416,TRAIN,0,0 +6417,TRAIN,0,0 +6418,TRAIN,0,0 +6419,TRAIN,0,0 +6420,TRAIN,0,0 +6421,TRAIN,0,0 +6422,TRAIN,0,0 +6423,TRAIN,0,0 +6424,TRAIN,0,0 +6425,TRAIN,0,0 +6426,TRAIN,0,0 +6427,TRAIN,0,0 +6428,TRAIN,0,0 +6429,TRAIN,0,0 +6430,TRAIN,0,0 +6431,TRAIN,0,0 +6432,TRAIN,0,0 +6433,TRAIN,0,0 +6434,TRAIN,0,0 +6435,TRAIN,0,0 +6436,TRAIN,0,0 +6437,TRAIN,0,0 +6438,TRAIN,0,0 +6439,TRAIN,0,0 +6440,TRAIN,0,0 +6441,TRAIN,0,0 +6442,TRAIN,0,0 +6443,TRAIN,0,0 +6444,TRAIN,0,0 +6445,TRAIN,0,0 +6446,TRAIN,0,0 +6447,TRAIN,0,0 +6448,TRAIN,0,0 +6449,TRAIN,0,0 +6450,TRAIN,0,0 +6451,TRAIN,0,0 +6452,TRAIN,0,0 +6453,TRAIN,0,0 +6454,TRAIN,0,0 +6455,TRAIN,0,0 +6456,TRAIN,0,0 +6457,TRAIN,0,0 +6458,TRAIN,0,0 +6459,TRAIN,0,0 +6460,TRAIN,0,0 +6461,TRAIN,0,0 +6462,TRAIN,0,0 +6463,TRAIN,0,0 +6464,TRAIN,0,0 +6465,TRAIN,0,0 +6466,TRAIN,0,0 +6467,TRAIN,0,0 +6468,TRAIN,0,0 +6469,TRAIN,0,0 +6470,TRAIN,0,0 +6471,TRAIN,0,0 +6472,TRAIN,0,0 +6473,TRAIN,0,0 +6474,TRAIN,0,0 +6475,TRAIN,0,0 +6476,TRAIN,0,0 +6477,TRAIN,0,0 +6478,TRAIN,0,0 +6479,TRAIN,0,0 +6480,TRAIN,0,0 +6481,TRAIN,0,0 +6482,TRAIN,0,0 +6483,TRAIN,0,0 +6484,TRAIN,0,0 +6485,TRAIN,0,0 +6486,TRAIN,0,0 +6487,TRAIN,0,0 +6488,TRAIN,0,0 +6489,TRAIN,0,0 +6490,TRAIN,0,0 +6491,TRAIN,0,0 +6492,TRAIN,0,0 +6493,TRAIN,0,0 +6494,TRAIN,0,0 +6495,TRAIN,0,0 +6496,TRAIN,0,0 +6497,TRAIN,0,0 +6498,TRAIN,0,0 
+6499,TRAIN,0,0 +6500,TRAIN,0,0 +6501,TRAIN,0,0 +6502,TRAIN,0,0 +6503,TRAIN,0,0 +6504,TRAIN,0,0 +6505,TRAIN,0,0 +6506,TRAIN,0,0 +6507,TRAIN,0,0 +6508,TRAIN,0,0 +6509,TRAIN,0,0 +6510,TRAIN,0,0 +6511,TRAIN,0,0 +6512,TRAIN,0,0 +6513,TRAIN,0,0 +6514,TRAIN,0,0 +6515,TRAIN,0,0 +6516,TRAIN,0,0 +6517,TRAIN,0,0 +6518,TRAIN,0,0 +6519,TRAIN,0,0 +6520,TRAIN,0,0 +6521,TRAIN,0,0 +6522,TRAIN,0,0 +6523,TRAIN,0,0 +6524,TRAIN,0,0 +6525,TRAIN,0,0 +6526,TRAIN,0,0 +6527,TRAIN,0,0 +6528,TRAIN,0,0 +6529,TRAIN,0,0 +6530,TRAIN,0,0 +6531,TRAIN,0,0 +6532,TRAIN,0,0 +6533,TRAIN,0,0 +6534,TRAIN,0,0 +6535,TRAIN,0,0 +6536,TRAIN,0,0 +6537,TRAIN,0,0 +6538,TRAIN,0,0 +6539,TRAIN,0,0 +6540,TRAIN,0,0 +6541,TRAIN,0,0 +6542,TRAIN,0,0 +6543,TRAIN,0,0 +6544,TRAIN,0,0 +6545,TRAIN,0,0 +6546,TRAIN,0,0 +6547,TRAIN,0,0 +6548,TRAIN,0,0 +6549,TRAIN,0,0 +6550,TRAIN,0,0 +6551,TRAIN,0,0 +6552,TRAIN,0,0 +6553,TRAIN,0,0 +6554,TRAIN,0,0 +6555,TRAIN,0,0 +6556,TRAIN,0,0 +6557,TRAIN,0,0 +6558,TRAIN,0,0 +6559,TRAIN,0,0 +6560,TRAIN,0,0 +6561,TRAIN,0,0 +6562,TRAIN,0,0 +6563,TRAIN,0,0 +6564,TRAIN,0,0 +6565,TRAIN,0,0 +6566,TRAIN,0,0 +6567,TRAIN,0,0 +6568,TRAIN,0,0 +6569,TRAIN,0,0 +6570,TRAIN,0,0 +6571,TRAIN,0,0 +6572,TRAIN,0,0 +6573,TRAIN,0,0 +6574,TRAIN,0,0 +6575,TRAIN,0,0 +6576,TRAIN,0,0 +6577,TRAIN,0,0 +6578,TRAIN,0,0 +6579,TRAIN,0,0 +6580,TRAIN,0,0 +6581,TRAIN,0,0 +6582,TRAIN,0,0 +6583,TRAIN,0,0 +6584,TRAIN,0,0 +6585,TRAIN,0,0 +6586,TRAIN,0,0 +6587,TRAIN,0,0 +6588,TRAIN,0,0 +6589,TRAIN,0,0 +6590,TRAIN,0,0 +6591,TRAIN,0,0 +6592,TRAIN,0,0 +6593,TRAIN,0,0 +6594,TRAIN,0,0 +6595,TRAIN,0,0 +6596,TRAIN,0,0 +6597,TRAIN,0,0 +6598,TRAIN,0,0 +6599,TRAIN,0,0 +6600,TRAIN,0,0 +6601,TRAIN,0,0 +6602,TRAIN,0,0 +6603,TRAIN,0,0 +6604,TRAIN,0,0 +6605,TRAIN,0,0 +6606,TRAIN,0,0 +6607,TRAIN,0,0 +6608,TRAIN,0,0 +6609,TRAIN,0,0 +6610,TRAIN,0,0 +6611,TRAIN,0,0 +6612,TRAIN,0,0 +6613,TRAIN,0,0 +6614,TRAIN,0,0 +6615,TRAIN,0,0 +6616,TRAIN,0,0 +6617,TRAIN,0,0 +6618,TRAIN,0,0 +6619,TRAIN,0,0 +6620,TRAIN,0,0 +6621,TRAIN,0,0 +6622,TRAIN,0,0 +6623,TRAIN,0,0 +6624,TRAIN,0,0 +6625,TRAIN,0,0 +6626,TRAIN,0,0 +6627,TRAIN,0,0 +6628,TRAIN,0,0 +6629,TRAIN,0,0 +6630,TRAIN,0,0 +6631,TRAIN,0,0 +6632,TRAIN,0,0 +6633,TRAIN,0,0 +6634,TRAIN,0,0 +6635,TRAIN,0,0 +6636,TRAIN,0,0 +6637,TRAIN,0,0 +6638,TRAIN,0,0 +6639,TRAIN,0,0 +6640,TRAIN,0,0 +6641,TRAIN,0,0 +6642,TRAIN,0,0 +6643,TRAIN,0,0 +6644,TRAIN,0,0 +6645,TRAIN,0,0 +6646,TRAIN,0,0 +6647,TRAIN,0,0 +6648,TRAIN,0,0 +6649,TRAIN,0,0 +6650,TRAIN,0,0 +6651,TRAIN,0,0 +6652,TRAIN,0,0 +6653,TRAIN,0,0 +6654,TRAIN,0,0 +6655,TRAIN,0,0 +6656,TRAIN,0,0 +6657,TRAIN,0,0 +6658,TRAIN,0,0 +6659,TRAIN,0,0 +6660,TRAIN,0,0 +6661,TRAIN,0,0 +6662,TRAIN,0,0 +6663,TRAIN,0,0 +6664,TRAIN,0,0 +6665,TRAIN,0,0 +6666,TRAIN,0,0 +6667,TRAIN,0,0 +6668,TRAIN,0,0 +6669,TRAIN,0,0 +6670,TRAIN,0,0 +6671,TRAIN,0,0 +6672,TRAIN,0,0 +6673,TRAIN,0,0 +6674,TRAIN,0,0 +6675,TRAIN,0,0 +6676,TRAIN,0,0 +6677,TRAIN,0,0 +6678,TRAIN,0,0 +6679,TRAIN,0,0 +6680,TRAIN,0,0 +6681,TRAIN,0,0 +6682,TRAIN,0,0 +6683,TRAIN,0,0 +6684,TRAIN,0,0 +6685,TRAIN,0,0 +6686,TRAIN,0,0 +6687,TRAIN,0,0 +6688,TRAIN,0,0 +6689,TRAIN,0,0 +6690,TRAIN,0,0 +6691,TRAIN,0,0 +6692,TRAIN,0,0 +6693,TRAIN,0,0 +6694,TRAIN,0,0 +6695,TRAIN,0,0 +6696,TRAIN,0,0 +6697,TRAIN,0,0 +6698,TRAIN,0,0 +6699,TRAIN,0,0 +6700,TRAIN,0,0 +6701,TRAIN,0,0 +6702,TRAIN,0,0 +6703,TRAIN,0,0 +6704,TRAIN,0,0 +6705,TRAIN,0,0 +6706,TRAIN,0,0 +6707,TRAIN,0,0 +6708,TRAIN,0,0 +6709,TRAIN,0,0 +6710,TRAIN,0,0 +6711,TRAIN,0,0 +6712,TRAIN,0,0 +6713,TRAIN,0,0 +6714,TRAIN,0,0 +6715,TRAIN,0,0 +6716,TRAIN,0,0 +6717,TRAIN,0,0 +6718,TRAIN,0,0 +6719,TRAIN,0,0 +6720,TRAIN,0,0 
+6721,TRAIN,0,0 +6722,TRAIN,0,0 +6723,TRAIN,0,0 +6724,TRAIN,0,0 +6725,TRAIN,0,0 +6726,TRAIN,0,0 +6727,TRAIN,0,0 +6728,TRAIN,0,0 +6729,TRAIN,0,0 +6730,TRAIN,0,0 +6731,TRAIN,0,0 +6732,TRAIN,0,0 +6733,TRAIN,0,0 +6734,TRAIN,0,0 +6735,TRAIN,0,0 +6736,TRAIN,0,0 +6737,TRAIN,0,0 +6738,TRAIN,0,0 +6739,TRAIN,0,0 +6740,TRAIN,0,0 +6741,TRAIN,0,0 +6742,TRAIN,0,0 +6743,TRAIN,0,0 +6744,TRAIN,0,0 +6745,TRAIN,0,0 +6746,TRAIN,0,0 +6747,TRAIN,0,0 +6748,TRAIN,0,0 +6749,TRAIN,0,0 +6750,TRAIN,0,0 +6751,TRAIN,0,0 +6752,TRAIN,0,0 +6753,TRAIN,0,0 +6754,TRAIN,0,0 +6755,TRAIN,0,0 +6756,TRAIN,0,0 +6757,TRAIN,0,0 +6758,TRAIN,0,0 +6759,TRAIN,0,0 +6760,TRAIN,0,0 +6761,TRAIN,0,0 +6762,TRAIN,0,0 +6763,TRAIN,0,0 +6764,TRAIN,0,0 +6765,TRAIN,0,0 +6766,TRAIN,0,0 +6767,TRAIN,0,0 +6768,TRAIN,0,0 +6769,TRAIN,0,0 +6770,TRAIN,0,0 +6771,TRAIN,0,0 +6772,TRAIN,0,0 +6773,TRAIN,0,0 +6774,TRAIN,0,0 +6775,TRAIN,0,0 +6776,TRAIN,0,0 +6777,TRAIN,0,0 +6778,TRAIN,0,0 +6779,TRAIN,0,0 +6780,TRAIN,0,0 +6781,TRAIN,0,0 +6782,TRAIN,0,0 +6783,TRAIN,0,0 +6784,TRAIN,0,0 +6785,TRAIN,0,0 +6786,TRAIN,0,0 +6787,TRAIN,0,0 +6788,TRAIN,0,0 +6789,TRAIN,0,0 +6790,TRAIN,0,0 +6791,TRAIN,0,0 +6792,TRAIN,0,0 +6793,TRAIN,0,0 +6794,TRAIN,0,0 +6795,TRAIN,0,0 +6796,TRAIN,0,0 +6797,TRAIN,0,0 +6798,TRAIN,0,0 +6799,TRAIN,0,0 +6800,TRAIN,0,0 +6801,TRAIN,0,0 +6802,TRAIN,0,0 +6803,TRAIN,0,0 +6804,TRAIN,0,0 +6805,TRAIN,0,0 +6806,TRAIN,0,0 +6807,TRAIN,0,0 +6808,TRAIN,0,0 +6809,TRAIN,0,0 +6810,TRAIN,0,0 +6811,TRAIN,0,0 +6812,TRAIN,0,0 +6813,TRAIN,0,0 +6814,TRAIN,0,0 +6815,TRAIN,0,0 +6816,TRAIN,0,0 +6817,TRAIN,0,0 +6818,TRAIN,0,0 +6819,TRAIN,0,0 +6820,TRAIN,0,0 +6821,TRAIN,0,0 +6822,TRAIN,0,0 +6823,TRAIN,0,0 +6824,TRAIN,0,0 +6825,TRAIN,0,0 +6826,TRAIN,0,0 +6827,TRAIN,0,0 +6828,TRAIN,0,0 +6829,TRAIN,0,0 +6830,TRAIN,0,0 +6831,TRAIN,0,0 +6832,TRAIN,0,0 +6833,TRAIN,0,0 +6834,TRAIN,0,0 +6835,TRAIN,0,0 +6836,TRAIN,0,0 +6837,TRAIN,0,0 +6838,TRAIN,0,0 +6839,TRAIN,0,0 +6840,TRAIN,0,0 +6841,TRAIN,0,0 +6842,TRAIN,0,0 +6843,TRAIN,0,0 +6844,TRAIN,0,0 +6845,TRAIN,0,0 +6846,TRAIN,0,0 +6847,TRAIN,0,0 +6848,TRAIN,0,0 +6849,TRAIN,0,0 +6850,TRAIN,0,0 +6851,TRAIN,0,0 +6852,TRAIN,0,0 +6853,TRAIN,0,0 +6854,TRAIN,0,0 +6855,TRAIN,0,0 +6856,TRAIN,0,0 +6857,TRAIN,0,0 +6858,TRAIN,0,0 +6859,TRAIN,0,0 +6860,TRAIN,0,0 +6861,TRAIN,0,0 +6862,TRAIN,0,0 +6863,TRAIN,0,0 +6864,TRAIN,0,0 +6865,TRAIN,0,0 +6866,TRAIN,0,0 +6867,TRAIN,0,0 +6868,TRAIN,0,0 +6869,TRAIN,0,0 +6870,TRAIN,0,0 +6871,TRAIN,0,0 +6872,TRAIN,0,0 +6873,TRAIN,0,0 +6874,TRAIN,0,0 +6875,TRAIN,0,0 +6876,TRAIN,0,0 +6877,TRAIN,0,0 +6878,TRAIN,0,0 +6879,TRAIN,0,0 +6880,TRAIN,0,0 +6881,TRAIN,0,0 +6882,TRAIN,0,0 +6883,TRAIN,0,0 +6884,TRAIN,0,0 +6885,TRAIN,0,0 +6886,TRAIN,0,0 +6887,TRAIN,0,0 +6888,TRAIN,0,0 +6889,TRAIN,0,0 +6890,TRAIN,0,0 +6891,TRAIN,0,0 +6892,TRAIN,0,0 +6893,TRAIN,0,0 +6894,TRAIN,0,0 +6895,TRAIN,0,0 +6896,TRAIN,0,0 +6897,TRAIN,0,0 +6898,TRAIN,0,0 +6899,TRAIN,0,0 +6900,TRAIN,0,0 +6901,TRAIN,0,0 +6902,TRAIN,0,0 +6903,TRAIN,0,0 +6904,TRAIN,0,0 +6905,TRAIN,0,0 +6906,TRAIN,0,0 +6907,TRAIN,0,0 +6908,TRAIN,0,0 +6909,TRAIN,0,0 +6910,TRAIN,0,0 +6911,TRAIN,0,0 +6912,TRAIN,0,0 +6913,TRAIN,0,0 +6914,TRAIN,0,0 +6915,TRAIN,0,0 +6916,TRAIN,0,0 +6917,TRAIN,0,0 +6918,TRAIN,0,0 +6919,TRAIN,0,0 +6920,TRAIN,0,0 +6921,TRAIN,0,0 +6922,TRAIN,0,0 +6923,TRAIN,0,0 +6924,TRAIN,0,0 +6925,TRAIN,0,0 +6926,TRAIN,0,0 +6927,TRAIN,0,0 +6928,TRAIN,0,0 +6929,TRAIN,0,0 +6930,TRAIN,0,0 +6931,TRAIN,0,0 +6932,TRAIN,0,0 +6933,TRAIN,0,0 +6934,TRAIN,0,0 +6935,TRAIN,0,0 +6936,TRAIN,0,0 +6937,TRAIN,0,0 +6938,TRAIN,0,0 +6939,TRAIN,0,0 +6940,TRAIN,0,0 +6941,TRAIN,0,0 +6942,TRAIN,0,0 
+6943,TRAIN,0,0 +6944,TRAIN,0,0 +6945,TRAIN,0,0 +6946,TRAIN,0,0 +6947,TRAIN,0,0 +6948,TRAIN,0,0 +6949,TRAIN,0,0 +6950,TRAIN,0,0 +6951,TRAIN,0,0 +6952,TRAIN,0,0 +6953,TRAIN,0,0 +6954,TRAIN,0,0 +6955,TRAIN,0,0 +6956,TRAIN,0,0 +6957,TRAIN,0,0 +6958,TRAIN,0,0 +6959,TRAIN,0,0 +6960,TRAIN,0,0 +6961,TRAIN,0,0 +6962,TRAIN,0,0 +6963,TRAIN,0,0 +6964,TRAIN,0,0 +6965,TRAIN,0,0 +6966,TRAIN,0,0 +6967,TRAIN,0,0 +6968,TRAIN,0,0 +6969,TRAIN,0,0 +6970,TRAIN,0,0 +6971,TRAIN,0,0 +6972,TRAIN,0,0 +6973,TRAIN,0,0 +6974,TRAIN,0,0 +6975,TRAIN,0,0 +6976,TRAIN,0,0 +6977,TRAIN,0,0 +6978,TRAIN,0,0 +6979,TRAIN,0,0 +6980,TRAIN,0,0 +6981,TRAIN,0,0 +6982,TRAIN,0,0 +6983,TRAIN,0,0 +6984,TRAIN,0,0 +6985,TRAIN,0,0 +6986,TRAIN,0,0 +6987,TRAIN,0,0 +6988,TRAIN,0,0 +6989,TRAIN,0,0 +6990,TRAIN,0,0 +6991,TRAIN,0,0 +6992,TRAIN,0,0 +6993,TRAIN,0,0 +6994,TRAIN,0,0 +6995,TRAIN,0,0 +6996,TRAIN,0,0 +6997,TRAIN,0,0 +6998,TRAIN,0,0 +6999,TRAIN,0,0 +7000,TRAIN,0,0 +7001,TRAIN,0,0 +7002,TRAIN,0,0 +7003,TRAIN,0,0 +7004,TRAIN,0,0 +7005,TRAIN,0,0 +7006,TRAIN,0,0 +7007,TRAIN,0,0 +7008,TRAIN,0,0 +7009,TRAIN,0,0 +7010,TRAIN,0,0 +7011,TRAIN,0,0 +7012,TRAIN,0,0 +7013,TRAIN,0,0 +7014,TRAIN,0,0 +7015,TRAIN,0,0 +7016,TRAIN,0,0 +7017,TRAIN,0,0 +7018,TRAIN,0,0 +7019,TRAIN,0,0 +7020,TRAIN,0,0 +7021,TRAIN,0,0 +7022,TRAIN,0,0 +7023,TRAIN,0,0 +7024,TRAIN,0,0 +7025,TRAIN,0,0 +7026,TRAIN,0,0 diff --git a/datasets/anomaly_reserve/kpi/TRAIN/problem_TRAIN/problemDoc.json b/datasets/anomaly_reserve/kpi/TRAIN/problem_TRAIN/problemDoc.json new file mode 100644 index 0000000..1fd55ad --- /dev/null +++ b/datasets/anomaly_reserve/kpi/TRAIN/problem_TRAIN/problemDoc.json @@ -0,0 +1,65 @@ +{ + "about": { + "problemID": "kpi_problem", + "problemName": "kpi_problem", + "problemDescription": "Anomaly detection", + "problemVersion": "4.0.0", + "problemSchemaVersion": "4.0.0", + "taskKeywords": [ + "classification", + "binary", + "tabular" + ] + }, + "inputs": { + "data": [ + { + "datasetID": "kpi_dataset", + "targets": [ + { + "targetIndex": 0, + "resID": "learningData", + "colIndex": 3, + "colName": "ground_truth" + } + ] + } + ], + "dataSplits": { + "method": "holdOut", + "testSize": 0.2, + "stratified": true, + "numRepeats": 0, + "randomSeed": 42, + "splitsFile": "dataSplits.csv", + "datasetViewMaps": { + "train": [ + { + "from": "kpi_dataset", + "to": "kpi_dataset_TRAIN" + } + ], + "test": [ + { + "from": "kpi_dataset", + "to": "kpi_dataset_TEST" + } + ], + "score": [ + { + "from": "kpi_dataset", + "to": "kpi_dataset_SCORE" + } + ] + } + }, + "performanceMetrics": [ + { + "metric": "f1Macro" + } + ] + }, + "expectedOutputs": { + "predictionsFile": "predictions.csv" + } +} \ No newline at end of file diff --git a/datasets/anomaly_reserve/kpi/kpi_dataset/datasetDoc.json b/datasets/anomaly_reserve/kpi/kpi_dataset/datasetDoc.json new file mode 100644 index 0000000..c062016 --- /dev/null +++ b/datasets/anomaly_reserve/kpi/kpi_dataset/datasetDoc.json @@ -0,0 +1,63 @@ +{ + "about": { + "datasetID": "kpi_dataset", + "datasetName": "kpi", + "description": "Database of baseball players and play statistics, including 'Games_played', 'At_bats', 'Runs', 'Hits', 'Doubles', 'Triples', 'Home_runs', 'RBIs', 'Walks', 'Strikeouts', 'Batting_average', 'On_base_pct', 'Slugging_pct' and 'Fielding_ave'", + "citation": " @book{simonoff2003analyzing,title={Analyzing Categorical Data},author={Simonoff, J.S.},isbn={9780387007496},lccn={2003044946},series={Springer Texts in Statistics},url={https://books.google.com/books?id=G8wrifweAoC},year={2003},publisher={Springer New York}} ", + "license": " 
CC Public Domain Mark 1.0 ", + "source": "OpenML", + "sourceURI": "http://www.openml.org/d/185", + "approximateSize": "", + "datasetSchemaVersion": "4.0.0", + "redacted": false, + "datasetVersion": "4.0.0" + }, + "dataResources": [ + { + "resID": "learningData", + "resPath": "tables/learningData.csv", + "resType": "table", + "resFormat": { + "text/csv": [ + "csv" + ] + }, + "isCollection": false, + "columns": [ + { + "colIndex": 0, + "colName": "d3mIndex", + "colType": "integer", + "role": [ + "index" + ] + }, + { + "colIndex": 1, + "colName": "timestamp", + "colType": "integer", + "role": [ + "attribute" + ] + }, + { + "colIndex": 2, + "colName": "value", + "colType": "real", + "role": [ + "attribute" + ] + }, + { + "colIndex": 3, + "colName": "ground_truth", + "colType": "integer", + "role": [ + "suggestedTarget" + ] + } + ], + "columnsCount": 4 + } + ] +} \ No newline at end of file diff --git a/datasets/anomaly_reserve/kpi/kpi_dataset/tables/learningData.csv.REMOVED.git-id b/datasets/anomaly_reserve/kpi/kpi_dataset/tables/learningData.csv.REMOVED.git-id new file mode 100644 index 0000000..3a466ae --- /dev/null +++ b/datasets/anomaly_reserve/kpi/kpi_dataset/tables/learningData.csv.REMOVED.git-id @@ -0,0 +1 @@ +d80846dc46c173472f646a52005a1fb3670ccd09 \ No newline at end of file diff --git a/datasets/anomaly_reserve/kpi/kpi_problem/dataSplits.csv.REMOVED.git-id b/datasets/anomaly_reserve/kpi/kpi_problem/dataSplits.csv.REMOVED.git-id new file mode 100644 index 0000000..7376921 --- /dev/null +++ b/datasets/anomaly_reserve/kpi/kpi_problem/dataSplits.csv.REMOVED.git-id @@ -0,0 +1 @@ +44db328c252a8156434142a37ef65765869e7548 \ No newline at end of file diff --git a/datasets/anomaly_reserve/kpi/kpi_problem/problemDoc.json b/datasets/anomaly_reserve/kpi/kpi_problem/problemDoc.json new file mode 100644 index 0000000..1fd55ad --- /dev/null +++ b/datasets/anomaly_reserve/kpi/kpi_problem/problemDoc.json @@ -0,0 +1,65 @@ +{ + "about": { + "problemID": "kpi_problem", + "problemName": "kpi_problem", + "problemDescription": "Anomaly detection", + "problemVersion": "4.0.0", + "problemSchemaVersion": "4.0.0", + "taskKeywords": [ + "classification", + "binary", + "tabular" + ] + }, + "inputs": { + "data": [ + { + "datasetID": "kpi_dataset", + "targets": [ + { + "targetIndex": 0, + "resID": "learningData", + "colIndex": 3, + "colName": "ground_truth" + } + ] + } + ], + "dataSplits": { + "method": "holdOut", + "testSize": 0.2, + "stratified": true, + "numRepeats": 0, + "randomSeed": 42, + "splitsFile": "dataSplits.csv", + "datasetViewMaps": { + "train": [ + { + "from": "kpi_dataset", + "to": "kpi_dataset_TRAIN" + } + ], + "test": [ + { + "from": "kpi_dataset", + "to": "kpi_dataset_TEST" + } + ], + "score": [ + { + "from": "kpi_dataset", + "to": "kpi_dataset_SCORE" + } + ] + } + }, + "performanceMetrics": [ + { + "metric": "f1Macro" + } + ] + }, + "expectedOutputs": { + "predictionsFile": "predictions.csv" + } +} \ No newline at end of file diff --git a/datasets/anomaly_reserve/raw_data/kpi.csv.REMOVED.git-id b/datasets/anomaly_reserve/raw_data/kpi.csv.REMOVED.git-id new file mode 100644 index 0000000..9049ed1 --- /dev/null +++ b/datasets/anomaly_reserve/raw_data/kpi.csv.REMOVED.git-id @@ -0,0 +1 @@ +bea5d1c052730eaba76b84ff5df854477cdfa80b \ No newline at end of file diff --git a/datasets/anomaly_reserve/template/datasetDoc.json b/datasets/anomaly_reserve/template/datasetDoc.json new file mode 100644 index 0000000..0494777 --- /dev/null +++ b/datasets/anomaly_reserve/template/datasetDoc.json @@ -0,0 +1,183 
@@ +{ + "about": { + "datasetID": "template", + "datasetName": "baseball", + "description": "Database of baseball players and play statistics, including 'Games_played', 'At_bats', 'Runs', 'Hits', 'Doubles', 'Triples', 'Home_runs', 'RBIs', 'Walks', 'Strikeouts', 'Batting_average', 'On_base_pct', 'Slugging_pct' and 'Fielding_ave'", + "citation": " @book{simonoff2003analyzing,title={Analyzing Categorical Data},author={Simonoff, J.S.},isbn={9780387007496},lccn={2003044946},series={Springer Texts in Statistics},url={https://books.google.com/books?id=G8wrifweAoC},year={2003},publisher={Springer New York}} ", + "license": " CC Public Domain Mark 1.0 ", + "source": "OpenML", + "sourceURI": "http://www.openml.org/d/185", + "approximateSize": "", + "datasetSchemaVersion": "4.0.0", + "redacted": false, + "datasetVersion": "4.0.0" + }, + "dataResources": [ + { + "resID": "learningData", + "resPath": "tables/learningData.csv", + "resType": "table", + "resFormat": { + "text/csv": [ + "csv" + ] + }, + "isCollection": false, + "columns": [ + { + "colIndex": 0, + "colName": "d3mIndex", + "colType": "integer", + "role": [ + "index" + ] + }, + { + "colIndex": 1, + "colName": "Player", + "colType": "categorical", + "role": [ + "attribute" + ] + }, + { + "colIndex": 2, + "colName": "Number_seasons", + "colType": "integer", + "role": [ + "attribute" + ] + }, + { + "colIndex": 3, + "colName": "Games_played", + "colType": "integer", + "role": [ + "attribute" + ] + }, + { + "colIndex": 4, + "colName": "At_bats", + "colType": "integer", + "role": [ + "attribute" + ] + }, + { + "colIndex": 5, + "colName": "Runs", + "colType": "integer", + "role": [ + "attribute" + ] + }, + { + "colIndex": 6, + "colName": "Hits", + "colType": "integer", + "role": [ + "attribute" + ] + }, + { + "colIndex": 7, + "colName": "Doubles", + "colType": "integer", + "role": [ + "attribute" + ] + }, + { + "colIndex": 8, + "colName": "Triples", + "colType": "integer", + "role": [ + "attribute" + ] + }, + { + "colIndex": 9, + "colName": "Home_runs", + "colType": "integer", + "role": [ + "attribute" + ] + }, + { + "colIndex": 10, + "colName": "RBIs", + "colType": "integer", + "role": [ + "attribute" + ] + }, + { + "colIndex": 11, + "colName": "Walks", + "colType": "integer", + "role": [ + "attribute" + ] + }, + { + "colIndex": 12, + "colName": "Strikeouts", + "colType": "integer", + "role": [ + "attribute" + ] + }, + { + "colIndex": 13, + "colName": "Batting_average", + "colType": "real", + "role": [ + "attribute" + ] + }, + { + "colIndex": 14, + "colName": "On_base_pct", + "colType": "real", + "role": [ + "attribute" + ] + }, + { + "colIndex": 15, + "colName": "Slugging_pct", + "colType": "real", + "role": [ + "attribute" + ] + }, + { + "colIndex": 16, + "colName": "Fielding_ave", + "colType": "real", + "role": [ + "attribute" + ] + }, + { + "colIndex": 17, + "colName": "Position", + "colType": "categorical", + "role": [ + "attribute" + ] + }, + { + "colIndex": 18, + "colName": "Hall_of_Fame", + "colType": "categorical", + "role": [ + "suggestedTarget" + ] + } + ], + "columnsCount": 19 + } + ] +} diff --git a/datasets/anomaly_reserve/template/problemDoc.json b/datasets/anomaly_reserve/template/problemDoc.json new file mode 100644 index 0000000..514a80a --- /dev/null +++ b/datasets/anomaly_reserve/template/problemDoc.json @@ -0,0 +1,65 @@ +{ + "about": { + "problemID": "template", + "problemName": "baseball_problem", + "problemDescription": "**Author**: Jeffrey S. 
Simonoff \n**Source**: [AnalCatData](http://www.stern.nyu.edu/~jsimonof/AnalCatData) - 2003 \n**Please cite**: Jeffrey S. Simonoff, Analyzing Categorical Data, Springer-Verlag, New York, 2003 \n \nDatabase of baseball players and play statistics, including 'Games_played', 'At_bats', 'Runs', 'Hits', 'Doubles', 'Triples', 'Home_runs', 'RBIs', 'Walks', 'Strikeouts', 'Batting_average', 'On_base_pct', 'Slugging_pct' and 'Fielding_ave' \n\nNotes: \n* Quotes, Single-Quotes and Backslashes were removed, Blanks replaced with Underscores\n* Player is an identifier that should be ignored when modelling the data", + "problemVersion": "4.0.0", + "problemSchemaVersion": "4.0.0", + "taskKeywords": [ + "classification", + "multiClass", + "tabular" + ] + }, + "inputs": { + "data": [ + { + "datasetID": "185_baseball_dataset", + "targets": [ + { + "targetIndex": 0, + "resID": "learningData", + "colIndex": 18, + "colName": "Hall_of_Fame" + } + ] + } + ], + "dataSplits": { + "method": "holdOut", + "testSize": 0.2, + "stratified": true, + "numRepeats": 0, + "randomSeed": 42, + "splitsFile": "dataSplits.csv", + "datasetViewMaps": { + "train": [ + { + "from": "185_baseball_dataset", + "to": "185_baseball_dataset_TRAIN" + } + ], + "test": [ + { + "from": "185_baseball_dataset", + "to": "185_baseball_dataset_TEST" + } + ], + "score": [ + { + "from": "185_baseball_dataset", + "to": "185_baseball_dataset_SCORE" + } + ] + } + }, + "performanceMetrics": [ + { + "metric": "f1Macro" + } + ] + }, + "expectedOutputs": { + "predictionsFile": "predictions.csv" + } +} diff --git a/datasets/anomaly_reserve/transform.py b/datasets/anomaly_reserve/transform.py new file mode 100644 index 0000000..e8bd319 --- /dev/null +++ b/datasets/anomaly_reserve/transform.py @@ -0,0 +1,160 @@ +# TODO: Wrap it as a class and connect it to GUI +# A script to transform anomaly data to d3m format +import pandas as pd +import numpy as np +import os +import json + +############################## +# Some information for the dataset to be transformed +# Designed for time series data +name = 'kpi' +src_path = './raw_data/kpi.csv' +label_name = 'label' +timestamp_name = 'timestamp' +value_names = ['value'] +ratio = 0.8 # Ratio of training data, the rest is for testing + +############################### + + + +dst_root = './' + name +dirs = ['./', 'SCORE', 'TEST', 'TRAIN'] +maps = {'./': None, 'SCORE': 'TEST', 'TEST': 'TEST', 'TRAIN': 'TRAIN'} + +# Create the corresponding directories +for d in dirs: + if maps[d] is not None: + dataset_name = 'dataset_' + maps[d] + problem_name = 'problem_' + maps[d] + else: + dataset_name = name + '_dataset' + problem_name = name + '_problem' + tables_dir = os.path.join(dst_root, d, dataset_name, 'tables') + if not os.path.exists(tables_dir): + os.makedirs(tables_dir) + problem_dir = os.path.join(dst_root, d, problem_name) + if not os.path.exists(problem_dir): + os.makedirs(problem_dir) + +# Process data +_df = pd.DataFrame() +df = pd.read_csv(src_path) +_df['d3mIndex'] = df.index +_df['timestamp'] = df[timestamp_name] +for value_name in value_names: + _df[value_name] = df[value_name] +_df['ground_truth'] = df[label_name] +df = _df +cols = df.columns.tolist() + +# Save all the data +df.to_csv(os.path.join(dst_root, name+'_dataset', 'tables', 'learningData.csv'), index=False) + +# Save training and testing data +train_df, test_df = df[:int(df.shape[0]*ratio)], df[int(df.shape[0]*ratio):] + +train_df.to_csv(os.path.join(dst_root, 'TRAIN', 'dataset_TRAIN', 'tables', 'learningData.csv'), index=False) 
+test_df.to_csv(os.path.join(dst_root, 'TEST', 'dataset_TEST', 'tables', 'learningData.csv'), index=False) +test_df.to_csv(os.path.join(dst_root, 'SCORE', 'dataset_TEST', 'tables', 'learningData.csv'), index=False) + +# Data splits +row_0 = train_df.shape[0] +row_1 = train_df.shape[0] +row = row_0 + row_1 +df = pd.DataFrame(np.array([[i for i in range(row)], ['TRAIN' for _ in range(row_0)] + ['TEST' for _ in range(row_1)], [0 for _ in range(row)], [0 for _ in range(row)]]).transpose(), columns = ['d3mIndex', 'type', 'repeat', 'fold']) + +# Save data splits for all data +train_df.to_csv(os.path.join(dst_root, name+'_problem', 'dataSplits.csv'), index=False) + +# Save training and testing splits +train_df, test_df = df[:row_0], df[row_0:] +train_df.to_csv(os.path.join(dst_root, 'TRAIN', 'problem_TRAIN', 'dataSplits.csv'), index=False) +test_df.to_csv(os.path.join(dst_root, 'TEST', 'problem_TEST', 'dataSplits.csv'), index=False) +test_df.to_csv(os.path.join(dst_root, 'SCORE', 'problem_TEST', 'dataSplits.csv'), index=False) + + +# Dataset JSON files +# Load template +with open('template/datasetDoc.json') as json_file: + data = json.load(json_file) +columns = [] +for i in range(len(cols)): + c = {} + c['colIndex'] = i + c['colName'] = cols[i] + if i == 0: + c['colType'] = 'integer' + c['role'] = ['index'] + elif i == 1: + c['colType'] = 'integer' + c['role'] = ['attribute'] + elif i == len(cols)-1: + c['colType'] = 'integer' + c['role'] = ['suggestedTarget'] + else: + c['colType'] = 'real' + c['role'] = ['attribute'] + columns.append(c) +data['dataResources'][0]['columns'] = columns +data['dataResources'][0]['columnsCount'] = len(cols) + +data['about']['datasetID'] = name + '_dataset' +data['about']['datasetName'] = name +with open(os.path.join(dst_root, name+'_dataset', 'datasetDoc.json'), 'w') as outfile: + json.dump(data, outfile, indent=4) + +data['about']['datasetID'] = name +'_dataset_TRAIN' +data['about']['datasetName'] = "NULL" +with open(os.path.join(dst_root, 'TRAIN', 'dataset_TRAIN', 'datasetDoc.json'), 'w') as outfile: + json.dump(data, outfile, indent=4) + +data['about']['datasetID'] = name + '_dataset_TEST' +data['about']['datasetName'] = 'NULL' +with open(os.path.join(dst_root, 'TEST', 'dataset_TEST', 'datasetDoc.json'), 'w') as outfile: + json.dump(data, outfile, indent=4) + +data['about']['datasetID'] = name + '_dataset_TEST' +data['about']['datasetName'] = 'NULL' +with open(os.path.join(dst_root, 'SCORE', 'dataset_TEST', 'datasetDoc.json'), 'w') as outfile: + json.dump(data, outfile, indent=4) + +# Problem JSON files +# Load template +with open('template/problemDoc.json') as json_file: + data = json.load(json_file) + +data['about']['problemID'] = name+'_problem' +data['about']['problemName'] = name+'_problem' +data['about']['problemDescription'] = 'Anomaly detection' +data['about']['taskKeywords'] = ['classification', 'binary', 'tabular'] +data['inputs']['data'][0]['datasetID'] = name + '_dataset' +data['inputs']['data'][0]['targets'][0]['colIndex'] = len(cols)-1 +data['inputs']['data'][0]['targets'][0]['colName'] = cols[-1] +data['inputs']['dataSplits']['datasetViewMaps']['train'][0]['from'] = name+'_dataset' +data['inputs']['dataSplits']['datasetViewMaps']['test'][0]['from'] = name+'_dataset' +data['inputs']['dataSplits']['datasetViewMaps']['score'][0]['from'] = name+'_dataset' +data['inputs']['dataSplits']['datasetViewMaps']['train'][0]['to'] = name+'_dataset_TRAIN' +data['inputs']['dataSplits']['datasetViewMaps']['test'][0]['to'] = name+'_dataset_TEST' 
+data['inputs']['dataSplits']['datasetViewMaps']['score'][0]['to'] = name+'_dataset_SCORE' + +with open(os.path.join(dst_root, name+'_problem', 'problemDoc.json'), 'w') as outfile: + json.dump(data, outfile, indent=4) + +with open(os.path.join(dst_root, 'TRAIN', 'problem_TRAIN', 'problemDoc.json'), 'w') as outfile: + json.dump(data, outfile, indent=4) + +with open(os.path.join(dst_root, 'TEST', 'problem_TEST', 'problemDoc.json'), 'w') as outfile: + json.dump(data, outfile, indent=4) + +with open(os.path.join(dst_root, 'SCORE', 'problem_TEST', 'problemDoc.json'), 'w') as outfile: + json.dump(data, outfile, indent=4) + +# Make an empty targets.csv +with open(os.path.join(dst_root, 'SCORE', 'targets.csv'), 'w') as outfile: + outfile.write('') + + + + diff --git a/datasets/anomaly_reserve/yahoo_sub_5/SCORE/dataset_TEST/datasetDoc.json b/datasets/anomaly_reserve/yahoo_sub_5/SCORE/dataset_TEST/datasetDoc.json new file mode 100644 index 0000000..ff5dec4 --- /dev/null +++ b/datasets/anomaly_reserve/yahoo_sub_5/SCORE/dataset_TEST/datasetDoc.json @@ -0,0 +1,95 @@ +{ + "about": { + "datasetID": "yahoo_sub_5_dataset_TEST", + "datasetName": "NULL", + "description": "Database of baseball players and play statistics, including 'Games_played', 'At_bats', 'Runs', 'Hits', 'Doubles', 'Triples', 'Home_runs', 'RBIs', 'Walks', 'Strikeouts', 'Batting_average', 'On_base_pct', 'Slugging_pct' and 'Fielding_ave'", + "citation": " @book{simonoff2003analyzing,title={Analyzing Categorical Data},author={Simonoff, J.S.},isbn={9780387007496},lccn={2003044946},series={Springer Texts in Statistics},url={https://books.google.com/books?id=G8wrifweAoC},year={2003},publisher={Springer New York}} ", + "license": " CC Public Domain Mark 1.0 ", + "source": "OpenML", + "sourceURI": "http://www.openml.org/d/185", + "approximateSize": "", + "datasetSchemaVersion": "4.0.0", + "redacted": false, + "datasetVersion": "4.0.0" + }, + "dataResources": [ + { + "resID": "learningData", + "resPath": "tables/learningData.csv", + "resType": "table", + "resFormat": { + "text/csv": [ + "csv" + ] + }, + "isCollection": false, + "columns": [ + { + "colIndex": 0, + "colName": "d3mIndex", + "colType": "integer", + "role": [ + "index" + ] + }, + { + "colIndex": 1, + "colName": "timestamp", + "colType": "integer", + "role": [ + "attribute" + ] + }, + { + "colIndex": 2, + "colName": "value_0", + "colType": "real", + "role": [ + "attribute" + ] + }, + { + "colIndex": 3, + "colName": "value_1", + "colType": "real", + "role": [ + "attribute" + ] + }, + { + "colIndex": 4, + "colName": "value_2", + "colType": "real", + "role": [ + "attribute" + ] + }, + { + "colIndex": 5, + "colName": "value_3", + "colType": "real", + "role": [ + "attribute" + ] + }, + { + "colIndex": 6, + "colName": "value_4", + "colType": "real", + "role": [ + "attribute" + ] + }, + { + "colIndex": 7, + "colName": "ground_truth", + "colType": "integer", + "role": [ + "suggestedTarget" + ] + } + ], + "columnsCount": 8 + } + ] +} \ No newline at end of file diff --git a/datasets/anomaly_reserve/yahoo_sub_5/SCORE/dataset_TEST/tables/learningData.csv b/datasets/anomaly_reserve/yahoo_sub_5/SCORE/dataset_TEST/tables/learningData.csv new file mode 100644 index 0000000..e3d5131 --- /dev/null +++ b/datasets/anomaly_reserve/yahoo_sub_5/SCORE/dataset_TEST/tables/learningData.csv @@ -0,0 +1,141 @@ +d3mIndex,timestamp,value_0,value_1,value_2,value_3,value_4,ground_truth +1260,1261,7782,0.034280386319742985,2.5072222222222003,104,3119,0 +1261,1262,7829,0.039360296791109,2.5927777777778,82,3590,0 
+1262,1263,7902,0.0,2.6894444444444,208,3893,0 +1263,1264,8039,0.038944065994356014,2.6291666666667,92,3264,0 +1264,1265,8350,0.18176011684739,2.6469444444444,53,3963,0 +1265,1266,8142,0.18521047165852,2.7461111111111003,65,2757,0 +1266,1267,7886,0.13079770999921,2.9363888888889,62,2306,0 +1267,1268,7743,0.13310058077443,3.2797222222222,73,2549,0 +1268,1269,7707,0.054750658073534006,3.5194444444444,84,2212,0 +1269,1270,7726,0.030588852697706,3.8130555555556,90,2286,0 +1270,1271,7717,0.12998124134227002,3.7941666666667,80,2979,0 +1271,1272,10331,0.09100057249197198,3.6086111111111,90,3158,0 +1272,1273,10515,0.19464543002904006,3.3858333333333,84,2645,0 +1273,1274,10415,0.22178651521516,3.3336111111111,34,3161,0 +1274,1275,10387,0.22983578430825,3.3116666666667003,67,4460,0 +1275,1276,10471,0.298229429356,3.2616666666667005,74,2630,0 +1276,1277,10385,0.12923377484588,3.0044444444444003,44,2593,0 +1277,1278,10439,0.19609416059774,2.6741666666667,64,2625,0 +1278,1279,10516,0.040518533819385014,2.3191666666667,70,4834,0 +1279,1280,10587,0.07099894663641,2.0597222222222,96,4056,0 +1280,1281,10586,0.07584150637714701,2.0547222222222,110,5713,0 +1281,1282,10684,0.08180100127782801,2.1511111111111,68,3940,0 +1282,1283,10880,0.0,2.2602777777778,90,4414,0 +1283,1284,10830,0.0,2.2883333333333,90,5044,0 +1284,1285,10794,0.09140162014739303,2.3736111111111,69,3894,0 +1285,1286,10843,0.0,2.5869444444444,46,3993,0 +1286,1287,10805,0.0,2.6480555555556,74,4404,0 +1287,1288,10996,0.0,2.6077777777777995,68,4072,0 +1288,1289,11327,0.05363316840061,2.6069444444444,67,4182,0 +1289,1290,11090,0.26818151064716,2.6908333333333,51,3351,0 +1290,1291,10578,0.21887772653901,2.9019444444444003,39,4183,0 +1291,1292,10528,0.32371296573811,3.2711111111111,26,4068,0 +1292,1293,10475,0.12565805017257,3.5872222222222,25,8139,0 +1293,1294,10664,0.092277247744574,3.6913888888889,32,11000,0 +1294,1295,10513,0.077016875742983,3.6313888888889,17,2975,0 +1295,1296,9072,0.3714480797312501,3.5605555555556,19,2692,0 +1296,1297,9069,0.19332372237792,3.4402777777778,16,2502,0 +1297,1298,9089,0.06345811641554701,3.35,28,2510,0 +1298,1299,9027,0.2267121559473,3.3469444444444,24,2663,0 +1299,1300,8969,0.053072279964629,3.2708333333333,35,3575,0 +1300,1301,9073,0.13336345197744,3.2519444444444,49,2586,0 +1301,1302,8957,0.1252855094715,2.7311111111111,106,2908,0 +1302,1303,9126,0.096211952864224,2.3875,80,3530,0 +1303,1304,9122,0.096524467517755,2.0847222222222,90,2776,0 +1304,1305,9231,0.08924770147957402,2.0975,169,2962,0 +1305,1306,9368,0.11889606284162,2.1763888888889,98,3441,0 +1306,1307,9458,0.031429841710104,2.2327777777777995,92,4376,0 +1307,1308,9463,0.0,2.2725,91,3857,0 +1308,1309,9356,0.036512411627868,2.3202777777778,99,4685,0 +1309,1310,9340,0.0,2.5425,90,4585,0 +1310,1311,9340,0.0,2.5986111111111,126,3542,0 +1311,1312,9276,0.0,2.6319444444444,102,3370,0 +1312,1313,9611,0.10106696361212,2.5836111111111,132,3515,0 +1313,1314,9532,0.14854949043035,2.675,88,3793,0 +1314,1315,9156,0.08612162048398897,2.8522222222222,135,2954,0 +1315,1316,9222,0.16494200410492002,3.1302777777778,114,2627,0 +1316,1317,9282,0.28637713141253,3.4805555555556,35,2550,0 +1317,1318,9573,0.13206535647488,3.5994444444444,24,2480,0 +1318,1319,9333,0.27364025607799,3.5847222222222,44,2521,0 +1319,1320,9987,0.38382339961227,3.4963888888889,26,2860,0 +1320,1321,10133,0.08426242877623301,3.3825,37,3675,0 +1321,1322,10010,0.3290413568025901,3.2694444444444,45,2704,0 +1322,1323,10028,0.22632868808708,3.2322222222222,42,3121,0 
+1323,1324,9984,0.17914189971361,3.1936111111111005,47,2603,0 +1324,1325,10041,0.30046815361859003,3.0536111111111004,34,3984,0 +1325,1326,10072,0.22650915594248,2.7819444444444,56,2537,0 +1326,1327,10025,0.0,2.4152777777778,87,3349,0 +1327,1328,10116,0.1223093269317,2.1569444444444,74,3958,0 +1328,1329,10232,0.1696074188221,2.1125,90,4243,0 +1329,1330,10516,0.0,2.1833333333333003,79,4159,0 +1330,1331,10449,0.028193633007367,2.205,97,5637,0 +1331,1332,10598,0.0,2.1697222222222,90,8142,0 +1332,1333,10337,0.0,2.3075,77,5713,0 +1333,1334,10469,0.097305232437507,2.4575,101,3668,0 +1334,1335,10426,0.11905908868379,2.6077777777777995,74,4307,0 +1335,1336,10531,0.11660374103282,2.6275,439,4354,0 +1336,1337,10875,0.060474297756584014,2.6144444444444,79,4262,0 +1337,1338,10494,0.22568442027805,2.6477777777777995,165,3446,0 +1338,1339,10195,0.14077736537045002,2.8594444444444003,139,2677,0 +1339,1340,9918,0.1924574892026,3.2675,56,4450,0 +1340,1341,9889,0.18922597300629,3.5136111111111004,102,3044,0 +1341,1342,9947,0.041593949118095004,3.5725,101,3428,0 +1342,1343,9977,0.2502095174271,3.6863888888889,41,2845,0 +1343,1344,10835,0.18663972932643,3.5636111111111,94,2781,0 +1344,1345,10765,0.07351854082400297,3.4127777777778,116,2743,0 +1345,1346,10656,0.081949111399618,3.295,94,4470,0 +1346,1347,10485,0.20148511394009,3.2666666666667004,89,2596,0 +1347,1348,10681,0.11515101921294,3.1933333333333,141,3249,0 +1348,1349,10852,0.07797276382811,3.0688888888889,167,2529,0 +1349,1350,10728,0.07244862879413201,2.8102777777778,148,2452,0 +1350,1351,10874,0.07310929970435699,2.42,105,2934,0 +1351,1352,10964,0.066868365737218,2.1358333333333,210,3159,0 +1352,1353,10984,0.05788512501593701,1.9916666666667,145,3974,0 +1353,1354,11055,0.09727414207464803,2.0947222222222,136,4305,0 +1354,1355,11233,0.033270317741558,2.1591666666667,126,5012,0 +1355,1356,11161,0.0,2.2377777777778,157,4455,0 +1356,1357,10966,0.038270957919533,2.2511111111111,105,4108,0 +1357,1358,11193,0.08728058888363299,2.4208333333333,114,4339,0 +1358,1359,11167,0.10536774813238,2.5241666666667,104,5056,0 +1359,1360,11367,0.1233991317089,2.5794444444444,69,5573,0 +1360,1361,51251,0.042565915766552,2.5936111111111,75,3366,1 +1361,1362,17953,0.23147422367229,2.6830555555556,73,2559,1 +1362,1363,170029,0.08983405162538903,2.8188888888889,74,1999,1 +1363,1364,10955,0.07464756469365201,2.9513888888888995,126,1993,0 +1364,1365,10984,0.099244104918934,3.2830555555556,67,1913,0 +1365,1366,10964,0.11535172009194,3.4819444444444,32,1760,0 +1366,1367,10980,0.21774881707852,3.5886111111111005,38,1890,0 +1367,1368,10852,0.1305066423559,3.4836111111111,34,2469,0 +1368,1369,10786,0.10054853030204,3.3955555555556,36,2133,0 +1369,1370,10841,0.02468393737575,3.2847222222222,26,3359,0 +1370,1371,10762,0.10018007414459,3.2383333333332995,74,3783,0 +1371,1372,10419,0.12522619841308,3.2188888888889,85,1809,0 +1372,1373,10467,0.11781887197077,2.9483333333333,67,2143,0 +1373,1374,10502,0.13417256350298,2.5855555555556,84,2567,0 +1374,1375,10519,0.07474686582090599,2.3005555555556003,1630,2176,0 +1375,1376,10579,0.13570963056519,2.0855555555556,1435,1929,0 +1376,1377,10502,0.076431907457478,1.9027777777778,857,2244,0 +1377,1378,10661,0.0,1.9411111111111,31,1810,0 +1378,1379,10818,0.1936428046839,2.0444444444444,500,2088,0 +1379,1380,10918,0.052826773889684014,2.1363888888889,53,2371,0 +1380,1381,10871,0.0,2.22,61,1843,0 +1381,1382,10796,0.054466597481213,2.3530555555556,158,2668,0 +1382,1383,10774,0.057459020289436,2.545,184,2309,0 
+1383,1384,10898,0.28750562005936,2.6202777777778,91,1998,0 +1384,1385,11442,0.075538554674309,2.6847222222222,60,2480,0 +1385,1386,11113,0.08112608570492501,2.6591666666667004,107,2147,0 +1386,1387,10888,0.21563803296368,2.7863888888888995,5157,1802,0 +1387,1388,10894,0.095725002305685,3.0269444444444003,28,1789,0 +1388,1389,10888,0.17516056892320994,3.3227777777778,24,1999,0 +1389,1390,10896,0.32902836018586,3.6097222222222,21,2142,0 +1390,1391,10800,0.10216065221678,3.6805555555556,12,1904,0 +1391,1392,11000,0.19741931250852,3.6075,24,1876,0 +1392,1393,10985,0.10149107903671,3.4091666666667004,17,2434,0 +1393,1394,11017,0.17479255893624,3.3666666666667004,48,2472,0 +1394,1395,10863,0.034385029573777,3.3158333333333,41,1744,0 +1395,1396,10875,0.21988771218053,3.1622222222222,1088,2404,0 +1396,1397,10987,0.10149107903671,3.1086111111111,68,1971,0 +1397,1398,10778,0.10269981175445,2.6552777777778,2575,1713,0 +1398,1399,10957,0.11258759940039,2.2730555555556,4688,1765,0 +1399,1400,10832,0.13022351806001,2.0591666666667,477,3156,0 diff --git a/datasets/anomaly_reserve/yahoo_sub_5/SCORE/problem_TEST/dataSplits.csv b/datasets/anomaly_reserve/yahoo_sub_5/SCORE/problem_TEST/dataSplits.csv new file mode 100644 index 0000000..c72d454 --- /dev/null +++ b/datasets/anomaly_reserve/yahoo_sub_5/SCORE/problem_TEST/dataSplits.csv @@ -0,0 +1,1261 @@ +d3mIndex,type,repeat,fold +1260,TEST,0,0 +1261,TEST,0,0 +1262,TEST,0,0 +1263,TEST,0,0 +1264,TEST,0,0 +1265,TEST,0,0 +1266,TEST,0,0 +1267,TEST,0,0 +1268,TEST,0,0 +1269,TEST,0,0 +1270,TEST,0,0 +1271,TEST,0,0 +1272,TEST,0,0 +1273,TEST,0,0 +1274,TEST,0,0 +1275,TEST,0,0 +1276,TEST,0,0 +1277,TEST,0,0 +1278,TEST,0,0 +1279,TEST,0,0 +1280,TEST,0,0 +1281,TEST,0,0 +1282,TEST,0,0 +1283,TEST,0,0 +1284,TEST,0,0 +1285,TEST,0,0 +1286,TEST,0,0 +1287,TEST,0,0 +1288,TEST,0,0 +1289,TEST,0,0 +1290,TEST,0,0 +1291,TEST,0,0 +1292,TEST,0,0 +1293,TEST,0,0 +1294,TEST,0,0 +1295,TEST,0,0 +1296,TEST,0,0 +1297,TEST,0,0 +1298,TEST,0,0 +1299,TEST,0,0 +1300,TEST,0,0 +1301,TEST,0,0 +1302,TEST,0,0 +1303,TEST,0,0 +1304,TEST,0,0 +1305,TEST,0,0 +1306,TEST,0,0 +1307,TEST,0,0 +1308,TEST,0,0 +1309,TEST,0,0 +1310,TEST,0,0 +1311,TEST,0,0 +1312,TEST,0,0 +1313,TEST,0,0 +1314,TEST,0,0 +1315,TEST,0,0 +1316,TEST,0,0 +1317,TEST,0,0 +1318,TEST,0,0 +1319,TEST,0,0 +1320,TEST,0,0 +1321,TEST,0,0 +1322,TEST,0,0 +1323,TEST,0,0 +1324,TEST,0,0 +1325,TEST,0,0 +1326,TEST,0,0 +1327,TEST,0,0 +1328,TEST,0,0 +1329,TEST,0,0 +1330,TEST,0,0 +1331,TEST,0,0 +1332,TEST,0,0 +1333,TEST,0,0 +1334,TEST,0,0 +1335,TEST,0,0 +1336,TEST,0,0 +1337,TEST,0,0 +1338,TEST,0,0 +1339,TEST,0,0 +1340,TEST,0,0 +1341,TEST,0,0 +1342,TEST,0,0 +1343,TEST,0,0 +1344,TEST,0,0 +1345,TEST,0,0 +1346,TEST,0,0 +1347,TEST,0,0 +1348,TEST,0,0 +1349,TEST,0,0 +1350,TEST,0,0 +1351,TEST,0,0 +1352,TEST,0,0 +1353,TEST,0,0 +1354,TEST,0,0 +1355,TEST,0,0 +1356,TEST,0,0 +1357,TEST,0,0 +1358,TEST,0,0 +1359,TEST,0,0 +1360,TEST,0,0 +1361,TEST,0,0 +1362,TEST,0,0 +1363,TEST,0,0 +1364,TEST,0,0 +1365,TEST,0,0 +1366,TEST,0,0 +1367,TEST,0,0 +1368,TEST,0,0 +1369,TEST,0,0 +1370,TEST,0,0 +1371,TEST,0,0 +1372,TEST,0,0 +1373,TEST,0,0 +1374,TEST,0,0 +1375,TEST,0,0 +1376,TEST,0,0 +1377,TEST,0,0 +1378,TEST,0,0 +1379,TEST,0,0 +1380,TEST,0,0 +1381,TEST,0,0 +1382,TEST,0,0 +1383,TEST,0,0 +1384,TEST,0,0 +1385,TEST,0,0 +1386,TEST,0,0 +1387,TEST,0,0 +1388,TEST,0,0 +1389,TEST,0,0 +1390,TEST,0,0 +1391,TEST,0,0 +1392,TEST,0,0 +1393,TEST,0,0 +1394,TEST,0,0 +1395,TEST,0,0 +1396,TEST,0,0 +1397,TEST,0,0 +1398,TEST,0,0 +1399,TEST,0,0 +1400,TEST,0,0 +1401,TEST,0,0 +1402,TEST,0,0 +1403,TEST,0,0 
+1404,TEST,0,0 +1405,TEST,0,0 +1406,TEST,0,0 +1407,TEST,0,0 +1408,TEST,0,0 +1409,TEST,0,0 +1410,TEST,0,0 +1411,TEST,0,0 +1412,TEST,0,0 +1413,TEST,0,0 +1414,TEST,0,0 +1415,TEST,0,0 +1416,TEST,0,0 +1417,TEST,0,0 +1418,TEST,0,0 +1419,TEST,0,0 +1420,TEST,0,0 +1421,TEST,0,0 +1422,TEST,0,0 +1423,TEST,0,0 +1424,TEST,0,0 +1425,TEST,0,0 +1426,TEST,0,0 +1427,TEST,0,0 +1428,TEST,0,0 +1429,TEST,0,0 +1430,TEST,0,0 +1431,TEST,0,0 +1432,TEST,0,0 +1433,TEST,0,0 +1434,TEST,0,0 +1435,TEST,0,0 +1436,TEST,0,0 +1437,TEST,0,0 +1438,TEST,0,0 +1439,TEST,0,0 +1440,TEST,0,0 +1441,TEST,0,0 +1442,TEST,0,0 +1443,TEST,0,0 +1444,TEST,0,0 +1445,TEST,0,0 +1446,TEST,0,0 +1447,TEST,0,0 +1448,TEST,0,0 +1449,TEST,0,0 +1450,TEST,0,0 +1451,TEST,0,0 +1452,TEST,0,0 +1453,TEST,0,0 +1454,TEST,0,0 +1455,TEST,0,0 +1456,TEST,0,0 +1457,TEST,0,0 +1458,TEST,0,0 +1459,TEST,0,0 +1460,TEST,0,0 +1461,TEST,0,0 +1462,TEST,0,0 +1463,TEST,0,0 +1464,TEST,0,0 +1465,TEST,0,0 +1466,TEST,0,0 +1467,TEST,0,0 +1468,TEST,0,0 +1469,TEST,0,0 +1470,TEST,0,0 +1471,TEST,0,0 +1472,TEST,0,0 +1473,TEST,0,0 +1474,TEST,0,0 +1475,TEST,0,0 +1476,TEST,0,0 +1477,TEST,0,0 +1478,TEST,0,0 +1479,TEST,0,0 +1480,TEST,0,0 +1481,TEST,0,0 +1482,TEST,0,0 +1483,TEST,0,0 +1484,TEST,0,0 +1485,TEST,0,0 +1486,TEST,0,0 +1487,TEST,0,0 +1488,TEST,0,0 +1489,TEST,0,0 +1490,TEST,0,0 +1491,TEST,0,0 +1492,TEST,0,0 +1493,TEST,0,0 +1494,TEST,0,0 +1495,TEST,0,0 +1496,TEST,0,0 +1497,TEST,0,0 +1498,TEST,0,0 +1499,TEST,0,0 +1500,TEST,0,0 +1501,TEST,0,0 +1502,TEST,0,0 +1503,TEST,0,0 +1504,TEST,0,0 +1505,TEST,0,0 +1506,TEST,0,0 +1507,TEST,0,0 +1508,TEST,0,0 +1509,TEST,0,0 +1510,TEST,0,0 +1511,TEST,0,0 +1512,TEST,0,0 +1513,TEST,0,0 +1514,TEST,0,0 +1515,TEST,0,0 +1516,TEST,0,0 +1517,TEST,0,0 +1518,TEST,0,0 +1519,TEST,0,0 +1520,TEST,0,0 +1521,TEST,0,0 +1522,TEST,0,0 +1523,TEST,0,0 +1524,TEST,0,0 +1525,TEST,0,0 +1526,TEST,0,0 +1527,TEST,0,0 +1528,TEST,0,0 +1529,TEST,0,0 +1530,TEST,0,0 +1531,TEST,0,0 +1532,TEST,0,0 +1533,TEST,0,0 +1534,TEST,0,0 +1535,TEST,0,0 +1536,TEST,0,0 +1537,TEST,0,0 +1538,TEST,0,0 +1539,TEST,0,0 +1540,TEST,0,0 +1541,TEST,0,0 +1542,TEST,0,0 +1543,TEST,0,0 +1544,TEST,0,0 +1545,TEST,0,0 +1546,TEST,0,0 +1547,TEST,0,0 +1548,TEST,0,0 +1549,TEST,0,0 +1550,TEST,0,0 +1551,TEST,0,0 +1552,TEST,0,0 +1553,TEST,0,0 +1554,TEST,0,0 +1555,TEST,0,0 +1556,TEST,0,0 +1557,TEST,0,0 +1558,TEST,0,0 +1559,TEST,0,0 +1560,TEST,0,0 +1561,TEST,0,0 +1562,TEST,0,0 +1563,TEST,0,0 +1564,TEST,0,0 +1565,TEST,0,0 +1566,TEST,0,0 +1567,TEST,0,0 +1568,TEST,0,0 +1569,TEST,0,0 +1570,TEST,0,0 +1571,TEST,0,0 +1572,TEST,0,0 +1573,TEST,0,0 +1574,TEST,0,0 +1575,TEST,0,0 +1576,TEST,0,0 +1577,TEST,0,0 +1578,TEST,0,0 +1579,TEST,0,0 +1580,TEST,0,0 +1581,TEST,0,0 +1582,TEST,0,0 +1583,TEST,0,0 +1584,TEST,0,0 +1585,TEST,0,0 +1586,TEST,0,0 +1587,TEST,0,0 +1588,TEST,0,0 +1589,TEST,0,0 +1590,TEST,0,0 +1591,TEST,0,0 +1592,TEST,0,0 +1593,TEST,0,0 +1594,TEST,0,0 +1595,TEST,0,0 +1596,TEST,0,0 +1597,TEST,0,0 +1598,TEST,0,0 +1599,TEST,0,0 +1600,TEST,0,0 +1601,TEST,0,0 +1602,TEST,0,0 +1603,TEST,0,0 +1604,TEST,0,0 +1605,TEST,0,0 +1606,TEST,0,0 +1607,TEST,0,0 +1608,TEST,0,0 +1609,TEST,0,0 +1610,TEST,0,0 +1611,TEST,0,0 +1612,TEST,0,0 +1613,TEST,0,0 +1614,TEST,0,0 +1615,TEST,0,0 +1616,TEST,0,0 +1617,TEST,0,0 +1618,TEST,0,0 +1619,TEST,0,0 +1620,TEST,0,0 +1621,TEST,0,0 +1622,TEST,0,0 +1623,TEST,0,0 +1624,TEST,0,0 +1625,TEST,0,0 +1626,TEST,0,0 +1627,TEST,0,0 +1628,TEST,0,0 +1629,TEST,0,0 +1630,TEST,0,0 +1631,TEST,0,0 +1632,TEST,0,0 +1633,TEST,0,0 +1634,TEST,0,0 +1635,TEST,0,0 +1636,TEST,0,0 +1637,TEST,0,0 +1638,TEST,0,0 +1639,TEST,0,0 +1640,TEST,0,0 
+1641,TEST,0,0 +1642,TEST,0,0 +1643,TEST,0,0 +1644,TEST,0,0 +1645,TEST,0,0 +1646,TEST,0,0 +1647,TEST,0,0 +1648,TEST,0,0 +1649,TEST,0,0 +1650,TEST,0,0 +1651,TEST,0,0 +1652,TEST,0,0 +1653,TEST,0,0 +1654,TEST,0,0 +1655,TEST,0,0 +1656,TEST,0,0 +1657,TEST,0,0 +1658,TEST,0,0 +1659,TEST,0,0 +1660,TEST,0,0 +1661,TEST,0,0 +1662,TEST,0,0 +1663,TEST,0,0 +1664,TEST,0,0 +1665,TEST,0,0 +1666,TEST,0,0 +1667,TEST,0,0 +1668,TEST,0,0 +1669,TEST,0,0 +1670,TEST,0,0 +1671,TEST,0,0 +1672,TEST,0,0 +1673,TEST,0,0 +1674,TEST,0,0 +1675,TEST,0,0 +1676,TEST,0,0 +1677,TEST,0,0 +1678,TEST,0,0 +1679,TEST,0,0 +1680,TEST,0,0 +1681,TEST,0,0 +1682,TEST,0,0 +1683,TEST,0,0 +1684,TEST,0,0 +1685,TEST,0,0 +1686,TEST,0,0 +1687,TEST,0,0 +1688,TEST,0,0 +1689,TEST,0,0 +1690,TEST,0,0 +1691,TEST,0,0 +1692,TEST,0,0 +1693,TEST,0,0 +1694,TEST,0,0 +1695,TEST,0,0 +1696,TEST,0,0 +1697,TEST,0,0 +1698,TEST,0,0 +1699,TEST,0,0 +1700,TEST,0,0 +1701,TEST,0,0 +1702,TEST,0,0 +1703,TEST,0,0 +1704,TEST,0,0 +1705,TEST,0,0 +1706,TEST,0,0 +1707,TEST,0,0 +1708,TEST,0,0 +1709,TEST,0,0 +1710,TEST,0,0 +1711,TEST,0,0 +1712,TEST,0,0 +1713,TEST,0,0 +1714,TEST,0,0 +1715,TEST,0,0 +1716,TEST,0,0 +1717,TEST,0,0 +1718,TEST,0,0 +1719,TEST,0,0 +1720,TEST,0,0 +1721,TEST,0,0 +1722,TEST,0,0 +1723,TEST,0,0 +1724,TEST,0,0 +1725,TEST,0,0 +1726,TEST,0,0 +1727,TEST,0,0 +1728,TEST,0,0 +1729,TEST,0,0 +1730,TEST,0,0 +1731,TEST,0,0 +1732,TEST,0,0 +1733,TEST,0,0 +1734,TEST,0,0 +1735,TEST,0,0 +1736,TEST,0,0 +1737,TEST,0,0 +1738,TEST,0,0 +1739,TEST,0,0 +1740,TEST,0,0 +1741,TEST,0,0 +1742,TEST,0,0 +1743,TEST,0,0 +1744,TEST,0,0 +1745,TEST,0,0 +1746,TEST,0,0 +1747,TEST,0,0 +1748,TEST,0,0 +1749,TEST,0,0 +1750,TEST,0,0 +1751,TEST,0,0 +1752,TEST,0,0 +1753,TEST,0,0 +1754,TEST,0,0 +1755,TEST,0,0 +1756,TEST,0,0 +1757,TEST,0,0 +1758,TEST,0,0 +1759,TEST,0,0 +1760,TEST,0,0 +1761,TEST,0,0 +1762,TEST,0,0 +1763,TEST,0,0 +1764,TEST,0,0 +1765,TEST,0,0 +1766,TEST,0,0 +1767,TEST,0,0 +1768,TEST,0,0 +1769,TEST,0,0 +1770,TEST,0,0 +1771,TEST,0,0 +1772,TEST,0,0 +1773,TEST,0,0 +1774,TEST,0,0 +1775,TEST,0,0 +1776,TEST,0,0 +1777,TEST,0,0 +1778,TEST,0,0 +1779,TEST,0,0 +1780,TEST,0,0 +1781,TEST,0,0 +1782,TEST,0,0 +1783,TEST,0,0 +1784,TEST,0,0 +1785,TEST,0,0 +1786,TEST,0,0 +1787,TEST,0,0 +1788,TEST,0,0 +1789,TEST,0,0 +1790,TEST,0,0 +1791,TEST,0,0 +1792,TEST,0,0 +1793,TEST,0,0 +1794,TEST,0,0 +1795,TEST,0,0 +1796,TEST,0,0 +1797,TEST,0,0 +1798,TEST,0,0 +1799,TEST,0,0 +1800,TEST,0,0 +1801,TEST,0,0 +1802,TEST,0,0 +1803,TEST,0,0 +1804,TEST,0,0 +1805,TEST,0,0 +1806,TEST,0,0 +1807,TEST,0,0 +1808,TEST,0,0 +1809,TEST,0,0 +1810,TEST,0,0 +1811,TEST,0,0 +1812,TEST,0,0 +1813,TEST,0,0 +1814,TEST,0,0 +1815,TEST,0,0 +1816,TEST,0,0 +1817,TEST,0,0 +1818,TEST,0,0 +1819,TEST,0,0 +1820,TEST,0,0 +1821,TEST,0,0 +1822,TEST,0,0 +1823,TEST,0,0 +1824,TEST,0,0 +1825,TEST,0,0 +1826,TEST,0,0 +1827,TEST,0,0 +1828,TEST,0,0 +1829,TEST,0,0 +1830,TEST,0,0 +1831,TEST,0,0 +1832,TEST,0,0 +1833,TEST,0,0 +1834,TEST,0,0 +1835,TEST,0,0 +1836,TEST,0,0 +1837,TEST,0,0 +1838,TEST,0,0 +1839,TEST,0,0 +1840,TEST,0,0 +1841,TEST,0,0 +1842,TEST,0,0 +1843,TEST,0,0 +1844,TEST,0,0 +1845,TEST,0,0 +1846,TEST,0,0 +1847,TEST,0,0 +1848,TEST,0,0 +1849,TEST,0,0 +1850,TEST,0,0 +1851,TEST,0,0 +1852,TEST,0,0 +1853,TEST,0,0 +1854,TEST,0,0 +1855,TEST,0,0 +1856,TEST,0,0 +1857,TEST,0,0 +1858,TEST,0,0 +1859,TEST,0,0 +1860,TEST,0,0 +1861,TEST,0,0 +1862,TEST,0,0 +1863,TEST,0,0 +1864,TEST,0,0 +1865,TEST,0,0 +1866,TEST,0,0 +1867,TEST,0,0 +1868,TEST,0,0 +1869,TEST,0,0 +1870,TEST,0,0 +1871,TEST,0,0 +1872,TEST,0,0 +1873,TEST,0,0 +1874,TEST,0,0 +1875,TEST,0,0 +1876,TEST,0,0 +1877,TEST,0,0 
+1878,TEST,0,0 +1879,TEST,0,0 +1880,TEST,0,0 +1881,TEST,0,0 +1882,TEST,0,0 +1883,TEST,0,0 +1884,TEST,0,0 +1885,TEST,0,0 +1886,TEST,0,0 +1887,TEST,0,0 +1888,TEST,0,0 +1889,TEST,0,0 +1890,TEST,0,0 +1891,TEST,0,0 +1892,TEST,0,0 +1893,TEST,0,0 +1894,TEST,0,0 +1895,TEST,0,0 +1896,TEST,0,0 +1897,TEST,0,0 +1898,TEST,0,0 +1899,TEST,0,0 +1900,TEST,0,0 +1901,TEST,0,0 +1902,TEST,0,0 +1903,TEST,0,0 +1904,TEST,0,0 +1905,TEST,0,0 +1906,TEST,0,0 +1907,TEST,0,0 +1908,TEST,0,0 +1909,TEST,0,0 +1910,TEST,0,0 +1911,TEST,0,0 +1912,TEST,0,0 +1913,TEST,0,0 +1914,TEST,0,0 +1915,TEST,0,0 +1916,TEST,0,0 +1917,TEST,0,0 +1918,TEST,0,0 +1919,TEST,0,0 +1920,TEST,0,0 +1921,TEST,0,0 +1922,TEST,0,0 +1923,TEST,0,0 +1924,TEST,0,0 +1925,TEST,0,0 +1926,TEST,0,0 +1927,TEST,0,0 +1928,TEST,0,0 +1929,TEST,0,0 +1930,TEST,0,0 +1931,TEST,0,0 +1932,TEST,0,0 +1933,TEST,0,0 +1934,TEST,0,0 +1935,TEST,0,0 +1936,TEST,0,0 +1937,TEST,0,0 +1938,TEST,0,0 +1939,TEST,0,0 +1940,TEST,0,0 +1941,TEST,0,0 +1942,TEST,0,0 +1943,TEST,0,0 +1944,TEST,0,0 +1945,TEST,0,0 +1946,TEST,0,0 +1947,TEST,0,0 +1948,TEST,0,0 +1949,TEST,0,0 +1950,TEST,0,0 +1951,TEST,0,0 +1952,TEST,0,0 +1953,TEST,0,0 +1954,TEST,0,0 +1955,TEST,0,0 +1956,TEST,0,0 +1957,TEST,0,0 +1958,TEST,0,0 +1959,TEST,0,0 +1960,TEST,0,0 +1961,TEST,0,0 +1962,TEST,0,0 +1963,TEST,0,0 +1964,TEST,0,0 +1965,TEST,0,0 +1966,TEST,0,0 +1967,TEST,0,0 +1968,TEST,0,0 +1969,TEST,0,0 +1970,TEST,0,0 +1971,TEST,0,0 +1972,TEST,0,0 +1973,TEST,0,0 +1974,TEST,0,0 +1975,TEST,0,0 +1976,TEST,0,0 +1977,TEST,0,0 +1978,TEST,0,0 +1979,TEST,0,0 +1980,TEST,0,0 +1981,TEST,0,0 +1982,TEST,0,0 +1983,TEST,0,0 +1984,TEST,0,0 +1985,TEST,0,0 +1986,TEST,0,0 +1987,TEST,0,0 +1988,TEST,0,0 +1989,TEST,0,0 +1990,TEST,0,0 +1991,TEST,0,0 +1992,TEST,0,0 +1993,TEST,0,0 +1994,TEST,0,0 +1995,TEST,0,0 +1996,TEST,0,0 +1997,TEST,0,0 +1998,TEST,0,0 +1999,TEST,0,0 +2000,TEST,0,0 +2001,TEST,0,0 +2002,TEST,0,0 +2003,TEST,0,0 +2004,TEST,0,0 +2005,TEST,0,0 +2006,TEST,0,0 +2007,TEST,0,0 +2008,TEST,0,0 +2009,TEST,0,0 +2010,TEST,0,0 +2011,TEST,0,0 +2012,TEST,0,0 +2013,TEST,0,0 +2014,TEST,0,0 +2015,TEST,0,0 +2016,TEST,0,0 +2017,TEST,0,0 +2018,TEST,0,0 +2019,TEST,0,0 +2020,TEST,0,0 +2021,TEST,0,0 +2022,TEST,0,0 +2023,TEST,0,0 +2024,TEST,0,0 +2025,TEST,0,0 +2026,TEST,0,0 +2027,TEST,0,0 +2028,TEST,0,0 +2029,TEST,0,0 +2030,TEST,0,0 +2031,TEST,0,0 +2032,TEST,0,0 +2033,TEST,0,0 +2034,TEST,0,0 +2035,TEST,0,0 +2036,TEST,0,0 +2037,TEST,0,0 +2038,TEST,0,0 +2039,TEST,0,0 +2040,TEST,0,0 +2041,TEST,0,0 +2042,TEST,0,0 +2043,TEST,0,0 +2044,TEST,0,0 +2045,TEST,0,0 +2046,TEST,0,0 +2047,TEST,0,0 +2048,TEST,0,0 +2049,TEST,0,0 +2050,TEST,0,0 +2051,TEST,0,0 +2052,TEST,0,0 +2053,TEST,0,0 +2054,TEST,0,0 +2055,TEST,0,0 +2056,TEST,0,0 +2057,TEST,0,0 +2058,TEST,0,0 +2059,TEST,0,0 +2060,TEST,0,0 +2061,TEST,0,0 +2062,TEST,0,0 +2063,TEST,0,0 +2064,TEST,0,0 +2065,TEST,0,0 +2066,TEST,0,0 +2067,TEST,0,0 +2068,TEST,0,0 +2069,TEST,0,0 +2070,TEST,0,0 +2071,TEST,0,0 +2072,TEST,0,0 +2073,TEST,0,0 +2074,TEST,0,0 +2075,TEST,0,0 +2076,TEST,0,0 +2077,TEST,0,0 +2078,TEST,0,0 +2079,TEST,0,0 +2080,TEST,0,0 +2081,TEST,0,0 +2082,TEST,0,0 +2083,TEST,0,0 +2084,TEST,0,0 +2085,TEST,0,0 +2086,TEST,0,0 +2087,TEST,0,0 +2088,TEST,0,0 +2089,TEST,0,0 +2090,TEST,0,0 +2091,TEST,0,0 +2092,TEST,0,0 +2093,TEST,0,0 +2094,TEST,0,0 +2095,TEST,0,0 +2096,TEST,0,0 +2097,TEST,0,0 +2098,TEST,0,0 +2099,TEST,0,0 +2100,TEST,0,0 +2101,TEST,0,0 +2102,TEST,0,0 +2103,TEST,0,0 +2104,TEST,0,0 +2105,TEST,0,0 +2106,TEST,0,0 +2107,TEST,0,0 +2108,TEST,0,0 +2109,TEST,0,0 +2110,TEST,0,0 +2111,TEST,0,0 +2112,TEST,0,0 +2113,TEST,0,0 +2114,TEST,0,0 
+2115,TEST,0,0 +2116,TEST,0,0 +2117,TEST,0,0 +2118,TEST,0,0 +2119,TEST,0,0 +2120,TEST,0,0 +2121,TEST,0,0 +2122,TEST,0,0 +2123,TEST,0,0 +2124,TEST,0,0 +2125,TEST,0,0 +2126,TEST,0,0 +2127,TEST,0,0 +2128,TEST,0,0 +2129,TEST,0,0 +2130,TEST,0,0 +2131,TEST,0,0 +2132,TEST,0,0 +2133,TEST,0,0 +2134,TEST,0,0 +2135,TEST,0,0 +2136,TEST,0,0 +2137,TEST,0,0 +2138,TEST,0,0 +2139,TEST,0,0 +2140,TEST,0,0 +2141,TEST,0,0 +2142,TEST,0,0 +2143,TEST,0,0 +2144,TEST,0,0 +2145,TEST,0,0 +2146,TEST,0,0 +2147,TEST,0,0 +2148,TEST,0,0 +2149,TEST,0,0 +2150,TEST,0,0 +2151,TEST,0,0 +2152,TEST,0,0 +2153,TEST,0,0 +2154,TEST,0,0 +2155,TEST,0,0 +2156,TEST,0,0 +2157,TEST,0,0 +2158,TEST,0,0 +2159,TEST,0,0 +2160,TEST,0,0 +2161,TEST,0,0 +2162,TEST,0,0 +2163,TEST,0,0 +2164,TEST,0,0 +2165,TEST,0,0 +2166,TEST,0,0 +2167,TEST,0,0 +2168,TEST,0,0 +2169,TEST,0,0 +2170,TEST,0,0 +2171,TEST,0,0 +2172,TEST,0,0 +2173,TEST,0,0 +2174,TEST,0,0 +2175,TEST,0,0 +2176,TEST,0,0 +2177,TEST,0,0 +2178,TEST,0,0 +2179,TEST,0,0 +2180,TEST,0,0 +2181,TEST,0,0 +2182,TEST,0,0 +2183,TEST,0,0 +2184,TEST,0,0 +2185,TEST,0,0 +2186,TEST,0,0 +2187,TEST,0,0 +2188,TEST,0,0 +2189,TEST,0,0 +2190,TEST,0,0 +2191,TEST,0,0 +2192,TEST,0,0 +2193,TEST,0,0 +2194,TEST,0,0 +2195,TEST,0,0 +2196,TEST,0,0 +2197,TEST,0,0 +2198,TEST,0,0 +2199,TEST,0,0 +2200,TEST,0,0 +2201,TEST,0,0 +2202,TEST,0,0 +2203,TEST,0,0 +2204,TEST,0,0 +2205,TEST,0,0 +2206,TEST,0,0 +2207,TEST,0,0 +2208,TEST,0,0 +2209,TEST,0,0 +2210,TEST,0,0 +2211,TEST,0,0 +2212,TEST,0,0 +2213,TEST,0,0 +2214,TEST,0,0 +2215,TEST,0,0 +2216,TEST,0,0 +2217,TEST,0,0 +2218,TEST,0,0 +2219,TEST,0,0 +2220,TEST,0,0 +2221,TEST,0,0 +2222,TEST,0,0 +2223,TEST,0,0 +2224,TEST,0,0 +2225,TEST,0,0 +2226,TEST,0,0 +2227,TEST,0,0 +2228,TEST,0,0 +2229,TEST,0,0 +2230,TEST,0,0 +2231,TEST,0,0 +2232,TEST,0,0 +2233,TEST,0,0 +2234,TEST,0,0 +2235,TEST,0,0 +2236,TEST,0,0 +2237,TEST,0,0 +2238,TEST,0,0 +2239,TEST,0,0 +2240,TEST,0,0 +2241,TEST,0,0 +2242,TEST,0,0 +2243,TEST,0,0 +2244,TEST,0,0 +2245,TEST,0,0 +2246,TEST,0,0 +2247,TEST,0,0 +2248,TEST,0,0 +2249,TEST,0,0 +2250,TEST,0,0 +2251,TEST,0,0 +2252,TEST,0,0 +2253,TEST,0,0 +2254,TEST,0,0 +2255,TEST,0,0 +2256,TEST,0,0 +2257,TEST,0,0 +2258,TEST,0,0 +2259,TEST,0,0 +2260,TEST,0,0 +2261,TEST,0,0 +2262,TEST,0,0 +2263,TEST,0,0 +2264,TEST,0,0 +2265,TEST,0,0 +2266,TEST,0,0 +2267,TEST,0,0 +2268,TEST,0,0 +2269,TEST,0,0 +2270,TEST,0,0 +2271,TEST,0,0 +2272,TEST,0,0 +2273,TEST,0,0 +2274,TEST,0,0 +2275,TEST,0,0 +2276,TEST,0,0 +2277,TEST,0,0 +2278,TEST,0,0 +2279,TEST,0,0 +2280,TEST,0,0 +2281,TEST,0,0 +2282,TEST,0,0 +2283,TEST,0,0 +2284,TEST,0,0 +2285,TEST,0,0 +2286,TEST,0,0 +2287,TEST,0,0 +2288,TEST,0,0 +2289,TEST,0,0 +2290,TEST,0,0 +2291,TEST,0,0 +2292,TEST,0,0 +2293,TEST,0,0 +2294,TEST,0,0 +2295,TEST,0,0 +2296,TEST,0,0 +2297,TEST,0,0 +2298,TEST,0,0 +2299,TEST,0,0 +2300,TEST,0,0 +2301,TEST,0,0 +2302,TEST,0,0 +2303,TEST,0,0 +2304,TEST,0,0 +2305,TEST,0,0 +2306,TEST,0,0 +2307,TEST,0,0 +2308,TEST,0,0 +2309,TEST,0,0 +2310,TEST,0,0 +2311,TEST,0,0 +2312,TEST,0,0 +2313,TEST,0,0 +2314,TEST,0,0 +2315,TEST,0,0 +2316,TEST,0,0 +2317,TEST,0,0 +2318,TEST,0,0 +2319,TEST,0,0 +2320,TEST,0,0 +2321,TEST,0,0 +2322,TEST,0,0 +2323,TEST,0,0 +2324,TEST,0,0 +2325,TEST,0,0 +2326,TEST,0,0 +2327,TEST,0,0 +2328,TEST,0,0 +2329,TEST,0,0 +2330,TEST,0,0 +2331,TEST,0,0 +2332,TEST,0,0 +2333,TEST,0,0 +2334,TEST,0,0 +2335,TEST,0,0 +2336,TEST,0,0 +2337,TEST,0,0 +2338,TEST,0,0 +2339,TEST,0,0 +2340,TEST,0,0 +2341,TEST,0,0 +2342,TEST,0,0 +2343,TEST,0,0 +2344,TEST,0,0 +2345,TEST,0,0 +2346,TEST,0,0 +2347,TEST,0,0 +2348,TEST,0,0 +2349,TEST,0,0 +2350,TEST,0,0 +2351,TEST,0,0 
+2352,TEST,0,0 +2353,TEST,0,0 +2354,TEST,0,0 +2355,TEST,0,0 +2356,TEST,0,0 +2357,TEST,0,0 +2358,TEST,0,0 +2359,TEST,0,0 +2360,TEST,0,0 +2361,TEST,0,0 +2362,TEST,0,0 +2363,TEST,0,0 +2364,TEST,0,0 +2365,TEST,0,0 +2366,TEST,0,0 +2367,TEST,0,0 +2368,TEST,0,0 +2369,TEST,0,0 +2370,TEST,0,0 +2371,TEST,0,0 +2372,TEST,0,0 +2373,TEST,0,0 +2374,TEST,0,0 +2375,TEST,0,0 +2376,TEST,0,0 +2377,TEST,0,0 +2378,TEST,0,0 +2379,TEST,0,0 +2380,TEST,0,0 +2381,TEST,0,0 +2382,TEST,0,0 +2383,TEST,0,0 +2384,TEST,0,0 +2385,TEST,0,0 +2386,TEST,0,0 +2387,TEST,0,0 +2388,TEST,0,0 +2389,TEST,0,0 +2390,TEST,0,0 +2391,TEST,0,0 +2392,TEST,0,0 +2393,TEST,0,0 +2394,TEST,0,0 +2395,TEST,0,0 +2396,TEST,0,0 +2397,TEST,0,0 +2398,TEST,0,0 +2399,TEST,0,0 +2400,TEST,0,0 +2401,TEST,0,0 +2402,TEST,0,0 +2403,TEST,0,0 +2404,TEST,0,0 +2405,TEST,0,0 +2406,TEST,0,0 +2407,TEST,0,0 +2408,TEST,0,0 +2409,TEST,0,0 +2410,TEST,0,0 +2411,TEST,0,0 +2412,TEST,0,0 +2413,TEST,0,0 +2414,TEST,0,0 +2415,TEST,0,0 +2416,TEST,0,0 +2417,TEST,0,0 +2418,TEST,0,0 +2419,TEST,0,0 +2420,TEST,0,0 +2421,TEST,0,0 +2422,TEST,0,0 +2423,TEST,0,0 +2424,TEST,0,0 +2425,TEST,0,0 +2426,TEST,0,0 +2427,TEST,0,0 +2428,TEST,0,0 +2429,TEST,0,0 +2430,TEST,0,0 +2431,TEST,0,0 +2432,TEST,0,0 +2433,TEST,0,0 +2434,TEST,0,0 +2435,TEST,0,0 +2436,TEST,0,0 +2437,TEST,0,0 +2438,TEST,0,0 +2439,TEST,0,0 +2440,TEST,0,0 +2441,TEST,0,0 +2442,TEST,0,0 +2443,TEST,0,0 +2444,TEST,0,0 +2445,TEST,0,0 +2446,TEST,0,0 +2447,TEST,0,0 +2448,TEST,0,0 +2449,TEST,0,0 +2450,TEST,0,0 +2451,TEST,0,0 +2452,TEST,0,0 +2453,TEST,0,0 +2454,TEST,0,0 +2455,TEST,0,0 +2456,TEST,0,0 +2457,TEST,0,0 +2458,TEST,0,0 +2459,TEST,0,0 +2460,TEST,0,0 +2461,TEST,0,0 +2462,TEST,0,0 +2463,TEST,0,0 +2464,TEST,0,0 +2465,TEST,0,0 +2466,TEST,0,0 +2467,TEST,0,0 +2468,TEST,0,0 +2469,TEST,0,0 +2470,TEST,0,0 +2471,TEST,0,0 +2472,TEST,0,0 +2473,TEST,0,0 +2474,TEST,0,0 +2475,TEST,0,0 +2476,TEST,0,0 +2477,TEST,0,0 +2478,TEST,0,0 +2479,TEST,0,0 +2480,TEST,0,0 +2481,TEST,0,0 +2482,TEST,0,0 +2483,TEST,0,0 +2484,TEST,0,0 +2485,TEST,0,0 +2486,TEST,0,0 +2487,TEST,0,0 +2488,TEST,0,0 +2489,TEST,0,0 +2490,TEST,0,0 +2491,TEST,0,0 +2492,TEST,0,0 +2493,TEST,0,0 +2494,TEST,0,0 +2495,TEST,0,0 +2496,TEST,0,0 +2497,TEST,0,0 +2498,TEST,0,0 +2499,TEST,0,0 +2500,TEST,0,0 +2501,TEST,0,0 +2502,TEST,0,0 +2503,TEST,0,0 +2504,TEST,0,0 +2505,TEST,0,0 +2506,TEST,0,0 +2507,TEST,0,0 +2508,TEST,0,0 +2509,TEST,0,0 +2510,TEST,0,0 +2511,TEST,0,0 +2512,TEST,0,0 +2513,TEST,0,0 +2514,TEST,0,0 +2515,TEST,0,0 +2516,TEST,0,0 +2517,TEST,0,0 +2518,TEST,0,0 +2519,TEST,0,0 diff --git a/datasets/anomaly_reserve/yahoo_sub_5/SCORE/problem_TEST/problemDoc.json b/datasets/anomaly_reserve/yahoo_sub_5/SCORE/problem_TEST/problemDoc.json new file mode 100644 index 0000000..417cb6b --- /dev/null +++ b/datasets/anomaly_reserve/yahoo_sub_5/SCORE/problem_TEST/problemDoc.json @@ -0,0 +1,65 @@ +{ + "about": { + "problemID": "yahoo_sub_5_problem", + "problemName": "yahoo_sub_5_problem", + "problemDescription": "Anomaly detection", + "problemVersion": "4.0.0", + "problemSchemaVersion": "4.0.0", + "taskKeywords": [ + "classification", + "binary", + "tabular" + ] + }, + "inputs": { + "data": [ + { + "datasetID": "yahoo_sub_5_dataset", + "targets": [ + { + "targetIndex": 0, + "resID": "learningData", + "colIndex": 7, + "colName": "ground_truth" + } + ] + } + ], + "dataSplits": { + "method": "holdOut", + "testSize": 0.2, + "stratified": true, + "numRepeats": 0, + "randomSeed": 42, + "splitsFile": "dataSplits.csv", + "datasetViewMaps": { + "train": [ + { + "from": "yahoo_sub_5_dataset", + "to": 
"yahoo_sub_5_dataset_TRAIN" + } + ], + "test": [ + { + "from": "yahoo_sub_5_dataset", + "to": "yahoo_sub_5_dataset_TEST" + } + ], + "score": [ + { + "from": "yahoo_sub_5_dataset", + "to": "yahoo_sub_5_dataset_SCORE" + } + ] + } + }, + "performanceMetrics": [ + { + "metric": "f1Macro" + } + ] + }, + "expectedOutputs": { + "predictionsFile": "predictions.csv" + } +} \ No newline at end of file diff --git a/datasets/anomaly_reserve/yahoo_sub_5/SCORE/targets.csv b/datasets/anomaly_reserve/yahoo_sub_5/SCORE/targets.csv new file mode 100644 index 0000000..e69de29 diff --git a/datasets/anomaly_reserve/yahoo_sub_5/TEST/dataset_TEST/datasetDoc.json b/datasets/anomaly_reserve/yahoo_sub_5/TEST/dataset_TEST/datasetDoc.json new file mode 100644 index 0000000..ff5dec4 --- /dev/null +++ b/datasets/anomaly_reserve/yahoo_sub_5/TEST/dataset_TEST/datasetDoc.json @@ -0,0 +1,95 @@ +{ + "about": { + "datasetID": "yahoo_sub_5_dataset_TEST", + "datasetName": "NULL", + "description": "Database of baseball players and play statistics, including 'Games_played', 'At_bats', 'Runs', 'Hits', 'Doubles', 'Triples', 'Home_runs', 'RBIs', 'Walks', 'Strikeouts', 'Batting_average', 'On_base_pct', 'Slugging_pct' and 'Fielding_ave'", + "citation": " @book{simonoff2003analyzing,title={Analyzing Categorical Data},author={Simonoff, J.S.},isbn={9780387007496},lccn={2003044946},series={Springer Texts in Statistics},url={https://books.google.com/books?id=G8wrifweAoC},year={2003},publisher={Springer New York}} ", + "license": " CC Public Domain Mark 1.0 ", + "source": "OpenML", + "sourceURI": "http://www.openml.org/d/185", + "approximateSize": "", + "datasetSchemaVersion": "4.0.0", + "redacted": false, + "datasetVersion": "4.0.0" + }, + "dataResources": [ + { + "resID": "learningData", + "resPath": "tables/learningData.csv", + "resType": "table", + "resFormat": { + "text/csv": [ + "csv" + ] + }, + "isCollection": false, + "columns": [ + { + "colIndex": 0, + "colName": "d3mIndex", + "colType": "integer", + "role": [ + "index" + ] + }, + { + "colIndex": 1, + "colName": "timestamp", + "colType": "integer", + "role": [ + "attribute" + ] + }, + { + "colIndex": 2, + "colName": "value_0", + "colType": "real", + "role": [ + "attribute" + ] + }, + { + "colIndex": 3, + "colName": "value_1", + "colType": "real", + "role": [ + "attribute" + ] + }, + { + "colIndex": 4, + "colName": "value_2", + "colType": "real", + "role": [ + "attribute" + ] + }, + { + "colIndex": 5, + "colName": "value_3", + "colType": "real", + "role": [ + "attribute" + ] + }, + { + "colIndex": 6, + "colName": "value_4", + "colType": "real", + "role": [ + "attribute" + ] + }, + { + "colIndex": 7, + "colName": "ground_truth", + "colType": "integer", + "role": [ + "suggestedTarget" + ] + } + ], + "columnsCount": 8 + } + ] +} \ No newline at end of file diff --git a/datasets/anomaly_reserve/yahoo_sub_5/TEST/dataset_TEST/tables/learningData.csv b/datasets/anomaly_reserve/yahoo_sub_5/TEST/dataset_TEST/tables/learningData.csv new file mode 100644 index 0000000..e3d5131 --- /dev/null +++ b/datasets/anomaly_reserve/yahoo_sub_5/TEST/dataset_TEST/tables/learningData.csv @@ -0,0 +1,141 @@ +d3mIndex,timestamp,value_0,value_1,value_2,value_3,value_4,ground_truth +1260,1261,7782,0.034280386319742985,2.5072222222222003,104,3119,0 +1261,1262,7829,0.039360296791109,2.5927777777778,82,3590,0 +1262,1263,7902,0.0,2.6894444444444,208,3893,0 +1263,1264,8039,0.038944065994356014,2.6291666666667,92,3264,0 +1264,1265,8350,0.18176011684739,2.6469444444444,53,3963,0 
+1265,1266,8142,0.18521047165852,2.7461111111111003,65,2757,0 +1266,1267,7886,0.13079770999921,2.9363888888889,62,2306,0 +1267,1268,7743,0.13310058077443,3.2797222222222,73,2549,0 +1268,1269,7707,0.054750658073534006,3.5194444444444,84,2212,0 +1269,1270,7726,0.030588852697706,3.8130555555556,90,2286,0 +1270,1271,7717,0.12998124134227002,3.7941666666667,80,2979,0 +1271,1272,10331,0.09100057249197198,3.6086111111111,90,3158,0 +1272,1273,10515,0.19464543002904006,3.3858333333333,84,2645,0 +1273,1274,10415,0.22178651521516,3.3336111111111,34,3161,0 +1274,1275,10387,0.22983578430825,3.3116666666667003,67,4460,0 +1275,1276,10471,0.298229429356,3.2616666666667005,74,2630,0 +1276,1277,10385,0.12923377484588,3.0044444444444003,44,2593,0 +1277,1278,10439,0.19609416059774,2.6741666666667,64,2625,0 +1278,1279,10516,0.040518533819385014,2.3191666666667,70,4834,0 +1279,1280,10587,0.07099894663641,2.0597222222222,96,4056,0 +1280,1281,10586,0.07584150637714701,2.0547222222222,110,5713,0 +1281,1282,10684,0.08180100127782801,2.1511111111111,68,3940,0 +1282,1283,10880,0.0,2.2602777777778,90,4414,0 +1283,1284,10830,0.0,2.2883333333333,90,5044,0 +1284,1285,10794,0.09140162014739303,2.3736111111111,69,3894,0 +1285,1286,10843,0.0,2.5869444444444,46,3993,0 +1286,1287,10805,0.0,2.6480555555556,74,4404,0 +1287,1288,10996,0.0,2.6077777777777995,68,4072,0 +1288,1289,11327,0.05363316840061,2.6069444444444,67,4182,0 +1289,1290,11090,0.26818151064716,2.6908333333333,51,3351,0 +1290,1291,10578,0.21887772653901,2.9019444444444003,39,4183,0 +1291,1292,10528,0.32371296573811,3.2711111111111,26,4068,0 +1292,1293,10475,0.12565805017257,3.5872222222222,25,8139,0 +1293,1294,10664,0.092277247744574,3.6913888888889,32,11000,0 +1294,1295,10513,0.077016875742983,3.6313888888889,17,2975,0 +1295,1296,9072,0.3714480797312501,3.5605555555556,19,2692,0 +1296,1297,9069,0.19332372237792,3.4402777777778,16,2502,0 +1297,1298,9089,0.06345811641554701,3.35,28,2510,0 +1298,1299,9027,0.2267121559473,3.3469444444444,24,2663,0 +1299,1300,8969,0.053072279964629,3.2708333333333,35,3575,0 +1300,1301,9073,0.13336345197744,3.2519444444444,49,2586,0 +1301,1302,8957,0.1252855094715,2.7311111111111,106,2908,0 +1302,1303,9126,0.096211952864224,2.3875,80,3530,0 +1303,1304,9122,0.096524467517755,2.0847222222222,90,2776,0 +1304,1305,9231,0.08924770147957402,2.0975,169,2962,0 +1305,1306,9368,0.11889606284162,2.1763888888889,98,3441,0 +1306,1307,9458,0.031429841710104,2.2327777777777995,92,4376,0 +1307,1308,9463,0.0,2.2725,91,3857,0 +1308,1309,9356,0.036512411627868,2.3202777777778,99,4685,0 +1309,1310,9340,0.0,2.5425,90,4585,0 +1310,1311,9340,0.0,2.5986111111111,126,3542,0 +1311,1312,9276,0.0,2.6319444444444,102,3370,0 +1312,1313,9611,0.10106696361212,2.5836111111111,132,3515,0 +1313,1314,9532,0.14854949043035,2.675,88,3793,0 +1314,1315,9156,0.08612162048398897,2.8522222222222,135,2954,0 +1315,1316,9222,0.16494200410492002,3.1302777777778,114,2627,0 +1316,1317,9282,0.28637713141253,3.4805555555556,35,2550,0 +1317,1318,9573,0.13206535647488,3.5994444444444,24,2480,0 +1318,1319,9333,0.27364025607799,3.5847222222222,44,2521,0 +1319,1320,9987,0.38382339961227,3.4963888888889,26,2860,0 +1320,1321,10133,0.08426242877623301,3.3825,37,3675,0 +1321,1322,10010,0.3290413568025901,3.2694444444444,45,2704,0 +1322,1323,10028,0.22632868808708,3.2322222222222,42,3121,0 +1323,1324,9984,0.17914189971361,3.1936111111111005,47,2603,0 +1324,1325,10041,0.30046815361859003,3.0536111111111004,34,3984,0 +1325,1326,10072,0.22650915594248,2.7819444444444,56,2537,0 
+1326,1327,10025,0.0,2.4152777777778,87,3349,0 +1327,1328,10116,0.1223093269317,2.1569444444444,74,3958,0 +1328,1329,10232,0.1696074188221,2.1125,90,4243,0 +1329,1330,10516,0.0,2.1833333333333003,79,4159,0 +1330,1331,10449,0.028193633007367,2.205,97,5637,0 +1331,1332,10598,0.0,2.1697222222222,90,8142,0 +1332,1333,10337,0.0,2.3075,77,5713,0 +1333,1334,10469,0.097305232437507,2.4575,101,3668,0 +1334,1335,10426,0.11905908868379,2.6077777777777995,74,4307,0 +1335,1336,10531,0.11660374103282,2.6275,439,4354,0 +1336,1337,10875,0.060474297756584014,2.6144444444444,79,4262,0 +1337,1338,10494,0.22568442027805,2.6477777777777995,165,3446,0 +1338,1339,10195,0.14077736537045002,2.8594444444444003,139,2677,0 +1339,1340,9918,0.1924574892026,3.2675,56,4450,0 +1340,1341,9889,0.18922597300629,3.5136111111111004,102,3044,0 +1341,1342,9947,0.041593949118095004,3.5725,101,3428,0 +1342,1343,9977,0.2502095174271,3.6863888888889,41,2845,0 +1343,1344,10835,0.18663972932643,3.5636111111111,94,2781,0 +1344,1345,10765,0.07351854082400297,3.4127777777778,116,2743,0 +1345,1346,10656,0.081949111399618,3.295,94,4470,0 +1346,1347,10485,0.20148511394009,3.2666666666667004,89,2596,0 +1347,1348,10681,0.11515101921294,3.1933333333333,141,3249,0 +1348,1349,10852,0.07797276382811,3.0688888888889,167,2529,0 +1349,1350,10728,0.07244862879413201,2.8102777777778,148,2452,0 +1350,1351,10874,0.07310929970435699,2.42,105,2934,0 +1351,1352,10964,0.066868365737218,2.1358333333333,210,3159,0 +1352,1353,10984,0.05788512501593701,1.9916666666667,145,3974,0 +1353,1354,11055,0.09727414207464803,2.0947222222222,136,4305,0 +1354,1355,11233,0.033270317741558,2.1591666666667,126,5012,0 +1355,1356,11161,0.0,2.2377777777778,157,4455,0 +1356,1357,10966,0.038270957919533,2.2511111111111,105,4108,0 +1357,1358,11193,0.08728058888363299,2.4208333333333,114,4339,0 +1358,1359,11167,0.10536774813238,2.5241666666667,104,5056,0 +1359,1360,11367,0.1233991317089,2.5794444444444,69,5573,0 +1360,1361,51251,0.042565915766552,2.5936111111111,75,3366,1 +1361,1362,17953,0.23147422367229,2.6830555555556,73,2559,1 +1362,1363,170029,0.08983405162538903,2.8188888888889,74,1999,1 +1363,1364,10955,0.07464756469365201,2.9513888888888995,126,1993,0 +1364,1365,10984,0.099244104918934,3.2830555555556,67,1913,0 +1365,1366,10964,0.11535172009194,3.4819444444444,32,1760,0 +1366,1367,10980,0.21774881707852,3.5886111111111005,38,1890,0 +1367,1368,10852,0.1305066423559,3.4836111111111,34,2469,0 +1368,1369,10786,0.10054853030204,3.3955555555556,36,2133,0 +1369,1370,10841,0.02468393737575,3.2847222222222,26,3359,0 +1370,1371,10762,0.10018007414459,3.2383333333332995,74,3783,0 +1371,1372,10419,0.12522619841308,3.2188888888889,85,1809,0 +1372,1373,10467,0.11781887197077,2.9483333333333,67,2143,0 +1373,1374,10502,0.13417256350298,2.5855555555556,84,2567,0 +1374,1375,10519,0.07474686582090599,2.3005555555556003,1630,2176,0 +1375,1376,10579,0.13570963056519,2.0855555555556,1435,1929,0 +1376,1377,10502,0.076431907457478,1.9027777777778,857,2244,0 +1377,1378,10661,0.0,1.9411111111111,31,1810,0 +1378,1379,10818,0.1936428046839,2.0444444444444,500,2088,0 +1379,1380,10918,0.052826773889684014,2.1363888888889,53,2371,0 +1380,1381,10871,0.0,2.22,61,1843,0 +1381,1382,10796,0.054466597481213,2.3530555555556,158,2668,0 +1382,1383,10774,0.057459020289436,2.545,184,2309,0 +1383,1384,10898,0.28750562005936,2.6202777777778,91,1998,0 +1384,1385,11442,0.075538554674309,2.6847222222222,60,2480,0 +1385,1386,11113,0.08112608570492501,2.6591666666667004,107,2147,0 
+1386,1387,10888,0.21563803296368,2.7863888888888995,5157,1802,0 +1387,1388,10894,0.095725002305685,3.0269444444444003,28,1789,0 +1388,1389,10888,0.17516056892320994,3.3227777777778,24,1999,0 +1389,1390,10896,0.32902836018586,3.6097222222222,21,2142,0 +1390,1391,10800,0.10216065221678,3.6805555555556,12,1904,0 +1391,1392,11000,0.19741931250852,3.6075,24,1876,0 +1392,1393,10985,0.10149107903671,3.4091666666667004,17,2434,0 +1393,1394,11017,0.17479255893624,3.3666666666667004,48,2472,0 +1394,1395,10863,0.034385029573777,3.3158333333333,41,1744,0 +1395,1396,10875,0.21988771218053,3.1622222222222,1088,2404,0 +1396,1397,10987,0.10149107903671,3.1086111111111,68,1971,0 +1397,1398,10778,0.10269981175445,2.6552777777778,2575,1713,0 +1398,1399,10957,0.11258759940039,2.2730555555556,4688,1765,0 +1399,1400,10832,0.13022351806001,2.0591666666667,477,3156,0 diff --git a/datasets/anomaly_reserve/yahoo_sub_5/TEST/problem_TEST/dataSplits.csv b/datasets/anomaly_reserve/yahoo_sub_5/TEST/problem_TEST/dataSplits.csv new file mode 100644 index 0000000..c72d454 --- /dev/null +++ b/datasets/anomaly_reserve/yahoo_sub_5/TEST/problem_TEST/dataSplits.csv @@ -0,0 +1,1261 @@ +d3mIndex,type,repeat,fold +1260,TEST,0,0 +1261,TEST,0,0 +1262,TEST,0,0 +1263,TEST,0,0 +1264,TEST,0,0 +1265,TEST,0,0 +1266,TEST,0,0 +1267,TEST,0,0 +1268,TEST,0,0 +1269,TEST,0,0 +1270,TEST,0,0 +1271,TEST,0,0 +1272,TEST,0,0 +1273,TEST,0,0 +1274,TEST,0,0 +1275,TEST,0,0 +1276,TEST,0,0 +1277,TEST,0,0 +1278,TEST,0,0 +1279,TEST,0,0 +1280,TEST,0,0 +1281,TEST,0,0 +1282,TEST,0,0 +1283,TEST,0,0 +1284,TEST,0,0 +1285,TEST,0,0 +1286,TEST,0,0 +1287,TEST,0,0 +1288,TEST,0,0 +1289,TEST,0,0 +1290,TEST,0,0 +1291,TEST,0,0 +1292,TEST,0,0 +1293,TEST,0,0 +1294,TEST,0,0 +1295,TEST,0,0 +1296,TEST,0,0 +1297,TEST,0,0 +1298,TEST,0,0 +1299,TEST,0,0 +1300,TEST,0,0 +1301,TEST,0,0 +1302,TEST,0,0 +1303,TEST,0,0 +1304,TEST,0,0 +1305,TEST,0,0 +1306,TEST,0,0 +1307,TEST,0,0 +1308,TEST,0,0 +1309,TEST,0,0 +1310,TEST,0,0 +1311,TEST,0,0 +1312,TEST,0,0 +1313,TEST,0,0 +1314,TEST,0,0 +1315,TEST,0,0 +1316,TEST,0,0 +1317,TEST,0,0 +1318,TEST,0,0 +1319,TEST,0,0 +1320,TEST,0,0 +1321,TEST,0,0 +1322,TEST,0,0 +1323,TEST,0,0 +1324,TEST,0,0 +1325,TEST,0,0 +1326,TEST,0,0 +1327,TEST,0,0 +1328,TEST,0,0 +1329,TEST,0,0 +1330,TEST,0,0 +1331,TEST,0,0 +1332,TEST,0,0 +1333,TEST,0,0 +1334,TEST,0,0 +1335,TEST,0,0 +1336,TEST,0,0 +1337,TEST,0,0 +1338,TEST,0,0 +1339,TEST,0,0 +1340,TEST,0,0 +1341,TEST,0,0 +1342,TEST,0,0 +1343,TEST,0,0 +1344,TEST,0,0 +1345,TEST,0,0 +1346,TEST,0,0 +1347,TEST,0,0 +1348,TEST,0,0 +1349,TEST,0,0 +1350,TEST,0,0 +1351,TEST,0,0 +1352,TEST,0,0 +1353,TEST,0,0 +1354,TEST,0,0 +1355,TEST,0,0 +1356,TEST,0,0 +1357,TEST,0,0 +1358,TEST,0,0 +1359,TEST,0,0 +1360,TEST,0,0 +1361,TEST,0,0 +1362,TEST,0,0 +1363,TEST,0,0 +1364,TEST,0,0 +1365,TEST,0,0 +1366,TEST,0,0 +1367,TEST,0,0 +1368,TEST,0,0 +1369,TEST,0,0 +1370,TEST,0,0 +1371,TEST,0,0 +1372,TEST,0,0 +1373,TEST,0,0 +1374,TEST,0,0 +1375,TEST,0,0 +1376,TEST,0,0 +1377,TEST,0,0 +1378,TEST,0,0 +1379,TEST,0,0 +1380,TEST,0,0 +1381,TEST,0,0 +1382,TEST,0,0 +1383,TEST,0,0 +1384,TEST,0,0 +1385,TEST,0,0 +1386,TEST,0,0 +1387,TEST,0,0 +1388,TEST,0,0 +1389,TEST,0,0 +1390,TEST,0,0 +1391,TEST,0,0 +1392,TEST,0,0 +1393,TEST,0,0 +1394,TEST,0,0 +1395,TEST,0,0 +1396,TEST,0,0 +1397,TEST,0,0 +1398,TEST,0,0 +1399,TEST,0,0 +1400,TEST,0,0 +1401,TEST,0,0 +1402,TEST,0,0 +1403,TEST,0,0 +1404,TEST,0,0 +1405,TEST,0,0 +1406,TEST,0,0 +1407,TEST,0,0 +1408,TEST,0,0 +1409,TEST,0,0 +1410,TEST,0,0 +1411,TEST,0,0 +1412,TEST,0,0 +1413,TEST,0,0 +1414,TEST,0,0 +1415,TEST,0,0 +1416,TEST,0,0 
+1417,TEST,0,0 +1418,TEST,0,0 +1419,TEST,0,0 +1420,TEST,0,0 +1421,TEST,0,0 +1422,TEST,0,0 +1423,TEST,0,0 +1424,TEST,0,0 +1425,TEST,0,0 +1426,TEST,0,0 +1427,TEST,0,0 +1428,TEST,0,0 +1429,TEST,0,0 +1430,TEST,0,0 +1431,TEST,0,0 +1432,TEST,0,0 +1433,TEST,0,0 +1434,TEST,0,0 +1435,TEST,0,0 +1436,TEST,0,0 +1437,TEST,0,0 +1438,TEST,0,0 +1439,TEST,0,0 +1440,TEST,0,0 +1441,TEST,0,0 +1442,TEST,0,0 +1443,TEST,0,0 +1444,TEST,0,0 +1445,TEST,0,0 +1446,TEST,0,0 +1447,TEST,0,0 +1448,TEST,0,0 +1449,TEST,0,0 +1450,TEST,0,0 +1451,TEST,0,0 +1452,TEST,0,0 +1453,TEST,0,0 +1454,TEST,0,0 +1455,TEST,0,0 +1456,TEST,0,0 +1457,TEST,0,0 +1458,TEST,0,0 +1459,TEST,0,0 +1460,TEST,0,0 +1461,TEST,0,0 +1462,TEST,0,0 +1463,TEST,0,0 +1464,TEST,0,0 +1465,TEST,0,0 +1466,TEST,0,0 +1467,TEST,0,0 +1468,TEST,0,0 +1469,TEST,0,0 +1470,TEST,0,0 +1471,TEST,0,0 +1472,TEST,0,0 +1473,TEST,0,0 +1474,TEST,0,0 +1475,TEST,0,0 +1476,TEST,0,0 +1477,TEST,0,0 +1478,TEST,0,0 +1479,TEST,0,0 +1480,TEST,0,0 +1481,TEST,0,0 +1482,TEST,0,0 +1483,TEST,0,0 +1484,TEST,0,0 +1485,TEST,0,0 +1486,TEST,0,0 +1487,TEST,0,0 +1488,TEST,0,0 +1489,TEST,0,0 +1490,TEST,0,0 +1491,TEST,0,0 +1492,TEST,0,0 +1493,TEST,0,0 +1494,TEST,0,0 +1495,TEST,0,0 +1496,TEST,0,0 +1497,TEST,0,0 +1498,TEST,0,0 +1499,TEST,0,0 +1500,TEST,0,0 +1501,TEST,0,0 +1502,TEST,0,0 +1503,TEST,0,0 +1504,TEST,0,0 +1505,TEST,0,0 +1506,TEST,0,0 +1507,TEST,0,0 +1508,TEST,0,0 +1509,TEST,0,0 +1510,TEST,0,0 +1511,TEST,0,0 +1512,TEST,0,0 +1513,TEST,0,0 +1514,TEST,0,0 +1515,TEST,0,0 +1516,TEST,0,0 +1517,TEST,0,0 +1518,TEST,0,0 +1519,TEST,0,0 +1520,TEST,0,0 +1521,TEST,0,0 +1522,TEST,0,0 +1523,TEST,0,0 +1524,TEST,0,0 +1525,TEST,0,0 +1526,TEST,0,0 +1527,TEST,0,0 +1528,TEST,0,0 +1529,TEST,0,0 +1530,TEST,0,0 +1531,TEST,0,0 +1532,TEST,0,0 +1533,TEST,0,0 +1534,TEST,0,0 +1535,TEST,0,0 +1536,TEST,0,0 +1537,TEST,0,0 +1538,TEST,0,0 +1539,TEST,0,0 +1540,TEST,0,0 +1541,TEST,0,0 +1542,TEST,0,0 +1543,TEST,0,0 +1544,TEST,0,0 +1545,TEST,0,0 +1546,TEST,0,0 +1547,TEST,0,0 +1548,TEST,0,0 +1549,TEST,0,0 +1550,TEST,0,0 +1551,TEST,0,0 +1552,TEST,0,0 +1553,TEST,0,0 +1554,TEST,0,0 +1555,TEST,0,0 +1556,TEST,0,0 +1557,TEST,0,0 +1558,TEST,0,0 +1559,TEST,0,0 +1560,TEST,0,0 +1561,TEST,0,0 +1562,TEST,0,0 +1563,TEST,0,0 +1564,TEST,0,0 +1565,TEST,0,0 +1566,TEST,0,0 +1567,TEST,0,0 +1568,TEST,0,0 +1569,TEST,0,0 +1570,TEST,0,0 +1571,TEST,0,0 +1572,TEST,0,0 +1573,TEST,0,0 +1574,TEST,0,0 +1575,TEST,0,0 +1576,TEST,0,0 +1577,TEST,0,0 +1578,TEST,0,0 +1579,TEST,0,0 +1580,TEST,0,0 +1581,TEST,0,0 +1582,TEST,0,0 +1583,TEST,0,0 +1584,TEST,0,0 +1585,TEST,0,0 +1586,TEST,0,0 +1587,TEST,0,0 +1588,TEST,0,0 +1589,TEST,0,0 +1590,TEST,0,0 +1591,TEST,0,0 +1592,TEST,0,0 +1593,TEST,0,0 +1594,TEST,0,0 +1595,TEST,0,0 +1596,TEST,0,0 +1597,TEST,0,0 +1598,TEST,0,0 +1599,TEST,0,0 +1600,TEST,0,0 +1601,TEST,0,0 +1602,TEST,0,0 +1603,TEST,0,0 +1604,TEST,0,0 +1605,TEST,0,0 +1606,TEST,0,0 +1607,TEST,0,0 +1608,TEST,0,0 +1609,TEST,0,0 +1610,TEST,0,0 +1611,TEST,0,0 +1612,TEST,0,0 +1613,TEST,0,0 +1614,TEST,0,0 +1615,TEST,0,0 +1616,TEST,0,0 +1617,TEST,0,0 +1618,TEST,0,0 +1619,TEST,0,0 +1620,TEST,0,0 +1621,TEST,0,0 +1622,TEST,0,0 +1623,TEST,0,0 +1624,TEST,0,0 +1625,TEST,0,0 +1626,TEST,0,0 +1627,TEST,0,0 +1628,TEST,0,0 +1629,TEST,0,0 +1630,TEST,0,0 +1631,TEST,0,0 +1632,TEST,0,0 +1633,TEST,0,0 +1634,TEST,0,0 +1635,TEST,0,0 +1636,TEST,0,0 +1637,TEST,0,0 +1638,TEST,0,0 +1639,TEST,0,0 +1640,TEST,0,0 +1641,TEST,0,0 +1642,TEST,0,0 +1643,TEST,0,0 +1644,TEST,0,0 +1645,TEST,0,0 +1646,TEST,0,0 +1647,TEST,0,0 +1648,TEST,0,0 +1649,TEST,0,0 +1650,TEST,0,0 +1651,TEST,0,0 +1652,TEST,0,0 +1653,TEST,0,0 
+1654,TEST,0,0 +1655,TEST,0,0 +1656,TEST,0,0 +1657,TEST,0,0 +1658,TEST,0,0 +1659,TEST,0,0 +1660,TEST,0,0 +1661,TEST,0,0 +1662,TEST,0,0 +1663,TEST,0,0 +1664,TEST,0,0 +1665,TEST,0,0 +1666,TEST,0,0 +1667,TEST,0,0 +1668,TEST,0,0 +1669,TEST,0,0 +1670,TEST,0,0 +1671,TEST,0,0 +1672,TEST,0,0 +1673,TEST,0,0 +1674,TEST,0,0 +1675,TEST,0,0 +1676,TEST,0,0 +1677,TEST,0,0 +1678,TEST,0,0 +1679,TEST,0,0 +1680,TEST,0,0 +1681,TEST,0,0 +1682,TEST,0,0 +1683,TEST,0,0 +1684,TEST,0,0 +1685,TEST,0,0 +1686,TEST,0,0 +1687,TEST,0,0 +1688,TEST,0,0 +1689,TEST,0,0 +1690,TEST,0,0 +1691,TEST,0,0 +1692,TEST,0,0 +1693,TEST,0,0 +1694,TEST,0,0 +1695,TEST,0,0 +1696,TEST,0,0 +1697,TEST,0,0 +1698,TEST,0,0 +1699,TEST,0,0 +1700,TEST,0,0 +1701,TEST,0,0 +1702,TEST,0,0 +1703,TEST,0,0 +1704,TEST,0,0 +1705,TEST,0,0 +1706,TEST,0,0 +1707,TEST,0,0 +1708,TEST,0,0 +1709,TEST,0,0 +1710,TEST,0,0 +1711,TEST,0,0 +1712,TEST,0,0 +1713,TEST,0,0 +1714,TEST,0,0 +1715,TEST,0,0 +1716,TEST,0,0 +1717,TEST,0,0 +1718,TEST,0,0 +1719,TEST,0,0 +1720,TEST,0,0 +1721,TEST,0,0 +1722,TEST,0,0 +1723,TEST,0,0 +1724,TEST,0,0 +1725,TEST,0,0 +1726,TEST,0,0 +1727,TEST,0,0 +1728,TEST,0,0 +1729,TEST,0,0 +1730,TEST,0,0 +1731,TEST,0,0 +1732,TEST,0,0 +1733,TEST,0,0 +1734,TEST,0,0 +1735,TEST,0,0 +1736,TEST,0,0 +1737,TEST,0,0 +1738,TEST,0,0 +1739,TEST,0,0 +1740,TEST,0,0 +1741,TEST,0,0 +1742,TEST,0,0 +1743,TEST,0,0 +1744,TEST,0,0 +1745,TEST,0,0 +1746,TEST,0,0 +1747,TEST,0,0 +1748,TEST,0,0 +1749,TEST,0,0 +1750,TEST,0,0 +1751,TEST,0,0 +1752,TEST,0,0 +1753,TEST,0,0 +1754,TEST,0,0 +1755,TEST,0,0 +1756,TEST,0,0 +1757,TEST,0,0 +1758,TEST,0,0 +1759,TEST,0,0 +1760,TEST,0,0 +1761,TEST,0,0 +1762,TEST,0,0 +1763,TEST,0,0 +1764,TEST,0,0 +1765,TEST,0,0 +1766,TEST,0,0 +1767,TEST,0,0 +1768,TEST,0,0 +1769,TEST,0,0 +1770,TEST,0,0 +1771,TEST,0,0 +1772,TEST,0,0 +1773,TEST,0,0 +1774,TEST,0,0 +1775,TEST,0,0 +1776,TEST,0,0 +1777,TEST,0,0 +1778,TEST,0,0 +1779,TEST,0,0 +1780,TEST,0,0 +1781,TEST,0,0 +1782,TEST,0,0 +1783,TEST,0,0 +1784,TEST,0,0 +1785,TEST,0,0 +1786,TEST,0,0 +1787,TEST,0,0 +1788,TEST,0,0 +1789,TEST,0,0 +1790,TEST,0,0 +1791,TEST,0,0 +1792,TEST,0,0 +1793,TEST,0,0 +1794,TEST,0,0 +1795,TEST,0,0 +1796,TEST,0,0 +1797,TEST,0,0 +1798,TEST,0,0 +1799,TEST,0,0 +1800,TEST,0,0 +1801,TEST,0,0 +1802,TEST,0,0 +1803,TEST,0,0 +1804,TEST,0,0 +1805,TEST,0,0 +1806,TEST,0,0 +1807,TEST,0,0 +1808,TEST,0,0 +1809,TEST,0,0 +1810,TEST,0,0 +1811,TEST,0,0 +1812,TEST,0,0 +1813,TEST,0,0 +1814,TEST,0,0 +1815,TEST,0,0 +1816,TEST,0,0 +1817,TEST,0,0 +1818,TEST,0,0 +1819,TEST,0,0 +1820,TEST,0,0 +1821,TEST,0,0 +1822,TEST,0,0 +1823,TEST,0,0 +1824,TEST,0,0 +1825,TEST,0,0 +1826,TEST,0,0 +1827,TEST,0,0 +1828,TEST,0,0 +1829,TEST,0,0 +1830,TEST,0,0 +1831,TEST,0,0 +1832,TEST,0,0 +1833,TEST,0,0 +1834,TEST,0,0 +1835,TEST,0,0 +1836,TEST,0,0 +1837,TEST,0,0 +1838,TEST,0,0 +1839,TEST,0,0 +1840,TEST,0,0 +1841,TEST,0,0 +1842,TEST,0,0 +1843,TEST,0,0 +1844,TEST,0,0 +1845,TEST,0,0 +1846,TEST,0,0 +1847,TEST,0,0 +1848,TEST,0,0 +1849,TEST,0,0 +1850,TEST,0,0 +1851,TEST,0,0 +1852,TEST,0,0 +1853,TEST,0,0 +1854,TEST,0,0 +1855,TEST,0,0 +1856,TEST,0,0 +1857,TEST,0,0 +1858,TEST,0,0 +1859,TEST,0,0 +1860,TEST,0,0 +1861,TEST,0,0 +1862,TEST,0,0 +1863,TEST,0,0 +1864,TEST,0,0 +1865,TEST,0,0 +1866,TEST,0,0 +1867,TEST,0,0 +1868,TEST,0,0 +1869,TEST,0,0 +1870,TEST,0,0 +1871,TEST,0,0 +1872,TEST,0,0 +1873,TEST,0,0 +1874,TEST,0,0 +1875,TEST,0,0 +1876,TEST,0,0 +1877,TEST,0,0 +1878,TEST,0,0 +1879,TEST,0,0 +1880,TEST,0,0 +1881,TEST,0,0 +1882,TEST,0,0 +1883,TEST,0,0 +1884,TEST,0,0 +1885,TEST,0,0 +1886,TEST,0,0 +1887,TEST,0,0 +1888,TEST,0,0 +1889,TEST,0,0 +1890,TEST,0,0 
+1891,TEST,0,0 +1892,TEST,0,0 +1893,TEST,0,0 +1894,TEST,0,0 +1895,TEST,0,0 +1896,TEST,0,0 +1897,TEST,0,0 +1898,TEST,0,0 +1899,TEST,0,0 +1900,TEST,0,0 +1901,TEST,0,0 +1902,TEST,0,0 +1903,TEST,0,0 +1904,TEST,0,0 +1905,TEST,0,0 +1906,TEST,0,0 +1907,TEST,0,0 +1908,TEST,0,0 +1909,TEST,0,0 +1910,TEST,0,0 +1911,TEST,0,0 +1912,TEST,0,0 +1913,TEST,0,0 +1914,TEST,0,0 +1915,TEST,0,0 +1916,TEST,0,0 +1917,TEST,0,0 +1918,TEST,0,0 +1919,TEST,0,0 +1920,TEST,0,0 +1921,TEST,0,0 +1922,TEST,0,0 +1923,TEST,0,0 +1924,TEST,0,0 +1925,TEST,0,0 +1926,TEST,0,0 +1927,TEST,0,0 +1928,TEST,0,0 +1929,TEST,0,0 +1930,TEST,0,0 +1931,TEST,0,0 +1932,TEST,0,0 +1933,TEST,0,0 +1934,TEST,0,0 +1935,TEST,0,0 +1936,TEST,0,0 +1937,TEST,0,0 +1938,TEST,0,0 +1939,TEST,0,0 +1940,TEST,0,0 +1941,TEST,0,0 +1942,TEST,0,0 +1943,TEST,0,0 +1944,TEST,0,0 +1945,TEST,0,0 +1946,TEST,0,0 +1947,TEST,0,0 +1948,TEST,0,0 +1949,TEST,0,0 +1950,TEST,0,0 +1951,TEST,0,0 +1952,TEST,0,0 +1953,TEST,0,0 +1954,TEST,0,0 +1955,TEST,0,0 +1956,TEST,0,0 +1957,TEST,0,0 +1958,TEST,0,0 +1959,TEST,0,0 +1960,TEST,0,0 +1961,TEST,0,0 +1962,TEST,0,0 +1963,TEST,0,0 +1964,TEST,0,0 +1965,TEST,0,0 +1966,TEST,0,0 +1967,TEST,0,0 +1968,TEST,0,0 +1969,TEST,0,0 +1970,TEST,0,0 +1971,TEST,0,0 +1972,TEST,0,0 +1973,TEST,0,0 +1974,TEST,0,0 +1975,TEST,0,0 +1976,TEST,0,0 +1977,TEST,0,0 +1978,TEST,0,0 +1979,TEST,0,0 +1980,TEST,0,0 +1981,TEST,0,0 +1982,TEST,0,0 +1983,TEST,0,0 +1984,TEST,0,0 +1985,TEST,0,0 +1986,TEST,0,0 +1987,TEST,0,0 +1988,TEST,0,0 +1989,TEST,0,0 +1990,TEST,0,0 +1991,TEST,0,0 +1992,TEST,0,0 +1993,TEST,0,0 +1994,TEST,0,0 +1995,TEST,0,0 +1996,TEST,0,0 +1997,TEST,0,0 +1998,TEST,0,0 +1999,TEST,0,0 +2000,TEST,0,0 +2001,TEST,0,0 +2002,TEST,0,0 +2003,TEST,0,0 +2004,TEST,0,0 +2005,TEST,0,0 +2006,TEST,0,0 +2007,TEST,0,0 +2008,TEST,0,0 +2009,TEST,0,0 +2010,TEST,0,0 +2011,TEST,0,0 +2012,TEST,0,0 +2013,TEST,0,0 +2014,TEST,0,0 +2015,TEST,0,0 +2016,TEST,0,0 +2017,TEST,0,0 +2018,TEST,0,0 +2019,TEST,0,0 +2020,TEST,0,0 +2021,TEST,0,0 +2022,TEST,0,0 +2023,TEST,0,0 +2024,TEST,0,0 +2025,TEST,0,0 +2026,TEST,0,0 +2027,TEST,0,0 +2028,TEST,0,0 +2029,TEST,0,0 +2030,TEST,0,0 +2031,TEST,0,0 +2032,TEST,0,0 +2033,TEST,0,0 +2034,TEST,0,0 +2035,TEST,0,0 +2036,TEST,0,0 +2037,TEST,0,0 +2038,TEST,0,0 +2039,TEST,0,0 +2040,TEST,0,0 +2041,TEST,0,0 +2042,TEST,0,0 +2043,TEST,0,0 +2044,TEST,0,0 +2045,TEST,0,0 +2046,TEST,0,0 +2047,TEST,0,0 +2048,TEST,0,0 +2049,TEST,0,0 +2050,TEST,0,0 +2051,TEST,0,0 +2052,TEST,0,0 +2053,TEST,0,0 +2054,TEST,0,0 +2055,TEST,0,0 +2056,TEST,0,0 +2057,TEST,0,0 +2058,TEST,0,0 +2059,TEST,0,0 +2060,TEST,0,0 +2061,TEST,0,0 +2062,TEST,0,0 +2063,TEST,0,0 +2064,TEST,0,0 +2065,TEST,0,0 +2066,TEST,0,0 +2067,TEST,0,0 +2068,TEST,0,0 +2069,TEST,0,0 +2070,TEST,0,0 +2071,TEST,0,0 +2072,TEST,0,0 +2073,TEST,0,0 +2074,TEST,0,0 +2075,TEST,0,0 +2076,TEST,0,0 +2077,TEST,0,0 +2078,TEST,0,0 +2079,TEST,0,0 +2080,TEST,0,0 +2081,TEST,0,0 +2082,TEST,0,0 +2083,TEST,0,0 +2084,TEST,0,0 +2085,TEST,0,0 +2086,TEST,0,0 +2087,TEST,0,0 +2088,TEST,0,0 +2089,TEST,0,0 +2090,TEST,0,0 +2091,TEST,0,0 +2092,TEST,0,0 +2093,TEST,0,0 +2094,TEST,0,0 +2095,TEST,0,0 +2096,TEST,0,0 +2097,TEST,0,0 +2098,TEST,0,0 +2099,TEST,0,0 +2100,TEST,0,0 +2101,TEST,0,0 +2102,TEST,0,0 +2103,TEST,0,0 +2104,TEST,0,0 +2105,TEST,0,0 +2106,TEST,0,0 +2107,TEST,0,0 +2108,TEST,0,0 +2109,TEST,0,0 +2110,TEST,0,0 +2111,TEST,0,0 +2112,TEST,0,0 +2113,TEST,0,0 +2114,TEST,0,0 +2115,TEST,0,0 +2116,TEST,0,0 +2117,TEST,0,0 +2118,TEST,0,0 +2119,TEST,0,0 +2120,TEST,0,0 +2121,TEST,0,0 +2122,TEST,0,0 +2123,TEST,0,0 +2124,TEST,0,0 +2125,TEST,0,0 +2126,TEST,0,0 +2127,TEST,0,0 
+2128,TEST,0,0 +2129,TEST,0,0 +2130,TEST,0,0 +2131,TEST,0,0 +2132,TEST,0,0 +2133,TEST,0,0 +2134,TEST,0,0 +2135,TEST,0,0 +2136,TEST,0,0 +2137,TEST,0,0 +2138,TEST,0,0 +2139,TEST,0,0 +2140,TEST,0,0 +2141,TEST,0,0 +2142,TEST,0,0 +2143,TEST,0,0 +2144,TEST,0,0 +2145,TEST,0,0 +2146,TEST,0,0 +2147,TEST,0,0 +2148,TEST,0,0 +2149,TEST,0,0 +2150,TEST,0,0 +2151,TEST,0,0 +2152,TEST,0,0 +2153,TEST,0,0 +2154,TEST,0,0 +2155,TEST,0,0 +2156,TEST,0,0 +2157,TEST,0,0 +2158,TEST,0,0 +2159,TEST,0,0 +2160,TEST,0,0 +2161,TEST,0,0 +2162,TEST,0,0 +2163,TEST,0,0 +2164,TEST,0,0 +2165,TEST,0,0 +2166,TEST,0,0 +2167,TEST,0,0 +2168,TEST,0,0 +2169,TEST,0,0 +2170,TEST,0,0 +2171,TEST,0,0 +2172,TEST,0,0 +2173,TEST,0,0 +2174,TEST,0,0 +2175,TEST,0,0 +2176,TEST,0,0 +2177,TEST,0,0 +2178,TEST,0,0 +2179,TEST,0,0 +2180,TEST,0,0 +2181,TEST,0,0 +2182,TEST,0,0 +2183,TEST,0,0 +2184,TEST,0,0 +2185,TEST,0,0 +2186,TEST,0,0 +2187,TEST,0,0 +2188,TEST,0,0 +2189,TEST,0,0 +2190,TEST,0,0 +2191,TEST,0,0 +2192,TEST,0,0 +2193,TEST,0,0 +2194,TEST,0,0 +2195,TEST,0,0 +2196,TEST,0,0 +2197,TEST,0,0 +2198,TEST,0,0 +2199,TEST,0,0 +2200,TEST,0,0 +2201,TEST,0,0 +2202,TEST,0,0 +2203,TEST,0,0 +2204,TEST,0,0 +2205,TEST,0,0 +2206,TEST,0,0 +2207,TEST,0,0 +2208,TEST,0,0 +2209,TEST,0,0 +2210,TEST,0,0 +2211,TEST,0,0 +2212,TEST,0,0 +2213,TEST,0,0 +2214,TEST,0,0 +2215,TEST,0,0 +2216,TEST,0,0 +2217,TEST,0,0 +2218,TEST,0,0 +2219,TEST,0,0 +2220,TEST,0,0 +2221,TEST,0,0 +2222,TEST,0,0 +2223,TEST,0,0 +2224,TEST,0,0 +2225,TEST,0,0 +2226,TEST,0,0 +2227,TEST,0,0 +2228,TEST,0,0 +2229,TEST,0,0 +2230,TEST,0,0 +2231,TEST,0,0 +2232,TEST,0,0 +2233,TEST,0,0 +2234,TEST,0,0 +2235,TEST,0,0 +2236,TEST,0,0 +2237,TEST,0,0 +2238,TEST,0,0 +2239,TEST,0,0 +2240,TEST,0,0 +2241,TEST,0,0 +2242,TEST,0,0 +2243,TEST,0,0 +2244,TEST,0,0 +2245,TEST,0,0 +2246,TEST,0,0 +2247,TEST,0,0 +2248,TEST,0,0 +2249,TEST,0,0 +2250,TEST,0,0 +2251,TEST,0,0 +2252,TEST,0,0 +2253,TEST,0,0 +2254,TEST,0,0 +2255,TEST,0,0 +2256,TEST,0,0 +2257,TEST,0,0 +2258,TEST,0,0 +2259,TEST,0,0 +2260,TEST,0,0 +2261,TEST,0,0 +2262,TEST,0,0 +2263,TEST,0,0 +2264,TEST,0,0 +2265,TEST,0,0 +2266,TEST,0,0 +2267,TEST,0,0 +2268,TEST,0,0 +2269,TEST,0,0 +2270,TEST,0,0 +2271,TEST,0,0 +2272,TEST,0,0 +2273,TEST,0,0 +2274,TEST,0,0 +2275,TEST,0,0 +2276,TEST,0,0 +2277,TEST,0,0 +2278,TEST,0,0 +2279,TEST,0,0 +2280,TEST,0,0 +2281,TEST,0,0 +2282,TEST,0,0 +2283,TEST,0,0 +2284,TEST,0,0 +2285,TEST,0,0 +2286,TEST,0,0 +2287,TEST,0,0 +2288,TEST,0,0 +2289,TEST,0,0 +2290,TEST,0,0 +2291,TEST,0,0 +2292,TEST,0,0 +2293,TEST,0,0 +2294,TEST,0,0 +2295,TEST,0,0 +2296,TEST,0,0 +2297,TEST,0,0 +2298,TEST,0,0 +2299,TEST,0,0 +2300,TEST,0,0 +2301,TEST,0,0 +2302,TEST,0,0 +2303,TEST,0,0 +2304,TEST,0,0 +2305,TEST,0,0 +2306,TEST,0,0 +2307,TEST,0,0 +2308,TEST,0,0 +2309,TEST,0,0 +2310,TEST,0,0 +2311,TEST,0,0 +2312,TEST,0,0 +2313,TEST,0,0 +2314,TEST,0,0 +2315,TEST,0,0 +2316,TEST,0,0 +2317,TEST,0,0 +2318,TEST,0,0 +2319,TEST,0,0 +2320,TEST,0,0 +2321,TEST,0,0 +2322,TEST,0,0 +2323,TEST,0,0 +2324,TEST,0,0 +2325,TEST,0,0 +2326,TEST,0,0 +2327,TEST,0,0 +2328,TEST,0,0 +2329,TEST,0,0 +2330,TEST,0,0 +2331,TEST,0,0 +2332,TEST,0,0 +2333,TEST,0,0 +2334,TEST,0,0 +2335,TEST,0,0 +2336,TEST,0,0 +2337,TEST,0,0 +2338,TEST,0,0 +2339,TEST,0,0 +2340,TEST,0,0 +2341,TEST,0,0 +2342,TEST,0,0 +2343,TEST,0,0 +2344,TEST,0,0 +2345,TEST,0,0 +2346,TEST,0,0 +2347,TEST,0,0 +2348,TEST,0,0 +2349,TEST,0,0 +2350,TEST,0,0 +2351,TEST,0,0 +2352,TEST,0,0 +2353,TEST,0,0 +2354,TEST,0,0 +2355,TEST,0,0 +2356,TEST,0,0 +2357,TEST,0,0 +2358,TEST,0,0 +2359,TEST,0,0 +2360,TEST,0,0 +2361,TEST,0,0 +2362,TEST,0,0 +2363,TEST,0,0 +2364,TEST,0,0 
+2365,TEST,0,0 +2366,TEST,0,0 +2367,TEST,0,0 +2368,TEST,0,0 +2369,TEST,0,0 +2370,TEST,0,0 +2371,TEST,0,0 +2372,TEST,0,0 +2373,TEST,0,0 +2374,TEST,0,0 +2375,TEST,0,0 +2376,TEST,0,0 +2377,TEST,0,0 +2378,TEST,0,0 +2379,TEST,0,0 +2380,TEST,0,0 +2381,TEST,0,0 +2382,TEST,0,0 +2383,TEST,0,0 +2384,TEST,0,0 +2385,TEST,0,0 +2386,TEST,0,0 +2387,TEST,0,0 +2388,TEST,0,0 +2389,TEST,0,0 +2390,TEST,0,0 +2391,TEST,0,0 +2392,TEST,0,0 +2393,TEST,0,0 +2394,TEST,0,0 +2395,TEST,0,0 +2396,TEST,0,0 +2397,TEST,0,0 +2398,TEST,0,0 +2399,TEST,0,0 +2400,TEST,0,0 +2401,TEST,0,0 +2402,TEST,0,0 +2403,TEST,0,0 +2404,TEST,0,0 +2405,TEST,0,0 +2406,TEST,0,0 +2407,TEST,0,0 +2408,TEST,0,0 +2409,TEST,0,0 +2410,TEST,0,0 +2411,TEST,0,0 +2412,TEST,0,0 +2413,TEST,0,0 +2414,TEST,0,0 +2415,TEST,0,0 +2416,TEST,0,0 +2417,TEST,0,0 +2418,TEST,0,0 +2419,TEST,0,0 +2420,TEST,0,0 +2421,TEST,0,0 +2422,TEST,0,0 +2423,TEST,0,0 +2424,TEST,0,0 +2425,TEST,0,0 +2426,TEST,0,0 +2427,TEST,0,0 +2428,TEST,0,0 +2429,TEST,0,0 +2430,TEST,0,0 +2431,TEST,0,0 +2432,TEST,0,0 +2433,TEST,0,0 +2434,TEST,0,0 +2435,TEST,0,0 +2436,TEST,0,0 +2437,TEST,0,0 +2438,TEST,0,0 +2439,TEST,0,0 +2440,TEST,0,0 +2441,TEST,0,0 +2442,TEST,0,0 +2443,TEST,0,0 +2444,TEST,0,0 +2445,TEST,0,0 +2446,TEST,0,0 +2447,TEST,0,0 +2448,TEST,0,0 +2449,TEST,0,0 +2450,TEST,0,0 +2451,TEST,0,0 +2452,TEST,0,0 +2453,TEST,0,0 +2454,TEST,0,0 +2455,TEST,0,0 +2456,TEST,0,0 +2457,TEST,0,0 +2458,TEST,0,0 +2459,TEST,0,0 +2460,TEST,0,0 +2461,TEST,0,0 +2462,TEST,0,0 +2463,TEST,0,0 +2464,TEST,0,0 +2465,TEST,0,0 +2466,TEST,0,0 +2467,TEST,0,0 +2468,TEST,0,0 +2469,TEST,0,0 +2470,TEST,0,0 +2471,TEST,0,0 +2472,TEST,0,0 +2473,TEST,0,0 +2474,TEST,0,0 +2475,TEST,0,0 +2476,TEST,0,0 +2477,TEST,0,0 +2478,TEST,0,0 +2479,TEST,0,0 +2480,TEST,0,0 +2481,TEST,0,0 +2482,TEST,0,0 +2483,TEST,0,0 +2484,TEST,0,0 +2485,TEST,0,0 +2486,TEST,0,0 +2487,TEST,0,0 +2488,TEST,0,0 +2489,TEST,0,0 +2490,TEST,0,0 +2491,TEST,0,0 +2492,TEST,0,0 +2493,TEST,0,0 +2494,TEST,0,0 +2495,TEST,0,0 +2496,TEST,0,0 +2497,TEST,0,0 +2498,TEST,0,0 +2499,TEST,0,0 +2500,TEST,0,0 +2501,TEST,0,0 +2502,TEST,0,0 +2503,TEST,0,0 +2504,TEST,0,0 +2505,TEST,0,0 +2506,TEST,0,0 +2507,TEST,0,0 +2508,TEST,0,0 +2509,TEST,0,0 +2510,TEST,0,0 +2511,TEST,0,0 +2512,TEST,0,0 +2513,TEST,0,0 +2514,TEST,0,0 +2515,TEST,0,0 +2516,TEST,0,0 +2517,TEST,0,0 +2518,TEST,0,0 +2519,TEST,0,0 diff --git a/datasets/anomaly_reserve/yahoo_sub_5/TEST/problem_TEST/problemDoc.json b/datasets/anomaly_reserve/yahoo_sub_5/TEST/problem_TEST/problemDoc.json new file mode 100644 index 0000000..417cb6b --- /dev/null +++ b/datasets/anomaly_reserve/yahoo_sub_5/TEST/problem_TEST/problemDoc.json @@ -0,0 +1,65 @@ +{ + "about": { + "problemID": "yahoo_sub_5_problem", + "problemName": "yahoo_sub_5_problem", + "problemDescription": "Anomaly detection", + "problemVersion": "4.0.0", + "problemSchemaVersion": "4.0.0", + "taskKeywords": [ + "classification", + "binary", + "tabular" + ] + }, + "inputs": { + "data": [ + { + "datasetID": "yahoo_sub_5_dataset", + "targets": [ + { + "targetIndex": 0, + "resID": "learningData", + "colIndex": 7, + "colName": "ground_truth" + } + ] + } + ], + "dataSplits": { + "method": "holdOut", + "testSize": 0.2, + "stratified": true, + "numRepeats": 0, + "randomSeed": 42, + "splitsFile": "dataSplits.csv", + "datasetViewMaps": { + "train": [ + { + "from": "yahoo_sub_5_dataset", + "to": "yahoo_sub_5_dataset_TRAIN" + } + ], + "test": [ + { + "from": "yahoo_sub_5_dataset", + "to": "yahoo_sub_5_dataset_TEST" + } + ], + "score": [ + { + "from": "yahoo_sub_5_dataset", + "to": 
"yahoo_sub_5_dataset_SCORE" + } + ] + } + }, + "performanceMetrics": [ + { + "metric": "f1Macro" + } + ] + }, + "expectedOutputs": { + "predictionsFile": "predictions.csv" + } +} \ No newline at end of file diff --git a/datasets/anomaly_reserve/yahoo_sub_5/TRAIN/dataset_TRAIN/datasetDoc.json b/datasets/anomaly_reserve/yahoo_sub_5/TRAIN/dataset_TRAIN/datasetDoc.json new file mode 100644 index 0000000..be6f5c0 --- /dev/null +++ b/datasets/anomaly_reserve/yahoo_sub_5/TRAIN/dataset_TRAIN/datasetDoc.json @@ -0,0 +1,95 @@ +{ + "about": { + "datasetID": "yahoo_sub_5_dataset_TRAIN", + "datasetName": "NULL", + "description": "Database of baseball players and play statistics, including 'Games_played', 'At_bats', 'Runs', 'Hits', 'Doubles', 'Triples', 'Home_runs', 'RBIs', 'Walks', 'Strikeouts', 'Batting_average', 'On_base_pct', 'Slugging_pct' and 'Fielding_ave'", + "citation": " @book{simonoff2003analyzing,title={Analyzing Categorical Data},author={Simonoff, J.S.},isbn={9780387007496},lccn={2003044946},series={Springer Texts in Statistics},url={https://books.google.com/books?id=G8wrifweAoC},year={2003},publisher={Springer New York}} ", + "license": " CC Public Domain Mark 1.0 ", + "source": "OpenML", + "sourceURI": "http://www.openml.org/d/185", + "approximateSize": "", + "datasetSchemaVersion": "4.0.0", + "redacted": false, + "datasetVersion": "4.0.0" + }, + "dataResources": [ + { + "resID": "learningData", + "resPath": "tables/learningData.csv", + "resType": "table", + "resFormat": { + "text/csv": [ + "csv" + ] + }, + "isCollection": false, + "columns": [ + { + "colIndex": 0, + "colName": "d3mIndex", + "colType": "integer", + "role": [ + "index" + ] + }, + { + "colIndex": 1, + "colName": "timestamp", + "colType": "integer", + "role": [ + "attribute" + ] + }, + { + "colIndex": 2, + "colName": "value_0", + "colType": "real", + "role": [ + "attribute" + ] + }, + { + "colIndex": 3, + "colName": "value_1", + "colType": "real", + "role": [ + "attribute" + ] + }, + { + "colIndex": 4, + "colName": "value_2", + "colType": "real", + "role": [ + "attribute" + ] + }, + { + "colIndex": 5, + "colName": "value_3", + "colType": "real", + "role": [ + "attribute" + ] + }, + { + "colIndex": 6, + "colName": "value_4", + "colType": "real", + "role": [ + "attribute" + ] + }, + { + "colIndex": 7, + "colName": "ground_truth", + "colType": "integer", + "role": [ + "suggestedTarget" + ] + } + ], + "columnsCount": 8 + } + ] +} \ No newline at end of file diff --git a/datasets/anomaly_reserve/yahoo_sub_5/TRAIN/dataset_TRAIN/tables/learningData.csv b/datasets/anomaly_reserve/yahoo_sub_5/TRAIN/dataset_TRAIN/tables/learningData.csv new file mode 100644 index 0000000..c07dc45 --- /dev/null +++ b/datasets/anomaly_reserve/yahoo_sub_5/TRAIN/dataset_TRAIN/tables/learningData.csv @@ -0,0 +1,1261 @@ +d3mIndex,timestamp,value_0,value_1,value_2,value_3,value_4,ground_truth +0,1,12183,0.0,3.7166666666667,5,2109,0 +1,2,12715,0.091757964510557,3.6108333333333,60,3229,0 +2,3,12736,0.17229675238449998,3.4813888888889,88,3637,0 +3,4,12716,0.22621935431999,3.3802777777778,84,1982,0 +4,5,12739,0.17635798469946,3.1933333333333,111,2751,0 +5,6,12737,0.090491245476051,2.7866666666667004,112,2128,0 +6,7,12857,0.08460994072769001,2.4627777777777995,1235,2109,0 +7,8,12884,0.06842699169496,2.2541666666667,710,2328,0 +8,9,12894,0.13330269689422,2.1180555555556,618,2453,0 +9,10,12675,0.085026586189321,2.0691666666667,84,2847,0 +10,11,13260,0.097073068447328,2.1972222222222,100,3659,0 +11,12,13470,0.0,2.3188888888889,125,5207,0 
+12,13,13060,0.031063767542922,2.34,114,5146,0 +13,14,12949,0.017732750501525,2.4902777777778,145,4712,0 +14,15,13035,0.063354504072079,2.6438888888889,91,6363,0 +15,16,12980,0.087870391896335,2.8486111111111003,94,5010,0 +16,17,13677,0.11546815687729,2.8833333333333,79,3956,0 +17,18,13381,0.073413457727404,2.8808333333333,50,4063,0 +18,19,12737,0.040392584616896,2.9005555555556,39,3748,0 +19,20,12554,0.08911335594722301,3.0855555555556,28,3047,0 +20,21,12470,0.098030053711531,3.3536111111111,29,4099,0 +21,22,12490,0.047140641497552,3.7438888888889,24,2122,0 +22,23,12539,0.10481279080241,3.7947222222222,19,3387,0 +23,24,12530,0.20478886838928,3.801111111111101,21,1950,0 +24,25,13002,0.04485100631921201,3.6508333333333,27,2927,0 +25,26,12989,0.1053622140254,3.555,46,1889,0 +26,27,13038,0.08436887679639,3.4769444444444,133,1910,0 +27,28,13011,0.097980673762982,3.2158333333333,143,3747,0 +28,29,12984,0.10165726215275,3.1141666666667,86,4994,0 +29,30,13079,0.056764513454874,2.7983333333333,118,2009,0 +30,31,13048,0.074428708878932,2.4252777777778,56,2899,0 +31,32,13096,0.091244453451818,2.14,92,2298,0 +32,33,13003,0.094529332881679,1.9822222222222,85,1894,0 +33,34,13057,0.016638011234698,1.9694444444444,122,1999,0 +34,35,13023,0.038096861957006005,2.0741666666667,74,3007,0 +35,36,13033,0.064497814457643,2.2505555555556,84,2838,0 +36,37,13034,0.030426401876334,2.2819444444444,54,4113,0 +37,38,13068,0.095423209955973,2.4216666666667,77,2150,0 +38,39,13057,0.069688744272108,2.5997222222222005,84,3007,0 +39,40,13047,0.03468622413034,2.7544444444444003,139,2484,0 +40,41,13795,0.089564461084836,2.7258333333333,65,2101,0 +41,42,13528,0.07337616196456799,2.8302777777778,38,2001,0 +42,43,13032,0.061939295606039,2.9422222222222,35,2102,0 +43,44,13084,0.11419089175512,3.0919444444444,47,2129,0 +44,45,13000,0.10475925920163,3.3519444444444,37,4422,0 +45,46,13008,0.079657960399444,3.6952777777778,53,4573,0 +46,47,12978,0.14475546275416,3.8269444444444,55,1989,0 +47,48,13067,0.1421711341096,3.7877777777778,45,1953,0 +48,49,13086,0.07696963969656899,3.7536111111111,46,1872,0 +49,50,13023,0.06393273436444799,3.61,35,1850,0 +50,51,13046,0.14973281021845006,3.5091666666667,68,2879,0 +51,52,13032,0.041478839355346,3.4205555555556,82,1840,0 +52,53,13012,0.089317973365284,3.2647222222222,154,2134,0 +53,54,13051,0.088820248166203,2.7944444444444,128,2234,0 +54,55,12979,0.054872994406929,2.46,79,3769,0 +55,56,13025,0.07913553329046401,2.2075,66,2717,0 +56,57,13007,0.16317996709063,2.1758333333333,92,2171,0 +57,58,13036,0.08671926699280201,2.3058333333333,67,2224,0 +58,59,13043,0.0733999511789,2.3983333333333,58,1967,0 +59,60,13023,0.0,2.55,58,2148,0 +60,61,13022,0.032756244361869,2.7302777777778,63,1978,0 +61,62,13033,0.054893891024455,2.8169444444444003,61,2021,0 +62,63,13024,0.068514114108229,2.9247222222222,55,2060,0 +63,64,13048,0.05279414163165401,2.8911111111111003,71,2096,0 +64,65,13740,0.023853017353212,2.9575,64,2082,0 +65,66,13540,0.07426125441559799,2.9080555555556,92,2175,0 +66,67,12724,0.024228588329879,3.0088888888889,44,2332,0 +67,68,13070,0.09233413002519697,3.2033333333333,35,2147,0 +68,69,13106,0.15930655332113,3.6213888888889,53,2163,0 +69,70,13025,0.12755838225296,4.0322222222222,49,2406,0 +70,71,13074,0.10152541717054,4.1227777777778,49,2022,0 +71,72,13079,0.040148453968243986,3.9736111111111,103,2188,0 +72,73,13184,0.087208372094752,3.8425,107,2758,0 +73,74,13194,0.074209918996797,3.7097222222222,74,2925,0 +74,75,13191,0.059044537369404015,3.6258333333333,56,3223,0 
+75,76,13059,0.06248169832921499,3.4705555555556,60,2507,0 +76,77,13169,0.08876527685714597,3.2877777777778,73,2435,0 +77,78,13114,0.051354431854972,2.9286111111111004,99,2552,0 +78,79,13037,0.074790104163639,2.4888888888889,84,2540,0 +79,80,13179,0.091817341555971,2.2744444444444,129,2642,0 +80,81,13152,0.14762794333026005,2.1733333333333,101,2254,0 +81,82,13095,0.07101004447510299,2.3416666666667,101,2539,0 +82,83,13144,0.07689756334240598,2.3808333333333,51,2596,0 +83,84,13170,0.08412575787388403,2.4663888888889,95,2573,0 +84,85,13162,0.06328921386603299,2.6608333333333,48,2302,0 +85,86,13117,0.057393902128707,2.7558333333333,40,2991,0 +86,87,13129,0.041819399065704,2.8636111111111004,55,3141,0 +87,88,13386,0.073729686380986,2.7586111111111005,56,3285,0 +88,89,13929,0.15365285617975,2.7377777777778,935,3807,0 +89,90,13385,0.060355859742407016,2.6961111111111005,34,2892,0 +90,91,13106,0.10644586288975,2.8569444444444,57,2538,0 +91,92,13113,0.059314286360126985,3.1833333333333,70,2234,0 +92,93,13155,0.096293806236591,3.5544444444444,72,2707,0 +93,94,13186,0.085101425467407,3.8894444444444,66,2382,0 +94,95,13151,0.11149072274185,4.1138888888889,72,2426,0 +95,96,13156,0.076266981262989,3.9519444444444,49,2451,0 +96,97,12813,0.097952120177625,3.8275,41,2288,0 +97,98,12821,0.17250021935572,3.6438888888889,42,2256,0 +98,99,12867,0.11389182319254,3.5608333333333,39,2884,0 +99,100,12837,0.08999961787521,3.5013888888889,81,2398,0 +100,101,12911,0.048649372449385005,3.3088888888889,90,2239,0 +101,102,12842,0.13861764684085998,2.9063888888889,92,2248,0 +102,103,12905,0.1088795585287,2.5027777777777995,81,2387,0 +103,104,12993,0.054235162564995,2.2466666666667003,145,3876,0 +104,105,12974,0.0390040506742,2.1869444444444,47,3073,0 +105,106,13039,0.0744713077811,2.2402777777778,63,3113,0 +106,107,13322,0.040258943675435,2.3727777777778,118,3363,0 +107,108,13606,0.0,2.4566666666667003,56,3796,0 +108,109,13536,0.027955712584728,2.5452777777777995,127,4924,0 +109,110,13341,0.047309968420241,2.6830555555556,48,4300,0 +110,111,13360,0.016602764360002,2.805,114,5225,0 +111,112,13450,0.042432577628353986,2.7386111111111004,78,4047,0 +112,113,14102,0.051191743726563,2.7438888888888995,58,4134,0 +113,114,14026,0.0,2.7586111111111005,56,4786,0 +114,115,13162,0.056724832354639,2.9013888888889,67,4184,0 +115,116,13118,0.055771058827737,3.19,155,2888,0 +116,117,12953,0.081014772096658,3.5561111111111003,123,2674,0 +117,118,12854,0.08253629738290899,3.8433333333333,118,2574,0 +118,119,12952,0.11499203730886,4.0319444444444,133,3123,0 +119,120,12915,0.07668513845109799,3.8844444444444,75,3369,0 +120,121,11994,0.070057457403873,3.6908333333333,29,3284,0 +121,122,11868,0.07031477357556501,3.6141666666667,68,2127,0 +122,123,11977,0.091946448716499,3.5019444444444,91,2117,0 +123,124,11874,0.14560588482235998,3.4205555555556,101,2271,0 +124,125,11913,0.094774329323472,3.1780555555556,22,2513,0 +125,126,11933,0.10217989327054,2.8361111111111,20,2746,0 +126,127,11844,0.04854243074027901,2.5222222222222004,27,2076,0 +127,128,11968,0.068760549683423,2.2416666666667004,45,2297,0 +128,129,11996,0.075440683881139,2.1588888888889,42,2312,0 +129,130,12006,0.11771339431815,2.2763888888889,59,2834,0 +130,131,12225,0.069437397660265,2.3391666666667,52,3584,0 +131,132,12482,0.0,2.4841666666667,62,4009,0 +132,133,12289,0.0,2.4911111111111,81,4142,0 +133,134,12219,0.0,2.6922222222222,84,3876,0 +134,135,12282,0.027395404320488,2.8205555555556,104,4098,0 +135,136,12367,0.055202605299814,2.8216666666667,111,3831,0 
+136,137,13042,0.078387348178452,2.7122222222222,91,3842,0 +137,138,12665,0.11851571646444,2.6744444444444,33,4129,0 +138,139,12133,0.068395341911942,2.8097222222222,26,3509,0 +139,140,12023,0.04720597158087901,3.1838888888889,37,2450,0 +140,141,11847,0.07910648512645599,3.5130555555556,23,2270,0 +141,142,11980,0.067550601916344,3.7722222222222,29,2360,0 +142,143,12026,0.080666570182724,3.9058333333333,45,2431,0 +143,144,11852,0.044973875852863,3.7697222222222,49,2042,0 +144,145,12152,0.065734580284861,3.6027777777778,27,1833,0 +145,146,12148,0.068759646748575,3.5038888888889,46,1852,0 +146,147,12236,0.027278224398313,3.445,39,1927,0 +147,148,12155,0.067695565422881,3.3494444444444,72,1999,0 +148,149,12113,0.07244669924777,3.1961111111111005,81,2030,0 +149,150,12175,0.028882930937168,2.8905555555556,64,1963,0 +150,151,12103,0.021568136982842,2.5805555555556,79,2116,0 +151,152,12206,0.064254625408408,2.3380555555556004,132,2461,0 +152,153,12239,0.073869151016554,2.2116666666667,127,2388,0 +153,154,12398,0.026644044055307004,2.2013888888889,121,2846,0 +154,155,12582,0.051289858799957,2.3236111111111,98,2974,0 +155,156,12705,0.099217337562612,2.3002777777778,128,3776,0 +156,157,12555,0.016615805334675,2.385,158,3885,0 +157,158,12476,0.078387348178452,2.5597222222222005,78,3865,0 +158,159,12706,0.0,2.6941666666667,65,4319,0 +159,160,12671,0.049384244324413,2.7169444444444,81,4646,0 +160,161,13277,0.043044731483849,2.6369444444444,586,3873,0 +161,162,12757,0.04215504851616,2.6572222222222,48,3489,0 +162,163,12401,0.042236538352835,2.8466666666667004,38,2790,0 +163,164,12248,0.1001564296112,3.1955555555556,30,2641,0 +164,165,12156,0.17378132267942994,3.5633333333333,28,2960,0 +165,166,12210,0.12005519462968,3.8113888888889,36,2192,0 +166,167,11983,0.14491137762023998,3.9655555555556,50,2145,0 +167,168,12374,0.07336941078506799,3.8483333333333,47,2133,0 +168,169,12230,0.12395626148952,3.6441666666667,82,2330,0 +169,170,12200,0.15077430423660998,3.5213888888889,56,2235,0 +170,171,12135,0.18960071033689,3.4702777777778,140,2258,0 +171,172,12131,0.06051348935254,3.3033333333333,145,2200,0 +172,173,12165,0.072057993662839,3.1933333333333,114,2161,0 +173,174,12193,0.082361078437032,2.8183333333333,129,2159,0 +174,175,12165,0.12343775199876,2.52,143,2088,0 +175,176,12304,0.1071817784483,2.2886111111111,113,2473,0 +176,177,12275,0.10359394556779,2.0822222222222,108,3217,0 +177,178,12369,0.021162435488903,2.1416666666667,93,2994,0 +178,179,12569,0.074524398314698,2.2688888888889,63,3827,0 +179,180,12766,0.12687067454443,2.335,103,4176,0 +180,181,12621,0.041752618326160014,2.4388888888889,114,4227,0 +181,182,12611,0.0,2.5386111111111,67,4290,0 +182,183,12618,0.040819652463459,2.6288888888889,106,4691,0 +183,184,12631,0.082668981599835,2.7511111111111,160,4442,0 +184,185,13121,0.06181362481077901,2.7744444444444,81,5775,0 +185,186,12871,0.0,2.8297222222222,113,3840,0 +186,187,12252,0.076137992226715,2.9708333333333,37,3721,0 +187,188,12155,0.12107639529965,3.1333333333333,70,2498,0 +188,189,12186,0.0,3.3544444444444,82,2265,0 +189,190,12179,0.19840339729984,3.6780555555556,76,2451,0 +190,191,12109,0.20112394005693,3.8038888888889,59,2892,0 +191,192,12142,0.096833471661634,3.8177777777778,58,2166,0 +192,193,12145,0.10338450919956,3.6916666666667,49,2040,0 +193,194,12162,0.10142513773096,3.5197222222222,36,2013,0 +194,195,12165,0.09779274451732,3.5186111111111003,111,2000,0 +195,196,12125,0.14744152252573,3.2597222222222,81,2117,0 +196,197,12097,0.083396348606149,3.0930555555556,92,2775,0 
+197,198,12099,0.095637498006913,2.7825,113,2116,0 +198,199,12140,0.14768844039376006,2.4494444444444,90,1991,0 +199,200,12188,0.1131872329372,2.2369444444444,183,3162,0 +200,201,12157,0.073729686380986,2.0961111111111,117,2958,0 +201,202,12128,0.064614077523704,2.0377777777778,110,3153,0 +202,203,12190,0.056019959597275015,2.0730555555556003,179,2190,0 +203,204,12151,0.074812141908008,2.1655555555556,134,2172,0 +204,205,12214,0.02489388427845201,2.285,135,2074,0 +205,206,12275,0.023695834967821,2.4283333333333,100,2078,0 +206,207,12164,0.058680009072634,2.6186111111111,47,2406,0 +207,208,12120,0.10008779345816002,2.7372222222222,88,2018,0 +208,209,12693,0.066566772961868,2.8266666666667004,74,2091,0 +209,210,12624,0.070501147961051,2.8469444444444,58,2310,0 +210,211,12163,0.098779019649936,2.9855555555556,100,2113,0 +211,212,12100,0.11803653713501,3.1038888888889,49,2518,0 +212,213,12162,0.10076746585103,3.4058333333333,36,2605,0 +213,214,12106,0.053210709415363,3.6138888888889,40,2680,0 +214,215,12156,0.099346579713514,3.93,50,2228,0 +215,216,12120,0.047275248011591,3.8155555555556,58,2023,0 +216,217,12420,0.091262209791582,3.6588888888889,50,3702,0 +217,218,12417,0.038593218846488,3.5913888888889,53,1992,0 +218,219,12450,0.070273907645883,3.4644444444444003,93,1988,0 +219,220,12395,0.029431888410363,3.3944444444444,78,1919,0 +220,221,12382,0.096854769984307,3.2227777777778,84,2213,0 +221,222,12438,0.11656453357642,2.7961111111111,112,2181,0 +222,223,12363,0.12109055114779,2.4383333333333,73,2152,0 +223,224,12393,0.20381554615786,2.2647222222222005,91,2393,0 +224,225,12399,0.046311768005022014,2.1886111111111,114,2173,0 +225,226,12456,0.18261306403662,2.2825,127,2109,0 +226,227,12442,0.021992750543024,2.3333333333333,69,3606,0 +227,228,12481,0.088072259040681,2.445,59,2114,0 +228,229,12432,0.037896500450725,2.5811111111111,64,2135,0 +229,230,12403,0.09882843339863,2.7094444444444,75,2303,0 +230,231,12406,0.076277687882641,2.88,44,2137,0 +231,232,12462,0.022875979046571,2.8555555555556,52,2264,0 +232,233,13034,0.10022162220861,2.7791666666667,42,2245,0 +233,234,12830,0.08117200437078799,2.7772222222222,45,2151,0 +234,235,12439,0.09750667785645803,3.02,26,2330,0 +235,236,12541,0.05680722879784299,3.2213888888888995,29,3357,0 +236,237,12462,0.12240855732315,3.6211111111111,32,3152,0 +237,238,12394,0.1715485140175,4.0219444444444,44,2693,0 +238,239,12507,0.075015592829224,4.0980555555556,41,3798,0 +239,240,12512,0.11388410095531,3.9080555555556,42,4596,0 +240,241,12093,0.10519027968795,3.7269444444444,46,2529,0 +241,242,12197,0.1150532998405,3.6244444444444,40,2124,0 +242,243,12138,0.10890530980571,3.5252777777778,64,2762,0 +243,244,12174,0.099350621485086,3.4675,70,2973,0 +244,245,12163,0.12889794040441002,3.3316666666667003,69,3041,0 +245,246,12096,0.12069378235889,2.9497222222222,73,2179,0 +246,247,12166,0.13053034917739,2.5708333333333,85,2322,0 +247,248,12187,0.078977758004111,2.3086111111111,63,2274,0 +248,249,12246,0.08088416337864099,2.2311111111111,67,2448,0 +249,250,12335,0.04008956024204,2.3119444444444,68,3811,0 +250,251,12556,0.05063725351997099,2.3536111111111,62,3761,0 +251,252,12652,0.039066291775136,2.4819444444444,69,4269,0 +252,253,12646,0.028611752774164,2.6605555555556,82,4244,0 +253,254,12803,0.040593364983329,2.7527777777778,56,4417,0 +254,255,12570,0.038807415292018,3.0741666666667005,38,3758,0 +255,256,12633,0.07832796288132203,2.8522222222222,30,4375,0 +256,257,13146,0.066320996162546,2.7277777777778,48,4158,0 
+257,258,12994,0.083175583471284,2.7502777777778,63,3410,0 +258,259,12314,0.06802464587725401,2.8797222222222,34,2853,0 +259,260,12193,0.051675070535006,3.2027777777778,11,2628,0 +260,261,12127,0.044129112207997014,3.5633333333333,22,2287,0 +261,262,12140,0.037685894365982006,3.8808333333333,22,3334,0 +262,263,12174,0.093414561465838,4.0352777777778,12,2795,0 +263,264,12180,0.06987083046098,3.8966666666667,10,2089,0 +264,265,12861,0.021992750543024,3.7225,14,2260,0 +265,266,12957,0.11305566197523,3.73,39,3176,0 +266,267,12981,0.030884138240845,3.5558333333333,55,4049,0 +267,268,12958,0.10381377439313,3.3169444444444003,90,2902,0 +268,269,12913,0.048953768695625004,3.2322222222222,68,3743,0 +269,270,12939,0.042258794089861,2.8658333333333,95,4280,0 +270,271,12933,0.048388685585470985,2.5169444444444,70,3977,0 +271,272,13006,0.034197830567692,2.3,96,4518,0 +272,273,13091,0.08835953066771099,2.1888888888889,45,2707,0 +273,274,13201,0.086890518272785,2.2030555555556,96,3522,0 +274,275,13520,0.031087561676959,2.2711111111111,74,4584,0 +275,276,13675,0.071287463233942,2.4697222222222,82,4141,0 +276,277,13594,0.14372616993938,2.5988888888889,82,4831,0 +277,278,13466,0.12647517487142998,2.7258333333333,45,3991,0 +278,279,13448,0.042854531198562,2.7858333333333,134,4645,0 +279,280,13492,0.039930389849144,2.7922222222222,119,4967,0 +280,281,14123,0.076184645265048,2.6988888888889,86,4578,0 +281,282,13839,0.037830020408535,2.7663888888889,75,4972,0 +282,283,13335,0.030884138240845,2.8938888888889,45,5522,0 +283,284,13196,0.048316550276279,3.1875,50,2832,0 +284,285,13047,0.10986585566763,3.6463888888889,31,2826,0 +285,286,13008,0.025485002897852004,3.866666666666701,88,2855,0 +286,287,12763,0.12451757643335,3.9808333333333,42,2660,0 +287,288,12949,0.12875690949235,3.8277777777778,70,2447,0 +288,289,13009,0.15720639094135,3.6269444444444,106,2545,0 +289,290,13008,0.079092017261926,3.5266666666667,44,3842,0 +290,291,12890,0.14711499890479998,3.5077777777778,57,2332,0 +291,292,13004,0.0531410973178,3.3455555555556,95,2294,0 +292,293,12918,0.10136246281349,3.1241666666667003,91,3016,0 +293,294,12910,0.053119315802353,2.8713888888889,66,3944,0 +294,295,12915,0.11313351589999003,2.5133333333333,66,2332,0 +295,296,13121,0.076760188212735,2.2197222222222,82,2405,0 +296,297,13076,0.08890522133351199,2.205,73,2572,0 +297,298,13096,0.1009555130175,2.2677777777778,69,2558,0 +298,299,13339,0.15685427502807,2.2991666666667,107,3701,0 +299,300,13635,0.11090638960365,2.4277777777778,101,4228,0 +300,301,13493,0.054798089981891,2.5333333333333,66,3990,0 +301,302,13402,0.08461316628091001,2.6422222222222005,47,4707,0 +302,303,13417,0.15790425505315,2.8211111111111005,47,3857,0 +303,304,13382,0.021675109392134,2.7625,66,3874,0 +304,305,14199,0.14112049645292002,2.7391666666667,102,4369,0 +305,306,13973,0.059612111520904,2.7525,71,4488,0 +306,307,13284,0.067835890522602,2.8644444444444,53,3637,0 +307,308,13070,0.047414460026828,3.1927777777778,28,2705,0 +308,309,12983,0.050348669783997005,3.5872222222222,24,2429,0 +309,310,13075,0.07296715773193299,3.8305555555556,23,2839,0 +310,311,12991,0.10713527159169,3.8827777777778,30,2371,0 +311,312,12993,0.073622496612493,3.7291666666667,25,2758,0 +312,313,13121,0.11556476355437,3.6172222222222,29,2291,0 +313,314,13097,0.034160489683707995,3.4491666666667005,27,2220,0 +314,315,13150,0.019571935182124,3.4097222222222,77,2620,0 +315,316,13078,0.15720996206912,3.2605555555556,46,2467,0 +316,317,13140,0.11515041454164,3.2191666666667,86,2088,0 
+317,318,13102,0.086415715789296,2.9586111111111,97,2137,0 +318,319,13110,0.092606306920552,2.6036111111111,88,2907,0 +319,320,13138,0.046458579038692015,2.3319444444444,110,2558,0 +320,321,13238,0.10977831600416,2.2025,89,2823,0 +321,322,13317,0.11090009191451,2.2711111111111,134,2465,0 +322,323,13512,0.076652795374797,2.2897222222222005,84,4399,0 +323,324,13669,0.1087202400467,2.3297222222222005,109,4088,0 +324,325,13651,0.11471628863897,2.395,57,5099,0 +325,326,13580,0.11070024667119,2.5063888888889,49,5157,0 +326,327,13538,0.026827723134058,2.7077777777778,83,3782,0 +327,328,13657,0.029426630692549,2.735,101,4008,0 +328,329,14183,0.028611752774164,2.6958333333333,88,4534,0 +329,330,14117,0.053106181092382014,2.6930555555556,56,3242,0 +330,331,13166,0.055538160906184006,2.875,31,2808,0 +331,332,13265,0.11009690391165,3.1788888888888995,22,3676,0 +332,333,13085,0.10979978093137,3.5808333333333,32,3523,0 +333,334,13167,0.036174223284821,3.8508333333333,27,3038,0 +334,335,13170,0.048361321378982,3.9180555555556,17,2299,0 +335,336,13132,0.10958125953198,3.815,27,2345,0 +336,337,13055,0.047305343559722,3.6080555555556,38,2565,0 +337,338,13025,0.045316868664604014,3.4927777777778,73,2576,0 +338,339,13076,0.13255054531036,3.4316666666667004,56,2327,0 +339,340,13044,0.079695587369141,3.3436111111111004,49,2211,0 +340,341,13035,0.10277355185943,3.0663888888889,90,2642,0 +341,342,13103,0.15061124796385,2.7894444444444,106,3646,0 +342,343,13067,0.14509169704095,2.4994444444444,51,2281,0 +343,344,13183,0.054445250001619004,2.2544444444444,99,2474,0 +344,345,13144,0.082058799915824,2.0847222222222,104,2536,0 +345,346,13166,0.042151311782819015,2.0888888888889,119,2900,0 +346,347,13406,0.057404703309705984,2.1594444444444,73,3144,0 +347,348,13544,0.040891918425583,2.2533333333333,92,3725,0 +348,349,13608,0.045224636676715,2.3880555555556,57,4305,0 +349,350,13522,0.0,2.6338888888889,100,3665,0 +350,351,13595,0.0,2.6588888888889,93,3791,0 +351,352,13420,0.10335456693443,2.7586111111111005,111,3897,0 +352,353,14163,0.033846222120808,2.8797222222222,91,3494,0 +353,354,13678,0.026167129419328,2.785,43,3353,0 +354,355,13272,0.08571767780871499,2.8219444444444,91,2741,0 +355,356,13071,0.12459953631184,3.0055555555556,63,2463,0 +356,357,13004,0.054750658073534006,3.2936111111111,60,3477,0 +357,358,13068,0.20799106772677,3.5575,56,2792,0 +358,359,13031,0.10314231079956,3.676111111111101,59,2183,0 +359,360,13013,0.12212653292147,3.7166666666667,48,2874,0 +360,361,12998,0.19159058299176,3.6013888888889,65,2147,0 +361,362,12971,0.10782180851978,3.4455555555556,77,2754,0 +362,363,13000,0.06408869538637901,3.4166666666667003,60,2007,0 +363,364,12998,0.095540168894753,3.1791666666667004,94,2564,0 +364,365,12906,0.039360296791109,3.0013888888889,84,3020,0 +365,366,12969,0.086611479249287,2.72,99,2004,0 +366,367,12963,0.05845507441603001,2.4527777777778,61,2047,0 +367,368,12933,0.051490800079599004,2.1816666666667,60,3531,0 +368,369,12990,0.075496432869001,2.0161111111111,78,2383,0 +369,370,12980,0.10358625218721,1.9769444444444,81,2112,0 +370,371,12982,0.062806431427897,2.0597222222222,61,2554,0 +371,372,12989,0.08970338978685001,2.2111111111111,68,2371,0 +372,373,13073,0.094517316130968,2.3141666666667,53,2060,0 +373,374,12950,0.032322011663911,2.4280555555556003,49,2086,0 +374,375,12990,0.047911560407608,2.5855555555556,40,2130,0 +375,376,13035,0.062001214431213,2.6977777777778,125,2072,0 +376,377,13681,0.027102718749392,2.7777777777778,61,2033,0 +377,378,13304,0.034703114844079,2.7988888888889,111,2683,0 
+378,379,12965,0.066236017573192,2.8927777777778,32,2046,0 +379,380,12966,0.032230355211769,3.0413888888889,21,2064,0 +380,381,12943,0.11559664215716,3.3569444444444,14,2067,0 +381,382,12958,0.021952502374124,3.4808333333333,32,2496,0 +382,383,13005,0.13347711194703,3.764166666666701,29,4758,0 +383,384,12923,0.10579408349834,3.8097222222222,26,2806,0 +384,385,12812,0.10679035350244,3.6911111111111,52,2227,0 +385,386,12803,0.068633627680319,3.4902777777778,39,3123,0 +386,387,12850,0.04699518011436099,3.3769444444444,78,3460,0 +387,388,12797,0.14159640074335994,3.3011111111111004,78,3587,0 +388,389,12732,0.078500039299167,3.1369444444444,83,2558,0 +389,390,12817,0.049232295047845,2.8475,63,2306,0 +390,391,12818,0.078777592482879,2.4544444444444,108,2083,0 +391,392,12815,0.08993433499951,2.1247222222222,158,3073,0 +392,393,12805,0.081869163858473,2.0266666666667,115,3325,0 +393,394,12703,0.14556064903749,2.1763888888889,112,2321,0 +394,395,12771,0.0,2.3088888888889,73,2846,0 +395,396,12847,0.0,2.4213888888889,93,2482,0 +396,397,12872,0.030693547421212,2.6436111111111,65,2306,0 +397,398,12815,0.0,2.6602777777778,91,2298,0 +398,399,12844,0.046999447831427,2.7677777777778,106,2907,0 +399,400,12811,0.028815579681692,2.8066666666667004,66,2329,0 +400,401,13472,0.0,2.7661111111111003,26,2456,0 +401,402,13063,0.039360296791109,2.8133333333333,23,2178,0 +402,403,12833,0.039570832199428,2.9186111111111,24,2142,0 +403,404,12842,0.090659246308087,3.1930555555556,19,2277,0 +404,405,12804,0.10540579050057003,3.565,23,3066,0 +405,406,12852,0.062601610466313,3.9133333333333,30,3619,0 +406,407,12862,0.051455855638306,3.9658333333333,23,3726,0 +407,408,12799,0.054631758648785014,3.8930555555556,35,2282,0 +408,409,12789,0.09017822949731,3.7297222222222,41,3079,0 +409,410,12815,0.045287525091609014,3.6516666666667,63,2448,0 +410,411,12887,0.033344698319951,3.5927777777778,33,2574,0 +411,412,12903,0.080098394586215,3.4694444444444,50,3697,0 +412,413,12892,0.025162301034707,3.2536111111111,88,3067,0 +413,414,12907,0.078260793447992,2.8986111111111,115,3491,0 +414,415,12883,0.07223863924679201,2.4488888888889,69,3195,0 +415,416,12965,0.042917873674349,2.2119444444444,116,2763,0 +416,417,12932,0.04720597158087901,2.2011111111111,73,2605,0 +417,418,13134,0.048273008229067,2.2338888888889,75,2755,0 +418,419,13440,0.036987975876273,2.3116666666667003,56,3300,0 +419,420,13544,0.06291463671717,2.3869444444444,66,3838,0 +420,421,13508,0.033319304393751,2.5119444444444,70,3608,0 +421,422,13401,0.029115275623859,2.5713888888889,52,3845,0 +422,423,13410,0.06821638123436,2.5088888888889,32,3563,0 +423,424,13482,0.015408589348188,2.4155555555556,16,5478,0 +424,425,14124,0.01916018435633,3.6455555555556,46,3656,0 +425,426,13703,0.06374239746477901,2.4625,53,3491,0 +426,427,13250,0.099738890728803,2.5808333333333,67,3430,0 +427,428,13092,0.10950621554455,3.0033333333333,58,2807,0 +428,429,13012,0.06138920621589401,3.3486111111111003,17,2524,0 +429,430,12901,0.051307638060244014,3.6644444444444,26,2964,0 +430,431,12848,0.082471571552878,4.0083333333333,13,3969,0 +431,432,13025,0.060122448878635,3.8530555555556,8,3561,0 +432,433,11352,0.07469842969719999,3.6183333333333,20,3394,0 +433,434,8761,0.056170625137636994,3.4922222222222,23,3005,0 +434,435,10433,0.052668952946361,3.4958333333333,34,2350,0 +435,436,10088,0.068871884486763,3.2738888888889,35,2139,0 +436,437,9485,0.040236057110938986,3.2102777777778,48,2098,0 +437,438,8865,0.053200012471363,2.8475,67,2341,0 +438,439,8920,0.056725172482788,2.4883333333332995,38,2698,0 
+439,440,8798,0.035229341473877,2.1955555555556003,33,2968,0 +440,441,8927,0.0,2.1461111111111,40,2824,0 +441,442,9211,0.020190723068726,2.1522222222222,37,3003,0 +442,443,9286,0.093342961377898,2.3122222222222004,51,3551,0 +443,444,9725,0.0,2.4033333333333,52,4689,0 +444,445,11050,0.015717168144981003,2.4944444444444,57,3481,0 +445,446,11521,0.017190609993733997,2.6622222222222005,82,3376,0 +446,447,11603,0.0,2.675,74,3198,0 +447,448,11665,0.043273461915965,2.6997222222222,80,3059,0 +448,449,12153,0.029854520963498,2.6997222222222,78,2937,0 +449,450,11672,0.017383620014121998,2.7194444444444,58,2881,0 +450,451,11119,0.046391383573699006,2.8258333333333,41,2777,0 +451,452,11124,0.042155878228,3.1044444444444,34,2510,0 +452,453,10734,0.052684222339579014,3.4736111111111003,35,2356,0 +453,454,11612,0.063573954212613,3.6972222222222,40,2383,0 +454,455,11523,0.077413583128967,3.8038888888889,35,2455,0 +455,456,11632,0.069605078732108,3.7494444444444,37,2285,0 +456,457,12838,0.075937967855042,3.6813888888889,43,2455,0 +457,458,11637,0.047354002438352014,3.4791666666667003,45,4298,0 +458,459,12542,0.044000040388062,3.4530555555556,48,2400,0 +459,460,12394,0.095130971924595,3.2841666666667004,77,3431,0 +460,461,12419,0.069274987547704,3.205,79,2252,0 +461,462,12484,0.061118974117397,2.8436111111111004,59,2628,0 +462,463,12413,0.056393740750134,2.4441666666667,107,3266,0 +463,464,12440,0.06125086589409901,2.275,100,2620,0 +464,465,12614,0.047746883512707,2.1788888888889,84,2824,0 +465,466,12693,0.047136440673386,2.2083333333333,99,2801,0 +466,467,12989,0.0,2.2997222222222,103,3106,0 +467,468,13200,0.0,2.3155555555556004,47,3532,0 +468,469,13108,0.049828520132601,2.41,67,4210,0 +469,470,12886,0.0,2.5902777777778,65,3646,0 +470,471,13000,0.0,2.6636111111111,65,3768,0 +471,472,13071,0.043576825212604,2.7105555555556,70,5342,0 +472,473,13563,0.035173891965945,2.6811111111111,76,5327,0 +473,474,13333,0.04413510379665099,2.715,40,3363,0 +474,475,12672,0.016955671451488998,2.7083333333333,54,3016,0 +475,476,12547,0.1330396486107,3.0038888888889,45,3257,0 +476,477,12289,0.016462114132943,3.3911111111111003,32,2619,0 +477,478,12584,0.055696363369897,3.6375,26,2573,0 +478,479,12526,0.036411774365825,3.7755555555556,25,2575,0 +479,480,12416,0.047966724418057,3.5786111111111003,34,5355,0 +480,481,12450,0.05609961782665,3.4222222222222,43,5809,0 +481,482,12460,0.096990479781121,3.2538888888889,68,3823,0 +482,483,12425,0.11147038220964,3.1683333333333,60,3116,0 +483,484,12430,0.044797927381498,3.0677777777778,74,2321,0 +484,485,12418,0.024403519177111,2.94,68,2193,0 +485,486,12437,0.08532776818426499,2.7291666666667003,43,2982,0 +486,487,12484,0.043615168647623,2.4147222222222005,73,4140,0 +487,488,12380,0.056692005942856,2.1419444444444,72,2353,0 +488,489,12620,0.033708553131457,2.0244444444444,66,3350,0 +489,490,12674,0.040148453968243986,2.0458333333333,90,3184,0 +490,491,12855,0.099551526697496,2.09,104,3469,0 +491,492,13053,0.0,2.1575,114,4204,0 +492,493,12898,0.036157867549894,2.2655555555556,98,6447,0 +493,494,12809,0.052738784696875,2.2561111111111,70,4898,0 +494,495,12964,0.021636091422947,2.4669444444444,101,3633,0 +495,496,12956,0.037120220639643986,2.5277777777778,77,4189,0 +496,497,13625,0.034467327401996005,2.5266666666667,69,4012,0 +497,498,13285,0.0,2.5438888888889,19,4009,0 +498,499,12715,0.096807019710259,2.6511111111111,47,4346,0 +499,500,12637,0.059601475230884,2.9711111111111004,38,2781,0 +500,501,12535,0.068431521141608,3.2288888888889,22,2811,0 
+501,502,12512,0.09611085542804,3.505,20,2415,0 +502,503,12549,0.064177980162036,3.4944444444444,26,3589,0 +503,504,12567,0.11565746993409,3.4633333333333,24,2878,0 +504,505,12362,0.073501732487291,3.3177777777778,27,3471,0 +505,506,12326,0.072746100819649,3.1963888888889,25,2697,0 +506,507,12450,0.07557888002360401,3.1069444444444,57,2583,0 +507,508,12404,0.036816888038697,3.0172222222222,58,3173,0 +508,509,12362,0.093969235453559,2.9247222222222,81,3341,0 +509,510,12431,0.034848294186597004,2.5336111111111,81,2305,0 +510,511,12351,0.084191269180943,2.2480555555556,69,2186,0 +511,512,12528,0.13109036514766,2.0383333333333,50,4439,0 +512,513,12559,0.061132356147447,1.8852777777778,55,3173,0 +513,514,12586,0.019478099970089,1.9225,57,2831,0 +514,515,12864,0.0,1.9719444444444,78,16385,0 +515,516,13026,0.0,2.0608333333333,57,83955,0 +516,517,12880,0.017965204407153,2.16,78,4574,0 +517,518,12743,0.019202263481759,2.3077777777778,95,4987,0 +518,519,12812,0.0,2.415,88,5110,0 +519,520,12878,0.052306327013631,2.4669444444444,108,4893,0 +520,521,13427,0.08536575533023,2.5125,87,3807,0 +521,522,13081,0.052461360256699015,2.6294444444444,87,3447,0 +522,523,12752,0.035302992848671,2.8183333333333,44,4329,0 +523,524,12594,0.028682734942579,3.0547222222222,39,5166,0 +524,525,12507,0.024204462299365,3.33,27,3454,0 +525,526,12494,0.034360100307537,3.5738888888889,23,3578,0 +526,527,12487,0.018977302969238,3.6888888888889,11,2406,0 +527,528,12404,0.034308847257872,3.7111111111111,13,2073,0 +528,529,11147,0.07460088255490599,3.7180555555556,24,1925,0 +529,530,11147,0.055037935083209005,3.6041666666667,77,2357,0 +530,531,11128,0.039311673522385,3.4483333333333,54,1947,0 +531,532,11106,0.046619928266775,3.2413888888888995,45,1912,0 +532,533,11115,0.048227542028921,3.1355555555556,36,2107,0 +533,534,11044,0.020367863848114,2.8172222222222,59,2985,0 +534,535,11110,0.063069968046591,2.4275,81,2081,0 +535,536,11190,0.054470866056974016,2.2513888888889,50,2631,0 +536,537,11063,0.0,2.0691666666667,53,2130,0 +537,538,11078,0.059261864411046,2.0155555555556,44,2085,0 +538,539,11146,0.064174002348993,2.0952777777778,87,2211,0 +539,540,11010,0.0,2.2397222222222,94,2105,0 +540,541,11139,0.021912411214588,2.3275,128,2585,0 +541,542,11117,0.057958262002105985,2.5255555555556004,82,3695,0 +542,543,11081,0.035358633773416,2.665,49,3198,0 +543,544,11128,0.029191244440103,2.7975,79,3191,0 +544,545,11720,0.054981313823219,2.8597222222222,62,2016,0 +545,546,11384,0.06405347705857799,2.7983333333333,64,2124,0 +546,547,11018,0.0,2.9322222222222,34,2105,0 +547,548,11104,0.055445634363329,3.08,41,2031,0 +548,549,11084,0.040996998867197,3.3466666666667004,47,1964,0 +549,550,11106,0.027670189755404,3.6869444444444,31,2016,0 +550,551,11055,0.054579839310753,3.7966666666667,26,3909,0 +551,552,11098,0.044833640073299014,3.7805555555556,17,2105,0 +552,553,11028,0.03282297151413,3.7422222222222,30,2405,0 +553,554,11152,0.017696014614986,3.639166666666701,17,2141,0 +554,555,11025,0.09418709999244,3.4775,28,1910,0 +555,556,11015,0.061817529149429,3.3283333333333,20,1951,0 +556,557,11125,0.054000161367618,3.1702777777778,85,2310,0 +557,558,11035,0.06165600249599,2.7688888888889,52,2047,0 +558,559,11103,0.055915839259234,2.4266666666667,143,2048,0 +559,560,11100,0.062788330996733,2.1963888888889,106,3083,0 +560,561,11170,0.044888048273534,2.135,244,3619,0 +561,562,11078,0.095259484956337,2.3186111111111,2005,2172,0 +562,563,11150,0.021952502374124,2.3383333333333,124,3142,0 +563,564,11149,0.0,2.5002777777778,109,2256,0 
+564,565,10984,0.0,2.6527777777778,148,2200,0 +565,566,11034,0.0,2.7661111111111003,126,2183,0 +566,567,11050,0.061557079663167,2.7347222222222,46,2030,0 +567,568,11102,0.14186075040414,2.6069444444444,49,2297,0 +568,569,11743,0.0,2.5547222222222,40,2213,0 +569,570,11371,0.077457673524504,2.4716666666667004,39,4014,0 +570,571,11078,0.16422977329792998,2.6530555555556004,25,2809,0 +571,572,11224,0.049366067455729,2.9488888888889,37,2355,0 +572,573,11146,0.10064381631633,3.3383333333333,32,2372,0 +573,574,11199,0.11909159312806,3.5419444444444,47,2387,0 +574,575,11181,0.09003816676619801,5.3302777777778,34,2359,0 +575,576,11022,0.055882659245704,3.7727777777778,40,2485,0 +576,577,11073,0.1836893913223,3.6333333333333,46,3728,0 +577,578,11120,0.08574268253550299,3.5430555555556,35,2820,0 +578,579,11008,0.12559700716583,3.6711111111111,61,2426,0 +579,580,11078,0.086129850619071,3.4572222222222,56,2307,0 +580,581,11121,0.041752618326160014,3.2,72,2233,0 +581,582,11041,0.094396473652892,2.7772222222222,110,2178,0 +582,583,11168,0.045323960075285004,2.415,135,2243,0 +583,584,11213,0.13808411333909,2.2530555555556004,133,2713,0 +584,585,11238,0.08029349854683501,2.0994444444444,148,3168,0 +585,586,11273,0.06507307495461,2.1780555555556003,86,3163,0 +586,587,11479,0.084518021856329,2.2638888888889,132,3289,0 +587,588,11839,0.030507395540508,2.3575,73,4001,0 +588,589,11735,0.05892502921299701,2.4680555555556003,95,4684,0 +589,590,11574,0.0,2.6208333333333,74,4137,0 +590,591,11531,0.033075906123641,2.6863888888889,51,4787,0 +591,592,11420,0.16633704704670998,2.6172222222222,65,4278,0 +592,593,12301,0.10228536028167,2.6194444444444,95,3898,0 +593,594,11845,0.16949365549682996,2.6358333333333,72,3728,0 +594,595,11374,0.08260397756200501,2.8661111111111004,41,4047,0 +595,596,11370,0.024378363844868,3.0533333333333,38,3373,0 +596,597,11197,0.15686874147816002,3.4438888888889,32,2669,0 +597,598,11171,0.063929461148943,3.6552777777778,22,3289,0 +598,599,11197,0.12602019009982998,3.8519444444444,29,2556,0 +599,600,11114,0.035137191893634005,3.8069444444444,32,2557,0 +600,601,12564,0.14965728062748998,3.5961111111111004,40,3003,0 +601,602,12459,0.10046170077382,3.5344444444444,59,2441,0 +602,603,12508,0.13163105487926,3.3972222222222,52,2396,0 +603,604,12464,0.043899611017859004,3.3936111111111003,42,3426,0 +604,605,12438,0.19567092855859,3.1025,46,2379,0 +605,606,12449,0.19135011734275,2.8630555555556,97,3026,0 +606,607,12373,0.11171915024595,2.4255555555556003,72,2336,0 +607,608,12594,0.032053604746412,1.8619444444444,81,2850,0 +608,609,12623,0.096448361580655,1.8930555555556,81,3016,0 +609,610,12759,0.07934996156433399,2.2080555555556,70,3537,0 +610,611,12841,0.024581173073578,2.3052777777778,89,3899,0 +611,612,13063,0.025596039426134,2.3777777777777995,87,5044,0 +612,613,13023,0.027922074309281,2.5161111111111,125,4806,0 +613,614,12884,0.02593545023878,2.6411111111111,69,4139,0 +614,615,13007,0.033086949155743,2.8011111111111004,57,4776,0 +615,616,13016,0.047260069860172005,2.7236111111111003,99,4065,0 +616,617,13588,0.038487130166032016,2.6813888888889,111,4969,0 +617,618,13272,0.16080169828563,2.7336111111111,71,3784,0 +618,619,12589,0.12635270044885,2.8863888888889,71,3297,0 +619,620,12651,0.046904491868436,3.1225,48,3347,0 +620,621,12616,0.059534673085297,3.4613888888889,76,3170,0 +621,622,12492,0.12198352023568,3.8297222222222,56,2241,0 +622,623,12497,0.052131597947042,3.8936111111111,35,2301,0 +623,624,12623,0.094084438832673,3.7588888888889,35,2303,0 
+624,625,12481,0.13486764750848,3.5827777777778,29,2587,0 +625,626,12434,0.062226183256115,3.4730555555556,38,3211,0 +626,627,12495,0.091202035463034,3.4175,69,2604,0 +627,628,12375,0.096137859324631,3.3533333333333,77,2841,0 +628,629,12357,0.10449109200785,3.1963888888889,20,2168,0 +629,630,12433,0.097127966420289,2.8852777777778,24,2265,0 +630,631,12432,0.064404980330111,2.4880555555556003,83,2908,0 +631,632,12429,0.10188181868693,2.2325,62,3180,0 +632,633,12551,0.19953464365013,2.1044444444444,54,3118,0 +633,634,12799,0.0747839457206,2.1097222222222,54,3296,0 +634,635,12818,0.0,2.235,60,4432,0 +635,636,13071,0.0,2.3516666666667003,63,4336,0 +636,637,12897,0.0,2.5138888888889,95,4534,0 +637,638,12961,0.041436571087464,2.6105555555556004,69,4261,0 +638,639,12925,0.038671790863765,2.7233333333333,68,5248,0 +639,640,12968,0.035810634316102014,2.6633333333333,58,5014,0 +640,641,13525,0.1409929213297,2.5580555555556,107,3864,0 +641,642,12993,0.0,2.6627777777778,48,5682,0 +642,643,12369,0.052915080344848,2.7625,64,4404,0 +643,644,12195,0.11966022897483,3.0283333333333,52,3705,0 +644,645,12464,0.12973870706052,3.3727777777778,61,2738,0 +645,646,12470,0.023838633821411,3.6369444444444,47,2887,0 +646,647,12475,0.12358680271021,3.7088888888889,58,3776,0 +647,648,12482,0.089095336472172,3.5847222222222,51,3532,0 +648,649,12221,0.019762530636927,3.4836111111111,61,3724,0 +649,650,12325,0.020994992941051,3.4077777777778,53,2786,0 +650,651,12258,0.10380294658324002,3.4441666666667,55,2941,0 +651,652,11980,0.079228021087742,3.1683333333333,52,2351,0 +652,653,11947,0.039012779943635,3.0527777777778,89,2316,0 +653,654,12291,0.10658713601061,2.8527777777778,85,2350,0 +654,655,12293,0.14426278476756,2.5433333333333,106,2916,0 +655,656,12341,0.08706206992122,2.1997222222222,88,2437,0 +656,657,12390,0.16325946030154,2.1036111111111,59,2761,0 +657,658,12611,0.0,2.2133333333333,48,3941,0 +658,659,12737,0.0,2.2086111111111,66,4025,0 +659,660,12882,0.07729609083366701,2.2883333333333,95,4466,0 +660,661,12891,0.058100747891124,2.3222222222222,82,4401,0 +661,662,12756,0.061191523312340984,2.47,76,4747,0 +662,663,12875,0.08592375974441901,2.685,104,4051,0 +663,664,12847,0.033467197342519,2.6763888888889,54,4448,0 +664,665,13518,0.030265788895452006,2.5838888888889,43,3736,0 +665,666,13217,0.11950310860409,2.6130555555556003,39,3918,0 +666,667,12621,0.09169148327055697,2.7633333333333,48,3408,0 +667,668,12591,0.18439354827551,3.0708333333333,38,2883,0 +668,669,12332,0.10741924067542,3.4347222222222,45,3631,0 +669,670,12404,0.15862461647089002,3.7030555555556,64,2609,0 +670,671,12457,0.14957813136313,3.8138888888889,35,2533,0 +671,672,12370,0.24059408570531,3.8508333333333,66,2469,0 +672,673,11509,0.15511115210127,3.8961111111111,61,2458,0 +673,674,11433,0.19582462633148,3.4763888888889,58,2458,0 +674,675,11317,0.13981560037535998,3.4041666666667,51,2043,0 +675,676,11364,0.1392329990551,3.2352777777778,55,1985,0 +676,677,11350,0.13079770999921,3.1508333333333,126,2032,0 +677,678,11348,0.053672881218709015,2.7863888888888995,61,3409,0 +678,679,11365,0.10971373742228,2.4861111111111,94,2018,0 +679,680,11505,0.13825204927093,2.2444444444444,83,2461,0 +680,681,11468,0.13912778922607,2.1286111111111,136,2318,0 +681,682,11562,0.10215803640865,2.1261111111111,104,2787,0 +682,683,11858,0.096617489053804,2.2405555555556003,77,3186,0 +683,684,11933,0.0,2.2991666666667,109,3490,0 +684,685,11813,0.0,2.3627777777778,146,3407,0 +685,686,11735,0.0,2.5863888888889,69,3193,0 +686,687,11848,0.0,2.7286111111111,121,3412,0 
+687,688,11843,0.0,2.8355555555556,53,3563,0 +688,689,12318,0.068897518746959,2.7875,61,3247,0 +689,690,11846,0.05418569809170299,2.7825,82,3012,0 +690,691,11066,0.06507307495461,2.7972222222222,37,2382,0 +691,692,10920,0.10547682048851,3.0355555555556,19,2012,0 +692,693,10836,0.056437861708265,3.2486111111111,19,1915,0 +693,694,10879,0.098703711593837,3.6077777777778,19,1982,0 +694,695,10796,0.14331889652193,3.76,54,1950,0 +695,696,10785,0.05704449488642,3.806666666666701,44,4176,0 +696,697,9469,0.0,3.6638888888889,46,3654,0 +697,698,9278,0.032146952736052,3.5161111111111003,53,3063,0 +698,699,9417,0.068135614649249,3.3286111111111003,83,1916,0 +699,700,9253,0.034514299845882,3.2166666666667,92,1848,0 +700,701,9435,0.028306668795131006,2.9783333333333,94,1704,0 +701,702,9356,0.13119921991025002,2.7211111111111004,111,1680,0 +702,703,9354,0.093609772007723,2.4102777777778,84,2011,0 +703,704,9405,0.11179018663123,2.1366666666667,52,1772,0 +704,705,9326,0.065272680657868,1.9947222222222,68,1838,0 +705,706,9549,0.15901886092526998,1.9936111111111,35,1924,0 +706,707,9499,0.0,2.0788888888889,40,2038,0 +707,708,9371,0.26537507315217,2.1736111111111,47,1991,0 +708,709,9462,0.0,2.4027777777778,85,1729,0 +709,710,9509,0.056610336908172985,2.4580555555556,59,1673,0 +710,711,9469,0.026644044055307004,2.6102777777777995,61,1656,0 +711,712,9522,0.040819652463459,2.7597222222222,45,1774,0 +712,713,9885,0.13497701521251,2.8122222222222,47,1784,0 +713,714,9802,0.16853433621426,2.8427777777778,72,1818,0 +714,715,9461,0.08655557751574,2.87,69,1981,0 +715,716,9393,0.05741127788681901,2.9769444444444,17,2004,0 +716,717,9638,0.037244401880164,3.3241666666667005,47,1788,0 +717,718,9435,0.1132743034971,3.6375,37,1786,0 +718,719,9519,0.15690958465910998,3.8652777777778,57,1781,0 +719,720,9492,0.09604225449090803,3.8091666666667,62,2024,0 +720,721,9458,0.06746445682560599,3.6844444444444,72,1669,0 +721,722,9420,0.058373145210404015,3.5913888888889,43,1729,0 +722,723,9429,0.048008603166117006,3.5255555555556,57,1682,0 +723,724,9461,0.12614216994504,3.3277777777778,47,1714,0 +724,725,9404,0.077186121310215,3.07,61,1679,0 +725,726,9366,0.042879382350005,2.7622222222222,53,1739,0 +726,727,9488,0.031014262794497007,2.3872222222222,78,1669,0 +727,728,9515,0.13957171072647,2.1308333333333,100,1806,0 +728,729,9487,0.027108383258306,2.1563888888889,104,1650,0 +729,730,9497,0.0,2.2547222222222003,56,1751,0 +730,731,9516,0.0,2.3397222222222003,89,1685,0 +731,732,9504,0.0,2.4808333333333,108,1645,0 +732,733,9422,0.025265991419408,2.6208333333333,67,2133,0 +733,734,9543,0.0,2.8138888888889,83,1618,0 +734,735,9395,0.047219926720593,2.9275,90,1623,0 +735,736,9352,0.083109434319356,2.8663888888888995,82,1697,0 +736,737,9884,0.10860709298782,2.7794444444444,76,1684,0 +737,738,9820,0.098319718095083,2.8194444444444,34,1779,0 +738,739,9439,0.02201293380153,2.9458333333333,43,2982,0 +739,740,9560,0.064929719079082,3.2413888888888995,40,1848,0 +740,741,9589,0.036960535765785,3.7166666666667,40,1772,0 +741,742,9575,0.068536856116777,4.1333333333333,57,1841,0 +742,743,9541,0.012398281267649,4.2697222222222,60,1834,0 +743,744,9490,0.035305311833591015,4.2797222222222,53,1860,0 +744,745,7160,0.024153733176505,4.0,44,1647,0 +745,746,7233,0.031750779212929,3.8877777777778,48,2129,0 +746,747,7166,0.092612685693125,3.6633333333333,50,1763,0 +747,748,7245,0.12674340154738,3.6127777777778,65,1433,0 +748,749,7299,0.068594711667718,3.3175,93,1428,0 +749,750,7169,0.13866540834682,2.8930555555556,105,1521,0 
+750,751,7228,0.046813024390007014,2.4722222222222,94,1622,0 +751,752,7123,0.072990045810784,2.2294444444444,53,1580,0 +752,753,7199,0.17156759541908995,2.1286111111111,59,1468,0 +753,754,7167,0.051876699734571985,2.2219444444444,63,1520,0 +754,755,7212,0.031958698733103,2.3366666666667,61,1529,0 +755,756,7206,0.07333373485157901,2.4155555555556,72,1611,0 +756,757,7149,0.0,2.5408333333333,93,1511,0 +757,758,7284,0.023187512335638,2.6511111111111,62,1906,0 +758,759,7265,0.031672522871666,2.8405555555556,50,2632,0 +759,760,7221,0.091103855362214,2.8336111111111,42,1483,0 +760,761,7588,0.0,2.6575,62,1611,0 +761,762,7423,0.0983398607742,2.6622222222222005,21,1676,0 +762,763,7198,0.08011943311413,2.7719444444444,28,1670,0 +763,764,7279,0.043646436319699,3.0344444444444,65,1631,0 +764,765,7174,0.091445521226266,3.3741666666667003,37,1799,0 +765,766,7259,0.067771120773973,3.6925,20,1511,0 +766,767,7166,0.049768578185777006,3.8136111111111,47,1605,0 +767,768,7171,0.067455979006223,3.8202777777778,45,1758,0 +768,769,6883,0.14102875351082,3.7547222222222,49,1509,0 +769,770,6859,0.04521932948417,3.6077777777778,46,1591,0 +770,771,6817,0.032382889221133,3.5330555555556,30,1543,0 +771,772,6877,0.075100266089453,3.3544444444444,30,1573,0 +772,773,6785,0.038989846359505,3.1155555555556,48,1473,0 +773,774,6665,0.093396608626074,2.8463888888888995,36,1476,0 +774,775,6805,0.06797619687558401,2.4411111111111,46,1712,0 +775,776,6863,0.08326287339845401,2.1455555555556,27,1801,0 +776,777,6926,0.015112630017379,2.0025,79,1902,0 +777,778,7004,0.031549757127405,2.1247222222222,65,2005,0 +778,779,6950,0.0,2.2741666666667,57,2363,0 +779,780,7262,0.0,2.3272222222222005,61,2513,0 +780,781,7361,0.017214486216241002,2.4363888888889,89,2664,0 +781,782,7288,0.015541991667356,2.6155555555556003,80,2714,0 +782,783,7463,0.0,2.7272222222222,79,2754,0 +783,784,7188,0.027199843934104,2.6552777777778,113,2670,0 +784,785,7658,0.053744802378685,2.6086111111111,71,2584,0 +785,786,7575,0.05675511278546901,2.6025,53,2466,0 +786,787,6954,0.070873939193717,2.7372222222222,64,2137,0 +787,788,6862,0.19022950977106,3.0125,43,1931,0 +788,789,6896,0.17589540947937002,3.3477777777778,34,1743,0 +789,790,6954,0.022875979046571,3.6236111111111,29,1713,0 +790,791,6869,0.0,3.7383333333333,30,1649,0 +791,792,6890,0.13681403156951,3.7772222222222,24,1633,0 +792,793,9742,0.058507485759525,3.6966666666667,40,1993,0 +793,794,9730,0.10227075584148,3.7733333333333,32,1940,0 +794,795,9810,0.06726096113022301,3.6408333333333,39,1951,0 +795,796,9688,0.15267199916685995,3.3922222222222,67,1894,0 +796,797,9849,0.069818221889972,3.1627777777778,65,1801,0 +797,798,9765,0.030305771594539,2.6875,49,1962,0 +798,799,9812,0.09211700324247198,2.3533333333333,41,2123,0 +799,800,9931,0.12298177354813,2.0425,50,2434,0 +800,801,9908,0.08705722689013601,1.9738888888889,48,2402,0 +801,802,10066,0.07529920073678098,2.0425,59,3013,0 +802,803,10184,0.06217694957317299,2.1563888888889,51,3086,0 +803,804,10295,0.020886039183631,2.2866666666667004,43,3527,0 +804,805,10113,0.08148200392528,2.3919444444444,72,3716,0 +805,806,10218,0.027014133895137,2.5513888888889,52,3577,0 +806,807,10322,0.08271940630361399,2.6030555555556,68,3430,0 +807,808,10269,0.038537180887872,2.6647222222222005,74,3413,0 +808,809,10781,0.090543853269643,2.5930555555556003,46,3755,0 +809,810,10486,0.02593545023878,2.5513888888889,64,4806,0 +810,811,10124,0.090692829340129,2.76,38,3127,0 +811,812,9993,0.09154630234853098,3.0636111111111,40,3421,0 +812,813,9801,0.09562635368432304,3.4016666666667,50,2475,0 
+813,814,9760,0.0,3.7277777777778,42,2440,0 +814,815,9858,0.0,3.7902777777778,37,2731,0 +815,816,9884,0.027267039980187,3.7355555555556,34,2493,0 +816,817,7781,0.024102810048699,3.535,37,1665,0 +817,818,7742,0.072297652068167,3.5819444444444,47,1771,0 +818,819,7682,0.12348623922845,3.3847222222222,67,2293,0 +819,820,7831,0.077453588867077,3.2547222222222,66,1959,0 +820,821,7641,0.05662557916213299,3.125,91,1498,0 +821,822,7641,0.15509029304093,2.7766666666667,132,1537,0 +822,823,7759,0.079595064406905,2.4725,149,1580,0 +823,824,7748,0.053225613553497,2.1927777777778,65,1901,0 +824,825,7776,0.05741127788681901,2.1283333333333,50,1916,0 +825,826,7938,0.077171346852694,2.2319444444444,70,2213,0 +826,827,8031,0.0,2.3061111111111,82,2205,0 +827,828,8117,0.07512642149906099,2.3363888888889,72,2486,0 +828,829,8099,0.0,2.3686111111111,98,2580,0 +829,830,8002,0.0,2.4986111111111,78,2530,0 +830,831,7944,0.026463035590685,2.6433333333333,86,2664,0 +831,832,7963,0.024228588329879,2.7563888888889,76,4368,0 +832,833,8602,0.055182797357095005,2.6652777777778,95,3103,0 +833,834,8269,0.09607690135523,2.6844444444444,63,2249,0 +834,835,7871,0.059431847203259,2.7902777777778,32,2070,0 +835,836,7709,0.018731901987648,3.1119444444444,30,2833,0 +836,837,7726,0.033970515582906,3.5491666666667,27,1734,0 +837,838,7781,0.049963174087431,3.7102777777778,22,2151,0 +838,839,7762,0.073295374096872,3.7961111111111,19,2103,0 +839,840,7692,0.017715537831218996,3.7730555555556,32,1725,0 +840,841,6608,0.014656639469103996,3.5919444444444,45,1895,0 +841,842,6526,0.15513271231042,3.5580555555556,65,1959,0 +842,843,6531,0.06544162031760599,3.4588888888889,73,1637,0 +843,844,6483,0.12276447331552,3.2969444444444003,52,1658,0 +844,845,6602,0.054046416943085,3.2288888888889,93,1666,0 +845,846,6555,0.06827770027642299,2.7358333333333,68,2410,0 +846,847,6610,0.10171854295932,2.4636111111111,127,1787,0 +847,848,6690,0.093454285728882,2.1894444444444,105,2264,0 +848,849,6651,0.04318436192577,2.1227777777778,75,2007,0 +849,850,6759,0.10050707347524,2.1369444444444,77,2107,0 +850,851,6836,0.019571935182124,2.2230555555556,140,2355,0 +851,852,6894,0.0,2.3188888888889,132,2726,0 +852,853,6844,0.0,2.4166666666667003,100,2875,0 +853,854,6773,0.02713995635286,2.5777777777778,174,2780,0 +854,855,6802,0.092632629280125,2.7869444444444,82,3936,0 +855,856,6947,0.098676638207998,2.8586111111111,128,3116,0 +856,857,7248,0.0,3.0816666666667003,79,3770,0 +857,858,6885,0.11132365864914,2.8713888888889,71,2382,0 +858,859,6643,0.0947301899901,2.9386111111111,60,2152,0 +859,860,6560,0.061070711161473,2.9827777777778,60,1754,0 +860,861,6554,0.18477832073133,3.3197222222222,56,1783,0 +861,862,6600,0.055986690710270993,3.5961111111111004,78,1780,0 +862,863,6525,0.16264480046039995,3.7613888888889,60,1582,0 +863,864,6543,0.026215643469448,3.7305555555556,48,2271,0 +864,865,9018,0.0,3.5580555555556,48,2592,0 +865,866,9225,0.054655616583012,3.5136111111111004,42,2921,0 +866,867,9112,0.07076692500883701,3.3772222222222,64,1814,0 +867,868,9195,0.067217215228375,3.2402777777778,36,3219,0 +868,869,9206,0.046060828388587,3.0586111111111003,40,2567,0 +869,870,9224,0.08329795085471901,2.7908333333333,18,1899,0 +870,871,9408,0.08219020764935,2.3761111111111,35,1801,0 +871,872,9082,0.046792553198475,2.1347222222222,44,2005,0 +872,873,9168,0.06755714954154099,1.9991666666667,105,2572,0 +873,874,9258,0.099050882008287,1.9983333333333,71,3563,0 +874,875,9158,0.0,2.0908333333333,65,2777,0 +875,876,9140,0.10824637351267,2.2311111111111,74,3362,0 
+876,877,9206,0.0,2.3219444444444,34,3590,0 +877,878,9186,0.0,2.4727777777778,49,2930,0 +878,879,9155,0.037750185176735,2.5952777777778,44,2481,0 +879,880,9174,0.030345867660395,2.7416666666667004,57,2571,0 +880,881,9758,0.057665227298857,2.7652777777778,102,3546,0 +881,882,9451,0.16774071722374,2.7980555555556,106,4984,0 +882,883,9153,0.10462164884166,2.7597222222222,58,1994,0 +883,884,9233,0.051974117163582,3.0116666666667005,57,3060,0 +884,885,9250,0.070438547008222,3.2916666666667003,62,2151,0 +885,886,9317,0.11437533048244,3.5547222222222,42,2158,0 +886,887,9130,0.028754095353637,3.7580555555556,35,2319,0 +887,888,9249,0.06874265819680701,3.7330555555556,28,1909,0 +888,889,8297,0.041552255552731,3.5886111111111005,27,1627,0 +889,890,8245,0.033571347720577,3.5255555555556,35,2459,0 +890,891,8298,0.014724878652831,3.3858333333333,50,3167,0 +891,892,8247,0.046095580964192,3.2677777777778,69,1839,0 +892,893,8387,0.031859774913781,3.1247222222222,64,3887,0 +893,894,8392,0.094121536253424,2.7213888888888995,69,2031,0 +894,895,8531,0.11471874999036,2.3972222222222004,58,1522,0 +895,896,8437,0.09375530196425097,2.0836111111111,58,1732,0 +896,897,8344,0.10898948864079,2.0644444444444,51,2169,0 +897,898,8274,0.031129909255124,2.2063888888889,46,1679,0 +898,899,8328,0.0,2.3044444444444,84,1941,0 +899,900,8351,0.020155867044519,2.47,144,1638,0 +900,901,8380,0.016795241270985,2.5697222222222003,86,1725,0 +901,902,8332,0.0,2.7625,69,1903,0 +902,903,8366,0.0,2.9436111111111005,81,2074,0 +903,904,8357,0.01748186857624,2.7905555555556,175,1848,0 +904,905,8867,0.015638795432702,2.7527777777778,65,1761,0 +905,906,8659,0.037878946671491,2.6980555555556,48,1838,0 +906,907,8458,0.14870829462531002,2.9102777777778,33,1640,0 +907,908,8360,0.07322030784057597,3.2663888888889,35,1715,0 +908,909,8330,0.10504553292421,3.5372222222222,37,1717,0 +909,910,8298,0.10771048774666,3.86,31,1758,0 +910,911,8381,0.07484115005697,3.9216666666667,36,1975,0 +911,912,8393,0.10377526695926,3.8766666666667,30,1865,0 +912,913,3998,0.052336696506499,3.6463888888889,28,3575,0 +913,914,3733,0.039930389849144,3.6552777777778,24,1413,0 +914,915,3735,0.052659026600132,3.5880555555556,68,1414,0 +915,916,3709,0.071593754146172,3.3594444444444003,26,1170,0 +916,917,3755,0.072107773186609,3.1888888888889,78,1209,0 +917,918,3782,0.14407221323011,2.7575,90,1170,0 +918,919,3849,0.078873737285415,2.3936111111111,76,1328,0 +919,920,3801,0.090543853269643,2.1925,94,1258,0 +920,921,3787,0.0,2.16,70,1427,0 +921,922,3835,0.18229662394063,2.2719444444444,129,1480,0 +922,923,4035,0.10064381631633,2.3994444444444,120,1687,0 +923,924,4173,0.0,2.2836111111111,122,1942,0 +924,925,3995,0.0,2.5422222222222004,100,1967,0 +925,926,4016,0.0,2.6908333333333,102,2110,0 +926,927,4049,0.064661049677152,2.7702777777778,118,1956,0 +927,928,4014,0.10610212880951,2.7405555555556,86,1984,0 +928,929,4263,0.098345239553664,2.6908333333333,92,1893,0 +929,930,3941,0.055426072308289,2.7008333333333,44,1821,0 +930,931,4023,0.026036719363444,2.8322222222222,25,1641,0 +931,932,3917,0.058176601538018,3.0922222222222,54,1604,0 +932,933,3910,0.11644035456955,3.4363888888889,48,1265,0 +933,934,3934,0.067489738764642,3.7530555555556,56,1407,0 +934,935,3783,0.091155534540558,3.9127777777778,42,1342,0 +935,936,3834,0.052217414705359004,3.7608333333333,41,1216,0 +936,937,8698,0.028401045145692,3.6472222222222,32,2569,0 +937,938,8969,0.06030991242653401,3.5544444444444,48,2150,0 +938,939,8928,0.057683225704233,3.5036111111111,40,2317,0 
+939,940,9020,0.049602244305935,3.2538888888889,26,2047,0 +940,941,8865,0.054771618715138,3.1886111111111,55,2065,0 +941,942,8830,0.014455899164978,2.7341666666667,52,1909,0 +942,943,8879,0.05563571922395901,2.3655555555556003,34,1910,0 +943,944,9120,0.077488949885965,2.1688888888889,61,2037,0 +944,945,9111,0.06776025909838901,2.0977777777778,34,3065,0 +945,946,9071,0.033919453583666,2.3077777777778,50,2452,0 +946,947,9205,0.030948232299768,2.3611111111111,47,3226,0 +947,948,9355,0.0,2.4986111111111,56,3271,0 +948,949,9372,0.0,2.5691666666667,76,3471,0 +949,950,9392,0.0,2.7463888888889,60,3922,0 +950,951,9416,0.0,2.8063888888888995,100,3296,0 +951,952,9394,0.0,2.8091666666667003,80,3171,0 +952,953,9810,0.10150033578287,2.715,74,3208,0 +953,954,9594,0.13650296233629,2.6869444444444,24,3602,0 +954,955,9006,0.048341331534980006,2.8180555555556,41,3208,0 +955,956,9140,0.055919636698743,3.0541666666667004,19,3455,0 +956,957,8925,0.052826773889684014,3.4711111111111004,24,2833,0 +957,958,9047,0.07932984590431501,3.7566666666667,18,3453,0 +958,959,9030,0.033310879512461,3.8633333333333,28,3155,0 +959,960,9088,0.048306771033288,3.7519444444444,5,2145,0 +960,961,8569,0.034002578802562,3.6480555555556,12,1999,0 +961,962,8616,0.047801640470854015,3.5061111111111005,35,2135,0 +962,963,8497,0.13378075099383,3.47,41,1813,0 +963,964,8439,0.063853685461221,3.3086111111111003,30,2020,0 +964,965,8567,0.0,3.1194444444444,22,2127,0 +965,966,8694,0.073869151016554,2.8044444444444,56,1764,0 +966,967,8739,0.043582908466928014,2.4205555555556004,34,2249,0 +967,968,8761,0.0,2.1180555555556,73,3119,0 +968,969,8838,0.062006969698131,2.1266666666667,86,2031,0 +969,970,8908,0.14006961492891,2.1708333333333,68,2246,0 +970,971,9053,0.11198565566104,2.3247222222222,36,3214,0 +971,972,9346,0.0,2.4208333333333,66,4207,0 +972,973,8989,0.058427455554992985,2.5563888888889,74,4195,0 +973,974,8807,0.070887934206661,2.7086111111111,78,3179,0 +974,975,9020,0.031869233863638,2.8027777777778,66,2739,0 +975,976,9034,0.0,2.7711111111111,118,2394,0 +976,977,9558,0.055680379884383,2.74,81,3750,0 +977,978,9042,0.030919398857213,2.6869444444444,85,3000,0 +978,979,8804,0.040222150865381015,2.8113888888889,69,2646,0 +979,980,8885,0.08462727078727299,3.1258333333333,49,2375,0 +980,981,8721,0.15790637433488,3.4711111111111004,56,2442,0 +981,982,8676,0.099165571846447,3.7419444444444,64,2069,0 +982,983,9029,0.051043016646698,3.7258333333333,48,1899,0 +983,984,8670,0.023695834967821,3.5369444444444,65,2277,0 +984,985,8537,0.13363180896924,3.4911111111111004,53,1926,0 +985,986,8418,0.14375985835531,3.3769444444444,70,1949,0 +986,987,8481,0.13890523887057998,3.3327777777778,51,2222,0 +987,988,8535,0.096357518724471,3.1925,30,1797,0 +988,989,8535,0.098277544249084,3.135,97,1860,0 +989,990,8442,0.11251833989481,2.8338888888889,41,2870,0 +990,991,8448,0.074768662666532,2.4997222222222004,32,1899,0 +991,992,8527,0.038008655416852,2.2297222222222004,47,2336,0 +992,993,8541,0.016354174968753,2.1158333333333,34,2703,0 +993,994,8635,0.11898350916153,2.1966666666667,54,2773,0 +994,995,8867,0.0,2.2591666666667,69,2577,0 +995,996,9033,0.0,2.3002777777778,109,2816,0 +996,997,8875,0.0,2.3797222222222003,76,3133,0 +997,998,8708,0.0,2.625,47,3366,0 +998,999,8455,0.020636446066963,2.6661111111111,44,3062,0 +999,1000,8713,0.043044731483849,2.6694444444444,92,3003,0 +1000,1001,8934,0.12513578187909,2.6541666666667,67,3044,0 +1001,1002,8745,0.099581351017555,2.6483333333333,26,3230,0 +1002,1003,8674,0.085903047711976,2.7444444444444,42,2793,0 
+1003,1004,8606,0.066698820830796,3.0788888888889,69,1945,0 +1004,1005,8508,0.034228320502586,3.4833333333333,32,2716,0 +1005,1006,8558,0.028479870560763,3.6063888888889,41,2103,0 +1006,1007,8529,0.16430377699282994,3.8069444444444,52,1795,0 +1007,1008,8520,0.020290722486788003,3.6475,56,2840,0 +1008,1009,6662,0.17253761895951006,3.5219444444444,47,2653,0 +1009,1010,6491,0.1150267570489,3.3708333333333,65,2819,0 +1010,1011,6498,0.14119445755296,3.3086111111111003,70,1706,0 +1011,1012,6500,0.079900598296651,3.2411111111111004,84,1801,0 +1012,1013,6471,0.11459361685243,3.0525,71,3271,0 +1013,1014,6354,0.11299850955195,2.7419444444444,110,2001,0 +1014,1015,6592,0.078187238738118,2.4305555555556,65,1678,0 +1015,1016,6552,0.15222680511595002,2.1852777777778,68,1703,0 +1016,1017,6492,0.05823703723779,2.0644444444444,74,2441,0 +1017,1018,6577,0.038270957919533,2.1961111111111,43,2304,0 +1018,1019,6777,0.045436612403901,2.2886111111111,55,3124,0 +1019,1020,6844,0.051111263534218,2.3219444444444,53,3605,0 +1020,1021,6769,0.0,2.4436111111111,64,2985,0 +1021,1022,6642,0.0,2.6463888888889,58,2934,0 +1022,1023,6782,0.057248496594127986,2.735,54,3044,0 +1023,1024,6715,0.0,2.7586111111111005,121,3463,0 +1024,1025,6915,0.084808608043399,2.7138888888889,103,3199,0 +1025,1026,6569,0.05823703723779,2.7119444444444,66,2684,0 +1026,1027,6486,0.12640598881102005,2.8027777777778,73,3317,0 +1027,1028,6504,0.08602692657241201,2.9777777777778,71,2159,0 +1028,1029,6445,0.13712331887199,3.2961111111111,37,2043,0 +1029,1030,6427,0.12184008568979,3.4869444444444,46,2003,0 +1030,1031,6365,0.050317612906928,3.673611111111101,40,2260,0 +1031,1032,6277,0.07167380324199299,3.7469444444444,26,3522,0 +1032,1033,5231,0.051289858799957,3.6133333333333,42,1840,0 +1033,1034,5166,0.094021005766084,3.4752777777778,63,1820,0 +1034,1035,5303,0.020566298353792,3.3602777777778,68,1856,0 +1035,1036,5306,0.12275234276969,3.1605555555556,87,1715,0 +1036,1037,5298,0.1054190746845,3.0733333333333,60,1695,0 +1037,1038,5268,0.19050318144252,2.7130555555556,94,2254,0 +1038,1039,5251,0.10472332930133,2.2886111111111,121,1652,0 +1039,1040,5194,0.12644994481537,2.0783333333333,128,1602,0 +1040,1041,5230,0.08859454436104999,1.9188888888889,68,1792,0 +1041,1042,5244,0.0,1.9355555555556003,76,1954,0 +1042,1043,5102,0.09532581107230803,2.0569444444444,77,1808,0 +1043,1044,5244,0.15766772749983,2.1902777777778,158,1629,0 +1044,1045,5249,0.06429178708826701,2.3477777777778,112,2140,0 +1045,1046,5261,0.068395341911942,2.5502777777778,85,2390,0 +1046,1047,5339,0.025992957736547997,2.6597222222222,77,1707,0 +1047,1048,5241,0.0,2.7238888888888995,89,1901,0 +1048,1049,5491,0.021142167244918,2.7375,106,1820,0 +1049,1050,5374,0.072067861729848,2.7483333333333,47,2167,0 +1050,1051,5354,0.1275228688396,2.8525,34,2063,0 +1051,1052,5232,0.043846003986674,3.0038888888889,32,2184,0 +1052,1053,5217,0.10247450096434,3.2761111111111005,22,1981,0 +1053,1054,5258,0.07584150637714701,3.5761111111111004,16,1813,0 +1054,1055,5251,0.020496657705832,3.8172222222222,32,2033,0 +1055,1056,5223,0.13399493992192998,3.6691666666667,16,1629,0 +1056,1057,3952,0.091121163023619,3.5558333333333,20,1485,0 +1057,1058,3949,0.11809705541338,3.4266666666667,56,1527,0 +1058,1059,4021,0.033014047837867995,3.435,74,2561,0 +1059,1060,3815,0.16367597832104,3.2111111111111,116,1523,0 +1060,1061,3855,0.12469537397569,3.1297222222222,72,1446,0 +1061,1062,3892,0.095002031789468,2.7538888888889,66,1499,0 +1062,1063,3948,0.1028064299952,2.3116666666667003,56,1368,0 
+1063,1064,3860,0.028861851985229007,2.0988888888889,61,1426,0 +1064,1065,3830,0.05806984314166,2.0983333333333,2151,3528,0 +1065,1066,3821,0.050886592113012,2.1986111111111,459,2279,0 +1066,1067,3886,0.05081829754409599,2.3677777777778,84,1421,0 +1067,1068,3954,0.0,2.5036111111111,55,2008,0 +1068,1069,3839,0.08354288831032201,2.5786111111111,61,1429,0 +1069,1070,3921,0.0,2.8172222222222,19,1497,0 +1070,1071,3874,0.08142390858425297,2.8727777777778,30,1604,0 +1071,1072,3996,0.047911560407608,2.8294444444444,73,1595,0 +1072,1073,4246,0.12201534565884,2.7136111111111005,63,2217,0 +1073,1074,3803,0.088739417881303,2.7058333333333,35,1580,0 +1074,1075,3594,0.08276214539547999,2.8161111111111,57,1466,0 +1075,1076,3778,0.066779641097052,3.1541666666667,50,1717,0 +1076,1077,3745,0.11367082443275,3.5791666666667004,48,1564,0 +1077,1078,3747,0.021597223158314,3.8158333333333,40,1752,0 +1078,1079,3726,0.16874893592242002,3.9405555555556,36,1598,0 +1079,1080,3729,0.041971530556774,3.7294444444444,59,1842,0 +1080,1081,8513,0.042983941794881,3.6183333333333,14,3066,0 +1081,1082,8738,0.14500733624043,3.4911111111111004,16,2272,0 +1082,1083,8709,0.046727090031129015,3.4566666666667003,36,4344,0 +1083,1084,8601,0.032553617944112004,3.37,65,3242,0 +1084,1085,8719,0.040039251102491,3.1658333333333,80,2291,0 +1085,1086,8820,0.055153759101126985,2.7261111111111003,91,2240,0 +1086,1087,8674,0.05751181017711901,2.3533333333333,102,2012,0 +1087,1088,8859,0.041202889821452,2.1158333333333,85,2305,0 +1088,1089,8905,0.07854024449462599,2.0852777777778,69,2295,0 +1089,1090,8920,0.11628975245152,2.1422222222222,79,2370,0 +1090,1091,9062,0.087543035971238,2.3172222222222003,66,3066,0 +1091,1092,9139,0.0,2.3983333333333,47,3132,0 +1092,1093,8866,0.031151045483539,2.55,51,3006,0 +1093,1094,8997,0.0,2.7413888888888995,20,3101,0 +1094,1095,9122,0.029949950026121008,2.7636111111111004,62,3739,0 +1095,1096,9191,0.067297142748812,2.7002777777778,54,3933,0 +1096,1097,9795,0.08450527625030299,2.7247222222222,99,4537,0 +1097,1098,9255,0.049852109269358014,2.5866666666667,64,3856,0 +1098,1099,8924,0.094084438832673,2.8597222222222,66,2862,0 +1099,1100,9012,0.044896125591910994,3.1269444444444,49,2449,0 +1100,1101,9023,0.07328004196455701,3.5019444444444,73,2222,0 +1101,1102,8875,0.13104465124262998,3.778611111111101,47,2159,0 +1102,1103,8800,0.10394116672902,3.8727777777778,48,2486,0 +1103,1104,8785,0.033616505813902,3.704166666666701,35,3148,0 +1104,1105,8474,0.02672150953308,3.5533333333333,27,3207,0 +1105,1106,8412,0.082058799915824,3.4461111111111005,19,2057,0 +1106,1107,8491,0.05732182787355501,3.4341666666667003,37,2029,0 +1107,1108,8391,0.067005870534182,3.3141666666667,45,3127,0 +1108,1109,8216,0.13429243256821,3.0438888888889,45,2597,0 +1109,1110,8292,0.015094533525413,2.6791666666667004,32,2350,0 +1110,1111,8406,0.063949370932991,2.3202777777778,99,2364,0 +1111,1112,8509,0.094378811742462,2.0691666666667,71,2095,0 +1112,1113,8486,0.02139340711812,2.0091666666667,93,2978,0 +1113,1114,8616,0.0,2.1886111111111,78,2743,0 +1114,1115,8642,0.0,2.3088888888889,71,2668,0 +1115,1116,8823,0.0,2.3794444444444,91,3054,0 +1116,1117,8774,0.0,2.5994444444444,31,3733,0 +1117,1118,8810,0.0,2.7119444444444,35,4312,0 +1118,1119,8611,0.0,2.76,25,4112,0 +1119,1120,8798,0.10029435223064,2.6975,45,3541,0 +1120,1121,9179,0.0,2.5466666666667,33,3901,0 +1121,1122,9057,0.10365337249761998,2.6036111111111,34,4371,0 +1122,1123,8633,0.12418226954696003,2.7927777777778,40,4099,0 +1123,1124,8517,0.0,2.9788888888889,17,3039,0 
+1124,1125,8427,0.051166116772473,3.4080555555556,17,3197,0 +1125,1126,8615,0.040222150865381015,3.6813888888889,16,2346,0 +1126,1127,8690,0.17057206553854998,3.7983333333333,26,2285,0 +1127,1128,8438,0.12861588337799,3.6338888888889,19,2313,0 +1128,1129,10388,0.0,3.5111111111111004,30,3216,0 +1129,1130,10588,0.0,3.3613888888889,94,3860,0 +1130,1131,10533,0.14569364884757002,3.3072222222222,73,4781,0 +1131,1132,10397,0.18198813530019,3.2447222222222,59,2957,0 +1132,1133,10347,0.038073868368755,3.1152777777778,53,2171,0 +1133,1134,10405,0.11491272575332,2.6994444444444,56,2856,0 +1134,1135,10411,0.064841538076484,2.3497222222222005,70,2714,0 +1135,1136,10503,0.048708312546253,2.0619444444444,60,2602,0 +1136,1137,10598,0.11629780056153,2.0625,83,2331,0 +1137,1138,10692,0.07659916149791901,2.1905555555556004,265,3586,0 +1138,1139,10874,0.0,2.2588888888889,944,3363,0 +1139,1140,11043,0.043763623117499,2.3983333333333,36,3879,0 +1140,1141,11009,0.0,2.5536111111111,42,3556,0 +1141,1142,10818,0.041436571087464,2.7408333333333,23,4381,0 +1142,1143,10985,0.0,2.7375,75,4777,0 +1143,1144,10861,0.08191467409622599,2.7780555555556,68,4879,0 +1144,1145,12282,0.11084389924027,2.6225,23,3553,0 +1145,1146,11225,0.12510294083344,2.6386111111111,35,3177,0 +1146,1147,10775,0.10213470511717,2.7908333333333,38,2727,0 +1147,1148,10688,0.06332743445339299,3.0922222222222,69,2758,0 +1148,1149,10601,0.033666593475508995,3.4291666666667004,57,4124,0 +1149,1150,10634,0.057459020289436,3.6752777777778,58,3076,0 +1150,1151,10646,0.023008391787587,3.736111111111101,43,2291,0 +1151,1152,10562,0.037622360322278,3.5905555555556,65,2482,0 +1152,1153,10608,0.026766196308354,3.3872222222222,60,2537,0 +1153,1154,10618,0.13691041072327,3.3186111111111005,55,2434,0 +1154,1155,10636,0.024581173073578,3.2775,49,2608,0 +1155,1156,10583,0.050723618686514,3.1625,54,2614,0 +1156,1157,10613,0.038807415292018,3.1391666666667004,66,2904,0 +1157,1158,10603,0.10731539561588,2.7616666666667005,59,2204,0 +1158,1159,10601,0.13649131550296,2.4675,107,2326,0 +1159,1160,10757,0.11190990870167998,2.2166666666667,104,3002,0 +1160,1161,10815,0.17879123074031,2.1205555555556,100,3472,0 +1161,1162,10790,0.08728058888363299,2.2044444444444,133,3496,0 +1162,1163,11082,0.0,2.3147222222222004,65,3168,0 +1163,1164,11121,0.07099894663641,2.2416666666667004,152,4268,0 +1164,1165,10913,0.098617038600063,2.405,83,4350,0 +1165,1166,11004,0.0,2.5705555555556003,158,3555,0 +1166,1167,11135,0.10519721128315,2.7088888888889,145,4986,0 +1167,1168,10960,0.10928571467639,2.6913888888889,77,4576,0 +1168,1169,11686,0.14969099592127,2.6427777777778,13,4451,0 +1169,1170,11244,0.060122448878635,2.705,67,3627,0 +1170,1171,10931,0.068254139999346,2.8738888888889,25,3485,0 +1171,1172,10811,0.056987671819742985,3.0819444444444,27,3046,0 +1172,1173,10679,0.094667935014769,3.4491666666667005,23,2657,0 +1173,1174,10648,0.13287358772218,3.6275,28,2423,0 +1174,1175,10757,0.032507012295146,3.8027777777778,25,2374,0 +1175,1176,10706,0.14779741522058998,3.6436111111111,28,2493,0 +1176,1177,9077,0.10864900088005,3.4861111111111005,30,2495,0 +1177,1178,8836,0.12602969813907,3.3266666666667004,31,2189,0 +1178,1179,8971,0.07253718299881,3.1866666666667003,31,2214,0 +1179,1180,8972,0.31381296416887,3.2213888888888995,44,2374,0 +1180,1181,8903,0.2312064012582,3.0102777777778,27,3230,0 +1181,1182,8967,0.17687421373190998,2.6658333333333,36,2132,0 +1182,1183,8962,0.022073721703464003,2.3902777777778,61,3042,0 +1183,1184,9044,0.11600086139073,2.1380555555556,64,2053,0 
+1184,1185,8931,0.10418807549523,2.0161111111111,118,2349,0 +1185,1186,9028,0.040222150865381015,2.0641666666667,98,3381,0 +1186,1187,9240,0.06812462580532,2.1844444444444,76,3436,0 +1187,1188,9227,0.055328485037955,2.2822222222222,57,3280,0 +1188,1189,9227,0.027788383289499,2.4002777777777995,74,4357,0 +1189,1190,9125,0.0,2.5433333333333,72,4522,0 +1190,1191,9075,0.0,2.7469444444444,78,4094,0 +1191,1192,9117,0.035137191893634005,2.6872222222222,69,3296,0 +1192,1193,9562,0.035137191893634005,2.6980555555556,125,4129,0 +1193,1194,9305,0.11258759940039,2.7380555555556,157,3036,0 +1194,1195,8965,0.16105265701128,2.7858333333333,61,2628,0 +1195,1196,8862,0.15210502999287,3.0502777777778,12,2296,0 +1196,1197,8858,0.07673479360192201,3.2991666666667,16,2221,0 +1197,1198,8820,0.17013715283392,3.5533333333333,36,1991,0 +1198,1199,8876,0.1609412187274,3.6652777777778,27,2778,0 +1199,1200,8797,0.12008642730107,3.6116666666667,22,2511,0 +1200,1201,9074,0.045995324803682,3.5463888888889,22,2103,0 +1201,1202,9318,0.23802438276872,3.4013888888889,35,2111,0 +1202,1203,9286,0.18078076076243,3.245,67,2055,0 +1203,1204,9320,0.12741851179236,3.1644444444444,46,1930,0 +1204,1205,9280,0.08024661572906401,2.9361111111111,72,2456,0 +1205,1206,9333,0.32656213417732,2.6952777777778,96,2952,0 +1206,1207,9334,0.28639695711596,2.3702777777778,117,2147,0 +1207,1208,9337,0.083900984173012,2.0947222222222,113,2051,0 +1208,1209,9405,0.12853338721539,1.9538888888889,140,2281,0 +1209,1210,9263,0.032414228925828,1.9925,107,2102,0 +1210,1211,9326,0.08237281480963901,2.0363888888889,102,2062,0 +1211,1212,9421,0.0,2.1919444444444,85,2796,0 +1212,1213,9275,0.0,2.3211111111111,49,2005,0 +1213,1214,9323,0.0,2.4955555555556,69,2075,0 +1214,1215,9347,0.45868581620054,2.6980555555556,68,2058,1 +1215,1216,9333,0.1959092708736,2.7219444444444,104,2733,0 +1216,1217,9846,0.7871265862012701,2.725,111,2170,1 +1217,1218,9497,0.18267963393082,2.7816666666667,88,2282,0 +1218,1219,9383,0.26777755992147,2.7811111111111004,64,2178,0 +1219,1220,9300,0.30404676514833,2.955,29,2283,0 +1220,1221,9389,0.28226806095289003,3.3158333333333,32,2097,0 +1221,1222,9364,0.32093016819692,3.5669444444444003,29,2738,0 +1222,1223,9227,0.24793583772273,3.7419444444444,21,2678,0 +1223,1224,9309,0.27376916868294,3.6236111111111,33,2404,0 +1224,1225,6204,0.32069151905173,3.4416666666667,37,1497,0 +1225,1226,6048,0.16728853165162,3.4172222222222,57,1496,0 +1226,1227,5949,0.17244047836378998,3.3016666666667,72,1935,0 +1227,1228,5981,0.21356200193615,3.1963888888889,86,1521,0 +1228,1229,5897,0.08833993625230199,3.0641666666667,70,2879,0 +1229,1230,6038,0.20141526375625,2.735,63,1561,0 +1230,1231,6094,0.12271171189386,2.3288888888889,49,1381,0 +1231,1232,6022,0.15111333507662,2.0938888888889,81,1826,0 +1232,1233,6122,0.3688420983862,2.1338888888889,58,1896,0 +1233,1234,6034,0.15672074166098002,2.2247222222222005,70,2083,0 +1234,1235,6079,0.099476236793782,2.3308333333333,67,1792,0 +1235,1236,5998,0.18394691317126,2.3902777777778,70,3258,0 +1236,1237,6004,0.076264605227629,2.5819444444444,95,2265,0 +1237,1238,5908,0.058100747891124,2.6661111111111,100,2775,0 +1238,1239,6022,0.18015967729618,2.8258333333333,116,1545,0 +1239,1240,5981,0.059431847203259,2.7502777777778,123,1818,0 +1240,1241,6399,0.14870829462531002,2.6730555555556004,71,1481,0 +1241,1242,6119,0.09565694822541,2.7536111111111,65,1677,0 +1242,1243,6114,0.16022629962173002,2.9677777777778,73,1858,0 +1243,1244,5915,0.4140256163498,3.37,53,1643,0 +1244,1245,6192,0.32447726333369004,3.5958333333333,79,1582,0 
+1245,1246,6021,0.15394421357627,3.8144444444444,77,1611,0 +1246,1247,6060,0.060070368432038,3.8283333333333,59,1803,0 +1247,1248,7510,0.14236976564388,3.7030555555556,66,2121,0 +1248,1249,7560,0.12741851179236,3.5802777777778,54,2375,0 +1249,1250,7525,0.093634078744746,3.4197222222222,54,1866,0 +1250,1251,7483,0.13709947889982,3.4438888888889,89,2398,0 +1251,1252,7452,0.06298116794216299,3.3425,85,2577,0 +1252,1253,7512,0.13125017838571,3.1608333333333,96,1801,0 +1253,1254,7572,0.21161148728916,2.7413888888888995,149,1840,0 +1254,1255,7629,0.06783428261124,2.3808333333333,139,1985,0 +1255,1256,7529,0.20877561051189,2.12,90,2041,0 +1256,1257,7623,0.10394294206935002,2.1533333333333,68,2075,0 +1257,1258,7637,0.0,2.2569444444444,445,2564,0 +1258,1259,7921,0.076424293095548,2.3183333333333,100,2734,0 +1259,1260,7790,0.08809461878011901,2.3583333333333,138,3143,0 diff --git a/datasets/anomaly_reserve/yahoo_sub_5/TRAIN/problem_TRAIN/dataSplits.csv b/datasets/anomaly_reserve/yahoo_sub_5/TRAIN/problem_TRAIN/dataSplits.csv new file mode 100644 index 0000000..b017fe5 --- /dev/null +++ b/datasets/anomaly_reserve/yahoo_sub_5/TRAIN/problem_TRAIN/dataSplits.csv @@ -0,0 +1,1261 @@ +d3mIndex,type,repeat,fold +0,TRAIN,0,0 +1,TRAIN,0,0 +2,TRAIN,0,0 +3,TRAIN,0,0 +4,TRAIN,0,0 +5,TRAIN,0,0 +6,TRAIN,0,0 +7,TRAIN,0,0 +8,TRAIN,0,0 +9,TRAIN,0,0 +10,TRAIN,0,0 +11,TRAIN,0,0 +12,TRAIN,0,0 +13,TRAIN,0,0 +14,TRAIN,0,0 +15,TRAIN,0,0 +16,TRAIN,0,0 +17,TRAIN,0,0 +18,TRAIN,0,0 +19,TRAIN,0,0 +20,TRAIN,0,0 +21,TRAIN,0,0 +22,TRAIN,0,0 +23,TRAIN,0,0 +24,TRAIN,0,0 +25,TRAIN,0,0 +26,TRAIN,0,0 +27,TRAIN,0,0 +28,TRAIN,0,0 +29,TRAIN,0,0 +30,TRAIN,0,0 +31,TRAIN,0,0 +32,TRAIN,0,0 +33,TRAIN,0,0 +34,TRAIN,0,0 +35,TRAIN,0,0 +36,TRAIN,0,0 +37,TRAIN,0,0 +38,TRAIN,0,0 +39,TRAIN,0,0 +40,TRAIN,0,0 +41,TRAIN,0,0 +42,TRAIN,0,0 +43,TRAIN,0,0 +44,TRAIN,0,0 +45,TRAIN,0,0 +46,TRAIN,0,0 +47,TRAIN,0,0 +48,TRAIN,0,0 +49,TRAIN,0,0 +50,TRAIN,0,0 +51,TRAIN,0,0 +52,TRAIN,0,0 +53,TRAIN,0,0 +54,TRAIN,0,0 +55,TRAIN,0,0 +56,TRAIN,0,0 +57,TRAIN,0,0 +58,TRAIN,0,0 +59,TRAIN,0,0 +60,TRAIN,0,0 +61,TRAIN,0,0 +62,TRAIN,0,0 +63,TRAIN,0,0 +64,TRAIN,0,0 +65,TRAIN,0,0 +66,TRAIN,0,0 +67,TRAIN,0,0 +68,TRAIN,0,0 +69,TRAIN,0,0 +70,TRAIN,0,0 +71,TRAIN,0,0 +72,TRAIN,0,0 +73,TRAIN,0,0 +74,TRAIN,0,0 +75,TRAIN,0,0 +76,TRAIN,0,0 +77,TRAIN,0,0 +78,TRAIN,0,0 +79,TRAIN,0,0 +80,TRAIN,0,0 +81,TRAIN,0,0 +82,TRAIN,0,0 +83,TRAIN,0,0 +84,TRAIN,0,0 +85,TRAIN,0,0 +86,TRAIN,0,0 +87,TRAIN,0,0 +88,TRAIN,0,0 +89,TRAIN,0,0 +90,TRAIN,0,0 +91,TRAIN,0,0 +92,TRAIN,0,0 +93,TRAIN,0,0 +94,TRAIN,0,0 +95,TRAIN,0,0 +96,TRAIN,0,0 +97,TRAIN,0,0 +98,TRAIN,0,0 +99,TRAIN,0,0 +100,TRAIN,0,0 +101,TRAIN,0,0 +102,TRAIN,0,0 +103,TRAIN,0,0 +104,TRAIN,0,0 +105,TRAIN,0,0 +106,TRAIN,0,0 +107,TRAIN,0,0 +108,TRAIN,0,0 +109,TRAIN,0,0 +110,TRAIN,0,0 +111,TRAIN,0,0 +112,TRAIN,0,0 +113,TRAIN,0,0 +114,TRAIN,0,0 +115,TRAIN,0,0 +116,TRAIN,0,0 +117,TRAIN,0,0 +118,TRAIN,0,0 +119,TRAIN,0,0 +120,TRAIN,0,0 +121,TRAIN,0,0 +122,TRAIN,0,0 +123,TRAIN,0,0 +124,TRAIN,0,0 +125,TRAIN,0,0 +126,TRAIN,0,0 +127,TRAIN,0,0 +128,TRAIN,0,0 +129,TRAIN,0,0 +130,TRAIN,0,0 +131,TRAIN,0,0 +132,TRAIN,0,0 +133,TRAIN,0,0 +134,TRAIN,0,0 +135,TRAIN,0,0 +136,TRAIN,0,0 +137,TRAIN,0,0 +138,TRAIN,0,0 +139,TRAIN,0,0 +140,TRAIN,0,0 +141,TRAIN,0,0 +142,TRAIN,0,0 +143,TRAIN,0,0 +144,TRAIN,0,0 +145,TRAIN,0,0 +146,TRAIN,0,0 +147,TRAIN,0,0 +148,TRAIN,0,0 +149,TRAIN,0,0 +150,TRAIN,0,0 +151,TRAIN,0,0 +152,TRAIN,0,0 +153,TRAIN,0,0 +154,TRAIN,0,0 +155,TRAIN,0,0 +156,TRAIN,0,0 +157,TRAIN,0,0 +158,TRAIN,0,0 +159,TRAIN,0,0 +160,TRAIN,0,0 +161,TRAIN,0,0 +162,TRAIN,0,0 
+163,TRAIN,0,0 +164,TRAIN,0,0 +165,TRAIN,0,0 +166,TRAIN,0,0 +167,TRAIN,0,0 +168,TRAIN,0,0 +169,TRAIN,0,0 +170,TRAIN,0,0 +171,TRAIN,0,0 +172,TRAIN,0,0 +173,TRAIN,0,0 +174,TRAIN,0,0 +175,TRAIN,0,0 +176,TRAIN,0,0 +177,TRAIN,0,0 +178,TRAIN,0,0 +179,TRAIN,0,0 +180,TRAIN,0,0 +181,TRAIN,0,0 +182,TRAIN,0,0 +183,TRAIN,0,0 +184,TRAIN,0,0 +185,TRAIN,0,0 +186,TRAIN,0,0 +187,TRAIN,0,0 +188,TRAIN,0,0 +189,TRAIN,0,0 +190,TRAIN,0,0 +191,TRAIN,0,0 +192,TRAIN,0,0 +193,TRAIN,0,0 +194,TRAIN,0,0 +195,TRAIN,0,0 +196,TRAIN,0,0 +197,TRAIN,0,0 +198,TRAIN,0,0 +199,TRAIN,0,0 +200,TRAIN,0,0 +201,TRAIN,0,0 +202,TRAIN,0,0 +203,TRAIN,0,0 +204,TRAIN,0,0 +205,TRAIN,0,0 +206,TRAIN,0,0 +207,TRAIN,0,0 +208,TRAIN,0,0 +209,TRAIN,0,0 +210,TRAIN,0,0 +211,TRAIN,0,0 +212,TRAIN,0,0 +213,TRAIN,0,0 +214,TRAIN,0,0 +215,TRAIN,0,0 +216,TRAIN,0,0 +217,TRAIN,0,0 +218,TRAIN,0,0 +219,TRAIN,0,0 +220,TRAIN,0,0 +221,TRAIN,0,0 +222,TRAIN,0,0 +223,TRAIN,0,0 +224,TRAIN,0,0 +225,TRAIN,0,0 +226,TRAIN,0,0 +227,TRAIN,0,0 +228,TRAIN,0,0 +229,TRAIN,0,0 +230,TRAIN,0,0 +231,TRAIN,0,0 +232,TRAIN,0,0 +233,TRAIN,0,0 +234,TRAIN,0,0 +235,TRAIN,0,0 +236,TRAIN,0,0 +237,TRAIN,0,0 +238,TRAIN,0,0 +239,TRAIN,0,0 +240,TRAIN,0,0 +241,TRAIN,0,0 +242,TRAIN,0,0 +243,TRAIN,0,0 +244,TRAIN,0,0 +245,TRAIN,0,0 +246,TRAIN,0,0 +247,TRAIN,0,0 +248,TRAIN,0,0 +249,TRAIN,0,0 +250,TRAIN,0,0 +251,TRAIN,0,0 +252,TRAIN,0,0 +253,TRAIN,0,0 +254,TRAIN,0,0 +255,TRAIN,0,0 +256,TRAIN,0,0 +257,TRAIN,0,0 +258,TRAIN,0,0 +259,TRAIN,0,0 +260,TRAIN,0,0 +261,TRAIN,0,0 +262,TRAIN,0,0 +263,TRAIN,0,0 +264,TRAIN,0,0 +265,TRAIN,0,0 +266,TRAIN,0,0 +267,TRAIN,0,0 +268,TRAIN,0,0 +269,TRAIN,0,0 +270,TRAIN,0,0 +271,TRAIN,0,0 +272,TRAIN,0,0 +273,TRAIN,0,0 +274,TRAIN,0,0 +275,TRAIN,0,0 +276,TRAIN,0,0 +277,TRAIN,0,0 +278,TRAIN,0,0 +279,TRAIN,0,0 +280,TRAIN,0,0 +281,TRAIN,0,0 +282,TRAIN,0,0 +283,TRAIN,0,0 +284,TRAIN,0,0 +285,TRAIN,0,0 +286,TRAIN,0,0 +287,TRAIN,0,0 +288,TRAIN,0,0 +289,TRAIN,0,0 +290,TRAIN,0,0 +291,TRAIN,0,0 +292,TRAIN,0,0 +293,TRAIN,0,0 +294,TRAIN,0,0 +295,TRAIN,0,0 +296,TRAIN,0,0 +297,TRAIN,0,0 +298,TRAIN,0,0 +299,TRAIN,0,0 +300,TRAIN,0,0 +301,TRAIN,0,0 +302,TRAIN,0,0 +303,TRAIN,0,0 +304,TRAIN,0,0 +305,TRAIN,0,0 +306,TRAIN,0,0 +307,TRAIN,0,0 +308,TRAIN,0,0 +309,TRAIN,0,0 +310,TRAIN,0,0 +311,TRAIN,0,0 +312,TRAIN,0,0 +313,TRAIN,0,0 +314,TRAIN,0,0 +315,TRAIN,0,0 +316,TRAIN,0,0 +317,TRAIN,0,0 +318,TRAIN,0,0 +319,TRAIN,0,0 +320,TRAIN,0,0 +321,TRAIN,0,0 +322,TRAIN,0,0 +323,TRAIN,0,0 +324,TRAIN,0,0 +325,TRAIN,0,0 +326,TRAIN,0,0 +327,TRAIN,0,0 +328,TRAIN,0,0 +329,TRAIN,0,0 +330,TRAIN,0,0 +331,TRAIN,0,0 +332,TRAIN,0,0 +333,TRAIN,0,0 +334,TRAIN,0,0 +335,TRAIN,0,0 +336,TRAIN,0,0 +337,TRAIN,0,0 +338,TRAIN,0,0 +339,TRAIN,0,0 +340,TRAIN,0,0 +341,TRAIN,0,0 +342,TRAIN,0,0 +343,TRAIN,0,0 +344,TRAIN,0,0 +345,TRAIN,0,0 +346,TRAIN,0,0 +347,TRAIN,0,0 +348,TRAIN,0,0 +349,TRAIN,0,0 +350,TRAIN,0,0 +351,TRAIN,0,0 +352,TRAIN,0,0 +353,TRAIN,0,0 +354,TRAIN,0,0 +355,TRAIN,0,0 +356,TRAIN,0,0 +357,TRAIN,0,0 +358,TRAIN,0,0 +359,TRAIN,0,0 +360,TRAIN,0,0 +361,TRAIN,0,0 +362,TRAIN,0,0 +363,TRAIN,0,0 +364,TRAIN,0,0 +365,TRAIN,0,0 +366,TRAIN,0,0 +367,TRAIN,0,0 +368,TRAIN,0,0 +369,TRAIN,0,0 +370,TRAIN,0,0 +371,TRAIN,0,0 +372,TRAIN,0,0 +373,TRAIN,0,0 +374,TRAIN,0,0 +375,TRAIN,0,0 +376,TRAIN,0,0 +377,TRAIN,0,0 +378,TRAIN,0,0 +379,TRAIN,0,0 +380,TRAIN,0,0 +381,TRAIN,0,0 +382,TRAIN,0,0 +383,TRAIN,0,0 +384,TRAIN,0,0 +385,TRAIN,0,0 +386,TRAIN,0,0 +387,TRAIN,0,0 +388,TRAIN,0,0 +389,TRAIN,0,0 +390,TRAIN,0,0 +391,TRAIN,0,0 +392,TRAIN,0,0 +393,TRAIN,0,0 +394,TRAIN,0,0 +395,TRAIN,0,0 +396,TRAIN,0,0 +397,TRAIN,0,0 +398,TRAIN,0,0 +399,TRAIN,0,0 
+400,TRAIN,0,0 +401,TRAIN,0,0 +402,TRAIN,0,0 +403,TRAIN,0,0 +404,TRAIN,0,0 +405,TRAIN,0,0 +406,TRAIN,0,0 +407,TRAIN,0,0 +408,TRAIN,0,0 +409,TRAIN,0,0 +410,TRAIN,0,0 +411,TRAIN,0,0 +412,TRAIN,0,0 +413,TRAIN,0,0 +414,TRAIN,0,0 +415,TRAIN,0,0 +416,TRAIN,0,0 +417,TRAIN,0,0 +418,TRAIN,0,0 +419,TRAIN,0,0 +420,TRAIN,0,0 +421,TRAIN,0,0 +422,TRAIN,0,0 +423,TRAIN,0,0 +424,TRAIN,0,0 +425,TRAIN,0,0 +426,TRAIN,0,0 +427,TRAIN,0,0 +428,TRAIN,0,0 +429,TRAIN,0,0 +430,TRAIN,0,0 +431,TRAIN,0,0 +432,TRAIN,0,0 +433,TRAIN,0,0 +434,TRAIN,0,0 +435,TRAIN,0,0 +436,TRAIN,0,0 +437,TRAIN,0,0 +438,TRAIN,0,0 +439,TRAIN,0,0 +440,TRAIN,0,0 +441,TRAIN,0,0 +442,TRAIN,0,0 +443,TRAIN,0,0 +444,TRAIN,0,0 +445,TRAIN,0,0 +446,TRAIN,0,0 +447,TRAIN,0,0 +448,TRAIN,0,0 +449,TRAIN,0,0 +450,TRAIN,0,0 +451,TRAIN,0,0 +452,TRAIN,0,0 +453,TRAIN,0,0 +454,TRAIN,0,0 +455,TRAIN,0,0 +456,TRAIN,0,0 +457,TRAIN,0,0 +458,TRAIN,0,0 +459,TRAIN,0,0 +460,TRAIN,0,0 +461,TRAIN,0,0 +462,TRAIN,0,0 +463,TRAIN,0,0 +464,TRAIN,0,0 +465,TRAIN,0,0 +466,TRAIN,0,0 +467,TRAIN,0,0 +468,TRAIN,0,0 +469,TRAIN,0,0 +470,TRAIN,0,0 +471,TRAIN,0,0 +472,TRAIN,0,0 +473,TRAIN,0,0 +474,TRAIN,0,0 +475,TRAIN,0,0 +476,TRAIN,0,0 +477,TRAIN,0,0 +478,TRAIN,0,0 +479,TRAIN,0,0 +480,TRAIN,0,0 +481,TRAIN,0,0 +482,TRAIN,0,0 +483,TRAIN,0,0 +484,TRAIN,0,0 +485,TRAIN,0,0 +486,TRAIN,0,0 +487,TRAIN,0,0 +488,TRAIN,0,0 +489,TRAIN,0,0 +490,TRAIN,0,0 +491,TRAIN,0,0 +492,TRAIN,0,0 +493,TRAIN,0,0 +494,TRAIN,0,0 +495,TRAIN,0,0 +496,TRAIN,0,0 +497,TRAIN,0,0 +498,TRAIN,0,0 +499,TRAIN,0,0 +500,TRAIN,0,0 +501,TRAIN,0,0 +502,TRAIN,0,0 +503,TRAIN,0,0 +504,TRAIN,0,0 +505,TRAIN,0,0 +506,TRAIN,0,0 +507,TRAIN,0,0 +508,TRAIN,0,0 +509,TRAIN,0,0 +510,TRAIN,0,0 +511,TRAIN,0,0 +512,TRAIN,0,0 +513,TRAIN,0,0 +514,TRAIN,0,0 +515,TRAIN,0,0 +516,TRAIN,0,0 +517,TRAIN,0,0 +518,TRAIN,0,0 +519,TRAIN,0,0 +520,TRAIN,0,0 +521,TRAIN,0,0 +522,TRAIN,0,0 +523,TRAIN,0,0 +524,TRAIN,0,0 +525,TRAIN,0,0 +526,TRAIN,0,0 +527,TRAIN,0,0 +528,TRAIN,0,0 +529,TRAIN,0,0 +530,TRAIN,0,0 +531,TRAIN,0,0 +532,TRAIN,0,0 +533,TRAIN,0,0 +534,TRAIN,0,0 +535,TRAIN,0,0 +536,TRAIN,0,0 +537,TRAIN,0,0 +538,TRAIN,0,0 +539,TRAIN,0,0 +540,TRAIN,0,0 +541,TRAIN,0,0 +542,TRAIN,0,0 +543,TRAIN,0,0 +544,TRAIN,0,0 +545,TRAIN,0,0 +546,TRAIN,0,0 +547,TRAIN,0,0 +548,TRAIN,0,0 +549,TRAIN,0,0 +550,TRAIN,0,0 +551,TRAIN,0,0 +552,TRAIN,0,0 +553,TRAIN,0,0 +554,TRAIN,0,0 +555,TRAIN,0,0 +556,TRAIN,0,0 +557,TRAIN,0,0 +558,TRAIN,0,0 +559,TRAIN,0,0 +560,TRAIN,0,0 +561,TRAIN,0,0 +562,TRAIN,0,0 +563,TRAIN,0,0 +564,TRAIN,0,0 +565,TRAIN,0,0 +566,TRAIN,0,0 +567,TRAIN,0,0 +568,TRAIN,0,0 +569,TRAIN,0,0 +570,TRAIN,0,0 +571,TRAIN,0,0 +572,TRAIN,0,0 +573,TRAIN,0,0 +574,TRAIN,0,0 +575,TRAIN,0,0 +576,TRAIN,0,0 +577,TRAIN,0,0 +578,TRAIN,0,0 +579,TRAIN,0,0 +580,TRAIN,0,0 +581,TRAIN,0,0 +582,TRAIN,0,0 +583,TRAIN,0,0 +584,TRAIN,0,0 +585,TRAIN,0,0 +586,TRAIN,0,0 +587,TRAIN,0,0 +588,TRAIN,0,0 +589,TRAIN,0,0 +590,TRAIN,0,0 +591,TRAIN,0,0 +592,TRAIN,0,0 +593,TRAIN,0,0 +594,TRAIN,0,0 +595,TRAIN,0,0 +596,TRAIN,0,0 +597,TRAIN,0,0 +598,TRAIN,0,0 +599,TRAIN,0,0 +600,TRAIN,0,0 +601,TRAIN,0,0 +602,TRAIN,0,0 +603,TRAIN,0,0 +604,TRAIN,0,0 +605,TRAIN,0,0 +606,TRAIN,0,0 +607,TRAIN,0,0 +608,TRAIN,0,0 +609,TRAIN,0,0 +610,TRAIN,0,0 +611,TRAIN,0,0 +612,TRAIN,0,0 +613,TRAIN,0,0 +614,TRAIN,0,0 +615,TRAIN,0,0 +616,TRAIN,0,0 +617,TRAIN,0,0 +618,TRAIN,0,0 +619,TRAIN,0,0 +620,TRAIN,0,0 +621,TRAIN,0,0 +622,TRAIN,0,0 +623,TRAIN,0,0 +624,TRAIN,0,0 +625,TRAIN,0,0 +626,TRAIN,0,0 +627,TRAIN,0,0 +628,TRAIN,0,0 +629,TRAIN,0,0 +630,TRAIN,0,0 +631,TRAIN,0,0 +632,TRAIN,0,0 +633,TRAIN,0,0 +634,TRAIN,0,0 +635,TRAIN,0,0 +636,TRAIN,0,0 
+637,TRAIN,0,0 +638,TRAIN,0,0 +639,TRAIN,0,0 +640,TRAIN,0,0 +641,TRAIN,0,0 +642,TRAIN,0,0 +643,TRAIN,0,0 +644,TRAIN,0,0 +645,TRAIN,0,0 +646,TRAIN,0,0 +647,TRAIN,0,0 +648,TRAIN,0,0 +649,TRAIN,0,0 +650,TRAIN,0,0 +651,TRAIN,0,0 +652,TRAIN,0,0 +653,TRAIN,0,0 +654,TRAIN,0,0 +655,TRAIN,0,0 +656,TRAIN,0,0 +657,TRAIN,0,0 +658,TRAIN,0,0 +659,TRAIN,0,0 +660,TRAIN,0,0 +661,TRAIN,0,0 +662,TRAIN,0,0 +663,TRAIN,0,0 +664,TRAIN,0,0 +665,TRAIN,0,0 +666,TRAIN,0,0 +667,TRAIN,0,0 +668,TRAIN,0,0 +669,TRAIN,0,0 +670,TRAIN,0,0 +671,TRAIN,0,0 +672,TRAIN,0,0 +673,TRAIN,0,0 +674,TRAIN,0,0 +675,TRAIN,0,0 +676,TRAIN,0,0 +677,TRAIN,0,0 +678,TRAIN,0,0 +679,TRAIN,0,0 +680,TRAIN,0,0 +681,TRAIN,0,0 +682,TRAIN,0,0 +683,TRAIN,0,0 +684,TRAIN,0,0 +685,TRAIN,0,0 +686,TRAIN,0,0 +687,TRAIN,0,0 +688,TRAIN,0,0 +689,TRAIN,0,0 +690,TRAIN,0,0 +691,TRAIN,0,0 +692,TRAIN,0,0 +693,TRAIN,0,0 +694,TRAIN,0,0 +695,TRAIN,0,0 +696,TRAIN,0,0 +697,TRAIN,0,0 +698,TRAIN,0,0 +699,TRAIN,0,0 +700,TRAIN,0,0 +701,TRAIN,0,0 +702,TRAIN,0,0 +703,TRAIN,0,0 +704,TRAIN,0,0 +705,TRAIN,0,0 +706,TRAIN,0,0 +707,TRAIN,0,0 +708,TRAIN,0,0 +709,TRAIN,0,0 +710,TRAIN,0,0 +711,TRAIN,0,0 +712,TRAIN,0,0 +713,TRAIN,0,0 +714,TRAIN,0,0 +715,TRAIN,0,0 +716,TRAIN,0,0 +717,TRAIN,0,0 +718,TRAIN,0,0 +719,TRAIN,0,0 +720,TRAIN,0,0 +721,TRAIN,0,0 +722,TRAIN,0,0 +723,TRAIN,0,0 +724,TRAIN,0,0 +725,TRAIN,0,0 +726,TRAIN,0,0 +727,TRAIN,0,0 +728,TRAIN,0,0 +729,TRAIN,0,0 +730,TRAIN,0,0 +731,TRAIN,0,0 +732,TRAIN,0,0 +733,TRAIN,0,0 +734,TRAIN,0,0 +735,TRAIN,0,0 +736,TRAIN,0,0 +737,TRAIN,0,0 +738,TRAIN,0,0 +739,TRAIN,0,0 +740,TRAIN,0,0 +741,TRAIN,0,0 +742,TRAIN,0,0 +743,TRAIN,0,0 +744,TRAIN,0,0 +745,TRAIN,0,0 +746,TRAIN,0,0 +747,TRAIN,0,0 +748,TRAIN,0,0 +749,TRAIN,0,0 +750,TRAIN,0,0 +751,TRAIN,0,0 +752,TRAIN,0,0 +753,TRAIN,0,0 +754,TRAIN,0,0 +755,TRAIN,0,0 +756,TRAIN,0,0 +757,TRAIN,0,0 +758,TRAIN,0,0 +759,TRAIN,0,0 +760,TRAIN,0,0 +761,TRAIN,0,0 +762,TRAIN,0,0 +763,TRAIN,0,0 +764,TRAIN,0,0 +765,TRAIN,0,0 +766,TRAIN,0,0 +767,TRAIN,0,0 +768,TRAIN,0,0 +769,TRAIN,0,0 +770,TRAIN,0,0 +771,TRAIN,0,0 +772,TRAIN,0,0 +773,TRAIN,0,0 +774,TRAIN,0,0 +775,TRAIN,0,0 +776,TRAIN,0,0 +777,TRAIN,0,0 +778,TRAIN,0,0 +779,TRAIN,0,0 +780,TRAIN,0,0 +781,TRAIN,0,0 +782,TRAIN,0,0 +783,TRAIN,0,0 +784,TRAIN,0,0 +785,TRAIN,0,0 +786,TRAIN,0,0 +787,TRAIN,0,0 +788,TRAIN,0,0 +789,TRAIN,0,0 +790,TRAIN,0,0 +791,TRAIN,0,0 +792,TRAIN,0,0 +793,TRAIN,0,0 +794,TRAIN,0,0 +795,TRAIN,0,0 +796,TRAIN,0,0 +797,TRAIN,0,0 +798,TRAIN,0,0 +799,TRAIN,0,0 +800,TRAIN,0,0 +801,TRAIN,0,0 +802,TRAIN,0,0 +803,TRAIN,0,0 +804,TRAIN,0,0 +805,TRAIN,0,0 +806,TRAIN,0,0 +807,TRAIN,0,0 +808,TRAIN,0,0 +809,TRAIN,0,0 +810,TRAIN,0,0 +811,TRAIN,0,0 +812,TRAIN,0,0 +813,TRAIN,0,0 +814,TRAIN,0,0 +815,TRAIN,0,0 +816,TRAIN,0,0 +817,TRAIN,0,0 +818,TRAIN,0,0 +819,TRAIN,0,0 +820,TRAIN,0,0 +821,TRAIN,0,0 +822,TRAIN,0,0 +823,TRAIN,0,0 +824,TRAIN,0,0 +825,TRAIN,0,0 +826,TRAIN,0,0 +827,TRAIN,0,0 +828,TRAIN,0,0 +829,TRAIN,0,0 +830,TRAIN,0,0 +831,TRAIN,0,0 +832,TRAIN,0,0 +833,TRAIN,0,0 +834,TRAIN,0,0 +835,TRAIN,0,0 +836,TRAIN,0,0 +837,TRAIN,0,0 +838,TRAIN,0,0 +839,TRAIN,0,0 +840,TRAIN,0,0 +841,TRAIN,0,0 +842,TRAIN,0,0 +843,TRAIN,0,0 +844,TRAIN,0,0 +845,TRAIN,0,0 +846,TRAIN,0,0 +847,TRAIN,0,0 +848,TRAIN,0,0 +849,TRAIN,0,0 +850,TRAIN,0,0 +851,TRAIN,0,0 +852,TRAIN,0,0 +853,TRAIN,0,0 +854,TRAIN,0,0 +855,TRAIN,0,0 +856,TRAIN,0,0 +857,TRAIN,0,0 +858,TRAIN,0,0 +859,TRAIN,0,0 +860,TRAIN,0,0 +861,TRAIN,0,0 +862,TRAIN,0,0 +863,TRAIN,0,0 +864,TRAIN,0,0 +865,TRAIN,0,0 +866,TRAIN,0,0 +867,TRAIN,0,0 +868,TRAIN,0,0 +869,TRAIN,0,0 +870,TRAIN,0,0 +871,TRAIN,0,0 +872,TRAIN,0,0 +873,TRAIN,0,0 
+874,TRAIN,0,0 +875,TRAIN,0,0 +876,TRAIN,0,0 +877,TRAIN,0,0 +878,TRAIN,0,0 +879,TRAIN,0,0 +880,TRAIN,0,0 +881,TRAIN,0,0 +882,TRAIN,0,0 +883,TRAIN,0,0 +884,TRAIN,0,0 +885,TRAIN,0,0 +886,TRAIN,0,0 +887,TRAIN,0,0 +888,TRAIN,0,0 +889,TRAIN,0,0 +890,TRAIN,0,0 +891,TRAIN,0,0 +892,TRAIN,0,0 +893,TRAIN,0,0 +894,TRAIN,0,0 +895,TRAIN,0,0 +896,TRAIN,0,0 +897,TRAIN,0,0 +898,TRAIN,0,0 +899,TRAIN,0,0 +900,TRAIN,0,0 +901,TRAIN,0,0 +902,TRAIN,0,0 +903,TRAIN,0,0 +904,TRAIN,0,0 +905,TRAIN,0,0 +906,TRAIN,0,0 +907,TRAIN,0,0 +908,TRAIN,0,0 +909,TRAIN,0,0 +910,TRAIN,0,0 +911,TRAIN,0,0 +912,TRAIN,0,0 +913,TRAIN,0,0 +914,TRAIN,0,0 +915,TRAIN,0,0 +916,TRAIN,0,0 +917,TRAIN,0,0 +918,TRAIN,0,0 +919,TRAIN,0,0 +920,TRAIN,0,0 +921,TRAIN,0,0 +922,TRAIN,0,0 +923,TRAIN,0,0 +924,TRAIN,0,0 +925,TRAIN,0,0 +926,TRAIN,0,0 +927,TRAIN,0,0 +928,TRAIN,0,0 +929,TRAIN,0,0 +930,TRAIN,0,0 +931,TRAIN,0,0 +932,TRAIN,0,0 +933,TRAIN,0,0 +934,TRAIN,0,0 +935,TRAIN,0,0 +936,TRAIN,0,0 +937,TRAIN,0,0 +938,TRAIN,0,0 +939,TRAIN,0,0 +940,TRAIN,0,0 +941,TRAIN,0,0 +942,TRAIN,0,0 +943,TRAIN,0,0 +944,TRAIN,0,0 +945,TRAIN,0,0 +946,TRAIN,0,0 +947,TRAIN,0,0 +948,TRAIN,0,0 +949,TRAIN,0,0 +950,TRAIN,0,0 +951,TRAIN,0,0 +952,TRAIN,0,0 +953,TRAIN,0,0 +954,TRAIN,0,0 +955,TRAIN,0,0 +956,TRAIN,0,0 +957,TRAIN,0,0 +958,TRAIN,0,0 +959,TRAIN,0,0 +960,TRAIN,0,0 +961,TRAIN,0,0 +962,TRAIN,0,0 +963,TRAIN,0,0 +964,TRAIN,0,0 +965,TRAIN,0,0 +966,TRAIN,0,0 +967,TRAIN,0,0 +968,TRAIN,0,0 +969,TRAIN,0,0 +970,TRAIN,0,0 +971,TRAIN,0,0 +972,TRAIN,0,0 +973,TRAIN,0,0 +974,TRAIN,0,0 +975,TRAIN,0,0 +976,TRAIN,0,0 +977,TRAIN,0,0 +978,TRAIN,0,0 +979,TRAIN,0,0 +980,TRAIN,0,0 +981,TRAIN,0,0 +982,TRAIN,0,0 +983,TRAIN,0,0 +984,TRAIN,0,0 +985,TRAIN,0,0 +986,TRAIN,0,0 +987,TRAIN,0,0 +988,TRAIN,0,0 +989,TRAIN,0,0 +990,TRAIN,0,0 +991,TRAIN,0,0 +992,TRAIN,0,0 +993,TRAIN,0,0 +994,TRAIN,0,0 +995,TRAIN,0,0 +996,TRAIN,0,0 +997,TRAIN,0,0 +998,TRAIN,0,0 +999,TRAIN,0,0 +1000,TRAIN,0,0 +1001,TRAIN,0,0 +1002,TRAIN,0,0 +1003,TRAIN,0,0 +1004,TRAIN,0,0 +1005,TRAIN,0,0 +1006,TRAIN,0,0 +1007,TRAIN,0,0 +1008,TRAIN,0,0 +1009,TRAIN,0,0 +1010,TRAIN,0,0 +1011,TRAIN,0,0 +1012,TRAIN,0,0 +1013,TRAIN,0,0 +1014,TRAIN,0,0 +1015,TRAIN,0,0 +1016,TRAIN,0,0 +1017,TRAIN,0,0 +1018,TRAIN,0,0 +1019,TRAIN,0,0 +1020,TRAIN,0,0 +1021,TRAIN,0,0 +1022,TRAIN,0,0 +1023,TRAIN,0,0 +1024,TRAIN,0,0 +1025,TRAIN,0,0 +1026,TRAIN,0,0 +1027,TRAIN,0,0 +1028,TRAIN,0,0 +1029,TRAIN,0,0 +1030,TRAIN,0,0 +1031,TRAIN,0,0 +1032,TRAIN,0,0 +1033,TRAIN,0,0 +1034,TRAIN,0,0 +1035,TRAIN,0,0 +1036,TRAIN,0,0 +1037,TRAIN,0,0 +1038,TRAIN,0,0 +1039,TRAIN,0,0 +1040,TRAIN,0,0 +1041,TRAIN,0,0 +1042,TRAIN,0,0 +1043,TRAIN,0,0 +1044,TRAIN,0,0 +1045,TRAIN,0,0 +1046,TRAIN,0,0 +1047,TRAIN,0,0 +1048,TRAIN,0,0 +1049,TRAIN,0,0 +1050,TRAIN,0,0 +1051,TRAIN,0,0 +1052,TRAIN,0,0 +1053,TRAIN,0,0 +1054,TRAIN,0,0 +1055,TRAIN,0,0 +1056,TRAIN,0,0 +1057,TRAIN,0,0 +1058,TRAIN,0,0 +1059,TRAIN,0,0 +1060,TRAIN,0,0 +1061,TRAIN,0,0 +1062,TRAIN,0,0 +1063,TRAIN,0,0 +1064,TRAIN,0,0 +1065,TRAIN,0,0 +1066,TRAIN,0,0 +1067,TRAIN,0,0 +1068,TRAIN,0,0 +1069,TRAIN,0,0 +1070,TRAIN,0,0 +1071,TRAIN,0,0 +1072,TRAIN,0,0 +1073,TRAIN,0,0 +1074,TRAIN,0,0 +1075,TRAIN,0,0 +1076,TRAIN,0,0 +1077,TRAIN,0,0 +1078,TRAIN,0,0 +1079,TRAIN,0,0 +1080,TRAIN,0,0 +1081,TRAIN,0,0 +1082,TRAIN,0,0 +1083,TRAIN,0,0 +1084,TRAIN,0,0 +1085,TRAIN,0,0 +1086,TRAIN,0,0 +1087,TRAIN,0,0 +1088,TRAIN,0,0 +1089,TRAIN,0,0 +1090,TRAIN,0,0 +1091,TRAIN,0,0 +1092,TRAIN,0,0 +1093,TRAIN,0,0 +1094,TRAIN,0,0 +1095,TRAIN,0,0 +1096,TRAIN,0,0 +1097,TRAIN,0,0 +1098,TRAIN,0,0 +1099,TRAIN,0,0 +1100,TRAIN,0,0 +1101,TRAIN,0,0 +1102,TRAIN,0,0 +1103,TRAIN,0,0 
+1104,TRAIN,0,0 +1105,TRAIN,0,0 +1106,TRAIN,0,0 +1107,TRAIN,0,0 +1108,TRAIN,0,0 +1109,TRAIN,0,0 +1110,TRAIN,0,0 +1111,TRAIN,0,0 +1112,TRAIN,0,0 +1113,TRAIN,0,0 +1114,TRAIN,0,0 +1115,TRAIN,0,0 +1116,TRAIN,0,0 +1117,TRAIN,0,0 +1118,TRAIN,0,0 +1119,TRAIN,0,0 +1120,TRAIN,0,0 +1121,TRAIN,0,0 +1122,TRAIN,0,0 +1123,TRAIN,0,0 +1124,TRAIN,0,0 +1125,TRAIN,0,0 +1126,TRAIN,0,0 +1127,TRAIN,0,0 +1128,TRAIN,0,0 +1129,TRAIN,0,0 +1130,TRAIN,0,0 +1131,TRAIN,0,0 +1132,TRAIN,0,0 +1133,TRAIN,0,0 +1134,TRAIN,0,0 +1135,TRAIN,0,0 +1136,TRAIN,0,0 +1137,TRAIN,0,0 +1138,TRAIN,0,0 +1139,TRAIN,0,0 +1140,TRAIN,0,0 +1141,TRAIN,0,0 +1142,TRAIN,0,0 +1143,TRAIN,0,0 +1144,TRAIN,0,0 +1145,TRAIN,0,0 +1146,TRAIN,0,0 +1147,TRAIN,0,0 +1148,TRAIN,0,0 +1149,TRAIN,0,0 +1150,TRAIN,0,0 +1151,TRAIN,0,0 +1152,TRAIN,0,0 +1153,TRAIN,0,0 +1154,TRAIN,0,0 +1155,TRAIN,0,0 +1156,TRAIN,0,0 +1157,TRAIN,0,0 +1158,TRAIN,0,0 +1159,TRAIN,0,0 +1160,TRAIN,0,0 +1161,TRAIN,0,0 +1162,TRAIN,0,0 +1163,TRAIN,0,0 +1164,TRAIN,0,0 +1165,TRAIN,0,0 +1166,TRAIN,0,0 +1167,TRAIN,0,0 +1168,TRAIN,0,0 +1169,TRAIN,0,0 +1170,TRAIN,0,0 +1171,TRAIN,0,0 +1172,TRAIN,0,0 +1173,TRAIN,0,0 +1174,TRAIN,0,0 +1175,TRAIN,0,0 +1176,TRAIN,0,0 +1177,TRAIN,0,0 +1178,TRAIN,0,0 +1179,TRAIN,0,0 +1180,TRAIN,0,0 +1181,TRAIN,0,0 +1182,TRAIN,0,0 +1183,TRAIN,0,0 +1184,TRAIN,0,0 +1185,TRAIN,0,0 +1186,TRAIN,0,0 +1187,TRAIN,0,0 +1188,TRAIN,0,0 +1189,TRAIN,0,0 +1190,TRAIN,0,0 +1191,TRAIN,0,0 +1192,TRAIN,0,0 +1193,TRAIN,0,0 +1194,TRAIN,0,0 +1195,TRAIN,0,0 +1196,TRAIN,0,0 +1197,TRAIN,0,0 +1198,TRAIN,0,0 +1199,TRAIN,0,0 +1200,TRAIN,0,0 +1201,TRAIN,0,0 +1202,TRAIN,0,0 +1203,TRAIN,0,0 +1204,TRAIN,0,0 +1205,TRAIN,0,0 +1206,TRAIN,0,0 +1207,TRAIN,0,0 +1208,TRAIN,0,0 +1209,TRAIN,0,0 +1210,TRAIN,0,0 +1211,TRAIN,0,0 +1212,TRAIN,0,0 +1213,TRAIN,0,0 +1214,TRAIN,0,0 +1215,TRAIN,0,0 +1216,TRAIN,0,0 +1217,TRAIN,0,0 +1218,TRAIN,0,0 +1219,TRAIN,0,0 +1220,TRAIN,0,0 +1221,TRAIN,0,0 +1222,TRAIN,0,0 +1223,TRAIN,0,0 +1224,TRAIN,0,0 +1225,TRAIN,0,0 +1226,TRAIN,0,0 +1227,TRAIN,0,0 +1228,TRAIN,0,0 +1229,TRAIN,0,0 +1230,TRAIN,0,0 +1231,TRAIN,0,0 +1232,TRAIN,0,0 +1233,TRAIN,0,0 +1234,TRAIN,0,0 +1235,TRAIN,0,0 +1236,TRAIN,0,0 +1237,TRAIN,0,0 +1238,TRAIN,0,0 +1239,TRAIN,0,0 +1240,TRAIN,0,0 +1241,TRAIN,0,0 +1242,TRAIN,0,0 +1243,TRAIN,0,0 +1244,TRAIN,0,0 +1245,TRAIN,0,0 +1246,TRAIN,0,0 +1247,TRAIN,0,0 +1248,TRAIN,0,0 +1249,TRAIN,0,0 +1250,TRAIN,0,0 +1251,TRAIN,0,0 +1252,TRAIN,0,0 +1253,TRAIN,0,0 +1254,TRAIN,0,0 +1255,TRAIN,0,0 +1256,TRAIN,0,0 +1257,TRAIN,0,0 +1258,TRAIN,0,0 +1259,TRAIN,0,0 diff --git a/datasets/anomaly_reserve/yahoo_sub_5/TRAIN/problem_TRAIN/problemDoc.json b/datasets/anomaly_reserve/yahoo_sub_5/TRAIN/problem_TRAIN/problemDoc.json new file mode 100644 index 0000000..417cb6b --- /dev/null +++ b/datasets/anomaly_reserve/yahoo_sub_5/TRAIN/problem_TRAIN/problemDoc.json @@ -0,0 +1,65 @@ +{ + "about": { + "problemID": "yahoo_sub_5_problem", + "problemName": "yahoo_sub_5_problem", + "problemDescription": "Anomaly detection", + "problemVersion": "4.0.0", + "problemSchemaVersion": "4.0.0", + "taskKeywords": [ + "classification", + "binary", + "tabular" + ] + }, + "inputs": { + "data": [ + { + "datasetID": "yahoo_sub_5_dataset", + "targets": [ + { + "targetIndex": 0, + "resID": "learningData", + "colIndex": 7, + "colName": "ground_truth" + } + ] + } + ], + "dataSplits": { + "method": "holdOut", + "testSize": 0.2, + "stratified": true, + "numRepeats": 0, + "randomSeed": 42, + "splitsFile": "dataSplits.csv", + "datasetViewMaps": { + "train": [ + { + "from": "yahoo_sub_5_dataset", + "to": "yahoo_sub_5_dataset_TRAIN" + } + 
], + "test": [ + { + "from": "yahoo_sub_5_dataset", + "to": "yahoo_sub_5_dataset_TEST" + } + ], + "score": [ + { + "from": "yahoo_sub_5_dataset", + "to": "yahoo_sub_5_dataset_SCORE" + } + ] + } + }, + "performanceMetrics": [ + { + "metric": "f1Macro" + } + ] + }, + "expectedOutputs": { + "predictionsFile": "predictions.csv" + } +} \ No newline at end of file diff --git a/datasets/anomaly_reserve/yahoo_sub_5/yahoo_sub_5_dataset/datasetDoc.json b/datasets/anomaly_reserve/yahoo_sub_5/yahoo_sub_5_dataset/datasetDoc.json new file mode 100644 index 0000000..08f39bf --- /dev/null +++ b/datasets/anomaly_reserve/yahoo_sub_5/yahoo_sub_5_dataset/datasetDoc.json @@ -0,0 +1,95 @@ +{ + "about": { + "datasetID": "yahoo_sub_5_dataset", + "datasetName": "yahoo_sub_5", + "description": "Database of baseball players and play statistics, including 'Games_played', 'At_bats', 'Runs', 'Hits', 'Doubles', 'Triples', 'Home_runs', 'RBIs', 'Walks', 'Strikeouts', 'Batting_average', 'On_base_pct', 'Slugging_pct' and 'Fielding_ave'", + "citation": " @book{simonoff2003analyzing,title={Analyzing Categorical Data},author={Simonoff, J.S.},isbn={9780387007496},lccn={2003044946},series={Springer Texts in Statistics},url={https://books.google.com/books?id=G8wrifweAoC},year={2003},publisher={Springer New York}} ", + "license": " CC Public Domain Mark 1.0 ", + "source": "OpenML", + "sourceURI": "http://www.openml.org/d/185", + "approximateSize": "", + "datasetSchemaVersion": "4.0.0", + "redacted": false, + "datasetVersion": "4.0.0" + }, + "dataResources": [ + { + "resID": "learningData", + "resPath": "tables/learningData.csv", + "resType": "table", + "resFormat": { + "text/csv": [ + "csv" + ] + }, + "isCollection": false, + "columns": [ + { + "colIndex": 0, + "colName": "d3mIndex", + "colType": "integer", + "role": [ + "index" + ] + }, + { + "colIndex": 1, + "colName": "timestamp", + "colType": "integer", + "role": [ + "attribute" + ] + }, + { + "colIndex": 2, + "colName": "value_0", + "colType": "real", + "role": [ + "attribute" + ] + }, + { + "colIndex": 3, + "colName": "value_1", + "colType": "real", + "role": [ + "attribute" + ] + }, + { + "colIndex": 4, + "colName": "value_2", + "colType": "real", + "role": [ + "attribute" + ] + }, + { + "colIndex": 5, + "colName": "value_3", + "colType": "real", + "role": [ + "attribute" + ] + }, + { + "colIndex": 6, + "colName": "value_4", + "colType": "real", + "role": [ + "attribute" + ] + }, + { + "colIndex": 7, + "colName": "ground_truth", + "colType": "integer", + "role": [ + "suggestedTarget" + ] + } + ], + "columnsCount": 8 + } + ] +} \ No newline at end of file diff --git a/datasets/anomaly_reserve/yahoo_sub_5/yahoo_sub_5_dataset/tables/learningData.csv b/datasets/anomaly_reserve/yahoo_sub_5/yahoo_sub_5_dataset/tables/learningData.csv new file mode 100644 index 0000000..afae224 --- /dev/null +++ b/datasets/anomaly_reserve/yahoo_sub_5/yahoo_sub_5_dataset/tables/learningData.csv @@ -0,0 +1,1401 @@ +d3mIndex,timestamp,value_0,value_1,value_2,value_3,value_4,ground_truth +0,1,12183,0.0,3.7166666666667,5,2109,0 +1,2,12715,0.091757964510557,3.6108333333333,60,3229,0 +2,3,12736,0.17229675238449998,3.4813888888889,88,3637,0 +3,4,12716,0.22621935431999,3.3802777777778,84,1982,0 +4,5,12739,0.17635798469946,3.1933333333333,111,2751,0 +5,6,12737,0.090491245476051,2.7866666666667004,112,2128,0 +6,7,12857,0.08460994072769001,2.4627777777777995,1235,2109,0 +7,8,12884,0.06842699169496,2.2541666666667,710,2328,0 +8,9,12894,0.13330269689422,2.1180555555556,618,2453,0 
+9,10,12675,0.085026586189321,2.0691666666667,84,2847,0 +10,11,13260,0.097073068447328,2.1972222222222,100,3659,0 +11,12,13470,0.0,2.3188888888889,125,5207,0 +12,13,13060,0.031063767542922,2.34,114,5146,0 +13,14,12949,0.017732750501525,2.4902777777778,145,4712,0 +14,15,13035,0.063354504072079,2.6438888888889,91,6363,0 +15,16,12980,0.087870391896335,2.8486111111111003,94,5010,0 +16,17,13677,0.11546815687729,2.8833333333333,79,3956,0 +17,18,13381,0.073413457727404,2.8808333333333,50,4063,0 +18,19,12737,0.040392584616896,2.9005555555556,39,3748,0 +19,20,12554,0.08911335594722301,3.0855555555556,28,3047,0 +20,21,12470,0.098030053711531,3.3536111111111,29,4099,0 +21,22,12490,0.047140641497552,3.7438888888889,24,2122,0 +22,23,12539,0.10481279080241,3.7947222222222,19,3387,0 +23,24,12530,0.20478886838928,3.801111111111101,21,1950,0 +24,25,13002,0.04485100631921201,3.6508333333333,27,2927,0 +25,26,12989,0.1053622140254,3.555,46,1889,0 +26,27,13038,0.08436887679639,3.4769444444444,133,1910,0 +27,28,13011,0.097980673762982,3.2158333333333,143,3747,0 +28,29,12984,0.10165726215275,3.1141666666667,86,4994,0 +29,30,13079,0.056764513454874,2.7983333333333,118,2009,0 +30,31,13048,0.074428708878932,2.4252777777778,56,2899,0 +31,32,13096,0.091244453451818,2.14,92,2298,0 +32,33,13003,0.094529332881679,1.9822222222222,85,1894,0 +33,34,13057,0.016638011234698,1.9694444444444,122,1999,0 +34,35,13023,0.038096861957006005,2.0741666666667,74,3007,0 +35,36,13033,0.064497814457643,2.2505555555556,84,2838,0 +36,37,13034,0.030426401876334,2.2819444444444,54,4113,0 +37,38,13068,0.095423209955973,2.4216666666667,77,2150,0 +38,39,13057,0.069688744272108,2.5997222222222005,84,3007,0 +39,40,13047,0.03468622413034,2.7544444444444003,139,2484,0 +40,41,13795,0.089564461084836,2.7258333333333,65,2101,0 +41,42,13528,0.07337616196456799,2.8302777777778,38,2001,0 +42,43,13032,0.061939295606039,2.9422222222222,35,2102,0 +43,44,13084,0.11419089175512,3.0919444444444,47,2129,0 +44,45,13000,0.10475925920163,3.3519444444444,37,4422,0 +45,46,13008,0.079657960399444,3.6952777777778,53,4573,0 +46,47,12978,0.14475546275416,3.8269444444444,55,1989,0 +47,48,13067,0.1421711341096,3.7877777777778,45,1953,0 +48,49,13086,0.07696963969656899,3.7536111111111,46,1872,0 +49,50,13023,0.06393273436444799,3.61,35,1850,0 +50,51,13046,0.14973281021845006,3.5091666666667,68,2879,0 +51,52,13032,0.041478839355346,3.4205555555556,82,1840,0 +52,53,13012,0.089317973365284,3.2647222222222,154,2134,0 +53,54,13051,0.088820248166203,2.7944444444444,128,2234,0 +54,55,12979,0.054872994406929,2.46,79,3769,0 +55,56,13025,0.07913553329046401,2.2075,66,2717,0 +56,57,13007,0.16317996709063,2.1758333333333,92,2171,0 +57,58,13036,0.08671926699280201,2.3058333333333,67,2224,0 +58,59,13043,0.0733999511789,2.3983333333333,58,1967,0 +59,60,13023,0.0,2.55,58,2148,0 +60,61,13022,0.032756244361869,2.7302777777778,63,1978,0 +61,62,13033,0.054893891024455,2.8169444444444003,61,2021,0 +62,63,13024,0.068514114108229,2.9247222222222,55,2060,0 +63,64,13048,0.05279414163165401,2.8911111111111003,71,2096,0 +64,65,13740,0.023853017353212,2.9575,64,2082,0 +65,66,13540,0.07426125441559799,2.9080555555556,92,2175,0 +66,67,12724,0.024228588329879,3.0088888888889,44,2332,0 +67,68,13070,0.09233413002519697,3.2033333333333,35,2147,0 +68,69,13106,0.15930655332113,3.6213888888889,53,2163,0 +69,70,13025,0.12755838225296,4.0322222222222,49,2406,0 +70,71,13074,0.10152541717054,4.1227777777778,49,2022,0 +71,72,13079,0.040148453968243986,3.9736111111111,103,2188,0 
+72,73,13184,0.087208372094752,3.8425,107,2758,0 +73,74,13194,0.074209918996797,3.7097222222222,74,2925,0 +74,75,13191,0.059044537369404015,3.6258333333333,56,3223,0 +75,76,13059,0.06248169832921499,3.4705555555556,60,2507,0 +76,77,13169,0.08876527685714597,3.2877777777778,73,2435,0 +77,78,13114,0.051354431854972,2.9286111111111004,99,2552,0 +78,79,13037,0.074790104163639,2.4888888888889,84,2540,0 +79,80,13179,0.091817341555971,2.2744444444444,129,2642,0 +80,81,13152,0.14762794333026005,2.1733333333333,101,2254,0 +81,82,13095,0.07101004447510299,2.3416666666667,101,2539,0 +82,83,13144,0.07689756334240598,2.3808333333333,51,2596,0 +83,84,13170,0.08412575787388403,2.4663888888889,95,2573,0 +84,85,13162,0.06328921386603299,2.6608333333333,48,2302,0 +85,86,13117,0.057393902128707,2.7558333333333,40,2991,0 +86,87,13129,0.041819399065704,2.8636111111111004,55,3141,0 +87,88,13386,0.073729686380986,2.7586111111111005,56,3285,0 +88,89,13929,0.15365285617975,2.7377777777778,935,3807,0 +89,90,13385,0.060355859742407016,2.6961111111111005,34,2892,0 +90,91,13106,0.10644586288975,2.8569444444444,57,2538,0 +91,92,13113,0.059314286360126985,3.1833333333333,70,2234,0 +92,93,13155,0.096293806236591,3.5544444444444,72,2707,0 +93,94,13186,0.085101425467407,3.8894444444444,66,2382,0 +94,95,13151,0.11149072274185,4.1138888888889,72,2426,0 +95,96,13156,0.076266981262989,3.9519444444444,49,2451,0 +96,97,12813,0.097952120177625,3.8275,41,2288,0 +97,98,12821,0.17250021935572,3.6438888888889,42,2256,0 +98,99,12867,0.11389182319254,3.5608333333333,39,2884,0 +99,100,12837,0.08999961787521,3.5013888888889,81,2398,0 +100,101,12911,0.048649372449385005,3.3088888888889,90,2239,0 +101,102,12842,0.13861764684085998,2.9063888888889,92,2248,0 +102,103,12905,0.1088795585287,2.5027777777777995,81,2387,0 +103,104,12993,0.054235162564995,2.2466666666667003,145,3876,0 +104,105,12974,0.0390040506742,2.1869444444444,47,3073,0 +105,106,13039,0.0744713077811,2.2402777777778,63,3113,0 +106,107,13322,0.040258943675435,2.3727777777778,118,3363,0 +107,108,13606,0.0,2.4566666666667003,56,3796,0 +108,109,13536,0.027955712584728,2.5452777777777995,127,4924,0 +109,110,13341,0.047309968420241,2.6830555555556,48,4300,0 +110,111,13360,0.016602764360002,2.805,114,5225,0 +111,112,13450,0.042432577628353986,2.7386111111111004,78,4047,0 +112,113,14102,0.051191743726563,2.7438888888888995,58,4134,0 +113,114,14026,0.0,2.7586111111111005,56,4786,0 +114,115,13162,0.056724832354639,2.9013888888889,67,4184,0 +115,116,13118,0.055771058827737,3.19,155,2888,0 +116,117,12953,0.081014772096658,3.5561111111111003,123,2674,0 +117,118,12854,0.08253629738290899,3.8433333333333,118,2574,0 +118,119,12952,0.11499203730886,4.0319444444444,133,3123,0 +119,120,12915,0.07668513845109799,3.8844444444444,75,3369,0 +120,121,11994,0.070057457403873,3.6908333333333,29,3284,0 +121,122,11868,0.07031477357556501,3.6141666666667,68,2127,0 +122,123,11977,0.091946448716499,3.5019444444444,91,2117,0 +123,124,11874,0.14560588482235998,3.4205555555556,101,2271,0 +124,125,11913,0.094774329323472,3.1780555555556,22,2513,0 +125,126,11933,0.10217989327054,2.8361111111111,20,2746,0 +126,127,11844,0.04854243074027901,2.5222222222222004,27,2076,0 +127,128,11968,0.068760549683423,2.2416666666667004,45,2297,0 +128,129,11996,0.075440683881139,2.1588888888889,42,2312,0 +129,130,12006,0.11771339431815,2.2763888888889,59,2834,0 +130,131,12225,0.069437397660265,2.3391666666667,52,3584,0 +131,132,12482,0.0,2.4841666666667,62,4009,0 +132,133,12289,0.0,2.4911111111111,81,4142,0 
+133,134,12219,0.0,2.6922222222222,84,3876,0 +134,135,12282,0.027395404320488,2.8205555555556,104,4098,0 +135,136,12367,0.055202605299814,2.8216666666667,111,3831,0 +136,137,13042,0.078387348178452,2.7122222222222,91,3842,0 +137,138,12665,0.11851571646444,2.6744444444444,33,4129,0 +138,139,12133,0.068395341911942,2.8097222222222,26,3509,0 +139,140,12023,0.04720597158087901,3.1838888888889,37,2450,0 +140,141,11847,0.07910648512645599,3.5130555555556,23,2270,0 +141,142,11980,0.067550601916344,3.7722222222222,29,2360,0 +142,143,12026,0.080666570182724,3.9058333333333,45,2431,0 +143,144,11852,0.044973875852863,3.7697222222222,49,2042,0 +144,145,12152,0.065734580284861,3.6027777777778,27,1833,0 +145,146,12148,0.068759646748575,3.5038888888889,46,1852,0 +146,147,12236,0.027278224398313,3.445,39,1927,0 +147,148,12155,0.067695565422881,3.3494444444444,72,1999,0 +148,149,12113,0.07244669924777,3.1961111111111005,81,2030,0 +149,150,12175,0.028882930937168,2.8905555555556,64,1963,0 +150,151,12103,0.021568136982842,2.5805555555556,79,2116,0 +151,152,12206,0.064254625408408,2.3380555555556004,132,2461,0 +152,153,12239,0.073869151016554,2.2116666666667,127,2388,0 +153,154,12398,0.026644044055307004,2.2013888888889,121,2846,0 +154,155,12582,0.051289858799957,2.3236111111111,98,2974,0 +155,156,12705,0.099217337562612,2.3002777777778,128,3776,0 +156,157,12555,0.016615805334675,2.385,158,3885,0 +157,158,12476,0.078387348178452,2.5597222222222005,78,3865,0 +158,159,12706,0.0,2.6941666666667,65,4319,0 +159,160,12671,0.049384244324413,2.7169444444444,81,4646,0 +160,161,13277,0.043044731483849,2.6369444444444,586,3873,0 +161,162,12757,0.04215504851616,2.6572222222222,48,3489,0 +162,163,12401,0.042236538352835,2.8466666666667004,38,2790,0 +163,164,12248,0.1001564296112,3.1955555555556,30,2641,0 +164,165,12156,0.17378132267942994,3.5633333333333,28,2960,0 +165,166,12210,0.12005519462968,3.8113888888889,36,2192,0 +166,167,11983,0.14491137762023998,3.9655555555556,50,2145,0 +167,168,12374,0.07336941078506799,3.8483333333333,47,2133,0 +168,169,12230,0.12395626148952,3.6441666666667,82,2330,0 +169,170,12200,0.15077430423660998,3.5213888888889,56,2235,0 +170,171,12135,0.18960071033689,3.4702777777778,140,2258,0 +171,172,12131,0.06051348935254,3.3033333333333,145,2200,0 +172,173,12165,0.072057993662839,3.1933333333333,114,2161,0 +173,174,12193,0.082361078437032,2.8183333333333,129,2159,0 +174,175,12165,0.12343775199876,2.52,143,2088,0 +175,176,12304,0.1071817784483,2.2886111111111,113,2473,0 +176,177,12275,0.10359394556779,2.0822222222222,108,3217,0 +177,178,12369,0.021162435488903,2.1416666666667,93,2994,0 +178,179,12569,0.074524398314698,2.2688888888889,63,3827,0 +179,180,12766,0.12687067454443,2.335,103,4176,0 +180,181,12621,0.041752618326160014,2.4388888888889,114,4227,0 +181,182,12611,0.0,2.5386111111111,67,4290,0 +182,183,12618,0.040819652463459,2.6288888888889,106,4691,0 +183,184,12631,0.082668981599835,2.7511111111111,160,4442,0 +184,185,13121,0.06181362481077901,2.7744444444444,81,5775,0 +185,186,12871,0.0,2.8297222222222,113,3840,0 +186,187,12252,0.076137992226715,2.9708333333333,37,3721,0 +187,188,12155,0.12107639529965,3.1333333333333,70,2498,0 +188,189,12186,0.0,3.3544444444444,82,2265,0 +189,190,12179,0.19840339729984,3.6780555555556,76,2451,0 +190,191,12109,0.20112394005693,3.8038888888889,59,2892,0 +191,192,12142,0.096833471661634,3.8177777777778,58,2166,0 +192,193,12145,0.10338450919956,3.6916666666667,49,2040,0 +193,194,12162,0.10142513773096,3.5197222222222,36,2013,0 
+194,195,12165,0.09779274451732,3.5186111111111003,111,2000,0 +195,196,12125,0.14744152252573,3.2597222222222,81,2117,0 +196,197,12097,0.083396348606149,3.0930555555556,92,2775,0 +197,198,12099,0.095637498006913,2.7825,113,2116,0 +198,199,12140,0.14768844039376006,2.4494444444444,90,1991,0 +199,200,12188,0.1131872329372,2.2369444444444,183,3162,0 +200,201,12157,0.073729686380986,2.0961111111111,117,2958,0 +201,202,12128,0.064614077523704,2.0377777777778,110,3153,0 +202,203,12190,0.056019959597275015,2.0730555555556003,179,2190,0 +203,204,12151,0.074812141908008,2.1655555555556,134,2172,0 +204,205,12214,0.02489388427845201,2.285,135,2074,0 +205,206,12275,0.023695834967821,2.4283333333333,100,2078,0 +206,207,12164,0.058680009072634,2.6186111111111,47,2406,0 +207,208,12120,0.10008779345816002,2.7372222222222,88,2018,0 +208,209,12693,0.066566772961868,2.8266666666667004,74,2091,0 +209,210,12624,0.070501147961051,2.8469444444444,58,2310,0 +210,211,12163,0.098779019649936,2.9855555555556,100,2113,0 +211,212,12100,0.11803653713501,3.1038888888889,49,2518,0 +212,213,12162,0.10076746585103,3.4058333333333,36,2605,0 +213,214,12106,0.053210709415363,3.6138888888889,40,2680,0 +214,215,12156,0.099346579713514,3.93,50,2228,0 +215,216,12120,0.047275248011591,3.8155555555556,58,2023,0 +216,217,12420,0.091262209791582,3.6588888888889,50,3702,0 +217,218,12417,0.038593218846488,3.5913888888889,53,1992,0 +218,219,12450,0.070273907645883,3.4644444444444003,93,1988,0 +219,220,12395,0.029431888410363,3.3944444444444,78,1919,0 +220,221,12382,0.096854769984307,3.2227777777778,84,2213,0 +221,222,12438,0.11656453357642,2.7961111111111,112,2181,0 +222,223,12363,0.12109055114779,2.4383333333333,73,2152,0 +223,224,12393,0.20381554615786,2.2647222222222005,91,2393,0 +224,225,12399,0.046311768005022014,2.1886111111111,114,2173,0 +225,226,12456,0.18261306403662,2.2825,127,2109,0 +226,227,12442,0.021992750543024,2.3333333333333,69,3606,0 +227,228,12481,0.088072259040681,2.445,59,2114,0 +228,229,12432,0.037896500450725,2.5811111111111,64,2135,0 +229,230,12403,0.09882843339863,2.7094444444444,75,2303,0 +230,231,12406,0.076277687882641,2.88,44,2137,0 +231,232,12462,0.022875979046571,2.8555555555556,52,2264,0 +232,233,13034,0.10022162220861,2.7791666666667,42,2245,0 +233,234,12830,0.08117200437078799,2.7772222222222,45,2151,0 +234,235,12439,0.09750667785645803,3.02,26,2330,0 +235,236,12541,0.05680722879784299,3.2213888888888995,29,3357,0 +236,237,12462,0.12240855732315,3.6211111111111,32,3152,0 +237,238,12394,0.1715485140175,4.0219444444444,44,2693,0 +238,239,12507,0.075015592829224,4.0980555555556,41,3798,0 +239,240,12512,0.11388410095531,3.9080555555556,42,4596,0 +240,241,12093,0.10519027968795,3.7269444444444,46,2529,0 +241,242,12197,0.1150532998405,3.6244444444444,40,2124,0 +242,243,12138,0.10890530980571,3.5252777777778,64,2762,0 +243,244,12174,0.099350621485086,3.4675,70,2973,0 +244,245,12163,0.12889794040441002,3.3316666666667003,69,3041,0 +245,246,12096,0.12069378235889,2.9497222222222,73,2179,0 +246,247,12166,0.13053034917739,2.5708333333333,85,2322,0 +247,248,12187,0.078977758004111,2.3086111111111,63,2274,0 +248,249,12246,0.08088416337864099,2.2311111111111,67,2448,0 +249,250,12335,0.04008956024204,2.3119444444444,68,3811,0 +250,251,12556,0.05063725351997099,2.3536111111111,62,3761,0 +251,252,12652,0.039066291775136,2.4819444444444,69,4269,0 +252,253,12646,0.028611752774164,2.6605555555556,82,4244,0 +253,254,12803,0.040593364983329,2.7527777777778,56,4417,0 
+254,255,12570,0.038807415292018,3.0741666666667005,38,3758,0 +255,256,12633,0.07832796288132203,2.8522222222222,30,4375,0 +256,257,13146,0.066320996162546,2.7277777777778,48,4158,0 +257,258,12994,0.083175583471284,2.7502777777778,63,3410,0 +258,259,12314,0.06802464587725401,2.8797222222222,34,2853,0 +259,260,12193,0.051675070535006,3.2027777777778,11,2628,0 +260,261,12127,0.044129112207997014,3.5633333333333,22,2287,0 +261,262,12140,0.037685894365982006,3.8808333333333,22,3334,0 +262,263,12174,0.093414561465838,4.0352777777778,12,2795,0 +263,264,12180,0.06987083046098,3.8966666666667,10,2089,0 +264,265,12861,0.021992750543024,3.7225,14,2260,0 +265,266,12957,0.11305566197523,3.73,39,3176,0 +266,267,12981,0.030884138240845,3.5558333333333,55,4049,0 +267,268,12958,0.10381377439313,3.3169444444444003,90,2902,0 +268,269,12913,0.048953768695625004,3.2322222222222,68,3743,0 +269,270,12939,0.042258794089861,2.8658333333333,95,4280,0 +270,271,12933,0.048388685585470985,2.5169444444444,70,3977,0 +271,272,13006,0.034197830567692,2.3,96,4518,0 +272,273,13091,0.08835953066771099,2.1888888888889,45,2707,0 +273,274,13201,0.086890518272785,2.2030555555556,96,3522,0 +274,275,13520,0.031087561676959,2.2711111111111,74,4584,0 +275,276,13675,0.071287463233942,2.4697222222222,82,4141,0 +276,277,13594,0.14372616993938,2.5988888888889,82,4831,0 +277,278,13466,0.12647517487142998,2.7258333333333,45,3991,0 +278,279,13448,0.042854531198562,2.7858333333333,134,4645,0 +279,280,13492,0.039930389849144,2.7922222222222,119,4967,0 +280,281,14123,0.076184645265048,2.6988888888889,86,4578,0 +281,282,13839,0.037830020408535,2.7663888888889,75,4972,0 +282,283,13335,0.030884138240845,2.8938888888889,45,5522,0 +283,284,13196,0.048316550276279,3.1875,50,2832,0 +284,285,13047,0.10986585566763,3.6463888888889,31,2826,0 +285,286,13008,0.025485002897852004,3.866666666666701,88,2855,0 +286,287,12763,0.12451757643335,3.9808333333333,42,2660,0 +287,288,12949,0.12875690949235,3.8277777777778,70,2447,0 +288,289,13009,0.15720639094135,3.6269444444444,106,2545,0 +289,290,13008,0.079092017261926,3.5266666666667,44,3842,0 +290,291,12890,0.14711499890479998,3.5077777777778,57,2332,0 +291,292,13004,0.0531410973178,3.3455555555556,95,2294,0 +292,293,12918,0.10136246281349,3.1241666666667003,91,3016,0 +293,294,12910,0.053119315802353,2.8713888888889,66,3944,0 +294,295,12915,0.11313351589999003,2.5133333333333,66,2332,0 +295,296,13121,0.076760188212735,2.2197222222222,82,2405,0 +296,297,13076,0.08890522133351199,2.205,73,2572,0 +297,298,13096,0.1009555130175,2.2677777777778,69,2558,0 +298,299,13339,0.15685427502807,2.2991666666667,107,3701,0 +299,300,13635,0.11090638960365,2.4277777777778,101,4228,0 +300,301,13493,0.054798089981891,2.5333333333333,66,3990,0 +301,302,13402,0.08461316628091001,2.6422222222222005,47,4707,0 +302,303,13417,0.15790425505315,2.8211111111111005,47,3857,0 +303,304,13382,0.021675109392134,2.7625,66,3874,0 +304,305,14199,0.14112049645292002,2.7391666666667,102,4369,0 +305,306,13973,0.059612111520904,2.7525,71,4488,0 +306,307,13284,0.067835890522602,2.8644444444444,53,3637,0 +307,308,13070,0.047414460026828,3.1927777777778,28,2705,0 +308,309,12983,0.050348669783997005,3.5872222222222,24,2429,0 +309,310,13075,0.07296715773193299,3.8305555555556,23,2839,0 +310,311,12991,0.10713527159169,3.8827777777778,30,2371,0 +311,312,12993,0.073622496612493,3.7291666666667,25,2758,0 +312,313,13121,0.11556476355437,3.6172222222222,29,2291,0 +313,314,13097,0.034160489683707995,3.4491666666667005,27,2220,0 
+314,315,13150,0.019571935182124,3.4097222222222,77,2620,0 +315,316,13078,0.15720996206912,3.2605555555556,46,2467,0 +316,317,13140,0.11515041454164,3.2191666666667,86,2088,0 +317,318,13102,0.086415715789296,2.9586111111111,97,2137,0 +318,319,13110,0.092606306920552,2.6036111111111,88,2907,0 +319,320,13138,0.046458579038692015,2.3319444444444,110,2558,0 +320,321,13238,0.10977831600416,2.2025,89,2823,0 +321,322,13317,0.11090009191451,2.2711111111111,134,2465,0 +322,323,13512,0.076652795374797,2.2897222222222005,84,4399,0 +323,324,13669,0.1087202400467,2.3297222222222005,109,4088,0 +324,325,13651,0.11471628863897,2.395,57,5099,0 +325,326,13580,0.11070024667119,2.5063888888889,49,5157,0 +326,327,13538,0.026827723134058,2.7077777777778,83,3782,0 +327,328,13657,0.029426630692549,2.735,101,4008,0 +328,329,14183,0.028611752774164,2.6958333333333,88,4534,0 +329,330,14117,0.053106181092382014,2.6930555555556,56,3242,0 +330,331,13166,0.055538160906184006,2.875,31,2808,0 +331,332,13265,0.11009690391165,3.1788888888888995,22,3676,0 +332,333,13085,0.10979978093137,3.5808333333333,32,3523,0 +333,334,13167,0.036174223284821,3.8508333333333,27,3038,0 +334,335,13170,0.048361321378982,3.9180555555556,17,2299,0 +335,336,13132,0.10958125953198,3.815,27,2345,0 +336,337,13055,0.047305343559722,3.6080555555556,38,2565,0 +337,338,13025,0.045316868664604014,3.4927777777778,73,2576,0 +338,339,13076,0.13255054531036,3.4316666666667004,56,2327,0 +339,340,13044,0.079695587369141,3.3436111111111004,49,2211,0 +340,341,13035,0.10277355185943,3.0663888888889,90,2642,0 +341,342,13103,0.15061124796385,2.7894444444444,106,3646,0 +342,343,13067,0.14509169704095,2.4994444444444,51,2281,0 +343,344,13183,0.054445250001619004,2.2544444444444,99,2474,0 +344,345,13144,0.082058799915824,2.0847222222222,104,2536,0 +345,346,13166,0.042151311782819015,2.0888888888889,119,2900,0 +346,347,13406,0.057404703309705984,2.1594444444444,73,3144,0 +347,348,13544,0.040891918425583,2.2533333333333,92,3725,0 +348,349,13608,0.045224636676715,2.3880555555556,57,4305,0 +349,350,13522,0.0,2.6338888888889,100,3665,0 +350,351,13595,0.0,2.6588888888889,93,3791,0 +351,352,13420,0.10335456693443,2.7586111111111005,111,3897,0 +352,353,14163,0.033846222120808,2.8797222222222,91,3494,0 +353,354,13678,0.026167129419328,2.785,43,3353,0 +354,355,13272,0.08571767780871499,2.8219444444444,91,2741,0 +355,356,13071,0.12459953631184,3.0055555555556,63,2463,0 +356,357,13004,0.054750658073534006,3.2936111111111,60,3477,0 +357,358,13068,0.20799106772677,3.5575,56,2792,0 +358,359,13031,0.10314231079956,3.676111111111101,59,2183,0 +359,360,13013,0.12212653292147,3.7166666666667,48,2874,0 +360,361,12998,0.19159058299176,3.6013888888889,65,2147,0 +361,362,12971,0.10782180851978,3.4455555555556,77,2754,0 +362,363,13000,0.06408869538637901,3.4166666666667003,60,2007,0 +363,364,12998,0.095540168894753,3.1791666666667004,94,2564,0 +364,365,12906,0.039360296791109,3.0013888888889,84,3020,0 +365,366,12969,0.086611479249287,2.72,99,2004,0 +366,367,12963,0.05845507441603001,2.4527777777778,61,2047,0 +367,368,12933,0.051490800079599004,2.1816666666667,60,3531,0 +368,369,12990,0.075496432869001,2.0161111111111,78,2383,0 +369,370,12980,0.10358625218721,1.9769444444444,81,2112,0 +370,371,12982,0.062806431427897,2.0597222222222,61,2554,0 +371,372,12989,0.08970338978685001,2.2111111111111,68,2371,0 +372,373,13073,0.094517316130968,2.3141666666667,53,2060,0 +373,374,12950,0.032322011663911,2.4280555555556003,49,2086,0 +374,375,12990,0.047911560407608,2.5855555555556,40,2130,0 
+375,376,13035,0.062001214431213,2.6977777777778,125,2072,0 +376,377,13681,0.027102718749392,2.7777777777778,61,2033,0 +377,378,13304,0.034703114844079,2.7988888888889,111,2683,0 +378,379,12965,0.066236017573192,2.8927777777778,32,2046,0 +379,380,12966,0.032230355211769,3.0413888888889,21,2064,0 +380,381,12943,0.11559664215716,3.3569444444444,14,2067,0 +381,382,12958,0.021952502374124,3.4808333333333,32,2496,0 +382,383,13005,0.13347711194703,3.764166666666701,29,4758,0 +383,384,12923,0.10579408349834,3.8097222222222,26,2806,0 +384,385,12812,0.10679035350244,3.6911111111111,52,2227,0 +385,386,12803,0.068633627680319,3.4902777777778,39,3123,0 +386,387,12850,0.04699518011436099,3.3769444444444,78,3460,0 +387,388,12797,0.14159640074335994,3.3011111111111004,78,3587,0 +388,389,12732,0.078500039299167,3.1369444444444,83,2558,0 +389,390,12817,0.049232295047845,2.8475,63,2306,0 +390,391,12818,0.078777592482879,2.4544444444444,108,2083,0 +391,392,12815,0.08993433499951,2.1247222222222,158,3073,0 +392,393,12805,0.081869163858473,2.0266666666667,115,3325,0 +393,394,12703,0.14556064903749,2.1763888888889,112,2321,0 +394,395,12771,0.0,2.3088888888889,73,2846,0 +395,396,12847,0.0,2.4213888888889,93,2482,0 +396,397,12872,0.030693547421212,2.6436111111111,65,2306,0 +397,398,12815,0.0,2.6602777777778,91,2298,0 +398,399,12844,0.046999447831427,2.7677777777778,106,2907,0 +399,400,12811,0.028815579681692,2.8066666666667004,66,2329,0 +400,401,13472,0.0,2.7661111111111003,26,2456,0 +401,402,13063,0.039360296791109,2.8133333333333,23,2178,0 +402,403,12833,0.039570832199428,2.9186111111111,24,2142,0 +403,404,12842,0.090659246308087,3.1930555555556,19,2277,0 +404,405,12804,0.10540579050057003,3.565,23,3066,0 +405,406,12852,0.062601610466313,3.9133333333333,30,3619,0 +406,407,12862,0.051455855638306,3.9658333333333,23,3726,0 +407,408,12799,0.054631758648785014,3.8930555555556,35,2282,0 +408,409,12789,0.09017822949731,3.7297222222222,41,3079,0 +409,410,12815,0.045287525091609014,3.6516666666667,63,2448,0 +410,411,12887,0.033344698319951,3.5927777777778,33,2574,0 +411,412,12903,0.080098394586215,3.4694444444444,50,3697,0 +412,413,12892,0.025162301034707,3.2536111111111,88,3067,0 +413,414,12907,0.078260793447992,2.8986111111111,115,3491,0 +414,415,12883,0.07223863924679201,2.4488888888889,69,3195,0 +415,416,12965,0.042917873674349,2.2119444444444,116,2763,0 +416,417,12932,0.04720597158087901,2.2011111111111,73,2605,0 +417,418,13134,0.048273008229067,2.2338888888889,75,2755,0 +418,419,13440,0.036987975876273,2.3116666666667003,56,3300,0 +419,420,13544,0.06291463671717,2.3869444444444,66,3838,0 +420,421,13508,0.033319304393751,2.5119444444444,70,3608,0 +421,422,13401,0.029115275623859,2.5713888888889,52,3845,0 +422,423,13410,0.06821638123436,2.5088888888889,32,3563,0 +423,424,13482,0.015408589348188,2.4155555555556,16,5478,0 +424,425,14124,0.01916018435633,3.6455555555556,46,3656,0 +425,426,13703,0.06374239746477901,2.4625,53,3491,0 +426,427,13250,0.099738890728803,2.5808333333333,67,3430,0 +427,428,13092,0.10950621554455,3.0033333333333,58,2807,0 +428,429,13012,0.06138920621589401,3.3486111111111003,17,2524,0 +429,430,12901,0.051307638060244014,3.6644444444444,26,2964,0 +430,431,12848,0.082471571552878,4.0083333333333,13,3969,0 +431,432,13025,0.060122448878635,3.8530555555556,8,3561,0 +432,433,11352,0.07469842969719999,3.6183333333333,20,3394,0 +433,434,8761,0.056170625137636994,3.4922222222222,23,3005,0 +434,435,10433,0.052668952946361,3.4958333333333,34,2350,0 
+435,436,10088,0.068871884486763,3.2738888888889,35,2139,0 +436,437,9485,0.040236057110938986,3.2102777777778,48,2098,0 +437,438,8865,0.053200012471363,2.8475,67,2341,0 +438,439,8920,0.056725172482788,2.4883333333332995,38,2698,0 +439,440,8798,0.035229341473877,2.1955555555556003,33,2968,0 +440,441,8927,0.0,2.1461111111111,40,2824,0 +441,442,9211,0.020190723068726,2.1522222222222,37,3003,0 +442,443,9286,0.093342961377898,2.3122222222222004,51,3551,0 +443,444,9725,0.0,2.4033333333333,52,4689,0 +444,445,11050,0.015717168144981003,2.4944444444444,57,3481,0 +445,446,11521,0.017190609993733997,2.6622222222222005,82,3376,0 +446,447,11603,0.0,2.675,74,3198,0 +447,448,11665,0.043273461915965,2.6997222222222,80,3059,0 +448,449,12153,0.029854520963498,2.6997222222222,78,2937,0 +449,450,11672,0.017383620014121998,2.7194444444444,58,2881,0 +450,451,11119,0.046391383573699006,2.8258333333333,41,2777,0 +451,452,11124,0.042155878228,3.1044444444444,34,2510,0 +452,453,10734,0.052684222339579014,3.4736111111111003,35,2356,0 +453,454,11612,0.063573954212613,3.6972222222222,40,2383,0 +454,455,11523,0.077413583128967,3.8038888888889,35,2455,0 +455,456,11632,0.069605078732108,3.7494444444444,37,2285,0 +456,457,12838,0.075937967855042,3.6813888888889,43,2455,0 +457,458,11637,0.047354002438352014,3.4791666666667003,45,4298,0 +458,459,12542,0.044000040388062,3.4530555555556,48,2400,0 +459,460,12394,0.095130971924595,3.2841666666667004,77,3431,0 +460,461,12419,0.069274987547704,3.205,79,2252,0 +461,462,12484,0.061118974117397,2.8436111111111004,59,2628,0 +462,463,12413,0.056393740750134,2.4441666666667,107,3266,0 +463,464,12440,0.06125086589409901,2.275,100,2620,0 +464,465,12614,0.047746883512707,2.1788888888889,84,2824,0 +465,466,12693,0.047136440673386,2.2083333333333,99,2801,0 +466,467,12989,0.0,2.2997222222222,103,3106,0 +467,468,13200,0.0,2.3155555555556004,47,3532,0 +468,469,13108,0.049828520132601,2.41,67,4210,0 +469,470,12886,0.0,2.5902777777778,65,3646,0 +470,471,13000,0.0,2.6636111111111,65,3768,0 +471,472,13071,0.043576825212604,2.7105555555556,70,5342,0 +472,473,13563,0.035173891965945,2.6811111111111,76,5327,0 +473,474,13333,0.04413510379665099,2.715,40,3363,0 +474,475,12672,0.016955671451488998,2.7083333333333,54,3016,0 +475,476,12547,0.1330396486107,3.0038888888889,45,3257,0 +476,477,12289,0.016462114132943,3.3911111111111003,32,2619,0 +477,478,12584,0.055696363369897,3.6375,26,2573,0 +478,479,12526,0.036411774365825,3.7755555555556,25,2575,0 +479,480,12416,0.047966724418057,3.5786111111111003,34,5355,0 +480,481,12450,0.05609961782665,3.4222222222222,43,5809,0 +481,482,12460,0.096990479781121,3.2538888888889,68,3823,0 +482,483,12425,0.11147038220964,3.1683333333333,60,3116,0 +483,484,12430,0.044797927381498,3.0677777777778,74,2321,0 +484,485,12418,0.024403519177111,2.94,68,2193,0 +485,486,12437,0.08532776818426499,2.7291666666667003,43,2982,0 +486,487,12484,0.043615168647623,2.4147222222222005,73,4140,0 +487,488,12380,0.056692005942856,2.1419444444444,72,2353,0 +488,489,12620,0.033708553131457,2.0244444444444,66,3350,0 +489,490,12674,0.040148453968243986,2.0458333333333,90,3184,0 +490,491,12855,0.099551526697496,2.09,104,3469,0 +491,492,13053,0.0,2.1575,114,4204,0 +492,493,12898,0.036157867549894,2.2655555555556,98,6447,0 +493,494,12809,0.052738784696875,2.2561111111111,70,4898,0 +494,495,12964,0.021636091422947,2.4669444444444,101,3633,0 +495,496,12956,0.037120220639643986,2.5277777777778,77,4189,0 +496,497,13625,0.034467327401996005,2.5266666666667,69,4012,0 
+497,498,13285,0.0,2.5438888888889,19,4009,0 +498,499,12715,0.096807019710259,2.6511111111111,47,4346,0 +499,500,12637,0.059601475230884,2.9711111111111004,38,2781,0 +500,501,12535,0.068431521141608,3.2288888888889,22,2811,0 +501,502,12512,0.09611085542804,3.505,20,2415,0 +502,503,12549,0.064177980162036,3.4944444444444,26,3589,0 +503,504,12567,0.11565746993409,3.4633333333333,24,2878,0 +504,505,12362,0.073501732487291,3.3177777777778,27,3471,0 +505,506,12326,0.072746100819649,3.1963888888889,25,2697,0 +506,507,12450,0.07557888002360401,3.1069444444444,57,2583,0 +507,508,12404,0.036816888038697,3.0172222222222,58,3173,0 +508,509,12362,0.093969235453559,2.9247222222222,81,3341,0 +509,510,12431,0.034848294186597004,2.5336111111111,81,2305,0 +510,511,12351,0.084191269180943,2.2480555555556,69,2186,0 +511,512,12528,0.13109036514766,2.0383333333333,50,4439,0 +512,513,12559,0.061132356147447,1.8852777777778,55,3173,0 +513,514,12586,0.019478099970089,1.9225,57,2831,0 +514,515,12864,0.0,1.9719444444444,78,16385,0 +515,516,13026,0.0,2.0608333333333,57,83955,0 +516,517,12880,0.017965204407153,2.16,78,4574,0 +517,518,12743,0.019202263481759,2.3077777777778,95,4987,0 +518,519,12812,0.0,2.415,88,5110,0 +519,520,12878,0.052306327013631,2.4669444444444,108,4893,0 +520,521,13427,0.08536575533023,2.5125,87,3807,0 +521,522,13081,0.052461360256699015,2.6294444444444,87,3447,0 +522,523,12752,0.035302992848671,2.8183333333333,44,4329,0 +523,524,12594,0.028682734942579,3.0547222222222,39,5166,0 +524,525,12507,0.024204462299365,3.33,27,3454,0 +525,526,12494,0.034360100307537,3.5738888888889,23,3578,0 +526,527,12487,0.018977302969238,3.6888888888889,11,2406,0 +527,528,12404,0.034308847257872,3.7111111111111,13,2073,0 +528,529,11147,0.07460088255490599,3.7180555555556,24,1925,0 +529,530,11147,0.055037935083209005,3.6041666666667,77,2357,0 +530,531,11128,0.039311673522385,3.4483333333333,54,1947,0 +531,532,11106,0.046619928266775,3.2413888888888995,45,1912,0 +532,533,11115,0.048227542028921,3.1355555555556,36,2107,0 +533,534,11044,0.020367863848114,2.8172222222222,59,2985,0 +534,535,11110,0.063069968046591,2.4275,81,2081,0 +535,536,11190,0.054470866056974016,2.2513888888889,50,2631,0 +536,537,11063,0.0,2.0691666666667,53,2130,0 +537,538,11078,0.059261864411046,2.0155555555556,44,2085,0 +538,539,11146,0.064174002348993,2.0952777777778,87,2211,0 +539,540,11010,0.0,2.2397222222222,94,2105,0 +540,541,11139,0.021912411214588,2.3275,128,2585,0 +541,542,11117,0.057958262002105985,2.5255555555556004,82,3695,0 +542,543,11081,0.035358633773416,2.665,49,3198,0 +543,544,11128,0.029191244440103,2.7975,79,3191,0 +544,545,11720,0.054981313823219,2.8597222222222,62,2016,0 +545,546,11384,0.06405347705857799,2.7983333333333,64,2124,0 +546,547,11018,0.0,2.9322222222222,34,2105,0 +547,548,11104,0.055445634363329,3.08,41,2031,0 +548,549,11084,0.040996998867197,3.3466666666667004,47,1964,0 +549,550,11106,0.027670189755404,3.6869444444444,31,2016,0 +550,551,11055,0.054579839310753,3.7966666666667,26,3909,0 +551,552,11098,0.044833640073299014,3.7805555555556,17,2105,0 +552,553,11028,0.03282297151413,3.7422222222222,30,2405,0 +553,554,11152,0.017696014614986,3.639166666666701,17,2141,0 +554,555,11025,0.09418709999244,3.4775,28,1910,0 +555,556,11015,0.061817529149429,3.3283333333333,20,1951,0 +556,557,11125,0.054000161367618,3.1702777777778,85,2310,0 +557,558,11035,0.06165600249599,2.7688888888889,52,2047,0 +558,559,11103,0.055915839259234,2.4266666666667,143,2048,0 +559,560,11100,0.062788330996733,2.1963888888889,106,3083,0 
+560,561,11170,0.044888048273534,2.135,244,3619,0 +561,562,11078,0.095259484956337,2.3186111111111,2005,2172,0 +562,563,11150,0.021952502374124,2.3383333333333,124,3142,0 +563,564,11149,0.0,2.5002777777778,109,2256,0 +564,565,10984,0.0,2.6527777777778,148,2200,0 +565,566,11034,0.0,2.7661111111111003,126,2183,0 +566,567,11050,0.061557079663167,2.7347222222222,46,2030,0 +567,568,11102,0.14186075040414,2.6069444444444,49,2297,0 +568,569,11743,0.0,2.5547222222222,40,2213,0 +569,570,11371,0.077457673524504,2.4716666666667004,39,4014,0 +570,571,11078,0.16422977329792998,2.6530555555556004,25,2809,0 +571,572,11224,0.049366067455729,2.9488888888889,37,2355,0 +572,573,11146,0.10064381631633,3.3383333333333,32,2372,0 +573,574,11199,0.11909159312806,3.5419444444444,47,2387,0 +574,575,11181,0.09003816676619801,5.3302777777778,34,2359,0 +575,576,11022,0.055882659245704,3.7727777777778,40,2485,0 +576,577,11073,0.1836893913223,3.6333333333333,46,3728,0 +577,578,11120,0.08574268253550299,3.5430555555556,35,2820,0 +578,579,11008,0.12559700716583,3.6711111111111,61,2426,0 +579,580,11078,0.086129850619071,3.4572222222222,56,2307,0 +580,581,11121,0.041752618326160014,3.2,72,2233,0 +581,582,11041,0.094396473652892,2.7772222222222,110,2178,0 +582,583,11168,0.045323960075285004,2.415,135,2243,0 +583,584,11213,0.13808411333909,2.2530555555556004,133,2713,0 +584,585,11238,0.08029349854683501,2.0994444444444,148,3168,0 +585,586,11273,0.06507307495461,2.1780555555556003,86,3163,0 +586,587,11479,0.084518021856329,2.2638888888889,132,3289,0 +587,588,11839,0.030507395540508,2.3575,73,4001,0 +588,589,11735,0.05892502921299701,2.4680555555556003,95,4684,0 +589,590,11574,0.0,2.6208333333333,74,4137,0 +590,591,11531,0.033075906123641,2.6863888888889,51,4787,0 +591,592,11420,0.16633704704670998,2.6172222222222,65,4278,0 +592,593,12301,0.10228536028167,2.6194444444444,95,3898,0 +593,594,11845,0.16949365549682996,2.6358333333333,72,3728,0 +594,595,11374,0.08260397756200501,2.8661111111111004,41,4047,0 +595,596,11370,0.024378363844868,3.0533333333333,38,3373,0 +596,597,11197,0.15686874147816002,3.4438888888889,32,2669,0 +597,598,11171,0.063929461148943,3.6552777777778,22,3289,0 +598,599,11197,0.12602019009982998,3.8519444444444,29,2556,0 +599,600,11114,0.035137191893634005,3.8069444444444,32,2557,0 +600,601,12564,0.14965728062748998,3.5961111111111004,40,3003,0 +601,602,12459,0.10046170077382,3.5344444444444,59,2441,0 +602,603,12508,0.13163105487926,3.3972222222222,52,2396,0 +603,604,12464,0.043899611017859004,3.3936111111111003,42,3426,0 +604,605,12438,0.19567092855859,3.1025,46,2379,0 +605,606,12449,0.19135011734275,2.8630555555556,97,3026,0 +606,607,12373,0.11171915024595,2.4255555555556003,72,2336,0 +607,608,12594,0.032053604746412,1.8619444444444,81,2850,0 +608,609,12623,0.096448361580655,1.8930555555556,81,3016,0 +609,610,12759,0.07934996156433399,2.2080555555556,70,3537,0 +610,611,12841,0.024581173073578,2.3052777777778,89,3899,0 +611,612,13063,0.025596039426134,2.3777777777777995,87,5044,0 +612,613,13023,0.027922074309281,2.5161111111111,125,4806,0 +613,614,12884,0.02593545023878,2.6411111111111,69,4139,0 +614,615,13007,0.033086949155743,2.8011111111111004,57,4776,0 +615,616,13016,0.047260069860172005,2.7236111111111003,99,4065,0 +616,617,13588,0.038487130166032016,2.6813888888889,111,4969,0 +617,618,13272,0.16080169828563,2.7336111111111,71,3784,0 +618,619,12589,0.12635270044885,2.8863888888889,71,3297,0 +619,620,12651,0.046904491868436,3.1225,48,3347,0 +620,621,12616,0.059534673085297,3.4613888888889,76,3170,0 
+621,622,12492,0.12198352023568,3.8297222222222,56,2241,0 +622,623,12497,0.052131597947042,3.8936111111111,35,2301,0 +623,624,12623,0.094084438832673,3.7588888888889,35,2303,0 +624,625,12481,0.13486764750848,3.5827777777778,29,2587,0 +625,626,12434,0.062226183256115,3.4730555555556,38,3211,0 +626,627,12495,0.091202035463034,3.4175,69,2604,0 +627,628,12375,0.096137859324631,3.3533333333333,77,2841,0 +628,629,12357,0.10449109200785,3.1963888888889,20,2168,0 +629,630,12433,0.097127966420289,2.8852777777778,24,2265,0 +630,631,12432,0.064404980330111,2.4880555555556003,83,2908,0 +631,632,12429,0.10188181868693,2.2325,62,3180,0 +632,633,12551,0.19953464365013,2.1044444444444,54,3118,0 +633,634,12799,0.0747839457206,2.1097222222222,54,3296,0 +634,635,12818,0.0,2.235,60,4432,0 +635,636,13071,0.0,2.3516666666667003,63,4336,0 +636,637,12897,0.0,2.5138888888889,95,4534,0 +637,638,12961,0.041436571087464,2.6105555555556004,69,4261,0 +638,639,12925,0.038671790863765,2.7233333333333,68,5248,0 +639,640,12968,0.035810634316102014,2.6633333333333,58,5014,0 +640,641,13525,0.1409929213297,2.5580555555556,107,3864,0 +641,642,12993,0.0,2.6627777777778,48,5682,0 +642,643,12369,0.052915080344848,2.7625,64,4404,0 +643,644,12195,0.11966022897483,3.0283333333333,52,3705,0 +644,645,12464,0.12973870706052,3.3727777777778,61,2738,0 +645,646,12470,0.023838633821411,3.6369444444444,47,2887,0 +646,647,12475,0.12358680271021,3.7088888888889,58,3776,0 +647,648,12482,0.089095336472172,3.5847222222222,51,3532,0 +648,649,12221,0.019762530636927,3.4836111111111,61,3724,0 +649,650,12325,0.020994992941051,3.4077777777778,53,2786,0 +650,651,12258,0.10380294658324002,3.4441666666667,55,2941,0 +651,652,11980,0.079228021087742,3.1683333333333,52,2351,0 +652,653,11947,0.039012779943635,3.0527777777778,89,2316,0 +653,654,12291,0.10658713601061,2.8527777777778,85,2350,0 +654,655,12293,0.14426278476756,2.5433333333333,106,2916,0 +655,656,12341,0.08706206992122,2.1997222222222,88,2437,0 +656,657,12390,0.16325946030154,2.1036111111111,59,2761,0 +657,658,12611,0.0,2.2133333333333,48,3941,0 +658,659,12737,0.0,2.2086111111111,66,4025,0 +659,660,12882,0.07729609083366701,2.2883333333333,95,4466,0 +660,661,12891,0.058100747891124,2.3222222222222,82,4401,0 +661,662,12756,0.061191523312340984,2.47,76,4747,0 +662,663,12875,0.08592375974441901,2.685,104,4051,0 +663,664,12847,0.033467197342519,2.6763888888889,54,4448,0 +664,665,13518,0.030265788895452006,2.5838888888889,43,3736,0 +665,666,13217,0.11950310860409,2.6130555555556003,39,3918,0 +666,667,12621,0.09169148327055697,2.7633333333333,48,3408,0 +667,668,12591,0.18439354827551,3.0708333333333,38,2883,0 +668,669,12332,0.10741924067542,3.4347222222222,45,3631,0 +669,670,12404,0.15862461647089002,3.7030555555556,64,2609,0 +670,671,12457,0.14957813136313,3.8138888888889,35,2533,0 +671,672,12370,0.24059408570531,3.8508333333333,66,2469,0 +672,673,11509,0.15511115210127,3.8961111111111,61,2458,0 +673,674,11433,0.19582462633148,3.4763888888889,58,2458,0 +674,675,11317,0.13981560037535998,3.4041666666667,51,2043,0 +675,676,11364,0.1392329990551,3.2352777777778,55,1985,0 +676,677,11350,0.13079770999921,3.1508333333333,126,2032,0 +677,678,11348,0.053672881218709015,2.7863888888888995,61,3409,0 +678,679,11365,0.10971373742228,2.4861111111111,94,2018,0 +679,680,11505,0.13825204927093,2.2444444444444,83,2461,0 +680,681,11468,0.13912778922607,2.1286111111111,136,2318,0 +681,682,11562,0.10215803640865,2.1261111111111,104,2787,0 +682,683,11858,0.096617489053804,2.2405555555556003,77,3186,0 
+683,684,11933,0.0,2.2991666666667,109,3490,0 +684,685,11813,0.0,2.3627777777778,146,3407,0 +685,686,11735,0.0,2.5863888888889,69,3193,0 +686,687,11848,0.0,2.7286111111111,121,3412,0 +687,688,11843,0.0,2.8355555555556,53,3563,0 +688,689,12318,0.068897518746959,2.7875,61,3247,0 +689,690,11846,0.05418569809170299,2.7825,82,3012,0 +690,691,11066,0.06507307495461,2.7972222222222,37,2382,0 +691,692,10920,0.10547682048851,3.0355555555556,19,2012,0 +692,693,10836,0.056437861708265,3.2486111111111,19,1915,0 +693,694,10879,0.098703711593837,3.6077777777778,19,1982,0 +694,695,10796,0.14331889652193,3.76,54,1950,0 +695,696,10785,0.05704449488642,3.806666666666701,44,4176,0 +696,697,9469,0.0,3.6638888888889,46,3654,0 +697,698,9278,0.032146952736052,3.5161111111111003,53,3063,0 +698,699,9417,0.068135614649249,3.3286111111111003,83,1916,0 +699,700,9253,0.034514299845882,3.2166666666667,92,1848,0 +700,701,9435,0.028306668795131006,2.9783333333333,94,1704,0 +701,702,9356,0.13119921991025002,2.7211111111111004,111,1680,0 +702,703,9354,0.093609772007723,2.4102777777778,84,2011,0 +703,704,9405,0.11179018663123,2.1366666666667,52,1772,0 +704,705,9326,0.065272680657868,1.9947222222222,68,1838,0 +705,706,9549,0.15901886092526998,1.9936111111111,35,1924,0 +706,707,9499,0.0,2.0788888888889,40,2038,0 +707,708,9371,0.26537507315217,2.1736111111111,47,1991,0 +708,709,9462,0.0,2.4027777777778,85,1729,0 +709,710,9509,0.056610336908172985,2.4580555555556,59,1673,0 +710,711,9469,0.026644044055307004,2.6102777777777995,61,1656,0 +711,712,9522,0.040819652463459,2.7597222222222,45,1774,0 +712,713,9885,0.13497701521251,2.8122222222222,47,1784,0 +713,714,9802,0.16853433621426,2.8427777777778,72,1818,0 +714,715,9461,0.08655557751574,2.87,69,1981,0 +715,716,9393,0.05741127788681901,2.9769444444444,17,2004,0 +716,717,9638,0.037244401880164,3.3241666666667005,47,1788,0 +717,718,9435,0.1132743034971,3.6375,37,1786,0 +718,719,9519,0.15690958465910998,3.8652777777778,57,1781,0 +719,720,9492,0.09604225449090803,3.8091666666667,62,2024,0 +720,721,9458,0.06746445682560599,3.6844444444444,72,1669,0 +721,722,9420,0.058373145210404015,3.5913888888889,43,1729,0 +722,723,9429,0.048008603166117006,3.5255555555556,57,1682,0 +723,724,9461,0.12614216994504,3.3277777777778,47,1714,0 +724,725,9404,0.077186121310215,3.07,61,1679,0 +725,726,9366,0.042879382350005,2.7622222222222,53,1739,0 +726,727,9488,0.031014262794497007,2.3872222222222,78,1669,0 +727,728,9515,0.13957171072647,2.1308333333333,100,1806,0 +728,729,9487,0.027108383258306,2.1563888888889,104,1650,0 +729,730,9497,0.0,2.2547222222222003,56,1751,0 +730,731,9516,0.0,2.3397222222222003,89,1685,0 +731,732,9504,0.0,2.4808333333333,108,1645,0 +732,733,9422,0.025265991419408,2.6208333333333,67,2133,0 +733,734,9543,0.0,2.8138888888889,83,1618,0 +734,735,9395,0.047219926720593,2.9275,90,1623,0 +735,736,9352,0.083109434319356,2.8663888888888995,82,1697,0 +736,737,9884,0.10860709298782,2.7794444444444,76,1684,0 +737,738,9820,0.098319718095083,2.8194444444444,34,1779,0 +738,739,9439,0.02201293380153,2.9458333333333,43,2982,0 +739,740,9560,0.064929719079082,3.2413888888888995,40,1848,0 +740,741,9589,0.036960535765785,3.7166666666667,40,1772,0 +741,742,9575,0.068536856116777,4.1333333333333,57,1841,0 +742,743,9541,0.012398281267649,4.2697222222222,60,1834,0 +743,744,9490,0.035305311833591015,4.2797222222222,53,1860,0 +744,745,7160,0.024153733176505,4.0,44,1647,0 +745,746,7233,0.031750779212929,3.8877777777778,48,2129,0 +746,747,7166,0.092612685693125,3.6633333333333,50,1763,0 
+747,748,7245,0.12674340154738,3.6127777777778,65,1433,0 +748,749,7299,0.068594711667718,3.3175,93,1428,0 +749,750,7169,0.13866540834682,2.8930555555556,105,1521,0 +750,751,7228,0.046813024390007014,2.4722222222222,94,1622,0 +751,752,7123,0.072990045810784,2.2294444444444,53,1580,0 +752,753,7199,0.17156759541908995,2.1286111111111,59,1468,0 +753,754,7167,0.051876699734571985,2.2219444444444,63,1520,0 +754,755,7212,0.031958698733103,2.3366666666667,61,1529,0 +755,756,7206,0.07333373485157901,2.4155555555556,72,1611,0 +756,757,7149,0.0,2.5408333333333,93,1511,0 +757,758,7284,0.023187512335638,2.6511111111111,62,1906,0 +758,759,7265,0.031672522871666,2.8405555555556,50,2632,0 +759,760,7221,0.091103855362214,2.8336111111111,42,1483,0 +760,761,7588,0.0,2.6575,62,1611,0 +761,762,7423,0.0983398607742,2.6622222222222005,21,1676,0 +762,763,7198,0.08011943311413,2.7719444444444,28,1670,0 +763,764,7279,0.043646436319699,3.0344444444444,65,1631,0 +764,765,7174,0.091445521226266,3.3741666666667003,37,1799,0 +765,766,7259,0.067771120773973,3.6925,20,1511,0 +766,767,7166,0.049768578185777006,3.8136111111111,47,1605,0 +767,768,7171,0.067455979006223,3.8202777777778,45,1758,0 +768,769,6883,0.14102875351082,3.7547222222222,49,1509,0 +769,770,6859,0.04521932948417,3.6077777777778,46,1591,0 +770,771,6817,0.032382889221133,3.5330555555556,30,1543,0 +771,772,6877,0.075100266089453,3.3544444444444,30,1573,0 +772,773,6785,0.038989846359505,3.1155555555556,48,1473,0 +773,774,6665,0.093396608626074,2.8463888888888995,36,1476,0 +774,775,6805,0.06797619687558401,2.4411111111111,46,1712,0 +775,776,6863,0.08326287339845401,2.1455555555556,27,1801,0 +776,777,6926,0.015112630017379,2.0025,79,1902,0 +777,778,7004,0.031549757127405,2.1247222222222,65,2005,0 +778,779,6950,0.0,2.2741666666667,57,2363,0 +779,780,7262,0.0,2.3272222222222005,61,2513,0 +780,781,7361,0.017214486216241002,2.4363888888889,89,2664,0 +781,782,7288,0.015541991667356,2.6155555555556003,80,2714,0 +782,783,7463,0.0,2.7272222222222,79,2754,0 +783,784,7188,0.027199843934104,2.6552777777778,113,2670,0 +784,785,7658,0.053744802378685,2.6086111111111,71,2584,0 +785,786,7575,0.05675511278546901,2.6025,53,2466,0 +786,787,6954,0.070873939193717,2.7372222222222,64,2137,0 +787,788,6862,0.19022950977106,3.0125,43,1931,0 +788,789,6896,0.17589540947937002,3.3477777777778,34,1743,0 +789,790,6954,0.022875979046571,3.6236111111111,29,1713,0 +790,791,6869,0.0,3.7383333333333,30,1649,0 +791,792,6890,0.13681403156951,3.7772222222222,24,1633,0 +792,793,9742,0.058507485759525,3.6966666666667,40,1993,0 +793,794,9730,0.10227075584148,3.7733333333333,32,1940,0 +794,795,9810,0.06726096113022301,3.6408333333333,39,1951,0 +795,796,9688,0.15267199916685995,3.3922222222222,67,1894,0 +796,797,9849,0.069818221889972,3.1627777777778,65,1801,0 +797,798,9765,0.030305771594539,2.6875,49,1962,0 +798,799,9812,0.09211700324247198,2.3533333333333,41,2123,0 +799,800,9931,0.12298177354813,2.0425,50,2434,0 +800,801,9908,0.08705722689013601,1.9738888888889,48,2402,0 +801,802,10066,0.07529920073678098,2.0425,59,3013,0 +802,803,10184,0.06217694957317299,2.1563888888889,51,3086,0 +803,804,10295,0.020886039183631,2.2866666666667004,43,3527,0 +804,805,10113,0.08148200392528,2.3919444444444,72,3716,0 +805,806,10218,0.027014133895137,2.5513888888889,52,3577,0 +806,807,10322,0.08271940630361399,2.6030555555556,68,3430,0 +807,808,10269,0.038537180887872,2.6647222222222005,74,3413,0 +808,809,10781,0.090543853269643,2.5930555555556003,46,3755,0 +809,810,10486,0.02593545023878,2.5513888888889,64,4806,0 
+810,811,10124,0.090692829340129,2.76,38,3127,0 +811,812,9993,0.09154630234853098,3.0636111111111,40,3421,0 +812,813,9801,0.09562635368432304,3.4016666666667,50,2475,0 +813,814,9760,0.0,3.7277777777778,42,2440,0 +814,815,9858,0.0,3.7902777777778,37,2731,0 +815,816,9884,0.027267039980187,3.7355555555556,34,2493,0 +816,817,7781,0.024102810048699,3.535,37,1665,0 +817,818,7742,0.072297652068167,3.5819444444444,47,1771,0 +818,819,7682,0.12348623922845,3.3847222222222,67,2293,0 +819,820,7831,0.077453588867077,3.2547222222222,66,1959,0 +820,821,7641,0.05662557916213299,3.125,91,1498,0 +821,822,7641,0.15509029304093,2.7766666666667,132,1537,0 +822,823,7759,0.079595064406905,2.4725,149,1580,0 +823,824,7748,0.053225613553497,2.1927777777778,65,1901,0 +824,825,7776,0.05741127788681901,2.1283333333333,50,1916,0 +825,826,7938,0.077171346852694,2.2319444444444,70,2213,0 +826,827,8031,0.0,2.3061111111111,82,2205,0 +827,828,8117,0.07512642149906099,2.3363888888889,72,2486,0 +828,829,8099,0.0,2.3686111111111,98,2580,0 +829,830,8002,0.0,2.4986111111111,78,2530,0 +830,831,7944,0.026463035590685,2.6433333333333,86,2664,0 +831,832,7963,0.024228588329879,2.7563888888889,76,4368,0 +832,833,8602,0.055182797357095005,2.6652777777778,95,3103,0 +833,834,8269,0.09607690135523,2.6844444444444,63,2249,0 +834,835,7871,0.059431847203259,2.7902777777778,32,2070,0 +835,836,7709,0.018731901987648,3.1119444444444,30,2833,0 +836,837,7726,0.033970515582906,3.5491666666667,27,1734,0 +837,838,7781,0.049963174087431,3.7102777777778,22,2151,0 +838,839,7762,0.073295374096872,3.7961111111111,19,2103,0 +839,840,7692,0.017715537831218996,3.7730555555556,32,1725,0 +840,841,6608,0.014656639469103996,3.5919444444444,45,1895,0 +841,842,6526,0.15513271231042,3.5580555555556,65,1959,0 +842,843,6531,0.06544162031760599,3.4588888888889,73,1637,0 +843,844,6483,0.12276447331552,3.2969444444444003,52,1658,0 +844,845,6602,0.054046416943085,3.2288888888889,93,1666,0 +845,846,6555,0.06827770027642299,2.7358333333333,68,2410,0 +846,847,6610,0.10171854295932,2.4636111111111,127,1787,0 +847,848,6690,0.093454285728882,2.1894444444444,105,2264,0 +848,849,6651,0.04318436192577,2.1227777777778,75,2007,0 +849,850,6759,0.10050707347524,2.1369444444444,77,2107,0 +850,851,6836,0.019571935182124,2.2230555555556,140,2355,0 +851,852,6894,0.0,2.3188888888889,132,2726,0 +852,853,6844,0.0,2.4166666666667003,100,2875,0 +853,854,6773,0.02713995635286,2.5777777777778,174,2780,0 +854,855,6802,0.092632629280125,2.7869444444444,82,3936,0 +855,856,6947,0.098676638207998,2.8586111111111,128,3116,0 +856,857,7248,0.0,3.0816666666667003,79,3770,0 +857,858,6885,0.11132365864914,2.8713888888889,71,2382,0 +858,859,6643,0.0947301899901,2.9386111111111,60,2152,0 +859,860,6560,0.061070711161473,2.9827777777778,60,1754,0 +860,861,6554,0.18477832073133,3.3197222222222,56,1783,0 +861,862,6600,0.055986690710270993,3.5961111111111004,78,1780,0 +862,863,6525,0.16264480046039995,3.7613888888889,60,1582,0 +863,864,6543,0.026215643469448,3.7305555555556,48,2271,0 +864,865,9018,0.0,3.5580555555556,48,2592,0 +865,866,9225,0.054655616583012,3.5136111111111004,42,2921,0 +866,867,9112,0.07076692500883701,3.3772222222222,64,1814,0 +867,868,9195,0.067217215228375,3.2402777777778,36,3219,0 +868,869,9206,0.046060828388587,3.0586111111111003,40,2567,0 +869,870,9224,0.08329795085471901,2.7908333333333,18,1899,0 +870,871,9408,0.08219020764935,2.3761111111111,35,1801,0 +871,872,9082,0.046792553198475,2.1347222222222,44,2005,0 +872,873,9168,0.06755714954154099,1.9991666666667,105,2572,0 
+873,874,9258,0.099050882008287,1.9983333333333,71,3563,0 +874,875,9158,0.0,2.0908333333333,65,2777,0 +875,876,9140,0.10824637351267,2.2311111111111,74,3362,0 +876,877,9206,0.0,2.3219444444444,34,3590,0 +877,878,9186,0.0,2.4727777777778,49,2930,0 +878,879,9155,0.037750185176735,2.5952777777778,44,2481,0 +879,880,9174,0.030345867660395,2.7416666666667004,57,2571,0 +880,881,9758,0.057665227298857,2.7652777777778,102,3546,0 +881,882,9451,0.16774071722374,2.7980555555556,106,4984,0 +882,883,9153,0.10462164884166,2.7597222222222,58,1994,0 +883,884,9233,0.051974117163582,3.0116666666667005,57,3060,0 +884,885,9250,0.070438547008222,3.2916666666667003,62,2151,0 +885,886,9317,0.11437533048244,3.5547222222222,42,2158,0 +886,887,9130,0.028754095353637,3.7580555555556,35,2319,0 +887,888,9249,0.06874265819680701,3.7330555555556,28,1909,0 +888,889,8297,0.041552255552731,3.5886111111111005,27,1627,0 +889,890,8245,0.033571347720577,3.5255555555556,35,2459,0 +890,891,8298,0.014724878652831,3.3858333333333,50,3167,0 +891,892,8247,0.046095580964192,3.2677777777778,69,1839,0 +892,893,8387,0.031859774913781,3.1247222222222,64,3887,0 +893,894,8392,0.094121536253424,2.7213888888888995,69,2031,0 +894,895,8531,0.11471874999036,2.3972222222222004,58,1522,0 +895,896,8437,0.09375530196425097,2.0836111111111,58,1732,0 +896,897,8344,0.10898948864079,2.0644444444444,51,2169,0 +897,898,8274,0.031129909255124,2.2063888888889,46,1679,0 +898,899,8328,0.0,2.3044444444444,84,1941,0 +899,900,8351,0.020155867044519,2.47,144,1638,0 +900,901,8380,0.016795241270985,2.5697222222222003,86,1725,0 +901,902,8332,0.0,2.7625,69,1903,0 +902,903,8366,0.0,2.9436111111111005,81,2074,0 +903,904,8357,0.01748186857624,2.7905555555556,175,1848,0 +904,905,8867,0.015638795432702,2.7527777777778,65,1761,0 +905,906,8659,0.037878946671491,2.6980555555556,48,1838,0 +906,907,8458,0.14870829462531002,2.9102777777778,33,1640,0 +907,908,8360,0.07322030784057597,3.2663888888889,35,1715,0 +908,909,8330,0.10504553292421,3.5372222222222,37,1717,0 +909,910,8298,0.10771048774666,3.86,31,1758,0 +910,911,8381,0.07484115005697,3.9216666666667,36,1975,0 +911,912,8393,0.10377526695926,3.8766666666667,30,1865,0 +912,913,3998,0.052336696506499,3.6463888888889,28,3575,0 +913,914,3733,0.039930389849144,3.6552777777778,24,1413,0 +914,915,3735,0.052659026600132,3.5880555555556,68,1414,0 +915,916,3709,0.071593754146172,3.3594444444444003,26,1170,0 +916,917,3755,0.072107773186609,3.1888888888889,78,1209,0 +917,918,3782,0.14407221323011,2.7575,90,1170,0 +918,919,3849,0.078873737285415,2.3936111111111,76,1328,0 +919,920,3801,0.090543853269643,2.1925,94,1258,0 +920,921,3787,0.0,2.16,70,1427,0 +921,922,3835,0.18229662394063,2.2719444444444,129,1480,0 +922,923,4035,0.10064381631633,2.3994444444444,120,1687,0 +923,924,4173,0.0,2.2836111111111,122,1942,0 +924,925,3995,0.0,2.5422222222222004,100,1967,0 +925,926,4016,0.0,2.6908333333333,102,2110,0 +926,927,4049,0.064661049677152,2.7702777777778,118,1956,0 +927,928,4014,0.10610212880951,2.7405555555556,86,1984,0 +928,929,4263,0.098345239553664,2.6908333333333,92,1893,0 +929,930,3941,0.055426072308289,2.7008333333333,44,1821,0 +930,931,4023,0.026036719363444,2.8322222222222,25,1641,0 +931,932,3917,0.058176601538018,3.0922222222222,54,1604,0 +932,933,3910,0.11644035456955,3.4363888888889,48,1265,0 +933,934,3934,0.067489738764642,3.7530555555556,56,1407,0 +934,935,3783,0.091155534540558,3.9127777777778,42,1342,0 +935,936,3834,0.052217414705359004,3.7608333333333,41,1216,0 +936,937,8698,0.028401045145692,3.6472222222222,32,2569,0 
+937,938,8969,0.06030991242653401,3.5544444444444,48,2150,0 +938,939,8928,0.057683225704233,3.5036111111111,40,2317,0 +939,940,9020,0.049602244305935,3.2538888888889,26,2047,0 +940,941,8865,0.054771618715138,3.1886111111111,55,2065,0 +941,942,8830,0.014455899164978,2.7341666666667,52,1909,0 +942,943,8879,0.05563571922395901,2.3655555555556003,34,1910,0 +943,944,9120,0.077488949885965,2.1688888888889,61,2037,0 +944,945,9111,0.06776025909838901,2.0977777777778,34,3065,0 +945,946,9071,0.033919453583666,2.3077777777778,50,2452,0 +946,947,9205,0.030948232299768,2.3611111111111,47,3226,0 +947,948,9355,0.0,2.4986111111111,56,3271,0 +948,949,9372,0.0,2.5691666666667,76,3471,0 +949,950,9392,0.0,2.7463888888889,60,3922,0 +950,951,9416,0.0,2.8063888888888995,100,3296,0 +951,952,9394,0.0,2.8091666666667003,80,3171,0 +952,953,9810,0.10150033578287,2.715,74,3208,0 +953,954,9594,0.13650296233629,2.6869444444444,24,3602,0 +954,955,9006,0.048341331534980006,2.8180555555556,41,3208,0 +955,956,9140,0.055919636698743,3.0541666666667004,19,3455,0 +956,957,8925,0.052826773889684014,3.4711111111111004,24,2833,0 +957,958,9047,0.07932984590431501,3.7566666666667,18,3453,0 +958,959,9030,0.033310879512461,3.8633333333333,28,3155,0 +959,960,9088,0.048306771033288,3.7519444444444,5,2145,0 +960,961,8569,0.034002578802562,3.6480555555556,12,1999,0 +961,962,8616,0.047801640470854015,3.5061111111111005,35,2135,0 +962,963,8497,0.13378075099383,3.47,41,1813,0 +963,964,8439,0.063853685461221,3.3086111111111003,30,2020,0 +964,965,8567,0.0,3.1194444444444,22,2127,0 +965,966,8694,0.073869151016554,2.8044444444444,56,1764,0 +966,967,8739,0.043582908466928014,2.4205555555556004,34,2249,0 +967,968,8761,0.0,2.1180555555556,73,3119,0 +968,969,8838,0.062006969698131,2.1266666666667,86,2031,0 +969,970,8908,0.14006961492891,2.1708333333333,68,2246,0 +970,971,9053,0.11198565566104,2.3247222222222,36,3214,0 +971,972,9346,0.0,2.4208333333333,66,4207,0 +972,973,8989,0.058427455554992985,2.5563888888889,74,4195,0 +973,974,8807,0.070887934206661,2.7086111111111,78,3179,0 +974,975,9020,0.031869233863638,2.8027777777778,66,2739,0 +975,976,9034,0.0,2.7711111111111,118,2394,0 +976,977,9558,0.055680379884383,2.74,81,3750,0 +977,978,9042,0.030919398857213,2.6869444444444,85,3000,0 +978,979,8804,0.040222150865381015,2.8113888888889,69,2646,0 +979,980,8885,0.08462727078727299,3.1258333333333,49,2375,0 +980,981,8721,0.15790637433488,3.4711111111111004,56,2442,0 +981,982,8676,0.099165571846447,3.7419444444444,64,2069,0 +982,983,9029,0.051043016646698,3.7258333333333,48,1899,0 +983,984,8670,0.023695834967821,3.5369444444444,65,2277,0 +984,985,8537,0.13363180896924,3.4911111111111004,53,1926,0 +985,986,8418,0.14375985835531,3.3769444444444,70,1949,0 +986,987,8481,0.13890523887057998,3.3327777777778,51,2222,0 +987,988,8535,0.096357518724471,3.1925,30,1797,0 +988,989,8535,0.098277544249084,3.135,97,1860,0 +989,990,8442,0.11251833989481,2.8338888888889,41,2870,0 +990,991,8448,0.074768662666532,2.4997222222222004,32,1899,0 +991,992,8527,0.038008655416852,2.2297222222222004,47,2336,0 +992,993,8541,0.016354174968753,2.1158333333333,34,2703,0 +993,994,8635,0.11898350916153,2.1966666666667,54,2773,0 +994,995,8867,0.0,2.2591666666667,69,2577,0 +995,996,9033,0.0,2.3002777777778,109,2816,0 +996,997,8875,0.0,2.3797222222222003,76,3133,0 +997,998,8708,0.0,2.625,47,3366,0 +998,999,8455,0.020636446066963,2.6661111111111,44,3062,0 +999,1000,8713,0.043044731483849,2.6694444444444,92,3003,0 +1000,1001,8934,0.12513578187909,2.6541666666667,67,3044,0 
+1001,1002,8745,0.099581351017555,2.6483333333333,26,3230,0 +1002,1003,8674,0.085903047711976,2.7444444444444,42,2793,0 +1003,1004,8606,0.066698820830796,3.0788888888889,69,1945,0 +1004,1005,8508,0.034228320502586,3.4833333333333,32,2716,0 +1005,1006,8558,0.028479870560763,3.6063888888889,41,2103,0 +1006,1007,8529,0.16430377699282994,3.8069444444444,52,1795,0 +1007,1008,8520,0.020290722486788003,3.6475,56,2840,0 +1008,1009,6662,0.17253761895951006,3.5219444444444,47,2653,0 +1009,1010,6491,0.1150267570489,3.3708333333333,65,2819,0 +1010,1011,6498,0.14119445755296,3.3086111111111003,70,1706,0 +1011,1012,6500,0.079900598296651,3.2411111111111004,84,1801,0 +1012,1013,6471,0.11459361685243,3.0525,71,3271,0 +1013,1014,6354,0.11299850955195,2.7419444444444,110,2001,0 +1014,1015,6592,0.078187238738118,2.4305555555556,65,1678,0 +1015,1016,6552,0.15222680511595002,2.1852777777778,68,1703,0 +1016,1017,6492,0.05823703723779,2.0644444444444,74,2441,0 +1017,1018,6577,0.038270957919533,2.1961111111111,43,2304,0 +1018,1019,6777,0.045436612403901,2.2886111111111,55,3124,0 +1019,1020,6844,0.051111263534218,2.3219444444444,53,3605,0 +1020,1021,6769,0.0,2.4436111111111,64,2985,0 +1021,1022,6642,0.0,2.6463888888889,58,2934,0 +1022,1023,6782,0.057248496594127986,2.735,54,3044,0 +1023,1024,6715,0.0,2.7586111111111005,121,3463,0 +1024,1025,6915,0.084808608043399,2.7138888888889,103,3199,0 +1025,1026,6569,0.05823703723779,2.7119444444444,66,2684,0 +1026,1027,6486,0.12640598881102005,2.8027777777778,73,3317,0 +1027,1028,6504,0.08602692657241201,2.9777777777778,71,2159,0 +1028,1029,6445,0.13712331887199,3.2961111111111,37,2043,0 +1029,1030,6427,0.12184008568979,3.4869444444444,46,2003,0 +1030,1031,6365,0.050317612906928,3.673611111111101,40,2260,0 +1031,1032,6277,0.07167380324199299,3.7469444444444,26,3522,0 +1032,1033,5231,0.051289858799957,3.6133333333333,42,1840,0 +1033,1034,5166,0.094021005766084,3.4752777777778,63,1820,0 +1034,1035,5303,0.020566298353792,3.3602777777778,68,1856,0 +1035,1036,5306,0.12275234276969,3.1605555555556,87,1715,0 +1036,1037,5298,0.1054190746845,3.0733333333333,60,1695,0 +1037,1038,5268,0.19050318144252,2.7130555555556,94,2254,0 +1038,1039,5251,0.10472332930133,2.2886111111111,121,1652,0 +1039,1040,5194,0.12644994481537,2.0783333333333,128,1602,0 +1040,1041,5230,0.08859454436104999,1.9188888888889,68,1792,0 +1041,1042,5244,0.0,1.9355555555556003,76,1954,0 +1042,1043,5102,0.09532581107230803,2.0569444444444,77,1808,0 +1043,1044,5244,0.15766772749983,2.1902777777778,158,1629,0 +1044,1045,5249,0.06429178708826701,2.3477777777778,112,2140,0 +1045,1046,5261,0.068395341911942,2.5502777777778,85,2390,0 +1046,1047,5339,0.025992957736547997,2.6597222222222,77,1707,0 +1047,1048,5241,0.0,2.7238888888888995,89,1901,0 +1048,1049,5491,0.021142167244918,2.7375,106,1820,0 +1049,1050,5374,0.072067861729848,2.7483333333333,47,2167,0 +1050,1051,5354,0.1275228688396,2.8525,34,2063,0 +1051,1052,5232,0.043846003986674,3.0038888888889,32,2184,0 +1052,1053,5217,0.10247450096434,3.2761111111111005,22,1981,0 +1053,1054,5258,0.07584150637714701,3.5761111111111004,16,1813,0 +1054,1055,5251,0.020496657705832,3.8172222222222,32,2033,0 +1055,1056,5223,0.13399493992192998,3.6691666666667,16,1629,0 +1056,1057,3952,0.091121163023619,3.5558333333333,20,1485,0 +1057,1058,3949,0.11809705541338,3.4266666666667,56,1527,0 +1058,1059,4021,0.033014047837867995,3.435,74,2561,0 +1059,1060,3815,0.16367597832104,3.2111111111111,116,1523,0 +1060,1061,3855,0.12469537397569,3.1297222222222,72,1446,0 
+1061,1062,3892,0.095002031789468,2.7538888888889,66,1499,0 +1062,1063,3948,0.1028064299952,2.3116666666667003,56,1368,0 +1063,1064,3860,0.028861851985229007,2.0988888888889,61,1426,0 +1064,1065,3830,0.05806984314166,2.0983333333333,2151,3528,0 +1065,1066,3821,0.050886592113012,2.1986111111111,459,2279,0 +1066,1067,3886,0.05081829754409599,2.3677777777778,84,1421,0 +1067,1068,3954,0.0,2.5036111111111,55,2008,0 +1068,1069,3839,0.08354288831032201,2.5786111111111,61,1429,0 +1069,1070,3921,0.0,2.8172222222222,19,1497,0 +1070,1071,3874,0.08142390858425297,2.8727777777778,30,1604,0 +1071,1072,3996,0.047911560407608,2.8294444444444,73,1595,0 +1072,1073,4246,0.12201534565884,2.7136111111111005,63,2217,0 +1073,1074,3803,0.088739417881303,2.7058333333333,35,1580,0 +1074,1075,3594,0.08276214539547999,2.8161111111111,57,1466,0 +1075,1076,3778,0.066779641097052,3.1541666666667,50,1717,0 +1076,1077,3745,0.11367082443275,3.5791666666667004,48,1564,0 +1077,1078,3747,0.021597223158314,3.8158333333333,40,1752,0 +1078,1079,3726,0.16874893592242002,3.9405555555556,36,1598,0 +1079,1080,3729,0.041971530556774,3.7294444444444,59,1842,0 +1080,1081,8513,0.042983941794881,3.6183333333333,14,3066,0 +1081,1082,8738,0.14500733624043,3.4911111111111004,16,2272,0 +1082,1083,8709,0.046727090031129015,3.4566666666667003,36,4344,0 +1083,1084,8601,0.032553617944112004,3.37,65,3242,0 +1084,1085,8719,0.040039251102491,3.1658333333333,80,2291,0 +1085,1086,8820,0.055153759101126985,2.7261111111111003,91,2240,0 +1086,1087,8674,0.05751181017711901,2.3533333333333,102,2012,0 +1087,1088,8859,0.041202889821452,2.1158333333333,85,2305,0 +1088,1089,8905,0.07854024449462599,2.0852777777778,69,2295,0 +1089,1090,8920,0.11628975245152,2.1422222222222,79,2370,0 +1090,1091,9062,0.087543035971238,2.3172222222222003,66,3066,0 +1091,1092,9139,0.0,2.3983333333333,47,3132,0 +1092,1093,8866,0.031151045483539,2.55,51,3006,0 +1093,1094,8997,0.0,2.7413888888888995,20,3101,0 +1094,1095,9122,0.029949950026121008,2.7636111111111004,62,3739,0 +1095,1096,9191,0.067297142748812,2.7002777777778,54,3933,0 +1096,1097,9795,0.08450527625030299,2.7247222222222,99,4537,0 +1097,1098,9255,0.049852109269358014,2.5866666666667,64,3856,0 +1098,1099,8924,0.094084438832673,2.8597222222222,66,2862,0 +1099,1100,9012,0.044896125591910994,3.1269444444444,49,2449,0 +1100,1101,9023,0.07328004196455701,3.5019444444444,73,2222,0 +1101,1102,8875,0.13104465124262998,3.778611111111101,47,2159,0 +1102,1103,8800,0.10394116672902,3.8727777777778,48,2486,0 +1103,1104,8785,0.033616505813902,3.704166666666701,35,3148,0 +1104,1105,8474,0.02672150953308,3.5533333333333,27,3207,0 +1105,1106,8412,0.082058799915824,3.4461111111111005,19,2057,0 +1106,1107,8491,0.05732182787355501,3.4341666666667003,37,2029,0 +1107,1108,8391,0.067005870534182,3.3141666666667,45,3127,0 +1108,1109,8216,0.13429243256821,3.0438888888889,45,2597,0 +1109,1110,8292,0.015094533525413,2.6791666666667004,32,2350,0 +1110,1111,8406,0.063949370932991,2.3202777777778,99,2364,0 +1111,1112,8509,0.094378811742462,2.0691666666667,71,2095,0 +1112,1113,8486,0.02139340711812,2.0091666666667,93,2978,0 +1113,1114,8616,0.0,2.1886111111111,78,2743,0 +1114,1115,8642,0.0,2.3088888888889,71,2668,0 +1115,1116,8823,0.0,2.3794444444444,91,3054,0 +1116,1117,8774,0.0,2.5994444444444,31,3733,0 +1117,1118,8810,0.0,2.7119444444444,35,4312,0 +1118,1119,8611,0.0,2.76,25,4112,0 +1119,1120,8798,0.10029435223064,2.6975,45,3541,0 +1120,1121,9179,0.0,2.5466666666667,33,3901,0 +1121,1122,9057,0.10365337249761998,2.6036111111111,34,4371,0 
+1122,1123,8633,0.12418226954696003,2.7927777777778,40,4099,0 +1123,1124,8517,0.0,2.9788888888889,17,3039,0 +1124,1125,8427,0.051166116772473,3.4080555555556,17,3197,0 +1125,1126,8615,0.040222150865381015,3.6813888888889,16,2346,0 +1126,1127,8690,0.17057206553854998,3.7983333333333,26,2285,0 +1127,1128,8438,0.12861588337799,3.6338888888889,19,2313,0 +1128,1129,10388,0.0,3.5111111111111004,30,3216,0 +1129,1130,10588,0.0,3.3613888888889,94,3860,0 +1130,1131,10533,0.14569364884757002,3.3072222222222,73,4781,0 +1131,1132,10397,0.18198813530019,3.2447222222222,59,2957,0 +1132,1133,10347,0.038073868368755,3.1152777777778,53,2171,0 +1133,1134,10405,0.11491272575332,2.6994444444444,56,2856,0 +1134,1135,10411,0.064841538076484,2.3497222222222005,70,2714,0 +1135,1136,10503,0.048708312546253,2.0619444444444,60,2602,0 +1136,1137,10598,0.11629780056153,2.0625,83,2331,0 +1137,1138,10692,0.07659916149791901,2.1905555555556004,265,3586,0 +1138,1139,10874,0.0,2.2588888888889,944,3363,0 +1139,1140,11043,0.043763623117499,2.3983333333333,36,3879,0 +1140,1141,11009,0.0,2.5536111111111,42,3556,0 +1141,1142,10818,0.041436571087464,2.7408333333333,23,4381,0 +1142,1143,10985,0.0,2.7375,75,4777,0 +1143,1144,10861,0.08191467409622599,2.7780555555556,68,4879,0 +1144,1145,12282,0.11084389924027,2.6225,23,3553,0 +1145,1146,11225,0.12510294083344,2.6386111111111,35,3177,0 +1146,1147,10775,0.10213470511717,2.7908333333333,38,2727,0 +1147,1148,10688,0.06332743445339299,3.0922222222222,69,2758,0 +1148,1149,10601,0.033666593475508995,3.4291666666667004,57,4124,0 +1149,1150,10634,0.057459020289436,3.6752777777778,58,3076,0 +1150,1151,10646,0.023008391787587,3.736111111111101,43,2291,0 +1151,1152,10562,0.037622360322278,3.5905555555556,65,2482,0 +1152,1153,10608,0.026766196308354,3.3872222222222,60,2537,0 +1153,1154,10618,0.13691041072327,3.3186111111111005,55,2434,0 +1154,1155,10636,0.024581173073578,3.2775,49,2608,0 +1155,1156,10583,0.050723618686514,3.1625,54,2614,0 +1156,1157,10613,0.038807415292018,3.1391666666667004,66,2904,0 +1157,1158,10603,0.10731539561588,2.7616666666667005,59,2204,0 +1158,1159,10601,0.13649131550296,2.4675,107,2326,0 +1159,1160,10757,0.11190990870167998,2.2166666666667,104,3002,0 +1160,1161,10815,0.17879123074031,2.1205555555556,100,3472,0 +1161,1162,10790,0.08728058888363299,2.2044444444444,133,3496,0 +1162,1163,11082,0.0,2.3147222222222004,65,3168,0 +1163,1164,11121,0.07099894663641,2.2416666666667004,152,4268,0 +1164,1165,10913,0.098617038600063,2.405,83,4350,0 +1165,1166,11004,0.0,2.5705555555556003,158,3555,0 +1166,1167,11135,0.10519721128315,2.7088888888889,145,4986,0 +1167,1168,10960,0.10928571467639,2.6913888888889,77,4576,0 +1168,1169,11686,0.14969099592127,2.6427777777778,13,4451,0 +1169,1170,11244,0.060122448878635,2.705,67,3627,0 +1170,1171,10931,0.068254139999346,2.8738888888889,25,3485,0 +1171,1172,10811,0.056987671819742985,3.0819444444444,27,3046,0 +1172,1173,10679,0.094667935014769,3.4491666666667005,23,2657,0 +1173,1174,10648,0.13287358772218,3.6275,28,2423,0 +1174,1175,10757,0.032507012295146,3.8027777777778,25,2374,0 +1175,1176,10706,0.14779741522058998,3.6436111111111,28,2493,0 +1176,1177,9077,0.10864900088005,3.4861111111111005,30,2495,0 +1177,1178,8836,0.12602969813907,3.3266666666667004,31,2189,0 +1178,1179,8971,0.07253718299881,3.1866666666667003,31,2214,0 +1179,1180,8972,0.31381296416887,3.2213888888888995,44,2374,0 +1180,1181,8903,0.2312064012582,3.0102777777778,27,3230,0 +1181,1182,8967,0.17687421373190998,2.6658333333333,36,2132,0 
+1182,1183,8962,0.022073721703464003,2.3902777777778,61,3042,0 +1183,1184,9044,0.11600086139073,2.1380555555556,64,2053,0 +1184,1185,8931,0.10418807549523,2.0161111111111,118,2349,0 +1185,1186,9028,0.040222150865381015,2.0641666666667,98,3381,0 +1186,1187,9240,0.06812462580532,2.1844444444444,76,3436,0 +1187,1188,9227,0.055328485037955,2.2822222222222,57,3280,0 +1188,1189,9227,0.027788383289499,2.4002777777777995,74,4357,0 +1189,1190,9125,0.0,2.5433333333333,72,4522,0 +1190,1191,9075,0.0,2.7469444444444,78,4094,0 +1191,1192,9117,0.035137191893634005,2.6872222222222,69,3296,0 +1192,1193,9562,0.035137191893634005,2.6980555555556,125,4129,0 +1193,1194,9305,0.11258759940039,2.7380555555556,157,3036,0 +1194,1195,8965,0.16105265701128,2.7858333333333,61,2628,0 +1195,1196,8862,0.15210502999287,3.0502777777778,12,2296,0 +1196,1197,8858,0.07673479360192201,3.2991666666667,16,2221,0 +1197,1198,8820,0.17013715283392,3.5533333333333,36,1991,0 +1198,1199,8876,0.1609412187274,3.6652777777778,27,2778,0 +1199,1200,8797,0.12008642730107,3.6116666666667,22,2511,0 +1200,1201,9074,0.045995324803682,3.5463888888889,22,2103,0 +1201,1202,9318,0.23802438276872,3.4013888888889,35,2111,0 +1202,1203,9286,0.18078076076243,3.245,67,2055,0 +1203,1204,9320,0.12741851179236,3.1644444444444,46,1930,0 +1204,1205,9280,0.08024661572906401,2.9361111111111,72,2456,0 +1205,1206,9333,0.32656213417732,2.6952777777778,96,2952,0 +1206,1207,9334,0.28639695711596,2.3702777777778,117,2147,0 +1207,1208,9337,0.083900984173012,2.0947222222222,113,2051,0 +1208,1209,9405,0.12853338721539,1.9538888888889,140,2281,0 +1209,1210,9263,0.032414228925828,1.9925,107,2102,0 +1210,1211,9326,0.08237281480963901,2.0363888888889,102,2062,0 +1211,1212,9421,0.0,2.1919444444444,85,2796,0 +1212,1213,9275,0.0,2.3211111111111,49,2005,0 +1213,1214,9323,0.0,2.4955555555556,69,2075,0 +1214,1215,9347,0.45868581620054,2.6980555555556,68,2058,1 +1215,1216,9333,0.1959092708736,2.7219444444444,104,2733,0 +1216,1217,9846,0.7871265862012701,2.725,111,2170,1 +1217,1218,9497,0.18267963393082,2.7816666666667,88,2282,0 +1218,1219,9383,0.26777755992147,2.7811111111111004,64,2178,0 +1219,1220,9300,0.30404676514833,2.955,29,2283,0 +1220,1221,9389,0.28226806095289003,3.3158333333333,32,2097,0 +1221,1222,9364,0.32093016819692,3.5669444444444003,29,2738,0 +1222,1223,9227,0.24793583772273,3.7419444444444,21,2678,0 +1223,1224,9309,0.27376916868294,3.6236111111111,33,2404,0 +1224,1225,6204,0.32069151905173,3.4416666666667,37,1497,0 +1225,1226,6048,0.16728853165162,3.4172222222222,57,1496,0 +1226,1227,5949,0.17244047836378998,3.3016666666667,72,1935,0 +1227,1228,5981,0.21356200193615,3.1963888888889,86,1521,0 +1228,1229,5897,0.08833993625230199,3.0641666666667,70,2879,0 +1229,1230,6038,0.20141526375625,2.735,63,1561,0 +1230,1231,6094,0.12271171189386,2.3288888888889,49,1381,0 +1231,1232,6022,0.15111333507662,2.0938888888889,81,1826,0 +1232,1233,6122,0.3688420983862,2.1338888888889,58,1896,0 +1233,1234,6034,0.15672074166098002,2.2247222222222005,70,2083,0 +1234,1235,6079,0.099476236793782,2.3308333333333,67,1792,0 +1235,1236,5998,0.18394691317126,2.3902777777778,70,3258,0 +1236,1237,6004,0.076264605227629,2.5819444444444,95,2265,0 +1237,1238,5908,0.058100747891124,2.6661111111111,100,2775,0 +1238,1239,6022,0.18015967729618,2.8258333333333,116,1545,0 +1239,1240,5981,0.059431847203259,2.7502777777778,123,1818,0 +1240,1241,6399,0.14870829462531002,2.6730555555556004,71,1481,0 +1241,1242,6119,0.09565694822541,2.7536111111111,65,1677,0 
+1242,1243,6114,0.16022629962173002,2.9677777777778,73,1858,0 +1243,1244,5915,0.4140256163498,3.37,53,1643,0 +1244,1245,6192,0.32447726333369004,3.5958333333333,79,1582,0 +1245,1246,6021,0.15394421357627,3.8144444444444,77,1611,0 +1246,1247,6060,0.060070368432038,3.8283333333333,59,1803,0 +1247,1248,7510,0.14236976564388,3.7030555555556,66,2121,0 +1248,1249,7560,0.12741851179236,3.5802777777778,54,2375,0 +1249,1250,7525,0.093634078744746,3.4197222222222,54,1866,0 +1250,1251,7483,0.13709947889982,3.4438888888889,89,2398,0 +1251,1252,7452,0.06298116794216299,3.3425,85,2577,0 +1252,1253,7512,0.13125017838571,3.1608333333333,96,1801,0 +1253,1254,7572,0.21161148728916,2.7413888888888995,149,1840,0 +1254,1255,7629,0.06783428261124,2.3808333333333,139,1985,0 +1255,1256,7529,0.20877561051189,2.12,90,2041,0 +1256,1257,7623,0.10394294206935002,2.1533333333333,68,2075,0 +1257,1258,7637,0.0,2.2569444444444,445,2564,0 +1258,1259,7921,0.076424293095548,2.3183333333333,100,2734,0 +1259,1260,7790,0.08809461878011901,2.3583333333333,138,3143,0 +1260,1261,7782,0.034280386319742985,2.5072222222222003,104,3119,0 +1261,1262,7829,0.039360296791109,2.5927777777778,82,3590,0 +1262,1263,7902,0.0,2.6894444444444,208,3893,0 +1263,1264,8039,0.038944065994356014,2.6291666666667,92,3264,0 +1264,1265,8350,0.18176011684739,2.6469444444444,53,3963,0 +1265,1266,8142,0.18521047165852,2.7461111111111003,65,2757,0 +1266,1267,7886,0.13079770999921,2.9363888888889,62,2306,0 +1267,1268,7743,0.13310058077443,3.2797222222222,73,2549,0 +1268,1269,7707,0.054750658073534006,3.5194444444444,84,2212,0 +1269,1270,7726,0.030588852697706,3.8130555555556,90,2286,0 +1270,1271,7717,0.12998124134227002,3.7941666666667,80,2979,0 +1271,1272,10331,0.09100057249197198,3.6086111111111,90,3158,0 +1272,1273,10515,0.19464543002904006,3.3858333333333,84,2645,0 +1273,1274,10415,0.22178651521516,3.3336111111111,34,3161,0 +1274,1275,10387,0.22983578430825,3.3116666666667003,67,4460,0 +1275,1276,10471,0.298229429356,3.2616666666667005,74,2630,0 +1276,1277,10385,0.12923377484588,3.0044444444444003,44,2593,0 +1277,1278,10439,0.19609416059774,2.6741666666667,64,2625,0 +1278,1279,10516,0.040518533819385014,2.3191666666667,70,4834,0 +1279,1280,10587,0.07099894663641,2.0597222222222,96,4056,0 +1280,1281,10586,0.07584150637714701,2.0547222222222,110,5713,0 +1281,1282,10684,0.08180100127782801,2.1511111111111,68,3940,0 +1282,1283,10880,0.0,2.2602777777778,90,4414,0 +1283,1284,10830,0.0,2.2883333333333,90,5044,0 +1284,1285,10794,0.09140162014739303,2.3736111111111,69,3894,0 +1285,1286,10843,0.0,2.5869444444444,46,3993,0 +1286,1287,10805,0.0,2.6480555555556,74,4404,0 +1287,1288,10996,0.0,2.6077777777777995,68,4072,0 +1288,1289,11327,0.05363316840061,2.6069444444444,67,4182,0 +1289,1290,11090,0.26818151064716,2.6908333333333,51,3351,0 +1290,1291,10578,0.21887772653901,2.9019444444444003,39,4183,0 +1291,1292,10528,0.32371296573811,3.2711111111111,26,4068,0 +1292,1293,10475,0.12565805017257,3.5872222222222,25,8139,0 +1293,1294,10664,0.092277247744574,3.6913888888889,32,11000,0 +1294,1295,10513,0.077016875742983,3.6313888888889,17,2975,0 +1295,1296,9072,0.3714480797312501,3.5605555555556,19,2692,0 +1296,1297,9069,0.19332372237792,3.4402777777778,16,2502,0 +1297,1298,9089,0.06345811641554701,3.35,28,2510,0 +1298,1299,9027,0.2267121559473,3.3469444444444,24,2663,0 +1299,1300,8969,0.053072279964629,3.2708333333333,35,3575,0 +1300,1301,9073,0.13336345197744,3.2519444444444,49,2586,0 +1301,1302,8957,0.1252855094715,2.7311111111111,106,2908,0 
+1302,1303,9126,0.096211952864224,2.3875,80,3530,0 +1303,1304,9122,0.096524467517755,2.0847222222222,90,2776,0 +1304,1305,9231,0.08924770147957402,2.0975,169,2962,0 +1305,1306,9368,0.11889606284162,2.1763888888889,98,3441,0 +1306,1307,9458,0.031429841710104,2.2327777777777995,92,4376,0 +1307,1308,9463,0.0,2.2725,91,3857,0 +1308,1309,9356,0.036512411627868,2.3202777777778,99,4685,0 +1309,1310,9340,0.0,2.5425,90,4585,0 +1310,1311,9340,0.0,2.5986111111111,126,3542,0 +1311,1312,9276,0.0,2.6319444444444,102,3370,0 +1312,1313,9611,0.10106696361212,2.5836111111111,132,3515,0 +1313,1314,9532,0.14854949043035,2.675,88,3793,0 +1314,1315,9156,0.08612162048398897,2.8522222222222,135,2954,0 +1315,1316,9222,0.16494200410492002,3.1302777777778,114,2627,0 +1316,1317,9282,0.28637713141253,3.4805555555556,35,2550,0 +1317,1318,9573,0.13206535647488,3.5994444444444,24,2480,0 +1318,1319,9333,0.27364025607799,3.5847222222222,44,2521,0 +1319,1320,9987,0.38382339961227,3.4963888888889,26,2860,0 +1320,1321,10133,0.08426242877623301,3.3825,37,3675,0 +1321,1322,10010,0.3290413568025901,3.2694444444444,45,2704,0 +1322,1323,10028,0.22632868808708,3.2322222222222,42,3121,0 +1323,1324,9984,0.17914189971361,3.1936111111111005,47,2603,0 +1324,1325,10041,0.30046815361859003,3.0536111111111004,34,3984,0 +1325,1326,10072,0.22650915594248,2.7819444444444,56,2537,0 +1326,1327,10025,0.0,2.4152777777778,87,3349,0 +1327,1328,10116,0.1223093269317,2.1569444444444,74,3958,0 +1328,1329,10232,0.1696074188221,2.1125,90,4243,0 +1329,1330,10516,0.0,2.1833333333333003,79,4159,0 +1330,1331,10449,0.028193633007367,2.205,97,5637,0 +1331,1332,10598,0.0,2.1697222222222,90,8142,0 +1332,1333,10337,0.0,2.3075,77,5713,0 +1333,1334,10469,0.097305232437507,2.4575,101,3668,0 +1334,1335,10426,0.11905908868379,2.6077777777777995,74,4307,0 +1335,1336,10531,0.11660374103282,2.6275,439,4354,0 +1336,1337,10875,0.060474297756584014,2.6144444444444,79,4262,0 +1337,1338,10494,0.22568442027805,2.6477777777777995,165,3446,0 +1338,1339,10195,0.14077736537045002,2.8594444444444003,139,2677,0 +1339,1340,9918,0.1924574892026,3.2675,56,4450,0 +1340,1341,9889,0.18922597300629,3.5136111111111004,102,3044,0 +1341,1342,9947,0.041593949118095004,3.5725,101,3428,0 +1342,1343,9977,0.2502095174271,3.6863888888889,41,2845,0 +1343,1344,10835,0.18663972932643,3.5636111111111,94,2781,0 +1344,1345,10765,0.07351854082400297,3.4127777777778,116,2743,0 +1345,1346,10656,0.081949111399618,3.295,94,4470,0 +1346,1347,10485,0.20148511394009,3.2666666666667004,89,2596,0 +1347,1348,10681,0.11515101921294,3.1933333333333,141,3249,0 +1348,1349,10852,0.07797276382811,3.0688888888889,167,2529,0 +1349,1350,10728,0.07244862879413201,2.8102777777778,148,2452,0 +1350,1351,10874,0.07310929970435699,2.42,105,2934,0 +1351,1352,10964,0.066868365737218,2.1358333333333,210,3159,0 +1352,1353,10984,0.05788512501593701,1.9916666666667,145,3974,0 +1353,1354,11055,0.09727414207464803,2.0947222222222,136,4305,0 +1354,1355,11233,0.033270317741558,2.1591666666667,126,5012,0 +1355,1356,11161,0.0,2.2377777777778,157,4455,0 +1356,1357,10966,0.038270957919533,2.2511111111111,105,4108,0 +1357,1358,11193,0.08728058888363299,2.4208333333333,114,4339,0 +1358,1359,11167,0.10536774813238,2.5241666666667,104,5056,0 +1359,1360,11367,0.1233991317089,2.5794444444444,69,5573,0 +1360,1361,51251,0.042565915766552,2.5936111111111,75,3366,1 +1361,1362,17953,0.23147422367229,2.6830555555556,73,2559,1 +1362,1363,170029,0.08983405162538903,2.8188888888889,74,1999,1 +1363,1364,10955,0.07464756469365201,2.9513888888888995,126,1993,0 
+1364,1365,10984,0.099244104918934,3.2830555555556,67,1913,0 +1365,1366,10964,0.11535172009194,3.4819444444444,32,1760,0 +1366,1367,10980,0.21774881707852,3.5886111111111005,38,1890,0 +1367,1368,10852,0.1305066423559,3.4836111111111,34,2469,0 +1368,1369,10786,0.10054853030204,3.3955555555556,36,2133,0 +1369,1370,10841,0.02468393737575,3.2847222222222,26,3359,0 +1370,1371,10762,0.10018007414459,3.2383333333332995,74,3783,0 +1371,1372,10419,0.12522619841308,3.2188888888889,85,1809,0 +1372,1373,10467,0.11781887197077,2.9483333333333,67,2143,0 +1373,1374,10502,0.13417256350298,2.5855555555556,84,2567,0 +1374,1375,10519,0.07474686582090599,2.3005555555556003,1630,2176,0 +1375,1376,10579,0.13570963056519,2.0855555555556,1435,1929,0 +1376,1377,10502,0.076431907457478,1.9027777777778,857,2244,0 +1377,1378,10661,0.0,1.9411111111111,31,1810,0 +1378,1379,10818,0.1936428046839,2.0444444444444,500,2088,0 +1379,1380,10918,0.052826773889684014,2.1363888888889,53,2371,0 +1380,1381,10871,0.0,2.22,61,1843,0 +1381,1382,10796,0.054466597481213,2.3530555555556,158,2668,0 +1382,1383,10774,0.057459020289436,2.545,184,2309,0 +1383,1384,10898,0.28750562005936,2.6202777777778,91,1998,0 +1384,1385,11442,0.075538554674309,2.6847222222222,60,2480,0 +1385,1386,11113,0.08112608570492501,2.6591666666667004,107,2147,0 +1386,1387,10888,0.21563803296368,2.7863888888888995,5157,1802,0 +1387,1388,10894,0.095725002305685,3.0269444444444003,28,1789,0 +1388,1389,10888,0.17516056892320994,3.3227777777778,24,1999,0 +1389,1390,10896,0.32902836018586,3.6097222222222,21,2142,0 +1390,1391,10800,0.10216065221678,3.6805555555556,12,1904,0 +1391,1392,11000,0.19741931250852,3.6075,24,1876,0 +1392,1393,10985,0.10149107903671,3.4091666666667004,17,2434,0 +1393,1394,11017,0.17479255893624,3.3666666666667004,48,2472,0 +1394,1395,10863,0.034385029573777,3.3158333333333,41,1744,0 +1395,1396,10875,0.21988771218053,3.1622222222222,1088,2404,0 +1396,1397,10987,0.10149107903671,3.1086111111111,68,1971,0 +1397,1398,10778,0.10269981175445,2.6552777777778,2575,1713,0 +1398,1399,10957,0.11258759940039,2.2730555555556,4688,1765,0 +1399,1400,10832,0.13022351806001,2.0591666666667,477,3156,0 diff --git a/datasets/anomaly_reserve/yahoo_sub_5/yahoo_sub_5_problem/dataSplits.csv b/datasets/anomaly_reserve/yahoo_sub_5/yahoo_sub_5_problem/dataSplits.csv new file mode 100644 index 0000000..c07dc45 --- /dev/null +++ b/datasets/anomaly_reserve/yahoo_sub_5/yahoo_sub_5_problem/dataSplits.csv @@ -0,0 +1,1261 @@ +d3mIndex,timestamp,value_0,value_1,value_2,value_3,value_4,ground_truth +0,1,12183,0.0,3.7166666666667,5,2109,0 +1,2,12715,0.091757964510557,3.6108333333333,60,3229,0 +2,3,12736,0.17229675238449998,3.4813888888889,88,3637,0 +3,4,12716,0.22621935431999,3.3802777777778,84,1982,0 +4,5,12739,0.17635798469946,3.1933333333333,111,2751,0 +5,6,12737,0.090491245476051,2.7866666666667004,112,2128,0 +6,7,12857,0.08460994072769001,2.4627777777777995,1235,2109,0 +7,8,12884,0.06842699169496,2.2541666666667,710,2328,0 +8,9,12894,0.13330269689422,2.1180555555556,618,2453,0 +9,10,12675,0.085026586189321,2.0691666666667,84,2847,0 +10,11,13260,0.097073068447328,2.1972222222222,100,3659,0 +11,12,13470,0.0,2.3188888888889,125,5207,0 +12,13,13060,0.031063767542922,2.34,114,5146,0 +13,14,12949,0.017732750501525,2.4902777777778,145,4712,0 +14,15,13035,0.063354504072079,2.6438888888889,91,6363,0 +15,16,12980,0.087870391896335,2.8486111111111003,94,5010,0 +16,17,13677,0.11546815687729,2.8833333333333,79,3956,0 +17,18,13381,0.073413457727404,2.8808333333333,50,4063,0 
+18,19,12737,0.040392584616896,2.9005555555556,39,3748,0 +19,20,12554,0.08911335594722301,3.0855555555556,28,3047,0 +20,21,12470,0.098030053711531,3.3536111111111,29,4099,0 +21,22,12490,0.047140641497552,3.7438888888889,24,2122,0 +22,23,12539,0.10481279080241,3.7947222222222,19,3387,0 +23,24,12530,0.20478886838928,3.801111111111101,21,1950,0 +24,25,13002,0.04485100631921201,3.6508333333333,27,2927,0 +25,26,12989,0.1053622140254,3.555,46,1889,0 +26,27,13038,0.08436887679639,3.4769444444444,133,1910,0 +27,28,13011,0.097980673762982,3.2158333333333,143,3747,0 +28,29,12984,0.10165726215275,3.1141666666667,86,4994,0 +29,30,13079,0.056764513454874,2.7983333333333,118,2009,0 +30,31,13048,0.074428708878932,2.4252777777778,56,2899,0 +31,32,13096,0.091244453451818,2.14,92,2298,0 +32,33,13003,0.094529332881679,1.9822222222222,85,1894,0 +33,34,13057,0.016638011234698,1.9694444444444,122,1999,0 +34,35,13023,0.038096861957006005,2.0741666666667,74,3007,0 +35,36,13033,0.064497814457643,2.2505555555556,84,2838,0 +36,37,13034,0.030426401876334,2.2819444444444,54,4113,0 +37,38,13068,0.095423209955973,2.4216666666667,77,2150,0 +38,39,13057,0.069688744272108,2.5997222222222005,84,3007,0 +39,40,13047,0.03468622413034,2.7544444444444003,139,2484,0 +40,41,13795,0.089564461084836,2.7258333333333,65,2101,0 +41,42,13528,0.07337616196456799,2.8302777777778,38,2001,0 +42,43,13032,0.061939295606039,2.9422222222222,35,2102,0 +43,44,13084,0.11419089175512,3.0919444444444,47,2129,0 +44,45,13000,0.10475925920163,3.3519444444444,37,4422,0 +45,46,13008,0.079657960399444,3.6952777777778,53,4573,0 +46,47,12978,0.14475546275416,3.8269444444444,55,1989,0 +47,48,13067,0.1421711341096,3.7877777777778,45,1953,0 +48,49,13086,0.07696963969656899,3.7536111111111,46,1872,0 +49,50,13023,0.06393273436444799,3.61,35,1850,0 +50,51,13046,0.14973281021845006,3.5091666666667,68,2879,0 +51,52,13032,0.041478839355346,3.4205555555556,82,1840,0 +52,53,13012,0.089317973365284,3.2647222222222,154,2134,0 +53,54,13051,0.088820248166203,2.7944444444444,128,2234,0 +54,55,12979,0.054872994406929,2.46,79,3769,0 +55,56,13025,0.07913553329046401,2.2075,66,2717,0 +56,57,13007,0.16317996709063,2.1758333333333,92,2171,0 +57,58,13036,0.08671926699280201,2.3058333333333,67,2224,0 +58,59,13043,0.0733999511789,2.3983333333333,58,1967,0 +59,60,13023,0.0,2.55,58,2148,0 +60,61,13022,0.032756244361869,2.7302777777778,63,1978,0 +61,62,13033,0.054893891024455,2.8169444444444003,61,2021,0 +62,63,13024,0.068514114108229,2.9247222222222,55,2060,0 +63,64,13048,0.05279414163165401,2.8911111111111003,71,2096,0 +64,65,13740,0.023853017353212,2.9575,64,2082,0 +65,66,13540,0.07426125441559799,2.9080555555556,92,2175,0 +66,67,12724,0.024228588329879,3.0088888888889,44,2332,0 +67,68,13070,0.09233413002519697,3.2033333333333,35,2147,0 +68,69,13106,0.15930655332113,3.6213888888889,53,2163,0 +69,70,13025,0.12755838225296,4.0322222222222,49,2406,0 +70,71,13074,0.10152541717054,4.1227777777778,49,2022,0 +71,72,13079,0.040148453968243986,3.9736111111111,103,2188,0 +72,73,13184,0.087208372094752,3.8425,107,2758,0 +73,74,13194,0.074209918996797,3.7097222222222,74,2925,0 +74,75,13191,0.059044537369404015,3.6258333333333,56,3223,0 +75,76,13059,0.06248169832921499,3.4705555555556,60,2507,0 +76,77,13169,0.08876527685714597,3.2877777777778,73,2435,0 +77,78,13114,0.051354431854972,2.9286111111111004,99,2552,0 +78,79,13037,0.074790104163639,2.4888888888889,84,2540,0 +79,80,13179,0.091817341555971,2.2744444444444,129,2642,0 +80,81,13152,0.14762794333026005,2.1733333333333,101,2254,0 
+81,82,13095,0.07101004447510299,2.3416666666667,101,2539,0 +82,83,13144,0.07689756334240598,2.3808333333333,51,2596,0 +83,84,13170,0.08412575787388403,2.4663888888889,95,2573,0 +84,85,13162,0.06328921386603299,2.6608333333333,48,2302,0 +85,86,13117,0.057393902128707,2.7558333333333,40,2991,0 +86,87,13129,0.041819399065704,2.8636111111111004,55,3141,0 +87,88,13386,0.073729686380986,2.7586111111111005,56,3285,0 +88,89,13929,0.15365285617975,2.7377777777778,935,3807,0 +89,90,13385,0.060355859742407016,2.6961111111111005,34,2892,0 +90,91,13106,0.10644586288975,2.8569444444444,57,2538,0 +91,92,13113,0.059314286360126985,3.1833333333333,70,2234,0 +92,93,13155,0.096293806236591,3.5544444444444,72,2707,0 +93,94,13186,0.085101425467407,3.8894444444444,66,2382,0 +94,95,13151,0.11149072274185,4.1138888888889,72,2426,0 +95,96,13156,0.076266981262989,3.9519444444444,49,2451,0 +96,97,12813,0.097952120177625,3.8275,41,2288,0 +97,98,12821,0.17250021935572,3.6438888888889,42,2256,0 +98,99,12867,0.11389182319254,3.5608333333333,39,2884,0 +99,100,12837,0.08999961787521,3.5013888888889,81,2398,0 +100,101,12911,0.048649372449385005,3.3088888888889,90,2239,0 +101,102,12842,0.13861764684085998,2.9063888888889,92,2248,0 +102,103,12905,0.1088795585287,2.5027777777777995,81,2387,0 +103,104,12993,0.054235162564995,2.2466666666667003,145,3876,0 +104,105,12974,0.0390040506742,2.1869444444444,47,3073,0 +105,106,13039,0.0744713077811,2.2402777777778,63,3113,0 +106,107,13322,0.040258943675435,2.3727777777778,118,3363,0 +107,108,13606,0.0,2.4566666666667003,56,3796,0 +108,109,13536,0.027955712584728,2.5452777777777995,127,4924,0 +109,110,13341,0.047309968420241,2.6830555555556,48,4300,0 +110,111,13360,0.016602764360002,2.805,114,5225,0 +111,112,13450,0.042432577628353986,2.7386111111111004,78,4047,0 +112,113,14102,0.051191743726563,2.7438888888888995,58,4134,0 +113,114,14026,0.0,2.7586111111111005,56,4786,0 +114,115,13162,0.056724832354639,2.9013888888889,67,4184,0 +115,116,13118,0.055771058827737,3.19,155,2888,0 +116,117,12953,0.081014772096658,3.5561111111111003,123,2674,0 +117,118,12854,0.08253629738290899,3.8433333333333,118,2574,0 +118,119,12952,0.11499203730886,4.0319444444444,133,3123,0 +119,120,12915,0.07668513845109799,3.8844444444444,75,3369,0 +120,121,11994,0.070057457403873,3.6908333333333,29,3284,0 +121,122,11868,0.07031477357556501,3.6141666666667,68,2127,0 +122,123,11977,0.091946448716499,3.5019444444444,91,2117,0 +123,124,11874,0.14560588482235998,3.4205555555556,101,2271,0 +124,125,11913,0.094774329323472,3.1780555555556,22,2513,0 +125,126,11933,0.10217989327054,2.8361111111111,20,2746,0 +126,127,11844,0.04854243074027901,2.5222222222222004,27,2076,0 +127,128,11968,0.068760549683423,2.2416666666667004,45,2297,0 +128,129,11996,0.075440683881139,2.1588888888889,42,2312,0 +129,130,12006,0.11771339431815,2.2763888888889,59,2834,0 +130,131,12225,0.069437397660265,2.3391666666667,52,3584,0 +131,132,12482,0.0,2.4841666666667,62,4009,0 +132,133,12289,0.0,2.4911111111111,81,4142,0 +133,134,12219,0.0,2.6922222222222,84,3876,0 +134,135,12282,0.027395404320488,2.8205555555556,104,4098,0 +135,136,12367,0.055202605299814,2.8216666666667,111,3831,0 +136,137,13042,0.078387348178452,2.7122222222222,91,3842,0 +137,138,12665,0.11851571646444,2.6744444444444,33,4129,0 +138,139,12133,0.068395341911942,2.8097222222222,26,3509,0 +139,140,12023,0.04720597158087901,3.1838888888889,37,2450,0 +140,141,11847,0.07910648512645599,3.5130555555556,23,2270,0 +141,142,11980,0.067550601916344,3.7722222222222,29,2360,0 
+142,143,12026,0.080666570182724,3.9058333333333,45,2431,0 +143,144,11852,0.044973875852863,3.7697222222222,49,2042,0 +144,145,12152,0.065734580284861,3.6027777777778,27,1833,0 +145,146,12148,0.068759646748575,3.5038888888889,46,1852,0 +146,147,12236,0.027278224398313,3.445,39,1927,0 +147,148,12155,0.067695565422881,3.3494444444444,72,1999,0 +148,149,12113,0.07244669924777,3.1961111111111005,81,2030,0 +149,150,12175,0.028882930937168,2.8905555555556,64,1963,0 +150,151,12103,0.021568136982842,2.5805555555556,79,2116,0 +151,152,12206,0.064254625408408,2.3380555555556004,132,2461,0 +152,153,12239,0.073869151016554,2.2116666666667,127,2388,0 +153,154,12398,0.026644044055307004,2.2013888888889,121,2846,0 +154,155,12582,0.051289858799957,2.3236111111111,98,2974,0 +155,156,12705,0.099217337562612,2.3002777777778,128,3776,0 +156,157,12555,0.016615805334675,2.385,158,3885,0 +157,158,12476,0.078387348178452,2.5597222222222005,78,3865,0 +158,159,12706,0.0,2.6941666666667,65,4319,0 +159,160,12671,0.049384244324413,2.7169444444444,81,4646,0 +160,161,13277,0.043044731483849,2.6369444444444,586,3873,0 +161,162,12757,0.04215504851616,2.6572222222222,48,3489,0 +162,163,12401,0.042236538352835,2.8466666666667004,38,2790,0 +163,164,12248,0.1001564296112,3.1955555555556,30,2641,0 +164,165,12156,0.17378132267942994,3.5633333333333,28,2960,0 +165,166,12210,0.12005519462968,3.8113888888889,36,2192,0 +166,167,11983,0.14491137762023998,3.9655555555556,50,2145,0 +167,168,12374,0.07336941078506799,3.8483333333333,47,2133,0 +168,169,12230,0.12395626148952,3.6441666666667,82,2330,0 +169,170,12200,0.15077430423660998,3.5213888888889,56,2235,0 +170,171,12135,0.18960071033689,3.4702777777778,140,2258,0 +171,172,12131,0.06051348935254,3.3033333333333,145,2200,0 +172,173,12165,0.072057993662839,3.1933333333333,114,2161,0 +173,174,12193,0.082361078437032,2.8183333333333,129,2159,0 +174,175,12165,0.12343775199876,2.52,143,2088,0 +175,176,12304,0.1071817784483,2.2886111111111,113,2473,0 +176,177,12275,0.10359394556779,2.0822222222222,108,3217,0 +177,178,12369,0.021162435488903,2.1416666666667,93,2994,0 +178,179,12569,0.074524398314698,2.2688888888889,63,3827,0 +179,180,12766,0.12687067454443,2.335,103,4176,0 +180,181,12621,0.041752618326160014,2.4388888888889,114,4227,0 +181,182,12611,0.0,2.5386111111111,67,4290,0 +182,183,12618,0.040819652463459,2.6288888888889,106,4691,0 +183,184,12631,0.082668981599835,2.7511111111111,160,4442,0 +184,185,13121,0.06181362481077901,2.7744444444444,81,5775,0 +185,186,12871,0.0,2.8297222222222,113,3840,0 +186,187,12252,0.076137992226715,2.9708333333333,37,3721,0 +187,188,12155,0.12107639529965,3.1333333333333,70,2498,0 +188,189,12186,0.0,3.3544444444444,82,2265,0 +189,190,12179,0.19840339729984,3.6780555555556,76,2451,0 +190,191,12109,0.20112394005693,3.8038888888889,59,2892,0 +191,192,12142,0.096833471661634,3.8177777777778,58,2166,0 +192,193,12145,0.10338450919956,3.6916666666667,49,2040,0 +193,194,12162,0.10142513773096,3.5197222222222,36,2013,0 +194,195,12165,0.09779274451732,3.5186111111111003,111,2000,0 +195,196,12125,0.14744152252573,3.2597222222222,81,2117,0 +196,197,12097,0.083396348606149,3.0930555555556,92,2775,0 +197,198,12099,0.095637498006913,2.7825,113,2116,0 +198,199,12140,0.14768844039376006,2.4494444444444,90,1991,0 +199,200,12188,0.1131872329372,2.2369444444444,183,3162,0 +200,201,12157,0.073729686380986,2.0961111111111,117,2958,0 +201,202,12128,0.064614077523704,2.0377777777778,110,3153,0 +202,203,12190,0.056019959597275015,2.0730555555556003,179,2190,0 
+203,204,12151,0.074812141908008,2.1655555555556,134,2172,0 +204,205,12214,0.02489388427845201,2.285,135,2074,0 +205,206,12275,0.023695834967821,2.4283333333333,100,2078,0 +206,207,12164,0.058680009072634,2.6186111111111,47,2406,0 +207,208,12120,0.10008779345816002,2.7372222222222,88,2018,0 +208,209,12693,0.066566772961868,2.8266666666667004,74,2091,0 +209,210,12624,0.070501147961051,2.8469444444444,58,2310,0 +210,211,12163,0.098779019649936,2.9855555555556,100,2113,0 +211,212,12100,0.11803653713501,3.1038888888889,49,2518,0 +212,213,12162,0.10076746585103,3.4058333333333,36,2605,0 +213,214,12106,0.053210709415363,3.6138888888889,40,2680,0 +214,215,12156,0.099346579713514,3.93,50,2228,0 +215,216,12120,0.047275248011591,3.8155555555556,58,2023,0 +216,217,12420,0.091262209791582,3.6588888888889,50,3702,0 +217,218,12417,0.038593218846488,3.5913888888889,53,1992,0 +218,219,12450,0.070273907645883,3.4644444444444003,93,1988,0 +219,220,12395,0.029431888410363,3.3944444444444,78,1919,0 +220,221,12382,0.096854769984307,3.2227777777778,84,2213,0 +221,222,12438,0.11656453357642,2.7961111111111,112,2181,0 +222,223,12363,0.12109055114779,2.4383333333333,73,2152,0 +223,224,12393,0.20381554615786,2.2647222222222005,91,2393,0 +224,225,12399,0.046311768005022014,2.1886111111111,114,2173,0 +225,226,12456,0.18261306403662,2.2825,127,2109,0 +226,227,12442,0.021992750543024,2.3333333333333,69,3606,0 +227,228,12481,0.088072259040681,2.445,59,2114,0 +228,229,12432,0.037896500450725,2.5811111111111,64,2135,0 +229,230,12403,0.09882843339863,2.7094444444444,75,2303,0 +230,231,12406,0.076277687882641,2.88,44,2137,0 +231,232,12462,0.022875979046571,2.8555555555556,52,2264,0 +232,233,13034,0.10022162220861,2.7791666666667,42,2245,0 +233,234,12830,0.08117200437078799,2.7772222222222,45,2151,0 +234,235,12439,0.09750667785645803,3.02,26,2330,0 +235,236,12541,0.05680722879784299,3.2213888888888995,29,3357,0 +236,237,12462,0.12240855732315,3.6211111111111,32,3152,0 +237,238,12394,0.1715485140175,4.0219444444444,44,2693,0 +238,239,12507,0.075015592829224,4.0980555555556,41,3798,0 +239,240,12512,0.11388410095531,3.9080555555556,42,4596,0 +240,241,12093,0.10519027968795,3.7269444444444,46,2529,0 +241,242,12197,0.1150532998405,3.6244444444444,40,2124,0 +242,243,12138,0.10890530980571,3.5252777777778,64,2762,0 +243,244,12174,0.099350621485086,3.4675,70,2973,0 +244,245,12163,0.12889794040441002,3.3316666666667003,69,3041,0 +245,246,12096,0.12069378235889,2.9497222222222,73,2179,0 +246,247,12166,0.13053034917739,2.5708333333333,85,2322,0 +247,248,12187,0.078977758004111,2.3086111111111,63,2274,0 +248,249,12246,0.08088416337864099,2.2311111111111,67,2448,0 +249,250,12335,0.04008956024204,2.3119444444444,68,3811,0 +250,251,12556,0.05063725351997099,2.3536111111111,62,3761,0 +251,252,12652,0.039066291775136,2.4819444444444,69,4269,0 +252,253,12646,0.028611752774164,2.6605555555556,82,4244,0 +253,254,12803,0.040593364983329,2.7527777777778,56,4417,0 +254,255,12570,0.038807415292018,3.0741666666667005,38,3758,0 +255,256,12633,0.07832796288132203,2.8522222222222,30,4375,0 +256,257,13146,0.066320996162546,2.7277777777778,48,4158,0 +257,258,12994,0.083175583471284,2.7502777777778,63,3410,0 +258,259,12314,0.06802464587725401,2.8797222222222,34,2853,0 +259,260,12193,0.051675070535006,3.2027777777778,11,2628,0 +260,261,12127,0.044129112207997014,3.5633333333333,22,2287,0 +261,262,12140,0.037685894365982006,3.8808333333333,22,3334,0 +262,263,12174,0.093414561465838,4.0352777777778,12,2795,0 
+263,264,12180,0.06987083046098,3.8966666666667,10,2089,0 +264,265,12861,0.021992750543024,3.7225,14,2260,0 +265,266,12957,0.11305566197523,3.73,39,3176,0 +266,267,12981,0.030884138240845,3.5558333333333,55,4049,0 +267,268,12958,0.10381377439313,3.3169444444444003,90,2902,0 +268,269,12913,0.048953768695625004,3.2322222222222,68,3743,0 +269,270,12939,0.042258794089861,2.8658333333333,95,4280,0 +270,271,12933,0.048388685585470985,2.5169444444444,70,3977,0 +271,272,13006,0.034197830567692,2.3,96,4518,0 +272,273,13091,0.08835953066771099,2.1888888888889,45,2707,0 +273,274,13201,0.086890518272785,2.2030555555556,96,3522,0 +274,275,13520,0.031087561676959,2.2711111111111,74,4584,0 +275,276,13675,0.071287463233942,2.4697222222222,82,4141,0 +276,277,13594,0.14372616993938,2.5988888888889,82,4831,0 +277,278,13466,0.12647517487142998,2.7258333333333,45,3991,0 +278,279,13448,0.042854531198562,2.7858333333333,134,4645,0 +279,280,13492,0.039930389849144,2.7922222222222,119,4967,0 +280,281,14123,0.076184645265048,2.6988888888889,86,4578,0 +281,282,13839,0.037830020408535,2.7663888888889,75,4972,0 +282,283,13335,0.030884138240845,2.8938888888889,45,5522,0 +283,284,13196,0.048316550276279,3.1875,50,2832,0 +284,285,13047,0.10986585566763,3.6463888888889,31,2826,0 +285,286,13008,0.025485002897852004,3.866666666666701,88,2855,0 +286,287,12763,0.12451757643335,3.9808333333333,42,2660,0 +287,288,12949,0.12875690949235,3.8277777777778,70,2447,0 +288,289,13009,0.15720639094135,3.6269444444444,106,2545,0 +289,290,13008,0.079092017261926,3.5266666666667,44,3842,0 +290,291,12890,0.14711499890479998,3.5077777777778,57,2332,0 +291,292,13004,0.0531410973178,3.3455555555556,95,2294,0 +292,293,12918,0.10136246281349,3.1241666666667003,91,3016,0 +293,294,12910,0.053119315802353,2.8713888888889,66,3944,0 +294,295,12915,0.11313351589999003,2.5133333333333,66,2332,0 +295,296,13121,0.076760188212735,2.2197222222222,82,2405,0 +296,297,13076,0.08890522133351199,2.205,73,2572,0 +297,298,13096,0.1009555130175,2.2677777777778,69,2558,0 +298,299,13339,0.15685427502807,2.2991666666667,107,3701,0 +299,300,13635,0.11090638960365,2.4277777777778,101,4228,0 +300,301,13493,0.054798089981891,2.5333333333333,66,3990,0 +301,302,13402,0.08461316628091001,2.6422222222222005,47,4707,0 +302,303,13417,0.15790425505315,2.8211111111111005,47,3857,0 +303,304,13382,0.021675109392134,2.7625,66,3874,0 +304,305,14199,0.14112049645292002,2.7391666666667,102,4369,0 +305,306,13973,0.059612111520904,2.7525,71,4488,0 +306,307,13284,0.067835890522602,2.8644444444444,53,3637,0 +307,308,13070,0.047414460026828,3.1927777777778,28,2705,0 +308,309,12983,0.050348669783997005,3.5872222222222,24,2429,0 +309,310,13075,0.07296715773193299,3.8305555555556,23,2839,0 +310,311,12991,0.10713527159169,3.8827777777778,30,2371,0 +311,312,12993,0.073622496612493,3.7291666666667,25,2758,0 +312,313,13121,0.11556476355437,3.6172222222222,29,2291,0 +313,314,13097,0.034160489683707995,3.4491666666667005,27,2220,0 +314,315,13150,0.019571935182124,3.4097222222222,77,2620,0 +315,316,13078,0.15720996206912,3.2605555555556,46,2467,0 +316,317,13140,0.11515041454164,3.2191666666667,86,2088,0 +317,318,13102,0.086415715789296,2.9586111111111,97,2137,0 +318,319,13110,0.092606306920552,2.6036111111111,88,2907,0 +319,320,13138,0.046458579038692015,2.3319444444444,110,2558,0 +320,321,13238,0.10977831600416,2.2025,89,2823,0 +321,322,13317,0.11090009191451,2.2711111111111,134,2465,0 +322,323,13512,0.076652795374797,2.2897222222222005,84,4399,0 
+323,324,13669,0.1087202400467,2.3297222222222005,109,4088,0 +324,325,13651,0.11471628863897,2.395,57,5099,0 +325,326,13580,0.11070024667119,2.5063888888889,49,5157,0 +326,327,13538,0.026827723134058,2.7077777777778,83,3782,0 +327,328,13657,0.029426630692549,2.735,101,4008,0 +328,329,14183,0.028611752774164,2.6958333333333,88,4534,0 +329,330,14117,0.053106181092382014,2.6930555555556,56,3242,0 +330,331,13166,0.055538160906184006,2.875,31,2808,0 +331,332,13265,0.11009690391165,3.1788888888888995,22,3676,0 +332,333,13085,0.10979978093137,3.5808333333333,32,3523,0 +333,334,13167,0.036174223284821,3.8508333333333,27,3038,0 +334,335,13170,0.048361321378982,3.9180555555556,17,2299,0 +335,336,13132,0.10958125953198,3.815,27,2345,0 +336,337,13055,0.047305343559722,3.6080555555556,38,2565,0 +337,338,13025,0.045316868664604014,3.4927777777778,73,2576,0 +338,339,13076,0.13255054531036,3.4316666666667004,56,2327,0 +339,340,13044,0.079695587369141,3.3436111111111004,49,2211,0 +340,341,13035,0.10277355185943,3.0663888888889,90,2642,0 +341,342,13103,0.15061124796385,2.7894444444444,106,3646,0 +342,343,13067,0.14509169704095,2.4994444444444,51,2281,0 +343,344,13183,0.054445250001619004,2.2544444444444,99,2474,0 +344,345,13144,0.082058799915824,2.0847222222222,104,2536,0 +345,346,13166,0.042151311782819015,2.0888888888889,119,2900,0 +346,347,13406,0.057404703309705984,2.1594444444444,73,3144,0 +347,348,13544,0.040891918425583,2.2533333333333,92,3725,0 +348,349,13608,0.045224636676715,2.3880555555556,57,4305,0 +349,350,13522,0.0,2.6338888888889,100,3665,0 +350,351,13595,0.0,2.6588888888889,93,3791,0 +351,352,13420,0.10335456693443,2.7586111111111005,111,3897,0 +352,353,14163,0.033846222120808,2.8797222222222,91,3494,0 +353,354,13678,0.026167129419328,2.785,43,3353,0 +354,355,13272,0.08571767780871499,2.8219444444444,91,2741,0 +355,356,13071,0.12459953631184,3.0055555555556,63,2463,0 +356,357,13004,0.054750658073534006,3.2936111111111,60,3477,0 +357,358,13068,0.20799106772677,3.5575,56,2792,0 +358,359,13031,0.10314231079956,3.676111111111101,59,2183,0 +359,360,13013,0.12212653292147,3.7166666666667,48,2874,0 +360,361,12998,0.19159058299176,3.6013888888889,65,2147,0 +361,362,12971,0.10782180851978,3.4455555555556,77,2754,0 +362,363,13000,0.06408869538637901,3.4166666666667003,60,2007,0 +363,364,12998,0.095540168894753,3.1791666666667004,94,2564,0 +364,365,12906,0.039360296791109,3.0013888888889,84,3020,0 +365,366,12969,0.086611479249287,2.72,99,2004,0 +366,367,12963,0.05845507441603001,2.4527777777778,61,2047,0 +367,368,12933,0.051490800079599004,2.1816666666667,60,3531,0 +368,369,12990,0.075496432869001,2.0161111111111,78,2383,0 +369,370,12980,0.10358625218721,1.9769444444444,81,2112,0 +370,371,12982,0.062806431427897,2.0597222222222,61,2554,0 +371,372,12989,0.08970338978685001,2.2111111111111,68,2371,0 +372,373,13073,0.094517316130968,2.3141666666667,53,2060,0 +373,374,12950,0.032322011663911,2.4280555555556003,49,2086,0 +374,375,12990,0.047911560407608,2.5855555555556,40,2130,0 +375,376,13035,0.062001214431213,2.6977777777778,125,2072,0 +376,377,13681,0.027102718749392,2.7777777777778,61,2033,0 +377,378,13304,0.034703114844079,2.7988888888889,111,2683,0 +378,379,12965,0.066236017573192,2.8927777777778,32,2046,0 +379,380,12966,0.032230355211769,3.0413888888889,21,2064,0 +380,381,12943,0.11559664215716,3.3569444444444,14,2067,0 +381,382,12958,0.021952502374124,3.4808333333333,32,2496,0 +382,383,13005,0.13347711194703,3.764166666666701,29,4758,0 +383,384,12923,0.10579408349834,3.8097222222222,26,2806,0 
+384,385,12812,0.10679035350244,3.6911111111111,52,2227,0 +385,386,12803,0.068633627680319,3.4902777777778,39,3123,0 +386,387,12850,0.04699518011436099,3.3769444444444,78,3460,0 +387,388,12797,0.14159640074335994,3.3011111111111004,78,3587,0 +388,389,12732,0.078500039299167,3.1369444444444,83,2558,0 +389,390,12817,0.049232295047845,2.8475,63,2306,0 +390,391,12818,0.078777592482879,2.4544444444444,108,2083,0 +391,392,12815,0.08993433499951,2.1247222222222,158,3073,0 +392,393,12805,0.081869163858473,2.0266666666667,115,3325,0 +393,394,12703,0.14556064903749,2.1763888888889,112,2321,0 +394,395,12771,0.0,2.3088888888889,73,2846,0 +395,396,12847,0.0,2.4213888888889,93,2482,0 +396,397,12872,0.030693547421212,2.6436111111111,65,2306,0 +397,398,12815,0.0,2.6602777777778,91,2298,0 +398,399,12844,0.046999447831427,2.7677777777778,106,2907,0 +399,400,12811,0.028815579681692,2.8066666666667004,66,2329,0 +400,401,13472,0.0,2.7661111111111003,26,2456,0 +401,402,13063,0.039360296791109,2.8133333333333,23,2178,0 +402,403,12833,0.039570832199428,2.9186111111111,24,2142,0 +403,404,12842,0.090659246308087,3.1930555555556,19,2277,0 +404,405,12804,0.10540579050057003,3.565,23,3066,0 +405,406,12852,0.062601610466313,3.9133333333333,30,3619,0 +406,407,12862,0.051455855638306,3.9658333333333,23,3726,0 +407,408,12799,0.054631758648785014,3.8930555555556,35,2282,0 +408,409,12789,0.09017822949731,3.7297222222222,41,3079,0 +409,410,12815,0.045287525091609014,3.6516666666667,63,2448,0 +410,411,12887,0.033344698319951,3.5927777777778,33,2574,0 +411,412,12903,0.080098394586215,3.4694444444444,50,3697,0 +412,413,12892,0.025162301034707,3.2536111111111,88,3067,0 +413,414,12907,0.078260793447992,2.8986111111111,115,3491,0 +414,415,12883,0.07223863924679201,2.4488888888889,69,3195,0 +415,416,12965,0.042917873674349,2.2119444444444,116,2763,0 +416,417,12932,0.04720597158087901,2.2011111111111,73,2605,0 +417,418,13134,0.048273008229067,2.2338888888889,75,2755,0 +418,419,13440,0.036987975876273,2.3116666666667003,56,3300,0 +419,420,13544,0.06291463671717,2.3869444444444,66,3838,0 +420,421,13508,0.033319304393751,2.5119444444444,70,3608,0 +421,422,13401,0.029115275623859,2.5713888888889,52,3845,0 +422,423,13410,0.06821638123436,2.5088888888889,32,3563,0 +423,424,13482,0.015408589348188,2.4155555555556,16,5478,0 +424,425,14124,0.01916018435633,3.6455555555556,46,3656,0 +425,426,13703,0.06374239746477901,2.4625,53,3491,0 +426,427,13250,0.099738890728803,2.5808333333333,67,3430,0 +427,428,13092,0.10950621554455,3.0033333333333,58,2807,0 +428,429,13012,0.06138920621589401,3.3486111111111003,17,2524,0 +429,430,12901,0.051307638060244014,3.6644444444444,26,2964,0 +430,431,12848,0.082471571552878,4.0083333333333,13,3969,0 +431,432,13025,0.060122448878635,3.8530555555556,8,3561,0 +432,433,11352,0.07469842969719999,3.6183333333333,20,3394,0 +433,434,8761,0.056170625137636994,3.4922222222222,23,3005,0 +434,435,10433,0.052668952946361,3.4958333333333,34,2350,0 +435,436,10088,0.068871884486763,3.2738888888889,35,2139,0 +436,437,9485,0.040236057110938986,3.2102777777778,48,2098,0 +437,438,8865,0.053200012471363,2.8475,67,2341,0 +438,439,8920,0.056725172482788,2.4883333333332995,38,2698,0 +439,440,8798,0.035229341473877,2.1955555555556003,33,2968,0 +440,441,8927,0.0,2.1461111111111,40,2824,0 +441,442,9211,0.020190723068726,2.1522222222222,37,3003,0 +442,443,9286,0.093342961377898,2.3122222222222004,51,3551,0 +443,444,9725,0.0,2.4033333333333,52,4689,0 +444,445,11050,0.015717168144981003,2.4944444444444,57,3481,0 
+445,446,11521,0.017190609993733997,2.6622222222222005,82,3376,0 +446,447,11603,0.0,2.675,74,3198,0 +447,448,11665,0.043273461915965,2.6997222222222,80,3059,0 +448,449,12153,0.029854520963498,2.6997222222222,78,2937,0 +449,450,11672,0.017383620014121998,2.7194444444444,58,2881,0 +450,451,11119,0.046391383573699006,2.8258333333333,41,2777,0 +451,452,11124,0.042155878228,3.1044444444444,34,2510,0 +452,453,10734,0.052684222339579014,3.4736111111111003,35,2356,0 +453,454,11612,0.063573954212613,3.6972222222222,40,2383,0 +454,455,11523,0.077413583128967,3.8038888888889,35,2455,0 +455,456,11632,0.069605078732108,3.7494444444444,37,2285,0 +456,457,12838,0.075937967855042,3.6813888888889,43,2455,0 +457,458,11637,0.047354002438352014,3.4791666666667003,45,4298,0 +458,459,12542,0.044000040388062,3.4530555555556,48,2400,0 +459,460,12394,0.095130971924595,3.2841666666667004,77,3431,0 +460,461,12419,0.069274987547704,3.205,79,2252,0 +461,462,12484,0.061118974117397,2.8436111111111004,59,2628,0 +462,463,12413,0.056393740750134,2.4441666666667,107,3266,0 +463,464,12440,0.06125086589409901,2.275,100,2620,0 +464,465,12614,0.047746883512707,2.1788888888889,84,2824,0 +465,466,12693,0.047136440673386,2.2083333333333,99,2801,0 +466,467,12989,0.0,2.2997222222222,103,3106,0 +467,468,13200,0.0,2.3155555555556004,47,3532,0 +468,469,13108,0.049828520132601,2.41,67,4210,0 +469,470,12886,0.0,2.5902777777778,65,3646,0 +470,471,13000,0.0,2.6636111111111,65,3768,0 +471,472,13071,0.043576825212604,2.7105555555556,70,5342,0 +472,473,13563,0.035173891965945,2.6811111111111,76,5327,0 +473,474,13333,0.04413510379665099,2.715,40,3363,0 +474,475,12672,0.016955671451488998,2.7083333333333,54,3016,0 +475,476,12547,0.1330396486107,3.0038888888889,45,3257,0 +476,477,12289,0.016462114132943,3.3911111111111003,32,2619,0 +477,478,12584,0.055696363369897,3.6375,26,2573,0 +478,479,12526,0.036411774365825,3.7755555555556,25,2575,0 +479,480,12416,0.047966724418057,3.5786111111111003,34,5355,0 +480,481,12450,0.05609961782665,3.4222222222222,43,5809,0 +481,482,12460,0.096990479781121,3.2538888888889,68,3823,0 +482,483,12425,0.11147038220964,3.1683333333333,60,3116,0 +483,484,12430,0.044797927381498,3.0677777777778,74,2321,0 +484,485,12418,0.024403519177111,2.94,68,2193,0 +485,486,12437,0.08532776818426499,2.7291666666667003,43,2982,0 +486,487,12484,0.043615168647623,2.4147222222222005,73,4140,0 +487,488,12380,0.056692005942856,2.1419444444444,72,2353,0 +488,489,12620,0.033708553131457,2.0244444444444,66,3350,0 +489,490,12674,0.040148453968243986,2.0458333333333,90,3184,0 +490,491,12855,0.099551526697496,2.09,104,3469,0 +491,492,13053,0.0,2.1575,114,4204,0 +492,493,12898,0.036157867549894,2.2655555555556,98,6447,0 +493,494,12809,0.052738784696875,2.2561111111111,70,4898,0 +494,495,12964,0.021636091422947,2.4669444444444,101,3633,0 +495,496,12956,0.037120220639643986,2.5277777777778,77,4189,0 +496,497,13625,0.034467327401996005,2.5266666666667,69,4012,0 +497,498,13285,0.0,2.5438888888889,19,4009,0 +498,499,12715,0.096807019710259,2.6511111111111,47,4346,0 +499,500,12637,0.059601475230884,2.9711111111111004,38,2781,0 +500,501,12535,0.068431521141608,3.2288888888889,22,2811,0 +501,502,12512,0.09611085542804,3.505,20,2415,0 +502,503,12549,0.064177980162036,3.4944444444444,26,3589,0 +503,504,12567,0.11565746993409,3.4633333333333,24,2878,0 +504,505,12362,0.073501732487291,3.3177777777778,27,3471,0 +505,506,12326,0.072746100819649,3.1963888888889,25,2697,0 +506,507,12450,0.07557888002360401,3.1069444444444,57,2583,0 
+507,508,12404,0.036816888038697,3.0172222222222,58,3173,0 +508,509,12362,0.093969235453559,2.9247222222222,81,3341,0 +509,510,12431,0.034848294186597004,2.5336111111111,81,2305,0 +510,511,12351,0.084191269180943,2.2480555555556,69,2186,0 +511,512,12528,0.13109036514766,2.0383333333333,50,4439,0 +512,513,12559,0.061132356147447,1.8852777777778,55,3173,0 +513,514,12586,0.019478099970089,1.9225,57,2831,0 +514,515,12864,0.0,1.9719444444444,78,16385,0 +515,516,13026,0.0,2.0608333333333,57,83955,0 +516,517,12880,0.017965204407153,2.16,78,4574,0 +517,518,12743,0.019202263481759,2.3077777777778,95,4987,0 +518,519,12812,0.0,2.415,88,5110,0 +519,520,12878,0.052306327013631,2.4669444444444,108,4893,0 +520,521,13427,0.08536575533023,2.5125,87,3807,0 +521,522,13081,0.052461360256699015,2.6294444444444,87,3447,0 +522,523,12752,0.035302992848671,2.8183333333333,44,4329,0 +523,524,12594,0.028682734942579,3.0547222222222,39,5166,0 +524,525,12507,0.024204462299365,3.33,27,3454,0 +525,526,12494,0.034360100307537,3.5738888888889,23,3578,0 +526,527,12487,0.018977302969238,3.6888888888889,11,2406,0 +527,528,12404,0.034308847257872,3.7111111111111,13,2073,0 +528,529,11147,0.07460088255490599,3.7180555555556,24,1925,0 +529,530,11147,0.055037935083209005,3.6041666666667,77,2357,0 +530,531,11128,0.039311673522385,3.4483333333333,54,1947,0 +531,532,11106,0.046619928266775,3.2413888888888995,45,1912,0 +532,533,11115,0.048227542028921,3.1355555555556,36,2107,0 +533,534,11044,0.020367863848114,2.8172222222222,59,2985,0 +534,535,11110,0.063069968046591,2.4275,81,2081,0 +535,536,11190,0.054470866056974016,2.2513888888889,50,2631,0 +536,537,11063,0.0,2.0691666666667,53,2130,0 +537,538,11078,0.059261864411046,2.0155555555556,44,2085,0 +538,539,11146,0.064174002348993,2.0952777777778,87,2211,0 +539,540,11010,0.0,2.2397222222222,94,2105,0 +540,541,11139,0.021912411214588,2.3275,128,2585,0 +541,542,11117,0.057958262002105985,2.5255555555556004,82,3695,0 +542,543,11081,0.035358633773416,2.665,49,3198,0 +543,544,11128,0.029191244440103,2.7975,79,3191,0 +544,545,11720,0.054981313823219,2.8597222222222,62,2016,0 +545,546,11384,0.06405347705857799,2.7983333333333,64,2124,0 +546,547,11018,0.0,2.9322222222222,34,2105,0 +547,548,11104,0.055445634363329,3.08,41,2031,0 +548,549,11084,0.040996998867197,3.3466666666667004,47,1964,0 +549,550,11106,0.027670189755404,3.6869444444444,31,2016,0 +550,551,11055,0.054579839310753,3.7966666666667,26,3909,0 +551,552,11098,0.044833640073299014,3.7805555555556,17,2105,0 +552,553,11028,0.03282297151413,3.7422222222222,30,2405,0 +553,554,11152,0.017696014614986,3.639166666666701,17,2141,0 +554,555,11025,0.09418709999244,3.4775,28,1910,0 +555,556,11015,0.061817529149429,3.3283333333333,20,1951,0 +556,557,11125,0.054000161367618,3.1702777777778,85,2310,0 +557,558,11035,0.06165600249599,2.7688888888889,52,2047,0 +558,559,11103,0.055915839259234,2.4266666666667,143,2048,0 +559,560,11100,0.062788330996733,2.1963888888889,106,3083,0 +560,561,11170,0.044888048273534,2.135,244,3619,0 +561,562,11078,0.095259484956337,2.3186111111111,2005,2172,0 +562,563,11150,0.021952502374124,2.3383333333333,124,3142,0 +563,564,11149,0.0,2.5002777777778,109,2256,0 +564,565,10984,0.0,2.6527777777778,148,2200,0 +565,566,11034,0.0,2.7661111111111003,126,2183,0 +566,567,11050,0.061557079663167,2.7347222222222,46,2030,0 +567,568,11102,0.14186075040414,2.6069444444444,49,2297,0 +568,569,11743,0.0,2.5547222222222,40,2213,0 +569,570,11371,0.077457673524504,2.4716666666667004,39,4014,0 
+570,571,11078,0.16422977329792998,2.6530555555556004,25,2809,0 +571,572,11224,0.049366067455729,2.9488888888889,37,2355,0 +572,573,11146,0.10064381631633,3.3383333333333,32,2372,0 +573,574,11199,0.11909159312806,3.5419444444444,47,2387,0 +574,575,11181,0.09003816676619801,5.3302777777778,34,2359,0 +575,576,11022,0.055882659245704,3.7727777777778,40,2485,0 +576,577,11073,0.1836893913223,3.6333333333333,46,3728,0 +577,578,11120,0.08574268253550299,3.5430555555556,35,2820,0 +578,579,11008,0.12559700716583,3.6711111111111,61,2426,0 +579,580,11078,0.086129850619071,3.4572222222222,56,2307,0 +580,581,11121,0.041752618326160014,3.2,72,2233,0 +581,582,11041,0.094396473652892,2.7772222222222,110,2178,0 +582,583,11168,0.045323960075285004,2.415,135,2243,0 +583,584,11213,0.13808411333909,2.2530555555556004,133,2713,0 +584,585,11238,0.08029349854683501,2.0994444444444,148,3168,0 +585,586,11273,0.06507307495461,2.1780555555556003,86,3163,0 +586,587,11479,0.084518021856329,2.2638888888889,132,3289,0 +587,588,11839,0.030507395540508,2.3575,73,4001,0 +588,589,11735,0.05892502921299701,2.4680555555556003,95,4684,0 +589,590,11574,0.0,2.6208333333333,74,4137,0 +590,591,11531,0.033075906123641,2.6863888888889,51,4787,0 +591,592,11420,0.16633704704670998,2.6172222222222,65,4278,0 +592,593,12301,0.10228536028167,2.6194444444444,95,3898,0 +593,594,11845,0.16949365549682996,2.6358333333333,72,3728,0 +594,595,11374,0.08260397756200501,2.8661111111111004,41,4047,0 +595,596,11370,0.024378363844868,3.0533333333333,38,3373,0 +596,597,11197,0.15686874147816002,3.4438888888889,32,2669,0 +597,598,11171,0.063929461148943,3.6552777777778,22,3289,0 +598,599,11197,0.12602019009982998,3.8519444444444,29,2556,0 +599,600,11114,0.035137191893634005,3.8069444444444,32,2557,0 +600,601,12564,0.14965728062748998,3.5961111111111004,40,3003,0 +601,602,12459,0.10046170077382,3.5344444444444,59,2441,0 +602,603,12508,0.13163105487926,3.3972222222222,52,2396,0 +603,604,12464,0.043899611017859004,3.3936111111111003,42,3426,0 +604,605,12438,0.19567092855859,3.1025,46,2379,0 +605,606,12449,0.19135011734275,2.8630555555556,97,3026,0 +606,607,12373,0.11171915024595,2.4255555555556003,72,2336,0 +607,608,12594,0.032053604746412,1.8619444444444,81,2850,0 +608,609,12623,0.096448361580655,1.8930555555556,81,3016,0 +609,610,12759,0.07934996156433399,2.2080555555556,70,3537,0 +610,611,12841,0.024581173073578,2.3052777777778,89,3899,0 +611,612,13063,0.025596039426134,2.3777777777777995,87,5044,0 +612,613,13023,0.027922074309281,2.5161111111111,125,4806,0 +613,614,12884,0.02593545023878,2.6411111111111,69,4139,0 +614,615,13007,0.033086949155743,2.8011111111111004,57,4776,0 +615,616,13016,0.047260069860172005,2.7236111111111003,99,4065,0 +616,617,13588,0.038487130166032016,2.6813888888889,111,4969,0 +617,618,13272,0.16080169828563,2.7336111111111,71,3784,0 +618,619,12589,0.12635270044885,2.8863888888889,71,3297,0 +619,620,12651,0.046904491868436,3.1225,48,3347,0 +620,621,12616,0.059534673085297,3.4613888888889,76,3170,0 +621,622,12492,0.12198352023568,3.8297222222222,56,2241,0 +622,623,12497,0.052131597947042,3.8936111111111,35,2301,0 +623,624,12623,0.094084438832673,3.7588888888889,35,2303,0 +624,625,12481,0.13486764750848,3.5827777777778,29,2587,0 +625,626,12434,0.062226183256115,3.4730555555556,38,3211,0 +626,627,12495,0.091202035463034,3.4175,69,2604,0 +627,628,12375,0.096137859324631,3.3533333333333,77,2841,0 +628,629,12357,0.10449109200785,3.1963888888889,20,2168,0 +629,630,12433,0.097127966420289,2.8852777777778,24,2265,0 
+630,631,12432,0.064404980330111,2.4880555555556003,83,2908,0 +631,632,12429,0.10188181868693,2.2325,62,3180,0 +632,633,12551,0.19953464365013,2.1044444444444,54,3118,0 +633,634,12799,0.0747839457206,2.1097222222222,54,3296,0 +634,635,12818,0.0,2.235,60,4432,0 +635,636,13071,0.0,2.3516666666667003,63,4336,0 +636,637,12897,0.0,2.5138888888889,95,4534,0 +637,638,12961,0.041436571087464,2.6105555555556004,69,4261,0 +638,639,12925,0.038671790863765,2.7233333333333,68,5248,0 +639,640,12968,0.035810634316102014,2.6633333333333,58,5014,0 +640,641,13525,0.1409929213297,2.5580555555556,107,3864,0 +641,642,12993,0.0,2.6627777777778,48,5682,0 +642,643,12369,0.052915080344848,2.7625,64,4404,0 +643,644,12195,0.11966022897483,3.0283333333333,52,3705,0 +644,645,12464,0.12973870706052,3.3727777777778,61,2738,0 +645,646,12470,0.023838633821411,3.6369444444444,47,2887,0 +646,647,12475,0.12358680271021,3.7088888888889,58,3776,0 +647,648,12482,0.089095336472172,3.5847222222222,51,3532,0 +648,649,12221,0.019762530636927,3.4836111111111,61,3724,0 +649,650,12325,0.020994992941051,3.4077777777778,53,2786,0 +650,651,12258,0.10380294658324002,3.4441666666667,55,2941,0 +651,652,11980,0.079228021087742,3.1683333333333,52,2351,0 +652,653,11947,0.039012779943635,3.0527777777778,89,2316,0 +653,654,12291,0.10658713601061,2.8527777777778,85,2350,0 +654,655,12293,0.14426278476756,2.5433333333333,106,2916,0 +655,656,12341,0.08706206992122,2.1997222222222,88,2437,0 +656,657,12390,0.16325946030154,2.1036111111111,59,2761,0 +657,658,12611,0.0,2.2133333333333,48,3941,0 +658,659,12737,0.0,2.2086111111111,66,4025,0 +659,660,12882,0.07729609083366701,2.2883333333333,95,4466,0 +660,661,12891,0.058100747891124,2.3222222222222,82,4401,0 +661,662,12756,0.061191523312340984,2.47,76,4747,0 +662,663,12875,0.08592375974441901,2.685,104,4051,0 +663,664,12847,0.033467197342519,2.6763888888889,54,4448,0 +664,665,13518,0.030265788895452006,2.5838888888889,43,3736,0 +665,666,13217,0.11950310860409,2.6130555555556003,39,3918,0 +666,667,12621,0.09169148327055697,2.7633333333333,48,3408,0 +667,668,12591,0.18439354827551,3.0708333333333,38,2883,0 +668,669,12332,0.10741924067542,3.4347222222222,45,3631,0 +669,670,12404,0.15862461647089002,3.7030555555556,64,2609,0 +670,671,12457,0.14957813136313,3.8138888888889,35,2533,0 +671,672,12370,0.24059408570531,3.8508333333333,66,2469,0 +672,673,11509,0.15511115210127,3.8961111111111,61,2458,0 +673,674,11433,0.19582462633148,3.4763888888889,58,2458,0 +674,675,11317,0.13981560037535998,3.4041666666667,51,2043,0 +675,676,11364,0.1392329990551,3.2352777777778,55,1985,0 +676,677,11350,0.13079770999921,3.1508333333333,126,2032,0 +677,678,11348,0.053672881218709015,2.7863888888888995,61,3409,0 +678,679,11365,0.10971373742228,2.4861111111111,94,2018,0 +679,680,11505,0.13825204927093,2.2444444444444,83,2461,0 +680,681,11468,0.13912778922607,2.1286111111111,136,2318,0 +681,682,11562,0.10215803640865,2.1261111111111,104,2787,0 +682,683,11858,0.096617489053804,2.2405555555556003,77,3186,0 +683,684,11933,0.0,2.2991666666667,109,3490,0 +684,685,11813,0.0,2.3627777777778,146,3407,0 +685,686,11735,0.0,2.5863888888889,69,3193,0 +686,687,11848,0.0,2.7286111111111,121,3412,0 +687,688,11843,0.0,2.8355555555556,53,3563,0 +688,689,12318,0.068897518746959,2.7875,61,3247,0 +689,690,11846,0.05418569809170299,2.7825,82,3012,0 +690,691,11066,0.06507307495461,2.7972222222222,37,2382,0 +691,692,10920,0.10547682048851,3.0355555555556,19,2012,0 +692,693,10836,0.056437861708265,3.2486111111111,19,1915,0 
+693,694,10879,0.098703711593837,3.6077777777778,19,1982,0 +694,695,10796,0.14331889652193,3.76,54,1950,0 +695,696,10785,0.05704449488642,3.806666666666701,44,4176,0 +696,697,9469,0.0,3.6638888888889,46,3654,0 +697,698,9278,0.032146952736052,3.5161111111111003,53,3063,0 +698,699,9417,0.068135614649249,3.3286111111111003,83,1916,0 +699,700,9253,0.034514299845882,3.2166666666667,92,1848,0 +700,701,9435,0.028306668795131006,2.9783333333333,94,1704,0 +701,702,9356,0.13119921991025002,2.7211111111111004,111,1680,0 +702,703,9354,0.093609772007723,2.4102777777778,84,2011,0 +703,704,9405,0.11179018663123,2.1366666666667,52,1772,0 +704,705,9326,0.065272680657868,1.9947222222222,68,1838,0 +705,706,9549,0.15901886092526998,1.9936111111111,35,1924,0 +706,707,9499,0.0,2.0788888888889,40,2038,0 +707,708,9371,0.26537507315217,2.1736111111111,47,1991,0 +708,709,9462,0.0,2.4027777777778,85,1729,0 +709,710,9509,0.056610336908172985,2.4580555555556,59,1673,0 +710,711,9469,0.026644044055307004,2.6102777777777995,61,1656,0 +711,712,9522,0.040819652463459,2.7597222222222,45,1774,0 +712,713,9885,0.13497701521251,2.8122222222222,47,1784,0 +713,714,9802,0.16853433621426,2.8427777777778,72,1818,0 +714,715,9461,0.08655557751574,2.87,69,1981,0 +715,716,9393,0.05741127788681901,2.9769444444444,17,2004,0 +716,717,9638,0.037244401880164,3.3241666666667005,47,1788,0 +717,718,9435,0.1132743034971,3.6375,37,1786,0 +718,719,9519,0.15690958465910998,3.8652777777778,57,1781,0 +719,720,9492,0.09604225449090803,3.8091666666667,62,2024,0 +720,721,9458,0.06746445682560599,3.6844444444444,72,1669,0 +721,722,9420,0.058373145210404015,3.5913888888889,43,1729,0 +722,723,9429,0.048008603166117006,3.5255555555556,57,1682,0 +723,724,9461,0.12614216994504,3.3277777777778,47,1714,0 +724,725,9404,0.077186121310215,3.07,61,1679,0 +725,726,9366,0.042879382350005,2.7622222222222,53,1739,0 +726,727,9488,0.031014262794497007,2.3872222222222,78,1669,0 +727,728,9515,0.13957171072647,2.1308333333333,100,1806,0 +728,729,9487,0.027108383258306,2.1563888888889,104,1650,0 +729,730,9497,0.0,2.2547222222222003,56,1751,0 +730,731,9516,0.0,2.3397222222222003,89,1685,0 +731,732,9504,0.0,2.4808333333333,108,1645,0 +732,733,9422,0.025265991419408,2.6208333333333,67,2133,0 +733,734,9543,0.0,2.8138888888889,83,1618,0 +734,735,9395,0.047219926720593,2.9275,90,1623,0 +735,736,9352,0.083109434319356,2.8663888888888995,82,1697,0 +736,737,9884,0.10860709298782,2.7794444444444,76,1684,0 +737,738,9820,0.098319718095083,2.8194444444444,34,1779,0 +738,739,9439,0.02201293380153,2.9458333333333,43,2982,0 +739,740,9560,0.064929719079082,3.2413888888888995,40,1848,0 +740,741,9589,0.036960535765785,3.7166666666667,40,1772,0 +741,742,9575,0.068536856116777,4.1333333333333,57,1841,0 +742,743,9541,0.012398281267649,4.2697222222222,60,1834,0 +743,744,9490,0.035305311833591015,4.2797222222222,53,1860,0 +744,745,7160,0.024153733176505,4.0,44,1647,0 +745,746,7233,0.031750779212929,3.8877777777778,48,2129,0 +746,747,7166,0.092612685693125,3.6633333333333,50,1763,0 +747,748,7245,0.12674340154738,3.6127777777778,65,1433,0 +748,749,7299,0.068594711667718,3.3175,93,1428,0 +749,750,7169,0.13866540834682,2.8930555555556,105,1521,0 +750,751,7228,0.046813024390007014,2.4722222222222,94,1622,0 +751,752,7123,0.072990045810784,2.2294444444444,53,1580,0 +752,753,7199,0.17156759541908995,2.1286111111111,59,1468,0 +753,754,7167,0.051876699734571985,2.2219444444444,63,1520,0 +754,755,7212,0.031958698733103,2.3366666666667,61,1529,0 +755,756,7206,0.07333373485157901,2.4155555555556,72,1611,0 
+756,757,7149,0.0,2.5408333333333,93,1511,0 +757,758,7284,0.023187512335638,2.6511111111111,62,1906,0 +758,759,7265,0.031672522871666,2.8405555555556,50,2632,0 +759,760,7221,0.091103855362214,2.8336111111111,42,1483,0 +760,761,7588,0.0,2.6575,62,1611,0 +761,762,7423,0.0983398607742,2.6622222222222005,21,1676,0 +762,763,7198,0.08011943311413,2.7719444444444,28,1670,0 +763,764,7279,0.043646436319699,3.0344444444444,65,1631,0 +764,765,7174,0.091445521226266,3.3741666666667003,37,1799,0 +765,766,7259,0.067771120773973,3.6925,20,1511,0 +766,767,7166,0.049768578185777006,3.8136111111111,47,1605,0 +767,768,7171,0.067455979006223,3.8202777777778,45,1758,0 +768,769,6883,0.14102875351082,3.7547222222222,49,1509,0 +769,770,6859,0.04521932948417,3.6077777777778,46,1591,0 +770,771,6817,0.032382889221133,3.5330555555556,30,1543,0 +771,772,6877,0.075100266089453,3.3544444444444,30,1573,0 +772,773,6785,0.038989846359505,3.1155555555556,48,1473,0 +773,774,6665,0.093396608626074,2.8463888888888995,36,1476,0 +774,775,6805,0.06797619687558401,2.4411111111111,46,1712,0 +775,776,6863,0.08326287339845401,2.1455555555556,27,1801,0 +776,777,6926,0.015112630017379,2.0025,79,1902,0 +777,778,7004,0.031549757127405,2.1247222222222,65,2005,0 +778,779,6950,0.0,2.2741666666667,57,2363,0 +779,780,7262,0.0,2.3272222222222005,61,2513,0 +780,781,7361,0.017214486216241002,2.4363888888889,89,2664,0 +781,782,7288,0.015541991667356,2.6155555555556003,80,2714,0 +782,783,7463,0.0,2.7272222222222,79,2754,0 +783,784,7188,0.027199843934104,2.6552777777778,113,2670,0 +784,785,7658,0.053744802378685,2.6086111111111,71,2584,0 +785,786,7575,0.05675511278546901,2.6025,53,2466,0 +786,787,6954,0.070873939193717,2.7372222222222,64,2137,0 +787,788,6862,0.19022950977106,3.0125,43,1931,0 +788,789,6896,0.17589540947937002,3.3477777777778,34,1743,0 +789,790,6954,0.022875979046571,3.6236111111111,29,1713,0 +790,791,6869,0.0,3.7383333333333,30,1649,0 +791,792,6890,0.13681403156951,3.7772222222222,24,1633,0 +792,793,9742,0.058507485759525,3.6966666666667,40,1993,0 +793,794,9730,0.10227075584148,3.7733333333333,32,1940,0 +794,795,9810,0.06726096113022301,3.6408333333333,39,1951,0 +795,796,9688,0.15267199916685995,3.3922222222222,67,1894,0 +796,797,9849,0.069818221889972,3.1627777777778,65,1801,0 +797,798,9765,0.030305771594539,2.6875,49,1962,0 +798,799,9812,0.09211700324247198,2.3533333333333,41,2123,0 +799,800,9931,0.12298177354813,2.0425,50,2434,0 +800,801,9908,0.08705722689013601,1.9738888888889,48,2402,0 +801,802,10066,0.07529920073678098,2.0425,59,3013,0 +802,803,10184,0.06217694957317299,2.1563888888889,51,3086,0 +803,804,10295,0.020886039183631,2.2866666666667004,43,3527,0 +804,805,10113,0.08148200392528,2.3919444444444,72,3716,0 +805,806,10218,0.027014133895137,2.5513888888889,52,3577,0 +806,807,10322,0.08271940630361399,2.6030555555556,68,3430,0 +807,808,10269,0.038537180887872,2.6647222222222005,74,3413,0 +808,809,10781,0.090543853269643,2.5930555555556003,46,3755,0 +809,810,10486,0.02593545023878,2.5513888888889,64,4806,0 +810,811,10124,0.090692829340129,2.76,38,3127,0 +811,812,9993,0.09154630234853098,3.0636111111111,40,3421,0 +812,813,9801,0.09562635368432304,3.4016666666667,50,2475,0 +813,814,9760,0.0,3.7277777777778,42,2440,0 +814,815,9858,0.0,3.7902777777778,37,2731,0 +815,816,9884,0.027267039980187,3.7355555555556,34,2493,0 +816,817,7781,0.024102810048699,3.535,37,1665,0 +817,818,7742,0.072297652068167,3.5819444444444,47,1771,0 +818,819,7682,0.12348623922845,3.3847222222222,67,2293,0 
+819,820,7831,0.077453588867077,3.2547222222222,66,1959,0 +820,821,7641,0.05662557916213299,3.125,91,1498,0 +821,822,7641,0.15509029304093,2.7766666666667,132,1537,0 +822,823,7759,0.079595064406905,2.4725,149,1580,0 +823,824,7748,0.053225613553497,2.1927777777778,65,1901,0 +824,825,7776,0.05741127788681901,2.1283333333333,50,1916,0 +825,826,7938,0.077171346852694,2.2319444444444,70,2213,0 +826,827,8031,0.0,2.3061111111111,82,2205,0 +827,828,8117,0.07512642149906099,2.3363888888889,72,2486,0 +828,829,8099,0.0,2.3686111111111,98,2580,0 +829,830,8002,0.0,2.4986111111111,78,2530,0 +830,831,7944,0.026463035590685,2.6433333333333,86,2664,0 +831,832,7963,0.024228588329879,2.7563888888889,76,4368,0 +832,833,8602,0.055182797357095005,2.6652777777778,95,3103,0 +833,834,8269,0.09607690135523,2.6844444444444,63,2249,0 +834,835,7871,0.059431847203259,2.7902777777778,32,2070,0 +835,836,7709,0.018731901987648,3.1119444444444,30,2833,0 +836,837,7726,0.033970515582906,3.5491666666667,27,1734,0 +837,838,7781,0.049963174087431,3.7102777777778,22,2151,0 +838,839,7762,0.073295374096872,3.7961111111111,19,2103,0 +839,840,7692,0.017715537831218996,3.7730555555556,32,1725,0 +840,841,6608,0.014656639469103996,3.5919444444444,45,1895,0 +841,842,6526,0.15513271231042,3.5580555555556,65,1959,0 +842,843,6531,0.06544162031760599,3.4588888888889,73,1637,0 +843,844,6483,0.12276447331552,3.2969444444444003,52,1658,0 +844,845,6602,0.054046416943085,3.2288888888889,93,1666,0 +845,846,6555,0.06827770027642299,2.7358333333333,68,2410,0 +846,847,6610,0.10171854295932,2.4636111111111,127,1787,0 +847,848,6690,0.093454285728882,2.1894444444444,105,2264,0 +848,849,6651,0.04318436192577,2.1227777777778,75,2007,0 +849,850,6759,0.10050707347524,2.1369444444444,77,2107,0 +850,851,6836,0.019571935182124,2.2230555555556,140,2355,0 +851,852,6894,0.0,2.3188888888889,132,2726,0 +852,853,6844,0.0,2.4166666666667003,100,2875,0 +853,854,6773,0.02713995635286,2.5777777777778,174,2780,0 +854,855,6802,0.092632629280125,2.7869444444444,82,3936,0 +855,856,6947,0.098676638207998,2.8586111111111,128,3116,0 +856,857,7248,0.0,3.0816666666667003,79,3770,0 +857,858,6885,0.11132365864914,2.8713888888889,71,2382,0 +858,859,6643,0.0947301899901,2.9386111111111,60,2152,0 +859,860,6560,0.061070711161473,2.9827777777778,60,1754,0 +860,861,6554,0.18477832073133,3.3197222222222,56,1783,0 +861,862,6600,0.055986690710270993,3.5961111111111004,78,1780,0 +862,863,6525,0.16264480046039995,3.7613888888889,60,1582,0 +863,864,6543,0.026215643469448,3.7305555555556,48,2271,0 +864,865,9018,0.0,3.5580555555556,48,2592,0 +865,866,9225,0.054655616583012,3.5136111111111004,42,2921,0 +866,867,9112,0.07076692500883701,3.3772222222222,64,1814,0 +867,868,9195,0.067217215228375,3.2402777777778,36,3219,0 +868,869,9206,0.046060828388587,3.0586111111111003,40,2567,0 +869,870,9224,0.08329795085471901,2.7908333333333,18,1899,0 +870,871,9408,0.08219020764935,2.3761111111111,35,1801,0 +871,872,9082,0.046792553198475,2.1347222222222,44,2005,0 +872,873,9168,0.06755714954154099,1.9991666666667,105,2572,0 +873,874,9258,0.099050882008287,1.9983333333333,71,3563,0 +874,875,9158,0.0,2.0908333333333,65,2777,0 +875,876,9140,0.10824637351267,2.2311111111111,74,3362,0 +876,877,9206,0.0,2.3219444444444,34,3590,0 +877,878,9186,0.0,2.4727777777778,49,2930,0 +878,879,9155,0.037750185176735,2.5952777777778,44,2481,0 +879,880,9174,0.030345867660395,2.7416666666667004,57,2571,0 +880,881,9758,0.057665227298857,2.7652777777778,102,3546,0 +881,882,9451,0.16774071722374,2.7980555555556,106,4984,0 
+882,883,9153,0.10462164884166,2.7597222222222,58,1994,0 +883,884,9233,0.051974117163582,3.0116666666667005,57,3060,0 +884,885,9250,0.070438547008222,3.2916666666667003,62,2151,0 +885,886,9317,0.11437533048244,3.5547222222222,42,2158,0 +886,887,9130,0.028754095353637,3.7580555555556,35,2319,0 +887,888,9249,0.06874265819680701,3.7330555555556,28,1909,0 +888,889,8297,0.041552255552731,3.5886111111111005,27,1627,0 +889,890,8245,0.033571347720577,3.5255555555556,35,2459,0 +890,891,8298,0.014724878652831,3.3858333333333,50,3167,0 +891,892,8247,0.046095580964192,3.2677777777778,69,1839,0 +892,893,8387,0.031859774913781,3.1247222222222,64,3887,0 +893,894,8392,0.094121536253424,2.7213888888888995,69,2031,0 +894,895,8531,0.11471874999036,2.3972222222222004,58,1522,0 +895,896,8437,0.09375530196425097,2.0836111111111,58,1732,0 +896,897,8344,0.10898948864079,2.0644444444444,51,2169,0 +897,898,8274,0.031129909255124,2.2063888888889,46,1679,0 +898,899,8328,0.0,2.3044444444444,84,1941,0 +899,900,8351,0.020155867044519,2.47,144,1638,0 +900,901,8380,0.016795241270985,2.5697222222222003,86,1725,0 +901,902,8332,0.0,2.7625,69,1903,0 +902,903,8366,0.0,2.9436111111111005,81,2074,0 +903,904,8357,0.01748186857624,2.7905555555556,175,1848,0 +904,905,8867,0.015638795432702,2.7527777777778,65,1761,0 +905,906,8659,0.037878946671491,2.6980555555556,48,1838,0 +906,907,8458,0.14870829462531002,2.9102777777778,33,1640,0 +907,908,8360,0.07322030784057597,3.2663888888889,35,1715,0 +908,909,8330,0.10504553292421,3.5372222222222,37,1717,0 +909,910,8298,0.10771048774666,3.86,31,1758,0 +910,911,8381,0.07484115005697,3.9216666666667,36,1975,0 +911,912,8393,0.10377526695926,3.8766666666667,30,1865,0 +912,913,3998,0.052336696506499,3.6463888888889,28,3575,0 +913,914,3733,0.039930389849144,3.6552777777778,24,1413,0 +914,915,3735,0.052659026600132,3.5880555555556,68,1414,0 +915,916,3709,0.071593754146172,3.3594444444444003,26,1170,0 +916,917,3755,0.072107773186609,3.1888888888889,78,1209,0 +917,918,3782,0.14407221323011,2.7575,90,1170,0 +918,919,3849,0.078873737285415,2.3936111111111,76,1328,0 +919,920,3801,0.090543853269643,2.1925,94,1258,0 +920,921,3787,0.0,2.16,70,1427,0 +921,922,3835,0.18229662394063,2.2719444444444,129,1480,0 +922,923,4035,0.10064381631633,2.3994444444444,120,1687,0 +923,924,4173,0.0,2.2836111111111,122,1942,0 +924,925,3995,0.0,2.5422222222222004,100,1967,0 +925,926,4016,0.0,2.6908333333333,102,2110,0 +926,927,4049,0.064661049677152,2.7702777777778,118,1956,0 +927,928,4014,0.10610212880951,2.7405555555556,86,1984,0 +928,929,4263,0.098345239553664,2.6908333333333,92,1893,0 +929,930,3941,0.055426072308289,2.7008333333333,44,1821,0 +930,931,4023,0.026036719363444,2.8322222222222,25,1641,0 +931,932,3917,0.058176601538018,3.0922222222222,54,1604,0 +932,933,3910,0.11644035456955,3.4363888888889,48,1265,0 +933,934,3934,0.067489738764642,3.7530555555556,56,1407,0 +934,935,3783,0.091155534540558,3.9127777777778,42,1342,0 +935,936,3834,0.052217414705359004,3.7608333333333,41,1216,0 +936,937,8698,0.028401045145692,3.6472222222222,32,2569,0 +937,938,8969,0.06030991242653401,3.5544444444444,48,2150,0 +938,939,8928,0.057683225704233,3.5036111111111,40,2317,0 +939,940,9020,0.049602244305935,3.2538888888889,26,2047,0 +940,941,8865,0.054771618715138,3.1886111111111,55,2065,0 +941,942,8830,0.014455899164978,2.7341666666667,52,1909,0 +942,943,8879,0.05563571922395901,2.3655555555556003,34,1910,0 +943,944,9120,0.077488949885965,2.1688888888889,61,2037,0 +944,945,9111,0.06776025909838901,2.0977777777778,34,3065,0 
+945,946,9071,0.033919453583666,2.3077777777778,50,2452,0 +946,947,9205,0.030948232299768,2.3611111111111,47,3226,0 +947,948,9355,0.0,2.4986111111111,56,3271,0 +948,949,9372,0.0,2.5691666666667,76,3471,0 +949,950,9392,0.0,2.7463888888889,60,3922,0 +950,951,9416,0.0,2.8063888888888995,100,3296,0 +951,952,9394,0.0,2.8091666666667003,80,3171,0 +952,953,9810,0.10150033578287,2.715,74,3208,0 +953,954,9594,0.13650296233629,2.6869444444444,24,3602,0 +954,955,9006,0.048341331534980006,2.8180555555556,41,3208,0 +955,956,9140,0.055919636698743,3.0541666666667004,19,3455,0 +956,957,8925,0.052826773889684014,3.4711111111111004,24,2833,0 +957,958,9047,0.07932984590431501,3.7566666666667,18,3453,0 +958,959,9030,0.033310879512461,3.8633333333333,28,3155,0 +959,960,9088,0.048306771033288,3.7519444444444,5,2145,0 +960,961,8569,0.034002578802562,3.6480555555556,12,1999,0 +961,962,8616,0.047801640470854015,3.5061111111111005,35,2135,0 +962,963,8497,0.13378075099383,3.47,41,1813,0 +963,964,8439,0.063853685461221,3.3086111111111003,30,2020,0 +964,965,8567,0.0,3.1194444444444,22,2127,0 +965,966,8694,0.073869151016554,2.8044444444444,56,1764,0 +966,967,8739,0.043582908466928014,2.4205555555556004,34,2249,0 +967,968,8761,0.0,2.1180555555556,73,3119,0 +968,969,8838,0.062006969698131,2.1266666666667,86,2031,0 +969,970,8908,0.14006961492891,2.1708333333333,68,2246,0 +970,971,9053,0.11198565566104,2.3247222222222,36,3214,0 +971,972,9346,0.0,2.4208333333333,66,4207,0 +972,973,8989,0.058427455554992985,2.5563888888889,74,4195,0 +973,974,8807,0.070887934206661,2.7086111111111,78,3179,0 +974,975,9020,0.031869233863638,2.8027777777778,66,2739,0 +975,976,9034,0.0,2.7711111111111,118,2394,0 +976,977,9558,0.055680379884383,2.74,81,3750,0 +977,978,9042,0.030919398857213,2.6869444444444,85,3000,0 +978,979,8804,0.040222150865381015,2.8113888888889,69,2646,0 +979,980,8885,0.08462727078727299,3.1258333333333,49,2375,0 +980,981,8721,0.15790637433488,3.4711111111111004,56,2442,0 +981,982,8676,0.099165571846447,3.7419444444444,64,2069,0 +982,983,9029,0.051043016646698,3.7258333333333,48,1899,0 +983,984,8670,0.023695834967821,3.5369444444444,65,2277,0 +984,985,8537,0.13363180896924,3.4911111111111004,53,1926,0 +985,986,8418,0.14375985835531,3.3769444444444,70,1949,0 +986,987,8481,0.13890523887057998,3.3327777777778,51,2222,0 +987,988,8535,0.096357518724471,3.1925,30,1797,0 +988,989,8535,0.098277544249084,3.135,97,1860,0 +989,990,8442,0.11251833989481,2.8338888888889,41,2870,0 +990,991,8448,0.074768662666532,2.4997222222222004,32,1899,0 +991,992,8527,0.038008655416852,2.2297222222222004,47,2336,0 +992,993,8541,0.016354174968753,2.1158333333333,34,2703,0 +993,994,8635,0.11898350916153,2.1966666666667,54,2773,0 +994,995,8867,0.0,2.2591666666667,69,2577,0 +995,996,9033,0.0,2.3002777777778,109,2816,0 +996,997,8875,0.0,2.3797222222222003,76,3133,0 +997,998,8708,0.0,2.625,47,3366,0 +998,999,8455,0.020636446066963,2.6661111111111,44,3062,0 +999,1000,8713,0.043044731483849,2.6694444444444,92,3003,0 +1000,1001,8934,0.12513578187909,2.6541666666667,67,3044,0 +1001,1002,8745,0.099581351017555,2.6483333333333,26,3230,0 +1002,1003,8674,0.085903047711976,2.7444444444444,42,2793,0 +1003,1004,8606,0.066698820830796,3.0788888888889,69,1945,0 +1004,1005,8508,0.034228320502586,3.4833333333333,32,2716,0 +1005,1006,8558,0.028479870560763,3.6063888888889,41,2103,0 +1006,1007,8529,0.16430377699282994,3.8069444444444,52,1795,0 +1007,1008,8520,0.020290722486788003,3.6475,56,2840,0 +1008,1009,6662,0.17253761895951006,3.5219444444444,47,2653,0 
+1009,1010,6491,0.1150267570489,3.3708333333333,65,2819,0 +1010,1011,6498,0.14119445755296,3.3086111111111003,70,1706,0 +1011,1012,6500,0.079900598296651,3.2411111111111004,84,1801,0 +1012,1013,6471,0.11459361685243,3.0525,71,3271,0 +1013,1014,6354,0.11299850955195,2.7419444444444,110,2001,0 +1014,1015,6592,0.078187238738118,2.4305555555556,65,1678,0 +1015,1016,6552,0.15222680511595002,2.1852777777778,68,1703,0 +1016,1017,6492,0.05823703723779,2.0644444444444,74,2441,0 +1017,1018,6577,0.038270957919533,2.1961111111111,43,2304,0 +1018,1019,6777,0.045436612403901,2.2886111111111,55,3124,0 +1019,1020,6844,0.051111263534218,2.3219444444444,53,3605,0 +1020,1021,6769,0.0,2.4436111111111,64,2985,0 +1021,1022,6642,0.0,2.6463888888889,58,2934,0 +1022,1023,6782,0.057248496594127986,2.735,54,3044,0 +1023,1024,6715,0.0,2.7586111111111005,121,3463,0 +1024,1025,6915,0.084808608043399,2.7138888888889,103,3199,0 +1025,1026,6569,0.05823703723779,2.7119444444444,66,2684,0 +1026,1027,6486,0.12640598881102005,2.8027777777778,73,3317,0 +1027,1028,6504,0.08602692657241201,2.9777777777778,71,2159,0 +1028,1029,6445,0.13712331887199,3.2961111111111,37,2043,0 +1029,1030,6427,0.12184008568979,3.4869444444444,46,2003,0 +1030,1031,6365,0.050317612906928,3.673611111111101,40,2260,0 +1031,1032,6277,0.07167380324199299,3.7469444444444,26,3522,0 +1032,1033,5231,0.051289858799957,3.6133333333333,42,1840,0 +1033,1034,5166,0.094021005766084,3.4752777777778,63,1820,0 +1034,1035,5303,0.020566298353792,3.3602777777778,68,1856,0 +1035,1036,5306,0.12275234276969,3.1605555555556,87,1715,0 +1036,1037,5298,0.1054190746845,3.0733333333333,60,1695,0 +1037,1038,5268,0.19050318144252,2.7130555555556,94,2254,0 +1038,1039,5251,0.10472332930133,2.2886111111111,121,1652,0 +1039,1040,5194,0.12644994481537,2.0783333333333,128,1602,0 +1040,1041,5230,0.08859454436104999,1.9188888888889,68,1792,0 +1041,1042,5244,0.0,1.9355555555556003,76,1954,0 +1042,1043,5102,0.09532581107230803,2.0569444444444,77,1808,0 +1043,1044,5244,0.15766772749983,2.1902777777778,158,1629,0 +1044,1045,5249,0.06429178708826701,2.3477777777778,112,2140,0 +1045,1046,5261,0.068395341911942,2.5502777777778,85,2390,0 +1046,1047,5339,0.025992957736547997,2.6597222222222,77,1707,0 +1047,1048,5241,0.0,2.7238888888888995,89,1901,0 +1048,1049,5491,0.021142167244918,2.7375,106,1820,0 +1049,1050,5374,0.072067861729848,2.7483333333333,47,2167,0 +1050,1051,5354,0.1275228688396,2.8525,34,2063,0 +1051,1052,5232,0.043846003986674,3.0038888888889,32,2184,0 +1052,1053,5217,0.10247450096434,3.2761111111111005,22,1981,0 +1053,1054,5258,0.07584150637714701,3.5761111111111004,16,1813,0 +1054,1055,5251,0.020496657705832,3.8172222222222,32,2033,0 +1055,1056,5223,0.13399493992192998,3.6691666666667,16,1629,0 +1056,1057,3952,0.091121163023619,3.5558333333333,20,1485,0 +1057,1058,3949,0.11809705541338,3.4266666666667,56,1527,0 +1058,1059,4021,0.033014047837867995,3.435,74,2561,0 +1059,1060,3815,0.16367597832104,3.2111111111111,116,1523,0 +1060,1061,3855,0.12469537397569,3.1297222222222,72,1446,0 +1061,1062,3892,0.095002031789468,2.7538888888889,66,1499,0 +1062,1063,3948,0.1028064299952,2.3116666666667003,56,1368,0 +1063,1064,3860,0.028861851985229007,2.0988888888889,61,1426,0 +1064,1065,3830,0.05806984314166,2.0983333333333,2151,3528,0 +1065,1066,3821,0.050886592113012,2.1986111111111,459,2279,0 +1066,1067,3886,0.05081829754409599,2.3677777777778,84,1421,0 +1067,1068,3954,0.0,2.5036111111111,55,2008,0 +1068,1069,3839,0.08354288831032201,2.5786111111111,61,1429,0 
+1069,1070,3921,0.0,2.8172222222222,19,1497,0 +1070,1071,3874,0.08142390858425297,2.8727777777778,30,1604,0 +1071,1072,3996,0.047911560407608,2.8294444444444,73,1595,0 +1072,1073,4246,0.12201534565884,2.7136111111111005,63,2217,0 +1073,1074,3803,0.088739417881303,2.7058333333333,35,1580,0 +1074,1075,3594,0.08276214539547999,2.8161111111111,57,1466,0 +1075,1076,3778,0.066779641097052,3.1541666666667,50,1717,0 +1076,1077,3745,0.11367082443275,3.5791666666667004,48,1564,0 +1077,1078,3747,0.021597223158314,3.8158333333333,40,1752,0 +1078,1079,3726,0.16874893592242002,3.9405555555556,36,1598,0 +1079,1080,3729,0.041971530556774,3.7294444444444,59,1842,0 +1080,1081,8513,0.042983941794881,3.6183333333333,14,3066,0 +1081,1082,8738,0.14500733624043,3.4911111111111004,16,2272,0 +1082,1083,8709,0.046727090031129015,3.4566666666667003,36,4344,0 +1083,1084,8601,0.032553617944112004,3.37,65,3242,0 +1084,1085,8719,0.040039251102491,3.1658333333333,80,2291,0 +1085,1086,8820,0.055153759101126985,2.7261111111111003,91,2240,0 +1086,1087,8674,0.05751181017711901,2.3533333333333,102,2012,0 +1087,1088,8859,0.041202889821452,2.1158333333333,85,2305,0 +1088,1089,8905,0.07854024449462599,2.0852777777778,69,2295,0 +1089,1090,8920,0.11628975245152,2.1422222222222,79,2370,0 +1090,1091,9062,0.087543035971238,2.3172222222222003,66,3066,0 +1091,1092,9139,0.0,2.3983333333333,47,3132,0 +1092,1093,8866,0.031151045483539,2.55,51,3006,0 +1093,1094,8997,0.0,2.7413888888888995,20,3101,0 +1094,1095,9122,0.029949950026121008,2.7636111111111004,62,3739,0 +1095,1096,9191,0.067297142748812,2.7002777777778,54,3933,0 +1096,1097,9795,0.08450527625030299,2.7247222222222,99,4537,0 +1097,1098,9255,0.049852109269358014,2.5866666666667,64,3856,0 +1098,1099,8924,0.094084438832673,2.8597222222222,66,2862,0 +1099,1100,9012,0.044896125591910994,3.1269444444444,49,2449,0 +1100,1101,9023,0.07328004196455701,3.5019444444444,73,2222,0 +1101,1102,8875,0.13104465124262998,3.778611111111101,47,2159,0 +1102,1103,8800,0.10394116672902,3.8727777777778,48,2486,0 +1103,1104,8785,0.033616505813902,3.704166666666701,35,3148,0 +1104,1105,8474,0.02672150953308,3.5533333333333,27,3207,0 +1105,1106,8412,0.082058799915824,3.4461111111111005,19,2057,0 +1106,1107,8491,0.05732182787355501,3.4341666666667003,37,2029,0 +1107,1108,8391,0.067005870534182,3.3141666666667,45,3127,0 +1108,1109,8216,0.13429243256821,3.0438888888889,45,2597,0 +1109,1110,8292,0.015094533525413,2.6791666666667004,32,2350,0 +1110,1111,8406,0.063949370932991,2.3202777777778,99,2364,0 +1111,1112,8509,0.094378811742462,2.0691666666667,71,2095,0 +1112,1113,8486,0.02139340711812,2.0091666666667,93,2978,0 +1113,1114,8616,0.0,2.1886111111111,78,2743,0 +1114,1115,8642,0.0,2.3088888888889,71,2668,0 +1115,1116,8823,0.0,2.3794444444444,91,3054,0 +1116,1117,8774,0.0,2.5994444444444,31,3733,0 +1117,1118,8810,0.0,2.7119444444444,35,4312,0 +1118,1119,8611,0.0,2.76,25,4112,0 +1119,1120,8798,0.10029435223064,2.6975,45,3541,0 +1120,1121,9179,0.0,2.5466666666667,33,3901,0 +1121,1122,9057,0.10365337249761998,2.6036111111111,34,4371,0 +1122,1123,8633,0.12418226954696003,2.7927777777778,40,4099,0 +1123,1124,8517,0.0,2.9788888888889,17,3039,0 +1124,1125,8427,0.051166116772473,3.4080555555556,17,3197,0 +1125,1126,8615,0.040222150865381015,3.6813888888889,16,2346,0 +1126,1127,8690,0.17057206553854998,3.7983333333333,26,2285,0 +1127,1128,8438,0.12861588337799,3.6338888888889,19,2313,0 +1128,1129,10388,0.0,3.5111111111111004,30,3216,0 +1129,1130,10588,0.0,3.3613888888889,94,3860,0 
+1130,1131,10533,0.14569364884757002,3.3072222222222,73,4781,0 +1131,1132,10397,0.18198813530019,3.2447222222222,59,2957,0 +1132,1133,10347,0.038073868368755,3.1152777777778,53,2171,0 +1133,1134,10405,0.11491272575332,2.6994444444444,56,2856,0 +1134,1135,10411,0.064841538076484,2.3497222222222005,70,2714,0 +1135,1136,10503,0.048708312546253,2.0619444444444,60,2602,0 +1136,1137,10598,0.11629780056153,2.0625,83,2331,0 +1137,1138,10692,0.07659916149791901,2.1905555555556004,265,3586,0 +1138,1139,10874,0.0,2.2588888888889,944,3363,0 +1139,1140,11043,0.043763623117499,2.3983333333333,36,3879,0 +1140,1141,11009,0.0,2.5536111111111,42,3556,0 +1141,1142,10818,0.041436571087464,2.7408333333333,23,4381,0 +1142,1143,10985,0.0,2.7375,75,4777,0 +1143,1144,10861,0.08191467409622599,2.7780555555556,68,4879,0 +1144,1145,12282,0.11084389924027,2.6225,23,3553,0 +1145,1146,11225,0.12510294083344,2.6386111111111,35,3177,0 +1146,1147,10775,0.10213470511717,2.7908333333333,38,2727,0 +1147,1148,10688,0.06332743445339299,3.0922222222222,69,2758,0 +1148,1149,10601,0.033666593475508995,3.4291666666667004,57,4124,0 +1149,1150,10634,0.057459020289436,3.6752777777778,58,3076,0 +1150,1151,10646,0.023008391787587,3.736111111111101,43,2291,0 +1151,1152,10562,0.037622360322278,3.5905555555556,65,2482,0 +1152,1153,10608,0.026766196308354,3.3872222222222,60,2537,0 +1153,1154,10618,0.13691041072327,3.3186111111111005,55,2434,0 +1154,1155,10636,0.024581173073578,3.2775,49,2608,0 +1155,1156,10583,0.050723618686514,3.1625,54,2614,0 +1156,1157,10613,0.038807415292018,3.1391666666667004,66,2904,0 +1157,1158,10603,0.10731539561588,2.7616666666667005,59,2204,0 +1158,1159,10601,0.13649131550296,2.4675,107,2326,0 +1159,1160,10757,0.11190990870167998,2.2166666666667,104,3002,0 +1160,1161,10815,0.17879123074031,2.1205555555556,100,3472,0 +1161,1162,10790,0.08728058888363299,2.2044444444444,133,3496,0 +1162,1163,11082,0.0,2.3147222222222004,65,3168,0 +1163,1164,11121,0.07099894663641,2.2416666666667004,152,4268,0 +1164,1165,10913,0.098617038600063,2.405,83,4350,0 +1165,1166,11004,0.0,2.5705555555556003,158,3555,0 +1166,1167,11135,0.10519721128315,2.7088888888889,145,4986,0 +1167,1168,10960,0.10928571467639,2.6913888888889,77,4576,0 +1168,1169,11686,0.14969099592127,2.6427777777778,13,4451,0 +1169,1170,11244,0.060122448878635,2.705,67,3627,0 +1170,1171,10931,0.068254139999346,2.8738888888889,25,3485,0 +1171,1172,10811,0.056987671819742985,3.0819444444444,27,3046,0 +1172,1173,10679,0.094667935014769,3.4491666666667005,23,2657,0 +1173,1174,10648,0.13287358772218,3.6275,28,2423,0 +1174,1175,10757,0.032507012295146,3.8027777777778,25,2374,0 +1175,1176,10706,0.14779741522058998,3.6436111111111,28,2493,0 +1176,1177,9077,0.10864900088005,3.4861111111111005,30,2495,0 +1177,1178,8836,0.12602969813907,3.3266666666667004,31,2189,0 +1178,1179,8971,0.07253718299881,3.1866666666667003,31,2214,0 +1179,1180,8972,0.31381296416887,3.2213888888888995,44,2374,0 +1180,1181,8903,0.2312064012582,3.0102777777778,27,3230,0 +1181,1182,8967,0.17687421373190998,2.6658333333333,36,2132,0 +1182,1183,8962,0.022073721703464003,2.3902777777778,61,3042,0 +1183,1184,9044,0.11600086139073,2.1380555555556,64,2053,0 +1184,1185,8931,0.10418807549523,2.0161111111111,118,2349,0 +1185,1186,9028,0.040222150865381015,2.0641666666667,98,3381,0 +1186,1187,9240,0.06812462580532,2.1844444444444,76,3436,0 +1187,1188,9227,0.055328485037955,2.2822222222222,57,3280,0 +1188,1189,9227,0.027788383289499,2.4002777777777995,74,4357,0 +1189,1190,9125,0.0,2.5433333333333,72,4522,0 
+1190,1191,9075,0.0,2.7469444444444,78,4094,0 +1191,1192,9117,0.035137191893634005,2.6872222222222,69,3296,0 +1192,1193,9562,0.035137191893634005,2.6980555555556,125,4129,0 +1193,1194,9305,0.11258759940039,2.7380555555556,157,3036,0 +1194,1195,8965,0.16105265701128,2.7858333333333,61,2628,0 +1195,1196,8862,0.15210502999287,3.0502777777778,12,2296,0 +1196,1197,8858,0.07673479360192201,3.2991666666667,16,2221,0 +1197,1198,8820,0.17013715283392,3.5533333333333,36,1991,0 +1198,1199,8876,0.1609412187274,3.6652777777778,27,2778,0 +1199,1200,8797,0.12008642730107,3.6116666666667,22,2511,0 +1200,1201,9074,0.045995324803682,3.5463888888889,22,2103,0 +1201,1202,9318,0.23802438276872,3.4013888888889,35,2111,0 +1202,1203,9286,0.18078076076243,3.245,67,2055,0 +1203,1204,9320,0.12741851179236,3.1644444444444,46,1930,0 +1204,1205,9280,0.08024661572906401,2.9361111111111,72,2456,0 +1205,1206,9333,0.32656213417732,2.6952777777778,96,2952,0 +1206,1207,9334,0.28639695711596,2.3702777777778,117,2147,0 +1207,1208,9337,0.083900984173012,2.0947222222222,113,2051,0 +1208,1209,9405,0.12853338721539,1.9538888888889,140,2281,0 +1209,1210,9263,0.032414228925828,1.9925,107,2102,0 +1210,1211,9326,0.08237281480963901,2.0363888888889,102,2062,0 +1211,1212,9421,0.0,2.1919444444444,85,2796,0 +1212,1213,9275,0.0,2.3211111111111,49,2005,0 +1213,1214,9323,0.0,2.4955555555556,69,2075,0 +1214,1215,9347,0.45868581620054,2.6980555555556,68,2058,1 +1215,1216,9333,0.1959092708736,2.7219444444444,104,2733,0 +1216,1217,9846,0.7871265862012701,2.725,111,2170,1 +1217,1218,9497,0.18267963393082,2.7816666666667,88,2282,0 +1218,1219,9383,0.26777755992147,2.7811111111111004,64,2178,0 +1219,1220,9300,0.30404676514833,2.955,29,2283,0 +1220,1221,9389,0.28226806095289003,3.3158333333333,32,2097,0 +1221,1222,9364,0.32093016819692,3.5669444444444003,29,2738,0 +1222,1223,9227,0.24793583772273,3.7419444444444,21,2678,0 +1223,1224,9309,0.27376916868294,3.6236111111111,33,2404,0 +1224,1225,6204,0.32069151905173,3.4416666666667,37,1497,0 +1225,1226,6048,0.16728853165162,3.4172222222222,57,1496,0 +1226,1227,5949,0.17244047836378998,3.3016666666667,72,1935,0 +1227,1228,5981,0.21356200193615,3.1963888888889,86,1521,0 +1228,1229,5897,0.08833993625230199,3.0641666666667,70,2879,0 +1229,1230,6038,0.20141526375625,2.735,63,1561,0 +1230,1231,6094,0.12271171189386,2.3288888888889,49,1381,0 +1231,1232,6022,0.15111333507662,2.0938888888889,81,1826,0 +1232,1233,6122,0.3688420983862,2.1338888888889,58,1896,0 +1233,1234,6034,0.15672074166098002,2.2247222222222005,70,2083,0 +1234,1235,6079,0.099476236793782,2.3308333333333,67,1792,0 +1235,1236,5998,0.18394691317126,2.3902777777778,70,3258,0 +1236,1237,6004,0.076264605227629,2.5819444444444,95,2265,0 +1237,1238,5908,0.058100747891124,2.6661111111111,100,2775,0 +1238,1239,6022,0.18015967729618,2.8258333333333,116,1545,0 +1239,1240,5981,0.059431847203259,2.7502777777778,123,1818,0 +1240,1241,6399,0.14870829462531002,2.6730555555556004,71,1481,0 +1241,1242,6119,0.09565694822541,2.7536111111111,65,1677,0 +1242,1243,6114,0.16022629962173002,2.9677777777778,73,1858,0 +1243,1244,5915,0.4140256163498,3.37,53,1643,0 +1244,1245,6192,0.32447726333369004,3.5958333333333,79,1582,0 +1245,1246,6021,0.15394421357627,3.8144444444444,77,1611,0 +1246,1247,6060,0.060070368432038,3.8283333333333,59,1803,0 +1247,1248,7510,0.14236976564388,3.7030555555556,66,2121,0 +1248,1249,7560,0.12741851179236,3.5802777777778,54,2375,0 +1249,1250,7525,0.093634078744746,3.4197222222222,54,1866,0 
+1250,1251,7483,0.13709947889982,3.4438888888889,89,2398,0 +1251,1252,7452,0.06298116794216299,3.3425,85,2577,0 +1252,1253,7512,0.13125017838571,3.1608333333333,96,1801,0 +1253,1254,7572,0.21161148728916,2.7413888888888995,149,1840,0 +1254,1255,7629,0.06783428261124,2.3808333333333,139,1985,0 +1255,1256,7529,0.20877561051189,2.12,90,2041,0 +1256,1257,7623,0.10394294206935002,2.1533333333333,68,2075,0 +1257,1258,7637,0.0,2.2569444444444,445,2564,0 +1258,1259,7921,0.076424293095548,2.3183333333333,100,2734,0 +1259,1260,7790,0.08809461878011901,2.3583333333333,138,3143,0 diff --git a/datasets/anomaly_reserve/yahoo_sub_5/yahoo_sub_5_problem/problemDoc.json b/datasets/anomaly_reserve/yahoo_sub_5/yahoo_sub_5_problem/problemDoc.json new file mode 100644 index 0000000..417cb6b --- /dev/null +++ b/datasets/anomaly_reserve/yahoo_sub_5/yahoo_sub_5_problem/problemDoc.json @@ -0,0 +1,65 @@ +{ + "about": { + "problemID": "yahoo_sub_5_problem", + "problemName": "yahoo_sub_5_problem", + "problemDescription": "Anomaly detection", + "problemVersion": "4.0.0", + "problemSchemaVersion": "4.0.0", + "taskKeywords": [ + "classification", + "binary", + "tabular" + ] + }, + "inputs": { + "data": [ + { + "datasetID": "yahoo_sub_5_dataset", + "targets": [ + { + "targetIndex": 0, + "resID": "learningData", + "colIndex": 7, + "colName": "ground_truth" + } + ] + } + ], + "dataSplits": { + "method": "holdOut", + "testSize": 0.2, + "stratified": true, + "numRepeats": 0, + "randomSeed": 42, + "splitsFile": "dataSplits.csv", + "datasetViewMaps": { + "train": [ + { + "from": "yahoo_sub_5_dataset", + "to": "yahoo_sub_5_dataset_TRAIN" + } + ], + "test": [ + { + "from": "yahoo_sub_5_dataset", + "to": "yahoo_sub_5_dataset_TEST" + } + ], + "score": [ + { + "from": "yahoo_sub_5_dataset", + "to": "yahoo_sub_5_dataset_SCORE" + } + ] + } + }, + "performanceMetrics": [ + { + "metric": "f1Macro" + } + ] + }, + "expectedOutputs": { + "predictionsFile": "predictions.csv" + } +} \ No newline at end of file diff --git a/datasets/data-supply/README.md b/datasets/data-supply/README.md new file mode 100644 index 0000000..74d7cd4 --- /dev/null +++ b/datasets/data-supply/README.md @@ -0,0 +1,4 @@ +This repository contains D3M datasets schemas and related documentation from the DARPA [Data-Driven Discovery of Models (D3M)](https://www.darpa.mil/program/data-driven-discovery-of-models) program. These were developed researchers at MIT Lincoln Lab in collaboration with D3M performers. + +__________________ +This work was sponsored by the Defense Advanced Research Projects Agency (DARPA) under Air Force Contract FA8721-05-C-0002. Opinions, interpretations, conclusions, and recommendations are those of the authors and are not necessarily endorsed by the United States Government. diff --git a/datasets/data-supply/documentation/README.md b/datasets/data-supply/documentation/README.md new file mode 100644 index 0000000..92292b4 --- /dev/null +++ b/datasets/data-supply/documentation/README.md @@ -0,0 +1,6 @@ +The documentation for the data supply infrastructure, including the dataset schema and problem schema, is provided here. 
+ +# [Overview](overview.md) +# [Dataset Schema](datasetSchema.md) +# [Problem Schema](problemSchema.md) +# [Minimal Metadata](minimalMetadata.md) \ No newline at end of file diff --git a/datasets/data-supply/documentation/code/consolidated-new-metrics.ipynb b/datasets/data-supply/documentation/code/consolidated-new-metrics.ipynb new file mode 100644 index 0000000..818f7f2 --- /dev/null +++ b/datasets/data-supply/documentation/code/consolidated-new-metrics.ipynb @@ -0,0 +1,906 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Hamming loss" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from io import StringIO\n", + "from sklearn.metrics import hamming_loss" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "def hammingLoss(y_true, y_pred):\n", + " \"\"\"\n", + " Computes the hamming loss. Used for multiclass and multilabel classification.\n", + " \"\"\"\n", + " from sklearn import preprocessing\n", + " from sklearn.metrics import hamming_loss\n", + " import numpy as np\n", + " from itertools import chain\n", + "\n", + " def to_2d_array(df):\n", + " MULTI_CLASS_CONDITION=True\n", + " _dict = {}\n", + " for [index,value] in df.as_matrix():\n", + " if index in _dict.keys():\n", + " _dict[index].append(value)\n", + " MULTI_CLASS_CONDITION = False\n", + " else:\n", + " _dict[index]=[value]\n", + " return list(_dict.keys()), list(_dict.values()), MULTI_CLASS_CONDITION\n", + " \n", + " (y_true_keys, y_true_mat, b_multiclass) = to_2d_array(y_true)\n", + " (y_pred_keys, y_pred_mat, b_multiclass) = to_2d_array(y_pred)\n", + " \n", + " assert y_true_keys==y_pred_keys\n", + " \n", + " if b_multiclass:\n", + "# print('this is a multiclass case')\n", + " y_true_label_encoded = np.array(y_true_mat).ravel()\n", + " y_pred_label_encoded = np.array(y_pred_mat).ravel()\n", + " else: # MULTI_LABEL_CONDITION\n", + "# print('this is a multilabel case')\n", + " y_true_classes=(set(list(chain.from_iterable(y_true_mat))))\n", + " y_pred_classes=(set(list(chain.from_iterable(y_pred_mat))))\n", + " all_classes = list(y_true_classes.union(y_pred_classes))\n", + " lb = preprocessing.MultiLabelBinarizer(classes=all_classes)\n", + " y_true_label_encoded = lb.fit_transform(y_true_mat)\n", + " y_pred_label_encoded = lb.transform(y_pred_mat)\n", + " return hamming_loss(y_true_label_encoded, y_pred_label_encoded)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/svattam/miniconda3/envs/automl/lib/python3.6/site-packages/ipykernel_launcher.py:13: FutureWarning: Method .as_matrix will be removed in a future version. 
Use .values instead.\n", + " del sys.path[0]\n" + ] + }, + { + "data": { + "text/plain": [ + "0.26666666666666666" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Testcase 1: MultiLabel, typical\n", + "y_true = pd.read_csv(StringIO(\"\"\"\n", + "d3mIndex,class_label\n", + "3,happy-pleased\n", + "3,relaxing-calm\n", + "7,amazed-suprised\n", + "7,happy-pleased\n", + "13,quiet-still\n", + "13,sad-lonely\n", + "\"\"\"))\n", + "\n", + "y_pred = pd.read_csv(StringIO(\"\"\"\n", + "d3mIndex,class_label\n", + "3,happy-pleased\n", + "3,sad-lonely\n", + "7,amazed-suprised\n", + "7,happy-pleased\n", + "13,quiet-still\n", + "13,happy-pleased\n", + "\"\"\"))\n", + "\n", + "hammingLoss(y_true, y_pred)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/svattam/miniconda3/envs/automl/lib/python3.6/site-packages/ipykernel_launcher.py:13: FutureWarning: Method .as_matrix will be removed in a future version. Use .values instead.\n", + " del sys.path[0]\n" + ] + }, + { + "data": { + "text/plain": [ + "0.0" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Testcase 2: MultiLabel, Zero loss\n", + "y_true = pd.read_csv(StringIO(\"\"\"\n", + "d3mIndex,class_label\n", + "3,happy-pleased\n", + "3,relaxing-calm\n", + "7,amazed-suprised\n", + "7,happy-pleased\n", + "13,quiet-still\n", + "13,sad-lonely\n", + "\"\"\"))\n", + "\n", + "y_pred = pd.read_csv(StringIO(\"\"\"\n", + "d3mIndex,class_label\n", + "3,happy-pleased\n", + "3,relaxing-calm\n", + "7,amazed-suprised\n", + "7,happy-pleased\n", + "13,quiet-still\n", + "13,sad-lonely\n", + "\"\"\"))\n", + "\n", + "hammingLoss(y_true, y_pred)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/svattam/miniconda3/envs/automl/lib/python3.6/site-packages/ipykernel_launcher.py:13: FutureWarning: Method .as_matrix will be removed in a future version. Use .values instead.\n", + " del sys.path[0]\n" + ] + }, + { + "data": { + "text/plain": [ + "1.0" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Testcase 3: MultiLabel, Complete loss\n", + "y_true = pd.read_csv(StringIO(\"\"\"\n", + "d3mIndex,class_label\n", + "3,happy-pleased\n", + "3,relaxing-calm\n", + "7,amazed-suprised\n", + "7,happy-pleased\n", + "13,quiet-still\n", + "13,sad-lonely\n", + "\"\"\"))\n", + "\n", + "y_pred = pd.read_csv(StringIO(\"\"\"\n", + "d3mIndex,class_label\n", + "3,ecstatic\n", + "3,sad-lonely\n", + "3,quiet-still\n", + "3,amazed-suprised\n", + "7,ecstatic\n", + "7,sad-lonely\n", + "7,relaxing-calm\n", + "7,quiet-still\n", + "13,ecstatic\n", + "13,happy-pleased\n", + "13,relaxing-calm\n", + "13,amazed-suprised\n", + "\"\"\"))\n", + "\n", + "hammingLoss(y_true, y_pred)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/svattam/miniconda3/envs/automl/lib/python3.6/site-packages/ipykernel_launcher.py:13: FutureWarning: Method .as_matrix will be removed in a future version. 
Use .values instead.\n", + " del sys.path[0]\n" + ] + }, + { + "data": { + "text/plain": [ + "0.2" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Testcase 4: Multiclass, Typical\n", + "y_true = pd.read_csv(StringIO(\"\"\"\n", + "d3mIndex,species\n", + "1,versicolor\n", + "2,versicolor\n", + "16,virginica\n", + "17,setosa\n", + "22,versicolor\n", + "26,versicolor\n", + "30,versicolor\n", + "31,virginica\n", + "33,versicolor\n", + "37,virginica\n", + "\"\"\"))\n", + "\n", + "y_pred = pd.read_csv(StringIO(\"\"\"\n", + "d3mIndex,species\n", + "1,setosa\n", + "2,versicolor\n", + "16,virginica\n", + "17,setosa\n", + "22,versicolor\n", + "26,virginica\n", + "30,versicolor\n", + "31,virginica\n", + "33,versicolor\n", + "37,virginica\n", + "\"\"\"))\n", + "\n", + "hammingLoss(y_true, y_pred)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/svattam/miniconda3/envs/automl/lib/python3.6/site-packages/ipykernel_launcher.py:13: FutureWarning: Method .as_matrix will be removed in a future version. Use .values instead.\n", + " del sys.path[0]\n" + ] + }, + { + "data": { + "text/plain": [ + "0.0" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Testcase 5: Multiclass, Zero loss\n", + "y_true = pd.read_csv(StringIO(\"\"\"\n", + "d3mIndex,species\n", + "1,versicolor\n", + "2,versicolor\n", + "16,virginica\n", + "17,setosa\n", + "22,versicolor\n", + "26,versicolor\n", + "30,versicolor\n", + "31,virginica\n", + "33,versicolor\n", + "37,virginica\n", + "\"\"\"))\n", + "\n", + "y_pred = pd.read_csv(StringIO(\"\"\"\n", + "d3mIndex,species\n", + "1,versicolor\n", + "2,versicolor\n", + "16,virginica\n", + "17,setosa\n", + "22,versicolor\n", + "26,versicolor\n", + "30,versicolor\n", + "31,virginica\n", + "33,versicolor\n", + "37,virginica\n", + "\"\"\"))\n", + "\n", + "hammingLoss(y_true, y_pred)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/svattam/miniconda3/envs/automl/lib/python3.6/site-packages/ipykernel_launcher.py:13: FutureWarning: Method .as_matrix will be removed in a future version. 
Use .values instead.\n", + " del sys.path[0]\n" + ] + }, + { + "data": { + "text/plain": [ + "1.0" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Testcase 6: Multiclass, Complete loss\n", + "y_true = pd.read_csv(StringIO(\"\"\"\n", + "d3mIndex,species\n", + "1,versicolor\n", + "2,versicolor\n", + "16,versicolor\n", + "17,virginica\n", + "22,versicolor\n", + "26,versicolor\n", + "30,versicolor\n", + "31,virginica\n", + "33,versicolor\n", + "37,virginica\n", + "\"\"\"))\n", + "\n", + "y_pred = pd.read_csv(StringIO(\"\"\"\n", + "d3mIndex,species\n", + "1,setosa\n", + "2,setosa\n", + "16,setosa\n", + "17,setosa\n", + "22,setosa\n", + "26,setosa\n", + "30,setosa\n", + "31,setosa\n", + "33,setosa\n", + "37,setosa\n", + "\"\"\"))\n", + "\n", + "hammingLoss(y_true, y_pred)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# RMSE" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "def rootMeanSquaredError(y_true, y_pred):\n", + " \"\"\"\n", + " Computes the root mean squared error, for both univariate and multivariate case\n", + " \"\"\"\n", + " import numpy as np\n", + " from sklearn.metrics import mean_squared_error\n", + " from math import sqrt\n", + " \n", + " rmse = None\n", + " \n", + " # perform some checks\n", + " assert 'd3mIndex' in y_true.columns\n", + " assert 'd3mIndex' in y_pred.columns\n", + " assert y_true.shape == y_pred.shape\n", + " \n", + " # preprocessing\n", + " y_true.set_index('d3mIndex', inplace=True)\n", + " y_pred.set_index('d3mIndex', inplace=True)\n", + " \n", + " # determine the dimension\n", + " y_true_dim=y_true.shape[1]\n", + " \n", + " # univariate case\n", + " if y_true_dim == 1: \n", + " y_true_array = y_true.as_matrix().ravel()\n", + " y_pred_array = y_pred.as_matrix().ravel()\n", + " mse = mean_squared_error(y_true, y_pred)\n", + " rmse = sqrt(mse)\n", + " \n", + " # multivariate case\n", + " elif y_true_dim > 1:\n", + " y_true_array = y_true.as_matrix()\n", + " y_pred_array = y_pred.as_matrix()\n", + " mse = mean_squared_error(y_true_array, y_pred_array, multioutput='uniform_average')\n", + " rmse = sqrt(mse)\n", + " \n", + " return rmse" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/svattam/miniconda3/envs/automl/lib/python3.6/site-packages/ipykernel_launcher.py:25: FutureWarning: Method .as_matrix will be removed in a future version. Use .values instead.\n", + "/home/svattam/miniconda3/envs/automl/lib/python3.6/site-packages/ipykernel_launcher.py:26: FutureWarning: Method .as_matrix will be removed in a future version. 
Use .values instead.\n" + ] + }, + { + "data": { + "text/plain": [ + "0.8381527307120105" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# test case 1\n", + "# y_true_uni=[3, -1., 2, 7]\n", + "# y_pred_uni=[2.1, 0.0, 2, 8]\n", + "# expected rmse = 0.8381527307120105\n", + "\n", + "y_true = pd.read_csv(StringIO(\"\"\"\n", + "d3mIndex,value\n", + "1,3\n", + "2,-1.0\n", + "16,2\n", + "17,7\n", + "\"\"\"))\n", + "y_pred = pd.read_csv(StringIO(\"\"\"\n", + "d3mIndex,value\n", + "1,2.1\n", + "2,0.0\n", + "16,2\n", + "17,8\n", + "\"\"\"))\n", + "rootMeanSquaredError(y_true, y_pred)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/svattam/miniconda3/envs/automl/lib/python3.6/site-packages/ipykernel_launcher.py:32: FutureWarning: Method .as_matrix will be removed in a future version. Use .values instead.\n", + "/home/svattam/miniconda3/envs/automl/lib/python3.6/site-packages/ipykernel_launcher.py:33: FutureWarning: Method .as_matrix will be removed in a future version. Use .values instead.\n" + ] + }, + { + "data": { + "text/plain": [ + "0.8416254115301732" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# test case 2\n", + "# y_true_multi=[[0.5, 1],[-1, 1],[7, -6]]\n", + "# y_pred_multi=[[0, 2],[-1, 2],[8, -5]]\n", + "# expected rmse = 0.8416254115301732\n", + "\n", + "y_true = pd.read_csv(StringIO(\"\"\"\n", + "d3mIndex,value1, value2\n", + "1,0.5,1\n", + "2,-1,1\n", + "16,7,-6\n", + "\"\"\"))\n", + "y_pred = pd.read_csv(StringIO(\"\"\"\n", + "d3mIndex,value1,value2\n", + "1,0,2\n", + "2,-1,2\n", + "16,8,-5\n", + "\"\"\"))\n", + "rootMeanSquaredError(y_true, y_pred)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Object detection average precision" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "TEST CASE 1 --- AP: 0.6666666666666666\n", + "TEST CASE 2 --- AP: 0.125\n", + "TEST CASE 3 --- AP: 0.4444444444444444\n", + "TEST CASE 4 --- AP: 0.4444444444444444\n" + ] + } + ], + "source": [ + "def group_gt_boxes_by_image_name(gt_boxes):\n", + " gt_dict: typing.Dict = {}\n", + "\n", + " for box in gt_boxes:\n", + " image_name = box[0]\n", + " bounding_polygon = box[1:]\n", + " bbox = convert_bouding_polygon_to_box_coords(bounding_polygon)\n", + "\n", + " if image_name not in gt_dict.keys():\n", + " gt_dict[image_name] = []\n", + "\n", + " gt_dict[image_name].append({'bbox': bbox})\n", + "\n", + " return gt_dict\n", + "\n", + "\n", + "def convert_bouding_polygon_to_box_coords(bounding_polygon):\n", + " # box_coords = [x_min, y_min, x_max, y_max]\n", + " box_coords = [bounding_polygon[0], bounding_polygon[1],\n", + " bounding_polygon[4], bounding_polygon[5]]\n", + " return box_coords\n", + "\n", + "\n", + "def voc_ap(rec, prec):\n", + " import numpy\n", + "\n", + " # First append sentinel values at the end.\n", + " mrec = numpy.concatenate(([0.], rec, [1.]))\n", + " mpre = numpy.concatenate(([0.], prec, [0.]))\n", + "\n", + " # Compute the precision envelope.\n", + " for i in range(mpre.size - 1, 0, -1):\n", + " mpre[i - 1] = numpy.maximum(mpre[i - 1], mpre[i])\n", + "\n", + " # To calculate area under PR curve, 
look for points\n", + " # where X axis (recall) changes value.\n", + " i = numpy.where(mrec[1:] != mrec[:-1])[0]\n", + "\n", + " # And sum (\\Delta recall) * prec.\n", + " ap = numpy.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1])\n", + "\n", + " return float(ap)\n", + "\n", + "\n", + "def object_detection_average_precision(y_true, y_pred):\n", + " \"\"\"\n", + " This function takes a list of ground truth bounding polygons (rectangles in this case)\n", + " and a list of detected bounding polygons (also rectangles) for a given class and\n", + " computes the average precision of the detections with respect to the ground truth polygons.\n", + " Parameters:\n", + " -----------\n", + " y_true: list\n", + " List of ground truth polygons. Each polygon is represented as a list of\n", + " vertices, starting in the upper-left corner going counter-clockwise.\n", + " Since in this case, the polygons are rectangles, they will have the\n", + " following format:\n", + " [image_name, x_min, y_min, x_min, y_max, x_max, y_max, x_max, y_min].\n", + " y_pred: list\n", + " List of bounding box polygons with their corresponding confidence scores. Each\n", + " polygon is represented as a list of vertices, starting in the upper-left corner\n", + " going counter-clockwise. Since in this case, the polygons are rectangles, they\n", + " will have the following format:\n", + " [image_name, x_min, y_min, x_min, y_max, x_max, y_max, x_max, y_min, confidence_score].\n", + " Returns:\n", + " --------\n", + " ap: float\n", + " Average precision between detected polygons (rectangles) and the ground truth polylgons (rectangles).\n", + " (it is also the area under the precision-recall curve).\n", + " Example 1:\n", + " >> predictions_list_1 = [['img_00001.png', 110, 110, 110, 210, 210, 210, 210, 110, 0.6],\n", + " ['img_00002.png', 5, 10, 5, 20, 20, 20, 20, 10, 0.9],\n", + " ['img_00002.png', 120, 130, 120, 200, 200, 200, 200, 130, 0.6]]\n", + " >> ground_truth_list_1 = [['img_00001.png', 100, 100, 100, 200, 200, 200, 200, 100],\n", + " ['img_00002.png', 10, 10, 10, 20, 20, 20, 20, 10],\n", + " ['img_00002.png', 70, 80, 70, 150, 140, 150, 140, 80]]\n", + " >> ap_1 = object_detection_average_precision(ground_truth_list_1, predictions_list_1)\n", + " >> print(ap_1)\n", + " 0.667\n", + " Example 2:\n", + " >> predictions_list_2 = [['img_00285.png', 330, 463, 330, 505, 387, 505, 387, 463, 0.0739],\n", + " ['img_00285.png', 420, 433, 420, 498, 451, 498, 451, 433, 0.0910],\n", + " ['img_00285.png', 328, 465, 328, 540, 403, 540, 403, 465, 0.1008],\n", + " ['img_00285.png', 480, 477, 480, 522, 508, 522, 508, 477, 0.1012],\n", + " ['img_00285.png', 357, 460, 357, 537, 417, 537, 417, 460, 0.1058],\n", + " ['img_00285.png', 356, 456, 356, 521, 391, 521, 391, 456, 0.0843],\n", + " ['img_00225.png', 345, 460, 345, 547, 415, 547, 415, 460, 0.0539],\n", + " ['img_00225.png', 381, 362, 381, 513, 455, 513, 455, 362, 0.0542],\n", + " ['img_00225.png', 382, 366, 382, 422, 416, 422, 416, 366, 0.0559],\n", + " ['img_00225.png', 730, 463, 730, 583, 763, 583, 763, 463, 0.0588]]\n", + " >> ground_truth_list_2 = [['img_00285.png', 480, 457, 480, 529, 515, 529, 515, 457],\n", + " ['img_00285.png', 480, 457, 480, 529, 515, 529, 515, 457],\n", + " ['img_00225.png', 522, 540, 522, 660, 576, 660, 576, 540],\n", + " ['img_00225.png', 739, 460, 739, 545, 768, 545, 768, 460]]\n", + " >> ap_2 = object_detection_average_precision(ground_truth_list_2, predictions_list_2)\n", + " >> print(ap_2)\n", + " 0.125\n", + " Example 3:\n", + " >> predictions_list_3 = 
[['img_00001.png', 110, 110, 110, 210, 210, 210, 210, 110, 0.6],\n", + " ['img_00002.png', 120, 130, 120, 200, 200, 200, 200, 130, 0.6],\n", + " ['img_00002.png', 5, 8, 5, 16, 15, 16, 15, 8, 0.9],\n", + " ['img_00002.png', 11, 12, 11, 18, 21, 18, 21, 12, 0.9]]\n", + " >> ground_truth_list_3 = [['img_00001.png', 100, 100, 100, 200, 200, 200, 200, 100],\n", + " ['img_00002.png', 10, 10, 10, 20, 20, 20, 20, 10],\n", + " ['img_00002.png', 70, 80, 70, 150, 140, 150, 140, 80]]\n", + " >> ap_3 = object_detection_average_precision(ground_truth_list_3, predictions_list_3)\n", + " >> print(ap_3)\n", + " 0.444\n", + " Example 4:\n", + " (Same as example 3 except the last two box predictions in img_00002.png are switched)\n", + " >> predictions_list_4 = [['img_00001.png', 110, 110, 110, 210, 210, 210, 210, 110, 0.6],\n", + " ['img_00002.png', 120, 130, 120, 200, 200, 200, 200, 130, 0.6],\n", + " ['img_00002.png', 11, 12, 11, 18, 21, 18, 21, 12, 0.9],\n", + " ['img_00002.png', 5, 8, 5, 16, 15, 16, 15, 8, 0.9]]\n", + " >> ground_truth_list_4 = [['img_00001.png', 100, 100, 100, 200, 200, 200, 200, 100],\n", + " ['img_00002.png', 10, 10, 10, 20, 20, 20, 20, 10],\n", + " ['img_00002.png', 70, 80, 70, 150, 140, 150, 140, 80]]\n", + " >> ap_4 = object_detection_average_precision(ground_truth_list_4, predictions_list_4)\n", + " >> print(ap_4)\n", + " 0.444\n", + " \"\"\"\n", + "\n", + " \"\"\"\n", + " This function is different from others because ``y_true`` and ``y_pred`` are not vectors but arrays.\n", + " \"\"\"\n", + " import numpy\n", + " ovthresh = 0.5\n", + "\n", + " # y_true = typing.cast(Truth, unvectorize(y_true))\n", + " # y_pred = typing.cast(Predictions, unvectorize(y_pred))\n", + "\n", + " # Load ground truth.\n", + " gt_dict = group_gt_boxes_by_image_name(y_true)\n", + "\n", + " # Extract gt objects for this class.\n", + " recs = {}\n", + " npos = 0\n", + "\n", + " imagenames = sorted(gt_dict.keys())\n", + " for imagename in imagenames:\n", + " Rlist = [obj for obj in gt_dict[imagename]]\n", + " bbox = numpy.array([x['bbox'] for x in Rlist])\n", + " det = [False] * len(Rlist)\n", + " npos = npos + len(Rlist)\n", + " recs[imagename] = {'bbox': bbox, 'det': det}\n", + "\n", + " # Load detections.\n", + " det_length = len(y_pred[0])\n", + "\n", + " # Check that all boxes are the same size.\n", + " for det in y_pred:\n", + " assert len(det) == det_length, 'Not all boxes have the same dimensions.'\n", + "\n", + " image_ids = [x[0] for x in y_pred]\n", + " BP = numpy.array([[float(z) for z in x[1:-1]] for x in y_pred])\n", + " BB = numpy.array([convert_bouding_polygon_to_box_coords(x) for x in BP])\n", + "\n", + " confidence = numpy.array([float(x[-1]) for x in y_pred])\n", + " boxes_w_confidences_list = numpy.hstack((BB, -1 * confidence[:, None]))\n", + " boxes_w_confidences = numpy.empty((boxes_w_confidences_list.shape[0],),\n", + " dtype=[('x_min', float), ('y_min', float),\n", + " ('x_max', float), ('y_max', float),\n", + " ('confidence', float)])\n", + " boxes_w_confidences[:] = [tuple(i) for i in boxes_w_confidences_list]\n", + "\n", + " # Sort by confidence.\n", + " #sorted_ind = numpy.argsort(-confidence)\n", + " sorted_ind = numpy.argsort(\n", + " boxes_w_confidences, kind='mergesort',\n", + " order=('confidence', 'x_min', 'y_min'))\n", + " BB = BB[sorted_ind, :]\n", + " image_ids = [image_ids[x] for x in sorted_ind]\n", + "\n", + " # Go down y_pred and mark TPs and FPs.\n", + " nd = len(image_ids)\n", + " tp = numpy.zeros(nd)\n", + " fp = numpy.zeros(nd)\n", + " for d in range(nd):\n", + " 
R = recs[image_ids[d]]\n", + " bb = BB[d, :].astype(float)\n", + " ovmax = -numpy.inf\n", + " BBGT = R['bbox'].astype(float)\n", + "\n", + " if BBGT.size > 0:\n", + " # Compute overlaps.\n", + " # Intersection.\n", + " ixmin = numpy.maximum(BBGT[:, 0], bb[0])\n", + " iymin = numpy.maximum(BBGT[:, 1], bb[1])\n", + " ixmax = numpy.minimum(BBGT[:, 2], bb[2])\n", + " iymax = numpy.minimum(BBGT[:, 3], bb[3])\n", + " iw = numpy.maximum(ixmax - ixmin + 1., 0.)\n", + " ih = numpy.maximum(iymax - iymin + 1., 0.)\n", + " inters = iw * ih\n", + "\n", + " # Union.\n", + " uni = ((bb[2] - bb[0] + 1.) * (bb[3] - bb[1] + 1.) +\n", + " (BBGT[:, 2] - BBGT[:, 0] + 1.) *\n", + " (BBGT[:, 3] - BBGT[:, 1] + 1.) - inters)\n", + "\n", + " overlaps = inters / uni\n", + " ovmax = numpy.max(overlaps)\n", + " jmax = numpy.argmax(overlaps)\n", + "\n", + " if ovmax > ovthresh:\n", + " if not R['det'][jmax]:\n", + " tp[d] = 1.\n", + " R['det'][jmax] = 1\n", + " else:\n", + " fp[d] = 1.\n", + " else:\n", + " fp[d] = 1.\n", + "\n", + " # Compute precision recall.\n", + " fp = numpy.cumsum(fp)\n", + " tp = numpy.cumsum(tp)\n", + " rec = tp / float(npos)\n", + " # Avoid divide by zero in case the first detection matches a difficult ground truth.\n", + " prec = tp / numpy.maximum(tp + fp, numpy.finfo(numpy.float64).eps)\n", + " ap = voc_ap(rec, prec)\n", + "\n", + " return ap\n", + "\n", + "\n", + "if __name__ == \"__main__\":\n", + " predictions_list_1 = [\n", + " ['img_00001.png', 110, 110, 110, 210, 210, 210, 210, 110, 0.6],\n", + " ['img_00002.png', 5, 10, 5, 20, 20, 20, 20, 10, 0.9],\n", + " ['img_00002.png', 120, 130, 120, 200, 200, 200, 200, 130, 0.6]\n", + " ]\n", + " ground_truth_list_1 = [\n", + " ['img_00001.png', 100, 100, 100, 200, 200, 200, 200, 100],\n", + " ['img_00002.png', 10, 10, 10, 20, 20, 20, 20, 10],\n", + " ['img_00002.png', 70, 80, 70, 150, 140, 150, 140, 80]\n", + " ]\n", + " ap_1 = object_detection_average_precision(\n", + " ground_truth_list_1, predictions_list_1)\n", + " print('TEST CASE 1 --- AP: ', ap_1)\n", + "\n", + " predictions_list_2 = [\n", + " ['img_00285.png', 330, 463, 330, 505, 387, 505, 387, 463, 0.0739],\n", + " ['img_00285.png', 420, 433, 420, 498, 451, 498, 451, 433, 0.0910],\n", + " ['img_00285.png', 328, 465, 328, 540, 403, 540, 403, 465, 0.1008],\n", + " ['img_00285.png', 480, 477, 480, 522, 508, 522, 508, 477, 0.1012],\n", + " ['img_00285.png', 357, 460, 357, 537, 417, 537, 417, 460, 0.1058],\n", + " ['img_00285.png', 356, 456, 356, 521, 391, 521, 391, 456, 0.0843],\n", + " ['img_00225.png', 345, 460, 345, 547, 415, 547, 415, 460, 0.0539],\n", + " ['img_00225.png', 381, 362, 381, 513, 455, 513, 455, 362, 0.0542],\n", + " ['img_00225.png', 382, 366, 382, 422, 416, 422, 416, 366, 0.0559],\n", + " ['img_00225.png', 730, 463, 730, 583, 763, 583, 763, 463, 0.0588],\n", + " ]\n", + " ground_truth_list_2 = [\n", + " ['img_00285.png', 480, 457, 480, 529, 515, 529, 515, 457],\n", + " ['img_00285.png', 480, 457, 480, 529, 515, 529, 515, 457],\n", + " ['img_00225.png', 522, 540, 522, 660, 576, 660, 576, 540],\n", + " ['img_00225.png', 739, 460, 739, 545, 768, 545, 768, 460],\n", + " ]\n", + " ap_2 = object_detection_average_precision(\n", + " ground_truth_list_2, predictions_list_2)\n", + " print('TEST CASE 2 --- AP: ', ap_2)\n", + "\n", + " predictions_list_3 = [\n", + " ['img_00001.png', 110, 110, 110, 210, 210, 210, 210, 110, 0.6],\n", + " ['img_00002.png', 120, 130, 120, 200, 200, 200, 200, 130, 0.6],\n", + " ['img_00002.png', 5, 8, 5, 16, 15, 16, 15, 8, 0.9],\n", + " 
['img_00002.png', 11, 12, 11, 18, 21, 18, 21, 12, 0.9]\n", + " ]\n", + " ground_truth_list_3 = [\n", + " ['img_00001.png', 100, 100, 100, 200, 200, 200, 200, 100],\n", + " ['img_00002.png', 10, 10, 10, 20, 20, 20, 20, 10],\n", + " ['img_00002.png', 70, 80, 70, 150, 140, 150, 140, 80]\n", + " ]\n", + " ap_3 = object_detection_average_precision(\n", + " ground_truth_list_3, predictions_list_3)\n", + " print('TEST CASE 3 --- AP: ', ap_3)\n", + "\n", + " predictions_list_4 = [\n", + " ['img_00001.png', 110, 110, 110, 210, 210, 210, 210, 110, 0.6],\n", + " ['img_00002.png', 120, 130, 120, 200, 200, 200, 200, 130, 0.6],\n", + " ['img_00002.png', 11, 12, 11, 18, 21, 18, 21, 12, 0.9],\n", + " ['img_00002.png', 5, 8, 5, 16, 15, 16, 15, 8, 0.9]\n", + " ]\n", + " ground_truth_list_4 = [\n", + " ['img_00001.png', 100, 100, 100, 200, 200, 200, 200, 100],\n", + " ['img_00002.png', 10, 10, 10, 20, 20, 20, 20, 10],\n", + " ['img_00002.png', 70, 80, 70, 150, 140, 150, 140, 80]\n", + " ]\n", + " ap_4 = object_detection_average_precision(\n", + " ground_truth_list_4, predictions_list_4)\n", + " print('TEST CASE 4 --- AP: ', ap_4)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/datasets/data-supply/documentation/code/d3m_eval.py b/datasets/data-supply/documentation/code/d3m_eval.py new file mode 100644 index 0000000..e6ba03a --- /dev/null +++ b/datasets/data-supply/documentation/code/d3m_eval.py @@ -0,0 +1,246 @@ +import numpy as np + + +def group_gt_boxes_by_image_name(gt_boxes): + gt_dict = {} + + for box in gt_boxes: + #x = box.split() + #image_name = x[0] + #bbox = [float(z) for z in x[1:]] + image_name = box[0] + bbox = box[1:] + #print(image_name, bbox) + + if image_name not in gt_dict.keys(): + gt_dict[image_name] = [] + + gt_dict[image_name].append({'bbox': bbox}) + + return gt_dict + + +def voc_ap(rec, prec, use_07_metric=False): + """ ap = voc_ap(rec, prec, [use_07_metric]) + Compute VOC AP given precision and recall. + If use_07_metric is true, uses the + VOC 07 11 point method (default:False). + """ + if use_07_metric: + # 11 point metric + ap = 0. + for t in np.arange(0., 1.1, 0.1): + if np.sum(rec >= t) == 0: + p = 0 + else: + p = np.max(prec[rec >= t]) + ap = ap + p / 11. + else: + # correct AP calculation + # first append sentinel values at the end + mrec = np.concatenate(([0.], rec, [1.])) + mpre = np.concatenate(([0.], prec, [0.])) + + # compute the precision envelope + for i in range(mpre.size - 1, 0, -1): + mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i]) + + # to calculate area under PR curve, look for points + # where X axis (recall) changes value + i = np.where(mrec[1:] != mrec[:-1])[0] + + # and sum (\Delta recall) * prec + ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) + return ap + + +def objectDetectionAP(dets, + gts, + ovthresh=0.5, + use_07_metric=False): + """ + This function takes a list of ground truth boxes and a list of detected bounding boxes + for a given class and computes the average precision of the detections with respect to + the ground truth boxes. 
+ + Parameters: + ----------- + dets: list + List of bounding box detections. Each box is represented as a list + with format: + Case 1 (confidence provided): + ['image_name', 'x_min', 'y_min', 'x_max', 'y_max', 'confidence'] + Case 2 (confidence not provided): + ['image_name', 'x_min', 'y_min', 'x_max', 'y_max'] + + gts: list + List of ground truth boxes. Each box is represented as a list with the + following format: [image_name, x_min, y_min, x_max, y_max]. + + [ovthresh]: float + Overlap threshold (default = 0.5) + + [use_07_metric]: boolean + Whether to use VOC07's 11 point AP computation (default False) + + Returns: + -------- + rec: 1d array-like + Array where each element (rec[i]) is the recall when considering i+1 detections + + prec: 1d array-like + Array where each element (rec[i]) is the precision when considering i+1 detections + + ap: float + Average precision between detected boxes and the ground truth boxes. + (it is also the area under the precision-recall curve). + + Example: + + With confidence scores: + >> predictions_list = [['img_00285.png',330,463,387,505,0.0739], + ['img_00285.png',420,433,451,498,0.0910], + ['img_00285.png',328,465,403,540,0.1008], + ['img_00285.png',480,477,508,522,0.1012], + ['img_00285.png',357,460,417,537,0.1058], + ['img_00285.png',356,456,391,521,0.0843], + ['img_00225.png',345,460,415,547,0.0539], + ['img_00225.png',381,362,455,513,0.0542], + ['img_00225.png',382,366,416,422,0.0559], + ['img_00225.png',730,463,763,583,0.0588]] + >> ground_truth_list = [['img_00285.png',480,457,515,529], + ['img_00285.png',480,457,515,529], + ['img_00225.png',522,540,576,660], + ['img_00225.png',739,460,768,545]] + + >> rec, prec, ap = objectDetectionAP(predictions_list, ground_truth_list) + >> print(ap) + 0.125 + + Without confidence scores: + >> predictions_list = [['img_00285.png',330,463,387,505], + ['img_00285.png',420,433,451,498], + ['img_00285.png',328,465,403,540], + ['img_00285.png',480,477,508,522], + ['img_00285.png',357,460,417,537], + ['img_00285.png',356,456,391,521], + ['img_00225.png',345,460,415,547], + ['img_00225.png',381,362,455,513], + ['img_00225.png',382,366,416,422], + ['img_00225.png',730,463,763,583]] + >> ground_truth_list = [['img_00285.png',480,457,515,529], + ['img_00285.png',480,457,515,529], + ['img_00225.png',522,540,576,660], + ['img_00225.png',739,460,768,545]] + + >> rec, prec, ap = objectDetectionAP(predictions_list, ground_truth_list) + >> print(ap) + 0.0625 + + """ + + # Load ground truth + gt_dict = group_gt_boxes_by_image_name(gts) + + # extract gt objects for this class + recs = {} + npos = 0 + + imagenames = sorted(gt_dict.keys()) + for imagename in imagenames: + R = [obj for obj in gt_dict[imagename]] + bbox = np.array([x['bbox'] for x in R]) + det = [False] * len(R) + npos = npos + len(R) + recs[imagename] = {'bbox': bbox, + 'det': det} + + # Load detections + det_length = len(dets[0]) + + # Check that all boxes are the same size + for det in dets: + assert len(det) == det_length, 'Not all boxes have the same dimensions.' 
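+
+    # Note: as described in the docstring above, each detection has either 6
+    # elements (with a trailing confidence score) or 5 elements (without one).
+    # The det_length check below sorts detections by descending confidence when
+    # scores are present, and otherwise keeps them in the order they were given.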
+ + image_ids = [x[0] for x in dets] + BB = np.array([[float(z) for z in x[1:5]] for x in dets]) + + if det_length == 6: + print('confidence scores are present') + confidence = np.array([float(x[-1]) for x in dets]) + # sort by confidence + sorted_ind = np.argsort(-confidence) + sorted_scores = np.sort(-confidence) + + else: + print('confidence scores are not present') + num_dets = len(dets) + sorted_ind = np.arange(num_dets) + sorted_scores = np.ones(num_dets) + + BB = BB[sorted_ind, :] + image_ids = [image_ids[x] for x in sorted_ind] + + # print('sorted_ind: ', sorted_ind) + # print('sorted_scores: ', sorted_scores) + # print('BB: ', BB) + # print('image_ids: ', image_ids) + + # go down dets and mark TPs and FPs + nd = len(image_ids) + tp = np.zeros(nd) + fp = np.zeros(nd) + for d in range(nd): + R = recs[image_ids[d]] + bb = BB[d, :].astype(float) + ovmax = -np.inf + BBGT = R['bbox'].astype(float) + # print('det %d: ' % d) + # print('bb: ', bb) + + if BBGT.size > 0: + # compute overlaps + # intersection + ixmin = np.maximum(BBGT[:, 0], bb[0]) + iymin = np.maximum(BBGT[:, 1], bb[1]) + ixmax = np.minimum(BBGT[:, 2], bb[2]) + iymax = np.minimum(BBGT[:, 3], bb[3]) + iw = np.maximum(ixmax - ixmin + 1., 0.) + ih = np.maximum(iymax - iymin + 1., 0.) + inters = iw * ih + + # union + uni = ((bb[2] - bb[0] + 1.) * (bb[3] - bb[1] + 1.) + + (BBGT[:, 2] - BBGT[:, 0] + 1.) * + (BBGT[:, 3] - BBGT[:, 1] + 1.) - inters) + + overlaps = inters / uni + ovmax = np.max(overlaps) + jmax = np.argmax(overlaps) + # print('overlaps: ', overlaps) + + if ovmax > ovthresh: + if not R['det'][jmax]: + # print('Box matched!') + tp[d] = 1. + R['det'][jmax] = 1 + else: + # print('Box was already taken!') + fp[d] = 1. + else: + # print('No match with sufficient overlap!') + fp[d] = 1. + + # print('tp: ', tp) + # print('fp: ', fp) + + # compute precision recall + fp = np.cumsum(fp) + tp = np.cumsum(tp) + rec = tp / float(npos) + # avoid divide by zero in case the first detection matches a difficult + # ground truth + prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps) + ap = voc_ap(rec, prec, use_07_metric) + + return rec, prec, ap diff --git a/datasets/data-supply/documentation/datasetSchema.md b/datasets/data-supply/documentation/datasetSchema.md new file mode 100644 index 0000000..208b909 --- /dev/null +++ b/datasets/data-supply/documentation/datasetSchema.md @@ -0,0 +1,590 @@ +# Dataset Schema (version 4.0.0) + +Dataset schema provides a specification of an abstract dataset. It is contained in the [datasetSchema.json](../schemas/datasetSchema.json) file. An instance of this dataset schema is included with every dataset in the datasetDoc.json file. The semantics of datasetSchema.json and datasetDoc.json is [here](FAQ.md#semantics). + +Dataset schema specifies a dataset in three sections: about, dataResources, and qualities. Each of these sections are described below. + +## About + +The "about" section contains of some general information about the dataset and consists of the following fields. 
+ +| Field | Description | +|-----------------------|---------------------------------------------------------------------------------------------------| +| datasetID | a unique ID assigned to a dataset | +| datasetName | the name of a dataset | +| datasetURI | the location of the dataset | +| description | a brief description of the dataset | +| citation | citation of the source of the dataset | +| humanSubjectsResearch | indicates if the dataset contains human subjects data or not | +| license | license of the source of the dataset | +| source | source of the dataset | +| sourceURI | location of the source of the dataset | +| approximateSize | size of the dataset | +| applicationDomain | application domain of the dataset (e.g., medical, environment, transportation, agriculture, etc.) | +| datasetVersion | version of the current dataset | +| datasetSchemaVersion | version of the datasetSchema | +| redacted | a field indicating if the dataset has been redacted or not | +| publicationDate | publication date of the dataset, if available | + +## Data Resources + +The "datasetResources" section annotates all the data resources in a dataset. A dataset is considered as a set of data resources. Therefore the "datasetResources" field is a list of dictionaries, one item per resource. Each dictionary item contains the following information about a data resource. + +| Field | Description | +|-----------------------|-------------------------------------------------------------------------------------------------------------------| +| resID | a unique ID assigned to a resource | +| resPath | path of this resource relative to the dataset root | +| resType | the type of this resource+ | +| resFormat | a dictionary structure containing the occurring media types in this dataset as the keys and their corresponding list of file formats as the values ++| +| isCollection | a boolean flag indicating if this resource if a single item/file or a collection of items/files +++. | + ++ __resType__ can be one of the following: __["image","video","audio","speech","text","graph","edgeList","table","timeseries","raw"]__ + +If `resType` is `edgeList`, the resource is basically a graph, but it is represented using an edge list. Each item in this list refers to an edge given by its source node and its target node (in that order, if it is a directed graph). For example, refer to [Case 5](#case-5) below. The edge list can also have optional edge attributes: [edgeID, source_node, target_node, edge_attr1, edge_attr2, ....] + + +++ Here is an example of `resFormat` +``` +"resFormat":{ + "image/jpeg":["jpeg", "jpg"], + "image/png":["png"] +} +``` + +The [supportedResourceTypesFormats.json](supportedResourceTypesFormats.json) file contains the current list of all the resTypes, resFormats and the file types used in the datasets in a machine-readable way. + + ++++A **collection** of resources/files can be organized into directories and sub-directories. + + +If `resType` is "table" or "timeseries" (which is also a table basically), columns in the table can be annotated with the following information. +Not all columns are necessary annotated (see [minimal metadata datasets](./minimalMetadata.md) for an example). +There is an optional `columnsCount` field of the resource which can convey how many columns there are, +which can be used to determine if all columns are annotated. 
+ +| Field | Description | +|-----------------------|-------------------------------------------------------------------------------| +| colIndex | index of the column | +| colName | name of the column | +| colDescription | description of a column | +| colType | the data type of this column | +| role | the role this column plays | +| refersTo | an optional reference to another resource if applicable | +| timeGranularity | if a particular column represents time axis, this field captures the time sampling granularity as a numeric `value` and `unit` (e.g., 15-second granularity). | + +A column's type can be __one of__ the following: + +| Type | Description | +|----------------------|-------------------------------------------------------------------------------| +| boolean || +| integer || +| real || +| string || +| categorical || +| dateTime || +| realVector | This is stored as a string in CSV files. It's represented as "val1,val2,vl3,val4,...". For example, in the objectDetection task, the bounding boundary is a rectangle, which is represented as: "0,160,156,182,276,302,18,431". | +| json || +| geojson | See here: http://geojson.org/ | +| unknown | Column type is unknown. | + + +A column's role can be __one or more__ of the following: + +| Role | Description | +|-------------------------|-----------------| +| index | Primary index of the table. Should have unique values without missing values | +| multiIndex | Primary index of the table. Values in this primary index are not necessary unique to allow the same row to be repeated multiple times (useful for certain task types like multiLabel and objectDetection) | +| key | Any column that satisfies the uniqueness constraint can splay the role of a key. A table can have many keys. Values can be missing in such column. A key can be referenced in other tables (foreign key)| +| attribute | The column contains an input feature or attribute that should be used for analysis | +| suggestedTarget | The column is a potential target variable for a problem. If a problem chooses not to employ it as a target, by default, it will also have an attribute role | +| timeIndicator | Entries in this column are time entries | +| locationIndicator | Entries in this column correspond to physical locations | +| boundaryIndicator | Entries in this column indicated boundary (e.g., start/stop in audio files, bounding-boxes in an image files, etc.) | +| interval | Values in this column represents an interval and will contain a "realVector" of two values| +| instanceWeight | This is a weight assigned to the row during training and testing. It is used when a single row or pattern should be weighted more or less than others.| +| boundingPolygon | This is a type of boundary indicator representing geometric shapes. It's type is "realVector". It contains two coordinates for each polygon vertex - always even number of elements in string. The vertices are ordered counter-clockwise. For e.g., a bounding box in objectDetection consisting of 4 vertices and 8 coordinates: "0,160,156,182,276,302,18,431" | +| suggestedPrivilegedData | The column is potentially unavailable variable during testing for a problem. If a problem chooses not to employ it as a privileged data, by default, it will also have an attribute role | +| suggestedGroupingKey | A column with this role is a potential candidate for group_by operation (rows having the same values in this column should be grouped togehter). Multiple columns with this role can be grouped to obtain a hierarchical multi-index data. 
| +| edgeSource | Entries in this column are source nodes of edges in a graph | +| directedEdgeSource | Entries in this column are source nodes of directed edges in a graph | +| undirectedEdgeSource | Entries in this column are source nodes of undirected edges in a graph | +| multiEdgeSource | Entries in this column are source nodes of edges in a multi graph | +| simpleEdgeSource | Entries in this column are source nodes of edges in a simple graph | +| edgeTarget | Entries in this column are target nodes of edges in a graph | +| directedEdgeTarget | Entries in this column are target nodes of directed edges in a graph | +| undirectedEdgeTarget | Entries in this column are target nodes of undirected edges in a graph | +| multiEdgeTarget | Entries in this column are target nodes of edges in a multi graph | +| simpleEdgeTarget | Entries in this column are target nodes of edges in a simple graph | + +A time granularity unit can be __one of__ the following: + +| Unit | Description | +|----------------------|-------------------------------------------------------------------------------| +| seconds || +| minutes || +| days || +| weeks || +| months || +| years || +| unspecified || + +__Notes regarding index, multi-index roles__ +A table can have only one `index` or `multiIndex` column. + +For a table with `multiIndex` column, for a particular index value there can be multiple rows. Those rows have exactly the same values in all columns except columns with `suggestedTarget` role. + +Of special note is the "refersTo" field, which allows linking of tables to other entities. A typical "refersTo" entry looks like the following. +``` +"colIndex":1, + "colName":"customerID", + "colType":"integer", + "role":["attribute"], + "refersTo":{ + "resID":"0", + "resObject":{ + "columnName":"custID"}} +``` +Here, the customerID column in one table is linking to another resource whose "resID"="0", which is presumably a different table. Further, the object of reference, "resObject", is a column whose "columnName"="custID". + +A table entry can also refer to other kinds of resources besides another tables, including, a raw file in a collection of raw files (e.g., img1.png in a collection of images), a node in a graph, an edge in a graph. The below table lists the different resource objects that can be referenced. + +| resObject | resource referenced | Description | +|-----------|---------------------|-------------| +| columnIndex, columnName | table | Entries in this column refer to entries in another table (foreign key). See [example](#case-3) | +| item | a collection of raw files | Each entry in this column refers to an item in a collection of raw files. See [example](#case-2) | +| nodeAttribute | graph | Entries in this column refer to attribute values of a node in a graph. This can include a reference to the nodeID| +| edgeAttribute | graph | Entries in this column refer to attribute values of an edge in a graph. This can include a reference to the edgeID| + +__Notes regarding graph node/edge roles__ +The parent roles of egdeSource/edgeTarget can be combined with derived roles such as directedEdgeSource/directedEdgeTarget and undirectedEdgeSource/undirectedEdgeTarget to indicate directed or undirected edges in a graph. Similarly, the metadata can capture information about simple/multi-edges using multiEdgeSource/multiEdgeTarget and simpleEdgeSource/simpleEdgeTarget. 
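+
+As an illustration, below is a minimal sketch (with a hypothetical column name and placeholder values) of how a time column could combine the `timeIndicator` role with the optional `timeGranularity` field described above:
+```
+{
+  "colIndex": 1,
+  "colName": "timestamp",
+  "colType": "dateTime",
+  "role": ["timeIndicator", "attribute"],
+  "timeGranularity": {"value": 15, "unit": "seconds"}
+}
+```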
+ + +Next, we will see how this approach of treating everything as a resource and using references to link resources can handle a range of cases that can arise. + +## Case 1: Single table + +In many openml and other tabular cases, all the learning data is contained in a single tabular file. In this case, an example dataset will look like the following. +``` +/ +|-- tables/ + |-- learningData.csv + d3mIndex,sepalLength,sepalWidth,petalLength,petalWidth,species + 0,5.2,3.5,1.4,0.2,I.setosa + 1,4.9,3.0,1.4,0.2,I.setosa + 2,4.7,3.2,1.3,0.2,I.setosa + 3,4.6,3.1,1.5,0.2,I.setosa + 4,5.0,3.6,1.4,0.3,I.setosa + 5,5.4,3.5,1.7,0.4,I.setosa + ... +|-- datasetDoc.json +``` +The [datasetDoc.json](examples/iris.datasetDoc.json) for this example shows how this case is handled. Note that in this datasetDoc.json, the "role" of the column "species" is "suggestedTarget". The reason for this is given [here](FAQ.md#suggested-targets). + +## Case 2: Single table, multiple raw files + +Consider the following sample image classification dataset. It has one learningData.csv file, whose "image" column has pointers to image files. +``` +/ +|-- media/ + |-- img1.png + |-- img2.png + |-- img3.png + |-- img4.png +|-- tables/ + |-- learningData.csv + d3mIndex,image,label + 0,img1.png,cat + 1,img2.png,dog + 2,img3.png,dog + 3,img4.png,cat +|-- datasetDoc.json +``` +The [datasetDoc.json](examples/image.datasetDoc.json) for this example shows how this case is handled. Two things to note in this datasetDoc.json are: +- We define all the images in media/ as a single resource, albeit a collective resource (notice "isCollection"=true). +``` +{ + "resID": "0", + "resPath": "media/", + "resType": "image", + "resFormat": ["img/png"], + "isCollection": true, +} +``` +- We reference this image-collection resource ("resID"="1") in the "image" column of the "learningData.csv" resource: +``` +{ + "colIndex": 1, + "colName": "image", + "colType": "string", + "role": "attribute", + "refersTo":{ + "resID": "0", + "resObject": "item" + } +} +``` +The semantics here is as follows: The entries in the "image" column refers to an item ("resObject"="item") in a resource whose "resID"="0". Therefore, an entry 'img1.png' in this column refers to an item of the same name in the image collection. Since we know the relative path of this image-collection resource ("resPath"="media/"), we can locate "img1.png" at /media/img1.png. + +## Case 3: Multiple tables referencing each other +Consider the following dataset containing multiple relational tables referencing each other. +``` +/ +|-- tables/ + |-- customers.csv + custID,country,first_invoices_time,facebookHandle + 0,USA,12161998,"johnDoe" + 1,USA,06272014,"janeSmith" + 2,USA,12042014,"fooBar" + 3,AUS,02022006,"johnyAppleseed" + ... + |-- items.csv + stockCode,first_item_purchases_time,Description + FIL36,02162005,waterfilter + MAN42,06272014,userManual + GHDWR2,01112012,generalHardware + ... + |-- invoices.csv + invoiceNo,customerID,first_item_purchases_time + 0,734,12161998 + 1,474,11222010 + 2,647,10182011 + ... + |-- learningData.csv + item_purchase_ID,invoiceNo,invoiceDate,stockCode,unitPrice,quantity,customerSatisfied,profitMargin + 0,36,03142009,GYXR15,35.99,2,1,0.1, + 1,156,02022006,TEYT746,141.36,2,0,0.36 + 2,8383,11162010,IUYU132,57.25,2,0,0.22 + 3,3663,12042014,FURY145,1338.00,1,1,0.11 + 4,6625,07072013,DSDV97,762.00,1,1,0.05 + ... +``` +Here, "learningData" references both "invoice" and "items" tables. "invoice" table in turn references "customers" table. 
The [datasetDoc.json](examples/multitable.datasetDoc.json) for this example shows how this case is handled. + +There are a number of important things to notice in this example: + +- The "isColleciton" flag is False for all the resources, indicating that there are no collection-of-files resource as was the case in the image example above. + +- As shown in the snippet below, columns in one resource are referencing columns in another resource. +``` + -- In "resID"="1" --- -- In "resID"="3" --- -- In "resID"="3" --- + "colIndex":1, "colIndex":1, "colIndex":3, + "colName":"customerID", "colName":"invoiceNo", "colName":"stockCode", + "colType":"integer", "colType":"integer", "colType":"string", + "role":["attribute"], "role":["attribute"], "role":["attribute"], + "refersTo":{ "refersTo":{ "refersTo":{ + "resID":"0", "resID":"1", "resID":"2", + "resObject":{ "resObject":{ "resObject":{ + "columnName":"custID"}} "columnIndex":0}} "columnName":"stockCode"}} +``` +- A column can have multiple roles. For instance in "customer.csv", the column "facebookHandle" has two roles ("role":["attribute", "key"]). It is a key because it satisfies the uniqueness constraint. (see [here](FAQ.md#index-key) for distinction between index and key roles.) +``` +{ + "colIndex":3, + "colName":"facebookHandle", + "colType":"string", + "role":["attribute", "key"] +} +``` +- Finally, a column can be referenced using its columnIndex or columnName. See "resID"=3 entries above. Ideally we want column names participating in a reference for readability reasons, however, occasionally we have come across datasets with tables that do not have unique column names. In such cases, columnIndex is used. + +## Case 4: Referencing Graph Elements +Assume a sample graph resource below, G2.gml, whose 'resID' = 1 +``` +graph [ + node [ + id 0 + label "1" + nodeID 1 + ] + node [ + id 1 + label "88160" + nodeID 88160 + ] +... +``` + +A corresponding learningData.csv + +| d3mIndex | G2.nodeID | class | +|----------|-----------|-------| +|0 |88160 |1 | +|... |... |... | + + +A corresponding datasetDoc.json +``` +{ + "colIndex": 2, + "colName": "G2.nodeID", + "colType": "integer", + "role": [ + "attribute" + ], + "refersTo": { + "resID": "1", + "resObject": "node" + } +}, +``` + +- "resID": "1" - this points to the graph resource G2.gml +- "resObject": "node" - this points to a node in that graph. **All nodes have a 'nodeID' in the GML file and all nodes are uniquely identified and referenced by its nodeID.** +- So, in the learningData table above, row 0 points to a node in G2.gml whose nodeID = 88160: + +``` + node [ + id 1 + label "88160" + nodeID 88160 + ] +``` + +**Note:** nodeID's can also be strings as shown in the example below: +``` +graph.gml (resID=1) learningData.csv datasetDoc.json +=================== ================= ================ +graph [ | d3mIndex | child.nodeID | parent.nodeID | bond | ... + node [ |----------|--------------|---------------|------| { + id 0 | 0 | BSP | OPSIN | C | "colIndex": 1, + label "Protein Acyl Transferases" | 1 | RHOD | OPSIN | NC | "colName": "child.nodeID", + nodeID "PATS" | 2 | RXN | GPCR | NC | "colType": "string", + ] | ... | ... | ..... | ... | "role": [ + node [ "attribute" + id 1 ], + label "Blue Sensitive Opsins" "refersTo": { + nodeID "BSoP" "resID": "1", + ] "resObject": "node" +... }}, + ... +``` + +## Case 5: Graph resources represented as edge lists + +In v3.1.1, we introduced a new "resType" called "edgeList". 
This is how a dataset migth look for a case where the graphs are represented using edge lists. + +``` +/ + |-- tables/ + |-- learningData.csv + d3mIndex,user_ID,user_location,gender,age,high_valued_user + 0,2048151474,46614,M,37,"johnDoe",1 + 1,6451671537,08105,F,55,"janeSmith",1 + 2,6445265804,31021,F,23,"fooBar",1 + 3,0789614390,11554,58,"johnyAppleseed",1 + ... + |-- graphs + |-- mentions.edgelist.csv + edgeID,user_ID,mentioned_userID + 0,2048151474,0688363262 + 1,2048151474,1184169266 + 2,2048151474,3949024994 + 3,2048151474,7156662506 + ... +``` + +It's corresponding datasetDoc.json will look like the following: +``` + ... + { + "resID": "0", + "resPath": "graphs/mentions.edgelist.csv", + "resType": "edgeList", + "resFormat": ["text/csv"], + "isCollection": false, + "columns": [ + { + "colIndex": 0, + "colName": "edgeID", + "colType": "integer", + "role": [ + "index" + ], + }, + { + "colIndex": 1, + "colName": "user_ID", + "colType": "integer", + "role": [ + "attribute" + ], + "refersTo": { + "resID": "1", + "resObject":{ + "columnName":"user_ID"} + } + }, + { + "colIndex": 2, + "colName": "mentioned_userID", + "colType": "integer", + "role": [ + "attribute" + ], + "refersTo": { + "resID": "1", + "resObject":{ + "columnName":"user_ID"} + } + } + ] + }, + { + "resID": "learningData", + "resPath": "tables/learningData.csv", + "resType": "table", + "resFormat": ["text/csv"], + "isCollection": false, + "columns": [ + { + "colIndex": 0, + "colName": "d3mIndex", + "colType": "integer", + "role": [ + "index" + ], + }, + { + "colIndex": 1, + "colName": "user_ID", + "colType": "integer", + "role": [ + "attribute" + ] + }, + { + "colIndex": 2, + "colName": "user_location", + "colType": "categorical", + "role": [ + "attribute" + ], + }, + ... + }, +... +``` + +# Qualities + +A dataset can be annotated with its unique properties or __qualities__. The "qualities" section of the datasetSchema specifies the fields that can be used to used to capture those qualities. + +| Field | Description | +|-------|-------------| +| qualName | name of the quality variable | +| qualValue | value of the quality variable | +| qualValueType | data type of the quality variable; can be one of ["boolean","integer","real","string","dict"]| +| qualValueUnits | units of the quality variable | +| restrictedTo | if a quality is restricted to specific resource and not the entire dataset, this field captures that constraint | + +__Note:__ We are not providing a taxonomy of qualities but, through this section in the dataset schema, are providing a provision for capturing the qualities of a dataset in its datasetDoc.json file. + +Some qualities apply to an entire dataset. For instance, if one wants to annotate a dataset with (1) information about the presence of multi relational tables or not, (2) requires LUPI (Learning Using Privileged Information) or not, and/or (3) its approximate size: +``` +"qualities":[ + { + "qualName":"multiRelationalTables", + "qualValue":true, + "qualValueType":"boolean" + }, + { + "qualityName":"LUPI", + "qualityValue":true, + "qualValueType":"boolean" + }, + { + "qualName":"approximateSize", + "qualValue":300, + "qualValueType":"integer" + "qualValueUnits":"MB" +}] +``` + +Some qualities are applicable only to certain resources within a dataset. For instance, "maxSkewOfNumericAttributes" is table specific and not dataset wide. 
In such cases, we include the "restrictedTo" field: +``` +{ + "qualName":"maxSkewOfNumericAttributes", + "qualValue":6.57, + "qualValueType":"real", + "restrictedTo":{ + "resID":"3" // redID 3 is a table + } +}, +``` + +Further, some qualities are restricted to not only a specific resource but to a specific component within a resource. For instance, "classEntropy" and "numMissingValues" are only applicable to a column within a table within a dataset. In such cases, the "restrictedTo" field will have additional "resComponent" qualifier: +``` +{ + "qualName":"classEntropy", + "qualValue":3.63 + "qualValueType":"real", + "restrictedTo":{ + "resID":"3", + "resComponent":{"columnName":"customerSatisfied"} + } +}, +{ + "qualName":"numMissingValues", + "qualValue":12, + "qualValueType":"integer", + "restrictedTo":{ + "resID":"3", + "resComponent":{"columnName":"customerSatisfied"} + } +} +``` +Similarly a "privilegedFeature" called "facebookHandle" will be restricted to its column within a table (resID=0, customer table) within a dataset as shown below: +``` +{ + "qualName":"privilegedFeature", + "qualValue":true, + "qualValueType":"boolean", + "restrictedTo":{ + "resID":"0", + "resComponent":{"colName":"facebookHandle"} + } +} +``` + +For a full example see the [datasetDoc.json](examples/multitable.datasetDoc.json) file for the above Case 3. + +"resComponent" can also be strings "nodes" and "edges". + +# FAQ + +#### Why do we have distinct "index" and "key" roles + +An field which has unique values is a key. However, a key that is used to uniquely identify a row in a table is an index. There can be multiple keys in a table, but a single index. Both index and key columns can participate in the foreign key relationship with other tables. + +#### Why is "role" filed a list? + +A column can play multiple roles. For instance, it can be a "locationIndicator" and an "attribute", a "key" and an "attribute", and so on. + +#### How are LUPI datasets represented + +LUPI datasets contain columns for variables which are not available during testing (testing dataset split does not contain data for those columns). + +Such dataset can have some columns marked with role "suggestedPrivilegedData" to suggest that those columns might be used in the problem description as LUPI columns. The problem description has a list of such columns specified through "privilegedData" list of columns. + +#### Why are we using "suggestedTarget" role instead of "target" in datasetSchema? + +Defining targets is inherently in the domain of a problem. The best we can do in a dataset is to make a suggestion to the problem that these are potential targets. Therefore we use "target" in the problemSchema vocabulary and "suggestedTarget" in the datasetSchema vocabulary. + +#### What is the use for the role 'boundaryIndicator' +In some datasets, there are columns that denote boundaries and is best not treated as attributes for learning. In such cases, this tag comes in handy. Example: + +``` +d3mIndex,audio_file,startTime,endTime,event +0,a_001.wav,0.32,0.56,car_horn +1,a_002.wav,0.11,1.34,engine_start +2,a_003.wav,0.23,1.44,dog_bark +3,a_004.wav,0.34,2.56,dog_bark +4,a_005.wav,0.56,1.22,car_horn +... +``` +Here, 'startTime' and 'endTime' indicate the start and end of an event of interest in the audio file. These columns are good candidates for the role of boundary indicators. + +A boundary column can contain `refersTo` reference to point to the column for which it is a boundary. 
+If `refersTo` is not provided it means that a boundary column is a boundary for the first preceding non-boundary column. +E.g., in the example above, `startTime` and `endTime` columns are boundary columns for `audio_file` column +(or more precisely, for files pointed to by the `audio_file` column). diff --git a/datasets/data-supply/documentation/examples/image.datasetDoc.json b/datasets/data-supply/documentation/examples/image.datasetDoc.json new file mode 100644 index 0000000..42e417b --- /dev/null +++ b/datasets/data-supply/documentation/examples/image.datasetDoc.json @@ -0,0 +1,52 @@ +{ +"about": + { + "datasetID": "image_dataset_1", + "datasetName":"Sample Image Dataset", + "humanSubjectsResearch": false, + "license":"CC", + "datasetSchemaVersion":"3.0", + "redacted":false + }, +"dataResources": + [ + { + "resID": "0", + "resPath": "media/", + "resType": "image", + "resFormat": ["img/png"], + "isCollection": true, + }, + { + "resID": "1", + "resPath": "tables/learningDoc.csv", + "resType": "table", + "resFormat": ["text/csv"], + "isCollection": false, + "columns":[ + { + "colIndex": 0, + "colName": "d3mIndex", + "colType": "integer", + "role": ["index"] + }, + { + "colIndex": 1, + "colName": "image", + "colType": "string", + "role": ["attribute"], + "refersTo":{ + "resID": "0", + "resObject": "item" + } + }, + { + "colIndex": 2, + "colName": "label", + "colType": "categorical", + "role": ["suggestedTarget"] + } + ] + } + ] +} \ No newline at end of file diff --git a/datasets/data-supply/documentation/examples/iris.datasetDoc.json b/datasets/data-supply/documentation/examples/iris.datasetDoc.json new file mode 100644 index 0000000..32e5b0c --- /dev/null +++ b/datasets/data-supply/documentation/examples/iris.datasetDoc.json @@ -0,0 +1,59 @@ +{ +"about": + { + "datasetID": "iris_dataset_1", + "datasetName":"Iris Dataset", + "humanSubjectsResearch": false, + "license":"CC", + "datasetSchemaVersion":"3.0", + "redacted":false + }, +"dataResources": + [ + { + "resID": "0", + "resPath": "tables/learningDoc.csv", + "resType": "table", + "resFormat": ["text/csv"], + "isCollection": false, + "columns":[ + { + "colIndex": 0, + "colName": "d3mIndex", + "colType": "integer", + "role": ["index"] + }, + { + "colIndex": 1, + "colName": "sepalLength", + "colType": "real", + "role": ["attribute"] + }, + { + "colIndex": 2, + "colName": "sepalWidth", + "colType": "real", + "role": ["attribute"] + }, + { + "colIndex": 3, + "colName": "petalLength", + "colType": "real", + "role": ["attribute"] + }, + { + "colIndex": 4, + "colName": "petalWidth", + "colType": "real", + "role": ["attribute"] + }, + { + "colIndex": 5, + "colName": "species", + "colType": "categorical", + "role": ["suggestedTarget"] + } + ] + } + ] +} \ No newline at end of file diff --git a/datasets/data-supply/documentation/examples/multitable.datasetDoc.json b/datasets/data-supply/documentation/examples/multitable.datasetDoc.json new file mode 100644 index 0000000..6981d59 --- /dev/null +++ b/datasets/data-supply/documentation/examples/multitable.datasetDoc.json @@ -0,0 +1,232 @@ +{ + "about": { + "datasetID":"mt_mtc_1", + "datasetName":"Sample multitable relational dataset", + "humanSubjectsResearch": false, + "license":"CC", + "datasetSchemaVersion":"3.0", + "redacted":false + }, + "dataResources":[ + { + "resID":"0", + "resPath":"tables/customers.csv", + "resType":"table", + "resFormat":["text/csv"], + "isCollection":false, + "columns":[ + { + "colIndex":0, + "colName":"custID", + "colType":"integer", + "role":["index"] + }, + { + 
"colIndex":1, + "colName":"country", + "colType":"categorical", + "role":["attribute"] + }, + { + "colIndex":2, + "colName":"first_invoices_time", + "colType":"dateTime", + "role":["attribute"] + }, + { + "colIndex":3, + "colName":"facebookHandle", + "colType":"string", + "role":["attribute", "key"] + } + ] + }, + { + "resID":"1", + "resPath":"tables/invoices.csv", + "resType":"table", + "resFormat":["text/csv"], + "isCollection":false, + "columns":[ + { + "colIndex":0, + "colName":"invoiceNo", + "colType":"integer", + "role":["index"] + }, + { + "colIndex":1, + "colName":"customerID", + "colType":"integer", + "role":["attribute"], + "refersTo":{ + "resID":"0", + "resObject":{ + "columnName":"custID"}} + }, + { + "colIndex":2, + "colName":"first_item_purchases_time", + "colType":"dateTime", + "role":["attribute"] + } + ] + }, + { + "resID":"2", + "resPath":"tables/items.csv", + "resType":"table", + "resFormat":["text/csv"], + "isCollection":false, + "columns":[ + { + "colIndex":0, + "colName":"stockCode", + "colType":"string", + "role":["index"] + }, + { + "colIndex":1, + "colName":"first_item_purchases_time", + "colType":"dateTime", + "role":["attribute"] + }, + { + "colIndex":2, + "colName":"Description", + "colType":"string", + "role":["attribute"] + } + ] + }, + { + "resID":"learningData", + "resPath":"tables/learningData.csv", + "resType":"table", + "resFormat":["text/csv"], + "isCollection":false, + "columns":[ + { + "colIndex":0, + "colName":"item_purchase_ID", + "colType":"integer", + "role":["index"] + }, + { + "colIndex":1, + "colName":"invoiceNo", + "colType":"integer", + "role":["attribute"], + "refersTo":{ + "resID":"1", + "resObject":{ + "columnIndex":0}} + }, + { + "colIndex":2, + "colName":"invoiceDate", + "colType":"dateTime", + "role":["attribute"] + }, + { + "colIndex":3, + "colName":"stockCode", + "colType":"string", + "role":["attribute"], + "refersTo":{ + "resID":"2", + "resObject":{ + "columnName":"stockCode" + } + } + }, + { + "colIndex":4, + "colName":"unitPrice", + "colType":"real", + "role":["attribute"] + }, + { + "colIndex":5, + "colName":"quantity", + "colType":"integer", + "role":["attribute"] + }, + { + "colIndex":6, + "colName":"customerSatisfied", + "colType":"categorical", + "role":["suggestedTarget"] + }, + { + "colIndex":7, + "colName":"profitMargin", + "colType":"real", + "role":["suggestedTarget"] + } + ] + } + ], + "qualities":[ + { + "qualName":"multiRelationalTables", + "qualValue":true, + "qualValueType":"boolean" + }, + { + "qualityName":"LUPI", + "qualityValue":true, + "qualValueType":"boolean" + }, + { + "qualName":"privilegedFeature", + "qualValue":true, + "qualValueType":"boolean", + "restrictedTo":{ + "resID":"0", + "resComponent":{"colName":"facebookHandle"} + } + }, + { + "qualName":"approximateSize", + "qualValue":300, + "qualValueType":"integer", + "qualValueUnits":"MB" + }, + { + "qualName":"maxSkewOfNumericAttributes", + "qualValue":6.57, + "qualValueType":"real", + "restrictedTo":{ + "resID":"3" + } + }, + { + "qualName":"classEntropy", + "qualValue":3.63, + "qualValueType":"real", + "restrictedTo":{ + "resID":"3", + "resComponent":{"colName":"customerSatisfied"} + } + }, + { + "qualName":"numUniqueValues", + "qualValue":3667, + "qualValueType":"integer", + "restrictedTo":{ + "resID":"3", + "resComponent":{"columnIndex":1} + } + }, + { + "qualName":"numMissingValues", + "qualValue":12, + "qualValueType":"integer", + "restrictedTo":{ + "resID":"3", + "resComponent":{"columnName":"customerSatisfied"} + } + } + ] +} diff --git 
a/datasets/data-supply/documentation/minimalMetadata.md b/datasets/data-supply/documentation/minimalMetadata.md
new file mode 100644
index 0000000..099fe29
--- /dev/null
+++ b/datasets/data-supply/documentation/minimalMetadata.md
@@ -0,0 +1,53 @@
+# Minimal Metadata Datasets (version 4.0.0)
+
+Beginning with the D3M Winter Workshop 2020, there is a push in the program to reduce the amount of manually curated metadata that is provided in the datasets. The motivation is that as D3M systems are transitioned to partner environments, it is unrealistic to expect that the datasets will be fully curated in D3M format. Therefore, the systems need to reduce their reliance on manually hand-coded metadata. This is a step in that direction.
+
+The D3M core package clearly requires some metadata to work. Some of that metadata can be inferred; the rest is difficult to infer automatically and has to be provided manually. For the metadata that is provided manually, this page lists which elements can be reliably expected in minimal metadata datasets; everything else is optional (i.e., not provided in most cases).
+
+**Note 1:** The structure of the D3M datasets will remain intact.
+
+**Note 2:** There will be no changes to the problem metadata that is provided, i.e., the problemDoc.json files will not be affected by this change.
+
+**Note 3:** All the changes in minimal metadata datasets (for now) are with respect to the dataset schema, i.e., only datasetDoc.json files will be affected. For now, only column metadata such as column types and roles (with a few exceptions) has been removed when transitioning to the minimal metadata format. The exceptions are listed in the table below. The resource metadata will remain intact.
+
+**Note 4:** The master branch (seed datasets) moving forward will by default contain datasets in minimal metadata format.
+
+**Note 5:** The original full metadata counterparts of the minimal metadata seeds are archived in a separate directory: training_datasets/seed_datasets_archive/
+
+The following table lists the column metadata that can be expected per problem type.
+
+
+| Problem Type | Column information retained in minimal metadata |
+|------------------------------------------------------------|-------------------------------------------------|
+| (classification, binary, tabular) | (index) |
+| (classification, multiClass, tabular) | (index) |
+| (classification, multiLabel, tabular) | (index, multiIndex) |
+| (classification, binary, lupi, tabular) | (index) |
+| (classification, binary, semiSupervised, tabular) | (index) |
+| (classification, multiClass, semiSupervised, tabular) | (index) |
+| (regression, univariate, tabular) | (index) |
+| (regression, multivariate, tabular) | (index) |
+| (classification, binary, tabular, relational) | (index, refersTo) |
+| (classification, multiClass, tabular, relational) | (index, refersTo) |
+| (regression, univariate, tabular, relational) | (index, refersTo) |
+| (regression, multivariate, tabular, relational) | (index, refersTo) |
+| (classification, binary, text) | (index, refersTo) |
+| (classification, multiClass, text) | (index, refersTo) |
+| (classification, binary, text, relational) | (index, refersTo) |
+| (classification, multiClass, video) | (index, refersTo) |
+| (classification, multiClass, image) | (index, refersTo) |
+| (regression, univariate, image) | (index, refersTo) |
+| (regression, multivariate, image) | (index, refersTo) |
+| (classification, multiClass, audio) | (index, refersTo) |
+| (objectDetection, image) | (index, refersTo, multiIndex) |
+| (graph, vertexClassification, multiClass) | (index, refersTo) |
+| (graph, linkPrediction) | (index, refersTo) |
+| (graph, graphMatching) | (index, refersTo) |
+| (graph, communityDetection, nonOverlapping) | (index, refersTo) |
+| (graph, linkPrediction, timeSeries) | (index, refersTo, timeIndicator) |
+| (timeseries, forecasting, tabular) | (index, timeIndicator) |
+| (timeseries, forecasting, tabular, grouped) | (index, timeIndicator, suggestedGroupingKey) |
+| (classification, multiClass, timeseries) | (index, refersTo) |
+| (classification, binary, timeseries, tabular, grouped) | (index, refersTo, suggestedGroupingKey) |
+| (classification, multiClass, timeseries, tabular, grouped) | (index, refersTo, suggestedGroupingKey) |
\ No newline at end of file
diff --git a/datasets/data-supply/documentation/overview.md b/datasets/data-supply/documentation/overview.md
new file mode 100644
index 0000000..ebc0c5a
--- /dev/null
+++ b/datasets/data-supply/documentation/overview.md
@@ -0,0 +1,119 @@
+# D3MDS
+
+D3M Data Supply (D3MDS) refers to the data infrastructure provided by MIT Lincoln Lab for the D3M program. The basic components of D3MDS include the following: Datasets, Problems, and Baseline solutions. The guiding principles for this design are as follows:
+1. Keep the three entities, Dataset, Problem, and Baseline solutions, decoupled.
+2. Keep a dataset self-contained.
+3. Provide a uniform and consistent way of dealing with datasets, which could be full datasets, a train view of the dataset (during blind evaluation), or a test view of the dataset (during blind evaluation).
+4. Allow the possibility of annotating a dataset with metadata information.
+5. Dataset schema should be able to handle multiple relational tables.
+6. Datasets and Problems should be decoupled.
+
+## Dataset
+
+One of the core components of D3MDS is the dataset. Each dataset is a self-contained set of data resources. These data resources can come in many types and formats. 
Some of the types of data resources that one can expect to see in D3MDS include: image, video, audio, speech, text, table, graph, timeseries, etc. Each type can be supplied in one or more formats. For example, image resources can come in PNG, JPEG, etc. formats. Our file structure convention for organizing a dataset is as follows:
+```
+. /
+|-- media/
+|-- text/
+|-- tables/
+|-- graphs/
+|-- datasetDoc.json
+```
+Convention: The name of the root directory of a dataset is its dataset_id
+
+Suggested sub-directory names and structure:
+
+| | |
+|-----------------|-------------------------------------------------------------------------------------------------------------------|
+| / | the root directory |
+| media/ | (optional) directory containing media files, if any (e.g. images, video, audio, etc.) |
+| text/ | (optional) directory containing text documents, if any |
+| tables/ | (required) directory containing tabular data. This is not optional as a dataset will contain at least one table |
+| graphs/ | (optional) directory containing graphs, if any |
+| datasetDoc.json | (required) JSON document that describes all the data resources in the dataset (an instance of the dataset schema) |
+
+The datasetDoc.json file provides a description of all the elements of a dataset as a JSON document. This document is specified according to a predefined dataset schema. In other words, datasetDoc is an instance of datasetSchema.
+
+A small sample Dataset is shown in the figure below.
+
+![A sample dataset](static/sampleDataset.PNG)
+
+
+__Special "learningData" file__: Every dataset has a main data file that serves as the entry point into the dataset. Its resource name is always 'learningData' and, for backwards compatibility, this file is always named 'learningData'. Its format is typically CSV (learningData.csv), but this is not necessarily the case. This file is treated as just another tabular resource and is placed in the tables/ directory as shown below. Its columns and format are annotated in the datasetDoc.json similar to any other table that may be part of the dataset. An example of the learningData file can be seen in the sample dataset figure above.
+```
+. /
+|-- tables/
+    |-- learningData.csv
+|-- datasetDoc.json
+```
+
+### datasetSchema
+
+* The datasetSchema is version controlled and can be found in [datasetSchema.json](../schemas/datasetSchema.json)
+* Full documentation for datasetSchema can be found in [datasetSchema.md](datasetSchema.md)
+
+## Problems
+
+A dataset alone does not constitute a data science problem. A problem is developed over a dataset by defining a task, inputs for the task (which include a dataset), and expected outputs for the task. Multiple problems can be developed for a single dataset. Our convention for organizing a problem is as follows:
+
+```
+. /
+|-- dataSplits.csv
+|-- problemDoc.json
+```
+Convention: The name of the root directory of a problem is its problem_id
+
+| | |
+|-----------------|--------------------------------------------------------------------------------------------------------------------------------|
+| / | the root directory |
+|dataSplits.csv | (optional) data splits file which specifies the train/test split of the data when the split is defined manually |
+|problemDoc.json | (required) JSON document that describes the task, inputs and expected output of the problem (an instance of the problem schema)|
+
+A small sample Problem is shown in the figure below.
+
+![A sample problem](static/sampleProblem.PNG)
+
+__Special "dataSplits" file__: When the evaluation split has been done manually and not algorithmically, the dataSplits file contains information about which rows in learningData.csv are 'TRAIN' rows and which ones are 'TEST' rows. Normally (outside the context of the blind evaluation), performer systems can use this file to infer the trainData and testData splits. In the context of the blind evaluation, this file will be used to create two separate views of the dataset, the train view and test view, as described in the Dataset Views section below. An example of the dataSplits file can be seen in the sample problem figure above.
+
+The dataSplits file also contains the repeat number and fold number if multiple repeats or folds are used in the problem definition. Some training datasets have different evaluation procedures (e.g., 10-fold CV, 3 repeats of 20% holdout, etc.). This approach of including repeat and fold information in dataSplits handles all the different cases. __However, during blind evaluation, only the holdout method is used without any repeats. Therefore, for evaluation problems, repeat and fold will always contain 0 values.__
+
+### problemSchema
+* The problemSchema is version controlled and can be found in [problemSchema.json](../schemas/problemSchema.json)
+* Full documentation for problemSchema can be found in [problemSchema.md](problemSchema.md)
+
+## Baseline solutions
+A baseline solution is a solution to a problem. In its current conception, it consists of runnable code that takes as input a problem and produces an output (as specified in the problem schema). There can be multiple solutions for a given problem. Our convention for organizing a Python solution is as follows:
+```
+. /
+|-- src/
+|-- run.py
+|-- predictions.csv
+```
+| | |
+|-----------------|--------------------------------------------------------------------------------------------------------------------------------|
+| / | the root directory |
+|src/ | source code for the pipeline |
+|run.py | runner script for running the solution |
+|predictions.csv | a file containing the predictions of the solution for the given problem |
+|scores.json | a file containing the performance scores of the solution |
+
+
+# Dataset Views
+
+For all the datasets that are released to the performers (e.g., training datasets), performers get a full view of the dataset as shown below:
+
+![](static/sampleSupply.PNG)
+
+However, for blind evaluation purposes, three separate views of the dataset will be created. These are not just logical views, but separate physical copies of the dataset. For each dataset and problem combination, dataset_TRAIN, dataset_TEST, and dataset_SCORE variants will be created. By convention they always have those suffixes.
+The TEST view is the same as the SCORE view, except that the TEST view has all target values redacted.
+
+![](static/allViews.PNG)
+
+Only the pertinent data resources from the full dataset will be replicated in the train and test views. For instance, the train view will be created by making a copy of the full dataset and deleting 'TEST' rows in learningData.csv and any other data resources that are referenced by those rows. The same is true for the train view of the problem.
+
+![](static/trainView.PNG)
+
+During blind evaluation, at test time, the test view will be provided as shown below. Note that the labels in learningData.csv have been removed for the test view.
+
+![](static/testView.PNG)
+
diff --git a/datasets/data-supply/documentation/problemSchema.md b/datasets/data-supply/documentation/problemSchema.md
new file mode 100644
index 0000000..acda70e
--- /dev/null
+++ b/datasets/data-supply/documentation/problemSchema.md
@@ -0,0 +1,595 @@
+# Problem Schema (version 4.0.0)
+
+Problem schema provides a specification of an abstract data science problem. It is contained in the [problemSchema.json](../schemas/problemSchema.json) file. An instance of this schema is included with every problem in the problemDoc.json file.
+
+Problem schema specifies a problem in three sections: about, inputs, and expectedOutputs. Each of these sections is described below.
+
+# About
+
+The "about" section contains some general information about the problem and consists of the following fields.
+
+| Field | Description |
+|-----------------------|---------------------------------------------------------------------------------------------------|
+| problemID | a unique ID assigned to a problem |
+| problemName | the name of a problem |
+| problemDescription | a brief description of the problem |
+| problemURI | the location of the problem |
+| taskKeywords | a list of keywords that capture the nature of the machine learning task |
+| problemVersion | the version of the current problem |
+| problemSchemaVersion | the version of the problem schema |
+
+Currently, the keywords that can be combined to describe the task are the following:
+
+| taskKeywords |Notes|
+|--------------|-----|
+|classification|supervised learning task - learn from a labeled dataset to assign class labels to prediction samples|
+|regression|supervised learning task - learn from a labeled dataset to assign numeric values to prediction samples|
+|clustering|unsupervised learning task - no labeled dataset, cluster samples and assign a cluster label to all samples|
+|linkPrediction|[linkPrediction](#link-prediction) task|
+|vertexNomination|[vertexNomination](#vertex-nomination) task|
+|vertexClassification|[vertexClassification](#vertex-classification) task|
+|communityDetection|[communityDetection](#community-detection) task|
+|graphMatching|[graphMatching](#graph-matching) task|
+|forecasting|data is indexed by a time dimension and the task is to predict future values based on previously observed values|
+|collaborativeFiltering|task of filling blank cells in a utility matrix where each cell in the matrix holds an association between two entities (e.g., users and products)|
+|objectDetection|[objectDetection](#object-detection-task) task|
+|semiSupervised|[semiSupervised](#semi-supervised) learning task|
+|unsupervised|unsupervised learning task - no labeled dataset|
+|binary|binary classification task|
+|multiClass|multi-class classification task|
+|multiLabel|multi-label classification task|
+|univariate|applied to "regression" task with a single response variable|
+|multivariate|applied to "regression" task with more than one response variable|
+|overlapping|applied to "communityDetection" problems to indicate overlapping communities: multiple community memberships for nodes|
+|nonOverlapping|applied to "communityDetection" problems to indicate disjoint communities: single community memberships for nodes|
+|tabular|indicates data is tabular|
+|relational|indicates data is a relational database|
+|nested |indicates that a table consists of nested tables. 
For example, a column entry can point to an entire table stored in a separate CSV file (similar to pointing to an image or other media files)| +|image|indicates data consists of raw images| +|audio|indicates data consists of raw audio| +|video|indicates data consists of raw video| +|speech|indicates human speech data| +|text|indicates data consists of raw text| +|graph|indicates data consists of graphs| +|multiGraph|indicates data consists of multigraphs| +|timeSeries|indicates data consists of time series| +|grouped|applied to time series data (or tabular data in general) to indicate that some columns should be [grouped](#grouped-time-series)| +|geospatial|indicates data contains geospatial information| +|remoteSensing|indicates data contains remote-sensing data| +|lupi|indicates the presence of privileged features: [lupi](#LUPI)| +|missingMetadata|indicates that the metadata for dataset is not complete| + +# Inputs + +This section specifies the three inputs that are required to understand and solve a problem in the context of D3M. They include: data, data splits, and performance metrics. +``` +"inputs":{ + "data":[ + ... + ], + "dataSplits":{ + ... + }, + "performanceMetrics":{ + ... + } +} +``` + +## Data + +Data refers to the dataset(s) over which a problem is defined. A problem can refer to multiple datasets. That is captured by the datasetID field, which is a list. +``` +"data":[ + { + "datasetID":"sample_dataset_ID", + ... + } +] +``` + +In addition to the datasetID, one has to also specify target variable(s) on that dataset as part of the problem specification. Each target variable is specified by referring to a table using its "resID" and specifying its target column index and column name. For a correct specification, the target column names and indexes in problemDoc should match with the corresponding column names and indexes in datasetDoc. The redundancy of specifying a target by its column name and index is by design. +``` +"data": [ + { + "datasetID":"sample_dataset_ID" + "targets": + [ + { + "targetIndex":0, + "resID":"0", // reference to a table in the dataset + "colIndex":18, + "colName":"classLabel" + } + ] + } +] +``` +For more information about "resID", "colIndex", and "colName", refer to the [datasetSchema.json](../schemas/datasetSchema.json) + +"targets" also has an optional "numClusters" field. This field is applicable to clustering problems. It is used to specify the number of clusters to be generated by the solution algorithm (if this information is known apriori). + + +If the task is time series forecasting, then the problem specification can contain additional information about the horizon of forecast. It will contain a number, which indicates the max number of time steps in future the predictions will need to be made. The horizon is in the units of `timeGranularity` in the column metadata. In the following example, assuming that the `timeGranularity` 5 days, the prediction horizon is 10 future steps, that is 50 days: +``` +"forecastingHorizon":{ + "resID": "learningData", + "colIndex": 8, + "colName": "time", + "horizonValue": 10.0 + } +``` + +Data can also contains "privilegedData" list of columns related to unavailable variables during testing. Those columns do not have data available in the test split of a dataset. 
+``` +"data": [ + { + "datasetID":"sample_dataset_ID" + "privilegedData": [ + { + "privilegedDataIndex": 0, + "resID": "learningData", + "colIndex": 20, + "colName": "HISTOLOGY" + } + ] + } +] +``` + + +## Data Splits + +Every problem has a special __dataSplits__ file. This file contains information about which rows in the learning data are 'TRAIN' rows and which ones are 'TEST' rows. It has the following columns: [d3mIndex, type, repeat, fold] as shown in the problem sample below. + +![](static/sampleProblem.PNG) + +This split file indirectly reflects the evaluation procedure that was used to define the problem. The "dataSplits" section in the problem schema contains information that can be used to infer the evaluation procedure and interpret the dataSplits file. It contains the following fields: + +| Field | Description | +|-------|-------------| +| method | refers to the evaluation method reflected in the data splits. Currently, it can be one of "holdOut" or "kFold". Can be extended in future to accommodate others. It is always 'holdOut' by default | +| testSize | applicable to "holdOut" method, it specifies the size of the test split in the range 0.0 to 1.0. | +| numFolds | applicable to "kFold" method, it specifies the number of folds | +| stratified | specifies if the split is stratified or not, default value being True | +| numRepeats | specifies how many repeats, e.g., 3 repeats of 50% holdout | +| randomSeed | the random number generator seed that was used to generate the splits | +| splitsFile | the relative path to the splits file from the problem root, which is "dataSplits.csv" directly under the problem root by default| +| splitScript | the relative path from the problem root to the script that was used to create the data split (optional) | + +## Performance metrics + +Another important part of a problem specification is set of metrics that will be used to evaluate the performance of a solution. The "performanceMetrics" section of the problem schema is a **list** of metrics. The provided metrics are the following. 
+ +| Metric | Notes | +|--------|-------| +| accuracy | sklearn.metrics.accuracy_score | +| precision | sklearn.metrics.precision_score | +| recall | sklearn.metrics.recall_score | +| f1 | sklearn.metrics.f1_score (pos_label=1) | +| f1Micro | sklearn.metrics.f1_score(average='micro') | +| f1Macro | sklearn.metrics.f1_score(average='macro') | +| rocAuc | Compute Area Under the Receiver Operating Characteristic Curve (ROC AUC) - only works with binary data, requires confidence | +| rocAucMacro | Compute rocAuc metric for each label and compute unweighted mean - only works for multi-class and multi-label data, requires confidence | +| rocAucMicro | Compute rocAuc metric globally by considering each element of the label indicator matrix as a binary prediction - only works for multi-class and multi-label data, requires confidence | +| meanSquaredError | sklearn.metrics.mean_squared_error, average computed over multiple target columns | +| rootMeanSquaredError | sqrt(sklearn.metrics.mean_squared_error), average computed over multiple target columns | +| meanAbsoluteError | sklearn.metrics.mean_absolute_error, average computed over multiple target columns | +| rSquared | sklearn.metrics.r2_score | +| normalizedMutualInformation | sklearn.metrics.normalized_mutual_info_score | +| jaccardSimilarityScore | sklearn.metrics.jaccard_similarity_score | +| precisionAtTopK | number of values shared between first K entries of ground truth and predicted labels and normalized by K | +| objectDetectionAP | an implementation of mean average precision for object detection problems, mean is computed accross classes (when there are multiple classes), average accross bounding polygons of one class, requires confidence | +| hammingLoss | wraps sklearn.metrics.hamming_loss function - used for multilabel classification problems. | +| meanReciprocalRank | computes the mean of the reciprocal of elements of a vector of rankings - used for linkPrediction problems.| +| hitsAtK | computes how many elements of a vector of ranks make it to the top 'k' positions - used for linkPrediction problems.| + + +**Note**: There can be multiple performance metrics included for a single problem. For example: +``` +"performanceMetrics": [ + { + "metric": "accuracy" + }, + { + "metric": "f1" + } + ] +``` +### Optional parameters for metrics +Notice that there are optional metric parameters in the problemSchema: +``` +... + "K":{"type":"integer","required":false,"dependencies": {"metric":["precisionAtTopK","hitsAtK"]}}, + "posLabel":{"type":"string","required":false, "dependencies": {"metric":["f1","precision","recall"]}} +... +``` + +| parameter | Notes | +|-----------------------|--------------------------------------------------------------------------------------------------------------------------------| +| K | This is applicable to the metrics ['precisionAtTopK', 'hitsAtK'] provided the value of K | +| posLabel | This is applicable to 'f1', 'precision', and 'recall' metrics, and indicates which class should be treated as "positive" class | + +# Expected outputs + +The "expectedOutputs" in the problem schema directs the solution systems to produce output(s) in a standardized manner. 
+``` +"expectedOutputs":{"type":"dict","required":true,"schema":{ + "predictionsFile": {"type":"string", "required":true, "default":"predictions.csv"}, + "scoresFile": {"type":"string", "required":false, "default":"scores.csv"} +}} +``` +## Predictions file + +Currently, there is only one required output, which is the predictions file - all solution systems should output this file. By default the name of this file is "predictions.csv", but can be changed in a problem instance using the "predictionsFile" field in the problem schema. + +The structure of predictions file depends on the metric used in the problem description. + +### `objectDetectionAP` metric + +`objectDetectionAP` metric requires the following structure: + * `d3mIndex` column + * target columns (names depend on problem target columns) + * `confidence` column + +Generally target columns are: a column for class/label and a column for object boundary. +For same input sample (image) multiple output rows can be made (matching in `d3mIndex` value), +but **only** those rows for objects which are determined to be found in the image. + +|d3mIndex|class|bounding_box |confidence| +|--------|-----|---------------|----------| +|0 |dog |"..." |... | +|0 |cat |"..." |... | +|0 |cat |"..." |... | +|0 |cat |"..." |... | +|1 |cat |"..." |... | +|1 |cat |"..." |... | +|1 |cat |"..." |... | +|1 |pig |"..." |... | +|1 |pig |"..." |... | +|2 |man |"..." |... | +|2 |man |"..." |... | +|2 |man |"..." |... | +|2 |bird |"..." |... | +|2 |car |"..." |... | + +### `rocAuc`, `rocAucMacro`, `rocAucMicro` metrics + +`rocAuc`, `rocAucMacro`, `rocAucMicro` metrics require the following structure: + * `d3mIndex` column + * target column (name depends on the problem target column) + * `confidence` column + +Target column represents class/label. For one input sample multiple output rows should be made (matching in `d3mIndex` value), +for **all** possible class/label values, each one row. The `confidence` value should then be confidence of classification +into that class/label. + +| d3mIndex | label | confidence | +|----------|-------|------------| +| 640 | 0 | 0.612 | +| 640 | 1 | 0.188 | +| 640 | 2 | 0.2 | +| 641 | 0 | 0.4 | +| 641 | 1 | 0.25 | +| 641 | 2 | 0.35 | +| 642 | 0 | 1.0 | +| 642 | 1 | 0.0 | +| 642 | 2 | 0.0 | +| 643 | 0 | 0.52 | +| 643 | 1 | 0.38 | +| 643 | 2 | 0.1 | +| 644 | 0 | 0.3 | +| 644 | 1 | 0.2 | +| 644 | 2 | 0.5 | +| 645 | 0 | 0.1 | +| 645 | 1 | 0.2 | +| 645 | 2 | 0.7 | +| 646 | 0 | 1.0 | +| 646 | 1 | 0.0 | +| 646 | 2 | 0.0 | + +### `hitsAtK`, `meanReciprocalRank` +`hitsAtK` and `meanReciprocalRank`metrics require the following structure: +* `d3mIndex` column +* target column (name depends on the problem target column) +* `rank` column + +`rank` is a reserved keyword. Target column represents class/label. For one input sample multiple output rows should be made (matching in `d3mIndex` value). Unlike `rocAuc` this need not have all possible class/label values. Example: + +learningData.csv: + +|d3mIndex |subject |object |relationship (target)| +|------------|--------|------------|---------------------| +|0 |James |John |father | +|1 |John |Patricia |sister | +|2 |Robert |Thomas |brother | +|... |... |... |... | +|... |... |... |... 
| + +ground truth (coming from learningData.csv): + +|d3mIndex |relationship | +|------------|-------------| +|0 |father | +|1 |sister | +|2 |brother | + +predictions.csv: + +|d3mIndex |relationships |rank | +|------------|----------------|-----| +|0 |brother |1 | +|0 |cousin |2 | +|0 |mother |3 | +|0 |father |4* | +|0 |grandfather |5 | +|1 |sister |1* | +|1 |mother |2 | +|1 |aunt |3 | +|2 |father |1 | +|2 |brother |2* | +|2 |sister |3 | +|2 |grandfather |4 | +|2 |aunt |5 | + +Note that the rank vector = [4,1,2] + +MRR = sum(1/ranks)/len(ranks) = 0.58333 + +Hits@3 = 2/3 = 0.666666; Hits@1 = 1/3 = 0.3333333; Hits@5 = 3/3 = 1.0 + + +### Other metrics, multi-label + +Output has for each input sample multiple output rows (matching in `d3mIndex` value), with `d3mIndex` column and a column for predicted labels. +Only predicted labels should be listed in output rows. + +| d3mIndex | label | +|----------|-------| +| 640 | 0 | +| 640 | 1 | +| 641 | 0 | +| 642 | 0 | +| 642 | 1 | +| 642 | 2 | +| 643 | 1 | +| 644 | 1 | +| 645 | 0 | +| 645 | 2 | +| 646 | 2 | + +### Other metrics + +For each input sample one output row should be provided (matching in `d3mIndex` value), with `d3mIndex` column and a target column for the prediction, +or multiple target columns in `multivariate` case. +Target column names depend on problem target columns. + +| d3mIndex | class | +|----------|-------| +| 640 | 0 | +| 641 | 0 | +| 642 | 2 | +| 643 | 1 | +| 644 | 1 | +| 645 | 0 | +| 646 | 2 | + +__Notes__ +- A target column is represented in the predictions file by its **colName**, which is specified in problemDoc.json under **inputs**->**data**->**targets**. +- We are interested in predictions for 'TEST' data points only. +- Typically, a predictions file will contain only one target column. If a given problem has multiple independent targets, it will be presented as separate problems, each with a single target. A simple test if multiple targets can be listed in one problem: is there only one confidence computed for those targets. +- If a dataset has multiple targets that are related, then the predictions file will have additional target columns. For example, see [object detection](#object-detection-task). +- Predictions can optionally have one or more of the below listed special columns which are required to compute certain metrics (e.g., `rocAuc` requires `confidence` and `meanReciprocalRank` requires `rank`). Besides `d3mIndex`, the following are reserved column names in predictions: + - `confidence` + - `rank` + +## Scores file + +A scores file can be another output from a solution pipeline when the ground truth for test data is known. Hence, this output is optional. Problem schema already specifies the given metrics under `performanceMetrics` under `inputs` section. +The standard scores.csv file can contain the following columns: `[metric, value, normalized, randomSeed, fold]`, with `normalized` (a normalized score into the range [0, 1] where higher is better), +`randomSeed` (random seed value used to run the solution pipeline), and `fold` (a 0-based index of the fold) being optional. `randomSeed` is the main seed used by a solution pipeline. +`metric` is the name of the metric as defined in `performanceMetrics` or equivalent (like d3m core package enumeration). + +The baselines systems produced by MIT-LL will produce both predictions file and scores file. + +Old scores file format with columns `[index, metric, value]` has been deprecated, but you might still see it around. 
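
To make the rank-based metrics above concrete, here is a minimal, unofficial sketch of how `meanReciprocalRank` and `hitsAtK` could be computed from a predictions file shaped like the example tables above. The column names and values are taken from that example; this is not the official scorer (the official metric implementations live in the d3m core tooling), just an illustration of the arithmetic.
```
import pandas as pd

# Ground truth and predictions follow the relationship example above.
ground_truth = pd.DataFrame({
    "d3mIndex": [0, 1, 2],
    "relationship": ["father", "sister", "brother"],
})
predictions = pd.DataFrame({
    "d3mIndex":     [0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 2, 2],
    "relationship": ["brother", "cousin", "mother", "father", "grandfather",
                     "sister", "mother", "aunt",
                     "father", "brother", "sister", "grandfather", "aunt"],
    "rank":         [1, 2, 3, 4, 5, 1, 2, 3, 1, 2, 3, 4, 5],
})

# Keep only the rows where the predicted label matches the ground truth,
# which yields the rank of the true label per d3mIndex: [4, 1, 2].
merged = predictions.merge(ground_truth, on=["d3mIndex", "relationship"])
ranks = merged.sort_values("d3mIndex")["rank"].to_numpy()

mrr = (1.0 / ranks).mean()        # sum(1/ranks)/len(ranks) = 0.58333...
hits_at_3 = (ranks <= 3).mean()   # fraction of true labels in the top 3 = 0.66666...
print(mrr, hits_at_3)
```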
# Data Augmentation
The "dataAugmentation" field provides information about external sources of data that can be used to address the challenge of data augmentation. It is a list of dictionaries, each item corresponding to one external source. Each dictionary for an external source contains two fields: domain and keywords. "domain" captures the application domain(s) of an external dataset (e.g., government, census, economics) and "keywords" capture additional tags that help narrow the search (e.g., housing, household income).

# Appendix


## Link Prediction
Given an input graph/multiGraph, link prediction involves predicting future possible links in the network or predicting missing links due to incomplete data in the network.


## Vertex Nomination
Given an input graph, vertex nomination involves learning a model that, to each observed node in a graph, assigns a prioritized nomination list.


## Vertex Classification
Given an input (possibly node-attributed) graph, vertex classification involves assigning labels/classes to unknown nodes in the network based on the classes (and attributes) of known nodes and the network structure. Similar to the traditional classification task type, vertex classification can be of subtype binary, multiclass, or multilabel.


## Community Detection
Given an input graph/network possibly with a community structure, community detection involves grouping nodes into sets of (overlapping or non-overlapping) groups/clusters that reflect the community structure.


## Graph Matching
Given two input graphs, the task here is to find an approximate matching between these two graphs based on their structural properties. We assume that a partial correspondence between graphs is given. Given two other nodes not specified in the partial correspondence, the model has to predict if they match/correspond or not.


## Semi-supervised Classification
This task requires learning classification models from both labeled and unlabeled data. The structure of this task is the same as the standard classification task, except that the target column will only have a small number of the labels provided. The rest of the values in the target column will be empty. Similar to supervised classification, semi-supervised classification can be of subtype binary, multiclass, or multilabel.

## Semi-supervised Regression
This task requires learning regression models from both labeled and unlabeled data. The structure of this task is the same as the standard regression task, except that the target column will only have a small number of the labels provided. The rest of the values in the target column will be empty. Similar to supervised regression, semi-supervised regression can be of subtype univariate or multivariate.


## Learning Using Privileged Information
This is a special case of classification and regression problems where the training set has certain extra features called privileged features and the test set does not have those features. The models have to deal with feature mismatch across the train/test split and take advantage of the extra features available at train time.


## Grouped time series forecasting
Adding the "grouped" qualifier to a time-series forecasting task implies that the data is grouped hierarchically. For example, the total monthly sales data of a bike manufacturer might be organized per "bike-model" per "region". In other words, we get individual time series of monthly sales if the whole table is grouped by both ("region", "bike-model"). 
The metadata of the dataset in datasetDoc.json will have "suggestedGroupingKey" role set on those columns that would serve as a grouping key. + + +## Object Detection Task +The typical structure of an objectDetection task is as follows: +``` +LL1_some_object_detection_task / + |-- LL1_some_object_detection_task_dataset / + |-- tables / + |-- learningData.csv + |-- media / + |-- img01.jpg + |-- img02.jpg + |-- img03.jpg + |-- img04.jpg + ... + |-- datasetDoc.json + |-- LL1_some_object_detection_task_problem / + |-- problemDoc.json + |-- dataSplits.csv + |-- LL1_some_object_detection_task_solution / + |-- src / + |-- predictions.csv + |-- scores.csv + +``` +### learningData.csv + +An example learningData: + +| d3mIndex | image | someOtherCol1 | someOtherCol2 | ...| class |bounding_box| +|-----------|--------|---------------|---------------|----|-------|------------| +|0|img01.jpg|someVal|someVal|...| dog |"10,10,60,11,60,100,10,100"| +|0|img01.jpg|someVal|someVal|...| dog |"10,20,60,31,60,110,10,110"| +|0|img01.jpg|someVal|someVal|...| tree| "20,10,70,11,70,100,20,100"| +|...|...|...|...|...|...|...|...|...|...| +|2|img03.jpg|someVal|someVal|...| cat |"50,6,100,13,100,131,50,54"| +|2|img03.jpg|someVal|someVal|...| car |"70,6,120,13,120,131,70,54"| +|...|...|...|...|...|...|...|...|...|...| + +__Notes__ + +- Each row corresponds to a bounding box in an image +- There is a one-to-one correspondence between d3mIndex and an image. Therefore, d3mIndex is a multiIndex as it can have non-unique values as shown above. +- For this type of the datasets, the target column is a bounding polygon (named "bounding_box" in the above example). +- The type of the bounding_box column is "realVector" and captures 4 vertices using 8 coordinate values. +- The role of the bounding_box column is "boundingPolygon". +- Object detection dataset and problem can have a "class" target column. This column identifies which object is present inside the bounding polygon, when there are multiple classes of objects can be detected. +- The role of the class cloumn is suggestedTarget, indicating that model should output a class associated with a bounding box. Again if this is a case of a single class, all the output values will be the same. +- Typically, the metric that is used for this problem type requires that a confidence value be output along with the prediction. + +### datasetDoc.json + +An example datasetDoc: + +``` +... + + "dataResources": [ + { + "resID": "0", + "resPath": "media/", + "resType": "image", + "resFormat": [ + "image/png" + ], + "isCollection": true + }, + { + "resID": "learningData", + "resPath": "tables/learningData.csv", + "resType": "table", + "resFormat": [ + "text/csv" + ], + "isCollection": false, + "columns": [ + { + "colIndex": 0, + "colName": "d3mIndex", + "colType": "integer", + "role": [ + "multiIndex" + ] + }, + { + "colIndex": 1, + "colName": "image", + "colType": "string", + "role": [ + "index" + ], + "refersTo": { + "resID": "0", + "resObject": "item" + } + }, + { + "colIndex": 2, + "colName": "class", + "colType": "string", + "role": [ + "suggestedTarget" + ] + }, + { + "colIndex": 3, + "colName": "bounding_box", + "colType": "realVector", + "role": [ + "suggestedTarget", + "boundingPolygon" + ], + "refersTo": { + "resID": "learningData", + "resObject": { + "columnName": "image" + } + } + } + ] + } +... +``` + +# FAQ + +#### Why did we include repeat and fold columns in dataSplits? + +A number of datasets come with hold-out CV and k-fold CV splits provided. 
To leverage these splits and not create our own, we have included fold and repeat columns. + +#### How is a problem with multiple target variables handled? + +If the targets are independent, then we will create multiple problems, one for each target. On the other hand, if the targets are joint, he "targets" field in problemSchema will contain a list of dictionaries, one for each target description in the problem. An example of joint multi-target is the object detection problem where the output is both a class (e.g., dog) and a bounding polygon. + +``` +"targets": [ + { + "targetIndex": 0, + "resID": "1", + "colIndex": 2, + "colName": "class" + }, + { + "targetIndex": 1, + "resID": "1", + "colIndex": 3, + "colName": "bounding_box" + } + ] +``` + +#### How to submit predictions when there are multiple targets? + +The target columns are carried over in the predictions file as shown: + +|d3mIndex|class|bounding_box |confidence| +|--------|-----|---------------|----------| +|0 |dog |"..." |... | +|0 |cat |"..." |... | +|0 |cat |"..." |... | +|0 |cat |"..." |... | diff --git a/datasets/data-supply/documentation/standardValues.json b/datasets/data-supply/documentation/standardValues.json new file mode 100644 index 0000000..9f7a4cd --- /dev/null +++ b/datasets/data-supply/documentation/standardValues.json @@ -0,0 +1,137 @@ +{ + "unit": [ + "seconds", + "minutes", + "days", + "weeks", + "months", + "years", + "unspecified" + ], + "qualValueType": [ + "boolean", + "integer", + "real", + "string", + "dict" + ], + "resType": [ + "image", + "video", + "audio", + "speech", + "text", + "graph", + "edgeList", + "table", + "timeseries", + "raw" + ], + "colType": [ + "boolean", + "integer", + "real", + "string", + "categorical", + "dateTime", + "realVector", + "json", + "geojson", + "unknown" + ], + "role": [ + "index", + "multiIndex", + "key", + "attribute", + "suggestedTarget", + "timeIndicator", + "locationIndicator", + "boundaryIndicator", + "interval", + "instanceWeight", + "boundingPolygon", + "suggestedPrivilegedData", + "suggestedGroupingKey", + "edgeSource", + "directedEdgeSource", + "undirectedEdgeSource", + "multiEdgeSource", + "simpleEdgeSource", + "edgeTarget", + "directedEdgeTarget", + "undirectedEdgeTarget", + "multiEdgeTarget", + "simpleEdgeTarget" + ], + "resObject": [ + "item" + ], + "resComponent": [ + "nodes", + "edges" + ], + "taskKeywords": [ + "classification", + "regression", + "clustering", + "linkPrediction", + "vertexNomination", + "vertexClassification", + "communityDetection", + "graphMatching", + "forecasting", + "collaborativeFiltering", + "objectDetection", + "semiSupervised", + "binary", + "multiClass", + "multiLabel", + "univariate", + "multivariate", + "overlapping", + "nonOverlapping", + "tabular", + "relational", + "nested", + "image", + "audio", + "video", + "speech", + "text", + "graph", + "multiGraph", + "timeSeries", + "grouped", + "geospatial", + "remoteSensing", + "lupi", + "missingMetadata" + ], + "method": [ + "holdOut", + "kFold" + ], + "metric": [ + "accuracy", + "precision", + "recall", + "f1", + "f1Micro", + "f1Macro", + "rocAuc", + "rocAucMacro", + "rocAucMicro", + "meanSquaredError", + "rootMeanSquaredError", + "meanAbsoluteError", + "rSquared", + "normalizedMutualInformation", + "jaccardSimilarityScore", + "precisionAtTopK", + "objectDetectionAP", + "hammingLoss", + "hitsAtK", + "meanReciprocalRank" + ] +} diff --git a/datasets/data-supply/documentation/static/Drawing1.vsdx b/datasets/data-supply/documentation/static/Drawing1.vsdx new file mode 100644 
index 0000000..f806d5f Binary files /dev/null and b/datasets/data-supply/documentation/static/Drawing1.vsdx differ diff --git a/datasets/data-supply/documentation/static/allViews.PNG b/datasets/data-supply/documentation/static/allViews.PNG new file mode 100644 index 0000000..2fc3d8b Binary files /dev/null and b/datasets/data-supply/documentation/static/allViews.PNG differ diff --git a/datasets/data-supply/documentation/static/examples.txt b/datasets/data-supply/documentation/static/examples.txt new file mode 100644 index 0000000..11cb922 --- /dev/null +++ b/datasets/data-supply/documentation/static/examples.txt @@ -0,0 +1,149 @@ + +/ +|-- media/ + |-- img1.png + |-- img2.png + |-- img3.png + |-- img4.png +|-- tables/ + |-- learningData.csv + d3mIndex,image,label + 0,img1.png,cat + 1,img2.png,dog + 2,img3.png,dog + 3,img4.png,cat +|-- datasetDoc.json + + +/ +|-- media/ + |-- img1.png + |-- img2.png +|-- tables/ + |-- learningData.csv + d3mIndex,image,label + 0,img1.png,cat + 1,img2.png,dog +|-- datasetDoc.json + + +/ +|-- media/ + |-- img3.png + |-- img4.png +|-- tables/ + |-- learningData.csv + d3mIndex,image,label + 2,img3.png, + 3,img4.png, +|-- datasetDoc.json + + +/ +|-- problemDoc.json +|-- dataSplits.csv + d3mIndex,type,repeat,fold + 0,TRAIN,0,0 + 1,TRAIN,0,0 + 2,TEST,0,0 + 3,TEST,0,0 + + +/ +|-- problemDoc.json +|-- dataSplits.csv + d3mIndex,type,repeat,fold + 0,TRAIN,0,0 + 1,TRAIN,0,0 + +/ +|-- problemDoc.json +|-- dataSplits.csv + d3mIndex,image,type,repeat,fold + 0,img001.pngTRAIN,0,0 + 1,img001.pngTRAIN,0,0 + 2,img001.png,TRAIN,0,0 + 3,img002.png,TEST,0,0 + 4,img002.png,TEST,0,0 + 5,img003.png,TRAIN,0,0 + 6,img003.png,TRAIN,0,0 + ... + +/ +|-- problemDoc.json +|-- dataSplits.csv + d3mIndex,type,repeat,fold + 2,TEST,0,0 + 3,TEST,0,0 + + +d3mIndex,type,repeat,fold +0,TRAIN,0,0 +1,TRAIN,0,0 +2,TEST,0,0 +3,TRAIN,0,0 +4,TRAIN,0,0 +5,TEST,0,0 +6,TEST,0,0 +7,TRAIN,0,0 +8,TEST,0,0 + +d3mIndex,type,repeat,fold,0 +2,TEST,0,0,1 +5,TEST,0,0,1 +6,TEST,0,0,3 +8,TEST,0,0,2 + + +d3mIndex,type,repeat,fold,0,1,2 +12,TEST,0,0,0,0,1 +15,TEST,0,0,1,0,0 +62,TEST,0,0,1,1,0 +81,TEST,0,0,0,0,0 +87,TEST,0,0,0,1,0 + + +d3mIndex,type,repeat,fold,0,0.confidence.1,0.confidence.2,0.confidence.3 +2,TEST,0,0,1,0.5,0.25,0.25 +5,TEST,0,0,1,0.8,0.1,0.1 +6,TEST,0,0,3,0.2,0.2,0.6 +8,TEST,0,0,2,0.1,0.7,0.2 + + +d3mIndex,image,bounding_box,confidence +12,'img_00285.png','330,463,387,505',0.0739 +13,'img_00285.png','420,433,451,498',0.0910 +14,'img_00285.png','328,465,403,540',0.1008 +15,'img_00285.png','480,477,508,522',0.1012 +16,'img_00285.png','357,460,417,537',0.1058 +17,'img_00285.png','356,456,391,521',0.0843 +18,'img_00225.png','345,460,415,547',0.0539 +19,'img_00225.png','381,362,455,513',0.0542 +20,'img_00225.png','382,366,416,422',0.0559 +21,'img_00225.png','730,463,763,583',0.0588 + +d3mIndex,image,bounding_box +3,'img_00285.png','480,457,515,529' +4,'img_00285.png','480,457,515,529' +5,'img_00225.png','522,540,576,660' +6,'img_00225.png','739,460,768,545' + + +24,img00016.png,"80,86,205,383",0.0739 +25,img00016.png,"279,94,400,361",0.0918 +26,img00016.png,"411,101,495,378",0.1008 +27,img00017.png,"115,48,244,332",0.1058 +28,img00018.png,"20,19,126,304",0.0843 +29,img00018.png,"7,135,142,389",0.0539 +30,img00018.png,"194,123,339,421",0.0542 +31,img00028.png,"339,99,508,381",0.0559 + + +d3mIndex,image,bounding_box +12,img00016.png,"80,86,205,383" +13,img00016.png,"279,94,400,361" +14,img00017.png,"115,48,244,332" +15,img00018.png,"20,19,126,304" +16,img00019.png,"7,135,142,389" 
+17,img00019.png,"194,123,339,421" +17,img00020.png,"339,99,508,381" diff --git a/datasets/data-supply/documentation/static/objDetection_scoring_GT.PNG b/datasets/data-supply/documentation/static/objDetection_scoring_GT.PNG new file mode 100644 index 0000000..4b7f543 Binary files /dev/null and b/datasets/data-supply/documentation/static/objDetection_scoring_GT.PNG differ diff --git a/datasets/data-supply/documentation/static/objDetection_scoring_PRED.PNG b/datasets/data-supply/documentation/static/objDetection_scoring_PRED.PNG new file mode 100644 index 0000000..e818660 Binary files /dev/null and b/datasets/data-supply/documentation/static/objDetection_scoring_PRED.PNG differ diff --git a/datasets/data-supply/documentation/static/sampleDataSplitsFile.PNG b/datasets/data-supply/documentation/static/sampleDataSplitsFile.PNG new file mode 100644 index 0000000..5650723 Binary files /dev/null and b/datasets/data-supply/documentation/static/sampleDataSplitsFile.PNG differ diff --git a/datasets/data-supply/documentation/static/sampleDataset.PNG b/datasets/data-supply/documentation/static/sampleDataset.PNG new file mode 100644 index 0000000..cb18ff2 Binary files /dev/null and b/datasets/data-supply/documentation/static/sampleDataset.PNG differ diff --git a/datasets/data-supply/documentation/static/sampleProblem.PNG b/datasets/data-supply/documentation/static/sampleProblem.PNG new file mode 100644 index 0000000..6ece324 Binary files /dev/null and b/datasets/data-supply/documentation/static/sampleProblem.PNG differ diff --git a/datasets/data-supply/documentation/static/sampleProblemTestView.PNG b/datasets/data-supply/documentation/static/sampleProblemTestView.PNG new file mode 100644 index 0000000..45edfd0 Binary files /dev/null and b/datasets/data-supply/documentation/static/sampleProblemTestView.PNG differ diff --git a/datasets/data-supply/documentation/static/sampleProblemTrainView.PNG b/datasets/data-supply/documentation/static/sampleProblemTrainView.PNG new file mode 100644 index 0000000..548c05e Binary files /dev/null and b/datasets/data-supply/documentation/static/sampleProblemTrainView.PNG differ diff --git a/datasets/data-supply/documentation/static/sampleProblem_objectDetection.PNG b/datasets/data-supply/documentation/static/sampleProblem_objectDetection.PNG new file mode 100644 index 0000000..1576078 Binary files /dev/null and b/datasets/data-supply/documentation/static/sampleProblem_objectDetection.PNG differ diff --git a/datasets/data-supply/documentation/static/sampleSupply.PNG b/datasets/data-supply/documentation/static/sampleSupply.PNG new file mode 100644 index 0000000..d72e687 Binary files /dev/null and b/datasets/data-supply/documentation/static/sampleSupply.PNG differ diff --git a/datasets/data-supply/documentation/static/sampleTestView.PNG b/datasets/data-supply/documentation/static/sampleTestView.PNG new file mode 100644 index 0000000..9b5e200 Binary files /dev/null and b/datasets/data-supply/documentation/static/sampleTestView.PNG differ diff --git a/datasets/data-supply/documentation/static/sampleTrainView.PNG b/datasets/data-supply/documentation/static/sampleTrainView.PNG new file mode 100644 index 0000000..0a9b672 Binary files /dev/null and b/datasets/data-supply/documentation/static/sampleTrainView.PNG differ diff --git a/datasets/data-supply/documentation/static/schema fields spreadsheet.xlsx b/datasets/data-supply/documentation/static/schema fields spreadsheet.xlsx new file mode 100644 index 0000000..c5b4fc2 Binary files /dev/null and 
b/datasets/data-supply/documentation/static/schema fields spreadsheet.xlsx differ diff --git a/datasets/data-supply/documentation/static/testView.PNG b/datasets/data-supply/documentation/static/testView.PNG new file mode 100644 index 0000000..209d053 Binary files /dev/null and b/datasets/data-supply/documentation/static/testView.PNG differ diff --git a/datasets/data-supply/documentation/static/trainView.PNG b/datasets/data-supply/documentation/static/trainView.PNG new file mode 100644 index 0000000..8a08a8d Binary files /dev/null and b/datasets/data-supply/documentation/static/trainView.PNG differ diff --git a/datasets/data-supply/documentation/supportedResourceTypesFormats.json b/datasets/data-supply/documentation/supportedResourceTypesFormats.json new file mode 100644 index 0000000..5a8b9ae --- /dev/null +++ b/datasets/data-supply/documentation/supportedResourceTypesFormats.json @@ -0,0 +1,60 @@ +{ + "datasets_release":"3.0", + "supported_resource_types_and_formats":[ + { + "resType":"audio", + "resFormat":{ + "audio/aiff":["aif", "aiff"], + "audio/flac":["flac"], + "audio/ogg":["ogg"], + "audio/wav":["wav"], + "audio/mpeg":["mp3"] + } + }, + { + "resType":"image", + "resFormat":{ + "image/jpeg":["jpeg", "jpg"], + "image/png":["png"] + } + }, + { + "resType":"video", + "resFormat":{ + "video/mp4": ["mp4"], + "video/avi": ["avi"] + } + }, + { + "resType":"table", + "resFormat":{ + "text/csv": ["csv"], + "text/csv+gzip": ["csv.gz"] + } + }, + { + "resType":"text", + "resFormat":{ + "text/plain": ["txt"] + } + }, + { + "resType":"graph", + "resFormat":{ + "text/vnd.gml": ["gml"] + } + }, + { + "resType":"edgeList", + "resFormat":{ + "text/csv": ["csv"] + } + }, + { + "resType":"timeseries", + "resFormat":{ + "text/csv": ["csv"] + } + } + ] +} \ No newline at end of file diff --git a/datasets/data-supply/schemas/README.md b/datasets/data-supply/schemas/README.md new file mode 100644 index 0000000..99cbc10 --- /dev/null +++ b/datasets/data-supply/schemas/README.md @@ -0,0 +1,5 @@ +# D3M Schemas + +Please see the contents of [documentation/](../documentation/) for details. + +Schemas are described using [Cerberus](https://docs.python-cerberus.org/en/stable/). 
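
Because the schemas are expressed as Cerberus rules, a dataset or problem description can be checked programmatically. Below is a minimal sketch of such a check; the file paths are illustrative, and the bundled `datasets/validate.py` script (further below) performs this validation along with many additional consistency checks.
```
import json

import cerberus

# Load the Cerberus schema for problem descriptions.
# Adjust the paths to your local checkout of the data-supply repository.
with open("datasets/data-supply/schemas/problemSchema.json") as schema_file:
    problem_validator = cerberus.Validator(json.load(schema_file))

# Load a problem description to validate (path is illustrative).
with open("path/to/problem_TRAIN/problemDoc.json") as doc_file:
    problem_doc = json.load(doc_file)

if problem_validator.validate(problem_doc):
    print("problemDoc.json matches the schema.")
else:
    print("Validation errors:", problem_validator.errors)
```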
diff --git a/datasets/data-supply/schemas/datasetSchema.json b/datasets/data-supply/schemas/datasetSchema.json new file mode 100644 index 0000000..711e359 --- /dev/null +++ b/datasets/data-supply/schemas/datasetSchema.json @@ -0,0 +1,74 @@ +{ + "about": {"type":"dict", "required":true, "allow_unknown":true, "schema": { + "datasetID":{"type":"string", "required":true, "empty": false}, + "datasetName":{"type":"string", "required":true, "empty": false}, + "datasetURI":{"type":"string","required":false}, + "description":{"type":"string","required":false}, + "citation":{"type":"string","required":false}, + "publicationDate":{"type":"string","required":false}, + "humanSubjectsResearch": {"type":"boolean", "required":false}, + "license":{"type":"string", "required":false}, + "source":{"type":"string", "required":false}, + "sourceURI":{"type":"string", "required":false}, + "approximateSize":{"type":"string", "required":false}, + "applicationDomain":{"type":"string","required":false}, + "datasetVersion":{"type":"string", "required":true}, + "datasetSchemaVersion":{"type":"string", "required":true}, + "redacted":{"type":"boolean", "required":false}, + "digest":{"type":"string", "required":false} + }}, + + "dataResources":{"type":"list","required":true,"schema":{"type":"dict","required": true,"allow_unknown":true,"schema":{ + "resID":{"type":"string","required":true}, + "resPath":{"type":"string","required":true}, + "resType":{"type":"string","required":true}, + "resFormat":{"type":"dict", "required":true, "keysrules": {"type":"string"}, "valuesrules": {"type":"list"}}, + "isCollection":{"type":"boolean","required":false,"default":false}, + "columnsCount": {"type":"integer","required":false}, + "columns": { + "type":"list", + "required": false, + "schema": {"type":"dict", "required": true, "allow_unknown":true, "schema": { + "colIndex":{"type":"integer", "required":true}, + "colName":{"type":"string", "required":true, "empty":false}, + "colDescription":{"type":"string", "required":false}, + "colType":{"required":true, "type":"string"}, + "role":{"type":"list","required":true,"schema":{"type":"string"}}, + "refersTo":{"type": "dict", "required":false, "allow_unknown":true, "schema":{ + "resID":{"type":"string","required":true}, + "resObject":{"required":true, "oneof":[ + {"type":"string"}, + {"type":"dict", "allow_unknown":true, "schema":{ + "nodeAttribute":{"type":"string","excludes":["edgeAttribute","columnIndex","columnName"]}, + "edgeAttribute":{"type":"string","excludes":["nodeAttribute","columnIndex","columnName"]}, + "columnIndex":{"type":"integer","excludes":["nodeAttribute","edgeAttribute","columnName"]}, + "columnName":{"type":"string","excludes":["nodeAttribute","edgeAttribute","columnIndex"]}}} + ]} + }}, + "timeGranularity":{"type":"dict", "required":false, "allow_unknown":true, "schema":{ + "value":{"type":"number", "required":true}, + "unit":{"type":"string", "required":true} + }} + }} + }}}}, + + "qualities":{"type":"list","required":false,"schema":{"type":"dict","required":true,"allow_unknown":true,"schema":{ + "qualName":{"type":"string","required":true}, + "qualValue":{"required":true}, + "qualValueType":{"type":"string","required":true}, + "qualValueUnits":{"type":"string","required":false}, + "restrictedTo":{"type":"dict","required":false,"allow_unknown":true,"schema":{ + "resID":{"type":"string","required":true}, + "resComponent":{"oneof":[ + {"type":"dict","required":false,"allow_unknown":true,"schema":{ + 
"columnIndex":{"type":"integer","excludes":["columnName","nodeAttribute","edgeAttribute","selector"]}, + "columnName":{"type":"string","excludes":["columnIndex","nodeAttribute","edgeAttribute","selector"]}, + "nodeAttribute":{"type":"string","excludes":["columnIndex","columnName","edgeAttribute","selector"]}, + "edgeAttribute":{"type":"string","excludes":["columnIndex","columnName","nodeAttribute","selector"]}, + "selector":{"type":"list","excludes":["columnIndex","columnName","nodeAttribute","edgeAttribute"]} + }}, + {"type":"string","required":false} + ]} + }} + }}} +} diff --git a/datasets/data-supply/schemas/problemSchema.json b/datasets/data-supply/schemas/problemSchema.json new file mode 100644 index 0000000..3c65329 --- /dev/null +++ b/datasets/data-supply/schemas/problemSchema.json @@ -0,0 +1,72 @@ +{ + "about":{"type":"dict", "required":true, "allow_unknown":true, "schema": { + "problemID":{"type":"string", "required":true, "empty":false}, + "problemName":{"type":"string", "required":true, "empty":false}, + "problemDescription":{"type":"string", "required":false}, + "problemURI":{"type":"string","required":false}, + "taskKeywords":{"type":"list", "required":true, "schema":{"type":"string"}}, + "problemVersion":{"type":"string", "required":true}, + "problemSchemaVersion":{"type":"string", "required":true} + }}, + "inputs":{"type":"dict","required":true,"allow_unknown":true,"schema":{ + "data":{"type":"list","required":true,"schema":{"type":"dict", "required": true, "allow_unknown":true, "schema": { + "datasetID":{"type":"string","required":true}, + "targets":{"type":"list", "required":true,"schema":{"type":"dict","required":true,"allow_unknown":true,"schema":{ + "targetIndex":{"type":"integer","required":true}, + "resID":{"type":"string","required":true}, + "colIndex":{"type":"integer", "required":true}, + "colName":{"type":"string","required":true}, + "numClusters":{"type":"integer","required":false} + }}}, + "forecastingHorizon":{"type":"dict", "required":false, "allow_unknown":true, "schema":{ + "resID":{"type":"string","required":true}, + "colIndex":{"type":"integer", "required":true}, + "colName":{"type":"string","required":true}, + "horizonValue":{"type":"number", "required":true} + }}, + "privilegedData":{"type":"list", "required":false,"schema":{"type":"dict","required":true,"allow_unknown":true,"schema":{ + "privilegedDataIndex":{"type":"integer","required":true}, + "resID":{"type":"string","required":true}, + "colIndex":{"type":"integer", "required":true}, + "colName":{"type":"string","required":true} + }}} + }}}, + "dataSplits":{"type":"dict","required":false,"allow_unknown":true,"schema":{ + "method":{"type":"string","required":false}, + "testSize":{"type":"float","required":false,"min":0.0,"max":1.0}, + "numFolds":{"type":"integer","required":false}, + "stratified":{"type":"boolean","required":false}, + "numRepeats":{"type":"integer","required":false}, + "randomSeed":{"type":"integer","required":false}, + "splitsFile":{"type":"string", "required":false}, + "splitScript":{"type":"string", "required":false}, + "datasetViewMaps":{"type":"dict","required":false,"allow_unknown":true,"schema":{ + "train":{"type":"list", "required":false, "schema":{"type":"dict", "required":true, "allow_unknown":true, "schema":{ + "from":{"type":"string", "required":true}, + "to":{"type":"string", "required":true} + }}}, + "test":{"type":"list", "required":false, "schema":{"type":"dict", "required":true, "allow_unknown":true, "schema":{ + "from":{"type":"string", "required":true}, + 
"to":{"type":"string", "required":true} + }}}, + "score":{"type":"list", "required":false, "schema":{"type":"dict", "required":true, "allow_unknown":true, "schema":{ + "from":{"type":"string", "required":true}, + "to":{"type":"string", "required":true} + }}} + }} + }}, + "performanceMetrics":{"type":"list","required":true,"schema":{"type":"dict","required":true,"allow_unknown":true,"schema":{ + "metric":{"type":"string","required":true}, + "K":{"type":"integer","required":false,"dependencies": {"metric":"precisionAtTopK"}}, + "posLabel":{"type":"string","required":false, "dependencies": {"metric":["f1","precision","recall","jaccardSimilarityScore"]}} + }}} + }}, + "expectedOutputs":{"type":"dict","required":false,"allow_unknown":true,"schema":{ + "predictionsFile": {"type":"string", "required":false, "default":"predictions.csv"}, + "scoresFile": {"type":"string", "required":false, "default":"scores.csv"} + }}, + "dataAugmentation":{"type":"list", "required":false,"schema":{"type":"dict","allow_unknown":true,"schema":{ + "domain":{"type":"list","required":false, "schema":{"type":"string"}}, + "keywords":{"type":"list", "required":false, "schema":{"type":"string"}} + }}} +} diff --git a/datasets/validate.py b/datasets/validate.py new file mode 100644 index 0000000..5f04dbc --- /dev/null +++ b/datasets/validate.py @@ -0,0 +1,1296 @@ +#!/usr/bin/env python3 +# +# This script validates that problem and dataset descriptions match +# standards and conventions (schemas, naming and directory structure, etc.). +# +# This script expects a that there is a clone of the "data-supply" +# repository in the same directory as this script. +# +# Checks done by this script: +# - Dataset description validates according to its schema. +# - Problem description validates according to its schema. +# - Dataset description filename should be "datasetDoc.json". +# - Problem description filename should be "problemDoc.json". +# - There should be no duplicate dataset IDs or problem IDs. +# - Dataset directory names should match the dataset IDs, and be under +# a matching parent directory based on that ID (where ID should +# have an expected suffix). +# - All problem descriptions for dataset views/splits should be the same. +# - Dataset splits should match in ID the original dataset based on the directory +# structure they are in, but have "TEST, "TRAIN", or "SCORE" suffix. +# - Problem descriptions should reference existing datasets and columns. +# - Dataset and problem descriptions should be (almost) equal between splits. +# - Clustering problems require numClusters in target specifications. +# - Clustering problems should not have data splitting configuration. +# - Test and train split of datasets used in clustering problems should be the same. +# - Require dataset digest. +# - Dataset entry points should have "learningData" as resource ID. +# - Problem descriptions using "f1", "precision", "recall", and "jaccardSimilarityScore" +# metrics should have only two distinct values in target columns, have "posLabel" provided, +# and that "posLabel" value should be among target values. +# - No other should have "posLabel" set. +# - "hammingLoss" metric can be used only with multi-label problems. +# - "precisionAtTopK" should be used only with forecasting. +# - Problem descriptions should have only one target, except for multi-variate +# and object detection problems which should have more than one. +# - Dataset entry point cannot be a collection. +# - Dataset entry point has to have columns metadata. 
+# - There is at most one "index" or "multiIndex" column per resource. +# - "index" and "multiIndex" cannot be set at the same time. +# - Dataset entry point is required to have an "index" or "multiIndex" column. +# - Columns cannot be both "index" and "key" at the same time. +# - Columns cannot be both "multiIndex" and "key" at the same time. +# - "index" columns have to have unique values and no missing values. +# - "multiIndex" columns have to have no missing values. +# - "key" columns have to have unique values. +# - Every metric should be listed only once in a problem description. +# - Some task keywords can be used only with corresponding task keywords. +# - All resource formats used by a resource should be from the standard list of them. +# - All files used in a collection resource should have a file extension of a resource +# format from the standard list of them. +# - Collection resource should contain at least one file. +# - Resource path of a collection resource should end with "/". +# - Any file referenced in a collection resource must exist. +# - On edgelist resources, both "edgeSource" and "edgeTarget" columns should exist in +# same resource, only one each. It should have additional two column roles for direction +# and simple/multi. Those should match between columns (so both should be directed or not, +# and simple or multi, but not mix). +# - When there is "multiIndex" column, all rows for same index value should have the same +# values in all columns except "suggestedTarget" columns. +# - Makes sure that "columnsCount" matches the number of columns, when it exists. + +import argparse +import collections +import copy +import functools +import json +import traceback +import os +import os.path +import sys + +import cerberus +import deep_dircmp +import pandas + +LIMIT_OUTPUT = 10 +EDGELIST_COLUMN_ROLES = [ + 'edgeSource', + 'directedEdgeSource', + 'undirectedEdgeSource', + 'multiEdgeSource', + 'simpleEdgeSource', + 'edgeTarget', + 'directedEdgeTarget', + 'undirectedEdgeTarget', + 'multiEdgeTarget', + 'simpleEdgeTarget', +] + +if not os.path.exists(os.path.join(os.path.dirname(__file__), 'data-supply')): + raise Exception("\"data-supply\" directory is missing. 
You should clone the repository to be in the same directory as this script.") + +with open(os.path.join(os.path.dirname(__file__), 'data-supply', 'schemas', 'datasetSchema.json')) as dataset_description_schema_file: + dataset_description_validator = cerberus.Validator(json.load(dataset_description_schema_file)) + +with open(os.path.join(os.path.dirname(__file__), 'data-supply', 'schemas', 'problemSchema.json')) as problem_description_schema_file: + problem_description_validator = cerberus.Validator(json.load(problem_description_schema_file)) + +with open(os.path.join(os.path.dirname(__file__), 'data-supply', 'documentation', 'supportedResourceTypesFormats.json')) as supported_resource_types_formats_file: + supported_resource_types_formats = json.load(supported_resource_types_formats_file) + res_format_to_extensions = {} + for supported_resource in supported_resource_types_formats['supported_resource_types_and_formats']: + for res_format, extensions in supported_resource['resFormat'].items(): + if res_format not in res_format_to_extensions: + res_format_to_extensions[res_format] = sorted(set(extensions)) + else: + res_format_to_extensions[res_format] = sorted(set(extensions) | set(res_format_to_extensions[res_format])) + + +@functools.lru_cache(maxsize=10) +def read_csv(data_path): + return pandas.read_csv( + data_path, + # We do not want to do any conversion of values. + dtype=str, + # We always expect one row header. + header=0, + # We want empty strings and not NaNs. + na_filter=False, + encoding='utf8', + ) + + +def validate_dataset_path(description_id, description_path, *, strict_naming=True): + if os.path.basename(description_path) != 'datasetDoc.json': + print("ERROR: Dataset description filename is not 'datasetDoc.json'.") + return True + + if strict_naming: + split_path = os.path.dirname(description_path).split(os.sep) + for suffix in ['_dataset_TEST', '_dataset_TRAIN', '_dataset_SCORE']: + if description_id.endswith(suffix): + expected_paths = [[description_id[:-len(suffix)], suffix[len('_dataset_'):], suffix[1:]]] + + # A special case, SCORE dataset/problem can be in TEST directory. 
+ if suffix == '_dataset_SCORE': + expected_paths.append([description_id[:-len(suffix)], suffix[len('_dataset_'):], 'dataset_TEST']) + + if split_path[-3:] not in expected_paths: + print("ERROR: Dataset directory path {directory_path} does not match any of expected paths: {expected_paths}".format( + directory_path=split_path[-3:], + expected_paths=', '.join(str(expected_path) for expected_path in expected_paths), + )) + return True + + break + else: + if not description_id.endswith('_dataset'): + print("ERROR: Dataset ID does not end with allowed suffix: {description_id}".format( + description_id=description_id, + )) + return True + + expected_path = [description_id[:-len('_dataset')], description_id] + + if split_path[-2:] != expected_path: + print("ERROR: Dataset directory path {directory_path} does not match expected path: {expected_path}".format( + directory_path=split_path[-2:], + expected_path=expected_path, + )) + return True + + return False + + +def validate_metrics(problem_description): + error = False + + existing_metrics = set() + for metric in problem_description.get('inputs', {}).get('performanceMetrics', []): + if metric['metric'] in ['f1', 'precision', 'recall', 'jaccardSimilarityScore']: + if 'posLabel' not in metric: + print("ERROR: Problem uses '{metric}' metric, but 'posLabel' is not provided.".format( + metric=metric['metric'], + )) + error = True + if set(problem_description['about']['taskKeywords']) & {'multiClass', 'multiLabel'}: + print("ERROR: Problem uses '{metric}' metric, but it is a multi-class or a multi-label problem.".format( + metric=metric['metric'], + )) + error = True + elif 'posLabel' in metric: + print("ERROR: Problem does not use 'f1', 'precision', 'recall', or 'jaccardSimilarityScore' metric, but 'posLabel' is provided.".format( + metric=metric['metric'], + )) + error = True + + if metric['metric'] == 'hammingLoss' and 'multiLabel' not in set(problem_description['about']['taskKeywords']): + print("ERROR: Problem uses 'hammingLoss' metric, but it is not a multi-label problem.") + error = True + + if metric['metric'] == 'precisionAtTopK' and 'forecasting' not in set(problem_description['about']['taskKeywords']): + print("ERROR: Problem uses 'precisionAtTopK' metric, but it is not forecasting problem.") + error = True + + if metric['metric'] in existing_metrics: + print("ERROR: Problem uses same metric '{metric}' multiple times.".format(metric=metric['metric'])) + error = True + existing_metrics.add(metric['metric']) + + return error + + +def validate_keywords(problem_description): + task_keywords = set(problem_description['about']['taskKeywords']) + + targets_number = 0 + for data in problem_description.get('inputs', {}).get('data', []): + targets_number += len(data.get('targets', [])) + + if 'regression' in task_keywords and 'multivariate' in task_keywords: + if targets_number < 2: + print("ERROR: Problem is a multi-variate problem, but it does not have more than 1 target.") + return True + elif 'objectDetection' in task_keywords: + if targets_number != 1 and targets_number != 2: + print("ERROR: Problem is an object detection problem, but it does not have 1 or 2 targets.") + return True + elif targets_number != 1: + print("ERROR: Problem has more than 1 target.") + return True + + if task_keywords & {'binary', 'multiClass', 'multiLabel'} and not task_keywords & {'classification', 'vertexClassification'}: + print("ERROR: Invalid combination of problem's keywords: {task_keywords}".format( + task_keywords=task_keywords, + )) + return True + if 
task_keywords & {'classification', 'vertexClassification'} and not task_keywords & {'binary', 'multiClass', 'multiLabel'}: + print("ERROR: Invalid combination of problem's keywords: {task_keywords}".format( + task_keywords=task_keywords, + )) + return True + + if task_keywords & {'univariate', 'multivariate'} and 'regression' not in task_keywords: + print("ERROR: Invalid combination of problem's keywords: {task_keywords}".format( + task_keywords=task_keywords, + )) + return True + if 'regression' in task_keywords and not task_keywords & {'univariate', 'multivariate'}: + print("ERROR: Invalid combination of problem's keywords: {task_keywords}".format( + task_keywords=task_keywords, + )) + return True + + if task_keywords & {'overlapping', 'nonOverlapping'} and not task_keywords & {'clustering', 'communityDetection'}: + print("ERROR: Invalid combination of problem's keywords: {task_keywords}".format( + task_keywords=task_keywords, + )) + return True + if task_keywords & {'clustering', 'communityDetection'} and not task_keywords & {'overlapping', 'nonOverlapping'}: + print("ERROR: Invalid combination of problem's keywords: {task_keywords}".format( + task_keywords=task_keywords, + )) + return True + + return False + + +def validate_files(dataset_description_path, data_resource, dataset_description, column_index, collection_resource_id): + for collection_data_resource in dataset_description['dataResources']: + if collection_data_resource['resID'] == collection_resource_id: + break + else: + print("ERROR: Dataset '{dataset_path}' has a resource '{resource_id}' referencing with column {column_index} a collection resource '{collection_resource_id}', but the resource does not exixt.".format( + dataset_path=dataset_description_path, + resource_id=data_resource['resID'], + column_index=column_index, + collection_resource_id=collection_resource_id, + )) + # We cannot do much more here. + return True + + if not collection_data_resource.get('isCollection', False): + print("ERROR: Dataset '{dataset_path}' has a resource '{resource_id}' referencing with column {column_index} a collection resource '{collection_resource_id}', but the resource is not a collection.".format( + dataset_path=dataset_description_path, + resource_id=data_resource['resID'], + column_index=column_index, + collection_resource_id=collection_resource_id, + )) + # We cannot do much more here. 
+ return True + + error = False + + data_path = os.path.join(os.path.dirname(dataset_description_path), data_resource['resPath']) + + data = read_csv(data_path) + + collection_dir = os.path.join(os.path.dirname(dataset_description_path), collection_data_resource['resPath']) + + count = 0 + for filename in data.iloc[:, column_index]: + filepath = os.path.join(collection_dir, filename) + + if not os.path.isfile(filepath): + count += 1 + + print("ERROR: Dataset '{dataset_path}' has a resource '{resource_id}' referencing with column {column_index} a file in a collection resource '{collection_resource_id}', but the file does not exist: {filename}".format( + dataset_path=dataset_description_path, + resource_id=data_resource['resID'], + column_index=column_index, + collection_resource_id=collection_resource_id, + filename=filename, + )) + error = True + + if LIMIT_OUTPUT is not None and count > LIMIT_OUTPUT: + break + + return error + + +def validate_collection(dataset_description_path, data_resource): + error = False + + if not data_resource['resPath'].endswith('/'): + print("ERROR: Dataset '{dataset_path}' has a collection resource '{resource_id}' where resource path is not ending with '/': {res_path}".format( + dataset_path=dataset_description_path, + resource_id=data_resource['resID'], + res_path=data_resource['resPath'], + )) + error = True + + allowed_file_extensions = set() + for res_format, extensions in data_resource['resFormat'].items(): + unsupported_extensions = set(extensions) - set(res_format_to_extensions[res_format]) + if unsupported_extensions: + print("ERROR: Dataset '{dataset_path}' has a collection resource '{resource_id}' and resource format '{res_format}' with unsupported extensions: {unsupported_extensions}".format( + dataset_path=dataset_description_path, + resource_id=data_resource['resID'], + res_format=res_format, + unsupported_extensions=sorted(unsupported_extensions), + )) + error = True + allowed_file_extensions.update(extensions) + + collection_dir = os.path.join(os.path.dirname(dataset_description_path), data_resource['resPath']) + is_empty = True + count = 0 + for dirpath, dirnames, filenames in os.walk(collection_dir): + for filename in filenames: + is_empty = False + + filepath = os.path.join(dirpath, filename) + + file_extension = get_file_extension(filepath) + if file_extension not in allowed_file_extensions: + count += 1 + + print("ERROR: Dataset '{dataset_path}' has a collection resource '{resource_id}' with a file with unsupported file extension: {filepath}".format( + dataset_path=dataset_description_path, + resource_id=data_resource['resID'], + filepath=filepath, + )) + error = True + + if LIMIT_OUTPUT is not None and count > LIMIT_OUTPUT: + break + + if LIMIT_OUTPUT is not None and count > LIMIT_OUTPUT: + break + + if is_empty: + print("ERROR: Dataset '{dataset_path}' has a collection resource '{resource_id}' without any files.".format( + dataset_path=dataset_description_path, + resource_id=data_resource['resID'], + )) + error = True + + return error + + +def validate_multi_index(dataset_description_path, data_resource, multi_index_column): + error = False + + suggested_target_columns = [] + for column_description in data_resource['columns']: + if 'suggestedTarget' in column_description['role']: + suggested_target_columns.append(column_description['colIndex']) + + data_path = os.path.join(os.path.dirname(dataset_description_path), data_resource['resPath']) + + data = read_csv(data_path) + + attribute_columns = [column_index for column_index in 
range(len(data.columns)) if column_index != multi_index_column and column_index not in suggested_target_columns] + attributes = data.iloc[:, attribute_columns].set_index(data.iloc[:, multi_index_column]) + + count = 0 + for group_name, group in attributes.groupby(level=0): + # The first row in a group is not marked, so we add 1 to number of duplicated rows. + if group.duplicated(keep='first').sum() + 1 != len(group): + count += 1 + + print("ERROR: Dataset '{dataset_path}' has a multi-index resource '{resource_id}' with all attributes in rows not equal for index value '{value}'.".format( + dataset_path=dataset_description_path, + resource_id=data_resource['resID'], + value=group_name, + )) + error = True + + if LIMIT_OUTPUT is not None and count > LIMIT_OUTPUT: + break + + return error + + +def validate_edgelist(dataset_description_path, data_resource): + error = False + + found_source = False + is_directed_source = None + is_multi_source = None + found_target = False + is_directed_target = None + is_multi_target = None + for column_description in data_resource['columns']: + if 'edgeSource' in column_description['role']: + # We have to check this only here or only in "edgeTarget" case. We check it here. + if 'edgeTarget' in column_description['role']: + print("ERROR: Dataset '{dataset_path}' has a edgelist resource '{resource_id}' with conflicting source vs. target column roles.".format( + dataset_path=dataset_description_path, + resource_id=data_resource['resID'], + )) + error = True + + if found_source: + print("ERROR: Dataset '{dataset_path}' has a edgelist resource '{resource_id}' with multiple edge source columns.".format( + dataset_path=dataset_description_path, + resource_id=data_resource['resID'], + )) + error = True + continue + found_source = True + + if 'multiEdgeSource' in column_description['role']: + if is_multi_source is None: + is_multi_source = True + elif is_multi_source != True: + print("ERROR: Dataset '{dataset_path}' has a edgelist resource '{resource_id}' with conflicting multi vs. simple column roles.".format( + dataset_path=dataset_description_path, + resource_id=data_resource['resID'], + )) + error = True + + if 'simpleEdgeSource' in column_description['role']: + if is_multi_source is None: + is_multi_source = False + elif is_multi_source != False: + print("ERROR: Dataset '{dataset_path}' has a edgelist resource '{resource_id}' with conflicting multi vs. simple column roles.".format( + dataset_path=dataset_description_path, + resource_id=data_resource['resID'], + )) + error = True + + if is_multi_source is None: + print("ERROR: Dataset '{dataset_path}' has a edgelist resource '{resource_id}' with missing multi vs. simple column role.".format( + dataset_path=dataset_description_path, + resource_id=data_resource['resID'], + )) + error = True + + if 'directedEdgeSource' in column_description['role']: + if is_directed_source is None: + is_directed_source = True + elif is_directed_source != True: + print("ERROR: Dataset '{dataset_path}' has a edgelist resource '{resource_id}' with conflicting directed vs. undirected column roles.".format( + dataset_path=dataset_description_path, + resource_id=data_resource['resID'], + )) + error = True + + if 'undirectedEdgeSource' in column_description['role']: + if is_directed_source is None: + is_directed_source = False + elif is_directed_source != False: + print("ERROR: Dataset '{dataset_path}' has a edgelist resource '{resource_id}' with conflicting directed vs. 
undirected column roles.".format( + dataset_path=dataset_description_path, + resource_id=data_resource['resID'], + )) + error = True + + if is_directed_source is None: + print("ERROR: Dataset '{dataset_path}' has a edgelist resource '{resource_id}' with missing directed vs. undirected column role.".format( + dataset_path=dataset_description_path, + resource_id=data_resource['resID'], + )) + error = True + + if 'edgeTarget' in column_description['role']: + if found_target: + print("ERROR: Dataset '{dataset_path}' has a edgelist resource '{resource_id}' with multiple edge target columns.".format( + dataset_path=dataset_description_path, + resource_id=data_resource['resID'], + )) + error = True + continue + found_target = True + + if 'multiEdgeTarget' in column_description['role']: + if is_multi_target is None: + is_multi_target = True + elif is_multi_target != True: + print("ERROR: Dataset '{dataset_path}' has a edgelist resource '{resource_id}' with conflicting multi vs. simple column roles.".format( + dataset_path=dataset_description_path, + resource_id=data_resource['resID'], + )) + error = True + + if 'simpleEdgeTarget' in column_description['role']: + if is_multi_target is None: + is_multi_target = False + elif is_multi_target != False: + print("ERROR: Dataset '{dataset_path}' has a edgelist resource '{resource_id}' with conflicting multi vs. simple column roles.".format( + dataset_path=dataset_description_path, + resource_id=data_resource['resID'], + )) + error = True + + if is_multi_target is None: + print("ERROR: Dataset '{dataset_path}' has a edgelist resource '{resource_id}' with missing multi vs. simple column role.".format( + dataset_path=dataset_description_path, + resource_id=data_resource['resID'], + )) + error = True + + if 'directedEdgeTarget' in column_description['role']: + if is_directed_target is None: + is_directed_target = True + elif is_directed_target != True: + print("ERROR: Dataset '{dataset_path}' has a edgelist resource '{resource_id}' with conflicting directed vs. undirected column roles.".format( + dataset_path=dataset_description_path, + resource_id=data_resource['resID'], + )) + error = True + + if 'undirectedEdgeTarget' in column_description['role']: + if is_directed_target is None: + is_directed_target = False + elif is_directed_target != False: + print("ERROR: Dataset '{dataset_path}' has a edgelist resource '{resource_id}' with conflicting directed vs. undirected column roles.".format( + dataset_path=dataset_description_path, + resource_id=data_resource['resID'], + )) + error = True + + if is_directed_target is None: + print("ERROR: Dataset '{dataset_path}' has a edgelist resource '{resource_id}' with missing directed vs. undirected column role.".format( + dataset_path=dataset_description_path, + resource_id=data_resource['resID'], + )) + error = True + + if not found_source: + print("ERROR: Dataset '{dataset_path}' has a edgelist resource '{resource_id}' with missing edge source column role.".format( + dataset_path=dataset_description_path, + resource_id=data_resource['resID'], + )) + error = True + if not found_target: + print("ERROR: Dataset '{dataset_path}' has a edgelist resource '{resource_id}' with missing edge target column role.".format( + dataset_path=dataset_description_path, + resource_id=data_resource['resID'], + )) + error = True + + if found_source and found_target: + if is_directed_source != is_directed_target: + print("ERROR: Dataset '{dataset_path}' has a edgelist resource '{resource_id}' with conflicting directed vs. 
undirected column roles.".format( + dataset_path=dataset_description_path, + resource_id=data_resource['resID'], + )) + error = True + + if is_multi_source != is_multi_target: + print("ERROR: Dataset '{dataset_path}' has a edgelist resource '{resource_id}' with conflicting multi vs. simple column roles.".format( + dataset_path=dataset_description_path, + resource_id=data_resource['resID'], + )) + error = True + + return error + + +def get_file_extension(path): + extension = os.path.splitext(path)[1] + if extension: + # We remove leading dot as returned from "splitext". + return extension[1:] + else: + raise ValueError(f"Cannot get file extension of '{path}'.") + + +def validate_dataset(dataset_description_path, dataset_description): + error = False + + for data_resource in dataset_description['dataResources']: + if os.path.splitext(os.path.basename(data_resource['resPath']))[0] == 'learningData' and data_resource['resID'] != 'learningData': + print("ERROR: Dataset '{dataset_path}' has a dataset entry point without 'learningData' as resource's ID, but '{resource_id}'.".format( + dataset_path=dataset_description_path, + resource_id=data_resource['resID'], + )) + error = True + + if data_resource['resID'] == 'learningData': + if data_resource.get('isCollection', False): + print("ERROR: Dataset '{dataset_path}' has a dataset entry point which is a collection.".format( + dataset_path=dataset_description_path, + )) + error = True + + if 'columns' not in data_resource: + print("ERROR: Dataset '{dataset_path}' has a dataset entry point without columns metadata.".format( + dataset_path=dataset_description_path, + )) + error = True + + if 'columns' in data_resource: + index_columns = [] + multi_index_columns = [] + key_columns = [] + edgelist_columns = [] + for column_description in data_resource['columns']: + if 'index' in column_description['role']: + index_columns.append(column_description['colIndex']) + if 'multiIndex' in column_description['role']: + multi_index_columns.append(column_description['colIndex']) + if 'key' in column_description['role']: + key_columns.append(column_description['colIndex']) + if any(edgelist_column_role in column_description['role'] for edgelist_column_role in EDGELIST_COLUMN_ROLES): + edgelist_columns.append(column_description['colIndex']) + + index_columns_set = set(index_columns) + multi_index_columns_set = set(multi_index_columns) + key_columns_set = set(key_columns) + + if index_columns_set & multi_index_columns_set: + print("ERROR: Dataset '{dataset_path}' has a resource '{resource_id}' with columns being both index and multi-index at the same time: {index_columns}".format( + dataset_path=dataset_description_path, + resource_id=data_resource['resID'], + index_columns=sorted(index_columns_set & multi_index_columns_set), + )) + error = True + elif data_resource['resID'] == 'learningData' and len(index_columns) + len(multi_index_columns) == 0: + print("ERROR: Dataset '{dataset_path}' has a dataset entry point with no index columns.".format( + dataset_path=dataset_description_path, + )) + error = True + elif len(index_columns) + len(multi_index_columns) > 1: + print("ERROR: Dataset '{dataset_path}' has a resource '{resource_id}' with multiple index columns: {index_columns}".format( + dataset_path=dataset_description_path, + resource_id=data_resource['resID'], + index_columns=index_columns + multi_index_columns, + )) + error = True + + if index_columns_set & key_columns_set: + print("ERROR: Dataset '{dataset_path}' has a resource '{resource_id}' with columns being 
both index and key at the same time: {index_columns}".format( + dataset_path=dataset_description_path, + resource_id=data_resource['resID'], + index_columns=sorted(index_columns_set & key_columns_set), + )) + error = True + + if multi_index_columns_set & key_columns_set: + print("ERROR: Dataset '{dataset_path}' has a resource '{resource_id}' with columns being both multi-index and key at the same time: {index_columns}".format( + dataset_path=dataset_description_path, + resource_id=data_resource['resID'], + index_columns=sorted(multi_index_columns_set & key_columns_set), + )) + error = True + + if data_resource.get('isCollection', False): + continue + + for column_index in index_columns: + error = validate_column_values(dataset_description_path, data_resource, column_index, unique=True, no_missing=True) or error + for column_index in multi_index_columns: + error = validate_column_values(dataset_description_path, data_resource, column_index, unique=False, no_missing=True) or error + for column_index in key_columns: + error = validate_column_values(dataset_description_path, data_resource, column_index, unique=True, no_missing=False) or error + + for column_description in data_resource['columns']: + if 'refersTo' in column_description and column_description['refersTo']['resObject'] == 'item': + error = validate_files(dataset_description_path, data_resource, dataset_description, column_description['colIndex'], column_description['refersTo']['resID']) or error + + if edgelist_columns: + error = validate_edgelist(dataset_description_path, data_resource) or error + + if len(multi_index_columns) == 1: + error = validate_multi_index(dataset_description_path, data_resource, multi_index_columns[0]) or error + + for res_format in data_resource['resFormat'].keys(): + if res_format not in res_format_to_extensions: + print("ERROR: Dataset '{dataset_path}' has a resource '{resource_id}' with unsupported format: {res_format}".format( + dataset_path=dataset_description_path, + resource_id=data_resource['resID'], + res_format=res_format, + )) + error = True + + if data_resource.get('isCollection', False): + error = validate_collection(dataset_description_path, data_resource) or error + else: + if len(data_resource['resFormat']) == 1: + file_extension = get_file_extension(data_resource['resPath']) + # There should be only one resource format listed for non-collection resources. 
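+ # Illustrative note (hypothetical values, not taken from any dataset in this repository):
+ # for a non-collection resource with, say, resPath "tables/learningData.csv" and
+ # resFormat {"text/csv": ["csv"]}, the path's extension "csv" must appear in the
+ # extension list of that single listed format, otherwise the error below is reported.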
+ if file_extension not in list(data_resource['resFormat'].values())[0]: + print("ERROR: Dataset '{dataset_path}' has a resource '{resource_id}' with invalid resource path file extension: {file_extension}".format( + dataset_path=dataset_description_path, + resource_id=data_resource['resID'], + file_extension=file_extension, + )) + error = True + else: + print("ERROR: Dataset '{dataset_path}' has a resource '{resource_id}' with invalid number of listed formats: {count}".format( + dataset_path=dataset_description_path, + resource_id=data_resource['resID'], + count=len(data_resource['resFormat']), + )) + error = True + + return error + + +def validate_dataset_description(dataset_description_path, known_dataset_descriptions, *, strict_naming=True): + print("Validating dataset '{dataset_description_path}'.".format(dataset_description_path=dataset_description_path)) + + try: + with open(dataset_description_path) as dataset_description_file: + dataset_description = json.load(dataset_description_file) + + if not dataset_description_validator.validate(dataset_description): + print("ERROR: Schema validation: {errors}".format(errors=dataset_description_validator.errors)) + return True + + dataset_id = dataset_description['about']['datasetID'] + + # Handle a special case for SCORE dataset splits (those which have "targets.csv" file). + # They are the same as TEST dataset splits, but we present them differently, so that + # SCORE dataset splits have targets as part of data. Because of this we also update + # corresponding dataset ID. + # See: https://gitlab.com/datadrivendiscovery/d3m/issues/176 + if os.path.exists(os.path.join(os.path.dirname(dataset_description_path), '..', 'targets.csv')) and dataset_id.endswith('_TEST'): + dataset_id = dataset_id[:-5] + '_SCORE' + if dataset_id in known_dataset_descriptions: + print("ERROR: Duplicate dataset ID '{dataset_id}': '{first_path}' and '{second_path}'".format( + dataset_id=dataset_id, + first_path=known_dataset_descriptions[dataset_id]['path'], + second_path=dataset_description_path, + )) + return True + + known_dataset_descriptions[dataset_id] = { + 'path': dataset_description_path, + 'description': dataset_description, + } + + if validate_dataset_path(dataset_id, dataset_description_path, strict_naming=strict_naming): + return True + + #if 'digest' not in dataset_description['about']: + # print("ERROR: Dataset '{dataset_path}' missing digest.".format(dataset_path=dataset_description_path)) + # return True + + if validate_dataset(dataset_description_path, dataset_description): + return True + + except Exception: + print("ERROR: Unexpected exception:") + traceback.print_exc() + return True + + return False + + +def validate_problem_description(problem_description_path, known_problem_descriptions): + print("Validating problem '{problem_description_path}'.".format(problem_description_path=problem_description_path)) + + try: + with open(problem_description_path) as problem_description_file: + problem_description = json.load(problem_description_file) + + if not problem_description_validator.validate(problem_description): + print("ERROR: Schema validation: {errors}".format(errors=problem_description_validator.errors)) + return True + + problem_id = problem_description['about']['problemID'] + + # Handle a special case for SCORE dataset splits (those which have "targets.csv" file). + # They are the same as TEST dataset splits, but we present them differently, so that + # SCORE dataset splits have targets as part of data. 
Because of this we also update + # corresponding problem ID. + # See: https://gitlab.com/datadrivendiscovery/d3m/issues/176 + if os.path.exists(os.path.join(os.path.dirname(problem_description_path), '..', 'targets.csv')) and problem_id.endswith('_TEST'): + problem_id = problem_id[:-5] + '_SCORE' + + # Also update dataset references. + for data in problem_description.get('inputs', {}).get('data', []): + if data['datasetID'].endswith('_TEST'): + data['datasetID'] = data['datasetID'][:-5] + '_SCORE' + + # All problem descriptions (across splits) should be the same. + if problem_id.endswith('_TRAIN') or problem_id.endswith('_TEST') or problem_id.endswith('_SCORE'): + print("ERROR: Invalid problem ID '{problem_id}' in '{problem_description_path}'.".format( + problem_id=problem_id, + problem_description_path=problem_description_path, + )) + return True + + if problem_id in known_problem_descriptions: + # Problem descriptions with same ID should have the same content. + if problem_description == known_problem_descriptions[problem_id]['description']: + known_problem_descriptions[problem_id]['paths'].append(problem_description_path) + else: + print("ERROR: Duplicate problem ID '{problem_id}', but different problem description: {first_paths} and '{second_path}'".format( + problem_id=problem_id, + first_paths=known_problem_descriptions[problem_id]['paths'], + second_path=problem_description_path, + )) + return True + + else: + known_problem_descriptions[problem_id] = { + 'paths': [problem_description_path], + 'description': problem_description, + } + + if os.path.basename(problem_description_path) != 'problemDoc.json': + print("ERROR: Problem description filename '{problem_description_path}' is not 'problemDoc.json'.".format( + problem_description_path=problem_description_path, + )) + return True + + if validate_metrics(problem_description): + return True + + if validate_keywords(problem_description): + return True + + split_path = os.path.dirname(problem_description_path).split(os.sep) + for split_directory in ['problem_TRAIN', 'problem_TEST', 'problem_SCORE']: + if split_directory in split_path and 'datasetViewMaps' not in problem_description.get('inputs', {}).get('dataSplits', {}): + print("ERROR: Problem '{problem_description_path}' is missing dataset view maps.".format( + problem_description_path=problem_description_path, + )) + return True + + except Exception: + print("ERROR: Unexpected exception:") + traceback.print_exc() + return True + + return False + + +def validate_column_values(dataset_description_path, data_resource, column_index, *, unique, no_missing): + error = False + + data_path = os.path.join(os.path.dirname(dataset_description_path), data_resource['resPath']) + + data = read_csv(data_path) + + column_values = data.iloc[:, column_index] + + # We assume missing values are represented as empty strings. + column_values_without_missing = column_values[column_values != ''] + + # There should be no NAs at this point anyway. + value_counts = column_values_without_missing.value_counts(dropna=True) + + if unique and (value_counts > 1).sum(): + duplicate = list(value_counts[value_counts > 1].keys()) + if LIMIT_OUTPUT is not None: + duplicate = duplicate[:LIMIT_OUTPUT] + + print("ERROR: Dataset '{dataset_path}' has a resource '{resource_id}' with column {column_index} which should have unique values but it does not.
Example duplicate values: {duplicate}".format( + dataset_path=dataset_description_path, + resource_id=data_resource['resID'], + column_index=column_index, + duplicate=duplicate, + )) + error = True + + if no_missing and len(column_values) != len(column_values_without_missing): + print("ERROR: Dataset '{dataset_path}' has a resource '{resource_id}' with column {column_index} which should have no missing values but it does have them.".format( + dataset_path=dataset_description_path, + resource_id=data_resource['resID'], + column_index=column_index, + )) + error = True + + return error + + +def validate_target_values(problem_paths, dataset_path, problem_description, data_resource, target): + error = False + + data_path = os.path.join(os.path.dirname(dataset_path), data_resource['resPath']) + + data = read_csv(data_path) + + target_values = data.iloc[:, target['colIndex']] + distinct_values = list(target_values.value_counts(dropna=False).keys()) + number_distinct_values = len(distinct_values) + # We assume missing values are represented as empty strings. + has_missing_values = '' in distinct_values + if has_missing_values: + # We do not count missing values as distinct values. + number_distinct_values -= 1 + task_keywords = set(problem_description['about']['taskKeywords']) + + if 'binary' in task_keywords: + if number_distinct_values != 2: + print("ERROR: Problem {problem_paths} has 'binary' keyword, but target column does not have 2 distinct values, but {number_distinct_values}.".format( + problem_paths=problem_paths, + number_distinct_values=number_distinct_values, + )) + error = True + elif 'multiClass' in task_keywords: + if number_distinct_values < 3: + print("ERROR: Problem {problem_paths} has 'multiClass' keyword, but target column does not have more than 2 distinct values, but {number_distinct_values}.".format( + problem_paths=problem_paths, + number_distinct_values=number_distinct_values, + )) + error = True + + for metric in problem_description.get('inputs', {}).get('performanceMetrics', []): + if metric['metric'] in ['f1', 'precision', 'recall', 'jaccardSimilarityScore']: + if number_distinct_values != 2: + print("ERROR: Problem {problem_paths} uses '{metric}' metric, but target column does not have 2 distinct values, but {number_distinct_values}.".format( + problem_paths=problem_paths, + metric=metric['metric'], + number_distinct_values=number_distinct_values, + )) + error = True + if 'posLabel' in metric and metric['posLabel'] not in distinct_values: + print("ERROR: Problem {problem_paths} provides 'posLabel' for metric '{metric}' with value '{value}', but possible values are: {distinct_values}".format( + problem_paths=problem_paths, + metric=metric['metric'], + value=metric['posLabel'], + distinct_values=sorted(distinct_values), + )) + error = True + + if has_missing_values and not task_keywords & {'semiSupervised', 'clustering'}: + print("ERROR: Problem {problem_paths} has target column with missing values, but it is not a semi-supervised or clustering task.".format( + problem_paths=problem_paths, + )) + error = True + if 'semiSupervised' in task_keywords and not has_missing_values: + print("ERROR: Problem {problem_paths} is a semi-supervised task, but does not have a target column with missing values.".format( + problem_paths=problem_paths, + )) + error = True + + return error + + +def get_all_columns(dataset_path, resource_id, data_resource): + data_path = os.path.join(os.path.dirname(dataset_path), data_resource['resPath']) + + data = read_csv(data_path) + + data_columns = [{ +
'colIndex': column_index, + 'colName': column_name, + 'colType': 'unknown', + 'role': [] + } for column_index, column_name in enumerate(data.columns)] + + columns = data_resource.get('columns', None) + + if columns is None: + return data_columns + + if 'columnsCount' in data_resource and data_resource['columnsCount'] != len(data_columns): + raise ValueError("Dataset '{dataset_path}' has resource '{resource_id}' with incorrect columns count {columns_count} (correct {correct_count}).".format( + dataset_path=dataset_path, + resource_id=resource_id, + columns_count=data_resource['columnsCount'], + correct_count=len(data_columns), + )) + + if len(columns) >= len(data_columns): + columns_names = [{'colIndex': c['colIndex'], 'colName': c['colName']} for c in columns] + data_columns_names = [{'colIndex': c['colIndex'], 'colName': c['colName']} for c in data_columns] + + if columns_names != data_columns_names: + raise ValueError("Dataset '{dataset_path}' has resource '{resource_id}' where metadata columns do not match data columns.".format( + dataset_path=dataset_path, + resource_id=resource_id, + )) + + return columns + + else: + for column in columns: + if column['colName'] != data_columns[column['colIndex']]['colName']: + raise ValueError("Dataset '{dataset_path}' has resource '{resource_id}' where column name '{metadata_name}' in metadata does not match column name '{data_name}' in data.".format( + dataset_path=dataset_path, + resource_id=resource_id, + metadata_name=column['colName'], + data_name=data_columns[column['colIndex']]['colName'], + )) + + data_columns[column['colIndex']] = column + + return data_columns + + +def validate_target(problem_paths, dataset_path, problem_description, dataset_description, target, check_target_values): + error = False + + try: + for data_resource in dataset_description['dataResources']: + if data_resource['resID'] == target['resID']: + columns = get_all_columns(dataset_path, data_resource['resID'], data_resource) + for column in columns: + if target['colName'] == column['colName'] or target['colIndex'] == column['colIndex']: + if not (target['colName'] == column['colName'] and target['colIndex'] == column['colIndex']): + print("ERROR: Problem {problem_paths} has a target '{target_index}' which does not match a column '{column_index}' in dataset '{dataset_path}' fully.".format( + problem_paths=problem_paths, + target_index=target['targetIndex'], + column_index=column['colIndex'], + dataset_path=dataset_path, + )) + error = True + + if check_target_values: + error = validate_target_values(problem_paths, dataset_path, problem_description, data_resource, target) or error + + break + else: + raise KeyError("Cannot find column with column name '{column_name}' or column index '{column_index}'.".format( + column_name=target['colName'], + column_index=target['colIndex'], + )) + + break + else: + raise KeyError("Cannot find data resource with resource ID '{resource_id}'.".format( + resource_id=target['resID'], + )) + + except (IndexError, KeyError): + print("ERROR: Problem {problem_paths} has target with index '{target_index}' which does not resolve.".format( + problem_paths=problem_paths, + target_index=target['targetIndex'], + )) + return True + + except ValueError as error: + print("ERROR: {error}".format( + error=error, + )) + return True + + return error + + +def canonical_dataset_description(dataset_description): + dataset_description = copy.deepcopy(dataset_description) + + del dataset_description['about']['datasetID'] + if 'digest' in 
dataset_description['about']: + del dataset_description['about']['digest'] + + return dataset_description + + +def datasets_equal(first_dataset_path, second_dataset_path): + if first_dataset_path == second_dataset_path: + return True + + first_dataset_base_path = os.path.dirname(first_dataset_path) + second_dataset_base_path = os.path.dirname(second_dataset_path) + + dir_comparison = deep_dircmp.DeepDirCmp(first_dataset_base_path, second_dataset_base_path, hide=[], ignore=[]) + + different_files = dir_comparison.get_left_only_recursive() + dir_comparison.get_right_only_recursive() + dir_comparison.get_common_funny_recursive() + dir_comparison.get_diff_files_recursive() + + # This one can be different. And if it is different, we compare it elsewhere for allowed differences. + if 'datasetDoc.json' in different_files: + different_files.remove('datasetDoc.json') + + if different_files: + print("ERROR: Dataset '{first_dataset_path}' and dataset '{second_dataset_path}' are not the same: {differences}".format( + first_dataset_path=first_dataset_path, + second_dataset_path=second_dataset_path, + differences=different_files, + )) + return False + + return True + + +def validate_dataset_reference(dataset_id, dataset_descriptions, targets, problem_description_value, check_target_values): + error = False + + if dataset_id not in dataset_descriptions: + print("ERROR: Problem {problem_paths} is referencing unknown dataset '{dataset_id}'.".format( + problem_paths=problem_description_value['paths'], + dataset_id=dataset_id, + )) + error = True + else: + dataset_description_value = dataset_descriptions[dataset_id] + dataset_description = dataset_description_value['description'] + for i, target in enumerate(targets): + if target['targetIndex'] != i: + print("ERROR: Problem {problem_paths} has target with invalid target index '{target_index}'.".format( + problem_paths=problem_description_value['paths'], + target_index=target['targetIndex'], + )) + error = True + error = validate_target(problem_description_value['paths'], dataset_description_value['path'], problem_description_value['description'], dataset_description, target, check_target_values) or error + + return error + + +def map_dataset_id(dataset_id, dataset_view_map): + for view_map in dataset_view_map: + if view_map['from'] == dataset_id: + return view_map['to'] + else: + raise KeyError("Could not map '{dataset_id}' in dataset view map.".format(dataset_id=dataset_id)) + + +def validate(dataset_descriptions, problem_descriptions): + print("Validating all datasets and problems.") + + error = False + + dataset_description_groups = collections.defaultdict(list) + + for problem_description_value in problem_descriptions.values(): + problem_description = problem_description_value['description'] + for data in problem_description.get('inputs', {}).get('data', []): + error = validate_dataset_reference(data['datasetID'], dataset_descriptions, data.get('targets', []), problem_description_value, True) or error + + if 'datasetViewMaps' in problem_description.get('inputs', {}).get('dataSplits', {}): + if {'train', 'test', 'score'} != set(problem_description['inputs']['dataSplits']['datasetViewMaps'].keys()): + print("ERROR: Problem {problem_paths} has dataset view maps with invalid keys.".format( + problem_paths=problem_description_value['paths'], + )) + error = True + else: + error = validate_dataset_reference(map_dataset_id(data['datasetID'], problem_description['inputs']['dataSplits']['datasetViewMaps']['train']), dataset_descriptions, data.get('targets', []), 
problem_description_value, True) or error + + # Test and score splits do not have all values, so we do not validate target values there. + error = validate_dataset_reference(map_dataset_id(data['datasetID'], problem_description['inputs']['dataSplits']['datasetViewMaps']['test']), dataset_descriptions, data.get('targets', []), problem_description_value, False) or error + error = validate_dataset_reference(map_dataset_id(data['datasetID'], problem_description['inputs']['dataSplits']['datasetViewMaps']['score']), dataset_descriptions, data.get('targets', []), problem_description_value, False) or error + + if 'clustering' in problem_description['about']['taskKeywords']: + for data in problem_description.get('inputs', {}).get('data', []): + for target in data.get('targets', []): + if 'numClusters' not in target: + print("ERROR: Problem {problem_paths} is a clustering problem but is missing 'numClusters' in target '{target_index}'.".format( + problem_paths=problem_description_value['paths'], + target_index=target['targetIndex'], + )) + error = True + + if 'dataSplits' in problem_description['inputs'] and set(problem_description['inputs']['dataSplits'].keys()) - {'datasetViewMaps'}: + print("ERROR: Problem {problem_paths} is a clustering problem with data splitting configuration, but it should not have one.".format( + problem_paths=problem_description_value['paths'], + )) + error = True + + for dataset_description_value in dataset_descriptions.values(): + dataset_description = dataset_description_value['description'] + + dataset_id = dataset_description['about']['datasetID'] + + for suffix in ['_TEST', '_TRAIN', '_SCORE']: + if dataset_id.endswith(suffix): + dataset_description_groups[dataset_id[:-len(suffix)]].append(dataset_description_value) + break + + for problem_description_value in problem_descriptions.values(): + problem_description = problem_description_value['description'] + + # If any clustering problem is using dataset splits, we validate those splits. + if 'clustering' in problem_description['about']['taskKeywords']: + for data in problem_description.get('inputs', {}).get('data', []): + # We check this elsewhere. + if data['datasetID'] not in dataset_descriptions: + continue + + dataset_id = data['datasetID'] + + for suffix in ['_TEST', '_TRAIN', '_SCORE']: + if dataset_id.endswith(suffix): + base_dataset_id = dataset_id[:-len(suffix)] + break + else: + base_dataset_id = dataset_id + + # There should always be at least one dataset. 
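+ # For illustration (hypothetical IDs, patterned on this repository's kpi example dataset):
+ # split datasets with IDs "kpi_TRAIN", "kpi_TEST" and "kpi_SCORE" are all grouped here
+ # under the base ID "kpi"; for clustering problems these splits are then required to
+ # contain identical data.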
+ datasets = dataset_description_groups[base_dataset_id] + if len(datasets) > 1: + first_dataset_path = datasets[0]['path'] + for second_dataset_value in datasets[1:]: + second_dataset_path = second_dataset_value['path'] + if not datasets_equal(first_dataset_path, second_dataset_path): + print("ERROR: Problem {problem_paths} is a clustering problem, but its data splits are not all the same, for example, {first_dataset_path} and {second_dataset_path}.".format( + problem_paths=problem_description_value['paths'], + first_dataset_path=first_dataset_path, + second_dataset_path=second_dataset_path, + )) + error = True + break + + for dataset_description_group in dataset_description_groups.values(): + first_dataset_description_value = dataset_description_group[0] + first_dataset_description = canonical_dataset_description(first_dataset_description_value['description']) + for dataset_description_value in dataset_description_group[1:]: + dataset_description = canonical_dataset_description(dataset_description_value['description']) + + if first_dataset_description != dataset_description: + print("ERROR: Dataset '{first_dataset_path}' and dataset '{dataset_path}' are not the same.".format( + first_dataset_path=first_dataset_description_value['path'], + dataset_path=dataset_description_value['path'], + )) + error = True + + return error + + +def search_directory(datasets_directory, known_dataset_descriptions, known_problem_descriptions, *, strict_naming=True): + error = False + + datasets_directory = os.path.abspath(datasets_directory) + + for dirpath, dirnames, filenames in os.walk(datasets_directory, followlinks=True): + if 'datasetDoc.json' in filenames: + # Do not traverse further (to not parse "datasetDoc.json" if they + # exists in raw data filename). + dirnames[:] = [] + + dataset_description_path = os.path.join(dirpath, 'datasetDoc.json') + + error = validate_dataset_description(dataset_description_path, known_dataset_descriptions, strict_naming=strict_naming) or error + + if 'problemDoc.json' in filenames: + # We continue traversing further in this case. 
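+ # (Unlike the datasetDoc.json case above, directories containing a problemDoc.json are
+ # not pruned from the walk, so anything nested under them is still visited and validated.)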
+ + problem_description_path = os.path.join(dirpath, 'problemDoc.json') + + error = validate_problem_description(problem_description_path, known_problem_descriptions) or error + + return error + + +def configure_parser(parser: argparse.ArgumentParser, *, skip_arguments=()): + if 'no_strict_naming' not in skip_arguments: + parser.add_argument( + '-n', '--no-strict-naming', default=True, action='store_false', dest='strict_naming', + help="do not require strict naming convention", + ) + if 'directories' not in skip_arguments: + parser.add_argument( + 'directories', metavar='DIR', nargs='*', default=['.'], + help="path to a directory with datasets, default is current directory", + ) + + +def handler(arguments): + error = False + + known_dataset_descriptions = {} + known_problem_descriptions = {} + + for datasets_directory in arguments.directories: + error = search_directory(datasets_directory, known_dataset_descriptions, known_problem_descriptions, strict_naming=arguments.strict_naming) or error + + error = validate(known_dataset_descriptions, known_problem_descriptions) or error + + if error: + print("There are ERRORS.") + sys.exit(1) + else: + print("There are no errors.") + + +def main(argv): + parser = argparse.ArgumentParser(description="Validate datasets.") + configure_parser(parser) + + arguments = parser.parse_args(argv[1:]) + + handler(arguments) + + +if __name__ == '__main__': + main(sys.argv) diff --git a/examples/build_AutoEncoder_pipeline.py b/examples/build_AutoEncoder_pipeline.py new file mode 100644 index 0000000..f6af364 --- /dev/null +++ b/examples/build_AutoEncoder_pipeline.py @@ -0,0 +1,70 @@ +from d3m import index +from d3m.metadata.base import ArgumentType +from d3m.metadata.pipeline import Pipeline, PrimitiveStep + +# -> dataset_to_dataframe -> column_parser -> extract_columns_by_semantic_types(attributes) -> imputer -> random_forest +# extract_columns_by_semantic_types(targets) -> ^ + +# Creating pipeline +pipeline_description = Pipeline() +pipeline_description.add_input(name='inputs') + +# Step 0: dataset_to_dataframe +step_0 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.dataset_to_dataframe.Common')) +step_0.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='inputs.0') +step_0.add_output('produce') +pipeline_description.add_step(step_0) + +# Step 1: column_parser +step_1 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.column_parser.Common')) +step_1.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce') +step_1.add_output('produce') +pipeline_description.add_step(step_1) + +# Step 2: extract_columns_by_semantic_types(attributes) +step_2 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common')) +step_2.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce') +step_2.add_output('produce') +step_2.add_hyperparameter(name='semantic_types', argument_type=ArgumentType.VALUE, + data=['https://metadata.datadrivendiscovery.org/types/Attribute']) +pipeline_description.add_step(step_2) + +# Step 3: extract_columns_by_semantic_types(targets) +step_3 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common')) +step_3.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce') +step_3.add_output('produce') 
+step_3.add_hyperparameter(name='semantic_types', argument_type=ArgumentType.VALUE, + data=['https://metadata.datadrivendiscovery.org/types/TrueTarget']) +pipeline_description.add_step(step_3) + +attributes = 'steps.2.produce' +targets = 'steps.3.produce' + +# Step 4: processing +step_4 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.tods.timeseries_processing.transformation.axiswise_scaler')) +step_4.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference=attributes) +step_4.add_output('produce') +pipeline_description.add_step(step_4) + +# Step 5: algorithm` +step_5 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.tods.detection_algorithm.pyod_ae')) +step_5.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.4.produce') +step_5.add_output('produce') +pipeline_description.add_step(step_5) + +# Step 6: Predictions +step_6 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.construct_predictions.Common')) +step_6.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.5.produce') +step_6.add_argument(name='reference', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce') +step_6.add_output('produce') +pipeline_description.add_step(step_6) + +# Final Output +pipeline_description.add_output(name='output predictions', data_reference='steps.6.produce') + +# Output to json +data = pipeline_description.to_json() +with open('example_pipeline.json', 'w') as f: + f.write(data) + print(data) + diff --git a/examples/build_IsolationForest_pipline.py b/examples/build_IsolationForest_pipline.py new file mode 100644 index 0000000..e993d78 --- /dev/null +++ b/examples/build_IsolationForest_pipline.py @@ -0,0 +1,107 @@ +from d3m import index +from d3m.metadata.base import ArgumentType +from d3m.metadata.pipeline import Pipeline, PrimitiveStep +from d3m.metadata import hyperparams +import copy + +# -> dataset_to_dataframe -> column_parser -> extract_columns_by_semantic_types(attributes) -> imputer -> random_forest +# extract_columns_by_semantic_types(targets) -> ^ + +# Creating pipeline +pipeline_description = Pipeline() +pipeline_description.add_input(name='inputs') + +# Step 0: dataset_to_dataframe +primitive_0 = index.get_primitive('d3m.primitives.tods.data_processing.dataset_to_dataframe') +step_0 = PrimitiveStep(primitive=primitive_0) +step_0.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='inputs.0') +step_0.add_output('produce') +pipeline_description.add_step(step_0) + +# # Step 1: column_parser +primitive_1 = index.get_primitive('d3m.primitives.data_transformation.column_parser.Common') +step_1 = PrimitiveStep(primitive=primitive_1) +step_1.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce') +step_1.add_output('produce') +pipeline_description.add_step(step_1) + +# Step 2: extract_columns_by_semantic_types(attributes) +step_2 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common')) +step_2.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce') +step_2.add_output('produce') +step_2.add_hyperparameter(name='semantic_types', argument_type=ArgumentType.VALUE, data=['https://metadata.datadrivendiscovery.org/types/Attribute']) +pipeline_description.add_step(step_2) + +# Step 3: extract_columns_by_semantic_types(targets) +step_3 = 
PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common')) +step_3.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce') +step_3.add_output('produce') +step_3.add_hyperparameter(name='semantic_types', argument_type=ArgumentType.VALUE, + data=['https://metadata.datadrivendiscovery.org/types/TrueTarget']) +pipeline_description.add_step(step_3) + +attributes = 'steps.2.produce' +targets = 'steps.3.produce' + +# Step 4: timeseries processing (power transformer) +primitive_4 = index.get_primitive('d3m.primitives.tods.timeseries_processing.transformation.power_transformer') +step_4 = PrimitiveStep(primitive=primitive_4) +step_4.add_hyperparameter(name='return_result', argument_type=ArgumentType.VALUE, data='new') +step_4.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference=attributes) +step_4.add_output('produce') +pipeline_description.add_step(step_4) + +# Step 5: timeseries processing (axis-wise scaler) +primitive_5 = index.get_primitive('d3m.primitives.tods.timeseries_processing.transformation.axiswise_scaler') +step_5 = PrimitiveStep(primitive=primitive_5) +step_5.add_hyperparameter(name='return_result', argument_type=ArgumentType.VALUE, data='new') +step_5.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.4.produce') +step_5.add_output('produce') +pipeline_description.add_step(step_5) + +# Step 6: timeseries processing (standard scaler) +primitive_6 = index.get_primitive('d3m.primitives.tods.timeseries_processing.transformation.standard_scaler') +step_6 = PrimitiveStep(primitive=primitive_6) +step_6.add_hyperparameter(name='return_result', argument_type=ArgumentType.VALUE, data='new') +step_6.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.5.produce') +step_6.add_output('produce') +pipeline_description.add_step(step_6) + +# Step 7: timeseries processing (quantile transformer) +primitive_7 = index.get_primitive('d3m.primitives.tods.timeseries_processing.transformation.quantile_transformer') +step_7 = PrimitiveStep(primitive=primitive_7) +step_7.add_hyperparameter(name='return_result', argument_type=ArgumentType.VALUE, data='new') +step_7.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.6.produce') +step_7.add_output('produce') +pipeline_description.add_step(step_7) + +# Step 8: detection algorithm (Isolation Forest) +primitive_8 = index.get_primitive('d3m.primitives.tods.detection_algorithm.pyod_iforest') +step_8 = PrimitiveStep(primitive=primitive_8) +step_8.add_hyperparameter(name='contamination', argument_type=ArgumentType.VALUE, data=0.1) +step_8.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.7.produce') +# step_8.add_output('produce_score') +step_8.add_output('produce') +pipeline_description.add_step(step_8) + +# Step 9: predictions +step_9 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.construct_predictions.Common')) +step_9.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.8.produce') +step_9.add_argument(name='reference', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce') +step_9.add_output('produce') +pipeline_description.add_step(step_9) + +# Final Output +pipeline_description.add_output(name='output predictions', data_reference='steps.9.produce') + +# Output to json +data = pipeline_description.to_json() +with open('example_pipeline.json', 'w') as f: + f.write(data) + print(data) + +# Output to YAML +yaml =
pipeline_description.to_yaml() +with open('pipeline.yml', 'w') as f: + f.write(yaml) +print(yaml) diff --git a/examples/build_LODA_pipline.py b/examples/build_LODA_pipline.py new file mode 100644 index 0000000..bf67f33 --- /dev/null +++ b/examples/build_LODA_pipline.py @@ -0,0 +1,78 @@ +from d3m import index +from d3m.metadata.base import ArgumentType +from d3m.metadata.pipeline import Pipeline, PrimitiveStep +from d3m.metadata import hyperparams +import copy + +# -> dataset_to_dataframe -> column_parser -> extract_columns_by_semantic_types(attributes) -> imputer -> random_forest +# extract_columns_by_semantic_types(targets) -> ^ + +# Creating pipeline +pipeline_description = Pipeline() +pipeline_description.add_input(name='inputs') + +# Step 0: dataset_to_dataframe +primitive_0 = index.get_primitive('d3m.primitives.tods.data_processing.dataset_to_dataframe') +step_0 = PrimitiveStep(primitive=primitive_0) +step_0.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='inputs.0') +step_0.add_output('produce') +pipeline_description.add_step(step_0) + +# # Step 1: column_parser +primitive_1 = index.get_primitive('d3m.primitives.data_transformation.column_parser.Common') +step_1 = PrimitiveStep(primitive=primitive_1) +step_1.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce') +step_1.add_output('produce') +pipeline_description.add_step(step_1) + + +# Step 2: extract_columns_by_semantic_types(attributes) +step_2 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common')) +step_2.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce') +step_2.add_output('produce') +step_2.add_hyperparameter(name='semantic_types', argument_type=ArgumentType.VALUE, data=['https://metadata.datadrivendiscovery.org/types/Attribute']) +pipeline_description.add_step(step_2) + + +# Step 3: extract_columns_by_semantic_types(targets) +step_3 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common')) +step_3.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce') +step_3.add_output('produce') +step_3.add_hyperparameter(name='semantic_types', argument_type=ArgumentType.VALUE, + data=['https://metadata.datadrivendiscovery.org/types/TrueTarget']) +pipeline_description.add_step(step_3) + +attributes = 'steps.2.produce' +targets = 'steps.3.produce' + +# Step 4: test primitive +primitive_4 = index.get_primitive('d3m.primitives.tods.detection_algorithm.pyod_loda') +step_4 = PrimitiveStep(primitive=primitive_4) +step_4.add_hyperparameter(name='contamination', argument_type=ArgumentType.VALUE, data=0.1) +step_4.add_hyperparameter(name='return_result', argument_type=ArgumentType.VALUE, data='new') +step_4.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference=attributes) +step_4.add_output('produce') +pipeline_description.add_step(step_4) + +# Step 5: Predictions +step_5 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.construct_predictions.Common')) +step_5.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.4.produce') +step_5.add_argument(name='reference', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce') +step_5.add_output('produce') +pipeline_description.add_step(step_5) + +# Final Output 
+pipeline_description.add_output(name='output predictions', data_reference='steps.5.produce') + +# Output to json +data = pipeline_description.to_json() +with open('example_pipeline.json', 'w') as f: + f.write(data) + print(data) + +# Output to YAML +yaml = pipeline_description.to_yaml() +with open('pipeline.yml', 'w') as f: + f.write(yaml) +print(yaml) + diff --git a/examples/run_automl.py b/examples/run_automl.py new file mode 100644 index 0000000..4b91b18 --- /dev/null +++ b/examples/run_automl.py @@ -0,0 +1,33 @@ +import uuid +import random +import pandas as pd +from pprint import pprint +from sklearn.datasets import make_classification + +from d3m import container +from d3m.metadata.pipeline import Pipeline +from d3m.metadata.problem import TaskKeyword, PerformanceMetric + +from axolotl.utils import data_problem +from axolotl.backend.simple import SimpleRunner +from axolotl.backend.ray import RayRunner +from axolotl.algorithms.base import PipelineSearchBase +from axolotl.utils import pipeline as pipeline_utils, schemas as schemas_utils + +import tods +from tods.search import BruteForceSearch + +table_path = 'datasets/anomaly/kpi/kpi_dataset/tables/learningData.csv' +df = pd.read_csv(table_path) +dataset, problem_description = data_problem.generate_dataset_problem(df, + target_index=3, + task_keywords=[TaskKeyword.ANOMALY_DETECTION,], + performance_metrics=[{'metric': PerformanceMetric.F1}]) + +print(dataset) +print(problem_description) + + +backend = SimpleRunner(random_seed=0) +search = BruteForceSearch(problem_description=problem_description, backend=backend) +print(search) diff --git a/examples/run_certain_pipeline.py b/examples/run_certain_pipeline.py new file mode 100644 index 0000000..4f02ea5 --- /dev/null +++ b/examples/run_certain_pipeline.py @@ -0,0 +1,30 @@ + +import os + +results_dir = './' +pipeline_run_yml_dir = './' + +pipeline_yml_name = './pipeline.yml' # './pipeline_yml/pipeline_10.yml' +pipline_yml_index = pipeline_yml_name[11:-4] + +python_command = 'python3 -m d3m runtime fit-produce -p ' + pipeline_yml_name \ + + ' -r ./datasets/anomaly/yahoo_sub_5/TRAIN/problem_TRAIN/problemDoc.json' \ + + ' -i ./datasets/anomaly/yahoo_sub_5/TRAIN/dataset_TRAIN/datasetDoc.json' \ + + ' -t ./datasets/anomaly/yahoo_sub_5/TEST/dataset_TEST/datasetDoc.json -o ' \ + + results_dir + 'result.csv' \ + + ' -O ' \ + + pipeline_run_yml_dir + 'pipeline_run' + '.yml' + +print(python_command) +os.system(python_command) +# 'python3 -m d3m runtime fit-produce -p pipeline.yml +# -r ../datasets/anomaly/kpi/TRAIN/problem_TRAIN/problemDoc.json +# -i ../datasets/anomaly/kpi/TRAIN/dataset_TRAIN/datasetDoc.json +# -t ../datasets/anomaly/kpi/TEST/dataset_TEST/datasetDoc.json +# -o results.csv -O pipeline_run.yml' + +# python3 -m d3m runtime fit-produce -p pipeline.yml +# -r ../datasets/anomaly/yahoo_sub_5/TRAIN/problem_TRAIN/problemDoc.json +# -i ../datasets/anomaly/yahoo_sub_5/TRAIN/dataset_TRAIN/datasetDoc.json +# -t ../datasets/anomaly/yahoo_sub_5/TEST/dataset_TEST/datasetDoc.json +# -o result.csv -O pipeline_run.yml \ No newline at end of file diff --git a/examples/run_predefined_pipeline.py b/examples/run_predefined_pipeline.py new file mode 100644 index 0000000..0fb6779 --- /dev/null +++ b/examples/run_predefined_pipeline.py @@ -0,0 +1,51 @@ +import uuid +import random +import pandas as pd +import json +from pprint import pprint +from sklearn.datasets import make_classification + +from d3m import container +from d3m.metadata.pipeline import Pipeline +from d3m.metadata.problem import TaskKeyword, 
PerformanceMetric + +from axolotl.utils import data_problem +from axolotl.backend.simple import SimpleRunner +# from axolotl.backend.ray import RayRunner +# from axolotl.algorithms.base import PipelineSearchBase +from axolotl.utils import pipeline as pipeline_utils, schemas as schemas_utils + +import tods +from tods.search import BruteForceSearch + +table_path = 'datasets/anomaly/yahoo_sub_5/yahoo_sub_5_dataset/tables/learningData.csv' +df = pd.read_csv(table_path) +dataset, problem_description = data_problem.generate_dataset_problem(df, + target_index=7, + task_keywords=[TaskKeyword.ANOMALY_DETECTION,], + performance_metrics=[{'metric': PerformanceMetric.F1}]) + +print(dataset) +print(problem_description) + +metrics = [{'metric': PerformanceMetric.F1, 'params': {'pos_label': '1'}}, + ] + +pipeline_path = 'example_pipeline.json' +pipeline = pipeline_utils.load_pipeline(pipeline_path) +print(pipeline) + +data_preparation_pipeline = schemas_utils.get_splitting_pipeline("TRAINING_DATA") +scoring_pipeline = schemas_utils.get_scoring_pipeline() +data_preparation_params = schemas_utils.DATA_PREPARATION_PARAMS['no_split'] + +backend = SimpleRunner(random_seed=0) +pipeline_result = backend.evaluate_pipeline(problem_description=problem_description, + pipeline=pipeline, + input_data=[dataset], + metrics=metrics, + data_preparation_pipeline=data_preparation_pipeline, + scoring_pipeline=scoring_pipeline, + data_preparation_params=data_preparation_params) +print(pipeline_result) + diff --git a/examples/test_axolotl.py b/examples/test_axolotl.py new file mode 100644 index 0000000..5b97666 --- /dev/null +++ b/examples/test_axolotl.py @@ -0,0 +1,194 @@ + +def generate_metrics(): + from d3m.metadata.problem import PerformanceMetric + metrics = [{'metric': PerformanceMetric.F1, 'params': {'pos_label': '1'}}, + ] + return metrics + +def generate_data_preparation_params(): + from axolotl.utils import schemas as schemas_utils + data_preparation_params = schemas_utils.DATA_PREPARATION_PARAMS['no_split'] + return data_preparation_params + +def generate_scoring_pipeline(): + from axolotl.utils import schemas as schemas_utils + scoring_pipeline = schemas_utils.get_scoring_pipeline() + return scoring_pipeline + +def generate_data_preparation_pipeline(): + from axolotl.utils import schemas as schemas_utils + data_preparation_pipeline = schemas_utils.get_splitting_pipeline("TRAINING_DATA") + return data_preparation_pipeline + + +def generate_dataset_problems(dataset_infos): + """ + Args: + dataset_infos: A list of dataset info, including `path` and `target` + + Returns: + A list of Dataset and Problem + """ + import pandas as pd + from axolotl.utils import data_problem + from d3m.metadata.problem import TaskKeyword, PerformanceMetric + + dataset_problems = [] + for dataset_info in dataset_infos: + table_path = dataset_info['path'] + target = dataset_info['target'] + + df = pd.read_csv(table_path) + dataset, problem_description = data_problem.generate_dataset_problem(df, + target_index=target, + task_keywords=[TaskKeyword.ANOMALY_DETECTION,], + performance_metrics=[{'metric': PerformanceMetric.F1}]) + + dataset_problems.append((dataset, problem_description)) + + return dataset_problems + +# FIXME: Currently only consider algorithm +def generate_pipelines(primitive_python_paths): + """ + Args: + primitive_python_paths: a list of primitive Python paths for algorithms + + Returns: + the pipline description json + """ + from d3m import index + from d3m.metadata.base import ArgumentType + from d3m.metadata.pipeline import 
Pipeline, PrimitiveStep + from axolotl.utils import pipeline as pipeline_utils + + pipelines = [] + for primitive_python_path in primitive_python_paths: + # Creating pipeline + pipeline_description = Pipeline() + pipeline_description.add_input(name='inputs') + + # The first three steps are fixed + # Step 0: dataset_to_dataframe + step_0 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.dataset_to_dataframe.Common')) + step_0.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='inputs.0') + step_0.add_output('produce') + pipeline_description.add_step(step_0) + + # Step 1: column_parser + step_1 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.column_parser.Common')) + step_1.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce') + step_1.add_output('produce') + pipeline_description.add_step(step_1) + + # Step 2: extract_columns_by_semantic_types(attributes) + step_2 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common')) + step_2.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce') + step_2.add_output('produce') + step_2.add_hyperparameter(name='semantic_types', argument_type=ArgumentType.VALUE, + data=['https://metadata.datadrivendiscovery.org/types/Attribute']) + pipeline_description.add_step(step_2) + + # Step 3: extract_columns_by_semantic_types(targets) + step_3 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common')) + step_3.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce') + step_3.add_output('produce') + step_3.add_hyperparameter(name='semantic_types', argument_type=ArgumentType.VALUE, + data=['https://metadata.datadrivendiscovery.org/types/TrueTarget']) + pipeline_description.add_step(step_3) + + attributes = 'steps.2.produce' + targets = 'steps.3.produce' + + # This one is what we want to test + test_step = PrimitiveStep(primitive=index.get_primitive(primitive_python_path)) + test_step.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference=attributes) + test_step.add_output('produce') + pipeline_description.add_step(test_step) + + # Finalize the pipeline + final_step = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.construct_predictions.Common')) + final_step.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.4.produce') + final_step.add_argument(name='reference', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce') + final_step.add_output('produce') + pipeline_description.add_step(final_step) + + pipeline_description.add_output(name='output predictions', data_reference='steps.5.produce') + + pipelines.append(pipeline_description) + + return pipelines + +def test(): + # datasets to be tested + dataset_infos = [ + { + 'path': 'datasets/anomaly/yahoo_sub_5/yahoo_sub_5_dataset/tables/learningData.csv', + 'target': 7 + }, + { + 'path': 'datasets/anomaly/kpi/kpi_dataset/tables/learningData.csv', + # 'path': 'datasets/anomaly/kpi/TRAIN/dataset_TRAIN/tables/learningData.csv', + 'target': 3 + }, + ] + + # Algorithms to be tested + # FIXME: Test more primitives + primitive_python_paths = [ + 'd3m.primitives.tods.detection_algorithm.pyod_ae', + 'd3m.primitives.tods.detection_algorithm.pyod_vae', + 
'd3m.primitives.tods.detection_algorithm.pyod_cof', + 'd3m.primitives.tods.detection_algorithm.pyod_sod', + 'd3m.primitives.tods.detection_algorithm.pyod_abod', + 'd3m.primitives.tods.detection_algorithm.pyod_hbos', + 'd3m.primitives.tods.detection_algorithm.pyod_iforest', + 'd3m.primitives.tods.detection_algorithm.pyod_lof', + 'd3m.primitives.tods.detection_algorithm.pyod_knn', + 'd3m.primitives.tods.detection_algorithm.pyod_ocsvm', + 'd3m.primitives.tods.detection_algorithm.pyod_loda', + # 'd3m.primitives.tods.detection_algorithm.pyod_cblof', + 'd3m.primitives.tods.detection_algorithm.pyod_sogaal', + 'd3m.primitives.tods.detection_algorithm.pyod_mogaal', + ] + + dataset_problems = generate_dataset_problems(dataset_infos) + pipelines = generate_pipelines(primitive_python_paths) + metrics = generate_metrics() + data_preparation_pipeline = generate_data_preparation_pipeline() + scoring_pipeline = generate_scoring_pipeline() + data_preparation_params = generate_data_preparation_params() + + # Start running + from axolotl.backend.simple import SimpleRunner + backend = SimpleRunner(random_seed=0) + for i, dataset_problem in enumerate(dataset_problems): + + dataset, problem_description = dataset_problem + for j, pipeline in enumerate(pipelines): + + print('Dataset:', i, 'Pipline:', j) + + pipeline_result = backend.evaluate_pipeline(problem_description=problem_description, + pipeline=pipeline, + input_data=[dataset], + metrics=metrics, + data_preparation_pipeline=data_preparation_pipeline, + scoring_pipeline=scoring_pipeline, + data_preparation_params=data_preparation_params) + print('Results') + print('----------------------------') + print(pipeline_result) + print('----------------------------') + if pipeline_result.status == 'ERRORED': + print('Scoring pipeline is {}'.format(scoring_pipeline.id)) + print('Data preparation pipeline is {}'.format(data_preparation_pipeline.id)) + raise ValueError('ERRORED for dataset {}, primitive {}'.format(dataset_infos[i], primitive_python_paths[j])) + +if __name__ == "__main__": + test() + + + + + diff --git a/install.sh b/install.sh new file mode 100644 index 0000000..ada87db --- /dev/null +++ b/install.sh @@ -0,0 +1,23 @@ +pip install -r requirements.txt + +cd d3m +pip install -e . +cd .. + +cd tods/common-primitives +pip install -e . +cd ../.. + +cd tods/common-primitives/sklearn-wrap +pip install -e . +cd ../../.. + +cd tods +pip3 install -e . +cd .. + +cd axolotl +pip3 install -e . +pip3 install -e .[cpu] +cd .. 
+ diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..a6bcea0 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,40 @@ +scikit-learn==0.21.3 +pytypes==1.0b5 +frozendict==1.2 +numpy>=1.15.4,<=1.18.1 +jsonschema==3.0.2 +requests>=2.19.1,<=2.22.0 +strict-rfc3339==0.7 +rfc3987==1.3.8 +webcolors>=1.8.1,<=1.10 +dateparser>=0.7.0,<=0.7.2 +python-dateutil==2.8.1 +pandas==0.23.4 +typing-inspect==0.5.0 +GitPython==3.1.0 +jsonpath-ng==1.4.3 +custom-inherit>=2.2.0,<=2.2.2 +PyYAML>=5.1,<=5.3 +pycurl>=7.43.0.2,<=7.43.0.3 +pyarrow==0.15.1 +gputil>=1.3.0,<=1.4.0 +pyrsistent>=0.14.11,<=0.15.7 +scipy>=1.2.1,<=1.4.1 +openml==0.10.1 +lightgbm>=2.2.2,<=2.3.0 +opencv-python-headless<=4.1.1.26,>=4.1 +imageio>=2.3.0,<=2.6.0 +pillow==6.2.1 +xgboost>=0.81,<=0.90 +Jinja2==2.9.4 +simplejson==3.12.0 +gitdb2==2.0.6 +grpcio +grpcio-tools +grpcio-testing +ray +networkx +-e git+https://gitlab.com/datadrivendiscovery/ta3ta2-api.git@a0e423e452d41ccf82edb772e4b7854cd2326606#egg=ta3ta2_api +-e git+https://gitlab.com/datadrivendiscovery/tests-data.git@753d974b322ccae79f674432a3ca79f2f734d4c2#egg=test_primitives&subdirectory=primitives +Keras==2.3.1 + diff --git a/test.sh b/test.sh new file mode 100644 index 0000000..af60b73 --- /dev/null +++ b/test.sh @@ -0,0 +1,40 @@ +#!/bin/bash + +#test_scripts=$(ls tests) +test_scripts=$(ls tests | grep -v -f tested_file.txt) + +for file in $test_scripts +do + for f in $tested_file + do + echo $f + done + echo $file + + # Test pipeline building + python tests/$file > tmp.txt 2>>tmp.txt + error=$(cat tmp.txt | grep 'Error' | wc -l) + echo "\t#Pipeline Building Errors:" $error + if [ "$error" -gt "0" ] + then + cat tmp.txt + #rm tmp.txt + break + fi + # Test on KPI dataset + #python3 -m d3m runtime fit-produce -p pipeline.yml -r datasets/anomaly/kpi/TRAIN/problem_TRAIN/problemDoc.json -i datasets/anomaly/kpi/TRAIN/dataset_TRAIN/datasetDoc.json -t datasets/anomaly/kpi/TEST/dataset_TEST/datasetDoc.json -o results.csv -O pipeline_run.yml + #python3 -m d3m runtime fit-produce -p pipeline.yml -r datasets/anomaly/kpi/TRAIN/problem_TRAIN/problemDoc.json -i datasets/anomaly/kpi/TRAIN/dataset_TRAIN/datasetDoc.json -t datasets/anomaly/kpi/TEST/dataset_TEST/datasetDoc.json -o results.csv 2>>tmp.txt + + # Test on Yahoo dataset + #python3 -m d3m runtime fit-produce -p pipeline.yml -r datasets/anomaly/yahoo_sub_5/TRAIN/problem_TRAIN/problemDoc.json -i datasets/anomaly/yahoo_sub_5/TRAIN/dataset_TRAIN/datasetDoc.json -t datasets/anomaly/yahoo_sub_5/TEST/dataset_TEST/datasetDoc.json -o results.csv -O pipeline_run.yml + python3 -m d3m runtime fit-produce -p pipeline.yml -r datasets/anomaly/yahoo_sub_5/TRAIN/problem_TRAIN/problemDoc.json -i datasets/anomaly/yahoo_sub_5/TRAIN/dataset_TRAIN/datasetDoc.json -t datasets/anomaly/yahoo_sub_5/TEST/dataset_TEST/datasetDoc.json -o results.csv 2> tmp.txt + error=$(cat tmp.txt | grep 'Error' | wc -l) + echo "\t#Pipeline Running Errors:" $error + if [ "$error" -gt "0" ] + then + cat tmp.txt + #rm tmp.txt + break + fi + echo $file >> tested_file.txt +done diff --git a/tested_file.txt b/tested_file.txt new file mode 100644 index 0000000..7778da3 --- /dev/null +++ b/tested_file.txt @@ -0,0 +1,130 @@ +build_ABOD_pipline.py +build_AutoEncoder.py +build_AutoRegODetect_pipeline.py +build_AxiswiseScale_pipline.py +build_BKFilter_pipline.py +build_CBLOF_pipline.py +build_CategoricalToBinary.py +build_ColumnFilter_pipeline.py +build_ContinuityValidation_pipline.py +build_DeepLog_pipeline.py +build_DiscreteCosineTransform.py 
+build_DuplicationValidation_pipline.py +build_FastFourierTransform.py +build_HBOS_pipline.py +build_HBOS_score_pipline.py +build_HPFilter_pipline.py +build_HoltSmoothing_pipline.py +build_HoltWintersExponentialSmoothing_pipline.py +build_IsolationForest_pipline.py +build_KDiscord_pipeline.py +build_KNN_pipline.py +build_LODA_pipline.py +build_LOF_pipline.py +build_LSTMOD_pipline.py +build_MatrixProfile_pipeline.py +build_MeanAverageTransform_pipline.py +build_NonNegativeMatrixFactorization.py +build_OCSVM_pipline.py +build_PCAODetect_pipeline.py +build_PowerTransform_pipline.py +build_PyodCOF.py +build_QuantileTransform_pipline.py +build_RuleBasedFilter_pipline.py +build_SOD_pipeline.py +build_SimpleExponentialSmoothing_pipline.py +build_Standardize_pipline.py +build_TRMF_pipline.py +build_TimeIntervalTransform_pipeline.py +build_TruncatedSVD_pipline.py +build_VariationalAutoEncoder.py +build_WaveletTransform_pipline.py +build_test_detection_algorithm_PyodMoGaal.py +build_test_detection_algorithm_PyodSoGaal.py +build_test_feature_analysis_spectral_residual_transform_pipeline.py +build_test_feature_analysis_statistical_abs_energy.py +build_test_feature_analysis_statistical_abs_sum.py +build_test_feature_analysis_statistical_gmean.py +build_test_feature_analysis_statistical_hmean.py +build_test_feature_analysis_statistical_kurtosis.py +build_test_feature_analysis_statistical_maximum.py +build_test_feature_analysis_statistical_mean.py +build_test_feature_analysis_statistical_mean_abs.py +build_test_feature_analysis_statistical_mean_abs_temporal_derivative.py +build_test_feature_analysis_statistical_mean_temporal_derivative.py +build_test_feature_analysis_statistical_median.py +build_test_feature_analysis_statistical_median_absolute_deviation.py +build_test_feature_analysis_statistical_minimum.py +build_test_feature_analysis_statistical_skew.py +build_test_feature_analysis_statistical_std.py +build_test_feature_analysis_statistical_var.py +build_test_feature_analysis_statistical_variation.py +build_test_feature_analysis_statistical_vec_sum.py +build_test_feature_analysis_statistical_willison_amplitude.py +build_test_feature_analysis_statistical_zero_crossing.py +build_test_time_series_seasonality_trend_decomposition.py +build_ABOD_pipline.py +build_AutoEncoder.py +build_AutoRegODetect_pipeline.py +build_AxiswiseScale_pipline.py +build_BKFilter_pipline.py +build_CBLOF_pipline.py +build_CategoricalToBinary.py +build_ColumnFilter_pipeline.py +build_ContinuityValidation_pipline.py +build_DeepLog_pipeline.py +build_DiscreteCosineTransform.py +build_DuplicationValidation_pipline.py +build_FastFourierTransform.py +build_HBOS_pipline.py +build_HBOS_score_pipline.py +build_HPFilter_pipline.py +build_HoltSmoothing_pipline.py +build_HoltWintersExponentialSmoothing_pipline.py +build_IsolationForest_pipline.py +build_KDiscord_pipeline.py +build_KNN_pipline.py +build_LODA_pipline.py +build_LOF_pipline.py +build_LSTMOD_pipline.py +build_MatrixProfile_pipeline.py +build_MeanAverageTransform_pipline.py +build_NonNegativeMatrixFactorization.py +build_OCSVM_pipline.py +build_PCAODetect_pipeline.py +build_PowerTransform_pipline.py +build_PyodCOF.py +build_QuantileTransform_pipline.py +build_RuleBasedFilter_pipline.py +build_SOD_pipeline.py +build_SimpleExponentialSmoothing_pipline.py +build_Standardize_pipline.py +build_TRMF_pipline.py +build_TimeIntervalTransform_pipeline.py +build_TruncatedSVD_pipline.py +build_VariationalAutoEncoder.py +build_WaveletTransform_pipline.py 
+build_test_detection_algorithm_PyodMoGaal.py +build_test_detection_algorithm_PyodSoGaal.py +build_test_feature_analysis_spectral_residual_transform_pipeline.py +build_test_feature_analysis_statistical_abs_energy.py +build_test_feature_analysis_statistical_abs_sum.py +build_test_feature_analysis_statistical_gmean.py +build_test_feature_analysis_statistical_hmean.py +build_test_feature_analysis_statistical_kurtosis.py +build_test_feature_analysis_statistical_maximum.py +build_test_feature_analysis_statistical_mean.py +build_test_feature_analysis_statistical_mean_abs.py +build_test_feature_analysis_statistical_mean_abs_temporal_derivative.py +build_test_feature_analysis_statistical_mean_temporal_derivative.py +build_test_feature_analysis_statistical_median.py +build_test_feature_analysis_statistical_median_absolute_deviation.py +build_test_feature_analysis_statistical_minimum.py +build_test_feature_analysis_statistical_skew.py +build_test_feature_analysis_statistical_std.py +build_test_feature_analysis_statistical_var.py +build_test_feature_analysis_statistical_variation.py +build_test_feature_analysis_statistical_vec_sum.py +build_test_feature_analysis_statistical_willison_amplitude.py +build_test_feature_analysis_statistical_zero_crossing.py +build_test_time_series_seasonality_trend_decomposition.py diff --git a/tests/build_ABOD_pipline.py b/tests/build_ABOD_pipline.py new file mode 100644 index 0000000..6baad03 --- /dev/null +++ b/tests/build_ABOD_pipline.py @@ -0,0 +1,70 @@ +from d3m import index +from d3m.metadata.base import ArgumentType +from d3m.metadata.pipeline import Pipeline, PrimitiveStep + + +# Creating pipeline +pipeline_description = Pipeline() +pipeline_description.add_input(name='inputs') + +# Step 0: dataset_to_dataframe +step_0 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.tods.data_processing.dataset_to_dataframe')) +step_0.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='inputs.0') +step_0.add_output('produce') +pipeline_description.add_step(step_0) + +# Step 1: column_parser +step_1 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.column_parser.Common')) +step_1.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce') +step_1.add_output('produce') +pipeline_description.add_step(step_1) + +# Step 2: extract_columns_by_semantic_types(attributes) +step_2 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common')) +step_2.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce') +step_2.add_output('produce') +step_2.add_hyperparameter(name='semantic_types', argument_type=ArgumentType.VALUE, + data=['https://metadata.datadrivendiscovery.org/types/Attribute']) +pipeline_description.add_step(step_2) + +# Step 3: extract_columns_by_semantic_types(targets) +step_3 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common')) +step_3.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce') +step_3.add_output('produce') +step_3.add_hyperparameter(name='semantic_types', argument_type=ArgumentType.VALUE, + data=['https://metadata.datadrivendiscovery.org/types/TrueTarget']) +pipeline_description.add_step(step_3) + +attributes = 'steps.2.produce' +targets = 'steps.3.produce' + +# Step 4: imputer +step_4 = 
PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_cleaning.imputer.SKlearn')) +step_4.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference=attributes) +step_4.add_output('produce') +pipeline_description.add_step(step_4) + +# Step 5: ABOD +step_5 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.tods.detection_algorithm.pyod_abod')) +step_5.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.4.produce') + +step_5.add_hyperparameter(name='use_semantic_types', argument_type=ArgumentType.VALUE, data=True) +step_5.add_hyperparameter(name='contamination', argument_type=ArgumentType.VALUE, data=0.1) +step_5.add_hyperparameter(name='use_columns', argument_type=ArgumentType.VALUE, data=(2, 4,)) +step_5.add_hyperparameter(name='return_result', argument_type=ArgumentType.VALUE, data='replace') + +step_5.add_output('produce') +pipeline_description.add_step(step_5) + +# Final Output +pipeline_description.add_output(name='output predictions', data_reference='steps.5.produce') + +# Output to YAML +yaml = pipeline_description.to_yaml() +with open('pipeline.yml', 'w') as f: + f.write(yaml) +print(yaml) + +# Or you can output json +#data = pipline_description.to_json() + diff --git a/tests/build_AutoEncoder.py b/tests/build_AutoEncoder.py new file mode 100644 index 0000000..951da1a --- /dev/null +++ b/tests/build_AutoEncoder.py @@ -0,0 +1,67 @@ +from d3m import index +from d3m.metadata.base import ArgumentType +from d3m.metadata.pipeline import Pipeline, PrimitiveStep + +# -> dataset_to_dataframe -> column_parser -> extract_columns_by_semantic_types(attributes) -> imputer -> random_forest +# extract_columns_by_semantic_types(targets) -> ^ + +# Creating pipeline +pipeline_description = Pipeline() +pipeline_description.add_input(name='inputs') + +# Step 0: dataset_to_dataframe +step_0 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.tods.data_processing.dataset_to_dataframe')) +step_0.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='inputs.0') +step_0.add_output('produce') +pipeline_description.add_step(step_0) + +# Step 1: column_parser +step_1 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.column_parser.Common')) +step_1.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce') +step_1.add_output('produce') +pipeline_description.add_step(step_1) + +# Step 2: extract_columns_by_semantic_types(attributes) +step_2 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common')) +step_2.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce') +step_2.add_output('produce') +step_2.add_hyperparameter(name='semantic_types', argument_type=ArgumentType.VALUE, + data=['https://metadata.datadrivendiscovery.org/types/Attribute']) +pipeline_description.add_step(step_2) + +# Step 3: extract_columns_by_semantic_types(targets) +step_3 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common')) +step_3.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce') +step_3.add_output('produce') +step_3.add_hyperparameter(name='semantic_types', argument_type=ArgumentType.VALUE, + data=['https://metadata.datadrivendiscovery.org/types/TrueTarget']) +pipeline_description.add_step(step_3) + +attributes = 
'steps.2.produce' +targets = 'steps.3.produce' + +# Step 4: imputer +step_4 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_cleaning.imputer.SKlearn')) +step_4.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference=attributes) +step_4.add_output('produce') +pipeline_description.add_step(step_4) + +# Step 5: auto encoder +step_5 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.tods.detection_algorithm.pyod_ae')) +step_5.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference=attributes) +step_5.add_output('produce') +pipeline_description.add_step(step_5) + + +# Final Output +pipeline_description.add_output(name='output predictions', data_reference='steps.5.produce') + +# Output to YAML +yaml = pipeline_description.to_yaml() +with open('pipeline.yml', 'w') as f: + f.write(yaml) +print(yaml) + +# Or you can output json +#data = pipline_description.to_json() + diff --git a/tests/build_AutoRegODetect_pipeline.py b/tests/build_AutoRegODetect_pipeline.py new file mode 100644 index 0000000..1034b97 --- /dev/null +++ b/tests/build_AutoRegODetect_pipeline.py @@ -0,0 +1,71 @@ +from d3m import index +from d3m.metadata.base import ArgumentType +from d3m.metadata.pipeline import Pipeline, PrimitiveStep +from d3m.metadata import hyperparams +import numpy as np + +# -> dataset_to_dataframe -> column_parser -> extract_columns_by_semantic_types(attributes) -> imputer -> random_forest +# extract_columns_by_semantic_types(targets) -> ^ + +# Creating pipeline +pipeline_description = Pipeline() +pipeline_description.add_input(name='inputs') + +# Step 0: dataset_to_dataframe +primitive_0 = index.get_primitive('d3m.primitives.tods.data_processing.dataset_to_dataframe') +step_0 = PrimitiveStep(primitive=primitive_0) +step_0.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='inputs.0') +step_0.add_output('produce') +pipeline_description.add_step(step_0) + +# # Step 1: column_parser +primitive_1 = index.get_primitive('d3m.primitives.data_transformation.column_parser.Common') +step_1 = PrimitiveStep(primitive=primitive_1) +step_1.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce') +step_1.add_output('produce') +pipeline_description.add_step(step_1) + +# Step 2: extract_columns_by_semantic_types(attributes) +step_2 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common')) +step_2.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce') +step_2.add_output('produce') +step_2.add_hyperparameter(name='semantic_types', argument_type=ArgumentType.VALUE, data=['https://metadata.datadrivendiscovery.org/types/Attribute']) +pipeline_description.add_step(step_2) + +# # Step 3: Standardization +primitive_3 = index.get_primitive('d3m.primitives.tods.timeseries_processing.transformation.standard_scaler') +step_3 = PrimitiveStep(primitive=primitive_3) +step_3.add_hyperparameter(name='use_semantic_types', argument_type=ArgumentType.VALUE, data=True) +step_3.add_hyperparameter(name='use_columns', argument_type=ArgumentType.VALUE, data=(1,2,3,4,5,)) +step_3.add_hyperparameter(name='return_result', argument_type=ArgumentType.VALUE, data='new') +step_3.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.2.produce') +step_3.add_output('produce') +pipeline_description.add_step(step_3) + +# # Step 4: test primitive +primitive_4 = 
index.get_primitive('d3m.primitives.tods.detection_algorithm.AutoRegODetector') +step_4 = PrimitiveStep(primitive=primitive_4) +step_4.add_hyperparameter(name='contamination', argument_type=ArgumentType.VALUE, data=0.1) +step_4.add_hyperparameter(name='window_size', argument_type=ArgumentType.VALUE, data=10) +# step_4.add_hyperparameter(name='weights', argument_type=ArgumentType.VALUE, data=weights_ndarray) +step_4.add_hyperparameter(name='use_semantic_types', argument_type=ArgumentType.VALUE, data=False) +# step_4.add_hyperparameter(name='use_columns', argument_type=ArgumentType.VALUE, data=(2,3,4,5,6)) # There is sth wrong with multi-dimensional +step_4.add_hyperparameter(name='return_result', argument_type=ArgumentType.VALUE, data='append') +step_4.add_hyperparameter(name='return_subseq_inds', argument_type=ArgumentType.VALUE, data=True) +step_4.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.3.produce') +step_4.add_output('produce') +step_4.add_output('produce_score') +pipeline_description.add_step(step_4) + +# Final Output +pipeline_description.add_output(name='output predictions', data_reference='steps.4.produce') + +# Output to YAML +yaml = pipeline_description.to_yaml() +with open('pipeline.yml', 'w') as f: + f.write(yaml) +print(yaml) + +# Or you can output json +#data = pipline_description.to_json() + diff --git a/tests/build_AxiswiseScale_pipline.py b/tests/build_AxiswiseScale_pipline.py new file mode 100644 index 0000000..7c51ab2 --- /dev/null +++ b/tests/build_AxiswiseScale_pipline.py @@ -0,0 +1,50 @@ +from d3m import index +from d3m.metadata.base import ArgumentType +from d3m.metadata.pipeline import Pipeline, PrimitiveStep +from d3m.metadata import hyperparams +import copy + +# -> dataset_to_dataframe -> column_parser -> extract_columns_by_semantic_types(attributes) -> imputer -> random_forest +# extract_columns_by_semantic_types(targets) -> ^ + +# Creating pipeline +pipeline_description = Pipeline() +pipeline_description.add_input(name='inputs') + +# Step 0: dataset_to_dataframe +primitive_0 = index.get_primitive('d3m.primitives.tods.data_processing.dataset_to_dataframe') +step_0 = PrimitiveStep(primitive=primitive_0) +step_0.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='inputs.0') +step_0.add_output('produce') +pipeline_description.add_step(step_0) + +# # Step 1: column_parser +primitive_1 = index.get_primitive('d3m.primitives.data_transformation.column_parser.Common') +step_1 = PrimitiveStep(primitive=primitive_1) +step_1.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce') +step_1.add_output('produce') +pipeline_description.add_step(step_1) + +# # Step 2: test primitive +primitive_2 = index.get_primitive('d3m.primitives.tods.timeseries_processing.transformation.axiswise_scaler') +step_2 = PrimitiveStep(primitive=primitive_2) +step_2.add_hyperparameter(name='use_semantic_types', argument_type=ArgumentType.VALUE, data=True) +step_2.add_hyperparameter(name='use_columns', argument_type=ArgumentType.VALUE, data=(2,)) +step_2.add_hyperparameter(name='return_result', argument_type=ArgumentType.VALUE, data='append') +step_2.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce') +step_2.add_output('produce') +pipeline_description.add_step(step_2) + + +# Final Output +pipeline_description.add_output(name='output predictions', data_reference='steps.2.produce') + +# Output to YAML +yaml = 
pipeline_description.to_yaml() +with open('pipeline.yml', 'w') as f: + f.write(yaml) +print(yaml) + +# Or you can output json +#data = pipline_description.to_json() + diff --git a/tests/build_BKFilter_pipline.py b/tests/build_BKFilter_pipline.py new file mode 100644 index 0000000..e68fb68 --- /dev/null +++ b/tests/build_BKFilter_pipline.py @@ -0,0 +1,44 @@ +from d3m import index +from d3m.metadata.base import ArgumentType +from d3m.metadata.pipeline import Pipeline, PrimitiveStep + + +# Creating pipeline +pipeline_description = Pipeline() +pipeline_description.add_input(name='inputs') + +# Step 0: dataset_to_dataframe +step_0 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.tods.data_processing.dataset_to_dataframe')) +step_0.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='inputs.0') +step_0.add_output('produce') +pipeline_description.add_step(step_0) + + +# Step 1: column_parser +step_1 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.column_parser.Common')) +step_1.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce') +step_1.add_output('produce') +pipeline_description.add_step(step_1) + + +# Step 2: BKFilter +step_2 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.tods.feature_analysis.bk_filter')) +# step_2.add_hyperparameter(name = 'columns_using_method', argument_type=ArgumentType.VALUE, data = 'name') +step_2.add_hyperparameter(name = 'use_semantic_types', argument_type=ArgumentType.VALUE, data = True) +step_2.add_hyperparameter(name = 'use_columns', argument_type=ArgumentType.VALUE, data = (2,3)) +step_2.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce') +step_2.add_output('produce') +pipeline_description.add_step(step_2) + +# Final Output +pipeline_description.add_output(name='output predictions', data_reference='steps.2.produce') + +# Output to YAML +yaml = pipeline_description.to_yaml() +with open('pipeline.yml', 'w') as f: + f.write(yaml) +print(yaml) + +# Or you can output json +#data = pipline_description.to_json() + diff --git a/tests/build_CBLOF_pipline.py b/tests/build_CBLOF_pipline.py new file mode 100644 index 0000000..302727d --- /dev/null +++ b/tests/build_CBLOF_pipline.py @@ -0,0 +1,51 @@ +from d3m import index +from d3m.metadata.base import ArgumentType +from d3m.metadata.pipeline import Pipeline, PrimitiveStep +from d3m.metadata import hyperparams +import copy + +# -> dataset_to_dataframe -> column_parser -> extract_columns_by_semantic_types(attributes) -> imputer -> random_forest +# extract_columns_by_semantic_types(targets) -> ^ + +# Creating pipeline +pipeline_description = Pipeline() +pipeline_description.add_input(name='inputs') + +# Step 0: dataset_to_dataframe +primitive_0 = index.get_primitive('d3m.primitives.tods.data_processing.dataset_to_dataframe') +step_0 = PrimitiveStep(primitive=primitive_0) +step_0.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='inputs.0') +step_0.add_output('produce') +pipeline_description.add_step(step_0) + +# # Step 1: column_parser +primitive_1 = index.get_primitive('d3m.primitives.data_transformation.column_parser.Common') +step_1 = PrimitiveStep(primitive=primitive_1) +step_1.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce') +step_1.add_output('produce') +pipeline_description.add_step(step_1) + +# # Step 2: test primitive +primitive_2 = 
index.get_primitive('d3m.primitives.tods.detection_algorithm.pyod_cblof') + +step_2 = PrimitiveStep(primitive=primitive_2) +step_2.add_hyperparameter(name='contamination', argument_type=ArgumentType.VALUE, data=0.1) +step_2.add_hyperparameter(name='use_semantic_types', argument_type=ArgumentType.VALUE, data=True) +step_2.add_hyperparameter(name='use_columns', argument_type=ArgumentType.VALUE, data=(2,)) # There is sth wrong with multi-dimensional +step_2.add_hyperparameter(name='return_result', argument_type=ArgumentType.VALUE, data='append') +step_2.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce') +step_2.add_output('produce') +pipeline_description.add_step(step_2) + +# Final Output +pipeline_description.add_output(name='output predictions', data_reference='steps.2.produce') + +# Output to YAML +yaml = pipeline_description.to_yaml() +with open('pipeline.yml', 'w') as f: + f.write(yaml) +print(yaml) + +# Or you can output json +#data = pipline_description.to_json() + diff --git a/tests/build_CategoricalToBinary.py b/tests/build_CategoricalToBinary.py new file mode 100644 index 0000000..a29399c --- /dev/null +++ b/tests/build_CategoricalToBinary.py @@ -0,0 +1,48 @@ +from d3m import index +from d3m.metadata.base import ArgumentType +from d3m.metadata.pipeline import Pipeline, PrimitiveStep + +# -> dataset_to_dataframe -> column_parser -> extract_columns_by_semantic_types(attributes) -> imputer -> random_forest +# extract_columns_by_semantic_types(targets) -> ^ + +# Creating pipeline +pipeline_description = Pipeline() +pipeline_description.add_input(name='inputs') + + +# Step 0: dataset_to_dataframe +primitive_0 = index.get_primitive('d3m.primitives.tods.data_processing.dataset_to_dataframe') +step_0 = PrimitiveStep(primitive=primitive_0) +step_0.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='inputs.0') +step_0.add_output('produce') +pipeline_description.add_step(step_0) + +# Step 1: Column Parser +primitive_1 = index.get_primitive('d3m.primitives.data_transformation.column_parser.Common') +step_1 = PrimitiveStep(primitive=primitive_1) +step_1.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce') +step_1.add_output('produce') +pipeline_description.add_step(step_1) + +# Step 2: Categorical to Binary +primitive_2 = index.get_primitive('d3m.primitives.tods.data_processing.categorical_to_binary') +step_2 = PrimitiveStep(primitive=primitive_2) +step_2.add_hyperparameter(name='use_semantic_types', argument_type=ArgumentType.VALUE, data=True) +step_2.add_hyperparameter(name='use_columns', argument_type=ArgumentType.VALUE, data=(3,)) +step_2.add_hyperparameter(name='return_result', argument_type=ArgumentType.VALUE, data='append') +step_2.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce') +step_2.add_output('produce') +pipeline_description.add_step(step_2) + + +# Final Output +pipeline_description.add_output(name='output predictions', data_reference='steps.2.produce') + +# Output to YAML +yaml = pipeline_description.to_yaml() +with open('pipeline.yml', 'w') as f: + f.write(yaml) +print(yaml) + +# Or you can output json +#data = pipline_description.to_json() diff --git a/tests/build_ColumnFilter_pipeline.py b/tests/build_ColumnFilter_pipeline.py new file mode 100644 index 0000000..9afa446 --- /dev/null +++ b/tests/build_ColumnFilter_pipeline.py @@ -0,0 +1,49 @@ +from d3m import index +from d3m.metadata.base import 
ArgumentType +from d3m.metadata.pipeline import Pipeline, PrimitiveStep + +# -> dataset_to_dataframe -> column_parser -> extract_columns_by_semantic_types(attributes) -> imputer -> random_forest +# extract_columns_by_semantic_types(targets) -> ^ + +# Creating pipeline +pipeline_description = Pipeline() +pipeline_description.add_input(name='inputs') + +# Step 0: dataset_to_dataframe +primitive_0 = index.get_primitive('d3m.primitives.data_transformation.dataset_to_dataframe.Common') +step_0 = PrimitiveStep(primitive=primitive_0) +step_0.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='inputs.0') +step_0.add_output('produce') +pipeline_description.add_step(step_0) + +#Step 1: column_parser +step_1 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.column_parser.Common')) +step_1.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce') +step_1.add_output('produce') +pipeline_description.add_step(step_1) + +primitive_2 = index.get_primitive('d3m.primitives.tods.feature_analysis.auto_correlation') +step_2 = PrimitiveStep(primitive=primitive_2) +step_2.add_hyperparameter(name="use_semantic_types", argument_type=ArgumentType.VALUE, data = True) +step_2.add_hyperparameter(name="use_columns", argument_type=ArgumentType.VALUE, data = (2, 3)) +step_2.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce') +step_2.add_output('produce') +pipeline_description.add_step(step_2) + +primitive_3 = index.get_primitive('d3m.primitives.tods.data_processing.column_filter') +step_3 = PrimitiveStep(primitive=primitive_3) +step_3.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.2.produce') +step_3.add_output('produce') +pipeline_description.add_step(step_3) + +# Final Output +pipeline_description.add_output(name='output predictions', data_reference='steps.3.produce') + +# Output to YAML +yaml = pipeline_description.to_yaml() +with open('pipeline.yml', 'w') as f: + f.write(yaml) +print(yaml) + +# Or you can output json +#data = pipline_description.to_json() diff --git a/tests/build_ContinuityValidation_pipline.py b/tests/build_ContinuityValidation_pipline.py new file mode 100644 index 0000000..c42310b --- /dev/null +++ b/tests/build_ContinuityValidation_pipline.py @@ -0,0 +1,43 @@ +from d3m import index +from d3m.metadata.base import ArgumentType +from d3m.metadata.pipeline import Pipeline, PrimitiveStep + +# Creating pipeline +pipeline_description = Pipeline() +pipeline_description.add_input(name='inputs') + +# Step 0: dataset_to_dataframe +step_0 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.tods.data_processing.dataset_to_dataframe')) +step_0.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='inputs.0') +step_0.add_output('produce') +pipeline_description.add_step(step_0) + +# Step 1: column_parser +step_1 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.column_parser.Common')) +step_1.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce') +step_1.add_output('produce') +pipeline_description.add_step(step_1) + + +# Step 2: ContinuityValidation +step_2 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.tods.data_processing.continuity_validation')) +step_2.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce') +step_2.add_output('produce') 
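+# Note on the two hyperparameters set below: this is an interpretation based only on the option
+# names used in this script, not on the primitive's documentation. 'imputation' is assumed to keep
+# every row and fill gaps in the timestamp sequence, the commented-out 'ablation' alternative is
+# assumed to instead drop the rows that break continuity, and 'interval' is taken to be the
+# expected spacing between consecutive timestamps.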
+step_2.add_hyperparameter(name = 'continuity_option', argument_type=ArgumentType.VALUE, data = 'imputation') +step_2.add_hyperparameter(name = 'interval', argument_type=ArgumentType.VALUE, data = 0.3) +# Or: +# step_2.add_hyperparameter(name = 'continuity_option', argument_type=ArgumentType.VALUE, data = 'ablation') +pipeline_description.add_step(step_2) + +# Final Output +pipeline_description.add_output(name='output predictions', data_reference='steps.2.produce') + +# Output to YAML +yaml = pipeline_description.to_yaml() +with open('pipeline.yml', 'w') as f: + f.write(yaml) +print(yaml) + +# Or you can output json +#data = pipline_description.to_json() + diff --git a/tests/build_DeepLog_pipeline.py b/tests/build_DeepLog_pipeline.py new file mode 100644 index 0000000..32c69d0 --- /dev/null +++ b/tests/build_DeepLog_pipeline.py @@ -0,0 +1,49 @@ +from d3m import index +from d3m.metadata.base import ArgumentType +from d3m.metadata.pipeline import Pipeline, PrimitiveStep +from d3m.metadata import hyperparams + +# -> dataset_to_dataframe -> column_parser -> extract_columns_by_semantic_types(attributes) -> imputer -> random_forest +# extract_columns_by_semantic_types(targets) -> ^ + +# Creating pipeline +pipeline_description = Pipeline() +pipeline_description.add_input(name='inputs') + +# Step 0: dataset_to_dataframe +primitive_0 = index.get_primitive('d3m.primitives.data_transformation.dataset_to_dataframe.Common') +step_0 = PrimitiveStep(primitive=primitive_0) +step_0.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='inputs.0') +step_0.add_output('produce') +pipeline_description.add_step(step_0) + +# # Step 1: column_parser +primitive_1 = index.get_primitive('d3m.primitives.data_transformation.column_parser.Common') +step_1 = PrimitiveStep(primitive=primitive_1) +step_1.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce') +step_1.add_output('produce') +pipeline_description.add_step(step_1) + +# # Step 2: test primitive +primitive_2 = index.get_primitive('d3m.primitives.tods.detection_algorithm.deeplog') + +step_2 = PrimitiveStep(primitive=primitive_2) +#step_2.add_hyperparameter(name='contamination', argument_type=ArgumentType.VALUE, data=0.1) +step_2.add_hyperparameter(name='use_semantic_types', argument_type=ArgumentType.VALUE, data=True) +step_2.add_hyperparameter(name='use_columns', argument_type=ArgumentType.VALUE, data=(2,)) # There is sth wrong with multi-dimensional +step_2.add_hyperparameter(name='return_result', argument_type=ArgumentType.VALUE, data='append') +step_2.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce') +step_2.add_output('produce') +pipeline_description.add_step(step_2) + +# # Final Output +pipeline_description.add_output(name='output predictions', data_reference='steps.2.produce') + +# # Output to YAML +yaml = pipeline_description.to_yaml() +with open('pipeline.yml', 'w') as f: + f.write(yaml) +print(yaml) + +# Or you can output json +#data = pipline_description.to_json() diff --git a/tests/build_DiscreteCosineTransform.py b/tests/build_DiscreteCosineTransform.py new file mode 100644 index 0000000..c3cc52f --- /dev/null +++ b/tests/build_DiscreteCosineTransform.py @@ -0,0 +1,50 @@ +from d3m import index +from d3m.metadata.base import ArgumentType +from d3m.metadata.pipeline import Pipeline, PrimitiveStep + +# -> dataset_to_dataframe -> column_parser -> extract_columns_by_semantic_types(attributes) -> imputer -> random_forest +# 
extract_columns_by_semantic_types(targets) -> ^ + +# Creating pipeline +pipeline_description = Pipeline() +pipeline_description.add_input(name='inputs') + + +# Step 0: dataset_to_dataframe +primitive_0 = index.get_primitive('d3m.primitives.tods.data_processing.dataset_to_dataframe') +step_0 = PrimitiveStep(primitive=primitive_0) +step_0.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='inputs.0') +step_0.add_output('produce') +pipeline_description.add_step(step_0) + +# Step 1: Column Parser +primitive_1 = index.get_primitive('d3m.primitives.data_transformation.column_parser.Common') +step_1 = PrimitiveStep(primitive=primitive_1) +step_1.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce') +step_1.add_output('produce') +pipeline_description.add_step(step_1) + + +# Step 2: Discrete Cosine Transform +primitive_2 = index.get_primitive('d3m.primitives.tods.feature_analysis.discrete_cosine_transform') +step_2 = PrimitiveStep(primitive=primitive_2) +step_2.add_hyperparameter(name='use_semantic_types', argument_type=ArgumentType.VALUE, data=True) +step_2.add_hyperparameter(name='use_columns', argument_type=ArgumentType.VALUE, data=(2,3,4)) +step_2.add_hyperparameter(name='return_result', argument_type=ArgumentType.VALUE, data='append') +step_2.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce') +step_2.add_output('produce') +pipeline_description.add_step(step_2) + + +# Final Output +pipeline_description.add_output(name='output predictions', data_reference='steps.2.produce') + +# Output to YAML +yaml = pipeline_description.to_yaml() +with open('pipeline.yml', 'w') as f: + f.write(yaml) +print(yaml) + +# Or you can output json +#data = pipline_description.to_json() + diff --git a/tests/build_DuplicationValidation_pipline.py b/tests/build_DuplicationValidation_pipline.py new file mode 100644 index 0000000..6471ac9 --- /dev/null +++ b/tests/build_DuplicationValidation_pipline.py @@ -0,0 +1,42 @@ +from d3m import index +from d3m.metadata.base import ArgumentType +from d3m.metadata.pipeline import Pipeline, PrimitiveStep + + +# Creating pipeline +pipeline_description = Pipeline() +pipeline_description.add_input(name='inputs') + +# Step 0: dataset_to_dataframe +step_0 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.tods.data_processing.dataset_to_dataframe')) +step_0.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='inputs.0') +step_0.add_output('produce') +pipeline_description.add_step(step_0) + + +# Step 1: column_parser +step_1 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.column_parser.Common')) +step_1.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce') +step_1.add_output('produce') +pipeline_description.add_step(step_1) + + +# Step 2: DuplicationValidation +step_2 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.tods.data_processing.duplication_validation')) +step_2.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce') +step_2.add_output('produce') +step_2.add_hyperparameter(name = 'keep_option', argument_type=ArgumentType.VALUE, data = 'average') # Or: 'first' +pipeline_description.add_step(step_2) + +# Final Output +pipeline_description.add_output(name='output predictions', data_reference='steps.2.produce') + +# Output to YAML +yaml = pipeline_description.to_yaml() +with 
open('pipeline.yml', 'w') as f: + f.write(yaml) +print(yaml) + +# Or you can output json +#data = pipline_description.to_json() + diff --git a/tests/build_FastFourierTransform.py b/tests/build_FastFourierTransform.py new file mode 100644 index 0000000..10a8914 --- /dev/null +++ b/tests/build_FastFourierTransform.py @@ -0,0 +1,48 @@ +from d3m import index +from d3m.metadata.base import ArgumentType +from d3m.metadata.pipeline import Pipeline, PrimitiveStep + +# -> dataset_to_dataframe -> column_parser -> extract_columns_by_semantic_types(attributes) -> imputer -> random_forest +# extract_columns_by_semantic_types(targets) -> ^ + +# Creating pipeline +pipeline_description = Pipeline() +pipeline_description.add_input(name='inputs') + + +# Step 0: dataset_to_dataframe +primitive_0 = index.get_primitive('d3m.primitives.tods.data_processing.dataset_to_dataframe') +step_0 = PrimitiveStep(primitive=primitive_0) +step_0.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='inputs.0') +step_0.add_output('produce') +pipeline_description.add_step(step_0) + +# Step 1: Column Parser +primitive_1 = index.get_primitive('d3m.primitives.data_transformation.column_parser.Common') +step_1 = PrimitiveStep(primitive=primitive_1) +step_1.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce') +step_1.add_output('produce') +pipeline_description.add_step(step_1) + +# Step 2: Fast Fourier Transform +primitive_2 = index.get_primitive('d3m.primitives.tods.feature_analysis.fast_fourier_transform') +step_2 = PrimitiveStep(primitive=primitive_2) +step_2.add_hyperparameter(name='use_semantic_types', argument_type=ArgumentType.VALUE, data=True) +step_2.add_hyperparameter(name='use_columns', argument_type=ArgumentType.VALUE, data=(2,3,4)) +step_2.add_hyperparameter(name='return_result', argument_type=ArgumentType.VALUE, data='append') +step_2.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce') +step_2.add_output('produce') +pipeline_description.add_step(step_2) + + +# Final Output +pipeline_description.add_output(name='output predictions', data_reference='steps.2.produce') + +# Output to YAML +yaml = pipeline_description.to_yaml() +with open('pipeline.yml', 'w') as f: + f.write(yaml) +print(yaml) + +# Or you can output json +#data = pipline_description.to_json() diff --git a/tests/build_HBOS_pipline.py b/tests/build_HBOS_pipline.py new file mode 100644 index 0000000..a32645c --- /dev/null +++ b/tests/build_HBOS_pipline.py @@ -0,0 +1,68 @@ +from d3m import index +from d3m.metadata.base import ArgumentType +from d3m.metadata.pipeline import Pipeline, PrimitiveStep + + +# Creating pipeline +pipeline_description = Pipeline() +pipeline_description.add_input(name='inputs') + +# Step 0: dataset_to_dataframe +step_0 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.tods.data_processing.dataset_to_dataframe')) +step_0.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='inputs.0') +step_0.add_output('produce') +pipeline_description.add_step(step_0) + +# Step 1: column_parser +step_1 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.column_parser.Common')) +step_1.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce') +step_1.add_output('produce') +pipeline_description.add_step(step_1) + +# Step 2: extract_columns_by_semantic_types(attributes) +step_2 = 
PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common')) +step_2.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce') +step_2.add_output('produce') +step_2.add_hyperparameter(name='semantic_types', argument_type=ArgumentType.VALUE, + data=['https://metadata.datadrivendiscovery.org/types/Attribute']) +pipeline_description.add_step(step_2) + +# Step 3: extract_columns_by_semantic_types(targets) +step_3 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common')) +step_3.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce') +step_3.add_output('produce') +step_3.add_hyperparameter(name='semantic_types', argument_type=ArgumentType.VALUE, + data=['https://metadata.datadrivendiscovery.org/types/TrueTarget']) +pipeline_description.add_step(step_3) + +attributes = 'steps.2.produce' +targets = 'steps.3.produce' + +# Step 4: imputer +step_4 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_cleaning.imputer.SKlearn')) +step_4.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference=attributes) +step_4.add_output('produce') +pipeline_description.add_step(step_4) + +# Step 5: HBOS +step_5 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.tods.detection_algorithm.pyod_hbos')) +step_5.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.4.produce') + +step_5.add_hyperparameter(name='contamination', argument_type=ArgumentType.VALUE, data=0.1) +# step_5.add_hyperparameter(name='return_result', argument_type=ArgumentType.VALUE, data='append') + +step_5.add_output('produce') +pipeline_description.add_step(step_5) + +# Final Output +pipeline_description.add_output(name='output predictions', data_reference='steps.5.produce') + +# Output to YAML +yaml = pipeline_description.to_yaml() +with open('pipeline.yml', 'w') as f: + f.write(yaml) +print(yaml) + +# Or you can output json +#data = pipline_description.to_json() + diff --git a/tests/build_HBOS_score_pipline.py b/tests/build_HBOS_score_pipline.py new file mode 100644 index 0000000..70e6661 --- /dev/null +++ b/tests/build_HBOS_score_pipline.py @@ -0,0 +1,71 @@ +from d3m import index +from d3m.metadata.base import ArgumentType +from d3m.metadata.pipeline import Pipeline, PrimitiveStep + + +# Creating pipeline +pipeline_description = Pipeline() +pipeline_description.add_input(name='inputs') + +# Step 0: dataset_to_dataframe +step_0 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.tods.data_processing.dataset_to_dataframe')) +step_0.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='inputs.0') +step_0.add_output('produce') +pipeline_description.add_step(step_0) + +# Step 1: column_parser +step_1 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.column_parser.Common')) +step_1.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce') +step_1.add_output('produce') +pipeline_description.add_step(step_1) + +# Step 2: extract_columns_by_semantic_types(attributes) +step_2 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common')) +step_2.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce') +step_2.add_output('produce') 
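+# The semantic-type URI supplied below restricts this extraction step to columns tagged as
+# attributes (the feature columns produced by column_parser); step 3 uses the same primitive with
+# the TrueTarget type to pull out the label column, mirroring the other pipeline scripts above.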
+step_2.add_hyperparameter(name='semantic_types', argument_type=ArgumentType.VALUE, + data=['https://metadata.datadrivendiscovery.org/types/Attribute']) +pipeline_description.add_step(step_2) + +# Step 3: extract_columns_by_semantic_types(targets) +step_3 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common')) +step_3.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce') +step_3.add_output('produce') +step_3.add_hyperparameter(name='semantic_types', argument_type=ArgumentType.VALUE, + data=['https://metadata.datadrivendiscovery.org/types/TrueTarget']) +pipeline_description.add_step(step_3) + +attributes = 'steps.2.produce' +targets = 'steps.3.produce' + +# Step 4: imputer +step_4 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_cleaning.imputer.SKlearn')) +step_4.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference=attributes) +step_4.add_output('produce') +pipeline_description.add_step(step_4) + +# Step 5: HBOS +step_5 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.tods.detection_algorithm.pyod_hbos')) +step_5.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.4.produce') + +step_5.add_hyperparameter(name='contamination', argument_type=ArgumentType.VALUE, data=0.1) +step_5.add_hyperparameter(name='return_subseq_inds', argument_type=ArgumentType.VALUE, data=True) +# step_5.add_hyperparameter(name='return_result', argument_type=ArgumentType.VALUE, data='append') + +step_5.add_output('produce_score') +step_5.add_output('produce') +pipeline_description.add_step(step_5) + +# Final Output +pipeline_description.add_output(name='output predictions', data_reference='steps.5.produce') +# pipeline_description.add_output(name='output score', data_reference='steps.5.produce_score') + +# Output to YAML +yaml = pipeline_description.to_yaml() +with open('pipeline.yml', 'w') as f: + f.write(yaml) +print(yaml) + +# Or you can output json +#data = pipline_description.to_json() + diff --git a/tests/build_HPFilter_pipline.py b/tests/build_HPFilter_pipline.py new file mode 100644 index 0000000..80754bf --- /dev/null +++ b/tests/build_HPFilter_pipline.py @@ -0,0 +1,46 @@ +from d3m import index +from d3m.metadata.base import ArgumentType +from d3m.metadata.pipeline import Pipeline, PrimitiveStep + + +# Creating pipeline +pipeline_description = Pipeline() +pipeline_description.add_input(name='inputs') + +# Step 0: dataset_to_dataframe +step_0 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.dataset_to_dataframe.Common')) +step_0.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='inputs.0') +step_0.add_output('produce') +pipeline_description.add_step(step_0) + + +# Step 1: column_parser +step_1 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.column_parser.Common')) +step_1.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce') +step_1.add_output('produce') +pipeline_description.add_step(step_1) + + +# Step 2: HPFilter +step_2 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.tods.feature_analysis.hp_filter')) +step_2.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce') +step_2.add_output('produce') + +step_2.add_hyperparameter(name = 'use_columns', argument_type=ArgumentType.VALUE, data = [2,3,6]) + 
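+# A sketch of how the recurring column-selection hyperparameters are read here, assuming the
+# usual d3m wrapper conventions rather than anything stated in this file: 'use_columns' lists the
+# column indices the primitive operates on, 'use_semantic_types' (set below) makes the primitive
+# consult the DataFrame's column metadata when selecting columns, and 'return_result' decides
+# whether the transformed columns are appended to, replace, or form a new output DataFrame.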
+step_2.add_hyperparameter(name = 'use_semantic_types', argument_type=ArgumentType.VALUE, data = True) +step_2.add_hyperparameter(name = 'return_result', argument_type=ArgumentType.VALUE, data = 'append') +pipeline_description.add_step(step_2) + +# Final Output +pipeline_description.add_output(name='output predictions', data_reference='steps.2.produce') + +# Output to YAML +yaml = pipeline_description.to_yaml() +with open('pipeline.yml', 'w') as f: + f.write(yaml) +print(yaml) + +# Or you can output json +#data = pipline_description.to_json() + diff --git a/tests/build_HoltSmoothing_pipline.py b/tests/build_HoltSmoothing_pipline.py new file mode 100644 index 0000000..dd4e3c5 --- /dev/null +++ b/tests/build_HoltSmoothing_pipline.py @@ -0,0 +1,76 @@ +from d3m import index +from d3m.metadata.base import ArgumentType +from d3m.metadata.pipeline import Pipeline, PrimitiveStep + +# -> dataset_to_dataframe -> column_parser -> extract_columns_by_semantic_types(attributes) -> imputer -> random_forest +# extract_columns_by_semantic_types(targets) -> ^ + +# Creating pipeline +pipeline_description = Pipeline() +pipeline_description.add_input(name='inputs') + +# Step 0: dataset_to_dataframe +primitive_0 = index.get_primitive('d3m.primitives.tods.data_processing.dataset_to_dataframe') +step_0 = PrimitiveStep(primitive=primitive_0) +step_0.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='inputs.0') +step_0.add_output('produce') +pipeline_description.add_step(step_0) + +# Step 1: column_parser +step_1 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.column_parser.Common')) +step_1.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce') +step_1.add_output('produce') +pipeline_description.add_step(step_1) + +# Step 2: extract_columns_by_semantic_types(attributes) +step_2 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common')) +step_2.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce') +step_2.add_output('produce') +step_2.add_hyperparameter(name='semantic_types', argument_type=ArgumentType.VALUE, + data=['https://metadata.datadrivendiscovery.org/types/Attribute']) +pipeline_description.add_step(step_2) + +# Step 3: extract_columns_by_semantic_types(targets) +step_3 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common')) +step_3.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce') +step_3.add_output('produce') +step_3.add_hyperparameter(name='semantic_types', argument_type=ArgumentType.VALUE, + data=['https://metadata.datadrivendiscovery.org/types/TrueTarget']) +pipeline_description.add_step(step_3) + +attributes = 'steps.2.produce' +targets = 'steps.3.produce' + +# Step 4: imputer +step_4 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_cleaning.imputer.SKlearn')) +step_4.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference=attributes) +step_4.add_output('produce') +pipeline_description.add_step(step_4) + +# Step 5: holt smoothing +step_5 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.tods.timeseries_processing.transformation.holt_smoothing')) +step_5.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference=attributes) +step_5.add_hyperparameter(name="exclude_columns", 
argument_type=ArgumentType.VALUE, data = (2, 3)) +step_5.add_hyperparameter(name="use_semantic_types", argument_type=ArgumentType.VALUE, data = True) +step_5.add_output('produce') +pipeline_description.add_step(step_5) + +# Step 6: isolation forest +#step_6 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.anomaly_detection.isolation_forest.Algorithm')) +#step_6.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.5.produce') +#step_6.add_argument(name='outputs', argument_type=ArgumentType.CONTAINER, data_reference=targets) +#step_6.add_output('produce') +#pipeline_description.add_step(step_6) + +# Final Output +pipeline_description.add_output(name='output predictions', data_reference='steps.5.produce') + +# Output to YAML +yaml = pipeline_description.to_yaml() +with open('pipeline.yml', 'w') as f: + f.write(yaml) +print(yaml) + +# Or you can output json +#data = pipline_description.to_json() + diff --git a/tests/build_HoltWintersExponentialSmoothing_pipline.py b/tests/build_HoltWintersExponentialSmoothing_pipline.py new file mode 100644 index 0000000..1234dc7 --- /dev/null +++ b/tests/build_HoltWintersExponentialSmoothing_pipline.py @@ -0,0 +1,76 @@ +from d3m import index +from d3m.metadata.base import ArgumentType +from d3m.metadata.pipeline import Pipeline, PrimitiveStep + +# -> dataset_to_dataframe -> column_parser -> extract_columns_by_semantic_types(attributes) -> imputer -> random_forest +# extract_columns_by_semantic_types(targets) -> ^ + +# Creating pipeline +pipeline_description = Pipeline() +pipeline_description.add_input(name='inputs') + +# Step 0: dataset_to_dataframe +primitive_0 = index.get_primitive('d3m.primitives.tods.data_processing.dataset_to_dataframe') +step_0 = PrimitiveStep(primitive=primitive_0) +step_0.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='inputs.0') +step_0.add_output('produce') +pipeline_description.add_step(step_0) + +# Step 1: column_parser +step_1 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.column_parser.Common')) +step_1.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce') +step_1.add_output('produce') +pipeline_description.add_step(step_1) + +# Step 2: extract_columns_by_semantic_types(attributes) +step_2 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common')) +step_2.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce') +step_2.add_output('produce') +step_2.add_hyperparameter(name='semantic_types', argument_type=ArgumentType.VALUE, + data=['https://metadata.datadrivendiscovery.org/types/Attribute']) +pipeline_description.add_step(step_2) + +# Step 3: extract_columns_by_semantic_types(targets) +step_3 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common')) +step_3.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce') +step_3.add_output('produce') +step_3.add_hyperparameter(name='semantic_types', argument_type=ArgumentType.VALUE, + data=['https://metadata.datadrivendiscovery.org/types/TrueTarget']) +pipeline_description.add_step(step_3) + +attributes = 'steps.2.produce' +targets = 'steps.3.produce' + +# Step 4: imputer +step_4 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_cleaning.imputer.SKlearn')) +step_4.add_argument(name='inputs', 
argument_type=ArgumentType.CONTAINER, data_reference=attributes) +step_4.add_output('produce') +pipeline_description.add_step(step_4) + +# Step 5: holt winters exponential smoothing +step_5 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.tods.timeseries_processing.transformation.holt_winters_exponential_smoothing')) +step_5.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference=attributes) +step_5.add_hyperparameter(name="use_columns", argument_type=ArgumentType.VALUE, data = (2, 3)) +step_5.add_hyperparameter(name="use_semantic_types", argument_type=ArgumentType.VALUE, data = True) +step_5.add_output('produce') +pipeline_description.add_step(step_5) + +# Step 6: isolation forest +#step_6 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.anomaly_detection.isolation_forest.Algorithm')) +#step_6.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.5.produce') +#step_6.add_argument(name='outputs', argument_type=ArgumentType.CONTAINER, data_reference=targets) +#step_6.add_output('produce') +#pipeline_description.add_step(step_6) + +# Final Output +pipeline_description.add_output(name='output predictions', data_reference='steps.5.produce') + +# Output to YAML +yaml = pipeline_description.to_yaml() +with open('pipeline.yml', 'w') as f: + f.write(yaml) +print(yaml) + +# Or you can output json +#data = pipline_description.to_json() + diff --git a/tests/build_IsolationForest_pipline.py b/tests/build_IsolationForest_pipline.py new file mode 100644 index 0000000..965343c --- /dev/null +++ b/tests/build_IsolationForest_pipline.py @@ -0,0 +1,59 @@ +from d3m import index +from d3m.metadata.base import ArgumentType +from d3m.metadata.pipeline import Pipeline, PrimitiveStep +from d3m.metadata import hyperparams +import copy + +# -> dataset_to_dataframe -> column_parser -> extract_columns_by_semantic_types(attributes) -> imputer -> random_forest +# extract_columns_by_semantic_types(targets) -> ^ + +# Creating pipeline +pipeline_description = Pipeline() +pipeline_description.add_input(name='inputs') + +# Step 0: dataset_to_dataframe +primitive_0 = index.get_primitive('d3m.primitives.tods.data_processing.dataset_to_dataframe') +step_0 = PrimitiveStep(primitive=primitive_0) +step_0.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='inputs.0') +step_0.add_output('produce') +pipeline_description.add_step(step_0) + +# # Step 1: column_parser +primitive_1 = index.get_primitive('d3m.primitives.data_transformation.column_parser.Common') +step_1 = PrimitiveStep(primitive=primitive_1) +step_1.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce') +step_1.add_output('produce') +pipeline_description.add_step(step_1) + +# Step 2: extract_columns_by_semantic_types(attributes) +step_2 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common')) +step_2.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce') +step_2.add_output('produce') +step_2.add_hyperparameter(name='semantic_types', argument_type=ArgumentType.VALUE, data=['https://metadata.datadrivendiscovery.org/types/Attribute']) +pipeline_description.add_step(step_2) + +# # Step 3: test primitive +primitive_3 = index.get_primitive('d3m.primitives.tods.detection_algorithm.pyod_iforest') +step_3 = PrimitiveStep(primitive=primitive_3) +step_3.add_hyperparameter(name='contamination', 
argument_type=ArgumentType.VALUE, data=0.1) +# step_3.add_hyperparameter(name='use_semantic_types', argument_type=ArgumentType.VALUE, data=True) +# step_3.add_hyperparameter(name='use_columns', argument_type=ArgumentType.VALUE, data=(2,)) # There is sth wrong with multi-dimensional +step_3.add_hyperparameter(name='return_result', argument_type=ArgumentType.VALUE, data='append') +step_3.add_hyperparameter(name='return_subseq_inds', argument_type=ArgumentType.VALUE, data=True) +step_3.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.2.produce') +step_3.add_output('produce_score') +step_3.add_output('produce') +pipeline_description.add_step(step_3) + +# Final Output +pipeline_description.add_output(name='output predictions', data_reference='steps.3.produce_score') + +# Output to YAML +yaml = pipeline_description.to_yaml() +with open('pipeline.yml', 'w') as f: + f.write(yaml) +print(yaml) + +# Or you can output json +#data = pipline_description.to_json() + diff --git a/tests/build_KDiscord_pipeline.py b/tests/build_KDiscord_pipeline.py new file mode 100644 index 0000000..e66bcb1 --- /dev/null +++ b/tests/build_KDiscord_pipeline.py @@ -0,0 +1,71 @@ +from d3m import index +from d3m.metadata.base import ArgumentType +from d3m.metadata.pipeline import Pipeline, PrimitiveStep +from d3m.metadata import hyperparams +import numpy as np + +# -> dataset_to_dataframe -> column_parser -> extract_columns_by_semantic_types(attributes) -> imputer -> random_forest +# extract_columns_by_semantic_types(targets) -> ^ + +# Creating pipeline +pipeline_description = Pipeline() +pipeline_description.add_input(name='inputs') + +# Step 0: dataset_to_dataframe +primitive_0 = index.get_primitive('d3m.primitives.data_transformation.dataset_to_dataframe.Common') +step_0 = PrimitiveStep(primitive=primitive_0) +step_0.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='inputs.0') +step_0.add_output('produce') +pipeline_description.add_step(step_0) + +# # Step 1: column_parser +primitive_1 = index.get_primitive('d3m.primitives.data_transformation.column_parser.Common') +step_1 = PrimitiveStep(primitive=primitive_1) +step_1.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce') +step_1.add_output('produce') +pipeline_description.add_step(step_1) + +# Step 2: extract_columns_by_semantic_types(attributes) +step_2 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common')) +step_2.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce') +step_2.add_output('produce') +step_2.add_hyperparameter(name='semantic_types', argument_type=ArgumentType.VALUE, data=['https://metadata.datadrivendiscovery.org/types/Attribute']) +pipeline_description.add_step(step_2) + +# # Step 3: Standardization +primitive_3 = index.get_primitive('d3m.primitives.tods.timeseries_processing.transformation.standard_scaler') +step_3 = PrimitiveStep(primitive=primitive_3) +step_3.add_hyperparameter(name='use_semantic_types', argument_type=ArgumentType.VALUE, data=True) +step_3.add_hyperparameter(name='use_columns', argument_type=ArgumentType.VALUE, data=(1,2,3,4,5,)) +step_3.add_hyperparameter(name='return_result', argument_type=ArgumentType.VALUE, data='new') +step_3.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.2.produce') +step_3.add_output('produce') +pipeline_description.add_step(step_3) + +# # 
Step 4: test primitive +primitive_4 = index.get_primitive('d3m.primitives.tods.detection_algorithm.KDiscordODetector') +step_4 = PrimitiveStep(primitive=primitive_4) +step_4.add_hyperparameter(name='contamination', argument_type=ArgumentType.VALUE, data=0.1) +step_4.add_hyperparameter(name='window_size', argument_type=ArgumentType.VALUE, data=10) +# step_4.add_hyperparameter(name='weights', argument_type=ArgumentType.VALUE, data=weights_ndarray) +step_4.add_hyperparameter(name='use_semantic_types', argument_type=ArgumentType.VALUE, data=False) +# step_4.add_hyperparameter(name='use_columns', argument_type=ArgumentType.VALUE, data=(2,3,4,5,6)) # There is sth wrong with multi-dimensional +step_4.add_hyperparameter(name='return_result', argument_type=ArgumentType.VALUE, data='append') +step_4.add_hyperparameter(name='return_subseq_inds', argument_type=ArgumentType.VALUE, data=True) +step_4.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.3.produce') +step_4.add_output('produce') +step_4.add_output('produce_score') +pipeline_description.add_step(step_4) + +# Final Output +pipeline_description.add_output(name='output predictions', data_reference='steps.4.produce') + +# Output to YAML +yaml = pipeline_description.to_yaml() +with open('pipeline.yml', 'w') as f: + f.write(yaml) +print(yaml) + +# Or you can output json +#data = pipline_description.to_json() + diff --git a/tests/build_KNN_pipline.py b/tests/build_KNN_pipline.py new file mode 100644 index 0000000..d188c76 --- /dev/null +++ b/tests/build_KNN_pipline.py @@ -0,0 +1,51 @@ +from d3m import index +from d3m.metadata.base import ArgumentType +from d3m.metadata.pipeline import Pipeline, PrimitiveStep +from d3m.metadata import hyperparams +import copy + +# -> dataset_to_dataframe -> column_parser -> extract_columns_by_semantic_types(attributes) -> imputer -> random_forest +# extract_columns_by_semantic_types(targets) -> ^ + +# Creating pipeline +pipeline_description = Pipeline() +pipeline_description.add_input(name='inputs') + +# Step 0: dataset_to_dataframe +primitive_0 = index.get_primitive('d3m.primitives.tods.data_processing.dataset_to_dataframe') +step_0 = PrimitiveStep(primitive=primitive_0) +step_0.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='inputs.0') +step_0.add_output('produce') +pipeline_description.add_step(step_0) + +# # Step 1: column_parser +primitive_1 = index.get_primitive('d3m.primitives.data_transformation.column_parser.Common') +step_1 = PrimitiveStep(primitive=primitive_1) +step_1.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce') +step_1.add_output('produce') +pipeline_description.add_step(step_1) + +# # Step 2: test primitive +primitive_2 = index.get_primitive('d3m.primitives.tods.detection_algorithm.pyod_knn') + +step_2 = PrimitiveStep(primitive=primitive_2) +step_2.add_hyperparameter(name='contamination', argument_type=ArgumentType.VALUE, data=0.1) +step_2.add_hyperparameter(name='use_semantic_types', argument_type=ArgumentType.VALUE, data=True) +step_2.add_hyperparameter(name='use_columns', argument_type=ArgumentType.VALUE, data=(2,)) # There is sth wrong with multi-dimensional +step_2.add_hyperparameter(name='return_result', argument_type=ArgumentType.VALUE, data='append') +step_2.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce') +step_2.add_output('produce') +pipeline_description.add_step(step_2) + +# Final Output 
+pipeline_description.add_output(name='output predictions', data_reference='steps.2.produce') + +# Output to YAML +yaml = pipeline_description.to_yaml() +with open('pipeline.yml', 'w') as f: + f.write(yaml) +print(yaml) + +# Or you can output json +#data = pipline_description.to_json() + diff --git a/tests/build_LODA_pipline.py b/tests/build_LODA_pipline.py new file mode 100644 index 0000000..daac9f6 --- /dev/null +++ b/tests/build_LODA_pipline.py @@ -0,0 +1,51 @@ +from d3m import index +from d3m.metadata.base import ArgumentType +from d3m.metadata.pipeline import Pipeline, PrimitiveStep +from d3m.metadata import hyperparams +import copy + +# -> dataset_to_dataframe -> column_parser -> extract_columns_by_semantic_types(attributes) -> imputer -> random_forest +# extract_columns_by_semantic_types(targets) -> ^ + +# Creating pipeline +pipeline_description = Pipeline() +pipeline_description.add_input(name='inputs') + +# Step 0: dataset_to_dataframe +primitive_0 = index.get_primitive('d3m.primitives.tods.data_processing.dataset_to_dataframe') +step_0 = PrimitiveStep(primitive=primitive_0) +step_0.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='inputs.0') +step_0.add_output('produce') +pipeline_description.add_step(step_0) + +# # Step 1: column_parser +primitive_1 = index.get_primitive('d3m.primitives.data_transformation.column_parser.Common') +step_1 = PrimitiveStep(primitive=primitive_1) +step_1.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce') +step_1.add_output('produce') +pipeline_description.add_step(step_1) + +# # Step 2: test primitive +primitive_2 = index.get_primitive('d3m.primitives.tods.detection_algorithm.pyod_loda') + +step_2 = PrimitiveStep(primitive=primitive_2) +step_2.add_hyperparameter(name='contamination', argument_type=ArgumentType.VALUE, data=0.1) +step_2.add_hyperparameter(name='use_semantic_types', argument_type=ArgumentType.VALUE, data=True) +step_2.add_hyperparameter(name='use_columns', argument_type=ArgumentType.VALUE, data=(2,)) # There is sth wrong with multi-dimensional +step_2.add_hyperparameter(name='return_result', argument_type=ArgumentType.VALUE, data='append') +step_2.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce') +step_2.add_output('produce') +pipeline_description.add_step(step_2) + +# Final Output +pipeline_description.add_output(name='output predictions', data_reference='steps.2.produce') + +# Output to YAML +yaml = pipeline_description.to_yaml() +with open('pipeline.yml', 'w') as f: + f.write(yaml) +print(yaml) + +# Or you can output json +#data = pipline_description.to_json() + diff --git a/tests/build_LOF_pipline.py b/tests/build_LOF_pipline.py new file mode 100644 index 0000000..0c82d13 --- /dev/null +++ b/tests/build_LOF_pipline.py @@ -0,0 +1,51 @@ +from d3m import index +from d3m.metadata.base import ArgumentType +from d3m.metadata.pipeline import Pipeline, PrimitiveStep +from d3m.metadata import hyperparams +import copy + +# -> dataset_to_dataframe -> column_parser -> extract_columns_by_semantic_types(attributes) -> imputer -> random_forest +# extract_columns_by_semantic_types(targets) -> ^ + +# Creating pipeline +pipeline_description = Pipeline() +pipeline_description.add_input(name='inputs') + +# Step 0: dataset_to_dataframe +primitive_0 = index.get_primitive('d3m.primitives.tods.data_processing.dataset_to_dataframe') +step_0 = PrimitiveStep(primitive=primitive_0) +step_0.add_argument(name='inputs', 
argument_type=ArgumentType.CONTAINER, data_reference='inputs.0') +step_0.add_output('produce') +pipeline_description.add_step(step_0) + +# # Step 1: column_parser +primitive_1 = index.get_primitive('d3m.primitives.data_transformation.column_parser.Common') +step_1 = PrimitiveStep(primitive=primitive_1) +step_1.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce') +step_1.add_output('produce') +pipeline_description.add_step(step_1) + +# # Step 2: test primitive +primitive_2 = index.get_primitive('d3m.primitives.tods.detection_algorithm.pyod_lof') + +step_2 = PrimitiveStep(primitive=primitive_2) +step_2.add_hyperparameter(name='contamination', argument_type=ArgumentType.VALUE, data=0.1) +step_2.add_hyperparameter(name='use_semantic_types', argument_type=ArgumentType.VALUE, data=True) +step_2.add_hyperparameter(name='use_columns', argument_type=ArgumentType.VALUE, data=(2,)) # There is sth wrong with multi-dimensional +step_2.add_hyperparameter(name='return_result', argument_type=ArgumentType.VALUE, data='append') +step_2.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce') +step_2.add_output('produce') +pipeline_description.add_step(step_2) + +# Final Output +pipeline_description.add_output(name='output predictions', data_reference='steps.2.produce') + +# Output to YAML +yaml = pipeline_description.to_yaml() +with open('pipeline.yml', 'w') as f: + f.write(yaml) +print(yaml) + +# Or you can output json +#data = pipline_description.to_json() + diff --git a/tests/build_LSTMOD_pipline.py b/tests/build_LSTMOD_pipline.py new file mode 100644 index 0000000..0e9a54b --- /dev/null +++ b/tests/build_LSTMOD_pipline.py @@ -0,0 +1,70 @@ +from d3m import index +from d3m.metadata.base import ArgumentType +from d3m.metadata.pipeline import Pipeline, PrimitiveStep +from d3m.metadata import hyperparams +import numpy as np + +# -> dataset_to_dataframe -> column_parser -> extract_columns_by_semantic_types(attributes) -> imputer -> random_forest +# extract_columns_by_semantic_types(targets) -> ^ + +# Creating pipeline +pipeline_description = Pipeline() +pipeline_description.add_input(name='inputs') + +# Step 0: dataset_to_dataframe +primitive_0 = index.get_primitive('d3m.primitives.tods.data_processing.dataset_to_dataframe') +step_0 = PrimitiveStep(primitive=primitive_0) +step_0.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='inputs.0') +step_0.add_output('produce') +pipeline_description.add_step(step_0) + +# # Step 1: column_parser +primitive_1 = index.get_primitive('d3m.primitives.data_transformation.column_parser.Common') +step_1 = PrimitiveStep(primitive=primitive_1) +step_1.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce') +step_1.add_output('produce') +pipeline_description.add_step(step_1) + +# Step 2: extract_columns_by_semantic_types(attributes) +step_2 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common')) +step_2.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce') +step_2.add_output('produce') +step_2.add_hyperparameter(name='semantic_types', argument_type=ArgumentType.VALUE, data=['https://metadata.datadrivendiscovery.org/types/Attribute']) +pipeline_description.add_step(step_2) + +# # Step 2: Standardization +primitive_3 = 
index.get_primitive('d3m.primitives.tods.timeseries_processing.transformation.standard_scaler') +step_3 = PrimitiveStep(primitive=primitive_3) +step_3.add_hyperparameter(name='use_semantic_types', argument_type=ArgumentType.VALUE, data=True) +step_3.add_hyperparameter(name='use_columns', argument_type=ArgumentType.VALUE, data=(1,2,3,4,5,)) +step_3.add_hyperparameter(name='return_result', argument_type=ArgumentType.VALUE, data='new') +step_3.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.2.produce') +step_3.add_output('produce') +pipeline_description.add_step(step_3) + +# # Step 3: test primitive +primitive_4 = index.get_primitive('d3m.primitives.tods.detection_algorithm.LSTMODetector') +step_4 = PrimitiveStep(primitive=primitive_4) +step_4.add_hyperparameter(name='contamination', argument_type=ArgumentType.VALUE, data=0.1) +step_4.add_hyperparameter(name='diff_group_method', argument_type=ArgumentType.VALUE, data='average') +step_4.add_hyperparameter(name='feature_dim', argument_type=ArgumentType.VALUE, data=5) +step_4.add_hyperparameter(name='use_semantic_types', argument_type=ArgumentType.VALUE, data=False) +# step_4.add_hyperparameter(name='use_columns', argument_type=ArgumentType.VALUE, data=(2,3,4,5,6)) # There is sth wrong with multi-dimensional +step_4.add_hyperparameter(name='return_result', argument_type=ArgumentType.VALUE, data='append') +step_4.add_hyperparameter(name='return_subseq_inds', argument_type=ArgumentType.VALUE, data=True) +step_4.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.3.produce') +step_4.add_output('produce') +pipeline_description.add_step(step_4) + +# Final Output +pipeline_description.add_output(name='output predictions', data_reference='steps.4.produce') + +# Output to YAML +yaml = pipeline_description.to_yaml() +with open('pipeline.yml', 'w') as f: + f.write(yaml) +print(yaml) + +# Or you can output json +#data = pipline_description.to_json() + diff --git a/tests/build_MatrixProfile_pipeline.py b/tests/build_MatrixProfile_pipeline.py new file mode 100644 index 0000000..41ceeaa --- /dev/null +++ b/tests/build_MatrixProfile_pipeline.py @@ -0,0 +1,49 @@ +from d3m import index +from d3m.metadata.base import ArgumentType +from d3m.metadata.pipeline import Pipeline, PrimitiveStep +from d3m.metadata import hyperparams + +# -> dataset_to_dataframe -> column_parser -> extract_columns_by_semantic_types(attributes) -> imputer -> random_forest +# extract_columns_by_semantic_types(targets) -> ^ + +# Creating pipeline +pipeline_description = Pipeline() +pipeline_description.add_input(name='inputs') + +# Step 0: dataset_to_dataframe +primitive_0 = index.get_primitive('d3m.primitives.tods.data_processing.dataset_to_dataframe') +step_0 = PrimitiveStep(primitive=primitive_0) +step_0.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='inputs.0') +step_0.add_output('produce') +pipeline_description.add_step(step_0) + +# # Step 1: column_parser +primitive_1 = index.get_primitive('d3m.primitives.data_transformation.column_parser.Common') +step_1 = PrimitiveStep(primitive=primitive_1) +step_1.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce') +step_1.add_output('produce') +pipeline_description.add_step(step_1) + +# # Step 2: test primitive +primitive_2 = index.get_primitive('d3m.primitives.tods.detection_algorithm.matrix_profile') + +step_2 = PrimitiveStep(primitive=primitive_2) 
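+# Rough gloss on the hyperparameters set below (summarising the usual TODS
+# primitive interface, not a precise spec):
+#   window_size   - subsequence length used to build the matrix profile
+#   use_columns   - positional indices of the value columns to analyse
+#   return_result - 'append' keeps the input columns and appends the scores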
+step_2.add_hyperparameter(name='use_semantic_types', argument_type=ArgumentType.VALUE, data=True)
+step_2.add_hyperparameter(name='use_columns', argument_type=ArgumentType.VALUE, data=(2,3,4)) # multi-dimensional input is not handled correctly yet
+step_2.add_hyperparameter(name='window_size', argument_type=ArgumentType.VALUE, data=3)
+step_2.add_hyperparameter(name='return_result', argument_type=ArgumentType.VALUE, data='append')
+step_2.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce')
+step_2.add_output('produce')
+pipeline_description.add_step(step_2)
+
+# Final Output
+pipeline_description.add_output(name='output predictions', data_reference='steps.2.produce')
+
+# Output to YAML
+yaml = pipeline_description.to_yaml()
+with open('pipeline.yml', 'w') as f:
+    f.write(yaml)
+print(yaml)
+
+# Or you can output json
+#data = pipeline_description.to_json()
diff --git a/tests/build_MeanAverageTransform_pipline.py b/tests/build_MeanAverageTransform_pipline.py
new file mode 100644
index 0000000..d14ffad
--- /dev/null
+++ b/tests/build_MeanAverageTransform_pipline.py
@@ -0,0 +1,77 @@
+from d3m import index
+from d3m.metadata.base import ArgumentType
+from d3m.metadata.pipeline import Pipeline, PrimitiveStep
+
+# -> dataset_to_dataframe -> column_parser -> extract_columns_by_semantic_types(attributes) -> imputer -> moving_average_transform
+#    extract_columns_by_semantic_types(targets) -> ^
+
+# Creating pipeline
+pipeline_description = Pipeline()
+pipeline_description.add_input(name='inputs')
+
+# Step 0: dataset_to_dataframe
+primitive_0 = index.get_primitive('d3m.primitives.tods.data_processing.dataset_to_dataframe')
+step_0 = PrimitiveStep(primitive=primitive_0)
+step_0.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='inputs.0')
+step_0.add_output('produce')
+pipeline_description.add_step(step_0)
+
+# Step 1: column_parser
+step_1 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.column_parser.Common'))
+step_1.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce')
+step_1.add_output('produce')
+pipeline_description.add_step(step_1)
+
+# Step 2: extract_columns_by_semantic_types(attributes)
+step_2 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common'))
+step_2.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce')
+step_2.add_output('produce')
+step_2.add_hyperparameter(name='semantic_types', argument_type=ArgumentType.VALUE,
+                          data=['https://metadata.datadrivendiscovery.org/types/Attribute'])
+pipeline_description.add_step(step_2)
+
+# Step 3: extract_columns_by_semantic_types(targets)
+step_3 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common'))
+step_3.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce')
+step_3.add_output('produce')
+step_3.add_hyperparameter(name='semantic_types', argument_type=ArgumentType.VALUE,
+                          data=['https://metadata.datadrivendiscovery.org/types/TrueTarget'])
+pipeline_description.add_step(step_3)
+
+attributes = 'steps.2.produce'
+targets = 'steps.3.produce'
+
+# Step 4: imputer
+step_4 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_cleaning.imputer.SKlearn'))
+step_4.add_argument(name='inputs',
argument_type=ArgumentType.CONTAINER, data_reference=attributes) +step_4.add_output('produce') +pipeline_description.add_step(step_4) + +# Step 5: mean average transform +step_5 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.tods.timeseries_processing.transformation.moving_average_transform')) +step_5.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference=attributes) +step_5.add_hyperparameter(name="use_columns", argument_type=ArgumentType.VALUE, data = (2, 3)) +step_5.add_hyperparameter(name="use_semantic_types", argument_type=ArgumentType.VALUE, data = True) +step_5.add_output('produce') +pipeline_description.add_step(step_5) + +# Step 6: isolation forest +#step_6 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.anomaly_detection.isolation_forest.Algorithm')) +#step_6.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.5.produce') +#step_6.add_argument(name='outputs', argument_type=ArgumentType.CONTAINER, data_reference=targets) +#step_6.add_output('produce') +#pipeline_description.add_step(step_6) + +# Final Output +pipeline_description.add_output(name='output predictions', data_reference='steps.5.produce') + +# Output to YAML +yaml = pipeline_description.to_yaml() +with open('pipeline.yml', 'w') as f: + f.write(yaml) +print(yaml) + +# Or you can output json +#data = pipline_description.to_json() + diff --git a/tests/build_NonNegativeMatrixFactorization.py b/tests/build_NonNegativeMatrixFactorization.py new file mode 100644 index 0000000..bdfcd2f --- /dev/null +++ b/tests/build_NonNegativeMatrixFactorization.py @@ -0,0 +1,50 @@ +from d3m import index +from d3m.metadata.base import ArgumentType +from d3m.metadata.pipeline import Pipeline, PrimitiveStep + +# -> dataset_to_dataframe -> column_parser -> extract_columns_by_semantic_types(attributes) -> imputer -> random_forest +# extract_columns_by_semantic_types(targets) -> ^ + +# Creating pipeline +pipeline_description = Pipeline() +pipeline_description.add_input(name='inputs') + + +# Step 0: dataset_to_dataframe +primitive_0 = index.get_primitive('d3m.primitives.tods.data_processing.dataset_to_dataframe') +step_0 = PrimitiveStep(primitive=primitive_0) +step_0.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='inputs.0') +step_0.add_output('produce') +pipeline_description.add_step(step_0) + +# Step 1: Column Parser +primitive_1 = index.get_primitive('d3m.primitives.data_transformation.column_parser.Common') +step_1 = PrimitiveStep(primitive=primitive_1) +step_1.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce') +step_1.add_output('produce') +pipeline_description.add_step(step_1) + +# Step 2: Non Negative Matrix Factorization +primitive_2 = index.get_primitive('d3m.primitives.tods.feature_analysis.non_negative_matrix_factorization') +step_2 = PrimitiveStep(primitive=primitive_2) +step_2.add_hyperparameter(name='use_semantic_types', argument_type=ArgumentType.VALUE, data=True) +step_2.add_hyperparameter(name='use_columns', argument_type=ArgumentType.VALUE, data=(2,)) +step_2.add_hyperparameter(name='return_result', argument_type=ArgumentType.VALUE, data='append') +step_2.add_hyperparameter(name='rank', argument_type=ArgumentType.VALUE, data=5) +step_2.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce') +step_2.add_output('produce') +pipeline_description.add_step(step_2) + + +# Final Output 
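+# 'steps.5.produce' refers to the produce output of step 5 above (the
+# moving-average transform), so the smoothed attribute columns are what this
+# pipeline ultimately emits.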
+pipeline_description.add_output(name='output predictions', data_reference='steps.2.produce') + +# Output to YAML +yaml = pipeline_description.to_yaml() +with open('pipeline.yml', 'w') as f: + f.write(yaml) +print(yaml) + +# Or you can output json +#data = pipline_description.to_json() + diff --git a/tests/build_OCSVM_pipline.py b/tests/build_OCSVM_pipline.py new file mode 100644 index 0000000..640fb9b --- /dev/null +++ b/tests/build_OCSVM_pipline.py @@ -0,0 +1,51 @@ +from d3m import index +from d3m.metadata.base import ArgumentType +from d3m.metadata.pipeline import Pipeline, PrimitiveStep +from d3m.metadata import hyperparams +import copy + +# -> dataset_to_dataframe -> column_parser -> extract_columns_by_semantic_types(attributes) -> imputer -> random_forest +# extract_columns_by_semantic_types(targets) -> ^ + +# Creating pipeline +pipeline_description = Pipeline() +pipeline_description.add_input(name='inputs') + +# Step 0: dataset_to_dataframe +primitive_0 = index.get_primitive('d3m.primitives.tods.data_processing.dataset_to_dataframe') +step_0 = PrimitiveStep(primitive=primitive_0) +step_0.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='inputs.0') +step_0.add_output('produce') +pipeline_description.add_step(step_0) + +# # Step 1: column_parser +primitive_1 = index.get_primitive('d3m.primitives.data_transformation.column_parser.Common') +step_1 = PrimitiveStep(primitive=primitive_1) +step_1.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce') +step_1.add_output('produce') +pipeline_description.add_step(step_1) + +# # Step 2: test primitive +primitive_2 = index.get_primitive('d3m.primitives.tods.detection_algorithm.pyod_ocsvm') + +step_2 = PrimitiveStep(primitive=primitive_2) +step_2.add_hyperparameter(name='contamination', argument_type=ArgumentType.VALUE, data=0.1) +step_2.add_hyperparameter(name='use_semantic_types', argument_type=ArgumentType.VALUE, data=True) +step_2.add_hyperparameter(name='use_columns', argument_type=ArgumentType.VALUE, data=(2,)) # There is sth wrong with multi-dimensional +step_2.add_hyperparameter(name='return_result', argument_type=ArgumentType.VALUE, data='append') +step_2.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce') +step_2.add_output('produce') +pipeline_description.add_step(step_2) + +# Final Output +pipeline_description.add_output(name='output predictions', data_reference='steps.2.produce') + +# Output to YAML +yaml = pipeline_description.to_yaml() +with open('pipeline.yml', 'w') as f: + f.write(yaml) +print(yaml) + +# Or you can output json +#data = pipline_description.to_json() + diff --git a/tests/build_PCAODetect_pipeline.py b/tests/build_PCAODetect_pipeline.py new file mode 100644 index 0000000..1e93027 --- /dev/null +++ b/tests/build_PCAODetect_pipeline.py @@ -0,0 +1,71 @@ +from d3m import index +from d3m.metadata.base import ArgumentType +from d3m.metadata.pipeline import Pipeline, PrimitiveStep +from d3m.metadata import hyperparams +import numpy as np + +# -> dataset_to_dataframe -> column_parser -> extract_columns_by_semantic_types(attributes) -> imputer -> random_forest +# extract_columns_by_semantic_types(targets) -> ^ + +# Creating pipeline +pipeline_description = Pipeline() +pipeline_description.add_input(name='inputs') + +# Step 0: dataset_to_dataframe +primitive_0 = index.get_primitive('d3m.primitives.data_transformation.dataset_to_dataframe.Common') +step_0 = PrimitiveStep(primitive=primitive_0) 
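+# Note: unlike most of the other test pipelines, this script uses the common
+# dataset_to_dataframe primitive rather than the tods variant
+# (d3m.primitives.tods.data_processing.dataset_to_dataframe); they are assumed
+# to be interchangeable here.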
+step_0.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='inputs.0') +step_0.add_output('produce') +pipeline_description.add_step(step_0) + +# # Step 1: column_parser +primitive_1 = index.get_primitive('d3m.primitives.data_transformation.column_parser.Common') +step_1 = PrimitiveStep(primitive=primitive_1) +step_1.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce') +step_1.add_output('produce') +pipeline_description.add_step(step_1) + +# Step 2: extract_columns_by_semantic_types(attributes) +step_2 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common')) +step_2.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce') +step_2.add_output('produce') +step_2.add_hyperparameter(name='semantic_types', argument_type=ArgumentType.VALUE, data=['https://metadata.datadrivendiscovery.org/types/Attribute']) +pipeline_description.add_step(step_2) + +# # Step 3: Standardization +primitive_3 = index.get_primitive('d3m.primitives.tods.timeseries_processing.transformation.standard_scaler') +step_3 = PrimitiveStep(primitive=primitive_3) +step_3.add_hyperparameter(name='use_semantic_types', argument_type=ArgumentType.VALUE, data=True) +step_3.add_hyperparameter(name='use_columns', argument_type=ArgumentType.VALUE, data=(1,2,3,4,5,)) +step_3.add_hyperparameter(name='return_result', argument_type=ArgumentType.VALUE, data='new') +step_3.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.2.produce') +step_3.add_output('produce') +pipeline_description.add_step(step_3) + +# # Step 4: test primitive +primitive_4 = index.get_primitive('d3m.primitives.tods.detection_algorithm.PCAODetector') +step_4 = PrimitiveStep(primitive=primitive_4) +step_4.add_hyperparameter(name='contamination', argument_type=ArgumentType.VALUE, data=0.1) +step_4.add_hyperparameter(name='window_size', argument_type=ArgumentType.VALUE, data=10) +# step_4.add_hyperparameter(name='weights', argument_type=ArgumentType.VALUE, data=weights_ndarray) +step_4.add_hyperparameter(name='use_semantic_types', argument_type=ArgumentType.VALUE, data=False) +# step_4.add_hyperparameter(name='use_columns', argument_type=ArgumentType.VALUE, data=(2,3,4,5,6)) # There is sth wrong with multi-dimensional +step_4.add_hyperparameter(name='return_result', argument_type=ArgumentType.VALUE, data='append') +step_4.add_hyperparameter(name='return_subseq_inds', argument_type=ArgumentType.VALUE, data=True) +step_4.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.3.produce') +step_4.add_output('produce') +step_4.add_output('produce_score') +pipeline_description.add_step(step_4) + +# Final Output +pipeline_description.add_output(name='output predictions', data_reference='steps.4.produce') + +# Output to YAML +yaml = pipeline_description.to_yaml() +with open('pipeline.yml', 'w') as f: + f.write(yaml) +print(yaml) + +# Or you can output json +#data = pipline_description.to_json() + diff --git a/tests/build_PowerTransform_pipline.py b/tests/build_PowerTransform_pipline.py new file mode 100644 index 0000000..94ecfbd --- /dev/null +++ b/tests/build_PowerTransform_pipline.py @@ -0,0 +1,49 @@ +from d3m import index +from d3m.metadata.base import ArgumentType +from d3m.metadata.pipeline import Pipeline, PrimitiveStep +from d3m.metadata import hyperparams +import copy + +# -> dataset_to_dataframe -> column_parser -> 
extract_columns_by_semantic_types(attributes) -> imputer -> random_forest +# extract_columns_by_semantic_types(targets) -> ^ + +# Creating pipeline +pipeline_description = Pipeline() +pipeline_description.add_input(name='inputs') + +# Step 0: dataset_to_dataframe +primitive_0 = index.get_primitive('d3m.primitives.tods.data_processing.dataset_to_dataframe') +step_0 = PrimitiveStep(primitive=primitive_0) +step_0.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='inputs.0') +step_0.add_output('produce') +pipeline_description.add_step(step_0) + +# # Step 1: column_parser +primitive_1 = index.get_primitive('d3m.primitives.data_transformation.column_parser.Common') +step_1 = PrimitiveStep(primitive=primitive_1) +step_1.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce') +step_1.add_output('produce') +pipeline_description.add_step(step_1) + +# # Step 2: test primitive +primitive_2 = index.get_primitive('d3m.primitives.tods.timeseries_processing.transformation.power_transformer') +step_2 = PrimitiveStep(primitive=primitive_2) +step_2.add_hyperparameter(name='use_semantic_types', argument_type=ArgumentType.VALUE, data=True) +step_2.add_hyperparameter(name='use_columns', argument_type=ArgumentType.VALUE, data=(2,)) +step_2.add_hyperparameter(name='return_result', argument_type=ArgumentType.VALUE, data='append') +step_2.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce') +step_2.add_output('produce') +pipeline_description.add_step(step_2) + +# Final Output +pipeline_description.add_output(name='output predictions', data_reference='steps.2.produce') + +# Output to YAML +yaml = pipeline_description.to_yaml() +with open('pipeline.yml', 'w') as f: + f.write(yaml) +print(yaml) + +# Or you can output json +#data = pipline_description.to_json() + diff --git a/tests/build_PyodCOF.py b/tests/build_PyodCOF.py new file mode 100644 index 0000000..7128e6d --- /dev/null +++ b/tests/build_PyodCOF.py @@ -0,0 +1,51 @@ +from d3m import index +from d3m.metadata.base import ArgumentType +from d3m.metadata.pipeline import Pipeline, PrimitiveStep +from d3m.metadata import hyperparams +import copy + +# -> dataset_to_dataframe -> column_parser -> extract_columns_by_semantic_types(attributes) -> imputer -> random_forest +# extract_columns_by_semantic_types(targets) -> ^ + +# Creating pipeline +pipeline_description = Pipeline() +pipeline_description.add_input(name='inputs') + +# Step 0: dataset_to_dataframe +primitive_0 = index.get_primitive('d3m.primitives.tods.data_processing.dataset_to_dataframe') +step_0 = PrimitiveStep(primitive=primitive_0) +step_0.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='inputs.0') +step_0.add_output('produce') +pipeline_description.add_step(step_0) + +# # Step 1: column_parser +primitive_1 = index.get_primitive('d3m.primitives.data_transformation.column_parser.Common') +step_1 = PrimitiveStep(primitive=primitive_1) +step_1.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce') +step_1.add_output('produce') +pipeline_description.add_step(step_1) + +# # Step 2: test primitive +primitive_2 = index.get_primitive('d3m.primitives.tods.detection_algorithm.pyod_cof') + +step_2 = PrimitiveStep(primitive=primitive_2) +step_2.add_hyperparameter(name='contamination', argument_type=ArgumentType.VALUE, data=0.1) +step_2.add_hyperparameter(name='use_semantic_types', argument_type=ArgumentType.VALUE, 
data=True) +step_2.add_hyperparameter(name='use_columns', argument_type=ArgumentType.VALUE, data=(2,3,4)) # There is sth wrong with multi-dimensional +step_2.add_hyperparameter(name='return_result', argument_type=ArgumentType.VALUE, data='append') +step_2.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce') +step_2.add_output('produce') +pipeline_description.add_step(step_2) + +# Final Output +pipeline_description.add_output(name='output predictions', data_reference='steps.2.produce') + +# Output to YAML +yaml = pipeline_description.to_yaml() +with open('pipeline.yml', 'w') as f: + f.write(yaml) +print(yaml) + +# Or you can output json +#data = pipline_description.to_json() + diff --git a/tests/build_QuantileTransform_pipline.py b/tests/build_QuantileTransform_pipline.py new file mode 100644 index 0000000..28c4911 --- /dev/null +++ b/tests/build_QuantileTransform_pipline.py @@ -0,0 +1,49 @@ +from d3m import index +from d3m.metadata.base import ArgumentType +from d3m.metadata.pipeline import Pipeline, PrimitiveStep +from d3m.metadata import hyperparams +import copy + +# -> dataset_to_dataframe -> column_parser -> extract_columns_by_semantic_types(attributes) -> imputer -> random_forest +# extract_columns_by_semantic_types(targets) -> ^ + +# Creating pipeline +pipeline_description = Pipeline() +pipeline_description.add_input(name='inputs') + +# Step 0: dataset_to_dataframe +primitive_0 = index.get_primitive('d3m.primitives.tods.data_processing.dataset_to_dataframe') +step_0 = PrimitiveStep(primitive=primitive_0) +step_0.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='inputs.0') +step_0.add_output('produce') +pipeline_description.add_step(step_0) + +# # Step 1: column_parser +primitive_1 = index.get_primitive('d3m.primitives.data_transformation.column_parser.Common') +step_1 = PrimitiveStep(primitive=primitive_1) +step_1.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce') +step_1.add_output('produce') +pipeline_description.add_step(step_1) + +# # Step 2: test primitive +primitive_2 = index.get_primitive('d3m.primitives.tods.timeseries_processing.transformation.quantile_transformer') +step_2 = PrimitiveStep(primitive=primitive_2) +step_2.add_hyperparameter(name='use_semantic_types', argument_type=ArgumentType.VALUE, data=True) +step_2.add_hyperparameter(name='use_columns', argument_type=ArgumentType.VALUE, data=(2,)) +step_2.add_hyperparameter(name='return_result', argument_type=ArgumentType.VALUE, data='append') +step_2.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce') +step_2.add_output('produce') +pipeline_description.add_step(step_2) + +# Final Output +pipeline_description.add_output(name='output predictions', data_reference='steps.2.produce') + +# Output to YAML +yaml = pipeline_description.to_yaml() +with open('pipeline.yml', 'w') as f: + f.write(yaml) +print(yaml) + +# Or you can output json +#data = pipline_description.to_json() + diff --git a/tests/build_RuleBasedFilter_pipline.py b/tests/build_RuleBasedFilter_pipline.py new file mode 100644 index 0000000..013af40 --- /dev/null +++ b/tests/build_RuleBasedFilter_pipline.py @@ -0,0 +1,54 @@ +from d3m import index +from d3m.metadata.base import ArgumentType +from d3m.metadata.pipeline import Pipeline, PrimitiveStep + + +# Creating pipeline +pipeline_description = Pipeline() +pipeline_description.add_input(name='inputs') + +# Step 0: dataset_to_dataframe +step_0 
= PrimitiveStep(primitive=index.get_primitive('d3m.primitives.tods.data_processing.dataset_to_dataframe'))
+step_0.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='inputs.0')
+step_0.add_output('produce')
+pipeline_description.add_step(step_0)
+
+# Step 1: column_parser
+step_1 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.column_parser.Common'))
+step_1.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce')
+step_1.add_output('produce')
+pipeline_description.add_step(step_1)
+
+# Step 2: extract_columns_by_semantic_types(attributes)
+step_2 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common'))
+step_2.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce')
+step_2.add_output('produce')
+step_2.add_hyperparameter(name='semantic_types', argument_type=ArgumentType.VALUE, data=['https://metadata.datadrivendiscovery.org/types/Attribute'])
+pipeline_description.add_step(step_2)
+
+# Step 3: rule-based filter
+step_3 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.tods.reinforcement.rule_filter'))
+step_3.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.2.produce')
+step_3.add_output('produce')
+
+# The rule is a boolean expression over the selected columns; here '#4#' and
+# '#2#' stand for the values in columns 4 and 2 respectively.
+step_3.add_hyperparameter(name='use_columns', argument_type=ArgumentType.VALUE, data=(2, 4,))
+step_3.add_hyperparameter(name='rule', argument_type=ArgumentType.VALUE, data='#4# % 2 == 0 and #2# <= 0.3')
+step_3.add_hyperparameter(name='return_result', argument_type=ArgumentType.VALUE, data='append')
+pipeline_description.add_step(step_3)
+
+# Final Output
+pipeline_description.add_output(name='output predictions', data_reference='steps.3.produce')
+
+# Output to YAML
+yaml = pipeline_description.to_yaml()
+with open('pipeline.yml', 'w') as f:
+    f.write(yaml)
+print(yaml)
+
+# Or you can output json
+#data = pipeline_description.to_json()
+
diff --git a/tests/build_SOD_pipeline.py b/tests/build_SOD_pipeline.py
new file mode 100644
index 0000000..8858cee
--- /dev/null
+++ b/tests/build_SOD_pipeline.py
@@ -0,0 +1,49 @@
+from d3m import index
+from d3m.metadata.base import ArgumentType
+from d3m.metadata.pipeline import Pipeline, PrimitiveStep
+from d3m.metadata import hyperparams
+
+# -> dataset_to_dataframe -> column_parser -> pyod_sod (Subspace Outlier Detection)
+
+# Creating pipeline
+pipeline_description = Pipeline()
+pipeline_description.add_input(name='inputs')
+
+# Step 0: dataset_to_dataframe
+primitive_0 = index.get_primitive('d3m.primitives.tods.data_processing.dataset_to_dataframe')
+step_0 = PrimitiveStep(primitive=primitive_0)
+step_0.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='inputs.0')
+step_0.add_output('produce')
+pipeline_description.add_step(step_0)
+
+# Step 1: column_parser
+primitive_1 = index.get_primitive('d3m.primitives.data_transformation.column_parser.Common')
+step_1 = PrimitiveStep(primitive=primitive_1)
+step_1.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce')
+step_1.add_output('produce')
+pipeline_description.add_step(step_1)
+
+# Step 2: SOD detector (pyod_sod)
+primitive_2 = index.get_primitive('d3m.primitives.tods.detection_algorithm.pyod_sod')
+
+step_2 = PrimitiveStep(primitive=primitive_2)
+step_2.add_hyperparameter(name='contamination',
argument_type=ArgumentType.VALUE, data=0.1) +step_2.add_hyperparameter(name='use_semantic_types', argument_type=ArgumentType.VALUE, data=True) +step_2.add_hyperparameter(name='use_columns', argument_type=ArgumentType.VALUE, data=(2,3,4)) # There is sth wrong with multi-dimensional +step_2.add_hyperparameter(name='return_result', argument_type=ArgumentType.VALUE, data='append') +step_2.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce') +step_2.add_output('produce') +pipeline_description.add_step(step_2) + +# # Final Output +pipeline_description.add_output(name='output predictions', data_reference='steps.2.produce') + +# # Output to YAML +yaml = pipeline_description.to_yaml() +with open('pipeline.yml', 'w') as f: + f.write(yaml) +print(yaml) + +# Or you can output json +#data = pipline_description.to_json() diff --git a/tests/build_SimpleExponentialSmoothing_pipline.py b/tests/build_SimpleExponentialSmoothing_pipline.py new file mode 100644 index 0000000..a4176a2 --- /dev/null +++ b/tests/build_SimpleExponentialSmoothing_pipline.py @@ -0,0 +1,76 @@ +from d3m import index +from d3m.metadata.base import ArgumentType +from d3m.metadata.pipeline import Pipeline, PrimitiveStep + +# -> dataset_to_dataframe -> column_parser -> extract_columns_by_semantic_types(attributes) -> imputer -> random_forest +# extract_columns_by_semantic_types(targets) -> ^ + +# Creating pipeline +pipeline_description = Pipeline() +pipeline_description.add_input(name='inputs') + +# Step 0: dataset_to_dataframe +primitive_0 = index.get_primitive('d3m.primitives.tods.data_processing.dataset_to_dataframe') +step_0 = PrimitiveStep(primitive=primitive_0) +step_0.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='inputs.0') +step_0.add_output('produce') +pipeline_description.add_step(step_0) + +# Step 1: column_parser +step_1 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.column_parser.Common')) +step_1.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce') +step_1.add_output('produce') +pipeline_description.add_step(step_1) + +# Step 2: extract_columns_by_semantic_types(attributes) +step_2 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common')) +step_2.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce') +step_2.add_output('produce') +step_2.add_hyperparameter(name='semantic_types', argument_type=ArgumentType.VALUE, + data=['https://metadata.datadrivendiscovery.org/types/Attribute']) +pipeline_description.add_step(step_2) + +# Step 3: extract_columns_by_semantic_types(targets) +step_3 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common')) +step_3.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce') +step_3.add_output('produce') +step_3.add_hyperparameter(name='semantic_types', argument_type=ArgumentType.VALUE, + data=['https://metadata.datadrivendiscovery.org/types/TrueTarget']) +pipeline_description.add_step(step_3) + +attributes = 'steps.2.produce' +targets = 'steps.3.produce' + +# Step 4: imputer +step_4 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_cleaning.imputer.SKlearn')) +step_4.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference=attributes) +step_4.add_output('produce') 
+pipeline_description.add_step(step_4) + +# Step 5: simple exponential smoothing +step_5 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.tods.timeseries_processing.transformation.simple_exponential_smoothing')) +step_5.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference=attributes) +step_5.add_hyperparameter(name="use_columns", argument_type=ArgumentType.VALUE, data = (1,)) +step_5.add_hyperparameter(name="use_semantic_types", argument_type=ArgumentType.VALUE, data = True) +step_5.add_output('produce') +pipeline_description.add_step(step_5) + +# Step 6: isolation forest +#step_6 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.anomaly_detection.isolation_forest.Algorithm')) +#step_6.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.5.produce') +#step_6.add_argument(name='outputs', argument_type=ArgumentType.CONTAINER, data_reference=targets) +#step_6.add_output('produce') +#pipeline_description.add_step(step_6) + +# Final Output +pipeline_description.add_output(name='output predictions', data_reference='steps.5.produce') + +# Output to YAML +yaml = pipeline_description.to_yaml() +with open('pipeline.yml', 'w') as f: + f.write(yaml) +print(yaml) + +# Or you can output json +#data = pipline_description.to_json() + diff --git a/tests/build_Standardize_pipline.py b/tests/build_Standardize_pipline.py new file mode 100644 index 0000000..50844ea --- /dev/null +++ b/tests/build_Standardize_pipline.py @@ -0,0 +1,49 @@ +from d3m import index +from d3m.metadata.base import ArgumentType +from d3m.metadata.pipeline import Pipeline, PrimitiveStep +from d3m.metadata import hyperparams +import copy + +# -> dataset_to_dataframe -> column_parser -> extract_columns_by_semantic_types(attributes) -> imputer -> random_forest +# extract_columns_by_semantic_types(targets) -> ^ + +# Creating pipeline +pipeline_description = Pipeline() +pipeline_description.add_input(name='inputs') + +# Step 0: dataset_to_dataframe +primitive_0 = index.get_primitive('d3m.primitives.tods.data_processing.dataset_to_dataframe') +step_0 = PrimitiveStep(primitive=primitive_0) +step_0.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='inputs.0') +step_0.add_output('produce') +pipeline_description.add_step(step_0) + +# # Step 1: column_parser +primitive_1 = index.get_primitive('d3m.primitives.data_transformation.column_parser.Common') +step_1 = PrimitiveStep(primitive=primitive_1) +step_1.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce') +step_1.add_output('produce') +pipeline_description.add_step(step_1) + +# # Step 2: test primitive +primitive_2 = index.get_primitive('d3m.primitives.tods.timeseries_processing.transformation.standard_scaler') +step_2 = PrimitiveStep(primitive=primitive_2) +step_2.add_hyperparameter(name='use_semantic_types', argument_type=ArgumentType.VALUE, data=True) +step_2.add_hyperparameter(name='use_columns', argument_type=ArgumentType.VALUE, data=(2,)) +step_2.add_hyperparameter(name='return_result', argument_type=ArgumentType.VALUE, data='append') +step_2.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce') +step_2.add_output('produce') +pipeline_description.add_step(step_2) + +# Final Output +pipeline_description.add_output(name='output predictions', data_reference='steps.2.produce') + +# Output to YAML +yaml = pipeline_description.to_yaml() +with open('pipeline.yml', 'w') as f: + f.write(yaml) 
+print(yaml)
+
+# Or you can output json
+#data = pipeline_description.to_json()
+
diff --git a/tests/build_TRMF_pipline.py b/tests/build_TRMF_pipline.py
new file mode 100644
index 0000000..b058cbc
--- /dev/null
+++ b/tests/build_TRMF_pipline.py
@@ -0,0 +1,44 @@
+from d3m import index
+from d3m.metadata.base import ArgumentType
+from d3m.metadata.pipeline import Pipeline, PrimitiveStep
+
+
+# Creating pipeline
+pipeline_description = Pipeline()
+pipeline_description.add_input(name='inputs')
+
+# Step 0: dataset_to_dataframe
+step_0 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.tods.data_processing.dataset_to_dataframe'))
+step_0.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='inputs.0')
+step_0.add_output('produce')
+pipeline_description.add_step(step_0)
+
+# Step 1: column_parser
+step_1 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.column_parser.Common'))
+step_1.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce')
+step_1.add_output('produce')
+pipeline_description.add_step(step_1)
+
+
+# Step 2: TRMF (temporal regularized matrix factorization)
+step_2 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.tods.feature_analysis.trmf'))
+step_2.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce')
+step_2.add_output('produce')
+
+step_2.add_hyperparameter(name='lags', argument_type=ArgumentType.VALUE, data=[1,2,10,100])  # lag set for the autoregressive regularizer
+# step_2.add_hyperparameter(name='K', argument_type=ArgumentType.VALUE, data=3)
+# step_2.add_hyperparameter(name='use_columns', argument_type=ArgumentType.VALUE, data=(2, 3, 4, 5, 6))
+
+pipeline_description.add_step(step_2)
+
+# Final Output
+pipeline_description.add_output(name='output predictions', data_reference='steps.2.produce')
+
+# Output to YAML
+yaml = pipeline_description.to_yaml()
+with open('pipeline.yml', 'w') as f:
+    f.write(yaml)
+print(yaml)
+
+# Or you can output json
+#data = pipeline_description.to_json()
diff --git a/tests/build_Telemanom.py b/tests/build_Telemanom.py
new file mode 100644
index 0000000..afb4bb3
--- /dev/null
+++ b/tests/build_Telemanom.py
@@ -0,0 +1,48 @@
+from d3m import index
+from d3m.metadata.base import ArgumentType
+from d3m.metadata.pipeline import Pipeline, PrimitiveStep
+
+# -> dataset_to_dataframe -> column_parser -> telemanom (LSTM-based anomaly detection)
+
+# Creating pipeline
+pipeline_description = Pipeline()
+pipeline_description.add_input(name='inputs')
+
+
+# Step 0: dataset_to_dataframe
+primitive_0 = index.get_primitive('d3m.primitives.tods.data_processing.dataset_to_dataframe')
+step_0 = PrimitiveStep(primitive=primitive_0)
+step_0.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='inputs.0')
+step_0.add_output('produce')
+pipeline_description.add_step(step_0)
+
+# Step 1: Column Parser
+primitive_1 = index.get_primitive('d3m.primitives.data_transformation.column_parser.Common')
+step_1 = PrimitiveStep(primitive=primitive_1)
+step_1.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce')
+step_1.add_output('produce')
+pipeline_description.add_step(step_1)
+
+# Step 2: Telemanom detector
+primitive_2 = index.get_primitive('d3m.primitives.tods.detection_algorithm.telemanom')
+step_2 = PrimitiveStep(primitive=primitive_2)
+step_2.add_hyperparameter(name='use_semantic_types',
argument_type=ArgumentType.VALUE, data=True) +step_2.add_hyperparameter(name='use_columns', argument_type=ArgumentType.VALUE, data=(2,3,4,5,6)) +step_2.add_hyperparameter(name='return_result', argument_type=ArgumentType.VALUE, data='append') +step_2.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce') +step_2.add_output('produce') +pipeline_description.add_step(step_2) + + +# Final Output +pipeline_description.add_output(name='output predictions', data_reference='steps.2.produce') + +# Output to YAML +yaml = pipeline_description.to_yaml() +with open('pipeline.yml', 'w') as f: + f.write(yaml) +print(yaml) + +# Or you can output json +#data = pipline_description.to_json() diff --git a/tests/build_TimeIntervalTransform_pipeline.py b/tests/build_TimeIntervalTransform_pipeline.py new file mode 100644 index 0000000..289171c --- /dev/null +++ b/tests/build_TimeIntervalTransform_pipeline.py @@ -0,0 +1,86 @@ +from d3m import index +from d3m.metadata.base import ArgumentType +from d3m.metadata.pipeline import Pipeline, PrimitiveStep + +# -> dataset_to_dataframe -> column_parser -> extract_columns_by_semantic_types(attributes) -> imputer -> random_forest +# extract_columns_by_semantic_types(targets) -> ^ + +# Creating pipeline +pipeline_description = Pipeline() +pipeline_description.add_input(name='inputs') + +# Step 0: dataset_to_dataframe +primitive_0 = index.get_primitive('d3m.primitives.tods.data_processing.dataset_to_dataframe') +step_0 = PrimitiveStep(primitive=primitive_0) +step_0.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='inputs.0') +step_0.add_output('produce') +pipeline_description.add_step(step_0) + +# # Step 1: dataframe transformation +# primitive_1 = index.get_primitive('d3m.primitives.data_transformation.SKPowerTransformer') +# primitive_1 = index.get_primitive('d3m.primitives.data_transformation.SKStandardization') +# primitive_1 = index.get_primitive('d3m.primitives.data_transformation.SKQuantileTransformer') + +#Step 1: column_parser +step_1 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.column_parser.Common')) +step_1.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce') +step_1.add_output('produce') +pipeline_description.add_step(step_1) + +primitive_2 = index.get_primitive('d3m.primitives.tods.data_processing.time_interval_transform') +step_2 = PrimitiveStep(primitive=primitive_2) +step_2.add_hyperparameter(name="time_interval", argument_type=ArgumentType.VALUE, data = '5T') +step_2.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce') +step_2.add_output('produce') +pipeline_description.add_step(step_2) +# +# # Step 2: column_parser +# step_2 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.column_parser.Common')) +# step_2.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce') +# step_2.add_output('produce') +# pipeline_description.add_step(step_2) +# +# +# # Step 3: extract_columns_by_semantic_types(attributes) +# step_3 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common')) +# step_3.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.2.produce') +# step_3.add_output('produce') +# step_3.add_hyperparameter(name='semantic_types', argument_type=ArgumentType.VALUE, +# 
data=['https://metadata.datadrivendiscovery.org/types/Attribute'])
+# pipeline_description.add_step(step_3)
+#
+# # Step 4: extract_columns_by_semantic_types(targets)
+# step_4 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common'))
+# step_4.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce')
+# step_4.add_output('produce')
+# step_4.add_hyperparameter(name='semantic_types', argument_type=ArgumentType.VALUE,
+# data=['https://metadata.datadrivendiscovery.org/types/TrueTarget'])
+# pipeline_description.add_step(step_4)
+#
+# attributes = 'steps.3.produce'
+# targets = 'steps.4.produce'
+#
+# # Step 5: imputer
+# step_5 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_cleaning.imputer.SKlearn'))
+# step_5.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference=attributes)
+# step_5.add_output('produce')
+# pipeline_description.add_step(step_5)
+#
+# # Step 6: random_forest
+# step_6 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.regression.random_forest.SKlearn'))
+# step_6.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.5.produce')
+# step_6.add_argument(name='outputs', argument_type=ArgumentType.CONTAINER, data_reference=targets)
+# step_6.add_output('produce')
+# pipeline_description.add_step(step_6)
+
+# Final Output
+pipeline_description.add_output(name='output predictions', data_reference='steps.2.produce')
+
+# Output to YAML
+yaml = pipeline_description.to_yaml()
+with open('pipeline.yml', 'w') as f:
+    f.write(yaml)
+print(yaml)
+
+# Or you can output json
+#data = pipeline_description.to_json()
diff --git a/tests/build_TruncatedSVD_pipline.py b/tests/build_TruncatedSVD_pipline.py
new file mode 100644
index 0000000..af14404
--- /dev/null
+++ b/tests/build_TruncatedSVD_pipline.py
@@ -0,0 +1,44 @@
+from d3m import index
+from d3m.metadata.base import ArgumentType
+from d3m.metadata.pipeline import Pipeline, PrimitiveStep
+
+
+# Creating pipeline
+pipeline_description = Pipeline()
+pipeline_description.add_input(name='inputs')
+
+# Step 0: dataset_to_dataframe
+step_0 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.tods.data_processing.dataset_to_dataframe'))
+step_0.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='inputs.0')
+step_0.add_output('produce')
+pipeline_description.add_step(step_0)
+
+# Step 1: column_parser
+step_1 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.column_parser.Common'))
+step_1.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce')
+step_1.add_output('produce')
+pipeline_description.add_step(step_1)
+
+
+# Step 2: TruncatedSVD
+step_2 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.tods.feature_analysis.truncated_svd'))
+step_2.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce')
+step_2.add_output('produce')
+step_2.add_hyperparameter(name = 'n_components', argument_type=ArgumentType.VALUE, data = 3)
+step_2.add_hyperparameter(name = 'use_columns', argument_type=ArgumentType.VALUE, data = (2, 3, 4, 5, 6))
+step_2.add_hyperparameter(name = 'return_result', argument_type=ArgumentType.VALUE, data = 'append')
+step_2.add_hyperparameter(name = 'use_semantic_types', argument_type=ArgumentType.VALUE, data = True)
+pipeline_description.add_step(step_2)
+
+# Final Output
+pipeline_description.add_output(name='output predictions', data_reference='steps.2.produce')
+
+# Output to YAML
+yaml = pipeline_description.to_yaml()
+with open('pipeline.yml', 'w') as f:
+    f.write(yaml)
+print(yaml)
+
+# Or you can output json
+#data = pipeline_description.to_json()
+
diff --git a/tests/build_VariationalAutoEncoder.py b/tests/build_VariationalAutoEncoder.py
new file mode 100644
index 0000000..cf0e6f9
--- /dev/null
+++ b/tests/build_VariationalAutoEncoder.py
@@ -0,0 +1,67 @@
+from d3m import index
+from d3m.metadata.base import ArgumentType
+from d3m.metadata.pipeline import Pipeline, PrimitiveStep
+
+# -> dataset_to_dataframe -> column_parser -> extract_columns_by_semantic_types(attributes) -> imputer -> random_forest
+# extract_columns_by_semantic_types(targets) -> ^
+
+# Creating pipeline
+pipeline_description = Pipeline()
+pipeline_description.add_input(name='inputs')
+
+# Step 0: dataset_to_dataframe
+step_0 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.tods.data_processing.dataset_to_dataframe'))
+step_0.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='inputs.0')
+step_0.add_output('produce')
+pipeline_description.add_step(step_0)
+
+# Step 1: column_parser
+step_1 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.column_parser.Common'))
+step_1.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce')
+step_1.add_output('produce')
+pipeline_description.add_step(step_1)
+
+# Step 2: extract_columns_by_semantic_types(attributes)
+step_2 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common'))
+step_2.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce')
+step_2.add_output('produce')
+step_2.add_hyperparameter(name='semantic_types', argument_type=ArgumentType.VALUE,
+                          data=['https://metadata.datadrivendiscovery.org/types/Attribute'])
+pipeline_description.add_step(step_2)
+
+# Step 3: extract_columns_by_semantic_types(targets)
+step_3 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common'))
+step_3.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce')
+step_3.add_output('produce')
+step_3.add_hyperparameter(name='semantic_types', argument_type=ArgumentType.VALUE,
+                          data=['https://metadata.datadrivendiscovery.org/types/TrueTarget'])
+pipeline_description.add_step(step_3)
+
+attributes = 'steps.2.produce'
+targets = 'steps.3.produce'
+
+# Step 4: imputer
+step_4 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_cleaning.imputer.SKlearn'))
+step_4.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference=attributes)
+step_4.add_output('produce')
+pipeline_description.add_step(step_4)
+
+# Step 5: variational autoencoder
+step_5 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.tods.detection_algorithm.pyod_vae'))
+step_5.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference=attributes)
+step_5.add_output('produce')
+pipeline_description.add_step(step_5)
+
+
+# Final Output
+pipeline_description.add_output(name='output predictions', data_reference='steps.5.produce')
+
+# Output to YAML
+yaml = pipeline_description.to_yaml()
+with open('pipeline.yml', 'w') as f:
+    f.write(yaml)
+print(yaml)
+
+# Or you can output json
+#data = pipeline_description.to_json()
+
diff --git a/tests/build_WaveletTransform_pipline.py b/tests/build_WaveletTransform_pipline.py
new file mode 100644
index 0000000..de3aa90
--- /dev/null
+++ b/tests/build_WaveletTransform_pipline.py
@@ -0,0 +1,64 @@
+from d3m import index
+from d3m.metadata.base import ArgumentType
+from d3m.metadata.pipeline import Pipeline, PrimitiveStep
+from d3m.metadata import hyperparams
+import copy
+
+# -> dataset_to_dataframe -> column_parser -> extract_columns_by_semantic_types(attributes) -> imputer -> random_forest
+# extract_columns_by_semantic_types(targets) -> ^
+
+# Creating pipeline
+pipeline_description = Pipeline()
+pipeline_description.add_input(name='inputs')
+
+# Step 0: dataset_to_dataframe
+primitive_0 = index.get_primitive('d3m.primitives.tods.data_processing.dataset_to_dataframe')
+step_0 = PrimitiveStep(primitive=primitive_0)
+step_0.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='inputs.0')
+step_0.add_output('produce')
+pipeline_description.add_step(step_0)
+
+# # Step 1: column_parser
+primitive_1 = index.get_primitive('d3m.primitives.data_transformation.column_parser.Common')
+step_1 = PrimitiveStep(primitive=primitive_1)
+step_1.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce')
+step_1.add_output('produce')
+pipeline_description.add_step(step_1)
+
+# # Step 2: test WaveletTransform
+primitive_2 = index.get_primitive('d3m.primitives.tods.feature_analysis.wavelet_transform')
+step_2 = PrimitiveStep(primitive=primitive_2)
+step_2.add_hyperparameter(name='wavelet', argument_type=ArgumentType.VALUE, data='db8')
+step_2.add_hyperparameter(name='level', argument_type=ArgumentType.VALUE, data=2)
+step_2.add_hyperparameter(name='use_semantic_types', argument_type=ArgumentType.VALUE, data=True)
+step_2.add_hyperparameter(name='use_columns', argument_type=ArgumentType.VALUE, data=(2,))
+step_2.add_hyperparameter(name='return_result', argument_type=ArgumentType.VALUE, data='new')
+step_2.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce')
+step_2.add_output('produce')
+pipeline_description.add_step(step_2)
+
+# # Step 3: test inverse WaveletTransform
+primitive_3 = index.get_primitive('d3m.primitives.tods.feature_analysis.wavelet_transform')
+step_3 = PrimitiveStep(primitive=primitive_3)
+step_3.add_hyperparameter(name='wavelet', argument_type=ArgumentType.VALUE, data='db8')
+step_3.add_hyperparameter(name='level', argument_type=ArgumentType.VALUE, data=2)
+step_3.add_hyperparameter(name='inverse', argument_type=ArgumentType.VALUE, data=1)
+step_3.add_hyperparameter(name='use_semantic_types', argument_type=ArgumentType.VALUE, data=False)
+step_3.add_hyperparameter(name='return_result', argument_type=ArgumentType.VALUE, data='new')
+step_3.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.2.produce')
+step_3.add_output('produce')
+pipeline_description.add_step(step_3)
+
+
+# Final Output
+pipeline_description.add_output(name='output predictions', data_reference='steps.2.produce')
+
+# Output to YAML
+yaml = pipeline_description.to_yaml()
+with open('pipeline.yml', 'w') as f:
+    f.write(yaml)
+print(yaml)
+
+# Or you can output json
+#data = pipeline_description.to_json()
+
diff --git a/tests/build_test_detection_algorithm_PyodMoGaal.py b/tests/build_test_detection_algorithm_PyodMoGaal.py
new file mode 100644
index 0000000..a488900
--- /dev/null
+++ 
b/tests/build_test_detection_algorithm_PyodMoGaal.py @@ -0,0 +1,50 @@ +from d3m import index +from d3m.metadata.base import ArgumentType +from d3m.metadata.pipeline import Pipeline, PrimitiveStep +from d3m.metadata import hyperparams +import copy + +# -> dataset_to_dataframe -> column_parser -> extract_columns_by_semantic_types(attributes) -> imputer -> random_forest +# extract_columns_by_semantic_types(targets) -> ^ + +# Creating pipeline +pipeline_description = Pipeline() +pipeline_description.add_input(name='inputs') + +# Step 0: dataset_to_dataframe +primitive_0 = index.get_primitive('d3m.primitives.tods.data_processing.dataset_to_dataframe') +step_0 = PrimitiveStep(primitive=primitive_0) +step_0.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='inputs.0') +step_0.add_output('produce') +pipeline_description.add_step(step_0) + +# # Step 1: column_parser +primitive_1 = index.get_primitive('d3m.primitives.data_transformation.column_parser.Common') +step_1 = PrimitiveStep(primitive=primitive_1) +step_1.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce') +step_1.add_output('produce') +pipeline_description.add_step(step_1) + +# # Step 2: test primitive +primitive_2 = index.get_primitive('d3m.primitives.tods.detection_algorithm.pyod_mogaal') + +step_2 = PrimitiveStep(primitive=primitive_2) +step_2.add_hyperparameter(name='contamination', argument_type=ArgumentType.VALUE, data=0.1) +step_2.add_hyperparameter(name='use_semantic_types', argument_type=ArgumentType.VALUE, data=True) +step_2.add_hyperparameter(name='use_columns', argument_type=ArgumentType.VALUE, data=(2,)) # There is sth wrong with multi-dimensional +step_2.add_hyperparameter(name='return_result', argument_type=ArgumentType.VALUE, data='append') +step_2.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce') +step_2.add_output('produce') +pipeline_description.add_step(step_2) + +# Final Output +pipeline_description.add_output(name='output predictions', data_reference='steps.2.produce') + +# Output to YAML +yaml = pipeline_description.to_yaml() +with open('pipeline.yml', 'w') as f: + f.write(yaml) +print(yaml) + +# Or you can output json +#data = pipline_description.to_json() diff --git a/tests/build_test_detection_algorithm_PyodSoGaal.py b/tests/build_test_detection_algorithm_PyodSoGaal.py new file mode 100644 index 0000000..f14c5a0 --- /dev/null +++ b/tests/build_test_detection_algorithm_PyodSoGaal.py @@ -0,0 +1,50 @@ +from d3m import index +from d3m.metadata.base import ArgumentType +from d3m.metadata.pipeline import Pipeline, PrimitiveStep +from d3m.metadata import hyperparams +import copy + +# -> dataset_to_dataframe -> column_parser -> extract_columns_by_semantic_types(attributes) -> imputer -> random_forest +# extract_columns_by_semantic_types(targets) -> ^ + +# Creating pipeline +pipeline_description = Pipeline() +pipeline_description.add_input(name='inputs') + +# Step 0: dataset_to_dataframe +primitive_0 = index.get_primitive('d3m.primitives.tods.data_processing.dataset_to_dataframe') +step_0 = PrimitiveStep(primitive=primitive_0) +step_0.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='inputs.0') +step_0.add_output('produce') +pipeline_description.add_step(step_0) + +# # Step 1: column_parser +primitive_1 = index.get_primitive('d3m.primitives.data_transformation.column_parser.Common') +step_1 = PrimitiveStep(primitive=primitive_1) +step_1.add_argument(name='inputs', 
argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce') +step_1.add_output('produce') +pipeline_description.add_step(step_1) + +# # Step 2: test primitive +primitive_2 = index.get_primitive('d3m.primitives.tods.detection_algorithm.pyod_sogaal') + +step_2 = PrimitiveStep(primitive=primitive_2) +step_2.add_hyperparameter(name='contamination', argument_type=ArgumentType.VALUE, data=0.1) +step_2.add_hyperparameter(name='use_semantic_types', argument_type=ArgumentType.VALUE, data=True) +step_2.add_hyperparameter(name='use_columns', argument_type=ArgumentType.VALUE, data=(2,)) # There is sth wrong with multi-dimensional +step_2.add_hyperparameter(name='return_result', argument_type=ArgumentType.VALUE, data='append') +step_2.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce') +step_2.add_output('produce') +pipeline_description.add_step(step_2) + +# Final Output +pipeline_description.add_output(name='output predictions', data_reference='steps.2.produce') + +# Output to YAML +yaml = pipeline_description.to_yaml() +with open('pipeline.yml', 'w') as f: + f.write(yaml) +print(yaml) + +# Or you can output json +#data = pipline_description.to_json() diff --git a/tests/build_test_feature_analysis_spectral_residual_transform_pipeline.py b/tests/build_test_feature_analysis_spectral_residual_transform_pipeline.py new file mode 100644 index 0000000..d481651 --- /dev/null +++ b/tests/build_test_feature_analysis_spectral_residual_transform_pipeline.py @@ -0,0 +1,61 @@ +from d3m import index +from d3m.metadata.base import ArgumentType +from d3m.metadata.pipeline import Pipeline, PrimitiveStep +from d3m.metadata import hyperparams + +# -> dataset_to_dataframe -> column_parser -> extract_columns_by_semantic_types(attributes) -> imputer -> random_forest +# extract_columns_by_semantic_types(targets) -> ^ + +# Creating pipeline +pipeline_description = Pipeline() +pipeline_description.add_input(name='inputs') + +# Step 0: dataset_to_dataframe +primitive_0 = index.get_primitive('d3m.primitives.data_transformation.dataset_to_dataframe.Common') +step_0 = PrimitiveStep(primitive=primitive_0) +step_0.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='inputs.0') +step_0.add_output('produce') +pipeline_description.add_step(step_0) + +# # Step 1: column_parser +primitive_1 = index.get_primitive('d3m.primitives.data_transformation.column_parser.Common') +step_1 = PrimitiveStep(primitive=primitive_1) +step_1.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce') +step_1.add_output('produce') +pipeline_description.add_step(step_1) + +# # Step 2: Standardization +primitive_2 = index.get_primitive('d3m.primitives.tods.timeseries_processing.transformation.standard_scaler') +step_2 = PrimitiveStep(primitive=primitive_2) +step_2.add_hyperparameter(name='use_semantic_types', argument_type=ArgumentType.VALUE, data=True) +step_2.add_hyperparameter(name='use_columns', argument_type=ArgumentType.VALUE, data=(2,3,4,5,6)) +step_2.add_hyperparameter(name='return_result', argument_type=ArgumentType.VALUE, data='append') +step_2.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce') +step_2.add_output('produce') +pipeline_description.add_step(step_2) + +# # Step 3: test primitive +# primitive_3 = index.get_primitive('d3m.primitives.anomaly_detection.KNNPrimitive') +primitive_3 = 
index.get_primitive('d3m.primitives.tods.feature_analysis.spectral_residual_transform') +step_3 = PrimitiveStep(primitive=primitive_3) +step_3.add_hyperparameter(name='avg_filter_dimension', argument_type=ArgumentType.VALUE, data=4) +step_3.add_hyperparameter(name='use_semantic_types', argument_type=ArgumentType.VALUE, data=True) +step_3.add_hyperparameter(name='use_columns', argument_type=ArgumentType.VALUE, data=(8,9,10,11,12)) # There is sth wrong with multi-dimensional +step_3.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.2.produce') +step_3.add_output('produce') +pipeline_description.add_step(step_3) + + + +# Final Output +pipeline_description.add_output(name='output', data_reference='steps.3.produce') + +# Output to YAML +yaml = pipeline_description.to_yaml() +with open('pipeline.yml', 'w') as f: + f.write(yaml) + + +# Or you can output json +#data = pipline_description.to_json() + diff --git a/tests/build_test_feature_analysis_statistical_abs_energy.py b/tests/build_test_feature_analysis_statistical_abs_energy.py new file mode 100644 index 0000000..fc616c4 --- /dev/null +++ b/tests/build_test_feature_analysis_statistical_abs_energy.py @@ -0,0 +1,62 @@ +from d3m import index +from d3m.metadata.base import ArgumentType +from d3m.metadata.pipeline import Pipeline, PrimitiveStep +from d3m.metadata import hyperparams + +# -> dataset_to_dataframe -> column_parser -> extract_columns_by_semantic_types(attributes) -> imputer -> random_forest +# extract_columns_by_semantic_types(targets) -> ^ + +# Creating pipeline +pipeline_description = Pipeline() +pipeline_description.add_input(name='inputs') + +# Step 0: dataset_to_dataframe +primitive_0 = index.get_primitive('d3m.primitives.tods.data_processing.dataset_to_dataframe') +step_0 = PrimitiveStep(primitive=primitive_0) +step_0.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='inputs.0') +step_0.add_output('produce') +pipeline_description.add_step(step_0) + +# # Step 1: column_parser +primitive_1 = index.get_primitive('d3m.primitives.data_transformation.column_parser.Common') +step_1 = PrimitiveStep(primitive=primitive_1) +step_1.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce') +step_1.add_output('produce') +pipeline_description.add_step(step_1) + +# # Step 2: Standardization +primitive_2 = index.get_primitive('d3m.primitives.tods.timeseries_processing.transformation.standard_scaler') +step_2 = PrimitiveStep(primitive=primitive_2) +step_2.add_hyperparameter(name='use_semantic_types', argument_type=ArgumentType.VALUE, data=True) +step_2.add_hyperparameter(name='use_columns', argument_type=ArgumentType.VALUE, data=(2,3,4,5,6)) +step_2.add_hyperparameter(name='return_result', argument_type=ArgumentType.VALUE, data='append') +step_2.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce') +step_2.add_output('produce') +pipeline_description.add_step(step_2) + +# # Step 3: test primitive +# primitive_3 = index.get_primitive('d3m.primitives.anomaly_detection.KNNPrimitive') +primitive_3 = index.get_primitive('d3m.primitives.tods.feature_analysis.statistical_abs_energy') +step_3 = PrimitiveStep(primitive=primitive_3) +step_3.add_hyperparameter(name='window_size', argument_type=ArgumentType.VALUE, data=4) +step_3.add_hyperparameter(name='use_semantic_types', argument_type=ArgumentType.VALUE, data=True) +step_3.add_hyperparameter(name='use_columns', argument_type=ArgumentType.VALUE, 
data=(8,9,10,11,12)) # There is sth wrong with multi-dimensional +step_3.add_hyperparameter(name='return_result', argument_type=ArgumentType.VALUE, data='append') +step_3.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.2.produce') +step_3.add_output('produce') +pipeline_description.add_step(step_3) + + + +# Final Output +pipeline_description.add_output(name='output', data_reference='steps.3.produce') + +# Output to YAML +yaml = pipeline_description.to_yaml() +with open('pipeline.yml', 'w') as f: + f.write(yaml) + + +# Or you can output json +#data = pipline_description.to_json() + diff --git a/tests/build_test_feature_analysis_statistical_abs_sum.py b/tests/build_test_feature_analysis_statistical_abs_sum.py new file mode 100644 index 0000000..2a5f499 --- /dev/null +++ b/tests/build_test_feature_analysis_statistical_abs_sum.py @@ -0,0 +1,62 @@ +from d3m import index +from d3m.metadata.base import ArgumentType +from d3m.metadata.pipeline import Pipeline, PrimitiveStep +from d3m.metadata import hyperparams + +# -> dataset_to_dataframe -> column_parser -> extract_columns_by_semantic_types(attributes) -> imputer -> random_forest +# extract_columns_by_semantic_types(targets) -> ^ + +# Creating pipeline +pipeline_description = Pipeline() +pipeline_description.add_input(name='inputs') + +# Step 0: dataset_to_dataframe +primitive_0 = index.get_primitive('d3m.primitives.tods.data_processing.dataset_to_dataframe') +step_0 = PrimitiveStep(primitive=primitive_0) +step_0.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='inputs.0') +step_0.add_output('produce') +pipeline_description.add_step(step_0) + +# # Step 1: column_parser +primitive_1 = index.get_primitive('d3m.primitives.data_transformation.column_parser.Common') +step_1 = PrimitiveStep(primitive=primitive_1) +step_1.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce') +step_1.add_output('produce') +pipeline_description.add_step(step_1) + +# # Step 2: Standardization +primitive_2 = index.get_primitive('d3m.primitives.tods.timeseries_processing.transformation.standard_scaler') +step_2 = PrimitiveStep(primitive=primitive_2) +step_2.add_hyperparameter(name='use_semantic_types', argument_type=ArgumentType.VALUE, data=True) +step_2.add_hyperparameter(name='use_columns', argument_type=ArgumentType.VALUE, data=(2,3,4,5,6)) +step_2.add_hyperparameter(name='return_result', argument_type=ArgumentType.VALUE, data='append') +step_2.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce') +step_2.add_output('produce') +pipeline_description.add_step(step_2) + +# # Step 3: test primitive +# primitive_3 = index.get_primitive('d3m.primitives.anomaly_detection.KNNPrimitive') +primitive_3 = index.get_primitive('d3m.primitives.tods.feature_analysis.statistical_abs_sum') +step_3 = PrimitiveStep(primitive=primitive_3) +step_3.add_hyperparameter(name='window_size', argument_type=ArgumentType.VALUE, data=4) +step_3.add_hyperparameter(name='use_semantic_types', argument_type=ArgumentType.VALUE, data=True) +step_3.add_hyperparameter(name='use_columns', argument_type=ArgumentType.VALUE, data=(8,9,10,11,12)) # There is sth wrong with multi-dimensional +step_3.add_hyperparameter(name='return_result', argument_type=ArgumentType.VALUE, data='append') +step_3.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.2.produce') +step_3.add_output('produce') 
+pipeline_description.add_step(step_3) + + + +# Final Output +pipeline_description.add_output(name='output', data_reference='steps.3.produce') + +# Output to YAML +yaml = pipeline_description.to_yaml() +with open('pipeline.yml', 'w') as f: + f.write(yaml) + + +# Or you can output json +#data = pipline_description.to_json() + diff --git a/tests/build_test_feature_analysis_statistical_gmean.py b/tests/build_test_feature_analysis_statistical_gmean.py new file mode 100644 index 0000000..05230ff --- /dev/null +++ b/tests/build_test_feature_analysis_statistical_gmean.py @@ -0,0 +1,62 @@ +from d3m import index +from d3m.metadata.base import ArgumentType +from d3m.metadata.pipeline import Pipeline, PrimitiveStep +from d3m.metadata import hyperparams + +# -> dataset_to_dataframe -> column_parser -> extract_columns_by_semantic_types(attributes) -> imputer -> random_forest +# extract_columns_by_semantic_types(targets) -> ^ + +# Creating pipeline +pipeline_description = Pipeline() +pipeline_description.add_input(name='inputs') + +# Step 0: dataset_to_dataframe +primitive_0 = index.get_primitive('d3m.primitives.tods.data_processing.dataset_to_dataframe') +step_0 = PrimitiveStep(primitive=primitive_0) +step_0.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='inputs.0') +step_0.add_output('produce') +pipeline_description.add_step(step_0) + +# # Step 1: column_parser +primitive_1 = index.get_primitive('d3m.primitives.data_transformation.column_parser.Common') +step_1 = PrimitiveStep(primitive=primitive_1) +step_1.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce') +step_1.add_output('produce') +pipeline_description.add_step(step_1) + +# # Step 2: Standardization +primitive_2 = index.get_primitive('d3m.primitives.tods.timeseries_processing.transformation.standard_scaler') +step_2 = PrimitiveStep(primitive=primitive_2) +step_2.add_hyperparameter(name='use_semantic_types', argument_type=ArgumentType.VALUE, data=True) +step_2.add_hyperparameter(name='use_columns', argument_type=ArgumentType.VALUE, data=(2,3,4,5,6)) +step_2.add_hyperparameter(name='return_result', argument_type=ArgumentType.VALUE, data='append') +step_2.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce') +step_2.add_output('produce') +pipeline_description.add_step(step_2) + +# # Step 3: test primitive +# primitive_3 = index.get_primitive('d3m.primitives.anomaly_detection.KNNPrimitive') +primitive_3 = index.get_primitive('d3m.primitives.tods.feature_analysis.statistical_g_mean') +step_3 = PrimitiveStep(primitive=primitive_3) +step_3.add_hyperparameter(name='window_size', argument_type=ArgumentType.VALUE, data=4) +step_3.add_hyperparameter(name='use_semantic_types', argument_type=ArgumentType.VALUE, data=True) +step_3.add_hyperparameter(name='use_columns', argument_type=ArgumentType.VALUE, data=(5,6)) # There is sth wrong with multi-dimensional +step_3.add_hyperparameter(name='return_result', argument_type=ArgumentType.VALUE, data='append') +step_3.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.2.produce') +step_3.add_output('produce') +pipeline_description.add_step(step_3) + + + +# Final Output +pipeline_description.add_output(name='output', data_reference='steps.3.produce') + +# Output to YAML +yaml = pipeline_description.to_yaml() +with open('pipeline.yml', 'w') as f: + f.write(yaml) + + +# Or you can output json +#data = pipline_description.to_json() + diff --git 
a/tests/build_test_feature_analysis_statistical_hmean.py b/tests/build_test_feature_analysis_statistical_hmean.py new file mode 100644 index 0000000..047a6c4 --- /dev/null +++ b/tests/build_test_feature_analysis_statistical_hmean.py @@ -0,0 +1,62 @@ +from d3m import index +from d3m.metadata.base import ArgumentType +from d3m.metadata.pipeline import Pipeline, PrimitiveStep +from d3m.metadata import hyperparams + +# -> dataset_to_dataframe -> column_parser -> extract_columns_by_semantic_types(attributes) -> imputer -> random_forest +# extract_columns_by_semantic_types(targets) -> ^ + +# Creating pipeline +pipeline_description = Pipeline() +pipeline_description.add_input(name='inputs') + +# Step 0: dataset_to_dataframe +primitive_0 = index.get_primitive('d3m.primitives.data_transformation.dataset_to_dataframe.Common') +step_0 = PrimitiveStep(primitive=primitive_0) +step_0.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='inputs.0') +step_0.add_output('produce') +pipeline_description.add_step(step_0) + +# # Step 1: column_parser +primitive_1 = index.get_primitive('d3m.primitives.data_transformation.column_parser.Common') +step_1 = PrimitiveStep(primitive=primitive_1) +step_1.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce') +step_1.add_output('produce') +pipeline_description.add_step(step_1) + +# # Step 2: Standardization +primitive_2 = index.get_primitive('d3m.primitives.tods.timeseries_processing.transformation.standard_scaler') +step_2 = PrimitiveStep(primitive=primitive_2) +step_2.add_hyperparameter(name='use_semantic_types', argument_type=ArgumentType.VALUE, data=True) +step_2.add_hyperparameter(name='use_columns', argument_type=ArgumentType.VALUE, data=(2,3,4,5,6)) +step_2.add_hyperparameter(name='return_result', argument_type=ArgumentType.VALUE, data='append') +step_2.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce') +step_2.add_output('produce') +pipeline_description.add_step(step_2) + +# # Step 3: test primitive +# primitive_3 = index.get_primitive('d3m.primitives.anomaly_detection.KNNPrimitive') +primitive_3 = index.get_primitive('d3m.primitives.tods.feature_analysis.statistical_h_mean') +step_3 = PrimitiveStep(primitive=primitive_3) +step_3.add_hyperparameter(name='window_size', argument_type=ArgumentType.VALUE, data=4) +step_3.add_hyperparameter(name='use_semantic_types', argument_type=ArgumentType.VALUE, data=True) +step_3.add_hyperparameter(name='use_columns', argument_type=ArgumentType.VALUE, data=(5,6)) # There is sth wrong with multi-dimensional +step_3.add_hyperparameter(name='return_result', argument_type=ArgumentType.VALUE, data='append') +step_3.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.2.produce') +step_3.add_output('produce') +pipeline_description.add_step(step_3) + + + +# Final Output +pipeline_description.add_output(name='output', data_reference='steps.3.produce') + +# Output to YAML +yaml = pipeline_description.to_yaml() +with open('pipeline.yml', 'w') as f: + f.write(yaml) + + +# Or you can output json +#data = pipline_description.to_json() + diff --git a/tests/build_test_feature_analysis_statistical_kurtosis.py b/tests/build_test_feature_analysis_statistical_kurtosis.py new file mode 100644 index 0000000..ef5ea92 --- /dev/null +++ b/tests/build_test_feature_analysis_statistical_kurtosis.py @@ -0,0 +1,62 @@ +from d3m import index +from d3m.metadata.base import ArgumentType +from 
d3m.metadata.pipeline import Pipeline, PrimitiveStep +from d3m.metadata import hyperparams + +# -> dataset_to_dataframe -> column_parser -> extract_columns_by_semantic_types(attributes) -> imputer -> random_forest +# extract_columns_by_semantic_types(targets) -> ^ + +# Creating pipeline +pipeline_description = Pipeline() +pipeline_description.add_input(name='inputs') + +# Step 0: dataset_to_dataframe +primitive_0 = index.get_primitive('d3m.primitives.data_transformation.dataset_to_dataframe.Common') +step_0 = PrimitiveStep(primitive=primitive_0) +step_0.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='inputs.0') +step_0.add_output('produce') +pipeline_description.add_step(step_0) + +# # Step 1: column_parser +primitive_1 = index.get_primitive('d3m.primitives.data_transformation.column_parser.Common') +step_1 = PrimitiveStep(primitive=primitive_1) +step_1.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce') +step_1.add_output('produce') +pipeline_description.add_step(step_1) + +# # Step 2: Standardization +primitive_2 = index.get_primitive('d3m.primitives.tods.timeseries_processing.transformation.standard_scaler') +step_2 = PrimitiveStep(primitive=primitive_2) +step_2.add_hyperparameter(name='use_semantic_types', argument_type=ArgumentType.VALUE, data=True) +step_2.add_hyperparameter(name='use_columns', argument_type=ArgumentType.VALUE, data=(2,3,4,5,6)) +step_2.add_hyperparameter(name='return_result', argument_type=ArgumentType.VALUE, data='append') +step_2.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce') +step_2.add_output('produce') +pipeline_description.add_step(step_2) + +# # Step 3: test primitive +# primitive_3 = index.get_primitive('d3m.primitives.anomaly_detection.KNNPrimitive') +primitive_3 = index.get_primitive('d3m.primitives.tods.feature_analysis.statistical_kurtosis') +step_3 = PrimitiveStep(primitive=primitive_3) +step_3.add_hyperparameter(name='window_size', argument_type=ArgumentType.VALUE, data=4) +step_3.add_hyperparameter(name='use_semantic_types', argument_type=ArgumentType.VALUE, data=True) +step_3.add_hyperparameter(name='use_columns', argument_type=ArgumentType.VALUE, data=(5,6)) # There is sth wrong with multi-dimensional +step_3.add_hyperparameter(name='return_result', argument_type=ArgumentType.VALUE, data='append') +step_3.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.2.produce') +step_3.add_output('produce') +pipeline_description.add_step(step_3) + + + +# Final Output +pipeline_description.add_output(name='output', data_reference='steps.3.produce') + +# Output to YAML +yaml = pipeline_description.to_yaml() +with open('pipeline.yml', 'w') as f: + f.write(yaml) + + +# Or you can output json +#data = pipline_description.to_json() + diff --git a/tests/build_test_feature_analysis_statistical_maximum.py b/tests/build_test_feature_analysis_statistical_maximum.py new file mode 100644 index 0000000..e3bb764 --- /dev/null +++ b/tests/build_test_feature_analysis_statistical_maximum.py @@ -0,0 +1,62 @@ +from d3m import index +from d3m.metadata.base import ArgumentType +from d3m.metadata.pipeline import Pipeline, PrimitiveStep +from d3m.metadata import hyperparams + +# -> dataset_to_dataframe -> column_parser -> extract_columns_by_semantic_types(attributes) -> imputer -> random_forest +# extract_columns_by_semantic_types(targets) -> ^ + +# Creating pipeline +pipeline_description = Pipeline() 
+pipeline_description.add_input(name='inputs') + +# Step 0: dataset_to_dataframe +primitive_0 = index.get_primitive('d3m.primitives.tods.data_processing.dataset_to_dataframe') +step_0 = PrimitiveStep(primitive=primitive_0) +step_0.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='inputs.0') +step_0.add_output('produce') +pipeline_description.add_step(step_0) + +# # Step 1: column_parser +primitive_1 = index.get_primitive('d3m.primitives.data_transformation.column_parser.Common') +step_1 = PrimitiveStep(primitive=primitive_1) +step_1.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce') +step_1.add_output('produce') +pipeline_description.add_step(step_1) + +# # Step 2: Standardization +primitive_2 = index.get_primitive('d3m.primitives.tods.timeseries_processing.transformation.standard_scaler') +step_2 = PrimitiveStep(primitive=primitive_2) +step_2.add_hyperparameter(name='use_semantic_types', argument_type=ArgumentType.VALUE, data=True) +step_2.add_hyperparameter(name='use_columns', argument_type=ArgumentType.VALUE, data=(2,3,4,5,6)) +step_2.add_hyperparameter(name='return_result', argument_type=ArgumentType.VALUE, data='append') +step_2.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce') +step_2.add_output('produce') +pipeline_description.add_step(step_2) + +# # Step 3: test primitive +# primitive_3 = index.get_primitive('d3m.primitives.anomaly_detection.KNNPrimitive') +primitive_3 = index.get_primitive('d3m.primitives.tods.feature_analysis.statistical_maximum') +step_3 = PrimitiveStep(primitive=primitive_3) +step_3.add_hyperparameter(name='window_size', argument_type=ArgumentType.VALUE, data=4) +step_3.add_hyperparameter(name='use_semantic_types', argument_type=ArgumentType.VALUE, data=True) +step_3.add_hyperparameter(name='use_columns', argument_type=ArgumentType.VALUE, data=(5,6)) # There is sth wrong with multi-dimensional +step_3.add_hyperparameter(name='return_result', argument_type=ArgumentType.VALUE, data='append') +step_3.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.2.produce') +step_3.add_output('produce') +pipeline_description.add_step(step_3) + + + +# Final Output +pipeline_description.add_output(name='output', data_reference='steps.3.produce') + +# Output to YAML +yaml = pipeline_description.to_yaml() +with open('pipeline.yml', 'w') as f: + f.write(yaml) + + +# Or you can output json +#data = pipline_description.to_json() + diff --git a/tests/build_test_feature_analysis_statistical_mean.py b/tests/build_test_feature_analysis_statistical_mean.py new file mode 100644 index 0000000..5112fec --- /dev/null +++ b/tests/build_test_feature_analysis_statistical_mean.py @@ -0,0 +1,62 @@ +from d3m import index +from d3m.metadata.base import ArgumentType +from d3m.metadata.pipeline import Pipeline, PrimitiveStep +from d3m.metadata import hyperparams + +# -> dataset_to_dataframe -> column_parser -> extract_columns_by_semantic_types(attributes) -> imputer -> random_forest +# extract_columns_by_semantic_types(targets) -> ^ + +# Creating pipeline +pipeline_description = Pipeline() +pipeline_description.add_input(name='inputs') + +# Step 0: dataset_to_dataframe +primitive_0 = index.get_primitive('d3m.primitives.tods.data_processing.dataset_to_dataframe') +step_0 = PrimitiveStep(primitive=primitive_0) +step_0.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='inputs.0') 
+step_0.add_output('produce') +pipeline_description.add_step(step_0) + +# # Step 1: column_parser +primitive_1 = index.get_primitive('d3m.primitives.data_transformation.column_parser.Common') +step_1 = PrimitiveStep(primitive=primitive_1) +step_1.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce') +step_1.add_output('produce') +pipeline_description.add_step(step_1) + +# # Step 2: Standardization +primitive_2 = index.get_primitive('d3m.primitives.tods.timeseries_processing.transformation.standard_scaler') +step_2 = PrimitiveStep(primitive=primitive_2) +step_2.add_hyperparameter(name='use_semantic_types', argument_type=ArgumentType.VALUE, data=True) +step_2.add_hyperparameter(name='use_columns', argument_type=ArgumentType.VALUE, data=(2,3,4,5,6)) +step_2.add_hyperparameter(name='return_result', argument_type=ArgumentType.VALUE, data='append') +step_2.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce') +step_2.add_output('produce') +pipeline_description.add_step(step_2) + +# # Step 3: test primitive +# primitive_3 = index.get_primitive('d3m.primitives.anomaly_detection.KNNPrimitive') +primitive_3 = index.get_primitive('d3m.primitives.tods.feature_analysis.statistical_mean') +step_3 = PrimitiveStep(primitive=primitive_3) +step_3.add_hyperparameter(name='window_size', argument_type=ArgumentType.VALUE, data=4) +step_3.add_hyperparameter(name='use_semantic_types', argument_type=ArgumentType.VALUE, data=True) +step_3.add_hyperparameter(name='use_columns', argument_type=ArgumentType.VALUE, data=(5,6)) # There is sth wrong with multi-dimensional +step_3.add_hyperparameter(name='return_result', argument_type=ArgumentType.VALUE, data='append') +step_3.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.2.produce') +step_3.add_output('produce') +pipeline_description.add_step(step_3) + + + +# Final Output +pipeline_description.add_output(name='output', data_reference='steps.3.produce') + +# Output to YAML +yaml = pipeline_description.to_yaml() +with open('pipeline.yml', 'w') as f: + f.write(yaml) + + +# Or you can output json +#data = pipline_description.to_json() + diff --git a/tests/build_test_feature_analysis_statistical_mean_abs.py b/tests/build_test_feature_analysis_statistical_mean_abs.py new file mode 100644 index 0000000..4c94ec6 --- /dev/null +++ b/tests/build_test_feature_analysis_statistical_mean_abs.py @@ -0,0 +1,62 @@ +from d3m import index +from d3m.metadata.base import ArgumentType +from d3m.metadata.pipeline import Pipeline, PrimitiveStep +from d3m.metadata import hyperparams + +# -> dataset_to_dataframe -> column_parser -> extract_columns_by_semantic_types(attributes) -> imputer -> random_forest +# extract_columns_by_semantic_types(targets) -> ^ + +# Creating pipeline +pipeline_description = Pipeline() +pipeline_description.add_input(name='inputs') + +# Step 0: dataset_to_dataframe +primitive_0 = index.get_primitive('d3m.primitives.tods.data_processing.dataset_to_dataframe') +step_0 = PrimitiveStep(primitive=primitive_0) +step_0.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='inputs.0') +step_0.add_output('produce') +pipeline_description.add_step(step_0) + +# # Step 1: column_parser +primitive_1 = index.get_primitive('d3m.primitives.data_transformation.column_parser.Common') +step_1 = PrimitiveStep(primitive=primitive_1) +step_1.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, 
data_reference='steps.0.produce') +step_1.add_output('produce') +pipeline_description.add_step(step_1) + +# # Step 2: Standardization +primitive_2 = index.get_primitive('d3m.primitives.tods.timeseries_processing.transformation.standard_scaler') +step_2 = PrimitiveStep(primitive=primitive_2) +step_2.add_hyperparameter(name='use_semantic_types', argument_type=ArgumentType.VALUE, data=True) +step_2.add_hyperparameter(name='use_columns', argument_type=ArgumentType.VALUE, data=(2,3,4,5,6)) +step_2.add_hyperparameter(name='return_result', argument_type=ArgumentType.VALUE, data='append') +step_2.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce') +step_2.add_output('produce') +pipeline_description.add_step(step_2) + +# # Step 3: test primitive +# primitive_3 = index.get_primitive('d3m.primitives.anomaly_detection.KNNPrimitive') +primitive_3 = index.get_primitive('d3m.primitives.tods.feature_analysis.statistical_mean_abs') +step_3 = PrimitiveStep(primitive=primitive_3) +step_3.add_hyperparameter(name='window_size', argument_type=ArgumentType.VALUE, data=4) +step_3.add_hyperparameter(name='use_semantic_types', argument_type=ArgumentType.VALUE, data=True) +step_3.add_hyperparameter(name='use_columns', argument_type=ArgumentType.VALUE, data=(5,6)) # There is sth wrong with multi-dimensional +step_3.add_hyperparameter(name='return_result', argument_type=ArgumentType.VALUE, data='append') +step_3.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.2.produce') +step_3.add_output('produce') +pipeline_description.add_step(step_3) + + + +# Final Output +pipeline_description.add_output(name='output', data_reference='steps.3.produce') + +# Output to YAML +yaml = pipeline_description.to_yaml() +with open('pipeline.yml', 'w') as f: + f.write(yaml) + + +# Or you can output json +#data = pipline_description.to_json() + diff --git a/tests/build_test_feature_analysis_statistical_mean_abs_temporal_derivative.py b/tests/build_test_feature_analysis_statistical_mean_abs_temporal_derivative.py new file mode 100644 index 0000000..a1bdcdd --- /dev/null +++ b/tests/build_test_feature_analysis_statistical_mean_abs_temporal_derivative.py @@ -0,0 +1,62 @@ +from d3m import index +from d3m.metadata.base import ArgumentType +from d3m.metadata.pipeline import Pipeline, PrimitiveStep +from d3m.metadata import hyperparams + +# -> dataset_to_dataframe -> column_parser -> extract_columns_by_semantic_types(attributes) -> imputer -> random_forest +# extract_columns_by_semantic_types(targets) -> ^ + +# Creating pipeline +pipeline_description = Pipeline() +pipeline_description.add_input(name='inputs') + +# Step 0: dataset_to_dataframe +primitive_0 = index.get_primitive('d3m.primitives.tods.data_processing.dataset_to_dataframe') +step_0 = PrimitiveStep(primitive=primitive_0) +step_0.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='inputs.0') +step_0.add_output('produce') +pipeline_description.add_step(step_0) + +# # Step 1: column_parser +primitive_1 = index.get_primitive('d3m.primitives.data_transformation.column_parser.Common') +step_1 = PrimitiveStep(primitive=primitive_1) +step_1.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce') +step_1.add_output('produce') +pipeline_description.add_step(step_1) + +# # Step 2: Standardization +primitive_2 = index.get_primitive('d3m.primitives.tods.timeseries_processing.transformation.standard_scaler') +step_2 = 
PrimitiveStep(primitive=primitive_2) +step_2.add_hyperparameter(name='use_semantic_types', argument_type=ArgumentType.VALUE, data=True) +step_2.add_hyperparameter(name='use_columns', argument_type=ArgumentType.VALUE, data=(2,3,4,5,6)) +step_2.add_hyperparameter(name='return_result', argument_type=ArgumentType.VALUE, data='append') +step_2.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce') +step_2.add_output('produce') +pipeline_description.add_step(step_2) + +# # Step 3: test primitive +# primitive_3 = index.get_primitive('d3m.primitives.anomaly_detection.KNNPrimitive') +primitive_3 = index.get_primitive('d3m.primitives.tods.feature_analysis.statistical_mean_abs_temporal_derivative') +step_3 = PrimitiveStep(primitive=primitive_3) +step_3.add_hyperparameter(name='window_size', argument_type=ArgumentType.VALUE, data=4) +step_3.add_hyperparameter(name='use_semantic_types', argument_type=ArgumentType.VALUE, data=True) +step_3.add_hyperparameter(name='use_columns', argument_type=ArgumentType.VALUE, data=(5,6)) # There is sth wrong with multi-dimensional +step_3.add_hyperparameter(name='return_result', argument_type=ArgumentType.VALUE, data='append') +step_3.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.2.produce') +step_3.add_output('produce') +pipeline_description.add_step(step_3) + + + +# Final Output +pipeline_description.add_output(name='output', data_reference='steps.3.produce') + +# Output to YAML +yaml = pipeline_description.to_yaml() +with open('pipeline.yml', 'w') as f: + f.write(yaml) + + +# Or you can output json +#data = pipline_description.to_json() + diff --git a/tests/build_test_feature_analysis_statistical_mean_temporal_derivative.py b/tests/build_test_feature_analysis_statistical_mean_temporal_derivative.py new file mode 100644 index 0000000..e785efb --- /dev/null +++ b/tests/build_test_feature_analysis_statistical_mean_temporal_derivative.py @@ -0,0 +1,62 @@ +from d3m import index +from d3m.metadata.base import ArgumentType +from d3m.metadata.pipeline import Pipeline, PrimitiveStep +from d3m.metadata import hyperparams + +# -> dataset_to_dataframe -> column_parser -> extract_columns_by_semantic_types(attributes) -> imputer -> random_forest +# extract_columns_by_semantic_types(targets) -> ^ + +# Creating pipeline +pipeline_description = Pipeline() +pipeline_description.add_input(name='inputs') + +# Step 0: dataset_to_dataframe +primitive_0 = index.get_primitive('d3m.primitives.tods.data_processing.dataset_to_dataframe') +step_0 = PrimitiveStep(primitive=primitive_0) +step_0.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='inputs.0') +step_0.add_output('produce') +pipeline_description.add_step(step_0) + +# # Step 1: column_parser +primitive_1 = index.get_primitive('d3m.primitives.data_transformation.column_parser.Common') +step_1 = PrimitiveStep(primitive=primitive_1) +step_1.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce') +step_1.add_output('produce') +pipeline_description.add_step(step_1) + +# # Step 2: Standardization +primitive_2 = index.get_primitive('d3m.primitives.tods.timeseries_processing.transformation.standard_scaler') +step_2 = PrimitiveStep(primitive=primitive_2) +step_2.add_hyperparameter(name='use_semantic_types', argument_type=ArgumentType.VALUE, data=True) +step_2.add_hyperparameter(name='use_columns', argument_type=ArgumentType.VALUE, data=(2,3,4,5,6)) 
+step_2.add_hyperparameter(name='return_result', argument_type=ArgumentType.VALUE, data='append') +step_2.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce') +step_2.add_output('produce') +pipeline_description.add_step(step_2) + +# # Step 3: test primitive +# primitive_3 = index.get_primitive('d3m.primitives.anomaly_detection.KNNPrimitive') +primitive_3 = index.get_primitive('d3m.primitives.tods.feature_analysis.statistical_mean_temporal_derivative') +step_3 = PrimitiveStep(primitive=primitive_3) +step_3.add_hyperparameter(name='window_size', argument_type=ArgumentType.VALUE, data=4) +step_3.add_hyperparameter(name='use_semantic_types', argument_type=ArgumentType.VALUE, data=True) +step_3.add_hyperparameter(name='use_columns', argument_type=ArgumentType.VALUE, data=(5,6)) # There is sth wrong with multi-dimensional +step_3.add_hyperparameter(name='return_result', argument_type=ArgumentType.VALUE, data='append') +step_3.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.2.produce') +step_3.add_output('produce') +pipeline_description.add_step(step_3) + + + +# Final Output +pipeline_description.add_output(name='output', data_reference='steps.3.produce') + +# Output to YAML +yaml = pipeline_description.to_yaml() +with open('pipeline.yml', 'w') as f: + f.write(yaml) + + +# Or you can output json +#data = pipline_description.to_json() + diff --git a/tests/build_test_feature_analysis_statistical_median.py b/tests/build_test_feature_analysis_statistical_median.py new file mode 100644 index 0000000..b11b108 --- /dev/null +++ b/tests/build_test_feature_analysis_statistical_median.py @@ -0,0 +1,62 @@ +from d3m import index +from d3m.metadata.base import ArgumentType +from d3m.metadata.pipeline import Pipeline, PrimitiveStep +from d3m.metadata import hyperparams + +# -> dataset_to_dataframe -> column_parser -> extract_columns_by_semantic_types(attributes) -> imputer -> random_forest +# extract_columns_by_semantic_types(targets) -> ^ + +# Creating pipeline +pipeline_description = Pipeline() +pipeline_description.add_input(name='inputs') + +# Step 0: dataset_to_dataframe +primitive_0 = index.get_primitive('d3m.primitives.tods.data_processing.dataset_to_dataframe') +step_0 = PrimitiveStep(primitive=primitive_0) +step_0.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='inputs.0') +step_0.add_output('produce') +pipeline_description.add_step(step_0) + +# # Step 1: column_parser +primitive_1 = index.get_primitive('d3m.primitives.data_transformation.column_parser.Common') +step_1 = PrimitiveStep(primitive=primitive_1) +step_1.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce') +step_1.add_output('produce') +pipeline_description.add_step(step_1) + +# # Step 2: Standardization +primitive_2 = index.get_primitive('d3m.primitives.tods.timeseries_processing.transformation.standard_scaler') +step_2 = PrimitiveStep(primitive=primitive_2) +step_2.add_hyperparameter(name='use_semantic_types', argument_type=ArgumentType.VALUE, data=True) +step_2.add_hyperparameter(name='use_columns', argument_type=ArgumentType.VALUE, data=(2,3,4,5,6)) +step_2.add_hyperparameter(name='return_result', argument_type=ArgumentType.VALUE, data='append') +step_2.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce') +step_2.add_output('produce') +pipeline_description.add_step(step_2) + +# # Step 3: test primitive +# primitive_3 = 
index.get_primitive('d3m.primitives.anomaly_detection.KNNPrimitive') +primitive_3 = index.get_primitive('d3m.primitives.tods.feature_analysis.statistical_median') +step_3 = PrimitiveStep(primitive=primitive_3) +step_3.add_hyperparameter(name='window_size', argument_type=ArgumentType.VALUE, data=4) +step_3.add_hyperparameter(name='use_semantic_types', argument_type=ArgumentType.VALUE, data=True) +step_3.add_hyperparameter(name='use_columns', argument_type=ArgumentType.VALUE, data=(5,6)) # There is sth wrong with multi-dimensional +step_3.add_hyperparameter(name='return_result', argument_type=ArgumentType.VALUE, data='append') +step_3.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.2.produce') +step_3.add_output('produce') +pipeline_description.add_step(step_3) + + + +# Final Output +pipeline_description.add_output(name='output', data_reference='steps.3.produce') + +# Output to YAML +yaml = pipeline_description.to_yaml() +with open('pipeline.yml', 'w') as f: + f.write(yaml) + + +# Or you can output json +#data = pipline_description.to_json() + diff --git a/tests/build_test_feature_analysis_statistical_median_absolute_deviation.py b/tests/build_test_feature_analysis_statistical_median_absolute_deviation.py new file mode 100644 index 0000000..e219ad7 --- /dev/null +++ b/tests/build_test_feature_analysis_statistical_median_absolute_deviation.py @@ -0,0 +1,63 @@ +from d3m import index +from d3m.metadata.base import ArgumentType +from d3m.metadata.pipeline import Pipeline, PrimitiveStep +from d3m.metadata import hyperparams + +# -> dataset_to_dataframe -> column_parser -> extract_columns_by_semantic_types(attributes) -> imputer -> random_forest +# extract_columns_by_semantic_types(targets) -> ^ + +# Creating pipeline +pipeline_description = Pipeline() +pipeline_description.add_input(name='inputs') + +# Step 0: dataset_to_dataframe +primitive_0 = index.get_primitive('d3m.primitives.tods.data_processing.dataset_to_dataframe') +step_0 = PrimitiveStep(primitive=primitive_0) +step_0.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='inputs.0') +step_0.add_output('produce') +pipeline_description.add_step(step_0) + +# # Step 1: column_parser +primitive_1 = index.get_primitive('d3m.primitives.data_transformation.column_parser.Common') +step_1 = PrimitiveStep(primitive=primitive_1) +step_1.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce') +step_1.add_output('produce') +pipeline_description.add_step(step_1) + +# # Step 2: Standardization +primitive_2 = index.get_primitive('d3m.primitives.tods.timeseries_processing.transformation.standard_scaler') +step_2 = PrimitiveStep(primitive=primitive_2) +step_2.add_hyperparameter(name='use_semantic_types', argument_type=ArgumentType.VALUE, data=True) +step_2.add_hyperparameter(name='use_columns', argument_type=ArgumentType.VALUE, data=(2,3,4,5,6)) +step_2.add_hyperparameter(name='return_result', argument_type=ArgumentType.VALUE, data='append') +step_2.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce') +step_2.add_output('produce') +pipeline_description.add_step(step_2) + + +# # Step 3: test primitive +# primitive_3 = index.get_primitive('d3m.primitives.anomaly_detection.KNNPrimitive') +primitive_3 = index.get_primitive('d3m.primitives.tods.feature_analysis.statistical_median_abs_deviation') +step_3 = PrimitiveStep(primitive=primitive_3) +step_3.add_hyperparameter(name='window_size', 
argument_type=ArgumentType.VALUE, data=4) +step_3.add_hyperparameter(name='use_semantic_types', argument_type=ArgumentType.VALUE, data=True) +step_3.add_hyperparameter(name='use_columns', argument_type=ArgumentType.VALUE, data=(5,6)) # There is sth wrong with multi-dimensional +step_3.add_hyperparameter(name='return_result', argument_type=ArgumentType.VALUE, data='append') +step_3.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.2.produce') +step_3.add_output('produce') +pipeline_description.add_step(step_3) + + + +# Final Output +pipeline_description.add_output(name='output', data_reference='steps.3.produce') + +# Output to YAML +yaml = pipeline_description.to_yaml() +with open('pipeline.yml', 'w') as f: + f.write(yaml) + + +# Or you can output json +#data = pipline_description.to_json() + diff --git a/tests/build_test_feature_analysis_statistical_minimum.py b/tests/build_test_feature_analysis_statistical_minimum.py new file mode 100644 index 0000000..be8a148 --- /dev/null +++ b/tests/build_test_feature_analysis_statistical_minimum.py @@ -0,0 +1,62 @@ +from d3m import index +from d3m.metadata.base import ArgumentType +from d3m.metadata.pipeline import Pipeline, PrimitiveStep +from d3m.metadata import hyperparams + +# -> dataset_to_dataframe -> column_parser -> extract_columns_by_semantic_types(attributes) -> imputer -> random_forest +# extract_columns_by_semantic_types(targets) -> ^ + +# Creating pipeline +pipeline_description = Pipeline() +pipeline_description.add_input(name='inputs') + +# Step 0: dataset_to_dataframe +primitive_0 = index.get_primitive('d3m.primitives.tods.data_processing.dataset_to_dataframe') +step_0 = PrimitiveStep(primitive=primitive_0) +step_0.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='inputs.0') +step_0.add_output('produce') +pipeline_description.add_step(step_0) + +# # Step 1: column_parser +primitive_1 = index.get_primitive('d3m.primitives.data_transformation.column_parser.Common') +step_1 = PrimitiveStep(primitive=primitive_1) +step_1.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce') +step_1.add_output('produce') +pipeline_description.add_step(step_1) + +# # Step 2: Standardization +primitive_2 = index.get_primitive('d3m.primitives.tods.timeseries_processing.transformation.standard_scaler') +step_2 = PrimitiveStep(primitive=primitive_2) +step_2.add_hyperparameter(name='use_semantic_types', argument_type=ArgumentType.VALUE, data=True) +step_2.add_hyperparameter(name='use_columns', argument_type=ArgumentType.VALUE, data=(2,3,4,5,6)) +step_2.add_hyperparameter(name='return_result', argument_type=ArgumentType.VALUE, data='append') +step_2.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce') +step_2.add_output('produce') +pipeline_description.add_step(step_2) + +# # Step 3: test primitive +# primitive_3 = index.get_primitive('d3m.primitives.anomaly_detection.KNNPrimitive') +primitive_3 = index.get_primitive('d3m.primitives.tods.feature_analysis.statistical_minimum') +step_3 = PrimitiveStep(primitive=primitive_3) +step_3.add_hyperparameter(name='window_size', argument_type=ArgumentType.VALUE, data=4) +step_3.add_hyperparameter(name='use_semantic_types', argument_type=ArgumentType.VALUE, data=True) +step_3.add_hyperparameter(name='use_columns', argument_type=ArgumentType.VALUE, data=(5,6)) # There is sth wrong with multi-dimensional +step_3.add_hyperparameter(name='return_result', 
argument_type=ArgumentType.VALUE, data='append') +step_3.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.2.produce') +step_3.add_output('produce') +pipeline_description.add_step(step_3) + + + +# Final Output +pipeline_description.add_output(name='output', data_reference='steps.3.produce') + +# Output to YAML +yaml = pipeline_description.to_yaml() +with open('pipeline.yml', 'w') as f: + f.write(yaml) + + +# Or you can output json +#data = pipline_description.to_json() + diff --git a/tests/build_test_feature_analysis_statistical_skew.py b/tests/build_test_feature_analysis_statistical_skew.py new file mode 100644 index 0000000..b506f03 --- /dev/null +++ b/tests/build_test_feature_analysis_statistical_skew.py @@ -0,0 +1,62 @@ +from d3m import index +from d3m.metadata.base import ArgumentType +from d3m.metadata.pipeline import Pipeline, PrimitiveStep +from d3m.metadata import hyperparams + +# -> dataset_to_dataframe -> column_parser -> extract_columns_by_semantic_types(attributes) -> imputer -> random_forest +# extract_columns_by_semantic_types(targets) -> ^ + +# Creating pipeline +pipeline_description = Pipeline() +pipeline_description.add_input(name='inputs') + +# Step 0: dataset_to_dataframe +primitive_0 = index.get_primitive('d3m.primitives.tods.data_processing.dataset_to_dataframe') +step_0 = PrimitiveStep(primitive=primitive_0) +step_0.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='inputs.0') +step_0.add_output('produce') +pipeline_description.add_step(step_0) + +# # Step 1: column_parser +primitive_1 = index.get_primitive('d3m.primitives.data_transformation.column_parser.Common') +step_1 = PrimitiveStep(primitive=primitive_1) +step_1.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce') +step_1.add_output('produce') +pipeline_description.add_step(step_1) + +# # Step 2: Standardization +primitive_2 = index.get_primitive('d3m.primitives.tods.timeseries_processing.transformation.standard_scaler') +step_2 = PrimitiveStep(primitive=primitive_2) +step_2.add_hyperparameter(name='use_semantic_types', argument_type=ArgumentType.VALUE, data=True) +step_2.add_hyperparameter(name='use_columns', argument_type=ArgumentType.VALUE, data=(2,3,4,5,6)) +step_2.add_hyperparameter(name='return_result', argument_type=ArgumentType.VALUE, data='append') +step_2.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce') +step_2.add_output('produce') +pipeline_description.add_step(step_2) + +# # Step 3: test primitive +# primitive_3 = index.get_primitive('d3m.primitives.anomaly_detection.KNNPrimitive') +primitive_3 = index.get_primitive('d3m.primitives.tods.feature_analysis.statistical_skew') +step_3 = PrimitiveStep(primitive=primitive_3) +step_3.add_hyperparameter(name='window_size', argument_type=ArgumentType.VALUE, data=4) +step_3.add_hyperparameter(name='use_semantic_types', argument_type=ArgumentType.VALUE, data=True) +step_3.add_hyperparameter(name='use_columns', argument_type=ArgumentType.VALUE, data=(5,6)) # There is sth wrong with multi-dimensional +step_3.add_hyperparameter(name='return_result', argument_type=ArgumentType.VALUE, data='append') +step_3.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.2.produce') +step_3.add_output('produce') +pipeline_description.add_step(step_3) + + + +# Final Output +pipeline_description.add_output(name='output', data_reference='steps.3.produce') + +# Output to YAML +yaml 
= pipeline_description.to_yaml() +with open('pipeline.yml', 'w') as f: + f.write(yaml) + + +# Or you can output json +#data = pipline_description.to_json() + diff --git a/tests/build_test_feature_analysis_statistical_std.py b/tests/build_test_feature_analysis_statistical_std.py new file mode 100644 index 0000000..f7a61d7 --- /dev/null +++ b/tests/build_test_feature_analysis_statistical_std.py @@ -0,0 +1,62 @@ +from d3m import index +from d3m.metadata.base import ArgumentType +from d3m.metadata.pipeline import Pipeline, PrimitiveStep +from d3m.metadata import hyperparams + +# -> dataset_to_dataframe -> column_parser -> extract_columns_by_semantic_types(attributes) -> imputer -> random_forest +# extract_columns_by_semantic_types(targets) -> ^ + +# Creating pipeline +pipeline_description = Pipeline() +pipeline_description.add_input(name='inputs') + +# Step 0: dataset_to_dataframe +primitive_0 = index.get_primitive('d3m.primitives.tods.data_processing.dataset_to_dataframe') +step_0 = PrimitiveStep(primitive=primitive_0) +step_0.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='inputs.0') +step_0.add_output('produce') +pipeline_description.add_step(step_0) + +# # Step 1: column_parser +primitive_1 = index.get_primitive('d3m.primitives.data_transformation.column_parser.Common') +step_1 = PrimitiveStep(primitive=primitive_1) +step_1.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce') +step_1.add_output('produce') +pipeline_description.add_step(step_1) + +# # Step 2: Standardization +primitive_2 = index.get_primitive('d3m.primitives.tods.timeseries_processing.transformation.standard_scaler') +step_2 = PrimitiveStep(primitive=primitive_2) +step_2.add_hyperparameter(name='use_semantic_types', argument_type=ArgumentType.VALUE, data=True) +step_2.add_hyperparameter(name='use_columns', argument_type=ArgumentType.VALUE, data=(2,3,4,5,6)) +step_2.add_hyperparameter(name='return_result', argument_type=ArgumentType.VALUE, data='append') +step_2.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce') +step_2.add_output('produce') +pipeline_description.add_step(step_2) + +# # Step 3: test primitive +# primitive_3 = index.get_primitive('d3m.primitives.anomaly_detection.KNNPrimitive') +primitive_3 = index.get_primitive('d3m.primitives.tods.feature_analysis.statistical_std') +step_3 = PrimitiveStep(primitive=primitive_3) +step_3.add_hyperparameter(name='window_size', argument_type=ArgumentType.VALUE, data=4) +step_3.add_hyperparameter(name='use_semantic_types', argument_type=ArgumentType.VALUE, data=True) +step_3.add_hyperparameter(name='use_columns', argument_type=ArgumentType.VALUE, data=(5,6)) # There is sth wrong with multi-dimensional +step_3.add_hyperparameter(name='return_result', argument_type=ArgumentType.VALUE, data='append') +step_3.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.2.produce') +step_3.add_output('produce') +pipeline_description.add_step(step_3) + + + +# Final Output +pipeline_description.add_output(name='output', data_reference='steps.3.produce') + +# Output to YAML +yaml = pipeline_description.to_yaml() +with open('pipeline.yml', 'w') as f: + f.write(yaml) + + +# Or you can output json +#data = pipline_description.to_json() + diff --git a/tests/build_test_feature_analysis_statistical_var.py b/tests/build_test_feature_analysis_statistical_var.py new file mode 100644 index 0000000..386c932 --- /dev/null +++ 
b/tests/build_test_feature_analysis_statistical_var.py @@ -0,0 +1,62 @@ +from d3m import index +from d3m.metadata.base import ArgumentType +from d3m.metadata.pipeline import Pipeline, PrimitiveStep +from d3m.metadata import hyperparams + +# -> dataset_to_dataframe -> column_parser -> extract_columns_by_semantic_types(attributes) -> imputer -> random_forest +# extract_columns_by_semantic_types(targets) -> ^ + +# Creating pipeline +pipeline_description = Pipeline() +pipeline_description.add_input(name='inputs') + +# Step 0: dataset_to_dataframe +primitive_0 = index.get_primitive('d3m.primitives.tods.data_processing.dataset_to_dataframe') +step_0 = PrimitiveStep(primitive=primitive_0) +step_0.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='inputs.0') +step_0.add_output('produce') +pipeline_description.add_step(step_0) + +# # Step 1: column_parser +primitive_1 = index.get_primitive('d3m.primitives.data_transformation.column_parser.Common') +step_1 = PrimitiveStep(primitive=primitive_1) +step_1.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce') +step_1.add_output('produce') +pipeline_description.add_step(step_1) + +# # Step 2: Standardization +primitive_2 = index.get_primitive('d3m.primitives.tods.timeseries_processing.transformation.standard_scaler') +step_2 = PrimitiveStep(primitive=primitive_2) +step_2.add_hyperparameter(name='use_semantic_types', argument_type=ArgumentType.VALUE, data=True) +step_2.add_hyperparameter(name='use_columns', argument_type=ArgumentType.VALUE, data=(2,3,4,5,6)) +step_2.add_hyperparameter(name='return_result', argument_type=ArgumentType.VALUE, data='append') +step_2.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce') +step_2.add_output('produce') +pipeline_description.add_step(step_2) + +# # Step 3: test primitive +# primitive_3 = index.get_primitive('d3m.primitives.anomaly_detection.KNNPrimitive') +primitive_3 = index.get_primitive('d3m.primitives.tods.feature_analysis.statistical_var') +step_3 = PrimitiveStep(primitive=primitive_3) +step_3.add_hyperparameter(name='window_size', argument_type=ArgumentType.VALUE, data=4) +step_3.add_hyperparameter(name='use_semantic_types', argument_type=ArgumentType.VALUE, data=True) +step_3.add_hyperparameter(name='use_columns', argument_type=ArgumentType.VALUE, data=(5,6)) # There is sth wrong with multi-dimensional +step_3.add_hyperparameter(name='return_result', argument_type=ArgumentType.VALUE, data='append') +step_3.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.2.produce') +step_3.add_output('produce') +pipeline_description.add_step(step_3) + + + +# Final Output +pipeline_description.add_output(name='output', data_reference='steps.3.produce') + +# Output to YAML +yaml = pipeline_description.to_yaml() +with open('pipeline.yml', 'w') as f: + f.write(yaml) + + +# Or you can output json +#data = pipline_description.to_json() + diff --git a/tests/build_test_feature_analysis_statistical_variation.py b/tests/build_test_feature_analysis_statistical_variation.py new file mode 100644 index 0000000..bc1f680 --- /dev/null +++ b/tests/build_test_feature_analysis_statistical_variation.py @@ -0,0 +1,62 @@ +from d3m import index +from d3m.metadata.base import ArgumentType +from d3m.metadata.pipeline import Pipeline, PrimitiveStep +from d3m.metadata import hyperparams + +# -> dataset_to_dataframe -> column_parser -> extract_columns_by_semantic_types(attributes) -> imputer 
-> random_forest +# extract_columns_by_semantic_types(targets) -> ^ + +# Creating pipeline +pipeline_description = Pipeline() +pipeline_description.add_input(name='inputs') + +# Step 0: dataset_to_dataframe +primitive_0 = index.get_primitive('d3m.primitives.tods.data_processing.dataset_to_dataframe') +step_0 = PrimitiveStep(primitive=primitive_0) +step_0.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='inputs.0') +step_0.add_output('produce') +pipeline_description.add_step(step_0) + +# # Step 1: column_parser +primitive_1 = index.get_primitive('d3m.primitives.data_transformation.column_parser.Common') +step_1 = PrimitiveStep(primitive=primitive_1) +step_1.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce') +step_1.add_output('produce') +pipeline_description.add_step(step_1) + +# # Step 2: Standardization +primitive_2 = index.get_primitive('d3m.primitives.tods.timeseries_processing.transformation.standard_scaler') +step_2 = PrimitiveStep(primitive=primitive_2) +step_2.add_hyperparameter(name='use_semantic_types', argument_type=ArgumentType.VALUE, data=True) +step_2.add_hyperparameter(name='use_columns', argument_type=ArgumentType.VALUE, data=(2,3,4,5,6)) +step_2.add_hyperparameter(name='return_result', argument_type=ArgumentType.VALUE, data='append') +step_2.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce') +step_2.add_output('produce') +pipeline_description.add_step(step_2) + +# # Step 3: test primitive +# primitive_3 = index.get_primitive('d3m.primitives.anomaly_detection.KNNPrimitive') +primitive_3 = index.get_primitive('d3m.primitives.tods.feature_analysis.statistical_variation') +step_3 = PrimitiveStep(primitive=primitive_3) +step_3.add_hyperparameter(name='window_size', argument_type=ArgumentType.VALUE, data=4) +step_3.add_hyperparameter(name='use_semantic_types', argument_type=ArgumentType.VALUE, data=True) +step_3.add_hyperparameter(name='use_columns', argument_type=ArgumentType.VALUE, data=(5,6)) # There is sth wrong with multi-dimensional +step_3.add_hyperparameter(name='return_result', argument_type=ArgumentType.VALUE, data='append') +step_3.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.2.produce') +step_3.add_output('produce') +pipeline_description.add_step(step_3) + + + +# Final Output +pipeline_description.add_output(name='output', data_reference='steps.3.produce') + +# Output to YAML +yaml = pipeline_description.to_yaml() +with open('pipeline.yml', 'w') as f: + f.write(yaml) + + +# Or you can output json +#data = pipline_description.to_json() + diff --git a/tests/build_test_feature_analysis_statistical_vec_sum.py b/tests/build_test_feature_analysis_statistical_vec_sum.py new file mode 100644 index 0000000..85f06d2 --- /dev/null +++ b/tests/build_test_feature_analysis_statistical_vec_sum.py @@ -0,0 +1,62 @@ +from d3m import index +from d3m.metadata.base import ArgumentType +from d3m.metadata.pipeline import Pipeline, PrimitiveStep +from d3m.metadata import hyperparams + +# -> dataset_to_dataframe -> column_parser -> extract_columns_by_semantic_types(attributes) -> imputer -> random_forest +# extract_columns_by_semantic_types(targets) -> ^ + +# Creating pipeline +pipeline_description = Pipeline() +pipeline_description.add_input(name='inputs') + +# Step 0: dataset_to_dataframe +primitive_0 = index.get_primitive('d3m.primitives.tods.data_processing.dataset_to_dataframe') +step_0 = 
PrimitiveStep(primitive=primitive_0) +step_0.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='inputs.0') +step_0.add_output('produce') +pipeline_description.add_step(step_0) + +# # Step 1: column_parser +primitive_1 = index.get_primitive('d3m.primitives.data_transformation.column_parser.Common') +step_1 = PrimitiveStep(primitive=primitive_1) +step_1.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce') +step_1.add_output('produce') +pipeline_description.add_step(step_1) + +# # Step 2: Standardization +primitive_2 = index.get_primitive('d3m.primitives.tods.timeseries_processing.transformation.standard_scaler') +step_2 = PrimitiveStep(primitive=primitive_2) +step_2.add_hyperparameter(name='use_semantic_types', argument_type=ArgumentType.VALUE, data=True) +step_2.add_hyperparameter(name='use_columns', argument_type=ArgumentType.VALUE, data=(2,3,4,5,6)) +step_2.add_hyperparameter(name='return_result', argument_type=ArgumentType.VALUE, data='append') +step_2.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce') +step_2.add_output('produce') +pipeline_description.add_step(step_2) + +# # Step 3: test primitive +# primitive_3 = index.get_primitive('d3m.primitives.anomaly_detection.KNNPrimitive') +primitive_3 = index.get_primitive('d3m.primitives.tods.feature_analysis.statistical_vec_sum') +step_3 = PrimitiveStep(primitive=primitive_3) +step_3.add_hyperparameter(name='window_size', argument_type=ArgumentType.VALUE, data=4) +step_3.add_hyperparameter(name='use_semantic_types', argument_type=ArgumentType.VALUE, data=True) +step_3.add_hyperparameter(name='use_columns', argument_type=ArgumentType.VALUE, data=(5,6)) # There is sth wrong with multi-dimensional +step_3.add_hyperparameter(name='return_result', argument_type=ArgumentType.VALUE, data='append') +step_3.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.2.produce') +step_3.add_output('produce') +pipeline_description.add_step(step_3) + + + +# Final Output +pipeline_description.add_output(name='output', data_reference='steps.3.produce') + +# Output to YAML +yaml = pipeline_description.to_yaml() +with open('pipeline.yml', 'w') as f: + f.write(yaml) + + +# Or you can output json +#data = pipline_description.to_json() + diff --git a/tests/build_test_feature_analysis_statistical_willison_amplitude.py b/tests/build_test_feature_analysis_statistical_willison_amplitude.py new file mode 100644 index 0000000..6c578fc --- /dev/null +++ b/tests/build_test_feature_analysis_statistical_willison_amplitude.py @@ -0,0 +1,62 @@ +from d3m import index +from d3m.metadata.base import ArgumentType +from d3m.metadata.pipeline import Pipeline, PrimitiveStep +from d3m.metadata import hyperparams + +# -> dataset_to_dataframe -> column_parser -> extract_columns_by_semantic_types(attributes) -> imputer -> random_forest +# extract_columns_by_semantic_types(targets) -> ^ + +# Creating pipeline +pipeline_description = Pipeline() +pipeline_description.add_input(name='inputs') + +# Step 0: dataset_to_dataframe +primitive_0 = index.get_primitive('d3m.primitives.tods.data_processing.dataset_to_dataframe') +step_0 = PrimitiveStep(primitive=primitive_0) +step_0.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='inputs.0') +step_0.add_output('produce') +pipeline_description.add_step(step_0) + +# # Step 1: column_parser +primitive_1 = 
index.get_primitive('d3m.primitives.data_transformation.column_parser.Common') +step_1 = PrimitiveStep(primitive=primitive_1) +step_1.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce') +step_1.add_output('produce') +pipeline_description.add_step(step_1) + +# # Step 2: Standardization +primitive_2 = index.get_primitive('d3m.primitives.tods.timeseries_processing.transformation.standard_scaler') +step_2 = PrimitiveStep(primitive=primitive_2) +step_2.add_hyperparameter(name='use_semantic_types', argument_type=ArgumentType.VALUE, data=True) +step_2.add_hyperparameter(name='use_columns', argument_type=ArgumentType.VALUE, data=(2,3,4,5,6)) +step_2.add_hyperparameter(name='return_result', argument_type=ArgumentType.VALUE, data='append') +step_2.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce') +step_2.add_output('produce') +pipeline_description.add_step(step_2) + +# # Step 3: test primitive +# primitive_3 = index.get_primitive('d3m.primitives.anomaly_detection.KNNPrimitive') +primitive_3 = index.get_primitive('d3m.primitives.tods.feature_analysis.statistical_willison_amplitude') +step_3 = PrimitiveStep(primitive=primitive_3) +step_3.add_hyperparameter(name='window_size', argument_type=ArgumentType.VALUE, data=4) +step_3.add_hyperparameter(name='use_semantic_types', argument_type=ArgumentType.VALUE, data=True) +step_3.add_hyperparameter(name='use_columns', argument_type=ArgumentType.VALUE, data=(5,6)) # There is sth wrong with multi-dimensional +step_3.add_hyperparameter(name='return_result', argument_type=ArgumentType.VALUE, data='append') +step_3.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.2.produce') +step_3.add_output('produce') +pipeline_description.add_step(step_3) + + + +# Final Output +pipeline_description.add_output(name='output', data_reference='steps.3.produce') + +# Output to YAML +yaml = pipeline_description.to_yaml() +with open('pipeline.yml', 'w') as f: + f.write(yaml) + + +# Or you can output json +#data = pipline_description.to_json() + diff --git a/tests/build_test_feature_analysis_statistical_zero_crossing.py b/tests/build_test_feature_analysis_statistical_zero_crossing.py new file mode 100644 index 0000000..7a33424 --- /dev/null +++ b/tests/build_test_feature_analysis_statistical_zero_crossing.py @@ -0,0 +1,62 @@ +from d3m import index +from d3m.metadata.base import ArgumentType +from d3m.metadata.pipeline import Pipeline, PrimitiveStep +from d3m.metadata import hyperparams + +# -> dataset_to_dataframe -> column_parser -> extract_columns_by_semantic_types(attributes) -> imputer -> random_forest +# extract_columns_by_semantic_types(targets) -> ^ + +# Creating pipeline +pipeline_description = Pipeline() +pipeline_description.add_input(name='inputs') + +# Step 0: dataset_to_dataframe +primitive_0 = index.get_primitive('d3m.primitives.tods.data_processing.dataset_to_dataframe') +step_0 = PrimitiveStep(primitive=primitive_0) +step_0.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='inputs.0') +step_0.add_output('produce') +pipeline_description.add_step(step_0) + +# # Step 1: column_parser +primitive_1 = index.get_primitive('d3m.primitives.data_transformation.column_parser.Common') +step_1 = PrimitiveStep(primitive=primitive_1) +step_1.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce') +step_1.add_output('produce') +pipeline_description.add_step(step_1) + +# # Step 
2: Standardization +primitive_2 = index.get_primitive('d3m.primitives.tods.timeseries_processing.transformation.standard_scaler') +step_2 = PrimitiveStep(primitive=primitive_2) +step_2.add_hyperparameter(name='use_semantic_types', argument_type=ArgumentType.VALUE, data=True) +step_2.add_hyperparameter(name='use_columns', argument_type=ArgumentType.VALUE, data=(2,3,4,5,6)) +step_2.add_hyperparameter(name='return_result', argument_type=ArgumentType.VALUE, data='append') +step_2.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce') +step_2.add_output('produce') +pipeline_description.add_step(step_2) + + +# # Step 3: test primitive +# primitive_3 = index.get_primitive('d3m.primitives.anomaly_detection.KNNPrimitive') +primitive_3 = index.get_primitive('d3m.primitives.tods.feature_analysis.statistical_zero_crossing') +step_3 = PrimitiveStep(primitive=primitive_3) +step_3.add_hyperparameter(name='use_semantic_types', argument_type=ArgumentType.VALUE, data=True) +step_3.add_hyperparameter(name='use_columns', argument_type=ArgumentType.VALUE, data=(9,10)) # There is sth wrong with multi-dimensional +step_3.add_hyperparameter(name='return_result', argument_type=ArgumentType.VALUE, data='append') +step_3.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.2.produce') +step_3.add_output('produce') +pipeline_description.add_step(step_3) + + + +# Final Output +pipeline_description.add_output(name='output', data_reference='steps.3.produce') + +# Output to YAML +yaml = pipeline_description.to_yaml() +with open('pipeline.yml', 'w') as f: + f.write(yaml) + + +# Or you can output json +#data = pipline_description.to_json() + diff --git a/tests/build_test_time_series_seasonality_trend_decomposition.py b/tests/build_test_time_series_seasonality_trend_decomposition.py new file mode 100644 index 0000000..99168bf --- /dev/null +++ b/tests/build_test_time_series_seasonality_trend_decomposition.py @@ -0,0 +1,61 @@ +from d3m import index +from d3m.metadata.base import ArgumentType +from d3m.metadata.pipeline import Pipeline, PrimitiveStep +from d3m.metadata import hyperparams +import copy + +# -> dataset_to_dataframe -> column_parser -> extract_columns_by_semantic_types(attributes) -> imputer -> random_forest +# extract_columns_by_semantic_types(targets) -> ^ + +# Creating pipeline +pipeline_description = Pipeline() +pipeline_description.add_input(name='inputs') + +# Step 0: dataset_to_dataframe +primitive_0 = index.get_primitive('d3m.primitives.tods.data_processing.dataset_to_dataframe') +step_0 = PrimitiveStep(primitive=primitive_0) +step_0.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='inputs.0') +step_0.add_output('produce') +pipeline_description.add_step(step_0) + +# # Step 1: column_parser +primitive_1 = index.get_primitive('d3m.primitives.data_transformation.column_parser.Common') +step_1 = PrimitiveStep(primitive=primitive_1) +step_1.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce') +step_1.add_output('produce') +pipeline_description.add_step(step_1) + +# # Step 2: Standardization +primitive_2 = index.get_primitive('d3m.primitives.tods.timeseries_processing.transformation.standard_scaler') +step_2 = PrimitiveStep(primitive=primitive_2) +step_2.add_hyperparameter(name='use_semantic_types', argument_type=ArgumentType.VALUE, data=True) +step_2.add_hyperparameter(name='use_columns', argument_type=ArgumentType.VALUE, data=(2,3,4,5,6)) 
+step_2.add_hyperparameter(name='return_result', argument_type=ArgumentType.VALUE, data='append') +step_2.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce') +step_2.add_output('produce') +pipeline_description.add_step(step_2) + +# # Step 3: test primitive +# primitive_3 = index.get_primitive('d3m.primitives.anomaly_detection.KNNPrimitive') +primitive_3 = index.get_primitive('d3m.primitives.tods.timeseries_processing.decomposition.time_series_seasonality_trend_decomposition') +step_3 = PrimitiveStep(primitive=primitive_3) +step_3.add_hyperparameter(name='period', argument_type=ArgumentType.VALUE, data=5) +step_3.add_hyperparameter(name='use_semantic_types', argument_type=ArgumentType.VALUE, data=True) +step_3.add_hyperparameter(name='use_columns', argument_type=ArgumentType.VALUE, data=(8,9,10,11,12)) # There is sth wrong with multi-dimensional +step_3.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.2.produce') +step_3.add_output('produce') +pipeline_description.add_step(step_3) + + + +# Final Output +pipeline_description.add_output(name='output', data_reference='steps.3.produce') + +# Output to YAML +yaml = pipeline_description.to_yaml() +with open('pipeline.yml', 'w') as f: + f.write(yaml) + +# Or you can output json +#data = pipline_description.to_json() + diff --git a/tods/.gitignore b/tods/.gitignore new file mode 100644 index 0000000..36fa0f3 --- /dev/null +++ b/tods/.gitignore @@ -0,0 +1,2 @@ +.pyc +__pycache__ diff --git a/tods/__init__.py b/tods/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tods/common-primitives/HISTORY.md b/tods/common-primitives/HISTORY.md new file mode 100644 index 0000000..5daa8a3 --- /dev/null +++ b/tods/common-primitives/HISTORY.md @@ -0,0 +1,363 @@ +## v0.8.0 + +* Removed multi-targets support in `classification.light_gbm.Common` and fixed + categorical attributes handling. + [!118](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/118) +* Unified date parsing across primitives. + Added `raise_error` hyper-parameter to `data_preprocessing.datetime_range_filter.Common`. + This bumped the version of the primitive. + [!117](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/117) +* `evaluation.kfold_time_series_split.Common` now parses the datetime column + before sorting. `fuzzy_time_parsing` hyper-parameter was added to the primitive. + This bumped the version of the primitive. + [!110](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/110) +* Added option `equal` to hyper-parameter `match_logic` of primitive + `data_transformation.extract_columns_by_semantic_types.Common` to support set equality + when determining columns to extract. This bumped the version of the primitive. + [!116](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/116) +* Fixed `data_preprocessing.one_hot_encoder.MakerCommon` to work with the + latest core package. +* `data_cleaning.tabular_extractor.Common` has been fixed to work with the + latest version of sklearn. + [!113](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/113) +* ISI side of `data_augmentation.datamart_augmentation.Common` and + `data_augmentation.datamart_download.Common` has been updated. + [!108](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/108) +* Improved how pipelines and pipeline runs for all primitives are managed. + Many more pipelines and pipeline runs were added. 
+* `evaluation.kfold_timeseries_split.Common` has been renamed to `evaluation.kfold_time_series_split.Common`. +* Fixed `data_preprocessing.dataset_sample.Common` on empty input. + [!95](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/95) +* `data_preprocessing.datetime_range_filter.Common` does not assume local timezone + when parsing dates. + [#115](https://gitlab.com/datadrivendiscovery/common-primitives/issues/115) +* Added `fuzzy_time_parsing` hyper-parameter to `data_transformation.column_parser.Common`. + This bumped the version of the primitive. +* Fixed `data_transformation.column_parser.Common` to work correctly with `python-dateutil==2.8.1`. + [#119](https://gitlab.com/datadrivendiscovery/common-primitives/issues/119). +* Refactored `data_preprocessing.one_hot_encoder.MakerCommon` to address some issues. + [#66](https://gitlab.com/datadrivendiscovery/common-primitives/issues/66) + [#75](https://gitlab.com/datadrivendiscovery/common-primitives/issues/75) + [!96](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/96) +* Added support for handling of numeric columns to `data_preprocessing.regex_filter.Common` and `data_preprocessing.term_filter.Common`. + [!101](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/101) + [!104](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/104) +* Fixed exception in `produce` method in `data_transformation.datetime_field_compose.Common` caused by using incorrect type for dataframe indexer. + [!102](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/102) +* Added primitives: + * `data_transformation.grouping_field_compose.Common` + +## v0.7.0 + +* Renamed primitives: + * `data_transformation.add_semantic_types.DataFrameCommon` to `data_transformation.add_semantic_types.Common` + * `data_transformation.remove_semantic_types.DataFrameCommon` to `data_transformation.remove_semantic_types.Common` + * `data_transformation.replace_semantic_types.DataFrameCommon` to `data_transformation.replace_semantic_types.Common` + * `operator.column_map.DataFrameCommon` to `operator.column_map.Common` + * `regression.xgboost_gbtree.DataFrameCommon` to `regression.xgboost_gbtree.Common` + * `classification.light_gbm.DataFrameCommon` to `classification.light_gbm.Common` + * `classification.xgboost_gbtree.DataFrameCommon` to `classification.xgboost_gbtree.Common` + * `classification.xgboost_dart.DataFrameCommon` to `classification.xgboost_dart.Common` + * `classification.random_forest.DataFrameCommon` to `classification.random_forest.Common` + * `data_transformation.extract_columns.DataFrameCommon` to `data_transformation.extract_columns.Common` + * `data_transformation.extract_columns_by_semantic_types.DataFrameCommon` to `data_transformation.extract_columns_by_semantic_types.Common` + * `data_transformation.extract_columns_by_structural_types.DataFrameCommon` to `data_transformation.extract_columns_by_structural_types.Common` + * `data_transformation.cut_audio.DataFrameCommon` to `data_transformation.cut_audio.Common` + * `data_transformation.column_parser.DataFrameCommon` to `data_transformation.column_parser.Common` + * `data_transformation.remove_columns.DataFrameCommon` to `data_transformation.remove_columns.Common` + * `data_transformation.remove_duplicate_columns.DataFrameCommon` to `data_transformation.remove_duplicate_columns.Common` + * `data_transformation.horizontal_concat.DataFrameConcat` to `data_transformation.horizontal_concat.DataFrameCommon` + 
* `data_transformation.construct_predictions.DataFrameCommon` to `data_transformation.construct_predictions.Common` + * `data_transformation.datetime_field_compose.DataFrameCommon` to `data_transformation.datetime_field_compose.Common` + * `data_preprocessing.label_encoder.DataFrameCommon` to `data_preprocessing.label_encoder.Common` + * `data_preprocessing.label_decoder.DataFrameCommon` to `data_preprocessing.label_decoder.Common` + * `data_preprocessing.image_reader.DataFrameCommon` to `data_preprocessing.image_reader.Common` + * `data_preprocessing.text_reader.DataFrameCommon` to `data_preprocessing.text_reader.Common` + * `data_preprocessing.video_reader.DataFrameCommon` to `data_preprocessing.video_reader.Common` + * `data_preprocessing.csv_reader.DataFrameCommon` to `data_preprocessing.csv_reader.Common` + * `data_preprocessing.audio_reader.DataFrameCommon` to `data_preprocessing.audio_reader.Common` + * `data_preprocessing.regex_filter.DataFrameCommon` to `data_preprocessing.regex_filter.Common` + * `data_preprocessing.term_filter.DataFrameCommon` to `data_preprocessing.term_filter.Common` + * `data_preprocessing.numeric_range_filter.DataFrameCommon` to `data_preprocessing.numeric_range_filter.Common` + * `data_preprocessing.datetime_range_filter.DataFrameCommon` to `data_preprocessing.datetime_range_filter.Common` + +## v0.6.0 + +* Added `match_logic`, `negate`, and `add_index_columns` hyper-parameters + to `data_transformation.extract_columns_by_structural_types.DataFrameCommon` + and `data_transformation.extract_columns_by_semantic_types.DataFrameCommon` + primitives. +* `feature_extraction.sparse_pca.Common` has been removed and is now available as part of realML. + [!89](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/89) +* Added new primitives: + * `data_preprocessing.datetime_range_filter.DataFrameCommon` + * `data_transformation.datetime_field_compose.DataFrameCommon` + * `d3m.primitives.data_preprocessing.flatten.DataFrameCommon` + * `data_augmentation.datamart_augmentation.Common` + * `data_augmentation.datamart_download.Common` + * `data_preprocessing.dataset_sample.Common` + + [#53](https://gitlab.com/datadrivendiscovery/common-primitives/issues/53) + [!86](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/86) + [!87](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/87) + [!85](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/85) + [!63](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/63) + [!92](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/92) + [!93](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/93) + [!81](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/81) + +* Fixed `fit` method to return correct value for `operator.column_map.DataFrameCommon`, + `operator.dataset_map.DataFrameCommon`, and `schema_discovery.profiler.Common`. +* Some not maintained primitives have been disabled. If you are using them, consider adopting them. 
+ * `classification.bayesian_logistic_regression.Common` + * `regression.convolutional_neural_net.TorchCommon` + * `operator.diagonal_mvn.Common` + * `regression.feed_forward_neural_net.TorchCommon` + * `data_preprocessing.image_reader.Common` + * `clustering.k_means.Common` + * `regression.linear_regression.Common` + * `regression.loss.TorchCommon` + * `feature_extraction.pca.Common` +* `data_transformation.update_semantic_types.DatasetCommon` has been removed. + Use `data_transformation.add_semantic_types.DataFrameCommon`, + `data_transformation.remove_semantic_types.DataFrameCommon`, + or `data_transformation.replace_semantic_types.DataFrameCommon` together with + `operator.dataset_map.DataFrameCommon` primitive to obtain previous functionality. + [#83](https://gitlab.com/datadrivendiscovery/common-primitives/issues/83) +* `data_transformation.remove_columns.DatasetCommon` has been removed. + Use `data_transformation.remove_columns.DataFrameCommon` together with + `operator.dataset_map.DataFrameCommon` primitive to obtain previous functionality. + [#83](https://gitlab.com/datadrivendiscovery/common-primitives/issues/83) +* Some primitives which operate on Dataset have been converted to operate + on DataFrame and renamed. Use them together with `operator.dataset_map.DataFrameCommon` + primitive to obtain previous functionality. + * `data_preprocessing.regex_filter.DatasetCommon` to `data_preprocessing.regex_filter.DataFrameCommon` + * `data_preprocessing.term_filter.DatasetCommon` to `data_preprocessing.term_filter.DataFrameCommon` + * `data_preprocessing.numeric_range_filter.DatasetCommon` to `data_preprocessing.numeric_range_filter.DataFrameCommon` + + [#83](https://gitlab.com/datadrivendiscovery/common-primitives/issues/83) + [!84](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/84) + +* `schema_discovery.profiler.Common` has been improved: + * More options added to `detect_semantic_types`. + * Added new `remove_unknown_type` hyper-parameter. + +## v0.5.0 + +* `evaluation.compute_scores.Common` primitive has been moved to the core + package and renamed to `evaluation.compute_scores.Core`. +* `metafeature_extraction.compute_metafeatures.Common` has been renamed to + `metalearning.metafeature_extractor.Common` +* `evaluation.compute_scores.Common` has now a `add_normalized_scores` hyper-parameter + to control adding also a column with normalized scores to the output, which is now + added by default. +* `data_preprocessing.text_reader.DataFrameCommon` primitive has been fixed. +* `data_transformation.rename_duplicate_name.DataFrameCommon` primitive was + fixed to handle all types of column names. 
+ [#73](https://gitlab.com/datadrivendiscovery/common-primitives/issues/73) + [!65](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/65) +* Added new primitives: + * `data_cleaning.tabular_extractor.Common` + * `data_preprocessing.one_hot_encoder.PandasCommon` + * `schema_discovery.profiler.Common` + * `data_transformation.ravel.DataFrameRowCommon` + * `operator.column_map.DataFrameCommon` + * `operator.dataset_map.DataFrameCommon` + * `data_transformation.normalize_column_references.Common` + * `data_transformation.normalize_graphs.Common` + * `feature_extraction.sparse_pca.Common` + * `evaluation.kfold_timeseries_split.Common` + + [#57](https://gitlab.com/datadrivendiscovery/common-primitives/issues/57) + [!42](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/42) + [!44](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/44) + [!47](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/47) + [!71](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/71) + [!73](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/73) + [!77](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/77) + [!66](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/66) + [!67](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/67) + +* Added hyper-parameter `error_on_no_columns` to `classification.random_forest.DataFrameCommon`. +* Common primitives have been updated to latest changes in d3m core package. +* Many utility functions from `utils.py` have been moved to the d3m core package. + +## v0.4.0 + +* Renamed `data_preprocessing.one_hot_encoder.Common` to + `data_preprocessing.one_hot_encoder.MakerCommon` and reimplement it. + [!54](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/54) +* Added new primitives: + * `classification.xgboost_gbtree.DataFrameCommon` + * `classification.xgboost_dart.DataFrameCommon` + * `regression.xgboost_gbtree.DataFrameCommon` + * `classification.light_gbm.DataFrameCommon` + * `data_transformation.rename_duplicate_name.DataFrameCommon` + + [!45](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/45) + [!46](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/46) + [!49](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/49) + +* Made sure `utils.select_columns` works also when given a tuple of columns, and not a list. + [!58](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/58) +* `classification.random_forest.DataFrameCommon` updated so that produced columns have + names matching column names during fitting. Moreover, `produce_feature_importances` + return a `DataFrame` with each column being one feature and having one row with + importances. + [!59](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/59) +* `regression.feed_forward_neural_net.TorchCommon` updated to support + selection of columns using semantic types. + [!57](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/57) + +## v0.3.0 + +* Made `evaluation.redact_columns.Common` primitive more general so that it can + redact any columns based on their semantic type and not just targets. 
+* Renamed primitives: + * `datasets.Denormalize` to `data_transformation.denormalize.Common` + * `datasets.DatasetToDataFrame` to `data_transformation.dataset_to_dataframe.Common` + * `evaluation.ComputeScores` to `evaluation.compute_scores.Common` + * `evaluation.RedactTargets` to `evaluation.redact_columns.Common` + * `evaluation.KFoldDatasetSplit` to `evaluation.kfold_dataset_split.Common` + * `evaluation.TrainScoreDatasetSplit` to `evaluation.train_score_dataset_split.Common` + * `evaluation.NoSplitDatasetSplit` to `evaluation.no_split_dataset_split.Common` + * `evaluation.FixedSplitDatasetSplit` to `evaluation.fixed_split_dataset_split.Commmon` + * `classifier.RandomForest` to `classification.random_forest.DataFrameCommon` + * `metadata.ComputeMetafeatures` to `metafeature_extraction.compute_metafeatures.Common` + * `audio.CutAudio` to `data_transformation.cut_audio.DataFrameCommon` + * `data.ListToNDArray` to `data_transformation.list_to_ndarray.Common` + * `data.StackNDArrayColumn` to `data_transformation.stack_ndarray_column.Common` + * `data.AddSemanticTypes` to `data_transformation.add_semantic_types.DataFrameCommon` + * `data.RemoveSemanticTypes` to `data_transformation.remove_semantic_types.DataFrameCommon` + * `data.ConstructPredictions` to `data_transformation.construct_predictions.DataFrameCommon` + * `data.ColumnParser` to `data_transformation.column_parser.DataFrameCommon` + * `data.CastToType` to `data_transformation.cast_to_type.Common` + * `data.ExtractColumns` to `data_transformation.extract_columns.DataFrameCommon` + * `data.ExtractColumnsBySemanticTypes` to `data_transformation.extract_columns_by_semantic_types.DataFrameCommon` + * `data.ExtractColumnsByStructuralTypes` to `data_transformation.extract_columns_by_structural_types.DataFrameCommon` + * `data.RemoveColumns` to `data_transformation.remove_columns.DataFrameCommon` + * `data.RemoveDuplicateColumns` to `data_transformation.remove_duplicate_columns.DataFrameCommon` + * `data.HorizontalConcat` to `data_transformation.horizontal_concat.DataFrameConcat` + * `data.DataFrameToNDArray` to `data_transformation.dataframe_to_ndarray.Common` + * `data.NDArrayToDataFrame` to `data_transformation.ndarray_to_dataframe.Common` + * `data.DataFrameToList` to `data_transformation.dataframe_to_list.Common` + * `data.ListToDataFrame` to `data_transformation.list_to_dataframe.Common` + * `data.NDArrayToList` to `data_transformation.ndarray_to_list.Common` + * `data.ReplaceSemanticTypes` to `data_transformation.replace_semantic_types.DataFrameCommon` + * `data.UnseenLabelEncoder` to `data_preprocessing.label_encoder.DataFrameCommon` + * `data.UnseenLabelDecoder` to `data_preprocessing.label_decoder.DataFrameCommon` + * `data.ImageReader` to `data_preprocessing.image_reader.DataFrameCommon` + * `data.TextReader` to `data_preprocessing.text_reader.DataFrameCommon` + * `data.VideoReader` to `data_preprocessing.video_reader.DataFrameCommon` + * `data.CSVReader` to `data_preprocessing.csv_reader.DataFrameCommon` + * `data.AudioReader` to `data_preprocessing.audio_reader.DataFrameCommon` + * `datasets.UpdateSemanticTypes` to `data_transformation.update_semantic_types.DatasetCommon` + * `datasets.RemoveColumns` to `data_transformation.remove_columns.DatasetCommon` + * `datasets.RegexFilter` to `data_preprocessing.regex_filter.DatasetCommon` + * `datasets.TermFilter` to `data_preprocessing.term_filter.DatasetCommon` + * `datasets.NumericRangeFilter` to `data_preprocessing.numeric_range_filter.DatasetCommon` + * 
`common_primitives.BayesianLogisticRegression` to `classification.bayesian_logistic_regression.Common`
+  * `common_primitives.ConvolutionalNeuralNet` to `regression.convolutional_neural_net.TorchCommon`
+  * `common_primitives.DiagonalMVN` to `operator.diagonal_mvn.Common`
+  * `common_primitives.FeedForwardNeuralNet` to `regression.feed_forward_neural_net.TorchCommon`
+  * `common_primitives.ImageReader` to `data_preprocessing.image_reader.Common`
+  * `common_primitives.KMeans` to `clustering.kmeans.Common`
+  * `common_primitives.LinearRegression` to `regression.linear_regression.Common`
+  * `common_primitives.Loss` to `regression.loss.TorchCommon`
+  * `common_primitives.PCA` to `feature_extraction.pca.Common`
+  * `common_primitives.OneHotMaker` to `data_preprocessing.one_hot_encoder.Common`
+* Fixed pickling issue of `classifier.RandomForest`.
+  [#47](https://gitlab.com/datadrivendiscovery/common-primitives/issues/47)
+  [!48](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/48)
+* `data.ColumnParser` primitive now has an additional hyper-parameter `replace_index_columns`
+  which controls whether index columns are still replaced when otherwise appending returned
+  parsed columns or not.
+* Made `data.RemoveDuplicateColumns` fit and remember duplicate columns during training.
+  [#45](https://gitlab.com/datadrivendiscovery/common-primitives/issues/45)
+* Added `match_logic` hyper-parameter to the `data.ReplaceSemanticTypes` primitive
+  which allows one to control how multiple specified semantic types match.
+* Added new primitives:
+  * `metadata.ComputeMetafeatures`
+  * `datasets.RegexFilter`
+  * `datasets.TermFilter`
+  * `datasets.NumericRangeFilter`
+  * `evaluation.NoSplitDatasetSplit`
+  * `evaluation.FixedSplitDatasetSplit`
+* Column parser fixed to parse columns with `http://schema.org/DateTime` semantic type.
+* Simplified logic (and made it more predictable) of `combine_columns` utility function when
+  using `new` `return_result` and `add_index_columns` set to true. Now if the output already contains
+  any index column, input index columns are not added. And if there are no index columns,
+  all input index columns are added at the beginning.
+* Fixed `_can_use_inputs_column` in `classifier.RandomForest`. Added a check of structural type, so
+  only columns with numerical structural types are processed.
+* Correctly set column names in `evaluation.ComputeScores` primitive's output.
+* Cast indices and columns to match predicted columns' dtypes.
+  [#33](https://gitlab.com/datadrivendiscovery/common-primitives/issues/33)
+* `datasets.DatasetToDataFrame` primitive does not try to generate metadata automatically
+  because this is not really needed (metadata can just be copied from the dataset). This
+  speeds up the primitive.
+  [#34](https://gitlab.com/datadrivendiscovery/common-primitives/issues/34)
+* Made it uniform that whenever we generate lists of all column names,
+  we first try to get the name from the metadata and fall back to the one in the DataFrame,
+  instead of using a column index in the latter case.
+* Made splitting primitives, `classifier.RandomForest`, and `data.UnseenLabelEncoder`
+  picklable even when unfitted.
+* Fixed entry point for `audio.CutAudio` primitive.
+
+## v0.2.0
+
+* Made those primitives operate on semantic types and support different ways to return results.
+* Added or updated many primitives:
+  * `data.ExtractColumns`
+  * `data.ExtractColumnsBySemanticTypes`
+  * `data.ExtractColumnsByStructuralTypes`
+  * `data.RemoveColumns`
+  * `data.RemoveDuplicateColumns`
+  * `data.HorizontalConcat`
+  * `data.CastToType`
+  * `data.ColumnParser`
+  * `data.ConstructPredictions`
+  * `data.DataFrameToNDArray`
+  * `data.NDArrayToDataFrame`
+  * `data.DataFrameToList`
+  * `data.ListToDataFrame`
+  * `data.NDArrayToList`
+  * `data.ListToNDArray`
+  * `data.StackNDArrayColumn`
+  * `data.AddSemanticTypes`
+  * `data.RemoveSemanticTypes`
+  * `data.ReplaceSemanticTypes`
+  * `data.UnseenLabelEncoder`
+  * `data.UnseenLabelDecoder`
+  * `data.ImageReader`
+  * `data.TextReader`
+  * `data.VideoReader`
+  * `data.CSVReader`
+  * `data.AudioReader`
+  * `datasets.Denormalize`
+  * `datasets.DatasetToDataFrame`
+  * `datasets.UpdateSemanticTypes`
+  * `datasets.RemoveColumns`
+  * `evaluation.RedactTargets`
+  * `evaluation.ComputeScores`
+  * `evaluation.KFoldDatasetSplit`
+  * `evaluation.TrainScoreDatasetSplit`
+  * `audio.CutAudio`
+  * `classifier.RandomForest`
+* Started listing enabled primitives in the [`entry_points.ini`](./entry_points.ini) file.
+* Created `devel` branch which contains primitives coded against the
+  future release of the `d3m` core package (its `devel` branch).
+  `master` branch of this repository is made against the latest stable
+  release of the `d3m` core package.
+* Dropped support for Python 2.7 and now require Python 3.6.
+* Renamed repository and package to `common-primitives` and `common_primitives`,
+  respectively.
+* Repository migrated to gitlab.com and made public.
+
+## v0.1.1
+
+* Made common primitives work on Python 2.7.
+
+## v0.1.0
+
+* Initial set of common primitives.
diff --git a/tods/common-primitives/HOW_TO_MANAGE.md b/tods/common-primitives/HOW_TO_MANAGE.md new file mode 100644 index 0000000..9e0d3db --- /dev/null +++ b/tods/common-primitives/HOW_TO_MANAGE.md @@ -0,0 +1,94 @@ +# How to publish primitive annotations
+
+As contributors add or update their primitives, they might want to publish
+primitive annotations for the added primitives. When doing this it is important
+to also republish all other primitive annotations already published from this
+package. This is because only one version of the package can be installed at
+a time and all primitive annotations have to point to the same package in
+their `installation` metadata.
+
+Steps to publish primitive annotations:
+* Operate in a virtual env with the following installed:
+  * The target core package.
+  * [Test primitives](https://gitlab.com/datadrivendiscovery/tests-data/tree/master/primitives)
+    with the same version of primitives as currently published in the `primitives`
+    repository. Remember to install them in `-e` editable mode.
+* Update `HISTORY.md` for `vNEXT` release with information about primitives
+  added or updated. If there was no package release since they were updated last,
+  do not duplicate entries but just update any existing entries for those primitives
+  instead, so that once released it is clear what has changed in a release as a whole.
+* Make sure tests for primitives being published (primitives added, updated,
+  and primitives previously published which should now be republished) pass.
+* Update `entry_points.ini` and add new primitives. Leave active
+  only those entries for primitives being (re)published and comment out all others.
+  * If this is the first time primitives are published after a release of a new `d3m`
+    core package, leave active only those which were updated to work with
+    the new `d3m` core package. Leave it to others to update, verify, and publish
+    the other common primitives.
+* In a clone of the `primitives` repository, prepare a branch from the up-to-date `master` branch
+  to add/update primitive annotations. If existing annotations for common primitives
+  are already there, it is best to remove them first so that annotations for
+  removed primitives do not stay around. We will re-add all primitives in the next step.
+* Run `add.sh` in the root of this package, which will add primitive annotations
+  to `primitives`. See instructions in the script for more information.
+* Verify the changes in `primitives`, then add and commit the files to git.
+* Publish the branch in `primitives` and make a merge request.
+
+# How to release a new version
+
+A new version is always released from the `master` branch against a stable release
+of the `d3m` core package. A new version should be released when there are major
+changes to the package (many new primitives added, larger breaking changes).
+Sync up with other developers of the repo to suggest or do a release.
+
+* On `master` branch:
+  * Make sure the `HISTORY.md` file is updated with all changes since the last release.
+  * Change the version in `common_primitives/__init__.py` to the to-be-released version, without the `v` prefix.
+  * Change `vNEXT` in `HISTORY.md` to the to-be-released version, with the `v` prefix.
+  * Commit with message `Bumping version for release.`
+  * `git push`
+  * Wait for CI to run tests successfully.
+  * Tag with the version prefixed with `v`, e.g., for version `0.2.0`: `git tag v0.2.0`
+  * `git push` & `git push --tags`
+  * Change the version in `common_primitives/__init__.py` back to the `devel` string.
+  * Add a new empty `vNEXT` version on top of `HISTORY.md`.
+  * Commit with message `Version bump for development.`
+  * `git push`
+* On `devel` branch:
+  * Merge `master` into `devel` branch: `git merge master`
+  * Update the branch according to the section below.
+  * `git push`
+
+# How to update `master` branch after a release of new `d3m` core package
+
+Hopefully, the `devel` branch already contains code which works against the released
+`d3m` core package. If so, merge the `devel` branch into the `master` branch and update
+files according to the following section.
+
+# Keeping `master` and `devel` branches in sync
+
+Because the `master` and `devel` branches mostly contain the same code,
+just made against different versions of the `d3m` core package, it is common
+to merge the branches into each other as needed to keep them in sync.
+When doing so, the following files are specific to each branch:
+
+* `.gitlab-ci.yml` has a `DEPENDENCY_REF` environment variable which
+  has to point to `master` on the `master` branch of this repository,
+  and to `devel` on the `devel` branch of this repository.
+
+# How to add an example pipeline
+
+Every common primitive (except those used in non-standard pipelines, like splitting primitives)
+should have at least one example pipeline and associated pipeline run.
+
+Add example pipelines into the `pipelines` directory in the repository, under a sub-directory
+corresponding to the primitive's suffix. If a pipeline uses multiple common primitives, add it for only
+one primitive and create symbolic links for the others, for example as sketched below.
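+For instance, with a hypothetical pipeline file name and primitive directories, the symbolic
+links can be created roughly like this (a sketch only, not the exact layout of this repository):
+```
+# The real pipeline file lives under one primitive's directory; every other primitive
+# the pipeline uses gets a relative symbolic link pointing at that single copy.
+cd pipelines/data_transformation.dataset_to_dataframe.Common
+ln -s ../data_transformation.column_parser.Common/example-pipeline.json example-pipeline.json
+```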
+ +Create a `fit-score` pipeline run as [described in primitives index repository](https://gitlab.com/datadrivendiscovery/primitives#adding-a-primitive). +Compress it with `gzip` and store it under `pipeline_runs` directory in the repository. +Similarly, add it only for one primitive and create symbolic links for others, if pipeline run +corresponds to a pipeline with multiple common primitives. + +Use `git-add.sh` script to assure all files larger than 100 KB are added as git LFS files to +the repository. diff --git a/tods/common-primitives/LICENSE.txt b/tods/common-primitives/LICENSE.txt new file mode 100644 index 0000000..261eeb9 --- /dev/null +++ b/tods/common-primitives/LICENSE.txt @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. 
The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/tods/common-primitives/MANIFEST.in b/tods/common-primitives/MANIFEST.in new file mode 100644 index 0000000..3e677d0 --- /dev/null +++ b/tods/common-primitives/MANIFEST.in @@ -0,0 +1,2 @@ +include README.md +include LICENSE.txt diff --git a/tods/common-primitives/README.md b/tods/common-primitives/README.md new file mode 100644 index 0000000..fe2fbcf --- /dev/null +++ b/tods/common-primitives/README.md @@ -0,0 +1,83 @@ +# Common D3M primitives + +A common set of primitives for D3M project, maintained together. +It contains example primitives, various glue primitives, and other primitives performers +contributed. + +## Installation + +This package works on Python 3.6+ and pip 19+. + +This package additional dependencies which are specified in primitives' metadata, +but if you are manually installing the package, you have to first run, for Ubuntu: + +``` +$ apt-get install build-essential libopenblas-dev libcap-dev ffmpeg +$ pip3 install python-prctl +``` + +To install common primitives from inside a cloned repository, run: + +``` +$ pip3 install -e . +``` + +When cloning a repository, clone it recursively to get also git submodules: + +``` +$ git clone --recursive https://gitlab.com/datadrivendiscovery/common-primitives.git +``` + +## Changelog + +See [HISTORY.md](./HISTORY.md) for summary of changes to this package. + +## Repository structure + +`master` branch contains latest code of common primitives made against the latest stable +release of the [`d3m` core package](https://gitlab.com/datadrivendiscovery/d3m) (its `master` branch). +`devel` branch contains latest code of common primitives made against the +future release of the `d3m` core package (its `devel` branch). + +Releases are [tagged](https://gitlab.com/datadrivendiscovery/d3m/tags) but they are not done +regularly. Each primitive has its own versions as well, which are not related to package versions. +Generally is the best to just use the latest code available in `master` or `devel` +branches (depending which version of the core package you are using). + +## Testing locally + +For each commit to this repository, tests run automatically in the +[GitLab CI](https://gitlab.com/datadrivendiscovery/common-primitives/pipelines). 
+ +If you don't want to wait for the GitLab CI test results and run the tests locally, +you can install and use the [GitLab runner](https://docs.gitlab.com/runner/install/) in your system. + +With the local GitLab runner, you can run the tests defined in the [.gitlab-ci.yml](.gitlab-ci.yml) +file of this repository, such as: + +``` +$ gitlab-runner exec docker style_check +$ gitlab-runner exec docker type_check +``` + +You can also just try to run tests available under `/tests` by running: + +``` +$ python3 run_tests.py +``` + +## Contribute + +Feel free to contribute more primitives to this repository. The idea is that we build +a common set of primitives which can help both as an example, but also to have shared +maintenance of some primitives, especially glue primitives. + +All primitives are written in Python 3 and are type checked using +[mypy](http://www.mypy-lang.org/), so typing annotations are required. + +## About Data Driven Discovery Program + +DARPA Data Driven Discovery (D3M) Program is researching ways to get machines to build +machine learning pipelines automatically. It is split into three layers: +TA1 (primitives), TA2 (systems which combine primitives automatically into pipelines +and executes them), and TA3 (end-users interfaces). diff --git a/tods/common-primitives/add.sh b/tods/common-primitives/add.sh new file mode 100755 index 0000000..7059b16 --- /dev/null +++ b/tods/common-primitives/add.sh @@ -0,0 +1,24 @@ +#!/bin/bash -e + +# Assumption is that this repository is cloned into "common-primitives" directory +# which is a sibling of "d3m-primitives" directory with D3M public primitives. + +D3M_VERSION="$(python3 -c 'import d3m; print(d3m.__version__)')" + +for PRIMITIVE_SUFFIX in $(./list_primitives.py --suffix); do + echo "$PRIMITIVE_SUFFIX" + python3 -m d3m index describe -i 4 "d3m.primitives.$PRIMITIVE_SUFFIX" > primitive.json + pushd ../d3m-primitives > /dev/null + ./add.py ../common-primitives/primitive.json + popd > /dev/null + if [[ -e "pipelines/$PRIMITIVE_SUFFIX" ]]; then + PRIMITIVE_PATH="$(echo ../d3m-primitives/v$D3M_VERSION/common-primitives/d3m.primitives.$PRIMITIVE_SUFFIX/*)" + mkdir -p "$PRIMITIVE_PATH/pipelines" + find pipelines/$PRIMITIVE_SUFFIX/ \( -name '*.json' -or -name '*.yaml' -or -name '*.yml' -or -name '*.json.gz' -or -name '*.yaml.gz' -or -name '*.yml.gz' \) -exec cp '{}' "$PRIMITIVE_PATH/pipelines" ';' + fi + if [[ -e "pipeline_runs/$PRIMITIVE_SUFFIX" ]]; then + PRIMITIVE_PATH="$(echo ../d3m-primitives/v$D3M_VERSION/common-primitives/d3m.primitives.$PRIMITIVE_SUFFIX/*)" + mkdir -p "$PRIMITIVE_PATH/pipeline_runs" + find pipeline_runs/$PRIMITIVE_SUFFIX/ \( -name '*.yml.gz' -or -name '*.yaml.gz' \) -exec cp '{}' "$PRIMITIVE_PATH/pipeline_runs" ';' + fi +done diff --git a/tods/common-primitives/common_primitives/__init__.py b/tods/common-primitives/common_primitives/__init__.py new file mode 100644 index 0000000..d79ffe1 --- /dev/null +++ b/tods/common-primitives/common_primitives/__init__.py @@ -0,0 +1,2 @@ +__version__ = '0.8.0' +__author__ = 'common-primitives' diff --git a/tods/common-primitives/common_primitives/add_semantic_types.py b/tods/common-primitives/common_primitives/add_semantic_types.py new file mode 100644 index 0000000..6ea74e3 --- /dev/null +++ b/tods/common-primitives/common_primitives/add_semantic_types.py @@ -0,0 +1,78 @@ +import copy +import typing +import os + +from d3m import container, utils as d3m_utils +from d3m.metadata import base as metadata_base, hyperparams +from d3m.primitive_interfaces import base, 
transformer + +import common_primitives + +__all__ = ('AddSemanticTypesPrimitive',) + +Inputs = container.DataFrame +Outputs = container.DataFrame + + +class Hyperparams(hyperparams.Hyperparams): + columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description='A set of column indices of columns to add semantic types for.', + ) + semantic_types = hyperparams.Set( + elements=hyperparams.Hyperparameter[str](''), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description='Semantic types to add for columns listed in "columns".', + ) + + +class AddSemanticTypesPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): + """ + A primitive which adds semantic types for columns in a DataFrame. + """ + + metadata = metadata_base.PrimitiveMetadata( + { + 'id': 'd7e14b12-abeb-42d8-942f-bdb077b4fd37', + 'version': '0.1.0', + 'name': "Add semantic types to columns", + 'python_path': 'd3m.primitives.data_transformation.add_semantic_types.Common', + 'source': { + 'name': common_primitives.__author__, + 'contact': 'mailto:mitar.commonprimitives@tnode.com', + 'uris': [ + 'https://gitlab.com/datadrivendiscovery/common-primitives/blob/master/common_primitives/add_semantic_types.py', + 'https://gitlab.com/datadrivendiscovery/common-primitives.git', + ], + }, + 'installation': [{ + 'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/common-primitives.git@{git_commit}#egg=common_primitives'.format( + git_commit=d3m_utils.current_git_commit(os.path.dirname(__file__)), + ), + }], + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.DATA_CONVERSION, + ], + 'primitive_family': metadata_base.PrimitiveFamily.DATA_TRANSFORMATION, + }, + ) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + outputs = copy.copy(inputs) + + outputs.metadata = self._update_metadata(outputs.metadata) + + return base.CallResult(outputs) + + def _update_metadata(self, inputs_metadata: metadata_base.DataMetadata) -> metadata_base.DataMetadata: + outputs_metadata = inputs_metadata + + for column_index in self.hyperparams['columns']: + for semantic_type in self.hyperparams['semantic_types']: + outputs_metadata = outputs_metadata.add_semantic_type((metadata_base.ALL_ELEMENTS, column_index), semantic_type) + + return outputs_metadata diff --git a/tods/common-primitives/common_primitives/audio_reader.py b/tods/common-primitives/common_primitives/audio_reader.py new file mode 100644 index 0000000..4128e87 --- /dev/null +++ b/tods/common-primitives/common_primitives/audio_reader.py @@ -0,0 +1,137 @@ +import tempfile +import signal +import subprocess +import os + +import frozendict # type: ignore +import numpy # type: ignore +import prctl # type: ignore +from scipy.io import wavfile # type: ignore + +from d3m import container, utils as d3m_utils +from d3m.metadata import base as metadata_base + +import common_primitives +from common_primitives import base + + +class AudioReaderPrimitive(base.FileReaderPrimitiveBase): + """ + A primitive which reads columns referencing audio files. 
+ + Each column which has ``https://metadata.datadrivendiscovery.org/types/FileName`` semantic type + and a valid media type (``audio/aiff``, ``audio/flac``, ``audio/ogg``, ``audio/wav``, ``audio/mpeg``) + has every filename read into an audio represented as a numpy array. By default the resulting column + with read arrays is appended to existing columns. + + The shape of numpy arrays is S x C. S is the number of samples, C is the number of + channels in an audio (e.g., C = 1 for mono, C = 2 for stereo). dtype is float32. + """ + + _supported_media_types = ( + 'audio/aiff', + 'audio/flac', + 'audio/ogg', + 'audio/wav', + 'audio/mpeg', + ) + _file_structural_type = container.ndarray + _file_semantic_types = ('http://schema.org/AudioObject',) + + metadata = metadata_base.PrimitiveMetadata( + { + 'id': '05e6eba3-2f5a-4934-8309-a6d17e099400', + 'version': '0.1.0', + 'name': 'Columns audio reader', + 'python_path': 'd3m.primitives.data_preprocessing.audio_reader.Common', + 'keywords': ['audio', 'reader', 'aiff', 'flac', 'ogg', 'wav', 'mpeg'], + 'source': { + 'name': common_primitives.__author__, + 'contact': 'mailto:mitar.commonprimitives@tnode.com', + 'uris': [ + 'https://gitlab.com/datadrivendiscovery/common-primitives/blob/master/common_primitives/audio_reader.py', + 'https://gitlab.com/datadrivendiscovery/common-primitives.git', + ], + }, + 'installation': [{ + 'type': metadata_base.PrimitiveInstallationType.UBUNTU, + 'package': 'build-essential', + 'version': '12.4ubuntu1', + }, { + 'type': metadata_base.PrimitiveInstallationType.UBUNTU, + 'package': 'libcap-dev', + 'version': '1:2.25-1.1', + }, { + 'type': metadata_base.PrimitiveInstallationType.UBUNTU, + 'package': 'ffmpeg', + 'version': '7:2.8.11-0', + }, { + # "python-prctl" requires "build-essential" and "libcap-dev". We list it here instead of + # "setup.py" to not have to list these system dependencies for every common primitive (because + # we cannot assure this primitive annotation gets installed first). + 'type': metadata_base.PrimitiveInstallationType.PIP, + 'package': 'python-prctl', + 'version': '1.7', + }, { + 'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/common-primitives.git@{git_commit}#egg=common_primitives'.format( + git_commit=d3m_utils.current_git_commit(os.path.dirname(__file__)), + ), + }], + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.FILE_MANIPULATION, + ], + 'supported_media_types': _supported_media_types, + 'primitive_family': metadata_base.PrimitiveFamily.DATA_PREPROCESSING, + } + ) + + def _read_fileuri(self, metadata: frozendict.FrozenOrderedDict, fileuri: str) -> container.ndarray: + # Ideally, temporary files are created in ramdisk by configuring Python's location of temporary files. + with tempfile.NamedTemporaryFile(mode='rb') as output_file: + # We use ffmpeg to convert all audio files to same format. + args = [ + 'ffmpeg', + '-y', # Always overwrite existing files. + '-nostdin', # No interaction. + '-i', fileuri, # Input file. + '-vn', # There is no video. + '-acodec', 'pcm_f32le', # We want everything in float32 dtype. + '-f', 'wav', # This will give us sample rate available in metadata. + output_file.name, # Output file. + ] + + try: + result = subprocess.run( + args, stdin=subprocess.DEVNULL, stdout=subprocess.PIPE, stderr=subprocess.PIPE, + # Setting "pdeathsig" will make the ffmpeg process be killed if our process dies for any reason. 
+ encoding='utf8', check=True, preexec_fn=lambda: prctl.set_pdeathsig(signal.SIGKILL), + ) + except subprocess.CalledProcessError as error: + self.logger.error("Error running ffmpeg: %(stderr)s", {'stderr': error.stderr}) + raise + + self.logger.debug("Finished running ffmpeg: %(stderr)s", {'stderr': result.stderr}) + + sampling_rate, audio_array = wavfile.read(output_file.name, mmap=True) + + assert audio_array.dtype == numpy.float32, audio_array.dtype + + if audio_array.ndim == 1: + # Make sure there are always two dimensions. + audio_array = audio_array.reshape(list(audio_array.shape) + [1]) + + assert audio_array.ndim == 2, audio_array.ndim + + audio_array = container.ndarray(audio_array, { + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': container.ndarray, + }, generate_metadata=False) + + audio_array.metadata = audio_array.metadata.update((), { + 'dimension': { + 'sampling_rate': sampling_rate, + }, + }) + + return audio_array diff --git a/tods/common-primitives/common_primitives/base.py b/tods/common-primitives/common_primitives/base.py new file mode 100644 index 0000000..66712cd --- /dev/null +++ b/tods/common-primitives/common_primitives/base.py @@ -0,0 +1,437 @@ +import abc +import typing +import weakref + +import frozendict # type: ignore +import numpy # type: ignore +import pandas # type: ignore + +from d3m import container, exceptions, types +from d3m.base import utils as base_utils +from d3m.metadata import base as metadata_base, hyperparams, params +from d3m.primitive_interfaces import base, generator, transformer + +from common_primitives import dataset_utils + +__all__ = ( + 'FileReaderPrimitiveBase', + 'DatasetSplitPrimitiveBase', + 'TabularSplitPrimitiveBase', +) + +FileReaderInputs = container.DataFrame +FileReaderOutputs = container.DataFrame + + +class FileReaderHyperparams(hyperparams.Hyperparams): + use_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to operate on. If any specified column does not contain filenames for supported media types, it is skipped.", + ) + exclude_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='append', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should columns with read files be appended, should they replace original columns, or should only columns with read files be returned?", + ) + add_index_columns = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + + +class FileReaderPrimitiveBase(transformer.TransformerPrimitiveBase[FileReaderInputs, FileReaderOutputs, FileReaderHyperparams]): + """ + A primitive base class for reading files referenced in columns. + """ + + _supported_media_types: typing.Sequence[str] = () + _file_structural_type: type = None + # If any of these semantic types already exists on a column, then nothing is done. 
+ # If all are missing, the first one is set. + _file_semantic_types: typing.Sequence[str] = () + + def __init__(self, *, hyperparams: FileReaderHyperparams) -> None: + super().__init__(hyperparams=hyperparams) + + # Because same file can be referenced multiple times in multiple rows, we maintain + # a cache of read files so that we do not have to read same files again and again. + self._cache: weakref.WeakValueDictionary[typing.Tuple[int, str], typing.Any] = weakref.WeakValueDictionary() + + def _can_use_column(self, inputs_metadata: metadata_base.DataMetadata, column_index: int) -> bool: + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + if column_metadata['structural_type'] != str: + return False + + semantic_types = column_metadata.get('semantic_types', []) + media_types = set(column_metadata.get('media_types', [])) + + if 'https://metadata.datadrivendiscovery.org/types/FileName' in semantic_types and media_types <= set(self._supported_media_types): + return True + + return False + + def _get_columns(self, inputs_metadata: metadata_base.DataMetadata) -> typing.List[int]: + def can_use_column(column_index: int) -> bool: + return self._can_use_column(inputs_metadata, column_index) + + columns_to_use, columns_not_to_use = base_utils.get_columns_to_use(inputs_metadata, self.hyperparams['use_columns'], self.hyperparams['exclude_columns'], can_use_column) + + # We are OK if no columns ended up being read. + # "base_utils.combine_columns" will throw an error if it cannot work with this. + + if self.hyperparams['use_columns'] and columns_not_to_use: + self.logger.warning("Not all specified columns contain filenames for supported media types. Skipping columns: %(columns)s", { + 'columns': columns_not_to_use, + }) + + return columns_to_use + + def produce(self, *, inputs: FileReaderInputs, timeout: float = None, iterations: int = None) -> base.CallResult[FileReaderOutputs]: + columns_to_use = self._get_columns(inputs.metadata) + + output_columns = [self._produce_column(inputs, column_index) for column_index in columns_to_use] + + outputs = base_utils.combine_columns(inputs, columns_to_use, output_columns, return_result=self.hyperparams['return_result'], add_index_columns=self.hyperparams['add_index_columns']) + + if self.hyperparams['return_result'] == 'append': + outputs.metadata = self._reassign_boundaries(outputs.metadata, columns_to_use) + + return base.CallResult(outputs) + + @abc.abstractmethod + def _read_fileuri(self, metadata: frozendict.FrozenOrderedDict, fileuri: str) -> typing.Any: + pass + + def _read_filename(self, column_index: int, metadata: frozendict.FrozenOrderedDict, filename: str) -> typing.Any: + # TODO: Support handling multiple "location_base_uris". + # "location_base_uris" should be made so that we can just concat with the filename + # ("location_base_uris" end with "/"). + fileuri = metadata['location_base_uris'][0] + filename + + # We do not use the structure where we check if the key exists in the cache and if not set it and then + # return from the cache outside if clause because we are not sure garbage collection might not remove it + # before we get to return. So we directly ask for a reference and return it, or we obtain the file + # and populate the cache. 
+ file = self._cache.get((column_index, fileuri), None) + if file is not None: + return file + + file = self._read_fileuri(metadata, fileuri) + + # We cache the file based on column index as well, because it could be that file is read differently + # based on column metadata, or that resulting metadata is different for a different column. + # We cache only if we can make a weakref. Many Python built-in types like "str" do not support them. + if type(file).__weakrefoffset__: + self._cache[(column_index, fileuri)] = file + + return file + + def _produce_column(self, inputs: FileReaderInputs, column_index: int) -> FileReaderOutputs: + read_files = [self._read_filename(column_index, inputs.metadata.query((row_index, column_index)), value) for row_index, value in enumerate(inputs.iloc[:, column_index])] + + column = container.DataFrame({inputs.columns[column_index]: read_files}, generate_metadata=False) + + column.metadata = self._produce_column_metadata(inputs.metadata, column_index, read_files) + column.metadata = column.metadata.generate(column, compact=True) + + return column + + def _produce_column_metadata( + self, inputs_metadata: metadata_base.DataMetadata, column_index: int, read_files: typing.Sequence[typing.Any], + ) -> metadata_base.DataMetadata: + column_metadata = inputs_metadata.select_columns([column_index]) + column_metadata = column_metadata.update_column(0, { + 'structural_type': self._file_structural_type, + # Clear metadata useful for filename columns. + 'location_base_uris': metadata_base.NO_VALUE, + 'media_types': metadata_base.NO_VALUE, + }) + + # It is not a filename anymore. + column_metadata = column_metadata.remove_semantic_type((metadata_base.ALL_ELEMENTS, 0), 'https://metadata.datadrivendiscovery.org/types/FileName') + + # At least one semantic type from listed semantic types should be set. + semantic_types = column_metadata.query_column(0).get('semantic_types', []) + if not set(semantic_types) & set(self._file_semantic_types): + # Add the first one. + column_metadata = column_metadata.add_semantic_type((metadata_base.ALL_ELEMENTS, 0), self._file_semantic_types[0]) + + for row_index, file in enumerate(read_files): + # Copy metadata only if we have a container type. + if isinstance(file, types.Container): + column_metadata = file.metadata.copy_to(column_metadata, (), (row_index, 0)) + + column_metadata = column_metadata.compact(['name', 'structural_type', 'media_types', 'location_base_uris', 'semantic_types']) + + return column_metadata + + def _reassign_boundaries(self, inputs_metadata: metadata_base.DataMetadata, columns: typing.List[int]) -> metadata_base.DataMetadata: + """ + Moves metadata about boundaries from the filename column to image object column. + """ + + outputs_metadata = inputs_metadata + columns_length = inputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + for column_index in range(columns_length): + column_metadata = outputs_metadata.query_column(column_index) + + if 'boundary_for' not in column_metadata: + continue + + # TODO: Support also "column_name" boundary metadata. + if 'column_index' not in column_metadata['boundary_for']: + continue + + try: + i = columns.index(column_metadata['boundary_for']['column_index']) + except ValueError: + continue + + outputs_metadata = outputs_metadata.update_column(column_index, { + 'boundary_for': { + # We know that "columns" were appended at the end. 
+ 'column_index': columns_length - len(columns) + i, + } + }) + + return outputs_metadata + + +DatasetSplitInputs = container.List +DatasetSplitOutputs = container.List + + +class DatasetSplitPrimitiveBase(generator.GeneratorPrimitiveBase[DatasetSplitOutputs, base.Params, base.Hyperparams]): + """ + A base class for primitives which fit on a ``Dataset`` object to produce splits of that + ``Dataset`` when producing. There are two produce methods: `produce` and `produce_score_data`. + They take as an input a list of non-negative integers which identify which ``Dataset`` + splits to return. + + This class is parameterized using only by two type variables, + ``Params`` and ``Hyperparams``. + """ + + @abc.abstractmethod + def produce(self, *, inputs: DatasetSplitInputs, timeout: float = None, iterations: int = None) -> base.CallResult[DatasetSplitOutputs]: + """ + For each input integer creates a ``Dataset`` split and produces the training ``Dataset`` object. + This ``Dataset`` object should then be used to fit (train) the pipeline. + """ + + @abc.abstractmethod + def produce_score_data(self, *, inputs: DatasetSplitInputs, timeout: float = None, iterations: int = None) -> base.CallResult[DatasetSplitOutputs]: + """ + For each input integer creates a ``Dataset`` split and produces the scoring ``Dataset`` object. + This ``Dataset`` object should then be used to test the pipeline and score the results. + + Output ``Dataset`` objects do not have targets redacted and are not directly suitable for testing. + """ + + @abc.abstractmethod + def set_training_data(self, *, dataset: container.Dataset) -> None: # type: ignore + """ + Sets training data of this primitive, the ``Dataset`` to split. + + Parameters + ---------- + dataset : Dataset + The dataset to split. + """ + + +class TabularSplitPrimitiveParams(params.Params): + dataset: typing.Optional[container.Dataset] + main_resource_id: typing.Optional[str] + splits: typing.Optional[typing.List[typing.Tuple[numpy.ndarray, numpy.ndarray]]] + graph: typing.Optional[typing.Dict[str, typing.List[typing.Tuple[str, bool, int, int, typing.Dict]]]] + + +# TODO: Make clear the assumption that both output container type (List) and output Datasets should have metadata. +# Redaction primitive expects that, while there is officially no reason for Datasets +# to really have metadata: metadata is stored available on the input container type, not +# values inside it. +class TabularSplitPrimitiveBase(DatasetSplitPrimitiveBase[TabularSplitPrimitiveParams, base.Hyperparams]): + def __init__(self, *, hyperparams: base.Hyperparams, random_seed: int = 0) -> None: + super().__init__(hyperparams=hyperparams, random_seed=random_seed) + + # We need random seed multiple times. So we create our own random state we use everywhere. 
+ self._random_state = numpy.random.RandomState(self.random_seed) + self._fitted: bool = False + self._dataset: container.Dataset = None + self._main_resource_id: str = None + self._splits: typing.List[typing.Tuple[numpy.ndarray, numpy.ndarray]] = None + self._graph: typing.Dict[str, typing.List[typing.Tuple[str, bool, int, int, typing.Dict]]] = None + + def produce(self, *, inputs: DatasetSplitInputs, timeout: float = None, iterations: int = None) -> base.CallResult[DatasetSplitOutputs]: + return self._produce(inputs, True) + + def produce_score_data(self, *, inputs: DatasetSplitInputs, timeout: float = None, iterations: int = None) -> base.CallResult[DatasetSplitOutputs]: + return self._produce(inputs, False) + + def set_training_data(self, *, dataset: container.Dataset) -> None: # type: ignore + main_resource_id, main_resource = base_utils.get_tabular_resource(dataset, None, has_hyperparameter=False) + + self._main_resource_id = main_resource_id + self._dataset = dataset + self._fitted = False + + def fit(self, *, timeout: float = None, iterations: int = None) -> base.CallResult[None]: + """ + This function computes everything in advance, including generating the relation graph. + """ + + if self._dataset is None: + raise exceptions.InvalidStateError('Missing training data.') + + if self._fitted: + return base.CallResult(None) + + targets, target_columns = self._get_target_columns(self._dataset, self._main_resource_id) + attributes = self._get_attribute_columns(self._dataset, self._main_resource_id, target_columns) + + # Get splits' indices. + self._splits = self._get_splits(attributes, targets, self._dataset, self._main_resource_id) + + # Graph is the adjacency representation for the relations graph. Make it not be a "defaultdict". + self._graph = dict(self._dataset.get_relations_graph()) + + self._fitted = True + + return base.CallResult(None) + + def fit_multi_produce(self, *, produce_methods: typing.Sequence[str], inputs: DatasetSplitInputs, dataset: container.Dataset, timeout: float = None, iterations: int = None) -> base.MultiCallResult: # type: ignore + return self._fit_multi_produce(produce_methods=produce_methods, timeout=timeout, iterations=iterations, inputs=inputs, dataset=dataset) # type: ignore + + @abc.abstractmethod + def _get_splits(self, attributes: pandas.DataFrame, targets: pandas.DataFrame, dataset: container.Dataset, main_resource_id: str) -> typing.List[typing.Tuple[numpy.ndarray, numpy.ndarray]]: + pass + + def _get_target_columns(self, dataset: container.Dataset, main_resource_id: str) -> typing.Tuple[pandas.DataFrame, typing.Sequence[int]]: + target_columns = dataset.metadata.list_columns_with_semantic_types(['https://metadata.datadrivendiscovery.org/types/TrueTarget'], at=(main_resource_id,)) + + # It is OK if there are no target columns. "_get_splits" should raise an exception + # if this is a problem for a given split logic. + + return dataset[main_resource_id].iloc[:, list(target_columns)], target_columns + + def _get_attribute_columns(self, dataset: container.Dataset, main_resource_id: str, target_columns: typing.Sequence[int]) -> pandas.DataFrame: + attribute_columns = dataset.metadata.list_columns_with_semantic_types(['https://metadata.datadrivendiscovery.org/types/Attribute'], at=(main_resource_id,)) + + if not attribute_columns: + # No attribute columns with semantic types, let's use all + # non-target columns as attributes then. 
+ all_columns = list(range(dataset.metadata.query((main_resource_id, metadata_base.ALL_ELEMENTS,))['dimension']['length'])) + attribute_columns = [column_index for column_index in all_columns if column_index not in target_columns] + + if not attribute_columns: + raise ValueError("No attribute columns.") + + return dataset[main_resource_id].iloc[:, list(attribute_columns)] + + def _produce(self, inputs: DatasetSplitInputs, is_train: bool) -> base.CallResult[DatasetSplitOutputs]: + """ + This function splits the fitted Dataset. + + Parameters + ---------- + inputs : List[int] + A list of 0-based indices which specify which splits to be used as test split in output. + is_train : bool + Whether we are producing train or test data. + + Returns + ------- + List[Dataset] + Returns a list of Datasets. + """ + + if not self._fitted: + raise exceptions.PrimitiveNotFittedError("Primitive not fitted.") + + output_datasets = container.List(generate_metadata=True) + + for index in inputs: + train_indices, test_indices = self._splits[index] + + if is_train: + output_dataset = dataset_utils.sample_rows( + self._dataset, + self._main_resource_id, + set(train_indices), + self._graph, + delete_recursive=self.hyperparams.get('delete_recursive', False), + ) + else: + output_dataset = dataset_utils.sample_rows( + self._dataset, + self._main_resource_id, + set(test_indices), + self._graph, + delete_recursive=self.hyperparams.get('delete_recursive', False), + ) + + output_datasets.append(output_dataset) + + output_datasets.metadata = metadata_base.DataMetadata({ + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': container.List, + 'dimension': { + 'length': len(output_datasets), + }, + }) + + # We update metadata based on metadata of each dataset. + # TODO: In the future this might be done automatically by generate_metadata. 
+ # See: https://gitlab.com/datadrivendiscovery/d3m/issues/119 + for index, dataset in enumerate(output_datasets): + output_datasets.metadata = dataset.metadata.copy_to(output_datasets.metadata, (), (index,)) + + return base.CallResult(output_datasets) + + def get_params(self) -> TabularSplitPrimitiveParams: + if not self._fitted: + return TabularSplitPrimitiveParams( + dataset=None, + main_resource_id=None, + splits=None, + graph=None, + ) + + return TabularSplitPrimitiveParams( + dataset=self._dataset, + main_resource_id=self._main_resource_id, + splits=self._splits, + graph=self._graph, + ) + + def set_params(self, *, params: TabularSplitPrimitiveParams) -> None: + self._dataset = params['dataset'] + self._main_resource_id = params['main_resource_id'] + self._splits = params['splits'] + self._graph = params['graph'] + self._fitted = all(param is not None for param in params.values()) + + def __getstate__(self) -> dict: + state = super().__getstate__() + + state['random_state'] = self._random_state + + return state + + def __setstate__(self, state: dict) -> None: + super().__setstate__(state) + + self._random_state = state['random_state'] diff --git a/tods/common-primitives/common_primitives/cast_to_type.py b/tods/common-primitives/common_primitives/cast_to_type.py new file mode 100644 index 0000000..7290129 --- /dev/null +++ b/tods/common-primitives/common_primitives/cast_to_type.py @@ -0,0 +1,122 @@ +import os +import typing + +from d3m import container, utils as d3m_utils +from d3m.base import utils as base_utils +from d3m.metadata import base as metadata_base, hyperparams +from d3m.primitive_interfaces import base, transformer + +import common_primitives + +__all__ = ('CastToTypePrimitive',) + +Inputs = container.DataFrame +Outputs = container.DataFrame + + +class Hyperparams(hyperparams.Hyperparams): + type_to_cast = hyperparams.Enumeration[str]( + values=['str', 'float'], + default='str', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + ) + use_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to operate on. If any specified column cannot be cast to the type, it is skipped.", + ) + exclude_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.", + ) + + +class CastToTypePrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): + """ + A primitive which casts all columns it can cast (by default, controlled by ``use_columns``, + ``exclude_columns``) of an input DataFrame to a given structural type (dtype). + It removes columns which are not cast. 
+ """ + + metadata = metadata_base.PrimitiveMetadata( + { + 'id': 'eb5fe752-f22a-4090-948b-aafcef203bf5', + 'version': '0.2.0', + 'name': "Casts DataFrame", + 'python_path': 'd3m.primitives.data_transformation.cast_to_type.Common', + 'source': { + 'name': common_primitives.__author__, + 'contact': 'mailto:mitar.commonprimitives@tnode.com', + 'uris': [ + 'https://gitlab.com/datadrivendiscovery/common-primitives/blob/master/common_primitives/cast_to_type.py', + 'https://gitlab.com/datadrivendiscovery/common-primitives.git', + ], + }, + 'installation': [{ + 'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/common-primitives.git@{git_commit}#egg=common_primitives'.format( + git_commit=d3m_utils.current_git_commit(os.path.dirname(__file__)), + ), + }], + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.DATA_CONVERSION, + ], + 'primitive_family': metadata_base.PrimitiveFamily.DATA_TRANSFORMATION, + }, + ) + + _type_map = { + 'str': str, + 'float': float, + } + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + type_to_cast = self._type_map[self.hyperparams['type_to_cast']] + + columns_to_use = self._get_columns(inputs.metadata, type_to_cast) + + outputs = inputs.iloc[:, list(columns_to_use)].astype(type_to_cast) + outputs_metadata = inputs.metadata.select_columns(columns_to_use) + + outputs_metadata = outputs_metadata.update((metadata_base.ALL_ELEMENTS, metadata_base.ALL_ELEMENTS), { + 'structural_type': type_to_cast, + }) + + outputs.metadata = outputs_metadata + + return base.CallResult(outputs) + + def _can_use_column(self, inputs_metadata: metadata_base.DataMetadata, column_index: int, type_to_cast: type) -> bool: + if type_to_cast == str: + # TODO: Anything can be converted to string, but is it meaningful (Python string description of object might not be)? Should we limit what can be cast this way? + return True + else: + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + structural_type = column_metadata.get('structural_type', None) + + if structural_type is None: + return False + + return d3m_utils.is_numeric(structural_type) + + def _get_columns(self, inputs_metadata: metadata_base.DataMetadata, type_to_cast: type) -> typing.Sequence[int]: + def can_use_column(column_index: int) -> bool: + return self._can_use_column(inputs_metadata, column_index, type_to_cast) + + columns_to_use, columns_not_to_use = base_utils.get_columns_to_use(inputs_metadata, self.hyperparams['use_columns'], self.hyperparams['exclude_columns'], can_use_column) + + if not columns_to_use: + raise ValueError("No columns to be cast to type '{type}'.".format(type=type_to_cast)) + + # We prefer if all columns could be cast, not just specified columns, + # so we warn always when there are columns which cannot be produced. + elif columns_not_to_use: + self.logger.warning("Not all columns can be cast to type '%(type)s'. 
Skipping columns: %(columns)s", { + 'type': type_to_cast, + 'columns': columns_not_to_use, + }) + + return columns_to_use diff --git a/tods/common-primitives/common_primitives/column_map.py b/tods/common-primitives/common_primitives/column_map.py new file mode 100644 index 0000000..289536e --- /dev/null +++ b/tods/common-primitives/common_primitives/column_map.py @@ -0,0 +1,361 @@ +import collections +import copy +import os.path +import typing + +from d3m import container, exceptions, index, utils as d3m_utils +from d3m.base import utils as base_utils +from d3m.metadata import base as metadata_base, hyperparams as hyperparams_module, params +from d3m.primitive_interfaces import base, transformer, unsupervised_learning + +import common_primitives + + +Inputs = container.DataFrame +Outputs = container.DataFrame + + +class Params(params.Params): + # For each column, for each cell in a column, we have potentially params of a primitive. + columns_params: typing.Optional[typing.List[typing.List[params.Params]]] + + +class Hyperparams(hyperparams_module.Hyperparams): + # TODO: How to specify that input type of allowed primitive has to be "DataFrame". + # See: https://gitlab.com/datadrivendiscovery/d3m/issues/335 + primitive = hyperparams_module.Union[typing.Union[transformer.TransformerPrimitiveBase, unsupervised_learning.UnsupervisedLearnerPrimitiveBase]]( + configuration=collections.OrderedDict( + transformer=hyperparams_module.Primitive[transformer.TransformerPrimitiveBase]( # type: ignore + # TODO: This default in fact gets List as input and produces List. Not DataFrame. + # But in fact it just passes through whatever it gets, so it works out. + # See: https://gitlab.com/datadrivendiscovery/d3m/issues/214 + default=index.get_primitive('d3m.primitives.operator.null.TransformerTest'), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A transformer primitive.", + ), + unsupervised_learner=hyperparams_module.Primitive[unsupervised_learning.UnsupervisedLearnerPrimitiveBase]( # type: ignore + # TODO: This default in fact gets List as input and produces List. Not DataFrame. + # But in fact it just passes through whatever it gets, so it works out. + # See: https://gitlab.com/datadrivendiscovery/d3m/issues/214 + default=index.get_primitive('d3m.primitives.operator.null.UnsupervisedLearnerTest'), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="An unsupervised learner primitive. If it is already fitted and you do not want to re-fit it, " + "set \"fit_primitive\" to \"no\".", + ), + ), + default='transformer', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A primitive to use for mapping of each cell value. 
Has to take \"DataFrame\" as input.", + ) + fit_primitive = hyperparams_module.Enumeration( + values=['no', 'fit', 'continue_fit'], + default='fit', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Fit an unsupervised learner primitive or not.", + ) + produce_method = hyperparams_module.Hyperparameter[str]( + default='produce', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Name of primitive's produce method to use.", + ) + use_columns = hyperparams_module.Set( + elements=hyperparams_module.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to operate on. If any specified column cannot be mapped, it is skipped.", + ) + exclude_columns = hyperparams_module.Set( + elements=hyperparams_module.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams_module.Enumeration( + values=['append', 'replace', 'new'], + default='replace', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should mapped columns be appended, should they replace original columns, or should only mapped columns be returned?", + ) + add_index_columns = hyperparams_module.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_columns = hyperparams_module.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no column is selected/provided. Otherwise issue a warning.", + ) + + +# TODO: Implement optimized "fit_multi_produce" which calls "fit_multi_produce" of underlying primitive. +class DataFrameColumnMapPrimitive(unsupervised_learning.UnsupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]): + """ + A primitive which for every column with embedded ``DataFrame`` cells (by default) + runs provided ``primitive`` on every cell's value, producing new cell's value. + + ``primitive`` can be transformer or fitted or unfitted unsupervised learner primitive. + If it is already fitted and you do not want to re-fit it, set ``fit_primitive`` to ``no``. + Otherwise, if ``fit_primitive`` is set to ``fit``, for each cell's value a copy of the + primitive will be made and it will be first fitted and then produced on that value. + If ``fit_primitive`` is set to ``continue_fit``, a copy of the primitive is made per + column and it is continue fitted on all cell values in the column, in row order. + + Input to the ``primitive`` has to be container ``DataFrame``, but output can be any + container type. 
+ """ + + metadata = metadata_base.PrimitiveMetadata( + { + 'id': 'fe58e7bb-f6c7-4d91-b897-69faf33bece5', + 'version': '0.1.0', + 'name': "Map DataFrame cell values to new values using provided primitive", + 'python_path': 'd3m.primitives.operator.column_map.Common', + 'source': { + 'name': common_primitives.__author__, + 'contact': 'mailto:mitar.commonprimitives@tnode.com', + 'uris': [ + 'https://gitlab.com/datadrivendiscovery/common-primitives/blob/master/common_primitives/column_map.py', + 'https://gitlab.com/datadrivendiscovery/common-primitives.git', + ], + }, + 'installation': [{ + 'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/common-primitives.git@{git_commit}#egg=common_primitives'.format( + git_commit=d3m_utils.current_git_commit(os.path.dirname(__file__)), + ), + }], + 'algorithm_types': [ + # TODO: Change to "MAP". + metadata_base.PrimitiveAlgorithmType.DATA_CONVERSION, + ], + 'primitive_family': metadata_base.PrimitiveFamily.OPERATOR, + }, + ) + + def __init__(self, *, hyperparams: Hyperparams) -> None: + super().__init__(hyperparams=hyperparams) + + self._training_inputs: Inputs = None + self._columns_primitives: typing.List[typing.List[base.PrimitiveBase]] = None + self._fitted: bool = False + + def _should_fit(self) -> bool: + if self.hyperparams['fit_primitive'] == 'no': + return False + + if isinstance(self.hyperparams['primitive'], transformer.TransformerPrimitiveBase): + return False + + if self.hyperparams['fit_primitive'] == 'continue_fit' and not isinstance(self.hyperparams['primitive'], base.ContinueFitMixin): + raise exceptions.InvalidArgumentValueError("\"fit_primitive\" hyper-parameter is set to \"continue_fit\", but primitive does not inherit the \"ContinueFitMixin\" class.") + + return True + + def set_training_data(self, *, inputs: Inputs) -> None: # type: ignore + if not self._should_fit(): + return + + self._training_inputs = inputs + self._fitted = False + + def fit(self, *, timeout: float = None, iterations: int = None) -> base.CallResult[None]: + if not self._should_fit(): + return base.CallResult(None) + + if self._training_inputs is None: + raise exceptions.InvalidStateError("Missing training data.") + + self._columns_primitives = self._fit_columns(self._training_inputs) + self._fitted = True + + return base.CallResult(None) + + def _fit_columns(self, inputs: Inputs) -> typing.List[typing.List[base.PrimitiveBase]]: + columns_to_use = self._get_columns(inputs.metadata) + + columns_primitives = [] + + for column_index in columns_to_use: + columns_primitives.append(self._fit_column(inputs, column_index)) + + assert len(columns_primitives) == len(columns_to_use) + + return columns_primitives + + def _prepare_cell_value(self, inputs_metadata: metadata_base.DataMetadata, value: container.DataFrame, row_index: int, column_index: int) -> container.DataFrame: + assert isinstance(value, container.DataFrame) + + value = copy.copy(value) + + value.metadata = metadata_base.DataMetadata({ + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + }) + + value.metadata = inputs_metadata.copy_to( + value.metadata, + (row_index, column_index), + ) + + return value + + def _fit_column(self, inputs: Inputs, column_index: int) -> typing.List[base.PrimitiveBase]: + column_primitives = [] + primitive = None + + for row_index, column_value in enumerate(inputs.iloc[:, column_index]): + column_value = self._prepare_cell_value(inputs.metadata, column_value, row_index, column_index) + + # If "fit_primitive" is 
"continue_fit" we copy the primitive only once. + if self.hyperparams['fit_primitive'] == 'fit' or primitive is None: + primitive = copy.deepcopy(self.hyperparams['primitive']) + column_primitives.append(primitive) + + primitive.set_training_data(inputs=column_value) + + if self.hyperparams['fit_primitive'] == 'fit': + primitive.fit() + else: + assert self.hyperparams['fit_primitive'] == 'continue_fit' + primitive.continue_fit() + + return column_primitives + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + if self._should_fit() and not self._fitted: + raise exceptions.PrimitiveNotFittedError("Primitive not fitted.") + + assert self._should_fit() == self._fitted + assert (self._columns_primitives is not None) == self._fitted + + if self.hyperparams['produce_method'] != 'produce' and not self.hyperparams['produce_method'].startswith('produce_'): + raise exceptions.InvalidArgumentValueError(f"Invalid produce method name in \"produce_method\" hyper-parameter: {self.hyperparams['produce_method']}") + + columns_to_use, output_columns = self._produce_columns(inputs, self._columns_primitives) + + outputs = base_utils.combine_columns(inputs, columns_to_use, output_columns, return_result=self.hyperparams['return_result'], add_index_columns=self.hyperparams['add_index_columns']) + + return base.CallResult(outputs) + + def _can_use_column(self, inputs_metadata: metadata_base.DataMetadata, column_index: int) -> bool: + structural_type = inputs_metadata.query_column_field(column_index, 'structural_type') + + return issubclass(structural_type, container.DataFrame) + + def _get_columns(self, inputs_metadata: metadata_base.DataMetadata) -> typing.List[int]: + def can_use_column(column_index: int) -> bool: + return self._can_use_column(inputs_metadata, column_index) + + columns_to_use, columns_not_to_use = base_utils.get_columns_to_use(inputs_metadata, self.hyperparams['use_columns'], self.hyperparams['exclude_columns'], can_use_column) + + if not columns_to_use: + if self.hyperparams['error_on_no_columns']: + raise ValueError("No inputs columns.") + else: + self.logger.warning("No inputs columns.") + + if self.hyperparams['use_columns'] and columns_not_to_use: + self.logger.warning("Not all specified inputs columns can be used. Skipping columns: %(columns)s", { + 'columns': columns_not_to_use, + }) + + return columns_to_use + + def _produce_columns( + self, inputs: Inputs, columns_primitives: typing.Optional[typing.List[typing.List[base.PrimitiveBase]]], + ) -> typing.Tuple[typing.List[int], typing.List[Outputs]]: + columns_to_use = self._get_columns(inputs.metadata) + + output_columns = [] + + for column_index in columns_to_use: + output_columns.append(self._produce_column(inputs, column_index, columns_primitives)) + + assert len(columns_to_use) == len(output_columns) + + return columns_to_use, output_columns + + # TODO: Instead of copying metadata to a cell value and then back, we could maybe just hack it by setting a correct reference. + # So cell value metadata would point directly into dataframe column's (we would select input column + # first and just modify metadata directly there) metadata object for element corresponding to the cell value. + # How would that work if there is any metadata on dataframe's ALL_ELEMENTS? For updating it does not matter + # because cell value metadata has precedence anyway? 
But we would still first have to copy metadata from ALL_ELEMENTS + # to cell value metadata so that it is available there for querying. + def _produce_column(self, inputs: Inputs, column_index: int, columns_primitives: typing.Optional[typing.List[typing.List[base.PrimitiveBase]]]) -> Outputs: + output_column_values = [] + + if columns_primitives is not None: + if self.hyperparams['fit_primitive'] == 'fit': + # We will set it later for every row. + primitive = None + else: + assert self.hyperparams['fit_primitive'] == 'continue_fit' + # When "fit_primitive" is "continue_fit", we have only + # one primitive instance for the whole column. + primitive = columns_primitives[column_index][0] + else: + # It could be that "fit_primitive" is "no" or that we have a transformer primitive. + primitive = self.hyperparams['primitive'] + + for row_index, column_value in enumerate(inputs.iloc[:, column_index]): + column_value = self._prepare_cell_value(inputs.metadata, column_value, row_index, column_index) + + if columns_primitives is not None and self.hyperparams['fit_primitive'] == 'fit': + primitive = columns_primitives[column_index][row_index] + + output_value = getattr(primitive, self.hyperparams['produce_method'])(inputs=column_value).value + + output_column_values.append(output_value) + + output_column = container.DataFrame({inputs.columns[column_index]: output_column_values}, generate_metadata=False) + + output_column.metadata = metadata_base.DataMetadata(inputs.metadata.query(())) + output_column.metadata = output_column.metadata.update((metadata_base.ALL_ELEMENTS, 0), inputs.metadata.query((metadata_base.ALL_ELEMENTS, column_index))) + output_column.metadata = output_column.metadata.generate(output_column) + + # TODO: Because metadata generation does not reuse existing metadata, we have to copy it ourselves. + # See: https://gitlab.com/datadrivendiscovery/d3m/issues/119 + for row_index, column_value in enumerate(output_column_values): + output_column.metadata = column_value.metadata.copy_to( + output_column.metadata, + (), + (row_index, 0), + ) + + # TODO: Should we compact metadata? It could make it nicer. + # But it could be slow, especially with nested DataFrames. 
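+        # "output_column" now carries regenerated basic metadata plus the per-cell
+        # metadata copied back above, ready to be combined with the other produced
+        # columns in "produce".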
+ + return output_column + + def get_params(self) -> Params: + if not self._fitted: + return Params( + columns_params=None, + ) + + return Params( + columns_params=[ + [primitive.get_params() for primitive in column] + for column in self._columns_primitives + ], + ) + + def set_params(self, *, params: Params) -> None: + if params['columns_primitives'] is None: + self._columns_primitives = None + self._fitted = False + return + + columns_primitives = [] + for column in params['columns_primitives']: + column_primitives = [] + + for params in column: + primitive = copy.deepcopy(self.hyperparams['primitive']) + primitive.set_params(params) + column_primitives.append(primitive) + + columns_primitives.append(column_primitives) + + self._columns_primitives = columns_primitives + self._fitted = True diff --git a/tods/common-primitives/common_primitives/column_parser.py b/tods/common-primitives/common_primitives/column_parser.py new file mode 100644 index 0000000..e6204ae --- /dev/null +++ b/tods/common-primitives/common_primitives/column_parser.py @@ -0,0 +1,398 @@ +import hashlib +import os +import typing + +import numpy # type: ignore + +from d3m import container, utils as d3m_utils +from d3m.base import utils as base_utils +from d3m.metadata import base as metadata_base, hyperparams +from d3m.primitive_interfaces import base, transformer + +import common_primitives +from common_primitives import utils + +__all__ = ('ColumnParserPrimitive',) + +Inputs = container.DataFrame +Outputs = container.DataFrame + + +class Hyperparams(hyperparams.Hyperparams): + parse_semantic_types = hyperparams.Set( + elements=hyperparams.Enumeration( + values=[ + 'http://schema.org/Boolean', 'https://metadata.datadrivendiscovery.org/types/CategoricalData', + 'http://schema.org/Integer', 'http://schema.org/Float', + 'https://metadata.datadrivendiscovery.org/types/FloatVector', 'http://schema.org/DateTime', + ], + # Default is ignored. + # TODO: Remove default. See: https://gitlab.com/datadrivendiscovery/d3m/issues/141 + default='http://schema.org/Boolean', + ), + default=( + 'http://schema.org/Boolean', 'https://metadata.datadrivendiscovery.org/types/CategoricalData', + 'http://schema.org/Integer', 'http://schema.org/Float', + 'https://metadata.datadrivendiscovery.org/types/FloatVector', 'http://schema.org/DateTime', + ), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of semantic types to parse. One can provide a subset of supported semantic types to limit what the primitive parses.", + ) + use_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to operate on. If any specified column cannot be parsed, it is skipped.", + ) + exclude_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not operate on. 
Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='replace', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned?", + ) + add_index_columns = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + parse_categorical_target_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should it parse also categorical target columns?", + ) + replace_index_columns = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Replace primary index columns even if otherwise appending columns. Applicable only if \"return_result\" is set to \"append\".", + ) + fuzzy_time_parsing = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Use fuzzy time parsing.", + ) + + +class ColumnParserPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): + """ + A primitive which parses strings into their parsed values. + + It goes over all columns (by default, controlled by ``use_columns``, ``exclude_columns``) + and checks those with structural type ``str`` if they have a semantic type suggesting + that they are a boolean value, categorical, integer, float, or time (by default, + controlled by ``parse_semantic_types``). Categorical values are converted with + hash encoding. + + What is returned is controlled by ``return_result`` and ``add_index_columns``. 
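+
+    With the default ``return_result`` of ``replace``, parsed columns take the place of
+    the original string columns and all other columns are passed through unchanged.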
+ """ + + metadata = metadata_base.PrimitiveMetadata( + { + 'id': 'd510cb7a-1782-4f51-b44c-58f0236e47c7', + 'version': '0.6.0', + 'name': "Parses strings into their types", + 'python_path': 'd3m.primitives.data_transformation.column_parser.Common', + 'source': { + 'name': common_primitives.__author__, + 'contact': 'mailto:mitar.commonprimitives@tnode.com', + 'uris': [ + 'https://gitlab.com/datadrivendiscovery/common-primitives/blob/master/common_primitives/column_parser.py', + 'https://gitlab.com/datadrivendiscovery/common-primitives.git', + ], + }, + 'installation': [{ + 'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/common-primitives.git@{git_commit}#egg=common_primitives'.format( + git_commit=d3m_utils.current_git_commit(os.path.dirname(__file__)), + ), + }], + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.DATA_CONVERSION, + ], + 'primitive_family': metadata_base.PrimitiveFamily.DATA_TRANSFORMATION, + }, + ) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + columns_to_use, output_columns = self._produce_columns(inputs) + + if self.hyperparams['replace_index_columns'] and self.hyperparams['return_result'] == 'append': + assert len(columns_to_use) == len(output_columns) + + index_columns = inputs.metadata.get_index_columns() + + index_columns_to_use = [] + other_columns_to_use = [] + index_output_columns = [] + other_output_columns = [] + for column_to_use, output_column in zip(columns_to_use, output_columns): + if column_to_use in index_columns: + index_columns_to_use.append(column_to_use) + index_output_columns.append(output_column) + else: + other_columns_to_use.append(column_to_use) + other_output_columns.append(output_column) + + outputs = base_utils.combine_columns(inputs, index_columns_to_use, index_output_columns, return_result='replace', add_index_columns=self.hyperparams['add_index_columns']) + outputs = base_utils.combine_columns(outputs, other_columns_to_use, other_output_columns, return_result='append', add_index_columns=self.hyperparams['add_index_columns']) + else: + outputs = base_utils.combine_columns(inputs, columns_to_use, output_columns, return_result=self.hyperparams['return_result'], add_index_columns=self.hyperparams['add_index_columns']) + + return base.CallResult(outputs) + + def _can_use_column(self, inputs_metadata: metadata_base.DataMetadata, column_index: int) -> bool: + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + # We produce only on columns which have not yet been parsed (are strings). + if column_metadata['structural_type'] != str: + return False + + semantic_types = column_metadata.get('semantic_types', []) + + for semantic_type in self.hyperparams['parse_semantic_types']: + if semantic_type not in semantic_types: + continue + + if semantic_type == 'https://metadata.datadrivendiscovery.org/types/CategoricalData': + # Skip parsing if a column is categorical, but also a target column. + if not self.hyperparams['parse_categorical_target_columns'] and 'https://metadata.datadrivendiscovery.org/types/Target' in semantic_types: + continue + + return True + + return False + + def _produce_columns(self, inputs: Inputs) -> typing.Tuple[typing.List[int], typing.List[Outputs]]: + # The logic of parsing values tries to mirror also the logic of detecting + # values in "SimpleProfilerPrimitive". One should keep them in sync. 
+ + columns_to_use = self._get_columns(inputs.metadata) + + # We check against this list again, because there might be multiple matching semantic types + # (which is not really valid). + parse_semantic_types = self.hyperparams['parse_semantic_types'] + + output_columns = [] + + for column_index in columns_to_use: + column_metadata = inputs.metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + semantic_types = column_metadata.get('semantic_types', []) + if column_metadata['structural_type'] == str: + if 'http://schema.org/Boolean' in parse_semantic_types and 'http://schema.org/Boolean' in semantic_types: + output_columns.append(self._parse_boolean_data(inputs, column_index)) + + elif 'https://metadata.datadrivendiscovery.org/types/CategoricalData' in parse_semantic_types and \ + 'https://metadata.datadrivendiscovery.org/types/CategoricalData' in semantic_types and \ + (self.hyperparams['parse_categorical_target_columns'] or 'https://metadata.datadrivendiscovery.org/types/Target' not in semantic_types): + output_columns.append(self._parse_categorical_data(inputs, column_index)) + + elif 'http://schema.org/Integer' in parse_semantic_types and 'http://schema.org/Integer' in semantic_types: + # For primary key we know all values have to exist so we can assume they can always be represented as integers. + if 'https://metadata.datadrivendiscovery.org/types/PrimaryKey' in semantic_types: + integer_required = True + else: + integer_required = False + + output_columns.append(self._parse_integer(inputs, column_index, integer_required)) + + elif 'http://schema.org/Float' in parse_semantic_types and 'http://schema.org/Float' in semantic_types: + output_columns.append(self._parse_float_data(inputs, column_index)) + + elif 'https://metadata.datadrivendiscovery.org/types/FloatVector' in parse_semantic_types and 'https://metadata.datadrivendiscovery.org/types/FloatVector' in semantic_types: + output_columns.append(self._parse_float_vector_data(inputs, column_index)) + + elif 'http://schema.org/DateTime' in parse_semantic_types and 'http://schema.org/DateTime' in semantic_types: + output_columns.append(self._parse_time_data(inputs, column_index, self.hyperparams['fuzzy_time_parsing'])) + + else: + assert False, column_index + + assert len(output_columns) == len(columns_to_use) + + return columns_to_use, output_columns + + def _produce_columns_metadata(self, inputs_metadata: metadata_base.DataMetadata) -> typing.Tuple[typing.List[int], typing.List[metadata_base.DataMetadata]]: + columns_to_use = self._get_columns(inputs_metadata) + + # We check against this list again, because there might be multiple matching semantic types + # (which is not really valid). 
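+        # Metadata-only counterpart of "_produce_columns": the same dispatch by semantic
+        # type, but only the column metadata is rewritten; no data is parsed here.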
+ parse_semantic_types = self.hyperparams['parse_semantic_types'] + + output_columns = [] + + for column_index in columns_to_use: + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + semantic_types = column_metadata.get('semantic_types', []) + if column_metadata['structural_type'] == str: + if 'http://schema.org/Boolean' in parse_semantic_types and 'http://schema.org/Boolean' in semantic_types: + output_columns.append(self._parse_boolean_metadata(inputs_metadata, column_index)) + + elif 'https://metadata.datadrivendiscovery.org/types/CategoricalData' in parse_semantic_types and \ + 'https://metadata.datadrivendiscovery.org/types/CategoricalData' in semantic_types and \ + (self.hyperparams['parse_categorical_target_columns'] or 'https://metadata.datadrivendiscovery.org/types/Target' not in semantic_types): + output_columns.append(self._parse_categorical_metadata(inputs_metadata, column_index)) + + elif 'http://schema.org/Integer' in parse_semantic_types and 'http://schema.org/Integer' in semantic_types: + output_columns.append(self._parse_integer_metadata(inputs_metadata, column_index)) + + elif 'http://schema.org/Float' in parse_semantic_types and 'http://schema.org/Float' in semantic_types: + output_columns.append(self._parse_float_metadata(inputs_metadata, column_index)) + + elif 'https://metadata.datadrivendiscovery.org/types/FloatVector' in parse_semantic_types and 'https://metadata.datadrivendiscovery.org/types/FloatVector' in semantic_types: + output_columns.append(self._parse_float_vector_metadata(inputs_metadata, column_index)) + + elif 'http://schema.org/DateTime' in parse_semantic_types and 'http://schema.org/DateTime' in semantic_types: + output_columns.append(self._parse_time_metadata(inputs_metadata, column_index)) + + else: + assert False, column_index + + assert len(output_columns) == len(columns_to_use) + + return columns_to_use, output_columns + + def _get_columns(self, inputs_metadata: metadata_base.DataMetadata) -> typing.List[int]: + def can_use_column(column_index: int) -> bool: + return self._can_use_column(inputs_metadata, column_index) + + columns_to_use, columns_not_to_use = base_utils.get_columns_to_use(inputs_metadata, self.hyperparams['use_columns'], self.hyperparams['exclude_columns'], can_use_column) + + # We are OK if no columns ended up being parsed. + # "base_utils.combine_columns" will throw an error if it cannot work with this. + + if self.hyperparams['use_columns'] and columns_not_to_use: + self.logger.warning("Not all specified columns can parsed. 
Skipping columns: %(columns)s", { + 'columns': columns_not_to_use, + }) + + return columns_to_use + + @classmethod + def _parse_boolean_data(cls, inputs: Inputs, column_index: metadata_base.SimpleSelectorSegment) -> Outputs: + return cls._parse_categorical_data(inputs, column_index) + + @classmethod + def _parse_boolean_metadata(cls, inputs_metadata: metadata_base.DataMetadata, column_index: metadata_base.SimpleSelectorSegment) -> metadata_base.DataMetadata: + return cls._parse_categorical_metadata(inputs_metadata, column_index) + + @classmethod + def _parse_categorical_data(cls, inputs: Inputs, column_index: metadata_base.SimpleSelectorSegment) -> Outputs: + values_map: typing.Dict[str, int] = {} + for value in inputs.iloc[:, column_index]: + value = value.strip() + if value not in values_map: + value_hash = hashlib.sha256(value.encode('utf8')) + values_map[value] = int.from_bytes(value_hash.digest()[0:8], byteorder='little') ^ int.from_bytes(value_hash.digest()[8:16], byteorder='little') ^ \ + int.from_bytes(value_hash.digest()[16:24], byteorder='little') ^ int.from_bytes(value_hash.digest()[24:32], byteorder='little') + + outputs = container.DataFrame({inputs.columns[column_index]: [values_map[value.strip()] for value in inputs.iloc[:, column_index]]}, generate_metadata=False) + outputs.metadata = cls._parse_categorical_metadata(inputs.metadata, column_index) + + return outputs + + @classmethod + def _parse_categorical_metadata(cls, inputs_metadata: metadata_base.DataMetadata, column_index: metadata_base.SimpleSelectorSegment) -> metadata_base.DataMetadata: + outputs_metadata = inputs_metadata.select_columns([column_index]) + return outputs_metadata.update_column(0, {'structural_type': int}) + + @classmethod + def _str_to_int(cls, value: str) -> typing.Union[float, int]: + try: + return int(value.strip()) + except ValueError: + try: + # Maybe it is an int represented as a float. Let's try this. This can get rid of non-integer + # part of the value, but the integer was requested through a semantic type, so this is probably OK. + return int(float(value.strip())) + except ValueError: + # No luck, use NaN to represent a missing value. + return float('nan') + + @classmethod + def _parse_integer(cls, inputs: Inputs, column_index: metadata_base.SimpleSelectorSegment, + integer_required: bool) -> container.DataFrame: + outputs = container.DataFrame({inputs.columns[column_index]: [cls._str_to_int(value) for value in inputs.iloc[:, column_index]]}, generate_metadata=False) + + if outputs.dtypes.iloc[0].kind == 'f': + structural_type: type = float + elif outputs.dtypes.iloc[0].kind in ['i', 'u']: + structural_type = int + else: + assert False, outputs.dtypes.iloc[0] + + if structural_type is float and integer_required: + raise ValueError("Not all values in a column can be parsed into integers, but only integers were expected.") + + outputs.metadata = inputs.metadata.select_columns([column_index]) + outputs.metadata = outputs.metadata.update_column(0, {'structural_type': structural_type}) + + return outputs + + @classmethod + def _parse_integer_metadata(cls, inputs_metadata: metadata_base.DataMetadata, column_index: metadata_base.SimpleSelectorSegment) -> metadata_base.DataMetadata: + outputs_metadata = inputs_metadata.select_columns([column_index]) + # Without data we assume we can parse everything into integers. This might not be true and + # we might end up parsing into floats if we have to represent missing (or invalid) values. 
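+        # "_parse_integer" corrects the structural type from the actually parsed values
+        # at produce time, so a column declared int here may still come back as float.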
+ return outputs_metadata.update_column(0, {'structural_type': int}) + + @classmethod + def _str_to_float(cls, value: str) -> float: + try: + return float(value.strip()) + except ValueError: + return float('nan') + + @classmethod + def _parse_float_data(cls, inputs: Inputs, column_index: metadata_base.SimpleSelectorSegment) -> Outputs: + outputs = container.DataFrame({inputs.columns[column_index]: [cls._str_to_float(value) for value in inputs.iloc[:, column_index]]}, generate_metadata=False) + outputs.metadata = cls._parse_float_metadata(inputs.metadata, column_index) + + return outputs + + @classmethod + def _parse_float_metadata(cls, inputs_metadata: metadata_base.DataMetadata, column_index: metadata_base.SimpleSelectorSegment) -> metadata_base.DataMetadata: + outputs_metadata = inputs_metadata.select_columns([column_index]) + return outputs_metadata.update_column(0, {'structural_type': float}) + + @classmethod + def _parse_float_vector_data(cls, inputs: Inputs, column_index: metadata_base.SimpleSelectorSegment) -> Outputs: + # We are pretty strict here because we are assuming this was generated programmatically. + outputs = container.DataFrame( + { + inputs.columns[column_index]: [ + container.ndarray([cls._str_to_float(value) for value in values.split(',')]) + for values in inputs.iloc[:, column_index] + ], + }, + generate_metadata=False, + ) + outputs.metadata = cls._parse_float_metadata(inputs.metadata, column_index) + # We have to automatically generate metadata to set ndarray dimension(s). + outputs.metadata = outputs.metadata.generate(outputs) + + return outputs + + @classmethod + def _parse_float_vector_metadata(cls, inputs_metadata: metadata_base.DataMetadata, column_index: metadata_base.SimpleSelectorSegment) -> metadata_base.DataMetadata: + outputs_metadata = inputs_metadata.select_columns([column_index]) + # We cannot know the dimension of the ndarray without data. 
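+        # Only the element type (numpy.float64) is declared here; the vector dimensions
+        # are filled in by metadata generation in "_parse_float_vector_data".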
+ outputs_metadata = outputs_metadata.update_column(0, {'structural_type': container.ndarray}) + outputs_metadata = outputs_metadata.update((metadata_base.ALL_ELEMENTS, 0, metadata_base.ALL_ELEMENTS), {'structural_type': numpy.float64}) + return outputs_metadata + + @classmethod + def _parse_time_data(cls, inputs: Inputs, column_index: metadata_base.SimpleSelectorSegment, fuzzy: bool) -> Outputs: + outputs = container.DataFrame({inputs.columns[column_index]: [utils.parse_datetime_to_float(value, fuzzy=fuzzy) for value in inputs.iloc[:, column_index]]}, generate_metadata=False) + outputs.metadata = cls._parse_time_metadata(inputs.metadata, column_index) + + return outputs + + @classmethod + def _parse_time_metadata(cls, inputs_metadata: metadata_base.DataMetadata, column_index: metadata_base.SimpleSelectorSegment) -> metadata_base.DataMetadata: + outputs_metadata = inputs_metadata.select_columns([column_index]) + return outputs_metadata.update_column(0, {'structural_type': float}) diff --git a/tods/common-primitives/common_primitives/compute_metafeatures.py b/tods/common-primitives/common_primitives/compute_metafeatures.py new file mode 100644 index 0000000..c038d71 --- /dev/null +++ b/tods/common-primitives/common_primitives/compute_metafeatures.py @@ -0,0 +1,600 @@ +import collections +import copy +import os +import typing + +import numpy # type: ignore +import pandas # type: ignore +from scipy import stats # type: ignore +from sklearn import metrics # type: ignore + +from d3m import container, utils as d3m_utils +from d3m.metadata import base as metadata_base, hyperparams +from d3m.primitive_interfaces import base, transformer + +import common_primitives + +__all__ = ('ComputeMetafeaturesPrimitive',) + +Inputs = container.DataFrame +Outputs = container.DataFrame + + +class Hyperparams(hyperparams.Hyperparams): + pass + + +class ComputeMetafeaturesPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): + """ + A primitive which computes meta-features and adds them to metadata. + + Primitive is meant to be used with columns already parsed. 
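+
+    Computed meta-features are stored under the ``data_metafeatures`` metadata key:
+    table-level ones on the DataFrame itself and per-column ones on attribute and
+    target columns.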
+ """ + + __author__ = 'Mingjie Sun ' + metadata = metadata_base.PrimitiveMetadata( + { + 'id': '915832a4-8059-438d-9118-f4fb4f7b0aaf', + 'version': '0.1.0', + 'name': "Compute meta-features", + 'python_path': 'd3m.primitives.metalearning.metafeature_extractor.Common', + 'source': { + 'name': common_primitives.__author__, + 'contact': 'mailto:sunmj15@gmail.com', + 'uris': [ + 'https://gitlab.com/datadrivendiscovery/common-primitives/blob/master/common_primitives/compute_metafeatures.py', + 'https://gitlab.com/datadrivendiscovery/common-primitives.git', + ], + }, + 'installation': [{ + 'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/common-primitives.git@{git_commit}#egg=common_primitives'.format( + git_commit=d3m_utils.current_git_commit(os.path.dirname(__file__)), + ), + }], + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.DATA_PROFILING, + metadata_base.PrimitiveAlgorithmType.MUTUAL_INFORMATION, + metadata_base.PrimitiveAlgorithmType.SIGNAL_TO_NOISE_RATIO, + metadata_base.PrimitiveAlgorithmType.INFORMATION_ENTROPY, + ], + 'primitive_family': metadata_base.PrimitiveFamily.METALEARNING, + }, + ) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + attribute_columns_indices_list = inputs.metadata.list_columns_with_semantic_types(['https://metadata.datadrivendiscovery.org/types/Attribute']) + attribute_columns_indices = set(attribute_columns_indices_list) + target_columns_indices_list = inputs.metadata.list_columns_with_semantic_types(['https://metadata.datadrivendiscovery.org/types/TrueTarget']) + target_columns_indices = set(target_columns_indices_list) + + string_columns_indices = set(inputs.metadata.list_columns_with_structural_types((str,))) + numeric_columns_indices = set(inputs.metadata.list_columns_with_structural_types(d3m_utils.is_numeric)) + discrete_columns_indices = self._get_discrete_indices(inputs, numeric_columns_indices) + binary_columns_indices = self._get_binary_indices(inputs, discrete_columns_indices) + + # Categorical columns can be represented with number or strings. + categorical_columns_indices = set(inputs.metadata.list_columns_with_semantic_types(['https://metadata.datadrivendiscovery.org/types/CategoricalData'])) + + attributes_metafeatures = [ + self._attribute_metafeatures( + inputs, + index, + index in string_columns_indices, + index in numeric_columns_indices, + index in categorical_columns_indices, + index in discrete_columns_indices, + ) for index in attribute_columns_indices_list + ] + targets_metafeatures = [ + self._target_metafeatures( + inputs, + index, + index in string_columns_indices, + index in numeric_columns_indices, + index in categorical_columns_indices, + index in discrete_columns_indices, + attribute_columns_indices_list, + string_columns_indices, + numeric_columns_indices, + discrete_columns_indices, + categorical_columns_indices, + ) for index in target_columns_indices_list + ] + + # Our flags are slightly different from metafeatures schema. Our columns can be both categorical and + # discrete, for example. Or both categorical and string. So we make sure here that we are stricter. 
+ strict_string_columns_indices = string_columns_indices - categorical_columns_indices + strict_numeric_columns_indices = numeric_columns_indices - categorical_columns_indices + strict_discrete_columns_indices = discrete_columns_indices - categorical_columns_indices + strict_binary_columns_indices = binary_columns_indices - categorical_columns_indices + + table_metafeatures = { + 'number_of_attributes': len(attribute_columns_indices), + 'number_of_instances': inputs.shape[0], + 'number_of_categorical_attributes': len(categorical_columns_indices & attribute_columns_indices), + 'number_of_string_attributes': len(strict_string_columns_indices & attribute_columns_indices), + 'number_of_numeric_attributes': len(strict_numeric_columns_indices & attribute_columns_indices), + 'number_of_discrete_attributes': len(strict_discrete_columns_indices & attribute_columns_indices), + 'number_of_binary_attributes': len(strict_binary_columns_indices & attribute_columns_indices), + } + + if table_metafeatures['number_of_instances']: + table_metafeatures['dimensionality'] = table_metafeatures['number_of_attributes'] / table_metafeatures['number_of_instances'] + table_metafeatures['number_of_other_attributes'] = table_metafeatures['number_of_attributes'] - table_metafeatures['number_of_categorical_attributes'] - \ + table_metafeatures['number_of_string_attributes'] - table_metafeatures['number_of_numeric_attributes'] + + if table_metafeatures['number_of_attributes']: + table_metafeatures['ratio_of_categorical_attributes'] = table_metafeatures['number_of_categorical_attributes'] / table_metafeatures['number_of_attributes'] + table_metafeatures['ratio_of_string_attributes'] = table_metafeatures['number_of_string_attributes'] / table_metafeatures['number_of_attributes'] + table_metafeatures['ratio_of_numeric_attributes'] = table_metafeatures['number_of_numeric_attributes'] / table_metafeatures['number_of_attributes'] + table_metafeatures['ratio_of_discrete_attributes'] = table_metafeatures['number_of_discrete_attributes'] / table_metafeatures['number_of_attributes'] + table_metafeatures['ratio_of_binary_attributes'] = table_metafeatures['number_of_binary_attributes'] / table_metafeatures['number_of_attributes'] + table_metafeatures['ratio_of_other_attributes'] = table_metafeatures['number_of_other_attributes'] / table_metafeatures['number_of_attributes'] + + table_metafeatures['number_of_instances_with_missing_values'] = self._get_number_of_instances_with_missing_values(inputs, attribute_columns_indices, string_columns_indices) + if table_metafeatures['number_of_instances']: + table_metafeatures['ratio_of_instances_with_missing_values'] = table_metafeatures['number_of_instances_with_missing_values'] / table_metafeatures['number_of_instances'] + table_metafeatures['number_of_instances_with_present_values'] = self._get_number_of_instances_with_present_values(inputs, attribute_columns_indices, string_columns_indices) + if table_metafeatures['number_of_instances']: + table_metafeatures['ratio_of_instances_with_present_values'] = table_metafeatures['number_of_instances_with_present_values'] / table_metafeatures['number_of_instances'] + + attribute_counts_by_structural_type = self._get_counts_by_structural_type(inputs, attribute_columns_indices) + if attribute_counts_by_structural_type: + table_metafeatures['attribute_counts_by_structural_type'] = attribute_counts_by_structural_type + if len(attribute_columns_indices): + table_metafeatures['attribute_ratios_by_structural_type'] = {key: value / 
len(attribute_columns_indices) for key, value in attribute_counts_by_structural_type.items()} + + attribute_counts_by_semantic_type = self._get_counts_by_semantic_type(inputs, attribute_columns_indices) + if attribute_counts_by_semantic_type: + table_metafeatures['attribute_counts_by_semantic_type'] = attribute_counts_by_semantic_type + if len(attribute_columns_indices): + table_metafeatures['attribute_ratios_by_semantic_type'] = {key: value / len(attribute_columns_indices) for key, value in attribute_counts_by_semantic_type.items()} + + mean_of_attributes = self._aggregate([ + attributes_metafeatures[i]['values_aggregate']['mean'] for i, index in enumerate(attribute_columns_indices_list) if index in strict_numeric_columns_indices + ]) + if mean_of_attributes is not None: + table_metafeatures['mean_of_attributes'] = mean_of_attributes + standard_deviation_of_attributes = self._aggregate([ + attributes_metafeatures[i]['values_aggregate']['std'] for i, index in enumerate(attribute_columns_indices_list) if index in strict_numeric_columns_indices + ]) + if standard_deviation_of_attributes is not None: + table_metafeatures['standard_deviation_of_attributes'] = standard_deviation_of_attributes + kurtosis_of_attributes = self._aggregate([ + attributes_metafeatures[i]['values_aggregate']['kurtosis'] for i, index in enumerate(attribute_columns_indices_list) if index in strict_numeric_columns_indices + ]) + if kurtosis_of_attributes is not None: + table_metafeatures['kurtosis_of_attributes'] = kurtosis_of_attributes + skew_of_attributes = self._aggregate([ + attributes_metafeatures[i]['values_aggregate']['skewness'] for i, index in enumerate(attribute_columns_indices_list) if index in strict_numeric_columns_indices + ]) + if skew_of_attributes is not None: + table_metafeatures['skew_of_attributes'] = skew_of_attributes + + entropy_of_categorical_attributes = self._aggregate([ + attributes_metafeatures[i]['entropy_of_values'] for i, index in enumerate(attribute_columns_indices_list) if index in categorical_columns_indices + ]) + if entropy_of_categorical_attributes is not None: + table_metafeatures['entropy_of_categorical_attributes'] = entropy_of_categorical_attributes + entropy_of_numeric_attributes = self._aggregate([ + attributes_metafeatures[i]['entropy_of_values'] for i, index in enumerate(attribute_columns_indices_list) if index in strict_numeric_columns_indices + ]) + if entropy_of_numeric_attributes is not None: + table_metafeatures['entropy_of_numeric_attributes'] = entropy_of_numeric_attributes + entropy_of_discrete_attributes = self._aggregate([ + attributes_metafeatures[i]['entropy_of_values'] for i, index in enumerate(attribute_columns_indices_list) if index in strict_discrete_columns_indices + ]) + if entropy_of_discrete_attributes is not None: + table_metafeatures['entropy_of_discrete_attributes'] = entropy_of_discrete_attributes + entropy_of_attributes = self._aggregate([ + attributes_metafeatures[i]['entropy_of_values'] for i, index in enumerate(attribute_columns_indices_list) if index in strict_numeric_columns_indices | categorical_columns_indices + ]) + if entropy_of_attributes is not None: + table_metafeatures['entropy_of_attributes'] = entropy_of_attributes + + number_distinct_values_of_categorical_attributes = self._aggregate([ + attributes_metafeatures[i]['number_distinct_values'] for i, index in enumerate(attribute_columns_indices_list) if index in categorical_columns_indices + ]) + if number_distinct_values_of_categorical_attributes is not None: + 
table_metafeatures['number_distinct_values_of_categorical_attributes'] = number_distinct_values_of_categorical_attributes + number_distinct_values_of_numeric_attributes = self._aggregate([ + attributes_metafeatures[i]['number_distinct_values'] for i, index in enumerate(attribute_columns_indices_list) if index in strict_numeric_columns_indices + ]) + if number_distinct_values_of_numeric_attributes is not None: + table_metafeatures['number_distinct_values_of_numeric_attributes'] = number_distinct_values_of_numeric_attributes + number_distinct_values_of_discrete_attributes = self._aggregate([ + attributes_metafeatures[i]['number_distinct_values'] for i, index in enumerate(attribute_columns_indices_list) if index in strict_discrete_columns_indices + ]) + if number_distinct_values_of_discrete_attributes is not None: + table_metafeatures['number_distinct_values_of_discrete_attributes'] = number_distinct_values_of_discrete_attributes + + for i, index in enumerate(target_columns_indices_list): + if 'mutual_information_of_categorical_attributes' in targets_metafeatures[i] and 'entropy_of_categorical_attributes' in table_metafeatures: + mutual_information_of_categorical_attributes = targets_metafeatures[i]['mutual_information_of_categorical_attributes']['mean'] + if mutual_information_of_categorical_attributes: + targets_metafeatures[i]['categorical_noise_to_signal_ratio'] = (table_metafeatures['entropy_of_categorical_attributes']['mean'] - mutual_information_of_categorical_attributes) / \ + mutual_information_of_categorical_attributes + if 'mutual_information_of_numeric_attributes' in targets_metafeatures[i] and 'entropy_of_numeric_attributes' in table_metafeatures: + mutual_information_of_numeric_attributes = targets_metafeatures[i]['mutual_information_of_numeric_attributes']['mean'] + if mutual_information_of_numeric_attributes: + targets_metafeatures[i]['numeric_noise_to_signal_ratio'] = (table_metafeatures['entropy_of_numeric_attributes']['mean'] - mutual_information_of_numeric_attributes) / \ + mutual_information_of_numeric_attributes + if 'mutual_information_of_discrete_attributes' in targets_metafeatures[i] and 'entropy_of_discrete_attributes' in table_metafeatures: + mutual_information_of_discrete_attributes = targets_metafeatures[i]['mutual_information_of_discrete_attributes']['mean'] + if mutual_information_of_discrete_attributes: + targets_metafeatures[i]['discrete_noise_to_signal_ratio'] = (table_metafeatures['entropy_of_discrete_attributes']['mean'] - mutual_information_of_discrete_attributes) / \ + mutual_information_of_discrete_attributes + if 'mutual_information_of_attributes' in targets_metafeatures[i] and 'entropy_of_attributes' in table_metafeatures: + mutual_information_of_attributes = targets_metafeatures[i]['mutual_information_of_attributes']['mean'] + if mutual_information_of_attributes: + targets_metafeatures[i]['noise_to_signal_ratio'] = (table_metafeatures['entropy_of_attributes']['mean'] - mutual_information_of_attributes) / \ + mutual_information_of_attributes + + outputs = copy.copy(inputs) + outputs.metadata = inputs.metadata + + if table_metafeatures: + outputs.metadata = outputs.metadata.update((), {'data_metafeatures': table_metafeatures}) + + for i, index in enumerate(attribute_columns_indices_list): + if attributes_metafeatures[i]: + outputs.metadata = outputs.metadata.update_column(index, {'data_metafeatures': attributes_metafeatures[i]}) + + for i, index in enumerate(target_columns_indices_list): + if targets_metafeatures[i]: + outputs.metadata = 
outputs.metadata.update_column(index, {'data_metafeatures': targets_metafeatures[i]}) + + return base.CallResult(outputs) + + def _get_discrete_indices(self, columns: container.DataFrame, numeric_columns_indices: typing.Set[int]) -> typing.Set[int]: + known_discrete_indices = columns.metadata.list_columns_with_structural_types((int, numpy.integer)) + + discrete_indices = set() + + for index in numeric_columns_indices: + if index in known_discrete_indices: + discrete_indices.add(index) + continue + + # Even if structural type is float, it could still be a discrete column + # where all values are discrete, but column contains also NaN values and this + # is why its structural type is float. + assert d3m_utils.is_float(columns.metadata.query_column(index)['structural_type']), columns.metadata.query_column(index)['structural_type'] + assert d3m_utils.is_float(columns.dtypes[index].type), columns.dtypes[index].type + + # If all values are or integers or NaN values. + if all(v.is_integer() for v in columns.iloc[:, index].dropna()): + discrete_indices.add(index) + + return discrete_indices + + def _get_binary_indices(self, columns: container.DataFrame, discrete_columns_indices: typing.Set[int]) -> typing.Set[int]: + binary_indices = set() + + for index in discrete_columns_indices: + values: typing.Set[typing.Any] = set() + + for value in columns.iloc[:, index].dropna(): + if value in values: + continue + values.add(value) + + if len(values) > 2: + break + + if len(values) == 2: + binary_indices.add(index) + + return binary_indices + + @classmethod + def _get_number_of_instances_with_missing_values(cls, columns: container.DataFrame, attribute_columns_indices: typing.Set[int], string_columns_indices: typing.Set[int]) -> int: + number_of_instances_with_missing_values = 0 + + for row in columns.itertuples(index=False, name=None): + has_missing_values = False + + for column_index, column_value in enumerate(row): + if column_index not in attribute_columns_indices: + continue + + if column_index in string_columns_indices: + if column_value == '': + has_missing_values = True + break + else: + if pandas.isna(column_value): + has_missing_values = True + break + + if has_missing_values: + number_of_instances_with_missing_values += 1 + + return number_of_instances_with_missing_values + + def _get_number_of_instances_with_present_values(self, columns: container.DataFrame, attribute_columns_indices: typing.Set[int], string_columns_indices: typing.Set[int]) -> int: + number_of_instances_with_present_values = 0 + + for row in columns.itertuples(index=False, name=None): + has_present_values = False + + for column_index, column_value in enumerate(row): + if column_index not in attribute_columns_indices: + continue + + if column_index in string_columns_indices: + if column_value != '': + has_present_values = True + break + else: + if not pandas.isna(column_value): + has_present_values = True + break + + if has_present_values: + number_of_instances_with_present_values += 1 + + return number_of_instances_with_present_values + + @classmethod + def _get_counts_by_structural_type(cls, columns: container.DataFrame, columns_indices: typing.Iterable[int]) -> typing.Dict[str, int]: + counts: typing.Dict[str, int] = collections.defaultdict(int) + + for index in columns_indices: + counts[d3m_utils.type_to_str(columns.metadata.query_column(index)['structural_type'])] += 1 + + return dict(counts) + + @classmethod + def _get_counts_by_semantic_type(cls, columns: container.DataFrame, columns_indices: typing.Iterable[int]) -> 
typing.Dict[str, int]: + counts: typing.Dict[str, int] = collections.defaultdict(int) + + for index in columns_indices: + for semantic_type in columns.metadata.query_column(index).get('semantic_types', []): + counts[semantic_type] += 1 + + return dict(counts) + + def _columns_metafeatures(self, columns: container.DataFrame, index: int, is_string: bool, is_numeric: bool, is_categorical: bool, is_discrete: bool) -> typing.Dict[str, typing.Any]: + column_metafeatures: typing.Dict[str, typing.Any] = {} + + values = columns.iloc[:, index] + + if is_string: + if is_categorical: + # Categorical string values have missing data represented as empty strings. + values_without_na = values.replace('', numpy.nan).dropna() + else: + values_without_na = values + elif is_numeric: + values_without_na = values.dropna() + else: + values_without_na = values + + if is_string or is_numeric: + column_metafeatures['number_of_missing_values'] = len(values) - len(values_without_na) + if len(values): + column_metafeatures['ratio_of_missing_values'] = column_metafeatures['number_of_missing_values'] / len(values) + column_metafeatures['number_of_present_values'] = len(values_without_na) + if len(values): + column_metafeatures['ratio_of_present_values'] = column_metafeatures['number_of_present_values'] / len(values) + + if is_numeric or is_categorical: + discrete_values = self._discretize(values_without_na, is_string, is_categorical, is_discrete) + + # There should be no NA anyway anymore. + value_counts = discrete_values.value_counts(dropna=False) + assert len(values_without_na) == value_counts.sum(), (len(values_without_na), value_counts.sum()) + + if len(values_without_na): + value_counts_normalized = value_counts / len(values_without_na) + else: + value_counts_normalized = None + + if is_categorical or is_discrete: + column_metafeatures['number_distinct_values'] = value_counts.size + else: + column_metafeatures['number_distinct_values'] = len(values_without_na.unique()) + if value_counts_normalized is not None: + column_metafeatures['entropy_of_values'] = stats.entropy(value_counts_normalized) + value_counts_aggregate = self._aggregate(value_counts) + if value_counts_aggregate is not None: + column_metafeatures['value_counts_aggregate'] = value_counts_aggregate + if value_counts_normalized is not None: + value_probabilities_aggregate = self._aggregate(value_counts_normalized) + if value_probabilities_aggregate is not None: + column_metafeatures['value_probabilities_aggregate'] = value_probabilities_aggregate + + if is_numeric: + values_aggregate = self._aggregate(values_without_na) + if values_aggregate is not None: + column_metafeatures['values_aggregate'] = values_aggregate + + # Our flags are slightly different from metafeatures schema. Our columns can be both categorical and + # discrete, for example. Or both categorical and string. So we make sure here that we are stricter. 
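+        # The per-value counts below (positive, negative, zero, one, minus one) are
+        # therefore reported only for truly numeric, non-categorical columns.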
+ if is_numeric and not is_categorical: + column_metafeatures['number_of_numeric_values'] = column_metafeatures['number_of_present_values'] + column_metafeatures['ratio_of_numeric_values'] = column_metafeatures['ratio_of_present_values'] + column_metafeatures['number_of_positive_numeric_values'] = int((values_without_na > 0).sum()) + if len(values): + column_metafeatures['ratio_of_positive_numeric_values'] = column_metafeatures['number_of_positive_numeric_values'] / len(values) + column_metafeatures['number_of_negative_numeric_values'] = int((values_without_na < 0).sum()) + if len(values): + column_metafeatures['ratio_of_negative_numeric_values'] = column_metafeatures['number_of_negative_numeric_values'] / len(values) + column_metafeatures['number_of_numeric_values_equal_0'] = int((values_without_na == 0).sum()) + if len(values): + column_metafeatures['ratio_of_numeric_values_equal_0'] = column_metafeatures['number_of_numeric_values_equal_0'] / len(values) + column_metafeatures['number_of_numeric_values_equal_1'] = int((values_without_na == 1).sum()) + if len(values): + column_metafeatures['ratio_of_numeric_values_equal_1'] = column_metafeatures['number_of_numeric_values_equal_1'] / len(values) + column_metafeatures['number_of_numeric_values_equal_-1'] = int((values_without_na == -1).sum()) + if len(values): + column_metafeatures['ratio_of_numeric_values_equal_-1'] = column_metafeatures['number_of_numeric_values_equal_-1'] / len(values) + + return column_metafeatures + + def _attribute_metafeatures(self, columns: container.DataFrame, index: int, is_string: bool, is_numeric: bool, is_categorical: bool, is_discrete: bool) -> typing.Dict[str, typing.Any]: + return self._columns_metafeatures(columns, index, is_string, is_numeric, is_categorical, is_discrete) + + def _target_metafeatures(self, columns: container.DataFrame, target_index: int, is_string: bool, is_numeric: bool, is_categorical: bool, is_discrete: bool, + attribute_columns_indices_list: typing.Sequence[int], string_columns_indices: typing.Set[int], numeric_columns_indices: typing.Set[int], + discrete_columns_indices: typing.Set[int], categorical_columns_indices: typing.Set[int]) -> typing.Dict[str, typing.Any]: + metafeatures = self._columns_metafeatures(columns, target_index, is_string, is_numeric, is_categorical, is_discrete) + + if is_categorical: + if 'value_probabilities_aggregate' in metafeatures: + metafeatures['default_accuracy'] = metafeatures['value_probabilities_aggregate']['max'] + + if is_numeric or is_categorical: + categorical_joint_entropy = [] + numeric_joint_entropy = [] + discrete_joint_entropy = [] + all_joint_entropy = [] + + categorical_mutual_information = [] + numeric_mutual_information = [] + discrete_mutual_information = [] + all_mutual_information = [] + + discrete_target_values = self._discretize(columns.iloc[:, target_index], is_string, is_categorical, is_discrete) + + for attribute_index in attribute_columns_indices_list: + attribute_is_string = attribute_index in string_columns_indices + attribute_is_numeric = attribute_index in numeric_columns_indices + attribute_is_categorical = attribute_index in categorical_columns_indices + attribute_is_discrete = attribute_index in discrete_columns_indices + + if not (attribute_is_numeric or attribute_is_categorical): + continue + + discrete_attribute_values = self._discretize(columns.iloc[:, attribute_index], attribute_is_string, attribute_is_categorical, attribute_is_discrete) + + all_values_without_na = pandas.concat([discrete_attribute_values, 
discrete_target_values], axis=1).dropna(axis=0, how='any') + + attribute_values_without_na = all_values_without_na.iloc[:, 0] + target_values_without_na = all_values_without_na.iloc[:, 1] + + probabilities = [] + # We sort so that we always traverse in the same order so that floating point + # operations are always in the same order to produce exactly the same results. + for attribute_value in sorted(set(attribute_values_without_na)): + for target_value in sorted(set(target_values_without_na)): + probabilities.append(numpy.mean(numpy.logical_and(attribute_values_without_na == attribute_value, target_values_without_na == target_value))) + + joint_entropy = stats.entropy(probabilities) + mutual_information = metrics.mutual_info_score(attribute_values_without_na, target_values_without_na) + + # Our flags are slightly different from metafeatures schema. Our columns can be both categorical and + # discrete, for example. Or both categorical and string. So we make sure here that we are stricter. + if attribute_is_categorical: + categorical_joint_entropy.append(joint_entropy) + categorical_mutual_information.append(mutual_information) + if attribute_is_numeric and not attribute_is_categorical: + numeric_joint_entropy.append(joint_entropy) + numeric_mutual_information.append(mutual_information) + if attribute_is_discrete and not attribute_is_categorical: + discrete_joint_entropy.append(joint_entropy) + discrete_mutual_information.append(mutual_information) + all_joint_entropy.append(joint_entropy) + all_mutual_information.append(mutual_information) + + if categorical_joint_entropy: + joint_entropy_of_categorical_attributes = self._aggregate(categorical_joint_entropy) + if joint_entropy_of_categorical_attributes is not None: + metafeatures['joint_entropy_of_categorical_attributes'] = joint_entropy_of_categorical_attributes + if categorical_mutual_information: + mutual_information_of_categorical_attributes = self._aggregate(categorical_mutual_information) + if mutual_information_of_categorical_attributes is not None: + metafeatures['mutual_information_of_categorical_attributes'] = mutual_information_of_categorical_attributes + metafeatures['equivalent_number_of_categorical_attributes'] = metafeatures['entropy_of_values'] / mutual_information_of_categorical_attributes['mean'] + if numeric_joint_entropy: + joint_entropy_of_numeric_attributes = self._aggregate(numeric_joint_entropy) + if joint_entropy_of_numeric_attributes is not None: + metafeatures['joint_entropy_of_numeric_attributes'] = joint_entropy_of_numeric_attributes + if numeric_mutual_information: + mutual_information_of_numeric_attributes = self._aggregate(numeric_mutual_information) + if mutual_information_of_numeric_attributes is not None: + metafeatures['mutual_information_of_numeric_attributes'] = mutual_information_of_numeric_attributes + metafeatures['equivalent_number_of_numeric_attributes'] = metafeatures['entropy_of_values'] / mutual_information_of_numeric_attributes['mean'] + if discrete_joint_entropy: + joint_entropy_of_discrete_attributes = self._aggregate(discrete_joint_entropy) + if joint_entropy_of_discrete_attributes is not None: + metafeatures['joint_entropy_of_discrete_attributes'] = joint_entropy_of_discrete_attributes + if discrete_mutual_information: + mutual_information_of_discrete_attributes = self._aggregate(discrete_mutual_information) + if mutual_information_of_discrete_attributes is not None: + metafeatures['mutual_information_of_discrete_attributes'] = mutual_information_of_discrete_attributes + 
metafeatures['equivalent_number_of_discrete_attributes'] = metafeatures['entropy_of_values'] / mutual_information_of_discrete_attributes['mean'] + if all_joint_entropy: + joint_entropy_of_attributes = self._aggregate(all_mutual_information) + if joint_entropy_of_attributes is not None: + metafeatures['joint_entropy_of_attributes'] = joint_entropy_of_attributes + if all_mutual_information: + mutual_information_of_attributes = self._aggregate(categorical_mutual_information) + if mutual_information_of_attributes is not None: + metafeatures['mutual_information_of_attributes'] = mutual_information_of_attributes + metafeatures['equivalent_number_of_attributes'] = metafeatures['entropy_of_values'] / mutual_information_of_attributes['mean'] + + return metafeatures + + def _discretize(self, values: typing.Sequence, is_string: bool, is_categorical: bool, is_discrete: bool) -> pandas.Series: + if not isinstance(values, pandas.Series): + values = pandas.Series(values) + + if is_discrete: + # These can still be values with structural type float, but are in fact discrete + # numbers which might contain NaN and this is why structural type is float. + return values + + if is_string: + # This means we have categorical string values. + assert is_categorical + + # Categorical string values have missing data represented as empty strings. + values = values.replace('', numpy.nan) + + # We leave values as strings and expect caller to specially handle this case if necessary. + return values + + if is_categorical: + # Categorical values should be only strings or discrete numbers, but if we got to + # here this is not really true, but we cannot really do anything. + return values + + # If we got to here we have true numeric values, and we have to bin them. + + values = pandas.Series(pandas.cut(values, round(len(values) ** (1/3)), include_lowest=True, labels=False)) + + return values + + def _aggregate(self, values: typing.Sequence) -> typing.Optional[typing.Dict[str, typing.Any]]: + if not isinstance(values, pandas.Series): + values = pandas.Series(values) + + if not len(values): + return None + + results = { + 'count': len(values), + 'min': values.min(), + 'max': values.max(), + 'mean': values.mean(), + 'median': values.median(), + 'std': values.std(), + 'quartile_1': values.quantile(0.25), + 'quartile_3': values.quantile(0.75), + 'kurtosis': values.kurtosis(), + 'skewness': values.skew(), + } + + # We iterate over a list so that we can change dict while iterating. + for name, value in list(results.items()): + # If anything cannot be computed, we remove it. 
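+            # "numpy.isfinite" is False for NaN (and infinities), which is what pandas
+            # returns for aggregates it cannot compute, e.g. the standard deviation or
+            # skewness of a single value.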
+ if not numpy.isfinite(value): + del results[name] + + if not results: + return None + + return results diff --git a/tods/common-primitives/common_primitives/construct_predictions.py b/tods/common-primitives/common_primitives/construct_predictions.py new file mode 100644 index 0000000..9aa4ffb --- /dev/null +++ b/tods/common-primitives/common_primitives/construct_predictions.py @@ -0,0 +1,262 @@ +import os +import typing + +from d3m import container, utils as d3m_utils +from d3m.metadata import base as metadata_base, hyperparams +from d3m.primitive_interfaces import base, transformer +from d3m.contrib.primitives import compute_scores + +import common_primitives + +__all__ = ('ConstructPredictionsPrimitive',) + +Inputs = container.DataFrame +Outputs = container.DataFrame + + +class Hyperparams(hyperparams.Hyperparams): + use_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to operate on. If metadata reconstruction happens, this is used for reference columns." + " If any specified column is not a primary index or a predicted target, it is skipped.", + ) + exclude_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not operate on. If metadata reconstruction happens, this is used for reference columns. Applicable only if \"use_columns\" is not provided.", + ) + + +class ConstructPredictionsPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): + """ + A primitive which takes as input a DataFrame and outputs a DataFrame in Lincoln Labs predictions + format: first column is a d3mIndex column (and other primary index columns, e.g., for object detection + problem), and then predicted targets, each in its column, followed by optional confidence column(s). + + It supports both input columns annotated with semantic types (``https://metadata.datadrivendiscovery.org/types/PrimaryKey``, + ``https://metadata.datadrivendiscovery.org/types/PrimaryMultiKey``, ``https://metadata.datadrivendiscovery.org/types/PredictedTarget``, + ``https://metadata.datadrivendiscovery.org/types/Confidence``), or trying to reconstruct metadata. + This is why the primitive takes also additional input of a reference DataFrame which should + have metadata to help reconstruct missing metadata. If metadata is missing, the primitive + assumes that all ``inputs`` columns are predicted targets, without confidence column(s). 
+ """ + + metadata = metadata_base.PrimitiveMetadata( + { + 'id': '8d38b340-f83f-4877-baaa-162f8e551736', + 'version': '0.3.0', + 'name': "Construct pipeline predictions output", + 'python_path': 'd3m.primitives.data_transformation.construct_predictions.Common', + 'source': { + 'name': common_primitives.__author__, + 'contact': 'mailto:mitar.commonprimitives@tnode.com', + 'uris': [ + 'https://gitlab.com/datadrivendiscovery/common-primitives/blob/master/common_primitives/construct_predictions.py', + 'https://gitlab.com/datadrivendiscovery/common-primitives.git', + ], + }, + 'installation': [{ + 'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/common-primitives.git@{git_commit}#egg=common_primitives'.format( + git_commit=d3m_utils.current_git_commit(os.path.dirname(__file__)), + ), + }], + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.DATA_CONVERSION, + ], + 'primitive_family': metadata_base.PrimitiveFamily.DATA_TRANSFORMATION, + }, + ) + + def produce(self, *, inputs: Inputs, reference: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: # type: ignore + + index_columns = inputs.metadata.get_index_columns() + target_columns = inputs.metadata.list_columns_with_semantic_types(('https://metadata.datadrivendiscovery.org/types/PredictedTarget',)) + + # Target columns cannot be also index columns. This should not really happen, + # but it could happen with buggy primitives. + target_columns = [target_column for target_column in target_columns if target_column not in index_columns] + + if index_columns and target_columns: + outputs = self._produce_using_semantic_types(inputs, index_columns, target_columns) + else: + outputs = self._produce_reconstruct(inputs, reference, index_columns, target_columns) + + outputs = compute_scores.ComputeScoresPrimitive._encode_columns(outputs) + + # Generally we do not care about column names in DataFrame itself (but use names of columns from metadata), + # but in this case setting column names makes it easier to assure that "to_csv" call produces correct output. + # See: https://gitlab.com/datadrivendiscovery/d3m/issues/147 + column_names = [] + for column_index in range(len(outputs.columns)): + column_names.append(outputs.metadata.query_column(column_index).get('name', outputs.columns[column_index])) + outputs.columns = column_names + + return base.CallResult(outputs) + + def _filter_index_columns(self, inputs_metadata: metadata_base.DataMetadata, index_columns: typing.Sequence[int]) -> typing.Sequence[int]: + if self.hyperparams['use_columns']: + index_columns = [index_column_index for index_column_index in index_columns if index_column_index in self.hyperparams['use_columns']] + if not index_columns: + raise ValueError("No index columns listed in \"use_columns\" hyper-parameter, but index columns are required.") + + else: + index_columns = [index_column_index for index_column_index in index_columns if index_column_index not in self.hyperparams['exclude_columns']] + if not index_columns: + raise ValueError("All index columns listed in \"exclude_columns\" hyper-parameter, but index columns are required.") + + names = [] + for index_column in index_columns: + index_metadata = inputs_metadata.query_column(index_column) + # We do not care about empty strings for names either. 
+ if index_metadata.get('name', None): + names.append(index_metadata['name']) + + if 'd3mIndex' not in names: + raise ValueError("\"d3mIndex\" index column is missing.") + + names_set = set(names) + if len(names) != len(names_set): + duplicate_names = names + for name in names_set: + # Removes just the first occurrence. + duplicate_names.remove(name) + + self.logger.warning("Duplicate names for index columns: %(duplicate_names)s", { + 'duplicate_names': list(set(duplicate_names)), + }) + + return index_columns + + def _get_columns(self, inputs_metadata: metadata_base.DataMetadata, index_columns: typing.Sequence[int], target_columns: typing.Sequence[int]) -> typing.List[int]: + assert index_columns + assert target_columns + + index_columns = self._filter_index_columns(inputs_metadata, index_columns) + + if self.hyperparams['use_columns']: + target_columns = [target_column_index for target_column_index in target_columns if target_column_index in self.hyperparams['use_columns']] + if not target_columns: + raise ValueError("No target columns listed in \"use_columns\" hyper-parameter, but target columns are required.") + + else: + target_columns = [target_column_index for target_column_index in target_columns if target_column_index not in self.hyperparams['exclude_columns']] + if not target_columns: + raise ValueError("All target columns listed in \"exclude_columns\" hyper-parameter, but target columns are required.") + + assert index_columns + assert target_columns + + return list(index_columns) + list(target_columns) + + def _get_confidence_columns(self, inputs_metadata: metadata_base.DataMetadata) -> typing.List[int]: + confidence_columns = inputs_metadata.list_columns_with_semantic_types(('https://metadata.datadrivendiscovery.org/types/Confidence',)) + + if self.hyperparams['use_columns']: + confidence_columns = [confidence_column_index for confidence_column_index in confidence_columns if confidence_column_index in self.hyperparams['use_columns']] + else: + confidence_columns = [confidence_column_index for confidence_column_index in confidence_columns if confidence_column_index not in self.hyperparams['exclude_columns']] + + return confidence_columns + + def _produce_using_semantic_types(self, inputs: Inputs, index_columns: typing.Sequence[int], + target_columns: typing.Sequence[int]) -> Outputs: + confidence_columns = self._get_confidence_columns(inputs.metadata) + + output_columns = self._get_columns(inputs.metadata, index_columns, target_columns) + confidence_columns + + # "get_index_columns" makes sure that "d3mIndex" is always listed first. + # And "select_columns" selects columns in order listed, which then + # always puts "d3mIndex" first. + outputs = inputs.select_columns(output_columns) + + if confidence_columns: + outputs.metadata = self._update_confidence_columns(outputs.metadata, confidence_columns) + + return outputs + + def _update_confidence_columns(self, inputs_metadata: metadata_base.DataMetadata, confidence_columns: typing.Sequence[int]) -> metadata_base.DataMetadata: + output_columns_length = inputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + outputs_metadata = inputs_metadata + + # All confidence columns have to be named "confidence". 
+ for column_index in range(output_columns_length - len(confidence_columns), output_columns_length): + outputs_metadata = outputs_metadata.update((metadata_base.ALL_ELEMENTS, column_index), { + 'name': 'confidence', + }) + + return outputs_metadata + + def _produce_reconstruct(self, inputs: Inputs, reference: Inputs, index_columns: typing.Sequence[int], target_columns: typing.Sequence[int]) -> Outputs: + if not index_columns: + reference_index_columns = reference.metadata.get_index_columns() + + if not reference_index_columns: + raise ValueError("Cannot find an index column in reference data, but index column is required.") + + filtered_index_columns = self._filter_index_columns(reference.metadata, reference_index_columns) + index = reference.select_columns(filtered_index_columns) + else: + filtered_index_columns = self._filter_index_columns(inputs.metadata, index_columns) + index = inputs.select_columns(filtered_index_columns) + + if not target_columns: + if index_columns: + raise ValueError("No target columns in input data, but index column(s) present.") + + # We assume all inputs are targets. + targets = inputs + + # We make sure at least basic metadata is generated correctly, so we regenerate metadata. + targets.metadata = targets.metadata.generate(targets) + + # We set target column names from the reference. We set semantic types. + targets.metadata = self._update_targets_metadata(targets.metadata, self._get_target_names(reference.metadata)) + + else: + targets = inputs.select_columns(target_columns) + + return index.append_columns(targets) + + def multi_produce(self, *, produce_methods: typing.Sequence[str], inputs: Inputs, reference: Inputs, timeout: float = None, iterations: int = None) -> base.MultiCallResult: # type: ignore + return self._multi_produce(produce_methods=produce_methods, timeout=timeout, iterations=iterations, inputs=inputs, reference=reference) + + def fit_multi_produce(self, *, produce_methods: typing.Sequence[str], inputs: Inputs, reference: Inputs, timeout: float = None, iterations: int = None) -> base.MultiCallResult: # type: ignore + return self._fit_multi_produce(produce_methods=produce_methods, timeout=timeout, iterations=iterations, inputs=inputs, reference=reference) + + def _get_target_names(self, metadata: metadata_base.DataMetadata) -> typing.List[typing.Union[str, None]]: + target_names = [] + + for column_index in metadata.list_columns_with_semantic_types(('https://metadata.datadrivendiscovery.org/types/TrueTarget',)): + column_metadata = metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + target_names.append(column_metadata.get('name', None)) + + return target_names + + def _update_targets_metadata(self, metadata: metadata_base.DataMetadata, target_names: typing.Sequence[typing.Union[str, None]]) -> metadata_base.DataMetadata: + targets_length = metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + if targets_length != len(target_names): + raise ValueError("Not an expected number of target columns to apply names for. 
Expected {target_names}, provided {targets_length}.".format( + target_names=len(target_names), + targets_length=targets_length, + )) + + for column_index, target_name in enumerate(target_names): + metadata = metadata.add_semantic_type((metadata_base.ALL_ELEMENTS, column_index), 'https://metadata.datadrivendiscovery.org/types/Target') + metadata = metadata.add_semantic_type((metadata_base.ALL_ELEMENTS, column_index), 'https://metadata.datadrivendiscovery.org/types/PredictedTarget') + + # We do not have it, let's skip it and hope for the best. + if target_name is None: + continue + + metadata = metadata.update_column(column_index, { + 'name': target_name, + }) + + return metadata diff --git a/tods/common-primitives/common_primitives/csv_reader.py b/tods/common-primitives/common_primitives/csv_reader.py new file mode 100644 index 0000000..3f665c0 --- /dev/null +++ b/tods/common-primitives/common_primitives/csv_reader.py @@ -0,0 +1,145 @@ +import typing +import os +from urllib import parse as url_parse + +import frozendict # type: ignore +import pandas # type: ignore + +from d3m import container, utils as d3m_utils +from d3m.metadata import base as metadata_base + +import common_primitives +from common_primitives import base + + +class CSVReaderPrimitive(base.FileReaderPrimitiveBase): + """ + A primitive which reads columns referencing CSV files. + + Each column which has ``https://metadata.datadrivendiscovery.org/types/FileName`` semantic type + and a valid media type (``text/csv``) has every filename read as a pandas DataFrame. By default + the resulting column with read pandas DataFrames is appended to existing columns. + """ + + _supported_media_types = ( + 'text/csv', + ) + _file_structural_type = container.DataFrame + _file_semantic_types = ('https://metadata.datadrivendiscovery.org/types/Table', 'https://metadata.datadrivendiscovery.org/types/Timeseries') + + metadata = metadata_base.PrimitiveMetadata( + { + 'id': '989562ac-b50f-4462-99cb-abef80d765b2', + 'version': '0.1.0', + 'name': 'Columns CSV reader', + 'python_path': 'd3m.primitives.data_preprocessing.csv_reader.Common', + 'keywords': ['CSV', 'reader'], + 'source': { + 'name': common_primitives.__author__, + 'contact': 'mailto:mitar.commonprimitives@tnode.com', + 'uris': [ + 'https://gitlab.com/datadrivendiscovery/common-primitives/blob/master/common_primitives/csv_reader.py', + 'https://gitlab.com/datadrivendiscovery/common-primitives.git', + ], + }, + 'installation': [{ + 'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/common-primitives.git@{git_commit}#egg=common_primitives'.format( + git_commit=d3m_utils.current_git_commit(os.path.dirname(__file__)), + ), + }], + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.FILE_MANIPULATION, + ], + 'supported_media_types': _supported_media_types, + 'primitive_family': metadata_base.PrimitiveFamily.DATA_PREPROCESSING, + } + ) + + def _read_fileuri(self, metadata: frozendict.FrozenOrderedDict, fileuri: str) -> container.DataFrame: + # This is the same logic as used in D3M core package. + # TODO: Deduplicate. + + expected_names = None + if metadata.get('file_columns', None): + expected_names = [] + for column in metadata['file_columns']: + expected_names.append(column['name']) + + # Pandas requires a host for "file" URIs. 
+ parsed_uri = url_parse.urlparse(fileuri, allow_fragments=False) + if parsed_uri.scheme == 'file' and parsed_uri.netloc == '': + parsed_uri = parsed_uri._replace(netloc='localhost') + fileuri = url_parse.urlunparse(parsed_uri) + + data = pandas.read_csv( + fileuri, + usecols=expected_names, + # We do not want to do any conversion of values at this point. + # This should be done by primitives later on. + dtype=str, + # We always expect one row header. + header=0, + # We want empty strings and not NaNs. + na_filter=False, + encoding='utf8', + low_memory=False, + memory_map=True, + ) + + column_names = list(data.columns) + + if expected_names is not None and expected_names != column_names: + raise ValueError("Mismatch between column names in data {column_names} and expected names {expected_names}.".format( + column_names=column_names, + expected_names=expected_names, + )) + + if data is None: + raise FileNotFoundError("Data file for table '{file_path}' cannot be found.".format( + file_path=fileuri, + )) + + data = container.DataFrame(data, { + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': container.DataFrame, + }, generate_metadata=False) + + assert column_names is not None + + for i, column_name in enumerate(column_names): + data.metadata = data.metadata.update((metadata_base.ALL_ELEMENTS, i), { + 'name': column_name, + 'structural_type': str, + }) + + return data + + def _produce_column_metadata(self, inputs_metadata: metadata_base.DataMetadata, column_index: int, + read_files: typing.Sequence[typing.Any]) -> metadata_base.DataMetadata: + # We do not pass "read_files" to parent method but we apply it at the end of this method ourselves. + column_metadata = super()._produce_column_metadata(inputs_metadata, column_index, []) + + column_metadata = column_metadata.update_column(0, { + # Clear metadata useful for filename columns. + 'file_columns': metadata_base.NO_VALUE, + }) + + # We might have metadata about columns, apply it here. + column_meta = inputs_metadata.query_column(column_index) + if column_meta.get('file_columns', None): + for i, column in enumerate(column_meta['file_columns']): + column_metadata = column_metadata.update((metadata_base.ALL_ELEMENTS, 0, metadata_base.ALL_ELEMENTS, i), column) + + # We know which columns are there, but also we know that we are reading everything as strings, so we can set that as well. + column_metadata = column_metadata.update((metadata_base.ALL_ELEMENTS, 0, metadata_base.ALL_ELEMENTS, i), {'structural_type': str}) + + # A DataFrame is always a table as well. + column_metadata = column_metadata.add_semantic_type((metadata_base.ALL_ELEMENTS, 0), 'https://metadata.datadrivendiscovery.org/types/Table') + + # We do not pass "read_files" to parent method but we apply it here ourselves. + # This makes sure that metadata read from data override any metadata from metadata. 
+ for row_index, file in enumerate(read_files): + column_metadata = file.metadata.copy_to(column_metadata, (), (row_index, 0)) + + return column_metadata diff --git a/tods/common-primitives/common_primitives/cut_audio.py b/tods/common-primitives/common_primitives/cut_audio.py new file mode 100644 index 0000000..a4ab162 --- /dev/null +++ b/tods/common-primitives/common_primitives/cut_audio.py @@ -0,0 +1,319 @@ +import os +import typing + +import frozendict # type: ignore + +from d3m import container, utils as d3m_utils +from d3m.base import utils as base_utils +from d3m.metadata import base as metadata_base, hyperparams +from d3m.primitive_interfaces import base, transformer + +import common_primitives + + +Inputs = container.DataFrame +Outputs = container.DataFrame + + +class Hyperparams(hyperparams.Hyperparams): + use_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to operate on. If any specified column is not an audio column, it is skipped." + "Boundary columns are not impacted by this hyper-parameter.", + ) + exclude_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided." + "Boundary columns are not impacted by this hyper-parameter.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='replace', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should columns with cut audio be appended, should they replace original columns, or should only columns with cut audio be returned?", + ) + add_index_columns = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + + +# TODO: Add a hyper-parameter to remove boundary column(s) when replacing. +class CutAudioPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): + """ + A primitive which uses boundary columns to cut audio columns. + + It uses ``http://schema.org/AudioObject`` and structural type ``container.ndarray` to + find columns with audio data. + + It searches for boundary columns referencing them. + Boundary columns are identified by ``https://metadata.datadrivendiscovery.org/types/Interval``, + ``https://metadata.datadrivendiscovery.org/types/IntervalStart`` and + ``https://metadata.datadrivendiscovery.org/types/IntervalEnd`` semantic types. + + It requires that the audio dimension has ``sampling_rate`` metadata set. + + Boundaries are rounded down to samples. Cut is done exclusive: not including the last sample. 
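+
+    A minimal, illustrative sketch of using this primitive (``audio_df`` is a hypothetical
+    container DataFrame with an audio column, matching boundary columns and
+    ``sampling_rate`` metadata)::
+
+        primitive = CutAudioPrimitive(hyperparams=Hyperparams.defaults())
+        cut_df = primitive.produce(inputs=audio_df).value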
+ """ + + metadata = metadata_base.PrimitiveMetadata( + { + 'id': '4ad9ce62-283d-4765-a87b-78b55d89a4ed', + 'version': '0.1.0', + 'name': 'Cut audio columns', + 'python_path': 'd3m.primitives.data_transformation.cut_audio.Common', + 'keywords': ['audio', 'cut'], + 'source': { + 'name': common_primitives.__author__, + 'contact': 'mailto:mitar.commonprimitives@tnode.com', + 'uris': [ + 'https://gitlab.com/datadrivendiscovery/common-primitives/blob/master/common_primitives/cut_audio.py', + 'https://gitlab.com/datadrivendiscovery/common-primitives.git', + ], + }, + 'installation': [{ + 'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/common-primitives.git@{git_commit}#egg=common_primitives'.format( + git_commit=d3m_utils.current_git_commit(os.path.dirname(__file__)), + ), + }], + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.ARRAY_SLICING, + metadata_base.PrimitiveAlgorithmType.AUDIO_STREAM_MANIPULATION, + ], + 'primitive_family': metadata_base.PrimitiveFamily.DATA_TRANSFORMATION, + } + ) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + audio_columns_to_use = self._get_audio_columns(inputs.metadata) + + all_boundary_columns = self._get_boundary_columns(inputs.metadata, audio_columns_to_use) + + output_columns = [self._produce_column(inputs, audio_column, boundary_columns) for audio_column, boundary_columns in all_boundary_columns.items()] + + outputs = base_utils.combine_columns( + inputs, list(all_boundary_columns.keys()), output_columns, + return_result=self.hyperparams['return_result'], add_index_columns=self.hyperparams['add_index_columns'], + ) + + if self.hyperparams['return_result'] == 'replace': + outputs.metadata = self._remove_metadata_references(outputs.metadata, all_boundary_columns) + + return base.CallResult(outputs) + + def _remove_metadata_references(self, inputs_metadata: metadata_base.DataMetadata, all_boundary_columns: typing.Dict[int, typing.List[int]]) -> metadata_base.DataMetadata: + outputs_metadata = inputs_metadata + + # When replacing, boundary columns do not apply anymore to new columns. + for audio_column, boundary_columns in all_boundary_columns.items(): + for boundary_column in boundary_columns: + outputs_metadata = outputs_metadata.update_column(boundary_column, { + 'boundary_for': metadata_base.NO_VALUE, + }) + + return outputs_metadata + + def _produce_column(self, inputs: Inputs, column_index: int, boundary_columns: typing.List[int]) -> Outputs: + cut_audio = [] + for row_index, value in enumerate(inputs.iloc[:, column_index]): + try: + if len(boundary_columns) == 1: + # Float vector is a ndarray vector, so we convert it to a list. 
+ boundaries = list(inputs.iloc[row_index, boundary_columns[0]]) + else: + assert len(boundary_columns) == 2 + + boundaries = [inputs.iloc[row_index, boundary_columns[0]], inputs.iloc[row_index, boundary_columns[1]]] + + cut_audio.append(self._cut_audio(boundaries, inputs.metadata.query((row_index, column_index)), value)) + + except Exception as error: + raise ValueError("Could not cut audio in column {column_index} at row {row_index}.".format( + column_index=column_index, + row_index=row_index, + )) from error + + column = container.DataFrame({inputs.columns[column_index]: cut_audio}, generate_metadata=False) + + column.metadata = self._produce_column_metadata(inputs.metadata, column_index, cut_audio) + column.metadata = column.metadata.generate(column) + + return column + + def _produce_column_metadata(self, inputs_metadata: metadata_base.DataMetadata, column_index: int, + cut_audio: typing.Sequence[container.ndarray]) -> metadata_base.DataMetadata: + column_metadata = inputs_metadata.select_columns([column_index]) + + for row_index, audio in enumerate(cut_audio): + column_metadata = column_metadata.update((row_index, 0), { + 'dimension': { + 'length': len(audio), + } + }) + + return column_metadata + + def _cut_audio(self, boundaries: typing.List[int], metadata: frozendict.FrozenOrderedDict, audio: container.ndarray) -> container.ndarray: + if 'sampling_rate' not in metadata.get('dimension', {}): + raise ValueError("\"sampling_rate\" dimension metadata is missing.") + + sampling_rate = metadata['dimension']['sampling_rate'] + + assert len(boundaries) == 2 + + start = int(sampling_rate * boundaries[0]) + end = int(sampling_rate * boundaries[1]) + + if not 0 <= start <= end: + self.logger.warning("Interval start is out of range: start=%(start)s, end=%(end)s, length=%(length)s", { + 'start': start, + 'end': end, + 'length': len(audio), + }) + if not start <= end <= len(audio): + self.logger.warning("Interval end is out of range: start=%(start)s, end=%(end)s, length=%(length)s", { + 'start': start, + 'end': end, + 'length': len(audio), + }) + + return audio[start:end] + + def _can_use_audio_column(self, inputs_metadata: metadata_base.DataMetadata, column_index: int) -> bool: + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + if not issubclass(column_metadata['structural_type'], container.ndarray): + return False + + if 'http://schema.org/AudioObject' not in column_metadata.get('semantic_types', []): + return False + + return True + + def _get_audio_columns(self, inputs_metadata: metadata_base.DataMetadata) -> typing.List[int]: + def can_use_column(column_index: int) -> bool: + return self._can_use_audio_column(inputs_metadata, column_index) + + columns_to_use, columns_not_to_use = base_utils.get_columns_to_use(inputs_metadata, self.hyperparams['use_columns'], self.hyperparams['exclude_columns'], can_use_column) + + # We are OK if no columns ended up being cut. + # "base_utils.combine_columns" will throw an error if it cannot work with this. + + if self.hyperparams['use_columns'] and columns_not_to_use: + self.logger.warning("Not all specified columns contain audio. Skipping columns: %(columns)s", { + 'columns': columns_not_to_use, + }) + + return columns_to_use + + def _get_boundary_columns(self, inputs_metadata: metadata_base.DataMetadata, audio_columns: typing.List[int]) -> typing.Dict[int, typing.List[int]]: + # In Python 3.6 this dict has deterministic order. 
+ boundary_columns = {} + for audio_column in audio_columns: + boundary_columns_for_column = self._get_boundary_columns_for_column(inputs_metadata, audio_column) + + if boundary_columns_for_column: + boundary_columns[audio_column] = boundary_columns_for_column + else: + # This is OK, not all audio columns should be cut. + self.logger.debug("Audio column %(audio_column)s does not have boundary columns.", { + 'audio_column': audio_column, + }) + + return boundary_columns + + def _get_boundary_columns_for_column(self, inputs_metadata: metadata_base.DataMetadata, audio_column: int) -> typing.List[int]: + """ + If returned list contains one element, then that column is "interval" column. + If it contains two elements, then the first column is "interval start" column, and the second + "interval end" column. + """ + + columns_length = inputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + # In Python 3.6 this dict has deterministic order. + boundary_columns_with_index = {} + + for column_index in range(columns_length): + column_metadata = inputs_metadata.query_column(column_index) + semantic_types = column_metadata.get('semantic_types', []) + + if not any(semantic_type in semantic_types for semantic_type in [ + 'https://metadata.datadrivendiscovery.org/types/Interval', + 'https://metadata.datadrivendiscovery.org/types/IntervalStart', + 'https://metadata.datadrivendiscovery.org/types/IntervalEnd', + ]): + continue + + if audio_column == column_metadata.get('boundary_for', {}).get('column_index', None): + boundary_columns_with_index[column_index] = column_metadata + + if not boundary_columns_with_index: + return [] + + if len(boundary_columns_with_index) == 1: + for column_index, column_metadata in boundary_columns_with_index.items(): + semantic_types = column_metadata.get('semantic_types', []) + + if any(semantic_type in semantic_types for semantic_type in [ + 'https://metadata.datadrivendiscovery.org/types/IntervalStart', + 'https://metadata.datadrivendiscovery.org/types/IntervalEnd', + ]): + self.logger.warning("One boundary column %(boundary_column)s for audio column %(audio_column)s, but invalid semantic types.", { + 'boundary_column': column_index, + 'audio_column': audio_column, + }) + return [] + + assert 'https://metadata.datadrivendiscovery.org/types/Interval' in semantic_types, column_index + + return [column_index] + + elif len(boundary_columns_with_index) == 2: + start_column_index: int = None + end_column_index: int = None + + for column_index, column_metadata in boundary_columns_with_index.items(): + semantic_types = column_metadata.get('semantic_types', []) + + if 'https://metadata.datadrivendiscovery.org/types/Interval' in semantic_types: + self.logger.warning("Two boundary columns %(boundary_columns)s for audio column %(audio_column)s, but boundary column %(boundary_column)s has invalid semantic type.", { + 'boundary_columns': list(boundary_columns_with_index.keys()), + 'boundary_column': column_index, + 'audio_column': audio_column, + }) + return [] + + # It is OK if set one of the variables twice, then the other one will stay "None" + # and we will abort below. 
+ if 'https://metadata.datadrivendiscovery.org/types/IntervalStart' in semantic_types: + start_column_index = column_index + elif 'https://metadata.datadrivendiscovery.org/types/IntervalEnd' in semantic_types: + end_column_index = column_index + else: + assert False, column_index + + if start_column_index is not None and end_column_index is not None: + return [start_column_index, end_column_index] + else: + self.logger.warning("Two boundary columns %(boundary_columns)s for audio column %(audio_column)s, but invalid semantic types.", { + 'boundary_columns': list(boundary_columns_with_index.keys()), + 'audio_column': audio_column, + }) + return [] + + else: + self.logger.warning("Multiple (%(count)s) boundary columns for audio column %(audio_column)s.".format({ + 'count': len(boundary_columns_with_index), + 'audio_column': audio_column, + })) + return [] + + # Not really necessary, but mypy is happier with it. + return [] diff --git a/tods/common-primitives/common_primitives/dataframe_flatten.py b/tods/common-primitives/common_primitives/dataframe_flatten.py new file mode 100644 index 0000000..c9fe291 --- /dev/null +++ b/tods/common-primitives/common_primitives/dataframe_flatten.py @@ -0,0 +1,201 @@ +from typing import List, Any +import os +import csv +import collections + +import frozendict # type: ignore +import pandas as pd # type: ignore + +from d3m import container, exceptions, utils as d3m_utils +from d3m.metadata import base as metadata_base, hyperparams +from d3m.primitive_interfaces import base, transformer +from d3m.base import utils as base_utils + +import common_primitives + +__all__ = ('DataFrameFlattenPrimitive',) + +Inputs = container.DataFrame +Outputs = container.DataFrame + + +class Hyperparams(hyperparams.Hyperparams): + use_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to operate on. If any specified column does not have any semantic type from \"from_semantic_types\", it is skipped.", + ) + exclude_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['replace', 'new'], + default='replace', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should the nested columns be appended, should they replace original columns, or should only the expanded columns be returned?", + ) + add_index_columns = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_columns = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no column is selected/provided. Otherwise issue a warning.", + ) + + +class DataFrameFlattenPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): + """ + Cycles through the input dataframe and flattens the encountered nested structures (series & dataframes). 
+ Flattening involves creating a new row for each nested data row, and replicating the unnested row features. + [ + a, b, [w, x], + c, d, [y, z], + ] + + yields: + + [ + a, b, w, + a, b, x, + c, d, y, + c, d, z + ] + + If the d3m index field is present and set as index, it will be updated to be multi index + as needed. The primitive should be called after the referenced files have + already been nested in the dataframe (using the CSVReader primitive for example). The primitive can + flatten mutiple nested columns, but is currently limited to supporting a nesting depth of 1. + """ + + __author__ = 'Uncharted Software', + metadata = metadata_base.PrimitiveMetadata( + { + 'id': '1c4aed23-f3d3-4e6b-9710-009a9bc9b694', + 'version': '0.1.0', + 'name': 'DataFrame Flatten', + 'python_path': 'd3m.primitives.data_preprocessing.flatten.DataFrameCommon', + 'keywords': ['dataframe', 'flatten'], + 'source': { + 'name': common_primitives.__author__, + 'contact': 'mailto:chris.bethune@uncharted.software', + 'uris': [ + 'https://gitlab.com/datadrivendiscovery/common-primitives/blob/master/common_primitives/dataframe_flatten.py', + 'https://gitlab.com/datadrivendiscovery/common-primitives.git', + ], + }, + 'installation': [{ + 'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/common-primitives.git@{git_commit}#egg=common_primitives'.format( + git_commit=d3m_utils.current_git_commit(os.path.dirname(__file__)), + ), + }], + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.DATA_DENORMALIZATION, + ], + 'primitive_family': metadata_base.PrimitiveFamily.DATA_PREPROCESSING, + } + ) + + def _expand_rows(self, inputs: Inputs, cols_to_expand: List[int], return_result: str, add_index_columns: bool) -> container.DataFrame: + output_data = [] + + # find the index columns and ignore that have nested contents (are flagged for expand) + # Currently needed becaause the CSVReader seems to replicate the filename column metadata into + # the expanded metadata column, causing the PrimaryKey type to show up in the original and nested + index_columns = inputs.metadata.get_index_columns() + index_columns = [col for col in index_columns if col not in cols_to_expand] + + # get the selectors for the metadata we need to copy + metadata_sel: List[Any] = [] + for col_idx in range(len(inputs.columns)): + if col_idx in cols_to_expand: + expand_meta = inputs.metadata.query((metadata_base.ALL_ELEMENTS, col_idx, metadata_base.ALL_ELEMENTS)) + num_sub_cols = expand_meta['dimension']['length'] + for sub_col_idx in range(num_sub_cols): + metadata_sel.append((metadata_base.ALL_ELEMENTS, col_idx, metadata_base.ALL_ELEMENTS, sub_col_idx)) + elif return_result != 'new' or (return_result == 'new' and add_index_columns and col_idx in index_columns): + metadata_sel.append((metadata_base.ALL_ELEMENTS, col_idx)) + + # process every input row + # the nested data will be a series containing a dataframe + for t_row in inputs.itertuples(index=False, name=None): + row_data = [t_row] + + # expand every nested column + # every column to expand essentially becomes a cross product + for col_index in cols_to_expand: + # col_data is the expanded value for that nested column + # row_data is the list of all expanded data for that row + col_data = [] + for e_row in t_row[col_index].itertuples(index=False, name=None): + for s_row in row_data: + if return_result == 'new': + if add_index_columns: + data = [s_row[idx] for idx in index_columns] + col_data.append(data + list(e_row)) + else: + 
col_data.append(e_row) + elif return_result == 'replace': + data = list(s_row) + [data.pop(idx) for idx in cols_to_expand] + col_data.append(data + list(e_row)) + else: + raise ValueError(f"Unsupported return_result '{return_result}'") + row_data = col_data + output_data.extend(row_data) + + # wrap up as a dataframe and reset index now that merging is all done + result = container.DataFrame(output_data, generate_metadata=True) + for col_idx, col_metadata_selector in enumerate(metadata_sel): + result.metadata = inputs.metadata.copy_to(result.metadata, col_metadata_selector, (metadata_base.ALL_ELEMENTS, col_idx)) + + result.reset_index(inplace=True, drop=True) + return result + + def _get_columns(self, inputs_metadata: metadata_base.DataMetadata) -> List[int]: + def _is_nested(col_index: int) -> bool: + t = inputs_metadata.query((metadata_base.ALL_ELEMENTS, col_index))['structural_type'] + return issubclass(t, container.DataFrame) + + columns_to_use, columns_not_to_use = base_utils.get_columns_to_use( + inputs_metadata, + self.hyperparams['use_columns'], + self.hyperparams['exclude_columns'], + _is_nested, + ) + + # We are OK if no columns ended up being encoded. + # "base_utils.combine_columns" will throw an error if it cannot work with this. + if self.hyperparams['use_columns'] and columns_not_to_use: + self.logger.warning("Not all specified columns can be encoded. Skipping columns: %(columns)s", { + 'columns': columns_not_to_use, + }) + + return columns_to_use + + def produce(self, *, + inputs: Inputs, + timeout: float = None, + iterations: int = None) -> base.CallResult[Outputs]: + + container_dataframe = inputs + + to_expand_index = self._get_columns(inputs.metadata) + if len(to_expand_index) > 0: + inputs_clone = inputs.copy() + container_dataframe = self._expand_rows(inputs_clone, to_expand_index, self.hyperparams['return_result'], self.hyperparams['add_index_columns']) + else: + if self.hyperparams['error_on_no_columns']: + raise ValueError('No columns need flattening') + else: + self.logger.warning('No columns required flattening') + + # wrap as a D3M container + return base.CallResult(container_dataframe) diff --git a/tods/common-primitives/common_primitives/dataframe_image_reader.py b/tods/common-primitives/common_primitives/dataframe_image_reader.py new file mode 100644 index 0000000..a2ef6cc --- /dev/null +++ b/tods/common-primitives/common_primitives/dataframe_image_reader.py @@ -0,0 +1,94 @@ +import os + +import frozendict # type: ignore +import imageio # type: ignore +import numpy # type: ignore + +from d3m import container, utils as d3m_utils +from d3m.metadata import base as metadata_base + +import common_primitives +from common_primitives import base + + +class DataFrameImageReaderPrimitive(base.FileReaderPrimitiveBase): + """ + A primitive which reads columns referencing image files. + + Each column which has ``https://metadata.datadrivendiscovery.org/types/FileName`` semantic type + and a valid media type (``image/jpeg``, ``image/png``) has every filename read into an image + represented as a numpy array. By default the resulting column with read arrays is appended + to existing columns. + + The shape of numpy arrays is H x W x C. C is the number of channels in an image + (e.g., C = 1 for greyscale, C = 3 for RGB), H is the height, and W is the width. + dtype is uint8. 
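+
+    A minimal, illustrative sketch (``filenames_df`` is a hypothetical container DataFrame
+    whose column carries the ``FileName`` semantic type; the hyper-parameters class is
+    obtained through the primitive's metadata, the usual d3m pattern for primitives which
+    inherit their hyper-parameters from a base class)::
+
+        hyperparams_class = DataFrameImageReaderPrimitive.metadata.get_hyperparams()
+        primitive = DataFrameImageReaderPrimitive(hyperparams=hyperparams_class.defaults())
+        images_df = primitive.produce(inputs=filenames_df).value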
+ """ + + _supported_media_types = ( + 'image/jpeg', + 'image/png', + ) + _file_structural_type = container.ndarray + _file_semantic_types = ('http://schema.org/ImageObject',) + + __author__ = 'University of Michigan, Ali Soltani' + metadata = metadata_base.PrimitiveMetadata( + { + 'id': '8f2e51e8-da59-456d-ae29-53912b2b9f3d', + 'version': '0.2.0', + 'name': 'Columns image reader', + 'python_path': 'd3m.primitives.data_preprocessing.image_reader.Common', + 'keywords': ['image', 'reader', 'jpg', 'png'], + 'source': { + 'name': common_primitives.__author__, + 'contact': 'mailto:alsoltan@umich.edu', + 'uris': [ + 'https://gitlab.com/datadrivendiscovery/common-primitives/blob/master/common_primitives/dataframe_image_reader.py', + 'https://gitlab.com/datadrivendiscovery/common-primitives.git', + ], + }, + 'installation': [{ + 'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/common-primitives.git@{git_commit}#egg=common_primitives'.format( + git_commit=d3m_utils.current_git_commit(os.path.dirname(__file__)), + ), + }], + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.FILE_MANIPULATION, + ], + 'supported_media_types': _supported_media_types, + 'primitive_family': metadata_base.PrimitiveFamily.DATA_PREPROCESSING, + } + ) + + def _read_fileuri(self, metadata: frozendict.FrozenOrderedDict, fileuri: str) -> container.ndarray: + image_array = imageio.imread(fileuri) + + image_reader_metadata = image_array.meta + + # "imread" does not necessary always return uint8 dtype, but for PNG and JPEG files it should. + assert image_array.dtype == numpy.uint8, image_array.dtype + + if image_array.ndim == 2: + # Make sure there are always three dimensions. + image_array = image_array.reshape(list(image_array.shape) + [1]) + + assert image_array.ndim == 3, image_array.ndim + + image_array = container.ndarray(image_array, { + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': container.ndarray, + }, generate_metadata=False) + + # There might be custom metadata available, let's store it. + # TODO: Add metadata which channel is which color (probably by providing metadata about the color space). + # It should probably go to "dimension" section for the "channels" dimension, for example, color space + # "RGB" would say that the dimension has to be of length 3 and has colors in this order. + # We could also set names for each dimension ("height", "width", "channels"). + # We should probably also add semantic types to mark these dimensions. + image_array.metadata = image_array.metadata.update((), { + 'image_reader_metadata': image_reader_metadata, + }) + + return image_array diff --git a/tods/common-primitives/common_primitives/dataframe_to_list.py b/tods/common-primitives/common_primitives/dataframe_to_list.py new file mode 100644 index 0000000..d3e5b66 --- /dev/null +++ b/tods/common-primitives/common_primitives/dataframe_to_list.py @@ -0,0 +1,54 @@ +import os +import typing + +from d3m import container, utils as d3m_utils +from d3m.metadata import base as metadata_base +from d3m.metadata import hyperparams +from d3m.primitive_interfaces import base, transformer + +import common_primitives + +__all__ = ('DataFrameToListPrimitive',) + +Inputs = container.DataFrame +Outputs = container.List + + +class Hyperparams(hyperparams.Hyperparams): + pass + + +class DataFrameToListPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): + """ + A primitive which converts a pandas dataframe into a list of rows. 
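+
+    A minimal, illustrative sketch (``df`` is a hypothetical container DataFrame)::
+
+        primitive = DataFrameToListPrimitive(hyperparams=Hyperparams.defaults())
+        output_list = primitive.produce(inputs=df).value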
+ """ + + metadata = metadata_base.PrimitiveMetadata( + { + 'id': '901ff55d-0a0a-4bfd-8195-3a947ba2a8f5', + 'version': '0.1.0', + 'name': "DataFrame to list converter", + 'python_path': 'd3m.primitives.data_transformation.dataframe_to_list.Common', + 'source': { + 'name': common_primitives.__author__, + 'contact': 'mailto:mitar.commonprimitives@tnode.com', + 'uris': [ + 'https://gitlab.com/datadrivendiscovery/common-primitives/blob/master/common_primitives/dataframe_to_list.py', + 'https://gitlab.com/datadrivendiscovery/common-primitives.git', + ], + }, + 'installation': [{ + 'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/common-primitives.git@{git_commit}#egg=common_primitives'.format( + git_commit=d3m_utils.current_git_commit(os.path.dirname(__file__)), + ), + }], + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.DATA_CONVERSION, + ], + 'primitive_family': metadata_base.PrimitiveFamily.DATA_TRANSFORMATION, + }, + ) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + return base.CallResult(container.List(inputs, generate_metadata=True)) diff --git a/tods/common-primitives/common_primitives/dataframe_to_ndarray.py b/tods/common-primitives/common_primitives/dataframe_to_ndarray.py new file mode 100644 index 0000000..02d2075 --- /dev/null +++ b/tods/common-primitives/common_primitives/dataframe_to_ndarray.py @@ -0,0 +1,54 @@ +import os +import typing + +from d3m import container, utils as d3m_utils +from d3m.metadata import base as metadata_base +from d3m.metadata import hyperparams +from d3m.primitive_interfaces import base, transformer + +import common_primitives + +__all__ = ('DataFrameToNDArrayPrimitive',) + +Inputs = container.DataFrame +Outputs = container.ndarray + + +class Hyperparams(hyperparams.Hyperparams): + pass + + +class DataFrameToNDArrayPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): + """ + A primitive which converts a pandas dataframe into a numpy array. 
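+
+    A minimal, illustrative sketch (``df`` is a hypothetical container DataFrame)::
+
+        primitive = DataFrameToNDArrayPrimitive(hyperparams=Hyperparams.defaults())
+        array = primitive.produce(inputs=df).value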
+ """ + + metadata = metadata_base.PrimitiveMetadata( + { + 'id': '34f71b2e-17bb-488d-a2ba-b60b8c305539', + 'version': '0.1.0', + 'name': "DataFrame to ndarray converter", + 'python_path': 'd3m.primitives.data_transformation.dataframe_to_ndarray.Common', + 'source': { + 'name': common_primitives.__author__, + 'contact': 'mailto:mitar.commonprimitives@tnode.com', + 'uris': [ + 'https://gitlab.com/datadrivendiscovery/common-primitives/blob/master/common_primitives/dataframe_to_ndarray.py', + 'https://gitlab.com/datadrivendiscovery/common-primitives.git', + ], + }, + 'installation': [{ + 'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/common-primitives.git@{git_commit}#egg=common_primitives'.format( + git_commit=d3m_utils.current_git_commit(os.path.dirname(__file__)), + ), + }], + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.DATA_CONVERSION, + ], + 'primitive_family': metadata_base.PrimitiveFamily.DATA_TRANSFORMATION, + }, + ) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + return base.CallResult(container.ndarray(inputs, generate_metadata=True)) diff --git a/tods/common-primitives/common_primitives/dataframe_utils.py b/tods/common-primitives/common_primitives/dataframe_utils.py new file mode 100644 index 0000000..dd84133 --- /dev/null +++ b/tods/common-primitives/common_primitives/dataframe_utils.py @@ -0,0 +1,46 @@ +import typing + +from d3m import exceptions, utils +from d3m.container import pandas as container_pandas + + +def select_rows(resource: container_pandas.DataFrame, row_indices_to_keep: typing.Sequence[int]) -> container_pandas.DataFrame: + if not isinstance(resource, container_pandas.DataFrame): + raise exceptions.InvalidArgumentTypeError("Only DataFrame resources can have rows selected, not '{type}'.".format(type=type(resource))) + + row_indices = sorted(row_indices_to_keep) + resource = resource.iloc[row_indices, :].reset_index(drop=True) + + # TODO: Expose this as a general metadata method. + # In that case this has to be done recursively over all nested ALL_ELEMENTS. + # Here we are operating at resource level so we have to iterate only over first + # ALL_ELEMENTS and resource's element itself. + + # Change the metadata. Update the number of rows in the split. + # This makes a copy so that we can modify metadata in-place. + resource.metadata = resource.metadata.update( + (), + { + 'dimension': { + 'length': len(row_indices), + }, + }, + ) + + # Remove all rows not in this split and reorder those which are. 
+ for element_metadata_entry in [ + resource.metadata._current_metadata, + ]: + if element_metadata_entry is None: + continue + + elements = element_metadata_entry.elements + new_elements_evolver = utils.EMPTY_PMAP.evolver() + for i, row_index in enumerate(row_indices): + if row_index in elements: + new_elements_evolver.set(i, elements[row_index]) + element_metadata_entry.elements = new_elements_evolver.persistent() + element_metadata_entry.is_elements_empty = not element_metadata_entry.elements + element_metadata_entry.update_is_empty() + + return resource diff --git a/tods/common-primitives/common_primitives/datamart_augment.py b/tods/common-primitives/common_primitives/datamart_augment.py new file mode 100644 index 0000000..796941a --- /dev/null +++ b/tods/common-primitives/common_primitives/datamart_augment.py @@ -0,0 +1,106 @@ +import os + +import datamart # type: ignore + +from d3m import container +from d3m import utils as d3m_utils +from d3m.metadata import base as metadata_base, hyperparams +from d3m.primitive_interfaces.base import CallResult +from d3m.primitive_interfaces.transformer import TransformerPrimitiveBase + +import common_primitives + +__all__ = ('DataMartAugmentPrimitive',) + +Inputs = container.Dataset +Outputs = container.Dataset + + +class ColumnHyperparams(hyperparams.Hyperparams, set_names=False): + resource_id = hyperparams.Hyperparameter[str]('') + column_index = hyperparams.Hyperparameter[int](-1) + + +class Hyperparams(hyperparams.Hyperparams): + search_result = hyperparams.Hyperparameter[str]( + default='', + semantic_types=[ + 'https://metadata.datadrivendiscovery.org/types/ControlParameter', + ], + description="Serialized search result provided by Datamart", + ) + system_identifier = hyperparams.Hyperparameter[str]( + default='', + semantic_types=[ + 'https://metadata.datadrivendiscovery.org/types/ControlParameter', + ], + description="Which Datamart system this search result is from", + ) + augment_columns = hyperparams.Set( + elements=ColumnHyperparams, + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Optional list of columns from the Datamart dataset that will be added" + ) + + +class DataMartAugmentPrimitive(TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): + """ + Augment supplied dataset with additional columns. + + Use ``DATAMART_NYU_URL`` and ``DATAMART_ISI_URL`` environment variables to control where + can the primitive connect to respective DataMarts. 
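+
+    A minimal, illustrative sketch (``serialized_result`` is a hypothetical search result
+    string obtained from a Datamart client and ``dataset`` is a container Dataset; the
+    hyper-parameter values are set with the usual ``replace`` pattern)::
+
+        hp = Hyperparams.defaults().replace({
+            'search_result': serialized_result,
+            'system_identifier': 'NYU',
+        })
+        augmented = DataMartAugmentPrimitive(hyperparams=hp).produce(inputs=dataset).value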
+ """ + + metadata = metadata_base.PrimitiveMetadata({ + 'id': 'fe0f1ac8-1d39-463a-b344-7bd498a31b91', + 'version': '0.1', + 'name': "Perform dataset augmentation using Datamart", + 'python_path': 'd3m.primitives.data_augmentation.datamart_augmentation.Common', + 'source': { + 'name': common_primitives.__author__, + 'contact': 'mailto:remi.rampin@nyu.edu', + 'uris': [ + 'https://gitlab.com/datadrivendiscovery/common-primitives/blob/master/common_primitives/datamart_augment.py', + 'https://gitlab.com/datadrivendiscovery/common-primitives.git', + ], + }, + 'installation': [{ + 'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/common-primitives.git@{git_commit}#egg=common_primitives'.format( + git_commit=d3m_utils.current_git_commit( + os.path.dirname(__file__)), + ), + }], + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.DATA_RETRIEVAL, + ], + 'primitive_family': metadata_base.PrimitiveFamily.DATA_AUGMENTATION, + 'pure_primitive': False, + }) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + search_result = self.hyperparams['search_result'] + system_identifier = self.hyperparams['system_identifier'] + augment_columns = [datamart.DatasetColumn(**augment_column) for augment_column in self.hyperparams['augment_columns']] + if not augment_columns: + augment_columns = None + + # Get the URL for this system from the environment (can be None) + system_url = os.environ.get('DATAMART_URL_{}'.format(system_identifier)) + + # Deserialize search result + if system_identifier == 'NYU': + import datamart_rest # type: ignore + + search_result_loaded = datamart_rest.RESTSearchResult.deserialize(search_result) + elif system_identifier == 'ISI': + import datamart_isi.rest # type: ignore + + search_result_loaded = datamart_isi.rest.RESTSearchResult.deserialize(search_result) + else: + raise ValueError("Unknown Datamart system {}".format(system_identifier)) + + # Perform augment + output = search_result_loaded.augment(supplied_data=inputs, augment_columns=augment_columns, connection_url=system_url) + return CallResult(output) diff --git a/tods/common-primitives/common_primitives/datamart_download.py b/tods/common-primitives/common_primitives/datamart_download.py new file mode 100644 index 0000000..048c269 --- /dev/null +++ b/tods/common-primitives/common_primitives/datamart_download.py @@ -0,0 +1,91 @@ +import os + +from d3m import container +from d3m import utils as d3m_utils +from d3m.metadata import base as metadata_base, hyperparams +from d3m.primitive_interfaces.base import CallResult +from d3m.primitive_interfaces.transformer import TransformerPrimitiveBase + +import common_primitives + +__all__ = ('DataMartDownloadPrimitive',) + + +Inputs = container.Dataset +Outputs = container.Dataset + + +class Hyperparams(hyperparams.Hyperparams): + search_result = hyperparams.Hyperparameter[str]( + default='', + semantic_types=[ + 'https://metadata.datadrivendiscovery.org/types/ControlParameter', + ], + description="Serialized search result provided by Datamart", + ) + system_identifier = hyperparams.Hyperparameter[str]( + default='', + semantic_types=[ + 'https://metadata.datadrivendiscovery.org/types/ControlParameter', + ], + description="Which Datamart system this search result is from", + ) + + +class DataMartDownloadPrimitive(TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): + """ + Download a dataset from DataMart. 
+ + Use ``DATAMART_NYU_URL`` and ``DATAMART_ISI_URL`` environment variables to control where + can the primitive connect to respective DataMarts. + """ + + metadata = metadata_base.PrimitiveMetadata({ + 'id': '9e2077eb-3e38-4df1-99a5-5e647d21331f', + 'version': '0.1', + 'name': "Download a dataset from Datamart", + 'python_path': 'd3m.primitives.data_augmentation.datamart_download.Common', + 'source': { + 'name': common_primitives.__author__, + 'contact': 'mailto:remi.rampin@nyu.edu', + 'uris': [ + 'https://gitlab.com/datadrivendiscovery/common-primitives/blob/master/common_primitives/datamart_download.py', + 'https://gitlab.com/datadrivendiscovery/common-primitives.git', + ], + }, + 'installation': [{ + 'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/common-primitives.git@{git_commit}#egg=common_primitives'.format( + git_commit=d3m_utils.current_git_commit( + os.path.dirname(__file__)), + ), + }], + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.DATA_RETRIEVAL, + ], + 'primitive_family': metadata_base.PrimitiveFamily.DATA_AUGMENTATION, + 'pure_primitive': False, + }) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + search_result = self.hyperparams['search_result'] + system_identifier = self.hyperparams['system_identifier'] + + # Get the URL for this system from the environment (can be None) + system_url = os.environ.get('DATAMART_URL_{}'.format(system_identifier)) + + # Deserialize search result + if system_identifier == 'NYU': + import datamart_rest # type: ignore + + search_result_loaded = datamart_rest.RESTSearchResult.deserialize(search_result) + elif system_identifier == 'ISI': + import datamart_isi.rest # type: ignore + + search_result_loaded = datamart_isi.rest.RESTSearchResult.deserialize(search_result) + else: + raise ValueError("Unknown Datamart system {}".format(system_identifier)) + + # Perform download + output = search_result_loaded.download(supplied_data=inputs, connection_url=system_url) + return CallResult(output) diff --git a/tods/common-primitives/common_primitives/dataset_map.py b/tods/common-primitives/common_primitives/dataset_map.py new file mode 100644 index 0000000..2dfa1be --- /dev/null +++ b/tods/common-primitives/common_primitives/dataset_map.py @@ -0,0 +1,375 @@ +import collections +import copy +import os.path +import typing + +from d3m import container, exceptions, index, utils as d3m_utils +from d3m.base import utils as base_utils +from d3m.metadata import base as metadata_base, hyperparams as hyperparams_module, params +from d3m.primitive_interfaces import base, transformer, unsupervised_learning + +import common_primitives + + +Inputs = container.Dataset +Outputs = container.Dataset + + +class Params(params.Params): + # For resource in a dataset we have potentially params of a primitive. + # Or we have one for all resources if "continue_fit" is enabled. + # TODO: Remove workaround of "Any" once resolved in pytypes. + # See: https://github.com/Stewori/pytypes/issues/56 + # Restore to: resource_params: typing.Optional[typing.Union[typing.Dict[str, params.Params], params.Params]] + resource_params: typing.Optional[typing.Any] + + +class Hyperparams(hyperparams_module.Hyperparams): + # TODO: How to specify that input type of allowed primitive has to be "DataFrame". 
+ # See: https://gitlab.com/datadrivendiscovery/d3m/issues/335 + primitive = hyperparams_module.Union[typing.Union[transformer.TransformerPrimitiveBase, unsupervised_learning.UnsupervisedLearnerPrimitiveBase]]( + configuration=collections.OrderedDict( + transformer=hyperparams_module.Primitive[transformer.TransformerPrimitiveBase]( # type: ignore + # TODO: This default in fact gets List as input and produces List. Not DataFrame. + # But in fact it just passes through whatever it gets, so it works out. + # See: https://gitlab.com/datadrivendiscovery/d3m/issues/214 + default=index.get_primitive('d3m.primitives.operator.null.TransformerTest'), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A transformer primitive.", + ), + unsupervised_learner=hyperparams_module.Primitive[unsupervised_learning.UnsupervisedLearnerPrimitiveBase]( # type: ignore + # TODO: This default in fact gets List as input and produces List. Not DataFrame. + # But in fact it just passes through whatever it gets, so it works out. + # See: https://gitlab.com/datadrivendiscovery/d3m/issues/214 + default=index.get_primitive('d3m.primitives.operator.null.UnsupervisedLearnerTest'), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="An unsupervised learner primitive. If it is already fitted and you do not want to re-fit it, " + "set \"fit_primitive\" to \"no\".", + ), + ), + default='transformer', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A primitive to use for mapping of \"DataFrame\" resources. Has to take \"DataFrame\" as input.", + ) + fit_primitive = hyperparams_module.Enumeration( + values=['no', 'fit', 'continue_fit'], + default='fit', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Fit an unsupervised learner primitive or not.", + ) + produce_method = hyperparams_module.Hyperparameter[str]( + default='produce', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Name of primitive's produce method to use.", + ) + resources = hyperparams_module.Union[typing.Union[typing.Sequence[str], str]]( + configuration=collections.OrderedDict( + resource_ids=hyperparams_module.Set( + elements=hyperparams_module.Hyperparameter[str]( + # Default is ignored. + # TODO: Remove default. 
See: https://gitlab.com/datadrivendiscovery/d3m/issues/141 + default='', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Resource ID to map.", + ), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Map resources matching specified resource IDs.", + ), + all=hyperparams_module.Constant( + default="all", + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Map all dataset resources.", + ), + entry_point=hyperparams_module.Constant( + default='entry_point', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Map the dataset entry point, if dataset has one, " + "or the only resource in the dataset, if there is only one.", + ), + ), + default='entry_point', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Which resources should the primitive map.", + ) + error_on_no_resources = hyperparams_module.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no resource is selected/provided. Otherwise issue a warning.", + ) + + +# TODO: Implement optimized "fit_multi_produce" which calls "fit_multi_produce" of underlying primitive. +class DataFrameDatasetMapPrimitive(unsupervised_learning.UnsupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]): + """ + A primitive which for dataset entry point ``DataFrame`` resource (by default) + runs provided ``primitive`` on it, producing a new resource. + + ``primitive`` can be transformer or fitted or unfitted unsupervised learner primitive. + If it is already fitted and you do not want to re-fit it, set ``fit_primitive`` to ``no``. + Otherwise, if ``fit_primitive`` is set to ``fit``, for resource a copy of the + primitive will be made and it will be first fitted and then produced on that resource. + If ``fit_primitive`` is set to ``continue_fit``, the primitive is continue fitted on + all resources in the dataset, in resource ID order. + + Input to the ``primitive`` has to be container ``DataFrame``, but output can be any + container type. + """ + + metadata = metadata_base.PrimitiveMetadata( + { + 'id': '5bef5738-1638-48d6-9935-72445f0eecdc', + 'version': '0.1.0', + 'name': "Map DataFrame resources to new resources using provided primitive", + 'python_path': 'd3m.primitives.operator.dataset_map.DataFrameCommon', + 'source': { + 'name': common_primitives.__author__, + 'contact': 'mailto:mitar.commonprimitives@tnode.com', + 'uris': [ + 'https://gitlab.com/datadrivendiscovery/common-primitives/blob/master/common_primitives/dataset_map.py', + 'https://gitlab.com/datadrivendiscovery/common-primitives.git', + ], + }, + 'installation': [{ + 'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/common-primitives.git@{git_commit}#egg=common_primitives'.format( + git_commit=d3m_utils.current_git_commit(os.path.dirname(__file__)), + ), + }], + 'algorithm_types': [ + # TODO: Change to "MAP". 
+ metadata_base.PrimitiveAlgorithmType.DATA_CONVERSION, + ], + 'primitive_family': metadata_base.PrimitiveFamily.OPERATOR, + }, + ) + + def __init__(self, *, hyperparams: Hyperparams) -> None: + super().__init__(hyperparams=hyperparams) + + self._training_inputs: Inputs = None + self._resource_primitives: typing.Union[typing.Dict[str, base.PrimitiveBase], base.PrimitiveBase] = None + self._fitted: bool = False + + def _should_fit(self) -> bool: + if self.hyperparams['fit_primitive'] == 'no': + return False + + if isinstance(self.hyperparams['primitive'], transformer.TransformerPrimitiveBase): + return False + + if self.hyperparams['fit_primitive'] == 'continue_fit' and not isinstance(self.hyperparams['primitive'], base.ContinueFitMixin): + raise exceptions.InvalidArgumentValueError("\"fit_primitive\" hyper-parameter is set to \"continue_fit\", but primitive does not inherit the \"ContinueFitMixin\" class.") + + return True + + def set_training_data(self, *, inputs: Inputs) -> None: # type: ignore + if not self._should_fit(): + return + + self._training_inputs = inputs + self._fitted = False + + def fit(self, *, timeout: float = None, iterations: int = None) -> base.CallResult[None]: + if not self._should_fit(): + return base.CallResult(None) + + if self._training_inputs is None: + raise exceptions.InvalidStateError("Missing training data.") + + self._resource_primitives = self._fit_resources(self._training_inputs) + self._fitted = True + + return base.CallResult(None) + + def _fit_resources(self, inputs: Inputs) -> typing.Union[typing.Dict[str, base.PrimitiveBase], base.PrimitiveBase]: + resources_to_use = self._get_resources(inputs) + + if self.hyperparams['fit_primitive'] == 'fit': + primitive = None + resource_primitives: typing.Union[typing.Dict[str, base.PrimitiveBase], base.PrimitiveBase] = {} + else: + # We just use provided primitive as-is. Runtime already copies it once for us. + primitive = self.hyperparams['primitive'] + resource_primitives = primitive + + for resource_id in resources_to_use: + resource = self._prepare_resource(inputs.metadata, inputs[resource_id], resource_id) + + # If "fit_primitive" is "continue_fit" we have only + # one primitive instance for the whole dataset. 
+ if self.hyperparams['fit_primitive'] == 'fit': + primitive = copy.deepcopy(self.hyperparams['primitive']) + typing.cast(typing.Dict[str, base.PrimitiveBase], resource_primitives)[resource_id] = primitive + + primitive.set_training_data(inputs=resource) + + if self.hyperparams['fit_primitive'] == 'fit': + primitive.fit() + else: + assert self.hyperparams['fit_primitive'] == 'continue_fit' + primitive.continue_fit() + + return resource_primitives + + def _prepare_resource(self, inputs_metadata: metadata_base.DataMetadata, resource: container.DataFrame, resource_id: str) -> container.DataFrame: + assert isinstance(resource, container.DataFrame) + + resource = copy.copy(resource) + + resource.metadata = metadata_base.DataMetadata({ + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + }) + + resource.metadata = inputs_metadata.copy_to( + resource.metadata, + (resource_id,), + ) + + return resource + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + if self._should_fit() and not self._fitted: + raise exceptions.PrimitiveNotFittedError("Primitive not fitted.") + + assert self._should_fit() == self._fitted + assert (self._resource_primitives is not None) == self._fitted + + if self.hyperparams['produce_method'] != 'produce' and not self.hyperparams['produce_method'].startswith('produce_'): + raise exceptions.InvalidArgumentValueError(f"Invalid produce method name in \"produce_method\" hyper-parameter: {self.hyperparams['produce_method']}") + + outputs = self._produce_dataset(inputs, self._resource_primitives) + + return base.CallResult(outputs) + + def _get_resources(self, inputs: Inputs) -> typing.List[str]: + if self.hyperparams['resources'] == 'all': + # We sort so that we potentially continue fit in resource ID order. + resources_to_use = sorted( + resource_id for resource_id, resource in inputs.items() + if isinstance(resource, container.DataFrame) + ) + resources_not_to_use: typing.List[str] = [] + elif self.hyperparams['resources'] == 'entry_point': + try: + resources_to_use = [ + base_utils.get_tabular_resource_metadata( + inputs.metadata, + None, + pick_entry_point=True, + pick_one=True, + ), + ] + except ValueError: + resources_to_use = [] + resources_not_to_use = [] + else: + resources_not_to_use = [ + resource_id for resource_id in self.hyperparams['resources'] + if resource_id not in inputs or not isinstance(inputs[resource_id], container.DataFrame) + ] + # We sort so that we potentially continue fit in resource ID order. + resources_to_use = sorted( + resource_id for resource_id in self.hyperparams['resources'] + if resource_id not in resources_not_to_use + ) + + if not resources_to_use: + if self.hyperparams['error_on_no_resources']: + raise ValueError("No inputs resources.") + else: + self.logger.warning("No inputs resources.") + + if self.hyperparams['resources'] not in ['all', 'entry_point'] and resources_not_to_use: + self.logger.warning("Not all specified inputs resources can be used. 
Skipping resources: %(resources)s", { + 'resources': resources_not_to_use, + }) + + return resources_to_use + + def _produce_dataset( + self, inputs: Inputs, + resource_primitives: typing.Optional[typing.Union[typing.Dict[str, base.PrimitiveBase], base.PrimitiveBase]], + ) -> Outputs: + resources_to_use = self._get_resources(inputs) + + outputs = inputs.copy() + + for resource_id in resources_to_use: + self._produce_resource(outputs, resource_id, resource_primitives) + + return outputs + + # TODO: Instead of copying metadata to a resource and then back, we could maybe just hack it by setting a correct reference. + # So resource metadata would point directly into dataset's metadata object for + # element corresponding to the resource. How would that work if there is any metadata + # on dataset's ALL_ELEMENTS? For updating it does not matter because resource metadata + # has precedence anyway? But we would still first have to copy metadata from ALL_ELEMENTS + # to resource metadata so that it is available there for querying. + def _produce_resource( + self, outputs: Outputs, resource_id: str, + resource_primitives: typing.Optional[typing.Union[typing.Dict[str, base.PrimitiveBase], base.PrimitiveBase]], + ) -> None: + if resource_primitives is not None: + if self.hyperparams['fit_primitive'] == 'fit': + primitive = typing.cast(typing.Dict[str, base.PrimitiveBase], resource_primitives)[resource_id] + else: + assert self.hyperparams['fit_primitive'] == 'continue_fit' + # When "fit_primitive" is "continue_fit", we have only + # one primitive instance for the whole dataset. + primitive = typing.cast(base.PrimitiveBase, resource_primitives) + else: + # It could be that "fit_primitive" is "no" or that we have a transformer primitive. + primitive = self.hyperparams['primitive'] + + resource = self._prepare_resource(outputs.metadata, outputs[resource_id], resource_id) + + output_resource = getattr(primitive, self.hyperparams['produce_method'])(inputs=resource).value + + outputs[resource_id] = output_resource + + outputs.metadata = outputs.metadata.remove((resource_id,), recursive=True) + outputs.metadata = output_resource.metadata.copy_to( + outputs.metadata, + (), + (resource_id,), + ) + + # TODO: Should we compact metadata? It could make it nicer. 
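
The bookkeeping implemented in `_fit_resources` and `_produce_resource` above reduces to a simple rule: with `fit_primitive` set to `'fit'`, every mapped resource gets its own deep copy of the configured primitive, while `'continue_fit'` keeps a single instance that is fitted incrementally across resources in sorted resource-ID order. The following standalone sketch only illustrates that dispatch; `ToyLearner` and `fit_resources` are made-up stand-ins for a real d3m primitive and are not part of this module:
```python
import copy


class ToyLearner:
    """Stand-in for an unsupervised learner; it only counts the rows it has seen."""

    def __init__(self) -> None:
        self.rows_seen = 0

    def fit(self, resource) -> None:
        # Fresh fit on a single resource.
        self.rows_seen = len(resource)

    def continue_fit(self, resource) -> None:
        # Incremental fit across resources.
        self.rows_seen += len(resource)


def fit_resources(template, resources, fit_primitive):
    if fit_primitive == 'fit':
        # One independent, deep-copied learner per resource.
        fitted = {}
        for resource_id in sorted(resources):
            learner = copy.deepcopy(template)
            learner.fit(resources[resource_id])
            fitted[resource_id] = learner
        return fitted
    # 'continue_fit': a single shared learner, updated in resource-ID order.
    for resource_id in sorted(resources):
        template.continue_fit(resources[resource_id])
    return template


resources = {'learningData': [1, 2, 3], '0': [4, 5]}
print({rid: p.rows_seen for rid, p in fit_resources(ToyLearner(), resources, 'fit').items()})  # {'0': 2, 'learningData': 3}
print(fit_resources(ToyLearner(), resources, 'continue_fit').rows_seen)  # 5
```
In the real primitive, `produce` then looks up the fitted copy by resource ID (or reuses the shared instance) before calling the configured produce method.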
+
+    def get_params(self) -> Params:
+        if not self._fitted:
+            return Params(
+                resource_params=None,
+            )
+
+        elif isinstance(self._resource_primitives, dict):
+            return Params(
+                resource_params={
+                    resource_id: primitive.get_params()
+                    for resource_id, primitive in self._resource_primitives.items()
+                },
+            )
+
+        else:
+            return Params(resource_params=self._resource_primitives.get_params())
+
+    def set_params(self, *, params: Params) -> None:
+        if params['resource_params'] is None:
+            self._resource_primitives = None
+            self._fitted = False
+
+        elif isinstance(params['resource_params'], dict):
+            resource_primitives = {}
+            for resource_id, resource_params in params['resource_params'].items():
+                primitive = copy.deepcopy(self.hyperparams['primitive'])
+                primitive.set_params(params=resource_params)
+                resource_primitives[resource_id] = primitive
+
+            self._resource_primitives = resource_primitives
+            self._fitted = True
+
+        else:
+            self.hyperparams['primitive'].set_params(params=params['resource_params'])
+            self._resource_primitives = self.hyperparams['primitive']
+            self._fitted = True
diff --git a/tods/common-primitives/common_primitives/dataset_sample.py b/tods/common-primitives/common_primitives/dataset_sample.py
new file mode 100644
index 0000000..25fa3aa
--- /dev/null
+++ b/tods/common-primitives/common_primitives/dataset_sample.py
@@ -0,0 +1,141 @@
+import os
+import typing
+import collections
+
+import numpy  # type: ignore
+
+from d3m import container, utils as d3m_utils
+from d3m.base import utils as base_utils
+from d3m.metadata import base as metadata_base, hyperparams
+from d3m.primitive_interfaces import base, transformer
+
+import common_primitives
+from common_primitives import dataset_utils
+
+__all__ = ('DatasetSamplePrimitive',)
+
+Inputs = container.dataset.Dataset
+Outputs = container.dataset.Dataset
+
+
+class Hyperparams(hyperparams.Hyperparams):
+    starting_resource = hyperparams.Hyperparameter[typing.Union[str, None]](
+        default=None,
+        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
+        description="From which resource to sample rows. If \"None\" then the dataset entry point is used.",
+    )
+    sample_size = hyperparams.Union[typing.Union[int, float, None]](
+        configuration=collections.OrderedDict(
+            absolute=hyperparams.Bounded[int](
+                lower=1,
+                upper=None,
+                default=1,
+                description='Sample an absolute number of rows from the dataset.',
+            ),
+            relative=hyperparams.Uniform(
+                lower=0,
+                upper=1,
+                default=0.5,
+                description='Sample a relative number of rows from the dataset.',
+            ),
+            all_rows=hyperparams.Constant(
+                default=None,
+                description='Sample all rows from the dataset.',
+            ),
+        ),
+        default='relative',
+        description='Sample rows from the dataset according to either an absolute or relative value.',
+        semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
+    )
+    replacement = hyperparams.UniformBool(
+        default=False,
+        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
+        description="Whether to sample the data with replacement.",
+    )
+    delete_recursive = hyperparams.Hyperparameter[bool](
+        default=False,
+        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
+        description="Delete rows in other resources/tables which are not needed for rows left in the dataset entry point resource/table.",
+    )
+
+
+class DatasetSamplePrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]):
+    """
+    A primitive which samples the rows of a tabular Dataset.
+ """ + + __author__ = 'Distil' + __version__ = '0.1.0' + __contact__ = 'mailto:nklabs@newknowledge.com' + + metadata = metadata_base.PrimitiveMetadata( + { + 'id': '268315c1-7549-4aee-a4cc-28921cba74c0', + 'version': __version__, + 'name': "Dataset sampling primitive", + 'python_path': 'd3m.primitives.data_preprocessing.dataset_sample.Common', + 'source': { + 'name': common_primitives.__author__, + 'contact': __contact__, + 'uris': [ + 'https://gitlab.com/datadrivendiscovery/common-primitives/blob/master/common_primitives/dataset_sample.py', + 'https://gitlab.com/datadrivendiscovery/common-primitives.git', + ], + }, + 'installation': [{ + 'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/common-primitives.git@{git_commit}#egg=common_primitives'.format( + git_commit=d3m_utils.current_git_commit(os.path.dirname(__file__)), + ), + }], + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.DATA_SPLITTING, + ], + 'primitive_family': metadata_base.PrimitiveFamily.DATA_PREPROCESSING, + }, + ) + + def __init__(self, *, hyperparams: Hyperparams, random_seed: int = 0)-> None: + super().__init__(hyperparams=hyperparams, random_seed=random_seed) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + main_resource_id, main_resource = base_utils.get_tabular_resource(inputs, self.hyperparams['starting_resource']) + + # return inputs immediately if constant sample HP or number of rows to sample > number in dataset + if self.hyperparams['sample_size'] is None or self.hyperparams['sample_size'] >= main_resource.shape[0]: + return base.CallResult(inputs) + + # don't resample if we are working on test data + target_columns = inputs.metadata.list_columns_with_semantic_types(['https://metadata.datadrivendiscovery.org/types/TrueTarget'], at=(main_resource_id,)) + + # only consider rows of input where target column is not missing + row_indices_to_keep = set() + row_indices_to_sample = set() + for row_index in range(main_resource.shape[0]): + row_target_values = main_resource.iloc[row_index, target_columns] + # if there is any missing value in targets we assume is a test data row, or at least a row we should not sample + if row_target_values.eq('').any() or row_target_values.isna().any(): + row_indices_to_keep.add(row_index) + else: + row_indices_to_sample.add(row_index) + + # generate random indices to sample + local_random_state = numpy.random.RandomState(self.random_seed) + + if self.hyperparams['sample_size'] < 1: + sample_rows = int(self.hyperparams['sample_size'] * len(row_indices_to_sample)) + else: + sample_rows = self.hyperparams['sample_size'] + if sample_rows != 0 and len(row_indices_to_sample) != 0: + # we sort row indices to be deterministic + row_indices_to_keep.update(local_random_state.choice(sorted(row_indices_to_sample), size=sample_rows, replace=self.hyperparams['replacement'])) + + output_dataset = dataset_utils.sample_rows( + inputs, + main_resource_id, + row_indices_to_keep, + inputs.get_relations_graph(), + delete_recursive=self.hyperparams.get('delete_recursive', False), + ) + + return base.CallResult(output_dataset) diff --git a/tods/common-primitives/common_primitives/dataset_to_dataframe.py b/tods/common-primitives/common_primitives/dataset_to_dataframe.py new file mode 100644 index 0000000..4f8abe3 --- /dev/null +++ b/tods/common-primitives/common_primitives/dataset_to_dataframe.py @@ -0,0 +1,88 @@ +import os +import typing + +from d3m import container, 
utils as d3m_utils +from d3m.base import utils as base_utils +from d3m.metadata import base as metadata_base, hyperparams +from d3m.primitive_interfaces import base, transformer +import logging +import common_primitives + +__all__ = ('DatasetToDataFramePrimitive',) + +Inputs = container.Dataset +Outputs = container.DataFrame + + +class Hyperparams(hyperparams.Hyperparams): + dataframe_resource = hyperparams.Hyperparameter[typing.Union[str, None]]( + default=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Resource ID of a DataFrame to extract if there are multiple tabular resources inside a Dataset and none is a dataset entry point.", + ) + + +class DatasetToDataFramePrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): + """ + A primitive which extracts a DataFrame out of a Dataset. + """ + + metadata = metadata_base.PrimitiveMetadata( + { + 'id': '4b42ce1e-9b98-4a25-b68e-fad13311eb65', + 'version': '0.3.0', + 'name': "Extract a DataFrame from a Dataset", + 'python_path': 'd3m.primitives.data_transformation.dataset_to_dataframe.Common', + 'source': { + 'name': common_primitives.__author__, + 'contact': 'mailto:mitar.commonprimitives@tnode.com', + 'uris': [ + 'https://gitlab.com/datadrivendiscovery/common-primitives/blob/master/common_primitives/dataset_to_dataframe.py', + 'https://gitlab.com/datadrivendiscovery/common-primitives.git', + ], + }, + 'installation': [{ + 'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/common-primitives.git@{git_commit}#egg=common_primitives'.format( + git_commit=d3m_utils.current_git_commit(os.path.dirname(__file__)), + ), + }], + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.DATA_CONVERSION, + ], + 'primitive_family': metadata_base.PrimitiveFamily.DATA_TRANSFORMATION, + }, + ) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + + dataframe_resource_id, dataframe = base_utils.get_tabular_resource(inputs, self.hyperparams['dataframe_resource']) + + dataframe.metadata = self._update_metadata(inputs.metadata, dataframe_resource_id) + + assert isinstance(dataframe, container.DataFrame), type(dataframe) + + return base.CallResult(dataframe) + + def _update_metadata(self, metadata: metadata_base.DataMetadata, resource_id: metadata_base.SelectorSegment) -> metadata_base.DataMetadata: + resource_metadata = dict(metadata.query((resource_id,))) + + if 'structural_type' not in resource_metadata or not issubclass(resource_metadata['structural_type'], container.DataFrame): + raise TypeError("The Dataset resource is not a DataFrame, but \"{type}\".".format( + type=resource_metadata.get('structural_type', None), + )) + + resource_metadata.update( + { + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + }, + ) + + new_metadata = metadata_base.DataMetadata(resource_metadata) + + new_metadata = metadata.copy_to(new_metadata, (resource_id,)) + + # Resource is not anymore an entry point. 
+ new_metadata = new_metadata.remove_semantic_type((), 'https://metadata.datadrivendiscovery.org/types/DatasetEntryPoint') + + return new_metadata diff --git a/tods/common-primitives/common_primitives/dataset_utils.py b/tods/common-primitives/common_primitives/dataset_utils.py new file mode 100644 index 0000000..3dd3cdc --- /dev/null +++ b/tods/common-primitives/common_primitives/dataset_utils.py @@ -0,0 +1,52 @@ +import collections +import typing + +from d3m import container + + +def sample_rows( + dataset: container.Dataset, main_resource_id: str, main_resource_indices_to_keep: typing.Set[int], + relations_graph: typing.Dict[str, typing.List[typing.Tuple[str, bool, int, int, typing.Dict]]], *, + delete_recursive: bool = False, +) -> container.Dataset: + # We store rows as sets, but later on we sort them when we select rows. + row_indices_to_keep_sets: typing.Dict[str, typing.Set[int]] = collections.defaultdict(set) + row_indices_to_keep_sets[main_resource_id] = main_resource_indices_to_keep + + # If "delete_recursive" is set to "False", we do not populate "row_indices_to_keep_sets" + # with other resources, making "select_rows" simply keep them. + if delete_recursive: + # We sort to be deterministic. + for main_resource_row_index in sorted(row_indices_to_keep_sets[main_resource_id]): + queue = [] + queue.append((main_resource_id, [main_resource_row_index])) + while queue: + current_resource_id, current_row_indices = queue.pop(0) + current_resource = dataset[current_resource_id] + + for edge_resource_id, edge_direction, edge_from_index, edge_to_index, custom_state in relations_graph[current_resource_id]: + # All rows from the main resource we want are already there. + # TODO: What to do if we get a reference to the row in the main resource which is not part of this sample? + # This means that probably the sample is invalid. We should not be generating such samples which do not + # preserve reference loops and their consistency. Otherwise it is not really possible to denormalize + # such Dataset properly: a reference is referencing a row in the main resource which does not exist. + if edge_resource_id == main_resource_id: + continue + + edge_resource = dataset[edge_resource_id] + + to_column_values = edge_resource.iloc[:, edge_to_index] + for from_column_value in current_resource.iloc[current_row_indices, edge_from_index]: + # We assume here that "index" corresponds to the default index with row indices. + rows_with_value = edge_resource.index[to_column_values == from_column_value] + # We sort to be deterministic. + new_rows_list = sorted(set(rows_with_value) - row_indices_to_keep_sets[edge_resource_id]) + row_indices_to_keep_sets[edge_resource_id].update(new_rows_list) + queue.append((edge_resource_id, new_rows_list)) + + # We sort indices to get deterministic outputs from sets (which do not have deterministic order). + # We also do not want to change the row order but keep the original row order. + # Sorting by row indices values assure that. 
+ row_indices_to_keep = {resource_id: sorted(indices) for resource_id, indices in row_indices_to_keep_sets.items()} + + return dataset.select_rows(row_indices_to_keep) diff --git a/tods/common-primitives/common_primitives/datetime_field_compose.py b/tods/common-primitives/common_primitives/datetime_field_compose.py new file mode 100644 index 0000000..5e811da --- /dev/null +++ b/tods/common-primitives/common_primitives/datetime_field_compose.py @@ -0,0 +1,83 @@ +import os + +from dateutil import parser +from d3m import container, utils as d3m_utils +from d3m.metadata import base as metadata_base, hyperparams +from d3m.primitive_interfaces import base, transformer + +import common_primitives + +__all__ = ('DatetimeFieldComposePrimitive',) + +Inputs = container.DataFrame +Outputs = container.DataFrame + + +class Hyperparams(hyperparams.Hyperparams): + columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to use when composing a datetime field.", + ) + join_char = hyperparams.Hyperparameter[str]( + default="-", + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description='A string used to join fields prior to parsing a datetime.', + ) + output_name = hyperparams.Hyperparameter[str]( + default="__date", + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description='The name to use for the new parsed datetime field.', + ) + + +class DatetimeFieldComposePrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): + """ + A primitive which composes fields into a new single datetime field. + + The primitve joins the columns (identified in the columns hyperparam) in order and then parses + the resulting string as a datetime. The value is stored in a new column. 
+    """
+
+    metadata = metadata_base.PrimitiveMetadata(
+        {
+            'id': '73d79f46-1bea-4858-a061-a2d1cfc5f122',
+            'version': '0.1.0',
+            'name': "Datetime Field Compose",
+            'python_path': 'd3m.primitives.data_transformation.datetime_field_compose.Common',
+            'source': {
+                'name': common_primitives.__author__,
+                'contact': 'mailto:cbethune@uncharted.software',
+                'uris': [
+                    'https://gitlab.com/datadrivendiscovery/common-primitives/blob/master/common_primitives/datetime_field_compose.py',
+                    'https://gitlab.com/datadrivendiscovery/common-primitives.git',
+                ],
+            },
+            'installation': [{
+                'type': metadata_base.PrimitiveInstallationType.PIP,
+                'package_uri': 'git+https://gitlab.com/datadrivendiscovery/common-primitives.git@{git_commit}#egg=common_primitives'.format(
+                    git_commit=d3m_utils.current_git_commit(os.path.dirname(__file__)),
+                ),
+            }],
+            'algorithm_types': [
+                metadata_base.PrimitiveAlgorithmType.DATA_CONVERSION,
+            ],
+            'primitive_family': metadata_base.PrimitiveFamily.DATA_TRANSFORMATION,
+        },
+    )
+
+    def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]:
+        inputs_clone = inputs.copy()
+        columns = self.hyperparams['columns']
+        output_name = self.hyperparams['output_name']
+        join_char = self.hyperparams['join_char']
+
+        new_col = inputs_clone.iloc[:, list(columns)].apply(lambda x: parser.parse(join_char.join(x)), axis=1)
+        new_col_index = len(inputs_clone.columns)
+        inputs_clone.insert(new_col_index, output_name, new_col)
+        inputs_clone.metadata = inputs_clone.metadata.generate(inputs_clone)
+        inputs_clone.metadata = inputs_clone.metadata.add_semantic_type((metadata_base.ALL_ELEMENTS, new_col_index), 'http://schema.org/DateTime')
+        inputs_clone.metadata = inputs_clone.metadata.add_semantic_type((metadata_base.ALL_ELEMENTS, new_col_index), 'https://metadata.datadrivendiscovery.org/types/Time')
+
+        return base.CallResult(inputs_clone)
diff --git a/tods/common-primitives/common_primitives/datetime_range_filter.py b/tods/common-primitives/common_primitives/datetime_range_filter.py
new file mode 100644
index 0000000..34ca316
--- /dev/null
+++ b/tods/common-primitives/common_primitives/datetime_range_filter.py
@@ -0,0 +1,161 @@
+import collections
+import os
+import typing
+
+from datetime import datetime, timezone
+from d3m import container, exceptions, utils as d3m_utils
+from d3m.metadata import base as metadata_base, hyperparams
+from d3m.primitive_interfaces import base, transformer
+
+import common_primitives
+from common_primitives import dataframe_utils, utils
+
+import pandas as pd  # type: ignore
+
+__all__ = ('DatetimeRangeFilterPrimitive',)
+
+MIN_DATETIME = datetime.min.replace(tzinfo=timezone.utc)
+MAX_DATETIME = datetime.max.replace(tzinfo=timezone.utc)
+
+Inputs = container.DataFrame
+Outputs = container.DataFrame
+
+
+class Hyperparams(hyperparams.Hyperparams):
+    column = hyperparams.Hyperparameter[int](
+        default=-1,
+        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
+        description='Index of column filter applies to.'
+    )
+    inclusive = hyperparams.UniformBool(
+        default=True,
+        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
+        description='True when values outside the range are removed, False when values within the range are removed.'
+ ) + min = hyperparams.Union[typing.Union[datetime, None]]( + configuration=collections.OrderedDict( + datetime=hyperparams.Hyperparameter[datetime](utils.DEFAULT_DATETIME), + negative_infinity=hyperparams.Constant(None), + ), + default='negative_infinity', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description='Minimum value for filter. If it is not timezone-aware, it is assumed that it is in UTC timezone.' + ) + max = hyperparams.Union[typing.Union[datetime, None]]( + configuration=collections.OrderedDict( + datetime=hyperparams.Hyperparameter[datetime](utils.DEFAULT_DATETIME), + positive_infinity=hyperparams.Constant(None), + ), + default='positive_infinity', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description='Maximum value for filter. If it is not timezone-aware, it is assumed that it is in UTC timezone.' + ) + strict = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description='True when the filter bounds are strict (ie. less than), false then are not (ie. less than equal to).' + ) + raise_error = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description='Raise error if the column contains a value which cannot be parsed into a datetime.' + ) + + +class DatetimeRangeFilterPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): + """ + A primitive which filters rows from a DataFrame based on a datetime range applied to a given column. + Columns are identified by their index, and the filter itself can be inclusive (values within range are retained) + or exclusive (values within range are removed). Boundaries values can be included in the filter (ie. <=) or excluded + (ie. <). 
+ """ + + metadata = metadata_base.PrimitiveMetadata( + { + 'id': '487e5a58-19e9-432c-ac61-fe05c024e42c', + 'version': '0.2.0', + 'name': "Datetime range filter", + 'python_path': 'd3m.primitives.data_preprocessing.datetime_range_filter.Common', + 'source': { + 'name': common_primitives.__author__, + 'contact': 'mailto:cbethune@uncharted.software', + 'uris': [ + 'https://gitlab.com/datadrivendiscovery/common-primitives/blob/master/common_primitives/datetime_range_filter.py', + 'https://gitlab.com/datadrivendiscovery/common-primitives.git', + ], + }, + 'installation': [{ + 'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/common-primitives.git@{git_commit}#egg=common_primitives'.format( + git_commit=d3m_utils.current_git_commit(os.path.dirname(__file__)), + ), + }], + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.ARRAY_SLICING, + ], + 'primitive_family': metadata_base.PrimitiveFamily.DATA_PREPROCESSING, + }, + ) + + @classmethod + def _make_aware(cls, value: datetime) -> datetime: + if value.tzinfo is not None and value.tzinfo.utcoffset(value) is not None: + return value + + return value.replace(tzinfo=timezone.utc) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + # to make sure index matches row indices + resource = inputs.reset_index(drop=True) + + if self.hyperparams['min'] is None: + min = datetime.min + else: + min = self.hyperparams['min'] + + if self.hyperparams['max'] is None: + max = datetime.max + else: + max = self.hyperparams['max'] + + min = self._make_aware(min) + max = self._make_aware(max) + + # apply the filter using native dataframe methods + col_idx = self.hyperparams['column'] + try: + parsed_column = resource.iloc[:, col_idx].apply(lambda x: utils.parse_datetime(x)) + if self.hyperparams['raise_error'] and parsed_column.isna().any(): + raise exceptions.InvalidArgumentValueError( + "Failure to apply datetime range filter to column {col_idx} of type {type}.".format( + col_idx=col_idx, + type=resource.iloc[:, col_idx].dtype, + ), + ) + + to_keep: pd.Series + if self.hyperparams['inclusive']: + if self.hyperparams['strict']: + to_keep = (parsed_column > min) & (parsed_column < max) + else: + to_keep = (parsed_column >= min) & (parsed_column <= max) + else: + if self.hyperparams['strict']: + to_keep = (parsed_column < min) | (parsed_column > max) + else: + to_keep = (parsed_column <= min) | (parsed_column >= max) + + to_keep_indices = resource.loc[to_keep].index + + except (ValueError, OverflowError) as error: + raise exceptions.InvalidArgumentValueError( + "Failure to apply datetime range filter to column {col_idx} of type {type}.".format( + col_idx=col_idx, + type=resource.iloc[:, col_idx].dtype, + ), + ) from error + + # remove dataframe and metadata rows by index + outputs = dataframe_utils.select_rows(inputs, to_keep_indices) + + return base.CallResult(outputs) diff --git a/tods/common-primitives/common_primitives/denormalize.py b/tods/common-primitives/common_primitives/denormalize.py new file mode 100644 index 0000000..bd05148 --- /dev/null +++ b/tods/common-primitives/common_primitives/denormalize.py @@ -0,0 +1,556 @@ +import os +import typing +import itertools + +import numpy # type: ignore +import pandas # type: ignore + +from d3m import container, exceptions, utils as d3m_utils +from d3m.base import utils as base_utils +from d3m.metadata import base as metadata_base, hyperparams +from d3m.primitive_interfaces import 
base, transformer
+
+import common_primitives
+
+__all__ = ('DenormalizePrimitive',)
+
+Inputs = container.Dataset
+Outputs = container.Dataset
+
+
+class Hyperparams(hyperparams.Hyperparams):
+    starting_resource = hyperparams.Hyperparameter[typing.Union[str, None]](
+        default=None,
+        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
+        description="From which resource to start denormalizing. If \"None\" then it starts from the dataset entry point.",
+    )
+    recursive = hyperparams.UniformBool(
+        default=True,
+        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
+        description="Denormalize recursively?",
+    )
+    many_to_many = hyperparams.UniformBool(
+        default=False,
+        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
+        description="Denormalize also many-to-many relations?",
+    )
+    discard_not_joined_tabular_resources = hyperparams.UniformBool(
+        default=False,
+        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
+        description="Should tabular resources which were not joined be discarded?",
+    )
+
+
+# TODO: Implement support for M2M relations.
+# TODO: Consider the case where there are loops in foreign keys.
+# TODO: Add all column names together to "other names" metadata for column.
+# TODO: Consider denormalizing deep-first instead of current iterative approach.
+# It seems it might be better because when one table is referencing the second one twice,
+# which might reference other tables further, then currently we first join the second table
+# and then have to repeat joining other tables twice. But we could first join other tables
+# once to the second table, and then just do the join with already joined second table.
+# Not sure how to behave in "recursive == False" case then.
+# TODO: Add a test where main table has a foreign key twice to same table (for example, person 1 and person 2 to table of persons).
+class DenormalizePrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]):
+    """
+    A primitive which converts a Dataset with multiple tabular resources into a Dataset with only one tabular resource,
+    based on known relations between tabular resources. Any resource which can be joined is joined (thus the resource
+    itself is removed), and other resources are by default discarded (controlled by the ``discard_not_joined_tabular_resources`` hyper-parameter).
+
+    If hyper-parameter ``recursive`` is set to ``True``, the primitive will join tables recursively. For example,
+    if table 1 (main table) has a foreign key that points to table 2, and table 2 has a foreign key that points to table 3,
+    then after table 2 is joined into table 1, table 1 will have a foreign key that points to table 3. So now the
+    primitive continues to join table 3 into the main table.
+ """ + + __author__ = 'Mingjie Sun ' + metadata = metadata_base.PrimitiveMetadata( + { + 'id': 'f31f8c1f-d1c5-43e5-a4b2-2ae4a761ef2e', + 'version': '0.2.0', + 'name': "Denormalize datasets", + 'python_path': 'd3m.primitives.data_transformation.denormalize.Common', + 'source': { + 'name': common_primitives.__author__, + 'contact': 'mailto:sunmj15@gmail.com', + 'uris': [ + 'https://gitlab.com/datadrivendiscovery/common-primitives/blob/master/common_primitives/denormalize.py', + 'https://gitlab.com/datadrivendiscovery/common-primitives.git', + ], + }, + 'installation': [{ + 'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/common-primitives.git@{git_commit}#egg=common_primitives'.format( + git_commit=d3m_utils.current_git_commit(os.path.dirname(__file__)), + ), + }], + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.DATA_DENORMALIZATION, + ], + 'primitive_family': metadata_base.PrimitiveFamily.DATA_TRANSFORMATION, + }, + ) + + def __init__(self, *, hyperparams: Hyperparams) -> None: + super().__init__(hyperparams=hyperparams) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + # If only one tabular resource is in the dataset, we do not have anything to do. + tabular_resource_ids = [dataset_resource_id for dataset_resource_id, dataset_resource in inputs.items() if isinstance(dataset_resource, container.DataFrame)] + if len(tabular_resource_ids) == 1: + return base.CallResult(inputs) + + # We could set "pick_one" to "False" because we already checked for that, but we leave it + # as "True" because then error messages are more meaningful for this case. + main_resource_id, main_resource = base_utils.get_tabular_resource(inputs, self.hyperparams['starting_resource']) + + # Graph is the adjacency representation for the relations graph. + graph = inputs.get_relations_graph() + + resources = dict(inputs) + metadata = inputs.metadata + all_resources_joined = set() + + while self._has_forward_edges(graph, main_resource_id): + # "resources" and "graph" are modified in-place. + metadata, resources_joined = self._denormalize(resources, metadata, main_resource_id, graph) + + all_resources_joined.update(resources_joined) + + if not self.hyperparams['recursive']: + break + + # Do we discard all other tabular resources (including joined ones)? + if self.hyperparams['discard_not_joined_tabular_resources']: + resources_to_remove = [] + for resource_id, resource in resources.items(): + if resource_id == main_resource_id: + continue + if not isinstance(resource, container.DataFrame): + continue + resources_to_remove.append(resource_id) + + # Discard only joined tabular resources and which no other resource depends on. + else: + # We deal only with tabular resources here. 
+ dependent_upon_resources = self._get_dependent_upon_resources(graph) + resources_to_remove = [resource_id for resource_id in sorted(all_resources_joined - dependent_upon_resources) if resource_id != main_resource_id] + + for resource_id in resources_to_remove: + assert resource_id != main_resource_id + + del resources[resource_id] + metadata = metadata.remove((resource_id,), recursive=True) + + metadata = metadata.update((), { + 'dimension': { + 'length': len(resources), + }, + }) + + return base.CallResult(container.Dataset(resources, metadata)) + + def _has_forward_edges(self, graph: typing.Dict[str, typing.List[typing.Tuple[str, bool, int, int, typing.Dict]]], resource_id: str) -> bool: + # We check first to not create a list in "graph" when accessing it. + if resource_id not in graph: + return False + + for edge_resource_id, edge_direction, edge_from_index, edge_to_index, custom_state in graph[resource_id]: + if edge_direction: + return True + + return False + + def _has_edges_to_process(self, graph: typing.Dict[str, typing.List[typing.Tuple[str, bool, int, int, typing.Dict]]], resource_id: str) -> bool: + # We check first to not create a list in "graph" when accessing it. + if resource_id not in graph: + return False + + for edge_resource_id, edge_direction, edge_from_index, edge_to_index, custom_state in graph[resource_id]: + if custom_state.get('process', False): + return True + + return False + + def _denormalize(self, resources: typing.Dict, metadata: metadata_base.DataMetadata, main_resource_id: str, + graph: typing.Dict[str, typing.List[typing.Tuple[str, bool, int, int, typing.Dict]]]) -> typing.Tuple[metadata_base.DataMetadata, typing.Set[str]]: + """ + Finds all tables which are pointed to by the main resource and join them into the main table. + + ``resources`` and ``graph`` are modified in-place. + """ + + resources_joined: typing.Set[str] = set() + main_resource = resources[main_resource_id] + + # Should not really happen. + if main_resource_id not in graph: + return metadata, resources_joined + + # We mark all current edges to be processed. We might be adding more edges to the list, + # but we want to do for this call only those which existed at the beginning. + for edge_resource_id, edge_direction, edge_from_index, edge_to_index, custom_state in graph[main_resource_id]: + custom_state['process'] = True + + while self._has_edges_to_process(graph, main_resource_id): + edge_resource_id, edge_direction, edge_from_index, edge_to_index, custom_state = graph[main_resource_id][0] + + if not custom_state.get('process', False): + continue + del custom_state['process'] + + if not edge_direction: + # For now we just remove this relation. + # TODO: Support M2M relations. + + # We remove the relation we would have joined, backward. + self._remove_graph_edge(graph, main_resource_id, edge_resource_id, False, edge_from_index, edge_to_index) + + # We remove the relation we would have joined, forward. + self._remove_graph_edge(graph, edge_resource_id, main_resource_id, True, edge_to_index, edge_from_index) + + continue + + if main_resource_id == edge_resource_id: + # TODO: Implement. 
+ raise NotImplementedError("Support for loops is not implemented yet.") + + # Calling "_join" updates column indices in "graph" and "metadata" + # and also removes the current joined edge from "graph" + main_resource, metadata = self._join( + main_resource_id, main_resource, edge_from_index, + edge_resource_id, resources[edge_resource_id], edge_to_index, + metadata, graph, + ) + + resources_joined.add(edge_resource_id) + + resources[main_resource_id] = main_resource + + return metadata, resources_joined + + def _row_of_missing_values(self, resource: container.DataFrame, metadata: metadata_base.DataMetadata, resource_id: str) -> typing.List[typing.Any]: + row = [] + for column_index, dtype in enumerate(resource.dtypes): + if dtype.kind in ['b', 'i', 'u', 'f', 'c']: + row.append(numpy.nan) + elif dtype.kind == 'O' and issubclass(metadata.query_column_field(column_index, 'structural_type', at=(resource_id,)), str): + row.append('') + else: + row.append(None) + + return row + + def _join(self, main_resource_id: str, main_resource: container.DataFrame, main_column_index: int, foreign_resource_id: str, + foreign_resource: container.DataFrame, foreign_column_index: int, metadata: metadata_base.DataMetadata, + graph: typing.Dict[str, typing.List[typing.Tuple[str, bool, int, int, typing.Dict]]]) -> typing.Tuple[container.DataFrame, metadata_base.DataMetadata]: + if main_resource_id == foreign_resource_id: + # TODO: Implement. + raise NotImplementedError("Support for loops is not implemented yet.") + + # We use this information later on. + one_to_one_relation = foreign_resource.iloc[:, foreign_column_index].sort_values().equals(main_resource.iloc[:, main_column_index].sort_values()) + + foreign_indexer = pandas.Index(foreign_resource.iloc[:, foreign_column_index]).get_indexer(main_resource.iloc[:, main_column_index]) + # "get_indexer" sets all unresolved values to -1. + unresolved_rows = foreign_indexer == -1 + + # We store dtypes so that we can later on compare. + foreign_resource_dtypes = foreign_resource.dtypes + + # -1 is converted into the last row, but we set it to row of missing values if it exists. + resolved_foreign_resource = foreign_resource.take(foreign_indexer).reset_index(drop=True) + if unresolved_rows.any(): + # Set all unresolved rows to a row of missing values. + resolved_foreign_resource.iloc[unresolved_rows, :] = self._row_of_missing_values(foreign_resource, metadata, foreign_resource_id) + + # And store final dtypes so that we can later on compare. + resolved_foreign_resource_dtypes = resolved_foreign_resource.dtypes + + # This makes a copy so that we can modify metadata in-place. + metadata = metadata.update( + (metadata_base.ALL_ELEMENTS,), + {}, + ) + + # TODO: Move this to metadata API. + # We reorder metadata for rows. 
+ for element_metadata_entry in [ + metadata._current_metadata.all_elements, + metadata._current_metadata.elements[foreign_resource_id], + ]: + if element_metadata_entry is None: + continue + + elements = element_metadata_entry.elements + new_elements_evolver = d3m_utils.EMPTY_PMAP.evolver() + for i, row_index in enumerate(foreign_indexer): + if row_index == -1: + continue + + if row_index in elements: + new_elements_evolver.set(i, elements[row_index]) + element_metadata_entry.elements = new_elements_evolver.persistent() + element_metadata_entry.is_elements_empty = not element_metadata_entry.elements + element_metadata_entry.update_is_empty() + + assert resolved_foreign_resource.shape[1] > 0 + + main_resource = pandas.concat([ + main_resource.iloc[:, 0:main_column_index], + resolved_foreign_resource, + main_resource.iloc[:, main_column_index + 1:], + ], axis=1) + + old_semantic_types = metadata.query_column(main_column_index, at=(main_resource_id,)).get('semantic_types', []) + + # First we remove metadata for the existing column. + # This makes a copy so that we can modify metadata in-place. + metadata = metadata.remove_column(main_column_index, at=(main_resource_id,), recursive=True) + + # TODO: Move this to metadata API. + # Move columns and make space for foreign metadata to be inserted. + # We iterate over a list so that we can change dict while iterating. + for element_metadata_entry in itertools.chain( + [metadata._current_metadata.all_elements.all_elements if metadata._current_metadata.all_elements is not None else None], + metadata._current_metadata.all_elements.elements.values() if metadata._current_metadata.all_elements is not None else iter([None]), + [metadata._current_metadata.elements[main_resource_id].all_elements], + metadata._current_metadata.elements[main_resource_id].elements.values(), + ): + if element_metadata_entry is None: + continue + + new_elements_evolver = element_metadata_entry.elements.evolver() + for element_index in element_metadata_entry.elements.keys(reverse=True): + # We removed metadata for "main_column_index". + assert element_index != main_column_index + + element_index = typing.cast(int, element_index) + + if main_column_index < element_index: + metadata_dict = new_elements_evolver[element_index] + new_elements_evolver.remove(element_index) + new_elements_evolver.set(element_index + resolved_foreign_resource.shape[1] - 1, metadata_dict) + element_metadata_entry.elements = new_elements_evolver.persistent() + element_metadata_entry.is_elements_empty = not element_metadata_entry.elements + element_metadata_entry.update_is_empty() + + # And copy over metadata for new (replaced) columns in place of the existing column. + for column_index in range(resolved_foreign_resource.shape[1]): + # To go over "ALL_ELEMENTS" and all rows. + for element in metadata.get_elements((foreign_resource_id,)): + metadata = metadata.copy_to( + metadata, + [foreign_resource_id, element, metadata_base.ALL_ELEMENTS], + [main_resource_id, element, main_column_index + column_index], + ignore_all_elements=True, + ) + metadata = metadata.copy_to( + metadata, + [foreign_resource_id, element, column_index], + [main_resource_id, element, main_column_index + column_index], + ignore_all_elements=True, + ) + + # Update metadata for new (replaced) columns. + for column_index in range(main_column_index, main_column_index + resolved_foreign_resource.shape[1]): + # We copy semantic types describing the role of the column from the original column to all new (replaced) columns. 
+ # TODO: Do not hard-code this list here but maybe extract it from "definitions.json"? + for semantic_type in [ + 'https://metadata.datadrivendiscovery.org/types/Attribute', + 'https://metadata.datadrivendiscovery.org/types/Boundary', + 'https://metadata.datadrivendiscovery.org/types/BoundingPolygon', + 'https://metadata.datadrivendiscovery.org/types/Interval', + 'https://metadata.datadrivendiscovery.org/types/IntervalEnd', + 'https://metadata.datadrivendiscovery.org/types/IntervalStart', + 'https://metadata.datadrivendiscovery.org/types/InstanceWeight', + 'https://metadata.datadrivendiscovery.org/types/PrivilegedData', + 'https://metadata.datadrivendiscovery.org/types/RedactedPrivilegedData', + 'https://metadata.datadrivendiscovery.org/types/RedactedTarget', + 'https://metadata.datadrivendiscovery.org/types/SuggestedPrivilegedData', + 'https://metadata.datadrivendiscovery.org/types/SuggestedTarget', + 'https://metadata.datadrivendiscovery.org/types/Target', + 'https://metadata.datadrivendiscovery.org/types/PredictedTarget', + 'https://metadata.datadrivendiscovery.org/types/TrueTarget', + 'https://metadata.datadrivendiscovery.org/types/Score', + 'https://metadata.datadrivendiscovery.org/types/Confidence', + 'https://metadata.datadrivendiscovery.org/types/Time', + 'https://metadata.datadrivendiscovery.org/types/Location', + ]: + if semantic_type in old_semantic_types: + metadata = metadata.add_semantic_type((main_resource_id, metadata_base.ALL_ELEMENTS, column_index), semantic_type) + + is_column_unique = main_resource.iloc[:, column_index].is_unique + column_semantic_types = metadata.query_column(column_index, at=(main_resource_id,)).get('semantic_types', []) + was_column_unique = 'https://metadata.datadrivendiscovery.org/types/PrimaryKey' in column_semantic_types \ + or 'https://metadata.datadrivendiscovery.org/types/UniqueKey' in column_semantic_types + + # Foreign keys can reference same foreign row multiple times, so values in this column might not be even + # unique anymore, nor they are a primary key at all. So we remove the semantic type marking a column as such. + # We re-set semantic type for any real primary key later on. + metadata = metadata.remove_semantic_type((main_resource_id, metadata_base.ALL_ELEMENTS, column_index), 'https://metadata.datadrivendiscovery.org/types/PrimaryKey') + metadata = metadata.remove_semantic_type((main_resource_id, metadata_base.ALL_ELEMENTS, column_index), 'https://metadata.datadrivendiscovery.org/types/PrimaryMultiKey') + metadata = metadata.remove_semantic_type((main_resource_id, metadata_base.ALL_ELEMENTS, column_index), 'https://metadata.datadrivendiscovery.org/types/UniqueKey') + + # We re-set semantic type for column which was and is still unique. + if was_column_unique and is_column_unique: + metadata = metadata.add_semantic_type((main_resource_id, metadata_base.ALL_ELEMENTS, column_index), 'https://metadata.datadrivendiscovery.org/types/UniqueKey') + + old_dtype = foreign_resource_dtypes.iloc[column_index - main_column_index] + new_dtype = resolved_foreign_resource_dtypes.iloc[column_index - main_column_index] + if old_dtype is not new_dtype: + # Not a nice way to convert a dtype to Python type, but it works. + old_type = type(numpy.zeros(1, old_dtype).tolist()[0]) + new_type = type(numpy.zeros(1, new_dtype).tolist()[0]) + if old_type is not new_type: + # Type changed, we have to update metadata about the structural type. 
+ metadata = metadata.update((main_resource_id, metadata_base.ALL_ELEMENTS, column_index), { + 'structural_type': new_type, + }) + + # If the original column was a primary key, we should re-set it back. + if 'https://metadata.datadrivendiscovery.org/types/PrimaryKey' in old_semantic_types and (one_to_one_relation or not unresolved_rows.any()): + if main_resource.iloc[:, main_column_index].is_unique: + metadata = metadata.add_semantic_type((main_resource_id, metadata_base.ALL_ELEMENTS, main_column_index), 'https://metadata.datadrivendiscovery.org/types/PrimaryKey') + # Removing "UniqueKey" if it was set before, "PrimaryKey" surpasses it. + metadata = metadata.remove_semantic_type((main_resource_id, metadata_base.ALL_ELEMENTS, main_column_index), 'https://metadata.datadrivendiscovery.org/types/UniqueKey') + else: + metadata = metadata.add_semantic_type((main_resource_id, metadata_base.ALL_ELEMENTS, main_column_index), 'https://metadata.datadrivendiscovery.org/types/PrimaryMultiKey') + elif 'https://metadata.datadrivendiscovery.org/types/PrimaryMultiKey' in old_semantic_types and (one_to_one_relation or not unresolved_rows.any()): + metadata = metadata.add_semantic_type((main_resource_id, metadata_base.ALL_ELEMENTS, main_column_index), 'https://metadata.datadrivendiscovery.org/types/PrimaryMultiKey') + + # TODO: Update boundary columns and "confidence for" references. + # This is not currently needed because all file collections are just one column so they do not + # move the column indices. But as a general case we should be updating all standard column references. + + # Update columns number in the main resource. + metadata = metadata.update((main_resource_id, metadata_base.ALL_ELEMENTS), { + 'dimension': { + 'length': main_resource.shape[1], + }, + }) + + # We remove the relation we just joined, forward. + self._remove_graph_edge(graph, main_resource_id, foreign_resource_id, True, main_column_index, foreign_column_index) + + # We remove the relation we just joined, backward. + self._remove_graph_edge(graph, foreign_resource_id, main_resource_id, False, foreign_column_index, main_column_index) + + # We have to update column indices if they have changed because we inserted new columns. + for resource_id, edges in graph.items(): + if resource_id == main_resource_id: + for i, (edge_resource_id, edge_direction, edge_from_index, edge_to_index, custom_state) in enumerate(edges): + if edge_direction and main_column_index < edge_from_index: + # We replaced one column with "resolved_foreign_resource.shape[1]" columns, so there is + # "resolved_foreign_resource.shape[1] - 1" new columns to shift indices for. + edges[i] = (edge_resource_id, edge_direction, edge_from_index + resolved_foreign_resource.shape[1] - 1, edge_to_index, custom_state) + else: + for i, (edge_resource_id, edge_direction, edge_from_index, edge_to_index, custom_state) in enumerate(edges): + if edge_resource_id == main_resource_id and not edge_direction and main_column_index < edge_to_index: + # We replaced one column with "resolved_foreign_resource.shape[1]" columns, so there is + # "resolved_foreign_resource.shape[1] - 1" new columns to shift indices for. + edges[i] = (edge_resource_id, edge_direction, edge_from_index, edge_to_index + resolved_foreign_resource.shape[1] - 1, custom_state) + + # If foreign resource has any additional relations, we copy them to new columns in the main resource. + if foreign_resource_id in graph: + # We iterate over a list so that we can change graph while iterating. 
+ for edge_resource_id, edge_direction, edge_from_index, edge_to_index, custom_state in list(graph[foreign_resource_id]): + if edge_resource_id in [main_resource_id, foreign_resource_id]: + # TODO: Implement. + raise NotImplementedError("Support for loops is not implemented yet.") + + if edge_direction: + graph[main_resource_id].append((edge_resource_id, True, main_column_index + edge_from_index, edge_to_index, {})) + graph[edge_resource_id].append((main_resource_id, False, edge_to_index, main_column_index + edge_from_index, {})) + else: + # TODO: What should we do about backward relations? + # For now we just ignore backward relations because we do not support M2M relations. + # For the foreign resource we just joined, we could change all relations to instead point + # to the main resource. This might be tricky though if we have a situation where main table + # includes table 1 twice, and table 1 has a relation to table 2. If we after joining table 1 + # once rewrite all backward relations from table 2 to table 1 to point to main table now, + # when we get to join the table 1 the second time we might have issues. This is why it might + # better to start joining deep-first. See another TODO. + # TODO: We might have to also update foreign key metadata in this case. + # We might want to update metadata so that if table 1 is joined to the main table, and there is + # also table 2 which has a foreign key that points to table 1, then the foreign key in table 2 + # should point to the main table after joining. But what if main table has a foreign key to + # table 1 twice? How do we then update metadata in table 2 to point twice to table 1? + # Metadata does not support that. + + # A special case for now. If relation is one-to-one, then we can move backwards relations to the + # main resource without complications mentioned in TODOs above. Maybe some additional columns might + # be joined through M2M relations in this case, once that is supported, but generally this should not + # be a problem. It might add some duplicated columns at that point. This special case is useful + # when "learningData" with only targets is pointing to some other table with real attributes. + if one_to_one_relation: + self._remove_graph_edge(graph, edge_resource_id, foreign_resource_id, True, edge_to_index, edge_from_index) + self._remove_graph_edge(graph, foreign_resource_id, edge_resource_id, False, edge_from_index, edge_to_index) + + graph[main_resource_id].append((edge_resource_id, False, main_column_index + edge_from_index, edge_to_index, custom_state)) + graph[edge_resource_id].append((main_resource_id, True, edge_to_index, main_column_index + edge_from_index, custom_state)) + + # We override metadata for foreign key to make it point to the main resource (and not to foreign resource anymore). + metadata = metadata.update((edge_resource_id, metadata_base.ALL_ELEMENTS, edge_to_index), { + 'foreign_key': { + 'type': 'COLUMN', + 'resource_id': main_resource_id, + 'column_index': main_column_index + edge_from_index, + 'column_name': metadata_base.NO_VALUE, + }, + }) + + return main_resource, metadata + + def _get_dependent_upon_resources(self, graph: typing.Dict[str, typing.List[typing.Tuple[str, bool, int, int, typing.Dict]]]) -> typing.Set[str]: + """ + Returns a set of resources which have other resources depend on them. 
+ """ + + dependent_upon_resources = set() + + for resource_id, edges in graph.items(): + for edge_resource_id, edge_direction, edge_from_index, edge_to_index, custom_state in edges: + if edge_direction: + dependent_upon_resources.add(edge_resource_id) + + return dependent_upon_resources + + def _remove_graph_edge(self, graph: typing.Dict[str, typing.List[typing.Tuple[str, bool, int, int, typing.Dict]]], + resource_id: str, edge_resource_id: str, edge_direction: bool, edge_from_index: int, edge_to_index: int) -> None: + assert resource_id in graph + + for i, edge in enumerate(graph[resource_id]): + if edge[0:4] == (edge_resource_id, edge_direction, edge_from_index, edge_to_index): + del graph[resource_id][i] + break + + if not graph[resource_id]: + del graph[resource_id] + + +if __name__ == '__main__': + import logging + import pprint + import sys + + logging.basicConfig() + + for dataset_file_path in sys.argv[1:]: + try: + dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=os.path.abspath(dataset_file_path))) + except Exception as error: + raise Exception("Unable to load dataset: {dataset_doc_path}".format(dataset_doc_path=dataset_file_path)) from error + + primitive = DenormalizePrimitive(hyperparams=Hyperparams.defaults().replace({ + 'recursive': True, + 'discard_not_joined_tabular_resources': False, + })) + + try: + denormalized_dataset = primitive.produce(inputs=dataset).value + + pprint.pprint(denormalized_dataset) + denormalized_dataset.metadata.pretty_print() + except Exception as error: + raise Exception("Unable to denormalize dataset: {dataset_doc_path}".format(dataset_doc_path=dataset_file_path)) from error diff --git a/tods/common-primitives/common_primitives/extract_columns.py b/tods/common-primitives/common_primitives/extract_columns.py new file mode 100644 index 0000000..917120c --- /dev/null +++ b/tods/common-primitives/common_primitives/extract_columns.py @@ -0,0 +1,58 @@ +import os +import typing + +from d3m import container, utils as d3m_utils +from d3m.metadata import base as metadata_base, hyperparams +from d3m.primitive_interfaces import base, transformer + +import common_primitives + +__all__ = ('ExtractColumnsPrimitive',) + +Inputs = container.DataFrame +Outputs = container.DataFrame + + +class Hyperparams(hyperparams.Hyperparams): + columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices of columns to extract.", + ) + + +class ExtractColumnsPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): + """ + A primitive which extracts a fixed list of columns. 
+ """ + + metadata = metadata_base.PrimitiveMetadata( + { + 'id': '81d7e261-e25b-4721-b091-a31cd46e99ae', + 'version': '0.1.0', + 'name': "Extracts columns", + 'python_path': 'd3m.primitives.data_transformation.extract_columns.Common', + 'source': { + 'name': common_primitives.__author__, + 'contact': 'mailto:mitar.commonprimitives@tnode.com', + 'uris': [ + 'https://gitlab.com/datadrivendiscovery/common-primitives/blob/master/common_primitives/extract_columns.py', + 'https://gitlab.com/datadrivendiscovery/common-primitives.git', + ], + }, + 'installation': [{ + 'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/common-primitives.git@{git_commit}#egg=common_primitives'.format( + git_commit=d3m_utils.current_git_commit(os.path.dirname(__file__)), + ), + }], + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.ARRAY_SLICING, + ], + 'primitive_family': metadata_base.PrimitiveFamily.DATA_TRANSFORMATION, + }, + ) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + return base.CallResult(inputs.select_columns(self.hyperparams['columns'])) diff --git a/tods/common-primitives/common_primitives/extract_columns_semantic_types.py b/tods/common-primitives/common_primitives/extract_columns_semantic_types.py new file mode 100644 index 0000000..1792080 --- /dev/null +++ b/tods/common-primitives/common_primitives/extract_columns_semantic_types.py @@ -0,0 +1,141 @@ +import os +import typing + +from d3m import container, exceptions, utils as d3m_utils +from d3m.base import utils as base_utils +from d3m.metadata import base as metadata_base, hyperparams +from d3m.primitive_interfaces import base, transformer + +import common_primitives + +__all__ = ('ExtractColumnsBySemanticTypesPrimitive',) + +Inputs = container.DataFrame +Outputs = container.DataFrame + + +class Hyperparams(hyperparams.Hyperparams): + semantic_types = hyperparams.Set( + elements=hyperparams.Hyperparameter[str](''), + default=('https://metadata.datadrivendiscovery.org/types/Attribute',), + min_size=1, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Semantic types to use to extract columns. If any of them matches, by default.", + ) + match_logic = hyperparams.Enumeration( + values=['all', 'any', 'equal'], + default='any', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should a column have all of semantic types in \"semantic_types\" to be extracted, or any of them?", + ) + negate = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should columns which do not match semantic types in \"semantic_types\" be extracted?", + ) + use_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to operate on. If any specified column does not match any semantic type, it is skipped.", + ) + exclude_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not operate on. 
Applicable only if \"use_columns\" is not provided.", + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them.", + ) + + +class ExtractColumnsBySemanticTypesPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): + """ + A primitive which extracts columns from input data based on semantic types provided. + Columns which match any of the listed semantic types are extracted. + + If you want to extract only attributes, you can use ``https://metadata.datadrivendiscovery.org/types/Attribute`` + semantic type (also default). + + For real targets (not suggested targets) use ``https://metadata.datadrivendiscovery.org/types/Target``. + For this to work, columns have to be are marked as targets by the TA2 in a dataset before passing the dataset + through a pipeline. Or something else has to mark them at some point in a pipeline. + + It uses ``use_columns`` and ``exclude_columns`` to control which columns it considers. + """ + + metadata = metadata_base.PrimitiveMetadata( + { + 'id': '4503a4c6-42f7-45a1-a1d4-ed69699cf5e1', + 'version': '0.4.0', + 'name': "Extracts columns by semantic type", + 'python_path': 'd3m.primitives.data_transformation.extract_columns_by_semantic_types.Common', + 'source': { + 'name': common_primitives.__author__, + 'contact': 'mailto:mitar.commonprimitives@tnode.com', + 'uris': [ + 'https://gitlab.com/datadrivendiscovery/common-primitives/blob/master/common_primitives/extract_columns_semantic_types.py', + 'https://gitlab.com/datadrivendiscovery/common-primitives.git', + ], + }, + 'installation': [{ + 'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/common-primitives.git@{git_commit}#egg=common_primitives'.format( + git_commit=d3m_utils.current_git_commit(os.path.dirname(__file__)), + ), + }], + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.ARRAY_SLICING, + ], + 'primitive_family': metadata_base.PrimitiveFamily.DATA_TRANSFORMATION, + }, + ) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + columns_to_use = self._get_columns(inputs.metadata) + + output_columns = inputs.select_columns(columns_to_use) + + outputs = base_utils.combine_columns(inputs, columns_to_use, [output_columns], return_result='new', add_index_columns=self.hyperparams['add_index_columns']) + + return base.CallResult(outputs) + + def _can_use_column(self, inputs_metadata: metadata_base.DataMetadata, column_index: int) -> bool: + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + semantic_types = column_metadata.get('semantic_types', []) + + if self.hyperparams['match_logic'] == 'all': + match = all(semantic_type in semantic_types for semantic_type in self.hyperparams['semantic_types']) + elif self.hyperparams['match_logic'] == 'any': + match = any(semantic_type in semantic_types for semantic_type in self.hyperparams['semantic_types']) + elif self.hyperparams["match_logic"] == "equal": + match = set(semantic_types) == set(self.hyperparams["semantic_types"]) + else: + raise exceptions.UnexpectedValueError("Unknown value of hyper-parameter \"match_logic\": {value}".format(value=self.hyperparams['match_logic'])) + + if self.hyperparams['negate']: + return not match + else: + return match + + def _get_columns(self, inputs_metadata: 
metadata_base.DataMetadata) -> typing.Sequence[int]: + def can_use_column(column_index: int) -> bool: + return self._can_use_column(inputs_metadata, column_index) + + columns_to_use, columns_not_to_use = base_utils.get_columns_to_use(inputs_metadata, self.hyperparams['use_columns'], self.hyperparams['exclude_columns'], can_use_column) + + if not columns_to_use: + raise ValueError("Input data has no columns matching semantic types: {semantic_types}".format( + semantic_types=self.hyperparams['semantic_types'], + )) + + if self.hyperparams['use_columns'] and columns_not_to_use: + self.logger.warning("Not all specified columns match semantic types. Skipping columns: %(columns)s", { + 'columns': columns_not_to_use, + }) + + return columns_to_use diff --git a/tods/common-primitives/common_primitives/extract_columns_structural_types.py b/tods/common-primitives/common_primitives/extract_columns_structural_types.py new file mode 100644 index 0000000..2d9f17d --- /dev/null +++ b/tods/common-primitives/common_primitives/extract_columns_structural_types.py @@ -0,0 +1,135 @@ +import os +import typing + +from d3m import container, exceptions, utils as d3m_utils +from d3m.base import utils as base_utils +from d3m.metadata import base as metadata_base, hyperparams +from d3m.primitive_interfaces import base, transformer + +import common_primitives + +__all__ = ('ExtractColumnsByStructuralTypesPrimitive',) + +Inputs = container.DataFrame +Outputs = container.DataFrame + + +class Hyperparams(hyperparams.Hyperparams): + structural_types = hyperparams.Set( + elements=hyperparams.Hyperparameter[typing.Union[str, type]](''), + default=('str',), + min_size=1, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Structural types to use to extract columns. If any of them matches, by default.", + ) + match_logic = hyperparams.Enumeration( + values=['all', 'any'], + default='any', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should a column have all of structural types in \"structural_types\" to be extracted, or any of them?", + ) + negate = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should columns which do not match structural types in \"structural_types\" be extracted?", + ) + use_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to operate on. If any specified column does not match any structural type, it is skipped.", + ) + exclude_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.", + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them.", + ) + + +class ExtractColumnsByStructuralTypesPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): + """ + A primitive which extracts columns from input data based on structural types provided. + Columns which match any of the listed structural types are extracted. 
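+
+    A minimal usage sketch (assuming ``df`` is a ``container.DataFrame`` with D3M metadata; illustrative only, not part of this file)::
+
+        hyperparams = Hyperparams.defaults().replace({'structural_types': ('str',)})
+        strings_only = ExtractColumnsByStructuralTypesPrimitive(hyperparams=hyperparams).produce(inputs=df).value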
+ + It uses ``use_columns`` and ``exclude_columns`` to control which columns it considers. + """ + + metadata = metadata_base.PrimitiveMetadata( + { + 'id': '79674d68-9b93-4359-b385-7b5f60645b06', + 'version': '0.1.0', + 'name': "Extracts columns by structural type", + 'python_path': 'd3m.primitives.data_transformation.extract_columns_by_structural_types.Common', + 'source': { + 'name': common_primitives.__author__, + 'contact': 'mailto:ychr93@gmail.com', + 'uris': [ + 'https://gitlab.com/datadrivendiscovery/common-primitives/blob/master/common_primitives/extract_columns_structural_types.py', + 'https://gitlab.com/datadrivendiscovery/common-primitives.git', + ], + }, + 'installation': [{ + 'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/common-primitives.git@{git_commit}#egg=common_primitives'.format( + git_commit=d3m_utils.current_git_commit(os.path.dirname(__file__)), + ), + }], + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.ARRAY_SLICING, + ], + 'primitive_family': metadata_base.PrimitiveFamily.DATA_TRANSFORMATION, + }, + ) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + columns_to_use = self._get_columns(inputs.metadata) + + output_columns = inputs.select_columns(columns_to_use) + + outputs = base_utils.combine_columns(inputs, columns_to_use, [output_columns], return_result='new', add_index_columns=self.hyperparams['add_index_columns']) + + return base.CallResult(outputs) + + def _can_use_column(self, inputs_metadata: metadata_base.DataMetadata, column_index: int) -> bool: + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + if 'structural_type' not in column_metadata: + return False + + structural_types = typing.cast(typing.Sequence[typing.Union[str, type]], self.hyperparams['structural_types']) + + if self.hyperparams['match_logic'] == 'all': + match = all(d3m_utils.matches_structural_type(column_metadata['structural_type'], structural_type) for structural_type in structural_types) + elif self.hyperparams['match_logic'] == 'any': + match = any(d3m_utils.matches_structural_type(column_metadata['structural_type'], structural_type) for structural_type in structural_types) + else: + raise exceptions.UnexpectedValueError("Unknown value of hyper-parameter \"match_logic\": {value}".format(value=self.hyperparams['match_logic'])) + + if self.hyperparams['negate']: + return not match + else: + return match + + def _get_columns(self, inputs_metadata: metadata_base.DataMetadata) -> typing.Sequence[int]: + def can_use_column(column_index: int) -> bool: + return self._can_use_column(inputs_metadata, column_index) + + columns_to_use, columns_not_to_use = base_utils.get_columns_to_use(inputs_metadata, self.hyperparams['use_columns'], self.hyperparams['exclude_columns'], can_use_column) + + if not columns_to_use: + raise ValueError("Input data has no columns matching structural types: {structural_types}".format( + structural_types=self.hyperparams['structural_types'], + )) + + if self.hyperparams['use_columns'] and columns_not_to_use: + self.logger.warning("Not all specified columns match structural types. 
Skipping columns: %(columns)s", { + 'columns': columns_not_to_use, + }) + + return columns_to_use diff --git a/tods/common-primitives/common_primitives/fixed_split.py b/tods/common-primitives/common_primitives/fixed_split.py new file mode 100644 index 0000000..d8aa340 --- /dev/null +++ b/tods/common-primitives/common_primitives/fixed_split.py @@ -0,0 +1,124 @@ +import os +import typing + +import numpy # type: ignore +import pandas # type: ignore + +from d3m import container, exceptions, utils as d3m_utils +from d3m.metadata import base as metadata_base, hyperparams + +import common_primitives +from common_primitives import base + +__all__ = ('FixedSplitDatasetSplitPrimitive',) + + +class Hyperparams(hyperparams.Hyperparams): + primary_index_values = hyperparams.Set( + elements=hyperparams.Hyperparameter[str](''), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description='A set of primary index values of the main resource belonging to the test (score) split. Cannot be set together with "row_indices".', + ) + row_indices = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description='A set of row indices of the main resource belonging to the test (score) split. Cannot be set together with "primary_index_values".', + ) + delete_recursive = hyperparams.Hyperparameter[bool]( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Delete rows in other resources/tables which are not needed for rows left in the dataset entry point resource/table.", + ) + + +class FixedSplitDatasetSplitPrimitive(base.TabularSplitPrimitiveBase[Hyperparams]): + """ + A primitive which splits a tabular Dataset in a way that uses for the test + (score) split a fixed list of primary index values or row indices of the main + resource to be used. All other rows are added used for the train split. + """ + + metadata = metadata_base.PrimitiveMetadata( + { + 'id': '1654f000-2178-4520-be4c-a95bc26b8d3a', + 'version': '0.1.0', + 'name': "Fixed split tabular dataset splits", + 'python_path': 'd3m.primitives.evaluation.fixed_split_dataset_split.Commmon', + 'source': { + 'name': common_primitives.__author__, + 'contact': 'mailto:mitar.commonprimitives@tnode.com', + 'uris': [ + 'https://gitlab.com/datadrivendiscovery/common-primitives/blob/master/common_primitives/fixed_split.py', + 'https://gitlab.com/datadrivendiscovery/common-primitives.git', + ], + }, + 'installation': [{ + 'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/common-primitives.git@{git_commit}#egg=common_primitives'.format( + git_commit=d3m_utils.current_git_commit(os.path.dirname(__file__)), + ), + }], + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.DATA_SPLITTING, + ], + 'primitive_family': metadata_base.PrimitiveFamily.EVALUATION, + }, + ) + + def _get_splits(self, attributes: pandas.DataFrame, targets: pandas.DataFrame, dataset: container.Dataset, main_resource_id: str) -> typing.List[typing.Tuple[numpy.ndarray, numpy.ndarray]]: + # This should be handled by "Set" hyper-parameter, but we check it here again just to be sure. 
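+        # Both hyper-parameters are validated before building the split: neither may contain
+        # duplicates and at most one of them may be provided. The selected rows become the test
+        # (score) split and all remaining rows become the train split, returned as a single
+        # (train_indices, score_indices) pair of numpy arrays.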
+ if d3m_utils.has_duplicates(self.hyperparams['primary_index_values']): + raise exceptions.InvalidArgumentValueError("\"primary_index_values\" hyper-parameter has duplicate values.") + if d3m_utils.has_duplicates(self.hyperparams['row_indices']): + raise exceptions.InvalidArgumentValueError("\"row_indices\" hyper-parameter has duplicate values.") + + if self.hyperparams['primary_index_values'] and self.hyperparams['row_indices']: + raise exceptions.InvalidArgumentValueError("Both \"primary_index_values\" and \"row_indices\" cannot be provided.") + + if self.hyperparams['primary_index_values']: + primary_index_values = numpy.array(self.hyperparams['primary_index_values']) + + index_columns = dataset.metadata.get_index_columns(at=(main_resource_id,)) + + if not index_columns: + raise exceptions.InvalidArgumentValueError("Cannot find index columns in the main resource of the dataset, but \"primary_index_values\" is provided.") + + main_resource = dataset[main_resource_id] + # We reset the index so that the index corresponds to row indices. + main_resource = main_resource.reset_index(drop=True) + + # We use just the "d3mIndex" column and ignore multi-key indices. + # This works for now because it seems that every current multi-key + # dataset in fact has an unique value in "d3mIndex" alone. + # See: https://gitlab.datadrivendiscovery.org/MIT-LL/d3m_data_supply/issues/117 + index_column = index_columns[0] + + score_data = numpy.array(main_resource.loc[main_resource.iloc[:, index_column].isin(primary_index_values)].index) + score_data_set = set(score_data) + + assert len(score_data) == len(score_data_set), (len(score_data), len(score_data_set)) + + if len(score_data) != len(primary_index_values): + raise exceptions.InvalidArgumentValueError("\"primary_index_values\" contains values which do not exist.") + + else: + score_data = numpy.array(self.hyperparams['row_indices']) + score_data_set = set(score_data) + + all_data_set = set(numpy.arange(len(attributes))) + + if not score_data_set <= all_data_set: + raise exceptions.InvalidArgumentValueError("\"row_indices\" contains indices which do not exist, e.g., {indices}.".format( + indices=sorted(score_data_set - all_data_set)[:5], + )) + + train_data = [] + for i in numpy.arange(len(attributes)): + if i not in score_data_set: + train_data.append(i) + + assert len(train_data) + len(score_data) == len(attributes), (len(train_data), len(score_data), len(attributes)) + + return [(numpy.array(train_data), score_data)] diff --git a/tods/common-primitives/common_primitives/grouping_field_compose.py b/tods/common-primitives/common_primitives/grouping_field_compose.py new file mode 100644 index 0000000..a5d8b8f --- /dev/null +++ b/tods/common-primitives/common_primitives/grouping_field_compose.py @@ -0,0 +1,101 @@ +import os +import typing + +from d3m import container, utils as d3m_utils +from d3m.metadata import base as metadata_base, hyperparams +from d3m.primitive_interfaces import base, transformer + +import common_primitives + +__all__ = ('GroupingFieldComposePrimitive',) + +Inputs = container.DataFrame +Outputs = container.DataFrame + + +class Hyperparams(hyperparams.Hyperparams): + columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to use when composing a grouping key field.", + ) + join_char = hyperparams.Hyperparameter[str]( + default="|", + 
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
+        description='A string used to join fields.',
+    )
+    output_name = hyperparams.Hyperparameter[str](
+        default="__grouping_key",
+        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
+        description='The name to use for the new grouping key field.',
+    )
+
+
+class GroupingFieldComposePrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]):
+    """
+    A primitive which composes suggested grouping key fields into a new single grouping key field.
+
+    The primitive joins the columns marked with SuggestedGroupingKey type in order. The
+    resulting value is stored in a new column and marked with the GroupingKey type.
+    """
+
+    metadata = metadata_base.PrimitiveMetadata(
+        {
+            'id': '59db88b9-dd81-4e50-8f43-8f2af959560b',
+            'version': '0.1.0',
+            'name': "Grouping Field Compose",
+            'python_path': 'd3m.primitives.data_transformation.grouping_field_compose.Common',
+            'source': {
+                'name': common_primitives.__author__,
+                'contact': 'mailto:cbethune@uncharted.software',
+                'uris': [
+                    'https://gitlab.com/datadrivendiscovery/common-primitives/blob/master/common_primitives/grouping_field_compose.py',
+                    'https://gitlab.com/datadrivendiscovery/common-primitives.git',
+                ],
+            },
+            'installation': [{
+                'type': metadata_base.PrimitiveInstallationType.PIP,
+                'package_uri': 'git+https://gitlab.com/datadrivendiscovery/common-primitives.git@{git_commit}#egg=common_primitives'.format(
+                    git_commit=d3m_utils.current_git_commit(os.path.dirname(__file__)),
+                ),
+            }],
+            'algorithm_types': [
+                metadata_base.PrimitiveAlgorithmType.DATA_CONVERSION,
+            ],
+            'primitive_family': metadata_base.PrimitiveFamily.DATA_TRANSFORMATION,
+        },
+    )
+
+    def _get_suggested_columns(self, inputs: Inputs) -> typing.Sequence[int]:
+        # get every column that has the SuggestedGroupingKey semantic type
+        return inputs.metadata.list_columns_with_semantic_types(['https://metadata.datadrivendiscovery.org/types/SuggestedGroupingKey'])
+
+    def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]:
+        inputs_clone = inputs.copy()
+        columns = self.hyperparams['columns']
+        output_name = self.hyperparams['output_name']
+        join_char = self.hyperparams['join_char']
+
+        # get the columns needing to be joined if not specified by hyperparam
+        if len(columns) == 0:
+            columns = self._get_suggested_columns(inputs_clone)
+
+        # if there are still no columns to join, return the input unchanged
+        if len(columns) == 0:
+            self.logger.warning('no columns to use for grouping key so returning input as output')
+            return base.CallResult(inputs_clone)
+
+        # join the columns using the separator
+        new_col = inputs_clone.iloc[:, list(columns)].apply(lambda x: join_char.join(x), axis=1)
+
+        # append the new column
+        new_col_index = len(inputs_clone.columns)
+        inputs_clone.insert(new_col_index, output_name, new_col)
+
+        # update the metadata as needed
+        inputs_clone.metadata = inputs_clone.metadata.generate(inputs_clone)
+        inputs_clone.metadata = inputs_clone.metadata.add_semantic_type((metadata_base.ALL_ELEMENTS, new_col_index), 'http://schema.org/Text')
+        inputs_clone.metadata = inputs_clone.metadata.add_semantic_type((metadata_base.ALL_ELEMENTS, new_col_index), 'https://metadata.datadrivendiscovery.org/types/GroupingKey')
+
+        return base.CallResult(inputs_clone)
diff --git a/tods/common-primitives/common_primitives/holt_smoothing.py b/tods/common-primitives/common_primitives/holt_smoothing.py new
file mode 100644 index 0000000..12b67e3 --- /dev/null +++ b/tods/common-primitives/common_primitives/holt_smoothing.py @@ -0,0 +1,345 @@ +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os +import sklearn +import numpy +import typing +import pandas as pd +# Custom import commands if any +from sklearn.preprocessing.data import Normalizer +from statsmodels.tsa.api import ExponentialSmoothing, SimpleExpSmoothing, Holt + + +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult,DockerContainer +from d3m.primitive_interfaces.unsupervised_learning import UnsupervisedLearnerPrimitiveBase + +import os +from typing import Any,Optional,List + +from d3m import container, utils as d3m_utils +from d3m.metadata import base as metadata_base +from d3m.metadata import hyperparams,params +from d3m.primitive_interfaces import base, transformer + +Inputs = d3m_dataframe +Outputs = d3m_dataframe + + +class Params(params.Params): + input_column_names: Optional[Any] + target_names_: Optional[Sequence[Any]] + training_indices_: Optional[Sequence[int]] + target_column_indices_: Optional[Sequence[int]] + target_columns_metadata_: Optional[List[OrderedDict]] + + + +class Hyperparams(hyperparams.Hyperparams): + # Added by Mia + endog = hyperparams.Bounded[int]( + lower = 2, + upper = None, + default = 3, + description='Array like time series.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + # keep previous + norm = hyperparams.Enumeration[str]( + default='l2', + values=['l1', 'l2', 'max'], + description='The norm to use to normalize each non zero sample.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + use_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to operate on. If any specified column cannot be parsed, it is skipped.", + ) + exclude_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='new', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. 
Setting this to false makes the code ignore return_result and will produce only the output dataframe", + ) + + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute'], + default='https://metadata.datadrivendiscovery.org/types/Attribute', + description='Decides what semantic type to attach to generated attributes', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + +class HoltSmoothing(UnsupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]): + """ + Holt Smoothing + `statsmodels documentation `_ + + """ + + __author__ = "DataLab@TAMU" + metadata = metadata_base.PrimitiveMetadata({ + "algorithm_types": [metadata_base.PrimitiveAlgorithmType.DATA_NORMALIZATION, ], + "name": "sklearn.preprocessing.data.Normalizer", + "primitive_family": metadata_base.PrimitiveFamily.DATA_PREPROCESSING, + "python_path": "d3m.primitives.anomaly_detection.HoltSmoothing", + "source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.Normalizer.html']}, + "version": "2019.11.13", + "id": "980b3a2d-1574-31f3-8326-ddc62f8fc2c3", + "hyperparams_to_tune": ['norm'], + 'installation': [ + {'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + }] + }) + + def __init__(self, *, + hyperparams: Hyperparams, + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None) -> None: + + super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) + + # False + self._clf = Normalizer( + norm=self.hyperparams['norm'], + ) + + self._inputs = None + self._outputs = None + self._training_inputs = None + self._training_outputs = None + self._target_names = None + self._training_indices = None + self._target_column_indices = None + self._target_columns_metadata: List[OrderedDict] = None + self._input_column_names = None + self._fitted = False + + + def set_training_data(self, *, inputs: Inputs) -> None: + self._inputs = inputs + self._fitted = False + + def fit(self, *, timeout: float = None, iterations: int = None)-> CallResult[None]: + if self._fitted: + return CallResult(None) + + self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + if self._training_inputs is None: + return CallResult(None) + + if len(self._training_indices) > 0: + self._clf.fit(self._training_inputs) + self._fitted = True + else: + if 
self.hyperparams['error_on_no_input']:
+                raise RuntimeError("No input columns were selected")
+            self.logger.warn("No input columns were selected")
+        return CallResult(None)
+
+    def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]:
+        self.logger.info('Holt Smoothing Primitive called')
+        outputs = inputs
+        try:
+            # Smooth either the user-specified columns or, by default, every column
+            # except the index, timestamp and ground-truth columns.
+            if not self.hyperparams['use_columns']:
+                columns_to_smooth = list(set(inputs.columns) - set(['d3mIndex', 'timestamp', 'ground_truth']))
+            else:
+                columns_to_smooth = self.hyperparams['use_columns']
+            for column in columns_to_smooth:
+                outputs[column + "_holt_smoothing"] = Holt(inputs[column]).fit(smoothing_level=0.2, smoothing_slope=0.2, optimized=False).fittedvalues
+        except Exception as e:
+            self.logger.error("Error in calculating Holt smoothing: %s", e)
+        self._update_metadata(outputs)
+
+        return base.CallResult(outputs)
+
+    def _update_metadata(self, outputs):
+        outputs.metadata = outputs.metadata.generate(outputs,)
+
+    def get_params(self) -> Params:
+        if not self._fitted:
+            return Params(
+                input_column_names=self._input_column_names,
+                training_indices_=self._training_indices,
+                target_names_=self._target_names,
+                target_column_indices_=self._target_column_indices,
+                target_columns_metadata_=self._target_columns_metadata
+            )
+
+        return Params(
+            input_column_names=self._input_column_names,
+            training_indices_=self._training_indices,
+            target_names_=self._target_names,
+            target_column_indices_=self._target_column_indices,
+            target_columns_metadata_=self._target_columns_metadata
+        )
+
+    def set_params(self, *, params: Params) -> None:
+        self._input_column_names = params['input_column_names']
+        self._training_indices = params['training_indices_']
+        self._target_names = params['target_names_']
+        self._target_column_indices = params['target_column_indices_']
+        self._target_columns_metadata = params['target_columns_metadata_']
+        self._fitted = True
+
+    @classmethod
+    def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams):
+        if not hyperparams['use_semantic_types']:
+            return inputs, list(range(len(inputs.columns)))
+
+        inputs_metadata = inputs.metadata
+
+        def can_produce_column(column_index: int) -> bool:
+            return cls._can_produce_column(inputs_metadata, column_index, hyperparams)
+
+        columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata,
+                                                                                   use_columns=hyperparams['use_columns'],
+                                                                                   exclude_columns=hyperparams['exclude_columns'],
+                                                                                   can_use_column=can_produce_column)
+        return inputs.iloc[:, columns_to_produce], columns_to_produce
+
+    @classmethod
+    def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool:
+        column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index))
+
+        accepted_structural_types = (int, float, numpy.integer, numpy.float64)
+        accepted_semantic_types = set()
+        accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute")
+        if not issubclass(column_metadata['structural_type'], accepted_structural_types):
+            return False
+
+        semantic_types = set(column_metadata.get('semantic_types', []))
+
+        if len(semantic_types) == 0:
+            cls.logger.warning("No semantic types found in column metadata")
+            return False
+
+ # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + + @classmethod + def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]: + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) + + # Update semantic types and prepare it for predicted targets. + semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set([]) + add_semantic_types = [] + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + outputs = d3m_dataframe(predictions, generate_metadata=True) + target_columns_metadata = self._copy_inputs_metadata(inputs.metadata, self._training_indices, outputs.metadata, self.hyperparams) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, target_columns_metadata) + return outputs + + + @classmethod + def _copy_inputs_metadata(cls, inputs_metadata: metadata_base.DataMetadata, input_indices: List[int], + outputs_metadata: metadata_base.DataMetadata, hyperparams): + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + target_columns_metadata: List[OrderedDict] = [] + for column_index in input_indices: + column_name = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name") + if column_name is None: + column_name = "output_{}".format(column_index) + + column_metadata = OrderedDict(inputs_metadata.query_column(column_index)) + semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set([]) + add_semantic_types = set() + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + # If outputs has more columns than index, add Attribute Type to all remaining + if outputs_length > len(input_indices): + for column_index in range(len(input_indices), outputs_length): + column_metadata = OrderedDict() + semantic_types = set() + semantic_types.add(hyperparams["return_semantic_type"]) + column_name = "output_{}".format(column_index) + column_metadata["semantic_types"] = list(semantic_types) + column_metadata["name"] = 
str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + +HoltSmoothing.__doc__ = Normalizer.__doc__ diff --git a/tods/common-primitives/common_primitives/holt_winters_exponential_smoothing.py b/tods/common-primitives/common_primitives/holt_winters_exponential_smoothing.py new file mode 100644 index 0000000..deca752 --- /dev/null +++ b/tods/common-primitives/common_primitives/holt_winters_exponential_smoothing.py @@ -0,0 +1,345 @@ +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os +import sklearn +import numpy +import typing +import pandas as pd +# Custom import commands if any +from sklearn.preprocessing.data import Normalizer +from statsmodels.tsa.api import ExponentialSmoothing, SimpleExpSmoothing, Holt + + +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult,DockerContainer +from d3m.primitive_interfaces.unsupervised_learning import UnsupervisedLearnerPrimitiveBase + +import os +from typing import Any,Optional,List + +from d3m import container, utils as d3m_utils +from d3m.metadata import base as metadata_base +from d3m.metadata import hyperparams,params +from d3m.primitive_interfaces import base, transformer + +Inputs = d3m_dataframe +Outputs = d3m_dataframe + + +class Params(params.Params): + input_column_names: Optional[Any] + target_names_: Optional[Sequence[Any]] + training_indices_: Optional[Sequence[int]] + target_column_indices_: Optional[Sequence[int]] + target_columns_metadata_: Optional[List[OrderedDict]] + + + +class Hyperparams(hyperparams.Hyperparams): + # Added by Mia + endog = hyperparams.Bounded[int]( + lower = 2, + upper = None, + default = 3, + description='Array like time seires.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + # keep previous + norm = hyperparams.Enumeration[str]( + default='l2', + values=['l1', 'l2', 'max'], + description='The norm to use to normalize each non zero sample.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + use_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to operate on. If any specified column cannot be parsed, it is skipped.", + ) + exclude_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='new', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? 
This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute'], + default='https://metadata.datadrivendiscovery.org/types/Attribute', + description='Decides what semantic type to attach to generated attributes', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + +class HoltWintersExponentialSmoothing(UnsupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]): + """ + HoltWinter Exponential Smoothing + `Statsmodels documentation `_ + + """ + + __author__ = "DataLab@TAMU" + metadata = metadata_base.PrimitiveMetadata({ + "algorithm_types": [metadata_base.PrimitiveAlgorithmType.DATA_NORMALIZATION, ], + "name": "sklearn.preprocessing.data.Normalizer", + "primitive_family": metadata_base.PrimitiveFamily.DATA_PREPROCESSING, + "python_path": "d3m.primitives.anomaly_detection.HoltWintersExponentialSmoothing", + "source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.Normalizer.html']}, + "version": "2019.11.13", + "id": "980b3a2d-1574-31f3-8326-ddc62f8fc2c3", + "hyperparams_to_tune": ['norm'], + 'installation': [ + {'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + }] + }) + + def __init__(self, *, + hyperparams: Hyperparams, + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None) -> None: + + super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) + + # False + self._clf = Normalizer( + norm=self.hyperparams['norm'], + ) + + self._inputs = None + self._outputs = None + self._training_inputs = None + self._training_outputs = None + self._target_names = None + self._training_indices = None + self._target_column_indices = None + self._target_columns_metadata: List[OrderedDict] = None + self._input_column_names = None + self._fitted = False + + + def set_training_data(self, *, inputs: Inputs) -> None: + self._inputs = inputs + self._fitted = False + + def fit(self, *, timeout: float = None, iterations: int = None)-> CallResult[None]: + if self._fitted: + return 
CallResult(None) + + self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + if self._training_inputs is None: + return CallResult(None) + + if len(self._training_indices) > 0: + self._clf.fit(self._training_inputs) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + return CallResult(None) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + + self.logger.info('Holt Winters Smoothing Primitive called') + outputs = inputs + try: + columns_to_calculate_simple_exponential_smoothing= List[str] + if(self.hyperparams['use_columns']==()): + columns_to_calculate_simple_exponential_smoothing = list(set(inputs.columns)-set(['d3mIndex','timestamp','ground_truth'])) + else: + columns_to_calculate_simple_exponential_smoothing = self.hyperparams['use_columns'] + for column in columns_to_calculate_simple_exponential_smoothing: + outputs[column+"_holt_winters_smoothing"] = ExponentialSmoothing(inputs[column], seasonal_periods=4, trend = 'add', seasonal='mul').fit(use_boxcox=True).fittedvalues + except Exception as e: + self.logger.error("Error in Calculating Holt Winters smoothing",e) + self._update_metadata(outputs) + print(inputs) + print("-------------") + print(outputs) + + return base.CallResult(outputs) + + def _update_metadata(self, outputs): + outputs.metadata = outputs.metadata.generate(outputs,) + + + def get_params(self) -> Params: + if not self._fitted: + return Params( + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + return Params( + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + def set_params(self, *, params: Params) -> None: + self._input_column_names = params['input_column_names'] + self._training_indices = params['training_indices_'] + self._target_names = params['target_names_'] + self._target_column_indices = params['target_column_indices_'] + self._target_columns_metadata = params['target_columns_metadata_'] + self._fitted = True + + + + + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=hyperparams['use_columns'], + exclude_columns=hyperparams['exclude_columns'], + can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, numpy.integer, 
numpy.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + + @classmethod + def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]: + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) + + # Update semantic types and prepare it for predicted targets. + semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set([]) + add_semantic_types = [] + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + outputs = d3m_dataframe(predictions, generate_metadata=True) + target_columns_metadata = self._copy_inputs_metadata(inputs.metadata, self._training_indices, outputs.metadata, self.hyperparams) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, target_columns_metadata) + return outputs + + + @classmethod + def _copy_inputs_metadata(cls, inputs_metadata: metadata_base.DataMetadata, input_indices: List[int], + outputs_metadata: metadata_base.DataMetadata, hyperparams): + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + target_columns_metadata: List[OrderedDict] = [] + for column_index in input_indices: + column_name = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name") + if column_name is None: + column_name = "output_{}".format(column_index) + + column_metadata = OrderedDict(inputs_metadata.query_column(column_index)) + semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set([]) + add_semantic_types = set() + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + # If outputs has more 
columns than index, add Attribute Type to all remaining + if outputs_length > len(input_indices): + for column_index in range(len(input_indices), outputs_length): + column_metadata = OrderedDict() + semantic_types = set() + semantic_types.add(hyperparams["return_semantic_type"]) + column_name = "output_{}".format(column_index) + column_metadata["semantic_types"] = list(semantic_types) + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + +HoltWintersExponentialSmoothing.__doc__ = Normalizer.__doc__ diff --git a/tods/common-primitives/common_primitives/horizontal_concat.py b/tods/common-primitives/common_primitives/horizontal_concat.py new file mode 100644 index 0000000..4427a56 --- /dev/null +++ b/tods/common-primitives/common_primitives/horizontal_concat.py @@ -0,0 +1,78 @@ +import os +import typing + +from d3m import container, utils as d3m_utils +from d3m.metadata import base as metadata_base, hyperparams +from d3m.primitive_interfaces import base, transformer + +import common_primitives + +__all__ = ('HorizontalConcatPrimitive',) + +Inputs = container.DataFrame +Outputs = container.DataFrame + + +class Hyperparams(hyperparams.Hyperparams): + use_index = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Use primary index columns in both DataFrames (if they exist) to match rows in proper order. Otherwise, concatination happens on the order of rows in input DataFrames.", + ) + remove_second_index = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="When both input DataFrames have primary index columns, remove second index columns from the result." + " When \"use_index\" is \"True\", second index columns are redundant because they are equal to the first ones (assuming equal metadata).", + ) + + +class HorizontalConcatPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): + """ + A primitive which concatenates two DataFrames horizontally. + + It has some heuristics how it tries to match up primary index columns in the case that there are + multiple of them, but generally it aligns samples by all primary index columns. + + It is required that both DataFrames have the same number of samples. 
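+
+    A minimal usage sketch (assuming ``left_df`` and ``right_df`` are ``container.DataFrame`` objects with D3M metadata; illustrative only, not part of this file)::
+
+        concat = HorizontalConcatPrimitive(hyperparams=Hyperparams.defaults())
+        combined = concat.produce(left=left_df, right=right_df).value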
+ """ + + metadata = metadata_base.PrimitiveMetadata( + { + 'id': 'aff6a77a-faa0-41c5-9595-de2e7f7c4760', + 'version': '0.2.0', + 'name': "Concatenate two dataframes", + 'python_path': 'd3m.primitives.data_transformation.horizontal_concat.DataFrameCommon', + 'source': { + 'name': common_primitives.__author__, + 'contact': 'mailto:mitar.commonprimitives@tnode.com', + 'uris': [ + 'https://gitlab.com/datadrivendiscovery/common-primitives/blob/master/common_primitives/horizontal_concat.py', + 'https://gitlab.com/datadrivendiscovery/common-primitives.git', + ], + }, + 'installation': [{ + 'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/common-primitives.git@{git_commit}#egg=common_primitives'.format( + git_commit=d3m_utils.current_git_commit(os.path.dirname(__file__)), + ), + }], + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.ARRAY_CONCATENATION, + ], + 'primitive_family': metadata_base.PrimitiveFamily.DATA_TRANSFORMATION, + }, + ) + + def produce(self, *, left: Inputs, right: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: # type: ignore + return base.CallResult(left.horizontal_concat( + right, + use_index=self.hyperparams['use_index'], + remove_second_index=self.hyperparams['remove_second_index'], + )) + + def multi_produce(self, *, produce_methods: typing.Sequence[str], left: Inputs, right: Inputs, timeout: float = None, iterations: int = None) -> base.MultiCallResult: # type: ignore + return self._multi_produce(produce_methods=produce_methods, timeout=timeout, iterations=iterations, left=left, right=right) + + def fit_multi_produce(self, *, produce_methods: typing.Sequence[str], left: Inputs, right: Inputs, timeout: float = None, iterations: int = None) -> base.MultiCallResult: # type: ignore + return self._fit_multi_produce(produce_methods=produce_methods, timeout=timeout, iterations=iterations, left=left, right=right) diff --git a/tods/common-primitives/common_primitives/kfold_split.py b/tods/common-primitives/common_primitives/kfold_split.py new file mode 100644 index 0000000..e6b80c4 --- /dev/null +++ b/tods/common-primitives/common_primitives/kfold_split.py @@ -0,0 +1,94 @@ +import os +import typing + +import numpy # type: ignore +import pandas # type: ignore +from sklearn import model_selection # type: ignore + +from d3m import container, exceptions, utils as d3m_utils +from d3m.metadata import base as metadata_base, hyperparams + +import common_primitives +from common_primitives import base + +__all__ = ('KFoldDatasetSplitPrimitive',) + + +class Hyperparams(hyperparams.Hyperparams): + number_of_folds = hyperparams.Bounded[int]( + lower=2, + upper=None, + default=5, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Number of folds for k-folds cross-validation.", + ) + stratified = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Do stratified folds. 
The folds are made by preserving the percentage of samples for each class.", + ) + shuffle = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Whether to shuffle the data before splitting into batches.", + ) + delete_recursive = hyperparams.Hyperparameter[bool]( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Delete rows in other resources/tables which are not needed for rows left in the dataset entry point resource/table.", + ) + + +class KFoldDatasetSplitPrimitive(base.TabularSplitPrimitiveBase[Hyperparams]): + """ + A primitive which splits a tabular Dataset for k-fold cross-validation. + """ + + __author__ = 'Mingjie Sun ' + metadata = metadata_base.PrimitiveMetadata( + { + 'id': 'bfedaf3a-6dd0-4a83-ad83-3a50fe882bf8', + 'version': '0.1.0', + 'name': "K-fold cross-validation tabular dataset splits", + 'python_path': 'd3m.primitives.evaluation.kfold_dataset_split.Common', + 'source': { + 'name': common_primitives.__author__, + 'contact': 'mailto:sunmj15@gmail.com', + 'uris': [ + 'https://gitlab.com/datadrivendiscovery/common-primitives/blob/master/common_primitives/kfold_split.py', + 'https://gitlab.com/datadrivendiscovery/common-primitives.git', + ], + }, + 'installation': [{ + 'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/common-primitives.git@{git_commit}#egg=common_primitives'.format( + git_commit=d3m_utils.current_git_commit(os.path.dirname(__file__)), + ), + }], + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.K_FOLD, + metadata_base.PrimitiveAlgorithmType.CROSS_VALIDATION, + metadata_base.PrimitiveAlgorithmType.DATA_SPLITTING, + ], + 'primitive_family': metadata_base.PrimitiveFamily.EVALUATION, + }, + ) + + def _get_splits(self, attributes: pandas.DataFrame, targets: pandas.DataFrame, dataset: container.Dataset, main_resource_id: str) -> typing.List[typing.Tuple[numpy.ndarray, numpy.ndarray]]: + if self.hyperparams['stratified']: + if not len(targets.columns): + raise exceptions.InvalidArgumentValueError("Stratified split is requested, but no target columns found.") + + k_fold = model_selection.StratifiedKFold( + n_splits=self.hyperparams['number_of_folds'], + shuffle=self.hyperparams['shuffle'], + random_state=self._random_state, + ) + else: + k_fold = model_selection.KFold( + n_splits=self.hyperparams['number_of_folds'], + shuffle=self.hyperparams['shuffle'], + random_state=self._random_state, + ) + + return list(k_fold.split(attributes, targets)) diff --git a/tods/common-primitives/common_primitives/kfold_split_timeseries.py b/tods/common-primitives/common_primitives/kfold_split_timeseries.py new file mode 100644 index 0000000..1ab5e6c --- /dev/null +++ b/tods/common-primitives/common_primitives/kfold_split_timeseries.py @@ -0,0 +1,198 @@ +import os +import uuid +import typing +from collections import OrderedDict + +import numpy # type: ignore +import pandas # type: ignore +from sklearn import model_selection # type: ignore + +from d3m import container, exceptions, utils as d3m_utils +from d3m.metadata import base as metadata_base, hyperparams + +import common_primitives +from common_primitives import base, utils + +__all__ = ('KFoldTimeSeriesSplitPrimitive',) + + +class Hyperparams(hyperparams.Hyperparams): + number_of_folds = hyperparams.Bounded[int]( + lower=2, + upper=None, + default=5, + semantic_types=[ + 
'https://metadata.datadrivendiscovery.org/types/ControlParameter' + ], + description="Number of folds for k-folds cross-validation.", + ) + number_of_window_folds = hyperparams.Union[typing.Union[int, None]]( + configuration=OrderedDict( + fixed=hyperparams.Bounded[int]( + lower=1, + upper=None, + default=1, + description="Number of folds in train set (window). These folds come directly " + "before test set (streaming window).", + ), + all_records=hyperparams.Constant( + default=None, + description="Number of folds in train set (window) = maximum number possible.", + ), + ), + default='all_records', + semantic_types=[ + 'https://metadata.datadrivendiscovery.org/types/ControlParameter' + ], + description="Maximum size for a single training set.", + ) + time_column_index = hyperparams.Union[typing.Union[int, None]]( + configuration=OrderedDict( + fixed=hyperparams.Bounded[int]( + lower=1, + upper=None, + default=1, + description="Specific column that contains the time index", + ), + one_column=hyperparams.Constant( + default=None, + description="Only one column contains a time index. " + "It is detected automatically using semantic types.", + ), + ), + default='one_column', + semantic_types=[ + 'https://metadata.datadrivendiscovery.org/types/ControlParameter' + ], + description="Column index to use as datetime index. " + "If None, it is required that only one column with time column role semantic type is " + "present and otherwise an exception is raised. " + "If column index specified is not a datetime column an exception is" + "also raised.", + ) + fuzzy_time_parsing = hyperparams.UniformBool( + default=True, + semantic_types=[ + 'https://metadata.datadrivendiscovery.org/types/ControlParameter' + ], + description="Use fuzzy time parsing.", + ) + + +class KFoldTimeSeriesSplitPrimitive(base.TabularSplitPrimitiveBase[Hyperparams]): + """ + A primitive which splits a tabular time-series Dataset for k-fold cross-validation. + + Primitive sorts the time column so care should be taken to assure sorting of a + column is reasonable. E.g., if column is not numeric but of string structural type, + strings should be formatted so that sorting by them also sorts by time. 
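+    Illustrative sketch (scikit-learn's ``TimeSeriesSplit``, which this primitive
+    delegates to below): with six time-ordered rows and three folds, the
+    train/test indices form expanding, forward-chaining windows::
+
+        import numpy as np
+        from sklearn.model_selection import TimeSeriesSplit
+
+        for train, test in TimeSeriesSplit(n_splits=3).split(np.arange(6)):
+            print(train, test)
+        # [0 1 2] [3]
+        # [0 1 2 3] [4]
+        # [0 1 2 3 4] [5]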
+ """ + + __author__ = 'Distil' + __version__ = '0.3.0' + __contact__ = 'mailto:jeffrey.gleason@yonder.co' + + metadata = metadata_base.PrimitiveMetadata( + { + 'id': '002f9ad1-46e3-40f4-89ed-eeffbb3a102b', + 'version': __version__, + 'name': "K-fold cross-validation timeseries dataset splits", + 'python_path': 'd3m.primitives.evaluation.kfold_time_series_split.Common', + 'source': { + 'name': common_primitives.__author__, + 'contact': __contact__, + 'uris': [ + 'https://gitlab.com/datadrivendiscovery/common-primitives/blob/master/common_primitives/kfold_split_timeseries.py', + 'https://gitlab.com/datadrivendiscovery/common-primitives.git', + ], + }, + 'installation': [ + { + 'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/common-primitives.git@{git_commit}#egg=common_primitives'.format( + git_commit=d3m_utils.current_git_commit( + os.path.dirname(__file__) + ), + ), + } + ], + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.K_FOLD, + metadata_base.PrimitiveAlgorithmType.CROSS_VALIDATION, + metadata_base.PrimitiveAlgorithmType.DATA_SPLITTING, + ], + 'primitive_family': metadata_base.PrimitiveFamily.EVALUATION, + }, + ) + + def _get_splits(self, attributes: pandas.DataFrame, targets: pandas.DataFrame, dataset: container.Dataset, main_resource_id: str) -> typing.List[typing.Tuple[numpy.ndarray, numpy.ndarray]]: + time_column_indices = dataset.metadata.list_columns_with_semantic_types(['https://metadata.datadrivendiscovery.org/types/Time'], at=(main_resource_id,)) + attribute_column_indices = dataset.metadata.list_columns_with_semantic_types(['https://metadata.datadrivendiscovery.org/types/Attribute'], at=(main_resource_id,)) + + # We want only time columns which are also attributes. + time_column_indices = [time_column_index for time_column_index in time_column_indices if time_column_index in attribute_column_indices] + + if self.hyperparams['time_column_index'] is None: + if len(time_column_indices) != 1: + raise exceptions.InvalidArgumentValueError( + "If \"time_column_index\" hyper-parameter is \"None\", it is required that exactly one column with time column role semantic type is present.", + ) + else: + # We know it exists because "time_column_indices" is a subset of "attribute_column_indices". + time_column_index = attribute_column_indices.index( + time_column_indices[0], + ) + else: + if self.hyperparams['time_column_index'] not in time_column_indices: + raise exceptions.InvalidArgumentValueError( + "Time column index specified does not have a time column role semantic type.", + ) + else: + time_column_index = attribute_column_indices.index( + self.hyperparams['time_column_index'], + ) + + # We first reset index. + attributes = attributes.reset_index(drop=True) + + # Then convert datetime column to consistent datetime representation + attributes.insert( + loc=0, + column=uuid.uuid4(), # use uuid to ensure we are inserting a new column name + value=self._parse_time_data( + attributes, time_column_index, self.hyperparams['fuzzy_time_parsing'], + ), + ) + + # Then sort dataframe by new datetime column. Index contains original row order. + attributes = attributes.sort_values(by=attributes.columns[0]) + + # Remove datetime representation used for sorting (primitives might choose to parse this str col differently). 
+ attributes = attributes.drop(attributes.columns[0], axis=1) + + if self.hyperparams['number_of_window_folds'] is not None: + max_train_size = int(attributes.shape[0] * self.hyperparams['number_of_window_folds'] / self.hyperparams['number_of_folds']) + else: + max_train_size = None + + k_fold = model_selection.TimeSeriesSplit( + n_splits=self.hyperparams['number_of_folds'], + max_train_size=max_train_size + ) + + # We sorted "attributes" so we have to map indices on sorted "attributes" back to original + # indices. We do that by using DataFrame's index which contains original row order. + return [ + ( + numpy.array([attributes.index[val] for val in train]), + numpy.array([attributes.index[val] for val in test]), + ) + for train, test in k_fold.split(attributes) + ] + + @classmethod + def _parse_time_data(cls, inputs: container.DataFrame, column_index: metadata_base.SimpleSelectorSegment, fuzzy: bool) -> typing.List[float]: + return [ + utils.parse_datetime_to_float(value, fuzzy=fuzzy) + for value in inputs.iloc[:, column_index] + ] diff --git a/tods/common-primitives/common_primitives/lgbm_classifier.py b/tods/common-primitives/common_primitives/lgbm_classifier.py new file mode 100644 index 0000000..ccd2f5f --- /dev/null +++ b/tods/common-primitives/common_primitives/lgbm_classifier.py @@ -0,0 +1,658 @@ +import math +import os +from collections import OrderedDict +from typing import cast, Dict, List, Union, Sequence, Optional, Tuple, Callable + +import lightgbm as lgbm # type: ignore +import numpy as np # type: ignore +import pandas as pd # type: ignore +from sklearn.multioutput import MultiOutputClassifier # type: ignore +from sklearn.preprocessing import LabelEncoder # type: ignore + +from d3m import container, exceptions, utils as d3m_utils +from d3m.base import utils as base_utils +from d3m.metadata import base as metadata_base, hyperparams, params +from d3m.primitive_interfaces.base import CallResult, ProbabilisticCompositionalityMixin, SamplingCompositionalityMixin, \ + ContinueFitMixin +from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase + +import common_primitives + +Inputs = container.DataFrame +Outputs = container.DataFrame + + +class Params(params.Params): + booster: Optional[Union[lgbm.basic.Booster, List[lgbm.basic.Booster]]] + estimators: Optional[Union[List[lgbm.LGBMClassifier], lgbm.LGBMClassifier]] + classes: Optional[Union[np.ndarray, List[np.ndarray]]] + n_classes: Optional[Union[int, List[int]]] + n_features: Optional[Union[int, List[int]]] + objective: Optional[Union[str, Callable]] + multi_output_estimator_dict: Optional[Dict] + target_columns_names: Optional[List[str]] + target_columns_metadata: Optional[List[OrderedDict]] + le: Optional[LabelEncoder] + attribute_columns_names: Optional[List[str]] + + +class Hyperparams(hyperparams.Hyperparams): + n_estimators = hyperparams.UniformInt( + lower=1, + upper=10000, + default=100, + description='The number of trees in the forest.', + semantic_types=[ + 'https://metadata.datadrivendiscovery.org/types/TuningParameter', + 'https://metadata.datadrivendiscovery.org/types/ResourcesUseParameter', + ], + ) + n_more_estimators = hyperparams.UniformInt( + lower=1, + upper=10000, + default=100, + description='When continuing a fit, it controls how many more trees to add every time.', + semantic_types=[ + 'https://metadata.datadrivendiscovery.org/types/TuningParameter', + 'https://metadata.datadrivendiscovery.org/types/ResourcesUseParameter', + ], + ) + max_depth = hyperparams.Union[Union[int, 
None]]( + configuration=OrderedDict( + limit=hyperparams.Bounded[int]( + lower=1, + upper=None, + default=5, + ), + unlimited=hyperparams.Enumeration[int]( + values=[-1], + default=-1, + ), + ), + default='limit', + description='The maximum depth of the tree.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + num_leaves_base = hyperparams.Bounded[float]( + lower=1, + upper=2, + default=2, + description='Maximum tree leaves for base learners, this value is the base of the formula num_leaves_base^(max_depth)', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + subsample_for_bin = hyperparams.Bounded[int]( + lower=1, + upper=None, + default=200000, + description='number of data that sampled to construct histogram bins', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + learning_rate = hyperparams.Uniform( + lower=0, + upper=1, + default=0.1, + description=r'Boosting learning rate (xgb\`s \"eta\")', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + min_child_weight = hyperparams.Bounded[int]( + lower=0, + upper=None, + default=1, + description='Minimum sum of instance weight (hessian) needed in a child. If the tree partition step results ' + 'in a leaf node with the sum of instance weight less than min_child_weight, then the building ' + 'process will give up further partitioning ', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + min_child_samples = hyperparams.Bounded[int]( + lower=0, + upper=None, + default=20, + description='minimal number of data in one leaf. Can be used to deal with over-fitting', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + max_delta_step = hyperparams.Union[Union[int, None]]( + configuration=OrderedDict( + limit=hyperparams.Bounded[int]( + lower=1, + # TODO: 1-10 instead? + upper=None, + default=1, + description='Maximum delta step we allow each leaf output to be.' + ), + unlimited=hyperparams.Enumeration[int]( + values=[0], + default=0, + description='No constraint.', + ), + ), + default='unlimited', + description='Maximum delta step we allow.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + # TODO: better way to represent lower bound is exclusive? + subsample = hyperparams.Uniform( + lower=0.0001, + upper=1, + default=1, + upper_inclusive=True, + description='Subsample ratio of the training instances,this will prevent overfitting. Subsampling will occur ' + 'once in every boosting iteration.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + subsample_freq = hyperparams.Bounded[int]( + lower=0, + upper=1, + default=0, + description='frequency for bagging', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + colsample_bytree = hyperparams.Bounded[float]( + lower=0, + upper=1, + default=1, + description='Subsample ratio of columns when constructing each tree. 
Subsampling will occur once in every ' + 'boosting iteration', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + min_split_gain = hyperparams.Bounded[float]( + lower=0, + upper=None, + default=0, + description='the minimal gain to perform split', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + reg_lambda = hyperparams.Bounded[float]( + lower=0, + upper=None, + default=1, + description='L2 regularization term on weights. Increasing this value will make model more conservative.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + reg_alpha = hyperparams.Bounded[float]( + lower=0, + upper=None, + default=0, + description='L1 regularization term on weights. Increasing this value will make model more conservative.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + n_jobs = hyperparams.Union[Union[int, None]]( + configuration=OrderedDict( + limit=hyperparams.Bounded[int]( + lower=1, + upper=None, + default=1, + ), + all_cores=hyperparams.Enumeration[int]( + values=[-1], + default=-1, + description='The number of jobs is set to the number of cores.', + ), + ), + default='limit', + description='The number of jobs to run in parallel for both "fit" and "produce".', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ResourcesUseParameter'], + ) + use_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of inputs column indices to force primitive to operate on. If any specified column cannot be used, it is skipped.", + ) + exclude_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of inputs column indices to not operate on. Applicable only if \"use_columns\" is not provided.", + ) + use_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of outputs column indices to force primitive to operate on. If any specified column cannot be used, it is skipped.", + ) + exclude_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of outputs column indices to not operate on. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + # Default value depends on the nature of the primitive. + default='append', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should resulting columns be appended, should they replace original columns, or should only resulting columns be returned?", + ) + add_index_columns = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. 
Applicable only if \"return_result\" is set to \"new\".", + ) + + +class LightGBMClassifierPrimitive(ProbabilisticCompositionalityMixin[Inputs, Outputs, Params, Hyperparams], + SamplingCompositionalityMixin[Inputs, Outputs, Params, Hyperparams], + ContinueFitMixin[Inputs, Outputs, Params, Hyperparams], + SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]): + """ + A lightGBM classifier using ``lgbm.LGBMClassifier``. + + It uses semantic types to determine which columns to operate on. + """ + __author__ = 'TAMU DARPA D3M Team, TsungLin Yang ' + metadata = metadata_base.PrimitiveMetadata( + { + 'id': '259aa747-795c-435e-8e33-8c32a4c83c6b', + 'version': '0.1.0', + 'name': "LightGBM GBTree classifier", + 'python_path': 'd3m.primitives.classification.light_gbm.Common', + 'keywords': ['lightgbm', 'decision tree', 'gradient boosted trees', ], + 'source': { + 'name': common_primitives.__author__, + 'contact': 'mailto:lin.yang@tamu.edu', + 'uris': [ + 'https://gitlab.com/datadrivendiscovery/common-primitives.git', + ], + }, + 'installation': [{ + 'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/common-primitives.git@{git_commit}#egg=common_primitives'.format( + git_commit=d3m_utils.current_git_commit(os.path.dirname(__file__)), + ), + }], + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.GRADIENT_BOOSTING, + ], + 'primitive_family': metadata_base.PrimitiveFamily.CLASSIFICATION, + 'hyperparams_to_tune': [ + 'learning_rate', + 'colsample_bytree', + 'min_child_weight', + 'subsample', + 'max_depth', + 'max_delta_step' + ] + } + ) + + def __init__(self, *, hyperparams: Hyperparams, random_seed: int = 0, _verbose: int = 0) -> None: + super().__init__(hyperparams=hyperparams, random_seed=random_seed) + + # We need random seed multiple times (every time an underlying "RandomForestClassifier" is instantiated), + # and when we sample. So instead we create our own random state we use everywhere. + self._random_state = np.random.RandomState(self.random_seed) + self._verbose = _verbose + self._training_inputs: Inputs = None + self._training_outputs: Outputs = None + self._new_training_data = False + self._learner: Union[lgbm.LGBMClassifier, MultiOutputClassifier] = None + self._multi_output_estimator_dict: Dict = {} + self._target_columns_metadata: List[OrderedDict] = None + self._attribute_columns_names: List[str] = None + self._target_columns_names: List[str] = None + + def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None: + self._training_inputs = inputs + self._training_outputs = outputs + self._new_training_data = True + + def _create_learner(self) -> None: + # TODO: temporarily deal with the dependency between max_depth and num_leaves. 
When max_depth is not limited, + # set num_leaves to default value 31 + num_leaves = math.floor(pow(self.hyperparams['num_leaves_base'], self.hyperparams['max_depth'])) if \ + self.hyperparams['max_depth'] else 31 + self._learner = lgbm.LGBMClassifier( + n_estimators=self.hyperparams['n_estimators'], + max_depth=self.hyperparams['max_depth'], + num_leaves=num_leaves, + subsample_for_bin=self.hyperparams['subsample_for_bin'], + learning_rate=self.hyperparams['learning_rate'], + min_child_weight=self.hyperparams['min_child_weight'], + min_child_samples=self.hyperparams['min_child_samples'], + max_delta_step=self.hyperparams['max_delta_step'], + subsample=self.hyperparams['subsample'], + subsample_freq=self.hyperparams['subsample_freq'], + min_split_gain=self.hyperparams['min_split_gain'], + colsample_bytree=self.hyperparams['colsample_bytree'], + reg_lambda=self.hyperparams['reg_lambda'], + reg_alpha=self.hyperparams['reg_alpha'], + n_jobs=-1 if self.hyperparams['n_jobs'] is None else self.hyperparams['n_jobs'], + random_state=self.random_seed, + boosting_type='gbdt', + verbose=self._verbose - 1 + ) + + def _get_target_columns_metadata(self, outputs_metadata: metadata_base.DataMetadata) -> List[OrderedDict]: + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) + + # Update semantic types and prepare it for predicted targets. + semantic_types = list(column_metadata.get('semantic_types', [])) + if 'https://metadata.datadrivendiscovery.org/types/PredictedTarget' not in semantic_types: + semantic_types.append('https://metadata.datadrivendiscovery.org/types/PredictedTarget') + semantic_types = [semantic_type for semantic_type in semantic_types if + semantic_type != 'https://metadata.datadrivendiscovery.org/types/TrueTarget'] + column_metadata['semantic_types'] = semantic_types + + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + def _store_target_columns_metadata(self, inputs: Inputs, outputs: Outputs) -> None: + self._attribute_columns_names = list(inputs.columns) + self._target_columns_metadata = self._get_target_columns_metadata(outputs.metadata) + self._target_columns_names = list(outputs.columns) + + def _cast_to_category_type(self, data: container.DataFrame) -> container.DataFrame: + cat_cols = data.metadata.get_columns_with_semantic_type( + 'https://metadata.datadrivendiscovery.org/types/CategoricalData') + if cat_cols: + data.iloc[:, cat_cols] = data.iloc[:, cat_cols].astype('category') + return data + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + if self._training_inputs is None or self._training_outputs is None: + raise exceptions.InvalidStateError("Missing training data.") + + # An optimization. Do not refit if data has not changed. + if not self._new_training_data: + return CallResult(None) + self._new_training_data = False + + inputs, _ = self._select_inputs_columns(self._training_inputs) + outputs, _ = self._select_outputs_columns(self._training_outputs) + # cast categorical feature column to pandas category type + inputs = self._cast_to_category_type(inputs) + self._create_learner() + # A special case for sklearn. It prefers an 1D array instead of 2D when there is only one target. 
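+        # Only a single target column is supported: the check below rejects
+        # multi-column targets, since ``lgbm.LGBMClassifier`` is fit on one
+        # label vector at a time.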
+ if outputs.shape[1] > 1: + raise exceptions.InvalidArgumentValueError('Multioutput is not supported by LGBM classifier primitive') + + self._learner.fit(inputs, outputs) + + self._store_target_columns_metadata(inputs, outputs) + return CallResult(None) + + def continue_fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + if self._training_inputs is None or self._training_outputs is None: + raise exceptions.InvalidStateError("Missing training data.") + + # This model is not improving fitting if called multiple times on the same data. + if not self._new_training_data: + return CallResult(None) + self._new_training_data = False + + if not self._learner: + self._create_learner() + + inputs, _ = self._select_inputs_columns(self._training_inputs) + inputs = self._cast_to_category_type(inputs) + outputs, _ = self._select_outputs_columns(self._training_outputs) + + # using lightgbm api to continue fit the classifier. + def continue_lgb_booster(lgb_model: lgbm.LGBMClassifier, inputs: Inputs, + output_values: Union[np.ndarray, Outputs], num_of_boosting_round: int) -> None: + label = LabelEncoder().fit_transform(output_values) + train_set = lgbm.Dataset(data=inputs, label=label) + model_param = lgb_model.get_params() + + del model_param['n_estimators'], model_param['silent'], model_param['importance_type'] + model_param['objective'] = lgb_model.objective_ + model_param['num_class'] = lgb_model.n_classes_ + booster = lgbm.train(params=model_param, train_set=train_set, + num_boost_round=num_of_boosting_round, + init_model=lgb_model.booster_, keep_training_booster=True) + lgb_model.set_params(Booster=booster) + + # A special case for sklearn. It prefers an 1D array instead of 2D when there is only one target. + if outputs.ndim == 2 and outputs.shape[1] == 1: + continue_lgb_booster(self._learner, inputs, np.ravel(outputs), self.hyperparams['n_more_estimators']) + else: + raise exceptions.InvalidArgumentValueError('Multioutput is not supported by LGBM classifier primitive') + # # TODO Currently doesn't support unseen target for continuing multi-output classification. 
+ + self._store_target_columns_metadata(inputs, outputs) + + return CallResult(None) + + def _update_predictions_metadata(self, outputs: Optional[Outputs], target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + outputs_metadata = metadata_base.DataMetadata() + if outputs is not None: + outputs_metadata = outputs_metadata.generate(outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, predictions: np.ndarray) -> Outputs: + outputs = container.DataFrame(predictions, generate_metadata=False) + outputs.metadata = self._update_predictions_metadata(outputs, self._target_columns_metadata) + outputs.columns = self._target_columns_names + return outputs + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + if not self._learner: + raise exceptions.PrimitiveNotFittedError("Primitive not fitted.") + + selected_inputs, columns_to_use = self._select_inputs_columns(inputs) + selected_inputs = self._cast_to_category_type(selected_inputs) + + predictions = self._learner.predict(selected_inputs) + + output_columns = [self._wrap_predictions(predictions)] + + outputs = base_utils.combine_columns(inputs, columns_to_use, output_columns, + return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns']) + + return CallResult(outputs) + + def produce_feature_importances(self, *, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + if not self._learner: + raise exceptions.PrimitiveNotFittedError("Primitive not fitted.") + # TODO: is feature importances the same for every target? 
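+        # LightGBM's default importance type is 'split' (how many times a feature
+        # is used across the trees); dividing by the sum below rescales the raw
+        # counts into fractions that sum to 1.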
+ feature_importances = self._learner.feature_importances_ + feature_importances = feature_importances / sum(feature_importances) + feature_importances_array = feature_importances.reshape((1, len(self._attribute_columns_names))) + + feature_importances = container.DataFrame(feature_importances_array, generate_metadata=True) + feature_importances.columns = self._attribute_columns_names + for k in range(len(self._attribute_columns_names)): + feature_importances.metadata = feature_importances.metadata.update_column(k, { + 'name': self._attribute_columns_names[k]}) + + return CallResult(feature_importances) + + def sample(self, *, inputs: Inputs, num_samples: int = 1, timeout: float = None, iterations: int = None) -> \ + CallResult[Sequence[Outputs]]: + if not self._learner: + raise exceptions.PrimitiveNotFittedError("Primitive not fitted.") + + inputs, _ = self._select_inputs_columns(inputs) + inputs = self._cast_to_category_type(inputs) + + samples = [] + for i in range(num_samples): + predictions = self._learner.predict(inputs) + samples.append(self._wrap_predictions(predictions)) + + return CallResult(samples) + + def log_likelihoods(self, *, outputs: Outputs, inputs: Inputs, timeout: float = None, iterations: int = None) -> \ + CallResult[Outputs]: + if not self._learner: + raise exceptions.PrimitiveNotFittedError("Primitive not fitted.") + + outputs, _ = self._select_outputs_columns(outputs) + inputs, _ = self._select_inputs_columns(inputs) + inputs = self._cast_to_category_type(inputs) + log_proba = np.log(self._learner.predict_proba(inputs)) + + if outputs.shape[1] > 1: + raise exceptions.InvalidArgumentValueError('Multioutput is not supported by LGBM classifier primitive') + + samples_length = inputs.shape[0] + + # We have to map each class to its internal (numerical) index used in the learner. + # This allows "outputs" to contain string classes. + outputs_column = outputs.iloc[:, 0] + classes_map = pd.Series(np.arange(len(self._learner.classes_)), index=self._learner.classes_) + mapped_outputs_column = outputs_column.map(classes_map) + + # For each target column (column in "outputs"), for each sample (row) we pick the log + # likelihood for a given class. 
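+        # For example, with three samples whose mapped class indices are [2, 0, 1],
+        # this picks log_proba[0, 2], log_proba[1, 0] and log_proba[2, 1].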
+ log_likelihood = log_proba[np.arange(samples_length), mapped_outputs_column] + + result = container.DataFrame(log_likelihood, generate_metadata=True) + result.columns = outputs.columns + + column_metadata = outputs.metadata.query_column(0) + if 'name' in column_metadata: + result.metadata = result.metadata.update_column(0, {'name': column_metadata['name']}) + + return CallResult(result) + + def get_params(self) -> Params: + if not self._learner: + return Params( + estimators=None, + booster=None, + classes=None, + n_classes=None, + n_features=None, + objective=None, + multi_output_estimator_dict=None, + target_columns_metadata=None, + ) + + return Params( + estimators=self._learner.estimators_ if isinstance(self._learner, MultiOutputClassifier) else self._learner, + booster=self._learner.booster_ if not isinstance(self._learner, MultiOutputClassifier) else [ + estimator.booster_ for estimator in self._learner.estimators_], + classes=self._learner.classes_ + if not isinstance(self._learner, MultiOutputClassifier) else [estimator.classes_ for estimator in + self._learner.estimators_], + n_classes=self._learner.n_classes_ + if not isinstance(self._learner, MultiOutputClassifier) else [estimator.n_classes_ for estimator in + self._learner.estimators_], + n_features=self._learner.n_features_ + if not isinstance(self._learner, MultiOutputClassifier) else [estimator.n_features_ for estimator in + self._learner.estimators_], + objective=self._learner.objective_ + if not isinstance(self._learner, MultiOutputClassifier) else self._learner.estimators_[0].objective, + multi_output_estimator_dict=self._multi_output_estimator_dict + if isinstance(self._learner, MultiOutputClassifier) else {}, + target_columns_names=self._target_columns_names, + attribute_columns_names=self._attribute_columns_names, + target_columns_metadata=self._target_columns_metadata, + le=self._learner._le if not isinstance(self._learner, MultiOutputClassifier) else None + + ) + + def set_params(self, *, params: Params) -> None: + if not all(params[param] is not None for param in + ['booster', 'objective', 'classes', 'n_classes', 'n_features', 'target_columns_metadata']) or \ + not params['estimators']: + self._learner = None + else: + if isinstance(self._learner, MultiOutputClassifier): + self._learner.estimators_ = params['estimators'] + self._multi_output_estimator_dict = params['multi_output_estimator_dict'] + else: + self._create_learner() + lgbm_param = params.copy() + del lgbm_param['estimators'], lgbm_param['target_columns_metadata'], \ + lgbm_param['multi_output_estimator_dict'] + lgbm_param['Booster'] = lgbm_param.pop('booster') + self._learner._le = params['le'] + self._learner.set_params(**lgbm_param) + self._target_columns_metadata = params['target_columns_metadata'] + self._attribute_columns_names = params['attribute_columns_names'] + self._target_columns_names = params['target_columns_names'] + + def __getstate__(self) -> dict: + state = super().__getstate__() + + # Random state is not part of the "Params", but it is part of the state we want to + # pickle and unpickle to have full reproducibility. So we have to add it ourselves here. + # This is also difference between pickling/unpickling and "get_params"/"set_params". + # The later saves only the model state which is useful to produce at a later time, but + # if we want to also reproduce the exact sequence of values, we should be using pickling. 
+ state['random_state'] = self._random_state + + return state + + def __setstate__(self, state: dict) -> None: + super().__setstate__(state) + + self._random_state = state['random_state'] + + def _can_use_inputs_column(self, inputs_metadata: metadata_base.DataMetadata, column_index: int) -> bool: + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + # if not d3m_utils.is_numeric(column_metadata['structural_type']): + # return False + # + return 'https://metadata.datadrivendiscovery.org/types/Attribute' in column_metadata.get('semantic_types', + []) + + def _get_inputs_columns(self, inputs_metadata: metadata_base.DataMetadata) -> List[int]: + def can_use_column(column_index: int) -> bool: + return self._can_use_inputs_column(inputs_metadata, column_index) + + columns_to_use, columns_not_to_use = base_utils.get_columns_to_use( + inputs_metadata, + self.hyperparams['use_inputs_columns'], + self.hyperparams['exclude_inputs_columns'], + can_use_column, + ) + + if not columns_to_use: + raise ValueError("No inputs columns.") + + if self.hyperparams['use_inputs_columns'] and columns_not_to_use: + self.logger.warning("Not all specified inputs columns can used. Skipping columns: %(columns)s", { + 'columns': columns_not_to_use, + }) + + return columns_to_use + + def _can_use_outputs_column(self, outputs_metadata: metadata_base.DataMetadata, column_index: int) -> bool: + column_metadata = outputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + return 'https://metadata.datadrivendiscovery.org/types/TrueTarget' in column_metadata.get('semantic_types', []) + + def _get_outputs_columns(self, outputs_metadata: metadata_base.DataMetadata) -> List[int]: + def can_use_column(column_index: int) -> bool: + return self._can_use_outputs_column(outputs_metadata, column_index) + + columns_to_use, columns_not_to_use = base_utils.get_columns_to_use( + outputs_metadata, + self.hyperparams['use_outputs_columns'], + self.hyperparams['exclude_outputs_columns'], + can_use_column, + ) + + if not columns_to_use: + raise ValueError("No outputs columns.") + + if self.hyperparams['use_outputs_columns'] and columns_not_to_use: + self.logger.warning("Not all specified outputs columns can used. 
Skipping columns: %(columns)s", { + 'columns': columns_not_to_use, + }) + + return columns_to_use + + def _select_inputs_columns(self, inputs: Inputs) -> Tuple[Inputs, List[int]]: + columns_to_use = self._get_inputs_columns(inputs.metadata) + + return inputs.select_columns(columns_to_use), columns_to_use + + def _select_outputs_columns(self, outputs: Outputs) -> Tuple[Outputs, List[int]]: + columns_to_use = self._get_outputs_columns(outputs.metadata) + + return outputs.select_columns(columns_to_use), columns_to_use diff --git a/tods/common-primitives/common_primitives/list_to_dataframe.py b/tods/common-primitives/common_primitives/list_to_dataframe.py new file mode 100644 index 0000000..f37545e --- /dev/null +++ b/tods/common-primitives/common_primitives/list_to_dataframe.py @@ -0,0 +1,53 @@ +import os + +from d3m import container, utils as d3m_utils +from d3m.metadata import base as metadata_base +from d3m.metadata import hyperparams +from d3m.primitive_interfaces import base, transformer + +import common_primitives + +__all__ = ('ListToDataFramePrimitive',) + +Inputs = container.List +Outputs = container.DataFrame + + +class Hyperparams(hyperparams.Hyperparams): + pass + + +class ListToDataFramePrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): + """ + A primitive which converts a list into a pandas dataframe. + """ + + metadata = metadata_base.PrimitiveMetadata( + { + 'id': 'dd4598cf-2384-438a-a264-f6c77185132b', + 'version': '0.1.0', + 'name': "List to DataFrame converter", + 'python_path': 'd3m.primitives.data_transformation.list_to_dataframe.Common', + 'source': { + 'name': common_primitives.__author__, + 'contact': 'mailto:mitar.commonprimitives@tnode.com', + 'uris': [ + 'https://gitlab.com/datadrivendiscovery/common-primitives/blob/master/common_primitives/list_to_dataframe.py', + 'https://gitlab.com/datadrivendiscovery/common-primitives.git', + ], + }, + 'installation': [{ + 'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/common-primitives.git@{git_commit}#egg=common_primitives'.format( + git_commit=d3m_utils.current_git_commit(os.path.dirname(__file__)), + ), + }], + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.DATA_CONVERSION, + ], + 'primitive_family': metadata_base.PrimitiveFamily.DATA_TRANSFORMATION, + }, + ) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + return base.CallResult(container.DataFrame(inputs, generate_metadata=True)) diff --git a/tods/common-primitives/common_primitives/list_to_ndarray.py b/tods/common-primitives/common_primitives/list_to_ndarray.py new file mode 100644 index 0000000..c0fe4fd --- /dev/null +++ b/tods/common-primitives/common_primitives/list_to_ndarray.py @@ -0,0 +1,78 @@ +import os +import typing + +from d3m import container, utils as d3m_utils +from d3m.metadata import base as metadata_base +from d3m.metadata import hyperparams +from d3m.primitive_interfaces import base, transformer + +import common_primitives + +__all__ = ('ListToNDArrayPrimitive',) + +Inputs = container.List +Outputs = container.ndarray + + +class Hyperparams(hyperparams.Hyperparams): + pass + + +class ListToNDArrayPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): + """ + A primitive which converts a list into a numpy array. 
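+    Illustrative sketch (assuming a ``container.List`` of equal-length rows, as
+    used elsewhere in these primitives)::
+
+        from d3m import container
+
+        data = container.List([[1, 2], [3, 4]], generate_metadata=True)
+        array = container.ndarray(data, generate_metadata=True)  # shape (2, 2)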
+ """ + + metadata = metadata_base.PrimitiveMetadata( + { + 'id': '40ff1396-0725-4cf9-b7b9-c6eca6237f65', + 'version': '0.1.0', + 'name': "List to ndarray converter", + 'python_path': 'd3m.primitives.data_transformation.list_to_ndarray.Common', + 'source': { + 'name': common_primitives.__author__, + 'contact': 'mailto:mitar.commonprimitives@tnode.com', + 'uris': [ + 'https://gitlab.com/datadrivendiscovery/common-primitives/blob/master/common_primitives/list_to_ndarray.py', + 'https://gitlab.com/datadrivendiscovery/common-primitives.git', + ], + }, + 'installation': [{ + 'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/common-primitives.git@{git_commit}#egg=common_primitives'.format( + git_commit=d3m_utils.current_git_commit(os.path.dirname(__file__)), + ), + }], + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.DATA_CONVERSION, + ], + 'primitive_family': metadata_base.PrimitiveFamily.DATA_TRANSFORMATION, + }, + ) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + dataframe = container.ndarray(inputs, generate_metadata=True) + + # TODO: Remove once fixed in core package and released. + # See: https://gitlab.com/datadrivendiscovery/d3m/issues/144 + dataframe.metadata = self._update_metadata(dataframe.metadata) + + return base.CallResult(dataframe) + + def _update_metadata(self, inputs_metadata: metadata_base.DataMetadata) -> metadata_base.DataMetadata: + outputs_metadata = inputs_metadata + + selector: metadata_base.ListSelector = [metadata_base.ALL_ELEMENTS] + + while 'structural_type' in outputs_metadata.query(selector): + metadata = outputs_metadata.query(selector) + if issubclass(metadata['structural_type'], (container.List, container.ndarray)): + outputs_metadata = outputs_metadata.update(selector, { + 'structural_type': metadata_base.NO_VALUE, + }) + else: + break + + selector.append(metadata_base.ALL_ELEMENTS) + + return outputs_metadata.set_table_metadata() diff --git a/tods/common-primitives/common_primitives/mean_average_transform.py b/tods/common-primitives/common_primitives/mean_average_transform.py new file mode 100644 index 0000000..1f15530 --- /dev/null +++ b/tods/common-primitives/common_primitives/mean_average_transform.py @@ -0,0 +1,348 @@ +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os +import sklearn +import numpy +import typing +import pandas as pd +# Custom import commands if any +from sklearn.preprocessing.data import Normalizer + + +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult, DockerContainer +from d3m.primitive_interfaces.unsupervised_learning import UnsupervisedLearnerPrimitiveBase + + +Inputs = d3m_dataframe +Outputs = d3m_dataframe + + +class Params(params.Params): + input_column_names: Optional[Any] + target_names_: Optional[Sequence[Any]] + training_indices_: Optional[Sequence[int]] + target_column_indices_: Optional[Sequence[int]] + target_columns_metadata_: Optional[List[OrderedDict]] + + + +class Hyperparams(hyperparams.Hyperparams): + window_size = hyperparams.Bounded[int]( + lower = 2, 
+ upper = None, + default = 3, + description='Size of moving window.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + norm = hyperparams.Enumeration[str]( + default='l2', + values=['l1', 'l2', 'max'], + description='The norm to use to normalize each non zero sample.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + use_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to operate on. If any specified column cannot be parsed, it is skipped.", + ) + exclude_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='new', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. 
To prevent pipelines from breaking set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute'], + default='https://metadata.datadrivendiscovery.org/types/Attribute', + description='Decides what semantic type to attach to generated attributes', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + +class MeanAverageTransform(UnsupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]): + """ + Primitive wrapping for sklearn Normalizer + `sklearn documentation `_ + + """ + + __author__ = "JPL MARVIN" + metadata = metadata_base.PrimitiveMetadata({ + "algorithm_types": [metadata_base.PrimitiveAlgorithmType.DATA_NORMALIZATION, ], + "name": "sklearn.preprocessing.data.Normalizer", + "primitive_family": metadata_base.PrimitiveFamily.DATA_PREPROCESSING, + "python_path": "d3m.primitives.anomaly_detection.MeanAverageTransform", + "source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.Normalizer.html']}, + "version": "2019.11.13", + "id": "980b3a2d-1574-31f3-8326-ddc62f8fc2c3", + "hyperparams_to_tune": ['norm'], + 'installation': [ + {'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + }] + }) + + def __init__(self, *, + hyperparams: Hyperparams, + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None) -> None: + + super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) + + # False + self._clf = Normalizer( + norm=self.hyperparams['norm'], + ) + + self._inputs = None + self._outputs = None + self._training_inputs = None + self._training_outputs = None + self._target_names = None + self._training_indices = None + self._target_column_indices = None + self._target_columns_metadata: List[OrderedDict] = None + self._input_column_names = None + self._fitted = False + + + def set_training_data(self, *, inputs: Inputs) -> None: + self._inputs = inputs + self._fitted = False + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + # if self._fitted: + # return CallResult(None) + + # self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams) + # self._input_column_names = self._training_inputs.columns + + # if self._training_inputs is None: + # return CallResult(None) + + # if len(self._training_indices) > 0: + # self._clf.fit(self._training_inputs) + # self._fitted = True + # else: + # if self.hyperparams['error_on_no_input']: + # raise RuntimeError("No input columns were selected") + # self.logger.warn("No input columns were selected") + return CallResult(None) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + # if not self._fitted: + # raise PrimitiveNotFittedError("Primitive not fitted.") + + window_size = self.hyperparams['window_size'] + print('window_size', window_size) + print('inputs-----------------------------------------------') + print(inputs) + + output = inputs.rolling(3, on = 'timestamp').mean() + print('output 
---------------------------------------------') + print(output) + return(output) + + # sk_inputs = inputs + # if self.hyperparams['use_semantic_types']: + # sk_inputs = inputs.iloc[:, self._training_indices] + # output_columns = [] + # if len(self._training_indices) > 0: + # sk_output = self._clf.transform(sk_inputs) + # if sparse.issparse(sk_output): + # sk_output = sk_output.toarray() + # outputs = self._wrap_predictions(inputs, sk_output) + # if len(outputs.columns) == len(self._input_column_names): + # outputs.columns = self._input_column_names + # output_columns = [outputs] + # else: + # if self.hyperparams['error_on_no_input']: + # raise RuntimeError("No input columns were selected") + # self.logger.warn("No input columns were selected") + # outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + # add_index_columns=self.hyperparams['add_index_columns'], + # inputs=inputs, column_indices=self._training_indices, + # columns_list=output_columns) + # return CallResult(outputs) + + + def get_params(self) -> Params: + if not self._fitted: + return Params( + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + return Params( + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + def set_params(self, *, params: Params) -> None: + self._input_column_names = params['input_column_names'] + self._training_indices = params['training_indices_'] + self._target_names = params['target_names_'] + self._target_column_indices = params['target_column_indices_'] + self._target_columns_metadata = params['target_columns_metadata_'] + self._fitted = True + + + + + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=hyperparams['use_columns'], + exclude_columns=hyperparams['exclude_columns'], + can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, numpy.integer, numpy.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + + 
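+    # A minimal sketch (illustrative only) of a produce step that honours the
+    # ``window_size`` hyper-parameter rather than a hard-coded window of 3:
+    #
+    #   window_size = self.hyperparams['window_size']
+    #   output = inputs.rolling(window_size, on='timestamp').mean()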
@classmethod + def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]: + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) + + # Update semantic types and prepare it for predicted targets. + semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set([]) + add_semantic_types = [] + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + outputs = d3m_dataframe(predictions, generate_metadata=True) + target_columns_metadata = self._copy_inputs_metadata(inputs.metadata, self._training_indices, outputs.metadata, self.hyperparams) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, target_columns_metadata) + return outputs + + + @classmethod + def _copy_inputs_metadata(cls, inputs_metadata: metadata_base.DataMetadata, input_indices: List[int], + outputs_metadata: metadata_base.DataMetadata, hyperparams): + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + target_columns_metadata: List[OrderedDict] = [] + for column_index in input_indices: + column_name = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name") + if column_name is None: + column_name = "output_{}".format(column_index) + + column_metadata = OrderedDict(inputs_metadata.query_column(column_index)) + semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set([]) + add_semantic_types = set() + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + # If outputs has more columns than index, add Attribute Type to all remaining + if outputs_length > len(input_indices): + for column_index in range(len(input_indices), outputs_length): + column_metadata = OrderedDict() + semantic_types = set() + semantic_types.add(hyperparams["return_semantic_type"]) + column_name = "output_{}".format(column_index) + column_metadata["semantic_types"] = list(semantic_types) + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + +MeanAverageTransform.__doc__ = Normalizer.__doc__ diff --git 
a/tods/common-primitives/common_primitives/ndarray_to_dataframe.py b/tods/common-primitives/common_primitives/ndarray_to_dataframe.py new file mode 100644 index 0000000..11faa99 --- /dev/null +++ b/tods/common-primitives/common_primitives/ndarray_to_dataframe.py @@ -0,0 +1,64 @@ +import os + +from d3m import container, utils as d3m_utils +from d3m.metadata import base as metadata_base +from d3m.metadata import hyperparams +from d3m.primitive_interfaces import base, transformer + +import common_primitives + +__all__ = ('NDArrayToDataFramePrimitive',) + +Inputs = container.ndarray +Outputs = container.DataFrame + + +class Hyperparams(hyperparams.Hyperparams): + pass + + +class NDArrayToDataFramePrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): + """ + A primitive which converts numpy array into a pandas dataframe. + """ + + metadata = metadata_base.PrimitiveMetadata( + { + 'id': 'f5241b2e-64f7-44ad-9675-df3d08066437', + 'version': '0.1.0', + 'name': "ndarray to Dataframe converter", + 'python_path': 'd3m.primitives.data_transformation.ndarray_to_dataframe.Common', + 'source': { + 'name': common_primitives.__author__, + 'contact': 'mailto:mitar.commonprimitives@tnode.com', + 'uris': [ + 'https://gitlab.com/datadrivendiscovery/common-primitives/blob/master/common_primitives/ndarray_to_dataframe.py', + 'https://gitlab.com/datadrivendiscovery/common-primitives.git', + ], + }, + 'installation': [{ + 'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/common-primitives.git@{git_commit}#egg=common_primitives'.format( + git_commit=d3m_utils.current_git_commit(os.path.dirname(__file__)), + ), + }], + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.DATA_CONVERSION, + ], + 'primitive_family': metadata_base.PrimitiveFamily.DATA_TRANSFORMATION, + }, + ) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + metadata = inputs.metadata.query((metadata_base.ALL_ELEMENTS,)) + + if 'dimension' in metadata: + # Extract the column names so we can add them to the created dataframe, or set it to index string + num_cols = inputs.metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + col_names = [inputs.metadata.query((metadata_base.ALL_ELEMENTS, i)).get('name', str(i)) for i in range(num_cols)] + else: + col_names = None + + # create a dataframe from the numpy array + dataframe = container.DataFrame(inputs, columns=col_names, generate_metadata=True) + return base.CallResult(dataframe) diff --git a/tods/common-primitives/common_primitives/ndarray_to_list.py b/tods/common-primitives/common_primitives/ndarray_to_list.py new file mode 100644 index 0000000..87f126d --- /dev/null +++ b/tods/common-primitives/common_primitives/ndarray_to_list.py @@ -0,0 +1,53 @@ +import os + +from d3m import container, utils as d3m_utils +from d3m.metadata import base as metadata_base +from d3m.metadata import hyperparams +from d3m.primitive_interfaces import base, transformer + +import common_primitives + +__all__ = ('NDArrayToListPrimitive',) + +Inputs = container.ndarray +Outputs = container.List + + +class Hyperparams(hyperparams.Hyperparams): + pass + + +class NDArrayToListPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): + """ + A primitive which converts a numpy array into a list. 
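+
+    A minimal usage sketch (illustrative; assumes ``data`` is already a ``container.ndarray``):
+
+        primitive = NDArrayToListPrimitive(hyperparams=Hyperparams.defaults())
+        outputs = primitive.produce(inputs=data).value  # a ``container.List`` built from ``data``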
+ """ + + metadata = metadata_base.PrimitiveMetadata( + { + 'id': 'b5101331-64b4-451a-beb6-260b40d1436b', + 'version': '0.1.0', + 'name': "ndarray to list converter", + 'python_path': 'd3m.primitives.data_transformation.ndarray_to_list.Common', + 'source': { + 'name': common_primitives.__author__, + 'contact': 'mailto:mitar.commonprimitives@tnode.com', + 'uris': [ + 'https://gitlab.com/datadrivendiscovery/common-primitives/blob/master/common_primitives/ndarray_to_list.py', + 'https://gitlab.com/datadrivendiscovery/common-primitives.git', + ], + }, + 'installation': [{ + 'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/common-primitives.git@{git_commit}#egg=common_primitives'.format( + git_commit=d3m_utils.current_git_commit(os.path.dirname(__file__)), + ), + }], + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.DATA_CONVERSION, + ], + 'primitive_family': metadata_base.PrimitiveFamily.DATA_TRANSFORMATION, + }, + ) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + return base.CallResult(container.List(inputs, generate_metadata=True)) diff --git a/tods/common-primitives/common_primitives/no_split.py b/tods/common-primitives/common_primitives/no_split.py new file mode 100644 index 0000000..d0c90ce --- /dev/null +++ b/tods/common-primitives/common_primitives/no_split.py @@ -0,0 +1,59 @@ +import os +import typing + +import numpy # type: ignore +import pandas # type: ignore + +from d3m import container, utils as d3m_utils +from d3m.metadata import base as metadata_base, hyperparams + +import common_primitives +from common_primitives import base + +__all__ = ('NoSplitDatasetSplitPrimitive',) + + +class Hyperparams(hyperparams.Hyperparams): + pass + + +class NoSplitDatasetSplitPrimitive(base.TabularSplitPrimitiveBase[Hyperparams]): + """ + A primitive which splits a tabular Dataset in a way that for all splits it + produces the same (full) Dataset. Useful for unsupervised learning tasks. . + """ + + metadata = metadata_base.PrimitiveMetadata( + { + 'id': '48c683ad-da9e-48cf-b3a0-7394dba5e5d2', + 'version': '0.1.0', + 'name': "No-split tabular dataset splits", + 'python_path': 'd3m.primitives.evaluation.no_split_dataset_split.Common', + 'source': { + 'name': common_primitives.__author__, + 'contact': 'mailto:mitar.commonprimitives@tnode.com', + 'uris': [ + 'https://gitlab.com/datadrivendiscovery/common-primitives/blob/master/common_primitives/no_split.py', + 'https://gitlab.com/datadrivendiscovery/common-primitives.git', + ], + }, + 'installation': [{ + 'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/common-primitives.git@{git_commit}#egg=common_primitives'.format( + git_commit=d3m_utils.current_git_commit(os.path.dirname(__file__)), + ), + }], + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.IDENTITY_FUNCTION, + metadata_base.PrimitiveAlgorithmType.DATA_SPLITTING, + ], + 'primitive_family': metadata_base.PrimitiveFamily.EVALUATION, + }, + ) + + def _get_splits(self, attributes: pandas.DataFrame, targets: pandas.DataFrame, dataset: container.Dataset, main_resource_id: str) -> typing.List[typing.Tuple[numpy.ndarray, numpy.ndarray]]: + # We still go through the whole splitting process to assure full compatibility + # (and error conditions) of a regular split, but we use all data for both splits. 
+ all_data = numpy.arange(len(attributes)) + + return [(all_data, all_data)] diff --git a/tods/common-primitives/common_primitives/normalize_column_references.py b/tods/common-primitives/common_primitives/normalize_column_references.py new file mode 100644 index 0000000..a6b88c9 --- /dev/null +++ b/tods/common-primitives/common_primitives/normalize_column_references.py @@ -0,0 +1,112 @@ +import collections +import os + +from d3m import container, utils as d3m_utils +from d3m.metadata import base as metadata_base, hyperparams +from d3m.primitive_interfaces import base, transformer + +import common_primitives + +__all__ = ('NormalizeColumnReferencesPrimitive',) + +Inputs = container.Dataset +Outputs = container.Dataset + + +class Hyperparams(hyperparams.Hyperparams): + pass + + +class NormalizeColumnReferencesPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): + """ + A primitive which converts all column references (``foreign_key``, ``boundary_for``, ``confidence_for``) + found in a dataset to be by column index and not by column name. + + It is useful to do this at the beginning of the pipeline because it is easier to maintain references + by column index as data and metadata is being changed by the pipeline. + + See for more information `this issue`_. + + .. _this issue: https://gitlab.com/datadrivendiscovery/d3m/issues/343 + """ + + metadata = metadata_base.PrimitiveMetadata( + { + 'id': '2ee36ea4-5ec3-4c18-909f-9e157fb6d18f', + 'version': '0.1.0', + 'name': "Normalize column references", + 'python_path': 'd3m.primitives.data_transformation.normalize_column_references.Common', + 'source': { + 'name': common_primitives.__author__, + 'contact': 'mailto:mitar.commonprimitives@tnode.com', + 'uris': [ + 'https://gitlab.com/datadrivendiscovery/common-primitives/blob/master/common_primitives/normalize_column_references.py', + 'https://gitlab.com/datadrivendiscovery/common-primitives.git', + ], + }, + 'installation': [{ + 'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/common-primitives.git@{git_commit}#egg=common_primitives'.format( + git_commit=d3m_utils.current_git_commit(os.path.dirname(__file__)), + ), + }], + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.DATA_CONVERSION, + ], + 'primitive_family': metadata_base.PrimitiveFamily.DATA_TRANSFORMATION, + }, + ) + + def __init__(self, *, hyperparams: Hyperparams) -> None: + super().__init__(hyperparams=hyperparams) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + outputs = inputs.copy() + + for resource_id, resource in inputs.items(): + if not isinstance(resource, container.DataFrame): + continue + + for column_element in inputs.metadata.get_elements((resource_id, metadata_base.ALL_ELEMENTS,)): + column_metadata = inputs.metadata.query((resource_id, metadata_base.ALL_ELEMENTS, column_element)) + + if 'confidence_for' in column_metadata and 'column_names' in column_metadata['confidence_for']: + confidence_for = collections.OrderedDict(column_metadata['confidence_for']) + column_reference_resource_id = confidence_for.get('resource_id', resource_id) + + confidence_for['column_indices'] = [ + inputs.metadata.get_column_index_from_column_name(column_name, at=(column_reference_resource_id,)) + for column_name in confidence_for['column_names'] + ] + + confidence_for['column_names'] = metadata_base.NO_VALUE + + outputs.metadata = outputs.metadata.update((resource_id, 
metadata_base.ALL_ELEMENTS, column_element), { + 'confidence_for': confidence_for, + }) + + if 'boundary_for' in column_metadata and 'column_name' in column_metadata['boundary_for']: + boundary_for = collections.OrderedDict(column_metadata['boundary_for']) + column_reference_resource_id = boundary_for.get('resource_id', resource_id) + + boundary_for['column_index'] = inputs.metadata.get_column_index_from_column_name(boundary_for['column_name'], at=(column_reference_resource_id,)) + + boundary_for['column_name'] = metadata_base.NO_VALUE + + outputs.metadata = outputs.metadata.update((resource_id, metadata_base.ALL_ELEMENTS, column_element), { + 'boundary_for': boundary_for, + }) + + if 'foreign_key' in column_metadata and column_metadata['foreign_key']['type'] == 'COLUMN' and 'column_name' in column_metadata['foreign_key']: + foreign_key = collections.OrderedDict(column_metadata['foreign_key']) + column_reference_resource_id = foreign_key['resource_id'] + + foreign_key['column_index'] = inputs.metadata.get_column_index_from_column_name(foreign_key['column_name'], at=(column_reference_resource_id,)) + + foreign_key['column_name'] = metadata_base.NO_VALUE + + outputs.metadata = outputs.metadata.update((resource_id, metadata_base.ALL_ELEMENTS, column_element), { + 'foreign_key': foreign_key, + }) + + return base.CallResult(outputs) diff --git a/tods/common-primitives/common_primitives/normalize_graphs.py b/tods/common-primitives/common_primitives/normalize_graphs.py new file mode 100644 index 0000000..8b2ace3 --- /dev/null +++ b/tods/common-primitives/common_primitives/normalize_graphs.py @@ -0,0 +1,360 @@ +import os +import typing + +import networkx # type: ignore +import pandas # type: ignore + +from d3m import container, exceptions, utils as d3m_utils +from d3m.metadata import base as metadata_base, hyperparams +from d3m.primitive_interfaces import base, transformer + +import common_primitives + +__all__ = ('NormalizeGraphsPrimitive',) + +Inputs = container.Dataset +Outputs = container.Dataset + + +class Hyperparams(hyperparams.Hyperparams): + pass + + +class NormalizeGraphsPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): + """ + A primitive which converts all graphs found in a dataset into a standard two-table representation + (one of nodes and one of edges, using foreign keys to link between nodes and edges). + + See for more information `this issue`_. + + .. 
_this issue: https://gitlab.com/datadrivendiscovery/d3m/issues/134 + """ + + metadata = metadata_base.PrimitiveMetadata( + { + 'id': 'dbb3792d-a44b-4941-a88e-5520c0a23488', + 'version': '0.1.0', + 'name': "Normalize graphs", + 'python_path': 'd3m.primitives.data_transformation.normalize_graphs.Common', + 'source': { + 'name': common_primitives.__author__, + 'contact': 'mailto:mitar.commonprimitives@tnode.com', + 'uris': [ + 'https://gitlab.com/datadrivendiscovery/common-primitives/blob/master/common_primitives/normalize_graphs.py', + 'https://gitlab.com/datadrivendiscovery/common-primitives.git', + ], + }, + 'installation': [{ + 'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/common-primitives.git@{git_commit}#egg=common_primitives'.format( + git_commit=d3m_utils.current_git_commit(os.path.dirname(__file__)), + ), + }], + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.DATA_CONVERSION, + ], + 'primitive_family': metadata_base.PrimitiveFamily.DATA_TRANSFORMATION, + }, + ) + + def __init__(self, *, hyperparams: Hyperparams) -> None: + super().__init__(hyperparams=hyperparams) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + outputs = inputs.copy() + + for resource_id, resource in inputs.items(): + if isinstance(resource, networkx.classes.graph.Graph): + self._convert_networkx(outputs, resource_id) + + if isinstance(resource, container.DataFrame) and outputs.metadata.has_semantic_type((resource_id,), 'https://metadata.datadrivendiscovery.org/types/EdgeList'): + self._update_edge_list(outputs, resource_id) + + return base.CallResult(outputs) + + def _convert_networkx(self, dataset: container.Dataset, resource_id: str) -> None: + resource = dataset[resource_id] + + # DataFrame index contains networkX node IDs (which come from GML node IDs). + # We see them as internal to the networkX structure and we use them only + # to align nodes with edges but then discard them. + nodes = pandas.DataFrame.from_dict(resource.nodes, orient='index') + + if len(nodes) != len(resource.nodes): + raise exceptions.InvalidStateError(f"Converted nodes DataFrame has {len(nodes)} nodes, but graph has {len(resource.nodes)} nodes.") + + if not nodes.loc[:, 'nodeID'].is_unique: + raise exceptions.UnexpectedValueError(f"'nodeID' column should be unique, but it is not in the graph with resource ID '{resource_id}'.") + if nodes.loc[:, 'nodeID'].hasnans: + raise exceptions.UnexpectedValueError(f"'nodeID' column should not have missing values, but it has them in the graph with resource ID '{resource_id}'.") + + # "source" and "target" columns point to "nodes" index values, not "nodeID" column. + # TODO: What if edge attributes contain "source" and "target" keys? + edgelist = networkx.to_pandas_edgelist(resource) + + # We map "source" and "target" columns to "nodeID" column. 
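+        # For example (illustrative): if networkX assigned internal indices 0..2 to nodes
+        # whose 'nodeID' values are 10, 11 and 12, an edge stored as (source=0, target=2)
+        # becomes (source=10, target=12) after this mapping.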
+ edgelist.loc[:, 'source'] = edgelist.loc[:, 'source'].apply(lambda s: nodes.loc[s, 'nodeID']) + edgelist.loc[:, 'target'] = edgelist.loc[:, 'target'].apply(lambda s: nodes.loc[s, 'nodeID']) + + nodes = container.DataFrame(nodes, metadata=dataset.metadata.query((resource_id,)), generate_metadata=True) + edgelist = container.DataFrame(edgelist, metadata=dataset.metadata.query((resource_id,)), generate_metadata=True) + + nodes_resource_id = f'{resource_id}_nodes' + edgelist_resource_id = f'{resource_id}_edges' + + if nodes_resource_id in dataset: + raise exceptions.AlreadyExistsError(f"Resource with ID '{nodes_resource_id}' already exists.") + if edgelist_resource_id in dataset: + raise exceptions.AlreadyExistsError(f"Resource with ID '{edgelist_resource_id}' already exists.") + + node_id_column_index = nodes.metadata.get_column_index_from_column_name('nodeID') + + nodes.metadata = nodes.metadata.update((), { + 'dimension': { + 'name': 'nodes', + }, + }) + nodes.metadata = nodes.metadata.remove_semantic_type((), 'https://metadata.datadrivendiscovery.org/types/EdgeList') + nodes.metadata = nodes.metadata.add_semantic_type((), 'https://metadata.datadrivendiscovery.org/types/Graph') + nodes.metadata = nodes.metadata.add_semantic_type( + (metadata_base.ALL_ELEMENTS, metadata_base.ALL_ELEMENTS), + 'https://metadata.datadrivendiscovery.org/types/Attribute', + ) + nodes.metadata = nodes.metadata.add_semantic_type( + (metadata_base.ALL_ELEMENTS, metadata_base.ALL_ELEMENTS), + 'https://metadata.datadrivendiscovery.org/types/UnknownType', + ) + nodes.metadata = nodes.metadata.update( + (metadata_base.ALL_ELEMENTS, node_id_column_index), + { + 'semantic_types': [ + # "nodeID" is always an integer. + 'http://schema.org/Integer', + 'https://metadata.datadrivendiscovery.org/types/PrimaryKey', + ], + }, + ) + + source_column_index = edgelist.metadata.get_column_index_from_column_name('source') + target_column_index = edgelist.metadata.get_column_index_from_column_name('target') + + edgelist.metadata = edgelist.metadata.update((), { + 'dimension': { + 'name': 'edges', + }, + }) + edgelist.metadata = edgelist.metadata.remove_semantic_type((), 'https://metadata.datadrivendiscovery.org/types/Graph') + edgelist.metadata = edgelist.metadata.add_semantic_type((), 'https://metadata.datadrivendiscovery.org/types/EdgeList') + edgelist.metadata = edgelist.metadata.add_semantic_type( + (metadata_base.ALL_ELEMENTS, metadata_base.ALL_ELEMENTS), + 'https://metadata.datadrivendiscovery.org/types/Attribute', + ) + edgelist.metadata = edgelist.metadata.add_semantic_type( + (metadata_base.ALL_ELEMENTS, metadata_base.ALL_ELEMENTS), + 'https://metadata.datadrivendiscovery.org/types/UnknownType', + ) + edgelist.metadata = edgelist.metadata.update( + (metadata_base.ALL_ELEMENTS, source_column_index), + { + 'semantic_types': [ + # "nodeID" is always an integer. + 'http://schema.org/Integer', + 'https://metadata.datadrivendiscovery.org/types/Attribute', + ], + 'foreign_key': { + 'type': 'COLUMN', + 'resource_id': nodes_resource_id, + 'column_index': node_id_column_index, + }, + }, + ) + edgelist.metadata = edgelist.metadata.update( + (metadata_base.ALL_ELEMENTS, target_column_index), + { + 'semantic_types': [ + # "nodeID" is always an integer. 
+ 'http://schema.org/Integer', + 'https://metadata.datadrivendiscovery.org/types/Attribute', + ], + 'foreign_key': { + 'type': 'COLUMN', + 'resource_id': nodes_resource_id, + 'column_index': node_id_column_index, + }, + }, + ) + + directed = isinstance(resource, networkx.DiGraph) + multi_graph = isinstance(resource, networkx.MultiGraph) + + edgelist.metadata = self._set_edges_metadata( + edgelist.metadata, + source_column_index, + target_column_index, + directed=directed, + multi_graph=multi_graph, + at=(), + ) + + del dataset[resource_id] + dataset.metadata = dataset.metadata.remove((resource_id,), recursive=True) + + dataset[nodes_resource_id] = nodes + dataset[edgelist_resource_id] = edgelist + + dataset.metadata = nodes.metadata.copy_to(dataset.metadata, (), (nodes_resource_id,)) + dataset.metadata = edgelist.metadata.copy_to(dataset.metadata, (), (edgelist_resource_id,)) + + dataset.metadata = dataset.metadata.update((), { + 'dimension': { + 'length': len(dataset), + }, + }) + + node_references = self._get_node_references(dataset) + + for column_reference, reference_resource_id in node_references.items(): + if reference_resource_id == resource_id: + dataset.metadata = dataset.metadata.update( + (column_reference.resource_id, metadata_base.ALL_ELEMENTS, column_reference.column_index), + { + 'foreign_key': metadata_base.NO_VALUE, + }, + ) + dataset.metadata = dataset.metadata.update( + (column_reference.resource_id, metadata_base.ALL_ELEMENTS, column_reference.column_index), + { + 'foreign_key': { + 'type': 'COLUMN', + 'resource_id': nodes_resource_id, + 'column_index': node_id_column_index, + }, + }, + ) + + def _set_edges_metadata( + self, metadata: metadata_base.DataMetadata, source_column_index: int, + target_column_index: int, *, directed: bool, multi_graph: bool, + at: metadata_base.Selector, + ) -> metadata_base.DataMetadata: + metadata = metadata.add_semantic_type( + list(at) + [metadata_base.ALL_ELEMENTS, source_column_index], + 'https://metadata.datadrivendiscovery.org/types/EdgeSource', + ) + metadata = metadata.add_semantic_type( + list(at) + [metadata_base.ALL_ELEMENTS, target_column_index], + 'https://metadata.datadrivendiscovery.org/types/EdgeTarget', + ) + metadata = metadata.add_semantic_type( + list(at) + [metadata_base.ALL_ELEMENTS, source_column_index], + f'''https://metadata.datadrivendiscovery.org/types/{'Directed' if directed else 'Undirected'}EdgeSource''', + ) + metadata = metadata.add_semantic_type( + list(at) + [metadata_base.ALL_ELEMENTS, target_column_index], + f'''https://metadata.datadrivendiscovery.org/types/{'Directed' if directed else 'Undirected'}EdgeTarget''', + ) + metadata = metadata.add_semantic_type( + list(at) + [metadata_base.ALL_ELEMENTS, source_column_index], + f'''https://metadata.datadrivendiscovery.org/types/{'Multi' if multi_graph else 'Simple'}EdgeSource''', + ) + metadata = metadata.add_semantic_type( + list(at) + [metadata_base.ALL_ELEMENTS, target_column_index], + f'''https://metadata.datadrivendiscovery.org/types/{'Multi' if multi_graph else 'Simple'}EdgeTarget''', + ) + + return metadata + + # TODO: Support also "edge", "nodeAttribute", and "edgeAttribute" references. 
+ # See: https://gitlab.datadrivendiscovery.org/MIT-LL/d3m_data_supply/merge_requests/35 + # See: https://gitlab.datadrivendiscovery.org/MIT-LL/d3m_data_supply/issues/183 + def _get_node_references(self, dataset: container.Dataset) -> typing.Dict[metadata_base.ColumnReference, str]: + references = {} + + for resource_id, resource in dataset.items(): + if not isinstance(resource, container.DataFrame): + continue + + for column_index in range(dataset.metadata.query_field((resource_id, metadata_base.ALL_ELEMENTS), 'dimension')['length']): + column_metadata = dataset.metadata.query_column(column_index, at=(resource_id,)) + + column_reference = metadata_base.ColumnReference(resource_id, column_index) + + if 'foreign_key' in column_metadata and column_metadata['foreign_key']['type'] == 'NODE': + reference_resource_id = column_metadata['foreign_key']['resource_id'] + + references[column_reference] = reference_resource_id + + return references + + def _update_edge_list(self, dataset: container.Dataset, resource_id: str) -> None: + # We want to allow this primitive to be run multiple times in a row. + # So we have to determine if we have already processed this resource. + if dataset.metadata.list_columns_with_semantic_types([ + 'https://metadata.datadrivendiscovery.org/types/EdgeSource', + 'https://metadata.datadrivendiscovery.org/types/EdgeTarget', + ], at=(resource_id,)): + return + + dataset.metadata = dataset.metadata.update((resource_id,), { + 'dimension': { + 'name': 'edges', + }, + }) + + reference_column_indices = [] + for column_index in range(dataset.metadata.query_field((resource_id, metadata_base.ALL_ELEMENTS), 'dimension')['length']): + column_metadata = dataset.metadata.query_column(column_index, at=(resource_id,)) + + if 'foreign_key' in column_metadata and column_metadata['foreign_key']['type'] == 'COLUMN': + reference_column_indices.append(column_index) + + # If there is a different number of columns than it is tricky for us to + # know which ones belong to edges. We would need some additional metadata + # in D3M dataset format to handle such case. + if len(reference_column_indices) != 2: + raise exceptions.NotSupportedError("Edge list with number of references different than 2 is not supported.") + + source_column_index, target_column_index = reference_column_indices + + # All edge list graphs are undirected. 
+ # See: https://gitlab.datadrivendiscovery.org/MIT-LL/d3m_data_supply/issues/184 + directed = False + multi_graph = self._is_multi_graph(dataset[resource_id], source_column_index, target_column_index) + + dataset.metadata = self._set_edges_metadata( + dataset.metadata, + source_column_index, + target_column_index, + directed=directed, + multi_graph=multi_graph, + at=(resource_id,), + ) + + def _is_multi_graph(self, edgelist: container.DataFrame, source_column_index: int, target_column_index: int) -> bool: + edges = edgelist.iloc[:, [source_column_index, target_column_index]] + + return len(edges) != len(edges.drop_duplicates()) + + +if __name__ == '__main__': + import logging + import pprint + import sys + + logging.basicConfig() + + for dataset_file_path in sys.argv[1:]: + try: + dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=os.path.abspath(dataset_file_path))) + except Exception as error: + raise Exception("Unable to load dataset: {dataset_doc_path}".format(dataset_doc_path=dataset_file_path)) from error + + primitive = NormalizeGraphsPrimitive(hyperparams=Hyperparams.defaults()) + + try: + normalized_dataset = primitive.produce(inputs=dataset).value + + pprint.pprint(normalized_dataset) + normalized_dataset.metadata.pretty_print() + except Exception as error: + raise Exception("Unable to normalize dataset: {dataset_doc_path}".format(dataset_doc_path=dataset_file_path)) from error diff --git a/tods/common-primitives/common_primitives/numeric_range_filter.py b/tods/common-primitives/common_primitives/numeric_range_filter.py new file mode 100644 index 0000000..b9c41d9 --- /dev/null +++ b/tods/common-primitives/common_primitives/numeric_range_filter.py @@ -0,0 +1,138 @@ +import collections +import os +import typing + +from d3m import container, exceptions, utils as d3m_utils +from d3m.metadata import base as metadata_base, hyperparams +from d3m.primitive_interfaces import base, transformer + +import common_primitives +from common_primitives import dataframe_utils + +import pandas as pd # type: ignore + +__all__ = ('NumericRangeFilterPrimitive',) + +Inputs = container.DataFrame +Outputs = container.DataFrame + + +class Hyperparams(hyperparams.Hyperparams): + column = hyperparams.Hyperparameter[int]( + default=-1, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description='Index of column filter applies to.' + ) + inclusive = hyperparams.Hyperparameter[bool]( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description='True when values outside the range are removed, False when values within the range are removed.' + ) + min = hyperparams.Union[typing.Union[float, None]]( + configuration=collections.OrderedDict( + float=hyperparams.Hyperparameter[float](0), + negative_infinity=hyperparams.Constant(None), + ), + default='negative_infinity', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description='Minimum value for filter.' + ) + max = hyperparams.Union[typing.Union[float, None]]( + configuration=collections.OrderedDict( + float=hyperparams.Hyperparameter[float](0), + positive_infinity=hyperparams.Constant(None), + ), + default='positive_infinity', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description='Maximum value for filter.' 
+    )
+    strict = hyperparams.Hyperparameter[bool](
+        default=False,
+        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
+        description='True when the filter bounds are strict (i.e. less than), False when they are not (i.e. less than or equal to).'
+    )
+
+
+class NumericRangeFilterPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]):
+    """
+    A primitive which filters rows from a DataFrame based on a numeric range applied to a given column.
+    Columns are identified by their index, and the filter itself can be inclusive (values within range are retained)
+    or exclusive (values within range are removed). Boundary values can be included in the filter (i.e. <=) or excluded
+    (i.e. <).
+    """
+
+    metadata = metadata_base.PrimitiveMetadata(
+        {
+            'id': '8c246c78-3082-4ec9-844e-5c98fcc76f9d',
+            'version': '0.1.0',
+            'name': "Numeric range filter",
+            'python_path': 'd3m.primitives.data_preprocessing.numeric_range_filter.Common',
+            'source': {
+                'name': common_primitives.__author__,
+                'contact': 'mailto:cbethune@uncharted.software',
+                'uris': [
+                    'https://gitlab.com/datadrivendiscovery/common-primitives/blob/master/common_primitives/numeric_range_filter.py',
+                    'https://gitlab.com/datadrivendiscovery/common-primitives.git',
+                ],
+            },
+            'installation': [{
+                'type': metadata_base.PrimitiveInstallationType.PIP,
+                'package_uri': 'git+https://gitlab.com/datadrivendiscovery/common-primitives.git@{git_commit}#egg=common_primitives'.format(
+                    git_commit=d3m_utils.current_git_commit(os.path.dirname(__file__)),
+                ),
+            }],
+            'algorithm_types': [
+                metadata_base.PrimitiveAlgorithmType.ARRAY_SLICING,
+            ],
+            'primitive_family': metadata_base.PrimitiveFamily.DATA_PREPROCESSING,
+        },
+    )
+
+    def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]:
+        # to make sure index matches row indices
+        resource = inputs.reset_index(drop=True)
+
+        # A missing bound means the filter is unbounded on that side.
+        if self.hyperparams['min'] is None:
+            min = float('-inf')
+        else:
+            min = self.hyperparams['min']
+
+        if self.hyperparams['max'] is None:
+            max = float('inf')
+        else:
+            max = self.hyperparams['max']
+
+        # apply the filter using native dataframe methods
+        col_idx = self.hyperparams['column']
+        try:
+            to_keep: pd.Series
+            if self.hyperparams['inclusive']:
+                if self.hyperparams['strict']:
+                    to_keep = (resource.iloc[:, col_idx].astype(float) > min) & \
+                              (resource.iloc[:, col_idx].astype(float) < max)
+
+                else:
+                    to_keep = (resource.iloc[:, col_idx].astype(float) >= min) & \
+                              (resource.iloc[:, col_idx].astype(float) <= max)
+            else:
+                if self.hyperparams['strict']:
+                    to_keep = (resource.iloc[:, col_idx].astype(float) < min) | \
+                              (resource.iloc[:, col_idx].astype(float) > max)
+                else:
+                    to_keep = (resource.iloc[:, col_idx].astype(float) <= min) | \
+                              (resource.iloc[:, col_idx].astype(float) >= max)
+
+            to_keep_indices = resource.loc[to_keep].index
+
+        except ValueError as error:
+            raise exceptions.InvalidArgumentValueError(
+                "Failure to apply numerical range filter to column {col_idx} of type {type}.".format(
+                    col_idx=col_idx,
+                    type=resource.iloc[:, col_idx].dtype,
+                ),
+            ) from error
+
+        # remove dataframe and metadata rows by index
+        outputs = dataframe_utils.select_rows(inputs, to_keep_indices)
+
+        return base.CallResult(outputs)
diff --git a/tods/common-primitives/common_primitives/one_hot_maker.py b/tods/common-primitives/common_primitives/one_hot_maker.py
new file mode 100644
index 0000000..72935c7
--- /dev/null
+++ b/tods/common-primitives/common_primitives/one_hot_maker.py
@@ -0,0
+1,313 @@ +import os +import typing +from typing import Any, Dict, List, Tuple + +import d3m.metadata.base as metadata_module +import numpy as np # type: ignore +import pandas as pd # type: ignore + +from d3m import exceptions, utils as d3m_utils +from d3m.base import utils as base_utils +from d3m.container.pandas import DataFrame +from d3m.metadata import base as metadata_base, hyperparams, params +from d3m.primitive_interfaces.base import CallResult +from d3m.primitive_interfaces.unsupervised_learning import UnsupervisedLearnerPrimitiveBase + +import common_primitives + +Inputs = DataFrame +Outputs = DataFrame + + +class Params(params.Params): + categories: Dict[int, np.ndarray] + fitted: bool + + +class Hyperparams(hyperparams.Hyperparams): + separator = hyperparams.Hyperparameter[str]( + default='.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Separator separates additional identifier and original column name", + ) + prefix = hyperparams.Hyperparameter[str]( + default='col', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Separator separates additional identifier and original column name", + ) + use_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of inputs column indices to force primitive to operate on. If any specified column cannot " + "be used, it is skipped.", + ) + exclude_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of inputs column indices to not operate on. Applicable only if \"use_columns\" is not " + "provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='new', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed " + "columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + add_index_columns = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + handle_unseen = hyperparams.Enumeration( + values=['error', 'ignore', 'column'], + default='ignore', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="error: throw exception when unknown value observed" + "ignore: ignore unseen values" + "auto: put unseen values in extra column and mark the cell as 1" + ) + handle_missing_value = hyperparams.Enumeration( + values=['error', 'ignore', 'column'], + default='ignore', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description='Options for dealing with missing values.' + 'error: throw exceptions when missing values encountered.' + 'ignore: ignore any missing value.' + 'column: add one column for missing value.' + ) + # TODO hyperparams.Hyperparameter[typing.Set[Any]] doesn't work? 
+ missing_values = hyperparams.Hyperparameter[typing.AbstractSet[Any]]( + default={np.NaN, None, ''}, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Values indicate the data is a missing other than 'None' and 'np.NaN'", + ) + encode_target_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Whether to encode target column", + ) + + +class OneHotMakerPrimitive(UnsupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]): + """ + Attempts to detect discrete values in data and convert these to a + one-hot embedding. + """ + _unseen_column_name: str = 'Unseen' + _missing_column_name: str = 'Missing' + + metadata = metadata_module.PrimitiveMetadata({ + 'id': 'eaec420d-46eb-4ddf-a2cd-b8097345ff3e', + 'version': '0.3.0', + 'name': 'One-hot maker', + 'keywords': ['data processing', 'one-hot'], + 'source': { + 'name': common_primitives.__author__, + 'contact': 'mailto:lin.yang@tamu.edu', + 'uris': [ + 'https://gitlab.com/datadrivendiscovery/common-primitives/blob/master/common_primitives/one_hot_maker.py', + 'https://gitlab.com/datadrivendiscovery/common-primitives.git', + ], + }, + 'installation': [{ + 'type': metadata_module.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/common-primitives.git@{git_commit}#egg=common_primitives'.format( + git_commit=d3m_utils.current_git_commit(os.path.dirname(__file__))) + }], + 'python_path': 'd3m.primitives.data_preprocessing.one_hot_encoder.MakerCommon', + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.ENCODE_ONE_HOT, + ], + 'primitive_family': metadata_base.PrimitiveFamily.DATA_PREPROCESSING, + }) + + def __init__(self, *, hyperparams: Hyperparams) -> None: + super().__init__(hyperparams=hyperparams) + self._training_inputs: Inputs = None + self._categories: Dict[int, np.array] = {} + self._fitted: bool = False + + # record unseen row index and column name + self._unseen: List[Tuple[int, str]] = [] + self._missing: Dict[str, List[int]] = {} + + def set_training_data(self, *, inputs: Inputs) -> None: # type: ignore + self._training_inputs = inputs + self._fitted = False + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + if self._fitted: + return CallResult(None) + + if self._training_inputs is None: + raise ValueError("Missing training data.") + + column_indices = self._get_columns(self._training_inputs.metadata) + for i in column_indices: + self._categories[i] = self._fit_categories(self._training_inputs.iloc[:, i]) + self._fitted = True + + return CallResult(None) + + def _fit_categories(self, column: pd.Series) -> np.array: + # generates sorted unique value + missing_value_mask = self._get_missing_value_mask(column) + if self.hyperparams['handle_missing_value'] == 'error': + if missing_value_mask.any(): + raise exceptions.MissingValueError('Missing value in categorical data') + _categories = np.unique(column[~missing_value_mask]) + return _categories + + def produce(self, *, inputs: Inputs, + timeout: float = None, + iterations: int = None) -> CallResult[Outputs]: + if not self._fitted: + raise exceptions.PrimitiveNotFittedError("Primitive not fitted.") + selected_inputs, columns_to_use = self._select_columns(inputs) + if len(selected_inputs.columns[selected_inputs.columns.duplicated()].unique()): + raise exceptions.ColumnNameError('Duplicated column name') + # TODO check if input has the same 
column as input in fit stage + + outputs = [] + for i in columns_to_use: + input = inputs.iloc[:, i] + onehot_result = self._produce_onehot_columns(i, input) + column_names = self._produce_onehot_column_names(i, inputs.metadata, input.name) + output = DataFrame(onehot_result, columns=column_names, generate_metadata=False) + self._produce_onehot_metadata(inputs, output, i) + outputs.append(output) + + outputs = base_utils.combine_columns(inputs, columns_to_use, outputs, + return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns']) + return CallResult(outputs) + + def _produce_onehot_columns(self, column_index: int, column: pd.Series) -> np.ndarray: + category = self._categories[column_index] + column_count = len(category) + row_count = len(category) + 2 + handle_missing_value = self.hyperparams['handle_missing_value'] + handle_unseen = self.hyperparams['handle_unseen'] + + unseen_value_row_index = len(category) + missing_value_row_index = unseen_value_row_index + 1 + + # One more column for missing value when handle is 'column' + if handle_missing_value == 'column': + column_count += 1 + if handle_unseen == 'column': + column_count += 1 + onehotted_cat = np.eye(row_count, column_count, dtype=np.uint8) + if handle_missing_value == 'ignore': + onehotted_cat[missing_value_row_index, :] = 0 + if handle_unseen == 'ignore': + onehotted_cat[unseen_value_row_index, :] = 0 + onehot_index = np.zeros(column.size, dtype=np.uint8) + missing_value_mask = self._get_missing_value_mask(column) + one_hotted_cat_index = np.searchsorted(category, column[~missing_value_mask]) + unseen_value_mask = np.take(category, one_hotted_cat_index, mode='clip') != column[~missing_value_mask] + if np.any(missing_value_mask) and handle_missing_value == 'error': + raise exceptions.UnexpectedValueError( + 'Encountered missing value {} on index {}'.format(column[missing_value_mask], + np.nonzero(column[~missing_value_mask]))) + if np.any(unseen_value_mask) and handle_unseen == 'error': + raise exceptions.UnexpectedValueError( + 'Encountered unseen value {}'.format(column[~missing_value_mask][unseen_value_mask].values)) + onehot_index[missing_value_mask] = missing_value_row_index + onehot_index[~missing_value_mask] = one_hotted_cat_index + onehot_index[~missing_value_mask][unseen_value_mask] = unseen_value_row_index + one_hot_result = onehotted_cat[onehot_index] + return one_hot_result + + def _get_missing_value_mask(self, inputs: pd.Series) -> np.array: + return np.bitwise_or(inputs.isin(self.hyperparams['missing_values']), pd.isnull(inputs)) + + def _produce_onehot_column_names(self, + column_index: int, + metadata: metadata_base.DataMetadata, column_name: str) -> typing.Sequence[str]: + base_column_name = metadata.query_column(column_index).get('name', column_name) + name_prefix = '{}{}'.format(base_column_name, self.hyperparams['separator']) + column_names = ['{}{}'.format(name_prefix, cat_name) for cat_name in self._categories[column_index]] + if self.hyperparams['handle_missing_value'] == 'column': + column_names.append('{}{}'.format(name_prefix, self._missing_column_name)) + if self.hyperparams['handle_unseen'] == 'column': + column_names.append('{}{}'.format(name_prefix, self._unseen_column_name)) + return column_names + + def _produce_onehot_metadata(self, inputs: Inputs, outputs: Outputs, column_index: int) -> None: + for onehot_index in range(outputs.shape[1]): + outputs.metadata = inputs.metadata.copy_to( + outputs.metadata, + (metadata_base.ALL_ELEMENTS, column_index), 
+ (metadata_base.ALL_ELEMENTS, onehot_index), + ) + + # We set column names based on what Pandas generated. + for output_column_index, output_column_name in enumerate(outputs.columns): + outputs.metadata = outputs.metadata.update_column( + output_column_index, + { + 'name': output_column_name, + }, + ) + + # Then we generate the rest of metadata. + outputs.metadata = outputs.metadata.generate(outputs) + + # Then we unmark output columns as categorical data. + for output_column_index in range(outputs.shape[1]): + outputs.metadata = outputs.metadata.remove_semantic_type( + (metadata_base.ALL_ELEMENTS, output_column_index), + 'https://metadata.datadrivendiscovery.org/types/CategoricalData', + ) + + def get_params(self) -> Params: + return Params( + categories=self._categories, + fitted=self._fitted + ) + + def set_params(self, *, params: Params) -> None: + self._categories = params['categories'] + self._fitted = params['fitted'] + + def _get_columns(self, inputs_metadata: metadata_base.DataMetadata) -> List[int]: + def can_use_column(column_index: int) -> bool: + return self._can_use_inputs_column(inputs_metadata, column_index) + + columns_to_use, columns_not_to_use = base_utils.get_columns_to_use( + inputs_metadata, + self.hyperparams['use_columns'], + self.hyperparams['exclude_columns'], + can_use_column, + ) + if not columns_to_use: + raise ValueError("No column to use.") + + if self.hyperparams['use_columns'] and columns_not_to_use: + self.logger.warning("Not all specified inputs columns can used. Skipping columns: %(columns)s", { + 'columns': columns_not_to_use, + }) + + return columns_to_use + + def _can_use_inputs_column(self, inputs_metadata: metadata_base.DataMetadata, column_index: int) -> bool: + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + semantic_types = column_metadata.get('semantic_types', []) + + if 'https://metadata.datadrivendiscovery.org/types/CategoricalData' in semantic_types: + # Skip parsing if a column is categorical, but also a target column. + if not self.hyperparams['encode_target_columns'] and \ + 'https://metadata.datadrivendiscovery.org/types/Target' in semantic_types: + return False + return True + return False + + def _select_columns(self, inputs: Inputs) -> Tuple[Inputs, List[int]]: + columns_to_use = self._get_columns(inputs.metadata) + + return inputs.select_columns(columns_to_use), columns_to_use diff --git a/tods/common-primitives/common_primitives/pandas_onehot_encoder.py b/tods/common-primitives/common_primitives/pandas_onehot_encoder.py new file mode 100644 index 0000000..3a38dcb --- /dev/null +++ b/tods/common-primitives/common_primitives/pandas_onehot_encoder.py @@ -0,0 +1,238 @@ +import os +from typing import cast, Any, Dict, List, Union, Optional + +from d3m import container, exceptions, utils as d3m_utils +from d3m.base import utils as base_utils +from d3m.metadata import base as metadata_base, params, hyperparams +from d3m.primitive_interfaces.unsupervised_learning import UnsupervisedLearnerPrimitiveBase +from d3m.primitive_interfaces.base import CallResult + +import pandas # type: ignore +from pandas.api import types as pandas_types # type: ignore + +import common_primitives + +__all__ = ('PandasOneHotEncoderPrimitive',) + +Inputs = container.DataFrame +Outputs = container.DataFrame + + +class Params(params.Params): + # For each column, a list of category values, sorted. 
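+    # e.g. {0: ['blue', 'green', 'red'], 3: ['no', 'yes']} (illustrative), keyed by the
+    # fitted column index.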
+ categories: Optional[Dict[int, List[Any]]] + + +class Hyperparams(hyperparams.Hyperparams): + dummy_na = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Add a column to indicate NaNs, if False NaNs are ignored.", + ) + drop_first = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Whether to get k-1 dummies out of k categorical levels by removing the first level.", + ) + use_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to operate on. If any specified column cannot be encoded, it is skipped.", + ) + exclude_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='replace', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should encoded columns be appended, should they replace original columns, or should only encoded columns be returned?", + ) + add_index_columns = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + encode_target_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should it encode also target columns?", + ) + + +class PandasOneHotEncoderPrimitive(UnsupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]): + """ + One-hot encoder using Pandas implementation. 
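+
+    A typical call sequence (an illustrative sketch; assumes ``train`` and ``test`` are
+    ``container.DataFrame`` objects whose categorical columns are marked in metadata):
+
+        primitive = PandasOneHotEncoderPrimitive(hyperparams=Hyperparams.defaults())
+        primitive.set_training_data(inputs=train)
+        primitive.fit()
+        encoded = primitive.produce(inputs=test).value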
+ + """ + __author__ = "Louis Huang" + metadata = metadata_base.PrimitiveMetadata( + { + 'id': 'f6315ca9-ca39-4e13-91ba-1964ee27281c', + 'version': '0.1.0', + 'name': "Pandas one hot encoder", + 'python_path': 'd3m.primitives.data_preprocessing.one_hot_encoder.PandasCommon', + 'source': { + 'name': common_primitives.__author__, + 'contact': 'mailto:luyih@berkeley.edu', + 'uris': [ + 'https://gitlab.com/datadrivendiscovery/common-primitives/blob/master/common_primitives/pandas_onehot_encoder.py', + 'https://gitlab.com/datadrivendiscovery/common-primitives.git', + ], + }, + 'installation': [{ + 'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/common-primitives.git@{git_commit}#egg=common_primitives'.format( + git_commit=d3m_utils.current_git_commit(os.path.dirname(__file__)), + ), + }], + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.ENCODE_ONE_HOT, + ], + 'primitive_family': metadata_base.PrimitiveFamily.DATA_PREPROCESSING, + }, + ) + + def __init__(self, *, hyperparams: Hyperparams) -> None: + super().__init__(hyperparams=hyperparams) + + self._training_inputs: Inputs = None + self._categories: Dict[int, List[Any]] = {} + self._fitted = False + + def set_training_data(self, *, inputs: Inputs) -> None: # type: ignore + self._training_inputs = inputs + self._fitted = False + + def _can_use_column(self, inputs_metadata: metadata_base.DataMetadata, column_index: int) -> bool: + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + semantic_types = column_metadata.get('semantic_types', []) + + if 'https://metadata.datadrivendiscovery.org/types/CategoricalData' in semantic_types: + # Skip parsing if a column is categorical, but also a target column. + if not self.hyperparams['encode_target_columns'] and 'https://metadata.datadrivendiscovery.org/types/Target' in semantic_types: + return False + + return True + + return False + + def _get_columns(self, inputs_metadata: metadata_base.DataMetadata) -> List[int]: + def can_use_column(column_index: int) -> bool: + return self._can_use_column(inputs_metadata, column_index) + + columns_to_use, columns_not_to_use = base_utils.get_columns_to_use(inputs_metadata, self.hyperparams['use_columns'], self.hyperparams['exclude_columns'], can_use_column) + # We are OK if no columns ended up being encoded. + # "base_utils.combine_columns" will throw an error if it cannot work with this. + + if self.hyperparams['use_columns'] and columns_not_to_use: + self.logger.warning("Not all specified columns can be encoded. 
Skipping columns: %(columns)s", { + 'columns': columns_not_to_use, + }) + + return columns_to_use + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + if self._training_inputs is None: + raise exceptions.InvalidStateError("Missing training data.") + + if self._fitted: + return CallResult(None) + + columns_to_use = self._get_columns(self._training_inputs.metadata) + + self._categories = {} + for column_index in columns_to_use: + self._fit_column(column_index) + + self._fitted = True + + return CallResult(None) + + def _fit_column(self, column_index: int) -> None: + self._categories[column_index] = sorted(self._training_inputs.iloc[:, column_index].value_counts(dropna=True).index) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + if not self._fitted: + raise exceptions.PrimitiveNotFittedError("Primitive not fitted.") + + columns_to_use = self._get_columns(inputs.metadata) + + if set(columns_to_use) != set(self._categories.keys()): + raise exceptions.InvalidArgumentValueError("Columns in provided data do not match fitted columns.") + + outputs_columns = [] + for column_index in columns_to_use: + outputs_columns.append(self._produce_column(inputs, column_index)) + + outputs = base_utils.combine_columns(inputs, columns_to_use, outputs_columns, return_result=self.hyperparams['return_result'], add_index_columns=self.hyperparams['add_index_columns']) + + return CallResult(outputs) + + def _produce_column(self, inputs: Inputs, column_index: int) -> Outputs: + # By making a column a category and provide "categories" we can assure same + # order between multiple calls to "produce". + input_column = inputs.iloc[:, [column_index]].astype( + pandas_types.CategoricalDtype(categories=self._categories[column_index]), + ) + + # We first set DataFrame column nam to match one from metadata, if it exists. + # This then allows Pandas to generate proper new column names. + input_column.rename({ + input_column.columns[0]: inputs.metadata.query_column(column_index).get('name', input_column.columns[0]), + }, axis=1, inplace=True) + + output_columns = pandas.get_dummies( + input_column, + dummy_na=self.hyperparams['dummy_na'], + drop_first=self.hyperparams['drop_first'], + ) + output_columns = container.DataFrame(output_columns, generate_metadata=False) + + # Copy metadata from input column to all output columns. + for output_column_index in range(len(output_columns.columns)): + output_columns.metadata = inputs.metadata.copy_to( + output_columns.metadata, + (metadata_base.ALL_ELEMENTS, column_index), + (metadata_base.ALL_ELEMENTS, output_column_index), + ) + + # We set column names based on what Pandas generated. + for output_column_index, output_column_name in enumerate(output_columns.columns): + output_columns.metadata = output_columns.metadata.update_column( + output_column_index, + { + 'name': output_column_name, + }, + ) + + # Then we generate the rest of metadata. + output_columns.metadata = output_columns.metadata.generate(output_columns) + + # Then we unmark output columns as categorical data. 
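+        # (Each generated column is now a binary indicator, so the CategoricalData semantic
+        # type copied over from the source column no longer applies.)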
+ for output_column_index in range(len(output_columns.columns)): + output_columns.metadata = output_columns.metadata.remove_semantic_type( + (metadata_base.ALL_ELEMENTS, output_column_index), + 'https://metadata.datadrivendiscovery.org/types/CategoricalData', + ) + + return output_columns + + def get_params(self) -> Params: + if not self._fitted: + return Params( + categories=None, + ) + + return Params( + categories=self._categories, + ) + + def set_params(self, *, params: Params) -> None: + self._categories = params['categories'] + self._fitted = all(param is not None for param in params.values()) diff --git a/tods/common-primitives/common_primitives/random_forest.py b/tods/common-primitives/common_primitives/random_forest.py new file mode 100644 index 0000000..7145d34 --- /dev/null +++ b/tods/common-primitives/common_primitives/random_forest.py @@ -0,0 +1,733 @@ +import os +from collections import OrderedDict +from typing import cast, Dict, List, Union, Sequence, Optional, Tuple + +import numpy as np # type: ignore +import pandas as pd # type: ignore +import sklearn.tree # type: ignore +from sklearn.ensemble.forest import RandomForestClassifier # type: ignore + +from d3m import container, exceptions, utils as d3m_utils +from d3m.base import utils as base_utils +from d3m.metadata import base as metadata_base, hyperparams, params +from d3m.primitive_interfaces.base import CallResult, ProbabilisticCompositionalityMixin, SamplingCompositionalityMixin, ContinueFitMixin +from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase + +import common_primitives + + +Inputs = container.DataFrame +Outputs = container.DataFrame + + +class Params(params.Params): + estimators: Optional[List[sklearn.tree.DecisionTreeClassifier]] + classes: Optional[Union[np.ndarray, List[np.ndarray]]] + n_classes: Optional[Union[int, List[int]]] + n_features: Optional[int] + n_outputs: Optional[int] + attribute_columns_names: Optional[List[str]] + target_columns_metadata: Optional[List[OrderedDict]] + target_columns_names: Optional[List[str]] + oob_score: Optional[float] + oob_decision_function: Optional[Union[np.ndarray, List[np.ndarray]]] + + +class Hyperparams(hyperparams.Hyperparams): + # TODO: How to define it better? + # See: https://gitlab.com/datadrivendiscovery/d3m/issues/150 + n_estimators = hyperparams.UniformInt( + lower=1, + upper=10000, + default=100, + description='The number of trees in the forest.', + semantic_types=[ + 'https://metadata.datadrivendiscovery.org/types/TuningParameter', + 'https://metadata.datadrivendiscovery.org/types/ResourcesUseParameter', + ], + ) + n_more_estimators = hyperparams.UniformInt( + lower=1, + upper=10000, + default=100, + description='When continuing a fit, it controls how many more trees to add every time.', + semantic_types=[ + 'https://metadata.datadrivendiscovery.org/types/TuningParameter', + 'https://metadata.datadrivendiscovery.org/types/ResourcesUseParameter', + ], + ) + criterion = hyperparams.Enumeration[str]( + values=['gini', 'entropy'], + default='gini', + description='The function to measure the quality of a split.' + ' Supported criteria are "gini" for the Gini impurity and "entropy" for the information gain.' + ' Note: this parameter is tree-specific.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + max_features = hyperparams.Union[Union[int, float, str, None]]( + configuration=OrderedDict( + # TODO: How to mark it as depending on the number of input features? 
+ fixed=hyperparams.Bounded[int]( + lower=1, + upper=None, + default=1, + description='Consider "max_features" features at each split.'), + ratio=hyperparams.Uniform( + lower=0, + upper=1, + default=0.25, + lower_inclusive=True, + # "ratio" == 1.0 is equal to "all_features", we do not want to have it twice. + # Moreover, this makes it possible to differentiate between "fixed" and "ratio" just by the value. + upper_inclusive=False, + description='A percentage. "int(max_features * n_features)" features are considered at each split.', + ), + calculated=hyperparams.Enumeration[str]( + values=['sqrt', 'log2'], + default='sqrt', + description='If "sqrt", then "max_features = sqrt(n_features)". If "log2", then "max_features = log2(n_features)".', + ), + all_features=hyperparams.Constant( + default=None, + description='"max_features = n_features".', + ), + ), + default='calculated', + description='The number of features to consider when looking for the best split.' + ' The search for a split does not stop until at least one valid partition of the node samples is found,' + ' even if it requires to effectively inspect more than "max_features" features.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + max_depth = hyperparams.Union[Union[int, None]]( + configuration=OrderedDict( + limit=hyperparams.Bounded[int]( + lower=1, + upper=None, + default=10, + ), + unlimited=hyperparams.Constant( + default=None, + description='Nodes are expanded until all leaves are pure or until all leaves contain less than "min_samples_split" samples.', + ), + ), + default='unlimited', + description='The maximum depth of the tree.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + min_samples_split = hyperparams.Union[Union[int, float]]( + configuration=OrderedDict( + # TODO: How to mark it as depending on the number of input samples? + fixed=hyperparams.Bounded[int]( + lower=2, + upper=None, + default=2, + description='Consider "min_samples_split" as the minimum number.', + ), + ratio=hyperparams.Uniform( + lower=0, + upper=1, + default=0.25, + lower_inclusive=False, + upper_inclusive=True, + description='A percentage. "ceil(min_samples_split * n_samples)" are the minimum number of samples for each split.', + ), + ), + default='fixed', + description='The minimum number of samples required to split an internal node.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + min_samples_leaf = hyperparams.Union[Union[int, float]]( + configuration=OrderedDict( + # TODO: How to mark it as depending on the number of input samples? + fixed=hyperparams.Bounded[int]( + lower=1, + upper=None, + default=1, + description='Consider "min_samples_leaf" as the minimum number.', + ), + ratio=hyperparams.Uniform( + lower=0, + upper=0.5, + default=0.25, + lower_inclusive=False, + upper_inclusive=True, + description='A percentage. 
"ceil(min_samples_leaf * n_samples)" are the minimum number of samples for each node.', + ), + ), + default='fixed', + description='The minimum number of samples required to be at a leaf node.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + min_weight_fraction_leaf = hyperparams.Uniform( + lower=0, + upper=0.5, + default=0, + upper_inclusive=True, + description='The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + max_leaf_nodes = hyperparams.Union[Union[int, None]]( + configuration=OrderedDict( + limit=hyperparams.Bounded[int]( + lower=2, + upper=None, + default=10, + ), + unlimited=hyperparams.Constant( + default=None, + description='Unlimited number of leaf nodes.', + ), + ), + default='unlimited', + description='Grow trees with "max_leaf_nodes" in best-first fashion. Best nodes are defined as relative reduction in impurity.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + min_impurity_decrease = hyperparams.Bounded[float]( + lower=0.0, + upper=None, + default=0.0, + description='A node will be split if this split induces a decrease of the impurity greater than or equal to this value.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + bootstrap = hyperparams.Enumeration[str]( + values=['bootstrap', 'bootstrap_with_oob_score', 'disabled'], + default='bootstrap', + description='Whether bootstrap samples are used when building trees.' + ' And whether to use out-of-bag samples to estimate the generalization accuracy.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + # In reality values could also be -2 and so on, which would mean all CPUs minus 1, + # but this does not really seem so useful here, so it is not exposed. + n_jobs = hyperparams.Union[Union[int, None]]( + configuration=OrderedDict( + limit=hyperparams.Bounded[int]( + lower=1, + upper=None, + default=1, + ), + all_cores=hyperparams.Enumeration[int]( + values=[-1], + default=-1, + description='The number of jobs is set to the number of cores.', + ), + ), + default='limit', + description='The number of jobs to run in parallel for both "fit" and "produce".', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ResourcesUseParameter'], + ) + use_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of inputs column indices to force primitive to operate on. If any specified column cannot be used, it is skipped.", + ) + exclude_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of inputs column indices to not operate on. Applicable only if \"use_columns\" is not provided.", + ) + use_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of outputs column indices to force primitive to operate on. 
If any specified column cannot be used, it is skipped.", + ) + exclude_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of outputs column indices to not operate on. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + # Default value depends on the nature of the primitive. + default='append', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should resulting columns be appended, should they replace original columns, or should only resulting columns be returned?", + ) + add_index_columns = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_columns = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no column is selected/provided. Otherwise issue a warning.", + ) + + +# TODO: Support weights on samples. +# There is a "https://metadata.datadrivendiscovery.org/types/InstanceWeight" semantic type which should be used for this. +# See: https://gitlab.com/datadrivendiscovery/d3m/issues/151 +# TODO: How to use/determine class weights? +class RandomForestClassifierPrimitive(ProbabilisticCompositionalityMixin[Inputs, Outputs, Params, Hyperparams], + SamplingCompositionalityMixin[Inputs, Outputs, Params, Hyperparams], + ContinueFitMixin[Inputs, Outputs, Params, Hyperparams], + SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]): + """ + A random forest classifier using ``sklearn.ensemble.forest.RandomForestClassifier``. + + It uses semantic types to determine which columns to operate on. + """ + + __author__ = 'Oxford DARPA D3M Team, Rob Zinkov ' + metadata = metadata_base.PrimitiveMetadata( + { + 'id': '37c2b19d-bdab-4a30-ba08-6be49edcc6af', + 'version': '0.4.0', + 'name': "Random forest classifier", + 'python_path': 'd3m.primitives.classification.random_forest.Common', + 'keywords': ['random forest', 'decision tree'], + 'source': { + 'name': common_primitives.__author__, + 'contact': 'mailto:zinkov@robots.ox.ac.uk', + 'uris': [ + 'https://gitlab.com/datadrivendiscovery/common-primitives/blob/master/common_primitives/random_forest.py', + 'https://gitlab.com/datadrivendiscovery/common-primitives.git', + ], + }, + 'installation': [{ + 'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/common-primitives.git@{git_commit}#egg=common_primitives'.format( + git_commit=d3m_utils.current_git_commit(os.path.dirname(__file__)), + ), + }], + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.RANDOM_FOREST, + ], + 'primitive_family': metadata_base.PrimitiveFamily.CLASSIFICATION, + 'hyperparams_to_tune': [ + 'max_leaf_nodes', + 'criterion', + 'max_features', + ] + } + ) + + def __init__(self, *, hyperparams: Hyperparams, random_seed: int = 0, _verbose: int = 0) -> None: + super().__init__(hyperparams=hyperparams, random_seed=random_seed) + + # We need random seed multiple times (every time an underlying "RandomForestClassifier" is instantiated), + # and when we sample. 
So instead we create our own random state we use everywhere. + self._random_state = np.random.RandomState(self.random_seed) + self._verbose = _verbose + self._training_inputs: Inputs = None + self._training_outputs: Outputs = None + self._new_training_data = False + self._learner: RandomForestClassifier = None + self._attribute_columns_names: List[str] = None + self._target_columns_metadata: List[OrderedDict] = None + self._target_columns_names: List[str] = None + + def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None: + self._training_inputs = inputs + self._training_outputs = outputs + self._new_training_data = True + + def _create_learner(self) -> None: + self._learner = RandomForestClassifier( + n_estimators=self.hyperparams['n_estimators'], + criterion=self.hyperparams['criterion'], + max_features=self.hyperparams['max_features'], + max_depth=self.hyperparams['max_depth'], + min_samples_split=self.hyperparams['min_samples_split'], + min_samples_leaf=self.hyperparams['min_samples_leaf'], + min_weight_fraction_leaf=self.hyperparams['min_weight_fraction_leaf'], + max_leaf_nodes=self.hyperparams['max_leaf_nodes'], + min_impurity_decrease=self.hyperparams['min_impurity_decrease'], + bootstrap=self.hyperparams['bootstrap'] in ['bootstrap', 'bootstrap_with_oob_score'], + oob_score=self.hyperparams['bootstrap'] in ['bootstrap_with_oob_score'], + n_jobs=-1 if self.hyperparams['n_jobs'] is None else self.hyperparams['n_jobs'], + warm_start=True, + random_state=self._random_state, + verbose=self._verbose, + ) + + def _get_target_columns_metadata(self, outputs_metadata: metadata_base.DataMetadata) -> List[OrderedDict]: + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) + + # Update semantic types and prepare it for predicted targets. + semantic_types = list(column_metadata.get('semantic_types', [])) + if 'https://metadata.datadrivendiscovery.org/types/PredictedTarget' not in semantic_types: + semantic_types.append('https://metadata.datadrivendiscovery.org/types/PredictedTarget') + semantic_types = [semantic_type for semantic_type in semantic_types if semantic_type != 'https://metadata.datadrivendiscovery.org/types/TrueTarget'] + column_metadata['semantic_types'] = semantic_types + + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + def _store_columns_metadata_and_names(self, inputs: Inputs, outputs: Outputs) -> None: + self._attribute_columns_names = list(inputs.columns) + self._target_columns_metadata = self._get_target_columns_metadata(outputs.metadata) + self._target_columns_names = list(outputs.columns) + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + if self._training_inputs is None or self._training_outputs is None: + raise exceptions.InvalidStateError("Missing training data.") + + # An optimization. Do not refit if data has not changed. + if not self._new_training_data: + return CallResult(None) + self._new_training_data = False + + inputs, _ = self._select_inputs_columns(self._training_inputs) + outputs, _ = self._select_outputs_columns(self._training_outputs) + + self._create_learner() + + # A special case for sklearn. It prefers an 1D array instead of 2D when there is only one target. 
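+        # For instance, a single-target "outputs" DataFrame of shape (n_samples, 1) is
+        # flattened by "np.ravel" to a plain array of shape (n_samples,) before fitting;
+        # with multiple target columns the 2D shape is kept, since the forest supports
+        # multi-output classification directly.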
+ if outputs.ndim == 2 and outputs.shape[1] == 1: + fit_outputs = np.ravel(outputs) + else: + fit_outputs = outputs + + self._store_columns_metadata_and_names(inputs, outputs) + + # We skip if there are no columns. If "error_on_no_columns" is set, + # exception should have already been raised. + if len(inputs.columns) and len(outputs.columns): + self._learner.fit(inputs, fit_outputs) + + assert self._learner.n_features_ == len(self._attribute_columns_names), (self._learner.n_features_, len(self._attribute_columns_names)) + assert self._learner.n_outputs_ == len(self._target_columns_metadata), (self._learner.n_outputs_, len(self._target_columns_metadata)) + assert self._learner.n_outputs_ == len(self._target_columns_names), (self._learner.n_outputs_, len(self._target_columns_names)) + + return CallResult(None) + + def continue_fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + if self._training_inputs is None or self._training_outputs is None: + raise exceptions.InvalidStateError("Missing training data.") + + # This model is not improving fitting if called multiple times on the same data. + if not self._new_training_data: + return CallResult(None) + self._new_training_data = False + + if self._learner is None: + self._create_learner() + + n_estimators = self._learner.get_params()['n_estimators'] + n_estimators += self.hyperparams['n_more_estimators'] + self._learner.set_params(n_estimators=n_estimators) + + inputs, _ = self._select_inputs_columns(self._training_inputs) + outputs, _ = self._select_outputs_columns(self._training_outputs) + + # A special case for sklearn. It prefers an 1D array instead of 2D when there is only one target. + if outputs.ndim == 2 and outputs.shape[1] == 1: + fit_outputs = np.ravel(outputs) + else: + fit_outputs = outputs + + self._store_columns_metadata_and_names(inputs, outputs) + + # We skip if there are no columns. If "error_on_no_columns" is set, + # exception should have already been raised. + if len(inputs.columns) and len(outputs.columns): + self._learner.fit(inputs, fit_outputs) + + assert self._learner.n_features_ == len(self._attribute_columns_names), (self._learner.n_features_, len(self._attribute_columns_names)) + assert self._learner.n_outputs_ == len(self._target_columns_metadata), (self._learner.n_outputs_, len(self._target_columns_metadata)) + assert self._learner.n_outputs_ == len(self._target_columns_names), (self._learner.n_outputs_, len(self._target_columns_names)) + + return CallResult(None) + + def _update_predictions_metadata(self, outputs: Optional[Outputs], target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + outputs_metadata = metadata_base.DataMetadata() + if outputs is not None: + outputs_metadata = outputs_metadata.generate(outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, predictions: np.ndarray) -> Outputs: + outputs = container.DataFrame(predictions, generate_metadata=False) + outputs.metadata = self._update_predictions_metadata(outputs, self._target_columns_metadata) + outputs.columns = self._target_columns_names + return outputs + + def _predictions_from_proba(self, proba: np.ndarray) -> np.ndarray: + """ + This is copied from ``ForestClassifier.predict``, but also includes a bugfix for + `this issue`_. + + .. 
_this issue: https://github.com/scikit-learn/scikit-learn/issues/11451 + """ + + if self._learner.n_outputs_ == 1: + return self._learner.classes_.take(np.argmax(proba, axis=1), axis=0) + + else: + predictions = [] + + for k in range(self._learner.n_outputs_): + predictions.append(self._learner.classes_[k].take(np.argmax(proba[k], axis=1), axis=0)) + + return np.array(predictions).T + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + if self._learner is None: + raise exceptions.PrimitiveNotFittedError("Primitive not fitted.") + + selected_inputs, columns_to_use = self._select_inputs_columns(inputs) + + # We skip if there are no columns. If "error_on_no_columns" is set, exception should have already been raised. + # The number of columns should match the number during fitting, so if columns are available now we assume that + # the learner has been really fitted. + output_columns: List[Outputs] = [] + if len(selected_inputs.columns): + # We are not using "predict" directly because of a bug. + # See: https://github.com/scikit-learn/scikit-learn/issues/11451 + proba = self._learner.predict_proba(selected_inputs) + predictions = self._predictions_from_proba(proba) + + output_columns = [self._wrap_predictions(predictions)] + + outputs = base_utils.combine_columns(inputs, columns_to_use, output_columns, return_result=self.hyperparams['return_result'], add_index_columns=self.hyperparams['add_index_columns']) + + return CallResult(outputs) + + def produce_feature_importances(self, *, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + if self._learner is None: + raise exceptions.PrimitiveNotFittedError("Primitive not fitted.") + + if len(getattr(self._learner, 'estimators_', [])): + feature_importances_array = self._learner.feature_importances_.reshape((1, len(self._attribute_columns_names))) + + feature_importances = container.DataFrame(feature_importances_array, generate_metadata=True) + feature_importances.columns = self._attribute_columns_names + for k in range(len(self._attribute_columns_names)): + feature_importances.metadata = feature_importances.metadata.update_column(k, {'name': self._attribute_columns_names[k]}) + + else: + feature_importances = container.DataFrame(generate_metadata=True) + + return CallResult(feature_importances) + + def sample(self, *, inputs: Inputs, num_samples: int = 1, timeout: float = None, iterations: int = None) -> CallResult[Sequence[Outputs]]: + if self._learner is None: + raise exceptions.PrimitiveNotFittedError("Primitive not fitted.") + + inputs, _ = self._select_inputs_columns(inputs) + + # We skip if there are no columns. If "error_on_no_columns" is set, exception should have already been raised. + # The number of columns should match the number during fitting, so if columns are available now we assume that + # the learner has been really fitted. 
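+        # Each requested sample is drawn by picking one tree at random from the fitted
+        # forest and turning its class probabilities into predicted labels, so repeated
+        # samples can differ from each other.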
+ samples = [] + if len(inputs.columns): + for i in range(num_samples): + proba = self._random_state.choice(self._learner.estimators_).predict_proba(inputs) + predictions = self._predictions_from_proba(proba) + samples.append(self._wrap_predictions(predictions)) + + return CallResult(samples) + + def log_likelihoods(self, *, outputs: Outputs, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + if self._learner is None: + raise exceptions.PrimitiveNotFittedError("Primitive not fitted.") + + inputs, _ = self._select_inputs_columns(inputs) + outputs, _ = self._select_outputs_columns(outputs) + + # We skip if there are no columns. If "error_on_no_columns" is set, exception should have already been raised. + # The number of columns should match the number during fitting, so if columns are available now we assume that + # the learner has been really fitted. + if len(inputs.columns) and len(outputs.columns): + if outputs.shape[1] != self._learner.n_outputs_: + raise exceptions.InvalidArgumentValueError("\"outputs\" argument does not have the correct number of target columns.") + + log_proba = self._learner.predict_log_proba(inputs) + + # Making it always a list, even when only one target. + if self._learner.n_outputs_ == 1: + log_proba = [log_proba] + classes = [self._learner.classes_] + else: + classes = self._learner.classes_ + + samples_length = inputs.shape[0] + + log_likelihoods = [] + for k in range(self._learner.n_outputs_): + # We have to map each class to its internal (numerical) index used in the learner. + # This allows "outputs" to contain string classes. + outputs_column = outputs.iloc[:, k] + classes_map = pd.Series(np.arange(len(classes[k])), index=classes[k]) + mapped_outputs_column = outputs_column.map(classes_map) + + # For each target column (column in "outputs"), for each sample (row) we pick the log + # likelihood for a given class. + log_likelihoods.append(log_proba[k][np.arange(samples_length), mapped_outputs_column]) + + results = container.DataFrame(dict(enumerate(log_likelihoods)), generate_metadata=True) + results.columns = outputs.columns + + # TODO: Copy any other metadata? 
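+            # At least the column names of the provided targets are copied over, so the
+            # returned log-likelihood columns can be matched back to the targets they
+            # describe.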
+ for k in range(self._learner.n_outputs_): + column_metadata = outputs.metadata.query_column(k) + if 'name' in column_metadata: + results.metadata = results.metadata.update_column(k, {'name': column_metadata['name']}) + + else: + results = container.DataFrame(generate_metadata=True) + + return CallResult(results) + + def get_params(self) -> Params: + if self._learner is None: + return Params( + estimators=None, + classes=None, + n_classes=None, + n_features=None, + n_outputs=None, + attribute_columns_names=None, + target_columns_metadata=None, + target_columns_names=None, + oob_score=None, + oob_decision_function=None, + ) + + elif not len(getattr(self._learner, 'estimators_', [])): + return Params( + estimators=[], + classes=None, + n_classes=None, + n_features=None, + n_outputs=None, + attribute_columns_names=self._attribute_columns_names, + target_columns_metadata=self._target_columns_metadata, + target_columns_names=self._target_columns_names, + oob_score=None, + oob_decision_function=None, + ) + + return Params( + estimators=self._learner.estimators_, + classes=self._learner.classes_, + n_classes=self._learner.n_classes_, + n_features=self._learner.n_features_, + n_outputs=self._learner.n_outputs_, + attribute_columns_names=self._attribute_columns_names, + target_columns_metadata=self._target_columns_metadata, + target_columns_names=self._target_columns_names, + oob_score=getattr(self._learner, 'oob_score_', None), + oob_decision_function=getattr(self._learner, 'oob_decision_function_', None), + ) + + def set_params(self, *, params: Params) -> None: + if params['estimators'] is None: + self._learner = None + else: + self._create_learner() + + if params['estimators']: + self._learner.estimators_ = params['estimators'] + if params['classes'] is not None: + self._learner.classes_ = params['classes'] + if params['n_classes'] is not None: + self._learner.n_classes_ = params['n_classes'] + if params['n_features'] is not None: + self._learner.n_features_ = params['n_features'] + if params['n_outputs'] is not None: + self._learner.n_outputs_ = params['n_outputs'] + self._attribute_columns_names = params['attribute_columns_names'] + self._target_columns_metadata = params['target_columns_metadata'] + self._target_columns_names = params['target_columns_names'] + if params['oob_score'] is not None: + self._learner.oob_score_ = params['oob_score'] + if params['oob_decision_function'] is not None: + self._learner.oob_decision_function_ = params['oob_decision_function'] + + if getattr(self._learner, 'estimators_', []): + # When continuing fitting, we are increasing "n_estimators", so we have to make sure + # "n_estimators" matches the number of fitted estimators which might be different + # from initial value set from through the hyper-parameter. + self._learner.set_params(n_estimators=len(self._learner.estimators_)) + + def __getstate__(self) -> dict: + state = super().__getstate__() + + # Random state is not part of the "Params", but it is part of the state we want to + # pickle and unpickle to have full reproducibility. So we have to add it ourselves here. + # This is also difference between pickling/unpickling and "get_params"/"set_params". + # The later saves only the model state which is useful to produce at a later time, but + # if we want to also reproduce the exact sequence of values, we should be using pickling. 
+ state['random_state'] = self._random_state + + return state + + def __setstate__(self, state: dict) -> None: + super().__setstate__(state) + + self._random_state = state['random_state'] + + def _can_use_inputs_column(self, inputs_metadata: metadata_base.DataMetadata, column_index: int) -> bool: + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + if not d3m_utils.is_numeric(column_metadata['structural_type']): + return False + + return 'https://metadata.datadrivendiscovery.org/types/Attribute' in column_metadata.get('semantic_types', []) + + def _get_inputs_columns(self, inputs_metadata: metadata_base.DataMetadata) -> List[int]: + def can_use_column(column_index: int) -> bool: + return self._can_use_inputs_column(inputs_metadata, column_index) + + columns_to_use, columns_not_to_use = base_utils.get_columns_to_use(inputs_metadata, self.hyperparams['use_inputs_columns'], self.hyperparams['exclude_inputs_columns'], can_use_column) + + if not columns_to_use: + if self.hyperparams['error_on_no_columns']: + raise ValueError("No inputs columns.") + else: + self.logger.warning("No inputs columns.") + + if self.hyperparams['use_inputs_columns'] and columns_to_use and columns_not_to_use: + self.logger.warning("Not all specified inputs columns can be used. Skipping columns: %(columns)s", { + 'columns': columns_not_to_use, + }) + + return columns_to_use + + def _can_use_outputs_column(self, outputs_metadata: metadata_base.DataMetadata, column_index: int) -> bool: + column_metadata = outputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + return 'https://metadata.datadrivendiscovery.org/types/TrueTarget' in column_metadata.get('semantic_types', []) + + def _get_outputs_columns(self, outputs_metadata: metadata_base.DataMetadata) -> List[int]: + def can_use_column(column_index: int) -> bool: + return self._can_use_outputs_column(outputs_metadata, column_index) + + columns_to_use, columns_not_to_use = base_utils.get_columns_to_use(outputs_metadata, self.hyperparams['use_outputs_columns'], self.hyperparams['exclude_outputs_columns'], can_use_column) + + if not columns_to_use: + if self.hyperparams['error_on_no_columns']: + raise ValueError("No outputs columns.") + else: + self.logger.warning("No outputs columns.") + + if self.hyperparams['use_outputs_columns'] and columns_to_use and columns_not_to_use: + self.logger.warning("Not all specified outputs columns can be used. 
Skipping columns: %(columns)s", { + 'columns': columns_not_to_use, + }) + + return columns_to_use + + def _select_inputs_columns(self, inputs: Inputs) -> Tuple[Inputs, List[int]]: + columns_to_use = self._get_inputs_columns(inputs.metadata) + + return inputs.select_columns(columns_to_use, allow_empty_columns=True), columns_to_use + + def _select_outputs_columns(self, outputs: Outputs) -> Tuple[Outputs, List[int]]: + columns_to_use = self._get_outputs_columns(outputs.metadata) + + return outputs.select_columns(columns_to_use, allow_empty_columns=True), columns_to_use diff --git a/tods/common-primitives/common_primitives/ravel.py b/tods/common-primitives/common_primitives/ravel.py new file mode 100644 index 0000000..56058fe --- /dev/null +++ b/tods/common-primitives/common_primitives/ravel.py @@ -0,0 +1,119 @@ +import os +import typing + +from d3m import container, exceptions, utils as d3m_utils +from d3m.metadata import base as metadata_base, hyperparams +from d3m.primitive_interfaces import base, transformer + +import common_primitives + +__all__ = ('RavelAsRowPrimitive',) + +Inputs = container.DataFrame +Outputs = container.DataFrame + + +class Hyperparams(hyperparams.Hyperparams): + order = hyperparams.Enumeration( + values=['row-major', 'column-major'], + default='row-major', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="\"row-major\" means to index the elements in row-major, with the last axis index changing fastest, back to the first axis index changing slowest. " + "\"column-major\" means to index the elements in column-major, with the first index changing fastest, and the last index changing slowest.", + ) + + +class RavelAsRowPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): + """ + A primitive which ravels all values in the DataFrame into one row. 
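+
+    For example (illustrative data): a 2x2 DataFrame with columns "a" and "b" and rows
+    (1, 2) and (3, 4) is raveled in "row-major" order into the single row (1, 2, 3, 4)
+    with column names ("a", "b", "a", "b"); in "column-major" order it becomes
+    (1, 3, 2, 4) with column names ("a", "a", "b", "b").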
+ """ + + metadata = metadata_base.PrimitiveMetadata( + { + 'id': 'fe87544d-ef93-48d0-b420-76768d351f39', + 'version': '0.1.0', + 'name': "Ravel a DataFrame into a row", + 'python_path': 'd3m.primitives.data_transformation.ravel.DataFrameRowCommon', + 'source': { + 'name': common_primitives.__author__, + 'contact': 'mailto:mitar.commonprimitives@tnode.com', + 'uris': [ + 'https://gitlab.com/datadrivendiscovery/common-primitives/blob/master/common_primitives/ravel.py', + 'https://gitlab.com/datadrivendiscovery/common-primitives.git', + ], + }, + 'installation': [{ + 'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/common-primitives.git@{git_commit}#egg=common_primitives'.format( + git_commit=d3m_utils.current_git_commit(os.path.dirname(__file__)), + ), + }], + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.ARRAY_CONCATENATION, + ], + 'primitive_family': metadata_base.PrimitiveFamily.DATA_TRANSFORMATION, + }, + ) + + @base.singleton + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + output_values: typing.List[typing.List] = [[]] + output_columns = [] + + rows_length, columns_length = inputs.shape + + if self.hyperparams['order'] == 'row-major': + for row in inputs.itertuples(index=False, name=None): + for column_index, value in enumerate(row): + output_values[0].append(value) + output_columns.append(inputs.columns[column_index]) + + elif self.hyperparams['order'] == 'column-major': + for column_index in range(columns_length): + for value in inputs.iloc[:, column_index]: + output_values[0].append(value) + output_columns.append(inputs.columns[column_index]) + + else: + raise exceptions.InvalidArgumentValueError(f"Invalid \"order\" hyper-parameter value \"{self.hyperparams['order']}\".") + + assert len(output_values[0]) == len(output_columns) + assert len(output_values[0]) == rows_length * columns_length + + outputs = container.DataFrame(output_values, columns=output_columns, metadata=inputs.metadata.query(()), generate_metadata=False) + outputs.metadata = outputs.metadata.update((), { + 'dimension': { + 'length': 1, + }, + }) + outputs.metadata = outputs.metadata.update((metadata_base.ALL_ELEMENTS,), { + 'dimension': { + 'length': len(output_columns), + }, + }) + outputs.metadata = outputs.metadata.set_table_metadata() + + if self.hyperparams['order'] == 'row-major': + for index in range(len(output_columns)): + row_index = int(index / columns_length) + column_index = int(index % columns_length) + + outputs.metadata = outputs.metadata.update( + (0, index), + inputs.metadata.query((row_index, column_index)) + ) + + elif self.hyperparams['order'] == 'column-major': + for index in range(len(output_columns)): + row_index = int(index % rows_length) + column_index = int(index / rows_length) + + outputs.metadata = outputs.metadata.update( + (0, index), + inputs.metadata.query((row_index, column_index)) + ) + + else: + assert False + + return base.CallResult(outputs) diff --git a/tods/common-primitives/common_primitives/redact_columns.py b/tods/common-primitives/common_primitives/redact_columns.py new file mode 100644 index 0000000..5616626 --- /dev/null +++ b/tods/common-primitives/common_primitives/redact_columns.py @@ -0,0 +1,162 @@ +import copy +import os +import typing + +from d3m import container, exceptions, utils as d3m_utils +from d3m.metadata import base as metadata_base, hyperparams +from d3m.primitive_interfaces import base, transformer + +import 
common_primitives
+
+Inputs = container.List
+Outputs = container.List
+
+
+class Hyperparams(hyperparams.Hyperparams):
+    match_logic = hyperparams.Enumeration(
+        values=['all', 'any'],
+        default='any',
+        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
+        description="Should a column have all of the semantic types in \"semantic_types\" to be redacted, or any of them?",
+    )
+    semantic_types = hyperparams.Set(
+        elements=hyperparams.Hyperparameter[str](''),
+        default=(),
+        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
+        description="Redact columns with these semantic types. Only columns having semantic types listed here will be operated on, based on \"match_logic\".",
+    )
+    add_semantic_types = hyperparams.Set(
+        elements=hyperparams.Hyperparameter[str](''),
+        default=(),
+        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
+        description="Semantic types to add to redacted columns. All listed semantic types will be added to all columns which were redacted.",
+    )
+
+
+# TODO: Make clear the assumption that both the container type (List) and Datasets should have metadata.
+# The primitive modifies metadata of Datasets, while there is officially no reason for them
+# to really have metadata: metadata is stored on the input container type, not on the
+# values inside it.
+class RedactColumnsPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]):
+    """
+    A primitive which takes as input a list of ``Dataset`` objects and redacts the values of all columns
+    matching a given semantic type or types.
+
+    Redaction is done by setting all values in a redacted column to an empty string.
+
+    It operates only on DataFrame resources inside datasets.
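+
+    For example (a typical evaluation setup): with "semantic_types" set to
+    ("https://metadata.datadrivendiscovery.org/types/TrueTarget",), the target columns
+    of every dataset in the input list have their values set to empty strings, and any
+    semantic types listed in "add_semantic_types" are added to those columns' metadata.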
+ """ + + __author__ = 'Mingjie Sun ' + metadata = metadata_base.PrimitiveMetadata( + { + 'id': '744c4090-e2f6-489e-8efc-8b1e051bfad6', + 'version': '0.2.0', + 'name': "Redact columns for evaluation", + 'python_path': 'd3m.primitives.evaluation.redact_columns.Common', + 'source': { + 'name': common_primitives.__author__, + 'contact': 'mailto:sunmj15@gmail.com', + 'uris': [ + 'https://gitlab.com/datadrivendiscovery/common-primitives/blob/master/common_primitives/redact_columns.py', + 'https://gitlab.com/datadrivendiscovery/common-primitives.git', + ], + }, + 'installation': [{ + 'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/common-primitives.git@{git_commit}#egg=common_primitives'.format( + git_commit=d3m_utils.current_git_commit(os.path.dirname(__file__)), + ), + }], + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.DATA_CONVERSION, + ], + 'primitive_family': metadata_base.PrimitiveFamily.EVALUATION, + }, + ) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + output_datasets = container.List(generate_metadata=True) + + for dataset in inputs: + resources = {} + metadata = dataset.metadata + + for resource_id, resource in dataset.items(): + if not isinstance(resource, container.DataFrame): + resources[resource_id] = resource + continue + + columns_to_redact = self._get_columns_to_redact(metadata, (resource_id,)) + + if not columns_to_redact: + resources[resource_id] = resource + continue + + resource = copy.copy(resource) + + for column_index in columns_to_redact: + column_metadata = dataset.metadata.query((resource_id, metadata_base.ALL_ELEMENTS, column_index)) + if 'structural_type' in column_metadata and issubclass(column_metadata['structural_type'], str): + resource.iloc[:, column_index] = '' + else: + raise TypeError("Primitive can operate only on columns with structural type \"str\", not \"{type}\".".format( + type=column_metadata.get('structural_type', None), + )) + + metadata = self._update_metadata(metadata, resource_id, column_index, ()) + + resources[resource_id] = resource + + dataset = container.Dataset(resources, metadata) + + output_datasets.append(dataset) + + output_datasets.metadata = metadata_base.DataMetadata({ + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': container.List, + 'dimension': { + 'length': len(output_datasets), + }, + }) + + # We update metadata based on metadata of each dataset. + # TODO: In the future this might be done automatically by generate_metadata. + # See: https://gitlab.com/datadrivendiscovery/d3m/issues/119 + for index, dataset in enumerate(output_datasets): + output_datasets.metadata = dataset.metadata.copy_to(output_datasets.metadata, (), (index,)) + + return base.CallResult(output_datasets) + + def _get_columns_to_redact(self, inputs_metadata: metadata_base.DataMetadata, at: metadata_base.Selector) -> typing.Sequence[int]: + columns = [] + + for element in inputs_metadata.get_elements(list(at) + [metadata_base.ALL_ELEMENTS]): + semantic_types = inputs_metadata.query(list(at) + [metadata_base.ALL_ELEMENTS, element]).get('semantic_types', ()) + + # TODO: Should we handle inheritance between semantic types here? 
+            if self.hyperparams['match_logic'] == 'all':
+                matched = all(semantic_type in semantic_types for semantic_type in self.hyperparams['semantic_types'])
+            elif self.hyperparams['match_logic'] == 'any':
+                matched = any(semantic_type in semantic_types for semantic_type in self.hyperparams['semantic_types'])
+            else:
+                raise exceptions.UnexpectedValueError("Unknown value of hyper-parameter \"match_logic\": {value}".format(value=self.hyperparams['match_logic']))
+
+            if matched:
+                if element is metadata_base.ALL_ELEMENTS:
+                    return list(range(inputs_metadata.query(list(at) + [metadata_base.ALL_ELEMENTS]).get('dimension', {}).get('length', 0)))
+                else:
+                    columns.append(typing.cast(int, element))
+
+        return columns
+
+    def _update_metadata(
+        self, inputs_metadata: metadata_base.DataMetadata, resource_id: metadata_base.SelectorSegment,
+        column_index: int, at: metadata_base.Selector,
+    ) -> metadata_base.DataMetadata:
+        outputs_metadata = inputs_metadata
+
+        for semantic_type in self.hyperparams['add_semantic_types']:
+            outputs_metadata = outputs_metadata.add_semantic_type(tuple(at) + (resource_id, metadata_base.ALL_ELEMENTS, column_index), semantic_type)
+
+        return outputs_metadata
diff --git a/tods/common-primitives/common_primitives/regex_filter.py b/tods/common-primitives/common_primitives/regex_filter.py
new file mode 100644
index 0000000..eaa6a6d
--- /dev/null
+++ b/tods/common-primitives/common_primitives/regex_filter.py
@@ -0,0 +1,86 @@
+import os
+import re
+
+from d3m import container, exceptions, utils as d3m_utils
+from d3m.metadata import base as metadata_base, hyperparams
+from d3m.primitive_interfaces import base, transformer
+
+import common_primitives
+from common_primitives import dataframe_utils
+
+__all__ = ('RegexFilterPrimitive',)
+
+Inputs = container.DataFrame
+Outputs = container.DataFrame
+
+
+class Hyperparams(hyperparams.Hyperparams):
+    column = hyperparams.Hyperparameter[int](
+        default=-1,
+        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
+        description='Index of the column the filter applies to.',
+    )
+    inclusive = hyperparams.Hyperparameter[bool](
+        default=True,
+        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
+        description='True when rows matching the pattern are kept, False when they are removed.',
+    )
+    regex = hyperparams.Hyperparameter[str](
+        default="",
+        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
+        description='A Python regular expression string to act as a filter.',
+    )
+
+
+class RegexFilterPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]):
+    """
+    A primitive which filters rows from a DataFrame based on a regex applied to a given column.
+    Columns are identified by index.
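+
+    For example (illustrative values): with "column" set to 2, "regex" set to "^2018-"
+    and "inclusive" left at True, only rows whose third column starts with "2018-" are
+    kept; with "inclusive" set to False those rows are dropped instead.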
+ """ + + metadata = metadata_base.PrimitiveMetadata( + { + 'id': 'cf73bb3d-170b-4ba9-9ead-3dd4b4524b61', + 'version': '0.1.0', + 'name': "Regex dataset filter", + 'python_path': 'd3m.primitives.data_preprocessing.regex_filter.Common', + 'source': { + 'name': common_primitives.__author__, + 'contact': 'mailto:cbethune@uncharted.software', + 'uris': [ + 'https://gitlab.com/datadrivendiscovery/common-primitives/blob/master/common_primitives/regex_filter.py', + 'https://gitlab.com/datadrivendiscovery/common-primitives.git', + ], + }, + 'installation': [{ + 'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/common-primitives.git@{git_commit}#egg=common_primitives'.format( + git_commit=d3m_utils.current_git_commit(os.path.dirname(__file__)), + ), + }], + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.ARRAY_SLICING, + ], + 'primitive_family': metadata_base.PrimitiveFamily.DATA_PREPROCESSING, + }, + ) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + # to make sure index matches row indices + resource = inputs.reset_index(drop=True) + + try: + # apply the filter + pattern = re.compile(self.hyperparams['regex']) + matched = resource.iloc[:, self.hyperparams['column']].astype(str).str.contains(pattern) + to_keep = matched if self.hyperparams['inclusive'] else ~matched + + to_keep_indices = resource.loc[to_keep].index + + except re.error as error: + raise exceptions.InvalidArgumentValueError("Invalid regex: {regex}".format(regex=self.hyperparams['regex'])) from error + + # remove dataframe and metadata rows by index + outputs = dataframe_utils.select_rows(inputs, to_keep_indices) + + return base.CallResult(outputs) diff --git a/tods/common-primitives/common_primitives/remove_columns.py b/tods/common-primitives/common_primitives/remove_columns.py new file mode 100644 index 0000000..3d81a8e --- /dev/null +++ b/tods/common-primitives/common_primitives/remove_columns.py @@ -0,0 +1,59 @@ +import os + +from d3m import container, utils as d3m_utils +from d3m.metadata import base as metadata_base, hyperparams +from d3m.primitive_interfaces import base, transformer + +import common_primitives + +__all__ = ('RemoveColumnsPrimitive',) + +Inputs = container.DataFrame +Outputs = container.DataFrame + + +class Hyperparams(hyperparams.Hyperparams): + columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description='A set of column indices of columns to remove.', + ) + + +class RemoveColumnsPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): + """ + A primitive which removes a fixed list of columns. 
+ """ + + metadata = metadata_base.PrimitiveMetadata( + { + 'id': '3b09ba74-cc90-4f22-9e0a-0cf4f29a7e28', + 'version': '0.1.0', + 'name': "Removes columns", + 'python_path': 'd3m.primitives.data_transformation.remove_columns.Common', + 'source': { + 'name': common_primitives.__author__, + 'contact': 'mailto:cbethune@uncharted.software', + 'uris': [ + 'https://gitlab.com/datadrivendiscovery/common-primitives/blob/master/common_primitives/remove_columns.py', + 'https://gitlab.com/datadrivendiscovery/common-primitives.git', + ], + }, + 'installation': [{ + 'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/common-primitives.git@{git_commit}#egg=common_primitives'.format( + git_commit=d3m_utils.current_git_commit(os.path.dirname(__file__)), + ), + }], + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.ARRAY_SLICING, + ], + 'primitive_family': metadata_base.PrimitiveFamily.DATA_TRANSFORMATION, + }, + ) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + outputs = inputs.remove_columns(self.hyperparams['columns']) + + return base.CallResult(outputs) diff --git a/tods/common-primitives/common_primitives/remove_duplicate_columns.py b/tods/common-primitives/common_primitives/remove_duplicate_columns.py new file mode 100644 index 0000000..38d7cbb --- /dev/null +++ b/tods/common-primitives/common_primitives/remove_duplicate_columns.py @@ -0,0 +1,169 @@ +import copy +import itertools +import typing +import os + +from d3m import container, exceptions, utils as d3m_utils +from d3m.base import utils as base_utils +from d3m.metadata import base as metadata_base, params, hyperparams +from d3m.primitive_interfaces import base, unsupervised_learning + +import common_primitives + +__all__ = ('RemoveDuplicateColumnsPrimitive',) + +Inputs = container.DataFrame +Outputs = container.DataFrame + + +class Params(params.Params): + equal_columns_map: typing.Optional[typing.Dict[int, typing.Set[int]]] + + +class Hyperparams(hyperparams.Hyperparams): + use_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to operate on. If any specified column is not a duplicate with any other specified, it is skipped.", + ) + exclude_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.", + ) + + +# TODO: Compare columns also by determining if there exists a bijection between two columns and find such columns duplicate as well. +class RemoveDuplicateColumnsPrimitive(unsupervised_learning.UnsupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]): + """ + A primitive which removes duplicate columns based on exact match in all their values. + + It adds names of removed columns into ``other_names`` metadata for columns remaining. 
+ """ + + metadata = metadata_base.PrimitiveMetadata( + { + 'id': '130513b9-09ca-4785-b386-37ab31d0cf8b', + 'version': '0.1.0', + 'name': "Removes duplicate columns", + 'python_path': 'd3m.primitives.data_transformation.remove_duplicate_columns.Common', + 'source': { + 'name': common_primitives.__author__, + 'contact': 'mailto:mitar.commonprimitives@tnode.com', + 'uris': [ + 'https://gitlab.com/datadrivendiscovery/common-primitives/blob/master/common_primitives/remove_duplicate_columns.py', + 'https://gitlab.com/datadrivendiscovery/common-primitives.git', + ], + }, + 'installation': [{ + 'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/common-primitives.git@{git_commit}#egg=common_primitives'.format( + git_commit=d3m_utils.current_git_commit(os.path.dirname(__file__)), + ), + }], + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.ARRAY_SLICING, + ], + 'primitive_family': metadata_base.PrimitiveFamily.DATA_TRANSFORMATION, + }, + ) + + def __init__(self, *, hyperparams: Hyperparams) -> None: + super().__init__(hyperparams=hyperparams) + + self._training_inputs: Inputs = None + self._equal_columns_map: typing.Optional[typing.Dict[int, typing.Set[int]]] = None + self._fitted = False + + def set_training_data(self, *, inputs: Inputs) -> None: # type: ignore + self._training_inputs = inputs + self._fitted = False + + def fit(self, *, timeout: float = None, iterations: int = None) -> base.CallResult[None]: + if self._training_inputs is None: + raise exceptions.InvalidStateError("Missing training data.") + + if self._fitted: + return base.CallResult(None) + + columns_to_use = self._get_columns(self._training_inputs.metadata) + columns_to_use_length = len(columns_to_use) + + equal_columns = [] + for i in range(columns_to_use_length): + for j in range(i + 1, columns_to_use_length): + if self._training_inputs.iloc[:, columns_to_use[i]].equals(self._training_inputs.iloc[:, columns_to_use[j]]): + equal_columns.append((i, j)) + + # It might be that more columns are equal to each other, so we resolve those and + # will keep only the first column and remove all others. + equal_columns_map: typing.Dict[int, typing.Set[int]] = {} + for i, j in equal_columns: + for first, others in equal_columns_map.items(): + if first == i: + others.add(j) + break + elif i in others: + others.add(j) + break + else: + equal_columns_map[i] = {j} + + self._equal_columns_map = equal_columns_map + self._fitted = True + + return base.CallResult(None) + + def _get_columns(self, inputs_metadata: metadata_base.DataMetadata) -> typing.List[int]: + def can_use_column(column_index: int) -> bool: + return True + + columns_to_use, columns_not_to_use = base_utils.get_columns_to_use(inputs_metadata, self.hyperparams['use_columns'], self.hyperparams['exclude_columns'], can_use_column) + + return columns_to_use + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + if not self._fitted: + raise exceptions.PrimitiveNotFittedError("Primitive not fitted.") + + outputs = copy.copy(inputs) + + # Set "other_names" metadata on columns remaining. + for first, others in self._equal_columns_map.items(): + first_name = outputs.metadata.query_column(first).get('name', None) + + names = set() + for other in others: + other_metadata = outputs.metadata.query_column(other) + # We do not care about empty strings for names either. 
+ if other_metadata.get('name', None): + if first_name != other_metadata['name']: + names.add(other_metadata['name']) + + first_other_names = list(outputs.metadata.query_column(first).get('other_names', [])) + first_other_names += sorted(names) + if first_other_names: + outputs.metadata = outputs.metadata.update_column(first, { + 'other_names': first_other_names, + }) + + # We flatten all values of "equal_columns_map" into one list. + outputs = outputs.remove_columns(list(itertools.chain.from_iterable(self._equal_columns_map.values()))) + + return base.CallResult(outputs) + + def get_params(self) -> Params: + if not self._fitted: + return Params( + equal_columns_map=None, + ) + + return Params( + equal_columns_map=self._equal_columns_map, + ) + + def set_params(self, *, params: Params) -> None: + self._equal_columns_map = params['equal_columns_map'] + self._fitted = params['equal_columns_map'] is not None diff --git a/tods/common-primitives/common_primitives/remove_semantic_types.py b/tods/common-primitives/common_primitives/remove_semantic_types.py new file mode 100644 index 0000000..eeb5dfd --- /dev/null +++ b/tods/common-primitives/common_primitives/remove_semantic_types.py @@ -0,0 +1,78 @@ +import copy +import typing +import os + +from d3m import container, utils as d3m_utils +from d3m.metadata import base as metadata_base, hyperparams +from d3m.primitive_interfaces import base, transformer + +import common_primitives + +__all__ = ('RemoveSemanticTypesPrimitive',) + +Inputs = container.DataFrame +Outputs = container.DataFrame + + +class Hyperparams(hyperparams.Hyperparams): + columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices of columns to remove semantic types from.", + ) + semantic_types = hyperparams.Set( + elements=hyperparams.Hyperparameter[str](''), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Semantic types to remove from columns listed in \"columns\".", + ) + + +class RemoveSemanticTypesPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): + """ + A primitive which removes semantic types from columns in a DataFrame. 
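+
+    For example (illustrative values): with "columns" set to (0,) and "semantic_types"
+    set to ("https://metadata.datadrivendiscovery.org/types/Attribute",), that semantic
+    type is removed from the first column's metadata; the column values themselves are
+    left untouched.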
+ """ + + metadata = metadata_base.PrimitiveMetadata( + { + 'id': '3002bc5b-fa47-4a3d-882e-a8b5f3d756aa', + 'version': '0.1.0', + 'name': "Remove semantic types from columns", + 'python_path': 'd3m.primitives.data_transformation.remove_semantic_types.Common', + 'source': { + 'name': common_primitives.__author__, + 'contact': 'mailto:mitar.commonprimitives@tnode.com', + 'uris': [ + 'https://gitlab.com/datadrivendiscovery/common-primitives/blob/master/common_primitives/remove_semantic_types.py', + 'https://gitlab.com/datadrivendiscovery/common-primitives.git', + ], + }, + 'installation': [{ + 'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/common-primitives.git@{git_commit}#egg=common_primitives'.format( + git_commit=d3m_utils.current_git_commit(os.path.dirname(__file__)), + ), + }], + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.DATA_CONVERSION, + ], + 'primitive_family': metadata_base.PrimitiveFamily.DATA_TRANSFORMATION, + }, + ) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + outputs = copy.copy(inputs) + + outputs.metadata = self._update_metadata(outputs.metadata) + + return base.CallResult(outputs) + + def _update_metadata(self, inputs_metadata: metadata_base.DataMetadata) -> metadata_base.DataMetadata: + outputs_metadata = inputs_metadata + + for column_index in self.hyperparams['columns']: + for semantic_type in self.hyperparams['semantic_types']: + outputs_metadata = outputs_metadata.remove_semantic_type((metadata_base.ALL_ELEMENTS, column_index), semantic_type) + + return outputs_metadata diff --git a/tods/common-primitives/common_primitives/rename_duplicate_columns.py b/tods/common-primitives/common_primitives/rename_duplicate_columns.py new file mode 100644 index 0000000..a25fd3c --- /dev/null +++ b/tods/common-primitives/common_primitives/rename_duplicate_columns.py @@ -0,0 +1,73 @@ +import os + +import numpy # type: ignore +import pandas # type: ignore +from d3m import container, utils as d3m_utils +from d3m.metadata import base as metadata_base, hyperparams +from d3m.primitive_interfaces import base, transformer +from d3m.primitive_interfaces.base import CallResult + +import common_primitives + +Inputs = container.DataFrame +Outputs = container.DataFrame + + +class Hyperparams(hyperparams.Hyperparams): + separator = hyperparams.Hyperparameter[str]( + default='.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Separator separates additional identifier and original column name", + ) + + +class RenameDuplicateColumnsPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): + """ + A primitive renaming columns with duplicated name + + A numerical counter will be postfix on the original name and the original name will be stored in the other_name + column metadata + """ + + __author__ = 'TAMU DARPA D3M Team, TsungLin Yang ' + metadata = metadata_base.PrimitiveMetadata( + { + 'id': '7b067a78-4ad4-411d-9cf9-87bcee38ac73', + 'version': '0.2.0', + 'name': "Rename all the duplicated name column in DataFrame", + 'python_path': 'd3m.primitives.data_transformation.rename_duplicate_name.DataFrameCommon', + 'source': { + 'name': common_primitives.__author__, + 'contact': 'mailto:lin.yang@tamu.edu', + 'uris': [ + 'https://gitlab.com/datadrivendiscovery/common-primitives.git', + ], + }, + 'installation': [{ + 'type': metadata_base.PrimitiveInstallationType.PIP, + 
'package_uri': 'git+https://gitlab.com/datadrivendiscovery/common-primitives.git@{git_commit}#egg=common_primitives'.format( + git_commit=d3m_utils.current_git_commit(os.path.dirname(__file__)), + ), + }], + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.DATA_CONVERSION, + ], + 'primitive_family': metadata_base.PrimitiveFamily.DATA_TRANSFORMATION, + }, + ) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + cols = pandas.Series(inputs.columns) + dup_columns = inputs.columns[inputs.columns.duplicated()].unique() + if not dup_columns.empty: + inputs = inputs.copy() + for dup in dup_columns: + to_change_index = numpy.where(inputs.columns.values == dup)[0] + new_names = [dup + self.hyperparams['separator'] + str(d_idx) if d_idx != 0 else dup for d_idx in + range(len(to_change_index))] + for count, index in enumerate(to_change_index): + cols[index] = new_names[count] + inputs.metadata = inputs.metadata.update_column(index.item(), {'other_name': dup}) + inputs.metadata = inputs.metadata.update_column(index.item(), {'name': cols[index]}) + inputs.columns = cols + return CallResult(inputs) diff --git a/tods/common-primitives/common_primitives/replace_semantic_types.py b/tods/common-primitives/common_primitives/replace_semantic_types.py new file mode 100644 index 0000000..cf278be --- /dev/null +++ b/tods/common-primitives/common_primitives/replace_semantic_types.py @@ -0,0 +1,165 @@ +import typing +import os + +from d3m import container, exceptions, utils as d3m_utils +from d3m.base import utils as base_utils +from d3m.metadata import base as metadata_base, hyperparams +from d3m.primitive_interfaces import base, transformer + +import common_primitives + +__all__ = ('ReplaceSemanticTypesPrimitive',) + +Inputs = container.DataFrame +Outputs = container.DataFrame + + +class Hyperparams(hyperparams.Hyperparams): + use_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to operate on. If any specified column does not have any semantic type from \"from_semantic_types\", it is skipped.", + ) + exclude_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='replace', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should columns with replaced semantic types be appended, should they replace original columns, or should only columns with replaced semantic types be returned?", + ) + add_index_columns = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. 
Applicable only if \"return_result\" is set to \"new\".", + ) + match_logic = hyperparams.Enumeration( + values=['all', 'any'], + default='any', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should a column have all of semantic types in \"from_semantic_types\" to have semantic types replaced, or any of them?", + ) + from_semantic_types = hyperparams.Set( + elements=hyperparams.Hyperparameter[str](''), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Semantic types to replace. Only columns having semantic types listed here will be operated on, based on \"match_logic\". " + "All semantic types listed here will be removed from those columns.", + ) + to_semantic_types = hyperparams.Set( + elements=hyperparams.Hyperparameter[str](''), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Semantic types to add to matching columns. All listed semantic types will be added to all columns which had semantic types removed.", + ) + + +class ReplaceSemanticTypesPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): + """ + A primitive which replaces semantic types with new semantic types for columns in a DataFrame. + """ + + metadata = metadata_base.PrimitiveMetadata( + { + 'id': '7bae062e-f8b0-4358-91f2-9288a51f3e82', + 'version': '0.2.0', + 'name': "Replace semantic types for columns", + 'python_path': 'd3m.primitives.data_transformation.replace_semantic_types.Common', + 'source': { + 'name': common_primitives.__author__, + 'contact': 'mailto:mitar.commonprimitives@tnode.com', + 'uris': [ + 'https://gitlab.com/datadrivendiscovery/common-primitives/blob/master/common_primitives/replace_semantic_types.py', + 'https://gitlab.com/datadrivendiscovery/common-primitives.git', + ], + }, + 'installation': [{ + 'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/common-primitives.git@{git_commit}#egg=common_primitives'.format( + git_commit=d3m_utils.current_git_commit(os.path.dirname(__file__)), + ), + }], + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.DATA_CONVERSION, + ], + 'primitive_family': metadata_base.PrimitiveFamily.DATA_TRANSFORMATION, + }, + ) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + columns_to_use, output_columns = self._produce_columns(inputs) + + outputs = base_utils.combine_columns(inputs, columns_to_use, output_columns, return_result=self.hyperparams['return_result'], add_index_columns=self.hyperparams['add_index_columns']) + + return base.CallResult(outputs) + + def _can_use_column(self, inputs_metadata: metadata_base.DataMetadata, column_index: int) -> bool: + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + semantic_types = column_metadata.get('semantic_types', []) + + if self.hyperparams['match_logic'] == 'all': + return all(semantic_type in semantic_types for semantic_type in self.hyperparams['from_semantic_types']) + elif self.hyperparams['match_logic'] == 'any': + return any(semantic_type in semantic_types for semantic_type in self.hyperparams['from_semantic_types']) + else: + raise exceptions.UnexpectedValueError("Unknown value of hyper-parameter \"match_logic\": {value}".format(value=self.hyperparams['match_logic'])) + + def _get_columns(self, inputs_metadata: metadata_base.DataMetadata) -> 
typing.List[int]: + def can_use_column(column_index: int) -> bool: + return self._can_use_column(inputs_metadata, column_index) + + columns_to_use, columns_not_to_use = base_utils.get_columns_to_use(inputs_metadata, self.hyperparams['use_columns'], self.hyperparams['exclude_columns'], can_use_column) + + # We are OK if no columns ended up having semantic types replaced. + # "base_utils.combine_columns" will throw an error if it cannot work with this. + + if self.hyperparams['use_columns'] and columns_not_to_use: + self.logger.warning("Not all specified columns matches semantic types from \"from_semantic_types\". Skipping columns: %(columns)s", { + 'columns': columns_not_to_use, + }) + + return columns_to_use + + def _produce_columns(self, inputs: Inputs) -> typing.Tuple[typing.List[int], typing.List[Outputs]]: + columns_to_use = self._get_columns(inputs.metadata) + + output_columns = [] + + for column_index in columns_to_use: + column = inputs.select_columns([column_index]) + column.metadata = self._update_metadata(column.metadata) + output_columns.append(column) + + return columns_to_use, output_columns + + def _produce_columns_metadata(self, inputs_metadata: metadata_base.DataMetadata) -> typing.Tuple[typing.List[int], typing.List[metadata_base.DataMetadata]]: + columns_to_use = self._get_columns(inputs_metadata) + + output_columns = [] + + for column_index in columns_to_use: + column_metadata = inputs_metadata.select_columns([column_index]) + column_metadata = self._update_metadata(column_metadata) + output_columns.append(column_metadata) + + return columns_to_use, output_columns + + def _update_metadata(self, inputs_metadata: metadata_base.DataMetadata) -> metadata_base.DataMetadata: + inputs_columns_length = inputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + assert inputs_columns_length == 1, inputs_columns_length + + outputs_metadata = inputs_metadata + + for semantic_type in self.hyperparams['from_semantic_types']: + outputs_metadata = outputs_metadata.remove_semantic_type((metadata_base.ALL_ELEMENTS, 0), semantic_type) + for semantic_type in self.hyperparams['to_semantic_types']: + outputs_metadata = outputs_metadata.add_semantic_type((metadata_base.ALL_ELEMENTS, 0), semantic_type) + + return outputs_metadata diff --git a/tods/common-primitives/common_primitives/simple_exponential_smoothing.py b/tods/common-primitives/common_primitives/simple_exponential_smoothing.py new file mode 100644 index 0000000..cdd6fff --- /dev/null +++ b/tods/common-primitives/common_primitives/simple_exponential_smoothing.py @@ -0,0 +1,354 @@ +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os +import sklearn +import numpy +import typing +import pandas as pd +# Custom import commands if any +from sklearn.preprocessing.data import Normalizer +from statsmodels.tsa.api import ExponentialSmoothing, SimpleExpSmoothing, Holt + + +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError + + +from d3m.primitive_interfaces.base import CallResult,DockerContainer +from d3m.primitive_interfaces.unsupervised_learning import UnsupervisedLearnerPrimitiveBase + +import os +from typing import Any,Optional,List + +from d3m import 
container, utils as d3m_utils +from d3m.metadata import base as metadata_base +from d3m.metadata import hyperparams,params +from d3m.primitive_interfaces import base, transformer + + + +Inputs = d3m_dataframe +# Inputs = container.Dataset +Outputs = d3m_dataframe + + +class Params(params.Params): + input_column_names: Optional[Any] + target_names_: Optional[Sequence[Any]] + training_indices_: Optional[Sequence[int]] + target_column_indices_: Optional[Sequence[int]] + target_columns_metadata_: Optional[List[OrderedDict]] + + + +class Hyperparams(hyperparams.Hyperparams): +# Added by Mia + endog = hyperparams.Bounded[int]( + lower = 2, + upper = None, + default = 3, + description='Array like time series', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + +# Keep previous + norm = hyperparams.Enumeration[str]( + default='l2', + values=['l1', 'l2', 'max'], + description='The norm to use to normalize each non zero sample.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + use_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to operate on. If any specified column cannot be parsed, it is skipped.", + ) + exclude_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='new', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. 
To prevent pipelines from breaking set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute'], + default='https://metadata.datadrivendiscovery.org/types/Attribute', + description='Decides what semantic type to attach to generated attributes', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + +class SimpleExponentialSmoothing(UnsupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]): + """ + Primitive wrapping for simple exponential smoothing + `statsmodels documentation `_ + + """ + + __author__ = "DataLab@TAMU" + metadata = metadata_base.PrimitiveMetadata({ + "algorithm_types": [metadata_base.PrimitiveAlgorithmType.DATA_NORMALIZATION, ], + "name": "sklearn.preprocessing.data.Normalizer", + "primitive_family": metadata_base.PrimitiveFamily.DATA_PREPROCESSING, + "python_path": "d3m.primitives.anomaly_detection.SimpleExponentialSmoothing", + "source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.Normalizer.html']}, + "version": "2019.11.13", + "id": "980b3a2d-1574-31f3-8326-ddc62f8fc2c3", + "hyperparams_to_tune": ['norm'], + 'installation': [ + {'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + }] + }) + + def __init__(self, *, + hyperparams: Hyperparams, + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None) -> None: + + super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) + + # False + self._clf = Normalizer( + norm=self.hyperparams['norm'], + ) + + self._inputs = None + self._outputs = None + self._training_inputs = None + self._training_outputs = None + self._target_names = None + self._training_indices = None + self._target_column_indices = None + self._target_columns_metadata: List[OrderedDict] = None + self._input_column_names = None + self._fitted = False + + + def set_training_data(self, *, inputs: Inputs) -> None: + self._inputs = inputs + self._fitted = False + + def fit(self, *, timeout: float = None, iterations: int = None)-> CallResult[None]: + if self._fitted: + return CallResult(None) + + self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + if self._training_inputs is None: + return CallResult(None) + + if len(self._training_indices) > 0: + self._clf.fit(self._training_inputs) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + return CallResult(None) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + + self.logger.info('Simple Exponential Smoothing Primitive called') + outputs = inputs + try: + columns_to_calculate_simple_exponential_smoothing= List[str] + if(self.hyperparams['use_columns']==()): + columns_to_calculate_simple_exponential_smoothing = list(set(inputs.columns)-set(['d3mIndex','timestamp','ground_truth'])) + else: + 
columns_to_calculate_simple_exponential_smoothing = self.hyperparams['use_columns'] + for column in columns_to_calculate_simple_exponential_smoothing: + outputs[column+"_simple_exponential_smoothing"] = SimpleExpSmoothing(inputs[column]).fit(smoothing_level=0.2,optimized=False).fittedvalues + except Exception as e: + self.logger.error("Error in Calculating simple exponential smoothing",e) + self._update_metadata(outputs) + print(inputs) + print("-------------") + print(outputs) + + return base.CallResult(outputs) + + + def _update_metadata(self, outputs): + outputs.metadata = outputs.metadata.generate(outputs,) + + + + + def get_params(self) -> Params: + if not self._fitted: + return Params( + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + return Params( + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + def set_params(self, *, params: Params) -> None: + self._input_column_names = params['input_column_names'] + self._training_indices = params['training_indices_'] + self._target_names = params['target_names_'] + self._target_column_indices = params['target_column_indices_'] + self._target_columns_metadata = params['target_columns_metadata_'] + self._fitted = True + + + + + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=hyperparams['use_columns'], + exclude_columns=hyperparams['exclude_columns'], + can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, numpy.integer, numpy.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + + @classmethod + def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]: + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) + + # 
Update semantic types and prepare it for predicted targets. + semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set([]) + add_semantic_types = [] + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + outputs = d3m_dataframe(predictions, generate_metadata=True) + target_columns_metadata = self._copy_inputs_metadata(inputs.metadata, self._training_indices, outputs.metadata, self.hyperparams) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, target_columns_metadata) + return outputs + + + @classmethod + def _copy_inputs_metadata(cls, inputs_metadata: metadata_base.DataMetadata, input_indices: List[int], + outputs_metadata: metadata_base.DataMetadata, hyperparams): + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + target_columns_metadata: List[OrderedDict] = [] + for column_index in input_indices: + column_name = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name") + if column_name is None: + column_name = "output_{}".format(column_index) + + column_metadata = OrderedDict(inputs_metadata.query_column(column_index)) + semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set([]) + add_semantic_types = set() + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + # If outputs has more columns than index, add Attribute Type to all remaining + if outputs_length > len(input_indices): + for column_index in range(len(input_indices), outputs_length): + column_metadata = OrderedDict() + semantic_types = set() + semantic_types.add(hyperparams["return_semantic_type"]) + column_name = "output_{}".format(column_index) + column_metadata["semantic_types"] = list(semantic_types) + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + +SimpleExponentialSmoothing.__doc__ = Normalizer.__doc__ diff --git a/tods/common-primitives/common_primitives/simple_profiler.py b/tods/common-primitives/common_primitives/simple_profiler.py new file mode 100644 index 0000000..1879e41 --- /dev/null +++ b/tods/common-primitives/common_primitives/simple_profiler.py @@ -0,0 +1,795 @@ +import collections +import copy +import os.path +import re +import typing + +import numpy # type: ignore +import pandas # type: ignore +from pandas.io 
import common as pandas_common # type: ignore + +from d3m import container, exceptions, utils as d3m_utils +from d3m.base import utils as base_utils +from d3m.metadata import base as metadata_base, hyperparams as hyperparams_module, params +from d3m.primitive_interfaces import base, unsupervised_learning + +import common_primitives +from common_primitives import utils + +__all__ = ('SimpleProfilerPrimitive',) + +WHITESPACE_REGEX = re.compile(r'\s') + +Inputs = container.DataFrame +Outputs = container.DataFrame + + +class Params(params.Params): + add_semantic_types: typing.Optional[typing.List[typing.List[str]]] + remove_semantic_types: typing.Optional[typing.List[typing.List[str]]] + + +class Hyperparams(hyperparams_module.Hyperparams): + detect_semantic_types = hyperparams_module.Set( + elements=hyperparams_module.Enumeration( + values=[ + 'http://schema.org/Boolean', 'https://metadata.datadrivendiscovery.org/types/CategoricalData', + 'http://schema.org/Integer', 'http://schema.org/Float', 'http://schema.org/Text', + 'https://metadata.datadrivendiscovery.org/types/FloatVector', 'http://schema.org/DateTime', + 'https://metadata.datadrivendiscovery.org/types/UniqueKey', + 'https://metadata.datadrivendiscovery.org/types/Attribute', + 'https://metadata.datadrivendiscovery.org/types/Time', + 'https://metadata.datadrivendiscovery.org/types/TrueTarget', + 'https://metadata.datadrivendiscovery.org/types/UnknownType', + 'https://metadata.datadrivendiscovery.org/types/PrimaryKey', + 'https://metadata.datadrivendiscovery.org/types/PrimaryMultiKey', + ], + # Default is ignored. + # TODO: Remove default. See: https://gitlab.com/datadrivendiscovery/d3m/issues/141 + default='http://schema.org/Boolean', + ), + default=( + 'http://schema.org/Boolean', 'https://metadata.datadrivendiscovery.org/types/CategoricalData', + 'http://schema.org/Integer', 'http://schema.org/Float', 'http://schema.org/Text', + 'https://metadata.datadrivendiscovery.org/types/FloatVector', 'http://schema.org/DateTime', + 'https://metadata.datadrivendiscovery.org/types/UniqueKey', + 'https://metadata.datadrivendiscovery.org/types/Attribute', + 'https://metadata.datadrivendiscovery.org/types/Time', + 'https://metadata.datadrivendiscovery.org/types/TrueTarget', + 'https://metadata.datadrivendiscovery.org/types/UnknownType', + 'https://metadata.datadrivendiscovery.org/types/PrimaryKey', + 'https://metadata.datadrivendiscovery.org/types/PrimaryMultiKey', + ), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of semantic types to detect and set. 
One can provide a subset of supported semantic types to limit what the primitive detects.", + ) + remove_unknown_type = hyperparams_module.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Remove \"https://metadata.datadrivendiscovery.org/types/UnknownType\" semantic type from columns on which the primitive has detected other semantic types.", + ) + categorical_max_absolute_distinct_values = hyperparams_module.Union[typing.Union[int, None]]( + configuration=collections.OrderedDict( + limit=hyperparams_module.Bounded[int]( + lower=1, + upper=None, + default=50, + ), + unlimited=hyperparams_module.Hyperparameter[None]( + default=None, + description='No absolute limit on distinct values.', + ), + ), + default='limit', + description='The maximum absolute number of distinct values (all missing values as counted as one distinct value) for a column to be considered categorical.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + ) + categorical_max_ratio_distinct_values = hyperparams_module.Bounded[float]( + lower=0, + upper=1, + default=0.05, + description='The maximum ratio of distinct values (all missing values as counted as one distinct value) vs. number of rows for a column to be considered categorical.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + ) + nan_values = hyperparams_module.Set( + elements=hyperparams_module.Hyperparameter[str](''), + default=sorted(pandas_common._NA_VALUES), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of strings to recognize as NaNs when detecting a float column.", + ) + text_min_ratio_values_with_whitespace = hyperparams_module.Bounded[float]( + lower=0, + upper=1, + default=0.5, + description='The minimum ratio of values with any whitespace (after first stripping) vs. number of rows for a column to be considered a text column.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + ) + use_columns = hyperparams_module.Set( + elements=hyperparams_module.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to operate on. If any specified column cannot be detected, it is skipped.", + ) + exclude_columns = hyperparams_module.Set( + elements=hyperparams_module.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams_module.Enumeration( + values=['append', 'replace', 'new'], + default='replace', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should detected columns be appended, should they replace original columns, or should only detected columns be returned?", + ) + add_index_columns = hyperparams_module.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. 
Applicable only if \"return_result\" is set to \"new\".", + ) + replace_index_columns = hyperparams_module.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Replace primary index columns even if otherwise appending columns. Applicable only if \"return_result\" is set to \"append\".", + ) + + +class SimpleProfilerPrimitive(unsupervised_learning.UnsupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]): + """ + A primitive which determines missing semantic types for columns and adds + them automatically. It uses a set of hard-coded rules/heuristics to determine + semantic types. Feel free to propose improvements. + + Besides determining column types it also determines some column roles. + + Some rules are intuitive and expected, but there are also few special behaviors + (if not disabled by not providing a corresponding semantic type in + ``detect_semantic_types``): + + * If a column does not have any semantic types, + ``https://metadata.datadrivendiscovery.org/types/UnknownType`` semantic type + is first set for the column. If any other semantic type is set later on as + part of logic of this primitive, the + ``https://metadata.datadrivendiscovery.org/types/UnknownType`` is removed + (including if the column originally came with this semantic type). + * If a column has ``https://metadata.datadrivendiscovery.org/types/SuggestedTarget`` + semantic type and no other column (even those not otherwise operated on by + the primitive) has a semantic type + ``https://metadata.datadrivendiscovery.org/types/TrueTarget`` is set on + the column. This allows operation on data without a problem description. + This is only for the first such column. + * All other columns which are missing semantic types initially we set as + ``https://metadata.datadrivendiscovery.org/types/Attribute``. + * Any column with ``http://schema.org/DateTime`` semantic type is also set + as ``https://metadata.datadrivendiscovery.org/types/Time`` semantic type. + * ``https://metadata.datadrivendiscovery.org/types/PrimaryKey`` or + ``https://metadata.datadrivendiscovery.org/types/PrimaryMultiKey`` is set only + if no other column (even those not otherwise operated on by + the primitive) is a primary key, and set based on the column name: only + when it is ``d3mIndex``. 
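+
+    A minimal usage sketch (``df`` here is only an illustrative D3M container ``DataFrame``
+    with metadata; defaults are used for all hyper-parameters)::
+
+        profiler = SimpleProfilerPrimitive(hyperparams=Hyperparams.defaults())
+        profiler.set_training_data(inputs=df)
+        profiler.fit()
+        profiled_df = profiler.produce(inputs=df).value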
+ """ + + __author__ = 'Louis Huang' + metadata = metadata_base.PrimitiveMetadata( + { + 'id': 'e193afa1-b45e-4d29-918f-5bb1fa3b88a7', + 'version': '0.2.0', + 'name': "Determine missing semantic types for columns automatically", + 'python_path': 'd3m.primitives.schema_discovery.profiler.Common', + 'source': { + 'name': common_primitives.__author__, + 'contact': 'mailto:luyih@berkeley.edu', + 'uris': [ + 'https://gitlab.com/datadrivendiscovery/common-primitives/blob/master/common_primitives/simple_profiler.py', + 'https://gitlab.com/datadrivendiscovery/common-primitives.git', + ], + }, + 'installation': [{ + 'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/common-primitives.git@{git_commit}#egg=common_primitives'.format( + git_commit=d3m_utils.current_git_commit(os.path.dirname(__file__)), + ), + }], + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.DATA_PROFILING, + ], + 'primitive_family': metadata_base.PrimitiveFamily.SCHEMA_DISCOVERY, + }, + ) + + def __init__(self, *, hyperparams: Hyperparams) -> None: + super().__init__(hyperparams=hyperparams) + + self._training_inputs: Inputs = None + self._add_semantic_types: typing.List[typing.List[str]] = None + self._remove_semantic_types: typing.List[typing.List[str]] = None + self._fitted: bool = False + + def set_training_data(self, *, inputs: Inputs) -> None: # type: ignore + self._training_inputs = inputs + self._fitted = False + + def fit(self, *, timeout: float = None, iterations: int = None) -> base.CallResult[None]: + # The logic of detecting values tries to mirror also the logic of parsing + # values in "ColumnParserPrimitive". One should keep them in sync. + + if self._training_inputs is None: + raise exceptions.InvalidStateError("Missing training data.") + + self._add_semantic_types, self._remove_semantic_types = self._fit_columns(self._training_inputs) + self._fitted = True + + return base.CallResult(None) + + def _fit_columns(self, inputs: Inputs) -> typing.Tuple[typing.List[typing.List[str]], typing.List[typing.List[str]]]: + true_target_columns = inputs.metadata.list_columns_with_semantic_types(['https://metadata.datadrivendiscovery.org/types/TrueTarget']) + index_columns = inputs.metadata.get_index_columns() + + # Target and index columns should be set only once, if they are set. + has_set_target_columns = False + has_set_index_column = False + + columns_to_use = self._get_columns(inputs.metadata) + + fitted_add_semantic_types = [] + fitted_remove_semantic_types = [] + + for column_index in columns_to_use: + input_column = inputs.select_columns([column_index]) + column_metadata = inputs.metadata.query_column(column_index) + column_name = column_metadata.get('name', str(column_index)) + column_semantic_types = list(column_metadata.get('semantic_types', [])) + + # We might be here because column has a known type, but it has "https://metadata.datadrivendiscovery.org/types/SuggestedTarget" set. + has_unknown_type = not column_semantic_types or 'https://metadata.datadrivendiscovery.org/types/UnknownType' in column_semantic_types + + # A normalized copy of semantic types, which always includes unknown type. + normalized_column_semantic_types = copy.copy(column_semantic_types) + + # If we are processing this column and it does not have semantic type that it has missing semantic types, + # we first set it, to normalize the input semantic types. If we will add any other semantic type, + # we will then remove this semantic type. 
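+            # In other words: a column with no semantic types at all is treated the same as one
+            # explicitly tagged with the UnknownType semantic type.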
+ if has_unknown_type \ + and 'https://metadata.datadrivendiscovery.org/types/UnknownType' in self.hyperparams['detect_semantic_types'] \ + and 'https://metadata.datadrivendiscovery.org/types/UnknownType' not in normalized_column_semantic_types: + normalized_column_semantic_types.append('https://metadata.datadrivendiscovery.org/types/UnknownType') + + # A working copy of semantic types. + new_column_semantic_types = copy.copy(normalized_column_semantic_types) + + if has_unknown_type: + is_float = self._is_float(input_column) + is_integer = self._is_integer(input_column) + + # If it looks like proper float (so not integer encoded as float), then we do not detect it as boolean. + if self._is_boolean(input_column) \ + and (not is_float or is_integer) \ + and 'http://schema.org/Boolean' in self.hyperparams['detect_semantic_types'] \ + and 'http://schema.org/Boolean' not in new_column_semantic_types: + new_column_semantic_types.append('http://schema.org/Boolean') + + # If it looks like proper float (so not integer encoded as float), then we do not detect it as categorical. + elif self._is_categorical(input_column) \ + and (not is_float or is_integer) \ + and 'https://metadata.datadrivendiscovery.org/types/CategoricalData' in self.hyperparams['detect_semantic_types'] \ + and 'https://metadata.datadrivendiscovery.org/types/CategoricalData' not in new_column_semantic_types: + new_column_semantic_types.append('https://metadata.datadrivendiscovery.org/types/CategoricalData') + + elif is_integer \ + and 'http://schema.org/Integer' in self.hyperparams['detect_semantic_types'] \ + and 'http://schema.org/Integer' not in new_column_semantic_types: + new_column_semantic_types.append('http://schema.org/Integer') + + elif is_float \ + and 'http://schema.org/Float' in self.hyperparams['detect_semantic_types'] \ + and 'http://schema.org/Float' not in new_column_semantic_types: + new_column_semantic_types.append('http://schema.org/Float') + + elif self._is_float_vector(input_column) \ + and 'https://metadata.datadrivendiscovery.org/types/FloatVector' in self.hyperparams['detect_semantic_types'] \ + and 'https://metadata.datadrivendiscovery.org/types/FloatVector' not in new_column_semantic_types: + new_column_semantic_types.append('https://metadata.datadrivendiscovery.org/types/FloatVector') + + elif self._is_datetime(input_column) \ + and 'http://schema.org/DateTime' in self.hyperparams['detect_semantic_types'] \ + and 'http://schema.org/DateTime' not in new_column_semantic_types: + new_column_semantic_types.append('http://schema.org/DateTime') + + elif self._is_text(input_column) \ + and 'http://schema.org/Text' in self.hyperparams['detect_semantic_types'] \ + and 'http://schema.org/Text' not in new_column_semantic_types: + new_column_semantic_types.append('http://schema.org/Text') + + if 'https://metadata.datadrivendiscovery.org/types/UniqueKey' in self.hyperparams['detect_semantic_types'] \ + and self._is_unique_key(input_column) \ + and 'http://schema.org/Text' not in new_column_semantic_types \ + and 'https://metadata.datadrivendiscovery.org/types/UniqueKey' not in new_column_semantic_types: + new_column_semantic_types.append('https://metadata.datadrivendiscovery.org/types/UniqueKey') + + if not true_target_columns \ + and not has_set_target_columns \ + and 'https://metadata.datadrivendiscovery.org/types/TrueTarget' in self.hyperparams['detect_semantic_types'] \ + and 'https://metadata.datadrivendiscovery.org/types/SuggestedTarget' in new_column_semantic_types: + # It should not be set because there are 
no columns with this semantic type in whole DataFrame. + assert 'https://metadata.datadrivendiscovery.org/types/TrueTarget' not in new_column_semantic_types + new_column_semantic_types.append('https://metadata.datadrivendiscovery.org/types/TrueTarget') + if 'https://metadata.datadrivendiscovery.org/types/Target' not in new_column_semantic_types: + new_column_semantic_types.append('https://metadata.datadrivendiscovery.org/types/Target') + if 'https://metadata.datadrivendiscovery.org/types/Attribute' in new_column_semantic_types: + new_column_semantic_types.remove('https://metadata.datadrivendiscovery.org/types/Attribute') + has_set_target_columns = True + + if has_unknown_type: + if not index_columns and not has_set_index_column: + if 'https://metadata.datadrivendiscovery.org/types/PrimaryKey' in self.hyperparams['detect_semantic_types'] \ + and column_name == 'd3mIndex' \ + and 'https://metadata.datadrivendiscovery.org/types/UniqueKey' in new_column_semantic_types: + # It should not be set because there are no columns with this semantic type in whole DataFrame. + assert 'https://metadata.datadrivendiscovery.org/types/PrimaryKey' not in new_column_semantic_types + assert 'https://metadata.datadrivendiscovery.org/types/PrimaryMultiKey' not in new_column_semantic_types + new_column_semantic_types.append('https://metadata.datadrivendiscovery.org/types/PrimaryKey') + new_column_semantic_types.remove('https://metadata.datadrivendiscovery.org/types/UniqueKey') + if 'https://metadata.datadrivendiscovery.org/types/Attribute' in new_column_semantic_types: + new_column_semantic_types.remove('https://metadata.datadrivendiscovery.org/types/Attribute') + has_set_index_column = True + elif 'https://metadata.datadrivendiscovery.org/types/PrimaryMultiKey' in self.hyperparams['detect_semantic_types'] \ + and column_name == 'd3mIndex': + assert 'https://metadata.datadrivendiscovery.org/types/UniqueKey' not in new_column_semantic_types + # It should not be set because there are no columns with this semantic type in whole DataFrame. 
+ assert 'https://metadata.datadrivendiscovery.org/types/PrimaryKey' not in new_column_semantic_types + assert 'https://metadata.datadrivendiscovery.org/types/PrimaryMultiKey' not in new_column_semantic_types + new_column_semantic_types.append('https://metadata.datadrivendiscovery.org/types/PrimaryMultiKey') + if 'https://metadata.datadrivendiscovery.org/types/Attribute' in new_column_semantic_types: + new_column_semantic_types.remove('https://metadata.datadrivendiscovery.org/types/Attribute') + has_set_index_column = True + + if 'https://metadata.datadrivendiscovery.org/types/Attribute' in self.hyperparams['detect_semantic_types'] \ + and 'https://metadata.datadrivendiscovery.org/types/TrueTarget' not in new_column_semantic_types \ + and 'https://metadata.datadrivendiscovery.org/types/PrimaryKey' not in new_column_semantic_types \ + and 'https://metadata.datadrivendiscovery.org/types/PrimaryMultiKey' not in new_column_semantic_types \ + and 'https://metadata.datadrivendiscovery.org/types/Attribute' not in new_column_semantic_types: + new_column_semantic_types.append('https://metadata.datadrivendiscovery.org/types/Attribute') + + if 'https://metadata.datadrivendiscovery.org/types/Time' in self.hyperparams['detect_semantic_types'] \ + and 'http://schema.org/DateTime' in new_column_semantic_types \ + and 'https://metadata.datadrivendiscovery.org/types/Time' not in new_column_semantic_types: + new_column_semantic_types.append('https://metadata.datadrivendiscovery.org/types/Time') + + # Have we added any other semantic type besides unknown type? + if new_column_semantic_types != normalized_column_semantic_types: + if self.hyperparams['remove_unknown_type'] and 'https://metadata.datadrivendiscovery.org/types/UnknownType' in new_column_semantic_types: + new_column_semantic_types.remove('https://metadata.datadrivendiscovery.org/types/UnknownType') + + new_column_semantic_types_set = set(new_column_semantic_types) + column_semantic_types_set = set(column_semantic_types) + + fitted_add_semantic_types.append(sorted(new_column_semantic_types_set - column_semantic_types_set)) + fitted_remove_semantic_types.append(sorted(column_semantic_types_set - new_column_semantic_types_set)) + + assert len(fitted_add_semantic_types) == len(columns_to_use) + assert len(fitted_remove_semantic_types) == len(columns_to_use) + + return fitted_add_semantic_types, fitted_remove_semantic_types + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + if not self._fitted: + raise exceptions.PrimitiveNotFittedError("Primitive not fitted.") + + assert self._add_semantic_types is not None + assert self._remove_semantic_types is not None + + columns_to_use, output_columns = self._produce_columns(inputs, self._add_semantic_types, self._remove_semantic_types) + + if self.hyperparams['replace_index_columns'] and self.hyperparams['return_result'] == 'append': + assert len(columns_to_use) == len(output_columns) + + index_columns = inputs.metadata.get_index_columns() + + index_columns_to_use = [] + other_columns_to_use = [] + index_output_columns = [] + other_output_columns = [] + for column_to_use, output_column in zip(columns_to_use, output_columns): + if column_to_use in index_columns: + index_columns_to_use.append(column_to_use) + index_output_columns.append(output_column) + else: + other_columns_to_use.append(column_to_use) + other_output_columns.append(output_column) + + outputs = base_utils.combine_columns(inputs, index_columns_to_use, index_output_columns, 
return_result='replace', add_index_columns=self.hyperparams['add_index_columns']) + outputs = base_utils.combine_columns(outputs, other_columns_to_use, other_output_columns, return_result='append', add_index_columns=self.hyperparams['add_index_columns']) + else: + outputs = base_utils.combine_columns(inputs, columns_to_use, output_columns, return_result=self.hyperparams['return_result'], add_index_columns=self.hyperparams['add_index_columns']) + + return base.CallResult(outputs) + + def _can_use_column(self, inputs_metadata: metadata_base.DataMetadata, column_index: int) -> bool: + column_metadata = inputs_metadata.query_column(column_index) + + semantic_types = column_metadata.get('semantic_types', []) + + # We detect only on columns which have no semantic types or + # where it is explicitly set as unknown. + if not semantic_types or 'https://metadata.datadrivendiscovery.org/types/UnknownType' in semantic_types: + return True + + # A special case to handle setting "https://metadata.datadrivendiscovery.org/types/TrueTarget". + if 'https://metadata.datadrivendiscovery.org/types/SuggestedTarget' in semantic_types: + return True + + return False + + def _get_columns(self, inputs_metadata: metadata_base.DataMetadata) -> typing.List[int]: + def can_use_column(column_index: int) -> bool: + return self._can_use_column(inputs_metadata, column_index) + + columns_to_use, columns_not_to_use = base_utils.get_columns_to_use(inputs_metadata, self.hyperparams['use_columns'], self.hyperparams['exclude_columns'], can_use_column) + + # We are OK if no columns ended up being parsed. + # "base_utils.combine_columns" will throw an error if it cannot work with this. + + if self.hyperparams['use_columns'] and columns_not_to_use: + self.logger.warning("Not all specified columns can parsed. Skipping columns: %(columns)s", { + 'columns': columns_not_to_use, + }) + + return columns_to_use + + def _produce_columns( + self, inputs: Inputs, + add_semantic_types: typing.List[typing.List[str]], + remove_semantic_types: typing.List[typing.List[str]], + ) -> typing.Tuple[typing.List[int], typing.List[Outputs]]: + columns_to_use = self._get_columns(inputs.metadata) + + assert len(add_semantic_types) == len(remove_semantic_types), (len(add_semantic_types), len(remove_semantic_types)) + + if len(columns_to_use) != len(add_semantic_types): + raise exceptions.InvalidStateError("Producing on a different number of columns than fitting.") + + output_columns = [] + + for column_index, column_add_semantic_types, column_remove_semantic_types in zip(columns_to_use, add_semantic_types, remove_semantic_types): + output_column = inputs.select_columns([column_index]) + + for remove_semantic_type in column_remove_semantic_types: + output_column.metadata = output_column.metadata.remove_semantic_type((metadata_base.ALL_ELEMENTS, 0), remove_semantic_type) + for add_semantic_type in column_add_semantic_types: + output_column.metadata = output_column.metadata.add_semantic_type((metadata_base.ALL_ELEMENTS, 0), add_semantic_type) + + output_columns.append(output_column) + + assert len(output_columns) == len(columns_to_use) + + return columns_to_use, output_columns + + def _is_boolean(self, input_column: container.DataFrame) -> bool: + # If there are less than 3 rows, we do no detect it to be boolean ever. + if len(input_column) < 3: + return False + + # Or it should already be boolean dtype. 
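+        # (numpy/pandas dtype "kind" codes used in these checks: 'b' boolean, 'i'/'u' signed/unsigned
+        # integer, 'f' float, 'c' complex, 'M' datetime64.)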
+ if input_column.dtypes.iloc[0].kind == 'b': + return True + + # Are there only two categorical values (after striping string, + # if there are string values). Missing values (empty strings) are not counted + # towards this, so they are not stored in "values_set". + values_set: typing.Set[typing.Any] = set() + for value in input_column.iloc[:, 0]: + if value == "" or pandas.isna(value): + continue + + if isinstance(value, str): + value = value.strip() + + values_set.add(value) + + if len(values_set) > 2: + return False + + assert len(values_set) <= 2 + + # There should be at least one row not NaN. This prevents a degenerate case + # where we would mark a column of no rows or just NaNs as boolean column. + if values_set: + return True + + return False + + # TODO: What to do when there are very little number of rows? + # Like 10? And 3 distinct values. This should still be seen as categorical? + # TODO: Optimize. We od not have to compute all counts. + # But just to cross the limit to be able to return False. + def _is_categorical(self, input_column: container.DataFrame) -> bool: + # We first count all the values. We do not use "value_counts" so that we can + # strip strings if they are strings when counting. We also put all missing values + # as one value. + missing_values_count = 0 + value_counts: typing.Dict[typing.Any, int] = collections.defaultdict(int) + for value in input_column.iloc[:, 0]: + if value == "" or pandas.isna(value): + missing_values_count += 1 + continue + + if isinstance(value, str): + value = value.strip() + + value_counts[value] += 1 + + input_column_rows = len(input_column) + all_distinct_values = bool(missing_values_count) + len(value_counts) + + # There should be at least one row not NaN. This prevents a degenerate case + # where we would mark a column of no rows or just NaNs as categorical column. + # (Otherwise we also get division by zero below.) + if not input_column_rows: + return False + + # Check the absolute limit. + if self.hyperparams['categorical_max_absolute_distinct_values'] is not None \ + and all_distinct_values > self.hyperparams['categorical_max_absolute_distinct_values']: + return False + + # Check the relative limit. + if (all_distinct_values / input_column_rows) > self.hyperparams['categorical_max_ratio_distinct_values']: + return False + + return True + + def _is_integer(self, input_column: container.DataFrame) -> bool: + column_values = input_column.iloc[:, 0] + + # There should be at least one row. This prevents a degenerate case + # where we would mark a column of no rows as integer column. + if not len(column_values): + return False + + # Or it should already be integer dtype. + if input_column.dtypes.iloc[0].kind in ['i', 'u']: + return True + + # Or is of float dtype which have all values in fact integers. + if input_column.dtypes.iloc[0].kind == 'f': + # If all values are or integers or NaN values. + column_values = column_values.dropna() + + # There should be at least one row not NaN. This prevents a degenerate case + # where we would mark a column of just NaNs as integer column. + if len(column_values) and all(v.is_integer() for v in column_values): + return True + + return False + + not_nan_exists = False + + # Or it should be strings (stripped) which all convert to an integer. + for value in column_values: + if not isinstance(value, str): + return False + + value = value.strip() + + try: + int(value) + not_nan_exists = True + continue + except ValueError: + pass + + try: + # Maybe it is an int represented as a float. Let's try this. 
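+                # e.g. the string "3.0": int("3.0") raises ValueError above, but
+                # float("3.0").is_integer() is True, so it still counts as an integer value.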
+ value = float(value) + except ValueError: + # No luck. + return False + + if pandas.isna(value): + continue + + if value.is_integer(): + not_nan_exists = True + continue + + return False + + # There should be at least one row not NaN. This prevents a degenerate case + # where we would mark a column of just NaNs as integer column. + if not_nan_exists: + return True + + return False + + def _is_float(self, input_column: container.DataFrame) -> bool: + column_values = input_column.iloc[:, 0] + + # There should be at least one row. This prevents a degenerate case + # where we would mark a column of no rows as float column. + if not len(column_values): + return False + + # Or it should already be float dtype. It is OK if there are just NaNs in this case. + if input_column.dtypes.iloc[0].kind in ['f', 'c']: + return True + + # Or it should be strings (stripped) which all convert to a float or a nan/missing value. + for value in column_values: + # TODO: Should we just look at structural type in metadata? Instead of spending time checking every value? + if not isinstance(value, str): + return False + + value = value.strip() + + try: + value = float(value) + continue + except ValueError: + pass + + # We allow some string values to exist. When parsing they are parsed as float NaNs. + if value in self.hyperparams['nan_values']: + continue + + return False + + # We do mark a column of all NaNs as float column. This includes marking a column of just + # empty strings as float column. It has to be something, so a float column seems reasonable. + return True + + def _is_float_vector(self, input_column: container.DataFrame) -> bool: + column_values = input_column.iloc[:, 0] + + # There should be at least one row. This prevents a degenerate case + # where we would mark a column of no rows as float vector column. + if not len(column_values): + return False + + try: + structural_type = input_column.metadata.query_column_field(0, 'structural_type') + except KeyError: + structural_type = None + + # Or it is already parsed as 1d ndarray of floats or ints. + if structural_type is not None and d3m_utils.is_subclass(structural_type, container.ndarray): + for value in column_values: + # It has to be a vector ndarray. + if len(value.shape) != 1: + return False + + # With floats or ints. + if value.dtype.kind not in ['f', 'i']: + return False + + return True + + vector_exists = False + + # Or it is a string which can be split by "," and each be parsed as float (without missing values). + # We are pretty strict here because we are assuming this was generated programmatically. + for value in column_values: + # TODO: Should we just look at structural type in metadata? Instead of spending time checking every value? + # But what if "structural_type" is None? Do we want to support that? We probably should not. + if not isinstance(value, str): + return False + + values = value.split(',') + + if not values: + continue + + for value in values: + try: + value = float(value) + except ValueError: + return False + + if pandas.isna(value): + return False + + vector_exists = True + + # There should be at least one row with non-empty vector. This prevents a degenerate case + # where we would mark a column of empty strings as float vector column. + if vector_exists: + return True + + return False + + def _is_datetime(self, input_column: container.DataFrame) -> bool: + column_values = input_column.iloc[:, 0] + + # There should be at least one row. 
This prevents a degenerate case + # where we would mark a column of no rows as a datetime column. + if not len(column_values): + return False + + # Or it should already be datetime dtype. + if input_column.dtypes.iloc[0].kind == 'M': + return True + + datetime_exists = False + + # Or the value is not-a-datetime value, or it can be parsed as a datetime. + for value in column_values: + # TODO: Should we just look at structural type in metadata? Instead of spending time checking every value? + if not isinstance(value, str): + return False + + # TODO: Allow any other not-a-datetime value? Like string version of Panda's NaT? + if value == "": + continue + + if numpy.isnan(utils.parse_datetime_to_float(value)): + return False + else: + datetime_exists = True + + # There should be at least one row not NaN. This prevents a degenerate case + # where we would mark a column of empty strings as datetime column. + if datetime_exists: + return True + + return False + + # TODO: Optimize. We od not have to check all values. + # If we cross the limit where we already have more than ratio values, we can return True. + def _is_text(self, input_column: container.DataFrame) -> bool: + column_values = input_column.iloc[:, 0] + + # There should be at least one row. This prevents a degenerate case + # where we would mark a column of no rows as a text column. + # (Otherwise we also get division by zero below.) + if not len(column_values): + return False + + values_with_whitespace = 0 + + # It has to be structural type string and at least 50 % of rows should have a whitespace + # in them after the value has been stripped. + for value in column_values: + # TODO: Should we just look at structural type in metadata? Instead of spending time checking every value? + if not isinstance(value, str): + return False + + value = value.strip() + + if WHITESPACE_REGEX.search(value): + values_with_whitespace += 1 + + if (values_with_whitespace / len(column_values)) < self.hyperparams['text_min_ratio_values_with_whitespace']: + return False + + return True + + def _is_unique_key(self, input_column: container.DataFrame) -> bool: + column_values = input_column.iloc[:, 0] + + # There should be at least one row. This prevents a degenerate case + # where we would mark a column of no rows as a unique key column. + # (Otherwise we also get division by zero below.) + if not len(column_values): + return False + + # Here we look at every value as-is. Even empty strings and other missing/nan values. + if any(input_column.duplicated()): + return False + + return True + + def get_params(self) -> Params: + if not self._fitted: + return Params( + add_semantic_types=None, + remove_semantic_types=None, + ) + + return Params( + add_semantic_types=self._add_semantic_types, + remove_semantic_types=self._remove_semantic_types, + ) + + def set_params(self, *, params: Params) -> None: + self._add_semantic_types = params['add_semantic_types'] + self._remove_semantic_types = params['remove_semantic_types'] + self._fitted = all(param is not None for param in params.values()) diff --git a/tods/common-primitives/common_primitives/slacker/README.md b/tods/common-primitives/common_primitives/slacker/README.md new file mode 100644 index 0000000..a8ee0fa --- /dev/null +++ b/tods/common-primitives/common_primitives/slacker/README.md @@ -0,0 +1,13 @@ +This package contains Slacker modules code copied as they are from existing D3M datasets' solutions, +with the following changes: +* `base` module import modified to be a relative import. +* Commented out any prints. 
+* In `DataFrameCategoricalEncoder` made sure regular dicts are stored in `code_maps` and not `defaultdict`. + This makes pickling possible. +* Made code use `SimpleImputer` instead of `Imputer` for compatibility with newer sklearn. +* Made default value for `missing_values` of `DenseMixedStrategyImputer` be `np.nan` because + `SimpleImputer` does not process string `NaN` as a special value anymore. +* Updated call to `OneHotEncoder` to not use `categorical_features`. +* Replaced all calles of `as_matrix` with calls to `values`. + +Some solutions contain slightly modified versions, but these files here match the most common ones. diff --git a/tods/common-primitives/common_primitives/slacker/__init__.py b/tods/common-primitives/common_primitives/slacker/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tods/common-primitives/common_primitives/slacker/base.py b/tods/common-primitives/common_primitives/slacker/base.py new file mode 100644 index 0000000..6bfccb2 --- /dev/null +++ b/tods/common-primitives/common_primitives/slacker/base.py @@ -0,0 +1,102 @@ +from abc import ABC, abstractmethod +from collections import OrderedDict +import numpy as np +from numpy import ndarray +from scipy.sparse import csr_matrix +from pandas import DataFrame +from sklearn.base import BaseEstimator, TransformerMixin +from sklearn.feature_selection.base import SelectorMixin + +# https://stackoverflow.com/a/3862957 +def get_all_subclasses(cls): + return cls.__subclasses__() + [g for s in cls.__subclasses__() for g in get_all_subclasses(s)] + + +def sample_param_distributions(param_distributions): + + try: + return sample_param_distributions_dict(param_distributions) + except AttributeError: + i = np.random.randint(len(param_distributions)) + return sample_param_distributions_dict(param_distributions[i]) + + +def sample_param_distributions_dict(param_distributions_dict): + + params = {} + for k, v in param_distributions_dict.items(): + i = np.random.randint(len(v)) + params[k] = v[i] + return params + + +class AbstractParameterized(ABC): + + param_distributions = {} + + @classmethod + def get_random_parameters(cls): + return sample_param_distributions(cls.param_distributions) + + +class AbstractFeatureExtractor(AbstractParameterized, BaseEstimator): + + def fit(self, df, variables): + self.fit_transform(df, variables) + return self + + @abstractmethod + def fit_transform(self, df, variables): + """ Fits the feature extractor + + :param df: + :type df: DataFrame + :param variables: + :type variables: list[D3MVariable] + :return: + :rtype: csr_matrix + """ + pass + + @abstractmethod + def transform(self, df): + """ Transforms the data + + :param df: + :type df: DataFrame + :return: + :rtype: csr_matrix + """ + pass + + +class AbstractFeatureSelector(AbstractParameterized, BaseEstimator, SelectorMixin): + + pass + + +class AbstractEstimator(AbstractParameterized, BaseEstimator): + + @abstractmethod + def fit(self, X, y): + """ + + :param X: + :type X: csr_matrix + :param y: + :type y: ndarray + :return: + :rtype: AbstractEstimator + """ + return self + + @abstractmethod + def predict(self, X): + """ + + :param X: + :type X: csr_matrix + :return: + :rtype: ndarray + """ + pass diff --git a/tods/common-primitives/common_primitives/slacker/estimation.py b/tods/common-primitives/common_primitives/slacker/estimation.py new file mode 100644 index 0000000..044de31 --- /dev/null +++ b/tods/common-primitives/common_primitives/slacker/estimation.py @@ -0,0 +1,105 @@ +from .base import AbstractEstimator + 
+import numpy as np + +from sklearn.base import BaseEstimator, TransformerMixin +from sklearn.kernel_approximation import RBFSampler +from sklearn.linear_model import SGDClassifier, SGDRegressor + + +class SGDClassifierEstimator(AbstractEstimator): + + param_distributions = { + 'loss': ('hinge', 'log', 'squared_hinge', 'perceptron'), + 'penalty': ('elasticnet',), + 'alpha': [float(x) for x in np.logspace(-9, 0, 10)], + 'l1_ratio': [float(x) for x in np.linspace(0, 1, 11)], + 'fit_intercept': (True, True, True, False) + } + + def __init__(self, *args, **kwargs): + self.args = args + self.kwargs = kwargs + + def fit(self, X, y): + n_samples = X.shape[0] + self.kwargs['n_iter'] = max(5, int(10**6 / n_samples)) + self.sgd_classifier = SGDClassifier(*self.args, **self.kwargs) + self.sgd_classifier.fit(X, y) + + def predict(self, X): + return self.sgd_classifier.predict(X) + + +class SGDRegressorEstimator(AbstractEstimator): + + param_distributions = { + 'loss': ('squared_loss', 'huber'), + 'penalty': ('elasticnet',), + 'alpha': [float(x) for x in np.logspace(-9, 0, 10)], + 'l1_ratio': [float(x) for x in np.linspace(0, 1, 11)], + 'fit_intercept': (True, True, True, False), + 'epsilon': [float(x) for x in np.logspace(-2, 0, 5)], + 'learning_rate': ('optimal', 'invscaling'), + 'eta0': (0.1, 0.01, 0.001), + 'power_t': [float(x) for x in np.linspace(0, 1, 5)] + } + + def __init__(self, *args, **kwargs): + self.args = args + self.kwargs = kwargs + + def fit(self, X, y): + n_samples = X.shape[0] + self.kwargs['n_iter'] = max(5, int(10**6 / n_samples)) + self.sgd_regressor = SGDRegressor(*self.args, **self.kwargs) + self.sgd_regressor.fit(X, y) + + def predict(self, X): + return self.sgd_regressor.predict(X) + + +# TODO: inherit AbstractEstimator, grab param_distributions from cv_setup_map.py in the old slacker, +class RBFSamplerSGDClassifierEstimator(BaseEstimator, TransformerMixin): + + def __init__(self, gamma=1.0, n_components=100, random_state=None, **kwargs): + kwargs['random_state'] = random_state + self.rbf_sampler = RBFSampler(gamma=gamma, n_components=n_components, random_state=random_state) + self.sgdclassifier = SGDClassifier(**kwargs) + + def fit(self, X, y): + X = self.rbf_sampler.fit_transform(X) + self.sgdclassifier.fit(X, y) + return self + + def transform(self, X, y=None): + return np.sqrt(self.rbf_sampler.n_components) / np.sqrt(2.) * self.rbf_sampler.transform(X) + + def predict(self, X): + return self.sgdclassifier.predict(self.transform(X)) + + def decision_function(self, X): + return self.sgdclassifier.decision_function(self.transform(X)) + +# TODO: inherit AbstractEstimator, grab param_distributions from cv_setup_map.py in the old slacker, +class RBFSamplerSGDRegressorEstimator(BaseEstimator, TransformerMixin): + + def __init__(self, gamma=1.0, n_components=100, random_state=None, **kwargs): + kwargs['random_state'] = random_state + self.rbf_sampler = RBFSampler(gamma=gamma, n_components=n_components, random_state=random_state) + self.sgdregressor = SGDRegressor(**kwargs) + + def fit(self, X, y): + X = self.rbf_sampler.fit_transform(X) + self.sgdregressor.fit(X, y) + return self + + def transform(self, X, y=None): + return np.sqrt(self.rbf_sampler.n_components) / np.sqrt(2.) 
* self.rbf_sampler.transform(X) + + def predict(self, X): + return self.sgdregressor.predict(self.transform(X)) + +# TODO: Add kernel SVM +# TODO: Add kernel ridge regressor +# TODO: Add random forests / xgboost \ No newline at end of file diff --git a/tods/common-primitives/common_primitives/slacker/feature_extraction.py b/tods/common-primitives/common_primitives/slacker/feature_extraction.py new file mode 100644 index 0000000..300d5a4 --- /dev/null +++ b/tods/common-primitives/common_primitives/slacker/feature_extraction.py @@ -0,0 +1,184 @@ +from collections import defaultdict, OrderedDict +import numpy as np +from scipy import signal +from scipy.sparse import csr_matrix, hstack +import pandas as pd +from sklearn.base import BaseEstimator, TransformerMixin +from sklearn.cluster import MiniBatchKMeans +from sklearn.feature_extraction.text import TfidfVectorizer +from sklearn.linear_model import LogisticRegression +from sklearn.preprocessing import OneHotEncoder, StandardScaler +from sklearn.impute import SimpleImputer +from sklearn.utils.validation import check_is_fitted + +from .base import AbstractFeatureExtractor + +class DenseMixedStrategyImputer(BaseEstimator, TransformerMixin): + + def __init__(self, missing_values=np.nan, strategies=None, add_missing_indicator=True, verbose=False): + self.missing_values = missing_values + if strategies is None: + raise ValueError('Must provide strategy.') + allowed_strategies = ['mean', 'median', 'most_frequent'] + if any(s not in allowed_strategies for s in strategies): + raise ValueError('Invalid strategy in list.') + self.strategies = strategies + self.add_missing_indicator = add_missing_indicator + self.verbose = verbose + + def fit(self, X, y=None): + n_samples, n_features = X.shape + # print('n_features',n_features) + if len(self.strategies) != n_features: + raise ValueError('Number of strategies must equal number of features.') + self.impute_strategies = list(set(self.strategies)) + self.impute_indices = [np.array([i for i, x in enumerate(self.strategies) if x == s]) for s in self.impute_strategies] + self.impute_valid_indices = [] + self.imputers = [SimpleImputer(missing_values=self.missing_values, strategy=s, verbose=self.verbose) for s in + self.impute_strategies] + for indices, imputer in zip(self.impute_indices, self.imputers): + imputer.fit(X[:, indices]) + valid_mask = np.logical_not(np.isnan(imputer.statistics_)) + self.impute_valid_indices.append(indices[valid_mask]) + return self + + def transform(self, X): + n_samples, n_features = X.shape + if len(self.strategies) != n_features: + raise ValueError('Number of strategies must equal number of features.') + check_is_fitted(self, 'imputers') + + if self.add_missing_indicator: + output_scale = 2 + else: + output_scale = 1 + + X_out = np.zeros((n_samples, output_scale*n_features)) + for input_indices, output_indices, imputer in zip(self.impute_indices, self.impute_valid_indices, self.imputers): + X_out[:, output_scale*output_indices] = imputer.transform(X[:, input_indices]) + + if self.add_missing_indicator: + X_out[:, np.arange(1, 2*n_features, 2)] = np.isnan(X).astype('float', copy=False) + + return X_out + + +class DataFrameCategoricalEncoder(BaseEstimator, TransformerMixin): + + def fit(self, X, y=None): + self.code_maps = {} + for k in X.columns: + self.code_maps[k] = defaultdict(lambda: np.nan) + self.code_maps[k].update({v: k for k, v in enumerate(X[k].astype('category').cat.categories)}) + self.code_maps[k] = dict(self.code_maps[k]) + return self + + def transform(self, X): + 
if set(X.columns) != set(self.code_maps): + raise ValueError('Columns do not match fit model.') + return X.apply(lambda x: x.apply(lambda y: self.code_maps[x.name][y])).values + + +class AnnotatedTabularExtractor(AbstractFeatureExtractor): + + param_distributions = { + 'normalize_text': [True, False], + 'categorize': [True, False], + 'numeric_strategy': ['mean', 'median'], + 'add_missing_indicator': [True, False] + } + + def __init__(self, normalize_text=False, categorize=False, numeric_strategy='mean', add_missing_indicator=True): + self.normalize_text = normalize_text + self.categorize = categorize + self.numeric_strategy = numeric_strategy + self.add_missing_indicator = add_missing_indicator + + def set_cols_info(self, cols_info): + self.cols_info = cols_info + + def determine_colType(self, column): + variables = self.cols_info + for var in variables: + var_colName = var['colName'] + if str(var_colName) != str(column): + continue + var_colType = var['colType'] + if var_colType in {'categorical', 'boolean'}: + return 'categorical' + elif var_colType in {'integer', 'real'}: + return 'numeric' + elif var_colType == 'string': + return 'text' + elif var_colType == 'dateTime': + raise RuntimeError('dateTime not implemented in this feature extractor yet!') + + + def fit_transform(self, df, variables): + df = self.copy_normalize_text(df) + + self.column_types = OrderedDict() + + for column in df: + itype = self.determine_colType(column) + # print('itype',itype) + self.column_types[column] = itype + + self.numeric_columns = [column for column, type in self.column_types.items() if type == 'numeric'] + self.categorical_columns = [column for column, type in self.column_types.items() if type == 'categorical'] + self.text_columns = [column for column, type in self.column_types.items() if type == 'text'] + + output_arrays = [] + + if len(self.numeric_columns) > 0: + X = df[self.numeric_columns].apply(lambda x: pd.to_numeric(x, errors='coerce')).values + self.numeric_imputer = DenseMixedStrategyImputer( + strategies=[self.numeric_strategy]*len(self.numeric_columns), + add_missing_indicator=self.add_missing_indicator + ) + X = self.numeric_imputer.fit_transform(X) + self.numeric_scaler = StandardScaler() + output_arrays.append(self.numeric_scaler.fit_transform(X)) + + if len(self.categorical_columns) > 0: + self.categorical_encoder = DataFrameCategoricalEncoder() + X = self.categorical_encoder.fit_transform(df[self.categorical_columns]) + self.categorical_imputer = DenseMixedStrategyImputer( + strategies=['most_frequent']*len(self.categorical_columns), + add_missing_indicator=self.add_missing_indicator + ) + X = self.categorical_imputer.fit_transform(X) + self.one_hot_encoder = OneHotEncoder(categories='auto') + output_arrays.append(self.one_hot_encoder.fit_transform(X)) + + return hstack([csr_matrix(X) for X in output_arrays], format='csr') + + def transform(self, df): + + check_is_fitted(self, 'column_types') + if list(df) != list(self.column_types): + raise ValueError('Data to be transformed does not match fitting data.') + + df = self.copy_normalize_text(df) + + output_arrays = [] + + if len(self.numeric_columns) > 0: + X = df[self.numeric_columns].apply(lambda x: pd.to_numeric(x, errors='coerce')).values + output_arrays.append(self.numeric_scaler.transform(self.numeric_imputer.transform(X))) + + if len(self.categorical_columns) > 0: + X = self.categorical_encoder.transform(df[self.categorical_columns]) + output_arrays.append(self.one_hot_encoder.transform(self.categorical_imputer.transform(X))) +
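+        # Note: columns classified as text during fit are detected but not featurized by this extractor; only the numeric and categorical blocks above contribute to the output matrix.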
+ return hstack([csr_matrix(X) for X in output_arrays], format='csr') + + def copy_normalize_text(self, df): + df = df.copy() + if self.normalize_text: + for column in df: + try: + df[column] = df[column].str.lower().str.strip() + except: + df[column] = df[column] + return df diff --git a/tods/common-primitives/common_primitives/slacker/feature_selection.py b/tods/common-primitives/common_primitives/slacker/feature_selection.py new file mode 100644 index 0000000..4a9a857 --- /dev/null +++ b/tods/common-primitives/common_primitives/slacker/feature_selection.py @@ -0,0 +1,179 @@ +from unittest import TestCase + +from .base import AbstractFeatureSelector + +import numpy as np +from scipy import stats +from scipy.sparse import issparse + +from sklearn.feature_selection import f_classif, SelectFromModel, SelectPercentile +from sklearn.linear_model import Lasso +from sklearn.svm import LinearSVC +from sklearn.utils import check_X_y +from sklearn.utils.extmath import safe_sparse_dot, row_norms +from scipy.linalg import norm + + +# modified to address the issue of centering sparse matrices with a bit of algebra +def better_f_regression(X, y, center=True): + """Univariate linear regression tests. + + Quick linear model for testing the effect of a single regressor, + sequentially for many regressors. + + This is done in 2 steps: + + 1. The cross correlation between each regressor and the target is computed, + that is, ((X[:, i] - mean(X[:, i])) * (y - mean_y)) / (std(X[:, i]) * + std(y)). + 2. It is converted to an F score then to a p-value. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + X : {array-like, sparse matrix} shape = (n_samples, n_features) + The set of regressors that will be tested sequentially. + + y : array of shape(n_samples). + The data matrix + + center : True, bool, + If true, X and y will be centered. + + Returns + ------- + F : array, shape=(n_features,) + F values of features. + + pval : array, shape=(n_features,) + p-values of F-scores. + + See also + -------- + f_classif: ANOVA F-value between label/feature for classification tasks. + chi2: Chi-squared stats of non-negative features for classification tasks. 
+ """ + X, y = check_X_y(X, y, ['csr', 'csc', 'coo'], dtype=np.float64) + n_samples = X.shape[0] + + if center: + y = y - np.mean(y) + if issparse(X): + X_means = X.mean(axis=0).getA1() + else: + X_means = X.mean(axis=0) + X_norms = np.sqrt(row_norms(X.T, squared=True) - n_samples*X_means**2) + else: + X_norms = row_norms(X.T) + + # compute the correlation + corr = safe_sparse_dot(y, X) + corr /= X_norms + corr /= norm(y) + + # convert to p-value + degrees_of_freedom = y.size - (2 if center else 1) + F = corr ** 2 / (1 - corr ** 2) * degrees_of_freedom + pv = stats.f.sf(F, 1, degrees_of_freedom) + return F, pv + + +class SelectFromLinearSVC(AbstractFeatureSelector): + + param_distributions = { + 'threshold': (1e-5,), + 'C': [float(x) for x in np.logspace(-2, 5, 100)] + } + + def __init__(self, threshold=None, penalty='l1', loss='squared_hinge', dual=False, tol=0.0001, C=1.0, fit_intercept=True, random_state=None, max_iter=1000): + self.threshold = threshold + self.penalty = penalty + self.loss = loss + self.dual = dual + self.tol = tol + self.C = C + self.fit_intercept = fit_intercept + self.random_state = random_state + self.max_iter = max_iter + + def fit(self, X, y): + self.linear_svc = LinearSVC(penalty=self.penalty, loss=self.loss, dual=self.dual, tol=self.tol, + fit_intercept=self.fit_intercept, random_state=self.random_state, + max_iter=self.max_iter) + self.linear_svc.fit(X, y) + self.select_from_model = SelectFromModel(self.linear_svc, threshold=self.threshold, prefit=True) + return self + + def _get_support_mask(self): + return self.select_from_model._get_support_mask() + +class SelectPercentileClassification(AbstractFeatureSelector, SelectPercentile): + + param_distributions = { + 'score_func': ('f_classif',), + 'percentile': [int(x) for x in np.linspace(10, 100, 100)] + } + + score_funcs = { + 'f_classif': f_classif + } + + def __init__(self, *args, **kwargs): + if 'score_func' in kwargs: + kwargs['score_func'] = self.score_funcs[kwargs['score_func']] + super().__init__(*args, **kwargs) + + +class SelectFromLasso(AbstractFeatureSelector): + + param_distributions = { + 'threshold': (1e-5,), + 'alpha': [float(x) for x in np.logspace(-5, 2, 100)] + } + + def __init__(self, threshold=None, alpha=1.0, fit_intercept=True, normalize=False, max_iter=1000, tol=0.0001, positive=False, selection='cyclic', random_state=None): + self.threshold = threshold + self.alpha = alpha + self.fit_intercept = fit_intercept + self.normalize = normalize + self.max_iter = max_iter + self.tol = tol + self.positive = positive + self.selection = selection + self.random_state = random_state + + def fit(self, X, y): + # NOTE: y is an ndarray of strings + self.lasso = Lasso(alpha=self.alpha, fit_intercept=self.fit_intercept, normalize=self.normalize, + max_iter=self.max_iter, tol=self.tol, positive=self.positive, selection=self.selection, + random_state=self.random_state) + self.lasso.fit(X, y) + self.select_from_model = SelectFromModel(self.lasso, threshold=self.threshold, prefit=True) + return self + + def _get_support_mask(self): + return self.select_from_model._get_support_mask() + + +class SelectPercentileRegression(AbstractFeatureSelector, SelectPercentile): + + param_distributions = { + 'score_func': ('f_regression',), + 'percentile': [int(x) for x in np.linspace(10, 100, 100)] + } + + score_funcs = { + 'f_regression': better_f_regression + } + + def __init__(self, *args, **kwargs): + if 'score_func' in kwargs: + kwargs['score_func'] = self.score_funcs[kwargs['score_func']] + super().__init__(*args, 
**kwargs) + + def fit(self, X, y): + # NOTE: y is an ndarray of strings + super().fit(X, y) + return self + diff --git a/tods/common-primitives/common_primitives/stack_ndarray_column.py b/tods/common-primitives/common_primitives/stack_ndarray_column.py new file mode 100644 index 0000000..57d4935 --- /dev/null +++ b/tods/common-primitives/common_primitives/stack_ndarray_column.py @@ -0,0 +1,133 @@ +import copy +import os +import typing + +import numpy # type: ignore + +from d3m import container, utils as d3m_utils +from d3m.base import utils as base_utils +from d3m.metadata import base as metadata_base +from d3m.metadata import hyperparams +from d3m.primitive_interfaces import base, transformer + +import common_primitives + +__all__ = ('StackNDArrayColumnPrimitive',) + +Inputs = container.DataFrame +Outputs = container.ndarray + + +class Hyperparams(hyperparams.Hyperparams): + use_column = hyperparams.Hyperparameter[typing.Optional[int]]( + default=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A column index to force primitive to operate on. If the specified column is not a column of numpy arrays, an error is raised.", + ) + exclude_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.", + ) + + +class StackNDArrayColumnPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): + """ + A primitive which stacks numpy arrays in a column and returns a stacked numpy array along the new 0 axis. + + All arrays must have the same shape. + """ + + metadata = metadata_base.PrimitiveMetadata( + { + 'id': '48c99864-14f3-4a61-b3a6-e439f22825f6', + 'version': '0.1.0', + 'name': "Stack numpy arrays in a column", + 'python_path': 'd3m.primitives.data_transformation.stack_ndarray_column.Common', + 'source': { + 'name': common_primitives.__author__, + 'contact': 'mailto:mitar.commonprimitives@tnode.com', + 'uris': [ + 'https://gitlab.com/datadrivendiscovery/common-primitives/blob/master/common_primitives/stack_ndarray_column.py', + 'https://gitlab.com/datadrivendiscovery/common-primitives.git', + ], + }, + 'installation': [{ + 'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/common-primitives.git@{git_commit}#egg=common_primitives'.format( + git_commit=d3m_utils.current_git_commit(os.path.dirname(__file__)), + ), + }], + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.ARRAY_CONCATENATION, + ], + 'primitive_family': metadata_base.PrimitiveFamily.DATA_TRANSFORMATION, + }, + ) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + column_to_use = self._get_column(inputs.metadata) + + outputs = container.ndarray(numpy.stack(inputs.iloc[:, column_to_use], axis=0), generate_metadata=False) + + outputs.metadata = self._update_metadata(inputs.metadata, column_to_use) + + # Update the structure. + outputs.metadata = outputs.metadata.generate(outputs) + + return base.CallResult(outputs) + + def _update_metadata(self, inputs_metadata: metadata_base.DataMetadata, column_to_use: int) -> metadata_base.DataMetadata: + # Copy input metadata so that we can modify it in-place. 
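+        # (A shallow copy plus a copy of the internal "_current_metadata" entry should be enough to avoid mutating the input's metadata when its entries are rewired below; this relies on d3m metadata internals.)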
+ outputs_metadata = copy.copy(inputs_metadata) + outputs_metadata._current_metadata = inputs_metadata._current_metadata.copy() + + # Remove columns dimension and replace it with metadata of the column. + # TODO: Move this to metadata API. + all_columns_metadata_entry = outputs_metadata._current_metadata.all_elements.all_elements + column_metadata_entry = outputs_metadata._current_metadata.all_elements.elements[column_to_use] + if all_columns_metadata_entry is not None: + outputs_metadata._current_metadata.all_elements = outputs_metadata._merge_metadata_entries(all_columns_metadata_entry, column_metadata_entry) + else: + outputs_metadata._current_metadata.all_elements = column_metadata_entry + outputs_metadata._current_metadata.update_is_empty() + + # Move structural type from rows to top-level. + outputs_metadata = outputs_metadata.update((), { + 'structural_type': container.ndarray, + }) + outputs_metadata = outputs_metadata.update((metadata_base.ALL_ELEMENTS,), { + 'structural_type': metadata_base.NO_VALUE, + }) + + return outputs_metadata + + def _can_use_column(self, inputs_metadata: metadata_base.DataMetadata, column_index: int) -> bool: + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + if issubclass(column_metadata['structural_type'], numpy.ndarray): + return True + + return False + + def _get_column(self, inputs_metadata: metadata_base.DataMetadata) -> int: + def can_use_column(column_index: int) -> bool: + return self._can_use_column(inputs_metadata, column_index) + + if self.hyperparams['use_column'] is None: + use_columns: typing.List[int] = [] + else: + use_columns = [self.hyperparams['use_column']] + + columns_to_use, _ = base_utils.get_columns_to_use(inputs_metadata, use_columns, self.hyperparams['exclude_columns'], can_use_column) + + if not columns_to_use: + if use_columns: + raise ValueError("Specified column cannot be operated on.") + else: + raise ValueError("No column found to operate on.") + + assert len(columns_to_use) == 1 + + return columns_to_use[0] diff --git a/tods/common-primitives/common_primitives/tabular_extractor.py b/tods/common-primitives/common_primitives/tabular_extractor.py new file mode 100644 index 0000000..e872b82 --- /dev/null +++ b/tods/common-primitives/common_primitives/tabular_extractor.py @@ -0,0 +1,232 @@ +import os.path +import pickle +import typing + +from d3m import container, exceptions, utils as d3m_utils +from d3m.base import utils as base_utils +from d3m.container import dataset +from d3m.metadata import base as metadata_base, hyperparams as hyperparams_module, params as params_module +from d3m.primitive_interfaces import base, unsupervised_learning + +import common_primitives + +from .slacker import feature_extraction + +__all__ = ('AnnotatedTabularExtractorPrimitive',) + +Inputs = container.DataFrame +Outputs = container.DataFrame + + +class Params(params_module.Params): + column_types: typing.Optional[typing.Dict[str, str]] + numeric_columns: typing.Optional[typing.List[str]] + categorical_columns: typing.Optional[typing.List[str]] + text_columns: typing.Optional[typing.List[str]] + numeric_imputer: typing.Optional[bytes] + numeric_scaler: typing.Optional[bytes] + categorical_encoder: typing.Optional[bytes] + categorical_imputer: typing.Optional[bytes] + one_hot_encoder: typing.Optional[bytes] + + +class Hyperparams(hyperparams_module.Hyperparams): + normalize_text = hyperparams_module.UniformBool( + default=False, + description="Convert text to lowercase and strip whitespace.", + 
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + numeric_strategy = hyperparams_module.Enumeration[str]( + values=['mean', 'median'], + default='mean', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + add_missing_indicator = hyperparams_module.UniformBool( + default=True, + description="Add columns to indicate missing values.", + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + ) + + +class AnnotatedTabularExtractorPrimitive(unsupervised_learning.UnsupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]): # pylint: disable=inherit-non-class + """ + A primitive wrapping for MIT-LL slacker's ``AnnotatedTabularExtractor``. + """ + + __author__ = 'Tianrui, Jian and Julia' + metadata = metadata_base.PrimitiveMetadata( + { + 'id': '6c425897-6ffe-45b8-949f-002d872ccf12', + 'version': '0.1.0', + 'name': 'Annotated tabular extractor', + 'python_path': 'd3m.primitives.data_cleaning.tabular_extractor.Common', + 'source': { + 'name': common_primitives.__author__, + 'contact': 'mailto:y.cao@berkeley.edu', + 'uris': [ + 'https://gitlab.com/datadrivendiscovery/common-primitives/blob/master/common_primitives/tabular_extractor.py', + 'https://gitlab.com/datadrivendiscovery/common-primitives.git', + ], + }, + 'installation': [{ + 'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/common-primitives.git@{git_commit}#egg=common_primitives'.format( + git_commit=d3m_utils.current_git_commit(os.path.dirname(__file__)), + ), + }], + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.IMPUTATION, + ], + 'primitive_family': metadata_base.PrimitiveFamily.DATA_CLEANING, + }, + ) + + def __init__(self, *, hyperparams: Hyperparams) -> None: + super().__init__(hyperparams=hyperparams) + + self._training_inputs: Inputs = None + self._extractor: feature_extraction.AnnotatedTabularExtractor = None + + def set_training_data(self, *, inputs: Inputs) -> None: # type: ignore + self._training_inputs = inputs + self._extractor = None + + def fit(self, *, timeout: float = None, iterations: int = None) -> base.CallResult[None]: + # If already fitted with current training data, this call is a no-op. + if self._extractor: + return base.CallResult(None) + + if self._training_inputs is None: + raise exceptions.InvalidStateError("Missing training data.") + + extractor = self._create_extractor() + + attribute_columns = self._get_attribute_columns(self._training_inputs.metadata) + + extractor.set_cols_info(self._metadata_to_cols_info(self._training_inputs.metadata, attribute_columns)) + extractor.fit_transform(self._training_inputs.iloc[:, attribute_columns], []) + + # Fitted. 
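+        # Storing the fitted extractor also serves as the fitted flag: produce() and get_params() check "if not self._extractor".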
+ self._extractor = extractor + + return base.CallResult(None) + + def _create_extractor(self) -> feature_extraction.AnnotatedTabularExtractor: + return feature_extraction.AnnotatedTabularExtractor( + normalize_text=self.hyperparams['normalize_text'], + numeric_strategy=self.hyperparams['numeric_strategy'], + add_missing_indicator=self.hyperparams['add_missing_indicator'], + ) + + def _get_attribute_columns(self, inputs_metadata: metadata_base.DataMetadata) -> typing.Sequence[int]: + return inputs_metadata.list_columns_with_semantic_types(['https://metadata.datadrivendiscovery.org/types/Attribute']) + + def _metadata_to_cols_info(self, inputs_metadata: metadata_base.DataMetadata, attribute_columns: typing.Sequence[int]) -> typing.Sequence[typing.Dict]: + cols_info = [] + for i, column_index in enumerate(attribute_columns): + column_metadata = inputs_metadata.query_column(column_index) + + column_type = None + column_roles = [] + for semantic_type in column_metadata['semantic_types']: + if semantic_type in dataset.SEMANTIC_TYPES_TO_D3M_COLUMN_TYPES: + if column_type is not None: + raise exceptions.InvalidStateError( + "Duplicate semantic types for column types: '{first_type}' and '{second_type}'".format( + first_type=column_type, + second_type=dataset.SEMANTIC_TYPES_TO_D3M_COLUMN_TYPES[semantic_type], + ), + ) + column_type = dataset.SEMANTIC_TYPES_TO_D3M_COLUMN_TYPES[semantic_type] + elif semantic_type in dataset.SEMANTIC_TYPES_TO_D3M_ROLES: + column_roles.append(dataset.SEMANTIC_TYPES_TO_D3M_ROLES[semantic_type]) + + if column_type is None: + raise exceptions.InvalidStateError("Could not find a column type among semantic types.") + + cols_info.append( + { + 'colIndex': i, + 'colName': column_metadata['name'], + 'colType': column_type, + 'role': column_roles, + }, + ) + + return cols_info + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + if not self._extractor: + raise exceptions.PrimitiveNotFittedError("Primitive not fitted.") + + attribute_columns = self._get_attribute_columns(inputs.metadata) + + # This is a sparse scipy CSR matrix. + transformed_inputs = self._extractor.transform(inputs.iloc[:, attribute_columns]) + + output_columns = container.DataFrame(transformed_inputs.toarray(), generate_metadata=True) + + # All transformed inputs are attributes. + output_columns.metadata = output_columns.metadata.add_semantic_type( + (metadata_base.ALL_ELEMENTS, metadata_base.ALL_ELEMENTS), + 'https://metadata.datadrivendiscovery.org/types/Attribute', + ) + + # This replaces attribute columns with output columns, while keeping other columns (like "d3mIndex" and target columns). + outputs = base_utils.combine_columns(inputs, list(attribute_columns), [output_columns], return_result='replace', add_index_columns=True) + + return base.CallResult(outputs) + + def get_params(self) -> Params: + if not self._extractor: + return Params( + column_types=None, + numeric_columns=None, + categorical_columns=None, + text_columns=None, + numeric_imputer=None, + numeric_scaler=None, + categorical_encoder=None, + categorical_imputer=None, + one_hot_encoder=None, + ) + + return Params( + # In Python 3.6 all dicts preserve order, so we can do this. + # We have to do it as a workaround for a pytypes bug. 
+ # See: https://github.com/Stewori/pytypes/issues/52 + column_types=dict(self._extractor.column_types), + numeric_columns=self._extractor.numeric_columns, + categorical_columns=self._extractor.categorical_columns, + text_columns=self._extractor.text_columns, + # Generally, one should not just pickle child instances, but extract underlying params. + numeric_imputer=pickle.dumps(self._extractor.numeric_imputer) if hasattr(self._extractor, 'numeric_imputer') else None, + numeric_scaler=pickle.dumps(self._extractor.numeric_scaler) if hasattr(self._extractor, 'numeric_scaler') else None, + categorical_encoder=pickle.dumps(self._extractor.categorical_encoder) if hasattr(self._extractor, 'categorical_encoder') else None, + categorical_imputer=pickle.dumps(self._extractor.categorical_imputer) if hasattr(self._extractor, 'categorical_imputer') else None, + one_hot_encoder=pickle.dumps(self._extractor.one_hot_encoder) if hasattr(self._extractor, 'one_hot_encoder') else None, + ) + + def set_params(self, *, params: Params) -> None: + if not all(params[param] is not None for param in ['column_types', 'numeric_columns', 'categorical_columns', 'text_columns']): + self._extractor = None + else: + extractor = self._create_extractor() + + extractor.column_types = params['column_types'] + extractor.numeric_columns = params['numeric_columns'] + extractor.categorical_columns = params['categorical_columns'] + extractor.text_columns = params['text_columns'] + + if params['numeric_imputer'] is not None: + extractor.numeric_imputer = pickle.loads(params['numeric_imputer']) + if params['numeric_scaler'] is not None: + extractor.numeric_scaler = pickle.loads(params['numeric_scaler']) + if params['categorical_encoder'] is not None: + extractor.categorical_encoder = pickle.loads(params['categorical_encoder']) + if params['categorical_imputer'] is not None: + extractor.categorical_imputer = pickle.loads(params['categorical_imputer']) + if params['one_hot_encoder'] is not None: + extractor.one_hot_encoder = pickle.loads(params['one_hot_encoder']) + + self._extractor = extractor diff --git a/tods/common-primitives/common_primitives/term_filter.py b/tods/common-primitives/common_primitives/term_filter.py new file mode 100644 index 0000000..4f6d541 --- /dev/null +++ b/tods/common-primitives/common_primitives/term_filter.py @@ -0,0 +1,100 @@ +import os +import re + +from d3m import container, exceptions, utils as d3m_utils +from d3m.metadata import base as metadata_base, hyperparams +from d3m.primitive_interfaces import base, transformer + +import common_primitives +from common_primitives import dataframe_utils + +__all__ = ('TermFilterPrimitive',) + +Inputs = container.DataFrame +Outputs = container.DataFrame + + +class Hyperparams(hyperparams.Hyperparams): + column = hyperparams.Hyperparameter[int]( + default=-1, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description='Index of column filter applies to.', + ) + inclusive = hyperparams.Hyperparameter[bool]( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description='True when values that contain a match against the term list are retained, False when they are removed.', + ) + terms = hyperparams.Set( + elements=hyperparams.Hyperparameter[str](''), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description='A set of terms to filter against. 
A row will be filtered if any term in the list matches.', + ) + match_whole = hyperparams.Hyperparameter[bool]( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description='True if a term is matched only against a full word, False if a word need only contain the term.', + ) + + +class TermFilterPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): + """ + A primitive which filters rows from a DataFrame based on a column value containing a match + against a caller supplied term list. Supports search-style matching where the target need only + contain a term, as well as whole word matching where the target is tokenized using regex word boundaries. + """ + + metadata = metadata_base.PrimitiveMetadata( + { + 'id': 'a6b27300-4625-41a9-9e91-b4338bfc219b', + 'version': '0.1.0', + 'name': "Term list dataset filter", + 'python_path': 'd3m.primitives.data_preprocessing.term_filter.Common', + 'source': { + 'name': common_primitives.__author__, + 'contact': 'mailto:cbethune@uncharted.software', + 'uris': [ + 'https://gitlab.com/datadrivendiscovery/common-primitives/blob/master/common_primitives/term_filter.py', + 'https://gitlab.com/datadrivendiscovery/common-primitives.git', + ], + }, + 'installation': [{ + 'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/common-primitives.git@{git_commit}#egg=common_primitives'.format( + git_commit=d3m_utils.current_git_commit(os.path.dirname(__file__)), + ), + }], + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.ARRAY_SLICING, + ], + 'primitive_family': metadata_base.PrimitiveFamily.DATA_PREPROCESSING, + }, + ) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + # to make sure index matches row indices + resource = inputs.reset_index(drop=True) + + try: + escaped_terms = [re.escape(t) for t in self.hyperparams['terms']] + + if self.hyperparams['match_whole']: + # convert term list into a regex that matches whole words + pattern = re.compile(r'\b(?:%s)\b' % '|'.join(escaped_terms)) + else: + # convert term list into a regex that does a partial match + pattern = re.compile('|'.join(escaped_terms)) + + matched = resource.iloc[:, self.hyperparams['column']].astype(str).str.contains(pattern) + to_keep = matched if self.hyperparams['inclusive'] else ~matched + + to_keep_indices = resource.loc[to_keep].index + + except re.error as error: + raise exceptions.InvalidArgumentValueError("Failed to compile regex for terms: {terms}".format(terms=self.hyperparams['terms'])) from error + + # remove dataframe and metadata rows by index + outputs = dataframe_utils.select_rows(inputs, to_keep_indices) + + return base.CallResult(outputs) diff --git a/tods/common-primitives/common_primitives/text_reader.py b/tods/common-primitives/common_primitives/text_reader.py new file mode 100644 index 0000000..d1c6585 --- /dev/null +++ b/tods/common-primitives/common_primitives/text_reader.py @@ -0,0 +1,70 @@ +import os +from urllib import parse as url_parse + +import frozendict # type: ignore + +from d3m import exceptions, utils as d3m_utils +from d3m.metadata import base as metadata_base + +import common_primitives +from common_primitives import base + + +class TextReaderPrimitive(base.FileReaderPrimitiveBase): + """ + A primitive which reads columns referencing plain text files. 
+ + Each column which has ``https://metadata.datadrivendiscovery.org/types/FileName`` semantic type + and a valid media type (``text/plain``) has every filename read as a Python string. By default + the resulting column with read strings is appended to existing columns. + """ + + _supported_media_types = ( + 'text/plain', + ) + _file_structural_type = str + _file_semantic_types = ('http://schema.org/Text',) + + metadata = metadata_base.PrimitiveMetadata( + { + 'id': '0b21fcca-8b35-457d-a65d-36294c6f80a2', + 'version': '0.1.0', + 'name': 'Columns text reader', + 'python_path': 'd3m.primitives.data_preprocessing.text_reader.Common', + 'keywords': ['text', 'reader', 'plain'], + 'source': { + 'name': common_primitives.__author__, + 'contact': 'mailto:mitar.commonprimitives@tnode.com', + 'uris': [ + 'https://gitlab.com/datadrivendiscovery/common-primitives/blob/master/common_primitives/text_reader.py', + 'https://gitlab.com/datadrivendiscovery/common-primitives.git', + ], + }, + 'installation': [{ + 'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/common-primitives.git@{git_commit}#egg=common_primitives'.format( + git_commit=d3m_utils.current_git_commit(os.path.dirname(__file__)), + ), + }], + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.FILE_MANIPULATION, + ], + 'supported_media_types': _supported_media_types, + 'primitive_family': metadata_base.PrimitiveFamily.DATA_PREPROCESSING, + } + ) + + def _read_fileuri(self, metadata: frozendict.FrozenOrderedDict, fileuri: str) -> str: + parsed_uri = url_parse.urlparse(fileuri, allow_fragments=False) + + if parsed_uri.scheme != 'file': + raise exceptions.NotSupportedError("Only local files are supported, not '{fileuri}'.".format(fileuri=fileuri)) + + if parsed_uri.netloc not in ['', 'localhost']: + raise exceptions.InvalidArgumentValueError("Invalid hostname for a local file: {fileuri}".format(fileuri=fileuri)) + + if not parsed_uri.path.startswith('/'): + raise exceptions.InvalidArgumentValueError("Not an absolute path for a local file: {fileuri}".format(fileuri=fileuri)) + + with open(parsed_uri.path, 'r', encoding='utf8') as file: + return file.read() diff --git a/tods/common-primitives/common_primitives/train_score_split.py b/tods/common-primitives/common_primitives/train_score_split.py new file mode 100644 index 0000000..aae15bf --- /dev/null +++ b/tods/common-primitives/common_primitives/train_score_split.py @@ -0,0 +1,89 @@ +import os +import typing + +import numpy # type: ignore +import pandas # type: ignore +from sklearn import model_selection # type: ignore + +from d3m import container, exceptions, utils as d3m_utils +from d3m.metadata import base as metadata_base, hyperparams + +import common_primitives +from common_primitives import base + +__all__ = ('TrainScoreDatasetSplitPrimitive',) + + +class Hyperparams(hyperparams.Hyperparams): + train_score_ratio = hyperparams.Uniform( + lower=0, + upper=1, + default=0.75, + upper_inclusive=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="The ratio between the train and score data and represents the proportion of the Dataset to include in the train split. The rest is included in the score split.", + ) + stratified = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Do stratified folds. 
The folds are made by preserving the percentage of samples for each class.", + ) + shuffle = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Whether to shuffle the data before splitting into batches.", + ) + delete_recursive = hyperparams.Hyperparameter[bool]( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Delete rows in other resources/tables which are not needed for rows left in the dataset entry point resource/table.", + ) + + +class TrainScoreDatasetSplitPrimitive(base.TabularSplitPrimitiveBase[Hyperparams]): + """ + A primitive which splits a tabular Dataset into random train and score subsets. + """ + + metadata = metadata_base.PrimitiveMetadata( + { + 'id': '3fcc6dc4-6681-4c86-948e-066d14e7d803', + 'version': '0.1.0', + 'name': "Train-score tabular dataset splits", + 'python_path': 'd3m.primitives.evaluation.train_score_dataset_split.Common', + 'source': { + 'name': common_primitives.__author__, + 'contact': 'mailto:mitar.commonprimitives@tnode.com', + 'uris': [ + 'https://gitlab.com/datadrivendiscovery/common-primitives/blob/master/common_primitives/train_score_split.py', + 'https://gitlab.com/datadrivendiscovery/common-primitives.git', + ], + }, + 'installation': [{ + 'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/common-primitives.git@{git_commit}#egg=common_primitives'.format( + git_commit=d3m_utils.current_git_commit(os.path.dirname(__file__)), + ), + }], + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.HOLDOUT, + metadata_base.PrimitiveAlgorithmType.DATA_SPLITTING, + ], + 'primitive_family': metadata_base.PrimitiveFamily.EVALUATION, + }, + ) + + def _get_splits(self, attributes: pandas.DataFrame, targets: pandas.DataFrame, dataset: container.Dataset, main_resource_id: str) -> typing.List[typing.Tuple[numpy.ndarray, numpy.ndarray]]: + if self.hyperparams['stratified'] and not len(targets.columns): + raise exceptions.InvalidArgumentValueError("Stratified split is requested, but no target columns found.") + + train_data, score_data = model_selection.train_test_split( + numpy.arange(len(attributes)), + test_size=None, + train_size=self.hyperparams['train_score_ratio'], + random_state=self._random_state, + shuffle=self.hyperparams['shuffle'], + stratify=targets if self.hyperparams['stratified'] else None, + ) + + return [(train_data, score_data)] diff --git a/tods/common-primitives/common_primitives/unseen_label_decoder.py b/tods/common-primitives/common_primitives/unseen_label_decoder.py new file mode 100644 index 0000000..1f617d5 --- /dev/null +++ b/tods/common-primitives/common_primitives/unseen_label_decoder.py @@ -0,0 +1,137 @@ +import os +from typing import cast, Dict, List, Union, Optional + +from d3m import container, utils as d3m_utils +from d3m.base import utils as base_utils +from d3m.metadata import base as metadata_base, hyperparams +from d3m.primitive_interfaces.transformer import TransformerPrimitiveBase +from d3m.primitive_interfaces.base import CallResult + +import common_primitives +from common_primitives import unseen_label_encoder + +Inputs = container.DataFrame +Outputs = container.DataFrame + + +class Hyperparams(hyperparams.Hyperparams): + encoder = hyperparams.Primitive( + default=unseen_label_encoder.UnseenLabelEncoderPrimitive, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + 
description="An \"UnseenLabelEncoderPrimitive\" to use for decoding.", + ) + use_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to operate on. If any specified column cannot be decoded, it is skipped.", + ) + exclude_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='replace', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should decoded columns be appended, should they replace original columns, or should only decoded columns be returned?", + ) + add_index_columns = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + + +# TODO: This is not yet very useful because it currently requires that columns are at the same index when decoding. +# This should be done better once each column has an unique ID. +# Then we can store mapping using that ID instead of column index. +# Alternatively, inverse mapping could be stored into metadata. +# See: https://gitlab.com/datadrivendiscovery/d3m/issues/112 +class UnseenLabelDecoderPrimitive(TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): + """ + A primitive which inverses the label encoding by ``UnseenLabelEncoderPrimitive``. + """ + + metadata = metadata_base.PrimitiveMetadata( + { + 'id': '39ae30f7-39ed-40af-8679-5cf108499605', + 'version': '0.1.0', + 'name': "Label decoder for UnseenLabelEncoderPrimitive", + 'python_path': 'd3m.primitives.data_preprocessing.label_decoder.Common', + 'source': { + 'name': common_primitives.__author__, + 'contact': 'mailto:mitar.commonprimitives@tnode.com', + 'uris': [ + 'https://gitlab.com/datadrivendiscovery/common-primitives/blob/master/common_primitives/unseen_label_decoder.py', + 'https://gitlab.com/datadrivendiscovery/common-primitives.git', + ], + }, + 'installation': [{ + 'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/common-primitives.git@{git_commit}#egg=common_primitives'.format( + git_commit=d3m_utils.current_git_commit(os.path.dirname(__file__)), + ), + }], + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.CATEGORY_ENCODER, + ], + 'primitive_family': metadata_base.PrimitiveFamily.DATA_PREPROCESSING, + }) + + def _can_use_column(self, inputs_metadata: metadata_base.DataMetadata, column_index: int) -> bool: + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + # We produce only on columns which have been encoded (are integers). 
+ if column_metadata['structural_type'] != int: + return False + + semantic_types = column_metadata.get('semantic_types', []) + + if 'https://metadata.datadrivendiscovery.org/types/CategoricalData' in semantic_types: + return True + + return False + + def _get_columns(self, inputs_metadata: metadata_base.DataMetadata) -> List[int]: + def can_use_column(column_index: int) -> bool: + return self._can_use_column(inputs_metadata, column_index) + + columns_to_use, columns_not_to_use = base_utils.get_columns_to_use(inputs_metadata, self.hyperparams['use_columns'], self.hyperparams['exclude_columns'], can_use_column) + + # We are OK if no columns ended up being decoded. + # "base_utils.combine_columns" will throw an error if it cannot work with this. + + if self.hyperparams['use_columns'] and columns_not_to_use: + self.logger.warning("Not all specified columns can be decoded. Skipping columns: %(columns)s", { + 'columns': columns_not_to_use, + }) + + return columns_to_use + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + columns_to_use = self._get_columns(inputs.metadata) + + output_columns = [self._produce_column(inputs, column_index) for column_index in columns_to_use] + + outputs = base_utils.combine_columns(inputs, columns_to_use, output_columns, return_result=self.hyperparams['return_result'], add_index_columns=self.hyperparams['add_index_columns']) + + return CallResult(outputs) + + def _produce_column(self, inputs: Inputs, column_index: int) -> Outputs: + inverse_labels = self.hyperparams['encoder'].get_params()['inverse_labels'] + + # We use an empty string for all labels we cannot decode. + column = container.DataFrame([inverse_labels[column_index].get(value, '') for value in inputs.iloc[:, column_index]], generate_metadata=False) + + column.metadata = self._produce_column_metadata(inputs.metadata, column_index) + + return column + + def _produce_column_metadata(self, inputs_metadata: metadata_base.DataMetadata, column_index: int) -> metadata_base.DataMetadata: + column_metadata = inputs_metadata.select_columns([column_index]) + column_metadata = column_metadata.update_column(0, {'structural_type': str}) + + return column_metadata diff --git a/tods/common-primitives/common_primitives/unseen_label_encoder.py b/tods/common-primitives/common_primitives/unseen_label_encoder.py new file mode 100644 index 0000000..ad6708d --- /dev/null +++ b/tods/common-primitives/common_primitives/unseen_label_encoder.py @@ -0,0 +1,203 @@ +import os +from typing import cast, Any, Dict, List, Union, Optional + +from d3m import container, exceptions, utils as d3m_utils +from d3m.base import utils as base_utils +from d3m.metadata import base as metadata_base, params, hyperparams +from d3m.primitive_interfaces.unsupervised_learning import UnsupervisedLearnerPrimitiveBase +from d3m.primitive_interfaces.base import CallResult + +import common_primitives + +Inputs = container.DataFrame +Outputs = container.DataFrame + + +class Params(params.Params): + # For each column, a map between original labels and encoded values. + labels: Optional[Dict[int, Dict[str, int]]] + # For each column, a map between encoded values and original labels. 
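+    # For example (hypothetical values): labels might be {0: {'red': 1, 'blue': 2}} with inverse_labels {0: {1: 'red', 2: 'blue'}}.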
+ inverse_labels: Optional[Dict[int, Dict[int, str]]] + + +class Hyperparams(hyperparams.Hyperparams): + use_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to operate on. If any specified column cannot be encoded, it is skipped.", + ) + exclude_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='replace', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should encoded columns be appended, should they replace original columns, or should only encoded columns be returned?", + ) + add_index_columns = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + encode_target_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should it encode also target columns?", + ) + + +class UnseenLabelEncoderPrimitive(UnsupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]): + """ + Label encoder that can puts any unseen categories into a single category. + """ + + __author__ = "Brown" + metadata = metadata_base.PrimitiveMetadata( + { + 'id': '26fc8fd3-f6b2-4c65-8afb-edb54ed2a3e4', + 'version': '0.2.0', + 'name': "Label encoder with an unseen category", + 'python_path': 'd3m.primitives.data_preprocessing.label_encoder.Common', + 'source': { + 'name': common_primitives.__author__, + 'contact': 'mailto:yeounoh_chung@brown.edu', + 'uris': [ + 'https://gitlab.com/datadrivendiscovery/common-primitives/blob/master/common_primitives/unseen_label_encoder.py', + 'https://gitlab.com/datadrivendiscovery/common-primitives.git', + ], + }, + 'installation': [{ + 'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/common-primitives.git@{git_commit}#egg=common_primitives'.format( + git_commit=d3m_utils.current_git_commit(os.path.dirname(__file__)), + ), + }], + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.CATEGORY_ENCODER, + ], + 'primitive_family': metadata_base.PrimitiveFamily.DATA_PREPROCESSING, + }) + + def __init__(self, *, hyperparams: Hyperparams) -> None: + super().__init__(hyperparams=hyperparams) + + self._training_inputs: Inputs = None + self._labels: Dict[int, Dict[Any, int]] = {} + self._inverse_labels: Dict[int, Dict[int, Any]] = {} + self._fitted = False + + def set_training_data(self, *, inputs: Inputs) -> None: # type: ignore + self._training_inputs = inputs + self._fitted = False + + def _can_use_column(self, inputs_metadata: metadata_base.DataMetadata, column_index: int) -> bool: + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + # We produce only on columns which have not yet been encoded (are strings). 
+ if column_metadata['structural_type'] != str: + return False + + semantic_types = column_metadata.get('semantic_types', []) + + if 'https://metadata.datadrivendiscovery.org/types/CategoricalData' in semantic_types: + # Skip parsing if a column is categorical, but also a target column. + if not self.hyperparams['encode_target_columns'] and 'https://metadata.datadrivendiscovery.org/types/Target' in semantic_types: + return False + + return True + + return False + + def _get_columns(self, inputs_metadata: metadata_base.DataMetadata) -> List[int]: + def can_use_column(column_index: int) -> bool: + return self._can_use_column(inputs_metadata, column_index) + + columns_to_use, columns_not_to_use = base_utils.get_columns_to_use(inputs_metadata, self.hyperparams['use_columns'], self.hyperparams['exclude_columns'], can_use_column) + + # We are OK if no columns ended up being encoded. + # "base_utils.combine_columns" will throw an error if it cannot work with this. + + if self.hyperparams['use_columns'] and columns_not_to_use: + self.logger.warning("Not all specified columns can be encoded. Skipping columns: %(columns)s", { + 'columns': columns_not_to_use, + }) + + return columns_to_use + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + if self._training_inputs is None: + raise exceptions.InvalidStateError("Missing training data.") + + if self._fitted: + return CallResult(None) + + columns_to_use = self._get_columns(self._training_inputs.metadata) + + self._labels = {} + self._inverse_labels = {} + + for column_index in columns_to_use: + self._fit_column(column_index) + + self._fitted = True + + return CallResult(None) + + def _fit_column(self, column_index: int) -> None: + self._labels[column_index] = {} + self._inverse_labels[column_index] = {} + + for value in self._training_inputs.iloc[:, column_index]: + value = value.strip() + if value not in self._labels[column_index]: + # We add 1 to reserve 0. 
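+                # Value 0 is returned by produce() for labels unseen during fitting; UnseenLabelDecoderPrimitive then maps it to an empty string.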
+ new_label = len(self._labels[column_index]) + 1 + self._labels[column_index][value] = new_label + self._inverse_labels[column_index][new_label] = value + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + if not self._fitted: + raise exceptions.PrimitiveNotFittedError("Primitive not fitted.") + + columns_to_use = self._get_columns(inputs.metadata) + + output_columns = [self._produce_column(inputs, column_index) for column_index in columns_to_use] + + outputs = base_utils.combine_columns(inputs, columns_to_use, output_columns, return_result=self.hyperparams['return_result'], add_index_columns=self.hyperparams['add_index_columns']) + + return CallResult(outputs) + + def _produce_column(self, inputs: Inputs, column_index: int) -> Outputs: + column = container.DataFrame([self._labels[column_index].get(value.strip(), 0) for value in inputs.iloc[:, column_index]], generate_metadata=False) + + column.metadata = self._produce_column_metadata(inputs.metadata, column_index) + + return column + + def _produce_column_metadata(self, inputs_metadata: metadata_base.DataMetadata, column_index: int) -> metadata_base.DataMetadata: + column_metadata = inputs_metadata.select_columns([column_index]) + column_metadata = column_metadata.update_column(0, {'structural_type': int}) + + return column_metadata + + def get_params(self) -> Params: + if not self._fitted: + return Params( + labels=None, + inverse_labels=None, + ) + + return Params( + labels=self._labels, + inverse_labels=self._inverse_labels, + ) + + def set_params(self, *, params: Params) -> None: + self._labels = params['labels'] + self._inverse_labels = params['inverse_labels'] + self._fitted = all(param is not None for param in params.values()) diff --git a/tods/common-primitives/common_primitives/utils.py b/tods/common-primitives/common_primitives/utils.py new file mode 100644 index 0000000..d45dbf0 --- /dev/null +++ b/tods/common-primitives/common_primitives/utils.py @@ -0,0 +1,192 @@ +import datetime +import logging +import typing + +import dateutil.parser +import numpy # type: ignore + +from d3m import container, deprecate +from d3m.base import utils as base_utils +from d3m.metadata import base as metadata_base + +logger = logging.getLogger(__name__) + +DEFAULT_DATETIME = datetime.datetime.fromtimestamp(0, datetime.timezone.utc) + + +@deprecate.function(message="it should not be used anymore") +def copy_elements_metadata(source_metadata: metadata_base.Metadata, target_metadata: metadata_base.DataMetadata, from_selector: metadata_base.Selector, + to_selector: metadata_base.Selector = (), *, ignore_all_elements: bool = False, check: bool = True, source: typing.Any = None) -> metadata_base.DataMetadata: + return source_metadata._copy_elements_metadata(target_metadata, list(from_selector), list(to_selector), [], ignore_all_elements) + + +@deprecate.function(message="use Metadata.copy_to method instead") +def copy_metadata(source_metadata: metadata_base.Metadata, target_metadata: metadata_base.DataMetadata, from_selector: metadata_base.Selector, + to_selector: metadata_base.Selector = (), *, ignore_all_elements: bool = False, check: bool = True, source: typing.Any = None) -> metadata_base.DataMetadata: + return source_metadata.copy_to(target_metadata, from_selector, to_selector, ignore_all_elements=ignore_all_elements) + + +@deprecate.function(message="use DataFrame.select_columns method instead") +@deprecate.arguments('source', message="argument ignored") +def select_columns(inputs: 
container.DataFrame, columns: typing.Sequence[metadata_base.SimpleSelectorSegment], *, + source: typing.Any = None) -> container.DataFrame: + return inputs.select_columns(columns) + + +@deprecate.function(message="use DataMetadata.select_columns method instead") +@deprecate.arguments('source', message="argument ignored") +def select_columns_metadata(inputs_metadata: metadata_base.DataMetadata, columns: typing.Sequence[metadata_base.SimpleSelectorSegment], *, + source: typing.Any = None) -> metadata_base.DataMetadata: + return inputs_metadata.select_columns(columns) + + +@deprecate.function(message="use DataMetadata.list_columns_with_semantic_types method instead") +def list_columns_with_semantic_types(metadata: metadata_base.DataMetadata, semantic_types: typing.Sequence[str], *, + at: metadata_base.Selector = ()) -> typing.Sequence[int]: + return metadata.list_columns_with_semantic_types(semantic_types, at=at) + + +@deprecate.function(message="use DataMetadata.list_columns_with_structural_types method instead") +def list_columns_with_structural_types(metadata: metadata_base.DataMetadata, structural_types: typing.Union[typing.Callable, typing.Sequence[typing.Union[str, type]]], *, + at: metadata_base.Selector = ()) -> typing.Sequence[int]: + return metadata.list_columns_with_structural_types(structural_types, at=at) + + +@deprecate.function(message="use DataFrame.remove_columns method instead") +@deprecate.arguments('source', message="argument ignored") +def remove_columns(inputs: container.DataFrame, column_indices: typing.Sequence[int], *, source: typing.Any = None) -> container.DataFrame: + return inputs.remove_columns(column_indices) + + +@deprecate.function(message="use DataMetadata.remove_columns method instead") +@deprecate.arguments('source', message="argument ignored") +def remove_columns_metadata(inputs_metadata: metadata_base.DataMetadata, column_indices: typing.Sequence[int], *, source: typing.Any = None) -> metadata_base.DataMetadata: + return inputs_metadata.remove_columns(column_indices) + + +@deprecate.function(message="use DataFrame.append_columns method instead") +@deprecate.arguments('source', message="argument ignored") +def append_columns(left: container.DataFrame, right: container.DataFrame, *, use_right_metadata: bool = False, source: typing.Any = None) -> container.DataFrame: + return left.append_columns(right, use_right_metadata=use_right_metadata) + + +@deprecate.function(message="use DataMetadata.append_columns method instead") +@deprecate.arguments('source', message="argument ignored") +def append_columns_metadata(left_metadata: metadata_base.DataMetadata, right_metadata: metadata_base.DataMetadata, use_right_metadata: bool = False, source: typing.Any = None) -> metadata_base.DataMetadata: + return left_metadata.append_columns(right_metadata, use_right_metadata=use_right_metadata) + + +@deprecate.function(message="use DataFrame.insert_columns method instead") +@deprecate.arguments('source', message="argument ignored") +def insert_columns(inputs: container.DataFrame, columns: container.DataFrame, at_column_index: int, *, source: typing.Any = None) -> container.DataFrame: + return inputs.insert_columns(columns, at_column_index) + + +@deprecate.function(message="use DataMetadata.insert_columns method instead") +@deprecate.arguments('source', message="argument ignored") +def insert_columns_metadata(inputs_metadata: metadata_base.DataMetadata, columns_metadata: metadata_base.DataMetadata, at_column_index: int, *, source: typing.Any = None) -> 
metadata_base.DataMetadata: + return inputs_metadata.insert_columns(columns_metadata, at_column_index) + + +@deprecate.function(message="use DataFrame.replace_columns method instead") +@deprecate.arguments('source', message="argument ignored") +def replace_columns(inputs: container.DataFrame, columns: container.DataFrame, column_indices: typing.Sequence[int], *, copy: bool = True, source: typing.Any = None) -> container.DataFrame: + return inputs.replace_columns(columns, column_indices, copy=copy) + + +@deprecate.function(message="use DataMetadata.replace_columns method instead") +@deprecate.arguments('source', message="argument ignored") +def replace_columns_metadata(inputs_metadata: metadata_base.DataMetadata, columns_metadata: metadata_base.DataMetadata, column_indices: typing.Sequence[int], *, source: typing.Any = None) -> metadata_base.DataMetadata: + return inputs_metadata.replace_columns(columns_metadata, column_indices) + + +@deprecate.function(message="use DataMetadata.get_index_columns method instead") +def get_index_columns(metadata: metadata_base.DataMetadata, *, at: metadata_base.Selector = ()) -> typing.Sequence[int]: + return metadata.get_index_columns(at=at) + + +@deprecate.function(message="use DataFrame.horizontal_concat method instead") +@deprecate.arguments('source', message="argument ignored") +def horizontal_concat(left: container.DataFrame, right: container.DataFrame, *, use_index: bool = True, + remove_second_index: bool = True, use_right_metadata: bool = False, source: typing.Any = None) -> container.DataFrame: + return left.horizontal_concat(right, use_index=use_index, remove_second_index=remove_second_index, use_right_metadata=use_right_metadata) + + +@deprecate.function(message="use DataMetadata.horizontal_concat method instead") +@deprecate.arguments('source', message="argument ignored") +def horizontal_concat_metadata(left_metadata: metadata_base.DataMetadata, right_metadata: metadata_base.DataMetadata, *, use_index: bool = True, + remove_second_index: bool = True, use_right_metadata: bool = False, source: typing.Any = None) -> metadata_base.DataMetadata: + return left_metadata.horizontal_concat(right_metadata, use_index=use_index, remove_second_index=remove_second_index, use_right_metadata=use_right_metadata) + + +@deprecate.function(message="use d3m.base.utils.get_columns_to_use function instead") +def get_columns_to_use(metadata: metadata_base.DataMetadata, use_columns: typing.Sequence[int], exclude_columns: typing.Sequence[int], + can_use_column: typing.Callable) -> typing.Tuple[typing.List[int], typing.List[int]]: + return base_utils.get_columns_to_use(metadata, use_columns, exclude_columns, can_use_column) + + +@deprecate.function(message="use d3m.base.utils.combine_columns function instead") +@deprecate.arguments('source', message="argument ignored") +def combine_columns(return_result: str, add_index_columns: bool, inputs: container.DataFrame, column_indices: typing.Sequence[int], + columns_list: typing.Sequence[container.DataFrame], *, source: typing.Any = None) -> container.DataFrame: + return base_utils.combine_columns(inputs, column_indices, columns_list, return_result=return_result, add_index_columns=add_index_columns) + + +@deprecate.function(message="use d3m.base.utils.combine_columns_metadata function instead") +@deprecate.arguments('source', message="argument ignored") +def combine_columns_metadata(return_result: str, add_index_columns: bool, inputs_metadata: metadata_base.DataMetadata, column_indices: typing.Sequence[int], + 
columns_metadata_list: typing.Sequence[metadata_base.DataMetadata], *, source: typing.Any = None) -> metadata_base.DataMetadata: + return base_utils.combine_columns_metadata(inputs_metadata, column_indices, columns_metadata_list, return_result=return_result, add_index_columns=add_index_columns) + + +@deprecate.function(message="use DataMetadata.set_table_metadata method instead") +@deprecate.arguments('source', message="argument ignored") +def set_table_metadata(inputs_metadata: metadata_base.DataMetadata, *, at: metadata_base.Selector = (), source: typing.Any = None) -> metadata_base.DataMetadata: + return inputs_metadata.set_table_metadata(at=at) + + +@deprecate.function(message="use DataMetadata.get_column_index_from_column_name method instead") +def get_column_index_from_column_name(inputs_metadata: metadata_base.DataMetadata, column_name: str, *, at: metadata_base.Selector = ()) -> int: + return inputs_metadata.get_column_index_from_column_name(column_name, at=at) + + +@deprecate.function(message="use Dataset.get_relations_graph method instead") +def build_relation_graph(dataset: container.Dataset) -> typing.Dict[str, typing.List[typing.Tuple[str, bool, int, int, typing.Dict]]]: + return dataset.get_relations_graph() + + +@deprecate.function(message="use d3m.base.utils.get_tabular_resource function instead") +def get_tabular_resource(dataset: container.Dataset, resource_id: typing.Optional[str], *, + pick_entry_point: bool = True, pick_one: bool = True, has_hyperparameter: bool = True) -> typing.Tuple[str, container.DataFrame]: + return base_utils.get_tabular_resource(dataset, resource_id, pick_entry_point=pick_entry_point, pick_one=pick_one, has_hyperparameter=has_hyperparameter) + + +@deprecate.function(message="use d3m.base.utils.get_tabular_resource_metadata function instead") +def get_tabular_resource_metadata(dataset_metadata: metadata_base.DataMetadata, resource_id: typing.Optional[metadata_base.SelectorSegment], *, + pick_entry_point: bool = True, pick_one: bool = True) -> metadata_base.SelectorSegment: + return base_utils.get_tabular_resource_metadata(dataset_metadata, resource_id, pick_entry_point=pick_entry_point, pick_one=pick_one) + + +@deprecate.function(message="use Dataset.select_rows method instead") +@deprecate.arguments('source', message="argument ignored") +def cut_dataset(dataset: container.Dataset, row_indices_to_keep: typing.Mapping[str, typing.Sequence[int]], *, + source: typing.Any = None) -> container.Dataset: + return dataset.select_rows(row_indices_to_keep) + + +def parse_datetime(value: str, *, fuzzy: bool = True) -> typing.Optional[datetime.datetime]: + try: + return dateutil.parser.parse(value, default=DEFAULT_DATETIME, fuzzy=fuzzy) + except (ValueError, OverflowError, TypeError): + return None + + +def parse_datetime_to_float(value: str, *, fuzzy: bool = True) -> float: + try: + parsed = parse_datetime(value, fuzzy=fuzzy) + if parsed is None: + return numpy.nan + else: + return parsed.timestamp() + except (ValueError, OverflowError, TypeError): + return numpy.nan diff --git a/tods/common-primitives/common_primitives/video_reader.py b/tods/common-primitives/common_primitives/video_reader.py new file mode 100644 index 0000000..ce6f661 --- /dev/null +++ b/tods/common-primitives/common_primitives/video_reader.py @@ -0,0 +1,87 @@ +import os + +import cv2 # type: ignore +import frozendict # type: ignore +import numpy # type: ignore + +from d3m import container, utils as d3m_utils +from d3m.metadata import base as metadata_base + +import common_primitives 
+from common_primitives import base + + +class VideoReaderPrimitive(base.FileReaderPrimitiveBase): + """ + A primitive which reads columns referencing video files. + + Each column which has ``https://metadata.datadrivendiscovery.org/types/FileName`` semantic type + and a valid media type (``video/mp4``, ``video/avi``) has every filename read into a video + represented as a numpy array. By default the resulting column with read arrays is appended + to existing columns. + + The shape of numpy arrays is F x H x W x C. F is the number of frames, C is the number of + channels in a video (e.g., C = 1 for greyscale, C = 3 for RGB), H is the height, and W + is the width. dtype is uint8. + """ + + _supported_media_types = ( + 'video/mp4', + 'video/avi', + ) + _file_structural_type = container.ndarray + _file_semantic_types = ('http://schema.org/VideoObject',) + + __author__ = 'University of Michigan, Eric Hofesmann, Nathan Louis, Madan Ravi Ganesh' + metadata = metadata_base.PrimitiveMetadata( + { + 'id': 'a29b0080-aeff-407d-9edb-0aa3eefbde01', + 'version': '0.2.0', + 'name': 'Columns video reader', + 'python_path': 'd3m.primitives.data_preprocessing.video_reader.Common', + 'keywords': ['video', 'reader', 'avi', 'mp4'], + 'source': { + 'name': common_primitives.__author__, + 'contact': 'mailto:davjoh@umich.edu', + 'uris': [ + 'https://gitlab.com/datadrivendiscovery/common-primitives/blob/master/common_primitives/video_reader.py', + 'https://gitlab.com/datadrivendiscovery/common-primitives.git', + ], + }, + 'installation': [{ + 'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/common-primitives.git@{git_commit}#egg=common_primitives'.format( + git_commit=d3m_utils.current_git_commit(os.path.dirname(__file__)), + ), + }], + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.FILE_MANIPULATION, + ], + 'supported_media_types': _supported_media_types, + 'primitive_family': metadata_base.PrimitiveFamily.DATA_PREPROCESSING, + } + ) + + def _read_fileuri(self, metadata: frozendict.FrozenOrderedDict, fileuri: str) -> container.ndarray: + capture = cv2.VideoCapture(fileuri) + frames = [] + + try: + while capture.isOpened(): + ret, frame = capture.read() + if not ret: + break + else: + assert frame.dtype == numpy.uint8, frame.dtype + + if frame.ndim == 2: + # Make sure there are always three dimensions. 
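+                        # A greyscale frame of shape (H, W) becomes (H, W, 1).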
+ frame = frame.reshape(list(frame.shape) + [1]) + + assert frame.ndim == 3, frame.ndim + + frames.append(frame) + finally: + capture.release() + + return container.ndarray(numpy.array(frames), generate_metadata=False) diff --git a/tods/common-primitives/common_primitives/xgboost_dart.py b/tods/common-primitives/common_primitives/xgboost_dart.py new file mode 100644 index 0000000..87e5eae --- /dev/null +++ b/tods/common-primitives/common_primitives/xgboost_dart.py @@ -0,0 +1,684 @@ +import os +from collections import OrderedDict +from typing import cast, Dict, List, Union, Sequence, Optional, Tuple + +import numpy as np # type: ignore +import pandas as pd # type: ignore +import xgboost as xgb # type: ignore +from sklearn.multioutput import MultiOutputClassifier # type: ignore +from sklearn.preprocessing import LabelEncoder # type: ignore + +from d3m import container, exceptions, utils as d3m_utils +from d3m.base import utils as base_utils +from d3m.metadata import base as metadata_base, hyperparams, params +from d3m.primitive_interfaces.base import CallResult, ProbabilisticCompositionalityMixin, SamplingCompositionalityMixin, ContinueFitMixin +from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase + +import common_primitives + +Inputs = container.DataFrame +Outputs = container.DataFrame + + +class Params(params.Params): + estimators: Optional[Union[xgb.XGBClassifier, List[xgb.XGBClassifier]]] + booster: Optional[Union[xgb.Booster, List[xgb.Booster]]] + classes: Optional[Union[np.ndarray, List[np.ndarray]]] + n_classes: Optional[Union[int, List[int]]] + objective: Optional[str] + multi_output_estimator_dict: Optional[Dict] + attribute_columns_names: Optional[List[str]] + target_columns_metadata: Optional[List[OrderedDict]] + target_columns_names: Optional[List[str]] + le: Optional[LabelEncoder] + + +class Hyperparams(hyperparams.Hyperparams): + n_estimators = hyperparams.UniformInt( + lower=1, + upper=10000, + default=200, + description='The number of trees in the forest.', + semantic_types=[ + 'https://metadata.datadrivendiscovery.org/types/TuningParameter', + 'https://metadata.datadrivendiscovery.org/types/ResourcesUseParameter', + ], + ) + n_more_estimators = hyperparams.UniformInt( + lower=1, + upper=10000, + default=100, + description='When continuing a fit, it controls how many more trees to add every time.', + semantic_types=[ + 'https://metadata.datadrivendiscovery.org/types/TuningParameter', + 'https://metadata.datadrivendiscovery.org/types/ResourcesUseParameter', + ], + ) + max_depth = hyperparams.Union[Union[int, None]]( + configuration=OrderedDict( + limit=hyperparams.Bounded[int]( + lower=1, + upper=None, + default=3, + ), + unlimited=hyperparams.Enumeration[int]( + values=[0], + default=0, + description='Nodes are expanded until all leaves are pure or until all leaves contain less than "min_samples_split" samples.', + ), + ), + default='limit', + description='The maximum depth of the tree.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + learning_rate = hyperparams.Uniform( + lower=0, + upper=1, + default=0.1, + description=r'Boosting learning rate (xgb\`s \"eta\")', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + gamma = hyperparams.Bounded[float]( + lower=0.0, + upper=None, + default=0.0, + description='Minimum loss reduction required to make a further partition on a leaf node of the tree', + 
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + min_child_weight = hyperparams.Bounded[int]( + lower=0, + upper=None, + default=1, + description='Minimum sum of instance weight (hessian) needed in a child. If the tree partition step results ' + 'in a leaf node with the sum of instance weight less than min_child_weight, then the building ' + 'process will give up further partitioning ', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + max_delta_step = hyperparams.Union[Union[int, None]]( + configuration=OrderedDict( + limit=hyperparams.Bounded[int]( + lower=1, + # TODO: 1-10 instead? + upper=None, + default=1, + description='Maximum delta step we allow each leaf output to be.' + ), + unlimited=hyperparams.Enumeration[int]( + values=[0], + default=0, + description='No constraint.', + ), + ), + default='unlimited', + description='Maximum delta step we allow.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + # TODO: better way to represent lower bound is exclusive? + subsample = hyperparams.Uniform( + lower=0.0001, + upper=1, + default=1, + upper_inclusive=True, + description='Subsample ratio of the training instances,this will prevent overfitting. Subsampling will occur ' + 'once in every boosting iteration.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + colsample_bytree = hyperparams.Uniform( + lower=0.0001, + upper=1, + default=1, + upper_inclusive=True, + description='Subsample ratio of columns when constructing each tree. Subsampling will occur once in every ' + 'boosting iteration', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + colsample_bylevel = hyperparams.Uniform( + lower=0.0001, + upper=1, + default=1, + upper_inclusive=True, + description='Subsample ratio of columns for each split, in each level. Subsampling will occur each time a new ' + 'split is made', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + reg_lambda = hyperparams.Bounded[float]( + lower=0, + upper=None, + default=1, + description='L2 regularization term on weights. Increasing this value will make model more conservative.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + reg_alpha = hyperparams.Bounded[float]( + lower=0, + upper=None, + default=0, + description='L1 regularization term on weights. 
Increasing this value will make model more conservative.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + scale_pos_weight = hyperparams.Bounded[float]( + lower=0, + upper=None, + default=1, + description='Control the balance of positive and negative weights, useful for unbalanced classes', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + base_score = hyperparams.Bounded[float]( + lower=0, + upper=None, + default=0.5, + description='The initial prediction score of all instances, global bias.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + n_jobs = hyperparams.Union[Union[int, None]]( + configuration=OrderedDict( + limit=hyperparams.Bounded[int]( + lower=1, + upper=None, + default=1, + ), + all_cores=hyperparams.Enumeration[int]( + values=[-1], + default=-1, + description='The number of jobs is set to the number of cores.', + ), + ), + default='limit', + description='The number of jobs to run in parallel for both "fit" and "produce".', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ResourcesUseParameter'], + ) + sample_type = hyperparams.Enumeration[str]( + values=['uniform', 'weighted'], + default='uniform', + description='Type of sampling algorithm', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + normalize_type = hyperparams.Enumeration[str]( + values=['tree', 'forest'], + default='tree', + description='Type of normalization algorithm', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + rate_drop = hyperparams.Bounded[float]( + lower=0, + upper=1.0, + default=0.0, + description='Dropout rate (a fraction of previous trees to drop during the dropout)', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + one_drop = hyperparams.Enumeration[int]( + values=[0, 1], + default=0, + description='When this flag is enabled, at least one tree is always dropped during the dropout', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + skip_drop = hyperparams.Bounded[float]( + lower=0, + upper=1.0, + default=0.0, + description='Probability of skipping the dropout procedure during a boosting iteration', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + use_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of inputs column indices to force primitive to operate on. If any specified column cannot be used, it is skipped.", + ) + exclude_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of inputs column indices to not operate on. Applicable only if \"use_columns\" is not provided.", + ) + use_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of outputs column indices to force primitive to operate on. 
If any specified column cannot be used, it is skipped.", + ) + exclude_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of outputs column indices to not operate on. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + # Default value depends on the nature of the primitive. + default='append', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should resulting columns be appended, should they replace original columns, or should only resulting columns be returned?", + ) + add_index_columns = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + + +# TODO: Instead of using XGBoostClassifier instance provided by the xgboost's sklearn interface, use XGBoost original +# API to prevent ugly set and get params +class XGBoostDartClassifierPrimitive(ProbabilisticCompositionalityMixin[Inputs, Outputs, Params, Hyperparams], + SamplingCompositionalityMixin[Inputs, Outputs, Params, Hyperparams], + ContinueFitMixin[Inputs, Outputs, Params, Hyperparams], + SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]): + """ + A XGBoost classifier using ``xgb.XGBoostClassifier`` with Dart Boosting type. + + It uses semantic types to determine which columns to operate on. + """ + + __author__ = 'TAMU DARPA D3M Team, TsungLin Yang ' + metadata = metadata_base.PrimitiveMetadata( + { + 'id': '7476950e-4373-4cf5-a852-7e16afb8e098', + 'version': '0.1.0', + 'name': "XGBoost DART classifier", + 'python_path': 'd3m.primitives.classification.xgboost_dart.Common', + 'keywords': ['xgboost', 'decision tree', 'gradient boosted trees', ], + 'source': { + 'name': common_primitives.__author__, + 'contact': 'mailto:lin.yang@tamu.edu', + 'uris': [ + 'https://gitlab.com/datadrivendiscovery/common-primitives.git', + ], + }, + 'installation': [{ + 'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/common-primitives.git@{git_commit}#egg=common_primitives'.format( + git_commit=d3m_utils.current_git_commit(os.path.dirname(__file__)), + ), + }], + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.GRADIENT_BOOSTING, + ], + 'primitive_family': metadata_base.PrimitiveFamily.CLASSIFICATION, + 'hyperparams_to_tune': [ + 'learning_rate', + 'colsample_bytree', + 'min_child_weight', + 'subsample', + 'max_depth', + 'max_delta_step' + ] + } + ) + + def __init__(self, *, hyperparams: Hyperparams, random_seed: int = 0, _verbose: int = 0) -> None: + super().__init__(hyperparams=hyperparams, random_seed=random_seed) + + # We need random seed multiple times (every time an underlying "RandomForestClassifier" is instantiated), + # and when we sample. So instead we create our own random state we use everywhere. 
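+        # The random state is also stored by "__getstate__" below so that pickling preserves it for reproducibility.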
+ self._random_state = np.random.RandomState(self.random_seed) + self._verbose = _verbose + self._training_inputs: Inputs = None + self._training_outputs: Outputs = None + self._new_training_data = False + self._learner: Union[xgb.XGBClassifier, MultiOutputClassifier] = None + self._attribute_columns_names: List[str] = None + self._target_columns_metadata: List[OrderedDict] = None + self._target_columns_names: List[str] = None + self._multi_output_estimator_dict: Dict = {} + + def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None: + self._training_inputs = inputs + self._training_outputs = outputs + self._new_training_data = True + + def _create_learner(self) -> None: + self._learner = xgb.XGBClassifier( + max_depth=self.hyperparams['max_depth'], + learning_rate=self.hyperparams['learning_rate'], + n_estimators=self.hyperparams['n_estimators'], + gamma=self.hyperparams['gamma'], + min_child_weight=self.hyperparams['min_child_weight'], + max_delta_step=self.hyperparams['max_delta_step'], + subsample=self.hyperparams['subsample'], + colsample_bylevel=self.hyperparams['colsample_bylevel'], + colsample_bytree=self.hyperparams['colsample_bytree'], + reg_alpha=self.hyperparams['reg_alpha'], + reg_lambda=self.hyperparams['reg_lambda'], + scale_pos_weight=self.hyperparams['scale_pos_weight'], + base_score=self.hyperparams['base_score'], + n_jobs=-1 if self.hyperparams['n_jobs'] is None else self.hyperparams['n_jobs'], + random_state=self.random_seed, + booster='dart', + silent=not bool(self._verbose) + ) + + def _get_target_columns_metadata(self, outputs_metadata: metadata_base.DataMetadata) -> List[OrderedDict]: + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) + + # Update semantic types and prepare it for predicted targets. + semantic_types = list(column_metadata.get('semantic_types', [])) + if 'https://metadata.datadrivendiscovery.org/types/PredictedTarget' not in semantic_types: + semantic_types.append('https://metadata.datadrivendiscovery.org/types/PredictedTarget') + semantic_types = [semantic_type for semantic_type in semantic_types if + semantic_type != 'https://metadata.datadrivendiscovery.org/types/TrueTarget'] + column_metadata['semantic_types'] = semantic_types + + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + def _store_target_columns_metadata(self, inputs: Inputs, outputs: Outputs) -> None: + self._attribute_columns_names = list(inputs.columns) + self._target_columns_metadata = self._get_target_columns_metadata(outputs.metadata) + self._target_columns_names = list(outputs.columns) + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + if self._training_inputs is None or self._training_outputs is None: + raise exceptions.InvalidStateError("Missing training data.") + + # An optimization. Do not refit if data has not changed. + if not self._new_training_data: + return CallResult(None) + self._new_training_data = False + + inputs, _ = self._select_inputs_columns(self._training_inputs) + outputs, _ = self._select_outputs_columns(self._training_outputs) + + self._create_learner() + + # A special case for sklearn. It prefers an 1D array instead of 2D when there is only one target. 
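+        # With more than one target column, the learner is instead wrapped in "MultiOutputClassifier" below.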
+ if outputs.ndim == 2 and outputs.shape[1] == 1: + fit_outputs = np.ravel(outputs) + else: + fit_outputs = outputs + self._learner = MultiOutputClassifier(self._learner) + + self._learner.fit(np.array(inputs), np.array(fit_outputs)) + if isinstance(self._learner, MultiOutputClassifier): + for _, output in outputs.iteritems(): + estimator_index = next((index for index, estimator in enumerate(self._learner.estimators_) if + set(output.unique()) == set(estimator.classes_)), None) + self._multi_output_estimator_dict[output.name] = estimator_index + + self._store_target_columns_metadata(inputs, outputs) + + return CallResult(None) + + def continue_fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + if self._training_inputs is None or self._training_outputs is None: + raise exceptions.InvalidStateError("Missing training data.") + + # This model is not improving fitting if called multiple times on the same data. + if not self._new_training_data: + return CallResult(None) + self._new_training_data = False + + if not self._learner: + self._create_learner() + + inputs, _ = self._select_inputs_columns(self._training_inputs) + outputs, _ = self._select_outputs_columns(self._training_outputs) + + # using xgboost api to continue fit the classifier. + def continue_xgb_booster(xgb_model: xgb.XGBClassifier, inputs: Inputs, + output_values: Union[np.ndarray, Outputs], num_of_boosting_round: int) -> None: + label = LabelEncoder().fit_transform(output_values) + dtrain = xgb.DMatrix(data=inputs.values, label=label) + model_param = xgb_model.get_xgb_params() + del (model_param['n_estimators']) + model_param['objective'] = xgb_model.objective + model_param['num_class'] = xgb_model.n_classes_ + booster = xgb.train(params=model_param, dtrain=dtrain, + num_boost_round=num_of_boosting_round, + xgb_model=xgb_model.get_booster()) + xgb_model.set_params(_Booster=booster) + + # A special case for sklearn. It prefers an 1D array instead of 2D when there is only one target. + if outputs.ndim == 2 and outputs.shape[1] == 1: + continue_xgb_booster(self._learner, inputs, np.ravel(outputs), self.hyperparams['n_more_estimators']) + else: + # TODO Currently doesn't support unseen target for continuing multi-output classification. 
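+                # Each target column is matched back to the estimator it was originally fitted on via "_multi_output_estimator_dict".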
+            if outputs.shape[1] != len(self._learner.estimators_):
+                raise exceptions.InvalidArgumentValueError("The number of target columns does not match the original training data.")
+            for _, output in outputs.iteritems():
+                estimator_index = self._multi_output_estimator_dict.get(output.name, None)
+                if estimator_index is None:
+                    raise exceptions.InvalidArgumentValueError(
+                        'Unseen target column when continuing fit {}'.format(output.name))
+                estimator = self._learner.estimators_[self._multi_output_estimator_dict[output.name]]
+                continue_xgb_booster(estimator, inputs, output, self.hyperparams['n_more_estimators'])
+
+        self._store_target_columns_metadata(inputs, outputs)
+
+        return CallResult(None)
+
+    def _update_predictions_metadata(self, outputs: Optional[Outputs], target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata:
+        outputs_metadata = metadata_base.DataMetadata()
+        if outputs is not None:
+            outputs_metadata = outputs_metadata.generate(outputs)
+
+        for column_index, column_metadata in enumerate(target_columns_metadata):
+            outputs_metadata = outputs_metadata.update_column(column_index, column_metadata)
+
+        return outputs_metadata
+
+    def _wrap_predictions(self, predictions: np.ndarray) -> Outputs:
+        outputs = container.DataFrame(predictions, generate_metadata=False)
+        outputs.metadata = self._update_predictions_metadata(outputs, self._target_columns_metadata)
+        outputs.columns = self._target_columns_names
+        return outputs
+
+    def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
+        if not self._learner:
+            raise exceptions.PrimitiveNotFittedError("Primitive not fitted.")
+
+        selected_inputs, columns_to_use = self._select_inputs_columns(inputs)
+
+        # TODO: If the booster object is DART type, predict() will perform dropouts, i.e. only some of the trees will
+        # be evaluated. This will produce incorrect results if the data is not the training data. To obtain correct
+        # results on test sets, set ntree_limit to a nonzero value.
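+        # The predict() calls below therefore pass "ntree_limit=n_estimators" so that all trees are evaluated.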
+ # Potential BUG + if not isinstance(self._learner, MultiOutputClassifier): + predictions = self._learner.predict(selected_inputs.values, ntree_limit=self._learner.n_estimators) + else: + predictions = [] + for estimator in self._learner.estimators_: + predictions.append(estimator.predict(selected_inputs.values, ntree_limit=estimator.n_estimators)) + predictions = np.array(predictions).transpose() + output_columns = [self._wrap_predictions(predictions)] + + outputs = base_utils.combine_columns( + inputs, columns_to_use, output_columns, + return_result=self.hyperparams['return_result'], add_index_columns=self.hyperparams['add_index_columns'], + ) + + return CallResult(outputs) + + def sample(self, *, inputs: Inputs, num_samples: int = 1, timeout: float = None, iterations: int = None) -> \ + CallResult[Sequence[Outputs]]: + if not self._learner: + raise exceptions.PrimitiveNotFittedError("Primitive not fitted.") + + inputs, _ = self._select_inputs_columns(inputs) + + samples = [] + for i in range(num_samples): + predictions = self._learner.predict(inputs.values) + samples.append(self._wrap_predictions(predictions)) + + return CallResult(samples) + + def log_likelihoods(self, *, outputs: Outputs, inputs: Inputs, timeout: float = None, iterations: int = None) -> \ + CallResult[Outputs]: + if not self._learner: + raise exceptions.PrimitiveNotFittedError("Primitive not fitted.") + + outputs, _ = self._select_outputs_columns(outputs) + inputs, _ = self._select_inputs_columns(inputs) + log_proba = np.log(self._learner.predict_proba(np.array(inputs))) + + if outputs.shape[1] == 1: + log_proba = [log_proba] + classes = [self._learner.classes_] + else: + classes = [x.classes_ for x in self._learner.estimators_] + + samples_length = inputs.shape[0] + + log_likelihoods = [] + for k in range(outputs.shape[1]): + # We have to map each class to its internal (numerical) index used in the learner. + # This allows "outputs" to contain string classes. + outputs_column = outputs.iloc[:, k] + classes_map = pd.Series(np.arange(len(classes[k])), index=classes[k]) + mapped_outputs_column = outputs_column.map(classes_map) + + # For each target column (column in "outputs"), for each sample (row) we pick the log + # likelihood for a given class. 
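+            # Advanced indexing: for sample i we take "log_proba[k][i, mapped_outputs_column[i]]".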
+            log_likelihoods.append(log_proba[k][np.arange(samples_length), mapped_outputs_column])
+
+        results = container.DataFrame(dict(enumerate(log_likelihoods)), generate_metadata=True)
+        results.columns = outputs.columns
+
+        for k in range(outputs.shape[1]):
+            column_metadata = outputs.metadata.query_column(k)
+            if 'name' in column_metadata:
+                results.metadata = results.metadata.update_column(k, {'name': column_metadata['name']})
+
+        return CallResult(results)
+
+    def get_params(self) -> Params:
+        if not self._learner:
+            return Params(
+                estimators=None,
+                booster=None,
+                classes=None,
+                n_classes=None,
+                objective=None,
+                multi_output_estimator_dict=None,
+                le=None,
+                target_columns_metadata=None,
+                attribute_columns_names=None,
+                target_columns_names=None
+            )
+
+        return Params(
+            estimators=self._learner.estimators_ if isinstance(self._learner, MultiOutputClassifier) else self._learner,
+            booster=self._learner.get_booster() if not isinstance(self._learner, MultiOutputClassifier) else [
+                estimator.get_booster() for estimator in self._learner.estimators_],
+            classes=self._learner.classes_
+            if not isinstance(self._learner, MultiOutputClassifier) else [estimator.classes_ for estimator in
+                                                                          self._learner.estimators_],
+            n_classes=self._learner.n_classes_
+            if not isinstance(self._learner, MultiOutputClassifier) else [estimator.n_classes_ for estimator in
+                                                                          self._learner.estimators_],
+            objective=self._learner.objective
+            if not isinstance(self._learner, MultiOutputClassifier) else self._learner.estimators_[0].objective,
+            multi_output_estimator_dict=self._multi_output_estimator_dict
+            if isinstance(self._learner, MultiOutputClassifier) else {},
+            attribute_columns_names=self._attribute_columns_names,
+            target_columns_metadata=self._target_columns_metadata,
+            target_columns_names=self._target_columns_names,
+            le=self._learner._le if not isinstance(self._learner, MultiOutputClassifier) else None
+        )
+
+    def set_params(self, *, params: Params) -> None:
+        if not all(params[param] is not None for param in
+                   ['estimators', 'booster', 'classes', 'n_classes', 'objective', 'target_columns_metadata']):
+            self._learner = None
+        else:
+            if isinstance(self._learner, MultiOutputClassifier):
+                self._learner.estimators_ = params['estimators']
+                self._multi_output_estimator_dict = params['multi_output_estimator_dict']
+            else:
+                self._create_learner()
+                # A little hack to set the label encoder of the XGBoostClassifier instance to prevent exceptions.
+                self._learner._le = params['le']
+                # Another hack to make sure class attributes defined outside "__init__" get set properly.
+                self._learner.classes_ = 0
+                self._learner.n_classes_ = 0
+                self._learner.set_params(_Booster=params['booster'], n_classes_=params['n_classes'],
+                                         classes_=params['classes'], objective=params['objective'])
+            self._attribute_columns_names = params['attribute_columns_names']
+            self._target_columns_metadata = params['target_columns_metadata']
+            self._target_columns_names = params['target_columns_names']
+
+    def __getstate__(self) -> dict:
+        state = super().__getstate__()
+
+        # Random state is not part of the "Params", but it is part of the state we want to
+        # pickle and unpickle to have full reproducibility. So we have to add it ourselves here.
+        # This is also the difference between pickling/unpickling and "get_params"/"set_params".
+        # The latter saves only the model state, which is useful to produce at a later time, but
+        # if we want to also reproduce the exact sequence of values, we should be using pickling.
+        state['random_state'] = self._random_state
+
+        return state
+
+    def __setstate__(self, state: dict) -> None:
+        super().__setstate__(state)
+
+        self._random_state = state['random_state']
+
+    def _can_use_inputs_column(self, inputs_metadata: metadata_base.DataMetadata, column_index: int) -> bool:
+        column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index))
+
+        if not d3m_utils.is_numeric(column_metadata['structural_type']):
+            return False
+
+        return 'https://metadata.datadrivendiscovery.org/types/Attribute' in column_metadata.get('semantic_types', [])
+
+    def _get_inputs_columns(self, inputs_metadata: metadata_base.DataMetadata) -> List[int]:
+        def can_use_column(column_index: int) -> bool:
+            return self._can_use_inputs_column(inputs_metadata, column_index)
+
+        columns_to_use, columns_not_to_use = base_utils.get_columns_to_use(
+            inputs_metadata,
+            self.hyperparams['use_inputs_columns'],
+            self.hyperparams['exclude_inputs_columns'],
+            can_use_column,
+        )
+
+        if not columns_to_use:
+            raise ValueError("No inputs columns.")
+
+        if self.hyperparams['use_inputs_columns'] and columns_not_to_use:
+            self.logger.warning("Not all specified inputs columns can be used. Skipping columns: %(columns)s", {
+                'columns': columns_not_to_use,
+            })
+
+        return columns_to_use
+
+    def _can_use_outputs_column(self, outputs_metadata: metadata_base.DataMetadata, column_index: int) -> bool:
+        column_metadata = outputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index))
+
+        return 'https://metadata.datadrivendiscovery.org/types/TrueTarget' in column_metadata.get('semantic_types', [])
+
+    def _get_outputs_columns(self, outputs_metadata: metadata_base.DataMetadata) -> List[int]:
+        def can_use_column(column_index: int) -> bool:
+            return self._can_use_outputs_column(outputs_metadata, column_index)
+
+        columns_to_use, columns_not_to_use = base_utils.get_columns_to_use(
+            outputs_metadata,
+            self.hyperparams['use_outputs_columns'],
+            self.hyperparams['exclude_outputs_columns'],
+            can_use_column,
+        )
+
+        if not columns_to_use:
+            raise ValueError("No outputs columns.")
+
+        if self.hyperparams['use_outputs_columns'] and columns_not_to_use:
+            self.logger.warning("Not all specified outputs columns can be used. 
Skipping columns: %(columns)s", { + 'columns': columns_not_to_use, + }) + + return columns_to_use + + def _select_inputs_columns(self, inputs: Inputs) -> Tuple[Inputs, List[int]]: + columns_to_use = self._get_inputs_columns(inputs.metadata) + + return inputs.select_columns(columns_to_use), columns_to_use + + def _select_outputs_columns(self, outputs: Outputs) -> Tuple[Outputs, List[int]]: + columns_to_use = self._get_outputs_columns(outputs.metadata) + + return outputs.select_columns(columns_to_use), columns_to_use diff --git a/tods/common-primitives/common_primitives/xgboost_gbtree.py b/tods/common-primitives/common_primitives/xgboost_gbtree.py new file mode 100644 index 0000000..cda2fad --- /dev/null +++ b/tods/common-primitives/common_primitives/xgboost_gbtree.py @@ -0,0 +1,665 @@ +import os +from collections import OrderedDict +from typing import cast, Dict, List, Union, Sequence, Optional, Tuple + +import numpy as np # type: ignore +import pandas as pd # type: ignore +import xgboost as xgb # type: ignore +from sklearn.multioutput import MultiOutputClassifier # type: ignore +from sklearn.preprocessing import LabelEncoder # type: ignore + +from d3m import container, exceptions, utils as d3m_utils +from d3m.base import utils as base_utils +from d3m.metadata import base as metadata_base, hyperparams, params +from d3m.primitive_interfaces.base import CallResult, ProbabilisticCompositionalityMixin, SamplingCompositionalityMixin, ContinueFitMixin +from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase + +import common_primitives + +Inputs = container.DataFrame +Outputs = container.DataFrame + + +class Params(params.Params): + estimators: Optional[Union[xgb.XGBClassifier, List[xgb.XGBClassifier]]] + booster: Optional[Union[xgb.Booster, List[xgb.Booster]]] + classes: Optional[Union[np.ndarray, List[np.ndarray]]] + n_classes: Optional[Union[int, List[int]]] + objective: Optional[str] + multi_output_estimator_dict: Optional[Dict] + attribute_columns_names: Optional[List[str]] + target_columns_metadata: Optional[List[OrderedDict]] + target_columns_names: Optional[List[str]] + le: Optional[LabelEncoder] + + +class Hyperparams(hyperparams.Hyperparams): + n_estimators = hyperparams.UniformInt( + lower=1, + upper=10000, + default=100, + description='The number of trees in the forest.', + semantic_types=[ + 'https://metadata.datadrivendiscovery.org/types/TuningParameter', + 'https://metadata.datadrivendiscovery.org/types/ResourcesUseParameter', + ], + ) + n_more_estimators = hyperparams.UniformInt( + lower=1, + upper=10000, + default=100, + description='When continuing a fit, it controls how many more trees to add every time.', + semantic_types=[ + 'https://metadata.datadrivendiscovery.org/types/TuningParameter', + 'https://metadata.datadrivendiscovery.org/types/ResourcesUseParameter', + ], + ) + max_depth = hyperparams.Union[Union[int, None]]( + configuration=OrderedDict( + limit=hyperparams.Bounded[int]( + lower=1, + upper=None, + default=3, + ), + unlimited=hyperparams.Enumeration[int]( + values=[0], + default=0, + description='Nodes are expanded until all leaves are pure or until all leaves contain less than "min_samples_split" samples.', + ), + ), + default='limit', + description='The maximum depth of the tree.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + learning_rate = hyperparams.Uniform( + lower=0, + upper=1, + default=0.1, + description=r'Boosting learning rate (xgb\`s \"eta\")', + 
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + gamma = hyperparams.Bounded[float]( + lower=0.0, + upper=None, + default=0.0, + description='Minimum loss reduction required to make a further partition on a leaf node of the tree', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + min_child_weight = hyperparams.Bounded[int]( + lower=0, + upper=None, + default=1, + description='Minimum sum of instance weight (hessian) needed in a child. If the tree partition step results ' + 'in a leaf node with the sum of instance weight less than min_child_weight, then the building ' + 'process will give up further partitioning ', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + max_delta_step = hyperparams.Union[Union[int, None]]( + configuration=OrderedDict( + limit=hyperparams.Bounded[int]( + lower=1, + # TODO: 1-10 instead? + upper=None, + default=1, + description='Maximum delta step we allow each leaf output to be.' + ), + unlimited=hyperparams.Enumeration[int]( + values=[0], + default=0, + description='No constraint.', + ), + ), + default='unlimited', + description='Maximum delta step we allow.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + # TODO: better way to represent lower bound is exclusive? + subsample = hyperparams.Uniform( + lower=0.0001, + upper=1, + default=1, + upper_inclusive=True, + description='Subsample ratio of the training instances,this will prevent overfitting. Subsampling will occur ' + 'once in every boosting iteration.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + colsample_bytree = hyperparams.Uniform( + lower=0.0001, + upper=1, + default=1, + upper_inclusive=True, + description='Subsample ratio of columns when constructing each tree. Subsampling will occur once in every ' + 'boosting iteration', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + colsample_bylevel = hyperparams.Uniform( + lower=0.0001, + upper=1, + default=1, + upper_inclusive=True, + description='Subsample ratio of columns for each split, in each level. Subsampling will occur each time a new ' + 'split is made', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + reg_lambda = hyperparams.Bounded[float]( + lower=0, + upper=None, + default=1, + description='L2 regularization term on weights. Increasing this value will make model more conservative.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + reg_alpha = hyperparams.Bounded[float]( + lower=0, + upper=None, + default=0, + description='L1 regularization term on weights. 
Increasing this value will make model more conservative.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + scale_pos_weight = hyperparams.Bounded[float]( + lower=0, + upper=None, + default=1, + description='Control the balance of positive and negative weights, useful for unbalanced classes', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + base_score = hyperparams.Bounded[float]( + lower=0, + upper=None, + default=0.5, + description='The initial prediction score of all instances, global bias.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + n_jobs = hyperparams.Union[Union[int, None]]( + configuration=OrderedDict( + limit=hyperparams.Bounded[int]( + lower=1, + upper=None, + default=1, + ), + all_cores=hyperparams.Enumeration[int]( + values=[-1], + default=-1, + description='The number of jobs is set to the number of cores.', + ), + ), + default='limit', + description='The number of jobs to run in parallel for both "fit" and "produce".', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ResourcesUseParameter'], + ) + importance_type = hyperparams.Enumeration[str]( + values=['gain', 'weight', 'cover', 'total_gain', 'total_cover'], + default='gain', + description='The feature importance type', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + use_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of inputs column indices to force primitive to operate on. If any specified column cannot be used, it is skipped.", + ) + exclude_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of inputs column indices to not operate on. Applicable only if \"use_columns\" is not provided.", + ) + use_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of outputs column indices to force primitive to operate on. If any specified column cannot be used, it is skipped.", + ) + exclude_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of outputs column indices to not operate on. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + # Default value depends on the nature of the primitive. + default='append', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should resulting columns be appended, should they replace original columns, or should only resulting columns be returned?", + ) + add_index_columns = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. 
Applicable only if \"return_result\" is set to \"new\".", + ) + + +class XGBoostGBTreeClassifierPrimitive(ProbabilisticCompositionalityMixin[Inputs, Outputs, Params, Hyperparams], + SamplingCompositionalityMixin[Inputs, Outputs, Params, Hyperparams], + ContinueFitMixin[Inputs, Outputs, Params, Hyperparams], + SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]): + """ + A XGBoost classifier using ``xgb.XGBoostClassifier`` with GBTree Boosting type. + + It uses semantic types to determine which columns to operate on. + """ + __author__ = 'TAMU DARPA D3M Team, TsungLin Yang ' + metadata = metadata_base.PrimitiveMetadata( + { + 'id': 'fe0841b7-6e70-4bc3-a56c-0670a95ebc6a', + 'version': '0.1.0', + 'name': "XGBoost GBTree classifier", + 'python_path': 'd3m.primitives.classification.xgboost_gbtree.Common', + 'keywords': ['xgboost', 'decision tree', 'gradient boosted trees', ], + 'source': { + 'name': common_primitives.__author__, + 'contact': 'mailto:lin.yang@tamu.edu', + 'uris': [ + 'https://gitlab.com/datadrivendiscovery/common-primitives.git', + ], + }, + 'installation': [{ + 'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/common-primitives.git@{git_commit}#egg=common_primitives'.format( + git_commit=d3m_utils.current_git_commit(os.path.dirname(__file__)), + ), + }], + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.GRADIENT_BOOSTING, + ], + 'primitive_family': metadata_base.PrimitiveFamily.CLASSIFICATION, + 'hyperparams_to_tune': [ + 'learning_rate', + 'colsample_bytree', + 'min_child_weight', + 'subsample', + 'max_depth', + 'max_delta_step' + ] + } + ) + + def __init__(self, *, hyperparams: Hyperparams, random_seed: int = 0, _verbose: int = 0) -> None: + super().__init__(hyperparams=hyperparams, random_seed=random_seed) + + # We need random seed multiple times (every time an underlying "RandomForestClassifier" is instantiated), + # and when we sample. So instead we create our own random state we use everywhere. 
+ self._random_state = np.random.RandomState(self.random_seed) + self._verbose = _verbose + self._training_inputs: Inputs = None + self._training_outputs: Outputs = None + self._new_training_data = False + self._learner: Union[xgb.XGBClassifier, MultiOutputClassifier] = None + self._attribute_columns_names: List[str] = None + self._target_columns_metadata: List[OrderedDict] = None + self._target_columns_names: List[str] = None + self._multi_output_estimator_dict: Dict = {} + + def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None: + self._training_inputs = inputs + self._training_outputs = outputs + self._new_training_data = True + + def _create_learner(self) -> None: + self._learner = xgb.XGBClassifier( + max_depth=self.hyperparams['max_depth'], + learning_rate=self.hyperparams['learning_rate'], + n_estimators=self.hyperparams['n_estimators'], + gamma=self.hyperparams['gamma'], + min_child_weight=self.hyperparams['min_child_weight'], + max_delta_step=self.hyperparams['max_delta_step'], + subsample=self.hyperparams['subsample'], + colsample_bylevel=self.hyperparams['colsample_bylevel'], + colsample_bytree=self.hyperparams['colsample_bytree'], + reg_alpha=self.hyperparams['reg_alpha'], + reg_lambda=self.hyperparams['reg_lambda'], + scale_pos_weight=self.hyperparams['scale_pos_weight'], + importance_type=self.hyperparams['importance_type'], + base_score=self.hyperparams['base_score'], + n_jobs=-1 if self.hyperparams['n_jobs'] is None else self.hyperparams['n_jobs'], + random_state=self.random_seed, + booster='gbtree', + silent=not bool(self._verbose) + ) + + def _get_target_columns_metadata(self, outputs_metadata: metadata_base.DataMetadata) -> List[OrderedDict]: + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) + + # Update semantic types and prepare it for predicted targets. + semantic_types = list(column_metadata.get('semantic_types', [])) + if 'https://metadata.datadrivendiscovery.org/types/PredictedTarget' not in semantic_types: + semantic_types.append('https://metadata.datadrivendiscovery.org/types/PredictedTarget') + semantic_types = [semantic_type for semantic_type in semantic_types if + semantic_type != 'https://metadata.datadrivendiscovery.org/types/TrueTarget'] + column_metadata['semantic_types'] = semantic_types + + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + def _store_target_columns_metadata(self, inputs: Inputs, outputs: Outputs) -> None: + self._attribute_columns_names = list(inputs.columns) + self._target_columns_metadata = self._get_target_columns_metadata(outputs.metadata) + self._target_columns_names = list(outputs.columns) + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + if self._training_inputs is None or self._training_outputs is None: + raise exceptions.InvalidStateError("Missing training data.") + + # An optimization. Do not refit if data has not changed. + if not self._new_training_data: + return CallResult(None) + self._new_training_data = False + + inputs, _ = self._select_inputs_columns(self._training_inputs) + outputs, _ = self._select_outputs_columns(self._training_outputs) + self._create_learner() + + # A special case for sklearn. It prefers an 1D array instead of 2D when there is only one target. 
+ if outputs.ndim == 2 and outputs.shape[1] == 1: + fit_outputs = np.ravel(outputs) + else: + fit_outputs = outputs + self._learner = MultiOutputClassifier(self._learner) + + # convert data to np.array to avoid column name issue + self._learner.fit(np.array(inputs), np.array(fit_outputs)) + if isinstance(self._learner, MultiOutputClassifier): + for _, output in outputs.iteritems(): + estimator_index = next((index for index, estimator in enumerate(self._learner.estimators_) if + set(output.unique()) == set(estimator.classes_)), None) + self._multi_output_estimator_dict[output.name] = estimator_index + + self._store_target_columns_metadata(inputs, outputs) + + return CallResult(None) + + def continue_fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + if self._training_inputs is None or self._training_outputs is None: + raise exceptions.InvalidStateError("Missing training data.") + + # This model is not improving fitting if called multiple times on the same data. + if not self._new_training_data: + return CallResult(None) + self._new_training_data = False + + if not self._learner: + self._create_learner() + + inputs, _ = self._select_inputs_columns(self._training_inputs) + outputs, _ = self._select_outputs_columns(self._training_outputs) + + # using xgboost api to continue fit the classifier. + def continue_xgb_booster(xgb_model: xgb.XGBClassifier, inputs: Inputs, + output_values: Union[np.ndarray, Outputs], num_of_boosting_round: int) -> None: + label = LabelEncoder().fit_transform(output_values) + dtrain = xgb.DMatrix(data=inputs.values, label=label) + model_param = xgb_model.get_xgb_params() + del (model_param['n_estimators']) + model_param['objective'] = xgb_model.objective + model_param['num_class'] = xgb_model.n_classes_ + booster = xgb.train(params=model_param, dtrain=dtrain, + num_boost_round=num_of_boosting_round, + xgb_model=xgb_model.get_booster()) + xgb_model.set_params(_Booster=booster) + + # A special case for sklearn. It prefers an 1D array instead of 2D when there is only one target. + if outputs.ndim == 2 and outputs.shape[1] == 1: + continue_xgb_booster(self._learner, inputs, np.ravel(outputs), self.hyperparams['n_more_estimators']) + else: + # TODO Currently doesn't support unseen target for continuing multi-output classification. 
+ for _, output in outputs.iteritems(): + estimator_index = self._multi_output_estimator_dict.get(output.name, None) + if estimator_index is None: + raise exceptions.InvalidArgumentValueError( + 'Unseen target column when continuing fit {}'.format(output.name)) + estimator = self._learner.estimators_[self._multi_output_estimator_dict[output.name]] + continue_xgb_booster(estimator, inputs, output, self.hyperparams['n_more_estimators']) + + self._store_target_columns_metadata(inputs, outputs) + + return CallResult(None) + + def _update_predictions_metadata(self, outputs: Optional[Outputs], target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + outputs_metadata = metadata_base.DataMetadata() + if outputs is not None: + outputs_metadata = outputs_metadata.generate(outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, predictions: np.ndarray) -> Outputs: + outputs = container.DataFrame(predictions, generate_metadata=False) + outputs.metadata = self._update_predictions_metadata(outputs, self._target_columns_metadata) + outputs.columns = self._target_columns_names + return outputs + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + if not self._learner: + raise exceptions.PrimitiveNotFittedError("Primitive not fitted.") + + selected_inputs, columns_to_use = self._select_inputs_columns(inputs) + + predictions = self._learner.predict(selected_inputs.values) + + output_columns = [self._wrap_predictions(predictions)] + + outputs = base_utils.combine_columns( + inputs, columns_to_use, output_columns, + return_result=self.hyperparams['return_result'], add_index_columns=self.hyperparams['add_index_columns'], + ) + + return CallResult(outputs) + + def produce_feature_importances(self, *, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + if not self._learner: + raise exceptions.PrimitiveNotFittedError("Primitive not fitted.") + + # TODO: is feature importances the same for every target? 
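The TODO above concerns `produce_feature_importances`. A small sketch of the shape it returns, using a plain pandas frame in place of the d3m container and invented attribute names:
```python
import numpy as np
import pandas as pd
import xgboost as xgb

X = np.random.rand(80, 3)
y = np.random.randint(0, 2, size=80)
attribute_names = ["value_0", "value_1", "value_2"]  # stand-in for _attribute_columns_names

model = xgb.XGBClassifier(n_estimators=10).fit(X, y)

# One row, one column per attribute, mirroring the frame the primitive builds.
importances = pd.DataFrame(model.feature_importances_.reshape(1, -1),
                           columns=attribute_names)
print(importances)
```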
+ if hasattr(self._learner, 'feature_importances_'): + feature_importances_array = self._learner.feature_importances_ + else: + feature_importances_array = self._learner.estimators_[0].feature_importances_ + + feature_importances_array = feature_importances_array.reshape((1, len(self._attribute_columns_names))) + + feature_importances = container.DataFrame(feature_importances_array, generate_metadata=True) + feature_importances.columns = self._attribute_columns_names + for k in range(len(self._attribute_columns_names)): + feature_importances.metadata = feature_importances.metadata.update_column(k, { + 'name': self._attribute_columns_names[k]}) + + return CallResult(feature_importances) + + def sample(self, *, inputs: Inputs, num_samples: int = 1, timeout: float = None, iterations: int = None) -> \ + CallResult[Sequence[Outputs]]: + if not self._learner: + raise exceptions.PrimitiveNotFittedError("Primitive not fitted.") + + inputs, _ = self._select_inputs_columns(inputs) + + samples = [] + for i in range(num_samples): + predictions = self._learner.predict(inputs.values) + samples.append(self._wrap_predictions(predictions)) + + return CallResult(samples) + + def log_likelihoods(self, *, outputs: Outputs, inputs: Inputs, timeout: float = None, iterations: int = None) -> \ + CallResult[Outputs]: + if not self._learner: + raise exceptions.PrimitiveNotFittedError("Primitive not fitted.") + + outputs, _ = self._select_outputs_columns(outputs) + inputs, _ = self._select_inputs_columns(inputs) + log_proba = np.log(self._learner.predict_proba(np.array(inputs))) + + if outputs.shape[1] == 1: + log_proba = [log_proba] + classes = [self._learner.classes_] + else: + classes = [x.classes_ for x in self._learner.estimators_] + + samples_length = inputs.shape[0] + + log_likelihoods = [] + for k in range(outputs.shape[1]): + # We have to map each class to its internal (numerical) index used in the learner. + # This allows "outputs" to contain string classes. + outputs_column = outputs.iloc[:, k] + classes_map = pd.Series(np.arange(len(classes[k])), index=classes[k]) + mapped_outputs_column = outputs_column.map(classes_map) + + # For each target column (column in "outputs"), for each sample (row) we pick the log + # likelihood for a given class. 
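The indexing step described in the comment above can be illustrated with toy values (the classes, probabilities, and observed labels here are assumptions, not taken from the primitive):
```python
import numpy as np
import pandas as pd

# Three samples, string classes, probabilities as predict_proba would return
# them (columns ordered like learner.classes_).
classes = np.array(["no", "yes"])
log_proba = np.log(np.array([[0.9, 0.1],
                             [0.2, 0.8],
                             [0.5, 0.5]]))
outputs_column = pd.Series(["no", "yes", "yes"])

# Map each observed class to its column index, then pick, per sample, the
# log-probability of the class that was actually observed.
classes_map = pd.Series(np.arange(len(classes)), index=classes)
mapped = outputs_column.map(classes_map)
log_likelihoods = log_proba[np.arange(len(outputs_column)), mapped]
print(log_likelihoods)  # log(0.9), log(0.8), log(0.5)
```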
+ log_likelihoods.append(log_proba[k][np.arange(samples_length), mapped_outputs_column]) + + results = container.DataFrame(dict(enumerate(log_likelihoods)), generate_metadata=True) + results.columns = outputs.columns + + for k in range(outputs.shape[1]): + column_metadata = outputs.metadata.query_column(k) + if 'name' in column_metadata: + results.metadata = results.metadata.update_column(k, {'name': column_metadata['name']}) + + return CallResult(results) + + def get_params(self) -> Params: + if not self._learner: + return Params( + estimators=None, + booster=None, + classes=None, + n_classes=None, + objective=None, + multi_output_estimator_dict=None, + le=None, + target_columns_metadata=None, + attribute_columns_names=None, + target_columns_names=None + ) + + return Params( + estimators=self._learner.estimators_ if isinstance(self._learner, MultiOutputClassifier) else self._learner, + booster=self._learner.get_booster() if not isinstance(self._learner, MultiOutputClassifier) else [ + estimator.get_booster() for estimator in self._learner.estimators_], + classes=self._learner.classes_ + if not isinstance(self._learner, MultiOutputClassifier) else [estimator.classes_ for estimator in + self._learner.estimators_], + n_classes=self._learner.n_classes_ + if not isinstance(self._learner, MultiOutputClassifier) else [estimator.n_classes_ for estimator in + self._learner.estimators_], + objective=self._learner.objective + if not isinstance(self._learner, MultiOutputClassifier) else self._learner.estimators_[0].objective, + multi_output_estimator_dict=self._multi_output_estimator_dict + if isinstance(self._learner, MultiOutputClassifier) else {}, + attribute_columns_names=self._attribute_columns_names, + target_columns_metadata=self._target_columns_metadata, + target_columns_names=self._target_columns_names, + le=self._learner._le if not isinstance(self._learner, MultiOutputClassifier) else None + ) + + def set_params(self, *, params: Params) -> None: + if not all(params[param] is not None for param in + ['estimators', 'booster', 'classes', 'n_classes', 'objective', 'target_columns_metadata']): + self._learner = None + else: + if isinstance(self._learner, MultiOutputClassifier): + self._learner.estimators_ = params['estimators'] + self._multi_output_estimator_dict = params['multi_output_estimator_dict'] + else: + self._create_learner() + # A little hack to set lable encoder of XGBoostClassifier instance to prevent exceptions. + self._learner._le = params['le'] + # Another hack to make sure class attribute out side __init__ gets set properly + self._learner.classes_ = 0 + self._learner.n_classes_ = 0 + self._learner.set_params(_Booster=params['booster'], n_classes_=params['n_classes'], + classes_=params['classes'], objective=params['objective']) + self._target_columns_metadata = params['target_columns_metadata'] + self._attribute_columns_names = params['attribute_columns_names'] + self._target_columns_names = params['target_columns_names'] + + def __getstate__(self) -> dict: + state = super().__getstate__() + + # Random state is not part of the "Params", but it is part of the state we want to + # pickle and unpickle to have full reproducibility. So we have to add it ourselves here. + # This is also difference between pickling/unpickling and "get_params"/"set_params". + # The later saves only the model state which is useful to produce at a later time, but + # if we want to also reproduce the exact sequence of values, we should be using pickling. 
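The comment block above explains why the random state is pickled separately from `Params`. A small illustration of the point with a bare `RandomState` (not part of the primitive itself):
```python
import pickle
import numpy as np

# Only pickling captures the generator's internal state, so it alone
# reproduces the exact sequence of sampled values.
rs = np.random.RandomState(42)
rs.rand(3)  # advance the generator a bit

snapshot = pickle.dumps(rs)        # what __getstate__ adds to the pickled state
restored = pickle.loads(snapshot)

assert np.allclose(restored.rand(3), rs.rand(3))  # identical continuation
```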
+ state['random_state'] = self._random_state + + return state + + def __setstate__(self, state: dict) -> None: + super().__setstate__(state) + + self._random_state = state['random_state'] + + def _can_use_inputs_column(self, inputs_metadata: metadata_base.DataMetadata, column_index: int) -> bool: + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + if not d3m_utils.is_numeric(column_metadata['structural_type']): + return False + + return 'https://metadata.datadrivendiscovery.org/types/Attribute' in column_metadata.get('semantic_types', []) + + def _get_inputs_columns(self, inputs_metadata: metadata_base.DataMetadata) -> List[int]: + def can_use_column(column_index: int) -> bool: + return self._can_use_inputs_column(inputs_metadata, column_index) + + columns_to_use, columns_not_to_use = base_utils.get_columns_to_use( + inputs_metadata, + self.hyperparams['use_inputs_columns'], + self.hyperparams['exclude_inputs_columns'], + can_use_column, + ) + + if not columns_to_use: + raise ValueError("No inputs columns.") + + if self.hyperparams['use_inputs_columns'] and columns_not_to_use: + self.logger.warning("Not all specified inputs columns can used. Skipping columns: %(columns)s", { + 'columns': columns_not_to_use, + }) + + return columns_to_use + + def _can_use_outputs_column(self, outputs_metadata: metadata_base.DataMetadata, column_index: int) -> bool: + column_metadata = outputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + return 'https://metadata.datadrivendiscovery.org/types/TrueTarget' in column_metadata.get('semantic_types', []) + + def _get_outputs_columns(self, outputs_metadata: metadata_base.DataMetadata) -> List[int]: + def can_use_column(column_index: int) -> bool: + return self._can_use_outputs_column(outputs_metadata, column_index) + + columns_to_use, columns_not_to_use = base_utils.get_columns_to_use( + outputs_metadata, + self.hyperparams['use_outputs_columns'], + self.hyperparams['exclude_outputs_columns'], + can_use_column, + ) + + if not columns_to_use: + raise ValueError("No outputs columns.") + + if self.hyperparams['use_outputs_columns'] and columns_not_to_use: + self.logger.warning("Not all specified outputs columns can used. 
Skipping columns: %(columns)s", { + 'columns': columns_not_to_use, + }) + + return columns_to_use + + def _select_inputs_columns(self, inputs: Inputs) -> Tuple[Inputs, List[int]]: + columns_to_use = self._get_inputs_columns(inputs.metadata) + + return inputs.select_columns(columns_to_use), columns_to_use + + def _select_outputs_columns(self, outputs: Outputs) -> Tuple[Outputs, List[int]]: + columns_to_use = self._get_outputs_columns(outputs.metadata) + + return outputs.select_columns(columns_to_use), columns_to_use diff --git a/tods/common-primitives/common_primitives/xgboost_regressor.py b/tods/common-primitives/common_primitives/xgboost_regressor.py new file mode 100644 index 0000000..77e707e --- /dev/null +++ b/tods/common-primitives/common_primitives/xgboost_regressor.py @@ -0,0 +1,588 @@ +import os +from collections import OrderedDict +from typing import cast, Dict, List, Union, Sequence, Optional, Tuple + +import numpy as np # type: ignore +import xgboost as xgb # type: ignore +from sklearn.multioutput import MultiOutputRegressor # type: ignore + +from d3m import container, exceptions, utils as d3m_utils +from d3m.base import utils as base_utils +from d3m.metadata import base as metadata_base, hyperparams, params +from d3m.primitive_interfaces.base import CallResult, SamplingCompositionalityMixin, ContinueFitMixin +from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase + +import common_primitives + +Inputs = container.DataFrame +Outputs = container.DataFrame + + +class Params(params.Params): + estimators: Optional[Union[xgb.XGBRegressor, List[xgb.XGBRegressor]]] + booster: Optional[Union[xgb.Booster, List[xgb.Booster]]] + objective: Optional[str] + multi_output_estimator_dict: Optional[Dict] + target_columns_metadata: Optional[List[OrderedDict]] + + +class Hyperparams(hyperparams.Hyperparams): + n_estimators = hyperparams.UniformInt( + lower=1, + upper=10000, + default=100, + description='The number of trees in the forest.', + semantic_types=[ + 'https://metadata.datadrivendiscovery.org/types/TuningParameter', + 'https://metadata.datadrivendiscovery.org/types/ResourcesUseParameter', + ], + ) + n_more_estimators = hyperparams.UniformInt( + lower=1, + upper=10000, + default=100, + description='When continuing a fit, it controls how many more trees to add every time.', + semantic_types=[ + 'https://metadata.datadrivendiscovery.org/types/TuningParameter', + 'https://metadata.datadrivendiscovery.org/types/ResourcesUseParameter', + ], + ) + max_depth = hyperparams.Union[Union[int, None]]( + configuration=OrderedDict( + limit=hyperparams.Bounded[int]( + lower=1, + upper=None, + default=3, + ), + unlimited=hyperparams.Enumeration[int]( + values=[0], + default=0, + description='Nodes are expanded until all leaves are pure or until all leaves contain less than "min_samples_split" samples.', + ), + ), + default='limit', + description='The maximum depth of the tree.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + learning_rate = hyperparams.Uniform( + lower=0, + upper=1, + default=0.1, + description=r'Boosting learning rate (xgb\`s \"eta\")', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + gamma = hyperparams.Bounded[float]( + lower=0.0, + upper=None, + default=0.0, + description='Minimum loss reduction required to make a further partition on a leaf node of the tree', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + min_child_weight = 
hyperparams.Bounded[int]( + lower=0, + upper=None, + default=1, + description='Minimum sum of instance weight (hessian) needed in a child. If the tree partition step results ' + 'in a leaf node with the sum of instance weight less than min_child_weight, then the building ' + 'process will give up further partitioning ', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + max_delta_step = hyperparams.Union[Union[int, None]]( + configuration=OrderedDict( + limit=hyperparams.Bounded[int]( + lower=1, + # TODO: 1-10 instead? + upper=None, + default=1, + description='Maximum delta step we allow each leaf output to be.' + ), + unlimited=hyperparams.Enumeration[int]( + values=[0], + default=0, + description='No constraint.', + ), + ), + default='unlimited', + description='Maximum delta step we allow.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + # TODO: better way to represent lower bound is exclusive? + subsample = hyperparams.Uniform( + lower=0.0001, + upper=1, + default=1, + upper_inclusive=True, + description='Subsample ratio of the training instances,this will prevent overfitting. Subsampling will occur ' + 'once in every boosting iteration.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + colsample_bytree = hyperparams.Uniform( + lower=0.0001, + upper=1, + default=1, + upper_inclusive=True, + description='Subsample ratio of columns when constructing each tree. Subsampling will occur once in every ' + 'boosting iteration', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + colsample_bylevel = hyperparams.Uniform( + lower=0.0001, + upper=1, + default=1, + upper_inclusive=True, + description='Subsample ratio of columns for each split, in each level. Subsampling will occur each time a new ' + 'split is made', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + reg_lambda = hyperparams.Bounded[float]( + lower=0, + upper=None, + default=1, + description='L2 regularization term on weights. Increasing this value will make model more conservative.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + reg_alpha = hyperparams.Bounded[float]( + lower=0, + upper=None, + default=0, + description='L1 regularization term on weights. 
Increasing this value will make model more conservative.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + scale_pos_weight = hyperparams.Bounded[float]( + lower=0, + upper=None, + default=1, + description='Control the balance of positive and negative weights, useful for unbalanced classes', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + base_score = hyperparams.Bounded[float]( + lower=0, + upper=None, + default=0.5, + description='The initial prediction score of all instances, global bias.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + n_jobs = hyperparams.Union[Union[int, None]]( + configuration=OrderedDict( + limit=hyperparams.Bounded[int]( + lower=1, + upper=None, + default=1, + ), + all_cores=hyperparams.Enumeration[int]( + values=[-1], + default=-1, + description='The number of jobs is set to the number of cores.', + ), + ), + default='limit', + description='The number of jobs to run in parallel for both "fit" and "produce".', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ResourcesUseParameter'], + ) + importance_type = hyperparams.Enumeration[str]( + values=['gain', 'weight', 'cover', 'total_gain', 'total_cover'], + default='gain', + description='The feature importance type', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + use_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of inputs column indices to force primitive to operate on. If any specified column cannot be used, it is skipped.", + ) + exclude_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of inputs column indices to not operate on. Applicable only if \"use_columns\" is not provided.", + ) + use_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of outputs column indices to force primitive to operate on. If any specified column cannot be used, it is skipped.", + ) + exclude_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of outputs column indices to not operate on. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + # Default value depends on the nature of the primitive. + default='append', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should resulting columns be appended, should they replace original columns, or should only resulting columns be returned?", + ) + add_index_columns = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. 
Applicable only if \"return_result\" is set to \"new\".", + ) + + +class XGBoostGBTreeRegressorPrimitive(SamplingCompositionalityMixin[Inputs, Outputs, Params, Hyperparams], + ContinueFitMixin[Inputs, Outputs, Params, Hyperparams], + SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]): + """ + A XGBoost classifier using ``xgb.XGBoostRegressor`` with GBTree Boosting type. + + It uses semantic types to determine which columns to operate on. + """ + __author__ = 'TAMU DARPA D3M Team, TsungLin Yang ' + metadata = metadata_base.PrimitiveMetadata( + { + 'id': 'cdbb80e4-e9de-4caa-a710-16b5d727b959', + 'version': '0.1.0', + 'name': "XGBoost GBTree regressor", + 'python_path': 'd3m.primitives.regression.xgboost_gbtree.Common', + 'keywords': ['xgboost', 'decision tree', 'gradient boosted trees', ], + 'source': { + 'name': common_primitives.__author__, + 'contact': 'mailto:lin.yang@tamu.edu', + 'uris': [ + 'https://gitlab.com/datadrivendiscovery/common-primitives.git', + ], + }, + 'installation': [{ + 'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/common-primitives.git@{git_commit}#egg=common_primitives'.format( + git_commit=d3m_utils.current_git_commit(os.path.dirname(__file__)), + ), + }], + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.GRADIENT_BOOSTING, + ], + 'primitive_family': metadata_base.PrimitiveFamily.REGRESSION, + 'hyperparams_to_tune': [ + 'learning_rate', + 'colsample_bytree', + 'min_child_weight', + 'subsample', + 'max_depth', + 'max_delta_step' + ] + } + ) + + def __init__(self, *, hyperparams: Hyperparams, random_seed: int = 0, _verbose: int = 0) -> None: + super().__init__(hyperparams=hyperparams, random_seed=random_seed) + + # We need random seed multiple times (every time an underlying "RandomForestClassifier" is instantiated), + # and when we sample. So instead we create our own random state we use everywhere. + self._random_state = np.random.RandomState(self.random_seed) + self._verbose = _verbose + self._training_inputs: Inputs = None + self._training_outputs: Outputs = None + self._new_training_data = False + self._learner: Union[xgb.XGBRegressor, MultiOutputRegressor] = None + self._target_columns_metadata: List[OrderedDict] = None + # A dictionary recording estimator-target_column mapping. 
+ self._multi_output_estimator_dict: Dict = {} + + def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None: + self._training_inputs = inputs + self._training_outputs = outputs + self._new_training_data = True + + def _create_learner(self) -> None: + self._learner = xgb.XGBRegressor( + max_depth=self.hyperparams['max_depth'], + learning_rate=self.hyperparams['learning_rate'], + n_estimators=self.hyperparams['n_estimators'], + gamma=self.hyperparams['gamma'], + min_child_weight=self.hyperparams['min_child_weight'], + max_delta_step=self.hyperparams['max_delta_step'], + subsample=self.hyperparams['subsample'], + colsample_bylevel=self.hyperparams['colsample_bylevel'], + colsample_bytree=self.hyperparams['colsample_bytree'], + reg_alpha=self.hyperparams['reg_alpha'], + reg_lambda=self.hyperparams['reg_lambda'], + scale_pos_weight=self.hyperparams['scale_pos_weight'], + importance_type=self.hyperparams['importance_type'], + base_score=self.hyperparams['base_score'], + n_jobs=-1 if self.hyperparams['n_jobs'] is None else self.hyperparams['n_jobs'], + random_state=self.random_seed, + booster='gbtree', + silent=not bool(self._verbose) + ) + + def _get_target_columns_metadata(self, outputs_metadata: metadata_base.DataMetadata) -> List[OrderedDict]: + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) + + # Update semantic types and prepare it for predicted targets. + semantic_types = list(column_metadata.get('semantic_types', [])) + if 'https://metadata.datadrivendiscovery.org/types/PredictedTarget' not in semantic_types: + semantic_types.append('https://metadata.datadrivendiscovery.org/types/PredictedTarget') + semantic_types = [semantic_type for semantic_type in semantic_types if + semantic_type != 'https://metadata.datadrivendiscovery.org/types/TrueTarget'] + column_metadata['semantic_types'] = semantic_types + + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + def _store_target_columns_metadata(self, outputs: Outputs) -> None: + self._target_columns_metadata = self._get_target_columns_metadata(outputs.metadata) + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + if self._training_inputs is None or self._training_outputs is None: + raise exceptions.InvalidStateError("Missing training data.") + + # An optimization. Do not refit if data has not changed. + if not self._new_training_data: + return CallResult(None) + self._new_training_data = False + + inputs, _ = self._select_inputs_columns(self._training_inputs) + outputs, _ = self._select_outputs_columns(self._training_outputs) + + self._create_learner() + + # A special case for sklearn. It prefers an 1D array instead of 2D when there is only one target. 
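For the multi-target branch that follows, the regressor is wrapped in `MultiOutputRegressor` and the estimator-to-column mapping mentioned above is recorded so `continue_fit` can find each estimator by target name. A hedged sketch with invented toy data and column names:
```python
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.multioutput import MultiOutputRegressor

X = np.random.rand(60, 4)
Y = pd.DataFrame({"y1": np.random.rand(60), "y2": np.random.rand(60)})  # two targets

# One XGBRegressor per target column; remember which estimator serves which
# column so later warm-starts can look it up by name.
learner = MultiOutputRegressor(xgb.XGBRegressor(n_estimators=10)).fit(X, Y)
multi_output_estimator_dict = {name: i for i, name in enumerate(Y.columns)}
print(len(learner.estimators_), multi_output_estimator_dict)  # 2 {'y1': 0, 'y2': 1}
```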
+ if outputs.ndim == 2 and outputs.shape[1] == 1: + fit_outputs = np.ravel(outputs) + else: + fit_outputs = outputs + self._learner = MultiOutputRegressor(self._learner) + + # convert to np.array in order to unify the feature name based on xgboost's implementation + if type(inputs) is not np.array: + inputs = np.array(inputs) + fit_outputs = np.array(fit_outputs) + + self._learner.fit(inputs, fit_outputs) + + if isinstance(self._learner, MultiOutputRegressor): + for index, estimator in enumerate(self._learner.estimators_): + self._multi_output_estimator_dict[outputs.columns.values[index]] = index + + self._store_target_columns_metadata(outputs) + + return CallResult(None) + + def continue_fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + if self._training_inputs is None or self._training_outputs is None: + raise exceptions.InvalidStateError("Missing training data.") + + # This model is not improving fitting if called multiple times on the same data. + if not self._new_training_data: + return CallResult(None) + self._new_training_data = False + + if not self._learner: + self._create_learner() + + inputs, _ = self._select_inputs_columns(self._training_inputs) + outputs, _ = self._select_outputs_columns(self._training_outputs) + + # using xgboost api to continue fit the classifier. + def continue_xgb_booster(xgb_model: xgb.XGBRegressor, inputs: Inputs, + output_values: Union[np.ndarray, Outputs], num_of_boosting_round: int) -> None: + dtrain = xgb.DMatrix(data=inputs.values, label=output_values) + model_param = xgb_model.get_xgb_params() + del (model_param['n_estimators']) + model_param['objective'] = xgb_model.objective + booster = xgb.train(params=model_param, dtrain=dtrain, + num_boost_round=num_of_boosting_round, + xgb_model=xgb_model.get_booster()) + xgb_model.set_params(_Booster=booster) + + # A special case for sklearn. It prefers an 1D array instead of 2D when there is only one target. 
+ if outputs.ndim == 2 and outputs.shape[1] == 1: + continue_xgb_booster(self._learner, inputs, np.ravel(outputs), self.hyperparams['n_more_estimators']) + else: + for _, output in outputs.iteritems(): + estimator_index = self._multi_output_estimator_dict.get(output.name, None) + if estimator_index is None: + raise exceptions.InvalidArgumentValueError( + 'Unseen target column when continuing fit {}'.format(output.name)) + estimator = self._learner.estimators_[self._multi_output_estimator_dict[output.name]] + continue_xgb_booster(estimator, inputs, output, self.hyperparams['n_more_estimators']) + + self._store_target_columns_metadata(outputs) + + return CallResult(None) + + def _update_predictions_metadata(self, outputs: Optional[Outputs], target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + outputs_metadata = metadata_base.DataMetadata() + if outputs is not None: + outputs_metadata = outputs_metadata.generate(outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, predictions: np.ndarray) -> Outputs: + outputs = container.DataFrame(predictions, generate_metadata=False) + outputs.metadata = self._update_predictions_metadata(outputs, self._target_columns_metadata) + return outputs + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + if not self._learner: + raise exceptions.PrimitiveNotFittedError("Primitive not fitted.") + + selected_inputs, columns_to_use = self._select_inputs_columns(inputs) + + predictions = self._learner.predict(selected_inputs.values) + + output_columns = [self._wrap_predictions(predictions)] + + outputs = base_utils.combine_columns( + inputs, columns_to_use, output_columns, + return_result=self.hyperparams['return_result'], add_index_columns=self.hyperparams['add_index_columns'], + ) + + return CallResult(outputs) + + def produce_feature_importances(self, *, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + if not self._learner: + raise exceptions.PrimitiveNotFittedError("Primitive not fitted.") + + # TODO: is feature importances the same for every target? 
+ if hasattr(self._learner, 'feature_importances_'): + feature_importances_array = self._learner.feature_importances_ + else: + feature_importances_array = self._learner.estimators_[0].feature_importances_ + + if feature_importances_array.ndim == 1: + feature_importances_array = feature_importances_array.reshape((1, feature_importances_array.shape[0])) + + return CallResult(container.DataFrame(feature_importances_array, generate_metadata=True)) + + def sample(self, *, inputs: Inputs, num_samples: int = 1, timeout: float = None, iterations: int = None) -> \ + CallResult[Sequence[Outputs]]: + if not self._learner: + raise exceptions.PrimitiveNotFittedError("Primitive not fitted.") + + inputs, _ = self._select_inputs_columns(inputs) + + samples = [] + for i in range(num_samples): + predictions = self._learner.predict(inputs.values) + samples.append(self._wrap_predictions(predictions)) + + return CallResult(samples) + + def get_params(self) -> Params: + if not self._learner: + return Params( + estimators=None, + objective=None, + booster=None, + multi_output_estimator_dict=None, + target_columns_metadata=None, + ) + + return Params( + estimators=self._learner.estimators_ if isinstance(self._learner, MultiOutputRegressor) else self._learner, + objective=self._learner.objective + if not isinstance(self._learner, MultiOutputRegressor) else self._learner.estimators_[0].objective, + booster=self._learner.get_booster() if not isinstance(self._learner, MultiOutputRegressor) else [ + estimator.get_booster() for estimator in self._learner.estimators_], + multi_output_estimator_dict=self._multi_output_estimator_dict + if isinstance(self._learner, MultiOutputRegressor) else {}, + target_columns_metadata=self._target_columns_metadata, + ) + + def set_params(self, *, params: Params) -> None: + if not all(params[param] is not None for param in + ['estimators', 'booster', 'objective', 'multi_output_estimator_dict', 'target_columns_metadata']): + self._learner = None + else: + if isinstance(self._learner, MultiOutputRegressor): + self._learner.estimators_ = params['estimators'] + self._multi_output_estimator_dict = params['multi_output_estimator_dict'] + else: + self._create_learner() + self._learner.set_params(_Booster=params['booster'], objective=params['objective']) + self._target_columns_metadata = params['target_columns_metadata'] + + def __getstate__(self) -> dict: + state = super().__getstate__() + + # Random state is not part of the "Params", but it is part of the state we want to + # pickle and unpickle to have full reproducibility. So we have to add it ourselves here. + # This is also difference between pickling/unpickling and "get_params"/"set_params". + # The later saves only the model state which is useful to produce at a later time, but + # if we want to also reproduce the exact sequence of values, we should be using pickling. 
+ state['random_state'] = self._random_state + + return state + + def __setstate__(self, state: dict) -> None: + super().__setstate__(state) + + self._random_state = state['random_state'] + + def _can_use_inputs_column(self, inputs_metadata: metadata_base.DataMetadata, column_index: int) -> bool: + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + if not d3m_utils.is_numeric(column_metadata['structural_type']): + return False + + return 'https://metadata.datadrivendiscovery.org/types/Attribute' in column_metadata.get('semantic_types', []) + + def _get_inputs_columns(self, inputs_metadata: metadata_base.DataMetadata) -> List[int]: + def can_use_column(column_index: int) -> bool: + return self._can_use_inputs_column(inputs_metadata, column_index) + + columns_to_use, columns_not_to_use = base_utils.get_columns_to_use( + inputs_metadata, + self.hyperparams['use_inputs_columns'], + self.hyperparams['exclude_inputs_columns'], + can_use_column, + ) + + if not columns_to_use: + raise ValueError("No inputs columns.") + + if self.hyperparams['use_inputs_columns'] and columns_not_to_use: + self.logger.warning("Not all specified inputs columns can used. Skipping columns: %(columns)s", { + 'columns': columns_not_to_use, + }) + + return columns_to_use + + def _can_use_outputs_column(self, outputs_metadata: metadata_base.DataMetadata, column_index: int) -> bool: + column_metadata = outputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + return 'https://metadata.datadrivendiscovery.org/types/TrueTarget' in column_metadata.get('semantic_types', []) + + def _get_outputs_columns(self, outputs_metadata: metadata_base.DataMetadata) -> List[int]: + def can_use_column(column_index: int) -> bool: + return self._can_use_outputs_column(outputs_metadata, column_index) + + columns_to_use, columns_not_to_use = base_utils.get_columns_to_use( + outputs_metadata, + self.hyperparams['use_outputs_columns'], + self.hyperparams['exclude_outputs_columns'], + can_use_column, + ) + + if not columns_to_use: + raise ValueError("No outputs columns.") + + if self.hyperparams['use_outputs_columns'] and columns_not_to_use: + self.logger.warning("Not all specified outputs columns can used. 
Skipping columns: %(columns)s", { + 'columns': columns_not_to_use, + }) + + return columns_to_use + + def _select_inputs_columns(self, inputs: Inputs) -> Tuple[Inputs, List[int]]: + columns_to_use = self._get_inputs_columns(inputs.metadata) + + return inputs.select_columns(columns_to_use), columns_to_use + + def _select_outputs_columns(self, outputs: Outputs) -> Tuple[Outputs, List[int]]: + columns_to_use = self._get_outputs_columns(outputs.metadata) + + return outputs.select_columns(columns_to_use), columns_to_use diff --git a/tods/common-primitives/entry_points.ini b/tods/common-primitives/entry_points.ini new file mode 100644 index 0000000..5dac201 --- /dev/null +++ b/tods/common-primitives/entry_points.ini @@ -0,0 +1,63 @@ +[d3m.primitives] +data_preprocessing.one_hot_encoder.MakerCommon = common_primitives.one_hot_maker:OneHotMakerPrimitive +data_preprocessing.one_hot_encoder.PandasCommon = common_primitives.pandas_onehot_encoder:PandasOneHotEncoderPrimitive +data_transformation.extract_columns.Common = common_primitives.extract_columns:ExtractColumnsPrimitive +data_transformation.extract_columns_by_semantic_types.Common = common_primitives.extract_columns_semantic_types:ExtractColumnsBySemanticTypesPrimitive +data_transformation.extract_columns_by_structural_types.Common = common_primitives.extract_columns_structural_types:ExtractColumnsByStructuralTypesPrimitive +data_transformation.remove_columns.Common = common_primitives.remove_columns:RemoveColumnsPrimitive +data_transformation.remove_duplicate_columns.Common = common_primitives.remove_duplicate_columns:RemoveDuplicateColumnsPrimitive +data_transformation.horizontal_concat.DataFrameCommon = common_primitives.horizontal_concat:HorizontalConcatPrimitive +data_transformation.cast_to_type.Common = common_primitives.cast_to_type:CastToTypePrimitive +data_transformation.column_parser.Common = common_primitives.column_parser:ColumnParserPrimitive +data_transformation.construct_predictions.Common = common_primitives.construct_predictions:ConstructPredictionsPrimitive +data_transformation.dataframe_to_ndarray.Common = common_primitives.dataframe_to_ndarray:DataFrameToNDArrayPrimitive +data_transformation.ndarray_to_dataframe.Common = common_primitives.ndarray_to_dataframe:NDArrayToDataFramePrimitive +data_transformation.dataframe_to_list.Common = common_primitives.dataframe_to_list:DataFrameToListPrimitive +data_transformation.list_to_dataframe.Common = common_primitives.list_to_dataframe:ListToDataFramePrimitive +data_transformation.ndarray_to_list.Common = common_primitives.ndarray_to_list:NDArrayToListPrimitive +data_transformation.list_to_ndarray.Common = common_primitives.list_to_ndarray:ListToNDArrayPrimitive +data_transformation.stack_ndarray_column.Common = common_primitives.stack_ndarray_column:StackNDArrayColumnPrimitive +data_transformation.add_semantic_types.Common = common_primitives.add_semantic_types:AddSemanticTypesPrimitive +data_transformation.remove_semantic_types.Common = common_primitives.remove_semantic_types:RemoveSemanticTypesPrimitive +data_transformation.replace_semantic_types.Common = common_primitives.replace_semantic_types:ReplaceSemanticTypesPrimitive +data_transformation.denormalize.Common = common_primitives.denormalize:DenormalizePrimitive +data_transformation.datetime_field_compose.Common = common_primitives.datetime_field_compose:DatetimeFieldComposePrimitive +data_transformation.grouping_field_compose.Common = common_primitives.grouping_field_compose:GroupingFieldComposePrimitive 
+data_transformation.dataset_to_dataframe.Common = common_primitives.dataset_to_dataframe:DatasetToDataFramePrimitive +data_transformation.cut_audio.Common = common_primitives.cut_audio:CutAudioPrimitive +data_transformation.rename_duplicate_name.DataFrameCommon = common_primitives.rename_duplicate_columns:RenameDuplicateColumnsPrimitive +#data_transformation.normalize_column_references.Common = common_primitives.normalize_column_references:NormalizeColumnReferencesPrimitive +#data_transformation.normalize_graphs.Common = common_primitives.normalize_graphs:NormalizeGraphsPrimitive +data_transformation.ravel.DataFrameRowCommon = common_primitives.ravel:RavelAsRowPrimitive +data_preprocessing.label_encoder.Common = common_primitives.unseen_label_encoder:UnseenLabelEncoderPrimitive +data_preprocessing.label_decoder.Common = common_primitives.unseen_label_decoder:UnseenLabelDecoderPrimitive +data_preprocessing.image_reader.Common = common_primitives.dataframe_image_reader:DataFrameImageReaderPrimitive +data_preprocessing.text_reader.Common = common_primitives.text_reader:TextReaderPrimitive +data_preprocessing.video_reader.Common = common_primitives.video_reader:VideoReaderPrimitive +data_preprocessing.csv_reader.Common = common_primitives.csv_reader:CSVReaderPrimitive +data_preprocessing.audio_reader.Common = common_primitives.audio_reader:AudioReaderPrimitive +data_preprocessing.regex_filter.Common = common_primitives.regex_filter:RegexFilterPrimitive +data_preprocessing.term_filter.Common = common_primitives.term_filter:TermFilterPrimitive +data_preprocessing.numeric_range_filter.Common = common_primitives.numeric_range_filter:NumericRangeFilterPrimitive +data_preprocessing.datetime_range_filter.Common = common_primitives.datetime_range_filter:DatetimeRangeFilterPrimitive +data_preprocessing.dataset_sample.Common = common_primitives.dataset_sample:DatasetSamplePrimitive +#data_preprocessing.time_interval_transform.Common = common_primitives.time_interval_transform:TimeIntervalTransformPrimitive +data_cleaning.tabular_extractor.Common = common_primitives.tabular_extractor:AnnotatedTabularExtractorPrimitive +evaluation.redact_columns.Common = common_primitives.redact_columns:RedactColumnsPrimitive +evaluation.kfold_dataset_split.Common = common_primitives.kfold_split:KFoldDatasetSplitPrimitive +evaluation.kfold_time_series_split.Common = common_primitives.kfold_split_timeseries:KFoldTimeSeriesSplitPrimitive +evaluation.train_score_dataset_split.Common = common_primitives.train_score_split:TrainScoreDatasetSplitPrimitive +evaluation.no_split_dataset_split.Common = common_primitives.no_split:NoSplitDatasetSplitPrimitive +evaluation.fixed_split_dataset_split.Commmon = common_primitives.fixed_split:FixedSplitDatasetSplitPrimitive +classification.random_forest.Common = common_primitives.random_forest:RandomForestClassifierPrimitive +classification.light_gbm.Common = common_primitives.lgbm_classifier:LightGBMClassifierPrimitive +classification.xgboost_gbtree.Common = common_primitives.xgboost_gbtree:XGBoostGBTreeClassifierPrimitive +classification.xgboost_dart.Common = common_primitives.xgboost_dart:XGBoostDartClassifierPrimitive +regression.xgboost_gbtree.Common = common_primitives.xgboost_regressor:XGBoostGBTreeRegressorPrimitive +schema_discovery.profiler.Common = common_primitives.simple_profiler:SimpleProfilerPrimitive +operator.column_map.Common = common_primitives.column_map:DataFrameColumnMapPrimitive +operator.dataset_map.DataFrameCommon = 
common_primitives.dataset_map:DataFrameDatasetMapPrimitive +data_preprocessing.flatten.DataFrameCommon = common_primitives.dataframe_flatten:DataFrameFlattenPrimitive +metalearning.metafeature_extractor.Common = common_primitives.compute_metafeatures:ComputeMetafeaturesPrimitive +data_augmentation.datamart_augmentation.Common = common_primitives.datamart_augment:DataMartAugmentPrimitive +data_augmentation.datamart_download.Common = common_primitives.datamart_download:DataMartDownloadPrimitive diff --git a/tods/common-primitives/git-add.sh b/tods/common-primitives/git-add.sh new file mode 100755 index 0000000..896ab85 --- /dev/null +++ b/tods/common-primitives/git-add.sh @@ -0,0 +1,5 @@ +#!/bin/bash -e + +# This requires git LFS 2.9.0 or newer. + +find * -type f -size +100k -exec git lfs track --filename '{}' + diff --git a/tods/common-primitives/git-check.sh b/tods/common-primitives/git-check.sh new file mode 100755 index 0000000..8a6b468 --- /dev/null +++ b/tods/common-primitives/git-check.sh @@ -0,0 +1,21 @@ +#!/bin/bash -e + +if git rev-list --objects --all \ +| git cat-file --batch-check='%(objecttype) %(objectname) %(objectsize) %(rest)' \ +| sed -n 's/^blob //p' \ +| awk '$2 >= 100*(2^10)' \ +| awk '{print $3}' \ +| egrep -v '(^|/).gitattributes$' ; then + echo "Repository contains committed objects larger than 100 KB." + exit 1 +fi + +if git lfs ls-files --name-only | xargs -r stat -c '%s %n' | awk '$1 < 100*(2^10)' | awk '{print $2}' | grep . ; then + echo "Repository contains LFS objects smaller than 100 KB." + exit 1 +fi + +if git lfs ls-files --name-only | xargs -r stat -c '%s %n' | awk '$1 >= 2*(2^30)' | awk '{print $2}' | grep . ; then + echo "Repository contains LFS objects not smaller than 2 GB." + exit 1 +fi diff --git a/tods/common-primitives/list_primitives.py b/tods/common-primitives/list_primitives.py new file mode 100755 index 0000000..0d5da96 --- /dev/null +++ b/tods/common-primitives/list_primitives.py @@ -0,0 +1,32 @@ +#!/usr/bin/env python3 + +import argparse +import configparser +import re + + +class CaseSensitiveConfigParser(configparser.ConfigParser): + optionxform = staticmethod(str) + + +parser = argparse.ArgumentParser(description='List enabled common primitives.') +group = parser.add_mutually_exclusive_group(required=True) +group.add_argument('--suffix', action='store_true', help='list primitive suffixes of all enabled common primitives') +group.add_argument('--python', action='store_true', help='list Python paths of all enabled common primitives') +group.add_argument('--files', action='store_true', help='list file paths of all enabled common primitives') + +args = parser.parse_args() + +entry_points = CaseSensitiveConfigParser() +entry_points.read('entry_points.ini') + +for primitive_suffix, primitive_path in entry_points.items('d3m.primitives'): + if args.python: + print("d3m.primitives.{primitive_suffix}".format(primitive_suffix=primitive_suffix)) + elif args.suffix: + print(primitive_suffix) + elif args.files: + primitive_path = re.sub(':.+$', '', primitive_path) + primitive_path = re.sub('\.', '/', primitive_path) + print("{primitive_path}.py".format(primitive_path=primitive_path)) + diff --git a/tods/common-primitives/pipeline_runs/classification.light_gbm.DataFrameCommon/1.yaml.gz b/tods/common-primitives/pipeline_runs/classification.light_gbm.DataFrameCommon/1.yaml.gz new file mode 100644 index 0000000..0529242 Binary files /dev/null and b/tods/common-primitives/pipeline_runs/classification.light_gbm.DataFrameCommon/1.yaml.gz differ diff --git 
a/tods/common-primitives/pipeline_runs/classification.random_forest.DataFrameCommon/1.yaml.gz b/tods/common-primitives/pipeline_runs/classification.random_forest.DataFrameCommon/1.yaml.gz new file mode 100644 index 0000000..b742e77 Binary files /dev/null and b/tods/common-primitives/pipeline_runs/classification.random_forest.DataFrameCommon/1.yaml.gz differ diff --git a/tods/common-primitives/pipeline_runs/classification.random_forest.DataFrameCommon/pipeline_run_extract_structural_types.yml.gz b/tods/common-primitives/pipeline_runs/classification.random_forest.DataFrameCommon/pipeline_run_extract_structural_types.yml.gz new file mode 120000 index 0000000..91f49b3 --- /dev/null +++ b/tods/common-primitives/pipeline_runs/classification.random_forest.DataFrameCommon/pipeline_run_extract_structural_types.yml.gz @@ -0,0 +1 @@ +../data_transformation.extract_columns_by_structural_types.Common/pipeline_run.yml.gz \ No newline at end of file diff --git a/tods/common-primitives/pipeline_runs/classification.xgboost_dart.DataFrameCommon/1.yaml.gz b/tods/common-primitives/pipeline_runs/classification.xgboost_dart.DataFrameCommon/1.yaml.gz new file mode 100644 index 0000000..144a9cf Binary files /dev/null and b/tods/common-primitives/pipeline_runs/classification.xgboost_dart.DataFrameCommon/1.yaml.gz differ diff --git a/tods/common-primitives/pipeline_runs/classification.xgboost_gbtree.DataFrameCommon/1.yaml.gz b/tods/common-primitives/pipeline_runs/classification.xgboost_gbtree.DataFrameCommon/1.yaml.gz new file mode 100644 index 0000000..1bc8198 Binary files /dev/null and b/tods/common-primitives/pipeline_runs/classification.xgboost_gbtree.DataFrameCommon/1.yaml.gz differ diff --git a/tods/common-primitives/pipeline_runs/data_augmentation.datamart_augmentation.Common/2.yaml.gz b/tods/common-primitives/pipeline_runs/data_augmentation.datamart_augmentation.Common/2.yaml.gz new file mode 100644 index 0000000..e449db8 Binary files /dev/null and b/tods/common-primitives/pipeline_runs/data_augmentation.datamart_augmentation.Common/2.yaml.gz differ diff --git a/tods/common-primitives/pipeline_runs/data_preprocessing.dataset_sample.Common/1.yaml.gz b/tods/common-primitives/pipeline_runs/data_preprocessing.dataset_sample.Common/1.yaml.gz new file mode 100644 index 0000000..824ea25 Binary files /dev/null and b/tods/common-primitives/pipeline_runs/data_preprocessing.dataset_sample.Common/1.yaml.gz differ diff --git a/tods/common-primitives/pipeline_runs/data_preprocessing.one_hot_encoder.PandasCommon/pipeline_run_extract_structural_types.yml.gz b/tods/common-primitives/pipeline_runs/data_preprocessing.one_hot_encoder.PandasCommon/pipeline_run_extract_structural_types.yml.gz new file mode 120000 index 0000000..91f49b3 --- /dev/null +++ b/tods/common-primitives/pipeline_runs/data_preprocessing.one_hot_encoder.PandasCommon/pipeline_run_extract_structural_types.yml.gz @@ -0,0 +1 @@ +../data_transformation.extract_columns_by_structural_types.Common/pipeline_run.yml.gz \ No newline at end of file diff --git a/tods/common-primitives/pipeline_runs/data_transformation.column_parser.DataFrameCommon/1.yaml.gz b/tods/common-primitives/pipeline_runs/data_transformation.column_parser.DataFrameCommon/1.yaml.gz new file mode 120000 index 0000000..b531842 --- /dev/null +++ b/tods/common-primitives/pipeline_runs/data_transformation.column_parser.DataFrameCommon/1.yaml.gz @@ -0,0 +1 @@ +../classification.light_gbm.DataFrameCommon/1.yaml.gz \ No newline at end of file diff --git 
a/tods/common-primitives/pipeline_runs/data_transformation.column_parser.DataFrameCommon/pipeline_run_extract_structural_types.yml.gz b/tods/common-primitives/pipeline_runs/data_transformation.column_parser.DataFrameCommon/pipeline_run_extract_structural_types.yml.gz new file mode 120000 index 0000000..91f49b3 --- /dev/null +++ b/tods/common-primitives/pipeline_runs/data_transformation.column_parser.DataFrameCommon/pipeline_run_extract_structural_types.yml.gz @@ -0,0 +1 @@ +../data_transformation.extract_columns_by_structural_types.Common/pipeline_run.yml.gz \ No newline at end of file diff --git a/tods/common-primitives/pipeline_runs/data_transformation.column_parser.DataFrameCommon/pipeline_run_group_field_compose.yml.gz b/tods/common-primitives/pipeline_runs/data_transformation.column_parser.DataFrameCommon/pipeline_run_group_field_compose.yml.gz new file mode 120000 index 0000000..0a4dd35 --- /dev/null +++ b/tods/common-primitives/pipeline_runs/data_transformation.column_parser.DataFrameCommon/pipeline_run_group_field_compose.yml.gz @@ -0,0 +1 @@ +../data_transformation.grouping_field_compose.Common/pipeline_run.yml.gz \ No newline at end of file diff --git a/tods/common-primitives/pipeline_runs/data_transformation.construct_predictions.DataFrameCommon/1.yaml.gz b/tods/common-primitives/pipeline_runs/data_transformation.construct_predictions.DataFrameCommon/1.yaml.gz new file mode 120000 index 0000000..b531842 --- /dev/null +++ b/tods/common-primitives/pipeline_runs/data_transformation.construct_predictions.DataFrameCommon/1.yaml.gz @@ -0,0 +1 @@ +../classification.light_gbm.DataFrameCommon/1.yaml.gz \ No newline at end of file diff --git a/tods/common-primitives/pipeline_runs/data_transformation.construct_predictions.DataFrameCommon/pipeline_run_extract_structural_types.yml.gz b/tods/common-primitives/pipeline_runs/data_transformation.construct_predictions.DataFrameCommon/pipeline_run_extract_structural_types.yml.gz new file mode 120000 index 0000000..91f49b3 --- /dev/null +++ b/tods/common-primitives/pipeline_runs/data_transformation.construct_predictions.DataFrameCommon/pipeline_run_extract_structural_types.yml.gz @@ -0,0 +1 @@ +../data_transformation.extract_columns_by_structural_types.Common/pipeline_run.yml.gz \ No newline at end of file diff --git a/tods/common-primitives/pipeline_runs/data_transformation.dataset_to_dataframe.Common/1.yaml.gz b/tods/common-primitives/pipeline_runs/data_transformation.dataset_to_dataframe.Common/1.yaml.gz new file mode 120000 index 0000000..b531842 --- /dev/null +++ b/tods/common-primitives/pipeline_runs/data_transformation.dataset_to_dataframe.Common/1.yaml.gz @@ -0,0 +1 @@ +../classification.light_gbm.DataFrameCommon/1.yaml.gz \ No newline at end of file diff --git a/tods/common-primitives/pipeline_runs/data_transformation.dataset_to_dataframe.Common/pipeline_run_extract_structural_types.yml.gz b/tods/common-primitives/pipeline_runs/data_transformation.dataset_to_dataframe.Common/pipeline_run_extract_structural_types.yml.gz new file mode 120000 index 0000000..91f49b3 --- /dev/null +++ b/tods/common-primitives/pipeline_runs/data_transformation.dataset_to_dataframe.Common/pipeline_run_extract_structural_types.yml.gz @@ -0,0 +1 @@ +../data_transformation.extract_columns_by_structural_types.Common/pipeline_run.yml.gz \ No newline at end of file diff --git a/tods/common-primitives/pipeline_runs/data_transformation.dataset_to_dataframe.Common/pipeline_run_group_field_compose.yml.gz 
b/tods/common-primitives/pipeline_runs/data_transformation.dataset_to_dataframe.Common/pipeline_run_group_field_compose.yml.gz new file mode 120000 index 0000000..0a4dd35 --- /dev/null +++ b/tods/common-primitives/pipeline_runs/data_transformation.dataset_to_dataframe.Common/pipeline_run_group_field_compose.yml.gz @@ -0,0 +1 @@ +../data_transformation.grouping_field_compose.Common/pipeline_run.yml.gz \ No newline at end of file diff --git a/tods/common-primitives/pipeline_runs/data_transformation.extract_columns_by_semantic_types.DataFrameCommon/1.yaml.gz b/tods/common-primitives/pipeline_runs/data_transformation.extract_columns_by_semantic_types.DataFrameCommon/1.yaml.gz new file mode 120000 index 0000000..b531842 --- /dev/null +++ b/tods/common-primitives/pipeline_runs/data_transformation.extract_columns_by_semantic_types.DataFrameCommon/1.yaml.gz @@ -0,0 +1 @@ +../classification.light_gbm.DataFrameCommon/1.yaml.gz \ No newline at end of file diff --git a/tods/common-primitives/pipeline_runs/data_transformation.extract_columns_by_semantic_types.DataFrameCommon/pipeline_run_extract_structural_types.yml.gz b/tods/common-primitives/pipeline_runs/data_transformation.extract_columns_by_semantic_types.DataFrameCommon/pipeline_run_extract_structural_types.yml.gz new file mode 120000 index 0000000..91f49b3 --- /dev/null +++ b/tods/common-primitives/pipeline_runs/data_transformation.extract_columns_by_semantic_types.DataFrameCommon/pipeline_run_extract_structural_types.yml.gz @@ -0,0 +1 @@ +../data_transformation.extract_columns_by_structural_types.Common/pipeline_run.yml.gz \ No newline at end of file diff --git a/tods/common-primitives/pipeline_runs/data_transformation.extract_columns_by_structural_types.Common/pipeline_run.yml.gz b/tods/common-primitives/pipeline_runs/data_transformation.extract_columns_by_structural_types.Common/pipeline_run.yml.gz new file mode 100644 index 0000000..3e1ee3c Binary files /dev/null and b/tods/common-primitives/pipeline_runs/data_transformation.extract_columns_by_structural_types.Common/pipeline_run.yml.gz differ diff --git a/tods/common-primitives/pipeline_runs/data_transformation.grouping_field_compose.Common/pipeline_run.yml.gz b/tods/common-primitives/pipeline_runs/data_transformation.grouping_field_compose.Common/pipeline_run.yml.gz new file mode 100644 index 0000000..8f8bdf0 Binary files /dev/null and b/tods/common-primitives/pipeline_runs/data_transformation.grouping_field_compose.Common/pipeline_run.yml.gz differ diff --git a/tods/common-primitives/pipeline_runs/data_transformation.horizontal_concat.DataFrameConcat/1.yaml.gz b/tods/common-primitives/pipeline_runs/data_transformation.horizontal_concat.DataFrameConcat/1.yaml.gz new file mode 120000 index 0000000..cc4d8fa --- /dev/null +++ b/tods/common-primitives/pipeline_runs/data_transformation.horizontal_concat.DataFrameConcat/1.yaml.gz @@ -0,0 +1 @@ +../data_preprocessing.one_hot_encoder.MakerCommon/1.yaml.gz \ No newline at end of file diff --git a/tods/common-primitives/pipeline_runs/data_transformation.remove_columns.Common/pipeline_run_extract_structural_types.yml.gz b/tods/common-primitives/pipeline_runs/data_transformation.remove_columns.Common/pipeline_run_extract_structural_types.yml.gz new file mode 120000 index 0000000..91f49b3 --- /dev/null +++ b/tods/common-primitives/pipeline_runs/data_transformation.remove_columns.Common/pipeline_run_extract_structural_types.yml.gz @@ -0,0 +1 @@ +../data_transformation.extract_columns_by_structural_types.Common/pipeline_run.yml.gz \ No newline at end of 
file diff --git a/tods/common-primitives/pipeline_runs/regression.xgboost_gbtree.DataFrameCommon/1.yml b/tods/common-primitives/pipeline_runs/regression.xgboost_gbtree.DataFrameCommon/1.yml new file mode 100644 index 0000000..6484df1 --- /dev/null +++ b/tods/common-primitives/pipeline_runs/regression.xgboost_gbtree.DataFrameCommon/1.yml @@ -0,0 +1,4729 @@ +context: TESTING +datasets: +- digest: 36ab84076f277efd634abdabc2d094b09e079aa2c99253d433647c062972171b + id: 26_radon_seed_dataset_TRAIN +end: '2019-06-19T09:54:49.049658Z' +environment: + engine_version: 2019.6.7 + id: 6fdad0c4-dcb1-541d-a2a7-d2d9590c26dd + reference_engine_version: 2019.6.7 + resources: + cpu: + constraints: + cpu_shares: 1024 + devices: + - name: Intel Core Processor (Broadwell) + - name: Intel Core Processor (Broadwell) + - name: Intel Core Processor (Broadwell) + - name: Intel Core Processor (Broadwell) + - name: Intel Core Processor (Broadwell) + - name: Intel Core Processor (Broadwell) + - name: Intel Core Processor (Broadwell) + - name: Intel Core Processor (Broadwell) + logical_present: 8 + physical_present: 8 + memory: + total_memory: 25281880064 + worker_id: f5ffb488-a883-509d-ad4f-5e819913cb14 +id: 67cc5311-80d6-5f1f-96f6-7f7b9d1308a0 +pipeline: + digest: 79e82bee63a0db3dd75d41de0caa2054b860a8211fa9895a061fdb8f9f1c444d + id: 0f636602-6299-411b-9873-4b974cd393ba +problem: + digest: 01ab113ff802b57fe872f7b4e4422789921d033c41bc8ad0bd6e0d041291ed6f + id: 26_radon_seed_problem +random_seed: 0 +run: + phase: FIT + results: + predictions: + header: + - d3mIndex + - log_radon + values: + - - 0 + - 1 + - 2 + - 3 + - 4 + - 6 + - 7 + - 8 + - 9 + - 10 + - 11 + - 12 + - 13 + - 14 + - 15 + - 16 + - 17 + - 18 + - 19 + - 20 + - 21 + - 22 + - 24 + - 25 + - 26 + - 27 + - 28 + - 29 + - 32 + - 34 + - 35 + - 36 + - 37 + - 38 + - 40 + - 41 + - 42 + - 43 + - 45 + - 46 + - 47 + - 48 + - 50 + - 51 + - 52 + - 53 + - 55 + - 56 + - 57 + - 58 + - 59 + - 61 + - 62 + - 64 + - 68 + - 69 + - 71 + - 73 + - 74 + - 75 + - 79 + - 80 + - 81 + - 83 + - 84 + - 85 + - 87 + - 88 + - 89 + - 90 + - 91 + - 92 + - 93 + - 94 + - 95 + - 98 + - 99 + - 100 + - 101 + - 102 + - 103 + - 104 + - 105 + - 106 + - 108 + - 111 + - 112 + - 113 + - 114 + - 115 + - 116 + - 117 + - 118 + - 119 + - 121 + - 122 + - 123 + - 124 + - 125 + - 126 + - 127 + - 128 + - 129 + - 130 + - 131 + - 132 + - 133 + - 134 + - 135 + - 138 + - 140 + - 142 + - 143 + - 144 + - 145 + - 146 + - 147 + - 148 + - 149 + - 150 + - 151 + - 152 + - 153 + - 154 + - 155 + - 156 + - 157 + - 158 + - 159 + - 160 + - 161 + - 162 + - 163 + - 164 + - 166 + - 167 + - 169 + - 170 + - 171 + - 172 + - 173 + - 175 + - 176 + - 177 + - 178 + - 179 + - 180 + - 181 + - 182 + - 183 + - 184 + - 185 + - 186 + - 187 + - 188 + - 189 + - 190 + - 191 + - 193 + - 194 + - 195 + - 196 + - 197 + - 199 + - 200 + - 201 + - 202 + - 203 + - 204 + - 205 + - 206 + - 207 + - 211 + - 212 + - 214 + - 216 + - 217 + - 219 + - 220 + - 221 + - 222 + - 223 + - 224 + - 225 + - 226 + - 228 + - 229 + - 230 + - 232 + - 233 + - 234 + - 236 + - 237 + - 238 + - 240 + - 241 + - 242 + - 243 + - 245 + - 246 + - 247 + - 248 + - 249 + - 251 + - 252 + - 253 + - 255 + - 256 + - 257 + - 258 + - 260 + - 262 + - 263 + - 264 + - 267 + - 268 + - 269 + - 270 + - 271 + - 272 + - 273 + - 274 + - 276 + - 277 + - 278 + - 279 + - 280 + - 282 + - 283 + - 284 + - 285 + - 287 + - 288 + - 289 + - 291 + - 292 + - 293 + - 294 + - 295 + - 297 + - 298 + - 301 + - 302 + - 303 + - 304 + - 307 + - 308 + - 310 + - 312 + - 313 + - 315 + - 316 + - 317 + - 318 + - 319 + - 320 + - 
322 + - 324 + - 325 + - 327 + - 328 + - 329 + - 330 + - 335 + - 336 + - 337 + - 338 + - 339 + - 340 + - 341 + - 342 + - 343 + - 345 + - 346 + - 347 + - 348 + - 349 + - 352 + - 353 + - 354 + - 355 + - 356 + - 358 + - 361 + - 362 + - 363 + - 364 + - 366 + - 367 + - 368 + - 369 + - 370 + - 371 + - 372 + - 373 + - 374 + - 375 + - 376 + - 378 + - 379 + - 382 + - 383 + - 384 + - 385 + - 386 + - 387 + - 388 + - 389 + - 390 + - 391 + - 392 + - 393 + - 395 + - 396 + - 397 + - 398 + - 399 + - 400 + - 401 + - 402 + - 403 + - 404 + - 405 + - 406 + - 407 + - 409 + - 410 + - 411 + - 412 + - 413 + - 414 + - 415 + - 418 + - 419 + - 420 + - 421 + - 422 + - 423 + - 424 + - 425 + - 426 + - 427 + - 428 + - 431 + - 432 + - 434 + - 435 + - 437 + - 438 + - 441 + - 442 + - 443 + - 444 + - 445 + - 446 + - 447 + - 449 + - 450 + - 451 + - 452 + - 453 + - 454 + - 455 + - 456 + - 457 + - 458 + - 459 + - 460 + - 461 + - 462 + - 463 + - 465 + - 466 + - 467 + - 469 + - 470 + - 471 + - 472 + - 473 + - 474 + - 475 + - 476 + - 477 + - 478 + - 479 + - 480 + - 481 + - 482 + - 484 + - 485 + - 487 + - 488 + - 489 + - 491 + - 492 + - 494 + - 495 + - 496 + - 498 + - 499 + - 500 + - 502 + - 503 + - 504 + - 505 + - 506 + - 507 + - 508 + - 509 + - 510 + - 511 + - 512 + - 513 + - 515 + - 516 + - 517 + - 518 + - 519 + - 520 + - 521 + - 522 + - 524 + - 525 + - 528 + - 529 + - 531 + - 532 + - 534 + - 535 + - 537 + - 538 + - 540 + - 541 + - 543 + - 544 + - 546 + - 547 + - 548 + - 550 + - 553 + - 555 + - 556 + - 557 + - 559 + - 560 + - 561 + - 562 + - 563 + - 564 + - 565 + - 566 + - 567 + - 568 + - 569 + - 570 + - 571 + - 572 + - 573 + - 574 + - 575 + - 576 + - 577 + - 578 + - 579 + - 580 + - 581 + - 582 + - 583 + - 586 + - 587 + - 588 + - 589 + - 590 + - 592 + - 593 + - 596 + - 600 + - 602 + - 603 + - 604 + - 605 + - 606 + - 607 + - 608 + - 609 + - 610 + - 612 + - 613 + - 614 + - 615 + - 618 + - 619 + - 620 + - 621 + - 622 + - 623 + - 624 + - 625 + - 626 + - 627 + - 628 + - 629 + - 630 + - 631 + - 632 + - 634 + - 636 + - 637 + - 639 + - 640 + - 641 + - 642 + - 643 + - 644 + - 645 + - 646 + - 647 + - 648 + - 649 + - 650 + - 651 + - 652 + - 653 + - 654 + - 657 + - 658 + - 659 + - 660 + - 661 + - 662 + - 663 + - 664 + - 665 + - 667 + - 670 + - 671 + - 672 + - 673 + - 674 + - 675 + - 676 + - 677 + - 678 + - 679 + - 681 + - 682 + - 683 + - 684 + - 686 + - 689 + - 690 + - 691 + - 692 + - 693 + - 694 + - 695 + - 696 + - 697 + - 698 + - 699 + - 700 + - 701 + - 702 + - 705 + - 707 + - 709 + - 711 + - 712 + - 713 + - 716 + - 717 + - 718 + - 719 + - 720 + - 721 + - 722 + - 724 + - 725 + - 726 + - 727 + - 728 + - 729 + - 730 + - 731 + - 732 + - 733 + - 735 + - 736 + - 737 + - 738 + - 740 + - 741 + - 742 + - 743 + - 744 + - 746 + - 747 + - 748 + - 749 + - 750 + - 751 + - 752 + - 754 + - 756 + - 757 + - 758 + - 761 + - 762 + - 763 + - 764 + - 765 + - 766 + - 768 + - 769 + - 770 + - 771 + - 772 + - 773 + - 774 + - 775 + - 776 + - 777 + - 778 + - 779 + - 780 + - 781 + - 782 + - 783 + - 784 + - 785 + - 786 + - 787 + - 789 + - 791 + - 792 + - 796 + - 797 + - 799 + - 800 + - 801 + - 802 + - 803 + - 804 + - 805 + - 806 + - 807 + - 810 + - 811 + - 812 + - 813 + - 815 + - 816 + - 818 + - 819 + - 820 + - 821 + - 822 + - 824 + - 825 + - 826 + - 828 + - 829 + - 830 + - 831 + - 832 + - 833 + - 835 + - 836 + - 837 + - 838 + - 839 + - 840 + - 841 + - 842 + - 843 + - 844 + - 845 + - 846 + - 847 + - 849 + - 850 + - 851 + - 852 + - 853 + - 854 + - 855 + - 856 + - 857 + - 858 + - 859 + - 860 + - 861 + - 863 + - 865 + - 866 + - 867 + - 868 + - 870 + - 871 + - 873 + - 
874 + - 875 + - 876 + - 877 + - 878 + - 881 + - 883 + - 884 + - 885 + - 886 + - 887 + - 888 + - 889 + - 890 + - 891 + - 892 + - 894 + - 895 + - 896 + - 898 + - 899 + - 900 + - 901 + - 902 + - 903 + - 905 + - 906 + - 907 + - 909 + - 910 + - 911 + - 912 + - 915 + - 916 + - 917 + - 918 + - - 0.8330342769622803 + - 0.8330861330032349 + - 1.098989725112915 + - 0.0957428514957428 + - 1.1612895727157593 + - 0.4703049063682556 + - 0.09569096565246582 + - -0.22225826978683472 + - 0.26259732246398926 + - 0.2628304660320282 + - 0.3366249203681946 + - 0.40562954545021057 + - -0.6918290853500366 + - 0.18271362781524658 + - 1.522369146347046 + - 0.3367617130279541 + - 0.7891305685043335 + - 1.7914975881576538 + - 1.2189394235610962 + - 0.6415701508522034 + - 1.7050760984420776 + - 1.853300929069519 + - 1.8837662935256958 + - 1.161426305770874 + - 1.931780457496643 + - 1.974898338317871 + - 2.062042713165283 + - 1.665402889251709 + - 1.0651249885559082 + - 0.528777003288269 + - 1.4584600925445557 + - 1.704390048980713 + - 1.4109606742858887 + - 0.8739264011383057 + - 0.4055776596069336 + - 1.2182621955871582 + - 1.0990341901779175 + - 0.6417514085769653 + - 0.917670488357544 + - 0.1827174723148346 + - 0.8330342769622803 + - -0.3561854958534241 + - 1.098937749862671 + - 0.8333100080490112 + - 0.5901779532432556 + - 0.4056333899497986 + - 0.641793966293335 + - 0.2626492381095886 + - 1.4767398834228516 + - 1.523003339767456 + - 1.8526148796081543 + - 1.7553712129592896 + - 0.8330342769622803 + - 1.54994797706604 + - 1.098989725112915 + - 1.098989725112915 + - 1.6289032697677612 + - 1.6289032697677612 + - 2.5725338459014893 + - 1.979255199432373 + - 2.264638900756836 + - 1.8087071180343628 + - 1.360558271408081 + - 0.6414333581924438 + - 1.9379489421844482 + - 1.5710643529891968 + - 0.955110490322113 + - 1.9247326850891113 + - 1.4115099906921387 + - 2.3232131004333496 + - 0.8329493999481201 + - 0.641518235206604 + - 1.2587052583694458 + - 1.7441229820251465 + - 1.4767398834228516 + - 1.4584600925445557 + - -0.10479313135147095 + - 0.7409548759460449 + - 0.5288288593292236 + - 2.570702075958252 + - 2.694930076599121 + - 1.570378303527832 + - 2.273859977722168 + - -2.29718017578125 + - 2.0223045349121094 + - 1.4111419916152954 + - 2.062042713165283 + - 0.40562954545021057 + - 2.3121652603149414 + - 2.246690034866333 + - -0.1049298644065857 + - 1.5065677165985107 + - 1.6289032697677612 + - 0.7889493107795715 + - 2.104417562484741 + - 0.0004925131797790527 + - 2.5689167976379395 + - 0.9938793778419495 + - 1.2792216539382935 + - 3.282301425933838 + - 0.4691665768623352 + - 2.5706443786621094 + - 2.1773343086242676 + - 2.984010934829712 + - 0.955195426940918 + - 2.2046756744384766 + - 2.579826831817627 + - 1.3101081848144531 + - 1.9379489421844482 + - 0.00035572052001953125 + - 1.0279760360717773 + - 1.931780457496643 + - 2.3884470462799072 + - -2.3005144596099854 + - 0.955195426940918 + - 0.641518235206604 + - 0.5288288593292236 + - 0.09569096565246582 + - 0.0006737411022186279 + - 1.0988528728485107 + - 1.5072537660598755 + - 0.4703049063682556 + - 1.4350025653839111 + - 0.9553766250610352 + - 1.92511785030365 + - 1.4772891998291016 + - 1.7177082300186157 + - 1.3099268674850464 + - 1.0651249885559082 + - 2.691326856613159 + - 1.9249485731124878 + - 2.1015706062316895 + - 0.9936981201171875 + - 1.0651249885559082 + - 0.5901779532432556 + - 0.7409104108810425 + - 0.4704861640930176 + - 2.274041175842285 + - 2.1045303344726562 + - 1.279273509979248 + - -0.10479313135147095 + - 1.195493221282959 + - 
2.387086868286133 + - 2.1045584678649902 + - 1.8532490730285645 + - 1.5822457075119019 + - 1.8085259199142456 + - 0.18271362781524658 + - 2.1773622035980225 + - 2.1765642166137695 + - 1.931780457496643 + - 0.8739264011383057 + - 0.5288288593292236 + - 1.0651249885559082 + - 1.8844523429870605 + - 0.5901779532432556 + - 1.54994797706604 + - 1.2188875675201416 + - 3.0550806522369385 + - 2.2266292572021484 + - 0.0004925131797790527 + - 1.617445707321167 + - 1.6289032697677612 + - 2.0370430946350098 + - 1.70457124710083 + - 1.310056209564209 + - 1.6176269054412842 + - 1.5705543756484985 + - 0.4056333899497986 + - 1.2587052583694458 + - 1.4584600925445557 + - 0.9551546573638916 + - 1.5830662250518799 + - 0.40549275279045105 + - 2.1764791011810303 + - 1.5065158605575562 + - 1.522369146347046 + - -0.5095808506011963 + - 1.7772479057312012 + - 1.704390048980713 + - 1.9800255298614502 + - 1.7555005550384521 + - 2.0229907035827637 + - 1.581559658050537 + - 1.931780457496643 + - 1.334634780883789 + - 1.7183942794799805 + - 2.06461238861084 + - 1.0280206203460693 + - 1.2587052583694458 + - 1.4584600925445557 + - 0.3367617130279541 + - 1.6652216911315918 + - -1.6082706451416016 + - 1.195493221282959 + - 1.195493221282959 + - 2.271498203277588 + - 1.4591904878616333 + - 1.8526148796081543 + - 3.4846649169921875 + - 2.5851869583129883 + - 0.8330861330032349 + - 1.7441229820251465 + - 1.9379489421844482 + - 2.0372588634490967 + - 2.2945101261138916 + - 3.7718851566314697 + - 1.6176269054412842 + - 1.6173088550567627 + - 1.279273509979248 + - 1.7448090314865112 + - 1.3863818645477295 + - 1.9258038997650146 + - 2.0701236724853516 + - 0.5286920666694641 + - 1.4115099906921387 + - 0.6415701508522034 + - 0.9554710984230042 + - 2.445803642272949 + - 0.9939219951629639 + - 1.3862005472183228 + - 2.0227746963500977 + - 0.00035956501960754395 + - -0.6920214891433716 + - 0.9554710984230042 + - 1.8080159425735474 + - 0.7409104108810425 + - 1.1315056085586548 + - 1.0992136001586914 + - 1.717889428138733 + - 1.4355566501617432 + - 2.696624517440796 + - 1.981532096862793 + - 0.8741501569747925 + - 1.506748914718628 + - 0.4704861640930176 + - 2.1661999225616455 + - 1.7444359064102173 + - 2.169395923614502 + - 0.6415701508522034 + - 0.694489061832428 + - -0.10479313135147095 + - 0.7889493107795715 + - 1.065306305885315 + - 1.3862005472183228 + - 1.0653488636016846 + - 1.434821367263794 + - 1.4774258136749268 + - 1.7192147970199585 + - 1.2198303937911987 + - 0.955291748046875 + - 1.0279760360717773 + - 2.140174388885498 + - 1.2198303937911987 + - 1.1955803632736206 + - 2.1685056686401367 + - 1.7568259239196777 + - 1.0280206203460693 + - 1.5715584754943848 + - 2.6298575401306152 + - 2.039620876312256 + - 1.7568777799606323 + - 1.54994797706604 + - 0.8332673907279968 + - 0.917670488357544 + - 1.4118516445159912 + - 1.54994797706604 + - 1.54994797706604 + - 2.38995361328125 + - 2.038114070892334 + - 1.1312817335128784 + - 0.470308780670166 + - 2.806213140487671 + - 1.161426305770874 + - 1.646281123161316 + - 1.6181317567825317 + - 1.8090436458587646 + - 1.3861486911773682 + - 1.744672179222107 + - -0.6920214891433716 + - 0.9936981201171875 + - 1.3099268674850464 + - 3.1659865379333496 + - 1.1315056085586548 + - 1.570378303527832 + - 1.1312817335128784 + - 1.4586412906646729 + - 1.1314630508422852 + - 1.4779438972473145 + - 1.0992136001586914 + - 1.2586534023284912 + - 2.1386680603027344 + - 2.2045223712921143 + - 1.5823750495910645 + - 1.3099268674850464 + - 0.8330342769622803 + - 1.064988374710083 + - -0.10484498739242554 + 
- 1.5506340265274048 + - 1.3348159790039062 + - 0.8330342769622803 + - 0.6942652463912964 + - 0.9938793778419495 + - 0.6414333581924438 + - 0.9176186323165894 + - 1.4774258136749268 + - 0.9935613870620728 + - 0.18271362781524658 + - 1.2189394235610962 + - 0.9552472829818726 + - 2.246690034866333 + - 0.3367617130279541 + - 1.6289032697677612 + - 1.098937749862671 + - 2.579826831817627 + - 2.7552952766418457 + - 0.6415701508522034 + - 1.3603770732879639 + - 2.0658645629882812 + - 0.9938793778419495 + - 2.444115400314331 + - 1.434821367263794 + - 2.5077669620513916 + - 1.925299048423767 + - 1.9379489421844482 + - 0.0004925131797790527 + - 0.5901779532432556 + - 0.40549275279045105 + - 0.740773618221283 + - 0.0957428514957428 + - 0.0957428514957428 + - 1.0650732517242432 + - 2.777968168258667 + - 0.3366249203681946 + - 0.3366249203681946 + - 0.5288288593292236 + - 0.0004925131797790527 + - 1.0651249885559082 + - -0.510017991065979 + - 0.4703049063682556 + - 1.9742122888565063 + - -0.5102857351303101 + - 2.3232131004333496 + - 1.0991190671920776 + - 2.5077669620513916 + - 1.5223172903060913 + - 1.386063814163208 + - 2.858018159866333 + - 2.3723583221435547 + - 1.8837662935256958 + - 1.9379489421844482 + - 1.6466542482376099 + - 2.5028574466705322 + - 1.6459681987762451 + - 2.2046756744384766 + - 1.780085802078247 + - 1.3867497444152832 + - 0.4703049063682556 + - 3.172811508178711 + - 0.0004925131797790527 + - 0.40562954545021057 + - 0.18271362781524658 + - 1.0651249885559082 + - 3.863699436187744 + - 0.0004925131797790527 + - 2.125455856323242 + - 1.4353705644607544 + - -0.510478138923645 + - 1.92511785030365 + - 2.0260462760925293 + - 2.229368209838867 + - 0.4703049063682556 + - 2.335315465927124 + - 1.386063814163208 + - 2.308807849884033 + - 0.8738744854927063 + - 1.5054293870925903 + - 1.0651249885559082 + - 0.18271362781524658 + - 0.2626492381095886 + - 0.528777003288269 + - 3.2366585731506348 + - -2.297093629837036 + - 2.3712871074676514 + - 0.8741075992584229 + - 1.3862005472183228 + - 1.9791702032089233 + - 0.7889493107795715 + - -0.5097177028656006 + - 1.7546851634979248 + - 0.7889493107795715 + - 1.5065677165985107 + - 0.917670488357544 + - 1.1314630508422852 + - 1.1312817335128784 + - 1.3863818645477295 + - 2.3886282444000244 + - 1.8766850233078003 + - 1.1312817335128784 + - 1.5230551958084106 + - 0.7889937162399292 + - 0.3367617130279541 + - 2.228400230407715 + - 0.18271362781524658 + - 2.3723583221435547 + - 3.1804051399230957 + - 2.2275471687316895 + - 2.5028574466705322 + - 2.104417562484741 + - 2.38759183883667 + - 1.4591460227966309 + - 2.758937358856201 + - 1.7050760984420776 + - 2.279667854309082 + - 2.104417562484741 + - 0.5288288593292236 + - 0.5289582014083862 + - 1.877371072769165 + - 1.5065677165985107 + - 2.444467067718506 + - 2.3131513595581055 + - 2.1023406982421875 + - 0.8741075992584229 + - 1.434821367263794 + - 0.18271362781524658 + - 0.18271362781524658 + - 1.098989725112915 + - 2.0648281574249268 + - 1.3602402210235596 + - 1.098989725112915 + - 0.5902224183082581 + - 2.2458348274230957 + - -0.3560487627983093 + - 0.18257683515548706 + - 0.7889493107795715 + - 2.5685315132141113 + - 1.1953564882278442 + - 1.4584081172943115 + - 1.334634780883789 + - 1.4350025653839111 + - 0.6941283941268921 + - 0.2626492381095886 + - 0.2626492381095886 + - 2.2458348274230957 + - 0.5901779532432556 + - 2.5037755966186523 + - 1.4767398834228516 + - 1.9379489421844482 + - 0.40562954545021057 + - 0.9552472829818726 + - 2.273859977722168 + - 1.3603770732879639 + - 1.2587052583694458 + 
- 1.9332870244979858 + - 1.3106811046600342 + - 0.8329493999481201 + - 0.9938793778419495 + - 0.7891731262207031 + - 1.9698771238327026 + - 0.2628304660320282 + - 1.3603770732879639 + - 1.279273509979248 + - 1.4584600925445557 + - 0.528777003288269 + - 1.0652544498443604 + - 2.169395923614502 + - 1.8365846872329712 + - 1.6665914058685303 + - 1.279273509979248 + - 1.7187072038650513 + - 2.3217568397521973 + - 1.7199008464813232 + - 0.26260116696357727 + - 1.4108240604400635 + - 1.279273509979248 + - 1.0281999111175537 + - 0.09574669599533081 + - 1.3603770732879639 + - 2.2046756744384766 + - 2.023075580596924 + - 3.033841371536255 + - 1.8085259199142456 + - 0.788812518119812 + - 1.780085802078247 + - 2.2788124084472656 + - 1.8766850233078003 + - 1.7447571754455566 + - 2.9534451961517334 + - 0.9177149534225464 + - 1.1311450004577637 + - 2.103731632232666 + - 1.5702414512634277 + - 2.1388492584228516 + - 0.5288288593292236 + - 1.8078398704528809 + - 0.1826617419719696 + - 2.440305709838867 + - 1.4766032695770264 + - 1.3099268674850464 + - 2.335315465927124 + - 1.2587052583694458 + - 1.161426305770874 + - 1.3099268674850464 + - 1.0279760360717773 + - 1.4115099906921387 + - 0.5901779532432556 + - 2.9554922580718994 + - 2.2256431579589844 + - 2.444732189178467 + - 2.33109712600708 + - 0.7889493107795715 + - 0.2628304660320282 + - 1.195493221282959 + - 0.7409104108810425 + - 1.4774703979492188 + - 0.8329493999481201 + - 1.7050760984420776 + - 3.229017496109009 + - 1.6466542482376099 + - 0.8739264011383057 + - 1.195493221282959 + - 0.9552472829818726 + - 1.0651249885559082 + - 1.1616075038909912 + - 1.4109089374542236 + - 1.628766417503357 + - 0.4703049063682556 + - 1.5817408561706543 + - -0.1049298644065857 + - -0.5095808506011963 + - 0.9175337553024292 + - 0.8739264011383057 + - 1.54994797706604 + - 2.695953845977783 + - 0.4701681137084961 + - 1.3861486911773682 + - 0.6415701508522034 + - 0.5290101170539856 + - -0.5095808506011963 + - -0.6918290853500366 + - -0.5095808506011963 + - 2.1764445304870605 + - 0.5290101170539856 + - 0.40562954545021057 + - 2.3895459175109863 + - 0.4704861640930176 + - 0.18257683515548706 + - 0.0004925131797790527 + - 1.4591460227966309 + - 1.098989725112915 + - 0.6415701508522034 + - 0.6415701508522034 + - 0.9175337553024292 + - 0.5901779532432556 + - -0.10461187362670898 + - 2.4654746055603027 + - 0.641699492931366 + - 1.0641679763793945 + - 1.2791367769241333 + - 1.3099268674850464 + - 1.2791367769241333 + - 1.1312817335128784 + - 1.195493221282959 + - 0.5903592109680176 + - 1.2588346004486084 + - 3.4728047847747803 + - 0.788812518119812 + - -0.10479313135147095 + - 0.4703049063682556 + - 1.9800255298614502 + - 0.40581077337265015 + - 0.33694297075271606 + - 0.4703049063682556 + - 1.6289032697677612 + - 0.8737895488739014 + - 0.917670488357544 + - 1.704390048980713 + - 0.18271362781524658 + - 0.4055776596069336 + - 1.9800255298614502 + - 0.18271362781524658 + - 1.2189394235610962 + - 1.195493221282959 + - 0.4703049063682556 + - 1.3106127977371216 + - -0.10479313135147095 + - 0.4056739807128906 + - 1.0281054973602295 + - 1.2188875675201416 + - 0.0004406273365020752 + - 0.7408585548400879 + - 0.6942652463912964 + - 0.0004925131797790527 + - 1.7049392461776733 + - 0.4704861640930176 + - 0.6415701508522034 + - 0.0004925131797790527 + - 1.2189394235610962 + - 0.5901779532432556 + - 1.161426305770874 + - -0.22207701206207275 + - 1.4769212007522583 + - 0.6415701508522034 + - 0.8330861330032349 + - 0.917670488357544 + - 1.0281572341918945 + - 0.6415701508522034 + - 
-1.201601266860962 + - 0.8330861330032349 + - 1.5506340265274048 + - 0.7889493107795715 + - 0.7408585548400879 + - 1.8775522708892822 + - 1.1312817335128784 + - 0.7409104108810425 + - 0.0004925131797790527 + - 1.2189394235610962 + - 0.6415701508522034 + - 0.6417514085769653 + - 0.8332673907279968 + - 1.4767398834228516 + - 2.0265510082244873 + - 1.877371072769165 + - 2.125274658203125 + - 0.7889493107795715 + - 1.2189394235610962 + - 0.3367617130279541 + - 1.6289032697677612 + - 0.0957428514957428 + - 1.9740430116653442 + - 1.755319356918335 + - 2.3223230838775635 + - 0.9936981201171875 + - 0.4703049063682556 + - 1.6295374631881714 + - 2.023845911026001 + - 0.9936055541038513 + - 0.6941283941268921 + - 0.8330861330032349 + - 1.6289032697677612 + - 2.0176072120666504 + - 1.334634780883789 + - 1.098989725112915 + - 1.5072537660598755 + - 2.1386680603027344 + - 1.6461493968963623 + - 2.1685404777526855 + - 2.3732762336730957 + - 2.1013545989990234 + - 1.5230551958084106 + - 0.917670488357544 + - 0.4703049063682556 + - 1.931780457496643 + - 0.788812518119812 + - 1.8085259199142456 + - 1.098989725112915 + - 1.925129771232605 + - 1.4109606742858887 + - 1.7906303405761719 + - 2.2039055824279785 + - 0.18271362781524658 + - 1.161426305770874 + - 2.4505856037139893 + - 2.273859977722168 + - 1.0991709232330322 + - -0.22225826978683472 + - 1.5703264474868774 + - 1.5823750495910645 + - -0.6920214891433716 + - 2.241225481033325 + - 0.5901779532432556 + - 0.0006737411022186279 + - 2.3319525718688965 + - 2.0601859092712402 + - 0.8330342769622803 + - 1.8844523429870605 + - 2.5075857639312744 + - 1.5506340265274048 + - 1.8335193395614624 + - 1.0650732517242432 + - 0.6941283941268921 + - 0.2626492381095886 + - 0.917670488357544 + - 0.0957428514957428 + - 0.2628304660320282 + - 0.5288288593292236 + - -0.10479313135147095 + - 0.5901779532432556 + - 1.5703264474868774 + - 0.5901779532432556 + - 1.2189394235610962 + - -0.10479313135147095 + - 1.6957303285598755 + - 0.6941283941268921 + - 1.8844523429870605 + - 1.3612442016601562 + - 1.7901780605316162 + - 0.9552472829818726 + - 2.382542371749878 + - 0.788812518119812 + - 1.5710643529891968 + - 1.3344979286193848 + - 2.597963333129883 + - 1.0991709232330322 + - 1.4776071310043335 + - 0.470308780670166 + - 0.3367617130279541 + - 1.8847652673721313 + - 3.0270333290100098 + - 1.806637167930603 + - 2.631330728530884 + - 2.3328704833984375 + - 1.7555524110794067 + - 2.2414751052856445 + - 1.2587497234344482 + - 1.434821367263794 + - 1.9791702032089233 + - 1.5702414512634277 + - 0.6414333581924438 + - 1.5710643529891968 + - 2.3328704833984375 + - 2.445803642272949 + - 2.038114070892334 + - 2.4652934074401855 + - -0.5107458829879761 + - 1.696232557296753 + - 1.161426305770874 + - 0.788812518119812 + - 1.6459681987762451 + - 0.8330861330032349 + - 0.8738744854927063 + - 2.772785186767578 + - 1.5222322940826416 + - 1.6297705173492432 + - 1.334582805633545 + - 1.0988528728485107 +schema: https://metadata.datadrivendiscovery.org/schemas/v0/pipeline_run.json +start: '2019-06-19T09:54:48.783596Z' +status: + state: SUCCESS +steps: +- end: '2019-06-19T09:54:48.794840Z' + hyperparams: + dataframe_resource: + data: null + type: VALUE + method_calls: + - end: '2019-06-19T09:54:48.785537Z' + logging: [] + name: __init__ + start: '2019-06-19T09:54:48.785486Z' + status: + state: SUCCESS + - end: '2019-06-19T09:54:48.790607Z' + logging: [] + metadata: + produce: + - metadata: + dimension: + length: 736 + name: rows + semantic_types: + - 
https://metadata.datadrivendiscovery.org/types/TabularRow + schema: https://metadata.datadrivendiscovery.org/schemas/v0/container.json + semantic_types: + - https://metadata.datadrivendiscovery.org/types/Table + structural_type: d3m.container.pandas.DataFrame + selector: [] + - metadata: + dimension: + length: 30 + name: columns + semantic_types: + - https://metadata.datadrivendiscovery.org/types/TabularColumn + selector: + - __ALL_ELEMENTS__ + - metadata: + name: d3mIndex + semantic_types: + - http://schema.org/Integer + - https://metadata.datadrivendiscovery.org/types/PrimaryKey + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 0 + - metadata: + name: idnum + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/UniqueKey + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 1 + - metadata: + name: state + semantic_types: + - https://metadata.datadrivendiscovery.org/types/CategoricalData + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 2 + - metadata: + name: state2 + semantic_types: + - https://metadata.datadrivendiscovery.org/types/CategoricalData + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 3 + - metadata: + name: stfips + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 4 + - metadata: + name: zip + semantic_types: + - https://metadata.datadrivendiscovery.org/types/CategoricalData + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 5 + - metadata: + name: region + semantic_types: + - https://metadata.datadrivendiscovery.org/types/CategoricalData + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 6 + - metadata: + name: typebldg + semantic_types: + - https://metadata.datadrivendiscovery.org/types/CategoricalData + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 7 + - metadata: + name: floor + semantic_types: + - https://metadata.datadrivendiscovery.org/types/CategoricalData + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 8 + - metadata: + name: room + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 9 + - metadata: + name: basement + semantic_types: + - http://schema.org/Boolean + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 10 + - metadata: + name: windoor + semantic_types: + - https://metadata.datadrivendiscovery.org/types/CategoricalData + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 11 + - metadata: + name: rep + semantic_types: + - https://metadata.datadrivendiscovery.org/types/CategoricalData + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 12 + - metadata: + name: stratum + semantic_types: + - https://metadata.datadrivendiscovery.org/types/CategoricalData + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: str + selector: + - 
__ALL_ELEMENTS__ + - 13 + - metadata: + name: wave + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 14 + - metadata: + name: starttm + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 15 + - metadata: + name: stoptm + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 16 + - metadata: + name: startdt + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 17 + - metadata: + name: stopdt + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 18 + - metadata: + name: activity + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 19 + - metadata: + name: pcterr + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 20 + - metadata: + name: adjwt + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 21 + - metadata: + name: dupflag + semantic_types: + - http://schema.org/Boolean + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 22 + - metadata: + name: zipflag + semantic_types: + - http://schema.org/Boolean + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 23 + - metadata: + name: cntyfips + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 24 + - metadata: + name: county + semantic_types: + - https://metadata.datadrivendiscovery.org/types/CategoricalData + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 25 + - metadata: + name: fips + semantic_types: + - https://metadata.datadrivendiscovery.org/types/CategoricalData + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 26 + - metadata: + name: Uppm + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 27 + - metadata: + name: county_code + semantic_types: + - https://metadata.datadrivendiscovery.org/types/CategoricalData + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 28 + - metadata: + name: log_radon + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/SuggestedTarget + - https://metadata.datadrivendiscovery.org/types/Target + - https://metadata.datadrivendiscovery.org/types/TrueTarget + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 29 + name: fit_multi_produce + start: '2019-06-19T09:54:48.786064Z' + status: + state: 
SUCCESS + random_seed: 0 + start: '2019-06-19T09:54:48.783616Z' + status: + state: SUCCESS + type: PRIMITIVE +- end: '2019-06-19T09:54:48.892999Z' + hyperparams: + add_index_columns: + data: true + type: VALUE + exclude_columns: + data: [] + type: VALUE + parse_categorical_target_columns: + data: false + type: VALUE + replace_index_columns: + data: true + type: VALUE + return_result: + data: replace + type: VALUE + use_columns: + data: [] + type: VALUE + method_calls: + - end: '2019-06-19T09:54:48.797730Z' + logging: [] + name: __init__ + start: '2019-06-19T09:54:48.797694Z' + status: + state: SUCCESS + - end: '2019-06-19T09:54:48.888883Z' + logging: [] + metadata: + produce: + - metadata: + dimension: + length: 736 + name: rows + semantic_types: + - https://metadata.datadrivendiscovery.org/types/TabularRow + schema: https://metadata.datadrivendiscovery.org/schemas/v0/container.json + semantic_types: + - https://metadata.datadrivendiscovery.org/types/Table + structural_type: d3m.container.pandas.DataFrame + selector: [] + - metadata: + dimension: + length: 30 + name: columns + semantic_types: + - https://metadata.datadrivendiscovery.org/types/TabularColumn + selector: + - __ALL_ELEMENTS__ + - metadata: + name: d3mIndex + semantic_types: + - http://schema.org/Integer + - https://metadata.datadrivendiscovery.org/types/PrimaryKey + structural_type: int + selector: + - __ALL_ELEMENTS__ + - 0 + - metadata: + name: idnum + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/UniqueKey + structural_type: float + selector: + - __ALL_ELEMENTS__ + - 1 + - metadata: + name: state + semantic_types: + - https://metadata.datadrivendiscovery.org/types/CategoricalData + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 2 + - metadata: + name: state2 + semantic_types: + - https://metadata.datadrivendiscovery.org/types/CategoricalData + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 3 + - metadata: + name: stfips + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: float + selector: + - __ALL_ELEMENTS__ + - 4 + - metadata: + name: zip + semantic_types: + - https://metadata.datadrivendiscovery.org/types/CategoricalData + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 5 + - metadata: + name: region + semantic_types: + - https://metadata.datadrivendiscovery.org/types/CategoricalData + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 6 + - metadata: + name: typebldg + semantic_types: + - https://metadata.datadrivendiscovery.org/types/CategoricalData + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 7 + - metadata: + name: floor + semantic_types: + - https://metadata.datadrivendiscovery.org/types/CategoricalData + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 8 + - metadata: + name: room + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: float + selector: + - __ALL_ELEMENTS__ + - 9 + - metadata: + name: basement + semantic_types: + - http://schema.org/Boolean + - 
https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: int + selector: + - __ALL_ELEMENTS__ + - 10 + - metadata: + name: windoor + semantic_types: + - https://metadata.datadrivendiscovery.org/types/CategoricalData + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 11 + - metadata: + name: rep + semantic_types: + - https://metadata.datadrivendiscovery.org/types/CategoricalData + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 12 + - metadata: + name: stratum + semantic_types: + - https://metadata.datadrivendiscovery.org/types/CategoricalData + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 13 + - metadata: + name: wave + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: float + selector: + - __ALL_ELEMENTS__ + - 14 + - metadata: + name: starttm + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: float + selector: + - __ALL_ELEMENTS__ + - 15 + - metadata: + name: stoptm + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: float + selector: + - __ALL_ELEMENTS__ + - 16 + - metadata: + name: startdt + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: float + selector: + - __ALL_ELEMENTS__ + - 17 + - metadata: + name: stopdt + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: float + selector: + - __ALL_ELEMENTS__ + - 18 + - metadata: + name: activity + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: float + selector: + - __ALL_ELEMENTS__ + - 19 + - metadata: + name: pcterr + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: float + selector: + - __ALL_ELEMENTS__ + - 20 + - metadata: + name: adjwt + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: float + selector: + - __ALL_ELEMENTS__ + - 21 + - metadata: + name: dupflag + semantic_types: + - http://schema.org/Boolean + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: int + selector: + - __ALL_ELEMENTS__ + - 22 + - metadata: + name: zipflag + semantic_types: + - http://schema.org/Boolean + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: int + selector: + - __ALL_ELEMENTS__ + - 23 + - metadata: + name: cntyfips + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: float + selector: + - __ALL_ELEMENTS__ + - 24 + - metadata: + name: county + semantic_types: + - https://metadata.datadrivendiscovery.org/types/CategoricalData + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 25 + - metadata: + name: fips + semantic_types: + - https://metadata.datadrivendiscovery.org/types/CategoricalData + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 26 + - metadata: + name: Uppm + 
semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: float + selector: + - __ALL_ELEMENTS__ + - 27 + - metadata: + name: county_code + semantic_types: + - https://metadata.datadrivendiscovery.org/types/CategoricalData + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 28 + - metadata: + name: log_radon + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/SuggestedTarget + - https://metadata.datadrivendiscovery.org/types/Target + - https://metadata.datadrivendiscovery.org/types/TrueTarget + structural_type: float + selector: + - __ALL_ELEMENTS__ + - 29 + name: fit_multi_produce + start: '2019-06-19T09:54:48.798357Z' + status: + state: SUCCESS + random_seed: 1 + start: '2019-06-19T09:54:48.794860Z' + status: + state: SUCCESS + type: PRIMITIVE +- end: '2019-06-19T09:54:48.901575Z' + hyperparams: + add_index_columns: + data: false + type: VALUE + exclude_columns: + data: [] + type: VALUE + match_logic: + data: any + type: VALUE + negate: + data: false + type: VALUE + use_columns: + data: [] + type: VALUE + method_calls: + - end: '2019-06-19T09:54:48.895747Z' + logging: [] + name: __init__ + start: '2019-06-19T09:54:48.895710Z' + status: + state: SUCCESS + - end: '2019-06-19T09:54:48.899643Z' + logging: [] + metadata: + produce: + - metadata: + dimension: + length: 736 + name: rows + semantic_types: + - https://metadata.datadrivendiscovery.org/types/TabularRow + schema: https://metadata.datadrivendiscovery.org/schemas/v0/container.json + semantic_types: + - https://metadata.datadrivendiscovery.org/types/Table + structural_type: d3m.container.pandas.DataFrame + selector: [] + - metadata: + dimension: + length: 12 + name: columns + semantic_types: + - https://metadata.datadrivendiscovery.org/types/TabularColumn + selector: + - __ALL_ELEMENTS__ + - metadata: + name: state + semantic_types: + - https://metadata.datadrivendiscovery.org/types/CategoricalData + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 0 + - metadata: + name: state2 + semantic_types: + - https://metadata.datadrivendiscovery.org/types/CategoricalData + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 1 + - metadata: + name: zip + semantic_types: + - https://metadata.datadrivendiscovery.org/types/CategoricalData + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 2 + - metadata: + name: region + semantic_types: + - https://metadata.datadrivendiscovery.org/types/CategoricalData + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 3 + - metadata: + name: typebldg + semantic_types: + - https://metadata.datadrivendiscovery.org/types/CategoricalData + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 4 + - metadata: + name: floor + semantic_types: + - https://metadata.datadrivendiscovery.org/types/CategoricalData + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 5 + - metadata: + name: windoor + semantic_types: + - https://metadata.datadrivendiscovery.org/types/CategoricalData + - https://metadata.datadrivendiscovery.org/types/Attribute + 
structural_type: str + selector: + - __ALL_ELEMENTS__ + - 6 + - metadata: + name: rep + semantic_types: + - https://metadata.datadrivendiscovery.org/types/CategoricalData + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 7 + - metadata: + name: stratum + semantic_types: + - https://metadata.datadrivendiscovery.org/types/CategoricalData + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 8 + - metadata: + name: county + semantic_types: + - https://metadata.datadrivendiscovery.org/types/CategoricalData + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 9 + - metadata: + name: fips + semantic_types: + - https://metadata.datadrivendiscovery.org/types/CategoricalData + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 10 + - metadata: + name: county_code + semantic_types: + - https://metadata.datadrivendiscovery.org/types/CategoricalData + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 11 + name: fit_multi_produce + start: '2019-06-19T09:54:48.896349Z' + status: + state: SUCCESS + random_seed: 2 + start: '2019-06-19T09:54:48.893020Z' + status: + state: SUCCESS + type: PRIMITIVE +- end: '2019-06-19T09:54:48.909559Z' + hyperparams: + add_index_columns: + data: false + type: VALUE + match_logic: + data: any + type: VALUE + negate: + data: false + type: VALUE + use_columns: + data: [] + type: VALUE + method_calls: + - end: '2019-06-19T09:54:48.904245Z' + logging: [] + name: __init__ + start: '2019-06-19T09:54:48.904212Z' + status: + state: SUCCESS + - end: '2019-06-19T09:54:48.907449Z' + logging: [] + metadata: + produce: + - metadata: + dimension: + length: 736 + name: rows + semantic_types: + - https://metadata.datadrivendiscovery.org/types/TabularRow + schema: https://metadata.datadrivendiscovery.org/schemas/v0/container.json + semantic_types: + - https://metadata.datadrivendiscovery.org/types/Table + structural_type: d3m.container.pandas.DataFrame + selector: [] + - metadata: + dimension: + length: 14 + name: columns + semantic_types: + - https://metadata.datadrivendiscovery.org/types/TabularColumn + selector: + - __ALL_ELEMENTS__ + - metadata: + name: idnum + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/UniqueKey + structural_type: float + selector: + - __ALL_ELEMENTS__ + - 0 + - metadata: + name: stfips + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: float + selector: + - __ALL_ELEMENTS__ + - 1 + - metadata: + name: room + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: float + selector: + - __ALL_ELEMENTS__ + - 2 + - metadata: + name: wave + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: float + selector: + - __ALL_ELEMENTS__ + - 3 + - metadata: + name: starttm + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: float + selector: + - __ALL_ELEMENTS__ + - 4 + - metadata: + name: stoptm + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/Attribute + 
structural_type: float + selector: + - __ALL_ELEMENTS__ + - 5 + - metadata: + name: startdt + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: float + selector: + - __ALL_ELEMENTS__ + - 6 + - metadata: + name: stopdt + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: float + selector: + - __ALL_ELEMENTS__ + - 7 + - metadata: + name: activity + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: float + selector: + - __ALL_ELEMENTS__ + - 8 + - metadata: + name: pcterr + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: float + selector: + - __ALL_ELEMENTS__ + - 9 + - metadata: + name: adjwt + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: float + selector: + - __ALL_ELEMENTS__ + - 10 + - metadata: + name: cntyfips + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: float + selector: + - __ALL_ELEMENTS__ + - 11 + - metadata: + name: Uppm + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: float + selector: + - __ALL_ELEMENTS__ + - 12 + - metadata: + name: log_radon + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/SuggestedTarget + - https://metadata.datadrivendiscovery.org/types/Target + - https://metadata.datadrivendiscovery.org/types/TrueTarget + structural_type: float + selector: + - __ALL_ELEMENTS__ + - 13 + name: fit_multi_produce + start: '2019-06-19T09:54:48.904812Z' + status: + state: SUCCESS + random_seed: 3 + start: '2019-06-19T09:54:48.901595Z' + status: + state: SUCCESS + type: PRIMITIVE +- end: '2019-06-19T09:54:48.915759Z' + hyperparams: + add_index_columns: + data: false + type: VALUE + exclude_columns: + data: [] + type: VALUE + match_logic: + data: any + type: VALUE + negate: + data: false + type: VALUE + use_columns: + data: [] + type: VALUE + method_calls: + - end: '2019-06-19T09:54:48.912059Z' + logging: [] + name: __init__ + start: '2019-06-19T09:54:48.912027Z' + status: + state: SUCCESS + - end: '2019-06-19T09:54:48.915161Z' + logging: [] + metadata: + produce: + - metadata: + dimension: + length: 736 + name: rows + semantic_types: + - https://metadata.datadrivendiscovery.org/types/TabularRow + schema: https://metadata.datadrivendiscovery.org/schemas/v0/container.json + semantic_types: + - https://metadata.datadrivendiscovery.org/types/Table + structural_type: d3m.container.pandas.DataFrame + selector: [] + - metadata: + dimension: + length: 1 + name: columns + semantic_types: + - https://metadata.datadrivendiscovery.org/types/TabularColumn + selector: + - __ALL_ELEMENTS__ + - metadata: + name: log_radon + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/SuggestedTarget + - https://metadata.datadrivendiscovery.org/types/Target + - https://metadata.datadrivendiscovery.org/types/TrueTarget + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 0 + name: fit_multi_produce + start: '2019-06-19T09:54:48.912622Z' + status: + state: SUCCESS + random_seed: 4 + start: '2019-06-19T09:54:48.909578Z' + status: + state: SUCCESS + type: PRIMITIVE +- end: 
'2019-06-19T09:54:48.966235Z' + hyperparams: + add_index_columns: + data: false + type: VALUE + error_on_no_input: + data: true + type: VALUE + exclude_columns: + data: [] + type: VALUE + fill_value: + data: + case: none + value: null + type: VALUE + missing_values: + data: + case: float + value: + encoding: pickle + value: gANHf/gAAAAAAAAu + type: VALUE + return_semantic_type: + data: https://metadata.datadrivendiscovery.org/types/Attribute + type: VALUE + strategy: + data: mean + type: VALUE + use_columns: + data: [] + type: VALUE + method_calls: + - end: '2019-06-19T09:54:48.919732Z' + logging: [] + name: __init__ + start: '2019-06-19T09:54:48.919684Z' + status: + state: SUCCESS + - end: '2019-06-19T09:54:48.964102Z' + logging: [] + metadata: + produce: + - metadata: + dimension: + length: 736 + name: rows + semantic_types: + - https://metadata.datadrivendiscovery.org/types/TabularRow + schema: https://metadata.datadrivendiscovery.org/schemas/v0/container.json + semantic_types: + - https://metadata.datadrivendiscovery.org/types/Table + structural_type: d3m.container.pandas.DataFrame + selector: [] + - metadata: + dimension: + length: 14 + name: columns + semantic_types: + - https://metadata.datadrivendiscovery.org/types/TabularColumn + selector: + - __ALL_ELEMENTS__ + - metadata: + name: idnum + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/UniqueKey + structural_type: float + selector: + - __ALL_ELEMENTS__ + - 0 + - metadata: + name: stfips + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: float + selector: + - __ALL_ELEMENTS__ + - 1 + - metadata: + name: room + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: float + selector: + - __ALL_ELEMENTS__ + - 2 + - metadata: + name: wave + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: float + selector: + - __ALL_ELEMENTS__ + - 3 + - metadata: + name: starttm + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: float + selector: + - __ALL_ELEMENTS__ + - 4 + - metadata: + name: stoptm + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: float + selector: + - __ALL_ELEMENTS__ + - 5 + - metadata: + name: startdt + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: float + selector: + - __ALL_ELEMENTS__ + - 6 + - metadata: + name: stopdt + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: float + selector: + - __ALL_ELEMENTS__ + - 7 + - metadata: + name: activity + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: float + selector: + - __ALL_ELEMENTS__ + - 8 + - metadata: + name: pcterr + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: float + selector: + - __ALL_ELEMENTS__ + - 9 + - metadata: + name: adjwt + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: float + selector: + - __ALL_ELEMENTS__ + - 10 + - metadata: + name: cntyfips + semantic_types: + - 
http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: float + selector: + - __ALL_ELEMENTS__ + - 11 + - metadata: + name: Uppm + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: float + selector: + - __ALL_ELEMENTS__ + - 12 + - metadata: + name: log_radon + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/SuggestedTarget + - https://metadata.datadrivendiscovery.org/types/Target + - https://metadata.datadrivendiscovery.org/types/TrueTarget + structural_type: float + selector: + - __ALL_ELEMENTS__ + - 13 + name: fit_multi_produce + start: '2019-06-19T09:54:48.920417Z' + status: + state: SUCCESS + random_seed: 5 + start: '2019-06-19T09:54:48.915777Z' + status: + state: SUCCESS + type: PRIMITIVE +- end: '2019-06-19T09:54:49.039949Z' + hyperparams: + add_index_columns: + data: true + type: VALUE + base_score: + data: 0.5 + type: VALUE + colsample_bylevel: + data: 1 + type: VALUE + colsample_bytree: + data: 1 + type: VALUE + exclude_inputs_columns: + data: [] + type: VALUE + exclude_outputs_columns: + data: [] + type: VALUE + gamma: + data: 0.0 + type: VALUE + importance_type: + data: gain + type: VALUE + learning_rate: + data: 0.1 + type: VALUE + max_delta_step: + data: + case: unlimited + value: 0 + type: VALUE + max_depth: + data: + case: limit + value: 3 + type: VALUE + min_child_weight: + data: 1 + type: VALUE + n_estimators: + data: 100 + type: VALUE + n_jobs: + data: + case: limit + value: 1 + type: VALUE + n_more_estimators: + data: 100 + type: VALUE + reg_alpha: + data: 0 + type: VALUE + reg_lambda: + data: 1 + type: VALUE + scale_pos_weight: + data: 1 + type: VALUE + subsample: + data: 1 + type: VALUE + use_inputs_columns: + data: [] + type: VALUE + use_outputs_columns: + data: [] + type: VALUE + method_calls: + - end: '2019-06-19T09:54:48.972495Z' + logging: [] + name: __init__ + start: '2019-06-19T09:54:48.972423Z' + status: + state: SUCCESS + - end: '2019-06-19T09:54:49.039069Z' + logging: [] + metadata: + produce: + - metadata: + dimension: + length: 736 + name: rows + semantic_types: + - https://metadata.datadrivendiscovery.org/types/TabularRow + schema: https://metadata.datadrivendiscovery.org/schemas/v0/container.json + semantic_types: + - https://metadata.datadrivendiscovery.org/types/Table + structural_type: d3m.container.pandas.DataFrame + selector: [] + - metadata: + dimension: + length: 3 + name: columns + semantic_types: + - https://metadata.datadrivendiscovery.org/types/TabularColumn + selector: + - __ALL_ELEMENTS__ + - metadata: + name: idnum + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/UniqueKey + structural_type: float + selector: + - __ALL_ELEMENTS__ + - 0 + - metadata: + name: log_radon + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/SuggestedTarget + - https://metadata.datadrivendiscovery.org/types/Target + - https://metadata.datadrivendiscovery.org/types/PredictedTarget + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 1 + - metadata: + name: log_radon + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/SuggestedTarget + - https://metadata.datadrivendiscovery.org/types/Target + - https://metadata.datadrivendiscovery.org/types/TrueTarget + structural_type: float + selector: + - __ALL_ELEMENTS__ + - 2 + name: fit_multi_produce + start: 
'2019-06-19T09:54:48.973599Z' + status: + state: SUCCESS + random_seed: 6 + start: '2019-06-19T09:54:48.966255Z' + status: + state: SUCCESS + type: PRIMITIVE +- end: '2019-06-19T09:54:49.049637Z' + hyperparams: + exclude_columns: + data: [] + type: VALUE + use_columns: + data: [] + type: VALUE + method_calls: + - end: '2019-06-19T09:54:49.041989Z' + logging: [] + name: __init__ + start: '2019-06-19T09:54:49.041954Z' + status: + state: SUCCESS + - end: '2019-06-19T09:54:49.048870Z' + logging: [] + metadata: + produce: + - metadata: + dimension: + length: 736 + name: rows + semantic_types: + - https://metadata.datadrivendiscovery.org/types/TabularRow + schema: https://metadata.datadrivendiscovery.org/schemas/v0/container.json + semantic_types: + - https://metadata.datadrivendiscovery.org/types/Table + structural_type: d3m.container.pandas.DataFrame + selector: [] + - metadata: + dimension: + length: 2 + name: columns + semantic_types: + - https://metadata.datadrivendiscovery.org/types/TabularColumn + selector: + - __ALL_ELEMENTS__ + - metadata: + name: d3mIndex + semantic_types: + - http://schema.org/Integer + - https://metadata.datadrivendiscovery.org/types/PrimaryKey + structural_type: int + selector: + - __ALL_ELEMENTS__ + - 0 + - metadata: + name: log_radon + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/SuggestedTarget + - https://metadata.datadrivendiscovery.org/types/Target + - https://metadata.datadrivendiscovery.org/types/PredictedTarget + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 1 + name: fit_multi_produce + start: '2019-06-19T09:54:49.042531Z' + status: + state: SUCCESS + random_seed: 7 + start: '2019-06-19T09:54:49.039969Z' + status: + state: SUCCESS + type: PRIMITIVE +--- +context: TESTING +datasets: +- digest: d2413f26c84df994b808f397c3aa9908e54169d43ca3034ea1a2c3f9c9a6ec27 + id: 26_radon_seed_dataset_TEST +end: '2019-06-19T09:54:49.273243Z' +environment: + engine_version: 2019.6.7 + id: 6fdad0c4-dcb1-541d-a2a7-d2d9590c26dd + reference_engine_version: 2019.6.7 + resources: + cpu: + constraints: + cpu_shares: 1024 + devices: + - name: Intel Core Processor (Broadwell) + - name: Intel Core Processor (Broadwell) + - name: Intel Core Processor (Broadwell) + - name: Intel Core Processor (Broadwell) + - name: Intel Core Processor (Broadwell) + - name: Intel Core Processor (Broadwell) + - name: Intel Core Processor (Broadwell) + - name: Intel Core Processor (Broadwell) + logical_present: 8 + physical_present: 8 + memory: + total_memory: 25281880064 + worker_id: f5ffb488-a883-509d-ad4f-5e819913cb14 +id: 93ad8197-9c10-5666-b50c-dcc06d46f9d2 +pipeline: + digest: 79e82bee63a0db3dd75d41de0caa2054b860a8211fa9895a061fdb8f9f1c444d + id: 0f636602-6299-411b-9873-4b974cd393ba +previous_pipeline_run: + id: 67cc5311-80d6-5f1f-96f6-7f7b9d1308a0 +problem: + digest: 01ab113ff802b57fe872f7b4e4422789921d033c41bc8ad0bd6e0d041291ed6f + id: 26_radon_seed_problem +random_seed: 0 +run: + phase: PRODUCE + results: + predictions: + header: + - d3mIndex + - log_radon + values: + - - 5 + - 23 + - 30 + - 31 + - 33 + - 39 + - 44 + - 49 + - 54 + - 60 + - 63 + - 65 + - 66 + - 67 + - 70 + - 72 + - 76 + - 77 + - 78 + - 82 + - 86 + - 96 + - 97 + - 107 + - 109 + - 110 + - 120 + - 136 + - 137 + - 139 + - 141 + - 165 + - 168 + - 174 + - 192 + - 198 + - 208 + - 209 + - 210 + - 213 + - 215 + - 218 + - 227 + - 231 + - 235 + - 239 + - 244 + - 250 + - 254 + - 259 + - 261 + - 265 + - 266 + - 275 + - 281 + - 286 + - 290 + - 296 + - 299 + - 300 + - 305 + - 306 + - 
309 + - 311 + - 314 + - 321 + - 323 + - 326 + - 331 + - 332 + - 333 + - 334 + - 344 + - 350 + - 351 + - 357 + - 359 + - 360 + - 365 + - 377 + - 380 + - 381 + - 394 + - 408 + - 416 + - 417 + - 429 + - 430 + - 433 + - 436 + - 439 + - 440 + - 448 + - 464 + - 468 + - 483 + - 486 + - 490 + - 493 + - 497 + - 501 + - 514 + - 523 + - 526 + - 527 + - 530 + - 533 + - 536 + - 539 + - 542 + - 545 + - 549 + - 551 + - 552 + - 554 + - 558 + - 584 + - 585 + - 591 + - 594 + - 595 + - 597 + - 598 + - 599 + - 601 + - 611 + - 616 + - 617 + - 633 + - 635 + - 638 + - 655 + - 656 + - 666 + - 668 + - 669 + - 680 + - 685 + - 687 + - 688 + - 703 + - 704 + - 706 + - 708 + - 710 + - 714 + - 715 + - 723 + - 734 + - 739 + - 745 + - 753 + - 755 + - 759 + - 760 + - 767 + - 788 + - 790 + - 793 + - 794 + - 795 + - 798 + - 808 + - 809 + - 814 + - 817 + - 823 + - 827 + - 834 + - 848 + - 862 + - 864 + - 869 + - 872 + - 879 + - 880 + - 882 + - 893 + - 897 + - 904 + - 908 + - 913 + - 914 + - - 0.9552472829818726 + - 0.6942652463912964 + - 1.5232363939285278 + - 1.506748914718628 + - 2.103731632232666 + - 1.0988528728485107 + - -1.2187950611114502 + - 0.5904017686843872 + - 0.6942652463912964 + - 1.54994797706604 + - -0.6920214891433716 + - 1.5065677165985107 + - 1.8838632106781006 + - 1.0279760360717773 + - 1.9791702032089233 + - 0.992601752281189 + - 1.9309251308441162 + - 2.569783926010132 + - 1.7802670001983643 + - 2.6912167072296143 + - 2.264638900756836 + - 1.3862450122833252 + - 0.3367617130279541 + - 1.334634780883789 + - 0.694213330745697 + - 1.6957303285598755 + - 0.5903072953224182 + - 1.5814228057861328 + - 1.2586534023284912 + - 1.2587052583694458 + - 0.40562954545021057 + - 1.5043662786483765 + - 0.7409104108810425 + - 1.6481608152389526 + - 1.5065677165985107 + - 0.18271362781524658 + - 0.40562954545021057 + - 0.4055776596069336 + - 0.694489061832428 + - 1.3603770732879639 + - 1.4774258136749268 + - 0.8330861330032349 + - 1.8773192167282104 + - 1.5065158605575562 + - 0.8738744854927063 + - 0.9552472829818726 + - 2.2046756744384766 + - 2.690866708755493 + - 0.9936981201171875 + - 1.5821088552474976 + - 1.279273509979248 + - 1.2194933891296387 + - 0.7889493107795715 + - 0.33671367168426514 + - 1.7054443359375 + - 1.3873944282531738 + - 1.0651249885559082 + - 1.3612680435180664 + - 1.7199008464813232 + - 0.9554710984230042 + - 1.4770009517669678 + - 1.5707955360412598 + - 0.528777003288269 + - -0.22207701206207275 + - 1.718523383140564 + - 0.5902649760246277 + - 2.5712332725524902 + - 1.744625210762024 + - 2.038295269012451 + - 0.9936981201171875 + - 1.5226820707321167 + - 1.7928229570388794 + - 0.5289158821105957 + - 0.0004963576793670654 + - 0.641793966293335 + - 1.833651065826416 + - 1.3862005472183228 + - 1.0990341901779175 + - 1.3603770732879639 + - 0.4703049063682556 + - 1.3099268674850464 + - 1.1312817335128784 + - 2.1388492584228516 + - 1.522550344467163 + - 0.3367098271846771 + - 2.444817304611206 + - 1.4766032695770264 + - 1.2190687656402588 + - 1.4584081172943115 + - 1.2189394235610962 + - 2.064997434616089 + - 1.279273509979248 + - 1.550502061843872 + - -0.5095808506011963 + - 0.6415701508522034 + - 1.195493221282959 + - 0.40562954545021057 + - 1.6176269054412842 + - 1.0652544498443604 + - 0.7409104108810425 + - 2.1023406982421875 + - 1.8332862854003906 + - 1.522369146347046 + - 1.195493221282959 + - 1.6297705173492432 + - 0.740773618221283 + - 0.7889493107795715 + - 0.9552472829818726 + - 0.9552472829818726 + - 1.0279760360717773 + - 2.5039567947387695 + - 1.3602402210235596 + - 1.7800339460372925 + - 
-0.6920214891433716 + - 1.0651249885559082 + - 0.470253050327301 + - 1.0281999111175537 + - 0.2628304660320282 + - 1.6176269054412842 + - 0.9552472829818726 + - 0.2625162601470947 + - 0.5903499126434326 + - 1.1616501808166504 + - -0.22225826978683472 + - 0.6941283941268921 + - 1.54994797706604 + - 1.6466542482376099 + - 2.062042713165283 + - 0.2626492381095886 + - 1.4584600925445557 + - 0.7408585548400879 + - 0.5288288593292236 + - 1.570378303527832 + - 2.3885598182678223 + - 2.169395923614502 + - 1.5230551958084106 + - 2.1773343086242676 + - -0.22239500284194946 + - 1.2587052583694458 + - 0.7889493107795715 + - 1.1616075038909912 + - 1.2189394235610962 + - 1.7447571754455566 + - 0.4701681137084961 + - 0.18271362781524658 + - 0.3367617130279541 + - 1.161426305770874 + - 0.26259732246398926 + - 0.5288288593292236 + - -0.3560487627983093 + - 1.1612895727157593 + - 0.40562954545021057 + - 0.4703049063682556 + - 0.5901779532432556 + - 0.18257683515548706 + - -0.22239500284194946 + - 1.8846335411071777 + - 1.2188026905059814 + - 2.024027109146118 + - 2.6903984546661377 + - 0.6415701508522034 + - 1.334582805633545 + - 1.3101081848144531 + - 0.4703049063682556 + - 1.1312817335128784 + - 1.581559658050537 + - 2.9553821086883545 + - 2.1388492584228516 + - 1.1954413652420044 + - 1.8844523429870605 + - 2.2962329387664795 + - 2.1386680603027344 + - 1.617445707321167 + - 0.917670488357544 + - 1.360558271408081 + - 0.6415701508522034 + - 0.6415701508522034 + - 2.464556932449341 + - -0.22225826978683472 + - 1.92511785030365 + - 2.023845911026001 + - 2.2644577026367188 + - 1.877371072769165 + scores: + - metric: + metric: ROOT_MEAN_SQUARED_ERROR + normalized: 0.9999912641893107 + value: 0.017471621378821623 + scoring: + datasets: + - digest: d2413f26c84df994b808f397c3aa9908e54169d43ca3034ea1a2c3f9c9a6ec27 + id: 26_radon_seed_dataset_SCORE + end: '2019-06-19T09:54:49.314593Z' + pipeline: + digest: a7a07527cdff5a525341894356056b4420d9b99f12bc1a90198880a3ea7f6bd1 + id: f596cd77-25f8-4d4c-a350-bb30ab1e58f6 + random_seed: 0 + start: '2019-06-19T09:54:49.288788Z' + status: + state: SUCCESS + steps: + - end: '2019-06-19T09:54:49.314573Z' + hyperparams: + add_normalized_scores: + data: true + type: VALUE + metrics: + data: + - k: null + metric: ROOT_MEAN_SQUARED_ERROR + pos_label: null + type: VALUE + method_calls: + - end: '2019-06-19T09:54:49.291387Z' + logging: [] + name: __init__ + start: '2019-06-19T09:54:49.291349Z' + status: + state: SUCCESS + - end: '2019-06-19T09:54:49.313706Z' + logging: [] + metadata: + produce: + - metadata: + dimension: + length: 1 + name: rows + semantic_types: + - https://metadata.datadrivendiscovery.org/types/TabularRow + schema: https://metadata.datadrivendiscovery.org/schemas/v0/container.json + semantic_types: + - https://metadata.datadrivendiscovery.org/types/Table + structural_type: d3m.container.pandas.DataFrame + selector: [] + - metadata: + dimension: + length: 3 + name: columns + semantic_types: + - https://metadata.datadrivendiscovery.org/types/TabularColumn + selector: + - __ALL_ELEMENTS__ + - metadata: + name: metric + semantic_types: + - https://metadata.datadrivendiscovery.org/types/PrimaryMultiKey + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 0 + - metadata: + name: value + semantic_types: + - https://metadata.datadrivendiscovery.org/types/Score + structural_type: numpy.float64 + selector: + - __ALL_ELEMENTS__ + - 1 + - metadata: + name: normalized + semantic_types: + - https://metadata.datadrivendiscovery.org/types/Score + structural_type: 
numpy.float64 + selector: + - __ALL_ELEMENTS__ + - 2 + name: fit_multi_produce + start: '2019-06-19T09:54:49.291923Z' + status: + state: SUCCESS + random_seed: 0 + start: '2019-06-19T09:54:49.288803Z' + status: + state: SUCCESS + type: PRIMITIVE +schema: https://metadata.datadrivendiscovery.org/schemas/v0/pipeline_run.json +start: '2019-06-19T09:54:49.108507Z' +status: + state: SUCCESS +steps: +- end: '2019-06-19T09:54:49.117083Z' + method_calls: + - end: '2019-06-19T09:54:49.113071Z' + logging: [] + metadata: + produce: + - metadata: + dimension: + length: 183 + name: rows + semantic_types: + - https://metadata.datadrivendiscovery.org/types/TabularRow + schema: https://metadata.datadrivendiscovery.org/schemas/v0/container.json + semantic_types: + - https://metadata.datadrivendiscovery.org/types/Table + structural_type: d3m.container.pandas.DataFrame + selector: [] + - metadata: + dimension: + length: 30 + name: columns + semantic_types: + - https://metadata.datadrivendiscovery.org/types/TabularColumn + selector: + - __ALL_ELEMENTS__ + - metadata: + name: d3mIndex + semantic_types: + - http://schema.org/Integer + - https://metadata.datadrivendiscovery.org/types/PrimaryKey + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 0 + - metadata: + name: idnum + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/UniqueKey + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 1 + - metadata: + name: state + semantic_types: + - https://metadata.datadrivendiscovery.org/types/CategoricalData + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 2 + - metadata: + name: state2 + semantic_types: + - https://metadata.datadrivendiscovery.org/types/CategoricalData + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 3 + - metadata: + name: stfips + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 4 + - metadata: + name: zip + semantic_types: + - https://metadata.datadrivendiscovery.org/types/CategoricalData + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 5 + - metadata: + name: region + semantic_types: + - https://metadata.datadrivendiscovery.org/types/CategoricalData + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 6 + - metadata: + name: typebldg + semantic_types: + - https://metadata.datadrivendiscovery.org/types/CategoricalData + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 7 + - metadata: + name: floor + semantic_types: + - https://metadata.datadrivendiscovery.org/types/CategoricalData + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 8 + - metadata: + name: room + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 9 + - metadata: + name: basement + semantic_types: + - http://schema.org/Boolean + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 10 + - metadata: + name: windoor + semantic_types: + - 
https://metadata.datadrivendiscovery.org/types/CategoricalData + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 11 + - metadata: + name: rep + semantic_types: + - https://metadata.datadrivendiscovery.org/types/CategoricalData + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 12 + - metadata: + name: stratum + semantic_types: + - https://metadata.datadrivendiscovery.org/types/CategoricalData + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 13 + - metadata: + name: wave + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 14 + - metadata: + name: starttm + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 15 + - metadata: + name: stoptm + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 16 + - metadata: + name: startdt + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 17 + - metadata: + name: stopdt + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 18 + - metadata: + name: activity + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 19 + - metadata: + name: pcterr + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 20 + - metadata: + name: adjwt + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 21 + - metadata: + name: dupflag + semantic_types: + - http://schema.org/Boolean + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 22 + - metadata: + name: zipflag + semantic_types: + - http://schema.org/Boolean + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 23 + - metadata: + name: cntyfips + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 24 + - metadata: + name: county + semantic_types: + - https://metadata.datadrivendiscovery.org/types/CategoricalData + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 25 + - metadata: + name: fips + semantic_types: + - https://metadata.datadrivendiscovery.org/types/CategoricalData + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 26 + - metadata: + name: Uppm + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 27 + - metadata: + name: 
county_code + semantic_types: + - https://metadata.datadrivendiscovery.org/types/CategoricalData + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 28 + - metadata: + name: log_radon + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/SuggestedTarget + - https://metadata.datadrivendiscovery.org/types/Target + - https://metadata.datadrivendiscovery.org/types/TrueTarget + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 29 + name: multi_produce + start: '2019-06-19T09:54:49.109030Z' + status: + state: SUCCESS + start: '2019-06-19T09:54:49.108527Z' + status: + state: SUCCESS + type: PRIMITIVE +- end: '2019-06-19T09:54:49.204005Z' + method_calls: + - end: '2019-06-19T09:54:49.199866Z' + logging: [] + metadata: + produce: + - metadata: + dimension: + length: 183 + name: rows + semantic_types: + - https://metadata.datadrivendiscovery.org/types/TabularRow + schema: https://metadata.datadrivendiscovery.org/schemas/v0/container.json + semantic_types: + - https://metadata.datadrivendiscovery.org/types/Table + structural_type: d3m.container.pandas.DataFrame + selector: [] + - metadata: + dimension: + length: 30 + name: columns + semantic_types: + - https://metadata.datadrivendiscovery.org/types/TabularColumn + selector: + - __ALL_ELEMENTS__ + - metadata: + name: d3mIndex + semantic_types: + - http://schema.org/Integer + - https://metadata.datadrivendiscovery.org/types/PrimaryKey + structural_type: int + selector: + - __ALL_ELEMENTS__ + - 0 + - metadata: + name: idnum + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/UniqueKey + structural_type: float + selector: + - __ALL_ELEMENTS__ + - 1 + - metadata: + name: state + semantic_types: + - https://metadata.datadrivendiscovery.org/types/CategoricalData + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 2 + - metadata: + name: state2 + semantic_types: + - https://metadata.datadrivendiscovery.org/types/CategoricalData + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 3 + - metadata: + name: stfips + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: float + selector: + - __ALL_ELEMENTS__ + - 4 + - metadata: + name: zip + semantic_types: + - https://metadata.datadrivendiscovery.org/types/CategoricalData + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 5 + - metadata: + name: region + semantic_types: + - https://metadata.datadrivendiscovery.org/types/CategoricalData + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 6 + - metadata: + name: typebldg + semantic_types: + - https://metadata.datadrivendiscovery.org/types/CategoricalData + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 7 + - metadata: + name: floor + semantic_types: + - https://metadata.datadrivendiscovery.org/types/CategoricalData + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 8 + - metadata: + name: room + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: float 
+ selector: + - __ALL_ELEMENTS__ + - 9 + - metadata: + name: basement + semantic_types: + - http://schema.org/Boolean + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: int + selector: + - __ALL_ELEMENTS__ + - 10 + - metadata: + name: windoor + semantic_types: + - https://metadata.datadrivendiscovery.org/types/CategoricalData + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 11 + - metadata: + name: rep + semantic_types: + - https://metadata.datadrivendiscovery.org/types/CategoricalData + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 12 + - metadata: + name: stratum + semantic_types: + - https://metadata.datadrivendiscovery.org/types/CategoricalData + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 13 + - metadata: + name: wave + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: float + selector: + - __ALL_ELEMENTS__ + - 14 + - metadata: + name: starttm + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: float + selector: + - __ALL_ELEMENTS__ + - 15 + - metadata: + name: stoptm + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: float + selector: + - __ALL_ELEMENTS__ + - 16 + - metadata: + name: startdt + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: float + selector: + - __ALL_ELEMENTS__ + - 17 + - metadata: + name: stopdt + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: float + selector: + - __ALL_ELEMENTS__ + - 18 + - metadata: + name: activity + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: float + selector: + - __ALL_ELEMENTS__ + - 19 + - metadata: + name: pcterr + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: float + selector: + - __ALL_ELEMENTS__ + - 20 + - metadata: + name: adjwt + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: float + selector: + - __ALL_ELEMENTS__ + - 21 + - metadata: + name: dupflag + semantic_types: + - http://schema.org/Boolean + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: int + selector: + - __ALL_ELEMENTS__ + - 22 + - metadata: + name: zipflag + semantic_types: + - http://schema.org/Boolean + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: int + selector: + - __ALL_ELEMENTS__ + - 23 + - metadata: + name: cntyfips + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: float + selector: + - __ALL_ELEMENTS__ + - 24 + - metadata: + name: county + semantic_types: + - https://metadata.datadrivendiscovery.org/types/CategoricalData + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 25 + - metadata: + name: fips + semantic_types: + - https://metadata.datadrivendiscovery.org/types/CategoricalData + - 
https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 26 + - metadata: + name: Uppm + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: float + selector: + - __ALL_ELEMENTS__ + - 27 + - metadata: + name: county_code + semantic_types: + - https://metadata.datadrivendiscovery.org/types/CategoricalData + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 28 + - metadata: + name: log_radon + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/SuggestedTarget + - https://metadata.datadrivendiscovery.org/types/Target + - https://metadata.datadrivendiscovery.org/types/TrueTarget + structural_type: float + selector: + - __ALL_ELEMENTS__ + - 29 + name: multi_produce + start: '2019-06-19T09:54:49.117714Z' + status: + state: SUCCESS + start: '2019-06-19T09:54:49.117103Z' + status: + state: SUCCESS + type: PRIMITIVE +- end: '2019-06-19T09:54:49.209289Z' + method_calls: + - end: '2019-06-19T09:54:49.207372Z' + logging: [] + metadata: + produce: + - metadata: + dimension: + length: 183 + name: rows + semantic_types: + - https://metadata.datadrivendiscovery.org/types/TabularRow + schema: https://metadata.datadrivendiscovery.org/schemas/v0/container.json + semantic_types: + - https://metadata.datadrivendiscovery.org/types/Table + structural_type: d3m.container.pandas.DataFrame + selector: [] + - metadata: + dimension: + length: 12 + name: columns + semantic_types: + - https://metadata.datadrivendiscovery.org/types/TabularColumn + selector: + - __ALL_ELEMENTS__ + - metadata: + name: state + semantic_types: + - https://metadata.datadrivendiscovery.org/types/CategoricalData + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 0 + - metadata: + name: state2 + semantic_types: + - https://metadata.datadrivendiscovery.org/types/CategoricalData + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 1 + - metadata: + name: zip + semantic_types: + - https://metadata.datadrivendiscovery.org/types/CategoricalData + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 2 + - metadata: + name: region + semantic_types: + - https://metadata.datadrivendiscovery.org/types/CategoricalData + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 3 + - metadata: + name: typebldg + semantic_types: + - https://metadata.datadrivendiscovery.org/types/CategoricalData + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 4 + - metadata: + name: floor + semantic_types: + - https://metadata.datadrivendiscovery.org/types/CategoricalData + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 5 + - metadata: + name: windoor + semantic_types: + - https://metadata.datadrivendiscovery.org/types/CategoricalData + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 6 + - metadata: + name: rep + semantic_types: + - https://metadata.datadrivendiscovery.org/types/CategoricalData + - https://metadata.datadrivendiscovery.org/types/Attribute + 
structural_type: str + selector: + - __ALL_ELEMENTS__ + - 7 + - metadata: + name: stratum + semantic_types: + - https://metadata.datadrivendiscovery.org/types/CategoricalData + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 8 + - metadata: + name: county + semantic_types: + - https://metadata.datadrivendiscovery.org/types/CategoricalData + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 9 + - metadata: + name: fips + semantic_types: + - https://metadata.datadrivendiscovery.org/types/CategoricalData + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 10 + - metadata: + name: county_code + semantic_types: + - https://metadata.datadrivendiscovery.org/types/CategoricalData + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 11 + name: multi_produce + start: '2019-06-19T09:54:49.204709Z' + status: + state: SUCCESS + start: '2019-06-19T09:54:49.204040Z' + status: + state: SUCCESS + type: PRIMITIVE +- end: '2019-06-19T09:54:49.214141Z' + method_calls: + - end: '2019-06-19T09:54:49.212034Z' + logging: [] + metadata: + produce: + - metadata: + dimension: + length: 183 + name: rows + semantic_types: + - https://metadata.datadrivendiscovery.org/types/TabularRow + schema: https://metadata.datadrivendiscovery.org/schemas/v0/container.json + semantic_types: + - https://metadata.datadrivendiscovery.org/types/Table + structural_type: d3m.container.pandas.DataFrame + selector: [] + - metadata: + dimension: + length: 14 + name: columns + semantic_types: + - https://metadata.datadrivendiscovery.org/types/TabularColumn + selector: + - __ALL_ELEMENTS__ + - metadata: + name: idnum + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/UniqueKey + structural_type: float + selector: + - __ALL_ELEMENTS__ + - 0 + - metadata: + name: stfips + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: float + selector: + - __ALL_ELEMENTS__ + - 1 + - metadata: + name: room + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: float + selector: + - __ALL_ELEMENTS__ + - 2 + - metadata: + name: wave + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: float + selector: + - __ALL_ELEMENTS__ + - 3 + - metadata: + name: starttm + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: float + selector: + - __ALL_ELEMENTS__ + - 4 + - metadata: + name: stoptm + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: float + selector: + - __ALL_ELEMENTS__ + - 5 + - metadata: + name: startdt + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: float + selector: + - __ALL_ELEMENTS__ + - 6 + - metadata: + name: stopdt + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: float + selector: + - __ALL_ELEMENTS__ + - 7 + - metadata: + name: activity + semantic_types: + - http://schema.org/Float + - 
https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: float + selector: + - __ALL_ELEMENTS__ + - 8 + - metadata: + name: pcterr + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: float + selector: + - __ALL_ELEMENTS__ + - 9 + - metadata: + name: adjwt + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: float + selector: + - __ALL_ELEMENTS__ + - 10 + - metadata: + name: cntyfips + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: float + selector: + - __ALL_ELEMENTS__ + - 11 + - metadata: + name: Uppm + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: float + selector: + - __ALL_ELEMENTS__ + - 12 + - metadata: + name: log_radon + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/SuggestedTarget + - https://metadata.datadrivendiscovery.org/types/Target + - https://metadata.datadrivendiscovery.org/types/TrueTarget + structural_type: float + selector: + - __ALL_ELEMENTS__ + - 13 + name: multi_produce + start: '2019-06-19T09:54:49.209904Z' + status: + state: SUCCESS + start: '2019-06-19T09:54:49.209307Z' + status: + state: SUCCESS + type: PRIMITIVE +- end: '2019-06-19T09:54:49.217452Z' + method_calls: + - end: '2019-06-19T09:54:49.216856Z' + logging: [] + metadata: + produce: + - metadata: + dimension: + length: 183 + name: rows + semantic_types: + - https://metadata.datadrivendiscovery.org/types/TabularRow + schema: https://metadata.datadrivendiscovery.org/schemas/v0/container.json + semantic_types: + - https://metadata.datadrivendiscovery.org/types/Table + structural_type: d3m.container.pandas.DataFrame + selector: [] + - metadata: + dimension: + length: 1 + name: columns + semantic_types: + - https://metadata.datadrivendiscovery.org/types/TabularColumn + selector: + - __ALL_ELEMENTS__ + - metadata: + name: log_radon + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/SuggestedTarget + - https://metadata.datadrivendiscovery.org/types/Target + - https://metadata.datadrivendiscovery.org/types/TrueTarget + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 0 + name: multi_produce + start: '2019-06-19T09:54:49.214788Z' + status: + state: SUCCESS + start: '2019-06-19T09:54:49.214160Z' + status: + state: SUCCESS + type: PRIMITIVE +- end: '2019-06-19T09:54:49.254497Z' + method_calls: + - end: '2019-06-19T09:54:49.252367Z' + logging: [] + metadata: + produce: + - metadata: + dimension: + length: 183 + name: rows + semantic_types: + - https://metadata.datadrivendiscovery.org/types/TabularRow + schema: https://metadata.datadrivendiscovery.org/schemas/v0/container.json + semantic_types: + - https://metadata.datadrivendiscovery.org/types/Table + structural_type: d3m.container.pandas.DataFrame + selector: [] + - metadata: + dimension: + length: 14 + name: columns + semantic_types: + - https://metadata.datadrivendiscovery.org/types/TabularColumn + selector: + - __ALL_ELEMENTS__ + - metadata: + name: idnum + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/UniqueKey + structural_type: float + selector: + - __ALL_ELEMENTS__ + - 0 + - metadata: + name: stfips + semantic_types: + - http://schema.org/Float + - 
https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: float + selector: + - __ALL_ELEMENTS__ + - 1 + - metadata: + name: room + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: float + selector: + - __ALL_ELEMENTS__ + - 2 + - metadata: + name: wave + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: float + selector: + - __ALL_ELEMENTS__ + - 3 + - metadata: + name: starttm + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: float + selector: + - __ALL_ELEMENTS__ + - 4 + - metadata: + name: stoptm + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: float + selector: + - __ALL_ELEMENTS__ + - 5 + - metadata: + name: startdt + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: float + selector: + - __ALL_ELEMENTS__ + - 6 + - metadata: + name: stopdt + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: float + selector: + - __ALL_ELEMENTS__ + - 7 + - metadata: + name: activity + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: float + selector: + - __ALL_ELEMENTS__ + - 8 + - metadata: + name: pcterr + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: float + selector: + - __ALL_ELEMENTS__ + - 9 + - metadata: + name: adjwt + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: float + selector: + - __ALL_ELEMENTS__ + - 10 + - metadata: + name: cntyfips + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: float + selector: + - __ALL_ELEMENTS__ + - 11 + - metadata: + name: Uppm + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/Attribute + structural_type: float + selector: + - __ALL_ELEMENTS__ + - 12 + - metadata: + name: log_radon + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/SuggestedTarget + - https://metadata.datadrivendiscovery.org/types/Target + - https://metadata.datadrivendiscovery.org/types/TrueTarget + structural_type: float + selector: + - __ALL_ELEMENTS__ + - 13 + name: multi_produce + start: '2019-06-19T09:54:49.218199Z' + status: + state: SUCCESS + start: '2019-06-19T09:54:49.217470Z' + status: + state: SUCCESS + type: PRIMITIVE +- end: '2019-06-19T09:54:49.265815Z' + method_calls: + - end: '2019-06-19T09:54:49.264935Z' + logging: [] + metadata: + produce: + - metadata: + dimension: + length: 183 + name: rows + semantic_types: + - https://metadata.datadrivendiscovery.org/types/TabularRow + schema: https://metadata.datadrivendiscovery.org/schemas/v0/container.json + semantic_types: + - https://metadata.datadrivendiscovery.org/types/Table + structural_type: d3m.container.pandas.DataFrame + selector: [] + - metadata: + dimension: + length: 3 + name: columns + semantic_types: + - https://metadata.datadrivendiscovery.org/types/TabularColumn + selector: + - __ALL_ELEMENTS__ + - metadata: + name: idnum + semantic_types: + - http://schema.org/Float + - 
https://metadata.datadrivendiscovery.org/types/UniqueKey + structural_type: float + selector: + - __ALL_ELEMENTS__ + - 0 + - metadata: + name: log_radon + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/SuggestedTarget + - https://metadata.datadrivendiscovery.org/types/Target + - https://metadata.datadrivendiscovery.org/types/PredictedTarget + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 1 + - metadata: + name: log_radon + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/SuggestedTarget + - https://metadata.datadrivendiscovery.org/types/Target + - https://metadata.datadrivendiscovery.org/types/TrueTarget + structural_type: float + selector: + - __ALL_ELEMENTS__ + - 2 + name: multi_produce + start: '2019-06-19T09:54:49.255748Z' + status: + state: SUCCESS + start: '2019-06-19T09:54:49.254517Z' + status: + state: SUCCESS + type: PRIMITIVE +- end: '2019-06-19T09:54:49.273222Z' + method_calls: + - end: '2019-06-19T09:54:49.272461Z' + logging: [] + metadata: + produce: + - metadata: + dimension: + length: 183 + name: rows + semantic_types: + - https://metadata.datadrivendiscovery.org/types/TabularRow + schema: https://metadata.datadrivendiscovery.org/schemas/v0/container.json + semantic_types: + - https://metadata.datadrivendiscovery.org/types/Table + structural_type: d3m.container.pandas.DataFrame + selector: [] + - metadata: + dimension: + length: 2 + name: columns + semantic_types: + - https://metadata.datadrivendiscovery.org/types/TabularColumn + selector: + - __ALL_ELEMENTS__ + - metadata: + name: d3mIndex + semantic_types: + - http://schema.org/Integer + - https://metadata.datadrivendiscovery.org/types/PrimaryKey + structural_type: int + selector: + - __ALL_ELEMENTS__ + - 0 + - metadata: + name: log_radon + semantic_types: + - http://schema.org/Float + - https://metadata.datadrivendiscovery.org/types/SuggestedTarget + - https://metadata.datadrivendiscovery.org/types/Target + - https://metadata.datadrivendiscovery.org/types/PredictedTarget + structural_type: str + selector: + - __ALL_ELEMENTS__ + - 1 + name: multi_produce + start: '2019-06-19T09:54:49.266417Z' + status: + state: SUCCESS + start: '2019-06-19T09:54:49.265835Z' + status: + state: SUCCESS + type: PRIMITIVE diff --git a/tods/common-primitives/pipeline_runs/schema_discovery.profiler.Common/pipeline_run_extract_structural_types.yml.gz b/tods/common-primitives/pipeline_runs/schema_discovery.profiler.Common/pipeline_run_extract_structural_types.yml.gz new file mode 120000 index 0000000..91f49b3 --- /dev/null +++ b/tods/common-primitives/pipeline_runs/schema_discovery.profiler.Common/pipeline_run_extract_structural_types.yml.gz @@ -0,0 +1 @@ +../data_transformation.extract_columns_by_structural_types.Common/pipeline_run.yml.gz \ No newline at end of file diff --git a/tods/common-primitives/pipeline_runs/schema_discovery.profiler.Common/pipeline_run_group_field_compose.yml.gz b/tods/common-primitives/pipeline_runs/schema_discovery.profiler.Common/pipeline_run_group_field_compose.yml.gz new file mode 120000 index 0000000..0a4dd35 --- /dev/null +++ b/tods/common-primitives/pipeline_runs/schema_discovery.profiler.Common/pipeline_run_group_field_compose.yml.gz @@ -0,0 +1 @@ +../data_transformation.grouping_field_compose.Common/pipeline_run.yml.gz \ No newline at end of file diff --git a/tods/common-primitives/pipelines/classification.light_gbm.DataFrameCommon/d2473bbc-7839-4deb-9ba4-4ff4bc9b0bde.json 
b/tods/common-primitives/pipelines/classification.light_gbm.DataFrameCommon/d2473bbc-7839-4deb-9ba4-4ff4bc9b0bde.json new file mode 100644 index 0000000..7ee4edb --- /dev/null +++ b/tods/common-primitives/pipelines/classification.light_gbm.DataFrameCommon/d2473bbc-7839-4deb-9ba4-4ff4bc9b0bde.json @@ -0,0 +1,246 @@ +{ + "context": "TESTING", + "created": "2019-02-12T01:09:44.343543Z", + "id": "d2473bbc-7839-4deb-9ba4-4ff4bc9b0bde", + "inputs": [ + { + "name": "inputs" + } + ], + "outputs": [ + { + "data": "steps.7.produce", + "name": "output predictions" + } + ], + "schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", + "steps": [ + { + "arguments": { + "inputs": { + "data": "inputs.0", + "type": "CONTAINER" + } + }, + "outputs": [ + { + "id": "produce" + } + ], + "primitive": { + "id": "4b42ce1e-9b98-4a25-b68e-fad13311eb65", + "name": "Extract a DataFrame from a Dataset", + "python_path": "d3m.primitives.data_transformation.dataset_to_dataframe.Common", + "version": "0.3.0" + }, + "type": "PRIMITIVE" + }, + { + "arguments": { + "inputs": { + "data": "steps.0.produce", + "type": "CONTAINER" + } + }, + "hyperparams": { + "parse_semantic_types": { + "data": [ + "http://schema.org/Boolean", + "http://schema.org/Integer", + "http://schema.org/Float", + "https://metadata.datadrivendiscovery.org/types/FloatVector", + "http://schema.org/DateTime" + ], + "type": "VALUE" + } + }, + "outputs": [ + { + "id": "produce" + } + ], + "primitive": { + "id": "d510cb7a-1782-4f51-b44c-58f0236e47c7", + "name": "Parses strings into their types", + "python_path": "d3m.primitives.data_transformation.column_parser.Common", + "version": "0.6.0" + }, + "type": "PRIMITIVE" + }, + { + "arguments": { + "inputs": { + "data": "steps.1.produce", + "type": "CONTAINER" + } + }, + "hyperparams": { + "semantic_types": { + "data": [ + "https://metadata.datadrivendiscovery.org/types/CategoricalData" + ], + "type": "VALUE" + } + }, + "outputs": [ + { + "id": "produce" + } + ], + "primitive": { + "id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1", + "name": "Extracts columns by semantic type", + "python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common", + "version": "0.3.0" + }, + "type": "PRIMITIVE" + }, + { + "arguments": { + "inputs": { + "data": "steps.1.produce", + "type": "CONTAINER" + } + }, + "hyperparams": { + "exclude_columns": { + "data": [ + 0 + ], + "type": "VALUE" + }, + "semantic_types": { + "data": [ + "http://schema.org/Integer", + "http://schema.org/Float" + ], + "type": "VALUE" + } + }, + "outputs": [ + { + "id": "produce" + } + ], + "primitive": { + "id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1", + "name": "Extracts columns by semantic type", + "python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common", + "version": "0.3.0" + }, + "type": "PRIMITIVE" + }, + { + "arguments": { + "inputs": { + "data": "steps.0.produce", + "type": "CONTAINER" + } + }, + "hyperparams": { + "semantic_types": { + "data": [ + "https://metadata.datadrivendiscovery.org/types/TrueTarget" + ], + "type": "VALUE" + } + }, + "outputs": [ + { + "id": "produce" + } + ], + "primitive": { + "id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1", + "name": "Extracts columns by semantic type", + "python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common", + "version": "0.3.0" + }, + "type": "PRIMITIVE" + }, + { + "arguments": { + "inputs": { + "data": "steps.3.produce", + "type": "CONTAINER" + } + }, + "hyperparams": { + "return_result": { 
+ "data": "replace", + "type": "VALUE" + }, + "use_semantic_types": { + "data": true, + "type": "VALUE" + } + }, + "outputs": [ + { + "id": "produce" + } + ], + "primitive": { + "id": "d016df89-de62-3c53-87ed-c06bb6a23cde", + "name": "sklearn.impute.SimpleImputer", + "python_path": "d3m.primitives.data_cleaning.imputer.SKlearn", + "version": "2019.6.7" + }, + "type": "PRIMITIVE" + }, + { + "arguments": { + "inputs": { + "data": "steps.5.produce", + "type": "CONTAINER" + }, + "outputs": { + "data": "steps.4.produce", + "type": "CONTAINER" + } + }, + "hyperparams": { + "return_result": { + "data": "replace", + "type": "VALUE" + } + }, + "outputs": [ + { + "id": "produce" + } + ], + "primitive": { + "id": "259aa747-795c-435e-8e33-8c32a4c83c6b", + "name": "LightGBM GBTree classifier", + "python_path": "d3m.primitives.classification.light_gbm.Common", + "version": "0.1.0" + }, + "type": "PRIMITIVE" + }, + { + "arguments": { + "inputs": { + "data": "steps.6.produce", + "type": "CONTAINER" + }, + "reference": { + "data": "steps.1.produce", + "type": "CONTAINER" + } + }, + "outputs": [ + { + "id": "produce" + } + ], + "primitive": { + "id": "8d38b340-f83f-4877-baaa-162f8e551736", + "name": "Construct pipeline predictions output", + "python_path": "d3m.primitives.data_transformation.construct_predictions.Common", + "version": "0.3.0" + }, + "type": "PRIMITIVE" + } + ] +} diff --git a/tods/common-primitives/pipelines/classification.random_forest.DataFrameCommon/b523335c-0c47-4d02-a582-f69609cde1e8.json b/tods/common-primitives/pipelines/classification.random_forest.DataFrameCommon/b523335c-0c47-4d02-a582-f69609cde1e8.json new file mode 120000 index 0000000..51266fd --- /dev/null +++ b/tods/common-primitives/pipelines/classification.random_forest.DataFrameCommon/b523335c-0c47-4d02-a582-f69609cde1e8.json @@ -0,0 +1 @@ +../data_transformation.extract_columns_by_structural_types.Common/b523335c-0c47-4d02-a582-f69609cde1e8.json \ No newline at end of file diff --git a/tods/common-primitives/pipelines/classification.random_forest.DataFrameCommon/ccad0f9c-130e-4063-a91e-ea65a18cb041.yaml b/tods/common-primitives/pipelines/classification.random_forest.DataFrameCommon/ccad0f9c-130e-4063-a91e-ea65a18cb041.yaml new file mode 100644 index 0000000..470d2be --- /dev/null +++ b/tods/common-primitives/pipelines/classification.random_forest.DataFrameCommon/ccad0f9c-130e-4063-a91e-ea65a18cb041.yaml @@ -0,0 +1,110 @@ +id: ccad0f9c-130e-4063-a91e-ea65a18cb041 +schema: https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json +source: + name: Mitar +created: "2019-06-05T11:48:52.806069Z" +context: TESTING +name: Random Forest classifier pipeline +description: | + A simple pipeline which runs Random Forest classifier on tabular data. +inputs: + - name: input dataset +outputs: + - name: predictions + data: steps.5.produce +steps: + # Step 0. + - type: PRIMITIVE + primitive: + id: f31f8c1f-d1c5-43e5-a4b2-2ae4a761ef2e + version: 0.2.0 + python_path: d3m.primitives.data_transformation.denormalize.Common + name: Denormalize datasets + arguments: + inputs: + type: CONTAINER + data: inputs.0 + outputs: + - id: produce + # Step 1. + - type: PRIMITIVE + primitive: + id: 4b42ce1e-9b98-4a25-b68e-fad13311eb65 + version: 0.3.0 + python_path: d3m.primitives.data_transformation.dataset_to_dataframe.Common + name: Extract a DataFrame from a Dataset + arguments: + inputs: + type: CONTAINER + data: steps.0.produce + outputs: + - id: produce + # Step 2. 
+ - type: PRIMITIVE + primitive: + id: d510cb7a-1782-4f51-b44c-58f0236e47c7 + version: 0.6.0 + python_path: d3m.primitives.data_transformation.column_parser.Common + name: Parses strings into their types + arguments: + inputs: + type: CONTAINER + data: steps.1.produce + outputs: + - id: produce + # Step 3. + - type: PRIMITIVE + primitive: + id: d016df89-de62-3c53-87ed-c06bb6a23cde + version: 2019.6.7 + python_path: d3m.primitives.data_cleaning.imputer.SKlearn + name: sklearn.impute.SimpleImputer + arguments: + inputs: + type: CONTAINER + data: steps.2.produce + outputs: + - id: produce + hyperparams: + use_semantic_types: + type: VALUE + data: true + return_result: + type: VALUE + data: replace + # Step 4. + - type: PRIMITIVE + primitive: + id: 37c2b19d-bdab-4a30-ba08-6be49edcc6af + version: 0.4.0 + python_path: d3m.primitives.classification.random_forest.Common + name: Random forest classifier + arguments: + inputs: + type: CONTAINER + data: steps.3.produce + outputs: + type: CONTAINER + data: steps.3.produce + outputs: + - id: produce + hyperparams: + return_result: + type: VALUE + data: replace + # Step 5. + - type: PRIMITIVE + primitive: + id: 8d38b340-f83f-4877-baaa-162f8e551736 + version: 0.3.0 + python_path: d3m.primitives.data_transformation.construct_predictions.Common + name: Construct pipeline predictions output + arguments: + inputs: + type: CONTAINER + data: steps.4.produce + reference: + type: CONTAINER + data: steps.2.produce + outputs: + - id: produce diff --git a/tods/common-primitives/pipelines/classification.xgboost_dart.DataFrameCommon/b7a24816-2518-4073-9c45-b97f2b2fee30.json b/tods/common-primitives/pipelines/classification.xgboost_dart.DataFrameCommon/b7a24816-2518-4073-9c45-b97f2b2fee30.json new file mode 100644 index 0000000..b5ba302 --- /dev/null +++ b/tods/common-primitives/pipelines/classification.xgboost_dart.DataFrameCommon/b7a24816-2518-4073-9c45-b97f2b2fee30.json @@ -0,0 +1,246 @@ +{ + "context": "TESTING", + "created": "2019-02-12T01:33:29.921236Z", + "id": "b7a24816-2518-4073-9c45-b97f2b2fee30", + "inputs": [ + { + "name": "inputs" + } + ], + "outputs": [ + { + "data": "steps.7.produce", + "name": "output predictions" + } + ], + "schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", + "steps": [ + { + "arguments": { + "inputs": { + "data": "inputs.0", + "type": "CONTAINER" + } + }, + "outputs": [ + { + "id": "produce" + } + ], + "primitive": { + "id": "4b42ce1e-9b98-4a25-b68e-fad13311eb65", + "name": "Extract a DataFrame from a Dataset", + "python_path": "d3m.primitives.data_transformation.dataset_to_dataframe.Common", + "version": "0.3.0" + }, + "type": "PRIMITIVE" + }, + { + "arguments": { + "inputs": { + "data": "steps.0.produce", + "type": "CONTAINER" + } + }, + "hyperparams": { + "parse_semantic_types": { + "data": [ + "http://schema.org/Boolean", + "http://schema.org/Integer", + "http://schema.org/Float", + "https://metadata.datadrivendiscovery.org/types/FloatVector", + "http://schema.org/DateTime" + ], + "type": "VALUE" + } + }, + "outputs": [ + { + "id": "produce" + } + ], + "primitive": { + "id": "d510cb7a-1782-4f51-b44c-58f0236e47c7", + "name": "Parses strings into their types", + "python_path": "d3m.primitives.data_transformation.column_parser.Common", + "version": "0.6.0" + }, + "type": "PRIMITIVE" + }, + { + "arguments": { + "inputs": { + "data": "steps.1.produce", + "type": "CONTAINER" + } + }, + "hyperparams": { + "semantic_types": { + "data": [ + "https://metadata.datadrivendiscovery.org/types/CategoricalData" + ], + 
"type": "VALUE" + } + }, + "outputs": [ + { + "id": "produce" + } + ], + "primitive": { + "id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1", + "name": "Extracts columns by semantic type", + "python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common", + "version": "0.3.0" + }, + "type": "PRIMITIVE" + }, + { + "arguments": { + "inputs": { + "data": "steps.1.produce", + "type": "CONTAINER" + } + }, + "hyperparams": { + "exclude_columns": { + "data": [ + 0 + ], + "type": "VALUE" + }, + "semantic_types": { + "data": [ + "http://schema.org/Integer", + "http://schema.org/Float" + ], + "type": "VALUE" + } + }, + "outputs": [ + { + "id": "produce" + } + ], + "primitive": { + "id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1", + "name": "Extracts columns by semantic type", + "python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common", + "version": "0.3.0" + }, + "type": "PRIMITIVE" + }, + { + "arguments": { + "inputs": { + "data": "steps.0.produce", + "type": "CONTAINER" + } + }, + "hyperparams": { + "semantic_types": { + "data": [ + "https://metadata.datadrivendiscovery.org/types/TrueTarget" + ], + "type": "VALUE" + } + }, + "outputs": [ + { + "id": "produce" + } + ], + "primitive": { + "id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1", + "name": "Extracts columns by semantic type", + "python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common", + "version": "0.3.0" + }, + "type": "PRIMITIVE" + }, + { + "arguments": { + "inputs": { + "data": "steps.3.produce", + "type": "CONTAINER" + } + }, + "hyperparams": { + "return_result": { + "data": "replace", + "type": "VALUE" + }, + "use_semantic_types": { + "data": true, + "type": "VALUE" + } + }, + "outputs": [ + { + "id": "produce" + } + ], + "primitive": { + "id": "d016df89-de62-3c53-87ed-c06bb6a23cde", + "name": "sklearn.impute.SimpleImputer", + "python_path": "d3m.primitives.data_cleaning.imputer.SKlearn", + "version": "2019.6.7" + }, + "type": "PRIMITIVE" + }, + { + "arguments": { + "inputs": { + "data": "steps.5.produce", + "type": "CONTAINER" + }, + "outputs": { + "data": "steps.4.produce", + "type": "CONTAINER" + } + }, + "hyperparams": { + "return_result": { + "data": "replace", + "type": "VALUE" + } + }, + "outputs": [ + { + "id": "produce" + } + ], + "primitive": { + "id": "7476950e-4373-4cf5-a852-7e16afb8e098", + "name": "XGBoost DART classifier", + "python_path": "d3m.primitives.classification.xgboost_dart.Common", + "version": "0.1.0" + }, + "type": "PRIMITIVE" + }, + { + "arguments": { + "inputs": { + "data": "steps.6.produce", + "type": "CONTAINER" + }, + "reference": { + "data": "steps.1.produce", + "type": "CONTAINER" + } + }, + "outputs": [ + { + "id": "produce" + } + ], + "primitive": { + "id": "8d38b340-f83f-4877-baaa-162f8e551736", + "name": "Construct pipeline predictions output", + "python_path": "d3m.primitives.data_transformation.construct_predictions.Common", + "version": "0.3.0" + }, + "type": "PRIMITIVE" + } + ] +} diff --git a/tods/common-primitives/pipelines/classification.xgboost_gbtree.DataFrameCommon/4d402450-2562-48cc-93fd-719fb658c43c.json b/tods/common-primitives/pipelines/classification.xgboost_gbtree.DataFrameCommon/4d402450-2562-48cc-93fd-719fb658c43c.json new file mode 100644 index 0000000..629964e --- /dev/null +++ b/tods/common-primitives/pipelines/classification.xgboost_gbtree.DataFrameCommon/4d402450-2562-48cc-93fd-719fb658c43c.json @@ -0,0 +1,246 @@ +{ + "context": "TESTING", + "created": "2019-02-12T01:18:47.753202Z", + "id": 
"4d402450-2562-48cc-93fd-719fb658c43c", + "inputs": [ + { + "name": "inputs" + } + ], + "outputs": [ + { + "data": "steps.7.produce", + "name": "output predictions" + } + ], + "schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", + "steps": [ + { + "arguments": { + "inputs": { + "data": "inputs.0", + "type": "CONTAINER" + } + }, + "outputs": [ + { + "id": "produce" + } + ], + "primitive": { + "id": "4b42ce1e-9b98-4a25-b68e-fad13311eb65", + "name": "Extract a DataFrame from a Dataset", + "python_path": "d3m.primitives.data_transformation.dataset_to_dataframe.Common", + "version": "0.3.0" + }, + "type": "PRIMITIVE" + }, + { + "arguments": { + "inputs": { + "data": "steps.0.produce", + "type": "CONTAINER" + } + }, + "hyperparams": { + "parse_semantic_types": { + "data": [ + "http://schema.org/Boolean", + "http://schema.org/Integer", + "http://schema.org/Float", + "https://metadata.datadrivendiscovery.org/types/FloatVector", + "http://schema.org/DateTime" + ], + "type": "VALUE" + } + }, + "outputs": [ + { + "id": "produce" + } + ], + "primitive": { + "id": "d510cb7a-1782-4f51-b44c-58f0236e47c7", + "name": "Parses strings into their types", + "python_path": "d3m.primitives.data_transformation.column_parser.Common", + "version": "0.6.0" + }, + "type": "PRIMITIVE" + }, + { + "arguments": { + "inputs": { + "data": "steps.1.produce", + "type": "CONTAINER" + } + }, + "hyperparams": { + "semantic_types": { + "data": [ + "https://metadata.datadrivendiscovery.org/types/CategoricalData" + ], + "type": "VALUE" + } + }, + "outputs": [ + { + "id": "produce" + } + ], + "primitive": { + "id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1", + "name": "Extracts columns by semantic type", + "python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common", + "version": "0.3.0" + }, + "type": "PRIMITIVE" + }, + { + "arguments": { + "inputs": { + "data": "steps.1.produce", + "type": "CONTAINER" + } + }, + "hyperparams": { + "exclude_columns": { + "data": [ + 0 + ], + "type": "VALUE" + }, + "semantic_types": { + "data": [ + "http://schema.org/Integer", + "http://schema.org/Float" + ], + "type": "VALUE" + } + }, + "outputs": [ + { + "id": "produce" + } + ], + "primitive": { + "id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1", + "name": "Extracts columns by semantic type", + "python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common", + "version": "0.3.0" + }, + "type": "PRIMITIVE" + }, + { + "arguments": { + "inputs": { + "data": "steps.0.produce", + "type": "CONTAINER" + } + }, + "hyperparams": { + "semantic_types": { + "data": [ + "https://metadata.datadrivendiscovery.org/types/TrueTarget" + ], + "type": "VALUE" + } + }, + "outputs": [ + { + "id": "produce" + } + ], + "primitive": { + "id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1", + "name": "Extracts columns by semantic type", + "python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common", + "version": "0.3.0" + }, + "type": "PRIMITIVE" + }, + { + "arguments": { + "inputs": { + "data": "steps.3.produce", + "type": "CONTAINER" + } + }, + "hyperparams": { + "return_result": { + "data": "replace", + "type": "VALUE" + }, + "use_semantic_types": { + "data": true, + "type": "VALUE" + } + }, + "outputs": [ + { + "id": "produce" + } + ], + "primitive": { + "id": "d016df89-de62-3c53-87ed-c06bb6a23cde", + "name": "sklearn.impute.SimpleImputer", + "python_path": "d3m.primitives.data_cleaning.imputer.SKlearn", + "version": "2019.6.7" + }, + "type": "PRIMITIVE" + }, + { + 
"arguments": { + "inputs": { + "data": "steps.5.produce", + "type": "CONTAINER" + }, + "outputs": { + "data": "steps.4.produce", + "type": "CONTAINER" + } + }, + "hyperparams": { + "return_result": { + "data": "replace", + "type": "VALUE" + } + }, + "outputs": [ + { + "id": "produce" + } + ], + "primitive": { + "id": "fe0841b7-6e70-4bc3-a56c-0670a95ebc6a", + "name": "XGBoost GBTree classifier", + "python_path": "d3m.primitives.classification.xgboost_gbtree.Common", + "version": "0.1.0" + }, + "type": "PRIMITIVE" + }, + { + "arguments": { + "inputs": { + "data": "steps.6.produce", + "type": "CONTAINER" + }, + "reference": { + "data": "steps.1.produce", + "type": "CONTAINER" + } + }, + "outputs": [ + { + "id": "produce" + } + ], + "primitive": { + "id": "8d38b340-f83f-4877-baaa-162f8e551736", + "name": "Construct pipeline predictions output", + "python_path": "d3m.primitives.data_transformation.construct_predictions.Common", + "version": "0.3.0" + }, + "type": "PRIMITIVE" + } + ] +} diff --git a/tods/common-primitives/pipelines/data_augmentation.datamart_augmentation.Common/3afd2bd2-7ba1-4ac1-928f-fad0c39a05e5.json b/tods/common-primitives/pipelines/data_augmentation.datamart_augmentation.Common/3afd2bd2-7ba1-4ac1-928f-fad0c39a05e5.json new file mode 100644 index 0000000..b873182 --- /dev/null +++ b/tods/common-primitives/pipelines/data_augmentation.datamart_augmentation.Common/3afd2bd2-7ba1-4ac1-928f-fad0c39a05e5.json @@ -0,0 +1,522 @@ +{ + "id": "3afd2bd2-7ba1-4ac1-928f-fad0c39a05e5", + "schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", + "created": "2019-11-06T04:22:27.325146Z", + "inputs": [ + { + "name": "input dataset" + } + ], + "outputs": [ + { + "data": "steps.17.produce", + "name": "predictions of input dataset" + } + ], + "steps": [ + { + "type": "PRIMITIVE", + "primitive": { + "id": "f31f8c1f-d1c5-43e5-a4b2-2ae4a761ef2e", + "version": "0.2.0", + "python_path": "d3m.primitives.data_transformation.denormalize.Common", + "name": "Denormalize datasets", + "digest": "80ddde3709877015f7e5d262621fb4c25a2db0c7ba03c62c4fdf80cd3ede5d5b" + }, + "arguments": { + "inputs": { + "type": "CONTAINER", + "data": "inputs.0" + } + }, + "outputs": [ + { + "id": "produce" + } + ], + "hyperparams": { + "starting_resource": { + "type": "VALUE", + "data": null + }, + "recursive": { + "type": "VALUE", + "data": true + }, + "many_to_many": { + "type": "VALUE", + "data": false + }, + "discard_not_joined_tabular_resources": { + "type": "VALUE", + "data": false + } + } + }, + { + "type": "PRIMITIVE", + "primitive": { + "id": "fe0f1ac8-1d39-463a-b344-7bd498a31b91", + "version": "0.1", + "python_path": "d3m.primitives.data_augmentation.datamart_augmentation.Common", + "name": "Perform dataset augmentation using Datamart", + "digest": "5f3eda98f6a45530343707fd3e2159879d1ad4550f589a5596389c41fab83d47" + }, + "arguments": { + "inputs": { + "type": "CONTAINER", + "data": "steps.0.produce" + } + }, + "outputs": [ + { + "id": "produce" + } + ], + "hyperparams": { + "system_identifier": { + "type": "VALUE", + "data": "ISI" + }, + "search_result": { + "type": "VALUE", + "data": "{\"augmentation\": {\"left_columns\": [[19]], \"right_columns\": [[12]], \"type\": \"join\"}, \"id\": \"wikidata_search_on___P1082___P1449___P1451___P1549___P1705___P1813___P2044___P2046___P2927___P571___P6591___with_column_STABBR_wikidata\", \"materialize_info\": \"{\\\"id\\\": 
\\\"wikidata_search_on___P1082___P1449___P1451___P1549___P1705___P1813___P2044___P2046___P2927___P571___P6591___with_column_STABBR_wikidata\\\", \\\"score\\\": 1, \\\"metadata\\\": {\\\"connection_url\\\": \\\"http://dsbox02.isi.edu:9000\\\", \\\"search_result\\\": {\\\"p_nodes_needed\\\": [\\\"P1082\\\", \\\"P1449\\\", \\\"P1451\\\", \\\"P1549\\\", \\\"P1705\\\", \\\"P1813\\\", \\\"P2044\\\", \\\"P2046\\\", \\\"P2927\\\", \\\"P571\\\", \\\"P6591\\\"], \\\"target_q_node_column_name\\\": \\\"STABBR_wikidata\\\"}, \\\"query_json\\\": null, \\\"search_type\\\": \\\"wikidata\\\"}, \\\"augmentation\\\": {\\\"properties\\\": \\\"join\\\", \\\"left_columns\\\": [19], \\\"right_columns\\\": [12]}, \\\"datamart_type\\\": \\\"isi\\\"}\", \"metadata\": [{\"metadata\": {\"dimension\": {\"length\": 3243, \"name\": \"rows\", \"semantic_types\": [\"https://metadata.datadrivendiscovery.org/types/TabularRow\"]}, \"schema\": \"https://metadata.datadrivendiscovery.org/schemas/v0/container.json\", \"semantic_types\": [\"https://metadata.datadrivendiscovery.org/types/Table\"], \"structural_type\": \"d3m.container.pandas.DataFrame\"}, \"selector\": []}, {\"metadata\": {\"dimension\": {\"length\": 11, \"name\": \"columns\", \"semantic_types\": [\"https://metadata.datadrivendiscovery.org/types/TabularColumn\"]}}, \"selector\": [\"__ALL_ELEMENTS__\"]}, {\"metadata\": {\"P_node\": \"P1082\", \"name\": \"population_for_STABBR_wikidata\", \"semantic_types\": [true, [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\", \"https://metadata.datadrivendiscovery.org/types/Datamart_augmented_column\"]], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 0]}, {\"metadata\": {\"P_node\": \"P1449\", \"name\": \"nickname_for_STABBR_wikidata\", \"semantic_types\": [true, [\"http://schema.org/Text\", \"https://metadata.datadrivendiscovery.org/types/Attribute\", \"https://metadata.datadrivendiscovery.org/types/Datamart_augmented_column\"]], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 1]}, {\"metadata\": {\"P_node\": \"P1451\", \"name\": \"motto text_for_STABBR_wikidata\", \"semantic_types\": [true, [\"http://schema.org/Text\", \"https://metadata.datadrivendiscovery.org/types/Attribute\", \"https://metadata.datadrivendiscovery.org/types/Datamart_augmented_column\"]], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 2]}, {\"metadata\": {\"P_node\": \"P1549\", \"name\": \"demonym_for_STABBR_wikidata\", \"semantic_types\": [true, [\"http://schema.org/Text\", \"https://metadata.datadrivendiscovery.org/types/Attribute\", \"https://metadata.datadrivendiscovery.org/types/Datamart_augmented_column\"]], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 3]}, {\"metadata\": {\"P_node\": \"P1705\", \"name\": \"native label_for_STABBR_wikidata\", \"semantic_types\": [true, [\"http://schema.org/Text\", \"https://metadata.datadrivendiscovery.org/types/Attribute\", \"https://metadata.datadrivendiscovery.org/types/Datamart_augmented_column\"]], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 4]}, {\"metadata\": {\"P_node\": \"P1813\", \"name\": \"short name_for_STABBR_wikidata\", \"semantic_types\": [true, [\"http://schema.org/Text\", \"https://metadata.datadrivendiscovery.org/types/Attribute\", \"https://metadata.datadrivendiscovery.org/types/Datamart_augmented_column\"]], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 5]}, {\"metadata\": {\"P_node\": \"P2044\", \"name\": \"elevation above sea 
level_for_STABBR_wikidata\", \"semantic_types\": [true, [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\", \"https://metadata.datadrivendiscovery.org/types/Datamart_augmented_column\"]], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 6]}, {\"metadata\": {\"P_node\": \"P2046\", \"name\": \"area_for_STABBR_wikidata\", \"semantic_types\": [true, [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\", \"https://metadata.datadrivendiscovery.org/types/Datamart_augmented_column\"]], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 7]}, {\"metadata\": {\"P_node\": \"P2927\", \"name\": \"water as percent of area_for_STABBR_wikidata\", \"semantic_types\": [true, [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\", \"https://metadata.datadrivendiscovery.org/types/Datamart_augmented_column\"]], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 8]}, {\"metadata\": {\"P_node\": \"P571\", \"name\": \"inception_for_STABBR_wikidata\", \"semantic_types\": [true, [\"http://schema.org/DateTime\", \"https://metadata.datadrivendiscovery.org/types/Attribute\", \"https://metadata.datadrivendiscovery.org/types/Datamart_augmented_column\"]], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 9]}, {\"metadata\": {\"P_node\": \"P6591\", \"name\": \"maximum temperature record_for_STABBR_wikidata\", \"semantic_types\": [true, [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\", \"https://metadata.datadrivendiscovery.org/types/Datamart_augmented_column\"]], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 10]}, {\"metadata\": {\"name\": \"q_node\", \"semantic_types\": [\"https://metadata.datadrivendiscovery.org/types/CategoricalData\", \"https://metadata.datadrivendiscovery.org/types/Attribute\", \"http://wikidata.org/qnode\", \"https://metadata.datadrivendiscovery.org/types/Datamart_augmented_column\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 11]}, {\"metadata\": {\"name\": \"joining_pairs\", \"semantic_types\": [\"http://schema.org/Integer\", \"https://metadata.datadrivendiscovery.org/types/Attribute\", \"https://metadata.datadrivendiscovery.org/types/Datamart_augmented_column\"], \"structural_type\": \"list\"}, \"selector\": [\"__ALL_ELEMENTS__\", 12]}], \"score\": 1, \"summary\": {\"Columns\": [\"[0] population\", \"[1] nickname\", \"[2] motto text\", \"[3] demonym\", \"[4] native label\", \"[5] short name\", \"[6] elevation above sea level\", \"[7] area\", \"[8] water as percent of area\", \"[9] inception\", \"[10] maximum temperature record\"], \"Datamart ID\": \"wikidata_search_on___P1082___P1449___P1451___P1549___P1705___P1813___P2044___P2046___P2927___P571___P6591___with_column_STABBR_wikidata\", \"Recommend Join Columns\": \"STABBR_wikidata\", \"Score\": \"1\", \"URL\": \"None\", \"title\": \"wikidata search result for STABBR_wikidata\"}, \"supplied_id\": \"DA_college_debt_dataset_TRAIN\", \"supplied_resource_id\": \"learningData\"}" + } + } + }, + { + "type": "PRIMITIVE", + "primitive": { + "id": "dsbox-featurizer-do-nothing-dataset-version", + "version": "1.5.3", + "python_path": "d3m.primitives.data_preprocessing.do_nothing_for_dataset.DSBOX", + "name": "DSBox do-nothing primitive dataset version", + "digest": "c42dca1f4110288d5399d05b6dcd776a63e110d8d266521622581b500b08cee2" + }, + "arguments": { + "inputs": { + "type": "CONTAINER", + 
"data": "steps.1.produce" + } + }, + "outputs": [ + { + "id": "produce" + } + ] + }, + { + "type": "PRIMITIVE", + "primitive": { + "id": "fe0f1ac8-1d39-463a-b344-7bd498a31b91", + "version": "0.1", + "python_path": "d3m.primitives.data_augmentation.datamart_augmentation.Common", + "name": "Perform dataset augmentation using Datamart", + "digest": "5f3eda98f6a45530343707fd3e2159879d1ad4550f589a5596389c41fab83d47" + }, + "arguments": { + "inputs": { + "type": "CONTAINER", + "data": "steps.2.produce" + } + }, + "outputs": [ + { + "id": "produce" + } + ], + "hyperparams": { + "system_identifier": { + "type": "VALUE", + "data": "ISI" + }, + "search_result": { + "type": "VALUE", + "data": "{\"augmentation\": {\"left_columns\": [[2]], \"right_columns\": [[3]], \"type\": \"join\"}, \"id\": \"D4cb70062-77ed-4097-a486-0b43ffe81463\", \"materialize_info\": \"{\\\"id\\\": \\\"D4cb70062-77ed-4097-a486-0b43ffe81463\\\", \\\"score\\\": 0.9398390424662831, \\\"metadata\\\": {\\\"connection_url\\\": \\\"http://dsbox02.isi.edu:9000\\\", \\\"search_result\\\": {\\\"variable\\\": {\\\"type\\\": \\\"uri\\\", \\\"value\\\": \\\"http://www.wikidata.org/entity/statement/D4cb70062-77ed-4097-a486-0b43ffe81463-db0080de-12d9-4189-b13a-2a46fa63a227\\\"}, \\\"dataset\\\": {\\\"type\\\": \\\"uri\\\", \\\"value\\\": \\\"http://www.wikidata.org/entity/D4cb70062-77ed-4097-a486-0b43ffe81463\\\"}, \\\"url\\\": {\\\"type\\\": \\\"uri\\\", \\\"value\\\": \\\"http://dsbox02.isi.edu:9000/upload/local_datasets/Most-Recent-Cohorts-Scorecard-Elements.csv\\\"}, \\\"file_type\\\": {\\\"datatype\\\": \\\"http://www.w3.org/2001/XMLSchema#string\\\", \\\"type\\\": \\\"literal\\\", \\\"value\\\": \\\"csv\\\"}, \\\"extra_information\\\": {\\\"datatype\\\": \\\"http://www.w3.org/2001/XMLSchema#string\\\", \\\"type\\\": \\\"literal\\\", \\\"value\\\": \\\"{\\\\\\\"column_meta_0\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Integer\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"UNITID\\\\\\\"}, \\\\\\\"column_meta_1\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Integer\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"OPEID\\\\\\\"}, \\\\\\\"column_meta_2\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Integer\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"OPEID6\\\\\\\"}, \\\\\\\"column_meta_3\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Text\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"INSTNM\\\\\\\"}, \\\\\\\"column_meta_4\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Text\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"CITY\\\\\\\"}, \\\\\\\"column_meta_5\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Text\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"STABBR\\\\\\\"}, \\\\\\\"column_meta_6\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Text\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"INSTURL\\\\\\\"}, \\\\\\\"column_meta_7\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Text\\\\\\\", 
\\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"NPCURL\\\\\\\"}, \\\\\\\"column_meta_8\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Integer\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"HCM2\\\\\\\"}, \\\\\\\"column_meta_9\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Integer\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"PREDDEG\\\\\\\"}, \\\\\\\"column_meta_10\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Integer\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"HIGHDEG\\\\\\\"}, \\\\\\\"column_meta_11\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Integer\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"CONTROL\\\\\\\"}, \\\\\\\"column_meta_12\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Integer\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"LOCALE\\\\\\\"}, \\\\\\\"column_meta_13\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Integer\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"HBCU\\\\\\\"}, \\\\\\\"column_meta_14\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Integer\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"PBI\\\\\\\"}, \\\\\\\"column_meta_15\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Integer\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"ANNHI\\\\\\\"}, \\\\\\\"column_meta_16\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Integer\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"TRIBAL\\\\\\\"}, \\\\\\\"column_meta_17\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Integer\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"AANAPII\\\\\\\"}, \\\\\\\"column_meta_18\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Integer\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"HSI\\\\\\\"}, \\\\\\\"column_meta_19\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Integer\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"NANTI\\\\\\\"}, \\\\\\\"column_meta_20\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Integer\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"MENONLY\\\\\\\"}, \\\\\\\"column_meta_21\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Integer\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"WOMENONLY\\\\\\\"}, \\\\\\\"column_meta_22\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": 
\\\\\\\"RELAFFIL\\\\\\\"}, \\\\\\\"column_meta_23\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"SATVR25\\\\\\\"}, \\\\\\\"column_meta_24\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"SATVR75\\\\\\\"}, \\\\\\\"column_meta_25\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"SATMT25\\\\\\\"}, \\\\\\\"column_meta_26\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"SATMT75\\\\\\\"}, \\\\\\\"column_meta_27\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"SATWR25\\\\\\\"}, \\\\\\\"column_meta_28\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"SATWR75\\\\\\\"}, \\\\\\\"column_meta_29\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"SATVRMID\\\\\\\"}, \\\\\\\"column_meta_30\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"SATMTMID\\\\\\\"}, \\\\\\\"column_meta_31\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"SATWRMID\\\\\\\"}, \\\\\\\"column_meta_32\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"ACTCM25\\\\\\\"}, \\\\\\\"column_meta_33\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"ACTCM75\\\\\\\"}, \\\\\\\"column_meta_34\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"ACTEN25\\\\\\\"}, \\\\\\\"column_meta_35\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"ACTEN75\\\\\\\"}, \\\\\\\"column_meta_36\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"ACTMT25\\\\\\\"}, \\\\\\\"column_meta_37\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"ACTMT75\\\\\\\"}, \\\\\\\"column_meta_38\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": 
[\\\\\\\"https://metadata.datadrivendiscovery.org/types/CategoricalData\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"ACTWR25\\\\\\\"}, \\\\\\\"column_meta_39\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"https://metadata.datadrivendiscovery.org/types/CategoricalData\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"ACTWR75\\\\\\\"}, \\\\\\\"column_meta_40\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"ACTCMMID\\\\\\\"}, \\\\\\\"column_meta_41\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"ACTENMID\\\\\\\"}, \\\\\\\"column_meta_42\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"ACTMTMID\\\\\\\"}, \\\\\\\"column_meta_43\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"https://metadata.datadrivendiscovery.org/types/CategoricalData\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"ACTWRMID\\\\\\\"}, \\\\\\\"column_meta_44\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"SAT_AVG\\\\\\\"}, \\\\\\\"column_meta_45\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"SAT_AVG_ALL\\\\\\\"}, \\\\\\\"column_meta_46\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"PCIP01\\\\\\\"}, \\\\\\\"column_meta_47\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"PCIP03\\\\\\\"}, \\\\\\\"column_meta_48\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"PCIP04\\\\\\\"}, \\\\\\\"column_meta_49\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"PCIP05\\\\\\\"}, \\\\\\\"column_meta_50\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"PCIP09\\\\\\\"}, \\\\\\\"column_meta_51\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"PCIP10\\\\\\\"}, \\\\\\\"column_meta_52\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"PCIP11\\\\\\\"}, \\\\\\\"column_meta_53\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": 
[\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"PCIP12\\\\\\\"}, \\\\\\\"column_meta_54\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"PCIP13\\\\\\\"}, \\\\\\\"column_meta_55\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"PCIP14\\\\\\\"}, \\\\\\\"column_meta_56\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"PCIP15\\\\\\\"}, \\\\\\\"column_meta_57\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"PCIP16\\\\\\\"}, \\\\\\\"column_meta_58\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"PCIP19\\\\\\\"}, \\\\\\\"column_meta_59\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"PCIP22\\\\\\\"}, \\\\\\\"column_meta_60\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"PCIP23\\\\\\\"}, \\\\\\\"column_meta_61\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"PCIP24\\\\\\\"}, \\\\\\\"column_meta_62\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"PCIP25\\\\\\\"}, \\\\\\\"column_meta_63\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"PCIP26\\\\\\\"}, \\\\\\\"column_meta_64\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"PCIP27\\\\\\\"}, \\\\\\\"column_meta_65\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"PCIP29\\\\\\\"}, \\\\\\\"column_meta_66\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"PCIP30\\\\\\\"}, \\\\\\\"column_meta_67\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"PCIP31\\\\\\\"}, \\\\\\\"column_meta_68\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], 
\\\\\\\"name\\\\\\\": \\\\\\\"PCIP38\\\\\\\"}, \\\\\\\"column_meta_69\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"PCIP39\\\\\\\"}, \\\\\\\"column_meta_70\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"PCIP40\\\\\\\"}, \\\\\\\"column_meta_71\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"PCIP41\\\\\\\"}, \\\\\\\"column_meta_72\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"PCIP42\\\\\\\"}, \\\\\\\"column_meta_73\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"PCIP43\\\\\\\"}, \\\\\\\"column_meta_74\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"PCIP44\\\\\\\"}, \\\\\\\"column_meta_75\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"PCIP45\\\\\\\"}, \\\\\\\"column_meta_76\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"PCIP46\\\\\\\"}, \\\\\\\"column_meta_77\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"PCIP47\\\\\\\"}, \\\\\\\"column_meta_78\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"PCIP48\\\\\\\"}, \\\\\\\"column_meta_79\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"PCIP49\\\\\\\"}, \\\\\\\"column_meta_80\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"PCIP50\\\\\\\"}, \\\\\\\"column_meta_81\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"PCIP51\\\\\\\"}, \\\\\\\"column_meta_82\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"PCIP52\\\\\\\"}, \\\\\\\"column_meta_83\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"PCIP54\\\\\\\"}, \\\\\\\"column_meta_84\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": 
[\\\\\\\"http://schema.org/Integer\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"DISTANCEONLY\\\\\\\"}, \\\\\\\"column_meta_85\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"UGDS\\\\\\\"}, \\\\\\\"column_meta_86\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"UGDS_WHITE\\\\\\\"}, \\\\\\\"column_meta_87\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"UGDS_BLACK\\\\\\\"}, \\\\\\\"column_meta_88\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"UGDS_HISP\\\\\\\"}, \\\\\\\"column_meta_89\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"UGDS_ASIAN\\\\\\\"}, \\\\\\\"column_meta_90\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"UGDS_AIAN\\\\\\\"}, \\\\\\\"column_meta_91\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"UGDS_NHPI\\\\\\\"}, \\\\\\\"column_meta_92\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"UGDS_2MOR\\\\\\\"}, \\\\\\\"column_meta_93\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"UGDS_NRA\\\\\\\"}, \\\\\\\"column_meta_94\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"UGDS_UNKN\\\\\\\"}, \\\\\\\"column_meta_95\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"PPTUG_EF\\\\\\\"}, \\\\\\\"column_meta_96\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Integer\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"CURROPER\\\\\\\"}, \\\\\\\"column_meta_97\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"NPT4_PUB\\\\\\\"}, \\\\\\\"column_meta_98\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"NPT4_PRIV\\\\\\\"}, \\\\\\\"column_meta_99\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", 
\\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"NPT41_PUB\\\\\\\"}, \\\\\\\"column_meta_100\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"NPT42_PUB\\\\\\\"}, \\\\\\\"column_meta_101\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"NPT43_PUB\\\\\\\"}, \\\\\\\"column_meta_102\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"NPT44_PUB\\\\\\\"}, \\\\\\\"column_meta_103\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"NPT45_PUB\\\\\\\"}, \\\\\\\"column_meta_104\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"NPT41_PRIV\\\\\\\"}, \\\\\\\"column_meta_105\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"NPT42_PRIV\\\\\\\"}, \\\\\\\"column_meta_106\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"NPT43_PRIV\\\\\\\"}, \\\\\\\"column_meta_107\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"NPT44_PRIV\\\\\\\"}, \\\\\\\"column_meta_108\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"NPT45_PRIV\\\\\\\"}, \\\\\\\"column_meta_109\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"PCTPELL\\\\\\\"}, \\\\\\\"column_meta_110\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"RET_FT4_POOLED_SUPP\\\\\\\"}, \\\\\\\"column_meta_111\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"RET_FTL4_POOLED_SUPP\\\\\\\"}, \\\\\\\"column_meta_112\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"RET_PT4_POOLED_SUPP\\\\\\\"}, \\\\\\\"column_meta_113\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"RET_PTL4_POOLED_SUPP\\\\\\\"}, \\\\\\\"column_meta_114\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", 
\\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"PCTFLOAN\\\\\\\"}, \\\\\\\"column_meta_115\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"UG25ABV\\\\\\\"}, \\\\\\\"column_meta_116\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"MD_EARN_WNE_P10\\\\\\\"}, \\\\\\\"column_meta_117\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"GT_25K_P6\\\\\\\"}, \\\\\\\"column_meta_118\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"GT_28K_P6\\\\\\\"}, \\\\\\\"column_meta_119\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"GRAD_DEBT_MDN_SUPP\\\\\\\"}, \\\\\\\"column_meta_120\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"GRAD_DEBT_MDN10YR_SUPP\\\\\\\"}, \\\\\\\"column_meta_121\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"RPY_3YR_RT_SUPP\\\\\\\"}, \\\\\\\"column_meta_122\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"C150_L4_POOLED_SUPP\\\\\\\"}, \\\\\\\"column_meta_123\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Float\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"C150_4_POOLED_SUPP\\\\\\\"}, \\\\\\\"column_meta_124\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Text\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"UNITID_wikidata\\\\\\\"}, \\\\\\\"column_meta_125\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Text\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"OPEID6_wikidata\\\\\\\"}, \\\\\\\"column_meta_126\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Text\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"STABBR_wikidata\\\\\\\"}, \\\\\\\"column_meta_127\\\\\\\": {\\\\\\\"semantic_type\\\\\\\": [\\\\\\\"http://schema.org/Text\\\\\\\", \\\\\\\"https://metadata.datadrivendiscovery.org/types/Attribute\\\\\\\"], \\\\\\\"name\\\\\\\": \\\\\\\"CITY_wikidata\\\\\\\"}, \\\\\\\"data_metadata\\\\\\\": {\\\\\\\"shape_0\\\\\\\": 7175, \\\\\\\"shape_1\\\\\\\": 128}, \\\\\\\"first_10_rows\\\\\\\": 
\\\\\\\",UNITID,OPEID,OPEID6,INSTNM,CITY,STABBR,INSTURL,NPCURL,HCM2,PREDDEG,HIGHDEG,CONTROL,LOCALE,HBCU,PBI,ANNHI,TRIBAL,AANAPII,HSI,NANTI,MENONLY,WOMENONLY,RELAFFIL,SATVR25,SATVR75,SATMT25,SATMT75,SATWR25,SATWR75,SATVRMID,SATMTMID,SATWRMID,ACTCM25,ACTCM75,ACTEN25,ACTEN75,ACTMT25,ACTMT75,ACTWR25,ACTWR75,ACTCMMID,ACTENMID,ACTMTMID,ACTWRMID,SAT_AVG,SAT_AVG_ALL,PCIP01,PCIP03,PCIP04,PCIP05,PCIP09,PCIP10,PCIP11,PCIP12,PCIP13,PCIP14,PCIP15,PCIP16,PCIP19,PCIP22,PCIP23,PCIP24,PCIP25,PCIP26,PCIP27,PCIP29,PCIP30,PCIP31,PCIP38,PCIP39,PCIP40,PCIP41,PCIP42,PCIP43,PCIP44,PCIP45,PCIP46,PCIP47,PCIP48,PCIP49,PCIP50,PCIP51,PCIP52,PCIP54,DISTANCEONLY,UGDS,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN,PPTUG_EF,CURROPER,NPT4_PUB,NPT4_PRIV,NPT41_PUB,NPT42_PUB,NPT43_PUB,NPT44_PUB,NPT45_PUB,NPT41_PRIV,NPT42_PRIV,NPT43_PRIV,NPT44_PRIV,NPT45_PRIV,PCTPELL,RET_FT4_POOLED_SUPP,RET_FTL4_POOLED_SUPP,RET_PT4_POOLED_SUPP,RET_PTL4_POOLED_SUPP,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GT_25K_P6,GT_28K_P6,GRAD_DEBT_MDN_SUPP,GRAD_DEBT_MDN10YR_SUPP,RPY_3YR_RT_SUPP,C150_L4_POOLED_SUPP,C150_4_POOLED_SUPP,UNITID_wikidata,OPEID6_wikidata,STABBR_wikidata,CITY_wikidata\\\\\\\\n0,100654,100200,1002,Alabama A & M University,Normal,AL,www.aamu.edu/,www2.aamu.edu/scripts/netpricecalc/npcalc.htm,0,3,4,1,12.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,380.0,470.0,370.0,470.0,370.0,457.0,425.0,420.0,414.0,16.0,19.0,14.0,20.0,15.0,18.0,,,18.0,17.0,17.0,,849.0,849.0,0.0448,0.0142,0.0071,0.0,0.0,0.0354,0.0401,0.0,0.1132,0.0896,0.0472,0.0,0.033,0.0,0.0094,0.066,0.0,0.0708,0.0024,0.0,0.0,0.0,0.0,0.0,0.0307,0.0,0.0472,0.0519,0.0377,0.0448,0.0,0.0,0.0,0.0,0.0283,0.0,0.1863,0.0,0.0,4616.0,0.0256,0.9129,0.0076,0.0019,0.0024,0.0017,0.0401,0.0065,0.0013,0.0877,1,15567.0,,15043.0,15491.0,17335.0,19562.0,18865.0,,,,,,0.7039,0.5774,,0.309,,0.7667,0.0859,31000,0.453,0.431,32750,348.16551225731,0.2531554273,,0.2913,Q39624632,Q17203888,Q173,Q575407\\\\\\\\n1,100663,105200,1052,University of Alabama at Birmingham,Birmingham,AL,www.uab.edu,uab.studentaidcalculator.com/survey.aspx,0,3,4,1,12.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,480.0,640.0,490.0,660.0,,,560.0,575.0,,21.0,28.0,22.0,30.0,19.0,26.0,,,25.0,26.0,23.0,,1125.0,1125.0,0.0,0.0,0.0,0.0005,0.036000000000000004,0.0,0.0131,0.0,0.0748,0.0599,0.0,0.0059,0.0,0.0,0.0158,0.0135,0.0,0.0734,0.009000000000000001,0.0,0.0,0.0,0.005,0.0,0.0212,0.0,0.0766,0.0243,0.0221,0.0365,0.0,0.0,0.0,0.0,0.0392,0.25,0.2072,0.0162,0.0,12047.0,0.5786,0.2626,0.0309,0.0598,0.0028,0.0004,0.0387,0.0179,0.0083,0.2578,1,16475.0,,13849.0,15385.0,18022.0,18705.0,19319.0,,,,,,0.3525,0.8007,,0.5178,,0.5179,0.2363,41200,0.669,0.631,21833,232.106797835537,0.513963161,,0.5384,Q39624677,Q17204336,Q173,Q79867\\\\\\\\n2,100690,2503400,25034,Amridge University,Montgomery,AL,www.amridgeuniversity.edu,www2.amridgeuniversity.edu:9091/,0,3,4,2,12.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,74.0,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0889,0.0,0.0,0.0889,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.4,0.0,0.0,0.0,0.0667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.3556,0.0,1.0,293.0,0.157,0.2355,0.0068,0.0,0.0,0.0034,0.0,0.0,0.5973,0.5392,1,,10155.0,,,,,,10155.0,,,,,0.6971,PrivacySuppressed,,PrivacySuppressed,,0.8436,0.8571,39600,0.658,0.542,22890,243.343773299842,0.2307692308,,PrivacySuppressed,Q39624831,Q17337864,Q173,Q29364\\\\\\\\n3,100706,105500,1055,University of Alabama in 
Huntsville,Huntsville,AL,www.uah.edu,finaid.uah.edu/,0,3,4,1,12.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,520.0,660.0,540.0,680.0,,,590.0,610.0,,25.0,31.0,24.0,33.0,23.0,29.0,,,28.0,29.0,26.0,,1257.0,1257.0,0.0,0.0,0.0,0.0,0.0301,0.0,0.0499,0.0,0.0282,0.2702,0.0,0.0151,0.0,0.0,0.0122,0.0,0.0,0.0603,0.0132,0.0,0.0,0.0,0.0113,0.0,0.0226,0.0,0.016,0.0,0.0,0.0188,0.0,0.0,0.0,0.0,0.0264,0.1911,0.225,0.0094,0.0,6346.0,0.7148,0.1131,0.0411,0.0414,0.012,0.0,0.0181,0.0303,0.0292,0.1746,1,19423.0,,15971.0,18016.0,20300.0,21834.0,22059.0,,,,,,0.2949,0.8161,,0.5116,,0.4312,0.2255,46700,0.685,0.649,22647,240.760438353933,0.5485090298,,0.4905,Q39624901,Q17204354,Q173,Q79860\\\\\\\\n4,100724,100500,1005,Alabama State University,Montgomery,AL,www.alasu.edu,www.alasu.edu/cost-aid/forms/calculator/index.aspx,0,3,4,1,12.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,370.0,460.0,360.0,460.0,,,415.0,410.0,,15.0,19.0,14.0,19.0,15.0,17.0,,,17.0,17.0,16.0,,825.0,825.0,0.0,0.0,0.0,0.0,0.1023,0.0,0.0503,0.0,0.1364,0.0,0.0,0.0,0.0,0.0,0.0114,0.0,0.0,0.0779,0.0146,0.0,0.0,0.0211,0.0,0.0,0.0244,0.0,0.0503,0.1412,0.0633,0.013000000000000001,0.0,0.0,0.0,0.0,0.0487,0.1429,0.0974,0.0049,0.0,4704.0,0.0138,0.9337,0.0111,0.0028,0.0013,0.0004,0.0111,0.0159,0.01,0.0727,1,15037.0,,14111.0,15140.0,17492.0,19079.0,18902.0,,,,,,0.7815,0.6138,,0.5313,,0.8113,0.0974,27700,0.393,0.351,31500,334.876752247489,0.2185867473,,0.2475,Q39624974,Q17203904,Q173,Q29364\\\\\\\\n5,100751,105100,1051,The University of Alabama,Tuscaloosa,AL,www.ua.edu/,financialaid.ua.edu/net-price-calculator/,0,3,4,1,13.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,490.0,610.0,490.0,620.0,480.0,600.0,550.0,555.0,540.0,23.0,31.0,23.0,33.0,22.0,29.0,7.0,8.0,27.0,28.0,26.0,8.0,1202.0,1202.0,0.0,0.0039,0.0,0.0042,0.102,0.0,0.0098,0.0,0.0782,0.1036,0.0,0.0057,0.0692,0.0,0.0115,0.0,0.0,0.0338,0.009000000000000001,0.0,0.0206,0.0,0.0031,0.0,0.0115,0.0,0.036000000000000004,0.0263,0.0109,0.0362,0.0,0.0,0.0,0.0,0.026000000000000002,0.0988,0.2879,0.0118,0.0,31663.0,0.7841,0.1037,0.0437,0.0118,0.0036,0.0009,0.0297,0.0192,0.0033,0.0819,1,21676.0,,18686.0,20013.0,22425.0,23666.0,24578.0,,,,,,0.1938,0.8637,,0.4308,,0.4007,0.081,44500,0.695,0.679,23290,247.596176502985,0.6019442985,,0.6793,Q39625107,Q17204328,Q173,Q79580\\\\\\\\n6,100760,100700,1007,Central Alabama Community College,Alexander City,AL,www.cacc.edu,www.cacc.edu/NetPriceCalculator/14-15/npcalc.html,0,2,2,1,32.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0266,0.0082,0.0,0.0,0.1025,0.0,0.0,0.0,0.0,0.2787,0.0,0.0,0.0,0.0,0.0287,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0307,0.3176,0.0,0.0,0.1209,0.0861,0.0,0.0,1492.0,0.6877,0.2802,0.0127,0.002,0.004,0.0007,0.0067,0.002,0.004,0.3733,1,9128.0,,8882.0,8647.0,11681.0,11947.0,13868.0,,,,,,0.5109,,0.5666,,0.4554,0.3234,0.263,27700,0.466,0.395,9500,100.994576074639,0.2510056315,0.2136,,Q39625150,Q17203916,Q173,Q79663\\\\\\\\n7,100812,100800,1008,Athens State 
University,Athens,AL,www.athens.edu,https://24.athens.edu/apex/prod8/f?p=174:1:3941357449598491,0,3,3,1,31.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0462,0.0,0.2192,0.0,0.0,0.0,0.0,0.0,0.0346,0.0538,0.0,0.0231,0.0205,0.0,0.0154,0.0154,0.0038,0.0,0.0026,0.0,0.0308,0.0282,0.0,0.0218,0.0,0.0,0.0,0.0,0.0256,0.0064,0.4449,0.0077,0.0,2888.0,0.7784,0.125,0.0215,0.0076,0.0142,0.001,0.0187,0.001,0.0325,0.5817,1,,,,,,,,,,,,,0.4219,,,,,0.6455,0.6774,38700,0.653,0.612,18000,191.358144141422,0.5038167939,,,Q39625389,Q17203920,Q173,Q203263\\\\\\\\n8,100830,831000,8310,Auburn University at Montgomery,Montgomery,AL,www.aum.edu,www.aum.edu/current-students/financial-information/net-price-calculator,0,3,4,1,12.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,435.0,495.0,445.0,495.0,,,465.0,470.0,,19.0,24.0,19.0,24.0,17.0,22.0,,,22.0,22.0,20.0,,1009.0,1009.0,0.0,0.02,0.0,0.0,0.0601,0.0,0.0,0.0,0.0584,0.0,0.0,0.0033,0.0,0.0,0.0067,0.0117,0.0,0.0534,0.0083,0.0,0.0,0.0501,0.0,0.0,0.015,0.0,0.0668,0.0351,0.0,0.0401,0.0,0.0,0.0,0.0,0.0267,0.2621,0.2705,0.0117,0.0,4171.0,0.5126,0.3627,0.0141,0.0247,0.006,0.001,0.0319,0.0412,0.0058,0.2592,1,15053.0,,13480.0,14114.0,16829.0,17950.0,17022.0,,,,,,0.4405,0.6566,,0.4766,,0.5565,0.2257,33300,0.616,0.546,23363,248.372240087558,0.4418886199,,0.2207,Q39625474,Q17613566,Q173,Q29364\\\\\\\\n9,100858,100900,1009,Auburn University,Auburn,AL,www.auburn.edu,https://www.auburn.edu/admissions/netpricecalc/freshman.html,0,3,4,1,13.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,530.0,620.0,530.0,640.0,520.0,620.0,575.0,585.0,570.0,24.0,30.0,25.0,32.0,23.0,28.0,7.0,8.0,27.0,29.0,26.0,8.0,1217.0,1217.0,0.0437,0.0133,0.0226,0.0,0.0575,0.0,0.0079,0.0,0.0941,0.1873,0.0,0.0097,0.0337,0.0,0.0088,0.0,0.0,0.0724,0.0097,0.0,0.0267,0.0,0.0014,0.0,0.0093,0.0,0.033,0.0,0.0179,0.0312,0.0,0.0,0.0,0.0,0.0326,0.0667,0.2113,0.009000000000000001,0.0,22095.0,0.8285,0.0673,0.0335,0.0252,0.0052,0.0003,0.0128,0.0214,0.0059,0.0831,1,21984.0,,15591.0,19655.0,23286.0,24591.0,25402.0,,,,,,0.1532,0.9043,,0.7229,,0.32799999999999996,0.0427,48800,0.741,0.726,21500,228.566672168921,0.7239612977,,0.74,Q39625609,Q17203926,Q173,Q225519\\\\\\\\n10,100937,101200,1012,Birmingham Southern College,Birmingham,AL,www.bsc.edu/,www.bsc.edu/fp/np-calculator.cfm,0,3,3,2,12.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,71.0,500.0,610.0,490.0,570.0,,,555.0,530.0,,23.0,28.0,22.0,29.0,22.0,26.0,,,26.0,26.0,24.0,,1150.0,1150.0,0.0,0.023,0.0,0.0077,0.0268,0.0,0.0,0.0,0.046,0.0077,0.0,0.0077,0.0,0.0,0.023,0.0,0.0,0.1379,0.0498,0.0,0.0383,0.0,0.0307,0.0,0.0575,0.0,0.0728,0.0,0.0,0.0881,0.0,0.0,0.0,0.0,0.1034,0.0,0.2261,0.0536,0.0,1289.0,0.7921,0.1171,0.0217,0.0489,0.006999999999999999,0.0,0.0109,0.0,0.0023,0.0054,1,,23227.0,,,,,,20815.0,19582.0,23126.0,24161.0,25729.0,0.1888,0.8386,,,,0.4729,0.0141,46700,0.637,0.618,26045,276.88460356463,0.7559912854,,0.6439,,Q17203945,Q173,Q79867\\\\\\\\n\\\\\\\", \\\\\\\"local_storage\\\\\\\": \\\\\\\"/data00/dsbox/datamart/memcache_storage/datasets_cache/794d5f7dcddae86817a10e16ee1aecfa.h5\\\\\\\"}\\\"}, \\\"title\\\": {\\\"xml:lang\\\": \\\"en\\\", \\\"type\\\": \\\"literal\\\", \\\"value\\\": \\\"most recent cohorts scorecard elements csv\\\"}, \\\"keywords\\\": {\\\"datatype\\\": \\\"http://www.w3.org/2001/XMLSchema#string\\\", \\\"type\\\": \\\"literal\\\", \\\"value\\\": \\\"unitid opeid opeid6 instnm city stabbr insturl npcurl hcm2 preddeg highdeg control locale hbcu pbi annhi tribal aanapii hsi nanti menonly womenonly relaffil satvr25 satvr75 satmt25 satmt75 satwr25 
satwr75 satvrmid satmtmid satwrmid actcm25 actcm75 acten25 acten75 actmt25 actmt75 actwr25 actwr75 actcmmid actenmid actmtmid actwrmid sat avg sat avg all pcip01 pcip03 pcip04 pcip05 pcip09 pcip10 pcip11 pcip12 pcip13 pcip14 pcip15 pcip16 pcip19 pcip22 pcip23 pcip24 pcip25 pcip26 pcip27 pcip29 pcip30 pcip31 pcip38 pcip39 pcip40 pcip41 pcip42 pcip43 pcip44 pcip45 pcip46 pcip47 pcip48 pcip49 pcip50 pcip51 pcip52 pcip54 distanceonly ugds ugds white ugds black ugds hisp ugds asian ugds aian ugds nhpi ugds 2mor ugds nra ugds unkn pptug ef curroper npt4 pub npt4 priv npt41 pub npt42 pub npt43 pub npt44 pub npt45 pub npt41 priv npt42 priv npt43 priv npt44 priv npt45 priv pctpell ret ft4 pooled supp ret ftl4 pooled supp ret pt4 pooled supp ret ptl4 pooled supp pctfloan ug25abv md earn wne p10 gt 25k p6 gt 28k p6 grad debt mdn supp grad debt mdn10yr supp rpy 3yr rt supp c150 l4 pooled supp c150 4 pooled supp unitid wikidata opeid6 wikidata stabbr wikidata city wikidata\\\"}, \\\"datasetLabel\\\": {\\\"xml:lang\\\": \\\"en\\\", \\\"type\\\": \\\"literal\\\", \\\"value\\\": \\\"D4cb70062-77ed-4097-a486-0b43ffe81463\\\"}, \\\"variableName\\\": {\\\"datatype\\\": \\\"http://www.w3.org/2001/XMLSchema#string\\\", \\\"type\\\": \\\"literal\\\", \\\"value\\\": \\\"INSTNM\\\"}, \\\"score\\\": {\\\"datatype\\\": \\\"http://www.w3.org/2001/XMLSchema#double\\\", \\\"type\\\": \\\"literal\\\", \\\"value\\\": \\\"0.9398390424662831\\\"}}, \\\"query_json\\\": {\\\"keywords\\\": [\\\"INSTNM\\\"], \\\"variables\\\": {\\\"INSTNM\\\": \\\"medanos funeral liberty costa calhoun management gardens spartanburg alderson fashion montrose platteville point southern polytechnic mayo wiregrass morris waters cajon cosemtology bunker amboy dubois kapiolani salon toni suny buncombe forsyth mcminnville rose hickey gwinnett cities lewiston greene broadview birmingham mountains michigan altamonte charlotte dental reedley defiance paier mcmurry campus georgetown marlboro springfield missouri josef ingram woodbury goldey stamford schreiner finger bottineau essex linn eastwick wharton furman federal caribbean cogswell art blauvelt cem lee aiken richmond soledad lloyd mohave adult moore yavapai viterbo ipswich scripps exton salt metro dlp mccann school benedict bronx jones miracosta inland mexico workforce faribault cross f midway jose henrietta oriental cerritos bowling langhorne clearwater vega desert pioneer upland wofford orange public porter longwood cod elegance sinai columbiana snyder tarrant ludlow grenada metropark cookman kenneth foley daymar helena concordia broadcasting technicians bennet lower albright community anza wooster rogue studies general range red eves luna allied skyline drury bayshore training linda mgh enterprise cooper mokena ashtabula fletcher chemung irving connecticut clovis merritt lucie harold phagans windsor gloucester carroll hairdressing merchandising massage radford canada collins divinity crescent medford technological henrico garrett slippery online automotive aguadilla coyne collegiate roxborough chipola intercoast extended interactive seattle penn victor dutchess highlands coffeyville lynn valley stratton fortis cda wade arapahoe little coalinga panola duluth mentor wolff vanguard house elmira sunstate estes del rensselaer camp santa tioga ann mackie rush noc granville indianhead frostburg steuben watkins northwestern judson sage roane cordon redwoods parkersburg boise dramatic richland brightwood rivertown pierpont roseville hurst philander stone allegheny alto kaskaskia c nutley schilling hays 
glen john miramar cookeville phillips du brigham marian platt owen coast guy machias crestwood applied faith syracuse rider sylvania remington andrews studio 5 emporia rice belhaven portfolio citadel dabney bridgeport model conemaugh germanna graceland chamblee holyoke scott apex jenny pro site tech centro ottumwa macalester loma learning camden buffalo heidelberg brookdale mohawk diego erskine dawson chicopee modesto cobleskill bethel belleville ct universidad mines lansing wilson faulkner highland broken devry evers barber fayetteville bay mount ferris macon baltimore consolidated somersworth blackwood lincoln anna berea claire branch caribou methodist pierce shelby northwest cumberlands tribal merrell liu valencia kansas age florham juilliard hillsboro aveda cameo lasell pineville jamaica gastonia taos lindenwood england carolina hamilton name brookline ailano dalton newark chaminade pennco towson davison piedmont baylor douglasville clark prince asbury payne augustana accelerated hands corazon juniata eberly lakeland itasca alamogordo ouachita grants elizabeth cuisine professions environmental capitol westminster danville riverside johnson titusville hardin covenant bradley caldwell leeward hazleton seminary cabarrus douglas averett chattanooga mediatech sauk luke haute vermont baptist denmark lac cowley touro allen centers cottey lyndhurst ivy westbury chadron strayer bossier levittown morrison harrold kd shawnee laguardia passaic ave gainesville virgin healthcare simon jolie briar cumberland pharr treme shore wiley woodland baja hobart mcallen county s main arts luther brevard stockton technology herzing hilliard architecture engineering edic grossmont advanced scottsdale coconino granite forestry quinnipiac le noblesville bon conservatory junction barclay big ripon bowie lenoir halifax mary walden christian navarro salus denison iti computer street montgomery hancock motivation northeast carlow skysong hallmark wabash salkehatchie compass liverpool expressions silicon theology keuka joseph ft lima designory altoona zanesville teterboro wood southmost lansdale muncie by ash integral vet arecibo tennessee unity b waterbury victoria west rockland central butler lakes chambersburg henry rasmussen visual rhodes alternative azusa hall rapid avalon ocean hazard pembroke gardena harbor greensburg grand moravian stevenson golfers shuler keiser pearl industrial abraham sheen y ouachitas woburn liceo clear turnersville virginia non carsten beaumont esani governors dewey hannibal for montcalm franciscan m berk binghamton eunice dixie carbondale clair fisher pharmacy beckfield kentwood embry stautzenberger sand concord alfred barranquitas hobby rogers elon paradise continental owens vista fidm antelope educators keys chester prescott lane styling tacoma universal manhattan wittenberg concorde ibmc dc marion johnston albertus peak presbyterian dixon beaufort bryan midwest samuel assistants waterfront clarkson temple hospitality locations moline avance blessing maharishi lufkin maine tufts smith doheny voorhees montevallo lawton politecnica cutler and sarah arthur beach holistic bard burnsville brockport national specs bancroft collin paramus arlington rochelle fort wichita laurel takoma colgate rieman trailer sentara fargo welder fashions purchase pulaski la marshalltown sail fajardo amityville ulster chesterfield naropa brewton cosumnes quincy cedar hypnosis kennesaw purdue rosa van wic neosho piscataway hawaii duke mission air mar o jacinto intermediate tribes feather yuba ford gene roswell 
cheyenne colorado palladium machinists hondros depauw belmont strand agnes morrisville nashville way coachella friendswood berry jay fear wesleyan midlands susquehanna avery poplar mayaguez lehigh indiana rocks kent campuses humboldt vermilion denver legal medgar woman lilburn prairie upper kensington osteopathic cancer brownsville shepherd myotherapy wilkes sargeant nebraska caguas concept bene forks broomall middlesex king ridley mendota wise branford weill queens covina holy bucks ideal berkeley northeastern folsom medrash lutheran shear tractor behrend ross area the wayne services mobile robeson lowcountry northcoast joya of summit merrimack a govoha edp luis peter plymouth webber wentworth juan howard continuing aeronautical rainy chandler allentown otero falls associated truett heavilin downtown wachusett arundel niagara blades trumbull omnitech aviation louis bluff georgian portage brick unit living north brittany institutes stafford soma houghton california findlay stonehill anthony automeca colton brickell minnesota visible bakersfield kirtland ces biloxi chapman merrillville mattydale p escanaba forest anselm beaver castle intercontinental tri state schoolcraft park cci bergen rust montreat brecksville ozarks hillsborough angelo sam brooklyn olaf salem sandusky huertas sebastian fredric lourdes trinidad hudson jacksonville 6 cazenovia culinaire dow girardeau providence mason wing moberly trend clarendon knoxville perth brown taylor westboro rio whitewater marietta newport antioch ames wheelock culver fine farmingdale case sparks rochester mesa motlow oconee rock salish peirce ambler louisiana romeoville brentwood honolulu sacramento southwest richfield altierus albert everest dona ecole marywood pikeville gwynedd humacao toms hollywood ort cruces plainfield cincinnati independence skidmore hesperia iowa mountwest layton rancho more cloud sw yti pressley federico liberal cornell gardner spalding barbering region global coleman yale irvine musical mississippi united healing ursinus eureka mj success carmel somerset monterey presentation fiance mercyhurst bj durham ultimate whittier florence cuny plaza muskingum johns philadelphia linwood calvin davis springs colby olivet lakeview medical master pikes visalia washington dubuque richey claremont moorhead saint system oneida land asheville pennsylvania staten cbd charzanne corona don l montana ego whatcom billings benedictine christi lancaster oral resurrection mitchell md corporation idaho trade reserve alaska oxford vocations renton adrian court clayton everett mesabi mining hillyard camelot g arkansas sussex appling bloomfield greece worcester willoughby scioto jordan jewell cosmetology clarksburg calvary albizu alvin trinity chemeketa bible chesapeake grambling harrisburg tarleton ogle dartmouth murphy shorter exposito professionals simpson dci xenon powersport therapy huntsville nursing kankakee skin northern vaughn morningside miller windward e kootenai umpqua enfield lafayette mifflin gulf heights mont research ravenna hurstborne college juarez auburn eastfield texas marin nails pitt montclair mankato robert harris pasco lorain massachusetts palm charlie institute verne sum iona arbor manassas valparaiso siloam mcpherson drive barbara nuys papageorge anaheim law diesel norwich lone savannah paramount cameron pitzer southtowns wisconsin cape spartan brandman delmarva augsburg campbellsville rey lowell eugene kokomo spencerian barton kaye parker canyons ad marshall perimeter d mcneese bellevue ranger hampton sweet nunez landmark 
riddle bradford fond brandon university 2017 winthrop pomeroy bramson hilo ventura haverford willamette natchez hair sheffield natural este milwaukee darlington fe bela troy harrison gavilan hardeman fairleigh frontier mcconnell multnomah 1 dominguez drafting margaret gupton charles asm norfolk wake margate woodbridge wheeling mssu utah practical normandale american hollins nashua puerto bordentown thomas kalamazoo cardinal melbourne charleston sprunt hawk parkside itawamba clara cobb inc south district ayers chicago oswego parishes scholars obispo mills israel madison chamberlain bluegrass j dayton green great bridgevalley james life molloy rouge pellissippi hammond brownsburg olympian steven helene everglades kennedy aeronautics berklee welding port technical biomedical manatee huron german forrest regional dakota brighton wellness seminole hofstra sae saddleback hato watsonville centura nevada roman morrow mansfield capital harvard restaurant schools waynesburg murray beal lawrenceville pensacola calumet medicine luzerne albuquerque thiel grayson whitman kirkwood media carlos boston america creighton kettering de clary division capri bismarck niles hernando stephens klamath education northland ithaca dodge queensborough muhlenberg mckendree webb sarasota eti hulman orlando joliet union texarkana livingstone hagerstown weatherford hackensack southeastern hunter wausau joint sacred pontifical edu staples euphoria schaumburg perry ex greenspoint recording midland quinta parma mountain motte abington mullins sunset las buena word hopkins shenango carrollton okc catholic eagan academy uei puget youngstown baker traverse francis edinboro schuyler quest catherine snow omaha redlands professional harlingen london capstone gulfport vernon mycomputercareer southeast carnegie ata donnelly focus tyler manor winston images warminster emmanuel southfield mercy bellus brite design berkshire old forty sues brunswick accountancy graduate webster quinn hilbert woonsocket ponce scranton cortiva lynwood tinley drew hamrick white rudae dordt talladega jessup paris carrington at hanceville ai david paul catawba tabor bloomington danbury northpoint andrew rosalind gonzaga creative william health washtenaw tallahassee hospital westchester farmington los owings hood rapids richardson akron gateway centenary coker sierra rizzieri dunwoody junior clinton manchester laredo hope univeristy tampa sinclair edward alvernia jefferson modern ntma rolla murfreesboro carson steubenville beacom anderson lea southwestern depaul ferrum ruidoso dean guayama centra arte bethune barry esthetics loudoun essential amherst tiffin ravenscroft ontario music meadows lesley coba emerson geauga blaine walsh batesville dickinson eastern cochran francisco clinic atep loyola 10 nw tusculum lexington hartford radiation vanderbilt slidell warren opportunities laboure company evergreen hills advancing huntington burlington pepperdine circus ucas ancilla esthiology albany five nhti pasadena nassau horry academic ridgewater memphis cordova argosy golf wellspring emory lagrange truckee hawkeye maritime potomac woods caribe sandy clifton killeen collegeamerica superior seacoast evansville ashland morrilton equestrian hennepin maryville alamos linfield brazosport norbert landover fox westport corning orleans haven austin pine location delaware brandeis darby coe six middletown fredonia hamline rhode clinical star welch apollo trine bryant marcos walnut eagle vance flint nazarene boylston boricua christopher gettysburg edison marine dupage 
clarksville oregon sanford roche jarvis siena q valdosta oehrlein alabama wilmington walla martinsburg appleton stroudsburg houma bob swarthmore motoring joaquin angeles galveston richard make moines oak cortland laboratory malden whitworth neumann meredith 4 houston world lauderhill zane kutztown el on jersey hampshire roanoke terre moyne jacobs poway in mars ridge merced clackamas sawyer peay dividend citrus cattaraugus stevens military atlanta george agricultural louisville wallace salina therapeutic anoka adolphus parisian prism mesquite lithonia fairbanks trades lauderdale excellence katy salle sullivan gill cambridge aurora galen dallas gregory carlsbad northcentral newberry crossroads parish abilene snead lebanon marymount orlo brownson georgia support franklin chapel tricoci islands llc vincent directions humphreys lpn maranatha vogue careers academie architectural century tulsa spa hanover lander kentucky inter kauai indianapolis belle highline simmons oklahoma alliance clover middlebury allan commonwealth skagit nicholls pj crouse river grove carthage northwood stowe irene flathead hesston monroe up wells assumption scholastica hannah lamson san pc oakwood thaddeus clatsop millikin dimondale bucknell beaverton alameda tunxis daytona med fair nyack commerce bissonnet fresno coastline dominican newington brookfield stanford bland culinary pillar tempe tecnologia jamestown cornish smiths headlines florida cuyahoga neumont rocky sound dorothy mahwah fall yeshiva sioux cedarville ringling names orion soka benjamin pueblo ambrose eau bothell brookhaven kilgore mott brothers twin suffolk solano cascades bluffton roger madisonville rowan otterbein aquinas setters cecil science berks rutgers shreveport fuld eastland augusta dover innsbrook se heath sumter westfield broaddus states city mineral summerlin towns heathrow digital weslaco hodges hill bodywork beauty elaine elkhart baymeadows merchant conway taunton bayamon st zona tompkins pedro wales spokane leandro bethany plains farms pinnacle pines worth jameson tualatin walters turlock vegas antonio glenville corpus beau industrialization maryland mr pleasant denton hairstyling marygrove centre salisbury assist young portsmouth beth baton ursuline blue portland phoenix culture sterling vatterott wright barre greystone goodwin news lawrence manati delta boces chula chattahoochee career wytheville kenyon colleges freed pomona campbell muskegon vocational empire chillicothe maccormac lukes glendale stanbridge island missoula oaks monroeville adventist carbon toledo veeb ana bend high elkins nicolet pratt fredericksburg morgantown polaris elley theological hialeah ranken cet lewisville amarillo fitchburg greensboro mildred agriculture arizona corban westech dynamics saginaw anne mendocino montserrat mid international davenport panhandle des occidental wheaton minneapolis family sagrado artistic villanova pottsville indian shoals tidewater miat maintenance elizabethtown acupuncture kittanning long warrendale columbia clarke laconia anschutz charter maria columbus cairn limestone stritch wyoming alexandria eastlake grabber mt mckenna decatur henager networks pass mcnally southgate thornton iselin intellitec arrow gerbers raleigh european greenville michael mti monde lewis allegany center brainerd rhyne philip administration magnus capella pci treasury choffin triangle carey whitestone rockford stratford grace heart western traditional middle closed bonaventure program mclean myomassology bennington williston gate basin illinois abbey bloomsburg 
cogliano moscow cliff ramsey cherry chestnut programs williams warner southington petersburg ottawa roosevelt seguin canton moraine dearborn sonoma fayette bristol tucson ecpi metropolitan kendall cheeks hastings east cleveland genesis crosse copiah granger care milan madonna dorsey film lemoore lake rockhurst stetson logan trocaire dominion midstate cannella town pittsburgh schenectady gordon secours manhattanville unitech grande herkimer black homestead fremont business fulton jasper monticello edgecombe lubbock shelton gallaudet grays wor ogeechee gustavus strongsville britain princeton martin spelman station nelly johnstown testing all york view coastal bleu geneseo barrett advancement casper reporting aims lehman albion waubonsee line degree xavier golden flagstaff licking upstate atlantic antonelli memorial mchenry monmouth magnolia divers davidson boulder rob lassen langston trevecca athens holmes jackson women metairie doane goshen rollins pacific full swlr fairfield erie benton stark ohio ivc bartending onondaga alice deaf chenoweth spring greater baldwin riverhead alliant salter reynolds cozmo radiologic paso kaplan bangor miles leon jesuit roberts stephen ball sciences metropolitana jfk bastyr beltsville italy rico miami earlham reno new fairmont roy quinsigamond myrtle paltz ne ogden tuskegee keystone hibbing service wesley pima sewanee blinn lamar monica\\\"}, \\\"keywords_search\\\": [\\\"college\\\", \\\"scorecard\\\", \\\"finance\\\", \\\"debt\\\", \\\"earnings\\\"], \\\"variables_search\\\": {}}, \\\"search_type\\\": \\\"general\\\"}, \\\"augmentation\\\": {\\\"properties\\\": \\\"join\\\", \\\"right_columns\\\": [3], \\\"left_columns\\\": [2]}, \\\"datamart_type\\\": \\\"isi\\\"}\", \"metadata\": [{\"metadata\": {\"dimension\": {\"length\": 7175, \"name\": \"rows\", \"semantic_types\": [\"https://metadata.datadrivendiscovery.org/types/TabularRow\"]}, \"schema\": \"https://metadata.datadrivendiscovery.org/schemas/v0/container.json\", \"semantic_types\": [\"https://metadata.datadrivendiscovery.org/types/Table\"], \"structural_type\": \"d3m.container.pandas.DataFrame\"}, \"selector\": []}, {\"metadata\": {\"dimension\": {\"length\": 128, \"name\": \"columns\", \"semantic_types\": [\"https://metadata.datadrivendiscovery.org/types/TabularColumn\"]}}, \"selector\": [\"__ALL_ELEMENTS__\"]}, {\"metadata\": {\"name\": \"UNITID\", \"semantic_types\": [\"http://schema.org/Integer\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 0]}, {\"metadata\": {\"name\": \"OPEID\", \"semantic_types\": [\"http://schema.org/Integer\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 1]}, {\"metadata\": {\"name\": \"OPEID6\", \"semantic_types\": [\"http://schema.org/Integer\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 2]}, {\"metadata\": {\"name\": \"INSTNM\", \"semantic_types\": [\"http://schema.org/Text\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 3]}, {\"metadata\": {\"name\": \"CITY\", \"semantic_types\": [\"http://schema.org/Text\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 4]}, {\"metadata\": {\"name\": \"STABBR\", \"semantic_types\": [\"http://schema.org/Text\", 
\"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 5]}, {\"metadata\": {\"name\": \"INSTURL\", \"semantic_types\": [\"http://schema.org/Text\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 6]}, {\"metadata\": {\"name\": \"NPCURL\", \"semantic_types\": [\"http://schema.org/Text\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 7]}, {\"metadata\": {\"name\": \"HCM2\", \"semantic_types\": [\"http://schema.org/Integer\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 8]}, {\"metadata\": {\"name\": \"PREDDEG\", \"semantic_types\": [\"http://schema.org/Integer\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 9]}, {\"metadata\": {\"name\": \"HIGHDEG\", \"semantic_types\": [\"http://schema.org/Integer\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 10]}, {\"metadata\": {\"name\": \"CONTROL\", \"semantic_types\": [\"http://schema.org/Integer\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 11]}, {\"metadata\": {\"name\": \"LOCALE\", \"semantic_types\": [\"http://schema.org/Integer\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 12]}, {\"metadata\": {\"name\": \"HBCU\", \"semantic_types\": [\"http://schema.org/Integer\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 13]}, {\"metadata\": {\"name\": \"PBI\", \"semantic_types\": [\"http://schema.org/Integer\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 14]}, {\"metadata\": {\"name\": \"ANNHI\", \"semantic_types\": [\"http://schema.org/Integer\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 15]}, {\"metadata\": {\"name\": \"TRIBAL\", \"semantic_types\": [\"http://schema.org/Integer\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 16]}, {\"metadata\": {\"name\": \"AANAPII\", \"semantic_types\": [\"http://schema.org/Integer\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 17]}, {\"metadata\": {\"name\": \"HSI\", \"semantic_types\": [\"http://schema.org/Integer\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 18]}, {\"metadata\": {\"name\": \"NANTI\", \"semantic_types\": [\"http://schema.org/Integer\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 19]}, {\"metadata\": {\"name\": \"MENONLY\", \"semantic_types\": [\"http://schema.org/Integer\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 20]}, {\"metadata\": {\"name\": 
\"WOMENONLY\", \"semantic_types\": [\"http://schema.org/Integer\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 21]}, {\"metadata\": {\"name\": \"RELAFFIL\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 22]}, {\"metadata\": {\"name\": \"SATVR25\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 23]}, {\"metadata\": {\"name\": \"SATVR75\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 24]}, {\"metadata\": {\"name\": \"SATMT25\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 25]}, {\"metadata\": {\"name\": \"SATMT75\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 26]}, {\"metadata\": {\"name\": \"SATWR25\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 27]}, {\"metadata\": {\"name\": \"SATWR75\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 28]}, {\"metadata\": {\"name\": \"SATVRMID\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 29]}, {\"metadata\": {\"name\": \"SATMTMID\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 30]}, {\"metadata\": {\"name\": \"SATWRMID\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 31]}, {\"metadata\": {\"name\": \"ACTCM25\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 32]}, {\"metadata\": {\"name\": \"ACTCM75\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 33]}, {\"metadata\": {\"name\": \"ACTEN25\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 34]}, {\"metadata\": {\"name\": \"ACTEN75\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 35]}, {\"metadata\": {\"name\": \"ACTMT25\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, 
\"selector\": [\"__ALL_ELEMENTS__\", 36]}, {\"metadata\": {\"name\": \"ACTMT75\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 37]}, {\"metadata\": {\"name\": \"ACTWR25\", \"semantic_types\": [\"https://metadata.datadrivendiscovery.org/types/CategoricalData\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 38]}, {\"metadata\": {\"name\": \"ACTWR75\", \"semantic_types\": [\"https://metadata.datadrivendiscovery.org/types/CategoricalData\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 39]}, {\"metadata\": {\"name\": \"ACTCMMID\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 40]}, {\"metadata\": {\"name\": \"ACTENMID\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 41]}, {\"metadata\": {\"name\": \"ACTMTMID\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 42]}, {\"metadata\": {\"name\": \"ACTWRMID\", \"semantic_types\": [\"https://metadata.datadrivendiscovery.org/types/CategoricalData\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 43]}, {\"metadata\": {\"name\": \"SAT_AVG\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 44]}, {\"metadata\": {\"name\": \"SAT_AVG_ALL\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 45]}, {\"metadata\": {\"name\": \"PCIP01\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 46]}, {\"metadata\": {\"name\": \"PCIP03\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 47]}, {\"metadata\": {\"name\": \"PCIP04\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 48]}, {\"metadata\": {\"name\": \"PCIP05\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 49]}, {\"metadata\": {\"name\": \"PCIP09\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 50]}, {\"metadata\": {\"name\": \"PCIP10\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 51]}, 
{\"metadata\": {\"name\": \"PCIP11\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 52]}, {\"metadata\": {\"name\": \"PCIP12\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 53]}, {\"metadata\": {\"name\": \"PCIP13\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 54]}, {\"metadata\": {\"name\": \"PCIP14\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 55]}, {\"metadata\": {\"name\": \"PCIP15\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 56]}, {\"metadata\": {\"name\": \"PCIP16\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 57]}, {\"metadata\": {\"name\": \"PCIP19\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 58]}, {\"metadata\": {\"name\": \"PCIP22\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 59]}, {\"metadata\": {\"name\": \"PCIP23\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 60]}, {\"metadata\": {\"name\": \"PCIP24\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 61]}, {\"metadata\": {\"name\": \"PCIP25\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 62]}, {\"metadata\": {\"name\": \"PCIP26\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 63]}, {\"metadata\": {\"name\": \"PCIP27\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 64]}, {\"metadata\": {\"name\": \"PCIP29\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 65]}, {\"metadata\": {\"name\": \"PCIP30\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 66]}, {\"metadata\": {\"name\": \"PCIP31\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": 
\"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 67]}, {\"metadata\": {\"name\": \"PCIP38\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 68]}, {\"metadata\": {\"name\": \"PCIP39\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 69]}, {\"metadata\": {\"name\": \"PCIP40\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 70]}, {\"metadata\": {\"name\": \"PCIP41\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 71]}, {\"metadata\": {\"name\": \"PCIP42\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 72]}, {\"metadata\": {\"name\": \"PCIP43\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 73]}, {\"metadata\": {\"name\": \"PCIP44\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 74]}, {\"metadata\": {\"name\": \"PCIP45\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 75]}, {\"metadata\": {\"name\": \"PCIP46\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 76]}, {\"metadata\": {\"name\": \"PCIP47\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 77]}, {\"metadata\": {\"name\": \"PCIP48\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 78]}, {\"metadata\": {\"name\": \"PCIP49\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 79]}, {\"metadata\": {\"name\": \"PCIP50\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 80]}, {\"metadata\": {\"name\": \"PCIP51\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 81]}, {\"metadata\": {\"name\": \"PCIP52\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 82]}, {\"metadata\": {\"name\": \"PCIP54\", \"semantic_types\": [\"http://schema.org/Float\", 
\"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 83]}, {\"metadata\": {\"name\": \"DISTANCEONLY\", \"semantic_types\": [\"http://schema.org/Integer\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 84]}, {\"metadata\": {\"name\": \"UGDS\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 85]}, {\"metadata\": {\"name\": \"UGDS_WHITE\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 86]}, {\"metadata\": {\"name\": \"UGDS_BLACK\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 87]}, {\"metadata\": {\"name\": \"UGDS_HISP\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 88]}, {\"metadata\": {\"name\": \"UGDS_ASIAN\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 89]}, {\"metadata\": {\"name\": \"UGDS_AIAN\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 90]}, {\"metadata\": {\"name\": \"UGDS_NHPI\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 91]}, {\"metadata\": {\"name\": \"UGDS_2MOR\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 92]}, {\"metadata\": {\"name\": \"UGDS_NRA\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 93]}, {\"metadata\": {\"name\": \"UGDS_UNKN\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 94]}, {\"metadata\": {\"name\": \"PPTUG_EF\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 95]}, {\"metadata\": {\"name\": \"CURROPER\", \"semantic_types\": [\"http://schema.org/Integer\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 96]}, {\"metadata\": {\"name\": \"NPT4_PUB\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 97]}, {\"metadata\": {\"name\": \"NPT4_PRIV\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 
98]}, {\"metadata\": {\"name\": \"NPT41_PUB\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 99]}, {\"metadata\": {\"name\": \"NPT42_PUB\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 100]}, {\"metadata\": {\"name\": \"NPT43_PUB\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 101]}, {\"metadata\": {\"name\": \"NPT44_PUB\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 102]}, {\"metadata\": {\"name\": \"NPT45_PUB\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 103]}, {\"metadata\": {\"name\": \"NPT41_PRIV\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 104]}, {\"metadata\": {\"name\": \"NPT42_PRIV\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 105]}, {\"metadata\": {\"name\": \"NPT43_PRIV\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 106]}, {\"metadata\": {\"name\": \"NPT44_PRIV\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 107]}, {\"metadata\": {\"name\": \"NPT45_PRIV\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 108]}, {\"metadata\": {\"name\": \"PCTPELL\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 109]}, {\"metadata\": {\"name\": \"RET_FT4_POOLED_SUPP\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 110]}, {\"metadata\": {\"name\": \"RET_FTL4_POOLED_SUPP\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 111]}, {\"metadata\": {\"name\": \"RET_PT4_POOLED_SUPP\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 112]}, {\"metadata\": {\"name\": \"RET_PTL4_POOLED_SUPP\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 113]}, {\"metadata\": {\"name\": \"PCTFLOAN\", \"semantic_types\": 
[\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 114]}, {\"metadata\": {\"name\": \"UG25ABV\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 115]}, {\"metadata\": {\"name\": \"MD_EARN_WNE_P10\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 116]}, {\"metadata\": {\"name\": \"GT_25K_P6\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 117]}, {\"metadata\": {\"name\": \"GT_28K_P6\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 118]}, {\"metadata\": {\"name\": \"GRAD_DEBT_MDN_SUPP\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 119]}, {\"metadata\": {\"name\": \"GRAD_DEBT_MDN10YR_SUPP\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 120]}, {\"metadata\": {\"name\": \"RPY_3YR_RT_SUPP\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 121]}, {\"metadata\": {\"name\": \"C150_L4_POOLED_SUPP\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 122]}, {\"metadata\": {\"name\": \"C150_4_POOLED_SUPP\", \"semantic_types\": [\"http://schema.org/Float\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 123]}, {\"metadata\": {\"name\": \"UNITID_wikidata\", \"semantic_types\": [\"http://schema.org/Text\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 124]}, {\"metadata\": {\"name\": \"OPEID6_wikidata\", \"semantic_types\": [\"http://schema.org/Text\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 125]}, {\"metadata\": {\"name\": \"STABBR_wikidata\", \"semantic_types\": [\"http://schema.org/Text\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 126]}, {\"metadata\": {\"name\": \"CITY_wikidata\", \"semantic_types\": [\"http://schema.org/Text\", \"https://metadata.datadrivendiscovery.org/types/Attribute\"], \"structural_type\": \"str\"}, \"selector\": [\"__ALL_ELEMENTS__\", 127]}], \"score\": 0.9398390424662831, \"summary\": {\"Columns\": [\"[0] UNITID\", \"[1] OPEID\", \"[2] OPEID6\", \"[3] INSTNM\", \"[4] CITY\", \"[5] STABBR\", \"[6] INSTURL\", \"[7] NPCURL\", \"[8] HCM2\", \"[9] PREDDEG\", \"[10] HIGHDEG\", \"[11] CONTROL\", \"[12] LOCALE\", \"[13] HBCU\", \"[14] PBI\", \"[15] ANNHI\", \"[16] TRIBAL\", \"[17] 
AANAPII\", \"[18] HSI\", \"[19] NANTI\", \"[20] MENONLY\", \"[21] WOMENONLY\", \"[22] RELAFFIL\", \"[23] SATVR25\", \"[24] SATVR75\", \"[25] SATMT25\", \"[26] SATMT75\", \"[27] SATWR25\", \"[28] SATWR75\", \"[29] SATVRMID\", \"[30] SATMTMID\", \"[31] SATWRMID\", \"[32] ACTCM25\", \"[33] ACTCM75\", \"[34] ACTEN25\", \"[35] ACTEN75\", \"[36] ACTMT25\", \"[37] ACTMT75\", \"[38] ACTWR25\", \"[39] ACTWR75\", \"[40] ACTCMMID\", \"[41] ACTENMID\", \"[42] ACTMTMID\", \"[43] ACTWRMID\", \"[44] SAT_AVG\", \"[45] SAT_AVG_ALL\", \"[46] PCIP01\", \"[47] PCIP03\", \"[48] PCIP04\", \"[49] PCIP05\", \"[50] PCIP09\", \"[51] PCIP10\", \"[52] PCIP11\", \"[53] PCIP12\", \"[54] PCIP13\", \"[55] PCIP14\", \"[56] PCIP15\", \"[57] PCIP16\", \"[58] PCIP19\", \"[59] PCIP22\", \"[60] PCIP23\", \"[61] PCIP24\", \"[62] PCIP25\", \"[63] PCIP26\", \"[64] PCIP27\", \"[65] PCIP29\", \"[66] PCIP30\", \"[67] PCIP31\", \"[68] PCIP38\", \"[69] PCIP39\", \"[70] PCIP40\", \"[71] PCIP41\", \"[72] PCIP42\", \"[73] PCIP43\", \"[74] PCIP44\", \"[75] PCIP45\", \"[76] PCIP46\", \"[77] PCIP47\", \"[78] PCIP48\", \"[79] PCIP49\", \"[80] PCIP50\", \"[81] PCIP51\", \"[82] PCIP52\", \"[83] PCIP54\", \"[84] DISTANCEONLY\", \"[85] UGDS\", \"[86] UGDS_WHITE\", \"[87] UGDS_BLACK\", \"[88] UGDS_HISP\", \"[89] UGDS_ASIAN\", \"[90] UGDS_AIAN\", \"[91] UGDS_NHPI\", \"[92] UGDS_2MOR\", \"[93] UGDS_NRA\", \"[94] UGDS_UNKN\", \"[95] PPTUG_EF\", \"[96] CURROPER\", \"[97] NPT4_PUB\", \"[98] NPT4_PRIV\", \"[99] NPT41_PUB\", \"[100] NPT42_PUB\", \"[101] NPT43_PUB\", \"[102] NPT44_PUB\", \"[103] NPT45_PUB\", \"[104] NPT41_PRIV\", \"[105] NPT42_PRIV\", \"[106] NPT43_PRIV\", \"[107] NPT44_PRIV\", \"[108] NPT45_PRIV\", \"[109] PCTPELL\", \"[110] RET_FT4_POOLED_SUPP\", \"[111] RET_FTL4_POOLED_SUPP\", \"[112] RET_PT4_POOLED_SUPP\", \"[113] RET_PTL4_POOLED_SUPP\", \"[114] PCTFLOAN\", \"[115] UG25ABV\", \"[116] MD_EARN_WNE_P10\", \"[117] GT_25K_P6\", \"[118] GT_28K_P6\", \"[119] GRAD_DEBT_MDN_SUPP\", \"[120] GRAD_DEBT_MDN10YR_SUPP\", \"[121] RPY_3YR_RT_SUPP\", \"[122] C150_L4_POOLED_SUPP\", \"[123] C150_4_POOLED_SUPP\", \"[124] UNITID_wikidata\", \"[125] OPEID6_wikidata\", \"[126] STABBR_wikidata\", \"[127] CITY_wikidata\"], \"Datamart ID\": \"D4cb70062-77ed-4097-a486-0b43ffe81463\", \"Recommend Join Columns\": \"INSTNM\", \"Score\": \"0.9398390424662831\", \"URL\": \"http://dsbox02.isi.edu:9000/upload/local_datasets/Most-Recent-Cohorts-Scorecard-Elements.csv\", \"title\": \"most recent cohorts scorecard elements csv\"}, \"supplied_id\": \"DA_college_debt_dataset_TRAIN\", \"supplied_resource_id\": \"learningData\"}" + } + } + }, + { + "type": "PRIMITIVE", + "primitive": { + "id": "4b42ce1e-9b98-4a25-b68e-fad13311eb65", + "version": "0.3.0", + "python_path": "d3m.primitives.data_transformation.dataset_to_dataframe.Common", + "name": "Extract a DataFrame from a Dataset", + "digest": "57475517f8d20c260757a13497239c3ddfb3c0949ab9769e5c177c18b919eaa1" + }, + "arguments": { + "inputs": { + "type": "CONTAINER", + "data": "steps.3.produce" + } + }, + "outputs": [ + { + "id": "produce" + } + ] + }, + { + "type": "PRIMITIVE", + "primitive": { + "id": "3002bc5b-fa47-4a3d-882e-a8b5f3d756aa", + "version": "0.1.0", + "python_path": "d3m.primitives.data_transformation.remove_semantic_types.Common", + "name": "Remove semantic types from columns", + "digest": "a7a99c19c430ad238787bb17f33bb5ad6dd62f350190284dae86798f880281c0" + }, + "arguments": { + "inputs": { + "type": "CONTAINER", + "data": "steps.4.produce" + } + }, + "outputs": [ + { + "id": "produce" + } + ], + 
"hyperparams": { + "semantic_types": { + "type": "VALUE", + "data": [ + "http://wikidata.org/qnode" + ] + } + } + }, + { + "type": "PRIMITIVE", + "primitive": { + "id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1", + "version": "0.3.0", + "python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common", + "name": "Extracts columns by semantic type", + "digest": "75a68013cd3c12e77ba31e392298d2a62766ae00d556fdaf30401f7ba4a29b8c" + }, + "arguments": { + "inputs": { + "type": "CONTAINER", + "data": "steps.5.produce" + } + }, + "outputs": [ + { + "id": "produce" + } + ], + "hyperparams": { + "semantic_types": { + "type": "VALUE", + "data": [ + "https://metadata.datadrivendiscovery.org/types/PrimaryKey", + "https://metadata.datadrivendiscovery.org/types/Attribute" + ] + } + } + }, + { + "type": "PRIMITIVE", + "primitive": { + "id": "b2612849-39e4-33ce-bfda-24f3e2cb1e93", + "version": "1.5.3", + "python_path": "d3m.primitives.schema_discovery.profiler.DSBOX", + "name": "DSBox Profiler", + "digest": "d584c3e2af2f60947f9703fd8aa22ea04ccf9fe20266a3f7ac87da939838fe5f" + }, + "arguments": { + "inputs": { + "type": "CONTAINER", + "data": "steps.6.produce" + } + }, + "outputs": [ + { + "id": "produce" + } + ] + }, + { + "type": "PRIMITIVE", + "primitive": { + "id": "dsbox-cleaning-featurizer", + "version": "1.5.3", + "python_path": "d3m.primitives.data_cleaning.cleaning_featurizer.DSBOX", + "name": "DSBox Cleaning Featurizer", + "digest": "3e0646c87ba9d9745ff0ced1ef381da434a29af1654bd3cdc2db46a7f1a87f20" + }, + "arguments": { + "inputs": { + "type": "CONTAINER", + "data": "steps.7.produce" + } + }, + "outputs": [ + { + "id": "produce" + } + ] + }, + { + "type": "PRIMITIVE", + "primitive": { + "id": "18f0bb42-6350-3753-8f2d-d1c3da70f279", + "version": "1.5.3", + "python_path": "d3m.primitives.data_preprocessing.encoder.DSBOX", + "name": "ISI DSBox Data Encoder", + "digest": "026f3fb4af7c426034e492829a2fb6968bb6961fee868f6d3be4fd5c0aae72f7" + }, + "arguments": { + "inputs": { + "type": "CONTAINER", + "data": "steps.8.produce" + } + }, + "outputs": [ + { + "id": "produce" + } + ] + }, + { + "type": "PRIMITIVE", + "primitive": { + "id": "0c64ffd6-cb9e-49f0-b7cb-abd70a5a8261", + "version": "1.0.0", + "python_path": "d3m.primitives.feature_construction.corex_text.DSBOX", + "name": "CorexText", + "digest": "7d942ed753a5d1d4089a37aa446c25cf80a14e6fb0feb2a6a4fc0218d5f88292" + }, + "arguments": { + "inputs": { + "type": "CONTAINER", + "data": "steps.9.produce" + } + }, + "outputs": [ + { + "id": "produce" + } + ] + }, + { + "type": "PRIMITIVE", + "primitive": { + "id": "7ddf2fd8-2f7f-4e53-96a7-0d9f5aeecf93", + "version": "1.5.3", + "python_path": "d3m.primitives.data_transformation.to_numeric.DSBOX", + "name": "ISI DSBox To Numeric DataFrame", + "digest": "0c06f13376139c95f9c7ee2c4ea0e1b74242092a9c0a5359444b584b7b26b4b6" + }, + "arguments": { + "inputs": { + "type": "CONTAINER", + "data": "steps.10.produce" + } + }, + "outputs": [ + { + "id": "produce" + } + ] + }, + { + "type": "PRIMITIVE", + "primitive": { + "id": "7894b699-61e9-3a50-ac9f-9bc510466667", + "version": "1.5.3", + "python_path": "d3m.primitives.data_preprocessing.mean_imputation.DSBOX", + "name": "DSBox Mean Imputer", + "digest": "c06061074a29dffda6f59d779bf6658fd69f5d101a4e48569cd6ad35775da9f0" + }, + "arguments": { + "inputs": { + "type": "CONTAINER", + "data": "steps.11.produce" + } + }, + "outputs": [ + { + "id": "produce" + } + ] + }, + { + "type": "PRIMITIVE", + "primitive": { + "id": "dsbox-featurizer-do-nothing", + 
"version": "1.5.3", + "python_path": "d3m.primitives.data_preprocessing.do_nothing.DSBOX", + "name": "DSBox do-nothing primitive", + "digest": "b540e87d22c38511e88693cce3dcdba7085ede3119d2b18c1172f734df16ce43" + }, + "arguments": { + "inputs": { + "type": "CONTAINER", + "data": "steps.12.produce" + } + }, + "outputs": [ + { + "id": "produce" + } + ] + }, + { + "type": "PRIMITIVE", + "primitive": { + "id": "dsbox-featurizer-do-nothing", + "version": "1.5.3", + "python_path": "d3m.primitives.data_preprocessing.do_nothing.DSBOX", + "name": "DSBox do-nothing primitive", + "digest": "b540e87d22c38511e88693cce3dcdba7085ede3119d2b18c1172f734df16ce43" + }, + "arguments": { + "inputs": { + "type": "CONTAINER", + "data": "steps.13.produce" + } + }, + "outputs": [ + { + "id": "produce" + } + ] + }, + { + "type": "PRIMITIVE", + "primitive": { + "id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1", + "version": "0.3.0", + "python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common", + "name": "Extracts columns by semantic type", + "digest": "75a68013cd3c12e77ba31e392298d2a62766ae00d556fdaf30401f7ba4a29b8c" + }, + "arguments": { + "inputs": { + "type": "CONTAINER", + "data": "steps.4.produce" + } + }, + "outputs": [ + { + "id": "produce" + } + ], + "hyperparams": { + "semantic_types": { + "type": "VALUE", + "data": [ + "https://metadata.datadrivendiscovery.org/types/TrueTarget" + ] + } + } + }, + { + "type": "PRIMITIVE", + "primitive": { + "id": "7ddf2fd8-2f7f-4e53-96a7-0d9f5aeecf93", + "version": "1.5.3", + "python_path": "d3m.primitives.data_transformation.to_numeric.DSBOX", + "name": "ISI DSBox To Numeric DataFrame", + "digest": "0c06f13376139c95f9c7ee2c4ea0e1b74242092a9c0a5359444b584b7b26b4b6" + }, + "arguments": { + "inputs": { + "type": "CONTAINER", + "data": "steps.15.produce" + } + }, + "outputs": [ + { + "id": "produce" + } + ], + "hyperparams": { + "drop_non_numeric_columns": { + "type": "VALUE", + "data": false + } + } + }, + { + "type": "PRIMITIVE", + "primitive": { + "id": "35321059-2a1a-31fd-9509-5494efc751c7", + "version": "2019.6.7", + "python_path": "d3m.primitives.regression.extra_trees.SKlearn", + "name": "sklearn.ensemble.forest.ExtraTreesRegressor", + "digest": "0a8153e2821cacf807429c02b1b210ed6c700e8342b7af988b93245514b6f345" + }, + "arguments": { + "inputs": { + "type": "CONTAINER", + "data": "steps.14.produce" + }, + "outputs": { + "type": "CONTAINER", + "data": "steps.16.produce" + } + }, + "outputs": [ + { + "id": "produce" + } + ], + "hyperparams": { + "bootstrap": { + "type": "VALUE", + "data": "bootstrap" + }, + "max_depth": { + "type": "VALUE", + "data": { + "case": "none", + "value": null + } + }, + "min_samples_leaf": { + "type": "VALUE", + "data": { + "case": "absolute", + "value": 1 + } + }, + "min_samples_split": { + "type": "VALUE", + "data": { + "case": "int", + "value": 10 + } + }, + "max_features": { + "type": "VALUE", + "data": { + "case": "calculated", + "value": "auto" + } + }, + "n_estimators": { + "type": "VALUE", + "data": 100 + }, + "add_index_columns": { + "type": "VALUE", + "data": true + }, + "use_semantic_types": { + "type": "VALUE", + "data": true + } + } + } + ], + "source": { + "name": "ISI", + "contact": "mailto:kyao@isi.edu" + }, + "name": "default_regression_template:140004175511624", + "description": "", + "digest": "e1a65c6510329dcee1df7ebee899a5b554578b054e8e0cb1434af88cce4b8d45" +} diff --git a/tods/common-primitives/pipelines/data_augmentation.datamart_augmentation.Common/4ff2f21d-1bba-4c44-bb96-e05728bcf6ed.json 
b/tods/common-primitives/pipelines/data_augmentation.datamart_augmentation.Common/4ff2f21d-1bba-4c44-bb96-e05728bcf6ed.json new file mode 100644 index 0000000..5dad103 --- /dev/null +++ b/tods/common-primitives/pipelines/data_augmentation.datamart_augmentation.Common/4ff2f21d-1bba-4c44-bb96-e05728bcf6ed.json @@ -0,0 +1,342 @@ +{ + "id": "4ff2f21d-1bba-4c44-bb96-e05728bcf6ed", + "name": "classification_template(imputer=d3m.primitives.data_cleaning.imputer.SKlearn, classifier=d3m.primitives.regression.random_forest.SKlearn)", + "description": "To be used with NYU datamart.", + "schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", + "created": "2019-06-06T21:30:30Z", + "context": "TESTING", + "inputs": [ + { + "name": "input dataset" + } + ], + "outputs": [ + { + "data": "steps.12.produce", + "name": "predictions" + } + ], + "steps": [ + { + "type": "PRIMITIVE", + "primitive": { + "id": "fe0f1ac8-1d39-463a-b344-7bd498a31b91", + "version": "0.1", + "name": "Perform dataset augmentation using Datamart", + "python_path": "d3m.primitives.data_augmentation.datamart_augmentation.Common" + }, + "arguments": { + "inputs": { + "type": "CONTAINER", + "data": "inputs.0" + } + }, + "outputs": [ + { + "id": "produce" + } + ], + "hyperparams": { + "system_identifier": { + "type": "VALUE", + "data": "NYU" + }, + "search_result": { + "type": "VALUE", + "data": "{\"augmentation\": {\"left_columns\": [[1]], \"left_columns_names\": [\"tpep_pickup_datetime\"], \"right_columns\": [[0]], \"type\": \"join\"}, \"id\": \"datamart.url.a3943fd7892d5d219012f889327c6661\", \"metadata\": {\"columns\": [{\"coverage\": [{\"range\": {\"gte\": 1451610000.0, \"lte\": 1540252800.0}}], \"mean\": 1495931400.0, \"name\": \"DATE\", \"semantic_types\": [\"http://schema.org/DateTime\"], \"stddev\": 25590011.431395352, \"structural_type\": \"http://schema.org/Text\"}, {\"name\": \"HOURLYSKYCONDITIONS\", \"semantic_types\": [], \"structural_type\": \"http://schema.org/Text\"}, {\"coverage\": [{\"range\": {\"gte\": -17.2, \"lte\": 37.8}}], \"mean\": 14.666224009096823, \"name\": \"HOURLYDRYBULBTEMPC\", \"semantic_types\": [], \"stddev\": 9.973788193915643, \"structural_type\": \"http://schema.org/Float\"}, {\"coverage\": [{\"range\": {\"gte\": 11.0, \"lte\": 100.0}}], \"mean\": 60.70849577647823, \"name\": \"HOURLYRelativeHumidity\", \"semantic_types\": [], \"stddev\": 18.42048051096981, \"structural_type\": \"http://schema.org/Float\"}, {\"coverage\": [{\"range\": {\"gte\": 0.0, \"lte\": 41.0}}], \"mean\": 10.68859649122807, \"name\": \"HOURLYWindSpeed\", \"semantic_types\": [], \"stddev\": 5.539675475162907, \"structural_type\": \"http://schema.org/Float\"}, {\"name\": \"HOURLYWindDirection\", \"semantic_types\": [], \"structural_type\": \"http://schema.org/Text\"}, {\"coverage\": [{\"range\": {\"gte\": 28.89, \"lte\": 30.81}}], \"mean\": 29.90760315139694, \"name\": \"HOURLYStationPressure\", \"semantic_types\": [\"https://metadata.datadrivendiscovery.org/types/PhoneNumber\"], \"stddev\": 0.24584097919742368, \"structural_type\": \"http://schema.org/Float\"}], \"date\": \"2019-01-22T01:54:58.281183Z\", \"description\": \"This data contains weather information for NY city around LaGuardia Airport from 2016 to 2018; weath...\", \"materialize\": {\"direct_url\": \"https://drive.google.com/uc?export=download&id=1jRwzZwEGMICE3n6-nwmVxMD2c0QCHad4\", \"identifier\": \"datamart.url\"}, \"name\": \"Newyork Weather Data around Airport 2016-18\", \"nb_rows\": 24624, \"size\": 1523693}, \"score\": 1.0, \"supplied_id\": 
\"DA_ny_taxi_demand_dataset_TRAIN\", \"supplied_resource_id\": \"learningData\"}" + } + } + }, + { + "type": "PRIMITIVE", + "primitive": { + "id": "f31f8c1f-d1c5-43e5-a4b2-2ae4a761ef2e", + "version": "0.2.0", + "name": "Denormalize datasets", + "python_path": "d3m.primitives.data_transformation.denormalize.Common" + }, + "arguments": { + "inputs": { + "type": "CONTAINER", + "data": "steps.0.produce" + } + }, + "outputs": [ + { + "id": "produce" + } + ] + }, + { + "type": "PRIMITIVE", + "primitive": { + "id": "4b42ce1e-9b98-4a25-b68e-fad13311eb65", + "version": "0.3.0", + "name": "Extract a DataFrame from a Dataset", + "python_path": "d3m.primitives.data_transformation.dataset_to_dataframe.Common" + }, + "arguments": { + "inputs": { + "type": "CONTAINER", + "data": "steps.1.produce" + } + }, + "outputs": [ + { + "id": "produce" + } + ] + }, + { + "type": "PRIMITIVE", + "primitive": { + "id": "d510cb7a-1782-4f51-b44c-58f0236e47c7", + "version": "0.6.0", + "name": "Parses strings into their types", + "python_path": "d3m.primitives.data_transformation.column_parser.Common" + }, + "arguments": { + "inputs": { + "type": "CONTAINER", + "data": "steps.2.produce" + } + }, + "outputs": [ + { + "id": "produce" + } + ] + }, + { + "type": "PRIMITIVE", + "primitive": { + "id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1", + "version": "0.3.0", + "name": "Extracts columns by semantic type", + "python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common" + }, + "arguments": { + "inputs": { + "type": "CONTAINER", + "data": "steps.3.produce" + } + }, + "outputs": [ + { + "id": "produce" + } + ], + "hyperparams": { + "semantic_types": { + "type": "VALUE", + "data": [ + "https://metadata.datadrivendiscovery.org/types/Attribute" + ] + } + } + }, + { + "type": "PRIMITIVE", + "primitive": { + "name": "sklearn.impute.SimpleImputer", + "python_path": "d3m.primitives.data_cleaning.imputer.SKlearn", + "version": "2019.11.13", + "id": "d016df89-de62-3c53-87ed-c06bb6a23cde" + }, + "arguments": { + "inputs": { + "type": "CONTAINER", + "data": "steps.4.produce" + } + }, + "outputs": [ + { + "id": "produce" + } + ], + "hyperparams": { + "strategy": { + "type": "VALUE", + "data": "most_frequent" + } + } + }, + { + "type": "PRIMITIVE", + "primitive": { + "name": "sklearn.preprocessing.data.OneHotEncoder", + "python_path": "d3m.primitives.data_transformation.one_hot_encoder.SKlearn", + "version": "2019.11.13", + "id": "c977e879-1bf5-3829-b5b0-39b00233aff5" + }, + "arguments": { + "inputs": { + "type": "CONTAINER", + "data": "steps.5.produce" + } + }, + "outputs": [ + { + "id": "produce" + } + ], + "hyperparams": { + "handle_unknown": { + "type": "VALUE", + "data": "ignore" + } + } + }, + { + "type": "PRIMITIVE", + "primitive": { + "id": "eb5fe752-f22a-4090-948b-aafcef203bf5", + "version": "0.2.0", + "name": "Casts DataFrame", + "python_path": "d3m.primitives.data_transformation.cast_to_type.Common" + }, + "arguments": { + "inputs": { + "type": "CONTAINER", + "data": "steps.6.produce" + } + }, + "outputs": [ + { + "id": "produce" + } + ], + "hyperparams": { + "type_to_cast": { + "type": "VALUE", + "data": "float" + } + } + }, + { + "type": "PRIMITIVE", + "primitive": { + "id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1", + "version": "0.3.0", + "name": "Extracts columns by semantic type", + "python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common" + }, + "arguments": { + "inputs": { + "type": "CONTAINER", + "data": "steps.3.produce" + } + }, + "outputs": [ + { + "id": 
"produce" + } + ], + "hyperparams": { + "semantic_types": { + "type": "VALUE", + "data": [ + "https://metadata.datadrivendiscovery.org/types/TrueTarget" + ] + } + } + }, + { + "type": "PRIMITIVE", + "primitive": { + "id": "eb5fe752-f22a-4090-948b-aafcef203bf5", + "version": "0.2.0", + "name": "Casts DataFrame", + "python_path": "d3m.primitives.data_transformation.cast_to_type.Common" + }, + "arguments": { + "inputs": { + "type": "CONTAINER", + "data": "steps.8.produce" + } + }, + "outputs": [ + { + "id": "produce" + } + ] + }, + { + "type": "PRIMITIVE", + "primitive": { + "name": "sklearn.ensemble.forest.RandomForestRegressor", + "python_path": "d3m.primitives.regression.random_forest.SKlearn", + "version": "2019.11.13", + "id": "f0fd7a62-09b5-3abc-93bb-f5f999f7cc80" + }, + "arguments": { + "inputs": { + "type": "CONTAINER", + "data": "steps.7.produce" + }, + "outputs": { + "type": "CONTAINER", + "data": "steps.9.produce" + } + }, + "outputs": [ + { + "id": "produce" + } + ] + }, + { + "type": "PRIMITIVE", + "primitive": { + "id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1", + "version": "0.3.0", + "name": "Extracts columns by semantic type", + "python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common" + }, + "arguments": { + "inputs": { + "type": "CONTAINER", + "data": "steps.3.produce" + } + }, + "outputs": [ + { + "id": "produce" + } + ], + "hyperparams": { + "semantic_types": { + "type": "VALUE", + "data": [ + "https://metadata.datadrivendiscovery.org/types/Target", + "https://metadata.datadrivendiscovery.org/types/PrimaryKey" + ] + } + } + }, + { + "type": "PRIMITIVE", + "primitive": { + "id": "8d38b340-f83f-4877-baaa-162f8e551736", + "version": "0.3.0", + "name": "Construct pipeline predictions output", + "python_path": "d3m.primitives.data_transformation.construct_predictions.Common" + }, + "arguments": { + "inputs": { + "type": "CONTAINER", + "data": "steps.10.produce" + }, + "reference": { + "type": "CONTAINER", + "data": "steps.11.produce" + } + }, + "outputs": [ + { + "id": "produce" + } + ] + } + ] +} diff --git a/tods/common-primitives/pipelines/data_preprocessing.dataset_sample.Common/387d432a-9893-4558-b190-1c5e9e399dbf.yaml b/tods/common-primitives/pipelines/data_preprocessing.dataset_sample.Common/387d432a-9893-4558-b190-1c5e9e399dbf.yaml new file mode 100644 index 0000000..d8ece59 --- /dev/null +++ b/tods/common-primitives/pipelines/data_preprocessing.dataset_sample.Common/387d432a-9893-4558-b190-1c5e9e399dbf.yaml @@ -0,0 +1,123 @@ +id: 387d432a-9893-4558-b190-1c5e9e399dbf +schema: https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json +source: + name: Jeffrey Gleason +created: "2019-06-05T2:48:52.806069Z" +context: TESTING +name: Dataset sample test pipeline +description: | + A simple pipeline which runs Random Forest classifier on tabular data after sampling the dataset (50% of rows) +inputs: + - name: input dataset +outputs: + - name: predictions + data: steps.6.produce +steps: + # Step 0. + - type: PRIMITIVE + primitive: + id: 268315c1-7549-4aee-a4cc-28921cba74c0 + version: 0.1.0 + python_path: d3m.primitives.data_preprocessing.dataset_sample.Common + name: Dataset sampling primitive + arguments: + inputs: + type: CONTAINER + data: inputs.0 + outputs: + - id: produce + # Step 1. 
+ - type: PRIMITIVE + primitive: + id: f31f8c1f-d1c5-43e5-a4b2-2ae4a761ef2e + version: 0.2.0 + python_path: d3m.primitives.data_transformation.denormalize.Common + name: Denormalize datasets + arguments: + inputs: + type: CONTAINER + data: steps.0.produce + outputs: + - id: produce + # Step 2. + - type: PRIMITIVE + primitive: + id: 4b42ce1e-9b98-4a25-b68e-fad13311eb65 + version: 0.3.0 + python_path: d3m.primitives.data_transformation.dataset_to_dataframe.Common + name: Extract a DataFrame from a Dataset + arguments: + inputs: + type: CONTAINER + data: steps.1.produce + outputs: + - id: produce + # Step 3. + - type: PRIMITIVE + primitive: + id: d510cb7a-1782-4f51-b44c-58f0236e47c7 + version: 0.6.0 + python_path: d3m.primitives.data_transformation.column_parser.Common + name: Parses strings into their types + arguments: + inputs: + type: CONTAINER + data: steps.2.produce + outputs: + - id: produce + # Step 4. + - type: PRIMITIVE + primitive: + id: d016df89-de62-3c53-87ed-c06bb6a23cde + version: 2019.6.7 + python_path: d3m.primitives.data_cleaning.imputer.SKlearn + name: sklearn.impute.SimpleImputer + arguments: + inputs: + type: CONTAINER + data: steps.3.produce + outputs: + - id: produce + hyperparams: + use_semantic_types: + type: VALUE + data: true + return_result: + type: VALUE + data: replace + # Step 5. + - type: PRIMITIVE + primitive: + id: 37c2b19d-bdab-4a30-ba08-6be49edcc6af + version: 0.4.0 + python_path: d3m.primitives.classification.random_forest.Common + name: Random forest classifier + arguments: + inputs: + type: CONTAINER + data: steps.4.produce + outputs: + type: CONTAINER + data: steps.4.produce + outputs: + - id: produce + hyperparams: + return_result: + type: VALUE + data: replace + # Step 6. + - type: PRIMITIVE + primitive: + id: 8d38b340-f83f-4877-baaa-162f8e551736 + version: 0.3.0 + python_path: d3m.primitives.data_transformation.construct_predictions.Common + name: Construct pipeline predictions output + arguments: + inputs: + type: CONTAINER + data: steps.5.produce + reference: + type: CONTAINER + data: steps.3.produce + outputs: + - id: produce diff --git a/tods/common-primitives/pipelines/data_preprocessing.one_hot_encoder.MakerCommon/2b307634-f01e-412e-8d95-7e54afd4731f.json b/tods/common-primitives/pipelines/data_preprocessing.one_hot_encoder.MakerCommon/2b307634-f01e-412e-8d95-7e54afd4731f.json new file mode 100644 index 0000000..5606e66 --- /dev/null +++ b/tods/common-primitives/pipelines/data_preprocessing.one_hot_encoder.MakerCommon/2b307634-f01e-412e-8d95-7e54afd4731f.json @@ -0,0 +1,300 @@ +{ + "context": "TESTING", + "created": "2019-02-12T02:10:00.929519Z", + "id": "2b307634-f01e-412e-8d95-7e54afd4731f", + "inputs": [ + { + "name": "inputs" + } + ], + "outputs": [ + { + "data": "steps.9.produce", + "name": "output predictions" + } + ], + "schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", + "steps": [ + { + "arguments": { + "inputs": { + "data": "inputs.0", + "type": "CONTAINER" + } + }, + "outputs": [ + { + "id": "produce" + } + ], + "primitive": { + "id": "4b42ce1e-9b98-4a25-b68e-fad13311eb65", + "name": "Extract a DataFrame from a Dataset", + "python_path": "d3m.primitives.data_transformation.dataset_to_dataframe.Common", + "version": "0.3.0" + }, + "type": "PRIMITIVE" + }, + { + "arguments": { + "inputs": { + "data": "steps.0.produce", + "type": "CONTAINER" + } + }, + "hyperparams": { + "parse_semantic_types": { + "data": [ + "http://schema.org/Boolean", + "http://schema.org/Integer", + "http://schema.org/Float", + 
"https://metadata.datadrivendiscovery.org/types/FloatVector", + "http://schema.org/DateTime" + ], + "type": "VALUE" + } + }, + "outputs": [ + { + "id": "produce" + } + ], + "primitive": { + "id": "d510cb7a-1782-4f51-b44c-58f0236e47c7", + "name": "Parses strings into their types", + "python_path": "d3m.primitives.data_transformation.column_parser.Common", + "version": "0.6.0" + }, + "type": "PRIMITIVE" + }, + { + "arguments": { + "inputs": { + "data": "steps.1.produce", + "type": "CONTAINER" + } + }, + "hyperparams": { + "semantic_types": { + "data": [ + "https://metadata.datadrivendiscovery.org/types/CategoricalData" + ], + "type": "VALUE" + } + }, + "outputs": [ + { + "id": "produce" + } + ], + "primitive": { + "id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1", + "name": "Extracts columns by semantic type", + "python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common", + "version": "0.3.0" + }, + "type": "PRIMITIVE" + }, + { + "arguments": { + "inputs": { + "data": "steps.1.produce", + "type": "CONTAINER" + } + }, + "hyperparams": { + "exclude_columns": { + "data": [ + 0 + ], + "type": "VALUE" + }, + "semantic_types": { + "data": [ + "http://schema.org/Integer", + "http://schema.org/Float" + ], + "type": "VALUE" + } + }, + "outputs": [ + { + "id": "produce" + } + ], + "primitive": { + "id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1", + "name": "Extracts columns by semantic type", + "python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common", + "version": "0.3.0" + }, + "type": "PRIMITIVE" + }, + { + "arguments": { + "inputs": { + "data": "steps.0.produce", + "type": "CONTAINER" + } + }, + "hyperparams": { + "semantic_types": { + "data": [ + "https://metadata.datadrivendiscovery.org/types/TrueTarget" + ], + "type": "VALUE" + } + }, + "outputs": [ + { + "id": "produce" + } + ], + "primitive": { + "id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1", + "name": "Extracts columns by semantic type", + "python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common", + "version": "0.3.0" + }, + "type": "PRIMITIVE" + }, + { + "arguments": { + "inputs": { + "data": "steps.3.produce", + "type": "CONTAINER" + } + }, + "hyperparams": { + "return_result": { + "data": "replace", + "type": "VALUE" + }, + "use_semantic_types": { + "data": true, + "type": "VALUE" + } + }, + "outputs": [ + { + "id": "produce" + } + ], + "primitive": { + "id": "d016df89-de62-3c53-87ed-c06bb6a23cde", + "name": "sklearn.impute.SimpleImputer", + "python_path": "d3m.primitives.data_cleaning.imputer.SKlearn", + "version": "2019.6.7" + }, + "type": "PRIMITIVE" + }, + { + "arguments": { + "inputs": { + "data": "steps.2.produce", + "type": "CONTAINER" + } + }, + "hyperparams": { + "return_result": { + "data": "replace", + "type": "VALUE" + } + }, + "outputs": [ + { + "id": "produce" + } + ], + "primitive": { + "id": "eaec420d-46eb-4ddf-a2cd-b8097345ff3e", + "name": "One-hot maker", + "python_path": "d3m.primitives.data_preprocessing.one_hot_encoder.MakerCommon", + "version": "0.2.0" + }, + "type": "PRIMITIVE" + }, + { + "arguments": { + "left": { + "data": "steps.6.produce", + "type": "CONTAINER" + }, + "right": { + "data": "steps.5.produce", + "type": "CONTAINER" + } + }, + "outputs": [ + { + "id": "produce" + } + ], + "primitive": { + "id": "aff6a77a-faa0-41c5-9595-de2e7f7c4760", + "name": "Concatenate two dataframes", + "python_path": "d3m.primitives.data_transformation.horizontal_concat.DataFrameCommon", + "version": "0.2.0" + }, + "type": "PRIMITIVE" 
+ }, + { + "arguments": { + "inputs": { + "data": "steps.7.produce", + "type": "CONTAINER" + }, + "outputs": { + "data": "steps.4.produce", + "type": "CONTAINER" + } + }, + "hyperparams": { + "return_result": { + "data": "replace", + "type": "VALUE" + }, + "use_semantic_types": { + "data": true, + "type": "VALUE" + } + }, + "outputs": [ + { + "id": "produce" + } + ], + "primitive": { + "id": "1dd82833-5692-39cb-84fb-2455683075f3", + "name": "sklearn.ensemble.forest.RandomForestClassifier", + "python_path": "d3m.primitives.classification.random_forest.SKlearn", + "version": "2019.6.7" + }, + "type": "PRIMITIVE" + }, + { + "arguments": { + "inputs": { + "data": "steps.8.produce", + "type": "CONTAINER" + }, + "reference": { + "data": "steps.1.produce", + "type": "CONTAINER" + } + }, + "outputs": [ + { + "id": "produce" + } + ], + "primitive": { + "id": "8d38b340-f83f-4877-baaa-162f8e551736", + "name": "Construct pipeline predictions output", + "python_path": "d3m.primitives.data_transformation.construct_predictions.Common", + "version": "0.3.0" + }, + "type": "PRIMITIVE" + } + ] +} diff --git a/tods/common-primitives/pipelines/data_preprocessing.one_hot_encoder.PandasCommon/b523335c-0c47-4d02-a582-f69609cde1e8.json b/tods/common-primitives/pipelines/data_preprocessing.one_hot_encoder.PandasCommon/b523335c-0c47-4d02-a582-f69609cde1e8.json new file mode 120000 index 0000000..51266fd --- /dev/null +++ b/tods/common-primitives/pipelines/data_preprocessing.one_hot_encoder.PandasCommon/b523335c-0c47-4d02-a582-f69609cde1e8.json @@ -0,0 +1 @@ +../data_transformation.extract_columns_by_structural_types.Common/b523335c-0c47-4d02-a582-f69609cde1e8.json \ No newline at end of file diff --git a/tods/common-primitives/pipelines/data_transformation.column_parser.DataFrameCommon/4ec215d1-6484-4502-a6dd-f659943ccb94.json b/tods/common-primitives/pipelines/data_transformation.column_parser.DataFrameCommon/4ec215d1-6484-4502-a6dd-f659943ccb94.json new file mode 120000 index 0000000..0deae2e --- /dev/null +++ b/tods/common-primitives/pipelines/data_transformation.column_parser.DataFrameCommon/4ec215d1-6484-4502-a6dd-f659943ccb94.json @@ -0,0 +1 @@ +../data_transformation.extract_columns.Common/4ec215d1-6484-4502-a6dd-f659943ccb94.json \ No newline at end of file diff --git a/tods/common-primitives/pipelines/data_transformation.column_parser.DataFrameCommon/a8c40699-c48d-4f12-aa18-639c5fb6baae.json b/tods/common-primitives/pipelines/data_transformation.column_parser.DataFrameCommon/a8c40699-c48d-4f12-aa18-639c5fb6baae.json new file mode 120000 index 0000000..b1225d9 --- /dev/null +++ b/tods/common-primitives/pipelines/data_transformation.column_parser.DataFrameCommon/a8c40699-c48d-4f12-aa18-639c5fb6baae.json @@ -0,0 +1 @@ +../data_transformation.grouping_field_compose.Common/a8c40699-c48d-4f12-aa18-639c5fb6baae.json \ No newline at end of file diff --git a/tods/common-primitives/pipelines/data_transformation.column_parser.DataFrameCommon/b523335c-0c47-4d02-a582-f69609cde1e8.json b/tods/common-primitives/pipelines/data_transformation.column_parser.DataFrameCommon/b523335c-0c47-4d02-a582-f69609cde1e8.json new file mode 120000 index 0000000..51266fd --- /dev/null +++ b/tods/common-primitives/pipelines/data_transformation.column_parser.DataFrameCommon/b523335c-0c47-4d02-a582-f69609cde1e8.json @@ -0,0 +1 @@ +../data_transformation.extract_columns_by_structural_types.Common/b523335c-0c47-4d02-a582-f69609cde1e8.json \ No newline at end of file diff --git 
a/tods/common-primitives/pipelines/data_transformation.column_parser.DataFrameCommon/d2473bbc-7839-4deb-9ba4-4ff4bc9b0bde.json b/tods/common-primitives/pipelines/data_transformation.column_parser.DataFrameCommon/d2473bbc-7839-4deb-9ba4-4ff4bc9b0bde.json new file mode 120000 index 0000000..080c8da --- /dev/null +++ b/tods/common-primitives/pipelines/data_transformation.column_parser.DataFrameCommon/d2473bbc-7839-4deb-9ba4-4ff4bc9b0bde.json @@ -0,0 +1 @@ +../classification.light_gbm.DataFrameCommon/d2473bbc-7839-4deb-9ba4-4ff4bc9b0bde.json \ No newline at end of file diff --git a/tods/common-primitives/pipelines/data_transformation.construct_predictions.DataFrameCommon/4ec215d1-6484-4502-a6dd-f659943ccb94.json b/tods/common-primitives/pipelines/data_transformation.construct_predictions.DataFrameCommon/4ec215d1-6484-4502-a6dd-f659943ccb94.json new file mode 120000 index 0000000..0deae2e --- /dev/null +++ b/tods/common-primitives/pipelines/data_transformation.construct_predictions.DataFrameCommon/4ec215d1-6484-4502-a6dd-f659943ccb94.json @@ -0,0 +1 @@ +../data_transformation.extract_columns.Common/4ec215d1-6484-4502-a6dd-f659943ccb94.json \ No newline at end of file diff --git a/tods/common-primitives/pipelines/data_transformation.construct_predictions.DataFrameCommon/b523335c-0c47-4d02-a582-f69609cde1e8.json b/tods/common-primitives/pipelines/data_transformation.construct_predictions.DataFrameCommon/b523335c-0c47-4d02-a582-f69609cde1e8.json new file mode 120000 index 0000000..51266fd --- /dev/null +++ b/tods/common-primitives/pipelines/data_transformation.construct_predictions.DataFrameCommon/b523335c-0c47-4d02-a582-f69609cde1e8.json @@ -0,0 +1 @@ +../data_transformation.extract_columns_by_structural_types.Common/b523335c-0c47-4d02-a582-f69609cde1e8.json \ No newline at end of file diff --git a/tods/common-primitives/pipelines/data_transformation.construct_predictions.DataFrameCommon/d2473bbc-7839-4deb-9ba4-4ff4bc9b0bde.json b/tods/common-primitives/pipelines/data_transformation.construct_predictions.DataFrameCommon/d2473bbc-7839-4deb-9ba4-4ff4bc9b0bde.json new file mode 120000 index 0000000..080c8da --- /dev/null +++ b/tods/common-primitives/pipelines/data_transformation.construct_predictions.DataFrameCommon/d2473bbc-7839-4deb-9ba4-4ff4bc9b0bde.json @@ -0,0 +1 @@ +../classification.light_gbm.DataFrameCommon/d2473bbc-7839-4deb-9ba4-4ff4bc9b0bde.json \ No newline at end of file diff --git a/tods/common-primitives/pipelines/data_transformation.dataset_to_dataframe.Common/4ec215d1-6484-4502-a6dd-f659943ccb94.json b/tods/common-primitives/pipelines/data_transformation.dataset_to_dataframe.Common/4ec215d1-6484-4502-a6dd-f659943ccb94.json new file mode 120000 index 0000000..0deae2e --- /dev/null +++ b/tods/common-primitives/pipelines/data_transformation.dataset_to_dataframe.Common/4ec215d1-6484-4502-a6dd-f659943ccb94.json @@ -0,0 +1 @@ +../data_transformation.extract_columns.Common/4ec215d1-6484-4502-a6dd-f659943ccb94.json \ No newline at end of file diff --git a/tods/common-primitives/pipelines/data_transformation.dataset_to_dataframe.Common/a8c40699-c48d-4f12-aa18-639c5fb6baae.json b/tods/common-primitives/pipelines/data_transformation.dataset_to_dataframe.Common/a8c40699-c48d-4f12-aa18-639c5fb6baae.json new file mode 120000 index 0000000..b1225d9 --- /dev/null +++ b/tods/common-primitives/pipelines/data_transformation.dataset_to_dataframe.Common/a8c40699-c48d-4f12-aa18-639c5fb6baae.json @@ -0,0 +1 @@ +../data_transformation.grouping_field_compose.Common/a8c40699-c48d-4f12-aa18-639c5fb6baae.json 
\ No newline at end of file diff --git a/tods/common-primitives/pipelines/data_transformation.dataset_to_dataframe.Common/b523335c-0c47-4d02-a582-f69609cde1e8.json b/tods/common-primitives/pipelines/data_transformation.dataset_to_dataframe.Common/b523335c-0c47-4d02-a582-f69609cde1e8.json new file mode 120000 index 0000000..51266fd --- /dev/null +++ b/tods/common-primitives/pipelines/data_transformation.dataset_to_dataframe.Common/b523335c-0c47-4d02-a582-f69609cde1e8.json @@ -0,0 +1 @@ +../data_transformation.extract_columns_by_structural_types.Common/b523335c-0c47-4d02-a582-f69609cde1e8.json \ No newline at end of file diff --git a/tods/common-primitives/pipelines/data_transformation.dataset_to_dataframe.Common/d2473bbc-7839-4deb-9ba4-4ff4bc9b0bde.json b/tods/common-primitives/pipelines/data_transformation.dataset_to_dataframe.Common/d2473bbc-7839-4deb-9ba4-4ff4bc9b0bde.json new file mode 120000 index 0000000..080c8da --- /dev/null +++ b/tods/common-primitives/pipelines/data_transformation.dataset_to_dataframe.Common/d2473bbc-7839-4deb-9ba4-4ff4bc9b0bde.json @@ -0,0 +1 @@ +../classification.light_gbm.DataFrameCommon/d2473bbc-7839-4deb-9ba4-4ff4bc9b0bde.json \ No newline at end of file diff --git a/tods/common-primitives/pipelines/data_transformation.extract_columns.Common/4ec215d1-6484-4502-a6dd-f659943ccb94.json b/tods/common-primitives/pipelines/data_transformation.extract_columns.Common/4ec215d1-6484-4502-a6dd-f659943ccb94.json new file mode 100644 index 0000000..1217fd3 --- /dev/null +++ b/tods/common-primitives/pipelines/data_transformation.extract_columns.Common/4ec215d1-6484-4502-a6dd-f659943ccb94.json @@ -0,0 +1 @@ +{"id": "4ec215d1-6484-4502-a6dd-f659943ccb94", "schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", "created": "2020-01-15T17:49:59.327063Z", "inputs": [{"name": "inputs"}], "outputs": [{"data": "steps.7.produce", "name": "output predictions"}], "steps": [{"type": "PRIMITIVE", "primitive": {"id": "4b42ce1e-9b98-4a25-b68e-fad13311eb65", "version": "0.3.0", "python_path": "d3m.primitives.data_transformation.dataset_to_dataframe.Common", "name": "Extract a DataFrame from a Dataset", "digest": "a1a0109be87a6ae578fd20e9d46c70c806059076c041b80b6314e7e41cf62d82"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "inputs.0"}}, "outputs": [{"id": "produce"}]}, {"type": "PRIMITIVE", "primitive": {"id": "e193afa1-b45e-4d29-918f-5bb1fa3b88a7", "version": "0.2.0", "python_path": "d3m.primitives.schema_discovery.profiler.Common", "name": "Determine missing semantic types for columns automatically", "digest": "a3d51cbc0bf18168114c1c8f12c497d691dbe30b71667f355f30c13a9a08ba32"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.0.produce"}}, "outputs": [{"id": "produce"}]}, {"type": "PRIMITIVE", "primitive": {"id": "d510cb7a-1782-4f51-b44c-58f0236e47c7", "version": "0.6.0", "python_path": "d3m.primitives.data_transformation.column_parser.Common", "name": "Parses strings into their types", "digest": "b020e14e3d4f1e4266aa8a0680d83afcf2862300549c6f6c903742d7d171f879"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.1.produce"}}, "outputs": [{"id": "produce"}]}, {"type": "PRIMITIVE", "primitive": {"id": "81d7e261-e25b-4721-b091-a31cd46e99ae", "version": "0.1.0", "python_path": "d3m.primitives.data_transformation.extract_columns.Common", "name": "Extracts columns", "digest": "7b9ba98e3b7b9d1d8e17547249c7a25cd8d58ec60d957217f772753e37526145"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.2.produce"}}, "outputs": [{"id": 
"produce"}], "hyperparams": {"columns": {"type": "VALUE", "data": [25]}}}, {"type": "PRIMITIVE", "primitive": {"id": "81d7e261-e25b-4721-b091-a31cd46e99ae", "version": "0.1.0", "python_path": "d3m.primitives.data_transformation.extract_columns.Common", "name": "Extracts columns", "digest": "7b9ba98e3b7b9d1d8e17547249c7a25cd8d58ec60d957217f772753e37526145"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.2.produce"}}, "outputs": [{"id": "produce"}], "hyperparams": {"columns": {"type": "VALUE", "data": [6]}}}, {"type": "PRIMITIVE", "primitive": {"id": "09f252eb-215d-4e0b-9a60-fcd967f5e708", "version": "0.2.0", "python_path": "d3m.primitives.data_transformation.encoder.DistilTextEncoder", "name": "Text encoder", "digest": "e468d66d1eda057a61b2c79ecf5288f137778f47dac9eabdc60707a4941532a3"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.3.produce"}, "outputs": {"type": "CONTAINER", "data": "steps.4.produce"}}, "outputs": [{"id": "produce"}], "hyperparams": {"encoder_type": {"type": "VALUE", "data": "tfidf"}}}, {"type": "PRIMITIVE", "primitive": {"id": "e0ad06ce-b484-46b0-a478-c567e1ea7e02", "version": "0.2.0", "python_path": "d3m.primitives.learner.random_forest.DistilEnsembleForest", "name": "EnsembleForest", "digest": "4ba7a354b15ea626bf96aa771a2a3cba034ad5d0a8ccdbbf68bce2d828db1b4d"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.5.produce"}, "outputs": {"type": "CONTAINER", "data": "steps.4.produce"}}, "outputs": [{"id": "produce"}]}, {"type": "PRIMITIVE", "primitive": {"id": "8d38b340-f83f-4877-baaa-162f8e551736", "version": "0.3.0", "python_path": "d3m.primitives.data_transformation.construct_predictions.Common", "name": "Construct pipeline predictions output", "digest": "674a644333a3a481769591341591461b06de566fef7439010284739194e18af8"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.6.produce"}, "reference": {"type": "CONTAINER", "data": "steps.0.produce"}}, "outputs": [{"id": "produce"}]}], "digest": "a26edc0cc9bcf9121189186d621ff1b4cebb2afc76b6ef171d7d8194e55cf475"} \ No newline at end of file diff --git a/tods/common-primitives/pipelines/data_transformation.extract_columns.Common/pipeline.py b/tods/common-primitives/pipelines/data_transformation.extract_columns.Common/pipeline.py new file mode 100644 index 0000000..e307251 --- /dev/null +++ b/tods/common-primitives/pipelines/data_transformation.extract_columns.Common/pipeline.py @@ -0,0 +1,71 @@ +from d3m import index +from d3m.metadata.base import ArgumentType, Context +from d3m.metadata.pipeline import Pipeline, PrimitiveStep + +# Creating pipeline +pipeline_description = Pipeline() +pipeline_description.add_input(name='inputs') + +# Step 0: dataset_to_dataframe +step_0 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.dataset_to_dataframe.Common')) +step_0.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='inputs.0') +step_0.add_output('produce') +pipeline_description.add_step(step_0) + +# Step 1: Simple profiler primitive +step_1 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.schema_discovery.profiler.Common')) +step_1.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce') +step_1.add_output('produce') +pipeline_description.add_step(step_1) + +# Step 2: column_parser +step_2 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.column_parser.Common')) +step_2.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, 
data_reference='steps.1.produce') +step_2.add_output('produce') +pipeline_description.add_step(step_2) + +# Step 3: Extract text column explicitly +step_3 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.extract_columns.Common')) +step_3.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.2.produce') +step_3.add_hyperparameter(name='columns', argument_type=ArgumentType.VALUE, data = [25]) +step_3.add_output('produce') +pipeline_description.add_step(step_3) + +# Step 4: Extract target column explicitly +step_4 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.extract_columns.Common')) +step_4.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.2.produce') +step_4.add_hyperparameter(name='columns', argument_type=ArgumentType.VALUE, data = [6]) +step_4.add_output('produce') +pipeline_description.add_step(step_4) + +# Step 5: encode text column +step_5 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.encoder.DistilTextEncoder')) +step_5.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.3.produce') +step_5.add_argument(name='outputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.4.produce') +step_5.add_hyperparameter(name='encoder_type', argument_type=ArgumentType.VALUE, data = 'tfidf') +step_5.add_output('produce') +pipeline_description.add_step(step_5) + +# Step 6: classifier +step_6 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.learner.random_forest.DistilEnsembleForest')) +step_6.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.5.produce') +step_6.add_argument(name='outputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.4.produce') +step_6.add_output('produce') +pipeline_description.add_step(step_6) + +# Step 7: construct output +step_7 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.construct_predictions.Common')) +step_7.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.6.produce') +step_7.add_argument(name='reference', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce') +step_7.add_output('produce') +pipeline_description.add_step(step_7) + +# Final Output +pipeline_description.add_output(name='output predictions', data_reference='steps.7.produce') + +# Output json pipeline +blob = pipeline_description.to_json() +filename = blob[8:44] + '.json' +with open(filename, 'w') as outfile: + outfile.write(blob) + diff --git a/tods/common-primitives/pipelines/data_transformation.extract_columns_by_semantic_types.DataFrameCommon/b523335c-0c47-4d02-a582-f69609cde1e8.json b/tods/common-primitives/pipelines/data_transformation.extract_columns_by_semantic_types.DataFrameCommon/b523335c-0c47-4d02-a582-f69609cde1e8.json new file mode 120000 index 0000000..51266fd --- /dev/null +++ b/tods/common-primitives/pipelines/data_transformation.extract_columns_by_semantic_types.DataFrameCommon/b523335c-0c47-4d02-a582-f69609cde1e8.json @@ -0,0 +1 @@ +../data_transformation.extract_columns_by_structural_types.Common/b523335c-0c47-4d02-a582-f69609cde1e8.json \ No newline at end of file diff --git a/tods/common-primitives/pipelines/data_transformation.extract_columns_by_semantic_types.DataFrameCommon/d2473bbc-7839-4deb-9ba4-4ff4bc9b0bde.json 
b/tods/common-primitives/pipelines/data_transformation.extract_columns_by_semantic_types.DataFrameCommon/d2473bbc-7839-4deb-9ba4-4ff4bc9b0bde.json new file mode 120000 index 0000000..080c8da --- /dev/null +++ b/tods/common-primitives/pipelines/data_transformation.extract_columns_by_semantic_types.DataFrameCommon/d2473bbc-7839-4deb-9ba4-4ff4bc9b0bde.json @@ -0,0 +1 @@ +../classification.light_gbm.DataFrameCommon/d2473bbc-7839-4deb-9ba4-4ff4bc9b0bde.json \ No newline at end of file diff --git a/tods/common-primitives/pipelines/data_transformation.extract_columns_by_structural_types.Common/b523335c-0c47-4d02-a582-f69609cde1e8.json b/tods/common-primitives/pipelines/data_transformation.extract_columns_by_structural_types.Common/b523335c-0c47-4d02-a582-f69609cde1e8.json new file mode 100644 index 0000000..ca4500d --- /dev/null +++ b/tods/common-primitives/pipelines/data_transformation.extract_columns_by_structural_types.Common/b523335c-0c47-4d02-a582-f69609cde1e8.json @@ -0,0 +1 @@ +{"id": "b523335c-0c47-4d02-a582-f69609cde1e8", "schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", "created": "2020-01-15T19:51:17.782254Z", "inputs": [{"name": "inputs"}], "outputs": [{"data": "steps.9.produce", "name": "output predictions"}], "steps": [{"type": "PRIMITIVE", "primitive": {"id": "4b42ce1e-9b98-4a25-b68e-fad13311eb65", "version": "0.3.0", "python_path": "d3m.primitives.data_transformation.dataset_to_dataframe.Common", "name": "Extract a DataFrame from a Dataset", "digest": "a1a0109be87a6ae578fd20e9d46c70c806059076c041b80b6314e7e41cf62d82"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "inputs.0"}}, "outputs": [{"id": "produce"}]}, {"type": "PRIMITIVE", "primitive": {"id": "e193afa1-b45e-4d29-918f-5bb1fa3b88a7", "version": "0.2.0", "python_path": "d3m.primitives.schema_discovery.profiler.Common", "name": "Determine missing semantic types for columns automatically", "digest": "a3d51cbc0bf18168114c1c8f12c497d691dbe30b71667f355f30c13a9a08ba32"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.0.produce"}}, "outputs": [{"id": "produce"}]}, {"type": "PRIMITIVE", "primitive": {"id": "79674d68-9b93-4359-b385-7b5f60645b06", "version": "0.1.0", "python_path": "d3m.primitives.data_transformation.extract_columns_by_structural_types.Common", "name": "Extracts columns by structural type", "digest": "7805010b9581bb96c035fefa5943209c69a1e234f10d9057d487af42c0fd4830"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.1.produce"}}, "outputs": [{"id": "produce"}]}, {"type": "PRIMITIVE", "primitive": {"id": "d510cb7a-1782-4f51-b44c-58f0236e47c7", "version": "0.6.0", "python_path": "d3m.primitives.data_transformation.column_parser.Common", "name": "Parses strings into their types", "digest": "b020e14e3d4f1e4266aa8a0680d83afcf2862300549c6f6c903742d7d171f879"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.2.produce"}}, "outputs": [{"id": "produce"}]}, {"type": "PRIMITIVE", "primitive": {"id": "f6315ca9-ca39-4e13-91ba-1964ee27281c", "version": "0.1.0", "python_path": "d3m.primitives.data_preprocessing.one_hot_encoder.PandasCommon", "name": "Pandas one hot encoder", "digest": "ed1217d4d7c017d8239b4f958c8e6ca0b3b67966ccb50cc5c578a9f14e465ec0"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.3.produce"}}, "outputs": [{"id": "produce"}], "hyperparams": {"use_columns": {"type": "VALUE", "data": [2, 5]}}}, {"type": "PRIMITIVE", "primitive": {"id": "3b09ba74-cc90-4f22-9e0a-0cf4f29a7e28", "version": "0.1.0", "python_path": 
"d3m.primitives.data_transformation.remove_columns.Common", "name": "Removes columns", "digest": "a725d149595186b85f1dea2bacbf4b853712b6a50eddb7c4c2295fabc3a04df1"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.4.produce"}}, "outputs": [{"id": "produce"}], "hyperparams": {"columns": {"type": "VALUE", "data": [25]}}}, {"type": "PRIMITIVE", "primitive": {"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1", "version": "0.3.0", "python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common", "name": "Extracts columns by semantic type", "digest": "505df38f9be4964ff19683ab3e185f19333fb35c26121c12a1c55bddd9d38f72"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.5.produce"}}, "outputs": [{"id": "produce"}], "hyperparams": {"semantic_types": {"type": "VALUE", "data": ["https://metadata.datadrivendiscovery.org/types/Attribute"]}}}, {"type": "PRIMITIVE", "primitive": {"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1", "version": "0.3.0", "python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common", "name": "Extracts columns by semantic type", "digest": "505df38f9be4964ff19683ab3e185f19333fb35c26121c12a1c55bddd9d38f72"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.5.produce"}}, "outputs": [{"id": "produce"}], "hyperparams": {"semantic_types": {"type": "VALUE", "data": ["https://metadata.datadrivendiscovery.org/types/Target"]}}}, {"type": "PRIMITIVE", "primitive": {"id": "37c2b19d-bdab-4a30-ba08-6be49edcc6af", "version": "0.4.0", "python_path": "d3m.primitives.classification.random_forest.Common", "name": "Random forest classifier", "digest": "f5f702fc561775a6064c64c008a519f605eb00ca80f59a5d5e39b1340c7c015e"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.6.produce"}, "outputs": {"type": "CONTAINER", "data": "steps.7.produce"}}, "outputs": [{"id": "produce"}]}, {"type": "PRIMITIVE", "primitive": {"id": "8d38b340-f83f-4877-baaa-162f8e551736", "version": "0.3.0", "python_path": "d3m.primitives.data_transformation.construct_predictions.Common", "name": "Construct pipeline predictions output", "digest": "674a644333a3a481769591341591461b06de566fef7439010284739194e18af8"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.8.produce"}, "reference": {"type": "CONTAINER", "data": "steps.0.produce"}}, "outputs": [{"id": "produce"}]}], "digest": "7929f79fa8e2aaddcbe66d0f592525081280549e0713198e583728ff88b0f895"} \ No newline at end of file diff --git a/tods/common-primitives/pipelines/data_transformation.extract_columns_by_structural_types.Common/pipeline.py b/tods/common-primitives/pipelines/data_transformation.extract_columns_by_structural_types.Common/pipeline.py new file mode 100644 index 0000000..ae876cd --- /dev/null +++ b/tods/common-primitives/pipelines/data_transformation.extract_columns_by_structural_types.Common/pipeline.py @@ -0,0 +1,83 @@ +from d3m import index +from d3m.metadata.base import ArgumentType, Context +from d3m.metadata.pipeline import Pipeline, PrimitiveStep + +# Creating pipeline +pipeline_description = Pipeline() +pipeline_description.add_input(name='inputs') + +# Step 0: dataset_to_dataframe +step_0 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.dataset_to_dataframe.Common')) +step_0.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='inputs.0') +step_0.add_output('produce') +pipeline_description.add_step(step_0) + +# Step 1: Simple profiler primitive +step_1 = 
PrimitiveStep(primitive=index.get_primitive('d3m.primitives.schema_discovery.profiler.Common')) +step_1.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce') +step_1.add_output('produce') +pipeline_description.add_step(step_1) + +# Step 2: Extract columns by structural type +step_2 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.extract_columns_by_structural_types.Common')) +step_2.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce') +step_2.add_output('produce') +pipeline_description.add_step(step_2) + +# Step 3: column_parser +step_3 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.column_parser.Common')) +step_3.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.2.produce') +step_3.add_output('produce') +pipeline_description.add_step(step_3) + +# Step 4 one hot encode +step_4 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_preprocessing.one_hot_encoder.PandasCommon')) +step_4.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.3.produce') +step_4.add_hyperparameter(name='use_columns', argument_type=ArgumentType.VALUE, data = [2,5]) +step_4.add_output('produce') +pipeline_description.add_step(step_4) + +# Step 5 remove text +step_5 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.remove_columns.Common')) +step_5.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.4.produce') +step_5.add_hyperparameter(name='columns', argument_type=ArgumentType.VALUE, data = [25]) +step_5.add_output('produce') +pipeline_description.add_step(step_5) + +# Step 6 extract attributes +step_6 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common')) +step_6.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.5.produce') +step_6.add_hyperparameter(name="semantic_types", argument_type=ArgumentType.VALUE, data=["https://metadata.datadrivendiscovery.org/types/Attribute"],) +step_6.add_output('produce') +pipeline_description.add_step(step_6) + +# Step 7 extract target +step_7 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common')) +step_7.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.5.produce') +step_7.add_hyperparameter(name="semantic_types", argument_type=ArgumentType.VALUE, data=["https://metadata.datadrivendiscovery.org/types/Target"],) +step_7.add_output('produce') +pipeline_description.add_step(step_7) + +# Step 8: classifier +step_8 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.classification.random_forest.Common')) +step_8.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.6.produce') +step_8.add_argument(name='outputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.7.produce') +step_8.add_output('produce') +pipeline_description.add_step(step_8) + +# Step 9: construct output +step_9 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.construct_predictions.Common')) +step_9.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.8.produce') +step_9.add_argument(name='reference', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce') 
+step_9.add_output('produce') +pipeline_description.add_step(step_9) + +# Final Output +pipeline_description.add_output(name='output predictions', data_reference='steps.9.produce') + +# Output json pipeline +blob = pipeline_description.to_json() +filename = blob[8:44] + '.json' +with open(filename, 'w') as outfile: + outfile.write(blob) + diff --git a/tods/common-primitives/pipelines/data_transformation.grouping_field_compose.Common/a8c40699-c48d-4f12-aa18-639c5fb6baae.json b/tods/common-primitives/pipelines/data_transformation.grouping_field_compose.Common/a8c40699-c48d-4f12-aa18-639c5fb6baae.json new file mode 100644 index 0000000..dbaf998 --- /dev/null +++ b/tods/common-primitives/pipelines/data_transformation.grouping_field_compose.Common/a8c40699-c48d-4f12-aa18-639c5fb6baae.json @@ -0,0 +1 @@ +{"id": "a8c40699-c48d-4f12-aa18-639c5fb6baae", "schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", "created": "2020-01-15T19:35:58.976691Z", "inputs": [{"name": "inputs"}], "outputs": [{"data": "steps.4.produce", "name": "output predictions"}], "steps": [{"type": "PRIMITIVE", "primitive": {"id": "4b42ce1e-9b98-4a25-b68e-fad13311eb65", "version": "0.3.0", "python_path": "d3m.primitives.data_transformation.dataset_to_dataframe.Common", "name": "Extract a DataFrame from a Dataset", "digest": "a1a0109be87a6ae578fd20e9d46c70c806059076c041b80b6314e7e41cf62d82"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "inputs.0"}}, "outputs": [{"id": "produce"}]}, {"type": "PRIMITIVE", "primitive": {"id": "e193afa1-b45e-4d29-918f-5bb1fa3b88a7", "version": "0.2.0", "python_path": "d3m.primitives.schema_discovery.profiler.Common", "name": "Determine missing semantic types for columns automatically", "digest": "a3d51cbc0bf18168114c1c8f12c497d691dbe30b71667f355f30c13a9a08ba32"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.0.produce"}}, "outputs": [{"id": "produce"}]}, {"type": "PRIMITIVE", "primitive": {"id": "d510cb7a-1782-4f51-b44c-58f0236e47c7", "version": "0.6.0", "python_path": "d3m.primitives.data_transformation.column_parser.Common", "name": "Parses strings into their types", "digest": "b020e14e3d4f1e4266aa8a0680d83afcf2862300549c6f6c903742d7d171f879"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.1.produce"}}, "outputs": [{"id": "produce"}], "hyperparams": {"parse_semantic_types": {"type": "VALUE", "data": ["http://schema.org/Boolean", "http://schema.org/Integer", "http://schema.org/Float", "https://metadata.datadrivendiscovery.org/types/FloatVector", "http://schema.org/DateTime"]}}}, {"type": "PRIMITIVE", "primitive": {"id": "59db88b9-dd81-4e50-8f43-8f2af959560b", "version": "0.1.0", "python_path": "d3m.primitives.data_transformation.grouping_field_compose.Common", "name": "Grouping Field Compose", "digest": "e93815bfdb1c82ce0e2fa61f092d6ee9bcf39367a27072accbb9f0dd9189fb03"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.2.produce"}}, "outputs": [{"id": "produce"}]}, {"type": "PRIMITIVE", "primitive": {"id": "76b5a479-c209-4d94-92b5-7eba7a4d4499", "version": "1.0.2", "python_path": "d3m.primitives.time_series_forecasting.vector_autoregression.VAR", "name": "VAR", "digest": "7e22a1e7fe228114a5788f16a8d3c7709ed3a98a90e9cc82e3b80ab5f232d352"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.3.produce"}, "outputs": {"type": "CONTAINER", "data": "steps.3.produce"}}, "outputs": [{"id": "produce"}]}], "digest": "da2c7d2605256f263ca4725fe7385be5e027a3ddadc8dbf7523ff98bcd016005"} \ No newline at end of file diff --git 
a/tods/common-primitives/pipelines/data_transformation.grouping_field_compose.Common/pipeline.py b/tods/common-primitives/pipelines/data_transformation.grouping_field_compose.Common/pipeline.py new file mode 100644 index 0000000..9a9ebb1 --- /dev/null +++ b/tods/common-primitives/pipelines/data_transformation.grouping_field_compose.Common/pipeline.py @@ -0,0 +1,100 @@ +from d3m import index +from d3m.metadata.base import ArgumentType +from d3m.metadata.pipeline import Pipeline, PrimitiveStep + +# Creating pipeline +pipeline_description = Pipeline() +pipeline_description.add_input(name="inputs") + +# Step 0: DS to DF on input DS +step_0 = PrimitiveStep( + primitive=index.get_primitive( + "d3m.primitives.data_transformation.dataset_to_dataframe.Common" + ) +) +step_0.add_argument( + name="inputs", argument_type=ArgumentType.CONTAINER, data_reference="inputs.0" +) +step_0.add_output("produce") +pipeline_description.add_step(step_0) + +# Step 1: Simple Profiler Column Role Annotation +step_1 = PrimitiveStep( + primitive=index.get_primitive("d3m.primitives.schema_discovery.profiler.Common") +) +step_1.add_argument( + name="inputs", + argument_type=ArgumentType.CONTAINER, + data_reference="steps.0.produce", +) +step_1.add_output("produce") +pipeline_description.add_step(step_1) + +# Step 2: column parser on input DF +step_2 = PrimitiveStep( + primitive=index.get_primitive( + "d3m.primitives.data_transformation.column_parser.Common" + ) +) +step_2.add_argument( + name="inputs", + argument_type=ArgumentType.CONTAINER, + data_reference="steps.1.produce", +) +step_2.add_output("produce") +step_2.add_hyperparameter( + name="parse_semantic_types", + argument_type=ArgumentType.VALUE, + data=[ + "http://schema.org/Boolean", + "http://schema.org/Integer", + "http://schema.org/Float", + "https://metadata.datadrivendiscovery.org/types/FloatVector", + "http://schema.org/DateTime", + ], +) +pipeline_description.add_step(step_2) + +# Step 3: Grouping Field Compose +step_3 = PrimitiveStep( + primitive=index.get_primitive( + "d3m.primitives.data_transformation.grouping_field_compose.Common" + ) +) +step_3.add_argument( + name="inputs", + argument_type=ArgumentType.CONTAINER, + data_reference="steps.2.produce", +) +step_3.add_output("produce") +pipeline_description.add_step(step_3) + +# Step 4: forecasting primitive +step_4 = PrimitiveStep( + primitive=index.get_primitive( + "d3m.primitives.time_series_forecasting.vector_autoregression.VAR" + ) +) +step_4.add_argument( + name="inputs", + argument_type=ArgumentType.CONTAINER, + data_reference="steps.3.produce", +) +step_4.add_argument( + name="outputs", + argument_type=ArgumentType.CONTAINER, + data_reference="steps.3.produce", +) +step_4.add_output("produce") +pipeline_description.add_step(step_4) + +# Final Output +pipeline_description.add_output( + name="output predictions", data_reference="steps.4.produce" +) + +# Output json pipeline +blob = pipeline_description.to_json() +filename = blob[8:44] + ".json" +with open(filename, "w") as outfile: + outfile.write(blob) diff --git a/tods/common-primitives/pipelines/data_transformation.horizontal_concat.DataFrameConcat/2b307634-f01e-412e-8d95-7e54afd4731f.json b/tods/common-primitives/pipelines/data_transformation.horizontal_concat.DataFrameConcat/2b307634-f01e-412e-8d95-7e54afd4731f.json new file mode 120000 index 0000000..146d403 --- /dev/null +++ b/tods/common-primitives/pipelines/data_transformation.horizontal_concat.DataFrameConcat/2b307634-f01e-412e-8d95-7e54afd4731f.json @@ -0,0 +1 @@ 
+../data_preprocessing.one_hot_encoder.MakerCommon/2b307634-f01e-412e-8d95-7e54afd4731f.json \ No newline at end of file diff --git a/tods/common-primitives/pipelines/data_transformation.remove_columns.Common/b523335c-0c47-4d02-a582-f69609cde1e8.json b/tods/common-primitives/pipelines/data_transformation.remove_columns.Common/b523335c-0c47-4d02-a582-f69609cde1e8.json new file mode 120000 index 0000000..51266fd --- /dev/null +++ b/tods/common-primitives/pipelines/data_transformation.remove_columns.Common/b523335c-0c47-4d02-a582-f69609cde1e8.json @@ -0,0 +1 @@ +../data_transformation.extract_columns_by_structural_types.Common/b523335c-0c47-4d02-a582-f69609cde1e8.json \ No newline at end of file diff --git a/tods/common-primitives/pipelines/data_transformation.rename_duplicate_name.DataFrameCommon/11ee9290-992d-4e48-97ed-1a6e4c15f92f.json b/tods/common-primitives/pipelines/data_transformation.rename_duplicate_name.DataFrameCommon/11ee9290-992d-4e48-97ed-1a6e4c15f92f.json new file mode 100644 index 0000000..8ea69cd --- /dev/null +++ b/tods/common-primitives/pipelines/data_transformation.rename_duplicate_name.DataFrameCommon/11ee9290-992d-4e48-97ed-1a6e4c15f92f.json @@ -0,0 +1,272 @@ +{ + "context": "TESTING", + "created": "2019-02-12T02:01:52.663008Z", + "id": "11ee9290-992d-4e48-97ed-1a6e4c15f92f", + "inputs": [ + { + "name": "inputs" + } + ], + "outputs": [ + { + "data": "steps.8.produce", + "name": "output predictions" + } + ], + "schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", + "steps": [ + { + "arguments": { + "inputs": { + "data": "inputs.0", + "type": "CONTAINER" + } + }, + "outputs": [ + { + "id": "produce" + } + ], + "primitive": { + "id": "4b42ce1e-9b98-4a25-b68e-fad13311eb65", + "name": "Extract a DataFrame from a Dataset", + "python_path": "d3m.primitives.data_transformation.dataset_to_dataframe.Common", + "version": "0.3.0" + }, + "type": "PRIMITIVE" + }, + { + "arguments": { + "inputs": { + "data": "steps.0.produce", + "type": "CONTAINER" + } + }, + "hyperparams": { + "parse_semantic_types": { + "data": [ + "http://schema.org/Boolean", + "http://schema.org/Integer", + "http://schema.org/Float", + "https://metadata.datadrivendiscovery.org/types/FloatVector", + "http://schema.org/DateTime" + ], + "type": "VALUE" + } + }, + "outputs": [ + { + "id": "produce" + } + ], + "primitive": { + "id": "d510cb7a-1782-4f51-b44c-58f0236e47c7", + "name": "Parses strings into their types", + "python_path": "d3m.primitives.data_transformation.column_parser.Common", + "version": "0.6.0" + }, + "type": "PRIMITIVE" + }, + { + "arguments": { + "inputs": { + "data": "steps.1.produce", + "type": "CONTAINER" + } + }, + "hyperparams": { + "separator": { + "data": "----", + "type": "VALUE" + } + }, + "outputs": [ + { + "id": "produce" + } + ], + "primitive": { + "id": "7b067a78-4ad4-411d-9cf9-87bcee38ac73", + "name": "Rename all the duplicated name column in DataFrame", + "python_path": "d3m.primitives.data_transformation.rename_duplicate_name.DataFrameCommon", + "version": "0.2.0" + }, + "type": "PRIMITIVE" + }, + { + "arguments": { + "inputs": { + "data": "steps.2.produce", + "type": "CONTAINER" + } + }, + "hyperparams": { + "semantic_types": { + "data": [ + "https://metadata.datadrivendiscovery.org/types/CategoricalData" + ], + "type": "VALUE" + } + }, + "outputs": [ + { + "id": "produce" + } + ], + "primitive": { + "id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1", + "name": "Extracts columns by semantic type", + "python_path": 
"d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common", + "version": "0.3.0" + }, + "type": "PRIMITIVE" + }, + { + "arguments": { + "inputs": { + "data": "steps.2.produce", + "type": "CONTAINER" + } + }, + "hyperparams": { + "exclude_columns": { + "data": [ + 0 + ], + "type": "VALUE" + }, + "semantic_types": { + "data": [ + "http://schema.org/Integer", + "http://schema.org/Float" + ], + "type": "VALUE" + } + }, + "outputs": [ + { + "id": "produce" + } + ], + "primitive": { + "id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1", + "name": "Extracts columns by semantic type", + "python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common", + "version": "0.3.0" + }, + "type": "PRIMITIVE" + }, + { + "arguments": { + "inputs": { + "data": "steps.0.produce", + "type": "CONTAINER" + } + }, + "hyperparams": { + "semantic_types": { + "data": [ + "https://metadata.datadrivendiscovery.org/types/TrueTarget" + ], + "type": "VALUE" + } + }, + "outputs": [ + { + "id": "produce" + } + ], + "primitive": { + "id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1", + "name": "Extracts columns by semantic type", + "python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common", + "version": "0.3.0" + }, + "type": "PRIMITIVE" + }, + { + "arguments": { + "inputs": { + "data": "steps.4.produce", + "type": "CONTAINER" + } + }, + "hyperparams": { + "return_result": { + "data": "replace", + "type": "VALUE" + }, + "use_semantic_types": { + "data": true, + "type": "VALUE" + } + }, + "outputs": [ + { + "id": "produce" + } + ], + "primitive": { + "id": "d016df89-de62-3c53-87ed-c06bb6a23cde", + "name": "sklearn.impute.SimpleImputer", + "python_path": "d3m.primitives.data_cleaning.imputer.SKlearn", + "version": "2019.6.7" + }, + "type": "PRIMITIVE" + }, + { + "arguments": { + "inputs": { + "data": "steps.6.produce", + "type": "CONTAINER" + }, + "outputs": { + "data": "steps.5.produce", + "type": "CONTAINER" + } + }, + "hyperparams": { + "return_result": { + "data": "replace", + "type": "VALUE" + } + }, + "outputs": [ + { + "id": "produce" + } + ], + "primitive": { + "id": "1dd82833-5692-39cb-84fb-2455683075f3", + "name": "sklearn.ensemble.forest.RandomForestClassifier", + "python_path": "d3m.primitives.classification.random_forest.SKlearn", + "version": "2019.6.7" + }, + "type": "PRIMITIVE" + }, + { + "arguments": { + "inputs": { + "data": "steps.7.produce", + "type": "CONTAINER" + }, + "reference": { + "data": "steps.1.produce", + "type": "CONTAINER" + } + }, + "outputs": [ + { + "id": "produce" + } + ], + "primitive": { + "id": "8d38b340-f83f-4877-baaa-162f8e551736", + "name": "Construct pipeline predictions output", + "python_path": "d3m.primitives.data_transformation.construct_predictions.Common", + "version": "0.3.0" + }, + "type": "PRIMITIVE" + } + ] +} diff --git a/tods/common-primitives/pipelines/evaluation.kfold_timeseries_split.Common/k-fold-timeseries-split.yml b/tods/common-primitives/pipelines/evaluation.kfold_timeseries_split.Common/k-fold-timeseries-split.yml new file mode 100644 index 0000000..88e99d6 --- /dev/null +++ b/tods/common-primitives/pipelines/evaluation.kfold_timeseries_split.Common/k-fold-timeseries-split.yml @@ -0,0 +1,83 @@ +id: 5bed1f23-ac17-4b52-9d06-a5b77a6aea51 +schema: https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json +source: + name: Jeffrey Gleason +created: "2019-04-08T16:18:27.250294Z" +context: TESTING +name: K-fold split of timeseries datasets +description: | + K-fold split of timeseries datasets for 
cross-validation. +inputs: + - name: folds + - name: full dataset +outputs: + - name: train datasets + data: steps.0.produce + - name: test datasets + data: steps.2.produce + - name: score datasets + data: steps.1.produce +steps: + # Step 0. + - type: PRIMITIVE + primitive: + id: 002f9ad1-46e3-40f4-89ed-eeffbb3a102b + version: 0.1.0 + python_path: d3m.primitives.evaluation.kfold_time_series_split.Common + name: K-fold cross-validation timeseries dataset splits + arguments: + inputs: + type: CONTAINER + data: inputs.0 + dataset: + type: CONTAINER + data: inputs.1 + outputs: + - id: produce + - id: produce_score_data + # Step 1. We redact privileged attributes for both score and test splits. + - type: PRIMITIVE + primitive: + id: 744c4090-e2f6-489e-8efc-8b1e051bfad6 + version: 0.2.0 + python_path: d3m.primitives.evaluation.redact_columns.Common + name: Redact columns for evaluation + arguments: + inputs: + type: CONTAINER + data: steps.0.produce_score_data + outputs: + - id: produce + hyperparams: + semantic_types: + type: VALUE + data: + - https://metadata.datadrivendiscovery.org/types/PrivilegedData + add_semantic_types: + type: VALUE + data: + - https://metadata.datadrivendiscovery.org/types/RedactedPrivilegedData + - https://metadata.datadrivendiscovery.org/types/MissingData + # Step 2. We further redact targets in test split. + - type: PRIMITIVE + primitive: + id: 744c4090-e2f6-489e-8efc-8b1e051bfad6 + version: 0.2.0 + python_path: d3m.primitives.evaluation.redact_columns.Common + name: Redact columns for evaluation + arguments: + inputs: + type: CONTAINER + data: steps.1.produce + outputs: + - id: produce + hyperparams: + semantic_types: + type: VALUE + data: + - https://metadata.datadrivendiscovery.org/types/TrueTarget + add_semantic_types: + type: VALUE + data: + - https://metadata.datadrivendiscovery.org/types/RedactedTarget + - https://metadata.datadrivendiscovery.org/types/MissingData diff --git a/tods/common-primitives/pipelines/operator.dataset_map.DataFrameCommon/k-fold-timeseries-split-raw.yml b/tods/common-primitives/pipelines/operator.dataset_map.DataFrameCommon/k-fold-timeseries-split-raw.yml new file mode 100644 index 0000000..ea0047c --- /dev/null +++ b/tods/common-primitives/pipelines/operator.dataset_map.DataFrameCommon/k-fold-timeseries-split-raw.yml @@ -0,0 +1,108 @@ +# todo change name +id: 5bed1f23-ac17-4b52-9d06-a5b77a6aea51 +schema: https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json +source: + name: Jeffrey Gleason +created: "2019-12-19T16:29:34.702933Z" +context: TESTING +name: K-fold split of timeseries datasets +description: | + K-fold split of timeseries datasets for cross-validation. +inputs: + - name: folds + - name: full dataset +outputs: + - name: train datasets + data: steps.2.produce + - name: test datasets + data: steps.4.produce + - name: score datasets + data: steps.3.produce +steps: + # Step 0. Simon Data Typing primitive to infer DateTime column + - type: PRIMITIVE + primitive: + id: d2fa8df2-6517-3c26-bafc-87b701c4043a + version: 1.2.2 + python_path: d3m.primitives.data_cleaning.column_type_profiler.Simon + name: simon + # Step 1. 
Mapped Simon Data Typing primitive to infer DateTime column + - type: PRIMITIVE + primitive: + id: 5bef5738-1638-48d6-9935-72445f0eecdc + version: 0.1.0 + python_path: d3m.primitives.operator.dataset_map.DataFrameCommon + name: Map DataFrame resources to new resources using provided primitive + arguments: + inputs: + type: CONTAINER + data: inputs.1 + outputs: + - id: produce + hyperparams: + primitive: + type: PRIMITIVE + data: 0 + # Step 2. K-fold cross-validation timeseries dataset splits + - type: PRIMITIVE + primitive: + id: 002f9ad1-46e3-40f4-89ed-eeffbb3a102b + version: 0.1.0 + python_path: d3m.primitives.evaluation.kfold_time_series_split.Common + name: K-fold cross-validation timeseries dataset splits + arguments: + inputs: + type: CONTAINER + data: inputs.0 + dataset: + type: CONTAINER + data: steps.1.produce + outputs: + - id: produce + - id: produce_score_data + # Step 3. We redact privileged attributes for both score and test splits. + - type: PRIMITIVE + primitive: + id: 744c4090-e2f6-489e-8efc-8b1e051bfad6 + version: 0.2.0 + python_path: d3m.primitives.evaluation.redact_columns.Common + name: Redact columns for evaluation + arguments: + inputs: + type: CONTAINER + data: steps.2.produce_score_data + outputs: + - id: produce + hyperparams: + semantic_types: + type: VALUE + data: + - https://metadata.datadrivendiscovery.org/types/PrivilegedData + add_semantic_types: + type: VALUE + data: + - https://metadata.datadrivendiscovery.org/types/RedactedPrivilegedData + - https://metadata.datadrivendiscovery.org/types/MissingData + # Step 4. We further redact targets in test split. + - type: PRIMITIVE + primitive: + id: 744c4090-e2f6-489e-8efc-8b1e051bfad6 + version: 0.2.0 + python_path: d3m.primitives.evaluation.redact_columns.Common + name: Redact columns for evaluation + arguments: + inputs: + type: CONTAINER + data: steps.3.produce + outputs: + - id: produce + hyperparams: + semantic_types: + type: VALUE + data: + - https://metadata.datadrivendiscovery.org/types/TrueTarget + add_semantic_types: + type: VALUE + data: + - https://metadata.datadrivendiscovery.org/types/RedactedTarget + - https://metadata.datadrivendiscovery.org/types/MissingData diff --git a/tods/common-primitives/pipelines/regression.xgboost_gbtree.DataFrameCommon/0f636602-6299-411b-9873-4b974cd393ba.json b/tods/common-primitives/pipelines/regression.xgboost_gbtree.DataFrameCommon/0f636602-6299-411b-9873-4b974cd393ba.json new file mode 100644 index 0000000..1ae892b --- /dev/null +++ b/tods/common-primitives/pipelines/regression.xgboost_gbtree.DataFrameCommon/0f636602-6299-411b-9873-4b974cd393ba.json @@ -0,0 +1,247 @@ + +{ + "context": "TESTING", + "created": "2019-02-12T01:35:59.402796Z", + "id": "0f636602-6299-411b-9873-4b974cd393ba", + "inputs": [ + { + "name": "inputs" + } + ], + "outputs": [ + { + "data": "steps.7.produce", + "name": "output predictions" + } + ], + "schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", + "steps": [ + { + "arguments": { + "inputs": { + "data": "inputs.0", + "type": "CONTAINER" + } + }, + "outputs": [ + { + "id": "produce" + } + ], + "primitive": { + "id": "4b42ce1e-9b98-4a25-b68e-fad13311eb65", + "name": "Extract a DataFrame from a Dataset", + "python_path": "d3m.primitives.data_transformation.dataset_to_dataframe.Common", + "version": "0.3.0" + }, + "type": "PRIMITIVE" + }, + { + "arguments": { + "inputs": { + "data": "steps.0.produce", + "type": "CONTAINER" + } + }, + "hyperparams": { + "parse_semantic_types": { + "data": [ + "http://schema.org/Boolean", 
+ "http://schema.org/Integer", + "http://schema.org/Float", + "https://metadata.datadrivendiscovery.org/types/FloatVector", + "http://schema.org/DateTime" + ], + "type": "VALUE" + } + }, + "outputs": [ + { + "id": "produce" + } + ], + "primitive": { + "id": "d510cb7a-1782-4f51-b44c-58f0236e47c7", + "name": "Parses strings into their types", + "python_path": "d3m.primitives.data_transformation.column_parser.Common", + "version": "0.6.0" + }, + "type": "PRIMITIVE" + }, + { + "arguments": { + "inputs": { + "data": "steps.1.produce", + "type": "CONTAINER" + } + }, + "hyperparams": { + "semantic_types": { + "data": [ + "https://metadata.datadrivendiscovery.org/types/CategoricalData" + ], + "type": "VALUE" + } + }, + "outputs": [ + { + "id": "produce" + } + ], + "primitive": { + "id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1", + "name": "Extracts columns by semantic type", + "python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common", + "version": "0.3.0" + }, + "type": "PRIMITIVE" + }, + { + "arguments": { + "inputs": { + "data": "steps.1.produce", + "type": "CONTAINER" + } + }, + "hyperparams": { + "exclude_columns": { + "data": [ + 0 + ], + "type": "VALUE" + }, + "semantic_types": { + "data": [ + "http://schema.org/Integer", + "http://schema.org/Float" + ], + "type": "VALUE" + } + }, + "outputs": [ + { + "id": "produce" + } + ], + "primitive": { + "id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1", + "name": "Extracts columns by semantic type", + "python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common", + "version": "0.3.0" + }, + "type": "PRIMITIVE" + }, + { + "arguments": { + "inputs": { + "data": "steps.0.produce", + "type": "CONTAINER" + } + }, + "hyperparams": { + "semantic_types": { + "data": [ + "https://metadata.datadrivendiscovery.org/types/TrueTarget" + ], + "type": "VALUE" + } + }, + "outputs": [ + { + "id": "produce" + } + ], + "primitive": { + "id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1", + "name": "Extracts columns by semantic type", + "python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common", + "version": "0.3.0" + }, + "type": "PRIMITIVE" + }, + { + "arguments": { + "inputs": { + "data": "steps.3.produce", + "type": "CONTAINER" + } + }, + "hyperparams": { + "return_result": { + "data": "replace", + "type": "VALUE" + }, + "use_semantic_types": { + "data": true, + "type": "VALUE" + } + }, + "outputs": [ + { + "id": "produce" + } + ], + "primitive": { + "id": "d016df89-de62-3c53-87ed-c06bb6a23cde", + "name": "sklearn.impute.SimpleImputer", + "python_path": "d3m.primitives.data_cleaning.imputer.SKlearn", + "version": "2019.6.7" + }, + "type": "PRIMITIVE" + }, + { + "arguments": { + "inputs": { + "data": "steps.5.produce", + "type": "CONTAINER" + }, + "outputs": { + "data": "steps.4.produce", + "type": "CONTAINER" + } + }, + "hyperparams": { + "return_result": { + "data": "replace", + "type": "VALUE" + } + }, + "outputs": [ + { + "id": "produce" + } + ], + "primitive": { + "id": "cdbb80e4-e9de-4caa-a710-16b5d727b959", + "name": "XGBoost GBTree regressor", + "python_path": "d3m.primitives.regression.xgboost_gbtree.Common", + "version": "0.1.0" + }, + "type": "PRIMITIVE" + }, + { + "arguments": { + "inputs": { + "data": "steps.6.produce", + "type": "CONTAINER" + }, + "reference": { + "data": "steps.1.produce", + "type": "CONTAINER" + } + }, + "outputs": [ + { + "id": "produce" + } + ], + "primitive": { + "id": "8d38b340-f83f-4877-baaa-162f8e551736", + "name": "Construct pipeline predictions 
output", + "python_path": "d3m.primitives.data_transformation.construct_predictions.Common", + "version": "0.3.0" + }, + "type": "PRIMITIVE" + } + ] +} diff --git a/tods/common-primitives/pipelines/schema_discovery.profiler.Common/4ec215d1-6484-4502-a6dd-f659943ccb94.json b/tods/common-primitives/pipelines/schema_discovery.profiler.Common/4ec215d1-6484-4502-a6dd-f659943ccb94.json new file mode 120000 index 0000000..0deae2e --- /dev/null +++ b/tods/common-primitives/pipelines/schema_discovery.profiler.Common/4ec215d1-6484-4502-a6dd-f659943ccb94.json @@ -0,0 +1 @@ +../data_transformation.extract_columns.Common/4ec215d1-6484-4502-a6dd-f659943ccb94.json \ No newline at end of file diff --git a/tods/common-primitives/pipelines/schema_discovery.profiler.Common/a8c40699-c48d-4f12-aa18-639c5fb6baae.json b/tods/common-primitives/pipelines/schema_discovery.profiler.Common/a8c40699-c48d-4f12-aa18-639c5fb6baae.json new file mode 120000 index 0000000..b1225d9 --- /dev/null +++ b/tods/common-primitives/pipelines/schema_discovery.profiler.Common/a8c40699-c48d-4f12-aa18-639c5fb6baae.json @@ -0,0 +1 @@ +../data_transformation.grouping_field_compose.Common/a8c40699-c48d-4f12-aa18-639c5fb6baae.json \ No newline at end of file diff --git a/tods/common-primitives/pipelines/schema_discovery.profiler.Common/b523335c-0c47-4d02-a582-f69609cde1e8.json b/tods/common-primitives/pipelines/schema_discovery.profiler.Common/b523335c-0c47-4d02-a582-f69609cde1e8.json new file mode 120000 index 0000000..51266fd --- /dev/null +++ b/tods/common-primitives/pipelines/schema_discovery.profiler.Common/b523335c-0c47-4d02-a582-f69609cde1e8.json @@ -0,0 +1 @@ +../data_transformation.extract_columns_by_structural_types.Common/b523335c-0c47-4d02-a582-f69609cde1e8.json \ No newline at end of file diff --git a/tods/common-primitives/run_pipelines.sh b/tods/common-primitives/run_pipelines.sh new file mode 100755 index 0000000..437c24b --- /dev/null +++ b/tods/common-primitives/run_pipelines.sh @@ -0,0 +1,44 @@ +#!/bin/bash + +mkdir -p results + +overall_result="0" + +while IFS= read -r pipeline_run_file; do + pipeline_run_name="$(dirname "$pipeline_run_file")/$(basename -s .yml.gz "$(basename -s .yaml.gz "$pipeline_run_file")")" + primitive_name="$(basename "$(dirname "$pipeline_run_file")")" + + if [[ -L "$pipeline_run_file" ]]; then + echo ">>> Skipping '$pipeline_run_file'." + continue + else + mkdir -p "results/$pipeline_run_name" + fi + + pipelines_path="pipelines/$primitive_name" + + if [[ ! -d "$pipelines_path" ]]; then + echo ">>> ERROR: Could not find pipelines for '$pipeline_run_file'." + overall_result="1" + continue + fi + + echo ">>> Running '$pipeline_run_file'." + python3 -m d3m --pipelines-path "$pipelines_path" \ + runtime \ + --datasets /data/datasets --volumes /data/static_files \ + fit-score --input-run "$pipeline_run_file" \ + --output "results/$pipeline_run_name/predictions.csv" \ + --scores "results/$pipeline_run_name/scores.csv" \ + --output-run "results/$pipeline_run_name/pipeline_runs.yaml" + result="$?" 
+ + if [[ "$result" -eq 0 ]]; then + echo ">>> SUCCESS ($pipeline_run_file)" + else + echo ">>> ERROR ($pipeline_run_file)" + overall_result="1" + fi +done < <(find pipeline_runs -name '*.yml.gz' -or -name '*.yaml.gz') + +exit "$overall_result" diff --git a/tods/common-primitives/run_tests.py b/tods/common-primitives/run_tests.py new file mode 100755 index 0000000..16c264a --- /dev/null +++ b/tods/common-primitives/run_tests.py @@ -0,0 +1,11 @@ +#!/usr/bin/env python3 + +import sys +import unittest + +runner = unittest.TextTestRunner(verbosity=1) + +tests = unittest.TestLoader().discover('tests') + +if not runner.run(tests).wasSuccessful(): + sys.exit(1) diff --git a/tods/common-primitives/setup.cfg b/tods/common-primitives/setup.cfg new file mode 100644 index 0000000..e218fc8 --- /dev/null +++ b/tods/common-primitives/setup.cfg @@ -0,0 +1,28 @@ +[pycodestyle] +max-line-length = 200 + +[metadata] +description-file = README.md + +[mypy] +warn_redundant_casts = True +# TODO: Enable back once false positives are fixed. +# See: https://github.com/python/mypy/issues/4412 +#warn_unused_ignores = True +warn_unused_configs = True +disallow_untyped_defs = True + +# TODO: Remove once this is fixed: https://github.com/python/mypy/issues/4300 +[mypy-d3m.container.list] +ignore_errors = True + +# TODO: Remove once this is fixed: https://github.com/python/mypy/issues/4300 +[mypy-d3m.metadata.hyperparams] +ignore_errors = True + +# TODO: Remove once this is fixed: https://github.com/python/mypy/pull/4384#issuecomment-354033177 +[mypy-d3m.primitive_interfaces.distance] +ignore_errors = True + +[mypy-common_primitives.slacker.*] +ignore_errors = True diff --git a/tods/common-primitives/setup.py b/tods/common-primitives/setup.py new file mode 100644 index 0000000..c8d1e21 --- /dev/null +++ b/tods/common-primitives/setup.py @@ -0,0 +1,65 @@ +import os +import sys +from setuptools import setup, find_packages + +PACKAGE_NAME = 'common_primitives' +MINIMUM_PYTHON_VERSION = 3, 6 + + +def check_python_version(): + """Exit when the Python version is too low.""" + if sys.version_info < MINIMUM_PYTHON_VERSION: + sys.exit("Python {}.{}+ is required.".format(*MINIMUM_PYTHON_VERSION)) + + +def read_package_variable(key): + """Read the value of a variable from the package without importing.""" + module_path = os.path.join(PACKAGE_NAME, '__init__.py') + with open(module_path) as module: + for line in module: + parts = line.strip().split(' ') + if parts and parts[0] == key: + return parts[-1].strip("'") + raise KeyError("'{0}' not found in '{1}'".format(key, module_path)) + + +def read_readme(): + with open(os.path.join(os.path.dirname(__file__), 'README.md'), encoding='utf8') as file: + return file.read() + + +def read_entry_points(): + with open('entry_points.ini') as entry_points: + return entry_points.read() + + +check_python_version() +version = read_package_variable('__version__') + +setup( + name=PACKAGE_NAME, + version=version, + description='D3M common primitives', + author=read_package_variable('__author__'), + packages=find_packages(exclude=['contrib', 'docs', 'tests*']), + data_files=[('./', ['./entry_points.ini'])], + install_requires=[ + 'd3m', + 'pandas', + 'scikit-learn', + 'numpy', + 'lightgbm>=2.2.2,<=2.3.0', + 'opencv-python-headless<=4.1.1.26,>=4.1', + 'imageio>=2.3.0,<=2.6.0', + 'pillow==6.2.1', + 'xgboost>=0.81,<=0.90', + ], + entry_points=read_entry_points(), + url='https://gitlab.com/datadrivendiscovery/common-primitives', + long_description=read_readme(), + 
long_description_content_type='text/markdown', + license='Apache-2.0', + classifiers=[ + 'License :: OSI Approved :: Apache Software License', + ], +) diff --git a/tods/common-primitives/sklearn-wrap/.gitignore b/tods/common-primitives/sklearn-wrap/.gitignore new file mode 100644 index 0000000..36fa0f3 --- /dev/null +++ b/tods/common-primitives/sklearn-wrap/.gitignore @@ -0,0 +1,2 @@ +.pyc +__pycache__ diff --git a/tods/common-primitives/sklearn-wrap/requirements.txt b/tods/common-primitives/sklearn-wrap/requirements.txt new file mode 100644 index 0000000..d587988 --- /dev/null +++ b/tods/common-primitives/sklearn-wrap/requirements.txt @@ -0,0 +1,31 @@ +scikit-learn==0.21.3 +pytypes==1.0b5 +frozendict==1.2 +numpy>=1.15.4,<=1.18.1 +jsonschema==2.6.0 +requests>=2.19.1,<=2.22.0 +strict-rfc3339==0.7 +rfc3987==1.3.8 +webcolors>=1.8.1,<=1.10 +dateparser>=0.7.0,<=0.7.2 +python-dateutil==2.8.1 +pandas==0.23.4 +typing-inspect==0.5.0 +GitPython>=2.1.11,<=3.0.5 +jsonpath-ng==1.4.3 +custom-inherit>=2.2.0,<=2.2.2 +PyYAML>=5.1,<=5.3 +pycurl>=7.43.0.2,<=7.43.0.3 +pyarrow==0.15.1 +gputil>=1.3.0,<=1.4.0 +pyrsistent>=0.14.11,<=0.15.7 +scipy>=1.2.1,<=1.4.1 +openml==0.10.1 +lightgbm>=2.2.2,<=2.3.0 +opencv-python-headless<=4.1.1.26,>=4.1 +imageio>=2.3.0,<=2.6.0 +pillow==6.2.1 +xgboost>=0.81,<=0.90 +Jinja2==2.9.4 +simplejson==3.12.0 +gitdb2==2.0.6 diff --git a/tods/common-primitives/sklearn-wrap/setup.py b/tods/common-primitives/sklearn-wrap/setup.py new file mode 100644 index 0000000..a8f506e --- /dev/null +++ b/tods/common-primitives/sklearn-wrap/setup.py @@ -0,0 +1,106 @@ +import os +from setuptools import setup, find_packages + +PACKAGE_NAME = 'sklearn_wrap' + + +def read_package_variable(key): + """Read the value of a variable from the package without importing.""" + module_path = os.path.join(PACKAGE_NAME, '__init__.py') + with open(module_path) as module: + for line in module: + parts = line.strip().split(' ') + if parts and parts[0] == key: + return parts[-1].strip("'") + assert False, "'{0}' not found in '{1}'".format(key, module_path) + + +setup( + name=PACKAGE_NAME, + version=read_package_variable('__version__'), + description='Primitives created using the Sklearn auto wrapper', + author=read_package_variable('__author__'), + packages=find_packages(exclude=['contrib', 'docs', 'tests*']), + install_requires=[ + 'd3m', + 'Jinja2==2.9.4', + 'simplejson==3.12.0', + 'scikit-learn==0.21.3', + ], + url='https://gitlab.datadrivendiscovery.org/jpl/sklearn-wrapping', + entry_points = { + 'd3m.primitives': [ + 'data_cleaning.string_imputer.SKlearn = sklearn_wrap.SKStringImputer:SKStringImputer', + 'classification.gradient_boosting.SKlearn = sklearn_wrap.SKGradientBoostingClassifier:SKGradientBoostingClassifier', + 'classification.quadratic_discriminant_analysis.SKlearn = sklearn_wrap.SKQuadraticDiscriminantAnalysis:SKQuadraticDiscriminantAnalysis', + 'classification.decision_tree.SKlearn = sklearn_wrap.SKDecisionTreeClassifier:SKDecisionTreeClassifier', + 'classification.sgd.SKlearn = sklearn_wrap.SKSGDClassifier:SKSGDClassifier', + 'classification.nearest_centroid.SKlearn = sklearn_wrap.SKNearestCentroid:SKNearestCentroid', + 'classification.mlp.SKlearn = sklearn_wrap.SKMLPClassifier:SKMLPClassifier', + 'classification.bagging.SKlearn = sklearn_wrap.SKBaggingClassifier:SKBaggingClassifier', + 'classification.linear_svc.SKlearn = sklearn_wrap.SKLinearSVC:SKLinearSVC', + 'classification.linear_discriminant_analysis.SKlearn = sklearn_wrap.SKLinearDiscriminantAnalysis:SKLinearDiscriminantAnalysis', + 
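# Each entry point maps a d3m primitive path of the form
+ # d3m.primitives.<family>.<name>.SKlearn to the sklearn_wrap module and class
+ # implementing it, so the d3m index can discover these primitives once the
+ # package is installed.
+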
'classification.passive_aggressive.SKlearn = sklearn_wrap.SKPassiveAggressiveClassifier:SKPassiveAggressiveClassifier', + 'classification.gaussian_naive_bayes.SKlearn = sklearn_wrap.SKGaussianNB:SKGaussianNB', + 'classification.ada_boost.SKlearn = sklearn_wrap.SKAdaBoostClassifier:SKAdaBoostClassifier', + 'classification.random_forest.SKlearn = sklearn_wrap.SKRandomForestClassifier:SKRandomForestClassifier', + 'classification.svc.SKlearn = sklearn_wrap.SKSVC:SKSVC', + 'classification.multinomial_naive_bayes.SKlearn = sklearn_wrap.SKMultinomialNB:SKMultinomialNB', + 'classification.dummy.SKlearn = sklearn_wrap.SKDummyClassifier:SKDummyClassifier', + 'classification.extra_trees.SKlearn = sklearn_wrap.SKExtraTreesClassifier:SKExtraTreesClassifier', + 'classification.logistic_regression.SKlearn = sklearn_wrap.SKLogisticRegression:SKLogisticRegression', + 'classification.bernoulli_naive_bayes.SKlearn = sklearn_wrap.SKBernoulliNB:SKBernoulliNB', + 'classification.k_neighbors.SKlearn = sklearn_wrap.SKKNeighborsClassifier:SKKNeighborsClassifier', + 'regression.decision_tree.SKlearn = sklearn_wrap.SKDecisionTreeRegressor:SKDecisionTreeRegressor', + 'regression.ada_boost.SKlearn = sklearn_wrap.SKAdaBoostRegressor:SKAdaBoostRegressor', + 'regression.k_neighbors.SKlearn = sklearn_wrap.SKKNeighborsRegressor:SKKNeighborsRegressor', + 'regression.linear.SKlearn = sklearn_wrap.SKLinearRegression:SKLinearRegression', + 'regression.bagging.SKlearn = sklearn_wrap.SKBaggingRegressor:SKBaggingRegressor', + 'regression.lasso_cv.SKlearn = sklearn_wrap.SKLassoCV:SKLassoCV', + 'regression.elastic_net.SKlearn = sklearn_wrap.SKElasticNet:SKElasticNet', + 'regression.ard.SKlearn = sklearn_wrap.SKARDRegression:SKARDRegression', + 'regression.svr.SKlearn = sklearn_wrap.SKSVR:SKSVR', + 'regression.ridge.SKlearn = sklearn_wrap.SKRidge:SKRidge', + 'regression.gaussian_process.SKlearn = sklearn_wrap.SKGaussianProcessRegressor:SKGaussianProcessRegressor', + 'regression.mlp.SKlearn = sklearn_wrap.SKMLPRegressor:SKMLPRegressor', + 'regression.dummy.SKlearn = sklearn_wrap.SKDummyRegressor:SKDummyRegressor', + 'regression.sgd.SKlearn = sklearn_wrap.SKSGDRegressor:SKSGDRegressor', + 'regression.lasso.SKlearn = sklearn_wrap.SKLasso:SKLasso', + 'regression.lars.SKlearn = sklearn_wrap.SKLars:SKLars', + 'regression.extra_trees.SKlearn = sklearn_wrap.SKExtraTreesRegressor:SKExtraTreesRegressor', + 'regression.linear_svr.SKlearn = sklearn_wrap.SKLinearSVR:SKLinearSVR', + 'regression.random_forest.SKlearn = sklearn_wrap.SKRandomForestRegressor:SKRandomForestRegressor', + 'regression.gradient_boosting.SKlearn = sklearn_wrap.SKGradientBoostingRegressor:SKGradientBoostingRegressor', + 'regression.passive_aggressive.SKlearn = sklearn_wrap.SKPassiveAggressiveRegressor:SKPassiveAggressiveRegressor', + 'regression.kernel_ridge.SKlearn = sklearn_wrap.SKKernelRidge:SKKernelRidge', + 'data_preprocessing.max_abs_scaler.SKlearn = sklearn_wrap.SKMaxAbsScaler:SKMaxAbsScaler', + 'data_preprocessing.normalizer.SKlearn = sklearn_wrap.SKNormalizer:SKNormalizer', + 'data_preprocessing.robust_scaler.SKlearn = sklearn_wrap.SKRobustScaler:SKRobustScaler', + 'data_preprocessing.tfidf_vectorizer.SKlearn = sklearn_wrap.SKTfidfVectorizer:SKTfidfVectorizer', + 'data_transformation.one_hot_encoder.SKlearn = sklearn_wrap.SKOneHotEncoder:SKOneHotEncoder', + 'data_preprocessing.truncated_svd.SKlearn = sklearn_wrap.SKTruncatedSVD:SKTruncatedSVD', + 'feature_selection.select_percentile.SKlearn = sklearn_wrap.SKSelectPercentile:SKSelectPercentile', + 
'feature_extraction.pca.SKlearn = sklearn_wrap.SKPCA:SKPCA', + 'data_preprocessing.count_vectorizer.SKlearn = sklearn_wrap.SKCountVectorizer:SKCountVectorizer', + 'data_transformation.ordinal_encoder.SKlearn = sklearn_wrap.SKOrdinalEncoder:SKOrdinalEncoder', + 'data_preprocessing.binarizer.SKlearn = sklearn_wrap.SKBinarizer:SKBinarizer', + 'data_cleaning.missing_indicator.SKlearn = sklearn_wrap.SKMissingIndicator:SKMissingIndicator', + 'feature_selection.select_fwe.SKlearn = sklearn_wrap.SKSelectFwe:SKSelectFwe', + 'data_preprocessing.rbf_sampler.SKlearn = sklearn_wrap.SKRBFSampler:SKRBFSampler', + 'data_preprocessing.min_max_scaler.SKlearn = sklearn_wrap.SKMinMaxScaler:SKMinMaxScaler', + 'data_preprocessing.random_trees_embedding.SKlearn = sklearn_wrap.SKRandomTreesEmbedding:SKRandomTreesEmbedding', + 'data_transformation.gaussian_random_projection.SKlearn = sklearn_wrap.SKGaussianRandomProjection:SKGaussianRandomProjection', + 'feature_extraction.kernel_pca.SKlearn = sklearn_wrap.SKKernelPCA:SKKernelPCA', + 'data_preprocessing.polynomial_features.SKlearn = sklearn_wrap.SKPolynomialFeatures:SKPolynomialFeatures', + 'data_preprocessing.feature_agglomeration.SKlearn = sklearn_wrap.SKFeatureAgglomeration:SKFeatureAgglomeration', + 'data_cleaning.imputer.SKlearn = sklearn_wrap.SKImputer:SKImputer', + 'data_preprocessing.standard_scaler.SKlearn = sklearn_wrap.SKStandardScaler:SKStandardScaler', + 'data_transformation.fast_ica.SKlearn = sklearn_wrap.SKFastICA:SKFastICA', + 'data_preprocessing.quantile_transformer.SKlearn = sklearn_wrap.SKQuantileTransformer:SKQuantileTransformer', + 'data_transformation.sparse_random_projection.SKlearn = sklearn_wrap.SKSparseRandomProjection:SKSparseRandomProjection', + 'data_preprocessing.nystroem.SKlearn = sklearn_wrap.SKNystroem:SKNystroem', + 'feature_selection.variance_threshold.SKlearn = sklearn_wrap.SKVarianceThreshold:SKVarianceThreshold', + 'feature_selection.generic_univariate_select.SKlearn = sklearn_wrap.SKGenericUnivariateSelect:SKGenericUnivariateSelect', + ], + }, +) diff --git a/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKARDRegression.py b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKARDRegression.py new file mode 100644 index 0000000..6d1b782 --- /dev/null +++ b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKARDRegression.py @@ -0,0 +1,470 @@ +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os +import sklearn +import numpy +import typing + +# Custom import commands if any +from sklearn.linear_model.bayes import ARDRegression + + +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult, DockerContainer + +from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase +from d3m.primitive_interfaces.base import ProbabilisticCompositionalityMixin, ContinueFitMixin +from d3m import exceptions +import pandas + + + +Inputs = d3m_dataframe +Outputs = d3m_dataframe + + +class Params(params.Params): + coef_: Optional[ndarray] + alpha_: Optional[float] + lambda_: Optional[ndarray] + sigma_: Optional[ndarray] + scores_: Optional[Sequence[Any]] + intercept_: Optional[float] + input_column_names: 
Optional[Any] + target_names_: Optional[Sequence[Any]] + training_indices_: Optional[Sequence[int]] + target_column_indices_: Optional[Sequence[int]] + target_columns_metadata_: Optional[List[OrderedDict]] + + + +class Hyperparams(hyperparams.Hyperparams): + n_iter = hyperparams.Bounded[int]( + default=300, + lower=0, + upper=None, + description='Maximum number of iterations. Default is 300', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + tol = hyperparams.Bounded[float]( + default=0.001, + lower=0, + upper=None, + description='Stop the algorithm if w has converged. Default is 1.e-3.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + alpha_1 = hyperparams.Hyperparameter[float]( + default=1e-06, + description='Hyper-parameter : shape parameter for the Gamma distribution prior over the alpha parameter. Default is 1.e-6.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + alpha_2 = hyperparams.Hyperparameter[float]( + default=1e-06, + description='Hyper-parameter : inverse scale parameter (rate parameter) for the Gamma distribution prior over the alpha parameter. Default is 1.e-6.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + lambda_1 = hyperparams.Hyperparameter[float]( + default=1e-06, + description='Hyper-parameter : shape parameter for the Gamma distribution prior over the lambda parameter. Default is 1.e-6.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + lambda_2 = hyperparams.Hyperparameter[float]( + default=1e-06, + description='Hyper-parameter : inverse scale parameter (rate parameter) for the Gamma distribution prior over the lambda parameter. Default is 1.e-6.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + threshold_lambda = hyperparams.Hyperparameter[float]( + default=10000.0, + description='threshold for removing (pruning) weights with high precision from the computation. Default is 1.e+4.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + fit_intercept = hyperparams.UniformBool( + default=True, + description='whether to calculate the intercept for this model. If set to false, no intercept will be used in calculations (e.g. data is expected to be already centered). Default is True.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + normalize = hyperparams.UniformBool( + default=False, + description='If True, the regressors X will be normalized before regression. This parameter is ignored when `fit_intercept` is set to False. When the regressors are normalized, note that this makes the hyperparameters learnt more robust and almost independent of the number of samples. The same property is not valid for standardized data. However, if you wish to standardize, please use `preprocessing.StandardScaler` before calling `fit` on an estimator with `normalize=False`. copy_X : boolean, optional, default True. If True, X will be copied; else, it may be overwritten.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + use_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to use as training input. 
If any specified column cannot be parsed, it is skipped.", + ) + use_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to use as training target. If any specified column cannot be parsed, it is skipped.", + ) + exclude_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not use as training inputs. Applicable only if \"use_columns\" is not provided.", + ) + exclude_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not use as training target. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='new', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. 
To prevent pipelines from breaking set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute', 'https://metadata.datadrivendiscovery.org/types/PredictedTarget'], + default='https://metadata.datadrivendiscovery.org/types/PredictedTarget', + description='Decides what semantic type to attach to generated output', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + +class SKARDRegression(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]): + """ + Primitive wrapping for sklearn ARDRegression + `sklearn documentation `_ + + """ + + __author__ = "JPL MARVIN" + metadata = metadata_base.PrimitiveMetadata({ + "algorithm_types": [metadata_base.PrimitiveAlgorithmType.BAYESIAN_LINEAR_REGRESSION, ], + "name": "sklearn.linear_model.bayes.ARDRegression", + "primitive_family": metadata_base.PrimitiveFamily.REGRESSION, + "python_path": "d3m.primitives.regression.ard.SKlearn", + "source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.ARDRegression.html']}, + "version": "2019.11.13", + "id": "966dd2c4-d439-3ad6-b49f-17706595606c", + "hyperparams_to_tune": ['n_iter'], + 'installation': [ + {'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + }] + }) + + def __init__(self, *, + hyperparams: Hyperparams, + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None, + _copy_X: bool = True, + _verbose: bool = None) -> None: + + super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) + + # False + self._clf = ARDRegression( + n_iter=self.hyperparams['n_iter'], + tol=self.hyperparams['tol'], + alpha_1=self.hyperparams['alpha_1'], + alpha_2=self.hyperparams['alpha_2'], + lambda_1=self.hyperparams['lambda_1'], + lambda_2=self.hyperparams['lambda_2'], + threshold_lambda=self.hyperparams['threshold_lambda'], + fit_intercept=self.hyperparams['fit_intercept'], + normalize=self.hyperparams['normalize'], + copy_X=_copy_X, + verbose=_verbose + ) + + self._inputs = None + self._outputs = None + self._training_inputs = None + self._training_outputs = None + self._target_names = None + self._training_indices = None + self._target_column_indices = None + self._target_columns_metadata: List[OrderedDict] = None + self._input_column_names = None + self._fitted = False + self._new_training_data = False + + def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None: + self._inputs = inputs + self._outputs = outputs + self._fitted = False + self._new_training_data = True + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + if self._inputs is None or self._outputs is None: + raise ValueError("Missing training data.") + + if not self._new_training_data: + return CallResult(None) + self._new_training_data = False + + self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams) + self._training_outputs, self._target_names, self._target_column_indices = self._get_targets(self._outputs, self.hyperparams) + 
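# When 'use_semantic_types' is enabled, only numeric columns carrying the
+ # Attribute semantic type (inputs) and columns carrying TrueTarget (outputs)
+ # are selected; otherwise all columns are used. The chosen indices and names
+ # are stored so produce() can reassemble the output DataFrame later.
+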
self._input_column_names = self._training_inputs.columns + + if len(self._training_indices) > 0 and len(self._target_column_indices) > 0: + self._target_columns_metadata = self._get_target_columns_metadata(self._training_outputs.metadata, self.hyperparams) + sk_training_output = self._training_outputs.values + + shape = sk_training_output.shape + if len(shape) == 2 and shape[1] == 1: + sk_training_output = numpy.ravel(sk_training_output) + + self._clf.fit(self._training_inputs, sk_training_output) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + return CallResult(None) + + + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + sk_inputs, columns_to_use = self._get_columns_to_fit(inputs, self.hyperparams) + output = [] + if len(sk_inputs.columns): + try: + sk_output = self._clf.predict(sk_inputs) + except sklearn.exceptions.NotFittedError as error: + raise PrimitiveNotFittedError("Primitive not fitted.") from error + # For primitives that allow predicting without fitting like GaussianProcessRegressor + if not self._fitted: + raise PrimitiveNotFittedError("Primitive not fitted.") + if sparse.issparse(sk_output): + sk_output = sk_output.toarray() + output = self._wrap_predictions(inputs, sk_output) + output.columns = self._target_names + output = [output] + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._target_column_indices, + columns_list=output) + + return CallResult(outputs) + + + def get_params(self) -> Params: + if not self._fitted: + return Params( + coef_=None, + alpha_=None, + lambda_=None, + sigma_=None, + scores_=None, + intercept_=None, + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + return Params( + coef_=getattr(self._clf, 'coef_', None), + alpha_=getattr(self._clf, 'alpha_', None), + lambda_=getattr(self._clf, 'lambda_', None), + sigma_=getattr(self._clf, 'sigma_', None), + scores_=getattr(self._clf, 'scores_', None), + intercept_=getattr(self._clf, 'intercept_', None), + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + def set_params(self, *, params: Params) -> None: + self._clf.coef_ = params['coef_'] + self._clf.alpha_ = params['alpha_'] + self._clf.lambda_ = params['lambda_'] + self._clf.sigma_ = params['sigma_'] + self._clf.scores_ = params['scores_'] + self._clf.intercept_ = params['intercept_'] + self._input_column_names = params['input_column_names'] + self._training_indices = params['training_indices_'] + self._target_names = params['target_names_'] + self._target_column_indices = params['target_column_indices_'] + self._target_columns_metadata = params['target_columns_metadata_'] + + if params['coef_'] is not None: + self._fitted = True + if params['alpha_'] is not None: + self._fitted = True + if 
params['lambda_'] is not None: + self._fitted = True + if params['sigma_'] is not None: + self._fitted = True + if params['scores_'] is not None: + self._fitted = True + if params['intercept_'] is not None: + self._fitted = True + + + + + + + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=hyperparams['use_inputs_columns'], + exclude_columns=hyperparams['exclude_inputs_columns'], + can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, numpy.integer, numpy.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + @classmethod + def _get_targets(cls, data: d3m_dataframe, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return data, list(data.columns), list(range(len(data.columns))) + + metadata = data.metadata + + def can_produce_column(column_index: int) -> bool: + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/TrueTarget") + column_metadata = metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + semantic_types = set(column_metadata.get('semantic_types', [])) + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + return False + + target_column_indices, target_columns_not_to_produce = base_utils.get_columns_to_use(metadata, + use_columns=hyperparams[ + 'use_outputs_columns'], + exclude_columns= + hyperparams[ + 'exclude_outputs_columns'], + can_use_column=can_produce_column) + targets = [] + if target_column_indices: + targets = data.select_columns(target_column_indices) + target_column_names = [] + for idx in target_column_indices: + target_column_names.append(data.columns[idx]) + return targets, target_column_names, target_column_indices + + @classmethod + def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]: + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = 
OrderedDict(outputs_metadata.query_column(column_index)) + + # Update semantic types and prepare it for predicted targets. + semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set(["https://metadata.datadrivendiscovery.org/types/TrueTarget","https://metadata.datadrivendiscovery.org/types/SuggestedTarget",]) + add_semantic_types = set(["https://metadata.datadrivendiscovery.org/types/PredictedTarget",]) + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + outputs = d3m_dataframe(predictions, generate_metadata=False) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, self._target_columns_metadata) + return outputs + + + @classmethod + def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata): + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict() + semantic_types = [] + semantic_types.append('https://metadata.datadrivendiscovery.org/types/PredictedTarget') + column_name = outputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name") + if column_name is None: + column_name = "output_{}".format(column_index) + column_metadata["semantic_types"] = semantic_types + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + +SKARDRegression.__doc__ = ARDRegression.__doc__ \ No newline at end of file diff --git a/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKAdaBoostClassifier.py b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKAdaBoostClassifier.py new file mode 100644 index 0000000..e48b2b6 --- /dev/null +++ b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKAdaBoostClassifier.py @@ -0,0 +1,498 @@ +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os +import sklearn +import numpy +import typing + +# Custom import commands if any +from sklearn.ensemble.weight_boosting import AdaBoostClassifier + + +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult, DockerContainer + +from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase 
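+# The ProbabilisticCompositionalityMixin imported below supplies the
+# log_likelihoods() interface that this wrapper implements further down.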
+from d3m.primitive_interfaces.base import ProbabilisticCompositionalityMixin, ContinueFitMixin +from d3m import exceptions +import pandas + + + +Inputs = d3m_dataframe +Outputs = d3m_dataframe + + +class Params(params.Params): + estimators_: Optional[Sequence[sklearn.base.BaseEstimator]] + classes_: Optional[ndarray] + n_classes_: Optional[int] + estimator_weights_: Optional[ndarray] + estimator_errors_: Optional[ndarray] + base_estimator_: Optional[object] + estimator_params: Optional[tuple] + input_column_names: Optional[Any] + target_names_: Optional[Sequence[Any]] + training_indices_: Optional[Sequence[int]] + target_column_indices_: Optional[Sequence[int]] + target_columns_metadata_: Optional[List[OrderedDict]] + + + +class Hyperparams(hyperparams.Hyperparams): + base_estimator = hyperparams.Constant( + default=None, + description='The base estimator from which the boosted ensemble is built. Support for sample weighting is required, as well as proper `classes_` and `n_classes_` attributes.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + n_estimators = hyperparams.Bounded[int]( + lower=1, + upper=None, + default=50, + description='The maximum number of estimators at which boosting is terminated. In case of perfect fit, the learning procedure is stopped early.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + learning_rate = hyperparams.Uniform( + lower=0.01, + upper=2, + default=0.1, + description='Learning rate shrinks the contribution of each classifier by ``learning_rate``. There is a trade-off between ``learning_rate`` and ``n_estimators``.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + algorithm = hyperparams.Enumeration[str]( + values=['SAMME.R', 'SAMME'], + default='SAMME.R', + description='If \'SAMME.R\' then use the SAMME.R real boosting algorithm. ``base_estimator`` must support calculation of class probabilities. If \'SAMME\' then use the SAMME discrete boosting algorithm. The SAMME.R algorithm typically converges faster than SAMME, achieving a lower test error with fewer boosting iterations.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + use_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to use as training input. If any specified column cannot be parsed, it is skipped.", + ) + use_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to use as training target. If any specified column cannot be parsed, it is skipped.", + ) + exclude_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not use as training inputs. Applicable only if \"use_columns\" is not provided.", + ) + exclude_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not use as training target. 
Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='new', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute', 'https://metadata.datadrivendiscovery.org/types/PredictedTarget'], + default='https://metadata.datadrivendiscovery.org/types/PredictedTarget', + description='Decides what semantic type to attach to generated output', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + +class SKAdaBoostClassifier(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams], + ProbabilisticCompositionalityMixin[Inputs, Outputs, Params, Hyperparams]): + """ + Primitive wrapping for sklearn AdaBoostClassifier + `sklearn documentation `_ + + """ + + __author__ = "JPL MARVIN" + metadata = metadata_base.PrimitiveMetadata({ + "algorithm_types": [metadata_base.PrimitiveAlgorithmType.ADABOOST, ], + "name": "sklearn.ensemble.weight_boosting.AdaBoostClassifier", + "primitive_family": metadata_base.PrimitiveFamily.CLASSIFICATION, + "python_path": "d3m.primitives.classification.ada_boost.SKlearn", + "source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html']}, + "version": "2019.11.13", + "id": "4210a6a6-14ab-4490-a7dc-460763e70e55", + "hyperparams_to_tune": ['learning_rate', 'n_estimators'], + 'installation': [ + {'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + }] + }) + + def __init__(self, *, + hyperparams: Hyperparams, + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None) -> None: + + super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) + + # False + self._clf = AdaBoostClassifier( + base_estimator=self.hyperparams['base_estimator'], + 
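# The remaining estimator arguments are taken directly from the primitive's
+ # hyperparams; random_state is tied to the d3m random_seed so that fits are
+ # reproducible for a given seed.
+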
n_estimators=self.hyperparams['n_estimators'], + learning_rate=self.hyperparams['learning_rate'], + algorithm=self.hyperparams['algorithm'], + random_state=self.random_seed, + ) + + self._inputs = None + self._outputs = None + self._training_inputs = None + self._training_outputs = None + self._target_names = None + self._training_indices = None + self._target_column_indices = None + self._target_columns_metadata: List[OrderedDict] = None + self._input_column_names = None + self._fitted = False + self._new_training_data = False + + def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None: + self._inputs = inputs + self._outputs = outputs + self._fitted = False + self._new_training_data = True + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + if self._inputs is None or self._outputs is None: + raise ValueError("Missing training data.") + + if not self._new_training_data: + return CallResult(None) + self._new_training_data = False + + self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams) + self._training_outputs, self._target_names, self._target_column_indices = self._get_targets(self._outputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + if len(self._training_indices) > 0 and len(self._target_column_indices) > 0: + self._target_columns_metadata = self._get_target_columns_metadata(self._training_outputs.metadata, self.hyperparams) + sk_training_output = self._training_outputs.values + + shape = sk_training_output.shape + if len(shape) == 2 and shape[1] == 1: + sk_training_output = numpy.ravel(sk_training_output) + + self._clf.fit(self._training_inputs, sk_training_output) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + return CallResult(None) + + + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + sk_inputs, columns_to_use = self._get_columns_to_fit(inputs, self.hyperparams) + output = [] + if len(sk_inputs.columns): + try: + sk_output = self._clf.predict(sk_inputs) + except sklearn.exceptions.NotFittedError as error: + raise PrimitiveNotFittedError("Primitive not fitted.") from error + # For primitives that allow predicting without fitting like GaussianProcessRegressor + if not self._fitted: + raise PrimitiveNotFittedError("Primitive not fitted.") + if sparse.issparse(sk_output): + sk_output = sk_output.toarray() + output = self._wrap_predictions(inputs, sk_output) + output.columns = self._target_names + output = [output] + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._target_column_indices, + columns_list=output) + + return CallResult(outputs) + + + def get_params(self) -> Params: + if not self._fitted: + return Params( + estimators_=None, + classes_=None, + n_classes_=None, + estimator_weights_=None, + estimator_errors_=None, + base_estimator_=None, + estimator_params=None, + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + 
target_columns_metadata_=self._target_columns_metadata + ) + + return Params( + estimators_=getattr(self._clf, 'estimators_', None), + classes_=getattr(self._clf, 'classes_', None), + n_classes_=getattr(self._clf, 'n_classes_', None), + estimator_weights_=getattr(self._clf, 'estimator_weights_', None), + estimator_errors_=getattr(self._clf, 'estimator_errors_', None), + base_estimator_=getattr(self._clf, 'base_estimator_', None), + estimator_params=getattr(self._clf, 'estimator_params', None), + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + def set_params(self, *, params: Params) -> None: + self._clf.estimators_ = params['estimators_'] + self._clf.classes_ = params['classes_'] + self._clf.n_classes_ = params['n_classes_'] + self._clf.estimator_weights_ = params['estimator_weights_'] + self._clf.estimator_errors_ = params['estimator_errors_'] + self._clf.base_estimator_ = params['base_estimator_'] + self._clf.estimator_params = params['estimator_params'] + self._input_column_names = params['input_column_names'] + self._training_indices = params['training_indices_'] + self._target_names = params['target_names_'] + self._target_column_indices = params['target_column_indices_'] + self._target_columns_metadata = params['target_columns_metadata_'] + + if params['estimators_'] is not None: + self._fitted = True + if params['classes_'] is not None: + self._fitted = True + if params['n_classes_'] is not None: + self._fitted = True + if params['estimator_weights_'] is not None: + self._fitted = True + if params['estimator_errors_'] is not None: + self._fitted = True + if params['base_estimator_'] is not None: + self._fitted = True + if params['estimator_params'] is not None: + self._fitted = True + + + def log_likelihoods(self, *, + outputs: Outputs, + inputs: Inputs, + timeout: float = None, + iterations: int = None) -> CallResult[Sequence[float]]: + inputs = inputs.iloc[:, self._training_indices] # Get ndarray + outputs = outputs.iloc[:, self._target_column_indices] + + if len(inputs.columns) and len(outputs.columns): + + if outputs.shape[1] != self._clf.n_outputs_: + raise exceptions.InvalidArgumentValueError("\"outputs\" argument does not have the correct number of target columns.") + + log_proba = self._clf.predict_log_proba(inputs) + + # Making it always a list, even when only one target. + if self._clf.n_outputs_ == 1: + log_proba = [log_proba] + classes = [self._clf.classes_] + else: + classes = self._clf.classes_ + + samples_length = inputs.shape[0] + + log_likelihoods = [] + for k in range(self._clf.n_outputs_): + # We have to map each class to its internal (numerical) index used in the learner. + # This allows "outputs" to contain string classes. + outputs_column = outputs.iloc[:, k] + classes_map = pandas.Series(numpy.arange(len(classes[k])), index=classes[k]) + mapped_outputs_column = outputs_column.map(classes_map) + + # For each target column (column in "outputs"), for each sample (row) we pick the log + # likelihood for a given class. 
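+ # The fancy indexing below picks, for each row, the log-probability of that
+ # row's own (numerically mapped) class, giving one log-likelihood per sample
+ # for target column k.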
+ log_likelihoods.append(log_proba[k][numpy.arange(samples_length), mapped_outputs_column]) + + results = d3m_dataframe(dict(enumerate(log_likelihoods)), generate_metadata=True) + results.columns = outputs.columns + + for k in range(self._clf.n_outputs_): + column_metadata = outputs.metadata.query_column(k) + if 'name' in column_metadata: + results.metadata = results.metadata.update_column(k, {'name': column_metadata['name']}) + + else: + results = d3m_dataframe(generate_metadata=True) + + return CallResult(results) + + + + def produce_feature_importances(self, *, timeout: float = None, iterations: int = None) -> CallResult[d3m_dataframe]: + output = d3m_dataframe(self._clf.feature_importances_.reshape((1, len(self._input_column_names)))) + output.columns = self._input_column_names + for i in range(len(self._input_column_names)): + output.metadata = output.metadata.update_column(i, {"name": self._input_column_names[i]}) + return CallResult(output) + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=hyperparams['use_inputs_columns'], + exclude_columns=hyperparams['exclude_inputs_columns'], + can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, numpy.integer, numpy.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + @classmethod + def _get_targets(cls, data: d3m_dataframe, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return data, list(data.columns), list(range(len(data.columns))) + + metadata = data.metadata + + def can_produce_column(column_index: int) -> bool: + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/TrueTarget") + column_metadata = metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + semantic_types = set(column_metadata.get('semantic_types', [])) + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + return False + + target_column_indices, target_columns_not_to_produce = base_utils.get_columns_to_use(metadata, + use_columns=hyperparams[ + 'use_outputs_columns'], + exclude_columns= + hyperparams[ + 
'exclude_outputs_columns'], + can_use_column=can_produce_column) + targets = [] + if target_column_indices: + targets = data.select_columns(target_column_indices) + target_column_names = [] + for idx in target_column_indices: + target_column_names.append(data.columns[idx]) + return targets, target_column_names, target_column_indices + + @classmethod + def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]: + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) + + # Update semantic types and prepare it for predicted targets. + semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set(["https://metadata.datadrivendiscovery.org/types/TrueTarget","https://metadata.datadrivendiscovery.org/types/SuggestedTarget",]) + add_semantic_types = set(["https://metadata.datadrivendiscovery.org/types/PredictedTarget",]) + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + outputs = d3m_dataframe(predictions, generate_metadata=False) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, self._target_columns_metadata) + return outputs + + + @classmethod + def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata): + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict() + semantic_types = [] + semantic_types.append('https://metadata.datadrivendiscovery.org/types/PredictedTarget') + column_name = outputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name") + if column_name is None: + column_name = "output_{}".format(column_index) + column_metadata["semantic_types"] = semantic_types + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + +SKAdaBoostClassifier.__doc__ = AdaBoostClassifier.__doc__ \ No newline at end of file diff --git a/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKAdaBoostRegressor.py b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKAdaBoostRegressor.py new file mode 100644 index 0000000..bf06e54 --- /dev/null +++ b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKAdaBoostRegressor.py @@ -0,0 +1,437 @@ +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple +from numpy import 
ndarray +from collections import OrderedDict +from scipy import sparse +import os +import sklearn +import numpy +import typing + +# Custom import commands if any +from sklearn.ensemble.weight_boosting import AdaBoostRegressor + + +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult, DockerContainer + +from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase +from d3m.primitive_interfaces.base import ProbabilisticCompositionalityMixin, ContinueFitMixin +from d3m import exceptions +import pandas + + + +Inputs = d3m_dataframe +Outputs = d3m_dataframe + + +class Params(params.Params): + estimators_: Optional[List[sklearn.tree.DecisionTreeRegressor]] + estimator_weights_: Optional[ndarray] + estimator_errors_: Optional[ndarray] + estimator_params: Optional[tuple] + base_estimator_: Optional[object] + input_column_names: Optional[Any] + target_names_: Optional[Sequence[Any]] + training_indices_: Optional[Sequence[int]] + target_column_indices_: Optional[Sequence[int]] + target_columns_metadata_: Optional[List[OrderedDict]] + + + +class Hyperparams(hyperparams.Hyperparams): + base_estimator = hyperparams.Constant( + default=None, + description='The base estimator from which the boosted ensemble is built. Support for sample weighting is required.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + n_estimators = hyperparams.Bounded[int]( + lower=1, + upper=None, + default=50, + description='The maximum number of estimators at which boosting is terminated. In case of perfect fit, the learning procedure is stopped early.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + learning_rate = hyperparams.Uniform( + lower=0.01, + upper=2, + default=0.1, + description='Learning rate shrinks the contribution of each regressor by ``learning_rate``. There is a trade-off between ``learning_rate`` and ``n_estimators``.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + loss = hyperparams.Enumeration[str]( + values=['linear', 'square', 'exponential'], + default='linear', + description='The loss function to use when updating the weights after each boosting iteration.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + use_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to use as training input. If any specified column cannot be parsed, it is skipped.", + ) + use_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to use as training target. If any specified column cannot be parsed, it is skipped.", + ) + exclude_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not use as training inputs. 
Applicable only if \"use_columns\" is not provided.", + ) + exclude_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not use as training target. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='new', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute', 'https://metadata.datadrivendiscovery.org/types/PredictedTarget'], + default='https://metadata.datadrivendiscovery.org/types/PredictedTarget', + description='Decides what semantic type to attach to generated output', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + +class SKAdaBoostRegressor(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]): + """ + Primitive wrapping for sklearn AdaBoostRegressor + `sklearn documentation `_ + + """ + + __author__ = "JPL MARVIN" + metadata = metadata_base.PrimitiveMetadata({ + "algorithm_types": [metadata_base.PrimitiveAlgorithmType.ADABOOST, ], + "name": "sklearn.ensemble.weight_boosting.AdaBoostRegressor", + "primitive_family": metadata_base.PrimitiveFamily.REGRESSION, + "python_path": "d3m.primitives.regression.ada_boost.SKlearn", + "source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostRegressor.html']}, + "version": "2019.11.13", + "id": "6cab1537-02e1-4dc4-9ebb-53fa2cbabedd", + "hyperparams_to_tune": ['learning_rate', 'n_estimators'], + 'installation': [ + {'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + }] + }) + + def __init__(self, *, + hyperparams: Hyperparams, + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = 
None) -> None: + + super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) + + # False + self._clf = AdaBoostRegressor( + base_estimator=self.hyperparams['base_estimator'], + n_estimators=self.hyperparams['n_estimators'], + learning_rate=self.hyperparams['learning_rate'], + loss=self.hyperparams['loss'], + random_state=self.random_seed, + ) + + self._inputs = None + self._outputs = None + self._training_inputs = None + self._training_outputs = None + self._target_names = None + self._training_indices = None + self._target_column_indices = None + self._target_columns_metadata: List[OrderedDict] = None + self._input_column_names = None + self._fitted = False + self._new_training_data = False + + def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None: + self._inputs = inputs + self._outputs = outputs + self._fitted = False + self._new_training_data = True + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + if self._inputs is None or self._outputs is None: + raise ValueError("Missing training data.") + + if not self._new_training_data: + return CallResult(None) + self._new_training_data = False + + self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams) + self._training_outputs, self._target_names, self._target_column_indices = self._get_targets(self._outputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + if len(self._training_indices) > 0 and len(self._target_column_indices) > 0: + self._target_columns_metadata = self._get_target_columns_metadata(self._training_outputs.metadata, self.hyperparams) + sk_training_output = self._training_outputs.values + + shape = sk_training_output.shape + if len(shape) == 2 and shape[1] == 1: + sk_training_output = numpy.ravel(sk_training_output) + + self._clf.fit(self._training_inputs, sk_training_output) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + return CallResult(None) + + + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + sk_inputs, columns_to_use = self._get_columns_to_fit(inputs, self.hyperparams) + output = [] + if len(sk_inputs.columns): + try: + sk_output = self._clf.predict(sk_inputs) + except sklearn.exceptions.NotFittedError as error: + raise PrimitiveNotFittedError("Primitive not fitted.") from error + # For primitives that allow predicting without fitting like GaussianProcessRegressor + if not self._fitted: + raise PrimitiveNotFittedError("Primitive not fitted.") + if sparse.issparse(sk_output): + sk_output = sk_output.toarray() + output = self._wrap_predictions(inputs, sk_output) + output.columns = self._target_names + output = [output] + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._target_column_indices, + columns_list=output) + + return CallResult(outputs) + + + def get_params(self) -> Params: + if not self._fitted: + return Params( + estimators_=None, + estimator_weights_=None, + estimator_errors_=None, + estimator_params=None, + base_estimator_=None, + 
input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + return Params( + estimators_=getattr(self._clf, 'estimators_', None), + estimator_weights_=getattr(self._clf, 'estimator_weights_', None), + estimator_errors_=getattr(self._clf, 'estimator_errors_', None), + estimator_params=getattr(self._clf, 'estimator_params', None), + base_estimator_=getattr(self._clf, 'base_estimator_', None), + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + def set_params(self, *, params: Params) -> None: + self._clf.estimators_ = params['estimators_'] + self._clf.estimator_weights_ = params['estimator_weights_'] + self._clf.estimator_errors_ = params['estimator_errors_'] + self._clf.estimator_params = params['estimator_params'] + self._clf.base_estimator_ = params['base_estimator_'] + self._input_column_names = params['input_column_names'] + self._training_indices = params['training_indices_'] + self._target_names = params['target_names_'] + self._target_column_indices = params['target_column_indices_'] + self._target_columns_metadata = params['target_columns_metadata_'] + + if params['estimators_'] is not None: + self._fitted = True + if params['estimator_weights_'] is not None: + self._fitted = True + if params['estimator_errors_'] is not None: + self._fitted = True + if params['estimator_params'] is not None: + self._fitted = True + if params['base_estimator_'] is not None: + self._fitted = True + + + + + + def produce_feature_importances(self, *, timeout: float = None, iterations: int = None) -> CallResult[d3m_dataframe]: + output = d3m_dataframe(self._clf.feature_importances_.reshape((1, len(self._input_column_names)))) + output.columns = self._input_column_names + for i in range(len(self._input_column_names)): + output.metadata = output.metadata.update_column(i, {"name": self._input_column_names[i]}) + return CallResult(output) + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=hyperparams['use_inputs_columns'], + exclude_columns=hyperparams['exclude_inputs_columns'], + can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, numpy.integer, numpy.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + + if len(semantic_types) == 0: + 
cls.logger.warning("No semantic types found in column metadata") + return False + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + @classmethod + def _get_targets(cls, data: d3m_dataframe, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return data, list(data.columns), list(range(len(data.columns))) + + metadata = data.metadata + + def can_produce_column(column_index: int) -> bool: + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/TrueTarget") + column_metadata = metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + semantic_types = set(column_metadata.get('semantic_types', [])) + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + return False + + target_column_indices, target_columns_not_to_produce = base_utils.get_columns_to_use(metadata, + use_columns=hyperparams[ + 'use_outputs_columns'], + exclude_columns= + hyperparams[ + 'exclude_outputs_columns'], + can_use_column=can_produce_column) + targets = [] + if target_column_indices: + targets = data.select_columns(target_column_indices) + target_column_names = [] + for idx in target_column_indices: + target_column_names.append(data.columns[idx]) + return targets, target_column_names, target_column_indices + + @classmethod + def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]: + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) + + # Update semantic types and prepare it for predicted targets. 
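+            # Illustrative note (editor-added comment; example values are hypothetical): if a target
+            # column's metadata carries the semantic types {Target, TrueTarget}, the statements below
+            # drop TrueTarget/SuggestedTarget and add PredictedTarget plus the configured
+            # 'return_semantic_type', leaving e.g. {Target, PredictedTarget} on the prediction column.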
+ semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set(["https://metadata.datadrivendiscovery.org/types/TrueTarget","https://metadata.datadrivendiscovery.org/types/SuggestedTarget",]) + add_semantic_types = set(["https://metadata.datadrivendiscovery.org/types/PredictedTarget",]) + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + outputs = d3m_dataframe(predictions, generate_metadata=False) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, self._target_columns_metadata) + return outputs + + + @classmethod + def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata): + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict() + semantic_types = [] + semantic_types.append('https://metadata.datadrivendiscovery.org/types/PredictedTarget') + column_name = outputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name") + if column_name is None: + column_name = "output_{}".format(column_index) + column_metadata["semantic_types"] = semantic_types + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + +SKAdaBoostRegressor.__doc__ = AdaBoostRegressor.__doc__ \ No newline at end of file diff --git a/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKBaggingClassifier.py b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKBaggingClassifier.py new file mode 100644 index 0000000..c875434 --- /dev/null +++ b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKBaggingClassifier.py @@ -0,0 +1,589 @@ +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os +import sklearn +import numpy +import typing + +# Custom import commands if any +from sklearn.ensemble.bagging import BaggingClassifier + + +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult, DockerContainer + +from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase +from d3m.primitive_interfaces.base import ProbabilisticCompositionalityMixin, ContinueFitMixin +from d3m import exceptions +import 
pandas + + + +Inputs = d3m_dataframe +Outputs = d3m_dataframe + + +class Params(params.Params): + base_estimator_: Optional[object] + estimators_: Optional[List[sklearn.tree.DecisionTreeClassifier]] + estimators_features_: Optional[List[ndarray]] + classes_: Optional[ndarray] + n_classes_: Optional[int] + oob_score_: Optional[float] + oob_decision_function_: Optional[List[ndarray]] + n_features_: Optional[int] + _max_features: Optional[int] + _max_samples: Optional[int] + _n_samples: Optional[int] + _seeds: Optional[ndarray] + estimator_params: Optional[tuple] + input_column_names: Optional[Any] + target_names_: Optional[Sequence[Any]] + training_indices_: Optional[Sequence[int]] + target_column_indices_: Optional[Sequence[int]] + target_columns_metadata_: Optional[List[OrderedDict]] + + + +class Hyperparams(hyperparams.Hyperparams): + n_estimators = hyperparams.Bounded[int]( + default=10, + lower=1, + upper=None, + description='The number of base estimators in the ensemble.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + max_samples = hyperparams.Union( + configuration=OrderedDict({ + 'absolute': hyperparams.Bounded[int]( + lower=0, + upper=None, + default=0, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'percent': hyperparams.Bounded[float]( + default=1.0, + lower=0, + upper=1, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='percent', + description='The number of samples to draw from X to train each base estimator. - If int, then draw `max_samples` samples. - If float, then draw `max_samples * X.shape[0]` samples.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + max_features = hyperparams.Union( + configuration=OrderedDict({ + 'absolute': hyperparams.Bounded[int]( + lower=0, + upper=None, + default=0, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'percent': hyperparams.Bounded[float]( + default=1.0, + lower=0, + upper=1, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='percent', + description='The number of features to draw from X to train each base estimator. - If int, then draw `max_features` features. - If float, then draw `max_features * X.shape[1]` features.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + bootstrap = hyperparams.Enumeration[str]( + values=['bootstrap', 'bootstrap_with_oob_score', 'disabled'], + default='bootstrap', + description='Whether bootstrap samples are used when building trees.' + ' And whether to use out-of-bag samples to estimate the generalization accuracy.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + bootstrap_features = hyperparams.UniformBool( + default=False, + description='Whether features are drawn with replacement.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + warm_start = hyperparams.UniformBool( + default=False, + description='When set to True, reuse the solution of the previous call to fit and add more estimators to the ensemble, otherwise, just fit a whole new ensemble. .. 
versionadded:: 0.17 *warm_start* constructor parameter.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + n_jobs = hyperparams.Union( + configuration=OrderedDict({ + 'limit': hyperparams.Bounded[int]( + default=1, + lower=1, + upper=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'all_cores': hyperparams.Constant( + default=-1, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='limit', + description='The number of jobs to run in parallel for both `fit` and `predict`. If -1, then the number of jobs is set to the number of cores.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ResourcesUseParameter'] + ) + + use_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to use as training input. If any specified column cannot be parsed, it is skipped.", + ) + use_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to use as training target. If any specified column cannot be parsed, it is skipped.", + ) + exclude_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not use as training inputs. Applicable only if \"use_columns\" is not provided.", + ) + exclude_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not use as training target. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='new', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. 
To prevent pipelines from breaking set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute', 'https://metadata.datadrivendiscovery.org/types/PredictedTarget'], + default='https://metadata.datadrivendiscovery.org/types/PredictedTarget', + description='Decides what semantic type to attach to generated output', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + +class SKBaggingClassifier(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams], + ProbabilisticCompositionalityMixin[Inputs, Outputs, Params, Hyperparams]): + """ + Primitive wrapping for sklearn BaggingClassifier + `sklearn documentation `_ + + """ + + __author__ = "JPL MARVIN" + metadata = metadata_base.PrimitiveMetadata({ + "algorithm_types": [metadata_base.PrimitiveAlgorithmType.ENSEMBLE_LEARNING, ], + "name": "sklearn.ensemble.bagging.BaggingClassifier", + "primitive_family": metadata_base.PrimitiveFamily.CLASSIFICATION, + "python_path": "d3m.primitives.classification.bagging.SKlearn", + "source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingClassifier.html']}, + "version": "2019.11.13", + "id": "1b2a32a6-0ec5-3ca0-9386-b8b1f1b831d1", + "hyperparams_to_tune": ['n_estimators', 'max_samples', 'max_features'], + 'installation': [ + {'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + }] + }) + + def __init__(self, *, + hyperparams: Hyperparams, + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None, + _verbose: int = 0) -> None: + + super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) + + # False + self._clf = BaggingClassifier( + n_estimators=self.hyperparams['n_estimators'], + max_samples=self.hyperparams['max_samples'], + max_features=self.hyperparams['max_features'], + bootstrap=self.hyperparams['bootstrap'] in ['bootstrap', 'bootstrap_with_oob_score'], + bootstrap_features=self.hyperparams['bootstrap_features'], + oob_score=self.hyperparams['bootstrap'] in ['bootstrap_with_oob_score'], + warm_start=self.hyperparams['warm_start'], + n_jobs=self.hyperparams['n_jobs'], + random_state=self.random_seed, + verbose=_verbose + ) + + self._inputs = None + self._outputs = None + self._training_inputs = None + self._training_outputs = None + self._target_names = None + self._training_indices = None + self._target_column_indices = None + self._target_columns_metadata: List[OrderedDict] = None + self._input_column_names = None + self._fitted = False + self._new_training_data = False + + def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None: + self._inputs = inputs + self._outputs = outputs + self._fitted = False + self._new_training_data = True + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + if self._inputs is None or self._outputs is None: + raise ValueError("Missing training data.") + + if not self._new_training_data: + return CallResult(None) + self._new_training_data = False + + self._training_inputs, self._training_indices = 
self._get_columns_to_fit(self._inputs, self.hyperparams) + self._training_outputs, self._target_names, self._target_column_indices = self._get_targets(self._outputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + if len(self._training_indices) > 0 and len(self._target_column_indices) > 0: + self._target_columns_metadata = self._get_target_columns_metadata(self._training_outputs.metadata, self.hyperparams) + sk_training_output = self._training_outputs.values + + shape = sk_training_output.shape + if len(shape) == 2 and shape[1] == 1: + sk_training_output = numpy.ravel(sk_training_output) + + self._clf.fit(self._training_inputs, sk_training_output) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + return CallResult(None) + + + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + sk_inputs, columns_to_use = self._get_columns_to_fit(inputs, self.hyperparams) + output = [] + if len(sk_inputs.columns): + try: + sk_output = self._clf.predict(sk_inputs) + except sklearn.exceptions.NotFittedError as error: + raise PrimitiveNotFittedError("Primitive not fitted.") from error + # For primitives that allow predicting without fitting like GaussianProcessRegressor + if not self._fitted: + raise PrimitiveNotFittedError("Primitive not fitted.") + if sparse.issparse(sk_output): + sk_output = sk_output.toarray() + output = self._wrap_predictions(inputs, sk_output) + output.columns = self._target_names + output = [output] + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._target_column_indices, + columns_list=output) + + return CallResult(outputs) + + + def get_params(self) -> Params: + if not self._fitted: + return Params( + base_estimator_=None, + estimators_=None, + estimators_features_=None, + classes_=None, + n_classes_=None, + oob_score_=None, + oob_decision_function_=None, + n_features_=None, + _max_features=None, + _max_samples=None, + _n_samples=None, + _seeds=None, + estimator_params=None, + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + return Params( + base_estimator_=getattr(self._clf, 'base_estimator_', None), + estimators_=getattr(self._clf, 'estimators_', None), + estimators_features_=getattr(self._clf, 'estimators_features_', None), + classes_=getattr(self._clf, 'classes_', None), + n_classes_=getattr(self._clf, 'n_classes_', None), + oob_score_=getattr(self._clf, 'oob_score_', None), + oob_decision_function_=getattr(self._clf, 'oob_decision_function_', None), + n_features_=getattr(self._clf, 'n_features_', None), + _max_features=getattr(self._clf, '_max_features', None), + _max_samples=getattr(self._clf, '_max_samples', None), + _n_samples=getattr(self._clf, '_n_samples', None), + _seeds=getattr(self._clf, '_seeds', None), + estimator_params=getattr(self._clf, 'estimator_params', None), + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + 
target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + def set_params(self, *, params: Params) -> None: + self._clf.base_estimator_ = params['base_estimator_'] + self._clf.estimators_ = params['estimators_'] + self._clf.estimators_features_ = params['estimators_features_'] + self._clf.classes_ = params['classes_'] + self._clf.n_classes_ = params['n_classes_'] + self._clf.oob_score_ = params['oob_score_'] + self._clf.oob_decision_function_ = params['oob_decision_function_'] + self._clf.n_features_ = params['n_features_'] + self._clf._max_features = params['_max_features'] + self._clf._max_samples = params['_max_samples'] + self._clf._n_samples = params['_n_samples'] + self._clf._seeds = params['_seeds'] + self._clf.estimator_params = params['estimator_params'] + self._input_column_names = params['input_column_names'] + self._training_indices = params['training_indices_'] + self._target_names = params['target_names_'] + self._target_column_indices = params['target_column_indices_'] + self._target_columns_metadata = params['target_columns_metadata_'] + + if params['base_estimator_'] is not None: + self._fitted = True + if params['estimators_'] is not None: + self._fitted = True + if params['estimators_features_'] is not None: + self._fitted = True + if params['classes_'] is not None: + self._fitted = True + if params['n_classes_'] is not None: + self._fitted = True + if params['oob_score_'] is not None: + self._fitted = True + if params['oob_decision_function_'] is not None: + self._fitted = True + if params['n_features_'] is not None: + self._fitted = True + if params['_max_features'] is not None: + self._fitted = True + if params['_max_samples'] is not None: + self._fitted = True + if params['_n_samples'] is not None: + self._fitted = True + if params['_seeds'] is not None: + self._fitted = True + if params['estimator_params'] is not None: + self._fitted = True + + + def log_likelihoods(self, *, + outputs: Outputs, + inputs: Inputs, + timeout: float = None, + iterations: int = None) -> CallResult[Sequence[float]]: + inputs = inputs.iloc[:, self._training_indices] # Get ndarray + outputs = outputs.iloc[:, self._target_column_indices] + + if len(inputs.columns) and len(outputs.columns): + + if outputs.shape[1] != self._clf.n_outputs_: + raise exceptions.InvalidArgumentValueError("\"outputs\" argument does not have the correct number of target columns.") + + log_proba = self._clf.predict_log_proba(inputs) + + # Making it always a list, even when only one target. + if self._clf.n_outputs_ == 1: + log_proba = [log_proba] + classes = [self._clf.classes_] + else: + classes = self._clf.classes_ + + samples_length = inputs.shape[0] + + log_likelihoods = [] + for k in range(self._clf.n_outputs_): + # We have to map each class to its internal (numerical) index used in the learner. + # This allows "outputs" to contain string classes. + outputs_column = outputs.iloc[:, k] + classes_map = pandas.Series(numpy.arange(len(classes[k])), index=classes[k]) + mapped_outputs_column = outputs_column.map(classes_map) + + # For each target column (column in "outputs"), for each sample (row) we pick the log + # likelihood for a given class. 
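+                # Illustrative sketch (hypothetical values, not from the generated code): with
+                # classes[k] == ['a', 'b'] and outputs_column == ['b', 'a'], classes_map is
+                # {'a': 0, 'b': 1}, so mapped_outputs_column == [1, 0] and the fancy indexing
+                # below picks log_proba[k][0, 1] and log_proba[k][1, 0] for the two samples.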
+ log_likelihoods.append(log_proba[k][numpy.arange(samples_length), mapped_outputs_column]) + + results = d3m_dataframe(dict(enumerate(log_likelihoods)), generate_metadata=True) + results.columns = outputs.columns + + for k in range(self._clf.n_outputs_): + column_metadata = outputs.metadata.query_column(k) + if 'name' in column_metadata: + results.metadata = results.metadata.update_column(k, {'name': column_metadata['name']}) + + else: + results = d3m_dataframe(generate_metadata=True) + + return CallResult(results) + + + + + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=hyperparams['use_inputs_columns'], + exclude_columns=hyperparams['exclude_inputs_columns'], + can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, numpy.integer, numpy.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + @classmethod + def _get_targets(cls, data: d3m_dataframe, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return data, list(data.columns), list(range(len(data.columns))) + + metadata = data.metadata + + def can_produce_column(column_index: int) -> bool: + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/TrueTarget") + column_metadata = metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + semantic_types = set(column_metadata.get('semantic_types', [])) + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + return False + + target_column_indices, target_columns_not_to_produce = base_utils.get_columns_to_use(metadata, + use_columns=hyperparams[ + 'use_outputs_columns'], + exclude_columns= + hyperparams[ + 'exclude_outputs_columns'], + can_use_column=can_produce_column) + targets = [] + if target_column_indices: + targets = data.select_columns(target_column_indices) + target_column_names = [] + for idx in target_column_indices: + target_column_names.append(data.columns[idx]) + return targets, target_column_names, target_column_indices + + @classmethod + def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, 
hyperparams) -> List[OrderedDict]: + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) + + # Update semantic types and prepare it for predicted targets. + semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set(["https://metadata.datadrivendiscovery.org/types/TrueTarget","https://metadata.datadrivendiscovery.org/types/SuggestedTarget",]) + add_semantic_types = set(["https://metadata.datadrivendiscovery.org/types/PredictedTarget",]) + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + outputs = d3m_dataframe(predictions, generate_metadata=False) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, self._target_columns_metadata) + return outputs + + + @classmethod + def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata): + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict() + semantic_types = [] + semantic_types.append('https://metadata.datadrivendiscovery.org/types/PredictedTarget') + column_name = outputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name") + if column_name is None: + column_name = "output_{}".format(column_index) + column_metadata["semantic_types"] = semantic_types + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + +SKBaggingClassifier.__doc__ = BaggingClassifier.__doc__ \ No newline at end of file diff --git a/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKBaggingRegressor.py b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKBaggingRegressor.py new file mode 100644 index 0000000..7a62c7b --- /dev/null +++ b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKBaggingRegressor.py @@ -0,0 +1,533 @@ +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os +import sklearn +import numpy +import typing + +# Custom import commands if any +from sklearn.ensemble.bagging import BaggingRegressor + + +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base 
import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult, DockerContainer + +from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase +from d3m.primitive_interfaces.base import ProbabilisticCompositionalityMixin, ContinueFitMixin +from d3m import exceptions +import pandas + + + +Inputs = d3m_dataframe +Outputs = d3m_dataframe + + +class Params(params.Params): + estimators_: Optional[List[sklearn.tree.DecisionTreeRegressor]] + estimators_features_: Optional[List[ndarray]] + oob_score_: Optional[float] + oob_prediction_: Optional[ndarray] + base_estimator_: Optional[object] + n_features_: Optional[int] + _max_features: Optional[int] + _max_samples: Optional[int] + _n_samples: Optional[int] + _seeds: Optional[ndarray] + estimator_params: Optional[tuple] + input_column_names: Optional[Any] + target_names_: Optional[Sequence[Any]] + training_indices_: Optional[Sequence[int]] + target_column_indices_: Optional[Sequence[int]] + target_columns_metadata_: Optional[List[OrderedDict]] + + + +class Hyperparams(hyperparams.Hyperparams): + base_estimator = hyperparams.Constant( + default=None, + description='The base estimator to fit on random subsets of the dataset. If None, then the base estimator is a decision tree.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + n_estimators = hyperparams.Bounded[int]( + default=10, + lower=1, + upper=None, + description='The number of base estimators in the ensemble.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + max_samples = hyperparams.Union( + configuration=OrderedDict({ + 'absolute': hyperparams.Bounded[int]( + lower=0, + upper=None, + default=0, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'percent': hyperparams.Bounded[float]( + default=1.0, + lower=0, + upper=1, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='percent', + description='The number of samples to draw from X to train each base estimator. - If int, then draw `max_samples` samples. - If float, then draw `max_samples * X.shape[0]` samples.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + max_features = hyperparams.Union( + configuration=OrderedDict({ + 'absolute': hyperparams.Bounded[int]( + lower=0, + upper=None, + default=0, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'percent': hyperparams.Bounded[float]( + default=1.0, + lower=0, + upper=1, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='percent', + description='The number of features to draw from X to train each base estimator. - If int, then draw `max_features` features. - If float, then draw `max_features * X.shape[1]` features.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + bootstrap = hyperparams.Enumeration[str]( + values=['bootstrap', 'bootstrap_with_oob_score', 'disabled'], + default='bootstrap', + description='Whether bootstrap samples are used when building trees.' 
+ ' And whether to use out-of-bag samples to estimate the generalization accuracy.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + bootstrap_features = hyperparams.UniformBool( + default=False, + description='Whether features are drawn with replacement.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + warm_start = hyperparams.UniformBool( + default=False, + description='When set to True, reuse the solution of the previous call to fit and add more estimators to the ensemble, otherwise, just fit a whole new ensemble. See :term:`the Glossary `.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + n_jobs = hyperparams.Union( + configuration=OrderedDict({ + 'limit': hyperparams.Bounded[int]( + default=1, + lower=1, + upper=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'all_cores': hyperparams.Constant( + default=-1, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='limit', + description='The number of jobs to run in parallel for both `fit` and `predict`. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ResourcesUseParameter'] + ) + + use_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to use as training input. If any specified column cannot be parsed, it is skipped.", + ) + use_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to use as training target. If any specified column cannot be parsed, it is skipped.", + ) + exclude_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not use as training inputs. Applicable only if \"use_columns\" is not provided.", + ) + exclude_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not use as training target. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='new', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. 
Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute', 'https://metadata.datadrivendiscovery.org/types/PredictedTarget'], + default='https://metadata.datadrivendiscovery.org/types/PredictedTarget', + description='Decides what semantic type to attach to generated output', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + +class SKBaggingRegressor(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]): + """ + Primitive wrapping for sklearn BaggingRegressor + `sklearn documentation `_ + + """ + + __author__ = "JPL MARVIN" + metadata = metadata_base.PrimitiveMetadata({ + "algorithm_types": [metadata_base.PrimitiveAlgorithmType.ENSEMBLE_LEARNING, ], + "name": "sklearn.ensemble.bagging.BaggingRegressor", + "primitive_family": metadata_base.PrimitiveFamily.REGRESSION, + "python_path": "d3m.primitives.regression.bagging.SKlearn", + "source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingRegressor.html']}, + "version": "2019.11.13", + "id": "0dbc4b6d-aa57-4f11-ab18-36125880151b", + 'installation': [ + {'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + }] + }) + + def __init__(self, *, + hyperparams: Hyperparams, + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None, + _verbose: int = 0) -> None: + + super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) + + # False + self._clf = BaggingRegressor( + base_estimator=self.hyperparams['base_estimator'], + n_estimators=self.hyperparams['n_estimators'], + max_samples=self.hyperparams['max_samples'], + max_features=self.hyperparams['max_features'], + bootstrap=self.hyperparams['bootstrap'] in ['bootstrap', 'bootstrap_with_oob_score'], + bootstrap_features=self.hyperparams['bootstrap_features'], + oob_score=self.hyperparams['bootstrap'] in ['bootstrap_with_oob_score'], + warm_start=self.hyperparams['warm_start'], + n_jobs=self.hyperparams['n_jobs'], + random_state=self.random_seed, + verbose=_verbose + ) + + self._inputs = None + self._outputs = None + self._training_inputs = None + self._training_outputs = None + self._target_names = None + self._training_indices = None + self._target_column_indices = None + self._target_columns_metadata: List[OrderedDict] = None + self._input_column_names = None + self._fitted = False + 
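+        # Explanatory note (editor-added comment): _fitted and _new_training_data gate retraining;
+        # set_training_data() resets _fitted and marks new data, and fit() returns immediately
+        # unless _new_training_data is True.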
self._new_training_data = False + + def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None: + self._inputs = inputs + self._outputs = outputs + self._fitted = False + self._new_training_data = True + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + if self._inputs is None or self._outputs is None: + raise ValueError("Missing training data.") + + if not self._new_training_data: + return CallResult(None) + self._new_training_data = False + + self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams) + self._training_outputs, self._target_names, self._target_column_indices = self._get_targets(self._outputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + if len(self._training_indices) > 0 and len(self._target_column_indices) > 0: + self._target_columns_metadata = self._get_target_columns_metadata(self._training_outputs.metadata, self.hyperparams) + sk_training_output = self._training_outputs.values + + shape = sk_training_output.shape + if len(shape) == 2 and shape[1] == 1: + sk_training_output = numpy.ravel(sk_training_output) + + self._clf.fit(self._training_inputs, sk_training_output) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + return CallResult(None) + + + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + sk_inputs, columns_to_use = self._get_columns_to_fit(inputs, self.hyperparams) + output = [] + if len(sk_inputs.columns): + try: + sk_output = self._clf.predict(sk_inputs) + except sklearn.exceptions.NotFittedError as error: + raise PrimitiveNotFittedError("Primitive not fitted.") from error + # For primitives that allow predicting without fitting like GaussianProcessRegressor + if not self._fitted: + raise PrimitiveNotFittedError("Primitive not fitted.") + if sparse.issparse(sk_output): + sk_output = sk_output.toarray() + output = self._wrap_predictions(inputs, sk_output) + output.columns = self._target_names + output = [output] + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._target_column_indices, + columns_list=output) + + return CallResult(outputs) + + + def get_params(self) -> Params: + if not self._fitted: + return Params( + estimators_=None, + estimators_features_=None, + oob_score_=None, + oob_prediction_=None, + base_estimator_=None, + n_features_=None, + _max_features=None, + _max_samples=None, + _n_samples=None, + _seeds=None, + estimator_params=None, + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + return Params( + estimators_=getattr(self._clf, 'estimators_', None), + estimators_features_=getattr(self._clf, 'estimators_features_', None), + oob_score_=getattr(self._clf, 'oob_score_', None), + oob_prediction_=getattr(self._clf, 'oob_prediction_', None), + base_estimator_=getattr(self._clf, 'base_estimator_', None), + n_features_=getattr(self._clf, 
'n_features_', None), + _max_features=getattr(self._clf, '_max_features', None), + _max_samples=getattr(self._clf, '_max_samples', None), + _n_samples=getattr(self._clf, '_n_samples', None), + _seeds=getattr(self._clf, '_seeds', None), + estimator_params=getattr(self._clf, 'estimator_params', None), + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + def set_params(self, *, params: Params) -> None: + self._clf.estimators_ = params['estimators_'] + self._clf.estimators_features_ = params['estimators_features_'] + self._clf.oob_score_ = params['oob_score_'] + self._clf.oob_prediction_ = params['oob_prediction_'] + self._clf.base_estimator_ = params['base_estimator_'] + self._clf.n_features_ = params['n_features_'] + self._clf._max_features = params['_max_features'] + self._clf._max_samples = params['_max_samples'] + self._clf._n_samples = params['_n_samples'] + self._clf._seeds = params['_seeds'] + self._clf.estimator_params = params['estimator_params'] + self._input_column_names = params['input_column_names'] + self._training_indices = params['training_indices_'] + self._target_names = params['target_names_'] + self._target_column_indices = params['target_column_indices_'] + self._target_columns_metadata = params['target_columns_metadata_'] + + if params['estimators_'] is not None: + self._fitted = True + if params['estimators_features_'] is not None: + self._fitted = True + if params['oob_score_'] is not None: + self._fitted = True + if params['oob_prediction_'] is not None: + self._fitted = True + if params['base_estimator_'] is not None: + self._fitted = True + if params['n_features_'] is not None: + self._fitted = True + if params['_max_features'] is not None: + self._fitted = True + if params['_max_samples'] is not None: + self._fitted = True + if params['_n_samples'] is not None: + self._fitted = True + if params['_seeds'] is not None: + self._fitted = True + if params['estimator_params'] is not None: + self._fitted = True + + + + + + + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=hyperparams['use_inputs_columns'], + exclude_columns=hyperparams['exclude_inputs_columns'], + can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, numpy.integer, numpy.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") 
+ return False + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + @classmethod + def _get_targets(cls, data: d3m_dataframe, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return data, list(data.columns), list(range(len(data.columns))) + + metadata = data.metadata + + def can_produce_column(column_index: int) -> bool: + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/TrueTarget") + column_metadata = metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + semantic_types = set(column_metadata.get('semantic_types', [])) + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + return False + + target_column_indices, target_columns_not_to_produce = base_utils.get_columns_to_use(metadata, + use_columns=hyperparams[ + 'use_outputs_columns'], + exclude_columns= + hyperparams[ + 'exclude_outputs_columns'], + can_use_column=can_produce_column) + targets = [] + if target_column_indices: + targets = data.select_columns(target_column_indices) + target_column_names = [] + for idx in target_column_indices: + target_column_names.append(data.columns[idx]) + return targets, target_column_names, target_column_indices + + @classmethod + def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]: + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) + + # Update semantic types and prepare it for predicted targets. 
+ semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set(["https://metadata.datadrivendiscovery.org/types/TrueTarget","https://metadata.datadrivendiscovery.org/types/SuggestedTarget",]) + add_semantic_types = set(["https://metadata.datadrivendiscovery.org/types/PredictedTarget",]) + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + outputs = d3m_dataframe(predictions, generate_metadata=False) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, self._target_columns_metadata) + return outputs + + + @classmethod + def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata): + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict() + semantic_types = [] + semantic_types.append('https://metadata.datadrivendiscovery.org/types/PredictedTarget') + column_name = outputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name") + if column_name is None: + column_name = "output_{}".format(column_index) + column_metadata["semantic_types"] = semantic_types + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + +SKBaggingRegressor.__doc__ = BaggingRegressor.__doc__ \ No newline at end of file diff --git a/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKBernoulliNB.py b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKBernoulliNB.py new file mode 100644 index 0000000..40dde7e --- /dev/null +++ b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKBernoulliNB.py @@ -0,0 +1,508 @@ +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os +import sklearn +import numpy +import typing + +# Custom import commands if any +from sklearn.naive_bayes import BernoulliNB + + +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult, DockerContainer + +from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase +from d3m.primitive_interfaces.base import ProbabilisticCompositionalityMixin, ContinueFitMixin +from d3m import exceptions +import pandas + + + +Inputs = 
d3m_dataframe +Outputs = d3m_dataframe + + +class Params(params.Params): + class_log_prior_: Optional[ndarray] + feature_log_prob_: Optional[ndarray] + class_count_: Optional[ndarray] + feature_count_: Optional[ndarray] + classes_: Optional[ndarray] + input_column_names: Optional[Any] + target_names_: Optional[Sequence[Any]] + training_indices_: Optional[Sequence[int]] + target_column_indices_: Optional[Sequence[int]] + target_columns_metadata_: Optional[List[OrderedDict]] + + + +class Hyperparams(hyperparams.Hyperparams): + alpha = hyperparams.Bounded[float]( + default=1, + lower=0, + upper=None, + description='Additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing).', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + binarize = hyperparams.Union( + configuration=OrderedDict({ + 'float': hyperparams.Bounded[float]( + lower=0, + upper=None, + default=0, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'none': hyperparams.Constant( + default=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='float', + description='Threshold for binarizing (mapping to booleans) of sample features. If None, input is presumed to already consist of binary vectors.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + fit_prior = hyperparams.UniformBool( + default=True, + description='Whether to learn class prior probabilities or not. If false, a uniform prior will be used.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + use_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to use as training input. If any specified column cannot be parsed, it is skipped.", + ) + use_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to use as training target. If any specified column cannot be parsed, it is skipped.", + ) + exclude_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not use as training inputs. Applicable only if \"use_columns\" is not provided.", + ) + exclude_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not use as training target. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='new', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? 
This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute', 'https://metadata.datadrivendiscovery.org/types/PredictedTarget'], + default='https://metadata.datadrivendiscovery.org/types/PredictedTarget', + description='Decides what semantic type to attach to generated output', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + +class SKBernoulliNB(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams], + ProbabilisticCompositionalityMixin[Inputs, Outputs, Params, Hyperparams], + ContinueFitMixin[Inputs, Outputs, Params, Hyperparams]): + """ + Primitive wrapping for sklearn BernoulliNB + `sklearn documentation `_ + + """ + + __author__ = "JPL MARVIN" + metadata = metadata_base.PrimitiveMetadata({ + "algorithm_types": [metadata_base.PrimitiveAlgorithmType.NAIVE_BAYES_CLASSIFIER, ], + "name": "sklearn.naive_bayes.BernoulliNB", + "primitive_family": metadata_base.PrimitiveFamily.CLASSIFICATION, + "python_path": "d3m.primitives.classification.bernoulli_naive_bayes.SKlearn", + "source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.BernoulliNB.html']}, + "version": "2019.11.13", + "id": "dfb1004e-02ac-3399-ba57-8a95639312cd", + "hyperparams_to_tune": ['alpha', 'binarize', 'fit_prior'], + 'installation': [ + {'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + }] + }) + + def __init__(self, *, + hyperparams: Hyperparams, + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None) -> None: + + super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) + + # False + self._clf = BernoulliNB( + alpha=self.hyperparams['alpha'], + binarize=self.hyperparams['binarize'], + fit_prior=self.hyperparams['fit_prior'], + ) + + self._inputs = None + self._outputs = None + self._training_inputs = None + self._training_outputs = None + self._target_names = None + self._training_indices = None + self._target_column_indices = None + self._target_columns_metadata: List[OrderedDict] = None + 
self._input_column_names = None + self._fitted = False + self._new_training_data = False + + def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None: + self._inputs = inputs + self._outputs = outputs + self._fitted = False + self._new_training_data = True + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + if self._inputs is None or self._outputs is None: + raise ValueError("Missing training data.") + + if not self._new_training_data: + return CallResult(None) + self._new_training_data = False + + self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams) + self._training_outputs, self._target_names, self._target_column_indices = self._get_targets(self._outputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + if len(self._training_indices) > 0 and len(self._target_column_indices) > 0: + self._target_columns_metadata = self._get_target_columns_metadata(self._training_outputs.metadata, self.hyperparams) + sk_training_output = self._training_outputs.values + + shape = sk_training_output.shape + if len(shape) == 2 and shape[1] == 1: + sk_training_output = numpy.ravel(sk_training_output) + + self._clf.fit(self._training_inputs, sk_training_output) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + return CallResult(None) + + def continue_fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + if self._training_inputs is None or self._training_outputs is None: + raise ValueError("Missing training data.") + + if not self._new_training_data: + return CallResult(None) + self._new_training_data = False + + if len(self._training_indices) > 0 and len(self._target_column_indices) > 0: + self._target_columns_metadata = self._get_target_columns_metadata(self._training_outputs.metadata, self.hyperparams) + sk_training_output = self._training_outputs.values + + shape = sk_training_output.shape + if len(shape) == 2 and shape[1] == 1: + sk_training_output = numpy.ravel(sk_training_output) + + self._clf.partial_fit(self._training_inputs, sk_training_output) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + return CallResult(None) + + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + sk_inputs, columns_to_use = self._get_columns_to_fit(inputs, self.hyperparams) + output = [] + if len(sk_inputs.columns): + try: + sk_output = self._clf.predict(sk_inputs) + except sklearn.exceptions.NotFittedError as error: + raise PrimitiveNotFittedError("Primitive not fitted.") from error + # For primitives that allow predicting without fitting like GaussianProcessRegressor + if not self._fitted: + raise PrimitiveNotFittedError("Primitive not fitted.") + if sparse.issparse(sk_output): + sk_output = sk_output.toarray() + output = self._wrap_predictions(inputs, sk_output) + output.columns = self._target_names + output = [output] + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + 
inputs=inputs, column_indices=self._target_column_indices, + columns_list=output) + + return CallResult(outputs) + + + def get_params(self) -> Params: + if not self._fitted: + return Params( + class_log_prior_=None, + feature_log_prob_=None, + class_count_=None, + feature_count_=None, + classes_=None, + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + return Params( + class_log_prior_=getattr(self._clf, 'class_log_prior_', None), + feature_log_prob_=getattr(self._clf, 'feature_log_prob_', None), + class_count_=getattr(self._clf, 'class_count_', None), + feature_count_=getattr(self._clf, 'feature_count_', None), + classes_=getattr(self._clf, 'classes_', None), + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + def set_params(self, *, params: Params) -> None: + self._clf.class_log_prior_ = params['class_log_prior_'] + self._clf.feature_log_prob_ = params['feature_log_prob_'] + self._clf.class_count_ = params['class_count_'] + self._clf.feature_count_ = params['feature_count_'] + self._clf.classes_ = params['classes_'] + self._input_column_names = params['input_column_names'] + self._training_indices = params['training_indices_'] + self._target_names = params['target_names_'] + self._target_column_indices = params['target_column_indices_'] + self._target_columns_metadata = params['target_columns_metadata_'] + + if params['class_log_prior_'] is not None: + self._fitted = True + if params['feature_log_prob_'] is not None: + self._fitted = True + if params['class_count_'] is not None: + self._fitted = True + if params['feature_count_'] is not None: + self._fitted = True + if params['classes_'] is not None: + self._fitted = True + + + def log_likelihoods(self, *, + outputs: Outputs, + inputs: Inputs, + timeout: float = None, + iterations: int = None) -> CallResult[Sequence[float]]: + inputs = inputs.iloc[:, self._training_indices] # Get ndarray + outputs = outputs.iloc[:, self._target_column_indices] + + if len(inputs.columns) and len(outputs.columns): + + if outputs.shape[1] != self._clf.n_outputs_: + raise exceptions.InvalidArgumentValueError("\"outputs\" argument does not have the correct number of target columns.") + + log_proba = self._clf.predict_log_proba(inputs) + + # Making it always a list, even when only one target. + if self._clf.n_outputs_ == 1: + log_proba = [log_proba] + classes = [self._clf.classes_] + else: + classes = self._clf.classes_ + + samples_length = inputs.shape[0] + + log_likelihoods = [] + for k in range(self._clf.n_outputs_): + # We have to map each class to its internal (numerical) index used in the learner. + # This allows "outputs" to contain string classes. + outputs_column = outputs.iloc[:, k] + classes_map = pandas.Series(numpy.arange(len(classes[k])), index=classes[k]) + mapped_outputs_column = outputs_column.map(classes_map) + + # For each target column (column in "outputs"), for each sample (row) we pick the log + # likelihood for a given class. 
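+ # log_proba[k] has shape (n_samples, n_classes); indexing with (row positions, mapped class indices) picks one log-likelihood per sample.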
+ log_likelihoods.append(log_proba[k][numpy.arange(samples_length), mapped_outputs_column]) + + results = d3m_dataframe(dict(enumerate(log_likelihoods)), generate_metadata=True) + results.columns = outputs.columns + + for k in range(self._clf.n_outputs_): + column_metadata = outputs.metadata.query_column(k) + if 'name' in column_metadata: + results.metadata = results.metadata.update_column(k, {'name': column_metadata['name']}) + + else: + results = d3m_dataframe(generate_metadata=True) + + return CallResult(results) + + + + + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=hyperparams['use_inputs_columns'], + exclude_columns=hyperparams['exclude_inputs_columns'], + can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, numpy.integer, numpy.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + @classmethod + def _get_targets(cls, data: d3m_dataframe, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return data, list(data.columns), list(range(len(data.columns))) + + metadata = data.metadata + + def can_produce_column(column_index: int) -> bool: + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/TrueTarget") + column_metadata = metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + semantic_types = set(column_metadata.get('semantic_types', [])) + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + return False + + target_column_indices, target_columns_not_to_produce = base_utils.get_columns_to_use(metadata, + use_columns=hyperparams[ + 'use_outputs_columns'], + exclude_columns= + hyperparams[ + 'exclude_outputs_columns'], + can_use_column=can_produce_column) + targets = [] + if target_column_indices: + targets = data.select_columns(target_column_indices) + target_column_names = [] + for idx in target_column_indices: + target_column_names.append(data.columns[idx]) + return targets, target_column_names, target_column_indices + + @classmethod + def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, 
hyperparams) -> List[OrderedDict]: + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) + + # Update semantic types and prepare it for predicted targets. + semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set(["https://metadata.datadrivendiscovery.org/types/TrueTarget","https://metadata.datadrivendiscovery.org/types/SuggestedTarget",]) + add_semantic_types = set(["https://metadata.datadrivendiscovery.org/types/PredictedTarget",]) + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + outputs = d3m_dataframe(predictions, generate_metadata=False) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, self._target_columns_metadata) + return outputs + + + @classmethod + def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata): + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict() + semantic_types = [] + semantic_types.append('https://metadata.datadrivendiscovery.org/types/PredictedTarget') + column_name = outputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name") + if column_name is None: + column_name = "output_{}".format(column_index) + column_metadata["semantic_types"] = semantic_types + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + +SKBernoulliNB.__doc__ = BernoulliNB.__doc__ \ No newline at end of file diff --git a/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKBinarizer.py b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKBinarizer.py new file mode 100644 index 0000000..7d1166e --- /dev/null +++ b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKBinarizer.py @@ -0,0 +1,330 @@ +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os +import sklearn +import numpy +import typing + +# Custom import commands if any +from sklearn.preprocessing.data import Binarizer + + +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base import utils as base_utils +from 
d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult, DockerContainer +from d3m.primitive_interfaces.unsupervised_learning import UnsupervisedLearnerPrimitiveBase + + +Inputs = d3m_dataframe +Outputs = d3m_dataframe + + +class Params(params.Params): + input_column_names: Optional[Any] + target_names_: Optional[Sequence[Any]] + training_indices_: Optional[Sequence[int]] + target_column_indices_: Optional[Sequence[int]] + target_columns_metadata_: Optional[List[OrderedDict]] + + + +class Hyperparams(hyperparams.Hyperparams): + threshold = hyperparams.Bounded[float]( + default=0.0, + lower=0.0, + upper=None, + description='Feature values below or equal to this are replaced by 0, above it by 1. Threshold may not be less than 0 for operations on sparse matrices.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + use_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to operate on. If any specified column cannot be parsed, it is skipped.", + ) + exclude_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='new', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. 
To prevent pipelines from breaking set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute'], + default='https://metadata.datadrivendiscovery.org/types/Attribute', + description='Decides what semantic type to attach to generated attributes', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + +class SKBinarizer(UnsupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]): + """ + Primitive wrapping for sklearn Binarizer + `sklearn documentation `_ + + """ + + __author__ = "JPL MARVIN" + metadata = metadata_base.PrimitiveMetadata({ + "algorithm_types": [metadata_base.PrimitiveAlgorithmType.FEATURE_SCALING, ], + "name": "sklearn.preprocessing.data.Binarizer", + "primitive_family": metadata_base.PrimitiveFamily.DATA_PREPROCESSING, + "python_path": "d3m.primitives.data_preprocessing.binarizer.SKlearn", + "source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.Binarizer.html']}, + "version": "2019.11.13", + "id": "13777068-9dc0-3c5b-b4da-99350d67ee3f", + "hyperparams_to_tune": ['threshold'], + 'installation': [ + {'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + }] + }) + + def __init__(self, *, + hyperparams: Hyperparams, + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None) -> None: + + super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) + + # False + self._clf = Binarizer( + threshold=self.hyperparams['threshold'], + ) + + self._inputs = None + self._outputs = None + self._training_inputs = None + self._training_outputs = None + self._target_names = None + self._training_indices = None + self._target_column_indices = None + self._target_columns_metadata: List[OrderedDict] = None + self._input_column_names = None + self._fitted = False + + + def set_training_data(self, *, inputs: Inputs) -> None: + self._inputs = inputs + self._fitted = False + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + if self._fitted: + return CallResult(None) + + self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + if self._training_inputs is None: + return CallResult(None) + + if len(self._training_indices) > 0: + self._clf.fit(self._training_inputs) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + return CallResult(None) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + if not self._fitted: + raise PrimitiveNotFittedError("Primitive not fitted.") + sk_inputs = inputs + if self.hyperparams['use_semantic_types']: + sk_inputs = inputs.iloc[:, self._training_indices] + output_columns = [] + if len(self._training_indices) > 0: + sk_output = self._clf.transform(sk_inputs) + if sparse.issparse(sk_output): + sk_output = sk_output.toarray() + 
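+ # Wrap the (densified) transform output back into a D3M DataFrame; the original column names are restored below when the column count is unchanged.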
outputs = self._wrap_predictions(inputs, sk_output) + if len(outputs.columns) == len(self._input_column_names): + outputs.columns = self._input_column_names + output_columns = [outputs] + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._training_indices, + columns_list=output_columns) + return CallResult(outputs) + + + def get_params(self) -> Params: + if not self._fitted: + return Params( + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + return Params( + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + def set_params(self, *, params: Params) -> None: + self._input_column_names = params['input_column_names'] + self._training_indices = params['training_indices_'] + self._target_names = params['target_names_'] + self._target_column_indices = params['target_column_indices_'] + self._target_columns_metadata = params['target_columns_metadata_'] + self._fitted = True + + + + + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=hyperparams['use_columns'], + exclude_columns=hyperparams['exclude_columns'], + can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, numpy.integer, numpy.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + + @classmethod + def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]: + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) + + # Update semantic 
types and prepare it for predicted targets. + semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set([]) + add_semantic_types = [] + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + outputs = d3m_dataframe(predictions, generate_metadata=True) + target_columns_metadata = self._copy_inputs_metadata(inputs.metadata, self._training_indices, outputs.metadata, self.hyperparams) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, target_columns_metadata) + return outputs + + + @classmethod + def _copy_inputs_metadata(cls, inputs_metadata: metadata_base.DataMetadata, input_indices: List[int], + outputs_metadata: metadata_base.DataMetadata, hyperparams): + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + target_columns_metadata: List[OrderedDict] = [] + for column_index in input_indices: + column_name = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name") + if column_name is None: + column_name = "output_{}".format(column_index) + + column_metadata = OrderedDict(inputs_metadata.query_column(column_index)) + semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set([]) + add_semantic_types = set() + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + # If outputs has more columns than index, add Attribute Type to all remaining + if outputs_length > len(input_indices): + for column_index in range(len(input_indices), outputs_length): + column_metadata = OrderedDict() + semantic_types = set() + semantic_types.add(hyperparams["return_semantic_type"]) + column_name = "output_{}".format(column_index) + column_metadata["semantic_types"] = list(semantic_types) + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + +SKBinarizer.__doc__ = Binarizer.__doc__ \ No newline at end of file diff --git a/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKCountVectorizer.py b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKCountVectorizer.py new file mode 100644 index 0000000..264c92f --- /dev/null +++ b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKCountVectorizer.py @@ -0,0 +1,490 @@ +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple +from numpy import ndarray +from collections 
import OrderedDict +from scipy import sparse +import os +import sklearn +import numpy +import typing + +# Custom import commands if any +from sklearn.feature_extraction.text import CountVectorizer + + +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult, DockerContainer +from d3m.primitive_interfaces.unsupervised_learning import UnsupervisedLearnerPrimitiveBase +from d3m.metadata.base import ALL_ELEMENTS + + +Inputs = d3m_dataframe +Outputs = d3m_dataframe + + +class Params(params.Params): + vocabulary_: Optional[Sequence[dict]] + stop_words_: Optional[Sequence[set]] + fixed_vocabulary_: Optional[Sequence[bool]] + _stop_words_id: Optional[Sequence[int]] + target_names_: Optional[Sequence[Any]] + training_indices_: Optional[Sequence[int]] + + +class Hyperparams(hyperparams.Hyperparams): + strip_accents = hyperparams.Union( + configuration=OrderedDict({ + 'accents': hyperparams.Enumeration[str]( + default='ascii', + values=['ascii', 'unicode'], + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'none': hyperparams.Constant( + default=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='none', + description='Remove accents during the preprocessing step. \'ascii\' is a fast method that only works on characters that have an direct ASCII mapping. \'unicode\' is a slightly slower method that works on any characters. None (default) does nothing.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + analyzer = hyperparams.Enumeration[str]( + default='word', + values=['word', 'char', 'char_wb'], + description='Whether the feature should be made of word or character n-grams. Option \'char_wb\' creates character n-grams only from text inside word boundaries; n-grams at the edges of words are padded with space. If a callable is passed it is used to extract the sequence of features out of the raw, unprocessed input.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + ngram_range = hyperparams.SortedList( + elements=hyperparams.Bounded[int](1, None, 1), + default=(1, 1), + min_size=2, + max_size=2, + description='The lower and upper boundary of the range of n-values for different n-grams to be extracted. All values of n such that min_n <= n <= max_n will be used.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + stop_words = hyperparams.Union( + configuration=OrderedDict({ + 'string': hyperparams.Hyperparameter[str]( + default='english', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'list': hyperparams.List( + elements=hyperparams.Hyperparameter[str](''), + default=[], + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'none': hyperparams.Constant( + default=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='none', + description='If \'english\', a built-in stop word list for English is used. If a list, that list is assumed to contain stop words, all of which will be removed from the resulting tokens. Only applies if ``analyzer == \'word\'``. 
If None, no stop words will be used. max_df can be set to a value in the range [0.7, 1.0) to automatically detect and filter stop words based on intra corpus document frequency of terms.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + lowercase = hyperparams.UniformBool( + default=True, + description='Convert all characters to lowercase before tokenizing.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + token_pattern = hyperparams.Hyperparameter[str]( + default='(?u)\\b\w\w+\\b', + description='Regular expression denoting what constitutes a "token", only used if ``analyzer == \'word\'``. The default regexp select tokens of 2 or more alphanumeric characters (punctuation is completely ignored and always treated as a token separator).', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + max_df = hyperparams.Union( + configuration=OrderedDict({ + 'proportion': hyperparams.Bounded[float]( + default=1.0, + lower=0.0, + upper=1.0, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'absolute': hyperparams.Bounded[int]( + default=1, + lower=0, + upper=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='proportion', + description='When building the vocabulary ignore terms that have a document frequency strictly higher than the given threshold (corpus-specific stop words). If float, the parameter represents a proportion of documents, integer absolute counts. This parameter is ignored if vocabulary is not None.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + min_df = hyperparams.Union( + configuration=OrderedDict({ + 'proportion': hyperparams.Bounded[float]( + default=1.0, + lower=0.0, + upper=1.0, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'absolute': hyperparams.Bounded[int]( + default=1, + lower=0, + upper=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='absolute', + description='When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold. This value is also called cut-off in the literature. If float, the parameter represents a proportion of documents, integer absolute counts. This parameter is ignored if vocabulary is not None.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + max_features = hyperparams.Union( + configuration=OrderedDict({ + 'absolute': hyperparams.Bounded[int]( + default=1, + lower=0, + upper=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'none': hyperparams.Constant( + default=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='none', + description='If not None, build a vocabulary that only consider the top max_features ordered by term frequency across the corpus. This parameter is ignored if vocabulary is not None.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + binary = hyperparams.UniformBool( + default=False, + description='If True, all non zero counts are set to 1. 
This is useful for discrete probabilistic models that model binary events rather than integer counts.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + use_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to operate on. If any specified column cannot be parsed, it is skipped.", + ) + exclude_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='new', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. 
To prevent pipelines from breaking set this to False.", + ) + + + +class SKCountVectorizer(UnsupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]): + """ + Primitive wrapping for sklearn CountVectorizer + `sklearn documentation `_ + + """ + + __author__ = "JPL MARVIN" + metadata = metadata_base.PrimitiveMetadata({ + "algorithm_types": [metadata_base.PrimitiveAlgorithmType.MINIMUM_REDUNDANCY_FEATURE_SELECTION, ], + "name": "sklearn.feature_extraction.text.CountVectorizer", + "primitive_family": metadata_base.PrimitiveFamily.DATA_PREPROCESSING, + "python_path": "d3m.primitives.data_preprocessing.count_vectorizer.SKlearn", + "source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.CountVectorizer.html']}, + "version": "2019.11.13", + "id": "0609859b-8ed9-397f-ac7a-7c4f63863560", + "hyperparams_to_tune": ['max_df', 'min_df'], + 'installation': [ + {'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + }] + }) + + def __init__(self, *, + hyperparams: Hyperparams, + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None) -> None: + + super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) + + # True + + self._clf = list() + + self._training_inputs = None + self._target_names = None + self._training_indices = None + self._fitted = False + + + def set_training_data(self, *, inputs: Inputs) -> None: + self._inputs = inputs + self._fitted = False + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + if self._fitted: + return CallResult(None) + + self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams) + + if self._training_inputs is None: + raise ValueError("Missing training data.") + + if len(self._training_indices) > 0: + for column_index in range(len(self._training_inputs.columns)): + clf = self._create_new_sklearn_estimator() + clf.fit(self._training_inputs.iloc[:, column_index]) + self._clf.append(clf) + + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + return CallResult(None) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + if not self._fitted: + raise PrimitiveNotFittedError("Primitive not fitted.") + sk_inputs = inputs + if self.hyperparams['use_semantic_types']: + sk_inputs, training_indices = self._get_columns_to_fit(inputs, self.hyperparams) + else: + training_indices = list(range(len(inputs))) + + # Iterating over all estimators and call transform on them. + # No. 
of estimators should be equal to the number of columns in the input + if len(self._clf) != len(sk_inputs.columns): + raise RuntimeError("Input data does not have the same number of columns as training data") + outputs = [] + if len(self._training_indices) > 0: + for column_index in range(len(sk_inputs.columns)): + clf = self._clf[column_index] + output = clf.transform(sk_inputs.iloc[:, column_index]) + column_name = sk_inputs.columns[column_index] + + if sparse.issparse(output): + output = output.toarray() + output = self._wrap_predictions(inputs, output) + + # Updating column names. + output.columns = map(lambda x: "{}_{}".format(column_name, x), clf.get_feature_names()) + for i, name in enumerate(clf.get_feature_names()): + output.metadata = output.metadata.update((ALL_ELEMENTS, i), {'name': name}) + + outputs.append(output) + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._training_indices, + columns_list=outputs) + + return CallResult(outputs) + + + def get_params(self) -> Params: + if not self._fitted: + return Params( + vocabulary_=None, + stop_words_=None, + fixed_vocabulary_=None, + _stop_words_id=None, + training_indices_=self._training_indices, + target_names_=self._target_names + ) + + return Params( + vocabulary_=list(map(lambda clf: getattr(clf, 'vocabulary_', None), self._clf)), + stop_words_=list(map(lambda clf: getattr(clf, 'stop_words_', None), self._clf)), + fixed_vocabulary_=list(map(lambda clf: getattr(clf, 'fixed_vocabulary_', None), self._clf)), + _stop_words_id=list(map(lambda clf: getattr(clf, '_stop_words_id', None), self._clf)), + training_indices_=self._training_indices, + target_names_=self._target_names + ) + + def set_params(self, *, params: Params) -> None: + for param, val in params.items(): + if val is not None and param not in ['target_names_', 'training_indices_']: + self._clf = list(map(lambda x: self._create_new_sklearn_estimator(), val)) + break + for index in range(len(self._clf)): + for param, val in params.items(): + if val is not None: + setattr(self._clf[index], param, val[index]) + else: + setattr(self._clf[index], param, None) + self._training_indices = params['training_indices_'] + self._target_names = params['target_names_'] + self._fitted = False + + if params['vocabulary_'] is not None: + self._fitted = True + if params['stop_words_'] is not None: + self._fitted = True + if params['fixed_vocabulary_'] is not None: + self._fitted = True + if params['_stop_words_id'] is not None: + self._fitted = True + + def _create_new_sklearn_estimator(self): + clf = CountVectorizer( + strip_accents=self.hyperparams['strip_accents'], + analyzer=self.hyperparams['analyzer'], + ngram_range=self.hyperparams['ngram_range'], + stop_words=self.hyperparams['stop_words'], + lowercase=self.hyperparams['lowercase'], + token_pattern=self.hyperparams['token_pattern'], + max_df=self.hyperparams['max_df'], + min_df=self.hyperparams['min_df'], + max_features=self.hyperparams['max_features'], + binary=self.hyperparams['binary'], + ) + return clf + + + + + + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def 
can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=hyperparams['use_columns'], + exclude_columns=hyperparams['exclude_columns'], + can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (str,) + accepted_semantic_types = set(["http://schema.org/Text",]) + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + @classmethod + def _get_targets(cls, data: d3m_dataframe, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return data, list(data.columns), [] + target_names = [] + target_semantic_type = [] + target_column_indices = [] + metadata = data.metadata + target_column_indices.extend(metadata.get_columns_with_semantic_type('https://metadata.datadrivendiscovery.org/types/TrueTarget')) + + for column_index in target_column_indices: + if column_index is metadata_base.ALL_ELEMENTS: + continue + column_index = typing.cast(metadata_base.SimpleSelectorSegment, column_index) + column_metadata = metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + target_names.append(column_metadata.get('name', str(column_index))) + target_semantic_type.append(column_metadata.get('semantic_types', [])) + + targets = data.iloc[:, target_column_indices] + return targets, target_names, target_semantic_type + + @classmethod + def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]: + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) + + # Update semantic types and prepare it for predicted targets. 
+            semantic_types = set(column_metadata.get('semantic_types', []))
+            semantic_types_to_remove = set([])
+            add_semantic_types = set()
+            add_semantic_types.add(hyperparams["return_semantic_type"])
+            semantic_types = semantic_types - semantic_types_to_remove
+            semantic_types = semantic_types.union(add_semantic_types)
+            column_metadata['semantic_types'] = list(semantic_types)
+
+            target_columns_metadata.append(column_metadata)
+
+        return target_columns_metadata
+
+    @classmethod
+    def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs],
+                                     target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata:
+        outputs_metadata = metadata_base.DataMetadata().generate(value=outputs)
+
+        for column_index, column_metadata in enumerate(target_columns_metadata):
+            column_metadata.pop("structural_type", None)
+            outputs_metadata = outputs_metadata.update_column(column_index, column_metadata)
+
+        return outputs_metadata
+
+    def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs:
+        outputs = d3m_dataframe(predictions, generate_metadata=True)
+        target_columns_metadata = self._add_target_columns_metadata(outputs.metadata)
+        outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, target_columns_metadata)
+        return outputs
+
+
+    @classmethod
+    def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata):
+        outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length']
+
+        target_columns_metadata: List[OrderedDict] = []
+        for column_index in range(outputs_length):
+            column_metadata = OrderedDict()
+            semantic_types = []
+            semantic_types.append('https://metadata.datadrivendiscovery.org/types/Attribute')
+            column_name = outputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name")
+            if column_name is None:
+                column_name = "output_{}".format(column_index)
+            column_metadata["semantic_types"] = semantic_types
+            column_metadata["name"] = str(column_name)
+            target_columns_metadata.append(column_metadata)
+
+        return target_columns_metadata
+
+
+SKCountVectorizer.__doc__ = CountVectorizer.__doc__
\ No newline at end of file
diff --git a/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKDecisionTreeClassifier.py b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKDecisionTreeClassifier.py
new file mode 100644
index 0000000..46d060a
--- /dev/null
+++ b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKDecisionTreeClassifier.py
@@ -0,0 +1,621 @@
+from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple
+from numpy import ndarray
+from collections import OrderedDict
+from scipy import sparse
+import os
+import sklearn
+import numpy
+import typing
+
+# Custom import commands if any
+from sklearn.tree.tree import DecisionTreeClassifier
+import numpy
+
+
+from d3m.container.numpy import ndarray as d3m_ndarray
+from d3m.container import DataFrame as d3m_dataframe
+from d3m.metadata import hyperparams, params, base as metadata_base
+from d3m import utils
+from d3m.base import utils as base_utils
+from d3m.exceptions import PrimitiveNotFittedError
+from d3m.primitive_interfaces.base import CallResult, DockerContainer
+
+from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase
+from d3m.primitive_interfaces.base import ProbabilisticCompositionalityMixin, ContinueFitMixin
+from d3m import exceptions
+import pandas
+
+
+
+Inputs = d3m_dataframe
+Outputs = d3m_dataframe
+
+
+class Params(params.Params):
+
classes_: Optional[Union[ndarray, List[ndarray]]] + max_features_: Optional[int] + n_classes_: Optional[Union[numpy.int64, List[numpy.int64]]] + n_features_: Optional[int] + n_outputs_: Optional[int] + tree_: Optional[object] + input_column_names: Optional[Any] + target_names_: Optional[Sequence[Any]] + training_indices_: Optional[Sequence[int]] + target_column_indices_: Optional[Sequence[int]] + target_columns_metadata_: Optional[List[OrderedDict]] + + + +class Hyperparams(hyperparams.Hyperparams): + criterion = hyperparams.Enumeration[str]( + values=['gini', 'entropy'], + default='gini', + description='The function to measure the quality of a split. Supported criteria are "gini" for the Gini impurity and "entropy" for the information gain.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + splitter = hyperparams.Enumeration[str]( + values=['best', 'random'], + default='best', + description='The strategy used to choose the split at each node. Supported strategies are "best" to choose the best split and "random" to choose the best random split.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + max_depth = hyperparams.Union( + configuration=OrderedDict({ + 'int': hyperparams.Bounded[int]( + default=10, + lower=0, + upper=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'none': hyperparams.Constant( + default=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='none', + description='The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + min_samples_split = hyperparams.Union( + configuration=OrderedDict({ + 'absolute': hyperparams.Bounded[int]( + default=2, + lower=1, + upper=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'percent': hyperparams.Bounded[float]( + default=0.25, + lower=0, + upper=1, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='absolute', + description='The minimum number of samples required to split an internal node: - If int, then consider `min_samples_split` as the minimum number. - If float, then `min_samples_split` is a percentage and `ceil(min_samples_split * n_samples)` are the minimum number of samples for each split. .. versionchanged:: 0.18 Added float values for percentages.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + min_samples_leaf = hyperparams.Union( + configuration=OrderedDict({ + 'absolute': hyperparams.Bounded[int]( + default=1, + lower=1, + upper=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'percent': hyperparams.Bounded[float]( + default=0.25, + lower=0, + upper=0.5, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='absolute', + description='The minimum number of samples required to be at a leaf node: - If int, then consider `min_samples_leaf` as the minimum number. - If float, then `min_samples_leaf` is a percentage and `ceil(min_samples_leaf * n_samples)` are the minimum number of samples for each node. .. 
versionchanged:: 0.18 Added float values for percentages.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + min_weight_fraction_leaf = hyperparams.Bounded[float]( + default=0, + lower=0, + upper=0.5, + description='The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + max_leaf_nodes = hyperparams.Union( + configuration=OrderedDict({ + 'int': hyperparams.Bounded[int]( + lower=0, + upper=None, + default=0, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'none': hyperparams.Constant( + default=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='none', + description='Grow a tree with ``max_leaf_nodes`` in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + max_features = hyperparams.Union( + configuration=OrderedDict({ + 'specified_int': hyperparams.Bounded[int]( + lower=0, + upper=None, + default=0, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'calculated': hyperparams.Enumeration[str]( + values=['auto', 'sqrt', 'log2'], + default='auto', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'none': hyperparams.Constant( + default=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'percent': hyperparams.Bounded[float]( + default=0.25, + lower=0, + upper=1, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='none', + description='The number of features to consider when looking for the best split: - If int, then consider `max_features` features at each split. - If float, then `max_features` is a percentage and `int(max_features * n_features)` features are considered at each split. - If "auto", then `max_features=sqrt(n_features)`. - If "sqrt", then `max_features=sqrt(n_features)`. - If "log2", then `max_features=log2(n_features)`. - If None, then `max_features=n_features`. Note: the search for a split does not stop until at least one valid partition of the node samples is found, even if it requires to effectively inspect more than ``max_features`` features.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + min_impurity_decrease = hyperparams.Bounded[float]( + default=0.0, + lower=0.0, + upper=None, + description='A node will be split if this split induces a decrease of the impurity greater than or equal to this value. The weighted impurity decrease equation is the following:: N_t / N * (impurity - N_t_R / N_t * right_impurity - N_t_L / N_t * left_impurity) where ``N`` is the total number of samples, ``N_t`` is the number of samples at the current node, ``N_t_L`` is the number of samples in the left child, and ``N_t_R`` is the number of samples in the right child. ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, if ``sample_weight`` is passed. .. 
versionadded:: 0.19 ', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + class_weight = hyperparams.Union( + configuration=OrderedDict({ + 'str': hyperparams.Constant( + default='balanced', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'none': hyperparams.Constant( + default=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='none', + description='Weights associated with classes in the form ``{class_label: weight}``. If not given, all classes are supposed to have weight one. For multi-output problems, a list of dicts can be provided in the same order as the columns of y. The "balanced" mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as ``n_samples / (n_classes * np.bincount(y))`` For multi-output, the weights of each column of y will be multiplied. Note that these weights will be multiplied with sample_weight (passed through the fit method) if sample_weight is specified.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + presort = hyperparams.UniformBool( + default=False, + description='Whether to presort the data to speed up the finding of best splits in fitting. For the default settings of a decision tree on large datasets, setting this to true may slow down the training process. When using either a smaller dataset or a restricted depth, this may speed up the training.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + + use_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to use as training input. If any specified column cannot be parsed, it is skipped.", + ) + use_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to use as training target. If any specified column cannot be parsed, it is skipped.", + ) + exclude_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not use as training inputs. Applicable only if \"use_columns\" is not provided.", + ) + exclude_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not use as training target. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='new', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? 
This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute', 'https://metadata.datadrivendiscovery.org/types/PredictedTarget'], + default='https://metadata.datadrivendiscovery.org/types/PredictedTarget', + description='Decides what semantic type to attach to generated output', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + +class SKDecisionTreeClassifier(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams], + ProbabilisticCompositionalityMixin[Inputs, Outputs, Params, Hyperparams]): + """ + Primitive wrapping for sklearn DecisionTreeClassifier + `sklearn documentation `_ + + """ + + __author__ = "JPL MARVIN" + metadata = metadata_base.PrimitiveMetadata({ + "algorithm_types": [metadata_base.PrimitiveAlgorithmType.DECISION_TREE, ], + "name": "sklearn.tree.tree.DecisionTreeClassifier", + "primitive_family": metadata_base.PrimitiveFamily.CLASSIFICATION, + "python_path": "d3m.primitives.classification.decision_tree.SKlearn", + "source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html']}, + "version": "2019.11.13", + "id": "e20d003d-6a9f-35b0-b4b5-20e42b30282a", + "hyperparams_to_tune": ['max_depth', 'min_samples_split', 'min_samples_leaf', 'max_features'], + 'installation': [ + {'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + }] + }) + + def __init__(self, *, + hyperparams: Hyperparams, + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None) -> None: + + super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) + + # False + self._clf = DecisionTreeClassifier( + criterion=self.hyperparams['criterion'], + splitter=self.hyperparams['splitter'], + max_depth=self.hyperparams['max_depth'], + min_samples_split=self.hyperparams['min_samples_split'], + min_samples_leaf=self.hyperparams['min_samples_leaf'], + min_weight_fraction_leaf=self.hyperparams['min_weight_fraction_leaf'], + max_leaf_nodes=self.hyperparams['max_leaf_nodes'], + 
max_features=self.hyperparams['max_features'], + min_impurity_decrease=self.hyperparams['min_impurity_decrease'], + class_weight=self.hyperparams['class_weight'], + presort=self.hyperparams['presort'], + random_state=self.random_seed, + ) + + self._inputs = None + self._outputs = None + self._training_inputs = None + self._training_outputs = None + self._target_names = None + self._training_indices = None + self._target_column_indices = None + self._target_columns_metadata: List[OrderedDict] = None + self._input_column_names = None + self._fitted = False + self._new_training_data = False + + def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None: + self._inputs = inputs + self._outputs = outputs + self._fitted = False + self._new_training_data = True + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + if self._inputs is None or self._outputs is None: + raise ValueError("Missing training data.") + + if not self._new_training_data: + return CallResult(None) + self._new_training_data = False + + self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams) + self._training_outputs, self._target_names, self._target_column_indices = self._get_targets(self._outputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + if len(self._training_indices) > 0 and len(self._target_column_indices) > 0: + self._target_columns_metadata = self._get_target_columns_metadata(self._training_outputs.metadata, self.hyperparams) + sk_training_output = self._training_outputs.values + + shape = sk_training_output.shape + if len(shape) == 2 and shape[1] == 1: + sk_training_output = numpy.ravel(sk_training_output) + + self._clf.fit(self._training_inputs, sk_training_output) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + return CallResult(None) + + + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + sk_inputs, columns_to_use = self._get_columns_to_fit(inputs, self.hyperparams) + output = [] + if len(sk_inputs.columns): + try: + sk_output = self._clf.predict(sk_inputs) + except sklearn.exceptions.NotFittedError as error: + raise PrimitiveNotFittedError("Primitive not fitted.") from error + # For primitives that allow predicting without fitting like GaussianProcessRegressor + if not self._fitted: + raise PrimitiveNotFittedError("Primitive not fitted.") + if sparse.issparse(sk_output): + sk_output = sk_output.toarray() + output = self._wrap_predictions(inputs, sk_output) + output.columns = self._target_names + output = [output] + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._target_column_indices, + columns_list=output) + + return CallResult(outputs) + + + def get_params(self) -> Params: + if not self._fitted: + return Params( + classes_=None, + max_features_=None, + n_classes_=None, + n_features_=None, + n_outputs_=None, + tree_=None, + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + 
target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + return Params( + classes_=getattr(self._clf, 'classes_', None), + max_features_=getattr(self._clf, 'max_features_', None), + n_classes_=getattr(self._clf, 'n_classes_', None), + n_features_=getattr(self._clf, 'n_features_', None), + n_outputs_=getattr(self._clf, 'n_outputs_', None), + tree_=getattr(self._clf, 'tree_', None), + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + def set_params(self, *, params: Params) -> None: + self._clf.classes_ = params['classes_'] + self._clf.max_features_ = params['max_features_'] + self._clf.n_classes_ = params['n_classes_'] + self._clf.n_features_ = params['n_features_'] + self._clf.n_outputs_ = params['n_outputs_'] + self._clf.tree_ = params['tree_'] + self._input_column_names = params['input_column_names'] + self._training_indices = params['training_indices_'] + self._target_names = params['target_names_'] + self._target_column_indices = params['target_column_indices_'] + self._target_columns_metadata = params['target_columns_metadata_'] + + if params['classes_'] is not None: + self._fitted = True + if params['max_features_'] is not None: + self._fitted = True + if params['n_classes_'] is not None: + self._fitted = True + if params['n_features_'] is not None: + self._fitted = True + if params['n_outputs_'] is not None: + self._fitted = True + if params['tree_'] is not None: + self._fitted = True + + + def log_likelihoods(self, *, + outputs: Outputs, + inputs: Inputs, + timeout: float = None, + iterations: int = None) -> CallResult[Sequence[float]]: + inputs = inputs.iloc[:, self._training_indices] # Get ndarray + outputs = outputs.iloc[:, self._target_column_indices] + + if len(inputs.columns) and len(outputs.columns): + + if outputs.shape[1] != self._clf.n_outputs_: + raise exceptions.InvalidArgumentValueError("\"outputs\" argument does not have the correct number of target columns.") + + log_proba = self._clf.predict_log_proba(inputs) + + # Making it always a list, even when only one target. + if self._clf.n_outputs_ == 1: + log_proba = [log_proba] + classes = [self._clf.classes_] + else: + classes = self._clf.classes_ + + samples_length = inputs.shape[0] + + log_likelihoods = [] + for k in range(self._clf.n_outputs_): + # We have to map each class to its internal (numerical) index used in the learner. + # This allows "outputs" to contain string classes. + outputs_column = outputs.iloc[:, k] + classes_map = pandas.Series(numpy.arange(len(classes[k])), index=classes[k]) + mapped_outputs_column = outputs_column.map(classes_map) + + # For each target column (column in "outputs"), for each sample (row) we pick the log + # likelihood for a given class. 
+ log_likelihoods.append(log_proba[k][numpy.arange(samples_length), mapped_outputs_column]) + + results = d3m_dataframe(dict(enumerate(log_likelihoods)), generate_metadata=True) + results.columns = outputs.columns + + for k in range(self._clf.n_outputs_): + column_metadata = outputs.metadata.query_column(k) + if 'name' in column_metadata: + results.metadata = results.metadata.update_column(k, {'name': column_metadata['name']}) + + else: + results = d3m_dataframe(generate_metadata=True) + + return CallResult(results) + + + + def produce_feature_importances(self, *, timeout: float = None, iterations: int = None) -> CallResult[d3m_dataframe]: + output = d3m_dataframe(self._clf.feature_importances_.reshape((1, len(self._input_column_names)))) + output.columns = self._input_column_names + for i in range(len(self._input_column_names)): + output.metadata = output.metadata.update_column(i, {"name": self._input_column_names[i]}) + return CallResult(output) + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=hyperparams['use_inputs_columns'], + exclude_columns=hyperparams['exclude_inputs_columns'], + can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, numpy.integer, numpy.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + @classmethod + def _get_targets(cls, data: d3m_dataframe, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return data, list(data.columns), list(range(len(data.columns))) + + metadata = data.metadata + + def can_produce_column(column_index: int) -> bool: + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/TrueTarget") + column_metadata = metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + semantic_types = set(column_metadata.get('semantic_types', [])) + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + return False + + target_column_indices, target_columns_not_to_produce = base_utils.get_columns_to_use(metadata, + use_columns=hyperparams[ + 'use_outputs_columns'], + exclude_columns= + hyperparams[ + 
'exclude_outputs_columns'], + can_use_column=can_produce_column) + targets = [] + if target_column_indices: + targets = data.select_columns(target_column_indices) + target_column_names = [] + for idx in target_column_indices: + target_column_names.append(data.columns[idx]) + return targets, target_column_names, target_column_indices + + @classmethod + def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]: + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) + + # Update semantic types and prepare it for predicted targets. + semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set(["https://metadata.datadrivendiscovery.org/types/TrueTarget","https://metadata.datadrivendiscovery.org/types/SuggestedTarget",]) + add_semantic_types = set(["https://metadata.datadrivendiscovery.org/types/PredictedTarget",]) + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + outputs = d3m_dataframe(predictions, generate_metadata=False) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, self._target_columns_metadata) + return outputs + + + @classmethod + def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata): + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict() + semantic_types = [] + semantic_types.append('https://metadata.datadrivendiscovery.org/types/PredictedTarget') + column_name = outputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name") + if column_name is None: + column_name = "output_{}".format(column_index) + column_metadata["semantic_types"] = semantic_types + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + +SKDecisionTreeClassifier.__doc__ = DecisionTreeClassifier.__doc__ \ No newline at end of file diff --git a/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKDecisionTreeRegressor.py b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKDecisionTreeRegressor.py new file mode 100644 index 0000000..1886dd3 --- /dev/null +++ b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKDecisionTreeRegressor.py @@ -0,0 +1,565 @@ +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple 
+from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os +import sklearn +import numpy +import typing + +# Custom import commands if any +from sklearn.tree.tree import DecisionTreeRegressor + + +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult, DockerContainer + +from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase +from d3m.primitive_interfaces.base import ProbabilisticCompositionalityMixin, ContinueFitMixin +from d3m import exceptions +import pandas + + + +Inputs = d3m_dataframe +Outputs = d3m_dataframe + + +class Params(params.Params): + max_features_: Optional[int] + n_features_: Optional[int] + n_outputs_: Optional[int] + tree_: Optional[object] + classes_: Optional[Union[ndarray, List[ndarray]]] + n_classes_: Optional[Union[numpy.int64, List[numpy.int64]]] + class_weight: Optional[Union[str, dict, List[dict]]] + input_column_names: Optional[Any] + target_names_: Optional[Sequence[Any]] + training_indices_: Optional[Sequence[int]] + target_column_indices_: Optional[Sequence[int]] + target_columns_metadata_: Optional[List[OrderedDict]] + + + +class Hyperparams(hyperparams.Hyperparams): + criterion = hyperparams.Enumeration[str]( + values=['mse', 'friedman_mse', 'mae'], + default='mse', + description='The function to measure the quality of a split. Supported criteria are "mse" for the mean squared error, which is equal to variance reduction as feature selection criterion, and "mae" for the mean absolute error. .. versionadded:: 0.18 Mean Absolute Error (MAE) criterion.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + splitter = hyperparams.Enumeration[str]( + values=['best', 'random'], + default='best', + description='The strategy used to choose the split at each node. Supported strategies are "best" to choose the best split and "random" to choose the best random split.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + max_depth = hyperparams.Union( + configuration=OrderedDict({ + 'int': hyperparams.Bounded[int]( + lower=0, + upper=None, + default=5, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'none': hyperparams.Constant( + default=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='none', + description='The maximum depth of the tree. 
If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + min_samples_split = hyperparams.Union( + configuration=OrderedDict({ + 'float': hyperparams.Bounded[float]( + lower=0, + upper=1, + default=1.0, + description='It\'s a percentage and `ceil(min_samples_split * n_samples)` is the minimum number of samples for each split.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'int': hyperparams.Bounded[int]( + lower=0, + upper=None, + default=2, + description='Minimum number.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='int', + description='The minimum number of samples required to split an internal node: - If int, then consider `min_samples_split` as the minimum number. - If float, then `min_samples_split` is a percentage and `ceil(min_samples_split * n_samples)` are the minimum number of samples for each split. .. versionchanged:: 0.18 Added float values for percentages.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + min_samples_leaf = hyperparams.Union( + configuration=OrderedDict({ + 'percent': hyperparams.Bounded[float]( + lower=0, + upper=0.5, + default=0.25, + description='It\'s a percentage and `ceil(min_samples_leaf * n_samples)` is the minimum number of samples for each node.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'absolute': hyperparams.Bounded[int]( + lower=1, + upper=None, + default=1, + description='Minimum number.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='absolute', + description='The minimum number of samples required to be at a leaf node: - If int, then consider `min_samples_leaf` as the minimum number. - If float, then `min_samples_leaf` is a percentage and `ceil(min_samples_leaf * n_samples)` are the minimum number of samples for each node. .. versionchanged:: 0.18 Added float values for percentages.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + min_weight_fraction_leaf = hyperparams.Bounded[float]( + default=0, + lower=0, + upper=0.5, + description='The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + max_leaf_nodes = hyperparams.Union( + configuration=OrderedDict({ + 'int': hyperparams.Bounded[int]( + lower=0, + upper=None, + default=10, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'none': hyperparams.Constant( + default=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='none', + description='Grow a tree with ``max_leaf_nodes`` in best-first fashion. Best nodes are defined as relative reduction in impurity. 
If None then unlimited number of leaf nodes.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + max_features = hyperparams.Union( + configuration=OrderedDict({ + 'specified_int': hyperparams.Bounded[int]( + lower=0, + upper=None, + default=0, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'calculated': hyperparams.Enumeration[str]( + values=['auto', 'sqrt', 'log2'], + default='auto', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'none': hyperparams.Constant( + default=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'percent': hyperparams.Bounded[float]( + default=0.25, + lower=0, + upper=1, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='calculated', + description='The number of features to consider when looking for the best split: - If int, then consider `max_features` features at each split. - If float, then `max_features` is a percentage and `int(max_features * n_features)` features are considered at each split. - If "auto", then `max_features=n_features`. - If "sqrt", then `max_features=sqrt(n_features)`. - If "log2", then `max_features=log2(n_features)`. - If None, then `max_features=n_features`. Note: the search for a split does not stop until at least one valid partition of the node samples is found, even if it requires to effectively inspect more than ``max_features`` features.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + min_impurity_decrease = hyperparams.Bounded[float]( + default=0.0, + lower=0.0, + upper=None, + description='A node will be split if this split induces a decrease of the impurity greater than or equal to this value. The weighted impurity decrease equation is the following:: N_t / N * (impurity - N_t_R / N_t * right_impurity - N_t_L / N_t * left_impurity) where ``N`` is the total number of samples, ``N_t`` is the number of samples at the current node, ``N_t_L`` is the number of samples in the left child, and ``N_t_R`` is the number of samples in the right child. ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, if ``sample_weight`` is passed. .. versionadded:: 0.19 ', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + presort = hyperparams.UniformBool( + default=False, + description='Whether to presort the data to speed up the finding of best splits in fitting. For the default settings of a decision tree on large datasets, setting this to true may slow down the training process. When using either a smaller dataset or a restricted depth, this may speed up the training.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + + use_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to use as training input. If any specified column cannot be parsed, it is skipped.", + ) + use_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to use as training target. 
If any specified column cannot be parsed, it is skipped.", + ) + exclude_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not use as training inputs. Applicable only if \"use_columns\" is not provided.", + ) + exclude_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not use as training target. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='new', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. 
To prevent pipelines from breaking set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute', 'https://metadata.datadrivendiscovery.org/types/PredictedTarget'], + default='https://metadata.datadrivendiscovery.org/types/PredictedTarget', + description='Decides what semantic type to attach to generated output', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + +class SKDecisionTreeRegressor(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]): + """ + Primitive wrapping for sklearn DecisionTreeRegressor + `sklearn documentation `_ + + """ + + __author__ = "JPL MARVIN" + metadata = metadata_base.PrimitiveMetadata({ + "algorithm_types": [metadata_base.PrimitiveAlgorithmType.DECISION_TREE, ], + "name": "sklearn.tree.tree.DecisionTreeRegressor", + "primitive_family": metadata_base.PrimitiveFamily.REGRESSION, + "python_path": "d3m.primitives.regression.decision_tree.SKlearn", + "source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html']}, + "version": "2019.11.13", + "id": "6c420bd8-01d1-321f-9a35-afc4b758a5c6", + "hyperparams_to_tune": ['max_depth', 'min_samples_split', 'min_samples_leaf', 'max_features'], + 'installation': [ + {'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + }] + }) + + def __init__(self, *, + hyperparams: Hyperparams, + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None) -> None: + + super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) + + # False + self._clf = DecisionTreeRegressor( + criterion=self.hyperparams['criterion'], + splitter=self.hyperparams['splitter'], + max_depth=self.hyperparams['max_depth'], + min_samples_split=self.hyperparams['min_samples_split'], + min_samples_leaf=self.hyperparams['min_samples_leaf'], + min_weight_fraction_leaf=self.hyperparams['min_weight_fraction_leaf'], + max_leaf_nodes=self.hyperparams['max_leaf_nodes'], + max_features=self.hyperparams['max_features'], + min_impurity_decrease=self.hyperparams['min_impurity_decrease'], + presort=self.hyperparams['presort'], + random_state=self.random_seed, + ) + + self._inputs = None + self._outputs = None + self._training_inputs = None + self._training_outputs = None + self._target_names = None + self._training_indices = None + self._target_column_indices = None + self._target_columns_metadata: List[OrderedDict] = None + self._input_column_names = None + self._fitted = False + self._new_training_data = False + + def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None: + self._inputs = inputs + self._outputs = outputs + self._fitted = False + self._new_training_data = True + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + if self._inputs is None or self._outputs is None: + raise ValueError("Missing training data.") + + if not self._new_training_data: + return CallResult(None) + self._new_training_data = False + + self._training_inputs, self._training_indices = 
self._get_columns_to_fit(self._inputs, self.hyperparams) + self._training_outputs, self._target_names, self._target_column_indices = self._get_targets(self._outputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + if len(self._training_indices) > 0 and len(self._target_column_indices) > 0: + self._target_columns_metadata = self._get_target_columns_metadata(self._training_outputs.metadata, self.hyperparams) + sk_training_output = self._training_outputs.values + + shape = sk_training_output.shape + if len(shape) == 2 and shape[1] == 1: + sk_training_output = numpy.ravel(sk_training_output) + + self._clf.fit(self._training_inputs, sk_training_output) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + return CallResult(None) + + + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + sk_inputs, columns_to_use = self._get_columns_to_fit(inputs, self.hyperparams) + output = [] + if len(sk_inputs.columns): + try: + sk_output = self._clf.predict(sk_inputs) + except sklearn.exceptions.NotFittedError as error: + raise PrimitiveNotFittedError("Primitive not fitted.") from error + # For primitives that allow predicting without fitting like GaussianProcessRegressor + if not self._fitted: + raise PrimitiveNotFittedError("Primitive not fitted.") + if sparse.issparse(sk_output): + sk_output = sk_output.toarray() + output = self._wrap_predictions(inputs, sk_output) + output.columns = self._target_names + output = [output] + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._target_column_indices, + columns_list=output) + + return CallResult(outputs) + + + def get_params(self) -> Params: + if not self._fitted: + return Params( + max_features_=None, + n_features_=None, + n_outputs_=None, + tree_=None, + classes_=None, + n_classes_=None, + class_weight=None, + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + return Params( + max_features_=getattr(self._clf, 'max_features_', None), + n_features_=getattr(self._clf, 'n_features_', None), + n_outputs_=getattr(self._clf, 'n_outputs_', None), + tree_=getattr(self._clf, 'tree_', None), + classes_=getattr(self._clf, 'classes_', None), + n_classes_=getattr(self._clf, 'n_classes_', None), + class_weight=getattr(self._clf, 'class_weight', None), + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + def set_params(self, *, params: Params) -> None: + self._clf.max_features_ = params['max_features_'] + self._clf.n_features_ = params['n_features_'] + self._clf.n_outputs_ = params['n_outputs_'] + self._clf.tree_ = params['tree_'] + self._clf.classes_ = params['classes_'] + self._clf.n_classes_ = params['n_classes_'] + self._clf.class_weight = params['class_weight'] + 
self._input_column_names = params['input_column_names'] + self._training_indices = params['training_indices_'] + self._target_names = params['target_names_'] + self._target_column_indices = params['target_column_indices_'] + self._target_columns_metadata = params['target_columns_metadata_'] + + if params['max_features_'] is not None: + self._fitted = True + if params['n_features_'] is not None: + self._fitted = True + if params['n_outputs_'] is not None: + self._fitted = True + if params['tree_'] is not None: + self._fitted = True + if params['classes_'] is not None: + self._fitted = True + if params['n_classes_'] is not None: + self._fitted = True + if params['class_weight'] is not None: + self._fitted = True + + + + + + def produce_feature_importances(self, *, timeout: float = None, iterations: int = None) -> CallResult[d3m_dataframe]: + output = d3m_dataframe(self._clf.feature_importances_.reshape((1, len(self._input_column_names)))) + output.columns = self._input_column_names + for i in range(len(self._input_column_names)): + output.metadata = output.metadata.update_column(i, {"name": self._input_column_names[i]}) + return CallResult(output) + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=hyperparams['use_inputs_columns'], + exclude_columns=hyperparams['exclude_inputs_columns'], + can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, numpy.integer, numpy.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + @classmethod + def _get_targets(cls, data: d3m_dataframe, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return data, list(data.columns), list(range(len(data.columns))) + + metadata = data.metadata + + def can_produce_column(column_index: int) -> bool: + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/TrueTarget") + column_metadata = metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + semantic_types = set(column_metadata.get('semantic_types', [])) + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + return 
False + + target_column_indices, target_columns_not_to_produce = base_utils.get_columns_to_use(metadata, + use_columns=hyperparams[ + 'use_outputs_columns'], + exclude_columns= + hyperparams[ + 'exclude_outputs_columns'], + can_use_column=can_produce_column) + targets = [] + if target_column_indices: + targets = data.select_columns(target_column_indices) + target_column_names = [] + for idx in target_column_indices: + target_column_names.append(data.columns[idx]) + return targets, target_column_names, target_column_indices + + @classmethod + def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]: + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) + + # Update semantic types and prepare it for predicted targets. + semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set(["https://metadata.datadrivendiscovery.org/types/TrueTarget","https://metadata.datadrivendiscovery.org/types/SuggestedTarget",]) + add_semantic_types = set(["https://metadata.datadrivendiscovery.org/types/PredictedTarget",]) + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + outputs = d3m_dataframe(predictions, generate_metadata=False) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, self._target_columns_metadata) + return outputs + + + @classmethod + def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata): + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict() + semantic_types = [] + semantic_types.append('https://metadata.datadrivendiscovery.org/types/PredictedTarget') + column_name = outputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name") + if column_name is None: + column_name = "output_{}".format(column_index) + column_metadata["semantic_types"] = semantic_types + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + +SKDecisionTreeRegressor.__doc__ = DecisionTreeRegressor.__doc__ \ No newline at end of file diff --git a/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKDummyClassifier.py b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKDummyClassifier.py new file mode 100644 index 0000000..4425428 --- /dev/null 
+++ b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKDummyClassifier.py @@ -0,0 +1,503 @@ +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os +import sklearn +import numpy +import typing + +# Custom import commands if any +from sklearn.dummy import DummyClassifier + + +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult, DockerContainer + +from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase +from d3m.primitive_interfaces.base import ProbabilisticCompositionalityMixin, ContinueFitMixin +from d3m import exceptions +import pandas + + + +Inputs = d3m_dataframe +Outputs = d3m_dataframe + + +class Params(params.Params): + classes_: Optional[ndarray] + n_classes_: Optional[Union[int,ndarray]] + class_prior_: Optional[ndarray] + n_outputs_: Optional[int] + sparse_output_: Optional[bool] + output_2d_: Optional[bool] + input_column_names: Optional[Any] + target_names_: Optional[Sequence[Any]] + training_indices_: Optional[Sequence[int]] + target_column_indices_: Optional[Sequence[int]] + target_columns_metadata_: Optional[List[OrderedDict]] + + + +class Hyperparams(hyperparams.Hyperparams): + strategy = hyperparams.Choice( + choices={ + 'stratified': hyperparams.Hyperparams.define( + configuration=OrderedDict({}) + ), + 'most_frequent': hyperparams.Hyperparams.define( + configuration=OrderedDict({}) + ), + 'prior': hyperparams.Hyperparams.define( + configuration=OrderedDict({}) + ), + 'uniform': hyperparams.Hyperparams.define( + configuration=OrderedDict({}) + ), + 'constant': hyperparams.Hyperparams.define( + configuration=OrderedDict({ + 'constant': hyperparams.Union( + configuration=OrderedDict({ + 'str': hyperparams.Hyperparameter[str]( + default='one', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'int': hyperparams.Bounded[int]( + default=1, + lower=0, + upper=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'ndarray': hyperparams.Hyperparameter[ndarray]( + default=numpy.array([]), + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='int', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + }) + ) + }, + default='stratified', + description='Strategy to use to generate predictions. * "stratified": generates predictions by respecting the training set\'s class distribution. * "most_frequent": always predicts the most frequent label in the training set. * "prior": always predicts the class that maximizes the class prior (like "most_frequent") and ``predict_proba`` returns the class prior. * "uniform": generates predictions uniformly at random. * "constant": always predicts a constant label that is provided by the user. This is useful for metrics that evaluate a non-majority class .. 
versionadded:: 0.17 Dummy Classifier now supports prior fitting strategy using parameter *prior*.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + use_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to use as training input. If any specified column cannot be parsed, it is skipped.", + ) + use_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to use as training target. If any specified column cannot be parsed, it is skipped.", + ) + exclude_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not use as training inputs. Applicable only if \"use_columns\" is not provided.", + ) + exclude_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not use as training target. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='new', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. 
To prevent pipelines from breaking set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute', 'https://metadata.datadrivendiscovery.org/types/PredictedTarget'], + default='https://metadata.datadrivendiscovery.org/types/PredictedTarget', + description='Decides what semantic type to attach to generated output', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + +class SKDummyClassifier(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams], + ProbabilisticCompositionalityMixin[Inputs, Outputs, Params, Hyperparams]): + """ + Primitive wrapping for sklearn DummyClassifier + `sklearn documentation `_ + + """ + + __author__ = "JPL MARVIN" + metadata = metadata_base.PrimitiveMetadata({ + "algorithm_types": [metadata_base.PrimitiveAlgorithmType.RULE_BASED_MACHINE_LEARNING, ], + "name": "sklearn.dummy.DummyClassifier", + "primitive_family": metadata_base.PrimitiveFamily.CLASSIFICATION, + "python_path": "d3m.primitives.classification.dummy.SKlearn", + "source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.dummy.DummyClassifier.html']}, + "version": "2019.11.13", + "id": "a1056ddf-2e89-3d8d-8308-2146170ae54d", + "hyperparams_to_tune": ['strategy'], + 'installation': [ + {'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + }] + }) + + def __init__(self, *, + hyperparams: Hyperparams, + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None) -> None: + + super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) + + # False + self._clf = DummyClassifier( + strategy=self.hyperparams['strategy']['choice'], + constant=self.hyperparams['strategy'].get('constant', 'int'), + random_state=self.random_seed, + ) + + self._inputs = None + self._outputs = None + self._training_inputs = None + self._training_outputs = None + self._target_names = None + self._training_indices = None + self._target_column_indices = None + self._target_columns_metadata: List[OrderedDict] = None + self._input_column_names = None + self._fitted = False + self._new_training_data = False + + def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None: + self._inputs = inputs + self._outputs = outputs + self._fitted = False + self._new_training_data = True + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + if self._inputs is None or self._outputs is None: + raise ValueError("Missing training data.") + + if not self._new_training_data: + return CallResult(None) + self._new_training_data = False + + self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams) + self._training_outputs, self._target_names, self._target_column_indices = self._get_targets(self._outputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + if len(self._training_indices) > 0 and len(self._target_column_indices) > 0: + self._target_columns_metadata = self._get_target_columns_metadata(self._training_outputs.metadata, self.hyperparams) + 
sk_training_output = self._training_outputs.values + + shape = sk_training_output.shape + if len(shape) == 2 and shape[1] == 1: + sk_training_output = numpy.ravel(sk_training_output) + + self._clf.fit(self._training_inputs, sk_training_output) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + return CallResult(None) + + + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + sk_inputs, columns_to_use = self._get_columns_to_fit(inputs, self.hyperparams) + output = [] + if len(sk_inputs.columns): + try: + sk_output = self._clf.predict(sk_inputs) + except sklearn.exceptions.NotFittedError as error: + raise PrimitiveNotFittedError("Primitive not fitted.") from error + # For primitives that allow predicting without fitting like GaussianProcessRegressor + if not self._fitted: + raise PrimitiveNotFittedError("Primitive not fitted.") + if sparse.issparse(sk_output): + sk_output = sk_output.toarray() + output = self._wrap_predictions(inputs, sk_output) + output.columns = self._target_names + output = [output] + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._target_column_indices, + columns_list=output) + + return CallResult(outputs) + + + def get_params(self) -> Params: + if not self._fitted: + return Params( + classes_=None, + n_classes_=None, + class_prior_=None, + n_outputs_=None, + sparse_output_=None, + output_2d_=None, + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + return Params( + classes_=getattr(self._clf, 'classes_', None), + n_classes_=getattr(self._clf, 'n_classes_', None), + class_prior_=getattr(self._clf, 'class_prior_', None), + n_outputs_=getattr(self._clf, 'n_outputs_', None), + sparse_output_=getattr(self._clf, 'sparse_output_', None), + output_2d_=getattr(self._clf, 'output_2d_', None), + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + def set_params(self, *, params: Params) -> None: + self._clf.classes_ = params['classes_'] + self._clf.n_classes_ = params['n_classes_'] + self._clf.class_prior_ = params['class_prior_'] + self._clf.n_outputs_ = params['n_outputs_'] + self._clf.sparse_output_ = params['sparse_output_'] + self._clf.output_2d_ = params['output_2d_'] + self._input_column_names = params['input_column_names'] + self._training_indices = params['training_indices_'] + self._target_names = params['target_names_'] + self._target_column_indices = params['target_column_indices_'] + self._target_columns_metadata = params['target_columns_metadata_'] + + if params['classes_'] is not None: + self._fitted = True + if params['n_classes_'] is not None: + self._fitted = True + if params['class_prior_'] is not None: + self._fitted = True + if params['n_outputs_'] is not None: + self._fitted = True + if 
params['sparse_output_'] is not None: + self._fitted = True + if params['output_2d_'] is not None: + self._fitted = True + + + def log_likelihoods(self, *, + outputs: Outputs, + inputs: Inputs, + timeout: float = None, + iterations: int = None) -> CallResult[Sequence[float]]: + inputs = inputs.iloc[:, self._training_indices] # Get ndarray + outputs = outputs.iloc[:, self._target_column_indices] + + if len(inputs.columns) and len(outputs.columns): + + if outputs.shape[1] != self._clf.n_outputs_: + raise exceptions.InvalidArgumentValueError("\"outputs\" argument does not have the correct number of target columns.") + + log_proba = self._clf.predict_log_proba(inputs) + + # Making it always a list, even when only one target. + if self._clf.n_outputs_ == 1: + log_proba = [log_proba] + classes = [self._clf.classes_] + else: + classes = self._clf.classes_ + + samples_length = inputs.shape[0] + + log_likelihoods = [] + for k in range(self._clf.n_outputs_): + # We have to map each class to its internal (numerical) index used in the learner. + # This allows "outputs" to contain string classes. + outputs_column = outputs.iloc[:, k] + classes_map = pandas.Series(numpy.arange(len(classes[k])), index=classes[k]) + mapped_outputs_column = outputs_column.map(classes_map) + + # For each target column (column in "outputs"), for each sample (row) we pick the log + # likelihood for a given class. + log_likelihoods.append(log_proba[k][numpy.arange(samples_length), mapped_outputs_column]) + + results = d3m_dataframe(dict(enumerate(log_likelihoods)), generate_metadata=True) + results.columns = outputs.columns + + for k in range(self._clf.n_outputs_): + column_metadata = outputs.metadata.query_column(k) + if 'name' in column_metadata: + results.metadata = results.metadata.update_column(k, {'name': column_metadata['name']}) + + else: + results = d3m_dataframe(generate_metadata=True) + + return CallResult(results) + + + + + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=hyperparams['use_inputs_columns'], + exclude_columns=hyperparams['exclude_inputs_columns'], + can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, numpy.integer, numpy.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + @classmethod + def _get_targets(cls, data: 
d3m_dataframe, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return data, list(data.columns), list(range(len(data.columns))) + + metadata = data.metadata + + def can_produce_column(column_index: int) -> bool: + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/TrueTarget") + column_metadata = metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + semantic_types = set(column_metadata.get('semantic_types', [])) + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + return False + + target_column_indices, target_columns_not_to_produce = base_utils.get_columns_to_use(metadata, + use_columns=hyperparams[ + 'use_outputs_columns'], + exclude_columns= + hyperparams[ + 'exclude_outputs_columns'], + can_use_column=can_produce_column) + targets = [] + if target_column_indices: + targets = data.select_columns(target_column_indices) + target_column_names = [] + for idx in target_column_indices: + target_column_names.append(data.columns[idx]) + return targets, target_column_names, target_column_indices + + @classmethod + def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]: + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) + + # Update semantic types and prepare it for predicted targets. + semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set(["https://metadata.datadrivendiscovery.org/types/TrueTarget","https://metadata.datadrivendiscovery.org/types/SuggestedTarget",]) + add_semantic_types = set(["https://metadata.datadrivendiscovery.org/types/PredictedTarget",]) + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + outputs = d3m_dataframe(predictions, generate_metadata=False) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, self._target_columns_metadata) + return outputs + + + @classmethod + def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata): + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict() 
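+ # Mark each output column as a PredictedTarget and fall back to the name "output_<index>" when the source metadata provides none.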
+ semantic_types = [] + semantic_types.append('https://metadata.datadrivendiscovery.org/types/PredictedTarget') + column_name = outputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name") + if column_name is None: + column_name = "output_{}".format(column_index) + column_metadata["semantic_types"] = semantic_types + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + +SKDummyClassifier.__doc__ = DummyClassifier.__doc__ \ No newline at end of file diff --git a/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKDummyRegressor.py b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKDummyRegressor.py new file mode 100644 index 0000000..020942d --- /dev/null +++ b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKDummyRegressor.py @@ -0,0 +1,442 @@ +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os +import sklearn +import numpy +import typing + +# Custom import commands if any +from sklearn.dummy import DummyRegressor + + +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult, DockerContainer + +from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase +from d3m.primitive_interfaces.base import ProbabilisticCompositionalityMixin, ContinueFitMixin +from d3m import exceptions +import pandas + + + +Inputs = d3m_dataframe +Outputs = d3m_dataframe + + +class Params(params.Params): + constant_: Optional[Union[float, ndarray]] + n_outputs_: Optional[int] + output_2d_: Optional[bool] + input_column_names: Optional[Any] + target_names_: Optional[Sequence[Any]] + training_indices_: Optional[Sequence[int]] + target_column_indices_: Optional[Sequence[int]] + target_columns_metadata_: Optional[List[OrderedDict]] + + + +class Hyperparams(hyperparams.Hyperparams): + strategy = hyperparams.Choice( + choices={ + 'mean': hyperparams.Hyperparams.define( + configuration=OrderedDict({}) + ), + 'median': hyperparams.Hyperparams.define( + configuration=OrderedDict({}) + ), + 'quantile': hyperparams.Hyperparams.define( + configuration=OrderedDict({ + 'quantile': hyperparams.Uniform( + default=0.5, + lower=0, + upper=1.0, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + }) + ), + 'constant': hyperparams.Hyperparams.define( + configuration=OrderedDict({ + 'constant': hyperparams.Union( + configuration=OrderedDict({ + 'float': hyperparams.Bounded[float]( + lower=0, + upper=None, + default=1.0, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'int': hyperparams.Bounded[int]( + default=1, + lower=0, + upper=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'ndarray': hyperparams.Hyperparameter[ndarray]( + default=numpy.array([]), + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='float', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + }) + ) + }, + default='mean', + description='Strategy to use to generate predictions. 
* "mean": always predicts the mean of the training set * "median": always predicts the median of the training set * "quantile": always predicts a specified quantile of the training set, provided with the quantile parameter. * "constant": always predicts a constant value that is provided by the user.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + use_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to use as training input. If any specified column cannot be parsed, it is skipped.", + ) + use_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to use as training target. If any specified column cannot be parsed, it is skipped.", + ) + exclude_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not use as training inputs. Applicable only if \"use_columns\" is not provided.", + ) + exclude_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not use as training target. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='new', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. 
To prevent pipelines from breaking set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute', 'https://metadata.datadrivendiscovery.org/types/PredictedTarget'], + default='https://metadata.datadrivendiscovery.org/types/PredictedTarget', + description='Decides what semantic type to attach to generated output', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + +class SKDummyRegressor(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]): + """ + Primitive wrapping for sklearn DummyRegressor + `sklearn documentation `_ + + """ + + __author__ = "JPL MARVIN" + metadata = metadata_base.PrimitiveMetadata({ + "algorithm_types": [metadata_base.PrimitiveAlgorithmType.RULE_BASED_MACHINE_LEARNING, ], + "name": "sklearn.dummy.DummyRegressor", + "primitive_family": metadata_base.PrimitiveFamily.REGRESSION, + "python_path": "d3m.primitives.regression.dummy.SKlearn", + "source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.dummy.DummyRegressor.html']}, + "version": "2019.11.13", + "id": "05aa5b6a-3b27-34dc-9ba7-8511fb13f253", + "hyperparams_to_tune": ['strategy'], + 'installation': [ + {'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + }] + }) + + def __init__(self, *, + hyperparams: Hyperparams, + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None) -> None: + + super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) + + # False + self._clf = DummyRegressor( + strategy=self.hyperparams['strategy']['choice'], + quantile=self.hyperparams['strategy'].get('quantile', 0.5), + constant=self.hyperparams['strategy'].get('constant', 'float'), + ) + + self._inputs = None + self._outputs = None + self._training_inputs = None + self._training_outputs = None + self._target_names = None + self._training_indices = None + self._target_column_indices = None + self._target_columns_metadata: List[OrderedDict] = None + self._input_column_names = None + self._fitted = False + self._new_training_data = False + + def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None: + self._inputs = inputs + self._outputs = outputs + self._fitted = False + self._new_training_data = True + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + if self._inputs is None or self._outputs is None: + raise ValueError("Missing training data.") + + if not self._new_training_data: + return CallResult(None) + self._new_training_data = False + + self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams) + self._training_outputs, self._target_names, self._target_column_indices = self._get_targets(self._outputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + if len(self._training_indices) > 0 and len(self._target_column_indices) > 0: + self._target_columns_metadata = self._get_target_columns_metadata(self._training_outputs.metadata, self.hyperparams) + sk_training_output = self._training_outputs.values + + 
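+ # sklearn's fit() expects a 1-D target for single-output problems, so a (n_samples, 1) target array is flattened below.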
shape = sk_training_output.shape + if len(shape) == 2 and shape[1] == 1: + sk_training_output = numpy.ravel(sk_training_output) + + self._clf.fit(self._training_inputs, sk_training_output) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + return CallResult(None) + + + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + sk_inputs, columns_to_use = self._get_columns_to_fit(inputs, self.hyperparams) + output = [] + if len(sk_inputs.columns): + try: + sk_output = self._clf.predict(sk_inputs) + except sklearn.exceptions.NotFittedError as error: + raise PrimitiveNotFittedError("Primitive not fitted.") from error + # For primitives that allow predicting without fitting like GaussianProcessRegressor + if not self._fitted: + raise PrimitiveNotFittedError("Primitive not fitted.") + if sparse.issparse(sk_output): + sk_output = sk_output.toarray() + output = self._wrap_predictions(inputs, sk_output) + output.columns = self._target_names + output = [output] + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._target_column_indices, + columns_list=output) + + return CallResult(outputs) + + + def get_params(self) -> Params: + if not self._fitted: + return Params( + constant_=None, + n_outputs_=None, + output_2d_=None, + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + return Params( + constant_=getattr(self._clf, 'constant_', None), + n_outputs_=getattr(self._clf, 'n_outputs_', None), + output_2d_=getattr(self._clf, 'output_2d_', None), + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + def set_params(self, *, params: Params) -> None: + self._clf.constant_ = params['constant_'] + self._clf.n_outputs_ = params['n_outputs_'] + self._clf.output_2d_ = params['output_2d_'] + self._input_column_names = params['input_column_names'] + self._training_indices = params['training_indices_'] + self._target_names = params['target_names_'] + self._target_column_indices = params['target_column_indices_'] + self._target_columns_metadata = params['target_columns_metadata_'] + + if params['constant_'] is not None: + self._fitted = True + if params['n_outputs_'] is not None: + self._fitted = True + if params['output_2d_'] is not None: + self._fitted = True + + + + + + + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=hyperparams['use_inputs_columns'], + 
exclude_columns=hyperparams['exclude_inputs_columns'], + can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, numpy.integer, numpy.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + @classmethod + def _get_targets(cls, data: d3m_dataframe, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return data, list(data.columns), list(range(len(data.columns))) + + metadata = data.metadata + + def can_produce_column(column_index: int) -> bool: + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/TrueTarget") + column_metadata = metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + semantic_types = set(column_metadata.get('semantic_types', [])) + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + return False + + target_column_indices, target_columns_not_to_produce = base_utils.get_columns_to_use(metadata, + use_columns=hyperparams[ + 'use_outputs_columns'], + exclude_columns= + hyperparams[ + 'exclude_outputs_columns'], + can_use_column=can_produce_column) + targets = [] + if target_column_indices: + targets = data.select_columns(target_column_indices) + target_column_names = [] + for idx in target_column_indices: + target_column_names.append(data.columns[idx]) + return targets, target_column_names, target_column_indices + + @classmethod + def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]: + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) + + # Update semantic types and prepare it for predicted targets. 
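+ # TrueTarget/SuggestedTarget types are removed; PredictedTarget and the configured return_semantic_type are added.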
+ semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set(["https://metadata.datadrivendiscovery.org/types/TrueTarget","https://metadata.datadrivendiscovery.org/types/SuggestedTarget",]) + add_semantic_types = set(["https://metadata.datadrivendiscovery.org/types/PredictedTarget",]) + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + outputs = d3m_dataframe(predictions, generate_metadata=False) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, self._target_columns_metadata) + return outputs + + + @classmethod + def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata): + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict() + semantic_types = [] + semantic_types.append('https://metadata.datadrivendiscovery.org/types/PredictedTarget') + column_name = outputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name") + if column_name is None: + column_name = "output_{}".format(column_index) + column_metadata["semantic_types"] = semantic_types + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + +SKDummyRegressor.__doc__ = DummyRegressor.__doc__ \ No newline at end of file diff --git a/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKElasticNet.py b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKElasticNet.py new file mode 100644 index 0000000..894fcad --- /dev/null +++ b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKElasticNet.py @@ -0,0 +1,466 @@ +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os +import sklearn +import numpy +import typing + +# Custom import commands if any +from sklearn.linear_model.coordinate_descent import ElasticNet + + +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult, DockerContainer + +from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase +from d3m.primitive_interfaces.base import ProbabilisticCompositionalityMixin, ContinueFitMixin +from d3m import exceptions +import pandas + + + 
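+# Inputs and outputs are d3m DataFrame containers; Params below captures the fitted ElasticNet state, while Hyperparams exposes the tunable settings.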
+Inputs = d3m_dataframe +Outputs = d3m_dataframe + + +class Params(params.Params): + coef_: Optional[ndarray] + intercept_: Optional[float] + n_iter_: Optional[int] + dual_gap_: Optional[float] + input_column_names: Optional[Any] + target_names_: Optional[Sequence[Any]] + training_indices_: Optional[Sequence[int]] + target_column_indices_: Optional[Sequence[int]] + target_columns_metadata_: Optional[List[OrderedDict]] + + + +class Hyperparams(hyperparams.Hyperparams): + alpha = hyperparams.Bounded[float]( + default=1.0, + lower=0, + upper=None, + description='Constant that multiplies the penalty terms. Defaults to 1.0. See the notes for the exact mathematical meaning of this parameter.``alpha = 0`` is equivalent to an ordinary least square, solved by the :class:`LinearRegression` object. For numerical reasons, using ``alpha = 0`` with the ``Lasso`` object is not advised. Given this, you should use the :class:`LinearRegression` object.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + l1_ratio = hyperparams.Uniform( + default=0.5, + lower=0, + upper=1, + description='The ElasticNet mixing parameter, with ``0 <= l1_ratio <= 1``. For ``l1_ratio = 0`` the penalty is an L2 penalty. ``For l1_ratio = 1`` it is an L1 penalty. For ``0 < l1_ratio < 1``, the penalty is a combination of L1 and L2.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + fit_intercept = hyperparams.UniformBool( + default=True, + description='Whether the intercept should be estimated or not. If ``False``, the data is assumed to be already centered.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + normalize = hyperparams.UniformBool( + default=False, + description='This parameter is ignored when ``fit_intercept`` is set to False. If True, the regressors X will be normalized before regression by subtracting the mean and dividing by the l2-norm. If you wish to standardize, please use :class:`sklearn.preprocessing.StandardScaler` before calling ``fit`` on an estimator with ``normalize=False``.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + precompute = hyperparams.UniformBool( + default=False, + description='Whether to use a precomputed Gram matrix to speed up calculations. The Gram matrix can also be passed as argument. 
For sparse input this option is always ``True`` to preserve sparsity.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ResourcesUseParameter'] + ) + max_iter = hyperparams.Bounded[int]( + default=1000, + lower=0, + upper=None, + description='The maximum number of iterations', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + tol = hyperparams.Bounded[float]( + default=0.0001, + lower=0, + upper=None, + description='The tolerance for the optimization: if the updates are smaller than ``tol``, the optimization code checks the dual gap for optimality and continues until it is smaller than ``tol``.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + positive = hyperparams.UniformBool( + default=False, + description='When set to ``True``, forces the coefficients to be positive.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + selection = hyperparams.Enumeration[str]( + default='cyclic', + values=['cyclic', 'random'], + description='If set to \'random\', a random coefficient is updated every iteration rather than looping over features sequentially by default. This (setting to \'random\') often leads to significantly faster convergence especially when tol is higher than 1e-4.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + warm_start = hyperparams.UniformBool( + default=False, + description='When set to ``True``, reuse the solution of the previous call to fit as initialization, otherwise, just erase the previous solution. See :term:`the Glossary `.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + use_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to use as training input. If any specified column cannot be parsed, it is skipped.", + ) + use_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to use as training target. If any specified column cannot be parsed, it is skipped.", + ) + exclude_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not use as training inputs. Applicable only if \"use_columns\" is not provided.", + ) + exclude_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not use as training target. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='new', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? 
This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute', 'https://metadata.datadrivendiscovery.org/types/PredictedTarget'], + default='https://metadata.datadrivendiscovery.org/types/PredictedTarget', + description='Decides what semantic type to attach to generated output', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + +class SKElasticNet(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]): + """ + Primitive wrapping for sklearn ElasticNet + `sklearn documentation `_ + + """ + + __author__ = "JPL MARVIN" + metadata = metadata_base.PrimitiveMetadata({ + "algorithm_types": [metadata_base.PrimitiveAlgorithmType.ELASTIC_NET_REGULARIZATION, ], + "name": "sklearn.linear_model.coordinate_descent.ElasticNet", + "primitive_family": metadata_base.PrimitiveFamily.REGRESSION, + "python_path": "d3m.primitives.regression.elastic_net.SKlearn", + "source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.ElasticNet.html']}, + "version": "2019.11.13", + "id": "a85d4ffb-49ab-35b1-a70c-6df209312aae", + "hyperparams_to_tune": ['alpha', 'max_iter', 'l1_ratio'], + 'installation': [ + {'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + }] + }) + + def __init__(self, *, + hyperparams: Hyperparams, + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None) -> None: + + super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) + + # False + self._clf = ElasticNet( + alpha=self.hyperparams['alpha'], + l1_ratio=self.hyperparams['l1_ratio'], + fit_intercept=self.hyperparams['fit_intercept'], + normalize=self.hyperparams['normalize'], + precompute=self.hyperparams['precompute'], + max_iter=self.hyperparams['max_iter'], + tol=self.hyperparams['tol'], + positive=self.hyperparams['positive'], + selection=self.hyperparams['selection'], + warm_start=self.hyperparams['warm_start'], + random_state=self.random_seed, + ) + + self._inputs = None + self._outputs = None + self._training_inputs = None 
+ self._training_outputs = None + self._target_names = None + self._training_indices = None + self._target_column_indices = None + self._target_columns_metadata: List[OrderedDict] = None + self._input_column_names = None + self._fitted = False + self._new_training_data = False + + def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None: + self._inputs = inputs + self._outputs = outputs + self._fitted = False + self._new_training_data = True + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + if self._inputs is None or self._outputs is None: + raise ValueError("Missing training data.") + + if not self._new_training_data: + return CallResult(None) + self._new_training_data = False + + self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams) + self._training_outputs, self._target_names, self._target_column_indices = self._get_targets(self._outputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + if len(self._training_indices) > 0 and len(self._target_column_indices) > 0: + self._target_columns_metadata = self._get_target_columns_metadata(self._training_outputs.metadata, self.hyperparams) + sk_training_output = self._training_outputs.values + + shape = sk_training_output.shape + if len(shape) == 2 and shape[1] == 1: + sk_training_output = numpy.ravel(sk_training_output) + + self._clf.fit(self._training_inputs, sk_training_output) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + return CallResult(None) + + + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + sk_inputs, columns_to_use = self._get_columns_to_fit(inputs, self.hyperparams) + output = [] + if len(sk_inputs.columns): + try: + sk_output = self._clf.predict(sk_inputs) + except sklearn.exceptions.NotFittedError as error: + raise PrimitiveNotFittedError("Primitive not fitted.") from error + # For primitives that allow predicting without fitting like GaussianProcessRegressor + if not self._fitted: + raise PrimitiveNotFittedError("Primitive not fitted.") + if sparse.issparse(sk_output): + sk_output = sk_output.toarray() + output = self._wrap_predictions(inputs, sk_output) + output.columns = self._target_names + output = [output] + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._target_column_indices, + columns_list=output) + + return CallResult(outputs) + + + def get_params(self) -> Params: + if not self._fitted: + return Params( + coef_=None, + intercept_=None, + n_iter_=None, + dual_gap_=None, + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + return Params( + coef_=getattr(self._clf, 'coef_', None), + intercept_=getattr(self._clf, 'intercept_', None), + n_iter_=getattr(self._clf, 'n_iter_', None), + dual_gap_=getattr(self._clf, 'dual_gap_', None), + input_column_names=self._input_column_names, + 
training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + def set_params(self, *, params: Params) -> None: + self._clf.coef_ = params['coef_'] + self._clf.intercept_ = params['intercept_'] + self._clf.n_iter_ = params['n_iter_'] + self._clf.dual_gap_ = params['dual_gap_'] + self._input_column_names = params['input_column_names'] + self._training_indices = params['training_indices_'] + self._target_names = params['target_names_'] + self._target_column_indices = params['target_column_indices_'] + self._target_columns_metadata = params['target_columns_metadata_'] + + if params['coef_'] is not None: + self._fitted = True + if params['intercept_'] is not None: + self._fitted = True + if params['n_iter_'] is not None: + self._fitted = True + if params['dual_gap_'] is not None: + self._fitted = True + + + + + + + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=hyperparams['use_inputs_columns'], + exclude_columns=hyperparams['exclude_inputs_columns'], + can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, numpy.integer, numpy.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + @classmethod + def _get_targets(cls, data: d3m_dataframe, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return data, list(data.columns), list(range(len(data.columns))) + + metadata = data.metadata + + def can_produce_column(column_index: int) -> bool: + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/TrueTarget") + column_metadata = metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + semantic_types = set(column_metadata.get('semantic_types', [])) + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + return False + + target_column_indices, target_columns_not_to_produce = base_utils.get_columns_to_use(metadata, + use_columns=hyperparams[ + 'use_outputs_columns'], + exclude_columns= + hyperparams[ + 
'exclude_outputs_columns'], + can_use_column=can_produce_column) + targets = [] + if target_column_indices: + targets = data.select_columns(target_column_indices) + target_column_names = [] + for idx in target_column_indices: + target_column_names.append(data.columns[idx]) + return targets, target_column_names, target_column_indices + + @classmethod + def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]: + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) + + # Update semantic types and prepare it for predicted targets. + semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set(["https://metadata.datadrivendiscovery.org/types/TrueTarget","https://metadata.datadrivendiscovery.org/types/SuggestedTarget",]) + add_semantic_types = set(["https://metadata.datadrivendiscovery.org/types/PredictedTarget",]) + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + outputs = d3m_dataframe(predictions, generate_metadata=False) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, self._target_columns_metadata) + return outputs + + + @classmethod + def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata): + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict() + semantic_types = [] + semantic_types.append('https://metadata.datadrivendiscovery.org/types/PredictedTarget') + column_name = outputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name") + if column_name is None: + column_name = "output_{}".format(column_index) + column_metadata["semantic_types"] = semantic_types + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + +SKElasticNet.__doc__ = ElasticNet.__doc__ \ No newline at end of file diff --git a/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKExtraTreesClassifier.py b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKExtraTreesClassifier.py new file mode 100644 index 0000000..51d77c9 --- /dev/null +++ b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKExtraTreesClassifier.py @@ -0,0 +1,675 @@ +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple +from numpy import ndarray 
+from collections import OrderedDict +from scipy import sparse +import os +import sklearn +import numpy +import typing + +# Custom import commands if any +from sklearn.ensemble.forest import ExtraTreesClassifier + + +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult, DockerContainer + +from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase +from d3m.primitive_interfaces.base import ProbabilisticCompositionalityMixin, ContinueFitMixin +from d3m import exceptions +import pandas + + + +Inputs = d3m_dataframe +Outputs = d3m_dataframe + + +class Params(params.Params): + estimators_: Optional[Sequence[sklearn.base.BaseEstimator]] + classes_: Optional[Union[ndarray, List[ndarray]]] + n_classes_: Optional[Union[int, List[int]]] + n_features_: Optional[int] + n_outputs_: Optional[int] + oob_score_: Optional[float] + oob_decision_function_: Optional[ndarray] + base_estimator_: Optional[object] + estimator_params: Optional[tuple] + base_estimator: Optional[object] + input_column_names: Optional[Any] + target_names_: Optional[Sequence[Any]] + training_indices_: Optional[Sequence[int]] + target_column_indices_: Optional[Sequence[int]] + target_columns_metadata_: Optional[List[OrderedDict]] + + + +class Hyperparams(hyperparams.Hyperparams): + n_estimators = hyperparams.Bounded[int]( + default=10, + lower=1, + upper=None, + description='The number of trees in the forest.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + criterion = hyperparams.Enumeration[str]( + values=['gini', 'entropy'], + default='gini', + description='The function to measure the quality of a split. Supported criteria are "gini" for the Gini impurity and "entropy" for the information gain.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + max_depth = hyperparams.Union( + configuration=OrderedDict({ + 'int': hyperparams.Bounded[int]( + lower=0, + upper=None, + default=10, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'none': hyperparams.Constant( + default=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='none', + description='The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + min_samples_split = hyperparams.Union( + configuration=OrderedDict({ + 'absolute': hyperparams.Bounded[int]( + default=2, + lower=1, + upper=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'percent': hyperparams.Bounded[float]( + default=0.25, + lower=0, + upper=1, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='absolute', + description='The minimum number of samples required to split an internal node: - If int, then consider `min_samples_split` as the minimum number. - If float, then `min_samples_split` is a percentage and `ceil(min_samples_split * n_samples)` are the minimum number of samples for each split. .. 
versionchanged:: 0.18 Added float values for percentages.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + min_samples_leaf = hyperparams.Union( + configuration=OrderedDict({ + 'absolute': hyperparams.Bounded[int]( + default=1, + lower=1, + upper=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'percent': hyperparams.Bounded[float]( + default=0.25, + lower=0, + upper=0.5, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='absolute', + description='The minimum number of samples required to be at a leaf node: - If int, then consider `min_samples_leaf` as the minimum number. - If float, then `min_samples_leaf` is a percentage and `ceil(min_samples_leaf * n_samples)` are the minimum number of samples for each node. .. versionchanged:: 0.18 Added float values for percentages.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + min_weight_fraction_leaf = hyperparams.Bounded[float]( + default=0, + lower=0, + upper=0.5, + description='The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + max_leaf_nodes = hyperparams.Union( + configuration=OrderedDict({ + 'int': hyperparams.Bounded[int]( + default=10, + lower=0, + upper=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'none': hyperparams.Constant( + default=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='none', + description='Grow trees with ``max_leaf_nodes`` in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + max_features = hyperparams.Union( + configuration=OrderedDict({ + 'specified_int': hyperparams.Bounded[int]( + lower=0, + upper=None, + default=0, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'calculated': hyperparams.Enumeration[str]( + values=['auto', 'sqrt', 'log2'], + default='auto', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'none': hyperparams.Constant( + default=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'percent': hyperparams.Bounded[float]( + default=0.25, + lower=0, + upper=1, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='calculated', + description='The number of features to consider when looking for the best split: - If int, then consider `max_features` features at each split. - If float, then `max_features` is a percentage and `int(max_features * n_features)` features are considered at each split. - If "auto", then `max_features=sqrt(n_features)`. - If "sqrt", then `max_features=sqrt(n_features)`. - If "log2", then `max_features=log2(n_features)`. - If None, then `max_features=n_features`. 
Note: the search for a split does not stop until at least one valid partition of the node samples is found, even if it requires to effectively inspect more than ``max_features`` features.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + min_impurity_decrease = hyperparams.Bounded[float]( + default=0.0, + lower=0.0, + upper=None, + description='A node will be split if this split induces a decrease of the impurity greater than or equal to this value. The weighted impurity decrease equation is the following:: N_t / N * (impurity - N_t_R / N_t * right_impurity - N_t_L / N_t * left_impurity) where ``N`` is the total number of samples, ``N_t`` is the number of samples at the current node, ``N_t_L`` is the number of samples in the left child, and ``N_t_R`` is the number of samples in the right child. ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, if ``sample_weight`` is passed. .. versionadded:: 0.19 ', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + bootstrap = hyperparams.Enumeration[str]( + values=['bootstrap', 'bootstrap_with_oob_score', 'disabled'], + default='bootstrap', + description='Whether bootstrap samples are used when building trees.' + ' And whether to use out-of-bag samples to estimate the generalization accuracy.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + n_jobs = hyperparams.Union( + configuration=OrderedDict({ + 'limit': hyperparams.Bounded[int]( + default=1, + lower=1, + upper=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'all_cores': hyperparams.Constant( + default=-1, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='limit', + description='The number of jobs to run in parallel for both `fit` and `predict`. If -1, then the number of jobs is set to the number of cores.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ResourcesUseParameter'] + ) + warm_start = hyperparams.UniformBool( + default=False, + description='When set to ``True``, reuse the solution of the previous call to fit and add more estimators to the ensemble, otherwise, just fit a whole new forest.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + class_weight = hyperparams.Union( + configuration=OrderedDict({ + 'str': hyperparams.Enumeration[str]( + default='balanced', + values=['balanced', 'balanced_subsample'], + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'none': hyperparams.Constant( + default=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='none', + description='Weights associated with classes in the form ``{class_label: weight}``. If not given, all classes are supposed to have weight one. For multi-output problems, a list of dicts can be provided in the same order as the columns of y. The "balanced" mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as ``n_samples / (n_classes * np.bincount(y))`` The "balanced_subsample" mode is the same as "balanced" except that weights are computed based on the bootstrap sample for every tree grown. For multi-output, the weights of each column of y will be multiplied. 
Note that these weights will be multiplied with sample_weight (passed through the fit method) if sample_weight is specified.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + use_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to use as training input. If any specified column cannot be parsed, it is skipped.", + ) + use_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to use as training target. If any specified column cannot be parsed, it is skipped.", + ) + exclude_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not use as training inputs. Applicable only if \"use_columns\" is not provided.", + ) + exclude_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not use as training target. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='new', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. 
To prevent pipelines from breaking set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute', 'https://metadata.datadrivendiscovery.org/types/PredictedTarget'], + default='https://metadata.datadrivendiscovery.org/types/PredictedTarget', + description='Decides what semantic type to attach to generated output', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + +class SKExtraTreesClassifier(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams], + ProbabilisticCompositionalityMixin[Inputs, Outputs, Params, Hyperparams]): + """ + Primitive wrapping for sklearn ExtraTreesClassifier + `sklearn documentation <https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesClassifier.html>`_ + + """ + + __author__ = "JPL MARVIN" + metadata = metadata_base.PrimitiveMetadata({ + "algorithm_types": [metadata_base.PrimitiveAlgorithmType.DECISION_TREE, ], + "name": "sklearn.ensemble.forest.ExtraTreesClassifier", + "primitive_family": metadata_base.PrimitiveFamily.CLASSIFICATION, + "python_path": "d3m.primitives.classification.extra_trees.SKlearn", + "source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesClassifier.html']}, + "version": "2019.11.13", + "id": "c8a28f02-ef4a-35a8-87f1-cf79980f5c3e", + "hyperparams_to_tune": ['n_estimators', 'max_depth', 'min_samples_split', 'min_samples_leaf', 'max_features'], + 'installation': [ + {'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + }] + }) + + def __init__(self, *, + hyperparams: Hyperparams, + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None, + _verbose: int = 0) -> None: + + super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) + + # False + self._clf = ExtraTreesClassifier( + n_estimators=self.hyperparams['n_estimators'], + criterion=self.hyperparams['criterion'], + max_depth=self.hyperparams['max_depth'], + min_samples_split=self.hyperparams['min_samples_split'], + min_samples_leaf=self.hyperparams['min_samples_leaf'], + min_weight_fraction_leaf=self.hyperparams['min_weight_fraction_leaf'], + max_leaf_nodes=self.hyperparams['max_leaf_nodes'], + max_features=self.hyperparams['max_features'], + min_impurity_decrease=self.hyperparams['min_impurity_decrease'], + bootstrap=self.hyperparams['bootstrap'] in ['bootstrap', 'bootstrap_with_oob_score'], + oob_score=self.hyperparams['bootstrap'] in ['bootstrap_with_oob_score'], + n_jobs=self.hyperparams['n_jobs'], + warm_start=self.hyperparams['warm_start'], + class_weight=self.hyperparams['class_weight'], + random_state=self.random_seed, + verbose=_verbose + ) + + self._inputs = None + self._outputs = None + self._training_inputs = None + self._training_outputs = None + self._target_names = None + self._training_indices = None + self._target_column_indices = None + self._target_columns_metadata: List[OrderedDict] = None + self._input_column_names = None + self._fitted = False + self._new_training_data = False + + def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None: + self._inputs = inputs + self._outputs = outputs + self._fitted =
False + self._new_training_data = True + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + if self._inputs is None or self._outputs is None: + raise ValueError("Missing training data.") + + if not self._new_training_data: + return CallResult(None) + self._new_training_data = False + + self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams) + self._training_outputs, self._target_names, self._target_column_indices = self._get_targets(self._outputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + if len(self._training_indices) > 0 and len(self._target_column_indices) > 0: + self._target_columns_metadata = self._get_target_columns_metadata(self._training_outputs.metadata, self.hyperparams) + sk_training_output = self._training_outputs.values + + shape = sk_training_output.shape + if len(shape) == 2 and shape[1] == 1: + sk_training_output = numpy.ravel(sk_training_output) + + self._clf.fit(self._training_inputs, sk_training_output) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + return CallResult(None) + + + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + sk_inputs, columns_to_use = self._get_columns_to_fit(inputs, self.hyperparams) + output = [] + if len(sk_inputs.columns): + try: + sk_output = self._clf.predict(sk_inputs) + except sklearn.exceptions.NotFittedError as error: + raise PrimitiveNotFittedError("Primitive not fitted.") from error + # For primitives that allow predicting without fitting like GaussianProcessRegressor + if not self._fitted: + raise PrimitiveNotFittedError("Primitive not fitted.") + if sparse.issparse(sk_output): + sk_output = sk_output.toarray() + output = self._wrap_predictions(inputs, sk_output) + output.columns = self._target_names + output = [output] + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._target_column_indices, + columns_list=output) + + return CallResult(outputs) + + + def get_params(self) -> Params: + if not self._fitted: + return Params( + estimators_=None, + classes_=None, + n_classes_=None, + n_features_=None, + n_outputs_=None, + oob_score_=None, + oob_decision_function_=None, + base_estimator_=None, + estimator_params=None, + base_estimator=None, + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + return Params( + estimators_=getattr(self._clf, 'estimators_', None), + classes_=getattr(self._clf, 'classes_', None), + n_classes_=getattr(self._clf, 'n_classes_', None), + n_features_=getattr(self._clf, 'n_features_', None), + n_outputs_=getattr(self._clf, 'n_outputs_', None), + oob_score_=getattr(self._clf, 'oob_score_', None), + oob_decision_function_=getattr(self._clf, 'oob_decision_function_', None), + base_estimator_=getattr(self._clf, 'base_estimator_', None), + estimator_params=getattr(self._clf, 'estimator_params', None), + 
base_estimator=getattr(self._clf, 'base_estimator', None), + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + def set_params(self, *, params: Params) -> None: + self._clf.estimators_ = params['estimators_'] + self._clf.classes_ = params['classes_'] + self._clf.n_classes_ = params['n_classes_'] + self._clf.n_features_ = params['n_features_'] + self._clf.n_outputs_ = params['n_outputs_'] + self._clf.oob_score_ = params['oob_score_'] + self._clf.oob_decision_function_ = params['oob_decision_function_'] + self._clf.base_estimator_ = params['base_estimator_'] + self._clf.estimator_params = params['estimator_params'] + self._clf.base_estimator = params['base_estimator'] + self._input_column_names = params['input_column_names'] + self._training_indices = params['training_indices_'] + self._target_names = params['target_names_'] + self._target_column_indices = params['target_column_indices_'] + self._target_columns_metadata = params['target_columns_metadata_'] + + if params['estimators_'] is not None: + self._fitted = True + if params['classes_'] is not None: + self._fitted = True + if params['n_classes_'] is not None: + self._fitted = True + if params['n_features_'] is not None: + self._fitted = True + if params['n_outputs_'] is not None: + self._fitted = True + if params['oob_score_'] is not None: + self._fitted = True + if params['oob_decision_function_'] is not None: + self._fitted = True + if params['base_estimator_'] is not None: + self._fitted = True + if params['estimator_params'] is not None: + self._fitted = True + if params['base_estimator'] is not None: + self._fitted = True + + + def log_likelihoods(self, *, + outputs: Outputs, + inputs: Inputs, + timeout: float = None, + iterations: int = None) -> CallResult[Sequence[float]]: + inputs = inputs.iloc[:, self._training_indices] # Get ndarray + outputs = outputs.iloc[:, self._target_column_indices] + + if len(inputs.columns) and len(outputs.columns): + + if outputs.shape[1] != self._clf.n_outputs_: + raise exceptions.InvalidArgumentValueError("\"outputs\" argument does not have the correct number of target columns.") + + log_proba = self._clf.predict_log_proba(inputs) + + # Making it always a list, even when only one target. + if self._clf.n_outputs_ == 1: + log_proba = [log_proba] + classes = [self._clf.classes_] + else: + classes = self._clf.classes_ + + samples_length = inputs.shape[0] + + log_likelihoods = [] + for k in range(self._clf.n_outputs_): + # We have to map each class to its internal (numerical) index used in the learner. + # This allows "outputs" to contain string classes. + outputs_column = outputs.iloc[:, k] + classes_map = pandas.Series(numpy.arange(len(classes[k])), index=classes[k]) + mapped_outputs_column = outputs_column.map(classes_map) + + # For each target column (column in "outputs"), for each sample (row) we pick the log + # likelihood for a given class. 
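+                # Illustrative example: if classes_[k] were array(['a', 'b']) and the true labels for three
+                # samples were ['a', 'b', 'a'], then mapped_outputs_column would be [0, 1, 0], and the fancy
+                # index log_proba[k][numpy.arange(3), [0, 1, 0]] below would select, per row, the
+                # log-probability assigned to that row's true class.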
+ log_likelihoods.append(log_proba[k][numpy.arange(samples_length), mapped_outputs_column]) + + results = d3m_dataframe(dict(enumerate(log_likelihoods)), generate_metadata=True) + results.columns = outputs.columns + + for k in range(self._clf.n_outputs_): + column_metadata = outputs.metadata.query_column(k) + if 'name' in column_metadata: + results.metadata = results.metadata.update_column(k, {'name': column_metadata['name']}) + + else: + results = d3m_dataframe(generate_metadata=True) + + return CallResult(results) + + + + def produce_feature_importances(self, *, timeout: float = None, iterations: int = None) -> CallResult[d3m_dataframe]: + output = d3m_dataframe(self._clf.feature_importances_.reshape((1, len(self._input_column_names)))) + output.columns = self._input_column_names + for i in range(len(self._input_column_names)): + output.metadata = output.metadata.update_column(i, {"name": self._input_column_names[i]}) + return CallResult(output) + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=hyperparams['use_inputs_columns'], + exclude_columns=hyperparams['exclude_inputs_columns'], + can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, numpy.integer, numpy.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + @classmethod + def _get_targets(cls, data: d3m_dataframe, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return data, list(data.columns), list(range(len(data.columns))) + + metadata = data.metadata + + def can_produce_column(column_index: int) -> bool: + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/TrueTarget") + column_metadata = metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + semantic_types = set(column_metadata.get('semantic_types', [])) + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + return False + + target_column_indices, target_columns_not_to_produce = base_utils.get_columns_to_use(metadata, + use_columns=hyperparams[ + 'use_outputs_columns'], + exclude_columns= + hyperparams[ + 
'exclude_outputs_columns'], + can_use_column=can_produce_column) + targets = [] + if target_column_indices: + targets = data.select_columns(target_column_indices) + target_column_names = [] + for idx in target_column_indices: + target_column_names.append(data.columns[idx]) + return targets, target_column_names, target_column_indices + + @classmethod + def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]: + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) + + # Update semantic types and prepare it for predicted targets. + semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set(["https://metadata.datadrivendiscovery.org/types/TrueTarget","https://metadata.datadrivendiscovery.org/types/SuggestedTarget",]) + add_semantic_types = set(["https://metadata.datadrivendiscovery.org/types/PredictedTarget",]) + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + outputs = d3m_dataframe(predictions, generate_metadata=False) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, self._target_columns_metadata) + return outputs + + + @classmethod + def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata): + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict() + semantic_types = [] + semantic_types.append('https://metadata.datadrivendiscovery.org/types/PredictedTarget') + column_name = outputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name") + if column_name is None: + column_name = "output_{}".format(column_index) + column_metadata["semantic_types"] = semantic_types + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + +SKExtraTreesClassifier.__doc__ = ExtraTreesClassifier.__doc__ \ No newline at end of file diff --git a/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKExtraTreesRegressor.py b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKExtraTreesRegressor.py new file mode 100644 index 0000000..4e4b10c --- /dev/null +++ b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKExtraTreesRegressor.py @@ -0,0 +1,607 @@ +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple +from 
numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os +import sklearn +import numpy +import typing + +# Custom import commands if any +from sklearn.ensemble.forest import ExtraTreesRegressor + + +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult, DockerContainer + +from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase +from d3m.primitive_interfaces.base import ProbabilisticCompositionalityMixin, ContinueFitMixin +from d3m import exceptions +import pandas + + + +Inputs = d3m_dataframe +Outputs = d3m_dataframe + + +class Params(params.Params): + estimators_: Optional[List[sklearn.tree.ExtraTreeRegressor]] + n_features_: Optional[int] + n_outputs_: Optional[int] + oob_score_: Optional[float] + oob_prediction_: Optional[ndarray] + base_estimator_: Optional[object] + estimator_params: Optional[tuple] + class_weight: Optional[Union[str, dict, List[dict]]] + base_estimator: Optional[object] + input_column_names: Optional[Any] + target_names_: Optional[Sequence[Any]] + training_indices_: Optional[Sequence[int]] + target_column_indices_: Optional[Sequence[int]] + target_columns_metadata_: Optional[List[OrderedDict]] + + + +class Hyperparams(hyperparams.Hyperparams): + n_estimators = hyperparams.Bounded[int]( + default=10, + lower=1, + upper=None, + description='The number of trees in the forest.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + criterion = hyperparams.Enumeration[str]( + values=['mse', 'mae'], + default='mse', + description='The function to measure the quality of a split. Supported criteria are "mse" for the mean squared error, which is equal to variance reduction as feature selection criterion, and "mae" for the mean absolute error. .. versionadded:: 0.18 Mean Absolute Error (MAE) criterion.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + max_depth = hyperparams.Union( + configuration=OrderedDict({ + 'int': hyperparams.Bounded[int]( + lower=0, + upper=None, + default=5, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'none': hyperparams.Constant( + default=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='none', + description='The maximum depth of the tree. 
If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + min_samples_split = hyperparams.Union( + configuration=OrderedDict({ + 'float': hyperparams.Bounded[float]( + lower=0, + upper=1, + default=1.0, + description='It\'s a percentage and `ceil(min_samples_split * n_samples)` is the minimum number of samples for each split.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'int': hyperparams.Bounded[int]( + lower=0, + upper=None, + default=2, + description='Minimum number.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='int', + description='The minimum number of samples required to split an internal node: - If int, then consider `min_samples_split` as the minimum number. - If float, then `min_samples_split` is a percentage and `ceil(min_samples_split * n_samples)` are the minimum number of samples for each split. .. versionchanged:: 0.18 Added float values for percentages.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + min_samples_leaf = hyperparams.Union( + configuration=OrderedDict({ + 'percent': hyperparams.Bounded[float]( + lower=0, + upper=0.5, + default=0.25, + description='It\'s a percentage and `ceil(min_samples_leaf * n_samples)` is the minimum number of samples for each node.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'absolute': hyperparams.Bounded[int]( + lower=1, + upper=None, + default=1, + description='Minimum number.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='absolute', + description='The minimum number of samples required to be at a leaf node: - If int, then consider `min_samples_leaf` as the minimum number. - If float, then `min_samples_leaf` is a percentage and `ceil(min_samples_leaf * n_samples)` are the minimum number of samples for each node. .. versionchanged:: 0.18 Added float values for percentages.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + min_weight_fraction_leaf = hyperparams.Bounded[float]( + default=0, + lower=0, + upper=0.5, + description='The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + max_leaf_nodes = hyperparams.Union( + configuration=OrderedDict({ + 'int': hyperparams.Bounded[int]( + lower=0, + upper=None, + default=10, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'none': hyperparams.Constant( + default=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='none', + description='Grow trees with ``max_leaf_nodes`` in best-first fashion. Best nodes are defined as relative reduction in impurity. 
If None then unlimited number of leaf nodes.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + max_features = hyperparams.Union( + configuration=OrderedDict({ + 'specified_int': hyperparams.Bounded[int]( + lower=0, + upper=None, + default=0, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'calculated': hyperparams.Enumeration[str]( + values=['auto', 'sqrt', 'log2'], + default='auto', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'none': hyperparams.Constant( + default=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'percent': hyperparams.Bounded[float]( + default=0.25, + lower=0, + upper=1, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='calculated', + description='The number of features to consider when looking for the best split: - If int, then consider `max_features` features at each split. - If float, then `max_features` is a percentage and `int(max_features * n_features)` features are considered at each split. - If "auto", then `max_features=n_features`. - If "sqrt", then `max_features=sqrt(n_features)`. - If "log2", then `max_features=log2(n_features)`. - If None, then `max_features=n_features`. Note: the search for a split does not stop until at least one valid partition of the node samples is found, even if it requires to effectively inspect more than ``max_features`` features.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + min_impurity_decrease = hyperparams.Bounded[float]( + default=0.0, + lower=0.0, + upper=None, + description='A node will be split if this split induces a decrease of the impurity greater than or equal to this value. The weighted impurity decrease equation is the following:: N_t / N * (impurity - N_t_R / N_t * right_impurity - N_t_L / N_t * left_impurity) where ``N`` is the total number of samples, ``N_t`` is the number of samples at the current node, ``N_t_L`` is the number of samples in the left child, and ``N_t_R`` is the number of samples in the right child. ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, if ``sample_weight`` is passed. .. versionadded:: 0.19 ', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + bootstrap = hyperparams.Enumeration[str]( + values=['bootstrap', 'bootstrap_with_oob_score', 'disabled'], + default='bootstrap', + description='Whether bootstrap samples are used when building trees.' 
+ ' And whether to use out-of-bag samples to estimate the generalization accuracy.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + warm_start = hyperparams.UniformBool( + default=False, + description='When set to ``True``, reuse the solution of the previous call to fit and add more estimators to the ensemble, otherwise, just fit a whole new forest.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + n_jobs = hyperparams.Union( + configuration=OrderedDict({ + 'limit': hyperparams.Bounded[int]( + default=1, + lower=1, + upper=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'all_cores': hyperparams.Constant( + default=-1, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='limit', + description='The number of jobs to run in parallel for both `fit` and `predict`. If -1, then the number of jobs is set to the number of cores.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ResourcesUseParameter'] + ) + + use_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to use as training input. If any specified column cannot be parsed, it is skipped.", + ) + use_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to use as training target. If any specified column cannot be parsed, it is skipped.", + ) + exclude_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not use as training inputs. Applicable only if \"use_columns\" is not provided.", + ) + exclude_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not use as training target. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='new', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. 
Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute', 'https://metadata.datadrivendiscovery.org/types/PredictedTarget'], + default='https://metadata.datadrivendiscovery.org/types/PredictedTarget', + description='Decides what semantic type to attach to generated output', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + +class SKExtraTreesRegressor(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]): + """ + Primitive wrapping for sklearn ExtraTreesRegressor + `sklearn documentation <https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesRegressor.html>`_ + + """ + + __author__ = "JPL MARVIN" + metadata = metadata_base.PrimitiveMetadata({ + "algorithm_types": [metadata_base.PrimitiveAlgorithmType.DECISION_TREE, ], + "name": "sklearn.ensemble.forest.ExtraTreesRegressor", + "primitive_family": metadata_base.PrimitiveFamily.REGRESSION, + "python_path": "d3m.primitives.regression.extra_trees.SKlearn", + "source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesRegressor.html']}, + "version": "2019.11.13", + "id": "35321059-2a1a-31fd-9509-5494efc751c7", + "hyperparams_to_tune": ['n_estimators', 'max_depth', 'min_samples_split', 'min_samples_leaf', 'max_features'], + 'installation': [ + {'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + }] + }) + + def __init__(self, *, + hyperparams: Hyperparams, + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None, + _verbose: int = 0) -> None: + + super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) + + # False + self._clf = ExtraTreesRegressor( + n_estimators=self.hyperparams['n_estimators'], + criterion=self.hyperparams['criterion'], + max_depth=self.hyperparams['max_depth'], + min_samples_split=self.hyperparams['min_samples_split'], + min_samples_leaf=self.hyperparams['min_samples_leaf'], + min_weight_fraction_leaf=self.hyperparams['min_weight_fraction_leaf'], + max_leaf_nodes=self.hyperparams['max_leaf_nodes'], + max_features=self.hyperparams['max_features'], + min_impurity_decrease=self.hyperparams['min_impurity_decrease'], + bootstrap=self.hyperparams['bootstrap'] in ['bootstrap', 'bootstrap_with_oob_score'], + oob_score=self.hyperparams['bootstrap'] in ['bootstrap_with_oob_score'], + warm_start=self.hyperparams['warm_start'], + n_jobs=self.hyperparams['n_jobs'], + random_state=self.random_seed, + verbose=_verbose + ) + + self._inputs = None + self._outputs = None + self._training_inputs = None + self._training_outputs = None + self._target_names = None + self._training_indices = None + self._target_column_indices = None + self._target_columns_metadata: List[OrderedDict] = None + self._input_column_names = None + self._fitted =
False + self._new_training_data = False + + def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None: + self._inputs = inputs + self._outputs = outputs + self._fitted = False + self._new_training_data = True + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + if self._inputs is None or self._outputs is None: + raise ValueError("Missing training data.") + + if not self._new_training_data: + return CallResult(None) + self._new_training_data = False + + self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams) + self._training_outputs, self._target_names, self._target_column_indices = self._get_targets(self._outputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + if len(self._training_indices) > 0 and len(self._target_column_indices) > 0: + self._target_columns_metadata = self._get_target_columns_metadata(self._training_outputs.metadata, self.hyperparams) + sk_training_output = self._training_outputs.values + + shape = sk_training_output.shape + if len(shape) == 2 and shape[1] == 1: + sk_training_output = numpy.ravel(sk_training_output) + + self._clf.fit(self._training_inputs, sk_training_output) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + return CallResult(None) + + + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + sk_inputs, columns_to_use = self._get_columns_to_fit(inputs, self.hyperparams) + output = [] + if len(sk_inputs.columns): + try: + sk_output = self._clf.predict(sk_inputs) + except sklearn.exceptions.NotFittedError as error: + raise PrimitiveNotFittedError("Primitive not fitted.") from error + # For primitives that allow predicting without fitting like GaussianProcessRegressor + if not self._fitted: + raise PrimitiveNotFittedError("Primitive not fitted.") + if sparse.issparse(sk_output): + sk_output = sk_output.toarray() + output = self._wrap_predictions(inputs, sk_output) + output.columns = self._target_names + output = [output] + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._target_column_indices, + columns_list=output) + + return CallResult(outputs) + + + def get_params(self) -> Params: + if not self._fitted: + return Params( + estimators_=None, + n_features_=None, + n_outputs_=None, + oob_score_=None, + oob_prediction_=None, + base_estimator_=None, + estimator_params=None, + class_weight=None, + base_estimator=None, + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + return Params( + estimators_=getattr(self._clf, 'estimators_', None), + n_features_=getattr(self._clf, 'n_features_', None), + n_outputs_=getattr(self._clf, 'n_outputs_', None), + oob_score_=getattr(self._clf, 'oob_score_', None), + oob_prediction_=getattr(self._clf, 'oob_prediction_', None), + base_estimator_=getattr(self._clf, 'base_estimator_', None), + estimator_params=getattr(self._clf, 
'estimator_params', None), + class_weight=getattr(self._clf, 'class_weight', None), + base_estimator=getattr(self._clf, 'base_estimator', None), + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + def set_params(self, *, params: Params) -> None: + self._clf.estimators_ = params['estimators_'] + self._clf.n_features_ = params['n_features_'] + self._clf.n_outputs_ = params['n_outputs_'] + self._clf.oob_score_ = params['oob_score_'] + self._clf.oob_prediction_ = params['oob_prediction_'] + self._clf.base_estimator_ = params['base_estimator_'] + self._clf.estimator_params = params['estimator_params'] + self._clf.class_weight = params['class_weight'] + self._clf.base_estimator = params['base_estimator'] + self._input_column_names = params['input_column_names'] + self._training_indices = params['training_indices_'] + self._target_names = params['target_names_'] + self._target_column_indices = params['target_column_indices_'] + self._target_columns_metadata = params['target_columns_metadata_'] + + if params['estimators_'] is not None: + self._fitted = True + if params['n_features_'] is not None: + self._fitted = True + if params['n_outputs_'] is not None: + self._fitted = True + if params['oob_score_'] is not None: + self._fitted = True + if params['oob_prediction_'] is not None: + self._fitted = True + if params['base_estimator_'] is not None: + self._fitted = True + if params['estimator_params'] is not None: + self._fitted = True + if params['class_weight'] is not None: + self._fitted = True + if params['base_estimator'] is not None: + self._fitted = True + + + + + + def produce_feature_importances(self, *, timeout: float = None, iterations: int = None) -> CallResult[d3m_dataframe]: + output = d3m_dataframe(self._clf.feature_importances_.reshape((1, len(self._input_column_names)))) + output.columns = self._input_column_names + for i in range(len(self._input_column_names)): + output.metadata = output.metadata.update_column(i, {"name": self._input_column_names[i]}) + return CallResult(output) + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=hyperparams['use_inputs_columns'], + exclude_columns=hyperparams['exclude_inputs_columns'], + can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, numpy.integer, numpy.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + + if len(semantic_types) == 0: + 
cls.logger.warning("No semantic types found in column metadata") + return False + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + @classmethod + def _get_targets(cls, data: d3m_dataframe, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return data, list(data.columns), list(range(len(data.columns))) + + metadata = data.metadata + + def can_produce_column(column_index: int) -> bool: + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/TrueTarget") + column_metadata = metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + semantic_types = set(column_metadata.get('semantic_types', [])) + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + return False + + target_column_indices, target_columns_not_to_produce = base_utils.get_columns_to_use(metadata, + use_columns=hyperparams[ + 'use_outputs_columns'], + exclude_columns= + hyperparams[ + 'exclude_outputs_columns'], + can_use_column=can_produce_column) + targets = [] + if target_column_indices: + targets = data.select_columns(target_column_indices) + target_column_names = [] + for idx in target_column_indices: + target_column_names.append(data.columns[idx]) + return targets, target_column_names, target_column_indices + + @classmethod + def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]: + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) + + # Update semantic types and prepare it for predicted targets. 
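+            # For example, a target column annotated with the TrueTarget and SuggestedTarget semantic types
+            # would have those two types removed by the block below, with PredictedTarget (plus whatever type
+            # is configured in the 'return_semantic_type' hyperparameter, PredictedTarget by default) added
+            # in their place.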
+ semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set(["https://metadata.datadrivendiscovery.org/types/TrueTarget","https://metadata.datadrivendiscovery.org/types/SuggestedTarget",]) + add_semantic_types = set(["https://metadata.datadrivendiscovery.org/types/PredictedTarget",]) + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + outputs = d3m_dataframe(predictions, generate_metadata=False) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, self._target_columns_metadata) + return outputs + + + @classmethod + def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata): + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict() + semantic_types = [] + semantic_types.append('https://metadata.datadrivendiscovery.org/types/PredictedTarget') + column_name = outputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name") + if column_name is None: + column_name = "output_{}".format(column_index) + column_metadata["semantic_types"] = semantic_types + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + +SKExtraTreesRegressor.__doc__ = ExtraTreesRegressor.__doc__ \ No newline at end of file diff --git a/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKFastICA.py b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKFastICA.py new file mode 100644 index 0000000..f160a02 --- /dev/null +++ b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKFastICA.py @@ -0,0 +1,439 @@ +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os +import sklearn +import numpy +import typing + +# Custom import commands if any +from sklearn.decomposition.fastica_ import FastICA + + +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult, DockerContainer +from d3m.primitive_interfaces.unsupervised_learning import UnsupervisedLearnerPrimitiveBase + + +Inputs = d3m_dataframe +Outputs = d3m_dataframe + + +class Params(params.Params): + n_iter_: Optional[int] + mixing_: Optional[ndarray] + components_: 
Optional[ndarray] + mean_: Optional[ndarray] + whitening_: Optional[ndarray] + input_column_names: Optional[Any] + target_names_: Optional[Sequence[Any]] + training_indices_: Optional[Sequence[int]] + target_column_indices_: Optional[Sequence[int]] + target_columns_metadata_: Optional[List[OrderedDict]] + + + +class Hyperparams(hyperparams.Hyperparams): + n_components = hyperparams.Union( + configuration=OrderedDict({ + 'int': hyperparams.Bounded[int]( + lower=0, + upper=None, + default=0, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'none': hyperparams.Constant( + default=None, + description='All components are used.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='none', + description='Number of components to extract. If None no dimension reduction is performed.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + algorithm = hyperparams.Enumeration[str]( + default='parallel', + values=['parallel', 'deflation'], + description='Apply a parallel or deflational FASTICA algorithm.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + whiten = hyperparams.UniformBool( + default=True, + description='If True perform an initial whitening of the data. If False, the data is assumed to have already been preprocessed: it should be centered, normed and white. Otherwise you will get incorrect results. In this case the parameter n_components will be ignored.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + fun = hyperparams.Choice( + choices={ + 'logcosh': hyperparams.Hyperparams.define( + configuration=OrderedDict({ + 'alpha': hyperparams.Hyperparameter[float]( + default=1, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + }) + ), + 'exp': hyperparams.Hyperparams.define( + configuration=OrderedDict({}) + ), + 'cube': hyperparams.Hyperparams.define( + configuration=OrderedDict({}) + ) + }, + default='logcosh', + description='The functional form of the G function used in the approximation to neg-entropy. Could be either \'logcosh\', \'exp\', or \'cube\'. You can also provide your own function. It should return a tuple containing the value of the function, and of its derivative, in the point. Example: def my_g(x): return x ** 3, 3 * x ** 2', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + max_iter = hyperparams.Bounded[int]( + default=200, + lower=0, + upper=None, + description='Maximum number of iterations to perform. tol: float, optional A positive scalar giving the tolerance at which the un-mixing matrix is considered to have converged.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + tol = hyperparams.Bounded[float]( + default=0.0001, + lower=0, + upper=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + w_init = hyperparams.Union( + configuration=OrderedDict({ + 'ndarray': hyperparams.Hyperparameter[ndarray]( + default=numpy.array([]), + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'none': hyperparams.Constant( + default=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='none', + description='Initial un-mixing array of dimension (n.comp,n.comp). 
If None (default) then an array of normal r.v.\'s is used.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + + use_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to operate on. If any specified column cannot be parsed, it is skipped.", + ) + exclude_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='new', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. 
To prevent pipelines from breaking set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute'], + default='https://metadata.datadrivendiscovery.org/types/Attribute', + description='Decides what semantic type to attach to generated attributes', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + +class SKFastICA(UnsupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]): + """ + Primitive wrapping for sklearn FastICA + `sklearn documentation `_ + + """ + + __author__ = "JPL MARVIN" + metadata = metadata_base.PrimitiveMetadata({ + "algorithm_types": [metadata_base.PrimitiveAlgorithmType.PRINCIPAL_COMPONENT_ANALYSIS, ], + "name": "sklearn.decomposition.fastica_.FastICA", + "primitive_family": metadata_base.PrimitiveFamily.DATA_TRANSFORMATION, + "python_path": "d3m.primitives.data_transformation.fast_ica.SKlearn", + "source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.FastICA.html']}, + "version": "2019.11.13", + "id": "03633ffa-425e-37d4-9f1c-bbb552f1e995", + "hyperparams_to_tune": ['n_components', 'algorithm'], + 'installation': [ + {'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + }] + }) + + def __init__(self, *, + hyperparams: Hyperparams, + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None) -> None: + + super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) + + # False + self._clf = FastICA( + n_components=self.hyperparams['n_components'], + algorithm=self.hyperparams['algorithm'], + whiten=self.hyperparams['whiten'], + fun=self.hyperparams['fun']['choice'], + fun_args=self.hyperparams['fun'], + max_iter=self.hyperparams['max_iter'], + tol=self.hyperparams['tol'], + w_init=self.hyperparams['w_init'], + random_state=self.random_seed, + ) + + self._inputs = None + self._outputs = None + self._training_inputs = None + self._training_outputs = None + self._target_names = None + self._training_indices = None + self._target_column_indices = None + self._target_columns_metadata: List[OrderedDict] = None + self._input_column_names = None + self._fitted = False + + + def set_training_data(self, *, inputs: Inputs) -> None: + self._inputs = inputs + self._fitted = False + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + if self._fitted: + return CallResult(None) + + self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + if self._training_inputs is None: + return CallResult(None) + + if len(self._training_indices) > 0: + self._clf.fit(self._training_inputs) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + return CallResult(None) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + if not self._fitted: + raise 
PrimitiveNotFittedError("Primitive not fitted.") + sk_inputs = inputs + if self.hyperparams['use_semantic_types']: + sk_inputs = inputs.iloc[:, self._training_indices] + output_columns = [] + if len(self._training_indices) > 0: + sk_output = self._clf.transform(sk_inputs) + if sparse.issparse(sk_output): + sk_output = sk_output.toarray() + outputs = self._wrap_predictions(inputs, sk_output) + if len(outputs.columns) == len(self._input_column_names): + outputs.columns = self._input_column_names + output_columns = [outputs] + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._training_indices, + columns_list=output_columns) + return CallResult(outputs) + + + def get_params(self) -> Params: + if not self._fitted: + return Params( + n_iter_=None, + mixing_=None, + components_=None, + mean_=None, + whitening_=None, + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + return Params( + n_iter_=getattr(self._clf, 'n_iter_', None), + mixing_=getattr(self._clf, 'mixing_', None), + components_=getattr(self._clf, 'components_', None), + mean_=getattr(self._clf, 'mean_', None), + whitening_=getattr(self._clf, 'whitening_', None), + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + def set_params(self, *, params: Params) -> None: + self._clf.n_iter_ = params['n_iter_'] + self._clf.mixing_ = params['mixing_'] + self._clf.components_ = params['components_'] + self._clf.mean_ = params['mean_'] + self._clf.whitening_ = params['whitening_'] + self._input_column_names = params['input_column_names'] + self._training_indices = params['training_indices_'] + self._target_names = params['target_names_'] + self._target_column_indices = params['target_column_indices_'] + self._target_columns_metadata = params['target_columns_metadata_'] + + if params['n_iter_'] is not None: + self._fitted = True + if params['mixing_'] is not None: + self._fitted = True + if params['components_'] is not None: + self._fitted = True + if params['mean_'] is not None: + self._fitted = True + if params['whitening_'] is not None: + self._fitted = True + + + + + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=hyperparams['use_columns'], + exclude_columns=hyperparams['exclude_columns'], + can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: + 
column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, numpy.integer, numpy.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + + @classmethod + def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]: + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) + + # Update semantic types and prepare it for predicted targets. + semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set([]) + add_semantic_types = [] + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + outputs = d3m_dataframe(predictions, generate_metadata=True) + target_columns_metadata = self._copy_inputs_metadata(inputs.metadata, self._training_indices, outputs.metadata, self.hyperparams) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, target_columns_metadata) + return outputs + + + @classmethod + def _copy_inputs_metadata(cls, inputs_metadata: metadata_base.DataMetadata, input_indices: List[int], + outputs_metadata: metadata_base.DataMetadata, hyperparams): + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + target_columns_metadata: List[OrderedDict] = [] + for column_index in input_indices: + column_name = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name") + if column_name is None: + column_name = "output_{}".format(column_index) + + column_metadata = OrderedDict(inputs_metadata.query_column(column_index)) + semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set([]) + add_semantic_types = set() + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = 
list(semantic_types) + + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + # If outputs has more columns than index, add Attribute Type to all remaining + if outputs_length > len(input_indices): + for column_index in range(len(input_indices), outputs_length): + column_metadata = OrderedDict() + semantic_types = set() + semantic_types.add(hyperparams["return_semantic_type"]) + column_name = "output_{}".format(column_index) + column_metadata["semantic_types"] = list(semantic_types) + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + +SKFastICA.__doc__ = FastICA.__doc__ \ No newline at end of file diff --git a/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKFeatureAgglomeration.py b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKFeatureAgglomeration.py new file mode 100644 index 0000000..36c1411 --- /dev/null +++ b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKFeatureAgglomeration.py @@ -0,0 +1,361 @@ +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os +import sklearn +import numpy +import typing + +# Custom import commands if any +from sklearn.cluster.hierarchical import FeatureAgglomeration +from numpy import mean as npmean + + +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult, DockerContainer +from d3m.primitive_interfaces.unsupervised_learning import UnsupervisedLearnerPrimitiveBase + + +Inputs = d3m_dataframe +Outputs = d3m_dataframe + + +class Params(params.Params): + labels_: Optional[ndarray] + n_leaves_: Optional[int] + children_: Optional[ndarray] + input_column_names: Optional[Any] + target_names_: Optional[Sequence[Any]] + training_indices_: Optional[Sequence[int]] + target_column_indices_: Optional[Sequence[int]] + target_columns_metadata_: Optional[List[OrderedDict]] + + + +class Hyperparams(hyperparams.Hyperparams): + n_clusters = hyperparams.Bounded[int]( + default=2, + lower=0, + upper=None, + description='The number of clusters to find.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + affinity = hyperparams.Enumeration[str]( + default='euclidean', + values=['euclidean', 'l1', 'l2', 'manhattan', 'cosine', 'precomputed'], + description='Metric used to compute the linkage. Can be "euclidean", "l1", "l2", "manhattan", "cosine", or \'precomputed\'. If linkage is "ward", only "euclidean" is accepted.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + compute_full_tree = hyperparams.Union( + configuration=OrderedDict({ + 'auto': hyperparams.Constant( + default='auto', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'bool': hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='auto', + description='Stop early the construction of the tree at n_clusters. This is useful to decrease computation time if the number of clusters is not small compared to the number of features. 
This option is useful only when specifying a connectivity matrix. Note also that when varying the number of clusters and using caching, it may be advantageous to compute the full tree.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + linkage = hyperparams.Enumeration[str]( + default='ward', + values=['ward', 'complete', 'average', 'single'], + description='Which linkage criterion to use. The linkage criterion determines which distance to use between sets of features. The algorithm will merge the pairs of cluster that minimize this criterion. - ward minimizes the variance of the clusters being merged. - average uses the average of the distances of each feature of the two sets. - complete or maximum linkage uses the maximum distances between all features of the two sets.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + use_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to operate on. If any specified column cannot be parsed, it is skipped.", + ) + exclude_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='new', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. 
To prevent pipelines from breaking set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute'], + default='https://metadata.datadrivendiscovery.org/types/Attribute', + description='Decides what semantic type to attach to generated attributes', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + +class SKFeatureAgglomeration(UnsupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]): + """ + Primitive wrapping for sklearn FeatureAgglomeration + `sklearn documentation `_ + + """ + + __author__ = "JPL MARVIN" + metadata = metadata_base.PrimitiveMetadata({ + "algorithm_types": [metadata_base.PrimitiveAlgorithmType.DATA_STREAM_CLUSTERING, ], + "name": "sklearn.cluster.hierarchical.FeatureAgglomeration", + "primitive_family": metadata_base.PrimitiveFamily.DATA_PREPROCESSING, + "python_path": "d3m.primitives.data_preprocessing.feature_agglomeration.SKlearn", + "source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.cluster.FeatureAgglomeration.html']}, + "version": "2019.11.13", + "id": "f259b009-5e0f-37b1-b117-441aba2b65c8", + "hyperparams_to_tune": ['n_clusters', 'affinity', 'linkage'], + 'installation': [ + {'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + }] + }) + + def __init__(self, *, + hyperparams: Hyperparams, + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None) -> None: + + super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) + + # False + self._clf = FeatureAgglomeration( + n_clusters=self.hyperparams['n_clusters'], + affinity=self.hyperparams['affinity'], + compute_full_tree=self.hyperparams['compute_full_tree'], + linkage=self.hyperparams['linkage'], + ) + + self._inputs = None + self._outputs = None + self._training_inputs = None + self._training_outputs = None + self._target_names = None + self._training_indices = None + self._target_column_indices = None + self._target_columns_metadata: List[OrderedDict] = None + self._input_column_names = None + self._fitted = False + + + def set_training_data(self, *, inputs: Inputs) -> None: + self._inputs = inputs + self._fitted = False + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + if self._fitted: + return CallResult(None) + + self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + if self._training_inputs is None: + return CallResult(None) + + if len(self._training_indices) > 0: + self._clf.fit(self._training_inputs) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + return CallResult(None) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + if not self._fitted: + raise PrimitiveNotFittedError("Primitive not fitted.") + sk_inputs = inputs + if 
self.hyperparams['use_semantic_types']: + sk_inputs = inputs.iloc[:, self._training_indices] + output_columns = [] + if len(self._training_indices) > 0: + sk_output = self._clf.transform(sk_inputs) + if sparse.issparse(sk_output): + sk_output = sk_output.toarray() + outputs = self._wrap_predictions(inputs, sk_output) + if len(outputs.columns) == len(self._input_column_names): + outputs.columns = self._input_column_names + output_columns = [outputs] + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._training_indices, + columns_list=output_columns) + return CallResult(outputs) + + + def get_params(self) -> Params: + if not self._fitted: + return Params( + labels_=None, + n_leaves_=None, + children_=None, + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + return Params( + labels_=getattr(self._clf, 'labels_', None), + n_leaves_=getattr(self._clf, 'n_leaves_', None), + children_=getattr(self._clf, 'children_', None), + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + def set_params(self, *, params: Params) -> None: + self._clf.labels_ = params['labels_'] + self._clf.n_leaves_ = params['n_leaves_'] + self._clf.children_ = params['children_'] + self._input_column_names = params['input_column_names'] + self._training_indices = params['training_indices_'] + self._target_names = params['target_names_'] + self._target_column_indices = params['target_column_indices_'] + self._target_columns_metadata = params['target_columns_metadata_'] + + if params['labels_'] is not None: + self._fitted = True + if params['n_leaves_'] is not None: + self._fitted = True + if params['children_'] is not None: + self._fitted = True + + + + + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=hyperparams['use_columns'], + exclude_columns=hyperparams['exclude_columns'], + can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, numpy.integer, numpy.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = 
set(column_metadata.get('semantic_types', [])) + + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + + @classmethod + def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]: + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) + + # Update semantic types and prepare it for predicted targets. + semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set([]) + add_semantic_types = [] + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + outputs = d3m_dataframe(predictions, generate_metadata=True) + target_columns_metadata = self._add_target_columns_metadata(outputs.metadata, self.hyperparams) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, target_columns_metadata) + return outputs + + + @classmethod + def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams): + + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_name = "output_{}".format(column_index) + column_metadata = OrderedDict() + semantic_types = set() + semantic_types.add(hyperparams["return_semantic_type"]) + column_metadata['semantic_types'] = list(semantic_types) + + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + +SKFeatureAgglomeration.__doc__ = FeatureAgglomeration.__doc__ \ No newline at end of file diff --git a/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKGaussianNB.py b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKGaussianNB.py new file mode 100644 index 0000000..d132e05 --- /dev/null +++ b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKGaussianNB.py @@ -0,0 +1,492 @@ +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os +import sklearn +import numpy +import typing + +# Custom import commands if any +from sklearn.naive_bayes import GaussianNB + + +from d3m.container.numpy import ndarray as 
d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult, DockerContainer + +from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase +from d3m.primitive_interfaces.base import ProbabilisticCompositionalityMixin, ContinueFitMixin +from d3m import exceptions +import pandas + + + +Inputs = d3m_dataframe +Outputs = d3m_dataframe + + +class Params(params.Params): + class_prior_: Optional[ndarray] + class_count_: Optional[ndarray] + theta_: Optional[ndarray] + sigma_: Optional[ndarray] + classes_: Optional[ndarray] + epsilon_: Optional[float] + input_column_names: Optional[Any] + target_names_: Optional[Sequence[Any]] + training_indices_: Optional[Sequence[int]] + target_column_indices_: Optional[Sequence[int]] + target_columns_metadata_: Optional[List[OrderedDict]] + + + +class Hyperparams(hyperparams.Hyperparams): + var_smoothing = hyperparams.Bounded[float]( + lower=0, + upper=None, + default=1e-09, + description='Portion of the largest variance of all features that is added to variances for calculation stability.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + use_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to use as training input. If any specified column cannot be parsed, it is skipped.", + ) + use_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to use as training target. If any specified column cannot be parsed, it is skipped.", + ) + exclude_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not use as training inputs. Applicable only if \"use_columns\" is not provided.", + ) + exclude_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not use as training target. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='new', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. 
Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute', 'https://metadata.datadrivendiscovery.org/types/PredictedTarget'], + default='https://metadata.datadrivendiscovery.org/types/PredictedTarget', + description='Decides what semantic type to attach to generated output', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + +class SKGaussianNB(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams], + ProbabilisticCompositionalityMixin[Inputs, Outputs, Params, Hyperparams], + ContinueFitMixin[Inputs, Outputs, Params, Hyperparams]): + """ + Primitive wrapping for sklearn GaussianNB + `sklearn documentation `_ + + """ + + __author__ = "JPL MARVIN" + metadata = metadata_base.PrimitiveMetadata({ + "algorithm_types": [metadata_base.PrimitiveAlgorithmType.NAIVE_BAYES_CLASSIFIER, ], + "name": "sklearn.naive_bayes.GaussianNB", + "primitive_family": metadata_base.PrimitiveFamily.CLASSIFICATION, + "python_path": "d3m.primitives.classification.gaussian_naive_bayes.SKlearn", + "source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html']}, + "version": "2019.11.13", + "id": "464783a8-771e-340d-999b-ae90b9f84f0b", + "hyperparams_to_tune": ['var_smoothing'], + 'installation': [ + {'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + }] + }) + + def __init__(self, *, + hyperparams: Hyperparams, + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None, + _priors: Union[ndarray, None] = None) -> None: + + super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) + + # False + self._clf = GaussianNB( + var_smoothing=self.hyperparams['var_smoothing'], + priors=_priors + ) + + self._inputs = None + self._outputs = None + self._training_inputs = None + self._training_outputs = None + self._target_names = None + self._training_indices = None + self._target_column_indices = None + self._target_columns_metadata: List[OrderedDict] = None + self._input_column_names = None + self._fitted = False + self._new_training_data = False + + def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None: + self._inputs = inputs + self._outputs = outputs + self._fitted = False + self._new_training_data = True + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + if 
self._inputs is None or self._outputs is None: + raise ValueError("Missing training data.") + + if not self._new_training_data: + return CallResult(None) + self._new_training_data = False + + self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams) + self._training_outputs, self._target_names, self._target_column_indices = self._get_targets(self._outputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + if len(self._training_indices) > 0 and len(self._target_column_indices) > 0: + self._target_columns_metadata = self._get_target_columns_metadata(self._training_outputs.metadata, self.hyperparams) + sk_training_output = self._training_outputs.values + + shape = sk_training_output.shape + if len(shape) == 2 and shape[1] == 1: + sk_training_output = numpy.ravel(sk_training_output) + + self._clf.fit(self._training_inputs, sk_training_output) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + return CallResult(None) + + def continue_fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + if self._training_inputs is None or self._training_outputs is None: + raise ValueError("Missing training data.") + + if not self._new_training_data: + return CallResult(None) + self._new_training_data = False + + if len(self._training_indices) > 0 and len(self._target_column_indices) > 0: + self._target_columns_metadata = self._get_target_columns_metadata(self._training_outputs.metadata, self.hyperparams) + sk_training_output = self._training_outputs.values + + shape = sk_training_output.shape + if len(shape) == 2 and shape[1] == 1: + sk_training_output = numpy.ravel(sk_training_output) + + self._clf.partial_fit(self._training_inputs, sk_training_output) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + return CallResult(None) + + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + sk_inputs, columns_to_use = self._get_columns_to_fit(inputs, self.hyperparams) + output = [] + if len(sk_inputs.columns): + try: + sk_output = self._clf.predict(sk_inputs) + except sklearn.exceptions.NotFittedError as error: + raise PrimitiveNotFittedError("Primitive not fitted.") from error + # For primitives that allow predicting without fitting like GaussianProcessRegressor + if not self._fitted: + raise PrimitiveNotFittedError("Primitive not fitted.") + if sparse.issparse(sk_output): + sk_output = sk_output.toarray() + output = self._wrap_predictions(inputs, sk_output) + output.columns = self._target_names + output = [output] + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._target_column_indices, + columns_list=output) + + return CallResult(outputs) + + + def get_params(self) -> Params: + if not self._fitted: + return Params( + class_prior_=None, + class_count_=None, + theta_=None, + sigma_=None, + classes_=None, + epsilon_=None, + input_column_names=self._input_column_names, + 
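+            # Unfitted state: every learned sklearn attribute is exported as None;
+            # only the column bookkeeping captured so far is carried in Params.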
training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + return Params( + class_prior_=getattr(self._clf, 'class_prior_', None), + class_count_=getattr(self._clf, 'class_count_', None), + theta_=getattr(self._clf, 'theta_', None), + sigma_=getattr(self._clf, 'sigma_', None), + classes_=getattr(self._clf, 'classes_', None), + epsilon_=getattr(self._clf, 'epsilon_', None), + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + def set_params(self, *, params: Params) -> None: + self._clf.class_prior_ = params['class_prior_'] + self._clf.class_count_ = params['class_count_'] + self._clf.theta_ = params['theta_'] + self._clf.sigma_ = params['sigma_'] + self._clf.classes_ = params['classes_'] + self._clf.epsilon_ = params['epsilon_'] + self._input_column_names = params['input_column_names'] + self._training_indices = params['training_indices_'] + self._target_names = params['target_names_'] + self._target_column_indices = params['target_column_indices_'] + self._target_columns_metadata = params['target_columns_metadata_'] + + if params['class_prior_'] is not None: + self._fitted = True + if params['class_count_'] is not None: + self._fitted = True + if params['theta_'] is not None: + self._fitted = True + if params['sigma_'] is not None: + self._fitted = True + if params['classes_'] is not None: + self._fitted = True + if params['epsilon_'] is not None: + self._fitted = True + + + def log_likelihoods(self, *, + outputs: Outputs, + inputs: Inputs, + timeout: float = None, + iterations: int = None) -> CallResult[Sequence[float]]: + inputs = inputs.iloc[:, self._training_indices] # Get ndarray + outputs = outputs.iloc[:, self._target_column_indices] + + if len(inputs.columns) and len(outputs.columns): + + if outputs.shape[1] != self._clf.n_outputs_: + raise exceptions.InvalidArgumentValueError("\"outputs\" argument does not have the correct number of target columns.") + + log_proba = self._clf.predict_log_proba(inputs) + + # Making it always a list, even when only one target. + if self._clf.n_outputs_ == 1: + log_proba = [log_proba] + classes = [self._clf.classes_] + else: + classes = self._clf.classes_ + + samples_length = inputs.shape[0] + + log_likelihoods = [] + for k in range(self._clf.n_outputs_): + # We have to map each class to its internal (numerical) index used in the learner. + # This allows "outputs" to contain string classes. + outputs_column = outputs.iloc[:, k] + classes_map = pandas.Series(numpy.arange(len(classes[k])), index=classes[k]) + mapped_outputs_column = outputs_column.map(classes_map) + + # For each target column (column in "outputs"), for each sample (row) we pick the log + # likelihood for a given class. 
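+            # log_proba[k] has shape (n_samples, n_classes_k); indexing it with
+            # (numpy.arange(samples_length), mapped_outputs_column) picks, for every
+            # row, the log-probability of the class actually observed in that row.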
+ log_likelihoods.append(log_proba[k][numpy.arange(samples_length), mapped_outputs_column]) + + results = d3m_dataframe(dict(enumerate(log_likelihoods)), generate_metadata=True) + results.columns = outputs.columns + + for k in range(self._clf.n_outputs_): + column_metadata = outputs.metadata.query_column(k) + if 'name' in column_metadata: + results.metadata = results.metadata.update_column(k, {'name': column_metadata['name']}) + + else: + results = d3m_dataframe(generate_metadata=True) + + return CallResult(results) + + + + + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=hyperparams['use_inputs_columns'], + exclude_columns=hyperparams['exclude_inputs_columns'], + can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, numpy.integer, numpy.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + @classmethod + def _get_targets(cls, data: d3m_dataframe, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return data, list(data.columns), list(range(len(data.columns))) + + metadata = data.metadata + + def can_produce_column(column_index: int) -> bool: + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/TrueTarget") + column_metadata = metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + semantic_types = set(column_metadata.get('semantic_types', [])) + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + return False + + target_column_indices, target_columns_not_to_produce = base_utils.get_columns_to_use(metadata, + use_columns=hyperparams[ + 'use_outputs_columns'], + exclude_columns= + hyperparams[ + 'exclude_outputs_columns'], + can_use_column=can_produce_column) + targets = [] + if target_column_indices: + targets = data.select_columns(target_column_indices) + target_column_names = [] + for idx in target_column_indices: + target_column_names.append(data.columns[idx]) + return targets, target_column_names, target_column_indices + + @classmethod + def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, 
hyperparams) -> List[OrderedDict]: + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) + + # Update semantic types and prepare it for predicted targets. + semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set(["https://metadata.datadrivendiscovery.org/types/TrueTarget","https://metadata.datadrivendiscovery.org/types/SuggestedTarget",]) + add_semantic_types = set(["https://metadata.datadrivendiscovery.org/types/PredictedTarget",]) + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + outputs = d3m_dataframe(predictions, generate_metadata=False) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, self._target_columns_metadata) + return outputs + + + @classmethod + def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata): + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict() + semantic_types = [] + semantic_types.append('https://metadata.datadrivendiscovery.org/types/PredictedTarget') + column_name = outputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name") + if column_name is None: + column_name = "output_{}".format(column_index) + column_metadata["semantic_types"] = semantic_types + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + +SKGaussianNB.__doc__ = GaussianNB.__doc__ \ No newline at end of file diff --git a/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKGaussianProcessRegressor.py b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKGaussianProcessRegressor.py new file mode 100644 index 0000000..ff8417e --- /dev/null +++ b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKGaussianProcessRegressor.py @@ -0,0 +1,463 @@ +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os +import sklearn +import numpy +import typing + +# Custom import commands if any +from sklearn.gaussian_process.gpr import GaussianProcessRegressor + + +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import 
utils +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult, DockerContainer + +from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase +from d3m.primitive_interfaces.base import ProbabilisticCompositionalityMixin, ContinueFitMixin +from d3m import exceptions +import pandas + + + +Inputs = d3m_dataframe +Outputs = d3m_dataframe + + +class Params(params.Params): + X_train_: Optional[ndarray] + y_train_: Optional[ndarray] + kernel_: Optional[Callable] + alpha_: Optional[ndarray] + log_marginal_likelihood_value_: Optional[float] + _y_train_mean: Optional[ndarray] + _rng: Optional[numpy.random.mtrand.RandomState] + L_: Optional[ndarray] + _K_inv: Optional[object] + input_column_names: Optional[Any] + target_names_: Optional[Sequence[Any]] + training_indices_: Optional[Sequence[int]] + target_column_indices_: Optional[Sequence[int]] + target_columns_metadata_: Optional[List[OrderedDict]] + + + +class Hyperparams(hyperparams.Hyperparams): + alpha = hyperparams.Union( + configuration=OrderedDict({ + 'float': hyperparams.Hyperparameter[float]( + default=1e-10, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'ndarray': hyperparams.Hyperparameter[ndarray]( + default=numpy.array([]), + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='float', + description='Value added to the diagonal of the kernel matrix during fitting. Larger values correspond to increased noise level in the observations and reduce potential numerical issue during fitting. If an array is passed, it must have the same number of entries as the data used for fitting and is used as datapoint-dependent noise level. Note that this is equivalent to adding a WhiteKernel with c=alpha. Allowing to specify the noise level directly as a parameter is mainly for convenience and for consistency with Ridge.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + optimizer = hyperparams.Constant( + default='fmin_l_bfgs_b', + description='Can either be one of the internally supported optimizers for optimizing the kernel\'s parameters, specified by a string, or an externally defined optimizer passed as a callable. If a callable is passed, it must have the signature:: def optimizer(obj_func, initial_theta, bounds): # * \'obj_func\' is the objective function to be maximized, which # takes the hyperparameters theta as parameter and an # optional flag eval_gradient, which determines if the # gradient is returned additionally to the function value # * \'initial_theta\': the initial value for theta, which can be # used by local optimizers # * \'bounds\': the bounds on the values of theta .... # Returned are the best found hyperparameters theta and # the corresponding value of the target function. return theta_opt, func_min Per default, the \'fmin_l_bfgs_b\' algorithm from scipy.optimize is used. If None is passed, the kernel\'s parameters are kept fixed. Available internal optimizers are:: \'fmin_l_bfgs_b\'', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + n_restarts_optimizer = hyperparams.Bounded[int]( + default=0, + lower=0, + upper=None, + description='The number of restarts of the optimizer for finding the kernel\'s parameters which maximize the log-marginal likelihood. 
The first run of the optimizer is performed from the kernel\'s initial parameters, the remaining ones (if any) from thetas sampled log-uniform randomly from the space of allowed theta-values. If greater than 0, all bounds must be finite. Note that n_restarts_optimizer == 0 implies that one run is performed.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + normalize_y = hyperparams.UniformBool( + default=False, + description='Whether the target values y are normalized, i.e., the mean of the observed target values become zero. This parameter should be set to True if the target values\' mean is expected to differ considerable from zero. When enabled, the normalization effectively modifies the GP\'s prior based on the data, which contradicts the likelihood principle; normalization is thus disabled per default. copy_X_train : bool, optional (default: True) If True, a persistent copy of the training data is stored in the object. Otherwise, just a reference to the training data is stored, which might cause predictions to change if the data is modified externally.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + use_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to use as training input. If any specified column cannot be parsed, it is skipped.", + ) + use_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to use as training target. If any specified column cannot be parsed, it is skipped.", + ) + exclude_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not use as training inputs. Applicable only if \"use_columns\" is not provided.", + ) + exclude_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not use as training target. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='new', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. 
Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute', 'https://metadata.datadrivendiscovery.org/types/PredictedTarget'], + default='https://metadata.datadrivendiscovery.org/types/PredictedTarget', + description='Decides what semantic type to attach to generated output', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + +class SKGaussianProcessRegressor(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]): + """ + Primitive wrapping for sklearn GaussianProcessRegressor + `sklearn documentation `_ + + """ + + __author__ = "JPL MARVIN" + metadata = metadata_base.PrimitiveMetadata({ + "algorithm_types": [metadata_base.PrimitiveAlgorithmType.GAUSSIAN_PROCESS, ], + "name": "sklearn.gaussian_process.gpr.GaussianProcessRegressor", + "primitive_family": metadata_base.PrimitiveFamily.REGRESSION, + "python_path": "d3m.primitives.regression.gaussian_process.SKlearn", + "source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.gaussian_process.GaussianProcessRegressor.html']}, + "version": "2019.11.13", + "id": "3894e630-d67b-35d9-ab78-233e264f6324", + "hyperparams_to_tune": ['alpha'], + 'installation': [ + {'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + }] + }) + + def __init__(self, *, + hyperparams: Hyperparams, + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None) -> None: + + super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) + + # False + self._clf = GaussianProcessRegressor( + alpha=self.hyperparams['alpha'], + optimizer=self.hyperparams['optimizer'], + n_restarts_optimizer=self.hyperparams['n_restarts_optimizer'], + normalize_y=self.hyperparams['normalize_y'], + random_state=self.random_seed, + ) + + self._inputs = None + self._outputs = None + self._training_inputs = None + self._training_outputs = None + self._target_names = None + self._training_indices = None + self._target_column_indices = None + self._target_columns_metadata: List[OrderedDict] = None + self._input_column_names = None + self._fitted = False + self._new_training_data = False + + def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None: + self._inputs = inputs + self._outputs = outputs + self._fitted = False + self._new_training_data = True + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + if self._inputs is None or self._outputs is None: + raise ValueError("Missing training data.") + + if not self._new_training_data: + return CallResult(None) + self._new_training_data = False + + self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams) + 
self._training_outputs, self._target_names, self._target_column_indices = self._get_targets(self._outputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + if len(self._training_indices) > 0 and len(self._target_column_indices) > 0: + self._target_columns_metadata = self._get_target_columns_metadata(self._training_outputs.metadata, self.hyperparams) + sk_training_output = self._training_outputs.values + + shape = sk_training_output.shape + if len(shape) == 2 and shape[1] == 1: + sk_training_output = numpy.ravel(sk_training_output) + + self._clf.fit(self._training_inputs, sk_training_output) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + return CallResult(None) + + + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + sk_inputs, columns_to_use = self._get_columns_to_fit(inputs, self.hyperparams) + output = [] + if len(sk_inputs.columns): + try: + sk_output = self._clf.predict(sk_inputs) + except sklearn.exceptions.NotFittedError as error: + raise PrimitiveNotFittedError("Primitive not fitted.") from error + # For primitives that allow predicting without fitting like GaussianProcessRegressor + if not self._fitted: + raise PrimitiveNotFittedError("Primitive not fitted.") + if sparse.issparse(sk_output): + sk_output = sk_output.toarray() + output = self._wrap_predictions(inputs, sk_output) + output.columns = self._target_names + output = [output] + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._target_column_indices, + columns_list=output) + + return CallResult(outputs) + + + def get_params(self) -> Params: + if not self._fitted: + return Params( + X_train_=None, + y_train_=None, + kernel_=None, + alpha_=None, + log_marginal_likelihood_value_=None, + _y_train_mean=None, + _rng=None, + L_=None, + _K_inv=None, + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + return Params( + X_train_=getattr(self._clf, 'X_train_', None), + y_train_=getattr(self._clf, 'y_train_', None), + kernel_=getattr(self._clf, 'kernel_', None), + alpha_=getattr(self._clf, 'alpha_', None), + log_marginal_likelihood_value_=getattr(self._clf, 'log_marginal_likelihood_value_', None), + _y_train_mean=getattr(self._clf, '_y_train_mean', None), + _rng=getattr(self._clf, '_rng', None), + L_=getattr(self._clf, 'L_', None), + _K_inv=getattr(self._clf, '_K_inv', None), + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + def set_params(self, *, params: Params) -> None: + self._clf.X_train_ = params['X_train_'] + self._clf.y_train_ = params['y_train_'] + self._clf.kernel_ = params['kernel_'] + self._clf.alpha_ = params['alpha_'] + self._clf.log_marginal_likelihood_value_ = params['log_marginal_likelihood_value_'] + 
self._clf._y_train_mean = params['_y_train_mean'] + self._clf._rng = params['_rng'] + self._clf.L_ = params['L_'] + self._clf._K_inv = params['_K_inv'] + self._input_column_names = params['input_column_names'] + self._training_indices = params['training_indices_'] + self._target_names = params['target_names_'] + self._target_column_indices = params['target_column_indices_'] + self._target_columns_metadata = params['target_columns_metadata_'] + + if params['X_train_'] is not None: + self._fitted = True + if params['y_train_'] is not None: + self._fitted = True + if params['kernel_'] is not None: + self._fitted = True + if params['alpha_'] is not None: + self._fitted = True + if params['log_marginal_likelihood_value_'] is not None: + self._fitted = True + if params['_y_train_mean'] is not None: + self._fitted = True + if params['_rng'] is not None: + self._fitted = True + if params['L_'] is not None: + self._fitted = True + if params['_K_inv'] is not None: + self._fitted = True + + + + + + + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=hyperparams['use_inputs_columns'], + exclude_columns=hyperparams['exclude_inputs_columns'], + can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, numpy.integer, numpy.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + @classmethod + def _get_targets(cls, data: d3m_dataframe, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return data, list(data.columns), list(range(len(data.columns))) + + metadata = data.metadata + + def can_produce_column(column_index: int) -> bool: + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/TrueTarget") + column_metadata = metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + semantic_types = set(column_metadata.get('semantic_types', [])) + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + return False + + target_column_indices, target_columns_not_to_produce = base_utils.get_columns_to_use(metadata, + use_columns=hyperparams[ + 'use_outputs_columns'], + 
exclude_columns= + hyperparams[ + 'exclude_outputs_columns'], + can_use_column=can_produce_column) + targets = [] + if target_column_indices: + targets = data.select_columns(target_column_indices) + target_column_names = [] + for idx in target_column_indices: + target_column_names.append(data.columns[idx]) + return targets, target_column_names, target_column_indices + + @classmethod + def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]: + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) + + # Update semantic types and prepare it for predicted targets. + semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set(["https://metadata.datadrivendiscovery.org/types/TrueTarget","https://metadata.datadrivendiscovery.org/types/SuggestedTarget",]) + add_semantic_types = set(["https://metadata.datadrivendiscovery.org/types/PredictedTarget",]) + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + outputs = d3m_dataframe(predictions, generate_metadata=False) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, self._target_columns_metadata) + return outputs + + + @classmethod + def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata): + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict() + semantic_types = [] + semantic_types.append('https://metadata.datadrivendiscovery.org/types/PredictedTarget') + column_name = outputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name") + if column_name is None: + column_name = "output_{}".format(column_index) + column_metadata["semantic_types"] = semantic_types + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + +SKGaussianProcessRegressor.__doc__ = GaussianProcessRegressor.__doc__ \ No newline at end of file diff --git a/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKGaussianRandomProjection.py b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKGaussianRandomProjection.py new file mode 100644 index 0000000..867d904 --- /dev/null +++ b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKGaussianRandomProjection.py @@ -0,0 +1,344 @@ +from typing import Any, 
Callable, List, Dict, Union, Optional, Sequence, Tuple +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os +import sklearn +import numpy +import typing + +# Custom import commands if any +from sklearn.random_projection import GaussianRandomProjection + + +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult, DockerContainer +from d3m.primitive_interfaces.unsupervised_learning import UnsupervisedLearnerPrimitiveBase + + +Inputs = d3m_dataframe +Outputs = d3m_dataframe + + +class Params(params.Params): + n_component_: Optional[int] + components_: Optional[Union[ndarray, sparse.spmatrix]] + input_column_names: Optional[Any] + target_names_: Optional[Sequence[Any]] + training_indices_: Optional[Sequence[int]] + target_column_indices_: Optional[Sequence[int]] + target_columns_metadata_: Optional[List[OrderedDict]] + + + +class Hyperparams(hyperparams.Hyperparams): + n_components = hyperparams.Union( + configuration=OrderedDict({ + 'int': hyperparams.Bounded[int]( + lower=0, + upper=None, + default=100, + description='Number of components to keep.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'auto': hyperparams.Constant( + default='auto', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='auto', + description='Dimensionality of the target projection space. n_components can be automatically adjusted according to the number of samples in the dataset and the bound given by the Johnson-Lindenstrauss lemma. In that case the quality of the embedding is controlled by the ``eps`` parameter. It should be noted that Johnson-Lindenstrauss lemma can yield very conservative estimated of the required number of components as it makes no assumption on the structure of the dataset.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + eps = hyperparams.Bounded[float]( + default=0.1, + lower=0, + upper=1, + description='Parameter to control the quality of the embedding according to the Johnson-Lindenstrauss lemma when n_components is set to \'auto\'. Smaller values lead to better embedding and higher number of dimensions (n_components) in the target projection space.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + use_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to operate on. If any specified column cannot be parsed, it is skipped.", + ) + exclude_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not operate on. 
Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='new', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute'], + default='https://metadata.datadrivendiscovery.org/types/Attribute', + description='Decides what semantic type to attach to generated attributes', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + +class SKGaussianRandomProjection(UnsupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]): + """ + Primitive wrapping for sklearn GaussianRandomProjection + `sklearn documentation `_ + + """ + + __author__ = "JPL MARVIN" + metadata = metadata_base.PrimitiveMetadata({ + "algorithm_types": [metadata_base.PrimitiveAlgorithmType.RANDOM_PROJECTION, ], + "name": "sklearn.random_projection.GaussianRandomProjection", + "primitive_family": metadata_base.PrimitiveFamily.DATA_TRANSFORMATION, + "python_path": "d3m.primitives.data_transformation.gaussian_random_projection.SKlearn", + "source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.random_projection.GaussianRandomProjection.html']}, + "version": "2019.11.13", + "id": "fc933ab9-baaf-47ca-a373-bdd33081f5fa", + "hyperparams_to_tune": ['n_components'], + 'installation': [ + {'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + }] + }) + + def __init__(self, *, + hyperparams: Hyperparams, + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None) -> None: + + super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) + + # False + self._clf = GaussianRandomProjection( + n_components=self.hyperparams['n_components'], + eps=self.hyperparams['eps'], + random_state=self.random_seed, + ) + + self._inputs = None + 
self._outputs = None + self._training_inputs = None + self._training_outputs = None + self._target_names = None + self._training_indices = None + self._target_column_indices = None + self._target_columns_metadata: List[OrderedDict] = None + self._input_column_names = None + self._fitted = False + + + def set_training_data(self, *, inputs: Inputs) -> None: + self._inputs = inputs + self._fitted = False + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + if self._fitted: + return CallResult(None) + + self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + if self._training_inputs is None: + return CallResult(None) + + if len(self._training_indices) > 0: + self._clf.fit(self._training_inputs) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + return CallResult(None) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + if not self._fitted: + raise PrimitiveNotFittedError("Primitive not fitted.") + sk_inputs = inputs + if self.hyperparams['use_semantic_types']: + sk_inputs = inputs.iloc[:, self._training_indices] + output_columns = [] + if len(self._training_indices) > 0: + sk_output = self._clf.transform(sk_inputs) + if sparse.issparse(sk_output): + sk_output = sk_output.toarray() + outputs = self._wrap_predictions(inputs, sk_output) + if len(outputs.columns) == len(self._input_column_names): + outputs.columns = self._input_column_names + output_columns = [outputs] + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._training_indices, + columns_list=output_columns) + return CallResult(outputs) + + + def get_params(self) -> Params: + if not self._fitted: + return Params( + n_component_=None, + components_=None, + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + return Params( + n_component_=getattr(self._clf, 'n_component_', None), + components_=getattr(self._clf, 'components_', None), + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + def set_params(self, *, params: Params) -> None: + self._clf.n_component_ = params['n_component_'] + self._clf.components_ = params['components_'] + self._input_column_names = params['input_column_names'] + self._training_indices = params['training_indices_'] + self._target_names = params['target_names_'] + self._target_column_indices = params['target_column_indices_'] + self._target_columns_metadata = params['target_columns_metadata_'] + + if params['n_component_'] is not None: + self._fitted = True + if params['components_'] is not None: + self._fitted = True + + + + + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: 
Hyperparams): + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=hyperparams['use_columns'], + exclude_columns=hyperparams['exclude_columns'], + can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, numpy.integer, numpy.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + + @classmethod + def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]: + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) + + # Update semantic types and prepare it for predicted targets. 
+ semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set([]) + add_semantic_types = set() + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + outputs = d3m_dataframe(predictions, generate_metadata=True) + target_columns_metadata = self._add_target_columns_metadata(outputs.metadata, self.hyperparams) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, target_columns_metadata) + return outputs + + + @classmethod + def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams): + + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_name = "output_{}".format(column_index) + column_metadata = OrderedDict() + semantic_types = set() + semantic_types.add(hyperparams["return_semantic_type"]) + column_metadata['semantic_types'] = list(semantic_types) + + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + +SKGaussianRandomProjection.__doc__ = GaussianRandomProjection.__doc__ \ No newline at end of file diff --git a/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKGenericUnivariateSelect.py b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKGenericUnivariateSelect.py new file mode 100644 index 0000000..b0c45ad --- /dev/null +++ b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKGenericUnivariateSelect.py @@ -0,0 +1,443 @@ +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os +import sklearn +import numpy +import typing + +# Custom import commands if any +from sklearn.feature_selection.univariate_selection import GenericUnivariateSelect +from sklearn.feature_selection import f_classif, f_regression, chi2 + + +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult, DockerContainer + +from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase +from d3m.primitive_interfaces.base import ProbabilisticCompositionalityMixin, ContinueFitMixin +from d3m import exceptions +import pandas + + + +Inputs = d3m_dataframe +Outputs = d3m_dataframe + + +class Params(params.Params): +
scores_: Optional[ndarray] + pvalues_: Optional[ndarray] + input_column_names: Optional[Any] + target_names_: Optional[Sequence[Any]] + training_indices_: Optional[Sequence[int]] + target_column_indices_: Optional[Sequence[int]] + target_columns_metadata_: Optional[List[OrderedDict]] + + + +class Hyperparams(hyperparams.Hyperparams): + score_func = hyperparams.Enumeration[str]( + default='f_classif', + values=['f_classif', 'f_regression', 'chi2'], + description='Function taking two arrays X and y, and returning a pair of arrays (scores, pvalues). For modes \'percentile\' or \'kbest\' it can return a single array scores.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + mode = hyperparams.Enumeration[str]( + default='percentile', + values=['percentile', 'k_best', 'fpr', 'fdr', 'fwe'], + description='Feature selection mode.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + param = hyperparams.Union( + configuration=OrderedDict({ + 'float': hyperparams.Hyperparameter[float]( + default=1e-05, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'int': hyperparams.Hyperparameter[int]( + default=0, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='float', + description='Parameter of the corresponding mode.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + use_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to use as training input. If any specified column cannot be parsed, it is skipped.", + ) + use_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to use as training target. If any specified column cannot be parsed, it is skipped.", + ) + exclude_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not use as training inputs. Applicable only if \"use_columns\" is not provided.", + ) + exclude_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not use as training target. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['update_semantic_types', 'replace', 'new'], + default='new', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", +) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. 
Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute', 'https://metadata.datadrivendiscovery.org/types/PredictedTarget'], + default='https://metadata.datadrivendiscovery.org/types/PredictedTarget', + description='Decides what semantic type to attach to generated output', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + +class SKGenericUnivariateSelect(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]): + """ + Primitive wrapping for sklearn GenericUnivariateSelect + `sklearn documentation `_ + + """ + + __author__ = "JPL MARVIN" + metadata = metadata_base.PrimitiveMetadata({ + "algorithm_types": [metadata_base.PrimitiveAlgorithmType.STATISTICAL_MOMENT_ANALYSIS, ], + "name": "sklearn.feature_selection.univariate_selection.GenericUnivariateSelect", + "primitive_family": metadata_base.PrimitiveFamily.FEATURE_SELECTION, + "python_path": "d3m.primitives.feature_selection.generic_univariate_select.SKlearn", + "source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.GenericUnivariateSelect.html']}, + "version": "2019.11.13", + "id": "1055a114-5c94-33b0-9100-675fd0200e72", + "hyperparams_to_tune": ['mode'], + 'installation': [ + {'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + }] + }) + + def __init__(self, *, + hyperparams: Hyperparams, + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None) -> None: + + super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) + + # False + self._clf = GenericUnivariateSelect( + score_func=eval(self.hyperparams['score_func']), + mode=self.hyperparams['mode'], + param=self.hyperparams['param'], + ) + + self._inputs = None + self._outputs = None + self._training_inputs = None + self._training_outputs = None + self._target_names = None + self._training_indices = None + self._target_column_indices = None + self._target_columns_metadata: List[OrderedDict] = None + self._input_column_names = None + self._fitted = False + self._new_training_data = False + + def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None: + self._inputs = inputs + self._outputs = outputs + self._fitted = False + self._new_training_data = True + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + if self._fitted: + return 
CallResult(None) + + self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams) + self._training_outputs, self._target_names, self._target_column_indices = self._get_targets(self._outputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + if self._training_inputs is None or self._training_outputs is None: + raise ValueError("Missing training data.") + + if len(self._training_indices) > 0 and len(self._target_column_indices) > 0: + sk_training_output = self._training_outputs.values + + shape = sk_training_output.shape + if len(shape) == 2 and shape[1] == 1: + sk_training_output = numpy.ravel(sk_training_output) + + self._clf.fit(self._training_inputs, sk_training_output) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + return CallResult(None) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + sk_inputs, columns_to_use = self._get_columns_to_fit(inputs, self.hyperparams) + output = [] + if len(sk_inputs.columns): + try: + sk_output = self._clf.transform(sk_inputs) + except sklearn.exceptions.NotFittedError as error: + raise PrimitiveNotFittedError("Primitive not fitted.") from error + if sparse.issparse(sk_output): + sk_output = sk_output.toarray() + target_columns_metadata = self._copy_columns_metadata(inputs.iloc[:, self._training_indices].metadata, + self.produce_support().value) + output = self._wrap_predictions(inputs, sk_output, target_columns_metadata) + output.columns = [inputs.columns[idx] for idx in range(len(inputs.columns)) if idx in self.produce_support().value] + output = [output] + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + if self.hyperparams['return_result'] == 'update_semantic_types': + temp_inputs = inputs.copy() + columns_not_selected = sorted(set(range(len(temp_inputs.columns))) - set(self.produce_support().value)) + + for idx in columns_not_selected: + temp_inputs.metadata = temp_inputs.metadata.remove_semantic_type((metadata_base.ALL_ELEMENTS, idx), + 'https://metadata.datadrivendiscovery.org/types/Attribute') + + temp_inputs = temp_inputs.select_columns(self._training_indices) + outputs = base_utils.combine_columns(return_result='replace', + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._training_indices, + columns_list=[temp_inputs]) + return CallResult(outputs) + + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._training_indices, + columns_list=output) + + return CallResult(outputs) + + def produce_support(self, *, timeout: float = None, iterations: int = None) -> CallResult[Any]: + all_indices = self._training_indices + selected_indices = self._clf.get_support(indices=True).tolist() + indices = [all_indices[index] for index in selected_indices] + return CallResult(indices) + + + def get_params(self) -> Params: + if not self._fitted: + return Params( + scores_=None, + pvalues_=None, + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + 
target_columns_metadata_=self._target_columns_metadata + ) + + return Params( + scores_=getattr(self._clf, 'scores_', None), + pvalues_=getattr(self._clf, 'pvalues_', None), + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + def set_params(self, *, params: Params) -> None: + self._clf.scores_ = params['scores_'] + self._clf.pvalues_ = params['pvalues_'] + self._input_column_names = params['input_column_names'] + self._training_indices = params['training_indices_'] + self._target_names = params['target_names_'] + self._target_column_indices = params['target_column_indices_'] + self._target_columns_metadata = params['target_columns_metadata_'] + + if params['scores_'] is not None: + self._fitted = True + if params['pvalues_'] is not None: + self._fitted = True + + + + + + + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=hyperparams['use_inputs_columns'], + exclude_columns=hyperparams['exclude_inputs_columns'], + can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, numpy.integer, numpy.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + @classmethod + def _get_targets(cls, data: d3m_dataframe, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return data, list(data.columns), list(range(len(data.columns))) + + metadata = data.metadata + + def can_produce_column(column_index: int) -> bool: + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/TrueTarget") + column_metadata = metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + semantic_types = set(column_metadata.get('semantic_types', [])) + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + return False + + target_column_indices, target_columns_not_to_produce = base_utils.get_columns_to_use(metadata, + use_columns=hyperparams[ + 'use_outputs_columns'], + exclude_columns= + 
hyperparams[ + 'exclude_outputs_columns'], + can_use_column=can_produce_column) + targets = [] + if target_column_indices: + targets = data.select_columns(target_column_indices) + target_column_names = [] + for idx in target_column_indices: + target_column_names.append(data.columns[idx]) + return targets, target_column_names, target_column_indices + + @classmethod + def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]: + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) + + # Update semantic types and prepare it for predicted targets. + semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set([]) + add_semantic_types = set() + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + if len(target_columns_metadata) == 1: + name = column_metadata.get("name") + for idx in range(len(outputs.columns)): + outputs_metadata = outputs_metadata.update_column(idx, column_metadata) + if len(outputs.columns) > 1: + # Updating column names. 
+ outputs_metadata = outputs_metadata.update((metadata_base.ALL_ELEMENTS, idx), {'name': "{}_{}".format(name, idx)}) + else: + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray, target_columns_metadata) -> Outputs: + outputs = d3m_dataframe(predictions, generate_metadata=False) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, target_columns_metadata) + return outputs + + + + @classmethod + def _copy_columns_metadata(cls, inputs_metadata: metadata_base.DataMetadata, column_indices) -> List[OrderedDict]: + outputs_length = inputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in column_indices: + column_name = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name") + column_metadata = OrderedDict(inputs_metadata.query_column(column_index)) + semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set([]) + add_semantic_types = [] + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + +SKGenericUnivariateSelect.__doc__ = GenericUnivariateSelect.__doc__ \ No newline at end of file diff --git a/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKGradientBoostingClassifier.py b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKGradientBoostingClassifier.py new file mode 100644 index 0000000..0c92268 --- /dev/null +++ b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKGradientBoostingClassifier.py @@ -0,0 +1,707 @@ +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os +import sklearn +import numpy +import typing + +# Custom import commands if any +from sklearn.ensemble.gradient_boosting import GradientBoostingClassifier +import sys + + +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult, DockerContainer + +from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase +from d3m.primitive_interfaces.base import ProbabilisticCompositionalityMixin, ContinueFitMixin +from d3m import exceptions +import pandas + + + +Inputs = d3m_dataframe +Outputs = d3m_dataframe + + +class Params(params.Params): + oob_improvement_: Optional[ndarray] + train_score_: Optional[ndarray] + loss_: Optional[object] + init_: Optional[object] + estimators_: Optional[ndarray] + n_features_: Optional[int] + classes_: Optional[ndarray] + max_features_: Optional[int] + n_classes_: Optional[Union[int, List[int]]] + alpha: Optional[float] + _rng: Optional[object] + n_estimators_: Optional[int] + input_column_names: Optional[Any] + target_names_: Optional[Sequence[Any]] + training_indices_: Optional[Sequence[int]] + target_column_indices_: Optional[Sequence[int]] + target_columns_metadata_: Optional[List[OrderedDict]] + + + 
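For reference, a minimal usage sketch of a generated wrapper such as SKGradientBoostingClassifier, assuming the `sklearn_wrap` package has been installed as described in the installation instructions; `train_inputs`, `train_outputs`, and `test_inputs` are placeholder names for d3m DataFrames prepared by earlier pipeline steps, not objects defined in this file:
```
# Illustrative sketch only -- not part of the generated file in this diff.
from sklearn_wrap.SKGradientBoostingClassifier import SKGradientBoostingClassifier, Hyperparams

# Instantiate the primitive with its default hyperparameters.
primitive = SKGradientBoostingClassifier(hyperparams=Hyperparams.defaults())

# train_inputs / train_outputs / test_inputs are assumed to be d3m DataFrames
# produced by earlier steps of a pipeline (placeholders, adapt to your setup).
primitive.set_training_data(inputs=train_inputs, outputs=train_outputs)
primitive.fit()
predictions = primitive.produce(inputs=test_inputs).value
```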
+class Hyperparams(hyperparams.Hyperparams): + loss = hyperparams.Enumeration[str]( + default='deviance', + values=['deviance', 'exponential'], + description='loss function to be optimized. \'deviance\' refers to deviance (= logistic regression) for classification with probabilistic outputs. For loss \'exponential\' gradient boosting recovers the AdaBoost algorithm.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + learning_rate = hyperparams.Bounded[float]( + default=0.1, + lower=0, + upper=None, + description='learning rate shrinks the contribution of each tree by `learning_rate`. There is a trade-off between learning_rate and n_estimators.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + n_estimators = hyperparams.Bounded[int]( + default=100, + lower=1, + upper=None, + description='The number of boosting stages to perform. Gradient boosting is fairly robust to over-fitting so a large number usually results in better performance.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + max_depth = hyperparams.Bounded[int]( + default=3, + lower=0, + upper=None, + description='maximum depth of the individual regression estimators. The maximum depth limits the number of nodes in the tree. Tune this parameter for best performance; the best value depends on the interaction of the input variables.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + criterion = hyperparams.Enumeration[str]( + default='friedman_mse', + values=['friedman_mse', 'mse', 'mae'], + description='The function to measure the quality of a split. Supported criteria are "friedman_mse" for the mean squared error with improvement score by Friedman, "mse" for mean squared error, and "mae" for the mean absolute error. The default value of "friedman_mse" is generally the best as it can provide a better approximation in some cases. .. versionadded:: 0.18', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + min_samples_split = hyperparams.Union( + configuration=OrderedDict({ + 'absolute': hyperparams.Bounded[int]( + default=2, + lower=1, + upper=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'percent': hyperparams.Bounded[float]( + default=0.25, + lower=0, + upper=1, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='absolute', + description='The minimum number of samples required to split an internal node: - If int, then consider `min_samples_split` as the minimum number. - If float, then `min_samples_split` is a percentage and `ceil(min_samples_split * n_samples)` are the minimum number of samples for each split. .. versionchanged:: 0.18 Added float values for percentages.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + min_samples_leaf = hyperparams.Union( + configuration=OrderedDict({ + 'absolute': hyperparams.Bounded[int]( + default=1, + lower=1, + upper=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'percent': hyperparams.Bounded[float]( + default=0.25, + lower=0, + upper=0.5, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='absolute', + description='The minimum number of samples required to be at a leaf node: - If int, then consider `min_samples_leaf` as the minimum number. 
- If float, then `min_samples_leaf` is a percentage and `ceil(min_samples_leaf * n_samples)` are the minimum number of samples for each node. .. versionchanged:: 0.18 Added float values for percentages.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + min_weight_fraction_leaf = hyperparams.Bounded[float]( + default=0, + lower=0, + upper=0.5, + description='The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + subsample = hyperparams.Bounded[float]( + default=1.0, + lower=0, + upper=None, + description='The fraction of samples to be used for fitting the individual base learners. If smaller than 1.0 this results in Stochastic Gradient Boosting. `subsample` interacts with the parameter `n_estimators`. Choosing `subsample < 1.0` leads to a reduction of variance and an increase in bias.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + max_features = hyperparams.Union( + configuration=OrderedDict({ + 'specified_int': hyperparams.Bounded[int]( + lower=0, + upper=None, + default=0, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'calculated': hyperparams.Enumeration[str]( + values=['auto', 'sqrt', 'log2'], + default='auto', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'none': hyperparams.Constant( + default=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'percent': hyperparams.Bounded[float]( + default=0.25, + lower=0, + upper=1, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='none', + description='The number of features to consider when looking for the best split: - If int, then consider `max_features` features at each split. - If float, then `max_features` is a percentage and `int(max_features * n_features)` features are considered at each split. - If "auto", then `max_features=sqrt(n_features)`. - If "sqrt", then `max_features=sqrt(n_features)`. - If "log2", then `max_features=log2(n_features)`. - If None, then `max_features=n_features`. Choosing `max_features < n_features` leads to a reduction of variance and an increase in bias. Note: the search for a split does not stop until at least one valid partition of the node samples is found, even if it requires to effectively inspect more than ``max_features`` features.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + max_leaf_nodes = hyperparams.Union( + configuration=OrderedDict({ + 'int': hyperparams.Bounded[int]( + default=10, + lower=0, + upper=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'none': hyperparams.Constant( + default=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='none', + description='Grow trees with ``max_leaf_nodes`` in best-first fashion. Best nodes are defined as relative reduction in impurity. 
If None then unlimited number of leaf nodes.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + min_impurity_decrease = hyperparams.Bounded[float]( + default=0.0, + lower=0.0, + upper=None, + description='A node will be split if this split induces a decrease of the impurity greater than or equal to this value. The weighted impurity decrease equation is the following:: N_t / N * (impurity - N_t_R / N_t * right_impurity - N_t_L / N_t * left_impurity) where ``N`` is the total number of samples, ``N_t`` is the number of samples at the current node, ``N_t_L`` is the number of samples in the left child, and ``N_t_R`` is the number of samples in the right child. ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, if ``sample_weight`` is passed. .. versionadded:: 0.19', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + warm_start = hyperparams.UniformBool( + default=False, + description='When set to ``True``, reuse the solution of the previous call to fit and add more estimators to the ensemble, otherwise, just erase the previous solution.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + presort = hyperparams.Union( + configuration=OrderedDict({ + 'bool': hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'auto': hyperparams.Constant( + default='auto', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='auto', + description='Whether to presort the data to speed up the finding of best splits in fitting. Auto mode by default will use presorting on dense data and default to normal sorting on sparse data. Setting presort to true on sparse data will raise an error. .. versionadded:: 0.17 *presort* parameter.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + validation_fraction = hyperparams.Bounded[float]( + default=0.1, + lower=0, + upper=1, + description='The proportion of training data to set aside as validation set for early stopping. Must be between 0 and 1. Only used if ``n_iter_no_change`` is set to an integer.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + n_iter_no_change = hyperparams.Union( + configuration=OrderedDict({ + 'int': hyperparams.Bounded[int]( + default=5, + lower=0, + upper=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'none': hyperparams.Constant( + default=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='none', + description='``n_iter_no_change`` is used to decide if early stopping will be used to terminate training when validation score is not improving. By default it is set to None to disable early stopping. If set to a number, it will set aside ``validation_fraction`` size of the training data as validation and terminate training when validation score is not improving in all of the previous ``n_iter_no_change`` numbers of iterations.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + tol = hyperparams.Bounded[float]( + default=0.0001, + lower=0, + upper=None, + description='Tolerance for the early stopping. 
When the loss is not improving by at least tol for ``n_iter_no_change`` iterations (if set to a number), the training stops.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + use_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to use as training input. If any specified column cannot be parsed, it is skipped.", + ) + use_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to use as training target. If any specified column cannot be parsed, it is skipped.", + ) + exclude_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not use as training inputs. Applicable only if \"use_columns\" is not provided.", + ) + exclude_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not use as training target. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='new', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. 
To prevent pipelines from breaking set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute', 'https://metadata.datadrivendiscovery.org/types/PredictedTarget'], + default='https://metadata.datadrivendiscovery.org/types/PredictedTarget', + description='Decides what semantic type to attach to generated output', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + +class SKGradientBoostingClassifier(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams], + ProbabilisticCompositionalityMixin[Inputs, Outputs, Params, Hyperparams]): + """ + Primitive wrapping for sklearn GradientBoostingClassifier + `sklearn documentation `_ + + """ + + __author__ = "JPL MARVIN" + metadata = metadata_base.PrimitiveMetadata({ + "algorithm_types": [metadata_base.PrimitiveAlgorithmType.GRADIENT_BOOSTING, ], + "name": "sklearn.ensemble.gradient_boosting.GradientBoostingClassifier", + "primitive_family": metadata_base.PrimitiveFamily.CLASSIFICATION, + "python_path": "d3m.primitives.classification.gradient_boosting.SKlearn", + "source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html']}, + "version": "2019.11.13", + "id": "01d2c086-91bf-3ca5-b023-5139cf239c77", + "hyperparams_to_tune": ['n_estimators', 'learning_rate', 'max_depth', 'min_samples_leaf', 'min_samples_split', 'max_features'], + 'installation': [ + {'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + }] + }) + + def __init__(self, *, + hyperparams: Hyperparams, + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None, + _verbose: int = 0) -> None: + + super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) + + # False + self._clf = GradientBoostingClassifier( + loss=self.hyperparams['loss'], + learning_rate=self.hyperparams['learning_rate'], + n_estimators=self.hyperparams['n_estimators'], + max_depth=self.hyperparams['max_depth'], + criterion=self.hyperparams['criterion'], + min_samples_split=self.hyperparams['min_samples_split'], + min_samples_leaf=self.hyperparams['min_samples_leaf'], + min_weight_fraction_leaf=self.hyperparams['min_weight_fraction_leaf'], + subsample=self.hyperparams['subsample'], + max_features=self.hyperparams['max_features'], + max_leaf_nodes=self.hyperparams['max_leaf_nodes'], + min_impurity_decrease=self.hyperparams['min_impurity_decrease'], + warm_start=self.hyperparams['warm_start'], + presort=self.hyperparams['presort'], + validation_fraction=self.hyperparams['validation_fraction'], + n_iter_no_change=self.hyperparams['n_iter_no_change'], + tol=self.hyperparams['tol'], + verbose=_verbose, + random_state=self.random_seed, + ) + + self._inputs = None + self._outputs = None + self._training_inputs = None + self._training_outputs = None + self._target_names = None + self._training_indices = None + self._target_column_indices = None + self._target_columns_metadata: List[OrderedDict] = None + self._input_column_names = None + self._fitted = False + self._new_training_data = False + + def 
set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None: + self._inputs = inputs + self._outputs = outputs + self._fitted = False + self._new_training_data = True + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + if self._inputs is None or self._outputs is None: + raise ValueError("Missing training data.") + + if not self._new_training_data: + return CallResult(None) + self._new_training_data = False + + self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams) + self._training_outputs, self._target_names, self._target_column_indices = self._get_targets(self._outputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + if len(self._training_indices) > 0 and len(self._target_column_indices) > 0: + self._target_columns_metadata = self._get_target_columns_metadata(self._training_outputs.metadata, self.hyperparams) + sk_training_output = self._training_outputs.values + + shape = sk_training_output.shape + if len(shape) == 2 and shape[1] == 1: + sk_training_output = numpy.ravel(sk_training_output) + + self._clf.fit(self._training_inputs, sk_training_output) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + return CallResult(None) + + + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + sk_inputs, columns_to_use = self._get_columns_to_fit(inputs, self.hyperparams) + output = [] + if len(sk_inputs.columns): + try: + sk_output = self._clf.predict(sk_inputs) + except sklearn.exceptions.NotFittedError as error: + raise PrimitiveNotFittedError("Primitive not fitted.") from error + # For primitives that allow predicting without fitting like GaussianProcessRegressor + if not self._fitted: + raise PrimitiveNotFittedError("Primitive not fitted.") + if sparse.issparse(sk_output): + sk_output = sk_output.toarray() + output = self._wrap_predictions(inputs, sk_output) + output.columns = self._target_names + output = [output] + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._target_column_indices, + columns_list=output) + + return CallResult(outputs) + + + def get_params(self) -> Params: + if not self._fitted: + return Params( + oob_improvement_=None, + train_score_=None, + loss_=None, + init_=None, + estimators_=None, + n_features_=None, + classes_=None, + max_features_=None, + n_classes_=None, + alpha=None, + _rng=None, + n_estimators_=None, + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + return Params( + oob_improvement_=getattr(self._clf, 'oob_improvement_', None), + train_score_=getattr(self._clf, 'train_score_', None), + loss_=getattr(self._clf, 'loss_', None), + init_=getattr(self._clf, 'init_', None), + estimators_=getattr(self._clf, 'estimators_', None), + n_features_=getattr(self._clf, 'n_features_', None), + classes_=getattr(self._clf, 'classes_', None), + max_features_=getattr(self._clf, 
'max_features_', None), + n_classes_=getattr(self._clf, 'n_classes_', None), + alpha=getattr(self._clf, 'alpha', None), + _rng=getattr(self._clf, '_rng', None), + n_estimators_=getattr(self._clf, 'n_estimators_', None), + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + def set_params(self, *, params: Params) -> None: + self._clf.oob_improvement_ = params['oob_improvement_'] + self._clf.train_score_ = params['train_score_'] + self._clf.loss_ = params['loss_'] + self._clf.init_ = params['init_'] + self._clf.estimators_ = params['estimators_'] + self._clf.n_features_ = params['n_features_'] + self._clf.classes_ = params['classes_'] + self._clf.max_features_ = params['max_features_'] + self._clf.n_classes_ = params['n_classes_'] + self._clf.alpha = params['alpha'] + self._clf._rng = params['_rng'] + self._clf.n_estimators_ = params['n_estimators_'] + self._input_column_names = params['input_column_names'] + self._training_indices = params['training_indices_'] + self._target_names = params['target_names_'] + self._target_column_indices = params['target_column_indices_'] + self._target_columns_metadata = params['target_columns_metadata_'] + + if params['oob_improvement_'] is not None: + self._fitted = True + if params['train_score_'] is not None: + self._fitted = True + if params['loss_'] is not None: + self._fitted = True + if params['init_'] is not None: + self._fitted = True + if params['estimators_'] is not None: + self._fitted = True + if params['n_features_'] is not None: + self._fitted = True + if params['classes_'] is not None: + self._fitted = True + if params['max_features_'] is not None: + self._fitted = True + if params['n_classes_'] is not None: + self._fitted = True + if params['alpha'] is not None: + self._fitted = True + if params['_rng'] is not None: + self._fitted = True + if params['n_estimators_'] is not None: + self._fitted = True + + + def log_likelihoods(self, *, + outputs: Outputs, + inputs: Inputs, + timeout: float = None, + iterations: int = None) -> CallResult[Sequence[float]]: + inputs = inputs.iloc[:, self._training_indices] # Get ndarray + outputs = outputs.iloc[:, self._target_column_indices] + + if len(inputs.columns) and len(outputs.columns): + + if outputs.shape[1] != self._clf.n_outputs_: + raise exceptions.InvalidArgumentValueError("\"outputs\" argument does not have the correct number of target columns.") + + log_proba = self._clf.predict_log_proba(inputs) + + # Making it always a list, even when only one target. + if self._clf.n_outputs_ == 1: + log_proba = [log_proba] + classes = [self._clf.classes_] + else: + classes = self._clf.classes_ + + samples_length = inputs.shape[0] + + log_likelihoods = [] + for k in range(self._clf.n_outputs_): + # We have to map each class to its internal (numerical) index used in the learner. + # This allows "outputs" to contain string classes. + outputs_column = outputs.iloc[:, k] + classes_map = pandas.Series(numpy.arange(len(classes[k])), index=classes[k]) + mapped_outputs_column = outputs_column.map(classes_map) + + # For each target column (column in "outputs"), for each sample (row) we pick the log + # likelihood for a given class. 
+ log_likelihoods.append(log_proba[k][numpy.arange(samples_length), mapped_outputs_column]) + + results = d3m_dataframe(dict(enumerate(log_likelihoods)), generate_metadata=True) + results.columns = outputs.columns + + for k in range(self._clf.n_outputs_): + column_metadata = outputs.metadata.query_column(k) + if 'name' in column_metadata: + results.metadata = results.metadata.update_column(k, {'name': column_metadata['name']}) + + else: + results = d3m_dataframe(generate_metadata=True) + + return CallResult(results) + + + + def produce_feature_importances(self, *, timeout: float = None, iterations: int = None) -> CallResult[d3m_dataframe]: + output = d3m_dataframe(self._clf.feature_importances_.reshape((1, len(self._input_column_names)))) + output.columns = self._input_column_names + for i in range(len(self._input_column_names)): + output.metadata = output.metadata.update_column(i, {"name": self._input_column_names[i]}) + return CallResult(output) + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=hyperparams['use_inputs_columns'], + exclude_columns=hyperparams['exclude_inputs_columns'], + can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, numpy.integer, numpy.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + @classmethod + def _get_targets(cls, data: d3m_dataframe, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return data, list(data.columns), list(range(len(data.columns))) + + metadata = data.metadata + + def can_produce_column(column_index: int) -> bool: + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/TrueTarget") + column_metadata = metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + semantic_types = set(column_metadata.get('semantic_types', [])) + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + return False + + target_column_indices, target_columns_not_to_produce = base_utils.get_columns_to_use(metadata, + use_columns=hyperparams[ + 'use_outputs_columns'], + exclude_columns= + hyperparams[ + 
'exclude_outputs_columns'], + can_use_column=can_produce_column) + targets = [] + if target_column_indices: + targets = data.select_columns(target_column_indices) + target_column_names = [] + for idx in target_column_indices: + target_column_names.append(data.columns[idx]) + return targets, target_column_names, target_column_indices + + @classmethod + def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]: + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) + + # Update semantic types and prepare it for predicted targets. + semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set(["https://metadata.datadrivendiscovery.org/types/TrueTarget","https://metadata.datadrivendiscovery.org/types/SuggestedTarget",]) + add_semantic_types = set(["https://metadata.datadrivendiscovery.org/types/PredictedTarget",]) + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + outputs = d3m_dataframe(predictions, generate_metadata=False) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, self._target_columns_metadata) + return outputs + + + @classmethod + def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata): + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict() + semantic_types = [] + semantic_types.append('https://metadata.datadrivendiscovery.org/types/PredictedTarget') + column_name = outputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name") + if column_name is None: + column_name = "output_{}".format(column_index) + column_metadata["semantic_types"] = semantic_types + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + +SKGradientBoostingClassifier.__doc__ = GradientBoostingClassifier.__doc__ \ No newline at end of file diff --git a/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKGradientBoostingRegressor.py b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKGradientBoostingRegressor.py new file mode 100644 index 0000000..7ec68f0 --- /dev/null +++ b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKGradientBoostingRegressor.py @@ -0,0 +1,673 @@ +from typing import Any, Callable, List, Dict, Union, 
Optional, Sequence, Tuple +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os +import sklearn +import numpy +import typing + +# Custom import commands if any +from sklearn.ensemble.gradient_boosting import GradientBoostingRegressor + + +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult, DockerContainer + +from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase +from d3m.primitive_interfaces.base import ProbabilisticCompositionalityMixin, ContinueFitMixin +from d3m import exceptions +import pandas + + + +Inputs = d3m_dataframe +Outputs = d3m_dataframe + + +class Params(params.Params): + oob_improvement_: Optional[ndarray] + train_score_: Optional[ndarray] + loss_: Optional[object] + estimators_: Optional[object] + n_features_: Optional[int] + init_: Optional[object] + max_features_: Optional[int] + n_classes_: Optional[Union[int, List[int]]] + _rng: Optional[object] + n_estimators_: Optional[int] + input_column_names: Optional[Any] + target_names_: Optional[Sequence[Any]] + training_indices_: Optional[Sequence[int]] + target_column_indices_: Optional[Sequence[int]] + target_columns_metadata_: Optional[List[OrderedDict]] + + + +class Hyperparams(hyperparams.Hyperparams): + loss = hyperparams.Choice( + choices={ + 'ls': hyperparams.Hyperparams.define( + configuration=OrderedDict({}) + ), + 'lad': hyperparams.Hyperparams.define( + configuration=OrderedDict({}) + ), + 'huber': hyperparams.Hyperparams.define( + configuration=OrderedDict({ + 'alpha': hyperparams.Constant( + default=0.9, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + }) + ), + 'quantile': hyperparams.Hyperparams.define( + configuration=OrderedDict({ + 'alpha': hyperparams.Constant( + default=0.9, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + }) + ) + }, + default='ls', + description='loss function to be optimized. \'ls\' refers to least squares regression. \'lad\' (least absolute deviation) is a highly robust loss function solely based on order information of the input variables. \'huber\' is a combination of the two. \'quantile\' allows quantile regression (use `alpha` to specify the quantile).', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + learning_rate = hyperparams.Bounded[float]( + lower=0, + upper=None, + default=0.1, + description='learning rate shrinks the contribution of each tree by `learning_rate`. There is a trade-off between learning_rate and n_estimators.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + n_estimators = hyperparams.Bounded[int]( + lower=1, + upper=None, + default=100, + description='The number of boosting stages to perform. Gradient boosting is fairly robust to over-fitting so a large number usually results in better performance.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + max_depth = hyperparams.Bounded[int]( + lower=0, + upper=None, + default=3, + description='maximum depth of the individual regression estimators. The maximum depth limits the number of nodes in the tree. 
Tune this parameter for best performance; the best value depends on the interaction of the input variables.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + criterion = hyperparams.Enumeration[str]( + values=['friedman_mse', 'mse', 'mae'], + default='friedman_mse', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + description='The function to measure the quality of a split. Supported criteria are "friedman_mse" for the mean squared error with improvement score by Friedman, "mse" for mean squared error, and "mae" for the mean absolute error. The default value of "friedman_mse" is generally the best as it can provide a better approximation in some cases. .. versionadded:: 0.18' + ) + min_samples_split = hyperparams.Union( + configuration=OrderedDict({ + 'float': hyperparams.Bounded[float]( + lower=0, + upper=1, + default=1.0, + description='It\'s a percentage and `ceil(min_samples_split * n_samples)` is the minimum number of samples for each split.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'int': hyperparams.Bounded[int]( + lower=0, + upper=None, + default=2, + description='Minimum number.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='int', + description='The minimum number of samples required to split an internal node: - If int, then consider `min_samples_split` as the minimum number. - If float, then `min_samples_split` is a percentage and `ceil(min_samples_split * n_samples)` are the minimum number of samples for each split. .. versionchanged:: 0.18 Added float values for percentages.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + min_samples_leaf = hyperparams.Union( + configuration=OrderedDict({ + 'percent': hyperparams.Bounded[float]( + lower=0, + upper=0.5, + default=0.25, + description='It\'s a percentage and `ceil(min_samples_leaf * n_samples)` is the minimum number of samples for each node.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'absolute': hyperparams.Bounded[int]( + lower=1, + upper=None, + default=1, + description='Minimum number.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='absolute', + description='The minimum number of samples required to be at a leaf node: - If int, then consider `min_samples_leaf` as the minimum number. - If float, then `min_samples_leaf` is a percentage and `ceil(min_samples_leaf * n_samples)` are the minimum number of samples for each node. .. versionchanged:: 0.18 Added float values for percentages.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + min_weight_fraction_leaf = hyperparams.Bounded[float]( + default=0, + lower=0, + upper=0.5, + description='The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + subsample = hyperparams.Bounded[float]( + default=1.0, + lower=0, + upper=None, + description='The fraction of samples to be used for fitting the individual base learners. If smaller than 1.0 this results in Stochastic Gradient Boosting. `subsample` interacts with the parameter `n_estimators`. 
Choosing `subsample < 1.0` leads to a reduction of variance and an increase in bias.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + max_features = hyperparams.Union( + configuration=OrderedDict({ + 'specified_int': hyperparams.Bounded[int]( + lower=0, + upper=None, + default=0, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'calculated': hyperparams.Enumeration[str]( + values=['auto', 'sqrt', 'log2'], + default='auto', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'none': hyperparams.Constant( + default=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'percent': hyperparams.Bounded[float]( + default=0.25, + lower=0, + upper=1, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='none', + description='The number of features to consider when looking for the best split: - If int, then consider `max_features` features at each split. - If float, then `max_features` is a percentage and `int(max_features * n_features)` features are considered at each split. - If "auto", then `max_features=n_features`. - If "sqrt", then `max_features=sqrt(n_features)`. - If "log2", then `max_features=log2(n_features)`. - If None, then `max_features=n_features`. Choosing `max_features < n_features` leads to a reduction of variance and an increase in bias. Note: the search for a split does not stop until at least one valid partition of the node samples is found, even if it requires to effectively inspect more than ``max_features`` features.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + max_leaf_nodes = hyperparams.Union( + configuration=OrderedDict({ + 'int': hyperparams.Bounded[int]( + lower=0, + upper=None, + default=10, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'none': hyperparams.Constant( + default=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='none', + description='Grow trees with ``max_leaf_nodes`` in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + min_impurity_decrease = hyperparams.Bounded[float]( + default=0.0, + lower=0.0, + upper=None, + description='A node will be split if this split induces a decrease of the impurity greater than or equal to this value. The weighted impurity decrease equation is the following:: N_t / N * (impurity - N_t_R / N_t * right_impurity - N_t_L / N_t * left_impurity) where ``N`` is the total number of samples, ``N_t`` is the number of samples at the current node, ``N_t_L`` is the number of samples in the left child, and ``N_t_R`` is the number of samples in the right child. ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, if ``sample_weight`` is passed. .. 
versionadded:: 0.19 ', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + warm_start = hyperparams.UniformBool( + default=False, + description='When set to ``True``, reuse the solution of the previous call to fit and add more estimators to the ensemble, otherwise, just erase the previous solution.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + presort = hyperparams.Union( + configuration=OrderedDict({ + 'bool': hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'auto': hyperparams.Constant( + default='auto', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='auto', + description='Whether to presort the data to speed up the finding of best splits in fitting. Auto mode by default will use presorting on dense data and default to normal sorting on sparse data. Setting presort to true on sparse data will raise an error. .. versionadded:: 0.17 optional parameter *presort*.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + validation_fraction = hyperparams.Bounded[float]( + default=0.1, + lower=0, + upper=1, + description='The proportion of training data to set aside as validation set for early stopping. Must be between 0 and 1. Only used if ``n_iter_no_change`` is set to an integer.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + n_iter_no_change = hyperparams.Union( + configuration=OrderedDict({ + 'int': hyperparams.Bounded[int]( + default=5, + lower=0, + upper=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'none': hyperparams.Constant( + default=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='none', + description='``n_iter_no_change`` is used to decide if early stopping will be used to terminate training when validation score is not improving. By default it is set to None to disable early stopping. If set to a number, it will set aside ``validation_fraction`` size of the training data as validation and terminate training when validation score is not improving in all of the previous ``n_iter_no_change`` numbers of iterations.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + tol = hyperparams.Bounded[float]( + default=0.0001, + lower=0, + upper=None, + description='Tolerance for the early stopping. When the loss is not improving by at least tol for ``n_iter_no_change`` iterations (if set to a number), the training stops.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + use_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to use as training input. If any specified column cannot be parsed, it is skipped.", + ) + use_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to use as training target. 
If any specified column cannot be parsed, it is skipped.", + ) + exclude_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not use as training inputs. Applicable only if \"use_columns\" is not provided.", + ) + exclude_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not use as training target. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='new', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. 
To prevent pipelines from breaking set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute', 'https://metadata.datadrivendiscovery.org/types/PredictedTarget'], + default='https://metadata.datadrivendiscovery.org/types/PredictedTarget', + description='Decides what semantic type to attach to generated output', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + +class SKGradientBoostingRegressor(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]): + """ + Primitive wrapping for sklearn GradientBoostingRegressor + `sklearn documentation `_ + + """ + + __author__ = "JPL MARVIN" + metadata = metadata_base.PrimitiveMetadata({ + "algorithm_types": [metadata_base.PrimitiveAlgorithmType.GRADIENT_BOOSTING, ], + "name": "sklearn.ensemble.gradient_boosting.GradientBoostingRegressor", + "primitive_family": metadata_base.PrimitiveFamily.REGRESSION, + "python_path": "d3m.primitives.regression.gradient_boosting.SKlearn", + "source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html']}, + "version": "2019.11.13", + "id": "2a031907-6b2c-3390-b365-921f89c8816a", + "hyperparams_to_tune": ['n_estimators', 'learning_rate', 'max_depth', 'min_samples_leaf', 'min_samples_split', 'max_features'], + 'installation': [ + {'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + }] + }) + + def __init__(self, *, + hyperparams: Hyperparams, + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None, + _verbose: int = 0) -> None: + + super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) + + # False + self._clf = GradientBoostingRegressor( + loss=self.hyperparams['loss']['choice'], + alpha=self.hyperparams['loss'].get('alpha', 0.9), + learning_rate=self.hyperparams['learning_rate'], + n_estimators=self.hyperparams['n_estimators'], + max_depth=self.hyperparams['max_depth'], + criterion=self.hyperparams['criterion'], + min_samples_split=self.hyperparams['min_samples_split'], + min_samples_leaf=self.hyperparams['min_samples_leaf'], + min_weight_fraction_leaf=self.hyperparams['min_weight_fraction_leaf'], + subsample=self.hyperparams['subsample'], + max_features=self.hyperparams['max_features'], + max_leaf_nodes=self.hyperparams['max_leaf_nodes'], + min_impurity_decrease=self.hyperparams['min_impurity_decrease'], + warm_start=self.hyperparams['warm_start'], + presort=self.hyperparams['presort'], + validation_fraction=self.hyperparams['validation_fraction'], + n_iter_no_change=self.hyperparams['n_iter_no_change'], + tol=self.hyperparams['tol'], + verbose=_verbose, + random_state=self.random_seed, + ) + + self._inputs = None + self._outputs = None + self._training_inputs = None + self._training_outputs = None + self._target_names = None + self._training_indices = None + self._target_column_indices = None + self._target_columns_metadata: List[OrderedDict] = None + self._input_column_names = None + self._fitted = False + self._new_training_data = False + + def set_training_data(self, *, 
inputs: Inputs, outputs: Outputs) -> None: + self._inputs = inputs + self._outputs = outputs + self._fitted = False + self._new_training_data = True + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + if self._inputs is None or self._outputs is None: + raise ValueError("Missing training data.") + + if not self._new_training_data: + return CallResult(None) + self._new_training_data = False + + self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams) + self._training_outputs, self._target_names, self._target_column_indices = self._get_targets(self._outputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + if len(self._training_indices) > 0 and len(self._target_column_indices) > 0: + self._target_columns_metadata = self._get_target_columns_metadata(self._training_outputs.metadata, self.hyperparams) + sk_training_output = self._training_outputs.values + + shape = sk_training_output.shape + if len(shape) == 2 and shape[1] == 1: + sk_training_output = numpy.ravel(sk_training_output) + + self._clf.fit(self._training_inputs, sk_training_output) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + return CallResult(None) + + + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + sk_inputs, columns_to_use = self._get_columns_to_fit(inputs, self.hyperparams) + output = [] + if len(sk_inputs.columns): + try: + sk_output = self._clf.predict(sk_inputs) + except sklearn.exceptions.NotFittedError as error: + raise PrimitiveNotFittedError("Primitive not fitted.") from error + # For primitives that allow predicting without fitting like GaussianProcessRegressor + if not self._fitted: + raise PrimitiveNotFittedError("Primitive not fitted.") + if sparse.issparse(sk_output): + sk_output = sk_output.toarray() + output = self._wrap_predictions(inputs, sk_output) + output.columns = self._target_names + output = [output] + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._target_column_indices, + columns_list=output) + + return CallResult(outputs) + + + def get_params(self) -> Params: + if not self._fitted: + return Params( + oob_improvement_=None, + train_score_=None, + loss_=None, + estimators_=None, + n_features_=None, + init_=None, + max_features_=None, + n_classes_=None, + _rng=None, + n_estimators_=None, + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + return Params( + oob_improvement_=getattr(self._clf, 'oob_improvement_', None), + train_score_=getattr(self._clf, 'train_score_', None), + loss_=getattr(self._clf, 'loss_', None), + estimators_=getattr(self._clf, 'estimators_', None), + n_features_=getattr(self._clf, 'n_features_', None), + init_=getattr(self._clf, 'init_', None), + max_features_=getattr(self._clf, 'max_features_', None), + n_classes_=getattr(self._clf, 'n_classes_', None), + _rng=getattr(self._clf, '_rng', 
None), + n_estimators_=getattr(self._clf, 'n_estimators_', None), + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + def set_params(self, *, params: Params) -> None: + self._clf.oob_improvement_ = params['oob_improvement_'] + self._clf.train_score_ = params['train_score_'] + self._clf.loss_ = params['loss_'] + self._clf.estimators_ = params['estimators_'] + self._clf.n_features_ = params['n_features_'] + self._clf.init_ = params['init_'] + self._clf.max_features_ = params['max_features_'] + self._clf.n_classes_ = params['n_classes_'] + self._clf._rng = params['_rng'] + self._clf.n_estimators_ = params['n_estimators_'] + self._input_column_names = params['input_column_names'] + self._training_indices = params['training_indices_'] + self._target_names = params['target_names_'] + self._target_column_indices = params['target_column_indices_'] + self._target_columns_metadata = params['target_columns_metadata_'] + + if params['oob_improvement_'] is not None: + self._fitted = True + if params['train_score_'] is not None: + self._fitted = True + if params['loss_'] is not None: + self._fitted = True + if params['estimators_'] is not None: + self._fitted = True + if params['n_features_'] is not None: + self._fitted = True + if params['init_'] is not None: + self._fitted = True + if params['max_features_'] is not None: + self._fitted = True + if params['n_classes_'] is not None: + self._fitted = True + if params['_rng'] is not None: + self._fitted = True + if params['n_estimators_'] is not None: + self._fitted = True + + + + + + def produce_feature_importances(self, *, timeout: float = None, iterations: int = None) -> CallResult[d3m_dataframe]: + output = d3m_dataframe(self._clf.feature_importances_.reshape((1, len(self._input_column_names)))) + output.columns = self._input_column_names + for i in range(len(self._input_column_names)): + output.metadata = output.metadata.update_column(i, {"name": self._input_column_names[i]}) + return CallResult(output) + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=hyperparams['use_inputs_columns'], + exclude_columns=hyperparams['exclude_inputs_columns'], + can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, numpy.integer, numpy.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column 
metadata") + return False + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + @classmethod + def _get_targets(cls, data: d3m_dataframe, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return data, list(data.columns), list(range(len(data.columns))) + + metadata = data.metadata + + def can_produce_column(column_index: int) -> bool: + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/TrueTarget") + column_metadata = metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + semantic_types = set(column_metadata.get('semantic_types', [])) + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + return False + + target_column_indices, target_columns_not_to_produce = base_utils.get_columns_to_use(metadata, + use_columns=hyperparams[ + 'use_outputs_columns'], + exclude_columns= + hyperparams[ + 'exclude_outputs_columns'], + can_use_column=can_produce_column) + targets = [] + if target_column_indices: + targets = data.select_columns(target_column_indices) + target_column_names = [] + for idx in target_column_indices: + target_column_names.append(data.columns[idx]) + return targets, target_column_names, target_column_indices + + @classmethod + def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]: + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) + + # Update semantic types and prepare it for predicted targets. 
+ semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set(["https://metadata.datadrivendiscovery.org/types/TrueTarget","https://metadata.datadrivendiscovery.org/types/SuggestedTarget",]) + add_semantic_types = set(["https://metadata.datadrivendiscovery.org/types/PredictedTarget",]) + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + outputs = d3m_dataframe(predictions, generate_metadata=False) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, self._target_columns_metadata) + return outputs + + + @classmethod + def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata): + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict() + semantic_types = [] + semantic_types.append('https://metadata.datadrivendiscovery.org/types/PredictedTarget') + column_name = outputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name") + if column_name is None: + column_name = "output_{}".format(column_index) + column_metadata["semantic_types"] = semantic_types + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + +SKGradientBoostingRegressor.__doc__ = GradientBoostingRegressor.__doc__ \ No newline at end of file diff --git a/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKImputer.py b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKImputer.py new file mode 100644 index 0000000..203a3ca --- /dev/null +++ b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKImputer.py @@ -0,0 +1,391 @@ +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os +import sklearn +import numpy +import typing + +# Custom import commands if any +from sklearn.impute import SimpleImputer +from sklearn.impute._base import _get_mask + + +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult, DockerContainer +from d3m.primitive_interfaces.unsupervised_learning import UnsupervisedLearnerPrimitiveBase + + +Inputs = d3m_dataframe +Outputs = d3m_dataframe + + +class Params(params.Params): + statistics_: 
Optional[ndarray] + indicator_: Optional[sklearn.base.BaseEstimator] + input_column_names: Optional[Any] + target_names_: Optional[Sequence[Any]] + training_indices_: Optional[Sequence[int]] + target_column_indices_: Optional[Sequence[int]] + target_columns_metadata_: Optional[List[OrderedDict]] + + + +class Hyperparams(hyperparams.Hyperparams): + missing_values = hyperparams.Union( + configuration=OrderedDict({ + 'int': hyperparams.Hyperparameter[int]( + default=0, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'float': hyperparams.Hyperparameter[float]( + default=numpy.nan, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='float', + description='The placeholder for the missing values. All occurrences of `missing_values` will be imputed.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + strategy = hyperparams.Enumeration[str]( + default='mean', + values=['median', 'most_frequent', 'mean', 'constant'], + description='The imputation strategy. - If "mean", then replace missing values using the mean along each column. Can only be used with numeric data. - If "median", then replace missing values using the median along each column. Can only be used with numeric data. - If "most_frequent", then replace missing using the most frequent value along each column. Can be used with strings or numeric data. - If "constant", then replace missing values with fill_value. Can be used with strings or numeric data. .. versionadded:: 0.20 strategy="constant" for fixed value imputation.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + add_indicator = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + fill_value = hyperparams.Union( + configuration=OrderedDict({ + 'int': hyperparams.Hyperparameter[int]( + default=0, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'none': hyperparams.Constant( + default=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='none', + description='When strategy == "constant", fill_value is used to replace all occurrences of missing_values. If left to the default, fill_value will be 0 when imputing numerical data and "missing_value" for strings or object data types.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + use_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to operate on. If any specified column cannot be parsed, it is skipped.", + ) + exclude_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='new', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? 
This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute'], + default='https://metadata.datadrivendiscovery.org/types/Attribute', + description='Decides what semantic type to attach to generated attributes', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + +class SKImputer(UnsupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]): + """ + Primitive wrapping for sklearn SimpleImputer + `sklearn documentation `_ + + """ + + __author__ = "JPL MARVIN" + metadata = metadata_base.PrimitiveMetadata({ + "algorithm_types": [metadata_base.PrimitiveAlgorithmType.IMPUTATION, ], + "name": "sklearn.impute.SimpleImputer", + "primitive_family": metadata_base.PrimitiveFamily.DATA_CLEANING, + "python_path": "d3m.primitives.data_cleaning.imputer.SKlearn", + "source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html']}, + "version": "2019.11.13", + "id": "d016df89-de62-3c53-87ed-c06bb6a23cde", + "hyperparams_to_tune": ['strategy'], + 'installation': [ + {'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + }] + }) + + def __init__(self, *, + hyperparams: Hyperparams, + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None, + _verbose: int = 0) -> None: + + super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) + + # False + self._clf = SimpleImputer( + missing_values=self.hyperparams['missing_values'], + strategy=self.hyperparams['strategy'], + add_indicator=self.hyperparams['add_indicator'], + fill_value=self.hyperparams['fill_value'], + verbose=_verbose + ) + + self._inputs = None + self._outputs = None + self._training_inputs = None + self._training_outputs = None + self._target_names = None + self._training_indices = None + self._target_column_indices = None + self._target_columns_metadata: List[OrderedDict] = None + self._input_column_names = None + self._fitted = False + + + def set_training_data(self, *, inputs: Inputs) -> None: + self._inputs = inputs + 
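# Receiving new training data clears the fitted flag below, so fit() will re-run SimpleImputer.fit() on the new inputs before produce() can be used.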
self._fitted = False + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + if self._fitted: + return CallResult(None) + + self._training_inputs, self._training_indices, _ = self._get_columns_to_fit(self._inputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + if self._training_inputs is None: + return CallResult(None) + + if len(self._training_indices) > 0: + self._clf.fit(self._training_inputs) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + return CallResult(None) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + sk_inputs, columns_to_use, _ = self._get_columns_to_fit(inputs, self.hyperparams) + output = [] + if len(sk_inputs.columns): + try: + sk_output = self._clf.transform(sk_inputs) + except sklearn.exceptions.NotFittedError as error: + raise PrimitiveNotFittedError("Primitive not fitted.") from error + if sparse.issparse(sk_output): + sk_output = sk_output.toarray() + target_columns_metadata = self._copy_columns_metadata(inputs.metadata, self._training_indices, self.hyperparams) + output = self._wrap_predictions(inputs, sk_output, target_columns_metadata) + + output.columns = [inputs.columns[idx] for idx in range(len(inputs.columns)) if idx in self._training_indices] + output = [output] + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + _, _, dropped_cols = self._get_columns_to_fit(inputs, self.hyperparams) + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._training_indices + dropped_cols, + columns_list=output) + return CallResult(outputs) + + + def get_params(self) -> Params: + if not self._fitted: + return Params( + statistics_=None, + indicator_=None, + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + return Params( + statistics_=getattr(self._clf, 'statistics_', None), + indicator_=getattr(self._clf, 'indicator_', None), + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + def set_params(self, *, params: Params) -> None: + self._clf.statistics_ = params['statistics_'] + self._clf.indicator_ = params['indicator_'] + self._input_column_names = params['input_column_names'] + self._training_indices = params['training_indices_'] + self._target_names = params['target_names_'] + self._target_column_indices = params['target_column_indices_'] + self._target_columns_metadata = params['target_columns_metadata_'] + + if params['statistics_'] is not None: + self._fitted = True + if params['indicator_'] is not None: + self._fitted = True + + + + + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + + if not hyperparams['use_semantic_types']: + columns_to_produce = list(range(len(inputs.columns))) + + else: + inputs_metadata = inputs.metadata + + def 
can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=hyperparams['use_columns'], + exclude_columns=hyperparams['exclude_columns'], + can_use_column=can_produce_column) + + columns_to_drop = cls._get_columns_to_drop(inputs, columns_to_produce, hyperparams) + for col in columns_to_drop: + columns_to_produce.remove(col) + + return inputs.iloc[:, columns_to_produce], columns_to_produce, columns_to_drop + + @classmethod + def _get_columns_to_drop(cls, inputs: Inputs, column_indices: List[int], hyperparams: Hyperparams): + """ + Check for columns that contain missing_values that need to be imputed. + If strategy is "constant" and missing_values is NaN, then all-NaN columns are not dropped. + :param inputs: + :param column_indices: + :return: + """ + columns_to_remove = [] + if hyperparams['strategy'] != "constant": + for _, col in enumerate(column_indices): + inp = inputs.iloc[:, [col]].values + mask = _get_mask(inp, hyperparams['missing_values']) + if mask.all(): + columns_to_remove.append(col) + return columns_to_remove + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, numpy.integer, numpy.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + + @classmethod + def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]: + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) + + # Update semantic types and prepare it for predicted targets.
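+ # Nothing is removed here; the hyperparam-selected return_semantic_type is attached to each produced column.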
+ semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set([]) + add_semantic_types = set() + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray, target_columns_metadata) -> Outputs: + outputs = d3m_dataframe(predictions, generate_metadata=False) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, target_columns_metadata) + return outputs + + + + @classmethod + def _copy_columns_metadata(cls, inputs_metadata: metadata_base.DataMetadata, column_indices, hyperparams) -> List[OrderedDict]: + outputs_length = inputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in column_indices: + column_name = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name") + column_metadata = OrderedDict(inputs_metadata.query_column(column_index)) + semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set([]) + add_semantic_types = set() + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + +SKImputer.__doc__ = SimpleImputer.__doc__ \ No newline at end of file diff --git a/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKKNeighborsClassifier.py b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKKNeighborsClassifier.py new file mode 100644 index 0000000..75d5f2f --- /dev/null +++ b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKKNeighborsClassifier.py @@ -0,0 +1,497 @@ +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os +import sklearn +import numpy +import typing + +# Custom import commands if any +from sklearn.neighbors.classification import KNeighborsClassifier + + +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult, DockerContainer + +from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase +from d3m.primitive_interfaces.base import ProbabilisticCompositionalityMixin, 
ContinueFitMixin +from d3m import exceptions +import pandas + + + +Inputs = d3m_dataframe +Outputs = d3m_dataframe + + +class Params(params.Params): + _fit_method: Optional[str] + _fit_X: Optional[ndarray] + _tree: Optional[object] + classes_: Optional[ndarray] + _y: Optional[ndarray] + outputs_2d_: Optional[bool] + effective_metric_: Optional[str] + effective_metric_params_: Optional[Dict] + radius: Optional[float] + input_column_names: Optional[Any] + target_names_: Optional[Sequence[Any]] + training_indices_: Optional[Sequence[int]] + target_column_indices_: Optional[Sequence[int]] + target_columns_metadata_: Optional[List[OrderedDict]] + + + +class Hyperparams(hyperparams.Hyperparams): + n_neighbors = hyperparams.Bounded[int]( + default=5, + lower=0, + upper=None, + description='Number of neighbors to use by default for :meth:`k_neighbors` queries.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + weights = hyperparams.Enumeration[str]( + values=['uniform', 'distance'], + default='uniform', + description='weight function used in prediction. Possible values: - \'uniform\' : uniform weights. All points in each neighborhood are weighted equally. - \'distance\' : weight points by the inverse of their distance. in this case, closer neighbors of a query point will have a greater influence than neighbors which are further away. - [callable] : a user-defined function which accepts an array of distances, and returns an array of the same shape containing the weights.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + algorithm = hyperparams.Enumeration[str]( + values=['auto', 'ball_tree', 'kd_tree', 'brute'], + default='auto', + description='Algorithm used to compute the nearest neighbors: - \'ball_tree\' will use :class:`BallTree` - \'kd_tree\' will use :class:`KDTree` - \'brute\' will use a brute-force search. - \'auto\' will attempt to decide the most appropriate algorithm based on the values passed to :meth:`fit` method. Note: fitting on sparse input will override the setting of this parameter, using brute force.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + leaf_size = hyperparams.Bounded[int]( + default=30, + lower=0, + upper=None, + description='Leaf size passed to BallTree or KDTree. This can affect the speed of the construction and query, as well as the memory required to store the tree. The optimal value depends on the nature of the problem.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ResourcesUseParameter', 'https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + metric = hyperparams.Enumeration[str]( + values=['euclidean', 'manhattan', 'chebyshev', 'minkowski', 'wminkowski', 'seuclidean', 'mahalanobis'], + default='minkowski', + description='the distance metric to use for the tree. The default metric is minkowski, and with p=2 is equivalent to the standard Euclidean metric. See the documentation of the DistanceMetric class for a list of available metrics.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + p = hyperparams.Enumeration[int]( + values=[1, 2], + default=2, + description='Power parameter for the Minkowski metric. When p = 1, this is equivalent to using manhattan_distance (l1), and euclidean_distance (l2) for p = 2. 
For arbitrary p, minkowski_distance (l_p) is used.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + n_jobs = hyperparams.Union( + configuration=OrderedDict({ + 'limit': hyperparams.Bounded[int]( + default=1, + lower=1, + upper=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'all_cores': hyperparams.Constant( + default=-1, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='limit', + description='The number of parallel jobs to run for neighbors search. If ``-1``, then the number of jobs is set to the number of CPU cores. Doesn\'t affect :meth:`fit` method.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ResourcesUseParameter'] + ) + + use_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to use as training input. If any specified column cannot be parsed, it is skipped.", + ) + use_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to use as training target. If any specified column cannot be parsed, it is skipped.", + ) + exclude_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not use as training inputs. Applicable only if \"use_columns\" is not provided.", + ) + exclude_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not use as training target. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='new', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. 
To prevent pipelines from breaking set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute', 'https://metadata.datadrivendiscovery.org/types/PredictedTarget'], + default='https://metadata.datadrivendiscovery.org/types/PredictedTarget', + description='Decides what semantic type to attach to generated output', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + +class SKKNeighborsClassifier(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams], + ProbabilisticCompositionalityMixin[Inputs, Outputs, Params, Hyperparams]): + """ + Primitive wrapping for sklearn KNeighborsClassifier + `sklearn documentation `_ + + """ + + __author__ = "JPL MARVIN" + metadata = metadata_base.PrimitiveMetadata({ + "algorithm_types": [metadata_base.PrimitiveAlgorithmType.K_NEAREST_NEIGHBORS, ], + "name": "sklearn.neighbors.classification.KNeighborsClassifier", + "primitive_family": metadata_base.PrimitiveFamily.CLASSIFICATION, + "python_path": "d3m.primitives.classification.k_neighbors.SKlearn", + "source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html']}, + "version": "2019.11.13", + "id": "754f7210-a0b7-3b7a-8c98-f43c7b663d28", + "hyperparams_to_tune": ['n_neighbors', 'p'], + 'installation': [ + {'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + }] + }) + + def __init__(self, *, + hyperparams: Hyperparams, + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None) -> None: + + super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) + + # False + self._clf = KNeighborsClassifier( + n_neighbors=self.hyperparams['n_neighbors'], + weights=self.hyperparams['weights'], + algorithm=self.hyperparams['algorithm'], + leaf_size=self.hyperparams['leaf_size'], + metric=self.hyperparams['metric'], + p=self.hyperparams['p'], + n_jobs=self.hyperparams['n_jobs'], + ) + + self._inputs = None + self._outputs = None + self._training_inputs = None + self._training_outputs = None + self._target_names = None + self._training_indices = None + self._target_column_indices = None + self._target_columns_metadata: List[OrderedDict] = None + self._input_column_names = None + self._fitted = False + self._new_training_data = False + + def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None: + self._inputs = inputs + self._outputs = outputs + self._fitted = False + self._new_training_data = True + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + if self._inputs is None or self._outputs is None: + raise ValueError("Missing training data.") + + if not self._new_training_data: + return CallResult(None) + self._new_training_data = False + + self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams) + self._training_outputs, self._target_names, self._target_column_indices = self._get_targets(self._outputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + if 
len(self._training_indices) > 0 and len(self._target_column_indices) > 0: + self._target_columns_metadata = self._get_target_columns_metadata(self._training_outputs.metadata, self.hyperparams) + sk_training_output = self._training_outputs.values + + shape = sk_training_output.shape + if len(shape) == 2 and shape[1] == 1: + sk_training_output = numpy.ravel(sk_training_output) + + self._clf.fit(self._training_inputs, sk_training_output) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + return CallResult(None) + + + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + sk_inputs, columns_to_use = self._get_columns_to_fit(inputs, self.hyperparams) + output = [] + if len(sk_inputs.columns): + try: + sk_output = self._clf.predict(sk_inputs) + except sklearn.exceptions.NotFittedError as error: + raise PrimitiveNotFittedError("Primitive not fitted.") from error + # For primitives that allow predicting without fitting like GaussianProcessRegressor + if not self._fitted: + raise PrimitiveNotFittedError("Primitive not fitted.") + if sparse.issparse(sk_output): + sk_output = sk_output.toarray() + output = self._wrap_predictions(inputs, sk_output) + output.columns = self._target_names + output = [output] + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._target_column_indices, + columns_list=output) + + return CallResult(outputs) + + + def get_params(self) -> Params: + if not self._fitted: + return Params( + _fit_method=None, + _fit_X=None, + _tree=None, + classes_=None, + _y=None, + outputs_2d_=None, + effective_metric_=None, + effective_metric_params_=None, + radius=None, + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + return Params( + _fit_method=getattr(self._clf, '_fit_method', None), + _fit_X=getattr(self._clf, '_fit_X', None), + _tree=getattr(self._clf, '_tree', None), + classes_=getattr(self._clf, 'classes_', None), + _y=getattr(self._clf, '_y', None), + outputs_2d_=getattr(self._clf, 'outputs_2d_', None), + effective_metric_=getattr(self._clf, 'effective_metric_', None), + effective_metric_params_=getattr(self._clf, 'effective_metric_params_', None), + radius=getattr(self._clf, 'radius', None), + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + def set_params(self, *, params: Params) -> None: + self._clf._fit_method = params['_fit_method'] + self._clf._fit_X = params['_fit_X'] + self._clf._tree = params['_tree'] + self._clf.classes_ = params['classes_'] + self._clf._y = params['_y'] + self._clf.outputs_2d_ = params['outputs_2d_'] + self._clf.effective_metric_ = params['effective_metric_'] + self._clf.effective_metric_params_ = params['effective_metric_params_'] + self._clf.radius = params['radius'] + self._input_column_names = 
params['input_column_names'] + self._training_indices = params['training_indices_'] + self._target_names = params['target_names_'] + self._target_column_indices = params['target_column_indices_'] + self._target_columns_metadata = params['target_columns_metadata_'] + + if params['_fit_method'] is not None: + self._fitted = True + if params['_fit_X'] is not None: + self._fitted = True + if params['_tree'] is not None: + self._fitted = True + if params['classes_'] is not None: + self._fitted = True + if params['_y'] is not None: + self._fitted = True + if params['outputs_2d_'] is not None: + self._fitted = True + if params['effective_metric_'] is not None: + self._fitted = True + if params['effective_metric_params_'] is not None: + self._fitted = True + if params['radius'] is not None: + self._fitted = True + + + + def log_likelihoods(self, *, + outputs: Outputs, + inputs: Inputs, + timeout: float = None, + iterations: int = None) -> CallResult[Sequence[float]]: + inputs = inputs.values # Get ndarray + outputs = outputs.values + return CallResult(numpy.log(self._clf.predict_proba(inputs)[:, outputs])) + + + + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=hyperparams['use_inputs_columns'], + exclude_columns=hyperparams['exclude_inputs_columns'], + can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, numpy.integer, numpy.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + @classmethod + def _get_targets(cls, data: d3m_dataframe, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return data, list(data.columns), list(range(len(data.columns))) + + metadata = data.metadata + + def can_produce_column(column_index: int) -> bool: + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/TrueTarget") + column_metadata = metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + semantic_types = set(column_metadata.get('semantic_types', [])) + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + return False + + target_column_indices, 
target_columns_not_to_produce = base_utils.get_columns_to_use(metadata, + use_columns=hyperparams[ + 'use_outputs_columns'], + exclude_columns= + hyperparams[ + 'exclude_outputs_columns'], + can_use_column=can_produce_column) + targets = [] + if target_column_indices: + targets = data.select_columns(target_column_indices) + target_column_names = [] + for idx in target_column_indices: + target_column_names.append(data.columns[idx]) + return targets, target_column_names, target_column_indices + + @classmethod + def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]: + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) + + # Update semantic types and prepare it for predicted targets. + semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set(["https://metadata.datadrivendiscovery.org/types/TrueTarget","https://metadata.datadrivendiscovery.org/types/SuggestedTarget",]) + add_semantic_types = set(["https://metadata.datadrivendiscovery.org/types/PredictedTarget",]) + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + outputs = d3m_dataframe(predictions, generate_metadata=False) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, self._target_columns_metadata) + return outputs + + + @classmethod + def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata): + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict() + semantic_types = [] + semantic_types.append('https://metadata.datadrivendiscovery.org/types/PredictedTarget') + column_name = outputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name") + if column_name is None: + column_name = "output_{}".format(column_index) + column_metadata["semantic_types"] = semantic_types + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + +SKKNeighborsClassifier.__doc__ = KNeighborsClassifier.__doc__ \ No newline at end of file diff --git a/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKKNeighborsRegressor.py b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKKNeighborsRegressor.py new file mode 100644 index 0000000..38b4469 --- /dev/null +++ 
b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKKNeighborsRegressor.py @@ -0,0 +1,475 @@ +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os +import sklearn +import numpy +import typing + +# Custom import commands if any +from sklearn.neighbors.regression import KNeighborsRegressor + + +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult, DockerContainer + +from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase +from d3m.primitive_interfaces.base import ProbabilisticCompositionalityMixin, ContinueFitMixin +from d3m import exceptions +import pandas + + + +Inputs = d3m_dataframe +Outputs = d3m_dataframe + + +class Params(params.Params): + _fit_method: Optional[str] + _fit_X: Optional[ndarray] + _tree: Optional[object] + _y: Optional[ndarray] + effective_metric_: Optional[str] + effective_metric_params_: Optional[Dict] + radius: Optional[float] + input_column_names: Optional[Any] + target_names_: Optional[Sequence[Any]] + training_indices_: Optional[Sequence[int]] + target_column_indices_: Optional[Sequence[int]] + target_columns_metadata_: Optional[List[OrderedDict]] + + + +class Hyperparams(hyperparams.Hyperparams): + n_neighbors = hyperparams.Bounded[int]( + default=5, + lower=0, + upper=None, + description='Number of neighbors to use by default for :meth:`k_neighbors` queries.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + weights = hyperparams.Enumeration[str]( + values=['uniform', 'distance'], + default='uniform', + description='weight function used in prediction. Possible values: - \'uniform\' : uniform weights. All points in each neighborhood are weighted equally. - \'distance\' : weight points by the inverse of their distance. in this case, closer neighbors of a query point will have a greater influence than neighbors which are further away. - [callable] : a user-defined function which accepts an array of distances, and returns an array of the same shape containing the weights. Uniform weights are used by default.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + algorithm = hyperparams.Enumeration[str]( + values=['auto', 'ball_tree', 'kd_tree', 'brute'], + default='auto', + description='Algorithm used to compute the nearest neighbors: - \'ball_tree\' will use :class:`BallTree` - \'kd_tree\' will use :class:`KDtree` - \'brute\' will use a brute-force search. - \'auto\' will attempt to decide the most appropriate algorithm based on the values passed to :meth:`fit` method. Note: fitting on sparse input will override the setting of this parameter, using brute force.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + leaf_size = hyperparams.Bounded[int]( + default=30, + lower=0, + upper=None, + description='Leaf size passed to BallTree or KDTree. This can affect the speed of the construction and query, as well as the memory required to store the tree. 
The optimal value depends on the nature of the problem.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ResourcesUseParameter', 'https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + metric = hyperparams.Constant( + default='minkowski', + description='the distance metric to use for the tree. The default metric is minkowski, and with p=2 is equivalent to the standard Euclidean metric. See the documentation of the DistanceMetric class for a list of available metrics.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + p = hyperparams.Enumeration[int]( + values=[1, 2], + default=2, + description='Power parameter for the Minkowski metric. When p = 1, this is equivalent to using manhattan_distance (l1), and euclidean_distance (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + n_jobs = hyperparams.Union( + configuration=OrderedDict({ + 'limit': hyperparams.Bounded[int]( + default=1, + lower=1, + upper=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'all_cores': hyperparams.Constant( + default=-1, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='limit', + description='The number of parallel jobs to run for neighbors search. If ``-1``, then the number of jobs is set to the number of CPU cores. Doesn\'t affect :meth:`fit` method.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ResourcesUseParameter'] + ) + + use_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to use as training input. If any specified column cannot be parsed, it is skipped.", + ) + use_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to use as training target. If any specified column cannot be parsed, it is skipped.", + ) + exclude_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not use as training inputs. Applicable only if \"use_columns\" is not provided.", + ) + exclude_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not use as training target. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='new', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? 
This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute', 'https://metadata.datadrivendiscovery.org/types/PredictedTarget'], + default='https://metadata.datadrivendiscovery.org/types/PredictedTarget', + description='Decides what semantic type to attach to generated output', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + +class SKKNeighborsRegressor(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]): + """ + Primitive wrapping for sklearn KNeighborsRegressor + `sklearn documentation `_ + + """ + + __author__ = "JPL MARVIN" + metadata = metadata_base.PrimitiveMetadata({ + "algorithm_types": [metadata_base.PrimitiveAlgorithmType.K_NEAREST_NEIGHBORS, ], + "name": "sklearn.neighbors.regression.KNeighborsRegressor", + "primitive_family": metadata_base.PrimitiveFamily.REGRESSION, + "python_path": "d3m.primitives.regression.k_neighbors.SKlearn", + "source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsRegressor.html']}, + "version": "2019.11.13", + "id": "50b499a5-cef8-3028-8a99-ae553819f855", + "hyperparams_to_tune": ['n_neighbors', 'p'], + 'installation': [ + {'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + }] + }) + + def __init__(self, *, + hyperparams: Hyperparams, + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None) -> None: + + super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) + + # False + self._clf = KNeighborsRegressor( + n_neighbors=self.hyperparams['n_neighbors'], + weights=self.hyperparams['weights'], + algorithm=self.hyperparams['algorithm'], + leaf_size=self.hyperparams['leaf_size'], + metric=self.hyperparams['metric'], + p=self.hyperparams['p'], + n_jobs=self.hyperparams['n_jobs'], + ) + + self._inputs = None + self._outputs = None + self._training_inputs = None + self._training_outputs = None + self._target_names = None + self._training_indices = None + self._target_column_indices = None + self._target_columns_metadata: 
List[OrderedDict] = None + self._input_column_names = None + self._fitted = False + self._new_training_data = False + + def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None: + self._inputs = inputs + self._outputs = outputs + self._fitted = False + self._new_training_data = True + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + if self._inputs is None or self._outputs is None: + raise ValueError("Missing training data.") + + if not self._new_training_data: + return CallResult(None) + self._new_training_data = False + + self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams) + self._training_outputs, self._target_names, self._target_column_indices = self._get_targets(self._outputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + if len(self._training_indices) > 0 and len(self._target_column_indices) > 0: + self._target_columns_metadata = self._get_target_columns_metadata(self._training_outputs.metadata, self.hyperparams) + sk_training_output = self._training_outputs.values + + shape = sk_training_output.shape + if len(shape) == 2 and shape[1] == 1: + sk_training_output = numpy.ravel(sk_training_output) + + self._clf.fit(self._training_inputs, sk_training_output) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + return CallResult(None) + + + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + sk_inputs, columns_to_use = self._get_columns_to_fit(inputs, self.hyperparams) + output = [] + if len(sk_inputs.columns): + try: + sk_output = self._clf.predict(sk_inputs) + except sklearn.exceptions.NotFittedError as error: + raise PrimitiveNotFittedError("Primitive not fitted.") from error + # For primitives that allow predicting without fitting like GaussianProcessRegressor + if not self._fitted: + raise PrimitiveNotFittedError("Primitive not fitted.") + if sparse.issparse(sk_output): + sk_output = sk_output.toarray() + output = self._wrap_predictions(inputs, sk_output) + output.columns = self._target_names + output = [output] + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._target_column_indices, + columns_list=output) + + return CallResult(outputs) + + + def get_params(self) -> Params: + if not self._fitted: + return Params( + _fit_method=None, + _fit_X=None, + _tree=None, + _y=None, + effective_metric_=None, + effective_metric_params_=None, + radius=None, + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + return Params( + _fit_method=getattr(self._clf, '_fit_method', None), + _fit_X=getattr(self._clf, '_fit_X', None), + _tree=getattr(self._clf, '_tree', None), + _y=getattr(self._clf, '_y', None), + effective_metric_=getattr(self._clf, 'effective_metric_', None), + effective_metric_params_=getattr(self._clf, 'effective_metric_params_', None), + radius=getattr(self._clf, 'radius', 
None), + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + def set_params(self, *, params: Params) -> None: + self._clf._fit_method = params['_fit_method'] + self._clf._fit_X = params['_fit_X'] + self._clf._tree = params['_tree'] + self._clf._y = params['_y'] + self._clf.effective_metric_ = params['effective_metric_'] + self._clf.effective_metric_params_ = params['effective_metric_params_'] + self._clf.radius = params['radius'] + self._input_column_names = params['input_column_names'] + self._training_indices = params['training_indices_'] + self._target_names = params['target_names_'] + self._target_column_indices = params['target_column_indices_'] + self._target_columns_metadata = params['target_columns_metadata_'] + + if params['_fit_method'] is not None: + self._fitted = True + if params['_fit_X'] is not None: + self._fitted = True + if params['_tree'] is not None: + self._fitted = True + if params['_y'] is not None: + self._fitted = True + if params['effective_metric_'] is not None: + self._fitted = True + if params['effective_metric_params_'] is not None: + self._fitted = True + if params['radius'] is not None: + self._fitted = True + + + + + + + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=hyperparams['use_inputs_columns'], + exclude_columns=hyperparams['exclude_inputs_columns'], + can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, numpy.integer, numpy.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + @classmethod + def _get_targets(cls, data: d3m_dataframe, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return data, list(data.columns), list(range(len(data.columns))) + + metadata = data.metadata + + def can_produce_column(column_index: int) -> bool: + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/TrueTarget") + column_metadata = metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + semantic_types = set(column_metadata.get('semantic_types', [])) + if len(semantic_types) == 0: + cls.logger.warning("No semantic types 
found in column metadata") + return False + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + return False + + target_column_indices, target_columns_not_to_produce = base_utils.get_columns_to_use(metadata, + use_columns=hyperparams[ + 'use_outputs_columns'], + exclude_columns= + hyperparams[ + 'exclude_outputs_columns'], + can_use_column=can_produce_column) + targets = [] + if target_column_indices: + targets = data.select_columns(target_column_indices) + target_column_names = [] + for idx in target_column_indices: + target_column_names.append(data.columns[idx]) + return targets, target_column_names, target_column_indices + + @classmethod + def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]: + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) + + # Update semantic types and prepare it for predicted targets. + semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set(["https://metadata.datadrivendiscovery.org/types/TrueTarget","https://metadata.datadrivendiscovery.org/types/SuggestedTarget",]) + add_semantic_types = set(["https://metadata.datadrivendiscovery.org/types/PredictedTarget",]) + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + outputs = d3m_dataframe(predictions, generate_metadata=False) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, self._target_columns_metadata) + return outputs + + + @classmethod + def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata): + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict() + semantic_types = [] + semantic_types.append('https://metadata.datadrivendiscovery.org/types/PredictedTarget') + column_name = outputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name") + if column_name is None: + column_name = "output_{}".format(column_index) + column_metadata["semantic_types"] = semantic_types + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + +SKKNeighborsRegressor.__doc__ = KNeighborsRegressor.__doc__ \ No newline at end of file diff --git 
a/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKKernelPCA.py b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKKernelPCA.py new file mode 100644 index 0000000..0c7fb57 --- /dev/null +++ b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKKernelPCA.py @@ -0,0 +1,536 @@ +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os +import sklearn +import numpy +import typing + +# Custom import commands if any +from sklearn.decomposition.kernel_pca import KernelPCA + + +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult, DockerContainer +from d3m.primitive_interfaces.unsupervised_learning import UnsupervisedLearnerPrimitiveBase + + +Inputs = d3m_dataframe +Outputs = d3m_dataframe + + +class Params(params.Params): + lambdas_: Optional[ndarray] + alphas_: Optional[ndarray] + dual_coef_: Optional[ndarray] + X_fit_: Optional[ndarray] + _centerer: Optional[sklearn.base.BaseEstimator] + X_transformed_fit_: Optional[ndarray] + input_column_names: Optional[Any] + target_names_: Optional[Sequence[Any]] + training_indices_: Optional[Sequence[int]] + target_column_indices_: Optional[Sequence[int]] + target_columns_metadata_: Optional[List[OrderedDict]] + + + +class Hyperparams(hyperparams.Hyperparams): + n_components = hyperparams.Union( + configuration=OrderedDict({ + 'int': hyperparams.Bounded[int]( + lower=0, + upper=None, + default=100, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'none': hyperparams.Constant( + default=None, + description='All non-zero components are kept.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='none', + description='Number of components. 
If None, all non-zero components are kept.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + kernel = hyperparams.Choice( + choices={ + 'linear': hyperparams.Hyperparams.define( + configuration=OrderedDict({}) + ), + 'poly': hyperparams.Hyperparams.define( + configuration=OrderedDict({ + 'degree': hyperparams.Bounded[int]( + default=3, + lower=0, + upper=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ), + 'gamma': hyperparams.Union( + configuration=OrderedDict({ + 'float': hyperparams.Constant( + default=1.0, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'none': hyperparams.Constant( + default=None, + description='Equals 1/n_features.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='none', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ), + 'coef0': hyperparams.Constant( + default=1, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + }) + ), + 'rbf': hyperparams.Hyperparams.define( + configuration=OrderedDict({ + 'gamma': hyperparams.Union( + configuration=OrderedDict({ + 'float': hyperparams.Constant( + default=1.0, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'none': hyperparams.Constant( + default=None, + description='Equals 1/n_features.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='none', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + }) + ), + 'sigmoid': hyperparams.Hyperparams.define( + configuration=OrderedDict({ + 'gamma': hyperparams.Union( + configuration=OrderedDict({ + 'float': hyperparams.Constant( + default=1.0, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'none': hyperparams.Constant( + default=None, + description='Equals 1/n_features.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='none', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ), + 'coef0': hyperparams.Constant( + default=1, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + }) + ), + 'precomputed': hyperparams.Hyperparams.define( + configuration=OrderedDict({}) + ) + }, + default='rbf', + description='Kernel. Default="linear".', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + fit_inverse_transform = hyperparams.UniformBool( + default=False, + description='Learn the inverse transform for non-precomputed kernels. (i.e. learn to find the pre-image of a point)', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + alpha = hyperparams.Constant( + default=1, + description='Hyperparameter of the ridge regression that learns the inverse transform (when fit_inverse_transform=True).', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + eigen_solver = hyperparams.Enumeration[str]( + default='auto', + values=['auto', 'dense', 'arpack'], + description='Select eigensolver to use. 
If n_components is much less than the number of training samples, arpack may be more efficient than the dense eigensolver.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + tol = hyperparams.Bounded[float]( + default=0, + lower=0, + upper=None, + description='Convergence tolerance for arpack. If 0, optimal value will be chosen by arpack.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + max_iter = hyperparams.Union( + configuration=OrderedDict({ + 'int': hyperparams.Bounded[int]( + lower=0, + upper=None, + default=4, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'none': hyperparams.Constant( + default=None, + description='Optimal value is chosen by arpack.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='none', + description='Maximum number of iterations for arpack. If None, optimal value will be chosen by arpack.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + remove_zero_eig = hyperparams.UniformBool( + default=False, + description='If True, then all components with zero eigenvalues are removed, so that the number of components in the output may be < n_components (and sometimes even zero due to numerical instability). When n_components is None, this parameter is ignored and components with zero eigenvalues are removed regardless.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + n_jobs = hyperparams.Union( + configuration=OrderedDict({ + 'limit': hyperparams.Bounded[int]( + default=1, + lower=1, + upper=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'all_cores': hyperparams.Constant( + default=-1, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='limit', + description='The number of parallel jobs to run. If `-1`, then the number of jobs is set to the number of CPU cores. .. versionadded:: 0.18 copy_X : boolean, default=True If True, input X is copied and stored by the model in the `X_fit_` attribute. If no further changes will be done to X, setting `copy_X=False` saves memory by storing a reference. .. versionadded:: 0.18', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ResourcesUseParameter'] + ) + + use_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to operate on. If any specified column cannot be parsed, it is skipped.", + ) + exclude_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='new', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? 
This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute'], + default='https://metadata.datadrivendiscovery.org/types/Attribute', + description='Decides what semantic type to attach to generated attributes', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + +class SKKernelPCA(UnsupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]): + """ + Primitive wrapping for sklearn KernelPCA + `sklearn documentation `_ + + """ + + __author__ = "JPL MARVIN" + metadata = metadata_base.PrimitiveMetadata({ + "algorithm_types": [metadata_base.PrimitiveAlgorithmType.PRINCIPAL_COMPONENT_ANALYSIS, ], + "name": "sklearn.decomposition.kernel_pca.KernelPCA", + "primitive_family": metadata_base.PrimitiveFamily.FEATURE_EXTRACTION, + "python_path": "d3m.primitives.feature_extraction.kernel_pca.SKlearn", + "source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.KernelPCA.html']}, + "version": "2019.11.13", + "id": "fec6eba2-4a1b-3ea9-a31f-1da371941ede", + "hyperparams_to_tune": ['n_components', 'kernel', 'alpha'], + 'installation': [ + {'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + }] + }) + + def __init__(self, *, + hyperparams: Hyperparams, + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None) -> None: + + super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) + + # False + self._clf = KernelPCA( + n_components=self.hyperparams['n_components'], + kernel=self.hyperparams['kernel']['choice'], + degree=self.hyperparams['kernel'].get('degree', 3), + gamma=self.hyperparams['kernel'].get('gamma', 'none'), + coef0=self.hyperparams['kernel'].get('coef0', 1), + fit_inverse_transform=self.hyperparams['fit_inverse_transform'], + alpha=self.hyperparams['alpha'], + eigen_solver=self.hyperparams['eigen_solver'], + tol=self.hyperparams['tol'], + max_iter=self.hyperparams['max_iter'], + remove_zero_eig=self.hyperparams['remove_zero_eig'], + n_jobs=self.hyperparams['n_jobs'], + 
random_state=self.random_seed, + ) + + self._inputs = None + self._outputs = None + self._training_inputs = None + self._training_outputs = None + self._target_names = None + self._training_indices = None + self._target_column_indices = None + self._target_columns_metadata: List[OrderedDict] = None + self._input_column_names = None + self._fitted = False + + + def set_training_data(self, *, inputs: Inputs) -> None: + self._inputs = inputs + self._fitted = False + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + if self._fitted: + return CallResult(None) + + self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + if self._training_inputs is None: + return CallResult(None) + + if len(self._training_indices) > 0: + self._clf.fit(self._training_inputs) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + return CallResult(None) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + if not self._fitted: + raise PrimitiveNotFittedError("Primitive not fitted.") + sk_inputs = inputs + if self.hyperparams['use_semantic_types']: + sk_inputs = inputs.iloc[:, self._training_indices] + output_columns = [] + if len(self._training_indices) > 0: + sk_output = self._clf.transform(sk_inputs) + if sparse.issparse(sk_output): + sk_output = sk_output.toarray() + outputs = self._wrap_predictions(inputs, sk_output) + if len(outputs.columns) == len(self._input_column_names): + outputs.columns = self._input_column_names + output_columns = [outputs] + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._training_indices, + columns_list=output_columns) + return CallResult(outputs) + + + def get_params(self) -> Params: + if not self._fitted: + return Params( + lambdas_=None, + alphas_=None, + dual_coef_=None, + X_fit_=None, + _centerer=None, + X_transformed_fit_=None, + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + return Params( + lambdas_=getattr(self._clf, 'lambdas_', None), + alphas_=getattr(self._clf, 'alphas_', None), + dual_coef_=getattr(self._clf, 'dual_coef_', None), + X_fit_=getattr(self._clf, 'X_fit_', None), + _centerer=getattr(self._clf, '_centerer', None), + X_transformed_fit_=getattr(self._clf, 'X_transformed_fit_', None), + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + def set_params(self, *, params: Params) -> None: + self._clf.lambdas_ = params['lambdas_'] + self._clf.alphas_ = params['alphas_'] + self._clf.dual_coef_ = params['dual_coef_'] + self._clf.X_fit_ = params['X_fit_'] + self._clf._centerer = params['_centerer'] + self._clf.X_transformed_fit_ = params['X_transformed_fit_'] + 
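The `KernelPCA` constructor call assembled near the top of this class flattens the nested `kernel` Choice hyperparameter into plain sklearn keyword arguments. A minimal sketch of that mapping, where `kernel_config` is a hypothetical stand-in for the resolved value of `self.hyperparams['kernel']` with the default `'rbf'` choice, and the other keywords mirror the wrapper's declared defaults:
```
# Sketch only: kernel_config stands in for self.hyperparams['kernel'];
# keyword values mirror the wrapper's defaults above.
from sklearn.decomposition import KernelPCA

kernel_config = {'choice': 'rbf', 'gamma': None}   # hypothetical resolved Choice value

estimator = KernelPCA(
    n_components=None,                       # 'none' default: keep all non-zero components
    kernel=kernel_config['choice'],
    degree=kernel_config.get('degree', 3),   # present only for the 'poly' choice
    gamma=kernel_config.get('gamma', None),  # None lets sklearn use 1 / n_features
    coef0=kernel_config.get('coef0', 1),     # 'poly' / 'sigmoid' only
    eigen_solver='auto',
    tol=0,
    max_iter=None,
    remove_zero_eig=False,
    random_state=0,
)
```
The `.get(..., fallback)` calls matter because choices such as `'linear'` and `'precomputed'` carry no `degree`, `gamma`, or `coef0` entries at all.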
self._input_column_names = params['input_column_names'] + self._training_indices = params['training_indices_'] + self._target_names = params['target_names_'] + self._target_column_indices = params['target_column_indices_'] + self._target_columns_metadata = params['target_columns_metadata_'] + + if params['lambdas_'] is not None: + self._fitted = True + if params['alphas_'] is not None: + self._fitted = True + if params['dual_coef_'] is not None: + self._fitted = True + if params['X_fit_'] is not None: + self._fitted = True + if params['_centerer'] is not None: + self._fitted = True + if params['X_transformed_fit_'] is not None: + self._fitted = True + + + + + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=hyperparams['use_columns'], + exclude_columns=hyperparams['exclude_columns'], + can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, numpy.integer, numpy.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + + @classmethod + def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]: + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) + + # Update semantic types and prepare it for predicted targets. 
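`_can_produce_column` above reduces to two checks: the column's structural type must be numeric, and every accepted semantic type must already be present on the column. A small, self-contained illustration of that subset test (the `column` dict here is made up for the example; real values come from `DataMetadata.query()`):
```
# Toy column record standing in for inputs_metadata.query((ALL_ELEMENTS, idx)).
column = {
    'structural_type': float,
    'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'],
}

accepted_structural_types = (int, float)
accepted_semantic_types = {'https://metadata.datadrivendiscovery.org/types/Attribute'}

usable = (issubclass(column['structural_type'], accepted_structural_types)
          and len(accepted_semantic_types - set(column['semantic_types'])) == 0)
print(usable)  # True
```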
+ semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set([]) + add_semantic_types = [] + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + outputs = d3m_dataframe(predictions, generate_metadata=True) + target_columns_metadata = self._copy_inputs_metadata(inputs.metadata, self._training_indices, outputs.metadata, self.hyperparams) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, target_columns_metadata) + return outputs + + + @classmethod + def _copy_inputs_metadata(cls, inputs_metadata: metadata_base.DataMetadata, input_indices: List[int], + outputs_metadata: metadata_base.DataMetadata, hyperparams): + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + target_columns_metadata: List[OrderedDict] = [] + for column_index in input_indices: + column_name = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name") + if column_name is None: + column_name = "output_{}".format(column_index) + + column_metadata = OrderedDict(inputs_metadata.query_column(column_index)) + semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set([]) + add_semantic_types = set() + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + # If outputs has more columns than index, add Attribute Type to all remaining + if outputs_length > len(input_indices): + for column_index in range(len(input_indices), outputs_length): + column_metadata = OrderedDict() + semantic_types = set() + semantic_types.add(hyperparams["return_semantic_type"]) + column_name = "output_{}".format(column_index) + column_metadata["semantic_types"] = list(semantic_types) + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + +SKKernelPCA.__doc__ = KernelPCA.__doc__ \ No newline at end of file diff --git a/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKKernelRidge.py b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKKernelRidge.py new file mode 100644 index 0000000..a8b12ee --- /dev/null +++ b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKKernelRidge.py @@ -0,0 +1,491 @@ +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os 
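With the KernelPCA wrapper complete, here is a hedged end-to-end sketch of how such an unsupervised primitive is typically driven. Column names and values are illustrative, it assumes the sklearn-wrap package from this diff is installed, and it relies on the standard d3m `Hyperparams.defaults()` helper behaving as it does for other common primitives:
```
import pandas
from d3m.container import DataFrame as d3m_dataframe
from sklearn_wrap.SKKernelPCA import SKKernelPCA, Hyperparams

# With use_semantic_types=False (the default), all numeric columns are used.
df = d3m_dataframe(pandas.DataFrame({'a': [0.1, 0.4, 0.9], 'b': [1.0, 0.2, 0.5]}),
                   generate_metadata=True)

primitive = SKKernelPCA(hyperparams=Hyperparams.defaults())
primitive.set_training_data(inputs=df)
primitive.fit()
transformed = primitive.produce(inputs=df).value  # CallResult -> DataFrame of components
```
Non-default hyperparameters would be supplied through the same `Hyperparams` class rather than passed to the sklearn estimator directly.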
+import sklearn +import numpy +import typing + +# Custom import commands if any +from sklearn.kernel_ridge import KernelRidge + + +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult, DockerContainer + +from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase +from d3m.primitive_interfaces.base import ProbabilisticCompositionalityMixin, ContinueFitMixin +from d3m import exceptions +import pandas + + + +Inputs = d3m_dataframe +Outputs = d3m_dataframe + + +class Params(params.Params): + dual_coef_: Optional[ndarray] + X_fit_: Optional[Union[ndarray, sparse.spmatrix]] + input_column_names: Optional[Any] + target_names_: Optional[Sequence[Any]] + training_indices_: Optional[Sequence[int]] + target_column_indices_: Optional[Sequence[int]] + target_columns_metadata_: Optional[List[OrderedDict]] + + + +class Hyperparams(hyperparams.Hyperparams): + alpha = hyperparams.Bounded[float]( + default=1, + lower=0, + upper=None, + description='Small positive values of alpha improve the conditioning of the problem and reduce the variance of the estimates. Alpha corresponds to ``(2*C)^-1`` in other linear models such as LogisticRegression or LinearSVC. If an array is passed, penalties are assumed to be specific to the targets. Hence they must correspond in number.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + kernel = hyperparams.Choice( + choices={ + 'linear': hyperparams.Hyperparams.define( + configuration=OrderedDict({}) + ), + 'poly': hyperparams.Hyperparams.define( + configuration=OrderedDict({ + 'degree': hyperparams.Bounded[float]( + default=3, + lower=0, + upper=None, + description='Degree of the polynomial kernel. Ignored by other kernels.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ), + 'gamma': hyperparams.Bounded[float]( + default=0, + lower=0, + upper=None, + description='Gamma parameter for the RBF, laplacian, polynomial, exponential chi2 and sigmoid kernels. Interpretation of the default value is left to the kernel; see the documentation for sklearn.metrics.pairwise. Ignored by other kernels.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ), + 'coef0': hyperparams.Bounded[float]( + default=1, + lower=0, + upper=None, + description='Zero coefficient for polynomial and sigmoid kernels. Ignored by other kernels classes', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + }) + ), + 'rbf': hyperparams.Hyperparams.define( + configuration=OrderedDict({ + 'gamma': hyperparams.Bounded[float]( + default=0, + lower=0, + upper=None, + description='Gamma parameter for the RBF, laplacian, polynomial, exponential chi2 and sigmoid kernels. Interpretation of the default value is left to the kernel; see the documentation for sklearn.metrics.pairwise. 
Ignored by other kernels.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + }) + ), + 'sigmoid': hyperparams.Hyperparams.define( + configuration=OrderedDict({ + 'gamma': hyperparams.Bounded[float]( + default=0, + lower=0, + upper=None, + description='Gamma parameter for the RBF, laplacian, polynomial, exponential chi2 and sigmoid kernels. Interpretation of the default value is left to the kernel; see the documentation for sklearn.metrics.pairwise. Ignored by other kernels.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ), + 'coef0': hyperparams.Bounded[float]( + default=1, + lower=0, + upper=None, + description='Zero coefficient for polynomial and sigmoid kernels. Ignored by other kernels classes', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + }) + ), + 'additive_chi2': hyperparams.Hyperparams.define( + configuration=OrderedDict({}) + ), + 'chi2': hyperparams.Hyperparams.define( + configuration=OrderedDict({ + 'gamma': hyperparams.Bounded[float]( + default=0, + lower=0, + upper=None, + description='Gamma parameter for the RBF, laplacian, polynomial, exponential chi2 and sigmoid kernels. Interpretation of the default value is left to the kernel; see the documentation for sklearn.metrics.pairwise. Ignored by other kernels.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + }) + ), + 'laplacian': hyperparams.Hyperparams.define( + configuration=OrderedDict({ + 'gamma': hyperparams.Bounded[float]( + default=0, + lower=0, + upper=None, + description='Gamma parameter for the RBF, laplacian, polynomial, exponential chi2 and sigmoid kernels. Interpretation of the default value is left to the kernel; see the documentation for sklearn.metrics.pairwise. Ignored by other kernels.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + }) + ), + 'cosine': hyperparams.Hyperparams.define( + configuration=OrderedDict({}) + ), + 'precomputed': hyperparams.Hyperparams.define( + configuration=OrderedDict({}) + ) + }, + default='linear', + description='Kernel mapping used internally. A callable should accept two arguments and the keyword arguments passed to this object as kernel_params, and should return a floating point number.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + use_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to use as training input. If any specified column cannot be parsed, it is skipped.", + ) + use_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to use as training target. If any specified column cannot be parsed, it is skipped.", + ) + exclude_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not use as training inputs. 
Applicable only if \"use_columns\" is not provided.", + ) + exclude_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not use as training target. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='new', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute', 'https://metadata.datadrivendiscovery.org/types/PredictedTarget'], + default='https://metadata.datadrivendiscovery.org/types/PredictedTarget', + description='Decides what semantic type to attach to generated output', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + +class SKKernelRidge(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]): + """ + Primitive wrapping for sklearn KernelRidge + `sklearn documentation `_ + + """ + + __author__ = "JPL MARVIN" + metadata = metadata_base.PrimitiveMetadata({ + "algorithm_types": [metadata_base.PrimitiveAlgorithmType.SUPPORT_VECTOR_MACHINE, ], + "name": "sklearn.kernel_ridge.KernelRidge", + "primitive_family": metadata_base.PrimitiveFamily.REGRESSION, + "python_path": "d3m.primitives.regression.kernel_ridge.SKlearn", + "source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.kernel_ridge.KernelRidge.html']}, + "version": "2019.11.13", + "id": "0fca4b96-d46b-3598-a4a5-bfa428d039fc", + "hyperparams_to_tune": ['alpha', 'kernel'], + 'installation': [ + {'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + }] + }) + + def __init__(self, *, + hyperparams: Hyperparams, + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None) -> None: + + 
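The defaults declared above translate to an ordinary `sklearn.kernel_ridge.KernelRidge`. A sketch of that mapping, with `kernel_config` as a hypothetical stand-in for the resolved `kernel` Choice (default `'linear'`); it mirrors the keyword flattening done in `__init__` just below:
```
# Sketch only; not part of the wrapper itself.
from sklearn.kernel_ridge import KernelRidge

kernel_config = {'choice': 'linear'}        # the 'linear' choice carries no extra parameters

estimator = KernelRidge(
    alpha=1.0,
    kernel=kernel_config['choice'],
    degree=kernel_config.get('degree', 3),  # used only by 'poly'
    gamma=kernel_config.get('gamma', 0),    # wrapper falls back to 0 when the choice has no gamma
    coef0=kernel_config.get('coef0', 1),
)
```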
super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) + + # False + self._clf = KernelRidge( + alpha=self.hyperparams['alpha'], + kernel=self.hyperparams['kernel']['choice'], + degree=self.hyperparams['kernel'].get('degree', 3), + gamma=self.hyperparams['kernel'].get('gamma', 0), + coef0=self.hyperparams['kernel'].get('coef0', 1), + ) + + self._inputs = None + self._outputs = None + self._training_inputs = None + self._training_outputs = None + self._target_names = None + self._training_indices = None + self._target_column_indices = None + self._target_columns_metadata: List[OrderedDict] = None + self._input_column_names = None + self._fitted = False + self._new_training_data = False + + def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None: + self._inputs = inputs + self._outputs = outputs + self._fitted = False + self._new_training_data = True + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + if self._inputs is None or self._outputs is None: + raise ValueError("Missing training data.") + + if not self._new_training_data: + return CallResult(None) + self._new_training_data = False + + self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams) + self._training_outputs, self._target_names, self._target_column_indices = self._get_targets(self._outputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + if len(self._training_indices) > 0 and len(self._target_column_indices) > 0: + self._target_columns_metadata = self._get_target_columns_metadata(self._training_outputs.metadata, self.hyperparams) + sk_training_output = self._training_outputs.values + + shape = sk_training_output.shape + if len(shape) == 2 and shape[1] == 1: + sk_training_output = numpy.ravel(sk_training_output) + + self._clf.fit(self._training_inputs, sk_training_output) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + return CallResult(None) + + + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + sk_inputs, columns_to_use = self._get_columns_to_fit(inputs, self.hyperparams) + output = [] + if len(sk_inputs.columns): + try: + sk_output = self._clf.predict(sk_inputs) + except sklearn.exceptions.NotFittedError as error: + raise PrimitiveNotFittedError("Primitive not fitted.") from error + # For primitives that allow predicting without fitting like GaussianProcessRegressor + if not self._fitted: + raise PrimitiveNotFittedError("Primitive not fitted.") + if sparse.issparse(sk_output): + sk_output = sk_output.toarray() + output = self._wrap_predictions(inputs, sk_output) + output.columns = self._target_names + output = [output] + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._target_column_indices, + columns_list=output) + + return CallResult(outputs) + + + def get_params(self) -> Params: + if not self._fitted: + return Params( + dual_coef_=None, + X_fit_=None, + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + 
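Because `fit` and `produce` above follow the standard `SupervisedLearnerPrimitiveBase` contract, usage looks the same as for the other regression wrappers in this diff. A hedged sketch (data and column names are made up; assumes the sklearn-wrap package is installed):
```
import pandas
from d3m.container import DataFrame as d3m_dataframe
from sklearn_wrap.SKKernelRidge import SKKernelRidge, Hyperparams

X = d3m_dataframe(pandas.DataFrame({'x0': [0.0, 1.0, 2.0, 3.0]}), generate_metadata=True)
y = d3m_dataframe(pandas.DataFrame({'target': [0.1, 1.9, 4.2, 5.8]}), generate_metadata=True)

primitive = SKKernelRidge(hyperparams=Hyperparams.defaults())
primitive.set_training_data(inputs=X, outputs=y)
primitive.fit()
predictions = primitive.produce(inputs=X).value  # one PredictedTarget column named 'target'
```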
target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + return Params( + dual_coef_=getattr(self._clf, 'dual_coef_', None), + X_fit_=getattr(self._clf, 'X_fit_', None), + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + def set_params(self, *, params: Params) -> None: + self._clf.dual_coef_ = params['dual_coef_'] + self._clf.X_fit_ = params['X_fit_'] + self._input_column_names = params['input_column_names'] + self._training_indices = params['training_indices_'] + self._target_names = params['target_names_'] + self._target_column_indices = params['target_column_indices_'] + self._target_columns_metadata = params['target_columns_metadata_'] + + if params['dual_coef_'] is not None: + self._fitted = True + if params['X_fit_'] is not None: + self._fitted = True + + + + + + + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=hyperparams['use_inputs_columns'], + exclude_columns=hyperparams['exclude_inputs_columns'], + can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, numpy.integer, numpy.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + @classmethod + def _get_targets(cls, data: d3m_dataframe, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return data, list(data.columns), list(range(len(data.columns))) + + metadata = data.metadata + + def can_produce_column(column_index: int) -> bool: + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/TrueTarget") + column_metadata = metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + semantic_types = set(column_metadata.get('semantic_types', [])) + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + return False + + target_column_indices, target_columns_not_to_produce = 
base_utils.get_columns_to_use(metadata, + use_columns=hyperparams[ + 'use_outputs_columns'], + exclude_columns= + hyperparams[ + 'exclude_outputs_columns'], + can_use_column=can_produce_column) + targets = [] + if target_column_indices: + targets = data.select_columns(target_column_indices) + target_column_names = [] + for idx in target_column_indices: + target_column_names.append(data.columns[idx]) + return targets, target_column_names, target_column_indices + + @classmethod + def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]: + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) + + # Update semantic types and prepare it for predicted targets. + semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set(["https://metadata.datadrivendiscovery.org/types/TrueTarget","https://metadata.datadrivendiscovery.org/types/SuggestedTarget",]) + add_semantic_types = set(["https://metadata.datadrivendiscovery.org/types/PredictedTarget",]) + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + outputs = d3m_dataframe(predictions, generate_metadata=False) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, self._target_columns_metadata) + return outputs + + + @classmethod + def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata): + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict() + semantic_types = [] + semantic_types.append('https://metadata.datadrivendiscovery.org/types/PredictedTarget') + column_name = outputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name") + if column_name is None: + column_name = "output_{}".format(column_index) + column_metadata["semantic_types"] = semantic_types + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + +SKKernelRidge.__doc__ = KernelRidge.__doc__ \ No newline at end of file diff --git a/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKLars.py b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKLars.py new file mode 100644 index 0000000..1136d16 --- /dev/null +++ b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKLars.py @@ -0,0 +1,460 @@ +from typing import 
Any, Callable, List, Dict, Union, Optional, Sequence, Tuple +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os +import sklearn +import numpy +import typing + +# Custom import commands if any +from sklearn.linear_model.least_angle import Lars + + +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult, DockerContainer + +from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase +from d3m.primitive_interfaces.base import ProbabilisticCompositionalityMixin, ContinueFitMixin +from d3m import exceptions +import pandas + + + +Inputs = d3m_dataframe +Outputs = d3m_dataframe + + +class Params(params.Params): + alphas_: Optional[ndarray] + active_: Optional[Sequence[Any]] + coef_path_: Optional[ndarray] + coef_: Optional[ndarray] + intercept_: Optional[Union[float, ndarray]] + n_iter_: Optional[Union[int, ndarray, None]] + input_column_names: Optional[Any] + target_names_: Optional[Sequence[Any]] + training_indices_: Optional[Sequence[int]] + target_column_indices_: Optional[Sequence[int]] + target_columns_metadata_: Optional[List[OrderedDict]] + + + +class Hyperparams(hyperparams.Hyperparams): + fit_intercept = hyperparams.UniformBool( + default=True, + description='Whether to calculate the intercept for this model. If set to false, no intercept will be used in calculations (e.g. data is expected to be already centered).', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + normalize = hyperparams.UniformBool( + default=True, + description='This parameter is ignored when ``fit_intercept`` is set to False. If True, the regressors X will be normalized before regression by subtracting the mean and dividing by the l2-norm. If you wish to standardize, please use :class:`sklearn.preprocessing.StandardScaler` before calling ``fit`` on an estimator with ``normalize=False``.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + precompute = hyperparams.Union( + configuration=OrderedDict({ + 'bool': hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'auto': hyperparams.Constant( + default='auto', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='auto', + description='Whether to use a precomputed Gram matrix to speed up calculations. If set to ``\'auto\'`` let us decide. The Gram matrix can also be passed as argument.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ResourcesUseParameter'] + ) + n_nonzero_coefs = hyperparams.Bounded[int]( + default=500, + lower=0, + upper=None, + description='Target number of non-zero coefficients. Use ``np.inf`` for no limit.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + eps = hyperparams.Bounded[float]( + default=numpy.finfo(numpy.float).eps, + lower=0, + upper=None, + description='The machine-precision regularization in the computation of the Cholesky diagonal factors. Increase this for very ill-conditioned systems. 
Unlike the ``tol`` parameter in some iterative optimization-based algorithms, this parameter does not control the tolerance of the optimization. copy_X : boolean, optional, default True If ``True``, X will be copied; else, it may be overwritten.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + fit_path = hyperparams.UniformBool( + default=True, + description='If True the full path is stored in the ``coef_path_`` attribute. If you compute the solution for a large problem or many targets, setting ``fit_path`` to ``False`` will lead to a speedup, especially with a small alpha.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + use_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to use as training input. If any specified column cannot be parsed, it is skipped.", + ) + use_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to use as training target. If any specified column cannot be parsed, it is skipped.", + ) + exclude_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not use as training inputs. Applicable only if \"use_columns\" is not provided.", + ) + exclude_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not use as training target. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='new', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. 
To prevent pipelines from breaking set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute', 'https://metadata.datadrivendiscovery.org/types/PredictedTarget'], + default='https://metadata.datadrivendiscovery.org/types/PredictedTarget', + description='Decides what semantic type to attach to generated output', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + +class SKLars(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]): + """ + Primitive wrapping for sklearn Lars + `sklearn documentation `_ + + """ + + __author__ = "JPL MARVIN" + metadata = metadata_base.PrimitiveMetadata({ + "algorithm_types": [metadata_base.PrimitiveAlgorithmType.LINEAR_REGRESSION, ], + "name": "sklearn.linear_model.least_angle.Lars", + "primitive_family": metadata_base.PrimitiveFamily.REGRESSION, + "python_path": "d3m.primitives.regression.lars.SKlearn", + "source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Lars.html']}, + "version": "2019.11.13", + "id": "989a40cd-114c-309d-9a94-59d2669d6c94", + "hyperparams_to_tune": ['eps'], + 'installation': [ + {'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + }] + }) + + def __init__(self, *, + hyperparams: Hyperparams, + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None, + _verbose: int = 0) -> None: + + super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) + + # False + self._clf = Lars( + fit_intercept=self.hyperparams['fit_intercept'], + normalize=self.hyperparams['normalize'], + precompute=self.hyperparams['precompute'], + n_nonzero_coefs=self.hyperparams['n_nonzero_coefs'], + eps=self.hyperparams['eps'], + fit_path=self.hyperparams['fit_path'], + verbose=_verbose + ) + + self._inputs = None + self._outputs = None + self._training_inputs = None + self._training_outputs = None + self._target_names = None + self._training_indices = None + self._target_column_indices = None + self._target_columns_metadata: List[OrderedDict] = None + self._input_column_names = None + self._fitted = False + self._new_training_data = False + + def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None: + self._inputs = inputs + self._outputs = outputs + self._fitted = False + self._new_training_data = True + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + if self._inputs is None or self._outputs is None: + raise ValueError("Missing training data.") + + if not self._new_training_data: + return CallResult(None) + self._new_training_data = False + + self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams) + self._training_outputs, self._target_names, self._target_column_indices = self._get_targets(self._outputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + if len(self._training_indices) > 0 and len(self._target_column_indices) > 0: + self._target_columns_metadata = 
self._get_target_columns_metadata(self._training_outputs.metadata, self.hyperparams) + sk_training_output = self._training_outputs.values + + shape = sk_training_output.shape + if len(shape) == 2 and shape[1] == 1: + sk_training_output = numpy.ravel(sk_training_output) + + self._clf.fit(self._training_inputs, sk_training_output) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + return CallResult(None) + + + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + sk_inputs, columns_to_use = self._get_columns_to_fit(inputs, self.hyperparams) + output = [] + if len(sk_inputs.columns): + try: + sk_output = self._clf.predict(sk_inputs) + except sklearn.exceptions.NotFittedError as error: + raise PrimitiveNotFittedError("Primitive not fitted.") from error + # For primitives that allow predicting without fitting like GaussianProcessRegressor + if not self._fitted: + raise PrimitiveNotFittedError("Primitive not fitted.") + if sparse.issparse(sk_output): + sk_output = sk_output.toarray() + output = self._wrap_predictions(inputs, sk_output) + output.columns = self._target_names + output = [output] + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._target_column_indices, + columns_list=output) + + return CallResult(outputs) + + + def get_params(self) -> Params: + if not self._fitted: + return Params( + alphas_=None, + active_=None, + coef_path_=None, + coef_=None, + intercept_=None, + n_iter_=None, + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + return Params( + alphas_=getattr(self._clf, 'alphas_', None), + active_=getattr(self._clf, 'active_', None), + coef_path_=getattr(self._clf, 'coef_path_', None), + coef_=getattr(self._clf, 'coef_', None), + intercept_=getattr(self._clf, 'intercept_', None), + n_iter_=getattr(self._clf, 'n_iter_', None), + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + def set_params(self, *, params: Params) -> None: + self._clf.alphas_ = params['alphas_'] + self._clf.active_ = params['active_'] + self._clf.coef_path_ = params['coef_path_'] + self._clf.coef_ = params['coef_'] + self._clf.intercept_ = params['intercept_'] + self._clf.n_iter_ = params['n_iter_'] + self._input_column_names = params['input_column_names'] + self._training_indices = params['training_indices_'] + self._target_names = params['target_names_'] + self._target_column_indices = params['target_column_indices_'] + self._target_columns_metadata = params['target_columns_metadata_'] + + if params['alphas_'] is not None: + self._fitted = True + if params['active_'] is not None: + self._fitted = True + if params['coef_path_'] is not None: + self._fitted = True + if params['coef_'] is not None: + self._fitted = True + if params['intercept_'] is not 
None: + self._fitted = True + if params['n_iter_'] is not None: + self._fitted = True + + + + + + + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=hyperparams['use_inputs_columns'], + exclude_columns=hyperparams['exclude_inputs_columns'], + can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, numpy.integer, numpy.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + @classmethod + def _get_targets(cls, data: d3m_dataframe, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return data, list(data.columns), list(range(len(data.columns))) + + metadata = data.metadata + + def can_produce_column(column_index: int) -> bool: + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/TrueTarget") + column_metadata = metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + semantic_types = set(column_metadata.get('semantic_types', [])) + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + return False + + target_column_indices, target_columns_not_to_produce = base_utils.get_columns_to_use(metadata, + use_columns=hyperparams[ + 'use_outputs_columns'], + exclude_columns= + hyperparams[ + 'exclude_outputs_columns'], + can_use_column=can_produce_column) + targets = [] + if target_column_indices: + targets = data.select_columns(target_column_indices) + target_column_names = [] + for idx in target_column_indices: + target_column_names.append(data.columns[idx]) + return targets, target_column_names, target_column_indices + + @classmethod + def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]: + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) + + # Update semantic types and prepare it for predicted targets. 
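The comment above marks the semantic-type rewrite that the following lines perform: drop `TrueTarget`/`SuggestedTarget`, then add `PredictedTarget` plus the configured `return_semantic_type`. In plain Python set arithmetic:
```
# Illustration of the rewrite; the URI prefix is shortened for readability.
PREFIX = 'https://metadata.datadrivendiscovery.org/types/'

semantic_types = {PREFIX + 'TrueTarget', PREFIX + 'SuggestedTarget', PREFIX + 'Target'}
to_remove = {PREFIX + 'TrueTarget', PREFIX + 'SuggestedTarget'}
to_add = {PREFIX + 'PredictedTarget'}      # plus hyperparams['return_semantic_type']

semantic_types = (semantic_types - to_remove) | to_add
print(sorted(semantic_types))
# ['.../PredictedTarget', '.../Target']
```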
+ semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set(["https://metadata.datadrivendiscovery.org/types/TrueTarget","https://metadata.datadrivendiscovery.org/types/SuggestedTarget",]) + add_semantic_types = set(["https://metadata.datadrivendiscovery.org/types/PredictedTarget",]) + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + outputs = d3m_dataframe(predictions, generate_metadata=False) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, self._target_columns_metadata) + return outputs + + + @classmethod + def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata): + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict() + semantic_types = [] + semantic_types.append('https://metadata.datadrivendiscovery.org/types/PredictedTarget') + column_name = outputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name") + if column_name is None: + column_name = "output_{}".format(column_index) + column_metadata["semantic_types"] = semantic_types + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + +SKLars.__doc__ = Lars.__doc__ \ No newline at end of file diff --git a/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKLasso.py b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKLasso.py new file mode 100644 index 0000000..028f7f7 --- /dev/null +++ b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKLasso.py @@ -0,0 +1,474 @@ +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os +import sklearn +import numpy +import typing + +# Custom import commands if any +from sklearn.linear_model.coordinate_descent import Lasso + + +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult, DockerContainer + +from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase +from d3m.primitive_interfaces.base import ProbabilisticCompositionalityMixin, ContinueFitMixin +from d3m import exceptions +import pandas + + + +Inputs = d3m_dataframe +Outputs = 
d3m_dataframe + + +class Params(params.Params): + coef_: Optional[ndarray] + intercept_: Optional[Union[float, ndarray]] + n_iter_: Optional[int] + dual_gap_: Optional[float] + l1_ratio: Optional[float] + input_column_names: Optional[Any] + target_names_: Optional[Sequence[Any]] + training_indices_: Optional[Sequence[int]] + target_column_indices_: Optional[Sequence[int]] + target_columns_metadata_: Optional[List[OrderedDict]] + + + +class Hyperparams(hyperparams.Hyperparams): + alpha = hyperparams.Bounded[float]( + default=1, + lower=0, + upper=None, + description='Constant that multiplies the L1 term. Defaults to 1.0. ``alpha = 0`` is equivalent to an ordinary least square, solved by the :class:`LinearRegression` object. For numerical reasons, using ``alpha = 0`` with the ``Lasso`` object is not advised. Given this, you should use the :class:`LinearRegression` object.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + fit_intercept = hyperparams.UniformBool( + default=True, + description='whether to calculate the intercept for this model. If set to false, no intercept will be used in calculations (e.g. data is expected to be already centered).', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + normalize = hyperparams.UniformBool( + default=False, + description='This parameter is ignored when ``fit_intercept`` is set to False. If True, the regressors X will be normalized before regression by subtracting the mean and dividing by the l2-norm. If you wish to standardize, please use :class:`sklearn.preprocessing.StandardScaler` before calling ``fit`` on an estimator with ``normalize=False``.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + precompute = hyperparams.Union( + configuration=OrderedDict({ + 'bool': hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'auto': hyperparams.Constant( + default='auto', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='bool', + description='Whether to use a precomputed Gram matrix to speed up calculations. If set to ``\'auto\'`` let us decide. The Gram matrix can also be passed as argument. For sparse input this option is always ``True`` to preserve sparsity. 
copy_X : boolean, optional, default True If ``True``, X will be copied; else, it may be overwritten.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ResourcesUseParameter'] + ) + max_iter = hyperparams.Bounded[int]( + default=1000, + lower=0, + upper=None, + description='The maximum number of iterations', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + tol = hyperparams.Bounded[float]( + default=0.0001, + lower=0, + upper=None, + description='The tolerance for the optimization: if the updates are smaller than ``tol``, the optimization code checks the dual gap for optimality and continues until it is smaller than ``tol``.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + warm_start = hyperparams.UniformBool( + default=False, + description='When set to True, reuse the solution of the previous call to fit as initialization, otherwise, just erase the previous solution.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + positive = hyperparams.UniformBool( + default=False, + description='When set to ``True``, forces the coefficients to be positive.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + selection = hyperparams.Enumeration[str]( + default='cyclic', + values=['cyclic', 'random'], + description='If set to \'random\', a random coefficient is updated every iteration rather than looping over features sequentially by default. This (setting to \'random\') often leads to significantly faster convergence especially when tol is higher than 1e-4.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + use_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to use as training input. If any specified column cannot be parsed, it is skipped.", + ) + use_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to use as training target. If any specified column cannot be parsed, it is skipped.", + ) + exclude_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not use as training inputs. Applicable only if \"use_columns\" is not provided.", + ) + exclude_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not use as training target. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='new', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? 
This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute', 'https://metadata.datadrivendiscovery.org/types/PredictedTarget'], + default='https://metadata.datadrivendiscovery.org/types/PredictedTarget', + description='Decides what semantic type to attach to generated output', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + +class SKLasso(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]): + """ + Primitive wrapping for sklearn Lasso + `sklearn documentation `_ + + """ + + __author__ = "JPL MARVIN" + metadata = metadata_base.PrimitiveMetadata({ + "algorithm_types": [metadata_base.PrimitiveAlgorithmType.LASSO, ], + "name": "sklearn.linear_model.coordinate_descent.Lasso", + "primitive_family": metadata_base.PrimitiveFamily.REGRESSION, + "python_path": "d3m.primitives.regression.lasso.SKlearn", + "source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Lasso.html']}, + "version": "2019.11.13", + "id": "a7100c7d-8d8e-3f2a-a0ee-b4380383ed6c", + "hyperparams_to_tune": ['alpha', 'max_iter'], + 'installation': [ + {'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + }] + }) + + def __init__(self, *, + hyperparams: Hyperparams, + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None) -> None: + + super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) + + # False + self._clf = Lasso( + alpha=self.hyperparams['alpha'], + fit_intercept=self.hyperparams['fit_intercept'], + normalize=self.hyperparams['normalize'], + precompute=self.hyperparams['precompute'], + max_iter=self.hyperparams['max_iter'], + tol=self.hyperparams['tol'], + warm_start=self.hyperparams['warm_start'], + positive=self.hyperparams['positive'], + selection=self.hyperparams['selection'], + random_state=self.random_seed, + ) + + self._inputs = None + self._outputs = None + self._training_inputs = None + self._training_outputs = None + self._target_names = None + self._training_indices = None + 
self._target_column_indices = None + self._target_columns_metadata: List[OrderedDict] = None + self._input_column_names = None + self._fitted = False + self._new_training_data = False + + def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None: + self._inputs = inputs + self._outputs = outputs + self._fitted = False + self._new_training_data = True + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + if self._inputs is None or self._outputs is None: + raise ValueError("Missing training data.") + + if not self._new_training_data: + return CallResult(None) + self._new_training_data = False + + self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams) + self._training_outputs, self._target_names, self._target_column_indices = self._get_targets(self._outputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + if len(self._training_indices) > 0 and len(self._target_column_indices) > 0: + self._target_columns_metadata = self._get_target_columns_metadata(self._training_outputs.metadata, self.hyperparams) + sk_training_output = self._training_outputs.values + + shape = sk_training_output.shape + if len(shape) == 2 and shape[1] == 1: + sk_training_output = numpy.ravel(sk_training_output) + + self._clf.fit(self._training_inputs, sk_training_output) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + return CallResult(None) + + + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + sk_inputs, columns_to_use = self._get_columns_to_fit(inputs, self.hyperparams) + output = [] + if len(sk_inputs.columns): + try: + sk_output = self._clf.predict(sk_inputs) + except sklearn.exceptions.NotFittedError as error: + raise PrimitiveNotFittedError("Primitive not fitted.") from error + # For primitives that allow predicting without fitting like GaussianProcessRegressor + if not self._fitted: + raise PrimitiveNotFittedError("Primitive not fitted.") + if sparse.issparse(sk_output): + sk_output = sk_output.toarray() + output = self._wrap_predictions(inputs, sk_output) + output.columns = self._target_names + output = [output] + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._target_column_indices, + columns_list=output) + + return CallResult(outputs) + + + def get_params(self) -> Params: + if not self._fitted: + return Params( + coef_=None, + intercept_=None, + n_iter_=None, + dual_gap_=None, + l1_ratio=None, + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + return Params( + coef_=getattr(self._clf, 'coef_', None), + intercept_=getattr(self._clf, 'intercept_', None), + n_iter_=getattr(self._clf, 'n_iter_', None), + dual_gap_=getattr(self._clf, 'dual_gap_', None), + l1_ratio=getattr(self._clf, 'l1_ratio', None), + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + 
target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + def set_params(self, *, params: Params) -> None: + self._clf.coef_ = params['coef_'] + self._clf.intercept_ = params['intercept_'] + self._clf.n_iter_ = params['n_iter_'] + self._clf.dual_gap_ = params['dual_gap_'] + self._clf.l1_ratio = params['l1_ratio'] + self._input_column_names = params['input_column_names'] + self._training_indices = params['training_indices_'] + self._target_names = params['target_names_'] + self._target_column_indices = params['target_column_indices_'] + self._target_columns_metadata = params['target_columns_metadata_'] + + if params['coef_'] is not None: + self._fitted = True + if params['intercept_'] is not None: + self._fitted = True + if params['n_iter_'] is not None: + self._fitted = True + if params['dual_gap_'] is not None: + self._fitted = True + if params['l1_ratio'] is not None: + self._fitted = True + + + + + + + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=hyperparams['use_inputs_columns'], + exclude_columns=hyperparams['exclude_inputs_columns'], + can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, numpy.integer, numpy.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + @classmethod + def _get_targets(cls, data: d3m_dataframe, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return data, list(data.columns), list(range(len(data.columns))) + + metadata = data.metadata + + def can_produce_column(column_index: int) -> bool: + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/TrueTarget") + column_metadata = metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + semantic_types = set(column_metadata.get('semantic_types', [])) + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + return False + + target_column_indices, target_columns_not_to_produce = base_utils.get_columns_to_use(metadata, + use_columns=hyperparams[ + 
'use_outputs_columns'], + exclude_columns= + hyperparams[ + 'exclude_outputs_columns'], + can_use_column=can_produce_column) + targets = [] + if target_column_indices: + targets = data.select_columns(target_column_indices) + target_column_names = [] + for idx in target_column_indices: + target_column_names.append(data.columns[idx]) + return targets, target_column_names, target_column_indices + + @classmethod + def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]: + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) + + # Update semantic types and prepare it for predicted targets. + semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set(["https://metadata.datadrivendiscovery.org/types/TrueTarget","https://metadata.datadrivendiscovery.org/types/SuggestedTarget",]) + add_semantic_types = set(["https://metadata.datadrivendiscovery.org/types/PredictedTarget",]) + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + outputs = d3m_dataframe(predictions, generate_metadata=False) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, self._target_columns_metadata) + return outputs + + + @classmethod + def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata): + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict() + semantic_types = [] + semantic_types.append('https://metadata.datadrivendiscovery.org/types/PredictedTarget') + column_name = outputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name") + if column_name is None: + column_name = "output_{}".format(column_index) + column_metadata["semantic_types"] = semantic_types + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + +SKLasso.__doc__ = Lasso.__doc__ \ No newline at end of file diff --git a/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKLassoCV.py b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKLassoCV.py new file mode 100644 index 0000000..5c53829 --- /dev/null +++ b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKLassoCV.py @@ -0,0 +1,526 @@ +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple +from numpy 
import ndarray +from collections import OrderedDict +from scipy import sparse +import os +import sklearn +import numpy +import typing + +# Custom import commands if any +from sklearn.linear_model.coordinate_descent import LassoCV + + +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult, DockerContainer + +from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase +from d3m.primitive_interfaces.base import ProbabilisticCompositionalityMixin, ContinueFitMixin +from d3m import exceptions +import pandas + + + +Inputs = d3m_dataframe +Outputs = d3m_dataframe + + +class Params(params.Params): + alpha_: Optional[float] + coef_: Optional[ndarray] + intercept_: Optional[float] + mse_path_: Optional[ndarray] + alphas_: Optional[ndarray] + dual_gap_: Optional[float] + n_iter_: Optional[int] + input_column_names: Optional[Any] + target_names_: Optional[Sequence[Any]] + training_indices_: Optional[Sequence[int]] + target_column_indices_: Optional[Sequence[int]] + target_columns_metadata_: Optional[List[OrderedDict]] + + + +class Hyperparams(hyperparams.Hyperparams): + eps = hyperparams.Bounded[float]( + default=0.001, + lower=0, + upper=None, + description='Length of the path. ``eps=1e-3`` means that ``alpha_min / alpha_max = 1e-3``.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + n_alphas = hyperparams.Bounded[int]( + default=100, + lower=0, + upper=None, + description='Number of alphas along the regularization path', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + fit_intercept = hyperparams.UniformBool( + default=True, + description='whether to calculate the intercept for this model. If set to false, no intercept will be used in calculations (e.g. data is expected to be already centered).', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + normalize = hyperparams.UniformBool( + default=False, + description='This parameter is ignored when ``fit_intercept`` is set to False. If True, the regressors X will be normalized before regression by subtracting the mean and dividing by the l2-norm. If you wish to standardize, please use :class:`sklearn.preprocessing.StandardScaler` before calling ``fit`` on an estimator with ``normalize=False``.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + precompute = hyperparams.Union( + configuration=OrderedDict({ + 'auto': hyperparams.Constant( + default='auto', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'bool': hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='auto', + description='Whether to use a precomputed Gram matrix to speed up calculations. If set to ``\'auto\'`` let us decide. 
The Gram matrix can also be passed as argument.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ResourcesUseParameter'] + ) + max_iter = hyperparams.Bounded[int]( + default=1000, + lower=0, + upper=None, + description='The maximum number of iterations', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + tol = hyperparams.Bounded[float]( + default=0.0001, + lower=0, + upper=None, + description='The tolerance for the optimization: if the updates are smaller than ``tol``, the optimization code checks the dual gap for optimality and continues until it is smaller than ``tol``. copy_X : boolean, optional, default True If ``True``, X will be copied; else, it may be overwritten.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + cv = hyperparams.Union( + configuration=OrderedDict({ + 'int': hyperparams.Bounded[int]( + default=5, + lower=0, + upper=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'none': hyperparams.Constant( + default=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='int', + description='Determines the cross-validation splitting strategy. Possible inputs for cv are: - None, to use the default 3-fold cross-validation, - integer, to specify the number of folds. - An object to be used as a cross-validation generator. - An iterable yielding train/test splits. For integer/None inputs, :class:`KFold` is used. Refer :ref:`User Guide ` for the various cross-validation strategies that can be used here.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + n_jobs = hyperparams.Union( + configuration=OrderedDict({ + 'limit': hyperparams.Bounded[int]( + default=1, + lower=1, + upper=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'all_cores': hyperparams.Constant( + default=-1, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='limit', + description='Number of CPUs to use during the cross validation. If ``-1``, use all the CPUs.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ResourcesUseParameter'] + ) + positive = hyperparams.UniformBool( + default=False, + description='If positive, restrict regression coefficients to be positive', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + selection = hyperparams.Enumeration[str]( + default='cyclic', + values=['cyclic', 'random'], + description='If set to \'random\', a random coefficient is updated every iteration rather than looping over features sequentially by default. This (setting to \'random\') often leads to significantly faster convergence especially when tol is higher than 1e-4.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + use_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to use as training input. 
If any specified column cannot be parsed, it is skipped.", + ) + use_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to use as training target. If any specified column cannot be parsed, it is skipped.", + ) + exclude_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not use as training inputs. Applicable only if \"use_columns\" is not provided.", + ) + exclude_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not use as training target. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='new', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. 
To prevent pipelines from breaking set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute', 'https://metadata.datadrivendiscovery.org/types/PredictedTarget'], + default='https://metadata.datadrivendiscovery.org/types/PredictedTarget', + description='Decides what semantic type to attach to generated output', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + +class SKLassoCV(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]): + """ + Primitive wrapping for sklearn LassoCV + `sklearn documentation `_ + + """ + + __author__ = "JPL MARVIN" + metadata = metadata_base.PrimitiveMetadata({ + "algorithm_types": [metadata_base.PrimitiveAlgorithmType.LASSO, ], + "name": "sklearn.linear_model.coordinate_descent.LassoCV", + "primitive_family": metadata_base.PrimitiveFamily.REGRESSION, + "python_path": "d3m.primitives.regression.lasso_cv.SKlearn", + "source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LassoCV.html']}, + "version": "2019.11.13", + "id": "cfd0482b-d639-3d2b-b876-87f25277a088", + "hyperparams_to_tune": ['eps', 'max_iter'], + 'installation': [ + {'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + }] + }) + + def __init__(self, *, + hyperparams: Hyperparams, + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None, + _verbose: int = 0) -> None: + + super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) + + # False + self._clf = LassoCV( + eps=self.hyperparams['eps'], + n_alphas=self.hyperparams['n_alphas'], + fit_intercept=self.hyperparams['fit_intercept'], + normalize=self.hyperparams['normalize'], + precompute=self.hyperparams['precompute'], + max_iter=self.hyperparams['max_iter'], + tol=self.hyperparams['tol'], + cv=self.hyperparams['cv'], + n_jobs=self.hyperparams['n_jobs'], + positive=self.hyperparams['positive'], + selection=self.hyperparams['selection'], + verbose=_verbose, + random_state=self.random_seed, + ) + + self._inputs = None + self._outputs = None + self._training_inputs = None + self._training_outputs = None + self._target_names = None + self._training_indices = None + self._target_column_indices = None + self._target_columns_metadata: List[OrderedDict] = None + self._input_column_names = None + self._fitted = False + self._new_training_data = False + + def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None: + self._inputs = inputs + self._outputs = outputs + self._fitted = False + self._new_training_data = True + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + if self._inputs is None or self._outputs is None: + raise ValueError("Missing training data.") + + if not self._new_training_data: + return CallResult(None) + self._new_training_data = False + + self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams) + self._training_outputs, self._target_names, self._target_column_indices = self._get_targets(self._outputs, self.hyperparams) + 
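+        # Keep the selected input column names so they round-trip through get_params/set_params.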
self._input_column_names = self._training_inputs.columns + + if len(self._training_indices) > 0 and len(self._target_column_indices) > 0: + self._target_columns_metadata = self._get_target_columns_metadata(self._training_outputs.metadata, self.hyperparams) + sk_training_output = self._training_outputs.values + + shape = sk_training_output.shape + if len(shape) == 2 and shape[1] == 1: + sk_training_output = numpy.ravel(sk_training_output) + + self._clf.fit(self._training_inputs, sk_training_output) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + return CallResult(None) + + + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + sk_inputs, columns_to_use = self._get_columns_to_fit(inputs, self.hyperparams) + output = [] + if len(sk_inputs.columns): + try: + sk_output = self._clf.predict(sk_inputs) + except sklearn.exceptions.NotFittedError as error: + raise PrimitiveNotFittedError("Primitive not fitted.") from error + # For primitives that allow predicting without fitting like GaussianProcessRegressor + if not self._fitted: + raise PrimitiveNotFittedError("Primitive not fitted.") + if sparse.issparse(sk_output): + sk_output = sk_output.toarray() + output = self._wrap_predictions(inputs, sk_output) + output.columns = self._target_names + output = [output] + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._target_column_indices, + columns_list=output) + + return CallResult(outputs) + + + def get_params(self) -> Params: + if not self._fitted: + return Params( + alpha_=None, + coef_=None, + intercept_=None, + mse_path_=None, + alphas_=None, + dual_gap_=None, + n_iter_=None, + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + return Params( + alpha_=getattr(self._clf, 'alpha_', None), + coef_=getattr(self._clf, 'coef_', None), + intercept_=getattr(self._clf, 'intercept_', None), + mse_path_=getattr(self._clf, 'mse_path_', None), + alphas_=getattr(self._clf, 'alphas_', None), + dual_gap_=getattr(self._clf, 'dual_gap_', None), + n_iter_=getattr(self._clf, 'n_iter_', None), + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + def set_params(self, *, params: Params) -> None: + self._clf.alpha_ = params['alpha_'] + self._clf.coef_ = params['coef_'] + self._clf.intercept_ = params['intercept_'] + self._clf.mse_path_ = params['mse_path_'] + self._clf.alphas_ = params['alphas_'] + self._clf.dual_gap_ = params['dual_gap_'] + self._clf.n_iter_ = params['n_iter_'] + self._input_column_names = params['input_column_names'] + self._training_indices = params['training_indices_'] + self._target_names = params['target_names_'] + self._target_column_indices = params['target_column_indices_'] + self._target_columns_metadata = 
params['target_columns_metadata_'] + + if params['alpha_'] is not None: + self._fitted = True + if params['coef_'] is not None: + self._fitted = True + if params['intercept_'] is not None: + self._fitted = True + if params['mse_path_'] is not None: + self._fitted = True + if params['alphas_'] is not None: + self._fitted = True + if params['dual_gap_'] is not None: + self._fitted = True + if params['n_iter_'] is not None: + self._fitted = True + + + + + + + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=hyperparams['use_inputs_columns'], + exclude_columns=hyperparams['exclude_inputs_columns'], + can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, numpy.integer, numpy.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + @classmethod + def _get_targets(cls, data: d3m_dataframe, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return data, list(data.columns), list(range(len(data.columns))) + + metadata = data.metadata + + def can_produce_column(column_index: int) -> bool: + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/TrueTarget") + column_metadata = metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + semantic_types = set(column_metadata.get('semantic_types', [])) + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + return False + + target_column_indices, target_columns_not_to_produce = base_utils.get_columns_to_use(metadata, + use_columns=hyperparams[ + 'use_outputs_columns'], + exclude_columns= + hyperparams[ + 'exclude_outputs_columns'], + can_use_column=can_produce_column) + targets = [] + if target_column_indices: + targets = data.select_columns(target_column_indices) + target_column_names = [] + for idx in target_column_indices: + target_column_names.append(data.columns[idx]) + return targets, target_column_names, target_column_indices + + @classmethod + def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]: + outputs_length = 
outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) + + # Update semantic types and prepare it for predicted targets. + semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set(["https://metadata.datadrivendiscovery.org/types/TrueTarget","https://metadata.datadrivendiscovery.org/types/SuggestedTarget",]) + add_semantic_types = set(["https://metadata.datadrivendiscovery.org/types/PredictedTarget",]) + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + outputs = d3m_dataframe(predictions, generate_metadata=False) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, self._target_columns_metadata) + return outputs + + + @classmethod + def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata): + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict() + semantic_types = [] + semantic_types.append('https://metadata.datadrivendiscovery.org/types/PredictedTarget') + column_name = outputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name") + if column_name is None: + column_name = "output_{}".format(column_index) + column_metadata["semantic_types"] = semantic_types + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + +SKLassoCV.__doc__ = LassoCV.__doc__ \ No newline at end of file diff --git a/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKLinearDiscriminantAnalysis.py b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKLinearDiscriminantAnalysis.py new file mode 100644 index 0000000..b574279 --- /dev/null +++ b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKLinearDiscriminantAnalysis.py @@ -0,0 +1,535 @@ +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os +import sklearn +import numpy +import typing + +# Custom import commands if any +from sklearn.discriminant_analysis import LinearDiscriminantAnalysis + + +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base import utils as base_utils +from 
d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult, DockerContainer + +from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase +from d3m.primitive_interfaces.base import ProbabilisticCompositionalityMixin, ContinueFitMixin +from d3m import exceptions +import pandas + + + +Inputs = d3m_dataframe +Outputs = d3m_dataframe + + +class Params(params.Params): + coef_: Optional[ndarray] + intercept_: Optional[ndarray] + covariance_: Optional[ndarray] + explained_variance_ratio_: Optional[ndarray] + means_: Optional[ndarray] + priors_: Optional[ndarray] + scalings_: Optional[ndarray] + xbar_: Optional[ndarray] + classes_: Optional[ndarray] + _max_components: Optional[int] + input_column_names: Optional[Any] + target_names_: Optional[Sequence[Any]] + training_indices_: Optional[Sequence[int]] + target_column_indices_: Optional[Sequence[int]] + target_columns_metadata_: Optional[List[OrderedDict]] + + + +class Hyperparams(hyperparams.Hyperparams): + solver = hyperparams.Enumeration[str]( + default='svd', + values=['svd', 'lsqr', 'eigen'], + description='Solver to use, possible values: - \'svd\': Singular value decomposition (default). Does not compute the covariance matrix, therefore this solver is recommended for data with a large number of features. - \'lsqr\': Least squares solution, can be combined with shrinkage. - \'eigen\': Eigenvalue decomposition, can be combined with shrinkage.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + shrinkage = hyperparams.Union( + configuration=OrderedDict({ + 'string': hyperparams.Constant( + default='auto', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'float': hyperparams.Bounded[float]( + default=0, + lower=0, + upper=1, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'none': hyperparams.Constant( + default=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='none', + description='Shrinkage parameter, possible values: - None: no shrinkage (default). - \'auto\': automatic shrinkage using the Ledoit-Wolf lemma. - float between 0 and 1: fixed shrinkage parameter. Note that shrinkage works only with \'lsqr\' and \'eigen\' solvers.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + n_components = hyperparams.Union( + configuration=OrderedDict({ + 'int': hyperparams.Bounded[int]( + default=0, + lower=0, + upper=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'none': hyperparams.Constant( + default=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='none', + description='Number of components (< n_classes - 1) for dimensionality reduction.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + tol = hyperparams.Bounded[float]( + default=0.0001, + lower=0, + upper=None, + description='Threshold used for rank estimation in SVD solver. .. versionadded:: 0.17', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + use_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to use as training input. 
If any specified column cannot be parsed, it is skipped.", + ) + use_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to use as training target. If any specified column cannot be parsed, it is skipped.", + ) + exclude_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not use as training inputs. Applicable only if \"use_columns\" is not provided.", + ) + exclude_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not use as training target. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='new', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. 
To prevent pipelines from breaking set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute', 'https://metadata.datadrivendiscovery.org/types/PredictedTarget'], + default='https://metadata.datadrivendiscovery.org/types/PredictedTarget', + description='Decides what semantic type to attach to generated output', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + +class SKLinearDiscriminantAnalysis(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams], + ProbabilisticCompositionalityMixin[Inputs, Outputs, Params, Hyperparams]): + """ + Primitive wrapping for sklearn LinearDiscriminantAnalysis + `sklearn documentation `_ + + """ + + __author__ = "JPL MARVIN" + metadata = metadata_base.PrimitiveMetadata({ + "algorithm_types": [metadata_base.PrimitiveAlgorithmType.LINEAR_DISCRIMINANT_ANALYSIS, ], + "name": "sklearn.discriminant_analysis.LinearDiscriminantAnalysis", + "primitive_family": metadata_base.PrimitiveFamily.CLASSIFICATION, + "python_path": "d3m.primitives.classification.linear_discriminant_analysis.SKlearn", + "source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.discriminant_analysis.LinearDiscriminantAnalysis.html']}, + "version": "2019.11.13", + "id": "a323b46a-6c15-373e-91b4-20efbd65402f", + 'installation': [ + {'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + }] + }) + + def __init__(self, *, + hyperparams: Hyperparams, + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None) -> None: + + super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) + + # False + self._clf = LinearDiscriminantAnalysis( + solver=self.hyperparams['solver'], + shrinkage=self.hyperparams['shrinkage'], + n_components=self.hyperparams['n_components'], + tol=self.hyperparams['tol'], + ) + + self._inputs = None + self._outputs = None + self._training_inputs = None + self._training_outputs = None + self._target_names = None + self._training_indices = None + self._target_column_indices = None + self._target_columns_metadata: List[OrderedDict] = None + self._input_column_names = None + self._fitted = False + self._new_training_data = False + + def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None: + self._inputs = inputs + self._outputs = outputs + self._fitted = False + self._new_training_data = True + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + if self._inputs is None or self._outputs is None: + raise ValueError("Missing training data.") + + if not self._new_training_data: + return CallResult(None) + self._new_training_data = False + + self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams) + self._training_outputs, self._target_names, self._target_column_indices = self._get_targets(self._outputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + if len(self._training_indices) > 0 and len(self._target_column_indices) > 0: + self._target_columns_metadata = 
self._get_target_columns_metadata(self._training_outputs.metadata, self.hyperparams) + sk_training_output = self._training_outputs.values + + shape = sk_training_output.shape + if len(shape) == 2 and shape[1] == 1: + sk_training_output = numpy.ravel(sk_training_output) + + self._clf.fit(self._training_inputs, sk_training_output) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + return CallResult(None) + + + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + sk_inputs, columns_to_use = self._get_columns_to_fit(inputs, self.hyperparams) + output = [] + if len(sk_inputs.columns): + try: + sk_output = self._clf.predict(sk_inputs) + except sklearn.exceptions.NotFittedError as error: + raise PrimitiveNotFittedError("Primitive not fitted.") from error + # For primitives that allow predicting without fitting like GaussianProcessRegressor + if not self._fitted: + raise PrimitiveNotFittedError("Primitive not fitted.") + if sparse.issparse(sk_output): + sk_output = sk_output.toarray() + output = self._wrap_predictions(inputs, sk_output) + output.columns = self._target_names + output = [output] + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._target_column_indices, + columns_list=output) + + return CallResult(outputs) + + + def get_params(self) -> Params: + if not self._fitted: + return Params( + coef_=None, + intercept_=None, + covariance_=None, + explained_variance_ratio_=None, + means_=None, + priors_=None, + scalings_=None, + xbar_=None, + classes_=None, + _max_components=None, + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + return Params( + coef_=getattr(self._clf, 'coef_', None), + intercept_=getattr(self._clf, 'intercept_', None), + covariance_=getattr(self._clf, 'covariance_', None), + explained_variance_ratio_=getattr(self._clf, 'explained_variance_ratio_', None), + means_=getattr(self._clf, 'means_', None), + priors_=getattr(self._clf, 'priors_', None), + scalings_=getattr(self._clf, 'scalings_', None), + xbar_=getattr(self._clf, 'xbar_', None), + classes_=getattr(self._clf, 'classes_', None), + _max_components=getattr(self._clf, '_max_components', None), + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + def set_params(self, *, params: Params) -> None: + self._clf.coef_ = params['coef_'] + self._clf.intercept_ = params['intercept_'] + self._clf.covariance_ = params['covariance_'] + self._clf.explained_variance_ratio_ = params['explained_variance_ratio_'] + self._clf.means_ = params['means_'] + self._clf.priors_ = params['priors_'] + self._clf.scalings_ = params['scalings_'] + self._clf.xbar_ = params['xbar_'] + self._clf.classes_ = params['classes_'] + self._clf._max_components = params['_max_components'] + 
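+        # Restore the wrapper's own column bookkeeping in addition to the fitted estimator attributes.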
self._input_column_names = params['input_column_names'] + self._training_indices = params['training_indices_'] + self._target_names = params['target_names_'] + self._target_column_indices = params['target_column_indices_'] + self._target_columns_metadata = params['target_columns_metadata_'] + + if params['coef_'] is not None: + self._fitted = True + if params['intercept_'] is not None: + self._fitted = True + if params['covariance_'] is not None: + self._fitted = True + if params['explained_variance_ratio_'] is not None: + self._fitted = True + if params['means_'] is not None: + self._fitted = True + if params['priors_'] is not None: + self._fitted = True + if params['scalings_'] is not None: + self._fitted = True + if params['xbar_'] is not None: + self._fitted = True + if params['classes_'] is not None: + self._fitted = True + if params['_max_components'] is not None: + self._fitted = True + + + def log_likelihoods(self, *, + outputs: Outputs, + inputs: Inputs, + timeout: float = None, + iterations: int = None) -> CallResult[Sequence[float]]: + inputs = inputs.iloc[:, self._training_indices] # Get ndarray + outputs = outputs.iloc[:, self._target_column_indices] + + if len(inputs.columns) and len(outputs.columns): + + if outputs.shape[1] != self._clf.n_outputs_: + raise exceptions.InvalidArgumentValueError("\"outputs\" argument does not have the correct number of target columns.") + + log_proba = self._clf.predict_log_proba(inputs) + + # Making it always a list, even when only one target. + if self._clf.n_outputs_ == 1: + log_proba = [log_proba] + classes = [self._clf.classes_] + else: + classes = self._clf.classes_ + + samples_length = inputs.shape[0] + + log_likelihoods = [] + for k in range(self._clf.n_outputs_): + # We have to map each class to its internal (numerical) index used in the learner. + # This allows "outputs" to contain string classes. + outputs_column = outputs.iloc[:, k] + classes_map = pandas.Series(numpy.arange(len(classes[k])), index=classes[k]) + mapped_outputs_column = outputs_column.map(classes_map) + + # For each target column (column in "outputs"), for each sample (row) we pick the log + # likelihood for a given class. 
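+                # Illustrative (hypothetical) example of the fancy indexing below: if
+                # classes[k] is ['a', 'b'] and the target column is ['b', 'a'], then
+                # mapped_outputs_column is [1, 0] and log_proba[k][numpy.arange(2), [1, 0]]
+                # picks log_proba[k][0, 1] and log_proba[k][1, 0], i.e. each sample's
+                # log-likelihood of its own labelled class.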
+ log_likelihoods.append(log_proba[k][numpy.arange(samples_length), mapped_outputs_column]) + + results = d3m_dataframe(dict(enumerate(log_likelihoods)), generate_metadata=True) + results.columns = outputs.columns + + for k in range(self._clf.n_outputs_): + column_metadata = outputs.metadata.query_column(k) + if 'name' in column_metadata: + results.metadata = results.metadata.update_column(k, {'name': column_metadata['name']}) + + else: + results = d3m_dataframe(generate_metadata=True) + + return CallResult(results) + + + + + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=hyperparams['use_inputs_columns'], + exclude_columns=hyperparams['exclude_inputs_columns'], + can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, numpy.integer, numpy.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + @classmethod + def _get_targets(cls, data: d3m_dataframe, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return data, list(data.columns), list(range(len(data.columns))) + + metadata = data.metadata + + def can_produce_column(column_index: int) -> bool: + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/TrueTarget") + column_metadata = metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + semantic_types = set(column_metadata.get('semantic_types', [])) + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + return False + + target_column_indices, target_columns_not_to_produce = base_utils.get_columns_to_use(metadata, + use_columns=hyperparams[ + 'use_outputs_columns'], + exclude_columns= + hyperparams[ + 'exclude_outputs_columns'], + can_use_column=can_produce_column) + targets = [] + if target_column_indices: + targets = data.select_columns(target_column_indices) + target_column_names = [] + for idx in target_column_indices: + target_column_names.append(data.columns[idx]) + return targets, target_column_names, target_column_indices + + @classmethod + def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, 
hyperparams) -> List[OrderedDict]: + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) + + # Update semantic types and prepare it for predicted targets. + semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set(["https://metadata.datadrivendiscovery.org/types/TrueTarget","https://metadata.datadrivendiscovery.org/types/SuggestedTarget",]) + add_semantic_types = set(["https://metadata.datadrivendiscovery.org/types/PredictedTarget",]) + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + outputs = d3m_dataframe(predictions, generate_metadata=False) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, self._target_columns_metadata) + return outputs + + + @classmethod + def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata): + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict() + semantic_types = [] + semantic_types.append('https://metadata.datadrivendiscovery.org/types/PredictedTarget') + column_name = outputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name") + if column_name is None: + column_name = "output_{}".format(column_index) + column_metadata["semantic_types"] = semantic_types + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + +SKLinearDiscriminantAnalysis.__doc__ = LinearDiscriminantAnalysis.__doc__ \ No newline at end of file diff --git a/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKLinearRegression.py b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKLinearRegression.py new file mode 100644 index 0000000..62ce474 --- /dev/null +++ b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKLinearRegression.py @@ -0,0 +1,431 @@ +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os +import sklearn +import numpy +import typing + +# Custom import commands if any +from sklearn.linear_model.base import LinearRegression + + +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils 
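+# Editorial note (assumes the scikit-learn version pinned by sklearn-wrap): the
+# ``sklearn.linear_model.base`` module imported above is the pre-0.24 private
+# path; on newer scikit-learn releases the public equivalent would be
+# ``from sklearn.linear_model import LinearRegression``.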
+from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult, DockerContainer + +from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase +from d3m.primitive_interfaces.base import ProbabilisticCompositionalityMixin, ContinueFitMixin +from d3m import exceptions +import pandas + + + +Inputs = d3m_dataframe +Outputs = d3m_dataframe + + +class Params(params.Params): + coef_: Optional[ndarray] + intercept_: Optional[float] + _residues: Optional[float] + rank_: Optional[int] + singular_: Optional[ndarray] + input_column_names: Optional[Any] + target_names_: Optional[Sequence[Any]] + training_indices_: Optional[Sequence[int]] + target_column_indices_: Optional[Sequence[int]] + target_columns_metadata_: Optional[List[OrderedDict]] + + + +class Hyperparams(hyperparams.Hyperparams): + fit_intercept = hyperparams.UniformBool( + default=True, + description='whether to calculate the intercept for this model. If set to False, no intercept will be used in calculations (e.g. data is expected to be already centered).', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + normalize = hyperparams.UniformBool( + default=True, + description='This parameter is ignored when ``fit_intercept`` is set to False. If True, the regressors X will be normalized before regression by subtracting the mean and dividing by the l2-norm. If you wish to standardize, please use :class:`sklearn.preprocessing.StandardScaler` before calling ``fit`` on an estimator with ``normalize=False``.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + n_jobs = hyperparams.Union( + configuration=OrderedDict({ + 'limit': hyperparams.Bounded[int]( + default=1, + lower=1, + upper=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'all_cores': hyperparams.Constant( + default=-1, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='limit', + description='The number of jobs to use for the computation. This will only provide speedup for n_targets > 1 and sufficient large problems. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ResourcesUseParameter'] + ) + + use_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to use as training input. If any specified column cannot be parsed, it is skipped.", + ) + use_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to use as training target. If any specified column cannot be parsed, it is skipped.", + ) + exclude_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not use as training inputs. 
Applicable only if \"use_columns\" is not provided.", + ) + exclude_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not use as training target. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='new', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute', 'https://metadata.datadrivendiscovery.org/types/PredictedTarget'], + default='https://metadata.datadrivendiscovery.org/types/PredictedTarget', + description='Decides what semantic type to attach to generated output', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + +class SKLinearRegression(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]): + """ + Primitive wrapping for sklearn LinearRegression + `sklearn documentation `_ + + """ + + __author__ = "JPL MARVIN" + metadata = metadata_base.PrimitiveMetadata({ + "algorithm_types": [metadata_base.PrimitiveAlgorithmType.LINEAR_REGRESSION, ], + "name": "sklearn.linear_model.base.LinearRegression", + "primitive_family": metadata_base.PrimitiveFamily.REGRESSION, + "python_path": "d3m.primitives.regression.linear.SKlearn", + "source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html']}, + "version": "2019.11.13", + "id": "816cc0f8-8bf4-4d00-830d-272342349577", + 'installation': [ + {'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + }] + }) + + def __init__(self, *, + hyperparams: Hyperparams, + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None) -> None: + + super().__init__(hyperparams=hyperparams, 
random_seed=random_seed, docker_containers=docker_containers) + + # False + self._clf = LinearRegression( + fit_intercept=self.hyperparams['fit_intercept'], + normalize=self.hyperparams['normalize'], + n_jobs=self.hyperparams['n_jobs'], + ) + + self._inputs = None + self._outputs = None + self._training_inputs = None + self._training_outputs = None + self._target_names = None + self._training_indices = None + self._target_column_indices = None + self._target_columns_metadata: List[OrderedDict] = None + self._input_column_names = None + self._fitted = False + self._new_training_data = False + + def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None: + self._inputs = inputs + self._outputs = outputs + self._fitted = False + self._new_training_data = True + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + if self._inputs is None or self._outputs is None: + raise ValueError("Missing training data.") + + if not self._new_training_data: + return CallResult(None) + self._new_training_data = False + + self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams) + self._training_outputs, self._target_names, self._target_column_indices = self._get_targets(self._outputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + if len(self._training_indices) > 0 and len(self._target_column_indices) > 0: + self._target_columns_metadata = self._get_target_columns_metadata(self._training_outputs.metadata, self.hyperparams) + sk_training_output = self._training_outputs.values + + shape = sk_training_output.shape + if len(shape) == 2 and shape[1] == 1: + sk_training_output = numpy.ravel(sk_training_output) + + self._clf.fit(self._training_inputs, sk_training_output) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + return CallResult(None) + + + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + sk_inputs, columns_to_use = self._get_columns_to_fit(inputs, self.hyperparams) + output = [] + if len(sk_inputs.columns): + try: + sk_output = self._clf.predict(sk_inputs) + except sklearn.exceptions.NotFittedError as error: + raise PrimitiveNotFittedError("Primitive not fitted.") from error + # For primitives that allow predicting without fitting like GaussianProcessRegressor + if not self._fitted: + raise PrimitiveNotFittedError("Primitive not fitted.") + if sparse.issparse(sk_output): + sk_output = sk_output.toarray() + output = self._wrap_predictions(inputs, sk_output) + output.columns = self._target_names + output = [output] + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._target_column_indices, + columns_list=output) + + return CallResult(outputs) + + + def get_params(self) -> Params: + if not self._fitted: + return Params( + coef_=None, + intercept_=None, + _residues=None, + rank_=None, + singular_=None, + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + 
target_columns_metadata_=self._target_columns_metadata + ) + + return Params( + coef_=getattr(self._clf, 'coef_', None), + intercept_=getattr(self._clf, 'intercept_', None), + _residues=getattr(self._clf, '_residues', None), + rank_=getattr(self._clf, 'rank_', None), + singular_=getattr(self._clf, 'singular_', None), + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + def set_params(self, *, params: Params) -> None: + self._clf.coef_ = params['coef_'] + self._clf.intercept_ = params['intercept_'] + self._clf._residues = params['_residues'] + self._clf.rank_ = params['rank_'] + self._clf.singular_ = params['singular_'] + self._input_column_names = params['input_column_names'] + self._training_indices = params['training_indices_'] + self._target_names = params['target_names_'] + self._target_column_indices = params['target_column_indices_'] + self._target_columns_metadata = params['target_columns_metadata_'] + + if params['coef_'] is not None: + self._fitted = True + if params['intercept_'] is not None: + self._fitted = True + if params['_residues'] is not None: + self._fitted = True + if params['rank_'] is not None: + self._fitted = True + if params['singular_'] is not None: + self._fitted = True + + + + + + + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=hyperparams['use_inputs_columns'], + exclude_columns=hyperparams['exclude_inputs_columns'], + can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, numpy.integer, numpy.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + @classmethod + def _get_targets(cls, data: d3m_dataframe, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return data, list(data.columns), list(range(len(data.columns))) + + metadata = data.metadata + + def can_produce_column(column_index: int) -> bool: + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/TrueTarget") + column_metadata = metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + semantic_types = set(column_metadata.get('semantic_types', [])) + if 
len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + return False + + target_column_indices, target_columns_not_to_produce = base_utils.get_columns_to_use(metadata, + use_columns=hyperparams[ + 'use_outputs_columns'], + exclude_columns= + hyperparams[ + 'exclude_outputs_columns'], + can_use_column=can_produce_column) + targets = [] + if target_column_indices: + targets = data.select_columns(target_column_indices) + target_column_names = [] + for idx in target_column_indices: + target_column_names.append(data.columns[idx]) + return targets, target_column_names, target_column_indices + + @classmethod + def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]: + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) + + # Update semantic types and prepare it for predicted targets. + semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set(["https://metadata.datadrivendiscovery.org/types/TrueTarget","https://metadata.datadrivendiscovery.org/types/SuggestedTarget",]) + add_semantic_types = set(["https://metadata.datadrivendiscovery.org/types/PredictedTarget",]) + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + outputs = d3m_dataframe(predictions, generate_metadata=False) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, self._target_columns_metadata) + return outputs + + + @classmethod + def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata): + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict() + semantic_types = [] + semantic_types.append('https://metadata.datadrivendiscovery.org/types/PredictedTarget') + column_name = outputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name") + if column_name is None: + column_name = "output_{}".format(column_index) + column_metadata["semantic_types"] = semantic_types + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + +SKLinearRegression.__doc__ = 
LinearRegression.__doc__ \ No newline at end of file diff --git a/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKLinearSVC.py b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKLinearSVC.py new file mode 100644 index 0000000..55bb114 --- /dev/null +++ b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKLinearSVC.py @@ -0,0 +1,478 @@ +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os +import sklearn +import numpy +import typing + +# Custom import commands if any +from sklearn.svm.classes import LinearSVC + + +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult, DockerContainer + +from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase +from d3m.primitive_interfaces.base import ProbabilisticCompositionalityMixin, ContinueFitMixin +from d3m import exceptions +import pandas + + + +Inputs = d3m_dataframe +Outputs = d3m_dataframe + + +class Params(params.Params): + coef_: Optional[ndarray] + intercept_: Optional[ndarray] + classes_: Optional[ndarray] + n_iter_: Optional[numpy.int32] + input_column_names: Optional[Any] + target_names_: Optional[Sequence[Any]] + training_indices_: Optional[Sequence[int]] + target_column_indices_: Optional[Sequence[int]] + target_columns_metadata_: Optional[List[OrderedDict]] + + + +class Hyperparams(hyperparams.Hyperparams): + penalty = hyperparams.Enumeration[str]( + values=['l1', 'l2'], + default='l2', + description='Specifies the norm used in the penalization. The \'l2\' penalty is the standard used in SVC. The \'l1\' leads to ``coef_`` vectors that are sparse.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + loss = hyperparams.Enumeration[str]( + values=['hinge', 'squared_hinge'], + default='squared_hinge', + description='Specifies the loss function. \'hinge\' is the standard SVM loss (used e.g. by the SVC class) while \'squared_hinge\' is the square of the hinge loss.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + dual = hyperparams.UniformBool( + default=True, + description='Select the algorithm to either solve the dual or primal optimization problem. Prefer dual=False when n_samples > n_features.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + tol = hyperparams.Bounded[float]( + default=0.0001, + lower=0, + upper=None, + description='Tolerance for stopping criteria. multi_class: string, \'ovr\' or \'crammer_singer\' (default=\'ovr\') Determines the multi-class strategy if `y` contains more than two classes. ``"ovr"`` trains n_classes one-vs-rest classifiers, while ``"crammer_singer"`` optimizes a joint objective over all classes. While `crammer_singer` is interesting from a theoretical perspective as it is consistent, it is seldom used in practice as it rarely leads to better accuracy and is more expensive to compute. 
If ``"crammer_singer"`` is chosen, the options loss, penalty and dual will be ignored.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + C = hyperparams.Bounded[float]( + default=1, + lower=0, + upper=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + description='Penalty parameter C of the error term.' + ) + multi_class = hyperparams.Enumeration[str]( + values=['ovr', 'crammer_singer'], + default='ovr', + description='Determines the multi-class strategy if `y` contains more than two classes. ``"ovr"`` trains n_classes one-vs-rest classifiers, while ``"crammer_singer"`` optimizes a joint objective over all classes. While `crammer_singer` is interesting from a theoretical perspective as it is consistent, it is seldom used in practice as it rarely leads to better accuracy and is more expensive to compute. If ``"crammer_singer"`` is chosen, the options loss, penalty and dual will be ignored. ', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + fit_intercept = hyperparams.UniformBool( + default=True, + description='Whether to calculate the intercept for this model. If set to false, no intercept will be used in calculations (i.e. data is expected to be already centered).', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + intercept_scaling = hyperparams.Hyperparameter[float]( + default=1, + description='When self.fit_intercept is True, instance vector x becomes ``[x, self.intercept_scaling]``, i.e. a "synthetic" feature with constant value equals to intercept_scaling is appended to the instance vector. The intercept becomes intercept_scaling * synthetic feature weight Note! the synthetic feature weight is subject to l1/l2 regularization as all other features. To lessen the effect of regularization on synthetic feature weight (and therefore on the intercept) intercept_scaling has to be increased.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + class_weight = hyperparams.Union( + configuration=OrderedDict({ + 'str': hyperparams.Constant( + default='balanced', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'none': hyperparams.Constant( + default=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='none', + description='Set the parameter C of class i to ``class_weight[i]*C`` for SVC. If not given, all classes are supposed to have weight one. The "balanced" mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as ``n_samples / (n_classes * np.bincount(y))``', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + max_iter = hyperparams.Bounded[int]( + default=1000, + lower=0, + upper=None, + description='The maximum number of iterations to be run.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + use_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to use as training input. 
If any specified column cannot be parsed, it is skipped.", + ) + use_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to use as training target. If any specified column cannot be parsed, it is skipped.", + ) + exclude_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not use as training inputs. Applicable only if \"use_columns\" is not provided.", + ) + exclude_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not use as training target. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='new', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. 
To prevent pipelines from breaking set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute', 'https://metadata.datadrivendiscovery.org/types/PredictedTarget'], + default='https://metadata.datadrivendiscovery.org/types/PredictedTarget', + description='Decides what semantic type to attach to generated output', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + +class SKLinearSVC(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]): + """ + Primitive wrapping for sklearn LinearSVC + `sklearn documentation `_ + + """ + + __author__ = "JPL MARVIN" + metadata = metadata_base.PrimitiveMetadata({ + "algorithm_types": [metadata_base.PrimitiveAlgorithmType.SUPPORT_VECTOR_MACHINE, ], + "name": "sklearn.svm.classes.LinearSVC", + "primitive_family": metadata_base.PrimitiveFamily.CLASSIFICATION, + "python_path": "d3m.primitives.classification.linear_svc.SKlearn", + "source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html']}, + "version": "2019.11.13", + "id": "71749b20-80e9-3a8e-998e-25da5bbc1abc", + "hyperparams_to_tune": ['C'], + 'installation': [ + {'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + }] + }) + + def __init__(self, *, + hyperparams: Hyperparams, + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None, + _verbose: int = 0) -> None: + + super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) + + # False + self._clf = LinearSVC( + penalty=self.hyperparams['penalty'], + loss=self.hyperparams['loss'], + dual=self.hyperparams['dual'], + tol=self.hyperparams['tol'], + C=self.hyperparams['C'], + multi_class=self.hyperparams['multi_class'], + fit_intercept=self.hyperparams['fit_intercept'], + intercept_scaling=self.hyperparams['intercept_scaling'], + class_weight=self.hyperparams['class_weight'], + max_iter=self.hyperparams['max_iter'], + verbose=_verbose, + random_state=self.random_seed, + ) + + self._inputs = None + self._outputs = None + self._training_inputs = None + self._training_outputs = None + self._target_names = None + self._training_indices = None + self._target_column_indices = None + self._target_columns_metadata: List[OrderedDict] = None + self._input_column_names = None + self._fitted = False + self._new_training_data = False + + def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None: + self._inputs = inputs + self._outputs = outputs + self._fitted = False + self._new_training_data = True + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + if self._inputs is None or self._outputs is None: + raise ValueError("Missing training data.") + + if not self._new_training_data: + return CallResult(None) + self._new_training_data = False + + self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams) + self._training_outputs, self._target_names, self._target_column_indices = self._get_targets(self._outputs, self.hyperparams) + self._input_column_names = 
self._training_inputs.columns + + if len(self._training_indices) > 0 and len(self._target_column_indices) > 0: + self._target_columns_metadata = self._get_target_columns_metadata(self._training_outputs.metadata, self.hyperparams) + sk_training_output = self._training_outputs.values + + shape = sk_training_output.shape + if len(shape) == 2 and shape[1] == 1: + sk_training_output = numpy.ravel(sk_training_output) + + self._clf.fit(self._training_inputs, sk_training_output) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + return CallResult(None) + + + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + sk_inputs, columns_to_use = self._get_columns_to_fit(inputs, self.hyperparams) + output = [] + if len(sk_inputs.columns): + try: + sk_output = self._clf.predict(sk_inputs) + except sklearn.exceptions.NotFittedError as error: + raise PrimitiveNotFittedError("Primitive not fitted.") from error + # For primitives that allow predicting without fitting like GaussianProcessRegressor + if not self._fitted: + raise PrimitiveNotFittedError("Primitive not fitted.") + if sparse.issparse(sk_output): + sk_output = sk_output.toarray() + output = self._wrap_predictions(inputs, sk_output) + output.columns = self._target_names + output = [output] + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._target_column_indices, + columns_list=output) + + return CallResult(outputs) + + + def get_params(self) -> Params: + if not self._fitted: + return Params( + coef_=None, + intercept_=None, + classes_=None, + n_iter_=None, + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + return Params( + coef_=getattr(self._clf, 'coef_', None), + intercept_=getattr(self._clf, 'intercept_', None), + classes_=getattr(self._clf, 'classes_', None), + n_iter_=getattr(self._clf, 'n_iter_', None), + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + def set_params(self, *, params: Params) -> None: + self._clf.coef_ = params['coef_'] + self._clf.intercept_ = params['intercept_'] + self._clf.classes_ = params['classes_'] + self._clf.n_iter_ = params['n_iter_'] + self._input_column_names = params['input_column_names'] + self._training_indices = params['training_indices_'] + self._target_names = params['target_names_'] + self._target_column_indices = params['target_column_indices_'] + self._target_columns_metadata = params['target_columns_metadata_'] + + if params['coef_'] is not None: + self._fitted = True + if params['intercept_'] is not None: + self._fitted = True + if params['classes_'] is not None: + self._fitted = True + if params['n_iter_'] is not None: + self._fitted = True + + + + + + + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + 
if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=hyperparams['use_inputs_columns'], + exclude_columns=hyperparams['exclude_inputs_columns'], + can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, numpy.integer, numpy.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + @classmethod + def _get_targets(cls, data: d3m_dataframe, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return data, list(data.columns), list(range(len(data.columns))) + + metadata = data.metadata + + def can_produce_column(column_index: int) -> bool: + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/TrueTarget") + column_metadata = metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + semantic_types = set(column_metadata.get('semantic_types', [])) + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + return False + + target_column_indices, target_columns_not_to_produce = base_utils.get_columns_to_use(metadata, + use_columns=hyperparams[ + 'use_outputs_columns'], + exclude_columns= + hyperparams[ + 'exclude_outputs_columns'], + can_use_column=can_produce_column) + targets = [] + if target_column_indices: + targets = data.select_columns(target_column_indices) + target_column_names = [] + for idx in target_column_indices: + target_column_names.append(data.columns[idx]) + return targets, target_column_names, target_column_indices + + @classmethod + def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]: + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) + + # Update semantic types and prepare it for predicted targets. 
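+            # Descriptive note: the block below removes the TrueTarget/SuggestedTarget
+            # semantic types from each target column and adds PredictedTarget plus the
+            # type selected by the "return_semantic_type" hyperparam, so the produced
+            # columns are marked as predictions rather than ground truth.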
+ semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set(["https://metadata.datadrivendiscovery.org/types/TrueTarget","https://metadata.datadrivendiscovery.org/types/SuggestedTarget",]) + add_semantic_types = set(["https://metadata.datadrivendiscovery.org/types/PredictedTarget",]) + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + outputs = d3m_dataframe(predictions, generate_metadata=False) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, self._target_columns_metadata) + return outputs + + + @classmethod + def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata): + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict() + semantic_types = [] + semantic_types.append('https://metadata.datadrivendiscovery.org/types/PredictedTarget') + column_name = outputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name") + if column_name is None: + column_name = "output_{}".format(column_index) + column_metadata["semantic_types"] = semantic_types + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + +SKLinearSVC.__doc__ = LinearSVC.__doc__ \ No newline at end of file diff --git a/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKLinearSVR.py b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKLinearSVR.py new file mode 100644 index 0000000..af809b8 --- /dev/null +++ b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKLinearSVR.py @@ -0,0 +1,452 @@ +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os +import sklearn +import numpy +import typing + +# Custom import commands if any +from sklearn.svm.classes import LinearSVR + + +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult, DockerContainer + +from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase +from d3m.primitive_interfaces.base import ProbabilisticCompositionalityMixin, ContinueFitMixin +from d3m import exceptions +import pandas + + + +Inputs = d3m_dataframe +Outputs = 
d3m_dataframe + + +class Params(params.Params): + coef_: Optional[ndarray] + intercept_: Optional[ndarray] + n_iter_: Optional[numpy.int32] + input_column_names: Optional[Any] + target_names_: Optional[Sequence[Any]] + training_indices_: Optional[Sequence[int]] + target_column_indices_: Optional[Sequence[int]] + target_columns_metadata_: Optional[List[OrderedDict]] + + + +class Hyperparams(hyperparams.Hyperparams): + C = hyperparams.Bounded[float]( + default=1, + lower=0, + upper=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + description='Penalty parameter C of the error term. The penalty is a squared l2 penalty. The bigger this parameter, the less regularization is used.' + ) + loss = hyperparams.Enumeration[str]( + values=['epsilon_insensitive', 'squared_epsilon_insensitive'], + default='epsilon_insensitive', + description='Specifies the loss function. \'l1\' is the epsilon-insensitive loss (standard SVR) while \'l2\' is the squared epsilon-insensitive loss.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + epsilon = hyperparams.Bounded[float]( + default=0.1, + lower=0, + upper=None, + description='Epsilon parameter in the epsilon-insensitive loss function. Note that the value of this parameter depends on the scale of the target variable y. If unsure, set ``epsilon=0``.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + dual = hyperparams.UniformBool( + default=True, + description='Select the algorithm to either solve the dual or primal optimization problem. Prefer dual=False when n_samples > n_features.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + tol = hyperparams.Bounded[float]( + default=0.0001, + lower=0, + upper=None, + description='Tolerance for stopping criteria.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + fit_intercept = hyperparams.UniformBool( + default=True, + description='Whether to calculate the intercept for this model. If set to false, no intercept will be used in calculations (i.e. data is expected to be already centered).', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + intercept_scaling = hyperparams.Bounded[float]( + default=1, + lower=0, + upper=None, + description='When self.fit_intercept is True, instance vector x becomes [x, self.intercept_scaling], i.e. a "synthetic" feature with constant value equals to intercept_scaling is appended to the instance vector. The intercept becomes intercept_scaling * synthetic feature weight Note! the synthetic feature weight is subject to l1/l2 regularization as all other features. To lessen the effect of regularization on synthetic feature weight (and therefore on the intercept) intercept_scaling has to be increased.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + max_iter = hyperparams.Bounded[int]( + default=1000, + lower=0, + upper=None, + description='The maximum number of iterations to be run.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + use_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to use as training input. 
If any specified column cannot be parsed, it is skipped.", + ) + use_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to use as training target. If any specified column cannot be parsed, it is skipped.", + ) + exclude_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not use as training inputs. Applicable only if \"use_columns\" is not provided.", + ) + exclude_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not use as training target. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='new', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. 
To prevent pipelines from breaking set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute', 'https://metadata.datadrivendiscovery.org/types/PredictedTarget'], + default='https://metadata.datadrivendiscovery.org/types/PredictedTarget', + description='Decides what semantic type to attach to generated output', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + +class SKLinearSVR(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]): + """ + Primitive wrapping for sklearn LinearSVR + `sklearn documentation `_ + + """ + + __author__ = "JPL MARVIN" + metadata = metadata_base.PrimitiveMetadata({ + "algorithm_types": [metadata_base.PrimitiveAlgorithmType.SUPPORT_VECTOR_MACHINE, ], + "name": "sklearn.svm.classes.LinearSVR", + "primitive_family": metadata_base.PrimitiveFamily.REGRESSION, + "python_path": "d3m.primitives.regression.linear_svr.SKlearn", + "source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVR.html']}, + "version": "2019.11.13", + "id": "f40ffdc0-1d6d-3234-8fd0-a3e4d7a136a7", + "hyperparams_to_tune": ['C'], + 'installation': [ + {'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + }] + }) + + def __init__(self, *, + hyperparams: Hyperparams, + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None, + _verbose: int = 0) -> None: + + super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) + + # False + self._clf = LinearSVR( + C=self.hyperparams['C'], + loss=self.hyperparams['loss'], + epsilon=self.hyperparams['epsilon'], + dual=self.hyperparams['dual'], + tol=self.hyperparams['tol'], + fit_intercept=self.hyperparams['fit_intercept'], + intercept_scaling=self.hyperparams['intercept_scaling'], + max_iter=self.hyperparams['max_iter'], + verbose=_verbose, + random_state=self.random_seed, + ) + + self._inputs = None + self._outputs = None + self._training_inputs = None + self._training_outputs = None + self._target_names = None + self._training_indices = None + self._target_column_indices = None + self._target_columns_metadata: List[OrderedDict] = None + self._input_column_names = None + self._fitted = False + self._new_training_data = False + + def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None: + self._inputs = inputs + self._outputs = outputs + self._fitted = False + self._new_training_data = True + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + if self._inputs is None or self._outputs is None: + raise ValueError("Missing training data.") + + if not self._new_training_data: + return CallResult(None) + self._new_training_data = False + + self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams) + self._training_outputs, self._target_names, self._target_column_indices = self._get_targets(self._outputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + if len(self._training_indices) > 0 and len(self._target_column_indices) > 0: 
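+            # Descriptive note: this branch runs only when at least one feature column
+            # and at least one target column were selected. A single-column target is
+            # flattened with numpy.ravel below because scikit-learn estimators expect
+            # a 1-D y for single-target problems.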
+ self._target_columns_metadata = self._get_target_columns_metadata(self._training_outputs.metadata, self.hyperparams) + sk_training_output = self._training_outputs.values + + shape = sk_training_output.shape + if len(shape) == 2 and shape[1] == 1: + sk_training_output = numpy.ravel(sk_training_output) + + self._clf.fit(self._training_inputs, sk_training_output) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + return CallResult(None) + + + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + sk_inputs, columns_to_use = self._get_columns_to_fit(inputs, self.hyperparams) + output = [] + if len(sk_inputs.columns): + try: + sk_output = self._clf.predict(sk_inputs) + except sklearn.exceptions.NotFittedError as error: + raise PrimitiveNotFittedError("Primitive not fitted.") from error + # For primitives that allow predicting without fitting like GaussianProcessRegressor + if not self._fitted: + raise PrimitiveNotFittedError("Primitive not fitted.") + if sparse.issparse(sk_output): + sk_output = sk_output.toarray() + output = self._wrap_predictions(inputs, sk_output) + output.columns = self._target_names + output = [output] + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._target_column_indices, + columns_list=output) + + return CallResult(outputs) + + + def get_params(self) -> Params: + if not self._fitted: + return Params( + coef_=None, + intercept_=None, + n_iter_=None, + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + return Params( + coef_=getattr(self._clf, 'coef_', None), + intercept_=getattr(self._clf, 'intercept_', None), + n_iter_=getattr(self._clf, 'n_iter_', None), + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + def set_params(self, *, params: Params) -> None: + self._clf.coef_ = params['coef_'] + self._clf.intercept_ = params['intercept_'] + self._clf.n_iter_ = params['n_iter_'] + self._input_column_names = params['input_column_names'] + self._training_indices = params['training_indices_'] + self._target_names = params['target_names_'] + self._target_column_indices = params['target_column_indices_'] + self._target_columns_metadata = params['target_columns_metadata_'] + + if params['coef_'] is not None: + self._fitted = True + if params['intercept_'] is not None: + self._fitted = True + if params['n_iter_'] is not None: + self._fitted = True + + + + + + + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + 
columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=hyperparams['use_inputs_columns'], + exclude_columns=hyperparams['exclude_inputs_columns'], + can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, numpy.integer, numpy.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + @classmethod + def _get_targets(cls, data: d3m_dataframe, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return data, list(data.columns), list(range(len(data.columns))) + + metadata = data.metadata + + def can_produce_column(column_index: int) -> bool: + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/TrueTarget") + column_metadata = metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + semantic_types = set(column_metadata.get('semantic_types', [])) + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + return False + + target_column_indices, target_columns_not_to_produce = base_utils.get_columns_to_use(metadata, + use_columns=hyperparams[ + 'use_outputs_columns'], + exclude_columns= + hyperparams[ + 'exclude_outputs_columns'], + can_use_column=can_produce_column) + targets = [] + if target_column_indices: + targets = data.select_columns(target_column_indices) + target_column_names = [] + for idx in target_column_indices: + target_column_names.append(data.columns[idx]) + return targets, target_column_names, target_column_indices + + @classmethod + def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]: + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) + + # Update semantic types and prepare it for predicted targets. 
+ semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set(["https://metadata.datadrivendiscovery.org/types/TrueTarget","https://metadata.datadrivendiscovery.org/types/SuggestedTarget",]) + add_semantic_types = set(["https://metadata.datadrivendiscovery.org/types/PredictedTarget",]) + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + outputs = d3m_dataframe(predictions, generate_metadata=False) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, self._target_columns_metadata) + return outputs + + + @classmethod + def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata): + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict() + semantic_types = [] + semantic_types.append('https://metadata.datadrivendiscovery.org/types/PredictedTarget') + column_name = outputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name") + if column_name is None: + column_name = "output_{}".format(column_index) + column_metadata["semantic_types"] = semantic_types + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + +SKLinearSVR.__doc__ = LinearSVR.__doc__ \ No newline at end of file diff --git a/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKLogisticRegression.py b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKLogisticRegression.py new file mode 100644 index 0000000..f5578d7 --- /dev/null +++ b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKLogisticRegression.py @@ -0,0 +1,582 @@ +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os +import sklearn +import numpy +import typing + +# Custom import commands if any +from sklearn.linear_model.logistic import LogisticRegression + + +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult, DockerContainer + +from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase +from d3m.primitive_interfaces.base import ProbabilisticCompositionalityMixin, ContinueFitMixin +from d3m import exceptions +import pandas 
+ + + +Inputs = d3m_dataframe +Outputs = d3m_dataframe + + +class Params(params.Params): + coef_: Optional[ndarray] + intercept_: Optional[ndarray] + n_iter_: Optional[ndarray] + classes_: Optional[ndarray] + input_column_names: Optional[Any] + target_names_: Optional[Sequence[Any]] + training_indices_: Optional[Sequence[int]] + target_column_indices_: Optional[Sequence[int]] + target_columns_metadata_: Optional[List[OrderedDict]] + + + +class Hyperparams(hyperparams.Hyperparams): + penalty = hyperparams.Choice( + choices={ + 'l1': hyperparams.Hyperparams.define( + configuration=OrderedDict({}) + ), + 'l2': hyperparams.Hyperparams.define( + configuration=OrderedDict({}) + ), + 'none': hyperparams.Hyperparams.define( + configuration=OrderedDict({}) + ), + 'elasticnet': hyperparams.Hyperparams.define( + configuration=OrderedDict({ + 'l1_ratio': hyperparams.Union( + configuration=OrderedDict({ + 'float': hyperparams.Uniform( + lower=0, + upper=1, + default=0.001, + lower_inclusive=True, + upper_inclusive=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'none': hyperparams.Constant( + default=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='float', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + }) + ) + }, + default='l2', + description='Used to specify the norm used in the penalization. The \'newton-cg\', \'sag\' and \'lbfgs\' solvers support only l2 penalties.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + dual = hyperparams.UniformBool( + default=False, + description='Dual or primal formulation. Dual formulation is only implemented for l2 penalty with liblinear solver. Prefer dual=False when n_samples > n_features.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + fit_intercept = hyperparams.UniformBool( + default=True, + description='Specifies if a constant (a.k.a. bias or intercept) should be added to the decision function.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + intercept_scaling = hyperparams.Hyperparameter[float]( + default=1, + description='Useful only when the solver \'liblinear\' is used and self.fit_intercept is set to True. In this case, x becomes [x, self.intercept_scaling], i.e. a "synthetic" feature with constant value equal to intercept_scaling is appended to the instance vector. The intercept becomes ``intercept_scaling * synthetic_feature_weight``. Note! the synthetic feature weight is subject to l1/l2 regularization as all other features. To lessen the effect of regularization on synthetic feature weight (and therefore on the intercept) intercept_scaling has to be increased.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + class_weight = hyperparams.Union( + configuration=OrderedDict({ + 'str': hyperparams.Constant( + default='balanced', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'none': hyperparams.Constant( + default=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='none', + description='Weights associated with classes in the form ``{class_label: weight}``. If not given, all classes are supposed to have weight one. 
The "balanced" mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as ``n_samples / (n_classes * np.bincount(y))``. Note that these weights will be multiplied with sample_weight (passed through the fit method) if sample_weight is specified. .. versionadded:: 0.17 *class_weight=\'balanced\'* instead of deprecated *class_weight=\'auto\'*.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + max_iter = hyperparams.Bounded[int]( + default=100, + lower=0, + upper=None, + description='Useful only for the newton-cg, sag and lbfgs solvers. Maximum number of iterations taken for the solvers to converge.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + solver = hyperparams.Enumeration[str]( + values=['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'], + default='liblinear', + description='Algorithm to use in the optimization problem. - For small datasets, \'liblinear\' is a good choice, whereas \'sag\' is faster for large ones. - For multiclass problems, only \'newton-cg\', \'sag\' and \'lbfgs\' handle multinomial loss; \'liblinear\' is limited to one-versus-rest schemes. - \'newton-cg\', \'lbfgs\' and \'sag\' only handle L2 penalty. Note that \'sag\' fast convergence is only guaranteed on features with approximately the same scale. You can preprocess the data with a scaler from sklearn.preprocessing. .. versionadded:: 0.17 Stochastic Average Gradient descent solver.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + tol = hyperparams.Bounded[float]( + default=0.0001, + lower=0, + upper=None, + description='Tolerance for stopping criteria.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + C = hyperparams.Hyperparameter[float]( + default=1.0, + description='Inverse of regularization strength; must be a positive float. Like in support vector machines, smaller values specify stronger regularization.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + multi_class = hyperparams.Enumeration[str]( + values=['ovr', 'multinomial'], + default='ovr', + description='Multiclass option can be either \'ovr\' or \'multinomial\'. If the option chosen is \'ovr\', then a binary problem is fit for each label. Else the loss minimised is the multinomial loss fit across the entire probability distribution. Works only for the \'newton-cg\', \'sag\' and \'lbfgs\' solver. .. versionadded:: 0.18 Stochastic Average Gradient descent solver for \'multinomial\' case.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + warm_start = hyperparams.UniformBool( + default=False, + description='When set to True, reuse the solution of the previous call to fit as initialization, otherwise, just erase the previous solution. Useless for liblinear solver. .. 
versionadded:: 0.17 *warm_start* to support *lbfgs*, *newton-cg*, *sag* solvers.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + n_jobs = hyperparams.Union( + configuration=OrderedDict({ + 'limit': hyperparams.Bounded[int]( + default=1, + lower=1, + upper=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'all_cores': hyperparams.Constant( + default=-1, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='limit', + description='Number of CPU cores used during the cross-validation loop. If given a value of -1, all cores are used.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ResourcesUseParameter'] + ) + + use_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to use as training input. If any specified column cannot be parsed, it is skipped.", + ) + use_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to use as training target. If any specified column cannot be parsed, it is skipped.", + ) + exclude_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not use as training inputs. Applicable only if \"use_columns\" is not provided.", + ) + exclude_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not use as training target. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='new', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. 
To prevent pipelines from breaking, set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute', 'https://metadata.datadrivendiscovery.org/types/PredictedTarget'], + default='https://metadata.datadrivendiscovery.org/types/PredictedTarget', + description='Decides what semantic type to attach to generated output', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + +class SKLogisticRegression(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams], + ProbabilisticCompositionalityMixin[Inputs, Outputs, Params, Hyperparams]): + """ + Primitive wrapping for sklearn LogisticRegression + `sklearn documentation <https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html>`_ + + """ + + __author__ = "JPL MARVIN" + metadata = metadata_base.PrimitiveMetadata({ + "algorithm_types": [metadata_base.PrimitiveAlgorithmType.LOGISTIC_REGRESSION, ], + "name": "sklearn.linear_model.logistic.LogisticRegression", + "primitive_family": metadata_base.PrimitiveFamily.CLASSIFICATION, + "python_path": "d3m.primitives.classification.logistic_regression.SKlearn", + "source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html']}, + "version": "2019.11.13", + "id": "b9c81b40-8ed1-3b23-80cf-0d6fe6863962", + "hyperparams_to_tune": ['C', 'penalty'], + 'installation': [ + {'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + }] + }) + + def __init__(self, *, + hyperparams: Hyperparams, + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None, + _verbose: int = 0) -> None: + + super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) + + # Underlying sklearn estimator, configured from the primitive's hyperparams. + self._clf = LogisticRegression( + penalty=self.hyperparams['penalty']['choice'], + l1_ratio=self.hyperparams['penalty'].get('l1_ratio', None), + dual=self.hyperparams['dual'], + fit_intercept=self.hyperparams['fit_intercept'], + intercept_scaling=self.hyperparams['intercept_scaling'], + class_weight=self.hyperparams['class_weight'], + max_iter=self.hyperparams['max_iter'], + solver=self.hyperparams['solver'], + tol=self.hyperparams['tol'], + C=self.hyperparams['C'], + multi_class=self.hyperparams['multi_class'], + warm_start=self.hyperparams['warm_start'], + n_jobs=self.hyperparams['n_jobs'], + random_state=self.random_seed, + verbose=_verbose + ) + + self._inputs = None + self._outputs = None + self._training_inputs = None + self._training_outputs = None + self._target_names = None + self._training_indices = None + self._target_column_indices = None + self._target_columns_metadata: List[OrderedDict] = None + self._input_column_names = None + self._fitted = False + self._new_training_data = False + + def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None: + self._inputs = inputs + self._outputs = outputs + self._fitted = False + self._new_training_data = True + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + if self._inputs is None or self._outputs is None: + raise ValueError("Missing training data.") + + if not self._new_training_data: + return
CallResult(None) + self._new_training_data = False + + self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams) + self._training_outputs, self._target_names, self._target_column_indices = self._get_targets(self._outputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + if len(self._training_indices) > 0 and len(self._target_column_indices) > 0: + self._target_columns_metadata = self._get_target_columns_metadata(self._training_outputs.metadata, self.hyperparams) + sk_training_output = self._training_outputs.values + + shape = sk_training_output.shape + if len(shape) == 2 and shape[1] == 1: + sk_training_output = numpy.ravel(sk_training_output) + + self._clf.fit(self._training_inputs, sk_training_output) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + return CallResult(None) + + + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + sk_inputs, columns_to_use = self._get_columns_to_fit(inputs, self.hyperparams) + output = [] + if len(sk_inputs.columns): + try: + sk_output = self._clf.predict(sk_inputs) + except sklearn.exceptions.NotFittedError as error: + raise PrimitiveNotFittedError("Primitive not fitted.") from error + # For primitives that allow predicting without fitting like GaussianProcessRegressor + if not self._fitted: + raise PrimitiveNotFittedError("Primitive not fitted.") + if sparse.issparse(sk_output): + sk_output = sk_output.toarray() + output = self._wrap_predictions(inputs, sk_output) + output.columns = self._target_names + output = [output] + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._target_column_indices, + columns_list=output) + + return CallResult(outputs) + + + def get_params(self) -> Params: + if not self._fitted: + return Params( + coef_=None, + intercept_=None, + n_iter_=None, + classes_=None, + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + return Params( + coef_=getattr(self._clf, 'coef_', None), + intercept_=getattr(self._clf, 'intercept_', None), + n_iter_=getattr(self._clf, 'n_iter_', None), + classes_=getattr(self._clf, 'classes_', None), + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + def set_params(self, *, params: Params) -> None: + self._clf.coef_ = params['coef_'] + self._clf.intercept_ = params['intercept_'] + self._clf.n_iter_ = params['n_iter_'] + self._clf.classes_ = params['classes_'] + self._input_column_names = params['input_column_names'] + self._training_indices = params['training_indices_'] + self._target_names = params['target_names_'] + self._target_column_indices = params['target_column_indices_'] + self._target_columns_metadata = params['target_columns_metadata_'] + + if params['coef_'] 
is not None: + self._fitted = True + if params['intercept_'] is not None: + self._fitted = True + if params['n_iter_'] is not None: + self._fitted = True + if params['classes_'] is not None: + self._fitted = True + + + def log_likelihoods(self, *, + outputs: Outputs, + inputs: Inputs, + timeout: float = None, + iterations: int = None) -> CallResult[Sequence[float]]: + inputs = inputs.iloc[:, self._training_indices] # Get ndarray + outputs = outputs.iloc[:, self._target_column_indices] + + if len(inputs.columns) and len(outputs.columns): + + if outputs.shape[1] != self._clf.n_outputs_: + raise exceptions.InvalidArgumentValueError("\"outputs\" argument does not have the correct number of target columns.") + + log_proba = self._clf.predict_log_proba(inputs) + + # Making it always a list, even when only one target. + if self._clf.n_outputs_ == 1: + log_proba = [log_proba] + classes = [self._clf.classes_] + else: + classes = self._clf.classes_ + + samples_length = inputs.shape[0] + + log_likelihoods = [] + for k in range(self._clf.n_outputs_): + # We have to map each class to its internal (numerical) index used in the learner. + # This allows "outputs" to contain string classes. + outputs_column = outputs.iloc[:, k] + classes_map = pandas.Series(numpy.arange(len(classes[k])), index=classes[k]) + mapped_outputs_column = outputs_column.map(classes_map) + + # For each target column (column in "outputs"), for each sample (row) we pick the log + # likelihood for a given class. + log_likelihoods.append(log_proba[k][numpy.arange(samples_length), mapped_outputs_column]) + + results = d3m_dataframe(dict(enumerate(log_likelihoods)), generate_metadata=True) + results.columns = outputs.columns + + for k in range(self._clf.n_outputs_): + column_metadata = outputs.metadata.query_column(k) + if 'name' in column_metadata: + results.metadata = results.metadata.update_column(k, {'name': column_metadata['name']}) + + else: + results = d3m_dataframe(generate_metadata=True) + + return CallResult(results) + + + + + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=hyperparams['use_inputs_columns'], + exclude_columns=hyperparams['exclude_inputs_columns'], + can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, numpy.integer, numpy.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 
0: + return True + + return False + + @classmethod + def _get_targets(cls, data: d3m_dataframe, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return data, list(data.columns), list(range(len(data.columns))) + + metadata = data.metadata + + def can_produce_column(column_index: int) -> bool: + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/TrueTarget") + column_metadata = metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + semantic_types = set(column_metadata.get('semantic_types', [])) + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + return False + + target_column_indices, target_columns_not_to_produce = base_utils.get_columns_to_use(metadata, + use_columns=hyperparams[ + 'use_outputs_columns'], + exclude_columns= + hyperparams[ + 'exclude_outputs_columns'], + can_use_column=can_produce_column) + targets = [] + if target_column_indices: + targets = data.select_columns(target_column_indices) + target_column_names = [] + for idx in target_column_indices: + target_column_names.append(data.columns[idx]) + return targets, target_column_names, target_column_indices + + @classmethod + def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]: + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) + + # Update semantic types and prepare it for predicted targets. 
+ semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set(["https://metadata.datadrivendiscovery.org/types/TrueTarget","https://metadata.datadrivendiscovery.org/types/SuggestedTarget",]) + add_semantic_types = set(["https://metadata.datadrivendiscovery.org/types/PredictedTarget",]) + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + outputs = d3m_dataframe(predictions, generate_metadata=False) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, self._target_columns_metadata) + return outputs + + + @classmethod + def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata): + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict() + semantic_types = [] + semantic_types.append('https://metadata.datadrivendiscovery.org/types/PredictedTarget') + column_name = outputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name") + if column_name is None: + column_name = "output_{}".format(column_index) + column_metadata["semantic_types"] = semantic_types + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + +SKLogisticRegression.__doc__ = LogisticRegression.__doc__ \ No newline at end of file diff --git a/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKMLPClassifier.py b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKMLPClassifier.py new file mode 100644 index 0000000..c0acbcd --- /dev/null +++ b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKMLPClassifier.py @@ -0,0 +1,730 @@ +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os +import sklearn +import numpy +import typing + +# Custom import commands if any +from sklearn.neural_network.multilayer_perceptron import MLPClassifier + + +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult, DockerContainer + +from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase +from d3m.primitive_interfaces.base import ProbabilisticCompositionalityMixin, ContinueFitMixin +from d3m import exceptions 
+import pandas + + + +Inputs = d3m_dataframe +Outputs = d3m_dataframe + + +class Params(params.Params): + classes_: Optional[ndarray] + loss_: Optional[float] + coefs_: Optional[Sequence[Any]] + intercepts_: Optional[Sequence[Any]] + n_iter_: Optional[int] + n_layers_: Optional[int] + n_outputs_: Optional[int] + out_activation_: Optional[str] + _best_coefs: Optional[Sequence[Any]] + _best_intercepts: Optional[Sequence[Any]] + _label_binarizer: Optional[sklearn.preprocessing.LabelBinarizer] + _no_improvement_count: Optional[int] + _random_state: Optional[numpy.random.mtrand.RandomState] + best_validation_score_: Optional[numpy.float64] + loss_curve_: Optional[Sequence[Any]] + t_: Optional[int] + _optimizer: Optional[sklearn.neural_network._stochastic_optimizers.AdamOptimizer] + validation_scores_: Optional[Sequence[Any]] + input_column_names: Optional[Any] + target_names_: Optional[Sequence[Any]] + training_indices_: Optional[Sequence[int]] + target_column_indices_: Optional[Sequence[int]] + target_columns_metadata_: Optional[List[OrderedDict]] + + + +class Hyperparams(hyperparams.Hyperparams): + hidden_layer_sizes = hyperparams.List( + elements=hyperparams.Bounded(1, None, 100), + default=(100, ), + min_size=1, + max_size=None, + description='The ith element represents the number of neurons in the ith hidden layer.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + activation = hyperparams.Enumeration[str]( + values=['identity', 'logistic', 'tanh', 'relu'], + default='relu', + description='Activation function for the hidden layer. - \'identity\', no-op activation, useful to implement linear bottleneck, returns f(x) = x - \'logistic\', the logistic sigmoid function, returns f(x) = 1 / (1 + exp(-x)). - \'tanh\', the hyperbolic tan function, returns f(x) = tanh(x). - \'relu\', the rectified linear unit function, returns f(x) = max(0, x)', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + solver = hyperparams.Choice( + choices={ + 'lbfgs': hyperparams.Hyperparams.define( + configuration=OrderedDict({}) + ), + 'sgd': hyperparams.Hyperparams.define( + configuration=OrderedDict({ + 'learning_rate': hyperparams.Enumeration[str]( + values=['constant', 'invscaling', 'adaptive'], + default='constant', + description='Learning rate schedule for weight updates. Only used when solver=’sgd’.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ), + 'learning_rate_init': hyperparams.Bounded[float]( + lower=0, + upper=None, + default=0.001, + description='The initial learning rate used. It controls the step-size in updating the weights. Only used when solver=’sgd’ or ‘adam’.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ), + 'power_t': hyperparams.Bounded[float]( + lower=0, + upper=None, + default=0.5, + description='The exponent for inverse scaling learning rate. Only used when solver=’sgd’.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ), + 'shuffle': hyperparams.UniformBool( + default=True, + description='Whether to shuffle samples in each iteration. Only used when solver=’sgd’ or ‘adam’.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ), + 'momentum': hyperparams.Bounded[float]( + default=0.9, + lower=0, + upper=1, + description='Momentum for gradient descent update. Should be between 0 and 1. 
Only used when solver=’sgd’.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ), + 'nesterovs_momentum': hyperparams.UniformBool( + default=True, + description='Whether to use Nesterov’s momentum. Only used when solver=’sgd’ and momentum > 0.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ), + 'early_stopping': hyperparams.UniformBool( + default=False, + description='Whether to use early stopping to terminate training when validation score is not improving.If set to true, it will automatically set aside 10% of training data as validation and terminate training when validation score is not improving by at least tol for n_iter_no_change consecutive epochs.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ), + 'n_iter_no_change': hyperparams.Bounded[int]( + default=10, + lower=1, + upper=None, + description='Maximum number of epochs to not meet tol improvement. Only effective when solver=’sgd’ or ‘adam’.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + }) + ), + 'adam': hyperparams.Hyperparams.define( + configuration=OrderedDict({ + 'learning_rate_init': hyperparams.Bounded[float]( + lower=0, + upper=None, + default=0.001, + description='The initial learning rate used. It controls the step-size in updating the weights. Only used when solver=’sgd’ or ‘adam’.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ), + 'shuffle': hyperparams.UniformBool( + default=True, + description='Whether to shuffle samples in each iteration. Only used when solver=’sgd’ or ‘adam’.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ), + 'early_stopping': hyperparams.UniformBool( + default=False, + description='Whether to use early stopping to terminate training when validation score is not improving.If set to true, it will automatically set aside 10% of training data as validation and terminate training when validation score is not improving by at least tol for n_iter_no_change consecutive epochs.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ), + 'beta_1': hyperparams.Bounded[float]( + default=0.9, + lower=0, + upper=1, + description='Exponential decay rate for estimates of first moment vector in adam, should be in [0, 1).', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ), + 'beta_2': hyperparams.Bounded[float]( + default=0.999, + lower=0, + upper=1, + description='Exponential decay rate for estimates of second moment vector in adam, should be in [0, 1).', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ), + 'epsilon': hyperparams.Bounded[float]( + default=1e-08, + lower=0, + upper=None, + description='Value for numerical stability in adam. Only used when solver=’adam’', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ), + 'n_iter_no_change': hyperparams.Bounded[int]( + default=10, + lower=1, + upper=None, + description='Maximum number of epochs to not meet tol improvement. Only effective when solver=’sgd’ or ‘adam’.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + }) + ) + }, + default='adam', + description='The solver for weight optimization. - \'lbfgs\' is an optimizer in the family of quasi-Newton methods. - \'sgd\' refers to stochastic gradient descent. 
- \'adam\' refers to a stochastic gradient-based optimizer proposed by Kingma, Diederik, and Jimmy Ba Note: The default solver \'adam\' works pretty well on relatively large datasets (with thousands of training samples or more) in terms of both training time and validation score. For small datasets, however, \'lbfgs\' can converge faster and perform better.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + alpha = hyperparams.Bounded[float]( + lower=0, + upper=None, + default=0.0001, + description='L2 penalty (regularization term) parameter.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + batch_size = hyperparams.Union( + configuration=OrderedDict({ + 'int': hyperparams.Bounded[int]( + lower=0, + upper=None, + default=16, + description='Size of minibatches for stochastic optimizers. If the solver is ‘lbfgs’, the classifier will not use minibatch', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'auto': hyperparams.Constant( + default='auto', + description='When set to “auto”, batch_size=min(200, n_samples)', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='auto', + description='Size of minibatches for stochastic optimizers. If the solver is \'lbfgs\', the classifier will not use minibatch. When set to "auto", `batch_size=min(200, n_samples)`', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + max_iter = hyperparams.Bounded[float]( + lower=0, + upper=None, + default=200, + description='Maximum number of iterations. The solver iterates until convergence (determined by \'tol\') or this number of iterations. For stochastic solvers (\'sgd\', \'adam\'), note that this determines the number of epochs (how many times each data point will be used), not the number of gradient steps.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + tol = hyperparams.Bounded[float]( + default=0.0001, + lower=0, + upper=None, + description='Tolerance for the optimization. When the loss or score is not improving by at least ``tol`` for ``n_iter_no_change`` consecutive iterations, unless ``learning_rate`` is set to \'adaptive\', convergence is considered to be reached and training stops.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + validation_fraction = hyperparams.Bounded[float]( + default=0.1, + lower=0, + upper=None, + description='The proportion of training data to set aside as validation set for early stopping. Must be between 0 and 1. Only used if early_stopping is True', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + warm_start = hyperparams.UniformBool( + default=False, + description='When set to True, reuse the solution of the previous call to fit as initialization, otherwise, just erase the previous solution. See :term:`the Glossary `.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + use_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to use as training input. 
If any specified column cannot be parsed, it is skipped.", + ) + use_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to use as training target. If any specified column cannot be parsed, it is skipped.", + ) + exclude_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not use as training inputs. Applicable only if \"use_columns\" is not provided.", + ) + exclude_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not use as training target. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='new', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. 
To prevent pipelines from breaking, set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute', 'https://metadata.datadrivendiscovery.org/types/PredictedTarget'], + default='https://metadata.datadrivendiscovery.org/types/PredictedTarget', + description='Decides what semantic type to attach to generated output', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + +class SKMLPClassifier(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams], + ProbabilisticCompositionalityMixin[Inputs, Outputs, Params, Hyperparams]): + """ + Primitive wrapping for sklearn MLPClassifier + `sklearn documentation <https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html>`_ + + """ + + __author__ = "JPL MARVIN" + metadata = metadata_base.PrimitiveMetadata({ + "algorithm_types": [metadata_base.PrimitiveAlgorithmType.MULTILAYER_PERCEPTRON, ], + "name": "sklearn.neural_network.multilayer_perceptron.MLPClassifier", + "primitive_family": metadata_base.PrimitiveFamily.CLASSIFICATION, + "python_path": "d3m.primitives.classification.mlp.SKlearn", + "source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html']}, + "version": "2019.11.13", + "id": "89d7ffbd-df5d-352f-a038-311b7d379cd0", + "hyperparams_to_tune": ['hidden_layer_sizes', 'activation', 'solver', 'alpha'], + 'installation': [ + {'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + }] + }) + + def __init__(self, *, + hyperparams: Hyperparams, + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None, + _verbose: bool = None) -> None: + + super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) + + # Underlying sklearn estimator, configured from the primitive's hyperparams. + self._clf = MLPClassifier( + hidden_layer_sizes=self.hyperparams['hidden_layer_sizes'], + activation=self.hyperparams['activation'], + solver=self.hyperparams['solver']['choice'], + learning_rate=self.hyperparams['solver'].get('learning_rate', 'constant'), + learning_rate_init=self.hyperparams['solver'].get('learning_rate_init', 0.001), + power_t=self.hyperparams['solver'].get('power_t', 0.5), + shuffle=self.hyperparams['solver'].get('shuffle', True), + momentum=self.hyperparams['solver'].get('momentum', 0.9), + nesterovs_momentum=self.hyperparams['solver'].get('nesterovs_momentum', True), + early_stopping=self.hyperparams['solver'].get('early_stopping', False), + beta_1=self.hyperparams['solver'].get('beta_1', 0.9), + beta_2=self.hyperparams['solver'].get('beta_2', 0.999), + epsilon=self.hyperparams['solver'].get('epsilon', 1e-08), + n_iter_no_change=self.hyperparams['solver'].get('n_iter_no_change', 10), + alpha=self.hyperparams['alpha'], + batch_size=self.hyperparams['batch_size'], + max_iter=self.hyperparams['max_iter'], + tol=self.hyperparams['tol'], + validation_fraction=self.hyperparams['validation_fraction'], + warm_start=self.hyperparams['warm_start'], + random_state=self.random_seed, + verbose=_verbose + ) + + self._inputs = None + self._outputs = None + self._training_inputs = None + self._training_outputs = None + self._target_names = None + self._training_indices = None +
self._target_column_indices = None + self._target_columns_metadata: List[OrderedDict] = None + self._input_column_names = None + self._fitted = False + self._new_training_data = False + + def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None: + self._inputs = inputs + self._outputs = outputs + self._fitted = False + self._new_training_data = True + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + if self._inputs is None or self._outputs is None: + raise ValueError("Missing training data.") + + if not self._new_training_data: + return CallResult(None) + self._new_training_data = False + + self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams) + self._training_outputs, self._target_names, self._target_column_indices = self._get_targets(self._outputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + if len(self._training_indices) > 0 and len(self._target_column_indices) > 0: + self._target_columns_metadata = self._get_target_columns_metadata(self._training_outputs.metadata, self.hyperparams) + sk_training_output = self._training_outputs.values + + shape = sk_training_output.shape + if len(shape) == 2 and shape[1] == 1: + sk_training_output = numpy.ravel(sk_training_output) + + self._clf.fit(self._training_inputs, sk_training_output) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + return CallResult(None) + + + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + sk_inputs, columns_to_use = self._get_columns_to_fit(inputs, self.hyperparams) + output = [] + if len(sk_inputs.columns): + try: + sk_output = self._clf.predict(sk_inputs) + except sklearn.exceptions.NotFittedError as error: + raise PrimitiveNotFittedError("Primitive not fitted.") from error + # For primitives that allow predicting without fitting like GaussianProcessRegressor + if not self._fitted: + raise PrimitiveNotFittedError("Primitive not fitted.") + if sparse.issparse(sk_output): + sk_output = sk_output.toarray() + output = self._wrap_predictions(inputs, sk_output) + output.columns = self._target_names + output = [output] + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._target_column_indices, + columns_list=output) + + return CallResult(outputs) + + + def get_params(self) -> Params: + if not self._fitted: + return Params( + classes_=None, + loss_=None, + coefs_=None, + intercepts_=None, + n_iter_=None, + n_layers_=None, + n_outputs_=None, + out_activation_=None, + _best_coefs=None, + _best_intercepts=None, + _label_binarizer=None, + _no_improvement_count=None, + _random_state=None, + best_validation_score_=None, + loss_curve_=None, + t_=None, + _optimizer=None, + validation_scores_=None, + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + return Params( + classes_=getattr(self._clf, 'classes_', None), + 
loss_=getattr(self._clf, 'loss_', None), + coefs_=getattr(self._clf, 'coefs_', None), + intercepts_=getattr(self._clf, 'intercepts_', None), + n_iter_=getattr(self._clf, 'n_iter_', None), + n_layers_=getattr(self._clf, 'n_layers_', None), + n_outputs_=getattr(self._clf, 'n_outputs_', None), + out_activation_=getattr(self._clf, 'out_activation_', None), + _best_coefs=getattr(self._clf, '_best_coefs', None), + _best_intercepts=getattr(self._clf, '_best_intercepts', None), + _label_binarizer=getattr(self._clf, '_label_binarizer', None), + _no_improvement_count=getattr(self._clf, '_no_improvement_count', None), + _random_state=getattr(self._clf, '_random_state', None), + best_validation_score_=getattr(self._clf, 'best_validation_score_', None), + loss_curve_=getattr(self._clf, 'loss_curve_', None), + t_=getattr(self._clf, 't_', None), + _optimizer=getattr(self._clf, '_optimizer', None), + validation_scores_=getattr(self._clf, 'validation_scores_', None), + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + def set_params(self, *, params: Params) -> None: + self._clf.classes_ = params['classes_'] + self._clf.loss_ = params['loss_'] + self._clf.coefs_ = params['coefs_'] + self._clf.intercepts_ = params['intercepts_'] + self._clf.n_iter_ = params['n_iter_'] + self._clf.n_layers_ = params['n_layers_'] + self._clf.n_outputs_ = params['n_outputs_'] + self._clf.out_activation_ = params['out_activation_'] + self._clf._best_coefs = params['_best_coefs'] + self._clf._best_intercepts = params['_best_intercepts'] + self._clf._label_binarizer = params['_label_binarizer'] + self._clf._no_improvement_count = params['_no_improvement_count'] + self._clf._random_state = params['_random_state'] + self._clf.best_validation_score_ = params['best_validation_score_'] + self._clf.loss_curve_ = params['loss_curve_'] + self._clf.t_ = params['t_'] + self._clf._optimizer = params['_optimizer'] + self._clf.validation_scores_ = params['validation_scores_'] + self._input_column_names = params['input_column_names'] + self._training_indices = params['training_indices_'] + self._target_names = params['target_names_'] + self._target_column_indices = params['target_column_indices_'] + self._target_columns_metadata = params['target_columns_metadata_'] + + if params['classes_'] is not None: + self._fitted = True + if params['loss_'] is not None: + self._fitted = True + if params['coefs_'] is not None: + self._fitted = True + if params['intercepts_'] is not None: + self._fitted = True + if params['n_iter_'] is not None: + self._fitted = True + if params['n_layers_'] is not None: + self._fitted = True + if params['n_outputs_'] is not None: + self._fitted = True + if params['out_activation_'] is not None: + self._fitted = True + if params['_best_coefs'] is not None: + self._fitted = True + if params['_best_intercepts'] is not None: + self._fitted = True + if params['_label_binarizer'] is not None: + self._fitted = True + if params['_no_improvement_count'] is not None: + self._fitted = True + if params['_random_state'] is not None: + self._fitted = True + if params['best_validation_score_'] is not None: + self._fitted = True + if params['loss_curve_'] is not None: + self._fitted = True + if params['t_'] is not None: + self._fitted = True + if params['_optimizer'] is not None: + self._fitted = True + if params['validation_scores_'] is not None: + 
self._fitted = True + + + def log_likelihoods(self, *, + outputs: Outputs, + inputs: Inputs, + timeout: float = None, + iterations: int = None) -> CallResult[Sequence[float]]: + inputs = inputs.iloc[:, self._training_indices] # Get ndarray + outputs = outputs.iloc[:, self._target_column_indices] + + if len(inputs.columns) and len(outputs.columns): + + if outputs.shape[1] != self._clf.n_outputs_: + raise exceptions.InvalidArgumentValueError("\"outputs\" argument does not have the correct number of target columns.") + + log_proba = self._clf.predict_log_proba(inputs) + + # Making it always a list, even when only one target. + if self._clf.n_outputs_ == 1: + log_proba = [log_proba] + classes = [self._clf.classes_] + else: + classes = self._clf.classes_ + + samples_length = inputs.shape[0] + + log_likelihoods = [] + for k in range(self._clf.n_outputs_): + # We have to map each class to its internal (numerical) index used in the learner. + # This allows "outputs" to contain string classes. + outputs_column = outputs.iloc[:, k] + classes_map = pandas.Series(numpy.arange(len(classes[k])), index=classes[k]) + mapped_outputs_column = outputs_column.map(classes_map) + + # For each target column (column in "outputs"), for each sample (row) we pick the log + # likelihood for a given class. + log_likelihoods.append(log_proba[k][numpy.arange(samples_length), mapped_outputs_column]) + + results = d3m_dataframe(dict(enumerate(log_likelihoods)), generate_metadata=True) + results.columns = outputs.columns + + for k in range(self._clf.n_outputs_): + column_metadata = outputs.metadata.query_column(k) + if 'name' in column_metadata: + results.metadata = results.metadata.update_column(k, {'name': column_metadata['name']}) + + else: + results = d3m_dataframe(generate_metadata=True) + + return CallResult(results) + + + + + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=hyperparams['use_inputs_columns'], + exclude_columns=hyperparams['exclude_inputs_columns'], + can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, numpy.integer, numpy.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + @classmethod + def _get_targets(cls, data: d3m_dataframe, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return data, 
list(data.columns), list(range(len(data.columns))) + + metadata = data.metadata + + def can_produce_column(column_index: int) -> bool: + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/TrueTarget") + column_metadata = metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + semantic_types = set(column_metadata.get('semantic_types', [])) + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + return False + + target_column_indices, target_columns_not_to_produce = base_utils.get_columns_to_use(metadata, + use_columns=hyperparams[ + 'use_outputs_columns'], + exclude_columns= + hyperparams[ + 'exclude_outputs_columns'], + can_use_column=can_produce_column) + targets = [] + if target_column_indices: + targets = data.select_columns(target_column_indices) + target_column_names = [] + for idx in target_column_indices: + target_column_names.append(data.columns[idx]) + return targets, target_column_names, target_column_indices + + @classmethod + def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]: + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) + + # Update semantic types and prepare it for predicted targets. + semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set(["https://metadata.datadrivendiscovery.org/types/TrueTarget","https://metadata.datadrivendiscovery.org/types/SuggestedTarget",]) + add_semantic_types = set(["https://metadata.datadrivendiscovery.org/types/PredictedTarget",]) + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + outputs = d3m_dataframe(predictions, generate_metadata=False) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, self._target_columns_metadata) + return outputs + + + @classmethod + def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata): + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict() + semantic_types = [] + 
semantic_types.append('https://metadata.datadrivendiscovery.org/types/PredictedTarget') + column_name = outputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name") + if column_name is None: + column_name = "output_{}".format(column_index) + column_metadata["semantic_types"] = semantic_types + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + +SKMLPClassifier.__doc__ = MLPClassifier.__doc__ \ No newline at end of file diff --git a/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKMLPRegressor.py b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKMLPRegressor.py new file mode 100644 index 0000000..df6b0e9 --- /dev/null +++ b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKMLPRegressor.py @@ -0,0 +1,669 @@ +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os +import sklearn +import numpy +import typing + +# Custom import commands if any +from sklearn.neural_network.multilayer_perceptron import MLPRegressor + + +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult, DockerContainer + +from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase +from d3m.primitive_interfaces.base import ProbabilisticCompositionalityMixin, ContinueFitMixin +from d3m import exceptions +import pandas + + + +Inputs = d3m_dataframe +Outputs = d3m_dataframe + + +class Params(params.Params): + loss_: Optional[float] + coefs_: Optional[Sequence[Any]] + intercepts_: Optional[Sequence[Any]] + n_iter_: Optional[int] + n_layers_: Optional[int] + n_outputs_: Optional[int] + out_activation_: Optional[str] + _best_coefs: Optional[Sequence[Any]] + _best_intercepts: Optional[Sequence[Any]] + _no_improvement_count: Optional[int] + _random_state: Optional[numpy.random.mtrand.RandomState] + best_validation_score_: Optional[numpy.float64] + loss_curve_: Optional[Sequence[Any]] + t_: Optional[int] + _optimizer: Optional[sklearn.neural_network._stochastic_optimizers.AdamOptimizer] + validation_scores_: Optional[Sequence[Any]] + input_column_names: Optional[Any] + target_names_: Optional[Sequence[Any]] + training_indices_: Optional[Sequence[int]] + target_column_indices_: Optional[Sequence[int]] + target_columns_metadata_: Optional[List[OrderedDict]] + + + +class Hyperparams(hyperparams.Hyperparams): + hidden_layer_sizes = hyperparams.List( + elements=hyperparams.Bounded(1, None, 100), + default=(100, ), + min_size=1, + max_size=None, + description='The ith element represents the number of neurons in the ith hidden layer.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + activation = hyperparams.Enumeration[str]( + values=['identity', 'logistic', 'tanh', 'relu'], + default='relu', + description='Activation function for the hidden layer. - \'identity\', no-op activation, useful to implement linear bottleneck, returns f(x) = x - \'logistic\', the logistic sigmoid function, returns f(x) = 1 / (1 + exp(-x)). - \'tanh\', the hyperbolic tan function, returns f(x) = tanh(x). 
- \'relu\', the rectified linear unit function, returns f(x) = max(0, x)', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + solver = hyperparams.Choice( + choices={ + 'lbfgs': hyperparams.Hyperparams.define( + configuration=OrderedDict({}) + ), + 'sgd': hyperparams.Hyperparams.define( + configuration=OrderedDict({ + 'learning_rate': hyperparams.Enumeration[str]( + values=['constant', 'invscaling', 'adaptive'], + default='constant', + description='Learning rate schedule for weight updates. Only used when solver=’sgd’.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ), + 'learning_rate_init': hyperparams.Bounded[float]( + lower=0, + upper=None, + default=0.001, + description='The initial learning rate used. It controls the step-size in updating the weights. Only used when solver=’sgd’ or ‘adam’.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ), + 'power_t': hyperparams.Bounded[float]( + lower=0, + upper=None, + default=0.5, + description='The exponent for inverse scaling learning rate. Only used when solver=’sgd’.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ), + 'shuffle': hyperparams.UniformBool( + default=True, + description='Whether to shuffle samples in each iteration. Only used when solver=’sgd’ or ‘adam’.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ), + 'momentum': hyperparams.Bounded[float]( + default=0.9, + lower=0, + upper=1, + description='Momentum for gradient descent update. Should be between 0 and 1. Only used when solver=’sgd’.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ), + 'nesterovs_momentum': hyperparams.UniformBool( + default=True, + description='Whether to use Nesterov’s momentum. Only used when solver=’sgd’ and momentum > 0.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ), + 'early_stopping': hyperparams.UniformBool( + default=False, + description='Whether to use early stopping to terminate training when validation score is not improving.If set to true, it will automatically set aside 10% of training data as validation and terminate training when validation score is not improving by at least tol for n_iter_no_change consecutive epochs.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ), + 'n_iter_no_change': hyperparams.Bounded[int]( + default=10, + lower=1, + upper=None, + description='Maximum number of epochs to not meet tol improvement. Only effective when solver=’sgd’ or ‘adam’.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + }) + ), + 'adam': hyperparams.Hyperparams.define( + configuration=OrderedDict({ + 'learning_rate_init': hyperparams.Bounded[float]( + lower=0, + upper=None, + default=0.001, + description='The initial learning rate used. It controls the step-size in updating the weights. Only used when solver=’sgd’ or ‘adam’.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ), + 'shuffle': hyperparams.UniformBool( + default=True, + description='Whether to shuffle samples in each iteration. 
Only used when solver=’sgd’ or ‘adam’.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ), + 'early_stopping': hyperparams.UniformBool( + default=False, + description='Whether to use early stopping to terminate training when validation score is not improving.If set to true, it will automatically set aside 10% of training data as validation and terminate training when validation score is not improving by at least tol for n_iter_no_change consecutive epochs.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ), + 'beta_1': hyperparams.Bounded[float]( + default=0.9, + lower=0, + upper=1, + description='Exponential decay rate for estimates of first moment vector in adam, should be in [0, 1).', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ), + 'beta_2': hyperparams.Bounded[float]( + default=0.999, + lower=0, + upper=1, + description='Exponential decay rate for estimates of second moment vector in adam, should be in [0, 1).', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ), + 'epsilon': hyperparams.Bounded[float]( + default=1e-08, + lower=0, + upper=None, + description='Value for numerical stability in adam. Only used when solver=’adam’', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ), + 'n_iter_no_change': hyperparams.Bounded[int]( + default=10, + lower=1, + upper=None, + description='Maximum number of epochs to not meet tol improvement. Only effective when solver=’sgd’ or ‘adam’.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + }) + ) + }, + default='adam', + description='The solver for weight optimization. - \'lbfgs\' is an optimizer in the family of quasi-Newton methods. - \'sgd\' refers to stochastic gradient descent. - \'adam\' refers to a stochastic gradient-based optimizer proposed by Kingma, Diederik, and Jimmy Ba Note: The default solver \'adam\' works pretty well on relatively large datasets (with thousands of training samples or more) in terms of both training time and validation score. For small datasets, however, \'lbfgs\' can converge faster and perform better.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + alpha = hyperparams.Bounded[float]( + lower=0, + upper=None, + default=0.0001, + description='L2 penalty (regularization term) parameter.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + batch_size = hyperparams.Union( + configuration=OrderedDict({ + 'int': hyperparams.Bounded[int]( + lower=0, + upper=None, + default=16, + description='Size of minibatches for stochastic optimizers. If the solver is \'lbfgs\', the classifier will not use minibatch', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'auto': hyperparams.Constant( + default='auto', + description='When set to \'auto\', batch_size=min(200, n_samples)', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='auto', + description='Size of minibatches for stochastic optimizers. If the solver is \'lbfgs\', the classifier will not use minibatch. When set to "auto", `batch_size=min(200, n_samples)`', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + max_iter = hyperparams.Bounded[float]( + lower=0, + upper=None, + default=200, + description='Maximum number of iterations. 
The solver iterates until convergence (determined by \'tol\') or this number of iterations. For stochastic solvers (\'sgd\', \'adam\'), note that this determines the number of epochs (how many times each data point will be used), not the number of gradient steps.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + tol = hyperparams.Bounded[float]( + default=0.0001, + lower=0, + upper=None, + description='Tolerance for the optimization. When the loss or score is not improving by at least ``tol`` for ``n_iter_no_change`` consecutive iterations, unless ``learning_rate`` is set to \'adaptive\', convergence is considered to be reached and training stops.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + warm_start = hyperparams.UniformBool( + default=False, + description='When set to True, reuse the solution of the previous call to fit as initialization, otherwise, just erase the previous solution. See :term:`the Glossary `.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + validation_fraction = hyperparams.Bounded[float]( + default=0.1, + lower=0, + upper=None, + description='The proportion of training data to set aside as validation set for early stopping. Must be between 0 and 1. Only used if early_stopping is True', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + use_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to use as training input. If any specified column cannot be parsed, it is skipped.", + ) + use_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to use as training target. If any specified column cannot be parsed, it is skipped.", + ) + exclude_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not use as training inputs. Applicable only if \"use_columns\" is not provided.", + ) + exclude_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not use as training target. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='new', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. 
Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute', 'https://metadata.datadrivendiscovery.org/types/PredictedTarget'], + default='https://metadata.datadrivendiscovery.org/types/PredictedTarget', + description='Decides what semantic type to attach to generated output', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + +class SKMLPRegressor(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]): + """ + Primitive wrapping for sklearn MLPRegressor + `sklearn documentation `_ + + """ + + __author__ = "JPL MARVIN" + metadata = metadata_base.PrimitiveMetadata({ + "algorithm_types": [metadata_base.PrimitiveAlgorithmType.MULTILAYER_PERCEPTRON, ], + "name": "sklearn.neural_network.multilayer_perceptron.MLPRegressor", + "primitive_family": metadata_base.PrimitiveFamily.REGRESSION, + "python_path": "d3m.primitives.regression.mlp.SKlearn", + "source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPRegressor.html']}, + "version": "2019.11.13", + "id": "a4fedbf8-f69a-3440-9423-559291dfbd61", + "hyperparams_to_tune": ['hidden_layer_sizes', 'activation', 'solver', 'alpha'], + 'installation': [ + {'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + }] + }) + + def __init__(self, *, + hyperparams: Hyperparams, + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None, + _verbose: bool = None) -> None: + + super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) + + # False + self._clf = MLPRegressor( + hidden_layer_sizes=self.hyperparams['hidden_layer_sizes'], + activation=self.hyperparams['activation'], + solver=self.hyperparams['solver']['choice'], + learning_rate=self.hyperparams['solver'].get('learning_rate', 'constant'), + learning_rate_init=self.hyperparams['solver'].get('learning_rate_init', 0.001), + power_t=self.hyperparams['solver'].get('power_t', 0.5), + shuffle=self.hyperparams['solver'].get('shuffle', True), + momentum=self.hyperparams['solver'].get('momentum', 0.9), + nesterovs_momentum=self.hyperparams['solver'].get('nesterovs_momentum', True), + early_stopping=self.hyperparams['solver'].get('early_stopping', False), + beta_1=self.hyperparams['solver'].get('beta_1', 0.9), + beta_2=self.hyperparams['solver'].get('beta_2', 0.999), + epsilon=self.hyperparams['solver'].get('epsilon', 
1e-08), + n_iter_no_change=self.hyperparams['solver'].get('n_iter_no_change', 10), + alpha=self.hyperparams['alpha'], + batch_size=self.hyperparams['batch_size'], + max_iter=self.hyperparams['max_iter'], + tol=self.hyperparams['tol'], + warm_start=self.hyperparams['warm_start'], + validation_fraction=self.hyperparams['validation_fraction'], + random_state=self.random_seed, + verbose=_verbose + ) + + self._inputs = None + self._outputs = None + self._training_inputs = None + self._training_outputs = None + self._target_names = None + self._training_indices = None + self._target_column_indices = None + self._target_columns_metadata: List[OrderedDict] = None + self._input_column_names = None + self._fitted = False + self._new_training_data = False + + def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None: + self._inputs = inputs + self._outputs = outputs + self._fitted = False + self._new_training_data = True + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + if self._inputs is None or self._outputs is None: + raise ValueError("Missing training data.") + + if not self._new_training_data: + return CallResult(None) + self._new_training_data = False + + self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams) + self._training_outputs, self._target_names, self._target_column_indices = self._get_targets(self._outputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + if len(self._training_indices) > 0 and len(self._target_column_indices) > 0: + self._target_columns_metadata = self._get_target_columns_metadata(self._training_outputs.metadata, self.hyperparams) + sk_training_output = self._training_outputs.values + + shape = sk_training_output.shape + if len(shape) == 2 and shape[1] == 1: + sk_training_output = numpy.ravel(sk_training_output) + + self._clf.fit(self._training_inputs, sk_training_output) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + return CallResult(None) + + + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + sk_inputs, columns_to_use = self._get_columns_to_fit(inputs, self.hyperparams) + output = [] + if len(sk_inputs.columns): + try: + sk_output = self._clf.predict(sk_inputs) + except sklearn.exceptions.NotFittedError as error: + raise PrimitiveNotFittedError("Primitive not fitted.") from error + # For primitives that allow predicting without fitting like GaussianProcessRegressor + if not self._fitted: + raise PrimitiveNotFittedError("Primitive not fitted.") + if sparse.issparse(sk_output): + sk_output = sk_output.toarray() + output = self._wrap_predictions(inputs, sk_output) + output.columns = self._target_names + output = [output] + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._target_column_indices, + columns_list=output) + + return CallResult(outputs) + + + def get_params(self) -> Params: + if not self._fitted: + return Params( + loss_=None, + coefs_=None, + intercepts_=None, + n_iter_=None, + n_layers_=None, + n_outputs_=None, + out_activation_=None, 
+ _best_coefs=None, + _best_intercepts=None, + _no_improvement_count=None, + _random_state=None, + best_validation_score_=None, + loss_curve_=None, + t_=None, + _optimizer=None, + validation_scores_=None, + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + return Params( + loss_=getattr(self._clf, 'loss_', None), + coefs_=getattr(self._clf, 'coefs_', None), + intercepts_=getattr(self._clf, 'intercepts_', None), + n_iter_=getattr(self._clf, 'n_iter_', None), + n_layers_=getattr(self._clf, 'n_layers_', None), + n_outputs_=getattr(self._clf, 'n_outputs_', None), + out_activation_=getattr(self._clf, 'out_activation_', None), + _best_coefs=getattr(self._clf, '_best_coefs', None), + _best_intercepts=getattr(self._clf, '_best_intercepts', None), + _no_improvement_count=getattr(self._clf, '_no_improvement_count', None), + _random_state=getattr(self._clf, '_random_state', None), + best_validation_score_=getattr(self._clf, 'best_validation_score_', None), + loss_curve_=getattr(self._clf, 'loss_curve_', None), + t_=getattr(self._clf, 't_', None), + _optimizer=getattr(self._clf, '_optimizer', None), + validation_scores_=getattr(self._clf, 'validation_scores_', None), + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + def set_params(self, *, params: Params) -> None: + self._clf.loss_ = params['loss_'] + self._clf.coefs_ = params['coefs_'] + self._clf.intercepts_ = params['intercepts_'] + self._clf.n_iter_ = params['n_iter_'] + self._clf.n_layers_ = params['n_layers_'] + self._clf.n_outputs_ = params['n_outputs_'] + self._clf.out_activation_ = params['out_activation_'] + self._clf._best_coefs = params['_best_coefs'] + self._clf._best_intercepts = params['_best_intercepts'] + self._clf._no_improvement_count = params['_no_improvement_count'] + self._clf._random_state = params['_random_state'] + self._clf.best_validation_score_ = params['best_validation_score_'] + self._clf.loss_curve_ = params['loss_curve_'] + self._clf.t_ = params['t_'] + self._clf._optimizer = params['_optimizer'] + self._clf.validation_scores_ = params['validation_scores_'] + self._input_column_names = params['input_column_names'] + self._training_indices = params['training_indices_'] + self._target_names = params['target_names_'] + self._target_column_indices = params['target_column_indices_'] + self._target_columns_metadata = params['target_columns_metadata_'] + + if params['loss_'] is not None: + self._fitted = True + if params['coefs_'] is not None: + self._fitted = True + if params['intercepts_'] is not None: + self._fitted = True + if params['n_iter_'] is not None: + self._fitted = True + if params['n_layers_'] is not None: + self._fitted = True + if params['n_outputs_'] is not None: + self._fitted = True + if params['out_activation_'] is not None: + self._fitted = True + if params['_best_coefs'] is not None: + self._fitted = True + if params['_best_intercepts'] is not None: + self._fitted = True + if params['_no_improvement_count'] is not None: + self._fitted = True + if params['_random_state'] is not None: + self._fitted = True + if params['best_validation_score_'] is not None: + self._fitted = True + if params['loss_curve_'] is not None: + 
self._fitted = True + if params['t_'] is not None: + self._fitted = True + if params['_optimizer'] is not None: + self._fitted = True + if params['validation_scores_'] is not None: + self._fitted = True + + + + + + + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=hyperparams['use_inputs_columns'], + exclude_columns=hyperparams['exclude_inputs_columns'], + can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, numpy.integer, numpy.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + @classmethod + def _get_targets(cls, data: d3m_dataframe, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return data, list(data.columns), list(range(len(data.columns))) + + metadata = data.metadata + + def can_produce_column(column_index: int) -> bool: + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/TrueTarget") + column_metadata = metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + semantic_types = set(column_metadata.get('semantic_types', [])) + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + return False + + target_column_indices, target_columns_not_to_produce = base_utils.get_columns_to_use(metadata, + use_columns=hyperparams[ + 'use_outputs_columns'], + exclude_columns= + hyperparams[ + 'exclude_outputs_columns'], + can_use_column=can_produce_column) + targets = [] + if target_column_indices: + targets = data.select_columns(target_column_indices) + target_column_names = [] + for idx in target_column_indices: + target_column_names.append(data.columns[idx]) + return targets, target_column_names, target_column_indices + + @classmethod + def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]: + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) + + # Update 
semantic types and prepare it for predicted targets. + semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set(["https://metadata.datadrivendiscovery.org/types/TrueTarget","https://metadata.datadrivendiscovery.org/types/SuggestedTarget",]) + add_semantic_types = set(["https://metadata.datadrivendiscovery.org/types/PredictedTarget",]) + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + outputs = d3m_dataframe(predictions, generate_metadata=False) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, self._target_columns_metadata) + return outputs + + + @classmethod + def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata): + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict() + semantic_types = [] + semantic_types.append('https://metadata.datadrivendiscovery.org/types/PredictedTarget') + column_name = outputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name") + if column_name is None: + column_name = "output_{}".format(column_index) + column_metadata["semantic_types"] = semantic_types + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + +SKMLPRegressor.__doc__ = MLPRegressor.__doc__ \ No newline at end of file diff --git a/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKMaxAbsScaler.py b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKMaxAbsScaler.py new file mode 100644 index 0000000..50eaf4d --- /dev/null +++ b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKMaxAbsScaler.py @@ -0,0 +1,339 @@ +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os +import sklearn +import numpy +import typing + +# Custom import commands if any +from sklearn.preprocessing.data import MaxAbsScaler + + +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult, DockerContainer +from d3m.primitive_interfaces.unsupervised_learning import UnsupervisedLearnerPrimitiveBase + + +Inputs = d3m_dataframe +Outputs = d3m_dataframe + + +class Params(params.Params): + scale_: 
Optional[ndarray] + max_abs_: Optional[ndarray] + n_samples_seen_: Optional[int] + input_column_names: Optional[Any] + target_names_: Optional[Sequence[Any]] + training_indices_: Optional[Sequence[int]] + target_column_indices_: Optional[Sequence[int]] + target_columns_metadata_: Optional[List[OrderedDict]] + + + +class Hyperparams(hyperparams.Hyperparams): + + use_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to operate on. If any specified column cannot be parsed, it is skipped.", + ) + exclude_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='new', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. 
To prevent pipelines from breaking set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute'], + default='https://metadata.datadrivendiscovery.org/types/Attribute', + description='Decides what semantic type to attach to generated attributes', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + +class SKMaxAbsScaler(UnsupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]): + """ + Primitive wrapping for sklearn MaxAbsScaler + `sklearn documentation `_ + + """ + + __author__ = "JPL MARVIN" + metadata = metadata_base.PrimitiveMetadata({ + "algorithm_types": [metadata_base.PrimitiveAlgorithmType.FEATURE_SCALING, ], + "name": "sklearn.preprocessing.data.MaxAbsScaler", + "primitive_family": metadata_base.PrimitiveFamily.DATA_PREPROCESSING, + "python_path": "d3m.primitives.data_preprocessing.max_abs_scaler.SKlearn", + "source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MaxAbsScaler.html']}, + "version": "2019.11.13", + "id": "64d2ef5d-b221-3033-8342-76d0293fa99c", + 'installation': [ + {'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + }] + }) + + def __init__(self, *, + hyperparams: Hyperparams, + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None) -> None: + + super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) + + # False + self._clf = MaxAbsScaler( + ) + + self._inputs = None + self._outputs = None + self._training_inputs = None + self._training_outputs = None + self._target_names = None + self._training_indices = None + self._target_column_indices = None + self._target_columns_metadata: List[OrderedDict] = None + self._input_column_names = None + self._fitted = False + + + def set_training_data(self, *, inputs: Inputs) -> None: + self._inputs = inputs + self._fitted = False + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + if self._fitted: + return CallResult(None) + + self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + if self._training_inputs is None: + return CallResult(None) + + if len(self._training_indices) > 0: + self._clf.fit(self._training_inputs) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + return CallResult(None) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + if not self._fitted: + raise PrimitiveNotFittedError("Primitive not fitted.") + sk_inputs = inputs + if self.hyperparams['use_semantic_types']: + sk_inputs = inputs.iloc[:, self._training_indices] + output_columns = [] + if len(self._training_indices) > 0: + sk_output = self._clf.transform(sk_inputs) + if sparse.issparse(sk_output): + sk_output = sk_output.toarray() + outputs = self._wrap_predictions(inputs, sk_output) + if 
len(outputs.columns) == len(self._input_column_names): + outputs.columns = self._input_column_names + output_columns = [outputs] + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._training_indices, + columns_list=output_columns) + return CallResult(outputs) + + + def get_params(self) -> Params: + if not self._fitted: + return Params( + scale_=None, + max_abs_=None, + n_samples_seen_=None, + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + return Params( + scale_=getattr(self._clf, 'scale_', None), + max_abs_=getattr(self._clf, 'max_abs_', None), + n_samples_seen_=getattr(self._clf, 'n_samples_seen_', None), + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + def set_params(self, *, params: Params) -> None: + self._clf.scale_ = params['scale_'] + self._clf.max_abs_ = params['max_abs_'] + self._clf.n_samples_seen_ = params['n_samples_seen_'] + self._input_column_names = params['input_column_names'] + self._training_indices = params['training_indices_'] + self._target_names = params['target_names_'] + self._target_column_indices = params['target_column_indices_'] + self._target_columns_metadata = params['target_columns_metadata_'] + + if params['scale_'] is not None: + self._fitted = True + if params['max_abs_'] is not None: + self._fitted = True + if params['n_samples_seen_'] is not None: + self._fitted = True + + + + + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=hyperparams['use_columns'], + exclude_columns=hyperparams['exclude_columns'], + can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, numpy.integer, numpy.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return 
True + + return False + + + @classmethod + def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]: + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) + + # Update semantic types and prepare it for predicted targets. + semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set([]) + add_semantic_types = set() + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + outputs = d3m_dataframe(predictions, generate_metadata=True) + target_columns_metadata = self._copy_inputs_metadata(inputs.metadata, self._training_indices, outputs.metadata, self.hyperparams) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, target_columns_metadata) + return outputs + + + @classmethod + def _copy_inputs_metadata(cls, inputs_metadata: metadata_base.DataMetadata, input_indices: List[int], + outputs_metadata: metadata_base.DataMetadata, hyperparams): + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + target_columns_metadata: List[OrderedDict] = [] + for column_index in input_indices: + column_name = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name") + if column_name is None: + column_name = "output_{}".format(column_index) + + column_metadata = OrderedDict(inputs_metadata.query_column(column_index)) + semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set([]) + add_semantic_types = set() + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + # If outputs has more columns than index, add Attribute Type to all remaining + if outputs_length > len(input_indices): + for column_index in range(len(input_indices), outputs_length): + column_metadata = OrderedDict() + semantic_types = set() + semantic_types.add(hyperparams["return_semantic_type"]) + column_name = "output_{}".format(column_index) + column_metadata["semantic_types"] = list(semantic_types) + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + +SKMaxAbsScaler.__doc__ = MaxAbsScaler.__doc__ \
No newline at end of file diff --git a/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKMinMaxScaler.py b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKMinMaxScaler.py new file mode 100644 index 0000000..dc8fc78 --- /dev/null +++ b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKMinMaxScaler.py @@ -0,0 +1,366 @@ +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os +import sklearn +import numpy +import typing + +# Custom import commands if any +from sklearn.preprocessing.data import MinMaxScaler + + +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult, DockerContainer +from d3m.primitive_interfaces.unsupervised_learning import UnsupervisedLearnerPrimitiveBase + + +Inputs = d3m_dataframe +Outputs = d3m_dataframe + + +class Params(params.Params): + min_: Optional[ndarray] + scale_: Optional[ndarray] + data_min_: Optional[ndarray] + data_max_: Optional[ndarray] + data_range_: Optional[ndarray] + n_samples_seen_: Optional[int] + input_column_names: Optional[Any] + target_names_: Optional[Sequence[Any]] + training_indices_: Optional[Sequence[int]] + target_column_indices_: Optional[Sequence[int]] + target_columns_metadata_: Optional[List[OrderedDict]] + + + +class Hyperparams(hyperparams.Hyperparams): + feature_range = hyperparams.SortedSet( + elements=hyperparams.Hyperparameter[int](0), + default=(0, 1), + min_size=2, + max_size=2, + description='Desired range of transformed data.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + use_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to operate on. If any specified column cannot be parsed, it is skipped.", + ) + exclude_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='new', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. 
Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute'], + default='https://metadata.datadrivendiscovery.org/types/Attribute', + description='Decides what semantic type to attach to generated attributes', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + +class SKMinMaxScaler(UnsupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]): + """ + Primitive wrapping for sklearn MinMaxScaler + `sklearn documentation `_ + + """ + + __author__ = "JPL MARVIN" + metadata = metadata_base.PrimitiveMetadata({ + "algorithm_types": [metadata_base.PrimitiveAlgorithmType.FEATURE_SCALING, ], + "name": "sklearn.preprocessing.data.MinMaxScaler", + "primitive_family": metadata_base.PrimitiveFamily.DATA_PREPROCESSING, + "python_path": "d3m.primitives.data_preprocessing.min_max_scaler.SKlearn", + "source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html']}, + "version": "2019.11.13", + "id": "08d0579d-38da-307b-8b75-6a213ef2972e", + 'installation': [ + {'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + }] + }) + + def __init__(self, *, + hyperparams: Hyperparams, + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None) -> None: + + super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) + + # False + self._clf = MinMaxScaler( + feature_range=self.hyperparams['feature_range'], + ) + + self._inputs = None + self._outputs = None + self._training_inputs = None + self._training_outputs = None + self._target_names = None + self._training_indices = None + self._target_column_indices = None + self._target_columns_metadata: List[OrderedDict] = None + self._input_column_names = None + self._fitted = False + + + def set_training_data(self, *, inputs: Inputs) -> None: + self._inputs = inputs + self._fitted = False + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + if self._fitted: + return CallResult(None) + + self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + if self._training_inputs is None: + return CallResult(None) + + if len(self._training_indices) > 0: + self._clf.fit(self._training_inputs) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + return CallResult(None) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + if not self._fitted: + raise 
PrimitiveNotFittedError("Primitive not fitted.") + sk_inputs = inputs + if self.hyperparams['use_semantic_types']: + sk_inputs = inputs.iloc[:, self._training_indices] + output_columns = [] + if len(self._training_indices) > 0: + sk_output = self._clf.transform(sk_inputs) + if sparse.issparse(sk_output): + sk_output = sk_output.toarray() + outputs = self._wrap_predictions(inputs, sk_output) + if len(outputs.columns) == len(self._input_column_names): + outputs.columns = self._input_column_names + output_columns = [outputs] + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._training_indices, + columns_list=output_columns) + return CallResult(outputs) + + + def get_params(self) -> Params: + if not self._fitted: + return Params( + min_=None, + scale_=None, + data_min_=None, + data_max_=None, + data_range_=None, + n_samples_seen_=None, + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + return Params( + min_=getattr(self._clf, 'min_', None), + scale_=getattr(self._clf, 'scale_', None), + data_min_=getattr(self._clf, 'data_min_', None), + data_max_=getattr(self._clf, 'data_max_', None), + data_range_=getattr(self._clf, 'data_range_', None), + n_samples_seen_=getattr(self._clf, 'n_samples_seen_', None), + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + def set_params(self, *, params: Params) -> None: + self._clf.min_ = params['min_'] + self._clf.scale_ = params['scale_'] + self._clf.data_min_ = params['data_min_'] + self._clf.data_max_ = params['data_max_'] + self._clf.data_range_ = params['data_range_'] + self._clf.n_samples_seen_ = params['n_samples_seen_'] + self._input_column_names = params['input_column_names'] + self._training_indices = params['training_indices_'] + self._target_names = params['target_names_'] + self._target_column_indices = params['target_column_indices_'] + self._target_columns_metadata = params['target_columns_metadata_'] + + if params['min_'] is not None: + self._fitted = True + if params['scale_'] is not None: + self._fitted = True + if params['data_min_'] is not None: + self._fitted = True + if params['data_max_'] is not None: + self._fitted = True + if params['data_range_'] is not None: + self._fitted = True + if params['n_samples_seen_'] is not None: + self._fitted = True + + + + + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=hyperparams['use_columns'], + exclude_columns=hyperparams['exclude_columns'], + can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], 
columns_to_produce + # return columns_to_produce + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, numpy.integer, numpy.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + + @classmethod + def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]: + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) + + # Update semantic types and prepare it for predicted targets. + semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set([]) + add_semantic_types = [] + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + outputs = d3m_dataframe(predictions, generate_metadata=True) + target_columns_metadata = self._copy_inputs_metadata(inputs.metadata, self._training_indices, outputs.metadata, self.hyperparams) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, target_columns_metadata) + return outputs + + + @classmethod + def _copy_inputs_metadata(cls, inputs_metadata: metadata_base.DataMetadata, input_indices: List[int], + outputs_metadata: metadata_base.DataMetadata, hyperparams): + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + target_columns_metadata: List[OrderedDict] = [] + for column_index in input_indices: + column_name = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name") + if column_name is None: + column_name = "output_{}".format(column_index) + + column_metadata = OrderedDict(inputs_metadata.query_column(column_index)) + semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set([]) + add_semantic_types = set() + 
add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + # If outputs has more columns than index, add Attribute Type to all remaining + if outputs_length > len(input_indices): + for column_index in range(len(input_indices), outputs_length): + column_metadata = OrderedDict() + semantic_types = set() + semantic_types.add(hyperparams["return_semantic_type"]) + column_name = "output_{}".format(column_index) + column_metadata["semantic_types"] = list(semantic_types) + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + +SKMinMaxScaler.__doc__ = MinMaxScaler.__doc__ \ No newline at end of file diff --git a/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKMissingIndicator.py b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKMissingIndicator.py new file mode 100644 index 0000000..929389f --- /dev/null +++ b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKMissingIndicator.py @@ -0,0 +1,373 @@ +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os +import sklearn +import numpy +import typing + +# Custom import commands if any +from sklearn.impute import MissingIndicator +from sklearn.impute._base import _get_mask + + +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult, DockerContainer +from d3m.primitive_interfaces.unsupervised_learning import UnsupervisedLearnerPrimitiveBase + + +Inputs = d3m_dataframe +Outputs = d3m_dataframe + + +class Params(params.Params): + features_: Optional[ndarray] + _n_features: Optional[int] + input_column_names: Optional[Any] + target_names_: Optional[Sequence[Any]] + training_indices_: Optional[Sequence[int]] + target_column_indices_: Optional[Sequence[int]] + target_columns_metadata_: Optional[List[OrderedDict]] + + + +class Hyperparams(hyperparams.Hyperparams): + missing_values = hyperparams.Union( + configuration=OrderedDict({ + 'int': hyperparams.Hyperparameter[int]( + default=0, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'np.nan': hyperparams.Hyperparameter[float]( + default=numpy.nan, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='np.nan', + description='The placeholder for the missing values. All occurrences of `missing_values` will be indicated (True in the output array), the other values will be marked as False.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + features = hyperparams.Enumeration[str]( + values=['missing-only', 'all'], + default='missing-only', + description='Whether the imputer mask should represent all or a subset of features. - If "missing-only" (default), the imputer mask will only represent features containing missing values during fit time. 
- If "all", the imputer mask will represent all features.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + error_on_new = hyperparams.UniformBool( + default=True, + description='If True (default), transform will raise an error when there are features with missing values in transform that have no missing values in fit. This is applicable only when ``features="missing-only"``.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + use_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to operate on. If any specified column cannot be parsed, it is skipped.", + ) + exclude_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='new', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. 
To prevent pipelines from breaking set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute'], + default='https://metadata.datadrivendiscovery.org/types/Attribute', + description='Decides what semantic type to attach to generated attributes', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + +class SKMissingIndicator(UnsupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]): + """ + Primitive wrapping for sklearn MissingIndicator + `sklearn documentation `_ + + """ + + __author__ = "JPL MARVIN" + metadata = metadata_base.PrimitiveMetadata({ + "algorithm_types": [metadata_base.PrimitiveAlgorithmType.IMPUTATION, ], + "name": "sklearn.impute.MissingIndicator", + "primitive_family": metadata_base.PrimitiveFamily.DATA_CLEANING, + "python_path": "d3m.primitives.data_cleaning.missing_indicator.SKlearn", + "source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.impute.MissingIndicator.html']}, + "version": "2019.11.13", + "id": "94c5c918-9ad5-3496-8e52-2359056e0120", + 'installation': [ + {'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + }] + }) + + def __init__(self, *, + hyperparams: Hyperparams, + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None) -> None: + + super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) + + # False + self._clf = MissingIndicator( + missing_values=self.hyperparams['missing_values'], + features=self.hyperparams['features'], + error_on_new=self.hyperparams['error_on_new'], + ) + + self._inputs = None + self._outputs = None + self._training_inputs = None + self._training_outputs = None + self._target_names = None + self._training_indices = None + self._target_column_indices = None + self._target_columns_metadata: List[OrderedDict] = None + self._input_column_names = None + self._fitted = False + + + def set_training_data(self, *, inputs: Inputs) -> None: + self._inputs = inputs + self._fitted = False + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + if self._fitted: + return CallResult(None) + + self._training_inputs, self._training_indices, _ = self._get_columns_to_fit(self._inputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + if self._training_inputs is None: + return CallResult(None) + + if len(self._training_indices) > 0: + self._clf.fit(self._training_inputs) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + return CallResult(None) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + sk_inputs, columns_to_use, _ = self._get_columns_to_fit(inputs, self.hyperparams) + output = [] + if len(sk_inputs.columns): + try: + sk_output = self._clf.transform(sk_inputs) + except sklearn.exceptions.NotFittedError as error: + raise PrimitiveNotFittedError("Primitive not fitted.") from error + if 
sparse.issparse(sk_output): + sk_output = sk_output.toarray() + target_columns_metadata = self._copy_columns_metadata(inputs.metadata, self._training_indices, self.hyperparams) + output = self._wrap_predictions(inputs, sk_output, target_columns_metadata) + + output.columns = [inputs.columns[idx] for idx in range(len(inputs.columns)) if idx in self._training_indices] + output = [output] + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + _, _, dropped_cols = self._get_columns_to_fit(inputs, self.hyperparams) + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._training_indices + dropped_cols, + columns_list=output) + return CallResult(outputs) + + + def get_params(self) -> Params: + if not self._fitted: + return Params( + features_=None, + _n_features=None, + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + return Params( + features_=getattr(self._clf, 'features_', None), + _n_features=getattr(self._clf, '_n_features', None), + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + def set_params(self, *, params: Params) -> None: + self._clf.features_ = params['features_'] + self._clf._n_features = params['_n_features'] + self._input_column_names = params['input_column_names'] + self._training_indices = params['training_indices_'] + self._target_names = params['target_names_'] + self._target_column_indices = params['target_column_indices_'] + self._target_columns_metadata = params['target_columns_metadata_'] + + if params['features_'] is not None: + self._fitted = True + if params['_n_features'] is not None: + self._fitted = True + + + + + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + + if not hyperparams['use_semantic_types']: + columns_to_produce = list(range(len(inputs.columns))) + + else: + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=hyperparams['use_columns'], + exclude_columns=hyperparams['exclude_columns'], + can_use_column=can_produce_column) + + columns_to_drop = cls._get_columns_to_drop(inputs, columns_to_produce, hyperparams) + for col in columns_to_drop: + columns_to_produce.remove(col) + + return inputs.iloc[:, columns_to_produce], columns_to_produce, columns_to_drop + + @classmethod + def _get_columns_to_drop(cls, inputs: Inputs, column_indices: List[int], hyperparams: Hyperparams): + """ + Check for columns that contain missing_values that need to be imputed + If strategy is constant and missin_values is nan, then all nan columns will not be dropped + :param inputs: + :param column_indices: + :return: + """ + columns_to_remove = [] + if hyperparams['features'] == "missing-only": + for _, col in enumerate(column_indices): + inp = inputs.iloc[:, [col]].values + mask = _get_mask(inp, 
hyperparams['missing_values']) + if not mask.any(): + columns_to_remove.append(col) + return columns_to_remove + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, numpy.integer, numpy.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + + @classmethod + def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]: + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) + + # Update semantic types and prepare it for predicted targets. + semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set([]) + add_semantic_types = [] + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray, target_columns_metadata) -> Outputs: + outputs = d3m_dataframe(predictions, generate_metadata=False) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, target_columns_metadata) + return outputs + + + + @classmethod + def _copy_columns_metadata(cls, inputs_metadata: metadata_base.DataMetadata, column_indices, hyperparams) -> List[OrderedDict]: + outputs_length = inputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in column_indices: + column_name = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name") + column_metadata = OrderedDict(inputs_metadata.query_column(column_index)) + semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set([]) + add_semantic_types = set() + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) 
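+                # D3M column metadata stores semantic types as a plain list, so the set assembled above is converted back before it is written into the column metadata.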
+ column_metadata['semantic_types'] = list(semantic_types) + + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + +SKMissingIndicator.__doc__ = MissingIndicator.__doc__ \ No newline at end of file diff --git a/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKMultinomialNB.py b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKMultinomialNB.py new file mode 100644 index 0000000..b429050 --- /dev/null +++ b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKMultinomialNB.py @@ -0,0 +1,488 @@ +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os +import sklearn +import numpy +import typing + +# Custom import commands if any +from sklearn.naive_bayes import MultinomialNB + + +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult, DockerContainer + +from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase +from d3m.primitive_interfaces.base import ProbabilisticCompositionalityMixin, ContinueFitMixin +from d3m import exceptions +import pandas + + + +Inputs = d3m_dataframe +Outputs = d3m_dataframe + + +class Params(params.Params): + class_log_prior_: Optional[ndarray] + feature_log_prob_: Optional[ndarray] + class_count_: Optional[ndarray] + feature_count_: Optional[ndarray] + classes_: Optional[ndarray] + input_column_names: Optional[Any] + target_names_: Optional[Sequence[Any]] + training_indices_: Optional[Sequence[int]] + target_column_indices_: Optional[Sequence[int]] + target_columns_metadata_: Optional[List[OrderedDict]] + + + +class Hyperparams(hyperparams.Hyperparams): + alpha = hyperparams.Hyperparameter[float]( + default=1, + description='Additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing).', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + fit_prior = hyperparams.UniformBool( + default=True, + description='Whether to learn class prior probabilities or not. If false, a uniform prior will be used.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + use_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to use as training input. If any specified column cannot be parsed, it is skipped.", + ) + use_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to use as training target. If any specified column cannot be parsed, it is skipped.", + ) + exclude_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not use as training inputs. 
Applicable only if \"use_columns\" is not provided.", + ) + exclude_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not use as training target. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='new', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute', 'https://metadata.datadrivendiscovery.org/types/PredictedTarget'], + default='https://metadata.datadrivendiscovery.org/types/PredictedTarget', + description='Decides what semantic type to attach to generated output', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + +class SKMultinomialNB(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams], + ProbabilisticCompositionalityMixin[Inputs, Outputs, Params, Hyperparams], + ContinueFitMixin[Inputs, Outputs, Params, Hyperparams]): + """ + Primitive wrapping for sklearn MultinomialNB + `sklearn documentation `_ + + """ + + __author__ = "JPL MARVIN" + metadata = metadata_base.PrimitiveMetadata({ + "algorithm_types": [metadata_base.PrimitiveAlgorithmType.NAIVE_BAYES_CLASSIFIER, ], + "name": "sklearn.naive_bayes.MultinomialNB", + "primitive_family": metadata_base.PrimitiveFamily.CLASSIFICATION, + "python_path": "d3m.primitives.classification.multinomial_naive_bayes.SKlearn", + "source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html']}, + "version": "2019.11.13", + "id": "adf13b4b-9fe5-38a2-a1ea-d1b1cc342576", + "hyperparams_to_tune": ['alpha', 'fit_prior'], + 'installation': [ + {'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + }] 
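+        # The installation entry above pins sklearn_wrap to the git commit of the checkout this module was loaded from, so the same wrapper version can be reinstalled when a pipeline is re-run.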
+ }) + + def __init__(self, *, + hyperparams: Hyperparams, + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None) -> None: + + super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) + + # False + self._clf = MultinomialNB( + alpha=self.hyperparams['alpha'], + fit_prior=self.hyperparams['fit_prior'], + ) + + self._inputs = None + self._outputs = None + self._training_inputs = None + self._training_outputs = None + self._target_names = None + self._training_indices = None + self._target_column_indices = None + self._target_columns_metadata: List[OrderedDict] = None + self._input_column_names = None + self._fitted = False + self._new_training_data = False + + def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None: + self._inputs = inputs + self._outputs = outputs + self._fitted = False + self._new_training_data = True + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + if self._inputs is None or self._outputs is None: + raise ValueError("Missing training data.") + + if not self._new_training_data: + return CallResult(None) + self._new_training_data = False + + self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams) + self._training_outputs, self._target_names, self._target_column_indices = self._get_targets(self._outputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + if len(self._training_indices) > 0 and len(self._target_column_indices) > 0: + self._target_columns_metadata = self._get_target_columns_metadata(self._training_outputs.metadata, self.hyperparams) + sk_training_output = self._training_outputs.values + + shape = sk_training_output.shape + if len(shape) == 2 and shape[1] == 1: + sk_training_output = numpy.ravel(sk_training_output) + + self._clf.fit(self._training_inputs, sk_training_output) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + return CallResult(None) + + def continue_fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + if self._training_inputs is None or self._training_outputs is None: + raise ValueError("Missing training data.") + + if not self._new_training_data: + return CallResult(None) + self._new_training_data = False + + if len(self._training_indices) > 0 and len(self._target_column_indices) > 0: + self._target_columns_metadata = self._get_target_columns_metadata(self._training_outputs.metadata, self.hyperparams) + sk_training_output = self._training_outputs.values + + shape = sk_training_output.shape + if len(shape) == 2 and shape[1] == 1: + sk_training_output = numpy.ravel(sk_training_output) + + self._clf.partial_fit(self._training_inputs, sk_training_output) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + return CallResult(None) + + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + sk_inputs, columns_to_use = self._get_columns_to_fit(inputs, self.hyperparams) + output = [] + if len(sk_inputs.columns): + try: + sk_output = self._clf.predict(sk_inputs) + except sklearn.exceptions.NotFittedError as error: + raise PrimitiveNotFittedError("Primitive not fitted.") from error + # For primitives 
that allow predicting without fitting like GaussianProcessRegressor + if not self._fitted: + raise PrimitiveNotFittedError("Primitive not fitted.") + if sparse.issparse(sk_output): + sk_output = sk_output.toarray() + output = self._wrap_predictions(inputs, sk_output) + output.columns = self._target_names + output = [output] + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._target_column_indices, + columns_list=output) + + return CallResult(outputs) + + + def get_params(self) -> Params: + if not self._fitted: + return Params( + class_log_prior_=None, + feature_log_prob_=None, + class_count_=None, + feature_count_=None, + classes_=None, + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + return Params( + class_log_prior_=getattr(self._clf, 'class_log_prior_', None), + feature_log_prob_=getattr(self._clf, 'feature_log_prob_', None), + class_count_=getattr(self._clf, 'class_count_', None), + feature_count_=getattr(self._clf, 'feature_count_', None), + classes_=getattr(self._clf, 'classes_', None), + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + def set_params(self, *, params: Params) -> None: + self._clf.class_log_prior_ = params['class_log_prior_'] + self._clf.feature_log_prob_ = params['feature_log_prob_'] + self._clf.class_count_ = params['class_count_'] + self._clf.feature_count_ = params['feature_count_'] + self._clf.classes_ = params['classes_'] + self._input_column_names = params['input_column_names'] + self._training_indices = params['training_indices_'] + self._target_names = params['target_names_'] + self._target_column_indices = params['target_column_indices_'] + self._target_columns_metadata = params['target_columns_metadata_'] + + if params['class_log_prior_'] is not None: + self._fitted = True + if params['feature_log_prob_'] is not None: + self._fitted = True + if params['class_count_'] is not None: + self._fitted = True + if params['feature_count_'] is not None: + self._fitted = True + if params['classes_'] is not None: + self._fitted = True + + + def log_likelihoods(self, *, + outputs: Outputs, + inputs: Inputs, + timeout: float = None, + iterations: int = None) -> CallResult[Sequence[float]]: + inputs = inputs.iloc[:, self._training_indices] # Get ndarray + outputs = outputs.iloc[:, self._target_column_indices] + + if len(inputs.columns) and len(outputs.columns): + + if outputs.shape[1] != self._clf.n_outputs_: + raise exceptions.InvalidArgumentValueError("\"outputs\" argument does not have the correct number of target columns.") + + log_proba = self._clf.predict_log_proba(inputs) + + # Making it always a list, even when only one target. 
+ if self._clf.n_outputs_ == 1: + log_proba = [log_proba] + classes = [self._clf.classes_] + else: + classes = self._clf.classes_ + + samples_length = inputs.shape[0] + + log_likelihoods = [] + for k in range(self._clf.n_outputs_): + # We have to map each class to its internal (numerical) index used in the learner. + # This allows "outputs" to contain string classes. + outputs_column = outputs.iloc[:, k] + classes_map = pandas.Series(numpy.arange(len(classes[k])), index=classes[k]) + mapped_outputs_column = outputs_column.map(classes_map) + + # For each target column (column in "outputs"), for each sample (row) we pick the log + # likelihood for a given class. + log_likelihoods.append(log_proba[k][numpy.arange(samples_length), mapped_outputs_column]) + + results = d3m_dataframe(dict(enumerate(log_likelihoods)), generate_metadata=True) + results.columns = outputs.columns + + for k in range(self._clf.n_outputs_): + column_metadata = outputs.metadata.query_column(k) + if 'name' in column_metadata: + results.metadata = results.metadata.update_column(k, {'name': column_metadata['name']}) + + else: + results = d3m_dataframe(generate_metadata=True) + + return CallResult(results) + + + + + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=hyperparams['use_inputs_columns'], + exclude_columns=hyperparams['exclude_inputs_columns'], + can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, numpy.integer, numpy.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + @classmethod + def _get_targets(cls, data: d3m_dataframe, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return data, list(data.columns), list(range(len(data.columns))) + + metadata = data.metadata + + def can_produce_column(column_index: int) -> bool: + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/TrueTarget") + column_metadata = metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + semantic_types = set(column_metadata.get('semantic_types', [])) + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - 
semantic_types) == 0: + return True + return False + + target_column_indices, target_columns_not_to_produce = base_utils.get_columns_to_use(metadata, + use_columns=hyperparams[ + 'use_outputs_columns'], + exclude_columns= + hyperparams[ + 'exclude_outputs_columns'], + can_use_column=can_produce_column) + targets = [] + if target_column_indices: + targets = data.select_columns(target_column_indices) + target_column_names = [] + for idx in target_column_indices: + target_column_names.append(data.columns[idx]) + return targets, target_column_names, target_column_indices + + @classmethod + def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]: + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) + + # Update semantic types and prepare it for predicted targets. + semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set(["https://metadata.datadrivendiscovery.org/types/TrueTarget","https://metadata.datadrivendiscovery.org/types/SuggestedTarget",]) + add_semantic_types = set(["https://metadata.datadrivendiscovery.org/types/PredictedTarget",]) + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + outputs = d3m_dataframe(predictions, generate_metadata=False) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, self._target_columns_metadata) + return outputs + + + @classmethod + def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata): + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict() + semantic_types = [] + semantic_types.append('https://metadata.datadrivendiscovery.org/types/PredictedTarget') + column_name = outputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name") + if column_name is None: + column_name = "output_{}".format(column_index) + column_metadata["semantic_types"] = semantic_types + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + +SKMultinomialNB.__doc__ = MultinomialNB.__doc__ \ No newline at end of file diff --git a/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKNearestCentroid.py b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKNearestCentroid.py new file mode 100644 index 
0000000..62bc158 --- /dev/null +++ b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKNearestCentroid.py @@ -0,0 +1,408 @@ +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os +import sklearn +import numpy +import typing + +# Custom import commands if any +from sklearn.neighbors.nearest_centroid import NearestCentroid + + +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult, DockerContainer + +from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase +from d3m.primitive_interfaces.base import ProbabilisticCompositionalityMixin, ContinueFitMixin +from d3m import exceptions +import pandas + + + +Inputs = d3m_dataframe +Outputs = d3m_dataframe + + +class Params(params.Params): + centroids_: Optional[ndarray] + classes_: Optional[ndarray] + input_column_names: Optional[Any] + target_names_: Optional[Sequence[Any]] + training_indices_: Optional[Sequence[int]] + target_column_indices_: Optional[Sequence[int]] + target_columns_metadata_: Optional[List[OrderedDict]] + + + +class Hyperparams(hyperparams.Hyperparams): + metric = hyperparams.Enumeration[str]( + default='euclidean', + values=['euclidean', 'manhattan'], + description='The metric to use when calculating distance between instances in a feature array. If metric is a string or callable, it must be one of the options allowed by metrics.pairwise.pairwise_distances for its metric parameter. The centroids for the samples corresponding to each class is the point from which the sum of the distances (according to the metric) of all samples that belong to that particular class are minimized. If the "manhattan" metric is provided, this centroid is the median and for all other metrics, the centroid is now set to be the mean.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + shrink_threshold = hyperparams.Union( + configuration=OrderedDict({ + 'float': hyperparams.Bounded[float]( + lower=0, + upper=None, + default=0, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'none': hyperparams.Constant( + default=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='none', + description='Threshold for shrinking centroids to remove features.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + use_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to use as training input. If any specified column cannot be parsed, it is skipped.", + ) + use_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to use as training target. 
If any specified column cannot be parsed, it is skipped.", + ) + exclude_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not use as training inputs. Applicable only if \"use_columns\" is not provided.", + ) + exclude_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not use as training target. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='new', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. 
To prevent pipelines from breaking set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute', 'https://metadata.datadrivendiscovery.org/types/PredictedTarget'], + default='https://metadata.datadrivendiscovery.org/types/PredictedTarget', + description='Decides what semantic type to attach to generated output', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + +class SKNearestCentroid(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]): + """ + Primitive wrapping for sklearn NearestCentroid + `sklearn documentation `_ + + """ + + __author__ = "JPL MARVIN" + metadata = metadata_base.PrimitiveMetadata({ + "algorithm_types": [metadata_base.PrimitiveAlgorithmType.NEAREST_CENTROID_CLASSIFIER, ], + "name": "sklearn.neighbors.nearest_centroid.NearestCentroid", + "primitive_family": metadata_base.PrimitiveFamily.CLASSIFICATION, + "python_path": "d3m.primitives.classification.nearest_centroid.SKlearn", + "source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.NearestCentroid.html']}, + "version": "2019.11.13", + "id": "90e7b335-5af0-35ad-932c-9c771fe84693", + 'installation': [ + {'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + }] + }) + + def __init__(self, *, + hyperparams: Hyperparams, + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None) -> None: + + super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) + + # False + self._clf = NearestCentroid( + metric=self.hyperparams['metric'], + shrink_threshold=self.hyperparams['shrink_threshold'], + ) + + self._inputs = None + self._outputs = None + self._training_inputs = None + self._training_outputs = None + self._target_names = None + self._training_indices = None + self._target_column_indices = None + self._target_columns_metadata: List[OrderedDict] = None + self._input_column_names = None + self._fitted = False + self._new_training_data = False + + def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None: + self._inputs = inputs + self._outputs = outputs + self._fitted = False + self._new_training_data = True + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + if self._inputs is None or self._outputs is None: + raise ValueError("Missing training data.") + + if not self._new_training_data: + return CallResult(None) + self._new_training_data = False + + self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams) + self._training_outputs, self._target_names, self._target_column_indices = self._get_targets(self._outputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + if len(self._training_indices) > 0 and len(self._target_column_indices) > 0: + self._target_columns_metadata = self._get_target_columns_metadata(self._training_outputs.metadata, self.hyperparams) + sk_training_output = self._training_outputs.values + + shape = sk_training_output.shape + if len(shape) == 2 and shape[1] == 1: + 
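+                # A single target column comes out of the DataFrame with shape (n_samples, 1); ravel it to 1-D, the shape sklearn estimators expect for y.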
sk_training_output = numpy.ravel(sk_training_output) + + self._clf.fit(self._training_inputs, sk_training_output) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + return CallResult(None) + + + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + sk_inputs, columns_to_use = self._get_columns_to_fit(inputs, self.hyperparams) + output = [] + if len(sk_inputs.columns): + try: + sk_output = self._clf.predict(sk_inputs) + except sklearn.exceptions.NotFittedError as error: + raise PrimitiveNotFittedError("Primitive not fitted.") from error + # For primitives that allow predicting without fitting like GaussianProcessRegressor + if not self._fitted: + raise PrimitiveNotFittedError("Primitive not fitted.") + if sparse.issparse(sk_output): + sk_output = sk_output.toarray() + output = self._wrap_predictions(inputs, sk_output) + output.columns = self._target_names + output = [output] + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._target_column_indices, + columns_list=output) + + return CallResult(outputs) + + + def get_params(self) -> Params: + if not self._fitted: + return Params( + centroids_=None, + classes_=None, + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + return Params( + centroids_=getattr(self._clf, 'centroids_', None), + classes_=getattr(self._clf, 'classes_', None), + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + def set_params(self, *, params: Params) -> None: + self._clf.centroids_ = params['centroids_'] + self._clf.classes_ = params['classes_'] + self._input_column_names = params['input_column_names'] + self._training_indices = params['training_indices_'] + self._target_names = params['target_names_'] + self._target_column_indices = params['target_column_indices_'] + self._target_columns_metadata = params['target_columns_metadata_'] + + if params['centroids_'] is not None: + self._fitted = True + if params['classes_'] is not None: + self._fitted = True + + + + + + + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=hyperparams['use_inputs_columns'], + exclude_columns=hyperparams['exclude_inputs_columns'], + can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + @classmethod + def _can_produce_column(cls, inputs_metadata: 
metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, numpy.integer, numpy.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + @classmethod + def _get_targets(cls, data: d3m_dataframe, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return data, list(data.columns), list(range(len(data.columns))) + + metadata = data.metadata + + def can_produce_column(column_index: int) -> bool: + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/TrueTarget") + column_metadata = metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + semantic_types = set(column_metadata.get('semantic_types', [])) + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + return False + + target_column_indices, target_columns_not_to_produce = base_utils.get_columns_to_use(metadata, + use_columns=hyperparams[ + 'use_outputs_columns'], + exclude_columns= + hyperparams[ + 'exclude_outputs_columns'], + can_use_column=can_produce_column) + targets = [] + if target_column_indices: + targets = data.select_columns(target_column_indices) + target_column_names = [] + for idx in target_column_indices: + target_column_names.append(data.columns[idx]) + return targets, target_column_names, target_column_indices + + @classmethod + def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]: + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) + + # Update semantic types and prepare it for predicted targets. 
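+ # Each target column keeps its existing semantic types except TrueTarget and
+ # SuggestedTarget, and gains PredictedTarget plus the type selected by the
+ # 'return_semantic_type' hyperparameter, so downstream primitives can
+ # recognize the column as a prediction.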
+ semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set(["https://metadata.datadrivendiscovery.org/types/TrueTarget","https://metadata.datadrivendiscovery.org/types/SuggestedTarget",]) + add_semantic_types = set(["https://metadata.datadrivendiscovery.org/types/PredictedTarget",]) + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + outputs = d3m_dataframe(predictions, generate_metadata=False) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, self._target_columns_metadata) + return outputs + + + @classmethod + def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata): + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict() + semantic_types = [] + semantic_types.append('https://metadata.datadrivendiscovery.org/types/PredictedTarget') + column_name = outputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name") + if column_name is None: + column_name = "output_{}".format(column_index) + column_metadata["semantic_types"] = semantic_types + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + +SKNearestCentroid.__doc__ = NearestCentroid.__doc__ \ No newline at end of file diff --git a/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKNormalizer.py b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKNormalizer.py new file mode 100644 index 0000000..b358b7c --- /dev/null +++ b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKNormalizer.py @@ -0,0 +1,329 @@ +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os +import sklearn +import numpy +import typing + +# Custom import commands if any +from sklearn.preprocessing.data import Normalizer + + +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult, DockerContainer +from d3m.primitive_interfaces.unsupervised_learning import UnsupervisedLearnerPrimitiveBase + + +Inputs = d3m_dataframe +Outputs = d3m_dataframe + + +class Params(params.Params): + input_column_names: Optional[Any] + target_names_: 
Optional[Sequence[Any]] + training_indices_: Optional[Sequence[int]] + target_column_indices_: Optional[Sequence[int]] + target_columns_metadata_: Optional[List[OrderedDict]] + + + +class Hyperparams(hyperparams.Hyperparams): + norm = hyperparams.Enumeration[str]( + default='l2', + values=['l1', 'l2', 'max'], + description='The norm to use to normalize each non zero sample.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + use_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to operate on. If any specified column cannot be parsed, it is skipped.", + ) + exclude_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='new', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. 
To prevent pipelines from breaking set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute'], + default='https://metadata.datadrivendiscovery.org/types/Attribute', + description='Decides what semantic type to attach to generated attributes', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + +class SKNormalizer(UnsupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]): + """ + Primitive wrapping for sklearn Normalizer + `sklearn documentation `_ + + """ + + __author__ = "JPL MARVIN" + metadata = metadata_base.PrimitiveMetadata({ + "algorithm_types": [metadata_base.PrimitiveAlgorithmType.DATA_NORMALIZATION, ], + "name": "sklearn.preprocessing.data.Normalizer", + "primitive_family": metadata_base.PrimitiveFamily.DATA_PREPROCESSING, + "python_path": "d3m.primitives.data_preprocessing.normalizer.SKlearn", + "source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.Normalizer.html']}, + "version": "2019.11.13", + "id": "980b3a2d-1574-31f3-8326-ddc62f8fc2c3", + "hyperparams_to_tune": ['norm'], + 'installation': [ + {'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + }] + }) + + def __init__(self, *, + hyperparams: Hyperparams, + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None) -> None: + + super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) + + # False + self._clf = Normalizer( + norm=self.hyperparams['norm'], + ) + + self._inputs = None + self._outputs = None + self._training_inputs = None + self._training_outputs = None + self._target_names = None + self._training_indices = None + self._target_column_indices = None + self._target_columns_metadata: List[OrderedDict] = None + self._input_column_names = None + self._fitted = False + + + def set_training_data(self, *, inputs: Inputs) -> None: + self._inputs = inputs + self._fitted = False + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + if self._fitted: + return CallResult(None) + + self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + if self._training_inputs is None: + return CallResult(None) + + if len(self._training_indices) > 0: + self._clf.fit(self._training_inputs) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + return CallResult(None) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + if not self._fitted: + raise PrimitiveNotFittedError("Primitive not fitted.") + sk_inputs = inputs + if self.hyperparams['use_semantic_types']: + sk_inputs = inputs.iloc[:, self._training_indices] + output_columns = [] + if len(self._training_indices) > 0: + sk_output = self._clf.transform(sk_inputs) + if sparse.issparse(sk_output): + sk_output = sk_output.toarray() + outputs = 
self._wrap_predictions(inputs, sk_output) + if len(outputs.columns) == len(self._input_column_names): + outputs.columns = self._input_column_names + output_columns = [outputs] + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._training_indices, + columns_list=output_columns) + return CallResult(outputs) + + + def get_params(self) -> Params: + if not self._fitted: + return Params( + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + return Params( + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + def set_params(self, *, params: Params) -> None: + self._input_column_names = params['input_column_names'] + self._training_indices = params['training_indices_'] + self._target_names = params['target_names_'] + self._target_column_indices = params['target_column_indices_'] + self._target_columns_metadata = params['target_columns_metadata_'] + self._fitted = True + + + + + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=hyperparams['use_columns'], + exclude_columns=hyperparams['exclude_columns'], + can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, numpy.integer, numpy.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + + @classmethod + def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]: + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) + + # Update semantic types and 
prepare it for predicted targets. + semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set([]) + add_semantic_types = [] + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + outputs = d3m_dataframe(predictions, generate_metadata=True) + target_columns_metadata = self._copy_inputs_metadata(inputs.metadata, self._training_indices, outputs.metadata, self.hyperparams) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, target_columns_metadata) + return outputs + + + @classmethod + def _copy_inputs_metadata(cls, inputs_metadata: metadata_base.DataMetadata, input_indices: List[int], + outputs_metadata: metadata_base.DataMetadata, hyperparams): + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + target_columns_metadata: List[OrderedDict] = [] + for column_index in input_indices: + column_name = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name") + if column_name is None: + column_name = "output_{}".format(column_index) + + column_metadata = OrderedDict(inputs_metadata.query_column(column_index)) + semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set([]) + add_semantic_types = set() + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + # If outputs has more columns than index, add Attribute Type to all remaining + if outputs_length > len(input_indices): + for column_index in range(len(input_indices), outputs_length): + column_metadata = OrderedDict() + semantic_types = set() + semantic_types.add(hyperparams["return_semantic_type"]) + column_name = "output_{}".format(column_index) + column_metadata["semantic_types"] = list(semantic_types) + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + +SKNormalizer.__doc__ = Normalizer.__doc__ \ No newline at end of file diff --git a/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKNystroem.py b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKNystroem.py new file mode 100644 index 0000000..b92c92f --- /dev/null +++ b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKNystroem.py @@ -0,0 +1,522 @@ +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple +from numpy import ndarray +from collections import OrderedDict +from scipy 
import sparse +import os +import sklearn +import numpy +import typing + +# Custom import commands if any +from sklearn.kernel_approximation import Nystroem + + +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult, DockerContainer +from d3m.primitive_interfaces.unsupervised_learning import UnsupervisedLearnerPrimitiveBase + + +Inputs = d3m_dataframe +Outputs = d3m_dataframe + + +class Params(params.Params): + components_: Optional[ndarray] + component_indices_: Optional[ndarray] + normalization_: Optional[ndarray] + input_column_names: Optional[Any] + target_names_: Optional[Sequence[Any]] + training_indices_: Optional[Sequence[int]] + target_column_indices_: Optional[Sequence[int]] + target_columns_metadata_: Optional[List[OrderedDict]] + + + +class Hyperparams(hyperparams.Hyperparams): + kernel = hyperparams.Choice( + choices={ + 'rbf': hyperparams.Hyperparams.define( + configuration=OrderedDict({ + 'gamma': hyperparams.Union( + configuration=OrderedDict({ + 'float': hyperparams.Constant( + default=0.1, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'none': hyperparams.Constant( + default=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='none', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + }) + ), + 'laplacian': hyperparams.Hyperparams.define( + configuration=OrderedDict({ + 'gamma': hyperparams.Union( + configuration=OrderedDict({ + 'float': hyperparams.Constant( + default=0.1, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'none': hyperparams.Constant( + default=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='none', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + }) + ), + 'polynomial': hyperparams.Hyperparams.define( + configuration=OrderedDict({ + 'gamma': hyperparams.Union( + configuration=OrderedDict({ + 'float': hyperparams.Constant( + default=0.1, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'none': hyperparams.Constant( + default=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='none', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ), + 'coef0': hyperparams.Union( + configuration=OrderedDict({ + 'float': hyperparams.Constant( + default=1, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'none': hyperparams.Constant( + default=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='none', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ), + 'degree': hyperparams.Union( + configuration=OrderedDict({ + 'float': hyperparams.Constant( + default=1, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'none': hyperparams.Constant( + default=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='none', + 
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + }) + ), + 'exponential': hyperparams.Hyperparams.define( + configuration=OrderedDict({ + 'gamma': hyperparams.Union( + configuration=OrderedDict({ + 'float': hyperparams.Constant( + default=0.1, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'none': hyperparams.Constant( + default=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='none', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + }) + ), + 'chi2': hyperparams.Hyperparams.define( + configuration=OrderedDict({ + 'gamma': hyperparams.Union( + configuration=OrderedDict({ + 'float': hyperparams.Constant( + default=0.1, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'none': hyperparams.Constant( + default=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='none', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + }) + ), + 'sigmoid': hyperparams.Hyperparams.define( + configuration=OrderedDict({ + 'gamma': hyperparams.Union( + configuration=OrderedDict({ + 'float': hyperparams.Constant( + default=0.1, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'none': hyperparams.Constant( + default=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='none', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ), + 'coef0': hyperparams.Union( + configuration=OrderedDict({ + 'float': hyperparams.Constant( + default=1, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'none': hyperparams.Constant( + default=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='none', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + }) + ), + 'cosine': hyperparams.Hyperparams.define( + configuration=OrderedDict({}) + ), + 'poly': hyperparams.Hyperparams.define( + configuration=OrderedDict({}) + ), + 'linear': hyperparams.Hyperparams.define( + configuration=OrderedDict({}) + ), + 'additive_chi2': hyperparams.Hyperparams.define( + configuration=OrderedDict({}) + ) + }, + default='rbf', + description='Kernel map to be approximated. A callable should accept two arguments and the keyword arguments passed to this object as kernel_params, and should return a floating point number.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + n_components = hyperparams.Bounded[int]( + default=100, + lower=0, + upper=None, + description='Number of features to construct. How many data points will be used to construct the mapping.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + use_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to operate on. 
If any specified column cannot be parsed, it is skipped.", + ) + exclude_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='new', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute'], + default='https://metadata.datadrivendiscovery.org/types/Attribute', + description='Decides what semantic type to attach to generated attributes', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + +class SKNystroem(UnsupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]): + """ + Primitive wrapping for sklearn Nystroem + `sklearn documentation `_ + + """ + + __author__ = "JPL MARVIN" + metadata = metadata_base.PrimitiveMetadata({ + "algorithm_types": [metadata_base.PrimitiveAlgorithmType.KERNEL_METHOD, ], + "name": "sklearn.kernel_approximation.Nystroem", + "primitive_family": metadata_base.PrimitiveFamily.DATA_PREPROCESSING, + "python_path": "d3m.primitives.data_preprocessing.nystroem.SKlearn", + "source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.kernel_approximation.Nystroem.html']}, + "version": "2019.11.13", + "id": "ca3a4357-a49f-31f0-82ed-244b66e29426", + "hyperparams_to_tune": ['kernel'], + 'installation': [ + {'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + }] + }) + + def __init__(self, *, + hyperparams: Hyperparams, + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None) -> None: + + super().__init__(hyperparams=hyperparams, random_seed=random_seed, 
docker_containers=docker_containers) + + # False + self._clf = Nystroem( + kernel=self.hyperparams['kernel']['choice'], + degree=self.hyperparams['kernel'].get('degree', 'none'), + gamma=self.hyperparams['kernel'].get('gamma', 'none'), + coef0=self.hyperparams['kernel'].get('coef0', 'none'), + n_components=self.hyperparams['n_components'], + random_state=self.random_seed, + ) + + self._inputs = None + self._outputs = None + self._training_inputs = None + self._training_outputs = None + self._target_names = None + self._training_indices = None + self._target_column_indices = None + self._target_columns_metadata: List[OrderedDict] = None + self._input_column_names = None + self._fitted = False + + + def set_training_data(self, *, inputs: Inputs) -> None: + self._inputs = inputs + self._fitted = False + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + if self._fitted: + return CallResult(None) + + self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + if self._training_inputs is None: + return CallResult(None) + + if len(self._training_indices) > 0: + self._clf.fit(self._training_inputs) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + return CallResult(None) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + if not self._fitted: + raise PrimitiveNotFittedError("Primitive not fitted.") + sk_inputs = inputs + if self.hyperparams['use_semantic_types']: + sk_inputs = inputs.iloc[:, self._training_indices] + output_columns = [] + if len(self._training_indices) > 0: + sk_output = self._clf.transform(sk_inputs) + if sparse.issparse(sk_output): + sk_output = sk_output.toarray() + outputs = self._wrap_predictions(inputs, sk_output) + if len(outputs.columns) == len(self._input_column_names): + outputs.columns = self._input_column_names + output_columns = [outputs] + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._training_indices, + columns_list=output_columns) + return CallResult(outputs) + + + def get_params(self) -> Params: + if not self._fitted: + return Params( + components_=None, + component_indices_=None, + normalization_=None, + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + return Params( + components_=getattr(self._clf, 'components_', None), + component_indices_=getattr(self._clf, 'component_indices_', None), + normalization_=getattr(self._clf, 'normalization_', None), + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + def set_params(self, *, params: Params) -> None: + self._clf.components_ = params['components_'] + self._clf.component_indices_ = 
params['component_indices_'] + self._clf.normalization_ = params['normalization_'] + self._input_column_names = params['input_column_names'] + self._training_indices = params['training_indices_'] + self._target_names = params['target_names_'] + self._target_column_indices = params['target_column_indices_'] + self._target_columns_metadata = params['target_columns_metadata_'] + + if params['components_'] is not None: + self._fitted = True + if params['component_indices_'] is not None: + self._fitted = True + if params['normalization_'] is not None: + self._fitted = True + + + + + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=hyperparams['use_columns'], + exclude_columns=hyperparams['exclude_columns'], + can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, numpy.integer, numpy.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + + @classmethod + def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]: + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) + + # Update semantic types and prepare it for predicted targets. 
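+ # Intended to tag generated output columns with the type chosen by the
+ # 'return_semantic_type' hyperparameter (Attribute or ConstructedAttribute)
+ # on top of whatever semantic types the column already carries.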
+ semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set([]) + add_semantic_types = [] + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + outputs = d3m_dataframe(predictions, generate_metadata=True) + target_columns_metadata = self._copy_inputs_metadata(inputs.metadata, self._training_indices, outputs.metadata, self.hyperparams) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, target_columns_metadata) + return outputs + + + @classmethod + def _copy_inputs_metadata(cls, inputs_metadata: metadata_base.DataMetadata, input_indices: List[int], + outputs_metadata: metadata_base.DataMetadata, hyperparams): + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + target_columns_metadata: List[OrderedDict] = [] + for column_index in input_indices: + column_name = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name") + if column_name is None: + column_name = "output_{}".format(column_index) + + column_metadata = OrderedDict(inputs_metadata.query_column(column_index)) + semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set([]) + add_semantic_types = set() + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + # If outputs has more columns than index, add Attribute Type to all remaining + if outputs_length > len(input_indices): + for column_index in range(len(input_indices), outputs_length): + column_metadata = OrderedDict() + semantic_types = set() + semantic_types.add(hyperparams["return_semantic_type"]) + column_name = "output_{}".format(column_index) + column_metadata["semantic_types"] = list(semantic_types) + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + +SKNystroem.__doc__ = Nystroem.__doc__ \ No newline at end of file diff --git a/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKOneHotEncoder.py b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKOneHotEncoder.py new file mode 100644 index 0000000..536c585 --- /dev/null +++ b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKOneHotEncoder.py @@ -0,0 +1,420 @@ +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import 
os +import sklearn +import numpy +import typing + +# Custom import commands if any +from sklearn.preprocessing.data import OneHotEncoder +from numpy import float as npfloat + + +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult, DockerContainer +from d3m.primitive_interfaces.unsupervised_learning import UnsupervisedLearnerPrimitiveBase + + +Inputs = d3m_dataframe +Outputs = d3m_dataframe + + +class Params(params.Params): + _active_features_: Optional[ndarray] + _categorical_features: Optional[Union[str, ndarray]] + _categories: Optional[Sequence[Any]] + _feature_indices_: Optional[ndarray] + _legacy_mode: Optional[bool] + _n_values_: Optional[ndarray] + _n_values: Optional[Union[str, ndarray]] + categories_: Optional[Sequence[Any]] + input_column_names: Optional[Any] + target_names_: Optional[Sequence[Any]] + training_indices_: Optional[Sequence[int]] + target_column_indices_: Optional[Sequence[int]] + target_columns_metadata_: Optional[List[OrderedDict]] + + + +class Hyperparams(hyperparams.Hyperparams): + n_values = hyperparams.Union( + configuration=OrderedDict({ + 'auto': hyperparams.Constant( + default='auto', + description='Determine value range from training data.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'int': hyperparams.Bounded[int]( + lower=0, + upper=None, + default=10, + description='Number of categorical values per feature. Each feature value should be in range(n_values).', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'list': hyperparams.List( + default=[], + elements=hyperparams.Hyperparameter[int](1), + description='n_values[i] is the number of categorical values in X[:, i]. Each feature value should be in range(n_values[i]).', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='auto', + description='Number of values per feature. - \'auto\' : determine value range from training data. - int : number of categorical values per feature. Each feature value should be in ``range(n_values)`` - array : ``n_values[i]`` is the number of categorical values in ``X[:, i]``. Each feature value should be in ``range(n_values[i])``', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + sparse = hyperparams.UniformBool( + default=True, + description='Will return sparse matrix if set True else will return an array.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + handle_unknown = hyperparams.Enumeration[str]( + values=['error', 'ignore'], + default='error', + description='Whether to raise an error or ignore if a unknown categorical feature is present during transform.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + categories = hyperparams.Constant( + default='auto', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + use_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to operate on. 
If any specified column cannot be parsed, it is skipped.", + ) + exclude_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='new', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.", + ) + encode_target_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should it encode also target columns?", + ) + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute'], + default='https://metadata.datadrivendiscovery.org/types/Attribute', + description='Decides what semantic type to attach to generated attributes', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + +class SKOneHotEncoder(UnsupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]): + """ + Primitive wrapping for sklearn OneHotEncoder + `sklearn documentation `_ + + """ + + __author__ = "JPL MARVIN" + metadata = metadata_base.PrimitiveMetadata({ + "algorithm_types": [metadata_base.PrimitiveAlgorithmType.ENCODE_ONE_HOT, ], + "name": "sklearn.preprocessing.data.OneHotEncoder", + "primitive_family": metadata_base.PrimitiveFamily.DATA_TRANSFORMATION, + "python_path": "d3m.primitives.data_transformation.one_hot_encoder.SKlearn", + "source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html']}, + "version": "2019.11.13", + "id": "c977e879-1bf5-3829-b5b0-39b00233aff5", + 'installation': [ + {'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + }] + }) + + def __init__(self, *, + hyperparams: 
Hyperparams, + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None) -> None: + + super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) + + # False + self._clf = OneHotEncoder( + n_values=self.hyperparams['n_values'], + sparse=self.hyperparams['sparse'], + handle_unknown=self.hyperparams['handle_unknown'], + categories=self.hyperparams['categories'], + ) + + self._inputs = None + self._outputs = None + self._training_inputs = None + self._training_outputs = None + self._target_names = None + self._training_indices = None + self._target_column_indices = None + self._target_columns_metadata: List[OrderedDict] = None + self._input_column_names = None + self._fitted = False + + + def set_training_data(self, *, inputs: Inputs) -> None: + self._inputs = inputs + self._fitted = False + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + if self._fitted: + return CallResult(None) + + self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + if self._training_inputs is None: + return CallResult(None) + + if len(self._training_indices) > 0: + self._clf.fit(self._training_inputs) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + return CallResult(None) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + if not self._fitted: + raise PrimitiveNotFittedError("Primitive not fitted.") + sk_inputs = inputs + if self.hyperparams['use_semantic_types']: + sk_inputs = inputs.iloc[:, self._training_indices] + output_columns = [] + if len(self._training_indices) > 0: + sk_output = self._clf.transform(sk_inputs) + if sparse.issparse(sk_output): + sk_output = sk_output.toarray() + outputs = self._wrap_predictions(inputs, sk_output) + if len(outputs.columns) == len(self._input_column_names): + outputs.columns = self._input_column_names + output_columns = [outputs] + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._training_indices, + columns_list=output_columns) + return CallResult(outputs) + + + def get_params(self) -> Params: + if not self._fitted: + return Params( + _active_features_=None, + _categorical_features=None, + _categories=None, + _feature_indices_=None, + _legacy_mode=None, + _n_values_=None, + _n_values=None, + categories_=None, + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + return Params( + _active_features_=getattr(self._clf, '_active_features_', None), + _categorical_features=getattr(self._clf, '_categorical_features', None), + _categories=getattr(self._clf, '_categories', None), + _feature_indices_=getattr(self._clf, '_feature_indices_', None), + _legacy_mode=getattr(self._clf, '_legacy_mode', None), + _n_values_=getattr(self._clf, '_n_values_', None), + _n_values=getattr(self._clf, '_n_values', None), + 
categories_=getattr(self._clf, 'categories_', None), + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + def set_params(self, *, params: Params) -> None: + self._clf._active_features_ = params['_active_features_'] + self._clf._categorical_features = params['_categorical_features'] + self._clf._categories = params['_categories'] + self._clf._feature_indices_ = params['_feature_indices_'] + self._clf._legacy_mode = params['_legacy_mode'] + self._clf._n_values_ = params['_n_values_'] + self._clf._n_values = params['_n_values'] + self._clf.categories_ = params['categories_'] + self._input_column_names = params['input_column_names'] + self._training_indices = params['training_indices_'] + self._target_names = params['target_names_'] + self._target_column_indices = params['target_column_indices_'] + self._target_columns_metadata = params['target_columns_metadata_'] + + if params['_active_features_'] is not None: + self._fitted = True + if params['_categorical_features'] is not None: + self._fitted = True + if params['_categories'] is not None: + self._fitted = True + if params['_feature_indices_'] is not None: + self._fitted = True + if params['_legacy_mode'] is not None: + self._fitted = True + if params['_n_values_'] is not None: + self._fitted = True + if params['_n_values'] is not None: + self._fitted = True + if params['categories_'] is not None: + self._fitted = True + + + + + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=hyperparams['use_columns'], + exclude_columns=hyperparams['exclude_columns'], + can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int,float,numpy.integer,numpy.float64,str,) + accepted_semantic_types = set(["https://metadata.datadrivendiscovery.org/types/CategoricalData","https://metadata.datadrivendiscovery.org/types/Attribute",]) + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + + if hyperparams['encode_target_columns'] and 'https://metadata.datadrivendiscovery.org/types/Target' in semantic_types: + return True + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + + @classmethod + def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]: + 
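+ # Rebuilds per-column metadata for the encoder output: intended to keep each
+ # column's existing semantic types and add the 'return_semantic_type'
+ # hyperparameter value so the one-hot columns are tagged as attributes.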
outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) + + # Update semantic types and prepare it for predicted targets. + semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set([]) + add_semantic_types = [] + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + outputs = d3m_dataframe(predictions, generate_metadata=True) + target_columns_metadata = self._copy_inputs_metadata(inputs.metadata, self._training_indices, outputs.metadata, self.hyperparams) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, target_columns_metadata) + return outputs + + + @classmethod + def _copy_inputs_metadata(cls, inputs_metadata: metadata_base.DataMetadata, input_indices: List[int], + outputs_metadata: metadata_base.DataMetadata, hyperparams): + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + target_columns_metadata: List[OrderedDict] = [] + for column_index in input_indices: + column_name = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name") + if column_name is None: + column_name = "output_{}".format(column_index) + + column_metadata = OrderedDict(inputs_metadata.query_column(column_index)) + semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set([]) + add_semantic_types = set() + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + # If outputs has more columns than index, add Attribute Type to all remaining + if outputs_length > len(input_indices): + for column_index in range(len(input_indices), outputs_length): + column_metadata = OrderedDict() + semantic_types = set() + semantic_types.add(hyperparams["return_semantic_type"]) + column_name = "output_{}".format(column_index) + column_metadata["semantic_types"] = list(semantic_types) + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + +SKOneHotEncoder.__doc__ = OneHotEncoder.__doc__ \ No newline at end of file diff --git a/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKOrdinalEncoder.py 
b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKOrdinalEncoder.py new file mode 100644 index 0000000..7396073 --- /dev/null +++ b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKOrdinalEncoder.py @@ -0,0 +1,343 @@ +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os +import sklearn +import numpy +import typing + +# Custom import commands if any +from sklearn.preprocessing._encoders import OrdinalEncoder + + +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult, DockerContainer +from d3m.primitive_interfaces.unsupervised_learning import UnsupervisedLearnerPrimitiveBase + + +Inputs = d3m_dataframe +Outputs = d3m_dataframe + + +class Params(params.Params): + categories_: Optional[Optional[Sequence[Any]]] + _categories: Optional[str] + input_column_names: Optional[Any] + target_names_: Optional[Sequence[Any]] + training_indices_: Optional[Sequence[int]] + target_column_indices_: Optional[Sequence[int]] + target_columns_metadata_: Optional[List[OrderedDict]] + + + +class Hyperparams(hyperparams.Hyperparams): + categories = hyperparams.Constant( + default='auto', + description='Categories (unique values) per feature: - \'auto\' : Determine categories automatically from the training data. - list : ``categories[i]`` holds the categories expected in the ith column. The passed categories should not mix strings and numeric values, and should be sorted in case of numeric values. The used categories can be found in the ``categories_`` attribute.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + use_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to operate on. If any specified column cannot be parsed, it is skipped.", + ) + exclude_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='new', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. 
Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute'], + default='https://metadata.datadrivendiscovery.org/types/Attribute', + description='Decides what semantic type to attach to generated attributes', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + +class SKOrdinalEncoder(UnsupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]): + """ + Primitive wrapping for sklearn OrdinalEncoder + `sklearn documentation `_ + + """ + + __author__ = "JPL MARVIN" + metadata = metadata_base.PrimitiveMetadata({ + "algorithm_types": [metadata_base.PrimitiveAlgorithmType.CATEGORY_ENCODER, ], + "name": "sklearn.preprocessing._encoders.OrdinalEncoder", + "primitive_family": metadata_base.PrimitiveFamily.DATA_TRANSFORMATION, + "python_path": "d3m.primitives.data_transformation.ordinal_encoder.SKlearn", + "source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OrdinalEncoder.html']}, + "version": "2019.11.13", + "id": "a048aaa7-4475-3834-b739-de3105ec7217", + 'installation': [ + {'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + }] + }) + + def __init__(self, *, + hyperparams: Hyperparams, + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None) -> None: + + super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) + + # False + self._clf = OrdinalEncoder( + categories=self.hyperparams['categories'], + ) + + self._inputs = None + self._outputs = None + self._training_inputs = None + self._training_outputs = None + self._target_names = None + self._training_indices = None + self._target_column_indices = None + self._target_columns_metadata: List[OrderedDict] = None + self._input_column_names = None + self._fitted = False + + + def set_training_data(self, *, inputs: Inputs) -> None: + self._inputs = inputs + self._fitted = False + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + if self._fitted: + return CallResult(None) + + self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + if self._training_inputs is None: + return CallResult(None) + + if len(self._training_indices) > 0: + self._clf.fit(self._training_inputs) + self._fitted = True + else: + if 
self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + return CallResult(None) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + if not self._fitted: + raise PrimitiveNotFittedError("Primitive not fitted.") + sk_inputs = inputs + if self.hyperparams['use_semantic_types']: + sk_inputs = inputs.iloc[:, self._training_indices] + output_columns = [] + if len(self._training_indices) > 0: + sk_output = self._clf.transform(sk_inputs) + if sparse.issparse(sk_output): + sk_output = sk_output.toarray() + outputs = self._wrap_predictions(inputs, sk_output) + if len(outputs.columns) == len(self._input_column_names): + outputs.columns = self._input_column_names + output_columns = [outputs] + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._training_indices, + columns_list=output_columns) + return CallResult(outputs) + + + def get_params(self) -> Params: + if not self._fitted: + return Params( + categories_=None, + _categories=None, + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + return Params( + categories_=getattr(self._clf, 'categories_', None), + _categories=getattr(self._clf, '_categories', None), + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + def set_params(self, *, params: Params) -> None: + self._clf.categories_ = params['categories_'] + self._clf._categories = params['_categories'] + self._input_column_names = params['input_column_names'] + self._training_indices = params['training_indices_'] + self._target_names = params['target_names_'] + self._target_column_indices = params['target_column_indices_'] + self._target_columns_metadata = params['target_columns_metadata_'] + + if params['categories_'] is not None: + self._fitted = True + if params['_categories'] is not None: + self._fitted = True + + + + + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=hyperparams['use_columns'], + exclude_columns=hyperparams['exclude_columns'], + can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = 
(int,float,numpy.integer,numpy.float64,str,) + accepted_semantic_types = set(["https://metadata.datadrivendiscovery.org/types/CategoricalData",]) + not_accepted_semantic_types = set(["https://metadata.datadrivendiscovery.org/types/Target",]) + + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + + if len(not_accepted_semantic_types.intersection(semantic_types)) > 0: + return False + + # Making sure at least one accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types.intersection(semantic_types)) > 0: + return True + + return False + + + @classmethod + def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]: + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) + + # Update semantic types and prepare it for predicted targets. + semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set([]) + add_semantic_types = [] + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + outputs = d3m_dataframe(predictions, generate_metadata=True) + target_columns_metadata = self._copy_inputs_metadata(inputs.metadata, self._training_indices, outputs.metadata, self.hyperparams) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, target_columns_metadata) + return outputs + + + @classmethod + def _copy_inputs_metadata(cls, inputs_metadata: metadata_base.DataMetadata, input_indices: List[int], + outputs_metadata: metadata_base.DataMetadata, hyperparams): + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + target_columns_metadata: List[OrderedDict] = [] + for column_index in input_indices: + column_name = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name") + if column_name is None: + column_name = "output_{}".format(column_index) + + column_metadata = OrderedDict(inputs_metadata.query_column(column_index)) + semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set([]) + add_semantic_types = set() + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = 
semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + # If outputs has more columns than index, add Attribute Type to all remaining + if outputs_length > len(input_indices): + for column_index in range(len(input_indices), outputs_length): + column_metadata = OrderedDict() + semantic_types = set() + semantic_types.add(hyperparams["return_semantic_type"]) + column_name = "output_{}".format(column_index) + column_metadata["semantic_types"] = list(semantic_types) + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + +SKOrdinalEncoder.__doc__ = OrdinalEncoder.__doc__ \ No newline at end of file diff --git a/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKPCA.py b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKPCA.py new file mode 100644 index 0000000..a8c7973 --- /dev/null +++ b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKPCA.py @@ -0,0 +1,468 @@ +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os +import sklearn +import numpy +import typing + +# Custom import commands if any +from sklearn.decomposition.pca import PCA +import sys + + +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult, DockerContainer +from d3m.primitive_interfaces.unsupervised_learning import UnsupervisedLearnerPrimitiveBase + + +Inputs = d3m_dataframe +Outputs = d3m_dataframe + + +class Params(params.Params): + components_: Optional[ndarray] + explained_variance_: Optional[ndarray] + explained_variance_ratio_: Optional[ndarray] + mean_: Optional[ndarray] + n_components_: Optional[int] + noise_variance_: Optional[float] + n_features_: Optional[int] + n_samples_: Optional[int] + singular_values_: Optional[ndarray] + _fit_svd_solver: Optional[str] + input_column_names: Optional[Any] + target_names_: Optional[Sequence[Any]] + training_indices_: Optional[Sequence[int]] + target_column_indices_: Optional[Sequence[int]] + target_columns_metadata_: Optional[List[OrderedDict]] + + + +class Hyperparams(hyperparams.Hyperparams): + n_components = hyperparams.Union( + configuration=OrderedDict({ + 'int': hyperparams.Bounded[int]( + lower=0, + upper=None, + default=0, + description='Number of components to keep.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'float': hyperparams.Uniform( + lower=0, + upper=1, + default=0.5, + description='Selects the number of components such that the amount of variance that needs to be explained is greater than the percentage specified by n_components.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'mle': hyperparams.Constant( + default='mle', + description='If svd_solver == \'full\', Minka\'s MLE is used to guess the dimension.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'none': hyperparams.Constant( + default=None, + description='All components are kept, n_components == min(n_samples, n_features).', + 
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='none', + description='Number of components to keep. if n_components is not set all components are kept:: n_components == min(n_samples, n_features) if n_components == \'mle\' and svd_solver == \'full\', Minka\'s MLE is used to guess the dimension if ``0 < n_components < 1`` and svd_solver == \'full\', select the number of components such that the amount of variance that needs to be explained is greater than the percentage specified by n_components n_components cannot be equal to n_features for svd_solver == \'arpack\'.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + whiten = hyperparams.UniformBool( + default=False, + description='When True (False by default) the `components_` vectors are multiplied by the square root of n_samples and then divided by the singular values to ensure uncorrelated outputs with unit component-wise variances. Whitening will remove some information from the transformed signal (the relative variance scales of the components) but can sometime improve the predictive accuracy of the downstream estimators by making their data respect some hard-wired assumptions.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + svd_solver = hyperparams.Choice( + choices={ + 'auto': hyperparams.Hyperparams.define( + configuration=OrderedDict({}) + ), + 'full': hyperparams.Hyperparams.define( + configuration=OrderedDict({}) + ), + 'arpack': hyperparams.Hyperparams.define( + configuration=OrderedDict({ + 'tol': hyperparams.Bounded[float]( + default=0.0, + lower=0.0, + upper=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + }) + ), + 'randomized': hyperparams.Hyperparams.define( + configuration=OrderedDict({ + 'iterated_power': hyperparams.Union( + configuration=OrderedDict({ + 'int': hyperparams.Bounded[int]( + lower=0, + upper=None, + default=0, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'auto': hyperparams.Constant( + default='auto', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='auto', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + }) + ) + }, + default='auto', + description='auto : the solver is selected by a default policy based on `X.shape` and `n_components`: if the input data is larger than 500x500 and the number of components to extract is lower than 80% of the smallest dimension of the data, then the more efficient \'randomized\' method is enabled. Otherwise the exact full SVD is computed and optionally truncated afterwards. full : run exact full SVD calling the standard LAPACK solver via `scipy.linalg.svd` and select the components by postprocessing arpack : run SVD truncated to n_components calling ARPACK solver via `scipy.sparse.linalg.svds`. It requires strictly 0 < n_components < X.shape[1] randomized : run randomized SVD by the method of Halko et al. .. versionadded:: 0.18.0', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + use_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to operate on. 
If any specified column cannot be parsed, it is skipped.", + ) + exclude_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='new', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute'], + default='https://metadata.datadrivendiscovery.org/types/Attribute', + description='Decides what semantic type to attach to generated attributes', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + +class SKPCA(UnsupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]): + """ + Primitive wrapping for sklearn PCA + `sklearn documentation `_ + + """ + + __author__ = "JPL MARVIN" + metadata = metadata_base.PrimitiveMetadata({ + "algorithm_types": [metadata_base.PrimitiveAlgorithmType.PRINCIPAL_COMPONENT_ANALYSIS, ], + "name": "sklearn.decomposition.pca.PCA", + "primitive_family": metadata_base.PrimitiveFamily.FEATURE_EXTRACTION, + "python_path": "d3m.primitives.feature_extraction.pca.SKlearn", + "source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html']}, + "version": "2019.11.13", + "id": "2fb28cd1-5de6-3663-a2dc-09c786fba7f4", + "hyperparams_to_tune": ['n_components', 'svd_solver'], + 'installation': [ + {'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + }] + }) + + def __init__(self, *, + hyperparams: Hyperparams, + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None) -> None: + + super().__init__(hyperparams=hyperparams, random_seed=random_seed, 
docker_containers=docker_containers) + + # False + self._clf = PCA( + n_components=self.hyperparams['n_components'], + whiten=self.hyperparams['whiten'], + svd_solver=self.hyperparams['svd_solver']['choice'], + tol=self.hyperparams['svd_solver'].get('tol', 0.0), + iterated_power=self.hyperparams['svd_solver'].get('iterated_power', 'auto'), + random_state=self.random_seed, + ) + + self._inputs = None + self._outputs = None + self._training_inputs = None + self._training_outputs = None + self._target_names = None + self._training_indices = None + self._target_column_indices = None + self._target_columns_metadata: List[OrderedDict] = None + self._input_column_names = None + self._fitted = False + + + def set_training_data(self, *, inputs: Inputs) -> None: + self._inputs = inputs + self._fitted = False + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + if self._fitted: + return CallResult(None) + + self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + if self._training_inputs is None: + return CallResult(None) + + if len(self._training_indices) > 0: + self._clf.fit(self._training_inputs) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + return CallResult(None) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + if not self._fitted: + raise PrimitiveNotFittedError("Primitive not fitted.") + sk_inputs = inputs + if self.hyperparams['use_semantic_types']: + sk_inputs = inputs.iloc[:, self._training_indices] + output_columns = [] + if len(self._training_indices) > 0: + sk_output = self._clf.transform(sk_inputs) + if sparse.issparse(sk_output): + sk_output = sk_output.toarray() + outputs = self._wrap_predictions(inputs, sk_output) + if len(outputs.columns) == len(self._input_column_names): + outputs.columns = self._input_column_names + output_columns = [outputs] + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._training_indices, + columns_list=output_columns) + return CallResult(outputs) + + + def get_params(self) -> Params: + if not self._fitted: + return Params( + components_=None, + explained_variance_=None, + explained_variance_ratio_=None, + mean_=None, + n_components_=None, + noise_variance_=None, + n_features_=None, + n_samples_=None, + singular_values_=None, + _fit_svd_solver=None, + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + return Params( + components_=getattr(self._clf, 'components_', None), + explained_variance_=getattr(self._clf, 'explained_variance_', None), + explained_variance_ratio_=getattr(self._clf, 'explained_variance_ratio_', None), + mean_=getattr(self._clf, 'mean_', None), + n_components_=getattr(self._clf, 'n_components_', None), + noise_variance_=getattr(self._clf, 'noise_variance_', None), + n_features_=getattr(self._clf, 'n_features_', None), 
+ n_samples_=getattr(self._clf, 'n_samples_', None), + singular_values_=getattr(self._clf, 'singular_values_', None), + _fit_svd_solver=getattr(self._clf, '_fit_svd_solver', None), + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + def set_params(self, *, params: Params) -> None: + self._clf.components_ = params['components_'] + self._clf.explained_variance_ = params['explained_variance_'] + self._clf.explained_variance_ratio_ = params['explained_variance_ratio_'] + self._clf.mean_ = params['mean_'] + self._clf.n_components_ = params['n_components_'] + self._clf.noise_variance_ = params['noise_variance_'] + self._clf.n_features_ = params['n_features_'] + self._clf.n_samples_ = params['n_samples_'] + self._clf.singular_values_ = params['singular_values_'] + self._clf._fit_svd_solver = params['_fit_svd_solver'] + self._input_column_names = params['input_column_names'] + self._training_indices = params['training_indices_'] + self._target_names = params['target_names_'] + self._target_column_indices = params['target_column_indices_'] + self._target_columns_metadata = params['target_columns_metadata_'] + + if params['components_'] is not None: + self._fitted = True + if params['explained_variance_'] is not None: + self._fitted = True + if params['explained_variance_ratio_'] is not None: + self._fitted = True + if params['mean_'] is not None: + self._fitted = True + if params['n_components_'] is not None: + self._fitted = True + if params['noise_variance_'] is not None: + self._fitted = True + if params['n_features_'] is not None: + self._fitted = True + if params['n_samples_'] is not None: + self._fitted = True + if params['singular_values_'] is not None: + self._fitted = True + if params['_fit_svd_solver'] is not None: + self._fitted = True + + + + + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=hyperparams['use_columns'], + exclude_columns=hyperparams['exclude_columns'], + can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, numpy.integer, numpy.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + + @classmethod + def 
_get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]: + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) + + # Update semantic types and prepare it for predicted targets. + semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set([]) + add_semantic_types = [] + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + outputs = d3m_dataframe(predictions, generate_metadata=True) + target_columns_metadata = self._copy_inputs_metadata(inputs.metadata, self._training_indices, outputs.metadata, self.hyperparams) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, target_columns_metadata) + return outputs + + + @classmethod + def _copy_inputs_metadata(cls, inputs_metadata: metadata_base.DataMetadata, input_indices: List[int], + outputs_metadata: metadata_base.DataMetadata, hyperparams): + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + target_columns_metadata: List[OrderedDict] = [] + for column_index in input_indices: + column_name = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name") + if column_name is None: + column_name = "output_{}".format(column_index) + + column_metadata = OrderedDict(inputs_metadata.query_column(column_index)) + semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set([]) + add_semantic_types = set() + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + # If outputs has more columns than index, add Attribute Type to all remaining + if outputs_length > len(input_indices): + for column_index in range(len(input_indices), outputs_length): + column_metadata = OrderedDict() + semantic_types = set() + semantic_types.add(hyperparams["return_semantic_type"]) + column_name = "output_{}".format(column_index) + column_metadata["semantic_types"] = list(semantic_types) + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + +SKPCA.__doc__ = PCA.__doc__ \ No newline at end of file diff --git 
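SKPCA above is driven the same way; the two hyperparameters most pipelines touch are `n_components` (a Union over int / float / `'mle'` / None) and `use_semantic_types`, which switches column selection from "use every column" to "use only columns whose metadata marks them as numeric Attributes". A minimal sketch, not part of this commit: it assumes `sklearn_wrap.SKPCA` is importable and uses the usual d3m `defaults().replace(...)` idiom.
```
# Sketch: reducing a small numeric dataframe to two principal components.
# Assumptions: sklearn_wrap is installed; with use_semantic_types=False (the
# default) every input column is used, and return_result='new' (the default)
# returns only the component columns.
import numpy
import pandas
from d3m import container
from sklearn_wrap.SKPCA import SKPCA, Hyperparams

data = container.DataFrame(
    pandas.DataFrame(numpy.random.RandomState(0).rand(10, 4), columns=list('abcd')),
    generate_metadata=True,
)

hyperparams = Hyperparams.defaults().replace({'n_components': 2})
pca = SKPCA(hyperparams=hyperparams)
pca.set_training_data(inputs=data)
pca.fit()

components = pca.produce(inputs=data).value
print(components.shape)  # (10, 2)
```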
a/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKPassiveAggressiveClassifier.py b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKPassiveAggressiveClassifier.py new file mode 100644 index 0000000..9a4cfa9 --- /dev/null +++ b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKPassiveAggressiveClassifier.py @@ -0,0 +1,648 @@ +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os +import sklearn +import numpy +import typing + +# Custom import commands if any +from sklearn.linear_model.passive_aggressive import PassiveAggressiveClassifier + + +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult, DockerContainer + +from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase +from d3m.primitive_interfaces.base import ProbabilisticCompositionalityMixin, ContinueFitMixin +from d3m import exceptions +import pandas + + + +Inputs = d3m_dataframe +Outputs = d3m_dataframe + + +class Params(params.Params): + coef_: Optional[ndarray] + intercept_: Optional[ndarray] + classes_: Optional[ndarray] + _expanded_class_weight: Optional[ndarray] + alpha: Optional[float] + epsilon: Optional[float] + eta0: Optional[float] + l1_ratio: Optional[float] + learning_rate: Optional[str] + loss_function_: Optional[object] + n_iter_: Optional[int] + penalty: Optional[str] + power_t: Optional[float] + t_: Optional[float] + average_coef_: Optional[ndarray] + average_intercept_: Optional[ndarray] + standard_coef_: Optional[ndarray] + standard_intercept_: Optional[ndarray] + input_column_names: Optional[Any] + target_names_: Optional[Sequence[Any]] + training_indices_: Optional[Sequence[int]] + target_column_indices_: Optional[Sequence[int]] + target_columns_metadata_: Optional[List[OrderedDict]] + + + +class Hyperparams(hyperparams.Hyperparams): + C = hyperparams.Bounded[float]( + default=1, + lower=0, + upper=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + fit_intercept = hyperparams.UniformBool( + default=False, + description='Whether the intercept should be estimated or not. 
If False, the data is assumed to be already centered.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + max_iter = hyperparams.Union( + configuration=OrderedDict({ + 'int': hyperparams.Bounded[int]( + lower=0, + upper=None, + default=1000, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'none': hyperparams.Constant( + default=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='int', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + shuffle = hyperparams.UniformBool( + default=True, + description='Whether or not the training data should be shuffled after each epoch.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + tol = hyperparams.Union( + configuration=OrderedDict({ + 'float': hyperparams.Bounded[float]( + default=0.001, + lower=0, + upper=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'none': hyperparams.Constant( + default=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='float', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + n_jobs = hyperparams.Union( + configuration=OrderedDict({ + 'limit': hyperparams.Bounded[int]( + default=1, + lower=1, + upper=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'all_cores': hyperparams.Constant( + default=-1, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='limit', + description='The number of CPUs to use to do the OVA (One Versus All, for multi-class problems) computation. -1 means \'all CPUs\'. Defaults to 1.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ResourcesUseParameter'] + ) + loss = hyperparams.Enumeration[str]( + values=['hinge', 'squared_hinge'], + default='hinge', + description='The loss function to be used: hinge: equivalent to PA-I in the reference paper. squared_hinge: equivalent to PA-II in the reference paper.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + warm_start = hyperparams.UniformBool( + default=False, + description='When set to True, reuse the solution of the previous call to fit as initialization, otherwise, just erase the previous solution.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + class_weight = hyperparams.Union( + configuration=OrderedDict({ + 'str': hyperparams.Constant( + default='balanced', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'none': hyperparams.Constant( + default=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='none', + description='Preset for the class_weight fit parameter. Weights associated with classes. If not given, all classes are supposed to have weight one. The "balanced" mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as ``n_samples / (n_classes * np.bincount(y))`` .. 
versionadded:: 0.17 parameter *class_weight* to automatically weight samples.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + average = hyperparams.Union( + configuration=OrderedDict({ + 'int': hyperparams.Bounded[int]( + lower=2, + upper=None, + default=10, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'bool': hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='bool', + description='When set to True, computes the averaged SGD weights and stores the result in the coef_ attribute. If set to an int greater than 1, averaging will begin once the total number of samples seen reaches average. So average=10 will begin averaging after seeing 10 samples. New in version 0.19: parameter average to use weights averaging in SGD', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + early_stopping = hyperparams.UniformBool( + default=False, + description='Whether to use early stopping to terminate training when validation score is not improving. If set to True, it will automatically set asid a fraction of training data as validation and terminate training whe validation score is not improving by at least tol fo n_iter_no_change consecutive epochs.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + validation_fraction = hyperparams.Bounded[float]( + default=0.1, + lower=0, + upper=1, + description='The proportion of training data to set aside as validation set for early stopping. Must be between 0 and 1. Only used if early_stopping is True.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + n_iter_no_change = hyperparams.Bounded[int]( + default=5, + lower=0, + upper=None, + description='Number of iterations with no improvement to wait before early stopping.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + use_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to use as training input. If any specified column cannot be parsed, it is skipped.", + ) + use_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to use as training target. If any specified column cannot be parsed, it is skipped.", + ) + exclude_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not use as training inputs. Applicable only if \"use_columns\" is not provided.", + ) + exclude_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not use as training target. 
Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='new', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute', 'https://metadata.datadrivendiscovery.org/types/PredictedTarget'], + default='https://metadata.datadrivendiscovery.org/types/PredictedTarget', + description='Decides what semantic type to attach to generated output', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + +class SKPassiveAggressiveClassifier(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams], + ContinueFitMixin[Inputs, Outputs, Params, Hyperparams]): + """ + Primitive wrapping for sklearn PassiveAggressiveClassifier + `sklearn documentation `_ + + """ + + __author__ = "JPL MARVIN" + metadata = metadata_base.PrimitiveMetadata({ + "algorithm_types": [metadata_base.PrimitiveAlgorithmType.PASSIVE_AGGRESSIVE, ], + "name": "sklearn.linear_model.passive_aggressive.PassiveAggressiveClassifier", + "primitive_family": metadata_base.PrimitiveFamily.CLASSIFICATION, + "python_path": "d3m.primitives.classification.passive_aggressive.SKlearn", + "source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.PassiveAggressiveClassifier.html']}, + "version": "2019.11.13", + "id": "85e5c88d-9eec-3452-8f2f-414f17d3e4d5", + "hyperparams_to_tune": ['C'], + 'installation': [ + {'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + }] + }) + + def __init__(self, *, + hyperparams: Hyperparams, + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None, + _verbose: int = 0) -> None: + + super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) + + # False + self._clf = PassiveAggressiveClassifier( + 
C=self.hyperparams['C'], + fit_intercept=self.hyperparams['fit_intercept'], + max_iter=self.hyperparams['max_iter'], + shuffle=self.hyperparams['shuffle'], + tol=self.hyperparams['tol'], + n_jobs=self.hyperparams['n_jobs'], + loss=self.hyperparams['loss'], + warm_start=self.hyperparams['warm_start'], + class_weight=self.hyperparams['class_weight'], + average=self.hyperparams['average'], + early_stopping=self.hyperparams['early_stopping'], + validation_fraction=self.hyperparams['validation_fraction'], + n_iter_no_change=self.hyperparams['n_iter_no_change'], + verbose=_verbose, + random_state=self.random_seed, + ) + + self._inputs = None + self._outputs = None + self._training_inputs = None + self._training_outputs = None + self._target_names = None + self._training_indices = None + self._target_column_indices = None + self._target_columns_metadata: List[OrderedDict] = None + self._input_column_names = None + self._fitted = False + self._new_training_data = False + + def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None: + self._inputs = inputs + self._outputs = outputs + self._fitted = False + self._new_training_data = True + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + if self._inputs is None or self._outputs is None: + raise ValueError("Missing training data.") + + if not self._new_training_data: + return CallResult(None) + self._new_training_data = False + + self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams) + self._training_outputs, self._target_names, self._target_column_indices = self._get_targets(self._outputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + if len(self._training_indices) > 0 and len(self._target_column_indices) > 0: + self._target_columns_metadata = self._get_target_columns_metadata(self._training_outputs.metadata, self.hyperparams) + sk_training_output = self._training_outputs.values + + shape = sk_training_output.shape + if len(shape) == 2 and shape[1] == 1: + sk_training_output = numpy.ravel(sk_training_output) + + self._clf.fit(self._training_inputs, sk_training_output) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + return CallResult(None) + + def continue_fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + if self._training_inputs is None or self._training_outputs is None: + raise ValueError("Missing training data.") + + if not self._new_training_data: + return CallResult(None) + self._new_training_data = False + + if len(self._training_indices) > 0 and len(self._target_column_indices) > 0: + self._target_columns_metadata = self._get_target_columns_metadata(self._training_outputs.metadata, self.hyperparams) + sk_training_output = self._training_outputs.values + + shape = sk_training_output.shape + if len(shape) == 2 and shape[1] == 1: + sk_training_output = numpy.ravel(sk_training_output) + + self._clf.partial_fit(self._training_inputs, sk_training_output) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + return CallResult(None) + + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + sk_inputs, columns_to_use = self._get_columns_to_fit(inputs, 
self.hyperparams) + output = [] + if len(sk_inputs.columns): + try: + sk_output = self._clf.predict(sk_inputs) + except sklearn.exceptions.NotFittedError as error: + raise PrimitiveNotFittedError("Primitive not fitted.") from error + # For primitives that allow predicting without fitting like GaussianProcessRegressor + if not self._fitted: + raise PrimitiveNotFittedError("Primitive not fitted.") + if sparse.issparse(sk_output): + sk_output = sk_output.toarray() + output = self._wrap_predictions(inputs, sk_output) + output.columns = self._target_names + output = [output] + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._target_column_indices, + columns_list=output) + + return CallResult(outputs) + + + def get_params(self) -> Params: + if not self._fitted: + return Params( + coef_=None, + intercept_=None, + classes_=None, + _expanded_class_weight=None, + alpha=None, + epsilon=None, + eta0=None, + l1_ratio=None, + learning_rate=None, + loss_function_=None, + n_iter_=None, + penalty=None, + power_t=None, + t_=None, + average_coef_=None, + average_intercept_=None, + standard_coef_=None, + standard_intercept_=None, + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + return Params( + coef_=getattr(self._clf, 'coef_', None), + intercept_=getattr(self._clf, 'intercept_', None), + classes_=getattr(self._clf, 'classes_', None), + _expanded_class_weight=getattr(self._clf, '_expanded_class_weight', None), + alpha=getattr(self._clf, 'alpha', None), + epsilon=getattr(self._clf, 'epsilon', None), + eta0=getattr(self._clf, 'eta0', None), + l1_ratio=getattr(self._clf, 'l1_ratio', None), + learning_rate=getattr(self._clf, 'learning_rate', None), + loss_function_=getattr(self._clf, 'loss_function_', None), + n_iter_=getattr(self._clf, 'n_iter_', None), + penalty=getattr(self._clf, 'penalty', None), + power_t=getattr(self._clf, 'power_t', None), + t_=getattr(self._clf, 't_', None), + average_coef_=getattr(self._clf, 'average_coef_', None), + average_intercept_=getattr(self._clf, 'average_intercept_', None), + standard_coef_=getattr(self._clf, 'standard_coef_', None), + standard_intercept_=getattr(self._clf, 'standard_intercept_', None), + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + def set_params(self, *, params: Params) -> None: + self._clf.coef_ = params['coef_'] + self._clf.intercept_ = params['intercept_'] + self._clf.classes_ = params['classes_'] + self._clf._expanded_class_weight = params['_expanded_class_weight'] + self._clf.alpha = params['alpha'] + self._clf.epsilon = params['epsilon'] + self._clf.eta0 = params['eta0'] + self._clf.l1_ratio = params['l1_ratio'] + self._clf.learning_rate = params['learning_rate'] + self._clf.loss_function_ = params['loss_function_'] + self._clf.n_iter_ = params['n_iter_'] + self._clf.penalty = params['penalty'] + self._clf.power_t = params['power_t'] + self._clf.t_ = params['t_'] + 
self._clf.average_coef_ = params['average_coef_'] + self._clf.average_intercept_ = params['average_intercept_'] + self._clf.standard_coef_ = params['standard_coef_'] + self._clf.standard_intercept_ = params['standard_intercept_'] + self._input_column_names = params['input_column_names'] + self._training_indices = params['training_indices_'] + self._target_names = params['target_names_'] + self._target_column_indices = params['target_column_indices_'] + self._target_columns_metadata = params['target_columns_metadata_'] + + if params['coef_'] is not None: + self._fitted = True + if params['intercept_'] is not None: + self._fitted = True + if params['classes_'] is not None: + self._fitted = True + if params['_expanded_class_weight'] is not None: + self._fitted = True + if params['alpha'] is not None: + self._fitted = True + if params['epsilon'] is not None: + self._fitted = True + if params['eta0'] is not None: + self._fitted = True + if params['l1_ratio'] is not None: + self._fitted = True + if params['learning_rate'] is not None: + self._fitted = True + if params['loss_function_'] is not None: + self._fitted = True + if params['n_iter_'] is not None: + self._fitted = True + if params['penalty'] is not None: + self._fitted = True + if params['power_t'] is not None: + self._fitted = True + if params['t_'] is not None: + self._fitted = True + if params['average_coef_'] is not None: + self._fitted = True + if params['average_intercept_'] is not None: + self._fitted = True + if params['standard_coef_'] is not None: + self._fitted = True + if params['standard_intercept_'] is not None: + self._fitted = True + + + + + + + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=hyperparams['use_inputs_columns'], + exclude_columns=hyperparams['exclude_inputs_columns'], + can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, numpy.integer, numpy.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + @classmethod + def _get_targets(cls, data: d3m_dataframe, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return data, list(data.columns), list(range(len(data.columns))) + + metadata = data.metadata + + def can_produce_column(column_index: int) -> bool: + accepted_semantic_types = set() + 
accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/TrueTarget") + column_metadata = metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + semantic_types = set(column_metadata.get('semantic_types', [])) + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + return False + + target_column_indices, target_columns_not_to_produce = base_utils.get_columns_to_use(metadata, + use_columns=hyperparams[ + 'use_outputs_columns'], + exclude_columns= + hyperparams[ + 'exclude_outputs_columns'], + can_use_column=can_produce_column) + targets = [] + if target_column_indices: + targets = data.select_columns(target_column_indices) + target_column_names = [] + for idx in target_column_indices: + target_column_names.append(data.columns[idx]) + return targets, target_column_names, target_column_indices + + @classmethod + def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]: + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) + + # Update semantic types and prepare it for predicted targets. + semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set(["https://metadata.datadrivendiscovery.org/types/TrueTarget","https://metadata.datadrivendiscovery.org/types/SuggestedTarget",]) + add_semantic_types = set(["https://metadata.datadrivendiscovery.org/types/PredictedTarget",]) + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + outputs = d3m_dataframe(predictions, generate_metadata=False) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, self._target_columns_metadata) + return outputs + + + @classmethod + def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata): + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict() + semantic_types = [] + semantic_types.append('https://metadata.datadrivendiscovery.org/types/PredictedTarget') + column_name = outputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name") + if column_name is None: + column_name = 
"output_{}".format(column_index) + column_metadata["semantic_types"] = semantic_types + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + +SKPassiveAggressiveClassifier.__doc__ = PassiveAggressiveClassifier.__doc__ \ No newline at end of file diff --git a/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKPassiveAggressiveRegressor.py b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKPassiveAggressiveRegressor.py new file mode 100644 index 0000000..900de99 --- /dev/null +++ b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKPassiveAggressiveRegressor.py @@ -0,0 +1,583 @@ +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os +import sklearn +import numpy +import typing + +# Custom import commands if any +from sklearn.linear_model.passive_aggressive import PassiveAggressiveRegressor + + +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult, DockerContainer + +from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase +from d3m.primitive_interfaces.base import ProbabilisticCompositionalityMixin, ContinueFitMixin +from d3m import exceptions +import pandas + + + +Inputs = d3m_dataframe +Outputs = d3m_dataframe + + +class Params(params.Params): + coef_: Optional[ndarray] + intercept_: Optional[ndarray] + t_: Optional[float] + alpha: Optional[float] + eta0: Optional[float] + l1_ratio: Optional[int] + learning_rate: Optional[str] + n_iter_: Optional[int] + penalty: Optional[float] + power_t: Optional[float] + average_coef_: Optional[ndarray] + average_intercept_: Optional[ndarray] + standard_coef_: Optional[ndarray] + standard_intercept_: Optional[ndarray] + input_column_names: Optional[Any] + target_names_: Optional[Sequence[Any]] + training_indices_: Optional[Sequence[int]] + target_column_indices_: Optional[Sequence[int]] + target_columns_metadata_: Optional[List[OrderedDict]] + + + +class Hyperparams(hyperparams.Hyperparams): + C = hyperparams.Hyperparameter[float]( + default=1, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + fit_intercept = hyperparams.UniformBool( + default=True, + description='Whether the intercept should be estimated or not. If False, the data is assumed to be already centered. 
Defaults to True.',
+        semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
+    )
+    max_iter = hyperparams.Hyperparameter[int](
+        default=1000,
+        semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
+    )
+    shuffle = hyperparams.UniformBool(
+        default=True,
+        description='Whether or not the training data should be shuffled after each epoch.',
+        semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
+    )
+    tol = hyperparams.Union(
+        configuration=OrderedDict({
+            'float': hyperparams.Bounded[float](
+                lower=0,
+                upper=None,
+                default=0.001,
+                semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
+            ),
+            'none': hyperparams.Constant(
+                default=None,
+                semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
+            )
+        }),
+        default='float',
+        semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
+    )
+    loss = hyperparams.Enumeration[str](
+        values=['epsilon_insensitive', 'squared_epsilon_insensitive'],
+        default='epsilon_insensitive',
+        description='The loss function to be used: epsilon_insensitive: equivalent to PA-I in the reference paper. squared_epsilon_insensitive: equivalent to PA-II in the reference paper.',
+        semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
+    )
+    warm_start = hyperparams.UniformBool(
+        default=False,
+        description='When set to True, reuse the solution of the previous call to fit as initialization, otherwise, just erase the previous solution.',
+        semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
+    )
+    average = hyperparams.Union(
+        configuration=OrderedDict({
+            'int': hyperparams.Bounded[int](
+                default=2,
+                lower=2,
+                upper=None,
+                semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
+            ),
+            'bool': hyperparams.UniformBool(
+                default=False,
+                semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
+            )
+        }),
+        default='bool',
+        semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
+    )
+    epsilon = hyperparams.Bounded[float](
+        lower=0,
+        upper=None,
+        default=0.1,
+        description='If the difference between the current prediction and the correct label is below this threshold, the model is not updated.',
+        semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
+    )
+    early_stopping = hyperparams.UniformBool(
+        default=False,
+        description='Whether to use early stopping to terminate training when validation score is not improving. If set to True, it will automatically set aside a fraction of training data as validation and terminate training when validation score is not improving by at least tol for n_iter_no_change consecutive epochs.',
+        semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
+    )
+    validation_fraction = hyperparams.Bounded[float](
+        default=0.1,
+        lower=0,
+        upper=1,
+        description='The proportion of training data to set aside as validation set for early stopping. Must be between 0 and 1. 
Only used if early_stopping is True.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + n_iter_no_change = hyperparams.Bounded[int]( + default=5, + lower=0, + upper=None, + description='Number of iterations with no improvement to wait before early stopping.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + use_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to use as training input. If any specified column cannot be parsed, it is skipped.", + ) + use_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to use as training target. If any specified column cannot be parsed, it is skipped.", + ) + exclude_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not use as training inputs. Applicable only if \"use_columns\" is not provided.", + ) + exclude_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not use as training target. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='new', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. 
To prevent pipelines from breaking set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute', 'https://metadata.datadrivendiscovery.org/types/PredictedTarget'], + default='https://metadata.datadrivendiscovery.org/types/PredictedTarget', + description='Decides what semantic type to attach to generated output', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + +class SKPassiveAggressiveRegressor(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams], + ContinueFitMixin[Inputs, Outputs, Params, Hyperparams]): + """ + Primitive wrapping for sklearn PassiveAggressiveRegressor + `sklearn documentation `_ + + """ + + __author__ = "JPL MARVIN" + metadata = metadata_base.PrimitiveMetadata({ + "algorithm_types": [metadata_base.PrimitiveAlgorithmType.PASSIVE_AGGRESSIVE, ], + "name": "sklearn.linear_model.passive_aggressive.PassiveAggressiveRegressor", + "primitive_family": metadata_base.PrimitiveFamily.REGRESSION, + "python_path": "d3m.primitives.regression.passive_aggressive.SKlearn", + "source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.PassiveAggressiveRegressor.html']}, + "version": "2019.11.13", + "id": "50ce5919-a155-3c72-a230-f4ab4b5babba", + "hyperparams_to_tune": ['C'], + 'installation': [ + {'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + }] + }) + + def __init__(self, *, + hyperparams: Hyperparams, + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None, + _verbose: int = 0) -> None: + + super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) + + # False + self._clf = PassiveAggressiveRegressor( + C=self.hyperparams['C'], + fit_intercept=self.hyperparams['fit_intercept'], + max_iter=self.hyperparams['max_iter'], + shuffle=self.hyperparams['shuffle'], + tol=self.hyperparams['tol'], + loss=self.hyperparams['loss'], + warm_start=self.hyperparams['warm_start'], + average=self.hyperparams['average'], + epsilon=self.hyperparams['epsilon'], + early_stopping=self.hyperparams['early_stopping'], + validation_fraction=self.hyperparams['validation_fraction'], + n_iter_no_change=self.hyperparams['n_iter_no_change'], + random_state=self.random_seed, + verbose=_verbose + ) + + self._inputs = None + self._outputs = None + self._training_inputs = None + self._training_outputs = None + self._target_names = None + self._training_indices = None + self._target_column_indices = None + self._target_columns_metadata: List[OrderedDict] = None + self._input_column_names = None + self._fitted = False + self._new_training_data = False + + def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None: + self._inputs = inputs + self._outputs = outputs + self._fitted = False + self._new_training_data = True + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + if self._inputs is None or self._outputs is None: + raise ValueError("Missing training data.") + + if not self._new_training_data: + return CallResult(None) + self._new_training_data = 
False + + self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams) + self._training_outputs, self._target_names, self._target_column_indices = self._get_targets(self._outputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + if len(self._training_indices) > 0 and len(self._target_column_indices) > 0: + self._target_columns_metadata = self._get_target_columns_metadata(self._training_outputs.metadata, self.hyperparams) + sk_training_output = self._training_outputs.values + + shape = sk_training_output.shape + if len(shape) == 2 and shape[1] == 1: + sk_training_output = numpy.ravel(sk_training_output) + + self._clf.fit(self._training_inputs, sk_training_output) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + return CallResult(None) + + def continue_fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + if self._training_inputs is None or self._training_outputs is None: + raise ValueError("Missing training data.") + + if not self._new_training_data: + return CallResult(None) + self._new_training_data = False + + if len(self._training_indices) > 0 and len(self._target_column_indices) > 0: + self._target_columns_metadata = self._get_target_columns_metadata(self._training_outputs.metadata, self.hyperparams) + sk_training_output = self._training_outputs.values + + shape = sk_training_output.shape + if len(shape) == 2 and shape[1] == 1: + sk_training_output = numpy.ravel(sk_training_output) + + self._clf.partial_fit(self._training_inputs, sk_training_output) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + return CallResult(None) + + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + sk_inputs, columns_to_use = self._get_columns_to_fit(inputs, self.hyperparams) + output = [] + if len(sk_inputs.columns): + try: + sk_output = self._clf.predict(sk_inputs) + except sklearn.exceptions.NotFittedError as error: + raise PrimitiveNotFittedError("Primitive not fitted.") from error + # For primitives that allow predicting without fitting like GaussianProcessRegressor + if not self._fitted: + raise PrimitiveNotFittedError("Primitive not fitted.") + if sparse.issparse(sk_output): + sk_output = sk_output.toarray() + output = self._wrap_predictions(inputs, sk_output) + output.columns = self._target_names + output = [output] + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._target_column_indices, + columns_list=output) + + return CallResult(outputs) + + + def get_params(self) -> Params: + if not self._fitted: + return Params( + coef_=None, + intercept_=None, + t_=None, + alpha=None, + eta0=None, + l1_ratio=None, + learning_rate=None, + n_iter_=None, + penalty=None, + power_t=None, + average_coef_=None, + average_intercept_=None, + standard_coef_=None, + standard_intercept_=None, + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + 
target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + return Params( + coef_=getattr(self._clf, 'coef_', None), + intercept_=getattr(self._clf, 'intercept_', None), + t_=getattr(self._clf, 't_', None), + alpha=getattr(self._clf, 'alpha', None), + eta0=getattr(self._clf, 'eta0', None), + l1_ratio=getattr(self._clf, 'l1_ratio', None), + learning_rate=getattr(self._clf, 'learning_rate', None), + n_iter_=getattr(self._clf, 'n_iter_', None), + penalty=getattr(self._clf, 'penalty', None), + power_t=getattr(self._clf, 'power_t', None), + average_coef_=getattr(self._clf, 'average_coef_', None), + average_intercept_=getattr(self._clf, 'average_intercept_', None), + standard_coef_=getattr(self._clf, 'standard_coef_', None), + standard_intercept_=getattr(self._clf, 'standard_intercept_', None), + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + def set_params(self, *, params: Params) -> None: + self._clf.coef_ = params['coef_'] + self._clf.intercept_ = params['intercept_'] + self._clf.t_ = params['t_'] + self._clf.alpha = params['alpha'] + self._clf.eta0 = params['eta0'] + self._clf.l1_ratio = params['l1_ratio'] + self._clf.learning_rate = params['learning_rate'] + self._clf.n_iter_ = params['n_iter_'] + self._clf.penalty = params['penalty'] + self._clf.power_t = params['power_t'] + self._clf.average_coef_ = params['average_coef_'] + self._clf.average_intercept_ = params['average_intercept_'] + self._clf.standard_coef_ = params['standard_coef_'] + self._clf.standard_intercept_ = params['standard_intercept_'] + self._input_column_names = params['input_column_names'] + self._training_indices = params['training_indices_'] + self._target_names = params['target_names_'] + self._target_column_indices = params['target_column_indices_'] + self._target_columns_metadata = params['target_columns_metadata_'] + + if params['coef_'] is not None: + self._fitted = True + if params['intercept_'] is not None: + self._fitted = True + if params['t_'] is not None: + self._fitted = True + if params['alpha'] is not None: + self._fitted = True + if params['eta0'] is not None: + self._fitted = True + if params['l1_ratio'] is not None: + self._fitted = True + if params['learning_rate'] is not None: + self._fitted = True + if params['n_iter_'] is not None: + self._fitted = True + if params['penalty'] is not None: + self._fitted = True + if params['power_t'] is not None: + self._fitted = True + if params['average_coef_'] is not None: + self._fitted = True + if params['average_intercept_'] is not None: + self._fitted = True + if params['standard_coef_'] is not None: + self._fitted = True + if params['standard_intercept_'] is not None: + self._fitted = True + + + + + + + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=hyperparams['use_inputs_columns'], + exclude_columns=hyperparams['exclude_inputs_columns'], + 
can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, numpy.integer, numpy.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + @classmethod + def _get_targets(cls, data: d3m_dataframe, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return data, list(data.columns), list(range(len(data.columns))) + + metadata = data.metadata + + def can_produce_column(column_index: int) -> bool: + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/TrueTarget") + column_metadata = metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + semantic_types = set(column_metadata.get('semantic_types', [])) + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + return False + + target_column_indices, target_columns_not_to_produce = base_utils.get_columns_to_use(metadata, + use_columns=hyperparams[ + 'use_outputs_columns'], + exclude_columns= + hyperparams[ + 'exclude_outputs_columns'], + can_use_column=can_produce_column) + targets = [] + if target_column_indices: + targets = data.select_columns(target_column_indices) + target_column_names = [] + for idx in target_column_indices: + target_column_names.append(data.columns[idx]) + return targets, target_column_names, target_column_indices + + @classmethod + def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]: + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) + + # Update semantic types and prepare it for predicted targets. 
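+            # TrueTarget/SuggestedTarget tags are stripped and PredictedTarget (plus the configured return_semantic_type) is attached instead.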
+ semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set(["https://metadata.datadrivendiscovery.org/types/TrueTarget","https://metadata.datadrivendiscovery.org/types/SuggestedTarget",]) + add_semantic_types = set(["https://metadata.datadrivendiscovery.org/types/PredictedTarget",]) + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + outputs = d3m_dataframe(predictions, generate_metadata=False) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, self._target_columns_metadata) + return outputs + + + @classmethod + def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata): + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict() + semantic_types = [] + semantic_types.append('https://metadata.datadrivendiscovery.org/types/PredictedTarget') + column_name = outputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name") + if column_name is None: + column_name = "output_{}".format(column_index) + column_metadata["semantic_types"] = semantic_types + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + +SKPassiveAggressiveRegressor.__doc__ = PassiveAggressiveRegressor.__doc__ \ No newline at end of file diff --git a/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKPolynomialFeatures.py b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKPolynomialFeatures.py new file mode 100644 index 0000000..283adfd --- /dev/null +++ b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKPolynomialFeatures.py @@ -0,0 +1,346 @@ +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os +import sklearn +import numpy +import typing + +# Custom import commands if any +from sklearn.preprocessing.data import PolynomialFeatures + + +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult, DockerContainer +from d3m.primitive_interfaces.unsupervised_learning import UnsupervisedLearnerPrimitiveBase + + +Inputs = d3m_dataframe +Outputs = d3m_dataframe + + +class Params(params.Params): + 
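    # Pickleable snapshot of the fitted PolynomialFeatures attributes plus the column bookkeeping used by get_params/set_params.
+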
n_input_features_: Optional[int] + n_output_features_: Optional[int] + input_column_names: Optional[Any] + target_names_: Optional[Sequence[Any]] + training_indices_: Optional[Sequence[int]] + target_column_indices_: Optional[Sequence[int]] + target_columns_metadata_: Optional[List[OrderedDict]] + + + +class Hyperparams(hyperparams.Hyperparams): + degree = hyperparams.Hyperparameter[int]( + default=2, + description='The degree of the polynomial features. Default = 2.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + include_bias = hyperparams.UniformBool( + default=True, + description='If True (default), then include a bias column, the feature in which all polynomial powers are zero (i.e. a column of ones - acts as an intercept term in a linear model). Examples -------- >>> X = np.arange(6).reshape(3, 2) >>> X array([[0, 1], [2, 3], [4, 5]]) >>> poly = PolynomialFeatures(2) >>> poly.fit_transform(X) array([[ 1., 0., 1., 0., 0., 1.], [ 1., 2., 3., 4., 6., 9.], [ 1., 4., 5., 16., 20., 25.]]) >>> poly = PolynomialFeatures(interaction_only=True) >>> poly.fit_transform(X) array([[ 1., 0., 1., 0.], [ 1., 2., 3., 6.], [ 1., 4., 5., 20.]])', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + use_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to operate on. If any specified column cannot be parsed, it is skipped.", + ) + exclude_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='new', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. 
To prevent pipelines from breaking set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute'], + default='https://metadata.datadrivendiscovery.org/types/Attribute', + description='Decides what semantic type to attach to generated attributes', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + +class SKPolynomialFeatures(UnsupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]): + """ + Primitive wrapping for sklearn PolynomialFeatures + `sklearn documentation `_ + + """ + + __author__ = "JPL MARVIN" + metadata = metadata_base.PrimitiveMetadata({ + "algorithm_types": [metadata_base.PrimitiveAlgorithmType.STATISTICAL_MOMENT_ANALYSIS, ], + "name": "sklearn.preprocessing.data.PolynomialFeatures", + "primitive_family": metadata_base.PrimitiveFamily.DATA_PREPROCESSING, + "python_path": "d3m.primitives.data_preprocessing.polynomial_features.SKlearn", + "source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PolynomialFeatures.html']}, + "version": "2019.11.13", + "id": "93acb44b-532a-37d3-987a-8e61a8489d77", + "hyperparams_to_tune": ['degree'], + 'installation': [ + {'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + }] + }) + + def __init__(self, *, + hyperparams: Hyperparams, + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None) -> None: + + super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) + + # False + self._clf = PolynomialFeatures( + degree=self.hyperparams['degree'], + include_bias=self.hyperparams['include_bias'], + ) + + self._inputs = None + self._outputs = None + self._training_inputs = None + self._training_outputs = None + self._target_names = None + self._training_indices = None + self._target_column_indices = None + self._target_columns_metadata: List[OrderedDict] = None + self._input_column_names = None + self._fitted = False + + + def set_training_data(self, *, inputs: Inputs) -> None: + self._inputs = inputs + self._fitted = False + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + if self._fitted: + return CallResult(None) + + self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + if self._training_inputs is None: + return CallResult(None) + + if len(self._training_indices) > 0: + self._clf.fit(self._training_inputs) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + return CallResult(None) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + if not self._fitted: + raise PrimitiveNotFittedError("Primitive not fitted.") + sk_inputs = inputs + if self.hyperparams['use_semantic_types']: + sk_inputs = inputs.iloc[:, self._training_indices] + output_columns = [] + if len(self._training_indices) > 0: + sk_output = 
self._clf.transform(sk_inputs) + if sparse.issparse(sk_output): + sk_output = sk_output.toarray() + outputs = self._wrap_predictions(inputs, sk_output) + if len(outputs.columns) == len(self._input_column_names): + outputs.columns = self._input_column_names + output_columns = [outputs] + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._training_indices, + columns_list=output_columns) + return CallResult(outputs) + + + def get_params(self) -> Params: + if not self._fitted: + return Params( + n_input_features_=None, + n_output_features_=None, + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + return Params( + n_input_features_=getattr(self._clf, 'n_input_features_', None), + n_output_features_=getattr(self._clf, 'n_output_features_', None), + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + def set_params(self, *, params: Params) -> None: + self._clf.n_input_features_ = params['n_input_features_'] + self._clf.n_output_features_ = params['n_output_features_'] + self._input_column_names = params['input_column_names'] + self._training_indices = params['training_indices_'] + self._target_names = params['target_names_'] + self._target_column_indices = params['target_column_indices_'] + self._target_columns_metadata = params['target_columns_metadata_'] + + if params['n_input_features_'] is not None: + self._fitted = True + if params['n_output_features_'] is not None: + self._fitted = True + + + + + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=hyperparams['use_columns'], + exclude_columns=hyperparams['exclude_columns'], + can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, numpy.integer, numpy.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + + # Making sure all accepted_semantic_types are available in 
semantic_types
+        if len(accepted_semantic_types - semantic_types) == 0:
+            return True
+
+        return False
+
+
+    @classmethod
+    def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]:
+        outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length']
+
+        target_columns_metadata: List[OrderedDict] = []
+        for column_index in range(outputs_length):
+            column_metadata = OrderedDict(outputs_metadata.query_column(column_index))
+
+            # Update semantic types and prepare it for predicted targets.
+            semantic_types = set(column_metadata.get('semantic_types', []))
+            semantic_types_to_remove = set([])
+            add_semantic_types = set()
+            add_semantic_types.add(hyperparams["return_semantic_type"])
+            semantic_types = semantic_types - semantic_types_to_remove
+            semantic_types = semantic_types.union(add_semantic_types)
+            column_metadata['semantic_types'] = list(semantic_types)
+
+            target_columns_metadata.append(column_metadata)
+
+        return target_columns_metadata
+
+    @classmethod
+    def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs],
+                                     target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata:
+        outputs_metadata = metadata_base.DataMetadata().generate(value=outputs)
+
+        for column_index, column_metadata in enumerate(target_columns_metadata):
+            column_metadata.pop("structural_type", None)
+            outputs_metadata = outputs_metadata.update_column(column_index, column_metadata)
+
+        return outputs_metadata
+
+    def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs:
+        outputs = d3m_dataframe(predictions, generate_metadata=True)
+        target_columns_metadata = self._copy_inputs_metadata(inputs.metadata, self._training_indices, outputs.metadata, self.hyperparams)
+        outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, target_columns_metadata)
+        return outputs
+
+
+    @classmethod
+    def _copy_inputs_metadata(cls, inputs_metadata: metadata_base.DataMetadata, input_indices: List[int],
+                              outputs_metadata: metadata_base.DataMetadata, hyperparams):
+        outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length']
+        target_columns_metadata: List[OrderedDict] = []
+        for column_index in input_indices:
+            column_name = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name")
+            if column_name is None:
+                column_name = "output_{}".format(column_index)
+
+            column_metadata = OrderedDict(inputs_metadata.query_column(column_index))
+            semantic_types = set(column_metadata.get('semantic_types', []))
+            semantic_types_to_remove = set([])
+            add_semantic_types = set()
+            add_semantic_types.add(hyperparams["return_semantic_type"])
+            semantic_types = semantic_types - semantic_types_to_remove
+            semantic_types = semantic_types.union(add_semantic_types)
+            column_metadata['semantic_types'] = list(semantic_types)
+
+            column_metadata["name"] = str(column_name)
+            target_columns_metadata.append(column_metadata)
+
+        # If outputs has more columns than index, add Attribute Type to all remaining
+        if outputs_length > len(input_indices):
+            for column_index in range(len(input_indices), outputs_length):
+                column_metadata = OrderedDict()
+                semantic_types = set()
+                semantic_types.add(hyperparams["return_semantic_type"])
+                column_name = "output_{}".format(column_index)
+                column_metadata["semantic_types"] = list(semantic_types)
+                column_metadata["name"] = str(column_name)
+                target_columns_metadata.append(column_metadata)
+
+        
return target_columns_metadata + + +SKPolynomialFeatures.__doc__ = PolynomialFeatures.__doc__ \ No newline at end of file diff --git a/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKQuadraticDiscriminantAnalysis.py b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKQuadraticDiscriminantAnalysis.py new file mode 100644 index 0000000..fa90760 --- /dev/null +++ b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKQuadraticDiscriminantAnalysis.py @@ -0,0 +1,473 @@ +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os +import sklearn +import numpy +import typing + +# Custom import commands if any +from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis + + +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult, DockerContainer + +from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase +from d3m.primitive_interfaces.base import ProbabilisticCompositionalityMixin, ContinueFitMixin +from d3m import exceptions +import pandas + + + +Inputs = d3m_dataframe +Outputs = d3m_dataframe + + +class Params(params.Params): + covariance_: Optional[ndarray] + means_: Optional[ndarray] + priors_: Optional[ndarray] + rotations_: Optional[Sequence[ndarray]] + scalings_: Optional[Sequence[ndarray]] + classes_: Optional[ndarray] + input_column_names: Optional[Any] + target_names_: Optional[Sequence[Any]] + training_indices_: Optional[Sequence[int]] + target_column_indices_: Optional[Sequence[int]] + target_columns_metadata_: Optional[List[OrderedDict]] + + + +class Hyperparams(hyperparams.Hyperparams): + reg_param = hyperparams.Bounded[float]( + default=0.0, + lower=0, + upper=1, + description='Regularizes the covariance estimate as ``(1-reg_param)*Sigma + reg_param*np.eye(n_features)``', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + tol = hyperparams.Bounded[float]( + default=0.0001, + lower=0, + upper=None, + description='Threshold used for rank estimation. .. versionadded:: 0.17', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + use_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to use as training input. If any specified column cannot be parsed, it is skipped.", + ) + use_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to use as training target. If any specified column cannot be parsed, it is skipped.", + ) + exclude_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not use as training inputs. 
Applicable only if \"use_columns\" is not provided.", + ) + exclude_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not use as training target. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='new', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute', 'https://metadata.datadrivendiscovery.org/types/PredictedTarget'], + default='https://metadata.datadrivendiscovery.org/types/PredictedTarget', + description='Decides what semantic type to attach to generated output', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + +class SKQuadraticDiscriminantAnalysis(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams], + ProbabilisticCompositionalityMixin[Inputs, Outputs, Params, Hyperparams]): + """ + Primitive wrapping for sklearn QuadraticDiscriminantAnalysis + `sklearn documentation `_ + + """ + + __author__ = "JPL MARVIN" + metadata = metadata_base.PrimitiveMetadata({ + "algorithm_types": [metadata_base.PrimitiveAlgorithmType.QUADRATIC_DISCRIMINANT_ANALYSIS, ], + "name": "sklearn.discriminant_analysis.QuadraticDiscriminantAnalysis", + "primitive_family": metadata_base.PrimitiveFamily.CLASSIFICATION, + "python_path": "d3m.primitives.classification.quadratic_discriminant_analysis.SKlearn", + "source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.discriminant_analysis.QuadraticDiscriminantAnalysis.html']}, + "version": "2019.11.13", + "id": "321dbf4d-07d9-3274-bd1b-2751520ed1d7", + "hyperparams_to_tune": ['reg_param'], + 'installation': [ + {'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format( + 
git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + }] + }) + + def __init__(self, *, + hyperparams: Hyperparams, + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None) -> None: + + super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) + + # False + self._clf = QuadraticDiscriminantAnalysis( + reg_param=self.hyperparams['reg_param'], + tol=self.hyperparams['tol'], + ) + + self._inputs = None + self._outputs = None + self._training_inputs = None + self._training_outputs = None + self._target_names = None + self._training_indices = None + self._target_column_indices = None + self._target_columns_metadata: List[OrderedDict] = None + self._input_column_names = None + self._fitted = False + self._new_training_data = False + + def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None: + self._inputs = inputs + self._outputs = outputs + self._fitted = False + self._new_training_data = True + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + if self._inputs is None or self._outputs is None: + raise ValueError("Missing training data.") + + if not self._new_training_data: + return CallResult(None) + self._new_training_data = False + + self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams) + self._training_outputs, self._target_names, self._target_column_indices = self._get_targets(self._outputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + if len(self._training_indices) > 0 and len(self._target_column_indices) > 0: + self._target_columns_metadata = self._get_target_columns_metadata(self._training_outputs.metadata, self.hyperparams) + sk_training_output = self._training_outputs.values + + shape = sk_training_output.shape + if len(shape) == 2 and shape[1] == 1: + sk_training_output = numpy.ravel(sk_training_output) + + self._clf.fit(self._training_inputs, sk_training_output) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + return CallResult(None) + + + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + sk_inputs, columns_to_use = self._get_columns_to_fit(inputs, self.hyperparams) + output = [] + if len(sk_inputs.columns): + try: + sk_output = self._clf.predict(sk_inputs) + except sklearn.exceptions.NotFittedError as error: + raise PrimitiveNotFittedError("Primitive not fitted.") from error + # For primitives that allow predicting without fitting like GaussianProcessRegressor + if not self._fitted: + raise PrimitiveNotFittedError("Primitive not fitted.") + if sparse.issparse(sk_output): + sk_output = sk_output.toarray() + output = self._wrap_predictions(inputs, sk_output) + output.columns = self._target_names + output = [output] + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._target_column_indices, + columns_list=output) + + return CallResult(outputs) + + + def get_params(self) -> Params: + if not self._fitted: + return Params( + covariance_=None, + means_=None, + priors_=None, + 
rotations_=None, + scalings_=None, + classes_=None, + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + return Params( + covariance_=getattr(self._clf, 'covariance_', None), + means_=getattr(self._clf, 'means_', None), + priors_=getattr(self._clf, 'priors_', None), + rotations_=getattr(self._clf, 'rotations_', None), + scalings_=getattr(self._clf, 'scalings_', None), + classes_=getattr(self._clf, 'classes_', None), + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + def set_params(self, *, params: Params) -> None: + self._clf.covariance_ = params['covariance_'] + self._clf.means_ = params['means_'] + self._clf.priors_ = params['priors_'] + self._clf.rotations_ = params['rotations_'] + self._clf.scalings_ = params['scalings_'] + self._clf.classes_ = params['classes_'] + self._input_column_names = params['input_column_names'] + self._training_indices = params['training_indices_'] + self._target_names = params['target_names_'] + self._target_column_indices = params['target_column_indices_'] + self._target_columns_metadata = params['target_columns_metadata_'] + + if params['covariance_'] is not None: + self._fitted = True + if params['means_'] is not None: + self._fitted = True + if params['priors_'] is not None: + self._fitted = True + if params['rotations_'] is not None: + self._fitted = True + if params['scalings_'] is not None: + self._fitted = True + if params['classes_'] is not None: + self._fitted = True + + + def log_likelihoods(self, *, + outputs: Outputs, + inputs: Inputs, + timeout: float = None, + iterations: int = None) -> CallResult[Sequence[float]]: + inputs = inputs.iloc[:, self._training_indices] # Get ndarray + outputs = outputs.iloc[:, self._target_column_indices] + + if len(inputs.columns) and len(outputs.columns): + + if outputs.shape[1] != self._clf.n_outputs_: + raise exceptions.InvalidArgumentValueError("\"outputs\" argument does not have the correct number of target columns.") + + log_proba = self._clf.predict_log_proba(inputs) + + # Making it always a list, even when only one target. + if self._clf.n_outputs_ == 1: + log_proba = [log_proba] + classes = [self._clf.classes_] + else: + classes = self._clf.classes_ + + samples_length = inputs.shape[0] + + log_likelihoods = [] + for k in range(self._clf.n_outputs_): + # We have to map each class to its internal (numerical) index used in the learner. + # This allows "outputs" to contain string classes. + outputs_column = outputs.iloc[:, k] + classes_map = pandas.Series(numpy.arange(len(classes[k])), index=classes[k]) + mapped_outputs_column = outputs_column.map(classes_map) + + # For each target column (column in "outputs"), for each sample (row) we pick the log + # likelihood for a given class. 
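+                # log_proba[k] has shape (n_samples, n_classes); pairing each row index with its mapped class index selects one log-likelihood per sample.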
+ log_likelihoods.append(log_proba[k][numpy.arange(samples_length), mapped_outputs_column]) + + results = d3m_dataframe(dict(enumerate(log_likelihoods)), generate_metadata=True) + results.columns = outputs.columns + + for k in range(self._clf.n_outputs_): + column_metadata = outputs.metadata.query_column(k) + if 'name' in column_metadata: + results.metadata = results.metadata.update_column(k, {'name': column_metadata['name']}) + + else: + results = d3m_dataframe(generate_metadata=True) + + return CallResult(results) + + + + + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=hyperparams['use_inputs_columns'], + exclude_columns=hyperparams['exclude_inputs_columns'], + can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, numpy.integer, numpy.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + @classmethod + def _get_targets(cls, data: d3m_dataframe, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return data, list(data.columns), list(range(len(data.columns))) + + metadata = data.metadata + + def can_produce_column(column_index: int) -> bool: + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/TrueTarget") + column_metadata = metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + semantic_types = set(column_metadata.get('semantic_types', [])) + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + return False + + target_column_indices, target_columns_not_to_produce = base_utils.get_columns_to_use(metadata, + use_columns=hyperparams[ + 'use_outputs_columns'], + exclude_columns= + hyperparams[ + 'exclude_outputs_columns'], + can_use_column=can_produce_column) + targets = [] + if target_column_indices: + targets = data.select_columns(target_column_indices) + target_column_names = [] + for idx in target_column_indices: + target_column_names.append(data.columns[idx]) + return targets, target_column_names, target_column_indices + + @classmethod + def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, 
hyperparams) -> List[OrderedDict]: + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) + + # Update semantic types and prepare it for predicted targets. + semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set(["https://metadata.datadrivendiscovery.org/types/TrueTarget","https://metadata.datadrivendiscovery.org/types/SuggestedTarget",]) + add_semantic_types = set(["https://metadata.datadrivendiscovery.org/types/PredictedTarget",]) + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + outputs = d3m_dataframe(predictions, generate_metadata=False) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, self._target_columns_metadata) + return outputs + + + @classmethod + def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata): + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict() + semantic_types = [] + semantic_types.append('https://metadata.datadrivendiscovery.org/types/PredictedTarget') + column_name = outputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name") + if column_name is None: + column_name = "output_{}".format(column_index) + column_metadata["semantic_types"] = semantic_types + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + +SKQuadraticDiscriminantAnalysis.__doc__ = QuadraticDiscriminantAnalysis.__doc__ \ No newline at end of file diff --git a/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKQuantileTransformer.py b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKQuantileTransformer.py new file mode 100644 index 0000000..e077dd2 --- /dev/null +++ b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKQuantileTransformer.py @@ -0,0 +1,364 @@ +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os +import sklearn +import numpy +import typing + +# Custom import commands if any +from sklearn.preprocessing.data import QuantileTransformer + + +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from 
d3m import utils +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult, DockerContainer +from d3m.primitive_interfaces.unsupervised_learning import UnsupervisedLearnerPrimitiveBase + + +Inputs = d3m_dataframe +Outputs = d3m_dataframe + + +class Params(params.Params): + quantiles_: Optional[ndarray] + references_: Optional[ndarray] + input_column_names: Optional[Any] + target_names_: Optional[Sequence[Any]] + training_indices_: Optional[Sequence[int]] + target_column_indices_: Optional[Sequence[int]] + target_columns_metadata_: Optional[List[OrderedDict]] + + + +class Hyperparams(hyperparams.Hyperparams): + n_quantiles = hyperparams.UniformInt( + default=1000, + lower=100, + upper=10000, + description='Number of quantiles to be computed. It corresponds to the number of landmarks used to discretize the cumulative distribution function.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + output_distribution = hyperparams.Enumeration[str]( + default='uniform', + values=['uniform', 'normal'], + description='Marginal distribution for the transformed data. The choices are \'uniform\' (default) or \'normal\'.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + ignore_implicit_zeros = hyperparams.UniformBool( + default=False, + description='Only applies to sparse matrices. If True, the sparse entries of the matrix are discarded to compute the quantile statistics. If False, these entries are treated as zeros.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + subsample = hyperparams.Bounded[float]( + default=100000.0, + lower=1000.0, + upper=100000.0, + description='Maximum number of samples used to estimate the quantiles for computational efficiency. Note that the subsampling procedure may differ for value-identical sparse and dense matrices.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + use_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to operate on. If any specified column cannot be parsed, it is skipped.", + ) + exclude_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='new', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. 
Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute'], + default='https://metadata.datadrivendiscovery.org/types/Attribute', + description='Decides what semantic type to attach to generated attributes', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + +class SKQuantileTransformer(UnsupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]): + """ + Primitive wrapping for sklearn QuantileTransformer + `sklearn documentation `_ + + """ + + __author__ = "JPL MARVIN" + metadata = metadata_base.PrimitiveMetadata({ + "algorithm_types": [metadata_base.PrimitiveAlgorithmType.DATA_CONVERSION, ], + "name": "sklearn.preprocessing.data.QuantileTransformer", + "primitive_family": metadata_base.PrimitiveFamily.DATA_PREPROCESSING, + "python_path": "d3m.primitives.data_preprocessing.quantile_transformer.SKlearn", + "source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.QuantileTransformer.html']}, + "version": "2019.11.13", + "id": "54c5e71f-0909-400b-ae65-b33631e7648f", + "hyperparams_to_tune": ['n_quantiles', 'output_distribution'], + 'installation': [ + {'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + }] + }) + + def __init__(self, *, + hyperparams: Hyperparams, + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None) -> None: + + super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) + + # False + self._clf = QuantileTransformer( + n_quantiles=self.hyperparams['n_quantiles'], + output_distribution=self.hyperparams['output_distribution'], + ignore_implicit_zeros=self.hyperparams['ignore_implicit_zeros'], + subsample=self.hyperparams['subsample'], + random_state=self.random_seed, + ) + + self._inputs = None + self._outputs = None + self._training_inputs = None + self._training_outputs = None + self._target_names = None + self._training_indices = None + self._target_column_indices = None + self._target_columns_metadata: List[OrderedDict] = None + self._input_column_names = None + self._fitted = False + + + def set_training_data(self, *, inputs: Inputs) -> None: + self._inputs = inputs + self._fitted = False + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + if self._fitted: + return CallResult(None) + + self._training_inputs, self._training_indices = 
self._get_columns_to_fit(self._inputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + if self._training_inputs is None: + return CallResult(None) + + if len(self._training_indices) > 0: + self._clf.fit(self._training_inputs) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + return CallResult(None) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + if not self._fitted: + raise PrimitiveNotFittedError("Primitive not fitted.") + sk_inputs = inputs + if self.hyperparams['use_semantic_types']: + sk_inputs = inputs.iloc[:, self._training_indices] + output_columns = [] + if len(self._training_indices) > 0: + sk_output = self._clf.transform(sk_inputs) + if sparse.issparse(sk_output): + sk_output = sk_output.toarray() + outputs = self._wrap_predictions(inputs, sk_output) + if len(outputs.columns) == len(self._input_column_names): + outputs.columns = self._input_column_names + output_columns = [outputs] + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._training_indices, + columns_list=output_columns) + return CallResult(outputs) + + + def get_params(self) -> Params: + if not self._fitted: + return Params( + quantiles_=None, + references_=None, + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + return Params( + quantiles_=getattr(self._clf, 'quantiles_', None), + references_=getattr(self._clf, 'references_', None), + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + def set_params(self, *, params: Params) -> None: + self._clf.quantiles_ = params['quantiles_'] + self._clf.references_ = params['references_'] + self._input_column_names = params['input_column_names'] + self._training_indices = params['training_indices_'] + self._target_names = params['target_names_'] + self._target_column_indices = params['target_column_indices_'] + self._target_columns_metadata = params['target_columns_metadata_'] + + if params['quantiles_'] is not None: + self._fitted = True + if params['references_'] is not None: + self._fitted = True + + + + + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=hyperparams['use_columns'], + exclude_columns=hyperparams['exclude_columns'], + can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + @classmethod 
+ def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, numpy.integer, numpy.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + + @classmethod + def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]: + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) + + # Update semantic types and prepare it for predicted targets. + semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set([]) + add_semantic_types = [] + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + outputs = d3m_dataframe(predictions, generate_metadata=True) + target_columns_metadata = self._copy_inputs_metadata(inputs.metadata, self._training_indices, outputs.metadata, self.hyperparams) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, target_columns_metadata) + return outputs + + + @classmethod + def _copy_inputs_metadata(cls, inputs_metadata: metadata_base.DataMetadata, input_indices: List[int], + outputs_metadata: metadata_base.DataMetadata, hyperparams): + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + target_columns_metadata: List[OrderedDict] = [] + for column_index in input_indices: + column_name = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name") + if column_name is None: + column_name = "output_{}".format(column_index) + + column_metadata = OrderedDict(inputs_metadata.query_column(column_index)) + semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set([]) + add_semantic_types = set() + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = 
semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + # If outputs has more columns than index, add Attribute Type to all remaining + if outputs_length > len(input_indices): + for column_index in range(len(input_indices), outputs_length): + column_metadata = OrderedDict() + semantic_types = set() + semantic_types.add(hyperparams["return_semantic_type"]) + column_name = "output_{}".format(column_index) + column_metadata["semantic_types"] = list(semantic_types) + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + +SKQuantileTransformer.__doc__ = QuantileTransformer.__doc__ \ No newline at end of file diff --git a/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKRBFSampler.py b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKRBFSampler.py new file mode 100644 index 0000000..03cd11c --- /dev/null +++ b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKRBFSampler.py @@ -0,0 +1,349 @@ +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os +import sklearn +import numpy +import typing + +# Custom import commands if any +from sklearn.kernel_approximation import RBFSampler + + +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult, DockerContainer +from d3m.primitive_interfaces.unsupervised_learning import UnsupervisedLearnerPrimitiveBase + + +Inputs = d3m_dataframe +Outputs = d3m_dataframe + + +class Params(params.Params): + random_weights_: Optional[ndarray] + random_offset_: Optional[ndarray] + input_column_names: Optional[Any] + target_names_: Optional[Sequence[Any]] + training_indices_: Optional[Sequence[int]] + target_column_indices_: Optional[Sequence[int]] + target_columns_metadata_: Optional[List[OrderedDict]] + + + +class Hyperparams(hyperparams.Hyperparams): + gamma = hyperparams.Hyperparameter[float]( + default=1, + description='Parameter of RBF kernel: exp(-gamma * x^2)', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + n_components = hyperparams.Bounded[int]( + lower=0, + upper=None, + default=100, + description='Number of Monte Carlo samples per original feature. Equals the dimensionality of the computed feature space.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + use_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to operate on. If any specified column cannot be parsed, it is skipped.", + ) + exclude_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not operate on. 
Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='new', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute'], + default='https://metadata.datadrivendiscovery.org/types/Attribute', + description='Decides what semantic type to attach to generated attributes', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + +class SKRBFSampler(UnsupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]): + """ + Primitive wrapping for sklearn RBFSampler + `sklearn documentation `_ + + """ + + __author__ = "JPL MARVIN" + metadata = metadata_base.PrimitiveMetadata({ + "algorithm_types": [metadata_base.PrimitiveAlgorithmType.KERNEL_METHOD, ], + "name": "sklearn.kernel_approximation.RBFSampler", + "primitive_family": metadata_base.PrimitiveFamily.DATA_PREPROCESSING, + "python_path": "d3m.primitives.data_preprocessing.rbf_sampler.SKlearn", + "source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.kernel_approximation.RBFSampler.html']}, + "version": "2019.11.13", + "id": "0823123d-45a3-3dc8-9ef1-ff643236993a", + "hyperparams_to_tune": ['gamma', 'n_components'], + 'installation': [ + {'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + }] + }) + + def __init__(self, *, + hyperparams: Hyperparams, + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None) -> None: + + super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) + + # False + self._clf = RBFSampler( + gamma=self.hyperparams['gamma'], + n_components=self.hyperparams['n_components'], + random_state=self.random_seed, + ) + + self._inputs = None + self._outputs = None + self._training_inputs = None + self._training_outputs = None 
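As a side note on the `gamma` description above: the wrapped RBFSampler produces random Fourier features whose inner products approximate the exact RBF kernel. The check below is an editorial sketch using plain scikit-learn and random data; it is not part of the generated wrapper.
```
import numpy
from sklearn.kernel_approximation import RBFSampler
from sklearn.metrics.pairwise import rbf_kernel

rng = numpy.random.RandomState(0)
X = rng.rand(10, 4)
gamma = 1.0

exact = rbf_kernel(X, gamma=gamma)                    # exp(-gamma * ||x - y||^2)
features = RBFSampler(gamma=gamma, n_components=5000,
                      random_state=0).fit_transform(X)
approx = features @ features.T                        # random-feature approximation
print(numpy.abs(exact - approx).max())                # small; shrinks as n_components grows
```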
+ self._target_names = None + self._training_indices = None + self._target_column_indices = None + self._target_columns_metadata: List[OrderedDict] = None + self._input_column_names = None + self._fitted = False + + + def set_training_data(self, *, inputs: Inputs) -> None: + self._inputs = inputs + self._fitted = False + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + if self._fitted: + return CallResult(None) + + self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + if self._training_inputs is None: + return CallResult(None) + + if len(self._training_indices) > 0: + self._clf.fit(self._training_inputs) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + return CallResult(None) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + if not self._fitted: + raise PrimitiveNotFittedError("Primitive not fitted.") + sk_inputs = inputs + if self.hyperparams['use_semantic_types']: + sk_inputs = inputs.iloc[:, self._training_indices] + output_columns = [] + if len(self._training_indices) > 0: + sk_output = self._clf.transform(sk_inputs) + if sparse.issparse(sk_output): + sk_output = sk_output.toarray() + outputs = self._wrap_predictions(inputs, sk_output) + if len(outputs.columns) == len(self._input_column_names): + outputs.columns = self._input_column_names + output_columns = [outputs] + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._training_indices, + columns_list=output_columns) + return CallResult(outputs) + + + def get_params(self) -> Params: + if not self._fitted: + return Params( + random_weights_=None, + random_offset_=None, + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + return Params( + random_weights_=getattr(self._clf, 'random_weights_', None), + random_offset_=getattr(self._clf, 'random_offset_', None), + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + def set_params(self, *, params: Params) -> None: + self._clf.random_weights_ = params['random_weights_'] + self._clf.random_offset_ = params['random_offset_'] + self._input_column_names = params['input_column_names'] + self._training_indices = params['training_indices_'] + self._target_names = params['target_names_'] + self._target_column_indices = params['target_column_indices_'] + self._target_columns_metadata = params['target_columns_metadata_'] + + if params['random_weights_'] is not None: + self._fitted = True + if params['random_offset_'] is not None: + self._fitted = True + + + + + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + 
return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=hyperparams['use_columns'], + exclude_columns=hyperparams['exclude_columns'], + can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, numpy.integer, numpy.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + + @classmethod + def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]: + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) + + # Update semantic types and prepare it for predicted targets. 
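The column helpers in these wrappers lean on two small set idioms: a subset test for deciding whether a column can be used, and a remove-then-union update when rewriting a column's semantic types. The snippet below is an editorial, standalone illustration with hypothetical values, not code from the wrapper itself.
```
ATTRIBUTE = "https://metadata.datadrivendiscovery.org/types/Attribute"
TRUE_TARGET = "https://metadata.datadrivendiscovery.org/types/TrueTarget"
PREDICTED_TARGET = "https://metadata.datadrivendiscovery.org/types/PredictedTarget"

# 1) Column selection (_can_produce_column): a column is usable only if every
#    accepted semantic type is present, i.e. `accepted - present` is empty.
accepted = {ATTRIBUTE}
present = {ATTRIBUTE, "http://schema.org/Float"}
print(len(accepted - present) == 0)   # True -> the column is used

# 2) Metadata update (_get_target_columns_metadata): drop the types slated for
#    removal, then union in the ones to add.
semantic_types = {TRUE_TARGET, "http://schema.org/Float"}
semantic_types = (semantic_types - {TRUE_TARGET}) | {PREDICTED_TARGET}
print(sorted(semantic_types))
```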
+ semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set([]) + add_semantic_types = [] + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + outputs = d3m_dataframe(predictions, generate_metadata=True) + target_columns_metadata = self._copy_inputs_metadata(inputs.metadata, self._training_indices, outputs.metadata, self.hyperparams) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, target_columns_metadata) + return outputs + + + @classmethod + def _copy_inputs_metadata(cls, inputs_metadata: metadata_base.DataMetadata, input_indices: List[int], + outputs_metadata: metadata_base.DataMetadata, hyperparams): + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + target_columns_metadata: List[OrderedDict] = [] + for column_index in input_indices: + column_name = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name") + if column_name is None: + column_name = "output_{}".format(column_index) + + column_metadata = OrderedDict(inputs_metadata.query_column(column_index)) + semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set([]) + add_semantic_types = set() + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + # If outputs has more columns than index, add Attribute Type to all remaining + if outputs_length > len(input_indices): + for column_index in range(len(input_indices), outputs_length): + column_metadata = OrderedDict() + semantic_types = set() + semantic_types.add(hyperparams["return_semantic_type"]) + column_name = "output_{}".format(column_index) + column_metadata["semantic_types"] = list(semantic_types) + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + +SKRBFSampler.__doc__ = RBFSampler.__doc__ \ No newline at end of file diff --git a/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKRandomForestClassifier.py b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKRandomForestClassifier.py new file mode 100644 index 0000000..ddef232 --- /dev/null +++ b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKRandomForestClassifier.py @@ -0,0 +1,682 @@ +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple +from numpy import ndarray +from collections import OrderedDict 
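For orientation, the SKQuantileTransformer wrapper above ultimately delegates fitting and producing to scikit-learn's QuantileTransformer on the selected numeric columns. The rough editorial sketch below uses the public `sklearn.preprocessing` import path and made-up data; parameter names follow the Hyperparams class.
```
import numpy
from sklearn.preprocessing import QuantileTransformer

X = numpy.random.RandomState(0).exponential(size=(500, 2))

qt = QuantileTransformer(n_quantiles=100, output_distribution='uniform',
                         ignore_implicit_zeros=False, random_state=0)
Xt = qt.fit_transform(X)

# Under the 'uniform' output distribution each column is mapped into [0, 1].
print(Xt.min(), Xt.max())
```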
+from scipy import sparse +import os +import sklearn +import numpy +import typing + +# Custom import commands if any +from sklearn.ensemble.forest import RandomForestClassifier + + +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult, DockerContainer + +from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase +from d3m.primitive_interfaces.base import ProbabilisticCompositionalityMixin, ContinueFitMixin +from d3m import exceptions +import pandas + + + +Inputs = d3m_dataframe +Outputs = d3m_dataframe + + +class Params(params.Params): + estimators_: Optional[List[sklearn.tree.DecisionTreeClassifier]] + classes_: Optional[Union[ndarray, List[ndarray]]] + n_classes_: Optional[Union[int, List[int]]] + n_features_: Optional[int] + n_outputs_: Optional[int] + oob_score_: Optional[float] + oob_decision_function_: Optional[ndarray] + base_estimator_: Optional[object] + estimator_params: Optional[tuple] + base_estimator: Optional[object] + input_column_names: Optional[Any] + target_names_: Optional[Sequence[Any]] + training_indices_: Optional[Sequence[int]] + target_column_indices_: Optional[Sequence[int]] + target_columns_metadata_: Optional[List[OrderedDict]] + + + +class Hyperparams(hyperparams.Hyperparams): + n_estimators = hyperparams.Bounded[int]( + default=10, + lower=1, + upper=None, + description='The number of trees in the forest.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + criterion = hyperparams.Enumeration[str]( + values=['gini', 'entropy'], + default='gini', + description='The function to measure the quality of a split. Supported criteria are "gini" for the Gini impurity and "entropy" for the information gain. Note: this parameter is tree-specific.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + max_features = hyperparams.Union( + configuration=OrderedDict({ + 'specified_int': hyperparams.Bounded[int]( + lower=0, + upper=None, + default=0, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'calculated': hyperparams.Enumeration[str]( + values=['auto', 'sqrt', 'log2'], + default='auto', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'none': hyperparams.Constant( + default=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'percent': hyperparams.Uniform( + default=0.25, + lower=0, + upper=1, + lower_inclusive=True, + upper_inclusive=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='calculated', + description='The number of features to consider when looking for the best split: - If int, then consider `max_features` features at each split. - If float, then `max_features` is a percentage and `int(max_features * n_features)` features are considered at each split. - If "auto", then `max_features=sqrt(n_features)`. - If "sqrt", then `max_features=sqrt(n_features)` (same as "auto"). - If "log2", then `max_features=log2(n_features)`. - If None, then `max_features=n_features`. 
Note: the search for a split does not stop until at least one valid partition of the node samples is found, even if it requires to effectively inspect more than ``max_features`` features.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + max_depth = hyperparams.Union( + configuration=OrderedDict({ + 'int': hyperparams.Bounded[int]( + default=10, + lower=0, + upper=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'none': hyperparams.Constant( + default=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='none', + description='The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + min_samples_split = hyperparams.Union( + configuration=OrderedDict({ + 'absolute': hyperparams.Bounded[int]( + default=2, + lower=1, + upper=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'percent': hyperparams.Uniform( + default=0.25, + lower=0, + upper=1, + lower_inclusive=False, + upper_inclusive=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='absolute', + description='The minimum number of samples required to split an internal node: - If int, then consider `min_samples_split` as the minimum number. - If float, then `min_samples_split` is a percentage and `ceil(min_samples_split * n_samples)` are the minimum number of samples for each split. .. versionchanged:: 0.18 Added float values for percentages.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + min_samples_leaf = hyperparams.Union( + configuration=OrderedDict({ + 'absolute': hyperparams.Bounded[int]( + default=1, + lower=1, + upper=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'percent': hyperparams.Uniform( + default=0.25, + lower=0, + upper=0.5, + lower_inclusive=False, + upper_inclusive=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='absolute', + description='The minimum number of samples required to be at a leaf node: - If int, then consider `min_samples_leaf` as the minimum number. - If float, then `min_samples_leaf` is a percentage and `ceil(min_samples_leaf * n_samples)` are the minimum number of samples for each node. .. versionchanged:: 0.18 Added float values for percentages.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + min_weight_fraction_leaf = hyperparams.Uniform( + default=0, + lower=0, + upper=0.5, + upper_inclusive=True, + description='The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. 
Samples have equal weight when sample_weight is not provided.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + max_leaf_nodes = hyperparams.Union( + configuration=OrderedDict({ + 'int': hyperparams.Bounded[int]( + default=10, + lower=0, + upper=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'none': hyperparams.Constant( + default=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='none', + description='Grow trees with ``max_leaf_nodes`` in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + min_impurity_decrease = hyperparams.Bounded[float]( + default=0.0, + lower=0.0, + upper=None, + description='A node will be split if this split induces a decrease of the impurity greater than or equal to this value. The weighted impurity decrease equation is the following:: N_t / N * (impurity - N_t_R / N_t * right_impurity - N_t_L / N_t * left_impurity) where ``N`` is the total number of samples, ``N_t`` is the number of samples at the current node, ``N_t_L`` is the number of samples in the left child, and ``N_t_R`` is the number of samples in the right child. ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, if ``sample_weight`` is passed. .. versionadded:: 0.19 ', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + bootstrap = hyperparams.Enumeration[str]( + values=['bootstrap', 'bootstrap_with_oob_score', 'disabled'], + default='bootstrap', + description='Whether bootstrap samples are used when building trees.' + ' And whether to use out-of-bag samples to estimate the generalization accuracy.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + n_jobs = hyperparams.Union( + configuration=OrderedDict({ + 'limit': hyperparams.Bounded[int]( + default=1, + lower=1, + upper=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'all_cores': hyperparams.Constant( + default=-1, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='limit', + description='The number of jobs to run in parallel for both `fit` and `predict`. If -1, then the number of jobs is set to the number of cores.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ResourcesUseParameter'] + ) + warm_start = hyperparams.UniformBool( + default=False, + description='When set to ``True``, reuse the solution of the previous call to fit and add more estimators to the ensemble, otherwise, just fit a whole new forest.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + class_weight = hyperparams.Union( + configuration=OrderedDict({ + 'str': hyperparams.Enumeration[str]( + default='balanced', + values=['balanced', 'balanced_subsample'], + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'none': hyperparams.Constant( + default=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='none', + description='"balanced_subsample" or None, optional (default=None) Weights associated with classes in the form ``{class_label: weight}``. If not given, all classes are supposed to have weight one. 
For multi-output problems, a list of dicts can be provided in the same order as the columns of y. The "balanced" mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as ``n_samples / (n_classes * np.bincount(y))`` The "balanced_subsample" mode is the same as "balanced" except that weights are computed based on the bootstrap sample for every tree grown. For multi-output, the weights of each column of y will be multiplied. Note that these weights will be multiplied with sample_weight (passed through the fit method) if sample_weight is specified.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + use_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to use as training input. If any specified column cannot be parsed, it is skipped.", + ) + use_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to use as training target. If any specified column cannot be parsed, it is skipped.", + ) + exclude_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not use as training inputs. Applicable only if \"use_columns\" is not provided.", + ) + exclude_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not use as training target. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='new', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. 
To prevent pipelines from breaking set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute', 'https://metadata.datadrivendiscovery.org/types/PredictedTarget'], + default='https://metadata.datadrivendiscovery.org/types/PredictedTarget', + description='Decides what semantic type to attach to generated output', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + +class SKRandomForestClassifier(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams], + ProbabilisticCompositionalityMixin[Inputs, Outputs, Params, Hyperparams]): + """ + Primitive wrapping for sklearn RandomForestClassifier + `sklearn documentation `_ + + """ + + __author__ = "JPL MARVIN" + metadata = metadata_base.PrimitiveMetadata({ + "algorithm_types": [metadata_base.PrimitiveAlgorithmType.RANDOM_FOREST, ], + "name": "sklearn.ensemble.forest.RandomForestClassifier", + "primitive_family": metadata_base.PrimitiveFamily.CLASSIFICATION, + "python_path": "d3m.primitives.classification.random_forest.SKlearn", + "source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html']}, + "version": "2019.11.13", + "id": "1dd82833-5692-39cb-84fb-2455683075f3", + "hyperparams_to_tune": ['n_estimators', 'max_depth', 'min_samples_split', 'min_samples_leaf', 'max_features'], + 'installation': [ + {'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + }] + }) + + def __init__(self, *, + hyperparams: Hyperparams, + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None, + _verbose: int = 0) -> None: + + super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) + + # False + self._clf = RandomForestClassifier( + n_estimators=self.hyperparams['n_estimators'], + criterion=self.hyperparams['criterion'], + max_features=self.hyperparams['max_features'], + max_depth=self.hyperparams['max_depth'], + min_samples_split=self.hyperparams['min_samples_split'], + min_samples_leaf=self.hyperparams['min_samples_leaf'], + min_weight_fraction_leaf=self.hyperparams['min_weight_fraction_leaf'], + max_leaf_nodes=self.hyperparams['max_leaf_nodes'], + min_impurity_decrease=self.hyperparams['min_impurity_decrease'], + bootstrap=self.hyperparams['bootstrap'] in ['bootstrap', 'bootstrap_with_oob_score'], + oob_score=self.hyperparams['bootstrap'] in ['bootstrap_with_oob_score'], + n_jobs=self.hyperparams['n_jobs'], + warm_start=self.hyperparams['warm_start'], + class_weight=self.hyperparams['class_weight'], + random_state=self.random_seed, + verbose=_verbose + ) + + self._inputs = None + self._outputs = None + self._training_inputs = None + self._training_outputs = None + self._target_names = None + self._training_indices = None + self._target_column_indices = None + self._target_columns_metadata: List[OrderedDict] = None + self._input_column_names = None + self._fitted = False + self._new_training_data = False + + def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None: + self._inputs = inputs + self._outputs = outputs + 
self._fitted = False + self._new_training_data = True + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + if self._inputs is None or self._outputs is None: + raise ValueError("Missing training data.") + + if not self._new_training_data: + return CallResult(None) + self._new_training_data = False + + self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams) + self._training_outputs, self._target_names, self._target_column_indices = self._get_targets(self._outputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + if len(self._training_indices) > 0 and len(self._target_column_indices) > 0: + self._target_columns_metadata = self._get_target_columns_metadata(self._training_outputs.metadata, self.hyperparams) + sk_training_output = self._training_outputs.values + + shape = sk_training_output.shape + if len(shape) == 2 and shape[1] == 1: + sk_training_output = numpy.ravel(sk_training_output) + + self._clf.fit(self._training_inputs, sk_training_output) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + return CallResult(None) + + + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + sk_inputs, columns_to_use = self._get_columns_to_fit(inputs, self.hyperparams) + output = [] + if len(sk_inputs.columns): + try: + sk_output = self._clf.predict(sk_inputs) + except sklearn.exceptions.NotFittedError as error: + raise PrimitiveNotFittedError("Primitive not fitted.") from error + # For primitives that allow predicting without fitting like GaussianProcessRegressor + if not self._fitted: + raise PrimitiveNotFittedError("Primitive not fitted.") + if sparse.issparse(sk_output): + sk_output = sk_output.toarray() + output = self._wrap_predictions(inputs, sk_output) + output.columns = self._target_names + output = [output] + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._target_column_indices, + columns_list=output) + + return CallResult(outputs) + + + def get_params(self) -> Params: + if not self._fitted: + return Params( + estimators_=None, + classes_=None, + n_classes_=None, + n_features_=None, + n_outputs_=None, + oob_score_=None, + oob_decision_function_=None, + base_estimator_=None, + estimator_params=None, + base_estimator=None, + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + return Params( + estimators_=getattr(self._clf, 'estimators_', None), + classes_=getattr(self._clf, 'classes_', None), + n_classes_=getattr(self._clf, 'n_classes_', None), + n_features_=getattr(self._clf, 'n_features_', None), + n_outputs_=getattr(self._clf, 'n_outputs_', None), + oob_score_=getattr(self._clf, 'oob_score_', None), + oob_decision_function_=getattr(self._clf, 'oob_decision_function_', None), + base_estimator_=getattr(self._clf, 'base_estimator_', None), + estimator_params=getattr(self._clf, 'estimator_params', None), + 
base_estimator=getattr(self._clf, 'base_estimator', None), + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + def set_params(self, *, params: Params) -> None: + self._clf.estimators_ = params['estimators_'] + self._clf.classes_ = params['classes_'] + self._clf.n_classes_ = params['n_classes_'] + self._clf.n_features_ = params['n_features_'] + self._clf.n_outputs_ = params['n_outputs_'] + self._clf.oob_score_ = params['oob_score_'] + self._clf.oob_decision_function_ = params['oob_decision_function_'] + self._clf.base_estimator_ = params['base_estimator_'] + self._clf.estimator_params = params['estimator_params'] + self._clf.base_estimator = params['base_estimator'] + self._input_column_names = params['input_column_names'] + self._training_indices = params['training_indices_'] + self._target_names = params['target_names_'] + self._target_column_indices = params['target_column_indices_'] + self._target_columns_metadata = params['target_columns_metadata_'] + + if params['estimators_'] is not None: + self._fitted = True + if params['classes_'] is not None: + self._fitted = True + if params['n_classes_'] is not None: + self._fitted = True + if params['n_features_'] is not None: + self._fitted = True + if params['n_outputs_'] is not None: + self._fitted = True + if params['oob_score_'] is not None: + self._fitted = True + if params['oob_decision_function_'] is not None: + self._fitted = True + if params['base_estimator_'] is not None: + self._fitted = True + if params['estimator_params'] is not None: + self._fitted = True + if params['base_estimator'] is not None: + self._fitted = True + + + def log_likelihoods(self, *, + outputs: Outputs, + inputs: Inputs, + timeout: float = None, + iterations: int = None) -> CallResult[Sequence[float]]: + inputs = inputs.iloc[:, self._training_indices] # Get ndarray + outputs = outputs.iloc[:, self._target_column_indices] + + if len(inputs.columns) and len(outputs.columns): + + if outputs.shape[1] != self._clf.n_outputs_: + raise exceptions.InvalidArgumentValueError("\"outputs\" argument does not have the correct number of target columns.") + + log_proba = self._clf.predict_log_proba(inputs) + + # Making it always a list, even when only one target. + if self._clf.n_outputs_ == 1: + log_proba = [log_proba] + classes = [self._clf.classes_] + else: + classes = self._clf.classes_ + + samples_length = inputs.shape[0] + + log_likelihoods = [] + for k in range(self._clf.n_outputs_): + # We have to map each class to its internal (numerical) index used in the learner. + # This allows "outputs" to contain string classes. + outputs_column = outputs.iloc[:, k] + classes_map = pandas.Series(numpy.arange(len(classes[k])), index=classes[k]) + mapped_outputs_column = outputs_column.map(classes_map) + + # For each target column (column in "outputs"), for each sample (row) we pick the log + # likelihood for a given class. 
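The list-wrapping above (`if self._clf.n_outputs_ == 1`) exists because scikit-learn's forests return one `(n_samples, n_classes)` array for a single target and a list of such arrays for multi-output targets. A brief editorial check with plain scikit-learn and toy data:
```
import numpy
from sklearn.ensemble import RandomForestClassifier

X = numpy.array([[0.0], [1.0], [2.0], [3.0]])
y = numpy.array(['a', 'a', 'b', 'b'])

clf = RandomForestClassifier(n_estimators=10, random_state=0).fit(X, y)

# Single target: one array of shape (n_samples, n_classes); with 2-D y the
# same call returns a list with one array per target column instead.
log_proba = clf.predict_log_proba(X)
print(clf.n_outputs_, log_proba.shape)   # 1 (4, 2)
```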
+ log_likelihoods.append(log_proba[k][numpy.arange(samples_length), mapped_outputs_column]) + + results = d3m_dataframe(dict(enumerate(log_likelihoods)), generate_metadata=True) + results.columns = outputs.columns + + for k in range(self._clf.n_outputs_): + column_metadata = outputs.metadata.query_column(k) + if 'name' in column_metadata: + results.metadata = results.metadata.update_column(k, {'name': column_metadata['name']}) + + else: + results = d3m_dataframe(generate_metadata=True) + + return CallResult(results) + + + + def produce_feature_importances(self, *, timeout: float = None, iterations: int = None) -> CallResult[d3m_dataframe]: + output = d3m_dataframe(self._clf.feature_importances_.reshape((1, len(self._input_column_names)))) + output.columns = self._input_column_names + for i in range(len(self._input_column_names)): + output.metadata = output.metadata.update_column(i, {"name": self._input_column_names[i]}) + return CallResult(output) + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=hyperparams['use_inputs_columns'], + exclude_columns=hyperparams['exclude_inputs_columns'], + can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, numpy.integer, numpy.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + @classmethod + def _get_targets(cls, data: d3m_dataframe, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return data, list(data.columns), list(range(len(data.columns))) + + metadata = data.metadata + + def can_produce_column(column_index: int) -> bool: + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/TrueTarget") + column_metadata = metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + semantic_types = set(column_metadata.get('semantic_types', [])) + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + return False + + target_column_indices, target_columns_not_to_produce = base_utils.get_columns_to_use(metadata, + use_columns=hyperparams[ + 'use_outputs_columns'], + exclude_columns= + hyperparams[ + 
'exclude_outputs_columns'], + can_use_column=can_produce_column) + targets = [] + if target_column_indices: + targets = data.select_columns(target_column_indices) + target_column_names = [] + for idx in target_column_indices: + target_column_names.append(data.columns[idx]) + return targets, target_column_names, target_column_indices + + @classmethod + def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]: + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) + + # Update semantic types and prepare it for predicted targets. + semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set(["https://metadata.datadrivendiscovery.org/types/TrueTarget","https://metadata.datadrivendiscovery.org/types/SuggestedTarget",]) + add_semantic_types = set(["https://metadata.datadrivendiscovery.org/types/PredictedTarget",]) + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + outputs = d3m_dataframe(predictions, generate_metadata=False) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, self._target_columns_metadata) + return outputs + + + @classmethod + def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata): + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict() + semantic_types = [] + semantic_types.append('https://metadata.datadrivendiscovery.org/types/PredictedTarget') + column_name = outputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name") + if column_name is None: + column_name = "output_{}".format(column_index) + column_metadata["semantic_types"] = semantic_types + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + +SKRandomForestClassifier.__doc__ = RandomForestClassifier.__doc__ \ No newline at end of file diff --git a/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKRandomForestRegressor.py b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKRandomForestRegressor.py new file mode 100644 index 0000000..181105a --- /dev/null +++ b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKRandomForestRegressor.py @@ -0,0 +1,609 @@ +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple 
+from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os +import sklearn +import numpy +import typing + +# Custom import commands if any +from sklearn.ensemble.forest import RandomForestRegressor + + +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult, DockerContainer + +from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase +from d3m.primitive_interfaces.base import ProbabilisticCompositionalityMixin, ContinueFitMixin +from d3m import exceptions +import pandas + + + +Inputs = d3m_dataframe +Outputs = d3m_dataframe + + +class Params(params.Params): + estimators_: Optional[List[sklearn.tree.DecisionTreeRegressor]] + n_features_: Optional[int] + n_outputs_: Optional[int] + oob_score_: Optional[float] + oob_prediction_: Optional[ndarray] + base_estimator_: Optional[object] + estimator_params: Optional[tuple] + base_estimator: Optional[object] + class_weight: Optional[Union[str, dict, List[dict]]] + input_column_names: Optional[Any] + target_names_: Optional[Sequence[Any]] + training_indices_: Optional[Sequence[int]] + target_column_indices_: Optional[Sequence[int]] + target_columns_metadata_: Optional[List[OrderedDict]] + + + +class Hyperparams(hyperparams.Hyperparams): + n_estimators = hyperparams.Bounded[int]( + default=10, + lower=1, + upper=None, + description='The number of trees in the forest.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + criterion = hyperparams.Enumeration[str]( + values=['mse', 'mae'], + default='mse', + description='The function to measure the quality of a split. Supported criteria are "mse" for the mean squared error, which is equal to variance reduction as feature selection criterion, and "mae" for the mean absolute error. .. versionadded:: 0.18 Mean Absolute Error (MAE) criterion.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + max_features = hyperparams.Union( + configuration=OrderedDict({ + 'specified_int': hyperparams.Bounded[int]( + lower=0, + upper=None, + default=0, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'calculated': hyperparams.Enumeration[str]( + values=['auto', 'sqrt', 'log2'], + default='auto', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'none': hyperparams.Constant( + default=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'percent': hyperparams.Uniform( + default=0.25, + lower=0, + upper=1, + lower_inclusive=True, + upper_inclusive=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='calculated', + description='The number of features to consider when looking for the best split: - If int, then consider `max_features` features at each split. - If float, then `max_features` is a percentage and `int(max_features * n_features)` features are considered at each split. - If "auto", then `max_features=n_features`. - If "sqrt", then `max_features=sqrt(n_features)`. - If "log2", then `max_features=log2(n_features)`. - If None, then `max_features=n_features`. 
Note: the search for a split does not stop until at least one valid partition of the node samples is found, even if it requires to effectively inspect more than ``max_features`` features.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + max_depth = hyperparams.Union( + configuration=OrderedDict({ + 'int': hyperparams.Bounded[int]( + default=10, + lower=0, + upper=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'none': hyperparams.Constant( + default=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='none', + description='The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + min_samples_split = hyperparams.Union( + configuration=OrderedDict({ + 'absolute': hyperparams.Bounded[int]( + default=2, + lower=1, + upper=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'percent': hyperparams.Uniform( + default=0.25, + lower=0, + upper=1, + lower_inclusive=False, + upper_inclusive=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='absolute', + description='The minimum number of samples required to split an internal node: - If int, then consider `min_samples_split` as the minimum number. - If float, then `min_samples_split` is a percentage and `ceil(min_samples_split * n_samples)` are the minimum number of samples for each split. .. versionchanged:: 0.18 Added float values for percentages.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + min_samples_leaf = hyperparams.Union( + configuration=OrderedDict({ + 'absolute': hyperparams.Bounded[int]( + default=1, + lower=1, + upper=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'percent': hyperparams.Uniform( + default=0.25, + lower=0, + upper=0.5, + lower_inclusive=False, + upper_inclusive=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='absolute', + description='The minimum number of samples required to be at a leaf node: - If int, then consider `min_samples_leaf` as the minimum number. - If float, then `min_samples_leaf` is a percentage and `ceil(min_samples_leaf * n_samples)` are the minimum number of samples for each node. .. versionchanged:: 0.18 Added float values for percentages.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + min_weight_fraction_leaf = hyperparams.Uniform( + default=0, + lower=0, + upper=0.5, + upper_inclusive=True, + description='The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. 
Samples have equal weight when sample_weight is not provided.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + max_leaf_nodes = hyperparams.Union( + configuration=OrderedDict({ + 'int': hyperparams.Bounded[int]( + default=10, + lower=0, + upper=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'none': hyperparams.Constant( + default=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='none', + description='Grow trees with ``max_leaf_nodes`` in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + min_impurity_decrease = hyperparams.Bounded[float]( + default=0.0, + lower=0.0, + upper=None, + description='A node will be split if this split induces a decrease of the impurity greater than or equal to this value. The weighted impurity decrease equation is the following:: N_t / N * (impurity - N_t_R / N_t * right_impurity - N_t_L / N_t * left_impurity) where ``N`` is the total number of samples, ``N_t`` is the number of samples at the current node, ``N_t_L`` is the number of samples in the left child, and ``N_t_R`` is the number of samples in the right child. ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, if ``sample_weight`` is passed. .. versionadded:: 0.19 ', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + bootstrap = hyperparams.Enumeration[str]( + values=['bootstrap', 'bootstrap_with_oob_score', 'disabled'], + default='bootstrap', + description='Whether bootstrap samples are used when building trees.' + ' And whether to use out-of-bag samples to estimate the generalization accuracy.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + n_jobs = hyperparams.Union( + configuration=OrderedDict({ + 'limit': hyperparams.Bounded[int]( + default=1, + lower=1, + upper=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'all_cores': hyperparams.Constant( + default=-1, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='limit', + description='The number of jobs to run in parallel for both `fit` and `predict`. If -1, then the number of jobs is set to the number of cores.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ResourcesUseParameter'] + ) + warm_start = hyperparams.UniformBool( + default=False, + description='When set to ``True``, reuse the solution of the previous call to fit and add more estimators to the ensemble, otherwise, just fit a whole new forest.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + use_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to use as training input. If any specified column cannot be parsed, it is skipped.", + ) + use_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to use as training target. 
If any specified column cannot be parsed, it is skipped.", + ) + exclude_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not use as training inputs. Applicable only if \"use_columns\" is not provided.", + ) + exclude_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not use as training target. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='new', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. 
To prevent pipelines from breaking set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute', 'https://metadata.datadrivendiscovery.org/types/PredictedTarget'], + default='https://metadata.datadrivendiscovery.org/types/PredictedTarget', + description='Decides what semantic type to attach to generated output', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + +class SKRandomForestRegressor(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]): + """ + Primitive wrapping for sklearn RandomForestRegressor + `sklearn documentation `_ + + """ + + __author__ = "JPL MARVIN" + metadata = metadata_base.PrimitiveMetadata({ + "algorithm_types": [metadata_base.PrimitiveAlgorithmType.RANDOM_FOREST, ], + "name": "sklearn.ensemble.forest.RandomForestRegressor", + "primitive_family": metadata_base.PrimitiveFamily.REGRESSION, + "python_path": "d3m.primitives.regression.random_forest.SKlearn", + "source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html']}, + "version": "2019.11.13", + "id": "f0fd7a62-09b5-3abc-93bb-f5f999f7cc80", + "hyperparams_to_tune": ['n_estimators', 'max_depth', 'min_samples_split', 'min_samples_leaf', 'max_features'], + 'installation': [ + {'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + }] + }) + + def __init__(self, *, + hyperparams: Hyperparams, + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None, + _verbose: int = 0) -> None: + + super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) + + # False + self._clf = RandomForestRegressor( + n_estimators=self.hyperparams['n_estimators'], + criterion=self.hyperparams['criterion'], + max_features=self.hyperparams['max_features'], + max_depth=self.hyperparams['max_depth'], + min_samples_split=self.hyperparams['min_samples_split'], + min_samples_leaf=self.hyperparams['min_samples_leaf'], + min_weight_fraction_leaf=self.hyperparams['min_weight_fraction_leaf'], + max_leaf_nodes=self.hyperparams['max_leaf_nodes'], + min_impurity_decrease=self.hyperparams['min_impurity_decrease'], + bootstrap=self.hyperparams['bootstrap'] in ['bootstrap', 'bootstrap_with_oob_score'], + oob_score=self.hyperparams['bootstrap'] in ['bootstrap_with_oob_score'], + n_jobs=self.hyperparams['n_jobs'], + warm_start=self.hyperparams['warm_start'], + random_state=self.random_seed, + verbose=_verbose + ) + + self._inputs = None + self._outputs = None + self._training_inputs = None + self._training_outputs = None + self._target_names = None + self._training_indices = None + self._target_column_indices = None + self._target_columns_metadata: List[OrderedDict] = None + self._input_column_names = None + self._fitted = False + self._new_training_data = False + + def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None: + self._inputs = inputs + self._outputs = outputs + self._fitted = False + self._new_training_data = True + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + 
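+        # Refitting only happens when set_training_data() has supplied new data; attribute
+        # and target columns are resolved via the column-selection hyperparams (or passed
+        # through unchanged when use_semantic_types is disabled) before calling sklearn's fit.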
if self._inputs is None or self._outputs is None: + raise ValueError("Missing training data.") + + if not self._new_training_data: + return CallResult(None) + self._new_training_data = False + + self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams) + self._training_outputs, self._target_names, self._target_column_indices = self._get_targets(self._outputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + if len(self._training_indices) > 0 and len(self._target_column_indices) > 0: + self._target_columns_metadata = self._get_target_columns_metadata(self._training_outputs.metadata, self.hyperparams) + sk_training_output = self._training_outputs.values + + shape = sk_training_output.shape + if len(shape) == 2 and shape[1] == 1: + sk_training_output = numpy.ravel(sk_training_output) + self._clf.fit(self._training_inputs, sk_training_output) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + return CallResult(None) + + + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + sk_inputs, columns_to_use = self._get_columns_to_fit(inputs, self.hyperparams) + output = [] + if len(sk_inputs.columns): + try: + sk_output = self._clf.predict(sk_inputs) + except sklearn.exceptions.NotFittedError as error: + raise PrimitiveNotFittedError("Primitive not fitted.") from error + # For primitives that allow predicting without fitting like GaussianProcessRegressor + if not self._fitted: + raise PrimitiveNotFittedError("Primitive not fitted.") + if sparse.issparse(sk_output): + sk_output = sk_output.toarray() + output = self._wrap_predictions(inputs, sk_output) + output.columns = self._target_names + output = [output] + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._target_column_indices, + columns_list=output) + + return CallResult(outputs) + + + def get_params(self) -> Params: + if not self._fitted: + return Params( + estimators_=None, + n_features_=None, + n_outputs_=None, + oob_score_=None, + oob_prediction_=None, + base_estimator_=None, + estimator_params=None, + base_estimator=None, + class_weight=None, + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + return Params( + estimators_=getattr(self._clf, 'estimators_', None), + n_features_=getattr(self._clf, 'n_features_', None), + n_outputs_=getattr(self._clf, 'n_outputs_', None), + oob_score_=getattr(self._clf, 'oob_score_', None), + oob_prediction_=getattr(self._clf, 'oob_prediction_', None), + base_estimator_=getattr(self._clf, 'base_estimator_', None), + estimator_params=getattr(self._clf, 'estimator_params', None), + base_estimator=getattr(self._clf, 'base_estimator', None), + class_weight=getattr(self._clf, 'class_weight', None), + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + 
target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + def set_params(self, *, params: Params) -> None: + self._clf.estimators_ = params['estimators_'] + self._clf.n_features_ = params['n_features_'] + self._clf.n_outputs_ = params['n_outputs_'] + self._clf.oob_score_ = params['oob_score_'] + self._clf.oob_prediction_ = params['oob_prediction_'] + self._clf.base_estimator_ = params['base_estimator_'] + self._clf.estimator_params = params['estimator_params'] + self._clf.base_estimator = params['base_estimator'] + self._clf.class_weight = params['class_weight'] + self._input_column_names = params['input_column_names'] + self._training_indices = params['training_indices_'] + self._target_names = params['target_names_'] + self._target_column_indices = params['target_column_indices_'] + self._target_columns_metadata = params['target_columns_metadata_'] + + if params['estimators_'] is not None: + self._fitted = True + if params['n_features_'] is not None: + self._fitted = True + if params['n_outputs_'] is not None: + self._fitted = True + if params['oob_score_'] is not None: + self._fitted = True + if params['oob_prediction_'] is not None: + self._fitted = True + if params['base_estimator_'] is not None: + self._fitted = True + if params['estimator_params'] is not None: + self._fitted = True + if params['base_estimator'] is not None: + self._fitted = True + if params['class_weight'] is not None: + self._fitted = True + + + + + + def produce_feature_importances(self, *, timeout: float = None, iterations: int = None) -> CallResult[d3m_dataframe]: + output = d3m_dataframe(self._clf.feature_importances_.reshape((1, len(self._input_column_names)))) + output.columns = self._input_column_names + for i in range(len(self._input_column_names)): + output.metadata = output.metadata.update_column(i, {"name": self._input_column_names[i]}) + return CallResult(output) + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=hyperparams['use_inputs_columns'], + exclude_columns=hyperparams['exclude_inputs_columns'], + can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, numpy.integer, numpy.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + @classmethod + def _get_targets(cls, data: 
d3m_dataframe, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return data, list(data.columns), list(range(len(data.columns))) + + metadata = data.metadata + + def can_produce_column(column_index: int) -> bool: + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/TrueTarget") + column_metadata = metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + semantic_types = set(column_metadata.get('semantic_types', [])) + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + return False + + target_column_indices, target_columns_not_to_produce = base_utils.get_columns_to_use(metadata, + use_columns=hyperparams[ + 'use_outputs_columns'], + exclude_columns= + hyperparams[ + 'exclude_outputs_columns'], + can_use_column=can_produce_column) + targets = [] + if target_column_indices: + targets = data.select_columns(target_column_indices) + target_column_names = [] + for idx in target_column_indices: + target_column_names.append(data.columns[idx]) + return targets, target_column_names, target_column_indices + + @classmethod + def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]: + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) + + # Update semantic types and prepare it for predicted targets. + semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set(["https://metadata.datadrivendiscovery.org/types/TrueTarget","https://metadata.datadrivendiscovery.org/types/SuggestedTarget",]) + add_semantic_types = set(["https://metadata.datadrivendiscovery.org/types/PredictedTarget",]) + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + outputs = d3m_dataframe(predictions, generate_metadata=False) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, self._target_columns_metadata) + return outputs + + + @classmethod + def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata): + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict() 
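+            # Build fresh metadata for every predicted-target column: attach the
+            # PredictedTarget semantic type and reuse the original column name, falling
+            # back to a generated "output_{index}" name when none is present.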
+ semantic_types = [] + semantic_types.append('https://metadata.datadrivendiscovery.org/types/PredictedTarget') + column_name = outputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name") + if column_name is None: + column_name = "output_{}".format(column_index) + column_metadata["semantic_types"] = semantic_types + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + +SKRandomForestRegressor.__doc__ = RandomForestRegressor.__doc__ diff --git a/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKRandomTreesEmbedding.py b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKRandomTreesEmbedding.py new file mode 100644 index 0000000..c4f7adf --- /dev/null +++ b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKRandomTreesEmbedding.py @@ -0,0 +1,482 @@ +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os +import sklearn +import numpy +import typing + +# Custom import commands if any +from sklearn.ensemble.forest import RandomTreesEmbedding + + +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult, DockerContainer +from d3m.primitive_interfaces.unsupervised_learning import UnsupervisedLearnerPrimitiveBase + + +Inputs = d3m_dataframe +Outputs = d3m_dataframe + + +class Params(params.Params): + estimators_: Optional[Sequence[sklearn.base.BaseEstimator]] + one_hot_encoder_: Optional[object] + input_column_names: Optional[Any] + target_names_: Optional[Sequence[Any]] + training_indices_: Optional[Sequence[int]] + target_column_indices_: Optional[Sequence[int]] + target_columns_metadata_: Optional[List[OrderedDict]] + + + +class Hyperparams(hyperparams.Hyperparams): + n_estimators = hyperparams.Bounded[int]( + default=10, + lower=1, + upper=None, + description='Number of trees in the forest.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + max_depth = hyperparams.Union( + configuration=OrderedDict({ + 'int': hyperparams.Bounded[int]( + lower=0, + upper=None, + default=5, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'none': hyperparams.Constant( + default=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='int', + description='The maximum depth of each tree. 
If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + min_samples_split = hyperparams.Union( + configuration=OrderedDict({ + 'float': hyperparams.Bounded[float]( + lower=0, + upper=1, + default=1.0, + description='It\'s a percentage and `ceil(min_samples_split * n_samples)` is the minimum number of samples for each split.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'int': hyperparams.Bounded[int]( + lower=0, + upper=None, + default=2, + description='Minimum number.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='int', + description='The minimum number of samples required to split an internal node: - If int, then consider `min_samples_split` as the minimum number. - If float, then `min_samples_split` is a percentage and `ceil(min_samples_split * n_samples)` is the minimum number of samples for each split. .. versionchanged:: 0.18 Added float values for percentages.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + min_samples_leaf = hyperparams.Union( + configuration=OrderedDict({ + 'percent': hyperparams.Bounded[float]( + lower=0, + upper=0.5, + default=0.25, + description='It\'s a percentage and `ceil(min_samples_leaf * n_samples)` is the minimum number of samples for each node.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'absolute': hyperparams.Bounded[int]( + lower=1, + upper=None, + default=1, + description='Minimum number.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='absolute', + description='The minimum number of samples required to be at a leaf node: - If int, then consider `min_samples_leaf` as the minimum number. - If float, then `min_samples_leaf` is a percentage and `ceil(min_samples_leaf * n_samples)` is the minimum number of samples for each node. .. versionchanged:: 0.18 Added float values for percentages.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + min_weight_fraction_leaf = hyperparams.Bounded[float]( + default=0, + lower=0, + upper=0.5, + description='The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + max_leaf_nodes = hyperparams.Union( + configuration=OrderedDict({ + 'int': hyperparams.Bounded[int]( + lower=0, + upper=None, + default=10, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'none': hyperparams.Constant( + default=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='none', + description='Grow trees with ``max_leaf_nodes`` in best-first fashion. Best nodes are defined as relative reduction in impurity. 
If None then unlimited number of leaf nodes.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + min_impurity_split = hyperparams.Union( + configuration=OrderedDict({ + 'float': hyperparams.Bounded[float]( + lower=0, + upper=None, + default=1, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'none': hyperparams.Constant( + default=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='none', + description='Threshold for early stopping in tree growth. A node will split if its impurity is above the threshold, otherwise it is a leaf. .. versionadded:: 0.18', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + min_impurity_decrease = hyperparams.Bounded[float]( + default=0, + lower=0, + upper=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + n_jobs = hyperparams.Union( + configuration=OrderedDict({ + 'limit': hyperparams.Bounded[int]( + default=1, + lower=1, + upper=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'all_cores': hyperparams.Constant( + default=-1, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='limit', + description='The number of jobs to run in parallel for both `fit` and `predict`. If -1, then the number of jobs is set to the number of cores.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ResourcesUseParameter'] + ) + warm_start = hyperparams.UniformBool( + default=False, + description='When set to ``True``, reuse the solution of the previous call to fit and add more estimators to the ensemble, otherwise, just fit a whole new forest.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + use_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to operate on. If any specified column cannot be parsed, it is skipped.", + ) + exclude_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='new', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. 
Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute'], + default='https://metadata.datadrivendiscovery.org/types/Attribute', + description='Decides what semantic type to attach to generated attributes', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + +class SKRandomTreesEmbedding(UnsupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]): + """ + Primitive wrapping for sklearn RandomTreesEmbedding + `sklearn documentation `_ + + """ + + __author__ = "JPL MARVIN" + metadata = metadata_base.PrimitiveMetadata({ + "algorithm_types": [metadata_base.PrimitiveAlgorithmType.RANDOM_FOREST, ], + "name": "sklearn.ensemble.forest.RandomTreesEmbedding", + "primitive_family": metadata_base.PrimitiveFamily.DATA_PREPROCESSING, + "python_path": "d3m.primitives.data_preprocessing.random_trees_embedding.SKlearn", + "source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomTreesEmbedding.html']}, + "version": "2019.11.13", + "id": "8889ff47-1d2e-3a80-bdef-8397a95e1c6e", + "hyperparams_to_tune": ['n_estimators', 'max_depth', 'min_samples_split', 'min_samples_leaf'], + 'installation': [ + {'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + }] + }) + + def __init__(self, *, + hyperparams: Hyperparams, + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None, + _verbose: int = 0) -> None: + + super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) + + # False + self._clf = RandomTreesEmbedding( + n_estimators=self.hyperparams['n_estimators'], + max_depth=self.hyperparams['max_depth'], + min_samples_split=self.hyperparams['min_samples_split'], + min_samples_leaf=self.hyperparams['min_samples_leaf'], + min_weight_fraction_leaf=self.hyperparams['min_weight_fraction_leaf'], + max_leaf_nodes=self.hyperparams['max_leaf_nodes'], + min_impurity_split=self.hyperparams['min_impurity_split'], + min_impurity_decrease=self.hyperparams['min_impurity_decrease'], + n_jobs=self.hyperparams['n_jobs'], + warm_start=self.hyperparams['warm_start'], + random_state=self.random_seed, + verbose=_verbose + ) + + self._inputs = None + self._outputs = None + self._training_inputs = None + self._training_outputs = None + self._target_names = None + self._training_indices = None + self._target_column_indices = None + self._target_columns_metadata: List[OrderedDict] = None + self._input_column_names = None + self._fitted = False + + + def set_training_data(self, *, inputs: Inputs) -> None: + self._inputs = inputs + self._fitted = False + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + if self._fitted: + return 
CallResult(None) + + self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + if self._training_inputs is None: + return CallResult(None) + + if len(self._training_indices) > 0: + self._clf.fit(self._training_inputs) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + return CallResult(None) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + if not self._fitted: + raise PrimitiveNotFittedError("Primitive not fitted.") + sk_inputs = inputs + if self.hyperparams['use_semantic_types']: + sk_inputs = inputs.iloc[:, self._training_indices] + output_columns = [] + if len(self._training_indices) > 0: + sk_output = self._clf.transform(sk_inputs) + if sparse.issparse(sk_output): + sk_output = sk_output.toarray() + outputs = self._wrap_predictions(inputs, sk_output) + if len(outputs.columns) == len(self._input_column_names): + outputs.columns = self._input_column_names + output_columns = [outputs] + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._training_indices, + columns_list=output_columns) + return CallResult(outputs) + + + def get_params(self) -> Params: + if not self._fitted: + return Params( + estimators_=None, + one_hot_encoder_=None, + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + return Params( + estimators_=getattr(self._clf, 'estimators_', None), + one_hot_encoder_=getattr(self._clf, 'one_hot_encoder_', None), + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + def set_params(self, *, params: Params) -> None: + self._clf.estimators_ = params['estimators_'] + self._clf.one_hot_encoder_ = params['one_hot_encoder_'] + self._input_column_names = params['input_column_names'] + self._training_indices = params['training_indices_'] + self._target_names = params['target_names_'] + self._target_column_indices = params['target_column_indices_'] + self._target_columns_metadata = params['target_columns_metadata_'] + + if params['estimators_'] is not None: + self._fitted = True + if params['one_hot_encoder_'] is not None: + self._fitted = True + + + + + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=hyperparams['use_columns'], + exclude_columns=hyperparams['exclude_columns'], + can_use_column=can_produce_column) + 
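+        # Only columns accepted by _can_produce_column (numeric Attribute columns) survive;
+        # the sliced dataframe and the surviving indices are both returned so produce() can
+        # later recombine its output with the columns that were left untouched.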
return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, numpy.integer, numpy.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + + @classmethod + def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]: + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) + + # Update semantic types and prepare it for predicted targets. + semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set([]) + add_semantic_types = [] + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + outputs = d3m_dataframe(predictions, generate_metadata=True) + target_columns_metadata = self._copy_inputs_metadata(inputs.metadata, self._training_indices, outputs.metadata, self.hyperparams) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, target_columns_metadata) + return outputs + + + @classmethod + def _copy_inputs_metadata(cls, inputs_metadata: metadata_base.DataMetadata, input_indices: List[int], + outputs_metadata: metadata_base.DataMetadata, hyperparams): + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + target_columns_metadata: List[OrderedDict] = [] + for column_index in input_indices: + column_name = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name") + if column_name is None: + column_name = "output_{}".format(column_index) + + column_metadata = OrderedDict(inputs_metadata.query_column(column_index)) + semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set([]) + 
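+            # Output columns of the embedding inherit the corresponding input column's
+            # metadata, with the configured return_semantic_type added on top.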
add_semantic_types = set() + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + # If outputs has more columns than index, add Attribute Type to all remaining + if outputs_length > len(input_indices): + for column_index in range(len(input_indices), outputs_length): + column_metadata = OrderedDict() + semantic_types = set() + semantic_types.add(hyperparams["return_semantic_type"]) + column_name = "output_{}".format(column_index) + column_metadata["semantic_types"] = list(semantic_types) + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + +SKRandomTreesEmbedding.__doc__ = RandomTreesEmbedding.__doc__ \ No newline at end of file diff --git a/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKRidge.py b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKRidge.py new file mode 100644 index 0000000..3ca48ef --- /dev/null +++ b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKRidge.py @@ -0,0 +1,444 @@ +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os +import sklearn +import numpy +import typing + +# Custom import commands if any +from sklearn.linear_model.ridge import Ridge + + +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult, DockerContainer + +from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase +from d3m.primitive_interfaces.base import ProbabilisticCompositionalityMixin, ContinueFitMixin +from d3m import exceptions +import pandas + + + +Inputs = d3m_dataframe +Outputs = d3m_dataframe + + +class Params(params.Params): + coef_: Optional[ndarray] + intercept_: Optional[Union[float, ndarray]] + n_iter_: Optional[ndarray] + input_column_names: Optional[Any] + target_names_: Optional[Sequence[Any]] + training_indices_: Optional[Sequence[int]] + target_column_indices_: Optional[Sequence[int]] + target_columns_metadata_: Optional[List[OrderedDict]] + + + +class Hyperparams(hyperparams.Hyperparams): + alpha = hyperparams.Bounded[float]( + lower=0, + upper=None, + default=1, + description='Regularization strength; must be a positive float. Regularization improves the conditioning of the problem and reduces the variance of the estimates. Larger values specify stronger regularization. Alpha corresponds to ``C^-1`` in other linear models such as LogisticRegression or LinearSVC. If an array is passed, penalties are assumed to be specific to the targets. Hence they must correspond in number. copy_X : boolean, optional, default True If True, X will be copied; else, it may be overwritten.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + fit_intercept = hyperparams.UniformBool( + default=True, + description='Whether to calculate the intercept for this model. If set to false, no intercept will be used in calculations (e.g. 
data is expected to be already centered).', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + normalize = hyperparams.UniformBool( + default=False, + description='If True, the regressors X will be normalized before regression. This parameter is ignored when `fit_intercept` is set to False. When the regressors are normalized, note that this makes the hyperparameters learnt more robust and almost independent of the number of samples. The same property is not valid for standardized data. However, if you wish to standardize, please use `preprocessing.StandardScaler` before calling `fit` on an estimator with `normalize=False`.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + max_iter = hyperparams.Union( + configuration=OrderedDict({ + 'int': hyperparams.Bounded[int]( + lower=0, + upper=None, + default=1000, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'none': hyperparams.Constant( + default=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='none', + description='Maximum number of iterations for conjugate gradient solver. For \'sparse_cg\' and \'lsqr\' solvers, the default value is determined by scipy.sparse.linalg. For \'sag\' solver, the default value is 1000.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + tol = hyperparams.Bounded[float]( + default=0.001, + lower=0, + upper=None, + description='Precision of the solution.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + solver = hyperparams.Enumeration[str]( + values=['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'], + default='auto', + description='Solver to use in the computational routines: - \'auto\' chooses the solver automatically based on the type of data. - \'svd\' uses a Singular Value Decomposition of X to compute the Ridge coefficients. More stable for singular matrices than \'cholesky\'. - \'cholesky\' uses the standard scipy.linalg.solve function to obtain a closed-form solution. - \'sparse_cg\' uses the conjugate gradient solver as found in scipy.sparse.linalg.cg. As an iterative algorithm, this solver is more appropriate than \'cholesky\' for large-scale data (possibility to set `tol` and `max_iter`). - \'lsqr\' uses the dedicated regularized least-squares routine scipy.sparse.linalg.lsqr. It is the fastest but may not be available in old scipy versions. It also uses an iterative procedure. - \'sag\' uses a Stochastic Average Gradient descent. It also uses an iterative procedure, and is often faster than other solvers when both n_samples and n_features are large. Note that \'sag\' fast convergence is only guaranteed on features with approximately the same scale. You can preprocess the data with a scaler from sklearn.preprocessing. All last four solvers support both dense and sparse data. However, only \'sag\' supports sparse input when `fit_intercept` is True. .. versionadded:: 0.17 Stochastic Average Gradient descent solver.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + use_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to use as training input. 
If any specified column cannot be parsed, it is skipped.", + ) + use_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to use as training target. If any specified column cannot be parsed, it is skipped.", + ) + exclude_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not use as training inputs. Applicable only if \"use_columns\" is not provided.", + ) + exclude_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not use as training target. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='new', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. 
To prevent pipelines from breaking set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute', 'https://metadata.datadrivendiscovery.org/types/PredictedTarget'], + default='https://metadata.datadrivendiscovery.org/types/PredictedTarget', + description='Decides what semantic type to attach to generated output', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + +class SKRidge(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]): + """ + Primitive wrapping for sklearn Ridge + `sklearn documentation `_ + + """ + + __author__ = "JPL MARVIN" + metadata = metadata_base.PrimitiveMetadata({ + "algorithm_types": [metadata_base.PrimitiveAlgorithmType.TIKHONOV_REGULARIZATION, ], + "name": "sklearn.linear_model.ridge.Ridge", + "primitive_family": metadata_base.PrimitiveFamily.REGRESSION, + "python_path": "d3m.primitives.regression.ridge.SKlearn", + "source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Ridge.html']}, + "version": "2019.11.13", + "id": "2fb16403-8509-3f02-bdbf-9696e2fcad55", + "hyperparams_to_tune": ['alpha', 'max_iter'], + 'installation': [ + {'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + }] + }) + + def __init__(self, *, + hyperparams: Hyperparams, + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None) -> None: + + super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) + + # False + self._clf = Ridge( + alpha=self.hyperparams['alpha'], + fit_intercept=self.hyperparams['fit_intercept'], + normalize=self.hyperparams['normalize'], + max_iter=self.hyperparams['max_iter'], + tol=self.hyperparams['tol'], + solver=self.hyperparams['solver'], + random_state=self.random_seed, + ) + + self._inputs = None + self._outputs = None + self._training_inputs = None + self._training_outputs = None + self._target_names = None + self._training_indices = None + self._target_column_indices = None + self._target_columns_metadata: List[OrderedDict] = None + self._input_column_names = None + self._fitted = False + self._new_training_data = False + + def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None: + self._inputs = inputs + self._outputs = outputs + self._fitted = False + self._new_training_data = True + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + if self._inputs is None or self._outputs is None: + raise ValueError("Missing training data.") + + if not self._new_training_data: + return CallResult(None) + self._new_training_data = False + + self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams) + self._training_outputs, self._target_names, self._target_column_indices = self._get_targets(self._outputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + if len(self._training_indices) > 0 and len(self._target_column_indices) > 0: + self._target_columns_metadata = self._get_target_columns_metadata(self._training_outputs.metadata, 
self.hyperparams) + sk_training_output = self._training_outputs.values + + shape = sk_training_output.shape + if len(shape) == 2 and shape[1] == 1: + sk_training_output = numpy.ravel(sk_training_output) + + self._clf.fit(self._training_inputs, sk_training_output) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + return CallResult(None) + + + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + sk_inputs, columns_to_use = self._get_columns_to_fit(inputs, self.hyperparams) + output = [] + if len(sk_inputs.columns): + try: + sk_output = self._clf.predict(sk_inputs) + except sklearn.exceptions.NotFittedError as error: + raise PrimitiveNotFittedError("Primitive not fitted.") from error + # For primitives that allow predicting without fitting like GaussianProcessRegressor + if not self._fitted: + raise PrimitiveNotFittedError("Primitive not fitted.") + if sparse.issparse(sk_output): + sk_output = sk_output.toarray() + output = self._wrap_predictions(inputs, sk_output) + output.columns = self._target_names + output = [output] + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._target_column_indices, + columns_list=output) + + return CallResult(outputs) + + + def get_params(self) -> Params: + if not self._fitted: + return Params( + coef_=None, + intercept_=None, + n_iter_=None, + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + return Params( + coef_=getattr(self._clf, 'coef_', None), + intercept_=getattr(self._clf, 'intercept_', None), + n_iter_=getattr(self._clf, 'n_iter_', None), + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + def set_params(self, *, params: Params) -> None: + self._clf.coef_ = params['coef_'] + self._clf.intercept_ = params['intercept_'] + self._clf.n_iter_ = params['n_iter_'] + self._input_column_names = params['input_column_names'] + self._training_indices = params['training_indices_'] + self._target_names = params['target_names_'] + self._target_column_indices = params['target_column_indices_'] + self._target_columns_metadata = params['target_columns_metadata_'] + + if params['coef_'] is not None: + self._fitted = True + if params['intercept_'] is not None: + self._fitted = True + if params['n_iter_'] is not None: + self._fitted = True + + + + + + + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + 
use_columns=hyperparams['use_inputs_columns'], + exclude_columns=hyperparams['exclude_inputs_columns'], + can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, numpy.integer, numpy.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + @classmethod + def _get_targets(cls, data: d3m_dataframe, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return data, list(data.columns), list(range(len(data.columns))) + + metadata = data.metadata + + def can_produce_column(column_index: int) -> bool: + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/TrueTarget") + column_metadata = metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + semantic_types = set(column_metadata.get('semantic_types', [])) + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + return False + + target_column_indices, target_columns_not_to_produce = base_utils.get_columns_to_use(metadata, + use_columns=hyperparams[ + 'use_outputs_columns'], + exclude_columns= + hyperparams[ + 'exclude_outputs_columns'], + can_use_column=can_produce_column) + targets = [] + if target_column_indices: + targets = data.select_columns(target_column_indices) + target_column_names = [] + for idx in target_column_indices: + target_column_names.append(data.columns[idx]) + return targets, target_column_names, target_column_indices + + @classmethod + def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]: + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) + + # Update semantic types and prepare it for predicted targets. 
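+ # TrueTarget/SuggestedTarget markers inherited from the training labels are stripped and
+ # replaced with PredictedTarget (plus the configured 'return_semantic_type'), so the produced
+ # columns are labelled as model output rather than ground truth.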
+ semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set(["https://metadata.datadrivendiscovery.org/types/TrueTarget","https://metadata.datadrivendiscovery.org/types/SuggestedTarget",]) + add_semantic_types = set(["https://metadata.datadrivendiscovery.org/types/PredictedTarget",]) + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + outputs = d3m_dataframe(predictions, generate_metadata=False) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, self._target_columns_metadata) + return outputs + + + @classmethod + def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata): + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict() + semantic_types = [] + semantic_types.append('https://metadata.datadrivendiscovery.org/types/PredictedTarget') + column_name = outputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name") + if column_name is None: + column_name = "output_{}".format(column_index) + column_metadata["semantic_types"] = semantic_types + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + +SKRidge.__doc__ = Ridge.__doc__ \ No newline at end of file diff --git a/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKRobustScaler.py b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKRobustScaler.py new file mode 100644 index 0000000..6b98060 --- /dev/null +++ b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKRobustScaler.py @@ -0,0 +1,354 @@ +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os +import sklearn +import numpy +import typing + +# Custom import commands if any +from sklearn.preprocessing.data import RobustScaler + + +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult, DockerContainer +from d3m.primitive_interfaces.unsupervised_learning import UnsupervisedLearnerPrimitiveBase + + +Inputs = d3m_dataframe +Outputs = d3m_dataframe + + +class Params(params.Params): + center_: Optional[ndarray] + scale_: Optional[ndarray] + input_column_names: 
Optional[Any] + target_names_: Optional[Sequence[Any]] + training_indices_: Optional[Sequence[int]] + target_column_indices_: Optional[Sequence[int]] + target_columns_metadata_: Optional[List[OrderedDict]] + + + +class Hyperparams(hyperparams.Hyperparams): + with_centering = hyperparams.UniformBool( + default=True, + description='If True, center the data before scaling. This will cause ``transform`` to raise an exception when attempted on sparse matrices, because centering them entails building a dense matrix which in common use cases is likely to be too large to fit in memory.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + with_scaling = hyperparams.UniformBool( + default=True, + description='If True, scale the data to interquartile range.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + quantile_range = hyperparams.SortedSet( + elements=hyperparams.Uniform(0.0, 100.0, 50.0, lower_inclusive=False, upper_inclusive=False), + default=(25.0, 75.0), + min_size=2, + max_size=2, + description='Default: (25.0, 75.0) = (1st quantile, 3rd quantile) = IQR Quantile range used to calculate ``scale_``. .. versionadded:: 0.18', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + use_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to operate on. If any specified column cannot be parsed, it is skipped.", + ) + exclude_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='new', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. 
To prevent pipelines from breaking set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute'], + default='https://metadata.datadrivendiscovery.org/types/Attribute', + description='Decides what semantic type to attach to generated attributes', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + +class SKRobustScaler(UnsupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]): + """ + Primitive wrapping for sklearn RobustScaler + `sklearn documentation `_ + + """ + + __author__ = "JPL MARVIN" + metadata = metadata_base.PrimitiveMetadata({ + "algorithm_types": [metadata_base.PrimitiveAlgorithmType.FEATURE_SCALING, ], + "name": "sklearn.preprocessing.data.RobustScaler", + "primitive_family": metadata_base.PrimitiveFamily.DATA_PREPROCESSING, + "python_path": "d3m.primitives.data_preprocessing.robust_scaler.SKlearn", + "source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.RobustScaler.html']}, + "version": "2019.11.13", + "id": "854727ed-c82c-3137-ac59-fd52bc9ba385", + 'installation': [ + {'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + }] + }) + + def __init__(self, *, + hyperparams: Hyperparams, + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None) -> None: + + super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) + + # False + self._clf = RobustScaler( + with_centering=self.hyperparams['with_centering'], + with_scaling=self.hyperparams['with_scaling'], + quantile_range=self.hyperparams['quantile_range'], + ) + + self._inputs = None + self._outputs = None + self._training_inputs = None + self._training_outputs = None + self._target_names = None + self._training_indices = None + self._target_column_indices = None + self._target_columns_metadata: List[OrderedDict] = None + self._input_column_names = None + self._fitted = False + + + def set_training_data(self, *, inputs: Inputs) -> None: + self._inputs = inputs + self._fitted = False + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + if self._fitted: + return CallResult(None) + + self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + if self._training_inputs is None: + return CallResult(None) + + if len(self._training_indices) > 0: + self._clf.fit(self._training_inputs) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + return CallResult(None) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + if not self._fitted: + raise PrimitiveNotFittedError("Primitive not fitted.") + sk_inputs = inputs + if self.hyperparams['use_semantic_types']: + sk_inputs = inputs.iloc[:, self._training_indices] + output_columns = [] + if len(self._training_indices) > 0: + sk_output = 
self._clf.transform(sk_inputs) + if sparse.issparse(sk_output): + sk_output = sk_output.toarray() + outputs = self._wrap_predictions(inputs, sk_output) + if len(outputs.columns) == len(self._input_column_names): + outputs.columns = self._input_column_names + output_columns = [outputs] + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._training_indices, + columns_list=output_columns) + return CallResult(outputs) + + + def get_params(self) -> Params: + if not self._fitted: + return Params( + center_=None, + scale_=None, + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + return Params( + center_=getattr(self._clf, 'center_', None), + scale_=getattr(self._clf, 'scale_', None), + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + def set_params(self, *, params: Params) -> None: + self._clf.center_ = params['center_'] + self._clf.scale_ = params['scale_'] + self._input_column_names = params['input_column_names'] + self._training_indices = params['training_indices_'] + self._target_names = params['target_names_'] + self._target_column_indices = params['target_column_indices_'] + self._target_columns_metadata = params['target_columns_metadata_'] + + if params['center_'] is not None: + self._fitted = True + if params['scale_'] is not None: + self._fitted = True + + + + + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=hyperparams['use_columns'], + exclude_columns=hyperparams['exclude_columns'], + can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, numpy.integer, numpy.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + + @classmethod + def 
_get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]: + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) + + # Update semantic types and prepare it for predicted targets. + semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set([]) + add_semantic_types = set() + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + outputs = d3m_dataframe(predictions, generate_metadata=True) + target_columns_metadata = self._copy_inputs_metadata(inputs.metadata, self._training_indices, outputs.metadata, self.hyperparams) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, target_columns_metadata) + return outputs + + + @classmethod + def _copy_inputs_metadata(cls, inputs_metadata: metadata_base.DataMetadata, input_indices: List[int], + outputs_metadata: metadata_base.DataMetadata, hyperparams): + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + target_columns_metadata: List[OrderedDict] = [] + for column_index in input_indices: + column_name = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name") + if column_name is None: + column_name = "output_{}".format(column_index) + + column_metadata = OrderedDict(inputs_metadata.query_column(column_index)) + semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set([]) + add_semantic_types = set() + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + # If outputs has more columns than index, add Attribute Type to all remaining + if outputs_length > len(input_indices): + for column_index in range(len(input_indices), outputs_length): + column_metadata = OrderedDict() + semantic_types = set() + semantic_types.add(hyperparams["return_semantic_type"]) + column_name = "output_{}".format(column_index) + column_metadata["semantic_types"] = list(semantic_types) + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + +SKRobustScaler.__doc__ = RobustScaler.__doc__ \ No newline at end of file diff --git 
a/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKSGDClassifier.py b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKSGDClassifier.py new file mode 100644 index 0000000..e5f0422 --- /dev/null +++ b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKSGDClassifier.py @@ -0,0 +1,661 @@ +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os +import sklearn +import numpy +import typing + +# Custom import commands if any +from sklearn.linear_model.stochastic_gradient import SGDClassifier + + +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult, DockerContainer + +from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase +from d3m.primitive_interfaces.base import ProbabilisticCompositionalityMixin, ContinueFitMixin +from d3m import exceptions +import pandas + + + +Inputs = d3m_dataframe +Outputs = d3m_dataframe + + +class Params(params.Params): + coef_: Optional[ndarray] + intercept_: Optional[ndarray] + n_iter_: Optional[int] + loss_function_: Optional[object] + classes_: Optional[ndarray] + _expanded_class_weight: Optional[ndarray] + t_: Optional[float] + C: Optional[float] + average_coef_: Optional[ndarray] + average_intercept_: Optional[ndarray] + standard_coef_: Optional[ndarray] + standard_intercept_: Optional[ndarray] + input_column_names: Optional[Any] + target_names_: Optional[Sequence[Any]] + training_indices_: Optional[Sequence[int]] + target_column_indices_: Optional[Sequence[int]] + target_columns_metadata_: Optional[List[OrderedDict]] + + + +class Hyperparams(hyperparams.Hyperparams): + loss = hyperparams.Enumeration[str]( + values=['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron', 'squared_loss', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive'], + default='hinge', + description='The loss function to be used. Defaults to \'hinge\', which gives a linear SVM. The possible options are \'hinge\', \'log\', \'modified_huber\', \'squared_hinge\', \'perceptron\', or a regression loss: \'squared_loss\', \'huber\', \'epsilon_insensitive\', or \'squared_epsilon_insensitive\'. The \'log\' loss gives logistic regression, a probabilistic classifier. \'modified_huber\' is another smooth loss that brings tolerance to outliers as well as probability estimates. \'squared_hinge\' is like hinge but is quadratically penalized. \'perceptron\' is the linear loss used by the perceptron algorithm. The other losses are designed for regression but can be useful in classification as well; see SGDRegressor for a description.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + penalty = hyperparams.Enumeration[str]( + values=['l1', 'l2', 'elasticnet', 'none'], + default='l2', + description='The penalty (aka regularization term) to be used. Defaults to \'l2\' which is the standard regularizer for linear SVM models. 
\'l1\' and \'elasticnet\' might bring sparsity to the model (feature selection) not achievable with \'l2\'.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + alpha = hyperparams.Bounded[float]( + lower=0, + upper=None, + default=0.0001, + description='Constant that multiplies the regularization term. Defaults to 0.0001 Also used to compute learning_rate when set to \'optimal\'.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + l1_ratio = hyperparams.Bounded[float]( + lower=0, + upper=1, + default=0.15, + description='The Elastic Net mixing parameter, with 0 <= l1_ratio <= 1. l1_ratio=0 corresponds to L2 penalty, l1_ratio=1 to L1. Defaults to 0.15.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + fit_intercept = hyperparams.UniformBool( + default=True, + description='Whether the intercept should be estimated or not. If False, the data is assumed to be already centered. Defaults to True.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + shuffle = hyperparams.UniformBool( + default=True, + description='Whether or not the training data should be shuffled after each epoch. Defaults to True.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + epsilon = hyperparams.Bounded[float]( + lower=0, + upper=None, + default=0.1, + description='Epsilon in the epsilon-insensitive loss functions; only if `loss` is \'huber\', \'epsilon_insensitive\', or \'squared_epsilon_insensitive\'. For \'huber\', determines the threshold at which it becomes less important to get the prediction exactly right. For epsilon-insensitive, any differences between the current prediction and the correct label are ignored if they are less than this threshold.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + n_jobs = hyperparams.Union( + configuration=OrderedDict({ + 'limit': hyperparams.Bounded[int]( + default=1, + lower=1, + upper=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'all_cores': hyperparams.Constant( + default=-1, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='limit', + description='The number of CPUs to use to do the OVA (One Versus All, for multi-class problems) computation. -1 means \'all CPUs\'. 
Defaults to 1.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ResourcesUseParameter'] + ) + learning_rate = hyperparams.Enumeration[str]( + values=['optimal', 'invscaling', 'constant', 'adaptive'], + default='optimal', + description='The learning rate schedule: - \'constant\': eta = eta0 - \'optimal\': eta = 1.0 / (alpha * (t + t0)) [default] - \'invscaling\': eta = eta0 / pow(t, power_t) where t0 is chosen by a heuristic proposed by Leon Bottou.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + power_t = hyperparams.Bounded[float]( + lower=0, + upper=None, + default=0.5, + description='The exponent for inverse scaling learning rate [default 0.5].', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + warm_start = hyperparams.UniformBool( + default=False, + description='When set to True, reuse the solution of the previous call to fit as initialization, otherwise, just erase the previous solution.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + average = hyperparams.Union( + configuration=OrderedDict({ + 'int': hyperparams.Bounded[int]( + default=2, + lower=2, + upper=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'bool': hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='bool', + description='When set to True, computes the averaged SGD weights and stores the result in the ``coef_`` attribute. If set to an int greater than 1, averaging will begin once the total number of samples seen reaches average. So ``average=10`` will begin averaging after seeing 10 samples.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + eta0 = hyperparams.Bounded[float]( + lower=0, + upper=None, + default=0.0, + description='The initial learning rate for the \'constant\' or \'invscaling\' schedules. The default value is 0.0 as eta0 is not used by the default schedule \'optimal\'.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + max_iter = hyperparams.Union( + configuration=OrderedDict({ + 'int': hyperparams.Bounded[int]( + lower=0, + upper=None, + default=1000, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'none': hyperparams.Constant( + default=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='int', + description='The maximum number of passes over the training data (aka epochs). It only impacts the behavior in the ``fit`` method, and not the `partial_fit`. Defaults to 5. Defaults to 1000 from 0.21, or if tol is not None. .. versionadded:: 0.19', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + tol = hyperparams.Union( + configuration=OrderedDict({ + 'float': hyperparams.Bounded[float]( + lower=0, + upper=None, + default=0.001, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'none': hyperparams.Constant( + default=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='float', + description='The stopping criterion. If it is not None, the iterations will stop when (loss > previous_loss - tol). Defaults to None. Defaults to 1e-3 from 0.21. .. 
versionadded:: 0.19', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + class_weight = hyperparams.Union( + configuration=OrderedDict({ + 'str': hyperparams.Constant( + default='balanced', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'none': hyperparams.Constant( + default=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='none', + description='Preset for the class_weight fit parameter. Weights associated with classes. If not given, all classes are supposed to have weight one. The "balanced" mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as ``n_samples / (n_classes * np.bincount(y))``', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + early_stopping = hyperparams.UniformBool( + default=False, + description='Whether to use early stopping to terminate training when validation score is not improving. If set to True, it will automatically set aside a fraction of training data as validation and terminate training when validation score is not improving by at least tol for n_iter_no_change consecutive epochs.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + validation_fraction = hyperparams.Bounded[float]( + default=0.1, + lower=0, + upper=1, + description='The proportion of training data to set aside as validation set for early stopping. Must be between 0 and 1. Only used if early_stopping is True.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + n_iter_no_change = hyperparams.Bounded[int]( + default=5, + lower=0, + upper=None, + description='Number of iterations with no improvement to wait before early stopping.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + use_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to use as training input. If any specified column cannot be parsed, it is skipped.", + ) + use_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to use as training target. If any specified column cannot be parsed, it is skipped.", + ) + exclude_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not use as training inputs. Applicable only if \"use_columns\" is not provided.", + ) + exclude_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not use as training target.
Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='new', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute', 'https://metadata.datadrivendiscovery.org/types/PredictedTarget'], + default='https://metadata.datadrivendiscovery.org/types/PredictedTarget', + description='Decides what semantic type to attach to generated output', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + +class SKSGDClassifier(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams], + ContinueFitMixin[Inputs, Outputs, Params, Hyperparams]): + """ + Primitive wrapping for sklearn SGDClassifier + `sklearn documentation `_ + + """ + + __author__ = "JPL MARVIN" + metadata = metadata_base.PrimitiveMetadata({ + "algorithm_types": [metadata_base.PrimitiveAlgorithmType.GRADIENT_DESCENT, ], + "name": "sklearn.linear_model.stochastic_gradient.SGDClassifier", + "primitive_family": metadata_base.PrimitiveFamily.CLASSIFICATION, + "python_path": "d3m.primitives.classification.sgd.SKlearn", + "source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDClassifier.html']}, + "version": "2019.11.13", + "id": "2305e400-131e-356d-bf77-e8db19517b7a", + "hyperparams_to_tune": ['max_iter', 'penalty', 'alpha'], + 'installation': [ + {'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + }] + }) + + def __init__(self, *, + hyperparams: Hyperparams, + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None, + _verbose: int = 0) -> None: + + super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) + + # False + self._clf = SGDClassifier( + loss=self.hyperparams['loss'], + penalty=self.hyperparams['penalty'], + 
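+ # Hyperparameter values are passed straight through to sklearn's SGDClassifier; the
+ # primitive's random_seed is forwarded as random_state and _verbose as verbose (see below).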
alpha=self.hyperparams['alpha'], + l1_ratio=self.hyperparams['l1_ratio'], + fit_intercept=self.hyperparams['fit_intercept'], + shuffle=self.hyperparams['shuffle'], + epsilon=self.hyperparams['epsilon'], + n_jobs=self.hyperparams['n_jobs'], + learning_rate=self.hyperparams['learning_rate'], + power_t=self.hyperparams['power_t'], + warm_start=self.hyperparams['warm_start'], + average=self.hyperparams['average'], + eta0=self.hyperparams['eta0'], + max_iter=self.hyperparams['max_iter'], + tol=self.hyperparams['tol'], + class_weight=self.hyperparams['class_weight'], + early_stopping=self.hyperparams['early_stopping'], + validation_fraction=self.hyperparams['validation_fraction'], + n_iter_no_change=self.hyperparams['n_iter_no_change'], + verbose=_verbose, + random_state=self.random_seed, + ) + + self._inputs = None + self._outputs = None + self._training_inputs = None + self._training_outputs = None + self._target_names = None + self._training_indices = None + self._target_column_indices = None + self._target_columns_metadata: List[OrderedDict] = None + self._input_column_names = None + self._fitted = False + self._new_training_data = False + + def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None: + self._inputs = inputs + self._outputs = outputs + self._fitted = False + self._new_training_data = True + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + if self._inputs is None or self._outputs is None: + raise ValueError("Missing training data.") + + if not self._new_training_data: + return CallResult(None) + self._new_training_data = False + + self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams) + self._training_outputs, self._target_names, self._target_column_indices = self._get_targets(self._outputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + if len(self._training_indices) > 0 and len(self._target_column_indices) > 0: + self._target_columns_metadata = self._get_target_columns_metadata(self._training_outputs.metadata, self.hyperparams) + sk_training_output = self._training_outputs.values + + shape = sk_training_output.shape + if len(shape) == 2 and shape[1] == 1: + sk_training_output = numpy.ravel(sk_training_output) + + self._clf.fit(self._training_inputs, sk_training_output) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + return CallResult(None) + + def continue_fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + if self._training_inputs is None or self._training_outputs is None: + raise ValueError("Missing training data.") + + if not self._new_training_data: + return CallResult(None) + self._new_training_data = False + + if len(self._training_indices) > 0 and len(self._target_column_indices) > 0: + self._target_columns_metadata = self._get_target_columns_metadata(self._training_outputs.metadata, self.hyperparams) + sk_training_output = self._training_outputs.values + + shape = sk_training_output.shape + if len(shape) == 2 and shape[1] == 1: + sk_training_output = numpy.ravel(sk_training_output) + + self._clf.partial_fit(self._training_inputs, sk_training_output) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + return CallResult(None) + + + def 
produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + sk_inputs, columns_to_use = self._get_columns_to_fit(inputs, self.hyperparams) + output = [] + if len(sk_inputs.columns): + try: + sk_output = self._clf.predict(sk_inputs) + except sklearn.exceptions.NotFittedError as error: + raise PrimitiveNotFittedError("Primitive not fitted.") from error + # For primitives that allow predicting without fitting like GaussianProcessRegressor + if not self._fitted: + raise PrimitiveNotFittedError("Primitive not fitted.") + if sparse.issparse(sk_output): + sk_output = sk_output.toarray() + output = self._wrap_predictions(inputs, sk_output) + output.columns = self._target_names + output = [output] + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._target_column_indices, + columns_list=output) + + return CallResult(outputs) + + + def get_params(self) -> Params: + if not self._fitted: + return Params( + coef_=None, + intercept_=None, + n_iter_=None, + loss_function_=None, + classes_=None, + _expanded_class_weight=None, + t_=None, + C=None, + average_coef_=None, + average_intercept_=None, + standard_coef_=None, + standard_intercept_=None, + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + return Params( + coef_=getattr(self._clf, 'coef_', None), + intercept_=getattr(self._clf, 'intercept_', None), + n_iter_=getattr(self._clf, 'n_iter_', None), + loss_function_=getattr(self._clf, 'loss_function_', None), + classes_=getattr(self._clf, 'classes_', None), + _expanded_class_weight=getattr(self._clf, '_expanded_class_weight', None), + t_=getattr(self._clf, 't_', None), + C=getattr(self._clf, 'C', None), + average_coef_=getattr(self._clf, 'average_coef_', None), + average_intercept_=getattr(self._clf, 'average_intercept_', None), + standard_coef_=getattr(self._clf, 'standard_coef_', None), + standard_intercept_=getattr(self._clf, 'standard_intercept_', None), + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + def set_params(self, *, params: Params) -> None: + self._clf.coef_ = params['coef_'] + self._clf.intercept_ = params['intercept_'] + self._clf.n_iter_ = params['n_iter_'] + self._clf.loss_function_ = params['loss_function_'] + self._clf.classes_ = params['classes_'] + self._clf._expanded_class_weight = params['_expanded_class_weight'] + self._clf.t_ = params['t_'] + self._clf.C = params['C'] + self._clf.average_coef_ = params['average_coef_'] + self._clf.average_intercept_ = params['average_intercept_'] + self._clf.standard_coef_ = params['standard_coef_'] + self._clf.standard_intercept_ = params['standard_intercept_'] + self._input_column_names = params['input_column_names'] + self._training_indices = params['training_indices_'] + self._target_names = params['target_names_'] + self._target_column_indices = params['target_column_indices_'] + self._target_columns_metadata = 
params['target_columns_metadata_'] + + if params['coef_'] is not None: + self._fitted = True + if params['intercept_'] is not None: + self._fitted = True + if params['n_iter_'] is not None: + self._fitted = True + if params['loss_function_'] is not None: + self._fitted = True + if params['classes_'] is not None: + self._fitted = True + if params['_expanded_class_weight'] is not None: + self._fitted = True + if params['t_'] is not None: + self._fitted = True + if params['C'] is not None: + self._fitted = True + if params['average_coef_'] is not None: + self._fitted = True + if params['average_intercept_'] is not None: + self._fitted = True + if params['standard_coef_'] is not None: + self._fitted = True + if params['standard_intercept_'] is not None: + self._fitted = True + + + + + + + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=hyperparams['use_inputs_columns'], + exclude_columns=hyperparams['exclude_inputs_columns'], + can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, numpy.integer, numpy.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + @classmethod + def _get_targets(cls, data: d3m_dataframe, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return data, list(data.columns), list(range(len(data.columns))) + + metadata = data.metadata + + def can_produce_column(column_index: int) -> bool: + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/TrueTarget") + column_metadata = metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + semantic_types = set(column_metadata.get('semantic_types', [])) + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + return False + + target_column_indices, target_columns_not_to_produce = base_utils.get_columns_to_use(metadata, + use_columns=hyperparams[ + 'use_outputs_columns'], + exclude_columns= + hyperparams[ + 'exclude_outputs_columns'], + can_use_column=can_produce_column) + targets = [] + if target_column_indices: + targets = data.select_columns(target_column_indices) + 
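+ # Collect the display names of the selected target columns so produce() can re-label the
+ # prediction DataFrame with the original target names.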
target_column_names = [] + for idx in target_column_indices: + target_column_names.append(data.columns[idx]) + return targets, target_column_names, target_column_indices + + @classmethod + def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]: + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) + + # Update semantic types and prepare it for predicted targets. + semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set(["https://metadata.datadrivendiscovery.org/types/TrueTarget","https://metadata.datadrivendiscovery.org/types/SuggestedTarget",]) + add_semantic_types = set(["https://metadata.datadrivendiscovery.org/types/PredictedTarget",]) + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + outputs = d3m_dataframe(predictions, generate_metadata=False) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, self._target_columns_metadata) + return outputs + + + @classmethod + def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata): + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict() + semantic_types = [] + semantic_types.append('https://metadata.datadrivendiscovery.org/types/PredictedTarget') + column_name = outputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name") + if column_name is None: + column_name = "output_{}".format(column_index) + column_metadata["semantic_types"] = semantic_types + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + +SKSGDClassifier.__doc__ = SGDClassifier.__doc__ \ No newline at end of file diff --git a/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKSGDRegressor.py b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKSGDRegressor.py new file mode 100644 index 0000000..a6361ef --- /dev/null +++ b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKSGDRegressor.py @@ -0,0 +1,643 @@ +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os +import sklearn +import numpy +import typing + +# Custom import commands if any +from 
sklearn.linear_model.stochastic_gradient import SGDRegressor + + +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult, DockerContainer + +from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase +from d3m.primitive_interfaces.base import ProbabilisticCompositionalityMixin, ContinueFitMixin +from d3m import exceptions +import pandas + + + +Inputs = d3m_dataframe +Outputs = d3m_dataframe + + +class Params(params.Params): + coef_: Optional[ndarray] + intercept_: Optional[ndarray] + average_coef_: Optional[ndarray] + average_intercept_: Optional[ndarray] + t_: Optional[float] + n_iter_: Optional[int] + C: Optional[float] + standard_coef_: Optional[ndarray] + standard_intercept_: Optional[ndarray] + input_column_names: Optional[Any] + target_names_: Optional[Sequence[Any]] + training_indices_: Optional[Sequence[int]] + target_column_indices_: Optional[Sequence[int]] + target_columns_metadata_: Optional[List[OrderedDict]] + + + +class Hyperparams(hyperparams.Hyperparams): + loss = hyperparams.Choice( + choices={ + 'squared_loss': hyperparams.Hyperparams.define( + configuration=OrderedDict({}) + ), + 'huber': hyperparams.Hyperparams.define( + configuration=OrderedDict({ + 'epsilon': hyperparams.Bounded[float]( + default=0.1, + lower=0, + upper=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + }) + ), + 'epsilon_insensitive': hyperparams.Hyperparams.define( + configuration=OrderedDict({ + 'epsilon': hyperparams.Bounded[float]( + default=0.1, + lower=0, + upper=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + }) + ), + 'squared_epsilon_insensitive': hyperparams.Hyperparams.define( + configuration=OrderedDict({ + 'epsilon': hyperparams.Bounded[float]( + default=0.1, + lower=0, + upper=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + }) + ) + }, + default='squared_loss', + description='The loss function to be used. Defaults to \'squared_loss\' which refers to the ordinary least squares fit. \'huber\' modifies \'squared_loss\' to focus less on getting outliers correct by switching from squared to linear loss past a distance of epsilon. \'epsilon_insensitive\' ignores errors less than epsilon and is linear past that; this is the loss function used in SVR. \'squared_epsilon_insensitive\' is the same but becomes squared loss past a tolerance of epsilon.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + penalty = hyperparams.Union( + configuration=OrderedDict({ + 'str': hyperparams.Enumeration[str]( + values=['l1', 'l2', 'elasticnet'], + default='l2', + description='The penalty (aka regularization term) to be used. Defaults to \'l2\' which is the standard regularizer for linear SVM models. 
\'l1\' and \'elasticnet\' might bring sparsity to the model (feature selection) not achievable with \'l2\'.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'none': hyperparams.Constant( + default=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='str', + description='The penalty (aka regularization term) to be used. Defaults to \'l2\' which is the standard regularizer for linear SVM models. \'l1\' and \'elasticnet\' might bring sparsity to the model (feature selection) not achievable with \'l2\'.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + alpha = hyperparams.Bounded[float]( + lower=0, + upper=None, + default=0.0001, + description='Constant that multiplies the regularization term. Defaults to 0.0001 Also used to compute learning_rate when set to \'optimal\'. l1_ratio : float The Elastic Net mixing parameter, with 0 <= l1_ratio <= 1. l1_ratio=0 corresponds to L2 penalty, l1_ratio=1 to L1. Defaults to 0.15.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + l1_ratio = hyperparams.Bounded[float]( + lower=0, + upper=1, + default=0.15, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + fit_intercept = hyperparams.UniformBool( + default=True, + description='Whether the intercept should be estimated or not. If False, the data is assumed to be already centered. Defaults to True.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + max_iter = hyperparams.Union( + configuration=OrderedDict({ + 'int': hyperparams.Bounded[int]( + lower=0, + upper=None, + default=1000, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'none': hyperparams.Constant( + default=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='int', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + tol = hyperparams.Union( + configuration=OrderedDict({ + 'float': hyperparams.Bounded[float]( + lower=0, + upper=None, + default=0.001, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'none': hyperparams.Constant( + default=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='float', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + shuffle = hyperparams.UniformBool( + default=True, + description='Whether or not the training data should be shuffled after each epoch. Defaults to True.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + learning_rate = hyperparams.Enumeration[str]( + values=['optimal', 'invscaling', 'constant', 'adaptive'], + default='invscaling', + description='The learning rate schedule: - \'constant\': eta = eta0 - \'optimal\': eta = 1.0 / (alpha * (t + t0)) [default] - \'invscaling\': eta = eta0 / pow(t, power_t) where t0 is chosen by a heuristic proposed by Leon Bottou. 
eta0 : double, optional The initial learning rate [default 0.01].', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + eta0 = hyperparams.Bounded[float]( + lower=0, + upper=None, + default=0.01, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + power_t = hyperparams.Bounded[float]( + lower=0, + upper=None, + default=0.25, + description='The exponent for inverse scaling learning rate [default 0.25].', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + warm_start = hyperparams.UniformBool( + default=False, + description='When set to True, reuse the solution of the previous call to fit as initialization, otherwise, just erase the previous solution.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + average = hyperparams.Union( + configuration=OrderedDict({ + 'int': hyperparams.Bounded[int]( + default=2, + lower=2, + upper=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'bool': hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='bool', + description='When set to True, computes the averaged SGD weights and stores the result in the ``coef_`` attribute. If set to an int greater than 1, averaging will begin once the total number of samples seen reaches average. So ``average=10`` will begin averaging after seeing 10 samples.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + early_stopping = hyperparams.UniformBool( + default=False, + description='Whether to use early stopping to terminate training when validation score is not improving. If set to True, it will automatically set asid a fraction of training data as validation and terminate training whe validation score is not improving by at least tol fo n_iter_no_change consecutive epochs.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + validation_fraction = hyperparams.Bounded[float]( + default=0.1, + lower=0, + upper=1, + description='The proportion of training data to set aside as validation set for early stopping. Must be between 0 and 1. Only used if early_stopping is True.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + n_iter_no_change = hyperparams.Bounded[int]( + default=5, + lower=0, + upper=None, + description='Number of iterations with no improvement to wait before early stopping.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + use_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to use as training input. If any specified column cannot be parsed, it is skipped.", + ) + use_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to use as training target. 
If any specified column cannot be parsed, it is skipped.", + ) + exclude_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not use as training inputs. Applicable only if \"use_columns\" is not provided.", + ) + exclude_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not use as training target. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='new', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. 
To prevent pipelines from breaking set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute', 'https://metadata.datadrivendiscovery.org/types/PredictedTarget'], + default='https://metadata.datadrivendiscovery.org/types/PredictedTarget', + description='Decides what semantic type to attach to generated output', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + +class SKSGDRegressor(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams], + ContinueFitMixin[Inputs, Outputs, Params, Hyperparams]): + """ + Primitive wrapping for sklearn SGDRegressor + `sklearn documentation `_ + + """ + + __author__ = "JPL MARVIN" + metadata = metadata_base.PrimitiveMetadata({ + "algorithm_types": [metadata_base.PrimitiveAlgorithmType.GRADIENT_DESCENT, ], + "name": "sklearn.linear_model.stochastic_gradient.SGDRegressor", + "primitive_family": metadata_base.PrimitiveFamily.REGRESSION, + "python_path": "d3m.primitives.regression.sgd.SKlearn", + "source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDRegressor.html']}, + "version": "2019.11.13", + "id": "db3a7669-72e1-3c95-91c1-0c2a3f137d78", + "hyperparams_to_tune": ['max_iter', 'penalty', 'alpha'], + 'installation': [ + {'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + }] + }) + + def __init__(self, *, + hyperparams: Hyperparams, + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None, + _verbose: int = 0) -> None: + + super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) + + # False + self._clf = SGDRegressor( + loss=self.hyperparams['loss']['choice'], + epsilon=self.hyperparams['loss'].get('epsilon', 0.1), + penalty=self.hyperparams['penalty'], + alpha=self.hyperparams['alpha'], + l1_ratio=self.hyperparams['l1_ratio'], + fit_intercept=self.hyperparams['fit_intercept'], + max_iter=self.hyperparams['max_iter'], + tol=self.hyperparams['tol'], + shuffle=self.hyperparams['shuffle'], + learning_rate=self.hyperparams['learning_rate'], + eta0=self.hyperparams['eta0'], + power_t=self.hyperparams['power_t'], + warm_start=self.hyperparams['warm_start'], + average=self.hyperparams['average'], + early_stopping=self.hyperparams['early_stopping'], + validation_fraction=self.hyperparams['validation_fraction'], + n_iter_no_change=self.hyperparams['n_iter_no_change'], + verbose=_verbose, + random_state=self.random_seed, + ) + + self._inputs = None + self._outputs = None + self._training_inputs = None + self._training_outputs = None + self._target_names = None + self._training_indices = None + self._target_column_indices = None + self._target_columns_metadata: List[OrderedDict] = None + self._input_column_names = None + self._fitted = False + self._new_training_data = False + + def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None: + self._inputs = inputs + self._outputs = outputs + self._fitted = False + self._new_training_data = True + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + if 
self._inputs is None or self._outputs is None: + raise ValueError("Missing training data.") + + if not self._new_training_data: + return CallResult(None) + self._new_training_data = False + + self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams) + self._training_outputs, self._target_names, self._target_column_indices = self._get_targets(self._outputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + if len(self._training_indices) > 0 and len(self._target_column_indices) > 0: + self._target_columns_metadata = self._get_target_columns_metadata(self._training_outputs.metadata, self.hyperparams) + sk_training_output = self._training_outputs.values + + shape = sk_training_output.shape + if len(shape) == 2 and shape[1] == 1: + sk_training_output = numpy.ravel(sk_training_output) + + self._clf.fit(self._training_inputs, sk_training_output) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + return CallResult(None) + + def continue_fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + if self._training_inputs is None or self._training_outputs is None: + raise ValueError("Missing training data.") + + if not self._new_training_data: + return CallResult(None) + self._new_training_data = False + + if len(self._training_indices) > 0 and len(self._target_column_indices) > 0: + self._target_columns_metadata = self._get_target_columns_metadata(self._training_outputs.metadata, self.hyperparams) + sk_training_output = self._training_outputs.values + + shape = sk_training_output.shape + if len(shape) == 2 and shape[1] == 1: + sk_training_output = numpy.ravel(sk_training_output) + + self._clf.partial_fit(self._training_inputs, sk_training_output) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + return CallResult(None) + + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + sk_inputs, columns_to_use = self._get_columns_to_fit(inputs, self.hyperparams) + output = [] + if len(sk_inputs.columns): + try: + sk_output = self._clf.predict(sk_inputs) + except sklearn.exceptions.NotFittedError as error: + raise PrimitiveNotFittedError("Primitive not fitted.") from error + # For primitives that allow predicting without fitting like GaussianProcessRegressor + if not self._fitted: + raise PrimitiveNotFittedError("Primitive not fitted.") + if sparse.issparse(sk_output): + sk_output = sk_output.toarray() + output = self._wrap_predictions(inputs, sk_output) + output.columns = self._target_names + output = [output] + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._target_column_indices, + columns_list=output) + + return CallResult(outputs) + + + def get_params(self) -> Params: + if not self._fitted: + return Params( + coef_=None, + intercept_=None, + average_coef_=None, + average_intercept_=None, + t_=None, + n_iter_=None, + C=None, + standard_coef_=None, + standard_intercept_=None, + 
input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + return Params( + coef_=getattr(self._clf, 'coef_', None), + intercept_=getattr(self._clf, 'intercept_', None), + average_coef_=getattr(self._clf, 'average_coef_', None), + average_intercept_=getattr(self._clf, 'average_intercept_', None), + t_=getattr(self._clf, 't_', None), + n_iter_=getattr(self._clf, 'n_iter_', None), + C=getattr(self._clf, 'C', None), + standard_coef_=getattr(self._clf, 'standard_coef_', None), + standard_intercept_=getattr(self._clf, 'standard_intercept_', None), + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + def set_params(self, *, params: Params) -> None: + self._clf.coef_ = params['coef_'] + self._clf.intercept_ = params['intercept_'] + self._clf.average_coef_ = params['average_coef_'] + self._clf.average_intercept_ = params['average_intercept_'] + self._clf.t_ = params['t_'] + self._clf.n_iter_ = params['n_iter_'] + self._clf.C = params['C'] + self._clf.standard_coef_ = params['standard_coef_'] + self._clf.standard_intercept_ = params['standard_intercept_'] + self._input_column_names = params['input_column_names'] + self._training_indices = params['training_indices_'] + self._target_names = params['target_names_'] + self._target_column_indices = params['target_column_indices_'] + self._target_columns_metadata = params['target_columns_metadata_'] + + if params['coef_'] is not None: + self._fitted = True + if params['intercept_'] is not None: + self._fitted = True + if params['average_coef_'] is not None: + self._fitted = True + if params['average_intercept_'] is not None: + self._fitted = True + if params['t_'] is not None: + self._fitted = True + if params['n_iter_'] is not None: + self._fitted = True + if params['C'] is not None: + self._fitted = True + if params['standard_coef_'] is not None: + self._fitted = True + if params['standard_intercept_'] is not None: + self._fitted = True + + + + + + + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=hyperparams['use_inputs_columns'], + exclude_columns=hyperparams['exclude_inputs_columns'], + can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, numpy.integer, numpy.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = 
set(column_metadata.get('semantic_types', [])) + + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + @classmethod + def _get_targets(cls, data: d3m_dataframe, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return data, list(data.columns), list(range(len(data.columns))) + + metadata = data.metadata + + def can_produce_column(column_index: int) -> bool: + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/TrueTarget") + column_metadata = metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + semantic_types = set(column_metadata.get('semantic_types', [])) + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + return False + + target_column_indices, target_columns_not_to_produce = base_utils.get_columns_to_use(metadata, + use_columns=hyperparams[ + 'use_outputs_columns'], + exclude_columns= + hyperparams[ + 'exclude_outputs_columns'], + can_use_column=can_produce_column) + targets = [] + if target_column_indices: + targets = data.select_columns(target_column_indices) + target_column_names = [] + for idx in target_column_indices: + target_column_names.append(data.columns[idx]) + return targets, target_column_names, target_column_indices + + @classmethod + def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]: + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) + + # Update semantic types and prepare it for predicted targets. 
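+            # Drop the ground-truth markers (TrueTarget, SuggestedTarget) and add PredictedTarget plus the configured return_semantic_type, so downstream primitives treat these columns as predictions.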
+ semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set(["https://metadata.datadrivendiscovery.org/types/TrueTarget","https://metadata.datadrivendiscovery.org/types/SuggestedTarget",]) + add_semantic_types = set(["https://metadata.datadrivendiscovery.org/types/PredictedTarget",]) + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + outputs = d3m_dataframe(predictions, generate_metadata=False) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, self._target_columns_metadata) + return outputs + + + @classmethod + def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata): + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict() + semantic_types = [] + semantic_types.append('https://metadata.datadrivendiscovery.org/types/PredictedTarget') + column_name = outputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name") + if column_name is None: + column_name = "output_{}".format(column_index) + column_metadata["semantic_types"] = semantic_types + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + +SKSGDRegressor.__doc__ = SGDRegressor.__doc__ \ No newline at end of file diff --git a/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKSVC.py b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKSVC.py new file mode 100644 index 0000000..c8f60e5 --- /dev/null +++ b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKSVC.py @@ -0,0 +1,635 @@ +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os +import sklearn +import numpy +import typing + +# Custom import commands if any +from sklearn.svm.classes import SVC + + +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult, DockerContainer + +from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase +from d3m.primitive_interfaces.base import ProbabilisticCompositionalityMixin, ContinueFitMixin +from d3m import exceptions +import pandas + + + +Inputs = d3m_dataframe +Outputs = d3m_dataframe + + 
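+# Params captures the fitted state of the underlying sklearn SVC (support vectors, dual coefficients, probability calibration values, class weights, ...) together with the column bookkeeping needed to restore a fitted primitive via get_params()/set_params().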
+class Params(params.Params): + support_: Optional[ndarray] + support_vectors_: Optional[ndarray] + n_support_: Optional[ndarray] + dual_coef_: Optional[ndarray] + intercept_: Optional[ndarray] + _sparse: Optional[bool] + shape_fit_: Optional[tuple] + _dual_coef_: Optional[ndarray] + _intercept_: Optional[ndarray] + probA_: Optional[ndarray] + probB_: Optional[ndarray] + _gamma: Optional[float] + classes_: Optional[ndarray] + class_weight_: Optional[ndarray] + fit_status_: Optional[int] + epsilon: Optional[float] + nu: Optional[float] + input_column_names: Optional[Any] + target_names_: Optional[Sequence[Any]] + training_indices_: Optional[Sequence[int]] + target_column_indices_: Optional[Sequence[int]] + target_columns_metadata_: Optional[List[OrderedDict]] + + + +class Hyperparams(hyperparams.Hyperparams): + C = hyperparams.Bounded[float]( + default=1, + lower=0, + upper=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + description='Penalty parameter C of the error term.' + ) + kernel = hyperparams.Choice( + choices={ + 'linear': hyperparams.Hyperparams.define( + configuration=OrderedDict({}) + ), + 'poly': hyperparams.Hyperparams.define( + configuration=OrderedDict({ + 'degree': hyperparams.Bounded[int]( + default=3, + lower=0, + upper=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ), + 'gamma': hyperparams.Union( + configuration=OrderedDict({ + 'float': hyperparams.Bounded[float]( + default=0.1, + lower=0, + upper=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'auto': hyperparams.Constant( + default='auto', + description='1/n_features will be used.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='auto', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ), + 'coef0': hyperparams.Constant( + default=0, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + }) + ), + 'rbf': hyperparams.Hyperparams.define( + configuration=OrderedDict({ + 'gamma': hyperparams.Union( + configuration=OrderedDict({ + 'float': hyperparams.Bounded[float]( + default=0.1, + lower=0, + upper=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'auto': hyperparams.Constant( + default='auto', + description='1/n_features will be used.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='auto', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + }) + ), + 'sigmoid': hyperparams.Hyperparams.define( + configuration=OrderedDict({ + 'gamma': hyperparams.Union( + configuration=OrderedDict({ + 'float': hyperparams.Bounded[float]( + default=0.1, + lower=0, + upper=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'auto': hyperparams.Constant( + default='auto', + description='1/n_features will be used.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='auto', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ), + 'coef0': hyperparams.Constant( + default=0, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + }) + ) + }, + default='rbf', + description='Specifies the kernel type to be used in the algorithm. 
It must be one of \'linear\', \'poly\', \'rbf\', \'sigmoid\', \'precomputed\' or a callable. If none is given, \'rbf\' will be used. If a callable is given it is used to pre-compute the kernel matrix from data matrices; that matrix should be an array of shape ``(n_samples, n_samples)``.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + probability = hyperparams.UniformBool( + default=False, + description='Whether to enable probability estimates. This must be enabled prior to calling `fit`, and will slow down that method.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + shrinking = hyperparams.UniformBool( + default=True, + description='Whether to use the shrinking heuristic.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + tol = hyperparams.Bounded[float]( + default=0.001, + lower=0, + upper=None, + description='Tolerance for stopping criterion.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + cache_size = hyperparams.Bounded[float]( + default=200, + lower=0, + upper=None, + description='Specify the size of the kernel cache (in MB).', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ResourcesUseParameter'] + ) + class_weight = hyperparams.Union( + configuration=OrderedDict({ + 'str': hyperparams.Constant( + default='balanced', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'none': hyperparams.Constant( + default=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='none', + description='Set the parameter C of class i to class_weight[i]*C for SVC. If not given, all classes are supposed to have weight one. The "balanced" mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as ``n_samples / (n_classes * np.bincount(y))``', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + max_iter = hyperparams.Bounded[int]( + default=-1, + lower=-1, + upper=None, + description='Hard limit on iterations within solver, or -1 for no limit.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + decision_function_shape = hyperparams.Enumeration[str]( + values=['ovr', 'ovo'], + default='ovr', + description='Whether to return a one-vs-rest (\'ovr\') decision function of shape (n_samples, n_classes) as all other classifiers, or the original one-vs-one (\'ovo\') decision function of libsvm which has shape (n_samples, n_classes * (n_classes - 1) / 2). The default of None will currently behave as \'ovo\' for backward compatibility and raise a deprecation warning, but will change \'ovr\' in 0.19. .. versionadded:: 0.17 *decision_function_shape=\'ovr\'* is recommended. .. versionchanged:: 0.17 Deprecated *decision_function_shape=\'ovo\' and None*.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + use_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to use as training input. 
If any specified column cannot be parsed, it is skipped.", + ) + use_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to use as training target. If any specified column cannot be parsed, it is skipped.", + ) + exclude_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not use as training inputs. Applicable only if \"use_columns\" is not provided.", + ) + exclude_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not use as training target. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='new', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. 
To prevent pipelines from breaking set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute', 'https://metadata.datadrivendiscovery.org/types/PredictedTarget'], + default='https://metadata.datadrivendiscovery.org/types/PredictedTarget', + description='Decides what semantic type to attach to generated output', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + +class SKSVC(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]): + """ + Primitive wrapping for sklearn SVC + `sklearn documentation `_ + + """ + + __author__ = "JPL MARVIN" + metadata = metadata_base.PrimitiveMetadata({ + "algorithm_types": [metadata_base.PrimitiveAlgorithmType.SUPPORT_VECTOR_MACHINE, ], + "name": "sklearn.svm.classes.SVC", + "primitive_family": metadata_base.PrimitiveFamily.CLASSIFICATION, + "python_path": "d3m.primitives.classification.svc.SKlearn", + "source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html']}, + "version": "2019.11.13", + "id": "0ae7d42d-f765-3348-a28c-57d94880aa6a", + "hyperparams_to_tune": ['C', 'kernel'], + 'installation': [ + {'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + }] + }) + + def __init__(self, *, + hyperparams: Hyperparams, + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None, + _verbose: int = 0) -> None: + + super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) + + # False + self._clf = SVC( + C=self.hyperparams['C'], + kernel=self.hyperparams['kernel']['choice'], + degree=self.hyperparams['kernel'].get('degree', 3), + gamma=self.hyperparams['kernel'].get('gamma', 'auto'), + coef0=self.hyperparams['kernel'].get('coef0', 0), + probability=self.hyperparams['probability'], + shrinking=self.hyperparams['shrinking'], + tol=self.hyperparams['tol'], + cache_size=self.hyperparams['cache_size'], + class_weight=self.hyperparams['class_weight'], + max_iter=self.hyperparams['max_iter'], + decision_function_shape=self.hyperparams['decision_function_shape'], + verbose=_verbose, + random_state=self.random_seed, + ) + + self._inputs = None + self._outputs = None + self._training_inputs = None + self._training_outputs = None + self._target_names = None + self._training_indices = None + self._target_column_indices = None + self._target_columns_metadata: List[OrderedDict] = None + self._input_column_names = None + self._fitted = False + self._new_training_data = False + + def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None: + self._inputs = inputs + self._outputs = outputs + self._fitted = False + self._new_training_data = True + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + if self._inputs is None or self._outputs is None: + raise ValueError("Missing training data.") + + if not self._new_training_data: + return CallResult(None) + self._new_training_data = False + + self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams) + self._training_outputs, 
self._target_names, self._target_column_indices = self._get_targets(self._outputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + if len(self._training_indices) > 0 and len(self._target_column_indices) > 0: + self._target_columns_metadata = self._get_target_columns_metadata(self._training_outputs.metadata, self.hyperparams) + sk_training_output = self._training_outputs.values + + shape = sk_training_output.shape + if len(shape) == 2 and shape[1] == 1: + sk_training_output = numpy.ravel(sk_training_output) + + self._clf.fit(self._training_inputs, sk_training_output) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + return CallResult(None) + + + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + sk_inputs, columns_to_use = self._get_columns_to_fit(inputs, self.hyperparams) + output = [] + if len(sk_inputs.columns): + try: + sk_output = self._clf.predict(sk_inputs) + except sklearn.exceptions.NotFittedError as error: + raise PrimitiveNotFittedError("Primitive not fitted.") from error + # For primitives that allow predicting without fitting like GaussianProcessRegressor + if not self._fitted: + raise PrimitiveNotFittedError("Primitive not fitted.") + if sparse.issparse(sk_output): + sk_output = sk_output.toarray() + output = self._wrap_predictions(inputs, sk_output) + output.columns = self._target_names + output = [output] + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._target_column_indices, + columns_list=output) + + return CallResult(outputs) + + + def get_params(self) -> Params: + if not self._fitted: + return Params( + support_=None, + support_vectors_=None, + n_support_=None, + dual_coef_=None, + intercept_=None, + _sparse=None, + shape_fit_=None, + _dual_coef_=None, + _intercept_=None, + probA_=None, + probB_=None, + _gamma=None, + classes_=None, + class_weight_=None, + fit_status_=None, + epsilon=None, + nu=None, + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + return Params( + support_=getattr(self._clf, 'support_', None), + support_vectors_=getattr(self._clf, 'support_vectors_', None), + n_support_=getattr(self._clf, 'n_support_', None), + dual_coef_=getattr(self._clf, 'dual_coef_', None), + intercept_=getattr(self._clf, 'intercept_', None), + _sparse=getattr(self._clf, '_sparse', None), + shape_fit_=getattr(self._clf, 'shape_fit_', None), + _dual_coef_=getattr(self._clf, '_dual_coef_', None), + _intercept_=getattr(self._clf, '_intercept_', None), + probA_=getattr(self._clf, 'probA_', None), + probB_=getattr(self._clf, 'probB_', None), + _gamma=getattr(self._clf, '_gamma', None), + classes_=getattr(self._clf, 'classes_', None), + class_weight_=getattr(self._clf, 'class_weight_', None), + fit_status_=getattr(self._clf, 'fit_status_', None), + epsilon=getattr(self._clf, 'epsilon', None), + nu=getattr(self._clf, 'nu', None), + input_column_names=self._input_column_names, + 
training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + def set_params(self, *, params: Params) -> None: + self._clf.support_ = params['support_'] + self._clf.support_vectors_ = params['support_vectors_'] + self._clf.n_support_ = params['n_support_'] + self._clf.dual_coef_ = params['dual_coef_'] + self._clf.intercept_ = params['intercept_'] + self._clf._sparse = params['_sparse'] + self._clf.shape_fit_ = params['shape_fit_'] + self._clf._dual_coef_ = params['_dual_coef_'] + self._clf._intercept_ = params['_intercept_'] + self._clf.probA_ = params['probA_'] + self._clf.probB_ = params['probB_'] + self._clf._gamma = params['_gamma'] + self._clf.classes_ = params['classes_'] + self._clf.class_weight_ = params['class_weight_'] + self._clf.fit_status_ = params['fit_status_'] + self._clf.epsilon = params['epsilon'] + self._clf.nu = params['nu'] + self._input_column_names = params['input_column_names'] + self._training_indices = params['training_indices_'] + self._target_names = params['target_names_'] + self._target_column_indices = params['target_column_indices_'] + self._target_columns_metadata = params['target_columns_metadata_'] + + if params['support_'] is not None: + self._fitted = True + if params['support_vectors_'] is not None: + self._fitted = True + if params['n_support_'] is not None: + self._fitted = True + if params['dual_coef_'] is not None: + self._fitted = True + if params['intercept_'] is not None: + self._fitted = True + if params['_sparse'] is not None: + self._fitted = True + if params['shape_fit_'] is not None: + self._fitted = True + if params['_dual_coef_'] is not None: + self._fitted = True + if params['_intercept_'] is not None: + self._fitted = True + if params['probA_'] is not None: + self._fitted = True + if params['probB_'] is not None: + self._fitted = True + if params['_gamma'] is not None: + self._fitted = True + if params['classes_'] is not None: + self._fitted = True + if params['class_weight_'] is not None: + self._fitted = True + if params['fit_status_'] is not None: + self._fitted = True + if params['epsilon'] is not None: + self._fitted = True + if params['nu'] is not None: + self._fitted = True + + + + + + + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=hyperparams['use_inputs_columns'], + exclude_columns=hyperparams['exclude_inputs_columns'], + can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, numpy.integer, numpy.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + 
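+        # The structural type is numeric; the column is used as a feature only if it also carries the Attribute semantic type.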
semantic_types = set(column_metadata.get('semantic_types', [])) + + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + @classmethod + def _get_targets(cls, data: d3m_dataframe, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return data, list(data.columns), list(range(len(data.columns))) + + metadata = data.metadata + + def can_produce_column(column_index: int) -> bool: + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/TrueTarget") + column_metadata = metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + semantic_types = set(column_metadata.get('semantic_types', [])) + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + return False + + target_column_indices, target_columns_not_to_produce = base_utils.get_columns_to_use(metadata, + use_columns=hyperparams[ + 'use_outputs_columns'], + exclude_columns= + hyperparams[ + 'exclude_outputs_columns'], + can_use_column=can_produce_column) + targets = [] + if target_column_indices: + targets = data.select_columns(target_column_indices) + target_column_names = [] + for idx in target_column_indices: + target_column_names.append(data.columns[idx]) + return targets, target_column_names, target_column_indices + + @classmethod + def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]: + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) + + # Update semantic types and prepare it for predicted targets. 
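+            # Only the semantic_types entry is rewritten below; the rest of the column metadata (e.g. the column name) is preserved.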
+ semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set(["https://metadata.datadrivendiscovery.org/types/TrueTarget","https://metadata.datadrivendiscovery.org/types/SuggestedTarget",]) + add_semantic_types = set(["https://metadata.datadrivendiscovery.org/types/PredictedTarget",]) + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + outputs = d3m_dataframe(predictions, generate_metadata=False) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, self._target_columns_metadata) + return outputs + + + @classmethod + def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata): + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict() + semantic_types = [] + semantic_types.append('https://metadata.datadrivendiscovery.org/types/PredictedTarget') + column_name = outputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name") + if column_name is None: + column_name = "output_{}".format(column_index) + column_metadata["semantic_types"] = semantic_types + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + +SKSVC.__doc__ = SVC.__doc__ \ No newline at end of file diff --git a/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKSVR.py b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKSVR.py new file mode 100644 index 0000000..8f17ca5 --- /dev/null +++ b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKSVR.py @@ -0,0 +1,616 @@ +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os +import sklearn +import numpy +import typing + +# Custom import commands if any +from sklearn.svm.classes import SVR + + +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult, DockerContainer + +from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase +from d3m.primitive_interfaces.base import ProbabilisticCompositionalityMixin, ContinueFitMixin +from d3m import exceptions +import pandas + + + +Inputs = d3m_dataframe +Outputs = d3m_dataframe + + +class 
Params(params.Params): + support_: Optional[ndarray] + support_vectors_: Optional[ndarray] + dual_coef_: Optional[ndarray] + intercept_: Optional[ndarray] + _sparse: Optional[bool] + shape_fit_: Optional[tuple] + n_support_: Optional[ndarray] + probA_: Optional[ndarray] + probB_: Optional[ndarray] + _gamma: Optional[float] + _dual_coef_: Optional[ndarray] + _intercept_: Optional[ndarray] + class_weight_: Optional[ndarray] + fit_status_: Optional[int] + class_weight: Optional[Union[str, Dict, List[Dict]]] + nu: Optional[float] + probability: Optional[bool] + input_column_names: Optional[Any] + target_names_: Optional[Sequence[Any]] + training_indices_: Optional[Sequence[int]] + target_column_indices_: Optional[Sequence[int]] + target_columns_metadata_: Optional[List[OrderedDict]] + + + +class Hyperparams(hyperparams.Hyperparams): + C = hyperparams.Bounded[float]( + default=1, + lower=0, + upper=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + description='Penalty parameter C of the error term.' + ) + epsilon = hyperparams.Bounded[float]( + lower=0, + upper=None, + default=0.1, + description='Epsilon in the epsilon-SVR model. It specifies the epsilon-tube within which no penalty is associated in the training loss function with points predicted within a distance epsilon from the actual value.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + kernel = hyperparams.Choice( + choices={ + 'linear': hyperparams.Hyperparams.define( + configuration=OrderedDict({}) + ), + 'poly': hyperparams.Hyperparams.define( + configuration=OrderedDict({ + 'degree': hyperparams.Bounded[int]( + default=3, + lower=0, + upper=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ), + 'gamma': hyperparams.Union( + configuration=OrderedDict({ + 'float': hyperparams.Bounded[float]( + default=0.1, + lower=0, + upper=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'auto': hyperparams.Constant( + default='auto', + description='1/n_features will be used.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='auto', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ), + 'coef0': hyperparams.Constant( + default=0, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + }) + ), + 'rbf': hyperparams.Hyperparams.define( + configuration=OrderedDict({ + 'gamma': hyperparams.Union( + configuration=OrderedDict({ + 'float': hyperparams.Bounded[float]( + default=0.1, + lower=0, + upper=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'auto': hyperparams.Constant( + default='auto', + description='1/n_features will be used.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='auto', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + }) + ), + 'sigmoid': hyperparams.Hyperparams.define( + configuration=OrderedDict({ + 'gamma': hyperparams.Union( + configuration=OrderedDict({ + 'float': hyperparams.Bounded[float]( + default=0.1, + lower=0, + upper=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'auto': hyperparams.Constant( + default='auto', + description='1/n_features will be used.', + 
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='auto', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ), + 'coef0': hyperparams.Constant( + default=0, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + }) + ), + 'precomputed': hyperparams.Hyperparams.define( + configuration=OrderedDict({}) + ) + }, + default='rbf', + description='Specifies the kernel type to be used in the algorithm. It must be one of \'linear\', \'poly\', \'rbf\', \'sigmoid\', \'precomputed\' or a callable. If none is given, \'rbf\' will be used. If a callable is given it is used to precompute the kernel matrix.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + shrinking = hyperparams.UniformBool( + default=True, + description='Whether to use the shrinking heuristic.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + tol = hyperparams.Bounded[float]( + default=0.001, + lower=0, + upper=None, + description='Tolerance for stopping criterion.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + cache_size = hyperparams.Bounded[float]( + default=200, + lower=0, + upper=None, + description='Specify the size of the kernel cache (in MB).', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ResourcesUseParameter'] + ) + max_iter = hyperparams.Bounded[int]( + default=-1, + lower=-1, + upper=None, + description='Hard limit on iterations within solver, or -1 for no limit.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + use_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to use as training input. If any specified column cannot be parsed, it is skipped.", + ) + use_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to use as training target. If any specified column cannot be parsed, it is skipped.", + ) + exclude_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not use as training inputs. Applicable only if \"use_columns\" is not provided.", + ) + exclude_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not use as training target. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='new', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? 
This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute', 'https://metadata.datadrivendiscovery.org/types/PredictedTarget'], + default='https://metadata.datadrivendiscovery.org/types/PredictedTarget', + description='Decides what semantic type to attach to generated output', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + +class SKSVR(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]): + """ + Primitive wrapping for sklearn SVR + `sklearn documentation `_ + + """ + + __author__ = "JPL MARVIN" + metadata = metadata_base.PrimitiveMetadata({ + "algorithm_types": [metadata_base.PrimitiveAlgorithmType.SUPPORT_VECTOR_MACHINE, ], + "name": "sklearn.svm.classes.SVR", + "primitive_family": metadata_base.PrimitiveFamily.REGRESSION, + "python_path": "d3m.primitives.regression.svr.SKlearn", + "source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVR.html']}, + "version": "2019.11.13", + "id": "ebbc3404-902d-33cc-a10c-e42b06dfe60c", + "hyperparams_to_tune": ['C', 'kernel'], + 'installation': [ + {'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + }] + }) + + def __init__(self, *, + hyperparams: Hyperparams, + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None, + _verbose: int = 0) -> None: + + super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) + + # False + self._clf = SVR( + C=self.hyperparams['C'], + epsilon=self.hyperparams['epsilon'], + kernel=self.hyperparams['kernel']['choice'], + degree=self.hyperparams['kernel'].get('degree', 3), + gamma=self.hyperparams['kernel'].get('gamma', 'auto'), + coef0=self.hyperparams['kernel'].get('coef0', 0), + shrinking=self.hyperparams['shrinking'], + tol=self.hyperparams['tol'], + cache_size=self.hyperparams['cache_size'], + max_iter=self.hyperparams['max_iter'], + verbose=_verbose + ) + + self._inputs = None + self._outputs = None + self._training_inputs = None + self._training_outputs = None + self._target_names = None + 
self._training_indices = None + self._target_column_indices = None + self._target_columns_metadata: List[OrderedDict] = None + self._input_column_names = None + self._fitted = False + self._new_training_data = False + + def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None: + self._inputs = inputs + self._outputs = outputs + self._fitted = False + self._new_training_data = True + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + if self._inputs is None or self._outputs is None: + raise ValueError("Missing training data.") + + if not self._new_training_data: + return CallResult(None) + self._new_training_data = False + + self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams) + self._training_outputs, self._target_names, self._target_column_indices = self._get_targets(self._outputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + if len(self._training_indices) > 0 and len(self._target_column_indices) > 0: + self._target_columns_metadata = self._get_target_columns_metadata(self._training_outputs.metadata, self.hyperparams) + sk_training_output = self._training_outputs.values + + shape = sk_training_output.shape + if len(shape) == 2 and shape[1] == 1: + sk_training_output = numpy.ravel(sk_training_output) + + self._clf.fit(self._training_inputs, sk_training_output) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + return CallResult(None) + + + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + sk_inputs, columns_to_use = self._get_columns_to_fit(inputs, self.hyperparams) + output = [] + if len(sk_inputs.columns): + try: + sk_output = self._clf.predict(sk_inputs) + except sklearn.exceptions.NotFittedError as error: + raise PrimitiveNotFittedError("Primitive not fitted.") from error + # For primitives that allow predicting without fitting like GaussianProcessRegressor + if not self._fitted: + raise PrimitiveNotFittedError("Primitive not fitted.") + if sparse.issparse(sk_output): + sk_output = sk_output.toarray() + output = self._wrap_predictions(inputs, sk_output) + output.columns = self._target_names + output = [output] + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._target_column_indices, + columns_list=output) + + return CallResult(outputs) + + + def get_params(self) -> Params: + if not self._fitted: + return Params( + support_=None, + support_vectors_=None, + dual_coef_=None, + intercept_=None, + _sparse=None, + shape_fit_=None, + n_support_=None, + probA_=None, + probB_=None, + _gamma=None, + _dual_coef_=None, + _intercept_=None, + class_weight_=None, + fit_status_=None, + class_weight=None, + nu=None, + probability=None, + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + return Params( + support_=getattr(self._clf, 'support_', None), + support_vectors_=getattr(self._clf, 
'support_vectors_', None), + dual_coef_=getattr(self._clf, 'dual_coef_', None), + intercept_=getattr(self._clf, 'intercept_', None), + _sparse=getattr(self._clf, '_sparse', None), + shape_fit_=getattr(self._clf, 'shape_fit_', None), + n_support_=getattr(self._clf, 'n_support_', None), + probA_=getattr(self._clf, 'probA_', None), + probB_=getattr(self._clf, 'probB_', None), + _gamma=getattr(self._clf, '_gamma', None), + _dual_coef_=getattr(self._clf, '_dual_coef_', None), + _intercept_=getattr(self._clf, '_intercept_', None), + class_weight_=getattr(self._clf, 'class_weight_', None), + fit_status_=getattr(self._clf, 'fit_status_', None), + class_weight=getattr(self._clf, 'class_weight', None), + nu=getattr(self._clf, 'nu', None), + probability=getattr(self._clf, 'probability', None), + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + def set_params(self, *, params: Params) -> None: + self._clf.support_ = params['support_'] + self._clf.support_vectors_ = params['support_vectors_'] + self._clf.dual_coef_ = params['dual_coef_'] + self._clf.intercept_ = params['intercept_'] + self._clf._sparse = params['_sparse'] + self._clf.shape_fit_ = params['shape_fit_'] + self._clf.n_support_ = params['n_support_'] + self._clf.probA_ = params['probA_'] + self._clf.probB_ = params['probB_'] + self._clf._gamma = params['_gamma'] + self._clf._dual_coef_ = params['_dual_coef_'] + self._clf._intercept_ = params['_intercept_'] + self._clf.class_weight_ = params['class_weight_'] + self._clf.fit_status_ = params['fit_status_'] + self._clf.class_weight = params['class_weight'] + self._clf.nu = params['nu'] + self._clf.probability = params['probability'] + self._input_column_names = params['input_column_names'] + self._training_indices = params['training_indices_'] + self._target_names = params['target_names_'] + self._target_column_indices = params['target_column_indices_'] + self._target_columns_metadata = params['target_columns_metadata_'] + + if params['support_'] is not None: + self._fitted = True + if params['support_vectors_'] is not None: + self._fitted = True + if params['dual_coef_'] is not None: + self._fitted = True + if params['intercept_'] is not None: + self._fitted = True + if params['_sparse'] is not None: + self._fitted = True + if params['shape_fit_'] is not None: + self._fitted = True + if params['n_support_'] is not None: + self._fitted = True + if params['probA_'] is not None: + self._fitted = True + if params['probB_'] is not None: + self._fitted = True + if params['_gamma'] is not None: + self._fitted = True + if params['_dual_coef_'] is not None: + self._fitted = True + if params['_intercept_'] is not None: + self._fitted = True + if params['class_weight_'] is not None: + self._fitted = True + if params['fit_status_'] is not None: + self._fitted = True + if params['class_weight'] is not None: + self._fitted = True + if params['nu'] is not None: + self._fitted = True + if params['probability'] is not None: + self._fitted = True + + + + + + + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + 
columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=hyperparams['use_inputs_columns'], + exclude_columns=hyperparams['exclude_inputs_columns'], + can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, numpy.integer, numpy.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + @classmethod + def _get_targets(cls, data: d3m_dataframe, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return data, list(data.columns), list(range(len(data.columns))) + + metadata = data.metadata + + def can_produce_column(column_index: int) -> bool: + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/TrueTarget") + column_metadata = metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + semantic_types = set(column_metadata.get('semantic_types', [])) + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + return False + + target_column_indices, target_columns_not_to_produce = base_utils.get_columns_to_use(metadata, + use_columns=hyperparams[ + 'use_outputs_columns'], + exclude_columns= + hyperparams[ + 'exclude_outputs_columns'], + can_use_column=can_produce_column) + targets = [] + if target_column_indices: + targets = data.select_columns(target_column_indices) + target_column_names = [] + for idx in target_column_indices: + target_column_names.append(data.columns[idx]) + return targets, target_column_names, target_column_indices + + @classmethod + def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]: + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) + + # Update semantic types and prepare it for predicted targets. 
+ semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set(["https://metadata.datadrivendiscovery.org/types/TrueTarget","https://metadata.datadrivendiscovery.org/types/SuggestedTarget",]) + add_semantic_types = set(["https://metadata.datadrivendiscovery.org/types/PredictedTarget",]) + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + outputs = d3m_dataframe(predictions, generate_metadata=False) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, self._target_columns_metadata) + return outputs + + + @classmethod + def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata): + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict() + semantic_types = [] + semantic_types.append('https://metadata.datadrivendiscovery.org/types/PredictedTarget') + column_name = outputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name") + if column_name is None: + column_name = "output_{}".format(column_index) + column_metadata["semantic_types"] = semantic_types + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + +SKSVR.__doc__ = SVR.__doc__ \ No newline at end of file diff --git a/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKSelectFwe.py b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKSelectFwe.py new file mode 100644 index 0000000..b7e534c --- /dev/null +++ b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKSelectFwe.py @@ -0,0 +1,428 @@ +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os +import sklearn +import numpy +import typing + +# Custom import commands if any +from sklearn.feature_selection.univariate_selection import SelectFwe +from sklearn.feature_selection import f_classif, f_regression, chi2 + + +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult, DockerContainer + +from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase +from d3m.primitive_interfaces.base import ProbabilisticCompositionalityMixin, ContinueFitMixin 
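A minimal sketch, using invented column names and toy data, of the fit/predict flow that the SKSVR wrapper above delegates to sklearn once its column selection is done (the real primitive additionally picks columns through d3m semantic-type metadata and rebuilds DataFrame metadata for the output):
```python
import numpy
import pandas
from sklearn.svm import SVR  # same estimator, modern import path

# Toy feature and target frames standing in for the d3m DataFrames.
features = pandas.DataFrame({"feature_0": [0.0, 1.0, 2.0, 3.0],
                             "feature_1": [1.0, 0.5, 0.2, 0.1]})
targets = pandas.DataFrame({"target": [0.3, 0.6, 0.9, 1.2]})

clf = SVR(C=1.0, epsilon=0.1, kernel="rbf", gamma="auto")
# ravel mirrors the shape handling in fit(): a single-column target becomes 1-D.
clf.fit(features, numpy.ravel(targets.values))

# Wrap predictions back into a DataFrame with the original target column name,
# as _wrap_predictions does with d3m metadata.
predictions = pandas.DataFrame(clf.predict(features), columns=list(targets.columns))
print(predictions)
```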
+from d3m import exceptions +import pandas + + + +Inputs = d3m_dataframe +Outputs = d3m_dataframe + + +class Params(params.Params): + scores_: Optional[ndarray] + pvalues_: Optional[ndarray] + input_column_names: Optional[Any] + target_names_: Optional[Sequence[Any]] + training_indices_: Optional[Sequence[int]] + target_column_indices_: Optional[Sequence[int]] + target_columns_metadata_: Optional[List[OrderedDict]] + + + +class Hyperparams(hyperparams.Hyperparams): + score_func = hyperparams.Enumeration[str]( + default='f_classif', + values=['f_classif', 'f_regression', 'chi2'], + description='Function taking two arrays X and y, and returning a pair of arrays (scores, pvalues). Default is f_classif (see below "See also"). The default function only works with classification tasks.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + alpha = hyperparams.Bounded[float]( + default=0.05, + lower=0, + upper=None, + description='The highest uncorrected p-value for features to keep.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + use_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to use as training input. If any specified column cannot be parsed, it is skipped.", + ) + use_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to use as training target. If any specified column cannot be parsed, it is skipped.", + ) + exclude_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not use as training inputs. Applicable only if \"use_columns\" is not provided.", + ) + exclude_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not use as training target. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['update_semantic_types', 'replace', 'new'], + default='new', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", +) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. 
Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute', 'https://metadata.datadrivendiscovery.org/types/PredictedTarget'], + default='https://metadata.datadrivendiscovery.org/types/PredictedTarget', + description='Decides what semantic type to attach to generated output', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + +class SKSelectFwe(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]): + """ + Primitive wrapping for sklearn SelectFwe + `sklearn documentation `_ + + """ + + __author__ = "JPL MARVIN" + metadata = metadata_base.PrimitiveMetadata({ + "algorithm_types": [metadata_base.PrimitiveAlgorithmType.FEATURE_SCALING, ], + "name": "sklearn.feature_selection.univariate_selection.SelectFwe", + "primitive_family": metadata_base.PrimitiveFamily.FEATURE_SELECTION, + "python_path": "d3m.primitives.feature_selection.select_fwe.SKlearn", + "source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectFwe.html']}, + "version": "2019.11.13", + "id": "09a4cffa-a59f-30ac-b78f-101c35b3f7c6", + "hyperparams_to_tune": ['alpha'], + 'installation': [ + {'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + }] + }) + + def __init__(self, *, + hyperparams: Hyperparams, + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None) -> None: + + super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) + + # False + self._clf = SelectFwe( + score_func=eval(self.hyperparams['score_func']), + alpha=self.hyperparams['alpha'], + ) + + self._inputs = None + self._outputs = None + self._training_inputs = None + self._training_outputs = None + self._target_names = None + self._training_indices = None + self._target_column_indices = None + self._target_columns_metadata: List[OrderedDict] = None + self._input_column_names = None + self._fitted = False + self._new_training_data = False + + def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None: + self._inputs = inputs + self._outputs = outputs + self._fitted = False + self._new_training_data = True + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + if self._fitted: + return CallResult(None) + + self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams) + self._training_outputs, self._target_names, self._target_column_indices = self._get_targets(self._outputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + if self._training_inputs is None or self._training_outputs is None: + raise ValueError("Missing training data.") + + if len(self._training_indices) 
> 0 and len(self._target_column_indices) > 0: + sk_training_output = self._training_outputs.values + + shape = sk_training_output.shape + if len(shape) == 2 and shape[1] == 1: + sk_training_output = numpy.ravel(sk_training_output) + + self._clf.fit(self._training_inputs, sk_training_output) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + return CallResult(None) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + sk_inputs, columns_to_use = self._get_columns_to_fit(inputs, self.hyperparams) + output = [] + if len(sk_inputs.columns): + try: + sk_output = self._clf.transform(sk_inputs) + except sklearn.exceptions.NotFittedError as error: + raise PrimitiveNotFittedError("Primitive not fitted.") from error + if sparse.issparse(sk_output): + sk_output = sk_output.toarray() + target_columns_metadata = self._copy_columns_metadata(inputs.iloc[:, self._training_indices].metadata, + self.produce_support().value) + output = self._wrap_predictions(inputs, sk_output, target_columns_metadata) + output.columns = [inputs.columns[idx] for idx in range(len(inputs.columns)) if idx in self.produce_support().value] + output = [output] + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + if self.hyperparams['return_result'] == 'update_semantic_types': + temp_inputs = inputs.copy() + columns_not_selected = sorted(set(range(len(temp_inputs.columns))) - set(self.produce_support().value)) + + for idx in columns_not_selected: + temp_inputs.metadata = temp_inputs.metadata.remove_semantic_type((metadata_base.ALL_ELEMENTS, idx), + 'https://metadata.datadrivendiscovery.org/types/Attribute') + + temp_inputs = temp_inputs.select_columns(self._training_indices) + outputs = base_utils.combine_columns(return_result='replace', + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._training_indices, + columns_list=[temp_inputs]) + return CallResult(outputs) + + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._training_indices, + columns_list=output) + + return CallResult(outputs) + + def produce_support(self, *, timeout: float = None, iterations: int = None) -> CallResult[Any]: + all_indices = self._training_indices + selected_indices = self._clf.get_support(indices=True).tolist() + indices = [all_indices[index] for index in selected_indices] + return CallResult(indices) + + + def get_params(self) -> Params: + if not self._fitted: + return Params( + scores_=None, + pvalues_=None, + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + return Params( + scores_=getattr(self._clf, 'scores_', None), + pvalues_=getattr(self._clf, 'pvalues_', None), + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + def set_params(self, *, params: Params) -> None: + self._clf.scores_ = 
params['scores_'] + self._clf.pvalues_ = params['pvalues_'] + self._input_column_names = params['input_column_names'] + self._training_indices = params['training_indices_'] + self._target_names = params['target_names_'] + self._target_column_indices = params['target_column_indices_'] + self._target_columns_metadata = params['target_columns_metadata_'] + + if params['scores_'] is not None: + self._fitted = True + if params['pvalues_'] is not None: + self._fitted = True + + + + + + + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=hyperparams['use_inputs_columns'], + exclude_columns=hyperparams['exclude_inputs_columns'], + can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, numpy.integer, numpy.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + @classmethod + def _get_targets(cls, data: d3m_dataframe, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return data, list(data.columns), list(range(len(data.columns))) + + metadata = data.metadata + + def can_produce_column(column_index: int) -> bool: + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/TrueTarget") + column_metadata = metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + semantic_types = set(column_metadata.get('semantic_types', [])) + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + return False + + target_column_indices, target_columns_not_to_produce = base_utils.get_columns_to_use(metadata, + use_columns=hyperparams[ + 'use_outputs_columns'], + exclude_columns= + hyperparams[ + 'exclude_outputs_columns'], + can_use_column=can_produce_column) + targets = [] + if target_column_indices: + targets = data.select_columns(target_column_indices) + target_column_names = [] + for idx in target_column_indices: + target_column_names.append(data.columns[idx]) + return targets, target_column_names, target_column_indices + + @classmethod + def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]: + 
outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) + + # Update semantic types and prepare it for predicted targets. + semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set([]) + add_semantic_types = [] + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + if len(target_columns_metadata) == 1: + name = column_metadata.get("name") + for idx in range(len(outputs.columns)): + outputs_metadata = outputs_metadata.update_column(idx, column_metadata) + if len(outputs.columns) > 1: + # Updating column names. + outputs_metadata = outputs_metadata.update((metadata_base.ALL_ELEMENTS, idx), {'name': "{}_{}".format(name, idx)}) + else: + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray, target_columns_metadata) -> Outputs: + outputs = d3m_dataframe(predictions, generate_metadata=False) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, target_columns_metadata) + return outputs + + + + @classmethod + def _copy_columns_metadata(cls, inputs_metadata: metadata_base.DataMetadata, column_indices) -> List[OrderedDict]: + outputs_length = inputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in column_indices: + column_name = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name") + column_metadata = OrderedDict(inputs_metadata.query_column(column_index)) + semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set([]) + add_semantic_types = [] + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + +SKSelectFwe.__doc__ = SelectFwe.__doc__ \ No newline at end of file diff --git a/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKSelectPercentile.py b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKSelectPercentile.py new file mode 100644 index 0000000..05044c1 --- /dev/null +++ b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKSelectPercentile.py @@ -0,0 +1,428 @@ +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os +import sklearn +import numpy +import typing + +# Custom import commands if any +from 
sklearn.feature_selection.univariate_selection import SelectPercentile +from sklearn.feature_selection import f_classif, f_regression, chi2 + + +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult, DockerContainer + +from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase +from d3m.primitive_interfaces.base import ProbabilisticCompositionalityMixin, ContinueFitMixin +from d3m import exceptions +import pandas + + + +Inputs = d3m_dataframe +Outputs = d3m_dataframe + + +class Params(params.Params): + scores_: Optional[ndarray] + pvalues_: Optional[ndarray] + input_column_names: Optional[Any] + target_names_: Optional[Sequence[Any]] + training_indices_: Optional[Sequence[int]] + target_column_indices_: Optional[Sequence[int]] + target_columns_metadata_: Optional[List[OrderedDict]] + + + +class Hyperparams(hyperparams.Hyperparams): + score_func = hyperparams.Enumeration[str]( + default='f_classif', + values=['f_classif', 'f_regression', 'chi2'], + description='Function taking two arrays X and y, and returning a pair of arrays (scores, pvalues) or a single array with scores. Default is f_classif (see below "See also"). The default function only works with classification tasks.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + percentile = hyperparams.Bounded[int]( + default=10, + lower=0, + upper=100, + description='Percent of features to keep.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + use_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to use as training input. If any specified column cannot be parsed, it is skipped.", + ) + use_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to use as training target. If any specified column cannot be parsed, it is skipped.", + ) + exclude_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not use as training inputs. Applicable only if \"use_columns\" is not provided.", + ) + exclude_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not use as training target. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['update_semantic_types', 'replace', 'new'], + default='new', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? 
This hyperparam is ignored if use_semantic_types is set to false.", +) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute', 'https://metadata.datadrivendiscovery.org/types/PredictedTarget'], + default='https://metadata.datadrivendiscovery.org/types/PredictedTarget', + description='Decides what semantic type to attach to generated output', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + +class SKSelectPercentile(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]): + """ + Primitive wrapping for sklearn SelectPercentile + `sklearn documentation `_ + + """ + + __author__ = "JPL MARVIN" + metadata = metadata_base.PrimitiveMetadata({ + "algorithm_types": [metadata_base.PrimitiveAlgorithmType.STATISTICAL_MOMENT_ANALYSIS, ], + "name": "sklearn.feature_selection.univariate_selection.SelectPercentile", + "primitive_family": metadata_base.PrimitiveFamily.FEATURE_SELECTION, + "python_path": "d3m.primitives.feature_selection.select_percentile.SKlearn", + "source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectPercentile.html']}, + "version": "2019.11.13", + "id": "16696c4d-bed9-34a2-b9ae-b882c069512d", + "hyperparams_to_tune": ['percentile'], + 'installation': [ + {'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + }] + }) + + def __init__(self, *, + hyperparams: Hyperparams, + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None) -> None: + + super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) + + # False + self._clf = SelectPercentile( + score_func=eval(self.hyperparams['score_func']), + percentile=self.hyperparams['percentile'], + ) + + self._inputs = None + self._outputs = None + self._training_inputs = None + self._training_outputs = None + self._target_names = None + self._training_indices = None + self._target_column_indices = None + self._target_columns_metadata: List[OrderedDict] = None + self._input_column_names = None + self._fitted = False + self._new_training_data = False + + def set_training_data(self, 
*, inputs: Inputs, outputs: Outputs) -> None: + self._inputs = inputs + self._outputs = outputs + self._fitted = False + self._new_training_data = True + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + if self._fitted: + return CallResult(None) + + self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams) + self._training_outputs, self._target_names, self._target_column_indices = self._get_targets(self._outputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + if self._training_inputs is None or self._training_outputs is None: + raise ValueError("Missing training data.") + + if len(self._training_indices) > 0 and len(self._target_column_indices) > 0: + sk_training_output = self._training_outputs.values + + shape = sk_training_output.shape + if len(shape) == 2 and shape[1] == 1: + sk_training_output = numpy.ravel(sk_training_output) + + self._clf.fit(self._training_inputs, sk_training_output) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + return CallResult(None) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + sk_inputs, columns_to_use = self._get_columns_to_fit(inputs, self.hyperparams) + output = [] + if len(sk_inputs.columns): + try: + sk_output = self._clf.transform(sk_inputs) + except sklearn.exceptions.NotFittedError as error: + raise PrimitiveNotFittedError("Primitive not fitted.") from error + if sparse.issparse(sk_output): + sk_output = sk_output.toarray() + target_columns_metadata = self._copy_columns_metadata(inputs.iloc[:, self._training_indices].metadata, + self.produce_support().value) + output = self._wrap_predictions(inputs, sk_output, target_columns_metadata) + output.columns = [inputs.columns[idx] for idx in range(len(inputs.columns)) if idx in self.produce_support().value] + output = [output] + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + if self.hyperparams['return_result'] == 'update_semantic_types': + temp_inputs = inputs.copy() + columns_not_selected = sorted(set(range(len(temp_inputs.columns))) - set(self.produce_support().value)) + + for idx in columns_not_selected: + temp_inputs.metadata = temp_inputs.metadata.remove_semantic_type((metadata_base.ALL_ELEMENTS, idx), + 'https://metadata.datadrivendiscovery.org/types/Attribute') + + temp_inputs = temp_inputs.select_columns(self._training_indices) + outputs = base_utils.combine_columns(return_result='replace', + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._training_indices, + columns_list=[temp_inputs]) + return CallResult(outputs) + + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._training_indices, + columns_list=output) + + return CallResult(outputs) + + def produce_support(self, *, timeout: float = None, iterations: int = None) -> CallResult[Any]: + all_indices = self._training_indices + selected_indices = self._clf.get_support(indices=True).tolist() + indices = [all_indices[index] for index in selected_indices] + return CallResult(indices) + + + def get_params(self) -> Params: + if not self._fitted: 
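+            # Not fitted yet: hand back a Params snapshot with every learned sklearn attribute set
+            # to None; the column bookkeeping is still recorded so the primitive can be serialized
+            # before fit().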
+ return Params( + scores_=None, + pvalues_=None, + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + return Params( + scores_=getattr(self._clf, 'scores_', None), + pvalues_=getattr(self._clf, 'pvalues_', None), + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + def set_params(self, *, params: Params) -> None: + self._clf.scores_ = params['scores_'] + self._clf.pvalues_ = params['pvalues_'] + self._input_column_names = params['input_column_names'] + self._training_indices = params['training_indices_'] + self._target_names = params['target_names_'] + self._target_column_indices = params['target_column_indices_'] + self._target_columns_metadata = params['target_columns_metadata_'] + + if params['scores_'] is not None: + self._fitted = True + if params['pvalues_'] is not None: + self._fitted = True + + + + + + + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=hyperparams['use_inputs_columns'], + exclude_columns=hyperparams['exclude_inputs_columns'], + can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, numpy.integer, numpy.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + @classmethod + def _get_targets(cls, data: d3m_dataframe, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return data, list(data.columns), list(range(len(data.columns))) + + metadata = data.metadata + + def can_produce_column(column_index: int) -> bool: + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/TrueTarget") + column_metadata = metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + semantic_types = set(column_metadata.get('semantic_types', [])) + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - 
semantic_types) == 0: + return True + return False + + target_column_indices, target_columns_not_to_produce = base_utils.get_columns_to_use(metadata, + use_columns=hyperparams[ + 'use_outputs_columns'], + exclude_columns= + hyperparams[ + 'exclude_outputs_columns'], + can_use_column=can_produce_column) + targets = [] + if target_column_indices: + targets = data.select_columns(target_column_indices) + target_column_names = [] + for idx in target_column_indices: + target_column_names.append(data.columns[idx]) + return targets, target_column_names, target_column_indices + + @classmethod + def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]: + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) + + # Update semantic types and prepare it for predicted targets. + semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set([]) + add_semantic_types = [] + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + if len(target_columns_metadata) == 1: + name = column_metadata.get("name") + for idx in range(len(outputs.columns)): + outputs_metadata = outputs_metadata.update_column(idx, column_metadata) + if len(outputs.columns) > 1: + # Updating column names. 
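+                        # A single metadata entry broadcast over several output columns gets an index
+                        # suffix on its name (e.g. 'score_0', 'score_1') so the columns stay distinguishable.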
+ outputs_metadata = outputs_metadata.update((metadata_base.ALL_ELEMENTS, idx), {'name': "{}_{}".format(name, idx)}) + else: + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray, target_columns_metadata) -> Outputs: + outputs = d3m_dataframe(predictions, generate_metadata=False) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, target_columns_metadata) + return outputs + + + + @classmethod + def _copy_columns_metadata(cls, inputs_metadata: metadata_base.DataMetadata, column_indices) -> List[OrderedDict]: + outputs_length = inputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in column_indices: + column_name = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name") + column_metadata = OrderedDict(inputs_metadata.query_column(column_index)) + semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set([]) + add_semantic_types = [] + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + +SKSelectPercentile.__doc__ = SelectPercentile.__doc__ \ No newline at end of file diff --git a/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKSparseRandomProjection.py b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKSparseRandomProjection.py new file mode 100644 index 0000000..351f4d8 --- /dev/null +++ b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKSparseRandomProjection.py @@ -0,0 +1,375 @@ +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os +import sklearn +import numpy +import typing + +# Custom import commands if any +from sklearn.random_projection import SparseRandomProjection + + +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult, DockerContainer +from d3m.primitive_interfaces.unsupervised_learning import UnsupervisedLearnerPrimitiveBase + + +Inputs = d3m_dataframe +Outputs = d3m_dataframe + + +class Params(params.Params): + n_component_: Optional[int] + components_: Optional[Union[ndarray, sparse.spmatrix]] + density_: Optional[float] + input_column_names: Optional[Any] + target_names_: Optional[Sequence[Any]] + training_indices_: Optional[Sequence[int]] + target_column_indices_: Optional[Sequence[int]] + target_columns_metadata_: Optional[List[OrderedDict]] + + + +class Hyperparams(hyperparams.Hyperparams): + n_components = hyperparams.Union( + configuration=OrderedDict({ + 'int': hyperparams.Bounded[int]( + lower=0, + upper=None, + default=100, + description='Number of components to keep.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'auto': hyperparams.Constant( + default='auto', + 
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='auto', + description='Dimensionality of the target projection space. n_components can be automatically adjusted according to the number of samples in the dataset and the bound given by the Johnson-Lindenstrauss lemma. In that case the quality of the embedding is controlled by the ``eps`` parameter. It should be noted that Johnson-Lindenstrauss lemma can yield very conservative estimated of the required number of components as it makes no assumption on the structure of the dataset.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + density = hyperparams.Union( + configuration=OrderedDict({ + 'float': hyperparams.Uniform( + lower=0, + upper=1, + default=0.3, + description='Number of components to keep.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'auto': hyperparams.Constant( + default='auto', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='auto', + description='Ratio of non-zero component in the random projection matrix. If density = \'auto\', the value is set to the minimum density as recommended by Ping Li et al.: 1 / sqrt(n_features). Use density = 1 / 3.0 if you want to reproduce the results from Achlioptas, 2001.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + eps = hyperparams.Bounded[float]( + default=0.1, + lower=0, + upper=1, + description='Parameter to control the quality of the embedding according to the Johnson-Lindenstrauss lemma when n_components is set to \'auto\'. Smaller values lead to better embedding and higher number of dimensions (n_components) in the target projection space.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + dense_output = hyperparams.UniformBool( + default=False, + description='If True, ensure that the output of the random projection is a dense numpy array even if the input and random projection matrix are both sparse. In practice, if the number of components is small the number of zero components in the projected data will be very small and it will be more CPU and memory efficient to use a dense representation. If False, the projected data uses a sparse representation if the input is sparse.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + use_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to operate on. If any specified column cannot be parsed, it is skipped.", + ) + exclude_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='new', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? 
This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute'], + default='https://metadata.datadrivendiscovery.org/types/Attribute', + description='Decides what semantic type to attach to generated attributes', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + +class SKSparseRandomProjection(UnsupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]): + """ + Primitive wrapping for sklearn SparseRandomProjection + `sklearn documentation `_ + + """ + + __author__ = "JPL MARVIN" + metadata = metadata_base.PrimitiveMetadata({ + "algorithm_types": [metadata_base.PrimitiveAlgorithmType.RANDOM_PROJECTION, ], + "name": "sklearn.random_projection.SparseRandomProjection", + "primitive_family": metadata_base.PrimitiveFamily.DATA_TRANSFORMATION, + "python_path": "d3m.primitives.data_transformation.sparse_random_projection.SKlearn", + "source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.random_projection.SparseRandomProjection.html']}, + "version": "2019.11.13", + "id": "43ddd6be-bb4f-3fd0-8765-df961c16d7dc", + "hyperparams_to_tune": ['n_components'], + 'installation': [ + {'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + }] + }) + + def __init__(self, *, + hyperparams: Hyperparams, + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None) -> None: + + super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) + + # False + self._clf = SparseRandomProjection( + n_components=self.hyperparams['n_components'], + density=self.hyperparams['density'], + eps=self.hyperparams['eps'], + dense_output=self.hyperparams['dense_output'], + random_state=self.random_seed, + ) + + self._inputs = None + self._outputs = None + self._training_inputs = None + self._training_outputs = None + self._target_names = None + self._training_indices = None + self._target_column_indices = None + self._target_columns_metadata: List[OrderedDict] = None + self._input_column_names = None + self._fitted = False + + + def 
set_training_data(self, *, inputs: Inputs) -> None: + self._inputs = inputs + self._fitted = False + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + if self._fitted: + return CallResult(None) + + self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + if self._training_inputs is None: + return CallResult(None) + + if len(self._training_indices) > 0: + self._clf.fit(self._training_inputs) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + return CallResult(None) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + if not self._fitted: + raise PrimitiveNotFittedError("Primitive not fitted.") + sk_inputs = inputs + if self.hyperparams['use_semantic_types']: + sk_inputs = inputs.iloc[:, self._training_indices] + output_columns = [] + if len(self._training_indices) > 0: + sk_output = self._clf.transform(sk_inputs) + if sparse.issparse(sk_output): + sk_output = sk_output.toarray() + outputs = self._wrap_predictions(inputs, sk_output) + if len(outputs.columns) == len(self._input_column_names): + outputs.columns = self._input_column_names + output_columns = [outputs] + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._training_indices, + columns_list=output_columns) + return CallResult(outputs) + + + def get_params(self) -> Params: + if not self._fitted: + return Params( + n_component_=None, + components_=None, + density_=None, + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + return Params( + n_component_=getattr(self._clf, 'n_component_', None), + components_=getattr(self._clf, 'components_', None), + density_=getattr(self._clf, 'density_', None), + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + def set_params(self, *, params: Params) -> None: + self._clf.n_component_ = params['n_component_'] + self._clf.components_ = params['components_'] + self._clf.density_ = params['density_'] + self._input_column_names = params['input_column_names'] + self._training_indices = params['training_indices_'] + self._target_names = params['target_names_'] + self._target_column_indices = params['target_column_indices_'] + self._target_columns_metadata = params['target_columns_metadata_'] + + if params['n_component_'] is not None: + self._fitted = True + if params['components_'] is not None: + self._fitted = True + if params['density_'] is not None: + self._fitted = True + + + + + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + 
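+            # Local predicate handed to base_utils.get_columns_to_use below; it defers the
+            # per-column metadata check to _can_produce_column.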
def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=hyperparams['use_columns'], + exclude_columns=hyperparams['exclude_columns'], + can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, numpy.integer, numpy.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + + @classmethod + def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]: + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) + + # Update semantic types and prepare it for predicted targets. 
+ semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set([]) + add_semantic_types = [] + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + outputs = d3m_dataframe(predictions, generate_metadata=True) + target_columns_metadata = self._add_target_columns_metadata(outputs.metadata, self.hyperparams) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, target_columns_metadata) + return outputs + + + @classmethod + def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams): + + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_name = "output_{}".format(column_index) + column_metadata = OrderedDict() + semantic_types = set() + semantic_types.add(hyperparams["return_semantic_type"]) + column_metadata['semantic_types'] = list(semantic_types) + + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + +SKSparseRandomProjection.__doc__ = SparseRandomProjection.__doc__ \ No newline at end of file diff --git a/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKStandardScaler.py b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKStandardScaler.py new file mode 100644 index 0000000..f8491bb --- /dev/null +++ b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKStandardScaler.py @@ -0,0 +1,357 @@ +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os +import sklearn +import numpy +import typing + +# Custom import commands if any +from sklearn.preprocessing.data import StandardScaler + + +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult, DockerContainer +from d3m.primitive_interfaces.unsupervised_learning import UnsupervisedLearnerPrimitiveBase + + +Inputs = d3m_dataframe +Outputs = d3m_dataframe + + +class Params(params.Params): + scale_: Optional[ndarray] + mean_: Optional[ndarray] + var_: Optional[ndarray] + n_samples_seen_: Optional[Union[int, numpy.integer]] + input_column_names: Optional[Any] + target_names_: Optional[Sequence[Any]] + training_indices_: Optional[Sequence[int]] + 
target_column_indices_: Optional[Sequence[int]] + target_columns_metadata_: Optional[List[OrderedDict]] + + + +class Hyperparams(hyperparams.Hyperparams): + with_mean = hyperparams.UniformBool( + default=True, + description='If True, center the data before scaling. This does not work (and will raise an exception) when attempted on sparse matrices, because centering them entails building a dense matrix which in common use cases is likely to be too large to fit in memory.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + with_std = hyperparams.UniformBool( + default=True, + description='If True, scale the data to unit variance (or equivalently, unit standard deviation).', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + use_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to operate on. If any specified column cannot be parsed, it is skipped.", + ) + exclude_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='new', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. 
To prevent pipelines from breaking set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute'], + default='https://metadata.datadrivendiscovery.org/types/Attribute', + description='Decides what semantic type to attach to generated attributes', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + +class SKStandardScaler(UnsupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]): + """ + Primitive wrapping for sklearn StandardScaler + `sklearn documentation `_ + + """ + + __author__ = "JPL MARVIN" + metadata = metadata_base.PrimitiveMetadata({ + "algorithm_types": [metadata_base.PrimitiveAlgorithmType.FEATURE_SCALING, ], + "name": "sklearn.preprocessing.data.StandardScaler", + "primitive_family": metadata_base.PrimitiveFamily.DATA_PREPROCESSING, + "python_path": "d3m.primitives.data_preprocessing.standard_scaler.SKlearn", + "source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html']}, + "version": "2019.11.13", + "id": "d639947e-ece0-3a39-a666-e974acf4521d", + 'installation': [ + {'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + }] + }) + + def __init__(self, *, + hyperparams: Hyperparams, + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None) -> None: + + super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) + + # False + self._clf = StandardScaler( + with_mean=self.hyperparams['with_mean'], + with_std=self.hyperparams['with_std'], + ) + + self._inputs = None + self._outputs = None + self._training_inputs = None + self._training_outputs = None + self._target_names = None + self._training_indices = None + self._target_column_indices = None + self._target_columns_metadata: List[OrderedDict] = None + self._input_column_names = None + self._fitted = False + + + def set_training_data(self, *, inputs: Inputs) -> None: + self._inputs = inputs + self._fitted = False + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + if self._fitted: + return CallResult(None) + + self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + if self._training_inputs is None: + return CallResult(None) + + if len(self._training_indices) > 0: + self._clf.fit(self._training_inputs) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + return CallResult(None) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + if not self._fitted: + raise PrimitiveNotFittedError("Primitive not fitted.") + sk_inputs = inputs + if self.hyperparams['use_semantic_types']: + sk_inputs = inputs.iloc[:, self._training_indices] + output_columns = [] + if len(self._training_indices) > 0: + sk_output = self._clf.transform(sk_inputs) + if sparse.issparse(sk_output): + 
sk_output = sk_output.toarray() + outputs = self._wrap_predictions(inputs, sk_output) + if len(outputs.columns) == len(self._input_column_names): + outputs.columns = self._input_column_names + output_columns = [outputs] + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._training_indices, + columns_list=output_columns) + return CallResult(outputs) + + + def get_params(self) -> Params: + if not self._fitted: + return Params( + scale_=None, + mean_=None, + var_=None, + n_samples_seen_=None, + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + return Params( + scale_=getattr(self._clf, 'scale_', None), + mean_=getattr(self._clf, 'mean_', None), + var_=getattr(self._clf, 'var_', None), + n_samples_seen_=getattr(self._clf, 'n_samples_seen_', None), + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + def set_params(self, *, params: Params) -> None: + self._clf.scale_ = params['scale_'] + self._clf.mean_ = params['mean_'] + self._clf.var_ = params['var_'] + self._clf.n_samples_seen_ = params['n_samples_seen_'] + self._input_column_names = params['input_column_names'] + self._training_indices = params['training_indices_'] + self._target_names = params['target_names_'] + self._target_column_indices = params['target_column_indices_'] + self._target_columns_metadata = params['target_columns_metadata_'] + + if params['scale_'] is not None: + self._fitted = True + if params['mean_'] is not None: + self._fitted = True + if params['var_'] is not None: + self._fitted = True + if params['n_samples_seen_'] is not None: + self._fitted = True + + + + + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=hyperparams['use_columns'], + exclude_columns=hyperparams['exclude_columns'], + can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, numpy.integer, numpy.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + + if len(semantic_types) == 0: + 
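+            # A column with no semantic type annotations cannot match the required Attribute type,
+            # so it is skipped with a warning.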
cls.logger.warning("No semantic types found in column metadata") + return False + + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + + @classmethod + def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]: + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) + + # Update semantic types and prepare it for predicted targets. + semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set([]) + add_semantic_types = [] + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + outputs = d3m_dataframe(predictions, generate_metadata=True) + target_columns_metadata = self._copy_inputs_metadata(inputs.metadata, self._training_indices, outputs.metadata, self.hyperparams) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, target_columns_metadata) + return outputs + + + @classmethod + def _copy_inputs_metadata(cls, inputs_metadata: metadata_base.DataMetadata, input_indices: List[int], + outputs_metadata: metadata_base.DataMetadata, hyperparams): + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + target_columns_metadata: List[OrderedDict] = [] + for column_index in input_indices: + column_name = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name") + if column_name is None: + column_name = "output_{}".format(column_index) + + column_metadata = OrderedDict(inputs_metadata.query_column(column_index)) + semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set([]) + add_semantic_types = set() + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + # If outputs has more columns than index, add Attribute Type to all remaining + if outputs_length > len(input_indices): + for column_index in range(len(input_indices), outputs_length): + column_metadata = OrderedDict() + semantic_types = set() + semantic_types.add(hyperparams["return_semantic_type"]) + column_name = "output_{}".format(column_index) + 
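+                # Extra output columns beyond the copied inputs get a generic "output_<i>" name
+                # and only the configured return semantic type.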
column_metadata["semantic_types"] = list(semantic_types) + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + +SKStandardScaler.__doc__ = StandardScaler.__doc__ \ No newline at end of file diff --git a/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKStringImputer.py b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKStringImputer.py new file mode 100644 index 0000000..6e0c125 --- /dev/null +++ b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKStringImputer.py @@ -0,0 +1,371 @@ +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os +import sklearn +import numpy +import typing + +# Custom import commands if any +from sklearn.impute import SimpleImputer +from sklearn.impute._base import _get_mask + + +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult, DockerContainer +from d3m.primitive_interfaces.unsupervised_learning import UnsupervisedLearnerPrimitiveBase + + +Inputs = d3m_dataframe +Outputs = d3m_dataframe + + +class Params(params.Params): + statistics_: Optional[ndarray] + indicator_: Optional[sklearn.base.BaseEstimator] + input_column_names: Optional[Any] + target_names_: Optional[Sequence[Any]] + training_indices_: Optional[Sequence[int]] + target_column_indices_: Optional[Sequence[int]] + target_columns_metadata_: Optional[List[OrderedDict]] + + + +class Hyperparams(hyperparams.Hyperparams): + missing_values = hyperparams.Hyperparameter[str]( + default='', + description='The placeholder for the missing values. All occurrences of `missing_values` will be imputed.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + add_indicator = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + strategy = hyperparams.Enumeration[str]( + default='most_frequent', + values=['most_frequent', 'constant'], + description='The imputation strategy. - If "mean", then replace missing values using the mean along each column. Can only be used with numeric data. - If "median", then replace missing values using the median along each column. Can only be used with numeric data. - If "most_frequent", then replace missing using the most frequent value along each column. Can be used with strings or numeric data. - If "constant", then replace missing values with fill_value. Can be used with strings or numeric data. .. versionadded:: 0.20 strategy="constant" for fixed value imputation.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + fill_value = hyperparams.Hyperparameter[str]( + default='', + description='When strategy == "constant", fill_value is used to replace all occurrences of missing_values. 
If left to the default, fill_value will be 0 when imputing numerical data and "missing_value" for strings or object data types.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + use_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to operate on. If any specified column cannot be parsed, it is skipped.", + ) + exclude_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='new', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. 
To prevent pipelines from breaking set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute'], + default='https://metadata.datadrivendiscovery.org/types/Attribute', + description='Decides what semantic type to attach to generated attributes', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + +class SKStringImputer(UnsupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]): + """ + Primitive wrapping for sklearn SimpleImputer + `sklearn documentation `_ + + """ + + __author__ = "JPL MARVIN" + metadata = metadata_base.PrimitiveMetadata({ + "algorithm_types": [metadata_base.PrimitiveAlgorithmType.IMPUTATION, ], + "name": "sklearn.impute.SimpleImputer", + "primitive_family": metadata_base.PrimitiveFamily.DATA_CLEANING, + "python_path": "d3m.primitives.data_cleaning.string_imputer.SKlearn", + "source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html']}, + "version": "2019.11.13", + "id": "caeed986-cd1b-303b-900f-868dfc665341", + "hyperparams_to_tune": ['strategy'], + 'installation': [ + {'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + }] + }) + + def __init__(self, *, + hyperparams: Hyperparams, + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None, + _verbose: int = 0) -> None: + + super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) + + # False + self._clf = SimpleImputer( + missing_values=self.hyperparams['missing_values'], + add_indicator=self.hyperparams['add_indicator'], + strategy=self.hyperparams['strategy'], + fill_value=self.hyperparams['fill_value'], + verbose=_verbose + ) + + self._inputs = None + self._outputs = None + self._training_inputs = None + self._training_outputs = None + self._target_names = None + self._training_indices = None + self._target_column_indices = None + self._target_columns_metadata: List[OrderedDict] = None + self._input_column_names = None + self._fitted = False + + + def set_training_data(self, *, inputs: Inputs) -> None: + self._inputs = inputs + self._fitted = False + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + if self._fitted: + return CallResult(None) + + self._training_inputs, self._training_indices, _ = self._get_columns_to_fit(self._inputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + if self._training_inputs is None: + return CallResult(None) + + if len(self._training_indices) > 0: + self._clf.fit(self._training_inputs) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + return CallResult(None) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + sk_inputs, columns_to_use, _ = self._get_columns_to_fit(inputs, self.hyperparams) + output = [] + if len(sk_inputs.columns): + try: + sk_output = self._clf.transform(sk_inputs) + except 
sklearn.exceptions.NotFittedError as error: + raise PrimitiveNotFittedError("Primitive not fitted.") from error + if sparse.issparse(sk_output): + sk_output = sk_output.toarray() + target_columns_metadata = self._copy_columns_metadata(inputs.metadata, self._training_indices, self.hyperparams) + output = self._wrap_predictions(inputs, sk_output, target_columns_metadata) + + output.columns = [inputs.columns[idx] for idx in range(len(inputs.columns)) if idx in self._training_indices] + output = [output] + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + _, _, dropped_cols = self._get_columns_to_fit(inputs, self.hyperparams) + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._training_indices + dropped_cols, + columns_list=output) + return CallResult(outputs) + + + def get_params(self) -> Params: + if not self._fitted: + return Params( + statistics_=None, + indicator_=None, + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + return Params( + statistics_=getattr(self._clf, 'statistics_', None), + indicator_=getattr(self._clf, 'indicator_', None), + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + def set_params(self, *, params: Params) -> None: + self._clf.statistics_ = params['statistics_'] + self._clf.indicator_ = params['indicator_'] + self._input_column_names = params['input_column_names'] + self._training_indices = params['training_indices_'] + self._target_names = params['target_names_'] + self._target_column_indices = params['target_column_indices_'] + self._target_columns_metadata = params['target_columns_metadata_'] + + if params['statistics_'] is not None: + self._fitted = True + if params['indicator_'] is not None: + self._fitted = True + + + + + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + + if not hyperparams['use_semantic_types']: + columns_to_produce = list(range(len(inputs.columns))) + + else: + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=hyperparams['use_columns'], + exclude_columns=hyperparams['exclude_columns'], + can_use_column=can_produce_column) + + columns_to_drop = cls._get_columns_to_drop(inputs, columns_to_produce, hyperparams) + for col in columns_to_drop: + columns_to_produce.remove(col) + + return inputs.iloc[:, columns_to_produce], columns_to_produce, columns_to_drop + + @classmethod + def _get_columns_to_drop(cls, inputs: Inputs, column_indices: List[int], hyperparams: Hyperparams): + """ + Check for columns that contain missing_values that need to be imputed + If strategy is constant and missin_values is nan, then all nan columns will not be dropped + :param inputs: + :param column_indices: + :return: + """ + columns_to_remove = [] + if hyperparams['strategy'] != 
"constant": + for _, col in enumerate(column_indices): + inp = inputs.iloc[:, [col]].values + mask = _get_mask(inp, hyperparams['missing_values']) + if mask.all(): + columns_to_remove.append(col) + return columns_to_remove + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (str,) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + + @classmethod + def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]: + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) + + # Update semantic types and prepare it for predicted targets. + semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set([]) + add_semantic_types = [] + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray, target_columns_metadata) -> Outputs: + outputs = d3m_dataframe(predictions, generate_metadata=False) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, target_columns_metadata) + return outputs + + + + @classmethod + def _copy_columns_metadata(cls, inputs_metadata: metadata_base.DataMetadata, column_indices, hyperparams) -> List[OrderedDict]: + outputs_length = inputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in column_indices: + column_name = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name") + column_metadata = OrderedDict(inputs_metadata.query_column(column_index)) + semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set([]) + add_semantic_types = set() + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - 
semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + +SKStringImputer.__doc__ = SimpleImputer.__doc__ \ No newline at end of file diff --git a/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKTfidfVectorizer.py b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKTfidfVectorizer.py new file mode 100644 index 0000000..99cd7da --- /dev/null +++ b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKTfidfVectorizer.py @@ -0,0 +1,530 @@ +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os +import sklearn +import numpy +import typing + +# Custom import commands if any +from sklearn.feature_extraction.text import TfidfVectorizer + + +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult, DockerContainer +from d3m.primitive_interfaces.unsupervised_learning import UnsupervisedLearnerPrimitiveBase +from d3m.metadata.base import ALL_ELEMENTS + + +Inputs = d3m_dataframe +Outputs = d3m_dataframe + + +class Params(params.Params): + vocabulary_: Optional[Sequence[dict]] + stop_words_: Optional[Sequence[set]] + _tfidf: Optional[Sequence[object]] + fixed_vocabulary_: Optional[Sequence[bool]] + _stop_words_id: Optional[Sequence[int]] + target_names_: Optional[Sequence[Any]] + training_indices_: Optional[Sequence[int]] + + +class Hyperparams(hyperparams.Hyperparams): + strip_accents = hyperparams.Union( + configuration=OrderedDict({ + 'accents': hyperparams.Enumeration[str]( + default='ascii', + values=['ascii', 'unicode'], + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'none': hyperparams.Constant( + default=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='none', + description='Remove accents during the preprocessing step. \'ascii\' is a fast method that only works on characters that have an direct ASCII mapping. \'unicode\' is a slightly slower method that works on any characters. None (default) does nothing.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + analyzer = hyperparams.Enumeration[str]( + default='word', + values=['word', 'char', 'char_wb'], + description='Whether the feature should be made of word or character n-grams. If a callable is passed it is used to extract the sequence of features out of the raw, unprocessed input.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + ngram_range = hyperparams.SortedList( + elements=hyperparams.Bounded[int](1, None, 1), + default=(1, 1), + min_size=2, + max_size=2, + description='The lower and upper boundary of the range of n-values for different n-grams to be extracted. 
All values of n such that min_n <= n <= max_n will be used.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + stop_words = hyperparams.Union( + configuration=OrderedDict({ + 'string': hyperparams.Hyperparameter[str]( + default='english', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'list': hyperparams.List( + elements=hyperparams.Hyperparameter[str](''), + default=[], + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'none': hyperparams.Constant( + default=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='none', + description='If a string, it is passed to _check_stop_list and the appropriate stop list is returned. \'english\' is currently the only supported string value. If a list, that list is assumed to contain stop words, all of which will be removed from the resulting tokens. Only applies if ``analyzer == \'word\'``. If None, no stop words will be used. max_df can be set to a value in the range [0.7, 1.0) to automatically detect and filter stop words based on intra corpus document frequency of terms.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + lowercase = hyperparams.UniformBool( + default=True, + description='Convert all characters to lowercase before tokenizing.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + token_pattern = hyperparams.Hyperparameter[str]( + default='(?u)\\b\w\w+\\b', + description='Regular expression denoting what constitutes a "token", only used if ``analyzer == \'word\'``. The default regexp selects tokens of 2 or more alphanumeric characters (punctuation is completely ignored and always treated as a token separator).', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + max_df = hyperparams.Union( + configuration=OrderedDict({ + 'proportion': hyperparams.Bounded[float]( + default=1.0, + lower=0.0, + upper=1.0, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'absolute': hyperparams.Bounded[int]( + default=1, + lower=0, + upper=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='proportion', + description='When building the vocabulary ignore terms that have a document frequency strictly higher than the given threshold (corpus-specific stop words). If float, the parameter represents a proportion of documents, integer absolute counts. This parameter is ignored if vocabulary is not None.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + min_df = hyperparams.Union( + configuration=OrderedDict({ + 'proportion': hyperparams.Bounded[float]( + default=1.0, + lower=0.0, + upper=1.0, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'absolute': hyperparams.Bounded[int]( + default=1, + lower=0, + upper=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='absolute', + description='When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold. This value is also called cut-off in the literature. If float, the parameter represents a proportion of documents, integer absolute counts. 
This parameter is ignored if vocabulary is not None.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + max_features = hyperparams.Union( + configuration=OrderedDict({ + 'absolute': hyperparams.Bounded[int]( + default=1, + lower=0, + upper=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'none': hyperparams.Constant( + default=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='none', + description='If not None, build a vocabulary that only consider the top max_features ordered by term frequency across the corpus. This parameter is ignored if vocabulary is not None.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + binary = hyperparams.UniformBool( + default=False, + description='If True, all non-zero term counts are set to 1. This does not mean outputs will have only 0/1 values, only that the tf term in tf-idf is binary. (Set idf and normalization to False to get 0/1 outputs.)', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + norm = hyperparams.Union( + configuration=OrderedDict({ + 'str': hyperparams.Enumeration[str]( + default='l2', + values=['l1', 'l2'], + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'none': hyperparams.Constant( + default=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='none', + description='Norm used to normalize term vectors. None for no normalization.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + use_idf = hyperparams.UniformBool( + default=True, + description='Enable inverse-document-frequency reweighting.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + smooth_idf = hyperparams.UniformBool( + default=True, + description='Smooth idf weights by adding one to document frequencies, as if an extra document was seen containing every term in the collection exactly once. Prevents zero divisions.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + sublinear_tf = hyperparams.UniformBool( + default=False, + description='Apply sublinear tf scaling, i.e. replace tf with 1 + log(tf).', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + use_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to operate on. If any specified column cannot be parsed, it is skipped.", + ) + exclude_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='new', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? 
This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.", + ) + + + +class SKTfidfVectorizer(UnsupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]): + """ + Primitive wrapping for sklearn TfidfVectorizer + `sklearn documentation `_ + + """ + + __author__ = "JPL MARVIN" + metadata = metadata_base.PrimitiveMetadata({ + "algorithm_types": [metadata_base.PrimitiveAlgorithmType.MINIMUM_REDUNDANCY_FEATURE_SELECTION, ], + "name": "sklearn.feature_extraction.text.TfidfVectorizer", + "primitive_family": metadata_base.PrimitiveFamily.DATA_PREPROCESSING, + "python_path": "d3m.primitives.data_preprocessing.tfidf_vectorizer.SKlearn", + "source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.TfidfVectorizer.html']}, + "version": "2019.11.13", + "id": "1f7ce2c7-1ec8-3483-9a65-eedd4b5811d6", + "hyperparams_to_tune": ['max_df', 'min_df'], + 'installation': [ + {'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + }] + }) + + def __init__(self, *, + hyperparams: Hyperparams, + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None) -> None: + + super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) + + # True + + self._clf = list() + + self._training_inputs = None + self._target_names = None + self._training_indices = None + self._fitted = False + + + def set_training_data(self, *, inputs: Inputs) -> None: + self._inputs = inputs + self._fitted = False + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + if self._fitted: + return CallResult(None) + + self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams) + + if self._training_inputs is None: + raise ValueError("Missing training data.") + + if len(self._training_indices) > 0: + for column_index in range(len(self._training_inputs.columns)): + clf = self._create_new_sklearn_estimator() + clf.fit(self._training_inputs.iloc[:, column_index]) + self._clf.append(clf) + + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + return CallResult(None) + + 
def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + if not self._fitted: + raise PrimitiveNotFittedError("Primitive not fitted.") + sk_inputs = inputs + if self.hyperparams['use_semantic_types']: + sk_inputs, training_indices = self._get_columns_to_fit(inputs, self.hyperparams) + else: + training_indices = list(range(len(inputs))) + + # Iterating over all estimators and call transform on them. + # No. of estimators should be equal to the number of columns in the input + if len(self._clf) != len(sk_inputs.columns): + raise RuntimeError("Input data does not have the same number of columns as training data") + outputs = [] + if len(self._training_indices) > 0: + for column_index in range(len(sk_inputs.columns)): + clf = self._clf[column_index] + output = clf.transform(sk_inputs.iloc[:, column_index]) + column_name = sk_inputs.columns[column_index] + + if sparse.issparse(output): + output = output.toarray() + output = self._wrap_predictions(inputs, output) + + # Updating column names. + output.columns = map(lambda x: "{}_{}".format(column_name, x), clf.get_feature_names()) + for i, name in enumerate(clf.get_feature_names()): + output.metadata = output.metadata.update((ALL_ELEMENTS, i), {'name': name}) + + outputs.append(output) + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._training_indices, + columns_list=outputs) + + return CallResult(outputs) + + + def get_params(self) -> Params: + if not self._fitted: + return Params( + vocabulary_=None, + stop_words_=None, + _tfidf=None, + fixed_vocabulary_=None, + _stop_words_id=None, + training_indices_=self._training_indices, + target_names_=self._target_names + ) + + return Params( + vocabulary_=list(map(lambda clf: getattr(clf, 'vocabulary_', None), self._clf)), + stop_words_=list(map(lambda clf: getattr(clf, 'stop_words_', None), self._clf)), + _tfidf=list(map(lambda clf: getattr(clf, '_tfidf', None), self._clf)), + fixed_vocabulary_=list(map(lambda clf: getattr(clf, 'fixed_vocabulary_', None), self._clf)), + _stop_words_id=list(map(lambda clf: getattr(clf, '_stop_words_id', None), self._clf)), + training_indices_=self._training_indices, + target_names_=self._target_names + ) + + def set_params(self, *, params: Params) -> None: + for param, val in params.items(): + if val is not None and param not in ['target_names_', 'training_indices_']: + self._clf = list(map(lambda x: self._create_new_sklearn_estimator(), val)) + break + for index in range(len(self._clf)): + for param, val in params.items(): + if val is not None: + setattr(self._clf[index], param, val[index]) + else: + setattr(self._clf[index], param, None) + self._training_indices = params['training_indices_'] + self._target_names = params['target_names_'] + self._fitted = False + + if params['vocabulary_'] is not None: + self._fitted = True + if params['stop_words_'] is not None: + self._fitted = True + if params['_tfidf'] is not None: + self._fitted = True + if params['fixed_vocabulary_'] is not None: + self._fitted = True + if params['_stop_words_id'] is not None: + self._fitted = True + + def _create_new_sklearn_estimator(self): + clf = TfidfVectorizer( + strip_accents=self.hyperparams['strip_accents'], + 
analyzer=self.hyperparams['analyzer'], + ngram_range=self.hyperparams['ngram_range'], + stop_words=self.hyperparams['stop_words'], + lowercase=self.hyperparams['lowercase'], + token_pattern=self.hyperparams['token_pattern'], + max_df=self.hyperparams['max_df'], + min_df=self.hyperparams['min_df'], + max_features=self.hyperparams['max_features'], + binary=self.hyperparams['binary'], + norm=self.hyperparams['norm'], + use_idf=self.hyperparams['use_idf'], + smooth_idf=self.hyperparams['smooth_idf'], + sublinear_tf=self.hyperparams['sublinear_tf'], + ) + return clf + + + + + + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=hyperparams['use_columns'], + exclude_columns=hyperparams['exclude_columns'], + can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (str,) + accepted_semantic_types = set(["http://schema.org/Text",]) + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + @classmethod + def _get_targets(cls, data: d3m_dataframe, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return data, list(data.columns), [] + target_names = [] + target_semantic_type = [] + target_column_indices = [] + metadata = data.metadata + target_column_indices.extend(metadata.get_columns_with_semantic_type('https://metadata.datadrivendiscovery.org/types/TrueTarget')) + + for column_index in target_column_indices: + if column_index is metadata_base.ALL_ELEMENTS: + continue + column_index = typing.cast(metadata_base.SimpleSelectorSegment, column_index) + column_metadata = metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + target_names.append(column_metadata.get('name', str(column_index))) + target_semantic_type.append(column_metadata.get('semantic_types', [])) + + targets = data.iloc[:, target_column_indices] + return targets, target_names, target_semantic_type + + @classmethod + def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]: + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) + + # Update semantic types and prepare it for predicted targets. 
+ semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set([]) + add_semantic_types = [] + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + outputs = d3m_dataframe(predictions, generate_metadata=True) + target_columns_metadata = self._add_target_columns_metadata(outputs.metadata) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, target_columns_metadata) + return outputs + + + @classmethod + def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata): + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict() + semantic_types = [] + semantic_types.append('https://metadata.datadrivendiscovery.org/types/Attribute') + column_name = outputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name") + if column_name is None: + column_name = "output_{}".format(column_index) + column_metadata["semantic_types"] = semantic_types + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + +SKTfidfVectorizer.__doc__ = TfidfVectorizer.__doc__ \ No newline at end of file diff --git a/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKTruncatedSVD.py b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKTruncatedSVD.py new file mode 100644 index 0000000..2591180 --- /dev/null +++ b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKTruncatedSVD.py @@ -0,0 +1,369 @@ +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os +import sklearn +import numpy +import typing + +# Custom import commands if any +from sklearn.decomposition.truncated_svd import TruncatedSVD + + +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult, DockerContainer +from d3m.primitive_interfaces.unsupervised_learning import UnsupervisedLearnerPrimitiveBase + + +Inputs = d3m_dataframe +Outputs = d3m_dataframe + + +class Params(params.Params): + components_: Optional[ndarray] + explained_variance_ratio_: Optional[ndarray] + explained_variance_: Optional[ndarray] + singular_values_: Optional[ndarray] + input_column_names: 
Optional[Any] + target_names_: Optional[Sequence[Any]] + training_indices_: Optional[Sequence[int]] + target_column_indices_: Optional[Sequence[int]] + target_columns_metadata_: Optional[List[OrderedDict]] + + + +class Hyperparams(hyperparams.Hyperparams): + n_components = hyperparams.Bounded[int]( + default=2, + lower=0, + upper=None, + description='Desired dimensionality of output data. Must be strictly less than the number of features. The default value is useful for visualisation. For LSA, a value of 100 is recommended.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + algorithm = hyperparams.Choice( + choices={ + 'randomized': hyperparams.Hyperparams.define( + configuration=OrderedDict({ + 'n_iter': hyperparams.Bounded[int]( + default=5, + lower=0, + upper=None, + description='Number of iterations for randomized SVD solver. Not used in arpack', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + }) + ), + 'arpack': hyperparams.Hyperparams.define( + configuration=OrderedDict({ + 'tol': hyperparams.Bounded[float]( + default=0, + lower=0, + upper=None, + description='Tolerance for ARPACK. 0 means machine precision. Ignored by randomized SVD solver.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + }) + ) + }, + default='randomized', + description='SVD solver to use. Either "arpack" for the ARPACK wrapper in SciPy (scipy.sparse.linalg.svds), or "randomized" for the randomized algorithm due to Halko (2009).', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + use_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to operate on. If any specified column cannot be parsed, it is skipped.", + ) + exclude_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='new', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. 
Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute'], + default='https://metadata.datadrivendiscovery.org/types/Attribute', + description='Decides what semantic type to attach to generated attributes', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + +class SKTruncatedSVD(UnsupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]): + """ + Primitive wrapping for sklearn TruncatedSVD + `sklearn documentation `_ + + """ + + __author__ = "JPL MARVIN" + metadata = metadata_base.PrimitiveMetadata({ + "algorithm_types": [metadata_base.PrimitiveAlgorithmType.SINGULAR_VALUE_DECOMPOSITION, ], + "name": "sklearn.decomposition.truncated_svd.TruncatedSVD", + "primitive_family": metadata_base.PrimitiveFamily.DATA_PREPROCESSING, + "python_path": "d3m.primitives.data_preprocessing.truncated_svd.SKlearn", + "source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.TruncatedSVD.html']}, + "version": "2019.11.13", + "id": "9231fde3-7322-3c41-b4cf-d00a93558c44", + "hyperparams_to_tune": ['n_components'], + 'installation': [ + {'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + }] + }) + + def __init__(self, *, + hyperparams: Hyperparams, + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None) -> None: + + super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) + + # False + self._clf = TruncatedSVD( + n_components=self.hyperparams['n_components'], + algorithm=self.hyperparams['algorithm']['choice'], + n_iter=self.hyperparams['algorithm'].get('n_iter', 5), + tol=self.hyperparams['algorithm'].get('tol', 0), + random_state=self.random_seed, + ) + + self._inputs = None + self._outputs = None + self._training_inputs = None + self._training_outputs = None + self._target_names = None + self._training_indices = None + self._target_column_indices = None + self._target_columns_metadata: List[OrderedDict] = None + self._input_column_names = None + self._fitted = False + + + def set_training_data(self, *, inputs: Inputs) -> None: + self._inputs = inputs + self._fitted = False + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + if self._fitted: + return CallResult(None) + + self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + if self._training_inputs is None: + return CallResult(None) + + if len(self._training_indices) > 0: + self._clf.fit(self._training_inputs) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + return CallResult(None) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + if not self._fitted: + raise PrimitiveNotFittedError("Primitive not fitted.") + sk_inputs = 
inputs + if self.hyperparams['use_semantic_types']: + sk_inputs = inputs.iloc[:, self._training_indices] + output_columns = [] + if len(self._training_indices) > 0: + sk_output = self._clf.transform(sk_inputs) + if sparse.issparse(sk_output): + sk_output = sk_output.toarray() + outputs = self._wrap_predictions(inputs, sk_output) + if len(outputs.columns) == len(self._input_column_names): + outputs.columns = self._input_column_names + output_columns = [outputs] + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._training_indices, + columns_list=output_columns) + return CallResult(outputs) + + + def get_params(self) -> Params: + if not self._fitted: + return Params( + components_=None, + explained_variance_ratio_=None, + explained_variance_=None, + singular_values_=None, + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + return Params( + components_=getattr(self._clf, 'components_', None), + explained_variance_ratio_=getattr(self._clf, 'explained_variance_ratio_', None), + explained_variance_=getattr(self._clf, 'explained_variance_', None), + singular_values_=getattr(self._clf, 'singular_values_', None), + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + def set_params(self, *, params: Params) -> None: + self._clf.components_ = params['components_'] + self._clf.explained_variance_ratio_ = params['explained_variance_ratio_'] + self._clf.explained_variance_ = params['explained_variance_'] + self._clf.singular_values_ = params['singular_values_'] + self._input_column_names = params['input_column_names'] + self._training_indices = params['training_indices_'] + self._target_names = params['target_names_'] + self._target_column_indices = params['target_column_indices_'] + self._target_columns_metadata = params['target_columns_metadata_'] + + if params['components_'] is not None: + self._fitted = True + if params['explained_variance_ratio_'] is not None: + self._fitted = True + if params['explained_variance_'] is not None: + self._fitted = True + if params['singular_values_'] is not None: + self._fitted = True + + + + + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=hyperparams['use_columns'], + exclude_columns=hyperparams['exclude_columns'], + can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: + 
column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, numpy.integer, numpy.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + + @classmethod + def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]: + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) + + # Update semantic types and prepare it for predicted targets. + semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set([]) + add_semantic_types = [] + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + outputs = d3m_dataframe(predictions, generate_metadata=True) + target_columns_metadata = self._add_target_columns_metadata(outputs.metadata, self.hyperparams) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, target_columns_metadata) + return outputs + + + @classmethod + def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams): + + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_name = "output_{}".format(column_index) + column_metadata = OrderedDict() + semantic_types = set() + semantic_types.add(hyperparams["return_semantic_type"]) + column_metadata['semantic_types'] = list(semantic_types) + + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + +SKTruncatedSVD.__doc__ = TruncatedSVD.__doc__ \ No newline at end of file diff --git a/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKVarianceThreshold.py b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKVarianceThreshold.py new file mode 100644 index 0000000..d6f30ab --- /dev/null +++ 
b/tods/common-primitives/sklearn-wrap/sklearn_wrap/SKVarianceThreshold.py @@ -0,0 +1,414 @@ +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os +import sklearn +import numpy +import typing + +# Custom import commands if any +from sklearn.feature_selection.variance_threshold import VarianceThreshold + + +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult, DockerContainer + +from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase +from d3m.primitive_interfaces.base import ProbabilisticCompositionalityMixin, ContinueFitMixin +from d3m import exceptions +import pandas + + + +Inputs = d3m_dataframe +Outputs = d3m_dataframe + + +class Params(params.Params): + variances_: Optional[ndarray] + input_column_names: Optional[Any] + target_names_: Optional[Sequence[Any]] + training_indices_: Optional[Sequence[int]] + target_column_indices_: Optional[Sequence[int]] + target_columns_metadata_: Optional[List[OrderedDict]] + + + +class Hyperparams(hyperparams.Hyperparams): + threshold = hyperparams.Bounded[float]( + default=0.0, + lower=0, + upper=None, + description='Features with a training-set variance lower than this threshold will be removed. The default is to keep all features with non-zero variance, i.e. remove the features that have the same value in all samples.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + use_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to use as training input. If any specified column cannot be parsed, it is skipped.", + ) + use_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to use as training target. If any specified column cannot be parsed, it is skipped.", + ) + exclude_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not use as training inputs. Applicable only if \"use_columns\" is not provided.", + ) + exclude_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not use as training target. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['update_semantic_types', 'replace', 'new'], + default='new', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? 
This hyperparam is ignored if use_semantic_types is set to false.", +) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute', 'https://metadata.datadrivendiscovery.org/types/PredictedTarget'], + default='https://metadata.datadrivendiscovery.org/types/PredictedTarget', + description='Decides what semantic type to attach to generated output', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + +class SKVarianceThreshold(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]): + """ + Primitive wrapping for sklearn VarianceThreshold + `sklearn documentation `_ + + """ + + __author__ = "JPL MARVIN" + metadata = metadata_base.PrimitiveMetadata({ + "algorithm_types": [metadata_base.PrimitiveAlgorithmType.FEATURE_SCALING, ], + "name": "sklearn.feature_selection.variance_threshold.VarianceThreshold", + "primitive_family": metadata_base.PrimitiveFamily.FEATURE_SELECTION, + "python_path": "d3m.primitives.feature_selection.variance_threshold.SKlearn", + "source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.VarianceThreshold.html']}, + "version": "2019.11.13", + "id": "980c43c7-ab2a-3dc9-943b-db08a7c25cb6", + "hyperparams_to_tune": ['threshold'], + 'installation': [ + {'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + }] + }) + + def __init__(self, *, + hyperparams: Hyperparams, + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None) -> None: + + super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) + + # False + self._clf = VarianceThreshold( + threshold=self.hyperparams['threshold'], + ) + + self._inputs = None + self._outputs = None + self._training_inputs = None + self._training_outputs = None + self._target_names = None + self._training_indices = None + self._target_column_indices = None + self._target_columns_metadata: List[OrderedDict] = None + self._input_column_names = None + self._fitted = False + self._new_training_data = False + + def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None: + self._inputs = 
inputs + self._outputs = outputs + self._fitted = False + self._new_training_data = True + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + if self._fitted: + return CallResult(None) + + self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams) + self._training_outputs, self._target_names, self._target_column_indices = self._get_targets(self._outputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + if self._training_inputs is None or self._training_outputs is None: + raise ValueError("Missing training data.") + + if len(self._training_indices) > 0 and len(self._target_column_indices) > 0: + sk_training_output = self._training_outputs.values + + shape = sk_training_output.shape + if len(shape) == 2 and shape[1] == 1: + sk_training_output = numpy.ravel(sk_training_output) + + self._clf.fit(self._training_inputs, sk_training_output) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + return CallResult(None) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + sk_inputs, columns_to_use = self._get_columns_to_fit(inputs, self.hyperparams) + output = [] + if len(sk_inputs.columns): + try: + sk_output = self._clf.transform(sk_inputs) + except sklearn.exceptions.NotFittedError as error: + raise PrimitiveNotFittedError("Primitive not fitted.") from error + if sparse.issparse(sk_output): + sk_output = sk_output.toarray() + target_columns_metadata = self._copy_columns_metadata(inputs.iloc[:, self._training_indices].metadata, + self.produce_support().value) + output = self._wrap_predictions(inputs, sk_output, target_columns_metadata) + output.columns = [inputs.columns[idx] for idx in range(len(inputs.columns)) if idx in self.produce_support().value] + output = [output] + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + if self.hyperparams['return_result'] == 'update_semantic_types': + temp_inputs = inputs.copy() + columns_not_selected = sorted(set(range(len(temp_inputs.columns))) - set(self.produce_support().value)) + + for idx in columns_not_selected: + temp_inputs.metadata = temp_inputs.metadata.remove_semantic_type((metadata_base.ALL_ELEMENTS, idx), + 'https://metadata.datadrivendiscovery.org/types/Attribute') + + temp_inputs = temp_inputs.select_columns(self._training_indices) + outputs = base_utils.combine_columns(return_result='replace', + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._training_indices, + columns_list=[temp_inputs]) + return CallResult(outputs) + + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._training_indices, + columns_list=output) + + return CallResult(outputs) + + def produce_support(self, *, timeout: float = None, iterations: int = None) -> CallResult[Any]: + all_indices = self._training_indices + selected_indices = self._clf.get_support(indices=True).tolist() + indices = [all_indices[index] for index in selected_indices] + return CallResult(indices) + + + def get_params(self) -> Params: + if not self._fitted: + return Params( + variances_=None, + 
input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + return Params( + variances_=getattr(self._clf, 'variances_', None), + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + def set_params(self, *, params: Params) -> None: + self._clf.variances_ = params['variances_'] + self._input_column_names = params['input_column_names'] + self._training_indices = params['training_indices_'] + self._target_names = params['target_names_'] + self._target_column_indices = params['target_column_indices_'] + self._target_columns_metadata = params['target_columns_metadata_'] + + if params['variances_'] is not None: + self._fitted = True + + + + + + + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=hyperparams['use_inputs_columns'], + exclude_columns=hyperparams['exclude_inputs_columns'], + can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, numpy.integer, numpy.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + @classmethod + def _get_targets(cls, data: d3m_dataframe, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return data, list(data.columns), list(range(len(data.columns))) + + metadata = data.metadata + + def can_produce_column(column_index: int) -> bool: + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/TrueTarget") + column_metadata = metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + semantic_types = set(column_metadata.get('semantic_types', [])) + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + return False + + target_column_indices, target_columns_not_to_produce = base_utils.get_columns_to_use(metadata, + use_columns=hyperparams[ + 
'use_outputs_columns'], + exclude_columns= + hyperparams[ + 'exclude_outputs_columns'], + can_use_column=can_produce_column) + targets = [] + if target_column_indices: + targets = data.select_columns(target_column_indices) + target_column_names = [] + for idx in target_column_indices: + target_column_names.append(data.columns[idx]) + return targets, target_column_names, target_column_indices + + @classmethod + def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]: + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) + + # Update semantic types and prepare it for predicted targets. + semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set([]) + add_semantic_types = [] + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + if len(target_columns_metadata) == 1: + name = column_metadata.get("name") + for idx in range(len(outputs.columns)): + outputs_metadata = outputs_metadata.update_column(idx, column_metadata) + if len(outputs.columns) > 1: + # Updating column names. 
+ outputs_metadata = outputs_metadata.update((metadata_base.ALL_ELEMENTS, idx), {'name': "{}_{}".format(name, idx)}) + else: + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray, target_columns_metadata) -> Outputs: + outputs = d3m_dataframe(predictions, generate_metadata=False) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, target_columns_metadata) + return outputs + + + + @classmethod + def _copy_columns_metadata(cls, inputs_metadata: metadata_base.DataMetadata, column_indices) -> List[OrderedDict]: + outputs_length = inputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in column_indices: + column_name = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name") + column_metadata = OrderedDict(inputs_metadata.query_column(column_index)) + semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set([]) + add_semantic_types = [] + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + +SKVarianceThreshold.__doc__ = VarianceThreshold.__doc__ \ No newline at end of file diff --git a/tods/common-primitives/sklearn-wrap/sklearn_wrap/__init__.py b/tods/common-primitives/sklearn-wrap/sklearn_wrap/__init__.py new file mode 100644 index 0000000..def4f5b --- /dev/null +++ b/tods/common-primitives/sklearn-wrap/sklearn_wrap/__init__.py @@ -0,0 +1,2 @@ +__author__ = 'JPL DARPA D3M TEAM' +__version__ = '2019.11.13' diff --git a/tods/common-primitives/tests/test_audio_reader.py b/tods/common-primitives/tests/test_audio_reader.py new file mode 100644 index 0000000..f02bd2b --- /dev/null +++ b/tods/common-primitives/tests/test_audio_reader.py @@ -0,0 +1,105 @@ +import unittest +import os + +from d3m import container + +from common_primitives import audio_reader, dataset_to_dataframe, denormalize + + +class AudioReaderPrimitiveTestCase(unittest.TestCase): + def test_basic(self): + dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'datasets', 'audio_dataset_1', 'datasetDoc.json')) + + dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path)) + + dataframe_hyperparams_class = dataset_to_dataframe.DatasetToDataFramePrimitive.metadata.get_hyperparams() + dataframe_primitive = dataset_to_dataframe.DatasetToDataFramePrimitive(hyperparams=dataframe_hyperparams_class.defaults().replace({'dataframe_resource': '0'})) + dataframe = dataframe_primitive.produce(inputs=dataset).value + + audio_hyperparams_class = audio_reader.AudioReaderPrimitive.metadata.get_hyperparams() + audio_primitive = audio_reader.AudioReaderPrimitive(hyperparams=audio_hyperparams_class.defaults().replace({'return_result': 'replace'})) + audios = audio_primitive.produce(inputs=dataframe).value + + self.assertEqual(audios.shape, (1, 1)) + self.assertEqual(audios.iloc[0, 0].shape, (4410, 1)) + + self._test_metadata(audios.metadata, True) + + self.assertEqual(audios.metadata.query((0, 0))['dimension']['length'], 4410) + self.assertEqual(audios.metadata.query((0, 0))['dimension']['sampling_rate'], 44100) + + 
def _test_metadata(self, metadata, is_table): + semantic_types = ('https://metadata.datadrivendiscovery.org/types/PrimaryKey', 'http://schema.org/AudioObject') + + if is_table: + semantic_types += ('https://metadata.datadrivendiscovery.org/types/Table',) + + self.assertEqual(metadata.query_column(0)['name'], 'filename') + self.assertEqual(metadata.query_column(0)['structural_type'], container.ndarray) + self.assertEqual(metadata.query_column(0)['semantic_types'], semantic_types) + + def test_boundaries_reassign(self): + dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'datasets', 'audio_dataset_1', 'datasetDoc.json')) + + dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path)) + + denormalize_hyperparams_class = denormalize.DenormalizePrimitive.metadata.get_hyperparams() + denormalize_primitive = denormalize.DenormalizePrimitive(hyperparams=denormalize_hyperparams_class.defaults()) + dataset = denormalize_primitive.produce(inputs=dataset).value + + dataframe_hyperparams_class = dataset_to_dataframe.DatasetToDataFramePrimitive.metadata.get_hyperparams() + dataframe_primitive = dataset_to_dataframe.DatasetToDataFramePrimitive(hyperparams=dataframe_hyperparams_class.defaults()) + dataframe = dataframe_primitive.produce(inputs=dataset).value + + audio_hyperparams_class = audio_reader.AudioReaderPrimitive.metadata.get_hyperparams() + audio_primitive = audio_reader.AudioReaderPrimitive(hyperparams=audio_hyperparams_class.defaults().replace({'return_result': 'append'})) + audios = audio_primitive.produce(inputs=dataframe).value + + self.assertEqual(audios.shape, (1, 6)) + self.assertEqual(audios.iloc[0, 5].shape, (4410, 1)) + + self._test_boundaries_reassign_metadata(audios.metadata, True) + + self.assertEqual(audios.metadata.query((0, 5))['dimension']['length'], 4410) + self.assertEqual(audios.metadata.query((0, 5))['dimension']['sampling_rate'], 44100) + + def _test_boundaries_reassign_metadata(self, metadata, is_table): + semantic_types = ('http://schema.org/AudioObject', 'https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/UniqueKey') + + if is_table: + semantic_types += ('https://metadata.datadrivendiscovery.org/types/Table',) + + self.assertEqual(metadata.query_column(5)['name'], 'filename') + self.assertEqual(metadata.query_column(5)['structural_type'], container.ndarray) + self.assertEqual(metadata.query_column(5)['semantic_types'], semantic_types) + + self.assertEqual(metadata.query_column(2), { + 'structural_type': str, + 'name': 'start', + 'semantic_types': ( + 'http://schema.org/Float', + 'https://metadata.datadrivendiscovery.org/types/Boundary', + 'https://metadata.datadrivendiscovery.org/types/IntervalStart', + ), + 'boundary_for': { + 'resource_id': 'learningData', + 'column_index': 5, + }, + }) + self.assertEqual(metadata.query_column(3), { + 'structural_type': str, + 'name': 'end', + 'semantic_types': ( + 'http://schema.org/Float', + 'https://metadata.datadrivendiscovery.org/types/Boundary', + 'https://metadata.datadrivendiscovery.org/types/IntervalEnd', + ), + 'boundary_for': { + 'resource_id': 'learningData', + 'column_index': 5, + }, + }) + + +if __name__ == '__main__': + unittest.main() diff --git a/tods/common-primitives/tests/test_cast_to_type.py b/tods/common-primitives/tests/test_cast_to_type.py new file mode 100644 index 0000000..304ef18 --- /dev/null +++ b/tods/common-primitives/tests/test_cast_to_type.py @@ -0,0 +1,131 @@ +import os 
+import logging +import unittest + +import numpy + +from d3m import container +from d3m.metadata import base as metadata_base + +from common_primitives import cast_to_type, column_parser, dataset_to_dataframe, extract_columns_semantic_types + + +class CastToTypePrimitiveTestCase(unittest.TestCase): + def test_basic(self): + inputs = container.DataFrame({'a': [1, 2, 3], 'b': ['a', 'b', 'c']}, generate_metadata=True) + + self.assertEqual(inputs.dtypes['a'], numpy.int64) + self.assertEqual(inputs.dtypes['b'], object) + + hyperparams_class = cast_to_type.CastToTypePrimitive.metadata.get_hyperparams() + + primitive = cast_to_type.CastToTypePrimitive(hyperparams=hyperparams_class.defaults().replace({'type_to_cast': 'str'})) + + call_metadata = primitive.produce(inputs=inputs) + + self.assertIsInstance(call_metadata.value, container.DataFrame) + + self.assertEqual(len(call_metadata.value.dtypes), 2) + self.assertEqual(call_metadata.value.dtypes['a'], object) + self.assertEqual(call_metadata.value.dtypes['b'], object) + + self.assertEqual(call_metadata.value.metadata.query((metadata_base.ALL_ELEMENTS, 0))['structural_type'], str) + self.assertEqual(call_metadata.value.metadata.query((metadata_base.ALL_ELEMENTS, 1))['structural_type'], str) + self.assertEqual(call_metadata.value.metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'], 2) + + primitive = cast_to_type.CastToTypePrimitive(hyperparams=hyperparams_class.defaults().replace({'type_to_cast': 'float'})) + + with self.assertLogs(level=logging.WARNING) as cm: + call_metadata = primitive.produce(inputs=inputs) + + self.assertEqual(len(call_metadata.value.dtypes), 1) + self.assertEqual(call_metadata.value.dtypes['a'], float) + + self.assertEqual(call_metadata.value.metadata.query((metadata_base.ALL_ELEMENTS, 0))['structural_type'], float) + self.assertEqual(call_metadata.value.metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'], 1) + + self.assertEqual(len(cm.records), 1) + self.assertEqual(cm.records[0].msg, "Not all columns can be cast to type '%(type)s'. 
Skipping columns: %(columns)s") + + primitive = cast_to_type.CastToTypePrimitive(hyperparams=hyperparams_class.defaults().replace({'exclude_columns': (0,), 'type_to_cast': 'float'})) + + with self.assertRaisesRegex(ValueError, 'No columns to be cast to type'): + primitive.produce(inputs=inputs) + + def test_objects(self): + hyperparams_class = cast_to_type.CastToTypePrimitive.metadata.get_hyperparams() + + inputs = container.DataFrame({'a': [1, 2, 3], 'b': [{'a': 1}, {'b': 1}, {'c': 1}]}, { + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': container.DataFrame, + 'dimension': { + 'length': 3, + }, + }, generate_metadata=False) + inputs.metadata = inputs.metadata.update((metadata_base.ALL_ELEMENTS,), { + 'dimension': { + 'length': 2, + }, + }) + inputs.metadata = inputs.metadata.update((metadata_base.ALL_ELEMENTS, 0), { + 'structural_type': int, + }) + inputs.metadata = inputs.metadata.update((metadata_base.ALL_ELEMENTS, 1), { + 'structural_type': dict, + }) + + self.assertEqual(inputs.dtypes['a'], numpy.int64) + self.assertEqual(inputs.dtypes['b'], object) + + primitive = cast_to_type.CastToTypePrimitive(hyperparams=hyperparams_class.defaults().replace({'type_to_cast': 'str'})) + + call_metadata = primitive.produce(inputs=inputs) + + self.assertEqual(len(call_metadata.value.dtypes), 2) + self.assertEqual(call_metadata.value.dtypes['a'], object) + self.assertEqual(call_metadata.value.dtypes['b'], object) + + self.assertEqual(call_metadata.value.metadata.query((metadata_base.ALL_ELEMENTS, 0))['structural_type'], str) + self.assertEqual(call_metadata.value.metadata.query((metadata_base.ALL_ELEMENTS, 1))['structural_type'], str) + self.assertEqual(call_metadata.value.metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'], 2) + + primitive = cast_to_type.CastToTypePrimitive(hyperparams=hyperparams_class.defaults().replace({'type_to_cast': 'float'})) + + with self.assertLogs(level=logging.WARNING) as cm: + call_metadata = primitive.produce(inputs=inputs) + + self.assertEqual(len(call_metadata.value.dtypes), 1) + self.assertEqual(call_metadata.value.dtypes['a'], float) + + self.assertEqual(call_metadata.value.metadata.query((metadata_base.ALL_ELEMENTS, 0))['structural_type'], float) + self.assertEqual(call_metadata.value.metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'], 1) + + self.assertEqual(len(cm.records), 1) + self.assertEqual(cm.records[0].msg, "Not all columns can be cast to type '%(type)s'. 
Skipping columns: %(columns)s") + + def test_data(self): + dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'datasets', 'iris_dataset_1', 'datasetDoc.json')) + + dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path)) + + hyperparams_class = dataset_to_dataframe.DatasetToDataFramePrimitive.metadata.get_hyperparams() + primitive = dataset_to_dataframe.DatasetToDataFramePrimitive(hyperparams=hyperparams_class.defaults()) + dataframe = primitive.produce(inputs=dataset).value + + hyperparams_class = column_parser.ColumnParserPrimitive.metadata.get_hyperparams() + primitive = column_parser.ColumnParserPrimitive(hyperparams=hyperparams_class.defaults()) + dataframe = primitive.produce(inputs=dataframe).value + + hyperparams_class = extract_columns_semantic_types.ExtractColumnsBySemanticTypesPrimitive.metadata.get_hyperparams() + primitive = extract_columns_semantic_types.ExtractColumnsBySemanticTypesPrimitive(hyperparams=hyperparams_class.defaults()) + attributes = primitive.produce(inputs=dataframe).value + + hyperparams_class = cast_to_type.CastToTypePrimitive.metadata.get_hyperparams() + primitive = cast_to_type.CastToTypePrimitive(hyperparams=hyperparams_class.defaults().replace({'type_to_cast': 'float'})) + cast_attributes = primitive.produce(inputs=attributes).value + + self.assertEqual(cast_attributes.values.dtype, numpy.float64) + + +if __name__ == '__main__': + unittest.main() diff --git a/tods/common-primitives/tests/test_column_map.py b/tods/common-primitives/tests/test_column_map.py new file mode 100644 index 0000000..0323239 --- /dev/null +++ b/tods/common-primitives/tests/test_column_map.py @@ -0,0 +1,75 @@ +import unittest +import os +import pickle +import sys + +from d3m import container, index, utils as d3m_utils + +TEST_PRIMITIVES_DIR = os.path.join(os.path.dirname(__file__), 'data', 'primitives') +sys.path.insert(0, TEST_PRIMITIVES_DIR) + +from test_primitives.null import NullTransformerPrimitive, NullUnsupervisedLearnerPrimitive + +# To hide any logging or stdout output. 
+with d3m_utils.silence(): + index.register_primitive('d3m.primitives.operator.null.TransformerTest', NullTransformerPrimitive) + index.register_primitive('d3m.primitives.operator.null.UnsupervisedLearnerTest', NullUnsupervisedLearnerPrimitive) + +from common_primitives import dataset_to_dataframe, csv_reader, denormalize, column_map, column_parser + +import utils as test_utils + + +class ColumnMapTestCase(unittest.TestCase): + def test_transformer(self): + self.maxDiff = None + + dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'datasets', 'timeseries_dataset_2', 'datasetDoc.json')) + + dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path)) + + hyperparams = denormalize.DenormalizePrimitive.metadata.get_hyperparams() + primitive = denormalize.DenormalizePrimitive(hyperparams=hyperparams.defaults()) + dataset = primitive.produce(inputs=dataset).value + + hyperparams = dataset_to_dataframe.DatasetToDataFramePrimitive.metadata.get_hyperparams() + primitive = dataset_to_dataframe.DatasetToDataFramePrimitive(hyperparams=hyperparams.defaults()) + dataframe = primitive.produce(inputs=dataset).value + + hyperparams = csv_reader.CSVReaderPrimitive.metadata.get_hyperparams() + primitive = csv_reader.CSVReaderPrimitive(hyperparams=hyperparams.defaults().replace({'return_result': 'replace'})) + dataframe = primitive.produce(inputs=dataframe).value + + hyperparams = column_map.DataFrameColumnMapPrimitive.metadata.get_hyperparams() + primitive = column_map.DataFrameColumnMapPrimitive( + # We have to make an instance of the primitive ourselves. + hyperparams=hyperparams.defaults().replace({ + # First we use identity primitive which should not really change anything. + 'primitive': NullTransformerPrimitive( + hyperparams=NullTransformerPrimitive.metadata.get_hyperparams().defaults(), + ), + }), + ) + mapped_dataframe = primitive.produce(inputs=dataframe).value + + self.assertEqual(test_utils.convert_through_json(test_utils.effective_metadata(dataframe.metadata)), test_utils.convert_through_json(test_utils.effective_metadata(mapped_dataframe.metadata))) + + self.assertEqual(test_utils.convert_through_json(dataframe), test_utils.convert_through_json(mapped_dataframe)) + + primitive = column_map.DataFrameColumnMapPrimitive( + # We have to make an instance of the primitive ourselves. 
+ hyperparams=hyperparams.defaults().replace({ + 'primitive': column_parser.ColumnParserPrimitive( + hyperparams=column_parser.ColumnParserPrimitive.metadata.get_hyperparams().defaults(), + ), + }), + ) + dataframe = primitive.produce(inputs=mapped_dataframe).value + + self.assertEqual(test_utils.convert_through_json(dataframe)[0][1][0], [0, 2.6173]) + + pickle.dumps(primitive) + + +if __name__ == '__main__': + unittest.main() diff --git a/tods/common-primitives/tests/test_column_parser.py b/tods/common-primitives/tests/test_column_parser.py new file mode 100644 index 0000000..5d4e4b6 --- /dev/null +++ b/tods/common-primitives/tests/test_column_parser.py @@ -0,0 +1,474 @@ +import math +import os.path +import unittest + +import numpy + +from d3m import container, utils +from d3m.metadata import base as metadata_base + +from common_primitives import dataset_to_dataframe, column_parser, utils as common_utils + +import utils as test_utils + + +class ColumnParserPrimitiveTestCase(unittest.TestCase): + def test_basic(self): + dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'datasets', 'iris_dataset_1', 'datasetDoc.json')) + dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path)) + + hyperparams_class = dataset_to_dataframe.DatasetToDataFramePrimitive.metadata.get_hyperparams() + + primitive = dataset_to_dataframe.DatasetToDataFramePrimitive(hyperparams=hyperparams_class.defaults()) + + call_metadata = primitive.produce(inputs=dataset) + + dataframe = call_metadata.value + + hyperparams_class = column_parser.ColumnParserPrimitive.metadata.get_hyperparams() + + primitive = column_parser.ColumnParserPrimitive(hyperparams=hyperparams_class.defaults()) + + call_metadata = primitive.produce(inputs=dataframe) + + dataframe = call_metadata.value + + first_row = list(dataframe.itertuples(index=False, name=None))[0] + + self.assertEqual(first_row, (0, 5.1, 3.5, 1.4, 0.2, 6241605690342144121)) + + self.assertEqual([type(o) for o in first_row], [int, float, float, float, float, int]) + + self._test_basic_metadata(dataframe.metadata) + + def _test_basic_metadata(self, metadata): + self.maxDiff = None + + self.assertEqual(test_utils.convert_through_json(metadata.query(())), { + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/Table', + ], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 150, + } + }) + + self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS,))), { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 6, + } + }) + + self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS, 0))), { + 'name': 'd3mIndex', + 'structural_type': 'int', + 'semantic_types': [ + 'http://schema.org/Integer', + 'https://metadata.datadrivendiscovery.org/types/PrimaryKey', + ], + }) + + for i in range(1, 5): + self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS, i))), { + 'name': ['sepalLength', 'sepalWidth', 'petalLength', 'petalWidth'][i - 1], + 'structural_type': 'float', + 'semantic_types': [ + 'http://schema.org/Float', + 'https://metadata.datadrivendiscovery.org/types/Attribute', + ], + }, i) + + 
self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS, 5))), { + 'name': 'species', + 'structural_type': 'int', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/CategoricalData', + 'https://metadata.datadrivendiscovery.org/types/SuggestedTarget', + 'https://metadata.datadrivendiscovery.org/types/Attribute', + ], + }) + + def test_new(self): + dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'datasets', 'iris_dataset_1', 'datasetDoc.json')) + + dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path)) + + hyperparams_class = dataset_to_dataframe.DatasetToDataFramePrimitive.metadata.get_hyperparams() + + primitive = dataset_to_dataframe.DatasetToDataFramePrimitive(hyperparams=hyperparams_class.defaults()) + + call_metadata = primitive.produce(inputs=dataset) + + dataframe = call_metadata.value + + hyperparams_class = column_parser.ColumnParserPrimitive.metadata.get_hyperparams() + + primitive = column_parser.ColumnParserPrimitive(hyperparams=hyperparams_class.defaults().replace({'return_result': 'new', 'use_columns': [2]})) + + call_metadata = primitive.produce(inputs=dataframe) + + dataframe = call_metadata.value + + first_row = list(dataframe.itertuples(index=False, name=None))[0] + + self.assertEqual(first_row, ('0', 3.5)) + + self.assertEqual([type(o) for o in first_row], [str, float]) + + self._test_new_metadata(dataframe.metadata) + + def _test_new_metadata(self, metadata): + self.maxDiff = None + + self.assertEqual(test_utils.convert_through_json(metadata.query(())), { + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/Table', + ], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 150, + } + }) + + self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS,))), { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 2, + } + }) + + self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS, 0))), { + 'name': 'd3mIndex', + 'structural_type': 'str', + 'semantic_types': [ + 'http://schema.org/Integer', + 'https://metadata.datadrivendiscovery.org/types/PrimaryKey', + ], + }) + + self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS, 1))), { + 'name': 'sepalWidth', + 'structural_type': 'float', + 'semantic_types': [ + 'http://schema.org/Float', + 'https://metadata.datadrivendiscovery.org/types/Attribute', + ], + }) + + def test_append(self): + dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'datasets', 'iris_dataset_1', 'datasetDoc.json')) + + dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path)) + + hyperparams_class = dataset_to_dataframe.DatasetToDataFramePrimitive.metadata.get_hyperparams() + + primitive = dataset_to_dataframe.DatasetToDataFramePrimitive(hyperparams=hyperparams_class.defaults()) + + call_metadata = primitive.produce(inputs=dataset) + + dataframe = call_metadata.value + + hyperparams_class = column_parser.ColumnParserPrimitive.metadata.get_hyperparams() + + primitive = 
column_parser.ColumnParserPrimitive(hyperparams=hyperparams_class.defaults().replace({'return_result': 'append', 'replace_index_columns': False, 'parse_semantic_types': ['https://metadata.datadrivendiscovery.org/types/CategoricalData', 'http://schema.org/Integer']})) + + call_metadata = primitive.produce(inputs=dataframe) + + dataframe = call_metadata.value + + first_row = list(dataframe.itertuples(index=False, name=None))[0] + + self.assertEqual(first_row, ('0', '5.1', '3.5', '1.4', '0.2', 'Iris-setosa', 0, 6241605690342144121)) + + self.assertEqual([type(o) for o in first_row], [str, str, str, str, str, str, int, int]) + + self._test_append_metadata(dataframe.metadata, False) + + def test_append_replace_index_columns(self): + dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'datasets', 'iris_dataset_1', 'datasetDoc.json')) + + dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path)) + + hyperparams_class = dataset_to_dataframe.DatasetToDataFramePrimitive.metadata.get_hyperparams() + + primitive = dataset_to_dataframe.DatasetToDataFramePrimitive(hyperparams=hyperparams_class.defaults()) + + call_metadata = primitive.produce(inputs=dataset) + + dataframe = call_metadata.value + + hyperparams_class = column_parser.ColumnParserPrimitive.metadata.get_hyperparams() + + primitive = column_parser.ColumnParserPrimitive(hyperparams=hyperparams_class.defaults().replace({'return_result': 'append', 'parse_semantic_types': ['https://metadata.datadrivendiscovery.org/types/CategoricalData', 'http://schema.org/Integer']})) + + call_metadata = primitive.produce(inputs=dataframe) + + dataframe = call_metadata.value + + first_row = list(dataframe.itertuples(index=False, name=None))[0] + + self.assertEqual(first_row, (0, '5.1', '3.5', '1.4', '0.2', 'Iris-setosa', 6241605690342144121)) + + self.assertEqual([type(o) for o in first_row], [int, str, str, str, str, str, int]) + + self._test_append_metadata(dataframe.metadata, True) + + def _test_append_metadata(self, metadata, replace_index_columns): + self.maxDiff = None + + self.assertEqual(test_utils.convert_through_json(metadata.query(())), { + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/Table', + ], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 150, + } + }) + + self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS,))), { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 7 if replace_index_columns else 8, + } + }) + + self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS, 0))), { + 'name': 'd3mIndex', + 'structural_type': 'int' if replace_index_columns else 'str', + 'semantic_types': [ + 'http://schema.org/Integer', + 'https://metadata.datadrivendiscovery.org/types/PrimaryKey', + ], + }) + + for i in range(1, 5): + self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS, i))), { + 'name': ['sepalLength', 'sepalWidth', 'petalLength', 'petalWidth'][i - 1], + 'structural_type': 'str', + 'semantic_types': [ + 'http://schema.org/Float', + 'https://metadata.datadrivendiscovery.org/types/Attribute', + ], + }, i) + + 
self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS, 5))), { + 'name': 'species', + 'structural_type': 'str', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/CategoricalData', + 'https://metadata.datadrivendiscovery.org/types/SuggestedTarget', + 'https://metadata.datadrivendiscovery.org/types/Attribute', + ], + }) + + if not replace_index_columns: + self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS, 6))), { + 'name': 'd3mIndex', + 'structural_type': 'int', + 'semantic_types': [ + 'http://schema.org/Integer', + 'https://metadata.datadrivendiscovery.org/types/PrimaryKey', + ], + }) + + self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS, 6 if replace_index_columns else 7))), { + 'name': 'species', + 'structural_type': 'int', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/CategoricalData', + 'https://metadata.datadrivendiscovery.org/types/SuggestedTarget', + 'https://metadata.datadrivendiscovery.org/types/Attribute', + ], + }) + + def test_integer(self): + hyperparams_class = column_parser.ColumnParserPrimitive.metadata.get_hyperparams() + + primitive = column_parser.ColumnParserPrimitive(hyperparams=hyperparams_class.defaults()) + + dataframe = container.DataFrame({'a': ['1.0', '2.0', '3.0']}, generate_metadata=True) + + dataframe.metadata = dataframe.metadata.update((metadata_base.ALL_ELEMENTS, 0), { + 'name': 'test', + 'semantic_types': [ + 'http://schema.org/Integer', + 'https://metadata.datadrivendiscovery.org/types/PrimaryKey', + ], + }) + + call_metadata = primitive.produce(inputs=dataframe) + + parsed_dataframe = call_metadata.value + + self.assertEqual(test_utils.convert_through_json(parsed_dataframe.metadata.query((metadata_base.ALL_ELEMENTS, 0))), { + 'name': 'test', + 'structural_type': 'int', + 'semantic_types': [ + 'http://schema.org/Integer', + 'https://metadata.datadrivendiscovery.org/types/PrimaryKey', + ], + }) + + self.assertEqual(list(parsed_dataframe.iloc[:, 0]), [1, 2, 3]) + + dataframe.iloc[2, 0] = '3.1' + + call_metadata = primitive.produce(inputs=dataframe) + + parsed_dataframe = call_metadata.value + + self.assertEqual(test_utils.convert_through_json(parsed_dataframe.metadata.query((metadata_base.ALL_ELEMENTS, 0))), { + 'name': 'test', + 'structural_type': 'int', + 'semantic_types': [ + 'http://schema.org/Integer', + 'https://metadata.datadrivendiscovery.org/types/PrimaryKey', + ], + }) + + self.assertEqual(list(parsed_dataframe.iloc[:, 0]), [1, 2, 3]) + + dataframe.iloc[2, 0] = 'aaa' + + with self.assertRaisesRegex(ValueError, 'Not all values in a column can be parsed into integers, but only integers were expected'): + primitive.produce(inputs=dataframe) + + dataframe.metadata = dataframe.metadata.update((metadata_base.ALL_ELEMENTS, 0), { + 'name': 'test', + 'structural_type': str, + 'semantic_types': [ + 'http://schema.org/Integer', + ], + }) + + call_metadata = primitive.produce(inputs=dataframe) + + parsed_dataframe = call_metadata.value + + self.assertEqual(test_utils.convert_through_json(parsed_dataframe.metadata.query((metadata_base.ALL_ELEMENTS, 0))), { + 'name': 'test', + 'structural_type': 'float', + 'semantic_types': [ + 'http://schema.org/Integer', + ], + }) + + self.assertEqual(list(parsed_dataframe.iloc[0:2, 0]), [1.0, 2.0]) + self.assertTrue(math.isnan(parsed_dataframe.iloc[2, 0])) + + def test_float_vector(self): + dataset_doc_path = 
os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'datasets', 'object_dataset_1', 'datasetDoc.json')) + + dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path)) + + hyperparams_class = dataset_to_dataframe.DatasetToDataFramePrimitive.metadata.get_hyperparams() + primitive = dataset_to_dataframe.DatasetToDataFramePrimitive(hyperparams=hyperparams_class.defaults().replace({'dataframe_resource': 'learningData'})) + dataframe = primitive.produce(inputs=dataset).value + + hyperparams_class = column_parser.ColumnParserPrimitive.metadata.get_hyperparams() + primitive = column_parser.ColumnParserPrimitive(hyperparams=hyperparams_class.defaults()) + dataframe = primitive.produce(inputs=dataframe).value + + self.assertIsInstance(dataframe.iloc[0, 3], container.ndarray) + self.assertEqual(dataframe.iloc[0, 3].shape, (8,)) + + self.assertEqual(utils.to_json_structure(dataframe.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 4, + }, + 'schema': 'https://metadata.datadrivendiscovery.org/schemas/v0/container.json'}, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 4, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': { + 'structural_type': 'int', + 'name': 'd3mIndex', + 'semantic_types': ['http://schema.org/Integer', 'https://metadata.datadrivendiscovery.org/types/PrimaryMultiKey'], + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': { + 'name': 'image', + 'structural_type': 'str', + 'semantic_types': ['http://schema.org/Text', 'https://metadata.datadrivendiscovery.org/types/Attribute'], + 'foreign_key': { + 'type': 'COLUMN', + 'resource_id': '0', + 'column_index': 0, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': { + 'name': 'color_not_class', + 'structural_type': 'int', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/CategoricalData', + 'https://metadata.datadrivendiscovery.org/types/SuggestedTarget', + 'https://metadata.datadrivendiscovery.org/types/Attribute', + ], + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 3], + 'metadata': { + 'structural_type': 'd3m.container.numpy.ndarray', + 'dimension': { + 'length': 8, + }, + 'name': 'bounding_polygon_area', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/FloatVector', + 'https://metadata.datadrivendiscovery.org/types/SuggestedTarget', + 'https://metadata.datadrivendiscovery.org/types/Boundary', + 'https://metadata.datadrivendiscovery.org/types/BoundingPolygon', + 'https://metadata.datadrivendiscovery.org/types/Attribute', + ], + 'boundary_for': { + 'resource_id': 'learningData', + 'column_name': 'image', + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 3, '__ALL_ELEMENTS__'], + 'metadata': {'structural_type': 'numpy.float64'}, + }]) + + def test_ugly_time_values(self): + for value in [ + 'Original chained constant price data are rescaled.', + '1986/87', + ]: + self.assertTrue(numpy.isnan(common_utils.parse_datetime_to_float(value)), value) + + +if __name__ == '__main__': + unittest.main() diff --git a/tods/common-primitives/tests/test_compute_metafeatures.py 
b/tods/common-primitives/tests/test_compute_metafeatures.py new file mode 100644 index 0000000..07a1e4c --- /dev/null +++ b/tods/common-primitives/tests/test_compute_metafeatures.py @@ -0,0 +1,1106 @@ +import math +import os +import os.path +import unittest + +import numpy + +from d3m import container +from d3m.metadata import base as metadata_base + +from common_primitives import column_parser, compute_metafeatures, dataset_to_dataframe, denormalize + +import utils as test_utils + + +def round_to_significant_digits(x, n): + if x == 0: + return x + elif not numpy.isfinite(x): + return x + else: + return round(x, -int(math.floor(math.log10(abs(x)))) + (n - 1)) + + +def round_numbers(obj): + if isinstance(obj, (int, str)): + return obj + elif isinstance(obj, float): + return round_to_significant_digits(obj, 12) + elif isinstance(obj, list): + return [round_numbers(el) for el in obj] + elif isinstance(obj, tuple): + return tuple(round_numbers(el) for el in obj) + elif isinstance(obj, dict): + return {k: round_numbers(v) for k, v in obj.items()} + else: + return obj + + +class ComputeMetafeaturesPrimitiveTestCase(unittest.TestCase): + def _get_iris(self): + dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'datasets', 'iris_dataset_1', 'datasetDoc.json')) + + dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path)) + + # We set semantic types like runtime would. + dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Target') + dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/TrueTarget') + dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Attribute') + + dataframe_hyperparams_class = dataset_to_dataframe.DatasetToDataFramePrimitive.metadata.get_hyperparams() + dataframe_primitive = dataset_to_dataframe.DatasetToDataFramePrimitive(hyperparams=dataframe_hyperparams_class.defaults()) + dataframe = dataframe_primitive.produce(inputs=dataset).value + + column_parser_hyperparams_class = column_parser.ColumnParserPrimitive.metadata.get_hyperparams() + column_parser_primitive = column_parser.ColumnParserPrimitive(hyperparams=column_parser_hyperparams_class.defaults()) + dataframe = column_parser_primitive.produce(inputs=dataframe).value + + return dataframe + + def _get_database(self, parse_categorical_columns): + dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'datasets', 'database_dataset_1', 'datasetDoc.json')) + + dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path)) + + # We set semantic types like runtime would. 
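Both helper methods in this test file mark the true-target column by editing dataset metadata the way the reference runtime would. A minimal sketch of that marking step, assuming a loaded `container.Dataset`; the path is a placeholder, and the column index is whichever column holds the target (5 for the iris dataset above, 4 for this database dataset):

```python
from d3m import container
from d3m.metadata import base as metadata_base

# Placeholder path; the tests point at a datasetDoc.json under tests/data/datasets/.
dataset = container.Dataset.load('file:///path/to/datasetDoc.json')

target_column = 4  # index of the target column in the 'learningData' resource
selector = ('learningData', metadata_base.ALL_ELEMENTS, target_column)

# Mark the column as the (true) target and stop treating it as an attribute.
dataset.metadata = dataset.metadata.add_semantic_type(selector, 'https://metadata.datadrivendiscovery.org/types/Target')
dataset.metadata = dataset.metadata.add_semantic_type(selector, 'https://metadata.datadrivendiscovery.org/types/TrueTarget')
dataset.metadata = dataset.metadata.remove_semantic_type(selector, 'https://metadata.datadrivendiscovery.org/types/Attribute')
```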
+ dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 4), 'https://metadata.datadrivendiscovery.org/types/Target') + dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 4), 'https://metadata.datadrivendiscovery.org/types/TrueTarget') + dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 4), 'https://metadata.datadrivendiscovery.org/types/Attribute') + + denormalize_hyperparams_class = denormalize.DenormalizePrimitive.metadata.get_hyperparams() + denormalize_primitive = denormalize.DenormalizePrimitive(hyperparams=denormalize_hyperparams_class.defaults()) + dataset = denormalize_primitive.produce(inputs=dataset).value + + dataframe_hyperparams_class = dataset_to_dataframe.DatasetToDataFramePrimitive.metadata.get_hyperparams() + dataframe_primitive = dataset_to_dataframe.DatasetToDataFramePrimitive(hyperparams=dataframe_hyperparams_class.defaults()) + dataframe = dataframe_primitive.produce(inputs=dataset).value + + if parse_categorical_columns: + parse_semantic_types = ( + 'http://schema.org/Boolean', 'https://metadata.datadrivendiscovery.org/types/CategoricalData', + 'http://schema.org/Integer', 'http://schema.org/Float', + 'https://metadata.datadrivendiscovery.org/types/FloatVector', 'http://schema.org/DateTime', + ) + else: + parse_semantic_types = ( + 'http://schema.org/Boolean', + 'http://schema.org/Integer', 'http://schema.org/Float', + 'https://metadata.datadrivendiscovery.org/types/FloatVector', 'http://schema.org/DateTime', + ) + + column_parser_hyperparams_class = column_parser.ColumnParserPrimitive.metadata.get_hyperparams() + column_parser_primitive = column_parser.ColumnParserPrimitive(hyperparams=column_parser_hyperparams_class.defaults().replace({'parse_semantic_types': parse_semantic_types})) + dataframe = column_parser_primitive.produce(inputs=dataframe).value + + return dataframe + + def test_iris(self): + self.maxDiff = None + + dataframe = self._get_iris() + + hyperparams_class = compute_metafeatures.ComputeMetafeaturesPrimitive.metadata.get_hyperparams() + primitive = compute_metafeatures.ComputeMetafeaturesPrimitive(hyperparams=hyperparams_class.defaults()) + dataframe = primitive.produce(inputs=dataframe).value + + self.assertEqual(round_numbers(test_utils.convert_through_json(dataframe.metadata.query(())['data_metafeatures'])), round_numbers({ + 'attribute_counts_by_semantic_type': { + 'http://schema.org/Float': 4, + 'https://metadata.datadrivendiscovery.org/types/Attribute': 4, + }, + 'attribute_counts_by_structural_type': { + 'float': 4, + }, + 'attribute_ratios_by_semantic_type': { + 'http://schema.org/Float': 1.0, + 'https://metadata.datadrivendiscovery.org/types/Attribute': 1.0, + }, + 'attribute_ratios_by_structural_type': { + 'float': 1.0, + }, + 'dimensionality': 0.02666666666666667, + 'entropy_of_attributes': { + 'count': 4, + 'kurtosis': -1.4343159590314425, + 'max': 1.525353510619575, + 'mean': 1.4166844257365265, + 'median': 1.4323995290219738, + 'min': 1.2765851342825842, + 'quartile_1': 1.3565647450899858, + 'quartile_3': 1.4925192096685145, + 'skewness': -0.6047691718752254, + 'std': 0.11070539686522164, + }, + 'entropy_of_numeric_attributes': { + 'count': 4, + 'kurtosis': -1.4343159590314425, + 'max': 1.525353510619575, + 'mean': 1.4166844257365265, + 'median': 1.4323995290219738, + 'min': 1.2765851342825842, + 'quartile_1': 1.3565647450899858, + 'quartile_3': 1.4925192096685145, + 'skewness': 
-0.6047691718752254, + 'std': 0.11070539686522164, + }, + 'kurtosis_of_attributes': { + 'count': 4, + 'kurtosis': -1.1515850633224236, + 'max': 0.2907810623654279, + 'mean': -0.7507394876837397, + 'median': -0.9459091062274914, + 'min': -1.4019208006454036, + 'quartile_1': -1.3552958285158583, + 'quartile_3': -0.3413527653953726, + 'skewness': 0.8725328682893572, + 'std': 0.7948191385132984, + }, + 'mean_of_attributes': { + 'count': 4, + 'kurtosis': 0.8595879081956515, + 'max': 5.843333333333335, + 'mean': 3.4636666666666684, + 'median': 3.406333333333335, + 'min': 1.1986666666666672, + 'quartile_1': 2.5901666666666676, + 'quartile_3': 4.279833333333335, + 'skewness': 0.17098811780721151, + 'std': 1.919017997329383, + }, + 'number_distinct_values_of_numeric_attributes': { + 'count': 4, + 'kurtosis': -3.0617196548227046, + 'max': 43, + 'mean': 30.75, + 'median': 29.0, + 'min': 22, + 'quartile_1': 22.75, + 'quartile_3': 37.0, + 'skewness': 0.5076458131399395, + 'std': 10.07885575516057, + }, + 'number_of_attributes': 4, + 'number_of_binary_attributes': 0, + 'number_of_categorical_attributes': 0, + 'number_of_discrete_attributes': 0, + 'number_of_instances': 150, + 'number_of_instances_with_missing_values': 0, + 'number_of_instances_with_present_values': 150, + 'number_of_numeric_attributes': 4, + 'number_of_other_attributes': 0, + 'number_of_string_attributes': 0, + 'ratio_of_binary_attributes': 0.0, + 'ratio_of_categorical_attributes': 0.0, + 'ratio_of_discrete_attributes': 0.0, + 'ratio_of_instances_with_missing_values': 0.0, + 'ratio_of_instances_with_present_values': 1.0, + 'ratio_of_numeric_attributes': 1.0, + 'ratio_of_other_attributes': 0.0, + 'ratio_of_string_attributes': 0.0, + 'skew_of_attributes': { + 'count': 4, + 'kurtosis': -4.4981774675194846, + 'max': 0.3340526621720866, + 'mean': 0.06737570104778733, + 'median': 0.10495719724642275, + 'min': -0.27446425247378287, + 'quartile_1': -0.1473634847265412, + 'quartile_3': 0.3196963830207513, + 'skewness': -0.25709026597426626, + 'std': 0.3049355425307816, + }, + 'standard_deviation_of_attributes': { + 'count': 4, + 'kurtosis': 2.65240266862979, + 'max': 1.7644204199522617, + 'mean': 0.9473104002482848, + 'median': 0.7956134348393522, + 'min': 0.4335943113621737, + 'quartile_1': 0.6807691341161745, + 'quartile_3': 1.0621547009714627, + 'skewness': 1.4362343455338735, + 'std': 0.5714610798918619, + } + })) + self.assertFalse('data_metafeatures' in dataframe.metadata.query_column(0)) + self.assertEqual(round_numbers(test_utils.convert_through_json(dataframe.metadata.query_column(1)['data_metafeatures'])), round_numbers({ + 'entropy_of_values': 1.525353510619575, + 'number_distinct_values': 35, + 'number_of_missing_values': 0, + 'number_of_negative_numeric_values': 0, + 'number_of_numeric_values': 150, + 'number_of_numeric_values_equal_-1': 0, + 'number_of_numeric_values_equal_0': 0, + 'number_of_numeric_values_equal_1': 0, + 'number_of_positive_numeric_values': 150, + 'number_of_present_values': 150, + 'ratio_of_missing_values': 0.0, + 'ratio_of_negative_numeric_values': 0.0, + 'ratio_of_numeric_values': 1.0, + 'ratio_of_numeric_values_equal_-1': 0.0, + 'ratio_of_numeric_values_equal_0': 0.0, + 'ratio_of_numeric_values_equal_1': 0.0, + 'ratio_of_positive_numeric_values': 1.0, + 'ratio_of_present_values': 1.0, + 'value_counts_aggregate': { + 'count': 5, + 'kurtosis': -0.46949652355057747, + 'max': 42, + 'mean': 30.0, + 'median': 32.0, + 'min': 11, + 'quartile_1': 24.0, + 'quartile_3': 41.0, + 'skewness': -0.7773115383470599, + 'std': 
12.90348790056394, + }, + 'value_probabilities_aggregate': { + 'count': 5, + 'kurtosis': -0.4694965235505757, + 'max': 0.28, + 'mean': 0.2, + 'median': 0.21333333333333335, + 'min': 0.07333333333333333, + 'quartile_1': 0.16, + 'quartile_3': 0.2733333333333333, + 'skewness': -0.7773115383470603, + 'std': 0.08602325267042626, + }, + 'values_aggregate': { + 'count': 150, + 'kurtosis': -0.5520640413156395, + 'max': 7.9, + 'mean': 5.843333333333335, + 'median': 5.8, + 'min': 4.3, + 'quartile_1': 5.1, + 'quartile_3': 6.4, + 'skewness': 0.3149109566369728, + 'std': 0.8280661279778629, + }, + })) + self.assertEqual(round_numbers(test_utils.convert_through_json(dataframe.metadata.query_column(2)['data_metafeatures'])), round_numbers({ + 'entropy_of_values': 1.2765851342825842, + 'number_distinct_values': 23, + 'number_of_missing_values': 0, + 'number_of_negative_numeric_values': 0, + 'number_of_numeric_values': 150, + 'number_of_numeric_values_equal_-1': 0, + 'number_of_numeric_values_equal_0': 0, + 'number_of_numeric_values_equal_1': 0, + 'number_of_positive_numeric_values': 150, + 'number_of_present_values': 150, + 'ratio_of_missing_values': 0.0, + 'ratio_of_negative_numeric_values': 0.0, + 'ratio_of_numeric_values': 1.0, + 'ratio_of_numeric_values_equal_-1': 0.0, + 'ratio_of_numeric_values_equal_0': 0.0, + 'ratio_of_numeric_values_equal_1': 0.0, + 'ratio_of_positive_numeric_values': 1.0, + 'ratio_of_present_values': 1.0, + 'value_counts_aggregate': { + 'count': 5, + 'kurtosis': -0.9899064888741496, + 'max': 69, + 'mean': 30.0, + 'median': 20.0, + 'min': 4, + 'quartile_1': 11.0, + 'quartile_3': 46.0, + 'skewness': 0.8048211570183503, + 'std': 26.99073915253156, + }, + 'value_probabilities_aggregate': { + 'count': 5, + 'kurtosis': -0.9899064888741478, + 'max': 0.46, + 'mean': 0.19999999999999998, + 'median': 0.13333333333333333, + 'min': 0.02666666666666667, + 'quartile_1': 0.07333333333333333, + 'quartile_3': 0.30666666666666664, + 'skewness': 0.8048211570183509, + 'std': 0.17993826101687704, + }, + 'values_aggregate': { + 'count': 150, + 'kurtosis': 0.2907810623654279, + 'max': 4.4, + 'mean': 3.0540000000000007, + 'median': 3.0, + 'min': 2.0, + 'quartile_1': 2.8, + 'quartile_3': 3.3, + 'skewness': 0.3340526621720866, + 'std': 0.4335943113621737, + }, + })) + self.assertEqual(round_numbers(test_utils.convert_through_json(dataframe.metadata.query_column(3)['data_metafeatures'])), round_numbers({ + 'entropy_of_values': 1.38322461535912, + 'number_distinct_values': 43, + 'number_of_missing_values': 0, + 'number_of_negative_numeric_values': 0, + 'number_of_numeric_values': 150, + 'number_of_numeric_values_equal_-1': 0, + 'number_of_numeric_values_equal_0': 0, + 'number_of_numeric_values_equal_1': 1, + 'number_of_positive_numeric_values': 150, + 'number_of_present_values': 150, + 'ratio_of_missing_values': 0.0, + 'ratio_of_negative_numeric_values': 0.0, + 'ratio_of_numeric_values': 1.0, + 'ratio_of_numeric_values_equal_-1': 0.0, + 'ratio_of_numeric_values_equal_0': 0.0, + 'ratio_of_numeric_values_equal_1': 0.006666666666666667, + 'ratio_of_positive_numeric_values': 1.0, + 'ratio_of_present_values': 1.0, + 'value_counts_aggregate': { + 'count': 5, + 'kurtosis': -1.875313335089766, + 'max': 50, + 'mean': 30.0, + 'median': 34.0, + 'min': 3, + 'quartile_1': 16.0, + 'quartile_3': 47.0, + 'skewness': -0.4786622161186872, + 'std': 20.18662923818635, + }, + 'value_probabilities_aggregate': { + 'count': 5, + 'kurtosis': -1.8753133350897668, + 'max': 0.3333333333333333, + 'mean': 0.2, + 'median': 
0.22666666666666666, + 'min': 0.02, + 'quartile_1': 0.10666666666666667, + 'quartile_3': 0.31333333333333335, + 'skewness': -0.4786622161186876, + 'std': 0.13457752825457567, + }, + 'values_aggregate': { + 'count': 150, + 'kurtosis': -1.4019208006454036, + 'max': 6.9, + 'mean': 3.7586666666666693, + 'median': 4.35, + 'min': 1.0, + 'quartile_1': 1.6, + 'quartile_3': 5.1, + 'skewness': -0.27446425247378287, + 'std': 1.7644204199522617, + }, + })) + self.assertEqual(round_numbers(test_utils.convert_through_json(dataframe.metadata.query_column(4)['data_metafeatures'])), round_numbers({ + 'entropy_of_values': 1.4815744426848276, + 'number_distinct_values': 22, + 'number_of_missing_values': 0, + 'number_of_negative_numeric_values': 0, + 'number_of_numeric_values': 150, + 'number_of_numeric_values_equal_-1': 0, + 'number_of_numeric_values_equal_0': 0, + 'number_of_numeric_values_equal_1': 7, + 'number_of_positive_numeric_values': 150, + 'number_of_present_values': 150, + 'ratio_of_missing_values': 0.0, + 'ratio_of_negative_numeric_values': 0.0, + 'ratio_of_numeric_values': 1.0, + 'ratio_of_numeric_values_equal_-1': 0.0, + 'ratio_of_numeric_values_equal_0': 0.0, + 'ratio_of_numeric_values_equal_1': 0.04666666666666667, + 'ratio_of_positive_numeric_values': 1.0, + 'ratio_of_present_values': 1.0, + 'value_counts_aggregate': { + 'count': 5, + 'kurtosis': -0.6060977121954245, + 'max': 49, + 'mean': 30.0, + 'median': 29.0, + 'min': 8, + 'quartile_1': 23.0, + 'quartile_3': 41.0, + 'skewness': -0.28840734350346464, + 'std': 15.937377450509228, + }, + 'value_probabilities_aggregate': { + 'count': 5, + 'kurtosis': -0.606097712195421, + 'max': 0.32666666666666666, + 'mean': 0.2, + 'median': 0.19333333333333333, + 'min': 0.05333333333333334, + 'quartile_1': 0.15333333333333332, + 'quartile_3': 0.2733333333333333, + 'skewness': -0.2884073435034653, + 'std': 0.10624918300339484, + }, + 'values_aggregate': { + 'count': 150, + 'kurtosis': -1.3397541711393433, + 'max': 2.5, + 'mean': 1.1986666666666672, + 'median': 1.3, + 'min': 0.1, + 'quartile_1': 0.3, + 'quartile_3': 1.8, + 'skewness': -0.10499656214412734, + 'std': 0.7631607417008414, + }, + })) + self.assertEqual(round_numbers(test_utils.convert_through_json(dataframe.metadata.query_column(5)['data_metafeatures'])), round_numbers({ + 'default_accuracy': 0.3333333333333333, + 'entropy_of_values': 1.0986122886681096, + 'equivalent_number_of_numeric_attributes': 1.7538156960944151, + 'joint_entropy_of_attributes': { + 'count': 4, + 'kurtosis': -4.468260105522818, + 'max': 0.9180949375453917, + 'mean': 0.6264126219845205, + 'median': 0.6607409495199184, + 'min': 0.26607365135285327, + 'quartile_1': 0.3993550878466134, + 'quartile_3': 0.8877984836578254, + 'skewness': -0.24309705749856694, + 'std': 0.3221913428169348, + }, + 'joint_entropy_of_numeric_attributes': { + 'count': 4, + 'kurtosis': -5.533056612798099, + 'max': 2.1801835659431514, + 'mean': 1.8888840924201158, + 'median': 1.8856077827026931, + 'min': 1.604137238331926, + 'quartile_1': 1.6476031549386407, + 'quartile_3': 2.1268887201841684, + 'skewness': 0.01639056780792744, + 'std': 0.29770030633854977, + }, + 'mutual_information_of_numeric_attributes': { + 'count': 4, + 'kurtosis': -4.468260105522818, + 'max': 0.9180949375453917, + 'mean': 0.6264126219845205, + 'median': 0.6607409495199184, + 'min': 0.26607365135285327, + 'quartile_1': 0.3993550878466134, + 'quartile_3': 0.8877984836578254, + 'skewness': -0.24309705749856694, + 'std': 0.3221913428169348, + }, + 'number_distinct_values': 3, + 
'number_of_missing_values': 0, + 'number_of_present_values': 150, + 'numeric_noise_to_signal_ratio': 1.2615834611511623, + 'ratio_of_missing_values': 0.0, + 'ratio_of_present_values': 1.0, + 'value_counts_aggregate': { + 'count': 3, + 'max': 50, + 'mean': 50.0, + 'median': 50.0, + 'min': 50, + 'quartile_1': 50.0, + 'quartile_3': 50.0, + 'skewness': 0, + 'std': 0.0, + }, + 'value_probabilities_aggregate': { + 'count': 3, + 'max': 0.3333333333333333, + 'mean': 0.3333333333333333, + 'median': 0.3333333333333333, + 'min': 0.3333333333333333, + 'quartile_1': 0.3333333333333333, + 'quartile_3': 0.3333333333333333, + 'skewness': 0, + 'std': 0.0, + }, + })) + + def test_database_with_parsed_categorical_columns(self): + self.maxDiff = None + + dataframe = self._get_database(True) + + hyperparams_class = compute_metafeatures.ComputeMetafeaturesPrimitive.metadata.get_hyperparams() + primitive = compute_metafeatures.ComputeMetafeaturesPrimitive(hyperparams=hyperparams_class.defaults()) + dataframe = primitive.produce(inputs=dataframe).value + + self._test_database_metafeatures(dataframe.metadata, True) + + def test_database_without_parsed_categorical_columns(self): + self.maxDiff = None + + dataframe = self._get_database(False) + + hyperparams_class = compute_metafeatures.ComputeMetafeaturesPrimitive.metadata.get_hyperparams() + primitive = compute_metafeatures.ComputeMetafeaturesPrimitive(hyperparams=hyperparams_class.defaults()) + dataframe = primitive.produce(inputs=dataframe).value + + self._test_database_metafeatures(dataframe.metadata, False) + + def _test_database_metafeatures(self, metadata, parse_categorical_columns): + expected_metafeatures = { + 'attribute_counts_by_semantic_type': { + 'http://schema.org/DateTime': 1, + 'http://schema.org/Integer': 1, + 'http://schema.org/Text': 2, + 'https://metadata.datadrivendiscovery.org/types/Attribute': 6, + 'https://metadata.datadrivendiscovery.org/types/CategoricalData': 2, + }, + 'attribute_counts_by_structural_type': { + 'float': 2, + 'str': 4, + }, + 'attribute_ratios_by_semantic_type': { + 'http://schema.org/DateTime': 0.16666666666666666, + 'http://schema.org/Integer': 0.16666666666666666, + 'http://schema.org/Text': 0.3333333333333333, + 'https://metadata.datadrivendiscovery.org/types/Attribute': 1.0, + 'https://metadata.datadrivendiscovery.org/types/CategoricalData': 0.3333333333333333, + }, + 'attribute_ratios_by_structural_type': { + 'float': 0.3333333333333333, + 'str': 0.6666666666666666, + }, + 'dimensionality': 0.13333333333333333, + 'entropy_of_attributes': { + 'count': 4, + 'kurtosis': 1.5975414707531783, + 'max': 1.6094379124341005, + 'mean': 1.1249524175825663, + 'median': 1.0986122886681096, + 'min': 0.6931471805599453, + 'quartile_1': 0.9972460116410685, + 'quartile_3': 1.2263186946096072, + 'skewness': 0.4183300365459641, + 'std': 0.3753085673700856, + }, + 'entropy_of_categorical_attributes': { + 'count': 2, + 'max': 1.6094379124341005, + 'mean': 1.354025100551105, + 'median': 1.354025100551105, + 'min': 1.0986122886681096, + 'quartile_1': 1.2263186946096072, + 'quartile_3': 1.4817315064926029, + 'std': 0.3612082625687802, + }, + 'entropy_of_discrete_attributes': { + 'count': 2, + 'max': 1.0986122886681096, + 'mean': 0.8958797346140275, + 'median': 0.8958797346140275, + 'min': 0.6931471805599453, + 'quartile_1': 0.7945134575869863, + 'quartile_3': 0.9972460116410685, + 'std': 0.28670712747781957, + }, + 'entropy_of_numeric_attributes': { + 'count': 2, + 'max': 1.0986122886681096, + 'mean': 0.8958797346140275, + 'median': 
0.8958797346140275, + 'min': 0.6931471805599453, + 'quartile_1': 0.7945134575869863, + 'quartile_3': 0.9972460116410685, + 'std': 0.28670712747781957, + }, + 'kurtosis_of_attributes': { + 'count': 2, + 'max': -1.5348837209302326, + 'mean': -1.8415159345391905, + 'median': -1.8415159345391905, + 'min': -2.1481481481481484, + 'quartile_1': -1.9948320413436693, + 'quartile_3': -1.6881998277347114, + 'std': 0.4336434351462721, + }, + 'mean_of_attributes': { + 'count': 2, + 'max': 946713600.0, + 'mean': 473356800.75, + 'median': 473356800.75, + 'min': 1.5, + 'quartile_1': 236678401.125, + 'quartile_3': 710035200.375, + 'std': 669427605.3408685, + }, + 'number_distinct_values_of_categorical_attributes': { + 'count': 2, + 'max': 5, + 'mean': 4.0, + 'median': 4.0, + 'min': 3, + 'quartile_1': 3.5, + 'quartile_3': 4.5, + 'std': 1.4142135623730951, + }, + 'number_distinct_values_of_discrete_attributes': { + 'count': 2, + 'max': 3, + 'mean': 2.5, + 'median': 2.5, + 'min': 2, + 'quartile_1': 2.25, + 'quartile_3': 2.75, + 'std': 0.7071067811865476, + }, + 'number_distinct_values_of_numeric_attributes': { + 'count': 2, + 'max': 3, + 'mean': 2.5, + 'median': 2.5, + 'min': 2, + 'quartile_1': 2.25, + 'quartile_3': 2.75, + 'std': 0.7071067811865476, + }, + 'number_of_attributes': 6, + 'number_of_binary_attributes': 1, + 'number_of_categorical_attributes': 2, + 'number_of_discrete_attributes': 2, + 'number_of_instances': 45, + 'number_of_instances_with_missing_values': 15, + 'number_of_instances_with_present_values': 45, + 'number_of_numeric_attributes': 2, + 'number_of_other_attributes': 0, + 'number_of_string_attributes': 2, + 'ratio_of_binary_attributes': 0.16666666666666666, + 'ratio_of_categorical_attributes': 0.3333333333333333, + 'ratio_of_discrete_attributes': 0.3333333333333333, + 'ratio_of_instances_with_missing_values': 0.3333333333333333, + 'ratio_of_instances_with_present_values': 1.0, + 'ratio_of_numeric_attributes': 0.3333333333333333, + 'ratio_of_other_attributes': 0.0, + 'ratio_of_string_attributes': 0.3333333333333333, + 'skew_of_attributes': { + 'count': 2, + 'max': 0.00017349603091112943, + 'mean': 8.674801545556472e-05, + 'median': 8.674801545556472e-05, + 'min': 0.0, + 'quartile_1': 4.337400772778236e-05, + 'quartile_3': 0.00013012202318334707, + 'std': 0.0001226802199662105, + }, + 'standard_deviation_of_attributes': { + 'count': 2, + 'max': 260578306.67149138, + 'mean': 130289153.59001951, + 'median': 130289153.59001951, + 'min': 0.5085476277156078, + 'quartile_1': 65144577.049283564, + 'quartile_3': 195433730.13075545, + 'std': 184256687.31792185, + }, + } + + if parse_categorical_columns: + expected_metafeatures['attribute_counts_by_structural_type'] = { + 'float': 2, + 'int': 2, + 'str': 2, + } + expected_metafeatures['attribute_ratios_by_structural_type'] = { + 'float': 0.3333333333333333, + 'int': 0.3333333333333333, + 'str': 0.3333333333333333, + } + + self.assertEqual(round_numbers(test_utils.convert_through_json(metadata.query(())['data_metafeatures'])), round_numbers(expected_metafeatures)) + self.assertFalse('data_metafeatures' in metadata.query_column(0)) + + expected_metafeatures = { + 'entropy_of_values': 1.0986122886681096, + 'number_distinct_values': 3, + 'number_of_missing_values': 0, + 'number_of_present_values': 45, + 'ratio_of_missing_values': 0.0, + 'ratio_of_present_values': 1.0, + 'value_counts_aggregate': { + 'count': 3, + 'max': 15, + 'mean': 15.0, + 'median': 15.0, + 'min': 15, + 'quartile_1': 15.0, + 'quartile_3': 15.0, + 'skewness': 0, + 'std': 0.0, + }, + 
'value_probabilities_aggregate': { + 'count': 3, + 'max': 0.3333333333333333, + 'mean': 0.3333333333333333, + 'median': 0.3333333333333333, + 'min': 0.3333333333333333, + 'quartile_1': 0.3333333333333333, + 'quartile_3': 0.3333333333333333, + 'skewness': 0, + 'std': 0.0, + }, + } + + if parse_categorical_columns: + expected_metafeatures['values_aggregate'] = { + 'count': 45, + 'kurtosis': -1.5348837209302337, + 'max': 3183890296585507471, + 'mean': 1.3152606765673695e+18, + 'median': 5.866629697275507e+17, + 'min': 175228763389048878, + 'quartile_1': 1.7522876338904886e+17, + 'quartile_3': 3.1838902965855073e+18, + 'skewness': 0.679711376572956, + 'std': 1.3470047628846746e+18, + } + + self.assertEqual(round_numbers(test_utils.convert_through_json(metadata.query_column(1)['data_metafeatures'])), round_numbers(expected_metafeatures)) + self.assertEqual(round_numbers(test_utils.convert_through_json(metadata.query_column(2)['data_metafeatures'])), round_numbers({ + 'number_of_missing_values': 0, + 'number_of_present_values': 45, + 'ratio_of_missing_values': 0.0, + 'ratio_of_present_values': 1.0, + })) + self.assertEqual(round_numbers(test_utils.convert_through_json(metadata.query_column(3)['data_metafeatures'])), round_numbers({ + 'entropy_of_values': 0.6931471805599453, + 'number_distinct_values': 2, + 'number_of_missing_values': 15, + 'number_of_negative_numeric_values': 0, + 'number_of_numeric_values': 30, + 'number_of_numeric_values_equal_-1': 0, + 'number_of_numeric_values_equal_0': 0, + 'number_of_numeric_values_equal_1': 15, + 'number_of_positive_numeric_values': 30, + 'number_of_present_values': 30, + 'ratio_of_missing_values': 0.3333333333333333, + 'ratio_of_negative_numeric_values': 0.0, + 'ratio_of_numeric_values': 0.6666666666666666, + 'ratio_of_numeric_values_equal_-1': 0.0, + 'ratio_of_numeric_values_equal_0': 0.0, + 'ratio_of_numeric_values_equal_1': 0.3333333333333333, + 'ratio_of_positive_numeric_values': 0.6666666666666666, + 'ratio_of_present_values': 0.6666666666666666, + 'value_counts_aggregate': { + 'count': 2, + 'max': 15, + 'mean': 15.0, + 'median': 15.0, + 'min': 15, + 'quartile_1': 15.0, + 'quartile_3': 15.0, + 'std': 0.0, + }, + 'value_probabilities_aggregate': { + 'count': 2, + 'max': 0.5, + 'mean': 0.5, + 'median': 0.5, + 'min': 0.5, + 'quartile_1': 0.5, + 'quartile_3': 0.5, + 'std': 0.0, + }, + 'values_aggregate': { + 'count': 30, + 'kurtosis': -2.1481481481481484, + 'max': 2.0, + 'mean': 1.5, + 'median': 1.5, + 'min': 1.0, + 'quartile_1': 1.0, + 'quartile_3': 2.0, + 'skewness': 0.0, + 'std': 0.5085476277156078, + }, + })) + self.assertEqual(round_numbers(test_utils.convert_through_json(metadata.query_column(4)['data_metafeatures'])), round_numbers({ + 'number_of_missing_values': 0, + 'number_of_present_values': 45, + 'ratio_of_missing_values': 0.0, + 'ratio_of_present_values': 1.0, + })) + + expected_metafeatures = { + 'entropy_of_values': 1.6094379124341005, + 'number_distinct_values': 5, + 'number_of_missing_values': 0, + 'number_of_present_values': 45, + 'ratio_of_missing_values': 0.0, + 'ratio_of_present_values': 1.0, + 'value_counts_aggregate': { + 'count': 5, + 'kurtosis': 0, + 'max': 9, + 'mean': 9.0, + 'median': 9.0, + 'min': 9, + 'quartile_1': 9.0, + 'quartile_3': 9.0, + 'skewness': 0, + 'std': 0.0, + }, + 'value_probabilities_aggregate': { + 'count': 5, + 'kurtosis': 0, + 'max': 0.2, + 'mean': 0.2, + 'median': 0.2, + 'min': 0.2, + 'quartile_1': 0.2, + 'quartile_3': 0.2, + 'skewness': 0, + 'std': 0.0, + }, + } + + if parse_categorical_columns: + 
expected_metafeatures['values_aggregate'] = { + 'count': 45, + 'kurtosis': -0.8249445297886884, + 'max': 17926897368031380755, + 'mean': 1.1617029581691474e+19, + 'median': 1.1818891258207388e+19, + 'min': 4819821729471251610, + 'quartile_1': 9.804127312560234e+18, + 'quartile_3': 1.3715410240187093e+19, + 'skewness': -0.15176089654708094, + 'std': 4.378987201456074e+18, + } + + self.assertEqual(round_numbers(test_utils.convert_through_json(metadata.query_column(5)['data_metafeatures'])), round_numbers(expected_metafeatures)) + self.assertEqual(round_numbers(test_utils.convert_through_json(metadata.query_column(6)['data_metafeatures'])), round_numbers({ + 'entropy_of_values': 1.0986122886681096, + 'number_distinct_values': 3, + 'number_of_missing_values': 0, + 'number_of_negative_numeric_values': 0, + 'number_of_numeric_values': 45, + 'number_of_numeric_values_equal_-1': 0, + 'number_of_numeric_values_equal_0': 0, + 'number_of_numeric_values_equal_1': 0, + 'number_of_positive_numeric_values': 45, + 'number_of_present_values': 45, + 'ratio_of_missing_values': 0.0, + 'ratio_of_negative_numeric_values': 0.0, + 'ratio_of_numeric_values': 1.0, + 'ratio_of_numeric_values_equal_-1': 0.0, + 'ratio_of_numeric_values_equal_0': 0.0, + 'ratio_of_numeric_values_equal_1': 0.0, + 'ratio_of_positive_numeric_values': 1.0, + 'ratio_of_present_values': 1.0, + 'value_counts_aggregate': { + 'count': 3, + 'max': 15, + 'mean': 15.0, + 'median': 15.0, + 'min': 15, + 'quartile_1': 15.0, + 'quartile_3': 15.0, + 'skewness': 0, + 'std': 0.0, + }, + 'value_probabilities_aggregate': { + 'count': 3, + 'max': 0.3333333333333333, + 'mean': 0.3333333333333333, + 'median': 0.3333333333333333, + 'min': 0.3333333333333333, + 'quartile_1': 0.3333333333333333, + 'quartile_3': 0.3333333333333333, + 'skewness': 0, + 'std': 0.0, + }, + 'values_aggregate': { + 'count': 45, + 'kurtosis': -1.5348837209302326, + 'max': 1262304000.0, + 'mean': 946713600.0, + 'median': 946684800.0, + 'min': 631152000.0, + 'quartile_1': 631152000.0, + 'quartile_3': 1262304000.0, + 'skewness': 0.00017349603091112943, + 'std': 260578306.67149138, + }, + })) + + expected_metafeatures = { + 'categorical_noise_to_signal_ratio': 6.856024896846719, + 'discrete_noise_to_signal_ratio': 16.280596971377722, + 'entropy_of_values': 1.2922333886497557, + 'equivalent_number_of_attributes': 7.497510695804063, + 'equivalent_number_of_categorical_attributes': 7.497510695804063, + 'equivalent_number_of_discrete_attributes': 24.925850557201, + 'equivalent_number_of_numeric_attributes': 24.925850557201, + 'joint_entropy_of_attributes': { + 'count': 4, + 'kurtosis': 3.8310594212937232, + 'max': 0.27405736318703244, + 'mean': 0.11209904602421886, + 'median': 0.06401513288957879, + 'min': 0.04630855513068542, + 'quartile_1': 0.05461037397689525, + 'quartile_3': 0.12150380493690241, + 'skewness': 1.949786087429789, + 'std': 0.10842988984399864, + }, + 'joint_entropy_of_categorical_attributes': { + 'count': 2, + 'max': 2.6276139378968235, + 'mean': 2.473903498180581, + 'median': 2.473903498180581, + 'min': 2.3201930584643393, + 'quartile_1': 2.3970482783224605, + 'quartile_3': 2.5507587180387024, + 'std': 0.2173793885250416, + }, + 'joint_entropy_of_discrete_attributes': { + 'count': 2, + 'max': 2.3334680303922335, + 'mean': 2.139600733638498, + 'median': 2.139600733638498, + 'min': 1.945733436884763, + 'quartile_1': 2.0426670852616304, + 'quartile_3': 2.236534382015366, + 'std': 0.2741697603697419, + }, + 'joint_entropy_of_numeric_attributes': { + 'count': 2, + 'max': 
2.3334680303922335, + 'mean': 2.139600733638498, + 'median': 2.139600733638498, + 'min': 1.945733436884763, + 'quartile_1': 2.0426670852616304, + 'quartile_3': 2.236534382015366, + 'std': 0.2741697603697419, + }, + 'mutual_information_of_attributes': { + 'count': 2, + 'max': 0.27405736318703244, + 'mean': 0.17235499102027907, + 'median': 0.17235499102027907, + 'min': 0.07065261885352572, + 'quartile_1': 0.12150380493690241, + 'quartile_3': 0.22320617710365576, + 'std': 0.1438288740437386, + }, + 'mutual_information_of_categorical_attributes': { + 'count': 2, + 'max': 0.27405736318703244, + 'mean': 0.17235499102027907, + 'median': 0.17235499102027907, + 'min': 0.07065261885352572, + 'quartile_1': 0.12150380493690241, + 'quartile_3': 0.22320617710365576, + 'std': 0.1438288740437386, + }, + 'mutual_information_of_discrete_attributes': { + 'count': 2, + 'max': 0.05737764692563185, + 'mean': 0.05184310102815864, + 'median': 0.05184310102815864, + 'min': 0.04630855513068542, + 'quartile_1': 0.049075828079422026, + 'quartile_3': 0.05461037397689525, + 'std': 0.007827029869782995, + }, + 'mutual_information_of_numeric_attributes': { + 'count': 2, + 'max': 0.05737764692563185, + 'mean': 0.05184310102815864, + 'median': 0.05184310102815864, + 'min': 0.04630855513068542, + 'quartile_1': 0.049075828079422026, + 'quartile_3': 0.05461037397689525, + 'std': 0.007827029869782995, + }, + 'noise_to_signal_ratio': 5.526950051885679, + 'number_distinct_values': 45, + 'number_of_missing_values': 0, + 'number_of_negative_numeric_values': 0, + 'number_of_numeric_values': 45, + 'number_of_numeric_values_equal_-1': 0, + 'number_of_numeric_values_equal_0': 0, + 'number_of_numeric_values_equal_1': 0, + 'number_of_positive_numeric_values': 45, + 'number_of_present_values': 45, + 'numeric_noise_to_signal_ratio': 16.280596971377722, + 'ratio_of_missing_values': 0.0, + 'ratio_of_negative_numeric_values': 0.0, + 'ratio_of_numeric_values': 1.0, + 'ratio_of_numeric_values_equal_-1': 0.0, + 'ratio_of_numeric_values_equal_0': 0.0, + 'ratio_of_numeric_values_equal_1': 0.0, + 'ratio_of_positive_numeric_values': 1.0, + 'ratio_of_present_values': 1.0, + 'value_counts_aggregate': { + 'count': 4, + 'kurtosis': 0.2795705816375573, + 'max': 19, + 'mean': 11.25, + 'median': 10.0, + 'min': 6, + 'quartile_1': 7.5, + 'quartile_3': 13.75, + 'skewness': 1.0126926768695854, + 'std': 5.737304826019502, + }, + 'value_probabilities_aggregate': { + 'count': 4, + 'kurtosis': 0.2795705816375609, + 'max': 0.4222222222222222, + 'mean': 0.25, + 'median': 0.2222222222222222, + 'min': 0.13333333333333333, + 'quartile_1': 0.16666666666666666, + 'quartile_3': 0.3055555555555556, + 'skewness': 1.0126926768695859, + 'std': 0.12749566280043337, + }, + 'values_aggregate': { + 'count': 45, + 'kurtosis': -1.376558337329924, + 'max': 70.8170731707317, + 'mean': 54.363425575007106, + 'median': 53.6699876392329, + 'min': 32.328512195122, + 'quartile_1': 45.648691933945, + 'quartile_3': 65.5693658536586, + 'skewness': -0.11742803570367141, + 'std': 11.607381033992365, + }, + } + + if parse_categorical_columns: + # Because the order of string values is different from the order of encoded values, + # the numbers are slightly different between parsed and not parsed cases. 
+ expected_metafeatures['joint_entropy_of_categorical_attributes'] = { + 'count': 2, + 'max': 2.6276139378968226, + 'mean': 2.473903498180581, + 'median': 2.473903498180581, + 'min': 2.3201930584643393, + 'quartile_1': 2.39704827832246, + 'quartile_3': 2.550758718038702, + 'std': 0.217379388525041, + } + expected_metafeatures['joint_entropy_of_attributes'] = { + 'count': 4, + 'kurtosis': 3.8310594212937232, + 'max': 0.27405736318703244, + 'mean': 0.11209904602421886, + 'median': 0.06401513288957879, + 'min': 0.04630855513068542, + 'quartile_1': 0.05461037397689525, + 'quartile_3': 0.12150380493690241, + 'skewness': 1.949786087429789, + 'std': 0.10842988984399864, + } + + self.assertEqual(round_numbers(test_utils.convert_through_json(metadata.query_column(7)['data_metafeatures'])), round_numbers(expected_metafeatures)) + + +if __name__ == '__main__': + unittest.main() diff --git a/tods/common-primitives/tests/test_construct_predictions.py b/tods/common-primitives/tests/test_construct_predictions.py new file mode 100644 index 0000000..531d711 --- /dev/null +++ b/tods/common-primitives/tests/test_construct_predictions.py @@ -0,0 +1,233 @@ +import copy +import os +import unittest + +import numpy + +from d3m import container +from d3m.metadata import base as metadata_base + +from common_primitives import dataset_to_dataframe, construct_predictions, extract_columns_semantic_types + +import utils as test_utils + + +class ConstructPredictionsPrimitiveTestCase(unittest.TestCase): + # TODO: Make this part of metadata API. + # Something like setting a semantic type for given columns. + def _mark_all_targets(self, dataset, targets): + for target in targets: + dataset.metadata = dataset.metadata.add_semantic_type((target['resource_id'], metadata_base.ALL_ELEMENTS, target['column_index']), 'https://metadata.datadrivendiscovery.org/types/Target') + dataset.metadata = dataset.metadata.add_semantic_type((target['resource_id'], metadata_base.ALL_ELEMENTS, target['column_index']), 'https://metadata.datadrivendiscovery.org/types/TrueTarget') + dataset.metadata = dataset.metadata.remove_semantic_type((target['resource_id'], metadata_base.ALL_ELEMENTS, target['column_index']), 'https://metadata.datadrivendiscovery.org/types/Attribute') + + def _get_iris_dataframe(self): + dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'datasets', 'iris_dataset_1', 'datasetDoc.json')) + + dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path)) + + self._mark_all_targets(dataset, [{'resource_id': 'learningData', 'column_index': 5}]) + + hyperparams_class = dataset_to_dataframe.DatasetToDataFramePrimitive.metadata.get_hyperparams() + + primitive = dataset_to_dataframe.DatasetToDataFramePrimitive(hyperparams=hyperparams_class.defaults()) + + call_metadata = primitive.produce(inputs=dataset) + + dataframe = call_metadata.value + + return dataframe + + def test_correct_order(self): + dataframe = self._get_iris_dataframe() + + hyperparams_class = extract_columns_semantic_types.ExtractColumnsBySemanticTypesPrimitive.metadata.get_hyperparams() + + # We extract both the primary index and targets. So it is in the output format already. 
+ primitive = extract_columns_semantic_types.ExtractColumnsBySemanticTypesPrimitive(hyperparams=hyperparams_class.defaults().replace({'semantic_types': ('https://metadata.datadrivendiscovery.org/types/PrimaryKey', 'https://metadata.datadrivendiscovery.org/types/Target',)})) + + call_metadata = primitive.produce(inputs=dataframe) + + targets = call_metadata.value + + # We pretend these are our predictions. + targets.metadata = targets.metadata.remove_semantic_type((metadata_base.ALL_ELEMENTS, 1), 'https://metadata.datadrivendiscovery.org/types/TrueTarget') + targets.metadata = targets.metadata.add_semantic_type((metadata_base.ALL_ELEMENTS, 1), 'https://metadata.datadrivendiscovery.org/types/PredictedTarget') + + # We switch columns around. + targets = targets.select_columns([1, 0]) + + hyperparams_class = construct_predictions.ConstructPredictionsPrimitive.metadata.get_hyperparams() + + construct_primitive = construct_predictions.ConstructPredictionsPrimitive(hyperparams=hyperparams_class.defaults()) + + call_metadata = construct_primitive.produce(inputs=targets, reference=dataframe) + + dataframe = call_metadata.value + + self.assertEqual(list(dataframe.columns), ['d3mIndex', 'species']) + + self._test_metadata(dataframe.metadata) + + def test_all_columns(self): + dataframe = self._get_iris_dataframe() + + # We use all columns. Output has to be just index and targets. + targets = copy.copy(dataframe) + + # We pretend these are our predictions. + targets.metadata = targets.metadata.remove_semantic_type((metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/TrueTarget') + targets.metadata = targets.metadata.add_semantic_type((metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/PredictedTarget') + + hyperparams_class = construct_predictions.ConstructPredictionsPrimitive.metadata.get_hyperparams() + + construct_primitive = construct_predictions.ConstructPredictionsPrimitive(hyperparams=hyperparams_class.defaults()) + + call_metadata = construct_primitive.produce(inputs=targets, reference=dataframe) + + dataframe = call_metadata.value + + self.assertEqual(list(dataframe.columns), ['d3mIndex', 'species']) + + self._test_metadata(dataframe.metadata) + + def test_missing_index(self): + dataframe = self._get_iris_dataframe() + + # We just use all columns. + targets = copy.copy(dataframe) + + # We pretend these are our predictions. + targets.metadata = targets.metadata.remove_semantic_type((metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/TrueTarget') + targets.metadata = targets.metadata.add_semantic_type((metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/PredictedTarget') + + # Remove primary index. This one has to be reconstructed. + targets = targets.remove_columns([0]) + + hyperparams_class = construct_predictions.ConstructPredictionsPrimitive.metadata.get_hyperparams() + + construct_primitive = construct_predictions.ConstructPredictionsPrimitive(hyperparams=hyperparams_class.defaults()) + + call_metadata = construct_primitive.produce(inputs=targets, reference=dataframe) + + dataframe = call_metadata.value + + self.assertEqual(list(dataframe.columns), ['d3mIndex', 'species']) + + self._test_metadata(dataframe.metadata) + + def test_just_targets_no_metadata(self): + dataframe = self._get_iris_dataframe() + + hyperparams_class = extract_columns_semantic_types.ExtractColumnsBySemanticTypesPrimitive.metadata.get_hyperparams() + + # We extract just targets. 
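+        # No primary index is extracted here; the primitive has to reconstruct it from the reference dataframe.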
+ primitive = extract_columns_semantic_types.ExtractColumnsBySemanticTypesPrimitive(hyperparams=hyperparams_class.defaults().replace({'semantic_types': ('https://metadata.datadrivendiscovery.org/types/Target',)})) + + call_metadata = primitive.produce(inputs=dataframe) + + targets = call_metadata.value + + # Remove all metadata. + targets.metadata = metadata_base.DataMetadata().generate(targets) + + hyperparams_class = construct_predictions.ConstructPredictionsPrimitive.metadata.get_hyperparams() + + construct_primitive = construct_predictions.ConstructPredictionsPrimitive(hyperparams=hyperparams_class.defaults()) + + call_metadata = construct_primitive.produce(inputs=targets, reference=dataframe) + + dataframe = call_metadata.value + + self.assertEqual(list(dataframe.columns), ['d3mIndex', 'species']) + + self._test_metadata(dataframe.metadata, True) + + def _test_metadata(self, metadata, no_metadata=False): + self.maxDiff = None + + self.assertEqual(test_utils.convert_through_json(metadata.query(())), { + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/Table', + ], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 150, + } + }) + + self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS,))), { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 2, + } + }) + + self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS, 0))), { + 'name': 'd3mIndex', + 'structural_type': 'str', + 'semantic_types': [ + 'http://schema.org/Integer', + 'https://metadata.datadrivendiscovery.org/types/PrimaryKey', + ], + }) + + if no_metadata: + self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS, 1))), { + 'name': 'species', + 'structural_type': 'str', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/Target', + 'https://metadata.datadrivendiscovery.org/types/PredictedTarget', + ], + }) + + else: + self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS, 1))), { + 'name': 'species', + 'structural_type': 'str', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/CategoricalData', + 'https://metadata.datadrivendiscovery.org/types/SuggestedTarget', + 'https://metadata.datadrivendiscovery.org/types/Target', + 'https://metadata.datadrivendiscovery.org/types/PredictedTarget', + ], + }) + + def test_float_vector(self): + dataframe = container.DataFrame({ + 'd3mIndex': [0], + 'target': [container.ndarray(numpy.array([3,5,9,10]))], + }, generate_metadata=True) + + # Update metadata. 
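+        # Mark the index and the predicted target so ConstructPredictionsPrimitive can locate them.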
+ dataframe.metadata = dataframe.metadata.add_semantic_type((metadata_base.ALL_ELEMENTS, 0), 'https://metadata.datadrivendiscovery.org/types/PrimaryKey') + dataframe.metadata = dataframe.metadata.add_semantic_type((metadata_base.ALL_ELEMENTS, 1), 'https://metadata.datadrivendiscovery.org/types/PredictedTarget') + + hyperparams_class = construct_predictions.ConstructPredictionsPrimitive.metadata.get_hyperparams() + + construct_primitive = construct_predictions.ConstructPredictionsPrimitive(hyperparams=hyperparams_class.defaults()) + + dataframe = construct_primitive.produce(inputs=dataframe, reference=dataframe).value + + self.assertEqual(list(dataframe.columns), ['d3mIndex', 'target']) + + self.assertEqual(dataframe.values.tolist(), [ + [0, '3,5,9,10'], + ]) + + self.assertEqual(dataframe.metadata.query_column(1), { + 'structural_type': str, + 'name': 'target', + 'semantic_types': ( + 'https://metadata.datadrivendiscovery.org/types/PredictedTarget', + ), + }) + + +if __name__ == '__main__': + unittest.main() diff --git a/tods/common-primitives/tests/test_csv_reader.py b/tods/common-primitives/tests/test_csv_reader.py new file mode 100644 index 0000000..3430c33 --- /dev/null +++ b/tods/common-primitives/tests/test_csv_reader.py @@ -0,0 +1,50 @@ +import unittest +import os + +from d3m import container + +from common_primitives import dataset_to_dataframe, csv_reader + + +class CSVReaderPrimitiveTestCase(unittest.TestCase): + def test_basic(self): + dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'datasets', 'timeseries_dataset_2', 'datasetDoc.json')) + + dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path)) + + dataframe_hyperparams_class = dataset_to_dataframe.DatasetToDataFramePrimitive.metadata.get_hyperparams() + dataframe_primitive = dataset_to_dataframe.DatasetToDataFramePrimitive(hyperparams=dataframe_hyperparams_class.defaults().replace({'dataframe_resource': '0'})) + dataframe = dataframe_primitive.produce(inputs=dataset).value + + csv_hyperparams_class = csv_reader.CSVReaderPrimitive.metadata.get_hyperparams() + csv_primitive = csv_reader.CSVReaderPrimitive(hyperparams=csv_hyperparams_class.defaults().replace({'return_result': 'replace'})) + tables = csv_primitive.produce(inputs=dataframe).value + + self.assertEqual(tables.shape, (5, 1)) + + self._test_metadata(tables.metadata) + + def _test_metadata(self, metadata): + self.assertEqual(metadata.query_column(0)['structural_type'], container.DataFrame) + self.assertEqual(metadata.query_column(0)['semantic_types'], ('https://metadata.datadrivendiscovery.org/types/PrimaryKey', 'https://metadata.datadrivendiscovery.org/types/Timeseries', 'https://metadata.datadrivendiscovery.org/types/Table')) + + self.assertEqual(metadata.query_column(0, at=(0, 0)), { + 'structural_type': str, + 'name': 'time', + 'semantic_types': ( + 'http://schema.org/Integer', + 'https://metadata.datadrivendiscovery.org/types/Time', + ) + }) + self.assertEqual(metadata.query_column(1, at=(0, 0)), { + 'structural_type': str, + 'name': 'value', + 'semantic_types': ( + 'http://schema.org/Float', + 'https://metadata.datadrivendiscovery.org/types/Attribute', + ) + }) + + +if __name__ == '__main__': + unittest.main() diff --git a/tods/common-primitives/tests/test_cut_audio.py b/tods/common-primitives/tests/test_cut_audio.py new file mode 100644 index 0000000..da8282a --- /dev/null +++ b/tods/common-primitives/tests/test_cut_audio.py @@ -0,0 +1,122 @@ +import unittest +import os + +from 
d3m import container + +from common_primitives import audio_reader, cut_audio, dataset_to_dataframe, denormalize, column_parser + + +class AudioReaderPrimitiveTestCase(unittest.TestCase): + def test_basic(self): + dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'datasets', 'audio_dataset_1', 'datasetDoc.json')) + + dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path)) + + denormalize_hyperparams_class = denormalize.DenormalizePrimitive.metadata.get_hyperparams() + denormalize_primitive = denormalize.DenormalizePrimitive(hyperparams=denormalize_hyperparams_class.defaults()) + dataset = denormalize_primitive.produce(inputs=dataset).value + + dataframe_hyperparams_class = dataset_to_dataframe.DatasetToDataFramePrimitive.metadata.get_hyperparams() + dataframe_primitive = dataset_to_dataframe.DatasetToDataFramePrimitive(hyperparams=dataframe_hyperparams_class.defaults()) + dataframe = dataframe_primitive.produce(inputs=dataset).value + + column_parser_hyperparams_class = column_parser.ColumnParserPrimitive.metadata.get_hyperparams() + column_parser_primitive = column_parser.ColumnParserPrimitive(hyperparams=column_parser_hyperparams_class.defaults()) + dataframe = column_parser_primitive.produce(inputs=dataframe).value + + audio_hyperparams_class = audio_reader.AudioReaderPrimitive.metadata.get_hyperparams() + audio_primitive = audio_reader.AudioReaderPrimitive(hyperparams=audio_hyperparams_class.defaults()) + dataframe = audio_primitive.produce(inputs=dataframe).value + + self.assertEqual(dataframe.iloc[0, 1], 'test_audio.mp3') + self.assertEqual(dataframe.iloc[0, 5].shape, (4410, 1)) + + cut_audio_hyperparams_class = cut_audio.CutAudioPrimitive.metadata.get_hyperparams() + cut_audio_primitive = cut_audio.CutAudioPrimitive(hyperparams=cut_audio_hyperparams_class.defaults()) + dataframe = cut_audio_primitive.produce(inputs=dataframe).value + + self.assertEqual(dataframe.iloc[0, 1], 'test_audio.mp3') + self.assertEqual(dataframe.iloc[0, 5].shape, (44, 1)) + + self._test_metadata(dataframe.metadata, False) + + def _test_metadata(self, dataframe_metadata, is_can_accept): + self.assertEqual(dataframe_metadata.query_column(2), { + 'structural_type': float, + 'name': 'start', + 'semantic_types': ( + 'http://schema.org/Float', + 'https://metadata.datadrivendiscovery.org/types/Boundary', + 'https://metadata.datadrivendiscovery.org/types/IntervalStart', + ), + }) + self.assertEqual(dataframe_metadata.query_column(3), { + 'structural_type': float, + 'name': 'end', + 'semantic_types': ( + 'http://schema.org/Float', + 'https://metadata.datadrivendiscovery.org/types/Boundary', + 'https://metadata.datadrivendiscovery.org/types/IntervalEnd', + ), + }) + + if is_can_accept: + self.assertEqual(dataframe_metadata.query_column(5), { + 'structural_type': container.ndarray, + 'semantic_types': ( + 'http://schema.org/AudioObject', + 'https://metadata.datadrivendiscovery.org/types/Attribute', + 'https://metadata.datadrivendiscovery.org/types/UniqueKey', + ), + 'name': 'filename', + }) + self.assertEqual(dataframe_metadata.query((0, 5)), { + 'structural_type': container.ndarray, + 'semantic_types': ( + 'http://schema.org/AudioObject', + 'https://metadata.datadrivendiscovery.org/types/Attribute', + 'https://metadata.datadrivendiscovery.org/types/UniqueKey', + ), + 'name': 'filename', + }) + else: + self.assertEqual(dataframe_metadata.query_column(5), { + 'structural_type': container.ndarray, + 'semantic_types': ( + 
'http://schema.org/AudioObject', + 'https://metadata.datadrivendiscovery.org/types/Attribute', + 'https://metadata.datadrivendiscovery.org/types/UniqueKey', + 'https://metadata.datadrivendiscovery.org/types/Table', + ), + 'dimension': { + # The length is set here only because there is only one row. + 'length': 44, + 'name': 'rows', + 'semantic_types': ( + 'https://metadata.datadrivendiscovery.org/types/TabularRow', + ), + }, + 'name': 'filename', + }) + self.assertEqual(dataframe_metadata.query((0, 5)), { + 'structural_type': container.ndarray, + 'semantic_types': ( + 'http://schema.org/AudioObject', + 'https://metadata.datadrivendiscovery.org/types/Attribute', + 'https://metadata.datadrivendiscovery.org/types/UniqueKey', + 'https://metadata.datadrivendiscovery.org/types/Table', + ), + 'dimension': { + 'length': 44, + 'name': 'rows', + 'semantic_types': ( + 'https://metadata.datadrivendiscovery.org/types/TabularRow', + ), + 'sampling_rate': 44100, + }, + 'name': 'filename', + }) + + +if __name__ == '__main__': + unittest.main() diff --git a/tods/common-primitives/tests/test_dataframe_flatten.py b/tods/common-primitives/tests/test_dataframe_flatten.py new file mode 100644 index 0000000..7554132 --- /dev/null +++ b/tods/common-primitives/tests/test_dataframe_flatten.py @@ -0,0 +1,132 @@ +import unittest +import os + +from d3m import container +from d3m.metadata import base as metadata_base + +from common_primitives import dataset_to_dataframe, csv_reader, dataframe_flatten + + +class DataFrameFlattenPrimitiveTestCase(unittest.TestCase): + + COLUMN_METADATA = { + 'time': { + 'structural_type': str, + 'name': 'time', + 'semantic_types': ( + 'http://schema.org/Integer', + 'https://metadata.datadrivendiscovery.org/types/Time' + ), + }, + 'value': { + 'structural_type': str, + 'name': 'value', + 'semantic_types': ( + 'http://schema.org/Float', + 'https://metadata.datadrivendiscovery.org/types/Attribute' + ), + } + } + + def test_replace(self) -> None: + tables = self._load_data() + flat_hyperparams_class = dataframe_flatten.DataFrameFlattenPrimitive.metadata.get_hyperparams() + flat_primitive = dataframe_flatten.DataFrameFlattenPrimitive(hyperparams=flat_hyperparams_class.defaults()) + flat_result = flat_primitive.produce(inputs=tables).value + + self.assertEqual(flat_result.shape, (830, 3)) + + metadata = flat_result.metadata + self._check_filename_metadata(metadata, 0) + self.assertEqual(metadata.query_column(1), self.COLUMN_METADATA['time']) + self.assertEqual(metadata.query_column(2), self.COLUMN_METADATA['value']) + + def test_new(self) -> None: + tables = self._load_data() + + flat_hyperparams_class = dataframe_flatten.DataFrameFlattenPrimitive.metadata.get_hyperparams() + hp = flat_hyperparams_class.defaults().replace({ + 'return_result': 'new', + 'add_index_columns': False + }) + flat_primitive = dataframe_flatten.DataFrameFlattenPrimitive(hyperparams=hp) + flat_result = flat_primitive.produce(inputs=tables).value + + self.assertEqual(flat_result.shape, (830, 2)) + metadata = flat_result.metadata + self.assertEqual(metadata.query_column(0), self.COLUMN_METADATA['time']) + self.assertEqual(metadata.query_column(1), self.COLUMN_METADATA['value']) + + def test_add_index_columns(self) -> None: + tables = self._load_data() + + flat_hyperparams_class = dataframe_flatten.DataFrameFlattenPrimitive.metadata.get_hyperparams() + hp = flat_hyperparams_class.defaults().replace({ + 'return_result': 'new', + 'add_index_columns': True + }) + flat_primitive = 
dataframe_flatten.DataFrameFlattenPrimitive(hyperparams=hp) + flat_result = flat_primitive.produce(inputs=tables).value + + self.assertEqual(flat_result.shape, (830, 3)) + metadata = flat_result.metadata + self._check_filename_metadata(metadata, 0) + self.assertEqual(metadata.query_column(1), self.COLUMN_METADATA['time']) + self.assertEqual(metadata.query_column(2), self.COLUMN_METADATA['value']) + + def test_use_columns(self) -> None: + tables = self._load_data() + + flat_hyperparams_class = dataframe_flatten.DataFrameFlattenPrimitive.metadata.get_hyperparams() + hp = flat_hyperparams_class.defaults().replace({'use_columns': [1]}) + + flat_primitive = dataframe_flatten.DataFrameFlattenPrimitive(hyperparams=hp) + flat_result = flat_primitive.produce(inputs=tables).value + + self.assertEqual(flat_result.shape, (830, 3), [0]) + + metadata = flat_result.metadata + self._check_filename_metadata(metadata, 0) + self.assertEqual(metadata.query_column(1), self.COLUMN_METADATA['time']) + self.assertEqual(metadata.query_column(2), self.COLUMN_METADATA['value']) + + def test_exclude_columns(self) -> None: + tables = self._load_data() + + flat_hyperparams_class = dataframe_flatten.DataFrameFlattenPrimitive.metadata.get_hyperparams() + hp = flat_hyperparams_class.defaults().replace({'exclude_columns': [0]}) + + flat_primitive = dataframe_flatten.DataFrameFlattenPrimitive(hyperparams=hp) + flat_result = flat_primitive.produce(inputs=tables).value + + self.assertEqual(flat_result.shape, (830, 3), [0]) + + metadata = flat_result.metadata + self._check_filename_metadata(metadata, 0) + self.assertEqual(metadata.query_column(1), self.COLUMN_METADATA['time']) + self.assertEqual(metadata.query_column(2), self.COLUMN_METADATA['value']) + + def _load_data(self) -> container.DataFrame: + dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'datasets', 'timeseries_dataset_2', 'datasetDoc.json')) + + dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path)) + + dataframe_hyperparams_class = dataset_to_dataframe.DatasetToDataFramePrimitive.metadata.get_hyperparams() + dataframe_primitive = dataset_to_dataframe.DatasetToDataFramePrimitive(hyperparams=dataframe_hyperparams_class.defaults().replace({'dataframe_resource': '0'})) + dataframe = dataframe_primitive.produce(inputs=dataset).value + + csv_hyperparams_class = csv_reader.CSVReaderPrimitive.metadata.get_hyperparams() + csv_primitive = csv_reader.CSVReaderPrimitive(hyperparams=csv_hyperparams_class.defaults().replace({'return_result': 'append'})) + return csv_primitive.produce(inputs=dataframe).value + + def _check_filename_metadata(self, metadata: metadata_base.Metadata, col_num: int) -> None: + self.assertEqual(metadata.query_column(col_num)['name'], 'filename') + self.assertEqual(metadata.query_column(col_num)['structural_type'], str) + self.assertEqual(metadata.query_column(col_num)['semantic_types'], ( + 'https://metadata.datadrivendiscovery.org/types/PrimaryKey', + 'https://metadata.datadrivendiscovery.org/types/FileName', + 'https://metadata.datadrivendiscovery.org/types/Timeseries')) + + +if __name__ == '__main__': + unittest.main() diff --git a/tods/common-primitives/tests/test_dataframe_image_reader.py b/tods/common-primitives/tests/test_dataframe_image_reader.py new file mode 100644 index 0000000..7368997 --- /dev/null +++ b/tods/common-primitives/tests/test_dataframe_image_reader.py @@ -0,0 +1,46 @@ +import unittest +import os + +from d3m import container + +from 
common_primitives import dataset_to_dataframe, dataframe_image_reader + + +class DataFrameImageReaderPrimitiveTestCase(unittest.TestCase): + def test_basic(self): + dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'datasets', 'image_dataset_1', 'datasetDoc.json')) + + dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path)) + + dataframe_hyperparams_class = dataset_to_dataframe.DatasetToDataFramePrimitive.metadata.get_hyperparams() + dataframe_primitive = dataset_to_dataframe.DatasetToDataFramePrimitive(hyperparams=dataframe_hyperparams_class.defaults().replace({'dataframe_resource': '0'})) + dataframe = dataframe_primitive.produce(inputs=dataset).value + + image_hyperparams_class = dataframe_image_reader.DataFrameImageReaderPrimitive.metadata.get_hyperparams() + image_primitive = dataframe_image_reader.DataFrameImageReaderPrimitive(hyperparams=image_hyperparams_class.defaults().replace({'return_result': 'replace'})) + images = image_primitive.produce(inputs=dataframe).value + + self.assertEqual(images.shape, (5, 1)) + self.assertEqual(images.iloc[0, 0].shape, (225, 150, 3)) + self.assertEqual(images.iloc[1, 0].shape, (32, 32, 3)) + self.assertEqual(images.iloc[2, 0].shape, (32, 32, 3)) + self.assertEqual(images.iloc[3, 0].shape, (28, 28, 1)) + self.assertEqual(images.iloc[4, 0].shape, (28, 28, 1)) + + self._test_metadata(images.metadata) + + self.assertEqual(images.metadata.query((0, 0))['image_reader_metadata'], { + 'jfif': 257, + 'jfif_version': (1, 1), + 'dpi': (96, 96), + 'jfif_unit': 1, + 'jfif_density': (96, 96), + }) + + def _test_metadata(self, metadata): + self.assertEqual(metadata.query_column(0)['structural_type'], container.ndarray) + self.assertEqual(metadata.query_column(0)['semantic_types'], ('https://metadata.datadrivendiscovery.org/types/PrimaryKey', 'http://schema.org/ImageObject')) + + +if __name__ == '__main__': + unittest.main() diff --git a/tods/common-primitives/tests/test_dataframe_to_list.py b/tods/common-primitives/tests/test_dataframe_to_list.py new file mode 100644 index 0000000..512396c --- /dev/null +++ b/tods/common-primitives/tests/test_dataframe_to_list.py @@ -0,0 +1,41 @@ +import unittest + +from d3m import container + +from common_primitives import dataframe_to_list, dataset_to_dataframe + +import utils as test_utils + + +class DataFrameToListPrimitiveTestCase(unittest.TestCase): + def test_basic(self): + # load the iris dataset + dataset = test_utils.load_iris_metadata() + + # convert the dataset into a dataframe + dataset_hyperparams_class = dataset_to_dataframe.DatasetToDataFramePrimitive.metadata.get_hyperparams() + dataframe_primitive = dataset_to_dataframe.DatasetToDataFramePrimitive(hyperparams=dataset_hyperparams_class.defaults()) + dataframe = dataframe_primitive.produce(inputs=dataset).value + + # convert the dataframe into a list + list_hyperparams_class = dataframe_to_list.DataFrameToListPrimitive.metadata.get_hyperparams() + list_primitive = dataframe_to_list.DataFrameToListPrimitive(hyperparams=list_hyperparams_class.defaults()) + list_value = list_primitive.produce(inputs=dataframe).value + + self.assertIsInstance(list_value, container.List) + + # verify dimensions + self.assertEqual(len(list_value), 150) + self.assertEqual(len(list_value[0]), 6) + + # verify data type is unchanged + for row in list_value: + for val in row: + self.assertIsInstance(val, str) + + # validate metadata + test_utils.test_iris_metadata(self, list_value.metadata, 
'd3m.container.list.List', 'd3m.container.list.List') + + +if __name__ == '__main__': + unittest.main() diff --git a/tods/common-primitives/tests/test_dataframe_to_ndarray.py b/tods/common-primitives/tests/test_dataframe_to_ndarray.py new file mode 100644 index 0000000..6e79645 --- /dev/null +++ b/tods/common-primitives/tests/test_dataframe_to_ndarray.py @@ -0,0 +1,40 @@ +import unittest + +from common_primitives import dataframe_to_ndarray, dataset_to_dataframe +from d3m import container + +import utils as test_utils + + +class DataFrameToNDArrayPrimitiveTestCase(unittest.TestCase): + def test_basic(self): + # load the iris dataset + dataset = test_utils.load_iris_metadata() + + # convert the dataset into a dataframe + dataset_hyperparams_class = dataset_to_dataframe.DatasetToDataFramePrimitive.metadata.get_hyperparams() + dataframe_primitive = dataset_to_dataframe.DatasetToDataFramePrimitive(hyperparams=dataset_hyperparams_class.defaults()) + dataframe = dataframe_primitive.produce(inputs=dataset).value + + # convert the dataframe into a numpy array + numpy_hyperparams_class = dataframe_to_ndarray.DataFrameToNDArrayPrimitive.metadata.get_hyperparams() + numpy_primitive = dataframe_to_ndarray.DataFrameToNDArrayPrimitive(hyperparams=numpy_hyperparams_class.defaults()) + numpy_array = numpy_primitive.produce(inputs=dataframe).value + + self.assertIsInstance(numpy_array, container.ndarray) + + # verify dimensions + self.assertEqual(len(numpy_array), 150) + self.assertEqual(len(numpy_array[0]), 6) + + # verify data type is unchanged + for row in numpy_array: + for val in row: + self.assertIsInstance(val, str) + + # validate metadata + test_utils.test_iris_metadata(self, numpy_array.metadata, 'd3m.container.numpy.ndarray') + + +if __name__ == '__main__': + unittest.main() diff --git a/tods/common-primitives/tests/test_dataframe_utils.py b/tods/common-primitives/tests/test_dataframe_utils.py new file mode 100644 index 0000000..9b2b7d7 --- /dev/null +++ b/tods/common-primitives/tests/test_dataframe_utils.py @@ -0,0 +1,27 @@ +import unittest +import os + +from common_primitives import dataframe_utils +from d3m import container +from d3m.base import utils as base_utils + +import utils as test_utils + + +class DataFrameUtilsTestCase(unittest.TestCase): + def test_inclusive(self): + dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'datasets', 'database_dataset_1', 'datasetDoc.json')) + dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path)) + resource = test_utils.get_dataframe(dataset) + + to_keep_indices = [1, 2, 5] + + output = dataframe_utils.select_rows(resource, to_keep_indices) + self.assertEqual(len(output), 3) + self.assertEqual(len(output.iloc[0]), 5) + self.assertEqual(output.iloc[1, 0], '3') + self.assertEqual(output.iloc[2, 0], '6') + + +if __name__ == '__main__': + unittest.main() diff --git a/tods/common-primitives/tests/test_dataset_map.py b/tods/common-primitives/tests/test_dataset_map.py new file mode 100644 index 0000000..a789d4d --- /dev/null +++ b/tods/common-primitives/tests/test_dataset_map.py @@ -0,0 +1,73 @@ +import unittest +import os +import pickle +import sys + +from d3m import container, index, utils as d3m_utils + +TEST_PRIMITIVES_DIR = os.path.join(os.path.dirname(__file__), 'data', 'primitives') +sys.path.insert(0, TEST_PRIMITIVES_DIR) + +from test_primitives.null import NullTransformerPrimitive, NullUnsupervisedLearnerPrimitive + +# To hide any logging or stdout output. 
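+# The test primitives are registered with the d3m index so they can be resolved by their python paths.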
+with d3m_utils.silence(): + index.register_primitive('d3m.primitives.operator.null.TransformerTest', NullTransformerPrimitive) + index.register_primitive('d3m.primitives.operator.null.UnsupervisedLearnerTest', NullUnsupervisedLearnerPrimitive) + +from common_primitives import dataset_to_dataframe, denormalize, dataset_map, column_parser + +import utils as test_utils + + +class DatasetMapTestCase(unittest.TestCase): + def test_basic(self): + self.maxDiff = None + + dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'datasets', 'database_dataset_1', 'datasetDoc.json')) + + dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path)) + + # First we try denormalizing and column parsing. + hyperparams = denormalize.DenormalizePrimitive.metadata.get_hyperparams() + primitive = denormalize.DenormalizePrimitive(hyperparams=hyperparams.defaults()) + dataset_1 = primitive.produce(inputs=dataset).value + + hyperparams = dataset_to_dataframe.DatasetToDataFramePrimitive.metadata.get_hyperparams() + primitive = dataset_to_dataframe.DatasetToDataFramePrimitive(hyperparams=hyperparams.defaults()) + dataframe_1 = primitive.produce(inputs=dataset_1).value + + hyperparams = column_parser.ColumnParserPrimitive.metadata.get_hyperparams() + primitive = column_parser.ColumnParserPrimitive(hyperparams=hyperparams.defaults().replace({'return_result': 'replace'})) + dataframe_1 = primitive.produce(inputs=dataframe_1).value + + # Second we try first column parsing and then denormalizing. + hyperparams = dataset_map.DataFrameDatasetMapPrimitive.metadata.get_hyperparams() + primitive = dataset_map.DataFrameDatasetMapPrimitive( + # We have to make an instance of the primitive ourselves. + hyperparams=hyperparams.defaults().replace({ + 'primitive': column_parser.ColumnParserPrimitive( + hyperparams=column_parser.ColumnParserPrimitive.metadata.get_hyperparams().defaults(), + ), + 'resources': 'all', + }), + + ) + dataset_2 = primitive.produce(inputs=dataset).value + + hyperparams = denormalize.DenormalizePrimitive.metadata.get_hyperparams() + primitive = denormalize.DenormalizePrimitive(hyperparams=hyperparams.defaults()) + dataset_2 = primitive.produce(inputs=dataset_2).value + + hyperparams = dataset_to_dataframe.DatasetToDataFramePrimitive.metadata.get_hyperparams() + primitive = dataset_to_dataframe.DatasetToDataFramePrimitive(hyperparams=hyperparams.defaults()) + dataframe_2 = primitive.produce(inputs=dataset_2).value + + self.assertEqual(test_utils.convert_through_json(dataframe_1), test_utils.convert_through_json(dataframe_2)) + self.assertEqual(dataframe_1.metadata.to_internal_json_structure(), dataframe_2.metadata.to_internal_json_structure()) + + pickle.dumps(primitive) + + +if __name__ == '__main__': + unittest.main() diff --git a/tods/common-primitives/tests/test_dataset_sample.py b/tods/common-primitives/tests/test_dataset_sample.py new file mode 100644 index 0000000..57da93a --- /dev/null +++ b/tods/common-primitives/tests/test_dataset_sample.py @@ -0,0 +1,58 @@ +import os +import pickle +import unittest +import pandas as pd + +from d3m import container +from d3m.metadata import base as metadata_base + +from common_primitives import dataset_sample + + +class DatasetSamplePrimitiveTestCase(unittest.TestCase): + def test_produce(self): + dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'tests', 'data', 'datasets', 'database_dataset_1', 'datasetDoc.json')) + + dataset = 
container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path)) + + hyperparams_class = dataset_sample.DatasetSamplePrimitive.metadata.get_hyperparams() + + # We set semantic types like runtime would. + dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 4), 'https://metadata.datadrivendiscovery.org/types/Target') + dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 4), 'https://metadata.datadrivendiscovery.org/types/TrueTarget') + + sample_sizes = [0.1, 0.5, 0.9, 4, 22, 40] + dataset_sizes = [4, 22, 40, 4, 22, 40] + for s, d in zip(sample_sizes, dataset_sizes): + primitive = dataset_sample.DatasetSamplePrimitive(hyperparams=hyperparams_class.defaults().replace({ + 'sample_size': s, + })) + result = primitive.produce(inputs=dataset).value + self.assertEqual(len(result['learningData'].iloc[:, 0]), d, s) + + def test_empty_test_set(self): + dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'tests', 'data', 'datasets', 'iris_dataset_1', 'datasetDoc.json')) + + dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path)) + + # set target columns to '' to imitate test dataset + dataset['learningData']['species'] = '' + + # We set semantic types like runtime would. + dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Target') + dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/TrueTarget') + + hyperparams_class = dataset_sample.DatasetSamplePrimitive.metadata.get_hyperparams() + + # check that no rows are sampled + sample_sizes = [0.1, 0.5, 0.9] + for s in sample_sizes: + primitive = dataset_sample.DatasetSamplePrimitive(hyperparams=hyperparams_class.defaults().replace({ + 'sample_size': s, + })) + result = primitive.produce(inputs=dataset).value + self.assertEqual(len(result['learningData'].iloc[:, 0]), 150, s) + + +if __name__ == '__main__': + unittest.main() diff --git a/tods/common-primitives/tests/test_dataset_to_dataframe.py b/tods/common-primitives/tests/test_dataset_to_dataframe.py new file mode 100644 index 0000000..a7718be --- /dev/null +++ b/tods/common-primitives/tests/test_dataset_to_dataframe.py @@ -0,0 +1,93 @@ +import os +import unittest + +from d3m import container, utils +from d3m.metadata import base as metadata_base + +from common_primitives import dataset_to_dataframe + +import utils as test_utils + + +class DatasetToDataFramePrimitiveTestCase(unittest.TestCase): + def test_basic(self): + dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'datasets', 'iris_dataset_1', 'datasetDoc.json')) + + dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path)) + + hyperparams_class = dataset_to_dataframe.DatasetToDataFramePrimitive.metadata.get_hyperparams() + + primitive = dataset_to_dataframe.DatasetToDataFramePrimitive(hyperparams=hyperparams_class.defaults()) + + call_metadata = primitive.produce(inputs=dataset) + + dataframe = call_metadata.value + + self.assertIsInstance(dataframe, container.DataFrame) + + for row in dataframe: + for cell in row: + # Nothing should be parsed from a string. 
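+                # Parsing values into their declared types is left to a separate primitive such as ColumnParserPrimitive.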
+ self.assertIsInstance(cell, str) + + self.assertEqual(len(dataframe), 150) + self.assertEqual(len(dataframe.iloc[0]), 6) + + self._test_metadata(dataframe.metadata) + + def _test_metadata(self, metadata): + self.maxDiff = None + + self.assertEqual(test_utils.convert_through_json(metadata.query(())), { + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/Table', + ], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 150, + } + }) + + self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS,))), { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 6, + } + }) + + self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS, 0))), { + 'name': 'd3mIndex', + 'structural_type': 'str', + 'semantic_types': [ + 'http://schema.org/Integer', + 'https://metadata.datadrivendiscovery.org/types/PrimaryKey', + ], + }) + + for i in range(1, 5): + self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS, i))), { + 'name': ['sepalLength', 'sepalWidth', 'petalLength', 'petalWidth'][i - 1], + 'structural_type': 'str', + 'semantic_types': [ + 'http://schema.org/Float', + 'https://metadata.datadrivendiscovery.org/types/Attribute', + ], + }, i) + + self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS, 5))), { + 'name': 'species', + 'structural_type': 'str', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/CategoricalData', + 'https://metadata.datadrivendiscovery.org/types/SuggestedTarget', + 'https://metadata.datadrivendiscovery.org/types/Attribute', + ], + }) + + +if __name__ == '__main__': + unittest.main() diff --git a/tods/common-primitives/tests/test_datetime_field_compose.py b/tods/common-primitives/tests/test_datetime_field_compose.py new file mode 100644 index 0000000..ac93823 --- /dev/null +++ b/tods/common-primitives/tests/test_datetime_field_compose.py @@ -0,0 +1,67 @@ +import math +import os.path +import unittest + +from datetime import datetime +from d3m import container, utils +from d3m.metadata import base as metadata_base + +from common_primitives import dataset_to_dataframe, datetime_field_compose + +import utils as test_utils + + +class DatetimeFieldComposePrimitiveTestCase(unittest.TestCase): + def test_compose_two_fields(self): + dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'datasets', 'timeseries_dataset_3', 'datasetDoc.json')) + dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path)) + resource = test_utils.get_dataframe(dataset) + + compose_hyperparams_class = datetime_field_compose.DatetimeFieldComposePrimitive.metadata.get_hyperparams() + hp = compose_hyperparams_class({ + 'columns': [2,3], + 'join_char': '-', + 'output_name': 'timestamp' + }) + compose_primitive = datetime_field_compose.DatetimeFieldComposePrimitive(hyperparams=hp) + new_dataframe = compose_primitive.produce(inputs=resource).value + + self.assertEqual(new_dataframe.shape, (40, 6)) + self.assertEqual(datetime(2013, 11, 1), new_dataframe['timestamp'][0]) + + col_meta = new_dataframe.metadata.query((metadata_base.ALL_ELEMENTS, 5)) + self.assertEqual(col_meta['name'], 'timestamp') + 
self.assertTrue('https://metadata.datadrivendiscovery.org/types/Time' in col_meta['semantic_types']) + + def test_bad_join_char(self): + dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'datasets', 'timeseries_dataset_3', 'datasetDoc.json')) + dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path)) + resource = test_utils.get_dataframe(dataset) + + compose_hyperparams_class = datetime_field_compose.DatetimeFieldComposePrimitive.metadata.get_hyperparams() + hp = compose_hyperparams_class({ + 'columns': [2,3], + 'join_char': 'cc', + 'output_name': 'timestamp' + }) + compose_primitive = datetime_field_compose.DatetimeFieldComposePrimitive(hyperparams=hp) + with self.assertRaises(ValueError): + compose_primitive.produce(inputs=resource) + + def test_bad_columns(self): + dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'datasets', 'timeseries_dataset_3', 'datasetDoc.json')) + dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path)) + resource = test_utils.get_dataframe(dataset) + + compose_hyperparams_class = datetime_field_compose.DatetimeFieldComposePrimitive.metadata.get_hyperparams() + hp = compose_hyperparams_class({ + 'columns': [1,2], + 'join_char': '-', + 'output_name': 'timestamp' + }) + compose_primitive = datetime_field_compose.DatetimeFieldComposePrimitive(hyperparams=hp) + with self.assertRaises(ValueError): + compose_primitive.produce(inputs=resource) + +if __name__ == '__main__': + unittest.main() diff --git a/tods/common-primitives/tests/test_datetime_range_filter.py b/tods/common-primitives/tests/test_datetime_range_filter.py new file mode 100644 index 0000000..d047e92 --- /dev/null +++ b/tods/common-primitives/tests/test_datetime_range_filter.py @@ -0,0 +1,149 @@ +import unittest +import os + +from datetime import datetime +from dateutil import parser +from common_primitives import datetime_range_filter +from d3m import container + +import utils as test_utils + + +class DatetimeRangeFilterPrimitiveTestCase(unittest.TestCase): + def test_inclusive_strict(self): + dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'datasets', 'timeseries_dataset_1', 'datasetDoc.json')) + dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path)) + resource = test_utils.get_dataframe(dataset) + + filter_hyperparams_class = datetime_range_filter.DatetimeRangeFilterPrimitive.metadata.get_hyperparams() + hp = filter_hyperparams_class.defaults().replace({ + 'column': 3, + 'min': datetime(2013, 11, 8), + 'max': datetime(2013, 12, 3), + 'strict': True, + 'inclusive': True + }) + filter_primitive = datetime_range_filter.DatetimeRangeFilterPrimitive(hyperparams=hp) + new_dataframe = filter_primitive.produce(inputs=resource).value + + self.assertGreater(new_dataframe['Date'].apply(parser.parse).min(), datetime(2013, 11, 8)) + self.assertLess(new_dataframe['Date'].apply(parser.parse).max(), datetime(2013, 12, 3)) + self.assertEqual(15, len(new_dataframe)) + + def test_inclusive_permissive(self): + dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'datasets', 'timeseries_dataset_1', 'datasetDoc.json')) + dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path)) + resource = test_utils.get_dataframe(dataset) + + filter_hyperparams_class = 
datetime_range_filter.DatetimeRangeFilterPrimitive.metadata.get_hyperparams() + hp = filter_hyperparams_class.defaults().replace({ + 'column': 3, + 'min': datetime(2013, 11, 8), + 'max': datetime(2013, 12, 3), + 'strict': False, + 'inclusive': True + }) + filter_primitive = datetime_range_filter.DatetimeRangeFilterPrimitive(hyperparams=hp) + new_dataframe = filter_primitive.produce(inputs=resource).value + + self.assertGreaterEqual(new_dataframe['Date'].apply(parser.parse).min(), datetime(2013, 11, 8)) + self.assertLessEqual(new_dataframe['Date'].apply(parser.parse).max(), datetime(2013, 12, 3)) + self.assertEqual(17, len(new_dataframe)) + + def test_exclusive_strict(self): + dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'datasets', 'timeseries_dataset_1', 'datasetDoc.json')) + dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path)) + resource = test_utils.get_dataframe(dataset) + + filter_hyperparams_class = datetime_range_filter \ + .DatetimeRangeFilterPrimitive.metadata.get_hyperparams() + hp = filter_hyperparams_class.defaults().replace({ + 'column': 3, + 'min': datetime(2013, 11, 8), + 'max': datetime(2013, 12, 3), + 'strict': True, + 'inclusive': False + }) + filter_primitive = datetime_range_filter.DatetimeRangeFilterPrimitive(hyperparams=hp) + new_dataframe = filter_primitive.produce(inputs=resource).value + + self.assertEqual( + len(new_dataframe.loc[ + (new_dataframe['Date'].apply(parser.parse) >= datetime(2013, 11, 8)) & + (new_dataframe['Date'].apply(parser.parse).max() <= datetime(2013, 12, 3))]), 0) + self.assertEqual(23, len(new_dataframe)) + + def test_exclusive_permissive(self): + dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'datasets', 'timeseries_dataset_1', 'datasetDoc.json')) + dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path)) + resource = test_utils.get_dataframe(dataset) + + filter_hyperparams_class = datetime_range_filter \ + .DatetimeRangeFilterPrimitive.metadata.get_hyperparams() + hp = filter_hyperparams_class.defaults().replace({ + 'column': 3, + 'min': datetime(2013, 11, 8), + 'max': datetime(2013, 12, 3), + 'strict': False, + 'inclusive': False + }) + filter_primitive = datetime_range_filter.DatetimeRangeFilterPrimitive(hyperparams=hp) + new_dataframe = filter_primitive.produce(inputs=resource).value + + self.assertEqual( + len(new_dataframe.loc[ + (new_dataframe['Date'].apply(parser.parse) > datetime(2013, 11, 8)) & + (new_dataframe['Date'].apply(parser.parse).max() < datetime(2013, 12, 3))]), 0) + self.assertEqual(25, len(new_dataframe)) + + def test_row_metadata_removal(self): + dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'datasets', 'timeseries_dataset_1', 'datasetDoc.json')) + dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path)) + + # add metadata for rows 0 and 1 + dataset.metadata = dataset.metadata.update(('learningData', 0), {'a': 0}) + dataset.metadata = dataset.metadata.update(('learningData', 5), {'b': 1}) + + resource = test_utils.get_dataframe(dataset) + + # apply filter that removes rows 0 and 1 + filter_hyperparams_class = datetime_range_filter.DatetimeRangeFilterPrimitive.metadata.get_hyperparams() + hp = filter_hyperparams_class.defaults().replace({ + 'column': 3, + 'min': datetime(2013, 11, 4), + 'max': datetime(2013, 11, 7), + 'strict': True, + 'inclusive': False + }) + 
filter_primitive = datetime_range_filter.DatetimeRangeFilterPrimitive(hyperparams=hp) + new_df = filter_primitive.produce(inputs=resource).value + + # verify that the length is correct + self.assertEqual(len(new_df), new_df.metadata.query(())['dimension']['length']) + + # verify that the rows were re-indexed in the metadata + self.assertEqual(new_df.metadata.query((0,))['a'], 0) + self.assertEqual(new_df.metadata.query((1,))['b'], 1) + self.assertFalse('b' in new_df.metadata.query((5,))) + + def test_bad_type_handling(self): + dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'datasets', 'timeseries_dataset_1', 'datasetDoc.json')) + dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path)) + resource = test_utils.get_dataframe(dataset) + + filter_hyperparams_class = datetime_range_filter \ + .DatetimeRangeFilterPrimitive.metadata.get_hyperparams() + hp = filter_hyperparams_class.defaults().replace({ + 'column': 1, + 'min': datetime(2013, 11, 1), + 'max': datetime(2013, 11, 4), + 'strict': False, + 'inclusive': False, + }) + filter_primitive = datetime_range_filter.DatetimeRangeFilterPrimitive(hyperparams=hp) + with self.assertRaises(ValueError): + filter_primitive.produce(inputs=resource) + + +if __name__ == '__main__': + unittest.main() diff --git a/tods/common-primitives/tests/test_denormalize.py b/tods/common-primitives/tests/test_denormalize.py new file mode 100644 index 0000000..0737fed --- /dev/null +++ b/tods/common-primitives/tests/test_denormalize.py @@ -0,0 +1,469 @@ +import os +import unittest + +from d3m import container, utils +from d3m.metadata import base as metadata_base + +from common_primitives import denormalize + +import utils as test_utils + + +class DenormalizePrimitiveTestCase(unittest.TestCase): + def test_discard(self): + dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'datasets', 'database_dataset_1', 'datasetDoc.json')) + + dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path)) + + dataset_metadata_before = dataset.metadata.to_internal_json_structure() + + hyperparams_class = denormalize.DenormalizePrimitive.metadata.get_hyperparams() + + primitive = denormalize.DenormalizePrimitive(hyperparams=hyperparams_class.defaults().replace({ + 'recursive': False, + 'discard_not_joined_tabular_resources': True, + })) + + denormalized_dataset = primitive.produce(inputs=dataset).value + + self.assertIsInstance(denormalized_dataset, container.Dataset) + + self.assertEqual(len(denormalized_dataset), 1) + + self.assertEqual(set(denormalized_dataset['learningData'].iloc[:, 1]), {'AAA', 'BBB', 'CCC'}) + self.assertEqual(set(denormalized_dataset['learningData'].iloc[:, 2]), {'AAA name', 'BBB name', 'CCC name'}) + self.assertEqual(set(denormalized_dataset['learningData'].iloc[:, 3]), {'1', '2', ''}) + self.assertEqual(set(denormalized_dataset['learningData'].iloc[:, 4]), {'aaa', 'bbb', 'ccc', 'ddd', 'eee'}) + self.assertEqual(set(denormalized_dataset['learningData'].iloc[:, 5]), {'1990', '2000', '2010'}) + + self._test_discard_metadata(denormalized_dataset.metadata, dataset_doc_path) + + self.assertEqual(dataset.metadata.to_internal_json_structure(), dataset_metadata_before) + + def _test_discard_metadata(self, metadata, dataset_doc_path): + self.maxDiff = None + + self.assertEqual(test_utils.convert_through_json(metadata.query(())), { + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 
'd3m.container.dataset.Dataset', + 'id': 'database_dataset_1', + 'version': '4.0.0', + 'name': 'A dataset simulating a database dump', + 'location_uris': [ + 'file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path), + ], + 'dimension': { + 'name': 'resources', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/DatasetResource', + ], + 'length': 1, + }, + 'digest': '68c435c6ba9a1c419c79507275c0d5710786dfe481e48f35591d87a7dbf5bb1a', + 'description': 'A synthetic dataset trying to be similar to a database dump, with tables with different relations between them.', + 'source': { + 'license': 'CC', + 'redacted': False, + }, + }) + + self.assertEqual(test_utils.convert_through_json(metadata.query(('learningData',))), { + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/Table', + 'https://metadata.datadrivendiscovery.org/types/DatasetEntryPoint', + ], + 'dimension': { + 'name': 'rows', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/TabularRow', + ], + 'length': 45, + }, + }) + + self.assertEqual(test_utils.convert_through_json(metadata.query(('learningData', metadata_base.ALL_ELEMENTS,))), { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 7, + } + }) + + self.assertEqual(test_utils.convert_through_json(metadata.query(('learningData', metadata_base.ALL_ELEMENTS, 0))), { + 'name': 'd3mIndex', + 'structural_type': 'str', + 'semantic_types': [ + 'http://schema.org/Integer', + 'https://metadata.datadrivendiscovery.org/types/PrimaryKey', + ], + }) + + self.assertEqual(test_utils.convert_through_json(metadata.query(('learningData', metadata_base.ALL_ELEMENTS, 3))), { + 'name': 'author', + 'structural_type': 'str', + 'semantic_types': [ + 'http://schema.org/Integer', + 'https://metadata.datadrivendiscovery.org/types/Attribute', + ], + 'foreign_key': { + 'type': 'COLUMN', + 'resource_id': 'authors', + 'column_index': 0, + }, + }) + + self.assertEqual(test_utils.convert_through_json(metadata.query(('learningData', metadata_base.ALL_ELEMENTS, 1))), { + 'name': 'code', + 'structural_type': 'str', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/CategoricalData', + 'https://metadata.datadrivendiscovery.org/types/Attribute', + ], + }) + + self.assertEqual(test_utils.convert_through_json(metadata.query(('learningData', metadata_base.ALL_ELEMENTS, 2))), { + 'name': 'name', + 'structural_type': 'str', + 'semantic_types': [ + 'http://schema.org/Text', + 'https://metadata.datadrivendiscovery.org/types/Attribute', + ], + }) + + self.assertEqual(test_utils.convert_through_json(metadata.query(('learningData', metadata_base.ALL_ELEMENTS, 4))), { + 'name': 'key', + 'structural_type': 'str', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/CategoricalData', + 'https://metadata.datadrivendiscovery.org/types/Attribute', + ], + }) + + self.assertEqual(test_utils.convert_through_json(metadata.query(('learningData', metadata_base.ALL_ELEMENTS, 5))), { + 'name': 'year', + 'structural_type': 'str', + 'semantic_types': [ + 'http://schema.org/DateTime', + 'https://metadata.datadrivendiscovery.org/types/Attribute', + ], + }) + + self.assertEqual(test_utils.convert_through_json(metadata.query(('learningData', metadata_base.ALL_ELEMENTS, 6))), { + 'name': 'value', + 'structural_type': 'str', + 'semantic_types': [ + 'http://schema.org/Float', + 
'https://metadata.datadrivendiscovery.org/types/SuggestedTarget', + 'https://metadata.datadrivendiscovery.org/types/Attribute', + ], + }) + + def test_recursive(self): + dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'datasets', 'database_dataset_1', 'datasetDoc.json')) + + dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path)) + + dataset_metadata_before = dataset.metadata.to_internal_json_structure() + + hyperparams_class = denormalize.DenormalizePrimitive.metadata.get_hyperparams() + + primitive = denormalize.DenormalizePrimitive(hyperparams=hyperparams_class.defaults().replace({ + 'recursive': True, + 'discard_not_joined_tabular_resources': False, + })) + + denormalized_dataset = primitive.produce(inputs=dataset).value + + self.assertIsInstance(denormalized_dataset, container.Dataset) + + self.assertEqual(len(denormalized_dataset), 4) + + self.assertEqual(denormalized_dataset['values'].shape[0], 64) + self.assertEqual(denormalized_dataset['learningData'].shape[1], 8) + + self.assertEqual(set(denormalized_dataset['learningData'].iloc[:, 1]), {'AAA', 'BBB', 'CCC'}) + self.assertEqual(set(denormalized_dataset['learningData'].iloc[:, 2]), {'AAA name', 'BBB name', 'CCC name'}) + self.assertEqual(set(denormalized_dataset['learningData'].iloc[:, 3]), {'1', '2', ''}) + self.assertEqual(set(denormalized_dataset['learningData'].iloc[:, 4]), {'1 name', '2 name', ''}) + self.assertEqual(set(denormalized_dataset['learningData'].iloc[:, 5]), {'aaa', 'bbb', 'ccc', 'ddd', 'eee'}) + self.assertEqual(set(denormalized_dataset['learningData'].iloc[:, 6]), {'1990', '2000', '2010'}) + + self._test_recursive_metadata(denormalized_dataset.metadata, dataset_doc_path) + + self.assertEqual(dataset.metadata.to_internal_json_structure(), dataset_metadata_before) + + def _test_recursive_metadata(self, metadata, dataset_doc_path): + self.maxDiff = None + + self.assertEqual(test_utils.convert_through_json(metadata.query(())), { + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.dataset.Dataset', + 'id': 'database_dataset_1', + 'version': '4.0.0', + 'name': 'A dataset simulating a database dump', + 'location_uris': [ + 'file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path), + ], + 'dimension': { + 'name': 'resources', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/DatasetResource', + ], + 'length': 4, + }, + 'digest': '68c435c6ba9a1c419c79507275c0d5710786dfe481e48f35591d87a7dbf5bb1a', + 'description': 'A synthetic dataset trying to be similar to a database dump, with tables with different relations between them.', + 'source': { + 'license': 'CC', + 'redacted': False, + }, + }) + + self.assertEqual(test_utils.convert_through_json(metadata.query(('learningData',))), { + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/Table', + 'https://metadata.datadrivendiscovery.org/types/DatasetEntryPoint', + ], + 'dimension': { + 'name': 'rows', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/TabularRow', + ], + 'length': 45, + }, + }) + + self.assertEqual(test_utils.convert_through_json(metadata.query(('learningData', metadata_base.ALL_ELEMENTS,))), { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 8, + } + }) + + 
self.assertEqual(test_utils.convert_through_json(metadata.query(('learningData', metadata_base.ALL_ELEMENTS, 0))), { + 'name': 'd3mIndex', + 'structural_type': 'str', + 'semantic_types': [ + 'http://schema.org/Integer', + 'https://metadata.datadrivendiscovery.org/types/PrimaryKey', + ], + }) + + self.assertEqual(test_utils.convert_through_json(metadata.query(('learningData', metadata_base.ALL_ELEMENTS, 3))), { + 'name': 'id', + 'structural_type': 'str', + 'semantic_types': [ + 'http://schema.org/Integer', + 'https://metadata.datadrivendiscovery.org/types/Attribute', + ], + }) + + self.assertEqual(test_utils.convert_through_json(metadata.query(('learningData', metadata_base.ALL_ELEMENTS, 1))), { + 'name': 'code', + 'structural_type': 'str', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/CategoricalData', + 'https://metadata.datadrivendiscovery.org/types/Attribute', + ], + }) + + for i in [2, 4]: + self.assertEqual(test_utils.convert_through_json(metadata.query(('learningData', metadata_base.ALL_ELEMENTS, i))), { + 'name': ['name', None, 'name'][i - 2], + 'structural_type': 'str', + 'semantic_types': [ + 'http://schema.org/Text', + 'https://metadata.datadrivendiscovery.org/types/Attribute', + ], + }, i) + + self.assertEqual(test_utils.convert_through_json(metadata.query(('learningData', metadata_base.ALL_ELEMENTS, 5))), { + 'name': 'key', + 'structural_type': 'str', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/CategoricalData', + 'https://metadata.datadrivendiscovery.org/types/Attribute', + ], + }) + + self.assertEqual(test_utils.convert_through_json(metadata.query(('learningData', metadata_base.ALL_ELEMENTS, 6))), { + 'name': 'year', + 'structural_type': 'str', + 'semantic_types': [ + 'http://schema.org/DateTime', + 'https://metadata.datadrivendiscovery.org/types/Attribute', + ], + }) + + self.assertEqual(test_utils.convert_through_json(metadata.query(('learningData', metadata_base.ALL_ELEMENTS, 7))), { + 'name': 'value', + 'structural_type': 'str', + 'semantic_types': [ + 'http://schema.org/Float', + 'https://metadata.datadrivendiscovery.org/types/SuggestedTarget', + 'https://metadata.datadrivendiscovery.org/types/Attribute', + ], + }) + + def test_row_order(self): + dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'datasets', 'image_dataset_1', 'datasetDoc.json')) + + dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path)) + + dataset_metadata_before = dataset.metadata.to_internal_json_structure() + + hyperparams_class = denormalize.DenormalizePrimitive.metadata.get_hyperparams() + + primitive = denormalize.DenormalizePrimitive(hyperparams=hyperparams_class.defaults().replace({ + 'recursive': True, + 'discard_not_joined_tabular_resources': False, + })) + + denormalized_dataset = primitive.produce(inputs=dataset).value + + self.assertIsInstance(denormalized_dataset, container.Dataset) + + self.assertEqual(len(denormalized_dataset), 1) + + self.assertEqual(denormalized_dataset['learningData'].shape, (5, 3)) + + self.assertEqual(denormalized_dataset['learningData'].values.tolist(), [ + ['0', 'mnist_0_2.png', 'mnist'], + ['1', 'mnist_1_1.png', 'mnist'], + ['2', '001_HandPhoto_left_01.jpg', 'handgeometry'], + ['3', 'cifar10_bird_1.png', 'cifar'], + ['4', 'cifar10_bird_2.png', 'cifar'], + ]) + + self._test_row_order_metadata(denormalized_dataset.metadata, dataset_doc_path) + + self.assertEqual(dataset.metadata.to_internal_json_structure(), 
dataset_metadata_before) + + def _test_row_order_metadata(self, metadata, dataset_doc_path): + self.maxDiff = None + + self.assertEqual(test_utils.convert_through_json(metadata.query(())), { + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.dataset.Dataset', + 'id': 'image_dataset_1', + 'version': '4.0.0', + 'name': 'Image dataset to be used for tests', + 'location_uris': [ + 'file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path), + ], + 'dimension': { + 'name': 'resources', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/DatasetResource', + ], + 'length': 1, + }, + 'digest': '9b5553ce5ad84dfcefd379814dc6b11ef60a049479e3e91aa1251f7a5ef7409e', + 'description': 'There are a total of 5 image files, one is a left hand from the handgeometry dataset, two birds from cifar10 dataset and 2 figures from mnist dataset.', + 'source': { + 'license': 'Creative Commons Attribution-NonCommercial 4.0', + 'redacted': False, + }, + 'approximate_stored_size': 24000, + }) + + self.assertEqual(test_utils.convert_through_json(metadata.query(('learningData',))), { + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/Table', + 'https://metadata.datadrivendiscovery.org/types/DatasetEntryPoint', + ], + 'dimension': { + 'name': 'rows', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/TabularRow', + ], + 'length': 5, + }, + }) + + self.assertEqual(test_utils.convert_through_json(metadata.query(('learningData', metadata_base.ALL_ELEMENTS,))), { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 3, + } + }) + + self.assertEqual(test_utils.convert_through_json(metadata.query(('learningData', metadata_base.ALL_ELEMENTS, 0))), { + 'name': 'd3mIndex', + 'structural_type': 'str', + 'semantic_types': [ + 'http://schema.org/Integer', + 'https://metadata.datadrivendiscovery.org/types/PrimaryKey', + ], + }) + + self.assertEqual(test_utils.convert_through_json(metadata.query(('learningData', metadata_base.ALL_ELEMENTS, 1))), { + 'name': 'filename', + 'structural_type': 'str', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/FileName', + 'http://schema.org/ImageObject', + 'https://metadata.datadrivendiscovery.org/types/Attribute', + 'https://metadata.datadrivendiscovery.org/types/UniqueKey', + ], + 'location_base_uris': [ + 'file://{dataset_base_path}/media/'.format(dataset_base_path=os.path.dirname(dataset_doc_path)), + ], + 'media_types': [ + 'image/jpeg', + 'image/png', + ], + }) + + self.assertEqual(test_utils.convert_through_json(metadata.query(('learningData', metadata_base.ALL_ELEMENTS, 2))), { + 'name': 'class', + 'structural_type': 'str', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/CategoricalData', + 'https://metadata.datadrivendiscovery.org/types/SuggestedTarget', + 'https://metadata.datadrivendiscovery.org/types/Attribute', + ], + }) + + self.assertEqual(test_utils.convert_through_json(metadata.query(('learningData', 0, 1))), { + 'name': 'filename', + 'structural_type': 'str', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/FileName', + 'http://schema.org/ImageObject', + 'https://metadata.datadrivendiscovery.org/types/Attribute', + 'https://metadata.datadrivendiscovery.org/types/UniqueKey', + ], + 'location_base_uris': [ + 
'file://{dataset_base_path}/media/'.format(dataset_base_path=os.path.dirname(dataset_doc_path)), + ], + 'media_types': [ + 'image/png', + ], + }) + + self.assertEqual(test_utils.convert_through_json(metadata.query(('learningData', 2, 1))), { + 'name': 'filename', + 'structural_type': 'str', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/FileName', + 'http://schema.org/ImageObject', + 'https://metadata.datadrivendiscovery.org/types/Attribute', + 'https://metadata.datadrivendiscovery.org/types/UniqueKey', + ], + 'location_base_uris': [ + 'file://{dataset_base_path}/media/'.format(dataset_base_path=os.path.dirname(dataset_doc_path)), + ], + 'media_types': [ + 'image/jpeg', + ], + }) + + +if __name__ == '__main__': + unittest.main() diff --git a/tods/common-primitives/tests/test_extract_columns_semantic_types.py b/tods/common-primitives/tests/test_extract_columns_semantic_types.py new file mode 100644 index 0000000..aff2b59 --- /dev/null +++ b/tods/common-primitives/tests/test_extract_columns_semantic_types.py @@ -0,0 +1,203 @@ +import os +import unittest + +from d3m import container +from d3m.metadata import base as metadata_base + +from common_primitives import dataset_to_dataframe, extract_columns_semantic_types + +import utils as test_utils + + +class ExtractColumnsBySemanticTypePrimitiveTestCase(unittest.TestCase): + def test_basic(self): + dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'datasets', 'iris_dataset_1', 'datasetDoc.json')) + + dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path)) + + # We set semantic types like runtime would. + dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Target') + dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/TrueTarget') + dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Attribute') + + hyperparams_class = dataset_to_dataframe.DatasetToDataFramePrimitive.metadata.get_hyperparams() + + primitive = dataset_to_dataframe.DatasetToDataFramePrimitive(hyperparams=hyperparams_class.defaults()) + + call_metadata = primitive.produce(inputs=dataset) + + dataframe = call_metadata.value + + hyperparams_class = extract_columns_semantic_types.ExtractColumnsBySemanticTypesPrimitive.metadata.get_hyperparams() + + primitive = extract_columns_semantic_types.ExtractColumnsBySemanticTypesPrimitive(hyperparams=hyperparams_class.defaults().replace({'semantic_types': ('https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/PrimaryKey')})) + + call_metadata = primitive.produce(inputs=dataframe) + + dataframe = call_metadata.value + + self._test_metadata(dataframe.metadata) + + def _test_metadata(self, metadata): + self.maxDiff = None + + self.assertEqual(test_utils.convert_through_json(metadata.query(())), { + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/Table', + ], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 150, + } + }) + + 
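+ # After extracting Attribute and PrimaryKey columns, only d3mIndex plus the four Float measurement columns remain (5 columns in total).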
self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS,))), { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 5, + } + }) + + self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS, 0))), { + 'name': 'd3mIndex', + 'structural_type': 'str', + 'semantic_types': [ + 'http://schema.org/Integer', + 'https://metadata.datadrivendiscovery.org/types/PrimaryKey', + ], + }) + + for i in range(1, 5): + self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS, i))), { + 'name': ['sepalLength', 'sepalWidth', 'petalLength', 'petalWidth'][i - 1], + 'structural_type': 'str', + 'semantic_types': [ + 'http://schema.org/Float', + 'https://metadata.datadrivendiscovery.org/types/Attribute', + ], + }, i) + + self.assertTrue(metadata.get_elements((metadata_base.ALL_ELEMENTS,)) in [[0, 1, 2, 3, 4], [metadata_base.ALL_ELEMENTS, 0, 1, 2, 3, 4]]) + + def test_set(self): + dataset_doc_path = os.path.abspath( + os.path.join( + os.path.dirname(__file__), + "data", + "datasets", + "boston_dataset_1", + "datasetDoc.json", + ) + ) + + dataset = container.Dataset.load( + "file://{dataset_doc_path}".format(dataset_doc_path=dataset_doc_path) + ) + + # We set semantic types like runtime would. + dataset.metadata = dataset.metadata.add_semantic_type( + ("learningData", metadata_base.ALL_ELEMENTS, 14), + "https://metadata.datadrivendiscovery.org/types/Target", + ) + dataset.metadata = dataset.metadata.add_semantic_type( + ("learningData", metadata_base.ALL_ELEMENTS, 14), + "https://metadata.datadrivendiscovery.org/types/TrueTarget", + ) + dataset.metadata = dataset.metadata.remove_semantic_type( + ("learningData", metadata_base.ALL_ELEMENTS, 14), + "https://metadata.datadrivendiscovery.org/types/Attribute", + ) + + hyperparams_class = ( + dataset_to_dataframe.DatasetToDataFramePrimitive.metadata.get_hyperparams() + ) + + primitive = dataset_to_dataframe.DatasetToDataFramePrimitive( + hyperparams=hyperparams_class.defaults() + ) + + call_metadata = primitive.produce(inputs=dataset) + + dataframe = call_metadata.value + + hyperparams_class = ( + extract_columns_semantic_types.ExtractColumnsBySemanticTypesPrimitive.metadata.get_hyperparams() + ) + + primitive = extract_columns_semantic_types.ExtractColumnsBySemanticTypesPrimitive( + hyperparams=hyperparams_class.defaults().replace( + { + "semantic_types": ( + "https://metadata.datadrivendiscovery.org/types/Attribute", + "http://schema.org/Integer", + ), + "match_logic": "equal", + } + ) + ) + + call_metadata = primitive.produce(inputs=dataframe) + + dataframe = call_metadata.value + + self._test_equal_metadata(dataframe.metadata) + + def _test_equal_metadata(self, metadata): + self.maxDiff = None + + self.assertEqual( + test_utils.convert_through_json(metadata.query(())), + { + "structural_type": "d3m.container.pandas.DataFrame", + "semantic_types": [ + "https://metadata.datadrivendiscovery.org/types/Table" + ], + "dimension": { + "name": "rows", + "semantic_types": [ + "https://metadata.datadrivendiscovery.org/types/TabularRow" + ], + "length": 506, + }, + "schema": "https://metadata.datadrivendiscovery.org/schemas/v0/container.json", + }, + ) + + # only one column that should match + self.assertEqual( + test_utils.convert_through_json( + metadata.query((metadata_base.ALL_ELEMENTS,)) + ), + { + "dimension": { + "name": "columns", + "semantic_types": [ + 
"https://metadata.datadrivendiscovery.org/types/TabularColumn" + ], + "length": 1, + } + }, + ) + + self.assertEqual( + test_utils.convert_through_json( + metadata.query((metadata_base.ALL_ELEMENTS, 0)) + ), + { + "name": "TAX", + "structural_type": "str", + "semantic_types": [ + "http://schema.org/Integer", + "https://metadata.datadrivendiscovery.org/types/Attribute", + ], + "description": "full-value property-tax rate per $10,000", + }, + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tods/common-primitives/tests/test_extract_columns_structural_types.py b/tods/common-primitives/tests/test_extract_columns_structural_types.py new file mode 100644 index 0000000..2271181 --- /dev/null +++ b/tods/common-primitives/tests/test_extract_columns_structural_types.py @@ -0,0 +1,89 @@ +import os +import unittest + +from d3m import container +from d3m.metadata import base as metadata_base + +from common_primitives import dataset_to_dataframe, extract_columns_structural_types, column_parser + +import utils as test_utils + + +class ExtractColumnsByStructuralTypesPrimitiveTestCase(unittest.TestCase): + def test_basic(self): + dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'datasets', 'iris_dataset_1', 'datasetDoc.json')) + + dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path)) + + hyperparams_class = dataset_to_dataframe.DatasetToDataFramePrimitive.metadata.get_hyperparams() + + primitive = dataset_to_dataframe.DatasetToDataFramePrimitive(hyperparams=hyperparams_class.defaults()) + + call_metadata = primitive.produce(inputs=dataset) + + dataframe = call_metadata.value + + hyperparams_class = column_parser.ColumnParserPrimitive.metadata.get_hyperparams() + + primitive = column_parser.ColumnParserPrimitive(hyperparams=hyperparams_class.defaults()) + + call_metadata = primitive.produce(inputs=dataframe) + + dataframe = call_metadata.value + + hyperparams_class = extract_columns_structural_types.ExtractColumnsByStructuralTypesPrimitive.metadata.get_hyperparams() + + primitive = extract_columns_structural_types.ExtractColumnsByStructuralTypesPrimitive(hyperparams=hyperparams_class.defaults().replace({'structural_types': ('int',)})) + + call_metadata = primitive.produce(inputs=dataframe) + + dataframe = call_metadata.value + + self._test_metadata(dataframe.metadata) + + def _test_metadata(self, metadata): + self.maxDiff = None + + self.assertEqual(test_utils.convert_through_json(metadata.query(())), { + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/Table', + ], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 150, + } + }) + + self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS,))), { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 2, + } + }) + + self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS, 0))), { + 'name': 'd3mIndex', + 'structural_type': 'int', + 'semantic_types': [ + 'http://schema.org/Integer', + 'https://metadata.datadrivendiscovery.org/types/PrimaryKey', + ], + }) + + self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS, 1))), { + 'name': 'species', + 'structural_type': 'int', + 
'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/CategoricalData', + 'https://metadata.datadrivendiscovery.org/types/SuggestedTarget', + 'https://metadata.datadrivendiscovery.org/types/Attribute', + ], + }) + + +if __name__ == '__main__': + unittest.main() diff --git a/tods/common-primitives/tests/test_fixed_split.py b/tods/common-primitives/tests/test_fixed_split.py new file mode 100644 index 0000000..7059ada --- /dev/null +++ b/tods/common-primitives/tests/test_fixed_split.py @@ -0,0 +1,148 @@ +import os +import pickle +import unittest + +from d3m import container +from d3m.metadata import base as metadata_base + +from common_primitives import fixed_split + + +class FixedSplitDatasetSplitPrimitiveTestCase(unittest.TestCase): + def test_produce_train_values(self): + dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'tests', 'data', 'datasets', 'iris_dataset_1', 'datasetDoc.json')) + + dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path)) + + # We set semantic types like runtime would. + dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Target') + dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/TrueTarget') + dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Attribute') + + hyperparams_class = fixed_split.FixedSplitDatasetSplitPrimitive.metadata.get_hyperparams() + + hyperparams = hyperparams_class.defaults().replace({ + 'primary_index_values': ['9', '11', '13'], + }) + + # We want to make sure "primary_index_values" is encoded just as a list and not + # a pickle because runtime populates this primitive as a list from a split file. + self.assertEqual(hyperparams.values_to_json_structure(), {'primary_index_values': ['9', '11', '13'], 'row_indices': [], 'delete_recursive': False}) + + primitive = fixed_split.FixedSplitDatasetSplitPrimitive(hyperparams=hyperparams) + + primitive.set_training_data(dataset=dataset) + primitive.fit() + + # To test that pickling works. + pickle.dumps(primitive) + + results = primitive.produce(inputs=container.List([0], generate_metadata=True)).value + + self.assertEqual(len(results), 1) + + for dataset in results: + self.assertEqual(len(dataset), 1) + + self.assertEqual(results[0]['learningData'].shape[0], 147) + self.assertEqual(list(results[0]['learningData'].iloc[:, 0]), [str(i) for i in range(150) if i not in [9, 11, 13]]) + + def test_produce_score_values(self): + dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'tests', 'data', 'datasets', 'iris_dataset_1', 'datasetDoc.json')) + + dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path)) + + # We set semantic types like runtime would. 
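+ # Same runtime-style setup as above; this test checks the complementary score split, which should contain only the held-out rows 9, 11 and 13.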
+ dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Target') + dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/TrueTarget') + dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Attribute') + + hyperparams_class = fixed_split.FixedSplitDatasetSplitPrimitive.metadata.get_hyperparams() + + hyperparams = hyperparams_class.defaults().replace({ + 'primary_index_values': ['9', '11', '13'], + }) + + # We want to make sure "primary_index_values" is encoded just as a list and not + # a pickle because runtime populates this primitive as a list from a split file. + self.assertEqual(hyperparams.values_to_json_structure(), {'primary_index_values': ['9', '11', '13'], 'row_indices': [], 'delete_recursive': False}) + + primitive = fixed_split.FixedSplitDatasetSplitPrimitive(hyperparams=hyperparams) + + primitive.set_training_data(dataset=dataset) + primitive.fit() + + results = primitive.produce_score_data(inputs=container.List([0], generate_metadata=True)).value + + self.assertEqual(len(results), 1) + + for dataset in results: + self.assertEqual(len(dataset), 1) + + self.assertEqual(results[0]['learningData'].shape[0], 3) + self.assertEqual(list(results[0]['learningData'].iloc[:, 0]), [str(i) for i in range(150) if i in [9, 11, 13]]) + + def test_produce_train_indices(self): + dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'tests', 'data', 'datasets', 'iris_dataset_1', 'datasetDoc.json')) + + dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path)) + + # We set semantic types like runtime would. + dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Target') + dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/TrueTarget') + dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Attribute') + + hyperparams_class = fixed_split.FixedSplitDatasetSplitPrimitive.metadata.get_hyperparams() + + primitive = fixed_split.FixedSplitDatasetSplitPrimitive(hyperparams=hyperparams_class.defaults().replace({ + 'row_indices': [9, 11, 13], + })) + + primitive.set_training_data(dataset=dataset) + primitive.fit() + + # To test that pickling works. + pickle.dumps(primitive) + + results = primitive.produce(inputs=container.List([0], generate_metadata=True)).value + + self.assertEqual(len(results), 1) + + for dataset in results: + self.assertEqual(len(dataset), 1) + + self.assertEqual(results[0]['learningData'].shape[0], 147) + self.assertEqual(list(results[0]['learningData'].iloc[:, 0]), [str(i) for i in range(150) if i not in [9, 11, 13]]) + + def test_produce_score_indices(self): + dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'tests', 'data', 'datasets', 'iris_dataset_1', 'datasetDoc.json')) + + dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path)) + + # We set semantic types like runtime would. 
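+ # This variant selects rows by position through 'row_indices' rather than by primary-key values; the score split again holds rows 9, 11 and 13.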
+ dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Target') + dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/TrueTarget') + dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Attribute') + + hyperparams_class = fixed_split.FixedSplitDatasetSplitPrimitive.metadata.get_hyperparams() + + primitive = fixed_split.FixedSplitDatasetSplitPrimitive(hyperparams=hyperparams_class.defaults().replace({ + 'row_indices': [9, 11, 13], + })) + + primitive.set_training_data(dataset=dataset) + primitive.fit() + + results = primitive.produce_score_data(inputs=container.List([0], generate_metadata=True)).value + + self.assertEqual(len(results), 1) + + for dataset in results: + self.assertEqual(len(dataset), 1) + + self.assertEqual(results[0]['learningData'].shape[0], 3) + self.assertEqual(list(results[0]['learningData'].iloc[:, 0]), [str(i) for i in range(150) if i in [9, 11, 13]]) + + +if __name__ == '__main__': + unittest.main() diff --git a/tods/common-primitives/tests/test_grouping_field_compose.py b/tods/common-primitives/tests/test_grouping_field_compose.py new file mode 100644 index 0000000..5380be8 --- /dev/null +++ b/tods/common-primitives/tests/test_grouping_field_compose.py @@ -0,0 +1,56 @@ +import math +import os.path +import unittest + +from d3m import container, utils +from d3m.metadata import base as metadata_base + +from common_primitives import dataset_to_dataframe, grouping_field_compose + +import utils as test_utils + + +class GroupingFieldComposePrimitiveTestCase(unittest.TestCase): + def test_compose_two_suggested_fields(self): + dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'datasets', 'timeseries_dataset_3', 'datasetDoc.json')) + dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path)) + resource = test_utils.get_dataframe(dataset) + + compose_hyperparams_class = grouping_field_compose.GroupingFieldComposePrimitive.metadata.get_hyperparams() + hp = compose_hyperparams_class.defaults().replace({ + 'join_char': '-', + 'output_name': 'grouping' + }) + compose_primitive = grouping_field_compose.GroupingFieldComposePrimitive(hyperparams=hp) + new_dataframe = compose_primitive.produce(inputs=resource).value + + self.assertEqual(new_dataframe.shape, (40, 6)) + self.assertEqual('abbv-2013', new_dataframe['grouping'][0]) + + col_meta = new_dataframe.metadata.query((metadata_base.ALL_ELEMENTS, 5)) + self.assertEqual(col_meta['name'], 'grouping') + self.assertTrue('https://metadata.datadrivendiscovery.org/types/GroupingKey' in col_meta['semantic_types']) + + def test_compose_two_specified_fields(self): + dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'datasets', 'timeseries_dataset_3', 'datasetDoc.json')) + dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path)) + resource = test_utils.get_dataframe(dataset) + + compose_hyperparams_class = grouping_field_compose.GroupingFieldComposePrimitive.metadata.get_hyperparams() + hp = compose_hyperparams_class.defaults().replace({ + 'columns': [1,3], + 'join_char': '-', + 'output_name': 'grouping' + }) + compose_primitive = 
grouping_field_compose.GroupingFieldComposePrimitive(hyperparams=hp) + new_dataframe = compose_primitive.produce(inputs=resource).value + + self.assertEqual(new_dataframe.shape, (40, 6)) + self.assertEqual('abbv-11-01', new_dataframe['grouping'][0]) + + col_meta = new_dataframe.metadata.query((metadata_base.ALL_ELEMENTS, 5)) + self.assertEqual(col_meta['name'], 'grouping') + self.assertTrue('https://metadata.datadrivendiscovery.org/types/GroupingKey' in col_meta['semantic_types']) + +if __name__ == '__main__': + unittest.main() diff --git a/tods/common-primitives/tests/test_horizontal_concat.py b/tods/common-primitives/tests/test_horizontal_concat.py new file mode 100644 index 0000000..0f8e78f --- /dev/null +++ b/tods/common-primitives/tests/test_horizontal_concat.py @@ -0,0 +1,183 @@ +import unittest +import os + +import numpy + +from d3m import container +from d3m.metadata import base as metadata_base + +from common_primitives import dataset_to_dataframe, extract_columns_semantic_types, horizontal_concat + + +class HorizontalConcatPrimitiveTestCase(unittest.TestCase): + def test_basic(self): + test_data_inputs = {'col1': [1.0, 2.0, 3.0]} + dataframe_inputs = container.DataFrame(data=test_data_inputs, generate_metadata=True) + + test_data_targets = {'col2': [1, 2 ,3]} + dataframe_targets = container.DataFrame(data=test_data_targets, generate_metadata=True) + + hyperparams_class = horizontal_concat.HorizontalConcatPrimitive.metadata.get_hyperparams() + + primitive = horizontal_concat.HorizontalConcatPrimitive(hyperparams=hyperparams_class.defaults()) + + call_result = primitive.produce(left=dataframe_inputs, right=dataframe_targets) + + dataframe_concat = call_result.value + + self.assertEqual(dataframe_concat.values.tolist(), [[1.0, 1.0], [2.0, 2.0], [3.0, 3.0]]) + + self._test_basic_metadata(dataframe_concat.metadata) + + def _test_basic_metadata(self, metadata): + self.assertEqual(metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'], 2) + self.assertEqual(metadata.query((metadata_base.ALL_ELEMENTS, 0))['name'], 'col1') + self.assertEqual(metadata.query((metadata_base.ALL_ELEMENTS, 0))['structural_type'], numpy.float64) + self.assertEqual(metadata.query((metadata_base.ALL_ELEMENTS, 1))['name'], 'col2') + self.assertEqual(metadata.query((metadata_base.ALL_ELEMENTS, 1))['structural_type'], numpy.int64) + + def _get_iris(self): + dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'datasets', 'iris_dataset_1', 'datasetDoc.json')) + + dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path)) + + # We set semantic types like runtime would. 
+ dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Target') + dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/TrueTarget') + dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Attribute') + + hyperparams_class = dataset_to_dataframe.DatasetToDataFramePrimitive.metadata.get_hyperparams() + + primitive = dataset_to_dataframe.DatasetToDataFramePrimitive(hyperparams=hyperparams_class.defaults()) + + call_metadata = primitive.produce(inputs=dataset) + + dataframe = call_metadata.value + + return dataframe + + def _get_iris_columns(self): + dataframe = self._get_iris() + + hyperparams_class = extract_columns_semantic_types.ExtractColumnsBySemanticTypesPrimitive.metadata.get_hyperparams() + + primitive = extract_columns_semantic_types.ExtractColumnsBySemanticTypesPrimitive(hyperparams=hyperparams_class.defaults().replace({'semantic_types': ('https://metadata.datadrivendiscovery.org/types/PrimaryKey',)})) + + call_metadata = primitive.produce(inputs=dataframe) + + index = call_metadata.value + + primitive = extract_columns_semantic_types.ExtractColumnsBySemanticTypesPrimitive(hyperparams=hyperparams_class.defaults().replace({'semantic_types': ('https://metadata.datadrivendiscovery.org/types/Attribute',)})) + + call_metadata = primitive.produce(inputs=dataframe) + + attributes = call_metadata.value + + primitive = extract_columns_semantic_types.ExtractColumnsBySemanticTypesPrimitive(hyperparams=hyperparams_class.defaults().replace({'semantic_types': ('https://metadata.datadrivendiscovery.org/types/SuggestedTarget',)})) + + call_metadata = primitive.produce(inputs=dataframe) + + targets = call_metadata.value + + return dataframe, index, attributes, targets + + def test_iris(self): + dataframe, index, attributes, targets = self._get_iris_columns() + + hyperparams_class = horizontal_concat.HorizontalConcatPrimitive.metadata.get_hyperparams() + + primitive = horizontal_concat.HorizontalConcatPrimitive(hyperparams=hyperparams_class.defaults()) + + call_metadata = primitive.produce(left=index, right=attributes) + + call_metadata = primitive.produce(left=call_metadata.value, right=targets) + + new_dataframe = call_metadata.value + + self.assertEqual(dataframe.values.tolist(), new_dataframe.values.tolist()) + + self._test_iris_metadata(dataframe.metadata, new_dataframe.metadata) + + def _test_iris_metadata(self, metadata, new_metadata): + self.assertEqual(metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'], new_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length']) + + for i in range(new_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length']): + self.assertEqual(metadata.query((metadata_base.ALL_ELEMENTS, i)), new_metadata.query((metadata_base.ALL_ELEMENTS, i)), i) + + def _get_iris_columns_with_index(self): + dataframe = self._get_iris() + + hyperparams_class = extract_columns_semantic_types.ExtractColumnsBySemanticTypesPrimitive.metadata.get_hyperparams() + + primitive = extract_columns_semantic_types.ExtractColumnsBySemanticTypesPrimitive(hyperparams=hyperparams_class.defaults().replace({'semantic_types': ('https://metadata.datadrivendiscovery.org/types/PrimaryKey',)})) + + call_metadata = primitive.produce(inputs=dataframe) + + index = call_metadata.value 
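+ # Unlike _get_iris_columns above, the attribute and target extracts below also keep the PrimaryKey column, so every returned piece carries d3mIndex.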
+ + primitive = extract_columns_semantic_types.ExtractColumnsBySemanticTypesPrimitive(hyperparams=hyperparams_class.defaults().replace({'semantic_types': ('https://metadata.datadrivendiscovery.org/types/PrimaryKey', 'https://metadata.datadrivendiscovery.org/types/Attribute')})) + + call_metadata = primitive.produce(inputs=dataframe) + + attributes = call_metadata.value + + primitive = extract_columns_semantic_types.ExtractColumnsBySemanticTypesPrimitive(hyperparams=hyperparams_class.defaults().replace({'semantic_types': ('https://metadata.datadrivendiscovery.org/types/PrimaryKey', 'https://metadata.datadrivendiscovery.org/types/SuggestedTarget')})) + + call_metadata = primitive.produce(inputs=dataframe) + + targets = call_metadata.value + + return dataframe, index, attributes, targets + + def test_iris_with_index_removed(self): + dataframe, index, attributes, targets = self._get_iris_columns_with_index() + + hyperparams_class = horizontal_concat.HorizontalConcatPrimitive.metadata.get_hyperparams() + + primitive = horizontal_concat.HorizontalConcatPrimitive(hyperparams=hyperparams_class.defaults().replace({'use_index': False})) + + call_metadata = primitive.produce(left=index, right=attributes) + + call_metadata = primitive.produce(left=call_metadata.value, right=targets) + + new_dataframe = call_metadata.value + + self.assertEqual(dataframe.values.tolist(), new_dataframe.values.tolist()) + + self._test_iris_with_index_removed_metadata(dataframe.metadata, new_dataframe.metadata) + + def _test_iris_with_index_removed_metadata(self, metadata, new_metadata): + self.assertEqual(metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'], new_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length']) + + for i in range(new_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length']): + self.assertEqual(metadata.query((metadata_base.ALL_ELEMENTS, i)), new_metadata.query((metadata_base.ALL_ELEMENTS, i)), i) + + def test_iris_with_index_reorder(self): + dataframe, index, attributes, targets = self._get_iris_columns_with_index() + + # Let's make problems. 
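+ # Sorting the attributes by sepalLength scrambles the row order; the test expects the concat primitive to realign rows on d3mIndex and reproduce the original dataframe.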
+ attributes = attributes.sort_values(by='sepalLength').reset_index(drop=True) + + hyperparams_class = horizontal_concat.HorizontalConcatPrimitive.metadata.get_hyperparams() + + primitive = horizontal_concat.HorizontalConcatPrimitive(hyperparams=hyperparams_class.defaults()) + + call_metadata = primitive.produce(left=index, right=attributes) + + call_metadata = primitive.produce(left=call_metadata.value, right=targets) + + new_dataframe = call_metadata.value + + self.assertEqual(dataframe.values.tolist(), new_dataframe.values.tolist()) + + self._test_iris_with_index_reorder_metadata(dataframe.metadata, new_dataframe.metadata) + + def _test_iris_with_index_reorder_metadata(self, metadata, new_metadata): + self.assertEqual(metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'], new_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length']) + + for i in range(new_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length']): + self.assertEqual(metadata.query((metadata_base.ALL_ELEMENTS, i)), new_metadata.query((metadata_base.ALL_ELEMENTS, i)), i) + + +if __name__ == '__main__': + unittest.main() diff --git a/tods/common-primitives/tests/test_kfold_split.py b/tods/common-primitives/tests/test_kfold_split.py new file mode 100644 index 0000000..9983a6e --- /dev/null +++ b/tods/common-primitives/tests/test_kfold_split.py @@ -0,0 +1,100 @@ +import os +import pickle +import unittest + +from d3m import container +from d3m.metadata import base as metadata_base + +from common_primitives import kfold_split + + +class KFoldDatasetSplitPrimitiveTestCase(unittest.TestCase): + def test_produce_train(self): + dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'tests', 'data', 'datasets', 'database_dataset_1', 'datasetDoc.json')) + + dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path)) + + # We set semantic types like runtime would. + dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 4), 'https://metadata.datadrivendiscovery.org/types/Target') + dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 4), 'https://metadata.datadrivendiscovery.org/types/TrueTarget') + dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 4), 'https://metadata.datadrivendiscovery.org/types/Attribute') + + hyperparams_class = kfold_split.KFoldDatasetSplitPrimitive.metadata.get_hyperparams() + + primitive = kfold_split.KFoldDatasetSplitPrimitive(hyperparams=hyperparams_class.defaults().replace({ + 'number_of_folds': 10, + 'shuffle': True, + 'delete_recursive': True, + })) + + primitive.set_training_data(dataset=dataset) + primitive.fit() + + # To test that pickling works. 
+ pickle.dumps(primitive) + + results = primitive.produce(inputs=container.List([0, 1], generate_metadata=True)).value + + self.assertEqual(len(results), 2) + + for dataset in results: + self.assertEqual(len(dataset), 4) + + self.assertEqual(results[0]['codes'].shape[0], 3) + self.assertEqual(results[1]['codes'].shape[0], 3) + + self.assertEqual(set(results[0]['codes'].iloc[:, 0]), {'AAA', 'BBB', 'CCC'}) + self.assertEqual(len(results[0]['learningData'].iloc[:, 0]), 40) + self.assertEqual(set(results[0]['learningData'].iloc[:, 1]), {'AAA', 'BBB', 'CCC'}) + self.assertEqual(set(results[0]['learningData'].iloc[:, 2]), {'aaa', 'bbb', 'ccc', 'ddd', 'eee'}) + self.assertEqual(set(results[0]['learningData'].iloc[:, 3]), {'1990', '2000', '2010'}) + + self.assertEqual(set(results[1]['codes'].iloc[:, 0]), {'AAA', 'BBB', 'CCC'}) + self.assertEqual(len(results[1]['learningData'].iloc[:, 0]), 40) + self.assertEqual(set(results[1]['learningData'].iloc[:, 1]), {'AAA', 'BBB', 'CCC'}) + self.assertEqual(set(results[1]['learningData'].iloc[:, 2]), {'aaa', 'bbb', 'ccc', 'ddd', 'eee'}) + self.assertEqual(set(results[1]['learningData'].iloc[:, 3]), {'1990', '2000', '2010'}) + + def test_produce_score(self): + dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'tests', 'data', 'datasets', 'database_dataset_1', 'datasetDoc.json')) + + dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path)) + + # We set semantic types like runtime would. + dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 4), 'https://metadata.datadrivendiscovery.org/types/Target') + dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 4), 'https://metadata.datadrivendiscovery.org/types/TrueTarget') + dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 4), 'https://metadata.datadrivendiscovery.org/types/Attribute') + + hyperparams_class = kfold_split.KFoldDatasetSplitPrimitive.metadata.get_hyperparams() + + primitive = kfold_split.KFoldDatasetSplitPrimitive(hyperparams=hyperparams_class.defaults().replace({ + 'number_of_folds': 10, + 'shuffle': True, + 'delete_recursive': True, + })) + + primitive.set_training_data(dataset=dataset) + primitive.fit() + + results = primitive.produce_score_data(inputs=container.List([0, 1], generate_metadata=True)).value + + self.assertEqual(len(results), 2) + + for dataset in results: + self.assertEqual(len(dataset), 4) + + self.assertEqual(set(results[0]['codes'].iloc[:, 0]), {'AAA', 'BBB'}) + self.assertEqual(set(results[0]['learningData'].iloc[:, 0]), {'5', '11', '28', '31', '38'}) + self.assertEqual(set(results[0]['learningData'].iloc[:, 1]), {'AAA', 'BBB'}) + self.assertEqual(set(results[0]['learningData'].iloc[:, 2]), {'aaa', 'bbb', 'ddd', 'eee'}) + self.assertEqual(set(results[0]['learningData'].iloc[:, 3]), {'1990', '2000'}) + + self.assertEqual(set(results[1]['codes'].iloc[:, 0]), {'BBB', 'CCC'}) + self.assertEqual(set(results[1]['learningData'].iloc[:, 0]), {'12', '26', '29', '32', '39'}) + self.assertEqual(set(results[1]['learningData'].iloc[:, 1]), {'BBB', 'CCC'}) + self.assertEqual(set(results[1]['learningData'].iloc[:, 2]), {'bbb', 'ccc', 'ddd', 'eee'}) + self.assertEqual(set(results[1]['learningData'].iloc[:, 3]), {'1990', '2000', '2010'}) + + +if __name__ == '__main__': + unittest.main() diff --git a/tods/common-primitives/tests/test_kfold_timeseries_split.py 
b/tods/common-primitives/tests/test_kfold_timeseries_split.py new file mode 100644 index 0000000..885ab2e --- /dev/null +++ b/tods/common-primitives/tests/test_kfold_timeseries_split.py @@ -0,0 +1,223 @@ +import os +import pickle +import unittest + +from d3m import container +from d3m.metadata import base as metadata_base + +from common_primitives import kfold_split_timeseries + + +class KFoldTimeSeriesSplitPrimitiveTestCase(unittest.TestCase): + def test_produce_train_timeseries_1(self): + dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'tests', 'data', 'datasets', 'timeseries_dataset_1', 'datasetDoc.json')) + + dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path)) + + # We set semantic types like runtime would. + dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 4), 'https://metadata.datadrivendiscovery.org/types/Target') + dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 4), 'https://metadata.datadrivendiscovery.org/types/TrueTarget') + dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 4), 'https://metadata.datadrivendiscovery.org/types/Attribute') + + hyperparams_class = kfold_split_timeseries.KFoldTimeSeriesSplitPrimitive.metadata.get_hyperparams() + + folds = 5 + primitive = kfold_split_timeseries.KFoldTimeSeriesSplitPrimitive(hyperparams=hyperparams_class.defaults().replace({ + 'number_of_folds': folds, + 'number_of_window_folds': 1, + })) + + primitive.set_training_data(dataset=dataset) + primitive.fit() + + # To test that pickling works. + pickle.dumps(primitive) + + results = primitive.produce(inputs=container.List([0, 1], generate_metadata=True)).value + + self.assertEqual(len(results), 2) + + for dataset in results: + self.assertEqual(len(dataset), 1) + + self.assertEqual(len(results[0]['learningData'].iloc[:, 0]), 8) + self.assertEqual(set(results[0]['learningData'].iloc[:, 3]), {'2013-11-05', '2013-11-06', '2013-11-07', '2013-11-08', '2013-11-11', + '2013-11-12', '2013-11-13', '2013-11-14'}) + + self.assertEqual(len(results[1]['learningData'].iloc[:, 0]), 8) + self.assertEqual(set(results[1]['learningData'].iloc[:, 3]), {'2013-11-13', '2013-11-14', '2013-11-15', '2013-11-18', '2013-11-19', + '2013-11-20', '2013-11-21', '2013-11-22'}) + + def test_produce_score_timeseries_1(self): + dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'tests', 'data', 'datasets', 'timeseries_dataset_1', 'datasetDoc.json')) + + dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path)) + + # We set semantic types like runtime would. 
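+ # timeseries_dataset_1 appears to already carry a time column, so only the target column needs to be marked here; the database-dataset tests further below have to fake a Time column instead.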
+ dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 4), 'https://metadata.datadrivendiscovery.org/types/Target') + dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 4), 'https://metadata.datadrivendiscovery.org/types/TrueTarget') + dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 4), 'https://metadata.datadrivendiscovery.org/types/Attribute') + + hyperparams_class = kfold_split_timeseries.KFoldTimeSeriesSplitPrimitive.metadata.get_hyperparams() + + folds = 5 + primitive = kfold_split_timeseries.KFoldTimeSeriesSplitPrimitive(hyperparams=hyperparams_class.defaults().replace({ + 'number_of_folds': folds, + 'number_of_window_folds': 1, + })) + + primitive.set_training_data(dataset=dataset) + primitive.fit() + + results = primitive.produce_score_data(inputs=container.List([0, 1], generate_metadata=True)).value + + self.assertEqual(len(results), 2) + + for dataset in results: + self.assertEqual(len(dataset), 1) + + self.assertEqual(len(results[0]['learningData'].iloc[:, 0]), 6) + self.assertEqual(set(results[0]['learningData'].iloc[:, 3]), {'2013-11-15', '2013-11-18', '2013-11-19', + '2013-11-20', '2013-11-21', '2013-11-22'}) + + self.assertEqual(len(results[1]['learningData'].iloc[:, 0]), 6) + self.assertEqual(set(results[1]['learningData'].iloc[:, 3]), {'2013-11-25', '2013-11-26', '2013-11-27', + '2013-11-29', '2013-12-02', '2013-12-03'}) + + def test_produce_train(self): + dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'tests', 'data', 'datasets', 'database_dataset_1', 'datasetDoc.json')) + + dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path)) + + # We set semantic types like runtime would. + dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 4), 'https://metadata.datadrivendiscovery.org/types/Target') + dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 4), 'https://metadata.datadrivendiscovery.org/types/TrueTarget') + dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 4), 'https://metadata.datadrivendiscovery.org/types/Attribute') + + # We fake that the dataset is time-series. + dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 3), 'https://metadata.datadrivendiscovery.org/types/Time') + + hyperparams_class = kfold_split_timeseries.KFoldTimeSeriesSplitPrimitive.metadata.get_hyperparams() + + folds = 5 + primitive = kfold_split_timeseries.KFoldTimeSeriesSplitPrimitive(hyperparams=hyperparams_class.defaults().replace({ + 'number_of_folds': folds, + 'number_of_window_folds': 1, + })) + + primitive.set_training_data(dataset=dataset) + primitive.fit() + + # To test that pickling works. 
+ pickle.dumps(primitive) + + results = primitive.produce(inputs=container.List([0, 1], generate_metadata=True)).value + + self.assertEqual(len(results), 2) + + for dataset in results: + self.assertEqual(len(dataset), 4) + + self.assertEqual(results[0]['codes'].shape[0], 3) + self.assertEqual(results[1]['codes'].shape[0], 3) + + self.assertEqual(set(results[0]['codes'].iloc[:, 0]), {'AAA', 'BBB', 'CCC'}) + self.assertEqual(len(results[0]['learningData'].iloc[:, 0]), 9) + self.assertEqual(set(results[0]['learningData'].iloc[:, 1]), {'AAA', 'BBB', 'CCC'}) + self.assertEqual(set(results[0]['learningData'].iloc[:, 2]), {'bbb', 'ccc', 'ddd'}) + self.assertEqual(set(results[0]['learningData'].iloc[:, 3]), {'1990'}) + + self.assertEqual(set(results[1]['codes'].iloc[:, 0]), {'AAA', 'BBB', 'CCC'}) + self.assertEqual(len(results[1]['learningData'].iloc[:, 0]), 9) + self.assertEqual(set(results[1]['learningData'].iloc[:, 1]), {'AAA', 'BBB', 'CCC'}) + self.assertEqual(set(results[1]['learningData'].iloc[:, 2]), {'aaa', 'bbb', 'ddd', 'eee'}) + self.assertEqual(set(results[1]['learningData'].iloc[:, 3]), {'1990', '2000'}) + + def test_produce_score(self): + dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'tests', 'data', 'datasets', 'database_dataset_1', 'datasetDoc.json')) + + dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path)) + + # We set semantic types like runtime would. + dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 4), 'https://metadata.datadrivendiscovery.org/types/Target') + dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 4), 'https://metadata.datadrivendiscovery.org/types/TrueTarget') + dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 4), 'https://metadata.datadrivendiscovery.org/types/Attribute') + + # We fake that the dataset is time-series. 
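+ # Column 3 holds the year values ('1990'/'2000'/'2010'), so tagging it with the Time semantic type lets the primitive treat it as the time axis.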
+ dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 3), 'https://metadata.datadrivendiscovery.org/types/Time') + + hyperparams_class = kfold_split_timeseries.KFoldTimeSeriesSplitPrimitive.metadata.get_hyperparams() + + folds = 5 + primitive = kfold_split_timeseries.KFoldTimeSeriesSplitPrimitive(hyperparams=hyperparams_class.defaults().replace({ + 'number_of_folds': folds, + 'number_of_window_folds': 1, + })) + + primitive.set_training_data(dataset=dataset) + primitive.fit() + + results = primitive.produce_score_data(inputs=container.List([0, 1], generate_metadata=True)).value + + self.assertEqual(len(results), 2) + + for dataset in results: + self.assertEqual(len(dataset), 4) + + self.assertEqual(results[0]['codes'].shape[0], 3) + self.assertEqual(results[1]['codes'].shape[0], 3) + + self.assertEqual(set(results[0]['codes'].iloc[:, 0]), {'AAA', 'BBB', 'CCC'}) + self.assertEqual(set(results[0]['learningData'].iloc[:, 0]), {'2', '3', '32', '33', '37', '38', '39'}) + self.assertEqual(set(results[0]['learningData'].iloc[:, 1]), {'AAA', 'BBB', 'CCC'}) + self.assertEqual(set(results[0]['learningData'].iloc[:, 2]), {'aaa', 'ddd', 'eee'}) + self.assertEqual(set(results[0]['learningData'].iloc[:, 3]), {'1990', '2000'}) + + self.assertEqual(set(results[1]['codes'].iloc[:, 0]), {'AAA', 'BBB', 'CCC'}) + self.assertEqual(set(results[1]['learningData'].iloc[:, 0]), {'22', '23', '24', '31', '40', '41', '42'}) + self.assertEqual(set(results[1]['learningData'].iloc[:, 1]), {'AAA', 'BBB', 'CCC'}) + self.assertEqual(set(results[1]['learningData'].iloc[:, 2]), {'ccc', 'ddd', 'eee'}) + self.assertEqual(set(results[1]['learningData'].iloc[:, 3]), {'2000'}) + + def test_unsorted_datetimes_timeseries_4(self): + dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'tests', 'data', 'datasets', 'timeseries_dataset_4', 'datasetDoc.json')) + + dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path)) + + # We set semantic types like runtime would. + dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 4), 'https://metadata.datadrivendiscovery.org/types/Target') + dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 4), 'https://metadata.datadrivendiscovery.org/types/TrueTarget') + dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 4), 'https://metadata.datadrivendiscovery.org/types/Attribute') + + hyperparams_class = kfold_split_timeseries.KFoldTimeSeriesSplitPrimitive.metadata.get_hyperparams() + + folds = 5 + primitive = kfold_split_timeseries.KFoldTimeSeriesSplitPrimitive(hyperparams=hyperparams_class.defaults().replace({ + 'number_of_folds': folds, + 'number_of_window_folds': 1, + })) + + primitive.set_training_data(dataset=dataset) + primitive.fit() + + # To test that pickling works. 
+ pickle.dumps(primitive) + + results = primitive.produce(inputs=container.List([0, 1], generate_metadata=True)).value + + self.assertEqual(len(results), 2) + + for dataset in results: + self.assertEqual(len(dataset), 1) + + self.assertEqual(len(results[0]['learningData'].iloc[:, 0]), 8) + self.assertEqual(set(results[0]['learningData'].iloc[:, 3]), {'2013-11-05', '2013-11-06', '2013-11-07', '2013-11-08', '2013-11-11', + '2013-11-12', '2013-11-13', '2013-11-14'}) + + self.assertEqual(len(results[1]['learningData'].iloc[:, 0]), 8) + self.assertEqual(set(results[1]['learningData'].iloc[:, 3]), {'2013-11-13', '2013-11-14', '2013-11-15', '2013-11-18', '2013-11-19', + '2013-11-20', '2013-11-21', '2013-11-22'}) + + +if __name__ == '__main__': + unittest.main() diff --git a/tods/common-primitives/tests/test_lgbm_classifier.py b/tods/common-primitives/tests/test_lgbm_classifier.py new file mode 100644 index 0000000..90d7d43 --- /dev/null +++ b/tods/common-primitives/tests/test_lgbm_classifier.py @@ -0,0 +1,571 @@ +import os +import pickle +import unittest + +import numpy as np + +from d3m import container, utils +from d3m.metadata import base as metadata_base + +from common_primitives import dataset_to_dataframe, extract_columns_semantic_types, lgbm_classifier, column_parser + + +def _add_categorical_col(attributes): + rand_str = ['a', 'b', 'c', 'd', 'e'] + attributes = attributes.append_columns(container.DataFrame(data={ + 'mock_cat_col': np.random.choice(rand_str, attributes.shape[0]) + }, generate_metadata=True)) + attributes.metadata = attributes.metadata.add_semantic_type([metadata_base.ALL_ELEMENTS, attributes.shape[-1] - 1], + 'https://metadata.datadrivendiscovery.org/types/CategoricalData') + attributes.metadata = attributes.metadata.add_semantic_type([metadata_base.ALL_ELEMENTS, attributes.shape[-1] - 1], + 'https://metadata.datadrivendiscovery.org/types/Attribute') + return attributes + + +def _get_iris(): + dataset_doc_path = os.path.abspath( + os.path.join(os.path.dirname(__file__), 'data', 'datasets', 'iris_dataset_1', 'datasetDoc.json')) + + dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path)) + + hyperparams_class = \ + dataset_to_dataframe.DatasetToDataFramePrimitive.metadata.query()['primitive_code']['class_type_arguments'][ + 'Hyperparams'] + primitive = dataset_to_dataframe.DatasetToDataFramePrimitive(hyperparams=hyperparams_class.defaults()) + + dataframe = primitive.produce(inputs=dataset).value + return dataframe + + +def _get_iris_columns(): + dataframe = _get_iris() + + # We set custom metadata on columns. + for column_index in range(1, 5): + dataframe.metadata = dataframe.metadata.update_column(column_index, {'custom_metadata': 'attributes'}) + for column_index in range(5, 6): + dataframe.metadata = dataframe.metadata.update_column(column_index, {'custom_metadata': 'targets'}) + + # We set semantic types like runtime would. + dataframe.metadata = dataframe.metadata.add_semantic_type((metadata_base.ALL_ELEMENTS, 5), + 'https://metadata.datadrivendiscovery.org/types/Target') + dataframe.metadata = dataframe.metadata.add_semantic_type((metadata_base.ALL_ELEMENTS, 5), + 'https://metadata.datadrivendiscovery.org/types/TrueTarget') + dataframe.metadata = dataframe.metadata.remove_semantic_type((metadata_base.ALL_ELEMENTS, 5), + 'https://metadata.datadrivendiscovery.org/types/Attribute') + dataframe = _add_categorical_col(dataframe) + + # Parsing. 
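+ # ColumnParserPrimitive converts the string columns to their declared types (Float attributes to float, CategoricalData columns to int), as also exercised by test_extract_columns_structural_types above.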
+ hyperparams_class = \ + column_parser.ColumnParserPrimitive.metadata.query()['primitive_code']['class_type_arguments'][ + 'Hyperparams'] + primitive = column_parser.ColumnParserPrimitive(hyperparams=hyperparams_class.defaults()) + dataframe = primitive.produce(inputs=dataframe).value + + hyperparams_class = \ + extract_columns_semantic_types.ExtractColumnsBySemanticTypesPrimitive.metadata.query()['primitive_code'][ + 'class_type_arguments']['Hyperparams'] + + primitive = extract_columns_semantic_types.ExtractColumnsBySemanticTypesPrimitive( + hyperparams=hyperparams_class.defaults().replace( + {'semantic_types': ('https://metadata.datadrivendiscovery.org/types/Attribute',)})) + attributes = primitive.produce(inputs=dataframe).value + + primitive = extract_columns_semantic_types.ExtractColumnsBySemanticTypesPrimitive( + hyperparams=hyperparams_class.defaults().replace( + {'semantic_types': ('https://metadata.datadrivendiscovery.org/types/SuggestedTarget',)})) + targets = primitive.produce(inputs=dataframe).value + + return dataframe, attributes, targets + + +class LGBMTestCase(unittest.TestCase): + attributes: container.DataFrame = None + targets: container.DataFrame = None + dataframe: container.DataFrame = None + + @classmethod + def setUpClass(cls) -> None: + cls.dataframe, cls.attributes, cls.targets = _get_iris_columns() + cls.excp_attributes = cls.attributes.copy() + + def test_single_target(self): + self.assertEqual(list(self.targets.columns), ['species']) + + hyperparams_class = \ + lgbm_classifier.LightGBMClassifierPrimitive.metadata.query()['primitive_code']['class_type_arguments'][ + 'Hyperparams'] + primitive = lgbm_classifier.LightGBMClassifierPrimitive( + hyperparams=hyperparams_class.defaults().replace({'return_result': 'new', 'add_index_columns': False})) + + primitive.set_training_data(inputs=self.attributes, outputs=self.targets) + primitive.fit() + + predictions = primitive.produce(inputs=self.attributes).value + + self.assertEqual(list(predictions.columns), ['species']) + + self.assertEqual(predictions.shape, (150, 1)) + self.assertEqual(predictions.iloc[0, 0], 'Iris-setosa') + self.assertTrue(predictions.metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 0), + 'https://metadata.datadrivendiscovery.org/types/PredictedTarget')) + self.assertFalse(predictions.metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 0), + 'https://metadata.datadrivendiscovery.org/types/TrueTarget')) + self.assertEqual(predictions.metadata.query_column(0)['name'], 'species') + self.assertEqual(predictions.metadata.query_column(0)['custom_metadata'], 'targets') + + self._test_single_target_metadata(predictions.metadata) + + samples = primitive.sample(inputs=self.attributes).value + + self.assertEqual(list(samples[0].columns), ['species']) + + self.assertEqual(len(samples), 1) + self.assertEqual(samples[0].shape, (150, 1)) + self.assertEqual(samples[0].iloc[0, 0], 'Iris-setosa') + self.assertTrue(samples[0].metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 0), + 'https://metadata.datadrivendiscovery.org/types/PredictedTarget')) + self.assertFalse(samples[0].metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 0), + 'https://metadata.datadrivendiscovery.org/types/TrueTarget')) + self.assertEqual(samples[0].metadata.query_column(0)['name'], 'species') + self.assertEqual(samples[0].metadata.query_column(0)['custom_metadata'], 'targets') + + log_likelihoods = primitive.log_likelihoods(inputs=self.attributes, outputs=self.targets).value + + 
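+ # log_likelihoods returns one value per row for the supplied targets; log_likelihood below collapses them into a single (1, 1) summary value.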
self.assertEqual(list(log_likelihoods.columns), ['species']) + + self.assertEqual(log_likelihoods.shape, (150, 1)) + self.assertEqual(log_likelihoods.metadata.query_column(0)['name'], 'species') + + log_likelihood = primitive.log_likelihood(inputs=self.attributes, outputs=self.targets).value + + self.assertEqual(list(log_likelihood.columns), ['species']) + + self.assertEqual(log_likelihood.shape, (1, 1)) + self.assertAlmostEqual(log_likelihood.iloc[0, 0], -6.338635478886032) + self.assertEqual(log_likelihood.metadata.query_column(0)['name'], 'species') + + def test_single_target_continue_fit(self): + hyperparams_class = \ + lgbm_classifier.LightGBMClassifierPrimitive.metadata.query()['primitive_code']['class_type_arguments'][ + 'Hyperparams'] + primitive = lgbm_classifier.LightGBMClassifierPrimitive( + hyperparams=hyperparams_class.defaults().replace({'return_result': 'new', 'add_index_columns': False})) + + primitive.set_training_data(inputs=self.attributes, outputs=self.targets) + primitive.fit() + # reset the training data to make continue_fit() work. + primitive.set_training_data(inputs=self.attributes, outputs=self.targets) + primitive.continue_fit() + params = primitive.get_params() + self.assertEqual(params['booster'].current_iteration(), + primitive.hyperparams['n_estimators'] + primitive.hyperparams['n_more_estimators']) + predictions = primitive.produce(inputs=self.attributes).value + + self.assertEqual(predictions.shape, (150, 1)) + self.assertEqual(predictions.iloc[0, 0], 'Iris-setosa') + self.assertTrue(predictions.metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 0), + 'https://metadata.datadrivendiscovery.org/types/PredictedTarget')) + self.assertFalse(predictions.metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 0), + 'https://metadata.datadrivendiscovery.org/types/TrueTarget')) + self.assertEqual(predictions.metadata.query_column(0)['name'], 'species') + self.assertEqual(predictions.metadata.query_column(0)['custom_metadata'], 'targets') + + self._test_single_target_metadata(predictions.metadata) + + samples = primitive.sample(inputs=self.attributes).value + + self.assertEqual(len(samples), 1) + self.assertEqual(samples[0].shape, (150, 1)) + self.assertEqual(samples[0].iloc[0, 0], 'Iris-setosa') + self.assertTrue(samples[0].metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 0), + 'https://metadata.datadrivendiscovery.org/types/PredictedTarget')) + self.assertFalse(samples[0].metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 0), + 'https://metadata.datadrivendiscovery.org/types/TrueTarget')) + self.assertEqual(samples[0].metadata.query_column(0)['name'], 'species') + self.assertEqual(samples[0].metadata.query_column(0)['custom_metadata'], 'targets') + + log_likelihoods = primitive.log_likelihoods(inputs=self.attributes, outputs=self.targets).value + + self.assertEqual(log_likelihoods.shape, (150, 1)) + self.assertEqual(log_likelihoods.metadata.query_column(0)['name'], 'species') + + log_likelihood = primitive.log_likelihood(inputs=self.attributes, outputs=self.targets).value + + self.assertEqual(log_likelihood.shape, (1, 1)) + self.assertAlmostEqual(log_likelihood.iloc[0, 0], -3.723258225143776) + self.assertEqual(log_likelihoods.metadata.query_column(0)['name'], 'species') + + def _test_single_target_metadata(self, predictions_metadata): + expected_metadata = [{ + 'selector': [], + 'metadata': { + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': 
['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 150, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 1, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': { + 'structural_type': 'str', + 'name': 'species', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/CategoricalData', + 'https://metadata.datadrivendiscovery.org/types/SuggestedTarget', + 'https://metadata.datadrivendiscovery.org/types/Target', + 'https://metadata.datadrivendiscovery.org/types/PredictedTarget'], + 'custom_metadata': 'targets', + }, + }] + + self.assertEqual(utils.to_json_structure(predictions_metadata.to_internal_simple_structure()), expected_metadata) + + def test_semantic_types(self): + # dataframe, attributes, targets = self._get_iris_columns() + + hyperparams_class = \ + lgbm_classifier.LightGBMClassifierPrimitive.metadata.query()['primitive_code']['class_type_arguments'][ + 'Hyperparams'] + primitive = lgbm_classifier.LightGBMClassifierPrimitive( + hyperparams=hyperparams_class.defaults().replace({'return_result': 'new', 'add_index_columns': False})) + + primitive.set_training_data(inputs=self.dataframe, outputs=self.dataframe) + primitive.fit() + + predictions = primitive.produce(inputs=self.dataframe).value + + self.assertEqual(list(predictions.columns), ['species']) + + self.assertEqual(predictions.shape, (150, 1)) + self.assertEqual(predictions.iloc[0, 0], 'Iris-setosa') + self.assertTrue(predictions.metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 0), + 'https://metadata.datadrivendiscovery.org/types/PredictedTarget')) + self.assertFalse(predictions.metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 0), + 'https://metadata.datadrivendiscovery.org/types/TrueTarget')) + self.assertEqual(predictions.metadata.query_column(0)['name'], 'species') + self.assertEqual(predictions.metadata.query_column(0)['custom_metadata'], 'targets') + + samples = primitive.sample(inputs=self.dataframe).value + self.assertEqual(list(samples[0].columns), ['species']) + + self.assertEqual(len(samples), 1) + self.assertEqual(samples[0].shape, (150, 1)) + self.assertEqual(samples[0].iloc[0, 0], 'Iris-setosa') + self.assertTrue(samples[0].metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 0), + 'https://metadata.datadrivendiscovery.org/types/PredictedTarget')) + self.assertFalse(samples[0].metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 0), + 'https://metadata.datadrivendiscovery.org/types/TrueTarget')) + self.assertEqual(samples[0].metadata.query_column(0)['name'], 'species') + self.assertEqual(samples[0].metadata.query_column(0)['custom_metadata'], 'targets') + + log_likelihoods = primitive.log_likelihoods(inputs=self.dataframe, outputs=self.dataframe).value + self.assertEqual(list(log_likelihoods.columns), ['species']) + + self.assertEqual(log_likelihoods.shape, (150, 1)) + self.assertEqual(log_likelihoods.metadata.query_column(0)['name'], 'species') + + log_likelihood = primitive.log_likelihood(inputs=self.dataframe, outputs=self.dataframe).value + self.assertEqual(list(log_likelihood.columns), ['species']) + + self.assertEqual(log_likelihood.shape, (1, 1)) + self.assertAlmostEqual(log_likelihood.iloc[0, 0], -6.338635478886032) + 
self.assertEqual(log_likelihoods.metadata.query_column(0)['name'], 'species') + + feature_importances = primitive.produce_feature_importances().value + self.assertEqual(list(feature_importances), + ['sepalLength', 'sepalWidth', 'petalLength', 'petalWidth', 'mock_cat_col']) + self.assertEqual(feature_importances.metadata.query_column(0)['name'], 'sepalLength') + self.assertEqual(feature_importances.metadata.query_column(1)['name'], 'sepalWidth') + self.assertEqual(feature_importances.metadata.query_column(2)['name'], 'petalLength') + self.assertEqual(feature_importances.metadata.query_column(3)['name'], 'petalWidth') + + self.assertEqual(feature_importances.values.tolist(), + [[0.22740524781341107, 0.18513119533527697, 0.3323615160349854, 0.25510204081632654, 0.0]]) + + def test_return_append(self): + hyperparams_class = \ + lgbm_classifier.LightGBMClassifierPrimitive.metadata.query()['primitive_code']['class_type_arguments'][ + 'Hyperparams'] + primitive = lgbm_classifier.LightGBMClassifierPrimitive(hyperparams=hyperparams_class.defaults()) + + primitive.set_training_data(inputs=self.dataframe, outputs=self.dataframe) + primitive.fit() + + predictions = primitive.produce(inputs=self.dataframe).value + self.assertEqual(list(predictions.columns), [ + 'd3mIndex', + 'sepalLength', + 'sepalWidth', + 'petalLength', + 'petalWidth', + 'species', + 'mock_cat_col', + 'species', + ]) + self.assertEqual(predictions.shape, (150, 8)) + self.assertEqual(predictions.iloc[0, 7], 'Iris-setosa') + self.assertTrue(predictions.metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 7), + 'https://metadata.datadrivendiscovery.org/types/PredictedTarget')) + self.assertFalse(predictions.metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 7), + 'https://metadata.datadrivendiscovery.org/types/TrueTarget')) + self.assertEqual(predictions.metadata.query_column(7)['name'], 'species') + self.assertEqual(predictions.metadata.query_column(7)['custom_metadata'], 'targets') + + self._test_return_append_metadata(predictions.metadata) + + def _test_return_append_metadata(self, predictions_metadata): + self.assertEqual(utils.to_json_structure(predictions_metadata.to_internal_simple_structure()), [{ + 'metadata': {'dimension': {'length': 150, + 'name': 'rows', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/TabularRow']}, + 'schema': 'https://metadata.datadrivendiscovery.org/schemas/v0/container.json', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'structural_type': 'd3m.container.pandas.DataFrame'}, + 'selector': []}, + {'metadata': {'dimension': {'length': 8, + 'name': 'columns', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/TabularColumn']}}, + 'selector': ['__ALL_ELEMENTS__']}, + {'metadata': {'name': 'd3mIndex', + 'semantic_types': ['http://schema.org/Integer', + 'https://metadata.datadrivendiscovery.org/types/PrimaryKey'], + 'structural_type': 'int'}, + 'selector': ['__ALL_ELEMENTS__', 0]}, + {'metadata': {'custom_metadata': 'attributes', + 'name': 'sepalLength', + 'semantic_types': ['http://schema.org/Float', + 'https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'float'}, + 'selector': ['__ALL_ELEMENTS__', 1]}, + {'metadata': {'custom_metadata': 'attributes', + 'name': 'sepalWidth', + 'semantic_types': ['http://schema.org/Float', + 'https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'float'}, + 'selector': ['__ALL_ELEMENTS__', 2]}, + {'metadata': {'custom_metadata': 
'attributes', + 'name': 'petalLength', + 'semantic_types': ['http://schema.org/Float', + 'https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'float'}, + 'selector': ['__ALL_ELEMENTS__', 3]}, + {'metadata': {'custom_metadata': 'attributes', + 'name': 'petalWidth', + 'semantic_types': ['http://schema.org/Float', + 'https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'float'}, + 'selector': ['__ALL_ELEMENTS__', 4]}, + {'metadata': {'custom_metadata': 'targets', + 'name': 'species', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/CategoricalData', + 'https://metadata.datadrivendiscovery.org/types/SuggestedTarget', + 'https://metadata.datadrivendiscovery.org/types/Target', + 'https://metadata.datadrivendiscovery.org/types/TrueTarget'], + 'structural_type': 'str'}, + 'selector': ['__ALL_ELEMENTS__', 5]}, + {'metadata': {'name': 'mock_cat_col', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/CategoricalData', + 'https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'int'}, + 'selector': ['__ALL_ELEMENTS__', 6]}, + {'metadata': {'custom_metadata': 'targets', + 'name': 'species', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/CategoricalData', + 'https://metadata.datadrivendiscovery.org/types/SuggestedTarget', + 'https://metadata.datadrivendiscovery.org/types/Target', + 'https://metadata.datadrivendiscovery.org/types/PredictedTarget'], + 'structural_type': 'str'}, + 'selector': ['__ALL_ELEMENTS__', 7]}] + ) + + def test_return_new(self): + hyperparams_class = \ + lgbm_classifier.LightGBMClassifierPrimitive.metadata.query()['primitive_code']['class_type_arguments'][ + 'Hyperparams'] + primitive = lgbm_classifier.LightGBMClassifierPrimitive( + hyperparams=hyperparams_class.defaults().replace({'return_result': 'new'})) + + primitive.set_training_data(inputs=self.dataframe, outputs=self.dataframe) + primitive.fit() + + predictions = primitive.produce(inputs=self.dataframe).value + + self.assertEqual(list(predictions.columns), [ + 'd3mIndex', + 'species', + ]) + + self.assertEqual(predictions.shape, (150, 2)) + self.assertEqual(predictions.iloc[0, 1], 'Iris-setosa') + self.assertTrue(predictions.metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 1), + 'https://metadata.datadrivendiscovery.org/types/PredictedTarget')) + self.assertFalse(predictions.metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 1), + 'https://metadata.datadrivendiscovery.org/types/TrueTarget')) + self.assertEqual(predictions.metadata.query_column(1)['name'], 'species') + self.assertEqual(predictions.metadata.query_column(1)['custom_metadata'], 'targets') + + self._test_return_new_metadata(predictions.metadata) + + def _test_return_new_metadata(self, predictions_metadata): + expected_metadata = [{ + 'selector': [], + 'metadata': { + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 150, + }, + 'schema': 'https://metadata.datadrivendiscovery.org/schemas/v0/container.json', + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 2, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': { + 'name': 
'd3mIndex', + 'structural_type': 'int', + 'semantic_types': ['http://schema.org/Integer', + 'https://metadata.datadrivendiscovery.org/types/PrimaryKey'], + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': { + 'structural_type': 'str', + 'name': 'species', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/CategoricalData', + 'https://metadata.datadrivendiscovery.org/types/SuggestedTarget', + 'https://metadata.datadrivendiscovery.org/types/Target', + 'https://metadata.datadrivendiscovery.org/types/PredictedTarget'], + 'custom_metadata': 'targets', + }, + }] + + self.assertEqual(utils.to_json_structure(predictions_metadata.to_internal_simple_structure()), expected_metadata) + + def test_return_replace(self): + hyperparams_class = \ + lgbm_classifier.LightGBMClassifierPrimitive.metadata.query()['primitive_code']['class_type_arguments'][ + 'Hyperparams'] + primitive = lgbm_classifier.LightGBMClassifierPrimitive( + hyperparams=hyperparams_class.defaults().replace({'return_result': 'replace'})) + + primitive.set_training_data(inputs=self.dataframe, outputs=self.dataframe) + primitive.fit() + + predictions = primitive.produce(inputs=self.dataframe).value + self.assertEqual(list(predictions.columns), [ + 'd3mIndex', + 'species', + 'species', + ]) + self.assertEqual(predictions.shape, (150, 3)) + self.assertEqual(predictions.iloc[0, 1], 'Iris-setosa') + self.assertTrue(predictions.metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 1), + 'https://metadata.datadrivendiscovery.org/types/PredictedTarget')) + self.assertFalse(predictions.metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 1), + 'https://metadata.datadrivendiscovery.org/types/TrueTarget')) + self.assertEqual(predictions.metadata.query_column(1)['name'], 'species') + self.assertEqual(predictions.metadata.query_column(1)['custom_metadata'], 'targets') + + self._test_return_replace_metadata(predictions.metadata) + + def test_pickle_unpickle(self): + hyperparams_class = \ + lgbm_classifier.LightGBMClassifierPrimitive.metadata.query()['primitive_code']['class_type_arguments'][ + 'Hyperparams'] + primitive = lgbm_classifier.LightGBMClassifierPrimitive( + hyperparams=hyperparams_class.defaults().replace({'return_result': 'new', 'add_index_columns': False})) + + primitive.set_training_data(inputs=self.attributes, outputs=self.targets) + primitive.fit() + + before_pickled_prediction = primitive.produce(inputs=self.attributes).value + pickle_object = pickle.dumps(primitive) + primitive = pickle.loads(pickle_object) + after_unpickled_prediction = primitive.produce(inputs=self.attributes).value + self.assertTrue(container.DataFrame.equals(before_pickled_prediction, after_unpickled_prediction)) + + def _test_return_replace_metadata(self, predictions_metadata): + self.assertEqual(utils.to_json_structure(predictions_metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 150, + }, + 'schema': 'https://metadata.datadrivendiscovery.org/schemas/v0/container.json', + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': { 
+ 'name': 'd3mIndex', + 'structural_type': 'int', + 'semantic_types': ['http://schema.org/Integer', + 'https://metadata.datadrivendiscovery.org/types/PrimaryKey'], + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': { + 'structural_type': 'str', + 'name': 'species', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/CategoricalData', + 'https://metadata.datadrivendiscovery.org/types/SuggestedTarget', + 'https://metadata.datadrivendiscovery.org/types/Target', + 'https://metadata.datadrivendiscovery.org/types/PredictedTarget'], + 'custom_metadata': 'targets', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': { + 'name': 'species', + 'structural_type': 'str', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/CategoricalData', + 'https://metadata.datadrivendiscovery.org/types/SuggestedTarget', + 'https://metadata.datadrivendiscovery.org/types/Target', + 'https://metadata.datadrivendiscovery.org/types/TrueTarget'], + 'custom_metadata': 'targets', + }, + }]) + + +if __name__ == '__main__': + unittest.main() diff --git a/tods/common-primitives/tests/test_list_to_dataframe.py b/tods/common-primitives/tests/test_list_to_dataframe.py new file mode 100644 index 0000000..0860981 --- /dev/null +++ b/tods/common-primitives/tests/test_list_to_dataframe.py @@ -0,0 +1,185 @@ +import unittest + +import numpy + +from d3m import container, utils +from d3m.metadata import base as metadata_base + +from common_primitives import list_to_dataframe + + +class ListToDataFramePrimitiveTestCase(unittest.TestCase): + def test_basic(self): + data = container.List([container.List([1, 2, 3]), container.List([4, 5, 6])], generate_metadata=True) + + list_hyperparams_class = list_to_dataframe.ListToDataFramePrimitive.metadata.get_hyperparams() + list_primitive = list_to_dataframe.ListToDataFramePrimitive(hyperparams=list_hyperparams_class.defaults()) + dataframe = list_primitive.produce(inputs=data).value + + self._test_basic_metadata(dataframe.metadata, 'numpy.int64', True) + + def _test_basic_metadata(self, metadata, structural_type, add_individual_columns): + expected_metadata = [{ + 'selector': [], + 'metadata': { + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'dimension': { + 'length': 2, + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + }, + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'length': 3, + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + }, + 'structural_type': '__NO_VALUE__', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__'], + 'metadata': { + 'structural_type': 'int', + }, + }] + + if add_individual_columns: + expected_metadata.extend([{ + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': { + 'structural_type': structural_type, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': { + 'structural_type': structural_type, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': { + 'structural_type': structural_type, + }, + }]) + + self.assertEqual(utils.to_json_structure(metadata.to_internal_simple_structure()), expected_metadata) + + def test_just_list(self): + data = container.List([1, 2, 3], generate_metadata=True) + + list_hyperparams_class = list_to_dataframe.ListToDataFramePrimitive.metadata.get_hyperparams() + 
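+        # metadata.get_hyperparams() is the short form of the
+        # metadata.query()['primitive_code']['class_type_arguments']['Hyperparams'] lookup used in the
+        # LightGBM tests above; defaults() then gives a ready-to-use hyper-parameter configuration.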
list_primitive = list_to_dataframe.ListToDataFramePrimitive(hyperparams=list_hyperparams_class.defaults()) + dataframe = list_primitive.produce(inputs=data).value + + self._test_just_list_metadata(dataframe.metadata, 'numpy.int64', True) + + def _test_just_list_metadata(self, metadata, structural_type, use_individual_columns): + expected_metadata = [{ + 'selector': [], + 'metadata': { + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'dimension': { + 'length': 3, + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + }, + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'length': 1, + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + }, + 'structural_type': '__NO_VALUE__', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__'], + 'metadata': { + 'structural_type': structural_type, + }, + }] + + if use_individual_columns: + expected_metadata[-1]['selector'] = ['__ALL_ELEMENTS__', 0] + + self.assertEqual(utils.to_json_structure(metadata.to_internal_simple_structure()), expected_metadata) + + def test_list_ndarray(self): + data = container.List([container.ndarray(numpy.array([1, 2, 3])), container.ndarray(numpy.array([4, 5, 6]))], generate_metadata=True) + + list_hyperparams_class = list_to_dataframe.ListToDataFramePrimitive.metadata.get_hyperparams() + list_primitive = list_to_dataframe.ListToDataFramePrimitive(hyperparams=list_hyperparams_class.defaults()) + dataframe = list_primitive.produce(inputs=data).value + + self._test_list_ndarray_metadata(dataframe.metadata, True) + + def _test_list_ndarray_metadata(self, metadata, add_individual_columns): + expected_metadata = [{ + 'selector': [], + 'metadata': { + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'dimension': { + 'length': 2, + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + }, + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'length': 3, + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + }, + 'structural_type': '__NO_VALUE__', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__'], + 'metadata': { + 'structural_type': 'numpy.int64', + }, + }] + + if add_individual_columns: + expected_metadata.extend([{ + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': { + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': { + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': { + 'structural_type': 'numpy.int64', + }, + }]) + + self.assertEqual(utils.to_json_structure(metadata.to_internal_simple_structure()), expected_metadata) + + def test_list_deeper_ndarray(self): + data = container.List([container.ndarray(numpy.array([[1, 2, 3], [11, 12, 13]])), container.ndarray(numpy.array([[4, 5, 6], [14, 15, 16]]))], generate_metadata=True) + + list_hyperparams_class = list_to_dataframe.ListToDataFramePrimitive.metadata.get_hyperparams() + list_primitive = list_to_dataframe.ListToDataFramePrimitive(hyperparams=list_hyperparams_class.defaults()) + + with self.assertRaisesRegex(ValueError, 
'Must pass 2-d input'): + list_primitive.produce(inputs=data).value + + +if __name__ == '__main__': + unittest.main() diff --git a/tods/common-primitives/tests/test_list_to_ndarray.py b/tods/common-primitives/tests/test_list_to_ndarray.py new file mode 100644 index 0000000..07d6d23 --- /dev/null +++ b/tods/common-primitives/tests/test_list_to_ndarray.py @@ -0,0 +1,132 @@ +import unittest + +import numpy + +from d3m import container, utils +from d3m.metadata import base as metadata_base + +from common_primitives import list_to_ndarray + + +class ListToNDRrrayPrimitiveTestCase(unittest.TestCase): + def test_basic(self): + data = container.List([container.List([1, 2, 3]), container.List([4, 5, 6])], generate_metadata=True) + + list_hyperparams_class = list_to_ndarray.ListToNDArrayPrimitive.metadata.get_hyperparams() + list_primitive = list_to_ndarray.ListToNDArrayPrimitive(hyperparams=list_hyperparams_class.defaults()) + array = list_primitive.produce(inputs=data).value + + self._test_basic_metadata(array.metadata, 'numpy.int64') + + def _test_basic_metadata(self, metadata, structural_type): + self.maxDiff = None + + self.assertEqual(utils.to_json_structure(metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.numpy.ndarray', + 'dimension': { + 'length': 2, + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + }, + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'length': 3, + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + }, + 'structural_type': '__NO_VALUE__', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__'], + 'metadata': { + 'structural_type': structural_type, + }, + }]) + + def test_just_list(self): + data = container.List([1, 2, 3], generate_metadata=True) + + list_hyperparams_class = list_to_ndarray.ListToNDArrayPrimitive.metadata.get_hyperparams() + list_primitive = list_to_ndarray.ListToNDArrayPrimitive(hyperparams=list_hyperparams_class.defaults()) + array = list_primitive.produce(inputs=data).value + + self._test_just_list_metadata(array.metadata, 'numpy.int64') + + def _test_just_list_metadata(self, metadata, structural_type): + self.assertEqual(utils.to_json_structure(metadata.to_internal_simple_structure()),[{ + 'selector': [], + 'metadata': { + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.numpy.ndarray', + 'dimension': { + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'structural_type': structural_type, + }, + }]) + + def test_list_ndarray(self): + data = container.List([container.ndarray(numpy.array([[1, 2, 3], [11, 12, 13]])), container.ndarray(numpy.array([[4, 5, 6], [14, 15, 16]]))], generate_metadata=True) + + list_hyperparams_class = list_to_ndarray.ListToNDArrayPrimitive.metadata.get_hyperparams() + list_primitive = list_to_ndarray.ListToNDArrayPrimitive(hyperparams=list_hyperparams_class.defaults()) + array = list_primitive.produce(inputs=data).value + + self._test_list_ndarray_metadata(array.metadata) + + def _test_list_ndarray_metadata(self, metadata): + self.maxDiff = None + + self.assertEqual(utils.to_json_structure(metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, 
+ 'structural_type': 'd3m.container.numpy.ndarray', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'length': 2, + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'semantic_types': '__NO_VALUE__', + 'dimension': { + 'length': 2, + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + }, + 'structural_type': '__NO_VALUE__', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'length': 3, + 'semantic_types': '__NO_VALUE__', + 'name': '__NO_VALUE__', + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__', '__ALL_ELEMENTS__'], + 'metadata': { + 'structural_type': 'numpy.int64', + }, + }]) + + +if __name__ == '__main__': + unittest.main() diff --git a/tods/common-primitives/tests/test_ndarray_to_dataframe.py b/tods/common-primitives/tests/test_ndarray_to_dataframe.py new file mode 100644 index 0000000..d2987e2 --- /dev/null +++ b/tods/common-primitives/tests/test_ndarray_to_dataframe.py @@ -0,0 +1,99 @@ +import unittest + +import numpy + +from d3m import container, utils +from d3m.metadata import base as metadata_base + +from common_primitives import dataframe_to_ndarray, dataset_to_dataframe, ndarray_to_dataframe + +import utils as test_utils + + +class NDArrayToDataFramePrimitiveTestCase(unittest.TestCase): + def test_basic(self): + # TODO: Find a less cumbersome way to get a numpy array loaded with a dataset + # load the iris dataset + dataset = test_utils.load_iris_metadata() + + # convert the dataset into a dataframe + dataset_hyperparams_class = dataset_to_dataframe.DatasetToDataFramePrimitive.metadata.get_hyperparams() + dataset_primitive = dataset_to_dataframe.DatasetToDataFramePrimitive(hyperparams=dataset_hyperparams_class.defaults()) + dataframe_dataset = dataset_primitive.produce(inputs=dataset).value + + # convert the dataframe into a numpy array + numpy_hyperparams_class = dataframe_to_ndarray.DataFrameToNDArrayPrimitive.metadata.get_hyperparams() + numpy_primitive = dataframe_to_ndarray.DataFrameToNDArrayPrimitive(hyperparams=numpy_hyperparams_class.defaults()) + numpy_array = numpy_primitive.produce(inputs=dataframe_dataset).value + + # convert the numpy array back into a dataframe + dataframe_hyperparams_class = ndarray_to_dataframe.NDArrayToDataFramePrimitive.metadata.get_hyperparams() + dataframe_primitive = ndarray_to_dataframe.NDArrayToDataFramePrimitive(hyperparams=dataframe_hyperparams_class.defaults()) + dataframe = dataframe_primitive.produce(inputs=numpy_array).value + + self.assertIsInstance(dataframe, container.DataFrame) + + # verify dimensions + self.assertEqual(len(dataframe), 150) + self.assertEqual(len(dataframe.iloc[0]), 6) + + # ensure column names added to dataframe + self.assertListEqual(list(dataframe.columns.values), ['d3mIndex', 'sepalLength', 'sepalWidth', 'petalLength', 'petalWidth', 'species']) + + # verify data type is unchanged + for row in dataframe: + for cell in row: + self.assertIsInstance(cell, str) + + # validate metadata + test_utils.test_iris_metadata(self, dataframe.metadata, 'd3m.container.pandas.DataFrame') + + def test_vector(self): + data = container.ndarray(numpy.array([1, 2, 3]), generate_metadata=True) + + dataframe_hyperparams_class = ndarray_to_dataframe.NDArrayToDataFramePrimitive.metadata.get_hyperparams() + dataframe_primitive = 
ndarray_to_dataframe.NDArrayToDataFramePrimitive(hyperparams=dataframe_hyperparams_class.defaults()) + dataframe = dataframe_primitive.produce(inputs=data).value + + self._test_vector_metadata(dataframe.metadata, True) + + def _test_vector_metadata(self, metadata, use_individual_columns): + self.maxDiff = None + + expected_metadata = [{ + 'selector': [], + 'metadata': { + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'dimension': { + 'length': 3, + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + }, + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'length': 1, + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + }, + 'structural_type': '__NO_VALUE__', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__'], + 'metadata': { + 'structural_type': 'numpy.int64', + }, + }] + + if use_individual_columns: + expected_metadata[-1]['selector'] = ['__ALL_ELEMENTS__', 0] + + self.assertEqual(utils.to_json_structure(metadata.to_internal_simple_structure()), expected_metadata) + + +if __name__ == '__main__': + unittest.main() diff --git a/tods/common-primitives/tests/test_ndarray_to_list.py b/tods/common-primitives/tests/test_ndarray_to_list.py new file mode 100644 index 0000000..b2c6555 --- /dev/null +++ b/tods/common-primitives/tests/test_ndarray_to_list.py @@ -0,0 +1,116 @@ +import unittest + +import numpy + +from d3m import container, utils +from d3m.metadata import base as metadata_base + +from common_primitives import dataframe_to_ndarray, dataset_to_dataframe, ndarray_to_list + +import utils as test_utils + + +class NDArrayToListPrimitiveTestCase(unittest.TestCase): + def test_basic(self): + # TODO: Find a less cumbersome way to get a numpy array loaded with a dataset + # load the iris dataset + dataset = test_utils.load_iris_metadata() + + # convert the dataset into a dataframe + dataset_hyperparams_class = dataset_to_dataframe.DatasetToDataFramePrimitive.metadata.get_hyperparams() + dataset_primitive = dataset_to_dataframe.DatasetToDataFramePrimitive(hyperparams=dataset_hyperparams_class.defaults()) + dataframe_dataset = dataset_primitive.produce(inputs=dataset).value + + # convert the dataframe into a numpy array + numpy_hyperparams_class = dataframe_to_ndarray.DataFrameToNDArrayPrimitive.metadata.get_hyperparams() + numpy_primitive = dataframe_to_ndarray.DataFrameToNDArrayPrimitive(hyperparams=numpy_hyperparams_class.defaults()) + numpy_array = numpy_primitive.produce(inputs=dataframe_dataset).value + + list_hyperparams_class = ndarray_to_list.NDArrayToListPrimitive.metadata.get_hyperparams() + list_primitive = ndarray_to_list.NDArrayToListPrimitive(hyperparams=list_hyperparams_class.defaults()) + list_value = list_primitive.produce(inputs=numpy_array).value + + self.assertIsInstance(list_value, container.List) + + # verify dimensions + self.assertEqual(len(list_value), 150) + self.assertEqual(len(list_value[0]), 6) + + # validate metadata + test_utils.test_iris_metadata(self, list_value.metadata, 'd3m.container.list.List', 'd3m.container.numpy.ndarray') + + def test_vector(self): + data = container.ndarray(numpy.array([1, 2, 3]), generate_metadata=True) + + list_hyperparams_class = ndarray_to_list.NDArrayToListPrimitive.metadata.get_hyperparams() + list_primitive = 
ndarray_to_list.NDArrayToListPrimitive(hyperparams=list_hyperparams_class.defaults()) + list_value = list_primitive.produce(inputs=data).value + + self._test_vector_metadata(list_value.metadata) + + def _test_vector_metadata(self, metadata): + self.assertEqual(utils.to_json_structure(metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.list.List', + 'dimension': { + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'structural_type': 'numpy.int64', + }, + }]) + + def test_deep_array(self): + data = container.ndarray(numpy.array(range(2 * 3 * 4)).reshape((2, 3, 4)), generate_metadata=True) + + list_hyperparams_class = ndarray_to_list.NDArrayToListPrimitive.metadata.get_hyperparams() + list_primitive = ndarray_to_list.NDArrayToListPrimitive(hyperparams=list_hyperparams_class.defaults()) + list_value = list_primitive.produce(inputs=data).value + + self._test_deep_vector_metadata(list_value.metadata) + + def _test_deep_vector_metadata(self, metadata): + self.assertEqual(utils.to_json_structure(metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.list.List', + 'dimension': { + 'length': 2, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'length': 3, + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + }, + 'structural_type': 'd3m.container.numpy.ndarray', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + }, + }, { + 'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'length': 4, + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__', '__ALL_ELEMENTS__'], + 'metadata': { + 'structural_type': 'numpy.int64', + }, + }]) + + +if __name__ == '__main__': + unittest.main() diff --git a/tods/common-primitives/tests/test_no_split.py b/tods/common-primitives/tests/test_no_split.py new file mode 100644 index 0000000..f61f476 --- /dev/null +++ b/tods/common-primitives/tests/test_no_split.py @@ -0,0 +1,71 @@ +import os +import pickle +import unittest + +from d3m import container +from d3m.metadata import base as metadata_base + +from common_primitives import no_split + + +class NoSplitDatasetSplitPrimitiveTestCase(unittest.TestCase): + def test_produce_train(self): + dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'tests', 'data', 'datasets', 'iris_dataset_1', 'datasetDoc.json')) + + dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path)) + + # We set semantic types like runtime would. 
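+        # In the Iris learningData table column 5 is 'species'; marking it as Target/TrueTarget and
+        # dropping its Attribute type mirrors what the runtime normally does from the problem description.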
+ dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Target') + dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/TrueTarget') + dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Attribute') + + hyperparams_class = no_split.NoSplitDatasetSplitPrimitive.metadata.get_hyperparams() + + primitive = no_split.NoSplitDatasetSplitPrimitive(hyperparams=hyperparams_class.defaults()) + + primitive.set_training_data(dataset=dataset) + primitive.fit() + + # To test that pickling works. + pickle.dumps(primitive) + + results = primitive.produce(inputs=container.List([0], generate_metadata=True)).value + + self.assertEqual(len(results), 1) + + for dataset in results: + self.assertEqual(len(dataset), 1) + + self.assertEqual(results[0]['learningData'].shape[0], 150) + self.assertEqual(list(results[0]['learningData'].iloc[:, 0]), [str(i) for i in range(150)]) + + def test_produce_score(self): + dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'tests', 'data', 'datasets', 'iris_dataset_1', 'datasetDoc.json')) + + dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path)) + + # We set semantic types like runtime would. + dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Target') + dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/TrueTarget') + dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Attribute') + + hyperparams_class = no_split.NoSplitDatasetSplitPrimitive.metadata.get_hyperparams() + + primitive = no_split.NoSplitDatasetSplitPrimitive(hyperparams=hyperparams_class.defaults()) + + primitive.set_training_data(dataset=dataset) + primitive.fit() + + results = primitive.produce_score_data(inputs=container.List([0], generate_metadata=True)).value + + self.assertEqual(len(results), 1) + + for dataset in results: + self.assertEqual(len(dataset), 1) + + self.assertEqual(results[0]['learningData'].shape[0], 150) + self.assertEqual(list(results[0]['learningData'].iloc[:, 0]), [str(i) for i in range(150)]) + + +if __name__ == '__main__': + unittest.main() diff --git a/tods/common-primitives/tests/test_normalize_column_references.py b/tods/common-primitives/tests/test_normalize_column_references.py new file mode 100644 index 0000000..363ecb0 --- /dev/null +++ b/tods/common-primitives/tests/test_normalize_column_references.py @@ -0,0 +1,597 @@ +import os +import unittest + +from d3m import container, utils + +from common_primitives import normalize_column_references + +import utils as test_utils + + +class NormalizeColumnReferencesPrimitiveTestCase(unittest.TestCase): + def test_basic(self): + dataset_doc_path = os.path.abspath( + os.path.join(os.path.dirname(__file__), 'data', 'datasets', 'database_dataset_1', 'datasetDoc.json') + ) + + dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path)) + + metadata_before = dataset.metadata.to_internal_json_structure() + + 
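+        # Snapshot the metadata before running the primitive; the final assertion in this test
+        # checks that produce() did not mutate the input dataset's metadata in place.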
self._test_metadata_before(utils.to_json_structure(dataset.metadata.to_internal_simple_structure()), dataset_doc_path) + + hyperparams_class = normalize_column_references.NormalizeColumnReferencesPrimitive.metadata.get_hyperparams() + + primitive = normalize_column_references.NormalizeColumnReferencesPrimitive( + hyperparams=hyperparams_class.defaults() + ) + + normalized_dataset = primitive.produce(inputs=dataset).value + + self.assertIsInstance(normalized_dataset, container.Dataset) + + self._test_metadata_after(utils.to_json_structure(normalized_dataset.metadata.to_internal_simple_structure()), dataset_doc_path) + + self.assertEqual(metadata_before, dataset.metadata.to_internal_json_structure()) + + def _test_metadata_before(self, metadata, dataset_doc_path): + self.maxDiff = None + + self.assertEqual( + test_utils.convert_through_json(metadata), + [ + { + 'selector': [], + 'metadata': { + 'description': 'A synthetic dataset trying to be similar to a database dump, with tables with different relations between them.', + 'digest': '68c435c6ba9a1c419c79507275c0d5710786dfe481e48f35591d87a7dbf5bb1a', + 'dimension': { + 'length': 4, + 'name': 'resources', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/DatasetResource'], + }, + 'id': 'database_dataset_1', + 'location_uris': [ + 'file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path), + ], + 'name': 'A dataset simulating a database dump', + 'schema': 'https://metadata.datadrivendiscovery.org/schemas/v0/container.json', + 'source': {'license': 'CC', 'redacted': False}, + 'structural_type': 'd3m.container.dataset.Dataset', + 'version': '4.0.0', + }, + }, + { + 'selector': ['authors'], + 'metadata': { + 'dimension': { + 'length': 3, + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + }, + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'structural_type': 'd3m.container.pandas.DataFrame', + }, + }, + { + 'selector': ['authors', '__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'length': 2, + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + } + }, + }, + { + 'selector': ['authors', '__ALL_ELEMENTS__', 0], + 'metadata': { + 'name': 'id', + 'semantic_types': [ + 'http://schema.org/Integer', + 'https://metadata.datadrivendiscovery.org/types/PrimaryKey', + ], + 'structural_type': 'str', + }, + }, + { + 'selector': ['authors', '__ALL_ELEMENTS__', 1], + 'metadata': { + 'name': 'name', + 'semantic_types': [ + 'http://schema.org/Text', + 'https://metadata.datadrivendiscovery.org/types/Attribute', + ], + 'structural_type': 'str', + }, + }, + { + 'selector': ['codes'], + 'metadata': { + 'dimension': { + 'length': 3, + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + }, + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'structural_type': 'd3m.container.pandas.DataFrame', + }, + }, + { + 'selector': ['codes', '__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'length': 3, + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + } + }, + }, + { + 'selector': ['codes', '__ALL_ELEMENTS__', 0], + 'metadata': { + 'name': 'code', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/CategoricalData', + 'https://metadata.datadrivendiscovery.org/types/PrimaryKey', + ], + 'structural_type': 'str', + }, + }, + { + 'selector': ['codes', 
'__ALL_ELEMENTS__', 1], + 'metadata': { + 'name': 'name', + 'semantic_types': [ + 'http://schema.org/Text', + 'https://metadata.datadrivendiscovery.org/types/Attribute', + ], + 'structural_type': 'str', + }, + }, + { + 'selector': ['codes', '__ALL_ELEMENTS__', 2], + 'metadata': { + 'foreign_key': {'column_index': 0, 'resource_id': 'authors', 'type': 'COLUMN'}, + 'name': 'author', + 'semantic_types': [ + 'http://schema.org/Integer', + 'https://metadata.datadrivendiscovery.org/types/Attribute', + ], + 'structural_type': 'str', + }, + }, + { + 'selector': ['learningData'], + 'metadata': { + 'dimension': { + 'length': 45, + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + }, + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/Table', + 'https://metadata.datadrivendiscovery.org/types/DatasetEntryPoint', + ], + 'structural_type': 'd3m.container.pandas.DataFrame', + }, + }, + { + 'selector': ['learningData', '__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'length': 5, + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + } + }, + }, + { + 'selector': ['learningData', '__ALL_ELEMENTS__', 0], + 'metadata': { + 'name': 'd3mIndex', + 'semantic_types': [ + 'http://schema.org/Integer', + 'https://metadata.datadrivendiscovery.org/types/PrimaryKey', + ], + 'structural_type': 'str', + }, + }, + { + 'selector': ['learningData', '__ALL_ELEMENTS__', 1], + 'metadata': { + 'foreign_key': {'column_name': 'code', 'resource_id': 'codes', 'type': 'COLUMN'}, + 'name': 'code', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/CategoricalData', + 'https://metadata.datadrivendiscovery.org/types/Attribute', + ], + 'structural_type': 'str', + }, + }, + { + 'selector': ['learningData', '__ALL_ELEMENTS__', 2], + 'metadata': { + 'name': 'key', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/CategoricalData', + 'https://metadata.datadrivendiscovery.org/types/Attribute', + ], + 'structural_type': 'str', + }, + }, + { + 'selector': ['learningData', '__ALL_ELEMENTS__', 3], + 'metadata': { + 'name': 'year', + 'semantic_types': [ + 'http://schema.org/DateTime', + 'https://metadata.datadrivendiscovery.org/types/Attribute', + ], + 'structural_type': 'str', + }, + }, + { + 'selector': ['learningData', '__ALL_ELEMENTS__', 4], + 'metadata': { + 'name': 'value', + 'semantic_types': [ + 'http://schema.org/Float', + 'https://metadata.datadrivendiscovery.org/types/SuggestedTarget', + 'https://metadata.datadrivendiscovery.org/types/Attribute', + ], + 'structural_type': 'str', + }, + }, + { + 'selector': ['values'], + 'metadata': { + 'dimension': { + 'length': 64, + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + }, + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'structural_type': 'd3m.container.pandas.DataFrame', + }, + }, + { + 'selector': ['values', '__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'length': 4, + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + } + }, + }, + { + 'selector': ['values', '__ALL_ELEMENTS__', 0], + 'metadata': { + 'foreign_key': {'column_name': 'code', 'resource_id': 'codes', 'type': 'COLUMN'}, + 'name': 'code', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/CategoricalData', + 'https://metadata.datadrivendiscovery.org/types/Attribute', + ], + 
'structural_type': 'str', + }, + }, + { + 'selector': ['values', '__ALL_ELEMENTS__', 1], + 'metadata': { + 'name': 'key', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/CategoricalData', + 'https://metadata.datadrivendiscovery.org/types/Attribute', + ], + 'structural_type': 'str', + }, + }, + { + 'selector': ['values', '__ALL_ELEMENTS__', 2], + 'metadata': { + 'name': 'year', + 'semantic_types': [ + 'http://schema.org/DateTime', + 'https://metadata.datadrivendiscovery.org/types/Attribute', + ], + 'structural_type': 'str', + }, + }, + { + 'selector': ['values', '__ALL_ELEMENTS__', 3], + 'metadata': { + 'name': 'value', + 'semantic_types': [ + 'http://schema.org/Float', + 'https://metadata.datadrivendiscovery.org/types/Attribute', + ], + 'structural_type': 'str', + }, + }, + ], + ) + + def _test_metadata_after(self, metadata, dataset_doc_path): + self.maxDiff = None + + self.assertEqual( + test_utils.convert_through_json(metadata), + [ + { + 'selector': [], + 'metadata': { + 'description': 'A synthetic dataset trying to be similar to a database dump, with tables with different relations between them.', + 'digest': '68c435c6ba9a1c419c79507275c0d5710786dfe481e48f35591d87a7dbf5bb1a', + 'dimension': { + 'length': 4, + 'name': 'resources', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/DatasetResource'], + }, + 'id': 'database_dataset_1', + 'location_uris': [ + 'file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path), + ], + 'name': 'A dataset simulating a database dump', + 'schema': 'https://metadata.datadrivendiscovery.org/schemas/v0/container.json', + 'source': {'license': 'CC', 'redacted': False}, + 'structural_type': 'd3m.container.dataset.Dataset', + 'version': '4.0.0', + }, + }, + { + 'selector': ['authors'], + 'metadata': { + 'dimension': { + 'length': 3, + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + }, + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'structural_type': 'd3m.container.pandas.DataFrame', + }, + }, + { + 'selector': ['authors', '__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'length': 2, + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + } + }, + }, + { + 'selector': ['authors', '__ALL_ELEMENTS__', 0], + 'metadata': { + 'name': 'id', + 'semantic_types': [ + 'http://schema.org/Integer', + 'https://metadata.datadrivendiscovery.org/types/PrimaryKey', + ], + 'structural_type': 'str', + }, + }, + { + 'selector': ['authors', '__ALL_ELEMENTS__', 1], + 'metadata': { + 'name': 'name', + 'semantic_types': [ + 'http://schema.org/Text', + 'https://metadata.datadrivendiscovery.org/types/Attribute', + ], + 'structural_type': 'str', + }, + }, + { + 'selector': ['codes'], + 'metadata': { + 'dimension': { + 'length': 3, + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + }, + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'structural_type': 'd3m.container.pandas.DataFrame', + }, + }, + { + 'selector': ['codes', '__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'length': 3, + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + } + }, + }, + { + 'selector': ['codes', '__ALL_ELEMENTS__', 0], + 'metadata': { + 'name': 'code', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/CategoricalData', + 
'https://metadata.datadrivendiscovery.org/types/PrimaryKey', + ], + 'structural_type': 'str', + }, + }, + { + 'selector': ['codes', '__ALL_ELEMENTS__', 1], + 'metadata': { + 'name': 'name', + 'semantic_types': [ + 'http://schema.org/Text', + 'https://metadata.datadrivendiscovery.org/types/Attribute', + ], + 'structural_type': 'str', + }, + }, + { + 'selector': ['codes', '__ALL_ELEMENTS__', 2], + 'metadata': { + 'foreign_key': {'column_index': 0, 'resource_id': 'authors', 'type': 'COLUMN'}, + 'name': 'author', + 'semantic_types': [ + 'http://schema.org/Integer', + 'https://metadata.datadrivendiscovery.org/types/Attribute', + ], + 'structural_type': 'str', + }, + }, + { + 'selector': ['learningData'], + 'metadata': { + 'dimension': { + 'length': 45, + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + }, + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/Table', + 'https://metadata.datadrivendiscovery.org/types/DatasetEntryPoint', + ], + 'structural_type': 'd3m.container.pandas.DataFrame', + }, + }, + { + 'selector': ['learningData', '__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'length': 5, + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + } + }, + }, + { + 'selector': ['learningData', '__ALL_ELEMENTS__', 0], + 'metadata': { + 'name': 'd3mIndex', + 'semantic_types': [ + 'http://schema.org/Integer', + 'https://metadata.datadrivendiscovery.org/types/PrimaryKey', + ], + 'structural_type': 'str', + }, + }, + { + 'selector': ['learningData', '__ALL_ELEMENTS__', 1], + 'metadata': { + 'foreign_key': {'column_index': 0, 'column_name': '__NO_VALUE__', 'resource_id': 'codes', 'type': 'COLUMN'}, + 'name': 'code', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/CategoricalData', + 'https://metadata.datadrivendiscovery.org/types/Attribute', + ], + 'structural_type': 'str', + }, + }, + { + 'selector': ['learningData', '__ALL_ELEMENTS__', 2], + 'metadata': { + 'name': 'key', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/CategoricalData', + 'https://metadata.datadrivendiscovery.org/types/Attribute', + ], + 'structural_type': 'str', + }, + }, + { + 'selector': ['learningData', '__ALL_ELEMENTS__', 3], + 'metadata': { + 'name': 'year', + 'semantic_types': [ + 'http://schema.org/DateTime', + 'https://metadata.datadrivendiscovery.org/types/Attribute', + ], + 'structural_type': 'str', + }, + }, + { + 'selector': ['learningData', '__ALL_ELEMENTS__', 4], + 'metadata': { + 'name': 'value', + 'semantic_types': [ + 'http://schema.org/Float', + 'https://metadata.datadrivendiscovery.org/types/SuggestedTarget', + 'https://metadata.datadrivendiscovery.org/types/Attribute', + ], + 'structural_type': 'str', + }, + }, + { + 'selector': ['values'], + 'metadata': { + 'dimension': { + 'length': 64, + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + }, + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'structural_type': 'd3m.container.pandas.DataFrame', + }, + }, + { + 'selector': ['values', '__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'length': 4, + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + } + }, + }, + { + 'selector': ['values', '__ALL_ELEMENTS__', 0], + 'metadata': { + 'foreign_key': {'column_index': 0, 'column_name': '__NO_VALUE__', 'resource_id': 'codes', 'type': 'COLUMN'}, + 
'name': 'code', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/CategoricalData', + 'https://metadata.datadrivendiscovery.org/types/Attribute', + ], + 'structural_type': 'str', + }, + }, + { + 'selector': ['values', '__ALL_ELEMENTS__', 1], + 'metadata': { + 'name': 'key', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/CategoricalData', + 'https://metadata.datadrivendiscovery.org/types/Attribute', + ], + 'structural_type': 'str', + }, + }, + { + 'selector': ['values', '__ALL_ELEMENTS__', 2], + 'metadata': { + 'name': 'year', + 'semantic_types': [ + 'http://schema.org/DateTime', + 'https://metadata.datadrivendiscovery.org/types/Attribute', + ], + 'structural_type': 'str', + }, + }, + { + 'selector': ['values', '__ALL_ELEMENTS__', 3], + 'metadata': { + 'name': 'value', + 'semantic_types': [ + 'http://schema.org/Float', + 'https://metadata.datadrivendiscovery.org/types/Attribute', + ], + 'structural_type': 'str', + }, + }, + ], + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tods/common-primitives/tests/test_normalize_graphs.py b/tods/common-primitives/tests/test_normalize_graphs.py new file mode 100644 index 0000000..e6eeb8d --- /dev/null +++ b/tods/common-primitives/tests/test_normalize_graphs.py @@ -0,0 +1,207 @@ +import os +import unittest + +from d3m import container, utils +from d3m.metadata import base as metadata_base + +from common_primitives import normalize_graphs, denormalize, dataset_map, column_parser, normalize_column_references, simple_profiler + +import utils as test_utils + + +class NormalizeGraphsPrimitiveTestCase(unittest.TestCase): + def _parse_columns(self, dataset): + hyperparams_class = dataset_map.DataFrameDatasetMapPrimitive.metadata.get_hyperparams() + + primitive = dataset_map.DataFrameDatasetMapPrimitive( + # We have to make an instance of the primitive ourselves. + hyperparams=hyperparams_class.defaults().replace({ + 'primitive': column_parser.ColumnParserPrimitive( + hyperparams=column_parser.ColumnParserPrimitive.metadata.get_hyperparams().defaults(), + ), + 'resources': 'all', + }), + + ) + + return primitive.produce(inputs=dataset).value + + def _normalize_column_references(self, dataset): + hyperparams_class = normalize_column_references.NormalizeColumnReferencesPrimitive.metadata.get_hyperparams() + + primitive = normalize_column_references.NormalizeColumnReferencesPrimitive( + hyperparams=hyperparams_class.defaults(), + ) + + return primitive.produce(inputs=dataset).value + + def _get_dataset_1(self): + dataset_doc_path = os.path.abspath( + os.path.join(os.path.dirname(__file__), 'data', 'datasets', 'graph_dataset_1', 'datasetDoc.json') + ) + + dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path)) + + # We set semantic types like runtime would. 
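+        # After the target column is marked below, _get_dataset_1 pushes the graph dataset through
+        # normalize_column_references -> normalize_graphs -> simple_profiler (via dataset_map) ->
+        # column_parser -> denormalize, so that test_basic can compare it against _get_dataset_2().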
+ dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 2), 'https://metadata.datadrivendiscovery.org/types/Target') + dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 2), 'https://metadata.datadrivendiscovery.org/types/TrueTarget') + dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 2), 'https://metadata.datadrivendiscovery.org/types/Attribute') + + metadata_before = dataset.metadata.to_internal_json_structure() + + normalized_dataset = self._normalize_column_references(dataset) + + hyperparams_class = normalize_graphs.NormalizeGraphsPrimitive.metadata.get_hyperparams() + + primitive = normalize_graphs.NormalizeGraphsPrimitive( + hyperparams=hyperparams_class.defaults(), + ) + + normalized_dataset = primitive.produce(inputs=normalized_dataset).value + + hyperparams_class = dataset_map.DataFrameDatasetMapPrimitive.metadata.get_hyperparams() + + primitive = dataset_map.DataFrameDatasetMapPrimitive( + # We have to make an instance of the primitive ourselves. + hyperparams=hyperparams_class.defaults().replace({ + 'primitive': simple_profiler.SimpleProfilerPrimitive( + hyperparams=simple_profiler.SimpleProfilerPrimitive.metadata.get_hyperparams().defaults().replace({ + 'detect_semantic_types': [ + 'http://schema.org/Boolean', 'https://metadata.datadrivendiscovery.org/types/CategoricalData', + 'http://schema.org/Integer', 'http://schema.org/Float', 'http://schema.org/Text', + 'https://metadata.datadrivendiscovery.org/types/FloatVector', 'http://schema.org/DateTime', + 'https://metadata.datadrivendiscovery.org/types/Attribute', + 'https://metadata.datadrivendiscovery.org/types/Time', + 'https://metadata.datadrivendiscovery.org/types/UnknownType', + ], + }), + ), + 'resources': 'all', + }), + + ) + + primitive.set_training_data(inputs=normalized_dataset) + primitive.fit() + normalized_dataset = primitive.produce(inputs=normalized_dataset).value + + normalized_dataset = self._parse_columns(normalized_dataset) + + hyperparams_class = denormalize.DenormalizePrimitive.metadata.get_hyperparams() + + primitive = denormalize.DenormalizePrimitive( + hyperparams=hyperparams_class.defaults(), + ) + + normalized_dataset = primitive.produce(inputs=normalized_dataset).value + + # To make metadata match in recorded structural types. + normalized_dataset.metadata = normalized_dataset.metadata.generate(normalized_dataset) + + self.assertEqual(metadata_before, dataset.metadata.to_internal_json_structure()) + + return normalized_dataset + + def _get_dataset_2(self): + dataset_doc_path = os.path.abspath( + os.path.join(os.path.dirname(__file__), 'data', 'datasets', 'graph_dataset_2', 'datasetDoc.json') + ) + + dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path)) + + # We set semantic types like runtime would. 
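+        # _get_dataset_2 uses a shorter chain (normalize_column_references -> normalize_graphs ->
+        # column_parser), since graph_dataset_2 appears to already ship its graph as an 'edgeList'
+        # table that test_basic only renames to 'G1_edges' before comparing.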
+ dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 4), 'https://metadata.datadrivendiscovery.org/types/Target') + dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 4), 'https://metadata.datadrivendiscovery.org/types/TrueTarget') + dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 4), 'https://metadata.datadrivendiscovery.org/types/Attribute') + + metadata_before = dataset.metadata.to_internal_json_structure() + + normalized_dataset = self._normalize_column_references(dataset) + + hyperparams_class = normalize_graphs.NormalizeGraphsPrimitive.metadata.get_hyperparams() + + primitive = normalize_graphs.NormalizeGraphsPrimitive( + hyperparams=hyperparams_class.defaults(), + ) + + normalized_dataset = primitive.produce(inputs=normalized_dataset).value + + normalized_dataset = self._parse_columns(normalized_dataset) + + # To make metadata match in recorded structural types. + normalized_dataset.metadata = normalized_dataset.metadata.generate(normalized_dataset) + + self.assertEqual(metadata_before, dataset.metadata.to_internal_json_structure()) + + return normalized_dataset + + def test_basic(self): + self.maxDiff = None + + dataset_1 = self._get_dataset_1() + dataset_2 = self._get_dataset_2() + + # Making some changes to make resulting datasets the same. + dataset_2['G1_edges'] = dataset_2['edgeList'] + del dataset_2['edgeList'] + + dataset_2.metadata = dataset_2.metadata.copy_to(dataset_2.metadata, ('edgeList',), ('G1_edges',)) + dataset_2.metadata = dataset_2.metadata.remove(('edgeList',), recursive=True) + + for field in ['description', 'digest', 'id', 'location_uris', 'name']: + dataset_1.metadata = dataset_1.metadata.update((), {field: metadata_base.NO_VALUE}) + dataset_2.metadata = dataset_2.metadata.update((), {field: metadata_base.NO_VALUE}) + + dataset_1_metadata = test_utils.effective_metadata(dataset_1.metadata) + dataset_2_metadata = test_utils.effective_metadata(dataset_2.metadata) + + # Removing an ALL_ELEMENTS selector which does not really apply to any element anymore + # (it is overridden by more specific selectors). 
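+        # Entry 3 of dataset_1's effective-metadata list is that stale ALL_ELEMENTS selector.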
+ del dataset_1_metadata[3] + + self.assertEqual(dataset_1_metadata, dataset_2_metadata) + + self.assertCountEqual(dataset_1.keys(), dataset_2.keys()) + + for resource_id in dataset_1.keys(): + self.assertTrue(dataset_1[resource_id].equals(dataset_2[resource_id]), resource_id) + + def test_idempotent_dataset_1(self): + dataset = self._get_dataset_1() + + hyperparams_class = normalize_graphs.NormalizeGraphsPrimitive.metadata.get_hyperparams() + + primitive = normalize_graphs.NormalizeGraphsPrimitive( + hyperparams=hyperparams_class.defaults(), + ) + + normalized_dataset = primitive.produce(inputs=dataset).value + + self.assertEqual(utils.to_json_structure(dataset.metadata.to_internal_simple_structure()), normalized_dataset.metadata.to_internal_json_structure()) + + self.assertCountEqual(dataset.keys(), normalized_dataset.keys()) + + for resource_id in dataset.keys(): + self.assertTrue(dataset[resource_id].equals(normalized_dataset[resource_id]), resource_id) + + def test_idempotent_dataset_2(self): + dataset = self._get_dataset_2() + + hyperparams_class = normalize_graphs.NormalizeGraphsPrimitive.metadata.get_hyperparams() + + primitive = normalize_graphs.NormalizeGraphsPrimitive( + hyperparams=hyperparams_class.defaults(), + ) + + normalized_dataset = primitive.produce(inputs=dataset).value + + self.assertEqual(dataset.metadata.to_internal_json_structure(), normalized_dataset.metadata.to_internal_json_structure()) + + self.assertCountEqual(dataset.keys(), normalized_dataset.keys()) + + for resource_id in dataset.keys(): + self.assertTrue(dataset[resource_id].equals(normalized_dataset[resource_id]), resource_id) + + +if __name__ == '__main__': + unittest.main() diff --git a/tods/common-primitives/tests/test_numeric_range_filter.py b/tods/common-primitives/tests/test_numeric_range_filter.py new file mode 100644 index 0000000..df340af --- /dev/null +++ b/tods/common-primitives/tests/test_numeric_range_filter.py @@ -0,0 +1,143 @@ +import unittest +import os + +from common_primitives import numeric_range_filter +from d3m import container + +import utils as test_utils + + +class NumericRangeFilterPrimitiveTestCase(unittest.TestCase): + def test_inclusive_strict(self): + # load the iris dataset + dataset = test_utils.load_iris_metadata() + resource = test_utils.get_dataframe(dataset) + + filter_hyperparams_class = numeric_range_filter.NumericRangeFilterPrimitive.metadata.get_hyperparams() + hp = filter_hyperparams_class({ + 'column': 1, + 'min': 6.5, + 'max': 6.7, + 'strict': True, + 'inclusive': True + }) + filter_primitive = numeric_range_filter.NumericRangeFilterPrimitive(hyperparams=hp) + new_dataframe = filter_primitive.produce(inputs=resource).value + + self.assertGreater(new_dataframe['sepalLength'].astype(float).min(), 6.5) + self.assertLess(new_dataframe['sepalLength'].astype(float).max(), 6.7) + + def test_inclusive_permissive(self): + # load the iris dataset + dataset = test_utils.load_iris_metadata() + resource = test_utils.get_dataframe(dataset) + + filter_hyperparams_class = numeric_range_filter.NumericRangeFilterPrimitive.metadata.get_hyperparams() + hp = filter_hyperparams_class({ + 'column': 1, + 'min': 6.5, + 'max': 6.7, + 'strict': False, + 'inclusive': True + }) + filter_primitive = numeric_range_filter.NumericRangeFilterPrimitive(hyperparams=hp) + new_dataframe = filter_primitive.produce(inputs=resource).value + + self.assertGreaterEqual(new_dataframe['sepalLength'].astype(float).min(), 6.5) + self.assertLessEqual(new_dataframe['sepalLength'].astype(float).max(), 6.7) + + def 
test_exclusive_strict(self): + # load the iris dataset + dataset = test_utils.load_iris_metadata() + resource = test_utils.get_dataframe(dataset) + + filter_hyperparams_class = numeric_range_filter \ + .NumericRangeFilterPrimitive.metadata.get_hyperparams() + hp = filter_hyperparams_class({ + 'column': 1, + 'min': 6.5, + 'max': 6.7, + 'strict': True, + 'inclusive': False + }) + filter_primitive = numeric_range_filter.NumericRangeFilterPrimitive(hyperparams=hp) + new_dataframe = filter_primitive.produce(inputs=resource).value + + self.assertEqual( + len(new_dataframe.loc[ + (new_dataframe['sepalLength'].astype(float) >= 6.5) & + (new_dataframe['sepalLength'].astype(float) <= 6.7)]), 0) + + def test_exclusive_permissive(self): + # load the iris dataset + dataset = test_utils.load_iris_metadata() + resource = test_utils.get_dataframe(dataset) + + filter_hyperparams_class = numeric_range_filter \ + .NumericRangeFilterPrimitive.metadata.get_hyperparams() + hp = filter_hyperparams_class({ + 'column': 1, + 'min': 6.5, + 'max': 6.7, + 'strict': False, + 'inclusive': False + }) + filter_primitive = numeric_range_filter.NumericRangeFilterPrimitive(hyperparams=hp) + new_dataframe = filter_primitive.produce(inputs=resource).value + + self.assertEqual( + len(new_dataframe.loc[ + (new_dataframe['sepalLength'].astype(float) > 6.5) & + (new_dataframe['sepalLength'].astype(float) < 6.7)]), 0) + + def test_row_metadata_removal(self): + # load the iris dataset + dataset = test_utils.load_iris_metadata() + + # add metadata for rows 0 and 1 + dataset.metadata = dataset.metadata.update(('learningData', 0), {'a': 0}) + dataset.metadata = dataset.metadata.update(('learningData', 5), {'b': 1}) + + resource = test_utils.get_dataframe(dataset) + + # apply filter that removes rows 0 and 1 + filter_hyperparams_class = numeric_range_filter.NumericRangeFilterPrimitive.metadata.get_hyperparams() + hp = filter_hyperparams_class({ + 'column': 0, + 'min': 1, + 'max': 4, + 'strict': True, + 'inclusive': False + }) + filter_primitive = numeric_range_filter.NumericRangeFilterPrimitive(hyperparams=hp) + new_df = filter_primitive.produce(inputs=resource).value + + # verify that the length is correct + self.assertEqual(len(new_df), new_df.metadata.query(())['dimension']['length']) + + # verify that the rows were re-indexed in the metadata + self.assertEqual(new_df.metadata.query((0,))['a'], 0) + self.assertEqual(new_df.metadata.query((1,))['b'], 1) + self.assertFalse('b' in new_df.metadata.query((5,))) + + def test_bad_type_handling(self): + dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'datasets', 'timeseries_dataset_1', 'datasetDoc.json')) + dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path)) + resource = test_utils.get_dataframe(dataset) + + filter_hyperparams_class = numeric_range_filter \ + .NumericRangeFilterPrimitive.metadata.get_hyperparams() + hp = filter_hyperparams_class({ + 'column': 1, + 'min': 6.5, + 'max': 6.7, + 'strict': False, + 'inclusive': False + }) + filter_primitive = numeric_range_filter.NumericRangeFilterPrimitive(hyperparams=hp) + with self.assertRaises(ValueError): + filter_primitive.produce(inputs=resource) + + +if __name__ == '__main__': + unittest.main() diff --git a/tods/common-primitives/tests/test_one_hot_maker.py b/tods/common-primitives/tests/test_one_hot_maker.py new file mode 100644 index 0000000..245fd70 --- /dev/null +++ b/tods/common-primitives/tests/test_one_hot_maker.py @@ -0,0 +1,516 @@ +import os 
+import time +import unittest +import numpy as np +import pickle +from d3m import container, exceptions, utils +from d3m.metadata import base as metadata_base + +from common_primitives import dataset_to_dataframe, extract_columns_semantic_types, one_hot_maker, column_parser + + +def _copy_target_as_categorical_feature(attributes, targets): + attributes = targets.append_columns(attributes) + for column_name in targets.columns.values: + column_mask = attributes.columns.get_loc(column_name) + if isinstance(column_mask, int): + column_index = column_mask + else: + column_index = np.where(column_mask)[0][-1].item() + attributes.metadata = attributes.metadata.remove_semantic_type( + (metadata_base.ALL_ELEMENTS, column_index), + 'https://metadata.datadrivendiscovery.org/types/SuggestedTarget') + attributes.metadata = attributes.metadata.remove_semantic_type( + (metadata_base.ALL_ELEMENTS, column_index), + 'https://metadata.datadrivendiscovery.org/types/Target') + attributes.metadata = attributes.metadata.remove_semantic_type( + (metadata_base.ALL_ELEMENTS, column_index), + 'https://metadata.datadrivendiscovery.org/types/TrueTarget') + attributes.metadata = attributes.metadata.add_semantic_type( + (metadata_base.ALL_ELEMENTS, column_index), + 'https://metadata.datadrivendiscovery.org/types/CategoricalData') + attributes.metadata = attributes.metadata.add_semantic_type( + (metadata_base.ALL_ELEMENTS, column_index), + 'https://metadata.datadrivendiscovery.org/types/Attribute') + attributes.metadata = attributes.metadata.update_column(column_index, + {'custom_metadata': metadata_base.NO_VALUE}) + return attributes + + +def _get_iris(): + dataset_doc_path = os.path.abspath( + os.path.join(os.path.dirname(__file__), 'data', 'datasets', 'iris_dataset_1', 'datasetDoc.json')) + + dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path)) + + hyperparams_class = \ + dataset_to_dataframe.DatasetToDataFramePrimitive.metadata.query()['primitive_code']['class_type_arguments'][ + 'Hyperparams'] + primitive = dataset_to_dataframe.DatasetToDataFramePrimitive(hyperparams=hyperparams_class.defaults()) + + dataframe = primitive.produce(inputs=dataset).value + + return dataframe + + +def _get_iris_columns(): + dataframe = _get_iris() + + # We set custom metadata on columns. + for column_index in range(1, 5): + dataframe.metadata = dataframe.metadata.update_column(column_index, {'custom_metadata': 'attributes'}) + for column_index in range(5, 6): + dataframe.metadata = dataframe.metadata.update_column(column_index, {'custom_metadata': 'targets'}) + + # We set semantic types like runtime would. + dataframe.metadata = dataframe.metadata.add_semantic_type((metadata_base.ALL_ELEMENTS, 5), + 'https://metadata.datadrivendiscovery.org/types/Target') + dataframe.metadata = dataframe.metadata.add_semantic_type((metadata_base.ALL_ELEMENTS, 5), + 'https://metadata.datadrivendiscovery.org/types/TrueTarget') + dataframe.metadata = dataframe.metadata.remove_semantic_type((metadata_base.ALL_ELEMENTS, 5), + 'https://metadata.datadrivendiscovery.org/types/Attribute') + + # Parsing. 
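+    # Parse the raw string columns (d3mIndex to int, the four measurements to float), then split the
+    # frame into attribute columns and suggested-target columns by semantic type.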
+ hyperparams_class = \ + column_parser.ColumnParserPrimitive.metadata.query()['primitive_code']['class_type_arguments'][ + 'Hyperparams'] + primitive = column_parser.ColumnParserPrimitive(hyperparams=hyperparams_class.defaults()) + dataframe = primitive.produce(inputs=dataframe).value + + hyperparams_class = \ + extract_columns_semantic_types.ExtractColumnsBySemanticTypesPrimitive.metadata.query()['primitive_code'][ + 'class_type_arguments']['Hyperparams'] + + primitive = extract_columns_semantic_types.ExtractColumnsBySemanticTypesPrimitive( + hyperparams=hyperparams_class.defaults().replace( + {'semantic_types': ('https://metadata.datadrivendiscovery.org/types/Attribute',)})) + attributes = primitive.produce(inputs=dataframe).value + + primitive = extract_columns_semantic_types.ExtractColumnsBySemanticTypesPrimitive( + hyperparams=hyperparams_class.defaults().replace( + {'semantic_types': ('https://metadata.datadrivendiscovery.org/types/SuggestedTarget',)})) + targets = primitive.produce(inputs=dataframe).value + + return dataframe, attributes, targets + + +class OneHotTestCase(unittest.TestCase): + attributes: container.DataFrame = None + excp_attributes: container.DataFrame = None + targets: container.DataFrame = None + dataframe: container.DataFrame = None + unseen_species: str = 'Unseen-Species' + missing_value: float = np.NaN + + @classmethod + def setUpClass(cls) -> None: + cls.dataframe, cls.attributes, cls.targets = _get_iris_columns() + cls.attributes = _copy_target_as_categorical_feature(attributes=cls.attributes, targets=cls.targets) + cls.excp_attributes = cls.attributes.copy() + + def tearDown(self): + self.attributes.iloc[:3, 0] = 'Iris-setosa' + self.excp_attributes.iloc[:3, 0] = 'Iris-setosa' + + def test_fit_produce(self): + attributes = _copy_target_as_categorical_feature(self.attributes, + self.targets.rename(columns={'species': '2-species'})) + attributes.metadata = attributes.metadata.update_column(1, { + 'name': '2-species' + }) + + hyperparams_class = \ + one_hot_maker.OneHotMakerPrimitive.metadata.query()['primitive_code']['class_type_arguments'][ + 'Hyperparams'] + primitive = one_hot_maker.OneHotMakerPrimitive( + hyperparams=hyperparams_class.defaults().replace({'return_result': 'replace'})) + + primitive.set_training_data(inputs=attributes) + primitive.fit() + after_onehot = primitive.produce(inputs=attributes).value + # 1 for the original, so we remove it. 
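+        # With 3 species categories and return_result='replace', each of the 2 categorical columns is
+        # swapped for 3 indicator columns, a net gain of (3 - 1) per column: 6 + 2 * 2 = 10 columns.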
+ self.assertEqual(after_onehot.shape[1], 2 * (len(self.targets['species'].unique()) - 1) + attributes.shape[1]) + self.assertEqual(after_onehot.shape[0], self.targets.shape[0]) + # 3 unique value for 2 (species, 2-species) 3 * 2 = 6 + self.assertTrue(all(dtype == 'uint8' for dtype in after_onehot.dtypes[:6])) + self.assertEqual(list(after_onehot.columns.values), [ + 'species.Iris-setosa', 'species.Iris-versicolor', 'species.Iris-virginica', + '2-species.Iris-setosa', '2-species.Iris-versicolor', '2-species.Iris-virginica', + 'sepalLength', 'sepalWidth', 'petalLength', 'petalWidth']) + self._test_metadata_return_replace(after_onehot.metadata) + + def test_error_unseen_categories_ignore(self): + # default(ignore) case + self.excp_attributes.iloc[0, 0] = self.unseen_species + self.excp_attributes.iloc[1, 0] = self.unseen_species + '-2' + self.excp_attributes.iloc[2, 0] = np.NaN + hyperparams_class = \ + one_hot_maker.OneHotMakerPrimitive.metadata.query()['primitive_code']['class_type_arguments'][ + 'Hyperparams'] + primitive = one_hot_maker.OneHotMakerPrimitive( + hyperparams=hyperparams_class.defaults().replace({'return_result': 'replace'})) + + primitive.set_training_data(inputs=self.attributes) + primitive.fit() + one_hot_result = primitive.produce(inputs=self.excp_attributes).value + self.assertEqual(one_hot_result.shape[1], len(self.targets['species'].unique()) + self.attributes.shape[1] - 1) + self.assertEqual(one_hot_result.shape[0], self.targets.shape[0]) + self.assertTrue(all(dtype == 'uint8' for dtype in one_hot_result.dtypes[:3])) + + def test_error_unseen_categories_error(self): + # error case + self.excp_attributes.iloc[0, 0] = self.unseen_species + self.excp_attributes.iloc[1, 0] = self.unseen_species + '-2' + self.excp_attributes.iloc[2, 0] = np.NaN + hyperparams_class = \ + one_hot_maker.OneHotMakerPrimitive.metadata.query()['primitive_code']['class_type_arguments'][ + 'Hyperparams'] + primitive = one_hot_maker.OneHotMakerPrimitive( + hyperparams=hyperparams_class.defaults().replace({'return_result': 'replace', 'handle_unseen': 'error'})) + + primitive.set_training_data(inputs=self.attributes) + primitive.fit() + self.assertRaises(exceptions.UnexpectedValueError, primitive.produce, inputs=self.excp_attributes) + + def test_unseen_categories_handle(self): + # handle case + self.excp_attributes.iloc[0, 0] = self.unseen_species + self.excp_attributes.iloc[1, 0] = self.unseen_species + '-2' + self.excp_attributes.iloc[2, 0] = np.NaN + hyperparams_class = \ + one_hot_maker.OneHotMakerPrimitive.metadata.query()['primitive_code']['class_type_arguments'][ + 'Hyperparams'] + primitive = one_hot_maker.OneHotMakerPrimitive( + hyperparams=hyperparams_class.defaults().replace({'return_result': 'replace', 'handle_unseen': 'column'})) + + primitive.set_training_data(inputs=self.attributes) + primitive.fit() + one_hot_result = primitive.produce(inputs=self.excp_attributes).value + self.assertEqual(one_hot_result.shape[1], + len(self.targets['species'].unique()) + self.attributes.shape[1] - 1 + 1) + # unseen cell should be 1 + self.assertEqual(one_hot_result.iloc[0, 3], 1) + self.assertEqual(one_hot_result.shape[0], self.targets.shape[0]) + self.assertTrue(all(dtype == 'uint8' for dtype in one_hot_result.dtypes[:3])) + self.assertEqual(set(one_hot_result.columns.values), {'petalLength', + 'petalWidth', + 'sepalLength', + 'sepalWidth', + 'species.Iris-setosa', + 'species.Iris-versicolor', + 'species.Iris-virginica', + 'species.Unseen'}) + 
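+        # The column-level metadata should line up with the eight columns asserted above.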
self._test_metadata_unseen_handle_return_replace(one_hot_result.metadata) + + def test_missing_value_ignore(self): + self.excp_attributes.iloc[0, 0] = self.missing_value + self.excp_attributes.iloc[1, 0] = self.missing_value + + # missing present during fit + hyperparams_class = \ + one_hot_maker.OneHotMakerPrimitive.metadata.query()['primitive_code']['class_type_arguments'][ + 'Hyperparams'] + primitive = one_hot_maker.OneHotMakerPrimitive( + hyperparams=hyperparams_class.defaults().replace({'return_result': 'replace'})) + + primitive.set_training_data(inputs=self.excp_attributes) + primitive.fit() + one_hot_result = primitive.produce(inputs=self.excp_attributes).value + self.assertEqual(one_hot_result.shape[1], len(self.targets['species'].unique()) + self.attributes.shape[1] - 1) + self.assertEqual(one_hot_result.shape[0], self.targets.shape[0]) + self.assertTrue(all(dtype == 'uint8' for dtype in one_hot_result.dtypes[:3])) + self.assertEqual(set(one_hot_result.columns.values), { + 'species.Iris-setosa', 'species.Iris-versicolor', 'species.Iris-virginica', + 'sepalLength', 'sepalWidth', 'petalLength', 'petalWidth'}) + + hyperparams_class = \ + one_hot_maker.OneHotMakerPrimitive.metadata.query()['primitive_code']['class_type_arguments'][ + 'Hyperparams'] + primitive = one_hot_maker.OneHotMakerPrimitive( + hyperparams=hyperparams_class.defaults().replace({'return_result': 'replace'})) + + primitive.set_training_data(inputs=self.attributes) + primitive.fit() + one_hot_result = primitive.produce(inputs=self.excp_attributes).value + self.assertEqual(one_hot_result.shape[1], len(self.targets['species'].unique()) + self.attributes.shape[1] - 1) + self.assertEqual(one_hot_result.shape[0], self.targets.shape[0]) + self.assertTrue(all(dtype == 'uint8' for dtype in one_hot_result.dtypes[:3])) + self.assertEqual(set(one_hot_result.columns.values), { + 'species.Iris-setosa', 'species.Iris-versicolor', 'species.Iris-virginica', + 'sepalLength', 'sepalWidth', 'petalLength', 'petalWidth'}) + + def test_missing_value_error(self): + self.excp_attributes.iloc[0, 0] = np.NaN + self.excp_attributes.iloc[1, 0] = None + # error + hyperparams_class = \ + one_hot_maker.OneHotMakerPrimitive.metadata.query()['primitive_code']['class_type_arguments'][ + 'Hyperparams'] + primitive = one_hot_maker.OneHotMakerPrimitive( + hyperparams=hyperparams_class.defaults().replace({ + 'return_result': 'replace', + 'handle_missing_value': 'error', + })) + + primitive.set_training_data(inputs=self.excp_attributes) + self.assertRaises(exceptions.MissingValueError, primitive.fit) + + def test_missing_value_column(self): + self.excp_attributes.iloc[0, 0] = np.NaN + self.excp_attributes.iloc[1, 0] = np.NaN + self.excp_attributes.iloc[2, 0] = 'Unseen-Species' + # column + hyperparams_class = \ + one_hot_maker.OneHotMakerPrimitive.metadata.query()['primitive_code']['class_type_arguments'][ + 'Hyperparams'] + primitive = one_hot_maker.OneHotMakerPrimitive( + hyperparams=hyperparams_class.defaults().replace({ + 'return_result': 'replace', + 'handle_missing_value': 'column', + })) + + primitive.set_training_data(inputs=self.attributes) + primitive.fit() + one_hot_result = primitive.produce(inputs=self.excp_attributes).value + self.assertEqual(one_hot_result.shape[1], + len(self.targets['species'].unique()) + 1 + self.attributes.shape[1] - 1) + self.assertEqual(one_hot_result.shape[0], self.targets.shape[0]) + self.assertTrue(all(dtype == 'uint8' for dtype in one_hot_result.dtypes[:4])) + self.assertEqual(set(one_hot_result.columns.values), 
{'petalLength', + 'petalWidth', + 'sepalLength', + 'sepalWidth', + 'species.Iris-setosa', + 'species.Iris-versicolor', + 'species.Iris-virginica', + 'species.Missing'}) + + def test_unseen_column_and_missing_value_column(self): + self.excp_attributes.iloc[0, 0] = np.NaN + self.excp_attributes.iloc[1, 0] = np.NaN + self.excp_attributes.iloc[2, 0] = 'Unseen-Species' + # column + hyperparams_class = \ + one_hot_maker.OneHotMakerPrimitive.metadata.query()['primitive_code']['class_type_arguments'][ + 'Hyperparams'] + primitive = one_hot_maker.OneHotMakerPrimitive( + hyperparams=hyperparams_class.defaults().replace({ + 'return_result': 'replace', + 'handle_missing_value': 'column', + 'handle_unseen': 'column' + })) + + primitive.set_training_data(inputs=self.attributes) + primitive.fit() + one_hot_result = primitive.produce(inputs=self.excp_attributes).value + self.assertEqual(one_hot_result.shape[1], + len(self.targets['species'].unique()) + 2 + self.attributes.shape[1] - 1) + self.assertEqual(one_hot_result.shape[0], self.targets.shape[0]) + self.assertTrue(all(dtype == 'uint8' for dtype in one_hot_result.dtypes[:4])) + self.assertEqual(set(one_hot_result.columns.values), {'petalLength', + 'petalWidth', + 'sepalLength', + 'sepalWidth', + 'species.Iris-setosa', + 'species.Iris-versicolor', + 'species.Iris-virginica', + 'species.Missing', + 'species.Unseen'}) + + def test_pickle_unpickle(self): + hyperparams_class = \ + one_hot_maker.OneHotMakerPrimitive.metadata.query()['primitive_code']['class_type_arguments'][ + 'Hyperparams'] + primitive = one_hot_maker.OneHotMakerPrimitive( + hyperparams=hyperparams_class.defaults().replace({ + 'return_result': 'replace', + 'handle_missing_value': 'column', + 'handle_unseen': 'column' + })) + + primitive.set_training_data(inputs=self.attributes) + primitive.fit() + + before_pickled_prediction = primitive.produce(inputs=self.attributes).value + pickle_object = pickle.dumps(primitive) + primitive = pickle.loads(pickle_object) + after_unpickled_prediction = primitive.produce(inputs=self.attributes).value + self.assertTrue(container.DataFrame.equals(before_pickled_prediction, after_unpickled_prediction)) + + def _test_metadata_unseen_handle_return_replace(self, after_onehot_metadata): + self.assertEqual(utils.to_json_structure(after_onehot_metadata.to_internal_simple_structure()), [{ + 'metadata': { + 'dimension': { + 'length': 150, + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'] + }, + 'schema': 'https://metadata.datadrivendiscovery.org/schemas/v0/container.json', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'structural_type': 'd3m.container.pandas.DataFrame' + }, + 'selector': [] + }, + { + 'metadata': { + 'dimension': { + 'length': 8, + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'] + } + }, + 'selector': ['__ALL_ELEMENTS__'] + }, + { + 'metadata': { + 'custom_metadata': '__NO_VALUE__', + 'name': 'species.Iris-setosa', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.uint8' + }, + 'selector': ['__ALL_ELEMENTS__', 0] + }, + { + 'metadata': { + 'custom_metadata': '__NO_VALUE__', + 'name': 'species.Iris-versicolor', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.uint8' + }, + 'selector': ['__ALL_ELEMENTS__', 1]}, + { + 'metadata': { + 'custom_metadata': '__NO_VALUE__', + 'name': 
'species.Iris-virginica', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.uint8' + }, + 'selector': ['__ALL_ELEMENTS__', 2]}, + { + 'metadata': {'custom_metadata': '__NO_VALUE__', + 'name': 'species.Unseen', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.uint8'}, + 'selector': ['__ALL_ELEMENTS__', 3] + }, + { + 'metadata': { + 'custom_metadata': 'attributes', + 'name': 'sepalLength', + 'semantic_types': ['http://schema.org/Float', + 'https://metadata.datadrivendiscovery.org/types/Attribute' + ], + 'structural_type': 'float' + }, + 'selector': ['__ALL_ELEMENTS__', 4] + }, + { + 'metadata': { + 'custom_metadata': 'attributes', + 'name': 'sepalWidth', + 'semantic_types': ['http://schema.org/Float', + 'https://metadata.datadrivendiscovery.org/types/Attribute' + ], + 'structural_type': 'float' + }, + 'selector': ['__ALL_ELEMENTS__', 5] + }, + { + 'metadata': { + 'custom_metadata': 'attributes', + 'name': 'petalLength', + 'semantic_types': ['http://schema.org/Float', + 'https://metadata.datadrivendiscovery.org/types/Attribute' + ], + 'structural_type': 'float' + }, + 'selector': ['__ALL_ELEMENTS__', 6] + }, + { + 'metadata': { + 'custom_metadata': 'attributes', + 'name': 'petalWidth', + 'semantic_types': ['http://schema.org/Float', + 'https://metadata.datadrivendiscovery.org/types/Attribute' + ], + 'structural_type': 'float' + }, + 'selector': ['__ALL_ELEMENTS__', 7] + } + ]) + + def _test_metadata_return_replace(self, after_onehot_metadata): + self.assertEqual( + utils.to_json_structure(after_onehot_metadata.to_internal_simple_structure()), + [{'metadata': {'dimension': {'length': 150, + 'name': 'rows', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/TabularRow']}, + 'schema': 'https://metadata.datadrivendiscovery.org/schemas/v0/container.json', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'structural_type': 'd3m.container.pandas.DataFrame'}, + 'selector': []}, + {'metadata': {'dimension': {'length': 10, + 'name': 'columns', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/TabularColumn']}}, + 'selector': ['__ALL_ELEMENTS__']}, + {'metadata': {'custom_metadata': '__NO_VALUE__', + 'name': 'species.Iris-setosa', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.uint8'}, + 'selector': ['__ALL_ELEMENTS__', 0]}, + {'metadata': {'custom_metadata': '__NO_VALUE__', + 'name': 'species.Iris-versicolor', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.uint8'}, + 'selector': ['__ALL_ELEMENTS__', 1]}, + {'metadata': {'custom_metadata': '__NO_VALUE__', + 'name': 'species.Iris-virginica', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.uint8'}, + 'selector': ['__ALL_ELEMENTS__', 2]}, + {'metadata': {'custom_metadata': '__NO_VALUE__', + 'name': '2-species.Iris-setosa', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.uint8'}, + 'selector': ['__ALL_ELEMENTS__', 3]}, + {'metadata': {'custom_metadata': '__NO_VALUE__', + 'name': '2-species.Iris-versicolor', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.uint8'}, + 'selector': ['__ALL_ELEMENTS__', 4]}, + {'metadata': {'custom_metadata': '__NO_VALUE__', + 
'name': '2-species.Iris-virginica', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.uint8'}, + 'selector': ['__ALL_ELEMENTS__', 5]}, + {'metadata': {'custom_metadata': 'attributes', + 'name': 'sepalLength', + 'semantic_types': ['http://schema.org/Float', + 'https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'float'}, + 'selector': ['__ALL_ELEMENTS__', 6]}, + {'metadata': {'custom_metadata': 'attributes', + 'name': 'sepalWidth', + 'semantic_types': ['http://schema.org/Float', + 'https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'float'}, + 'selector': ['__ALL_ELEMENTS__', 7]}, + {'metadata': {'custom_metadata': 'attributes', + 'name': 'petalLength', + 'semantic_types': ['http://schema.org/Float', + 'https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'float'}, + 'selector': ['__ALL_ELEMENTS__', 8]}, + {'metadata': {'custom_metadata': 'attributes', + 'name': 'petalWidth', + 'semantic_types': ['http://schema.org/Float', + 'https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'float'}, + 'selector': ['__ALL_ELEMENTS__', 9]}] + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tods/common-primitives/tests/test_pandas_onehot_encoder.py b/tods/common-primitives/tests/test_pandas_onehot_encoder.py new file mode 100644 index 0000000..d7b4b30 --- /dev/null +++ b/tods/common-primitives/tests/test_pandas_onehot_encoder.py @@ -0,0 +1,178 @@ +import unittest +import pandas as pd + +from d3m import container, utils +from common_primitives.pandas_onehot_encoder import PandasOneHotEncoderPrimitive +from d3m.metadata import base as metadata_base + +import utils as test_utils + + +class PandasOneHotEncoderPrimitiveTestCase(unittest.TestCase): + def test_basic(self): + training = pd.DataFrame({'Name': ['Henry', 'Diane', 'Kitty', 'Peter']}) + training = container.DataFrame(training, generate_metadata=True) + training.metadata = training.metadata.add_semantic_type((metadata_base.ALL_ELEMENTS, 0), 'https://metadata.datadrivendiscovery.org/types/CategoricalData',) + training.metadata = training.metadata.add_semantic_type((metadata_base.ALL_ELEMENTS, 0), 'https://metadata.datadrivendiscovery.org/types/Attribute',) + + testing = pd.DataFrame({'Name': ['John', 'Alex','Henry','Diane']}) + testing = container.DataFrame(testing, generate_metadata=True) + testing.metadata = testing.metadata.add_semantic_type((metadata_base.ALL_ELEMENTS, 0), 'https://metadata.datadrivendiscovery.org/types/CategoricalData') + testing.metadata = testing.metadata.add_semantic_type((metadata_base.ALL_ELEMENTS, 0), 'https://metadata.datadrivendiscovery.org/types/Attribute',) + testing.metadata = testing.metadata.update_column(0, { + 'custom_metadata': 42, + }) + + Hyperparams = PandasOneHotEncoderPrimitive.metadata.get_hyperparams() + ht = PandasOneHotEncoderPrimitive(hyperparams=Hyperparams.defaults()) + + ht.set_training_data(inputs=training) + ht.fit() + + result_df = ht.produce(inputs=testing).value + + self.assertEqual(list(result_df.columns), ['Name_Diane', 'Name_Henry', 'Name_Kitty', 'Name_Peter']) + + self.assertEqual(list(result_df['Name_Henry']), [0, 0, 1, 0]) + self.assertEqual(list(result_df['Name_Diane']), [0, 0, 0, 1]) + self.assertEqual(list(result_df['Name_Kitty']), [0, 0, 0, 0]) + self.assertEqual(list(result_df['Name_Peter']), [0, 0, 0, 0]) + + 
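+        # 'John' and 'Alex' were not seen during fit, so their rows are all zeros above; the encoder also
+        # carries the testing column's custom_metadata (42) over to every generated indicator column.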
self.assertEqual(test_utils.convert_metadata(utils.to_json_structure(result_df.metadata.to_internal_simple_structure())), [{ + 'selector': [], + 'metadata': { + 'dimension': { + 'length': 4, + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + }, + 'schema': 'https://metadata.datadrivendiscovery.org/schemas/v0/container.json', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'structural_type': 'd3m.container.pandas.DataFrame', + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'length': 4, + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': { + 'custom_metadata': 42, + 'name': 'Name_Diane', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.uint8', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': { + 'custom_metadata': 42, + 'name': 'Name_Henry', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.uint8', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': { + 'custom_metadata': 42, + 'name': 'Name_Kitty', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.uint8', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 3], + 'metadata': { + 'custom_metadata': 42, + 'name': 'Name_Peter', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.uint8', + }, + }]) + + ht = PandasOneHotEncoderPrimitive(hyperparams=Hyperparams.defaults().replace({ + 'dummy_na': True, + })) + + ht.set_training_data(inputs=training) + ht.fit() + + result_df = ht.produce(inputs=testing).value + + self.assertEqual(list(result_df.columns), ['Name_Diane', 'Name_Henry', 'Name_Kitty', 'Name_Peter', 'Name_nan']) + + self.assertEqual(list(result_df['Name_Henry']), [0, 0, 1, 0]) + self.assertEqual(list(result_df['Name_Diane']), [0, 0, 0, 1]) + self.assertEqual(list(result_df['Name_Kitty']), [0, 0, 0, 0]) + self.assertEqual(list(result_df['Name_Peter']), [0, 0, 0, 0]) + self.assertEqual(list(result_df['Name_nan']), [1, 1, 0, 0]) + + self.assertEqual(test_utils.convert_metadata(utils.to_json_structure(result_df.metadata.to_internal_simple_structure())), [{ + 'selector': [], + 'metadata': { + 'dimension': { + 'length': 4, + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + }, + 'schema': 'https://metadata.datadrivendiscovery.org/schemas/v0/container.json', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'structural_type': 'd3m.container.pandas.DataFrame', + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'length': 5, + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': { + 'custom_metadata': 42, + 'name': 'Name_Diane', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.uint8', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': { + 'custom_metadata': 42, + 'name': 'Name_Henry', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.uint8', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 
2], + 'metadata': { + 'custom_metadata': 42, + 'name': 'Name_Kitty', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.uint8', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 3], + 'metadata': { + 'custom_metadata': 42, + 'name': 'Name_Peter', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.uint8', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 4], + 'metadata': { + 'custom_metadata': 42, + 'name': 'Name_nan', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.uint8', + }, + }]) + + +if __name__ == '__main__': + unittest.main() diff --git a/tods/common-primitives/tests/test_random_forest.py b/tods/common-primitives/tests/test_random_forest.py new file mode 100644 index 0000000..5daee9c --- /dev/null +++ b/tods/common-primitives/tests/test_random_forest.py @@ -0,0 +1,701 @@ +import logging +import os +import pickle +import unittest + +from d3m import container, utils +from d3m.metadata import base as metadata_base + +from common_primitives import dataset_to_dataframe, extract_columns_semantic_types, random_forest, column_parser + + +class RandomForestTestCase(unittest.TestCase): + def _get_iris(self): + dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'datasets', 'iris_dataset_1', 'datasetDoc.json')) + + dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path)) + + hyperparams_class = dataset_to_dataframe.DatasetToDataFramePrimitive.metadata.get_hyperparams() + primitive = dataset_to_dataframe.DatasetToDataFramePrimitive(hyperparams=hyperparams_class.defaults()) + + dataframe = primitive.produce(inputs=dataset).value + + return dataframe + + def _get_iris_columns(self): + dataframe = self._get_iris() + + # We set custom metadata on columns. + for column_index in range(1, 5): + dataframe.metadata = dataframe.metadata.update_column(column_index, {'custom_metadata': 'attributes'}) + for column_index in range(5, 6): + dataframe.metadata = dataframe.metadata.update_column(column_index, {'custom_metadata': 'targets'}) + + # We set semantic types like runtime would. + dataframe.metadata = dataframe.metadata.add_semantic_type((metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Target') + dataframe.metadata = dataframe.metadata.add_semantic_type((metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/TrueTarget') + dataframe.metadata = dataframe.metadata.remove_semantic_type((metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Attribute') + + # Parsing. 
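+        # Same parsing and column-extraction steps as in the other test modules: parse the string
+        # columns into typed values, then select the Attribute and SuggestedTarget columns.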
+ hyperparams_class = column_parser.ColumnParserPrimitive.metadata.get_hyperparams() + primitive = column_parser.ColumnParserPrimitive(hyperparams=hyperparams_class.defaults()) + dataframe = primitive.produce(inputs=dataframe).value + + hyperparams_class = extract_columns_semantic_types.ExtractColumnsBySemanticTypesPrimitive.metadata.get_hyperparams() + + primitive = extract_columns_semantic_types.ExtractColumnsBySemanticTypesPrimitive(hyperparams=hyperparams_class.defaults().replace({'semantic_types': ('https://metadata.datadrivendiscovery.org/types/Attribute',)})) + attributes = primitive.produce(inputs=dataframe).value + + primitive = extract_columns_semantic_types.ExtractColumnsBySemanticTypesPrimitive(hyperparams=hyperparams_class.defaults().replace({'semantic_types': ('https://metadata.datadrivendiscovery.org/types/SuggestedTarget',)})) + targets = primitive.produce(inputs=dataframe).value + + return dataframe, attributes, targets + + def test_single_target(self): + dataframe, attributes, targets = self._get_iris_columns() + + self.assertEqual(list(targets.columns), ['species']) + + hyperparams_class = random_forest.RandomForestClassifierPrimitive.metadata.get_hyperparams() + primitive = random_forest.RandomForestClassifierPrimitive(hyperparams=hyperparams_class.defaults().replace({'return_result': 'new', 'add_index_columns': False})) + + primitive.set_training_data(inputs=attributes, outputs=targets) + primitive.fit() + + predictions = primitive.produce(inputs=attributes).value + + self.assertEqual(list(predictions.columns), ['species']) + + self.assertEqual(predictions.shape, (150, 1)) + self.assertEqual(predictions.iloc[0, 0], 'Iris-setosa') + self.assertTrue(predictions.metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 0), 'https://metadata.datadrivendiscovery.org/types/PredictedTarget')) + self.assertFalse(predictions.metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 0), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')) + self.assertEqual(predictions.metadata.query_column(0)['name'], 'species') + self.assertEqual(predictions.metadata.query_column(0)['custom_metadata'], 'targets') + + self._test_single_target_metadata(predictions.metadata) + + samples = primitive.sample(inputs=attributes).value + + self.assertEqual(list(samples[0].columns), ['species']) + + self.assertEqual(len(samples), 1) + self.assertEqual(samples[0].shape, (150, 1)) + self.assertEqual(samples[0].iloc[0, 0], 'Iris-setosa') + self.assertTrue(samples[0].metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 0), 'https://metadata.datadrivendiscovery.org/types/PredictedTarget')) + self.assertFalse(samples[0].metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 0), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')) + self.assertEqual(samples[0].metadata.query_column(0)['name'], 'species') + self.assertEqual(samples[0].metadata.query_column(0)['custom_metadata'], 'targets') + + log_likelihoods = primitive.log_likelihoods(inputs=attributes, outputs=targets).value + + self.assertEqual(list(log_likelihoods.columns), ['species']) + + self.assertEqual(log_likelihoods.shape, (150, 1)) + self.assertEqual(log_likelihoods.metadata.query_column(0)['name'], 'species') + + log_likelihood = primitive.log_likelihood(inputs=attributes, outputs=targets).value + + self.assertEqual(list(log_likelihood.columns), ['species']) + + self.assertEqual(log_likelihood.shape, (1, 1)) + self.assertAlmostEqual(log_likelihood.iloc[0, 0], -3.72702785304761) + 
self.assertEqual(log_likelihoods.metadata.query_column(0)['name'], 'species') + + feature_importances = primitive.produce_feature_importances().value + + self.assertEqual(list(feature_importances), ['sepalLength', 'sepalWidth', 'petalLength', 'petalWidth']) + self.assertEqual(feature_importances.metadata.query_column(0)['name'], 'sepalLength') + self.assertEqual(feature_importances.metadata.query_column(1)['name'], 'sepalWidth') + self.assertEqual(feature_importances.metadata.query_column(2)['name'], 'petalLength') + self.assertEqual(feature_importances.metadata.query_column(3)['name'], 'petalWidth') + + self.assertEqual(feature_importances.values.tolist(), [[0.09090795402103087, + 0.024531041234715757, + 0.46044473961715215, + 0.42411626512710127, + ]]) + + def _test_single_target_metadata(self, predictions_metadata): + expected_metadata = [{ + 'selector': [], + 'metadata': { + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 150, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 1, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': { + 'structural_type': 'str', + 'name': 'species', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/CategoricalData', 'https://metadata.datadrivendiscovery.org/types/SuggestedTarget', 'https://metadata.datadrivendiscovery.org/types/Target', 'https://metadata.datadrivendiscovery.org/types/PredictedTarget'], + 'custom_metadata': 'targets', + }, + }] + + self.assertEqual(utils.to_json_structure(predictions_metadata.to_internal_simple_structure()), expected_metadata) + + def test_multiple_targets(self): + dataframe, attributes, targets = self._get_iris_columns() + + targets = targets.append_columns(targets) + + self.assertEqual(list(targets.columns), ['species', 'species']) + + hyperparams_class = random_forest.RandomForestClassifierPrimitive.metadata.get_hyperparams() + primitive = random_forest.RandomForestClassifierPrimitive(hyperparams=hyperparams_class.defaults().replace({'return_result': 'new', 'add_index_columns': False})) + + primitive.set_training_data(inputs=attributes, outputs=targets) + primitive.fit() + + predictions = primitive.produce(inputs=attributes).value + + self.assertEqual(list(predictions.columns), ['species', 'species']) + + self.assertEqual(predictions.shape, (150, 2)) + for column_index in range(2): + self.assertEqual(predictions.iloc[0, column_index], 'Iris-setosa') + self.assertTrue(predictions.metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, column_index), 'https://metadata.datadrivendiscovery.org/types/PredictedTarget')) + self.assertFalse(predictions.metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, column_index), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')) + self.assertEqual(predictions.metadata.query_column(column_index)['name'], 'species') + self.assertEqual(predictions.metadata.query_column(column_index)['custom_metadata'], 'targets') + + samples = primitive.sample(inputs=attributes).value + + self.assertEqual(list(samples[0].columns), ['species', 'species']) + + self.assertEqual(len(samples), 1) + self.assertEqual(samples[0].shape, (150, 2)) + for 
column_index in range(2): + self.assertEqual(samples[0].iloc[0, column_index], 'Iris-setosa') + self.assertTrue(samples[0].metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, column_index), 'https://metadata.datadrivendiscovery.org/types/PredictedTarget')) + self.assertFalse(samples[0].metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, column_index), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')) + self.assertEqual(samples[0].metadata.query_column(column_index)['name'], 'species') + self.assertEqual(samples[0].metadata.query_column(column_index)['custom_metadata'], 'targets') + + log_likelihoods = primitive.log_likelihoods(inputs=attributes, outputs=targets).value + + self.assertEqual(list(log_likelihoods.columns), ['species', 'species']) + + self.assertEqual(log_likelihoods.shape, (150, 2)) + for column_index in range(2): + self.assertEqual(log_likelihoods.metadata.query_column(column_index)['name'], 'species') + + log_likelihood = primitive.log_likelihood(inputs=attributes, outputs=targets).value + + self.assertEqual(list(log_likelihood.columns), ['species', 'species']) + + self.assertEqual(log_likelihood.shape, (1, 2)) + for column_index in range(2): + self.assertAlmostEqual(log_likelihood.iloc[0, column_index], -3.72702785304761) + self.assertEqual(log_likelihoods.metadata.query_column(column_index)['name'], 'species') + + feature_importances = primitive.produce_feature_importances().value + + self.assertEqual(list(feature_importances), ['sepalLength', 'sepalWidth', 'petalLength', 'petalWidth']) + self.assertEqual(feature_importances.metadata.query_column(0)['name'], 'sepalLength') + self.assertEqual(feature_importances.metadata.query_column(1)['name'], 'sepalWidth') + self.assertEqual(feature_importances.metadata.query_column(2)['name'], 'petalLength') + self.assertEqual(feature_importances.metadata.query_column(3)['name'], 'petalWidth') + + self.assertEqual(feature_importances.values.tolist(), [[0.09090795402103087, + 0.024531041234715757, + 0.46044473961715215, + 0.42411626512710127, + ]]) + + def test_semantic_types(self): + dataframe, attributes, targets = self._get_iris_columns() + + hyperparams_class = random_forest.RandomForestClassifierPrimitive.metadata.get_hyperparams() + primitive = random_forest.RandomForestClassifierPrimitive(hyperparams=hyperparams_class.defaults().replace({'return_result': 'new', 'add_index_columns': False})) + + primitive.set_training_data(inputs=dataframe, outputs=dataframe) + primitive.fit() + + predictions = primitive.produce(inputs=dataframe).value + + self.assertEqual(list(predictions.columns), ['species']) + + self.assertEqual(predictions.shape, (150, 1)) + self.assertEqual(predictions.iloc[0, 0], 'Iris-setosa') + self.assertTrue(predictions.metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 0), 'https://metadata.datadrivendiscovery.org/types/PredictedTarget')) + self.assertFalse(predictions.metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 0), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')) + self.assertEqual(predictions.metadata.query_column(0)['name'], 'species') + self.assertEqual(predictions.metadata.query_column(0)['custom_metadata'], 'targets') + + samples = primitive.sample(inputs=dataframe).value + + self.assertEqual(list(samples[0].columns), ['species']) + + self.assertEqual(len(samples), 1) + self.assertEqual(samples[0].shape, (150, 1)) + self.assertEqual(samples[0].iloc[0, 0], 'Iris-setosa') + self.assertTrue(samples[0].metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 0), 
'https://metadata.datadrivendiscovery.org/types/PredictedTarget')) + self.assertFalse(samples[0].metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 0), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')) + self.assertEqual(samples[0].metadata.query_column(0)['name'], 'species') + self.assertEqual(samples[0].metadata.query_column(0)['custom_metadata'], 'targets') + + log_likelihoods = primitive.log_likelihoods(inputs=dataframe, outputs=dataframe).value + + self.assertEqual(list(log_likelihoods.columns), ['species']) + + self.assertEqual(log_likelihoods.shape, (150, 1)) + self.assertEqual(log_likelihoods.metadata.query_column(0)['name'], 'species') + + log_likelihood = primitive.log_likelihood(inputs=dataframe, outputs=dataframe).value + + self.assertEqual(list(log_likelihood.columns), ['species']) + + self.assertEqual(log_likelihood.shape, (1, 1)) + self.assertAlmostEqual(log_likelihood.iloc[0, 0], -3.72702785304761) + self.assertEqual(log_likelihoods.metadata.query_column(0)['name'], 'species') + + feature_importances = primitive.produce_feature_importances().value + + self.assertEqual(list(feature_importances), ['sepalLength', 'sepalWidth', 'petalLength', 'petalWidth']) + self.assertEqual(feature_importances.metadata.query_column(0)['name'], 'sepalLength') + self.assertEqual(feature_importances.metadata.query_column(1)['name'], 'sepalWidth') + self.assertEqual(feature_importances.metadata.query_column(2)['name'], 'petalLength') + self.assertEqual(feature_importances.metadata.query_column(3)['name'], 'petalWidth') + + self.assertEqual(feature_importances.values.tolist(), [[0.09090795402103087, + 0.024531041234715757, + 0.46044473961715215, + 0.42411626512710127, + ]]) + + def test_return_append(self): + dataframe, attributes, targets = self._get_iris_columns() + + hyperparams_class = random_forest.RandomForestClassifierPrimitive.metadata.get_hyperparams() + primitive = random_forest.RandomForestClassifierPrimitive(hyperparams=hyperparams_class.defaults()) + + primitive.set_training_data(inputs=dataframe, outputs=dataframe) + primitive.fit() + + predictions = primitive.produce(inputs=dataframe).value + + self.assertEqual(list(predictions.columns), [ + 'd3mIndex', + 'sepalLength', + 'sepalWidth', + 'petalLength', + 'petalWidth', + 'species', + 'species', + ]) + + self.assertEqual(predictions.shape, (150, 7)) + self.assertEqual(predictions.iloc[0, 6], 'Iris-setosa') + self.assertTrue(predictions.metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 6), 'https://metadata.datadrivendiscovery.org/types/PredictedTarget')) + self.assertFalse(predictions.metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 6), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')) + self.assertEqual(predictions.metadata.query_column(6)['name'], 'species') + self.assertEqual(predictions.metadata.query_column(6)['custom_metadata'], 'targets') + + self._test_return_append_metadata(predictions.metadata) + + def _test_return_append_metadata(self, predictions_metadata): + self.assertEqual(utils.to_json_structure(predictions_metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 150, + }, + 'schema': 'https://metadata.datadrivendiscovery.org/schemas/v0/container.json', + }, + }, { + 'selector': 
['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 7, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': { + 'name': 'd3mIndex', + 'structural_type': 'int', + 'semantic_types': ['http://schema.org/Integer', 'https://metadata.datadrivendiscovery.org/types/PrimaryKey'], + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': { + 'name': 'sepalLength', + 'structural_type': 'float', + 'semantic_types': ['http://schema.org/Float', 'https://metadata.datadrivendiscovery.org/types/Attribute'], + 'custom_metadata': 'attributes', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': { + 'name': 'sepalWidth', + 'structural_type': 'float', + 'semantic_types': ['http://schema.org/Float', 'https://metadata.datadrivendiscovery.org/types/Attribute'], + 'custom_metadata': 'attributes', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 3], + 'metadata': { + 'name': 'petalLength', + 'structural_type': 'float', + 'semantic_types': ['http://schema.org/Float', 'https://metadata.datadrivendiscovery.org/types/Attribute'], + 'custom_metadata': 'attributes', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 4], + 'metadata': { + 'name': 'petalWidth', + 'structural_type': 'float', + 'semantic_types': ['http://schema.org/Float', 'https://metadata.datadrivendiscovery.org/types/Attribute'], + 'custom_metadata': 'attributes', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 5], + 'metadata': { + 'name': 'species', + 'structural_type': 'str', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/CategoricalData', 'https://metadata.datadrivendiscovery.org/types/SuggestedTarget', 'https://metadata.datadrivendiscovery.org/types/Target', 'https://metadata.datadrivendiscovery.org/types/TrueTarget'], + 'custom_metadata': 'targets', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 6], + 'metadata': { + 'structural_type': 'str', + 'name': 'species', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/CategoricalData', 'https://metadata.datadrivendiscovery.org/types/SuggestedTarget', 'https://metadata.datadrivendiscovery.org/types/Target', 'https://metadata.datadrivendiscovery.org/types/PredictedTarget'], + 'custom_metadata': 'targets', + }, + }]) + + def test_return_new(self): + dataframe, attributes, targets = self._get_iris_columns() + + hyperparams_class = random_forest.RandomForestClassifierPrimitive.metadata.get_hyperparams() + primitive = random_forest.RandomForestClassifierPrimitive(hyperparams=hyperparams_class.defaults().replace({'return_result': 'new'})) + + primitive.set_training_data(inputs=dataframe, outputs=dataframe) + primitive.fit() + + predictions = primitive.produce(inputs=dataframe).value + + self.assertEqual(list(predictions.columns), [ + 'd3mIndex', + 'species', + ]) + + self.assertEqual(predictions.shape, (150, 2)) + self.assertEqual(predictions.iloc[0, 1], 'Iris-setosa') + self.assertTrue(predictions.metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 1), 'https://metadata.datadrivendiscovery.org/types/PredictedTarget')) + self.assertFalse(predictions.metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 1), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')) + self.assertEqual(predictions.metadata.query_column(1)['name'], 'species') + self.assertEqual(predictions.metadata.query_column(1)['custom_metadata'], 'targets') + + self._test_return_new_metadata(predictions.metadata) + + def 
_test_return_new_metadata(self, predictions_metadata): + expected_metadata = [{ + 'selector': [], + 'metadata': { + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 150, + }, + 'schema': 'https://metadata.datadrivendiscovery.org/schemas/v0/container.json', + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 2, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': { + 'name': 'd3mIndex', + 'structural_type': 'int', + 'semantic_types': ['http://schema.org/Integer', 'https://metadata.datadrivendiscovery.org/types/PrimaryKey'], + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': { + 'structural_type': 'str', + 'name': 'species', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/CategoricalData', 'https://metadata.datadrivendiscovery.org/types/SuggestedTarget', 'https://metadata.datadrivendiscovery.org/types/Target', 'https://metadata.datadrivendiscovery.org/types/PredictedTarget'], + 'custom_metadata': 'targets', + }, + }] + + self.assertEqual(utils.to_json_structure(predictions_metadata.to_internal_simple_structure()), expected_metadata) + + def test_return_replace(self): + dataframe, attributes, targets = self._get_iris_columns() + + hyperparams_class = random_forest.RandomForestClassifierPrimitive.metadata.get_hyperparams() + primitive = random_forest.RandomForestClassifierPrimitive(hyperparams=hyperparams_class.defaults().replace({'return_result': 'replace'})) + + primitive.set_training_data(inputs=dataframe, outputs=dataframe) + primitive.fit() + + predictions = primitive.produce(inputs=dataframe).value + + self.assertEqual(list(predictions.columns), [ + 'd3mIndex', + 'species', + 'species', + ]) + + self.assertEqual(predictions.shape, (150, 3)) + self.assertEqual(predictions.iloc[0, 1], 'Iris-setosa') + self.assertTrue(predictions.metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 1), 'https://metadata.datadrivendiscovery.org/types/PredictedTarget')) + self.assertFalse(predictions.metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 1), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')) + self.assertEqual(predictions.metadata.query_column(1)['name'], 'species') + self.assertEqual(predictions.metadata.query_column(1)['custom_metadata'], 'targets') + + self._test_return_replace_metadata(predictions.metadata) + + def test_get_set_params(self): + dataframe, attributes, targets = self._get_iris_columns() + + hyperparams_class = random_forest.RandomForestClassifierPrimitive.metadata.get_hyperparams() + primitive = random_forest.RandomForestClassifierPrimitive( + hyperparams=hyperparams_class.defaults().replace({'return_result': 'new', 'add_index_columns': False})) + + primitive.set_training_data(inputs=attributes, outputs=targets) + primitive.fit() + + before_set_prediction = primitive.produce(inputs=attributes).value + params = primitive.get_params() + primitive.set_params(params=params) + after_set_prediction = primitive.produce(inputs=attributes).value + self.assertTrue(container.DataFrame.equals(before_set_prediction, after_set_prediction)) + + def test_pickle_unpickle(self): + dataframe, attributes, targets = self._get_iris_columns() + + hyperparams_class = 
random_forest.RandomForestClassifierPrimitive.metadata.get_hyperparams() + primitive = random_forest.RandomForestClassifierPrimitive( + hyperparams=hyperparams_class.defaults().replace({'return_result': 'new', 'add_index_columns': False})) + + primitive.set_training_data(inputs=attributes, outputs=targets) + primitive.fit() + + before_pickled_prediction = primitive.produce(inputs=attributes).value + pickle_object = pickle.dumps(primitive) + primitive = pickle.loads(pickle_object) + after_unpickled_prediction = primitive.produce(inputs=attributes).value + self.assertTrue(container.DataFrame.equals(before_pickled_prediction, after_unpickled_prediction)) + + def _test_return_replace_metadata(self, predictions_metadata): + self.assertEqual(utils.to_json_structure(predictions_metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 150, + }, + 'schema': 'https://metadata.datadrivendiscovery.org/schemas/v0/container.json', + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': { + 'name': 'd3mIndex', + 'structural_type': 'int', + 'semantic_types': ['http://schema.org/Integer', 'https://metadata.datadrivendiscovery.org/types/PrimaryKey'], + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': { + 'structural_type': 'str', + 'name': 'species', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/CategoricalData', 'https://metadata.datadrivendiscovery.org/types/SuggestedTarget', 'https://metadata.datadrivendiscovery.org/types/Target', 'https://metadata.datadrivendiscovery.org/types/PredictedTarget'], + 'custom_metadata': 'targets', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': { + 'name': 'species', + 'structural_type': 'str', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/CategoricalData', 'https://metadata.datadrivendiscovery.org/types/SuggestedTarget', 'https://metadata.datadrivendiscovery.org/types/Target', 'https://metadata.datadrivendiscovery.org/types/TrueTarget'], + 'custom_metadata': 'targets', + }, + }]) + + def test_empty_data(self): + dataframe, attributes, targets = self._get_iris_columns() + + hyperparams_class = random_forest.RandomForestClassifierPrimitive.metadata.get_hyperparams() + primitive = random_forest.RandomForestClassifierPrimitive(hyperparams=hyperparams_class.defaults()) + + just_index_dataframe = dataframe.select_columns([0]) + no_attributes_dataframe = dataframe.select_columns([0, 5]) + + primitive.set_training_data(inputs=just_index_dataframe, outputs=just_index_dataframe) + + with self.assertRaises(Exception): + primitive.fit() + + primitive.set_training_data(inputs=no_attributes_dataframe, outputs=no_attributes_dataframe) + + with self.assertRaises(Exception): + primitive.fit() + + primitive = random_forest.RandomForestClassifierPrimitive(hyperparams=hyperparams_class.defaults().replace({ + 'error_on_no_columns': False, + 'return_result': 'replace', + })) + + primitive.set_training_data(inputs=just_index_dataframe, outputs=just_index_dataframe) + + with self.assertLogs(primitive.logger, 
level=logging.WARNING) as cm: + primitive.fit() + + self.assertEqual(len(cm.records), 2) + self.assertEqual(cm.records[0].msg, "No inputs columns.") + self.assertEqual(cm.records[1].msg, "No outputs columns.") + + # Test pickling. + pickle_object = pickle.dumps(primitive) + pickle.loads(pickle_object) + + with self.assertLogs(primitive.logger, level=logging.WARNING) as cm: + predictions = primitive.produce(inputs=just_index_dataframe).value + + self.assertEqual(len(cm.records), 1) + self.assertEqual(cm.records[0].msg, "No inputs columns.") + + self.assertEqual(list(predictions.columns), [ + 'd3mIndex', + ]) + self.assertEqual(predictions.shape, (150, 1)) + + self.assertEqual(predictions.metadata.to_internal_json_structure(), just_index_dataframe.metadata.to_internal_json_structure()) + + primitive = random_forest.RandomForestClassifierPrimitive(hyperparams=hyperparams_class.defaults().replace({ + 'error_on_no_columns': False, + 'return_result': 'replace', + })) + + primitive.set_training_data(inputs=no_attributes_dataframe, outputs=no_attributes_dataframe) + + with self.assertLogs(primitive.logger, level=logging.WARNING) as cm: + primitive.fit() + + self.assertEqual(len(cm.records), 1) + self.assertEqual(cm.records[0].msg, "No inputs columns.") + + # Test pickling. + pickle_object = pickle.dumps(primitive) + pickle.loads(pickle_object) + + with self.assertLogs(primitive.logger, level=logging.WARNING) as cm: + predictions = primitive.produce(inputs=no_attributes_dataframe).value + + self.assertEqual(len(cm.records), 1) + self.assertEqual(cm.records[0].msg, "No inputs columns.") + + self.assertEqual(list(predictions.columns), [ + 'd3mIndex', + 'species', + ]) + self.assertEqual(predictions.shape, (150, 2)) + + self.assertEqual(predictions.metadata.to_internal_json_structure(), no_attributes_dataframe.metadata.to_internal_json_structure()) + + primitive = random_forest.RandomForestClassifierPrimitive(hyperparams=hyperparams_class.defaults().replace({ + 'error_on_no_columns': False, + 'return_result': 'new', + })) + + primitive.set_training_data(inputs=no_attributes_dataframe, outputs=no_attributes_dataframe) + + with self.assertLogs(primitive.logger, level=logging.WARNING) as cm: + primitive.fit() + + self.assertEqual(len(cm.records), 1) + self.assertEqual(cm.records[0].msg, "No inputs columns.") + + # Test pickling. + pickle_object = pickle.dumps(primitive) + pickle.loads(pickle_object) + + with self.assertLogs(primitive.logger, level=logging.WARNING) as cm: + with self.assertRaises(ValueError): + primitive.produce(inputs=no_attributes_dataframe) + + self.assertEqual(len(cm.records), 1) + self.assertEqual(cm.records[0].msg, "No inputs columns.") + + primitive = random_forest.RandomForestClassifierPrimitive(hyperparams=hyperparams_class.defaults().replace({ + 'error_on_no_columns': False, + 'return_result': 'append', + })) + + primitive.set_training_data(inputs=no_attributes_dataframe, outputs=no_attributes_dataframe) + + with self.assertLogs(primitive.logger, level=logging.WARNING) as cm: + primitive.fit() + + # Test pickling. 
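+        # The dumps/loads round trip below is only a smoke test that a primitive fitted with no usable columns can still be pickled; the unpickled copy is not used afterwards.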
+ pickle_object = pickle.dumps(primitive) + pickle.loads(pickle_object) + + self.assertEqual(len(cm.records), 1) + self.assertEqual(cm.records[0].msg, "No inputs columns.") + + with self.assertLogs(primitive.logger, level=logging.WARNING) as cm: + predictions = primitive.produce(inputs=no_attributes_dataframe).value + + self.assertEqual(len(cm.records), 1) + self.assertEqual(cm.records[0].msg, "No inputs columns.") + + self.assertEqual(list(predictions.columns), [ + 'd3mIndex', + 'species', + ]) + self.assertEqual(predictions.shape, (150, 2)) + + self.assertEqual(predictions.metadata.to_internal_json_structure(), no_attributes_dataframe.metadata.to_internal_json_structure()) + + +if __name__ == '__main__': + unittest.main() diff --git a/tods/common-primitives/tests/test_ravel.py b/tods/common-primitives/tests/test_ravel.py new file mode 100644 index 0000000..33d11ac --- /dev/null +++ b/tods/common-primitives/tests/test_ravel.py @@ -0,0 +1,125 @@ +import unittest + +from d3m import container, utils + +from common_primitives import ravel + + +class RavelAsRowPrimitiveTestCase(unittest.TestCase): + def _get_data(self): + data = container.DataFrame({ + 'a': [1, 2, 3], + 'b': [container.ndarray([2, 3, 4]), container.ndarray([5, 6, 7]), container.ndarray([8, 9, 10])] + }, { + 'top_level': 'foobar1', + }, generate_metadata=True) + + data.metadata = data.metadata.update_column(1, { + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + }) + + return data + + def test_basic(self): + dataframe = container.DataFrame({ + 'a': [1, 2, 3], + 'b': ['a', 'b', 'c'] + }, { + 'top_level': 'foobar1', + }, generate_metadata=True) + + self.assertEqual(dataframe.shape, (3, 2)) + + for row_index in range(len(dataframe)): + for column_index in range(len(dataframe.columns)): + dataframe.metadata = dataframe.metadata.update((row_index, column_index), { + 'location': (row_index, column_index), + }) + + dataframe.metadata.check(dataframe) + + hyperparams = ravel.RavelAsRowPrimitive.metadata.get_hyperparams() + primitive = ravel.RavelAsRowPrimitive(hyperparams=hyperparams.defaults()) + dataframe = primitive.produce(inputs=dataframe).value + + self.assertEqual(dataframe.shape, (1, 6)) + + self.assertEqual(dataframe.values.tolist(), [[1, 'a', 2, 'b', 3, 'c']]) + self.assertEqual(list(dataframe.columns), ['a', 'b', 'a', 'b', 'a', 'b']) + + self.assertEqual(utils.to_json_structure(dataframe.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + 'dimension': { + 'length': 1, + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + }, + 'schema': 'https://metadata.datadrivendiscovery.org/schemas/v0/container.json', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'structural_type': 'd3m.container.pandas.DataFrame', + 'top_level': 'foobar1', + }, + }, + { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'length': 6, + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + }, + }, + }, + { + 'selector': [0, 0], + 'metadata': { + 'location': [0, 0], + 'name': 'a', + 'structural_type': 'numpy.int64', + }, + }, + { + 'selector': [0, 1], + 'metadata': { + 'location': [0, 1], + 'name': 'b', + 'structural_type': 'str', + }, + }, + { + 'selector': [0, 2], + 'metadata': { + 'location': [1, 0], + 'name': 'a', + 'structural_type': 'numpy.int64', + }, + }, + { + 'selector': [0, 3], + 'metadata': { + 'location': [1, 1], + 'name': 'b', + 
'structural_type': 'str', + }, + }, + { + 'selector': [0, 4], + 'metadata': { + 'location': [2, 0], + 'name': 'a', + 'structural_type': 'numpy.int64', + }, + }, + { + 'selector': [0, 5], + 'metadata': { + 'location': [2, 1], + 'name': 'b', + 'structural_type': 'str', + }, + }]) + + +if __name__ == '__main__': + unittest.main() diff --git a/tods/common-primitives/tests/test_redact_columns.py b/tods/common-primitives/tests/test_redact_columns.py new file mode 100644 index 0000000..5bd5df0 --- /dev/null +++ b/tods/common-primitives/tests/test_redact_columns.py @@ -0,0 +1,173 @@ +import os +import unittest + +from d3m import container, utils +from d3m.metadata import base as metadata_base + +from common_primitives import redact_columns + + +class RedactColumnsPrimitiveTestCase(unittest.TestCase): + def _get_datasets(self): + dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'datasets', 'iris_dataset_1', 'datasetDoc.json')) + + dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path)) + + # We set semantic types like runtime would. + dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Target') + dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/TrueTarget') + dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Attribute') + + datasets = container.List([dataset], { + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': container.List, + 'dimension': { + 'length': 1, + }, + }, generate_metadata=False) + + # We update metadata based on metadata of each dataset. + # TODO: In the future this might be done automatically by generate_metadata. 
+ # See: https://gitlab.com/datadrivendiscovery/d3m/issues/119 + for index, dataset in enumerate(datasets): + datasets.metadata = dataset.metadata.copy_to(datasets.metadata, (), (index,)) + + return dataset_doc_path, datasets + + def test_basic(self): + dataset_doc_path, datasets = self._get_datasets() + + hyperparams_class = redact_columns.RedactColumnsPrimitive.metadata.get_hyperparams() + + primitive = redact_columns.RedactColumnsPrimitive(hyperparams=hyperparams_class.defaults().replace({ + 'semantic_types': ('https://metadata.datadrivendiscovery.org/types/TrueTarget',), + 'add_semantic_types': ('https://metadata.datadrivendiscovery.org/types/RedactedTarget', 'https://metadata.datadrivendiscovery.org/types/MissingData'), + })) + redacted_datasets = primitive.produce(inputs=datasets).value + + self.assertTrue(len(redacted_datasets), 1) + + redacted_dataset = redacted_datasets[0] + + self.assertIsInstance(redacted_dataset, container.Dataset) + self.assertEqual(redacted_dataset['learningData']['species'].values.tolist(), [''] * 150) + + self._test_metadata(redacted_datasets.metadata, dataset_doc_path, True) + self._test_metadata(redacted_dataset.metadata, dataset_doc_path, False) + + def _test_metadata(self, metadata, dataset_doc_path, is_list): + top_metadata = { + 'structural_type': 'd3m.container.dataset.Dataset', + 'id': 'iris_dataset_1', + 'version': '4.0.0', + 'name': 'Iris Dataset', + 'location_uris': [ + 'file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path), + ], + 'dimension': { + 'name': 'resources', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/DatasetResource'], + 'length': 1, + }, + 'digest': '49404bf166238fbdac2b6d6baa899a0d1bf8ed5976525fa7353fd732ac218a85', + 'source': { + 'license': 'CC', + 'redacted': False, + 'human_subjects_research': False, + }, + } + + if is_list: + prefix = [0] + list_metadata = [{ + 'selector': [], + 'metadata': { + 'dimension': { + 'length': 1, + }, + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.list.List', + }, + }] + else: + prefix = [] + list_metadata = [] + top_metadata['schema'] = metadata_base.CONTAINER_SCHEMA_VERSION + + self.assertEqual(utils.to_json_structure(metadata.to_internal_simple_structure()), list_metadata + [{ + 'selector': prefix + [], + 'metadata': top_metadata, + }, { + 'selector': prefix + ['learningData'], + 'metadata': { + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table', 'https://metadata.datadrivendiscovery.org/types/DatasetEntryPoint'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 150, + }, + }, + }, { + 'selector': prefix + ['learningData', '__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 6, + }, + }, + }, { + 'selector': prefix + ['learningData', '__ALL_ELEMENTS__', 0], + 'metadata': { + 'name': 'd3mIndex', + 'structural_type': 'str', + 'semantic_types': ['http://schema.org/Integer', 'https://metadata.datadrivendiscovery.org/types/PrimaryKey'], + }, + }, { + 'selector': prefix + ['learningData', '__ALL_ELEMENTS__', 1], + 'metadata': { + 'name': 'sepalLength', + 'structural_type': 'str', + 'semantic_types': ['http://schema.org/Float', 'https://metadata.datadrivendiscovery.org/types/Attribute'], + }, + }, { + 'selector': prefix + ['learningData', 
'__ALL_ELEMENTS__', 2], + 'metadata': { + 'name': 'sepalWidth', + 'structural_type': 'str', + 'semantic_types': ['http://schema.org/Float', 'https://metadata.datadrivendiscovery.org/types/Attribute'], + }, + }, { + 'selector': prefix + ['learningData', '__ALL_ELEMENTS__', 3], + 'metadata': { + 'name': 'petalLength', + 'structural_type': 'str', + 'semantic_types': ['http://schema.org/Float', 'https://metadata.datadrivendiscovery.org/types/Attribute'], + }, + }, { + 'selector': prefix + ['learningData', '__ALL_ELEMENTS__', 4], + 'metadata': { + 'name': 'petalWidth', + 'structural_type': 'str', + 'semantic_types': ['http://schema.org/Float', 'https://metadata.datadrivendiscovery.org/types/Attribute'], + }, + }, { + 'selector': prefix + ['learningData', '__ALL_ELEMENTS__', 5], + 'metadata': { + 'name': 'species', + 'structural_type': 'str', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/CategoricalData', + 'https://metadata.datadrivendiscovery.org/types/SuggestedTarget', + 'https://metadata.datadrivendiscovery.org/types/Target', + 'https://metadata.datadrivendiscovery.org/types/TrueTarget', + 'https://metadata.datadrivendiscovery.org/types/RedactedTarget', + 'https://metadata.datadrivendiscovery.org/types/MissingData', + ], + }, + }]) + + +if __name__ == '__main__': + unittest.main() diff --git a/tods/common-primitives/tests/test_regex_filter.py b/tods/common-primitives/tests/test_regex_filter.py new file mode 100644 index 0000000..42e0d71 --- /dev/null +++ b/tods/common-primitives/tests/test_regex_filter.py @@ -0,0 +1,114 @@ +import unittest +import os + +from common_primitives import regex_filter +from d3m import container, exceptions + +import utils as test_utils + + +class RegexFilterPrimitiveTestCase(unittest.TestCase): + def test_inclusive(self): + dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'datasets', 'database_dataset_1', 'datasetDoc.json')) + dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path)) + resource = test_utils.get_dataframe(dataset) + + filter_hyperparams_class = regex_filter.RegexFilterPrimitive.metadata.get_hyperparams() + hp = filter_hyperparams_class({ + 'column': 1, + 'inclusive': True, + 'regex': 'AAA' + }) + + filter_primitive = regex_filter.RegexFilterPrimitive(hyperparams=hp) + new_df = filter_primitive.produce(inputs=resource).value + + matches = new_df[new_df['code'].str.match('AAA')] + self.assertTrue(matches['code'].unique() == ['AAA']) + + def test_exclusive(self): + dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'datasets', 'database_dataset_1', 'datasetDoc.json')) + dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path)) + resource = test_utils.get_dataframe(dataset) + + filter_hyperparams_class = regex_filter.RegexFilterPrimitive.metadata.get_hyperparams() + hp = filter_hyperparams_class({ + 'column': 1, + 'inclusive': False, + 'regex': 'AAA' + }) + + filter_primitive = regex_filter.RegexFilterPrimitive(hyperparams=hp) + new_df = filter_primitive.produce(inputs=resource).value + + matches = new_df[~new_df['code'].str.match('AAA')] + self.assertTrue(set(matches['code'].unique()) == set(['BBB', 'CCC'])) + + def test_numeric(self): + dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'datasets', 'database_dataset_1', 'datasetDoc.json')) + dataset = 
container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path)) + resource = test_utils.get_dataframe(dataset) + + # set dataframe type to int to match output of a prior parse columns step + resource.iloc[:,3] = resource.iloc[:,3].astype(int) + + filter_hyperparams_class = regex_filter.RegexFilterPrimitive.metadata.get_hyperparams() + hp = filter_hyperparams_class({ + 'column': 3, + 'inclusive': False, + 'regex': '1990' + }) + + filter_primitive = regex_filter.RegexFilterPrimitive(hyperparams=hp) + new_df = filter_primitive.produce(inputs=resource).value + + matches = new_df[~new_df['year'].astype(str).str.match('1990')] + self.assertTrue(set(matches['year'].unique()) == set([2000, 2010])) + + def test_row_metadata_removal(self): + dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'datasets', 'database_dataset_1', 'datasetDoc.json')) + dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path)) + + # add metadata for rows 1 and 2 + dataset.metadata = dataset.metadata.update(('learningData', 1), {'a': 0}) + dataset.metadata = dataset.metadata.update(('learningData', 2), {'b': 1}) + + resource = test_utils.get_dataframe(dataset) + + filter_hyperparams_class = regex_filter.RegexFilterPrimitive.metadata.get_hyperparams() + hp = filter_hyperparams_class({ + 'column': 1, + 'inclusive': False, + 'regex': 'AAA' + }) + + filter_primitive = regex_filter.RegexFilterPrimitive(hyperparams=hp) + new_df = filter_primitive.produce(inputs=resource).value + + # verify that the length is correct + self.assertEqual(len(new_df), new_df.metadata.query(())['dimension']['length']) + + # verify that the rows were re-indexed in the metadata + self.assertEqual(new_df.metadata.query((0,))['a'], 0) + self.assertEqual(new_df.metadata.query((1,))['b'], 1) + self.assertFalse('b' in new_df.metadata.query((2,))) + + def test_bad_regex(self): + dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'datasets', 'database_dataset_1', 'datasetDoc.json')) + dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path)) + resource = test_utils.get_dataframe(dataset) + + filter_hyperparams_class = regex_filter.RegexFilterPrimitive.metadata.get_hyperparams() + hp = filter_hyperparams_class({ + 'column': 1, + 'inclusive': True, + 'regex': '[' + }) + + filter_primitive = regex_filter.RegexFilterPrimitive(hyperparams=hp) + with self.assertRaises(exceptions.InvalidArgumentValueError): + filter_primitive.produce(inputs=resource) + + +if __name__ == '__main__': + unittest.main() diff --git a/tods/common-primitives/tests/test_remove_duplicate_columns.py b/tods/common-primitives/tests/test_remove_duplicate_columns.py new file mode 100644 index 0000000..3713751 --- /dev/null +++ b/tods/common-primitives/tests/test_remove_duplicate_columns.py @@ -0,0 +1,123 @@ +import unittest + +from d3m import container, utils +from d3m.metadata import base as metadata_base + +from common_primitives import remove_duplicate_columns + + +class RemoveDuplicateColumnsPrimitiveTestCase(unittest.TestCase): + def test_basic(self): + main = container.DataFrame({'a1': [1, 2, 3], 'b1': [4, 5, 6], 'a2': [1, 2, 3], 'c1': [7, 8, 9], 'a3': [1, 2, 3], 'a1a': [1, 2, 3]}, { + 'top_level': 'main', + }, columns=['a1', 'b1', 'a2', 'c1', 'a3', 'a1a'], generate_metadata=True) + main.metadata = main.metadata.update_column(0, {'name': 'aaa111'}) + main.metadata = main.metadata.update_column(1, {'name': 
'bbb111'}) + main.metadata = main.metadata.update_column(2, {'name': 'aaa222'}) + main.metadata = main.metadata.update_column(3, {'name': 'ccc111'}) + main.metadata = main.metadata.update_column(4, {'name': 'aaa333'}) + main.metadata = main.metadata.update_column(5, {'name': 'aaa111'}) + + self.assertEqual(utils.to_json_structure(main.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + 'top_level': 'main', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 6, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': {'structural_type': 'numpy.int64', 'name': 'aaa111'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': {'structural_type': 'numpy.int64', 'name': 'bbb111'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': {'structural_type': 'numpy.int64', 'name': 'aaa222'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 3], + 'metadata': {'structural_type': 'numpy.int64', 'name': 'ccc111'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 4], + 'metadata': {'structural_type': 'numpy.int64', 'name': 'aaa333'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 5], + 'metadata': {'structural_type': 'numpy.int64', 'name': 'aaa111'}, + }]) + + hyperparams_class = remove_duplicate_columns.RemoveDuplicateColumnsPrimitive.metadata.get_hyperparams() + primitive = remove_duplicate_columns.RemoveDuplicateColumnsPrimitive(hyperparams=hyperparams_class.defaults()) + primitive.set_training_data(inputs=main) + primitive.fit() + new_main = primitive.produce(inputs=main).value + + self.assertEqual(new_main.values.tolist(), [ + [1, 4, 7], + [2, 5, 8], + [3, 6, 9], + ]) + + self.assertEqual(utils.to_json_structure(new_main.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + 'top_level': 'main', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': { + 'name': 'aaa111', + 'other_names': ['aaa222', 'aaa333'], + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': { + 'name': 'bbb111', + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': { + 'name': 'ccc111', + 'structural_type': 'numpy.int64', + }, + }]) + + params = primitive.get_params() + primitive.set_params(params=params) + + +if __name__ == '__main__': + unittest.main() diff --git a/tods/common-primitives/tests/test_rename_duplicate_columns.py b/tods/common-primitives/tests/test_rename_duplicate_columns.py new file mode 100644 index 0000000..90cc522 --- /dev/null +++ 
b/tods/common-primitives/tests/test_rename_duplicate_columns.py @@ -0,0 +1,136 @@ +import os +import unittest + +import pandas as pd + +from d3m import container +from d3m.metadata import base as metadata_base + +from common_primitives import dataset_to_dataframe, column_parser, rename_duplicate_columns + + +class RenameDuplicateColumnsPrimitiveTestCase(unittest.TestCase): + def _get_iris(self): + dataset_doc_path = os.path.abspath( + os.path.join(os.path.dirname(__file__), 'data', 'datasets', 'iris_dataset_1', 'datasetDoc.json')) + + dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path)) + + hyperparams_class = \ + dataset_to_dataframe.DatasetToDataFramePrimitive.metadata.query()['primitive_code']['class_type_arguments'][ + 'Hyperparams'] + primitive = dataset_to_dataframe.DatasetToDataFramePrimitive(hyperparams=hyperparams_class.defaults()) + + dataframe = primitive.produce(inputs=dataset).value + + return dataframe + + def _get_iris_columns(self): + dataframe = self._get_iris() + # We set semantic types like runtime would. + dataframe.metadata = dataframe.metadata.add_semantic_type((metadata_base.ALL_ELEMENTS, 5), + 'https://metadata.datadrivendiscovery.org/types/Target') + dataframe.metadata = dataframe.metadata.add_semantic_type((metadata_base.ALL_ELEMENTS, 5), + 'https://metadata.datadrivendiscovery.org/types/TrueTarget') + dataframe.metadata = dataframe.metadata.remove_semantic_type((metadata_base.ALL_ELEMENTS, 5), + 'https://metadata.datadrivendiscovery.org/types/Attribute') + + # Parsing. + hyperparams_class = \ + column_parser.ColumnParserPrimitive.metadata.query()['primitive_code']['class_type_arguments'][ + 'Hyperparams'] + primitive = column_parser.ColumnParserPrimitive(hyperparams=hyperparams_class.defaults()) + dataframe = primitive.produce(inputs=dataframe).value + + return dataframe + + def test_basic(self): + test_data_inputs = {'col1': [1.0, 2.0, 3.0], + 'col2': [4.0, 5.0, 6.0], + 'col3': [100, 200, 300]} + dataframe_inputs = container.DataFrame.from_dict(data=test_data_inputs) + test_data_inputs_dup = {'col1': [1.0, 2.0, 3.0], + 'col2': [4.0, 5.0, 6.0]} + dataframe_inputs_dup = container.DataFrame.from_dict(data=test_data_inputs_dup) + test_data_inputs_dup_2 = {'col1': [1.0, 2.0, 3.0], + 'col2': [4.0, 5.0, 6.0], + 'col3': [100, 200, 300]} + dataframe_inputs_dup_2 = container.DataFrame.from_dict(data=test_data_inputs_dup_2) + input = pd.concat([dataframe_inputs, dataframe_inputs_dup, dataframe_inputs_dup_2], axis=1) + + hyperparams_class = rename_duplicate_columns.RenameDuplicateColumnsPrimitive.metadata.query()['primitive_code'][ + 'class_type_arguments']['Hyperparams'] + + primitive = rename_duplicate_columns.RenameDuplicateColumnsPrimitive(hyperparams=hyperparams_class.defaults()) + + call_result = primitive.produce(inputs=input) + dataframe_renamed = call_result.value + self.assertEqual(dataframe_renamed.columns.values.tolist(), + ['col1', 'col2', 'col3', 'col1.1', 'col2.1', 'col1.2', 'col2.2', 'col3.1']) + + def test_monotonic_dup_col_name(self): + """This test is added because of issue #73""" + test_data_inputs = {'a': [1.0, 2.0, 3.0], + 'b': [100, 200, 300]} + dataframe_inputs = container.DataFrame.from_dict(data=test_data_inputs) + test_data_inputs_dup = {'b': [1.0, 2.0, 3.0], + 'c': [4.0, 5.0, 6.0]} + dataframe_inputs_dup = container.DataFrame.from_dict(data=test_data_inputs_dup) + input = pd.concat([dataframe_inputs, dataframe_inputs_dup], axis=1) + + hyperparams_class = 
rename_duplicate_columns.RenameDuplicateColumnsPrimitive.metadata.query()['primitive_code'][ + 'class_type_arguments']['Hyperparams'] + + primitive = rename_duplicate_columns.RenameDuplicateColumnsPrimitive(hyperparams=hyperparams_class.defaults()) + + call_result = primitive.produce(inputs=input) + dataframe_renamed = call_result.value + self.assertEqual(dataframe_renamed.columns.values.tolist(), + ['a', 'b', 'b.1', 'c']) + + def test_no_change(self): + test_data_inputs = {'col0': [1.0, 2.0, 3.0], + 'col1': [4.0, 5.0, 6.0], + 'col2': [100, 200, 300]} + dataframe_inputs = container.DataFrame.from_dict(data=test_data_inputs) + test_data_inputs = {'col3': [1.0, 2.0, 3.0], + 'col4': [4.0, 5.0, 6.0], + 'col5': [100, 200, 300]} + dataframe_inputs_2 = container.DataFrame.from_dict(data=test_data_inputs) + + inputs = pd.concat([dataframe_inputs, dataframe_inputs_2], axis=1) + hyperparams_class = rename_duplicate_columns.RenameDuplicateColumnsPrimitive.metadata.query()['primitive_code'][ + 'class_type_arguments']['Hyperparams'] + + primitive = rename_duplicate_columns.RenameDuplicateColumnsPrimitive(hyperparams=hyperparams_class.defaults()) + + call_result = primitive.produce(inputs=inputs) + dataframe_renamed = call_result.value + + self.assertEqual(dataframe_renamed.columns.values.tolist(), + ['col0', 'col1', 'col2', 'col3', 'col4', 'col5']) + + def test_iris_with_metadata(self): + dataframe = self._get_iris_columns() + dataframe_1 = self._get_iris_columns() + dataframe_concated = dataframe.append_columns(dataframe_1) + dataframe_concated_bk = dataframe_concated.copy() + hyperparams_class = rename_duplicate_columns.RenameDuplicateColumnsPrimitive.metadata.query()['primitive_code'][ + 'class_type_arguments']['Hyperparams'] + + primitive = rename_duplicate_columns.RenameDuplicateColumnsPrimitive(hyperparams=hyperparams_class.defaults()) + + call_result = primitive.produce(inputs=dataframe_concated) + dataframe_renamed = call_result.value + names = ['d3mIndex', 'sepalLength', 'sepalWidth', 'petalLength', 'petalWidth', 'species', + 'd3mIndex.1', 'sepalLength.1', 'sepalWidth.1', 'petalLength.1', 'petalWidth.1', + 'species.1'] + self.assertEqual(dataframe_renamed.columns.values.tolist(), names) + self.assertTrue(dataframe_concated.equals(dataframe_concated_bk)) + self.assertTrue(dataframe_concated.metadata.to_internal_json_structure(), + dataframe_concated_bk.metadata.to_internal_json_structure()) + + for i, column_name in enumerate(dataframe_renamed.columns): + self.assertEqual(dataframe_renamed.metadata.query_column(i)['other_name'], + column_name.split(primitive.hyperparams['separator'])[0]) + self.assertEqual(dataframe_renamed.metadata.query_column(i)['name'], names[i]) diff --git a/tods/common-primitives/tests/test_replace_semantic_types.py b/tods/common-primitives/tests/test_replace_semantic_types.py new file mode 100644 index 0000000..258167a --- /dev/null +++ b/tods/common-primitives/tests/test_replace_semantic_types.py @@ -0,0 +1,97 @@ +import os +import unittest + +from d3m import container, utils +from d3m.metadata import base as metadata_base + +from common_primitives import dataset_to_dataframe, replace_semantic_types + +import utils as test_utils + + +class ReplaceSemanticTypesPrimitiveTestCase(unittest.TestCase): + def _get_iris_dataframe(self): + dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'datasets', 'iris_dataset_1', 'datasetDoc.json')) + + dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path)) + 
+ hyperparams_class = dataset_to_dataframe.DatasetToDataFramePrimitive.metadata.get_hyperparams() + + primitive = dataset_to_dataframe.DatasetToDataFramePrimitive(hyperparams=hyperparams_class.defaults()) + + call_metadata = primitive.produce(inputs=dataset) + + dataframe = call_metadata.value + + return dataframe + + def test_basic(self): + dataframe = self._get_iris_dataframe() + + hyperparams_class = replace_semantic_types.ReplaceSemanticTypesPrimitive.metadata.get_hyperparams() + primitive = replace_semantic_types.ReplaceSemanticTypesPrimitive(hyperparams=hyperparams_class.defaults().replace({ + 'from_semantic_types': ('https://metadata.datadrivendiscovery.org/types/SuggestedTarget',), + 'to_semantic_types': ('https://metadata.datadrivendiscovery.org/types/Attribute',), + })) + + outputs = primitive.produce(inputs=dataframe).value + + self._test_metadata(outputs.metadata) + + def _test_metadata(self, metadata): + self.maxDiff = None + + self.assertEqual(test_utils.convert_through_json(metadata.query(())), { + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/Table', + ], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 150, + } + }) + + self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS,))), { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 6, + } + }) + + self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS, 0))), { + 'name': 'd3mIndex', + 'structural_type': 'str', + 'semantic_types': [ + 'http://schema.org/Integer', + 'https://metadata.datadrivendiscovery.org/types/PrimaryKey', + ], + }) + + for i in range(1, 5): + self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS, i))), { + 'name': ['sepalLength', 'sepalWidth', 'petalLength', 'petalWidth'][i - 1], + 'structural_type': 'str', + 'semantic_types': [ + 'http://schema.org/Float', + 'https://metadata.datadrivendiscovery.org/types/Attribute', + ], + }, i) + + self.assertEqual(test_utils.convert_through_json(metadata.query((metadata_base.ALL_ELEMENTS, 5))), { + 'name': 'species', + 'structural_type': 'str', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/CategoricalData', + 'https://metadata.datadrivendiscovery.org/types/Attribute', + ], + }) + + self.assertTrue(metadata.get_elements((metadata_base.ALL_ELEMENTS,)) in [[0, 1, 2, 3, 4, 5], [metadata_base.ALL_ELEMENTS, 0, 1, 2, 3, 4, 5]]) + + +if __name__ == '__main__': + unittest.main() diff --git a/tods/common-primitives/tests/test_simple_profiler.py b/tods/common-primitives/tests/test_simple_profiler.py new file mode 100644 index 0000000..b9a6706 --- /dev/null +++ b/tods/common-primitives/tests/test_simple_profiler.py @@ -0,0 +1,446 @@ +import os.path +import pickle +import unittest + +from d3m import container +from d3m.metadata import base as metadata_base + +from common_primitives import dataset_to_dataframe, simple_profiler, train_score_split + + +class SimpleProfilerPrimitiveTestCase(unittest.TestCase): + def _get_iris(self, set_target_as_categorical): + dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'datasets', 'iris_dataset_1', 'datasetDoc.json')) + dataset = 
container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path)) + + original_metadata = dataset.metadata + + # We make a very empty metadata. + dataset.metadata = metadata_base.DataMetadata().generate(dataset) + dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 0), 'http://schema.org/Integer') + dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 0), 'https://metadata.datadrivendiscovery.org/types/PrimaryKey') + dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Attribute') + dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/SuggestedTarget') + + if set_target_as_categorical: + dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/CategoricalData') + else: + dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/UnknownType') + + return dataset, original_metadata + + def _test_metadata(self, original_metadata, dataframe_metadata, set_target_as_categorical): + for column_index in range(5): + self.assertCountEqual(original_metadata.query_column_field(column_index, 'semantic_types', at=('learningData',)), dataframe_metadata.query_column_field(column_index, 'semantic_types'), (set_target_as_categorical, column_index)) + + self.assertEqual(dataframe_metadata.query_column_field(5, 'semantic_types'), ( + 'https://metadata.datadrivendiscovery.org/types/SuggestedTarget', + 'https://metadata.datadrivendiscovery.org/types/CategoricalData', + 'https://metadata.datadrivendiscovery.org/types/Target', + 'https://metadata.datadrivendiscovery.org/types/TrueTarget', + ), set_target_as_categorical) + + def test_basic(self): + for set_target_as_categorical in [False, True]: + dataset, original_metadata = self._get_iris(set_target_as_categorical) + + hyperparams_class = dataset_to_dataframe.DatasetToDataFramePrimitive.metadata.get_hyperparams() + + primitive = dataset_to_dataframe.DatasetToDataFramePrimitive(hyperparams=hyperparams_class.defaults()) + + dataframe = primitive.produce(inputs=dataset).value + + hyperparams_class = simple_profiler.SimpleProfilerPrimitive.metadata.get_hyperparams() + + primitive = simple_profiler.SimpleProfilerPrimitive(hyperparams=hyperparams_class.defaults()) + + primitive.set_training_data(inputs=dataframe) + primitive.fit() + + primitive_pickled = pickle.dumps(primitive) + primitive = pickle.loads(primitive_pickled) + + dataframe = primitive.produce(inputs=dataframe).value + + self._test_metadata(original_metadata, dataframe.metadata, set_target_as_categorical) + + def test_small_test(self): + for set_target_as_categorical in [False, True]: + dataset, original_metadata = self._get_iris(set_target_as_categorical) + + hyperparams_class = train_score_split.TrainScoreDatasetSplitPrimitive.metadata.get_hyperparams() + + primitive = train_score_split.TrainScoreDatasetSplitPrimitive(hyperparams=hyperparams_class.defaults().replace({ + 'train_score_ratio': 0.9, + 'shuffle': True, + })) + + primitive.set_training_data(dataset=dataset) + primitive.fit() + + results = primitive.produce(inputs=container.List([0], generate_metadata=True)).value + + self.assertEqual(len(results), 
1) + + train_dataset = results[0] + + self.assertEqual(len(train_dataset['learningData']), 135) + + results = primitive.produce_score_data(inputs=container.List([0], generate_metadata=True)).value + + self.assertEqual(len(results), 1) + + score_dataset = results[0] + + self.assertEqual(len(score_dataset['learningData']), 15) + + hyperparams_class = dataset_to_dataframe.DatasetToDataFramePrimitive.metadata.get_hyperparams() + + primitive = dataset_to_dataframe.DatasetToDataFramePrimitive(hyperparams=hyperparams_class.defaults()) + + train_dataframe = primitive.produce(inputs=train_dataset).value + + score_dataframe = primitive.produce(inputs=score_dataset).value + + hyperparams_class = simple_profiler.SimpleProfilerPrimitive.metadata.get_hyperparams() + + primitive = simple_profiler.SimpleProfilerPrimitive(hyperparams=hyperparams_class.defaults()) + + primitive.set_training_data(inputs=train_dataframe) + primitive.fit() + dataframe = primitive.produce(inputs=score_dataframe).value + + self._test_metadata(original_metadata, dataframe.metadata, set_target_as_categorical) + + def _get_column_semantic_types(self, dataframe): + number_of_columns = dataframe.metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + generated_semantic_types = [ + dataframe.metadata.query((metadata_base.ALL_ELEMENTS, i))['semantic_types'] + for i in range(number_of_columns) + ] + generated_semantic_types = [sorted(x) for x in generated_semantic_types] + + return generated_semantic_types + + def test_iris_csv(self): + dataset_doc_path = os.path.abspath( + os.path.join(os.path.dirname(__file__), 'data', 'datasets', 'iris_dataset_1', 'tables', 'learningData.csv') + ) + dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path)) + + # Use profiler to assign semantic types + dataframe = self._profile_dataset(dataset=dataset) + + generated_semantic_types = self._get_column_semantic_types(dataframe) + + semantic_types = [ + [ + 'http://schema.org/Integer', + 'https://metadata.datadrivendiscovery.org/types/PrimaryKey', + ], + ['http://schema.org/Float', 'https://metadata.datadrivendiscovery.org/types/Attribute'], + ['http://schema.org/Float', 'https://metadata.datadrivendiscovery.org/types/Attribute'], + ['http://schema.org/Float', 'https://metadata.datadrivendiscovery.org/types/Attribute'], + ['http://schema.org/Float', 'https://metadata.datadrivendiscovery.org/types/Attribute'], + [ + 'https://metadata.datadrivendiscovery.org/types/Attribute', + 'https://metadata.datadrivendiscovery.org/types/CategoricalData', + ], + ] + + self.assertEqual(generated_semantic_types, semantic_types) + + def _profile_dataset(self, dataset, hyperparams=None): + if hyperparams is None: + hyperparams = {} + + hyperparams_class = dataset_to_dataframe.DatasetToDataFramePrimitive.metadata.get_hyperparams() + primitive = dataset_to_dataframe.DatasetToDataFramePrimitive(hyperparams=hyperparams_class.defaults()) + dataframe = primitive.produce(inputs=dataset).value + + hyperparams_class = simple_profiler.SimpleProfilerPrimitive.metadata.get_hyperparams() + primitive = simple_profiler.SimpleProfilerPrimitive(hyperparams=hyperparams_class.defaults().replace(hyperparams)) + primitive.set_training_data(inputs=dataframe) + primitive.fit() + + return primitive.produce(inputs=dataframe).value + + def test_boston(self): + dataset = container.dataset.Dataset.load('sklearn://boston') + + # Use profiler to assign semantic types + dataframe = self._profile_dataset(dataset=dataset) + + 
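+        # The expected semantic types below follow the profiler's heuristics: the single two-valued attribute (presumably CHAS in the original sklearn Boston columns) is typed as Boolean, the low-cardinality attribute (presumably RAD) as categorical, and the final column becomes the suggested regression target.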
generated_semantic_types = self._get_column_semantic_types(dataframe) + + semantic_types = [ + ['http://schema.org/Integer', 'https://metadata.datadrivendiscovery.org/types/PrimaryKey'], + ['http://schema.org/Float', 'https://metadata.datadrivendiscovery.org/types/Attribute'], + ['http://schema.org/Float', 'https://metadata.datadrivendiscovery.org/types/Attribute'], + ['http://schema.org/Float', 'https://metadata.datadrivendiscovery.org/types/Attribute'], + ['http://schema.org/Boolean', 'https://metadata.datadrivendiscovery.org/types/Attribute'], + ['http://schema.org/Float', 'https://metadata.datadrivendiscovery.org/types/Attribute'], + ['http://schema.org/Float', 'https://metadata.datadrivendiscovery.org/types/Attribute'], + ['http://schema.org/Float', 'https://metadata.datadrivendiscovery.org/types/Attribute'], + ['http://schema.org/Float', 'https://metadata.datadrivendiscovery.org/types/Attribute'], + [ + 'https://metadata.datadrivendiscovery.org/types/Attribute', + 'https://metadata.datadrivendiscovery.org/types/CategoricalData', + ], + ['http://schema.org/Integer', 'https://metadata.datadrivendiscovery.org/types/Attribute'], + ['http://schema.org/Float', 'https://metadata.datadrivendiscovery.org/types/Attribute'], + ['http://schema.org/Float', 'https://metadata.datadrivendiscovery.org/types/Attribute'], + ['http://schema.org/Float', 'https://metadata.datadrivendiscovery.org/types/Attribute'], + [ + 'http://schema.org/Float', + 'https://metadata.datadrivendiscovery.org/types/SuggestedTarget', + 'https://metadata.datadrivendiscovery.org/types/Target', + 'https://metadata.datadrivendiscovery.org/types/TrueTarget', + ], + ] + + self.assertEqual(generated_semantic_types, semantic_types) + + def test_diabetes(self): + dataset = container.dataset.Dataset.load('sklearn://diabetes') + + # Use profiler to assign semantic types + dataframe = self._profile_dataset(dataset=dataset) + + generated_semantic_types = self._get_column_semantic_types(dataframe) + + semantic_types = [ + ['http://schema.org/Integer', 'https://metadata.datadrivendiscovery.org/types/PrimaryKey'], + ['http://schema.org/Float', 'https://metadata.datadrivendiscovery.org/types/Attribute'], + ['http://schema.org/Float', 'https://metadata.datadrivendiscovery.org/types/Attribute'], + ['http://schema.org/Float', 'https://metadata.datadrivendiscovery.org/types/Attribute'], + ['http://schema.org/Float', 'https://metadata.datadrivendiscovery.org/types/Attribute'], + ['http://schema.org/Float', 'https://metadata.datadrivendiscovery.org/types/Attribute'], + ['http://schema.org/Float', 'https://metadata.datadrivendiscovery.org/types/Attribute'], + ['http://schema.org/Float', 'https://metadata.datadrivendiscovery.org/types/Attribute'], + ['http://schema.org/Float', 'https://metadata.datadrivendiscovery.org/types/Attribute'], + ['http://schema.org/Float', 'https://metadata.datadrivendiscovery.org/types/Attribute'], + ['http://schema.org/Float', 'https://metadata.datadrivendiscovery.org/types/Attribute'], + [ + 'http://schema.org/Integer', + 'https://metadata.datadrivendiscovery.org/types/SuggestedTarget', + 'https://metadata.datadrivendiscovery.org/types/Target', + 'https://metadata.datadrivendiscovery.org/types/TrueTarget', + ], + ] + + self.assertEqual(generated_semantic_types, semantic_types) + + def test_digits(self): + self.maxDiff = None + + dataset = container.dataset.Dataset.load('sklearn://digits') + + detect_semantic_types = 
list(simple_profiler.SimpleProfilerPrimitive.metadata.get_hyperparams().configuration['detect_semantic_types'].get_default()) + # Some pixels have very little different values. + detect_semantic_types.remove('http://schema.org/Boolean') + # There are just 16 colors, but we want to see them as integers. + detect_semantic_types.remove('https://metadata.datadrivendiscovery.org/types/CategoricalData') + + # Use profiler to assign semantic types + dataframe = self._profile_dataset(dataset=dataset, hyperparams={ + 'detect_semantic_types': detect_semantic_types, + }) + + generated_semantic_types = self._get_column_semantic_types(dataframe) + + semantic_types = ( + [['http://schema.org/Integer', 'https://metadata.datadrivendiscovery.org/types/PrimaryKey']] + + 64 + * [ + [ + 'http://schema.org/Integer', + 'https://metadata.datadrivendiscovery.org/types/Attribute', + ] + ] + + [ + [ + 'https://metadata.datadrivendiscovery.org/types/CategoricalData', + 'https://metadata.datadrivendiscovery.org/types/SuggestedTarget', + 'https://metadata.datadrivendiscovery.org/types/Target', + 'https://metadata.datadrivendiscovery.org/types/TrueTarget', + ] + ] + ) + + self.assertEqual(generated_semantic_types, semantic_types) + + def test_iris(self): + dataset = container.dataset.Dataset.load('sklearn://iris') + + # Use profiler to assign semantic types + dataframe = self._profile_dataset(dataset=dataset) + + generated_semantic_types = self._get_column_semantic_types(dataframe) + + semantic_types = [ + ['http://schema.org/Integer', 'https://metadata.datadrivendiscovery.org/types/PrimaryKey'], + [ + 'http://schema.org/Float', + 'https://metadata.datadrivendiscovery.org/types/Attribute', + ], + [ + 'http://schema.org/Float', + 'https://metadata.datadrivendiscovery.org/types/Attribute', + ], + [ + 'http://schema.org/Float', + 'https://metadata.datadrivendiscovery.org/types/Attribute', + ], + [ + 'http://schema.org/Float', + 'https://metadata.datadrivendiscovery.org/types/Attribute', + ], + [ + 'https://metadata.datadrivendiscovery.org/types/CategoricalData', + 'https://metadata.datadrivendiscovery.org/types/SuggestedTarget', + 'https://metadata.datadrivendiscovery.org/types/Target', + 'https://metadata.datadrivendiscovery.org/types/TrueTarget', + ], + ] + + self.assertEqual(generated_semantic_types, semantic_types) + + def test_breast_cancer(self): + dataset = container.dataset.Dataset.load('sklearn://breast_cancer') + + # Use profiler to assign semantic types + dataframe = self._profile_dataset(dataset=dataset) + + generated_semantic_types = self._get_column_semantic_types(dataframe) + + semantic_types = ( + [['http://schema.org/Integer', 'https://metadata.datadrivendiscovery.org/types/PrimaryKey']] + + 30 + * [ + [ + 'http://schema.org/Float', + 'https://metadata.datadrivendiscovery.org/types/Attribute', + ] + ] + + [ + [ + 'http://schema.org/Boolean', + 'https://metadata.datadrivendiscovery.org/types/SuggestedTarget', + 'https://metadata.datadrivendiscovery.org/types/Target', + 'https://metadata.datadrivendiscovery.org/types/TrueTarget', + ] + ] + ) + + self.assertEqual(generated_semantic_types, semantic_types) + + def test_linnerud(self): + dataset = container.dataset.Dataset.load('sklearn://linnerud') + + # Use profiler to assign semantic types + dataframe = self._profile_dataset(dataset=dataset) + + generated_semantic_types = self._get_column_semantic_types(dataframe) + + semantic_types = [ + ['http://schema.org/Integer', 'https://metadata.datadrivendiscovery.org/types/PrimaryKey'], + 
['http://schema.org/Integer', 'https://metadata.datadrivendiscovery.org/types/Attribute'], + ['http://schema.org/Integer', 'https://metadata.datadrivendiscovery.org/types/Attribute'], + ['http://schema.org/Integer', 'https://metadata.datadrivendiscovery.org/types/Attribute'], + [ + 'http://schema.org/Integer', + 'https://metadata.datadrivendiscovery.org/types/SuggestedTarget', + # Only the first "SuggestedTarget" column is made into a target. + 'https://metadata.datadrivendiscovery.org/types/Target', + 'https://metadata.datadrivendiscovery.org/types/TrueTarget', + ], + [ + 'http://schema.org/Integer', + 'https://metadata.datadrivendiscovery.org/types/Attribute', + 'https://metadata.datadrivendiscovery.org/types/SuggestedTarget', + ], + [ + 'http://schema.org/Integer', + 'https://metadata.datadrivendiscovery.org/types/Attribute', + 'https://metadata.datadrivendiscovery.org/types/SuggestedTarget', + ], + ] + + self.assertEqual(generated_semantic_types, semantic_types) + + def test_wine(self): + dataset = container.dataset.Dataset.load('sklearn://wine') + + # Use profiler to assign semantic types + dataframe = self._profile_dataset(dataset=dataset) + + generated_semantic_types = self._get_column_semantic_types(dataframe) + + semantic_types = [ + ['http://schema.org/Integer', 'https://metadata.datadrivendiscovery.org/types/PrimaryKey'], + [ + 'http://schema.org/Float', + 'https://metadata.datadrivendiscovery.org/types/Attribute', + ], + [ + 'http://schema.org/Float', + 'https://metadata.datadrivendiscovery.org/types/Attribute', + ], + [ + 'http://schema.org/Float', + 'https://metadata.datadrivendiscovery.org/types/Attribute', + ], + [ + 'http://schema.org/Float', + 'https://metadata.datadrivendiscovery.org/types/Attribute', + ], + [ + 'http://schema.org/Integer', + 'https://metadata.datadrivendiscovery.org/types/Attribute', + ], + [ + 'http://schema.org/Float', + 'https://metadata.datadrivendiscovery.org/types/Attribute', + ], + [ + 'http://schema.org/Float', + 'https://metadata.datadrivendiscovery.org/types/Attribute', + ], + [ + 'http://schema.org/Float', + 'https://metadata.datadrivendiscovery.org/types/Attribute', + ], + [ + 'http://schema.org/Float', + 'https://metadata.datadrivendiscovery.org/types/Attribute', + ], + [ + 'http://schema.org/Float', + 'https://metadata.datadrivendiscovery.org/types/Attribute', + ], + [ + 'http://schema.org/Float', + 'https://metadata.datadrivendiscovery.org/types/Attribute', + ], + [ + 'http://schema.org/Float', + 'https://metadata.datadrivendiscovery.org/types/Attribute', + ], + [ + 'http://schema.org/Integer', + 'https://metadata.datadrivendiscovery.org/types/Attribute', + ], + [ + 'https://metadata.datadrivendiscovery.org/types/CategoricalData', + 'https://metadata.datadrivendiscovery.org/types/SuggestedTarget', + 'https://metadata.datadrivendiscovery.org/types/Target', + 'https://metadata.datadrivendiscovery.org/types/TrueTarget', + ], + ] + + self.assertEqual(generated_semantic_types, semantic_types) + + +if __name__ == '__main__': + unittest.main() diff --git a/tods/common-primitives/tests/test_stack_ndarray_column.py b/tods/common-primitives/tests/test_stack_ndarray_column.py new file mode 100644 index 0000000..d6b3b1d --- /dev/null +++ b/tods/common-primitives/tests/test_stack_ndarray_column.py @@ -0,0 +1,77 @@ +import unittest + +from d3m import container, utils +from d3m.metadata import base as metadata_base + +from common_primitives import stack_ndarray_column + + +class StackNDArrayColumnPrimitiveTestCase(unittest.TestCase): + def 
_get_data(self): + data = container.DataFrame({ + 'a': [1, 2, 3], + 'b': [container.ndarray([2, 3, 4]), container.ndarray([5, 6, 7]), container.ndarray([8, 9, 10])] + }, { + 'top_level': 'foobar1', + }, generate_metadata=True) + + data.metadata = data.metadata.update_column(1, { + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + }) + + return data + + def test_basic(self): + data = self._get_data() + + data_metadata_before = data.metadata.to_internal_json_structure() + + stack_hyperparams_class = stack_ndarray_column.StackNDArrayColumnPrimitive.metadata.get_hyperparams() + stack_primitive = stack_ndarray_column.StackNDArrayColumnPrimitive(hyperparams=stack_hyperparams_class.defaults()) + stack_array = stack_primitive.produce(inputs=data).value + + self.assertEqual(stack_array.shape, (3, 3)) + + self._test_metadata(stack_array.metadata) + + self.assertEqual(data.metadata.to_internal_json_structure(), data_metadata_before) + + def _test_metadata(self, metadata): + self.maxDiff = None + + self.assertEqual(utils.to_json_structure(metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + 'top_level': 'foobar1', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.numpy.ndarray', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'length': 3, + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + }, + # It is unclear if name and semantic types should be moved to rows, but this is what currently happens. + 'name': 'b', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': '__NO_VALUE__', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', '__ALL_ELEMENTS__'], + 'metadata': { + 'structural_type': 'numpy.int64', + }, + }]) + + +if __name__ == '__main__': + unittest.main() diff --git a/tods/common-primitives/tests/test_tabular_extractor.py b/tods/common-primitives/tests/test_tabular_extractor.py new file mode 100644 index 0000000..29b2905 --- /dev/null +++ b/tods/common-primitives/tests/test_tabular_extractor.py @@ -0,0 +1,173 @@ +import os +import unittest + +from d3m import container, utils +from d3m.metadata import base as metadata_base + +from common_primitives import dataset_to_dataframe, column_parser, tabular_extractor + +import utils as test_utils + + +class TabularExtractorPrimitiveTestCase(unittest.TestCase): + def setUp(self): + dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'datasets', 'iris_dataset_1', 'datasetDoc.json')) + + dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path)) + + # We mark targets as attributes. 
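+        # Dropping the Target/TrueTarget semantic types and adding Attribute makes the species column look like an ordinary categorical attribute, which the extractor under test is then expected to one-hot encode (see test_defaults below).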
+ dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Target') + dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/TrueTarget') + dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Attribute') + + self.dataset = dataset + + # DatasetToDataFramePrimitive + + df_hyperparams_class = dataset_to_dataframe.DatasetToDataFramePrimitive.metadata.get_hyperparams() + + df_primitive = dataset_to_dataframe.DatasetToDataFramePrimitive(hyperparams=df_hyperparams_class.defaults()) + + df_dataframe = df_primitive.produce(inputs=self.dataset).value + + # Set some missing values. + df_dataframe.iloc[1, 1] = "" + df_dataframe.iloc[10, 1] = "" + df_dataframe.iloc[15, 1] = "" + + # ColumnParserPrimitive + + cp_hyperparams_class = column_parser.ColumnParserPrimitive.metadata.get_hyperparams() + + # To simulate how Pandas "read_csv" reads CSV files, we parse only the numeric columns. + cp_primitive = column_parser.ColumnParserPrimitive( + hyperparams=cp_hyperparams_class.defaults().replace({ + 'parse_semantic_types': ['http://schema.org/Integer', 'http://schema.org/Float'], + }), + ) + + self.dataframe = cp_primitive.produce(inputs=df_dataframe).value + + def test_defaults(self): + te_hyperparams_class = tabular_extractor.AnnotatedTabularExtractorPrimitive.metadata.get_hyperparams() + + # It one-hot encodes categorical columns, imputes numerical values, + # and adds a missing-value indicator column for each. + te_primitive = tabular_extractor.AnnotatedTabularExtractorPrimitive( + hyperparams=te_hyperparams_class.defaults(), + ) + + te_primitive.set_training_data(inputs=self.dataframe) + te_primitive.fit() + + dataframe = te_primitive.produce(inputs=self.dataframe).value + + # 1 index column, 4 numerical columns with one indicator column each, + # 3 columns for the one-hot encoding of the "target" column plus one indicator column for it.
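+ # In total: 1 + (4 + 4) + (3 + 1) = 13 columns, which the shape check below expects.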
+ self.assertEqual(dataframe.shape, (150, 13)) + + self.assertEqual(test_utils.convert_through_json(utils.to_json_structure(dataframe.metadata.to_internal_simple_structure())), [{ + 'selector': [], + 'metadata': { + 'dimension': { + 'length': 150, + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + }, + 'schema': 'https://metadata.datadrivendiscovery.org/schemas/v0/container.json', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'structural_type': 'd3m.container.pandas.DataFrame', + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'length': 13, + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': { + 'name': 'd3mIndex', + 'semantic_types': ['http://schema.org/Integer', 'https://metadata.datadrivendiscovery.org/types/PrimaryKey'], + 'structural_type': 'int', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': { + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.float64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': { + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.float64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 3], + 'metadata': { + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.float64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 4], + 'metadata': { + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.float64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 5], + 'metadata': { + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.float64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 6], + 'metadata': { + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.float64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 7], + 'metadata': { + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.float64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 8], + 'metadata': { + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.float64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 9], + 'metadata': { + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.float64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 10], + 'metadata': { + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.float64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 11], + 'metadata': { + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.float64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 12], + 'metadata': { + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.float64', + }, + }]) + + +if __name__ == '__main__': + unittest.main() diff --git a/tods/common-primitives/tests/test_term_filter.py b/tods/common-primitives/tests/test_term_filter.py new file mode 100644 index 0000000..5131238 --- /dev/null +++ b/tods/common-primitives/tests/test_term_filter.py @@ -0,0 
+1,136 @@ +import unittest +import os + +from common_primitives import term_filter +from d3m import container + +import utils as test_utils + + +class TermFilterPrimitiveTestCase(unittest.TestCase): + def test_inclusive(self): + dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'datasets', 'database_dataset_1', 'datasetDoc.json')) + dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path)) + resource = test_utils.get_dataframe(dataset) + + filter_hyperparams_class = term_filter.TermFilterPrimitive.metadata.get_hyperparams() + hp = filter_hyperparams_class({ + 'column': 1, + 'inclusive': True, + 'terms': ['AAA', 'CCC'], + 'match_whole': True + }) + + filter_primitive = term_filter.TermFilterPrimitive(hyperparams=hp) + new_df = filter_primitive.produce(inputs=resource).value + + self.assertTrue(set(new_df['code'].unique()) == set(['AAA', 'CCC'])) + + def test_exclusive(self): + dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'datasets', 'database_dataset_1', 'datasetDoc.json')) + dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path)) + resource = test_utils.get_dataframe(dataset) + + filter_hyperparams_class = term_filter.TermFilterPrimitive.metadata.get_hyperparams() + hp = filter_hyperparams_class({ + 'column': 1, + 'inclusive': False, + 'terms': ['AAA', 'CCC'], + 'match_whole': True + }) + + filter_primitive = term_filter.TermFilterPrimitive(hyperparams=hp) + new_df = filter_primitive.produce(inputs=resource).value + + self.assertTrue(set(new_df['code'].unique()) == set(['BBB'])) + + def test_numeric(self): + dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'datasets', 'database_dataset_1', 'datasetDoc.json')) + dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path)) + resource = test_utils.get_dataframe(dataset) + + # set dataframe type to int to match output of a prior parse columns step + resource.iloc[:,3] = resource.iloc[:,3].astype(int) + + filter_hyperparams_class = term_filter.TermFilterPrimitive.metadata.get_hyperparams() + hp = filter_hyperparams_class({ + 'column': 3, + 'inclusive': False, + 'terms': ['1990'], + 'match_whole': True + }) + + filter_primitive = term_filter.TermFilterPrimitive(hyperparams=hp) + new_df = filter_primitive.produce(inputs=resource).value + + matches = new_df[~new_df['year'].astype(str).str.match('1990')] + self.assertTrue(set(matches['year'].unique()) == set([2000, 2010])) + + def test_partial_no_match(self): + dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'datasets', 'database_dataset_1', 'datasetDoc.json')) + dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path)) + resource = test_utils.get_dataframe(dataset) + + filter_hyperparams_class = term_filter.TermFilterPrimitive.metadata.get_hyperparams() + hp = filter_hyperparams_class({ + 'column': 1, + 'inclusive': True, + 'terms': ['AA', 'CC'], + 'match_whole': False + }) + + filter_primitive = term_filter.TermFilterPrimitive(hyperparams=hp) + new_df = filter_primitive.produce(inputs=resource).value + + self.assertTrue(set(new_df['code'].unique()) == set(['AAA', 'CCC'])) + + def test_escaped_regex(self): + dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'datasets', 'database_dataset_1', 'datasetDoc.json')) + dataset = 
container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path)) + resource = test_utils.get_dataframe(dataset) + + filter_hyperparams_class = term_filter.TermFilterPrimitive.metadata.get_hyperparams() + hp = filter_hyperparams_class({ + 'column': 4, + 'inclusive': True, + 'terms': ['40.2'], + 'match_whole': False + }) + + filter_primitive = term_filter.TermFilterPrimitive(hyperparams=hp) + new_df = filter_primitive.produce(inputs=resource).value + + self.assertListEqual(list(new_df['value']), ['40.2346487255306']) + + def test_row_metadata_removal(self): + dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'datasets', 'database_dataset_1', 'datasetDoc.json')) + dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path)) + + # add metadata for rows 1 and 2 (after filtering they are re-indexed to rows 0 and 1) + dataset.metadata = dataset.metadata.update(('learningData', 1), {'a': 0}) + dataset.metadata = dataset.metadata.update(('learningData', 2), {'b': 1}) + + resource = test_utils.get_dataframe(dataset) + + filter_hyperparams_class = term_filter.TermFilterPrimitive.metadata.get_hyperparams() + hp = filter_hyperparams_class({ + 'column': 1, + 'inclusive': False, + 'terms': ['AAA'], + 'match_whole': True + }) + + filter_primitive = term_filter.TermFilterPrimitive(hyperparams=hp) + new_df = filter_primitive.produce(inputs=resource).value + + # verify that the length is correct + self.assertEqual(len(new_df), new_df.metadata.query(())['dimension']['length']) + + # verify that the rows were re-indexed in the metadata + self.assertEqual(new_df.metadata.query((0,))['a'], 0) + self.assertEqual(new_df.metadata.query((1,))['b'], 1) + self.assertFalse('b' in new_df.metadata.query((2,))) + + +if __name__ == '__main__': + unittest.main() diff --git a/tods/common-primitives/tests/test_text_reader.py b/tods/common-primitives/tests/test_text_reader.py new file mode 100644 index 0000000..00335be --- /dev/null +++ b/tods/common-primitives/tests/test_text_reader.py @@ -0,0 +1,30 @@ +import unittest +import os + +from d3m import container + +from common_primitives import dataset_to_dataframe, text_reader + + +class TextReaderPrimitiveTestCase(unittest.TestCase): + def test_basic(self): + dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'datasets', 'text_dataset_1', 'datasetDoc.json')) + + dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path)) + + dataframe_hyperparams_class = dataset_to_dataframe.DatasetToDataFramePrimitive.metadata.get_hyperparams() + dataframe_primitive = dataset_to_dataframe.DatasetToDataFramePrimitive(hyperparams=dataframe_hyperparams_class.defaults().replace({'dataframe_resource': '0'})) + dataframe = dataframe_primitive.produce(inputs=dataset).value + + text_hyperparams_class = text_reader.TextReaderPrimitive.metadata.get_hyperparams() + text_primitive = text_reader.TextReaderPrimitive(hyperparams=text_hyperparams_class.defaults().replace({'return_result': 'replace'})) + tables = text_primitive.produce(inputs=dataframe).value + + self.assertEqual(tables.shape, (4, 1)) + + self.assertEqual(tables.metadata.query_column(0)['structural_type'], str) + self.assertEqual(tables.metadata.query_column(0)['semantic_types'], ('https://metadata.datadrivendiscovery.org/types/PrimaryKey', 'http://schema.org/Text')) + + +if __name__ == '__main__': + unittest.main() diff --git a/tods/common-primitives/tests/test_train_score_split.py 
b/tods/common-primitives/tests/test_train_score_split.py new file mode 100644 index 0000000..317367a --- /dev/null +++ b/tods/common-primitives/tests/test_train_score_split.py @@ -0,0 +1,88 @@ +import os +import pickle +import unittest + +from d3m import container +from d3m.metadata import base as metadata_base + +from common_primitives import train_score_split + + +class TrainScoreDatasetSplitPrimitiveTestCase(unittest.TestCase): + def test_produce_train(self): + dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'tests', 'data', 'datasets', 'iris_dataset_1', 'datasetDoc.json')) + + dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path)) + + # We set semantic types like runtime would. + dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Target') + dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/TrueTarget') + dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Attribute') + + hyperparams_class = train_score_split.TrainScoreDatasetSplitPrimitive.metadata.get_hyperparams() + + primitive = train_score_split.TrainScoreDatasetSplitPrimitive(hyperparams=hyperparams_class.defaults().replace({ + 'shuffle': True, + })) + + primitive.set_training_data(dataset=dataset) + primitive.fit() + + # To test that pickling works. + pickle.dumps(primitive) + + results = primitive.produce(inputs=container.List([0], generate_metadata=True)).value + + self.assertEqual(len(results), 1) + + for dataset in results: + self.assertEqual(len(dataset), 1) + + self.assertEqual(results[0]['learningData'].shape[0], 112) + self.assertEqual(list(results[0]['learningData'].iloc[:, 0]), [ + '0', '1', '2', '3', '4', '5', '6', '9', '10', '11', '12', '13', '14', '15', '17', '19', '20', + '21', '23', '25', '28', '29', '30', '31', '32', '34', '35', '36', '38', '39', '41', '42', '43', + '46', '47', '48', '49', '50', '52', '53', '55', '56', '57', '58', '60', '61', '64', '65', '67', + '68', '69', '70', '72', '74', '75', '77', '79', '80', '81', '82', '85', '87', '88', '89', '91', + '92', '94', '95', '96', '98', '99', '101', '102', '103', '104', '105', '106', '108', '109', '110', + '111', '112', '113', '115', '116', '117', '118', '119', '120', '122', '123', '124', '125', '128', + '129', '130', '131', '133', '135', '136', '138', '139', '140', '141', '142', '143', '144', '145', + '146', '147', '148', '149', + ]) + + def test_produce_score(self): + dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'tests', 'data', 'datasets', 'iris_dataset_1', 'datasetDoc.json')) + + dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path)) + + # We set semantic types like runtime would. 
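+ # As in test_produce_train above; the score split should hold the remaining
+ # 38 of the 150 iris rows, complementing the 112-row train split.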
+ dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Target') + dataset.metadata = dataset.metadata.add_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/TrueTarget') + dataset.metadata = dataset.metadata.remove_semantic_type(('learningData', metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Attribute') + + hyperparams_class = train_score_split.TrainScoreDatasetSplitPrimitive.metadata.get_hyperparams() + + primitive = train_score_split.TrainScoreDatasetSplitPrimitive(hyperparams=hyperparams_class.defaults().replace({ + 'shuffle': True, + })) + + primitive.set_training_data(dataset=dataset) + primitive.fit() + + results = primitive.produce_score_data(inputs=container.List([0], generate_metadata=True)).value + + self.assertEqual(len(results), 1) + + for dataset in results: + self.assertEqual(len(dataset), 1) + + self.assertEqual(results[0]['learningData'].shape[0], 38) + self.assertEqual(list(results[0]['learningData'].iloc[:, 0]), [ + '7', '8', '16', '18', '22', '24', '26', '27', '33', '37', '40', '44', '45', '51', '54', + '59', '62', '63', '66', '71', '73', '76', '78', '83', '84', '86', '90', '93', '97', '100', + '107', '114', '121', '126', '127', '132', '134', '137', + ]) + + +if __name__ == '__main__': + unittest.main() diff --git a/tods/common-primitives/tests/test_unseen_label_decoder.py b/tods/common-primitives/tests/test_unseen_label_decoder.py new file mode 100644 index 0000000..108a5c6 --- /dev/null +++ b/tods/common-primitives/tests/test_unseen_label_decoder.py @@ -0,0 +1,51 @@ +import unittest + +from d3m import container + +from common_primitives import unseen_label_encoder, unseen_label_decoder + + +class UnseenLabelEncoderTestCase(unittest.TestCase): + def test_basic(self): + encoder_hyperparams_class = unseen_label_encoder.UnseenLabelEncoderPrimitive.metadata.get_hyperparams() + encoder_primitive = unseen_label_encoder.UnseenLabelEncoderPrimitive(hyperparams=encoder_hyperparams_class.defaults()) + + inputs = container.DataFrame({ + 'value': [0.0, 1.0, 2.0, 3.0], + 'number': [0, 1, 2, 3], + 'word': ['one', 'two', 'three', 'four'], + }, generate_metadata=True) + inputs.metadata = inputs.metadata.update_column(2, { + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/CategoricalData'], + }) + + encoder_primitive.set_training_data(inputs=inputs) + encoder_primitive.fit() + + inputs = container.DataFrame({ + 'value': [1.0, 2.0, 3.0], + 'number': [1, 2, 3], + 'word': ['one', 'two', 'five'], + }, generate_metadata=True) + inputs.metadata = inputs.metadata.update_column(2, { + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/CategoricalData'], + }) + + outputs = encoder_primitive.produce(inputs=inputs).value + + decoder_hyperparams_class = unseen_label_decoder.UnseenLabelDecoderPrimitive.metadata.get_hyperparams() + decoder_primitive = unseen_label_decoder.UnseenLabelDecoderPrimitive(hyperparams=decoder_hyperparams_class.defaults().replace({'encoder': encoder_primitive})) + + decoded = decoder_primitive.produce(inputs=outputs).value + + self.assertEqual(decoded.values.tolist(), [ + [1, 1.0, 'one'], + [2, 2.0, 'two'], + [3, 3.0, ''], + ]) + + self.assertEqual(decoded.metadata.query_column(2)['structural_type'], str) + + +if __name__ == '__main__': + unittest.main() diff --git a/tods/common-primitives/tests/test_unseen_label_encoder.py 
b/tods/common-primitives/tests/test_unseen_label_encoder.py new file mode 100644 index 0000000..5057688 --- /dev/null +++ b/tods/common-primitives/tests/test_unseen_label_encoder.py @@ -0,0 +1,46 @@ +import unittest + +from d3m import container + +from common_primitives import unseen_label_encoder + + +class UnseenLabelEncoderTestCase(unittest.TestCase): + def test_basic(self): + encoder_hyperparams_class = unseen_label_encoder.UnseenLabelEncoderPrimitive.metadata.get_hyperparams() + encoder_primitive = unseen_label_encoder.UnseenLabelEncoderPrimitive(hyperparams=encoder_hyperparams_class.defaults()) + + inputs = container.DataFrame({ + 'value': [0.0, 1.0, 2.0, 3.0], + 'number': [0, 1, 2, 3], + 'word': ['one', 'two', 'three', 'four'], + }, generate_metadata=True) + inputs.metadata = inputs.metadata.update_column(2, { + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/CategoricalData'], + }) + + encoder_primitive.set_training_data(inputs=inputs) + encoder_primitive.fit() + + inputs = container.DataFrame({ + 'value': [1.0, 2.0, 3.0], + 'number': [1, 2, 3], + 'word': ['one', 'two', 'five'], + }, generate_metadata=True) + inputs.metadata = inputs.metadata.update_column(2, { + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/CategoricalData'], + }) + + outputs = encoder_primitive.produce(inputs=inputs).value + + self.assertEqual(outputs.values.tolist(), [ + [1, 1.0, 1], + [2, 2.0, 2], + [3, 3.0, 0], + ]) + + self.assertEqual(outputs.metadata.query_column(2)['structural_type'], int) + + +if __name__ == '__main__': + unittest.main() diff --git a/tods/common-primitives/tests/test_video_reader.py b/tods/common-primitives/tests/test_video_reader.py new file mode 100644 index 0000000..4ae2f72 --- /dev/null +++ b/tods/common-primitives/tests/test_video_reader.py @@ -0,0 +1,35 @@ +import unittest +import os + +from d3m import container + +from common_primitives import dataset_to_dataframe, video_reader + + +class VideoReaderPrimitiveTestCase(unittest.TestCase): + def test_basic(self): + dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'datasets', 'video_dataset_1', 'datasetDoc.json')) + + dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path)) + + dataframe_hyperparams_class = dataset_to_dataframe.DatasetToDataFramePrimitive.metadata.get_hyperparams() + dataframe_primitive = dataset_to_dataframe.DatasetToDataFramePrimitive(hyperparams=dataframe_hyperparams_class.defaults().replace({'dataframe_resource': '0'})) + dataframe = dataframe_primitive.produce(inputs=dataset).value + + video_hyperparams_class = video_reader.VideoReaderPrimitive.metadata.get_hyperparams() + video_primitive = video_reader.VideoReaderPrimitive(hyperparams=video_hyperparams_class.defaults().replace({'return_result': 'replace'})) + videos = video_primitive.produce(inputs=dataframe).value + + self.assertEqual(videos.shape, (2, 1)) + self.assertEqual(videos.iloc[0, 0].shape, (408, 240, 320, 3)) + self.assertEqual(videos.iloc[1, 0].shape, (79, 240, 320, 3)) + + self._test_metadata(videos.metadata) + + def _test_metadata(self, metadata): + self.assertEqual(metadata.query_column(0)['structural_type'], container.ndarray) + self.assertEqual(metadata.query_column(0)['semantic_types'], ('https://metadata.datadrivendiscovery.org/types/PrimaryKey', 'http://schema.org/VideoObject')) + + +if __name__ == '__main__': + unittest.main() diff --git a/tods/common-primitives/tests/test_xgboost_dart.py 
b/tods/common-primitives/tests/test_xgboost_dart.py new file mode 100644 index 0000000..a2928f4 --- /dev/null +++ b/tods/common-primitives/tests/test_xgboost_dart.py @@ -0,0 +1,687 @@ +import os +import pickle +import unittest + +from d3m import container, utils +from d3m.metadata import base as metadata_base + +from common_primitives import dataset_to_dataframe, extract_columns_semantic_types, xgboost_dart, column_parser + + +class XGBoostDartTestCase(unittest.TestCase): + def _get_iris(self): + dataset_doc_path = os.path.abspath( + os.path.join(os.path.dirname(__file__), 'data', 'datasets', 'iris_dataset_1', 'datasetDoc.json')) + + dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path)) + + hyperparams_class = \ + dataset_to_dataframe.DatasetToDataFramePrimitive.metadata.query()['primitive_code']['class_type_arguments'][ + 'Hyperparams'] + primitive = dataset_to_dataframe.DatasetToDataFramePrimitive(hyperparams=hyperparams_class.defaults()) + + dataframe = primitive.produce(inputs=dataset).value + + return dataframe + + def _get_iris_columns(self): + dataframe = self._get_iris() + + # We set custom metadata on columns. + for column_index in range(1, 5): + dataframe.metadata = dataframe.metadata.update_column(column_index, {'custom_metadata': 'attributes'}) + for column_index in range(5, 6): + dataframe.metadata = dataframe.metadata.update_column(column_index, {'custom_metadata': 'targets'}) + + # We set semantic types like runtime would. + dataframe.metadata = dataframe.metadata.add_semantic_type((metadata_base.ALL_ELEMENTS, 5), + 'https://metadata.datadrivendiscovery.org/types/Target') + dataframe.metadata = dataframe.metadata.add_semantic_type((metadata_base.ALL_ELEMENTS, 5), + 'https://metadata.datadrivendiscovery.org/types/TrueTarget') + dataframe.metadata = dataframe.metadata.remove_semantic_type((metadata_base.ALL_ELEMENTS, 5), 'https://metadata.datadrivendiscovery.org/types/Attribute') + + # Parsing. 
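+ # ColumnParserPrimitive converts the string columns to their parsed types; the two
+ # ExtractColumnsBySemanticTypes steps below then pull out the Attribute columns and
+ # the SuggestedTarget column from the parsed frame.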
+ hyperparams_class = \ + column_parser.ColumnParserPrimitive.metadata.query()['primitive_code']['class_type_arguments'][ + 'Hyperparams'] + primitive = column_parser.ColumnParserPrimitive(hyperparams=hyperparams_class.defaults()) + dataframe = primitive.produce(inputs=dataframe).value + + hyperparams_class = \ + extract_columns_semantic_types.ExtractColumnsBySemanticTypesPrimitive.metadata.query()['primitive_code'][ + 'class_type_arguments']['Hyperparams'] + + primitive = extract_columns_semantic_types.ExtractColumnsBySemanticTypesPrimitive( + hyperparams=hyperparams_class.defaults().replace( + {'semantic_types': ('https://metadata.datadrivendiscovery.org/types/Attribute',)})) + attributes = primitive.produce(inputs=dataframe).value + + primitive = extract_columns_semantic_types.ExtractColumnsBySemanticTypesPrimitive( + hyperparams=hyperparams_class.defaults().replace( + {'semantic_types': ('https://metadata.datadrivendiscovery.org/types/SuggestedTarget',)})) + targets = primitive.produce(inputs=dataframe).value + + return dataframe, attributes, targets + + def test_single_target(self): + dataframe, attributes, targets = self._get_iris_columns() + + self.assertEqual(list(targets.columns), ['species']) + hyperparams_class = \ + xgboost_dart.XGBoostDartClassifierPrimitive.metadata.query()['primitive_code']['class_type_arguments'][ + 'Hyperparams'] + primitive = xgboost_dart.XGBoostDartClassifierPrimitive( + hyperparams=hyperparams_class.defaults().replace({'return_result': 'new', 'add_index_columns': False})) + primitive.set_training_data(inputs=attributes, outputs=targets) + primitive.fit() + + predictions = primitive.produce(inputs=attributes).value + self.assertEqual(list(predictions.columns), ['species']) + + self.assertEqual(predictions.shape, (150, 1)) + self.assertEqual(predictions.iloc[0, 0], 'Iris-setosa') + self.assertTrue(predictions.metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 0), + 'https://metadata.datadrivendiscovery.org/types/PredictedTarget')) + self.assertFalse(predictions.metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 0), + 'https://metadata.datadrivendiscovery.org/types/TrueTarget')) + self.assertEqual(predictions.metadata.query_column(0)['name'], 'species') + self.assertEqual(predictions.metadata.query_column(0)['custom_metadata'], 'targets') + + self._test_single_target_metadata(predictions.metadata) + + samples = primitive.sample(inputs=attributes).value + self.assertEqual(list(samples[0].columns), ['species']) + + self.assertEqual(len(samples), 1) + self.assertEqual(samples[0].shape, (150, 1)) + self.assertEqual(samples[0].iloc[0, 0], 'Iris-setosa') + self.assertTrue(samples[0].metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 0), + 'https://metadata.datadrivendiscovery.org/types/PredictedTarget')) + self.assertFalse(samples[0].metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 0), + 'https://metadata.datadrivendiscovery.org/types/TrueTarget')) + self.assertEqual(samples[0].metadata.query_column(0)['name'], 'species') + self.assertEqual(samples[0].metadata.query_column(0)['custom_metadata'], 'targets') + + log_likelihoods = primitive.log_likelihoods(inputs=attributes, outputs=targets).value + self.assertEqual(list(log_likelihoods.columns), ['species']) + + self.assertEqual(log_likelihoods.shape, (150, 1)) + self.assertEqual(log_likelihoods.metadata.query_column(0)['name'], 'species') + + log_likelihood = primitive.log_likelihood(inputs=attributes, outputs=targets).value + self.assertEqual(list(log_likelihood.columns), ['species']) + + 
self.assertEqual(log_likelihood.shape, (1, 1)) + self.assertAlmostEqual(log_likelihood.iloc[0, 0], -2.414982318878174) + self.assertEqual(log_likelihoods.metadata.query_column(0)['name'], 'species') + + def test_single_target_continue_fit(self): + dataframe, attributes, targets = self._get_iris_columns() + + hyperparams_class = \ + xgboost_dart.XGBoostDartClassifierPrimitive.metadata.query()['primitive_code']['class_type_arguments'][ + 'Hyperparams'] + primitive = xgboost_dart.XGBoostDartClassifierPrimitive( + hyperparams=hyperparams_class.defaults().replace({'return_result': 'new', 'add_index_columns': False})) + + primitive.set_training_data(inputs=attributes, outputs=targets) + primitive.fit() + # reset the training data to make continue_fit() work. + primitive.set_training_data(inputs=attributes, outputs=targets) + primitive.continue_fit() + params = primitive.get_params() + self.assertEqual(params['booster'].best_ntree_limit, + primitive.hyperparams['n_estimators'] + primitive.hyperparams['n_more_estimators']) + predictions = primitive.produce(inputs=attributes).value + + self.assertEqual(predictions.shape, (150, 1)) + self.assertEqual(predictions.iloc[0, 0], 'Iris-setosa') + self.assertTrue(predictions.metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 0), + 'https://metadata.datadrivendiscovery.org/types/PredictedTarget')) + self.assertFalse(predictions.metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 0), + 'https://metadata.datadrivendiscovery.org/types/TrueTarget')) + self.assertEqual(predictions.metadata.query_column(0)['name'], 'species') + self.assertEqual(predictions.metadata.query_column(0)['custom_metadata'], 'targets') + + self._test_single_target_metadata(predictions.metadata) + + samples = primitive.sample(inputs=attributes).value + + self.assertEqual(len(samples), 1) + self.assertEqual(samples[0].shape, (150, 1)) + self.assertEqual(samples[0].iloc[0, 0], 'Iris-setosa') + self.assertTrue(samples[0].metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 0), + 'https://metadata.datadrivendiscovery.org/types/PredictedTarget')) + self.assertFalse(samples[0].metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 0), + 'https://metadata.datadrivendiscovery.org/types/TrueTarget')) + self.assertEqual(samples[0].metadata.query_column(0)['name'], 'species') + self.assertEqual(samples[0].metadata.query_column(0)['custom_metadata'], 'targets') + + log_likelihoods = primitive.log_likelihoods(inputs=attributes, outputs=targets).value + + self.assertEqual(log_likelihoods.shape, (150, 1)) + self.assertEqual(log_likelihoods.metadata.query_column(0)['name'], 'species') + + log_likelihood = primitive.log_likelihood(inputs=attributes, outputs=targets).value + + self.assertEqual(log_likelihood.shape, (1, 1)) + self.assertEqual(log_likelihoods.metadata.query_column(0)['name'], 'species') + + def _test_single_target_metadata(self, predictions_metadata): + expected_metadata = [{ + 'selector': [], + 'metadata': { + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 150, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 1, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 
0], + 'metadata': { + 'structural_type': 'str', + 'name': 'species', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/CategoricalData', + 'https://metadata.datadrivendiscovery.org/types/SuggestedTarget', + 'https://metadata.datadrivendiscovery.org/types/Target', + 'https://metadata.datadrivendiscovery.org/types/PredictedTarget'], + 'custom_metadata': 'targets', + }, + }] + + self.assertEqual(utils.to_json_structure(predictions_metadata.to_internal_simple_structure()), expected_metadata) + + def test_multiple_targets(self): + dataframe, attributes, targets = self._get_iris_columns() + + targets = targets.append_columns(targets) + + self.assertEqual(list(targets.columns), ['species', 'species']) + + hyperparams_class = \ + xgboost_dart.XGBoostDartClassifierPrimitive.metadata.query()['primitive_code']['class_type_arguments'][ + 'Hyperparams'] + primitive = xgboost_dart.XGBoostDartClassifierPrimitive( + hyperparams=hyperparams_class.defaults().replace({'return_result': 'new', 'add_index_columns': False})) + + primitive.set_training_data(inputs=attributes, outputs=targets) + primitive.fit() + + predictions = primitive.produce(inputs=attributes).value + self.assertEqual(list(predictions.columns), ['species', 'species']) + self.assertEqual(predictions.shape, (150, 2)) + for column_index in range(2): + self.assertEqual(predictions.iloc[0, column_index], 'Iris-setosa') + self.assertTrue(predictions.metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, column_index), + 'https://metadata.datadrivendiscovery.org/types/PredictedTarget')) + self.assertFalse(predictions.metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, column_index), + 'https://metadata.datadrivendiscovery.org/types/TrueTarget')) + self.assertEqual(predictions.metadata.query_column(column_index)['name'], 'species') + self.assertEqual(predictions.metadata.query_column(column_index)['custom_metadata'], 'targets') + + samples = primitive.sample(inputs=attributes).value + self.assertEqual(list(samples[0].columns), ['species', 'species']) + self.assertEqual(len(samples), 1) + self.assertEqual(samples[0].shape, (150, 2)) + for column_index in range(2): + self.assertEqual(samples[0].iloc[0, column_index], 'Iris-setosa') + self.assertTrue(samples[0].metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, column_index), + 'https://metadata.datadrivendiscovery.org/types/PredictedTarget')) + self.assertFalse(samples[0].metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, column_index), + 'https://metadata.datadrivendiscovery.org/types/TrueTarget')) + self.assertEqual(samples[0].metadata.query_column(column_index)['name'], 'species') + self.assertEqual(samples[0].metadata.query_column(column_index)['custom_metadata'], 'targets') + + log_likelihoods = primitive.log_likelihoods(inputs=attributes, outputs=targets).value + self.assertEqual(list(log_likelihoods.columns), ['species', 'species']) + + self.assertEqual(log_likelihoods.shape, (150, 2)) + for column_index in range(2): + self.assertEqual(log_likelihoods.metadata.query_column(column_index)['name'], 'species') + + log_likelihood = primitive.log_likelihood(inputs=attributes, outputs=targets).value + self.assertEqual(list(log_likelihood.columns), ['species', 'species']) + + self.assertEqual(log_likelihood.shape, (1, 2)) + for column_index in range(2): + self.assertAlmostEqual(log_likelihood.iloc[0, column_index], -2.414982318878174) + self.assertEqual(log_likelihoods.metadata.query_column(column_index)['name'], 'species') + + def 
test_multiple_targets_continue_fit(self): + dataframe, attributes, targets = self._get_iris_columns() + second_targets = targets.copy() + second_targets['species'] = targets['species'].map( + {'Iris-setosa': 't-Iris-setosa', 'Iris-versicolor': 't-Iris-versicolor', + 'Iris-virginica': 't-Iris-virginica'}) + second_targets.rename(columns={'species': 't-species'}, inplace=True) + second_targets.metadata = second_targets.metadata.update_column(0, {'name': 't-species'}) + targets = targets.append_columns(second_targets) + hyperparams_class = \ + xgboost_dart.XGBoostDartClassifierPrimitive.metadata.query()['primitive_code']['class_type_arguments'][ + 'Hyperparams'] + primitive = xgboost_dart.XGBoostDartClassifierPrimitive( + hyperparams=hyperparams_class.defaults().replace({'return_result': 'new', 'add_index_columns': False})) + + primitive.set_training_data(inputs=attributes, outputs=targets) + primitive.fit() + primitive.set_training_data(inputs=attributes, outputs=targets) + primitive.continue_fit() + params = primitive.get_params() + for estimator in params['estimators']: + self.assertEqual(estimator.get_booster().best_ntree_limit, + primitive.hyperparams['n_estimators'] + primitive.hyperparams['n_more_estimators']) + + predictions = primitive.produce(inputs=attributes).value + + self.assertEqual(predictions.shape, (150, 2)) + self.assertEqual(predictions.iloc[0, 0], 'Iris-setosa') + self.assertTrue(predictions.metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 0), + 'https://metadata.datadrivendiscovery.org/types/PredictedTarget')) + self.assertFalse(predictions.metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 0), + 'https://metadata.datadrivendiscovery.org/types/TrueTarget')) + self.assertEqual(predictions.metadata.query_column(0)['name'], 'species') + self.assertEqual(predictions.metadata.query_column(0)['custom_metadata'], 'targets') + self.assertEqual(predictions.iloc[0, 1], 't-Iris-setosa') + self.assertTrue(predictions.metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 1), + 'https://metadata.datadrivendiscovery.org/types/PredictedTarget')) + self.assertFalse(predictions.metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 1), + 'https://metadata.datadrivendiscovery.org/types/TrueTarget')) + self.assertEqual(predictions.metadata.query_column(1)['name'], 't-species') + self.assertEqual(predictions.metadata.query_column(1)['custom_metadata'], 'targets') + samples = primitive.sample(inputs=attributes).value + + self.assertEqual(len(samples), 1) + self.assertEqual(samples[0].shape, (150, 2)) + self.assertEqual(samples[0].iloc[0, 0], 'Iris-setosa') + self.assertTrue(samples[0].metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 0), + 'https://metadata.datadrivendiscovery.org/types/PredictedTarget')) + self.assertFalse(samples[0].metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 0), + 'https://metadata.datadrivendiscovery.org/types/TrueTarget')) + self.assertEqual(samples[0].metadata.query_column(0)['name'], 'species') + self.assertEqual(samples[0].metadata.query_column(0)['custom_metadata'], 'targets') + + self.assertEqual(samples[0].iloc[0, 1], 't-Iris-setosa') + self.assertTrue(samples[0].metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 1), + 'https://metadata.datadrivendiscovery.org/types/PredictedTarget')) + self.assertFalse(samples[0].metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 1), + 'https://metadata.datadrivendiscovery.org/types/TrueTarget')) + self.assertEqual(samples[0].metadata.query_column(1)['name'], 't-species') + 
self.assertEqual(samples[0].metadata.query_column(1)['custom_metadata'], 'targets') + log_likelihoods = primitive.log_likelihoods(inputs=attributes, outputs=targets).value + + self.assertEqual(log_likelihoods.shape, (150, 2)) + self.assertEqual(log_likelihoods.metadata.query_column(0)['name'], 'species') + self.assertEqual(log_likelihoods.metadata.query_column(1)['name'], 't-species') + + log_likelihood = primitive.log_likelihood(inputs=attributes, outputs=targets).value + + self.assertEqual(log_likelihood.shape, (1, 2)) + self.assertEqual(log_likelihoods.metadata.query_column(0)['name'], 'species') + self.assertEqual(log_likelihoods.metadata.query_column(1)['name'], 't-species') + + def test_semantic_types(self): + dataframe, attributes, targets = self._get_iris_columns() + + hyperparams_class = \ + xgboost_dart.XGBoostDartClassifierPrimitive.metadata.query()['primitive_code']['class_type_arguments'][ + 'Hyperparams'] + primitive = xgboost_dart.XGBoostDartClassifierPrimitive( + hyperparams=hyperparams_class.defaults().replace({'return_result': 'new', 'add_index_columns': False})) + + primitive.set_training_data(inputs=dataframe, outputs=dataframe) + primitive.fit() + + predictions = primitive.produce(inputs=dataframe).value + self.assertEqual(list(predictions.columns), ['species']) + + self.assertEqual(predictions.shape, (150, 1)) + self.assertEqual(predictions.iloc[0, 0], 'Iris-setosa') + self.assertTrue(predictions.metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 0), + 'https://metadata.datadrivendiscovery.org/types/PredictedTarget')) + self.assertFalse(predictions.metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 0), + 'https://metadata.datadrivendiscovery.org/types/TrueTarget')) + self.assertEqual(predictions.metadata.query_column(0)['name'], 'species') + self.assertEqual(predictions.metadata.query_column(0)['custom_metadata'], 'targets') + + samples = primitive.sample(inputs=dataframe).value + self.assertEqual(list(samples[0].columns), ['species']) + + self.assertEqual(len(samples), 1) + self.assertEqual(samples[0].shape, (150, 1)) + self.assertEqual(samples[0].iloc[0, 0], 'Iris-setosa') + self.assertTrue(samples[0].metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 0), + 'https://metadata.datadrivendiscovery.org/types/PredictedTarget')) + self.assertFalse(samples[0].metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 0), + 'https://metadata.datadrivendiscovery.org/types/TrueTarget')) + self.assertEqual(samples[0].metadata.query_column(0)['name'], 'species') + self.assertEqual(samples[0].metadata.query_column(0)['custom_metadata'], 'targets') + + log_likelihoods = primitive.log_likelihoods(inputs=dataframe, outputs=dataframe).value + self.assertEqual(list(log_likelihoods.columns), ['species']) + + self.assertEqual(log_likelihoods.shape, (150, 1)) + self.assertEqual(log_likelihoods.metadata.query_column(0)['name'], 'species') + + log_likelihood = primitive.log_likelihood(inputs=dataframe, outputs=dataframe).value + self.assertEqual(list(log_likelihood.columns), ['species']) + + self.assertEqual(log_likelihood.shape, (1, 1)) + self.assertAlmostEqual(log_likelihood.iloc[0, 0], -2.414982318878174) + self.assertEqual(log_likelihoods.metadata.query_column(0)['name'], 'species') + + def test_return_append(self): + dataframe, attributes, targets = self._get_iris_columns() + + hyperparams_class = \ + xgboost_dart.XGBoostDartClassifierPrimitive.metadata.query()['primitive_code']['class_type_arguments'][ + 'Hyperparams'] + primitive = 
xgboost_dart.XGBoostDartClassifierPrimitive(hyperparams=hyperparams_class.defaults()) + + primitive.set_training_data(inputs=dataframe, outputs=dataframe) + primitive.fit() + + predictions = primitive.produce(inputs=dataframe).value + + self.assertEqual(list(predictions.columns), [ + 'd3mIndex', + 'sepalLength', + 'sepalWidth', + 'petalLength', + 'petalWidth', + 'species', + 'species', + ]) + + self.assertEqual(predictions.shape, (150, 7)) + self.assertEqual(predictions.iloc[0, 6], 'Iris-setosa') + self.assertTrue(predictions.metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 6), + 'https://metadata.datadrivendiscovery.org/types/PredictedTarget')) + self.assertFalse(predictions.metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 6), + 'https://metadata.datadrivendiscovery.org/types/TrueTarget')) + self.assertEqual(predictions.metadata.query_column(6)['name'], 'species') + self.assertEqual(predictions.metadata.query_column(6)['custom_metadata'], 'targets') + + self._test_return_append_metadata(predictions.metadata) + + def _test_return_append_metadata(self, predictions_metadata): + self.assertEqual(utils.to_json_structure(predictions_metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 150, + }, + 'schema': 'https://metadata.datadrivendiscovery.org/schemas/v0/container.json', + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 7, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': { + 'name': 'd3mIndex', + 'structural_type': 'int', + 'semantic_types': ['http://schema.org/Integer', + 'https://metadata.datadrivendiscovery.org/types/PrimaryKey'], + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': { + 'name': 'sepalLength', + 'structural_type': 'float', + 'semantic_types': ['http://schema.org/Float', + 'https://metadata.datadrivendiscovery.org/types/Attribute'], + 'custom_metadata': 'attributes', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': { + 'name': 'sepalWidth', + 'structural_type': 'float', + 'semantic_types': ['http://schema.org/Float', + 'https://metadata.datadrivendiscovery.org/types/Attribute'], + 'custom_metadata': 'attributes', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 3], + 'metadata': { + 'name': 'petalLength', + 'structural_type': 'float', + 'semantic_types': ['http://schema.org/Float', + 'https://metadata.datadrivendiscovery.org/types/Attribute'], + 'custom_metadata': 'attributes', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 4], + 'metadata': { + 'name': 'petalWidth', + 'structural_type': 'float', + 'semantic_types': ['http://schema.org/Float', + 'https://metadata.datadrivendiscovery.org/types/Attribute'], + 'custom_metadata': 'attributes', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 5], + 'metadata': { + 'name': 'species', + 'structural_type': 'str', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/CategoricalData', + 'https://metadata.datadrivendiscovery.org/types/SuggestedTarget', + 'https://metadata.datadrivendiscovery.org/types/Target', + 'https://metadata.datadrivendiscovery.org/types/TrueTarget'], + 'custom_metadata': 'targets', + }, + }, { + 'selector': 
['__ALL_ELEMENTS__', 6], + 'metadata': { + 'structural_type': 'str', + 'name': 'species', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/CategoricalData', + 'https://metadata.datadrivendiscovery.org/types/SuggestedTarget', + 'https://metadata.datadrivendiscovery.org/types/Target', + 'https://metadata.datadrivendiscovery.org/types/PredictedTarget'], + 'custom_metadata': 'targets', + }, + }]) + + def test_return_new(self): + dataframe, attributes, targets = self._get_iris_columns() + + hyperparams_class = \ + xgboost_dart.XGBoostDartClassifierPrimitive.metadata.query()['primitive_code']['class_type_arguments'][ + 'Hyperparams'] + primitive = xgboost_dart.XGBoostDartClassifierPrimitive( + hyperparams=hyperparams_class.defaults().replace({'return_result': 'new'})) + + primitive.set_training_data(inputs=dataframe, outputs=dataframe) + primitive.fit() + + predictions = primitive.produce(inputs=dataframe).value + self.assertEqual(list(predictions.columns), [ + 'd3mIndex', + 'species', + ]) + self.assertEqual(predictions.shape, (150, 2)) + self.assertEqual(predictions.iloc[0, 1], 'Iris-setosa') + self.assertTrue(predictions.metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 1), + 'https://metadata.datadrivendiscovery.org/types/PredictedTarget')) + self.assertFalse(predictions.metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 1), + 'https://metadata.datadrivendiscovery.org/types/TrueTarget')) + self.assertEqual(predictions.metadata.query_column(1)['name'], 'species') + self.assertEqual(predictions.metadata.query_column(1)['custom_metadata'], 'targets') + + self._test_return_new_metadata(predictions.metadata) + + def _test_return_new_metadata(self, predictions_metadata): + expected_metadata = [{ + 'selector': [], + 'metadata': { + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 150, + }, + 'schema': 'https://metadata.datadrivendiscovery.org/schemas/v0/container.json', + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 2, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': { + 'name': 'd3mIndex', + 'structural_type': 'int', + 'semantic_types': ['http://schema.org/Integer', + 'https://metadata.datadrivendiscovery.org/types/PrimaryKey'], + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': { + 'structural_type': 'str', + 'name': 'species', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/CategoricalData', + 'https://metadata.datadrivendiscovery.org/types/SuggestedTarget', + 'https://metadata.datadrivendiscovery.org/types/Target', + 'https://metadata.datadrivendiscovery.org/types/PredictedTarget'], + 'custom_metadata': 'targets', + }, + }] + + self.assertEqual(utils.to_json_structure(predictions_metadata.to_internal_simple_structure()), expected_metadata) + + def test_return_replace(self): + dataframe, attributes, targets = self._get_iris_columns() + + hyperparams_class = \ + xgboost_dart.XGBoostDartClassifierPrimitive.metadata.query()['primitive_code']['class_type_arguments'][ + 'Hyperparams'] + primitive = xgboost_dart.XGBoostDartClassifierPrimitive( + hyperparams=hyperparams_class.defaults().replace({'return_result': 'replace'})) + + 
primitive.set_training_data(inputs=dataframe, outputs=dataframe) + primitive.fit() + + predictions = primitive.produce(inputs=dataframe).value + self.assertEqual(list(predictions.columns), [ + 'd3mIndex', + 'species', + 'species', + ]) + + self.assertEqual(predictions.shape, (150, 3)) + self.assertEqual(predictions.iloc[0, 1], 'Iris-setosa') + self.assertTrue(predictions.metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 1), + 'https://metadata.datadrivendiscovery.org/types/PredictedTarget')) + self.assertFalse(predictions.metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 1), + 'https://metadata.datadrivendiscovery.org/types/TrueTarget')) + self.assertEqual(predictions.metadata.query_column(1)['name'], 'species') + self.assertEqual(predictions.metadata.query_column(1)['custom_metadata'], 'targets') + + self._test_return_replace_metadata(predictions.metadata) + + def test_pickle_unpickle(self): + dataframe, attributes, targets = self._get_iris_columns() + + hyperparams_class = \ + xgboost_dart.XGBoostDartClassifierPrimitive.metadata.query()['primitive_code']['class_type_arguments'][ + 'Hyperparams'] + primitive = xgboost_dart.XGBoostDartClassifierPrimitive( + hyperparams=hyperparams_class.defaults().replace({'return_result': 'new', 'add_index_columns': False})) + + primitive.set_training_data(inputs=attributes, outputs=targets) + primitive.fit() + + before_pickled_prediction = primitive.produce(inputs=attributes).value + pickle_object = pickle.dumps(primitive) + primitive = pickle.loads(pickle_object) + after_unpickled_prediction = primitive.produce(inputs=attributes).value + _ = pickle.dumps(primitive) + self.assertTrue(container.DataFrame.equals(before_pickled_prediction, after_unpickled_prediction)) + + def _test_return_replace_metadata(self, predictions_metadata): + self.assertEqual(utils.to_json_structure(predictions_metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 150, + }, + 'schema': 'https://metadata.datadrivendiscovery.org/schemas/v0/container.json', + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': { + 'name': 'd3mIndex', + 'structural_type': 'int', + 'semantic_types': ['http://schema.org/Integer', + 'https://metadata.datadrivendiscovery.org/types/PrimaryKey'], + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': { + 'structural_type': 'str', + 'name': 'species', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/CategoricalData', + 'https://metadata.datadrivendiscovery.org/types/SuggestedTarget', + 'https://metadata.datadrivendiscovery.org/types/Target', + 'https://metadata.datadrivendiscovery.org/types/PredictedTarget'], + 'custom_metadata': 'targets', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': { + 'name': 'species', + 'structural_type': 'str', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/CategoricalData', + 'https://metadata.datadrivendiscovery.org/types/SuggestedTarget', + 'https://metadata.datadrivendiscovery.org/types/Target', + 
'https://metadata.datadrivendiscovery.org/types/TrueTarget'], + 'custom_metadata': 'targets', + }, + }]) + + +if __name__ == '__main__': + unittest.main() diff --git a/tods/common-primitives/tests/test_xgboost_gbtree.py b/tods/common-primitives/tests/test_xgboost_gbtree.py new file mode 100644 index 0000000..1ec0e67 --- /dev/null +++ b/tods/common-primitives/tests/test_xgboost_gbtree.py @@ -0,0 +1,733 @@ +import os +import pickle +import unittest + +from d3m import container, utils +from d3m.metadata import base as metadata_base + +from common_primitives import dataset_to_dataframe, extract_columns_semantic_types, xgboost_gbtree, column_parser + + +class XGBoostTestCase(unittest.TestCase): + def _get_iris(self): + dataset_doc_path = os.path.abspath( + os.path.join(os.path.dirname(__file__), 'data', 'datasets', 'iris_dataset_1', 'datasetDoc.json')) + + dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path)) + + hyperparams_class = \ + dataset_to_dataframe.DatasetToDataFramePrimitive.metadata.query()['primitive_code']['class_type_arguments'][ + 'Hyperparams'] + primitive = dataset_to_dataframe.DatasetToDataFramePrimitive(hyperparams=hyperparams_class.defaults()) + + dataframe = primitive.produce(inputs=dataset).value + + return dataframe + + def _get_iris_columns(self): + dataframe = self._get_iris() + + # We set custom metadata on columns. + for column_index in range(1, 5): + dataframe.metadata = dataframe.metadata.update_column(column_index, {'custom_metadata': 'attributes'}) + for column_index in range(5, 6): + dataframe.metadata = dataframe.metadata.update_column(column_index, {'custom_metadata': 'targets'}) + + # We set semantic types like runtime would. + dataframe.metadata = dataframe.metadata.add_semantic_type((metadata_base.ALL_ELEMENTS, 5), + 'https://metadata.datadrivendiscovery.org/types/Target') + dataframe.metadata = dataframe.metadata.add_semantic_type((metadata_base.ALL_ELEMENTS, 5), + 'https://metadata.datadrivendiscovery.org/types/TrueTarget') + dataframe.metadata = dataframe.metadata.remove_semantic_type((metadata_base.ALL_ELEMENTS, 5), + 'https://metadata.datadrivendiscovery.org/types/Attribute') + + # Parsing. 
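+ # Same preparation as in the DART test above: parse the columns, then extract
+ # attributes and the suggested target by semantic type.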
+ hyperparams_class = \ + column_parser.ColumnParserPrimitive.metadata.query()['primitive_code']['class_type_arguments'][ + 'Hyperparams'] + primitive = column_parser.ColumnParserPrimitive(hyperparams=hyperparams_class.defaults()) + dataframe = primitive.produce(inputs=dataframe).value + + hyperparams_class = \ + extract_columns_semantic_types.ExtractColumnsBySemanticTypesPrimitive.metadata.query()['primitive_code'][ + 'class_type_arguments']['Hyperparams'] + + primitive = extract_columns_semantic_types.ExtractColumnsBySemanticTypesPrimitive( + hyperparams=hyperparams_class.defaults().replace( + {'semantic_types': ('https://metadata.datadrivendiscovery.org/types/Attribute',)})) + attributes = primitive.produce(inputs=dataframe).value + + primitive = extract_columns_semantic_types.ExtractColumnsBySemanticTypesPrimitive( + hyperparams=hyperparams_class.defaults().replace( + {'semantic_types': ('https://metadata.datadrivendiscovery.org/types/SuggestedTarget',)})) + targets = primitive.produce(inputs=dataframe).value + + return dataframe, attributes, targets + + def test_single_target(self): + dataframe, attributes, targets = self._get_iris_columns() + + self.assertEqual(list(targets.columns), ['species']) + hyperparams_class = \ + xgboost_gbtree.XGBoostGBTreeClassifierPrimitive.metadata.query()['primitive_code']['class_type_arguments'][ + 'Hyperparams'] + primitive = xgboost_gbtree.XGBoostGBTreeClassifierPrimitive( + hyperparams=hyperparams_class.defaults().replace({'return_result': 'new', 'add_index_columns': False})) + + primitive.set_training_data(inputs=attributes, outputs=targets) + primitive.fit() + + predictions = primitive.produce(inputs=attributes).value + self.assertEqual(list(predictions.columns), ['species']) + + self.assertEqual(predictions.shape, (150, 1)) + self.assertEqual(predictions.iloc[0, 0], 'Iris-setosa') + self.assertTrue(predictions.metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 0), + 'https://metadata.datadrivendiscovery.org/types/PredictedTarget')) + self.assertFalse(predictions.metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 0), + 'https://metadata.datadrivendiscovery.org/types/TrueTarget')) + self.assertEqual(predictions.metadata.query_column(0)['name'], 'species') + self.assertEqual(predictions.metadata.query_column(0)['custom_metadata'], 'targets') + + self._test_single_target_metadata(predictions.metadata) + + samples = primitive.sample(inputs=attributes).value + self.assertEqual(list(samples[0].columns), ['species']) + + self.assertEqual(len(samples), 1) + self.assertEqual(samples[0].shape, (150, 1)) + self.assertEqual(samples[0].iloc[0, 0], 'Iris-setosa') + self.assertTrue(samples[0].metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 0), + 'https://metadata.datadrivendiscovery.org/types/PredictedTarget')) + self.assertFalse(samples[0].metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 0), + 'https://metadata.datadrivendiscovery.org/types/TrueTarget')) + self.assertEqual(samples[0].metadata.query_column(0)['name'], 'species') + self.assertEqual(samples[0].metadata.query_column(0)['custom_metadata'], 'targets') + + log_likelihoods = primitive.log_likelihoods(inputs=attributes, outputs=targets).value + self.assertEqual(list(log_likelihoods.columns), ['species']) + + self.assertEqual(log_likelihoods.shape, (150, 1)) + self.assertEqual(log_likelihoods.metadata.query_column(0)['name'], 'species') + + log_likelihood = primitive.log_likelihood(inputs=attributes, outputs=targets).value + self.assertEqual(list(log_likelihood.columns), 
['species']) + + self.assertEqual(log_likelihood.shape, (1, 1)) + self.assertAlmostEqual(log_likelihood.iloc[0, 0], -3.4919378757476807) + self.assertEqual(log_likelihoods.metadata.query_column(0)['name'], 'species') + + def test_single_target_continue_fit(self): + dataframe, attributes, targets = self._get_iris_columns() + + hyperparams_class = \ + xgboost_gbtree.XGBoostGBTreeClassifierPrimitive.metadata.query()['primitive_code']['class_type_arguments'][ + 'Hyperparams'] + primitive = xgboost_gbtree.XGBoostGBTreeClassifierPrimitive( + hyperparams=hyperparams_class.defaults().replace({'return_result': 'new', 'add_index_columns': False})) + + primitive.set_training_data(inputs=attributes, outputs=targets) + primitive.fit() + # reset the training data to make continue_fit() work. + primitive.set_training_data(inputs=attributes, outputs=targets) + primitive.continue_fit() + params = primitive.get_params() + self.assertEqual(params['booster'].best_ntree_limit, + primitive.hyperparams['n_estimators'] + primitive.hyperparams['n_more_estimators']) + predictions = primitive.produce(inputs=attributes).value + + self.assertEqual(predictions.shape, (150, 1)) + self.assertEqual(predictions.iloc[0, 0], 'Iris-setosa') + self.assertTrue(predictions.metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 0), + 'https://metadata.datadrivendiscovery.org/types/PredictedTarget')) + self.assertFalse(predictions.metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 0), + 'https://metadata.datadrivendiscovery.org/types/TrueTarget')) + self.assertEqual(predictions.metadata.query_column(0)['name'], 'species') + self.assertEqual(predictions.metadata.query_column(0)['custom_metadata'], 'targets') + + self._test_single_target_metadata(predictions.metadata) + + samples = primitive.sample(inputs=attributes).value + + self.assertEqual(len(samples), 1) + self.assertEqual(samples[0].shape, (150, 1)) + self.assertEqual(samples[0].iloc[0, 0], 'Iris-setosa') + self.assertTrue(samples[0].metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 0), + 'https://metadata.datadrivendiscovery.org/types/PredictedTarget')) + self.assertFalse(samples[0].metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 0), + 'https://metadata.datadrivendiscovery.org/types/TrueTarget')) + self.assertEqual(samples[0].metadata.query_column(0)['name'], 'species') + self.assertEqual(samples[0].metadata.query_column(0)['custom_metadata'], 'targets') + + log_likelihoods = primitive.log_likelihoods(inputs=attributes, outputs=targets).value + + self.assertEqual(log_likelihoods.shape, (150, 1)) + self.assertEqual(log_likelihoods.metadata.query_column(0)['name'], 'species') + + log_likelihood = primitive.log_likelihood(inputs=attributes, outputs=targets).value + + self.assertEqual(log_likelihood.shape, (1, 1)) + self.assertAlmostEqual(log_likelihood.iloc[0, 0], -2.4149818420410156) + self.assertEqual(log_likelihoods.metadata.query_column(0)['name'], 'species') + + def _test_single_target_metadata(self, predictions_metadata): + expected_metadata = [{ + 'selector': [], + 'metadata': { + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 150, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': 
['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 1, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': { + 'structural_type': 'str', + 'name': 'species', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/CategoricalData', + 'https://metadata.datadrivendiscovery.org/types/SuggestedTarget', + 'https://metadata.datadrivendiscovery.org/types/Target', + 'https://metadata.datadrivendiscovery.org/types/PredictedTarget'], + 'custom_metadata': 'targets', + }, + }] + + self.assertEqual(utils.to_json_structure(predictions_metadata.to_internal_simple_structure()), expected_metadata) + + def test_multiple_targets(self): + dataframe, attributes, targets = self._get_iris_columns() + + targets = targets.append_columns(targets) + self.assertEqual(list(targets.columns), ['species', 'species']) + + hyperparams_class = \ + xgboost_gbtree.XGBoostGBTreeClassifierPrimitive.metadata.query()['primitive_code']['class_type_arguments'][ + 'Hyperparams'] + primitive = xgboost_gbtree.XGBoostGBTreeClassifierPrimitive( + hyperparams=hyperparams_class.defaults().replace({'return_result': 'new', 'add_index_columns': False})) + + primitive.set_training_data(inputs=attributes, outputs=targets) + primitive.fit() + + predictions = primitive.produce(inputs=attributes).value + self.assertEqual(list(predictions.columns), ['species', 'species']) + + self.assertEqual(predictions.shape, (150, 2)) + for column_index in range(2): + self.assertEqual(predictions.iloc[0, column_index], 'Iris-setosa') + self.assertTrue(predictions.metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, column_index), + 'https://metadata.datadrivendiscovery.org/types/PredictedTarget')) + self.assertFalse(predictions.metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, column_index), + 'https://metadata.datadrivendiscovery.org/types/TrueTarget')) + self.assertEqual(predictions.metadata.query_column(column_index)['name'], 'species') + self.assertEqual(predictions.metadata.query_column(column_index)['custom_metadata'], 'targets') + + samples = primitive.sample(inputs=attributes).value + self.assertEqual(list(samples[0].columns), ['species', 'species']) + + self.assertEqual(len(samples), 1) + self.assertEqual(samples[0].shape, (150, 2)) + for column_index in range(2): + self.assertEqual(samples[0].iloc[0, column_index], 'Iris-setosa') + self.assertTrue(samples[0].metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, column_index), + 'https://metadata.datadrivendiscovery.org/types/PredictedTarget')) + self.assertFalse(samples[0].metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, column_index), + 'https://metadata.datadrivendiscovery.org/types/TrueTarget')) + self.assertEqual(samples[0].metadata.query_column(column_index)['name'], 'species') + self.assertEqual(samples[0].metadata.query_column(column_index)['custom_metadata'], 'targets') + + log_likelihoods = primitive.log_likelihoods(inputs=attributes, outputs=targets).value + self.assertEqual(list(log_likelihoods.columns), ['species', 'species']) + + self.assertEqual(log_likelihoods.shape, (150, 2)) + for column_index in range(2): + self.assertEqual(log_likelihoods.metadata.query_column(column_index)['name'], 'species') + + log_likelihood = primitive.log_likelihood(inputs=attributes, outputs=targets).value + + self.assertEqual(list(log_likelihood.columns), ['species', 'species']) + self.assertEqual(log_likelihood.shape, (1, 2)) + for column_index in range(2): + self.assertAlmostEqual(log_likelihood.iloc[0, column_index], 
-3.4919378757476807) + self.assertEqual(log_likelihoods.metadata.query_column(column_index)['name'], 'species') + + feature_importances = primitive.produce_feature_importances().value + self.assertEqual(list(feature_importances), ['sepalLength', 'sepalWidth', 'petalLength', 'petalWidth']) + self.assertEqual(feature_importances.metadata.query_column(0)['name'], 'sepalLength') + self.assertEqual(feature_importances.metadata.query_column(1)['name'], 'sepalWidth') + self.assertEqual(feature_importances.metadata.query_column(2)['name'], 'petalLength') + self.assertEqual(feature_importances.metadata.query_column(3)['name'], 'petalWidth') + + self.assertEqual(feature_importances.values.tolist(), [[0.012397459708154202, + 0.03404613956809044, + 0.5992223024368286, + 0.35433411598205566, + ]]) + + def test_multiple_targets_continue_fit(self): + dataframe, attributes, targets = self._get_iris_columns() + second_targets = targets.copy() + second_targets['species'] = targets['species'].map( + {'Iris-setosa': 't-Iris-setosa', 'Iris-versicolor': 't-Iris-versicolor', + 'Iris-virginica': 't-Iris-virginica'}) + second_targets.rename(columns={'species': 't-species'}, inplace=True) + second_targets.metadata = second_targets.metadata.update_column(0, {'name': 't-species'}) + targets = targets.append_columns(second_targets) + hyperparams_class = \ + xgboost_gbtree.XGBoostGBTreeClassifierPrimitive.metadata.query()['primitive_code']['class_type_arguments'][ + 'Hyperparams'] + primitive = xgboost_gbtree.XGBoostGBTreeClassifierPrimitive( + hyperparams=hyperparams_class.defaults().replace({'return_result': 'new', 'add_index_columns': False})) + + primitive.set_training_data(inputs=attributes, outputs=targets) + primitive.fit() + primitive.set_training_data(inputs=attributes, outputs=targets) + primitive.continue_fit() + params = primitive.get_params() + for estimator in params['estimators']: + self.assertEqual(estimator.get_booster().best_ntree_limit, + primitive.hyperparams['n_estimators'] + primitive.hyperparams['n_more_estimators']) + + predictions = primitive.produce(inputs=attributes).value + + + self.assertEqual(predictions.shape, (150, 2)) + self.assertEqual(predictions.iloc[0, 0], 'Iris-setosa') + self.assertTrue(predictions.metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 0), + 'https://metadata.datadrivendiscovery.org/types/PredictedTarget')) + self.assertFalse(predictions.metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 0), + 'https://metadata.datadrivendiscovery.org/types/TrueTarget')) + self.assertEqual(predictions.metadata.query_column(0)['name'], 'species') + self.assertEqual(predictions.metadata.query_column(0)['custom_metadata'], 'targets') + self.assertEqual(predictions.iloc[0, 1], 't-Iris-setosa') + self.assertTrue(predictions.metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 1), + 'https://metadata.datadrivendiscovery.org/types/PredictedTarget')) + self.assertFalse(predictions.metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 1), + 'https://metadata.datadrivendiscovery.org/types/TrueTarget')) + self.assertEqual(predictions.metadata.query_column(1)['name'], 't-species') + self.assertEqual(predictions.metadata.query_column(1)['custom_metadata'], 'targets') + samples = primitive.sample(inputs=attributes).value + + self.assertEqual(len(samples), 1) + self.assertEqual(samples[0].shape, (150, 2)) + self.assertEqual(samples[0].iloc[0, 0], 'Iris-setosa') + self.assertTrue(samples[0].metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 0), + 
'https://metadata.datadrivendiscovery.org/types/PredictedTarget')) + self.assertFalse(samples[0].metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 0), + 'https://metadata.datadrivendiscovery.org/types/TrueTarget')) + self.assertEqual(samples[0].metadata.query_column(0)['name'], 'species') + self.assertEqual(samples[0].metadata.query_column(0)['custom_metadata'], 'targets') + + self.assertEqual(samples[0].iloc[0, 1], 't-Iris-setosa') + self.assertTrue(samples[0].metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 1), + 'https://metadata.datadrivendiscovery.org/types/PredictedTarget')) + self.assertFalse(samples[0].metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 1), + 'https://metadata.datadrivendiscovery.org/types/TrueTarget')) + self.assertEqual(samples[0].metadata.query_column(1)['name'], 't-species') + self.assertEqual(samples[0].metadata.query_column(1)['custom_metadata'], 'targets') + log_likelihoods = primitive.log_likelihoods(inputs=attributes, outputs=targets).value + + self.assertEqual(log_likelihoods.shape, (150, 2)) + self.assertEqual(log_likelihoods.metadata.query_column(0)['name'], 'species') + self.assertEqual(log_likelihoods.metadata.query_column(1)['name'], 't-species') + + log_likelihood = primitive.log_likelihood(inputs=attributes, outputs=targets).value + + self.assertEqual(log_likelihood.shape, (1, 2)) + self.assertAlmostEqual(log_likelihood.iloc[0, 0], -2.4149818420410156) + self.assertEqual(log_likelihoods.metadata.query_column(0)['name'], 'species') + self.assertAlmostEqual(log_likelihood.iloc[0, 1], -2.4149818420410156) + self.assertEqual(log_likelihoods.metadata.query_column(1)['name'], 't-species') + + feature_importances = primitive.produce_feature_importances().value + + self.assertEqual(feature_importances.values.tolist(), + [[0.011062598787248135, + 0.026943154633045197, + 0.6588393449783325, + 0.3031548857688904]]) + + def test_semantic_types(self): + dataframe, attributes, targets = self._get_iris_columns() + + hyperparams_class = \ + xgboost_gbtree.XGBoostGBTreeClassifierPrimitive.metadata.query()['primitive_code']['class_type_arguments'][ + 'Hyperparams'] + primitive = xgboost_gbtree.XGBoostGBTreeClassifierPrimitive( + hyperparams=hyperparams_class.defaults().replace({'return_result': 'new', 'add_index_columns': False})) + + primitive.set_training_data(inputs=dataframe, outputs=dataframe) + primitive.fit() + + predictions = primitive.produce(inputs=dataframe).value + self.assertEqual(list(predictions.columns), ['species']) + + self.assertEqual(predictions.shape, (150, 1)) + self.assertEqual(predictions.iloc[0, 0], 'Iris-setosa') + self.assertTrue(predictions.metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 0), + 'https://metadata.datadrivendiscovery.org/types/PredictedTarget')) + self.assertFalse(predictions.metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 0), + 'https://metadata.datadrivendiscovery.org/types/TrueTarget')) + self.assertEqual(predictions.metadata.query_column(0)['name'], 'species') + self.assertEqual(predictions.metadata.query_column(0)['custom_metadata'], 'targets') + + samples = primitive.sample(inputs=dataframe).value + self.assertEqual(list(samples[0].columns), ['species']) + + self.assertEqual(len(samples), 1) + self.assertEqual(samples[0].shape, (150, 1)) + self.assertEqual(samples[0].iloc[0, 0], 'Iris-setosa') + self.assertTrue(samples[0].metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 0), + 'https://metadata.datadrivendiscovery.org/types/PredictedTarget')) + 
self.assertFalse(samples[0].metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 0), + 'https://metadata.datadrivendiscovery.org/types/TrueTarget')) + self.assertEqual(samples[0].metadata.query_column(0)['name'], 'species') + self.assertEqual(samples[0].metadata.query_column(0)['custom_metadata'], 'targets') + + log_likelihoods = primitive.log_likelihoods(inputs=dataframe, outputs=dataframe).value + self.assertEqual(list(log_likelihoods.columns), ['species']) + + self.assertEqual(log_likelihoods.shape, (150, 1)) + self.assertEqual(log_likelihoods.metadata.query_column(0)['name'], 'species') + + log_likelihood = primitive.log_likelihood(inputs=dataframe, outputs=dataframe).value + self.assertEqual(list(log_likelihood.columns), ['species']) + + self.assertEqual(log_likelihood.shape, (1, 1)) + self.assertAlmostEqual(log_likelihood.iloc[0, 0], -3.4919378757476807) + self.assertEqual(log_likelihoods.metadata.query_column(0)['name'], 'species') + + feature_importances = primitive.produce_feature_importances().value + self.assertEqual(list(feature_importances), ['sepalLength', 'sepalWidth', 'petalLength', 'petalWidth']) + self.assertEqual(feature_importances.metadata.query_column(0)['name'], 'sepalLength') + self.assertEqual(feature_importances.metadata.query_column(1)['name'], 'sepalWidth') + self.assertEqual(feature_importances.metadata.query_column(2)['name'], 'petalLength') + self.assertEqual(feature_importances.metadata.query_column(3)['name'], 'petalWidth') + + + self.assertEqual(feature_importances.values.tolist(), + [[0.012397459708154202, + 0.03404613956809044, + 0.5992223024368286, + 0.35433411598205566]]) + + def test_return_append(self): + dataframe, attributes, targets = self._get_iris_columns() + + hyperparams_class = \ + xgboost_gbtree.XGBoostGBTreeClassifierPrimitive.metadata.query()['primitive_code']['class_type_arguments'][ + 'Hyperparams'] + primitive = xgboost_gbtree.XGBoostGBTreeClassifierPrimitive(hyperparams=hyperparams_class.defaults()) + + primitive.set_training_data(inputs=dataframe, outputs=dataframe) + primitive.fit() + + predictions = primitive.produce(inputs=dataframe).value + + self.assertEqual(list(predictions.columns), [ + 'd3mIndex', + 'sepalLength', + 'sepalWidth', + 'petalLength', + 'petalWidth', + 'species', + 'species', + ]) + + self.assertEqual(predictions.shape, (150, 7)) + self.assertEqual(predictions.iloc[0, 6], 'Iris-setosa') + self.assertTrue(predictions.metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 6), + 'https://metadata.datadrivendiscovery.org/types/PredictedTarget')) + self.assertFalse(predictions.metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 6), + 'https://metadata.datadrivendiscovery.org/types/TrueTarget')) + self.assertEqual(predictions.metadata.query_column(6)['name'], 'species') + self.assertEqual(predictions.metadata.query_column(6)['custom_metadata'], 'targets') + + self._test_return_append_metadata(predictions.metadata) + + def _test_return_append_metadata(self, predictions_metadata): + self.assertEqual(utils.to_json_structure(predictions_metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 150, + }, + 'schema': 'https://metadata.datadrivendiscovery.org/schemas/v0/container.json', + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 
'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 7, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': { + 'name': 'd3mIndex', + 'structural_type': 'int', + 'semantic_types': ['http://schema.org/Integer', + 'https://metadata.datadrivendiscovery.org/types/PrimaryKey'], + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': { + 'name': 'sepalLength', + 'structural_type': 'float', + 'semantic_types': ['http://schema.org/Float', + 'https://metadata.datadrivendiscovery.org/types/Attribute'], + 'custom_metadata': 'attributes', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': { + 'name': 'sepalWidth', + 'structural_type': 'float', + 'semantic_types': ['http://schema.org/Float', + 'https://metadata.datadrivendiscovery.org/types/Attribute'], + 'custom_metadata': 'attributes', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 3], + 'metadata': { + 'name': 'petalLength', + 'structural_type': 'float', + 'semantic_types': ['http://schema.org/Float', + 'https://metadata.datadrivendiscovery.org/types/Attribute'], + 'custom_metadata': 'attributes', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 4], + 'metadata': { + 'name': 'petalWidth', + 'structural_type': 'float', + 'semantic_types': ['http://schema.org/Float', + 'https://metadata.datadrivendiscovery.org/types/Attribute'], + 'custom_metadata': 'attributes', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 5], + 'metadata': { + 'name': 'species', + 'structural_type': 'str', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/CategoricalData', + 'https://metadata.datadrivendiscovery.org/types/SuggestedTarget', + 'https://metadata.datadrivendiscovery.org/types/Target', + 'https://metadata.datadrivendiscovery.org/types/TrueTarget'], + 'custom_metadata': 'targets', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 6], + 'metadata': { + 'structural_type': 'str', + 'name': 'species', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/CategoricalData', + 'https://metadata.datadrivendiscovery.org/types/SuggestedTarget', + 'https://metadata.datadrivendiscovery.org/types/Target', + 'https://metadata.datadrivendiscovery.org/types/PredictedTarget'], + 'custom_metadata': 'targets', + }, + }]) + + def test_return_new(self): + dataframe, attributes, targets = self._get_iris_columns() + + hyperparams_class = \ + xgboost_gbtree.XGBoostGBTreeClassifierPrimitive.metadata.query()['primitive_code']['class_type_arguments'][ + 'Hyperparams'] + primitive = xgboost_gbtree.XGBoostGBTreeClassifierPrimitive( + hyperparams=hyperparams_class.defaults().replace({'return_result': 'new'})) + + primitive.set_training_data(inputs=dataframe, outputs=dataframe) + primitive.fit() + + predictions = primitive.produce(inputs=dataframe).value + + self.assertEqual(list(predictions.columns), [ + 'd3mIndex', + 'species', + ]) + + self.assertEqual(predictions.shape, (150, 2)) + self.assertEqual(predictions.iloc[0, 1], 'Iris-setosa') + self.assertTrue(predictions.metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 1), + 'https://metadata.datadrivendiscovery.org/types/PredictedTarget')) + self.assertFalse(predictions.metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 1), + 'https://metadata.datadrivendiscovery.org/types/TrueTarget')) + self.assertEqual(predictions.metadata.query_column(1)['name'], 'species') + self.assertEqual(predictions.metadata.query_column(1)['custom_metadata'], 'targets') + + 
self._test_return_new_metadata(predictions.metadata) + + def _test_return_new_metadata(self, predictions_metadata): + expected_metadata = [{ + 'selector': [], + 'metadata': { + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 150, + }, + 'schema': 'https://metadata.datadrivendiscovery.org/schemas/v0/container.json', + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 2, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': { + 'name': 'd3mIndex', + 'structural_type': 'int', + 'semantic_types': ['http://schema.org/Integer', + 'https://metadata.datadrivendiscovery.org/types/PrimaryKey'], + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': { + 'structural_type': 'str', + 'name': 'species', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/CategoricalData', + 'https://metadata.datadrivendiscovery.org/types/SuggestedTarget', + 'https://metadata.datadrivendiscovery.org/types/Target', + 'https://metadata.datadrivendiscovery.org/types/PredictedTarget'], + 'custom_metadata': 'targets', + }, + }] + + self.assertEqual(utils.to_json_structure(predictions_metadata.to_internal_simple_structure()), expected_metadata) + + def test_return_replace(self): + dataframe, attributes, targets = self._get_iris_columns() + + hyperparams_class = \ + xgboost_gbtree.XGBoostGBTreeClassifierPrimitive.metadata.query()['primitive_code']['class_type_arguments'][ + 'Hyperparams'] + primitive = xgboost_gbtree.XGBoostGBTreeClassifierPrimitive( + hyperparams=hyperparams_class.defaults().replace({'return_result': 'replace'})) + + primitive.set_training_data(inputs=dataframe, outputs=dataframe) + primitive.fit() + + predictions = primitive.produce(inputs=dataframe).value + + self.assertEqual(list(predictions.columns), [ + 'd3mIndex', + 'species', + 'species', + ]) + + self.assertEqual(predictions.shape, (150, 3)) + self.assertEqual(predictions.iloc[0, 1], 'Iris-setosa') + self.assertTrue(predictions.metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 1), + 'https://metadata.datadrivendiscovery.org/types/PredictedTarget')) + self.assertFalse(predictions.metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 1), + 'https://metadata.datadrivendiscovery.org/types/TrueTarget')) + self.assertEqual(predictions.metadata.query_column(1)['name'], 'species') + self.assertEqual(predictions.metadata.query_column(1)['custom_metadata'], 'targets') + + self._test_return_replace_metadata(predictions.metadata) + + def test_pickle_unpickle(self): + dataframe, attributes, targets = self._get_iris_columns() + + hyperparams_class = \ + xgboost_gbtree.XGBoostGBTreeClassifierPrimitive.metadata.query()['primitive_code']['class_type_arguments'][ + 'Hyperparams'] + primitive = xgboost_gbtree.XGBoostGBTreeClassifierPrimitive( + hyperparams=hyperparams_class.defaults().replace({'return_result': 'new', 'add_index_columns': False})) + + primitive.set_training_data(inputs=attributes, outputs=targets) + primitive.fit() + + before_pickled_prediction = primitive.produce(inputs=attributes).value + pickle_object = pickle.dumps(primitive) + primitive = pickle.loads(pickle_object) + after_unpickled_prediction = primitive.produce(inputs=attributes).value + # try 
to pickle again to see if we load it properly + _ = pickle.dumps(primitive) + self.assertTrue(container.DataFrame.equals(before_pickled_prediction, after_unpickled_prediction)) + + def _test_return_replace_metadata(self, predictions_metadata): + self.assertEqual(utils.to_json_structure(predictions_metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 150, + }, + 'schema': 'https://metadata.datadrivendiscovery.org/schemas/v0/container.json', + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': { + 'name': 'd3mIndex', + 'structural_type': 'int', + 'semantic_types': ['http://schema.org/Integer', + 'https://metadata.datadrivendiscovery.org/types/PrimaryKey'], + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': { + 'structural_type': 'str', + 'name': 'species', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/CategoricalData', + 'https://metadata.datadrivendiscovery.org/types/SuggestedTarget', + 'https://metadata.datadrivendiscovery.org/types/Target', + 'https://metadata.datadrivendiscovery.org/types/PredictedTarget'], + 'custom_metadata': 'targets', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': { + 'name': 'species', + 'structural_type': 'str', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/CategoricalData', + 'https://metadata.datadrivendiscovery.org/types/SuggestedTarget', + 'https://metadata.datadrivendiscovery.org/types/Target', + 'https://metadata.datadrivendiscovery.org/types/TrueTarget'], + 'custom_metadata': 'targets', + }, + }]) + + +if __name__ == '__main__': + unittest.main() diff --git a/tods/common-primitives/tests/test_xgboost_regressor.py b/tods/common-primitives/tests/test_xgboost_regressor.py new file mode 100644 index 0000000..d513cc1 --- /dev/null +++ b/tods/common-primitives/tests/test_xgboost_regressor.py @@ -0,0 +1,617 @@ +import os +import pickle +import unittest + +from sklearn.metrics import mean_squared_error + +from d3m import container, utils +from d3m.metadata import base as metadata_base + +from common_primitives import dataset_to_dataframe, extract_columns_semantic_types, xgboost_regressor, column_parser + + +class XGBoostRegressorTestCase(unittest.TestCase): + def _get_iris(self): + dataset_doc_path = os.path.abspath( + os.path.join(os.path.dirname(__file__), 'data', 'datasets', 'iris_dataset_1', 'datasetDoc.json')) + + dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path)) + + hyperparams_class = \ + dataset_to_dataframe.DatasetToDataFramePrimitive.metadata.query()['primitive_code']['class_type_arguments'][ + 'Hyperparams'] + primitive = dataset_to_dataframe.DatasetToDataFramePrimitive(hyperparams=hyperparams_class.defaults()) + + dataframe = primitive.produce(inputs=dataset).value + + return dataframe + + def _get_iris_columns(self): + dataframe = self._get_iris() + col_index_list = list(range(len(dataframe.columns))) + _, target = col_index_list.pop(0), col_index_list.pop(3) + original_target_col = 5 + # We set custom metadata on columns. 
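+        # For the regressor tests, petalWidth (column 4) is used as the numeric target,
+        # while the remaining value columns (including species) are marked as attributes.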
+ for column_index in col_index_list: + dataframe.metadata = dataframe.metadata.update_column(column_index, {'custom_metadata': 'attributes'}) + dataframe.metadata = dataframe.metadata.update_column(target, {'custom_metadata': 'targets'}) + dataframe.metadata = dataframe.metadata.remove_semantic_type((metadata_base.ALL_ELEMENTS, target), + 'https://metadata.datadrivendiscovery.org/types/Attribute') + dataframe.metadata = dataframe.metadata.add_semantic_type((metadata_base.ALL_ELEMENTS, original_target_col), + 'https://metadata.datadrivendiscovery.org/types/Attribute') + # We set semantic types like runtime would. + dataframe.metadata = dataframe.metadata.add_semantic_type((metadata_base.ALL_ELEMENTS, target), + 'https://metadata.datadrivendiscovery.org/types/Target') + dataframe.metadata = dataframe.metadata.add_semantic_type((metadata_base.ALL_ELEMENTS, target), + 'https://metadata.datadrivendiscovery.org/types/TrueTarget') + dataframe.metadata = dataframe.metadata.remove_semantic_type((metadata_base.ALL_ELEMENTS, target), 'https://metadata.datadrivendiscovery.org/types/Attribute') + + # Parsing. + hyperparams_class = \ + column_parser.ColumnParserPrimitive.metadata.query()['primitive_code']['class_type_arguments'][ + 'Hyperparams'] + primitive = column_parser.ColumnParserPrimitive(hyperparams=hyperparams_class.defaults()) + dataframe = primitive.produce(inputs=dataframe).value + + hyperparams_class = \ + extract_columns_semantic_types.ExtractColumnsBySemanticTypesPrimitive.metadata.query()['primitive_code'][ + 'class_type_arguments']['Hyperparams'] + + primitive = extract_columns_semantic_types.ExtractColumnsBySemanticTypesPrimitive( + hyperparams=hyperparams_class.defaults().replace( + {'semantic_types': ('https://metadata.datadrivendiscovery.org/types/Attribute',)})) + attributes = primitive.produce(inputs=dataframe).value + + primitive = extract_columns_semantic_types.ExtractColumnsBySemanticTypesPrimitive( + hyperparams=hyperparams_class.defaults().replace( + {'semantic_types': ('https://metadata.datadrivendiscovery.org/types/TrueTarget',)})) + targets = primitive.produce(inputs=dataframe).value + + return dataframe, attributes, targets + + def test_single_target(self): + dataframe, attributes, targets = self._get_iris_columns() + + hyperparams_class = \ + xgboost_regressor.XGBoostGBTreeRegressorPrimitive.metadata.query()['primitive_code'][ + 'class_type_arguments']['Hyperparams'] + primitive = xgboost_regressor.XGBoostGBTreeRegressorPrimitive( + hyperparams=hyperparams_class.defaults().replace({'return_result': 'new', 'add_index_columns': False})) + + primitive.set_training_data(inputs=attributes, outputs=targets) + primitive.fit() + + predictions = primitive.produce(inputs=attributes).value + mse = mean_squared_error(targets, predictions) + self.assertLessEqual(mse, 0.01) + self.assertEqual(predictions.shape, (150, 1)) + self.assertTrue(predictions.metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 0), + 'https://metadata.datadrivendiscovery.org/types/PredictedTarget')) + self.assertFalse(predictions.metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 0), + 'https://metadata.datadrivendiscovery.org/types/TrueTarget')) + self.assertEqual(predictions.metadata.query_column(0)['name'], 'petalWidth') + self.assertEqual(predictions.metadata.query_column(0)['custom_metadata'], 'targets') + + self._test_single_target_metadata(predictions.metadata) + + samples = primitive.sample(inputs=attributes).value + + self.assertEqual(len(samples), 1) + self.assertEqual(samples[0].shape, 
(150, 1)) + self.assertTrue(samples[0].metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 0), + 'https://metadata.datadrivendiscovery.org/types/PredictedTarget')) + self.assertFalse(samples[0].metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 0), + 'https://metadata.datadrivendiscovery.org/types/TrueTarget')) + self.assertEqual(samples[0].metadata.query_column(0)['name'], 'petalWidth') + self.assertEqual(samples[0].metadata.query_column(0)['custom_metadata'], 'targets') + + def test_single_target_continue(self): + dataframe, attributes, targets = self._get_iris_columns() + + hyperparams_class = \ + xgboost_regressor.XGBoostGBTreeRegressorPrimitive.metadata.query()['primitive_code'][ + 'class_type_arguments'][ + 'Hyperparams'] + primitive = xgboost_regressor.XGBoostGBTreeRegressorPrimitive( + hyperparams=hyperparams_class.defaults().replace({'return_result': 'new', 'add_index_columns': False})) + + primitive.set_training_data(inputs=attributes, outputs=targets) + primitive.fit() + # reset the training data to make continue_fit() work. + primitive.set_training_data(inputs=attributes, outputs=targets) + primitive.continue_fit() + params = primitive.get_params() + self.assertEqual(params['booster'].best_ntree_limit, + primitive.hyperparams['n_estimators'] + primitive.hyperparams['n_more_estimators']) + predictions = primitive.produce(inputs=attributes).value + mse = mean_squared_error(targets, predictions) + self.assertLessEqual(mse, 0.01) + self.assertEqual(predictions.shape, (150, 1)) + self.assertTrue(predictions.metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 0), + 'https://metadata.datadrivendiscovery.org/types/PredictedTarget')) + self.assertFalse(predictions.metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 0), + 'https://metadata.datadrivendiscovery.org/types/TrueTarget')) + self.assertEqual(predictions.metadata.query_column(0)['name'], 'petalWidth') + self.assertEqual(predictions.metadata.query_column(0)['custom_metadata'], 'targets') + + self._test_single_target_metadata(predictions.metadata) + + samples = primitive.sample(inputs=attributes).value + + self.assertEqual(len(samples), 1) + self.assertEqual(samples[0].shape, (150, 1)) + self.assertTrue(samples[0].metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 0), + 'https://metadata.datadrivendiscovery.org/types/PredictedTarget')) + self.assertFalse(samples[0].metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 0), + 'https://metadata.datadrivendiscovery.org/types/TrueTarget')) + self.assertEqual(samples[0].metadata.query_column(0)['name'], 'petalWidth') + self.assertEqual(samples[0].metadata.query_column(0)['custom_metadata'], 'targets') + + def _test_single_target_metadata(self, predictions_metadata): + expected_metadata = [{ + 'selector': [], + 'metadata': { + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 150, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 1, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': { + 'structural_type': 'float', + 'name': 'petalWidth', + 'semantic_types': ['http://schema.org/Float', + 
'https://metadata.datadrivendiscovery.org/types/Target', + 'https://metadata.datadrivendiscovery.org/types/PredictedTarget'], + 'custom_metadata': 'targets', + }, + }] + + self.assertEqual(utils.to_json_structure(predictions_metadata.to_internal_simple_structure()), expected_metadata) + + def test_multiple_targets(self): + dataframe, attributes, targets = self._get_iris_columns() + + targets = targets.append_columns(targets) + + hyperparams_class = \ + xgboost_regressor.XGBoostGBTreeRegressorPrimitive.metadata.query()['primitive_code'][ + 'class_type_arguments']['Hyperparams'] + primitive = xgboost_regressor.XGBoostGBTreeRegressorPrimitive( + hyperparams=hyperparams_class.defaults().replace({'return_result': 'new', 'add_index_columns': False})) + + primitive.set_training_data(inputs=attributes, outputs=targets) + primitive.fit() + + predictions = primitive.produce(inputs=attributes).value + mse = mean_squared_error(targets, predictions) + self.assertLessEqual(mse, 0.01) + + self.assertEqual(predictions.shape, (150, 2)) + for column_index in range(2): + self.assertTrue(predictions.metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, column_index), + 'https://metadata.datadrivendiscovery.org/types/PredictedTarget')) + self.assertFalse(predictions.metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, column_index), + 'https://metadata.datadrivendiscovery.org/types/TrueTarget')) + self.assertEqual(predictions.metadata.query_column(column_index)['name'], 'petalWidth') + self.assertEqual(predictions.metadata.query_column(column_index)['custom_metadata'], 'targets') + + samples = primitive.sample(inputs=attributes).value + + self.assertEqual(len(samples), 1) + self.assertEqual(samples[0].shape, (150, 2)) + for column_index in range(2): + self.assertTrue(samples[0].metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, column_index), + 'https://metadata.datadrivendiscovery.org/types/PredictedTarget')) + self.assertFalse(samples[0].metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, column_index), + 'https://metadata.datadrivendiscovery.org/types/TrueTarget')) + self.assertEqual(samples[0].metadata.query_column(column_index)['name'], 'petalWidth') + self.assertEqual(samples[0].metadata.query_column(column_index)['custom_metadata'], 'targets') + + feature_importances = primitive.produce_feature_importances().value + + self.assertEqual(feature_importances.values.tolist(), + [[0.0049971588887274265, + 0.006304567214101553, + 0.27505698800086975, + 0.7136412858963013]]) + + def test_multiple_targets_continue(self): + dataframe, attributes, targets = self._get_iris_columns() + second_targets = targets.copy() + second_targets.rename(columns={'petalWidth': 't-petalWidth'}, inplace=True) + second_targets.metadata = second_targets.metadata.update_column(0, {'name': 't-petalWidth'}) + targets = targets.append_columns(second_targets) + + hyperparams_class = \ + xgboost_regressor.XGBoostGBTreeRegressorPrimitive.metadata.query()['primitive_code'][ + 'class_type_arguments']['Hyperparams'] + primitive = xgboost_regressor.XGBoostGBTreeRegressorPrimitive( + hyperparams=hyperparams_class.defaults().replace({'return_result': 'new', 'add_index_columns': False})) + + primitive.set_training_data(inputs=attributes, outputs=targets) + primitive.fit() + # Set training data again to make continue_fit work + primitive.set_training_data(inputs=attributes, outputs=targets) + primitive.continue_fit() + params = primitive.get_params() + for estimator in params['estimators']: + 
self.assertEqual(estimator.get_booster().best_ntree_limit, + primitive.hyperparams['n_estimators'] + primitive.hyperparams['n_more_estimators']) + + predictions = primitive.produce(inputs=attributes).value + mse = mean_squared_error(targets, predictions) + self.assertLessEqual(mse, 0.01) + self.assertEqual(predictions.shape, (150, 2)) + + self.assertTrue(predictions.metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 0), + 'https://metadata.datadrivendiscovery.org/types/PredictedTarget')) + self.assertFalse(predictions.metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 0), + 'https://metadata.datadrivendiscovery.org/types/TrueTarget')) + self.assertEqual(predictions.metadata.query_column(0)['name'], 'petalWidth') + self.assertEqual(predictions.metadata.query_column(0)['custom_metadata'], 'targets') + self.assertTrue(predictions.metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 1), + 'https://metadata.datadrivendiscovery.org/types/PredictedTarget')) + self.assertFalse(predictions.metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 1), + 'https://metadata.datadrivendiscovery.org/types/TrueTarget')) + self.assertEqual(predictions.metadata.query_column(1)['name'], 't-petalWidth') + self.assertEqual(predictions.metadata.query_column(1)['custom_metadata'], 'targets') + + samples = primitive.sample(inputs=attributes).value + + self.assertEqual(len(samples), 1) + self.assertEqual(samples[0].shape, (150, 2)) + self.assertTrue(samples[0].metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 0), + 'https://metadata.datadrivendiscovery.org/types/PredictedTarget')) + self.assertFalse(samples[0].metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 0), + 'https://metadata.datadrivendiscovery.org/types/TrueTarget')) + self.assertEqual(samples[0].metadata.query_column(0)['name'], 'petalWidth') + self.assertEqual(samples[0].metadata.query_column(0)['custom_metadata'], 'targets') + self.assertTrue(samples[0].metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 1), + 'https://metadata.datadrivendiscovery.org/types/PredictedTarget')) + self.assertFalse(samples[0].metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 1), + 'https://metadata.datadrivendiscovery.org/types/TrueTarget')) + self.assertEqual(samples[0].metadata.query_column(1)['name'], 't-petalWidth') + self.assertEqual(samples[0].metadata.query_column(1)['custom_metadata'], 'targets') + + feature_importances = primitive.produce_feature_importances().value + + self.assertEqual(feature_importances.values.tolist(), + [[0.003233343129977584, + 0.003926052246242762, + 0.19553671777248383, + 0.7973038554191589]]) + + def test_semantic_types(self): + dataframe, attributes, targets = self._get_iris_columns() + + hyperparams_class = \ + xgboost_regressor.XGBoostGBTreeRegressorPrimitive.metadata.query()['primitive_code'][ + 'class_type_arguments']['Hyperparams'] + primitive = xgboost_regressor.XGBoostGBTreeRegressorPrimitive( + hyperparams=hyperparams_class.defaults().replace({'return_result': 'new', 'add_index_columns': False})) + + primitive.set_training_data(inputs=dataframe, outputs=dataframe) + primitive.fit() + + predictions = primitive.produce(inputs=dataframe).value + + self.assertEqual(predictions.shape, (150, 1)) + self.assertTrue(predictions.metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 0), + 'https://metadata.datadrivendiscovery.org/types/PredictedTarget')) + self.assertFalse(predictions.metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 0), + 'https://metadata.datadrivendiscovery.org/types/TrueTarget')) 
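+        # The first output column keeps the original target name ('petalWidth');
+        # the second carries the renamed copy ('t-petalWidth') checked below.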
+ self.assertEqual(predictions.metadata.query_column(0)['name'], 'petalWidth') + self.assertEqual(predictions.metadata.query_column(0)['custom_metadata'], 'targets') + + samples = primitive.sample(inputs=attributes).value + + self.assertEqual(len(samples), 1) + self.assertEqual(samples[0].shape, (150, 1)) + self.assertTrue(samples[0].metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 0), + 'https://metadata.datadrivendiscovery.org/types/PredictedTarget')) + self.assertFalse(samples[0].metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 0), + 'https://metadata.datadrivendiscovery.org/types/TrueTarget')) + self.assertEqual(samples[0].metadata.query_column(0)['name'], 'petalWidth') + self.assertEqual(samples[0].metadata.query_column(0)['custom_metadata'], 'targets') + + feature_importances = primitive.produce_feature_importances().value + + self.assertEqual(feature_importances.values.tolist(), + [[0.0049971588887274265, + 0.006304567214101553, + 0.27505698800086975, + 0.7136412858963013]]) + + def test_return_append(self): + dataframe, attributes, targets = self._get_iris_columns() + + hyperparams_class = \ + xgboost_regressor.XGBoostGBTreeRegressorPrimitive.metadata.query()['primitive_code'][ + 'class_type_arguments']['Hyperparams'] + primitive = xgboost_regressor.XGBoostGBTreeRegressorPrimitive(hyperparams=hyperparams_class.defaults()) + + primitive.set_training_data(inputs=dataframe, outputs=dataframe) + primitive.fit() + + predictions = primitive.produce(inputs=dataframe).value + + self.assertEqual(predictions.shape, (150, 7)) + self.assertTrue(predictions.metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 6), + 'https://metadata.datadrivendiscovery.org/types/PredictedTarget')) + self.assertFalse(predictions.metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 6), + 'https://metadata.datadrivendiscovery.org/types/TrueTarget')) + self.assertEqual(predictions.metadata.query_column(6)['name'], 'petalWidth') + self.assertEqual(predictions.metadata.query_column(6)['custom_metadata'], 'targets') + + self._test_return_append_metadata(predictions.metadata) + + def _test_return_append_metadata(self, predictions_metadata): + self.assertEqual(utils.to_json_structure(predictions_metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 150, + }, + 'schema': 'https://metadata.datadrivendiscovery.org/schemas/v0/container.json', + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 7, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': { + 'name': 'd3mIndex', + 'structural_type': 'int', + 'semantic_types': ['http://schema.org/Integer', + 'https://metadata.datadrivendiscovery.org/types/PrimaryKey'], + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': { + 'name': 'sepalLength', + 'structural_type': 'float', + 'semantic_types': ['http://schema.org/Float', + 'https://metadata.datadrivendiscovery.org/types/Attribute'], + 'custom_metadata': 'attributes', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': { + 'name': 'sepalWidth', + 'structural_type': 'float', + 'semantic_types': ['http://schema.org/Float', + 
'https://metadata.datadrivendiscovery.org/types/Attribute'], + 'custom_metadata': 'attributes', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 3], + 'metadata': { + 'name': 'petalLength', + 'structural_type': 'float', + 'semantic_types': ['http://schema.org/Float', + 'https://metadata.datadrivendiscovery.org/types/Attribute'], + 'custom_metadata': 'attributes', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 4], + 'metadata': { + 'name': 'petalWidth', + 'structural_type': 'float', + 'semantic_types': ['http://schema.org/Float', + 'https://metadata.datadrivendiscovery.org/types/Target', + 'https://metadata.datadrivendiscovery.org/types/TrueTarget'], + 'custom_metadata': 'targets', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 5], + 'metadata': { + 'name': 'species', + 'structural_type': 'int', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/CategoricalData', + 'https://metadata.datadrivendiscovery.org/types/SuggestedTarget', + 'https://metadata.datadrivendiscovery.org/types/Attribute', ], + 'custom_metadata': 'attributes', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 6], + 'metadata': { + 'structural_type': 'float', + 'name': 'petalWidth', + 'semantic_types': ['http://schema.org/Float', + 'https://metadata.datadrivendiscovery.org/types/Target', + 'https://metadata.datadrivendiscovery.org/types/PredictedTarget'], + 'custom_metadata': 'targets', + }, + }]) + + def test_return_new(self): + dataframe, attributes, targets = self._get_iris_columns() + + hyperparams_class = \ + xgboost_regressor.XGBoostGBTreeRegressorPrimitive.metadata.query()['primitive_code'][ + 'class_type_arguments']['Hyperparams'] + primitive = xgboost_regressor.XGBoostGBTreeRegressorPrimitive( + hyperparams=hyperparams_class.defaults().replace({'return_result': 'new'})) + + primitive.set_training_data(inputs=dataframe, outputs=dataframe) + primitive.fit() + + predictions = primitive.produce(inputs=dataframe).value + + self.assertEqual(predictions.shape, (150, 2)) + self.assertTrue(predictions.metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 1), + 'https://metadata.datadrivendiscovery.org/types/PredictedTarget')) + self.assertFalse(predictions.metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 1), + 'https://metadata.datadrivendiscovery.org/types/TrueTarget')) + self.assertEqual(predictions.metadata.query_column(1)['name'], 'petalWidth') + self.assertEqual(predictions.metadata.query_column(1)['custom_metadata'], 'targets') + + self._test_return_new_metadata(predictions.metadata) + + def _test_return_new_metadata(self, predictions_metadata): + expected_metadata = [{ + 'selector': [], + 'metadata': { + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 150, + }, + 'schema': 'https://metadata.datadrivendiscovery.org/schemas/v0/container.json', + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 2, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': { + 'name': 'd3mIndex', + 'structural_type': 'int', + 'semantic_types': ['http://schema.org/Integer', + 'https://metadata.datadrivendiscovery.org/types/PrimaryKey'], + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': { + 'structural_type': 'float', + 'name': 
'petalWidth', + 'semantic_types': ['http://schema.org/Float', + 'https://metadata.datadrivendiscovery.org/types/Target', + 'https://metadata.datadrivendiscovery.org/types/PredictedTarget'], + 'custom_metadata': 'targets', + }, + }] + + self.assertEqual(utils.to_json_structure(predictions_metadata.to_internal_simple_structure()), expected_metadata) + + def test_return_replace(self): + dataframe, attributes, targets = self._get_iris_columns() + + hyperparams_class = \ + xgboost_regressor.XGBoostGBTreeRegressorPrimitive.metadata.query()['primitive_code'][ + 'class_type_arguments']['Hyperparams'] + primitive = xgboost_regressor.XGBoostGBTreeRegressorPrimitive( + hyperparams=hyperparams_class.defaults().replace({'return_result': 'replace'})) + + primitive.set_training_data(inputs=dataframe, outputs=dataframe) + primitive.fit() + + predictions = primitive.produce(inputs=dataframe).value + + self.assertEqual(predictions.shape, (150, 3)) + self.assertTrue(predictions.metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 1), + 'https://metadata.datadrivendiscovery.org/types/PredictedTarget')) + self.assertFalse(predictions.metadata.has_semantic_type((metadata_base.ALL_ELEMENTS, 1), + 'https://metadata.datadrivendiscovery.org/types/TrueTarget')) + self.assertEqual(predictions.metadata.query_column(1)['name'], 'petalWidth') + self.assertEqual(predictions.metadata.query_column(1)['custom_metadata'], 'targets') + + self._test_return_replace_metadata(predictions.metadata) + + def test_pickle_unpickle(self): + dataframe, attributes, targets = self._get_iris_columns() + + hyperparams_class = \ + xgboost_regressor.XGBoostGBTreeRegressorPrimitive.metadata.query()['primitive_code'][ + 'class_type_arguments'][ + 'Hyperparams'] + primitive = xgboost_regressor.XGBoostGBTreeRegressorPrimitive( + hyperparams=hyperparams_class.defaults().replace({'return_result': 'new', 'add_index_columns': False})) + + primitive.set_training_data(inputs=attributes, outputs=targets) + primitive.fit() + + before_pickled_prediction = primitive.produce(inputs=attributes).value + pickle_object = pickle.dumps(primitive) + primitive = pickle.loads(pickle_object) + after_unpickled_prediction = primitive.produce(inputs=attributes).value + self.assertTrue(container.DataFrame.equals(before_pickled_prediction, after_unpickled_prediction)) + + def _test_return_replace_metadata(self, predictions_metadata): + self.assertEqual(utils.to_json_structure(predictions_metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 150, + }, + 'schema': 'https://metadata.datadrivendiscovery.org/schemas/v0/container.json', + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': { + 'name': 'd3mIndex', + 'structural_type': 'int', + 'semantic_types': ['http://schema.org/Integer', + 'https://metadata.datadrivendiscovery.org/types/PrimaryKey'], + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': { + 'structural_type': 'float', + 'name': 'petalWidth', + 'semantic_types': ['http://schema.org/Float', + 'https://metadata.datadrivendiscovery.org/types/Target', + 
'https://metadata.datadrivendiscovery.org/types/PredictedTarget'], + 'custom_metadata': 'targets', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': { + 'name': 'petalWidth', + 'structural_type': 'float', + 'semantic_types': ['http://schema.org/Float', + 'https://metadata.datadrivendiscovery.org/types/Target', + 'https://metadata.datadrivendiscovery.org/types/TrueTarget'], + 'custom_metadata': 'targets', + }, + }]) + + +if __name__ == '__main__': + unittest.main() diff --git a/tods/common-primitives/tests/utils.py b/tods/common-primitives/tests/utils.py new file mode 100644 index 0000000..18dc51c --- /dev/null +++ b/tods/common-primitives/tests/utils.py @@ -0,0 +1,112 @@ +import json +import os + +from d3m import utils, container +from d3m.metadata import base as metadata_base + +from common_primitives import dataset_to_dataframe + + +def convert_metadata(metadata): + return json.loads(json.dumps(metadata, cls=utils.JsonEncoder)) + + +def load_iris_metadata(): + dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'datasets', 'iris_dataset_1', 'datasetDoc.json')) + dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path)) + return dataset + + +def test_iris_metadata(test_obj, metadata, structural_type, rows_structural_type=None): + test_obj.maxDiff = None + + test_obj.assertEqual(convert_metadata(metadata.query(())), { + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': structural_type, + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/Table', + ], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 150, + } + }) + + if rows_structural_type is None: + test_obj.assertEqual(convert_metadata(metadata.query((metadata_base.ALL_ELEMENTS,))), { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 6, + } + }) + else: + test_obj.assertEqual(convert_metadata(metadata.query((metadata_base.ALL_ELEMENTS,))), { + 'structural_type': rows_structural_type, + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 6, + } + }) + + test_obj.assertEqual(convert_metadata(metadata.query((metadata_base.ALL_ELEMENTS, 0))), { + 'name': 'd3mIndex', + 'structural_type': 'str', + 'semantic_types': [ + 'http://schema.org/Integer', + 'https://metadata.datadrivendiscovery.org/types/PrimaryKey', + ], + }) + + for i in range(1, 5): + test_obj.assertEqual(convert_metadata(metadata.query((metadata_base.ALL_ELEMENTS, i))), { + 'name': ['sepalLength', 'sepalWidth', 'petalLength', 'petalWidth'][i - 1], + 'structural_type': 'str', + 'semantic_types': [ + 'http://schema.org/Float', + 'https://metadata.datadrivendiscovery.org/types/Attribute', + ], + }, i) + + test_obj.assertEqual(convert_metadata(metadata.query((metadata_base.ALL_ELEMENTS, 5))), { + 'name': 'species', + 'structural_type': 'str', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/CategoricalData', + 'https://metadata.datadrivendiscovery.org/types/SuggestedTarget', + 'https://metadata.datadrivendiscovery.org/types/Attribute', + ], + }) + + +def convert_through_json(data): + return json.loads(json.dumps(data, cls=utils.JsonEncoder)) + + +def normalize_semantic_types(data): + if isinstance(data, dict): + if 'semantic_types' in data: + # We sort them so that it is easier to compare 
them. + data['semantic_types'] = sorted(data['semantic_types']) + + return {key: normalize_semantic_types(value) for key, value in data.items()} + + return data + + +def effective_metadata(metadata): + output = metadata.to_json_structure() + + for entry in output: + entry['metadata'] = normalize_semantic_types(entry['metadata']) + + return output + + +def get_dataframe(dataset): + dataset_hyperparams_class = dataset_to_dataframe.DatasetToDataFramePrimitive.metadata.get_hyperparams() + dataframe_primitive = dataset_to_dataframe.DatasetToDataFramePrimitive(hyperparams=dataset_hyperparams_class.defaults()) + dataframe = dataframe_primitive.produce(inputs=dataset).value + return dataframe diff --git a/tods/data_processing/CategoricalToBinary.py b/tods/data_processing/CategoricalToBinary.py new file mode 100644 index 0000000..d3e1009 --- /dev/null +++ b/tods/data_processing/CategoricalToBinary.py @@ -0,0 +1,395 @@ +import os +import typing +import pandas as pd +import numpy as np + + +from d3m import container, utils +from d3m.base import utils as base_utils +from d3m.metadata import base as metadata_base, hyperparams +from d3m.primitive_interfaces import base, transformer +from d3m.primitive_interfaces.base import CallResult, DockerContainer + + +import common_primitives +import logging +import math + +from typing import cast, Dict, List, Union, Sequence, Optional, Tuple +from collections import OrderedDict +from scipy import sparse +from numpy import ndarray + +__all__ = ('CategoricalToBinary',) + +Inputs = container.DataFrame +Outputs = container.DataFrame + + +class Hyperparams(hyperparams.Hyperparams): + + # parameters for column + use_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to operate on. If any specified column cannot be parsed, it is skipped.", + ) + exclude_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='new', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. 
Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.", + ) + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', + 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute'], + default='https://metadata.datadrivendiscovery.org/types/Attribute', + description='Decides what semantic type to attach to generated attributes', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + +class Cat2B: + def __init__(self): + pass + + def produce(self, inputs): + + # print("input",inputs) + # print(type(inputs)) + dataframe = inputs + processed_df = utils.pandas.DataFrame() + for target_column in dataframe.columns : + try: + req_col = pd.DataFrame(dataframe.loc[:,target_column]) + categories = req_col[target_column].unique() + + column_names = [target_column+'_'+str(i) for i in categories] + column_dtype = req_col[target_column].dtype + + if column_dtype== np.object: + for i,j in zip(categories,column_names): + if i is not None: + req_col.loc[req_col[target_column]==i,j] = "1" + req_col.loc[req_col[target_column]!=i,j] = "0" + else: + req_col.loc[req_col[target_column].isna()==False,j] = "0" + req_col.loc[req_col[target_column].isna()==True,j] = None + + else: + for i,j in zip(categories,column_names): + if not math.isnan(i): + req_col.loc[req_col[target_column]==i,j] = "1" + req_col.loc[req_col[target_column]!=i,j] = "0" + else: + req_col.loc[req_col[target_column].isna()==False,j] = "0" + req_col.loc[req_col[target_column].isna()==True,j] = np.nan + + processed_df[column_names] = req_col[column_names] + except KeyError: + logging.warning("Target Column "+ target_column+" Not Found in Dataframe") + + return processed_df; + +class CategoricalToBinary(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): + """ + A primitive which will convert all the distinct values present in a column to a binary represntation with each distinct value having a different column. + + + Parameters + ---------- + use_columns: Set + A set of column indices to force primitive to operate on. If any specified column cannot be parsed, it is skipped. + + exclude_columns: Set + A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided. + + return_result: Enumeration + Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false. + + use_semantic_types: Bool + Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe. + + add_index_columns: Bool + Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\". + + error_on_no_input: Bool( + Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False. 
+ + return_semantic_type: Enumeration[str]( + Decides what semantic type to attach to generated attributes' + """ + + __author__ = "DATA LAB" + metadata = metadata_base.PrimitiveMetadata( + { + "__author__ " : "DATA Lab at Texas A&M University", + 'name': "Converting Categorical to Binary", + 'python_path': 'd3m.primitives.tods.data_processing.categorical_to_binary', + 'source': { + 'name': 'DATA Lab at Texas A&M University', + 'contact': 'mailto:khlai037@tamu.edu', + 'uris': [ + 'https://gitlab.com/lhenry15/tods.git', + 'https://gitlab.com/lhenry15/tods/-/blob/purav/anomaly-primitives/anomaly_primitives/CategoricalToBinaryDataframe.py', + ], + }, + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.CATEGORICAL_TO_BINARY, + ], + 'primitive_family': metadata_base.PrimitiveFamily.DATA_PREPROCESSING, + 'id': 'bb6fb64d-cf20-45f0-8c4b-d7218f9c58c2', + 'hyperparameters_to_tune':"None", + 'version': '0.0.1', + }, + ) + + def __init__(self, *, hyperparams: Hyperparams) -> None: + super().__init__(hyperparams=hyperparams) + + self._clf = Cat2B() + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + """ + Args: + inputs: Container DataFrame + + Returns: + Container DataFrame added with binary version of a column a sort of one hot encoding of values under different columns + named as "column name_category value" for all the columns passed in list while building the pipeline + """ + + assert isinstance(inputs, container.DataFrame), type(dataframe) + + self._fitted = False + self._training_inputs, self._training_indices = self._get_columns_to_fit(inputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + if len(self._training_indices) > 0: + # self._clf.fit(self._training_inputs) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + + if not self._fitted: + raise PrimitiveNotFittedError("Primitive not fitted.") + + sk_inputs = inputs + if self.hyperparams['use_semantic_types']: + cols = [inputs.columns[x] for x in self._training_indices] + sk_inputs = container.DataFrame(data = inputs.iloc[:, self._training_indices].values,columns = cols, generate_metadata=True) + + output_columns = [] + if len(self._training_indices) > 0: + sk_output = self._clf.produce(sk_inputs) + # print("sk_ouput",sk_output) + if sparse.issparse(sk_output): + sk_output = sk_output.toarray() + outputs = self._wrap_predictions(inputs, sk_output) + # if len(outputs.columns) == len(self._input_column_names): + # outputs.columns = self._input_column_names + output_columns = [outputs] + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._training_indices, + columns_list=output_columns) + + # self._update_metadata(outputs) + return base.CallResult(outputs) + + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + """ + Select columns to fit. 
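+        When `use_semantic_types` is False every column is selected; otherwise only
+        the columns accepted by `_can_produce_column` are used.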
+ Args: + inputs: Container DataFrame + hyperparams: d3m.metadata.hyperparams.Hyperparams + + Returns: + list + """ + + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + # return inputs, list(hyperparams['use_columns']) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=hyperparams['use_columns'], + exclude_columns=hyperparams['exclude_columns'], + can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, + hyperparams: Hyperparams) -> bool: + """ + Output whether a column can be processed. + Args: + inputs_metadata: d3m.metadata.base.DataMetadata + column_index: int + + Returns: + bool + """ + + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + accepted_structural_types = (int, float, np.integer, np.float64,str) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + print(column_index, "does not match the structural_type requirements in metadata. Skipping column") + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + + # print("length sematic type",len(semantic_types)) + # returing true for testing purposes for custom dataframes + return True; + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + print(semantic_types) + return False + + + @classmethod + def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]: + """ + Output metadata of selected columns. + Args: + outputs_metadata: metadata_base.DataMetadata + hyperparams: d3m.metadata.hyperparams.Hyperparams + + Returns: + d3m.metadata.base.DataMetadata + """ + + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) + + # Update semantic types and prepare it for predicted targets. + semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set([]) + add_semantic_types = [] + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + """ + Updata metadata for selected columns. 
+ Args: + inputs_metadata: metadata_base.DataMetadata + outputs: Container Dataframe + target_columns_metadata: list + + Returns: + d3m.metadata.base.DataMetadata + """ + + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + """ + Wrap predictions into dataframe + Args: + inputs: Container Dataframe + predictions: array-like data (n_samples, n_features) + + Returns: + Dataframe + """ + + outputs = container.DataFrame(predictions, generate_metadata=True) + target_columns_metadata = self._add_target_columns_metadata(outputs.metadata,self.hyperparams) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, target_columns_metadata) + # print(outputs.metadata.to_internal_simple_structure()) + + return outputs + + + @classmethod + def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams): + """ + Add target columns metadata + Args: + outputs_metadata: metadata.base.DataMetadata + hyperparams: d3m.metadata.hyperparams.Hyperparams + Returns: + List[OrderedDict] + """ + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + # column_name = "output_{}".format(column_index) + column_metadata = OrderedDict() + semantic_types = set() + semantic_types.add(hyperparams["return_semantic_type"]) + column_metadata['semantic_types'] = list(semantic_types) + # column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + return target_columns_metadata + + +CategoricalToBinary.__doc__ = CategoricalToBinary.__doc__ + diff --git a/tods/data_processing/ColumnFilter.py b/tods/data_processing/ColumnFilter.py new file mode 100644 index 0000000..284366d --- /dev/null +++ b/tods/data_processing/ColumnFilter.py @@ -0,0 +1,149 @@ +import os +import sklearn +import numpy +import typing +import time +from scipy import sparse +from numpy import ndarray +from collections import OrderedDict +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple + +import numpy as np +import pandas as pd +import logging, uuid +from scipy import sparse +from numpy import ndarray +from collections import OrderedDict +from common_primitives import dataframe_utils, utils + +from d3m import utils +from d3m import container +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.container import DataFrame as d3m_dataframe +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.primitive_interfaces import base, transformer +from d3m.metadata import base as metadata_base, hyperparams +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m.primitive_interfaces.base import CallResult, DockerContainer + +from statsmodels.tsa.stattools import acf + + +# import os.path + + +__all__ = ('ColumnFilter',) + +Inputs = container.DataFrame +Outputs = container.DataFrame + + + +class Hyperparams(hyperparams.Hyperparams): + + # Keep previous + dataframe_resource = hyperparams.Hyperparameter[typing.Union[str, None]]( + default=None, + 
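+        # Carried over from the dataframe-extraction primitives ("Keep previous" above);
+        # ColumnFilter's own produce() does not read this value.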
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Resource ID of a DataFrame to extract if there are multiple tabular resources inside a Dataset and none is a dataset entry point.", + ) + use_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(2,), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to operate on. If any specified column cannot be parsed, it is skipped.", + ) + exclude_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(0,1,3,), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='append', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. 
To prevent pipelines from breaking set this to False.", + ) + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', + 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute'], + default='https://metadata.datadrivendiscovery.org/types/Attribute', + description='Decides what semantic type to attach to generated attributes', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + + + +class ColumnFilter(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): + """ + A primitive that filters out columns of wrong shape in DataFrame (specifically columns generated by some features analysis) + """ + + metadata = metadata_base.PrimitiveMetadata({ + '__author__': "DATA Lab @Texas A&M University", + 'name': "Column Filter", + 'python_path': 'd3m.primitives.tods.data_processing.column_filter', + 'source': {'name': "DATALAB @Taxes A&M University", 'contact': 'mailto:khlai037@tamu.edu', + 'uris': ['https://gitlab.com/lhenry15/tods/-/blob/Yile/tods/tods/data_processing/column_filter.py']}, + 'algorithm_types': [metadata_base.PrimitiveAlgorithmType.COLUMN_FILTER,], + 'primitive_family': metadata_base.PrimitiveFamily.DATA_PREPROCESSING, + 'id': str(uuid.uuid3(uuid.NAMESPACE_DNS, 'ColumnFilterPrimitive')), + 'version': '0.0.1', + }) + + + def __init__(self, *, hyperparams: Hyperparams) -> None: + super().__init__(hyperparams=hyperparams) + + #self._clf = column_filter() + + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + """ + Process the testing data. + Args: + inputs: Container DataFrame. + + Returns: + Container DataFrame after AutoCorrelation. + """ + outputs=inputs + index_to_keep = np.array([]) + + for i in range(len(inputs.columns)): + column_to_check = outputs.iloc[:,i] + cell_to_check = column_to_check.iloc[-1:] + + if not np.isnan(cell_to_check.values[0]): + index_to_keep=np.append(index_to_keep,i) + + outputs=outputs.iloc[:,index_to_keep] + + self._update_metadata(outputs) + + return CallResult(outputs) + + + + + def _update_metadata(self, outputs): + outputs.metadata = outputs.metadata.generate(outputs) + + diff --git a/tods/data_processing/ContinuityValidation.py b/tods/data_processing/ContinuityValidation.py new file mode 100644 index 0000000..7786552 --- /dev/null +++ b/tods/data_processing/ContinuityValidation.py @@ -0,0 +1,178 @@ +from d3m import container, exceptions +from d3m.primitive_interfaces import base, transformer +from d3m.metadata import base as metadata_base, hyperparams + +import os.path +from d3m import utils + +import time + +__all__ = ('ContinuityValidation',) + +Inputs = container.DataFrame +Outputs = container.DataFrame + + +class Hyperparams(hyperparams.Hyperparams): + continuity_option = hyperparams.Enumeration( + values=['ablation', 'imputation'], + default='imputation', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Choose ablation or imputation the original data", + ) + + interval = hyperparams.Uniform( + default = 1, + lower = 0.000000001, + upper = 10000000000, + description='Only used in imputation, give the timestamp interval.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + +class ContinuityValidation(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): + """ + Check whether the seires data is consitent in time interval and provide processing if not 
consistent. + + Parameters + ---------- + continuity_option: enumeration + Choose ablation or imputation. + ablation: delete some rows and increase timestamp interval to keep the timestamp consistent + imputation: linearly imputate the absent timestamps to keep the timestamp consistent + interval: float + Only used in imputation, give the timestamp interval. ‘interval’ should be an integral multiple of 'timestamp' or 'timestamp' should be an integral multiple of ‘interval’ + """ + + __author__: "DATA Lab at Texas A&M University" + metadata = metadata_base.PrimitiveMetadata({ + "name": "continuity validation primitive", + "python_path": "d3m.primitives.tods.data_processing.continuity_validation", + "source": {'name': 'DATA Lab at Texas A&M University', 'contact': 'mailto:khlai037@tamu.edu', + 'uris': ['https://gitlab.com/lhenry15/tods.git', 'https://gitlab.com/lhenry15/tods/-/blob/Junjie/anomaly-primitives/anomaly_primitives/ContinuityValidation.py']}, + "algorithm_types": [metadata_base.PrimitiveAlgorithmType.CONTINUITY_VALIDATION, ], + "primitive_family": metadata_base.PrimitiveFamily.DATA_PREPROCESSING, + "id": "ef8fb025-d157-476c-8e2e-f8fe56162195", + "hyperparams_to_tune": ['continuity_option', 'interval'], + "version": "0.0.1", + }) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + """ + Args: + inputs: Container DataFrame + timeout: Default + iterations: Default + + Returns: + Container DataFrame with consistent timestamp + + """ + # self.logger.warning('Hi, ContinuityValidation.produce was called!') + if self.hyperparams['continuity_option'] == 'ablation': + outputs = self._continuity_ablation(inputs) + + if self.hyperparams['continuity_option'] == 'imputation': + outputs = self._continuity_imputation(inputs) + + + outputs.reset_index(drop=True, inplace=True) + self._update_metadata(outputs) + + # self._write(outputs) + return base.CallResult(outputs) + + + def _update_metadata(self, outputs): + outputs.metadata = outputs.metadata.generate(outputs) + + + def _continuity_ablation(self, inputs: Inputs): + + ablation_set = self._find_ablation_set(inputs) + inputs = inputs.loc[inputs['timestamp'].isin(ablation_set)].copy() + + inputs.sort_values("timestamp",inplace=True) + inputs['d3mIndex'] = list(range(inputs.shape[0])) + + return inputs + + + def _find_ablation_set(self, inputs): + """ + Find the longest series with minimum timestamp interval of inputs + """ + # find the min inteval and max interval + min_interval = inputs.iloc[1]['timestamp'] - inputs.iloc[0]['timestamp'] + for i in range(2, inputs.shape[0]): + curr_interval = inputs.iloc[i]['timestamp'] - inputs.iloc[i - 1]['timestamp'] + if min_interval > curr_interval: + min_interval = curr_interval + + max_interval = ((inputs.iloc[-1]['timestamp'] - inputs.iloc[0]['timestamp']) + min_interval * (2 - inputs.shape[0])) + + print((inputs.iloc[-1]['timestamp'] - inputs.iloc[0]['timestamp']), inputs.shape[0]) + + interval = min_interval + ablation_set = list() + origin_set = set(inputs['timestamp']) + + print(min_interval, max_interval) + + while interval <= max_interval: + start = 0 + while (inputs.iloc[start]['timestamp'] <= inputs.iloc[0]['timestamp'] + max_interval) and (inputs.iloc[start]['timestamp'] <= inputs.iloc[-1]['timestamp']): + tmp_list = list() + tmp = utils.numpy.arange(start=inputs.iloc[start]['timestamp'], step=interval,stop=inputs.iloc[-1]['timestamp']) + + for i in tmp: + if i in origin_set: + tmp_list.append(i) + else: break + + 
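+                # tmp_list now holds the longest run of timestamps, starting at `start`
+                # and spaced by `interval`, that all exist in the original data; every
+                # candidate run is collected and the longest one is returned below.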
ablation_set.append(tmp_list) + start += 1 + + interval += min_interval + + max_size_index = 0 + for i in range(1, len(ablation_set)): + if len(ablation_set[i]) > len(ablation_set[max_size_index]): + max_size_index = i + return ablation_set[max_size_index] + + + def _continuity_imputation(self, inputs: Inputs): + """ + Linearly imputate the missing timestmap and value of inputs + """ + interval = self.hyperparams['interval'] + time1 = inputs.iloc[0]['timestamp'] + + for i in range(1, inputs.shape[0]): + time2 = inputs.iloc[i]['timestamp'] + if time2 - time1 != interval: + + blank_number = int((time2 - time1) / interval) # how many imputation should there be between two timestamps in original data + for j in range(1, blank_number): + + dict = {'timestamp':[time1 + interval * j], 'ground_truth':[int(inputs.iloc[i]['ground_truth'])]} + + for col in list(inputs.columns.values): + if not col in ['d3mIndex', 'timestamp', 'ground_truth']: + dict[col] = [inputs.iloc[i-1][col] + (inputs.iloc[i][col] - inputs.iloc[i-1][col]) / blank_number * j] + + inputs = inputs.append(utils.pandas.DataFrame(dict), ignore_index=True, sort=False) + + time1 = time2 + + inputs.sort_values("timestamp",inplace=True) + inputs['d3mIndex'] = list(range(inputs.shape[0])) + return inputs + + + def _write(self, inputs:Inputs): + """ + write inputs to current directory, only for test + """ + inputs.to_csv(str(time.time())+'.csv') diff --git a/tods/data_processing/DatasetToDataframe.py b/tods/data_processing/DatasetToDataframe.py new file mode 100644 index 0000000..dfd2e3e --- /dev/null +++ b/tods/data_processing/DatasetToDataframe.py @@ -0,0 +1,87 @@ +import os +import typing + +from d3m import container, utils as d3m_utils +from d3m.base import utils as base_utils +from d3m.metadata import base as metadata_base, hyperparams +from d3m.primitive_interfaces import base, transformer + +import common_primitives + +__all__ = ('DatasetToDataFramePrimitive',) + +Inputs = container.Dataset +Outputs = container.DataFrame + + +class Hyperparams(hyperparams.Hyperparams): + dataframe_resource = hyperparams.Hyperparameter[typing.Union[str, None]]( + default=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Resource ID of a DataFrame to extract if there are multiple tabular resources inside a Dataset and none is a dataset entry point.", + ) + + +class DatasetToDataFramePrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): + """ + A primitive which extracts a DataFrame out of a Dataset. 
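+    The resource to extract is chosen through the `dataframe_resource` hyper-parameter;
+    when it is left as None, the dataset entry point (or the only tabular resource)
+    is selected.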
+ """ + + metadata = metadata_base.PrimitiveMetadata( + { + 'id': '4b42ce1e-9b98-4a25-b68e-fad13311eb65', + 'version': '0.3.0', + 'name': "Extract a DataFrame from a Dataset", + 'python_path': 'd3m.primitives.tods.data_processing.dataset_to_dataframe', + 'source': { + 'name': common_primitives.__author__, + 'contact': 'mailto:mitar.commonprimitives@tnode.com', + 'uris': [ + 'https://gitlab.com/datadrivendiscovery/common-primitives/blob/master/common_primitives/dataset_to_dataframe.py', + 'https://gitlab.com/datadrivendiscovery/common-primitives.git', + ], + }, + 'installation': [{ + 'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/common-primitives.git@{git_commit}#egg=common_primitives'.format( + git_commit=d3m_utils.current_git_commit(os.path.dirname(__file__)), + ), + }], + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.DATA_CONVERSION, + ], + 'primitive_family': metadata_base.PrimitiveFamily.DATA_TRANSFORMATION, + }, + ) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + dataframe_resource_id, dataframe = base_utils.get_tabular_resource(inputs, self.hyperparams['dataframe_resource']) + + dataframe.metadata = self._update_metadata(inputs.metadata, dataframe_resource_id) + + assert isinstance(dataframe, container.DataFrame), type(dataframe) + + return base.CallResult(dataframe) + + def _update_metadata(self, metadata: metadata_base.DataMetadata, resource_id: metadata_base.SelectorSegment) -> metadata_base.DataMetadata: + resource_metadata = dict(metadata.query((resource_id,))) + + if 'structural_type' not in resource_metadata or not issubclass(resource_metadata['structural_type'], container.DataFrame): + raise TypeError("The Dataset resource is not a DataFrame, but \"{type}\".".format( + type=resource_metadata.get('structural_type', None), + )) + + resource_metadata.update( + { + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + }, + ) + + new_metadata = metadata_base.DataMetadata(resource_metadata) + + new_metadata = metadata.copy_to(new_metadata, (resource_id,)) + + # Resource is not anymore an entry point. + new_metadata = new_metadata.remove_semantic_type((), 'https://metadata.datadrivendiscovery.org/types/DatasetEntryPoint') + + return new_metadata diff --git a/tods/data_processing/DuplicationValidation.py b/tods/data_processing/DuplicationValidation.py new file mode 100644 index 0000000..05b42f4 --- /dev/null +++ b/tods/data_processing/DuplicationValidation.py @@ -0,0 +1,97 @@ +from d3m import container +from d3m.primitive_interfaces import base, transformer +from d3m.metadata import base as metadata_base, hyperparams + +import os.path +from d3m import utils + +import time + +__all__ = ('DuplicationValidation',) + +Inputs = container.DataFrame +Outputs = container.DataFrame + + +class Hyperparams(hyperparams.Hyperparams): + """ + + """ + keep_option = hyperparams.Enumeration( + values=['first', 'average'], + default='first', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="When dropping rows, choose to keep the first one of duplicated data or calculate their average", + ) + + +class DuplicationValidation(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): + """ + Check whether the seires data involves duplicate data in one timestamp, and provide processing if the duplication exists. 
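+    For example, two rows sharing the same timestamp are collapsed into a single row,
+    either keeping the first occurrence or averaging the value columns, depending on
+    `keep_option`.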
+ + Parameters + ---------- + keep_option: enumeration + When dropping rows, choose to keep the first one or calculate the average + """ + + __author__: "DATA Lab at Texas A&M University" + metadata = metadata_base.PrimitiveMetadata({ + "name": "duplication validation primitive", + "python_path": "d3m.primitives.tods.data_processing.duplication_validation", + "source": {'name': 'DATA Lab at Texas A&M University', 'contact': 'mailto:khlai037@tamu.edu', + 'uris': ['https://gitlab.com/lhenry15/tods.git', 'https://gitlab.com/lhenry15/tods/-/blob/Junjie/anomaly-primitives/anomaly_primitives/DuplicationValidation.py']}, + "algorithm_types": [metadata_base.PrimitiveAlgorithmType.DUPLICATION_VALIDATION,], + "primitive_family": metadata_base.PrimitiveFamily.DATA_PREPROCESSING, + "id": "cf6d8137-73d8-496e-a2e3-49f941ee716d", + "hyperparams_to_tune": ['keep_option'], + "version": "0.0.1", + }) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + """ + Args: + inputs: Container DataFrame + timeout: Default + iterations: Default + + Returns: + Container DataFrame after drop the duplication + """ + # self.logger.warning('Hi, DuplicationValidation.produce was called!') + + if self.hyperparams['keep_option'] == 'first': + outputs = self._timestamp_keep_first(inputs) + + if self.hyperparams['keep_option'] == 'average': + outputs = self._timestamp_keep_average(inputs) + + self._update_metadata(outputs) + + # self._write(outputs) + return base.CallResult(outputs) + + def _update_metadata(self, outputs): + outputs.metadata = outputs.metadata.generate(outputs) + + def _timestamp_keep_first(self, inputs: Inputs): + return inputs.drop_duplicates(subset=['timestamp'],keep='first') + + def _timestamp_keep_average(self, inputs: Inputs): + inputs_copy = inputs.copy() + inputs = inputs.drop_duplicates(subset=['timestamp'],keep='first') + + inputs_copy = inputs_copy.groupby('timestamp').mean().reset_index() + + for col in list(inputs.columns.values): + if not col in ['d3mIndex', 'timestamp', 'ground_truth']: + inputs.pop(col) + inputs.insert(2, col, inputs_copy[col].values) + + return inputs + + def _write(self, inputs:Inputs): + """ + write inputs to current directory, only for test + """ + inputs.to_csv(str(time.time())+'.csv') diff --git a/tods/data_processing/TimeIntervalTransform.py b/tods/data_processing/TimeIntervalTransform.py new file mode 100644 index 0000000..a50f9c5 --- /dev/null +++ b/tods/data_processing/TimeIntervalTransform.py @@ -0,0 +1,169 @@ +import os +import uuid +import typing +import collections + +import numpy as np +import pandas as pd + +import common_primitives +from common_primitives import dataframe_utils, utils + +from datetime import datetime, timezone +from d3m.primitive_interfaces import base, transformer +from d3m import container, exceptions, utils as d3m_utils +from d3m.metadata import base as metadata_base, hyperparams + + +__all__ = ('TimeIntervalTransform',) + +Inputs = container.DataFrame +Outputs = container.DataFrame + + +""" +TODO: Implementation for up-sampling the data (when time_interval is less than current time series interval) +""" + +class Hyperparams(hyperparams.Hyperparams): + time_interval = hyperparams.Hyperparameter[typing.Union[str, None]]( + default='5T', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description='timestamp to transform.' 
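+        # '5T' is the pandas offset alias for five minutes; any pandas resample rule
+        # string (e.g. '30S', '1H', '1D') can be supplied here.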
+ ) + + # Keep previous + dataframe_resource = hyperparams.Hyperparameter[typing.Union[str, None]]( + default=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Resource ID of a DataFrame to extract if there are multiple tabular resources inside a Dataset and none is a dataset entry point.", + ) + use_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(2,), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to operate on. If any specified column cannot be parsed, it is skipped.", + ) + exclude_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(0,1,3,), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='new', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.", + ) + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', + 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute'], + default='https://metadata.datadrivendiscovery.org/types/Attribute', + description='Decides what semantic type to attach to generated attributes', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + + + +class TimeIntervalTransform(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): + + """ + A primitive which configures the time interval of the dataframe. 
+ Resample the timestamps based on the time_interval passed as hyperparameter + """ + + metadata = metadata_base.PrimitiveMetadata({ + '__author__': "DATA Lab @Texas A&M University", + 'name': "Time Interval Transform", + 'python_path': 'd3m.primitives.tods.data_processing.time_interval_transform', + 'source': {'name': "DATALAB @Taxes A&M University", 'contact': 'mailto:khlai037@tamu.edu', + 'uris': ['https://gitlab.com/lhenry15/tods/-/blob/Yile/anomaly-primitives/anomaly_primitives/TimeIntervalTransform.py']}, + 'algorithm_types': [metadata_base.PrimitiveAlgorithmType.TIME_INTERVAL_TRANSFORM,], + 'primitive_family': metadata_base.PrimitiveFamily.DATA_PREPROCESSING, + 'id': str(uuid.uuid3(uuid.NAMESPACE_DNS, 'TimeIntervalTransformPrimitive')), + 'hyperparams_to_tune': ['time_interval'], + 'version': '0.0.2' + }) + + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + resource = inputs.reset_index(drop=True) + + """ + Args: + inputs: Container DataFrame + Returns: + Container DataFrame with resampled time intervals + """ + + if self.hyperparams['time_interval'] is None: + time_interval = '5T' + else: + time_interval = self.hyperparams['time_interval'] + + try: + outputs = self._time_interval_transform(inputs, hyperparams) + #print(outputs) + except Exception as e: + self.logger.error("Error in Performing Time Interval Transform",e) + + self._update_metadata(outputs) + + return base.CallResult(outputs) + + def _time_interval_transform(self, inputs: Inputs, hyperparams: Hyperparams): + + """ + Args: + inputs: Container DataFrame + Returns: + Container DataFrame with resampled time intervals + """ + + #configure dataframe for resampling + inputs['timestamp'] = pd.to_datetime(inputs['timestamp'], unit='s') + inputs['timestamp'] = inputs['timestamp'].dt.tz_localize('US/Pacific') + inputs = inputs.set_index('timestamp') + + #resample dataframe + inputs = inputs.resample(self.hyperparams['time_interval']).mean() + + #configure dataframe to original format + inputs = inputs.reset_index() + value_columns = list(set(inputs.columns) - set(['d3mIndex', 'timestamp', 'ground_truth'])) + inputs = inputs.reindex(columns=['d3mIndex','timestamp'] + value_columns + ['ground_truth']) + inputs['timestamp'] = inputs['timestamp'].astype(np.int64) // 10 ** 9 + inputs['d3mIndex'] = range(0, len(inputs)) + + """ + Since the mean of the ground_truth was taken for a set interval, + we should set those values that are greater than 0 to 1 so they are consistent with original data + """ + for i in range(len(inputs['ground_truth'])): + if(inputs['ground_truth'][i] > 0): + inputs.loc[i, 'ground_truth'] = 1 + + inputs = container.DataFrame(inputs) #convert pandas DataFrame back to d3m comtainer(Important) + + return inputs + + + def _update_metadata(self, outputs): + outputs.metadata = outputs.metadata.generate(outputs) + diff --git a/tods/data_processing/TimeStampValidation.py b/tods/data_processing/TimeStampValidation.py new file mode 100644 index 0000000..946e93f --- /dev/null +++ b/tods/data_processing/TimeStampValidation.py @@ -0,0 +1,99 @@ + +import os +import typing +import numpy + +from d3m import container, utils as d3m_utils +from d3m.metadata import base as metadata_base +from d3m.metadata import hyperparams +from d3m.primitive_interfaces import base, transformer + + +__all__ = ('TimeStampValidationPrimitive',) + +Inputs = container.DataFrame +Outputs = container.DataFrame + + +class Hyperparams(hyperparams.Hyperparams): + pass + +class 
TimeStampValidationPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): + """ + A primitive to check time series is sorted by time stamp , if not then return sorted time series + """ + __author__ = "DATA Lab at Texas A&M University", + metadata = metadata_base.PrimitiveMetadata( + { + 'id': '5f791b09-e16f-42e1-bc53-39de308f5861', + 'version': '0.1.0', + 'name': 'Time Stamp Validation', + 'python_path': 'd3m.primitives.tods.data_processing.timestamp_validation', + 'keywords': ['Time Stamp', 'Sort Order'], + 'source': { + 'name': 'DATA Lab at Texas A&M University', + 'uris': ['https://gitlab.com/lhenry15/tods.git','https://gitlab.com/lhenry15/tods/-/blob/devesh/tods/data_processing/TimeStampValidation.py'], + 'contact': 'mailto:khlai037@tamu.edu' + }, + 'installation': [ + {'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/lhenry15/tods.git@{git_commit}#egg=TODS'.format( + git_commit=d3m_utils.current_git_commit(os.path.dirname(__file__)), + ), + } + + ], + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.DATA_PROFILING , + ], + 'primitive_family': metadata_base.PrimitiveFamily.DATA_VALIDATION, + + } + ) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + """ + + Args: + inputs: Container DataFrame + timeout: Default + iterations: Default + + Returns: + Container DataFrame sorted by Time Stamp + + """ + self.logger.info('Time Stamp order validation called') + outputs = inputs + try: + if (self._is_time_stamp_sorted(inputs, 'timestamp')): + outputs = inputs + else: + outputs = inputs.sort_values(by=["timestamp"]) + + + self._update_metadata(outputs) + + outputs.reset_index(drop=True, inplace=True) + self.logger.info('Type of data : %s',type(outputs)) + + except Exception as e : + self.logger.error('Time Stamp order validation error %s :',e) + print(self.logger.info(base.CallResult(outputs).value)) + return base.CallResult(outputs) + + def _is_time_stamp_sorted(self,input:Inputs,column:str = 'timestamp') -> bool : + """ + + Args: + input: Container Dataframe + column: Column Name + + Returns: + Boolean : True if timestamp column is sorted False if not + + """ + return all(input[column][i] <= input[column][i+1] for i in range(len(input[column])-1)) + + def _update_metadata(self, outputs): + outputs.metadata = outputs.metadata.generate(outputs) diff --git a/tods/data_processing/__init__.py b/tods/data_processing/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tods/detection_algorithm/AutoRegODetect.py b/tods/detection_algorithm/AutoRegODetect.py new file mode 100644 index 0000000..6844b1d --- /dev/null +++ b/tods/detection_algorithm/AutoRegODetect.py @@ -0,0 +1,226 @@ +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os +import sklearn +# import numpy +import typing + +# Custom import commands if any +import warnings +import numpy as np +from sklearn.utils import check_array +from sklearn.exceptions import NotFittedError +from sklearn.utils.validation import check_is_fitted +from sklearn.linear_model import LinearRegression +# from numba import njit +from pyod.utils.utility import argmaxn + +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base 
import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult, DockerContainer + +# from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase +from d3m.primitive_interfaces.unsupervised_learning import UnsupervisedLearnerPrimitiveBase +from d3m.primitive_interfaces.transformer import TransformerPrimitiveBase + +from d3m.primitive_interfaces.base import ProbabilisticCompositionalityMixin, ContinueFitMixin +from d3m import exceptions +import pandas + +from d3m import container, utils as d3m_utils + +from detection_algorithm.UODBasePrimitive import Params_ODBase, Hyperparams_ODBase, UnsupervisedOutlierDetectorBase +from detection_algorithm.core.MultiAutoRegOD import MultiAutoRegOD +from detection_algorithm.core.AutoRegOD import AutoRegOD + +from sklearn.utils import check_array, column_or_1d +from sklearn.utils.validation import check_is_fitted + +from combo.models.score_comb import average, maximization, median, aom, moa +from combo.utils.utility import standardizer +import uuid + +Inputs = d3m_dataframe +Outputs = d3m_dataframe + + +class Params(Params_ODBase): + ######## Add more Attributes ####### + + pass + + +class Hyperparams(Hyperparams_ODBase): + ######## Add more Hyperparamters ####### + + method = hyperparams.Enumeration[str]( + values=['average', 'maximization', 'median'], + default='average', + description='Combination method: {average, maximization, median}. Pass in weights of detector for weighted version.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + weights = hyperparams.Union( + configuration=OrderedDict({ + 'ndarray': hyperparams.Hyperparameter[ndarray]( + default=np.array([]), + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'none': hyperparams.Constant( + default=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='none', + description='Score weight by dimensions. If None, [1,1,...,1] will be used.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + + pass + + +class AutoRegODetector(UnsupervisedOutlierDetectorBase[Inputs, Outputs, Params, Hyperparams]): + """ + Autoregressive models use linear regression to calculate a sample's + deviance from the predicted value, which is then used as its + outlier scores. This model is for multivariate time series. + This model handles multivariate time series by various combination + approaches. See AutoRegOD for univarite data. + + See :cite:`aggarwal2015outlier,zhao2020using` for details. + + Parameters + ---------- + window_size : int + The moving window size. + + step_size : int, optional (default=1) + The displacement for moving window. + + contamination : float in (0., 0.5), optional (default=0.1) + The amount of contamination of the data set, i.e. + the proportion of outliers in the data set. When fitting this is used + to define the threshold on the decision function. + + method : str, optional (default='average') + Combination method: {'average', 'maximization', + 'median'}. Pass in weights of detector for weighted version. + + weights : numpy array of shape (1, n_dimensions) + Score weight by dimensions. (default=[1,1,...,1]) + + Attributes + ---------- + decision_scores_ : numpy array of shape (n_samples,) + The outlier scores of the training data. + The higher, the more abnormal. Outliers tend to have higher + scores. 
This value is available once the detector is + fitted. + + labels_ : int, either 0 or 1 + The binary labels of the training data. 0 stands for inliers + and 1 for outliers/anomalies. It is generated by applying + ``threshold_`` on ``decision_scores_``. + """ + + metadata = metadata_base.PrimitiveMetadata({ + "name": "AutoRegODetector", + "python_path": "d3m.primitives.tods.detection_algorithm.AutoRegODetector", + "source": {'name': "DATALAB @Taxes A&M University", 'contact': 'mailto:khlai037@tamu.edu', + 'uris': ['https://gitlab.com/lhenry15/tods.git']}, + "algorithm_types": [metadata_base.PrimitiveAlgorithmType.ISOLATION_FOREST, ], + "primitive_family": metadata_base.PrimitiveFamily.ANOMALY_DETECTION, + "version": "0.0.1", + "hyperparams_to_tune": ['window_size', 'contamination', 'step_size', 'method', 'weights'], + "id": str(uuid.uuid3(uuid.NAMESPACE_DNS, 'AutoRegODetector')) + }) + + def __init__(self, *, + hyperparams: Hyperparams, # + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None) -> None: + super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) + + self._clf = MultiAutoRegOD(window_size=hyperparams['window_size'], + contamination=hyperparams['contamination'], + step_size=hyperparams['step_size'], + method=hyperparams['method'], + weights=hyperparams['weights'], + ) + + return + + def set_training_data(self, *, inputs: Inputs) -> None: + """ + Set training data for outlier detection. + Args: + inputs: Container DataFrame + + Returns: + None + """ + super().set_training_data(inputs=inputs) + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + """ + Fit model with training data. + Args: + *: Container DataFrame. Time series data up to fit. + + Returns: + None + """ + return super().fit() + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + """ + Process the testing data. + Args: + inputs: Container DataFrame. Time series data up to outlier detection. + + Returns: + Container DataFrame + 1 marks Outliers, 0 marks normal. + """ + return super().produce(inputs=inputs, timeout=timeout, iterations=iterations) + + def produce_score(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + """ + Process the testing data. + Args: + inputs: Container DataFrame. Time series data up to outlier detection. + + Returns: + Container DataFrame + Outlier score of input DataFrame. + """ + return super().produce_score(inputs=inputs, timeout=timeout, iterations=iterations) + + def get_params(self) -> Params: + """ + Return parameters. + Args: + None + + Returns: + class Params + """ + return super().get_params() + + def set_params(self, *, params: Params) -> None: + """ + Set parameters for outlier detection. 
+ Args: + params: class Params + + Returns: + None + """ + super().set_params(params=params) + diff --git a/tods/detection_algorithm/DeepLog.py b/tods/detection_algorithm/DeepLog.py new file mode 100644 index 0000000..1d72e56 --- /dev/null +++ b/tods/detection_algorithm/DeepLog.py @@ -0,0 +1,413 @@ +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os +import sklearn +import numpy +import typing +import numpy as np +from keras.models import Sequential +from keras.layers import Dense, Dropout , LSTM +from keras.regularizers import l2 +from keras.losses import mean_squared_error +from sklearn.preprocessing import StandardScaler +from sklearn.utils import check_array +from sklearn.utils.validation import check_is_fitted +from pyod.utils.stat_models import pairwise_distances_no_broadcast +from pyod.models.base import BaseDetector + +# Custom import commands if any +import warnings +import numpy as np +from sklearn.utils import check_array +from sklearn.exceptions import NotFittedError +# from numba import njit +from pyod.utils.utility import argmaxn + +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult, DockerContainer + +# from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase +from d3m.primitive_interfaces.unsupervised_learning import UnsupervisedLearnerPrimitiveBase +from d3m.primitive_interfaces.transformer import TransformerPrimitiveBase + +from d3m.primitive_interfaces.base import ProbabilisticCompositionalityMixin, ContinueFitMixin +from d3m import exceptions +import pandas +import uuid + +from d3m import container, utils as d3m_utils + +from detection_algorithm.UODBasePrimitive import Params_ODBase, Hyperparams_ODBase, UnsupervisedOutlierDetectorBase + + + +__all__ = ('DeepLog',) + +Inputs = container.DataFrame +Outputs = container.DataFrame + + +class Params(Params_ODBase): + ######## Add more Attributes ####### + + pass + + +class Hyperparams(Hyperparams_ODBase): + hidden_size = hyperparams.Hyperparameter[int]( + default=64, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="hidden state dimension" + ) + + loss = hyperparams.Hyperparameter[typing.Union[str, None]]( + default='mean_squared_error', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="loss function" + ) + + optimizer = hyperparams.Hyperparameter[typing.Union[str, None]]( + default='Adam', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Optimizer" + ) + + epochs = hyperparams.Hyperparameter[int]( + default=10, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Epoch" + ) + + batch_size = hyperparams.Hyperparameter[int]( + default=32, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Batch size" + ) + + dropout_rate = hyperparams.Hyperparameter[float]( + default=0.2, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Dropout rate" + ) + + l2_regularizer = 
hyperparams.Hyperparameter[float]( + default=0.1, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="l2 regularizer" + ) + + validation_size = hyperparams.Hyperparameter[float]( + default=0.1, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="validation size" + ) + + window_size = hyperparams.Hyperparameter[int]( + default=10, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="window size" + ) + + features = hyperparams.Hyperparameter[int]( + default=1, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Number of features in Input" + ) + + stacked_layers = hyperparams.Hyperparameter[int]( + default=1, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Number of LSTM layers between input layer and Final Dense Layer" + ) + + preprocessing = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Whether to Preprosses the data" + ) + + verbose = hyperparams.Hyperparameter[int]( + default=1, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="verbose" + ) + + + contamination = hyperparams.Uniform( + lower=0., + upper=0.5, + default=0.1, + description='the amount of contamination of the data set, i.e.the proportion of outliers in the data set. Used when fitting to define the threshold on the decision function', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + +class DeepLogPrimitive(UnsupervisedOutlierDetectorBase[Inputs, Outputs, Params, Hyperparams]): + """ + A primitive that uses DeepLog for outlier detection + + Parameters + ---------- + + + """ + + __author__ = "DATA Lab at Texas A&M University", + metadata = metadata_base.PrimitiveMetadata( + { + '__author__': "DATA Lab @Texas A&M University", + 'name': "DeepLog Anomolay Detection", + 'python_path': 'd3m.primitives.tods.detection_algorithm.deeplog', + 'source': {'name': "DATALAB @Taxes A&M University", 'contact': 'mailto:khlai037@tamu.edu', + 'uris': ['https://gitlab.com/lhenry15/tods/-/blob/Yile/anomaly-primitives/anomaly_primitives/MatrixProfile.py']}, + 'algorithm_types': [metadata_base.PrimitiveAlgorithmType.DEEPLOG], + 'primitive_family': metadata_base.PrimitiveFamily.ANOMALY_DETECTION, + 'id': str(uuid.uuid3(uuid.NAMESPACE_DNS, 'DeepLogPrimitive')), + 'hyperparams_to_tune': ['hidden_size', 'loss', 'optimizer', 'epochs', 'batch_size', + 'l2_regularizer', 'validation_size', + 'window_size', 'features', 'stacked_layers', 'preprocessing', 'verbose', 'dropout_rate','contamination'], + 'version': '0.0.1', + } + ) + + def __init__(self, *, + hyperparams: Hyperparams, # + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None) -> None: + super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) + self._clf = DeeplogLstm(hidden_size=hyperparams['hidden_size'], + loss=hyperparams['loss'], + optimizer=hyperparams['optimizer'], + epochs=hyperparams['epochs'], + batch_size=hyperparams['batch_size'], + dropout_rate=hyperparams['dropout_rate'], + l2_regularizer=hyperparams['l2_regularizer'], + validation_size=hyperparams['validation_size'], + window_size=hyperparams['window_size'], + stacked_layers=hyperparams['stacked_layers'], + 
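+                                 # these hyperparameters are forwarded directly to the
+                                 # DeeplogLstm detector implemented later in this file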
preprocessing=hyperparams['preprocessing'], + verbose=hyperparams['verbose'], + contamination=hyperparams['contamination'] + + ) + + def set_training_data(self, *, inputs: Inputs) -> None: + """ + Set training data for outlier detection. + Args: + inputs: Container DataFrame + + Returns: + None + """ + super().set_training_data(inputs=inputs) + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + """ + Fit model with training data. + Args: + *: Container DataFrame. Time series data up to fit. + + Returns: + None + """ + return super().fit() + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + """ + Process the testing data. + Args: + inputs: Container DataFrame. Time series data up to outlier detection. + + Returns: + Container DataFrame + 1 marks Outliers, 0 marks normal. + """ + return super().produce(inputs=inputs, timeout=timeout, iterations=iterations) + + def get_params(self) -> Params: + """ + Return parameters. + Args: + None + + Returns: + class Params + """ + return super().get_params() + + def set_params(self, *, params: Params) -> None: + """ + Set parameters for outlier detection. + Args: + params: class Params + + Returns: + None + """ + super().set_params(params=params) + + +class DeeplogLstm(BaseDetector): + """Class to Implement Deep Log LSTM based on "https://www.cs.utah.edu/~lifeifei/papers/deeplog.pdf + Only Parameter Value anomaly detection layer has been implemented for time series data""" + + def __init__(self, hidden_size : int = 64, + optimizer : str ='adam',loss=mean_squared_error,preprocessing=True, + epochs : int =100, batch_size : int =32, dropout_rate : float =0.0, + l2_regularizer : float =0.1, validation_size : float =0.1, + window_size: int = 10 ,stacked_layers: int = 1,verbose : int = 1,contamination:int = 0.001): + + super(DeeplogLstm, self).__init__(contamination=contamination) + self.hidden_size = hidden_size + self.loss = loss + self.optimizer = optimizer + self.epochs = epochs + self.batch_size = batch_size + self.dropout_rate = dropout_rate + self.l2_regularizer = l2_regularizer + self.validation_size = validation_size + self.window_size = window_size + self.stacked_layers = stacked_layers + self.preprocessing = preprocessing + self.verbose = verbose + self.dropout_rate = dropout_rate + self.contamination = contamination + + + + + + def _build_model(self): + """ + Builds Stacked LSTM model. + Args: + inputs : Self object containing model parameters + + Returns: + return : model + """ + + model = Sequential() + + #InputLayer + model.add(LSTM(self.hidden_size,input_shape = (self.window_size,self.n_features_),return_sequences=True,dropout = self.dropout_rate)) + #stacked layer + for layers in range(self.stacked_layers): + if(layers == self.stacked_layers -1 ): + model.add(LSTM(self.hidden_size, return_sequences=False,dropout = self.dropout_rate)) + continue + model.add(LSTM(self.hidden_size,return_sequences=True,dropout = self.dropout_rate)) + #output layer + + model.add(Dense(self.n_features_)) + # Compile model + model.compile(loss=self.loss, optimizer=self.optimizer) + if self.verbose >= 1: + print(model.summary()) + return model + + def fit(self,X,y=None): + """ + Fit data to LSTM model. 
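+        The series is split into sliding windows of length `window_size`; the network
+        learns to predict the next step, and the distance between the prediction and
+        the actual value becomes the training decision score.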
+ Args: + inputs : X , ndarray of size (number of sample,features) + + Returns: + return : self object with trained model + """ + + + + + X = check_array(X) + self._set_n_classes(y) + self.n_samples_, self.n_features_ = X.shape[0], X.shape[1] + + + X_train,Y_train = self._preprocess_data_for_LSTM(X) + + self.model_ = self._build_model() + self.history_ = self.model_.fit(X_train, Y_train, + epochs=self.epochs, + batch_size=self.batch_size, + validation_split=self.validation_size, + verbose=self.verbose).history + pred_scores = np.zeros(X.shape) + pred_scores[self.window_size:] = self.model_.predict(X_train) + + Y_train_for_decision_scores = np.zeros(X.shape) + Y_train_for_decision_scores[self.window_size:] = Y_train + self.decision_scores_ = pairwise_distances_no_broadcast(Y_train_for_decision_scores, + pred_scores) + + self._process_decision_scores() + return self + + + def _preprocess_data_for_LSTM(self,X): + """ + Preposses data and prepare sequence of data based on number of samples needed in a window + Args: + inputs : X , ndarray of size (number of sample,features) + + Returns: + return : X , Y X being samples till (t-1) of data and Y the t time data + """ + if self.preprocessing: + self.scaler_ = StandardScaler() + X_norm = self.scaler_.fit_transform(X) + else: + X_norm = np.copy(X) + + X_data = [] + Y_data = [] + for index in range(X.shape[0] - self.window_size): + X_data.append(X_norm[index:index+self.window_size]) + Y_data.append(X_norm[index+self.window_size]) + X_data = np.asarray(X_data) + Y_data = np.asarray(Y_data) + + return X_data,Y_data + + + + def decision_function(self, X): + """Predict raw anomaly score of X using the fitted detector. + The anomaly score of an input sample is computed based on different + detector algorithms. . + Parameters + ---------- + X : numpy array of shape (n_samples, n_features) + The training input samples. Sparse matrices are accepted only + if they are supported by the base estimator. + Returns + ------- + anomaly_scores : numpy array of shape (n_samples,) + The anomaly score of the input samples. 
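The windowing in `_preprocess_data_for_LSTM` and the per-timestep scoring in `fit`/`decision_function` can be followed with plain numpy. A minimal sketch where a dummy predictor stands in for the trained LSTM (an assumption purely for illustration), including the contamination-based thresholding convention that PyOD's `BaseDetector` applies to `decision_scores_`:
```
import numpy as np

window_size, contamination = 3, 0.2
X = np.arange(20, dtype=float).reshape(10, 2)        # (n_samples, n_features)

# Windows of the previous window_size steps, and the step that follows each window.
X_win = np.asarray([X[i:i + window_size] for i in range(len(X) - window_size)])
Y_next = np.asarray([X[i + window_size] for i in range(len(X) - window_size)])

# Stand-in for self.model_.predict(X_win): predict the mean of each window.
pred = X_win.mean(axis=1)

# Per-timestep score: 0 for the first window_size steps, then the distance
# between prediction and observation (same idea as pairwise_distances_no_broadcast).
scores = np.zeros(len(X))
scores[window_size:] = np.linalg.norm(pred - Y_next, axis=1)

# Contamination-style threshold: the top `contamination` fraction are outliers.
threshold = np.percentile(scores, 100 * (1 - contamination))
labels = (scores > threshold).astype(int)
print(scores.round(2), labels)
```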
+ """ + check_is_fitted(self, ['model_', 'history_']) + + X = check_array(X) + print("inside") + print(X.shape) + print(X[0]) + X_norm,Y_norm = self._preprocess_data_for_LSTM(X) + pred_scores = np.zeros(X.shape) + pred_scores[self.window_size:] = self.model_.predict(X_norm) + Y_norm_for_decision_scores = np.zeros(X.shape) + Y_norm_for_decision_scores[self.window_size:] = Y_norm + return pairwise_distances_no_broadcast(Y_norm_for_decision_scores, pred_scores) + + + + + + diff --git a/tods/detection_algorithm/KDiscordODetect.py b/tods/detection_algorithm/KDiscordODetect.py new file mode 100644 index 0000000..d569539 --- /dev/null +++ b/tods/detection_algorithm/KDiscordODetect.py @@ -0,0 +1,347 @@ +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os +import sklearn +# import numpy +import typing + +# Custom import commands if any +import warnings +import numpy as np +from sklearn.utils import check_array +from sklearn.exceptions import NotFittedError +from sklearn.utils.validation import check_is_fitted +from sklearn.linear_model import LinearRegression +# from numba import njit +from pyod.utils.utility import argmaxn + +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult, DockerContainer + +# from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase +from d3m.primitive_interfaces.unsupervised_learning import UnsupervisedLearnerPrimitiveBase +from d3m.primitive_interfaces.transformer import TransformerPrimitiveBase + +from d3m.primitive_interfaces.base import ProbabilisticCompositionalityMixin, ContinueFitMixin +from d3m import exceptions +import pandas + +from d3m import container, utils as d3m_utils + +from detection_algorithm.UODBasePrimitive import Params_ODBase, Hyperparams_ODBase, UnsupervisedOutlierDetectorBase +from detection_algorithm.core.KDiscord import KDiscord +import uuid + +from sklearn.utils import check_array, column_or_1d +from sklearn.utils.validation import check_is_fitted + +from combo.models.score_comb import average, maximization, median, aom, moa +from combo.utils.utility import standardizer + +Inputs = d3m_dataframe +Outputs = d3m_dataframe + + +class Params(Params_ODBase): + ######## Add more Attributes ####### + + pass + + +class Hyperparams(Hyperparams_ODBase): + ######## Add more Hyperparamters ####### + + n_neighbors = hyperparams.Hyperparameter[int]( + default=5, + description='Number of neighbors to use by default for k neighbors queries.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + method = hyperparams.Enumeration[str]( + values=['largest', 'mean', 'median'], + default='largest', + description='Combine the distance to k neighbors as the outlier score.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + radius = hyperparams.Hyperparameter[float]( + default=1.0, + description='Range of parameter space to use by default for `radius_neighbors` queries.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + algorithm = hyperparams.Enumeration[str]( + values=['auto', 'ball_tree', 'kd_tree', 
'brute'], + default='auto', + description='Algorithm used to compute the nearest neighbors.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + leaf_size = hyperparams.Hyperparameter[int]( + default=30, + description='Leaf size passed to `BallTree` or `KDTree`. This can affect the speed of the construction and query, as well as the memory required to store the tree. The optimal value depends on the nature of the problem.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + metric = hyperparams.Enumeration[str]( + values=['cityblock', 'cosine', 'euclidean', 'l1', 'l2', + 'manhattan', 'braycurtis', 'canberra', 'chebyshev', + 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', + 'mahalanobis', 'matching', 'minkowski', 'rogerstanimoto', + 'russellrao', 'seuclidean', 'sokalmichener', 'sokalsneath', + 'sqeuclidean', 'yule'], + default='minkowski', + description='metric used for the distance computation. Any metric from scikit-learn or scipy.spatial.distance can be used.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + p = hyperparams.Hyperparameter[int]( + default=2, + description='Parameter for the Minkowski metric from.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + metric_params = hyperparams.Union[Union[Dict, None]]( + configuration=OrderedDict( + init=hyperparams.Hyperparameter[Dict]( + default={}, + ), + ninit=hyperparams.Hyperparameter[None]( + default=None, + ), + ), + default='ninit', + description='Additional keyword arguments for the metric function.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + ) + + pass + + +class KDiscordODetector(UnsupervisedOutlierDetectorBase[Inputs, Outputs, Params, Hyperparams]): + """ + KDiscord first split multivariate time series into + subsequences (matrices), and it use kNN outlier detection based on PyOD. + For an observation, its distance to its kth nearest neighbor could be + viewed as the outlying score. It could be viewed as a way to measure + the density. See :cite:`ramaswamy2000efficient,angiulli2002fast` for + details. + + See :cite:`aggarwal2015outlier,zhao2020using` for details. + + Parameters + ---------- + window_size : int + The moving window size. + + step_size : int, optional (default=1) + The displacement for moving window. + + contamination : float in (0., 0.5), optional (default=0.1) + The amount of contamination of the data set, + i.e. the proportion of outliers in the data set. Used when fitting to + define the threshold on the decision function. + + n_neighbors : int, optional (default = 5) + Number of neighbors to use by default for k neighbors queries. + + method : str, optional (default='largest') + {'largest', 'mean', 'median'} + + - 'largest': use the distance to the kth neighbor as the outlier score + - 'mean': use the average of all k neighbors as the outlier score + - 'median': use the median of the distance to k neighbors as the + outlier score + + radius : float, optional (default = 1.0) + Range of parameter space to use by default for `radius_neighbors` + queries. + + algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional + Algorithm used to compute the nearest neighbors: + + - 'ball_tree' will use BallTree + - 'kd_tree' will use KDTree + - 'brute' will use a brute-force search. + - 'auto' will attempt to decide the most appropriate algorithm + based on the values passed to :meth:`fit` method. 
+ + Note: fitting on sparse input will override the setting of + this parameter, using brute force. + + .. deprecated:: 0.74 + ``algorithm`` is deprecated in PyOD 0.7.4 and will not be + possible in 0.7.6. It has to use BallTree for consistency. + + leaf_size : int, optional (default = 30) + Leaf size passed to BallTree. This can affect the + speed of the construction and query, as well as the memory + required to store the tree. The optimal value depends on the + nature of the problem. + + metric : string or callable, default 'minkowski' + metric to use for distance computation. Any metric from scikit-learn + or scipy.spatial.distance can be used. + + If metric is a callable function, it is called on each + pair of instances (rows) and the resulting value recorded. The callable + should take two arrays as input and return one value indicating the + distance between them. This works for Scipy's metrics, but is less + efficient than passing the metric name as a string. + + Distance matrices are not supported. + + Valid values for metric are: + + - from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', + 'manhattan'] + + - from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev', + 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', + 'mahalanobis', 'matching', 'minkowski', 'rogerstanimoto', + 'russellrao', 'seuclidean', 'sokalmichener', 'sokalsneath', + 'sqeuclidean', 'yule'] + + See the documentation for scipy.spatial.distance for details on these + metrics. + + p : integer, optional (default = 2) + Parameter for the Minkowski metric from + sklearn.metrics.pairwise.pairwise_distances. When p = 1, this is + equivalent to using manhattan_distance (l1), and euclidean_distance + (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used. + See http://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.pairwise_distances + + metric_params : dict, optional (default = None) + Additional keyword arguments for the metric function. + + + Attributes + ---------- + decision_scores_ : numpy array of shape (n_samples,) + The outlier scores of the training data. + The higher, the more abnormal. Outliers tend to have higher + scores. This value is available once the detector is + fitted. + + threshold_ : float + The threshold is based on ``contamination``. It is the + ``n_samples * contamination`` most abnormal samples in + ``decision_scores_``. The threshold is calculated for generating + binary outlier labels. + + labels_ : int, either 0 or 1 + The binary labels of the training data. 0 stands for inliers + and 1 for outliers/anomalies. It is generated by applying + ``threshold_`` on ``decision_scores_``. 
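The docstring above describes KDiscord as first splitting the series into overlapping subsequences governed by `window_size` and `step_size`, then scoring them with kNN. A numpy-only sketch of the subsequence step; the helper name `to_subsequences` is made up here, since the actual conversion lives in the `KDiscord` core class, which is not part of this file:
```
import numpy as np

def to_subsequences(X, window_size, step_size=1):
    """Stack overlapping windows of a (n_samples, n_features) series into
    a (valid_length, window_size * n_features) matrix."""
    n_samples, n_features = X.shape
    starts = range(0, n_samples - window_size + 1, step_size)
    return np.asarray([X[s:s + window_size].reshape(-1) for s in starts])

series = np.random.rand(100, 2)                       # multivariate series, 2 channels
subseq = to_subsequences(series, window_size=10, step_size=1)
print(subseq.shape)                                    # (91, 20)
```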
+ """ + + metadata = metadata_base.PrimitiveMetadata({ + "name": "KDiscordODetector", + "python_path": "d3m.primitives.tods.detection_algorithm.KDiscordODetector", + "source": {'name': "DATALAB @Taxes A&M University", 'contact': 'mailto:khlai037@tamu.edu', + 'uris': ['https://gitlab.com/lhenry15/tods.git']}, + "algorithm_types": [metadata_base.PrimitiveAlgorithmType.LOCAL_OUTLIER_FACTOR, ], # + "primitive_family": metadata_base.PrimitiveFamily.ANOMALY_DETECTION, + "version": "0.0.1", + "hyperparams_to_tune": ['n_neighbors', 'algorithm', 'leaf_size', 'p', 'contamination', + 'window_size', 'step_size', 'method', 'radius'], + "id": str(uuid.uuid3(uuid.NAMESPACE_DNS, 'KDiscordODetector')), + }) + + def __init__(self, *, + hyperparams: Hyperparams, # + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None) -> None: + super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) + + self._clf = KDiscord(window_size=hyperparams['window_size'], + contamination=hyperparams['contamination'], + step_size=hyperparams['step_size'], + n_neighbors=hyperparams['n_neighbors'], + method=hyperparams['method'], + radius=hyperparams['radius'], + algorithm=hyperparams['algorithm'], + leaf_size=hyperparams['leaf_size'], + metric=hyperparams['metric'], + metric_params=hyperparams['metric_params'], + p=hyperparams['p'], + ) + + return + + def set_training_data(self, *, inputs: Inputs) -> None: + """ + Set training data for outlier detection. + Args: + inputs: Container DataFrame + + Returns: + None + """ + super().set_training_data(inputs=inputs) + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + """ + Fit model with training data. + Args: + *: Container DataFrame. Time series data up to fit. + + Returns: + None + """ + return super().fit() + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + """ + Process the testing data. + Args: + inputs: Container DataFrame. Time series data up to outlier detection. + + Returns: + Container DataFrame + 1 marks Outliers, 0 marks normal. + """ + return super().produce(inputs=inputs, timeout=timeout, iterations=iterations) + + def produce_score(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + """ + Process the testing data. + Args: + inputs: Container DataFrame. Time series data up to outlier detection. + + Returns: + Container DataFrame + Outlier score of input DataFrame. + """ + return super().produce_score(inputs=inputs, timeout=timeout, iterations=iterations) + + def get_params(self) -> Params: + """ + Return parameters. + Args: + None + + Returns: + class Params + """ + return super().get_params() + + def set_params(self, *, params: Params) -> None: + """ + Set parameters for outlier detection. 
+ Args: + params: class Params + + Returns: + None + """ + super().set_params(params=params) + diff --git a/tods/detection_algorithm/LSTMODetect.py b/tods/detection_algorithm/LSTMODetect.py new file mode 100755 index 0000000..58bc229 --- /dev/null +++ b/tods/detection_algorithm/LSTMODetect.py @@ -0,0 +1,288 @@ +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os +import sklearn +# import numpy +import typing + +# Custom import commands if any +import warnings +import numpy as np +from sklearn.utils import check_array +from sklearn.exceptions import NotFittedError +from sklearn.utils.validation import check_is_fitted +from sklearn.linear_model import LinearRegression +# from numba import njit +from pyod.utils.utility import argmaxn + +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult, DockerContainer + +# from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase +from d3m.primitive_interfaces.unsupervised_learning import UnsupervisedLearnerPrimitiveBase +from d3m.primitive_interfaces.transformer import TransformerPrimitiveBase + +from d3m.primitive_interfaces.base import ProbabilisticCompositionalityMixin, ContinueFitMixin +from d3m import exceptions +import pandas + +from d3m import container, utils as d3m_utils + +from pyod.models.base import BaseDetector + +from detection_algorithm.UODBasePrimitive import Params_ODBase, Hyperparams_ODBase, UnsupervisedOutlierDetectorBase +from detection_algorithm.core.LSTMOD import LSTMOutlierDetector + +from sklearn.utils import check_array, column_or_1d +from sklearn.utils.validation import check_is_fitted + +from pyod.models.base import BaseDetector +import uuid + +Inputs = d3m_dataframe +Outputs = d3m_dataframe + + +class Params(Params_ODBase): + ######## Add more Attributes ####### + + pass + + +class Hyperparams(Hyperparams_ODBase): + ######## Add more Hyperparamters ####### + + train_contamination = hyperparams.Uniform( # Hyperparameter[float]( + lower=0., + upper=0.5, + default=0.0, + description='Contamination used to calculate relative_error_threshold.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + min_attack_time = hyperparams.Hyperparameter[int]( + default=5, + description='The minimum amount of recent time steps that is used to define a collective attack.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + danger_coefficient_weight = hyperparams.Uniform( + lower=0., + upper=1., + default=0.5, + description='Weight of danger coefficient in decision score.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + loss_func = hyperparams.Enumeration[str]( + values=['mean_squared_error'], + default='mean_squared_error', + description='String (name of objective function).', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + + optimizer = hyperparams.Enumeration[str]( + values=['adam', 'sgd', 'rmsprop', 'nadam', 'adamax', 'adadelta', 'adagrad'], + default='adam', + description='String (name of optimizer).', + 
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + epochs = hyperparams.Hyperparameter[int]( + default=10, + description='Number of epochs to train the model.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + batch_size = hyperparams.Hyperparameter[int]( + default=32, + description='Number of samples per gradient update.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + dropout_rate = hyperparams.Uniform( # Hyperparameter[float]( + lower=0., + upper=1., + default=0.1, + description='The dropout to be used across all layers.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + feature_dim = hyperparams.Hyperparameter[int]( + default=1, + description='Feature dim of time series data.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + hidden_dim = hyperparams.Hyperparameter[int]( + default=16, + description='Hidden dim of LSTM.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + n_hidden_layer = hyperparams.Hyperparameter[int]( + default=0, + description='Hidden layer number of LSTM.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + activation = hyperparams.Union[Union[str, None]]( + configuration=OrderedDict( + init=hyperparams.Enumeration[str]( + values=['relu', 'sigmoid', 'selu', 'tanh', 'softplus', 'softsign'], + default='relu', + description='Method to vote relative_error in a collect attack.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ), + ninit=hyperparams.Hyperparameter[None]( + default=None, + ), + ), + default='ninit', + description='Activations function of LSTMs input and hidden layers.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + ) + + diff_group_method = hyperparams.Enumeration[str]( + values=['average', 'max', 'min'], + default='average', + description='Method to vote relative_error in a collect attack.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + pass + + +class LSTMODetector(UnsupervisedOutlierDetectorBase[Inputs, Outputs, Params, Hyperparams]): + """ + + Parameters + ---------- + window_size : int + The moving window size. + + step_size : int, optional (default=1) + The displacement for moving window. + + contamination : float in (0., 0.5), optional (default=0.1) + The amount of contamination of the data set, i.e. + the proportion of outliers in the data set. When fitting this is used + to define the threshold on the decision function. + + Attributes + ---------- + decision_scores_ : numpy array of shape (n_samples,) + The outlier scores of the training data. + The higher, the more abnormal. Outliers tend to have higher + scores. This value is available once the detector is + fitted. + + labels_ : int, either 0 or 1 + The binary labels of the training data. 0 stands for inliers + and 1 for outliers/anomalies. It is generated by applying + ``threshold_`` on ``decision_scores_``. 
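Several hyperparameters in this diff (`activation` above, `metric_params` and `n_components` in the other detectors) use d3m's `Union` container to express an optional value. A minimal sketch of that pattern in isolation, assuming a working d3m installation; `ExampleHyperparams` is a made-up class for illustration:
```
from collections import OrderedDict
from typing import Union

from d3m.metadata import hyperparams

class ExampleHyperparams(hyperparams.Hyperparams):
    activation = hyperparams.Union[Union[str, None]](
        configuration=OrderedDict(
            init=hyperparams.Enumeration[str](
                values=['relu', 'sigmoid', 'tanh'],
                default='relu',
            ),
            ninit=hyperparams.Hyperparameter[None](
                default=None,
            ),
        ),
        default='ninit',   # select the 'ninit' branch by default, i.e. None
        description='Optional activation; None lets the detector choose its own.',
        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
    )

print(ExampleHyperparams.defaults()['activation'])     # None
```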
+ """ + + metadata = metadata_base.PrimitiveMetadata({ + "name": "LSTMODetector", + "python_path": "d3m.primitives.tods.detection_algorithm.LSTMODetector", + "source": {'name': "DATALAB @Taxes A&M University", 'contact': 'mailto:khlai037@tamu.edu', + 'uris': ['https://gitlab.com/lhenry15/tods.git', 'https://gitlab.com/lhenry15/tods/-/blob/Junjie/anomaly-primitives/anomaly_primitives/LSTMOD.py']}, + "algorithm_types": [metadata_base.PrimitiveAlgorithmType.ISOLATION_FOREST, ], # up to update + "primitive_family": metadata_base.PrimitiveFamily.ANOMALY_DETECTION, + "version": "0.0.1", + "hyperparams_to_tune": ['contamination', 'train_contamination', 'min_attack_time', + 'danger_coefficient_weight', 'loss_func', 'optimizer', + 'epochs', 'batch_size', 'dropout_rate', 'feature_dim', 'hidden_dim', + 'n_hidden_layer', 'activation', 'diff_group_method'], + "id": str(uuid.uuid3(uuid.NAMESPACE_DNS, 'LSTMODetector')), + }) + + def __init__(self, *, + hyperparams: Hyperparams, # + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None) -> None: + super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) + + self._clf = LSTMOutlierDetector(contamination=hyperparams['contamination'], + train_contamination=hyperparams['train_contamination'], + min_attack_time=hyperparams['min_attack_time'], + danger_coefficient_weight=hyperparams['danger_coefficient_weight'], + loss=hyperparams['loss_func'], + optimizer=hyperparams['optimizer'], + epochs=hyperparams['epochs'], + batch_size=hyperparams['batch_size'], + dropout_rate=hyperparams['dropout_rate'], + feature_dim=hyperparams['feature_dim'], + hidden_dim=hyperparams['hidden_dim'], + n_hidden_layer=hyperparams['n_hidden_layer'], + activation=hyperparams['activation'], + diff_group_method=hyperparams['diff_group_method'], + ) + + return + + def set_training_data(self, *, inputs: Inputs) -> None: + """ + Set training data for outlier detection. + Args: + inputs: Container DataFrame + + Returns: + None + """ + super().set_training_data(inputs=inputs) + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + """ + Fit model with training data. + Args: + *: Container DataFrame. Time series data up to fit. + + Returns: + None + """ + return super().fit() + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + """ + Process the testing data. + Args: + inputs: Container DataFrame. Time series data up to outlier detection. + + Returns: + Container DataFrame + 1 marks Outliers, 0 marks normal. + """ + return super().produce(inputs=inputs, timeout=timeout, iterations=iterations) + + def get_params(self) -> Params: + """ + Return parameters. + Args: + None + + Returns: + class Params + """ + return super().get_params() + + def set_params(self, *, params: Params) -> None: + """ + Set parameters for outlier detection. 
+ Args: + params: class Params + + Returns: + None + """ + super().set_params(params=params) + diff --git a/tods/detection_algorithm/MatrixProfile.py b/tods/detection_algorithm/MatrixProfile.py new file mode 100644 index 0000000..93d9ef3 --- /dev/null +++ b/tods/detection_algorithm/MatrixProfile.py @@ -0,0 +1,420 @@ +import os +import sklearn +import numpy +import typing +import time +from scipy import sparse +from numpy import ndarray +from collections import OrderedDict +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple + +import numpy as np +import pandas as pd +import logging, uuid +from scipy import sparse +from numpy import ndarray +from collections import OrderedDict +from common_primitives import dataframe_utils, utils + +from d3m import utils +from d3m import container +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.container import DataFrame as d3m_dataframe +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.primitive_interfaces import base, transformer +from d3m.metadata import base as metadata_base, hyperparams +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m.primitive_interfaces.base import CallResult, DockerContainer + +import stumpy + +__all__ = ('MatrixProfile',) + +Inputs = container.DataFrame +Outputs = container.DataFrame + +class PrimitiveCount: + primitive_no = 0 + + +class Hyperparams(hyperparams.Hyperparams): + window_size = hyperparams.UniformInt( + lower = 0, + upper = 100, #TODO: Define the correct the upper bound + default=50, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="window size to calculate" + ) + + # Keep previous + dataframe_resource = hyperparams.Hyperparameter[typing.Union[str, None]]( + default=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Resource ID of a DataFrame to extract if there are multiple tabular resources inside a Dataset and none is a dataset entry point.", + ) + use_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(2,), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to operate on. If any specified column cannot be parsed, it is skipped.", + ) + exclude_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(0,1,3,), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='new', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. 
Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.", + ) + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', + 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute'], + default='https://metadata.datadrivendiscovery.org/types/Attribute', + description='Decides what semantic type to attach to generated attributes', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + + +class MP: + """ + This is the class for matrix profile function + """ + def __init__(self, window_size): + self._window_size = window_size + return + + def produce(self, data): + + """ + + Args: + data: dataframe column + Returns: + nparray + + """ + transformed_columns=utils.pandas.DataFrame() + #transformed_columns=d3m_dataframe + for col in data.columns: + output = stumpy.stump(data[col], m = self._window_size) + output = pd.DataFrame(output) + #print("output", output) + transformed_columns=pd.concat([transformed_columns,output],axis=1) + #transformed_columns[col]=output + #print(transformed_columns) + return transformed_columns + +class MatrixProfile(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): + """ + A primitive that performs matrix profile on a DataFrame using Stumpy package + Stumpy documentation: https://stumpy.readthedocs.io/en/latest/index.html + + Parameters + ---------- + T_A : ndarray + The time series or sequence for which to compute the matrix profile + m : int + Window size + T_B : ndarray + The time series or sequence that contain your query subsequences + of interest. Default is `None` which corresponds to a self-join. + ignore_trivial : bool + Set to `True` if this is a self-join. Otherwise, for AB-join, set this + to `False`. Default is `True`. + Returns + ------- + out : ndarray + The first column consists of the matrix profile, the second column + consists of the matrix profile indices, the third column consists of + the left matrix profile indices, and the fourth column consists of + the right matrix profile indices. 
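The `MP.produce` helper above calls `stumpy.stump` once per selected column, and the docstring describes its four output columns. A standalone sketch of that call on a toy series, assuming `stumpy` is installed; the column names given to the result are illustrative:
```
import numpy as np
import pandas as pd
import stumpy

window_size = 50
series = pd.Series(np.random.rand(1000))

mp = stumpy.stump(series, m=window_size)     # shape: (len(series) - m + 1, 4)
profile = pd.DataFrame(mp, columns=['matrix_profile',
                                    'matrix_profile_index',
                                    'left_matrix_profile_index',
                                    'right_matrix_profile_index'])
# stump returns an object array; cast the profile column for convenience.
profile['matrix_profile'] = profile['matrix_profile'].astype(float)

# Subsequences with a large matrix-profile value have no close match anywhere
# in the series, which is what makes them discord/outlier candidates.
print(profile.sort_values('matrix_profile', ascending=False).head())
```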
+ + """ + + + metadata = metadata_base.PrimitiveMetadata({ + '__author__': "DATA Lab @Texas A&M University", + 'name': "Matrix Profile", + #'python_path': 'd3m.primitives.tods.feature_analysis.matrix_profile', + 'python_path': 'd3m.primitives.tods.detection_algorithm.matrix_profile', + 'source': {'name': "DATALAB @Taxes A&M University", 'contact': 'mailto:khlai037@tamu.edu', + 'uris': ['https://gitlab.com/lhenry15/tods/-/blob/Yile/anomaly-primitives/anomaly_primitives/MatrixProfile.py']}, + 'algorithm_types': [metadata_base.PrimitiveAlgorithmType.MATRIX_PROFILE,], + 'primitive_family': metadata_base.PrimitiveFamily.FEATURE_CONSTRUCTION, + 'id': str(uuid.uuid3(uuid.NAMESPACE_DNS, 'MatrixProfilePrimitive')), + 'hyperparams_to_tune': ['window_size'], + 'version': '0.0.2', + }) + + + def __init__(self, *, hyperparams: Hyperparams) -> None: + super().__init__(hyperparams=hyperparams) + self._clf = MP(window_size = hyperparams['window_size']) + self.primitiveNo = PrimitiveCount.primitive_no + PrimitiveCount.primitive_no+=1 + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + + """ + + Args: + + inputs: Container DataFrame + + timeout: Default + + iterations: Default + + Returns: + + Container DataFrame containing Matrix Profile of selected columns + + """ + + # Get cols to fit. + self._fitted = False + self._training_inputs, self._training_indices = self._get_columns_to_fit(inputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + + if len(self._training_indices) > 0: + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + if not self._fitted: + raise PrimitiveNotFittedError("Primitive not fitted.") + + sk_inputs = inputs + if self.hyperparams['use_semantic_types']: + sk_inputs = inputs.iloc[:, self._training_indices] + output_columns = [] + if len(self._training_indices) > 0: + sk_output = self._clf.produce(sk_inputs) + if sparse.issparse(sk_output): + sk_output = sk_output.toarray() + outputs = self._wrap_predictions(inputs, sk_output) + + if len(outputs.columns) == len(self._input_column_names): + outputs.columns = self._input_column_names + output_columns = [outputs] + + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._training_indices, + columns_list=output_columns) + #print(outputs) + #CallResult(outputs) + #print("___") + print(outputs.columns) + #outputs.columns = [str(x) for x in outputs.columns] + + return CallResult(outputs) + + # assert isinstance(inputs, container.DataFrame), type(container.DataFrame) + # _, self._columns_to_produce = self._get_columns_to_fit(inputs, self.hyperparams) + + # #print("columns_to_produce ", self._columns_to_produce) + + # outputs = inputs + # if len(self._columns_to_produce) > 0: + # for col in self.hyperparams['use_columns']: + # output = self._clf.produce(inputs.iloc[ : ,col]) + + # outputs = pd.concat((outputs, pd.DataFrame({inputs.columns[col]+'_matrix_profile': output[:,0], + # inputs.columns[col]+'_matrix_profile_indices': output[:,1], + # inputs.columns[col]+'_left_matrix_profile_indices': output[:,2], + # 
inputs.columns[col]+'_right_matrix_profile_indices': output[:,3]})), axis = 1) + + # else: + # if self.hyperparams['error_on_no_input']: + # raise RuntimeError("No input columns were selected") + # self.logger.warn("No input columns were selected") + + # #print(outputs) + # self._update_metadata(outputs) + + # return base.CallResult(outputs) + + + + def _update_metadata(self, outputs): + outputs.metadata = outputs.metadata.generate(outputs) + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + + """ + + Select columns to fit. + Args: + inputs: Container DataFrame + hyperparams: d3m.metadata.hyperparams.Hyperparams + + Returns: + list + + """ + + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=hyperparams['use_columns'], + exclude_columns=hyperparams['exclude_columns'], + can_use_column=can_produce_column) + + + """ + Encountered error: when hyperparams['use_columns'] = (2,3) and hyperparams['exclude_columns'] is (1,2) + columns_to_produce is still [2] + """ + return inputs.iloc[:, columns_to_produce], columns_to_produce + + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: + + """ + + Output whether a column can be processed. + Args: + inputs_metadata: d3m.metadata.base.DataMetadata + column_index: int + + Returns: + bool + + """ + + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, np.integer, np.float64) #changed numpy to np + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + + # print(column_metadata) + # print(column_metadata['structural_type'], accepted_structural_types) + + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + + # print(column_metadata) + # print(semantic_types, accepted_semantic_types) + + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + + """ + + Wrap predictions into dataframe + Args: + inputs: Container Dataframe + predictions: array-like data (n_samples, n_features) + + Returns: + Dataframe + + """ + + outputs = d3m_dataframe(predictions, generate_metadata=True) + target_columns_metadata = self._add_target_columns_metadata(outputs.metadata, self.hyperparams, self.primitiveNo) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, target_columns_metadata) + return outputs + + + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + + """ + + Updata metadata for selected columns. 
+ Args: + inputs_metadata: metadata_base.DataMetadata + outputs: Container Dataframe + target_columns_metadata: list + + Returns: + d3m.metadata.base.DataMetadata + + """ + + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + + @classmethod + def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams, primitiveNo): + """ + Add target columns metadata + Args: + outputs_metadata: metadata.base.DataMetadata + hyperparams: d3m.metadata.hyperparams.Hyperparams + + Returns: + List[OrderedDict] + """ + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_name = "{0}{1}_{2}".format(cls.metadata.query()['name'], primitiveNo, column_index) + column_metadata = OrderedDict() + semantic_types = set() + semantic_types.add(hyperparams["return_semantic_type"]) + column_metadata['semantic_types'] = list(semantic_types) + + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + return target_columns_metadata diff --git a/tods/detection_algorithm/PCAODetect.py b/tods/detection_algorithm/PCAODetect.py new file mode 100644 index 0000000..26a118d --- /dev/null +++ b/tods/detection_algorithm/PCAODetect.py @@ -0,0 +1,364 @@ +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os +import sklearn +# import numpy +import typing + +# Custom import commands if any +import warnings +import numpy as np +from sklearn.utils import check_array +from sklearn.exceptions import NotFittedError +from sklearn.utils.validation import check_is_fitted +from sklearn.linear_model import LinearRegression +# from numba import njit +from pyod.utils.utility import argmaxn + +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult, DockerContainer + +# from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase +from d3m.primitive_interfaces.unsupervised_learning import UnsupervisedLearnerPrimitiveBase +from d3m.primitive_interfaces.transformer import TransformerPrimitiveBase + +from d3m.primitive_interfaces.base import ProbabilisticCompositionalityMixin, ContinueFitMixin +from d3m import exceptions +import pandas + +from d3m import container, utils as d3m_utils + +from detection_algorithm.UODBasePrimitive import Params_ODBase, Hyperparams_ODBase, UnsupervisedOutlierDetectorBase +from detection_algorithm.core.PCA import PCA +import uuid + +from sklearn.utils import check_array, column_or_1d +from sklearn.utils.validation import check_is_fitted + +from combo.models.score_comb import average, maximization, median, aom, moa +from combo.utils.utility import standardizer + +Inputs = d3m_dataframe +Outputs = d3m_dataframe + + +class Params(Params_ODBase): + ######## Add more Attributes ####### + + pass + + +class 
Hyperparams(Hyperparams_ODBase): + ######## Add more Hyperparamters ####### + + svd_solver = hyperparams.Enumeration[str]( + values=['auto', 'full', 'arpack', 'randomized'], + default='auto', + description='Algorithm of solver.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + n_components = hyperparams.Union[Union[int, None]]( + configuration=OrderedDict( + init=hyperparams.Hyperparameter[int]( + default=1, # {}, + ), + ninit=hyperparams.Hyperparameter[None]( + default=None, + ), + ), + default='ninit', + description='Number of components to keep. It should be smaller than the window_size.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + + # hyperparams.Hyperparameter[int]( + # default=1, + # description='Number of components to keep. It should be smaller than the window_size.', + # semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + # ) + + n_selected_components = hyperparams.Union[Union[int, None]]( + configuration=OrderedDict( + init=hyperparams.Hyperparameter[int]( + default=1, # {}, + ), + ninit=hyperparams.Hyperparameter[None]( + default=None, + ), + ), + default='ninit', + description='Number of selected principal components for calculating the outlier scores. It is not necessarily equal to the total number of the principal components. If not set, use all principal components.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + + tol = hyperparams.Hyperparameter[float]( + default=0., + description='Tolerance for singular values computed by svd_solver == `arpack`.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + iterated_power = hyperparams.Union[Union[int, str]]( + configuration=OrderedDict( + init=hyperparams.Hyperparameter[int]( + default=1, # {}, + ), + ninit=hyperparams.Hyperparameter[str]( + default='auto', + ), + ), + default='ninit', + description='Number of iterations for the power method computed by svd_solver == `randomized`.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + + random_state = hyperparams.Union[Union[int, None]]( + configuration=OrderedDict( + init=hyperparams.Hyperparameter[int]( + default=0, + ), + ninit=hyperparams.Hyperparameter[None]( + default=None, + ), + ), + default='ninit', + description='the seed used by the random number generator.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + + whiten = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + description="If True, the eigenvalues are used in score computation. The eigenvectors with small eigenvalues comes with more importance in outlier score calculation.", + ) + + standardization = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + description="If True, perform standardization first to convert data to zero mean and unit variance.", + ) + + pass + + +class PCAODetector(UnsupervisedOutlierDetectorBase[Inputs, Outputs, Params, Hyperparams]): + """ + PCA-based outlier detection with both univariate and multivariate + time series data. TS data will be first transformed to tabular format. + For univariate data, it will be in shape of [valid_length, window_size]. + for multivariate data with d sequences, it will be in the shape of + [valid_length, window_size]. 
+ + Parameters + ---------- + window_size : int + The moving window size. + + step_size : int, optional (default=1) + The displacement for moving window. + + contamination : float in (0., 0.5), optional (default=0.1) + The amount of contamination of the data set, + i.e. the proportion of outliers in the data set. Used when fitting to + define the threshold on the decision function. + + n_components : int, float, None or string + Number of components to keep. It should be smaller than the window_size. + if n_components is not set all components are kept:: + + n_components == min(n_samples, n_features) + + if n_components == 'mle' and svd_solver == 'full', Minka\'s MLE is used + to guess the dimension + if ``0 < n_components < 1`` and svd_solver == 'full', select the number + of components such that the amount of variance that needs to be + explained is greater than the percentage specified by n_components + n_components cannot be equal to n_features for svd_solver == 'arpack'. + + n_selected_components : int, optional (default=None) + Number of selected principal components + for calculating the outlier scores. It is not necessarily equal to + the total number of the principal components. If not set, use + all principal components. + + whiten : bool, optional (default False) + When True (False by default) the `components_` vectors are multiplied + by the square root of n_samples and then divided by the singular values + to ensure uncorrelated outputs with unit component-wise variances. + + Whitening will remove some information from the transformed signal + (the relative variance scales of the components) but can sometime + improve the predictive accuracy of the downstream estimators by + making their data respect some hard-wired assumptions. + + svd_solver : string {'auto', 'full', 'arpack', 'randomized'} + auto : + the solver is selected by a default policy based on `X.shape` and + `n_components`: if the input data is larger than 500x500 and the + number of components to extract is lower than 80% of the smallest + dimension of the data, then the more efficient 'randomized' + method is enabled. Otherwise the exact full SVD is computed and + optionally truncated afterwards. + full : + run exact full SVD calling the standard LAPACK solver via + `scipy.linalg.svd` and select the components by postprocessing + arpack : + run SVD truncated to n_components calling ARPACK solver via + `scipy.sparse.linalg.svds`. It requires strictly + 0 < n_components < X.shape[1] + randomized : + run randomized SVD by the method of Halko et al. + + tol : float >= 0, optional (default .0) + Tolerance for singular values computed by svd_solver == 'arpack'. + + iterated_power : int >= 0, or 'auto', (default 'auto') + Number of iterations for the power method computed by + svd_solver == 'randomized'. + + random_state : int, RandomState instance or None, optional (default None) + If int, random_state is the seed used by the random number generator; + If RandomState instance, random_state is the random number generator; + If None, the random number generator is the RandomState instance used + by `np.random`. Used when ``svd_solver`` == 'arpack' or 'randomized'. + + weighted : bool, optional (default=True) + If True, the eigenvalues are used in score computation. + The eigenvectors with small eigenvalues comes with more importance + in outlier score calculation. + + standardization : bool, optional (default=True) + If True, perform standardization first to convert + data to zero mean and unit variance. 
+ See http://scikit-learn.org/stable/auto_examples/preprocessing/plot_scaling_importance.html + + Attributes + ---------- + decision_scores_ : numpy array of shape (n_samples,) + The outlier scores of the training data. + The higher, the more abnormal. Outliers tend to have higher + scores. This value is available once the detector is + fitted. + + threshold_ : float + The threshold is based on ``contamination``. It is the + ``n_samples * contamination`` most abnormal samples in + ``decision_scores_``. The threshold is calculated for generating + binary outlier labels. + + labels_ : int, either 0 or 1 + The binary labels of the training data. 0 stands for inliers + and 1 for outliers/anomalies. It is generated by applying + ``threshold_`` on ``decision_scores_``. + """ + + metadata = metadata_base.PrimitiveMetadata({ + "name": "PCAODetector", + "python_path": "d3m.primitives.tods.detection_algorithm.PCAODetector", + "source": {'name': "DATALAB @Taxes A&M University", 'contact': 'mailto:khlai037@tamu.edu', + 'uris': ['https://gitlab.com/lhenry15/tods.git']}, + "algorithm_types": [metadata_base.PrimitiveAlgorithmType.LOCAL_OUTLIER_FACTOR, ], # + "primitive_family": metadata_base.PrimitiveFamily.ANOMALY_DETECTION, + "version": "0.0.1", + "hyperparams_to_tune": ['n_components', 'n_selected_components', 'contamination', + 'whiten', 'svd_solver', 'tol', 'iterated_power', 'random_state', + 'standardization'], + "id": str(uuid.uuid3(uuid.NAMESPACE_DNS, 'PCAODetector')), + }) + + def __init__(self, *, + hyperparams: Hyperparams, # + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None) -> None: + super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) + + self._clf = PCA(window_size=hyperparams['window_size'], + contamination=hyperparams['contamination'], + n_components=hyperparams['n_components'], + n_selected_components=hyperparams['n_selected_components'], + whiten=hyperparams['whiten'], + svd_solver=hyperparams['svd_solver'], + tol=hyperparams['tol'], + iterated_power=hyperparams['iterated_power'], + random_state=hyperparams['random_state'], + standardization=hyperparams['standardization'], + ) + + return + + def set_training_data(self, *, inputs: Inputs) -> None: + """ + Set training data for outlier detection. + Args: + inputs: Container DataFrame + + Returns: + None + """ + super().set_training_data(inputs=inputs) + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + """ + Fit model with training data. + Args: + *: Container DataFrame. Time series data up to fit. + + Returns: + None + """ + return super().fit() + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + """ + Process the testing data. + Args: + inputs: Container DataFrame. Time series data up to outlier detection. + + Returns: + Container DataFrame + 1 marks Outliers, 0 marks normal. + """ + return super().produce(inputs=inputs, timeout=timeout, iterations=iterations) + + def produce_score(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + """ + Process the testing data. + Args: + inputs: Container DataFrame. Time series data up to outlier detection. + + Returns: + Container DataFrame + Outlier score of input DataFrame. + """ + return super().produce_score(inputs=inputs, timeout=timeout, iterations=iterations) + + def get_params(self) -> Params: + """ + Return parameters. 
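PCAODetector forwards the windowed data to the `PCA` core class. As a rough standalone illustration of the idea (scoring each window by how poorly a low-rank PCA model reconstructs it; this reconstruction-error variant is a simplification, not the exact PyOD scoring), assuming only scikit-learn and numpy:
```
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

window_size, n_components, contamination = 10, 2, 0.1

series = np.random.rand(300, 1)
series[150:160] += 5                                      # inject an anomaly

# Tabularize: each row is one window of the (univariate) series.
X = np.asarray([series[i:i + window_size].ravel()
                for i in range(len(series) - window_size + 1)])
X = StandardScaler().fit_transform(X)

pca = PCA(n_components=n_components).fit(X)
reconstructed = pca.inverse_transform(pca.transform(X))
scores = np.linalg.norm(X - reconstructed, axis=1)        # reconstruction error

threshold = np.percentile(scores, 100 * (1 - contamination))
print(np.where(scores > threshold)[0][:10])               # indices of flagged windows
```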
+ Args: + None + + Returns: + class Params + """ + return super().get_params() + + def set_params(self, *, params: Params) -> None: + """ + Set parameters for outlier detection. + Args: + params: class Params + + Returns: + None + """ + super().set_params(params=params) + + diff --git a/tods/detection_algorithm/PyodABOD.py b/tods/detection_algorithm/PyodABOD.py new file mode 100644 index 0000000..b4a1d9c --- /dev/null +++ b/tods/detection_algorithm/PyodABOD.py @@ -0,0 +1,207 @@ +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os +import sklearn +import numpy +import typing + +# Custom import commands if any +import warnings +import numpy as np +from sklearn.utils import check_array +from sklearn.exceptions import NotFittedError +# from numba import njit +from pyod.utils.utility import argmaxn + +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult, DockerContainer + +# from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase +from d3m.primitive_interfaces.unsupervised_learning import UnsupervisedLearnerPrimitiveBase +from d3m.primitive_interfaces.transformer import TransformerPrimitiveBase + +from d3m.primitive_interfaces.base import ProbabilisticCompositionalityMixin, ContinueFitMixin +from d3m import exceptions +import pandas + +from d3m import container, utils as d3m_utils + +from detection_algorithm.UODBasePrimitive import Params_ODBase, Hyperparams_ODBase, UnsupervisedOutlierDetectorBase +from pyod.models.abod import ABOD +# from typing import Union + +Inputs = d3m_dataframe +Outputs = d3m_dataframe + + +class Params(Params_ODBase): + ######## Add more Attributes ####### + + pass + + +class Hyperparams(Hyperparams_ODBase): + ######## Add more Hyperparamters ####### + + n_neighbors = hyperparams.Hyperparameter[int]( + default=10, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + description="Number of neighbors to use by default for k neighbors queries.", + ) + + method = hyperparams.Enumeration( + values=['fast', 'default'], + default='fast', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="'fast': fast ABOD. Only consider n_neighbors of training points 'default': original ABOD with all training points, which could be slow", + ) + + +class ABODPrimitive(UnsupervisedOutlierDetectorBase[Inputs, Outputs, Params, Hyperparams]): + """ + ABOD class for Angle-base Outlier Detection. + For an observation, the variance of its weighted cosine scores to all + neighbors could be viewed as the outlying score. + See :cite:`kriegel2008angle` for details. + + Two versions of ABOD are supported: + + - Fast ABOD: use k nearest neighbors to approximate. + - Original ABOD: consider all training points with high time complexity at + O(n^3). + + Parameters + ---------- + contamination : float in (0., 0.5), optional (default=0.1) + The amount of contamination of the data set, i.e. + the proportion of outliers in the data set. Used when fitting to + define the threshold on the decision function. 
+ + n_neighbors : int, optional (default=10) + Number of neighbors to use by default for k neighbors queries. + + method: str, optional (default='fast') + Valid values for metric are: + + - 'fast': fast ABOD. Only consider n_neighbors of training points + - 'default': original ABOD with all training points, which could be + slow + + Attributes + ---------- + decision_scores_ : numpy array of shape (n_samples,) + The outlier scores of the training data. + The higher, the more abnormal. Outliers tend to have higher + scores. This value is available once the detector is + fitted. + + threshold_ : float + The threshold is based on ``contamination``. It is the + ``n_samples * contamination`` most abnormal samples in + ``decision_scores_``. The threshold is calculated for generating + binary outlier labels. + + labels_ : int, either 0 or 1 + The binary labels of the training data. 0 stands for inliers + and 1 for outliers/anomalies. It is generated by applying + ``threshold_`` on ``decision_scores_``. + """ + + __author__: "DATA Lab at Texas A&M University" + metadata = metadata_base.PrimitiveMetadata({ + "name": "Angle-base Outlier Detection Primitive", + "python_path": "d3m.primitives.tods.detection_algorithm.pyod_abod", + "source": {'name': 'DATA Lab at Texas A&M University', 'contact': 'mailto:khlai037@tamu.edu', + 'uris': ['https://gitlab.com/lhenry15/tods.git', 'https://gitlab.com/lhenry15/tods/-/blob/Junjie/anomaly-primitives/anomaly_primitives/PyodABOD.py']}, + "algorithm_types": [metadata_base.PrimitiveAlgorithmType.ANGLE_BASE_OUTLIER_DETECTION], + "primitive_family": metadata_base.PrimitiveFamily.ANOMALY_DETECTION, + "id": "134f6c5f-717b-4683-bfbc-251bab07f6fa", + "hyperparams_to_tune": ['contamination', 'n_neighbors', 'method'], + "version": "0.0.1", + }) + + def __init__(self, *, + hyperparams: Hyperparams, # + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None) -> None: + super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) + + self._clf = ABOD(contamination=hyperparams['contamination'], + n_neighbors=hyperparams['n_neighbors'], + method=hyperparams['method'], + ) + + + def set_training_data(self, *, inputs: Inputs) -> None: + """ + Set training data for outlier detection. + Args: + inputs: Container DataFrame + + Returns: + None + """ + super().set_training_data(inputs=inputs) + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + """ + Fit model with training data. + Args: + *: Container DataFrame. Time series data up to fit. + + Returns: + None + """ + return super().fit() + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + """ + Process the testing data. + Args: + inputs: Container DataFrame. Time series data up to outlier detection. + + Returns: + Container DataFrame + 1 marks Outliers, 0 marks normal. + """ + return super().produce(inputs=inputs, timeout=timeout, iterations=iterations) + + def produce_score(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + """ + Process the testing data. + Args: + inputs: Container DataFrame. Time series data up to outlier detection. + Returns: + Container DataFrame + Outlier score of input DataFrame. + """ + return super().produce_score(inputs=inputs, timeout=timeout, iterations=iterations) + + def get_params(self) -> Params: + """ + Return parameters. 
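ABODPrimitive is a thin wrapper around `pyod.models.abod.ABOD`, so the underlying behaviour can be checked directly against PyOD with the same constructor arguments the primitive forwards. A minimal sketch on made-up data:
```
import numpy as np
from pyod.models.abod import ABOD

rng = np.random.RandomState(0)
X_train = np.vstack([rng.normal(0, 1, size=(95, 2)),
                     rng.normal(6, 1, size=(5, 2))])      # 5% obvious outliers

clf = ABOD(contamination=0.05, n_neighbors=10, method='fast')
clf.fit(X_train)

print(clf.labels_[:10])            # 0 = inlier, 1 = outlier (training data)
print(clf.decision_scores_[:5])    # higher means more abnormal
print(clf.predict(X_train[-5:]))   # should mostly flag the shifted points
```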
+ Args: + None + + Returns: + class Params + """ + return super().get_params() + + def set_params(self, *, params: Params) -> None: + """ + Set parameters for outlier detection. + Args: + params: class Params + + Returns: + None + """ + super().set_params(params=params) diff --git a/tods/detection_algorithm/PyodAE.py b/tods/detection_algorithm/PyodAE.py new file mode 100644 index 0000000..93f2bb8 --- /dev/null +++ b/tods/detection_algorithm/PyodAE.py @@ -0,0 +1,368 @@ +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os +import sklearn +import numpy +import typing +import tensorflow +from tensorflow.keras.losses import mean_squared_error +from tensorflow import keras +from tensorflow.keras import losses,layers +# Custom import commands if any +import warnings +import numpy as np +from sklearn.utils import check_array +from sklearn.exceptions import NotFittedError +# from numba import njit +from pyod.utils.utility import argmaxn + +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult, DockerContainer + +# from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase +from d3m.primitive_interfaces.unsupervised_learning import UnsupervisedLearnerPrimitiveBase +from d3m.primitive_interfaces.transformer import TransformerPrimitiveBase + +from d3m.primitive_interfaces.base import ProbabilisticCompositionalityMixin, ContinueFitMixin +from d3m import exceptions +import pandas + +from d3m import container, utils as d3m_utils + +from detection_algorithm.UODBasePrimitive import Params_ODBase, Hyperparams_ODBase, UnsupervisedOutlierDetectorBase +from pyod.models.auto_encoder import AutoEncoder as PyODAutoEncoder + +import uuid +# from typing import Union + +Inputs = d3m_dataframe +Outputs = d3m_dataframe + + +class Params(Params_ODBase): + ######## Add more Attributes ####### + + pass + + +class Hyperparams(Hyperparams_ODBase): + ######## Add more Hyperparamters ####### + + hidden_neurons = hyperparams.List( + default=[4, 2, 4], + elements=hyperparams.Hyperparameter[int](1), + description='The number of neurons per hidden layers.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + hidden_activation = hyperparams.Enumeration[str]( + values=['relu', 'sigmoid', 'softmax', 'softplus', 'softsign', + 'tanh', 'selu', 'elu', 'exponential'], + default='relu', + description='Activation function to use for hidden layers.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + output_activation = hyperparams.Enumeration[str]( + values=['relu', 'sigmoid', 'softmax', 'softplus', 'softsign', + 'tanh', 'selu', 'elu', 'exponential'], + default='sigmoid', + description='Activation function to use for output layer.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + loss = hyperparams.Enumeration[str]( + values=['mean_squared_error'], + default='mean_squared_error', + description='Loss function.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + + optimizer = hyperparams.Enumeration[str]( + values=['SGD', 'RMSprop', 
'adam', 'Adadelta', 'Adagrad', + 'Adamax', 'Nadam', 'Ftrl'], + default='adam', + description='String (name of optimizer) or optimizer instance.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + epochs = hyperparams.Hyperparameter[int]( + default=100, + description='Number of epochs to train the model.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + batch_size = hyperparams.Hyperparameter[int]( + default=32, + description='Number of samples per gradient update.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + dropout_rate = hyperparams.Uniform( + lower=0., + upper=1., + default=0.2, + description='The dropout to be used across all layers.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + l2_regularizer = hyperparams.Uniform( + lower=0., + upper=1., + default=0.1, + description='The regularization strength of activity_regularizer applied on each layer.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + validation_size = hyperparams.Uniform( + lower=0., + upper=1., + default=0.1, + description='The percentage of data to be used for validation.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + preprocessing = hyperparams.UniformBool( + default=True, + description='If True, apply standardization on the data.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + verbose = hyperparams.Enumeration[int]( + values=[0, 1, 2], + default=1, + description='Verbosity mode.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + random_state = hyperparams.Union[Union[int, None]]( + configuration=OrderedDict( + init=hyperparams.Hyperparameter[int]( + default=0, + ), + ninit=hyperparams.Hyperparameter[None]( + default=None, + ), + ), + default='ninit', + description='the seed used by the random number generator.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + ) + + contamination = hyperparams.Uniform( + lower=0., + upper=0.5, + default=0.1, + description='The amount of contamination of the data set, i.e. the proportion of outliers in the data set. ', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + + pass + + +class AutoEncoder(UnsupervisedOutlierDetectorBase[Inputs, Outputs, Params, Hyperparams]): + """ + Auto Encoder (AE) is a type of neural networks for learning useful data + representations unsupervisedly. Similar to PCA, AE could be used to + detect outlying objects in the data by calculating the reconstruction + errors. See :cite:`aggarwal2015outlier` Chapter 3 for details. + + Parameters + ---------- + hidden_neurons : list, optional (default=[4,2,4]) + The number of neurons per hidden layers. + + hidden_activation : str, optional (default='relu') + Activation function to use for hidden layers. + All hidden layers are forced to use the same type of activation. + See https://keras.io/activations/ + + output_activation : str, optional (default='sigmoid') + Activation function to use for output layer. + See https://keras.io/activations/ + + loss : str or obj, optional (default=keras.losses.mean_squared_error) + String (name of objective function) or objective function. 
+ See https://keras.io/losses/ + + optimizer : str, optional (default='adam') + String (name of optimizer) or optimizer instance. + See https://keras.io/optimizers/ + + epochs : int, optional (default=100) + Number of epochs to train the model. + + batch_size : int, optional (default=32) + Number of samples per gradient update. + + dropout_rate : float in (0., 1), optional (default=0.2) + The dropout to be used across all layers. + + l2_regularizer : float in (0., 1), optional (default=0.1) + The regularization strength of activity_regularizer + applied on each layer. By default, l2 regularizer is used. See + https://keras.io/regularizers/ + + validation_size : float in (0., 1), optional (default=0.1) + The percentage of data to be used for validation. + + preprocessing : bool, optional (default=True) + If True, apply standardization on the data. + + verbose : int, optional (default=1) + Verbosity mode. + - 0 = silent + - 1 = progress bar + - 2 = one line per epoch. + For verbosity >= 1, model summary may be printed. + + random_state : random_state: int, RandomState instance or None, optional + (default=None) + If int, random_state is the seed used by the random + number generator; If RandomState instance, random_state is the random + number generator; If None, the random number generator is the + RandomState instance used by `np.random`. + + contamination : float in (0., 0.5), optional (default=0.1) + The amount of contamination of the data set, i.e. + the proportion of outliers in the data set. When fitting this is used + to define the threshold on the decision function. + + Attributes + ---------- + encoding_dim_ : int + The number of neurons in the encoding layer. + + compression_rate_ : float + The ratio between the original feature and + the number of neurons in the encoding layer. + + model_ : Keras Object + The underlying AutoEncoder in Keras. + + history_: Keras Object + The AutoEncoder training history. + + decision_scores_ : numpy array of shape (n_samples,) + The outlier scores of the training data. + The higher, the more abnormal. Outliers tend to have higher + scores. This value is available once the detector is + fitted. + + threshold_ : float + The threshold is based on ``contamination``. It is the + ``n_samples * contamination`` most abnormal samples in + ``decision_scores_``. The threshold is calculated for generating + binary outlier labels. + + labels_ : int, either 0 or 1 + The binary labels of the training data. 0 stands for inliers + and 1 for outliers/anomalies. It is generated by applying + ``threshold_`` on ``decision_scores_``. 
+ """ + + metadata = metadata_base.PrimitiveMetadata({ + "name": "TODS.anomaly_detection_primitives.AutoEncoder", + "python_path": "d3m.primitives.tods.detection_algorithm.pyod_ae", + "source": {'name': "DATA Lab at Texas A&M University", 'contact': 'mailto:khlai037@tamu.edu','uris': ['https://gitlab.com/lhenry15/tods.git']}, + "algorithm_types": [metadata_base.PrimitiveAlgorithmType.VARIATIONAL_AUTO_ENCODER, ], + "primitive_family": metadata_base.PrimitiveFamily.ANOMALY_DETECTION, + "version": "0.0.1", + "hyperparameters_to_tune": [''], + "id": str(uuid.uuid3(uuid.NAMESPACE_DNS, 'AutoEncoderPrimitive')), + }) + + def __init__(self, *, + hyperparams: Hyperparams, # + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None) -> None: + super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) + if hyperparams['loss'] == 'mean_squared_error': + loss = keras.losses.mean_squared_error + else: + raise ValueError('AE only suports mean squered error for now') + + self._clf = PyODAutoEncoder(contamination=hyperparams['contamination'], + hidden_neurons=hyperparams['hidden_neurons'], + hidden_activation=hyperparams['hidden_activation'], + output_activation=hyperparams['output_activation'], + loss=loss, + optimizer=hyperparams['optimizer'], + epochs=hyperparams['epochs'], + batch_size=hyperparams['batch_size'], + dropout_rate=hyperparams['dropout_rate'], + l2_regularizer=hyperparams['l2_regularizer'], + validation_size=hyperparams['validation_size'], + preprocessing=hyperparams['preprocessing'], + verbose=hyperparams['verbose'], + random_state=hyperparams['random_state'], + ) + + return + + def set_training_data(self, *, inputs: Inputs) -> None: + """ + Set training data for outlier detection. + Args: + inputs: Container DataFrame + + Returns: + None + """ + super().set_training_data(inputs=inputs) + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + """ + Fit model with training data. + Args: + *: Container DataFrame. Time series data up to fit. + + Returns: + None + """ + return super().fit() + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + """ + Process the testing data. + Args: + inputs: Container DataFrame. Time series data up to outlier detection. + + Returns: + Container DataFrame + 1 marks Outliers, 0 marks normal. + """ + return super().produce(inputs=inputs, timeout=timeout, iterations=iterations) + + def get_params(self) -> Params: + """ + Return parameters. + Args: + None + + Returns: + class Params + """ + return super().get_params() + + def set_params(self, *, params: Params) -> None: + """ + Set parameters for outlier detection. 
+ Args: + params: class Params + + Returns: + None + """ + super().set_params(params=params) + + diff --git a/tods/detection_algorithm/PyodCBLOF.py b/tods/detection_algorithm/PyodCBLOF.py new file mode 100644 index 0000000..016b186 --- /dev/null +++ b/tods/detection_algorithm/PyodCBLOF.py @@ -0,0 +1,283 @@ +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os +import sklearn +import numpy +import typing + +# Custom import commands if any +import warnings +import numpy as np +from sklearn.utils import check_array +from sklearn.exceptions import NotFittedError +# from numba import njit +from pyod.utils.utility import argmaxn + +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult, DockerContainer + +# from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase +from d3m.primitive_interfaces.unsupervised_learning import UnsupervisedLearnerPrimitiveBase +from d3m.primitive_interfaces.transformer import TransformerPrimitiveBase + +from d3m.primitive_interfaces.base import ProbabilisticCompositionalityMixin, ContinueFitMixin +from d3m import exceptions +import pandas + +from d3m import container, utils as d3m_utils + +from detection_algorithm.UODBasePrimitive import Params_ODBase, Hyperparams_ODBase, UnsupervisedOutlierDetectorBase +from pyod.models.cblof import CBLOF +import uuid +# from typing import Union + +Inputs = d3m_dataframe +Outputs = d3m_dataframe + + +class Params(Params_ODBase): + ######## Add more Attributes ####### + + pass + +class Hyperparams(Hyperparams_ODBase): + ######## Add more Hyperparamters ####### + + n_clusters = hyperparams.Hyperparameter[int]( + default=8, + description='The number of clusters to form as well as the number of centroids to generate.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + # clustering_estimator = hyperparams.Choice( + # choices={ + # 'auto': hyperparams.Hyperparams.define( + # configuration=OrderedDict({}) + # ), + # 'full': hyperparams.Hyperparams.define( + # configuration=OrderedDict({}) + # ), + # }, + # default='auto', + # description='The base clustering algorithm for performing data clustering. A valid clustering algorithm should be passed in. The estimator should have standard sklearn APIs, fit() and predict(). The estimator should have attributes ``labels_`` and ``cluster_centers_``.', + # semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + # ) + + alpha = hyperparams.Uniform( + lower=0.5, + upper=1., + default=0.9, + description='Coefficient for deciding small and large clusters. 
The ratio of the number of samples in large clusters to the number of samples in small clusters.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + beta = hyperparams.Hyperparameter[int]( + default=5, + description='Coefficient for deciding small and large clusters.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + use_weights = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + description="If set to True, the size of clusters are used as weights in outlier score calculation." + ) + + check_estimator = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + description="If set to True, check whether the base estimator is consistent with sklearn standard." + ) + + random_state = hyperparams.Union[Union[int, None]]( + configuration=OrderedDict( + init=hyperparams.Hyperparameter[int]( + default=0, + ), + ninit=hyperparams.Hyperparameter[None]( + default=None, + ), + ), + default='ninit', + description='the seed used by the random number generator.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + ) + + pass + + +class CBLOFPrimitive(UnsupervisedOutlierDetectorBase[Inputs, Outputs, Params, Hyperparams]): + """ + The CBLOF operator calculates the outlier score based on cluster-based + local outlier factor. + CBLOF takes as an input the data set and the cluster model that was + generated by a clustering algorithm. It classifies the clusters into small + clusters and large clusters using the parameters alpha and beta. + The anomaly score is then calculated based on the size of the cluster the + point belongs to as well as the distance to the nearest large cluster. + Use weighting for outlier factor based on the sizes of the clusters as + proposed in the original publication. Since this might lead to unexpected + behavior (outliers close to small clusters are not found), it is disabled + by default.Outliers scores are solely computed based on their distance to + the closest large cluster center. + By default, kMeans is used for clustering algorithm instead of + Squeezer algorithm mentioned in the original paper for multiple reasons. + See :cite:`he2003discovering` for details. + + Parameters + ---------- + n_clusters : int, optional (default=8) + The number of clusters to form as well as the number of + centroids to generate. + + contamination : float in (0., 0.5), optional (default=0.1) + The amount of contamination of the data set, + i.e. the proportion of outliers in the data set. Used when fitting to + define the threshold on the decision function. + + clustering_estimator : Estimator, optional (default=None) + The base clustering algorithm for performing data clustering. + A valid clustering algorithm should be passed in. The estimator should + have standard sklearn APIs, fit() and predict(). The estimator should + have attributes ``labels_`` and ``cluster_centers_``. + If ``cluster_centers_`` is not in the attributes once the model is fit, + it is calculated as the mean of the samples in a cluster. + If not set, CBLOF uses KMeans for scalability. See + https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html + + alpha : float in (0.5, 1), optional (default=0.9) + Coefficient for deciding small and large clusters. 
The ratio + of the number of samples in large clusters to the number of samples in + small clusters. + + beta : int or float in (1,), optional (default=5). + Coefficient for deciding small and large clusters. For a list + sorted clusters by size `|C1|, \|C2|, ..., |Cn|, beta = |Ck|/|Ck-1|` + + use_weights : bool, optional (default=False) + If set to True, the size of clusters are used as weights in + outlier score calculation. + + check_estimator : bool, optional (default=False) + If set to True, check whether the base estimator is consistent with + sklearn standard. + .. warning:: + check_estimator may throw errors with scikit-learn 0.20 above. + + random_state : int, RandomState or None, optional (default=None) + If int, random_state is the seed used by the random + number generator; If RandomState instance, random_state is the random + number generator; If None, the random number generator is the + RandomState instance used by `np.random`. + + Attributes + ---------- + decision_scores_ : numpy array of shape (n_samples,) + The outlier scores of the training data. + The higher, the more abnormal. Outliers tend to have higher + scores. This value is available once the detector is + fitted. + threshold_ : float + The threshold is based on ``contamination``. It is the + ``n_samples * contamination`` most abnormal samples in + ``decision_scores_``. The threshold is calculated for generating + binary outlier labels. + labels_ : int, either 0 or 1 + The binary labels of the training data. 0 stands for inliers + and 1 for outliers/anomalies. It is generated by applying + ``threshold_`` on ``decision_scores_``. + """ + + metadata = metadata_base.PrimitiveMetadata({ + "name": "TODS.anomaly_detection_primitives.CBLOFPrimitive", + "python_path": "d3m.primitives.tods.detection_algorithm.pyod_cblof", + "source": {'name': "DATALAB @Taxes A&M University", 'contact': 'mailto:khlai037@tamu.edu', + 'uris': ['https://gitlab.com/lhenry15/tods.git']}, + "algorithm_types": [metadata_base.PrimitiveAlgorithmType.LOCAL_OUTLIER_FACTOR, ], + "primitive_family": metadata_base.PrimitiveFamily.ANOMALY_DETECTION, + "version": "0.0.1", + "hyperparams_to_tune": ['contamination'], + "id": str(uuid.uuid3(uuid.NAMESPACE_DNS, 'CBLOFPrimitive')), + }) + + def __init__(self, *, + hyperparams: Hyperparams, # + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None) -> None: + super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) + + self._clf = CBLOF(contamination=hyperparams['contamination'], + n_clusters=hyperparams['n_clusters'], + alpha=hyperparams['alpha'], + beta=hyperparams['beta'], + use_weights=hyperparams['use_weights'], + check_estimator=hyperparams['check_estimator'], + random_state=hyperparams['random_state'], + ) + + return + + def set_training_data(self, *, inputs: Inputs) -> None: + """ + Set training data for outlier detection. + Args: + inputs: Container DataFrame + + Returns: + None + """ + super().set_training_data(inputs=inputs) + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + """ + Fit model with training data. + Args: + *: Container DataFrame. Time series data up to fit. + + Returns: + None + """ + return super().fit() + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + """ + Process the testing data. + Args: + inputs: Container DataFrame. Time series data up to outlier detection. 
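Because this class is a thin wrapper over `pyod.models.cblof.CBLOF`, the interplay of `n_clusters`, `alpha`, and `beta` is easiest to see on the PyOD estimator itself. The standalone sketch below uses synthetic data and assumed parameter values; since CBLOF raises a `ValueError` when `alpha`/`beta` cannot split the fitted clusters into large and small groups, the data is built around one dominant cluster.
```
import numpy as np
from pyod.models.cblof import CBLOF

rng = np.random.RandomState(0)
X = np.vstack([
    rng.normal(0.0, 0.5, size=(500, 2)),   # one dominant cluster of inliers
    rng.uniform(4.0, 8.0, size=(15, 2)),   # a small group of far-away points
])

# kMeans is the default clustering_estimator, as noted in the docstring above.
clf = CBLOF(n_clusters=4, alpha=0.9, beta=5, contamination=0.03, random_state=0)
clf.fit(X)

print(clf.labels_[-15:])          # the scattered tail should mostly be flagged as 1
print(clf.decision_scores_[:5])   # distance-based scores w.r.t. the large clusters
```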
+ + Returns: + Container DataFrame + 1 marks Outliers, 0 marks normal. + """ + return super().produce(inputs=inputs, timeout=timeout, iterations=iterations) + + def get_params(self) -> Params: + """ + Return parameters. + Args: + None + + Returns: + class Params + """ + return super().get_params() + + def set_params(self, *, params: Params) -> None: + """ + Set parameters for outlier detection. + Args: + params: class Params + + Returns: + None + """ + super().set_params(params=params) + + diff --git a/tods/detection_algorithm/PyodCOF.py b/tods/detection_algorithm/PyodCOF.py new file mode 100644 index 0000000..98662ad --- /dev/null +++ b/tods/detection_algorithm/PyodCOF.py @@ -0,0 +1,198 @@ +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os +import sklearn +import numpy +import typing + +# Custom import commands if any +import warnings +import numpy as np +from sklearn.utils import check_array +from sklearn.exceptions import NotFittedError +# from numba import njit +from pyod.utils.utility import argmaxn + +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult, DockerContainer + +# from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase +from d3m.primitive_interfaces.unsupervised_learning import UnsupervisedLearnerPrimitiveBase +from d3m.primitive_interfaces.transformer import TransformerPrimitiveBase + +from d3m.primitive_interfaces.base import ProbabilisticCompositionalityMixin, ContinueFitMixin +from d3m import exceptions +import pandas + +from d3m import container, utils as d3m_utils + +from detection_algorithm.UODBasePrimitive import Params_ODBase, Hyperparams_ODBase, UnsupervisedOutlierDetectorBase +from pyod.models.cof import COF +# import uuid + + +Inputs = d3m_dataframe +Outputs = d3m_dataframe + + +class Params(Params_ODBase): + ######## Add more Attributes ####### + + pass + + +class Hyperparams(Hyperparams_ODBase): + ######## Add more Hyperparamters ####### + + + n_neighbors = hyperparams.Hyperparameter[int]( + default=5, + description='Number of neighbors to use by default for k neighbors queries.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + # method = hyperparams.Enumeration[str]( + # values=['largest', 'mean', 'median'], + # default='largest', + # description='Method to calculate outlier score.', + # semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + # ) + + + +class PyodCOF(UnsupervisedOutlierDetectorBase[Inputs, Outputs, Params, Hyperparams]): + """ + Connectivity-Based Outlier Factor (COF) COF uses the ratio of average + chaining distance of data point and the average of average chaining + distance of k nearest neighbor of the data point, as the outlier score + for observations. + See :cite:`tang2002enhancing` for details. + Parameters + ---------- + contamination : float in (0., 0.5), optional (default=0.1) + The amount of contamination of the data set, i.e. + the proportion of outliers in the data set. Used when fitting to + define the threshold on the decision function. 
+ n_neighbors : int, optional (default=20) + Number of neighbors to use by default for k neighbors queries. + Note that n_neighbors should be less than the number of samples. + If n_neighbors is larger than the number of samples provided, + all samples will be used. + Attributes + ---------- + decision_scores_ : numpy array of shape (n_samples,) + The outlier scores of the training data. + The higher, the more abnormal. Outliers tend to have higher + scores. This value is available once the detector is + fitted. + threshold_ : float + The threshold is based on ``contamination``. It is the + ``n_samples * contamination`` most abnormal samples in + ``decision_scores_``. The threshold is calculated for generating + binary outlier labels. + labels_ : int, either 0 or 1 + The binary labels of the training data. 0 stands for inliers + and 1 for outliers/anomalies. It is generated by applying + ``threshold_`` on ``decision_scores_``. + n_neighbors_: int + Number of neighbors to use by default for k neighbors queries. + """ + + __author__ = "Data Lab" + metadata = metadata_base.PrimitiveMetadata( + { + '__author__' : "DATA Lab at Texas A&M University", + 'name': "Connectivity-Based Outlier Factor (COF)", + 'python_path': 'd3m.primitives.tods.detection_algorithm.pyod_cof', + 'source': { + 'name': 'DATA Lab at Texas A&M University', + 'contact': 'mailto:khlai037@tamu.edu', + 'uris': [ + 'https://gitlab.com/lhenry15/tods.git', + 'https://gitlab.com/lhenry15/tods/-/blob/purav/anomaly-primitives/anomaly_primitives/PyodCOF.py', + ], + }, + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.PYOD_COF, + ], + 'primitive_family': metadata_base.PrimitiveFamily.ANOMALY_DETECTION, + 'id': 'c7259da6-7ce6-42ad-83c6-15238679f5fa', + 'hyperparameters_to_tune':['rank','update','objective','max_iter','learning_rate'], + 'version': '0.0.1', + }, + ) + + def __init__(self, *, + hyperparams: Hyperparams, # + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None) -> None: + super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) + + self._clf = COF(contamination=hyperparams['contamination'], + n_neighbors=hyperparams['n_neighbors'], + ) + + return + + def set_training_data(self, *, inputs: Inputs) -> None: + """ + Set training data for outlier detection. + Args: + inputs: Container DataFrame + + Returns: + None + """ + super().set_training_data(inputs=inputs) + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + """ + Fit model with training data. + Args: + *: Container DataFrame. Time series data up to fit. + + Returns: + None + """ + return super().fit() + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + """ + Process the testing data. + Args: + inputs: Container DataFrame. Time series data up to outlier detection. + + Returns: + Container DataFrame + 1 marks Outliers, 0 marks normal. + """ + return super().produce(inputs=inputs, timeout=timeout, iterations=iterations) + + def get_params(self) -> Params: + """ + Return parameters. + Args: + None + + Returns: + class Params + """ + return super().get_params() + + def set_params(self, *, params: Params) -> None: + """ + Set parameters for outlier detection. 
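A usage sketch for the COF wrapper is given below. The import path, the synthetic sine series, and the override values are assumptions for illustration; as the docstring notes, `n_neighbors` must stay below the number of samples, and COF's pairwise computations make it best suited to moderately sized inputs.
```
import numpy as np
import pandas as pd
from d3m import container

# Assumed import path; the wrapper class above is named PyodCOF.
from tods.detection_algorithm.PyodCOF import PyodCOF, Hyperparams

rng = np.random.RandomState(1)
series = np.sin(np.linspace(0, 20 * np.pi, 400)) + rng.normal(0, 0.05, 400)
series[[60, 210, 350]] = 4.0  # three isolated spikes
inputs = container.DataFrame(pd.DataFrame({'value': series}), generate_metadata=True)

cof = PyodCOF(hyperparams=Hyperparams.defaults().replace({'n_neighbors': 20,
                                                          'contamination': 0.01}))
cof.set_training_data(inputs=inputs)
cof.fit()
labels = cof.produce(inputs=inputs).value  # 1 marks outliers, 0 marks normal
```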
+ Args: + params: class Params + + Returns: + None + """ + super().set_params(params=params) diff --git a/tods/detection_algorithm/PyodHBOS.py b/tods/detection_algorithm/PyodHBOS.py new file mode 100644 index 0000000..34c9e00 --- /dev/null +++ b/tods/detection_algorithm/PyodHBOS.py @@ -0,0 +1,216 @@ +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os +import sklearn +import numpy +import typing + +# Custom import commands if any +import warnings +import numpy as np +from sklearn.utils import check_array +from sklearn.exceptions import NotFittedError +# from numba import njit +from pyod.utils.utility import argmaxn + +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult, DockerContainer + +# from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase +from d3m.primitive_interfaces.unsupervised_learning import UnsupervisedLearnerPrimitiveBase +from d3m.primitive_interfaces.transformer import TransformerPrimitiveBase + +from d3m.primitive_interfaces.base import ProbabilisticCompositionalityMixin, ContinueFitMixin +from d3m import exceptions +import pandas + +from d3m import container, utils as d3m_utils + +from detection_algorithm.UODBasePrimitive import Params_ODBase, Hyperparams_ODBase, UnsupervisedOutlierDetectorBase +from pyod.models.hbos import HBOS +# from typing import Union + +Inputs = d3m_dataframe +Outputs = d3m_dataframe + + +class Params(Params_ODBase): + bin_edges_ : Optional[ndarray] + hist_ : Optional[ndarray] + + +class Hyperparams(Hyperparams_ODBase): + ######## Add more Hyperparamters ####### + + n_bins = hyperparams.Hyperparameter[int]( + default=10, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + description="The number of bins.", + ) + + alpha = hyperparams.Uniform( + default = 0.1, + lower = 0.0, + upper = 1.0, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + description='The regularizer for preventing overflow.', + ) + + tol = hyperparams.Uniform( + default = 0.1, + lower = 0.0, + upper = 1.0, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + description='The parameter to decide the flexibility while dealing the samples falling outside the bins.', + ) + + +class HBOSPrimitive(UnsupervisedOutlierDetectorBase[Inputs, Outputs, Params, Hyperparams]): + """ + Histogram-based Outlier Detection (HBOS) + Histogram- based outlier detection (HBOS) is an efficient unsupervised + method. It assumes the feature independence and calculates the degree + of outlyingness by building histograms. See :cite:`goldstein2012histogram` + for details. + + Parameters + ---------- + n_bins : int, optional (default=10) + The number of bins. + + alpha : float in (0, 1), optional (default=0.1) + The regularizer for preventing overflow. + + tol : float in (0, 1), optional (default=0.1) + The parameter to decide the flexibility while dealing + the samples falling outside the bins. + + contamination : float in (0., 0.5), optional (default=0.1) + The amount of contamination of the data set, + i.e. the proportion of outliers in the data set. 
Used when fitting to + define the threshold on the decision function. + + Attributes + ---------- + bin_edges_ : numpy array of shape (n_bins + 1, n_features ) + The edges of the bins. + + hist_ : numpy array of shape (n_bins, n_features) + The density of each histogram. + + decision_scores_ : numpy array of shape (n_samples,) + The outlier scores of the training data. + The higher, the more abnormal. Outliers tend to have higher + scores. This value is available once the detector is fitted. + + threshold_ : float + The threshold is based on ``contamination``. It is the + ``n_samples * contamination`` most abnormal samples in + ``decision_scores_``. The threshold is calculated for generating + binary outlier labels. + + labels_ : int, either 0 or 1 + The binary labels of the training data. 0 stands for inliers + and 1 for outliers/anomalies. It is generated by applying + ``threshold_`` on ``decision_scores_``. + """ + + __author__: "DATA Lab at Texas A&M University" + metadata = metadata_base.PrimitiveMetadata({ + "name": "HBOS", + "python_path": "d3m.primitives.tods.detection_algorithm.pyod_hbos", + "source": {'name': 'DATA Lab at Texas A&M University', 'contact': 'mailto:khlai037@tamu.edu', + 'uris': ['https://gitlab.com/lhenry15/tods.git', 'https://gitlab.com/lhenry15/tods/-/blob/Junjie/anomaly-primitives/anomaly_primitives/PyodHBOS.py']}, + "algorithm_types": [metadata_base.PrimitiveAlgorithmType.HISTOGRAM_BASED_OUTLIER_DETECTION], + "primitive_family": metadata_base.PrimitiveFamily.ANOMALY_DETECTION, + "id": "dc722a9f-7621-4900-9b77-7b3b7631ff5e", + "hyperparams_to_tune": ['contamination', 'n_bins', 'alpha', 'tol'], + "version": "0.0.1", + }) + + def __init__(self, *, + hyperparams: Hyperparams, # + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None) -> None: + super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) + + self._clf = HBOS(contamination=hyperparams['contamination'], + n_bins=hyperparams['n_bins'], + alpha=hyperparams['alpha'], + tol=hyperparams['tol'], + ) + + + def set_training_data(self, *, inputs: Inputs) -> None: + """ + Set training data for outlier detection. + Args: + inputs: Container DataFrame + + Returns: + None + """ + super().set_training_data(inputs=inputs) + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + """ + Fit model with training data. + Args: + *: Container DataFrame. Time series data up to fit. + + Returns: + None + """ + return super().fit() + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + """ + Process the testing data. + Args: + inputs: Container DataFrame. Time series data up to outlier detection. + + Returns: + Container DataFrame + 1 marks Outliers, 0 marks normal. + """ + return super().produce(inputs=inputs, timeout=timeout, iterations=iterations) + + def produce_score(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + """ + Process the testing data. + Args: + inputs: Container DataFrame. Time series data up to outlier detection. + Returns: + Container DataFrame + Outlier score of input DataFrame. + """ + return super().produce_score(inputs=inputs, timeout=timeout, iterations=iterations) + + def get_params(self) -> Params: + """ + Return parameters. 
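Unlike most of the other wrappers, HBOS declares `bin_edges_` and `hist_` in its `Params`, so a fitted detector can be serialized and restored through `get_params` / `set_params`. The sketch below assumes the base class restores the fitted state from `Params` as those docstrings describe; the import path and the data are illustrative.
```
import numpy as np
import pandas as pd
from d3m import container

# Assumed import path for the wrapper defined above.
from tods.detection_algorithm.PyodHBOS import HBOSPrimitive, Hyperparams

rng = np.random.RandomState(7)
data = pd.DataFrame({'value': np.r_[rng.normal(0, 1, 300), rng.normal(10, 1, 5)]})
inputs = container.DataFrame(data, generate_metadata=True)

hp = Hyperparams.defaults().replace({'n_bins': 20, 'contamination': 0.02})
hbos = HBOSPrimitive(hyperparams=hp)
hbos.set_training_data(inputs=inputs)
hbos.fit()

# Round-trip the fitted state: get_params captures the histograms, set_params
# injects them into a fresh instance so produce can run without refitting.
params = hbos.get_params()
hbos_clone = HBOSPrimitive(hyperparams=hp)
hbos_clone.set_params(params=params)

scores = hbos_clone.produce_score(inputs=inputs).value  # higher = more abnormal
labels = hbos_clone.produce(inputs=inputs).value        # 1 marks outliers
```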
+ Args: + None + + Returns: + class Params + """ + return super().get_params() + + def set_params(self, *, params: Params) -> None: + """ + Set parameters for outlier detection. + Args: + params: class Params + + Returns: + None + """ + super().set_params(params=params) diff --git a/tods/detection_algorithm/PyodIsolationForest.py b/tods/detection_algorithm/PyodIsolationForest.py new file mode 100644 index 0000000..1960876 --- /dev/null +++ b/tods/detection_algorithm/PyodIsolationForest.py @@ -0,0 +1,276 @@ +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os +import sklearn +import numpy +import typing + +# Custom import commands if any +import warnings +import numpy as np +from sklearn.utils import check_array +from sklearn.exceptions import NotFittedError +# from numba import njit +from pyod.utils.utility import argmaxn + +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult, DockerContainer + +# from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase +from d3m.primitive_interfaces.unsupervised_learning import UnsupervisedLearnerPrimitiveBase +from d3m.primitive_interfaces.transformer import TransformerPrimitiveBase + +from d3m.primitive_interfaces.base import ProbabilisticCompositionalityMixin, ContinueFitMixin +from d3m import exceptions +import pandas + +from d3m import container, utils as d3m_utils + +from detection_algorithm.UODBasePrimitive import Params_ODBase, Hyperparams_ODBase, UnsupervisedOutlierDetectorBase +from pyod.models.iforest import IForest +from typing import Union +import uuid + +Inputs = d3m_dataframe +Outputs = d3m_dataframe + + +class Params(Params_ODBase): + ######## Add more Attributes ####### + + pass + + +class Hyperparams(Hyperparams_ODBase): + ######## Add more Hyperparamters ####### + + n_estimators = hyperparams.Hyperparameter[int]( + default=100, + description='The number of base estimators in the ensemble.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + max_samples = hyperparams.Enumeration[str]( + values=['auto', 'int', 'float'], + default='auto', # 'box-cox', # + description='The number of samples to draw from X to train each base estimator.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + max_features = hyperparams.Hyperparameter[float]( + default=1., + description='The number of features to draw from X to train each base estimator.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + bootstrap = hyperparams.UniformBool( + default=False, + description='If True, individual trees are fit on random subsets of the training data sampled with replacement. 
If False, sampling without replacement is performed.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + behaviour = hyperparams.Enumeration[str]( + values=['old', 'new'], + default='new', + description='Refer to https://github.com/yzhao062/pyod/blob/master/pyod/models/iforest.py.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + random_state = hyperparams.Union[Union[int, None]]( + configuration=OrderedDict( + init=hyperparams.Hyperparameter[int]( + default=0, + ), + ninit=hyperparams.Hyperparameter[None]( + default=None, + ), + ), + default='ninit', + description='the seed used by the random number generator.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + ) + + verbose = hyperparams.Hyperparameter[int]( + default=0, + description='Controls the verbosity of the tree building process.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + pass + + +class IsolationForest(UnsupervisedOutlierDetectorBase[Inputs, Outputs, Params, Hyperparams]): + + """ + Wrapper of Pyod Isolation Forest with more functionalities. + The IsolationForest 'isolates' observations by randomly selecting a + feature and then randomly selecting a split value between the maximum and + minimum values of the selected feature. + See :cite:`liu2008isolation,liu2012isolation` for details. + Since recursive partitioning can be represented by a tree structure, the + number of splittings required to isolate a sample is equivalent to the path + length from the root node to the terminating node. + This path length, averaged over a forest of such random trees, is a + measure of normality and our decision function. + Random partitioning produces noticeably shorter paths for anomalies. + Hence, when a forest of random trees collectively produce shorter path + lengths for particular samples, they are highly likely to be anomalies. + + Parameters + ---------- + n_estimators : int, optional (default=100) + The number of base estimators in the ensemble. + + max_samples : int or float, optional (default="auto") + The number of samples to draw from X to train each base estimator. + - If int, then draw `max_samples` samples. + - If float, then draw `max_samples * X.shape[0]` samples. + - If "auto", then `max_samples=min(256, n_samples)`. + If max_samples is larger than the number of samples provided, + all samples will be used for all trees (no sampling). + + contamination : float in (0., 0.5), optional (default=0.1) + The amount of contamination of the data set, i.e. the proportion + of outliers in the data set. Used when fitting to define the threshold + on the decision function. + + max_features : int or float, optional (default=1.0) + The number of features to draw from X to train each base estimator. + - If int, then draw `max_features` features. + - If float, then draw `max_features * X.shape[1]` features. + + bootstrap : bool, optional (default=False) + If True, individual trees are fit on random subsets of the training + data sampled with replacement. If False, sampling without replacement + is performed. + + behaviour : str, default='old' + Behaviour of the ``decision_function`` which can be either 'old' or + 'new'. Passing ``behaviour='new'`` makes the ``decision_function`` + change to match other anomaly detection algorithm API which will be + the default behaviour in the future. 
As explained in details in the + ``offset_`` attribute documentation, the ``decision_function`` becomes + dependent on the contamination parameter, in such a way that 0 becomes + its natural threshold to detect outliers. + + random_state : int, RandomState instance or None, optional (default=None) + If int, random_state is the seed used by the random number generator; + If RandomState instance, random_state is the random number generator; + If None, the random number generator is the RandomState instance used + by `np.random`. + verbose : int, optional (default=0) + Controls the verbosity of the tree building process. + + Attributes + ---------- + decision_scores_ : numpy array of shape (n_samples,) + The outlier scores of the training data. + The higher, the more abnormal. Outliers tend to have higher + scores. This value is available once the detector is + fitted. + threshold_ : float + The threshold is based on ``contamination``. It is the + ``n_samples * contamination`` most abnormal samples in + ``decision_scores_``. The threshold is calculated for generating + binary outlier labels. + labels_ : int, either 0 or 1 + The binary labels of the training data. 0 stands for inliers + and 1 for outliers/anomalies. It is generated by applying + ``threshold_`` on ``decision_scores_``. + """ + + metadata = metadata_base.PrimitiveMetadata({ + "name": "TODS.anomaly_detection_primitives.IsolationForest", + "python_path": "d3m.primitives.tods.detection_algorithm.pyod_iforest", + "source": {'name': "DATALAB @Taxes A&M University", 'contact': 'mailto:khlai037@tamu.edu', + 'uris': ['https://gitlab.com/lhenry15/tods.git']}, + "algorithm_types": [metadata_base.PrimitiveAlgorithmType.ISOLATION_FOREST, ], + "primitive_family": metadata_base.PrimitiveFamily.ANOMALY_DETECTION, + "version": "0.0.1", + "hyperparams_to_tune": ['n_estimators', 'contamination'], + "id": str(uuid.uuid3(uuid.NAMESPACE_DNS, 'IsolationForest')) + }) + + def __init__(self, *, + hyperparams: Hyperparams, # + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None) -> None: + super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) + + self._clf = IForest(contamination=hyperparams['contamination'], + n_estimators=hyperparams['n_estimators'], + max_samples=hyperparams['max_samples'], + max_features=hyperparams['max_features'], + bootstrap=hyperparams['bootstrap'], + behaviour=hyperparams['behaviour'], + random_state=hyperparams['random_state'], + verbose=hyperparams['verbose'], + ) + + + return + + def set_training_data(self, *, inputs: Inputs) -> None: + """ + Set training data for outlier detection. + Args: + inputs: Container DataFrame + + Returns: + None + """ + super().set_training_data(inputs=inputs) + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + """ + Fit model with training data. + Args: + *: Container DataFrame. Time series data up to fit. + + Returns: + None + """ + return super().fit() + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + """ + Process the testing data. + Args: + inputs: Container DataFrame. Time series data up to outlier detection. + + Returns: + Container DataFrame + 1 marks Outliers, 0 marks normal. + """ + return super().produce(inputs=inputs, timeout=timeout, iterations=iterations) + + def get_params(self) -> Params: + """ + Return parameters. 
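The sketch below shows how the Isolation Forest wrapper is typically driven, including the `Union`-typed `random_state` hyperparameter (pass an int for reproducible trees, or keep the `'ninit'` default for `None`). The import path, the two-column frame, and the override values are illustrative assumptions.
```
import numpy as np
import pandas as pd
from d3m import container

# Assumed import path; the wrapper class above is named IsolationForest.
from tods.detection_algorithm.PyodIsolationForest import IsolationForest, Hyperparams

rng = np.random.RandomState(3)
frame = pd.DataFrame({
    'cpu': rng.normal(50, 5, 1000),
    'mem': rng.normal(60, 8, 1000),
})
frame.iloc[::100] = [99.0, 99.0]  # every 100th row becomes a saturated reading
inputs = container.DataFrame(frame, generate_metadata=True)

hp = Hyperparams.defaults().replace({
    'n_estimators': 200,
    'contamination': 0.01,   # matches the ~1% of injected anomalies
    'random_state': 42,      # int branch of the Union hyperparameter
})
iforest = IsolationForest(hyperparams=hp)
iforest.set_training_data(inputs=inputs)
iforest.fit()
labels = iforest.produce(inputs=inputs).value  # 1 marks outliers, 0 marks normal
```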
+ Args: + None + + Returns: + class Params + """ + return super().get_params() + + def set_params(self, *, params: Params) -> None: + """ + Set parameters for outlier detection. + Args: + params: class Params + + Returns: + None + """ + super().set_params(params=params) + + diff --git a/tods/detection_algorithm/PyodKNN.py b/tods/detection_algorithm/PyodKNN.py new file mode 100644 index 0000000..c674bca --- /dev/null +++ b/tods/detection_algorithm/PyodKNN.py @@ -0,0 +1,317 @@ +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os +import sklearn +import numpy +import typing + +# Custom import commands if any +import warnings +import numpy as np +from sklearn.utils import check_array +from sklearn.exceptions import NotFittedError +# from numba import njit +from pyod.utils.utility import argmaxn + +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult, DockerContainer + +# from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase +from d3m.primitive_interfaces.unsupervised_learning import UnsupervisedLearnerPrimitiveBase +from d3m.primitive_interfaces.transformer import TransformerPrimitiveBase + +from d3m.primitive_interfaces.base import ProbabilisticCompositionalityMixin, ContinueFitMixin +from d3m import exceptions +import pandas + +from d3m import container, utils as d3m_utils + +from detection_algorithm.UODBasePrimitive import Params_ODBase, Hyperparams_ODBase, UnsupervisedOutlierDetectorBase +from pyod.models.knn import KNN +import uuid +# from typing import Union + +Inputs = d3m_dataframe +Outputs = d3m_dataframe + + +class Params(Params_ODBase): + ######## Add more Attributes ####### + + pass + + +class Hyperparams(Hyperparams_ODBase): + ######## Add more Hyperparamters ####### + + n_neighbors = hyperparams.Hyperparameter[int]( + default=5, + description='Number of neighbors to use by default for k neighbors queries.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + method = hyperparams.Enumeration[str]( + values=['largest', 'mean', 'median'], + default='largest', + description='Method to calculate outlier score.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + radius = hyperparams.Hyperparameter[float]( + default=1.0, + description='Range of parameter space to use by default for `radius_neighbors` queries.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + algorithm = hyperparams.Enumeration[str]( + values=['auto', 'ball_tree', 'kd_tree', 'brute'], + default='auto', + description='Algorithm used to compute the nearest neighbors.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + leaf_size = hyperparams.Hyperparameter[int]( + default=30, + description='Leaf size passed to `BallTree` or `KDTree`. This can affect the speed of the construction and query, as well as the memory required to store the tree. 
The optimal value depends on the nature of the problem.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + + metric = hyperparams.Enumeration[str]( + values=['cityblock', 'cosine', 'euclidean', 'l1', 'l2', + 'manhattan', 'braycurtis', 'canberra', 'chebyshev', + 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', + 'mahalanobis', 'matching', 'minkowski', 'rogerstanimoto', + 'russellrao', 'seuclidean', 'sokalmichener', 'sokalsneath', + 'sqeuclidean', 'yule'], + default='minkowski', + description='metric used for the distance computation. Any metric from scikit-learn or scipy.spatial.distance can be used.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + + p = hyperparams.Hyperparameter[int]( + default=2, + description='Parameter for the Minkowski metric from.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + metric_params = hyperparams.Union[Union[Dict, None]]( + configuration=OrderedDict( + init=hyperparams.Hyperparameter[Dict]( + default={}, + ), + ninit=hyperparams.Hyperparameter[None]( + default=None, + ), + ), + default='ninit', + description='Additional keyword arguments for the metric function.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + ) + + pass + + +class KNNPrimitive(UnsupervisedOutlierDetectorBase[Inputs, Outputs, Params, Hyperparams]): + """ + kNN class for outlier detection. + For an observation, its distance to its kth nearest neighbor could be + viewed as the outlying score. It could be viewed as a way to measure + the density. See :cite:`ramaswamy2000efficient,angiulli2002fast` for + details. + Three kNN detectors are supported: + largest: use the distance to the kth neighbor as the outlier score + mean: use the average of all k neighbors as the outlier score + median: use the median of the distance to k neighbors as the outlier score + + Parameters + ---------- + contamination : float in (0., 0.5), optional (default=0.1) + The amount of contamination of the data set, + i.e. the proportion of outliers in the data set. Used when fitting to + define the threshold on the decision function. + + n_neighbors : int, optional (default = 5) + Number of neighbors to use by default for k neighbors queries. + + method : str, optional (default='largest') + {'largest', 'mean', 'median'} + - 'largest': use the distance to the kth neighbor as the outlier score + - 'mean': use the average of all k neighbors as the outlier score + - 'median': use the median of the distance to k neighbors as the + outlier score + + radius : float, optional (default = 1.0) + Range of parameter space to use by default for `radius_neighbors` + queries. + + algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional + Algorithm used to compute the nearest neighbors: + - 'ball_tree' will use BallTree + - 'kd_tree' will use KDTree + - 'brute' will use a brute-force search. + - 'auto' will attempt to decide the most appropriate algorithm + based on the values passed to :meth:`fit` method. + Note: fitting on sparse input will override the setting of + this parameter, using brute force. + .. deprecated:: 0.74 + ``algorithm`` is deprecated in PyOD 0.7.4 and will not be + possible in 0.7.6. It has to use BallTree for consistency. + + leaf_size : int, optional (default = 30) + Leaf size passed to BallTree. This can affect the + speed of the construction and query, as well as the memory + required to store the tree. 
The optimal value depends on the + nature of the problem. + + metric : string or callable, default 'minkowski' + metric to use for distance computation. Any metric from scikit-learn + or scipy.spatial.distance can be used. + If metric is a callable function, it is called on each + pair of instances (rows) and the resulting value recorded. The callable + should take two arrays as input and return one value indicating the + distance between them. This works for Scipy's metrics, but is less + efficient than passing the metric name as a string. + Distance matrices are not supported. + Valid values for metric are: + - from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', + 'manhattan'] + - from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev', + 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', + 'mahalanobis', 'matching', 'minkowski', 'rogerstanimoto', + 'russellrao', 'seuclidean', 'sokalmichener', 'sokalsneath', + 'sqeuclidean', 'yule'] + See the documentation for scipy.spatial.distance for details on these + metrics. + + p : integer, optional (default = 2) + Parameter for the Minkowski metric from + sklearn.metrics.pairwise.pairwise_distances. When p = 1, this is + equivalent to using manhattan_distance (l1), and euclidean_distance + (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used. + See http://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.pairwise_distances + + metric_params : dict, optional (default = None) + Additional keyword arguments for the metric function. + + n_jobs : int, optional (default = 1) + The number of parallel jobs to run for neighbors search. + If ``-1``, then the number of jobs is set to the number of CPU cores. + Affects only kneighbors and kneighbors_graph methods. + + Attributes + ---------- + decision_scores_ : numpy array of shape (n_samples,) + The outlier scores of the training data. + The higher, the more abnormal. Outliers tend to have higher + scores. This value is available once the detector is + fitted. + + threshold_ : float + The threshold is based on ``contamination``. It is the + ``n_samples * contamination`` most abnormal samples in + ``decision_scores_``. The threshold is calculated for generating + binary outlier labels. + + labels_ : int, either 0 or 1 + The binary labels of the training data. 0 stands for inliers + and 1 for outliers/anomalies. It is generated by applying + ``threshold_`` on ``decision_scores_``. 
+ """ + + metadata = metadata_base.PrimitiveMetadata({ + "name": "TODS.anomaly_detection_primitives.KNNPrimitive", + "python_path": "d3m.primitives.tods.detection_algorithm.pyod_knn", + "source": {'name': "DATALAB @Taxes A&M University", 'contact': 'mailto:khlai037@tamu.edu', + 'uris': ['https://gitlab.com/lhenry15/tods.git']}, + "algorithm_types": [metadata_base.PrimitiveAlgorithmType.K_NEAREST_NEIGHBORS, ], + "primitive_family": metadata_base.PrimitiveFamily.ANOMALY_DETECTION, + "version": "0.0.1", + "hyperparams_to_tune": ['n_neighbors', 'method', 'radius', 'algorithm', 'leaf_size', 'p'], + "id": str(uuid.uuid3(uuid.NAMESPACE_DNS, 'KNNPrimitive')), + }) + + def __init__(self, *, + hyperparams: Hyperparams, # + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None) -> None: + super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) + + self._clf = KNN(contamination=hyperparams['contamination'], + n_neighbors=hyperparams['n_neighbors'], + method=hyperparams['method'], + radius=hyperparams['radius'], + algorithm=hyperparams['algorithm'], + leaf_size=hyperparams['leaf_size'], + metric=hyperparams['metric'], + metric_params=hyperparams['metric_params'], + p=hyperparams['p'], + ) + + return + + def set_training_data(self, *, inputs: Inputs) -> None: + """ + Set training data for outlier detection. + Args: + inputs: Container DataFrame + + Returns: + None + """ + super().set_training_data(inputs=inputs) + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + """ + Fit model with training data. + Args: + *: Container DataFrame. Time series data up to fit. + + Returns: + None + """ + return super().fit() + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + """ + Process the testing data. + Args: + inputs: Container DataFrame. Time series data up to outlier detection. + + Returns: + Container DataFrame + 1 marks Outliers, 0 marks normal. + """ + return super().produce(inputs=inputs, timeout=timeout, iterations=iterations) + + def get_params(self) -> Params: + """ + Return parameters. + Args: + None + + Returns: + class Params + """ + return super().get_params() + + def set_params(self, *, params: Params) -> None: + """ + Set parameters for outlier detection. 
+ Args: + params: class Params + + Returns: + None + """ + super().set_params(params=params) + + diff --git a/tods/detection_algorithm/PyodLODA.py b/tods/detection_algorithm/PyodLODA.py new file mode 100644 index 0000000..b4f37a2 --- /dev/null +++ b/tods/detection_algorithm/PyodLODA.py @@ -0,0 +1,187 @@ +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os +import sklearn +import numpy +import typing + +# Custom import commands if any +import warnings +import numpy as np +from sklearn.utils import check_array +from sklearn.exceptions import NotFittedError +# from numba import njit +from pyod.utils.utility import argmaxn + +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult, DockerContainer + +# from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase +from d3m.primitive_interfaces.unsupervised_learning import UnsupervisedLearnerPrimitiveBase +from d3m.primitive_interfaces.transformer import TransformerPrimitiveBase + +from d3m.primitive_interfaces.base import ProbabilisticCompositionalityMixin, ContinueFitMixin +from d3m import exceptions +import pandas + +from d3m import container, utils as d3m_utils + +from detection_algorithm.UODBasePrimitive import Params_ODBase, Hyperparams_ODBase, UnsupervisedOutlierDetectorBase +from pyod.models.loda import LODA +import uuid +# from typing import Union + +Inputs = d3m_dataframe +Outputs = d3m_dataframe + + +class Params(Params_ODBase): + ######## Add more Attributes ####### + + pass + + +class Hyperparams(Hyperparams_ODBase): + ######## Add more Hyperparamters ####### + + n_bins = hyperparams.Hyperparameter[int]( + default=10, + description='The number of bins for the histogram.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + n_random_cuts = hyperparams.Hyperparameter[int]( + default=100, + description='The number of random cuts.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + pass + + +class LODAPrimitive(UnsupervisedOutlierDetectorBase[Inputs, Outputs, Params, Hyperparams]): + """ + Wrap of Pyod loda. Loda: Lightweight on-line detector of anomalies. See + :cite:`pevny2016loda` for more information. + + Parameters + ---------- + contamination : float in (0., 0.5), optional (default=0.1) + The amount of contamination of the data set, + i.e. the proportion of outliers in the data set. Used when fitting to + define the threshold on the decision function. + + n_bins : int, optional (default = 10) + The number of bins for the histogram. + + n_random_cuts : int, optional (default = 100) + The number of random cuts. + + Attributes + ---------- + decision_scores_ : numpy array of shape (n_samples,) + The outlier scores of the training data. + The higher, the more abnormal. Outliers tend to have higher + scores. This value is available once the detector is + fitted. + threshold_ : float + The threshold is based on ``contamination``. It is the + ``n_samples * contamination`` most abnormal samples in + ``decision_scores_``. The threshold is calculated for generating + binary outlier labels. 
+ labels_ : int, either 0 or 1 + The binary labels of the training data. 0 stands for inliers + and 1 for outliers/anomalies. It is generated by applying + ``threshold_`` on ``decision_scores_``. + """ + + metadata = metadata_base.PrimitiveMetadata({ + "name": "TODS.anomaly_detection_primitives.LODAPrimitive", + "python_path": "d3m.primitives.anomaly_detection.LODAPrimitive", + "python_path": "d3m.primitives.tods.detection_algorithm.pyod_loda", + "source": {'name': "DATALAB @Taxes A&M University", 'contact': 'mailto:khlai037@tamu.edu', + 'uris': ['https://gitlab.com/lhenry15/tods.git']}, + "algorithm_types": [metadata_base.PrimitiveAlgorithmType.LOCAL_OUTLIER_FACTOR, ], # Wrong + "primitive_family": metadata_base.PrimitiveFamily.ANOMALY_DETECTION, + "version": "0.0.1", + "hyperparams_to_tune": ['n_bins', 'n_random_cuts', 'contamination'], + "id": str(uuid.uuid3(uuid.NAMESPACE_DNS, 'LODAPrimitive')), + }) + + def __init__(self, *, + hyperparams: Hyperparams, # + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None) -> None: + super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) + + self._clf = LODA(contamination=hyperparams['contamination'], + n_bins=hyperparams['n_bins'], + n_random_cuts=hyperparams['n_random_cuts'], + ) + + return + + def set_training_data(self, *, inputs: Inputs) -> None: + """ + Set training data for outlier detection. + Args: + inputs: Container DataFrame + + Returns: + None + """ + super().set_training_data(inputs=inputs) + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + """ + Fit model with training data. + Args: + *: Container DataFrame. Time series data up to fit. + + Returns: + None + """ + return super().fit() + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + """ + Process the testing data. + Args: + inputs: Container DataFrame. Time series data up to outlier detection. + + Returns: + Container DataFrame + 1 marks Outliers, 0 marks normal. + """ + return super().produce(inputs=inputs, timeout=timeout, iterations=iterations) + + def get_params(self) -> Params: + """ + Return parameters. + Args: + None + + Returns: + class Params + """ + return super().get_params() + + def set_params(self, *, params: Params) -> None: + """ + Set parameters for outlier detection. 
+ Args: + params: class Params + + Returns: + None + """ + super().set_params(params=params) + + diff --git a/tods/detection_algorithm/PyodLOF.py b/tods/detection_algorithm/PyodLOF.py new file mode 100644 index 0000000..c2509e5 --- /dev/null +++ b/tods/detection_algorithm/PyodLOF.py @@ -0,0 +1,294 @@ +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os +import sklearn +import numpy +import typing + +# Custom import commands if any +import warnings +import numpy as np +from sklearn.utils import check_array +from sklearn.exceptions import NotFittedError +# from numba import njit +from pyod.utils.utility import argmaxn + +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult, DockerContainer + +# from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase +from d3m.primitive_interfaces.unsupervised_learning import UnsupervisedLearnerPrimitiveBase +from d3m.primitive_interfaces.transformer import TransformerPrimitiveBase + +from d3m.primitive_interfaces.base import ProbabilisticCompositionalityMixin, ContinueFitMixin +from d3m import exceptions +import pandas + +from d3m import container, utils as d3m_utils + +from detection_algorithm.UODBasePrimitive import Params_ODBase, Hyperparams_ODBase, UnsupervisedOutlierDetectorBase +from pyod.models.lof import LOF +import uuid +# from typing import Union + +Inputs = d3m_dataframe +Outputs = d3m_dataframe + + +class Params(Params_ODBase): + ######## Add more Attributes ####### + + pass + + +class Hyperparams(Hyperparams_ODBase): + ######## Add more Hyperparamters ####### + + n_neighbors = hyperparams.Hyperparameter[int]( + default=20, + description='Number of neighbors to use by default for `kneighbors` queries. If n_neighbors is larger than the number of samples provided, all samples will be used.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + algorithm = hyperparams.Enumeration[str]( + values=['auto', 'ball_tree', 'kd_tree', 'brute'], + default='auto', + description='Algorithm used to compute the nearest neighbors.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + leaf_size = hyperparams.Hyperparameter[int]( + default=30, + description='Leaf size passed to `BallTree` or `KDTree`. This can affect the speed of the construction and query, as well as the memory required to store the tree. The optimal value depends on the nature of the problem.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + + metric = hyperparams.Enumeration[str]( + values=['cityblock', 'cosine', 'euclidean', 'l1', 'l2', + 'manhattan', 'braycurtis', 'canberra', 'chebyshev', + 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', + 'mahalanobis', 'matching', 'minkowski', 'rogerstanimoto', + 'russellrao', 'seuclidean', 'sokalmichener', 'sokalsneath', + 'sqeuclidean', 'yule'], + default='minkowski', + description='metric used for the distance computation. 
Any metric from scikit-learn or scipy.spatial.distance can be used.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + + p = hyperparams.Hyperparameter[int]( + default=2, + description='Parameter for the Minkowski metric from.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + metric_params = hyperparams.Union[Union[Dict, None]]( + configuration=OrderedDict( + init=hyperparams.Hyperparameter[Dict]( + default={}, + ), + ninit=hyperparams.Hyperparameter[None]( + default=None, + ), + ), + default='ninit', + description='Additional keyword arguments for the metric function.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + ) + + pass + + +class LOFPrimitive(UnsupervisedOutlierDetectorBase[Inputs, Outputs, Params, Hyperparams]): + """ + Wrapper of Pyod LOF Class with more functionalities. + Unsupervised Outlier Detection using Local Outlier Factor (LOF). + The anomaly score of each sample is called Local Outlier Factor. + It measures the local deviation of density of a given sample with + respect to its neighbors. + It is local in that the anomaly score depends on how isolated the object + is with respect to the surrounding neighborhood. + More precisely, locality is given by k-nearest neighbors, whose distance + is used to estimate the local density. + By comparing the local density of a sample to the local densities of + its neighbors, one can identify samples that have a substantially lower + density than their neighbors. These are considered outliers. + See :cite:`breunig2000lof` for details. + + Parameters + ---------- + n_neighbors : int, optional (default=20) + Number of neighbors to use by default for `kneighbors` queries. + If n_neighbors is larger than the number of samples provided, + all samples will be used. + + algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional + Algorithm used to compute the nearest neighbors: + - 'ball_tree' will use BallTree + - 'kd_tree' will use KDTree + - 'brute' will use a brute-force search. + - 'auto' will attempt to decide the most appropriate algorithm + based on the values passed to :meth:`fit` method. + Note: fitting on sparse input will override the setting of + this parameter, using brute force. + + leaf_size : int, optional (default=30) + Leaf size passed to `BallTree` or `KDTree`. This can + affect the speed of the construction and query, as well as the memory + required to store the tree. The optimal value depends on the + nature of the problem. + + metric : string or callable, default 'minkowski' + metric used for the distance computation. Any metric from scikit-learn + or scipy.spatial.distance can be used. + If 'precomputed', the training input X is expected to be a distance + matrix. + If metric is a callable function, it is called on each + pair of instances (rows) and the resulting value recorded. The callable + should take two arrays as input and return one value indicating the + distance between them. This works for Scipy's metrics, but is less + efficient than passing the metric name as a string. 
+ Valid values for metric are: + - from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', + 'manhattan'] + - from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev', + 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', + 'mahalanobis', 'matching', 'minkowski', 'rogerstanimoto', + 'russellrao', 'seuclidean', 'sokalmichener', 'sokalsneath', + 'sqeuclidean', 'yule'] + See the documentation for scipy.spatial.distance for details on these + metrics: + http://docs.scipy.org/doc/scipy/reference/spatial.distance.html + + p : integer, optional (default = 2) + Parameter for the Minkowski metric from + sklearn.metrics.pairwise.pairwise_distances. When p = 1, this is + equivalent to using manhattan_distance (l1), and euclidean_distance + (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used. + See http://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.pairwise_distances + + metric_params : dict, optional (default = None) + Additional keyword arguments for the metric function. + + contamination : float in (0., 0.5), optional (default=0.1) + The amount of contamination of the data set, i.e. the proportion + of outliers in the data set. When fitting this is used to define the + threshold on the decision function. + + n_jobs : int, optional (default = 1) + The number of parallel jobs to run for neighbors search. + If ``-1``, then the number of jobs is set to the number of CPU cores. + Affects only kneighbors and kneighbors_graph methods. + + Attributes + ---------- + decision_scores_ : numpy array of shape (n_samples,) + The outlier scores of the training data. + The higher, the more abnormal. Outliers tend to have higher + scores. This value is available once the detector is + fitted. + threshold_ : float + The threshold is based on ``contamination``. It is the + ``n_samples * contamination`` most abnormal samples in + ``decision_scores_``. The threshold is calculated for generating + binary outlier labels. + labels_ : int, either 0 or 1 + The binary labels of the training data. 0 stands for inliers + and 1 for outliers/anomalies. It is generated by applying + ``threshold_`` on ``decision_scores_``. + """ + + metadata = metadata_base.PrimitiveMetadata({ + "name": "TODS.anomaly_detection_primitives.LOFPrimitive", + "python_path": "d3m.primitives.tods.detection_algorithm.pyod_lof", + "source": {'name': "DATALAB @Taxes A&M University", 'contact': 'mailto:khlai037@tamu.edu', + 'uris': ['https://gitlab.com/lhenry15/tods.git']}, + "algorithm_types": [metadata_base.PrimitiveAlgorithmType.LOCAL_OUTLIER_FACTOR, ], + "primitive_family": metadata_base.PrimitiveFamily.ANOMALY_DETECTION, + "version": "0.0.1", + "hyperparams_to_tune": ['n_neighbors', 'algorithm', 'leaf_size', 'p', 'contamination'], + "id": str(uuid.uuid3(uuid.NAMESPACE_DNS, 'LOFPrimitive')), + }) + + def __init__(self, *, + hyperparams: Hyperparams, # + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None) -> None: + super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) + + self._clf = LOF(contamination=hyperparams['contamination'], + n_neighbors=hyperparams['n_neighbors'], + algorithm=hyperparams['algorithm'], + leaf_size=hyperparams['leaf_size'], + metric=hyperparams['metric'], + p=hyperparams['p'], + metric_params=hyperparams['metric_params'], + ) + + return + + def set_training_data(self, *, inputs: Inputs) -> None: + """ + Set training data for outlier detection. 
+ Args: + inputs: Container DataFrame + + Returns: + None + """ + super().set_training_data(inputs=inputs) + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + """ + Fit model with training data. + Args: + *: Container DataFrame. Time series data up to fit. + + Returns: + None + """ + return super().fit() + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + """ + Process the testing data. + Args: + inputs: Container DataFrame. Time series data up to outlier detection. + + Returns: + Container DataFrame + 1 marks Outliers, 0 marks normal. + """ + return super().produce(inputs=inputs, timeout=timeout, iterations=iterations) + + def get_params(self) -> Params: + """ + Return parameters. + Args: + None + + Returns: + class Params + """ + return super().get_params() + + def set_params(self, *, params: Params) -> None: + """ + Set parameters for outlier detection. + Args: + params: class Params + + Returns: + None + """ + super().set_params(params=params) + + diff --git a/tods/detection_algorithm/PyodMoGaal.py b/tods/detection_algorithm/PyodMoGaal.py new file mode 100755 index 0000000..fc8d286 --- /dev/null +++ b/tods/detection_algorithm/PyodMoGaal.py @@ -0,0 +1,273 @@ +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os +import sklearn +import numpy +import typing + +# Custom import commands if any +import warnings +import numpy as np +from sklearn.utils import check_array +from sklearn.exceptions import NotFittedError +# from numba import njit +from pyod.utils.utility import argmaxn + +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult, DockerContainer + +# from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase +from d3m.primitive_interfaces.unsupervised_learning import UnsupervisedLearnerPrimitiveBase +from d3m.primitive_interfaces.transformer import TransformerPrimitiveBase + +from d3m.primitive_interfaces.base import ProbabilisticCompositionalityMixin, ContinueFitMixin +from d3m import exceptions +import pandas + +from d3m import container, utils as d3m_utils + +from detection_algorithm.UODBasePrimitive import Params_ODBase, Hyperparams_ODBase, UnsupervisedOutlierDetectorBase +from pyod.models.mo_gaal import MO_GAAL +# from typing import Union + +Inputs = d3m_dataframe +Outputs = d3m_dataframe + + +class Params(Params_ODBase): + ######## Add more Attributes ####### + + pass + + +class Hyperparams(Hyperparams_ODBase): + ######## Add more Hyperparamters ####### + + + + + stop_epochs = hyperparams.Hyperparameter[int]( + default=5, + description='Number of epochs to train the model.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + lr_d = hyperparams.Uniform( + lower=0., + upper=1., + default=0.01, + description='The learn rate of the discriminator. 
', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + k = hyperparams.Uniform( + lower=0, + upper=100, + default=1, + description='The number of sub generators ', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + lr_g = hyperparams.Uniform( + lower=0., + upper=1., + default=0.0001, + description='The learn rate of the generator.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + decay = hyperparams.Uniform( + lower=0., + upper=1., + default=1e-6, + description='The decay parameter for SGD', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + momentum = hyperparams.Uniform( + lower=0., + upper=1., + default=0.9, + description='The momentum parameter for SGD', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + contamination = hyperparams.Uniform( + lower=0., + upper=0.5, + default=0.1, + description='the amount of contamination of the data set, i.e.the proportion of outliers in the data set. Used when fitting to define the threshold on the decision function', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + + random_state = hyperparams.Union[Union[int, None]]( + configuration=OrderedDict( + init=hyperparams.Hyperparameter[int]( + default=0, + ), + ninit=hyperparams.Hyperparameter[None]( + default=None, + ), + ), + default='ninit', + description='the seed used by the random number generator.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + ) + + + + +class Mo_GaalPrimitive(UnsupervisedOutlierDetectorBase[Inputs, Outputs, Params, Hyperparams]): + """Multi-Objective Generative Adversarial Active Learning. + MO_GAAL directly generates informative potential outliers to assist the + classifier in describing a boundary that can separate outliers from normal + data effectively. Moreover, to prevent the generator from falling into the + mode collapsing problem, the network structure of SO-GAAL is expanded from + a single generator (SO-GAAL) to multiple generators with different + objectives (MO-GAAL) to generate a reasonable reference distribution for + the whole dataset. + Read more in the :cite:`liu2019generative`. + Parameters + ---------- + contamination : float in (0., 0.5), optional (default=0.1) + The amount of contamination of the data set, i.e. + the proportion of outliers in the data set. Used when fitting to + define the threshold on the decision function. + k : int, optional (default=10) + The number of sub generators. + stop_epochs : int, optional (default=20) + The number of epochs of training. + lr_d : float, optional (default=0.01) + The learn rate of the discriminator. + lr_g : float, optional (default=0.0001) + The learn rate of the generator. + decay : float, optional (default=1e-6) + The decay parameter for SGD. + momentum : float, optional (default=0.9) + The momentum parameter for SGD. + Attributes + ---------- + decision_scores_ : numpy array of shape (n_samples,) + The outlier scores of the training data. + The higher, the more abnormal. Outliers tend to have higher + scores. This value is available once the detector is fitted. + threshold_ : float + The threshold is based on ``contamination``. It is the + ``n_samples * contamination`` most abnormal samples in + ``decision_scores_``. The threshold is calculated for generating + binary outlier labels. 
+ labels_ : int, either 0 or 1 + The binary labels of the training data. 0 stands for inliers + and 1 for outliers/anomalies. It is generated by applying + ``threshold_`` on ``decision_scores_``. + """ + __author__ = "DATA Lab at Texas A&M University", + metadata = metadata_base.PrimitiveMetadata( + { + 'id': '906b96ea-f260-4ede-8f55-c26d1367eb32', + 'version': '0.1.0', + 'name': 'Mo_Gaal Anomaly Detection', + 'python_path': 'd3m.primitives.tods.detection_algorithm.pyod_mogaal', + 'keywords': ['Time Series', 'GAN'], + "hyperparams_to_tune": ['stop_epochs','lr_d','lr_g','decay','momentum','k'], + 'source': { + 'name': 'DATA Lab at Texas A&M University', + 'uris': ['https://gitlab.com/lhenry15/tods.git', + 'https://gitlab.com/lhenry15/tods/-/blob/devesh/tods/detection_algorithm/PyodMoGaal.py'], + 'contact': 'mailto:khlai037@tamu.edu' + + }, + 'installation': [ + {'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/lhenry15/tods.git@{git_commit}#egg=TODS'.format( + git_commit=d3m_utils.current_git_commit(os.path.dirname(__file__)), + ), + } + + ], + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.DATA_PROFILING, + ], + 'primitive_family': metadata_base.PrimitiveFamily.FEATURE_CONSTRUCTION, + + } + ) + + def __init__(self, *, + hyperparams: Hyperparams, # + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None) -> None: + super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) + + self._clf = MO_GAAL(stop_epochs=hyperparams['stop_epochs'], + k=hyperparams['k'], + lr_d=hyperparams['lr_d'], + lr_g=hyperparams['lr_g'], + decay=hyperparams['decay'], + momentum=hyperparams['momentum'], + contamination=hyperparams['contamination'], + + ) + + return + + def set_training_data(self, *, inputs: Inputs) -> None: + """ + Set training data for outlier detection. + Args: + inputs: Container DataFrame + + Returns: + None + """ + super().set_training_data(inputs=inputs) + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + """ + Fit model with training data. + Args: + *: Container DataFrame. Time series data up to fit. + + Returns: + None + """ + return super().fit() + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + """ + Process the testing data. + Args: + inputs: Container DataFrame. Time series data up to outlier detection. + + Returns: + Container DataFrame + 1 marks Outliers, 0 marks normal. + """ + return super().produce(inputs=inputs, timeout=timeout, iterations=iterations) + + def get_params(self) -> Params: + """ + Return parameters. + Args: + None + + Returns: + class Params + """ + return super().get_params() + + def set_params(self, *, params: Params) -> None: + """ + Set parameters for outlier detection. 
+ Args: + params: class Params + + Returns: + None + """ + super().set_params(params=params) diff --git a/tods/detection_algorithm/PyodOCSVM.py b/tods/detection_algorithm/PyodOCSVM.py new file mode 100644 index 0000000..8a09262 --- /dev/null +++ b/tods/detection_algorithm/PyodOCSVM.py @@ -0,0 +1,289 @@ +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os +import sklearn +import numpy +import typing + +# Custom import commands if any +import warnings +import numpy as np +from sklearn.utils import check_array +from sklearn.exceptions import NotFittedError +# from numba import njit +from pyod.utils.utility import argmaxn + +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult, DockerContainer + +# from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase +from d3m.primitive_interfaces.unsupervised_learning import UnsupervisedLearnerPrimitiveBase +from d3m.primitive_interfaces.transformer import TransformerPrimitiveBase + +from d3m.primitive_interfaces.base import ProbabilisticCompositionalityMixin, ContinueFitMixin +from d3m import exceptions +import pandas + +from d3m import container, utils as d3m_utils + +from detection_algorithm.UODBasePrimitive import Params_ODBase, Hyperparams_ODBase, UnsupervisedOutlierDetectorBase +from pyod.models.ocsvm import OCSVM +from typing import Union +import uuid + +Inputs = d3m_dataframe +Outputs = d3m_dataframe + + +class Params(Params_ODBase): + ######## Add more Attributes ####### + + pass + + +class Hyperparams(Hyperparams_ODBase): + ######## Add more Hyperparamters ####### + + kernel = hyperparams.Enumeration[str]( + values=['linear', 'poly', 'rbf', 'sigmoid', 'precomputed'], + default='rbf', + description='Specifies the kernel type to be used in the algorithm.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + nu = hyperparams.Uniform( + lower=0., + upper=1., + default=0.5, + description='An upper bound on the fraction of training errors and a lower bound of the fraction of support vectors.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + degree = hyperparams.Hyperparameter[int]( + default=3, + description='Degree of the polynomial kernel function (poly). Ignored by all other kernels.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + gamma = hyperparams.Union[Union[float, str]]( + configuration=OrderedDict( + init=hyperparams.Hyperparameter[float]( + default=0., + ), + ninit=hyperparams.Hyperparameter[str]( + default='auto', + ), + ), + default='ninit', + description='Kernel coefficient for rbf, poly and sigmoid. If gamma is auto then 1/n_features will be used instead.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + ) + + coef0 = hyperparams.Hyperparameter[float]( + default=0., + description='Independent term in kernel function. 
It is only significant in poly and sigmoid.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + tol = hyperparams.Hyperparameter[float]( + default=0.001, + description='Tolerance for stopping criterion.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + shrinking = hyperparams.UniformBool( + default=True, + description='Whether to use the shrinking heuristic.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + cache_size = hyperparams.Hyperparameter[int]( + default=200, + description='Specify the size of the kernel cache (in MB).', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + verbose = hyperparams.UniformBool( + default=False, + description='Enable verbose output. Note that this setting takes advantage of a per-process runtime setting in libsvm that, if enabled, may not work properly in a multithreaded context.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + max_iter = hyperparams.Hyperparameter[int]( + default=-1, + description='Hard limit on iterations within solver, or -1 for no limit.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + pass + + +class OCSVMPrimitive(UnsupervisedOutlierDetectorBase[Inputs, Outputs, Params, Hyperparams]): + """ + Wrapper of scikit-learn one-class SVM Class with more functionalities. + Unsupervised Outlier Detection. + Estimate the support of a high-dimensional distribution. + The implementation is based on libsvm. + See http://scikit-learn.org/stable/modules/svm.html#svm-outlier-detection + and :cite:`scholkopf2001estimating`. + + Parameters + ---------- + kernel : string, optional (default='rbf') + Specifies the kernel type to be used in the algorithm. + It must be one of 'linear', 'poly', 'rbf', 'sigmoid', 'precomputed' or + a callable. + If none is given, 'rbf' will be used. If a callable is given it is + used to precompute the kernel matrix. + + nu : float, optional + An upper bound on the fraction of training + errors and a lower bound of the fraction of support + vectors. Should be in the interval (0, 1]. By default 0.5 + will be taken. + + degree : int, optional (default=3) + Degree of the polynomial kernel function ('poly'). + Ignored by all other kernels. + + gamma : float, optional (default='auto') + Kernel coefficient for 'rbf', 'poly' and 'sigmoid'. + If gamma is 'auto' then 1/n_features will be used instead. + + coef0 : float, optional (default=0.0) + Independent term in kernel function. + It is only significant in 'poly' and 'sigmoid'. + + tol : float, optional + Tolerance for stopping criterion. + + shrinking : bool, optional + Whether to use the shrinking heuristic. + + cache_size : float, optional + Specify the size of the kernel cache (in MB). + + verbose : bool, default: False + Enable verbose output. Note that this setting takes advantage of a + per-process runtime setting in libsvm that, if enabled, may not work + properly in a multithreaded context. + + max_iter : int, optional (default=-1) + Hard limit on iterations within solver, or -1 for no limit. + + Attributes + ---------- + decision_scores_ : numpy array of shape (n_samples,) + The outlier scores of the training data. + The higher, the more abnormal. Outliers tend to have higher + scores. This value is available once the detector is fitted. + + threshold_ : float + The threshold is based on ``contamination``. 
It is the + ``n_samples * contamination`` most abnormal samples in + ``decision_scores_``. The threshold is calculated for generating + binary outlier labels. + + labels_ : int, either 0 or 1 + The binary labels of the training data. 0 stands for inliers + and 1 for outliers/anomalies. It is generated by applying + ``threshold_`` on ``decision_scores_``. + """ + + metadata = metadata_base.PrimitiveMetadata({ + "name": "TODS.anomaly_detection_primitives.OCSVMPrimitive", + "python_path": "d3m.primitives.tods.detection_algorithm.pyod_ocsvm", + "source": {'name': "DATALAB @Taxes A&M University", 'contact': 'mailto:khlai037@tamu.edu', + 'uris': ['https://gitlab.com/lhenry15/tods.git']}, + "algorithm_types": [metadata_base.PrimitiveAlgorithmType.MARGIN_CLASSIFIER, ], + "primitive_family": metadata_base.PrimitiveFamily.ANOMALY_DETECTION, + "version": "0.0.1", + "hyperparams_to_tune": ['contamination', 'kernel', 'nu', 'gamma', 'degree'], + "id": str(uuid.uuid3(uuid.NAMESPACE_DNS, 'OCSVMPrimitive')) + }) + + def __init__(self, *, + hyperparams: Hyperparams, # + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None) -> None: + super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) + + self._clf = OCSVM(contamination=hyperparams['contamination'], + kernel=hyperparams['kernel'], + nu=hyperparams['nu'], + degree=hyperparams['degree'], + gamma=hyperparams['gamma'], + coef0=hyperparams['coef0'], + tol=hyperparams['tol'], + shrinking=hyperparams['shrinking'], + cache_size=hyperparams['cache_size'], + verbose=hyperparams['verbose'], + max_iter=hyperparams['max_iter'], + ) + + return + + def set_training_data(self, *, inputs: Inputs) -> None: + """ + Set training data for outlier detection. + Args: + inputs: Container DataFrame + + Returns: + None + """ + super().set_training_data(inputs=inputs) + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + """ + Fit model with training data. + Args: + *: Container DataFrame. Time series data up to fit. + + Returns: + None + """ + return super().fit() + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + """ + Process the testing data. + Args: + inputs: Container DataFrame. Time series data up to outlier detection. + + Returns: + Container DataFrame + 1 marks Outliers, 0 marks normal. + """ + return super().produce(inputs=inputs, timeout=timeout, iterations=iterations) + + def get_params(self) -> Params: + """ + Return parameters. + Args: + None + + Returns: + class Params + """ + return super().get_params() + + def set_params(self, *, params: Params) -> None: + """ + Set parameters for outlier detection. 
+ Args: + params: class Params + + Returns: + None + """ + super().set_params(params=params) + + diff --git a/tods/detection_algorithm/PyodSOD.py b/tods/detection_algorithm/PyodSOD.py new file mode 100644 index 0000000..f8082ca --- /dev/null +++ b/tods/detection_algorithm/PyodSOD.py @@ -0,0 +1,196 @@ +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os +import sklearn +import numpy +import typing + +# Custom import commands if any +import warnings +import numpy as np +from sklearn.utils import check_array +from sklearn.exceptions import NotFittedError +# from numba import njit +from pyod.utils.utility import argmaxn + +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult, DockerContainer + +# from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase +from d3m.primitive_interfaces.unsupervised_learning import UnsupervisedLearnerPrimitiveBase +from d3m.primitive_interfaces.transformer import TransformerPrimitiveBase + +from d3m.primitive_interfaces.base import ProbabilisticCompositionalityMixin, ContinueFitMixin +from d3m import exceptions +import pandas +import uuid + +from d3m import container, utils as d3m_utils + +from detection_algorithm.UODBasePrimitive import Params_ODBase, Hyperparams_ODBase, UnsupervisedOutlierDetectorBase +from pyod.models.sod import SOD +# from typing import Union + +Inputs = d3m_dataframe +Outputs = d3m_dataframe + + + +class Params(Params_ODBase): + ######## Add more Attributes ####### + + pass + + +class Hyperparams(Hyperparams_ODBase): + ######## Add more Hyperparamters ####### + + n_neighbors = hyperparams.Hyperparameter[int]( + default=20, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + description="Number of neighbors to use by default for k neighbors queries.", + ) + + ref_set = hyperparams.Hyperparameter[int]( + default=10, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + description="specifies the number of shared nearest neighbors to create the reference set. Note that ref_set must be smaller than n_neighbors.", + ) + + alpha = hyperparams.Hyperparameter[float]( + default=0.8, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + description="specifies the lower limit for selecting subspace. 0.8 is set as default as suggested in the original paper.", + ) + + +class SODPrimitive(UnsupervisedOutlierDetectorBase[Inputs, Outputs, Params, Hyperparams]): + """ + Subspace outlier detection (SOD) schema aims to detect outlier in + varying subspaces of a high dimensional feature space. For each data + object, SOD explores the axis-parallel subspace spanned by the data + object's neighbors and determines how much the object deviates from the + neighbors in this subspace. + See :cite:`kriegel2009outlier` for details. + + Parameters + ---------- + n_neighbors : int, optional (default=20) + Number of neighbors to use by default for k neighbors queries. + ref_set: int, optional (default=10) + specifies the number of shared nearest neighbors to create the + reference set. 
Note that ref_set must be smaller than n_neighbors. + alpha: float in (0., 1.), optional (default=0.8) + specifies the lower limit for selecting subspace. + 0.8 is set as default as suggested in the original paper. + contamination : float in (0., 0.5), optional (default=0.1) + The amount of contamination of the data set, i.e. + the proportion of outliers in the data set. Used when fitting to + define the threshold on the decision function. + + Attributes + ---------- + decision_scores_ : numpy array of shape (n_samples,) + The outlier scores of the training data. + The higher, the more abnormal. Outliers tend to have higher + scores. This value is available once the detector is + fitted. + threshold_ : float + The threshold is based on ``contamination``. It is the + ``n_samples * contamination`` most abnormal samples in + ``decision_scores_``. The threshold is calculated for generating + binary outlier labels. + labels_ : int, either 0 or 1 + The binary labels of the training data. 0 stands for inliers + and 1 for outliers/anomalies. It is generated by applying + ``threshold_`` on ``decision_scores_``. + """ + + metadata = metadata_base.PrimitiveMetadata({ + "__author__": "DATA Lab at Texas A&M University", + "name": "Subspace Outlier Detection Primitive", + "python_path": "d3m.primitives.tods.detection_algorithm.pyod_sod", + "source": {'name': 'DATA Lab at Texas A&M University', 'contact': 'mailto:khlai037@tamu.edu', + 'uris': ['https://gitlab.com/lhenry15/tods.git', 'https://gitlab.com/lhenry15/tods/-/blob/Yile/anomaly-primitives/anomaly_primitives/PyodSOD.py']}, + "algorithm_types": [metadata_base.PrimitiveAlgorithmType.SUBSPACE_OUTLIER_DETECTION], + "primitive_family": metadata_base.PrimitiveFamily.ANOMALY_DETECTION, + "id": str(uuid.uuid3(uuid.NAMESPACE_DNS, 'SODPrimitive')), + "hyperparams_to_tune": ['contamination', 'n_neighbors', 'ref_set', 'alpha'], + "version": "0.0.1", + }) + + + def __init__(self, *, + hyperparams: Hyperparams, # + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None) -> None: + super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) + + self._clf = SOD(contamination=hyperparams['contamination'], + n_neighbors=hyperparams['n_neighbors'], + ref_set=hyperparams['ref_set'], + alpha=hyperparams['alpha'], + ) + + def set_training_data(self, *, inputs: Inputs) -> None: + """ + Set training data for outlier detection. + Args: + inputs: Container DataFrame + + Returns: + None + """ + super().set_training_data(inputs=inputs) + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + """ + Fit model with training data. + Args: + *: Container DataFrame. Time series data up to fit. + + Returns: + None + """ + return super().fit() + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + """ + Process the testing data. + Args: + inputs: Container DataFrame. Time series data up to outlier detection. + + Returns: + Container DataFrame + 1 marks Outliers, 0 marks normal. + """ + return super().produce(inputs=inputs, timeout=timeout, iterations=iterations) + + def get_params(self) -> Params: + """ + Return parameters. + Args: + None + + Returns: + class Params + """ + return super().get_params() + + def set_params(self, *, params: Params) -> None: + """ + Set parameters for outlier detection. 
+ Args: + params: class Params + + Returns: + None + """ + super().set_params(params=params) diff --git a/tods/detection_algorithm/PyodSoGaal.py b/tods/detection_algorithm/PyodSoGaal.py new file mode 100644 index 0000000..6113960 --- /dev/null +++ b/tods/detection_algorithm/PyodSoGaal.py @@ -0,0 +1,263 @@ +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os +import sklearn +import numpy +import typing + +# Custom import commands if any +import warnings +import numpy as np +from sklearn.utils import check_array +from sklearn.exceptions import NotFittedError +# from numba import njit +from pyod.utils.utility import argmaxn + +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult, DockerContainer + +# from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase +from d3m.primitive_interfaces.unsupervised_learning import UnsupervisedLearnerPrimitiveBase +from d3m.primitive_interfaces.transformer import TransformerPrimitiveBase + +from d3m.primitive_interfaces.base import ProbabilisticCompositionalityMixin, ContinueFitMixin +from d3m import exceptions +import pandas + +from d3m import container, utils as d3m_utils + +from detection_algorithm.UODBasePrimitive import Params_ODBase, Hyperparams_ODBase, UnsupervisedOutlierDetectorBase +from pyod.models.so_gaal import SO_GAAL +# from typing import Union + +Inputs = d3m_dataframe +Outputs = d3m_dataframe + + +class Params(Params_ODBase): + ######## Add more Attributes ####### + + pass + + +class Hyperparams(Hyperparams_ODBase): + ######## Add more Hyperparamters ####### + + + + + stop_epochs = hyperparams.Hyperparameter[int]( + default=20, + description='Number of epochs to train the model.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + lr_d = hyperparams.Uniform( + lower=0., + upper=1., + default=0.01, + description='The learn rate of the discriminator. ', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + lr_g = hyperparams.Uniform( + lower=0., + upper=1., + default=0.0001, + description='The learn rate of the generator.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + decay = hyperparams.Uniform( + lower=0., + upper=1., + default=1e-6, + description='The decay parameter for SGD', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + momentum = hyperparams.Uniform( + lower=0., + upper=1., + default=0.9, + description='The momentum parameter for SGD', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + contamination = hyperparams.Uniform( + lower=0., + upper=0.5, + default=0.1, + description='the amount of contamination of the data set, i.e.the proportion of outliers in the data set. 
Used when fitting to define the threshold on the decision function', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + + random_state = hyperparams.Union[Union[int, None]]( + configuration=OrderedDict( + init=hyperparams.Hyperparameter[int]( + default=0, + ), + ninit=hyperparams.Hyperparameter[None]( + default=None, + ), + ), + default='ninit', + description='the seed used by the random number generator.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + ) + + + + +class So_GaalPrimitive(UnsupervisedOutlierDetectorBase[Inputs, Outputs, Params, Hyperparams]): + """Single-Objective Generative Adversarial Active Learning. + SO-GAAL directly generates informative potential outliers to assist the + classifier in describing a boundary that can separate outliers from normal + data effectively. Moreover, to prevent the generator from falling into the + mode collapsing problem, the network structure of SO-GAAL is expanded from + a single generator (SO-GAAL) to multiple generators with different + objectives (MO-GAAL) to generate a reasonable reference distribution for + the whole dataset. + Read more in the :cite:`liu2019generative`. + Parameters + ---------- + contamination : float in (0., 0.5), optional (default=0.1) + The amount of contamination of the data set, i.e. + the proportion of outliers in the data set. Used when fitting to + define the threshold on the decision function. + stop_epochs : int, optional (default=20) + The number of epochs of training. + lr_d : float, optional (default=0.01) + The learn rate of the discriminator. + lr_g : float, optional (default=0.0001) + The learn rate of the generator. + decay : float, optional (default=1e-6) + The decay parameter for SGD. + momentum : float, optional (default=0.9) + The momentum parameter for SGD. + Attributes + ---------- + decision_scores_ : numpy array of shape (n_samples,) + The outlier scores of the training data. + The higher, the more abnormal. Outliers tend to have higher + scores. This value is available once the detector is fitted. + threshold_ : float + The threshold is based on ``contamination``. It is the + ``n_samples * contamination`` most abnormal samples in + ``decision_scores_``. The threshold is calculated for generating + binary outlier labels. + labels_ : int, either 0 or 1 + The binary labels of the training data. 0 stands for inliers + and 1 for outliers/anomalies. It is generated by applying + ``threshold_`` on ``decision_scores_``. 
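+
+    Examples
+    --------
+    A minimal, illustrative sketch only: it calls the underlying
+    ``pyod.models.so_gaal.SO_GAAL`` model that this primitive wraps on a
+    tiny made-up data set; the settings below are chosen to keep the
+    example fast, not as recommended values::
+
+        import numpy as np
+        from pyod.models.so_gaal import SO_GAAL
+
+        X = np.random.randn(200, 2)            # toy feature matrix
+        clf = SO_GAAL(stop_epochs=2, lr_d=0.01, lr_g=0.0001, contamination=0.1)
+        clf.fit(X)
+        print(clf.labels_[:10])                # 1 = outlier, 0 = inlier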
+ """ + + __author__ = "DATA Lab at Texas A&M University", + metadata = metadata_base.PrimitiveMetadata( + { + 'id': '56e6cfe9-d9e9-495f-83da-cfed6fa27da1', + 'version': '0.1.0', + 'name': 'So_Gaal Anomaly Detection', + 'python_path': 'd3m.primitives.tods.detection_algorithm.pyod_sogaal', + 'keywords': ['Time Series', 'GAN'], + "hyperparams_to_tune": ['stop_epochs','lr_d','lr_g','decay','momentum'], + 'source': { + 'name': 'DATA Lab at Texas A&M University', + 'uris': ['https://gitlab.com/lhenry15/tods.git', + 'https://gitlab.com/lhenry15/tods/-/blob/devesh/tods/detection_algorithm/PyodSoGaal.py'], + 'contact': 'mailto:khlai037@tamu.edu' + + }, + 'installation': [ + {'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/lhenry15/tods.git@{git_commit}#egg=TODS'.format( + git_commit=d3m_utils.current_git_commit(os.path.dirname(__file__)), + ), + } + + ], + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.DATA_PROFILING, + ], + 'primitive_family': metadata_base.PrimitiveFamily.FEATURE_CONSTRUCTION, + + } + ) + + def __init__(self, *, + hyperparams: Hyperparams, # + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None) -> None: + super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) + + self._clf = SO_GAAL(stop_epochs=hyperparams['stop_epochs'], + lr_d=hyperparams['lr_d'], + lr_g=hyperparams['lr_g'], + decay=hyperparams['decay'], + momentum=hyperparams['momentum'], + contamination=hyperparams['contamination'], + + ) + + return + + def set_training_data(self, *, inputs: Inputs) -> None: + """ + Set training data for outlier detection. + Args: + inputs: Container DataFrame + + Returns: + None + """ + super().set_training_data(inputs=inputs) + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + """ + Fit model with training data. + Args: + *: Container DataFrame. Time series data up to fit. + + Returns: + None + """ + return super().fit() + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + """ + Process the testing data. + Args: + inputs: Container DataFrame. Time series data up to outlier detection. + + Returns: + Container DataFrame + 1 marks Outliers, 0 marks normal. + """ + return super().produce(inputs=inputs, timeout=timeout, iterations=iterations) + + def get_params(self) -> Params: + """ + Return parameters. + Args: + None + + Returns: + class Params + """ + return super().get_params() + + def set_params(self, *, params: Params) -> None: + """ + Set parameters for outlier detection. 
+ Args: + params: class Params + + Returns: + None + """ + super().set_params(params=params) diff --git a/tods/detection_algorithm/PyodVAE.py b/tods/detection_algorithm/PyodVAE.py new file mode 100644 index 0000000..701c1e8 --- /dev/null +++ b/tods/detection_algorithm/PyodVAE.py @@ -0,0 +1,389 @@ +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os +import sklearn +import numpy +import typing +import tensorflow +from tensorflow.keras.losses import mean_squared_error +from tensorflow import keras +from tensorflow.keras import losses,layers +# Custom import commands if any +import warnings +import numpy as np +from sklearn.utils import check_array +from sklearn.exceptions import NotFittedError +# from numba import njit +from pyod.utils.utility import argmaxn + +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult, DockerContainer + +# from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase +from d3m.primitive_interfaces.unsupervised_learning import UnsupervisedLearnerPrimitiveBase +from d3m.primitive_interfaces.transformer import TransformerPrimitiveBase + +from d3m.primitive_interfaces.base import ProbabilisticCompositionalityMixin, ContinueFitMixin +from d3m import exceptions +import pandas + +from d3m import container, utils as d3m_utils + +from detection_algorithm.UODBasePrimitive import Params_ODBase, Hyperparams_ODBase, UnsupervisedOutlierDetectorBase +from pyod.models.vae import VAE + +import uuid +# from typing import Union + +Inputs = d3m_dataframe +Outputs = d3m_dataframe + + +class Params(Params_ODBase): + ######## Add more Attributes ####### + + pass + + +class Hyperparams(Hyperparams_ODBase): + ######## Add more Hyperparamters ####### + + encoder_neurons = hyperparams.List( + default=[4, 2, 4], + elements=hyperparams.Hyperparameter[int](1), + description='The number of neurons per hidden layers in encoder.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + decoder_neurons = hyperparams.List( + default=[4, 4, 4], + elements=hyperparams.Hyperparameter[int](1), + description='The number of neurons per hidden layers in decoder.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + hidden_activation = hyperparams.Enumeration[str]( + values=['relu', 'sigmoid', 'softmax', 'softplus', 'softsign', + 'tanh', 'selu', 'elu', 'exponential'], + default='relu', + description='Activation function to use for hidden layers.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + output_activation = hyperparams.Enumeration[str]( + values=['relu', 'sigmoid', 'softmax', 'softplus', 'softsign', + 'tanh', 'selu', 'elu', 'exponential'], + default='sigmoid', + description='Activation function to use for output layer.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + loss = hyperparams.Enumeration[str]( + values=['mean_squared_error'], + default='mean_squared_error', + description='Loss function.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + gamma = 
hyperparams.Hyperparameter[float]( + default=1.0, + description='Coefficient of beta VAE regime. Default is regular VAE.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + capacity = hyperparams.Hyperparameter[float]( + default=0.0, + description='Maximum capacity of a loss bottle neck.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + optimizer = hyperparams.Enumeration[str]( + values=['SGD', 'RMSprop', 'adam', 'Adadelta', 'Adagrad', + 'Adamax', 'Nadam', 'Ftrl'], + default='adam', + description='String (name of optimizer) or optimizer instance.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + epochs = hyperparams.Hyperparameter[int]( + default=100, + description='Number of epochs to train the model.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + batch_size = hyperparams.Hyperparameter[int]( + default=32, + description='Number of samples per gradient update.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + dropout_rate = hyperparams.Uniform( + lower=0., + upper=1., + default=0.2, + description='The dropout to be used across all layers.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + l2_regularizer = hyperparams.Uniform( + lower=0., + upper=1., + default=0.1, + description='The regularization strength of activity_regularizer applied on each layer.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + validation_size = hyperparams.Uniform( + lower=0., + upper=1., + default=0.1, + description='The percentage of data to be used for validation.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + preprocessing = hyperparams.UniformBool( + default=True, + description='If True, apply standardization on the data.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + verbosity = hyperparams.Enumeration[int]( + values=[0, 1, 2], + default=1, + description='Verbosity mode.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + random_state = hyperparams.Union[Union[int, None]]( + configuration=OrderedDict( + init=hyperparams.Hyperparameter[int]( + default=0, + ), + ninit=hyperparams.Hyperparameter[None]( + default=None, + ), + ), + default='ninit', + description='the seed used by the random number generator.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + ) + + contamination = hyperparams.Uniform( + lower=0., + upper=0.5, + default=0.01, + description='The amount of contamination of the data set, i.e. the proportion of outliers in the data set. ', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + + pass + + +class VariationalAutoEncoder(UnsupervisedOutlierDetectorBase[Inputs, Outputs, Params, Hyperparams]): + """ + Auto Encoder (AE) is a type of neural networks for learning useful data + representations unsupervisedly. Similar to PCA, AE could be used to + detect outlying objects in the data by calculating the reconstruction + errors. See :cite:`aggarwal2015outlier` Chapter 3 for details. + + Parameters + ---------- + hidden_neurons : list, optional (default=[4, 2, 4]) + The number of neurons per hidden layers. 
+ + hidden_activation : str, optional (default='relu') + Activation function to use for hidden layers. + All hidden layers are forced to use the same type of activation. + See https://keras.io/activations/ + + output_activation : str, optional (default='sigmoid') + Activation function to use for output layer. + See https://keras.io/activations/ + + loss : str or obj, optional (default=keras.losses.mean_squared_error) + String (name of objective function) or objective function. + See https://keras.io/losses/ + + optimizer : str, optional (default='adam') + String (name of optimizer) or optimizer instance. + See https://keras.io/optimizers/ + + epochs : int, optional (default=100) + Number of epochs to train the model. + + batch_size : int, optional (default=32) + Number of samples per gradient update. + + dropout_rate : float in (0., 1), optional (default=0.2) + The dropout to be used across all layers. + + l2_regularizer : float in (0., 1), optional (default=0.1) + The regularization strength of activity_regularizer + applied on each layer. By default, l2 regularizer is used. See + https://keras.io/regularizers/ + + validation_size : float in (0., 1), optional (default=0.1) + The percentage of data to be used for validation. + + preprocessing : bool, optional (default=True) + If True, apply standardization on the data. + + verbose : int, optional (default=1) + Verbosity mode. + - 0 = silent + - 1 = progress bar + - 2 = one line per epoch. + For verbosity >= 1, model summary may be printed. + + random_state : random_state: int, RandomState instance or None, optional + (default=None) + If int, random_state is the seed used by the random + number generator; If RandomState instance, random_state is the random + number generator; If None, the random number generator is the + RandomState instance used by `np.random`. + + contamination : float in (0., 0.5), optional (default=0.1) + The amount of contamination of the data set, i.e. + the proportion of outliers in the data set. When fitting this is used + to define the threshold on the decision function. + + Attributes + ---------- + encoding_dim_ : int + The number of neurons in the encoding layer. + + compression_rate_ : float + The ratio between the original feature and + the number of neurons in the encoding layer. + + model_ : Keras Object + The underlying AutoEncoder in Keras. + + history_: Keras Object + The AutoEncoder training history. + + decision_scores_ : numpy array of shape (n_samples,) + The outlier scores of the training data. + The higher, the more abnormal. Outliers tend to have higher + scores. This value is available once the detector is + fitted. + + threshold_ : float + The threshold is based on ``contamination``. It is the + ``n_samples * contamination`` most abnormal samples in + ``decision_scores_``. The threshold is calculated for generating + binary outlier labels. + + labels_ : int, either 0 or 1 + The binary labels of the training data. 0 stands for inliers + and 1 for outliers/anomalies. It is generated by applying + ``threshold_`` on ``decision_scores_``. 
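The `gamma` and `capacity` hyperparameters defined further above are only tersely described. Under the usual beta-VAE-with-capacity formulation (an assumption about what the wrapped pyod model optimises, not something stated in this code), they would enter the per-sample objective roughly as
```
\mathcal{L}(x) = \mathbb{E}_{q(z \mid x)}\bigl[-\log p(x \mid z)\bigr]
               + \gamma \,\bigl|\,\mathrm{KL}\bigl(q(z \mid x)\,\|\,p(z)\bigr) - C\,\bigr|
```
where gamma is `gamma` and C is `capacity`; with the defaults gamma = 1 and capacity = 0 this reduces to the standard VAE objective, consistent with "Default is regular VAE" above.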
+ """ + + metadata = metadata_base.PrimitiveMetadata({ + "name": "TODS.anomaly_detection_primitives.VariationalAutoEncoder", + "python_path": "d3m.primitives.tods.detection_algorithm.pyod_vae", + "source": {'name': "DATA Lab at Texas A&M University", 'contact': 'mailto:khlai037@tamu.edu','uris': ['https://gitlab.com/lhenry15/tods.git']}, + "algorithm_types": [metadata_base.PrimitiveAlgorithmType.VARIATIONAL_AUTO_ENCODER, ], + "primitive_family": metadata_base.PrimitiveFamily.ANOMALY_DETECTION, + "version": "0.0.1", + "hyperparameters_to_tune": [''], + "id": str(uuid.uuid3(uuid.NAMESPACE_DNS, 'AutoEncoderPrimitive')), + }) + + def __init__(self, *, + hyperparams: Hyperparams, # + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None) -> None: + super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) + if hyperparams['loss'] == 'mean_squared_error': + loss = keras.losses.mean_squared_error + else: + raise ValueError('VAE only suports mean squered error for now') + + + self._clf = VAE(contamination=hyperparams['contamination'], + encoder_neurons=hyperparams['encoder_neurons'], + decoder_neurons=hyperparams['decoder_neurons'], + hidden_activation=hyperparams['hidden_activation'], + output_activation=hyperparams['output_activation'], + loss=loss, + gamma=hyperparams['gamma'], + capacity=hyperparams['capacity'], + optimizer=hyperparams['optimizer'], + epochs=hyperparams['epochs'], + batch_size=hyperparams['batch_size'], + dropout_rate=hyperparams['dropout_rate'], + l2_regularizer=hyperparams['l2_regularizer'], + validation_size=hyperparams['validation_size'], + preprocessing=hyperparams['preprocessing'], + verbosity=hyperparams['verbosity'], + random_state=hyperparams['random_state'], + ) + + return + + def set_training_data(self, *, inputs: Inputs) -> None: + """ + Set training data for outlier detection. + Args: + inputs: Container DataFrame + + Returns: + None + """ + super().set_training_data(inputs=inputs) + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + """ + Fit model with training data. + Args: + *: Container DataFrame. Time series data up to fit. + + Returns: + None + """ + return super().fit() + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + """ + Process the testing data. + Args: + inputs: Container DataFrame. Time series data up to outlier detection. + + Returns: + Container DataFrame + 1 marks Outliers, 0 marks normal. + """ + return super().produce(inputs=inputs, timeout=timeout, iterations=iterations) + + def get_params(self) -> Params: + """ + Return parameters. + Args: + None + + Returns: + class Params + """ + return super().get_params() + + def set_params(self, *, params: Params) -> None: + """ + Set parameters for outlier detection. 
+ Args: + params: class Params + + Returns: + None + """ + super().set_params(params=params) + + diff --git a/tods/detection_algorithm/Telemanom.py b/tods/detection_algorithm/Telemanom.py new file mode 100644 index 0000000..cf67fa9 --- /dev/null +++ b/tods/detection_algorithm/Telemanom.py @@ -0,0 +1,473 @@ +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os +import sklearn +import numpy as np +import typing +import pandas as pd + + +from keras.models import Sequential, load_model +from keras.callbacks import History, EarlyStopping, Callback +from keras.layers.recurrent import LSTM +from keras.layers.core import Dense, Activation, Dropout +from keras.layers import Flatten + +from d3m import container, utils +from d3m.base import utils as base_ut +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult, DockerContainer + +from d3m.primitive_interfaces.unsupervised_learning import UnsupervisedLearnerPrimitiveBase +from d3m.primitive_interfaces.transformer import TransformerPrimitiveBase +from detection_algorithm.UODBasePrimitive import Params_ODBase, Hyperparams_ODBase, UnsupervisedOutlierDetectorBase + +from detection_algorithm.core.CollectiveBase import CollectiveBaseDetector + +from sklearn.utils import check_array + +# from d3m.primitive_interfaces.base import ProbabilisticCompositionalityMixin, ContinueFitMixin +from d3m import exceptions + + +# from detection_algorithm.UODBasePrimitive import Params_ODBase, Hyperparams_ODBase, UnsupervisedOutlierDetectorBase + +from detection_algorithm.core.utils.errors import Errors +from detection_algorithm.core.utils.channel import Channel +from detection_algorithm.core.utils.modeling import Model + +# from pyod.models.base import BaseDetector + + + +__all__ = ('Telemanom',) + +Inputs = container.DataFrame +Outputs = container.DataFrame + +class Params(Params_ODBase): + ######## Add more Attributes ####### + + pass + + +class Hyperparams(Hyperparams_ODBase): + + + smoothing_perc = hyperparams.Hyperparameter[float]( + default=0.05, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + description="determines window size used in EWMA smoothing (percentage of total values for channel)" + ) + + + window_size_ = hyperparams.Hyperparameter[int]( + default=100, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="number of trailing batches to use in error calculation" + ) + + error_buffer = hyperparams.Hyperparameter[int]( + default=50, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="number of values surrounding an error that are brought into the sequence (promotes grouping on nearby sequences" + ) + + batch_size = hyperparams.Hyperparameter[int]( + default=70, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Batch size while predicting" + ) + + + # LSTM Model Parameters + + + dropout = hyperparams.Hyperparameter[float]( + default=0.3, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + description="Dropout rate" + ) + + 
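For readers following the diff: every detection primitive in this commit (for example the VariationalAutoEncoder from PyodVAE.py above) is driven through the same d3m calling convention implemented in UODBasePrimitive.py below. A minimal usage sketch, assuming the package installs so the class is importable as shown and that the wrapped pyod VAE accepts the default hyperparameters; the toy DataFrame is made up for illustration:
```
# Hedged usage sketch; not part of this commit.
import numpy as np
from d3m import container

# Import path is an assumption about how the package is laid out once installed.
from tods.detection_algorithm.PyodVAE import VariationalAutoEncoder, Hyperparams

# Hypothetical multivariate toy data (100 timestamps, 6 value columns).
df = container.DataFrame(np.random.randn(100, 6), generate_metadata=True)

primitive = VariationalAutoEncoder(hyperparams=Hyperparams.defaults())
primitive.set_training_data(inputs=df)
primitive.fit()                                      # trains the wrapped pyod VAE
labels = primitive.produce(inputs=df).value          # DataFrame: 1 marks outliers, 0 normal
scores = primitive.produce_score(inputs=df).value    # raw anomaly scores (from the base class)
```
The same set_training_data / fit / produce sequence applies to the Telemanom primitive defined next.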
validation_split = hyperparams.Hyperparameter[float]( + default=0.2, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Validation split" + ) + + optimizer = hyperparams.Hyperparameter[typing.Union[str, None]]( + default='Adam', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Optimizer" + ) + + + lstm_batch_size = hyperparams.Hyperparameter[int]( + default=64, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="LSTM model training batch size" + ) + + + loss_metric = hyperparams.Hyperparameter[typing.Union[str, None]]( + default='mean_squared_error', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Loss function" + ) + + + layers = hyperparams.List( + elements=hyperparams.Hyperparameter[int](1), + default=[10,10], + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + description="Number of units for the two LSTM layers" + ) + + # Training Parameters + + epochs = hyperparams.Hyperparameter[int]( + default=1, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + description="Number of training epochs" + ) + + patience = hyperparams.Hyperparameter[int]( + default=10, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Number of consecutive training iterations to allow without the val_loss decreasing by at least min_delta" + ) + + min_delta = hyperparams.Hyperparameter[float]( + default=0.0003, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + description="Minimum decrease in val_loss required for a training iteration to count as an improvement (used together with patience for early stopping)" + ) + + + l_s = hyperparams.Hyperparameter[int]( + default=100, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + description="Number of previous timesteps provided to the model to predict future values" + ) + + n_predictions = hyperparams.Hyperparameter[int]( + default=10, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Number of steps ahead to predict" + ) + + + # Error thresholding parameters + # ================================== + + p = hyperparams.Hyperparameter[float]( + default=0.05, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + description="Minimum percent decrease between max errors in anomalous sequences (used for pruning)" + ) + + # Contamination + + contamination = hyperparams.Uniform( + lower=0., + upper=0.5, + default=0.1, + description='The amount of contamination of the data set, i.e. the proportion of outliers in the data set. 
Used when fitting to define the threshold on the decision function', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + + +class TelemanomPrimitive(UnsupervisedOutlierDetectorBase[Inputs, Outputs, Params, Hyperparams]): + """ + A primitive that uses telmanom for outlier detection + + Parameters + ---------- + + + """ + + __author__ = "Data Lab" + metadata = metadata_base.PrimitiveMetadata( + { + '__author__' : "DATA Lab at Texas A&M University", + 'name': "Telemanom", + 'python_path': 'd3m.primitives.tods.detection_algorithm.telemanom', + 'source': { + 'name': 'DATA Lab at Texas A&M University', + 'contact': 'mailto:khlai037@tamu.edu', + 'uris': [ + 'https://gitlab.com/lhenry15/tods.git', + 'https://gitlab.com/lhenry15/tods/-/blob/purav/anomaly-primitives/anomaly_primitives/telemanom.py', + ], + }, + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.TELEMANOM, + ], + 'primitive_family': metadata_base.PrimitiveFamily.ANOMALY_DETECTION, + 'id': 'c7259da6-7ce6-42ad-83c6-15238679f5fa', + 'hyperparameters_to_tune':['layers','loss_metric','optimizer','epochs','p','l_s','patience','min_delta','dropout','smoothing_perc'], + 'version': '0.0.1', + }, + ) + + def __init__(self, *, + hyperparams: Hyperparams, # + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None) -> None: + + super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) + + self._clf = Detector(smoothing_perc=self.hyperparams['smoothing_perc'], + window_size=self.hyperparams['window_size_'], + error_buffer=self.hyperparams['error_buffer'], + batch_size = self.hyperparams['batch_size'], + validation_split = self.hyperparams['validation_split'], + optimizer = self.hyperparams['optimizer'], + lstm_batch_size = self.hyperparams['lstm_batch_size'], + loss_metric = self.hyperparams['loss_metric'], + layers = self.hyperparams['layers'], + epochs = self.hyperparams['epochs'], + patience = self.hyperparams['patience'], + min_delta = self.hyperparams['min_delta'], + l_s = self.hyperparams['l_s'], + n_predictions = self.hyperparams['n_predictions'], + p = self.hyperparams['p'], + contamination=hyperparams['contamination'] + ) + + def set_training_data(self, *, inputs: Inputs) -> None: + """ + Set training data for outlier detection. + Args: + inputs: Container DataFrame + + Returns: + None + """ + super().set_training_data(inputs=inputs) + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + """ + Fit model with training data. + Args: + *: Container DataFrame. Time series data up to fit. + + Returns: + None + """ + return super().fit() + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + """ + Process the testing data. + Args: + inputs: Container DataFrame. Time series data up to outlier detection. + + Returns: + Container DataFrame + 1 marks Outliers, 0 marks normal. + """ + return super().produce(inputs=inputs, timeout=timeout, iterations=iterations) + + + def produce_score(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + """ + Process the testing data. + Args: + inputs: Container DataFrame. Time series data up to outlier detection. + Returns: + Container DataFrame + Outlier score of input DataFrame. + """ + return super().produce_score(inputs=inputs, timeout=timeout, iterations=iterations) + + + def get_params(self) -> Params: + """ + Return parameters. 
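For orientation, `l_s` and `n_predictions` control how the series is cut into LSTM windows: each training sample sees `l_s` past values and the model learns to predict the following `n_predictions` values. A rough standalone sketch of that shaping (assumed to mirror what Channel.shape_train_data does in detection_algorithm/core/utils/channel.py; the helper name below is just for illustration):
```
import numpy as np

def make_windows(values, l_s=100, n_predictions=10):
    """Return (X, y): X[i] holds l_s past steps, y[i] the next n_predictions steps."""
    X, y = [], []
    for i in range(len(values) - l_s - n_predictions + 1):
        X.append(values[i:i + l_s])
        y.append(values[i + l_s:i + l_s + n_predictions])
    return np.asarray(X), np.asarray(y)

series = np.sin(np.linspace(0, 20, 500))   # hypothetical univariate series
X, y = make_windows(series, l_s=100, n_predictions=10)
print(X.shape, y.shape)                    # (391, 100) (391, 10)
```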
+ Args: + None + + Returns: + class Params + """ + return super().get_params() + + def set_params(self, *, params: Params) -> None: + """ + Set parameters for outlier detection. + Args: + params: class Params + + Returns: + None + """ + super().set_params(params=params) + + + +class Detector(CollectiveBaseDetector): + """Class to Implement Deep Log LSTM based on "https://www.cs.utah.edu/~lifeifei/papers/deeplog.pdf + Only Parameter Value anomaly detection layer has been implemented for time series data""" + + def __init__(self,smoothing_perc=0.05,window_size = 10,error_buffer = 5,batch_size =30, \ + dropout = 0.3, validation_split=0.2,optimizer='adam',lstm_batch_size=64,loss_metric='mean_squared_error', \ + layers=[40,40],epochs = 1,patience =10,min_delta=0.0003,l_s=5,n_predictions=2,p = 0.05,contamination=0.1): + + # super(Detector, self).__init__(contamination=contamination) + super(Detector, self).__init__(contamination=contamination, + window_size=l_s, + step_size=1, + ) + + self._smoothin_perc = smoothing_perc + self._window_size =window_size + self._error_buffer = error_buffer + self._batch_size = batch_size + self._dropout = dropout + self._validation_split = validation_split + self._optimizer = optimizer + self._lstm_batch_size = lstm_batch_size + self._loss_metric = loss_metric + self._layers = layers + self._epochs = epochs + self._patience = patience + self._min_delta = min_delta + self._l_s = l_s + self._n_predictions = n_predictions + self._p = p + self.contamination = contamination + + # self.y_hat = None + self.results = [] + self.result_df = None + + self._model = None + self._channel = None + + + def fit(self,X,y=None): + """ + Fit data to LSTM model. + Args: + inputs : X , ndarray of size (number of sample,features) + + Returns: + return : self object with trained model + """ + X = check_array(X).astype(np.float) + self._set_n_classes(None) + + inputs = X + self._channel = Channel(n_predictions = self._n_predictions,l_s = self._l_s) + self._channel.shape_train_data(inputs) + + self._model = Model(self._channel,patience = self._patience, + min_delta =self._min_delta, + layers = self._layers, + dropout = self._dropout, + n_predictions = self._n_predictions, + loss_metric = self._loss_metric, + optimizer = self._optimizer, + lstm_batch_size = self._lstm_batch_size, + epochs = self._epochs, + validation_split = self._validation_split, + batch_size = self._batch_size, + l_s = self._l_s + ) + + self.decision_scores_, self.left_inds_, self.right_inds_ = self.decision_function(X) + self._process_decision_scores() + + return self + + + + def decision_function(self, X: np.array): + """Predict raw anomaly scores of X using the fitted detector. + + The anomaly score of an input sample is computed based on the fitted + detector. For consistency, outliers are assigned with + higher anomaly scores. + + Parameters + ---------- + X : numpy array of shape (n_samples, n_features) + The input samples. Sparse matrices are accepted only + if they are supported by the base estimator. + + Returns + ------- + anomaly_scores : numpy array of shape (n_samples,) + The anomaly score of the input samples. 
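The Errors helper used in `decision_function` below smooths the raw prediction errors before anomalous sequences are extracted; `smoothing_perc` sets the EWMA span as a fraction of the series length. A hedged sketch of that smoothing step (the real logic lives in detection_algorithm/core/utils/errors.py and may differ in detail):
```
import numpy as np
import pandas as pd

raw_errors = np.abs(np.random.randn(1000))                         # hypothetical per-step prediction errors
smoothing_perc = 0.05
smoothing_window = max(1, int(len(raw_errors) * smoothing_perc))   # 5% of the series length
smoothed_errors = pd.Series(raw_errors).ewm(span=smoothing_window).mean().values
```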
+ """ + + X = check_array(X).astype(np.float) + self._set_n_classes(None) + + inputs = X + self._channel.shape_test_data(inputs) + self._channel = self._model.batch_predict(channel = self._channel) + + errors = Errors(channel = self._channel, + window_size = self._window_size, + batch_size = self._batch_size, + smoothing_perc = self._smoothin_perc, + n_predictions = self._n_predictions, + l_s = self._l_s, + error_buffer = self._error_buffer, + p = self._p + ) + + # prediciton smoothed error + prediction_errors = np.reshape(errors.e_s,(self._channel.X_train.shape[0],self._channel.X_train.shape[2])) + prediction_errors = np.sum(prediction_errors,axis=1) + + left_indices = [] + right_indices = [] + scores = [] + for i in range(len(prediction_errors)): + left_indices.append(i) + right_indices.append(i+self._l_s) + scores.append(prediction_errors[i]) + + + + return np.asarray(scores),np.asarray(left_indices),np.asarray(right_indices) + + + +# if __name__ == "__main__": + +# csv = pd.read_csv("/home/purav/Downloads/yahoo_train.csv") +# # X_train = np.asarray( +# # [3., 4., 8., 16, 18, 13., 22., 36., 59., 128, 62, 67, 78, 100]).reshape(-1, 1) + +# # X_test = np.asarray( +# # [3., 4., 8.6, 13.4, 22.5, 17, 19.2, 36.1, 127, -23, 59.2]).reshape(-1,1) + +# # print(X_train.shape, X_test.shape) + +# X_train = csv.iloc[:,[2,3,4,5,6]].values + +# clf = Detector(contamination=0.1) +# clf.fit(X_train) +# # pred_scores = clf.decision_function(X_test) +# pred_labels = clf.predict(X_train) + +# print(clf.threshold_) +# # print(np.percentile(pred_scores, 100 * 0.9)) + +# # print('pred_scores: ',pred_scores) +# print('scores: ',pred_labels[0].shape) +# print('left_indices: ',pred_labels[1].shape) +# print('right_indices: ',pred_labels[2].shape) diff --git a/tods/detection_algorithm/UODBasePrimitive.py b/tods/detection_algorithm/UODBasePrimitive.py new file mode 100755 index 0000000..668bae9 --- /dev/null +++ b/tods/detection_algorithm/UODBasePrimitive.py @@ -0,0 +1,687 @@ +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os +import sklearn +import numpy +# import typing +import abc +import typing +# +# # Custom import commands if any +import warnings +import numpy as np +from sklearn.utils import check_array +from sklearn.exceptions import NotFittedError +# from numba import njit +# from pyod.utils.utility import argmaxn +from pyod.models.base import BaseDetector + +import copy + +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult, DockerContainer, PrimitiveBase, MultiCallResult, Params, Hyperparams + +# # from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase +from d3m.primitive_interfaces.unsupervised_learning import UnsupervisedLearnerPrimitiveBase +from d3m.primitive_interfaces.transformer import TransformerPrimitiveBase + +from d3m.primitive_interfaces.base import * + +from d3m.primitive_interfaces.base import ProbabilisticCompositionalityMixin, ContinueFitMixin +from d3m import exceptions +import pandas + +from d3m import container, utils as d3m_utils + +Inputs = d3m_dataframe +# Inputs = container.Dataset +Outputs = d3m_dataframe + +# import abc +# import 
typing + + from d3m.primitive_interfaces.base import * + + __all__ = ('UnsupervisedOutlierDetectorBase',) + + + class Params_ODBase(params.Params): + + # decision_scores_: Optional[ndarray] + # threshold_: Optional[float] + # labels_: Optional[ndarray] + left_inds_: Optional[ndarray] + right_inds_: Optional[ndarray] + # clf_: Optional[BaseDetector] + clf_: Optional[Any] + + # Keep previous + input_column_names: Optional[Any] + target_names_: Optional[Sequence[Any]] + training_indices_: Optional[Sequence[int]] + target_column_indices_: Optional[Sequence[int]] + target_columns_metadata_: Optional[List[OrderedDict]] + + + + class Hyperparams_ODBase(hyperparams.Hyperparams): + + contamination = hyperparams.Uniform( # Hyperparameter[float]( + lower=0., + upper=0.5, + default=0.1, + description='The amount of contamination of the data set, i.e. the proportion of outliers in the data set. Used when fitting to define the threshold on the decision function.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + window_size = hyperparams.Hyperparameter[int]( + default=1, + description='The moving window size.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + step_size = hyperparams.Hyperparameter[int]( + default=1, + description='The displacement for moving window.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + # Keep previous + return_subseq_inds = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="If true, return value includes subsequence index." + ) + use_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to operate on. If any specified column cannot be parsed, it is skipped.", + ) + exclude_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='new', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. 
Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', + 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute'], + default='https://metadata.datadrivendiscovery.org/types/Attribute', + description='Decides what semantic type to attach to generated attributes', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + + class UnsupervisedOutlierDetectorBase(UnsupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]): + """ + Parameters + ---------- + contamination : float in (0., 0.5), optional (default=0.1) + The amount of contamination of the data set, i.e. + the proportion of outliers in the data set. When fitting this is used + to define the threshold on the decision function. + + Attributes + ---------- + clf_.decision_scores_ : numpy array of shape (n_samples,) + The outlier scores of the training data. + The higher, the more abnormal. Outliers tend to have higher + scores. This value is available once the detector is + fitted. + + clf_.threshold_: float within (0, 1) + For outliers, decision_scores_ is above threshold_. + For inliers, decision_scores_ is below threshold_. + + clf_.labels_ : int, either 0 or 1 + The binary labels of the training data. 0 stands for inliers + and 1 for outliers/anomalies. It is generated by applying + ``threshold_`` on ``decision_scores_``. + + left_inds_ : ndarray, + One of the mappings from decision scores to data. + For point outlier detection, left_inds_ exactly equals the index of each data point. + For collective outlier detection, left_inds_ equals the start index of each subsequence. + + right_inds_ : ndarray, + The other mapping from decision scores to data. + For point outlier detection, right_inds_ exactly equals the index of each data point plus 1. + For collective outlier detection, right_inds_ equals the ending index of each subsequence. + """ + # probability_score: + # window_size: int + # The moving window size. + + __author__ = "DATA Lab at Texas A&M University" + metadata: metadata_base.PrimitiveMetadata = None + + def __init__(self, *, + hyperparams: Hyperparams, + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None) -> None: + super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) + + self._clf = None + self._clf_fit_parameter = {} + self.primitiveNo = 0 + + self.window_size = hyperparams['window_size'] + self.step_size = hyperparams['step_size'] + self.left_inds_ = None + self.right_inds_ = None + + self._inputs = None + self._outputs = None + self._training_inputs = None + self._training_outputs = None + self._target_names = None + self._training_indices = None + self._target_column_indices = None + self._target_columns_metadata: List[OrderedDict] = None + self._input_column_names = None + self._fitted = False +# + @abc.abstractmethod + def set_training_data(self, *, inputs: Inputs) -> None: + """ + Set training data for outlier detection. 
+ Args: + inputs: Container DataFrame + + Returns: + None + """ + self._inputs = inputs + self._fitted = False + + def _set_subseq_inds(self): + + self.left_inds_ = getattr(self._clf, 'left_inds_', None) + self.right_inds_ = getattr(self._clf, 'right_inds_', None) + + if self.left_inds_ is None or self.right_inds_ is None: + self.left_inds_ = numpy.arange(0, len(self._inputs), self.step_size) + self.right_inds_ = self.left_inds_ + self.window_size + self.right_inds_[self.right_inds_ > len(self._inputs)] = len(self._inputs) + # print(self.left_inds_, self.right_inds_) + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + """ + Fit model with training data. + Args: + *: Container DataFrame. Time series data up to fit. + + Returns: + None + """ + # print('Fit:', self._clf) + + if self._fitted: + return CallResult(None) + + self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + if self._training_inputs is None: + return CallResult(None) + + if len(self._training_indices) > 0: + + # print('Fit: ', self._clf) + # print('Fit: ', self._training_inputs.values.shape) + # print('Fit: ', self._clf.fit(self._training_inputs.values)) + + self._clf.fit(X=self._training_inputs.values, **self._clf_fit_parameter) + self._fitted = True + self._set_subseq_inds() + + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + return CallResult(None) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + """ + Process the testing data. + Args: + inputs: Container DataFrame. Time series data up to outlier detection. + + Returns: + Container DataFrame + 1 marks Outliers, 0 marks normal. 
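A small worked example of the index bookkeeping in `_set_subseq_inds` above, for the case where the wrapped detector does not expose its own left/right indices:
```
import numpy as np

n_samples, window_size, step_size = 10, 3, 2
left_inds = np.arange(0, n_samples, step_size)    # [0 2 4 6 8]
right_inds = left_inds + window_size              # [3 5 7 9 11]
right_inds[right_inds > n_samples] = n_samples    # clipped to [3 5 7 9 10]
```
Each (left, right) pair marks the span of input rows that a decision score refers to.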
+ """ + + if not self._fitted: + raise PrimitiveNotFittedError("Primitive not fitted.") + sk_inputs = inputs + if self.hyperparams['use_semantic_types']: + sk_inputs = inputs.iloc[:, self._training_indices] + output_columns = [] + if len(self._training_indices) > 0: + + if self.hyperparams['return_subseq_inds']: + + if getattr(self._clf, 'left_inds_', None) is None or getattr(self._clf, 'right_inds_', None) is None: # point OD + pred_label = self._clf.predict(sk_inputs.values) + left_inds_ = numpy.arange(0, len(pred_label), self.step_size) + right_inds_ = left_inds_ + self.window_size + right_inds_[right_inds_ > len(pred_label)] = len(pred_label) + else: + pred_label, left_inds_, right_inds_ = self._clf.predict(sk_inputs.values) + + # print(pred_label.shape, left_inds_.shape, right_inds_.shape) + # print(pred_label, left_inds_, right_inds_) + + sk_output = numpy.concatenate((numpy.expand_dims(pred_label, axis=1), + numpy.expand_dims(left_inds_, axis=1), + numpy.expand_dims(right_inds_, axis=1)), axis=1) + + + else: + if getattr(self._clf, 'left_inds_', None) is None or getattr(self._clf, 'right_inds_', None) is None: # point OD + sk_output = self._clf.predict(sk_inputs.values) + + else: + sk_output, _, _ = self._clf.predict(sk_inputs.values) + + # print(sk_output) + if sparse.issparse(sk_output): + sk_output = sk_output.toarray() + + outputs = self._wrap_predictions(inputs, sk_output) + if len(outputs.columns) == len(self._input_column_names): + outputs.columns = self._input_column_names + output_columns = [outputs] + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._training_indices, + columns_list=output_columns) + + return CallResult(outputs) + + def produce_score(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + """ + Process the testing data. + Args: + inputs: Container DataFrame. Time series data up to outlier detection. + + Returns: + Container DataFrame + 1 marks Outliers, 0 marks normal. 
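When `return_subseq_inds` is enabled, the output assembled in `produce` above carries one row per scored window: the predicted label followed by that window's left and right indices. A toy illustration of the layout:
```
import numpy as np

pred_label  = np.array([0, 1, 0])
left_inds_  = np.array([0, 2, 4])
right_inds_ = np.array([2, 4, 6])

sk_output = np.concatenate((np.expand_dims(pred_label, axis=1),
                            np.expand_dims(left_inds_, axis=1),
                            np.expand_dims(right_inds_, axis=1)), axis=1)
# sk_output:
# [[0 0 2]
#  [1 2 4]
#  [0 4 6]]
```
`produce_score` below builds the same three-column layout with raw scores in place of labels.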
+ """ + + if not self._fitted: + raise PrimitiveNotFittedError("Primitive not fitted.") + sk_inputs = inputs + if self.hyperparams['use_semantic_types']: + sk_inputs = inputs.iloc[:, self._training_indices] + output_columns = [] + if len(self._training_indices) > 0: + + if self.hyperparams['return_subseq_inds']: + + if getattr(self._clf, 'left_inds_', None) is None or getattr(self._clf, 'right_inds_', None) is None: # point OD + pred_score = self._clf.decision_function(sk_inputs.values).ravel() + left_inds_ = numpy.arange(0, len(pred_score), self.step_size) + right_inds_ = left_inds_ + self.window_size + right_inds_[right_inds_ > len(pred_score)] = len(pred_score) + + else: + pred_score, left_inds_, right_inds_ = self._clf.decision_function(sk_inputs.values) + + # print(pred_score.shape, left_inds_.shape, right_inds_.shape) + + sk_output = numpy.concatenate((numpy.expand_dims(pred_score, axis=1), + numpy.expand_dims(left_inds_, axis=1), + numpy.expand_dims(right_inds_, axis=1)), axis=1) + + else: + if getattr(self._clf, 'left_inds_', None) is None or getattr(self._clf, 'right_inds_', None) is None: # point OD + sk_output = self._clf.decision_function(sk_inputs.values) + + else: + sk_output, _, _ = self._clf.decision_function(sk_inputs.values) + + if sparse.issparse(sk_output): + sk_output = sk_output.toarray() + outputs = self._wrap_predictions(inputs, sk_output) + if len(outputs.columns) == len(self._input_column_names): + outputs.columns = self._input_column_names + output_columns = [outputs] + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._training_indices, + columns_list=output_columns) + return CallResult(outputs) + + + def get_params(self) -> Params_ODBase: + """ + Return parameters. + Args: + None + + Returns: + class Params_ODBase + """ + + if not self._fitted: + return Params_ODBase( + # decision_scores_=None, + # threshold_=None, + # labels_=None, + left_inds_=None, + right_inds_=None, + clf_=copy.copy(self._clf), + + # Keep previous + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + return Params_ODBase( + # decision_scores_=getattr(self._clf, 'decision_scores_', None), + # threshold_=getattr(self._clf, 'threshold_', None), + # labels_=getattr(self._clf, 'labels_', None), + left_inds_=self.left_inds_, # numpy.array(self.left_inds_) + right_inds_=self.right_inds_, # numpy.array(self.right_inds_) + clf_=copy.copy(self._clf), + + # Keep previous + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + # pass + + + def set_params(self, *, params: Params_ODBase) -> None: + """ + Set parameters for outlier detection. 
+ Args: + params: class Params_ODBase + + Returns: + None + """ + + # self._clf.decision_scores_ = params['decision_scores_'] + # self._clf.threshold_ = params['threshold_'] + # self._clf.labels_ = params['labels_'] + self.left_inds_ = params['left_inds_'] + self.right_inds_ = params['right_inds_'] + self._clf = copy.copy(params['clf_']) + + # Keep previous + self._input_column_names = params['input_column_names'] + self._training_indices = params['training_indices_'] + self._target_names = params['target_names_'] + self._target_column_indices = params['target_column_indices_'] + self._target_columns_metadata = params['target_columns_metadata_'] + + + # if params['decision_scores_'] is not None: + # self._fitted = True + # if params['threshold_'] is not None: + # self._fitted = True + # if params['labels_'] is not None: + # self._fitted = True + if params['left_inds_'] is not None: + self._fitted = True + if params['right_inds_'] is not None: + self._fitted = True + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + """ + Select columns to fit. + Args: + inputs: Container DataFrame + hyperparams: d3m.metadata.hyperparams.Hyperparams + + Returns: + list + """ + + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=hyperparams['use_columns'], + exclude_columns=hyperparams['exclude_columns'], + can_use_column=can_produce_column) + + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, + hyperparams: Hyperparams) -> bool: + """ + Output whether a column can be processed. + Args: + inputs_metadata: d3m.metadata.base.DataMetadata + column_index: int + + Returns: + bool + """ + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, numpy.integer, numpy.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + + @classmethod + def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]: + """ + Output metadata of selected columns. + Args: + outputs_metadata: metadata_base.DataMetadata + hyperparams: d3m.metadata.hyperparams.Hyperparams + + Returns: + d3m.metadata.base.DataMetadata + """ + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) + + # Update semantic types and prepare it for predicted targets. 
+ semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set([]) + add_semantic_types = set() + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + """ + Update metadata for selected columns. + Args: + inputs_metadata: metadata_base.DataMetadata + outputs: Container Dataframe + target_columns_metadata: list + + Returns: + d3m.metadata.base.DataMetadata + """ + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + """ + Wrap predictions into a dataframe. + Args: + inputs: Container Dataframe + predictions: array-like data (n_samples, n_features) + + Returns: + Dataframe + """ + outputs = d3m_dataframe(predictions, generate_metadata=True) + # target_columns_metadata = self._copy_inputs_metadata(inputs.metadata, self._training_indices, outputs.metadata, + # self.hyperparams) + target_columns_metadata = self._add_target_columns_metadata(outputs.metadata, self.hyperparams, self.primitiveNo) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, target_columns_metadata) + # print(outputs.metadata.to_internal_simple_structure()) + + return outputs + + @classmethod + def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams, primitiveNo): + """ + Add target columns metadata. + Args: + outputs_metadata: metadata.base.DataMetadata + hyperparams: d3m.metadata.hyperparams.Hyperparams + + Returns: + List[OrderedDict] + """ + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_name = "{0}{1}_{2}".format(cls.metadata.query()['name'], primitiveNo, column_index) + column_metadata = OrderedDict() + semantic_types = set() + semantic_types.add(hyperparams["return_semantic_type"]) + column_metadata['semantic_types'] = list(semantic_types) + + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + @classmethod + def _copy_inputs_metadata(cls, inputs_metadata: metadata_base.DataMetadata, input_indices: List[int], + outputs_metadata: metadata_base.DataMetadata, hyperparams): + """ + Update metadata for selected columns. 
+ Args: + inputs_metadata: metadata.base.DataMetadata + input_indices: list + outputs_metadata: metadata.base.DataMetadata + hyperparams: d3m.metadata.hyperparams.Hyperparams + + Returns: + d3m.metadata.base.DataMetadata + """ + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + target_columns_metadata: List[OrderedDict] = [] + for column_index in input_indices: + column_name = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name") + if column_name is None: + column_name = "output_{}".format(column_index) + + column_metadata = OrderedDict(inputs_metadata.query_column(column_index)) + semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set([]) + add_semantic_types = set() + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + # If outputs has more columns than index, add Attribute Type to all remaining + if outputs_length > len(input_indices): + for column_index in range(len(input_indices), outputs_length): + column_metadata = OrderedDict() + semantic_types = set() + semantic_types.add(hyperparams["return_semantic_type"]) + column_name = "output_{}".format(column_index) + column_metadata["semantic_types"] = list(semantic_types) + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + +# OutlierDetectorBase.__doc__ = OutlierDetectorBase.__doc__ diff --git a/tods/detection_algorithm/core/AutoRegOD.py b/tods/detection_algorithm/core/AutoRegOD.py new file mode 100644 index 0000000..9513e37 --- /dev/null +++ b/tods/detection_algorithm/core/AutoRegOD.py @@ -0,0 +1,171 @@ +# -*- coding: utf-8 -*- +"""Autoregressive model for univariate time series outlier detection. +""" +import numpy as np +from sklearn.utils import check_array +from sklearn.utils.validation import check_is_fitted +from sklearn.linear_model import LinearRegression + +from detection_algorithm.core.CollectiveBase import CollectiveBaseDetector + +from detection_algorithm.core.utility import get_sub_matrices + + +class AutoRegOD(CollectiveBaseDetector): + """Autoregressive models use linear regression to calculate a sample's + deviance from the predicted value, which is then used as its + outlier scores. This model is for univariate time series. + See MultiAutoRegOD for multivariate data. + + See :cite:`aggarwal2015outlier` Chapter 9 for details. + + Parameters + ---------- + window_size : int + The moving window size. + + step_size : int, optional (default=1) + The displacement for moving window. + + contamination : float in (0., 0.5), optional (default=0.1) + The amount of contamination of the data set, i.e. + the proportion of outliers in the data set. When fitting this is used + to define the threshold on the decision function. + + Attributes + ---------- + decision_scores_ : numpy array of shape (n_samples,) + The outlier scores of the training data. + The higher, the more abnormal. Outliers tend to have higher + scores. This value is available once the detector is fitted. + + threshold_ : float + The threshold is based on ``contamination``. It is the + ``n_samples * contamination`` most abnormal samples in + ``decision_scores_``. 
The threshold is calculated for generating + binary outlier labels. + + labels_ : int, either 0 or 1 + The binary labels of the training data. 0 stands for inliers + and 1 for outliers/anomalies. It is generated by applying + ``threshold_`` on ``decision_scores_``. + """ + + def __init__(self, window_size, step_size=1, contamination=0.1): + super(AutoRegOD, self).__init__(contamination=contamination) + self.window_size = window_size + self.step_size = step_size + + def fit(self, X: np.array) -> object: + """Fit detector. y is ignored in unsupervised methods. + + Parameters + ---------- + X : numpy array of shape (n_samples, n_features) + The input samples. + + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + self : object + Fitted estimator. + """ + X = check_array(X).astype(np.float) + + # generate X and y + sub_matrices, self.left_inds_, self.right_inds_ = get_sub_matrices( + X, + window_size=self.window_size, + step=self.step_size, + return_numpy=True, + flatten=True) + # remove the last one + sub_matrices = sub_matrices[:-1, :] + self.left_inds_ = self.left_inds_[:-1] + self.right_inds_ = self.right_inds_[:-1] + + self.valid_len_ = sub_matrices.shape[0] + + y_buf = np.zeros([self.valid_len_, 1]) + + for i in range(self.valid_len_): + y_buf[i] = X[i * self.step_size + self.window_size] + # print(sub_matrices.shape, y_buf.shape) + + # fit the linear regression model + self.lr_ = LinearRegression(fit_intercept=True) + self.lr_.fit(sub_matrices, y_buf) + self.decision_scores_ = np.absolute( + y_buf.ravel() - self.lr_.predict(sub_matrices).ravel()) + + self._process_decision_scores() + return self + + def decision_function(self, X: np.array): + """Predict raw anomaly scores of X using the fitted detector. + + The anomaly score of an input sample is computed based on the fitted + detector. For consistency, outliers are assigned with + higher anomaly scores. + + Parameters + ---------- + X : numpy array of shape (n_samples, n_features) + The input samples. Sparse matrices are accepted only + if they are supported by the base estimator. + + Returns + ------- + anomaly_scores : numpy array of shape (n_samples,) + The anomaly score of the input samples. 
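A worked miniature of the fit logic above, assuming `get_sub_matrices` builds plain flattened sliding windows (window_size=3, step_size=1); each next value's deviation from the linear prediction becomes its outlier score:
```
import numpy as np
from sklearn.linear_model import LinearRegression

X = np.array([3., 4., 8., 16., 18., 13., 22.])
window_size, step = 3, 1

windows = np.array([X[i:i + window_size] for i in range(0, len(X) - window_size, step)])
targets = np.array([X[i + window_size] for i in range(0, len(X) - window_size, step)])
# windows[0] = [3 4 8] -> target 16, windows[1] = [4 8 16] -> target 18, ...

lr = LinearRegression(fit_intercept=True).fit(windows, targets)
scores = np.abs(targets - lr.predict(windows))   # per-window outlier scores
```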
+ """ + check_is_fitted(self, ['lr_']) + + sub_matrices, X_left_inds, X_right_inds = \ + get_sub_matrices(X, + window_size=self.window_size, + step=self.step_size, + return_numpy=True, + flatten=True) + + # remove the last one + sub_matrices = sub_matrices[:-1, :] + X_left_inds = X_left_inds[:-1] + X_right_inds = X_right_inds[:-1] + + valid_len = sub_matrices.shape[0] + + y_buf = np.zeros([valid_len, 1]) + + for i in range(valid_len): + y_buf[i] = X[i * self.step_size + self.window_size] + + pred_score = np.absolute( + y_buf.ravel() - self.lr_.predict(sub_matrices).ravel()) + + return pred_score, X_left_inds.ravel(), X_right_inds.ravel() + + +if __name__ == "__main__": + X_train = np.asarray( + [3., 4., 8., 16, 18, 13., 22., 36., 59., 128, 62, 67, 78, + 100]).reshape(-1, 1) + + X_test = np.asarray( + [3., 4., 8.6, 13.4, 22.5, 17, 19.2, 36.1, 127, -23, 59.2]).reshape(-1, + 1) + + clf = AutoRegOD(window_size=3, contamination=0.2) + clf.fit(X_train) + decision_scores, left_inds_, right_inds = clf.decision_scores_, \ + clf.left_inds_, clf.right_inds_ + print(clf.left_inds_, clf.right_inds_) + pred_scores, X_left_inds, X_right_inds = clf.decision_function(X_test) + pred_labels, X_left_inds, X_right_inds = clf.predict(X_test) + pred_probs, X_left_inds, X_right_inds = clf.predict_proba(X_test) + + print(pred_scores) + print(pred_labels) + print(pred_probs) diff --git a/tods/detection_algorithm/core/CollectiveBase.py b/tods/detection_algorithm/core/CollectiveBase.py new file mode 100644 index 0000000..c511588 --- /dev/null +++ b/tods/detection_algorithm/core/CollectiveBase.py @@ -0,0 +1,476 @@ +# -*- coding: utf-8 -*- +"""Base class for all Collective outlier detector models +""" + +from __future__ import division +from __future__ import print_function + +import warnings +from collections import defaultdict + +from inspect import signature + +import abc +from abc import ABCMeta + +import numpy as np +from numpy import percentile +from scipy.special import erf +from sklearn.preprocessing import MinMaxScaler +from sklearn.utils import deprecated +from sklearn.utils.validation import check_is_fitted +from sklearn.utils.multiclass import check_classification_targets + + +def _pprint(params, offset=0, printer=repr): + # noinspection PyPep8 + """Pretty print the dictionary 'params' + + See http://scikit-learn.org/stable/modules/generated/sklearn.base.BaseEstimator.html + and sklearn/base.py for more information. + + :param params: The dictionary to pretty print + :type params: dict + + :param offset: The offset in characters to add at the begin of each line. + :type offset: int + + :param printer: The function to convert entries to strings, typically + the builtin str or repr + :type printer: callable + + :return: None + """ + + # Do a multi-line justified repr: + options = np.get_printoptions() + np.set_printoptions(precision=5, threshold=64, edgeitems=2) + params_list = list() + this_line_length = offset + line_sep = ',\n' + (1 + offset // 2) * ' ' + for i, (k, v) in enumerate(sorted(params.items())): + if type(v) is float: + # use str for representing floating point numbers + # this way we get consistent representation across + # architectures and versions. + this_repr = '%s=%s' % (k, str(v)) + else: + # use repr of the rest + this_repr = '%s=%s' % (k, printer(v)) + if len(this_repr) > 500: + this_repr = this_repr[:300] + '...' 
+ this_repr[-100:] + if i > 0: + if this_line_length + len(this_repr) >= 75 or '\n' in this_repr: + params_list.append(line_sep) + this_line_length = len(line_sep) + else: + params_list.append(', ') + this_line_length += 2 + params_list.append(this_repr) + this_line_length += len(this_repr) + + np.set_printoptions(**options) + lines = ''.join(params_list) + # Strip trailing space to avoid nightmare in doctests + lines = '\n'.join(l.rstrip(' ') for l in lines.split('\n')) + return lines + + +class CollectiveBaseDetector(metaclass=ABCMeta): + """Abstract class for all outlier detection algorithms. + + Parameters + ---------- + contamination : float in (0., 0.5), optional (default=0.1) + The amount of contamination of the data set, + i.e. the proportion of outliers in the data set. Used when fitting to + define the threshold on the decision function. + + window_size : int, optional (default=1) + The moving window size. + + step_size :, optional (default=1) + The displacement for moving window. + + Attributes + ---------- + decision_scores_ : numpy array of shape (n_samples,) + The outlier scores of the training data. + The higher, the more abnormal. Outliers tend to have higher + scores. This value is available once the detector is fitted. + + threshold_ : float + The threshold is based on ``contamination``. It is the + ``n_samples * contamination`` most abnormal samples in + ``decision_scores_``. The threshold is calculated for generating + binary outlier labels. + + labels_ : int, either 0 or 1 + The binary labels of the training data. 0 stands for inliers + and 1 for outliers/anomalies. It is generated by applying + ``threshold_`` on ``decision_scores_``. + """ + + @abc.abstractmethod + def __init__(self, contamination=0.1, + window_size=1, + step_size=1): + + if not (0. < contamination <= 0.5): + raise ValueError("contamination must be in (0, 0.5], " + "got: %f" % contamination) + + self.contamination = contamination + self.window_size = window_size + self.step_size = step_size + self._classes = 2 # leave the parameter on for extension + self.left_inds_ = None + self.right_inds = None + + # noinspection PyIncorrectDocstring + @abc.abstractmethod + def fit(self, X, y=None): + """Fit detector. y is ignored in unsupervised methods. + + Parameters + ---------- + X : numpy array of shape (n_samples, n_features) + The input samples. + + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + self : object + Fitted estimator. + """ + pass + + @abc.abstractmethod + def decision_function(self, X): + """Predict raw anomaly scores of X using the fitted detector. + + The anomaly score of an input sample is computed based on the fitted + detector. For consistency, outliers are assigned with + higher anomaly scores. + + Parameters + ---------- + X : numpy array of shape (n_samples, n_features) + The input samples. Sparse matrices are accepted only + if they are supported by the base estimator. + + Returns + ------- + anomaly_scores : numpy array of shape (n_samples,) + The anomaly score of the input samples. + """ + pass + + @deprecated() + def fit_predict(self, X, y=None): + """Fit detector first and then predict whether a particular sample + is an outlier or not. y is ignored in unsupervised models. + + Parameters + ---------- + X : numpy array of shape (n_samples, n_features) + The input samples. + + y : Ignored + Not used, present for API consistency by convention. 
+ + Returns + ------- + outlier_labels : numpy array of shape (n_samples,) + For each observation, tells whether or not + it should be considered as an outlier according to the + fitted model. 0 stands for inliers and 1 for outliers. + + .. deprecated:: 0.6.9 + `fit_predict` will be removed in pyod 0.8.0.; it will be + replaced by calling `fit` function first and then accessing + `labels_` attribute for consistency. + """ + + self.fit(X, y) + return self.labels_ + + def predict(self, X): + """Predict if a particular sample is an outlier or not. + + Parameters + ---------- + X : numpy array of shape (n_samples, n_features) + The input samples. + + Returns + ------- + outlier_labels : numpy array of shape (n_samples,) + For each observation, tells whether or not + it should be considered as an outlier according to the + fitted model. 0 stands for inliers and 1 for outliers. + """ + + check_is_fitted(self, ['decision_scores_', 'threshold_', 'labels_']) + + pred_score, X_left_inds, X_right_inds = self.decision_function(X) + + return (pred_score > self.threshold_).astype( + 'int').ravel(), X_left_inds.ravel(), X_right_inds.ravel() + + def predict_proba(self, X, method='linear'): + """Predict the probability of a sample being outlier. Two approaches + are possible: + + 1. simply use Min-max conversion to linearly transform the outlier + scores into the range of [0,1]. The model must be + fitted first. + 2. use unifying scores, see :cite:`kriegel2011interpreting`. + + Parameters + ---------- + X : numpy array of shape (n_samples, n_features) + The input samples. + + method : str, optional (default='linear') + probability conversion method. It must be one of + 'linear' or 'unify'. + + Returns + ------- + outlier_probability : numpy array of shape (n_samples,) + For each observation, tells whether or not + it should be considered as an outlier according to the + fitted model. Return the outlier probability, ranging + in [0,1]. + """ + + check_is_fitted(self, ['decision_scores_', 'threshold_', 'labels_']) + train_scores = self.decision_scores_ + + test_scores, X_left_inds, X_right_inds = self.decision_function(X) + + probs = np.zeros([test_scores.shape[0], int(self._classes)]) + if method == 'linear': + scaler = MinMaxScaler().fit(train_scores.reshape(-1, 1)) + probs[:, 1] = scaler.transform( + test_scores.reshape(-1, 1)).ravel().clip(0, 1) + probs[:, 0] = 1 - probs[:, 1] + return probs, X_left_inds.ravel(), X_right_inds.ravel() + + elif method == 'unify': + # turn output into probability + pre_erf_score = (test_scores - self._mu) / ( + self._sigma * np.sqrt(2)) + erf_score = erf(pre_erf_score) + probs[:, 1] = erf_score.clip(0, 1).ravel() + probs[:, 0] = 1 - probs[:, 1] + return probs, X_left_inds.ravel(), X_right_inds.ravel() + else: + raise ValueError(method, + 'is not a valid probability conversion method') + + def _predict_rank(self, X, normalized=False): + """Predict the outlyingness rank of a sample by a fitted model. The + method is for outlier detector score combination. + + Parameters + ---------- + X : numpy array of shape (n_samples, n_features) + The input samples. + + normalized : bool, optional (default=False) + If set to True, all ranks are normalized to [0,1]. + + Returns + ------- + ranks : array, shape (n_samples,) + Outlying rank of a sample according to the training data. 
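The conversion from scores to labels is centralised in `_process_decision_scores` further below: the threshold is the (1 - contamination) percentile of the training scores, so roughly a `contamination` fraction of the training points ends up labelled as outliers. A small numeric example:
```
import numpy as np

decision_scores = np.array([0.10, 0.15, 0.18, 0.20, 0.22, 0.25, 0.27, 0.30, 0.90, 0.95])
contamination = 0.1

threshold = np.percentile(decision_scores, 100 * (1 - contamination))  # 90th percentile ~= 0.905
labels = (decision_scores > threshold).astype(int)                     # only 0.95 exceeds it
```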
+ + """ + + check_is_fitted(self, ['decision_scores_']) + + test_scores = self.decision_function(X) + train_scores = self.decision_scores_ + + sorted_train_scores = np.sort(train_scores) + ranks = np.searchsorted(sorted_train_scores, test_scores) + + if normalized: + # return normalized ranks + ranks = ranks / ranks.max() + return ranks + + def _set_n_classes(self, y): + """Set the number of classes if `y` is presented, which is not + expected. It could be useful for multi-class outlier detection. + + Parameters + ---------- + y : numpy array of shape (n_samples,) + Ground truth. + + Returns + ------- + self + """ + + self._classes = 2 # default as binary classification + if y is not None: + check_classification_targets(y) + self._classes = len(np.unique(y)) + warnings.warn( + "y should not be presented in unsupervised learning.") + return self + + def _process_decision_scores(self): + """Internal function to calculate key attributes: + + - threshold_: used to decide the binary label + - labels_: binary labels of training data + + Returns + ------- + self + """ + + self.threshold_ = percentile(self.decision_scores_, + 100 * (1 - self.contamination)) + self.labels_ = (self.decision_scores_ > self.threshold_).astype( + 'int').ravel() + + # calculate for predict_proba() + + self._mu = np.mean(self.decision_scores_) + self._sigma = np.std(self.decision_scores_) + + return self + + # noinspection PyMethodParameters + def _get_param_names(cls): + # noinspection PyPep8 + """Get parameter names for the estimator + + See http://scikit-learn.org/stable/modules/generated/sklearn.base.BaseEstimator.html + and sklearn/base.py for more information. + """ + + # fetch the constructor or the original constructor before + # deprecation wrapping if any + init = getattr(cls.__init__, 'deprecated_original', cls.__init__) + if init is object.__init__: + # No explicit constructor to introspect + return [] + + # introspect the constructor arguments to find the model parameters + # to represent + init_signature = signature(init) + # Consider the constructor parameters excluding 'self' + parameters = [p for p in init_signature.parameters.values() + if p.name != 'self' and p.kind != p.VAR_KEYWORD] + for p in parameters: + if p.kind == p.VAR_POSITIONAL: + raise RuntimeError("scikit-learn estimators should always " + "specify their parameters in the signature" + " of their __init__ (no varargs)." + " %s with constructor %s doesn't " + " follow this convention." + % (cls, init_signature)) + # Extract and sort argument names excluding 'self' + return sorted([p.name for p in parameters]) + + # noinspection PyPep8 + def get_params(self, deep=True): + """Get parameters for this estimator. + + See http://scikit-learn.org/stable/modules/generated/sklearn.base.BaseEstimator.html + and sklearn/base.py for more information. + + Parameters + ---------- + deep : bool, optional (default=True) + If True, will return the parameters for this estimator and + contained subobjects that are estimators. + + Returns + ------- + params : mapping of string to any + Parameter names mapped to their values. + """ + + out = dict() + for key in self._get_param_names(): + # We need deprecation warnings to always be on in order to + # catch deprecated param values. + # This is set in utils/__init__.py but it gets overwritten + # when running under python3 somehow. 
+ warnings.simplefilter("always", DeprecationWarning) + try: + with warnings.catch_warnings(record=True) as w: + value = getattr(self, key, None) + if len(w) and w[0].category == DeprecationWarning: + # if the parameter is deprecated, don't show it + continue + finally: + warnings.filters.pop(0) + + # XXX: should we rather test if instance of estimator? + if deep and hasattr(value, 'get_params'): + deep_items = value.get_params().items() + out.update((key + '__' + k, val) for k, val in deep_items) + out[key] = value + return out + + def set_params(self, **params): + # noinspection PyPep8 + """Set the parameters of this estimator. + The method works on simple estimators as well as on nested objects + (such as pipelines). The latter have parameters of the form + ``__`` so that it's possible to update each + component of a nested object. + + See http://scikit-learn.org/stable/modules/generated/sklearn.base.BaseEstimator.html + and sklearn/base.py for more information. + + Returns + ------- + self : object + """ + + if not params: + # Simple optimization to gain speed (inspect is slow) + return self + valid_params = self.get_params(deep=True) + + nested_params = defaultdict(dict) # grouped by prefix + for key, value in params.items(): + key, delim, sub_key = key.partition('__') + if key not in valid_params: + raise ValueError('Invalid parameter %s for estimator %s. ' + 'Check the list of available parameters ' + 'with `estimator.get_params().keys()`.' % + (key, self)) + + if delim: + nested_params[key][sub_key] = value + else: + setattr(self, key, value) + + for key, sub_params in nested_params.items(): + valid_params[key].set_params(**sub_params) + + return self + + def __repr__(self): + # noinspection PyPep8 + """ + See http://scikit-learn.org/stable/modules/generated/sklearn.base.BaseEstimator.html + and sklearn/base.py for more information. 
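+
+ Illustrative output (assuming a hypothetical subclass called
+ ``MyDetector`` that only defines the base constructor arguments;
+ the exact formatting depends on ``_pprint``)::
+
+ MyDetector(contamination=0.1, step_size=1, window_size=1)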
+ """ + + class_name = self.__class__.__name__ + return '%s(%s)' % (class_name, _pprint(self.get_params(deep=False), + offset=len(class_name), ),) diff --git a/tods/detection_algorithm/core/CollectiveCommonTest.py b/tods/detection_algorithm/core/CollectiveCommonTest.py new file mode 100755 index 0000000..a689eff --- /dev/null +++ b/tods/detection_algorithm/core/CollectiveCommonTest.py @@ -0,0 +1,169 @@ +# -*- coding: utf-8 -*- + +from __future__ import division +from __future__ import print_function + +import os +import sys + +import numpy as np +import unittest +# noinspection PyProtectedMember +from sklearn.utils.testing import assert_allclose +from sklearn.utils.testing import assert_array_less +from sklearn.utils.testing import assert_equal +from sklearn.utils.testing import assert_greater +from sklearn.utils.testing import assert_greater_equal +from sklearn.utils.testing import assert_less_equal +from sklearn.utils.testing import assert_raises + +from sklearn.utils.estimator_checks import check_estimator + +from sklearn.metrics import roc_auc_score +from scipy.stats import rankdata + +# temporary solution for relative imports in case pyod is not installed +# if pyod is installed, no need to use the following line +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) + +from pyod.utils.data import generate_data + + +class CollectiveCommonTest: + def __init__(self, + model, + X_train, + y_train, + X_test, + y_test, + roc_floor, + ): + self.clf = model + self.X_train = X_train + self.y_train = y_train + self.X_test = X_test + self.y_test = y_test + self.roc_floor = roc_floor + + self.clf.fit(self.X_train) + + pass + + def test_detector(self): + + self.test_parameters() + self.test_train_scores() + self.test_train_inds() + self.test_prediction_scores() + self.test_prediction_proba() + self.test_prediction_proba_linear() + self.test_prediction_proba_unify() + self.test_prediction_proba_parameter() + # self.test_fit_predict() + # self.test_fit_predict_score() + self.test_prediction_labels() + self.test_prediction_inds() + # self.test_predict_rank() + # self.test_predict_rank_normalized() + self.tearDown() + + def test_parameters(self): + assert (hasattr(self.clf, 'decision_scores_') and + self.clf.decision_scores_ is not None) + assert (hasattr(self.clf, 'labels_') and + self.clf.labels_ is not None) + assert (hasattr(self.clf, 'threshold_') and + self.clf.threshold_ is not None) + assert (hasattr(self.clf, 'left_inds_') and + self.clf.left_inds_ is not None) + assert (hasattr(self.clf, 'right_inds_') and + self.clf.right_inds_ is not None) + assert (hasattr(self.clf, '_mu') and + self.clf._mu is not None) + assert (hasattr(self.clf, '_sigma') and + self.clf._sigma is not None) + + def test_train_scores(self): + assert_equal(len(self.clf.decision_scores_), self.y_train.shape[0]) + + def test_train_inds(self): + inds_valid = self.clf.left_inds_ < self.clf.right_inds_ + assert_equal(self.clf.left_inds_.shape, self.clf.decision_scores_.shape) + assert_equal(self.clf.right_inds_.shape, self.clf.decision_scores_.shape) + assert_equal(all(inds_valid), True) + + def test_prediction_scores(self): + pred_scores, _, _ = self.clf.decision_function(self.X_test) + + # check score shapes + assert_equal(pred_scores.shape[0], self.y_test.shape[0]) + + # check performance + assert_greater(roc_auc_score(self.y_test, pred_scores), self.roc_floor) + + def test_prediction_labels(self): + pred_labels, _, _ = self.clf.predict(self.X_test) + assert_equal(pred_labels.shape, 
self.y_test.shape) + + def test_prediction_inds(self): + _, left_inds, right_inds = self.clf.predict(self.X_test) + inds_valid = left_inds < right_inds + + assert_equal(left_inds.shape, self.y_test.shape) + assert_equal(right_inds.shape, self.y_test.shape) + assert_equal(all(inds_valid), True) + + + def test_prediction_proba(self): + pred_proba, _, _ = self.clf.predict_proba(self.X_test) + assert_greater_equal(pred_proba.min(), 0) + assert_less_equal(pred_proba.max(), 1) + + def test_prediction_proba_linear(self): + pred_proba, _, _ = self.clf.predict_proba(self.X_test, method='linear') + assert_greater_equal(pred_proba.min(), 0) + assert_less_equal(pred_proba.max(), 1) + + def test_prediction_proba_unify(self): + pred_proba, _, _ = self.clf.predict_proba(self.X_test, method='unify') + assert_greater_equal(pred_proba.min(), 0) + assert_less_equal(pred_proba.max(), 1) + + def test_prediction_proba_parameter(self): + with assert_raises(ValueError): + self.clf.predict_proba(self.X_test, method='something') + + def test_fit_predict(self): + pred_labels, _, _ = self.clf.fit_predict(X=self.X_train) + assert_equal(pred_labels.shape, self.y_train.shape) + + def test_fit_predict_score(self): + self.clf.fit_predict_score(self.X_test, self.y_test) + self.clf.fit_predict_score(self.X_test, self.y_test, + scoring='roc_auc_score') + self.clf.fit_predict_score(self.X_test, self.y_test, + scoring='prc_n_score') + with assert_raises(NotImplementedError): + self.clf.fit_predict_score(self.X_test, self.y_test, + scoring='something') + + def test_predict_rank(self): + pred_socres, _, _ = self.clf.decision_function(self.X_test) + pred_ranks = self.clf._predict_rank(self.X_test) + + # assert the order is reserved + assert_allclose(rankdata(pred_ranks), rankdata(pred_socres), atol=2) + assert_array_less(pred_ranks, self.X_train.shape[0] + 1) + assert_array_less(-0.1, pred_ranks) + + def test_predict_rank_normalized(self): + pred_socres, _, _ = self.clf.decision_function(self.X_test) + pred_ranks = self.clf._predict_rank(self.X_test, normalized=True) + + # assert the order is reserved + assert_allclose(rankdata(pred_ranks), rankdata(pred_socres), atol=2) + assert_array_less(pred_ranks, 1.01) + assert_array_less(-0.1, pred_ranks) + + def tearDown(self): + pass diff --git a/tods/detection_algorithm/core/KDiscord.py b/tods/detection_algorithm/core/KDiscord.py new file mode 100644 index 0000000..5cfe4c0 --- /dev/null +++ b/tods/detection_algorithm/core/KDiscord.py @@ -0,0 +1,266 @@ +# -*- coding: utf-8 -*- +"""Autoregressive model for multivariate time series outlier detection. +""" +import numpy as np +from sklearn.utils import check_array +from sklearn.utils.validation import check_is_fitted + +from detection_algorithm.core.CollectiveBase import CollectiveBaseDetector +from pyod.models.knn import KNN + +from detection_algorithm.core.utility import get_sub_matrices + + +# TODO: add an argument to exclude "near equal" samples +# TODO: another thought is to treat each dimension independent +class KDiscord(CollectiveBaseDetector): + """KDiscord first split multivariate time series into + subsequences (matrices), and it use kNN outlier detection based on PyOD. + For an observation, its distance to its kth nearest neighbor could be + viewed as the outlying score. It could be viewed as a way to measure + the density. See :cite:`ramaswamy2000efficient,angiulli2002fast` for + details. + + See :cite:`aggarwal2015outlier,zhao2020using` for details. + + Parameters + ---------- + window_size : int + The moving window size. 
+ + step_size : int, optional (default=1) + The displacement for moving window. + + contamination : float in (0., 0.5), optional (default=0.1) + The amount of contamination of the data set, + i.e. the proportion of outliers in the data set. Used when fitting to + define the threshold on the decision function. + + n_neighbors : int, optional (default = 5) + Number of neighbors to use by default for k neighbors queries. + + method : str, optional (default='largest') + {'largest', 'mean', 'median'} + + - 'largest': use the distance to the kth neighbor as the outlier score + - 'mean': use the average of all k neighbors as the outlier score + - 'median': use the median of the distance to k neighbors as the + outlier score + + radius : float, optional (default = 1.0) + Range of parameter space to use by default for `radius_neighbors` + queries. + + algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional + Algorithm used to compute the nearest neighbors: + + - 'ball_tree' will use BallTree + - 'kd_tree' will use KDTree + - 'brute' will use a brute-force search. + - 'auto' will attempt to decide the most appropriate algorithm + based on the values passed to :meth:`fit` method. + + Note: fitting on sparse input will override the setting of + this parameter, using brute force. + + .. deprecated:: 0.74 + ``algorithm`` is deprecated in PyOD 0.7.4 and will not be + possible in 0.7.6. It has to use BallTree for consistency. + + leaf_size : int, optional (default = 30) + Leaf size passed to BallTree. This can affect the + speed of the construction and query, as well as the memory + required to store the tree. The optimal value depends on the + nature of the problem. + + metric : string or callable, default 'minkowski' + metric to use for distance computation. Any metric from scikit-learn + or scipy.spatial.distance can be used. + + If metric is a callable function, it is called on each + pair of instances (rows) and the resulting value recorded. The callable + should take two arrays as input and return one value indicating the + distance between them. This works for Scipy's metrics, but is less + efficient than passing the metric name as a string. + + Distance matrices are not supported. + + Valid values for metric are: + + - from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', + 'manhattan'] + + - from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev', + 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', + 'mahalanobis', 'matching', 'minkowski', 'rogerstanimoto', + 'russellrao', 'seuclidean', 'sokalmichener', 'sokalsneath', + 'sqeuclidean', 'yule'] + + See the documentation for scipy.spatial.distance for details on these + metrics. + + p : integer, optional (default = 2) + Parameter for the Minkowski metric from + sklearn.metrics.pairwise.pairwise_distances. When p = 1, this is + equivalent to using manhattan_distance (l1), and euclidean_distance + (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used. + See http://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.pairwise_distances + + metric_params : dict, optional (default = None) + Additional keyword arguments for the metric function. + + n_jobs : int, optional (default = 1) + The number of parallel jobs to run for neighbors search. + If ``-1``, then the number of jobs is set to the number of CPU cores. + Affects only kneighbors and kneighbors_graph methods. 
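+
+ Editorial example of the windowing (not from a real run): with
+ ``window_size=3`` and ``step_size=1``, a univariate series of length
+ 14 is chopped into 14 - 3 + 1 = 12 overlapping subsequences, and with
+ the default ``method='largest'`` each subsequence is scored by the
+ distance to its k-th nearest neighbouring subsequence.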
+ + Attributes + ---------- + decision_scores_ : numpy array of shape (n_samples,) + The outlier scores of the training data. + The higher, the more abnormal. Outliers tend to have higher + scores. This value is available once the detector is + fitted. + + threshold_ : float + The threshold is based on ``contamination``. It is the + ``n_samples * contamination`` most abnormal samples in + ``decision_scores_``. The threshold is calculated for generating + binary outlier labels. + + labels_ : int, either 0 or 1 + The binary labels of the training data. 0 stands for inliers + and 1 for outliers/anomalies. It is generated by applying + ``threshold_`` on ``decision_scores_``. + """ + + def __init__(self, window_size, step_size=1, contamination=0.1, + n_neighbors=5, method='largest', + radius=1.0, algorithm='auto', leaf_size=30, + metric='minkowski', p=2, metric_params=None, n_jobs=1, + **kwargs): + super(KDiscord, self).__init__(contamination=contamination) + self.window_size = window_size + self.step_size = step_size + + # parameters for kNN + self.n_neighbors = n_neighbors + self.method = method + self.radius = radius + self.algorithm = algorithm + self.leaf_size = leaf_size + self.metric = metric + self.p = p + self.metric_params = metric_params + self.n_jobs = n_jobs + + # initialize a kNN model + self.model_ = KNN(contamination=self.contamination, + n_neighbors=self.n_neighbors, + radius=self.radius, + algorithm=self.algorithm, + leaf_size=self.leaf_size, + metric=self.metric, + p=self.p, + metric_params=self.metric_params, + n_jobs=self.n_jobs, + **kwargs) + + def fit(self, X: np.array) -> object: + """Fit detector. y is ignored in unsupervised methods. + + Parameters + ---------- + X : numpy array of shape (n_samples, n_features) + The input samples. + + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + self : object + Fitted estimator. + """ + X = check_array(X).astype(np.float) + + # first convert it into submatrices, and flatten it + sub_matrices, self.left_inds_, self.right_inds_ = get_sub_matrices( + X, + self.window_size, + self.step_size, + return_numpy=True, + flatten=True) + + # fit the kNN model + self.model_.fit(sub_matrices) + self.decision_scores_ = self.model_.decision_scores_ + self._process_decision_scores() + return self + + def decision_function(self, X: np.array): + """Predict raw anomaly scores of X using the fitted detector. + + The anomaly score of an input sample is computed based on the fitted + detector. For consistency, outliers are assigned with + higher anomaly scores. + + Parameters + ---------- + X : numpy array of shape (n_samples, n_features) + The input samples. Sparse matrices are accepted only + if they are supported by the base estimator. + + Returns + ------- + anomaly_scores : numpy array of shape (n_samples,) + The anomaly score of the input samples. 
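+
+ Editorial note: in this collective variant the method actually
+ returns a 3-tuple ``(anomaly_scores, X_left_inds, X_right_inds)``,
+ where ``X_left_inds[i]:X_right_inds[i]`` delimits the window that
+ produced ``anomaly_scores[i]``. A minimal usage sketch (assumes a
+ fitted detector ``clf`` and a 2-D array ``X_test``)::
+
+ scores, left_inds, right_inds = clf.decision_function(X_test)
+ # scores[i] belongs to the window X_test[left_inds[i]:right_inds[i]]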
+ """ + check_is_fitted(self, ['model_']) + X = check_array(X).astype(np.float) + # first convert it into submatrices, and flatten it + sub_matrices, X_left_inds, X_right_inds = get_sub_matrices( + X, + self.window_size, + self.step_size, + return_numpy=True, + flatten=True) + + # return the prediction result by kNN + return self.model_.decision_function(sub_matrices), \ + X_left_inds.ravel(), X_right_inds.ravel() + + +if __name__ == "__main__": + X_train = np.asarray( + [3., 4., 8., 16, 18, 13., 22., 36., 59., 128, 62, 67, 78, + 100]).reshape(-1, 1) + + X_test = np.asarray( + [3., 4., 8.6, 13.4, 22.5, 17, 19.2, 36.1, 127, -23, 59.2]).reshape(-1, + 1) + + # X_train = np.asarray( + # [[3., 5], [5., 9], [7., 2], [42., 20], [8., 12], [10., 12], + # [12., 12], + # [18., 16], [20., 7], [18., 10], [23., 12], [22., 15]]) + # + # X_test = np.asarray( + # [[12., 10], [8., 12], [80., 80], [92., 983], + # [18., 16], [20., 7], [18., 10], [3., 5], [5., 9], [23., 12], + # [22., 15]]) + + clf = KDiscord(window_size=3, step_size=1, contamination=0.2, + n_neighbors=5) + + clf.fit(X_train) + decision_scores, left_inds_, right_inds = clf.decision_scores_, \ + clf.left_inds_, clf.right_inds_ + print(clf.left_inds_, clf.right_inds_) + pred_scores, X_left_inds, X_right_inds = clf.decision_function(X_test) + pred_labels, X_left_inds, X_right_inds = clf.predict(X_test) + pred_probs, X_left_inds, X_right_inds = clf.predict_proba(X_test) + + print(pred_scores) + print(pred_labels) + print(pred_probs) diff --git a/tods/detection_algorithm/core/LSTMOD.py b/tods/detection_algorithm/core/LSTMOD.py new file mode 100755 index 0000000..91f7820 --- /dev/null +++ b/tods/detection_algorithm/core/LSTMOD.py @@ -0,0 +1,233 @@ +# -*- coding: utf-8 -*- +"""Autoregressive model for univariate time series outlier detection. 
+""" +import numpy as np +from sklearn.utils import check_array +from sklearn.utils.validation import check_is_fitted +from scipy.special import erf +from sklearn.preprocessing import MinMaxScaler + +from detection_algorithm.core.CollectiveBase import CollectiveBaseDetector + +# from tod.utility import get_sub_matrices + +from keras.layers import Dense, LSTM +from keras.models import Sequential + +class LSTMOutlierDetector(CollectiveBaseDetector): + + def __init__(self,contamination=0.1, + train_contamination=0.0, + min_attack_time=5, + danger_coefficient_weight=0.5, + loss='mean_squared_error', + optimizer='adam', + epochs=10, + batch_size=8, + dropout_rate=0.0, + feature_dim=1, + hidden_dim=8, + n_hidden_layer=0, + activation=None, + diff_group_method='average' + ): + + super(LSTMOutlierDetector, self).__init__(contamination=contamination, + window_size=min_attack_time, + step_size=1, + ) + + self.train_contamination = train_contamination + self.min_attack_time = min_attack_time + self.danger_coefficient_weight = danger_coefficient_weight + self.relative_error_threshold = None + + self.loss = loss + self.optimizer = optimizer + self.epochs = epochs + self.batch_size = batch_size + self.dropout_rate = dropout_rate + self.feature_dim = feature_dim + self.hidden_dim = hidden_dim + self.n_hidden_layer = n_hidden_layer + self.diff_group_method = diff_group_method + + + self.model_ = Sequential() + self.model_.add(LSTM(units=hidden_dim, input_shape=(feature_dim, 1), + dropout=dropout_rate, activation=activation)) + + for layer_idx in range(n_hidden_layer): + self.model_.add(LSTM(units=hidden_dim, input_shape=(hidden_dim, 1), + dropout=dropout_rate, activation=activation)) + + self.model_.add(Dense(units=feature_dim, input_shape=(hidden_dim, 1), activation=None)) + + self.model_.compile(loss=self.loss, optimizer=self.optimizer) + + def fit(self, X: np.array, y=None) -> object: + """Fit detector. y is ignored in unsupervised methods. + + Parameters + ---------- + X : numpy array of shape (n_samples, n_features) + The input samples. + + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + self : object + Fitted estimator. + """ + X = check_array(X).astype(np.float) + self._set_n_classes(None) + X_buf, y_buf = self._get_sub_matrices(X) + + # fit the LSTM model + self.model_.fit(X_buf, y_buf, epochs=self.epochs, batch_size=self.batch_size) + + relative_error = self._relative_error(X) + + if self.train_contamination < 1e-6: + self.relative_error_threshold = max(relative_error) + else: + self.relative_error_threshold = np.percentile(relative_error, 100 * (1 - self.train_contamination)) + + self.decision_scores_, self.left_inds_, self.right_inds_ = self.decision_function(X) + self._process_decision_scores() + + return self + + def _get_sub_matrices(self, X: np.array): + # return X[:-1].reshape(-1, 1, self.feature_dim), X[1:] + return np.expand_dims(X[:-1], axis=2), X[1:] + + + def _relative_error(self, X: np.array): + + X = check_array(X).astype(np.float) + X_buf, y_buf = self._get_sub_matrices(X) + + y_predict = self.model_.predict(X_buf) + + relative_error = (np.linalg.norm(y_predict - y_buf, axis=1) / np.linalg.norm(y_buf + 1e-6, axis=1)).ravel() + + return relative_error + + def decision_function(self, X: np.array): + """Predict raw anomaly scores of X using the fitted detector. + + The anomaly score of an input sample is computed based on the fitted + detector. For consistency, outliers are assigned with + higher anomaly scores. 
+ + Parameters + ---------- + X : numpy array of shape (n_samples, n_features) + The input samples. Sparse matrices are accepted only + if they are supported by the base estimator. + + Returns + ------- + anomaly_scores : numpy array of shape (n_samples,) + The anomaly score of the input samples. + """ + check_is_fitted(self, ['model_']) + + relative_error = self._relative_error(X) + + error_num_buf = (relative_error > self.relative_error_threshold).astype(int) + + if not (self.diff_group_method in ['max', 'min', 'average']): + raise ValueError(self.diff_group_method, "is not a valid method") + + relative_error_left_inds = np.ones((len(relative_error), )) * len(relative_error) + relative_error_right_inds = np.zeros((len(relative_error), )) + + + if self.diff_group_method == 'average': + danger_coefficient = np.zeros(relative_error.shape) + averaged_relative_error = np.zeros(relative_error.shape) + calculated_times = np.zeros(relative_error.shape) + + for i in range(len(relative_error) - self.min_attack_time + 1): + dc_tmp = error_num_buf[i:i+self.min_attack_time].sum() / self.min_attack_time + are_tmp = relative_error[i:i+self.min_attack_time].sum() / self.min_attack_time + + for j in range(self.min_attack_time): + averaged_relative_error[i + j] += are_tmp + danger_coefficient[i + j] += dc_tmp + calculated_times[i + j] += 1 + relative_error_left_inds[i + j] = i if i < relative_error_left_inds[i + j] else relative_error_left_inds[i + j] + relative_error_right_inds[i + j] = i+self.min_attack_time if i+self.min_attack_time > relative_error_right_inds[i + j] else relative_error_left_inds[i + j] + + # print(calculated_times) + danger_coefficient /= calculated_times + averaged_relative_error /= calculated_times + # print(danger_coefficient, averaged_relative_error) + + + else: + danger_coefficient = np.zeros(relative_error.shape) + averaged_relative_error = np.zeros(relative_error.shape) + + if self.diff_group_method == 'min': + danger_coefficient += float('inf') + averaged_relative_error += float('inf') + + for i in range(len(relative_error) - self.min_attack_time + 1): + dc_tmp = error_num_buf[i:i+self.min_attack_time].sum() / self.min_attack_time + are_tmp = relative_error[i:i+self.min_attack_time].sum() / self.min_attack_time + + if self.diff_group_method == 'max': + for j in range(self.min_attack_time): + if are_tmp > averaged_relative_error[i + j] or dc_tmp > danger_coefficient[i+j]: + relative_error_left_inds[i + j] = i + relative_error_right_inds[i + j] = i+self.min_attack_time + if are_tmp > averaged_relative_error[i + j]: + averaged_relative_error[i + j] = are_tmp + if dc_tmp > danger_coefficient[i+j]: + danger_coefficient[i + j] = dc_tmp + + else: + for j in range(self.min_attack_time): + if are_tmp < averaged_relative_error[i + j] or dc_tmp < danger_coefficient[i+j]: + relative_error_left_inds[i + j] = i + relative_error_right_inds[i + j] = i+self.min_attack_time + if are_tmp < averaged_relative_error[i + j]: + averaged_relative_error[i + j] = are_tmp + if dc_tmp < danger_coefficient[i+j]: + danger_coefficient[i + j] = dc_tmp + + + # print(relative_error_left_inds) + # print(relative_error_right_inds) + pred_score = danger_coefficient * self.danger_coefficient_weight + averaged_relative_error * (1 - self.danger_coefficient_weight) + + return pred_score, relative_error_left_inds, relative_error_right_inds + + + +if __name__ == "__main__": + X_train = np.asarray( + [3., 4., 8., 16, 18, 13., 22., 36., 59., 128, 62, 67, 78, 100]).reshape(-1, 1) + + X_test = np.asarray( + [3., 4., 8.6, 
13.4, 22.5, 17, 19.2, 36.1, 127, -23, 59.2]).reshape(-1,1) + + # print(X_train.shape, X_test.shape) + + clf = LSTMOutlierDetector(contamination=0.1) + clf.fit(X_train) + # pred_scores = clf.decision_function(X_test) + pred_labels, left_inds, right_inds = clf.predict(X_test) + + print(pred_labels.shape, left_inds.shape, right_inds.shape) + + print(clf.threshold_) + # print(np.percentile(pred_scores, 100 * 0.9)) + + # print('pred_scores: ',pred_scores) + print('pred_labels: ',pred_labels) diff --git a/tods/detection_algorithm/core/MultiAutoRegOD.py b/tods/detection_algorithm/core/MultiAutoRegOD.py new file mode 100644 index 0000000..054b96f --- /dev/null +++ b/tods/detection_algorithm/core/MultiAutoRegOD.py @@ -0,0 +1,226 @@ +# -*- coding: utf-8 -*- +"""Autoregressive model for multivariate time series outlier detection. +""" +import numpy as np +from sklearn.utils import check_array +from sklearn.utils.validation import check_is_fitted +from sklearn.utils import column_or_1d + +from detection_algorithm.core.CollectiveBase import CollectiveBaseDetector +from combo.models.score_comb import average, maximization, median, aom, moa +from combo.utils.utility import standardizer + +from detection_algorithm.core.AutoRegOD import AutoRegOD +from detection_algorithm.core.utility import get_sub_sequences_length + + +class MultiAutoRegOD(CollectiveBaseDetector): + """Autoregressive models use linear regression to calculate a sample's + deviance from the predicted value, which is then used as its + outlier scores. This model is for multivariate time series. + This model handles multivariate time series by various combination + approaches. See AutoRegOD for univarite data. + + See :cite:`aggarwal2015outlier,zhao2020using` for details. + + Parameters + ---------- + window_size : int + The moving window size. + + step_size : int, optional (default=1) + The displacement for moving window. + + contamination : float in (0., 0.5), optional (default=0.1) + The amount of contamination of the data set, i.e. + the proportion of outliers in the data set. When fitting this is used + to define the threshold on the decision function. + + method : str, optional (default='average') + Combination method: {'average', 'maximization', + 'median'}. Pass in weights of detector for weighted version. + + weights : numpy array of shape (1, n_dimensions) + Score weight by dimensions. + + Attributes + ---------- + decision_scores_ : numpy array of shape (n_samples,) + The outlier scores of the training data. + The higher, the more abnormal. Outliers tend to have higher + scores. This value is available once the detector is + fitted. + + labels_ : int, either 0 or 1 + The binary labels of the training data. 0 stands for inliers + and 1 for outliers/anomalies. It is generated by applying + ``threshold_`` on ``decision_scores_``. + """ + + def __init__(self, window_size, step_size=1, method='average', + weights=None, contamination=0.1): + super(MultiAutoRegOD, self).__init__(contamination=contamination) + self.window_size = window_size + self.step_size = step_size + self.method = method + self.weights = weights + + def _validate_weights(self): + """Internal function for validating and adjust weights. 
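+
+ Weights are rescaled so that they sum to the number of per-dimension
+ models. For example (illustrative), ``weights=[1, 3]`` over two
+ models becomes ``[0.5, 1.5]``.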
+ + Returns + ------- + + """ + if self.weights is None: + self.weights = np.ones([1, self.n_models_]) + else: + self.weights = column_or_1d(self.weights).reshape( + 1, len(self.weights)) + assert (self.weights.shape[1] == self.n_models_) + + # adjust probability by a factor for integrity + adjust_factor = self.weights.shape[1] / np.sum(self.weights) + self.weights = self.weights * adjust_factor + + def _fit_univariate_model(self, X): + """Internal function for fitting one dimensional ts. + """ + X = check_array(X) + n_samples, n_sequences = X.shape[0], X.shape[1] + + models = [] + + # train one model for each dimension + for i in range(n_sequences): + models.append(AutoRegOD(window_size=self.window_size, + step_size=self.step_size, + contamination=self.contamination)) + models[i].fit(X[:, i].reshape(-1, 1)) + + return models + + def _score_combination(self, scores): + """Internal function for combining univarite scores. + """ + + # combine by different approaches + if self.method == 'average': + return average(scores, estimator_weights=self.weights) + if self.method == 'maximization': + return maximization(scores) + if self.method == 'median': + return median(scores) + + def fit(self, X: np.array) -> object: + """Fit detector. y is ignored in unsupervised methods. + + Parameters + ---------- + X : numpy array of shape (n_samples, n_features) + The input samples. + + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + self : object + Fitted estimator. + """ + X = check_array(X).astype(np.float) + + # fit each dimension individually + self.models_ = self._fit_univariate_model(X) + self.valid_len_ = self.models_[0].valid_len_ + self.n_models_ = len(self.models_) + + # assign the left and right inds, same for all models + self.left_inds_ = self.models_[0].left_inds_ + self.right_inds_ = self.models_[0].right_inds_ + + # validate and adjust weights + self._validate_weights() + + # combine the scores from all dimensions + self._decison_mat = np.zeros([self.valid_len_, self.n_models_]) + for i in range(self.n_models_): + self._decison_mat[:, i] = self.models_[i].decision_scores_ + + # scale scores by standardization before score combination + self._decison_mat_scalaled, self._score_scalar = standardizer( + self._decison_mat, keep_scalar=True) + + self.decision_scores_ = self._score_combination( + self._decison_mat_scalaled) + + self._process_decision_scores() + return self + + def decision_function(self, X: np.array): + """Predict raw anomaly scores of X using the fitted detector. + + The anomaly score of an input sample is computed based on the fitted + detector. For consistency, outliers are assigned with + higher anomaly scores. + + Parameters + ---------- + X : numpy array of shape (n_samples, n_features) + The input samples. Sparse matrices are accepted only + if they are supported by the base estimator. + + Returns + ------- + anomaly_scores : numpy array of shape (n_samples,) + The anomaly score of the input samples. 
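+
+ Editorial note: ``X`` must have the same number of columns
+ (dimensions) as the data used in ``fit``; one fitted univariate
+ model is applied per column and the per-dimension scores are
+ combined according to ``method``. The method returns
+ ``(decision_scores, X_left_inds, X_right_inds)``.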
+ """ + check_is_fitted(self, ['models_']) + X = check_array(X).astype(np.float) + assert (X.shape[1] == self.n_models_) + n_samples = len(X) + + # need to subtract 1 because need to have y for subtraction + valid_len = get_sub_sequences_length(n_samples, self.window_size, + self.step_size) - 1 + + # combine the scores from all dimensions + decison_mat = np.zeros([valid_len, self.n_models_]) + for i in range(self.n_models_): + decison_mat[:, i], X_left_inds, X_right_inds = \ + self.models_[i].decision_function(X[:, i].reshape(-1, 1)) + + # scale the decision mat + decison_mat_scaled = self._score_scalar.transform(decison_mat) + decision_scores = self._score_combination(decison_mat_scaled) + + return decision_scores, X_left_inds, X_right_inds + + +if __name__ == "__main__": + X_train = np.asarray( + [[3., 5], [5., 9], [7., 2], [42., 20], [8., 12], [10., 12], [12., 12], + [18., 16], [20., 7], [18., 10], [23., 12], [22., 15]]) + + X_test = np.asarray( + [[3., 5], [5., 9], [7., 2], [42., 20], [8., 12], [10., 12], [12., 12], + [18., 16], [20., 7], [18., 10], [23., 12], [22., 15]]) + + # X_test = np.asarray( + # [[12., 10], [8., 12], [80., 80], [92., 983], + # [18., 16], [20., 7], [18., 10], [3., 5], [5., 9], [23., 12], + # [22., 15]]) + + clf = MultiAutoRegOD(window_size=3, step_size=1, contamination=0.2) + + clf.fit(X_train) + decision_scores, left_inds_, right_inds = clf.decision_scores_, \ + clf.left_inds_, clf.right_inds_ + print(clf.left_inds_, clf.right_inds_) + pred_scores, X_left_inds, X_right_inds = clf.decision_function(X_test) + pred_labels, X_left_inds, X_right_inds = clf.predict(X_test) + pred_probs, X_left_inds, X_right_inds = clf.predict_proba(X_test) + + print(pred_scores) + print(pred_labels) + print(pred_probs) diff --git a/tods/detection_algorithm/core/PCA.py b/tods/detection_algorithm/core/PCA.py new file mode 100644 index 0000000..ad7e208 --- /dev/null +++ b/tods/detection_algorithm/core/PCA.py @@ -0,0 +1,264 @@ +# -*- coding: utf-8 -*- +"""Autoregressive model for multivariate time series outlier detection. +""" +import numpy as np +from sklearn.utils import check_array +from sklearn.utils.validation import check_is_fitted + +from detection_algorithm.core.CollectiveBase import CollectiveBaseDetector +from pyod.models.pca import PCA as PCA_PYOD + +from detection_algorithm.core.utility import get_sub_matrices + + +class PCA(CollectiveBaseDetector): + """PCA-based outlier detection with both univariate and multivariate + time series data. TS data will be first transformed to tabular format. + For univariate data, it will be in shape of [valid_length, window_size]. + for multivariate data with d sequences, it will be in the shape of + [valid_length, window_size]. + + Parameters + ---------- + window_size : int + The moving window size. + + step_size : int, optional (default=1) + The displacement for moving window. + + contamination : float in (0., 0.5), optional (default=0.1) + The amount of contamination of the data set, + i.e. the proportion of outliers in the data set. Used when fitting to + define the threshold on the decision function. + + n_components : int, float, None or string + Number of components to keep. It should be smaller than the window_size. 
+ if n_components is not set all components are kept:: + + n_components == min(n_samples, n_features) + + if n_components == 'mle' and svd_solver == 'full', Minka\'s MLE is used + to guess the dimension + if ``0 < n_components < 1`` and svd_solver == 'full', select the number + of components such that the amount of variance that needs to be + explained is greater than the percentage specified by n_components + n_components cannot be equal to n_features for svd_solver == 'arpack'. + + n_selected_components : int, optional (default=None) + Number of selected principal components + for calculating the outlier scores. It is not necessarily equal to + the total number of the principal components. If not set, use + all principal components. + + copy : bool (default True) + If False, data passed to fit are overwritten and running + fit(X).transform(X) will not yield the expected results, + use fit_transform(X) instead. + + whiten : bool, optional (default False) + When True (False by default) the `components_` vectors are multiplied + by the square root of n_samples and then divided by the singular values + to ensure uncorrelated outputs with unit component-wise variances. + + Whitening will remove some information from the transformed signal + (the relative variance scales of the components) but can sometime + improve the predictive accuracy of the downstream estimators by + making their data respect some hard-wired assumptions. + + svd_solver : string {'auto', 'full', 'arpack', 'randomized'} + auto : + the solver is selected by a default policy based on `X.shape` and + `n_components`: if the input data is larger than 500x500 and the + number of components to extract is lower than 80% of the smallest + dimension of the data, then the more efficient 'randomized' + method is enabled. Otherwise the exact full SVD is computed and + optionally truncated afterwards. + full : + run exact full SVD calling the standard LAPACK solver via + `scipy.linalg.svd` and select the components by postprocessing + arpack : + run SVD truncated to n_components calling ARPACK solver via + `scipy.sparse.linalg.svds`. It requires strictly + 0 < n_components < X.shape[1] + randomized : + run randomized SVD by the method of Halko et al. + + tol : float >= 0, optional (default .0) + Tolerance for singular values computed by svd_solver == 'arpack'. + + iterated_power : int >= 0, or 'auto', (default 'auto') + Number of iterations for the power method computed by + svd_solver == 'randomized'. + + random_state : int, RandomState instance or None, optional (default None) + If int, random_state is the seed used by the random number generator; + If RandomState instance, random_state is the random number generator; + If None, the random number generator is the RandomState instance used + by `np.random`. Used when ``svd_solver`` == 'arpack' or 'randomized'. + + weighted : bool, optional (default=True) + If True, the eigenvalues are used in score computation. + The eigenvectors with small eigenvalues comes with more importance + in outlier score calculation. + + standardization : bool, optional (default=True) + If True, perform standardization first to convert + data to zero mean and unit variance. + See http://scikit-learn.org/stable/auto_examples/preprocessing/plot_scaling_importance.html + + Attributes + ---------- + decision_scores_ : numpy array of shape (n_samples,) + The outlier scores of the training data. + The higher, the more abnormal. Outliers tend to have higher + scores. 
This value is available once the detector is + fitted. + + threshold_ : float + The threshold is based on ``contamination``. It is the + ``n_samples * contamination`` most abnormal samples in + ``decision_scores_``. The threshold is calculated for generating + binary outlier labels. + + labels_ : int, either 0 or 1 + The binary labels of the training data. 0 stands for inliers + and 1 for outliers/anomalies. It is generated by applying + ``threshold_`` on ``decision_scores_``. + """ + + def __init__(self, window_size, step_size=1, contamination=0.1, + n_components=None, n_selected_components=None, + copy=True, whiten=False, svd_solver='auto', + tol=0.0, iterated_power='auto', random_state=None, + weighted=True, standardization=True): + super(PCA, self).__init__(contamination=contamination) + self.window_size = window_size + self.step_size = step_size + + # parameters for PCA + self.n_components = n_components + self.n_selected_components = n_selected_components + self.copy = copy + self.whiten = whiten + self.svd_solver = svd_solver + self.tol = tol + self.iterated_power = iterated_power + self.random_state = random_state + self.weighted = weighted + self.standardization = standardization + + # initialize a kNN model + self.model_ = PCA_PYOD(n_components=self.n_components, + n_selected_components=self.n_selected_components, + contamination=self.contamination, + copy=self.copy, + whiten=self.whiten, + svd_solver=self.svd_solver, + tol=self.tol, + iterated_power=self.iterated_power, + random_state=self.random_state, + weighted=self.weighted, + standardization=self.standardization) + + def fit(self, X: np.array) -> object: + """Fit detector. y is ignored in unsupervised methods. + + Parameters + ---------- + X : numpy array of shape (n_samples, n_features) + The input samples. + + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + self : object + Fitted estimator. + """ + X = check_array(X).astype(np.float) + + # first convert it into submatrices, and flatten it + sub_matrices, self.left_inds_, self.right_inds_ = get_sub_matrices( + X, + self.window_size, + self.step_size, + return_numpy=True, + flatten=True, + flatten_order='F') + + # if self.n_components > sub_matrices.shape[1]: + # raise ValueError('n_components exceeds window_size times the number of sequences.') + + # fit the PCA model + self.model_.fit(sub_matrices) + self.decision_scores_ = self.model_.decision_scores_ + self._process_decision_scores() + return self + + def decision_function(self, X: np.array): + """Predict raw anomaly scores of X using the fitted detector. + + The anomaly score of an input sample is computed based on the fitted + detector. For consistency, outliers are assigned with + higher anomaly scores. + + Parameters + ---------- + X : numpy array of shape (n_samples, n_features) + The input samples. Sparse matrices are accepted only + if they are supported by the base estimator. + + Returns + ------- + anomaly_scores : numpy array of shape (n_samples,) + The anomaly score of the input samples. 
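+
+ Editorial note: as in ``fit``, each window is first flattened in
+ column-major order into a row of length ``window_size * n_sequences``
+ before being scored by the wrapped PyOD PCA model, and the method
+ returns ``(anomaly_scores, X_left_inds, X_right_inds)`` rather than
+ the scores alone.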
+ """ + check_is_fitted(self, ['model_']) + X = check_array(X).astype(np.float) + # first convert it into submatrices, and flatten it + sub_matrices, X_left_inds, X_right_inds = get_sub_matrices( + X, + self.window_size, + self.step_size, + return_numpy=True, + flatten=True, + flatten_order='F') + + # return the prediction result by PCA + return self.model_.decision_function( + sub_matrices), X_left_inds.ravel(), X_right_inds.ravel() + + +if __name__ == "__main__": + # X_train = np.asarray( + # [3., 4., 8., 16, 18, 13., 22., 36., 59., 128, 62, 67, 78, 100]).reshape(-1, 1) + + # X_test = np.asarray( + # [3., 4., 8.6, 13.4, 22.5, 17, 19.2, 36.1, 127, -23, 59.2]).reshape(-1, + # 1) + + X_train = np.asarray( + [[3., 5], [5., 9], [7., 2], [42., 20], [8., 12], [10., 12], + [12., 12], + [18., 16], [20., 7], [18., 10], [23., 12], [22., 15]]) + + w = get_sub_matrices(X_train, window_size=3, step=2, flatten=False) + X_test = np.asarray( + [[12., 10], [8., 12], [80., 80], [92., 983], + [18., 16], [20., 7], [18., 10], [3., 5], [5., 9], [23., 12], + [22., 15]]) + + clf = PCA(window_size=3, step_size=2, contamination=0.2) + + clf.fit(X_train) + decision_scores, left_inds_, right_inds = clf.decision_scores_, \ + clf.left_inds_, clf.right_inds_ + print(clf.left_inds_, clf.right_inds_) + pred_scores, X_left_inds, X_right_inds = clf.decision_function(X_test) + pred_labels, X_left_inds, X_right_inds = clf.predict(X_test) + pred_probs, X_left_inds, X_right_inds = clf.predict_proba(X_test) + + print(pred_scores) + print(pred_labels) + print(pred_probs) diff --git a/tods/detection_algorithm/core/UODCommonTest.py b/tods/detection_algorithm/core/UODCommonTest.py new file mode 100755 index 0000000..b17648b --- /dev/null +++ b/tods/detection_algorithm/core/UODCommonTest.py @@ -0,0 +1,148 @@ +# -*- coding: utf-8 -*- + +from __future__ import division +from __future__ import print_function + +import os +import sys + +import numpy as np +import unittest +# noinspection PyProtectedMember +from sklearn.utils.testing import assert_allclose +from sklearn.utils.testing import assert_array_less +from sklearn.utils.testing import assert_equal +from sklearn.utils.testing import assert_greater +from sklearn.utils.testing import assert_greater_equal +from sklearn.utils.testing import assert_less_equal +from sklearn.utils.testing import assert_raises + +from sklearn.utils.estimator_checks import check_estimator + +from sklearn.metrics import roc_auc_score +from scipy.stats import rankdata + +# temporary solution for relative imports in case pyod is not installed +# if pyod is installed, no need to use the following line +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) + +from pyod.utils.data import generate_data + + +class UODCommonTest: + def __init__(self, + model, + X_train, + y_train, + X_test, + y_test, + roc_floor, + ): + self.clf = model + self.X_train = X_train + self.y_train = y_train + self.X_test = X_test + self.y_test = y_test + self.roc_floor = roc_floor + + self.clf.fit(self.X_train) + + pass + + def test_detector(self): + + self.test_parameters() + self.test_train_scores() + self.test_prediction_scores() + self.test_prediction_proba() + self.test_prediction_proba_linear() + self.test_prediction_proba_unify() + self.test_prediction_proba_parameter() + # self.test_fit_predict() + # self.test_fit_predict_score() + self.test_prediction_labels() + # self.test_predict_rank() + # self.test_predict_rank_normalized() + self.tearDown() + + def test_parameters(self): + assert 
(hasattr(self.clf, 'decision_scores_') and + self.clf.decision_scores_ is not None) + assert (hasattr(self.clf, 'labels_') and + self.clf.labels_ is not None) + assert (hasattr(self.clf, 'threshold_') and + self.clf.threshold_ is not None) + assert (hasattr(self.clf, '_mu') and + self.clf._mu is not None) + assert (hasattr(self.clf, '_sigma') and + self.clf._sigma is not None) + + def test_train_scores(self): + assert_equal(len(self.clf.decision_scores_), self.y_train.shape[0]) + + def test_prediction_scores(self): + pred_scores = self.clf.decision_function(self.X_test) + + # check score shapes + assert_equal(pred_scores.shape[0], self.y_test.shape[0]) + + # check performance + assert_greater(roc_auc_score(self.y_test, pred_scores), self.roc_floor) + + def test_prediction_labels(self): + pred_labels = self.clf.predict(self.X_test) + assert_equal(pred_labels.shape, self.y_test.shape) + + def test_prediction_proba(self): + pred_proba = self.clf.predict_proba(self.X_test) + assert_greater_equal(pred_proba.min(), 0) + assert_less_equal(pred_proba.max(), 1) + + def test_prediction_proba_linear(self): + pred_proba = self.clf.predict_proba(self.X_test, method='linear') + assert_greater_equal(pred_proba.min(), 0) + assert_less_equal(pred_proba.max(), 1) + + def test_prediction_proba_unify(self): + pred_proba = self.clf.predict_proba(self.X_test, method='unify') + assert_greater_equal(pred_proba.min(), 0) + assert_less_equal(pred_proba.max(), 1) + + def test_prediction_proba_parameter(self): + with assert_raises(ValueError): + self.clf.predict_proba(self.X_test, method='something') + + def test_fit_predict(self): + pred_labels = self.clf.fit_predict(X=self.X_train) + assert_equal(pred_labels.shape, self.y_train.shape) + + def test_fit_predict_score(self): + self.clf.fit_predict_score(self.X_test, self.y_test) + self.clf.fit_predict_score(self.X_test, self.y_test, + scoring='roc_auc_score') + self.clf.fit_predict_score(self.X_test, self.y_test, + scoring='prc_n_score') + with assert_raises(NotImplementedError): + self.clf.fit_predict_score(self.X_test, self.y_test, + scoring='something') + + def test_predict_rank(self): + pred_socres = self.clf.decision_function(self.X_test) + pred_ranks = self.clf._predict_rank(self.X_test) + + # assert the order is reserved + assert_allclose(rankdata(pred_ranks), rankdata(pred_socres), atol=2) + assert_array_less(pred_ranks, self.X_train.shape[0] + 1) + assert_array_less(-0.1, pred_ranks) + + def test_predict_rank_normalized(self): + pred_socres = self.clf.decision_function(self.X_test) + pred_ranks = self.clf._predict_rank(self.X_test, normalized=True) + + # assert the order is reserved + assert_allclose(rankdata(pred_ranks), rankdata(pred_socres), atol=2) + assert_array_less(pred_ranks, 1.01) + assert_array_less(-0.1, pred_ranks) + + def tearDown(self): + pass diff --git a/tods/detection_algorithm/core/algorithm_implementation.py b/tods/detection_algorithm/core/algorithm_implementation.py new file mode 100644 index 0000000..e69de29 diff --git a/tods/detection_algorithm/core/test_CollectiveBase.py b/tods/detection_algorithm/core/test_CollectiveBase.py new file mode 100644 index 0000000..3c8323e --- /dev/null +++ b/tods/detection_algorithm/core/test_CollectiveBase.py @@ -0,0 +1,211 @@ +# -*- coding: utf-8 -*- +from __future__ import division +from __future__ import print_function + +import os +import sys + +import unittest +from sklearn.utils.testing import assert_equal +from sklearn.utils.testing import assert_raises + +import numpy as np + +# temporary solution 
for relative imports in case pyod is not installed +# if pyod is installed, no need to use the following line +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) + +from detection_algorithm.core.CollectiveBase import CollectiveBaseDetector +from pyod.utils.data import generate_data + + +# Check sklearn\tests\test_base +# A few test classes +# noinspection PyMissingConstructor,PyPep8Naming +class MyEstimator(CollectiveBaseDetector): + + def __init__(self, l1=0, empty=None): + self.l1 = l1 + self.empty = empty + + def fit(self, X, y=None): + pass + + def decision_function(self, X): + pass + + +# noinspection PyMissingConstructor +class K(CollectiveBaseDetector): + def __init__(self, c=None, d=None): + self.c = c + self.d = d + + def fit(self, X, y=None): + pass + + def decision_function(self, X): + pass + + +# noinspection PyMissingConstructor +class T(CollectiveBaseDetector): + def __init__(self, a=None, b=None): + self.a = a + self.b = b + + def fit(self, X, y=None): + pass + + def decision_function(self, X): + pass + + +# noinspection PyMissingConstructor +class ModifyInitParams(CollectiveBaseDetector): + """Deprecated behavior. + Equal parameters but with a type cast. + Doesn't fulfill a is a + """ + + def __init__(self, a=np.array([0])): + self.a = a.copy() + + def fit(self, X, y=None): + pass + + def decision_function(self, X): + pass + + +# noinspection PyMissingConstructor +class VargEstimator(CollectiveBaseDetector): + """scikit-learn estimators shouldn't have vargs.""" + + def __init__(self, *vargs): + pass + + def fit(self, X, y=None): + pass + + def decision_function(self, X): + pass + + +class Dummy1(CollectiveBaseDetector): + def __init__(self, contamination=0.1): + super(Dummy1, self).__init__(contamination=contamination) + + def decision_function(self, X): + pass + + def fit(self, X, y=None): + pass + + +class Dummy2(CollectiveBaseDetector): + def __init__(self, contamination=0.1): + super(Dummy2, self).__init__(contamination=contamination) + + def decision_function(self, X): + pass + + def fit(self, X, y=None): + return X + + +class Dummy3(CollectiveBaseDetector): + def __init__(self, contamination=0.1): + super(Dummy3, self).__init__(contamination=contamination) + + def decision_function(self, X): + pass + + def fit(self, X, y=None): + self.labels_ = X + + +class TestBASE(unittest.TestCase): + def setUp(self): + self.n_train = 100 + self.n_test = 50 + self.contamination = 0.1 + self.roc_floor = 0.6 + self.X_train, self.y_train, self.X_test, self.y_test = generate_data( + n_train=self.n_train, n_test=self.n_test, + contamination=self.contamination) + + def test_init(self): + """ + Test base class initialization + + :return: + """ + self.dummy_clf = Dummy1() + assert_equal(self.dummy_clf.contamination, 0.1) + + self.dummy_clf = Dummy1(contamination=0.2) + assert_equal(self.dummy_clf.contamination, 0.2) + + with assert_raises(ValueError): + Dummy1(contamination=0.51) + + with assert_raises(ValueError): + Dummy1(contamination=0) + + with assert_raises(ValueError): + Dummy1(contamination=-0.5) + + def test_fit(self): + self.dummy_clf = Dummy2() + assert_equal(self.dummy_clf.fit(0), 0) + + def test_fit_predict(self): + # TODO: add more testcases + + self.dummy_clf = Dummy3() + + assert_equal(self.dummy_clf.fit_predict(0), 0) + + def test_predict_proba(self): + # TODO: create uniform testcases + pass + + def test_rank(self): + # TODO: create uniform testcases + pass + + def test_repr(self): + # Smoke test the repr of the base estimator. 
+ my_estimator = MyEstimator() + repr(my_estimator) + test = T(K(), K()) + assert_equal( + repr(test), + "T(a=K(c=None, d=None), b=K(c=None, d=None))" + ) + + some_est = T(a=["long_params"] * 1000) + assert_equal(len(repr(some_est)), 415) + + def test_str(self): + # Smoke test the str of the base estimator + my_estimator = MyEstimator() + str(my_estimator) + + def test_get_params(self): + test = T(K(), K()) + + assert ('a__d' in test.get_params(deep=True)) + assert ('a__d' not in test.get_params(deep=False)) + + test.set_params(a__d=2) + assert (test.a.d == 2) + assert_raises(ValueError, test.set_params, a__a=2) + + def tearDown(self): + pass + + +if __name__ == '__main__': + unittest.main() diff --git a/tods/detection_algorithm/core/utility.py b/tods/detection_algorithm/core/utility.py new file mode 100644 index 0000000..a486b99 --- /dev/null +++ b/tods/detection_algorithm/core/utility.py @@ -0,0 +1,179 @@ +# -*- coding: utf-8 -*- +"""Utility functions for supporting time-series based outlier detection. +""" + +import numpy as np +from sklearn.utils import check_array + + +# def get_sub_sequences(X, window_size, step=1): +# """Chop a univariate time series into sub sequences. + +# Parameters +# ---------- +# X : numpy array of shape (n_samples,) +# The input samples. + +# window_size : int +# The moving window size. + +# step_size : int, optional (default=1) +# The displacement for moving window. + +# Returns +# ------- +# X_sub : numpy array of shape (valid_len, window_size) +# The numpy matrix with each row stands for a subsequence. +# """ +# X = check_array(X).astype(np.float) +# n_samples = len(X) + +# # get the valid length +# valid_len = get_sub_sequences_length(n_samples, window_size, step) + +# X_sub = np.zeros([valid_len, window_size]) +# # y_sub = np.zeros([valid_len, 1]) + +# # exclude the edge +# steps = list(range(0, n_samples, step)) +# steps = steps[:valid_len] + +# for idx, i in enumerate(steps): +# X_sub[idx,] = X[i: i + window_size].ravel() + +# return X_sub + +def get_sub_matrices(X, window_size, step=1, return_numpy=True, flatten=True, + flatten_order='F'): + """Chop a multivariate time series into sub sequences (matrices). + + Parameters + ---------- + X : numpy array of shape (n_samples,) + The input samples. + + window_size : int + The moving window size. + + step_size : int, optional (default=1) + The displacement for moving window. + + return_numpy : bool, optional (default=True) + If True, return the data format in 3d numpy array. + + flatten : bool, optional (default=True) + If True, flatten the returned array in 2d. + + flatten_order : str, optional (default='F') + Decide the order of the flatten for multivarite sequences. + ‘C’ means to flatten in row-major (C-style) order. + ‘F’ means to flatten in column-major (Fortran- style) order. + ‘A’ means to flatten in column-major order if a is Fortran contiguous in memory, + row-major order otherwise. ‘K’ means to flatten a in the order the elements occur in memory. + The default is ‘F’. + + Returns + ------- + X_sub : numpy array of shape (valid_len, window_size*n_sequences) + The numpy matrix with each row stands for a flattend submatrix. 
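+
+ Examples
+ --------
+ A minimal sketch of the intended behaviour (the import path is
+ assumed; adjust it to the local package layout)::
+
+ >>> import numpy as np
+ >>> from tods.detection_algorithm.core.utility import get_sub_matrices
+ >>> X = np.array([[1., 10.], [2., 20.], [3., 30.], [4., 40.]])
+ >>> X_sub, left_inds, right_inds = get_sub_matrices(X, window_size=2, step=1)
+ >>> X_sub.shape
+ (3, 4)
+ >>> left_inds, right_inds
+ (array([0, 1, 2]), array([2, 3, 4]))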
+ """ + X = check_array(X).astype(np.float) + n_samples, n_sequences = X.shape[0], X.shape[1] + + # get the valid length + valid_len = get_sub_sequences_length(n_samples, window_size, step) + + X_sub = [] + X_left_inds = [] + X_right_inds = [] + + # exclude the edge + steps = list(range(0, n_samples, step)) + steps = steps[:valid_len] + + # print(n_samples, n_sequences) + for idx, i in enumerate(steps): + X_sub.append(X[i: i + window_size, :]) + X_left_inds.append(i) + X_right_inds.append(i + window_size) + + X_sub = np.asarray(X_sub) + + if return_numpy: + if flatten: + temp_array = np.zeros([valid_len, window_size * n_sequences]) + if flatten_order == 'C': + for i in range(valid_len): + temp_array[i, :] = X_sub[i, :, :].flatten(order='C') + + else: + for i in range(valid_len): + temp_array[i, :] = X_sub[i, :, :].flatten(order='F') + return temp_array, np.asarray(X_left_inds), np.asarray( + X_right_inds) + + else: + return np.asarray(X_sub), np.asarray(X_left_inds), np.asarray( + X_right_inds) + else: + return X_sub, np.asarray(X_left_inds), np.asarray(X_right_inds) + + +def get_sub_sequences_length(n_samples, window_size, step): + """Pseudo chop a univariate time series into sub sequences. Return valid + length only. + + Parameters + ---------- + X : numpy array of shape (n_samples,) + The input samples. + + window_size : int + The moving window size. + + step_size : int, optional (default=1) + The displacement for moving window. + + Returns + ------- + valid_len : int + The number of subsequences. + + """ + # if X.shape[0] == 1: + # n_samples = X.shape[1] + # elif X.shape[1] == 1: + # n_samples = X.shape[0] + # else: + # raise ValueError("X is not a univarite series. The shape is {shape}.".format(shape=X.shape)) + + # valid_len = n_samples - window_size + 1 + # valida_len = int_down(n_samples-window_size)/step + 1 + valid_len = int(np.floor((n_samples - window_size) / step)) + 1 + return valid_len + + +if __name__ == "__main__": + X_train = np.asarray( + [3., 4., 8., 16, 18, 13., 22., 36., 59., 128, 62, 67, 78, + 100]).reshape(-1, 1) + + X_train = np.asarray( + [[3., 5], [5., 9], [7., 2], [42., 20], [8., 12], [10., 12], [12., 12], + [18., 16], [20., 7], [18., 10], [23., 12], [22., 15]]) + + # n_samples = X.shape[0] + + window_size = 3 + + # valid_len = n_samples - window_size + 1 + + # X_sub = np.zeros([valid_len, window_size]) + + # for i in range(valid_len): + # X_sub[i, ] = X[i: i+window_size] + + # X_sub_2 = get_sub_sequences(X, window_size, step=2) + X_sub_3, X_left_inds, X_right_inds = get_sub_matrices(X_train, window_size, + step=2, + flatten_order='C') diff --git a/tods/detection_algorithm/core/utils/channel.py b/tods/detection_algorithm/core/utils/channel.py new file mode 100644 index 0000000..197bee4 --- /dev/null +++ b/tods/detection_algorithm/core/utils/channel.py @@ -0,0 +1,114 @@ +import numpy as np +import os +import logging + +logger = logging.getLogger('telemanom') + + +class Channel: + def __init__(self,n_predictions,l_s): + # , config, chan_id): + """ + Load and reshape channel values (predicted and actual). 
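# Editorial sketch (not part of the commit): what get_sub_matrices from
# core/utility.py above returns for a toy multivariate series. The values are
# illustrative, and the call assumes the function is importable as written
# (its use of np.float requires an older NumPy).
import numpy as np

X = np.arange(12, dtype=float).reshape(6, 2)              # 6 timesteps, 2 features
X_sub, left, right = get_sub_matrices(X, window_size=3, step=1, flatten_order='C')

# valid_len = floor((6 - 3) / 1) + 1 = 4 windows, each flattened to
# window_size * n_features = 6 values; left/right hold the window boundaries.
assert X_sub.shape == (4, 6)
assert list(left) == [0, 1, 2, 3] and list(right) == [3, 4, 5, 6]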
+ + Args: + config (obj): Config object containing parameters for processing + chan_id (str): channel id + + Attributes: + id (str): channel id + config (obj): see Args + X_train (arr): training inputs with dimensions + [timesteps, l_s, input dimensions) + X_test (arr): test inputs with dimensions + [timesteps, l_s, input dimensions) + y_train (arr): actual channel training values with dimensions + [timesteps, n_predictions, 1) + y_test (arr): actual channel test values with dimensions + [timesteps, n_predictions, 1) + train (arr): train data loaded from .npy file + test(arr): test data loaded from .npy file + """ + + # self.id = chan_id + # self.config = config + self.X_train = None + self.y_train = None + self.X_test = None + self.y_test = None + self.y_hat = None + self.train = None + self.test = None + + self._n_predictions = n_predictions + self._l_s = l_s + + def shape_train_data(self, arr): + # , train=True): + """Shape raw input streams for ingestion into LSTM. config.l_s specifies + the sequence length of prior timesteps fed into the model at + each timestep t. + + Args: + arr (np array): array of input streams with + dimensions [timesteps, 1, input dimensions] + train (bool): If shaping training data, this indicates + data can be shuffled + """ + # print("in shape data") + # print("arr shape",arr.shape) + # print("ls",self.config.l_s) + # print("n_pred",self.config.n_predictions) + data = [] + + for i in range(len(arr) - self._l_s - self._n_predictions): + data.append(arr[i:i + self._l_s + self._n_predictions]) + data = np.array(data) + # print("data shape",data.shape) + # assert len(data.shape) == 3 + + # if train: + # # np.random.shuffle(data) + # self.X_train = data[:, :-self.config.n_predictions, :] + # self.y_train = data[:, -self.config.n_predictions:, :] # telemetry value is at position 0 + # self.y_train = np.reshape(self.y_train,(self.y_train.shape[0],self.y_train.shape[1]*self.y_train.shape[2])) + # print("X train shape",self.X_train .shape) + # print("Y train shape",self.y_train .shape) + # else: + + self.X_train = data[:, :-self._n_predictions, :] + self.y_train = data[:, -self._n_predictions:, :] # telemetry value is at position 0 + self.y_train = np.reshape(self.y_train,(self.y_train.shape[0],self.y_train.shape[1]*self.y_train.shape[2])) + + + + + def shape_test_data(self, arr): + data = [] + + for i in range(len(arr) - self._l_s - self._n_predictions): + data.append(arr[i:i + self._l_s + self._n_predictions]) + data = np.array(data) + # print("data shape",data.shape) + self.X_test = data[:, :-self._n_predictions, :] + self.y_test = data[:, -self._n_predictions:, :] # telemetry value is at position 0 + self.y_test = np.reshape(self.y_test,(self.y_test.shape[0],self.y_test.shape[1]*self.y_test.shape[2])) + + + # def load_data(self): + # """ + # Load train and test data from local. 
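# Editorial sketch (not part of the commit): how Channel.shape_train_data above
# reshapes a raw stream. With l_s=5 prior steps and n_predictions=2, a stream of
# 20 timesteps and 3 features yields 20 - 5 - 2 = 13 training windows.
import numpy as np

chan = Channel(n_predictions=2, l_s=5)
chan.shape_train_data(np.random.rand(20, 3))

assert chan.X_train.shape == (13, 5, 3)     # inputs: l_s timesteps per window
assert chan.y_train.shape == (13, 2 * 3)    # targets flattened to n_predictions * n_features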
+ # """ + # # try: + # # self.train = np.load(os.path.join("data", "train", "{}.npy".format(self.id))) + # # self.test = np.load(os.path.join("data", "test", "{}.npy".format(self.id))) + + # # except FileNotFoundError as e: + # # # logger.critical(e) + # # # logger.critical("Source data not found, may need to add data to repo: ") + # # print("Source data not found, may need to add data to repo: ") + + # print("before shape function") + # print(self.train.shape) + # self.shape_data(self.train) + # self.shape_data(self.test, train=False) \ No newline at end of file diff --git a/tods/detection_algorithm/core/utils/errors.py b/tods/detection_algorithm/core/utils/errors.py new file mode 100644 index 0000000..a09d638 --- /dev/null +++ b/tods/detection_algorithm/core/utils/errors.py @@ -0,0 +1,532 @@ +import numpy as np +import pandas as pd +import more_itertools as mit +import os +import logging + +logger = logging.getLogger('telemanom') + + +class Errors: + def __init__(self, channel, window_size,batch_size, smoothing_perc,n_predictions,l_s,error_buffer,p): + """ + Batch processing of errors between actual and predicted values + for a channel. + + Args: + channel (obj): Channel class object containing train/test data + for X,y for a single channel + config (obj): Config object containing parameters for processing + run_id (str): Datetime referencing set of predictions in use + + Attributes: + config (obj): see Args + window_size (int): number of trailing batches to use in error + calculation + n_windows (int): number of windows in test values for channel + i_anom (arr): indices of anomalies in channel test values + E_seq (arr of tuples): array of (start, end) indices for each + continuous anomaly sequence in test values + anom_scores (arr): score indicating relative severity of each + anomaly sequence in E_seq + e (arr): errors in prediction (predicted - actual) + e_s (arr): exponentially-smoothed errors in prediction + normalized (arr): prediction errors as a percentage of the range + of the channel values + """ + + # self.config = config + + + self._window_size =window_size + self._batch_size = batch_size + self._smoothing_perc = smoothing_perc + self._n_predictions = n_predictions + self._l_s = l_s + self._error_buffer = error_buffer + self._p = p + + + self.window_size = self._window_size + self.n_windows = int((channel.y_test.shape[0] - + (self._batch_size * self.window_size)) + / self._batch_size) + self.i_anom = np.array([]) + self.E_seq = [] + self.anom_scores = [] + channel.y_test = np.reshape(channel.y_test,(channel.X_test.shape[0],self._n_predictions,channel.X_test.shape[2])) + # print("*****************************") + # print("y_hat shape",channel.y_hat.shape) + # print("y_test shape",channel.y_test.shape) + + channel.y_hat = np.reshape(channel.y_hat, (channel.y_hat.shape[0]*channel.y_hat.shape[1]*channel.y_hat.shape[2])) + channel.y_test = np.reshape(channel.y_test, (channel.y_test.shape[0]*channel.y_test.shape[1]*channel.y_test.shape[2])) + + # print("after y_hat shape",channel.y_hat.shape) + # print(" after y_test shape",channel.y_test.shape) + + + # raw prediction error + self.e = [abs(y_h-y_t) for y_h, y_t in + zip(channel.y_hat, channel.y_test)] + + self.e = np.reshape(self.e,(channel.X_test.shape[0],self._n_predictions,channel.X_test.shape[2])) + # print("raw shape",self.e.shape) + + n_pred = self._n_predictions + n_feature = channel.X_test.shape[2] + + # Aggregation for point wise + # aggregated_error = np.zeros(n_feature*(len(self.e)+n_pred-1)) + # aggregated_error = 
np.reshape(aggregated_error,((len(self.e)+n_pred-1),n_feature)) + # # print(aggregated_error) + # for i in range(0,len(self.e)): + # for j in range(len(self.e[i])): + # aggregated_error[i+j]+= self.e[i][j] + + # for i in range(1, len(aggregated_error)+1): + # if i < n_pred: + # aggregated_error[i-1] /=i + # elif len(aggregated_error) - i+1 < n_pred: + # aggregated_error[i-1]/= len(aggregated_error) - i+1 + # else: + # aggregated_error[i-1] /=n_pred + + # Aggregation sequence wise + aggregated_error = [] + for i in range(0,len(self.e)): + aggregated_error.append(np.sum(self.e[i],axis=0)) + + aggregated_error = np.asarray(aggregated_error) + # print(aggregated_error.shape) + + smoothing_window = int(self._batch_size * self._window_size + * self._smoothing_perc) + if not len(channel.y_hat) == len(channel.y_test): + raise ValueError('len(y_hat) != len(y_test): {}, {}' + .format(len(channel.y_hat), len(channel.y_test))) + + # smoothed prediction error + self.e_s = pd.DataFrame(aggregated_error).ewm(span=smoothing_window)\ + .mean().values.flatten() + + # print("ES",self.e_s) + # print("ES",self.e_s.shape) + # for values at beginning < sequence length, just use avg + # if not channel.id == 'C-2': # anomaly occurs early in window + + # print("LHS",self.e_s[:self.config.l_s]) + # print("RHS",[np.mean(self.e_s[:self.config.l_s * 2])] * self.config.l_s) + # b = [np.mean(self.e_s[:self.config.l_s * 2])] * self.config.l_s + # print("RHS shape",len(b)) + # self.e_s[:self._l_s] = [np.mean(self.e_s[:self._l_s * 2])] * self._l_s + + # np.save(os.path.join('data', run_id, 'smoothed_errors', '{}.npy' + # .format(channel.id)), + # np.array(self.e_s)) + + self.normalized = np.mean(self.e / np.ptp(channel.y_test)) + # logger.info("normalized prediction error: {0:.2f}" + # .format(self.normalized)) + + def adjust_window_size(self, channel): + """ + Decrease the historical error window size (h) if number of test + values is limited. + + Args: + channel (obj): Channel class object containing train/test data + for X,y for a single channel + """ + + while self.n_windows < 0: + self.window_size -= 1 + self.n_windows = int((channel.y_test.shape[0] + - (self._batch_size * self.window_size)) + / self._batch_size) + if self.window_size == 1 and self.n_windows < 0: + raise ValueError('Batch_size ({}) larger than y_test (len={}). ' + 'Adjust in config.yaml.' + .format(self._batch_size, + channel.y_test.shape[0])) + + def merge_scores(self): + """ + If anomalous sequences from subsequent batches are adjacent they + will automatically be combined. This combines the scores for these + initial adjacent sequences (scores are calculated as each batch is + processed) where applicable. + """ + + merged_scores = [] + score_end_indices = [] + + for i, score in enumerate(self.anom_scores): + if not score['start_idx']-1 in score_end_indices: + merged_scores.append(score['score']) + score_end_indices.append(score['end_idx']) + + def process_batches(self, channel): + """ + Top-level function for the Error class that loops through batches + of values for a channel. 
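# Editorial sketch (illustrative numbers): Errors.__init__ above turns raw
# absolute prediction errors into per-window errors and then smooths them with
# an exponentially weighted moving average, as in
# pd.DataFrame(aggregated_error).ewm(span=smoothing_window).mean().
import numpy as np
import pandas as pd

y_hat = np.array([0.9, 1.1, 2.0, 0.8])
y_test = np.array([1.0, 1.0, 1.0, 1.0])
e = np.abs(y_hat - y_test)                  # raw errors: [0.1, 0.1, 1.0, 0.2]

smoothing_window = 3                        # batch_size * window_size * smoothing_perc
e_s = pd.DataFrame(e).ewm(span=smoothing_window).mean().values.flatten()
# e_s rises more gradually at the spike than e does, which is the point of smoothing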
+ + Args: + channel (obj): Channel class object containing train/test data + for X,y for a single channel + """ + + self.adjust_window_size(channel) + + for i in range(0, self.n_windows+1): + prior_idx = i * self._batch_size + idx = (self._window_size * self._batch_size) \ + + (i * self._batch_size) + if i == self.n_windows: + idx = channel.y_test.shape[0] + + window = ErrorWindow(channel, prior_idx, idx, self, i,self._l_s,self._error_buffer,self._batch_size,self._p) + + window.find_epsilon() + window.find_epsilon(inverse=True) + + window.compare_to_epsilon(self) + window.compare_to_epsilon(self, inverse=True) + + if len(window.i_anom) == 0 and len(window.i_anom_inv) == 0: + continue + + window.prune_anoms() + window.prune_anoms(inverse=True) + + if len(window.i_anom) == 0 and len(window.i_anom_inv) == 0: + continue + + window.i_anom = np.sort(np.unique( + np.append(window.i_anom, window.i_anom_inv))).astype('int') + window.score_anomalies(prior_idx) + # print("window anom scores", window.anom_scores) + + # update indices to reflect true indices in full set of values + self.i_anom = np.append(self.i_anom, window.i_anom + prior_idx) + self.anom_scores = self.anom_scores + window.anom_scores + + if len(self.i_anom) > 0: + # group anomalous indices into continuous sequences + groups = [list(group) for group in + mit.consecutive_groups(self.i_anom)] + self.E_seq = [(int(g[0]), int(g[-1])) for g in groups + if not g[0] == g[-1]] + + # additional shift is applied to indices so that they represent the + # position in the original data array, obtained from the .npy files, + # and not the position on y_test (See PR #27). + self.E_seq = [(e_seq[0] + self._l_s, + e_seq[1] + self._l_s) for e_seq in self.E_seq] + + self.merge_scores() + + +class ErrorWindow: + def __init__(self, channel,start_idx, end_idx, errors, window_num,l_s,error_buffer,batch_size,p): + """ + Data and calculations for a specific window of prediction errors. + Includes finding thresholds, pruning, and scoring anomalous sequences + for errors and inverted errors (flipped around mean) - significant drops + in values can also be anomalous. 
+ + Args: + channel (obj): Channel class object containing train/test data + for X,y for a single channel + config (obj): Config object containing parameters for processing + start_idx (int): Starting index for window within full set of + channel test values + end_idx (int): Ending index for window within full set of channel + test values + errors (arr): Errors class object + window_num (int): Current window number within channel test values + + Attributes: + i_anom (arr): indices of anomalies in window + i_anom_inv (arr): indices of anomalies in window of inverted + telemetry values + E_seq (arr of tuples): array of (start, end) indices for each + continuous anomaly sequence in window + E_seq_inv (arr of tuples): array of (start, end) indices for each + continuous anomaly sequence in window of inverted telemetry + values + non_anom_max (float): highest smoothed error value below epsilon + non_anom_max_inv (float): highest smoothed error value below + epsilon_inv + config (obj): see Args + anom_scores (arr): score indicating relative severity of each + anomaly sequence in E_seq within a window + window_num (int): see Args + sd_lim (int): default number of standard deviations to use for + threshold if no winner or too many anomalous ranges when scoring + candidate thresholds + sd_threshold (float): number of standard deviations for calculation + of best anomaly threshold + sd_threshold_inv (float): same as above for inverted channel values + e_s (arr): exponentially-smoothed prediction errors in window + e_s_inv (arr): inverted e_s + sd_e_s (float): standard deviation of e_s + mean_e_s (float): mean of e_s + epsilon (float): threshold for e_s above which an error is + considered anomalous + epsilon_inv (float): threshold for inverted e_s above which an error + is considered anomalous + y_test (arr): Actual telemetry values for window + sd_values (float): st dev of y_test + perc_high (float): the 95th percentile of y_test values + perc_low (float): the 5th percentile of y_test values + inter_range (float): the range between perc_high - perc_low + num_to_ignore (int): number of values to ignore initially when + looking for anomalies + """ + + self._l_s = l_s + self._error_buffer = error_buffer + self._batch_size = batch_size + self._p = p + + + self.i_anom = np.array([]) + self.E_seq = np.array([]) + self.non_anom_max = -1000000 + self.i_anom_inv = np.array([]) + self.E_seq_inv = np.array([]) + self.non_anom_max_inv = -1000000 + + # self.config = config + self.anom_scores = [] + + self.window_num = window_num + + self.sd_lim = 12.0 + self.sd_threshold = self.sd_lim + self.sd_threshold_inv = self.sd_lim + + self.e_s = errors.e_s[start_idx:end_idx] + + self.mean_e_s = np.mean(self.e_s) + self.sd_e_s = np.std(self.e_s) + self.e_s_inv = np.array([self.mean_e_s + (self.mean_e_s - e) + for e in self.e_s]) + + self.epsilon = self.mean_e_s + self.sd_lim * self.sd_e_s + self.epsilon_inv = self.mean_e_s + self.sd_lim * self.sd_e_s + + self.y_test = channel.y_test[start_idx:end_idx] + self.sd_values = np.std(self.y_test) + + self.perc_high, self.perc_low = np.percentile(self.y_test, [95, 5]) + self.inter_range = self.perc_high - self.perc_low + + # ignore initial error values until enough history for processing + self.num_to_ignore = self._l_s * 2 + # if y_test is small, ignore fewer + if len(channel.y_test) < 2500: + self.num_to_ignore = self._l_s + if len(channel.y_test) < 1800: + self.num_to_ignore = 0 + + def find_epsilon(self, inverse=False): + """ + Find the anomaly threshold that maximizes 
function representing + tradeoff between: + a) number of anomalies and anomalous ranges + b) the reduction in mean and st dev if anomalous points are removed + from errors + (see https://arxiv.org/pdf/1802.04431.pdf) + + Args: + inverse (bool): If true, epsilon is calculated for inverted errors + """ + e_s = self.e_s if not inverse else self.e_s_inv + + max_score = -10000000 + + for z in np.arange(2.5, self.sd_lim, 0.5): + epsilon = self.mean_e_s + (self.sd_e_s * z) + + pruned_e_s = e_s[e_s < epsilon] + + i_anom = np.argwhere(e_s >= epsilon).reshape(-1,) + buffer = np.arange(1, self._error_buffer) + i_anom = np.sort(np.concatenate((i_anom, + np.array([i+buffer for i in i_anom]) + .flatten(), + np.array([i-buffer for i in i_anom]) + .flatten()))) + i_anom = i_anom[(i_anom < len(e_s)) & (i_anom >= 0)] + i_anom = np.sort(np.unique(i_anom)) + + if len(i_anom) > 0: + # group anomalous indices into continuous sequences + groups = [list(group) for group + in mit.consecutive_groups(i_anom)] + E_seq = [(g[0], g[-1]) for g in groups if not g[0] == g[-1]] + + mean_perc_decrease = (self.mean_e_s - np.mean(pruned_e_s)) \ + / self.mean_e_s + sd_perc_decrease = (self.sd_e_s - np.std(pruned_e_s)) \ + / self.sd_e_s + score = (mean_perc_decrease + sd_perc_decrease) \ + / (len(E_seq) ** 2 + len(i_anom)) + + # sanity checks / guardrails + if score >= max_score and len(E_seq) <= 5 and \ + len(i_anom) < (len(e_s) * 0.5): + max_score = score + if not inverse: + self.sd_threshold = z + self.epsilon = self.mean_e_s + z * self.sd_e_s + else: + self.sd_threshold_inv = z + self.epsilon_inv = self.mean_e_s + z * self.sd_e_s + + def compare_to_epsilon(self, errors_all, inverse=False): + """ + Compare smoothed error values to epsilon (error threshold) and group + consecutive errors together into sequences. + + Args: + errors_all (obj): Errors class object containing list of all + previously identified anomalies in test set + """ + + e_s = self.e_s if not inverse else self.e_s_inv + epsilon = self.epsilon if not inverse else self.epsilon_inv + + # Check: scale of errors compared to values too small? 
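# Editorial sketch (simplified; the error_buffer expansion and sequence grouping
# are omitted): find_epsilon above searches z in [2.5, sd_lim) for the threshold
# epsilon = mean(e_s) + z * std(e_s) that maximizes the drop in mean and std of
# the remaining errors, penalized by the number of flagged points and sequences
# (Hundman et al., arXiv:1802.04431). Numbers below are illustrative.
import numpy as np

e_s = np.concatenate([np.random.RandomState(0).normal(0.1, 0.02, 200), [0.9, 0.95]])
mean_e, sd_e = e_s.mean(), e_s.std()

best_z, best_score = None, -np.inf
for z in np.arange(2.5, 12.0, 0.5):
    eps = mean_e + z * sd_e
    pruned = e_s[e_s < eps]
    i_anom = np.argwhere(e_s >= eps).reshape(-1)
    if len(i_anom) == 0:
        continue
    score = ((mean_e - pruned.mean()) / mean_e
             + (sd_e - pruned.std()) / sd_e) / (1 + len(i_anom))   # one sequence assumed
    if score > best_score:
        best_z, best_score = z, score

epsilon = mean_e + best_z * sd_e            # errors above epsilon are flagged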
+ if not (self.sd_e_s > (.05 * self.sd_values) or max(self.e_s) + > (.05 * self.inter_range)) or not max(self.e_s) > 0.05: + return + + i_anom = np.argwhere((e_s >= epsilon) & + (e_s > 0.05 * self.inter_range)).reshape(-1,) + + if len(i_anom) == 0: + return + buffer = np.arange(1, self._error_buffer+1) + i_anom = np.sort(np.concatenate((i_anom, + np.array([i + buffer for i in i_anom]) + .flatten(), + np.array([i - buffer for i in i_anom]) + .flatten()))) + i_anom = i_anom[(i_anom < len(e_s)) & (i_anom >= 0)] + + # if it is first window, ignore initial errors (need some history) + if self.window_num == 0: + i_anom = i_anom[i_anom >= self.num_to_ignore] + else: + i_anom = i_anom[i_anom >= len(e_s) - self._batch_size] + + i_anom = np.sort(np.unique(i_anom)) + + # capture max of non-anomalous values below the threshold + # (used in filtering process) + batch_position = self.window_num * self._batch_size + window_indices = np.arange(0, len(e_s)) + batch_position + adj_i_anom = i_anom + batch_position + window_indices = np.setdiff1d(window_indices, + np.append(errors_all.i_anom, adj_i_anom)) + candidate_indices = np.unique(window_indices - batch_position) + non_anom_max = np.max(np.take(e_s, candidate_indices)) + + # group anomalous indices into continuous sequences + groups = [list(group) for group in mit.consecutive_groups(i_anom)] + E_seq = [(g[0], g[-1]) for g in groups if not g[0] == g[-1]] + + if inverse: + self.i_anom_inv = i_anom + self.E_seq_inv = E_seq + self.non_anom_max_inv = non_anom_max + else: + self.i_anom = i_anom + self.E_seq = E_seq + self.non_anom_max = non_anom_max + + def prune_anoms(self, inverse=False): + """ + Remove anomalies that don't meet minimum separation from the next + closest anomaly or error value + + Args: + inverse (bool): If true, epsilon is calculated for inverted errors + """ + + E_seq = self.E_seq if not inverse else self.E_seq_inv + e_s = self.e_s if not inverse else self.e_s_inv + non_anom_max = self.non_anom_max if not inverse \ + else self.non_anom_max_inv + + if len(E_seq) == 0: + return + + E_seq_max = np.array([max(e_s[e[0]:e[1]+1]) for e in E_seq]) + E_seq_max_sorted = np.sort(E_seq_max)[::-1] + E_seq_max_sorted = np.append(E_seq_max_sorted, [non_anom_max]) + + i_to_remove = np.array([]) + for i in range(0, len(E_seq_max_sorted)-1): + if (E_seq_max_sorted[i] - E_seq_max_sorted[i+1]) \ + / E_seq_max_sorted[i] < self._p: + i_to_remove = np.append(i_to_remove, np.argwhere( + E_seq_max == E_seq_max_sorted[i])) + else: + i_to_remove = np.array([]) + i_to_remove[::-1].sort() + + if len(i_to_remove) > 0: + E_seq = np.delete(E_seq, i_to_remove, axis=0) + + if len(E_seq) == 0 and inverse: + self.i_anom_inv = np.array([]) + return + elif len(E_seq) == 0 and not inverse: + self.i_anom = np.array([]) + return + + indices_to_keep = np.concatenate([range(e_seq[0], e_seq[-1]+1) + for e_seq in E_seq]) + + if not inverse: + mask = np.isin(self.i_anom, indices_to_keep) + self.i_anom = self.i_anom[mask] + else: + mask_inv = np.isin(self.i_anom_inv, indices_to_keep) + self.i_anom_inv = self.i_anom_inv[mask_inv] + + def score_anomalies(self, prior_idx): + """ + Calculate anomaly scores based on max distance from epsilon + for each anomalous sequence. 
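# Editorial sketch (illustrative values): score_anomalies below assigns each
# flagged run its largest exceedance over epsilon, normalized by
# mean(e_s) + std(e_s); the larger of the regular and inverted scores is kept.
import numpy as np

e_s = np.array([0.10, 0.12, 0.55, 0.60, 0.11])
mean_e, sd_e, epsilon = e_s.mean(), e_s.std(), 0.45
seq = range(2, 4)                           # indices of one anomalous run
score = max(abs(e_s[i] - epsilon) / (mean_e + sd_e) for i in seq)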
+ + Args: + prior_idx (int): starting index of window within full set of test + values for channel + """ + + groups = [list(group) for group in mit.consecutive_groups(self.i_anom)] + + for e_seq in groups: + + score_dict = { + "start_idx": e_seq[0] + prior_idx, + "end_idx": e_seq[-1] + prior_idx, + "score": 0 + } + + score = max([abs(self.e_s[i] - self.epsilon) + / (self.mean_e_s + self.sd_e_s) for i in + range(e_seq[0], e_seq[-1] + 1)]) + inv_score = max([abs(self.e_s_inv[i] - self.epsilon_inv) + / (self.mean_e_s + self.sd_e_s) for i in + range(e_seq[0], e_seq[-1] + 1)]) + + # the max score indicates whether anomaly was from regular + # or inverted errors + score_dict['score'] = max([score, inv_score]) + self.anom_scores.append(score_dict) \ No newline at end of file diff --git a/tods/detection_algorithm/core/utils/modeling.py b/tods/detection_algorithm/core/utils/modeling.py new file mode 100644 index 0000000..3da589b --- /dev/null +++ b/tods/detection_algorithm/core/utils/modeling.py @@ -0,0 +1,205 @@ +from keras.models import Sequential, load_model +from keras.callbacks import History, EarlyStopping, Callback +from keras.layers.recurrent import LSTM +from keras.layers.core import Dense, Activation, Dropout +from keras.layers import Flatten +import numpy as np +import os +import logging + +# suppress tensorflow CPU speedup warnings +os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' +logger = logging.getLogger('telemanom') + + +class Model: + def __init__(self, channel,patience,min_delta,layers,dropout,n_predictions,loss_metric, + optimizer,lstm_batch_size,epochs,validation_split,batch_size,l_s + ): + """ + Loads/trains RNN and predicts future telemetry values for a channel. + + Args: + config (obj): Config object containing parameters for processing + and model training + run_id (str): Datetime referencing set of predictions in use + channel (obj): Channel class object containing train/test data + for X,y for a single channel + + Attributes: + config (obj): see Args + chan_id (str): channel id + run_id (str): see Args + y_hat (arr): predicted channel values + model (obj): trained RNN model for predicting channel values + """ + + # self.config = config + # self.chan_id = channel.id + # self.run_id = run_id + self.y_hat = np.array([]) + self.model = None + + # self.save() + + self._patience = patience + self._min_delta = min_delta + self._layers = layers + self._dropout = dropout + self._n_predictions = n_predictions + self._loss_metric = loss_metric + self._optimizer = optimizer + self._lstm_batch_size = lstm_batch_size + self._epochs = epochs + self._validation_split = validation_split + self._batch_size = batch_size + self._l_s = l_s + + self.train_new(channel) + + + # def load(self): + # """ + # Load model for channel. + # """ + + # logger.info('Loading pre-trained model') + # self.model = load_model(os.path.join('data', self.config.use_id, + # 'models', self.chan_id + '.h5')) + + def train_new(self, channel): + """ + Train LSTM model according to specifications in config.yaml. 
+
+        Args:
+            channel (obj): Channel class object containing train/test data
+                for X,y for a single channel
+        """
+
+        cbs = [History(), EarlyStopping(monitor='val_loss',
+                                        patience=self._patience,
+                                        min_delta=self._min_delta,
+                                        verbose=1)]
+
+        self.model = Sequential()
+
+        self.model.add(LSTM(
+            self._layers[0],
+            input_shape=(None, channel.X_train.shape[2]),
+            return_sequences=True))
+        self.model.add(Dropout(self._dropout))
+
+        self.model.add(LSTM(
+            self._layers[1],
+            return_sequences=False))
+        self.model.add(Dropout(self._dropout))
+
+        self.model.add(Dense(
+            self._n_predictions
+            *channel.X_train.shape[2]
+            ))
+        self.model.add(Activation('linear'))
+
+        self.model.compile(loss=self._loss_metric,
+                           optimizer=self._optimizer)
+
+        # print(self.model.summary())
+
+        self.model.fit(channel.X_train,
+                       channel.y_train,
+                       batch_size=self._lstm_batch_size,
+                       epochs=self._epochs,
+                       validation_split=self._validation_split,
+                       callbacks=cbs,
+                       verbose=True)
+
+    # def save(self):
+    #     """
+    #     Save trained model.
+    #     """
+
+    #     self.model.save(os.path.join('data', self.run_id, 'models',
+    #                                  '{}.h5'.format(self.chan_id)))
+
+    def aggregate_predictions(self, y_hat_batch, method='mean'):
+        """
+        Aggregates predictions for each timestep. When predicting n steps
+        ahead where n > 1, will end up with multiple predictions for a
+        timestep.
+
+        Args:
+            y_hat_batch (arr): predictions shape (<batch length>, <n_preds>)
+            method (string): indicates how to aggregate for a timestep -
+                "first" or "mean"
+        """
+
+        agg_y_hat_batch = np.array([])
+
+        for t in range(len(y_hat_batch)):
+
+            start_idx = t - self._n_predictions
+            start_idx = start_idx if start_idx >= 0 else 0
+
+            # predictions pertaining to a specific timestep lie along diagonal
+            y_hat_t = np.flipud(y_hat_batch[start_idx:t+1]).diagonal()
+
+            if method == 'first':
+                agg_y_hat_batch = np.append(agg_y_hat_batch, [y_hat_t[0]])
+            elif method == 'mean':
+                agg_y_hat_batch = np.append(agg_y_hat_batch, np.mean(y_hat_t))
+
+        agg_y_hat_batch = agg_y_hat_batch.reshape(len(agg_y_hat_batch), 1)
+        self.y_hat = np.append(self.y_hat, agg_y_hat_batch)
+
+    def batch_predict(self, channel):
+        """
+        Uses the trained LSTM model to predict test data arriving in batches.
+
+        Args:
+            channel (obj): Channel class object containing train/test data
+                for X,y for a single channel
+
+        Returns:
+            channel (obj): Channel class object with y_hat values as attribute
+        """
+
+        # num_batches = int((y_test.shape[0] - self._l_s)
+        #                   / self._batch_size)
+        # if num_batches < 0:
+        #     raise ValueError('l_s ({}) too large for stream length {}.'
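# Editorial sketch (not part of the commit): wiring Channel and Model together
# on a synthetic stream, assuming Channel (core/utils/channel.py) and the Model
# class above are importable. Layer sizes, epochs, and the sine data are
# illustrative assumptions; running it requires Keras/TensorFlow as imported above.
import numpy as np

l_s, n_pred = 10, 2
chan = Channel(n_predictions=n_pred, l_s=l_s)
stream = np.sin(np.linspace(0, 20, 200)).reshape(-1, 1)
chan.shape_train_data(stream)
chan.shape_test_data(stream)

model = Model(chan,
              patience=3, min_delta=0.0003, layers=[8, 8], dropout=0.3,
              n_predictions=n_pred, loss_metric='mse', optimizer='adam',
              lstm_batch_size=32, epochs=1, validation_split=0.2,
              batch_size=70, l_s=l_s)
chan = model.batch_predict(chan)
assert chan.y_hat.shape == (chan.X_test.shape[0], n_pred, 1)   # one n_pred-step forecast per window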
+ # .format(self._l_s, y_test.shape[0])) + + # # simulate data arriving in batches, predict each batch + # for i in range(0, num_batches + 1): + # prior_idx = i * self._batch_size + # idx = (i + 1) * self._batch_size + + # if i + 1 == num_batches + 1: + # # remaining values won't necessarily equal batch size + # idx = y_test.shape[0] + + # X_test_batch = X_test[prior_idx:idx] + # y_hat_batch = self.model.predict(X_test_batch) + # y_hat_batch = np.reshape(y_hat_batch,(X_test.shape[0],self._n_predictions,X_test.shape[2])) + # # print("PREDICTIONS",y_hat_batch.shape) + # self.aggregate_predictions(y_hat_batch) + + # self.y_hat = np.reshape(self.y_hat, (self.y_hat.size,)) + + # channel.y_hat = self.y_hat + + # # np.save(os.path.join('data', self.run_id, 'y_hat', '{}.npy' + # # .format(self.chan_id)), self.y_hat) + + # return channel + + self.y_hat = self.model.predict(channel.X_test) + self.y_hat = np.reshape(self.y_hat,(channel.X_test.shape[0],self._n_predictions,channel.X_test.shape[2])) + # print("shape before ",self.y_hat.shape) + channel.y_hat = self.y_hat + return channel diff --git a/tods/detection_algorithm/core/utils/utils.py b/tods/detection_algorithm/core/utils/utils.py new file mode 100644 index 0000000..e69de29 diff --git a/tods/entry_points.ini b/tods/entry_points.ini new file mode 100644 index 0000000..5c68b0d --- /dev/null +++ b/tods/entry_points.ini @@ -0,0 +1,81 @@ +[d3m.primitives] +tods.data_processing.dataset_to_dataframe = data_processing.DatasetToDataframe:DatasetToDataFramePrimitive +tods.data_processing.time_interval_transform = data_processing.TimeIntervalTransform:TimeIntervalTransform +tods.data_processing.categorical_to_binary = data_processing.CategoricalToBinary:CategoricalToBinary +tods.data_processing.column_filter = data_processing.ColumnFilter:ColumnFilter +tods.data_processing.timestamp_validation = data_processing.TimeStampValidation:TimeStampValidationPrimitive +tods.data_processing.duplication_validation = data_processing.DuplicationValidation:DuplicationValidation +tods.data_processing.continuity_validation = data_processing.ContinuityValidation:ContinuityValidation + + +tods.timeseries_processing.transformation.axiswise_scaler=timeseries_processing.SKAxiswiseScaler:SKAxiswiseScaler +tods.timeseries_processing.transformation.standard_scaler=timeseries_processing.SKStandardScaler:SKStandardScaler +tods.timeseries_processing.transformation.power_transformer=timeseries_processing.SKPowerTransformer:SKPowerTransformer +tods.timeseries_processing.transformation.quantile_transformer=timeseries_processing.SKQuantileTransformer:SKQuantileTransformer +tods.timeseries_processing.transformation.moving_average_transform = timeseries_processing.MovingAverageTransform:MovingAverageTransform +tods.timeseries_processing.transformation.simple_exponential_smoothing = timeseries_processing.SimpleExponentialSmoothing:SimpleExponentialSmoothing +tods.timeseries_processing.transformation.holt_smoothing = timeseries_processing.HoltSmoothing:HoltSmoothing +tods.timeseries_processing.transformation.holt_winters_exponential_smoothing= timeseries_processing.HoltWintersExponentialSmoothing:HoltWintersExponentialSmoothing +tods.timeseries_processing.decomposition.time_series_seasonality_trend_decomposition = timeseries_processing.TimeSeriesSeasonalityTrendDecomposition:TimeSeriesSeasonalityTrendDecompositionPrimitive + + +tods.feature_analysis.auto_correlation = feature_analysis.AutoCorrelation:AutoCorrelation +tods.feature_analysis.statistical_mean = 
feature_analysis.StatisticalMean:StatisticalMeanPrimitive +tods.feature_analysis.statistical_median = feature_analysis.StatisticalMedian:StatisticalMedianPrimitive +tods.feature_analysis.statistical_g_mean = feature_analysis.StatisticalGmean:StatisticalGmeanPrimitive +tods.feature_analysis.statistical_abs_energy = feature_analysis.StatisticalAbsEnergy:StatisticalAbsEnergyPrimitive +tods.feature_analysis.statistical_abs_sum = feature_analysis.StatisticalAbsSum:StatisticalAbsSumPrimitive +tods.feature_analysis.statistical_h_mean = feature_analysis.StatisticalHmean:StatisticalHmeanPrimitive +tods.feature_analysis.statistical_maximum = feature_analysis.StatisticalMaximum:StatisticalMaximumPrimitive +tods.feature_analysis.statistical_minimum = feature_analysis.StatisticalMinimum:StatisticalMinimumPrimitive +tods.feature_analysis.statistical_mean_abs = feature_analysis.StatisticalMeanAbs:StatisticalMeanAbsPrimitive +tods.feature_analysis.statistical_mean_abs_temporal_derivative = feature_analysis.StatisticalMeanAbsTemporalDerivative:StatisticalMeanAbsTemporalDerivativePrimitive +tods.feature_analysis.statistical_mean_temporal_derivative = feature_analysis.StatisticalMeanTemporalDerivative:StatisticalMeanTemporalDerivativePrimitive +tods.feature_analysis.statistical_median_abs_deviation = feature_analysis.StatisticalMedianAbsoluteDeviation:StatisticalMedianAbsoluteDeviationPrimitive +tods.feature_analysis.statistical_kurtosis = feature_analysis.StatisticalKurtosis:StatisticalKurtosisPrimitive +tods.feature_analysis.statistical_skew = feature_analysis.StatisticalSkew:StatisticalSkewPrimitive +tods.feature_analysis.statistical_std = feature_analysis.StatisticalStd:StatisticalStdPrimitive +tods.feature_analysis.statistical_var = feature_analysis.StatisticalVar:StatisticalVarPrimitive +tods.feature_analysis.statistical_variation = feature_analysis.StatisticalVariation:StatisticalVariationPrimitive +tods.feature_analysis.statistical_vec_sum = feature_analysis.StatisticalVecSum:StatisticalVecSumPrimitive +tods.feature_analysis.statistical_willison_amplitude = feature_analysis.StatisticalWillisonAmplitude:StatisticalWillisonAmplitudePrimitive +tods.feature_analysis.statistical_zero_crossing = feature_analysis.StatisticalZeroCrossing:StatisticalZeroCrossingPrimitive +tods.feature_analysis.spectral_residual_transform = feature_analysis.SpectralResidualTransform:SpectralResidualTransformPrimitive +tods.feature_analysis.fast_fourier_transform = feature_analysis.FastFourierTransform:FastFourierTransform +tods.feature_analysis.discrete_cosine_transform = feature_analysis.DiscreteCosineTransform:DiscreteCosineTransform +tods.feature_analysis.non_negative_matrix_factorization = feature_analysis.NonNegativeMatrixFactorization:NonNegativeMatrixFactorization +tods.feature_analysis.bk_filter = feature_analysis.BKFilter:BKFilter +tods.feature_analysis.hp_filter = feature_analysis.HPFilter:HPFilter +tods.feature_analysis.truncated_svd = feature_analysis.SKTruncatedSVD:SKTruncatedSVD +tods.feature_analysis.wavelet_transform = feature_analysis.WaveletTransform:WaveletTransformer +tods.feature_analysis.trmf = feature_analysis.TRMF:TRMF + +tods.detection_algorithm.pyod_ae = detection_algorithm.PyodAE:AutoEncoder +tods.detection_algorithm.pyod_vae = detection_algorithm.PyodVAE:VariationalAutoEncoder +tods.detection_algorithm.pyod_cof = detection_algorithm.PyodCOF:PyodCOF +tods.detection_algorithm.pyod_sod = detection_algorithm.PyodSOD:SODPrimitive +tods.detection_algorithm.pyod_abod = 
detection_algorithm.PyodABOD:ABODPrimitive +tods.detection_algorithm.pyod_hbos = detection_algorithm.PyodHBOS:HBOSPrimitive +tods.detection_algorithm.pyod_iforest = detection_algorithm.PyodIsolationForest:IsolationForest +tods.detection_algorithm.pyod_lof = detection_algorithm.PyodLOF:LOFPrimitive +tods.detection_algorithm.pyod_autoencoder = detection_algorithm.PyodAutoEncoder:AutoEncoderPrimitive +tods.detection_algorithm.pyod_knn = detection_algorithm.PyodKNN:KNNPrimitive +tods.detection_algorithm.pyod_ocsvm = detection_algorithm.PyodOCSVM:OCSVMPrimitive +tods.detection_algorithm.pyod_loda = detection_algorithm.PyodLODA:LODAPrimitive +tods.detection_algorithm.pyod_cblof = detection_algorithm.PyodCBLOF:CBLOFPrimitive +tods.detection_algorithm.pyod_sogaal = detection_algorithm.PyodSoGaal:So_GaalPrimitive +tods.detection_algorithm.pyod_mogaal = detection_algorithm.PyodMoGaal:Mo_GaalPrimitive + +tods.detection_algorithm.matrix_profile = detection_algorithm.MatrixProfile:MatrixProfile +tods.detection_algorithm.AutoRegODetector = detection_algorithm.AutoRegODetect:AutoRegODetector +tods.detection_algorithm.KDiscordDetector = detection_algorithm.KDiscordODetect:KDiscordDetector +tods.detection_algorithm.PCADetector = detection_algorithm.PCAODetect:PCADetector + +tods.detection_algorithm.LSTMODetector = detection_algorithm.LSTMODetect:LSTMODetector +tods.detection_algorithm.AutoRegODetector = detection_algorithm.AutoRegODetect:AutoRegODetector +tods.detection_algorithm.PCAODetector = detection_algorithm.PCAODetect:PCAODetector +tods.detection_algorithm.KDiscordODetector = detection_algorithm.KDiscordODetect:KDiscordODetector +tods.detection_algorithm.deeplog = detection_algorithm.DeepLog:DeepLogPrimitive +tods.detection_algorithm.telemanom = detection_algorithm.Telemanom:TelemanomPrimitive + +tods.reinforcement.rule_filter = reinforcement.RuleBasedFilter:RuleBasedFilter diff --git a/tods/feature_analysis/AutoCorrelation.py b/tods/feature_analysis/AutoCorrelation.py new file mode 100644 index 0000000..15358fb --- /dev/null +++ b/tods/feature_analysis/AutoCorrelation.py @@ -0,0 +1,387 @@ +import os +import typing +import collections +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple + +import numpy as np +import pandas as pd +import logging, uuid +from scipy import sparse +from numpy import ndarray +from collections import OrderedDict +from common_primitives import dataframe_utils, utils + +from d3m.base import utils as base_utils +from d3m.primitive_interfaces import base, transformer +from d3m import container, exceptions, utils as d3m_utils +from d3m.metadata import base as metadata_base, hyperparams + +from statsmodels.tsa.stattools import acf + +__all__ = ('AutoCorrelation',) + +Inputs = container.DataFrame +Outputs = container.DataFrame + + +class Hyperparams(hyperparams.Hyperparams): + """ + AutoCorrelation = hyperparams.Enumeration( + values = ["acf", "pacf", "pacf_yw", "pacf_ols"], + default = "acf", + semantic_types=[], + description='AutoCorrelation to use' + ) + """ + unbiased = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="If True, then denominators for autocovariance are n-k, otherwise n." + ) + nlags = hyperparams.UniformInt( + lower = 0, + upper = 100, #TODO: Define the correct the upper bound + default=40, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Number of lags to return autocorrelation for." 
+ ) + qstat = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="If True, returns the Ljung-Box q statistic for each autocorrelationcoefficient." + ) + fft = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="If True, computes the ACF via FFT." + ) + alpha = hyperparams.Bounded[float]( + lower=0, + upper=1, + lower_inclusive=True, + upper_inclusive=True, + default = 0, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="""If a number is given, the confidence intervals for the given level are returned. + For instance if alpha=.05, 95 % confidence intervals are returned where the standard deviation is computed according to Bartlett"s formula.""" + ) + missing = hyperparams.Enumeration[str]( + values=["none", "raise", "conservative", "drop"], + default="none", + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="""Specifying how the NaNs are to be treated. "none" performs no checks. "raise" raises an exception if NaN values are found. + "drop" removes the missing observations and then estimates the autocovariances treating the non-missing as contiguous. + "conservative" computes the autocovariance using nan-ops so that nans are removed when computing the mean + and cross-products that are used to estimate the autocovariance. + When using "conservative", n is set to the number of non-missing observations.""" + ) + + # Keep previous + dataframe_resource = hyperparams.Hyperparameter[typing.Union[str, None]]( + default=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Resource ID of a DataFrame to extract if there are multiple tabular resources inside a Dataset and none is a dataset entry point.", + ) + use_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(2,), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to operate on. If any specified column cannot be parsed, it is skipped.", + ) + exclude_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(0,1,3,), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='new', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. 
Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.", + ) + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', + 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute'], + default='https://metadata.datadrivendiscovery.org/types/Attribute', + description='Decides what semantic type to attach to generated attributes', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + +class ACF: + """ + This is the class for autocorrelation function + """ + def __init__(self, unbiased=False, nlags=40, qstat=False, fft=None, alpha=None,missing="none"): + self._unbiased = unbiased + self._nlags = nlags + self._qstat = qstat + self._fft = fft + self._alpha = alpha + self._missing = missing + + def produce(self, data): + + """ + + Args: + data: dataframe column + Returns: + nparray + + """ + + output = acf(data) + return output + + + +class AutoCorrelation(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): + """ + A primitive that performs autocorrelation on a DataFrame + acf() function documentation: https://www.statsmodels.org/dev/generated/statsmodels.tsa.stattools.acf.html + """ + + __author__ = "DATA Lab @Texas A&M University" + metadata = metadata_base.PrimitiveMetadata( + { + 'id': '8c246c78-3082-4ec9-844e-5c98fcc76f9f', + 'version': '0.0.2', + 'name': "AutoCorrelation of values", + 'python_path': 'd3m.primitives.tods.feature_analysis.auto_correlation', + 'algorithm_types': [metadata_base.PrimitiveAlgorithmType.DATA_CONVERSION,], #TODO: check is this right? 
+ 'primitive_family': metadata_base.PrimitiveFamily.FEATURE_CONSTRUCTION, + "hyperparams_to_tune": ['unbiased', 'nlags', 'qstat', 'fft', 'alpha', 'missing'], + 'source': { + 'name': 'DATA Lab @Texas A&M University', + 'contact': 'mailto:khlai037@tamu.edu', + 'uris': ['https://gitlab.com/lhenry15/tods/-/blob/Yile/anomaly-primitives/anomaly_primitives/AutoCorrelation.py'], + }, + 'installation': [{ + 'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/datadrivendiscovery/common-primitives.git@{git_commit}#egg=common_primitives'.format( + git_commit=d3m_utils.current_git_commit(os.path.dirname(__file__)), + ), + }], + }, + ) + + + def __init__(self, *, hyperparams: Hyperparams) -> None: + super().__init__(hyperparams=hyperparams) + + self._clf = ACF(unbiased = hyperparams['unbiased'], + nlags = hyperparams['nlags'], + qstat = hyperparams['qstat'], + fft = hyperparams['fft'], + alpha = hyperparams['alpha'], + missing = hyperparams['missing'] + ) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + """ + Args: + inputs: Container DataFrame + timeout: Default + iterations: Default + Returns: + Container DataFrame containing moving average of selected columns + """ + + assert isinstance(inputs, container.DataFrame), type(container.DataFrame) + _, self._columns_to_produce = self._get_columns_to_fit(inputs, self.hyperparams) + + + outputs = inputs + if len(self._columns_to_produce) > 0: + for col in self.hyperparams['use_columns']: + output = self._clf.produce(inputs.iloc[ : ,col]) + outputs = pd.concat((outputs, pd.Series(output).rename(inputs.columns[col] + '_acf')), axis = 1) + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + self._update_metadata(outputs) + + return base.CallResult(outputs) + + + + def _update_metadata(self, outputs): + outputs.metadata = outputs.metadata.generate(outputs) + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + """ + Select columns to fit. + Args: + inputs: Container DataFrame + hyperparams: d3m.metadata.hyperparams.Hyperparams + Returns: + list + """ + + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=hyperparams['use_columns'], + exclude_columns=hyperparams['exclude_columns'], + can_use_column=can_produce_column) + + + """ + Encountered error: when hyperparams['use_columns'] = (2,3) and hyperparams['exclude_columns'] is (1,2) + columns_to_produce is still [2] + """ + return inputs.iloc[:, columns_to_produce], columns_to_produce + + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: + """ + Output whether a column can be processed. 
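# Editorial sketch (illustrative series): the ACF wrapper used by this primitive
# ultimately calls statsmodels.tsa.stattools.acf on each selected column; a
# direct call looks like this.
import numpy as np
from statsmodels.tsa.stattools import acf

series = np.sin(np.linspace(0, 12 * np.pi, 120)) + np.random.RandomState(1).normal(0, 0.1, 120)
autocorr = acf(series, nlags=40, fft=False)
# autocorr[0] is always 1.0; peaks at later lags reveal the seasonal period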
+ + Args: + inputs_metadata: d3m.metadata.base.DataMetadata + column_index: int + Returns: + bool + """ + + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, np.integer, np.float64) #changed numpy to np + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + + """ + Wrap predictions into dataframe + + Args: + inputs: Container Dataframe + predictions: array-like data (n_samples, n_features) + + Returns: + Dataframe + + """ + + outputs = container.DataFrame(predictions, generate_metadata=True) + target_columns_metadata = self._copy_inputs_metadata(inputs.metadata, self._columns_to_produce, outputs.metadata, self.hyperparams) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, target_columns_metadata) + return outputs + + + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + """ + Updata metadata for selected columns. + + Args: + inputs_metadata: metadata_base.DataMetadata + outputs: Container Dataframe + target_columns_metadata: list + + Returns: + d3m.metadata.base.DataMetadata + """ + + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + + + @classmethod + def _copy_inputs_metadata(cls, inputs_metadata: metadata_base.DataMetadata, input_indices: List[int], + outputs_metadata: metadata_base.DataMetadata, hyperparams): + """ + Updata metadata for selected columns. 
+ + Args: + inputs_metadata: metadata.base.DataMetadata + input_indices: list + outputs_metadata: metadata.base.DataMetadata + hyperparams: d3m.metadata.hyperparams.Hyperparams + + Returns: + d3m.metadata.base.DataMetadata + """ + + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + target_columns_metadata: List[OrderedDict] = [] + for column_index in input_indices: + column_name = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name") + if column_name is None: + column_name = "output_{}".format(column_index) + + column_metadata = OrderedDict(inputs_metadata.query_column(column_index)) + semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set([]) + add_semantic_types = set() + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + # If outputs has more columns than index, add Attribute Type to all remaining + if outputs_length > len(input_indices): + for column_index in range(len(input_indices), outputs_length): + column_metadata = OrderedDict() + semantic_types = set() + semantic_types.add(hyperparams["return_semantic_type"]) + column_name = "output_{}".format(column_index) + column_metadata["semantic_types"] = list(semantic_types) + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + return target_columns_metadata diff --git a/tods/feature_analysis/BKFilter.py b/tods/feature_analysis/BKFilter.py new file mode 100644 index 0000000..c35d12c --- /dev/null +++ b/tods/feature_analysis/BKFilter.py @@ -0,0 +1,376 @@ +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os +import sklearn +import numpy +import typing +import time + +from d3m import container +from d3m.primitive_interfaces import base, transformer +from d3m.metadata import base as metadata_base, hyperparams + +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult, DockerContainer + + +import os.path + +import time +import statsmodels.api as sm + +__all__ = ('BKFilter',) + +Inputs = container.DataFrame +Outputs = container.DataFrame + + +class Hyperparams(hyperparams.Hyperparams): + # Tuning + low = hyperparams.UniformInt( + lower=0, + upper=100000000, + default=6, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + description="Minimum period for oscillations, ie., Baxter and King suggest that the Burns-Mitchell U.S. business cycle has 6 for quarterly data and 1.5 for annual data.", + ) + high = hyperparams.UniformInt( + lower=0, + upper=100000000, + default=32, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + description="Maximum period for oscillations BK suggest that the U.S. 
business cycle has 32 for quarterly data and 8 for annual data.", + ) + K = hyperparams.UniformInt( + lower=0, + upper=100000000, + default=1, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + description="Lead-lag length of the filter. Baxter and King propose a truncation length of 12 for quarterly data and 3 for annual data.", + ) + + # Control + columns_using_method= hyperparams.Enumeration( + values=['name', 'index'], + default='index', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Choose to use columns by names or indecies. If 'name', \"use_columns\" or \"exclude_columns\" is used. If 'index', \"use_columns_name\" or \"exclude_columns_name\" is used." + ) + use_columns_name = hyperparams.Set( + elements=hyperparams.Hyperparameter[str](''), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column names to force primitive to operate on. If any specified column cannot be parsed, it is skipped.", + ) + exclude_columns_name = hyperparams.Set( + elements=hyperparams.Hyperparameter[str](''), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column names to not operate on. Applicable only if \"use_columns_name\" is not provided.", + ) + use_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to operate on. If any specified column cannot be parsed, it is skipped.", + ) + exclude_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='append', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. 
To prevent pipelines from breaking, set this to False.",
+ )
+
+ return_semantic_type = hyperparams.Enumeration[str](
+ values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute'],
+ default='https://metadata.datadrivendiscovery.org/types/Attribute',
+ description='Decides what semantic type to attach to generated attributes',
+ semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter']
+ )
+
+
+class BKFilter(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]):
+ """
+ Filter a time series using the Baxter-King bandpass filter.
+
+ Parameters
+ ----------
+ low: int
+ Minimum period for oscillations. Baxter and King suggest that the Burns-Mitchell U.S. business cycle has 6 for quarterly data and 1.5 for annual data.
+
+ high: int
+ Maximum period for oscillations. Baxter and King suggest that the U.S. business cycle has 32 for quarterly data and 8 for annual data.
+
+ K: int
+ Lead-lag length of the filter. Baxter and King propose a truncation length of 12 for quarterly data and 3 for annual data.
+
+ use_columns: Set
+ A set of column indices to force the primitive to operate on. If any specified column cannot be parsed, it is skipped.
+
+ exclude_columns: Set
+ A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.
+
+ return_result: Enumeration
+ Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.
+
+ use_semantic_types: Bool
+ Controls whether semantic_types metadata will be used for filtering columns in the input dataframe. Setting this to false makes the code ignore return_result and produce only the output dataframe.
+
+ add_index_columns: Bool
+ Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".
+
+ error_on_no_input: Bool
+ Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking, set this to False.
+
+ return_semantic_type: Enumeration[str]
+ Decides what semantic type to attach to generated attributes.
+ """
+
+ __author__ = "DATA Lab at Texas A&M University"
+ metadata = metadata_base.PrimitiveMetadata({
+ "name": "Baxter-King Filter Primitive",
+ "python_path": "d3m.primitives.tods.feature_analysis.bk_filter",
+ "source": {'name': 'DATA Lab at Texas A&M University', 'contact': 'mailto:khlai037@tamu.edu',
+ 'uris': ['https://gitlab.com/lhenry15/tods.git', 'https://gitlab.com/lhenry15/tods/-/blob/Junjie/anomaly-primitives/anomaly_primitives/DuplicationValidation.py']},
+ "algorithm_types": [metadata_base.PrimitiveAlgorithmType.BK_FILTER,],
+ "primitive_family": metadata_base.PrimitiveFamily.FEATURE_CONSTRUCTION,
+ "id": "b2bfadc5-dbca-482c-b188-8585e5f245c4",
+ "hyperparams_to_tune": ['low', 'high', 'K'],
+ "version": "0.0.1",
+ })
+
+
+ def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
+ """
+ Process the testing data.
+ Args:
+ inputs: Container DataFrame.
+
+ Returns:
+ Container DataFrame after BKFilter.
+ """
+ # Get cols to fit.
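+ # The rest of produce() selects the numeric columns to operate on and then runs the
+ # Baxter-King bandpass filter from statsmodels over each selected column. A minimal
+ # standalone sketch of the same transformation (illustrative only; `series` is an
+ # assumed placeholder, not part of this primitive):
+ #
+ #     import pandas as pd
+ #     import statsmodels.api as sm
+ #
+ #     series = pd.Series(range(100), dtype=float)   # placeholder quarterly data
+ #     cycle = sm.tsa.filters.bkfilter(series, low=6, high=32, K=12)
+ #
+ # `cycle` holds the band-pass (cyclical) component; bkfilter drops K observations
+ # from each end of the series, so the output is 2*K rows shorter than the input.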
+ self._fitted = False + self._training_inputs, self._training_indices = self._get_columns_to_fit(inputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + + if len(self._training_indices) > 0: + # self._clf.fit(self._training_inputs) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + + + if not self._fitted: + raise PrimitiveNotFittedError("Primitive not fitted.") + sk_inputs = inputs + if self.hyperparams['use_semantic_types']: + sk_inputs = inputs.iloc[:, self._training_indices] + output_columns = [] + if len(self._training_indices) > 0: + sk_output = self._bkfilter(sk_inputs, low=self.hyperparams['low'], high=self.hyperparams['high'], K=self.hyperparams['K']) + if sparse.issparse(sk_output): + sk_output = sk_output.toarray() + outputs = self._wrap_predictions(inputs, sk_output) + + if len(outputs.columns) == len(self._input_column_names): + outputs.columns = self._input_column_names + output_columns = [outputs] + + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._training_indices, + columns_list=output_columns) + + # self._write(outputs) + # self.logger.warning('produce was called3') + return CallResult(outputs) + + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + """ + Select columns to fit. + Args: + inputs: Container DataFrame + hyperparams: d3m.metadata.hyperparams.Hyperparams + + Returns: + list + """ + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + use_columns = [] + exclude_columns = [] + + # if hyperparams['columns_using_method'] == 'name': + # inputs_cols = inputs.columns.values.tolist() + # for i in range(len(inputs_cols)): + # if inputs_cols[i] in hyperparams['use_columns_name']: + # use_columns.append(i) + # elif inputs_cols[i] in hyperparams['exclude_columns_name']: + # exclude_columns.append(i) + # else: + use_columns=hyperparams['use_columns'] + exclude_columns=hyperparams['exclude_columns'] + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, use_columns=use_columns, exclude_columns=exclude_columns, can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: + """ + Output whether a column can be processed. 
+ Args: + inputs_metadata: d3m.metadata.base.DataMetadata + column_index: int + + Returns: + bool + """ + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, numpy.integer, numpy.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + """ + Updata metadata for selected columns. + Args: + inputs_metadata: metadata_base.DataMetadata + outputs: Container Dataframe + target_columns_metadata: list + + Returns: + d3m.metadata.base.DataMetadata + """ + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + """ + Wrap predictions into dataframe + Args: + inputs: Container Dataframe + predictions: array-like data (n_samples, n_features) + + Returns: + Dataframe + """ + outputs = d3m_dataframe(predictions, generate_metadata=True) + target_columns_metadata = self._add_target_columns_metadata(outputs.metadata, self.hyperparams) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, target_columns_metadata) + return outputs + + + @classmethod + def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams): + """ + Add target columns metadata + Args: + outputs_metadata: metadata.base.DataMetadata + hyperparams: d3m.metadata.hyperparams.Hyperparams + + Returns: + List[OrderedDict] + """ + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_name = "output_{}".format(column_index) + column_metadata = OrderedDict() + semantic_types = set() + semantic_types.add(hyperparams["return_semantic_type"]) + column_metadata['semantic_types'] = list(semantic_types) + + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + def _write(self, inputs:Inputs): + inputs.to_csv(str(time.time())+'.csv') + + def _bkfilter(self, X, low, high, K): + """ + Perform BKFilter + Args: + X: slected rows to be performed + K, low, high: Parameters of BKFilter + + Returns: + Dataframe, results of BKFilter + """ + transformed_X = utils.pandas.DataFrame() + for col in X.columns: + cycle = sm.tsa.filters.bkfilter(X[col], low=low, high=high, K=K) + cycle_df = utils.pandas.DataFrame(cycle) + transformed_X = utils.pandas.concat([transformed_X,cycle_df], axis=1) + + return transformed_X diff --git 
a/tods/feature_analysis/DiscreteCosineTransform.py b/tods/feature_analysis/DiscreteCosineTransform.py new file mode 100644 index 0000000..031a892 --- /dev/null +++ b/tods/feature_analysis/DiscreteCosineTransform.py @@ -0,0 +1,480 @@ +import os +import typing +import pandas as pd +import numpy as np + +from d3m import container, utils +from d3m.base import utils as base_utils +from d3m.metadata import base as metadata_base, hyperparams +from d3m.primitive_interfaces import base, transformer + +import common_primitives +import logging +import math +from scipy.fft import dct +from collections import OrderedDict +from typing import cast, Dict, List, Union, Sequence, Optional, Tuple + + +from scipy import sparse +from numpy import ndarray + +__all__ = ('DiscreteCosineTransform',) + +Inputs = container.DataFrame +Outputs = container.DataFrame + + +class Hyperparams(hyperparams.Hyperparams): + + type_ = hyperparams.UniformInt( + lower=1, + upper=4, + upper_inclusive = True, + default=2, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Type of the DCT. Default is 2", + ) + + + axis = hyperparams.Hyperparameter[int]( + default=-1, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Axis over which to compute the DCT. If not given, the last axis is used.", + ) + + n = hyperparams.Union[Union[int, None]]( + configuration=OrderedDict( + limit=hyperparams.Bounded[int]( + lower=1, + upper=None, + default=10, + ), + unlimited=hyperparams.Constant( + default=None, + description='If n is not given, the length of the input along the axis specified by axis is used.', + ), + ), + default='unlimited', + description='Length of the transformed axis of the output. If n is smaller than the length of the input, the input is cropped. If it is larger, the input is padded with zeros.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + ) + + norm = hyperparams.Enumeration( + values=[None,"ortho"], + default=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Normalization mode. Default is None, meaning no normalization on the forward transforms and scaling by 1/n on the ifft. For norm=""ortho"", both directions are scaled by 1/sqrt(n).", + ) + + overwrite_x = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + description="If True, the contents of x can be destroyed; the default is False. See the notes below for more details.", + + ) + + workers = hyperparams.Union[Union[float, None]]( + configuration=OrderedDict( + limit=hyperparams.Bounded[int]( + lower=1, + upper=None, + default=10, + ), + unlimited=hyperparams.Constant( + default=None, + description='If nothing is give as a paramter', + ), + ), + default='unlimited', + description="Maximum number of workers to use for parallel computation. If negative, the value wraps around from os.cpu_count().", + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + ) + + # parameters for column + use_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to operate on. 
If any specified column cannot be parsed, it is skipped.", + ) + exclude_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='new', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.", + ) + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', + 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute'], + default='https://metadata.datadrivendiscovery.org/types/Attribute', + description='Decides what semantic type to attach to generated attributes', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + +class DCT: + def __init__(self,type_,n,axis,overwrite_x,norm,workers): + self._type = type_ + self._n = n + self._axis = axis + self._overwrite_x = overwrite_x + self._norm = norm + self._workers = workers + + def produce(self, inputs): + + dataframe = inputs + processed_df = utils.pandas.DataFrame() + try: + for target_column in dataframe.columns : + dct_input = dataframe[target_column].values + dct_output = dct(x=dct_input,type=self._type,n=self._n,axis=self._axis,overwrite_x=self._overwrite_x,norm=self._norm,workers=self._workers) + processed_df[target_column+"_dct_coeff"]=pd.Series(dct_output) + + except IndexError: + logging.warning("Index not found in dataframe") + + return processed_df; + + + + +class DiscreteCosineTransform(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): + """ + Compute the 1-D discrete Cosine Transform. + Return the Discrete Cosine Transform of arbitrary type sequence x. + + scipy documentation: https://docs.scipy.org/doc/scipy/reference/generated/scipy.fft.dct.html#scipy.fft.dct + + + Parameters + ---------- + + type_: int + Type of the DCT. Default is 2 + + n: int + Length of the transformed axis of the output. If n is smaller than the length of the input, the input is cropped. If it is larger, the input is padded with zeros. + + axis: int + Axis over which to compute the DCT. If not given, the last axis is used. 
+ + norm: str + Normalization mode. Default is None, meaning no normalization on the forward transforms and scaling by 1/n on the ifft. For norm=""ortho"", both directions are scaled by 1/sqrt(n). + + overwrite_x: boolean + If True, the contents of x can be destroyed; the default is False. See the notes below for more details. + + workers: int + Maximum number of workers to use for parallel computation. If negative, the value wraps around from os.cpu_count(). Defualt is None. + + + use_columns: Set + A set of column indices to force primitive to operate on. If any specified column cannot be parsed, it is skipped. + + exclude_columns: Set + A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided. + + return_result: Enumeration + Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false. + + use_semantic_types: Bool + Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe. + + add_index_columns: Bool + Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\". + + error_on_no_input: Bool( + Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False. + + return_semantic_type: Enumeration[str]( + Decides what semantic type to attach to generated attributes' + + """ + + __author__ = "Data Lab" + metadata = metadata_base.PrimitiveMetadata( + { + "__author__ " : "DATA Lab at Texas A&M University", + 'name': "Discrete Cosine Transform", + 'python_path': 'd3m.primitives.tods.feature_analysis.discrete_cosine_transform', + 'source': { + 'name': 'DATA Lab at Texas A&M University', + 'contact': 'mailto:khlai037@tamu.edu', + 'uris': [ + 'https://gitlab.com/lhenry15/tods.git', + 'https://gitlab.com/lhenry15/tods/-/blob/purav/anomaly-primitives/anomaly_primitives/DiscreteCosineTransform.py', + ], + }, + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.DISCRETE_COSINE_TRANSFORM, + ], + 'primitive_family': metadata_base.PrimitiveFamily.FEATURE_CONSTRUCTION, + 'id': '584fa7d5-39cc-4cf8-8d5b-5f3a2648f767', + 'hyperparameters_to_tune':['n','norm','axis','type_'], + 'version': '0.0.1', + }, + ) + + def __init__(self, *, hyperparams: Hyperparams) -> None: + super().__init__(hyperparams=hyperparams) + + self._clf = DCT(type_=self.hyperparams['type_'], + n=self.hyperparams['n'], + axis=self.hyperparams['axis'], + overwrite_x=self.hyperparams['overwrite_x'], + norm = self.hyperparams['norm'], + workers = self.hyperparams['workers'] + ) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + """ + + Args: + inputs: Container DataFrame + + Returns: + Container DataFrame added with DCT coefficients in a column named 'column_name_dct_coeff' + + """ + assert isinstance(inputs, container.DataFrame), type(dataframe) + + self._fitted = False + self._training_inputs, self._training_indices = self._get_columns_to_fit(inputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + if len(self._training_indices) > 0: + # self._clf.fit(self._training_inputs) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") 
+ self.logger.warn("No input columns were selected") + + + if not self._fitted: + raise PrimitiveNotFittedError("Primitive not fitted.") + + sk_inputs = inputs + if self.hyperparams['use_semantic_types']: + cols = [inputs.columns[x] for x in self._training_indices] + sk_inputs = container.DataFrame(data = inputs.iloc[:, self._training_indices].values,columns = cols, generate_metadata=True) + + output_columns = [] + if len(self._training_indices) > 0: + sk_output = self._clf.produce(sk_inputs) + + if sparse.issparse(sk_output): + sk_output = sk_output.toarray() + outputs = self._wrap_predictions(inputs, sk_output) + # if len(outputs.columns) == len(self._input_column_names): + # outputs.columns = self._input_column_names + output_columns = [outputs] + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._training_indices, + columns_list=output_columns) + + + return base.CallResult(outputs) + + + + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + """ + Select columns to fit. + Args: + inputs: Container DataFrame + hyperparams: d3m.metadata.hyperparams.Hyperparams + + Returns: + list + """ + + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + # return inputs, list(hyperparams['use_columns']) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=hyperparams['use_columns'], + exclude_columns=hyperparams['exclude_columns'], + can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, + hyperparams: Hyperparams) -> bool: + """ + Output whether a column can be processed. + Args: + inputs_metadata: d3m.metadata.base.DataMetadata + column_index: int + + Returns: + bool + """ + + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + accepted_structural_types = (int, float, np.integer, np.float64,str) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + print(column_index, "does not match the structural_type requirements in metadata. 
Skipping column") + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + + # print("length sematic type",len(semantic_types)) + + # returing true for testing purposes for custom dataframes + return True; + + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + print(semantic_types) + return False + + + @classmethod + def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]: + """ + Output metadata of selected columns. + Args: + outputs_metadata: metadata_base.DataMetadata + hyperparams: d3m.metadata.hyperparams.Hyperparams + + Returns: + d3m.metadata.base.DataMetadata + """ + + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) + + # Update semantic types and prepare it for predicted targets. + semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set([]) + add_semantic_types = [] + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + """ + Updata metadata for selected columns. 
+ Args: + inputs_metadata: metadata_base.DataMetadata + outputs: Container Dataframe + target_columns_metadata: list + + Returns: + d3m.metadata.base.DataMetadata + """ + + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + """ + Wrap predictions into dataframe + Args: + inputs: Container Dataframe + predictions: array-like data (n_samples, n_features) + + Returns: + Dataframe + """ + + outputs = container.DataFrame(predictions, generate_metadata=True) + target_columns_metadata = self._add_target_columns_metadata(outputs.metadata,self.hyperparams) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, target_columns_metadata) + # print(outputs.metadata.to_internal_simple_structure()) + + return outputs + + + @classmethod + def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams): + """ + Add target columns metadata + Args: + outputs_metadata: metadata.base.DataMetadata + hyperparams: d3m.metadata.hyperparams.Hyperparams + Returns: + List[OrderedDict] + """ + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + # column_name = "output_{}".format(column_index) + column_metadata = OrderedDict() + semantic_types = set() + semantic_types.add(hyperparams["return_semantic_type"]) + column_metadata['semantic_types'] = list(semantic_types) + # column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + return target_columns_metadata + + +DiscreteCosineTransform.__doc__ = DiscreteCosineTransform.__doc__ + + + diff --git a/tods/feature_analysis/FastFourierTransform.py b/tods/feature_analysis/FastFourierTransform.py new file mode 100644 index 0000000..5e72d84 --- /dev/null +++ b/tods/feature_analysis/FastFourierTransform.py @@ -0,0 +1,470 @@ +import os +import typing +import pandas as pd +import numpy as np + +from d3m import container, utils +from d3m.base import utils as base_utils +from d3m.metadata import base as metadata_base, hyperparams +from d3m.primitive_interfaces import base, transformer + +import common_primitives +import logging +from cmath import polar +from scipy.fft import fft +from collections import OrderedDict +from typing import cast, Dict, List, Union, Sequence, Optional, Tuple + +from scipy import sparse +from numpy import ndarray + +__all__ = ('FastFourierTransform',) + +Inputs = container.DataFrame +Outputs = container.DataFrame + + +class Hyperparams(hyperparams.Hyperparams): + + axis = hyperparams.Hyperparameter[int]( + default=-1, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Axis over which to compute the FFT. If not given, the last axis is used.", + ) + + n = hyperparams.Union[Union[int, None]]( + configuration=OrderedDict( + limit=hyperparams.Bounded[int]( + lower=1, + upper=None, + default=10, + ), + unlimited=hyperparams.Constant( + default=None, + description='If n is not given, the length of the input along the axis specified by axis is used.', + ), + ), + default='unlimited', + description='Length of the transformed axis of the output. 
If n is smaller than the length of the input, the input is cropped. If it is larger, the input is padded with zeros.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + ) + + norm = hyperparams.Enumeration( + values=[None,"ortho"], + default=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Normalization mode. Default is None, meaning no normalization on the forward transforms and scaling by 1/n on the ifft. For norm=""ortho"", both directions are scaled by 1/sqrt(n).", + ) + + overwrite_x = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + description="If True, the contents of x can be destroyed; the default is False. See the notes below for more details.", + + ) + + workers = hyperparams.Union[Union[float, None]]( + configuration=OrderedDict( + limit=hyperparams.Bounded[int]( + lower=1, + upper=None, + default=10, + ), + unlimited=hyperparams.Constant( + default=None, + description='If nothing is give as a paramter', + ), + ), + default='unlimited', + description="Maximum number of workers to use for parallel computation. If negative, the value wraps around from os.cpu_count().", + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + ) + + # TODO: Decide what to do with plan parameter how to work with it + # plan + + # parameters for column + use_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to operate on. If any specified column cannot be parsed, it is skipped.", + ) + exclude_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='new', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. 
To prevent pipelines from breaking set this to False.", + ) + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', + 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute'], + default='https://metadata.datadrivendiscovery.org/types/Attribute', + description='Decides what semantic type to attach to generated attributes', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + +class FFT: + def __init__(self,n,axis,overwrite_x,norm,workers): + + self._n = n + self._axis = axis + self._overwrite_x = overwrite_x + self._norm = norm + self._workers = workers + + def produce(self, inputs): + + dataframe = inputs + processed_df = utils.pandas.DataFrame() + try: + for target_column in dataframe.columns : + + fft_input = dataframe[target_column].values + fft_output = fft(x=fft_input,n=self._n,axis=self._axis,overwrite_x=self._overwrite_x,norm=self._norm,workers=self._workers) + tuples = [polar(i) for i in fft_output] + absolute_values = [i[0] for i in tuples] + phase_values = [i[1] for i in tuples] + processed_df[target_column+"_fft_abs"]=pd.Series(absolute_values) + processed_df[target_column+"_fft_phse"]=pd.Series(phase_values) + + except IndexError: + logging.warning("Index not found in dataframe") + + return processed_df; + + + + +class FastFourierTransform(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): + """ + Compute the 1-D discrete Fourier Transform. + This function computes the 1-D n-point discrete Fourier Transform (DFT) with the efficient Fast Fourier Transform (FFT) algorithm + + scipy documentation : https://docs.scipy.org/doc/scipy/reference/generated/scipy.fft.fft.html#scipy.fft.fft + + Parameters + ---------- + + n: int + Length of the transformed axis of the output. If n is smaller than the length of the input, the input is cropped. If it is larger, the input is padded with zeros. + + axis: int + Axis over which to compute the FFT. If not given, the last axis is used. + + norm: str + Normalization mode. Default is None, meaning no normalization on the forward transforms and scaling by 1/n on the ifft. For norm=""ortho"", both directions are scaled by 1/sqrt(n). + + overwrite_x: boolean + If True, the contents of x can be destroyed; the default is False. See the notes below for more details. + + workers: int + Maximum number of workers to use for parallel computation. If negative, the value wraps around from os.cpu_count(). Defualt is None. + + + use_columns: Set + A set of column indices to force primitive to operate on. If any specified column cannot be parsed, it is skipped. + + exclude_columns: Set + A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided. + + return_result: Enumeration + Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false. + + use_semantic_types: Bool + Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe. + + add_index_columns: Bool + Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\". + + error_on_no_input: Bool( + Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. 
To prevent pipelines from breaking set this to False. + + return_semantic_type: Enumeration[str]( + Decides what semantic type to attach to generated attributes' + """ + + __author__ = "Data Lab" + metadata = metadata_base.PrimitiveMetadata( + { + '__author__' : "DATA Lab at Texas A&M University", + 'name': "Fast Fourier Transform", + 'python_path': 'd3m.primitives.tods.feature_analysis.fast_fourier_transform', + 'source': { + 'name': 'DATA Lab at Texas A&M University', + 'contact': 'mailto:khlai037@tamu.edu', + 'uris': [ + 'https://gitlab.com/lhenry15/tods.git', + 'https://gitlab.com/lhenry15/tods/-/blob/purav/anomaly-primitives/anomaly_primitives/FastFourierTransform.py', + ], + }, + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.FAST_FOURIER_TRANSFORM, + ], + 'primitive_family': metadata_base.PrimitiveFamily.FEATURE_CONSTRUCTION, + 'id': '7bd269bc-de7e-47b8-8d6c-0bd46594d3cb', + 'hyperparameters_to_tune':['n','norm','axis'], + 'version': '0.0.1', + }, + ) + + def __init__(self, *, hyperparams: Hyperparams) -> None: + super().__init__(hyperparams=hyperparams) + + self._clf = FFT(n=self.hyperparams['n'], + axis=self.hyperparams['axis'], + overwrite_x=self.hyperparams['overwrite_x'], + norm = self.hyperparams['norm'], + workers = self.hyperparams['workers'] + ) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + """ + + Args: + inputs: Container DataFrame + + Returns: + Container DataFrame added with absolute and phase value in a columns named 'column_name_fft_abs' and 'column_name_fft_phse'. + These values correspnd to the absolute and angle values for a complex number we get as FFT coefficients + + """ + assert isinstance(inputs, container.DataFrame), type(dataframe) + + self._fitted = False + self._training_inputs, self._training_indices = self._get_columns_to_fit(inputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + if len(self._training_indices) > 0: + # self._clf.fit(self._training_inputs) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + + if not self._fitted: + raise PrimitiveNotFittedError("Primitive not fitted.") + + sk_inputs = inputs + if self.hyperparams['use_semantic_types']: + cols = [inputs.columns[x] for x in self._training_indices] + sk_inputs = container.DataFrame(data = inputs.iloc[:, self._training_indices].values,columns = cols, generate_metadata=True) + + output_columns = [] + if len(self._training_indices) > 0: + sk_output = self._clf.produce(sk_inputs) + + if sparse.issparse(sk_output): + sk_output = sk_output.toarray() + outputs = self._wrap_predictions(inputs, sk_output) + # if len(outputs.columns) == len(self._input_column_names): + # outputs.columns = self._input_column_names + output_columns = [outputs] + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._training_indices, + columns_list=output_columns) + + + return base.CallResult(outputs) + + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + """ + Select columns to fit. 
+ Args: + inputs: Container DataFrame + hyperparams: d3m.metadata.hyperparams.Hyperparams + + Returns: + list + """ + + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + # return inputs, list(hyperparams['use_columns']) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=hyperparams['use_columns'], + exclude_columns=hyperparams['exclude_columns'], + can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, + hyperparams: Hyperparams) -> bool: + """ + Output whether a column can be processed. + Args: + inputs_metadata: d3m.metadata.base.DataMetadata + column_index: int + + Returns: + bool + """ + + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + accepted_structural_types = (int, float, np.integer, np.float64,str) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + print(column_index, "does not match the structural_type requirements in metadata. Skipping column") + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + + # print("length sematic type",len(semantic_types)) + + # returing true for testing purposes for custom dataframes + return True; + + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + print(semantic_types) + return False + + + @classmethod + def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]: + """ + Output metadata of selected columns. + Args: + outputs_metadata: metadata_base.DataMetadata + hyperparams: d3m.metadata.hyperparams.Hyperparams + + Returns: + d3m.metadata.base.DataMetadata + """ + + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) + + # Update semantic types and prepare it for predicted targets. + semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set([]) + add_semantic_types = [] + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + """ + Updata metadata for selected columns. 
+ Args: + inputs_metadata: metadata_base.DataMetadata + outputs: Container Dataframe + target_columns_metadata: list + + Returns: + d3m.metadata.base.DataMetadata + """ + + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + """ + Wrap predictions into dataframe + Args: + inputs: Container Dataframe + predictions: array-like data (n_samples, n_features) + + Returns: + Dataframe + """ + + outputs = container.DataFrame(predictions, generate_metadata=True) + target_columns_metadata = self._add_target_columns_metadata(outputs.metadata,self.hyperparams) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, target_columns_metadata) + # print(outputs.metadata.to_internal_simple_structure()) + + return outputs + + + @classmethod + def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams): + """ + Add target columns metadata + Args: + outputs_metadata: metadata.base.DataMetadata + hyperparams: d3m.metadata.hyperparams.Hyperparams + Returns: + List[OrderedDict] + """ + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + # column_name = "output_{}".format(column_index) + column_metadata = OrderedDict() + semantic_types = set() + semantic_types.add(hyperparams["return_semantic_type"]) + column_metadata['semantic_types'] = list(semantic_types) + # column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + return target_columns_metadata + + +FastFourierTransform.__doc__ = FastFourierTransform.__doc__ + + + diff --git a/tods/feature_analysis/HPFilter.py b/tods/feature_analysis/HPFilter.py new file mode 100644 index 0000000..ba26d5e --- /dev/null +++ b/tods/feature_analysis/HPFilter.py @@ -0,0 +1,353 @@ +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os +import sklearn +import numpy +import typing +import time + +from d3m import container +from d3m.primitive_interfaces import base, transformer +from d3m.metadata import base as metadata_base, hyperparams + +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult, DockerContainer + + +import statsmodels.api as sm + +__all__ = ('HPFilter',) + +Inputs = container.DataFrame +Outputs = container.DataFrame + + +class Hyperparams(hyperparams.Hyperparams): + # Tuning + lamb = hyperparams.UniformInt( + lower=0, + upper=100000000, + default=1600, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="The Hodrick-Prescott smoothing parameter. A value of 1600 is suggested for quarterly data. 
Ravn and Uhlig suggest using a value of 6.25 (1600/4**4) for annual data and 129600 (1600*3**4) for monthly data.", + ) + + # Control + # columns_using_method= hyperparams.Enumeration( + # values=['name', 'index'], + # default='index', + # semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + # description="Choose to use columns by names or indecies. If 'name', \"use_columns\" or \"exclude_columns\" is used. If 'index', \"use_columns_name\" or \"exclude_columns_name\" is used." + # ) + # use_columns_name = hyperparams.Set( + # elements=hyperparams.Hyperparameter[str](''), + # default=(), + # semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + # description="A set of column names to force primitive to operate on. If any specified column cannot be parsed, it is skipped.", + # ) + # exclude_columns_name = hyperparams.Set( + # elements=hyperparams.Hyperparameter[str](''), + # default=(), + # semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + # description="A set of column names to not operate on. Applicable only if \"use_columns_name\" is not provided.", + # ) + use_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to operate on. If any specified column cannot be parsed, it is skipped.", + ) + exclude_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='append', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. 
To prevent pipelines from breaking, set this to False.",
+ )
+
+ return_semantic_type = hyperparams.Enumeration[str](
+ values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute'],
+ default='https://metadata.datadrivendiscovery.org/types/Attribute',
+ description='Decides what semantic type to attach to generated attributes',
+ semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter']
+ )
+
+
+class HPFilter(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]):
+ """
+ Filter a time series using the Hodrick-Prescott filter.
+
+ Parameters
+ ----------
+ lamb: int
+ The Hodrick-Prescott smoothing parameter. A value of 1600 is suggested for quarterly data. Ravn and Uhlig suggest using a value of 6.25 (1600/4**4) for annual data and 129600 (1600*3**4) for monthly data.
+
+ use_columns: Set
+ A set of column indices to force the primitive to operate on. If any specified column cannot be parsed, it is skipped.
+
+ exclude_columns: Set
+ A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.
+
+ return_result: Enumeration
+ Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.
+
+ use_semantic_types: Bool
+ Controls whether semantic_types metadata will be used for filtering columns in the input dataframe. Setting this to false makes the code ignore return_result and produce only the output dataframe.
+
+ add_index_columns: Bool
+ Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".
+
+ error_on_no_input: Bool
+ Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking, set this to False.
+
+ return_semantic_type: Enumeration[str]
+ Decides what semantic type to attach to generated attributes.
+ """
+
+ __author__ = "DATA Lab at Texas A&M University"
+ metadata = metadata_base.PrimitiveMetadata({
+ "name": "Hodrick-Prescott filter Primitive",
+ "python_path": "d3m.primitives.tods.feature_analysis.hp_filter",
+ "source": {'name': 'DATA Lab at Texas A&M University', 'contact': 'mailto:khlai037@tamu.edu',
+ 'uris': ['https://gitlab.com/lhenry15/tods.git', 'https://gitlab.com/lhenry15/tods/-/blob/Junjie/anomaly-primitives/anomaly_primitives/DuplicationValidation.py']},
+ "algorithm_types": [metadata_base.PrimitiveAlgorithmType.HP_FILTER,],
+ "primitive_family": metadata_base.PrimitiveFamily.FEATURE_CONSTRUCTION,
+ "id": "3af1be06-e45e-4ead-8523-4373264598e4",
+ "hyperparams_to_tune": ['lamb'],
+ "version": "0.0.1",
+ })
+
+
+ def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
+ """
+ Process the testing data.
+ Args:
+ inputs: Container DataFrame.
+
+ Returns:
+ Container DataFrame after HPFilter.
+ """
+ # Get cols to fit.
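+ # The rest of produce() selects the numeric columns to operate on and applies the
+ # Hodrick-Prescott filter from statsmodels to each selected column, emitting a
+ # `<col>_cycle` and a `<col>_trend` output column per input column. A minimal
+ # standalone sketch of the same decomposition (illustrative only; `series` is an
+ # assumed placeholder, not part of this primitive):
+ #
+ #     import pandas as pd
+ #     import statsmodels.api as sm
+ #
+ #     series = pd.Series(range(100), dtype=float)   # placeholder quarterly data
+ #     cycle, trend = sm.tsa.filters.hpfilter(series, lamb=1600)
+ #
+ # `trend` is the smoothed long-run component and `cycle` is the deviation from it;
+ # lamb=1600 is the conventional smoothing value for quarterly data.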
+ self._fitted = False + self._training_inputs, self._training_indices = self._get_columns_to_fit(inputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + + if len(self._training_indices) > 0: + # self._clf.fit(self._training_inputs) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + + + if not self._fitted: + raise PrimitiveNotFittedError("Primitive not fitted.") + sk_inputs = inputs + if self.hyperparams['use_semantic_types']: + sk_inputs = inputs.iloc[:, self._training_indices] + output_columns = [] + if len(self._training_indices) > 0: + sk_output = self._hpfilter(sk_inputs, lamb=self.hyperparams['lamb']) + if sparse.issparse(sk_output): + sk_output = sk_output.toarray() + outputs = self._wrap_predictions(inputs, sk_output) + + if len(outputs.columns) == len(self._input_column_names): + outputs.columns = self._input_column_names + output_columns = [outputs] + + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._training_indices, + columns_list=output_columns) + + # self._write(outputs) + # self.logger.warning('produce was called3') + return CallResult(outputs) + + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + """ + Select columns to fit. + Args: + inputs: Container DataFrame + hyperparams: d3m.metadata.hyperparams.Hyperparams + + Returns: + list + """ + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + use_columns = [] + exclude_columns = [] + + # if hyperparams['columns_using_method'] == 'name': + # inputs_cols = inputs.columns.values.tolist() + # for i in range(len(inputs_cols)): + # if inputs_cols[i] in hyperparams['use_columns_name']: + # use_columns.append(i) + # elif inputs_cols[i] in hyperparams['exclude_columns_name']: + # exclude_columns.append(i) + # else: + use_columns=hyperparams['use_columns'] + exclude_columns=hyperparams['exclude_columns'] + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, use_columns=use_columns, exclude_columns=exclude_columns, can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: + """ + Output whether a column can be processed. 
+ Args: + inputs_metadata: d3m.metadata.base.DataMetadata + column_index: int + + Returns: + bool + """ + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, numpy.integer, numpy.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + """ + Updata metadata for selected columns. + Args: + inputs_metadata: metadata_base.DataMetadata + outputs: Container Dataframe + target_columns_metadata: list + + Returns: + d3m.metadata.base.DataMetadata + """ + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + """ + Wrap predictions into dataframe + Args: + inputs: Container Dataframe + predictions: array-like data (n_samples, n_features) + + Returns: + Dataframe + """ + outputs = d3m_dataframe(predictions, generate_metadata=True) + target_columns_metadata = self._add_target_columns_metadata(outputs.metadata, self.hyperparams) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, target_columns_metadata) + return outputs + + + @classmethod + def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams): + """ + Add target columns metadata + Args: + outputs_metadata: metadata.base.DataMetadata + hyperparams: d3m.metadata.hyperparams.Hyperparams + + Returns: + List[OrderedDict] + """ + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + # column_name = "output_{}".format(column_index) + column_metadata = OrderedDict() + semantic_types = set() + semantic_types.add(hyperparams["return_semantic_type"]) + column_metadata['semantic_types'] = list(semantic_types) + + # column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + def _write(self, inputs:Inputs): + inputs.to_csv(str(time.time())+'.csv') + + def _hpfilter(self, X, lamb): + """ + Perform HPFilter + Args: + X: slected rows to be performed + K, low, high: Parameters of HPFilter + + Returns: + Dataframe, results of HPFilter + """ + transformed_X = utils.pandas.DataFrame() + for col in X.columns: + cycle, trend = sm.tsa.filters.hpfilter(X[col], lamb=lamb) + transformed_X[col+"_cycle"] = cycle + transformed_X[col+"_trend"] = trend + + return transformed_X diff --git a/tods/feature_analysis/NonNegativeMatrixFactorization.py 
b/tods/feature_analysis/NonNegativeMatrixFactorization.py new file mode 100644 index 0000000..afbb881 --- /dev/null +++ b/tods/feature_analysis/NonNegativeMatrixFactorization.py @@ -0,0 +1,523 @@ +from d3m import container, utils as d3m_utils +from d3m.base import utils as base_utils +from d3m.metadata import base as metadata_base, hyperparams +from d3m.primitive_interfaces import base, transformer +from d3m.primitive_interfaces.base import CallResult, DockerContainer + +from typing import cast, Dict, List, Union, Sequence, Optional, Tuple +from collections import OrderedDict +from scipy import sparse + +import nimfa +import pandas as pd +import numpy +from numpy import ndarray +import warnings + + + +__all__ = ('NonNegativeMatrixFactorization',) + +Inputs = container.DataFrame +Outputs = container.DataFrame + + +class Hyperparams(hyperparams.Hyperparams): + + rank = hyperparams.Hyperparameter[int]( + default=30, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="The factorization rank to achieve. Default is 30.", + ) + + seed = hyperparams.Enumeration( + values=['nndsvd','random_c','random_vcol','random','fixed'], + default='random', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="""Method to seed the computation of a factorization""", + ) + + W = hyperparams.Union( + configuration=OrderedDict({ + 'ndarray': hyperparams.Hyperparameter[ndarray]( + default=numpy.array([]), + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'none': hyperparams.Constant( + default=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='none', + description='Score weight by dimensions. If None, [1,1,...,1] will be used.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + + + H = hyperparams.Union( + configuration=OrderedDict({ + 'ndarray': hyperparams.Hyperparameter[ndarray]( + default=numpy.array([]), + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'none': hyperparams.Constant( + default=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='none', + description='Score weight by dimensions. If None, [1,1,...,1] will be used.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + + update = hyperparams.Enumeration( + values=['euclidean','divergence'], + default='euclidean', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="""Type of update equations used in factorization. When specifying model parameter update can be assigned to:" + 'euclidean' for classic Euclidean distance update equations," + 'divergence' for divergence update equations." + + By default Euclidean update equations are used.""", + ) + + + objective = hyperparams.Enumeration( + values=['fro','div','conn'], + default='fro', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="""Type of objective function used in factorization. When specifying model parameter :param:`objective` can be assigned to: + + ‘fro’ for standard Frobenius distance cost function, + ‘div’ for divergence of target matrix from NMF estimate cost function (KL), + ‘conn’ for measuring the number of consecutive iterations in which the connectivity matrix has not changed. 
+ + By default the standard Frobenius distance cost function is used.""", + ) + + max_iter = hyperparams.Hyperparameter[int]( + default=30, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Maximum number of factorization iterations. Note that the number of iterations depends on the speed of method convergence. Default is 30.", + ) + + learning_rate = hyperparams.Union[Union[float, None]]( + configuration=OrderedDict( + limit=hyperparams.Bounded[float]( + lower=0, + upper=None, + default=0.01, + ), + unlimited=hyperparams.Constant( + default=None, + description='If nothing is give as a paramter', + ), + ), + default='unlimited', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Minimal required improvement of the residuals from the previous iteration. They are computed between the target matrix and its MF estimate using the objective function associated to the MF algorithm. Default is None.", + ) + + + # parameters for column + use_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(2,3), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to operate on. If any specified column cannot be parsed, it is skipped.", + ) + exclude_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='new', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. 
To prevent pipelines from breaking set this to False.", + ) + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', + 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute'], + default='https://metadata.datadrivendiscovery.org/types/Attribute', + description='Decides what semantic type to attach to generated attributes', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + + + +class NMF: + def __init__(self, rank,W,H,seed,update,objective,max_iter,learning_rate): + self._rank = rank + self._seed = seed + self._W = W, + self._H = H, + self._update = update + self._objective = objective + self._max_iter = max_iter + self._learning_rate = learning_rate + + def produce(self, inputs): + + warnings.filterwarnings("ignore") # for removing warnings thrown by nimfa + # for testing + # a = numpy.array([[1,0,1,0,1],[1,0,1,0,1],[1,0,1,0,1]]) + # b = numpy.array([[1,0],[1,0],[1,0],[1,0],[1,0]]) + # print(type(a)) + # print(type(self._W[0])) + nmf = nimfa.Nmf(V = numpy.array(inputs.values), + seed=self._seed, + W=self._W[0], + H=self._H[0], + rank=self._rank, + update = self._update, + objective=self._objective, + min_residuals=self._learning_rate + ) + nmf_fit = nmf() + W = nmf_fit.basis() + H = nmf_fit.coef() + + column_names = ['row_latent_vector_'+str(i) for i in range(self._rank)] + W = pd.DataFrame(data = W,columns = column_names) + # print(type(W)) + + #TODO: Column latent vector + column_names = ['column_latent_vector_'+str(i) for i in range(inputs.shape[1])] + H = pd.DataFrame(data = H,columns = column_names) + + W.reset_index(drop=True, inplace=True) + H.reset_index(drop=True, inplace=True) + result = pd.concat([W, H], axis=1) + # print(result.head(10)) + return result + + +class NonNegativeMatrixFactorization(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): + """ + Calculates Latent factors of a given matrix of timeseries data + + Parameters + ---------- + rank: int + The factorization rank to achieve. Default is 30. + + update: str + Type of update equations used in factorization. When specifying model parameter update can be assigned to:" + 'euclidean' for classic Euclidean distance update equations," + 'divergence' for divergence update equations." + + By default Euclidean update equations are used. + + objective: str + Type of objective function used in factorization. When specifying model parameter :param:`objective` can be assigned to: + + ‘fro’ for standard Frobenius distance cost function, + ‘div’ for divergence of target matrix from NMF estimate cost function (KL), + ‘conn’ for measuring the number of consecutive iterations in which the connectivity matrix has not changed. + + By default the standard Frobenius distance cost function is used. + + max_iter: int + Maximum number of factorization iterations. Note that the number of iterations depends on the speed of method convergence. Default is 30. + + learning_rate: float + Minimal required improvement of the residuals from the previous iteration. They are computed between the target matrix and its MF estimate using the objective function associated to the MF algorithm. Default is None. + + + use_columns: Set + A set of column indices to force primitive to operate on. If any specified column cannot be parsed, it is skipped. + + exclude_columns: Set + A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided. 
+ + return_result: Enumeration + Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false. + + use_semantic_types: Bool + Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe. + + add_index_columns: Bool + Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\". + + error_on_no_input: Bool( + Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False. + + return_semantic_type: Enumeration[str]( + Decides what semantic type to attach to generated attributes' + """ + + + + __author__ = "Data Lab" + metadata = metadata_base.PrimitiveMetadata( + { + '__author__' : "DATA Lab at Texas A&M University", + 'name': "Fast Fourier Transform", + 'python_path': 'd3m.primitives.tods.feature_analysis.non_negative_matrix_factorization', + 'source': { + 'name': 'DATA Lab at Texas A&M University', + 'contact': 'mailto:khlai037@tamu.edu', + 'uris': [ + 'https://gitlab.com/lhenry15/tods.git', + 'https://gitlab.com/lhenry15/tods/-/blob/purav/anomaly-primitives/anomaly_primitives/NonNegativeMatrixFactorization.py', + ], + }, + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.NON_NEGATIVE_MATRIX_FACTORIZATION, + ], + 'primitive_family': metadata_base.PrimitiveFamily.FEATURE_CONSTRUCTION, + 'id': 'c7259da6-7ce6-42ad-83c6-15238679f5fa', + 'hyperparameters_to_tune':['rank','update','objective','max_iter','learning_rate'], + 'version': '0.0.1', + }, + ) + + def __init__(self, *, hyperparams: Hyperparams) -> None: + super().__init__(hyperparams=hyperparams) + + + + self._clf = NMF(rank=self.hyperparams['rank'], + seed=self.hyperparams['seed'], + W=self.hyperparams['W'], + H=self.hyperparams['H'], + objective=self.hyperparams['objective'], + update=self.hyperparams['update'], + max_iter=self.hyperparams['max_iter'], + learning_rate = self.hyperparams['learning_rate'], + ) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + + assert isinstance(inputs, container.DataFrame), type(dataframe) + + self._fitted = False + self._training_inputs, self._training_indices = self._get_columns_to_fit(inputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + if len(self._training_indices) > 0: + # self._clf.fit(self._training_inputs) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + + if not self._fitted: + raise PrimitiveNotFittedError("Primitive not fitted.") + + sk_inputs = inputs + if self.hyperparams['use_semantic_types']: + cols = [inputs.columns[x] for x in self._training_indices] + sk_inputs = container.DataFrame(data = inputs.iloc[:, self._training_indices].values,columns = cols, generate_metadata=True) + + output_columns = [] + if len(self._training_indices) > 0: + sk_output = self._clf.produce(sk_inputs) + + if sparse.issparse(sk_output): + sk_output = sk_output.toarray() + outputs = self._wrap_predictions(inputs, sk_output) + # if len(outputs.columns) == len(self._input_column_names): + # outputs.columns = self._input_column_names + output_columns = [outputs] + else: + 
if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._training_indices, + columns_list=output_columns) + + + return base.CallResult(outputs) + + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + """ + Select columns to fit. + Args: + inputs: Container DataFrame + hyperparams: d3m.metadata.hyperparams.Hyperparams + + Returns: + list + """ + + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + # return inputs, list(hyperparams['use_columns']) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=hyperparams['use_columns'], + exclude_columns=hyperparams['exclude_columns'], + can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, + hyperparams: Hyperparams) -> bool: + """ + Output whether a column can be processed. + Args: + inputs_metadata: d3m.metadata.base.DataMetadata + column_index: int + + Returns: + bool + """ + + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + accepted_structural_types = (int, float, numpy.integer, numpy.float64,str) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + print(column_index, "does not match the structural_type requirements in metadata. Skipping column") + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + + # print("length sematic type",len(semantic_types)) + + # returing true for testing purposes for custom dataframes + return True; + + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + print(semantic_types) + return False + + + @classmethod + def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]: + """ + Output metadata of selected columns. + Args: + outputs_metadata: metadata_base.DataMetadata + hyperparams: d3m.metadata.hyperparams.Hyperparams + + Returns: + d3m.metadata.base.DataMetadata + """ + + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) + + # Update semantic types and prepare it for predicted targets. 
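+            # Keep the column's existing semantic types (nothing is listed in
+            # semantic_types_to_remove) and attach the semantic type chosen by
+            # the 'return_semantic_type' hyperparameter before writing the
+            # metadata back to the column.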
+ semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set([]) + add_semantic_types = [] + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + """ + Updata metadata for selected columns. + Args: + inputs_metadata: metadata_base.DataMetadata + outputs: Container Dataframe + target_columns_metadata: list + + Returns: + d3m.metadata.base.DataMetadata + """ + + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + """ + Wrap predictions into dataframe + Args: + inputs: Container Dataframe + predictions: array-like data (n_samples, n_features) + + Returns: + Dataframe + """ + + outputs = container.DataFrame(predictions, generate_metadata=True) + target_columns_metadata = self._add_target_columns_metadata(outputs.metadata,self.hyperparams) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, target_columns_metadata) + # print(outputs.metadata.to_internal_simple_structure()) + + return outputs + + + @classmethod + def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams): + """ + Add target columns metadata + Args: + outputs_metadata: metadata.base.DataMetadata + hyperparams: d3m.metadata.hyperparams.Hyperparams + Returns: + List[OrderedDict] + """ + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + # column_name = "output_{}".format(column_index) + column_metadata = OrderedDict() + semantic_types = set() + semantic_types.add(hyperparams["return_semantic_type"]) + column_metadata['semantic_types'] = list(semantic_types) + # column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + return target_columns_metadata + +NonNegativeMatrixFactorization.__doc__ = NonNegativeMatrixFactorization.__doc__ diff --git a/tods/feature_analysis/SKTruncatedSVD.py b/tods/feature_analysis/SKTruncatedSVD.py new file mode 100644 index 0000000..2b6f038 --- /dev/null +++ b/tods/feature_analysis/SKTruncatedSVD.py @@ -0,0 +1,510 @@ +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os +import sklearn +import numpy +import typing +import time + +# Custom import commands if any +from sklearn.decomposition.truncated_svd import TruncatedSVD + + +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base import utils as base_utils +from d3m.exceptions import 
PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult, DockerContainer +from d3m.primitive_interfaces.unsupervised_learning import UnsupervisedLearnerPrimitiveBase + + +Inputs = d3m_dataframe +Outputs = d3m_dataframe + +__all__ = ('SKTruncatedSVD',) + +class PrimitiveCount: + primitive_no = 0 + +class Params(params.Params): + components_: Optional[ndarray] + explained_variance_ratio_: Optional[ndarray] + explained_variance_: Optional[ndarray] + singular_values_: Optional[ndarray] + input_column_names: Optional[Any] + target_names_: Optional[Sequence[Any]] + training_indices_: Optional[Sequence[int]] + target_column_indices_: Optional[Sequence[int]] + target_columns_metadata_: Optional[List[OrderedDict]] + + +class Hyperparams(hyperparams.Hyperparams): + n_components = hyperparams.Bounded[int]( + default=2, + lower=0, + upper=None, + description='Desired dimensionality of output data. Must be strictly less than the number of features. The default value is useful for visualisation. For LSA, a value of 100 is recommended.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + algorithm = hyperparams.Choice( + choices={ + 'randomized': hyperparams.Hyperparams.define( + configuration=OrderedDict({ + 'n_iter': hyperparams.Bounded[int]( + default=5, + lower=0, + upper=None, + description='Number of iterations for randomized SVD solver. Not used in arpack', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + }) + ), + 'arpack': hyperparams.Hyperparams.define( + configuration=OrderedDict({ + 'tol': hyperparams.Bounded[float]( + default=0, + lower=0, + upper=None, + description='Tolerance for ARPACK. 0 means machine precision. Ignored by randomized SVD solver.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + }) + ) + }, + default='randomized', + description='SVD solver to use. Either "arpack" for the ARPACK wrapper in SciPy (scipy.sparse.linalg.svds), or "randomized" for the randomized algorithm due to Halko (2009).', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + use_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to operate on. If any specified column cannot be parsed, it is skipped.", + ) + exclude_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='append', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. 
Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute'], + default='https://metadata.datadrivendiscovery.org/types/Attribute', + description='Decides what semantic type to attach to generated attributes', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + +class SKTruncatedSVD(UnsupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]): + """ + Primitive wrapping for sklearn TruncatedSVD + `sklearn documentation `_ + + Parameters + ---------- + n_components: int + Desired dimensionality of output data. Must be strictly less than the number of features. The default value is useful for visualisation. For LSA, a value of 100 is recommended. + + algorithm: hyperparams.Choice + SVD solver to use. Either "arpack" for the ARPACK wrapper in SciPy (scipy.sparse.linalg.svds), or "randomized" for the randomized algorithm due to Halko (2009). + + use_columns: Set + A set of column indices to force primitive to operate on. If any specified column cannot be parsed, it is skipped. + + exclude_columns: Set + A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided. + + return_result: Enumeration + Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false. + + use_semantic_types: Bool + Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe. + + add_index_columns: Bool + Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\". + + error_on_no_input: Bool( + Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False. 
+ + return_semantic_type: Enumeration[str]( + Decides what semantic type to attach to generated attributes' + """ + + __author__: "DATA Lab at Texas A&M University" + metadata = metadata_base.PrimitiveMetadata({ + "name": "Truncated SVD", + "python_path": "d3m.primitives.tods.feature_analysis.truncated_svd", + "source": {'name': 'DATA Lab at Texas A&M University', 'contact': 'mailto:khlai037@tamu.edu', + 'uris': ['https://gitlab.com/lhenry15/tods.git', 'https://gitlab.com/lhenry15/tods/-/blob/Junjie/anomaly-primitives/anomaly_primitives/SKTruncatedSVD.py']}, + "algorithm_types": [metadata_base.PrimitiveAlgorithmType.SINGULAR_VALUE_DECOMPOSITION, ], + "primitive_family": metadata_base.PrimitiveFamily.FEATURE_CONSTRUCTION, + "id": "9231fde3-7322-3c41-b4cf-d00a93558c44", + "hyperparams_to_tune": ['n_components', 'algorithm', 'use_columns', 'exclude_columns', 'return_result', 'use_semantic_types', 'add_index_columns', 'error_on_no_input', 'return_semantic_type'], + "version": "0.0.1", + }) + + def __init__(self, *, + hyperparams: Hyperparams, + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None) -> None: + + super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) + + # False + self._clf = TruncatedSVD( + n_components=self.hyperparams['n_components'], + algorithm=self.hyperparams['algorithm']['choice'], + n_iter=self.hyperparams['algorithm'].get('n_iter', 5), + tol=self.hyperparams['algorithm'].get('tol', 0), + random_state=self.random_seed, + ) + + self.primitiveNo = PrimitiveCount.primitive_no + PrimitiveCount.primitive_no += 1 + + + + self._inputs = None + self._outputs = None + self._training_inputs = None + self._training_outputs = None + self._target_names = None + self._training_indices = None + self._target_column_indices = None + self._target_columns_metadata: List[OrderedDict] = None + self._input_column_names = None + self._fitted = False + + + def set_training_data(self, *, inputs: Inputs) -> None: + """ + Set training data for SKTruncatedSVD. + Args: + inputs: Container DataFrame + + Returns: + None + """ + # self.logger.warning('set was called!') + self._inputs = inputs + self._fitted = False + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + """ + Fit model with training data. + Args: + *: Container DataFrame. Time series data up to fit. + + Returns: + None + """ + if self._fitted: + return CallResult(None) + + # Get cols to fit. + self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + # If there is no cols to fit, return None + if self._training_inputs is None: + return CallResult(None) + + # Call SVD in sklearn and set _fitted to true + if len(self._training_indices) > 0: + self._clf.fit(self._training_inputs) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + return CallResult(None) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + """ + Process the testing data. + Args: + inputs: Container DataFrame. + + Returns: + Container DataFrame after Truncated SVD. 
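+
+            Example (illustrative sketch only; assumes ``df`` is a d3m container
+            DataFrame with more than two numeric columns and that default
+            hyperparameters are used):
+
+            >>> svd = SKTruncatedSVD(hyperparams=Hyperparams.defaults())
+            >>> svd.set_training_data(inputs=df)
+            >>> svd.fit()
+            >>> outputs = svd.produce(inputs=df).value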
+ """ + # self.logger.warning(str(self.metadata.query()['name'])) + + + if not self._fitted: + raise PrimitiveNotFittedError("Primitive not fitted.") + sk_inputs = inputs + if self.hyperparams['use_semantic_types']: + sk_inputs = inputs.iloc[:, self._training_indices] + output_columns = [] + if len(self._training_indices) > 0: + sk_output = self._clf.transform(sk_inputs) + if sparse.issparse(sk_output): + sk_output = sk_output.toarray() + + outputs = self._wrap_predictions(inputs, sk_output) + if len(outputs.columns) == len(self._input_column_names): + outputs.columns = self._input_column_names + output_columns = [outputs] + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._training_indices, + columns_list=output_columns) + + # self._write(outputs) + # self.logger.warning('produce was called!') + return CallResult(outputs) + + + def get_params(self) -> Params: + """ + Return parameters. + Args: + None + + Returns: + class Params + """ + if not self._fitted: + return Params( + components_=None, + explained_variance_ratio_=None, + explained_variance_=None, + singular_values_=None, + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + return Params( + components_=getattr(self._clf, 'components_', None), + explained_variance_ratio_=getattr(self._clf, 'explained_variance_ratio_', None), + explained_variance_=getattr(self._clf, 'explained_variance_', None), + singular_values_=getattr(self._clf, 'singular_values_', None), + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + def set_params(self, *, params: Params) -> None: + """ + Set parameters for SKTruncatedSVD. + Args: + params: class Params + + Returns: + None + """ + self._clf.components_ = params['components_'] + self._clf.explained_variance_ratio_ = params['explained_variance_ratio_'] + self._clf.explained_variance_ = params['explained_variance_'] + self._clf.singular_values_ = params['singular_values_'] + self._input_column_names = params['input_column_names'] + self._training_indices = params['training_indices_'] + self._target_names = params['target_names_'] + self._target_column_indices = params['target_column_indices_'] + self._target_columns_metadata = params['target_columns_metadata_'] + + if params['components_'] is not None: + self._fitted = True + if params['explained_variance_ratio_'] is not None: + self._fitted = True + if params['explained_variance_'] is not None: + self._fitted = True + if params['singular_values_'] is not None: + self._fitted = True + + + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + """ + Select columns to fit. 
+ Args: + inputs: Container DataFrame + hyperparams: d3m.metadata.hyperparams.Hyperparams + + Returns: + list + """ + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=hyperparams['use_columns'], + exclude_columns=hyperparams['exclude_columns'], + can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: + """ + Output whether a column can be processed. + Args: + inputs_metadata: d3m.metadata.base.DataMetadata + column_index: int + + Returns: + bool + """ + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, numpy.integer, numpy.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + + @classmethod + def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]: + """ + Output metadata of selected columns. + Args: + outputs_metadata: metadata_base.DataMetadata + hyperparams: d3m.metadata.hyperparams.Hyperparams + + Returns: + d3m.metadata.base.DataMetadata + """ + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) + + # Update semantic types and prepare it for predicted targets. + semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set([]) + add_semantic_types = [] + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + """ + Updata metadata for selected columns. 
+ Args: + inputs_metadata: metadata_base.DataMetadata + outputs: Container Dataframe + target_columns_metadata: list + + Returns: + d3m.metadata.base.DataMetadata + """ + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + """ + Wrap predictions into dataframe + Args: + inputs: Container Dataframe + predictions: array-like data (n_samples, n_features) + + Returns: + Dataframe + """ + outputs = d3m_dataframe(predictions, generate_metadata=True) + target_columns_metadata = self._add_target_columns_metadata(outputs.metadata, self.hyperparams, self.primitiveNo) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, target_columns_metadata) + return outputs + + + @classmethod + def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams, primitiveNo): + """ + Add target columns metadata + Args: + outputs_metadata: metadata.base.DataMetadata + hyperparams: d3m.metadata.hyperparams.Hyperparams + + Returns: + List[OrderedDict] + """ + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_name = "{0}{1}_{2}".format(cls.metadata.query()['name'], primitiveNo, column_index) + column_metadata = OrderedDict() + semantic_types = set() + semantic_types.add(hyperparams["return_semantic_type"]) + column_metadata['semantic_types'] = list(semantic_types) + + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + def _write(self, inputs:Inputs): + """ + write inputs to current directory, only for test + """ + inputs.to_csv(str(time.time())+'.csv') + + +# SKTruncatedSVD.__doc__ = TruncatedSVD.__doc__ diff --git a/tods/feature_analysis/SpectralResidualTransform.py b/tods/feature_analysis/SpectralResidualTransform.py new file mode 100644 index 0000000..a68b8c7 --- /dev/null +++ b/tods/feature_analysis/SpectralResidualTransform.py @@ -0,0 +1,364 @@ +import os +from typing import Any,Optional,List +import statsmodels.api as sm +import numpy as np +from d3m import container, utils as d3m_utils +from d3m import utils + +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os + +import numpy +import typing +import time + +from d3m import container +from d3m.primitive_interfaces import base, transformer + +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base + +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError + +__all__ = ('SpectralResidualTransformPrimitive',) + +Inputs = container.DataFrame +Outputs = container.DataFrame + +class Params(params.Params): + #to-do : how to make params dynamic + use_column_names: Optional[Any] + + + +class Hyperparams(hyperparams.Hyperparams): + + #Tuning Parameter + + avg_filter_dimension = hyperparams.Hyperparameter(default=3, semantic_types=[ + 'https://metadata.datadrivendiscovery.org/types/TuningParameter', + ], description="Spectral Residual average filter dimension") + #control parameter + use_columns = 
hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to operate on. If any specified column cannot be parsed, it is skipped.", + ) + exclude_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='append', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. 
To prevent pipelines from breaking set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', + 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute'], + default='https://metadata.datadrivendiscovery.org/types/Attribute', + description='Decides what semantic type to attach to generated attributes', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + + + +class SpectralResidualTransformPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): + """ + Primitive to find Spectral Residual Transform of time series + """ + __author__ = "DATA Lab at Texas A&M University", + metadata = metadata_base.PrimitiveMetadata( + { + 'id': '88dda04b-090b-49a5-8035-279eb3be9cd9', + 'version': '0.1.0', + 'name': 'Time Series Spectral Residual', + 'python_path': 'd3m.primitives.tods.feature_analysis.spectral_residual_transform', + 'keywords': ['Time Series','FFT'], + "hyperparams_to_tune": ['avg_filter_dimension'], + 'source': { + 'name': 'DATA Lab at Texas A&M University', + 'uris': ['https://gitlab.com/lhenry15/tods.git','https://gitlab.com/lhenry15/tods/-/blob/devesh/tods/feature_analysis/SpectralResidualTransform.py'], + 'contact': 'mailto:khlai037@tamu.edu' + + }, + 'installation': [ + {'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/lhenry15/tods.git@{git_commit}#egg=TODS'.format( + git_commit=d3m_utils.current_git_commit(os.path.dirname(__file__)), + ), + } + + ], + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.DATA_PROFILING, + ], + 'primitive_family': metadata_base.PrimitiveFamily.FEATURE_CONSTRUCTION, + + } + ) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + """ + + Args: + inputs: Container DataFrame + timeout: Default + iterations: Default + + Returns: + Container DataFrame containing Spectral Residual Transform of time series + """ + self.logger.info('Spectral Residual Primitive called') + + # Get cols to fit. 
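+        # Flow of this method: select the columns to operate on, apply the
+        # spectral residual transform to those columns, wrap the result with
+        # generated metadata, and merge it back with the input according to the
+        # 'return_result' hyperparameter.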
+ self._fitted = False + self._training_inputs, self._training_indices = self._get_columns_to_fit(inputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + if len(self._training_indices) > 0: + # self._clf.fit(self._training_inputs) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + if not self._fitted: + raise PrimitiveNotFittedError("Primitive not fitted.") + spectral_residual_input = inputs + if self.hyperparams['use_semantic_types']: + spectral_residual_input = inputs.iloc[:, self._training_indices] + output_columns = [] + if len(self._training_indices) > 0: + spectral_residual_output = self._spectral_residual_transform(spectral_residual_input,self.hyperparams["avg_filter_dimension"]) + + if sparse.issparse(spectral_residual_output): + spectral_residual_output = spectral_residual_output.toarray() + outputs = self._wrap_predictions(inputs, spectral_residual_output) + + #if len(outputs.columns) == len(self._input_column_names): + # outputs.columns = self._input_column_names + + output_columns = [outputs] + + + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._training_indices, + columns_list=output_columns) + + self.logger.info('Spectral Residual Primitive returned') + + return base.CallResult(outputs) + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + """ + Select columns to fit. + Args: + inputs: Container DataFrame + hyperparams: d3m.metadata.hyperparams.Hyperparams + + Returns: + list + """ + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + use_columns = hyperparams['use_columns'] + exclude_columns = hyperparams['exclude_columns'] + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=use_columns, + exclude_columns=exclude_columns, + can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, + hyperparams: Hyperparams) -> bool: + """ + Output whether a column can be processed. 
+ Args: + inputs_metadata: d3m.metadata.base.DataMetadata + column_index: int + + Returns: + bool + """ + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, numpy.integer, numpy.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + return True + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + """ + Updata metadata for selected columns. + Args: + inputs_metadata: metadata_base.DataMetadata + outputs: Container Dataframe + target_columns_metadata: list + + Returns: + d3m.metadata.base.DataMetadata + """ + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + """ + Wrap predictions into dataframe + Args: + inputs: Container Dataframe + predictions: array-like data (n_samples, n_features) + + Returns: + Dataframe + """ + outputs = d3m_dataframe(predictions, generate_metadata=True) + target_columns_metadata = self._add_target_columns_metadata(outputs.metadata, self.hyperparams) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, target_columns_metadata) + + return outputs + + @classmethod + def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams): + """ + Add target columns metadata + Args: + outputs_metadata: metadata.base.DataMetadata + hyperparams: d3m.metadata.hyperparams.Hyperparams + + Returns: + List[OrderedDict] + """ + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + # column_name = "output_{}".format(column_index) + column_metadata = OrderedDict() + semantic_types = set() + semantic_types.add(hyperparams["return_semantic_type"]) + column_metadata['semantic_types'] = list(semantic_types) + + # column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + def _spectral_residual_transform(self, X,avg_filter_dimension): + """ + This method transform a time series into spectral residual series + :param values: list. + a list of float values. + :return: mag: list. 
+ a list of float values as the spectral residual values + """ + EPS = 1e-8 + transformed_X = utils.pandas.DataFrame() + for column in X.columns: + values = X[column].values + trans = np.fft.fft(values) + mag = np.sqrt(trans.real ** 2 + trans.imag ** 2) + eps_index = np.where(mag <= EPS)[0] + mag[eps_index] = EPS + + mag_log = np.log(mag) + mag_log[eps_index] = 0 + spectral = np.exp(mag_log - self._average_filter(mag_log, n=avg_filter_dimension)) + + trans.real = trans.real * spectral / mag + trans.imag = trans.imag * spectral / mag + trans.real[eps_index] = 0 + trans.imag[eps_index] = 0 + + wave_r = np.fft.ifft(trans) + mag = np.round(np.sqrt(wave_r.real ** 2 + wave_r.imag ** 2),4) + transformed_X[column + "_spectral_residual"] = mag + + return transformed_X + + def _average_filter(self,values, n=3): + """ + Calculate the sliding window average for the give time series. + Mathematically, res[i] = sum_{j=i-t+1}^{i} values[j] / t, where t = min(n, i+1) + :param values: list. + a list of float numbers + :param n: int, default 3. + window size. + :return res: list. + a list of value after the average_filter process. + """ + + if n >= len(values): + n = len(values) + + res = np.cumsum(values, dtype=float) + res[n:] = res[n:] - res[:-n] + res[n:] = res[n:] / n + + for i in range(1, n): + res[i] /= (i + 1) + + return res + + def _write(self, inputs: Inputs): + inputs.to_csv(str(time.time()) + '.csv') + + diff --git a/tods/feature_analysis/StatisticalAbsEnergy.py b/tods/feature_analysis/StatisticalAbsEnergy.py new file mode 100644 index 0000000..f31ace7 --- /dev/null +++ b/tods/feature_analysis/StatisticalAbsEnergy.py @@ -0,0 +1,331 @@ +import os +from typing import Any,Optional,List +import statsmodels.api as sm +import numpy as np +from d3m import container, utils as d3m_utils +from d3m import utils + +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os + +import numpy +import typing +import time + +from d3m import container +from d3m.primitive_interfaces import base, transformer + +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base + +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError + +__all__ = ('StatisticalAbsEnergyPrimitive',) + +Inputs = container.DataFrame +Outputs = container.DataFrame + +class Params(params.Params): + #to-do : how to make params dynamic + use_column_names: Optional[Any] + + + +class Hyperparams(hyperparams.Hyperparams): + + #Tuning Parameter + #default -1 considers entire time series is considered + window_size = hyperparams.Hyperparameter(default=-1, semantic_types=[ + 'https://metadata.datadrivendiscovery.org/types/TuningParameter', + ], description="Window Size for decomposition") + #control parameter + use_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to operate on. If any specified column cannot be parsed, it is skipped.", + ) + exclude_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not operate on. 
Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='append', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', + 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute'], + default='https://metadata.datadrivendiscovery.org/types/Attribute', + description='Decides what semantic type to attach to generated attributes', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + + + +class StatisticalAbsEnergyPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): + """ + Primitive to find abs_energy of time series + """ + + __author__ = "DATA Lab at Texas A&M University", + metadata = metadata_base.PrimitiveMetadata( + { + 'id': '73299ffe-d8bb-43c6-a6cc-9261f5e17a5e', + 'version': '0.1.0', + 'name': 'Time Series Statistical Abs Energy', + 'python_path': 'd3m.primitives.tods.feature_analysis.statistical_abs_energy', + 'keywords': ['Time Series','AbsEnergy'], + "hyperparams_to_tune": ['window_size'], + 'source': { + 'name': 'DATA Lab at Texas A&M University', + 'uris': ['https://gitlab.com/lhenry15/tods.git','https://gitlab.com/lhenry15/tods/-/blob/devesh/tods/feature_analysis/StatisticalAbsEnergy.py'], + 'contact': 'mailto:khlai037@tamu.edu' + + }, + 'installation': [ + {'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/lhenry15/tods.git@{git_commit}#egg=TODS'.format( + git_commit=d3m_utils.current_git_commit(os.path.dirname(__file__)), + ), + } + + ], + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.DATA_PROFILING, + ], + 'primitive_family': metadata_base.PrimitiveFamily.FEATURE_CONSTRUCTION, + + } + ) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + """ + + Args: + inputs: Container DataFrame + timeout: Default + iterations: Default + + Returns: + Container DataFrame containing abs_energy of time series + """ + self.logger.info('Statistical AbsEnergy Primitive called') + + # Get cols to fit. 
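+        # This primitive is a stateless transformer; _fitted is reset here and
+        # only acts as a guard that at least one input column was selected
+        # before the abs_energy computation below runs.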
+ self._fitted = False + self._training_inputs, self._training_indices = self._get_columns_to_fit(inputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + if len(self._training_indices) > 0: + # self._clf.fit(self._training_inputs) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + if not self._fitted: + raise PrimitiveNotFittedError("Primitive not fitted.") + statistical_abs_energy_input = inputs + if self.hyperparams['use_semantic_types']: + statistical_abs_energy_input = inputs.iloc[:, self._training_indices] + output_columns = [] + if len(self._training_indices) > 0: + statistical_abs_energy_output = self._abs_energy(statistical_abs_energy_input,self.hyperparams["window_size"]) + + if sparse.issparse(statistical_abs_energy_output): + statistical_abs_energy_output = statistical_abs_energy_output.toarray() + outputs = self._wrap_predictions(inputs, statistical_abs_energy_output) + + #if len(outputs.columns) == len(self._input_column_names): + # outputs.columns = self._input_column_names + + output_columns = [outputs] + + + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._training_indices, + columns_list=output_columns) + + self.logger.info('Statistical AbsEnergy Primitive returned') + + return base.CallResult(outputs) + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + """ + Select columns to fit. + Args: + inputs: Container DataFrame + hyperparams: d3m.metadata.hyperparams.Hyperparams + + Returns: + list + """ + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + use_columns = hyperparams['use_columns'] + exclude_columns = hyperparams['exclude_columns'] + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=use_columns, + exclude_columns=exclude_columns, + can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, + hyperparams: Hyperparams) -> bool: + """ + Output whether a column can be processed. 
+ Args: + inputs_metadata: d3m.metadata.base.DataMetadata + column_index: int + + Returns: + bool + """ + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, numpy.integer, numpy.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + return True + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + """ + Updata metadata for selected columns. + Args: + inputs_metadata: metadata_base.DataMetadata + outputs: Container Dataframe + target_columns_metadata: list + + Returns: + d3m.metadata.base.DataMetadata + """ + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + """ + Wrap predictions into dataframe + Args: + inputs: Container Dataframe + predictions: array-like data (n_samples, n_features) + + Returns: + Dataframe + """ + outputs = d3m_dataframe(predictions, generate_metadata=True) + target_columns_metadata = self._add_target_columns_metadata(outputs.metadata, self.hyperparams) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, target_columns_metadata) + + return outputs + + @classmethod + def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams): + """ + Add target columns metadata + Args: + outputs_metadata: metadata.base.DataMetadata + hyperparams: d3m.metadata.hyperparams.Hyperparams + + Returns: + List[OrderedDict] + """ + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + # column_name = "output_{}".format(column_index) + column_metadata = OrderedDict() + semantic_types = set() + semantic_types.add(hyperparams["return_semantic_type"]) + column_metadata['semantic_types'] = list(semantic_types) + + # column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + def _write(self, inputs: Inputs): + inputs.to_csv(str(time.time()) + '.csv') + + + def _abs_energy(self,X,window_size): + """ statistical abs_energy of time series sequence + Args: + X : DataFrame + Time series. 
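+            window_size : int
+                Rolling window length; -1 (the default) uses the entire series.
+                Each output value is the sum of squares of the values in its window.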
+ Returns: + DataFrame + A object with abs_energy + """ + if(window_size==-1): + window_size = len(X) + transformed_X = utils.pandas.DataFrame() + for column in X.columns: + column_value = X[column].values + column_abs_energy = np.zeros(len(column_value)) + for iter in range(window_size-1,len(column_value)): + sequence = column_value[iter-window_size+1:iter+1] + column_abs_energy[iter] = np.round(np.sum(sequence*sequence),4) + column_abs_energy[:window_size-1] = column_abs_energy[window_size-1] + transformed_X[column + "_abs_energy"] = column_abs_energy + + return transformed_X diff --git a/tods/feature_analysis/StatisticalAbsSum.py b/tods/feature_analysis/StatisticalAbsSum.py new file mode 100644 index 0000000..1c58f0a --- /dev/null +++ b/tods/feature_analysis/StatisticalAbsSum.py @@ -0,0 +1,330 @@ +import os +from typing import Any,Optional,List +import statsmodels.api as sm +import numpy as np +from d3m import container, utils as d3m_utils +from d3m import utils + +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os + +import numpy +import typing +import time + +from d3m import container +from d3m.primitive_interfaces import base, transformer + +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base + +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError + +__all__ = ('StatisticalAbsSumPrimitive',) + +Inputs = container.DataFrame +Outputs = container.DataFrame + +class Params(params.Params): + #to-do : how to make params dynamic + use_column_names: Optional[Any] + + + +class Hyperparams(hyperparams.Hyperparams): + + #Tuning Parameter + #default -1 considers entire time series is considered + window_size = hyperparams.Hyperparameter(default=-1, semantic_types=[ + 'https://metadata.datadrivendiscovery.org/types/TuningParameter', + ], description="Window Size for decomposition") + #control parameter + use_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to operate on. If any specified column cannot be parsed, it is skipped.", + ) + exclude_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='append', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. 
Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', + 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute'], + default='https://metadata.datadrivendiscovery.org/types/Attribute', + description='Decides what semantic type to attach to generated attributes', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + + + +class StatisticalAbsSumPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): + """ + Primitive to find abs_sum of time series + """ + __author__ = "DATA Lab at Texas A&M University", + metadata = metadata_base.PrimitiveMetadata( + { + 'id': 'fbc10e6f-d75b-4815-b4c8-5ad4f2f577db', + 'version': '0.1.0', + 'name': 'Time Series Absolute Sum', + 'python_path': 'd3m.primitives.tods.feature_analysis.statistical_abs_sum', + 'keywords': ['Time Series','AbsSum'], + "hyperparams_to_tune": ['window_size'], + 'source': { + 'name': 'DATA Lab at Texas A&M University', + 'uris': ['https://gitlab.com/lhenry15/tods.git','https://gitlab.com/lhenry15/tods/-/blob/devesh/tods/feature_analysis/StatisticalAbsSum.py'], + 'contact': 'mailto:khlai037@tamu.edu' + + }, + 'installation': [ + {'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/lhenry15/tods.git@{git_commit}#egg=TODS'.format( + git_commit=d3m_utils.current_git_commit(os.path.dirname(__file__)), + ), + } + + ], + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.DATA_PROFILING, + ], + 'primitive_family': metadata_base.PrimitiveFamily.FEATURE_CONSTRUCTION, + + } + ) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + """ + + Args: + inputs: Container DataFrame + timeout: Default + iterations: Default + + Returns: + Container DataFrame containing abs_sum of time series + """ + self.logger.info('Statistical AbsSum Primitive called') + + # Get cols to fit. 
+ self._fitted = False + self._training_inputs, self._training_indices = self._get_columns_to_fit(inputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + if len(self._training_indices) > 0: + # self._clf.fit(self._training_inputs) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + if not self._fitted: + raise PrimitiveNotFittedError("Primitive not fitted.") + statistical_abs_sum_input = inputs + if self.hyperparams['use_semantic_types']: + statistical_abs_sum_input = inputs.iloc[:, self._training_indices] + output_columns = [] + if len(self._training_indices) > 0: + statistical_abs_sum_output = self._abs_sum(statistical_abs_sum_input,self.hyperparams["window_size"]) + + if sparse.issparse(statistical_abs_sum_output): + statistical_abs_sum_output = statistical_abs_sum_output.toarray() + outputs = self._wrap_predictions(inputs, statistical_abs_sum_output) + + #if len(outputs.columns) == len(self._input_column_names): + # outputs.columns = self._input_column_names + + output_columns = [outputs] + + + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._training_indices, + columns_list=output_columns) + + self.logger.info('Statistical AbsSum Primitive returned') + + return base.CallResult(outputs) + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + """ + Select columns to fit. + Args: + inputs: Container DataFrame + hyperparams: d3m.metadata.hyperparams.Hyperparams + + Returns: + list + """ + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + use_columns = hyperparams['use_columns'] + exclude_columns = hyperparams['exclude_columns'] + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=use_columns, + exclude_columns=exclude_columns, + can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, + hyperparams: Hyperparams) -> bool: + """ + Output whether a column can be processed. 
+ Args: + inputs_metadata: d3m.metadata.base.DataMetadata + column_index: int + + Returns: + bool + """ + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, numpy.integer, numpy.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + return True + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + """ + Updata metadata for selected columns. + Args: + inputs_metadata: metadata_base.DataMetadata + outputs: Container Dataframe + target_columns_metadata: list + + Returns: + d3m.metadata.base.DataMetadata + """ + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + """ + Wrap predictions into dataframe + Args: + inputs: Container Dataframe + predictions: array-like data (n_samples, n_features) + + Returns: + Dataframe + """ + outputs = d3m_dataframe(predictions, generate_metadata=True) + target_columns_metadata = self._add_target_columns_metadata(outputs.metadata, self.hyperparams) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, target_columns_metadata) + + return outputs + + @classmethod + def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams): + """ + Add target columns metadata + Args: + outputs_metadata: metadata.base.DataMetadata + hyperparams: d3m.metadata.hyperparams.Hyperparams + + Returns: + List[OrderedDict] + """ + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + # column_name = "output_{}".format(column_index) + column_metadata = OrderedDict() + semantic_types = set() + semantic_types.add(hyperparams["return_semantic_type"]) + column_metadata['semantic_types'] = list(semantic_types) + + # column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + def _write(self, inputs: Inputs): + inputs.to_csv(str(time.time()) + '.csv') + + + def _abs_sum(self,X,window_size): + """ statistical abs_sum of time series sequence + Args: + X : DataFrame + Time series. 
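+            window_size : int
+                Rolling window length; -1 (the default) uses the entire series.
+                Each output value is the sum of absolute values in its window.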
+ Returns: + DataFrame + A object with abs_sum + """ + if(window_size==-1): + window_size = len(X) + transformed_X = utils.pandas.DataFrame() + for column in X.columns: + column_value = X[column].values + column_abs_sum = np.zeros(len(column_value)) + for iter in range(window_size-1,len(column_value)): + sequence = column_value[iter-window_size+1:iter+1] + column_abs_sum[iter] = np.sum(np.abs(sequence)) + column_abs_sum[:window_size-1] = column_abs_sum[window_size-1] + transformed_X[column + "_abs_sum"] = column_abs_sum + + return transformed_X diff --git a/tods/feature_analysis/StatisticalGmean.py b/tods/feature_analysis/StatisticalGmean.py new file mode 100644 index 0000000..cc03f82 --- /dev/null +++ b/tods/feature_analysis/StatisticalGmean.py @@ -0,0 +1,337 @@ +import os +from typing import Any,Optional,List +import statsmodels.api as sm +import numpy as np +from d3m import container, utils as d3m_utils +from d3m import utils + +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os +from scipy import stats + +import numpy +import typing +import time + +from d3m import container +from d3m.primitive_interfaces import base, transformer + +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base + +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.exceptions import UnexpectedValueError + +__all__ = ('StatisticalGmeanPrimitive',) + +Inputs = container.DataFrame +Outputs = container.DataFrame + +class Params(params.Params): + #to-do : how to make params dynamic + use_column_names: Optional[Any] + + + +class Hyperparams(hyperparams.Hyperparams): + + #Tuning Parameter + #default -1 consider entire time series is considered + window_size = hyperparams.Hyperparameter(default=-1, semantic_types=[ + 'https://metadata.datadrivendiscovery.org/types/TuningParameter', + ], description="Window Size for decomposition") + #control parameter + use_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to operate on. If any specified column cannot be parsed, it is skipped.", + ) + exclude_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='append', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. 
Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', + 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute'], + default='https://metadata.datadrivendiscovery.org/types/Attribute', + description='Decides what semantic type to attach to generated attributes', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + + + +class StatisticalGmeanPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): + """ + Primitive to find gmean of time series . + Will only take positive values as inputs . + """ + __author__ = "DATA Lab at Texas A&M University", + metadata = metadata_base.PrimitiveMetadata( + { + 'id': '6be88a7d-e72d-45c6-bd3b-3191d4eff623', + 'version': '0.1.0', + 'name': 'Time Series Decompostional', + 'python_path': 'd3m.primitives.tods.feature_analysis.statistical_g_mean', + 'keywords': ['Time Series','Gmean'], + "hyperparams_to_tune": ['window_size'], + 'source': { + 'name': 'DATA Lab at Texas A&M University', + 'uris': ['https://gitlab.com/lhenry15/tods.git','https://gitlab.com/lhenry15/tods/-/blob/devesh/tods/feature_analysis/StatisticalGmean.py'], + 'contact': 'mailto:khlai037@tamu.edu' + + }, + 'installation': [ + {'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/lhenry15/tods.git@{git_commit}#egg=TODS'.format( + git_commit=d3m_utils.current_git_commit(os.path.dirname(__file__)), + ), + } + + ], + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.DATA_PROFILING, + ], + 'primitive_family': metadata_base.PrimitiveFamily.FEATURE_CONSTRUCTION, + + } + ) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + """ + + Args: + inputs: Container DataFrame + timeout: Default + iterations: Default + + Returns: + Container DataFrame containing gmean of time series + """ + self.logger.info('Statistical Gmean Primitive called') + + # Get cols to fit. 
+ self._fitted = False + self._training_inputs, self._training_indices = self._get_columns_to_fit(inputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + if len(self._training_indices) > 0: + # self._clf.fit(self._training_inputs) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + if not self._fitted: + raise PrimitiveNotFittedError("Primitive not fitted.") + statistical_gmean_input = inputs + if self.hyperparams['use_semantic_types']: + statistical_gmean_input = inputs.iloc[:, self._training_indices] + output_columns = [] + if len(self._training_indices) > 0: + statistical_gmean_output = self._gmean(statistical_gmean_input,self.hyperparams["window_size"]) + + + if sparse.issparse(statistical_gmean_output): + statistical_gmean_output = statistical_gmean_output.toarray() + outputs = self._wrap_predictions(inputs, statistical_gmean_output) + + #if len(outputs.columns) == len(self._input_column_names): + # outputs.columns = self._input_column_names + + output_columns = [outputs] + + + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._training_indices, + columns_list=output_columns) + + self.logger.info('Statistical Gmean Primitive returned') + + return base.CallResult(outputs) + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + """ + Select columns to fit. + Args: + inputs: Container DataFrame + hyperparams: d3m.metadata.hyperparams.Hyperparams + + Returns: + list + """ + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + use_columns = hyperparams['use_columns'] + exclude_columns = hyperparams['exclude_columns'] + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=use_columns, + exclude_columns=exclude_columns, + can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, + hyperparams: Hyperparams) -> bool: + """ + Output whether a column can be processed. 
+ Args: + inputs_metadata: d3m.metadata.base.DataMetadata + column_index: int + + Returns: + bool + """ + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, numpy.integer, numpy.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + return True + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + """ + Updata metadata for selected columns. + Args: + inputs_metadata: metadata_base.DataMetadata + outputs: Container Dataframe + target_columns_metadata: list + + Returns: + d3m.metadata.base.DataMetadata + """ + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + """ + Wrap predictions into dataframe + Args: + inputs: Container Dataframe + predictions: array-like data (n_samples, n_features) + + Returns: + Dataframe + """ + outputs = d3m_dataframe(predictions, generate_metadata=True) + target_columns_metadata = self._add_target_columns_metadata(outputs.metadata, self.hyperparams) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, target_columns_metadata) + + return outputs + + @classmethod + def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams): + """ + Add target columns metadata + Args: + outputs_metadata: metadata.base.DataMetadata + hyperparams: d3m.metadata.hyperparams.Hyperparams + + Returns: + List[OrderedDict] + """ + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + # column_name = "output_{}".format(column_index) + column_metadata = OrderedDict() + semantic_types = set() + semantic_types.add(hyperparams["return_semantic_type"]) + column_metadata['semantic_types'] = list(semantic_types) + + # column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + def _write(self, inputs: Inputs): + inputs.to_csv(str(time.time()) + '.csv') + + + def _gmean(self,X,window_size): + """ statistical gmean of time series sequence + Args: + X : DataFrame + Time series. 
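+            window_size : int
+                Rolling window length; -1 uses the entire series.
+                Each output value is scipy.stats.gmean over the window, so the
+                window values are expected to be positive.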
+ Returns: + DataFrame + A object with gmean + """ + + + if(window_size==-1): + window_size = len(X) + transformed_X = utils.pandas.DataFrame() + for column in X.columns: + + column_value = X[column].values + column_gmean = np.zeros(len(column_value)) + for iter in range(window_size-1,len(column_value)): + sequence = column_value[iter-window_size+1:iter+1] + column_gmean[iter] = stats.gmean(sequence).round(4) + column_gmean[:window_size-1] = column_gmean[window_size-1] + transformed_X[column + "_gmean"] = column_gmean + + return transformed_X diff --git a/tods/feature_analysis/StatisticalHmean.py b/tods/feature_analysis/StatisticalHmean.py new file mode 100644 index 0000000..1fa3270 --- /dev/null +++ b/tods/feature_analysis/StatisticalHmean.py @@ -0,0 +1,334 @@ +import os +from typing import Any,Optional,List +import statsmodels.api as sm +import numpy as np +from d3m import container, utils as d3m_utils +from d3m import utils + +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os +from scipy import stats + +import numpy +import typing +import time + +from d3m import container +from d3m.primitive_interfaces import base, transformer + +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base + +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError + +__all__ = ('StatisticalHmeanPrimitive',) + +Inputs = container.DataFrame +Outputs = container.DataFrame + +class Params(params.Params): + #to-do : how to make params dynamic + use_column_names: Optional[Any] + + + +class Hyperparams(hyperparams.Hyperparams): + + #Tuning Parameter + #default -1 consider entire time series is considered + window_size = hyperparams.Hyperparameter(default=-1, semantic_types=[ + 'https://metadata.datadrivendiscovery.org/types/TuningParameter', + ], description="Window Size for decomposition") + #control parameter + use_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to operate on. If any specified column cannot be parsed, it is skipped.", + ) + exclude_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='append', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. 
Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', + 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute'], + default='https://metadata.datadrivendiscovery.org/types/Attribute', + description='Decides what semantic type to attach to generated attributes', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + + + +class StatisticalHmeanPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): + """ + Primitive to find Harmonic mean of time series + Harmonic mean only defined if all elements greater than or equal to zero + """ + __author__ = "DATA Lab at Texas A&M University", + metadata = metadata_base.PrimitiveMetadata( + { + 'id': '7c4bf669-26f4-4756-8e00-c3e5e89fa43c', + 'version': '0.1.0', + 'name': 'Time Series Decompostional', + 'python_path': 'd3m.primitives.tods.feature_analysis.statistical_h_mean', + 'keywords': ['Time Series','Hmean'], + "hyperparams_to_tune": ['window_size'], + 'source': { + 'name': 'DATA Lab at Texas A&M University', + 'uris': ['https://gitlab.com/lhenry15/tods.git','https://gitlab.com/lhenry15/tods/-/blob/devesh/tods/feature_analysis/StatisticalHmean.py'], + 'contact': 'mailto:khlai037@tamu.edu' + + }, + 'installation': [ + {'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/lhenry15/tods.git@{git_commit}#egg=TODS'.format( + git_commit=d3m_utils.current_git_commit(os.path.dirname(__file__)), + ), + } + + ], + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.DATA_PROFILING, + ], + 'primitive_family': metadata_base.PrimitiveFamily.FEATURE_CONSTRUCTION, + + } + ) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + """ + + Args: + inputs: Container DataFrame + timeout: Default + iterations: Default + + Returns: + Container DataFrame containing hmean of time series + """ + self.logger.info('Statistical Hmean Primitive called') + + # Get cols to fit. 
+ self._fitted = False + self._training_inputs, self._training_indices = self._get_columns_to_fit(inputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + if len(self._training_indices) > 0: + # self._clf.fit(self._training_inputs) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + if not self._fitted: + raise PrimitiveNotFittedError("Primitive not fitted.") + statistical_hmean_input = inputs + if self.hyperparams['use_semantic_types']: + statistical_hmean_input = inputs.iloc[:, self._training_indices] + output_columns = [] + if len(self._training_indices) > 0: + statistical_hmean_output = self._hmean(statistical_hmean_input,self.hyperparams["window_size"]) + + if sparse.issparse(statistical_hmean_output): + statistical_hmean_output = statistical_hmean_output.toarray() + outputs = self._wrap_predictions(inputs, statistical_hmean_output) + + #if len(outputs.columns) == len(self._input_column_names): + # outputs.columns = self._input_column_names + + output_columns = [outputs] + + + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._training_indices, + columns_list=output_columns) + + self.logger.info('Statistical Hmean Primitive returned') + + return base.CallResult(outputs) + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + """ + Select columns to fit. + Args: + inputs: Container DataFrame + hyperparams: d3m.metadata.hyperparams.Hyperparams + + Returns: + list + """ + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + use_columns = hyperparams['use_columns'] + exclude_columns = hyperparams['exclude_columns'] + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=use_columns, + exclude_columns=exclude_columns, + can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, + hyperparams: Hyperparams) -> bool: + """ + Output whether a column can be processed. 
+ Args: + inputs_metadata: d3m.metadata.base.DataMetadata + column_index: int + + Returns: + bool + """ + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, numpy.integer, numpy.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + return True + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + """ + Updata metadata for selected columns. + Args: + inputs_metadata: metadata_base.DataMetadata + outputs: Container Dataframe + target_columns_metadata: list + + Returns: + d3m.metadata.base.DataMetadata + """ + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + """ + Wrap predictions into dataframe + Args: + inputs: Container Dataframe + predictions: array-like data (n_samples, n_features) + + Returns: + Dataframe + """ + outputs = d3m_dataframe(predictions, generate_metadata=True) + target_columns_metadata = self._add_target_columns_metadata(outputs.metadata, self.hyperparams) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, target_columns_metadata) + + return outputs + + @classmethod + def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams): + """ + Add target columns metadata + Args: + outputs_metadata: metadata.base.DataMetadata + hyperparams: d3m.metadata.hyperparams.Hyperparams + + Returns: + List[OrderedDict] + """ + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + # column_name = "output_{}".format(column_index) + column_metadata = OrderedDict() + semantic_types = set() + semantic_types.add(hyperparams["return_semantic_type"]) + column_metadata['semantic_types'] = list(semantic_types) + + # column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + def _write(self, inputs: Inputs): + inputs.to_csv(str(time.time()) + '.csv') + + + def _hmean(self,X,window_size): + """ statistical hmean of time series sequence + Args: + X : DataFrame + Time series. 
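+            window_size : int
+                Rolling window length; -1 uses the entire series.
+                Each output value is scipy.stats.hmean over the window; the
+                harmonic mean is only defined when the window values are positive.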
+ Returns: + DataFrame + A object with hmean + """ + + + if(window_size==-1): + window_size = len(X) + transformed_X = utils.pandas.DataFrame() + for column in X.columns: + column_value = X[column].values + column_hmean = np.zeros(len(column_value)) + for iter in range(window_size-1,len(column_value)): + sequence = column_value[iter-window_size+1:iter+1] + column_hmean[iter] = stats.hmean(sequence).round(4) + column_hmean[:window_size-1] = column_hmean[window_size-1] + transformed_X[column + "_hmean"] = column_hmean + + return transformed_X diff --git a/tods/feature_analysis/StatisticalKurtosis.py b/tods/feature_analysis/StatisticalKurtosis.py new file mode 100644 index 0000000..7a8e261 --- /dev/null +++ b/tods/feature_analysis/StatisticalKurtosis.py @@ -0,0 +1,331 @@ +import os +from typing import Any,Optional,List +import statsmodels.api as sm +import numpy as np +from d3m import container, utils as d3m_utils +from d3m import utils + +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os +from scipy import stats + +import numpy +import typing +import time + +from d3m import container +from d3m.primitive_interfaces import base, transformer + +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base + +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError + +__all__ = ('StatisticalKurtosisPrimitive',) + +Inputs = container.DataFrame +Outputs = container.DataFrame + +class Params(params.Params): + #to-do : how to make params dynamic + use_column_names: Optional[Any] + + + +class Hyperparams(hyperparams.Hyperparams): + + #Tuning Parameter + #default -1 consider entire time series is considered + window_size = hyperparams.Hyperparameter(default=-1, semantic_types=[ + 'https://metadata.datadrivendiscovery.org/types/TuningParameter', + ], description="Window Size for decomposition") + #control parameter + use_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to operate on. If any specified column cannot be parsed, it is skipped.", + ) + exclude_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='append', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. 
Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', + 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute'], + default='https://metadata.datadrivendiscovery.org/types/Attribute', + description='Decides what semantic type to attach to generated attributes', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + + + +class StatisticalKurtosisPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): + """ + Primitive to find kurtosis of time series + """ + __author__ = "DATA Lab at Texas A&M University", + metadata = metadata_base.PrimitiveMetadata( + { + 'id': 'c86af521-05b6-4f7c-a7b9-929318d944fc', + 'version': '0.1.0', + 'name': 'Time Series Decompostional', + 'python_path': 'd3m.primitives.tods.feature_analysis.statistical_kurtosis', + 'keywords': ['Time Series','Kurtosis'], + "hyperparams_to_tune": ['window_size'], + 'source': { + 'name': 'DATA Lab at Texas A&M University', + 'uris': ['https://gitlab.com/lhenry15/tods.git','https://gitlab.com/lhenry15/tods/-/blob/devesh/tods/feature_analysis/StatisticalKurtosis.py'], + 'contact': 'mailto:khlai037@tamu.edu' + + }, + 'installation': [ + {'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/lhenry15/tods.git@{git_commit}#egg=TODS'.format( + git_commit=d3m_utils.current_git_commit(os.path.dirname(__file__)), + ), + } + + ], + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.DATA_PROFILING, + ], + 'primitive_family': metadata_base.PrimitiveFamily.FEATURE_CONSTRUCTION, + + } + ) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + """ + + Args: + inputs: Container DataFrame + timeout: Default + iterations: Default + + Returns: + Container DataFrame containing kurtosis of time series + """ + self.logger.info('Statistical Kurtosis Primitive called') + + # Get cols to fit. 
+ self._fitted = False + self._training_inputs, self._training_indices = self._get_columns_to_fit(inputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + if len(self._training_indices) > 0: + # self._clf.fit(self._training_inputs) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + if not self._fitted: + raise PrimitiveNotFittedError("Primitive not fitted.") + statistical_kurtosis_input = inputs + if self.hyperparams['use_semantic_types']: + statistical_kurtosis_input = inputs.iloc[:, self._training_indices] + output_columns = [] + if len(self._training_indices) > 0: + statistical_kurtosis_output = self._kurtosis(statistical_kurtosis_input,self.hyperparams["window_size"]) + + if sparse.issparse(statistical_kurtosis_output): + statistical_kurtosis_output = statistical_kurtosis_output.toarray() + outputs = self._wrap_predictions(inputs, statistical_kurtosis_output) + + #if len(outputs.columns) == len(self._input_column_names): + # outputs.columns = self._input_column_names + + output_columns = [outputs] + + + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._training_indices, + columns_list=output_columns) + + self.logger.info('Statistical Kurtosis Primitive returned') + + return base.CallResult(outputs) + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + """ + Select columns to fit. + Args: + inputs: Container DataFrame + hyperparams: d3m.metadata.hyperparams.Hyperparams + + Returns: + list + """ + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + use_columns = hyperparams['use_columns'] + exclude_columns = hyperparams['exclude_columns'] + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=use_columns, + exclude_columns=exclude_columns, + can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, + hyperparams: Hyperparams) -> bool: + """ + Output whether a column can be processed. 
+ Args: + inputs_metadata: d3m.metadata.base.DataMetadata + column_index: int + + Returns: + bool + """ + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, numpy.integer, numpy.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + return True + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + """ + Updata metadata for selected columns. + Args: + inputs_metadata: metadata_base.DataMetadata + outputs: Container Dataframe + target_columns_metadata: list + + Returns: + d3m.metadata.base.DataMetadata + """ + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + """ + Wrap predictions into dataframe + Args: + inputs: Container Dataframe + predictions: array-like data (n_samples, n_features) + + Returns: + Dataframe + """ + outputs = d3m_dataframe(predictions, generate_metadata=True) + target_columns_metadata = self._add_target_columns_metadata(outputs.metadata, self.hyperparams) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, target_columns_metadata) + + return outputs + + @classmethod + def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams): + """ + Add target columns metadata + Args: + outputs_metadata: metadata.base.DataMetadata + hyperparams: d3m.metadata.hyperparams.Hyperparams + + Returns: + List[OrderedDict] + """ + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + # column_name = "output_{}".format(column_index) + column_metadata = OrderedDict() + semantic_types = set() + semantic_types.add(hyperparams["return_semantic_type"]) + column_metadata['semantic_types'] = list(semantic_types) + + # column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + def _write(self, inputs: Inputs): + inputs.to_csv(str(time.time()) + '.csv') + + + def _kurtosis(self,X,window_size): + """ statistical kurtosis of time series sequence + Args: + X : DataFrame + Time series. 
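+            window_size : int
+                Rolling window length; -1 uses the entire series.
+                Each output value is scipy.stats.kurtosis over the window
+                (Fisher's definition by default, so a normal distribution scores 0).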
+ Returns: + DataFrame + A object with kurtosis + """ + if(window_size==-1): + window_size = len(X) + transformed_X = utils.pandas.DataFrame() + for column in X.columns: + column_value = X[column].values + column_kurtosis = np.zeros(len(column_value)) + for iter in range(window_size-1,len(column_value)): + sequence = column_value[iter-window_size+1:iter+1] + column_kurtosis[iter] = round(stats.kurtosis(sequence),4) + column_kurtosis[:window_size-1] = column_kurtosis[window_size-1] + transformed_X[column + "_kurtosis"] = column_kurtosis + + return transformed_X diff --git a/tods/feature_analysis/StatisticalMaximum.py b/tods/feature_analysis/StatisticalMaximum.py new file mode 100644 index 0000000..35ef123 --- /dev/null +++ b/tods/feature_analysis/StatisticalMaximum.py @@ -0,0 +1,330 @@ +import os +from typing import Any,Optional,List +import statsmodels.api as sm +import numpy as np +from d3m import container, utils as d3m_utils +from d3m import utils + +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os + +import numpy +import typing +import time + +from d3m import container +from d3m.primitive_interfaces import base, transformer + +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base + +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError + +__all__ = ('StatisticalMaximumPrimitive',) + +Inputs = container.DataFrame +Outputs = container.DataFrame + +class Params(params.Params): + #to-do : how to make params dynamic + use_column_names: Optional[Any] + + + +class Hyperparams(hyperparams.Hyperparams): + + #Tuning Parameter + #default -1 considers entire time series is considered + window_size = hyperparams.Hyperparameter(default=-1, semantic_types=[ + 'https://metadata.datadrivendiscovery.org/types/TuningParameter', + ], description="Window Size for decomposition") + #control parameter + use_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to operate on. If any specified column cannot be parsed, it is skipped.", + ) + exclude_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='append', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. 
Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', + 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute'], + default='https://metadata.datadrivendiscovery.org/types/Attribute', + description='Decides what semantic type to attach to generated attributes', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + + + +class StatisticalMaximumPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): + """ + Primitive to find maximum of time series + """ + __author__ = "DATA Lab at Texas A&M University", + metadata = metadata_base.PrimitiveMetadata( + { + 'id': '3b448057-ac26-4f1b-96b6-141782f16a54', + 'version': '0.1.0', + 'name': 'Time Series Decompostional', + 'python_path': 'd3m.primitives.tods.feature_analysis.statistical_maximum', + 'keywords': ['Time Series','Maximum'], + "hyperparams_to_tune": ['window_size'], + 'source': { + 'name': 'DATA Lab at Texas A&M University', + 'uris': ['https://gitlab.com/lhenry15/tods.git','https://gitlab.com/lhenry15/tods/-/blob/devesh/tods/feature_analysis/StatisticalMaximum.py'], + 'contact': 'mailto:khlai037@tamu.edu' + + }, + 'installation': [ + {'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/lhenry15/tods.git@{git_commit}#egg=TODS'.format( + git_commit=d3m_utils.current_git_commit(os.path.dirname(__file__)), + ), + } + + ], + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.DATA_PROFILING, + ], + 'primitive_family': metadata_base.PrimitiveFamily.FEATURE_CONSTRUCTION, + + } + ) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + """ + + Args: + inputs: Container DataFrame + timeout: Default + iterations: Default + + Returns: + Container DataFrame containing maximum of time series + """ + self.logger.info('Statistical Maximum Primitive called') + + # Get cols to fit. 
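+        # As in the other statistical primitives, "fitting" just selects the
+        # numeric columns that the rolling maximum will be computed over.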
+ self._fitted = False + self._training_inputs, self._training_indices = self._get_columns_to_fit(inputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + if len(self._training_indices) > 0: + # self._clf.fit(self._training_inputs) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + if not self._fitted: + raise PrimitiveNotFittedError("Primitive not fitted.") + statistical_maximum_input = inputs + if self.hyperparams['use_semantic_types']: + statistical_maximum_input = inputs.iloc[:, self._training_indices] + output_columns = [] + if len(self._training_indices) > 0: + statistical_maximum_output = self._maximum(statistical_maximum_input,self.hyperparams["window_size"]) + + if sparse.issparse(statistical_maximum_output): + statistical_maximum_output = statistical_maximum_output.toarray() + outputs = self._wrap_predictions(inputs, statistical_maximum_output) + + #if len(outputs.columns) == len(self._input_column_names): + # outputs.columns = self._input_column_names + + output_columns = [outputs] + + + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._training_indices, + columns_list=output_columns) + + self.logger.info('Statistical Maximum Primitive returned') + + return base.CallResult(outputs) + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + """ + Select columns to fit. + Args: + inputs: Container DataFrame + hyperparams: d3m.metadata.hyperparams.Hyperparams + + Returns: + list + """ + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + use_columns = hyperparams['use_columns'] + exclude_columns = hyperparams['exclude_columns'] + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=use_columns, + exclude_columns=exclude_columns, + can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, + hyperparams: Hyperparams) -> bool: + """ + Output whether a column can be processed. 
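+        A column is accepted when its structural type is one of the accepted numeric
+        types (int, float, numpy.integer, numpy.float64).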
+ Args: + inputs_metadata: d3m.metadata.base.DataMetadata + column_index: int + + Returns: + bool + """ + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, numpy.integer, numpy.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + return True + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + """ + Updata metadata for selected columns. + Args: + inputs_metadata: metadata_base.DataMetadata + outputs: Container Dataframe + target_columns_metadata: list + + Returns: + d3m.metadata.base.DataMetadata + """ + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + """ + Wrap predictions into dataframe + Args: + inputs: Container Dataframe + predictions: array-like data (n_samples, n_features) + + Returns: + Dataframe + """ + outputs = d3m_dataframe(predictions, generate_metadata=True) + target_columns_metadata = self._add_target_columns_metadata(outputs.metadata, self.hyperparams) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, target_columns_metadata) + + return outputs + + @classmethod + def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams): + """ + Add target columns metadata + Args: + outputs_metadata: metadata.base.DataMetadata + hyperparams: d3m.metadata.hyperparams.Hyperparams + + Returns: + List[OrderedDict] + """ + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + # column_name = "output_{}".format(column_index) + column_metadata = OrderedDict() + semantic_types = set() + semantic_types.add(hyperparams["return_semantic_type"]) + column_metadata['semantic_types'] = list(semantic_types) + + # column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + def _write(self, inputs: Inputs): + inputs.to_csv(str(time.time()) + '.csv') + + + def _maximum(self,X,window_size): + """ statistical maximum of time series sequence + Args: + X : DataFrame + Time series. 
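+            window_size : int
+                Size of the rolling window; -1 (the default) uses the entire series.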
+ Returns: + DataFrame + A object with maximum + """ + if(window_size==-1): + window_size = len(X) + transformed_X = utils.pandas.DataFrame() + for column in X.columns: + column_value = X[column].values + column_maximum = np.zeros(len(column_value)) + for iter in range(window_size-1,len(column_value)): + sequence = column_value[iter-window_size+1:iter+1] + column_maximum[iter] = np.max(sequence) + column_maximum[:window_size-1] = column_maximum[window_size-1] + transformed_X[column + "_maximum"] = column_maximum + + return transformed_X diff --git a/tods/feature_analysis/StatisticalMean.py b/tods/feature_analysis/StatisticalMean.py new file mode 100644 index 0000000..e1db7e4 --- /dev/null +++ b/tods/feature_analysis/StatisticalMean.py @@ -0,0 +1,330 @@ +import os +from typing import Any,Optional,List +import statsmodels.api as sm +import numpy as np +from d3m import container, utils as d3m_utils +from d3m import utils + +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os + +import numpy +import typing +import time + +from d3m import container +from d3m.primitive_interfaces import base, transformer + +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base + +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError + +__all__ = ('StatisticalMeanPrimitive',) + +Inputs = container.DataFrame +Outputs = container.DataFrame + +class Params(params.Params): + #to-do : how to make params dynamic + use_column_names: Optional[Any] + + + +class Hyperparams(hyperparams.Hyperparams): + + #Tuning Parameter + #default -1 considers entire time series is considered + window_size = hyperparams.Hyperparameter(default=-1, semantic_types=[ + 'https://metadata.datadrivendiscovery.org/types/TuningParameter', + ], description="Window Size for decomposition") + #control parameter + use_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to operate on. If any specified column cannot be parsed, it is skipped.", + ) + exclude_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='append', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. 
Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', + 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute'], + default='https://metadata.datadrivendiscovery.org/types/Attribute', + description='Decides what semantic type to attach to generated attributes', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + + + +class StatisticalMeanPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): + """ + Primitive to find mean of time series + """ + __author__ = "DATA Lab at Texas A&M University", + metadata = metadata_base.PrimitiveMetadata( + { + 'id': 'eaff2f35-978c-4530-a12e-061a5f0beacd', + 'version': '0.1.0', + 'name': 'Time Series Decompostional', + 'python_path': 'd3m.primitives.tods.feature_analysis.statistical_mean', + 'keywords': ['Time Series','Mean'], + "hyperparams_to_tune": ['window_size'], + 'source': { + 'name': 'DATA Lab at Texas A&M University', + 'uris': ['https://gitlab.com/lhenry15/tods.git','https://gitlab.com/lhenry15/tods/-/blob/devesh/tods/feature_analysis/StatisticalMean.py'], + 'contact': 'mailto:khlai037@tamu.edu' + + }, + 'installation': [ + {'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/lhenry15/tods.git@{git_commit}#egg=TODS'.format( + git_commit=d3m_utils.current_git_commit(os.path.dirname(__file__)), + ), + } + + ], + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.DATA_PROFILING, + ], + 'primitive_family': metadata_base.PrimitiveFamily.FEATURE_CONSTRUCTION, + + } + ) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + """ + + Args: + inputs: Container DataFrame + timeout: Default + iterations: Default + + Returns: + Container DataFrame containing mean of time series + """ + self.logger.info('Statistical Mean Primitive called') + + # Get cols to fit. 
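+        # Select the target columns, compute the rolling mean per column via
+        # self._mean(), and merge the "<column>_mean" outputs back into the input
+        # according to `return_result`.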
+ self._fitted = False + self._training_inputs, self._training_indices = self._get_columns_to_fit(inputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + if len(self._training_indices) > 0: + # self._clf.fit(self._training_inputs) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + if not self._fitted: + raise PrimitiveNotFittedError("Primitive not fitted.") + statistical_mean_input = inputs + if self.hyperparams['use_semantic_types']: + statistical_mean_input = inputs.iloc[:, self._training_indices] + output_columns = [] + if len(self._training_indices) > 0: + statistical_mean_output = self._mean(statistical_mean_input,self.hyperparams["window_size"]) + + if sparse.issparse(statistical_mean_output): + statistical_mean_output = statistical_mean_output.toarray() + outputs = self._wrap_predictions(inputs, statistical_mean_output) + + #if len(outputs.columns) == len(self._input_column_names): + # outputs.columns = self._input_column_names + + output_columns = [outputs] + + + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._training_indices, + columns_list=output_columns) + + self.logger.info('Statistical Mean Primitive returned') + + return base.CallResult(outputs) + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + """ + Select columns to fit. + Args: + inputs: Container DataFrame + hyperparams: d3m.metadata.hyperparams.Hyperparams + + Returns: + list + """ + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + use_columns = hyperparams['use_columns'] + exclude_columns = hyperparams['exclude_columns'] + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=use_columns, + exclude_columns=exclude_columns, + can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, + hyperparams: Hyperparams) -> bool: + """ + Output whether a column can be processed. 
+ Args: + inputs_metadata: d3m.metadata.base.DataMetadata + column_index: int + + Returns: + bool + """ + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, numpy.integer, numpy.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + return True + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + """ + Updata metadata for selected columns. + Args: + inputs_metadata: metadata_base.DataMetadata + outputs: Container Dataframe + target_columns_metadata: list + + Returns: + d3m.metadata.base.DataMetadata + """ + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + """ + Wrap predictions into dataframe + Args: + inputs: Container Dataframe + predictions: array-like data (n_samples, n_features) + + Returns: + Dataframe + """ + outputs = d3m_dataframe(predictions, generate_metadata=True) + target_columns_metadata = self._add_target_columns_metadata(outputs.metadata, self.hyperparams) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, target_columns_metadata) + + return outputs + + @classmethod + def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams): + """ + Add target columns metadata + Args: + outputs_metadata: metadata.base.DataMetadata + hyperparams: d3m.metadata.hyperparams.Hyperparams + + Returns: + List[OrderedDict] + """ + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + # column_name = "output_{}".format(column_index) + column_metadata = OrderedDict() + semantic_types = set() + semantic_types.add(hyperparams["return_semantic_type"]) + column_metadata['semantic_types'] = list(semantic_types) + + # column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + def _write(self, inputs: Inputs): + inputs.to_csv(str(time.time()) + '.csv') + + + def _mean(self,X,window_size): + """ statistical mean of time series sequence + Args: + X : DataFrame + Time series. 
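+            window_size : int
+                Size of the rolling window; -1 (the default) uses the entire series.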
+ Returns: + DataFrame + A object with mean + """ + if(window_size==-1): + window_size = len(X) + transformed_X = utils.pandas.DataFrame() + for column in X.columns: + column_value = X[column].values + column_mean = np.zeros(len(column_value)) + for iter in range(window_size-1,len(column_value)): + sequence = column_value[iter-window_size+1:iter+1] + column_mean[iter] = np.mean(sequence) + column_mean[:window_size-1] = column_mean[window_size-1] + transformed_X[column + "_mean"] = column_mean + + return transformed_X diff --git a/tods/feature_analysis/StatisticalMeanAbs.py b/tods/feature_analysis/StatisticalMeanAbs.py new file mode 100644 index 0000000..cddef84 --- /dev/null +++ b/tods/feature_analysis/StatisticalMeanAbs.py @@ -0,0 +1,330 @@ +import os +from typing import Any,Optional,List +import statsmodels.api as sm +import numpy as np +from d3m import container, utils as d3m_utils +from d3m import utils + +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os + +import numpy +import typing +import time + +from d3m import container +from d3m.primitive_interfaces import base, transformer + +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base + +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError + +__all__ = ('StatisticalMeanAbsPrimitive',) + +Inputs = container.DataFrame +Outputs = container.DataFrame + +class Params(params.Params): + #to-do : how to make params dynamic + use_column_names: Optional[Any] + + + +class Hyperparams(hyperparams.Hyperparams): + + #Tuning Parameter + #default -1 considers entire time series is considered + window_size = hyperparams.Hyperparameter(default=-1, semantic_types=[ + 'https://metadata.datadrivendiscovery.org/types/TuningParameter', + ], description="Window Size for decomposition") + #control parameter + use_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to operate on. If any specified column cannot be parsed, it is skipped.", + ) + exclude_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='append', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. 
Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', + 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute'], + default='https://metadata.datadrivendiscovery.org/types/Attribute', + description='Decides what semantic type to attach to generated attributes', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + + + +class StatisticalMeanAbsPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): + """ + Primitive to find mean_abs of time series + """ + __author__ = "DATA Lab at Texas A&M University", + metadata = metadata_base.PrimitiveMetadata( + { + 'id': 'c397f0b2-45da-4263-8cca-b4e1a9502918', + 'version': '0.1.0', + 'name': 'Time Series Decompostional', + 'python_path': 'd3m.primitives.tods.feature_analysis.statistical_mean_abs', + 'keywords': ['Time Series','MeanAbs'], + "hyperparams_to_tune": ['window_size'], + 'source': { + 'name': 'DATA Lab at Texas A&M University', + 'uris': ['https://gitlab.com/lhenry15/tods.git','https://gitlab.com/lhenry15/tods/-/blob/devesh/tods/feature_analysis/StatisticalMeanAbs.py'], + 'contact': 'mailto:khlai037@tamu.edu' + + }, + 'installation': [ + {'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/lhenry15/tods.git@{git_commit}#egg=TODS'.format( + git_commit=d3m_utils.current_git_commit(os.path.dirname(__file__)), + ), + } + + ], + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.DATA_PROFILING, + ], + 'primitive_family': metadata_base.PrimitiveFamily.FEATURE_CONSTRUCTION, + + } + ) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + """ + + Args: + inputs: Container DataFrame + timeout: Default + iterations: Default + + Returns: + Container DataFrame containing mean_abs of time series + """ + self.logger.info('Statistical MeanAbs Primitive called') + + # Get cols to fit. 
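+        # Select the target columns, compute the rolling mean of absolute values per
+        # column via self._mean_abs(), and merge the "<column>_mean_abs" outputs back
+        # into the input according to `return_result`.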
+ self._fitted = False + self._training_inputs, self._training_indices = self._get_columns_to_fit(inputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + if len(self._training_indices) > 0: + # self._clf.fit(self._training_inputs) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + if not self._fitted: + raise PrimitiveNotFittedError("Primitive not fitted.") + statistical_mean_abs_input = inputs + if self.hyperparams['use_semantic_types']: + statistical_mean_abs_input = inputs.iloc[:, self._training_indices] + output_columns = [] + if len(self._training_indices) > 0: + statistical_mean_abs_output = self._mean_abs(statistical_mean_abs_input,self.hyperparams["window_size"]) + + if sparse.issparse(statistical_mean_abs_output): + statistical_mean_abs_output = statistical_mean_abs_output.toarray() + outputs = self._wrap_predictions(inputs, statistical_mean_abs_output) + + #if len(outputs.columns) == len(self._input_column_names): + # outputs.columns = self._input_column_names + + output_columns = [outputs] + + + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._training_indices, + columns_list=output_columns) + + self.logger.info('Statistical MeanAbs Primitive returned') + + return base.CallResult(outputs) + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + """ + Select columns to fit. + Args: + inputs: Container DataFrame + hyperparams: d3m.metadata.hyperparams.Hyperparams + + Returns: + list + """ + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + use_columns = hyperparams['use_columns'] + exclude_columns = hyperparams['exclude_columns'] + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=use_columns, + exclude_columns=exclude_columns, + can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, + hyperparams: Hyperparams) -> bool: + """ + Output whether a column can be processed. 
+ Args: + inputs_metadata: d3m.metadata.base.DataMetadata + column_index: int + + Returns: + bool + """ + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, numpy.integer, numpy.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + return True + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + """ + Updata metadata for selected columns. + Args: + inputs_metadata: metadata_base.DataMetadata + outputs: Container Dataframe + target_columns_metadata: list + + Returns: + d3m.metadata.base.DataMetadata + """ + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + """ + Wrap predictions into dataframe + Args: + inputs: Container Dataframe + predictions: array-like data (n_samples, n_features) + + Returns: + Dataframe + """ + outputs = d3m_dataframe(predictions, generate_metadata=True) + target_columns_metadata = self._add_target_columns_metadata(outputs.metadata, self.hyperparams) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, target_columns_metadata) + + return outputs + + @classmethod + def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams): + """ + Add target columns metadata + Args: + outputs_metadata: metadata.base.DataMetadata + hyperparams: d3m.metadata.hyperparams.Hyperparams + + Returns: + List[OrderedDict] + """ + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + # column_name = "output_{}".format(column_index) + column_metadata = OrderedDict() + semantic_types = set() + semantic_types.add(hyperparams["return_semantic_type"]) + column_metadata['semantic_types'] = list(semantic_types) + + # column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + def _write(self, inputs: Inputs): + inputs.to_csv(str(time.time()) + '.csv') + + + def _mean_abs(self,X,window_size): + """ statistical mean_abs of time series sequence + Args: + X : DataFrame + Time series. 
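+            window_size : int
+                Size of the rolling window; -1 (the default) uses the entire series.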
+ Returns: + DataFrame + A object with mean_abs + """ + if(window_size==-1): + window_size = len(X) + transformed_X = utils.pandas.DataFrame() + for column in X.columns: + column_value = X[column].values + column_mean_abs = np.zeros(len(column_value)) + for iter in range(window_size-1,len(column_value)): + sequence = column_value[iter-window_size+1:iter+1] + column_mean_abs[iter] = np.mean(np.abs(sequence)) + column_mean_abs[:window_size-1] = column_mean_abs[window_size-1] + transformed_X[column + "_mean_abs"] = column_mean_abs + + return transformed_X diff --git a/tods/feature_analysis/StatisticalMeanAbsTemporalDerivative.py b/tods/feature_analysis/StatisticalMeanAbsTemporalDerivative.py new file mode 100644 index 0000000..dca5755 --- /dev/null +++ b/tods/feature_analysis/StatisticalMeanAbsTemporalDerivative.py @@ -0,0 +1,330 @@ +import os +from typing import Any,Optional,List +import statsmodels.api as sm +import numpy as np +from d3m import container, utils as d3m_utils +from d3m import utils + +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os + +import numpy +import typing +import time + +from d3m import container +from d3m.primitive_interfaces import base, transformer + +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base + +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError + +__all__ = ('StatisticalMeanAbsTemporalDerivativePrimitive',) + +Inputs = container.DataFrame +Outputs = container.DataFrame + +class Params(params.Params): + #to-do : how to make params dynamic + use_column_names: Optional[Any] + + + +class Hyperparams(hyperparams.Hyperparams): + + #Tuning Parameter + #default -1 considers entire time series is considered + window_size = hyperparams.Hyperparameter(default=-1, semantic_types=[ + 'https://metadata.datadrivendiscovery.org/types/TuningParameter', + ], description="Window Size for decomposition") + #control parameter + use_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to operate on. If any specified column cannot be parsed, it is skipped.", + ) + exclude_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='append', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. 
Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', + 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute'], + default='https://metadata.datadrivendiscovery.org/types/Attribute', + description='Decides what semantic type to attach to generated attributes', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + + + +class StatisticalMeanAbsTemporalDerivativePrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): + """ + Primitive to find mean_abs_temporal_derivative of time series + """ + __author__ = "DATA Lab at Texas A&M University", + metadata = metadata_base.PrimitiveMetadata( + { + 'id': 'eb571238-6229-4fe4-94b3-684f043e4dbf', + 'version': '0.1.0', + 'name': 'Time Series Decompostional', + 'python_path': 'd3m.primitives.tods.feature_analysis.statistical_mean_abs_temporal_derivative', + 'keywords': ['Time Series','MeanAbsTemporalDerivative'], + "hyperparams_to_tune": ['window_size'], + 'source': { + 'name': 'DATA Lab at Texas A&M University', + 'uris': ['https://gitlab.com/lhenry15/tods.git','https://gitlab.com/lhenry15/tods/-/blob/devesh/tods/feature_analysis/StatisticalMeanAbsTemporalDerivative.py'], + 'contact': 'mailto:khlai037@tamu.edu' + + }, + 'installation': [ + {'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/lhenry15/tods.git@{git_commit}#egg=TODS'.format( + git_commit=d3m_utils.current_git_commit(os.path.dirname(__file__)), + ), + } + + ], + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.DATA_PROFILING, + ], + 'primitive_family': metadata_base.PrimitiveFamily.FEATURE_CONSTRUCTION, + + } + ) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + """ + + Args: + inputs: Container DataFrame + timeout: Default + iterations: Default + + Returns: + Container DataFrame containing mean_abs_temporal_derivative of time series + """ + self.logger.info('Statistical MeanAbsTemporalDerivative Primitive called') + + # Get cols to fit. 
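+        # Select the target columns, compute the rolling mean of the absolute first
+        # difference per column via self._mean_abs_temporal_derivative(), and merge the
+        # "<column>_mean_abs_temporal_derivative" outputs back according to `return_result`.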
+ self._fitted = False + self._training_inputs, self._training_indices = self._get_columns_to_fit(inputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + if len(self._training_indices) > 0: + # self._clf.fit(self._training_inputs) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + if not self._fitted: + raise PrimitiveNotFittedError("Primitive not fitted.") + statistical_mean_abs_temporal_derivative_input = inputs + if self.hyperparams['use_semantic_types']: + statistical_mean_abs_temporal_derivative_input = inputs.iloc[:, self._training_indices] + output_columns = [] + if len(self._training_indices) > 0: + statistical_mean_abs_temporal_derivative_output = self._mean_abs_temporal_derivative(statistical_mean_abs_temporal_derivative_input,self.hyperparams["window_size"]) + + if sparse.issparse(statistical_mean_abs_temporal_derivative_output): + statistical_mean_abs_temporal_derivative_output = statistical_mean_abs_temporal_derivative_output.toarray() + outputs = self._wrap_predictions(inputs, statistical_mean_abs_temporal_derivative_output) + + #if len(outputs.columns) == len(self._input_column_names): + # outputs.columns = self._input_column_names + + output_columns = [outputs] + + + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._training_indices, + columns_list=output_columns) + + self.logger.info('Statistical MeanAbsTemporalDerivative Primitive returned') + + return base.CallResult(outputs) + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + """ + Select columns to fit. + Args: + inputs: Container DataFrame + hyperparams: d3m.metadata.hyperparams.Hyperparams + + Returns: + list + """ + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + use_columns = hyperparams['use_columns'] + exclude_columns = hyperparams['exclude_columns'] + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=use_columns, + exclude_columns=exclude_columns, + can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, + hyperparams: Hyperparams) -> bool: + """ + Output whether a column can be processed. 
+ Args: + inputs_metadata: d3m.metadata.base.DataMetadata + column_index: int + + Returns: + bool + """ + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, numpy.integer, numpy.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + return True + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + """ + Updata metadata for selected columns. + Args: + inputs_metadata: metadata_base.DataMetadata + outputs: Container Dataframe + target_columns_metadata: list + + Returns: + d3m.metadata.base.DataMetadata + """ + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + """ + Wrap predictions into dataframe + Args: + inputs: Container Dataframe + predictions: array-like data (n_samples, n_features) + + Returns: + Dataframe + """ + outputs = d3m_dataframe(predictions, generate_metadata=True) + target_columns_metadata = self._add_target_columns_metadata(outputs.metadata, self.hyperparams) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, target_columns_metadata) + + return outputs + + @classmethod + def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams): + """ + Add target columns metadata + Args: + outputs_metadata: metadata.base.DataMetadata + hyperparams: d3m.metadata.hyperparams.Hyperparams + + Returns: + List[OrderedDict] + """ + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + # column_name = "output_{}".format(column_index) + column_metadata = OrderedDict() + semantic_types = set() + semantic_types.add(hyperparams["return_semantic_type"]) + column_metadata['semantic_types'] = list(semantic_types) + + # column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + def _write(self, inputs: Inputs): + inputs.to_csv(str(time.time()) + '.csv') + + + def _mean_abs_temporal_derivative(self,X,window_size): + """ statistical mean_abs_temporal_derivative of time series sequence + Args: + X : DataFrame + Time series. 
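+            window_size : int
+                Size of the rolling window; -1 (the default) uses the entire series.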
+ Returns: + DataFrame + A object with mean_abs_temporal_derivative + """ + if(window_size==-1): + window_size = len(X) + transformed_X = utils.pandas.DataFrame() + for column in X.columns: + column_value = X[column].values + column_mean_abs_temporal_derivative = np.zeros(len(column_value)) + for iter in range(window_size-1,len(column_value)): + sequence = column_value[iter-window_size+1:iter+1] + column_mean_abs_temporal_derivative[iter] = np.mean(np.abs(np.diff(sequence))) + column_mean_abs_temporal_derivative[:window_size-1] = column_mean_abs_temporal_derivative[window_size-1] + transformed_X[column + "_mean_abs_temporal_derivative"] = column_mean_abs_temporal_derivative + + return transformed_X diff --git a/tods/feature_analysis/StatisticalMeanTemporalDerivative.py b/tods/feature_analysis/StatisticalMeanTemporalDerivative.py new file mode 100644 index 0000000..957367b --- /dev/null +++ b/tods/feature_analysis/StatisticalMeanTemporalDerivative.py @@ -0,0 +1,330 @@ +import os +from typing import Any,Optional,List +import statsmodels.api as sm +import numpy as np +from d3m import container, utils as d3m_utils +from d3m import utils + +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os + +import numpy +import typing +import time + +from d3m import container +from d3m.primitive_interfaces import base, transformer + +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base + +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError + +__all__ = ('StatisticalMeanTemporalDerivativePrimitive',) + +Inputs = container.DataFrame +Outputs = container.DataFrame + +class Params(params.Params): + #to-do : how to make params dynamic + use_column_names: Optional[Any] + + + +class Hyperparams(hyperparams.Hyperparams): + + #Tuning Parameter + #default -1 considers entire time series is considered + window_size = hyperparams.Hyperparameter(default=-1, semantic_types=[ + 'https://metadata.datadrivendiscovery.org/types/TuningParameter', + ], description="Window Size for decomposition") + #control parameter + use_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to operate on. If any specified column cannot be parsed, it is skipped.", + ) + exclude_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='append', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. 
Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', + 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute'], + default='https://metadata.datadrivendiscovery.org/types/Attribute', + description='Decides what semantic type to attach to generated attributes', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + + + +class StatisticalMeanTemporalDerivativePrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): + """ + Primitive to find mean_temporal_derivative of time series + """ + __author__ = "DATA Lab at Texas A&M University", + metadata = metadata_base.PrimitiveMetadata( + { + 'id': 'bc051fbb-836b-414e-ad3e-5bf29c9f78f1', + 'version': '0.1.0', + 'name': 'Time Series Decompostional', + 'python_path': 'd3m.primitives.tods.feature_analysis.statistical_mean_temporal_derivative', + 'keywords': ['Time Series','MeanTemporalDerivative'], + "hyperparams_to_tune": ['window_size'], + 'source': { + 'name': 'DATA Lab at Texas A&M University', + 'uris': ['https://gitlab.com/lhenry15/tods.git','https://gitlab.com/lhenry15/tods/-/blob/devesh/tods/feature_analysis/StatisticalMeanTemporalDerivative.py'], + 'contact': 'mailto:khlai037@tamu.edu' + + }, + 'installation': [ + {'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/lhenry15/tods.git@{git_commit}#egg=TODS'.format( + git_commit=d3m_utils.current_git_commit(os.path.dirname(__file__)), + ), + } + + ], + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.DATA_PROFILING, + ], + 'primitive_family': metadata_base.PrimitiveFamily.FEATURE_CONSTRUCTION, + + } + ) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + """ + + Args: + inputs: Container DataFrame + timeout: Default + iterations: Default + + Returns: + Container DataFrame containing mean_temporal_derivative of time series + """ + self.logger.info('Statistical MeanTemporalDerivative Primitive called') + + # Get cols to fit. 
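+        # Select the target columns, compute the rolling mean of the first difference
+        # per column via self._mean_temporal_derivative(), and merge the
+        # "<column>_mean_temporal_derivative" outputs back according to `return_result`.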
+ self._fitted = False + self._training_inputs, self._training_indices = self._get_columns_to_fit(inputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + if len(self._training_indices) > 0: + # self._clf.fit(self._training_inputs) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + if not self._fitted: + raise PrimitiveNotFittedError("Primitive not fitted.") + statistical_mean_temporal_derivative_input = inputs + if self.hyperparams['use_semantic_types']: + statistical_mean_temporal_derivative_input = inputs.iloc[:, self._training_indices] + output_columns = [] + if len(self._training_indices) > 0: + statistical_mean_temporal_derivative_output = self._mean_temporal_derivative(statistical_mean_temporal_derivative_input,self.hyperparams["window_size"]) + + if sparse.issparse(statistical_mean_temporal_derivative_output): + statistical_mean_temporal_derivative_output = statistical_mean_temporal_derivative_output.toarray() + outputs = self._wrap_predictions(inputs, statistical_mean_temporal_derivative_output) + + #if len(outputs.columns) == len(self._input_column_names): + # outputs.columns = self._input_column_names + + output_columns = [outputs] + + + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._training_indices, + columns_list=output_columns) + + self.logger.info('Statistical MeanTemporalDerivative Primitive returned') + + return base.CallResult(outputs) + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + """ + Select columns to fit. + Args: + inputs: Container DataFrame + hyperparams: d3m.metadata.hyperparams.Hyperparams + + Returns: + list + """ + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + use_columns = hyperparams['use_columns'] + exclude_columns = hyperparams['exclude_columns'] + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=use_columns, + exclude_columns=exclude_columns, + can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, + hyperparams: Hyperparams) -> bool: + """ + Output whether a column can be processed. 
+ Args: + inputs_metadata: d3m.metadata.base.DataMetadata + column_index: int + + Returns: + bool + """ + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, numpy.integer, numpy.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + return True + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + """ + Updata metadata for selected columns. + Args: + inputs_metadata: metadata_base.DataMetadata + outputs: Container Dataframe + target_columns_metadata: list + + Returns: + d3m.metadata.base.DataMetadata + """ + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + """ + Wrap predictions into dataframe + Args: + inputs: Container Dataframe + predictions: array-like data (n_samples, n_features) + + Returns: + Dataframe + """ + outputs = d3m_dataframe(predictions, generate_metadata=True) + target_columns_metadata = self._add_target_columns_metadata(outputs.metadata, self.hyperparams) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, target_columns_metadata) + + return outputs + + @classmethod + def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams): + """ + Add target columns metadata + Args: + outputs_metadata: metadata.base.DataMetadata + hyperparams: d3m.metadata.hyperparams.Hyperparams + + Returns: + List[OrderedDict] + """ + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + # column_name = "output_{}".format(column_index) + column_metadata = OrderedDict() + semantic_types = set() + semantic_types.add(hyperparams["return_semantic_type"]) + column_metadata['semantic_types'] = list(semantic_types) + + # column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + def _write(self, inputs: Inputs): + inputs.to_csv(str(time.time()) + '.csv') + + + def _mean_temporal_derivative(self,X,window_size): + """ statistical mean_temporal_derivative of time series sequence + Args: + X : DataFrame + Time series. 
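+            window_size : int
+                Size of the rolling window; -1 (the default) uses the entire series.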
+ Returns: + DataFrame + A object with mean_temporal_derivative + """ + if(window_size==-1): + window_size = len(X) + transformed_X = utils.pandas.DataFrame() + for column in X.columns: + column_value = X[column].values + column_mean_temporal_derivative = np.zeros(len(column_value)) + for iter in range(window_size-1,len(column_value)): + sequence = column_value[iter-window_size+1:iter+1] + column_mean_temporal_derivative[iter] = np.mean(np.diff(sequence)) + column_mean_temporal_derivative[:window_size-1] = column_mean_temporal_derivative[window_size-1] + transformed_X[column + "_mean_temporal_derivative"] = column_mean_temporal_derivative + + return transformed_X diff --git a/tods/feature_analysis/StatisticalMedian.py b/tods/feature_analysis/StatisticalMedian.py new file mode 100644 index 0000000..48ecb4d --- /dev/null +++ b/tods/feature_analysis/StatisticalMedian.py @@ -0,0 +1,330 @@ +import os +from typing import Any,Optional,List +import statsmodels.api as sm +import numpy as np +from d3m import container, utils as d3m_utils +from d3m import utils + +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os + +import numpy +import typing +import time + +from d3m import container +from d3m.primitive_interfaces import base, transformer + +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base + +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError + +__all__ = ('StatisticalMedianPrimitive',) + +Inputs = container.DataFrame +Outputs = container.DataFrame + +class Params(params.Params): + #to-do : how to make params dynamic + use_column_names: Optional[Any] + + + +class Hyperparams(hyperparams.Hyperparams): + + #Tuning Parameter + #default windiow -1 considers entire time series is considered + window_size = hyperparams.Hyperparameter(default=-1, semantic_types=[ + 'https://metadata.datadrivendiscovery.org/types/TuningParameter', + ], description="Window Size for decomposition") + #control parameter + use_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to operate on. If any specified column cannot be parsed, it is skipped.", + ) + exclude_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='append', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. 
Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', + 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute'], + default='https://metadata.datadrivendiscovery.org/types/Attribute', + description='Decides what semantic type to attach to generated attributes', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + + + +class StatisticalMedianPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): + """ + Primitive to find median of time series + """ + __author__ = "DATA Lab at Texas A&M University", + metadata = metadata_base.PrimitiveMetadata( + { + 'id': '9f05a450-c1f0-49f6-971b-dcc3789174d0', + 'version': '0.1.0', + 'name': 'Time Series Decompostional', + 'python_path': 'd3m.primitives.tods.feature_analysis.statistical_median', + 'keywords': ['Time Series','Median'], + "hyperparams_to_tune": ['window_size'], + 'source': { + 'name': 'DATA Lab at Texas A&M University', + 'uris': ['https://gitlab.com/lhenry15/tods.git','https://gitlab.com/lhenry15/tods/-/blob/devesh/tods/feature_analysis/StatisticalMedian.py'], + 'contact': 'mailto:khlai037@tamu.edu' + + }, + 'installation': [ + {'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/lhenry15/tods.git@{git_commit}#egg=TODS'.format( + git_commit=d3m_utils.current_git_commit(os.path.dirname(__file__)), + ), + } + + ], + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.DATA_PROFILING, + ], + 'primitive_family': metadata_base.PrimitiveFamily.FEATURE_CONSTRUCTION, + + } + ) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + """ + + Args: + inputs: Container DataFrame + timeout: Default + iterations: Default + + Returns: + Container DataFrame containing median of time series + """ + self.logger.info('Statistical Median Primitive called') + + # Get cols to fit. 
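+ # Note: as a TransformerPrimitiveBase this primitive has no separate fit step;
+ # the column selection below doubles as "fitting" on every produce() call.
+ # Minimal usage sketch (untested; `df` is a hypothetical d3m container DataFrame
+ # holding numeric value columns):
+ #     hp = Hyperparams.defaults().replace({'window_size': 10})
+ #     primitive = StatisticalMedianPrimitive(hyperparams=hp)
+ #     medians = primitive.produce(inputs=df).value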
+ self._fitted = False + self._training_inputs, self._training_indices = self._get_columns_to_fit(inputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + if len(self._training_indices) > 0: + # self._clf.fit(self._training_inputs) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + if not self._fitted: + raise PrimitiveNotFittedError("Primitive not fitted.") + statistical_median_input = inputs + if self.hyperparams['use_semantic_types']: + statistical_median_input = inputs.iloc[:, self._training_indices] + output_columns = [] + if len(self._training_indices) > 0: + statistical_median_output = self._median(statistical_median_input,self.hyperparams["window_size"]) + + if sparse.issparse(statistical_median_output): + statistical_median_output = statistical_median_output.toarray() + outputs = self._wrap_predictions(inputs, statistical_median_output) + + #if len(outputs.columns) == len(self._input_column_names): + # outputs.columns = self._input_column_names + + output_columns = [outputs] + + + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._training_indices, + columns_list=output_columns) + + self.logger.info('Statistical Median Primitive returned') + + return base.CallResult(outputs) + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + """ + Select columns to fit. + Args: + inputs: Container DataFrame + hyperparams: d3m.metadata.hyperparams.Hyperparams + + Returns: + list + """ + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + use_columns = hyperparams['use_columns'] + exclude_columns = hyperparams['exclude_columns'] + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=use_columns, + exclude_columns=exclude_columns, + can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, + hyperparams: Hyperparams) -> bool: + """ + Output whether a column can be processed. 
+ Args: + inputs_metadata: d3m.metadata.base.DataMetadata + column_index: int + + Returns: + bool + """ + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, numpy.integer, numpy.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + return True + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + """ + Updata metadata for selected columns. + Args: + inputs_metadata: metadata_base.DataMetadata + outputs: Container Dataframe + target_columns_metadata: list + + Returns: + d3m.metadata.base.DataMetadata + """ + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + """ + Wrap predictions into dataframe + Args: + inputs: Container Dataframe + predictions: array-like data (n_samples, n_features) + + Returns: + Dataframe + """ + outputs = d3m_dataframe(predictions, generate_metadata=True) + target_columns_metadata = self._add_target_columns_metadata(outputs.metadata, self.hyperparams) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, target_columns_metadata) + + return outputs + + @classmethod + def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams): + """ + Add target columns metadata + Args: + outputs_metadata: metadata.base.DataMetadata + hyperparams: d3m.metadata.hyperparams.Hyperparams + + Returns: + List[OrderedDict] + """ + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + # column_name = "output_{}".format(column_index) + column_metadata = OrderedDict() + semantic_types = set() + semantic_types.add(hyperparams["return_semantic_type"]) + column_metadata['semantic_types'] = list(semantic_types) + + # column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + def _write(self, inputs: Inputs): + inputs.to_csv(str(time.time()) + '.csv') + + + def _median(self,X,window_size): + """ statistical median of time series sequence + Args: + X : DataFrame + Time series. 
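+ window_size : int
+ Rolling window length; -1 (the default) computes the median over the entire series.
+ Note: for positions >= window_size - 1 the loop below matches
+ pandas.Series(values).rolling(window_size).median(); the first
+ window_size - 1 positions are back-filled with the first full-window median.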
+ Returns: + DataFrame + A object with median + """ + if(window_size==-1): + window_size = len(X) + transformed_X = utils.pandas.DataFrame() + for column in X.columns: + column_value = X[column].values + column_median = np.zeros(len(column_value)) + for iter in range(window_size-1,len(column_value)): + sequence = column_value[iter-window_size+1:iter+1] + column_median[iter] = np.median(sequence) + column_median[:window_size-1] = column_median[window_size-1] + transformed_X[column + "_median"] = column_median + + return transformed_X diff --git a/tods/feature_analysis/StatisticalMedianAbsoluteDeviation.py b/tods/feature_analysis/StatisticalMedianAbsoluteDeviation.py new file mode 100644 index 0000000..deabe8f --- /dev/null +++ b/tods/feature_analysis/StatisticalMedianAbsoluteDeviation.py @@ -0,0 +1,331 @@ +import os +from typing import Any,Optional,List +import statsmodels.api as sm +import numpy as np +from d3m import container, utils as d3m_utils +from d3m import utils + +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os +from scipy import stats + +import numpy +import typing +import time + +from d3m import container +from d3m.primitive_interfaces import base, transformer + +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base + +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError + +__all__ = ('StatisticalMedianAbsoluteDeviationPrimitive',) + +Inputs = container.DataFrame +Outputs = container.DataFrame + +class Params(params.Params): + #to-do : how to make params dynamic + use_column_names: Optional[Any] + + + +class Hyperparams(hyperparams.Hyperparams): + + #Tuning Parameter + #default -1 consider entire time series is considered + window_size = hyperparams.Hyperparameter(default=-1, semantic_types=[ + 'https://metadata.datadrivendiscovery.org/types/TuningParameter', + ], description="Window Size for decomposition") + #control parameter + use_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to operate on. If any specified column cannot be parsed, it is skipped.", + ) + exclude_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='append', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. 
Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', + 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute'], + default='https://metadata.datadrivendiscovery.org/types/Attribute', + description='Decides what semantic type to attach to generated attributes', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + + + +class StatisticalMedianAbsoluteDeviationPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): + """ + Primitive to find median_absolute_deviation of time series + """ + __author__ = "DATA Lab at Texas A&M University", + metadata = metadata_base.PrimitiveMetadata( + { + 'id': '36e7d739-72c3-4e6e-91b8-b2b64cbe4e12', + 'version': '0.1.0', + 'name': 'Time Series Decompostional', + 'python_path': 'd3m.primitives.tods.feature_analysis.statistical_median_abs_deviation', + 'keywords': ['Time Series','MedianAbsoluteDeviation'], + "hyperparams_to_tune": ['window_size'], + 'source': { + 'name': 'DATA Lab at Texas A&M University', + 'uris': ['https://gitlab.com/lhenry15/tods.git','https://gitlab.com/lhenry15/tods/-/blob/devesh/tods/feature_analysis/StatisticalMedianAbsoluteDeviation.py'], + 'contact': 'mailto:khlai037@tamu.edu' + + }, + 'installation': [ + {'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/lhenry15/tods.git@{git_commit}#egg=TODS'.format( + git_commit=d3m_utils.current_git_commit(os.path.dirname(__file__)), + ), + } + + ], + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.DATA_PROFILING, + ], + 'primitive_family': metadata_base.PrimitiveFamily.FEATURE_CONSTRUCTION, + + } + ) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + """ + + Args: + inputs: Container DataFrame + timeout: Default + iterations: Default + + Returns: + Container DataFrame containing median_absolute_deviation of time series + """ + self.logger.info('Statistical MedianAbsoluteDeviation Primitive called') + + # Get cols to fit. 
+ self._fitted = False + self._training_inputs, self._training_indices = self._get_columns_to_fit(inputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + if len(self._training_indices) > 0: + # self._clf.fit(self._training_inputs) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + if not self._fitted: + raise PrimitiveNotFittedError("Primitive not fitted.") + statistical_median_absolute_deviation_input = inputs + if self.hyperparams['use_semantic_types']: + statistical_median_absolute_deviation_input = inputs.iloc[:, self._training_indices] + output_columns = [] + if len(self._training_indices) > 0: + statistical_median_absolute_deviation_output = self._median_absolute_deviation(statistical_median_absolute_deviation_input,self.hyperparams["window_size"]) + + if sparse.issparse(statistical_median_absolute_deviation_output): + statistical_median_absolute_deviation_output = statistical_median_absolute_deviation_output.toarray() + outputs = self._wrap_predictions(inputs, statistical_median_absolute_deviation_output) + + #if len(outputs.columns) == len(self._input_column_names): + # outputs.columns = self._input_column_names + + output_columns = [outputs] + + + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._training_indices, + columns_list=output_columns) + + self.logger.info('Statistical MedianAbsoluteDeviation Primitive returned') + + return base.CallResult(outputs) + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + """ + Select columns to fit. + Args: + inputs: Container DataFrame + hyperparams: d3m.metadata.hyperparams.Hyperparams + + Returns: + list + """ + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + use_columns = hyperparams['use_columns'] + exclude_columns = hyperparams['exclude_columns'] + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=use_columns, + exclude_columns=exclude_columns, + can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, + hyperparams: Hyperparams) -> bool: + """ + Output whether a column can be processed. 
+ Args: + inputs_metadata: d3m.metadata.base.DataMetadata + column_index: int + + Returns: + bool + """ + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, numpy.integer, numpy.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + return True + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + """ + Updata metadata for selected columns. + Args: + inputs_metadata: metadata_base.DataMetadata + outputs: Container Dataframe + target_columns_metadata: list + + Returns: + d3m.metadata.base.DataMetadata + """ + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + """ + Wrap predictions into dataframe + Args: + inputs: Container Dataframe + predictions: array-like data (n_samples, n_features) + + Returns: + Dataframe + """ + outputs = d3m_dataframe(predictions, generate_metadata=True) + target_columns_metadata = self._add_target_columns_metadata(outputs.metadata, self.hyperparams) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, target_columns_metadata) + + return outputs + + @classmethod + def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams): + """ + Add target columns metadata + Args: + outputs_metadata: metadata.base.DataMetadata + hyperparams: d3m.metadata.hyperparams.Hyperparams + + Returns: + List[OrderedDict] + """ + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + # column_name = "output_{}".format(column_index) + column_metadata = OrderedDict() + semantic_types = set() + semantic_types.add(hyperparams["return_semantic_type"]) + column_metadata['semantic_types'] = list(semantic_types) + + # column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + def _write(self, inputs: Inputs): + inputs.to_csv(str(time.time()) + '.csv') + + + def _median_absolute_deviation(self,X,window_size): + """ statistical median_absolute_deviation of time series sequence + Args: + X : DataFrame + Time series. 
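+ window_size : int
+ Rolling window length; -1 (the default) uses the entire series.
+ Note: scipy.stats.median_absolute_deviation applies a normal-consistency
+ scale factor (about 1.4826) by default, so values exceed the raw MAD.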
+ Returns: + DataFrame + A object with median_absolute_deviation + """ + if(window_size==-1): + window_size = len(X) + transformed_X = utils.pandas.DataFrame() + for column in X.columns: + column_value = X[column].values + column_median_absolute_deviation = np.zeros(len(column_value)) + for iter in range(window_size-1,len(column_value)): + sequence = column_value[iter-window_size+1:iter+1] + column_median_absolute_deviation[iter] = stats.median_absolute_deviation(sequence).round(4) + column_median_absolute_deviation[:window_size-1] = column_median_absolute_deviation[window_size-1] + transformed_X[column + "_median_absolute_deviation"] = column_median_absolute_deviation + + return transformed_X diff --git a/tods/feature_analysis/StatisticalMinimum.py b/tods/feature_analysis/StatisticalMinimum.py new file mode 100644 index 0000000..907074f --- /dev/null +++ b/tods/feature_analysis/StatisticalMinimum.py @@ -0,0 +1,330 @@ +import os +from typing import Any,Optional,List +import statsmodels.api as sm +import numpy as np +from d3m import container, utils as d3m_utils +from d3m import utils + +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os + +import numpy +import typing +import time + +from d3m import container +from d3m.primitive_interfaces import base, transformer + +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base + +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError + +__all__ = ('StatisticalMinimumPrimitive',) + +Inputs = container.DataFrame +Outputs = container.DataFrame + +class Params(params.Params): + #to-do : how to make params dynamic + use_column_names: Optional[Any] + + + +class Hyperparams(hyperparams.Hyperparams): + + #Tuning Parameter + #default -1 considers entire time series is considered + window_size = hyperparams.Hyperparameter(default=-1, semantic_types=[ + 'https://metadata.datadrivendiscovery.org/types/TuningParameter', + ], description="Window Size for decomposition") + #control parameter + use_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to operate on. If any specified column cannot be parsed, it is skipped.", + ) + exclude_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='append', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. 
Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', + 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute'], + default='https://metadata.datadrivendiscovery.org/types/Attribute', + description='Decides what semantic type to attach to generated attributes', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + + + +class StatisticalMinimumPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): + """ + Primitive to find minimum of time series + """ + __author__ = "DATA Lab at Texas A&M University", + metadata = metadata_base.PrimitiveMetadata( + { + 'id': '255955d0-1d64-433b-b9f0-e2a1b679be45', + 'version': '0.1.0', + 'name': 'Time Series Decompostional', + 'python_path': 'd3m.primitives.tods.feature_analysis.statistical_minimum', + 'keywords': ['Time Series','Minimum'], + "hyperparams_to_tune": ['window_size'], + 'source': { + 'name': 'DATA Lab at Texas A&M University', + 'uris': ['https://gitlab.com/lhenry15/tods.git','https://gitlab.com/lhenry15/tods/-/blob/devesh/tods/feature_analysis/StatisticalMinimum.py'], + 'contact': 'mailto:khlai037@tamu.edu' + + }, + 'installation': [ + {'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/lhenry15/tods.git@{git_commit}#egg=TODS'.format( + git_commit=d3m_utils.current_git_commit(os.path.dirname(__file__)), + ), + } + + ], + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.DATA_PROFILING, + ], + 'primitive_family': metadata_base.PrimitiveFamily.FEATURE_CONSTRUCTION, + + } + ) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + """ + + Args: + inputs: Container DataFrame + timeout: Default + iterations: Default + + Returns: + Container DataFrame containing minimum of time series + """ + self.logger.info('Statistical Minimum Primitive called') + + # Get cols to fit. 
+ self._fitted = False + self._training_inputs, self._training_indices = self._get_columns_to_fit(inputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + if len(self._training_indices) > 0: + # self._clf.fit(self._training_inputs) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + if not self._fitted: + raise PrimitiveNotFittedError("Primitive not fitted.") + statistical_minimum_input = inputs + if self.hyperparams['use_semantic_types']: + statistical_minimum_input = inputs.iloc[:, self._training_indices] + output_columns = [] + if len(self._training_indices) > 0: + statistical_minimum_output = self._minimum(statistical_minimum_input,self.hyperparams["window_size"]) + + if sparse.issparse(statistical_minimum_output): + statistical_minimum_output = statistical_minimum_output.toarray() + outputs = self._wrap_predictions(inputs, statistical_minimum_output) + + #if len(outputs.columns) == len(self._input_column_names): + # outputs.columns = self._input_column_names + + output_columns = [outputs] + + + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._training_indices, + columns_list=output_columns) + + self.logger.info('Statistical Minimum Primitive returned') + + return base.CallResult(outputs) + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + """ + Select columns to fit. + Args: + inputs: Container DataFrame + hyperparams: d3m.metadata.hyperparams.Hyperparams + + Returns: + list + """ + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + use_columns = hyperparams['use_columns'] + exclude_columns = hyperparams['exclude_columns'] + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=use_columns, + exclude_columns=exclude_columns, + can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, + hyperparams: Hyperparams) -> bool: + """ + Output whether a column can be processed. 
+ Args: + inputs_metadata: d3m.metadata.base.DataMetadata + column_index: int + + Returns: + bool + """ + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, numpy.integer, numpy.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + return True + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + """ + Updata metadata for selected columns. + Args: + inputs_metadata: metadata_base.DataMetadata + outputs: Container Dataframe + target_columns_metadata: list + + Returns: + d3m.metadata.base.DataMetadata + """ + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + """ + Wrap predictions into dataframe + Args: + inputs: Container Dataframe + predictions: array-like data (n_samples, n_features) + + Returns: + Dataframe + """ + outputs = d3m_dataframe(predictions, generate_metadata=True) + target_columns_metadata = self._add_target_columns_metadata(outputs.metadata, self.hyperparams) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, target_columns_metadata) + + return outputs + + @classmethod + def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams): + """ + Add target columns metadata + Args: + outputs_metadata: metadata.base.DataMetadata + hyperparams: d3m.metadata.hyperparams.Hyperparams + + Returns: + List[OrderedDict] + """ + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + # column_name = "output_{}".format(column_index) + column_metadata = OrderedDict() + semantic_types = set() + semantic_types.add(hyperparams["return_semantic_type"]) + column_metadata['semantic_types'] = list(semantic_types) + + # column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + def _write(self, inputs: Inputs): + inputs.to_csv(str(time.time()) + '.csv') + + + def _minimum(self,X,window_size): + """ statistical minimum of time series sequence + Args: + X : DataFrame + Time series. 
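+ window_size : int
+ Rolling window length; -1 (the default) takes the minimum over the entire series.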
+ Returns: + DataFrame + A object with minimum + """ + if(window_size==-1): + window_size = len(X) + transformed_X = utils.pandas.DataFrame() + for column in X.columns: + column_value = X[column].values + column_minimum = np.zeros(len(column_value)) + for iter in range(window_size-1,len(column_value)): + sequence = column_value[iter-window_size+1:iter+1] + column_minimum[iter] = np.min(sequence) + column_minimum[:window_size-1] = column_minimum[window_size-1] + transformed_X[column + "_minimum"] = column_minimum + + return transformed_X diff --git a/tods/feature_analysis/StatisticalSkew.py b/tods/feature_analysis/StatisticalSkew.py new file mode 100644 index 0000000..374e9ac --- /dev/null +++ b/tods/feature_analysis/StatisticalSkew.py @@ -0,0 +1,331 @@ +import os +from typing import Any,Optional,List +import statsmodels.api as sm +import numpy as np +from d3m import container, utils as d3m_utils +from d3m import utils + +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os +from scipy import stats + +import numpy +import typing +import time + +from d3m import container +from d3m.primitive_interfaces import base, transformer + +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base + +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError + +__all__ = ('StatisticalSkewPrimitive',) + +Inputs = container.DataFrame +Outputs = container.DataFrame + +class Params(params.Params): + #to-do : how to make params dynamic + use_column_names: Optional[Any] + + + +class Hyperparams(hyperparams.Hyperparams): + + #Tuning Parameter + #default -1 consider entire time series is considered + window_size = hyperparams.Hyperparameter(default=-1, semantic_types=[ + 'https://metadata.datadrivendiscovery.org/types/TuningParameter', + ], description="Window Size for decomposition") + #control parameter + use_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to operate on. If any specified column cannot be parsed, it is skipped.", + ) + exclude_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='append', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. 
Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', + 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute'], + default='https://metadata.datadrivendiscovery.org/types/Attribute', + description='Decides what semantic type to attach to generated attributes', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + + + +class StatisticalSkewPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): + """ + Primitive to find skew of time series + """ + __author__ = "DATA Lab at Texas A&M University", + metadata = metadata_base.PrimitiveMetadata( + { + 'id': 'cd154af5-8f98-480a-8a72-6a22365c3c6f', + 'version': '0.1.0', + 'name': 'Time Series Decompostional', + 'python_path': 'd3m.primitives.tods.feature_analysis.statistical_skew', + 'keywords': ['Time Series','Skew'], + "hyperparams_to_tune": ['window_size'], + 'source': { + 'name': 'DATA Lab at Texas A&M University', + 'uris': ['https://gitlab.com/lhenry15/tods.git','https://gitlab.com/lhenry15/tods/-/blob/devesh/tods/feature_analysis/StatisticalSkew.py'], + 'contact': 'mailto:khlai037@tamu.edu' + + }, + 'installation': [ + {'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/lhenry15/tods.git@{git_commit}#egg=TODS'.format( + git_commit=d3m_utils.current_git_commit(os.path.dirname(__file__)), + ), + } + + ], + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.DATA_PROFILING, + ], + 'primitive_family': metadata_base.PrimitiveFamily.FEATURE_CONSTRUCTION, + + } + ) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + """ + + Args: + inputs: Container DataFrame + timeout: Default + iterations: Default + + Returns: + Container DataFrame containing skew of time series + """ + self.logger.info('Statistical Skew Primitive called') + + # Get cols to fit. 
+ self._fitted = False + self._training_inputs, self._training_indices = self._get_columns_to_fit(inputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + if len(self._training_indices) > 0: + # self._clf.fit(self._training_inputs) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + if not self._fitted: + raise PrimitiveNotFittedError("Primitive not fitted.") + statistical_skew_input = inputs + if self.hyperparams['use_semantic_types']: + statistical_skew_input = inputs.iloc[:, self._training_indices] + output_columns = [] + if len(self._training_indices) > 0: + statistical_skew_output = self._skew(statistical_skew_input,self.hyperparams["window_size"]) + + if sparse.issparse(statistical_skew_output): + statistical_skew_output = statistical_skew_output.toarray() + outputs = self._wrap_predictions(inputs, statistical_skew_output) + + #if len(outputs.columns) == len(self._input_column_names): + # outputs.columns = self._input_column_names + + output_columns = [outputs] + + + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._training_indices, + columns_list=output_columns) + + self.logger.info('Statistical Skew Primitive returned') + + return base.CallResult(outputs) + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + """ + Select columns to fit. + Args: + inputs: Container DataFrame + hyperparams: d3m.metadata.hyperparams.Hyperparams + + Returns: + list + """ + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + use_columns = hyperparams['use_columns'] + exclude_columns = hyperparams['exclude_columns'] + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=use_columns, + exclude_columns=exclude_columns, + can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, + hyperparams: Hyperparams) -> bool: + """ + Output whether a column can be processed. 
+ Args: + inputs_metadata: d3m.metadata.base.DataMetadata + column_index: int + + Returns: + bool + """ + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, numpy.integer, numpy.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + return True + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + """ + Updata metadata for selected columns. + Args: + inputs_metadata: metadata_base.DataMetadata + outputs: Container Dataframe + target_columns_metadata: list + + Returns: + d3m.metadata.base.DataMetadata + """ + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + """ + Wrap predictions into dataframe + Args: + inputs: Container Dataframe + predictions: array-like data (n_samples, n_features) + + Returns: + Dataframe + """ + outputs = d3m_dataframe(predictions, generate_metadata=True) + target_columns_metadata = self._add_target_columns_metadata(outputs.metadata, self.hyperparams) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, target_columns_metadata) + + return outputs + + @classmethod + def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams): + """ + Add target columns metadata + Args: + outputs_metadata: metadata.base.DataMetadata + hyperparams: d3m.metadata.hyperparams.Hyperparams + + Returns: + List[OrderedDict] + """ + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + # column_name = "output_{}".format(column_index) + column_metadata = OrderedDict() + semantic_types = set() + semantic_types.add(hyperparams["return_semantic_type"]) + column_metadata['semantic_types'] = list(semantic_types) + + # column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + def _write(self, inputs: Inputs): + inputs.to_csv(str(time.time()) + '.csv') + + + def _skew(self,X,window_size): + """ statistical skew of time series sequence + Args: + X : DataFrame + Time series. 
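+ window_size : int
+ Rolling window length; -1 (the default) computes skewness over the entire series.
+ Note: scipy.stats.skew returns the Fisher-Pearson coefficient without bias correction by default.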
+ Returns: + DataFrame + A object with skew + """ + if(window_size==-1): + window_size = len(X) + transformed_X = utils.pandas.DataFrame() + for column in X.columns: + column_value = X[column].values + column_skew = np.zeros(len(column_value)) + for iter in range(window_size-1,len(column_value)): + sequence = column_value[iter-window_size+1:iter+1] + column_skew[iter] = round(stats.skew(sequence),4) + column_skew[:window_size-1] = column_skew[window_size-1] + transformed_X[column + "_skew"] = column_skew + + return transformed_X diff --git a/tods/feature_analysis/StatisticalStd.py b/tods/feature_analysis/StatisticalStd.py new file mode 100644 index 0000000..cd0d783 --- /dev/null +++ b/tods/feature_analysis/StatisticalStd.py @@ -0,0 +1,330 @@ +import os +from typing import Any,Optional,List +import statsmodels.api as sm +import numpy as np +from d3m import container, utils as d3m_utils +from d3m import utils + +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os + +import numpy +import typing +import time + +from d3m import container +from d3m.primitive_interfaces import base, transformer + +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base + +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError + +__all__ = ('StatisticalStdPrimitive',) + +Inputs = container.DataFrame +Outputs = container.DataFrame + +class Params(params.Params): + #to-do : how to make params dynamic + use_column_names: Optional[Any] + + + +class Hyperparams(hyperparams.Hyperparams): + + #Tuning Parameter + #default -1 considers entire time series is considered + window_size = hyperparams.Hyperparameter(default=-1, semantic_types=[ + 'https://metadata.datadrivendiscovery.org/types/TuningParameter', + ], description="Window Size for decomposition") + #control parameter + use_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to operate on. If any specified column cannot be parsed, it is skipped.", + ) + exclude_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='append', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. 
Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', + 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute'], + default='https://metadata.datadrivendiscovery.org/types/Attribute', + description='Decides what semantic type to attach to generated attributes', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + + + +class StatisticalStdPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): + """ + Primitive to find std of time series + """ + __author__ = "DATA Lab at Texas A&M University", + metadata = metadata_base.PrimitiveMetadata( + { + 'id': '652fc98a-8bd9-45a2-8005-dc781bf0c136', + 'version': '0.1.0', + 'name': 'Time Series Decompostional', + 'python_path': 'd3m.primitives.tods.feature_analysis.statistical_std', + 'keywords': ['Time Series','Std'], + "hyperparams_to_tune": ['window_size'], + 'source': { + 'name': 'DATA Lab at Texas A&M University', + 'uris': ['https://gitlab.com/lhenry15/tods.git','https://gitlab.com/lhenry15/tods/-/blob/devesh/tods/feature_analysis/StatisticalStd.py'], + 'contact': 'mailto:khlai037@tamu.edu' + + }, + 'installation': [ + {'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/lhenry15/tods.git@{git_commit}#egg=TODS'.format( + git_commit=d3m_utils.current_git_commit(os.path.dirname(__file__)), + ), + } + + ], + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.DATA_PROFILING, + ], + 'primitive_family': metadata_base.PrimitiveFamily.FEATURE_CONSTRUCTION, + + } + ) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + """ + + Args: + inputs: Container DataFrame + timeout: Default + iterations: Default + + Returns: + Container DataFrame containing std of time series + """ + self.logger.info('Statistical Std Primitive called') + + # Get cols to fit. 
+ self._fitted = False + self._training_inputs, self._training_indices = self._get_columns_to_fit(inputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + if len(self._training_indices) > 0: + # self._clf.fit(self._training_inputs) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + if not self._fitted: + raise PrimitiveNotFittedError("Primitive not fitted.") + statistical_std_input = inputs + if self.hyperparams['use_semantic_types']: + statistical_std_input = inputs.iloc[:, self._training_indices] + output_columns = [] + if len(self._training_indices) > 0: + statistical_std_output = self._std(statistical_std_input,self.hyperparams["window_size"]) + + if sparse.issparse(statistical_std_output): + statistical_std_output = statistical_std_output.toarray() + outputs = self._wrap_predictions(inputs, statistical_std_output) + + #if len(outputs.columns) == len(self._input_column_names): + # outputs.columns = self._input_column_names + + output_columns = [outputs] + + + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._training_indices, + columns_list=output_columns) + + self.logger.info('Statistical Std Primitive returned') + + return base.CallResult(outputs) + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + """ + Select columns to fit. + Args: + inputs: Container DataFrame + hyperparams: d3m.metadata.hyperparams.Hyperparams + + Returns: + list + """ + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + use_columns = hyperparams['use_columns'] + exclude_columns = hyperparams['exclude_columns'] + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=use_columns, + exclude_columns=exclude_columns, + can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, + hyperparams: Hyperparams) -> bool: + """ + Output whether a column can be processed. 
+ Args: + inputs_metadata: d3m.metadata.base.DataMetadata + column_index: int + + Returns: + bool + """ + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, numpy.integer, numpy.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + return True + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + """ + Updata metadata for selected columns. + Args: + inputs_metadata: metadata_base.DataMetadata + outputs: Container Dataframe + target_columns_metadata: list + + Returns: + d3m.metadata.base.DataMetadata + """ + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + """ + Wrap predictions into dataframe + Args: + inputs: Container Dataframe + predictions: array-like data (n_samples, n_features) + + Returns: + Dataframe + """ + outputs = d3m_dataframe(predictions, generate_metadata=True) + target_columns_metadata = self._add_target_columns_metadata(outputs.metadata, self.hyperparams) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, target_columns_metadata) + + return outputs + + @classmethod + def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams): + """ + Add target columns metadata + Args: + outputs_metadata: metadata.base.DataMetadata + hyperparams: d3m.metadata.hyperparams.Hyperparams + + Returns: + List[OrderedDict] + """ + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + # column_name = "output_{}".format(column_index) + column_metadata = OrderedDict() + semantic_types = set() + semantic_types.add(hyperparams["return_semantic_type"]) + column_metadata['semantic_types'] = list(semantic_types) + + # column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + def _write(self, inputs: Inputs): + inputs.to_csv(str(time.time()) + '.csv') + + + def _std(self,X,window_size): + """ statistical std of time series sequence + Args: + X : DataFrame + Time series. 
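+ window_size : int
+ Rolling window length; -1 (the default) computes the standard deviation over the entire series.
+ Note: numpy.std uses the population formula (ddof=0), not the sample estimate.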
+ Returns: + DataFrame + A object with std + """ + if(window_size==-1): + window_size = len(X) + transformed_X = utils.pandas.DataFrame() + for column in X.columns: + column_value = X[column].values + column_std = np.zeros(len(column_value)) + for iter in range(window_size-1,len(column_value)): + sequence = column_value[iter-window_size+1:iter+1] + column_std[iter] = np.std(sequence) + column_std[:window_size-1] = column_std[window_size-1] + transformed_X[column + "_std"] = column_std + + return transformed_X diff --git a/tods/feature_analysis/StatisticalVar.py b/tods/feature_analysis/StatisticalVar.py new file mode 100644 index 0000000..c558338 --- /dev/null +++ b/tods/feature_analysis/StatisticalVar.py @@ -0,0 +1,330 @@ +import os +from typing import Any,Optional,List +import statsmodels.api as sm +import numpy as np +from d3m import container, utils as d3m_utils +from d3m import utils + +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os + +import numpy +import typing +import time + +from d3m import container +from d3m.primitive_interfaces import base, transformer + +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base + +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError + +__all__ = ('StatisticalVarPrimitive',) + +Inputs = container.DataFrame +Outputs = container.DataFrame + +class Params(params.Params): + #to-do : how to make params dynamic + use_column_names: Optional[Any] + + + +class Hyperparams(hyperparams.Hyperparams): + + #Tuning Parameter + #default -1 considers entire time series is considered + window_size = hyperparams.Hyperparameter(default=-1, semantic_types=[ + 'https://metadata.datadrivendiscovery.org/types/TuningParameter', + ], description="Window Size for decomposition") + #control parameter + use_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to operate on. If any specified column cannot be parsed, it is skipped.", + ) + exclude_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='append', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. 
Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', + 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute'], + default='https://metadata.datadrivendiscovery.org/types/Attribute', + description='Decides what semantic type to attach to generated attributes', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + + + +class StatisticalVarPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): + """ + Primitive to find var of time series + """ + __author__ = "DATA Lab at Texas A&M University", + metadata = metadata_base.PrimitiveMetadata( + { + 'id': '9b237f3f-c638-44f4-adb1-f3f24a173711', + 'version': '0.1.0', + 'name': 'Time Series Decompostional', + 'python_path': 'd3m.primitives.tods.feature_analysis.statistical_var', + 'keywords': ['Time Series','Var'], + "hyperparams_to_tune": ['window_size'], + 'source': { + 'name': 'DATA Lab at Texas A&M University', + 'uris': ['https://gitlab.com/lhenry15/tods.git','https://gitlab.com/lhenry15/tods/-/blob/devesh/tods/feature_analysis/StatisticalVar.py'], + 'contact': 'mailto:khlai037@tamu.edu' + + }, + 'installation': [ + {'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/lhenry15/tods.git@{git_commit}#egg=TODS'.format( + git_commit=d3m_utils.current_git_commit(os.path.dirname(__file__)), + ), + } + + ], + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.DATA_PROFILING, + ], + 'primitive_family': metadata_base.PrimitiveFamily.FEATURE_CONSTRUCTION, + + } + ) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + """ + + Args: + inputs: Container DataFrame + timeout: Default + iterations: Default + + Returns: + Container DataFrame containing var of time series + """ + self.logger.info('Statistical Var Primitive called') + + # Get cols to fit. 
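+ # There is no separate fit stage for this transformer: column selection and
+ # the fitted flag are recomputed on every produce() call before _var runs.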
+ self._fitted = False + self._training_inputs, self._training_indices = self._get_columns_to_fit(inputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + if len(self._training_indices) > 0: + # self._clf.fit(self._training_inputs) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + if not self._fitted: + raise PrimitiveNotFittedError("Primitive not fitted.") + statistical_var_input = inputs + if self.hyperparams['use_semantic_types']: + statistical_var_input = inputs.iloc[:, self._training_indices] + output_columns = [] + if len(self._training_indices) > 0: + statistical_var_output = self._var(statistical_var_input,self.hyperparams["window_size"]) + + if sparse.issparse(statistical_var_output): + statistical_var_output = statistical_var_output.toarray() + outputs = self._wrap_predictions(inputs, statistical_var_output) + + #if len(outputs.columns) == len(self._input_column_names): + # outputs.columns = self._input_column_names + + output_columns = [outputs] + + + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._training_indices, + columns_list=output_columns) + + self.logger.info('Statistical Var Primitive returned') + + return base.CallResult(outputs) + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + """ + Select columns to fit. + Args: + inputs: Container DataFrame + hyperparams: d3m.metadata.hyperparams.Hyperparams + + Returns: + list + """ + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + use_columns = hyperparams['use_columns'] + exclude_columns = hyperparams['exclude_columns'] + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=use_columns, + exclude_columns=exclude_columns, + can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, + hyperparams: Hyperparams) -> bool: + """ + Output whether a column can be processed. 
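+ A column is accepted when its structural type is numeric (int/float);
+ note that the early return below leaves the semantic-type check unreachable.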
+ Args: + inputs_metadata: d3m.metadata.base.DataMetadata + column_index: int + + Returns: + bool + """ + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, numpy.integer, numpy.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + return True + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + """ + Updata metadata for selected columns. + Args: + inputs_metadata: metadata_base.DataMetadata + outputs: Container Dataframe + target_columns_metadata: list + + Returns: + d3m.metadata.base.DataMetadata + """ + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + """ + Wrap predictions into dataframe + Args: + inputs: Container Dataframe + predictions: array-like data (n_samples, n_features) + + Returns: + Dataframe + """ + outputs = d3m_dataframe(predictions, generate_metadata=True) + target_columns_metadata = self._add_target_columns_metadata(outputs.metadata, self.hyperparams) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, target_columns_metadata) + + return outputs + + @classmethod + def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams): + """ + Add target columns metadata + Args: + outputs_metadata: metadata.base.DataMetadata + hyperparams: d3m.metadata.hyperparams.Hyperparams + + Returns: + List[OrderedDict] + """ + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + # column_name = "output_{}".format(column_index) + column_metadata = OrderedDict() + semantic_types = set() + semantic_types.add(hyperparams["return_semantic_type"]) + column_metadata['semantic_types'] = list(semantic_types) + + # column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + def _write(self, inputs: Inputs): + inputs.to_csv(str(time.time()) + '.csv') + + + def _var(self,X,window_size): + """ statistical var of time series sequence + Args: + X : DataFrame + Time series. 
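+ window_size : int
+ Length of the rolling window; -1 (the default) computes the variance over the
+ full series. Roughly X[col].rolling(window_size).var(ddof=0).bfill() per column
+ (an illustrative pandas sketch, not part of this API).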
+ Returns: + DataFrame + A object with var + """ + if(window_size==-1): + window_size = len(X) + transformed_X = utils.pandas.DataFrame() + for column in X.columns: + column_value = X[column].values + column_var = np.zeros(len(column_value)) + for iter in range(window_size-1,len(column_value)): + sequence = column_value[iter-window_size+1:iter+1] + column_var[iter] = np.var(sequence) + column_var[:window_size-1] = column_var[window_size-1] + transformed_X[column + "_var"] = column_var + + return transformed_X diff --git a/tods/feature_analysis/StatisticalVariation.py b/tods/feature_analysis/StatisticalVariation.py new file mode 100644 index 0000000..86d5498 --- /dev/null +++ b/tods/feature_analysis/StatisticalVariation.py @@ -0,0 +1,331 @@ +import os +from typing import Any,Optional,List +import statsmodels.api as sm +import numpy as np +from d3m import container, utils as d3m_utils +from d3m import utils + +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os +from scipy import stats + +import numpy +import typing +import time + +from d3m import container +from d3m.primitive_interfaces import base, transformer + +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base + +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError + +__all__ = ('StatisticalVariationPrimitive',) + +Inputs = container.DataFrame +Outputs = container.DataFrame + +class Params(params.Params): + #to-do : how to make params dynamic + use_column_names: Optional[Any] + + + +class Hyperparams(hyperparams.Hyperparams): + + #Tuning Parameter + #default -1 consider entire time series is considered + window_size = hyperparams.Hyperparameter(default=-1, semantic_types=[ + 'https://metadata.datadrivendiscovery.org/types/TuningParameter', + ], description="Window Size for decomposition") + #control parameter + use_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to operate on. If any specified column cannot be parsed, it is skipped.", + ) + exclude_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='append', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. 
Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', + 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute'], + default='https://metadata.datadrivendiscovery.org/types/Attribute', + description='Decides what semantic type to attach to generated attributes', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + + + +class StatisticalVariationPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): + """ + Primitive to find variation of time series + """ + __author__ = "DATA Lab at Texas A&M University", + metadata = metadata_base.PrimitiveMetadata( + { + 'id': 'ea6e852f-164b-4245-b5e6-02fde55c5491', + 'version': '0.1.0', + 'name': 'Time Series Decompostional', + 'python_path': 'd3m.primitives.tods.feature_analysis.statistical_variation', + 'keywords': ['Time Series','Variation'], + "hyperparams_to_tune": ['window_size'], + 'source': { + 'name': 'DATA Lab at Texas A&M University', + 'uris': ['https://gitlab.com/lhenry15/tods.git','https://gitlab.com/lhenry15/tods/-/blob/devesh/tods/feature_analysis/StatisticalVariation.py'], + 'contact': 'mailto:khlai037@tamu.edu' + + }, + 'installation': [ + {'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/lhenry15/tods.git@{git_commit}#egg=TODS'.format( + git_commit=d3m_utils.current_git_commit(os.path.dirname(__file__)), + ), + } + + ], + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.DATA_PROFILING, + ], + 'primitive_family': metadata_base.PrimitiveFamily.FEATURE_CONSTRUCTION, + + } + ) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + """ + + Args: + inputs: Container DataFrame + timeout: Default + iterations: Default + + Returns: + Container DataFrame containing variation of time series + """ + self.logger.info('Statistical Variation Primitive called') + + # Get cols to fit. 
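+ # _variation computes the per-window coefficient of variation (std / mean)
+ # via scipy.stats.variation on each selected numeric column.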
+ self._fitted = False + self._training_inputs, self._training_indices = self._get_columns_to_fit(inputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + if len(self._training_indices) > 0: + # self._clf.fit(self._training_inputs) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + if not self._fitted: + raise PrimitiveNotFittedError("Primitive not fitted.") + statistical_variation_input = inputs + if self.hyperparams['use_semantic_types']: + statistical_variation_input = inputs.iloc[:, self._training_indices] + output_columns = [] + if len(self._training_indices) > 0: + statistical_variation_output = self._variation(statistical_variation_input,self.hyperparams["window_size"]) + + if sparse.issparse(statistical_variation_output): + statistical_variation_output = statistical_variation_output.toarray() + outputs = self._wrap_predictions(inputs, statistical_variation_output) + + #if len(outputs.columns) == len(self._input_column_names): + # outputs.columns = self._input_column_names + + output_columns = [outputs] + + + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._training_indices, + columns_list=output_columns) + + self.logger.info('Statistical Variation Primitive returned') + + return base.CallResult(outputs) + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + """ + Select columns to fit. + Args: + inputs: Container DataFrame + hyperparams: d3m.metadata.hyperparams.Hyperparams + + Returns: + list + """ + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + use_columns = hyperparams['use_columns'] + exclude_columns = hyperparams['exclude_columns'] + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=use_columns, + exclude_columns=exclude_columns, + can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, + hyperparams: Hyperparams) -> bool: + """ + Output whether a column can be processed. 
+ Args: + inputs_metadata: d3m.metadata.base.DataMetadata + column_index: int + + Returns: + bool + """ + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, numpy.integer, numpy.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + return True + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + """ + Updata metadata for selected columns. + Args: + inputs_metadata: metadata_base.DataMetadata + outputs: Container Dataframe + target_columns_metadata: list + + Returns: + d3m.metadata.base.DataMetadata + """ + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + """ + Wrap predictions into dataframe + Args: + inputs: Container Dataframe + predictions: array-like data (n_samples, n_features) + + Returns: + Dataframe + """ + outputs = d3m_dataframe(predictions, generate_metadata=True) + target_columns_metadata = self._add_target_columns_metadata(outputs.metadata, self.hyperparams) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, target_columns_metadata) + + return outputs + + @classmethod + def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams): + """ + Add target columns metadata + Args: + outputs_metadata: metadata.base.DataMetadata + hyperparams: d3m.metadata.hyperparams.Hyperparams + + Returns: + List[OrderedDict] + """ + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + # column_name = "output_{}".format(column_index) + column_metadata = OrderedDict() + semantic_types = set() + semantic_types.add(hyperparams["return_semantic_type"]) + column_metadata['semantic_types'] = list(semantic_types) + + # column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + def _write(self, inputs: Inputs): + inputs.to_csv(str(time.time()) + '.csv') + + + def _variation(self,X,window_size): + """ statistical variation of time series sequence + Args: + X : DataFrame + Time series. 
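+ window_size : int
+ Rolling window length; -1 (the default) uses the entire series. Each window's
+ value is scipy.stats.variation (std divided by mean), rounded to 4 decimals.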
+ Returns: + DataFrame + A object with variation + """ + if(window_size==-1): + window_size = len(X) + transformed_X = utils.pandas.DataFrame() + for column in X.columns: + column_value = X[column].values + column_variation = np.zeros(len(column_value)) + for iter in range(window_size-1,len(column_value)): + sequence = column_value[iter-window_size+1:iter+1] + column_variation[iter] = stats.variation(sequence).round(4) + column_variation[:window_size-1] = column_variation[window_size-1] + transformed_X[column + "_variation"] = column_variation + + return transformed_X diff --git a/tods/feature_analysis/StatisticalVecSum.py b/tods/feature_analysis/StatisticalVecSum.py new file mode 100644 index 0000000..e773da2 --- /dev/null +++ b/tods/feature_analysis/StatisticalVecSum.py @@ -0,0 +1,330 @@ +import os +from typing import Any,Optional,List +import statsmodels.api as sm +import numpy as np +from d3m import container, utils as d3m_utils +from d3m import utils + +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os + +import numpy +import typing +import time + +from d3m import container +from d3m.primitive_interfaces import base, transformer + +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base + +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError + +__all__ = ('StatisticalVecSumPrimitive',) + +Inputs = container.DataFrame +Outputs = container.DataFrame + +class Params(params.Params): + #to-do : how to make params dynamic + use_column_names: Optional[Any] + + + +class Hyperparams(hyperparams.Hyperparams): + + #Tuning Parameter + #default -1 considers entire time series is considered + window_size = hyperparams.Hyperparameter(default=-1, semantic_types=[ + 'https://metadata.datadrivendiscovery.org/types/TuningParameter', + ], description="Window Size for decomposition") + #control parameter + use_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to operate on. If any specified column cannot be parsed, it is skipped.", + ) + exclude_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='append', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. 
Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', + 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute'], + default='https://metadata.datadrivendiscovery.org/types/Attribute', + description='Decides what semantic type to attach to generated attributes', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + + + +class StatisticalVecSumPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): + """ + Primitive to find vec_sum of time series + """ + __author__ = "DATA Lab at Texas A&M University", + metadata = metadata_base.PrimitiveMetadata( + { + 'id': 'a5ff2fc8-657e-4c4f-8a4e-6949dd37bf9c', + 'version': '0.1.0', + 'name': 'Time Series Decompostional', + 'python_path': 'd3m.primitives.tods.feature_analysis.statistical_vec_sum', + 'keywords': ['Time Series','VecSum'], + "hyperparams_to_tune": ['window_size'], + 'source': { + 'name': 'DATA Lab at Texas A&M University', + 'uris': ['https://gitlab.com/lhenry15/tods.git','https://gitlab.com/lhenry15/tods/-/blob/devesh/tods/feature_analysis/StatisticalVecSum.py'], + 'contact': 'mailto:khlai037@tamu.edu' + + }, + 'installation': [ + {'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/lhenry15/tods.git@{git_commit}#egg=TODS'.format( + git_commit=d3m_utils.current_git_commit(os.path.dirname(__file__)), + ), + } + + ], + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.DATA_PROFILING, + ], + 'primitive_family': metadata_base.PrimitiveFamily.FEATURE_CONSTRUCTION, + + } + ) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + """ + + Args: + inputs: Container DataFrame + timeout: Default + iterations: Default + + Returns: + Container DataFrame containing vec_sum of time series + """ + self.logger.info('Statistical VecSum Primitive called') + + # Get cols to fit. 
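+ # _vec_sum returns one "<column>_vec_sum" feature per selected column, holding
+ # the sum of the window ending at each timestamp.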
+ self._fitted = False + self._training_inputs, self._training_indices = self._get_columns_to_fit(inputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + if len(self._training_indices) > 0: + # self._clf.fit(self._training_inputs) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + if not self._fitted: + raise PrimitiveNotFittedError("Primitive not fitted.") + statistical_vec_sum_input = inputs + if self.hyperparams['use_semantic_types']: + statistical_vec_sum_input = inputs.iloc[:, self._training_indices] + output_columns = [] + if len(self._training_indices) > 0: + statistical_vec_sum_output = self._vec_sum(statistical_vec_sum_input,self.hyperparams["window_size"]) + + if sparse.issparse(statistical_vec_sum_output): + statistical_vec_sum_output = statistical_vec_sum_output.toarray() + outputs = self._wrap_predictions(inputs, statistical_vec_sum_output) + + #if len(outputs.columns) == len(self._input_column_names): + # outputs.columns = self._input_column_names + + output_columns = [outputs] + + + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._training_indices, + columns_list=output_columns) + + self.logger.info('Statistical VecSum Primitive returned') + + return base.CallResult(outputs) + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + """ + Select columns to fit. + Args: + inputs: Container DataFrame + hyperparams: d3m.metadata.hyperparams.Hyperparams + + Returns: + list + """ + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + use_columns = hyperparams['use_columns'] + exclude_columns = hyperparams['exclude_columns'] + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=use_columns, + exclude_columns=exclude_columns, + can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, + hyperparams: Hyperparams) -> bool: + """ + Output whether a column can be processed. 
+ Args: + inputs_metadata: d3m.metadata.base.DataMetadata + column_index: int + + Returns: + bool + """ + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, numpy.integer, numpy.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + return True + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + """ + Updata metadata for selected columns. + Args: + inputs_metadata: metadata_base.DataMetadata + outputs: Container Dataframe + target_columns_metadata: list + + Returns: + d3m.metadata.base.DataMetadata + """ + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + """ + Wrap predictions into dataframe + Args: + inputs: Container Dataframe + predictions: array-like data (n_samples, n_features) + + Returns: + Dataframe + """ + outputs = d3m_dataframe(predictions, generate_metadata=True) + target_columns_metadata = self._add_target_columns_metadata(outputs.metadata, self.hyperparams) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, target_columns_metadata) + + return outputs + + @classmethod + def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams): + """ + Add target columns metadata + Args: + outputs_metadata: metadata.base.DataMetadata + hyperparams: d3m.metadata.hyperparams.Hyperparams + + Returns: + List[OrderedDict] + """ + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + # column_name = "output_{}".format(column_index) + column_metadata = OrderedDict() + semantic_types = set() + semantic_types.add(hyperparams["return_semantic_type"]) + column_metadata['semantic_types'] = list(semantic_types) + + # column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + def _write(self, inputs: Inputs): + inputs.to_csv(str(time.time()) + '.csv') + + + def _vec_sum(self,X,window_size): + """ statistical vec_sum of time series sequence + Args: + X : DataFrame + Time series. 
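+ window_size : int
+ Rolling window length; -1 (the default) sums over the entire series,
+ roughly X[col].rolling(window_size).sum().bfill() per column (a pandas sketch).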
+ Returns: + DataFrame + A object with vec_sum + """ + if(window_size==-1): + window_size = len(X) + transformed_X = utils.pandas.DataFrame() + for column in X.columns: + column_value = X[column].values + column_vec_sum = np.zeros(len(column_value)) + for iter in range(window_size-1,len(column_value)): + sequence = column_value[iter-window_size+1:iter+1] + column_vec_sum[iter] = np.sum(sequence) + column_vec_sum[:window_size-1] = column_vec_sum[window_size-1] + transformed_X[column + "_vec_sum"] = column_vec_sum + + return transformed_X diff --git a/tods/feature_analysis/StatisticalWillisonAmplitude.py b/tods/feature_analysis/StatisticalWillisonAmplitude.py new file mode 100644 index 0000000..ff29dfc --- /dev/null +++ b/tods/feature_analysis/StatisticalWillisonAmplitude.py @@ -0,0 +1,342 @@ +import os +from typing import Any,Optional,List +import statsmodels.api as sm +import numpy as np +from d3m import container, utils as d3m_utils +from d3m import utils + +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os + +import numpy +import typing +import time + +from d3m import container +from d3m.primitive_interfaces import base, transformer + +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base + +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError + +__all__ = ('StatisticalWillisonAmplitudePrimitive',) + +Inputs = container.DataFrame +Outputs = container.DataFrame + +class Params(params.Params): + #to-do : how to make params dynamic + use_column_names: Optional[Any] + + + +class Hyperparams(hyperparams.Hyperparams): + + #Tuning Parameter + #default -1 considers entire time series is considered + window_size = hyperparams.Hyperparameter(default=-1, semantic_types=[ + 'https://metadata.datadrivendiscovery.org/types/TuningParameter', + ], description="Window Size for decomposition") + + threshold = hyperparams.Hyperparameter(default= 0, semantic_types=[ + 'https://metadata.datadrivendiscovery.org/types/TuningParameter', + ], description="threshold for willison amplitude") + #control parameter + use_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to operate on. If any specified column cannot be parsed, it is skipped.", + ) + exclude_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='append', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. 
Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', + 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute'], + default='https://metadata.datadrivendiscovery.org/types/Attribute', + description='Decides what semantic type to attach to generated attributes', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + + + +class StatisticalWillisonAmplitudePrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): + """ + Primitive to find willison amplitude of time series + """ + __author__ = "DATA Lab at Texas A&M University", + metadata = metadata_base.PrimitiveMetadata( + { + 'id': 'f1dee9fb-7e3b-499d-a559-7979fa4a2e1c', + 'version': '0.1.0', + 'name': 'Time Series Decompostional', + 'python_path': 'd3m.primitives.tods.feature_analysis.statistical_willison_amplitude', + 'keywords': ['Time Series','WillisonAmplitude'], + "hyperparams_to_tune": ['window_size'], + 'source': { + 'name': 'DATA Lab at Texas A&M University', + 'uris': ['https://gitlab.com/lhenry15/tods.git','https://gitlab.com/lhenry15/tods/-/blob/devesh/tods/feature_analysis/StatisticalWillisonAmplitude.py'], + 'contact': 'mailto:khlai037@tamu.edu' + + }, + 'installation': [ + {'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/lhenry15/tods.git@{git_commit}#egg=TODS'.format( + git_commit=d3m_utils.current_git_commit(os.path.dirname(__file__)), + ), + } + + ], + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.DATA_PROFILING, + ], + 'primitive_family': metadata_base.PrimitiveFamily.FEATURE_CONSTRUCTION, + + } + ) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + """ + + Args: + inputs: Container DataFrame + timeout: Default + iterations: Default + + Returns: + Container DataFrame containing willison amplitude of time series + """ + self.logger.info('Statistical Willison Amplitude Primitive called') + + # Get cols to fit. 
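+ # Willison amplitude counts, within each window, how many consecutive-sample
+ # differences |X(t) - X(t-1)| exceed the 'threshold' hyperparameter.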
+ self._fitted = False + self._training_inputs, self._training_indices = self._get_columns_to_fit(inputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + if len(self._training_indices) > 0: + # self._clf.fit(self._training_inputs) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + if not self._fitted: + raise PrimitiveNotFittedError("Primitive not fitted.") + statistical_willison_amplitude_input = inputs + if self.hyperparams['use_semantic_types']: + statistical_willison_amplitude_input = inputs.iloc[:, self._training_indices] + output_columns = [] + if len(self._training_indices) > 0: + statistical_willison_amplitude_output = self._willison_amplitude(statistical_willison_amplitude_input, + self.hyperparams["window_size"],self.hyperparams['threshold']) + + if sparse.issparse(statistical_willison_amplitude_output): + statistical_willison_amplitude_output = statistical_willison_amplitude_output.toarray() + outputs = self._wrap_predictions(inputs, statistical_willison_amplitude_output) + + #if len(outputs.columns) == len(self._input_column_names): + # outputs.columns = self._input_column_names + + output_columns = [outputs] + + + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._training_indices, + columns_list=output_columns) + + self.logger.info('Statistical Willison Amplitude Primitive returned') + + return base.CallResult(outputs) + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + """ + Select columns to fit. + Args: + inputs: Container DataFrame + hyperparams: d3m.metadata.hyperparams.Hyperparams + + Returns: + list + """ + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + use_columns = hyperparams['use_columns'] + exclude_columns = hyperparams['exclude_columns'] + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=use_columns, + exclude_columns=exclude_columns, + can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, + hyperparams: Hyperparams) -> bool: + """ + Output whether a column can be processed. 
+ Args: + inputs_metadata: d3m.metadata.base.DataMetadata + column_index: int + + Returns: + bool + """ + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, numpy.integer, numpy.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + return True + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + """ + Updata metadata for selected columns. + Args: + inputs_metadata: metadata_base.DataMetadata + outputs: Container Dataframe + target_columns_metadata: list + + Returns: + d3m.metadata.base.DataMetadata + """ + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + """ + Wrap predictions into dataframe + Args: + inputs: Container Dataframe + predictions: array-like data (n_samples, n_features) + + Returns: + Dataframe + """ + outputs = d3m_dataframe(predictions, generate_metadata=True) + target_columns_metadata = self._add_target_columns_metadata(outputs.metadata, self.hyperparams) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, target_columns_metadata) + + return outputs + + @classmethod + def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams): + """ + Add target columns metadata + Args: + outputs_metadata: metadata.base.DataMetadata + hyperparams: d3m.metadata.hyperparams.Hyperparams + + Returns: + List[OrderedDict] + """ + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + # column_name = "output_{}".format(column_index) + column_metadata = OrderedDict() + semantic_types = set() + semantic_types.add(hyperparams["return_semantic_type"]) + column_metadata['semantic_types'] = list(semantic_types) + + # column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + def _write(self, inputs: Inputs): + inputs.to_csv(str(time.time()) + '.csv') + + + def _willison_amplitude(self,X,window_size,threshold): + """ statistical willison_amplitude of time series sequence + Args: + X : DataFrame + Time series. 
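+ window_size : int
+ Rolling window length; -1 (the default) uses the whole series.
+ threshold : float
+ Minimum absolute first difference that counts towards the amplitude.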
+ Returns: + DataFrame + A object with willison_amplitude + """ + if(window_size==-1): + window_size = len(X) + transformed_X = utils.pandas.DataFrame() + for column in X.columns: + column_diff_value = np.zeros(len(X[column])+1) # array of 1 extra row use to adjust X(t)-X(t-1) + column_diff_value[1:] = (X[column].values) + column_diff_value = np.abs(np.diff(column_diff_value)) + willison_func_values = np.zeros(len(X[column])) + mask = (column_diff_value>threshold) + willison_func_values[mask] = 1 + + column_willison_amplitude = np.zeros(len(X[column].values)) + + for iter in range(window_size-1,len(column_diff_value)): + sequence = willison_func_values[iter-window_size+1:iter+1] + column_willison_amplitude[iter] = np.sum(sequence) + column_willison_amplitude[:window_size-1] = column_willison_amplitude[window_size-1] + transformed_X[column + "_willison_amplitude"] = column_willison_amplitude + + return transformed_X diff --git a/tods/feature_analysis/StatisticalZeroCrossing.py b/tods/feature_analysis/StatisticalZeroCrossing.py new file mode 100644 index 0000000..a2f53c0 --- /dev/null +++ b/tods/feature_analysis/StatisticalZeroCrossing.py @@ -0,0 +1,322 @@ +import os +from typing import Any,Optional,List +import statsmodels.api as sm +import numpy as np +from d3m import container, utils as d3m_utils +from d3m import utils + +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os + +import numpy +import typing +import time + +from d3m import container +from d3m.primitive_interfaces import base, transformer + +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base + +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError + +__all__ = ('StatisticalZeroCrossingPrimitive',) + +Inputs = container.DataFrame +Outputs = container.DataFrame + +class Params(params.Params): + #to-do : how to make params dynamic + use_column_names: Optional[Any] + + + +class Hyperparams(hyperparams.Hyperparams): + + + #control parameter + use_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to operate on. If any specified column cannot be parsed, it is skipped.", + ) + exclude_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='append', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. 
Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', + 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute'], + default='https://metadata.datadrivendiscovery.org/types/Attribute', + description='Decides what semantic type to attach to generated attributes', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + + + +class StatisticalZeroCrossingPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): + """ + Primitive to find zero_crossing of time series. A column indicating zero crossing on ith row . 1 indicates crossing 0 is for normal + """ + __author__ = "DATA Lab at Texas A&M University", + metadata = metadata_base.PrimitiveMetadata( + { + 'id': '1064c78f-37e2-45a1-94a3-401a6726c220', + 'version': '0.1.0', + 'name': 'Time Series Decompostional', + 'python_path': 'd3m.primitives.tods.feature_analysis.statistical_zero_crossing', + 'keywords': ['Time Series','ZeroCrossing'], + 'source': { + 'name': 'DATA Lab at Texas A&M University', + 'uris': ['https://gitlab.com/lhenry15/tods.git','https://gitlab.com/lhenry15/tods/-/blob/devesh/tods/feature_analysis/StatisticalZeroCrossing.py'], + 'contact': 'mailto:khlai037@tamu.edu' + + }, + 'installation': [ + {'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://gitlab.com/lhenry15/tods.git@{git_commit}#egg=TODS'.format( + git_commit=d3m_utils.current_git_commit(os.path.dirname(__file__)), + ), + } + + ], + 'algorithm_types': [ + metadata_base.PrimitiveAlgorithmType.DATA_PROFILING, + ], + 'primitive_family': metadata_base.PrimitiveFamily.FEATURE_CONSTRUCTION, + + } + ) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + """ + + Args: + inputs: Container DataFrame + timeout: Default + iterations: Default + + Returns: + Container DataFrame containing zero crossing indicator of time series + """ + self.logger.info('Statistical ZeroCrossing Primitive called') + + # Get cols to fit. 
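+ # _zero_crossing flags index i with 1 whenever X[i] and X[i-1] have opposite
+ # signs; no window is involved, so this primitive has no window_size hyperparameter.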
+ self._fitted = False + self._training_inputs, self._training_indices = self._get_columns_to_fit(inputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + if len(self._training_indices) > 0: + # self._clf.fit(self._training_inputs) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + if not self._fitted: + raise PrimitiveNotFittedError("Primitive not fitted.") + statistical_zero_crossing_input = inputs + if self.hyperparams['use_semantic_types']: + statistical_zero_crossing_input = inputs.iloc[:, self._training_indices] + output_columns = [] + if len(self._training_indices) > 0: + statistical_zero_crossing_output = self._zero_crossing(statistical_zero_crossing_input) + + if sparse.issparse(statistical_zero_crossing_output): + statistical_zero_crossing_output = statistical_zero_crossing_output.toarray() + outputs = self._wrap_predictions(inputs, statistical_zero_crossing_output) + + #if len(outputs.columns) == len(self._input_column_names): + # outputs.columns = self._input_column_names + + output_columns = [outputs] + + + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._training_indices, + columns_list=output_columns) + + self.logger.info('Statistical ZeroCrossing Primitive returned') + + return base.CallResult(outputs) + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + """ + Select columns to fit. + Args: + inputs: Container DataFrame + hyperparams: d3m.metadata.hyperparams.Hyperparams + + Returns: + list + """ + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + use_columns = hyperparams['use_columns'] + exclude_columns = hyperparams['exclude_columns'] + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=use_columns, + exclude_columns=exclude_columns, + can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, + hyperparams: Hyperparams) -> bool: + """ + Output whether a column can be processed. 
+ Args: + inputs_metadata: d3m.metadata.base.DataMetadata + column_index: int + + Returns: + bool + """ + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, numpy.integer, numpy.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + return True + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + """ + Updata metadata for selected columns. + Args: + inputs_metadata: metadata_base.DataMetadata + outputs: Container Dataframe + target_columns_metadata: list + + Returns: + d3m.metadata.base.DataMetadata + """ + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + """ + Wrap predictions into dataframe + Args: + inputs: Container Dataframe + predictions: array-like data (n_samples, n_features) + + Returns: + Dataframe + """ + outputs = d3m_dataframe(predictions, generate_metadata=True) + target_columns_metadata = self._add_target_columns_metadata(outputs.metadata, self.hyperparams) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, target_columns_metadata) + + return outputs + + @classmethod + def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams): + """ + Add target columns metadata + Args: + outputs_metadata: metadata.base.DataMetadata + hyperparams: d3m.metadata.hyperparams.Hyperparams + + Returns: + List[OrderedDict] + """ + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + # column_name = "output_{}".format(column_index) + column_metadata = OrderedDict() + semantic_types = set() + semantic_types.add(hyperparams["return_semantic_type"]) + column_metadata['semantic_types'] = list(semantic_types) + + # column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + def _write(self, inputs: Inputs): + inputs.to_csv(str(time.time()) + '.csv') + + + def _zero_crossing(self,X): + """ statistical zero crossing indicatoe of time series sequence + Args: + X : DataFrame + Time series. 
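+ A sample i is marked 1 when X[i] * X[i-1] < 0 (a sign change between
+ consecutive values); the first sample is always 0.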
+ Returns: + DataFrame + A object with zero crossing + """ + transformed_X = utils.pandas.DataFrame() + for column in X.columns: + column_value = X[column].values + column_zero_crossing = np.zeros(len(column_value)) + for iter in range(1,len(column_value)): + if(column_value[iter]*column_value[iter-1] < 0.0 ): + column_zero_crossing[iter] = 1 + transformed_X[column + "_zero_crossing"] = column_zero_crossing + + return transformed_X diff --git a/tods/feature_analysis/TRMF.py b/tods/feature_analysis/TRMF.py new file mode 100644 index 0000000..3b21fa1 --- /dev/null +++ b/tods/feature_analysis/TRMF.py @@ -0,0 +1,746 @@ +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os +import sklearn +import numpy as np +import typing +import time + +# Custom import commands if any +from sklearn.decomposition.truncated_svd import TruncatedSVD + + +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult, DockerContainer +from d3m.primitive_interfaces import base, transformer +# from d3m.primitive_interfaces.unsupervised_learning import UnsupervisedLearnerPrimitiveBase + + +Inputs = d3m_dataframe +Outputs = d3m_dataframe + +__all__ = ('TRMF',) + +# class Params(params.Params): +# components_: Optional[ndarray] +# explained_variance_ratio_: Optional[ndarray] +# explained_variance_: Optional[ndarray] +# singular_values_: Optional[ndarray] +# input_column_names: Optional[Any] +# target_names_: Optional[Sequence[Any]] +# training_indices_: Optional[Sequence[int]] +# target_column_indices_: Optional[Sequence[int]] +# target_columns_metadata_: Optional[List[OrderedDict]] + + +class Hyperparams(hyperparams.Hyperparams): + # Tuning + lags = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(1,), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Set of lag indices to use in model.", + ) + K = hyperparams.UniformInt( + lower=0, + upper=100000000, + default=2, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + description="Length of latent embedding dimension.", + ) + lambda_f = hyperparams.Uniform( + lower=0, + upper=100000000, + default=1.0, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + description="Regularization parameter used for matrix F.", + ) + lambda_x = hyperparams.Uniform( + lower=0, + upper=100000000, + default=1.0, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + description="Regularization parameter used for matrix X.", + ) + lambda_w = hyperparams.Uniform( + lower=0, + upper=100000000, + default=1.0, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + description="Regularization parameter used for matrix W.", + ) + alpha = hyperparams.Uniform( + lower=0, + upper=100000000, + default=1000.0, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + description="Regularization parameter used for make the sum of lag coefficient close to 1. 
That helps to avoid big deviations when forecasting.", + ) + eta = hyperparams.Uniform( + lower=0, + upper=100000000, + default=1.0, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + description="Regularization parameter used for X when undercovering autoregressive dependencies.", + ) + max_iter = hyperparams.UniformInt( + lower=0, + upper=100000000, + default=1000, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + description="Number of iterations of updating matrices F, X and W.", + ) + F_step = hyperparams.Uniform( + lower=0, + upper=100000000, + default=0.0001, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + description="Step of gradient descent when updating matrix F.", + ) + X_step = hyperparams.Uniform( + lower=0, + upper=100000000, + default=0.0001, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + description="Step of gradient descent when updating matrix X.", + ) + W_step = hyperparams.Uniform( + lower=0, + upper=100000000, + default=0.0001, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + description="Step of gradient descent when updating matrix W.", + ) + + # Control + use_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to operate on. If any specified column cannot be parsed, it is skipped.", + ) + exclude_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='append', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. 
To prevent pipelines from breaking set this to False.", + ) + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute'], + default='https://metadata.datadrivendiscovery.org/types/Attribute', + description='Decides what semantic type to attach to generated attributes', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + +class TRMF(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): + """Temporal Regularized Matrix Factorization. + + Parameters + ---------- + + lags : array-like, shape (n_lags,) + Set of lag indices to use in model. + + K : int + Length of latent embedding dimension + + lambda_f : float + Regularization parameter used for matrix F. + + lambda_x : float + Regularization parameter used for matrix X. + + lambda_w : float + Regularization parameter used for matrix W. + + alpha : float + Regularization parameter used for make the sum of lag coefficient close to 1. + That helps to avoid big deviations when forecasting. + + eta : float + Regularization parameter used for X when undercovering autoregressive dependencies. + + max_iter : int + Number of iterations of updating matrices F, X and W. + + F_step : float + Step of gradient descent when updating matrix F. + + X_step : float + Step of gradient descent when updating matrix X. + + W_step : float + Step of gradient descent when updating matrix W. + + + Attributes + ---------- + + F : ndarray, shape (n_timeseries, K) + Latent embedding of timeseries. + + X : ndarray, shape (K, n_timepoints) + Latent embedding of timepoints. + + W : ndarray, shape (K, n_lags) + Matrix of autoregressive coefficients. + + Reference + ---------- + "https://github.com/SemenovAlex/trmf" + + Yu, H. F., Rao, N., & Dhillon, I. S. (2016). Temporal regularized matrix factorization for high-dimensional time series prediction. + In Advances in neural information processing systems (pp. 847-855). + Which can be found there: http://www.cs.utexas.edu/~rofuyu/papers/tr-mf-nips.pdf + """ + + __author__: "DATA Lab at Texas A&M University" + metadata = metadata_base.PrimitiveMetadata({ + "name": "Temporal Regularized Matrix Factorization Primitive", + "python_path": "d3m.primitives.tods.feature_analysis.trmf", + "source": {'name': 'DATA Lab at Texas A&M University', 'contact': 'mailto:khlai037@tamu.edu', + 'uris': ['https://gitlab.com/lhenry15/tods.git', 'https://gitlab.com/lhenry15/tods/-/blob/Junjie/anomaly-primitives/anomaly_primitives/TRMF.py']}, + "algorithm_types": [metadata_base.PrimitiveAlgorithmType.TEMPORAL_REGULARIZED_MATRIX_FACTORIZATION, ], + "primitive_family": metadata_base.PrimitiveFamily.FEATURE_CONSTRUCTION, + "id": "d6be6941-61d0-4cbd-85ef-a10c86aa40b1", + "hyperparams_to_tune": ['lags', 'K', 'lambda_f', 'lambda_x', 'lambda_w', 'alpha', 'eta', 'max_iter', 'F_step', 'X_step', 'W_step'], + "version": "0.0.1", + }) + + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + """ + Process the testing data. + Args: + inputs: Container DataFrame. + + Returns: + Container DataFrame after Truncated SVD. 
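+            (Concretely, the output columns hold the latent matrix returned by
+            trmf.get_X(), i.e. one row per time point and one column per latent
+            dimension K, combined with the input according to "return_result".)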
+ """ + self._clf = trmf( + lags=self.hyperparams['lags'], + K=self.hyperparams['K'], + lambda_f=self.hyperparams['lambda_f'], + lambda_x=self.hyperparams['lambda_x'], + lambda_w=self.hyperparams['lambda_w'], + alpha=self.hyperparams['alpha'], + eta=self.hyperparams['eta'], + max_iter=self.hyperparams['max_iter'], + F_step=self.hyperparams['F_step'], + X_step=self.hyperparams['X_step'], + W_step=self.hyperparams['W_step'], + ) + + + tmp = inputs.copy() + for col in inputs.columns: + tmp[col] = inputs[col]/inputs[col].max() + + self._inputs = tmp + self._fitted = False + + + # Get cols to fit. + self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + + if len(self._training_indices) > 0: + self._clf.fit(self._training_inputs) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + + + if not self._fitted: + raise PrimitiveNotFittedError("Primitive not fitted.") + + sk_inputs = inputs + if self.hyperparams['use_semantic_types']: + sk_inputs = inputs.iloc[:, self._training_indices] + output_columns = [] + if len(self._training_indices) > 0: + + sk_output = self._clf.get_X() + + + if sparse.issparse(sk_output): + sk_output = sk_output.toarray() + outputs = self._wrap_predictions(inputs, sk_output) + if len(outputs.columns) == len(self._input_column_names): + outputs.columns = self._input_column_names + output_columns = [outputs] + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._training_indices, + columns_list=output_columns) + + # self._write(outputs) + return CallResult(outputs) + + + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + """ + Select columns to fit. + Args: + inputs: Container DataFrame + hyperparams: d3m.metadata.hyperparams.Hyperparams + + Returns: + list + """ + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=hyperparams['use_columns'], + exclude_columns=hyperparams['exclude_columns'], + can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: + """ + Output whether a column can be processed. 
+ Args: + inputs_metadata: d3m.metadata.base.DataMetadata + column_index: int + + Returns: + bool + """ + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, np.integer, np.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + + @classmethod + def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]: + """ + Output metadata of selected columns. + Args: + outputs_metadata: metadata_base.DataMetadata + hyperparams: d3m.metadata.hyperparams.Hyperparams + + Returns: + d3m.metadata.base.DataMetadata + """ + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) + + # Update semantic types and prepare it for predicted targets. + semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set([]) + add_semantic_types = [] + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + """ + Updata metadata for selected columns. 
+ Args: + inputs_metadata: metadata_base.DataMetadata + outputs: Container Dataframe + target_columns_metadata: list + + Returns: + d3m.metadata.base.DataMetadata + """ + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + """ + Wrap predictions into dataframe + Args: + inputs: Container Dataframe + predictions: array-like data (n_samples, n_features) + + Returns: + Dataframe + """ + outputs = d3m_dataframe(predictions, generate_metadata=True) + target_columns_metadata = self._add_target_columns_metadata(outputs.metadata, self.hyperparams) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, target_columns_metadata) + return outputs + + + @classmethod + def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams): + """ + Add target columns metadata + Args: + outputs_metadata: metadata.base.DataMetadata + hyperparams: d3m.metadata.hyperparams.Hyperparams + + Returns: + List[OrderedDict] + """ + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_name = "output_{}".format(column_index) + column_metadata = OrderedDict() + semantic_types = set() + semantic_types.add(hyperparams["return_semantic_type"]) + column_metadata['semantic_types'] = list(semantic_types) + + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + def _write(self, inputs:Inputs): + """ + write inputs to current directory, only for test + """ + inputs.to_csv(str(time.time())+'.csv') + + +""" +Temporal Regularized Matrix Factorization +""" +class trmf: + + # Added by JJ + def get_X(self): + return self.X.T + + + # Original + def __init__(self, lags, K, lambda_f, lambda_x, lambda_w, alpha, eta, max_iter=1000, + F_step=0.0001, X_step=0.0001, W_step=0.0001): + self.lags = lags + self.L = len(lags) + self.K = K + self.lambda_f = lambda_f + self.lambda_x = lambda_x + self.lambda_w = lambda_w + self.alpha = alpha + self.eta = eta + self.max_iter = max_iter + self.F_step = F_step + self.X_step = X_step + self.W_step = W_step + + self.W = None + self.F = None + self.X = None + + + def fit(self, train, resume=False): + """Fit the TRMF model according to the given training data. + + Model fits through sequential updating three matrices: + - matrix self.F; + - matrix self.X; + - matrix self.W. + + Each matrix updated with gradient descent. + + Parameters + ---------- + train : ndarray, shape (n_timeseries, n_timepoints) + Training data. + + resume : bool + Used to continue fitting. + + Returns + ------- + self : object + Returns self. + """ + + if not resume: + self.Y = train.T + mask = np.array((~np.isnan(self.Y)).astype(int)) + self.mask = mask + self.Y[self.mask == 0] = 0. 
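+            # Missing entries (NaNs) are recorded in self.mask and zero-filled above;
+            # the gradients of F and X multiply the residual by this mask, so missing
+            # values do not contribute to the reconstruction error.
+            # F, X and W are initialised randomly below; W is divided by the number of
+            # lags to keep the initial autoregressive coefficients small.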
+ self.N, self.T = self.Y.shape + self.W = np.random.randn(self.K, self.L) / self.L + self.F = np.random.randn(self.N, self.K) + self.X = np.random.randn(self.K, self.T) + + for _ in range(self.max_iter): + self._update_F(step=self.F_step) + self._update_X(step=self.X_step) + self._update_W(step=self.W_step) + + + def predict(self, h): + """Predict each of timeseries h timepoints ahead. + + Model evaluates matrix X with the help of matrix W, + then it evaluates prediction by multiplying it by F. + + Parameters + ---------- + h : int + Number of timepoints to forecast. + + Returns + ------- + preds : ndarray, shape (n_timeseries, T) + Predictions. + """ + + X_preds = self._predict_X(h) + return np.dot(self.F, X_preds) + + + def _predict_X(self, h): + """Predict X h timepoints ahead. + + Evaluates matrix X with the help of matrix W. + + Parameters + ---------- + h : int + Number of timepoints to forecast. + + Returns + ------- + X_preds : ndarray, shape (self.K, h) + Predictions of timepoints latent embeddings. + """ + + X_preds = np.zeros((self.K, h)) + X_adjusted = np.hstack([self.X, X_preds]) + for t in range(self.T, self.T + h): + for l in range(self.L): + lag = self.lags[l] + X_adjusted[:, t] += X_adjusted[:, t - lag] * self.W[:, l] + return X_adjusted[:, self.T:] + + def impute_missings(self): + """Impute each missing element in timeseries. + + Model uses matrix X and F to get all missing elements. + + Parameters + ---------- + + Returns + ------- + data : ndarray, shape (n_timeseries, T) + Predictions. + """ + data = self.Y + data[self.mask == 0] = np.dot(self.F, self.X)[self.mask == 0] + return data + + + def _update_F(self, step, n_iter=1): + """Gradient descent of matrix F. + + n_iter steps of gradient descent of matrix F. + + Parameters + ---------- + step : float + Step of gradient descent when updating matrix. + + n_iter : int + Number of gradient steps to be made. + + Returns + ------- + self : objects + Returns self. + """ + + for _ in range(n_iter): + self.F -= step * self._grad_F() + + + def _update_X(self, step, n_iter=1): + """Gradient descent of matrix X. + + n_iter steps of gradient descent of matrix X. + + Parameters + ---------- + step : float + Step of gradient descent when updating matrix. + + n_iter : int + Number of gradient steps to be made. + + Returns + ------- + self : objects + Returns self. + """ + + for _ in range(n_iter): + self.X -= step * self._grad_X() + + + def _update_W(self, step, n_iter=1): + """Gradient descent of matrix W. + + n_iter steps of gradient descent of matrix W. + + Parameters + ---------- + step : float + Step of gradient descent when updating matrix. + + n_iter : int + Number of gradient steps to be made. + + Returns + ------- + self : objects + Returns self. + """ + + for _ in range(n_iter): + self.W -= step * self._grad_W() + + + def _grad_F(self): + """Gradient of matrix F. + + Evaluating gradient of matrix F. + + Parameters + ---------- + + Returns + ------- + self : objects + Returns self. + """ + + return - 2 * np.dot((self.Y - np.dot(self.F, self.X)) * self.mask, self.X.T) + 2 * self.lambda_f * self.F + + + def _grad_X(self): + """Gradient of matrix X. + + Evaluating gradient of matrix X. + + Parameters + ---------- + + Returns + ------- + self : objects + Returns self. + """ + + for l in range(self.L): + lag = self.lags[l] + W_l = self.W[:, l].repeat(self.T, axis=0).reshape(self.K, self.T) + X_l = self.X * W_l + z_1 = self.X - np.roll(X_l, lag, axis=1) + z_1[:, :max(self.lags)] = 0. 
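+            # z_1 penalises the gap between X_t and its lag-l weighted copy; the first
+            # max(lags) time points have no complete history, so they are zeroed out.
+            # z_2 (next) propagates the same autoregressive penalty backwards in time,
+            # with the last `lag` columns zeroed because they have no future counterpart.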
+ z_2 = - (np.roll(self.X, -lag, axis=1) - X_l) * W_l + z_2[:, -lag:] = 0. + + grad_T_x = z_1 + z_2 + return - 2 * np.dot(self.F.T, self.mask * (self.Y - np.dot(self.F, self.X))) + self.lambda_x * grad_T_x + self.eta * self.X + + + def _grad_W(self): + """Gradient of matrix W. + + Evaluating gradient of matrix W. + + Parameters + ---------- + + Returns + ------- + self : objects + Returns self. + """ + + grad = np.zeros((self.K, self.L)) + for l in range(self.L): + lag = self.lags[l] + W_l = self.W[:, l].repeat(self.T, axis=0).reshape(self.K, self.T) + X_l = self.X * W_l + z_1 = self.X - np.roll(X_l, lag, axis=1) + z_1[:, :max(self.lags)] = 0. + z_2 = - (z_1 * np.roll(self.X, lag, axis=1)).sum(axis=1) + grad[:, l] = z_2 + return grad + self.W * 2 * self.lambda_w / self.lambda_x -\ + self.alpha * 2 * (1 - self.W.sum(axis=1)).repeat(self.L).reshape(self.W.shape) diff --git a/tods/feature_analysis/WaveletTransform.py b/tods/feature_analysis/WaveletTransform.py new file mode 100644 index 0000000..31497fd --- /dev/null +++ b/tods/feature_analysis/WaveletTransform.py @@ -0,0 +1,557 @@ +import os +import typing +from numpy import ndarray +import numpy as np + +from d3m import container, utils as d3m_utils +from d3m.base import utils as base_utils +from d3m.metadata import base as metadata_base, hyperparams +from d3m.primitive_interfaces import base, transformer +from typing import Union + +import pywt +import pandas +import math + +import common_primitives +import numpy +from typing import Optional, List +from collections import OrderedDict +from scipy import sparse +import logging +import uuid + +__all__ = ('WaveletTransformer',) + +Inputs = container.DataFrame +Outputs = container.DataFrame + + +class Hyperparams(hyperparams.Hyperparams): + wavelet = hyperparams.Enumeration( + values=['bior1.1', 'bior1.3', 'bior1.5', 'bior2.2', 'bior2.4', 'bior2.6', 'bior2.8', + 'bior3.1', 'bior3.3', 'bior3.5', 'bior3.7', 'bior3.9', 'bior4.4', 'bior5.5', + 'bior6.8', 'cgau1', 'cgau2', 'cgau3', 'cgau4', 'cgau5', 'cgau6', 'cgau7', 'cgau8', + 'cmor', 'coif1', 'coif2', 'coif3', 'coif4', 'coif5', 'coif6', 'coif7', 'coif8', + 'coif9', 'coif10', 'coif11', 'coif12', 'coif13', 'coif14', 'coif15', 'coif16', + 'coif17', 'db1', 'db2', 'db3', 'db4', 'db5', 'db6', 'db7', 'db8', 'db9', 'db10', + 'db11', 'db12', 'db13', 'db14', 'db15', 'db16', 'db17', 'db18', 'db19', 'db20', + 'db21', 'db22', 'db23', 'db24', 'db25', 'db26', 'db27', 'db28', 'db29', 'db30', + 'db31', 'db32', 'db33', 'db34', 'db35', 'db36', 'db37', 'db38', 'dmey', 'fbsp', + 'gaus1', 'gaus2', 'gaus3', 'gaus4', 'gaus5', 'gaus6', 'gaus7', 'gaus8', 'haar', + 'mexh', 'morl', 'rbio1.1', 'rbio1.3', 'rbio1.5', 'rbio2.2', 'rbio2.4', 'rbio2.6', + 'rbio2.8', 'rbio3.1', 'rbio3.3', 'rbio3.5', 'rbio3.7', 'rbio3.9', 'rbio4.4', + 'rbio5.5', 'rbio6.8', 'shan', 'sym2', 'sym3', 'sym4', 'sym5', 'sym6', 'sym7', + 'sym8', 'sym9', 'sym10', 'sym11', 'sym12', 'sym13', 'sym14', 'sym15', 'sym16', + 'sym17', 'sym18', 'sym19', 'sym20'], + + default='db8', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + description="Wavelet to use.", + ) + mode = hyperparams.Enumeration( + values=['zero', 'constant', 'symmetric', 'periodic', 'smooth', 'periodization', 'reflect', + 'antisymmetric', 'antireflect'], + + default='symmetric', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + description="Signal extension mode.", + ) + axis = hyperparams.UniformInt( + lower=0, + upper=2, + default=0, + 
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + description="Axis over which to compute the DWT. If 0, independently DWT each feature, otherwise (if 1) DWT each sample.", + ) + + level = hyperparams.Union[Union[int, None]]( + configuration=OrderedDict( + init=hyperparams.Hyperparameter[int]( + default=0, + ), + ninit=hyperparams.Hyperparameter[None]( + default=None, + ), + ), + default='ninit', + description="Decomposition level (must be >= 0). If level is None (default) then it will be calculated using the dwt_max_level function.", + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + + # level = hyperparams.Hyperparameter[None]( + # default=None, + # semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + # description="Decomposition level (must be >= 0). If level is None (default) then it will be calculated using the dwt_max_level function.", + # ) + + inverse = hyperparams.UniformInt( + lower=0, + upper=2, + default=0, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Inverse wavelet transformation if inverse=1.", + ) + id = hyperparams.Hyperparameter[str]( + default='0000', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="identification number.", + ) + + + + # Keep previous + dataframe_resource = hyperparams.Hyperparameter[typing.Union[str, None]]( + default=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Resource ID of a DataFrame to extract if there are multiple tabular resources inside a Dataset and none is a dataset entry point.", + ) + use_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(2,), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to operate on. If any specified column cannot be parsed, it is skipped.", + ) + exclude_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(0,1,3,), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='new', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. 
Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', + 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute'], + default='https://metadata.datadrivendiscovery.org/types/Attribute', + description='Decides what semantic type to attach to generated attributes', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + + +class WaveletTransformer(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): + """ + A primitive of Multilevel 1D Discrete Wavelet Transform of data. + See `PyWavelet documentation `_ for details. + Parameters + ---------- + wavelet: str + Wavelet to use + + mode: str + Signal extension mode, see https://pywavelets.readthedocs.io/en/latest/ref/signal-extension-modes.html#ref-modes for details. + + axis: int + Axis over which to compute the DWT. If not given, transforming along columns. + + window_size : int + The moving window size. + + level: int + Decomposition level (must be > 0). If level is 0 (default) then it will be calculated using the maximum level. + + Attributes + ---------- + None + """ + + __author__ = "DATALAB @Taxes A&M University" + metadata = metadata_base.PrimitiveMetadata( + { + "name": "Wavelet_transformation", + "python_path": "d3m.primitives.tods.feature_analysis.wavelet_transform", + "source": {'name': "DATALAB @Taxes A&M University", 'contact': 'mailto:khlai037@tamu.edu', + 'uris': ['https://gitlab.com/lhenry15/tods.git']}, + "algorithm_types": [metadata_base.PrimitiveAlgorithmType.FREQUENCY_TRANSFORM, ], + "primitive_family": metadata_base.PrimitiveFamily.FEATURE_EXTRACTION, + "version": "0.0.1", + "hyperparams_to_tune": ['wavelet', 'mode', 'axis', 'level'], + "id": str(uuid.uuid3(uuid.NAMESPACE_DNS, 'WaveletTransformer')), + }, + ) + + def __init__(self, *, hyperparams: Hyperparams) -> None: + super().__init__(hyperparams=hyperparams) # , random_seed=random_seed, docker_containers=docker_containers) + + # False + self._clf = Wavelet(wavelet=self.hyperparams['wavelet'], + mode=self.hyperparams['mode'], + axis=self.hyperparams['axis'], + level=self.hyperparams['level'], + # id=self.hyperparams['id'], + ) + + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + """ + Process the testing data. + Args: + inputs: Container DataFrame. Time series data up to Wavelet transform. + + Returns: + [cA_n, cD_n, cD_n-1, …, cD2, cD1]: Container DataFrame after Wavelet Transformation. + Ordered frame of coefficients arrays where n denotes the level of decomposition. The first element (cA_n) of the result is approximation coefficients array and the following elements (cD_n - cD_1) are details coefficients arrays. 
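+            For illustration (hypothetical settings, not the primitive's defaults): with
+            wavelet='haar' and level=1, the underlying call
+            pywt.wavedec([1., 2., 3., 4.], 'haar', level=1) returns approximation
+            coefficients of roughly [2.12, 4.95] and detail coefficients of roughly
+            [-0.71, -0.71], and each coefficient array becomes one column of the
+            returned frame.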
+ """ + assert isinstance(inputs, container.DataFrame), type(container.DataFrame) + + _, self._columns_to_produce = self._get_columns_to_fit(inputs, self.hyperparams) + self._input_column_names = inputs.columns + + # print('columns_to_produce=', self._columns_to_produce) + + + sk_inputs = inputs + if self.hyperparams['use_semantic_types']: + sk_inputs = inputs.iloc[:, self._columns_to_produce] + output_columns = [] + if len(self._columns_to_produce) > 0: + sk_output = self._clf.produce(sk_inputs, self.hyperparams['inverse']) + if sparse.issparse(sk_output): + sk_output = sk_output.toarray() + outputs = self._wrap_predictions(inputs, sk_output) + if len(outputs.columns) == len(self._input_column_names): + outputs.columns = self._input_column_names + output_columns = [outputs] + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._columns_to_produce, + columns_list=output_columns) + + # print(inputs) + # print(outputs) + # if self.hyperparams['inverse'] == 1: + # print(outputs) + # print(outputs.metadata.to_internal_simple_structure()) + + # outputs = inputs + return base.CallResult(outputs) + + # return base.CallResult(dataframe) + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + """ + Select columns to fit. + Args: + inputs: Container DataFrame + hyperparams: d3m.metadata.hyperparams.Hyperparams + + Returns: + list + """ + # print('======_get_columns_to_fit======') + + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=hyperparams[ + 'use_columns'], + exclude_columns=hyperparams[ + 'exclude_columns'], + can_use_column=can_produce_column) + + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, + hyperparams: Hyperparams) -> bool: + """ + Output whether a column can be processed. 
+ Args: + inputs_metadata: d3m.metadata.base.DataMetadata + column_index: int + + Returns: + bool + """ + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, numpy.integer, numpy.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + + # print(column_metadata) + # print(column_metadata['structural_type'], accepted_structural_types) + + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + + # print(column_metadata) + # print(semantic_types, accepted_semantic_types) + + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + @classmethod + def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[ + OrderedDict]: + """ + Output metadata of selected columns. + Args: + outputs_metadata: metadata_base.DataMetadata + hyperparams: d3m.metadata.hyperparams.Hyperparams + + Returns: + d3m.metadata.base.DataMetadata + """ + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) + + # Update semantic types and prepare it for predicted targets. + semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set([]) + add_semantic_types = [] + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + """ + Updata metadata for selected columns. 
+ Args: + inputs_metadata: metadata_base.DataMetadata + outputs: Container Dataframe + target_columns_metadata: list + + Returns: + d3m.metadata.base.DataMetadata + """ + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + """ + Wrap predictions into dataframe + Args: + inputs: Container Dataframe + predictions: array-like data (n_samples, n_features) + + Returns: + Dataframe + """ + outputs = container.DataFrame(predictions, generate_metadata=True) + target_columns_metadata = self._copy_inputs_metadata(inputs.metadata, self._columns_to_produce, outputs.metadata, + self.hyperparams) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, target_columns_metadata) + return outputs + + @classmethod + def _copy_inputs_metadata(cls, inputs_metadata: metadata_base.DataMetadata, input_indices: List[int], + outputs_metadata: metadata_base.DataMetadata, hyperparams): + """ + Updata metadata for selected columns. + Args: + inputs_metadata: metadata.base.DataMetadata + input_indices: list + outputs_metadata: metadata.base.DataMetadata + hyperparams: d3m.metadata.hyperparams.Hyperparams + + Returns: + d3m.metadata.base.DataMetadata + """ + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + target_columns_metadata: List[OrderedDict] = [] + for column_index in input_indices: + column_name = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name") + if column_name is None: + column_name = "output_{}".format(column_index) + + column_metadata = OrderedDict(inputs_metadata.query_column(column_index)) + semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set([]) + add_semantic_types = set() + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + # print(column_metadata['semantic_types']) + + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + # If outputs has more columns than index, add Attribute Type to all remaining + if outputs_length > len(input_indices): + for column_index in range(len(input_indices), outputs_length): + column_metadata = OrderedDict() + semantic_types = set() + semantic_types.add(hyperparams["return_semantic_type"]) + column_name = "output_{}".format(column_index) + column_metadata["semantic_types"] = list(semantic_types) + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + # print(target_columns_metadata) + return target_columns_metadata + +WaveletTransformer.__doc__ = WaveletTransformer.__doc__ + +class Wavelet: + + wt_info = dict() + + def __init__(self, wavelet='db1', mode='symmetric', axis=-1, level=1, id=0): + self._wavelet = wavelet + self._mode = mode + self._axis = axis + self._level = level + self._id = id + return + + def produce(self, data, inverse): + + if inverse == 1: + output = self.inverse_transform_to_dataframe(coeffs=data) + + else: + output = self.transform_to_dataframe(data) + + return output + + + def 
transform_to_dataframe(self, data): + + # print(data) + coeffs_buf = pandas.DataFrame(columns=[]) + + + for index, data_to_transform in data.iteritems(): + # data_to_transform = data.squeeze(1) + # print(data_to_transform) + if self._level == None: + wavelet_dec_len = pywt.Wavelet(self._wavelet).dec_len + self._level = pywt.dwt_max_level(len(data_to_transform), wavelet_dec_len) + + coeffs = pywt.wavedec(data=data_to_transform, wavelet=self._wavelet, level=self._level) + coeffs_T = pandas.DataFrame(coeffs).T + coeffs_buf = pandas.concat([coeffs_buf, coeffs_T], axis=1) + # coeffs_T = ndarray(coeffs).T + # print(coeffs_T) + + # print(coeffs_buf) + + return coeffs_buf # coeffs_T + + def transform_to_single_dataframe(self, data): + + # print(data) + data_to_transform = data.squeeze(1) + wavelet_dec_len = pywt.Wavelet(self._wavelet).dec_len + self._level = pywt.dwt_max_level(len(data_to_transform), wavelet_dec_len) + + coeffs = pywt.wavedec(data=data_to_transform, wavelet=self._wavelet, level=self._level) + + cAD_size = [len(cAD) for cAD in coeffs] + Wavelet.wt_info[self._id] = {'wavelet': self._wavelet, + 'cAD_size': cAD_size, + } + + # print(len(data_to_transform)) + # + coeffs_list = [] # ndarray([0]) + for cAD in coeffs: + # print(cAD.shape) + # print(cAD[0:10]) + coeffs_list += list(cAD) + + # print(len(coeffs_list)) + + coeffs_T = pandas.DataFrame(coeffs_list) + # print(coeffs_T) + + return coeffs_T + + def inverse_transform_to_dataframe(self, coeffs): + # print('=======inverse_transform======') + # print('level: ', self._level) + # print(coeffs) + + coeffs_list = [numpy.array(col[~pandas.isnull(col)]) for index, col in coeffs.iteritems()] + # print(coeffs_list) + data = pywt.waverec(coeffs=coeffs_list, wavelet=self._wavelet) + + # print(data) + return data # [0:-1] + + def inverse_transform_to_single_dataframe(self, coeffs): + # print('=======inverse_transform======') + # print('level: ', self._level) + # print(coeffs) + # print(Wavelet.wt_info[self._id]) + wt_info = Wavelet.wt_info[self._id] + # print(wt_info) + # print(wt_info['cAD_size']) + # print(wt_info['wavelet']) + cAD_size = wt_info['cAD_size'] + self._wavelet = wt_info['wavelet'] + + coeffs_format = [] + coeff = coeffs + for cAD_len in cAD_size: + coeffs_format.append(np.array(coeff[0:cAD_len]).squeeze(axis=1)) + coeff = coeff[cAD_len:] + + # for cAD in coeffs_format: + # print(cAD.shape) + # print(cAD[0:10]) + + # print(coeffs_format) + data = pywt.waverec(coeffs=coeffs_format, wavelet=self._wavelet) + + # print(data.shape) + # print(data) + return data # [0:-1] diff --git a/tods/feature_analysis/__init__.py b/tods/feature_analysis/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tods/reinforcement/RuleBasedFilter.py b/tods/reinforcement/RuleBasedFilter.py new file mode 100644 index 0000000..f87ed53 --- /dev/null +++ b/tods/reinforcement/RuleBasedFilter.py @@ -0,0 +1,348 @@ +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os +import sklearn +import numpy +import typing +import time + +from d3m import container +from d3m.primitive_interfaces import base, transformer +from d3m.metadata import base as metadata_base, hyperparams + +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base import utils as base_utils +from 
d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult, DockerContainer + + +import os.path + +import time +import re + + +Inputs = container.DataFrame +Outputs = container.DataFrame + + +class Hyperparams(hyperparams.Hyperparams): + # Tuning + rule = hyperparams.Hyperparameter[str]( + default='', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description='The rule of filtering.' + ) + + # Control + use_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to operate on. If any specified column cannot be parsed, it is skipped.", + ) + exclude_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='append', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + # use_semantic_types = hyperparams.UniformBool( + # default=False, + # semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + # description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" + # ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.", + ) + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute'], + default='https://metadata.datadrivendiscovery.org/types/Attribute', + description='Decides what semantic type to attach to generated attributes', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + + +class RuleBasedFilter(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): + """ + Filter the selected columns according to the rule. + + Parameters + ---------- + rule: String + The rule to follow when performing the filter. Write it like how we write 'if' in python. And wrap column index with two '#': #col_num#. + e.g. "#1# > 10" means that the numbers in column 1 must be greater than 10. + The indicies of columns should be same with those in 'use_columns'. + + use_columns: Set + A set of column indices to force primitive to operate on. If any specified column cannot be parsed, it is skipped. 
+ The indicies of columns should be same with those in 'rule'. + + exclude_columns: Set + A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided. + + return_result: Enumeration + Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? + + add_index_columns: Bool + Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\". + + error_on_no_input: Bool( + Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False. + + return_semantic_type: Enumeration[str]( + Decides what semantic type to attach to generated attributes' + """ + + __author__: "DATA Lab at Texas A&M University" + metadata = metadata_base.PrimitiveMetadata({ + "name": "Rule-Based Filtering", + "python_path": "d3m.primitives.tods.reinforcement.rule_filter", + "source": {'name': 'DATA Lab at Texas A&M University', 'contact': 'mailto:khlai037@tamu.edu', + 'uris': ['https://gitlab.com/lhenry15/tods.git', ]}, + "algorithm_types": [metadata_base.PrimitiveAlgorithmType.RULE_BASED_FILTER,], + "primitive_family": metadata_base.PrimitiveFamily.REINFORCEMENT, + "id": "42744c37-8879-4785-9f18-6de9d612ea93", + "hyperparams_to_tune": ['rule',], + "version": "0.0.1", + }) + + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + """ + Process the testing data. + Args: + inputs: Container DataFrame. + + Returns: + Container DataFrame after BKFilter. + """ + # Get cols to fit. + + self._fitted = False + self._training_inputs, self._training_indices = self._get_columns_to_fit(inputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + + operated_col = [int(x.strip('#')) for x in re.findall(r'#\d*#', self.hyperparams['rule'])] + if set(operated_col) != set(self._training_indices): + # print(operated_col, self._training_indices) + raise RuntimeError("Column numbers in 'rule' and 'use_columns' are not matched.") + + + if len(self._training_indices) > 0: + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + + + # if not self._fitted: + # raise PrimitiveNotFittedError("Primitive not fitted.") + # sk_inputs = inputs + # if self.hyperparams['use_semantic_types']: + # sk_inputs = inputs.iloc[:, self._training_indices] + + output_columns = [] + + + if len(self._training_indices) > 0: + sk_output = self._rule_based_filter(inputs, self.hyperparams['rule']) + if sparse.issparse(sk_output): + sk_output = sk_output.toarray() + outputs = self._wrap_predictions(inputs, sk_output) + + if len(outputs.columns) == len(self._input_column_names): + outputs.columns = self._input_column_names + output_columns = [outputs] + + + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._training_indices, + columns_list=output_columns) + + # self._write(outputs) + # self.logger.warning('produce was called3') + return CallResult(outputs) + + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + """ 
+ Select columns to fit. + Args: + inputs: Container DataFrame + hyperparams: d3m.metadata.hyperparams.Hyperparams + + Returns: + list + """ + # if not hyperparams['use_semantic_types']: + # return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + use_columns = [] + exclude_columns = [] + + # if hyperparams['columns_using_method'] == 'name': + # inputs_cols = inputs.columns.values.tolist() + # for i in range(len(inputs_cols)): + # if inputs_cols[i] in hyperparams['use_columns_name']: + # use_columns.append(i) + # elif inputs_cols[i] in hyperparams['exclude_columns_name']: + # exclude_columns.append(i) + # else: + use_columns=hyperparams['use_columns'] + exclude_columns=hyperparams['exclude_columns'] + + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, use_columns=use_columns, exclude_columns=exclude_columns, can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: + """ + Output whether a column can be processed. + Args: + inputs_metadata: d3m.metadata.base.DataMetadata + column_index: int + + Returns: + bool + """ + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, numpy.integer, numpy.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + """ + Updata metadata for selected columns. 
+ Args: + inputs_metadata: metadata_base.DataMetadata + outputs: Container Dataframe + target_columns_metadata: list + + Returns: + d3m.metadata.base.DataMetadata + """ + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + """ + Wrap predictions into dataframe + Args: + inputs: Container Dataframe + predictions: array-like data (n_samples, n_features) + + Returns: + Dataframe + """ + outputs = d3m_dataframe(predictions, generate_metadata=True) + target_columns_metadata = self._add_target_columns_metadata(outputs.metadata, self.hyperparams) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, target_columns_metadata) + return outputs + + + @classmethod + def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams): + """ + Add target columns metadata + Args: + outputs_metadata: metadata.base.DataMetadata + hyperparams: d3m.metadata.hyperparams.Hyperparams + + Returns: + List[OrderedDict] + """ + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + # column_name = "output_{}".format(column_index) + column_metadata = OrderedDict() + semantic_types = set() + semantic_types.add(hyperparams["return_semantic_type"]) + column_metadata['semantic_types'] = list(semantic_types) + + # column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + def _write(self, inputs:Inputs): + inputs.to_csv(str(time.time())+'.csv') + + def _rule_based_filter(self, X, rule): + """ + Filter the selected columns according to the rule. 
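+        For example, the rule "#1# > 10" is rewritten by the regex substitution below
+        into the expression "row[1] > 10", which is evaluated with eval() for every row;
+        rows where the expression is False are marked with 1 in the returned 'result'
+        column.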
+ Args: + X: slected rows to be performed + rule: The rule to follow when performing the filter + + Returns: + Dataframe, results of Rule-Based Filter + """ + list_result = [0] * X.shape[0] + + rule = re.sub(r'#\d*#', lambda x: 'row[' + x.group(0).strip('#') + ']', rule) + + for index, row in X.iterrows(): + if not eval(rule): + list_result[index] = 1 + + return utils.pandas.DataFrame({'result': list_result}) + diff --git a/tods/requirements.txt b/tods/requirements.txt new file mode 100644 index 0000000..d587988 --- /dev/null +++ b/tods/requirements.txt @@ -0,0 +1,31 @@ +scikit-learn==0.21.3 +pytypes==1.0b5 +frozendict==1.2 +numpy>=1.15.4,<=1.18.1 +jsonschema==2.6.0 +requests>=2.19.1,<=2.22.0 +strict-rfc3339==0.7 +rfc3987==1.3.8 +webcolors>=1.8.1,<=1.10 +dateparser>=0.7.0,<=0.7.2 +python-dateutil==2.8.1 +pandas==0.23.4 +typing-inspect==0.5.0 +GitPython>=2.1.11,<=3.0.5 +jsonpath-ng==1.4.3 +custom-inherit>=2.2.0,<=2.2.2 +PyYAML>=5.1,<=5.3 +pycurl>=7.43.0.2,<=7.43.0.3 +pyarrow==0.15.1 +gputil>=1.3.0,<=1.4.0 +pyrsistent>=0.14.11,<=0.15.7 +scipy>=1.2.1,<=1.4.1 +openml==0.10.1 +lightgbm>=2.2.2,<=2.3.0 +opencv-python-headless<=4.1.1.26,>=4.1 +imageio>=2.3.0,<=2.6.0 +pillow==6.2.1 +xgboost>=0.81,<=0.90 +Jinja2==2.9.4 +simplejson==3.12.0 +gitdb2==2.0.6 diff --git a/tods/run_tests.py b/tods/run_tests.py new file mode 100755 index 0000000..16c264a --- /dev/null +++ b/tods/run_tests.py @@ -0,0 +1,11 @@ +#!/usr/bin/env python3 + +import sys +import unittest + +runner = unittest.TextTestRunner(verbosity=1) + +tests = unittest.TestLoader().discover('tests') + +if not runner.run(tests).wasSuccessful(): + sys.exit(1) diff --git a/tods/setup.py b/tods/setup.py new file mode 100644 index 0000000..4aa078c --- /dev/null +++ b/tods/setup.py @@ -0,0 +1,45 @@ +import os +from setuptools import setup, find_packages +import logging + +PACKAGE_NAME = 'tods' + +def read_file_entry_points(fname): + with open(fname) as entry_points: + return entry_points.read() + +def merge_entry_points(): + entry_list = ['entry_points.ini'] + merge_entry = [] + for entry_name in entry_list: + entry_point = read_file_entry_points(entry_name).replace(' ', '') + path_list = entry_point.split('\n')[1:] + merge_entry += path_list + entry_point_merge = dict() + entry_point_merge['d3m.primitives'] = list(set(merge_entry)) # remove dumplicated elements + return entry_point_merge + +setup( + name=PACKAGE_NAME, + version='0.0.1', + description='Primitives for time-series outlier detection', + author='DATA Lab', + packages=find_packages(exclude=['contrib', 'docs', 'tests*']), + install_requires=[ + 'd3m', + 'Jinja2', + 'simplejson==3.12.0', + 'scikit-learn==0.21.3', + 'statsmodels==0.11.1', + 'PyWavelets>=1.1.1', + 'tensorflow', # should be removed later + 'keras', # should be removed later + 'pyod', + 'nimfa==1.4.0', + 'stumpy==1.4.0', + 'more-itertools==8.5.0' + ], + + entry_points = merge_entry_points() + +) diff --git a/tods/tests/test_AutoRegODetect.py b/tods/tests/test_AutoRegODetect.py new file mode 100644 index 0000000..8381bfe --- /dev/null +++ b/tods/tests/test_AutoRegODetect.py @@ -0,0 +1,114 @@ +import unittest + +from d3m import container, utils +from d3m.metadata import base as metadata_base +from d3m.container import DataFrame as d3m_dataframe + +from detection_algorithm.AutoRegODetect import AutoRegODetector +from pyod.utils.data import generate_data + +from detection_algorithm.core.CollectiveCommonTest import CollectiveCommonTest + +import numpy as np + +class AutoRegODetectTestCase(unittest.TestCase): + def setUp(self): + + 
self.maxDiff = None + self.contamination = 0.1 + self.window_size = 2 + self.roc_floor = 0. # 0.8 + # self.n_train = 200 + # self.n_test = 100 + # self.X_train, self.y_train, self.X_test, self.y_test = generate_data( + # n_train=self.n_train, n_test=self.n_test, + # contamination=self.contamination, random_state=42) + # self.X_train = d3m_dataframe(self.X_train, generate_metadata=True) + # self.X_test = d3m_dataframe(self.X_test, generate_metadata=True) + + self.X_train = d3m_dataframe({'data': [3., 4., 8., 16, 18, 13., 22., 36., 59., 128, 62, 67, 78, 100]}, + columns=['data'], generate_metadata=True) + self.y_train = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]) + self.X_test = d3m_dataframe({'data': [3., 4., 8.6, 13.4, 22.5, 17, 19.2, 36.1, 127, -23, 59.2]}, + columns=['data'], generate_metadata=True) + self.y_test = np.array([0, 0, 0, 0, 0, 0, 0, 0, 1]) + + + + hyperparams_default = AutoRegODetector.metadata.get_hyperparams().defaults() + hyperparams = hyperparams_default.replace({'contamination': self.contamination, }) + hyperparams = hyperparams.replace({'window_size': self.window_size, }) + hyperparams = hyperparams.replace({'return_subseq_inds': True, }) + self.primitive = AutoRegODetector(hyperparams=hyperparams) + + self.primitive.set_training_data(inputs=self.X_train) + self.primitive.fit() + self.prediction_labels = self.primitive.produce(inputs=self.X_test).value + self.prediction_score = self.primitive.produce_score(inputs=self.X_test).value + + self.collective_common_test = CollectiveCommonTest(model=self.primitive._clf, + X_train=self.X_train, + y_train=self.y_train, + X_test=self.X_test, + y_test=self.y_test, + roc_floor=self.roc_floor, + ) + + def test_detector(self): + self.collective_common_test.test_detector() + pass + + def test_metadata(self): + # print(self.prediction_labels.metadata.to_internal_simple_structure()) + self.assertEqual(utils.to_json_structure(self.prediction_labels.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + # 'top_level': 'main', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 9, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': { + 'name': 'AutoRegODetector0_0', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': { + 'name': 'AutoRegODetector0_1', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': { + 'name': 'AutoRegODetector0_2', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.int64', + }, + }]) + + def test_params(self): + params = self.primitive.get_params() + self.primitive.set_params(params=params) + + +if __name__ == '__main__': + unittest.main() diff --git a/tods/tests/test_BKFilter.py b/tods/tests/test_BKFilter.py new file mode 100755 index 0000000..b167aea --- /dev/null +++ 
b/tods/tests/test_BKFilter.py @@ -0,0 +1,127 @@ +import unittest + +from d3m import container, utils +from d3m.metadata import base as metadata_base +from feature_analysis import BKFilter + + +class BKFilterTest(unittest.TestCase): + def test_basic(self): + self.maxDiff = None + main = container.DataFrame({'a': [1., 2., 3.], 'b': [2., 3., 4.], 'c': [3., 4., 5.],}, + columns=['a', 'b', 'c'], + generate_metadata=True) + + print(main) + + + self.assertEqual(utils.to_json_structure(main.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + # 'top_level': 'main', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'a'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'b'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'c'} + }]) + + + self.assertIsInstance(main, container.DataFrame) + + + hyperparams_class = BKFilter.BKFilter.metadata.get_hyperparams() + primitive = BKFilter.BKFilter(hyperparams=hyperparams_class.defaults()) + new_main = primitive.produce(inputs=main).value + print(new_main) + + + self.assertEqual(utils.to_json_structure(new_main.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + # 'top_level': 'main', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 6, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': { + 'name': 'a', + 'structural_type': 'numpy.float64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': { + 'name': 'b', + 'structural_type': 'numpy.float64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': { + 'name': 'c', + 'structural_type': 'numpy.float64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 3], + 'metadata': { + 'name': 'output_0', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.float64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 4], + 'metadata': { + 'name': 'output_1', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.float64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 5], + 'metadata': { + 'name': 'output_2', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.float64', + }, + }]) + + + +if __name__ == '__main__': + unittest.main() diff --git a/tods/tests/test_CategoricalBinary.py 
b/tods/tests/test_CategoricalBinary.py new file mode 100644 index 0000000..40fb003 --- /dev/null +++ b/tods/tests/test_CategoricalBinary.py @@ -0,0 +1,146 @@ +import unittest + +from d3m import container, utils +from d3m.metadata import base as metadata_base + +from data_processing import CategoricalToBinary +import numpy as np +import pandas as pd +import utils as test_utils +import os +from common_primitives import dataset_to_dataframe,column_parser + +class CategoricalBinaryTestCase(unittest.TestCase): + def test_basic(self): + self.maxDiff=None + + main = container.DataFrame({'A': [1, 2], 'B': ['a','b']}, + columns=['A', 'B'], + generate_metadata=True) + + self.assertEqual(utils.to_json_structure(main.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + # 'top_level': 'main', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 2, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 2, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': { + 'structural_type': 'numpy.int64', + 'name': 'A', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': { + 'structural_type': 'str', + 'name': 'B', + }, + }]) + + + self.assertIsInstance(main, container.DataFrame) + + hyperparams_class = CategoricalToBinary.CategoricalToBinary.metadata.get_hyperparams() + hp = hyperparams_class.defaults().replace({ + 'use_semantic_types':True, + 'use_columns': (0,), + 'return_result':'append', + }) + + primitive = CategoricalToBinary.CategoricalToBinary(hyperparams=hp) + new_main = primitive.produce(inputs=main).value + + c = pd.DataFrame({"A":[1,2], "B":['a','b'],"A_1":["1","0"],"A_2":["0","1"]}) + + pd.testing.assert_frame_equal(new_main, c) + # print("new_main\n",new_main) + + # print(utils.to_json_structure(new_main.metadata.to_internal_simple_structure())) + self.assertEqual(utils.to_json_structure(new_main.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + # 'top_level': 'main', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 2, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 4, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': { + 'name': 'A', + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': { + 'name': 'B', + 'structural_type': 'str', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': { + 'name': 'A_1', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'str', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 3], + 'metadata': { + 'name': 'A_2', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'str', + }, + }]) + + 
+ # print(new_main) + # print(test_utils.convert_through_json(new_main.metadata.query(()))) + # print(test_utils.convert_through_json(new_main.metadata.query((metadata_base.ALL_ELEMENTS,)))) + # print(mean_mse, std_mse) + + + # print("after testing") + + # self.assertAlmostEqual(mean_mse.__float__(), 0., delta=1e-8) + # self.assertAlmostEqual(std_mse.__float__(), 0., delta=1e-8) + + # print(main.metadata.to_internal_simple_structure()) + # print(new_main.metadata.to_internal_simple_structure()) + + params = primitive.get_params() + primitive.set_params(params=params) + + +if __name__ == '__main__': + unittest.main() diff --git a/tods/tests/test_ColumnFilter.py b/tods/tests/test_ColumnFilter.py new file mode 100644 index 0000000..843b3be --- /dev/null +++ b/tods/tests/test_ColumnFilter.py @@ -0,0 +1,106 @@ +import unittest + +from d3m import container, utils +from d3m.metadata import base as metadata_base +from data_processing import ColumnFilter + + +class ColumnFilterTest(unittest.TestCase): + def test_basic(self): + self.maxDiff = None + main = container.DataFrame({'a': [1., 2., 3.], 'b': [2., 3., 4.], 'c': [3., 4., 5.],}, + columns=['a', 'b', 'c'], + generate_metadata=True) + + print(main) + + + self.assertEqual(utils.to_json_structure(main.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + # 'top_level': 'main', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'a'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'b'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'c'} + }]) + + + self.assertIsInstance(main, container.DataFrame) + + + hyperparams_class = ColumnFilter.ColumnFilter.metadata.get_hyperparams() + primitive = ColumnFilter.ColumnFilter(hyperparams=hyperparams_class.defaults()) + new_main = primitive.produce(inputs=main).value + print(new_main) + + + self.assertEqual(utils.to_json_structure(new_main.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + # 'top_level': 'main', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': { + 'name': 'a', + 'structural_type': 'numpy.float64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': { + 'name': 'b', + 'structural_type': 'numpy.float64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': { + 'name': 'c', + 
'structural_type': 'numpy.float64', + }, + }]) + + + +if __name__ == '__main__': + unittest.main() diff --git a/tods/tests/test_ContinuityValidation.py b/tods/tests/test_ContinuityValidation.py new file mode 100644 index 0000000..6169f79 --- /dev/null +++ b/tods/tests/test_ContinuityValidation.py @@ -0,0 +1,137 @@ +import unittest + +from d3m import container, utils +from d3m.metadata import base as metadata_base +from data_processing import ContinuityValidation + + +class ContinuityValidationTest(unittest.TestCase): + def test_basic(self): + main = container.DataFrame({'d3mIndex': [0, 1, 2], 'timestamp': [1., 2., 4.], 'a': [1., 2., 3.], 'b': [2., 3., 4.], 'ground_truth': [0, 0, 0],}, + columns=['d3mIndex', 'timestamp', 'a', 'b', 'ground_truth'], + generate_metadata=True) + + # print(main) + + self.assertEqual(utils.to_json_structure(main.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + # 'top_level': 'main', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 5, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': {'structural_type': 'numpy.int64', 'name': 'd3mIndex'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'timestamp'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'a'} + }, { + 'selector': ['__ALL_ELEMENTS__', 3], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'b'} + }, { + 'selector': ['__ALL_ELEMENTS__', 4], + 'metadata': {'structural_type': 'numpy.int64', 'name': 'ground_truth'} + }]) + + + self.assertIsInstance(main, container.DataFrame) + + + hyperparams_class = ContinuityValidation.ContinuityValidation.metadata.get_hyperparams() + primitive = ContinuityValidation.ContinuityValidation(hyperparams=hyperparams_class.defaults()) + new_main = primitive.produce(inputs=main).value + # print(new_main) + + expected_output = container.DataFrame({'d3mIndex': [0, 1, 2, 3], + 'timestamp': [1., 2., 3., 4.], + 'a': [1., 2., 2.5, 3.], 'b': [2., 3., 3.5, 4.], + 'ground_truth': [0, 0, 0, 0]}) + self.assertEqual(new_main.values.tolist() , expected_output.values.tolist()) + + + self.assertEqual(utils.to_json_structure(new_main.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + # 'top_level': 'main', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 4, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 5, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': { + 'name': 'd3mIndex', + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': { + 'name': 
'timestamp', + 'structural_type': 'numpy.float64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': { + 'name': 'a', + 'structural_type': 'numpy.float64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 3], + 'metadata': { + 'name': 'b', + 'structural_type': 'numpy.float64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 4], + 'metadata': { + 'name': 'ground_truth', + 'structural_type': 'numpy.int64', + }, + }]) + + self._test_continuity(new_main) + + + def _test_continuity(self, data_value): + tmp_col = data_value['timestamp'] + interval = tmp_col[1] - tmp_col[0] + for i in range(2, tmp_col.shape[0]): + self.assertEqual(interval, tmp_col[i] - tmp_col[i-1]) + + + +if __name__ == '__main__': + unittest.main() diff --git a/tods/tests/test_DeepLog.py b/tods/tests/test_DeepLog.py new file mode 100644 index 0000000..01e549a --- /dev/null +++ b/tods/tests/test_DeepLog.py @@ -0,0 +1,105 @@ +import unittest + +from d3m import container, utils +from d3m.metadata import base as metadata_base +from detection_algorithm.DeepLog import DeepLogPrimitive + + + +class DeepLogTest(unittest.TestCase): + def test_basic(self): + self.maxDiff = None + main = container.DataFrame({'a': [1., 2., 3., 4.], 'b': [2., 3., 4., 5.], 'c': [3., 4., 5., 6.]}, + columns=['a', 'b', 'c'], + generate_metadata=True) + + print(main) + + + self.assertEqual(utils.to_json_structure(main.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + # 'top_level': 'main', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 4, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'a'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'b'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'c'} + }]) + + + self.assertIsInstance(main, container.DataFrame) + + + hyperparams_class = DeepLogPrimitive.metadata.get_hyperparams() + hyperparams = hyperparams_class.defaults() + hyperparams = hyperparams.replace({'batch_size': 4}) + + print(hyperparams) + + primitive = DeepLogPrimitive(hyperparams=hyperparams) + primitive.set_training_data(inputs=main) + primitive.fit() + new_main = primitive.produce(inputs=main).value + new_main_score = primitive.produce_score(inputs=main).value + print(new_main) + print(new_main_score) + + self.assertEqual(utils.to_json_structure(main.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + # 'top_level': 'main', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 4, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 
'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'a'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'b'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'c'} + }]) + + +if __name__ == '__main__': + unittest.main() diff --git a/tods/tests/test_DiscreteCosineTransform.py b/tods/tests/test_DiscreteCosineTransform.py new file mode 100644 index 0000000..1443bc8 --- /dev/null +++ b/tods/tests/test_DiscreteCosineTransform.py @@ -0,0 +1,124 @@ +from d3m import container,utils +from d3m.metadata import base as metadata_base +import unittest +from feature_analysis import DiscreteCosineTransform +from common_primitives import dataset_to_dataframe + +import utils as test_utils +import os +import numpy as np +import pandas as pd +import logging +from scipy.fft import fft +from cmath import polar + +class DctTestCase(unittest.TestCase): + def test_basic(self): + self.maxDiff=None + column_index =0 + + main = container.DataFrame({'A': [1, 2, 3], 'B': ['a','b','c']}, + columns=['A', 'B'], + generate_metadata=True) + + self.assertEqual(utils.to_json_structure(main.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + # 'top_level': 'main', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 2, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': { + 'structural_type': 'numpy.int64', + 'name': 'A', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': { + 'structural_type': 'str', + 'name': 'B', + }, + }]) + + + self.assertIsInstance(main, container.DataFrame) + + hyperparams_class = DiscreteCosineTransform.DiscreteCosineTransform.metadata.get_hyperparams() + hp = hyperparams_class.defaults().replace({ + 'use_semantic_types':True, + 'use_columns': (0,), + 'return_result':'append', + }) + primitive = DiscreteCosineTransform.DiscreteCosineTransform(hyperparams=hp) + new_main = primitive.produce(inputs=main).value + + c = pd.DataFrame({"A":[1,2,3], "B":['a','b','c'],'A_dct_coeff':[1.200000e+01,-3.464102e+00,-4.440892e-16]}) + + pd.testing.assert_frame_equal(new_main, c) + + params = primitive.get_params() + primitive.set_params(params=params) + + + self.assertEqual(utils.to_json_structure(new_main.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + # 'top_level': 'main', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': { + 'name': 'A', 
+ 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': { + 'name': 'B', + 'structural_type': 'str', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': { + 'name': 'A_dct_coeff', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.float64', + }, + }]) + +if __name__ == '__main__': + unittest.main() diff --git a/tods/tests/test_DuplicationValidation.py b/tods/tests/test_DuplicationValidation.py new file mode 100644 index 0000000..b9a9bcb --- /dev/null +++ b/tods/tests/test_DuplicationValidation.py @@ -0,0 +1,112 @@ +import unittest + +from d3m import container, utils +from d3m.metadata import base as metadata_base +from data_processing import DuplicationValidation + + +class DuplicationValidationTest(unittest.TestCase): + def test_basic(self): + main = container.DataFrame({'timestamp': [1., 1., 4.],'a': [1., 2., 3.], 'b': [2., 3., 4.],}, + columns=['timestamp', 'a', 'b'], + generate_metadata=True) + + + self.assertEqual(utils.to_json_structure(main.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + # 'top_level': 'main', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'timestamp'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'a'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'b'} + }]) + + + self.assertIsInstance(main, container.DataFrame) + + + hyperparams_class = DuplicationValidation.DuplicationValidation.metadata.get_hyperparams() + primitive = DuplicationValidation.DuplicationValidation(hyperparams=hyperparams_class.defaults()) + new_main = primitive.produce(inputs=main).value + print(new_main) + + expected_output = container.DataFrame({'timestamp': [1., 4.],'a': [1., 3.], 'b': [2., 4.],}) + self.assertEqual(new_main.values.tolist() , expected_output.values.tolist()) + + + self.assertEqual(utils.to_json_structure(new_main.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + # 'top_level': 'main', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 2, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': { + 'name': 'timestamp', + 'structural_type': 'numpy.float64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': { + 'name': 'a', + 'structural_type': 'numpy.float64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 
'metadata': { + 'name': 'b', + 'structural_type': 'numpy.float64', + }, + }]) + + self._test_drop_duplication(new_main) + + + def _test_drop_duplication(self, data_value): + self.assertEqual(True in list(data_value.duplicated('timestamp')), False) + + + +if __name__ == '__main__': + unittest.main() diff --git a/tods/tests/test_FastFourierTransform.py b/tods/tests/test_FastFourierTransform.py new file mode 100644 index 0000000..76e3846 --- /dev/null +++ b/tods/tests/test_FastFourierTransform.py @@ -0,0 +1,133 @@ +from d3m import container,utils +from d3m.metadata import base as metadata_base +import unittest +from feature_analysis import FastFourierTransform +from common_primitives import dataset_to_dataframe + +import utils as test_utils +import os +import numpy as np +import pandas as pd +import logging +from scipy.fft import fft +from cmath import polar + +class FftTestCase(unittest.TestCase): + def test_basic(self): + self.maxDiff=None + column_index =0 + + + main = container.DataFrame({'A': [1, 2, 3], 'B': ['a','b','c']}, + columns=['A', 'B'], + generate_metadata=True) + + self.assertEqual(utils.to_json_structure(main.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + # 'top_level': 'main', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 2, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': { + 'structural_type': 'numpy.int64', + 'name': 'A', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': { + 'structural_type': 'str', + 'name': 'B', + }, + }]) + + + self.assertIsInstance(main, container.DataFrame) + + hyperparams_class = FastFourierTransform.FastFourierTransform.metadata.get_hyperparams() + hp = hyperparams_class.defaults().replace({ + 'use_semantic_types':True, + 'use_columns': (0,), + 'return_result':'append', + }) + primitive = FastFourierTransform.FastFourierTransform(hyperparams=hp) + new_main = primitive.produce(inputs=main).value + + c = pd.DataFrame({"A":[1,2,3], "B":['a','b','c'],'A_fft_abs':[6.000000,1.732051,1.732051],'A_fft_phse':[-0.000000,2.617994,-2.617994]}) + + pd.testing.assert_frame_equal(new_main, c) + + params = primitive.get_params() + primitive.set_params(params=params) + + + self.assertEqual(utils.to_json_structure(new_main.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + # 'top_level': 'main', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 4, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': { + 'name': 'A', + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': { + 
'name': 'B', + 'structural_type': 'str', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': { + 'name': 'A_fft_abs', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.float64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 3], + 'metadata': { + 'name': 'A_fft_phse', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.float64', + }, + }]) + + +if __name__ == '__main__': + unittest.main() diff --git a/tods/tests/test_HPFilter.py b/tods/tests/test_HPFilter.py new file mode 100644 index 0000000..07945d9 --- /dev/null +++ b/tods/tests/test_HPFilter.py @@ -0,0 +1,148 @@ +import unittest + +from d3m import container, utils +from d3m.metadata import base as metadata_base +from feature_analysis import HPFilter + + +class HPFilterTest(unittest.TestCase): + def test_basic(self): + self.maxDiff = None + main = container.DataFrame({'a': [1., 2., 3.], 'b': [2., 3., 4.], 'c': [3., 4., 5.],}, + columns=['a', 'b', 'c'], + generate_metadata=True) + + print(main) + + + self.assertEqual(utils.to_json_structure(main.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + # 'top_level': 'main', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'a'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'b'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'c'} + }]) + + + self.assertIsInstance(main, container.DataFrame) + + + hyperparams_class = HPFilter.HPFilter.metadata.get_hyperparams() + primitive = HPFilter.HPFilter(hyperparams=hyperparams_class.defaults()) + new_main = primitive.produce(inputs=main).value + print(new_main) + + + self.assertEqual(utils.to_json_structure(new_main.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + # 'top_level': 'main', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 9, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': { + 'name': 'a', + 'structural_type': 'numpy.float64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': { + 'name': 'b', + 'structural_type': 'numpy.float64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': { + 'name': 'c', + 'structural_type': 'numpy.float64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 3], + 'metadata': { + 'name': 'a_cycle', + 'semantic_types': 
['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.float64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 4], + 'metadata': { + 'name': 'a_trend', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.float64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 5], + 'metadata': { + 'name': 'b_cycle', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.float64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 6], + 'metadata': { + 'name': 'b_trend', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.float64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 7], + 'metadata': { + 'name': 'c_cycle', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.float64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 8], + 'metadata': { + 'name': 'c_trend', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.float64', + }, + }]) + + + +if __name__ == '__main__': + unittest.main() diff --git a/tods/tests/test_HoltSmoothing.py b/tods/tests/test_HoltSmoothing.py new file mode 100644 index 0000000..d7069f9 --- /dev/null +++ b/tods/tests/test_HoltSmoothing.py @@ -0,0 +1,71 @@ +import unittest + +from d3m import container, utils +from d3m.metadata import base as metadata_base + + +from timeseries_processing import HoltSmoothing +import pandas as pd + + +class HoltSmoothingTestCase(unittest.TestCase): + def test_basic(self): + main = container.DataFrame({'timestamp': [1, 2, 3], 'value': [1,2,3]}, { + 'top_level': 'main', + }, generate_metadata=True) + + + self.assertEqual(utils.to_json_structure(main.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + 'top_level': 'main', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 2, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': {'structural_type': 'numpy.int64', 'name': 'timestamp'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': {'structural_type': 'numpy.int64', 'name': 'value'}, + }]) + + hyperparams_class = HoltSmoothing.HoltSmoothing.metadata.get_hyperparams() + primitive = HoltSmoothing.HoltSmoothing(hyperparams=hyperparams_class.defaults()) + # primitive.set_training_data(inputs=main) + # primitive.fit() + output_main = primitive.produce(inputs=main).value + output_main = round(output_main, 2) + # new_main_drop = new_main.iloc[2:] + # new_main_drop = new_main_drop.reset_index(drop = True) + print ( "output", output_main) + + expected_result = container.DataFrame(data = { 'timestamp' : [1,2,3], 'value_holt_smoothing': [2.00,2.76,3.54]}) + print ("expected_result", expected_result) + # output_main.reset_index() + + + + self.assertEqual(output_main[['timestamp','value_holt_smoothing']].values.tolist(), expected_result[['timestamp','value_holt_smoothing']].values.tolist()) + + params = primitive.get_params() + 
primitive.set_params(params=params) + + +if __name__ == '__main__': + unittest.main() diff --git a/tods/tests/test_HoltWintersExponentialSmoothing.py b/tods/tests/test_HoltWintersExponentialSmoothing.py new file mode 100644 index 0000000..f9ecc8d --- /dev/null +++ b/tods/tests/test_HoltWintersExponentialSmoothing.py @@ -0,0 +1,71 @@ +import unittest + +from d3m import container, utils +from d3m.metadata import base as metadata_base + + +from timeseries_processing import HoltWintersExponentialSmoothing +import pandas as pd + + +class HoltSmoothingTestCase(unittest.TestCase): + def test_basic(self): + main = container.DataFrame({'timestamp': [1, 2, 3,4], 'value': [0.32,0.32,0.31,0.33],}, { + 'top_level': 'main', + }, generate_metadata=True) + + + self.assertEqual(utils.to_json_structure(main.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + 'top_level': 'main', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 4, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 2, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': {'structural_type': 'numpy.int64', 'name': 'timestamp'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'value'}, + }]) + + hyperparams_class = HoltWintersExponentialSmoothing.HoltWintersExponentialSmoothing.metadata.get_hyperparams() + primitive = HoltWintersExponentialSmoothing.HoltWintersExponentialSmoothing(hyperparams=hyperparams_class.defaults()) + # primitive.set_training_data(inputs=main) + # primitive.fit() + output_main = primitive.produce(inputs=main).value + output_main = round(output_main,2) + + # new_main_drop = new_main.iloc[2:] + # new_main_drop = new_main_drop.reset_index(drop = True) + print ( "output", output_main) + + expected_result = container.DataFrame(data = { 'timestamp' : [1,2,3,4], 'value': [0.32,0.32,0.31,0.32]}) + print ("expected_result", expected_result) + # output_main.reset_index() + + self.assertEqual(output_main[['timestamp','value_holt_winters_smoothing']].values.tolist(), expected_result[['timestamp','value']].values.tolist()) + + + params = primitive.get_params() + primitive.set_params(params=params) + + +if __name__ == '__main__': + unittest.main() diff --git a/tods/tests/test_KDiscordODetect.py b/tods/tests/test_KDiscordODetect.py new file mode 100644 index 0000000..0b94407 --- /dev/null +++ b/tods/tests/test_KDiscordODetect.py @@ -0,0 +1,114 @@ +import unittest + +from d3m import container, utils +from d3m.metadata import base as metadata_base +from d3m.container import DataFrame as d3m_dataframe + +from detection_algorithm.KDiscordODetect import KDiscordODetector +from pyod.utils.data import generate_data + +from detection_algorithm.core.CollectiveCommonTest import CollectiveCommonTest + +import numpy as np + +class KDiscordODetectTestCase(unittest.TestCase): + def setUp(self): + + self.maxDiff = None + self.contamination = 0.1 + self.window_size = 2 + self.roc_floor = 0. 
# 0.8 + # self.n_train = 200 + # self.n_test = 100 + # self.X_train, self.y_train, self.X_test, self.y_test = generate_data( + # n_train=self.n_train, n_test=self.n_test, + # contamination=self.contamination, random_state=42) + # self.X_train = d3m_dataframe(self.X_train, generate_metadata=True) + # self.X_test = d3m_dataframe(self.X_test, generate_metadata=True) + + self.X_train = d3m_dataframe({'data': [3., 4., 8., 16, 18, 13., 22., 36., 59., 128, 62, 67, 78, 100]}, + columns=['data'], generate_metadata=True) + self.y_train = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]) + self.X_test = d3m_dataframe({'data': [3., 4., 8.6, 13.4, 22.5, 17, 19.2, 36.1, 127, -23, 59.2]}, + columns=['data'], generate_metadata=True) + self.y_test = np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0]) + + + + hyperparams_default = KDiscordODetector.metadata.get_hyperparams().defaults() + hyperparams = hyperparams_default.replace({'contamination': self.contamination, }) + hyperparams = hyperparams.replace({'window_size': self.window_size, }) + hyperparams = hyperparams.replace({'return_subseq_inds': True, }) + self.primitive = KDiscordODetector(hyperparams=hyperparams) + + self.primitive.set_training_data(inputs=self.X_train) + self.primitive.fit() + self.prediction_labels = self.primitive.produce(inputs=self.X_test).value + self.prediction_score = self.primitive.produce_score(inputs=self.X_test).value + + self.collective_common_test = CollectiveCommonTest(model=self.primitive._clf, + X_train=self.X_train, + y_train=self.y_train, + X_test=self.X_test, + y_test=self.y_test, + roc_floor=self.roc_floor, + ) + + def test_detector(self): + self.collective_common_test.test_detector() + pass + + def test_metadata(self): + # print(self.prediction_labels.metadata.to_internal_simple_structure()) + self.assertEqual(utils.to_json_structure(self.prediction_labels.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + # 'top_level': 'main', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 10, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': { + 'name': 'KDiscordODetector0_0', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': { + 'name': 'KDiscordODetector0_1', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': { + 'name': 'KDiscordODetector0_2', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.int64', + }, + }]) + + def test_params(self): + params = self.primitive.get_params() + self.primitive.set_params(params=params) + + +if __name__ == '__main__': + unittest.main() diff --git a/tods/tests/test_LSTMODetector.py b/tods/tests/test_LSTMODetector.py new file mode 100644 index 0000000..adaf49f --- /dev/null +++ b/tods/tests/test_LSTMODetector.py @@ -0,0 +1,96 @@ +import unittest + +from d3m import 
container, utils +from d3m.metadata import base as metadata_base +from d3m.container import DataFrame as d3m_dataframe + +from detection_algorithm.LSTMODetect import LSTMODetector +from pyod.utils.data import generate_data + +from detection_algorithm.core.CollectiveCommonTest import CollectiveCommonTest + +import numpy as np + +class LSTMODTestCase(unittest.TestCase): + def setUp(self): + + self.maxDiff = None + self.contamination = 0.1 + self.roc_floor = 0. # 0.8 + # self.n_train = 200 + # self.n_test = 100 + # self.X_train, self.y_train, self.X_test, self.y_test = generate_data( + # n_train=self.n_train, n_test=self.n_test, + # contamination=self.contamination, random_state=42) + # self.X_train = d3m_dataframe(self.X_train, generate_metadata=True) + # self.X_test = d3m_dataframe(self.X_test, generate_metadata=True) + + self.X_train = d3m_dataframe({'data': [3., 4., 8., 16, 18, 13., 22., 36., 59., 128, 62, 67, 78, 100]}, + columns=['data'], generate_metadata=True) + self.y_train = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]) + self.X_test = d3m_dataframe({'data': [3., 4., 8.6, 13.4, 22.5, 17, 19.2, 36.1, 127, -23, 59.2]}, + columns=['data'], generate_metadata=True) + self.y_test = np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0]) + + + + hyperparams_default = LSTMODetector.metadata.get_hyperparams().defaults() + hyperparams = hyperparams_default.replace({'contamination': self.contamination, }) + self.primitive = LSTMODetector(hyperparams=hyperparams) + + self.primitive.set_training_data(inputs=self.X_train) + self.primitive.fit() + self.prediction_labels = self.primitive.produce(inputs=self.X_test).value + self.prediction_score = self.primitive.produce_score(inputs=self.X_test).value + + self.collective_common_test = CollectiveCommonTest(model=self.primitive._clf, + X_train=self.X_train, + y_train=self.y_train, + X_test=self.X_test, + y_test=self.y_test, + roc_floor=self.roc_floor, + ) + + def test_detector(self): + self.collective_common_test.test_detector() + + def test_metadata(self): + # print(self.prediction_labels.metadata.to_internal_simple_structure()) + self.assertEqual(utils.to_json_structure(self.prediction_labels.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + # 'top_level': 'main', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 10, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 1, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': { + 'name': 'LSTMODetector0_0', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.int64', + }, + }]) + + def test_params(self): + params = self.primitive.get_params() + self.primitive.set_params(params=params) + + +if __name__ == '__main__': + unittest.main() diff --git a/tods/tests/test_MatrixProfile.py b/tods/tests/test_MatrixProfile.py new file mode 100644 index 0000000..35071da --- /dev/null +++ b/tods/tests/test_MatrixProfile.py @@ -0,0 +1,104 @@ +import unittest + +from d3m import container, utils +from d3m.metadata import base as metadata_base +from detection_algorithm.MatrixProfile import MatrixProfile + + + +class 
MatrixProfileTest(unittest.TestCase): + def test_basic(self): + self.maxDiff = None + main = container.DataFrame({'a': [1., 2., 3., 4., 5., 6., 7., 8., 9.], + 'b': [2., 3., 4., 5., 6., 7., 8., 9., 10.], + 'c': [3., 4., 5., 6., 7., 8., 9., 10., 11.]}, + columns=['a', 'b', 'c'], + generate_metadata=True) + + print(main) + + + self.assertEqual(utils.to_json_structure(main.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + # 'top_level': 'main', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 9, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'a'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'b'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'c'} + }]) + + + self.assertIsInstance(main, container.DataFrame) + + + hyperparams_class = MatrixProfile.metadata.get_hyperparams() + hyperparams = hyperparams_class.defaults() + hyperparams = hyperparams.replace({'window_size': 3}) + + primitive = MatrixProfile(hyperparams=hyperparams) + #primitive.set_training_data(inputs=main) + #primitive.fit() + new_main = primitive.produce(inputs=main).value + print(new_main) + + + self.assertEqual(utils.to_json_structure(main.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + # 'top_level': 'main', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 9, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'a'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'b'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'c'} + }]) + + +if __name__ == '__main__': + unittest.main() diff --git a/tods/tests/test_MovingAverageTransform.py b/tods/tests/test_MovingAverageTransform.py new file mode 100644 index 0000000..aedb9c1 --- /dev/null +++ b/tods/tests/test_MovingAverageTransform.py @@ -0,0 +1,69 @@ +import unittest + +from d3m import container, utils +from d3m.metadata import base as metadata_base + + +from timeseries_processing import MovingAverageTransform +import pandas as pd + + +class MovingAverageTransformTestCase(unittest.TestCase): + def test_basic(self): + main = container.DataFrame({'timestamp': [20201, 20202, 20203,20204,20205], 'value': [100,200,300,400,500],}, { + 'top_level': 'main', + }, generate_metadata=True) + + + 
self.assertEqual(utils.to_json_structure(main.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + 'top_level': 'main', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 5, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 2, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': {'structural_type': 'numpy.int64', 'name': 'timestamp'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': {'structural_type': 'numpy.int64', 'name': 'value'}, + }]) + + hyperparams_class = MovingAverageTransform.MovingAverageTransform.metadata.get_hyperparams() + primitive = MovingAverageTransform.MovingAverageTransform(hyperparams=hyperparams_class.defaults()) + # primitive.set_training_data(inputs=main) + # primitive.fit() + output_main = primitive.produce(inputs=main).value + + # new_main_drop = new_main.iloc[2:] + # new_main_drop = new_main_drop.reset_index(drop = True) + # print ( "input", new_main_drop) + + expected_result = container.DataFrame(data = { 'timestamp' : [20201,20202,20203,20204,20205], 'value': [150.0,200.0,300.0,400.0,450.0]}) + print ("expected_result", expected_result) + # new_main_drop.reset_index() + + + self.assertEqual(output_main[['timestamp','value_moving_average']].values.tolist(), expected_result[['timestamp','value']].values.tolist()) + params = primitive.get_params() + primitive.set_params(params=params) + + +if __name__ == '__main__': + unittest.main() diff --git a/tods/tests/test_NonNegativeMatrixFactorization.py b/tods/tests/test_NonNegativeMatrixFactorization.py new file mode 100644 index 0000000..eb741f1 --- /dev/null +++ b/tods/tests/test_NonNegativeMatrixFactorization.py @@ -0,0 +1,188 @@ +from d3m import container +from d3m.metadata import base as metadata_base +import unittest +from feature_analysis import NonNegativeMatrixFactorization +from common_primitives import dataset_to_dataframe,column_parser +from d3m import container,utils +from d3m.container import DataFrame as d3m_dataframe + +import utils as test_utils +import os +import numpy as np +import pandas as pd +import logging +from scipy.fft import fft +from cmath import polar +import nimfa + +LENGTH = 1400 + +class NmfTestCase(unittest.TestCase): + def test_basic(self): + self.maxDiff=None + + main = container.DataFrame({'A': [1, 2, 3], 'B': [4,5,6]}, + columns=['A', 'B'], + generate_metadata=True) + + self.assertEqual(utils.to_json_structure(main.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + # 'top_level': 'main', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 2, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': { + 
'structural_type': 'numpy.int64', + 'name': 'A', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': { + 'structural_type': 'numpy.int64', + 'name': 'B', + }, + }]) + + a = np.array([[1,0,1,0,1],[1,0,1,0,1],[1,0,1,0,1]]) + b = np.array([[1,0],[1,0],[1,0],[1,0],[1,0]]) + + hyperparams_class = NonNegativeMatrixFactorization.NonNegativeMatrixFactorization.metadata.get_hyperparams() + hp = hyperparams_class.defaults().replace({ + 'use_semantic_types': True, + 'use_columns': (0,1,), + 'return_result':'append', + 'rank':5, + 'seed':'fixed', + 'W':a, + 'H': b, + }) + primitive = NonNegativeMatrixFactorization.NonNegativeMatrixFactorization(hyperparams=hp) + new_main = primitive.produce(inputs=main).value + + print("new_main",new_main) + c = pd.DataFrame({"A":[1,2,3,np.nan,np.nan], "B":[4,5,6,np.nan,np.nan], + 'row_latent_vector_0':[0.816725,1.078965,1.341205,np.nan,np.nan], + 'row_latent_vector_1':[3.514284e-16,2.383547e-16,2.227207e-16,np.nan,np.nan], + 'row_latent_vector_2':[0.816725,1.078965,1.341205,np.nan,np.nan], + 'row_latent_vector_3':[3.514284e-16,2.383547e-16,2.227207e-16,np.nan,np.nan], + 'row_latent_vector_4':[0.816725,1.078965,1.341205,np.nan,np.nan], + 'column_latent_vector_0':[ 0.642626,0.542312,0.642626,0.542312,0.642626], + 'column_latent_vector_1':[ 1.534324,1.848782,1.534324,1.848782,1.534324], + }) + pd.testing.assert_frame_equal(new_main, c) + + params = primitive.get_params() + primitive.set_params(params=params) + + + # print(utils.to_json_structure(new_main.metadata.to_internal_simple_structure())) + self.assertEqual(utils.to_json_structure(new_main.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + # 'top_level': 'main', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 9, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': { + 'name': 'A', + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': { + 'name': 'B', + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': { + 'name': 'row_latent_vector_0', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.float64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 3], + 'metadata': { + 'name': 'row_latent_vector_1', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.float64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 4], + 'metadata': { + 'name': 'row_latent_vector_2', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.float64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 5], + 'metadata': { + 'name': 'row_latent_vector_3', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.float64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 6], + 'metadata': { + 'name': 'row_latent_vector_4', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 
'structural_type': 'numpy.float64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 7], + 'metadata': { + 'name': 'column_latent_vector_0', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.float64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 8], + 'metadata': { + 'name': 'column_latent_vector_1', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.float64', + }, + }]) + + params = primitive.get_params() + primitive.set_params(params=params) + + + +if __name__ == '__main__': + unittest.main() diff --git a/tods/tests/test_PCAODetect.py b/tods/tests/test_PCAODetect.py new file mode 100644 index 0000000..4ab0c1c --- /dev/null +++ b/tods/tests/test_PCAODetect.py @@ -0,0 +1,114 @@ +import unittest + +from d3m import container, utils +from d3m.metadata import base as metadata_base +from d3m.container import DataFrame as d3m_dataframe + +from detection_algorithm.PCAODetect import PCAODetector +from pyod.utils.data import generate_data + +from detection_algorithm.core.CollectiveCommonTest import CollectiveCommonTest + +import numpy as np + +class KDiscordODetectTestCase(unittest.TestCase): + def setUp(self): + + self.maxDiff = None + self.contamination = 0.1 + self.window_size = 2 + self.roc_floor = 0. # 0.8 + # self.n_train = 200 + # self.n_test = 100 + # self.X_train, self.y_train, self.X_test, self.y_test = generate_data( + # n_train=self.n_train, n_test=self.n_test, + # contamination=self.contamination, random_state=42) + # self.X_train = d3m_dataframe(self.X_train, generate_metadata=True) + # self.X_test = d3m_dataframe(self.X_test, generate_metadata=True) + + self.X_train = d3m_dataframe({'data': [3., 4., 8., 16, 18, 13., 22., 36., 59., 128, 62, 67, 78, 100]}, + columns=['data'], generate_metadata=True) + self.y_train = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]) + self.X_test = d3m_dataframe({'data': [3., 4., 8.6, 13.4, 22.5, 17, 19.2, 36.1, 127, -23, 59.2]}, + columns=['data'], generate_metadata=True) + self.y_test = np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0]) + + + + hyperparams_default = PCAODetector.metadata.get_hyperparams().defaults() + hyperparams = hyperparams_default.replace({'contamination': self.contamination, }) + hyperparams = hyperparams.replace({'window_size': self.window_size, }) + hyperparams = hyperparams.replace({'return_subseq_inds': True, }) + self.primitive = PCAODetector(hyperparams=hyperparams) + + self.primitive.set_training_data(inputs=self.X_train) + self.primitive.fit() + self.prediction_labels = self.primitive.produce(inputs=self.X_test).value + self.prediction_score = self.primitive.produce_score(inputs=self.X_test).value + + self.collective_common_test = CollectiveCommonTest(model=self.primitive._clf, + X_train=self.X_train, + y_train=self.y_train, + X_test=self.X_test, + y_test=self.y_test, + roc_floor=self.roc_floor, + ) + + def test_detector(self): + self.collective_common_test.test_detector() + pass + + def test_metadata(self): + # print(self.prediction_labels.metadata.to_internal_simple_structure()) + self.assertEqual(utils.to_json_structure(self.prediction_labels.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + # 'top_level': 'main', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': 
['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 10, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': { + 'name': 'PCAODetector0_0', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': { + 'name': 'PCAODetector0_1', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': { + 'name': 'PCAODetector0_2', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.int64', + }, + }]) + + def test_params(self): + params = self.primitive.get_params() + self.primitive.set_params(params=params) + + +if __name__ == '__main__': + unittest.main() diff --git a/tods/tests/test_PyodABOD.py b/tods/tests/test_PyodABOD.py new file mode 100644 index 0000000..2d35aff --- /dev/null +++ b/tods/tests/test_PyodABOD.py @@ -0,0 +1,136 @@ +import unittest + +from d3m import container, utils +from d3m.metadata import base as metadata_base +from detection_algorithm.PyodABOD import ABODPrimitive + + +class ABODTest(unittest.TestCase): + def test_basic(self): + self.maxDiff = None + main = container.DataFrame({'a': [1., 2., 3.], 'b': [2., 3., 4.], 'c': [3., 4., 5.],}, + columns=['a', 'b', 'c'], + generate_metadata=True) + + print(main) + + + self.assertEqual(utils.to_json_structure(main.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + # 'top_level': 'main', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'a'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'b'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'c'} + }]) + + + self.assertIsInstance(main, container.DataFrame) + + + hyperparams_class = ABODPrimitive.metadata.get_hyperparams() + hyperparams = hyperparams_class.defaults() + hyperparams = hyperparams.replace({'return_result': 'new', + 'method': 'default', + + }) + + primitive = ABODPrimitive(hyperparams=hyperparams) + primitive.set_training_data(inputs=main) + primitive.fit() + new_main = primitive.produce(inputs=main).value + new_main_score = primitive.produce_score(inputs=main).value + print(new_main) + print(new_main_score) + + + self.assertEqual(utils.to_json_structure(new_main.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + # 'top_level': 'main', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': 
['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 1, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': { + 'name': 'Angle-base Outlier Detection Primitive0_0', + 'structural_type': 'numpy.int64', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'] + }, + }]) + + + self.assertEqual(utils.to_json_structure(new_main_score.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + # 'top_level': 'main', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 1, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': { + 'name': 'Angle-base Outlier Detection Primitive0_0', + 'structural_type': 'numpy.float64', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'] + }, + }]) + + +if __name__ == '__main__': + unittest.main() diff --git a/tods/tests/test_PyodAE.py b/tods/tests/test_PyodAE.py new file mode 100644 index 0000000..ae57cc4 --- /dev/null +++ b/tods/tests/test_PyodAE.py @@ -0,0 +1,104 @@ +import unittest + +from d3m import container, utils +from d3m.metadata import base as metadata_base +from d3m.container import DataFrame as d3m_dataframe + +from detection_algorithm.PyodAE import AutoEncoder +from pyod.utils.data import generate_data + +from detection_algorithm.core.UODCommonTest import UODCommonTest + +import numpy as np + +class PyodAECase(unittest.TestCase): + def setUp(self): + + self.maxDiff = None + self.n_train = 200 + self.n_test = 100 + self.contamination = 0.1 + self.roc_floor = 0.8 + self.X_train, self.y_train, self.X_test, self.y_test = generate_data( + n_train=self.n_train, n_test=self.n_test, + contamination=self.contamination, random_state=42) + + self.X_train = d3m_dataframe(self.X_train, generate_metadata=True) + self.X_test = d3m_dataframe(self.X_test, generate_metadata=True) + + hyperparams_default = AutoEncoder.metadata.get_hyperparams().defaults() + hyperparams = hyperparams_default.replace({'contamination': self.contamination, }) + hyperparams = hyperparams.replace({'return_subseq_inds': True, }) + + self.primitive = AutoEncoder(hyperparams=hyperparams) + + self.primitive.set_training_data(inputs=self.X_train) + self.primitive.fit() + self.prediction_labels = self.primitive.produce(inputs=self.X_test).value + self.prediction_score = self.primitive.produce_score(inputs=self.X_test).value + + self.uodbase_test = UODCommonTest(model=self.primitive._clf, + X_train=self.X_train, + y_train=self.y_train, + X_test=self.X_test, + y_test=self.y_test, + roc_floor=self.roc_floor, + ) + + def test_detector(self): + self.uodbase_test.test_detector() + + def test_metadata(self): + # print(self.prediction_labels.metadata.to_internal_simple_structure()) + 
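+ # Descriptive note (not in the original test): the expected structure below is one container-level entry, one column-dimension entry of length 3, and three int64 attribute columns named AutoEncoder0_0..0_2; with 'return_subseq_inds' enabled these presumably carry the anomaly labels plus subsequence index columns, though the exact semantics come from the primitive itself.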
self.assertEqual(utils.to_json_structure(self.prediction_labels.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + # 'top_level': 'main', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 100, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': { + 'name': 'TODS.anomaly_detection_primitives.AutoEncoder0_0', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': { + 'name': 'TODS.anomaly_detection_primitives.AutoEncoder0_1', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': { + 'name': 'TODS.anomaly_detection_primitives.AutoEncoder0_2', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.int64', + }, + }]) + + def test_params(self): + params = self.primitive.get_params() + self.primitive.set_params(params=params) + + +if __name__ == '__main__': + unittest.main() diff --git a/tods/tests/test_PyodCBLOF.py b/tods/tests/test_PyodCBLOF.py new file mode 100644 index 0000000..4bdc1f2 --- /dev/null +++ b/tods/tests/test_PyodCBLOF.py @@ -0,0 +1,103 @@ +import unittest + +from d3m import container, utils +from d3m.metadata import base as metadata_base +from d3m.container import DataFrame as d3m_dataframe + +from detection_algorithm.PyodCBLOF import CBLOFPrimitive +from pyod.utils.data import generate_data + +from detection_algorithm.core.UODCommonTest import UODCommonTest + +import numpy as np + +class PyodLOFTestCase(unittest.TestCase): + def setUp(self): + + self.maxDiff = None + self.n_train = 200 + self.n_test = 100 + self.contamination = 0.1 + self.roc_floor = 0.8 + self.X_train, self.y_train, self.X_test, self.y_test = generate_data( + n_train=self.n_train, n_test=self.n_test, + contamination=self.contamination, random_state=42) + + self.X_train = d3m_dataframe(self.X_train, generate_metadata=True) + self.X_test = d3m_dataframe(self.X_test, generate_metadata=True) + + hyperparams_default = CBLOFPrimitive.metadata.get_hyperparams().defaults() + hyperparams = hyperparams_default.replace({'contamination': self.contamination, }) + hyperparams = hyperparams.replace({'return_subseq_inds': True, }) + + self.primitive = CBLOFPrimitive(hyperparams=hyperparams) + + self.primitive.set_training_data(inputs=self.X_train) + self.primitive.fit() + self.prediction_labels = self.primitive.produce(inputs=self.X_test).value + self.prediction_score = self.primitive.produce_score(inputs=self.X_test).value + + self.uodbase_test = UODCommonTest(model=self.primitive._clf, + X_train=self.X_train, + y_train=self.y_train, + X_test=self.X_test, + y_test=self.y_test, + roc_floor=self.roc_floor, + ) + + def test_detector(self): + self.uodbase_test.test_detector() + + def test_metadata(self): + # print(self.prediction_labels.metadata.to_internal_simple_structure()) + 
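+ # Same pattern as the AutoEncoder test: utils.to_json_structure() turns the produced DataFrame's metadata into a plain JSON-like structure so the expected column layout can be compared with a single assertEqual.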
self.assertEqual(utils.to_json_structure(self.prediction_labels.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + # 'top_level': 'main', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 100, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': { + 'name': 'TODS.anomaly_detection_primitives.CBLOFPrimitive0_0', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': { + 'name': 'TODS.anomaly_detection_primitives.CBLOFPrimitive0_1', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': { + 'name': 'TODS.anomaly_detection_primitives.CBLOFPrimitive0_2', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.int64', + }, + }]) + + def test_params(self): + params = self.primitive.get_params() + self.primitive.set_params(params=params) + +if __name__ == '__main__': + unittest.main() diff --git a/tods/tests/test_PyodCOF.py b/tods/tests/test_PyodCOF.py new file mode 100644 index 0000000..3bcf41b --- /dev/null +++ b/tods/tests/test_PyodCOF.py @@ -0,0 +1,105 @@ +import unittest + +from d3m import container, utils +from d3m.metadata import base as metadata_base +from detection_algorithm.PyodCOF import PyodCOF +import utils as test_utils +import pandas as pd + +class ABODTest(unittest.TestCase): + def test_basic(self): + self.maxDiff = None + main = container.DataFrame({'a': [1., 2., 3.], 'b': [2., 3., 4.], 'c': [3., 4., 11.],}, + columns=['a', 'b', 'c'], + generate_metadata=True) + + print(main) + + + self.assertEqual(utils.to_json_structure(main.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + # 'top_level': 'main', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'a'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'b'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'c'} + }]) + + + self.assertIsInstance(main, container.DataFrame) + + + hyperparams_class = PyodCOF.metadata.get_hyperparams() + hyperparams = hyperparams_class.defaults() + hyperparams = hyperparams.replace({'return_result': 'new', + + }) + + primitive = PyodCOF(hyperparams=hyperparams) + 
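+ # Standard d3m primitive lifecycle exercised throughout these tests: set_training_data(), fit(), then produce(); produce() returns a CallResult whose .value is a d3m container DataFrame carrying updated metadata.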
primitive.set_training_data(inputs=main) + primitive.fit() + new_main = primitive.produce(inputs=main).value + # print(type(new_main)) + + c = pd.DataFrame({0:[0,0,1]}) + + pd.testing.assert_frame_equal(new_main, c) + + self.assertEqual(utils.to_json_structure(new_main.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + # 'top_level': 'main', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 1, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': { + 'name': 'Connectivity-Based Outlier Factor (COF)0_0', + 'structural_type': 'numpy.int64', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'] + }, + }]) + + +if __name__ == '__main__': + unittest.main() diff --git a/tods/tests/test_PyodHBOS.py b/tods/tests/test_PyodHBOS.py new file mode 100644 index 0000000..6d67356 --- /dev/null +++ b/tods/tests/test_PyodHBOS.py @@ -0,0 +1,135 @@ +import unittest + +from d3m import container, utils +from d3m.metadata import base as metadata_base +from detection_algorithm.PyodHBOS import HBOSPrimitive + + +class HBOSTest(unittest.TestCase): + def test_basic(self): + self.maxDiff = None + main = container.DataFrame({'a': [1., 2., 3.], 'b': [2., 3., 4.], 'c': [3., 4., 5.],}, + columns=['a', 'b', 'c'], + generate_metadata=True) + + print(main) + + + self.assertEqual(utils.to_json_structure(main.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + # 'top_level': 'main', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'a'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'b'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'c'} + }]) + + + self.assertIsInstance(main, container.DataFrame) + + + hyperparams_class = HBOSPrimitive.metadata.get_hyperparams() + hyperparams = hyperparams_class.defaults() + hyperparams = hyperparams.replace({'return_result': 'new', + + }) + + primitive = HBOSPrimitive(hyperparams=hyperparams) + primitive.set_training_data(inputs=main) + primitive.fit() + new_main = primitive.produce(inputs=main).value + new_main_score = primitive.produce_score(inputs=main).value + print(new_main) + print(new_main_score) + + + self.assertEqual(utils.to_json_structure(new_main.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + # 'top_level': 'main', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 
'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 1, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': { + 'name': 'HBOS0_0', + 'structural_type': 'numpy.int64', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'] + }, + }]) + + + self.assertEqual(utils.to_json_structure(new_main_score.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + # 'top_level': 'main', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 1, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': { + 'name': 'HBOS0_0', + 'structural_type': 'numpy.float64', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'] + }, + }]) + + +if __name__ == '__main__': + unittest.main() diff --git a/tods/tests/test_PyodIsolationForest.py b/tods/tests/test_PyodIsolationForest.py new file mode 100644 index 0000000..1ca7af2 --- /dev/null +++ b/tods/tests/test_PyodIsolationForest.py @@ -0,0 +1,104 @@ +import unittest + +from d3m import container, utils +from d3m.metadata import base as metadata_base +from d3m.container import DataFrame as d3m_dataframe + +from detection_algorithm.PyodIsolationForest import IsolationForest +from pyod.utils.data import generate_data + +from detection_algorithm.core.UODCommonTest import UODCommonTest + + +import numpy as np + +class PyodIsolationForestTestCase(unittest.TestCase): + def setUp(self): + + self.maxDiff = None + self.n_train = 200 + self.n_test = 100 + self.contamination = 0.1 + self.roc_floor = 0.8 + self.X_train, self.y_train, self.X_test, self.y_test = generate_data( + n_train=self.n_train, n_test=self.n_test, + contamination=self.contamination, random_state=42) + + self.X_train = d3m_dataframe(self.X_train, generate_metadata=True) + self.X_test = d3m_dataframe(self.X_test, generate_metadata=True) + + hyperparams_default = IsolationForest.metadata.get_hyperparams().defaults() + hyperparams = hyperparams_default.replace({'contamination': self.contamination, }) + hyperparams = hyperparams.replace({'return_subseq_inds': True, }) + + self.primitive = IsolationForest(hyperparams=hyperparams) + + self.primitive.set_training_data(inputs=self.X_train) + self.primitive.fit() + self.prediction_labels = self.primitive.produce(inputs=self.X_test).value + self.prediction_score = self.primitive.produce_score(inputs=self.X_test).value + + self.uodbase_test = UODCommonTest(model=self.primitive._clf, + X_train=self.X_train, + y_train=self.y_train, + X_test=self.X_test, + y_test=self.y_test, + roc_floor=self.roc_floor, + ) + + def test_detector(self): + self.uodbase_test.test_detector() + + def test_metadata(self): + # 
print(self.prediction_labels.metadata.to_internal_simple_structure()) + self.assertEqual(utils.to_json_structure(self.prediction_labels.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + # 'top_level': 'main', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 100, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': { + 'name': 'TODS.anomaly_detection_primitives.IsolationForest0_0', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': { + 'name': 'TODS.anomaly_detection_primitives.IsolationForest0_1', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': { + 'name': 'TODS.anomaly_detection_primitives.IsolationForest0_2', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.int64', + }, + }]) + + def test_params(self): + params = self.primitive.get_params() + self.primitive.set_params(params=params) + +if __name__ == '__main__': + unittest.main() diff --git a/tods/tests/test_PyodKNN.py b/tods/tests/test_PyodKNN.py new file mode 100644 index 0000000..95854e3 --- /dev/null +++ b/tods/tests/test_PyodKNN.py @@ -0,0 +1,102 @@ +import unittest + +from d3m import container, utils +from d3m.metadata import base as metadata_base +from d3m.container import DataFrame as d3m_dataframe + +from detection_algorithm.PyodKNN import KNNPrimitive +from pyod.utils.data import generate_data + +from detection_algorithm.core.UODCommonTest import UODCommonTest + +import numpy as np + +class PyodKNNTestCase(unittest.TestCase): + def setUp(self): + + self.maxDiff = None + self.n_train = 200 + self.n_test = 100 + self.contamination = 0.1 + self.roc_floor = 0.8 + self.X_train, self.y_train, self.X_test, self.y_test = generate_data( + n_train=self.n_train, n_test=self.n_test, + contamination=self.contamination, random_state=42) + + self.X_train = d3m_dataframe(self.X_train, generate_metadata=True) + self.X_test = d3m_dataframe(self.X_test, generate_metadata=True) + + hyperparams_default = KNNPrimitive.metadata.get_hyperparams().defaults() + hyperparams = hyperparams_default.replace({'contamination': self.contamination, }) + hyperparams = hyperparams.replace({'return_subseq_inds': True, }) + self.primitive = KNNPrimitive(hyperparams=hyperparams) + + self.primitive.set_training_data(inputs=self.X_train) + self.primitive.fit() + self.prediction_labels = self.primitive.produce(inputs=self.X_test).value + self.prediction_score = self.primitive.produce_score(inputs=self.X_test).value + + self.uodbase_test = UODCommonTest(model=self.primitive._clf, + X_train=self.X_train, + y_train=self.y_train, + X_test=self.X_test, + y_test=self.y_test, + roc_floor=self.roc_floor, + ) + + def test_detector(self): + self.uodbase_test.test_detector() + + def test_metadata(self): + # 
print(self.prediction_labels.metadata.to_internal_simple_structure()) + self.assertEqual(utils.to_json_structure(self.prediction_labels.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + # 'top_level': 'main', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 100, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': { + 'name': 'TODS.anomaly_detection_primitives.KNNPrimitive0_0', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': { + 'name': 'TODS.anomaly_detection_primitives.KNNPrimitive0_1', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': { + 'name': 'TODS.anomaly_detection_primitives.KNNPrimitive0_2', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.int64', + }, + }]) + + def test_params(self): + params = self.primitive.get_params() + self.primitive.set_params(params=params) + +if __name__ == '__main__': + unittest.main() diff --git a/tods/tests/test_PyodLODA.py b/tods/tests/test_PyodLODA.py new file mode 100644 index 0000000..094ae1f --- /dev/null +++ b/tods/tests/test_PyodLODA.py @@ -0,0 +1,103 @@ +import unittest + +from d3m import container, utils +from d3m.metadata import base as metadata_base +from d3m.container import DataFrame as d3m_dataframe + +from detection_algorithm.PyodLODA import LODAPrimitive +from pyod.utils.data import generate_data + +from detection_algorithm.core.UODCommonTest import UODCommonTest + +import numpy as np + +class PyodLODATestCase(unittest.TestCase): + def setUp(self): + + self.maxDiff = None + self.n_train = 200 + self.n_test = 100 + self.contamination = 0.1 + self.roc_floor = 0.8 + self.X_train, self.y_train, self.X_test, self.y_test = generate_data( + n_train=self.n_train, n_test=self.n_test, + contamination=self.contamination, random_state=42) + + self.X_train = d3m_dataframe(self.X_train, generate_metadata=True) + self.X_test = d3m_dataframe(self.X_test, generate_metadata=True) + + hyperparams_default = LODAPrimitive.metadata.get_hyperparams().defaults() + hyperparams = hyperparams_default.replace({'contamination': self.contamination, }) + hyperparams = hyperparams.replace({'return_subseq_inds': True, }) + + self.primitive = LODAPrimitive(hyperparams=hyperparams) + + self.primitive.set_training_data(inputs=self.X_train) + self.primitive.fit() + self.prediction_labels = self.primitive.produce(inputs=self.X_test).value + self.prediction_score = self.primitive.produce_score(inputs=self.X_test).value + + self.uodbase_test = UODCommonTest(model=self.primitive._clf, + X_train=self.X_train, + y_train=self.y_train, + X_test=self.X_test, + y_test=self.y_test, + roc_floor=self.roc_floor, + ) + + def test_detector(self): + self.uodbase_test.test_detector() + + def test_metadata(self): + # 
print(self.prediction_labels.metadata.to_internal_simple_structure()) + self.assertEqual(utils.to_json_structure(self.prediction_labels.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + # 'top_level': 'main', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 100, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': { + 'name': 'TODS.anomaly_detection_primitives.LODAPrimitive0_0', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': { + 'name': 'TODS.anomaly_detection_primitives.LODAPrimitive0_1', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': { + 'name': 'TODS.anomaly_detection_primitives.LODAPrimitive0_2', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.int64', + }, + }]) + + def test_params(self): + params = self.primitive.get_params() + self.primitive.set_params(params=params) + +if __name__ == '__main__': + unittest.main() diff --git a/tods/tests/test_PyodLOF.py b/tods/tests/test_PyodLOF.py new file mode 100644 index 0000000..f1e39d7 --- /dev/null +++ b/tods/tests/test_PyodLOF.py @@ -0,0 +1,104 @@ +import unittest + +from d3m import container, utils +from d3m.metadata import base as metadata_base +from d3m.container import DataFrame as d3m_dataframe + +from detection_algorithm.PyodLOF import LOFPrimitive +from pyod.utils.data import generate_data + +from detection_algorithm.core.UODCommonTest import UODCommonTest + +import numpy as np + +class PyodLOFTestCase(unittest.TestCase): + def setUp(self): + + self.maxDiff = None + self.n_train = 200 + self.n_test = 100 + self.contamination = 0.1 + self.roc_floor = 0.8 + self.X_train, self.y_train, self.X_test, self.y_test = generate_data( + n_train=self.n_train, n_test=self.n_test, + contamination=self.contamination, random_state=42) + + self.X_train = d3m_dataframe(self.X_train, generate_metadata=True) + self.X_test = d3m_dataframe(self.X_test, generate_metadata=True) + + hyperparams_default = LOFPrimitive.metadata.get_hyperparams().defaults() + hyperparams = hyperparams_default.replace({'contamination': self.contamination, }) + hyperparams = hyperparams.replace({'return_subseq_inds': True, }) + + self.primitive = LOFPrimitive(hyperparams=hyperparams) + + self.primitive.set_training_data(inputs=self.X_train) + self.primitive.fit() + self.prediction_labels = self.primitive.produce(inputs=self.X_test).value + self.prediction_score = self.primitive.produce_score(inputs=self.X_test).value + + self.uodbase_test = UODCommonTest(model=self.primitive._clf, + X_train=self.X_train, + y_train=self.y_train, + X_test=self.X_test, + y_test=self.y_test, + roc_floor=self.roc_floor, + ) + + def test_detector(self): + self.uodbase_test.test_detector() + + def test_metadata(self): + # 
print(self.prediction_labels.metadata.to_internal_simple_structure()) + self.assertEqual(utils.to_json_structure(self.prediction_labels.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + # 'top_level': 'main', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 100, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': { + 'name': 'TODS.anomaly_detection_primitives.LOFPrimitive0_0', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': { + 'name': 'TODS.anomaly_detection_primitives.LOFPrimitive0_1', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': { + 'name': 'TODS.anomaly_detection_primitives.LOFPrimitive0_2', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.int64', + }, + }]) + + def test_params(self): + params = self.primitive.get_params() + self.primitive.set_params(params=params) + + +if __name__ == '__main__': + unittest.main() diff --git a/tods/tests/test_PyodMoGaal.py b/tods/tests/test_PyodMoGaal.py new file mode 100644 index 0000000..ca5dfa8 --- /dev/null +++ b/tods/tests/test_PyodMoGaal.py @@ -0,0 +1,103 @@ +import unittest + +from d3m import container, utils +from d3m.metadata import base as metadata_base +from d3m.container import DataFrame as d3m_dataframe + +from detection_algorithm.PyodMoGaal import Mo_GaalPrimitive +from pyod.utils.data import generate_data + +from detection_algorithm.core.UODCommonTest import UODCommonTest + +import numpy as np + +class PyodSoGaalTestCase(unittest.TestCase): + def setUp(self): + + self.maxDiff = None + self.n_train = 200 + self.n_test = 100 + self.contamination = 0.1 + self.roc_floor = 0.0 + self.X_train, self.y_train, self.X_test, self.y_test = generate_data( + n_train=self.n_train, n_test=self.n_test, + contamination=self.contamination, random_state=42) + + self.X_train = d3m_dataframe(self.X_train, generate_metadata=True) + self.X_test = d3m_dataframe(self.X_test, generate_metadata=True) + + hyperparams_default = Mo_GaalPrimitive.metadata.get_hyperparams().defaults() + hyperparams = hyperparams_default.replace({'contamination': self.contamination, }) + hyperparams = hyperparams.replace({'return_subseq_inds': True, }) + + self.primitive = Mo_GaalPrimitive(hyperparams=hyperparams) + + self.primitive.set_training_data(inputs=self.X_train) + self.primitive.fit() + self.prediction_labels = self.primitive.produce(inputs=self.X_test).value + self.prediction_score = self.primitive.produce_score(inputs=self.X_test).value + + self.uodbase_test = UODCommonTest(model=self.primitive._clf, + X_train=self.X_train, + y_train=self.y_train, + X_test=self.X_test, + y_test=self.y_test, + roc_floor=self.roc_floor, + ) + + def test_detector(self): + self.uodbase_test.test_detector() + + def test_metadata(self): + # 
print(self.prediction_labels.metadata.to_internal_simple_structure()) + self.assertEqual(utils.to_json_structure(self.prediction_labels.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + # 'top_level': 'main', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 100, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': { + 'name': 'Mo_Gaal Anomaly Detection0_0', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': { + 'name': 'Mo_Gaal Anomaly Detection0_1', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': { + 'name': 'Mo_Gaal Anomaly Detection0_2', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.int64', + }, + }]) + + def test_params(self): + params = self.primitive.get_params() + self.primitive.set_params(params=params) + +if __name__ == '__main__': + unittest.main() diff --git a/tods/tests/test_PyodOCSVM.py b/tods/tests/test_PyodOCSVM.py new file mode 100644 index 0000000..dae500e --- /dev/null +++ b/tods/tests/test_PyodOCSVM.py @@ -0,0 +1,103 @@ +import unittest + +from d3m import container, utils +from d3m.metadata import base as metadata_base +from d3m.container import DataFrame as d3m_dataframe + +from detection_algorithm.PyodOCSVM import OCSVMPrimitive +from pyod.utils.data import generate_data + +from detection_algorithm.core.UODCommonTest import UODCommonTest + +import numpy as np + +class PyodOCSVMTestCase(unittest.TestCase): + def setUp(self): + + self.maxDiff = None + self.n_train = 200 + self.n_test = 100 + self.contamination = 0.1 + self.roc_floor = 0.8 + self.X_train, self.y_train, self.X_test, self.y_test = generate_data( + n_train=self.n_train, n_test=self.n_test, + contamination=self.contamination, random_state=42) + + self.X_train = d3m_dataframe(self.X_train, generate_metadata=True) + self.X_test = d3m_dataframe(self.X_test, generate_metadata=True) + + hyperparams_default = OCSVMPrimitive.metadata.get_hyperparams().defaults() + hyperparams = hyperparams_default.replace({'contamination': self.contamination, }) + hyperparams = hyperparams.replace({'return_subseq_inds': True, }) + + self.primitive = OCSVMPrimitive(hyperparams=hyperparams) + + self.primitive.set_training_data(inputs=self.X_train) + self.primitive.fit() + self.prediction_labels = self.primitive.produce(inputs=self.X_test).value + self.prediction_score = self.primitive.produce_score(inputs=self.X_test).value + + self.uodbase_test = UODCommonTest(model=self.primitive._clf, + X_train=self.X_train, + y_train=self.y_train, + X_test=self.X_test, + y_test=self.y_test, + roc_floor=self.roc_floor, + ) + + def test_detector(self): + self.uodbase_test.test_detector() + + def test_metadata(self): + # print(self.prediction_labels.metadata.to_internal_simple_structure()) + 
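+ # Output column names follow the pattern 'TODS.anomaly_detection_primitives.<Primitive>0_<index>', which appears to combine the primitive's registered name with the index of each generated output column.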
self.assertEqual(utils.to_json_structure(self.prediction_labels.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + # 'top_level': 'main', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 100, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': { + 'name': 'TODS.anomaly_detection_primitives.OCSVMPrimitive0_0', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': { + 'name': 'TODS.anomaly_detection_primitives.OCSVMPrimitive0_1', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': { + 'name': 'TODS.anomaly_detection_primitives.OCSVMPrimitive0_2', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.int64', + }, + }]) + + def test_params(self): + params = self.primitive.get_params() + self.primitive.set_params(params=params) + +if __name__ == '__main__': + unittest.main() diff --git a/tods/tests/test_PyodSOD.py b/tods/tests/test_PyodSOD.py new file mode 100644 index 0000000..684144a --- /dev/null +++ b/tods/tests/test_PyodSOD.py @@ -0,0 +1,102 @@ +import unittest + +from d3m import container, utils +from d3m.metadata import base as metadata_base +from detection_algorithm.PyodSOD import SODPrimitive + + + +class SODTest(unittest.TestCase): + def test_basic(self): + self.maxDiff = None + main = container.DataFrame({'a': [1., 2., 3., 4.], 'b': [2., 3., 4., 5.], 'c': [3., 4., 5., 6.]}, + columns=['a', 'b', 'c'], + generate_metadata=True) + + print(main) + + + self.assertEqual(utils.to_json_structure(main.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + # 'top_level': 'main', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 4, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'a'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'b'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'c'} + }]) + + + self.assertIsInstance(main, container.DataFrame) + + + hyperparams_class = SODPrimitive.metadata.get_hyperparams() + hyperparams = hyperparams_class.defaults() + hyperparams = hyperparams.replace({'return_result': 'new', 'n_neighbors': 3, 'ref_set': 2}) + + primitive = SODPrimitive(hyperparams=hyperparams) + 
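+ # n_neighbors=3 and ref_set=2 keep the reference set smaller than the neighborhood, presumably so SOD can run on this tiny 4-row toy frame.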
primitive.set_training_data(inputs=main) + primitive.fit() + new_main = primitive.produce(inputs=main).value + print(new_main) + + + self.assertEqual(utils.to_json_structure(main.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + # 'top_level': 'main', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 4, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'a'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'b'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'c'} + }]) + + +if __name__ == '__main__': + unittest.main() diff --git a/tods/tests/test_PyodSoGaal.py b/tods/tests/test_PyodSoGaal.py new file mode 100644 index 0000000..60b6e2c --- /dev/null +++ b/tods/tests/test_PyodSoGaal.py @@ -0,0 +1,102 @@ +import unittest + +from d3m import container, utils +from d3m.metadata import base as metadata_base +from d3m.container import DataFrame as d3m_dataframe + +from detection_algorithm.PyodSoGaal import So_GaalPrimitive +from pyod.utils.data import generate_data + +from detection_algorithm.core.UODCommonTest import UODCommonTest + +import numpy as np + +class PyodSoGaalTestCase(unittest.TestCase): + def setUp(self): + + self.maxDiff = None + self.n_train = 200 + self.n_test = 100 + self.contamination = 0.1 + self.roc_floor = -1e-5 + self.X_train, self.y_train, self.X_test, self.y_test = generate_data( + n_train=self.n_train, n_test=self.n_test, + contamination=self.contamination, random_state=42) + + self.X_train = d3m_dataframe(self.X_train, generate_metadata=True) + self.X_test = d3m_dataframe(self.X_test, generate_metadata=True) + + hyperparams_default = So_GaalPrimitive.metadata.get_hyperparams().defaults() + hyperparams = hyperparams_default.replace({'contamination': self.contamination, }) + hyperparams = hyperparams.replace({'return_subseq_inds': True, }) + self.primitive = So_GaalPrimitive(hyperparams=hyperparams) + + self.primitive.set_training_data(inputs=self.X_train) + self.primitive.fit() + self.prediction_labels = self.primitive.produce(inputs=self.X_test).value + self.prediction_score = self.primitive.produce_score(inputs=self.X_test).value + + self.uodbase_test = UODCommonTest(model=self.primitive._clf, + X_train=self.X_train, + y_train=self.y_train, + X_test=self.X_test, + y_test=self.y_test, + roc_floor=self.roc_floor, + ) + + def test_detector(self): + self.uodbase_test.test_detector() + + def test_metadata(self): + # print(self.prediction_labels.metadata.to_internal_simple_structure()) + self.assertEqual(utils.to_json_structure(self.prediction_labels.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + # 'top_level': 'main', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': 
['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 100, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': { + 'name': 'So_Gaal Anomaly Detection0_0', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': { + 'name': 'So_Gaal Anomaly Detection0_1', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': { + 'name': 'So_Gaal Anomaly Detection0_2', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.int64', + }, + }]) + + def test_params(self): + params = self.primitive.get_params() + self.primitive.set_params(params=params) + +if __name__ == '__main__': + unittest.main() diff --git a/tods/tests/test_PyodVAE.py b/tods/tests/test_PyodVAE.py new file mode 100644 index 0000000..b0ff5fd --- /dev/null +++ b/tods/tests/test_PyodVAE.py @@ -0,0 +1,104 @@ +import unittest + +from d3m import container, utils +from d3m.metadata import base as metadata_base +from d3m.container import DataFrame as d3m_dataframe + +from detection_algorithm.PyodVAE import VariationalAutoEncoder +from pyod.utils.data import generate_data + +from detection_algorithm.core.UODCommonTest import UODCommonTest + +import numpy as np + +class PyodAVECase(unittest.TestCase): + def setUp(self): + + self.maxDiff = None + self.n_train = 200 + self.n_test = 100 + self.contamination = 0.1 + self.roc_floor = 0.8 + self.X_train, self.y_train, self.X_test, self.y_test = generate_data( + n_train=self.n_train, n_test=self.n_test, + contamination=self.contamination, random_state=42) + + self.X_train = d3m_dataframe(self.X_train, generate_metadata=True) + self.X_test = d3m_dataframe(self.X_test, generate_metadata=True) + + hyperparams_default = VariationalAutoEncoder.metadata.get_hyperparams().defaults() + hyperparams = hyperparams_default.replace({'contamination': self.contamination, }) + hyperparams = hyperparams.replace({'return_subseq_inds': True, }) + + self.primitive = VariationalAutoEncoder(hyperparams=hyperparams) + + self.primitive.set_training_data(inputs=self.X_train) + self.primitive.fit() + self.prediction_labels = self.primitive.produce(inputs=self.X_test).value + self.prediction_score = self.primitive.produce_score(inputs=self.X_test).value + + self.uodbase_test = UODCommonTest(model=self.primitive._clf, + X_train=self.X_train, + y_train=self.y_train, + X_test=self.X_test, + y_test=self.y_test, + roc_floor=self.roc_floor, + ) + + def test_detector(self): + self.uodbase_test.test_detector() + + def test_metadata(self): + # print(self.prediction_labels.metadata.to_internal_simple_structure()) + self.assertEqual(utils.to_json_structure(self.prediction_labels.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + # 'top_level': 'main', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 100, + }, 
+ }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': { + 'name': 'TODS.anomaly_detection_primitives.VariationalAutoEncoder0_0', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': { + 'name': 'TODS.anomaly_detection_primitives.VariationalAutoEncoder0_1', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.int64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': { + 'name': 'TODS.anomaly_detection_primitives.VariationalAutoEncoder0_2', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.int64', + }, + }]) + + def test_params(self): + params = self.primitive.get_params() + self.primitive.set_params(params=params) + + +if __name__ == '__main__': + unittest.main() diff --git a/tods/tests/test_SKAxiswiseScaler.py b/tods/tests/test_SKAxiswiseScaler.py new file mode 100644 index 0000000..1e1e4e6 --- /dev/null +++ b/tods/tests/test_SKAxiswiseScaler.py @@ -0,0 +1,157 @@ +import unittest + +from d3m import container, utils +from d3m.metadata import base as metadata_base + +from timeseries_processing import SKAxiswiseScaler +import numpy as np + +class SKStandardizationTestCase(unittest.TestCase): + def test_basic(self): + self.maxDiff=None + main = container.DataFrame({'a1': [1., 2., 3.], 'b1': [2., 3., 4.], + 'a2': [3., 4., 5.], 'c1': [4., 5., 6.], + 'a3': [5., 6., 7.], 'a1a': [6., 7., 8.]}, + # {'top_level': 'main', }, + columns=['a1', 'b1', 'a2', 'c1', 'a3', 'a1a'], + generate_metadata=True) + main.metadata = main.metadata.update_column(0, {'name': 'aaa111'}) + main.metadata = main.metadata.update_column(1, {'name': 'bbb111'}) + main.metadata = main.metadata.update_column(2, {'name': 'aaa222'}) + main.metadata = main.metadata.update_column(3, {'name': 'ccc111'}) + main.metadata = main.metadata.update_column(4, {'name': 'aaa333'}) + main.metadata = main.metadata.update_column(5, {'name': 'aaa111'}) + + # print(main) + + self.assertEqual(utils.to_json_structure(main.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + # 'top_level': 'main', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 6, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'aaa111'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'bbb111'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'aaa222'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 3], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'ccc111'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 4], + 'metadata': 
{'structural_type': 'numpy.float64', 'name': 'aaa333'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 5], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'aaa111'}, + }]) + + hyperparams_class = SKAxiswiseScaler.SKAxiswiseScaler.metadata.get_hyperparams() + primitive = SKAxiswiseScaler.SKAxiswiseScaler(hyperparams=hyperparams_class.defaults()) + new_main = primitive.produce(inputs=main).value + new_mean, new_std = new_main.values.mean(0), new_main.values.std(0) + + mean_mse = np.matmul(new_mean.T, new_mean) + std_mse = np.matmul((new_std - np.ones_like(new_std)).T, (new_std - np.ones_like(new_std))) + + # print(new_main) + # print(mean_mse, std_mse) + + self.assertAlmostEqual(mean_mse.__float__(), 0., delta=1e-8) + self.assertAlmostEqual(std_mse.__float__(), 0., delta=1e-8) + + # print(main.metadata.to_internal_simple_structure()) + # print(new_main.metadata.to_internal_simple_structure()) + + self.assertEqual(utils.to_json_structure(new_main.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + # 'top_level': 'main', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 6, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': { + 'name': 'aaa111', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.float64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': { + 'name': 'bbb111', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.float64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': { + 'name': 'aaa222', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.float64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 3], + 'metadata': { + 'name': 'ccc111', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.float64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 4], + 'metadata': { + 'name': 'aaa333', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.float64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 5], + 'metadata': { + 'name': 'aaa111', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.float64', + }, + }]) + + params = primitive.get_params() + primitive.set_params(params=params) + + +if __name__ == '__main__': + unittest.main() diff --git a/tods/tests/test_SKPowerTransformer.py b/tods/tests/test_SKPowerTransformer.py new file mode 100644 index 0000000..b703ab2 --- /dev/null +++ b/tods/tests/test_SKPowerTransformer.py @@ -0,0 +1,110 @@ +import unittest + +from d3m import container, utils +from d3m.metadata import base as metadata_base + +from timeseries_processing import SKPowerTransformer +import numpy as np +import pandas as pd +from d3m.container import DataFrame as d3m_dataframe +from scipy.stats import kstest, shapiro + +class SKPowerTransoformerTestCase(unittest.TestCase): + def 
test_basic(self): + self.maxDiff=None + + dataset_fname = '../datasets/anomaly/kpi/TRAIN/dataset_TRAIN/tables/learningData.csv' + dataset = pd.read_csv(dataset_fname) + # dataset = np.random.rand(1000) + + main = d3m_dataframe(dataset, generate_metadata=True) + # print(main) + + hyperparams_class = SKPowerTransformer.SKPowerTransformer.metadata.get_hyperparams() + primitive = SKPowerTransformer.SKPowerTransformer(hyperparams=hyperparams_class.defaults()) + primitive.set_training_data(inputs=main) + primitive.fit() + new_main = primitive.produce(inputs=main).value + + test_data = new_main.values[:, 2] + # hist_data = new_main.values + std_normal_samples = np.random.randn(test_data.__len__()) + + # Plot the distribution + # import matplotlib.pyplot as plt + # plt.hist(test_data, bins=100, alpha=0.6) + # plt.hist(std_normal_samples, bins=100, alpha=0.6) + # plt.legend(labels=['PowerTransformer', 'Standard Gaussian'], loc='best') + # plt.savefig('./fig/test_SKPowerTransformer.png') + # plt.close() + # plt.show() + + # centerization check + new_mean, new_std = test_data.mean(), test_data.std() + mean_mse = new_mean ** 2 + std_mse = (new_std - 1) ** 2 + # print(mean_mse, std_mse) + self.assertAlmostEqual(mean_mse.__float__(), 0., delta=1e-5) + self.assertAlmostEqual(std_mse.__float__(), 0., delta=1e-5) + # + # print(main.metadata.to_internal_simple_structure()) + # print(new_main.metadata.to_internal_simple_structure()) + + self.assertEqual(utils.to_json_structure(new_main.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + # 'top_level': 'main', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 7027, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 4, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': { + 'name': 'd3mIndex', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.float64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': { + 'name': 'timestamp', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.float64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': { + 'name': 'value', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.float64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 3], + 'metadata': { + 'name': 'ground_truth', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.float64', + }, + }]) + + params = primitive.get_params() + primitive.set_params(params=params) + + +if __name__ == '__main__': + unittest.main() diff --git a/tods/tests/test_SKQuantileTransformer.py b/tods/tests/test_SKQuantileTransformer.py new file mode 100644 index 0000000..4436acc --- /dev/null +++ b/tods/tests/test_SKQuantileTransformer.py @@ -0,0 +1,109 @@ +import unittest + +from d3m import container, utils +from d3m.metadata import base as metadata_base + +from timeseries_processing import SKQuantileTransformer +import numpy as np +import pandas as pd +from d3m.container import 
DataFrame as d3m_dataframe +from scipy.stats import kstest, shapiro + +class SKQuantileTransformerTestCase(unittest.TestCase): + def test_basic(self): + self.maxDiff=None + + dataset_fname = '../datasets/anomaly/kpi/TRAIN/dataset_TRAIN/tables/learningData.csv' + dataset = pd.read_csv(dataset_fname) + # dataset = np.random.rand(1000) + main = d3m_dataframe(dataset, generate_metadata=True) + # print(main) + + hyperparams_class = SKQuantileTransformer.SKQuantileTransformer.metadata.get_hyperparams() + primitive = SKQuantileTransformer.SKQuantileTransformer(hyperparams=hyperparams_class.defaults()) + primitive.set_training_data(inputs=main) + primitive.fit() + new_main = primitive.produce(inputs=main).value + + test_data = new_main.values[:, 2] + # hist_data = new_main.values + std_normal_samples = np.random.randn(test_data.__len__()) + + # Plot the distribution + # import matplotlib.pyplot as plt + # plt.hist(test_data, bins=100, alpha=0.6) + # plt.hist(std_normal_samples, bins=100, alpha=0.6) + # plt.legend(labels=['QuantileTransformer', 'Standard Gaussian'], loc='best') + # plt.savefig('./fig/test_SKQuantileTransformer.png') + # plt.close() + # plt.show() + + # centerization check + new_mean, new_std = test_data.mean(), test_data.std() + mean_mse = new_mean ** 2 + std_mse = (new_std-1) ** 2 + # print(mean_mse, std_mse) + self.assertAlmostEqual(mean_mse.__float__(), 0., delta=1e-5) + self.assertAlmostEqual(std_mse.__float__(), 0., delta=1e-5) + # + # print(main.metadata.to_internal_simple_structure()) + # print(new_main.metadata.to_internal_simple_structure()) + + self.assertEqual(utils.to_json_structure(new_main.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + # 'top_level': 'main', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 7027, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 4, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': { + 'name': 'd3mIndex', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.float64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': { + 'name': 'timestamp', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.float64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': { + 'name': 'value', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.float64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 3], + 'metadata': { + 'name': 'ground_truth', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.float64', + }, + }]) + + params = primitive.get_params() + primitive.set_params(params=params) + + +if __name__ == '__main__': + unittest.main() diff --git a/tods/tests/test_SKStandardizer.py b/tods/tests/test_SKStandardizer.py new file mode 100644 index 0000000..30cdce9 --- /dev/null +++ b/tods/tests/test_SKStandardizer.py @@ -0,0 +1,159 @@ +import unittest + +from d3m import container, utils +from d3m.metadata import base as metadata_base + 
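+# The test below fits SKStandardScaler (presumably a wrapper around scikit-learn's
+# StandardScaler, i.e. each selected column x is mapped to (x - mean(x)) / std(x)) on a
+# small 3x6 DataFrame and asserts that (1) every transformed column has roughly zero mean
+# and unit standard deviation, and (2) the column names and numpy.float64 structural types
+# are preserved in the output DataFrame's metadata.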
+from timeseries_processing import SKStandardScaler +import numpy as np + +class SKStandardizationTestCase(unittest.TestCase): + def test_basic(self): + self.maxDiff=None + main = container.DataFrame({'a1': [1., 2., 3.], 'b1': [2., 3., 4.], + 'a2': [3., 4., 5.], 'c1': [4., 5., 6.], + 'a3': [5., 6., 7.], 'a1a': [6., 7., 8.]}, + # {'top_level': 'main', }, + columns=['a1', 'b1', 'a2', 'c1', 'a3', 'a1a'], + generate_metadata=True) + main.metadata = main.metadata.update_column(0, {'name': 'aaa111'}) + main.metadata = main.metadata.update_column(1, {'name': 'bbb111'}) + main.metadata = main.metadata.update_column(2, {'name': 'aaa222'}) + main.metadata = main.metadata.update_column(3, {'name': 'ccc111'}) + main.metadata = main.metadata.update_column(4, {'name': 'aaa333'}) + main.metadata = main.metadata.update_column(5, {'name': 'aaa111'}) + + # print(main) + + self.assertEqual(utils.to_json_structure(main.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + # 'top_level': 'main', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 6, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'aaa111'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'bbb111'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'aaa222'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 3], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'ccc111'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 4], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'aaa333'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 5], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'aaa111'}, + }]) + + hyperparams_class = SKStandardScaler.SKStandardScaler.metadata.get_hyperparams() + primitive = SKStandardScaler.SKStandardScaler(hyperparams=hyperparams_class.defaults()) + primitive.set_training_data(inputs=main) + primitive.fit() + new_main = primitive.produce(inputs=main).value + new_mean, new_std = new_main.values.mean(0), new_main.values.std(0) + + mean_mse = np.matmul(new_mean.T, new_mean) + std_mse = np.matmul((new_std - np.ones_like(new_std)).T, (new_std - np.ones_like(new_std))) + + # print(new_main) + # print(mean_mse, std_mse) + + self.assertAlmostEqual(mean_mse.__float__(), 0., delta=1e-8) + self.assertAlmostEqual(std_mse.__float__(), 0., delta=1e-8) + + # print(main.metadata.to_internal_simple_structure()) + # print(new_main.metadata.to_internal_simple_structure()) + + self.assertEqual(utils.to_json_structure(new_main.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + # 'top_level': 'main', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 3, + }, + }, + }, { + 'selector': 
['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 6, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': { + 'name': 'aaa111', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.float64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': { + 'name': 'bbb111', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.float64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': { + 'name': 'aaa222', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.float64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 3], + 'metadata': { + 'name': 'ccc111', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.float64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 4], + 'metadata': { + 'name': 'aaa333', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.float64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 5], + 'metadata': { + 'name': 'aaa111', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.float64', + }, + }]) + + params = primitive.get_params() + primitive.set_params(params=params) + + +if __name__ == '__main__': + unittest.main() diff --git a/tods/tests/test_SKTruncatedSVD.py b/tods/tests/test_SKTruncatedSVD.py new file mode 100644 index 0000000..216e68e --- /dev/null +++ b/tods/tests/test_SKTruncatedSVD.py @@ -0,0 +1,126 @@ +import unittest + +from d3m import container, utils +from d3m.metadata import base as metadata_base +from feature_analysis import SKTruncatedSVD + + +class SKTruncatedSVDTest(unittest.TestCase): + def test_basic(self): + self.maxDiff = None + main = container.DataFrame({'a': [1., 2., 3.], 'b': [2., 3., 4.], 'c': [3., 4., 5.],}, + columns=['a', 'b', 'c'], + generate_metadata=True) + + print(main) + + self.assertEqual(utils.to_json_structure(main.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + # 'top_level': 'main', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'a'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'b'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'c'} + }]) + + + self.assertIsInstance(main, container.DataFrame) + + + hyperparams_class = SKTruncatedSVD.SKTruncatedSVD.metadata.get_hyperparams() + primitive = SKTruncatedSVD.SKTruncatedSVD(hyperparams=hyperparams_class.defaults()) + primitive.set_training_data(inputs=main) + primitive.fit() + new_main = primitive.produce(inputs=main).value + print(new_main) + + # 
expected_output = container.DataFrame({'timestamp': [1., 4.],'a': [1., 3.], 'b': [2., 4.],}) + # self.assertEqual(new_main.values.tolist() , expected_output.values.tolist()) + + + self.assertEqual(utils.to_json_structure(new_main.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + # 'top_level': 'main', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 5, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': { + 'name': 'a', + 'structural_type': 'numpy.float64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': { + 'name': 'b', + 'structural_type': 'numpy.float64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': { + 'name': 'c', + 'structural_type': 'numpy.float64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 3], + 'metadata': { + 'name': 'Truncated SVD0_0', + 'structural_type': 'numpy.float64', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 4], + 'metadata': { + 'name': 'Truncated SVD0_1', + 'structural_type': 'numpy.float64', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + }, + }]) + + + + + +if __name__ == '__main__': + unittest.main() diff --git a/tods/tests/test_SimpleExponentialSmoothing.py b/tods/tests/test_SimpleExponentialSmoothing.py new file mode 100644 index 0000000..51b6952 --- /dev/null +++ b/tods/tests/test_SimpleExponentialSmoothing.py @@ -0,0 +1,69 @@ +import unittest + +from d3m import container, utils +from d3m.metadata import base as metadata_base + + +from timeseries_processing import SimpleExponentialSmoothing +import pandas as pd + + +class SimpleExponentialSmoothingTestCase(unittest.TestCase): + def test_basic(self): + main = container.DataFrame({'timestamp': [20201, 20202, 20203], 'value_0': [100,200,300],}, { + 'top_level': 'main', + }, generate_metadata=True) + + + self.assertEqual(utils.to_json_structure(main.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + 'top_level': 'main', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 2, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': {'structural_type': 'numpy.int64', 'name': 'timestamp'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': {'structural_type': 'numpy.int64', 'name': 'value_0'}, + }]) + + hyperparams_class = SimpleExponentialSmoothing.SimpleExponentialSmoothing.metadata.get_hyperparams() + primitive = SimpleExponentialSmoothing.SimpleExponentialSmoothing(hyperparams=hyperparams_class.defaults()) + # 
primitive.set_training_data(inputs=main) + # primitive.fit() + output_main = primitive.produce(inputs=main).value + + # new_main_drop = new_main.iloc[2:] + # new_main_drop = new_main_drop.reset_index(drop = True) + print ( "output", output_main) + + expected_result = container.DataFrame(data = { 'timestamp' : [20201,20202,20203], 'value_0': [100,100,120]}) + print ("expected_result", expected_result) + # output_main.reset_index() + + + self.assertEqual(output_main[['timestamp','value_0_simple_exponential_smoothing']].values.tolist(), expected_result[['timestamp','value_0']].values.tolist()) + params = primitive.get_params() + primitive.set_params(params=params) + + +if __name__ == '__main__': + unittest.main() diff --git a/tods/tests/test_SpectralResidualTransform.py b/tods/tests/test_SpectralResidualTransform.py new file mode 100644 index 0000000..118d3c3 --- /dev/null +++ b/tods/tests/test_SpectralResidualTransform.py @@ -0,0 +1,107 @@ +import unittest + +from d3m import container, utils +from d3m.metadata import base as metadata_base + +from feature_analysis import SpectralResidualTransform + +class SpectralResidualTransformTestCase(unittest.TestCase): + def test_basic(self): + self.maxDiff=None + main = container.DataFrame({'timestamp': [1, 3, 2, 5], 'values': [1.0, 2.0, 3.0, 4.0], 'b': [1.0, 4.0, 5.0, 6.0]}, + columns=['timestamp', 'values', 'b'], + generate_metadata=True) + + + + self.assertEqual(utils.to_json_structure(main.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + # 'top_level': 'main', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 4, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': {'structural_type': 'numpy.int64', 'name': 'timestamp'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'values'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'b'}, + }]) + hyperparams_class = SpectralResidualTransform.SpectralResidualTransformPrimitive.metadata.get_hyperparams() + + hp = hyperparams_class.defaults().replace({ + 'use_columns': [1,2], + 'use_semantic_types' : True + }) + + primitive = SpectralResidualTransform.SpectralResidualTransformPrimitive(hyperparams=hp) + + output_main = primitive.produce(inputs=main).value + + expected_output = container.DataFrame( + {'timestamp': [1, 3, 2, 5], 'values': [1.0, 2.0, 3.0, 4.0], 'b': [1.0, 4.0, 5.0, 6.0], + 'values_spectral_residual': [0.2018, 0.1364, 0.4252, 0.6807], 'b_spectral_residual': [0.2584, 0.2536, 0.4522, 0.5880]}, + columns=['timestamp', 'values', 'b', 'values_spectral_residual', 'b_spectral_residual']) + + self.assertEqual(output_main[['timestamp', 'values', 'b', 'values_spectral_residual', + 'b_spectral_residual']].values.tolist(), expected_output[ + ['timestamp', 'values', 'b', 'values_spectral_residual', 'b_spectral_residual' + ]].values.tolist()) + + self.assertEqual(utils.to_json_structure(output_main.metadata.to_internal_simple_structure()), + [{'metadata': {'dimension': {'length': 4, + 
'name': 'rows', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/TabularRow']}, + 'schema': 'https://metadata.datadrivendiscovery.org/schemas/v0/container.json', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'structural_type': 'd3m.container.pandas.DataFrame'}, + 'selector': []}, + {'metadata': {'dimension': {'length': 5, + 'name': 'columns', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/TabularColumn']}}, + 'selector': ['__ALL_ELEMENTS__']}, + {'metadata': {'name': 'timestamp', 'structural_type': 'numpy.int64'}, + 'selector': ['__ALL_ELEMENTS__', 0]}, + {'metadata': {'name': 'values', 'structural_type': 'numpy.float64'}, + 'selector': ['__ALL_ELEMENTS__', 1]}, + {'metadata': {'name': 'b', 'structural_type': 'numpy.float64'}, + 'selector': ['__ALL_ELEMENTS__', 2]}, + {'metadata': {'name': 'values_spectral_residual', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.float64'}, + 'selector': ['__ALL_ELEMENTS__', 3]}, + {'metadata': {'name': 'b_spectral_residual', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.float64'}, + 'selector': ['__ALL_ELEMENTS__', 4]}, + + ]) + + + params = primitive.get_params() + primitive.set_params(params=params) + + +if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/tods/tests/test_StastiticalStd.py b/tods/tests/test_StastiticalStd.py new file mode 100644 index 0000000..a284dff --- /dev/null +++ b/tods/tests/test_StastiticalStd.py @@ -0,0 +1,108 @@ +import unittest + +from d3m import container, utils +from d3m.metadata import base as metadata_base + +from feature_analysis import StatisticalStd + +class StatisticalStdTestCase(unittest.TestCase): + def test_basic(self): + self.maxDiff=None + main = container.DataFrame({'timestamp': [1, 3, 2, 5], 'values': [1.0, 2.0, 3.0, 4.0], 'b': [1.0, 4.0, 5.0, 6.0]}, + columns=['timestamp', 'values', 'b'], + generate_metadata=True) + + + + self.assertEqual(utils.to_json_structure(main.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + # 'top_level': 'main', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 4, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': {'structural_type': 'numpy.int64', 'name': 'timestamp'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'values'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'b'}, + }]) + hyperparams_class = StatisticalStd.StatisticalStdPrimitive.metadata.get_hyperparams() + + hp = hyperparams_class.defaults().replace({ + 'use_columns': [1,2], + 'use_semantic_types' : True, + 'window_size':2 + }) + + primitive = StatisticalStd.StatisticalStdPrimitive(hyperparams=hp) + + output_main = primitive.produce(inputs=main).value + print(output_main) + expected_output = container.DataFrame( + {'timestamp': [1, 3, 2, 5], 'values': 
[1.0, 2.0, 3.0, 4.0], 'b': [1.0, 4.0, 5.0, 6.0], + 'values_std': [0.5, 0.5, 0.5, 0.5], 'b_std': [1.5, 1.5, 0.5, 0.5]}, + columns=['timestamp', 'values', 'b', 'values_std', 'b_std']) + + self.assertEqual(output_main[['timestamp', 'values', 'b', 'values_std', + 'b_std']].values.tolist(), expected_output[ + ['timestamp', 'values', 'b', 'values_std', 'b_std' + ]].values.tolist()) + + self.assertEqual(utils.to_json_structure(output_main.metadata.to_internal_simple_structure()), + [{'metadata': {'dimension': {'length': 4, + 'name': 'rows', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/TabularRow']}, + 'schema': 'https://metadata.datadrivendiscovery.org/schemas/v0/container.json', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'structural_type': 'd3m.container.pandas.DataFrame'}, + 'selector': []}, + {'metadata': {'dimension': {'length': 5, + 'name': 'columns', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/TabularColumn']}}, + 'selector': ['__ALL_ELEMENTS__']}, + {'metadata': {'name': 'timestamp', 'structural_type': 'numpy.int64'}, + 'selector': ['__ALL_ELEMENTS__', 0]}, + {'metadata': {'name': 'values', 'structural_type': 'numpy.float64'}, + 'selector': ['__ALL_ELEMENTS__', 1]}, + {'metadata': {'name': 'b', 'structural_type': 'numpy.float64'}, + 'selector': ['__ALL_ELEMENTS__', 2]}, + {'metadata': {'name': 'values_std', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.float64'}, + 'selector': ['__ALL_ELEMENTS__', 3]}, + {'metadata': {'name': 'b_std', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.float64'}, + 'selector': ['__ALL_ELEMENTS__', 4]}, + + ]) + + + params = primitive.get_params() + primitive.set_params(params=params) + + +if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/tods/tests/test_StatisticalAbsEnergy.py b/tods/tests/test_StatisticalAbsEnergy.py new file mode 100644 index 0000000..f83a8fa --- /dev/null +++ b/tods/tests/test_StatisticalAbsEnergy.py @@ -0,0 +1,108 @@ +import unittest + +from d3m import container, utils +from d3m.metadata import base as metadata_base + +from feature_analysis import StatisticalAbsEnergy + +class StatisticalAbsEnergyTestCase(unittest.TestCase): + def test_basic(self): + self.maxDiff=None + main = container.DataFrame({'timestamp': [1, 3, 2, 5], 'values': [1.0, 2.0, 3.0, 4.0], 'b': [1.0, 4.0, 5.0, 6.0]}, + columns=['timestamp', 'values', 'b'], + generate_metadata=True) + + + + self.assertEqual(utils.to_json_structure(main.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + # 'top_level': 'main', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 4, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': {'structural_type': 'numpy.int64', 'name': 'timestamp'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'values'}, + }, { + 'selector': 
['__ALL_ELEMENTS__', 2], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'b'}, + }]) + hyperparams_class = StatisticalAbsEnergy.StatisticalAbsEnergyPrimitive.metadata.get_hyperparams() + + hp = hyperparams_class.defaults().replace({ + 'use_columns': [1,2], + 'use_semantic_types' : True, + 'window_size':2 + }) + + primitive = StatisticalAbsEnergy.StatisticalAbsEnergyPrimitive(hyperparams=hp) + + output_main = primitive.produce(inputs=main).value + print(output_main) + expected_output = container.DataFrame( + {'timestamp': [1, 3, 2, 5], 'values': [1.0, 2.0, 3.0, 4.0], 'b': [1.0, 4.0, 5.0, 6.0], + 'values_abs_energy': [5.0,5.0, 13.0, 25.0], 'b_abs_energy': [17.0,17.0, 41.0,61.0]}, + columns=['timestamp', 'values', 'b', 'values_abs_energy', 'b_abs_energy']) + + self.assertEqual(output_main[['timestamp', 'values', 'b', 'values_abs_energy', + 'b_abs_energy']].values.tolist(), expected_output[ + ['timestamp', 'values', 'b', 'values_abs_energy', 'b_abs_energy' + ]].values.tolist()) + + self.assertEqual(utils.to_json_structure(output_main.metadata.to_internal_simple_structure()), + [{'metadata': {'dimension': {'length': 4, + 'name': 'rows', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/TabularRow']}, + 'schema': 'https://metadata.datadrivendiscovery.org/schemas/v0/container.json', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'structural_type': 'd3m.container.pandas.DataFrame'}, + 'selector': []}, + {'metadata': {'dimension': {'length': 5, + 'name': 'columns', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/TabularColumn']}}, + 'selector': ['__ALL_ELEMENTS__']}, + {'metadata': {'name': 'timestamp', 'structural_type': 'numpy.int64'}, + 'selector': ['__ALL_ELEMENTS__', 0]}, + {'metadata': {'name': 'values', 'structural_type': 'numpy.float64'}, + 'selector': ['__ALL_ELEMENTS__', 1]}, + {'metadata': {'name': 'b', 'structural_type': 'numpy.float64'}, + 'selector': ['__ALL_ELEMENTS__', 2]}, + {'metadata': {'name': 'values_abs_energy', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.float64'}, + 'selector': ['__ALL_ELEMENTS__', 3]}, + {'metadata': {'name': 'b_abs_energy', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.float64'}, + 'selector': ['__ALL_ELEMENTS__', 4]}, + + ]) + + + params = primitive.get_params() + primitive.set_params(params=params) + + +if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/tods/tests/test_StatisticalAbsSum.py b/tods/tests/test_StatisticalAbsSum.py new file mode 100644 index 0000000..6b4d997 --- /dev/null +++ b/tods/tests/test_StatisticalAbsSum.py @@ -0,0 +1,108 @@ +import unittest + +from d3m import container, utils +from d3m.metadata import base as metadata_base + +from feature_analysis import StatisticalAbsSum + +class StatisticalAbsSumTestCase(unittest.TestCase): + def test_basic(self): + self.maxDiff=None + main = container.DataFrame({'timestamp': [1, 3, 2, 5], 'values': [1.0, 2.0, 3.0, 4.0], 'b': [1.0, 4.0, -5.0, 6.0]}, + columns=['timestamp', 'values', 'b'], + generate_metadata=True) + + + + self.assertEqual(utils.to_json_structure(main.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + # 'top_level': 'main', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': 
['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 4, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': {'structural_type': 'numpy.int64', 'name': 'timestamp'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'values'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'b'}, + }]) + hyperparams_class = StatisticalAbsSum.StatisticalAbsSumPrimitive.metadata.get_hyperparams() + + hp = hyperparams_class.defaults().replace({ + 'use_columns': [1,2], + 'use_semantic_types' : True, + 'window_size':2 + }) + + primitive = StatisticalAbsSum.StatisticalAbsSumPrimitive(hyperparams=hp) + + output_main = primitive.produce(inputs=main).value + + expected_output = container.DataFrame( + {'timestamp': [1, 3, 2, 5], 'values': [1.0, 2.0, 3.0, 4.0], 'b': [1.0, 4.0, -5.0, 6.0], + 'values_abs_sum': [3.0,3.0,5.0,7.0], 'b_abs_sum': [5.0,5.0,9.0,11.0]}, + columns=['timestamp', 'values', 'b', 'values_abs_sum', 'b_abs_sum']) + + self.assertEqual(output_main[['timestamp', 'values', 'b', 'values_abs_sum', + 'b_abs_sum']].values.tolist(), expected_output[ + ['timestamp', 'values', 'b', 'values_abs_sum', 'b_abs_sum' + ]].values.tolist()) + + self.assertEqual(utils.to_json_structure(output_main.metadata.to_internal_simple_structure()), + [{'metadata': {'dimension': {'length': 4, + 'name': 'rows', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/TabularRow']}, + 'schema': 'https://metadata.datadrivendiscovery.org/schemas/v0/container.json', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'structural_type': 'd3m.container.pandas.DataFrame'}, + 'selector': []}, + {'metadata': {'dimension': {'length': 5, + 'name': 'columns', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/TabularColumn']}}, + 'selector': ['__ALL_ELEMENTS__']}, + {'metadata': {'name': 'timestamp', 'structural_type': 'numpy.int64'}, + 'selector': ['__ALL_ELEMENTS__', 0]}, + {'metadata': {'name': 'values', 'structural_type': 'numpy.float64'}, + 'selector': ['__ALL_ELEMENTS__', 1]}, + {'metadata': {'name': 'b', 'structural_type': 'numpy.float64'}, + 'selector': ['__ALL_ELEMENTS__', 2]}, + {'metadata': {'name': 'values_abs_sum', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.float64'}, + 'selector': ['__ALL_ELEMENTS__', 3]}, + {'metadata': {'name': 'b_abs_sum', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.float64'}, + 'selector': ['__ALL_ELEMENTS__', 4]}, + + ]) + + + params = primitive.get_params() + primitive.set_params(params=params) + + +if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/tods/tests/test_StatisticalGmean.py b/tods/tests/test_StatisticalGmean.py new file mode 100644 index 0000000..391a863 --- /dev/null +++ b/tods/tests/test_StatisticalGmean.py @@ -0,0 +1,109 @@ +import unittest + +from d3m import container, utils +from d3m.metadata import base as metadata_base + +from feature_analysis import StatisticalGmean + +class 
StatisticalGmeanTestCase(unittest.TestCase): + def test_basic(self): + self.maxDiff=None + main = container.DataFrame({'timestamp': [1, 3, 2, 5], 'values': [1.0, 2.0, 3.0, 4.0], 'b': [1.0, 4.0, 5.0, 6.0]}, + columns=['timestamp', 'values', 'b'], + generate_metadata=True) + + + + self.assertEqual(utils.to_json_structure(main.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + # 'top_level': 'main', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 4, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': {'structural_type': 'numpy.int64', 'name': 'timestamp'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'values'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'b'}, + }]) + hyperparams_class = StatisticalGmean.StatisticalGmeanPrimitive.metadata.get_hyperparams() + + hp = hyperparams_class.defaults().replace({ + 'use_columns': [1,2], + 'use_semantic_types' : True, + 'window_size':2 + }) + + primitive = StatisticalGmean.StatisticalGmeanPrimitive(hyperparams=hp) + + output_main = primitive.produce(inputs=main).value + print(output_main[['values_gmean', 'b_gmean']]) + + expected_output = container.DataFrame( + {'timestamp': [1, 3, 2, 5], 'values': [1.0, 2.0, 3.0, 4.0], 'b': [1.0, 4.0, 5.0, 6.0], + 'values_gmean': [1.4142, 1.4142, 2.4495, 3.4641], 'b_gmean': [2.0000, 2.0000, 4.4721, 5.4772]}, + columns=['timestamp', 'values', 'b', 'values_gmean', 'b_gmean']) + + self.assertEqual(output_main[['timestamp', 'values', 'b', 'values_gmean', + 'b_gmean']].values.tolist(), expected_output[ + ['timestamp', 'values', 'b', 'values_gmean', 'b_gmean' + ]].values.tolist()) + + self.assertEqual(utils.to_json_structure(output_main.metadata.to_internal_simple_structure()), + [{'metadata': {'dimension': {'length': 4, + 'name': 'rows', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/TabularRow']}, + 'schema': 'https://metadata.datadrivendiscovery.org/schemas/v0/container.json', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'structural_type': 'd3m.container.pandas.DataFrame'}, + 'selector': []}, + {'metadata': {'dimension': {'length': 5, + 'name': 'columns', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/TabularColumn']}}, + 'selector': ['__ALL_ELEMENTS__']}, + {'metadata': {'name': 'timestamp', 'structural_type': 'numpy.int64'}, + 'selector': ['__ALL_ELEMENTS__', 0]}, + {'metadata': {'name': 'values', 'structural_type': 'numpy.float64'}, + 'selector': ['__ALL_ELEMENTS__', 1]}, + {'metadata': {'name': 'b', 'structural_type': 'numpy.float64'}, + 'selector': ['__ALL_ELEMENTS__', 2]}, + {'metadata': {'name': 'values_gmean', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.float64'}, + 'selector': ['__ALL_ELEMENTS__', 3]}, + {'metadata': {'name': 'b_gmean', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 
'structural_type': 'numpy.float64'}, + 'selector': ['__ALL_ELEMENTS__', 4]}, + + ]) + + + params = primitive.get_params() + primitive.set_params(params=params) + + +if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/tods/tests/test_StatisticalHmean.py b/tods/tests/test_StatisticalHmean.py new file mode 100644 index 0000000..35d9107 --- /dev/null +++ b/tods/tests/test_StatisticalHmean.py @@ -0,0 +1,109 @@ +import unittest + +from d3m import container, utils +from d3m.metadata import base as metadata_base + +from feature_analysis import StatisticalHmean + +class StatisticalHmeanTestCase(unittest.TestCase): + def test_basic(self): + self.maxDiff=None + main = container.DataFrame({'timestamp': [1, 3, 2, 5], 'values': [1.0, 2.0, 3.0, 4.0], 'b': [1.0, 4.0, 5.0, 6.0]}, + columns=['timestamp', 'values', 'b'], + generate_metadata=True) + + + + self.assertEqual(utils.to_json_structure(main.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + # 'top_level': 'main', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 4, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': {'structural_type': 'numpy.int64', 'name': 'timestamp'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'values'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'b'}, + }]) + hyperparams_class = StatisticalHmean.StatisticalHmeanPrimitive.metadata.get_hyperparams() + + hp = hyperparams_class.defaults().replace({ + 'use_columns': [1,2], + 'use_semantic_types' : True, + 'window_size':2 + }) + + primitive = StatisticalHmean.StatisticalHmeanPrimitive(hyperparams=hp) + + output_main = primitive.produce(inputs=main).value + #print(output_main[['values_hmean', 'b_hmean']]) + + expected_output = container.DataFrame( + {'timestamp': [1, 3, 2, 5], 'values': [1.0, 2.0, 3.0, 4.0], 'b': [1.0, 4.0, 5.0, 6.0], + 'values_hmean': [1.3333, 1.3333, 2.4000, 3.4286], 'b_hmean': [1.6000, 1.6000, 4.4444, 5.4545]}, + columns=['timestamp', 'values', 'b', 'values_hmean', 'b_hmean']) + + self.assertEqual(output_main[['timestamp', 'values', 'b', 'values_hmean', + 'b_hmean']].values.tolist(), expected_output[ + ['timestamp', 'values', 'b', 'values_hmean', 'b_hmean' + ]].values.tolist()) + + self.assertEqual(utils.to_json_structure(output_main.metadata.to_internal_simple_structure()), + [{'metadata': {'dimension': {'length': 4, + 'name': 'rows', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/TabularRow']}, + 'schema': 'https://metadata.datadrivendiscovery.org/schemas/v0/container.json', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'structural_type': 'd3m.container.pandas.DataFrame'}, + 'selector': []}, + {'metadata': {'dimension': {'length': 5, + 'name': 'columns', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/TabularColumn']}}, + 'selector': ['__ALL_ELEMENTS__']}, + {'metadata': {'name': 'timestamp', 'structural_type': 
'numpy.int64'}, + 'selector': ['__ALL_ELEMENTS__', 0]}, + {'metadata': {'name': 'values', 'structural_type': 'numpy.float64'}, + 'selector': ['__ALL_ELEMENTS__', 1]}, + {'metadata': {'name': 'b', 'structural_type': 'numpy.float64'}, + 'selector': ['__ALL_ELEMENTS__', 2]}, + {'metadata': {'name': 'values_hmean', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.float64'}, + 'selector': ['__ALL_ELEMENTS__', 3]}, + {'metadata': {'name': 'b_hmean', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.float64'}, + 'selector': ['__ALL_ELEMENTS__', 4]}, + + ]) + + + params = primitive.get_params() + primitive.set_params(params=params) + + +if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/tods/tests/test_StatisticalKurtosis.py b/tods/tests/test_StatisticalKurtosis.py new file mode 100644 index 0000000..64fd97d --- /dev/null +++ b/tods/tests/test_StatisticalKurtosis.py @@ -0,0 +1,109 @@ +import unittest + +from d3m import container, utils +from d3m.metadata import base as metadata_base + +from feature_analysis import StatisticalKurtosis + +class StatisticalKurtosisTestCase(unittest.TestCase): + def test_basic(self): + self.maxDiff=None + main = container.DataFrame({'timestamp': [1, 3, 2, 5], 'values': [1.0, 2.0, 3.0, 4.0], 'b': [1.0, 4.0, 5.0, 6.0]}, + columns=['timestamp', 'values', 'b'], + generate_metadata=True) + + + + self.assertEqual(utils.to_json_structure(main.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + # 'top_level': 'main', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 4, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': {'structural_type': 'numpy.int64', 'name': 'timestamp'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'values'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'b'}, + }]) + hyperparams_class = StatisticalKurtosis.StatisticalKurtosisPrimitive.metadata.get_hyperparams() + + hp = hyperparams_class.defaults().replace({ + 'use_columns': [1,2], + 'use_semantic_types' : True, + 'window_size':2 + }) + + primitive = StatisticalKurtosis.StatisticalKurtosisPrimitive(hyperparams=hp) + + output_main = primitive.produce(inputs=main).value + print(output_main[['values_kurtosis', 'b_kurtosis']]) + + expected_output = container.DataFrame( + {'timestamp': [1, 3, 2, 5], 'values': [1.0, 2.0, 3.0, 4.0], 'b': [1.0, 4.0, 5.0, 6.0], + 'values_kurtosis': [-2.0,-2.0,-2.0,-2.0], 'b_kurtosis': [-2.0,-2.0,-2.0,-2.0]}, + columns=['timestamp', 'values', 'b', 'values_kurtosis', 'b_kurtosis']) + + self.assertEqual(output_main[['timestamp', 'values', 'b', 'values_kurtosis', + 'b_kurtosis']].values.tolist(), expected_output[ + ['timestamp', 'values', 'b', 'values_kurtosis', 'b_kurtosis' + ]].values.tolist()) + + self.assertEqual(utils.to_json_structure(output_main.metadata.to_internal_simple_structure()), + 
[{'metadata': {'dimension': {'length': 4, + 'name': 'rows', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/TabularRow']}, + 'schema': 'https://metadata.datadrivendiscovery.org/schemas/v0/container.json', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'structural_type': 'd3m.container.pandas.DataFrame'}, + 'selector': []}, + {'metadata': {'dimension': {'length': 5, + 'name': 'columns', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/TabularColumn']}}, + 'selector': ['__ALL_ELEMENTS__']}, + {'metadata': {'name': 'timestamp', 'structural_type': 'numpy.int64'}, + 'selector': ['__ALL_ELEMENTS__', 0]}, + {'metadata': {'name': 'values', 'structural_type': 'numpy.float64'}, + 'selector': ['__ALL_ELEMENTS__', 1]}, + {'metadata': {'name': 'b', 'structural_type': 'numpy.float64'}, + 'selector': ['__ALL_ELEMENTS__', 2]}, + {'metadata': {'name': 'values_kurtosis', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.float64'}, + 'selector': ['__ALL_ELEMENTS__', 3]}, + {'metadata': {'name': 'b_kurtosis', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.float64'}, + 'selector': ['__ALL_ELEMENTS__', 4]}, + + ]) + + + params = primitive.get_params() + primitive.set_params(params=params) + + +if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/tods/tests/test_StatisticalMaximum.py b/tods/tests/test_StatisticalMaximum.py new file mode 100644 index 0000000..170db43 --- /dev/null +++ b/tods/tests/test_StatisticalMaximum.py @@ -0,0 +1,108 @@ +import unittest + +from d3m import container, utils +from d3m.metadata import base as metadata_base + +from feature_analysis import StatisticalMaximum + +class StatisticalMaximumTestCase(unittest.TestCase): + def test_basic(self): + self.maxDiff=None + main = container.DataFrame({'timestamp': [1, 3, 2, 5], 'values': [1.0, 2.0, 3.0, 4.0], 'b': [1.0, 4.0, 5.0, 6.0]}, + columns=['timestamp', 'values', 'b'], + generate_metadata=True) + + + + self.assertEqual(utils.to_json_structure(main.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + # 'top_level': 'main', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 4, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': {'structural_type': 'numpy.int64', 'name': 'timestamp'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'values'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'b'}, + }]) + hyperparams_class = StatisticalMaximum.StatisticalMaximumPrimitive.metadata.get_hyperparams() + + hp = hyperparams_class.defaults().replace({ + 'use_columns': [1,2], + 'use_semantic_types' : True, + 'window_size':2 + }) + + primitive = StatisticalMaximum.StatisticalMaximumPrimitive(hyperparams=hp) + + output_main = primitive.produce(inputs=main).value + print(output_main) + expected_output = 
container.DataFrame( + {'timestamp': [1, 3, 2, 5], 'values': [1.0, 2.0, 3.0, 4.0], 'b': [1.0, 4.0, 5.0, 6.0], + 'values_maximum': [2.0,2.0, 3.0, 4.0], 'b_maximum': [4.0,4.0, 5.0, 6.0]}, + columns=['timestamp', 'values', 'b', 'values_maximum', 'b_maximum']) + + self.assertEqual(output_main[['timestamp', 'values', 'b', 'values_maximum', + 'b_maximum']].values.tolist(), expected_output[ + ['timestamp', 'values', 'b', 'values_maximum', 'b_maximum' + ]].values.tolist()) + + self.assertEqual(utils.to_json_structure(output_main.metadata.to_internal_simple_structure()), + [{'metadata': {'dimension': {'length': 4, + 'name': 'rows', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/TabularRow']}, + 'schema': 'https://metadata.datadrivendiscovery.org/schemas/v0/container.json', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'structural_type': 'd3m.container.pandas.DataFrame'}, + 'selector': []}, + {'metadata': {'dimension': {'length': 5, + 'name': 'columns', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/TabularColumn']}}, + 'selector': ['__ALL_ELEMENTS__']}, + {'metadata': {'name': 'timestamp', 'structural_type': 'numpy.int64'}, + 'selector': ['__ALL_ELEMENTS__', 0]}, + {'metadata': {'name': 'values', 'structural_type': 'numpy.float64'}, + 'selector': ['__ALL_ELEMENTS__', 1]}, + {'metadata': {'name': 'b', 'structural_type': 'numpy.float64'}, + 'selector': ['__ALL_ELEMENTS__', 2]}, + {'metadata': {'name': 'values_maximum', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.float64'}, + 'selector': ['__ALL_ELEMENTS__', 3]}, + {'metadata': {'name': 'b_maximum', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.float64'}, + 'selector': ['__ALL_ELEMENTS__', 4]}, + + ]) + + + params = primitive.get_params() + primitive.set_params(params=params) + + +if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/tods/tests/test_StatisticalMean.py b/tods/tests/test_StatisticalMean.py new file mode 100644 index 0000000..044d58a --- /dev/null +++ b/tods/tests/test_StatisticalMean.py @@ -0,0 +1,108 @@ +import unittest + +from d3m import container, utils +from d3m.metadata import base as metadata_base + +from feature_analysis import StatisticalMean + +class StatisticalMeanTestCase(unittest.TestCase): + def test_basic(self): + self.maxDiff=None + main = container.DataFrame({'timestamp': [1, 3, 2, 5], 'values': [1.0, 2.0, 3.0, 4.0], 'b': [1.0, 4.0, 5.0, 6.0]}, + columns=['timestamp', 'values', 'b'], + generate_metadata=True) + + + + self.assertEqual(utils.to_json_structure(main.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + # 'top_level': 'main', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 4, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': {'structural_type': 'numpy.int64', 'name': 'timestamp'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': 
{'structural_type': 'numpy.float64', 'name': 'values'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'b'}, + }]) + hyperparams_class = StatisticalMean.StatisticalMeanPrimitive.metadata.get_hyperparams() + + hp = hyperparams_class.defaults().replace({ + 'use_columns': [1,2], + 'use_semantic_types' : True, + 'window_size':2 + }) + + primitive = StatisticalMean.StatisticalMeanPrimitive(hyperparams=hp) + + output_main = primitive.produce(inputs=main).value + print(output_main) + expected_output = container.DataFrame( + {'timestamp': [1, 3, 2, 5], 'values': [1.0, 2.0, 3.0, 4.0], 'b': [1.0, 4.0, 5.0, 6.0], + 'values_mean': [1.5, 1.5, 2.5, 3.5], 'b_mean': [2.5, 2.5, 4.5, 5.5]}, + columns=['timestamp', 'values', 'b', 'values_mean', 'b_mean']) + + self.assertEqual(output_main[['timestamp', 'values', 'b', 'values_mean', + 'b_mean']].values.tolist(), expected_output[ + ['timestamp', 'values', 'b', 'values_mean', 'b_mean' + ]].values.tolist()) + + self.assertEqual(utils.to_json_structure(output_main.metadata.to_internal_simple_structure()), + [{'metadata': {'dimension': {'length': 4, + 'name': 'rows', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/TabularRow']}, + 'schema': 'https://metadata.datadrivendiscovery.org/schemas/v0/container.json', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'structural_type': 'd3m.container.pandas.DataFrame'}, + 'selector': []}, + {'metadata': {'dimension': {'length': 5, + 'name': 'columns', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/TabularColumn']}}, + 'selector': ['__ALL_ELEMENTS__']}, + {'metadata': {'name': 'timestamp', 'structural_type': 'numpy.int64'}, + 'selector': ['__ALL_ELEMENTS__', 0]}, + {'metadata': {'name': 'values', 'structural_type': 'numpy.float64'}, + 'selector': ['__ALL_ELEMENTS__', 1]}, + {'metadata': {'name': 'b', 'structural_type': 'numpy.float64'}, + 'selector': ['__ALL_ELEMENTS__', 2]}, + {'metadata': {'name': 'values_mean', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.float64'}, + 'selector': ['__ALL_ELEMENTS__', 3]}, + {'metadata': {'name': 'b_mean', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.float64'}, + 'selector': ['__ALL_ELEMENTS__', 4]}, + + ]) + + + params = primitive.get_params() + primitive.set_params(params=params) + + +if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/tods/tests/test_StatisticalMeanAbs.py b/tods/tests/test_StatisticalMeanAbs.py new file mode 100644 index 0000000..1c255c5 --- /dev/null +++ b/tods/tests/test_StatisticalMeanAbs.py @@ -0,0 +1,108 @@ +import unittest + +from d3m import container, utils +from d3m.metadata import base as metadata_base + +from feature_analysis import StatisticalMeanAbs + +class StatisticalMeanAbsTestCase(unittest.TestCase): + def test_basic(self): + self.maxDiff=None + main = container.DataFrame({'timestamp': [1, 3, 2, 5], 'values': [1.0, 2.0, 3.0, 4.0], 'b': [1.0, 4.0, 5.0, 6.0]}, + columns=['timestamp', 'values', 'b'], + generate_metadata=True) + + + + self.assertEqual(utils.to_json_structure(main.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + # 'top_level': 'main', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], 
+ 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 4, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': {'structural_type': 'numpy.int64', 'name': 'timestamp'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'values'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'b'}, + }]) + hyperparams_class = StatisticalMeanAbs.StatisticalMeanAbsPrimitive.metadata.get_hyperparams() + + hp = hyperparams_class.defaults().replace({ + 'use_columns': [1,2], + 'use_semantic_types' : True, + 'window_size':2 + }) + + primitive = StatisticalMeanAbs.StatisticalMeanAbsPrimitive(hyperparams=hp) + + output_main = primitive.produce(inputs=main).value + print(output_main) + expected_output = container.DataFrame( + {'timestamp': [1, 3, 2, 5], 'values': [1.0, 2.0, 3.0, 4.0], 'b': [1.0, 4.0, 5.0, 6.0], + 'values_mean_abs': [1.5, 1.5, 2.5, 3.5], 'b_mean_abs': [2.5, 2.5, 4.5, 5.5]}, + columns=['timestamp', 'values', 'b', 'values_mean_abs', 'b_mean_abs']) + + self.assertEqual(output_main[['timestamp', 'values', 'b', 'values_mean_abs', + 'b_mean_abs']].values.tolist(), expected_output[ + ['timestamp', 'values', 'b', 'values_mean_abs', 'b_mean_abs' + ]].values.tolist()) + + self.assertEqual(utils.to_json_structure(output_main.metadata.to_internal_simple_structure()), + [{'metadata': {'dimension': {'length': 4, + 'name': 'rows', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/TabularRow']}, + 'schema': 'https://metadata.datadrivendiscovery.org/schemas/v0/container.json', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'structural_type': 'd3m.container.pandas.DataFrame'}, + 'selector': []}, + {'metadata': {'dimension': {'length': 5, + 'name': 'columns', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/TabularColumn']}}, + 'selector': ['__ALL_ELEMENTS__']}, + {'metadata': {'name': 'timestamp', 'structural_type': 'numpy.int64'}, + 'selector': ['__ALL_ELEMENTS__', 0]}, + {'metadata': {'name': 'values', 'structural_type': 'numpy.float64'}, + 'selector': ['__ALL_ELEMENTS__', 1]}, + {'metadata': {'name': 'b', 'structural_type': 'numpy.float64'}, + 'selector': ['__ALL_ELEMENTS__', 2]}, + {'metadata': {'name': 'values_mean_abs', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.float64'}, + 'selector': ['__ALL_ELEMENTS__', 3]}, + {'metadata': {'name': 'b_mean_abs', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.float64'}, + 'selector': ['__ALL_ELEMENTS__', 4]}, + + ]) + + + params = primitive.get_params() + primitive.set_params(params=params) + + +if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/tods/tests/test_StatisticalMeanAbsTemporalDerivative.py b/tods/tests/test_StatisticalMeanAbsTemporalDerivative.py new file mode 100644 index 0000000..1cc0b06 --- /dev/null +++ b/tods/tests/test_StatisticalMeanAbsTemporalDerivative.py @@ -0,0 +1,108 @@ +import unittest + +from d3m import container, utils +from d3m.metadata import base as metadata_base + +from feature_analysis import 
StatisticalMeanAbsTemporalDerivative + +class StatisticalMeanAbsTemporalDerivativeTestCase(unittest.TestCase): + def test_basic(self): + self.maxDiff=None + main = container.DataFrame({'timestamp': [1, 3, 2, 5], 'values': [1.0, 2.0, 3.0, 4.0], 'b': [1.0, 4.0, 5.0, 6.0]}, + columns=['timestamp', 'values', 'b'], + generate_metadata=True) + + + + self.assertEqual(utils.to_json_structure(main.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + # 'top_level': 'main', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 4, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': {'structural_type': 'numpy.int64', 'name': 'timestamp'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'values'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'b'}, + }]) + hyperparams_class = StatisticalMeanAbsTemporalDerivative.StatisticalMeanAbsTemporalDerivativePrimitive.metadata.get_hyperparams() + + hp = hyperparams_class.defaults().replace({ + 'use_columns': [1,2], + 'use_semantic_types' : True, + 'window_size':2 + }) + + primitive = StatisticalMeanAbsTemporalDerivative.StatisticalMeanAbsTemporalDerivativePrimitive(hyperparams=hp) + + output_main = primitive.produce(inputs=main).value + print(output_main[['values_mean_abs_temporal_derivative', 'b_mean_abs_temporal_derivative']]) + expected_output = container.DataFrame( + {'timestamp': [1, 3, 2, 5], 'values': [1.0, 2.0, 3.0, 4.0], 'b': [1.0, 4.0, 5.0, 6.0], + 'values_mean_abs_temporal_derivative': [1.0, 1.0, 1.0, 1.0], 'b_mean_abs_temporal_derivative': [3.0, 3.0, 1.0, 1.0]}, + columns=['timestamp', 'values', 'b', 'values_mean_abs_temporal_derivative', 'b_mean_abs_temporal_derivative']) + + self.assertEqual(output_main[['timestamp', 'values', 'b', 'values_mean_abs_temporal_derivative', + 'b_mean_abs_temporal_derivative']].values.tolist(), expected_output[ + ['timestamp', 'values', 'b', 'values_mean_abs_temporal_derivative', 'b_mean_abs_temporal_derivative' + ]].values.tolist()) + + self.assertEqual(utils.to_json_structure(output_main.metadata.to_internal_simple_structure()), + [{'metadata': {'dimension': {'length': 4, + 'name': 'rows', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/TabularRow']}, + 'schema': 'https://metadata.datadrivendiscovery.org/schemas/v0/container.json', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'structural_type': 'd3m.container.pandas.DataFrame'}, + 'selector': []}, + {'metadata': {'dimension': {'length': 5, + 'name': 'columns', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/TabularColumn']}}, + 'selector': ['__ALL_ELEMENTS__']}, + {'metadata': {'name': 'timestamp', 'structural_type': 'numpy.int64'}, + 'selector': ['__ALL_ELEMENTS__', 0]}, + {'metadata': {'name': 'values', 'structural_type': 'numpy.float64'}, + 'selector': ['__ALL_ELEMENTS__', 1]}, + {'metadata': {'name': 'b', 'structural_type': 'numpy.float64'}, + 'selector': 
['__ALL_ELEMENTS__', 2]}, + {'metadata': {'name': 'values_mean_abs_temporal_derivative', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.float64'}, + 'selector': ['__ALL_ELEMENTS__', 3]}, + {'metadata': {'name': 'b_mean_abs_temporal_derivative', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.float64'}, + 'selector': ['__ALL_ELEMENTS__', 4]}, + + ]) + + + params = primitive.get_params() + primitive.set_params(params=params) + + +if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/tods/tests/test_StatisticalMeanTemporalDerivative.py b/tods/tests/test_StatisticalMeanTemporalDerivative.py new file mode 100644 index 0000000..45f0adc --- /dev/null +++ b/tods/tests/test_StatisticalMeanTemporalDerivative.py @@ -0,0 +1,108 @@ +import unittest + +from d3m import container, utils +from d3m.metadata import base as metadata_base + +from feature_analysis import StatisticalMeanTemporalDerivative + +class StatisticalMeanTemporalDerivativeTestCase(unittest.TestCase): + def test_basic(self): + self.maxDiff=None + main = container.DataFrame({'timestamp': [1, 3, 2, 5], 'values': [1.0, 2.0, 3.0, 4.0], 'b': [1.0, 4.0, 5.0, 6.0]}, + columns=['timestamp', 'values', 'b'], + generate_metadata=True) + + + + self.assertEqual(utils.to_json_structure(main.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + # 'top_level': 'main', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 4, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': {'structural_type': 'numpy.int64', 'name': 'timestamp'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'values'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'b'}, + }]) + hyperparams_class = StatisticalMeanTemporalDerivative.StatisticalMeanTemporalDerivativePrimitive.metadata.get_hyperparams() + + hp = hyperparams_class.defaults().replace({ + 'use_columns': [1,2], + 'use_semantic_types' : True, + 'window_size':2 + }) + + primitive = StatisticalMeanTemporalDerivative.StatisticalMeanTemporalDerivativePrimitive(hyperparams=hp) + + output_main = primitive.produce(inputs=main).value + print(output_main[['values_mean_temporal_derivative', 'b_mean_temporal_derivative']]) + expected_output = container.DataFrame( + {'timestamp': [1, 3, 2, 5], 'values': [1.0, 2.0, 3.0, 4.0], 'b': [1.0, 4.0, 5.0, 6.0], + 'values_mean_temporal_derivative': [1.0, 1.0, 1.0, 1.0], 'b_mean_temporal_derivative': [3.0, 3.0, 1.0, 1.0]}, + columns=['timestamp', 'values', 'b', 'values_mean_temporal_derivative', 'b_mean_temporal_derivative']) + + self.assertEqual(output_main[['timestamp', 'values', 'b', 'values_mean_temporal_derivative', + 'b_mean_temporal_derivative']].values.tolist(), expected_output[ + ['timestamp', 'values', 'b', 'values_mean_temporal_derivative', 'b_mean_temporal_derivative' + ]].values.tolist()) + + 
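        # Descriptive note on the check that follows: with use_columns=[1, 2] the
        # primitive appends two new columns, 'values_mean_temporal_derivative' and
        # 'b_mean_temporal_derivative', at column indices 3 and 4, so the column
        # dimension grows from 3 to 5; each appended column is expected to carry the
        # Attribute semantic type and a numpy.float64 structural type.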
self.assertEqual(utils.to_json_structure(output_main.metadata.to_internal_simple_structure()), + [{'metadata': {'dimension': {'length': 4, + 'name': 'rows', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/TabularRow']}, + 'schema': 'https://metadata.datadrivendiscovery.org/schemas/v0/container.json', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'structural_type': 'd3m.container.pandas.DataFrame'}, + 'selector': []}, + {'metadata': {'dimension': {'length': 5, + 'name': 'columns', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/TabularColumn']}}, + 'selector': ['__ALL_ELEMENTS__']}, + {'metadata': {'name': 'timestamp', 'structural_type': 'numpy.int64'}, + 'selector': ['__ALL_ELEMENTS__', 0]}, + {'metadata': {'name': 'values', 'structural_type': 'numpy.float64'}, + 'selector': ['__ALL_ELEMENTS__', 1]}, + {'metadata': {'name': 'b', 'structural_type': 'numpy.float64'}, + 'selector': ['__ALL_ELEMENTS__', 2]}, + {'metadata': {'name': 'values_mean_temporal_derivative', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.float64'}, + 'selector': ['__ALL_ELEMENTS__', 3]}, + {'metadata': {'name': 'b_mean_temporal_derivative', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.float64'}, + 'selector': ['__ALL_ELEMENTS__', 4]}, + + ]) + + + params = primitive.get_params() + primitive.set_params(params=params) + + +if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/tods/tests/test_StatisticalMedian.py b/tods/tests/test_StatisticalMedian.py new file mode 100644 index 0000000..fc7e556 --- /dev/null +++ b/tods/tests/test_StatisticalMedian.py @@ -0,0 +1,108 @@ +import unittest + +from d3m import container, utils +from d3m.metadata import base as metadata_base + +from feature_analysis import StatisticalMedian + +class StatisticalMedianTestCase(unittest.TestCase): + def test_basic(self): + self.maxDiff=None + main = container.DataFrame({'timestamp': [1, 3, 2, 5], 'values': [1.0, 2.0, 3.0, 4.0], 'b': [1.0, 4.0, 5.0, 6.0]}, + columns=['timestamp', 'values', 'b'], + generate_metadata=True) + + + + self.assertEqual(utils.to_json_structure(main.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + # 'top_level': 'main', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 4, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': {'structural_type': 'numpy.int64', 'name': 'timestamp'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'values'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'b'}, + }]) + hyperparams_class = StatisticalMedian.StatisticalMedianPrimitive.metadata.get_hyperparams() + + hp = hyperparams_class.defaults().replace({ + 'use_columns': [1,2], + 'use_semantic_types' : True, + 'window_size':3 + }) + + primitive = 
StatisticalMedian.StatisticalMedianPrimitive(hyperparams=hp) + + output_main = primitive.produce(inputs=main).value + + expected_output = container.DataFrame( + {'timestamp': [1, 3, 2, 5], 'values': [1.0, 2.0, 3.0, 4.0], 'b': [1.0, 4.0, 5.0, 6.0], + 'values_median': [2.0,2.0,2.0,3.0], 'b_median': [4.0,4.0,4.0,5.0]}, + columns=['timestamp', 'values', 'b', 'values_median', 'b_median']) + + self.assertEqual(output_main[['timestamp', 'values', 'b', 'values_median', + 'b_median']].values.tolist(), expected_output[ + ['timestamp', 'values', 'b', 'values_median', 'b_median' + ]].values.tolist()) + + self.assertEqual(utils.to_json_structure(output_main.metadata.to_internal_simple_structure()), + [{'metadata': {'dimension': {'length': 4, + 'name': 'rows', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/TabularRow']}, + 'schema': 'https://metadata.datadrivendiscovery.org/schemas/v0/container.json', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'structural_type': 'd3m.container.pandas.DataFrame'}, + 'selector': []}, + {'metadata': {'dimension': {'length': 5, + 'name': 'columns', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/TabularColumn']}}, + 'selector': ['__ALL_ELEMENTS__']}, + {'metadata': {'name': 'timestamp', 'structural_type': 'numpy.int64'}, + 'selector': ['__ALL_ELEMENTS__', 0]}, + {'metadata': {'name': 'values', 'structural_type': 'numpy.float64'}, + 'selector': ['__ALL_ELEMENTS__', 1]}, + {'metadata': {'name': 'b', 'structural_type': 'numpy.float64'}, + 'selector': ['__ALL_ELEMENTS__', 2]}, + {'metadata': {'name': 'values_median', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.float64'}, + 'selector': ['__ALL_ELEMENTS__', 3]}, + {'metadata': {'name': 'b_median', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.float64'}, + 'selector': ['__ALL_ELEMENTS__', 4]}, + + ]) + + + params = primitive.get_params() + primitive.set_params(params=params) + + +if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/tods/tests/test_StatisticalMedianAbsoluteDeviation.py b/tods/tests/test_StatisticalMedianAbsoluteDeviation.py new file mode 100644 index 0000000..5c08ecb --- /dev/null +++ b/tods/tests/test_StatisticalMedianAbsoluteDeviation.py @@ -0,0 +1,109 @@ +import unittest + +from d3m import container, utils +from d3m.metadata import base as metadata_base + +from feature_analysis import StatisticalMedianAbsoluteDeviation + +class StatisticalMedianAbsoluteDeviationTestCase(unittest.TestCase): + def test_basic(self): + self.maxDiff=None + main = container.DataFrame({'timestamp': [1, 3, 2, 5], 'values': [1.0, 2.0, 3.0, 4.0], 'b': [1.0, 4.0, 5.0, 6.0]}, + columns=['timestamp', 'values', 'b'], + generate_metadata=True) + + + + self.assertEqual(utils.to_json_structure(main.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + # 'top_level': 'main', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 4, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], 
+ 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': {'structural_type': 'numpy.int64', 'name': 'timestamp'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'values'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'b'}, + }]) + hyperparams_class = StatisticalMedianAbsoluteDeviation.StatisticalMedianAbsoluteDeviationPrimitive.metadata.get_hyperparams() + + hp = hyperparams_class.defaults().replace({ + 'use_columns': [1,2], + 'use_semantic_types' : True, + 'window_size':2 + }) + + primitive = StatisticalMedianAbsoluteDeviation.StatisticalMedianAbsoluteDeviationPrimitive(hyperparams=hp) + + output_main = primitive.produce(inputs=main).value + print(output_main[['values_median_absolute_deviation', 'b_median_absolute_deviation']]) + + expected_output = container.DataFrame( + {'timestamp': [1, 3, 2, 5], 'values': [1.0, 2.0, 3.0, 4.0], 'b': [1.0, 4.0, 5.0, 6.0], + 'values_median_absolute_deviation': [0.7413, 0.7413, 0.7413, 0.7413], 'b_median_absolute_deviation': [2.2239, 2.2239, 0.7413, 0.7413]}, + columns=['timestamp', 'values', 'b', 'values_median_absolute_deviation', 'b_median_absolute_deviation']) + + self.assertEqual(output_main[['timestamp', 'values', 'b', 'values_median_absolute_deviation', + 'b_median_absolute_deviation']].values.tolist(), expected_output[ + ['timestamp', 'values', 'b', 'values_median_absolute_deviation', 'b_median_absolute_deviation' + ]].values.tolist()) + + self.assertEqual(utils.to_json_structure(output_main.metadata.to_internal_simple_structure()), + [{'metadata': {'dimension': {'length': 4, + 'name': 'rows', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/TabularRow']}, + 'schema': 'https://metadata.datadrivendiscovery.org/schemas/v0/container.json', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'structural_type': 'd3m.container.pandas.DataFrame'}, + 'selector': []}, + {'metadata': {'dimension': {'length': 5, + 'name': 'columns', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/TabularColumn']}}, + 'selector': ['__ALL_ELEMENTS__']}, + {'metadata': {'name': 'timestamp', 'structural_type': 'numpy.int64'}, + 'selector': ['__ALL_ELEMENTS__', 0]}, + {'metadata': {'name': 'values', 'structural_type': 'numpy.float64'}, + 'selector': ['__ALL_ELEMENTS__', 1]}, + {'metadata': {'name': 'b', 'structural_type': 'numpy.float64'}, + 'selector': ['__ALL_ELEMENTS__', 2]}, + {'metadata': {'name': 'values_median_absolute_deviation', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.float64'}, + 'selector': ['__ALL_ELEMENTS__', 3]}, + {'metadata': {'name': 'b_median_absolute_deviation', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.float64'}, + 'selector': ['__ALL_ELEMENTS__', 4]}, + + ]) + + + params = primitive.get_params() + primitive.set_params(params=params) + + +if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/tods/tests/test_StatisticalMinimum.py b/tods/tests/test_StatisticalMinimum.py new file mode 100644 index 0000000..b258283 --- /dev/null +++ b/tods/tests/test_StatisticalMinimum.py @@ -0,0 +1,108 @@ +import unittest + +from d3m import container, utils +from d3m.metadata import base as metadata_base + +from feature_analysis import StatisticalMinimum + +class 
StatisticalMinimumTestCase(unittest.TestCase): + def test_basic(self): + self.maxDiff=None + main = container.DataFrame({'timestamp': [1, 3, 2, 5], 'values': [1.0, 2.0, 3.0, 4.0], 'b': [1.0, 4.0, 5.0, 6.0]}, + columns=['timestamp', 'values', 'b'], + generate_metadata=True) + + + + self.assertEqual(utils.to_json_structure(main.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + # 'top_level': 'main', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 4, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': {'structural_type': 'numpy.int64', 'name': 'timestamp'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'values'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'b'}, + }]) + hyperparams_class = StatisticalMinimum.StatisticalMinimumPrimitive.metadata.get_hyperparams() + + hp = hyperparams_class.defaults().replace({ + 'use_columns': [1,2], + 'use_semantic_types' : True, + 'window_size':2 + }) + + primitive = StatisticalMinimum.StatisticalMinimumPrimitive(hyperparams=hp) + + output_main = primitive.produce(inputs=main).value + print(output_main) + expected_output = container.DataFrame( + {'timestamp': [1, 3, 2, 5], 'values': [1.0, 2.0, 3.0, 4.0], 'b': [1.0, 4.0, 5.0, 6.0], + 'values_minimum': [1.0,1.0, 2.0, 3.0], 'b_minimum': [1.0,1.0, 4.0, 5.0]}, + columns=['timestamp', 'values', 'b', 'values_minimum', 'b_minimum']) + + self.assertEqual(output_main[['timestamp', 'values', 'b', 'values_minimum', + 'b_minimum']].values.tolist(), expected_output[ + ['timestamp', 'values', 'b', 'values_minimum', 'b_minimum' + ]].values.tolist()) + + self.assertEqual(utils.to_json_structure(output_main.metadata.to_internal_simple_structure()), + [{'metadata': {'dimension': {'length': 4, + 'name': 'rows', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/TabularRow']}, + 'schema': 'https://metadata.datadrivendiscovery.org/schemas/v0/container.json', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'structural_type': 'd3m.container.pandas.DataFrame'}, + 'selector': []}, + {'metadata': {'dimension': {'length': 5, + 'name': 'columns', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/TabularColumn']}}, + 'selector': ['__ALL_ELEMENTS__']}, + {'metadata': {'name': 'timestamp', 'structural_type': 'numpy.int64'}, + 'selector': ['__ALL_ELEMENTS__', 0]}, + {'metadata': {'name': 'values', 'structural_type': 'numpy.float64'}, + 'selector': ['__ALL_ELEMENTS__', 1]}, + {'metadata': {'name': 'b', 'structural_type': 'numpy.float64'}, + 'selector': ['__ALL_ELEMENTS__', 2]}, + {'metadata': {'name': 'values_minimum', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.float64'}, + 'selector': ['__ALL_ELEMENTS__', 3]}, + {'metadata': {'name': 'b_minimum', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.float64'}, + 
'selector': ['__ALL_ELEMENTS__', 4]}, + + ]) + + + params = primitive.get_params() + primitive.set_params(params=params) + + +if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/tods/tests/test_StatisticalSkew.py b/tods/tests/test_StatisticalSkew.py new file mode 100644 index 0000000..17fdde0 --- /dev/null +++ b/tods/tests/test_StatisticalSkew.py @@ -0,0 +1,109 @@ +import unittest + +from d3m import container, utils +from d3m.metadata import base as metadata_base + +from feature_analysis import StatisticalSkew + +class StatisticalSkewTestCase(unittest.TestCase): + def test_basic(self): + self.maxDiff=None + main = container.DataFrame({'timestamp': [1, 3, 2, 5], 'values': [1.0, 2.0, 3.0, 4.0], 'b': [1.0, 4.0, 5.0, 6.0]}, + columns=['timestamp', 'values', 'b'], + generate_metadata=True) + + + + self.assertEqual(utils.to_json_structure(main.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + # 'top_level': 'main', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 4, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': {'structural_type': 'numpy.int64', 'name': 'timestamp'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'values'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'b'}, + }]) + hyperparams_class = StatisticalSkew.StatisticalSkewPrimitive.metadata.get_hyperparams() + + hp = hyperparams_class.defaults().replace({ + 'use_columns': [1,2], + 'use_semantic_types' : True, + 'window_size':2 + }) + + primitive = StatisticalSkew.StatisticalSkewPrimitive(hyperparams=hp) + + output_main = primitive.produce(inputs=main).value + print(output_main[['values_skew', 'b_skew']]) + + expected_output = container.DataFrame( + {'timestamp': [1, 3, 2, 5], 'values': [1.0, 2.0, 3.0, 4.0], 'b': [1.0, 4.0, 5.0, 6.0], + 'values_skew': [0.0,0.0,0.0,0.0], 'b_skew': [0.0,0.0,0.0,0.0]}, + columns=['timestamp', 'values', 'b', 'values_skew', 'b_skew']) + + self.assertEqual(output_main[['timestamp', 'values', 'b', 'values_skew', + 'b_skew']].values.tolist(), expected_output[ + ['timestamp', 'values', 'b', 'values_skew', 'b_skew' + ]].values.tolist()) + + self.assertEqual(utils.to_json_structure(output_main.metadata.to_internal_simple_structure()), + [{'metadata': {'dimension': {'length': 4, + 'name': 'rows', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/TabularRow']}, + 'schema': 'https://metadata.datadrivendiscovery.org/schemas/v0/container.json', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'structural_type': 'd3m.container.pandas.DataFrame'}, + 'selector': []}, + {'metadata': {'dimension': {'length': 5, + 'name': 'columns', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/TabularColumn']}}, + 'selector': ['__ALL_ELEMENTS__']}, + {'metadata': {'name': 'timestamp', 'structural_type': 'numpy.int64'}, + 'selector': ['__ALL_ELEMENTS__', 0]}, + {'metadata': {'name': 'values', 
'structural_type': 'numpy.float64'}, + 'selector': ['__ALL_ELEMENTS__', 1]}, + {'metadata': {'name': 'b', 'structural_type': 'numpy.float64'}, + 'selector': ['__ALL_ELEMENTS__', 2]}, + {'metadata': {'name': 'values_skew', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.float64'}, + 'selector': ['__ALL_ELEMENTS__', 3]}, + {'metadata': {'name': 'b_skew', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.float64'}, + 'selector': ['__ALL_ELEMENTS__', 4]}, + + ]) + + + params = primitive.get_params() + primitive.set_params(params=params) + + +if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/tods/tests/test_StatisticalVar.py b/tods/tests/test_StatisticalVar.py new file mode 100644 index 0000000..b98a562 --- /dev/null +++ b/tods/tests/test_StatisticalVar.py @@ -0,0 +1,108 @@ +import unittest + +from d3m import container, utils +from d3m.metadata import base as metadata_base + +from feature_analysis import StatisticalVar + +class StatisticalVarTestCase(unittest.TestCase): + def test_basic(self): + self.maxDiff=None + main = container.DataFrame({'timestamp': [1, 3, 2, 5], 'values': [1.0, 2.0, 3.0, 4.0], 'b': [1.0, 4.0, 5.0, 6.0]}, + columns=['timestamp', 'values', 'b'], + generate_metadata=True) + + + + self.assertEqual(utils.to_json_structure(main.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + # 'top_level': 'main', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 4, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': {'structural_type': 'numpy.int64', 'name': 'timestamp'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'values'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'b'}, + }]) + hyperparams_class = StatisticalVar.StatisticalVarPrimitive.metadata.get_hyperparams() + + hp = hyperparams_class.defaults().replace({ + 'use_columns': [1,2], + 'use_semantic_types' : True, + 'window_size':2 + }) + + primitive = StatisticalVar.StatisticalVarPrimitive(hyperparams=hp) + + output_main = primitive.produce(inputs=main).value + print(output_main) + expected_output = container.DataFrame( + {'timestamp': [1, 3, 2, 5], 'values': [1.0, 2.0, 3.0, 4.0], 'b': [1.0, 4.0, 5.0, 6.0], + 'values_var': [0.25, 0.25, 0.25, 0.25], 'b_var': [2.25, 2.25, 0.25, 0.25]}, + columns=['timestamp', 'values', 'b', 'values_var', 'b_var']) + + self.assertEqual(output_main[['timestamp', 'values', 'b', 'values_var', + 'b_var']].values.tolist(), expected_output[ + ['timestamp', 'values', 'b', 'values_var', 'b_var' + ]].values.tolist()) + + self.assertEqual(utils.to_json_structure(output_main.metadata.to_internal_simple_structure()), + [{'metadata': {'dimension': {'length': 4, + 'name': 'rows', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/TabularRow']}, + 'schema': 
'https://metadata.datadrivendiscovery.org/schemas/v0/container.json', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'structural_type': 'd3m.container.pandas.DataFrame'}, + 'selector': []}, + {'metadata': {'dimension': {'length': 5, + 'name': 'columns', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/TabularColumn']}}, + 'selector': ['__ALL_ELEMENTS__']}, + {'metadata': {'name': 'timestamp', 'structural_type': 'numpy.int64'}, + 'selector': ['__ALL_ELEMENTS__', 0]}, + {'metadata': {'name': 'values', 'structural_type': 'numpy.float64'}, + 'selector': ['__ALL_ELEMENTS__', 1]}, + {'metadata': {'name': 'b', 'structural_type': 'numpy.float64'}, + 'selector': ['__ALL_ELEMENTS__', 2]}, + {'metadata': {'name': 'values_var', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.float64'}, + 'selector': ['__ALL_ELEMENTS__', 3]}, + {'metadata': {'name': 'b_var', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.float64'}, + 'selector': ['__ALL_ELEMENTS__', 4]}, + + ]) + + + params = primitive.get_params() + primitive.set_params(params=params) + + +if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/tods/tests/test_StatisticalVariation.py b/tods/tests/test_StatisticalVariation.py new file mode 100644 index 0000000..cfc2264 --- /dev/null +++ b/tods/tests/test_StatisticalVariation.py @@ -0,0 +1,109 @@ +import unittest + +from d3m import container, utils +from d3m.metadata import base as metadata_base + +from feature_analysis import StatisticalVariation + +class StatisticalVariationTestCase(unittest.TestCase): + def test_basic(self): + self.maxDiff=None + main = container.DataFrame({'timestamp': [1, 3, 2, 5], 'values': [1.0, 2.0, 3.0, 4.0], 'b': [1.0, 4.0, 5.0, 6.0]}, + columns=['timestamp', 'values', 'b'], + generate_metadata=True) + + + + self.assertEqual(utils.to_json_structure(main.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + # 'top_level': 'main', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 4, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': {'structural_type': 'numpy.int64', 'name': 'timestamp'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'values'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'b'}, + }]) + hyperparams_class = StatisticalVariation.StatisticalVariationPrimitive.metadata.get_hyperparams() + + hp = hyperparams_class.defaults().replace({ + 'use_columns': [1,2], + 'use_semantic_types' : True, + 'window_size':2 + }) + + primitive = StatisticalVariation.StatisticalVariationPrimitive(hyperparams=hp) + + output_main = primitive.produce(inputs=main).value + print(output_main[['values_variation', 'b_variation']]) + + expected_output = container.DataFrame( + {'timestamp': [1, 3, 2, 5], 'values': [1.0, 2.0, 3.0, 4.0], 'b': [1.0, 4.0, 5.0, 6.0], + 
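             # The expected values below are consistent with a rolling coefficient of
             # variation (standard deviation divided by mean) over a trailing window of
             # size 2, rounded to four decimals: for 'values', window [1.0, 2.0] gives
             # 0.5/1.5 ~= 0.3333, [2.0, 3.0] gives 0.5/2.5 = 0.2000, [3.0, 4.0] gives
             # 0.5/3.5 ~= 0.1429, with the first row reusing the first full window
             # (similarly for 'b'). This reading is inferred from the numbers, not from
             # the primitive's documented formula.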
'values_variation': [0.3333, 0.3333, 0.2000, 0.1429], 'b_variation': [0.6000, 0.6000, 0.1111, 0.0909]}, + columns=['timestamp', 'values', 'b', 'values_variation', 'b_variation']) + + self.assertEqual(output_main[['timestamp', 'values', 'b', 'values_variation', + 'b_variation']].values.tolist(), expected_output[ + ['timestamp', 'values', 'b', 'values_variation', 'b_variation' + ]].values.tolist()) + + self.assertEqual(utils.to_json_structure(output_main.metadata.to_internal_simple_structure()), + [{'metadata': {'dimension': {'length': 4, + 'name': 'rows', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/TabularRow']}, + 'schema': 'https://metadata.datadrivendiscovery.org/schemas/v0/container.json', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'structural_type': 'd3m.container.pandas.DataFrame'}, + 'selector': []}, + {'metadata': {'dimension': {'length': 5, + 'name': 'columns', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/TabularColumn']}}, + 'selector': ['__ALL_ELEMENTS__']}, + {'metadata': {'name': 'timestamp', 'structural_type': 'numpy.int64'}, + 'selector': ['__ALL_ELEMENTS__', 0]}, + {'metadata': {'name': 'values', 'structural_type': 'numpy.float64'}, + 'selector': ['__ALL_ELEMENTS__', 1]}, + {'metadata': {'name': 'b', 'structural_type': 'numpy.float64'}, + 'selector': ['__ALL_ELEMENTS__', 2]}, + {'metadata': {'name': 'values_variation', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.float64'}, + 'selector': ['__ALL_ELEMENTS__', 3]}, + {'metadata': {'name': 'b_variation', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.float64'}, + 'selector': ['__ALL_ELEMENTS__', 4]}, + + ]) + + + params = primitive.get_params() + primitive.set_params(params=params) + + +if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/tods/tests/test_StatisticalVecSum.py b/tods/tests/test_StatisticalVecSum.py new file mode 100644 index 0000000..648f23b --- /dev/null +++ b/tods/tests/test_StatisticalVecSum.py @@ -0,0 +1,108 @@ +import unittest + +from d3m import container, utils +from d3m.metadata import base as metadata_base + +from feature_analysis import StatisticalVecSum + +class StatisticalVecSumTestCase(unittest.TestCase): + def test_basic(self): + self.maxDiff=None + main = container.DataFrame({'timestamp': [1, 3, 2, 5], 'values': [1.0, 2.0, 3.0, 4.0], 'b': [1.0, 4.0, -5.0, 6.0]}, + columns=['timestamp', 'values', 'b'], + generate_metadata=True) + + + + self.assertEqual(utils.to_json_structure(main.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + # 'top_level': 'main', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 4, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': {'structural_type': 'numpy.int64', 'name': 'timestamp'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'values'}, + }, { + 
'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'b'}, + }]) + hyperparams_class = StatisticalVecSum.StatisticalVecSumPrimitive.metadata.get_hyperparams() + + hp = hyperparams_class.defaults().replace({ + 'use_columns': [1,2], + 'use_semantic_types' : True, + 'window_size':2 + }) + + primitive = StatisticalVecSum.StatisticalVecSumPrimitive(hyperparams=hp) + + output_main = primitive.produce(inputs=main).value + print(output_main) + expected_output = container.DataFrame( + {'timestamp': [1, 3, 2, 5], 'values': [1.0, 2.0, 3.0, 4.0], 'b': [1.0, 4.0, -5.0, 6.0], + 'values_vec_sum': [3.0,3.0,5.0,7.0], 'b_vec_sum': [5.0,5.0,-1.0,1.0]}, + columns=['timestamp', 'values', 'b', 'values_vec_sum', 'b_vec_sum']) + + self.assertEqual(output_main[['timestamp', 'values', 'b', 'values_vec_sum', + 'b_vec_sum']].values.tolist(), expected_output[ + ['timestamp', 'values', 'b', 'values_vec_sum', 'b_vec_sum' + ]].values.tolist()) + + self.assertEqual(utils.to_json_structure(output_main.metadata.to_internal_simple_structure()), + [{'metadata': {'dimension': {'length': 4, + 'name': 'rows', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/TabularRow']}, + 'schema': 'https://metadata.datadrivendiscovery.org/schemas/v0/container.json', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'structural_type': 'd3m.container.pandas.DataFrame'}, + 'selector': []}, + {'metadata': {'dimension': {'length': 5, + 'name': 'columns', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/TabularColumn']}}, + 'selector': ['__ALL_ELEMENTS__']}, + {'metadata': {'name': 'timestamp', 'structural_type': 'numpy.int64'}, + 'selector': ['__ALL_ELEMENTS__', 0]}, + {'metadata': {'name': 'values', 'structural_type': 'numpy.float64'}, + 'selector': ['__ALL_ELEMENTS__', 1]}, + {'metadata': {'name': 'b', 'structural_type': 'numpy.float64'}, + 'selector': ['__ALL_ELEMENTS__', 2]}, + {'metadata': {'name': 'values_vec_sum', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.float64'}, + 'selector': ['__ALL_ELEMENTS__', 3]}, + {'metadata': {'name': 'b_vec_sum', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.float64'}, + 'selector': ['__ALL_ELEMENTS__', 4]}, + + ]) + + + params = primitive.get_params() + primitive.set_params(params=params) + + +if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/tods/tests/test_StatisticalWillisonAmplitude.py b/tods/tests/test_StatisticalWillisonAmplitude.py new file mode 100644 index 0000000..5777885 --- /dev/null +++ b/tods/tests/test_StatisticalWillisonAmplitude.py @@ -0,0 +1,109 @@ +import unittest + +from d3m import container, utils +from d3m.metadata import base as metadata_base + +from feature_analysis import StatisticalWillisonAmplitude + +class StatisticalWillisonAmplitudeTestCase(unittest.TestCase): + def test_basic(self): + self.maxDiff=None + main = container.DataFrame({'timestamp': [1, 3, 2, 5], 'values': [1.0, 2.0, 3.0, 4.0], 'b': [1.0, 4.0, 5.0, 6.0]}, + columns=['timestamp', 'values', 'b'], + generate_metadata=True) + + + + self.assertEqual(utils.to_json_structure(main.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + # 'top_level': 'main', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': 
['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 4, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': {'structural_type': 'numpy.int64', 'name': 'timestamp'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'values'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'b'}, + }]) + hyperparams_class = StatisticalWillisonAmplitude.StatisticalWillisonAmplitudePrimitive.metadata.get_hyperparams() + + hp = hyperparams_class.defaults().replace({ + 'use_columns': [1,2], + 'use_semantic_types' : True, + 'window_size':2, + 'threshold':1 + }) + + primitive = StatisticalWillisonAmplitude.StatisticalWillisonAmplitudePrimitive(hyperparams=hp) + + output_main = primitive.produce(inputs=main).value + print(output_main[['values_willison_amplitude', 'b_willison_amplitude']]) + expected_output = container.DataFrame( + {'timestamp': [1, 3, 2, 5], 'values': [1.0, 2.0, 3.0, 4.0], 'b': [1.0, 4.0, 5.0, 6.0], + 'values_willison_amplitude': [0.0,0.0,0.0,0.0], 'b_willison_amplitude': [1.0,1.0,1.0,0.0]}, + columns=['timestamp', 'values', 'b', 'values_willison_amplitude', 'b_willison_amplitude']) + + self.assertEqual(output_main[['timestamp', 'values', 'b', 'values_willison_amplitude', + 'b_willison_amplitude']].values.tolist(), expected_output[ + ['timestamp', 'values', 'b', 'values_willison_amplitude', 'b_willison_amplitude' + ]].values.tolist()) + + self.assertEqual(utils.to_json_structure(output_main.metadata.to_internal_simple_structure()), + [{'metadata': {'dimension': {'length': 4, + 'name': 'rows', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/TabularRow']}, + 'schema': 'https://metadata.datadrivendiscovery.org/schemas/v0/container.json', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'structural_type': 'd3m.container.pandas.DataFrame'}, + 'selector': []}, + {'metadata': {'dimension': {'length': 5, + 'name': 'columns', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/TabularColumn']}}, + 'selector': ['__ALL_ELEMENTS__']}, + {'metadata': {'name': 'timestamp', 'structural_type': 'numpy.int64'}, + 'selector': ['__ALL_ELEMENTS__', 0]}, + {'metadata': {'name': 'values', 'structural_type': 'numpy.float64'}, + 'selector': ['__ALL_ELEMENTS__', 1]}, + {'metadata': {'name': 'b', 'structural_type': 'numpy.float64'}, + 'selector': ['__ALL_ELEMENTS__', 2]}, + {'metadata': {'name': 'values_willison_amplitude', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.float64'}, + 'selector': ['__ALL_ELEMENTS__', 3]}, + {'metadata': {'name': 'b_willison_amplitude', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.float64'}, + 'selector': ['__ALL_ELEMENTS__', 4]}, + + ]) + + + params = primitive.get_params() + primitive.set_params(params=params) + + +if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/tods/tests/test_StatisticalZeroCrossing.py b/tods/tests/test_StatisticalZeroCrossing.py new file mode 100644 index 0000000..d52bf26 
--- /dev/null +++ b/tods/tests/test_StatisticalZeroCrossing.py @@ -0,0 +1,107 @@ +import unittest + +from d3m import container, utils +from d3m.metadata import base as metadata_base + +from feature_analysis import StatisticalZeroCrossing + +class StatisticalZeroCrossingTestCase(unittest.TestCase): + def test_basic(self): + self.maxDiff=None + main = container.DataFrame({'timestamp': [1, 3, 2, 5], 'values': [1.0, -2.0, 3.0, 4.0], 'b': [1.0, 4.0, 5.0, 6.0]}, + columns=['timestamp', 'values', 'b'], + generate_metadata=True) + + + + self.assertEqual(utils.to_json_structure(main.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + # 'top_level': 'main', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 4, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': {'structural_type': 'numpy.int64', 'name': 'timestamp'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'values'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'b'}, + }]) + hyperparams_class = StatisticalZeroCrossing.StatisticalZeroCrossingPrimitive.metadata.get_hyperparams() + + hp = hyperparams_class.defaults().replace({ + 'use_columns': [1,2], + 'use_semantic_types' : True + }) + + primitive = StatisticalZeroCrossing.StatisticalZeroCrossingPrimitive(hyperparams=hp) + + output_main = primitive.produce(inputs=main).value + print(output_main) + expected_output = container.DataFrame( + {'timestamp': [1, 3, 2, 5], 'values': [1.0, -2.0, 3.0, 4.0], 'b': [1.0, 4.0, 5.0, 6.0], + 'values_zero_crossing': [0,1,1,0], 'b_zero_crossing': [0,0,0,0]}, + columns=['timestamp', 'values', 'b', 'values_zero_crossing', 'b_zero_crossing']) + + self.assertEqual(output_main[['timestamp', 'values', 'b', 'values_zero_crossing', + 'b_zero_crossing']].values.tolist(), expected_output[ + ['timestamp', 'values', 'b', 'values_zero_crossing', 'b_zero_crossing' + ]].values.tolist()) + + self.assertEqual(utils.to_json_structure(output_main.metadata.to_internal_simple_structure()), + [{'metadata': {'dimension': {'length': 4, + 'name': 'rows', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/TabularRow']}, + 'schema': 'https://metadata.datadrivendiscovery.org/schemas/v0/container.json', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'structural_type': 'd3m.container.pandas.DataFrame'}, + 'selector': []}, + {'metadata': {'dimension': {'length': 5, + 'name': 'columns', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/TabularColumn']}}, + 'selector': ['__ALL_ELEMENTS__']}, + {'metadata': {'name': 'timestamp', 'structural_type': 'numpy.int64'}, + 'selector': ['__ALL_ELEMENTS__', 0]}, + {'metadata': {'name': 'values', 'structural_type': 'numpy.float64'}, + 'selector': ['__ALL_ELEMENTS__', 1]}, + {'metadata': {'name': 'b', 'structural_type': 'numpy.float64'}, + 'selector': ['__ALL_ELEMENTS__', 2]}, + {'metadata': {'name': 'values_zero_crossing', + 'semantic_types': 
['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.float64'}, + 'selector': ['__ALL_ELEMENTS__', 3]}, + {'metadata': {'name': 'b_zero_crossing', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.float64'}, + 'selector': ['__ALL_ELEMENTS__', 4]}, + + ]) + + + params = primitive.get_params() + primitive.set_params(params=params) + + +if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/tods/tests/test_TRMF.py b/tods/tests/test_TRMF.py new file mode 100644 index 0000000..57470c3 --- /dev/null +++ b/tods/tests/test_TRMF.py @@ -0,0 +1,122 @@ +import unittest + +from d3m import container, utils +from d3m.metadata import base as metadata_base +from feature_analysis import TRMF + + +class TRMFTest(unittest.TestCase): + def test_basic(self): + self.maxDiff = None + main = container.DataFrame({'a': [1., 2., 3.], 'b': [2., 3., 4.], 'c': [3., 4., 5.],}, + columns=['a', 'b', 'c'], + generate_metadata=True) + + print(main) + + + self.assertEqual(utils.to_json_structure(main.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + # 'top_level': 'main', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'a'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'b'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'c'} + }]) + + + self.assertIsInstance(main, container.DataFrame) + + + hyperparams_class = TRMF.TRMF.metadata.get_hyperparams() + primitive = TRMF.TRMF(hyperparams=hyperparams_class.defaults()) + # primitive.set_training_data(inputs=main) + # primitive.fit() + new_main = primitive.produce(inputs=main).value + print(new_main) + + + self.assertEqual(utils.to_json_structure(new_main.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + # 'top_level': 'main', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 5, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': { + 'name': 'a', + 'structural_type': 'numpy.float64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': { + 'name': 'b', + 'structural_type': 'numpy.float64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': { + 'name': 'c', + 'structural_type': 'numpy.float64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 3], + 'metadata': { + 'name': 'output_0', + 
'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.float64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 4], + 'metadata': { + 'name': 'output_1', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.float64', + }, + }]) + + + +if __name__ == '__main__': + unittest.main() diff --git a/tods/tests/test_Telemanom.py b/tods/tests/test_Telemanom.py new file mode 100644 index 0000000..cae82e6 --- /dev/null +++ b/tods/tests/test_Telemanom.py @@ -0,0 +1,120 @@ +import unittest + +from d3m import container, utils +from d3m.metadata import base as metadata_base +from detection_algorithm.Telemanom import TelemanomPrimitive + + +class SODTest(unittest.TestCase): + def test_basic(self): + self.maxDiff = None + main = container.DataFrame({'a': [1., 2., 3., 4.,5,6,7,8,9], 'b': [2., 3., 4., 5.,6,7,8,9,10], 'c': [3., 4., 5., 6.,7,8,9,10,11]}, + columns=['a', 'b', 'c'], + generate_metadata=True) + + print(main) + + + self.assertEqual(utils.to_json_structure(main.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + # 'top_level': 'main', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 9, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'a'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'b'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'c'} + }]) + + + self.assertIsInstance(main, container.DataFrame) + + hyperparams_class = TelemanomPrimitive.metadata.get_hyperparams() + hyperparams = hyperparams_class.defaults() + hyperparams = hyperparams.replace({'l_s': 2,'n_predictions':1,'return_result':'new','return_subseq_inds':True,'use_columns':(0,1,2)}) + + # print("hyperparams",hyperparams) + + primitive = TelemanomPrimitive(hyperparams=hyperparams) + primitive.set_training_data(inputs=main) + primitive.fit() + new_main = primitive.produce_score(inputs=main).value + + print("new main",new_main) + + # print(utils.to_json_structure(new_main.metadata.to_internal_simple_structure())) + self.assertEqual(utils.to_json_structure(new_main.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + # 'top_level': 'main', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 6, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': { + 'name': 'Telemanom0_0', + 'structural_type': 'numpy.float64', + 'semantic_types': 
['https://metadata.datadrivendiscovery.org/types/Attribute'], + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': { + 'structural_type': 'numpy.float64', + 'name': 'Telemanom0_1', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': { + 'structural_type': 'numpy.float64', + 'name': 'Telemanom0_2', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + } + }]) + + +if __name__ == '__main__': + unittest.main() + + + + + diff --git a/tods/tests/test_TimeSeriesSeasonalityTrendDecomposition.py b/tods/tests/test_TimeSeriesSeasonalityTrendDecomposition.py new file mode 100644 index 0000000..7edd499 --- /dev/null +++ b/tods/tests/test_TimeSeriesSeasonalityTrendDecomposition.py @@ -0,0 +1,114 @@ + + +import unittest + +from d3m import container, utils +from d3m.metadata import base as metadata_base + +from timeseries_processing import TimeSeriesSeasonalityTrendDecomposition + + +class TimeSeriesSeasonalityTrendDecompositionTestCase(unittest.TestCase): + def test_basic(self): + self.maxDiff=None + main = container.DataFrame({'timestamp': [1, 3, 2, 5], 'values': [1.0, 2.0, 3.0, 4.0], 'b': [1.0, 4.0, 5.0, 6.0]}, + columns=['timestamp', 'values', 'b'], + generate_metadata=True) + + + + self.assertEqual(utils.to_json_structure(main.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + # 'top_level': 'main', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 4, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': {'structural_type': 'numpy.int64', 'name': 'timestamp'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'values'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'b'}, + }]) + hyperparams_class = TimeSeriesSeasonalityTrendDecomposition.TimeSeriesSeasonalityTrendDecompositionPrimitive.metadata.get_hyperparams() + + hp = hyperparams_class.defaults().replace({ + 'use_columns': [1,2], + 'use_semantic_types' : True + }) + + primitive = TimeSeriesSeasonalityTrendDecomposition.TimeSeriesSeasonalityTrendDecompositionPrimitive(hyperparams=hp) + + output_main = primitive.produce(inputs=main).value + + expected_output = container.DataFrame({'timestamp': [1,3,2,5], 'values': [1.0,2.0,3.0,4.0],'b': [1.0, 4.0, 5.0, 6.0], + 'values_trend':[1.0,2.0,3.0,4.0],'values_seasonal':[0.0,0.0,0.0,0.0],'b_trend':[1.0, 4.0, 5.0, 6.0],'b_seasonal':[0.0,0.0,0.0,0.0]}, + columns=['timestamp', 'values', 'b','values_trend','values_seasonal','b_trend','b_seasonal']) + + self.assertEqual(output_main[['timestamp','values','b','values_trend','values_seasonal','b_trend','b_seasonal']].values.tolist() , expected_output[['timestamp','values','b', 'values_trend','values_seasonal','b_trend','b_seasonal']].values.tolist()) + + self.assertEqual(utils.to_json_structure(output_main.metadata.to_internal_simple_structure()), + [{'metadata': {'dimension': {'length': 4, + 'name': 'rows', + 
'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/TabularRow']}, + 'schema': 'https://metadata.datadrivendiscovery.org/schemas/v0/container.json', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'structural_type': 'd3m.container.pandas.DataFrame'}, + 'selector': []}, + {'metadata': {'dimension': {'length': 7, + 'name': 'columns', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/TabularColumn']}}, + 'selector': ['__ALL_ELEMENTS__']}, + {'metadata': {'name': 'timestamp', 'structural_type': 'numpy.int64'}, + 'selector': ['__ALL_ELEMENTS__', 0]}, + {'metadata': {'name': 'values', 'structural_type': 'numpy.float64'}, + 'selector': ['__ALL_ELEMENTS__', 1]}, + {'metadata': {'name': 'b', 'structural_type': 'numpy.float64'}, + 'selector': ['__ALL_ELEMENTS__', 2]}, + {'metadata': {'name': 'values_trend', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.float64'}, + 'selector': ['__ALL_ELEMENTS__', 3]}, + {'metadata': {'name': 'values_seasonal', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.float64'}, + 'selector': ['__ALL_ELEMENTS__', 4]}, + {'metadata': {'name': 'b_trend', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.float64'}, + 'selector': ['__ALL_ELEMENTS__', 5]}, + {'metadata': {'name': 'b_seasonal', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.float64'}, + 'selector': ['__ALL_ELEMENTS__', 6]} + ]) + + + params = primitive.get_params() + primitive.set_params(params=params) + + +if __name__ == '__main__': + unittest.main() + diff --git a/tods/tests/test_TimeStampValidation.py b/tods/tests/test_TimeStampValidation.py new file mode 100644 index 0000000..da6a62e --- /dev/null +++ b/tods/tests/test_TimeStampValidation.py @@ -0,0 +1,100 @@ + + +import unittest + +from d3m import container, utils +from d3m.metadata import base as metadata_base + +from data_processing import TimeStampValidation + + +class TimeStampValidationTestCase(unittest.TestCase): + def test_basic(self): + + main = container.DataFrame({'timestamp': [1,3,2,5], 'a': [1.0,2.0,3.0,4.0],'b':[1.0,4.0,5.0,6.0]},columns=['timestamp', 'a', 'b'], + generate_metadata=True) + + + + + self.assertEqual(utils.to_json_structure(main.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + # 'top_level': 'main', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 4, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': {'structural_type': 'numpy.int64', 'name': 'timestamp'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'a'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'b'}, + }]) + + hyperparams_class = TimeStampValidation.TimeStampValidationPrimitive.metadata.get_hyperparams() + primitive = 
TimeStampValidation.TimeStampValidationPrimitive(hyperparams=hyperparams_class.defaults()) + output_main = primitive.produce(inputs=main).value + print(output_main[['timestamp','a','b']].values.tolist()) + expected_output = container.DataFrame({'timestamp': [1,2,3,5], 'a': [1.0,3.0,2.0,4.0],'b': [1.0,5.0,4.0,6.0]}) + + self.assertEqual(output_main[['timestamp','a','b']].values.tolist() , expected_output[['timestamp','a','b']].values.tolist()) + + self.assertEqual(utils.to_json_structure(output_main.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + # 'top_level': 'main', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 4, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': {'structural_type': 'numpy.int64', 'name': 'timestamp'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'a'}, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': {'structural_type': 'numpy.float64', 'name': 'b'}, + }]) + + params = primitive.get_params() + primitive.set_params(params=params) + + +if __name__ == '__main__': + unittest.main() + diff --git a/tods/tests/test_WaveletTransformer.py b/tods/tests/test_WaveletTransformer.py new file mode 100644 index 0000000..72bb062 --- /dev/null +++ b/tods/tests/test_WaveletTransformer.py @@ -0,0 +1,135 @@ +import unittest + +from d3m import container, utils +from d3m.metadata import base as metadata_base + +from feature_analysis.WaveletTransform import WaveletTransformer +import numpy as np +import pandas as pd +from d3m.container import DataFrame as d3m_dataframe +import os + +class WaveletTransformerTestCase(unittest.TestCase): + def test_basic(self): + self.maxDiff=None + curr_path = os.path.dirname(__file__) + dataset_fname = os.path.join(curr_path, '../../datasets/anomaly/kpi/TRAIN/dataset_TRAIN/tables/learningData.csv') + dataset = pd.read_csv(dataset_fname) + # print(dataset.columns) + value = dataset['value'] + main = d3m_dataframe(value, generate_metadata=True) + + ################## Test Wavelet transform ################## + + hyperparams_default = WaveletTransformer.metadata.get_hyperparams().defaults() + hyperparams = hyperparams_default.replace({'wavelet': 'db8', + 'level': 2, + 'inverse': 0, + 'return_result': 'new'}) + + primitive = WaveletTransformer(hyperparams=hyperparams) + new_main = primitive.produce(inputs=main).value + + # print(new_main) + # print(mean_mse, std_mse) + + # self.assertAlmostEqual(mean_mse.__float__(), 0., delta=1e-8) + # self.assertAlmostEquael(std_mse.__float__(), 0., delta=1e-8) + + # print(main.metadata.to_internal_simple_structure()) + # print(new_main.metadata.to_internal_simple_structure()) + + self.assertEqual(utils.to_json_structure(new_main.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + # 'top_level': 'main', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 
'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 3521, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 3, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': { + 'name': 'value', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.float64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 1], + 'metadata': { + 'name': 'output_1', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.float64', + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 2], + 'metadata': { + 'name': 'output_2', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.float64', + }, + }]) + + ################## Test inverse transform ################## + + hyperparams = hyperparams_default.replace({'inverse': 1}) + + primitive = WaveletTransformer(hyperparams=hyperparams) + main_recover = primitive.produce(inputs=main).value + + self.assertAlmostEqual(main_recover.values.tolist(), main.values.tolist(), delta=1e-6) + # print(main.metadata.to_internal_simple_structure()) + # print(main_recover.metadata.to_internal_simple_structure()) + + self.assertEqual(utils.to_json_structure(main_recover.metadata.to_internal_simple_structure()), [{ + 'selector': [], + 'metadata': { + # 'top_level': 'main', + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': 'd3m.container.pandas.DataFrame', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Table'], + 'dimension': { + 'name': 'rows', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 7027, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__'], + 'metadata': { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 1, + }, + }, + }, { + 'selector': ['__ALL_ELEMENTS__', 0], + 'metadata': { + 'name': 'value', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute'], + 'structural_type': 'numpy.float64', + }, + }]) + + + params = primitive.get_params() + primitive.set_params(params=params) + + +if __name__ == '__main__': + unittest.main() diff --git a/tods/tests/utils.py b/tods/tests/utils.py new file mode 100644 index 0000000..18dc51c --- /dev/null +++ b/tods/tests/utils.py @@ -0,0 +1,112 @@ +import json +import os + +from d3m import utils, container +from d3m.metadata import base as metadata_base + +from common_primitives import dataset_to_dataframe + + +def convert_metadata(metadata): + return json.loads(json.dumps(metadata, cls=utils.JsonEncoder)) + + +def load_iris_metadata(): + dataset_doc_path = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'datasets', 'iris_dataset_1', 'datasetDoc.json')) + dataset = container.Dataset.load('file://{dataset_doc_path}'.format(dataset_doc_path=dataset_doc_path)) + return dataset + + +def test_iris_metadata(test_obj, metadata, structural_type, rows_structural_type=None): + test_obj.maxDiff = None + + test_obj.assertEqual(convert_metadata(metadata.query(())), { + 'schema': metadata_base.CONTAINER_SCHEMA_VERSION, + 'structural_type': structural_type, + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/Table', + ], + 'dimension': { + 'name': 'rows', + 
'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'], + 'length': 150, + } + }) + + if rows_structural_type is None: + test_obj.assertEqual(convert_metadata(metadata.query((metadata_base.ALL_ELEMENTS,))), { + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 6, + } + }) + else: + test_obj.assertEqual(convert_metadata(metadata.query((metadata_base.ALL_ELEMENTS,))), { + 'structural_type': rows_structural_type, + 'dimension': { + 'name': 'columns', + 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'], + 'length': 6, + } + }) + + test_obj.assertEqual(convert_metadata(metadata.query((metadata_base.ALL_ELEMENTS, 0))), { + 'name': 'd3mIndex', + 'structural_type': 'str', + 'semantic_types': [ + 'http://schema.org/Integer', + 'https://metadata.datadrivendiscovery.org/types/PrimaryKey', + ], + }) + + for i in range(1, 5): + test_obj.assertEqual(convert_metadata(metadata.query((metadata_base.ALL_ELEMENTS, i))), { + 'name': ['sepalLength', 'sepalWidth', 'petalLength', 'petalWidth'][i - 1], + 'structural_type': 'str', + 'semantic_types': [ + 'http://schema.org/Float', + 'https://metadata.datadrivendiscovery.org/types/Attribute', + ], + }, i) + + test_obj.assertEqual(convert_metadata(metadata.query((metadata_base.ALL_ELEMENTS, 5))), { + 'name': 'species', + 'structural_type': 'str', + 'semantic_types': [ + 'https://metadata.datadrivendiscovery.org/types/CategoricalData', + 'https://metadata.datadrivendiscovery.org/types/SuggestedTarget', + 'https://metadata.datadrivendiscovery.org/types/Attribute', + ], + }) + + +def convert_through_json(data): + return json.loads(json.dumps(data, cls=utils.JsonEncoder)) + + +def normalize_semantic_types(data): + if isinstance(data, dict): + if 'semantic_types' in data: + # We sort them so that it is easier to compare them. 
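+            # Semantic type order is not meaningful, so sorting makes metadata comparisons in tests order-insensitive.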
+ data['semantic_types'] = sorted(data['semantic_types']) + + return {key: normalize_semantic_types(value) for key, value in data.items()} + + return data + + +def effective_metadata(metadata): + output = metadata.to_json_structure() + + for entry in output: + entry['metadata'] = normalize_semantic_types(entry['metadata']) + + return output + + +def get_dataframe(dataset): + dataset_hyperparams_class = dataset_to_dataframe.DatasetToDataFramePrimitive.metadata.get_hyperparams() + dataframe_primitive = dataset_to_dataframe.DatasetToDataFramePrimitive(hyperparams=dataset_hyperparams_class.defaults()) + dataframe = dataframe_primitive.produce(inputs=dataset).value + return dataframe diff --git a/tods/timeseries_processing/.HoltSmoothing.py.swo b/tods/timeseries_processing/.HoltSmoothing.py.swo new file mode 100644 index 0000000..6160edf Binary files /dev/null and b/tods/timeseries_processing/.HoltSmoothing.py.swo differ diff --git a/tods/timeseries_processing/HoltSmoothing.py b/tods/timeseries_processing/HoltSmoothing.py new file mode 100644 index 0000000..6ee1015 --- /dev/null +++ b/tods/timeseries_processing/HoltSmoothing.py @@ -0,0 +1,340 @@ +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os +import sklearn +import numpy +import typing +import pandas as pd +# Custom import commands if any +from sklearn.preprocessing.data import Normalizer +from statsmodels.tsa.api import ExponentialSmoothing, SimpleExpSmoothing, Holt + + +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult,DockerContainer +from d3m.primitive_interfaces.unsupervised_learning import UnsupervisedLearnerPrimitiveBase + +import os +from typing import Any,Optional,List + +from d3m import container, utils as d3m_utils +from d3m.metadata import base as metadata_base +from d3m.metadata import hyperparams,params +from d3m.primitive_interfaces import base, transformer + +Inputs = d3m_dataframe +Outputs = d3m_dataframe + + +class Params(params.Params): + input_column_names: Optional[Any] + target_names_: Optional[Sequence[Any]] + training_indices_: Optional[Sequence[int]] + target_column_indices_: Optional[Sequence[int]] + target_columns_metadata_: Optional[List[OrderedDict]] + + + +class Hyperparams(hyperparams.Hyperparams): + # Added by Mia + endog = hyperparams.Bounded[int]( + lower = 2, + upper = None, + default = 3, + description='Array like time series.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + # keep previous + norm = hyperparams.Enumeration[str]( + default='l2', + values=['l1', 'l2', 'max'], + description='The norm to use to normalize each non zero sample.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + use_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to operate on. 
If any specified column cannot be parsed, it is skipped.", + ) + exclude_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='new', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe", + ) + + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute'], + default='https://metadata.datadrivendiscovery.org/types/Attribute', + description='Decides what semantic type to attach to generated attributes', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + +class HoltSmoothing(UnsupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]): + """ + Holt Smoothing + `statsmodels documentation `_ + + """ + + __author__ = "DATA Lab at Texas A&M University" + metadata = metadata_base.PrimitiveMetadata({ + "algorithm_types": [metadata_base.PrimitiveAlgorithmType.HOLT_SMOOTHING, ], + "name": "statsmodels.preprocessing.HoltSmoothing", + "primitive_family": metadata_base.PrimitiveFamily.DATA_PREPROCESSING, + "python_path": "d3m.primitives.tods.timeseries_processing.transformation.holt_smoothing", + "source": {'name': 'DATA Lab at Texas A&M University', 'contact': 'mailto:khlai037@tamu.edu', 'uris': ['https://gitlab.com/lhenry15/tods.git','https://gitlab.com/lhenry15/tods/-/blob/mia/anomaly-primitives/anomaly_primitives/HoltSmoothing.py']}, + "version": "0.0.1", + "id": "3688b5b4-885c-40bb-9731-fe3969ea81b0", + "hyperparams_to_tune": ['endog','use_columns'], + }) + + def __init__(self, *, + hyperparams: Hyperparams, + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None) -> None: + + super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) + + # False + self._clf = Normalizer( + norm=self.hyperparams['norm'], + ) + + self._inputs = None + self._outputs = None + self._training_inputs = None + self._training_outputs = None + self._target_names = 
None + self._training_indices = None + self._target_column_indices = None + self._target_columns_metadata: List[OrderedDict] = None + self._input_column_names = None + self._fitted = False + + + def set_training_data(self, *, inputs: Inputs) -> None: + self._inputs = inputs + self._fitted = False + + def fit(self, *, timeout: float = None, iterations: int = None)-> CallResult[None]: + if self._fitted: + return CallResult(None) + + self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + if self._training_inputs is None: + return CallResult(None) + + if len(self._training_indices) > 0: + self._clf.fit(self._training_inputs) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + return CallResult(None) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + + self.logger.info('Holt Smoothing Primitive called') + outputs = inputs + self._training_inputs, self._training_indices = self._get_columns_to_fit(inputs, self.hyperparams) + try: + columns_to_calculate_holt_smoothing= List[str] + if(self.hyperparams['use_columns']==()): + columns_to_calculate_holt_smoothing = list(set(inputs.columns)-set(['d3mIndex','timestamp','ground_truth'])) + else: + columns_to_calculate_holt_smoothing = self.hyperparams['use_columns'] + for column in self._training_indices: + outputs[inputs.columns[column]+"_holt_smoothing"] = Holt(inputs.iloc[:, column]).fit(smoothing_level=0.2, smoothing_slope=0.2,optimized=False).fittedvalues + except Exception as e: + self.logger.error("Error in Calculating Holt smoothing",e) + self._update_metadata(outputs) + #print(inputs) + #print("-------------") + print(outputs) + + return base.CallResult(outputs) + + def _update_metadata(self, outputs): + outputs.metadata = outputs.metadata.generate(outputs,) + + def get_params(self) -> Params: + if not self._fitted: + return Params( + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + return Params( + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + def set_params(self, *, params: Params) -> None: + self._input_column_names = params['input_column_names'] + self._training_indices = params['training_indices_'] + self._target_names = params['target_names_'] + self._target_column_indices = params['target_column_indices_'] + self._target_columns_metadata = params['target_columns_metadata_'] + self._fitted = True + + + + + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=hyperparams['use_columns'], + exclude_columns=hyperparams['exclude_columns'], + 
can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, numpy.integer, numpy.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + + @classmethod + def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]: + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) + + # Update semantic types and prepare it for predicted targets. + semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set([]) + add_semantic_types = [] + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + outputs = d3m_dataframe(predictions, generate_metadata=True) + target_columns_metadata = self._copy_inputs_metadata(inputs.metadata, self._training_indices, outputs.metadata, self.hyperparams) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, target_columns_metadata) + return outputs + + + @classmethod + def _copy_inputs_metadata(cls, inputs_metadata: metadata_base.DataMetadata, input_indices: List[int], + outputs_metadata: metadata_base.DataMetadata, hyperparams): + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + target_columns_metadata: List[OrderedDict] = [] + for column_index in input_indices: + column_name = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name") + if column_name is None: + column_name = "output_{}".format(column_index) + + column_metadata = OrderedDict(inputs_metadata.query_column(column_index)) + semantic_types = set(column_metadata.get('semantic_types', [])) + 
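# Keep the input column's existing semantic types and add the configured return_semantic_type. +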
semantic_types_to_remove = set([]) + add_semantic_types = set() + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + # If outputs has more columns than index, add Attribute Type to all remaining + if outputs_length > len(input_indices): + for column_index in range(len(input_indices), outputs_length): + column_metadata = OrderedDict() + semantic_types = set() + semantic_types.add(hyperparams["return_semantic_type"]) + column_name = "output_{}".format(column_index) + column_metadata["semantic_types"] = list(semantic_types) + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + +HoltSmoothing.__doc__ = Normalizer.__doc__ diff --git a/tods/timeseries_processing/HoltWintersExponentialSmoothing.py b/tods/timeseries_processing/HoltWintersExponentialSmoothing.py new file mode 100644 index 0000000..a2d46aa --- /dev/null +++ b/tods/timeseries_processing/HoltWintersExponentialSmoothing.py @@ -0,0 +1,338 @@ +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os +import sklearn +import numpy +import typing +import pandas as pd +# Custom import commands if any +from sklearn.preprocessing.data import Normalizer +from statsmodels.tsa.api import ExponentialSmoothing, SimpleExpSmoothing, Holt + + +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult,DockerContainer +from d3m.primitive_interfaces.unsupervised_learning import UnsupervisedLearnerPrimitiveBase + +import os +from typing import Any,Optional,List + +from d3m import container, utils as d3m_utils +from d3m.metadata import base as metadata_base +from d3m.metadata import hyperparams,params +from d3m.primitive_interfaces import base, transformer + +Inputs = d3m_dataframe +Outputs = d3m_dataframe + + +class Params(params.Params): + input_column_names: Optional[Any] + target_names_: Optional[Sequence[Any]] + training_indices_: Optional[Sequence[int]] + target_column_indices_: Optional[Sequence[int]] + target_columns_metadata_: Optional[List[OrderedDict]] + + + +class Hyperparams(hyperparams.Hyperparams): + # Added by Mia + endog = hyperparams.Bounded[int]( + lower = 2, + upper = None, + default = 3, + description='Array like time seires.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + # keep previous + norm = hyperparams.Enumeration[str]( + default='l2', + values=['l1', 'l2', 'max'], + description='The norm to use to normalize each non zero sample.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + use_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to operate on. 
If any specified column cannot be parsed, it is skipped.", + ) + exclude_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='new', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute'], + default='https://metadata.datadrivendiscovery.org/types/Attribute', + description='Decides what semantic type to attach to generated attributes', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + +class HoltWintersExponentialSmoothing(UnsupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]): + """ + HoltWinter Exponential Smoothing + `Statsmodels documentation `_ + + """ + + __author__ = "DATA Lab at Texas A&M University" + metadata = metadata_base.PrimitiveMetadata({ + "algorithm_types": [metadata_base.PrimitiveAlgorithmType.HOLT_WINTERS_EXPONENTIAL_SMOOTHING, ], + "name": "statsmodels.preprocessing.data.HoltWintersExponentialSmoothing", + "primitive_family": metadata_base.PrimitiveFamily.DATA_PREPROCESSING, + #3"python_path": "d3m.primitives.tods.timeseries_processing.transformation.holt_winters_exponential_smoothing.Preprocessing", + "python_path": "d3m.primitives.tods.timeseries_processing.transformation.holt_winters_exponential_smoothing", + "source": {'name': 'DATA Lab at Texas A&M University', 'contact': 'mailto:khlai037@tamu.edu', 'uris': ['https://gitlab.com/lhenry15/tods.git', 'https://gitlab.com/lhenry15/tods/-/blob/mia/anomaly-primitives/anomaly_primitives/HoltWintersExponentialSmoothing.py']}, + "version": "0.0.1", + "id": "b8c6647c-3787-4efd-bf01-b0ca11c643c6", + "hyperparams_to_tune": ['endog','use_columns'], + }) + + def __init__(self, *, + hyperparams: Hyperparams, + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None) -> None: + + super().__init__(hyperparams=hyperparams, random_seed=random_seed, 
docker_containers=docker_containers) + + # False + self._clf = Normalizer( + norm=self.hyperparams['norm'], + ) + + self._inputs = None + self._outputs = None + self._training_inputs = None + self._training_outputs = None + self._target_names = None + self._training_indices = None + self._target_column_indices = None + self._target_columns_metadata: List[OrderedDict] = None + self._input_column_names = None + self._fitted = False + + + def set_training_data(self, *, inputs: Inputs) -> None: + self._inputs = inputs + self._fitted = False + + def fit(self, *, timeout: float = None, iterations: int = None)-> CallResult[None]: + if self._fitted: + return CallResult(None) + + self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + if self._training_inputs is None: + return CallResult(None) + + if len(self._training_indices) > 0: + self._clf.fit(self._training_inputs) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + return CallResult(None) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + + self.logger.info('Holt Winters Smoothing Primitive called') + outputs = inputs + self._training_inputs, self._training_indices = self._get_columns_to_fit(inputs, self.hyperparams) + try: + columns_to_calculate_holt_winters_exponential_smoothing= List[str] + if(self.hyperparams['use_columns']==()): + columns_to_calculate_holt_winters_exponential_smoothing = list(set(inputs.columns)-set(['d3mIndex','timestamp','ground_truth'])) + else: + columns_to_calculate_holt_winters_exponential_smoothing = self.hyperparams['use_columns'] + for column in self._training_indices: + outputs[inputs.columns[column]+"_holt_winters_smoothing"] = ExponentialSmoothing(inputs.iloc[:, column], seasonal_periods=3, trend = 'add', seasonal='add').fit(use_boxcox=False).fittedvalues + except Exception as e: + self.logger.error("Error in Calculating Holt Winters smoothing",e) + self._update_metadata(outputs) + + return base.CallResult(outputs) + + def _update_metadata(self, outputs): + outputs.metadata = outputs.metadata.generate(outputs,) + + + def get_params(self) -> Params: + if not self._fitted: + return Params( + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + return Params( + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + def set_params(self, *, params: Params) -> None: + self._input_column_names = params['input_column_names'] + self._training_indices = params['training_indices_'] + self._target_names = params['target_names_'] + self._target_column_indices = params['target_column_indices_'] + self._target_columns_metadata = params['target_columns_metadata_'] + self._fitted = True + + + + + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def 
can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=hyperparams['use_columns'], + exclude_columns=hyperparams['exclude_columns'], + can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, numpy.integer, numpy.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + + @classmethod + def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]: + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) + + # Update semantic types and prepare it for predicted targets. 
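+            # The configured return_semantic_type (Attribute by default) is attached so downstream primitives can recognize the generated columns.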
+ semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set([]) + add_semantic_types = [] + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + outputs = d3m_dataframe(predictions, generate_metadata=True) + target_columns_metadata = self._copy_inputs_metadata(inputs.metadata, self._training_indices, outputs.metadata, self.hyperparams) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, target_columns_metadata) + return outputs + + + @classmethod + def _copy_inputs_metadata(cls, inputs_metadata: metadata_base.DataMetadata, input_indices: List[int], + outputs_metadata: metadata_base.DataMetadata, hyperparams): + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + target_columns_metadata: List[OrderedDict] = [] + for column_index in input_indices: + column_name = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name") + if column_name is None: + column_name = "output_{}".format(column_index) + + column_metadata = OrderedDict(inputs_metadata.query_column(column_index)) + semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set([]) + add_semantic_types = set() + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + # If outputs has more columns than index, add Attribute Type to all remaining + if outputs_length > len(input_indices): + for column_index in range(len(input_indices), outputs_length): + column_metadata = OrderedDict() + semantic_types = set() + semantic_types.add(hyperparams["return_semantic_type"]) + column_name = "output_{}".format(column_index) + column_metadata["semantic_types"] = list(semantic_types) + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + +HoltWintersExponentialSmoothing.__doc__ = Normalizer.__doc__ diff --git a/tods/timeseries_processing/MovingAverageTransform.py b/tods/timeseries_processing/MovingAverageTransform.py new file mode 100644 index 0000000..50e9175 --- /dev/null +++ b/tods/timeseries_processing/MovingAverageTransform.py @@ -0,0 +1,341 @@ +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os +import sklearn +import numpy +import typing 
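For reference, a minimal standalone sketch (not part of the repository; the toy series is hypothetical) of the statsmodels calls that the HoltSmoothing and HoltWintersExponentialSmoothing produce() methods above apply to each selected value column:
```
# Hypothetical toy series; the primitives above run the same calls per selected column.
import pandas as pd
from statsmodels.tsa.api import ExponentialSmoothing, Holt

values = pd.Series([1.0, 2.0, 3.0, 1.2, 2.1, 3.1, 0.9, 2.2, 2.9, 1.1, 2.0, 3.0, 1.0, 2.1, 3.2])

# Holt-Winters smoothing: additive trend and additive seasonality with period 3,
# matching the values hard-coded in HoltWintersExponentialSmoothing.produce().
# (The primitive also passes use_boxcox=False to fit(); omitted here because some
# statsmodels versions expect that argument on the model constructor instead.)
hw_fitted = ExponentialSmoothing(values, seasonal_periods=3, trend='add', seasonal='add').fit().fittedvalues

# Holt smoothing with the fixed parameters used in HoltSmoothing.produce().
# (Older statsmodels, as used by the primitive, names the second parameter smoothing_slope.)
holt_fitted = Holt(values).fit(smoothing_level=0.2, smoothing_trend=0.2, optimized=False).fittedvalues

print(hw_fitted.tolist())
print(holt_fitted.tolist())
```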
+import pandas as pd +# Custom import commands if any +from sklearn.preprocessing.data import Normalizer + + +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult,DockerContainer +from d3m.primitive_interfaces.unsupervised_learning import UnsupervisedLearnerPrimitiveBase + +import os +from typing import Any,Optional,List + +from d3m import container, utils as d3m_utils +from d3m.metadata import base as metadata_base +from d3m.metadata import hyperparams,params +from d3m.primitive_interfaces import base, transformer + + +Inputs = d3m_dataframe +Outputs = d3m_dataframe + + +class Params(params.Params): + input_column_names: Optional[Any] + target_names_: Optional[Sequence[Any]] + training_indices_: Optional[Sequence[int]] + target_column_indices_: Optional[Sequence[int]] + target_columns_metadata_: Optional[List[OrderedDict]] + + + +class Hyperparams(hyperparams.Hyperparams): + + window_size = hyperparams.Bounded[int]( + lower = 2, + upper = None, + default = 3, + description='Size of moving window.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + norm = hyperparams.Enumeration[str]( + default='l2', + values=['l1', 'l2', 'max'], + description='The norm to use to normalize each non zero sample.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + use_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to operate on. If any specified column cannot be parsed, it is skipped.", + ) + exclude_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='new', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. 
Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute'], + default='https://metadata.datadrivendiscovery.org/types/Attribute', + description='Decides what semantic type to attach to generated attributes', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + +class MovingAverageTransform(UnsupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]): + """ + A primitive to generate moving average + Genrates moving average based on the window_size passed as hyperparameter. + Columns for which moving average is calculated is passed as hyperparameter . Default is all values column + """ + + __author__ = "DATA Lab at Texas A&M University" + metadata = metadata_base.PrimitiveMetadata({ + "algorithm_types": [metadata_base.PrimitiveAlgorithmType.MOVING_AVERAGE_TRANSFORM, ], + "name": "pandas.preprocessing.data.MovingAverageTransform", + "primitive_family": metadata_base.PrimitiveFamily.DATA_PREPROCESSING, + "python_path": "d3m.primitives.tods.timeseries_processing.transformation.moving_average_transform", + "source": {'name': 'DATA Lab at Texas A&M University', 'contact': 'mailto:khlai037@tamu.edu', 'uris': ['https://gitlab.com/lhenry15/tods.git', 'https://gitlab.com/lhenry15/tods/-/blob/mia/anomaly-primitives/anomaly_primitives/MovingAverageTransform.py']}, + "version": "0.0.1", + "id": "ab8c90a6-d10e-49f1-8c5a-38884defc570", + "hyperparams_to_tune": ['window_size'], + }) + + def __init__(self, *, + hyperparams: Hyperparams, + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None) -> None: + + super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) + + # False + self._clf = Normalizer( + norm=self.hyperparams['norm'], + ) + + self._inputs = None + self._outputs = None + self._training_inputs = None + self._training_outputs = None + self._target_names = None + self._training_indices = None + self._target_column_indices = None + self._target_columns_metadata: List[OrderedDict] = None + self._input_column_names = None + self._fitted = False + + + def set_training_data(self, *, inputs: Inputs) -> None: + self._inputs = inputs + self._fitted = False + + def fit(self, *, timeout: float = None, iterations: int = None)-> CallResult[None]: + if self._fitted: + return CallResult(None) + + self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + if self._training_inputs is None: + return CallResult(None) + + if len(self._training_indices) > 0: + self._clf.fit(self._training_inputs) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + return CallResult(None) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + + + self.logger.info('Time Series Moving Average Primitive called') + outputs = inputs + self._training_inputs, self._training_indices = self._get_columns_to_fit(inputs, self.hyperparams) + + try: + columns_to_calculate_moving_average= List[str] + if(self.hyperparams['use_columns']==()): + columns_to_calculate_moving_average = 
list(set(inputs.columns)-set(['d3mIndex','timestamp','ground_truth'])) + else: + columns_to_calculate_moving_average = self.hyperparams['use_columns'] + for column in self._training_indices: + outputs[inputs.columns[column]+"_moving_average"] = (inputs.iloc[:, column]).rolling(3,min_periods=1,center=True).mean() + except Exception as e: + self.logger.error("Error in Calculating Moving Average",e) + self._update_metadata(outputs) + # print(inputs) + # print("-------------") + # print(outputs) + + return base.CallResult(outputs) + + + def _update_metadata(self, outputs): + outputs.metadata = outputs.metadata.generate(outputs,) + + def get_params(self) -> Params: + if not self._fitted: + return Params( + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + return Params( + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + def set_params(self, *, params: Params) -> None: + self._input_column_names = params['input_column_names'] + self._training_indices = params['training_indices_'] + self._target_names = params['target_names_'] + self._target_column_indices = params['target_column_indices_'] + self._target_columns_metadata = params['target_columns_metadata_'] + self._fitted = True + + + + + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=hyperparams['use_columns'], + exclude_columns=hyperparams['exclude_columns'], + can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, numpy.integer, numpy.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + + @classmethod + def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]: + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) + 
+ # Update semantic types and prepare it for predicted targets. + semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set([]) + add_semantic_types = [] + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + outputs = d3m_dataframe(predictions, generate_metadata=True) + target_columns_metadata = self._copy_inputs_metadata(inputs.metadata, self._training_indices, outputs.metadata, self.hyperparams) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, target_columns_metadata) + return outputs + + + @classmethod + def _copy_inputs_metadata(cls, inputs_metadata: metadata_base.DataMetadata, input_indices: List[int], + outputs_metadata: metadata_base.DataMetadata, hyperparams): + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + target_columns_metadata: List[OrderedDict] = [] + for column_index in input_indices: + column_name = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name") + if column_name is None: + column_name = "output_{}".format(column_index) + + column_metadata = OrderedDict(inputs_metadata.query_column(column_index)) + semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set([]) + add_semantic_types = set() + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + # If outputs has more columns than index, add Attribute Type to all remaining + if outputs_length > len(input_indices): + for column_index in range(len(input_indices), outputs_length): + column_metadata = OrderedDict() + semantic_types = set() + semantic_types.add(hyperparams["return_semantic_type"]) + column_name = "output_{}".format(column_index) + column_metadata["semantic_types"] = list(semantic_types) + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + +MovingAverageTransform.__doc__ = Normalizer.__doc__ diff --git a/tods/timeseries_processing/SKAxiswiseScaler.py b/tods/timeseries_processing/SKAxiswiseScaler.py new file mode 100644 index 0000000..dc60572 --- /dev/null +++ b/tods/timeseries_processing/SKAxiswiseScaler.py @@ -0,0 +1,398 @@ +import os +import typing +from numpy import ndarray + +from d3m import container, utils as d3m_utils +from d3m.base import utils as base_utils +from d3m.metadata import base as 
metadata_base, hyperparams +from d3m.primitive_interfaces import base, transformer + +from sklearn.preprocessing import scale + +import common_primitives +import numpy +from typing import Optional, List +from collections import OrderedDict +from scipy import sparse +import logging +import uuid + +__all__ = ('SKAxiswiseScaler',) + +Inputs = container.DataFrame +Outputs = container.DataFrame + +class Hyperparams(hyperparams.Hyperparams): + # Added by Guanchu + axis = hyperparams.UniformInt( + lower=0, + upper=2, + default=0, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + description="axis used to compute the means and standard deviations along. If 0, independently standardize each feature, otherwise (if 1) standardize each sample.", + ) + with_mean = hyperparams.UniformBool( + default=True, + description='If True, center the data before scaling. This does not work (and will raise an exception) when attempted on sparse matrices, because centering them entails building a dense matrix which in common use cases is likely to be too large to fit in memory.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + with_std = hyperparams.UniformBool( + default=True, + description='If True, scale the data to unit variance (or equivalently, unit standard deviation).', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + # copy = hyperparams.UniformBool( + # default=True, + # description='If False, try to avoid a copy and do inplace scaling instead. This is not guaranteed to always work inplace; e.g. if the data is not a NumPy array or scipy.sparse CSR matrix, a copy may still be returned.', + # semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + # ) + + ## copy from Unsupervised Primitives + use_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to operate on. If any specified column cannot be parsed, it is skipped.", + ) + exclude_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='new', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. 
Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.", + ) + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', + 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute'], + default='https://metadata.datadrivendiscovery.org/types/Attribute', + description='Decides what semantic type to attach to generated attributes', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + + # Keep previous + dataframe_resource = hyperparams.Hyperparameter[typing.Union[str, None]]( + default=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Resource ID of a DataFrame to extract if there are multiple tabular resources inside a Dataset and none is a dataset entry point.", + ) + +class Scaler: + def __init__(self, axis=0, with_mean=True, with_std=True, copy=True): + self._axis = axis + self._with_mean = with_mean + self._with_std = with_std + self._copy = copy + + def produce(self, inputs): + + return scale(inputs, + axis=self._axis, + with_mean=self._with_mean, + with_std=self._with_std, + copy=self._copy) + + # sk_output = inputs + # for axis in self._axis_set: + # sk_output = scale(sk_output, + # axis=axis, + # with_mean=self._with_mean, + # with_std=self._with_std, + # copy=self._copy) + # return sk_output + + + +class SKAxiswiseScaler(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]): + """ + Standardize a dataset along any axis, and center to the mean and component wise scale to unit variance. + See `sklearn documentation `_ for more details. + + Parameters + ---------- + axis: int (0 by default). + axis used to compute the means and standard deviations along. If 0, independently standardize each feature, otherwise (if 1) standardize each sample. + + with_mean: boolean, True by default. + If True, center the data before scaling. + + with_std: boolean, True by default. + If True, scale the data to unit variance (or equivalently, unit standard deviation). 
+ """ + + __author__ = "DATALAB @Taxes A&M University" + metadata = metadata_base.PrimitiveMetadata({ + "algorithm_types": [metadata_base.PrimitiveAlgorithmType.DATA_MAPPING, ], + "name": "Axis_wise_scale", + "primitive_family": metadata_base.PrimitiveFamily.DATA_TRANSFORMATION, + "python_path": "d3m.primitives.tods.timeseries_processing.transformation.axiswise_scaler", + "hyperparams_to_tune": ['with_mean', 'with_std', 'axis'], + "source": {'name': "DATALAB @Taxes A&M University", 'contact': 'mailto:khlai037@tamu.edu', + 'uris': ['https://gitlab.com/lhenry15/tods.git']}, + "version": "0.0.1", + "id": str(uuid.uuid3(uuid.NAMESPACE_DNS, 'SKAxiswiseScaler')), + }) + + def __init__(self, *, hyperparams: Hyperparams) -> None: + super().__init__(hyperparams=hyperparams) # , random_seed=random_seed, docker_containers=docker_containers) + + # False + self._clf = Scaler(axis=self.hyperparams['axis'], # 0, + with_mean=self.hyperparams['with_mean'], + with_std=self.hyperparams['with_std'], + # copy=self.hyperparams['copy'], + ) + + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + """ + Process the testing data. + Args: + inputs: Container DataFrame. Time series data up to scale. + + Returns: + Container DataFrame after scaling. + """ + + assert isinstance(inputs, container.DataFrame), type(dataframe) + + _, self._columns_to_produce = self._get_columns_to_fit(inputs, self.hyperparams) + self._input_column_names = inputs.columns + # print(self._columns_to_produce) + sk_inputs = inputs + if self.hyperparams['use_semantic_types']: + sk_inputs = inputs.iloc[:, self._columns_to_produce] + output_columns = [] + if len(self._columns_to_produce) > 0: + sk_output = self._clf.produce(sk_inputs) + if sparse.issparse(sk_output): + sk_output = sk_output.toarray() + outputs = self._wrap_predictions(inputs, sk_output) + if len(outputs.columns) == len(self._input_column_names): + outputs.columns = self._input_column_names + output_columns = [outputs] + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + # print(outputs.metadata.to_internal_simple_structure()) + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._columns_to_produce, + columns_list=output_columns) + + + + # print(inputs) + # print(outputs) + return base.CallResult(outputs) + + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + """ + Select columns to fit. 
+ Args: + inputs: Container DataFrame + hyperparams: d3m.metadata.hyperparams.Hyperparams + + Returns: + list + """ + + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=hyperparams['use_columns'], + exclude_columns=hyperparams[ + 'exclude_columns'], + can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, + hyperparams: Hyperparams) -> bool: + """ + Output whether a column can be processed. + Args: + inputs_metadata: d3m.metadata.base.DataMetadata + column_index: int + + Returns: + bool + """ + + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, numpy.integer, numpy.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + + # print(semantic_types) + + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + + @classmethod + def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]: + """ + Output metadata of selected columns. + Args: + outputs_metadata: metadata_base.DataMetadata + hyperparams: d3m.metadata.hyperparams.Hyperparams + + Returns: + d3m.metadata.base.DataMetadata + """ + + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) + + # Update semantic types and prepare it for predicted targets. + semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set([]) + add_semantic_types = [] + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + """ + Updata metadata for selected columns. 
+ Args: + inputs_metadata: metadata_base.DataMetadata + outputs: Container Dataframe + target_columns_metadata: list + + Returns: + d3m.metadata.base.DataMetadata + """ + + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + """ + Wrap predictions into dataframe + Args: + inputs: Container Dataframe + predictions: array-like data (n_samples, n_features) + + Returns: + Dataframe + """ + + outputs = container.DataFrame(predictions, generate_metadata=True) + target_columns_metadata = self._copy_inputs_metadata(inputs.metadata, self._columns_to_produce, outputs.metadata, + self.hyperparams) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, target_columns_metadata) + # print(outputs.metadata.to_internal_simple_structure()) + + return outputs + + + @classmethod + def _copy_inputs_metadata(cls, inputs_metadata: metadata_base.DataMetadata, input_indices: List[int], + outputs_metadata: metadata_base.DataMetadata, hyperparams): + """ + Updata metadata for selected columns. + Args: + inputs_metadata: metadata.base.DataMetadata + input_indices: list + outputs_metadata: metadata.base.DataMetadata + hyperparams: d3m.metadata.hyperparams.Hyperparams + + Returns: + d3m.metadata.base.DataMetadata + """ + + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + target_columns_metadata: List[OrderedDict] = [] + for column_index in input_indices: + column_name = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name") + if column_name is None: + column_name = "output_{}".format(column_index) + + column_metadata = OrderedDict(inputs_metadata.query_column(column_index)) + semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set([]) + add_semantic_types = set() + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + # If outputs has more columns than index, add Attribute Type to all remaining + if outputs_length > len(input_indices): + for column_index in range(len(input_indices), outputs_length): + column_metadata = OrderedDict() + semantic_types = set() + semantic_types.add(hyperparams["return_semantic_type"]) + column_name = "output_{}".format(column_index) + column_metadata["semantic_types"] = list(semantic_types) + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + +SKAxiswiseScaler.__doc__ = SKAxiswiseScaler.__doc__ diff --git a/tods/timeseries_processing/SKPowerTransformer.py b/tods/timeseries_processing/SKPowerTransformer.py new file mode 100644 index 0000000..f062f9d --- /dev/null +++ b/tods/timeseries_processing/SKPowerTransformer.py @@ -0,0 +1,500 @@ +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os +import sklearn +import numpy +import typing 
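+
+# Note: with the default 'yeo-johnson' method, PowerTransformer learns one lambda per
+# feature (by maximum likelihood) and maps a value y as
+#   y >= 0, lambda != 0:  ((y + 1)**lambda - 1) / lambda
+#   y >= 0, lambda == 0:  log(y + 1)
+#   y <  0, lambda != 2:  -((-y + 1)**(2 - lambda) - 1) / (2 - lambda)
+#   y <  0, lambda == 2:  -log(-y + 1)
+# The 'box-cox' method is similar but only accepts strictly positive inputs.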
+import copy + +# Custom import commands if any +from sklearn.preprocessing import PowerTransformer +from sklearn.base import BaseEstimator + +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult, DockerContainer + +# from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase +from d3m.primitive_interfaces.unsupervised_learning import UnsupervisedLearnerPrimitiveBase +from d3m.primitive_interfaces.transformer import TransformerPrimitiveBase + +from d3m.primitive_interfaces.base import ProbabilisticCompositionalityMixin, ContinueFitMixin +from d3m import exceptions +import pandas + +from d3m import container, utils as d3m_utils +import uuid + +Inputs = d3m_dataframe +# Inputs = container.Dataset +Outputs = d3m_dataframe + +__all__ = ('SKPowerTransformer',) + +class Params(params.Params): + + # lambda_: Optional[ndarray] + clf_: Optional[BaseEstimator] + + # Keep previous + input_column_names: Optional[Any] + target_names_: Optional[Sequence[Any]] + training_indices_: Optional[Sequence[int]] + target_column_indices_: Optional[Sequence[int]] + target_columns_metadata_: Optional[List[OrderedDict]] + + +class Hyperparams(hyperparams.Hyperparams): + # Added by Guanchu + method = hyperparams.Enumeration[str]( + values=['yeo-johnson', 'box-cox'], + default='yeo-johnson', # 'box-cox', # + description='yeo-johnson works with positive and negative values. box-cox only works with strictly positive values', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + standardize = hyperparams.UniformBool( + default=True, + description='Set to True to apply zero-mean, unit-variance normalization to the transformed output', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + # copy = hyperparams.UniformBool( + # default=True, + # description='Set to False to perform inplace computation during transformation.', + # semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + # ) + + # Keep previous + use_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to operate on. If any specified column cannot be parsed, it is skipped.", + ) + exclude_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='new', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? 
This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', + 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute'], + default='https://metadata.datadrivendiscovery.org/types/Attribute', + description='Decides what semantic type to attach to generated attributes', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + + +class SKPowerTransformer(UnsupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]): + """ + PowerTransformer primitive using sklearn to make data more Gaussian-like. + See `sklearn documentation `_ for more details. + Parameters + ---------- + method : str ('yeo-johnson' or 'box-cox') + PowerTransforming algorithm to use. + + standardize : bool + Set to True to apply zero-mean, unit-variance normalization to the transformed output. + + Attributes + ---------- + lambda_: numpy array of float, shape (n_features,) + The parameters of the power transformation for the selected features. 
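+
+    Example
+    -------
+    A minimal fit/produce sketch (illustrative only; it assumes a numeric
+    ``container.DataFrame`` and the default 'yeo-johnson' method)::
+
+        from d3m import container
+
+        df = container.DataFrame({'value': [1.0, 5.0, 2.5, 100.0]}, generate_metadata=True)
+        primitive = SKPowerTransformer(hyperparams=Hyperparams.defaults())
+        primitive.set_training_data(inputs=df)
+        primitive.fit()
+        transformed = primitive.produce(inputs=df).value  # more Gaussian-like values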
+ + """ + + __author__ = "DATALAB @Taxes A&M University" + metadata = metadata_base.PrimitiveMetadata({ + "algorithm_types": [metadata_base.PrimitiveAlgorithmType.DATA_MAPPING, ], + "name": "Power_transformation", + "primitive_family": metadata_base.PrimitiveFamily.DATA_TRANSFORMATION, + "python_path": "d3m.primitives.tods.timeseries_processing.transformation.power_transformer", + "hyperparams_to_tune": ['method', 'standardize'], + "source": {'name': "DATALAB @Taxes A&M University", 'contact': 'mailto:khlai037@tamu.edu', + 'uris': ['https://gitlab.com/lhenry15/tods.git']}, + "version": "0.0.1", + "id": str(uuid.uuid3(uuid.NAMESPACE_DNS, 'SKPowerTransformer')), + }) + + def __init__(self, *, + hyperparams: Hyperparams, + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None) -> None: + super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) + + # False + self._clf = PowerTransformer(method=self.hyperparams['method'], + standardize=self.hyperparams['standardize'], + # copy=self.hyperparams['copy'], + ) + + self._inputs = None + self._outputs = None + self._training_inputs = None + self._training_outputs = None + self._target_names = None + self._training_indices = None + self._target_column_indices = None + self._target_columns_metadata: List[OrderedDict] = None + self._input_column_names = None + self._fitted = False + + # print(self._clf.get_params(deep=True)) + # print(getattr(self._clf, 'lambdas_')) + # print(dir(self._clf)) + + def set_training_data(self, *, inputs: Inputs) -> None: + """ + Set training data for Powertransformer. + Args: + inputs: Container DataFrame + + Returns: + None + """ + + self._inputs = inputs + self._fitted = False + + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + """ + Fit model with training data. + Args: + *: Container DataFrame. Time series data up to fit. + + Returns: + None + """ + + if self._fitted: + return CallResult(None) + + self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + if self._training_inputs is None: + return CallResult(None) + + if len(self._training_indices) > 0: + self._clf.fit_transform(self._training_inputs) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + return CallResult(None) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + """ + Process the testing data. + Args: + inputs: Container DataFrame. Time series data up to powertransformation + + Returns: + Container DataFrame after powertransformation. 
+ """ + # print(self._training_indices) + + if not self._fitted: + raise PrimitiveNotFittedError("Primitive not fitted.") + sk_inputs = inputs + if self.hyperparams['use_semantic_types']: + sk_inputs = inputs.iloc[:, self._training_indices] + output_columns = [] + if len(self._training_indices) > 0: + sk_output = self._clf.transform(sk_inputs) + if sparse.issparse(sk_output): + sk_output = sk_output.toarray() + outputs = self._wrap_predictions(inputs, sk_output) + if len(outputs.columns) == len(self._input_column_names): + outputs.columns = self._input_column_names + output_columns = [outputs] + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._training_indices, + columns_list=output_columns) + # print(inputs) + # print(outputs) + return CallResult(outputs) + + + def get_params(self) -> Params: + """ + Return parameters. + Args: + None + + Returns: + class Params + """ + + if not self._fitted: + return Params( + # lambda_=None, + clf_ = copy.copy(self._clf), + # Keep previous + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + return Params( + # lambda_=getattr(self._clf, 'lambda_', None), + clf_=copy.copy(self._clf), + # Keep previous + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + + def set_params(self, *, params: Params) -> None: + """ + Set parameters for Powertransformer. + Args: + params: class Params + + Returns: + None + """ + + # self._clf.lambda_ = params['lambda_'] + self._clf = params['clf_'] + # Keep previous + self._input_column_names = params['input_column_names'] + self._training_indices = params['training_indices_'] + self._target_names = params['target_names_'] + self._target_column_indices = params['target_column_indices_'] + self._target_columns_metadata = params['target_columns_metadata_'] + + # if params['lambda_'] is not None: + # self._fitted = True + if params['clf_'] is not None: + self._fitted = True + + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + """ + Select columns to fit. 
+ Args: + inputs: Container DataFrame + hyperparams: d3m.metadata.hyperparams.Hyperparams + + Returns: + list + """ + + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=hyperparams['use_columns'], + exclude_columns=hyperparams[ + 'exclude_columns'], + can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, + hyperparams: Hyperparams) -> bool: + """ + Output whether a column can be processed. + Args: + inputs_metadata: d3m.metadata.base.DataMetadata + column_index: int + + Returns: + bool + """ + + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, numpy.integer, numpy.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + + @classmethod + def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]: + """ + Output metadata of selected columns. + Args: + outputs_metadata: metadata_base.DataMetadata + hyperparams: d3m.metadata.hyperparams.Hyperparams + + Returns: + d3m.metadata.base.DataMetadata + """ + + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) + + # Update semantic types and prepare it for predicted targets. + semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set([]) + add_semantic_types = [] + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + """ + Updata metadata for selected columns. 
+ Args: + inputs_metadata: metadata_base.DataMetadata + outputs: Container Dataframe + target_columns_metadata: list + + Returns: + d3m.metadata.base.DataMetadata + """ + + + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + """ + Wrap predictions into dataframe + Args: + inputs: Container Dataframe + predictions: array-like data (n_samples, n_features) + + Returns: + Dataframe + """ + + outputs = d3m_dataframe(predictions, generate_metadata=True) + target_columns_metadata = self._copy_inputs_metadata(inputs.metadata, self._training_indices, outputs.metadata, + self.hyperparams) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, target_columns_metadata) + return outputs + + + @classmethod + def _copy_inputs_metadata(cls, inputs_metadata: metadata_base.DataMetadata, input_indices: List[int], + outputs_metadata: metadata_base.DataMetadata, hyperparams): + """ + Updata metadata for selected columns. + Args: + inputs_metadata: metadata.base.DataMetadata + input_indices: list + outputs_metadata: metadata.base.DataMetadata + hyperparams: d3m.metadata.hyperparams.Hyperparams + + Returns: + d3m.metadata.base.DataMetadata + """ + + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + target_columns_metadata: List[OrderedDict] = [] + for column_index in input_indices: + column_name = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name") + if column_name is None: + column_name = "output_{}".format(column_index) + + column_metadata = OrderedDict(inputs_metadata.query_column(column_index)) + semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set([]) + add_semantic_types = set() + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + # If outputs has more columns than index, add Attribute Type to all remaining + if outputs_length > len(input_indices): + for column_index in range(len(input_indices), outputs_length): + column_metadata = OrderedDict() + semantic_types = set() + semantic_types.add(hyperparams["return_semantic_type"]) + column_name = "output_{}".format(column_index) + column_metadata["semantic_types"] = list(semantic_types) + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + +SKPowerTransformer.__doc__ = SKPowerTransformer.__doc__ diff --git a/tods/timeseries_processing/SKQuantileTransformer.py b/tods/timeseries_processing/SKQuantileTransformer.py new file mode 100644 index 0000000..66594a6 --- /dev/null +++ b/tods/timeseries_processing/SKQuantileTransformer.py @@ -0,0 +1,501 @@ +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os +import sklearn +import numpy +import typing + +# Custom import commands if any +from 
sklearn.preprocessing.data import QuantileTransformer + + +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult, DockerContainer +from d3m.primitive_interfaces.unsupervised_learning import UnsupervisedLearnerPrimitiveBase +import uuid + + +Inputs = d3m_dataframe +Outputs = d3m_dataframe + +__all__ = ('SKQuantileTransformer',) + + +class Params(params.Params): + n_quantiles_: Optional[int] + quantiles_: Optional[ndarray] + references_: Optional[ndarray] + + # Keep previous + input_column_names: Optional[Any] + target_names_: Optional[Sequence[Any]] + training_indices_: Optional[Sequence[int]] + target_column_indices_: Optional[Sequence[int]] + target_columns_metadata_: Optional[List[OrderedDict]] + +class Hyperparams(hyperparams.Hyperparams): + n_quantiles = hyperparams.Hyperparameter[int]( + default=1000, + description='Number of quantiles to be computed. It corresponds to the number of landmarks used to discretize the cumulative distribution function.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + output_distribution = hyperparams.Enumeration[str]( + default='normal', + values=['uniform', 'normal'], + description='Marginal distribution for the transformed data. The choices are \'uniform\' (default) or \'normal\'.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + ignore_implicit_zeros = hyperparams.UniformBool( + default=False, + description='Only applies to sparse matrices. If True, the sparse entries of the matrix are discarded to compute the quantile statistics. If False, these entries are treated as zeros.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + subsample = hyperparams.Hyperparameter[int]( + default=100000, + description='Maximum number of samples used to estimate the quantiles for computational efficiency. Note that the subsampling procedure may differ for value-identical sparse and dense matrices.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + random_state = hyperparams.Union[Union[int, None]]( + configuration=OrderedDict( + init=hyperparams.Hyperparameter[int]( + default=0, + ), + ninit=hyperparams.Hyperparameter[None]( + default=None, + ), + ), + default='ninit', + description='the seed used by the random number generator.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + + # Keep previous + use_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to operate on. If any specified column cannot be parsed, it is skipped.", + ) + exclude_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not operate on. 
Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='new', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute'], + default='https://metadata.datadrivendiscovery.org/types/Attribute', + description='Decides what semantic type to attach to generated attributes', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + +class SKQuantileTransformer(UnsupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]): + """ + Primitive wrapping for sklearn QuantileTransformer + See `sklearn documentation `_ for more details. + + Parameters + ---------- + n_quantiles: int, optional (default=1000 or n_samples) + Number of quantiles to be computed. It corresponds to the number of landmarks used to discretize the cumulative distribution function. If n_quantiles is larger than the number of samples, n_quantiles is set to the number of samples as a larger number of quantiles does not give a better approximation of the cumulative distribution function estimator. + + output_distribution: str, optional (default=’uniform’) + Marginal distribution for the transformed data. The choices are ‘uniform’ (default) or ‘normal’. + + ignore_implicit_zeros: bool, optional (default=False) + Only applies to sparse matrices. If True, the sparse entries of the matrix are discarded to compute the quantile statistics. If False, these entries are treated as zeros. + + subsample: int, optional (default=1e5) + Maximum number of samples used to estimate the quantiles for computational efficiency. Note that the subsampling procedure may differ for value-identical sparse and dense matrices. + + random_state: int, RandomState instance or None, optional (default=None) + Determines random number generation for subsampling and smoothing noise. Please see subsample for more details. Pass an int for reproducible results across multiple function calls. See Glossary + + Attributes + ---------- + n_quantiles_: int + The actual number of quantiles used to discretize the cumulative distribution function. 
+ + quantiles_: ndarray, shape (n_quantiles, n_features) + The values corresponding the quantiles of reference. + + references_: ndarray, shape(n_quantiles, ) + Quantiles of references. + """ + + __author__ = "DATALAB @Taxes A&M University" + metadata = metadata_base.PrimitiveMetadata({ + "algorithm_types": [metadata_base.PrimitiveAlgorithmType.DATA_CONVERSION, ], + "name": "Quantile_transformation", + "primitive_family": metadata_base.PrimitiveFamily.DATA_PREPROCESSING, + "python_path": "d3m.primitives.tods.timeseries_processing.transformation.quantile_transformer", + "hyperparams_to_tune": ['n_quantiles', 'output_distribution', 'ignore_implicit_zeros', 'subsample', 'random_state'], + "source": {'name': "DATALAB @Taxes A&M University", 'contact': 'mailto:khlai037@tamu.edu', + 'uris': ['https://gitlab.com/lhenry15/tods.git']}, + "version": "0.0.1", + "id": str(uuid.uuid3(uuid.NAMESPACE_DNS, 'SKQuantileTransformer')), + }) + + def __init__(self, *, + hyperparams: Hyperparams, + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None) -> None: + + super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) + + # False + self._clf = QuantileTransformer( + n_quantiles=self.hyperparams['n_quantiles'], + output_distribution=self.hyperparams['output_distribution'], + ignore_implicit_zeros=self.hyperparams['ignore_implicit_zeros'], + subsample=self.hyperparams['subsample'], + random_state=self.hyperparams['random_state'], + ) + + self._inputs = None + self._outputs = None + self._training_inputs = None + self._training_outputs = None + self._target_names = None + self._training_indices = None + self._target_column_indices = None + self._target_columns_metadata: List[OrderedDict] = None + self._input_column_names = None + self._fitted = False + + + def set_training_data(self, *, inputs: Inputs) -> None: + """ + Set training data for QuantileTransformer. + Args: + inputs: Container DataFrame + + Returns: + None + """ + self._inputs = inputs + self._fitted = False + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + """ + Fit model with training data. + Args: + *: Container DataFrame. Time series data up to fit. + + Returns: + None + """ + if self._fitted: + return CallResult(None) + + self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + if self._training_inputs is None: + return CallResult(None) + + if len(self._training_indices) > 0: + self._clf.fit(self._training_inputs) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + return CallResult(None) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + """ + Process the testing data. + Args: + inputs: Container DataFrame. Time series data up to Quantile Transform. + + Returns: + Container DataFrame after Quantile Transformation. 
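+
+        A minimal end-to-end sketch (illustrative only; ``train_df`` and ``test_df`` stand
+        for any numeric ``container.DataFrame``, and default hyperparameters are assumed)::
+
+            primitive = SKQuantileTransformer(hyperparams=Hyperparams.defaults())
+            primitive.set_training_data(inputs=train_df)
+            primitive.fit()
+            transformed = primitive.produce(inputs=test_df).value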
+ """ + if not self._fitted: + raise PrimitiveNotFittedError("Primitive not fitted.") + sk_inputs = inputs + if self.hyperparams['use_semantic_types']: + sk_inputs = inputs.iloc[:, self._training_indices] + output_columns = [] + if len(self._training_indices) > 0: + sk_output = self._clf.transform(sk_inputs) + if sparse.issparse(sk_output): + sk_output = sk_output.toarray() + outputs = self._wrap_predictions(inputs, sk_output) + if len(outputs.columns) == len(self._input_column_names): + outputs.columns = self._input_column_names + output_columns = [outputs] + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._training_indices, + columns_list=output_columns) + # print(inputs) + # print(outputs) + return CallResult(outputs) + + + def get_params(self) -> Params: + """ + Return parameters. + Args: + None + + Returns: + class Params + """ + if not self._fitted: + return Params( + n_quantiles_=None, + quantiles_=None, + references_=None, + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + return Params( + n_quantiles_=getattr(self._clf, 'n_quantiles_', None), + quantiles_=getattr(self._clf, 'quantiles_', None), + references_=getattr(self._clf, 'references_', None), + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + def set_params(self, *, params: Params) -> None: + """ + Set parameters for QuantileTransformer. + Args: + params: class Params + + Returns: + None + """ + self._clf.n_quantiles_ = params['n_quantiles_'] + self._clf.quantiles_ = params['quantiles_'] + self._clf.references_ = params['references_'] + self._input_column_names = params['input_column_names'] + self._training_indices = params['training_indices_'] + self._target_names = params['target_names_'] + self._target_column_indices = params['target_column_indices_'] + self._target_columns_metadata = params['target_columns_metadata_'] + + if params['n_quantiles_'] is not None: + self._fitted = True + if params['quantiles_'] is not None: + self._fitted = True + if params['references_'] is not None: + self._fitted = True + + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + """ + Select columns to fit. 
+ Args: + inputs: Container DataFrame + hyperparams: d3m.metadata.hyperparams.Hyperparams + + Returns: + list + """ + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=hyperparams['use_columns'], + exclude_columns=hyperparams['exclude_columns'], + can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: + """ + Output whether a column can be processed. + Args: + inputs_metadata: d3m.metadata.base.DataMetadata + column_index: int + + Returns: + bool + """ + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, numpy.integer, numpy.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + + @classmethod + def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]: + """ + Output metadata of selected columns. + Args: + outputs_metadata: metadata_base.DataMetadata + hyperparams: d3m.metadata.hyperparams.Hyperparams + + Returns: + d3m.metadata.base.DataMetadata + """ + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) + + # Update semantic types and prepare it for predicted targets. + semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set([]) + add_semantic_types = [] + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + """ + Updata metadata for selected columns. 
+ Args: + inputs_metadata: metadata_base.DataMetadata + outputs: Container Dataframe + target_columns_metadata: list + + Returns: + d3m.metadata.base.DataMetadata + """ + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + """ + Wrap predictions into dataframe + Args: + inputs: Container Dataframe + predictions: array-like data (n_samples, n_features) + + Returns: + Dataframe + """ + outputs = d3m_dataframe(predictions, generate_metadata=True) + target_columns_metadata = self._copy_inputs_metadata(inputs.metadata, self._training_indices, outputs.metadata, self.hyperparams) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, target_columns_metadata) + return outputs + + + @classmethod + def _copy_inputs_metadata(cls, inputs_metadata: metadata_base.DataMetadata, input_indices: List[int], + outputs_metadata: metadata_base.DataMetadata, hyperparams): + """ + Updata metadata for selected columns. + Args: + inputs_metadata: metadata.base.DataMetadata + input_indices: list + outputs_metadata: metadata.base.DataMetadata + hyperparams: d3m.metadata.hyperparams.Hyperparams + + Returns: + d3m.metadata.base.DataMetadata + """ + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + target_columns_metadata: List[OrderedDict] = [] + for column_index in input_indices: + column_name = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name") + if column_name is None: + column_name = "output_{}".format(column_index) + + column_metadata = OrderedDict(inputs_metadata.query_column(column_index)) + semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set([]) + add_semantic_types = set() + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + # If outputs has more columns than index, add Attribute Type to all remaining + if outputs_length > len(input_indices): + for column_index in range(len(input_indices), outputs_length): + column_metadata = OrderedDict() + semantic_types = set() + semantic_types.add(hyperparams["return_semantic_type"]) + column_name = "output_{}".format(column_index) + column_metadata["semantic_types"] = list(semantic_types) + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + +SKQuantileTransformer.__doc__ = QuantileTransformer.__doc__ diff --git a/tods/timeseries_processing/SKStandardScaler.py b/tods/timeseries_processing/SKStandardScaler.py new file mode 100644 index 0000000..d6f251b --- /dev/null +++ b/tods/timeseries_processing/SKStandardScaler.py @@ -0,0 +1,545 @@ +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os +import sklearn +import numpy +import typing + +# Custom import commands if any +from sklearn.preprocessing import 
StandardScaler + +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult, DockerContainer + +# from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase +from d3m.primitive_interfaces.unsupervised_learning import UnsupervisedLearnerPrimitiveBase +from d3m.primitive_interfaces.transformer import TransformerPrimitiveBase + +from d3m.primitive_interfaces.base import ProbabilisticCompositionalityMixin, ContinueFitMixin +from d3m import exceptions +import pandas + +from d3m import container, utils as d3m_utils +import uuid + +Inputs = d3m_dataframe +# Inputs = container.Dataset +Outputs = d3m_dataframe + +__all__ = ('SKStandardScaler',) + + +class Params(params.Params): + + scale_: Optional[ndarray] + mean_: Optional[ndarray] + var_: Optional[ndarray] + n_samples_seen_: Optional[numpy.int64] + + # Keep previous + input_column_names: Optional[Any] + target_names_: Optional[Sequence[Any]] + training_indices_: Optional[Sequence[int]] + target_column_indices_: Optional[Sequence[int]] + target_columns_metadata_: Optional[List[OrderedDict]] + + + +class Hyperparams(hyperparams.Hyperparams): + # Added by Guanchu + with_mean = hyperparams.UniformBool( + default=True, + description='If True, center the data before scaling. This does not work (and will raise an exception) when attempted on sparse matrices, because centering them entails building a dense matrix which in common use cases is likely to be too large to fit in memory.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + with_std = hyperparams.UniformBool( + default=True, + description='If True, scale the data to unit variance (or equivalently, unit standard deviation).', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + # copy = hyperparams.UniformBool( + # default=True, + # description='If False, try to avoid a copy and do inplace scaling instead. This is not guaranteed to always work inplace; e.g. if the data is not a NumPy array or scipy.sparse CSR matrix, a copy may still be returned.', + # semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + # ) + + # Keep previous + use_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to operate on. If any specified column cannot be parsed, it is skipped.", + ) + exclude_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='new', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? 
This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', + 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute'], + default='https://metadata.datadrivendiscovery.org/types/Attribute', + description='Decides what semantic type to attach to generated attributes', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + + +class SKStandardScaler(UnsupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]): + """ + Standardize features by removing the mean and scaling to unit variance. + See `sklearn documentation `_ for more details. + + Parameters + ---------- + with_mean : bool + If True, center the data before scaling. This does not work (and will raise an exception) when attempted on sparse matrices, because centering them entails building a dense matrix which in common use cases is likely to be too large to fit in memory. + + with_std : bool + If True, scale the data to unit variance (or equivalently, unit standard deviation). + + Attributes + ---------- + scale_: ndarray or None, shape (n_features,) + Per feature relative scaling of the data. This is calculated using np.sqrt(var_). Equal to None when with_std=False. + + mean_: ndarray or None, shape (n_features,) + The mean value for each feature in the training set. Equal to None when with_mean=False. + + var_: ndarray or None, shape (n_features,) + The variance for each feature in the training set. Used to compute scale_. Equal to None when with_std=False. + + n_samples_seen_: int or array, shape (n_features,) + The number of samples processed by the estimator for each feature. If there are not missing samples, the n_samples_seen will be an integer, otherwise it will be an array. Will be reset on new calls to fit, but increments across partial_fit calls. 
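+
+    Example
+    -------
+    A minimal fit/produce sketch (illustrative only; it assumes a numeric
+    ``container.DataFrame`` and default hyperparameters)::
+
+        from d3m import container
+
+        df = container.DataFrame({'value': [1.0, 2.0, 3.0, 4.0]}, generate_metadata=True)
+        primitive = SKStandardScaler(hyperparams=Hyperparams.defaults())
+        primitive.set_training_data(inputs=df)
+        primitive.fit()
+        scaled = primitive.produce(inputs=df).value  # zero-mean, unit-variance columns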
+ """ + + __author__ = "DATALAB @Taxes A&M University" + metadata = metadata_base.PrimitiveMetadata({ + "algorithm_types": [metadata_base.PrimitiveAlgorithmType.DATA_CONVERSION, ], + "name": "Standard_scaler", + "primitive_family": metadata_base.PrimitiveFamily.DATA_TRANSFORMATION, + "python_path": "d3m.primitives.tods.timeseries_processing.transformation.standard_scaler", + "hyperparams_to_tune": ['with_mean', 'with_std'], + "source": {'name': "DATALAB @Taxes A&M University", 'contact': 'mailto:khlai037@tamu.edu', + 'uris': ['https://gitlab.com/lhenry15/tods.git']}, + "version": "0.0.1", + "id": str(uuid.uuid3(uuid.NAMESPACE_DNS, 'SKStandardScaler')), + }) + + def __init__(self, *, + hyperparams: Hyperparams, + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None) -> None: + super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) + + # False + self._clf = StandardScaler(with_mean=self.hyperparams['with_mean'], + with_std=self.hyperparams['with_std'], + # copy=self.hyperparams['copy'], + ) + + self._inputs = None + self._outputs = None + self._training_inputs = None + self._training_outputs = None + self._target_names = None + self._training_indices = None + self._target_column_indices = None + self._target_columns_metadata: List[OrderedDict] = None + self._input_column_names = None + self._fitted = False + + # print(self._clf.get_params(deep=True)) + # print(getattr(self._clf, 'lambdas_')) + # print(dir(self._clf)) + + + def set_training_data(self, *, inputs: Inputs) -> None: + + """ + Set training data for Standardizer. + Args: + inputs: Container DataFrame + + Returns: + None + """ + + self._inputs = inputs + self._fitted = False + + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + + """ + Fit model with training data. + Args: + *: Container DataFrame. Time series data up to fit. + + Returns: + None + """ + + if self._fitted: + return CallResult(None) + + self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + if self._training_inputs is None: + return CallResult(None) + + if len(self._training_indices) > 0: + self._clf.fit_transform(self._training_inputs) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + # print(self._training_inputs.std()) + + return CallResult(None) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + + """ + Process the testing data. + Args: + inputs: Container DataFrame. Time series data up to standardlize. + + Returns: + Container DataFrame after standardlization. 
+ """ + + if not self._fitted: + raise PrimitiveNotFittedError("Primitive not fitted.") + sk_inputs = inputs + if self.hyperparams['use_semantic_types']: + sk_inputs = inputs.iloc[:, self._training_indices] + output_columns = [] + if len(self._training_indices) > 0: + sk_output = self._clf.transform(sk_inputs) + if sparse.issparse(sk_output): + sk_output = sk_output.toarray() + outputs = self._wrap_predictions(inputs, sk_output) + if len(outputs.columns) == len(self._input_column_names): + outputs.columns = self._input_column_names + output_columns = [outputs] + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + # print(outputs.metadata.to_internal_simple_structure()) + + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._training_indices, + columns_list=output_columns) + # print(inputs) + # print(outputs) + # print(inputs.metadata.to_internal_simple_structure()) + # print(outputs.metadata.to_internal_simple_structure()) + + return CallResult(outputs) + + + def get_params(self) -> Params: + + """ + Return parameters. + Args: + None + + Returns: + class Params + """ + + if not self._fitted: + return Params( + scale_=None, + mean_=None, + var_=None, + n_sample_seen_=None, + + # Keep previous + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + # print(self._clf.n_samples_seen_.shape) + # print(type(self._clf.n_samples_seen_)) + # print(type(self._clf.mean_)) + return Params( + scale_=getattr(self._clf, 'scale_', None), + mean_=getattr(self._clf, 'mean_', None), + var_=getattr(self._clf, 'var_', None), + n_samples_seen_=getattr(self._clf, 'n_samples_seen_', None), + # Keep previous + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata, + ) + + + def set_params(self, *, params: Params) -> None: + + """ + Set parameters for Standardizer. + Args: + params: class Params + + Returns: + None + """ + + self._clf.scale_ = params['scale_'] + self._clf.mean_ = params['mean_'] + self._clf.var_ = params['var_'] + self._clf.n_samples_seen_ = params['n_samples_seen_'] + # Keep previous + self._input_column_names = params['input_column_names'] + self._training_indices = params['training_indices_'] + self._target_names = params['target_names_'] + self._target_column_indices = params['target_column_indices_'] + self._target_columns_metadata = params['target_columns_metadata_'] + + if params['scale_'] is not None: + self._fitted = True + if params['mean_'] is not None: + self._fitted = True + if params['var_'] is not None: + self._fitted = True + if params['n_samples_seen_'] is not None: + self._fitted = True + + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + + """ + Select columns to fit. 
+ Args: + inputs: Container DataFrame + hyperparams: d3m.metadata.hyperparams.Hyperparams + + Returns: + list + """ + + + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=hyperparams['use_columns'], + exclude_columns=hyperparams[ + 'exclude_columns'], + can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, + hyperparams: Hyperparams) -> bool: + """ + Output whether a column can be processed. + Args: + inputs_metadata: d3m.metadata.base.DataMetadata + column_index: int + + Returns: + bool + """ + + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, numpy.integer, numpy.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + + # print(semantic_types) + + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + + @classmethod + def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]: + + """ + Output metadata of selected columns. + Args: + outputs_metadata: metadata_base.DataMetadata + hyperparams: d3m.metadata.hyperparams.Hyperparams + + Returns: + d3m.metadata.base.DataMetadata + """ + + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) + + # Update semantic types and prepare it for predicted targets. + semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set([]) + add_semantic_types = [] + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + + """ + Updata metadata for selected columns. 
+ Args: + inputs_metadata: metadata_base.DataMetadata + outputs: Container Dataframe + target_columns_metadata: list + + Returns: + d3m.metadata.base.DataMetadata + """ + + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + + """ + Wrap predictions into dataframe + Args: + inputs: Container Dataframe + predictions: array-like data (n_samples, n_features) + + Returns: + Dataframe + """ + + outputs = d3m_dataframe(predictions, generate_metadata=True) + target_columns_metadata = self._copy_inputs_metadata(inputs.metadata, self._training_indices, outputs.metadata, + self.hyperparams) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, target_columns_metadata) + # print(outputs.metadata.to_internal_simple_structure()) + + return outputs + + + @classmethod + def _copy_inputs_metadata(cls, inputs_metadata: metadata_base.DataMetadata, input_indices: List[int], + outputs_metadata: metadata_base.DataMetadata, hyperparams): + + """ + Updata metadata for selected columns. + Args: + inputs_metadata: metadata.base.DataMetadata + input_indices: list + outputs_metadata: metadata.base.DataMetadata + hyperparams: d3m.metadata.hyperparams.Hyperparams + + Returns: + d3m.metadata.base.DataMetadata + """ + + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + target_columns_metadata: List[OrderedDict] = [] + for column_index in input_indices: + column_name = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name") + if column_name is None: + column_name = "output_{}".format(column_index) + + column_metadata = OrderedDict(inputs_metadata.query_column(column_index)) + semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set([]) + add_semantic_types = set() + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + # If outputs has more columns than index, add Attribute Type to all remaining + if outputs_length > len(input_indices): + for column_index in range(len(input_indices), outputs_length): + column_metadata = OrderedDict() + semantic_types = set() + semantic_types.add(hyperparams["return_semantic_type"]) + column_name = "output_{}".format(column_index) + column_metadata["semantic_types"] = list(semantic_types) + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + +SKStandardScaler.__doc__ = SKStandardScaler.__doc__ diff --git a/tods/timeseries_processing/SimpleExponentialSmoothing.py b/tods/timeseries_processing/SimpleExponentialSmoothing.py new file mode 100644 index 0000000..580ce64 --- /dev/null +++ b/tods/timeseries_processing/SimpleExponentialSmoothing.py @@ -0,0 +1,349 @@ +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os +import sklearn +import 
numpy +import typing +import pandas as pd +# Custom import commands if any +from sklearn.preprocessing.data import Normalizer +from statsmodels.tsa.api import ExponentialSmoothing, SimpleExpSmoothing, Holt + + +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError + + +from d3m.primitive_interfaces.base import CallResult,DockerContainer +from d3m.primitive_interfaces.unsupervised_learning import UnsupervisedLearnerPrimitiveBase + +import os +from typing import Any,Optional,List + +from d3m import container, utils as d3m_utils +from d3m.metadata import base as metadata_base +from d3m.metadata import hyperparams,params +from d3m.primitive_interfaces import base, transformer + + + +Inputs = d3m_dataframe +# Inputs = container.Dataset +Outputs = d3m_dataframe + + +class Params(params.Params): + input_column_names: Optional[Any] + target_names_: Optional[Sequence[Any]] + training_indices_: Optional[Sequence[int]] + target_column_indices_: Optional[Sequence[int]] + target_columns_metadata_: Optional[List[OrderedDict]] + + + +class Hyperparams(hyperparams.Hyperparams): +# Added by Mia + endog = hyperparams.Bounded[int]( + lower = 2, + upper = None, + default = 3, + description='Array like time series', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + +# Keep previous + norm = hyperparams.Enumeration[str]( + default='l2', + values=['l1', 'l2', 'max'], + description='The norm to use to normalize each non zero sample.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + use_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to operate on. If any specified column cannot be parsed, it is skipped.", + ) + exclude_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='new', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. 
Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute'], + default='https://metadata.datadrivendiscovery.org/types/Attribute', + description='Decides what semantic type to attach to generated attributes', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + +class SimpleExponentialSmoothing(UnsupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]): + """ + Primitive wrapping for simple exponential smoothing + `statsmodels documentation `_ + + """ + + __author__ = "DATA Lab at Texas A&M University" + metadata = metadata_base.PrimitiveMetadata({ + "algorithm_types": [metadata_base.PrimitiveAlgorithmType.SIMPLE_EXPONENTIAL_SMOOTHING,], + "name": "statsmodels.preprocessing.data.SimpleExponentialSmoothing", + "primitive_family": metadata_base.PrimitiveFamily.DATA_PREPROCESSING, + "python_path": "d3m.primitives.tods.timeseries_processing.transformation.simple_exponential_smoothing", + "source": {'name': 'DATA Lab at Texas A&M University', 'contact': 'mailto:khlai037@tamu.edu', 'uris': ['https://gitlab.com/lhenry15/tods.git', 'https://gitlab.com/lhenry15/tods/-/blob/mia/anomaly-primitives/anomaly_primitives/SimpleExponentialSmoothing.py']}, + "version": "0.0.1", + "id": "3e92984e-b7d1-4de0-9203-3a6093ddb38e", + "hyperparams_to_tune": ['endog','use_columns'], + }) + + def __init__(self, *, + hyperparams: Hyperparams, + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None) -> None: + + super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) + + # False + self._clf = Normalizer( + norm=self.hyperparams['norm'], + ) + + self._inputs = None + self._outputs = None + self._training_inputs = None + self._training_outputs = None + self._target_names = None + self._training_indices = None + self._target_column_indices = None + self._target_columns_metadata: List[OrderedDict] = None + self._input_column_names = None + self._fitted = False + + + def set_training_data(self, *, inputs: Inputs) -> None: + self._inputs = inputs + self._fitted = False + + def fit(self, *, timeout: float = None, iterations: int = None)-> CallResult[None]: + if self._fitted: + return CallResult(None) + + self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + if self._training_inputs is None: + return CallResult(None) + + if len(self._training_indices) > 0: + self._clf.fit(self._training_inputs) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + return CallResult(None) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]: + + self.logger.info('Simple Exponential Smoothing Primitive called') + outputs = inputs + self._training_inputs, self._training_indices = 
self._get_columns_to_fit(inputs, self.hyperparams)
+        try:
+            # Note: this list is computed for reference only; the loop below operates on self._training_indices.
+            columns_to_calculate_simple_exponential_smoothing: List[str] = []
+            if(self.hyperparams['use_columns']==()):
+                columns_to_calculate_simple_exponential_smoothing = list(set(inputs.columns)-set(['d3mIndex','timestamp','ground_truth']))
+            else:
+                columns_to_calculate_simple_exponential_smoothing = self.hyperparams['use_columns']
+            for column in self._training_indices:
+                outputs[inputs.columns[column]+"_simple_exponential_smoothing"] = SimpleExpSmoothing(inputs.iloc[:, column]).fit(smoothing_level=0.2,optimized=False).fittedvalues
+        except Exception as e:
+            self.logger.error("Error in calculating simple exponential smoothing: %s", e)
+        self._update_metadata(outputs)
+        #print(inputs)
+        #print("-------------")
+        #print(outputs)
+
+        return base.CallResult(outputs)
+
+
+    def _update_metadata(self, outputs):
+        outputs.metadata = outputs.metadata.generate(outputs,)
+
+
+
+
+    def get_params(self) -> Params:
+        if not self._fitted:
+            return Params(
+                input_column_names=self._input_column_names,
+                training_indices_=self._training_indices,
+                target_names_=self._target_names,
+                target_column_indices_=self._target_column_indices,
+                target_columns_metadata_=self._target_columns_metadata
+            )
+
+        return Params(
+            input_column_names=self._input_column_names,
+            training_indices_=self._training_indices,
+            target_names_=self._target_names,
+            target_column_indices_=self._target_column_indices,
+            target_columns_metadata_=self._target_columns_metadata
+        )
+
+    def set_params(self, *, params: Params) -> None:
+        self._input_column_names = params['input_column_names']
+        self._training_indices = params['training_indices_']
+        self._target_names = params['target_names_']
+        self._target_column_indices = params['target_column_indices_']
+        self._target_columns_metadata = params['target_columns_metadata_']
+        self._fitted = True
+
+
+
+
+
+    @classmethod
+    def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams):
+        if not hyperparams['use_semantic_types']:
+            return inputs, list(range(len(inputs.columns)))
+
+        inputs_metadata = inputs.metadata
+
+        def can_produce_column(column_index: int) -> bool:
+            return cls._can_produce_column(inputs_metadata, column_index, hyperparams)
+
+        columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata,
+                                                                                   use_columns=hyperparams['use_columns'],
+                                                                                   exclude_columns=hyperparams['exclude_columns'],
+                                                                                   can_use_column=can_produce_column)
+        return inputs.iloc[:, columns_to_produce], columns_to_produce
+        # return columns_to_produce
+
+    @classmethod
+    def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool:
+        column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index))
+
+        accepted_structural_types = (int, float, numpy.integer, numpy.float64)
+        accepted_semantic_types = set()
+        accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute")
+        if not issubclass(column_metadata['structural_type'], accepted_structural_types):
+            return False
+
+        semantic_types = set(column_metadata.get('semantic_types', []))
+
+        if len(semantic_types) == 0:
+            cls.logger.warning("No semantic types found in column metadata")
+            return False
+
+        # Making sure all accepted_semantic_types are available in semantic_types
+        if len(accepted_semantic_types - semantic_types) == 0:
+            return True
+
+        return False
+
+
+    @classmethod
+    def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) ->
List[OrderedDict]: + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) + + # Update semantic types and prepare it for predicted targets. + semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set([]) + add_semantic_types = [] + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + outputs = d3m_dataframe(predictions, generate_metadata=True) + target_columns_metadata = self._copy_inputs_metadata(inputs.metadata, self._training_indices, outputs.metadata, self.hyperparams) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, target_columns_metadata) + return outputs + + + @classmethod + def _copy_inputs_metadata(cls, inputs_metadata: metadata_base.DataMetadata, input_indices: List[int], + outputs_metadata: metadata_base.DataMetadata, hyperparams): + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + target_columns_metadata: List[OrderedDict] = [] + for column_index in input_indices: + column_name = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name") + if column_name is None: + column_name = "output_{}".format(column_index) + + column_metadata = OrderedDict(inputs_metadata.query_column(column_index)) + semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set([]) + add_semantic_types = set() + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + # If outputs has more columns than index, add Attribute Type to all remaining + if outputs_length > len(input_indices): + for column_index in range(len(input_indices), outputs_length): + column_metadata = OrderedDict() + semantic_types = set() + semantic_types.add(hyperparams["return_semantic_type"]) + column_name = "output_{}".format(column_index) + column_metadata["semantic_types"] = list(semantic_types) + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + +SimpleExponentialSmoothing.__doc__ = Normalizer.__doc__ diff --git a/tods/timeseries_processing/TimeSeriesSeasonalityTrendDecomposition.py 
b/tods/timeseries_processing/TimeSeriesSeasonalityTrendDecomposition.py new file mode 100644 index 0000000..69630df --- /dev/null +++ b/tods/timeseries_processing/TimeSeriesSeasonalityTrendDecomposition.py @@ -0,0 +1,348 @@ + +import os +from typing import Any,Optional,List +import statsmodels.api as sm + +from d3m import container, utils as d3m_utils +from d3m import utils + +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os + +import numpy +import typing +import time + +from d3m import container +from d3m.primitive_interfaces import base, transformer + +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base + +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError + +__all__ = ('TimeSeriesSeasonalityTrendDecompositionPrimitive',) + +Inputs = container.DataFrame +Outputs = container.DataFrame + +class Params(params.Params): + #to-do : how to make params dynamic + use_column_names: Optional[Any] + + + +class Hyperparams(hyperparams.Hyperparams): + period = hyperparams.Hyperparameter(default=1,semantic_types=[ + 'https://metadata.datadrivendiscovery.org/types/TuningParameter', + ],description="Window Size for decomposition") + + model = hyperparams.Hyperparameter(default='additive', semantic_types=[ + 'https://metadata.datadrivendiscovery.org/types/ControlParameter', + ], description="Window Size for decomposition") + + use_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to operate on. If any specified column cannot be parsed, it is skipped.", + ) + exclude_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='append', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. 
To prevent pipelines from breaking set this to False.",
+    )
+
+    return_semantic_type = hyperparams.Enumeration[str](
+        values=['https://metadata.datadrivendiscovery.org/types/Attribute',
+                'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute'],
+        default='https://metadata.datadrivendiscovery.org/types/Attribute',
+        description='Decides what semantic type to attach to generated attributes',
+        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter']
+    )
+
+
+class TimeSeriesSeasonalityTrendDecompositionPrimitive(transformer.TransformerPrimitiveBase[Inputs, Outputs, Hyperparams]):
+    """
+    A primitive to decompose a time series into trend, seasonality and residual components.
+    Decomposition is based on the period (frequency) passed as a hyperparameter.
+    The columns to decompose are passed as a hyperparameter; the default is all value columns.
+
+    """
+    __author__ = "DATA Lab at Texas A&M University"
+    metadata = metadata_base.PrimitiveMetadata(
+        {
+            'id': 'fe79c99b-7e9b-4b4c-bc70-6e0ec798acbc',
+            'version': '0.1.0',
+            'name': 'Time Series Decomposition',
+            'python_path': 'd3m.primitives.tods.timeseries_processing.decomposition.time_series_seasonality_trend_decomposition',
+            'keywords': ['Time Series', 'Trend', 'Seasonality', 'Residual'],
+            'source': {
+                'name': 'DATA Lab at Texas A&M University',
+                'uris': ['https://gitlab.com/lhenry15/tods.git','https://gitlab.com/lhenry15/tods/-/blob/devesh/tods/feature_analysis/TimeSeriesSeasonalityTrendDecomposition.py'],
+                'contact': 'mailto:khlai037@tamu.edu'
+            },
+            'installation': [
+                {'type': metadata_base.PrimitiveInstallationType.PIP,
+                 'package_uri': 'git+https://gitlab.com/lhenry15/tods.git@{git_commit}#egg=TODS'.format(
+                     git_commit=d3m_utils.current_git_commit(os.path.dirname(__file__)),
+                 ),
+                 }
+
+            ],
+            'algorithm_types': [
+                metadata_base.PrimitiveAlgorithmType.DATA_PROFILING,
+            ],
+            'primitive_family': metadata_base.PrimitiveFamily.DATA_VALIDATION,
+
+        }
+    )
+
+    def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]:
+        """
+        Decompose the selected time-series columns into trend and seasonal components.
+
+        Args:
+            inputs: Container DataFrame
+            timeout: Default
+            iterations: Default
+
+        Returns:
+            Container DataFrame containing decomposed time series
+        """
+        self.logger.info('Time Series seasonality trend decomposition Primitive called')
+
+        # Get cols to fit.
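+        # What happens below: (1) select the numeric attribute columns to operate on
+        # (all columns unless `use_semantic_types` / `use_columns` restrict the choice),
+        # (2) run statsmodels' seasonal_decompose on each selected column with the
+        # `model` and `period` hyperparameters, and (3) combine the resulting
+        # `<column>_trend` / `<column>_seasonal` columns (e.g. `value` -> `value_trend`,
+        # `value_seasonal`) with the input frame according to `return_result`.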
+ self._fitted = False + self._training_inputs, self._training_indices = self._get_columns_to_fit(inputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + if len(self._training_indices) > 0: + # self._clf.fit(self._training_inputs) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + if not self._fitted: + raise PrimitiveNotFittedError("Primitive not fitted.") + seasonality_trend_decomposition_input = inputs + if self.hyperparams['use_semantic_types']: + seasonality_trend_decomposition_input = inputs.iloc[:, self._training_indices] + output_columns = [] + if len(self._training_indices) > 0: + seasonality_trend_decomposed_output = self._seasonality_trend_decomposition(seasonality_trend_decomposition_input, + model=self.hyperparams["model"], + period=self.hyperparams["period"]) + if sparse.issparse(seasonality_trend_decomposed_output): + seasonality_trend_decomposed_output = seasonality_trend_decomposed_output.toarray() + outputs = self._wrap_predictions(inputs, seasonality_trend_decomposed_output) + + if len(outputs.columns) == len(self._input_column_names): + outputs.columns = self._input_column_names + output_columns = [outputs] + + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._training_indices, + columns_list=output_columns) + + + self.logger.info('Time Series Seasonality Trend Decomposition Primitive returned') + + return base.CallResult(outputs) + + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + """ + Select columns to fit. + Args: + inputs: Container DataFrame + hyperparams: d3m.metadata.hyperparams.Hyperparams + + Returns: + list + """ + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + use_columns = hyperparams['use_columns'] + exclude_columns = hyperparams['exclude_columns'] + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=use_columns, + exclude_columns=exclude_columns, + can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, + hyperparams: Hyperparams) -> bool: + """ + Output whether a column can be processed. 
+ Args: + inputs_metadata: d3m.metadata.base.DataMetadata + column_index: int + + Returns: + bool + """ + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, numpy.integer, numpy.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + return True + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + + return True + + return False + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + """ + Updata metadata for selected columns. + Args: + inputs_metadata: metadata_base.DataMetadata + outputs: Container Dataframe + target_columns_metadata: list + + Returns: + d3m.metadata.base.DataMetadata + """ + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + """ + Wrap predictions into dataframe + Args: + inputs: Container Dataframe + predictions: array-like data (n_samples, n_features) + + Returns: + Dataframe + """ + outputs = d3m_dataframe(predictions, generate_metadata=True) + target_columns_metadata = self._add_target_columns_metadata(outputs.metadata, self.hyperparams) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, target_columns_metadata) + + return outputs + + @classmethod + def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams): + """ + Add target columns metadata + Args: + outputs_metadata: metadata.base.DataMetadata + hyperparams: d3m.metadata.hyperparams.Hyperparams + + Returns: + List[OrderedDict] + """ + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + # column_name = "output_{}".format(column_index) + column_metadata = OrderedDict() + semantic_types = set() + semantic_types.add(hyperparams["return_semantic_type"]) + column_metadata['semantic_types'] = list(semantic_types) + + # column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + def _write(self, inputs: Inputs): + inputs.to_csv(str(time.time()) + '.csv') + + + def _seasonality_trend_decomposition(self,X,model,period): + """ + Seasonal decomposition using moving averages + Args: + X: DataFrame + Time series. + + model{“additive”, “multiplicative”}, optional + Type of seasonal component + + period : int, optional + Period of the series. 
+
+
+        Returns:
+            DataFrame
+            A DataFrame with `<column>_trend` and `<column>_seasonal` columns for each decomposed input column
+
+        """
+        transformed_X = utils.pandas.DataFrame()
+
+        for column in X.columns:
+            decomposed_components = sm.tsa.seasonal_decompose(X[column], model=model,
+                                                              period=period, two_sided=False)
+            period = self.hyperparams['period']
+            if (period > 1):
+                decomposed_components.trend[0:int(period / 2)] = decomposed_components.trend[int(period / 2)]
+                decomposed_components.trend[
+                len(decomposed_components.trend) - int(period / 2):len(decomposed_components.trend)] = \
+                    decomposed_components.trend[len(decomposed_components.trend) - int(period / 2) - 1]
+
+            transformed_X[column + "_trend"] = decomposed_components.trend
+            transformed_X[column + "_seasonal"] = decomposed_components.seasonal
+
+        return transformed_X
diff --git a/tods/timeseries_processing/__init__.py b/tods/timeseries_processing/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tods/tods/__init__.py b/tods/tods/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tods/tods/search/__init__.py b/tods/tods/search/__init__.py
new file mode 100644
index 0000000..179c117
--- /dev/null
+++ b/tods/tods/search/__init__.py
@@ -0,0 +1 @@
+from .brute_force_search import BruteForceSearch
diff --git a/tods/tods/search/brute_force_search.py b/tods/tods/search/brute_force_search.py
new file mode 100644
index 0000000..28fb9c3
--- /dev/null
+++ b/tods/tods/search/brute_force_search.py
@@ -0,0 +1,37 @@
+# A Brute-Force Search
+import random
+import uuid
+
+from d3m.metadata.pipeline import Pipeline
+
+from axolotl.algorithms.base import PipelineSearchBase
+from axolotl.utils import pipeline as pipeline_utils, schemas as schemas_utils
+
+def random_rank(pipeline_result):
+    if pipeline_result.status == 'COMPLETED':
+        pipeline_result.rank = random.uniform(0, 1)
+    return pipeline_result
+
+class BruteForceSearch(PipelineSearchBase):
+    def __init__(self, problem_description, backend, *, primitives_blocklist=None, ranking_function=None):
+        super().__init__(problem_description=problem_description, backend=backend,
+                         primitives_blocklist=primitives_blocklist, ranking_function=ranking_function)
+        if self.ranking_function is None:
+            self.ranking_function = random_rank
+
+        # Find the candidates
+        self.task_description = schemas_utils.get_task_description(self.problem_description['problem']['task_keywords'])
+        print('task_description:', self.task_description)
+        self.available_pipelines = self._return_pipelines(
+            self.task_description['task_type'], self.task_description['task_subtype'], self.task_description['data_types'])
+        print('available_pipelines:', self.available_pipelines)
+
+    def _return_pipelines(self, task_type, task_subtype, data_type):
+        pipeline_candidates = []
+        for pipeline_dict in schemas_utils.get_pipelines_db()['CLASSIFICATION']:
+            pipeline = pipeline_utils.load_pipeline(pipeline_dict)
+            pipeline.id = str(uuid.uuid4())
+            pipeline.created = Pipeline().created
+            pipeline_candidates.append(pipeline)
+
+        return pipeline_candidates